Re: [PATCH 5/5] x86/microcode: Handle NMI's during microcode update.

From: Andy Lutomirski
Date: Sat Aug 13 2022 - 20:13:54 EST




On Sat, Aug 13, 2022, at 3:38 PM, Ashok Raj wrote:
> Microcode updates need a guarantee that the thread sibling that is waiting
> for the update to finish on the primary core will not execute any
> instructions until the update is complete. This is required to guarantee
> any MSR or instruction that's being patched will be executed before the
> update is complete.
>
> After the stop_machine() rendezvous, an NMI handler is registered. If an
> NMI were to happen while the microcode update is not complete, the
> secondary thread will spin until the ucode update state is cleared.
>
> Couple of choices discussed are:
>
> 1. Rendezvous inside the NMI handler, and also perform the update from
> within the handler. This seemed too risky and might cause instability
> with the races that we would need to solve. This would be a difficult
> choice.

I prefer choice 1. As I understand it, Xen has done this for a while to good effect.

If I were implementing this, I would rendezvous via stop_machine as usual. Then I would set a flag or install a handler indicating that we are doing a microcode update, send NMI-to-self, and rendezvous in the NMI handler and do the update.

> 2. Thomas (tglx) suggested that we could look into masking all the LVT
> originating NMI's. Such as LINT1, Perf control LVT entries and such.
> Since we are in the rendezvous loop, we don't need to worry about any
> NMI IPI's generated by the OS.
>
> The one we didn't have any control over is the ACPI mechanism of sending
> notifications to kernel for Firmware First Processing (FFM). Apparently
> it seems there is a PCH register that BIOS in SMI would write to
> generate such an interrupt (ACPI GHES).
> 3. This is a simpler option. OS registers an NMI handler and doesn't do any
> NMI rendezvous dance. But if an NMI were to happen, we check if any of
> the CPUs thread siblings have an update in progress. Only those CPUs
> would take an NMI. The thread performing the wrmsr() will only take an
> NMI after the completion of the wrmsr 0x79 flow.
>
> Signed-off-by: Ashok Raj <ashok.raj@xxxxxxxxx>
> ---
> arch/x86/kernel/cpu/microcode/core.c | 88 +++++++++++++++++++++++++++-
> 1 file changed, 85 insertions(+), 3 deletions(-)
>
> diff --git a/arch/x86/kernel/cpu/microcode/core.c
> b/arch/x86/kernel/cpu/microcode/core.c
> index d24e1c754c27..ec10fa2db8b1 100644
> --- a/arch/x86/kernel/cpu/microcode/core.c
> +++ b/arch/x86/kernel/cpu/microcode/core.c
> @@ -40,6 +40,7 @@
> #include <asm/cmdline.h>
> #include <asm/setup.h>
> #include <asm/mce.h>
> +#include <asm/nmi.h>
>
> #define DRIVER_VERSION "2.2"
>
> @@ -411,6 +412,10 @@ static int check_online_cpus(void)
>
> static atomic_t late_cpus_in;
> static atomic_t late_cpus_out;
> +static atomic_t nmi_cpus;
> +static atomic_t nmi_timeouts;
> +
> +static struct cpumask cpus_in_wait;
>
> static int __wait_for_cpus(atomic_t *t, long long timeout)
> {
> @@ -433,6 +438,53 @@ static int __wait_for_cpus(atomic_t *t, long long timeout)
> return 0;
> }
>
> +static int ucode_nmi_cb(unsigned int val, struct pt_regs *regs)
> +{
> + int cpu = smp_processor_id();
> + int timeout = 100 * NSEC_PER_USEC;
> +
> + atomic_inc(&nmi_cpus);
> + if (!cpumask_test_cpu(cpu, &cpus_in_wait))
> + return NMI_DONE;
> +
> + while (timeout < NSEC_PER_USEC) {
> + if (timeout < NSEC_PER_USEC) {
> + atomic_inc(&nmi_timeouts);
> + break;
> + }
> + ndelay(SPINUNIT);
> + timeout -= SPINUNIT;
> + touch_nmi_watchdog();
> + if (!cpumask_test_cpu(cpu, &cpus_in_wait))
> + break;
> + }
> + return NMI_HANDLED;
> +}
> +
> +static void set_nmi_cpus(struct cpumask *wait_mask)
> +{
> + int first_cpu, wait_cpu, cpu = smp_processor_id();
> +
> + first_cpu = cpumask_first(topology_sibling_cpumask(cpu));
> + for_each_cpu(wait_cpu, topology_sibling_cpumask(cpu)) {
> + if (wait_cpu == first_cpu)
> + continue;
> + cpumask_set_cpu(wait_cpu, wait_mask);
> + }
> +}
> +
> +static void clear_nmi_cpus(struct cpumask *wait_mask)
> +{
> + int first_cpu, wait_cpu, cpu = smp_processor_id();
> +
> + first_cpu = cpumask_first(topology_sibling_cpumask(cpu));
> + for_each_cpu(wait_cpu, topology_sibling_cpumask(cpu)) {
> + if (wait_cpu == first_cpu)
> + continue;
> + cpumask_clear_cpu(wait_cpu, wait_mask);
> + }
> +}
> +
> /*
> * Returns:
> * < 0 - on error
> @@ -440,7 +492,7 @@ static int __wait_for_cpus(atomic_t *t, long long timeout)
> */
> static int __reload_late(void *info)
> {
> - int cpu = smp_processor_id();
> + int first_cpu, cpu = smp_processor_id();
> enum ucode_state err;
> int ret = 0;
>
> @@ -459,6 +511,7 @@ static int __reload_late(void *info)
> * the platform is taken to reset predictively.
> */
> mce_set_mcip();
> +
> /*
> * On an SMT system, it suffices to load the microcode on one sibling of
> * the core because the microcode engine is shared between the threads.
> @@ -466,9 +519,17 @@ static int __reload_late(void *info)
> * loading attempts happen on multiple threads of an SMT core. See
> * below.
> */
> + first_cpu = cpumask_first(topology_sibling_cpumask(cpu));
>
> - if (cpumask_first(topology_sibling_cpumask(cpu)) == cpu)
> + /*
> + * Set the CPUs that we should hold in NMI until the primary has
> + * completed the microcode update.
> + */
> + if (first_cpu == cpu) {
> + set_nmi_cpus(&cpus_in_wait);
> apply_microcode_local(&err);
> + clear_nmi_cpus(&cpus_in_wait);
> + }
> else
> goto wait_for_siblings;
>
> @@ -502,20 +563,41 @@ static int __reload_late(void *info)
> */
> static int microcode_reload_late(void)
> {
> - int ret;
> + int ret = 0;
>
> pr_err("Attempting late microcode loading - it is dangerous and
> taints the kernel.\n");
> pr_err("You should switch to early loading, if possible.\n");
>
> atomic_set(&late_cpus_in, 0);
> atomic_set(&late_cpus_out, 0);
> + atomic_set(&nmi_cpus, 0);
> + atomic_set(&nmi_timeouts, 0);
> + cpumask_clear(&cpus_in_wait);
> +
> + ret = register_nmi_handler(NMI_LOCAL, ucode_nmi_cb, NMI_FLAG_FIRST,
> + "ucode_nmi");
> + if (ret) {
> + pr_err("Unable to register NMI handler\n");
> + goto done;
> + }
>
> ret = stop_machine_cpuslocked(__reload_late, NULL, cpu_online_mask);
> if (ret == 0)
> microcode_check();
>
> + unregister_nmi_handler(NMI_LOCAL, "ucode_nmi");
> +
> + if (atomic_read(&nmi_cpus))
> + pr_info("%d CPUs entered NMI while microcode update"
> + "in progress\n", atomic_read(&nmi_cpus));
> +
> + if (atomic_read(&nmi_timeouts))
> + pr_err("Some CPUs [%d] entered NMI and timedout waiting for its"
> + " mask to be cleared\n", atomic_read(&nmi_timeouts));
> +
> pr_info("Reload completed, microcode revision: 0x%x\n",
> boot_cpu_data.microcode);
>
> +done:
> return ret;
> }
>
> --
> 2.32.0