[PATCH v2 04/11] stop_machine: Add NMI-based execution path

From: Chang S. Bae

Date: Mon Mar 30 2026 - 22:21:54 EST


Currently multi_cpu_stop() executes the stop function in the
MULTI_STOP_RUN state. But NMIs can still preempt the execution. For use
cases requiring stricter isolation from them, provide an option to run
the stop function from NMI context.

Then, the NMI stop function must be built without instrumentation, as
exceptions may re-enable NMIs on return via IRET. Annotate the NMI
handler entirely with noinstr.

However, objtool cannot reliably determine whether the indirect call
target is located in a noinstr section, so it may emit false positives.
To avoid this, temporarily lift the instrumentation restriction around
the indirect call site and document the intention.

The x86 microcode loader is currently the primary user for this. But
other architectures are not expected to use it. Add a build option to
make it opt-in.

Originally-by: David Kaplan <david.kaplan@xxxxxxx>
Suggested-by: Borislav Petkov <bp@xxxxxxxxx>
Suggested-by: Thomas Gleixner <tglx@xxxxxxxxxx>
Signed-off-by: Chang S. Bae <chang.seok.bae@xxxxxxxxx>
Link: https://lore.kernel.org/lkml/20260129121729.GRaXtP2aeWkQKegxC2@fat_crate.local
Link: https://lore.kernel.org/lkml/87wm0zl8p2.ffs@tglx
---
V1 -> V2: Multiple changes
* Fix racy return value read [**]
* Add Kconfig option (Thomas)
* Add cpumask to track NMI delivery (Boris)
* Rework implementation. Consolidate under ifdef

[**]: Observed delay in IPI/NMI delivery when testing error return paths
---
arch/Kconfig | 3 ++
include/linux/stop_machine.h | 16 ++++++
kernel/stop_machine.c | 96 +++++++++++++++++++++++++++++++++++-
3 files changed, 114 insertions(+), 1 deletion(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 102ddbd4298e..f84fd528aae7 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1841,4 +1841,7 @@ config ARCH_WANTS_PRE_LINK_VMLINUX
config ARCH_HAS_CPU_ATTACK_VECTORS
bool

+config STOP_MACHINE_NMI
+ bool
+
endmenu
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 2f986555113a..9424d363ab38 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -19,6 +19,12 @@
*/
typedef int (*cpu_stop_fn_t)(void *arg);

+/*
+ * Stop function variant runnable from NMI context. This makes the
+ * noinstr requirement explicit at the type level.
+ */
+typedef int (*cpu_stop_nmisafe_fn_t)(void *arg);
+
#ifdef CONFIG_SMP

struct cpu_stop_work {
@@ -189,4 +195,14 @@ stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
}

#endif /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */
+
+#ifdef CONFIG_STOP_MACHINE_NMI
+
+void arch_send_self_nmi(void);
+bool noinstr stop_machine_nmi_handler(void);
+
+#else
+static inline bool stop_machine_nmi_handler(void) { return false; }
+#endif /* CONFIG_STOP_MACHINE_NMI */
+
#endif /* _LINUX_STOP_MACHINE */
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 092c65c002ff..45ea62f1b2b5 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -22,6 +22,7 @@
#include <linux/atomic.h>
#include <linux/nmi.h>
#include <linux/sched/wake_q.h>
+#include <linux/delay.h>

/*
* Structure to determine completion condition and record errors. May
@@ -174,6 +175,15 @@ struct multi_stop_data {

enum multi_stop_state state;
atomic_t thread_ack;
+
+#ifdef CONFIG_STOP_MACHINE_NMI
+ /* Used in the NMI stop_machine variant */
+ bool use_nmi;
+ /* A separate function type to highlight noinstr requirement */
+ cpu_stop_nmisafe_fn_t nmisafe_fn;
+ /* cpumask of CPUs on which to raise an NMI */
+ cpumask_var_t nmi_cpus;
+#endif
};

static void set_state(struct multi_stop_data *msdata,
@@ -197,6 +207,8 @@ notrace void __weak stop_machine_yield(const struct cpumask *cpumask)
cpu_relax();
}

+static int multi_stop_run(struct multi_stop_data *msdata);
+
/* This is the cpu_stop function which stops the CPU. */
static int multi_cpu_stop(void *data)
{
@@ -235,7 +247,7 @@ static int multi_cpu_stop(void *data)
break;
case MULTI_STOP_RUN:
if (is_active)
- err = msdata->fn(msdata->data);
+ err = multi_stop_run(msdata);
break;
default:
break;
@@ -712,3 +724,85 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
mutex_unlock(&stop_cpus_mutex);
return ret | done.ret;
}
+
+#ifdef CONFIG_STOP_MACHINE_NMI
+
+struct nmi_stop {
+ struct multi_stop_data *data;
+ int ret;
+ bool done;
+};
+
+static DEFINE_PER_CPU(struct nmi_stop, nmi_stop);
+
+/*
+ * Instrumentation may trigger nested exceptions such as #INT3, #DB,
+ * or #PF. IRET from those would re-enable NMIs, which opposes the goal
+ * of this NMI stop-machine facility.
+ */
+bool noinstr stop_machine_nmi_handler(void)
+{
+ struct multi_stop_data *msdata = raw_cpu_read(nmi_stop.data);
+ unsigned int cpu = smp_processor_id();
+ int ret;
+
+ if (!msdata || !cpumask_test_and_clear_cpu(cpu, msdata->nmi_cpus))
+ return false;
+
+ /*
+ * The indirect call to @nmisafe_fn() is indistinguishable at
+ * post-compilation. Temporarily enabling instrumentation avoids
+ * objtool false positives.
+ */
+ instrumentation_begin();
+ ret = msdata->nmisafe_fn(msdata->data);
+ instrumentation_end();
+
+ raw_cpu_write(nmi_stop.ret, ret);
+ raw_cpu_write(nmi_stop.done, true);
+ raw_cpu_write(nmi_stop.data, NULL);
+
+ return true;
+}
+
+static bool wait_for_nmi_handler(void)
+{
+ /* Conservative timeout */
+ unsigned long timeout = USEC_PER_SEC;
+
+ while (!this_cpu_read(nmi_stop.done) && timeout--)
+ udelay(1);
+
+ return this_cpu_read(nmi_stop.done);
+}
+
+static int nmi_stop_run(struct multi_stop_data *msdata)
+{
+ /*
+ * Save per-CPU state accessible from NMI context and raise a
+ * self-NMI to execute the stop function from the NMI handler
+ */
+ this_cpu_write(nmi_stop.data, msdata);
+ this_cpu_write(nmi_stop.done, false);
+ arch_send_self_nmi();
+
+ /* Ensure the handler went through before reading the result */
+ if (!wait_for_nmi_handler())
+ return -ETIMEDOUT;
+
+ return this_cpu_read(nmi_stop.ret);
+}
+
+static int multi_stop_run(struct multi_stop_data *msdata)
+{
+ return msdata->use_nmi ? nmi_stop_run(msdata) : msdata->fn(msdata->data);
+}
+
+#else
+
+static int multi_stop_run(struct multi_stop_data *msdata)
+{
+ return msdata->fn(msdata->data);
+}
+
+#endif /* CONFIG_STOP_MACHINE_NMI */
--
2.51.0