Re: [PATCH 1/7] stop_machine: Introduce stop_machine_nmi()

From: Borislav Petkov

Date: Mon Feb 02 2026 - 05:54:46 EST


On Thu, Jan 29, 2026 at 01:17:29PM +0100, Borislav Petkov wrote:
> What I'm not sure about is:
>
> if (msdata->use_nmi) {
> this_cpu_write(stop_machine_nmi_ctrl.msdata, msdata);
> arch_send_self_nmi();
>
> <--- we send the NMI IPI here...
>
> return raw_cpu_read(stop_machine_nmi_ctrl.err);
>
> ... and we read the err result immediately but what guarantees us that the NMI
> handler on that CPU will have run and written err:
>
> raw_cpu_write(stop_machine_nmi_ctrl.err, err);
>
> ?

Yeah, I don't think we can rely on the NMI handler running immediately after
the ICR write... so I guess we will have to OR-in the retvals after all the
NMIs have been raised on all CPUs.

I had a silly idea about that: another CPU mask. See below.

This still doesn't handle what Chang and you mentioned, namely that the NMI
handler needs to check whether it comes from a stop_machine call or not. That
will be addressed later...

Completely untested ofc:

---
diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h
index 72820503514c..ce5f932443cd 100644
--- a/include/linux/stop_machine.h
+++ b/include/linux/stop_machine.h
@@ -141,6 +141,29 @@ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus);
*/
int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus);

+/**
+ * stop_machine_nmi: freeze the machine and run this function in NMI context
+ * @fn: the function to run
+ * @data: the data ptr for the @fn()
+ * @cpus: the CPUs to run the @fn() on (NULL = any online CPU)
+ *
+ * Like stop_machine() but runs the function in NMI context to avoid any risk of
+ * interruption due to NMIs.
+ *
+ * Protects against CPU hotplug.
+ */
+int stop_machine_nmi(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus);
+
+/**
+ * stop_machine_cpuslocked_nmi: freeze and run this function in NMI context
+ * @fn: the function to run
+ * @data: the data ptr for the @fn()
+ * @cpus: the CPUs to run the @fn() on (NULL = any online CPU)
+ *
+ * Same as above. Must be called from within a cpus_read_lock() protected
+ * region. Avoids nested calls to cpus_read_lock().
+ */
+int stop_machine_cpuslocked_nmi(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus);
/**
* stop_core_cpuslocked: - stop all threads on just one core
* @cpu: any cpu in the targeted core
@@ -160,6 +183,9 @@ int stop_core_cpuslocked(unsigned int cpu, cpu_stop_fn_t fn, void *data);

int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
const struct cpumask *cpus);
+
+bool stop_machine_nmi_handler_enabled(void);
+bool noinstr stop_machine_nmi_handler(void);
#else /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */

static __always_inline int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data,
@@ -186,5 +212,23 @@ stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data,
return stop_machine(fn, data, cpus);
}

+/* stop_machine_nmi() is only supported in SMP systems. */
+static __always_inline int stop_machine_nmi(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
+{
+ return -EINVAL;
+}
+
+static __always_inline bool stop_machine_nmi_handler_enabled(void)
+{
+ return false;
+}
+
+static __always_inline bool stop_machine_nmi_handler(void)
+{
+ return false;
+}
+
#endif /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */
+
+void arch_send_self_nmi(void);
#endif /* _LINUX_STOP_MACHINE */
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 3fe6b0c99f3d..c6c8afc4d03e 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -174,8 +174,26 @@ struct multi_stop_data {

enum multi_stop_state state;
atomic_t thread_ack;
+
+ bool use_nmi;
+
+ /*
+ * cpumasks of CPUs on which to raise an NMI; used in the NMI
+ * stomp_machine variant. nmi_cpus_done is used for tracking
+ * when the NMI handler has executed successfully.
+ */
+ struct cpumask nmi_cpus;
+ struct cpumask nmi_cpus_done;
+
+};
+
+struct stop_machine_nmi_ctrl {
+ struct multi_stop_data *msdata;
+ int err;
};

+static DEFINE_PER_CPU(struct stop_machine_nmi_ctrl, stop_machine_nmi_ctrl);
+
static void set_state(struct multi_stop_data *msdata,
enum multi_stop_state newstate)
{
@@ -197,6 +215,41 @@ notrace void __weak stop_machine_yield(const struct cpumask *cpumask)
cpu_relax();
}

+void __weak arch_send_self_nmi(void)
+{
+ /* Arch code must implement this to support stop_machine_nmi() */
+}
+
+bool noinstr stop_machine_nmi_handler(void)
+{
+ struct multi_stop_data *msdata = raw_cpu_read(stop_machine_nmi_ctrl.msdata);
+ unsigned int cpu = smp_processor_id();
+ int err;
+
+ if (!cpumask_test_and_clear_cpu(cpu, &msdata->nmi_cpus))
+ return false;
+
+ instrumentation_begin();
+ err = msdata->fn(msdata->data);
+ instrumentation_end();
+ raw_cpu_write(stop_machine_nmi_ctrl.err, err);
+
+ cpumask_set_cpu(cpu, &msdata->nmi_cpus_done);
+
+ return true;
+}
+
+static int __multi_cpu_stop(struct multi_stop_data *msdata)
+{
+ if (msdata->use_nmi) {
+ this_cpu_write(stop_machine_nmi_ctrl.msdata, msdata);
+ arch_send_self_nmi();
+ return 0;
+ } else {
+ return msdata->fn(msdata->data);
+ }
+}
+
/* This is the cpu_stop function which stops the CPU. */
static int multi_cpu_stop(void *data)
{
@@ -235,7 +288,7 @@ static int multi_cpu_stop(void *data)
break;
case MULTI_STOP_RUN:
if (is_active)
- err = msdata->fn(msdata->data);
+ err = __multi_cpu_stop(msdata);
break;
default:
break;
@@ -584,15 +637,22 @@ static int __init cpu_stop_init(void)
}
early_initcall(cpu_stop_init);

-int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data,
- const struct cpumask *cpus)
+static int __stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data,
+ const struct cpumask *cpus, bool use_nmi)
{
struct multi_stop_data msdata = {
.fn = fn,
.data = data,
.num_threads = num_online_cpus(),
.active_cpus = cpus,
+ .use_nmi = use_nmi,
};
+ int ret, cpu;
+
+ if (use_nmi) {
+ cpumask_copy(&msdata.nmi_cpus, cpus);
+ cpumask_clear(&msdata.nmi_cpus_done);
+ }

lockdep_assert_cpus_held();

@@ -617,7 +677,32 @@ int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data,

/* Set the initial state and stop all online cpus. */
set_state(&msdata, MULTI_STOP_PREPARE);
- return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata);
+ ret = stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata);
+
+ if (!use_nmi)
+ return ret;
+
+ if (!cpumask_equal(cpus, &msdata.nmi_cpus_done)) {
+ pr_err("Some CPUs didn't run the stomp_machine NMI handler\n");
+ return -EINVAL;
+ } else {
+ for_each_cpu(cpu, cpus)
+ ret |= per_cpu(stop_machine_nmi_ctrl.err, cpu);
+
+ return ret;
+ }
+}
+
+int stop_machine_cpuslocked(cpu_stop_fn_t fn, void *data,
+ const struct cpumask *cpus)
+{
+ return __stop_machine_cpuslocked(fn, data, cpus, false);
+}
+
+int stop_machine_cpuslocked_nmi(cpu_stop_fn_t fn, void *data,
+ const struct cpumask *cpus)
+{
+ return __stop_machine_cpuslocked(fn, data, cpus, true);
}

int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
@@ -632,6 +717,18 @@ int stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
}
EXPORT_SYMBOL_GPL(stop_machine);

+int stop_machine_nmi(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus)
+{
+ int ret;
+
+ cpus_read_lock();
+ ret = stop_machine_cpuslocked_nmi(fn, data, cpus);
+ cpus_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(stop_machine_nmi);
+
#ifdef CONFIG_SCHED_SMT
int stop_core_cpuslocked(unsigned int cpu, cpu_stop_fn_t fn, void *data)
{
--
Regards/Gruss,
Boris.

https://people.kernel.org/tglx/notes-about-netiquette