[PATCH v2] pseries/kexec: skip resetting CPUs added by firmware but not started by the kernel
From: Shivang Upadhyay
Date: Mon Mar 30 2026 - 02:23:07 EST
During DLPAR CPU add operations, the newly added CPUs start out in a halted state.
The kernel then takes some time to initialize those CPUs internally before
starting them with the "start-cpu" RTAS call. However, if a crash triggers
kexec (kdump) in this window, before a new CPU has been initialized,
the crashing CPU sends the kexec NMI to reset all other CPUs.
This makes firmware start the uninitialized CPUs as well,
which can cause the kdump kernel to hang during bring-up.
Sample Log:
[175993.028231][ T1502] NIP [00007fffb953f394] 0x7fffb953f394
[175993.028314][ T1502] LR [00007fffb953f394] 0x7fffb953f394
[175993.028390][ T1502] --- interrupt: 3000
[ 5.519483][ T1] Processor 0 is stuck.
[ 11.089481][ T1] Processor 1 is stuck.
To fix this, only issue the system-reset hcall to CPUs that have
actually been started by the kernel.
Cc: Madhavan Srinivasan <maddy@xxxxxxxxxxxxx>
Cc: Michael Ellerman <mpe@xxxxxxxxxxxxxx>
Cc: Nicholas Piggin <npiggin@xxxxxxxxx>
Cc: Christophe Leroy <christophe.leroy@xxxxxxxxxx>
Cc: Srikar Dronamraju <srikar@xxxxxxxxxxxxx>
Cc: Shrikanth Hegde <sshegde@xxxxxxxxxxxxx>
Cc: Nysal Jan K.A. <nysal@xxxxxxxxxxxxx>
Cc: Vishal Chourasia <vishalc@xxxxxxxxxxxxx>
Cc: Ritesh Harjani <ritesh.list@xxxxxxxxx>
Cc: Sourabh Jain <sourabhjain@xxxxxxxxxxxxx>
Reported-by: Anushree Mathur <anushree.mathur@xxxxxxxxxxxxxxxxxx>
Signed-off-by: Shivang Upadhyay <shivangu@xxxxxxxxxxxxx>
---
Changelog:
V2:
* added set_crash_nmi_ipi to separate the crash case from other nmi_ipi
users
V1:
* https://lore.kernel.org/all/20251205142825.44698-1-shivangu@xxxxxxxxxxxxx/
---
arch/powerpc/include/asm/smp.h | 1 +
arch/powerpc/kernel/smp.c | 1 +
arch/powerpc/platforms/pseries/smp.c | 29 +++++++++++++++++++++++++++-
3 files changed, 30 insertions(+), 1 deletion(-)
diff --git a/arch/powerpc/include/asm/smp.h b/arch/powerpc/include/asm/smp.h
index e41b9ea42122..cb74201f5674 100644
--- a/arch/powerpc/include/asm/smp.h
+++ b/arch/powerpc/include/asm/smp.h
@@ -47,6 +47,7 @@ struct smp_ops_t {
void (*cause_ipi)(int cpu);
#endif
int (*cause_nmi_ipi)(int cpu);
+ void (*set_crash_nmi_ipi)(void);
void (*probe)(void);
int (*kick_cpu)(int nr);
int (*prepare_cpu)(int nr);
diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index 3467f86fd78f..3390ee8adf79 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -594,6 +594,7 @@ void crash_send_ipi(void (*crash_ipi_callback)(struct pt_regs *))
{
int cpu;
+ smp_ops->set_crash_nmi_ipi();
smp_send_nmi_ipi(NMI_IPI_ALL_OTHERS, crash_ipi_callback, 1000000);
if (kdump_in_progress() && crash_wake_offline) {
for_each_present_cpu(cpu) {
diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c
index db99725e752b..c6c2baacca9a 100644
--- a/arch/powerpc/platforms/pseries/smp.c
+++ b/arch/powerpc/platforms/pseries/smp.c
@@ -51,6 +51,9 @@
*/
static cpumask_var_t of_spin_mask;
+
+static int crash_nmi_ipi;
+
/* Query where a cpu is now. Return codes #defined in plpar_wrappers.h */
int smp_query_cpu_stopped(unsigned int pcpu)
{
@@ -171,12 +174,35 @@ static void dbell_or_ic_cause_ipi(int cpu)
ic_cause_ipi(cpu);
}
+static void pseries_set_crash_nmi_ipi(void)
+{
+ crash_nmi_ipi = 1; /* read by pseries_cause_nmi_ipi() for NMI_IPI_ALL_OTHERS */
+}
+
static int pseries_cause_nmi_ipi(int cpu)
{
int hwcpu;
+ int k, curcpu;
+ curcpu = smp_processor_id();
if (cpu == NMI_IPI_ALL_OTHERS) {
- hwcpu = H_SIGNAL_SYS_RESET_ALL_OTHERS;
+ if (crash_nmi_ipi) {
+ for_each_present_cpu(k) {
+ if (k != curcpu) {
+ hwcpu = get_hard_smp_processor_id(k);
+
+ /* it is possible that cpu is present,
+ * but not started yet.
+ */
+
+ if (paca_ptrs[hwcpu]->cpu_start == 1) {
+ plpar_signal_sys_reset(hwcpu);
+ }
+ }
+ }
+ return 1;
+ } else
+ hwcpu = H_SIGNAL_SYS_RESET_ALL_OTHERS;
} else {
if (cpu < 0) {
WARN_ONCE(true, "incorrect cpu parameter %d", cpu);
@@ -243,6 +269,7 @@ static struct smp_ops_t pseries_smp_ops = {
.message_pass = NULL, /* Use smp_muxed_ipi_message_pass */
.cause_ipi = NULL, /* Filled at runtime by pSeries_smp_probe() */
.cause_nmi_ipi = pseries_cause_nmi_ipi,
+ .set_crash_nmi_ipi = pseries_set_crash_nmi_ipi,
.probe = pSeries_smp_probe,
.prepare_cpu = pseries_smp_prepare_cpu,
.kick_cpu = smp_pSeries_kick_cpu,
--
2.53.0