[PATCH 2/3] x86/smp: Allow forcing the mwait hint for play dead loop

From: Patryk Wlazlyn
Date: Thu Oct 10 2024 - 09:41:56 EST


The current implementation for looking up the mwait hint for the deepest
cstate assumes that the substate hints are contiguous in the range
[0, NUM_SUBSTATES-1]. While that holds on most Intel x86 platforms, it
is not architectural and may not result in reaching the most optimized
idle state on some of them.

For example, Intel's Sierra Forest reports two C6 substates in CPUID leaf 5:
C6S (hint 0x22)
C6SP (hint 0x23)

Hints 0x20 and 0x21 are skipped entirely, causing the current
implementation to compute the wrong hint when looking for the deepest
cstate for an offlined CPU to enter. As a result, a package with an
offlined CPU can never reach PC6.

Allow the idle driver to communicate the deepest idle cstate to the x86
offline code.

Signed-off-by: Patryk Wlazlyn <patryk.wlazlyn@xxxxxxxxxxxxxxx>
---
arch/x86/include/asm/smp.h | 3 +++
arch/x86/kernel/smpboot.c | 12 +++++++++++-
2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index ca073f40698f..2cb083a84225 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -114,6 +114,7 @@ void wbinvd_on_cpu(int cpu);
int wbinvd_on_all_cpus(void);

void smp_kick_mwait_play_dead(void);
+void smp_set_mwait_play_dead_hint(unsigned int hint);

void native_smp_send_reschedule(int cpu);
void native_send_call_func_ipi(const struct cpumask *mask);
@@ -164,6 +165,8 @@ static inline struct cpumask *cpu_llc_shared_mask(int cpu)
{
return (struct cpumask *)cpumask_of(0);
}
+
+static inline void smp_set_mwait_play_dead_hint(unsigned int hint) { }
#endif /* CONFIG_SMP */

#ifdef CONFIG_DEBUG_NMI_SELFTEST
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 683898e3b20e..67d1fc976683 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -127,6 +127,9 @@ int __read_mostly __max_smt_threads = 1;
/* Flag to indicate if a complete sched domain rebuild is required */
bool x86_topology_update;

+#define PLAY_DEAD_MWAIT_HINT_UNSET 0U
+static unsigned int __read_mostly play_dead_mwait_hint = PLAY_DEAD_MWAIT_HINT_UNSET;
+
int arch_update_cpu_topology(void)
{
int retval = x86_topology_update;
@@ -1270,6 +1273,11 @@ void play_dead_common(void)
local_irq_disable();
}

+void smp_set_mwait_play_dead_hint(unsigned int hint)
+{
+ WRITE_ONCE(play_dead_mwait_hint, hint);
+}
+
/* Computes mwait hint for the deepest mwait hint based on cpuid leaf 0x5 */
static inline unsigned int get_deepest_mwait_hint(void)
{
@@ -1322,7 +1330,9 @@ static inline void mwait_play_dead(void)
if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
return;

- hint = get_deepest_mwait_hint();
+ hint = READ_ONCE(play_dead_mwait_hint);
+ if (hint == PLAY_DEAD_MWAIT_HINT_UNSET)
+ hint = get_deepest_mwait_hint();

/* Set up state for the kexec() hack below */
md->status = CPUDEAD_MWAIT_WAIT;
--
2.46.2