Re: [patch 02/38] x86/cpu: Use native_wrmsrl() in load_percpu_segment()

From: Andrew Cooper
Date: Sat Jul 16 2022 - 20:22:51 EST


On 17/07/2022 00:17, Thomas Gleixner wrote:
> load_percpu_segment() is using wrmsrl() which is paravirtualized. That's an
> issue because the code sequence is:
>
> __loadsegment_simple(gs, 0);
> wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu));
>
> So anything which uses a per CPU variable between setting GS to 0 and
> writing GSBASE is going to end up in a NULL pointer dereference. That
> can be triggered with instrumentation and is guaranteed to be triggered
> with callthunks for call depth tracking.
>
> Use native_wrmsrl() instead. XEN_PV will trap and emulate, but that's not a
> hot path.
>
> Also make it static and mark it noinstr so that neither kprobes, sanitizers,
> nor anything else can touch it.
>
> Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> ---
> arch/x86/include/asm/processor.h | 1 -
> arch/x86/kernel/cpu/common.c | 12 ++++++++++--
> 2 files changed, 10 insertions(+), 3 deletions(-)
>
> --- a/arch/x86/include/asm/processor.h
> +++ b/arch/x86/include/asm/processor.h
> @@ -673,7 +673,6 @@ extern struct desc_ptr early_gdt_descr;
> extern void switch_to_new_gdt(int);
> extern void load_direct_gdt(int);
> extern void load_fixmap_gdt(int);
> -extern void load_percpu_segment(int);
> extern void cpu_init(void);
> extern void cpu_init_secondary(void);
> extern void cpu_init_exception_handling(void);
> --- a/arch/x86/kernel/cpu/common.c
> +++ b/arch/x86/kernel/cpu/common.c
> @@ -701,13 +701,21 @@ static const char *table_lookup_model(st
> __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
> __u32 cpu_caps_set[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
>
> -void load_percpu_segment(int cpu)
> +static noinstr void load_percpu_segment(int cpu)
> {
> #ifdef CONFIG_X86_32
> loadsegment(fs, __KERNEL_PERCPU);
> #else
> __loadsegment_simple(gs, 0);
> - wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu));
> + /*
> + * Because of the __loadsegment_simple(gs, 0) above, any GS-prefixed
> + * instruction will explode right about here. As such, we must not have
> + * any CALL-thunks using per-cpu data.
> + *
> + * Therefore, use native_wrmsrl() and have XenPV take the fault and
> + * emulate.
> + */
> + native_wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu));
> #endif

Lovely :-/

But I still don't see how that works, because __loadsegment_simple() is
a memory clobber and cpu_kernelmode_gs_base() has a per-cpu lookup in it.
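
To spell out the window I'm worried about, roughly (sketch only; the per-cpu
variable name is a stand-in, not real kernel code):

    __loadsegment_simple(gs, 0);
    /*
     * GSBASE no longer points at this CPU's per-cpu area until the
     * wrmsrl below has completed.  Any %gs-relative access emitted in
     * this window - instrumentation, a call depth tracking thunk, or a
     * per-cpu read hidden in the argument evaluation - dereferences
     * NULL, e.g.:
     */
    val = this_cpu_read(some_percpu_var);    /* stand-in: faults here */

    wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu));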

That said, this has only a single caller, and in context, it's bogus for
64bit.  Can't we fix all the problems by just doing this:

diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 736262a76a12..6f393bc9d89d 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -701,16 +701,6 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c)
 __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
 __u32 cpu_caps_set[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
 
-void load_percpu_segment(int cpu)
-{
-#ifdef CONFIG_X86_32
-       loadsegment(fs, __KERNEL_PERCPU);
-#else
-       __loadsegment_simple(gs, 0);
-       wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu));
-#endif
-}
-
 #ifdef CONFIG_X86_32
 /* The 32-bit entry code needs to find cpu_entry_area. */
 DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
@@ -742,12 +732,15 @@ EXPORT_SYMBOL_GPL(load_fixmap_gdt);
  * Current gdt points %fs at the "master" per-cpu area: after this,
  * it's on the real one.
  */
-void switch_to_new_gdt(int cpu)
+void noinstr switch_to_new_gdt(int cpu)
 {
        /* Load the original GDT */
        load_direct_gdt(cpu);
+
+#ifdef CONFIG_X86_32
        /* Reload the per-cpu base */
-       load_percpu_segment(cpu);
+       loadsegment(fs, __KERNEL_PERCPU);
+#endif
 }
 
 static const struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};


It's only on 32bit that the percpu pointer is tied to the GDT.  On 64bit,
gsbase is good before this, and remains good after.
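
Put differently, a rough sketch of how a per-cpu access ends up being
generated on each (illustrative, not taken from a real build):

    32bit:  mov %fs:var, %eax    /* %fs base = base of the __KERNEL_PERCPU
                                    GDT descriptor, so it has to be reloaded
                                    after installing a new GDT */
    64bit:  mov %gs:var, %rax    /* %gs base = MSR_GS_BASE, which
                                    load_direct_gdt() never touches */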

With this change,

# Make sure load_percpu_segment has no stackprotector
CFLAGS_common.o         := -fno-stack-protector

comes up for re-evaluation too.

~Andrew