Re: [patch 02/38] x86/cpu: Use native_wrmsrl() in load_percpu_segment()
From: Andrew Cooper
Date: Sat Jul 16 2022 - 20:22:51 EST
On 17/07/2022 00:17, Thomas Gleixner wrote:
> load_percpu_segment() is using wrmsr() which is paravirtualized. That's an
> issue because the code sequence is:
>
> __loadsegment_simple(gs, 0);
> wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu));
>
> So anything which uses a per CPU variable between setting GS to 0 and
> writing GSBASE is going to end up in a NULL pointer dereference. That
> can be triggered with instrumentation and is guaranteed to be triggered
> with callthunks for call depth tracking.
>
> Use native_wrmsrl() instead. XEN_PV will trap and emulate, but that's not a
> hot path.
>
> Also make it static and mark it noinstr so neither kprobes, sanitizers nor
> whatever can touch it.
>
> Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> ---
> arch/x86/include/asm/processor.h | 1 -
> arch/x86/kernel/cpu/common.c | 12 ++++++++++--
> 2 files changed, 10 insertions(+), 3 deletions(-)
>
> --- a/arch/x86/include/asm/processor.h
> +++ b/arch/x86/include/asm/processor.h
> @@ -673,7 +673,6 @@ extern struct desc_ptr early_gdt_descr;
> extern void switch_to_new_gdt(int);
> extern void load_direct_gdt(int);
> extern void load_fixmap_gdt(int);
> -extern void load_percpu_segment(int);
> extern void cpu_init(void);
> extern void cpu_init_secondary(void);
> extern void cpu_init_exception_handling(void);
> --- a/arch/x86/kernel/cpu/common.c
> +++ b/arch/x86/kernel/cpu/common.c
> @@ -701,13 +701,21 @@ static const char *table_lookup_model(st
> __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
> __u32 cpu_caps_set[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
>
> -void load_percpu_segment(int cpu)
> +static noinstr void load_percpu_segment(int cpu)
> {
> #ifdef CONFIG_X86_32
> loadsegment(fs, __KERNEL_PERCPU);
> #else
> __loadsegment_simple(gs, 0);
> - wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu));
> + /*
> + * Because of the __loadsegment_simple(gs, 0) above, any GS-prefixed
> + * instruction will explode right about here. As such, we must not have
> + * any CALL-thunks using per-cpu data.
> + *
> + * Therefore, use native_wrmsrl() and have XenPV take the fault and
> + * emulate.
> + */
> + native_wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu));
> #endif
Lovely :-/
But I still don't see how that works, because __loadsegment_simple() is
a memory clobber and cpu_kernelmode_gs_base() has a per-cpu lookup in it.
That said, this has a sole caller and, in that context, it's bogus for
64bit. Can't we fix all the problems by just doing this:
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 736262a76a12..6f393bc9d89d 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -701,16 +701,6 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c)
__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
__u32 cpu_caps_set[NCAPINTS + NBUGINTS] __aligned(sizeof(unsigned long));
-void load_percpu_segment(int cpu)
-{
-#ifdef CONFIG_X86_32
- loadsegment(fs, __KERNEL_PERCPU);
-#else
- __loadsegment_simple(gs, 0);
- wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu));
-#endif
-}
-
#ifdef CONFIG_X86_32
/* The 32-bit entry code needs to find cpu_entry_area. */
DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area);
@@ -742,12 +732,15 @@ EXPORT_SYMBOL_GPL(load_fixmap_gdt);
* Current gdt points %fs at the "master" per-cpu area: after this,
* it's on the real one.
*/
-void switch_to_new_gdt(int cpu)
+void noinstr switch_to_new_gdt(int cpu)
{
/* Load the original GDT */
load_direct_gdt(cpu);
+
+#ifdef CONFIG_X86_32
/* Reload the per-cpu base */
- load_percpu_segment(cpu);
+ loadsegment(fs, __KERNEL_PERCPU);
+#endif
}
static const struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
It's only 32bit where the percpu pointer is tied to the GDT. On 64bit,
gsbase is good before this, and remains good after.
With this change,
# Make sure load_percpu_segment has no stackprotector
CFLAGS_common.o := -fno-stack-protector
comes up for re-evaluation too.
~Andrew