Re: [PATCH 3.18 04/84] x86_64, switch_to(): Load TLS descriptors before switching DS and ES

From: Jiri Slaby
Date: Wed Jan 07 2015 - 10:26:47 EST


On 01/07/2015, 02:49 AM, Greg Kroah-Hartman wrote:
> 3.18-stable review patch. If anyone has any objections, please let me know.

Greg, Andi raised an objection against this one for 3.12, and it still
holds here and for the other trees:
https://www.mail-archive.com/stable@xxxxxxxxxxxxxxx/msg106471.html

Quoting:
On 01/06/2015, 07:53 PM, Andi Kleen wrote:
> IMHO this is not stable material. Significant risk you broke
> something obscure, and it's not clear it fixes any real problem.
>
> At least wait some more time first.

Thanks.

> ------------------
>
> From: Andy Lutomirski <luto@xxxxxxxxxxxxxx>
>
> commit f647d7c155f069c1a068030255c300663516420e upstream.
>
> Otherwise, if buggy user code points DS or ES into the TLS
> array, they would be corrupted after a context switch.
>
> This also significantly improves the comments and documents some
> gotchas in the code.
>
> Before this patch, both tests below failed. With this
> patch, the es test passes, although the gsbase test still fails.
>
> ----- begin es test -----
>
> /*
>  * Copyright (c) 2014 Andy Lutomirski
>  * GPL v2
>  */
>
> #include <stdio.h>
> #include <err.h>
> #include <unistd.h>
> #include <sys/syscall.h>
> #include <asm/ldt.h>
>
> static unsigned short GDT3(int idx)
> {
> 	return (idx << 3) | 3;
> }
>
> static int create_tls(int idx, unsigned int base)
> {
> 	struct user_desc desc = {
> 		.entry_number = idx,
> 		.base_addr = base,
> 		.limit = 0xfffff,
> 		.seg_32bit = 1,
> 		.contents = 0, /* Data, grow-up */
> 		.read_exec_only = 0,
> 		.limit_in_pages = 1,
> 		.seg_not_present = 0,
> 		.useable = 0,
> 	};
>
> 	if (syscall(SYS_set_thread_area, &desc) != 0)
> 		err(1, "set_thread_area");
>
> 	return desc.entry_number;
> }
>
> int main()
> {
> 	int idx = create_tls(-1, 0);
> 	printf("Allocated GDT index %d\n", idx);
>
> 	unsigned short orig_es;
> 	asm volatile ("mov %%es,%0" : "=rm" (orig_es));
>
> 	int errors = 0;
> 	int total = 1000;
> 	for (int i = 0; i < total; i++) {
> 		asm volatile ("mov %0,%%es" : : "rm" (GDT3(idx)));
> 		usleep(100);
>
> 		unsigned short es;
> 		asm volatile ("mov %%es,%0" : "=rm" (es));
> 		asm volatile ("mov %0,%%es" : : "rm" (orig_es));
> 		if (es != GDT3(idx)) {
> 			if (errors == 0)
> 				printf("[FAIL]\tES changed from 0x%hx to 0x%hx\n",
> 				       GDT3(idx), es);
> 			errors++;
> 		}
> 	}
>
> 	if (errors) {
> 		printf("[FAIL]\tES was corrupted %d/%d times\n", errors, total);
> 		return 1;
> 	} else {
> 		printf("[OK]\tES was preserved\n");
> 		return 0;
> 	}
> }
>
> ----- end es test -----
>
> ----- begin gsbase test -----
>
> /*
>  * gsbase.c, a gsbase test
>  * Copyright (c) 2014 Andy Lutomirski
>  * GPL v2
>  */
>
> #include <stdio.h>
> #include <err.h>
> #include <unistd.h>
> #include <sys/syscall.h>
> #include <sys/mman.h>
> #include <asm/prctl.h>
>
> static unsigned char *testptr, *testptr2;
>
> static unsigned char read_gs_testvals(void)
> {
> 	unsigned char ret;
> 	asm volatile ("movb %%gs:%1, %0" : "=r" (ret) : "m" (*testptr));
> 	return ret;
> }
>
> int main()
> {
> 	int errors = 0;
>
> 	testptr = mmap((void *)0x200000000UL, 1, PROT_READ | PROT_WRITE,
> 		       MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS, -1, 0);
> 	if (testptr == MAP_FAILED)
> 		err(1, "mmap");
>
> 	testptr2 = mmap((void *)0x300000000UL, 1, PROT_READ | PROT_WRITE,
> 			MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS, -1, 0);
> 	if (testptr2 == MAP_FAILED)
> 		err(1, "mmap");
>
> 	*testptr = 0;
> 	*testptr2 = 1;
>
> 	if (syscall(SYS_arch_prctl, ARCH_SET_GS,
> 		    (unsigned long)testptr2 - (unsigned long)testptr) != 0)
> 		err(1, "ARCH_SET_GS");
>
> 	usleep(100);
>
> 	if (read_gs_testvals() == 1) {
> 		printf("[OK]\tARCH_SET_GS worked\n");
> 	} else {
> 		printf("[FAIL]\tARCH_SET_GS failed\n");
> 		errors++;
> 	}
>
> 	asm volatile ("mov %0,%%gs" : : "r" (0));
>
> 	if (read_gs_testvals() == 0) {
> 		printf("[OK]\tWriting 0 to gs worked\n");
> 	} else {
> 		printf("[FAIL]\tWriting 0 to gs failed\n");
> 		errors++;
> 	}
>
> 	usleep(100);
>
> 	if (read_gs_testvals() == 0) {
> 		printf("[OK]\tgsbase is still zero\n");
> 	} else {
> 		printf("[FAIL]\tgsbase was corrupted\n");
> 		errors++;
> 	}
>
> 	return errors == 0 ? 0 : 1;
> }
>
> ----- end gsbase test -----
>
> Signed-off-by: Andy Lutomirski <luto@xxxxxxxxxxxxxx>
> Cc: Andi Kleen <andi@xxxxxxxxxxxxxx>
> Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
> Link: http://lkml.kernel.org/r/509d27c9fec78217691c3dad91cec87e1006b34a.1418075657.git.luto@xxxxxxxxxxxxxx
> Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx>
> Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx>
>
> ---
> arch/x86/kernel/process_64.c | 101 +++++++++++++++++++++++++++++++------------
> 1 file changed, 73 insertions(+), 28 deletions(-)
>
> --- a/arch/x86/kernel/process_64.c
> +++ b/arch/x86/kernel/process_64.c
> @@ -283,24 +283,9 @@ __switch_to(struct task_struct *prev_p,
>
> fpu = switch_fpu_prepare(prev_p, next_p, cpu);
>
> - /*
> - * Reload esp0, LDT and the page table pointer:
> - */
> + /* Reload esp0 and ss1. */
> load_sp0(tss, next);
>
> - /*
> - * Switch DS and ES.
> - * This won't pick up thread selector changes, but I guess that is ok.
> - */
> - savesegment(es, prev->es);
> - if (unlikely(next->es | prev->es))
> - loadsegment(es, next->es);
> -
> - savesegment(ds, prev->ds);
> - if (unlikely(next->ds | prev->ds))
> - loadsegment(ds, next->ds);
> -
> -
> /* We must save %fs and %gs before load_TLS() because
> * %fs and %gs may be cleared by load_TLS().
> *
> @@ -309,41 +294,101 @@ __switch_to(struct task_struct *prev_p,
> savesegment(fs, fsindex);
> savesegment(gs, gsindex);
>
> + /*
> + * Load TLS before restoring any segments so that segment loads
> + * reference the correct GDT entries.
> + */
> load_TLS(next, cpu);
>
> /*
> - * Leave lazy mode, flushing any hypercalls made here.
> - * This must be done before restoring TLS segments so
> - * the GDT and LDT are properly updated, and must be
> - * done before math_state_restore, so the TS bit is up
> - * to date.
> + * Leave lazy mode, flushing any hypercalls made here. This
> + * must be done after loading TLS entries in the GDT but before
> + * loading segments that might reference them, and it must
> + * be done before math_state_restore, so the TS bit is up to
> + * date.
> */
> arch_end_context_switch(next_p);
>
> + /* Switch DS and ES.
> + *
> + * Reading them only returns the selectors, but writing them (if
> + * nonzero) loads the full descriptor from the GDT or LDT. The
> + * LDT for next is loaded in switch_mm, and the GDT is loaded
> + * above.
> + *
> + * We therefore need to write new values to the segment
> + * registers on every context switch unless both the new and old
> + * values are zero.
> + *
> + * Note that we don't need to do anything for CS and SS, as
> + * those are saved and restored as part of pt_regs.
> + */
> + savesegment(es, prev->es);
> + if (unlikely(next->es | prev->es))
> + loadsegment(es, next->es);
> +
> + savesegment(ds, prev->ds);
> + if (unlikely(next->ds | prev->ds))
> + loadsegment(ds, next->ds);
> +
> /*
> * Switch FS and GS.
> *
> - * Segment register != 0 always requires a reload. Also
> - * reload when it has changed. When prev process used 64bit
> - * base always reload to avoid an information leak.
> + * These are even more complicated than DS and ES: they have
> + * 64-bit bases that are controlled by arch_prctl. Those bases
> + * only differ from the values in the GDT or LDT if the selector
> + * is 0.
> + *
> + * Loading the segment register resets the hidden base part of
> + * the register to 0 or the value from the GDT / LDT. If the
> + * next base address is zero, writing 0 to the segment register is
> + * much faster than using wrmsr to explicitly zero the base.
> + *
> + * The thread_struct.fs and thread_struct.gs values are 0
> + * if the fs and gs bases respectively are not overridden
> + * from the values implied by fsindex and gsindex. They
> + * are nonzero, and store the nonzero base addresses, if
> + * the bases are overridden.
> + *
> + * (fs != 0 && fsindex != 0) || (gs != 0 && gsindex != 0) should
> + * be impossible.
> + *
> + * Therefore we need to reload the segment registers if either
> + * the old or new selector is nonzero, and we need to override
> + * the base address if the next thread expects it to be overridden.
> + *
> + * This code is unnecessarily slow in the case where the old and
> + * new indexes are zero and the new base is nonzero -- it will
> + * unnecessarily write 0 to the selector before writing the new
> + * base address.
> + *
> + * Note: This all depends on arch_prctl being the only way that
> + * user code can override the segment base. Once wrfsbase and
> + * wrgsbase are enabled, most of this code will need to change.
> */
> if (unlikely(fsindex | next->fsindex | prev->fs)) {
> loadsegment(fs, next->fsindex);
> +
> /*
> - * Check if the user used a selector != 0; if yes
> - * clear 64bit base, since overloaded base is always
> - * mapped to the Null selector
> + * If user code wrote a nonzero value to FS, then it also
> + * cleared the overridden base address.
> + *
> + * XXX: if user code wrote 0 to FS and cleared the base
> + * address itself, we won't notice and we'll incorrectly
> + * restore the prior base address next time we reschedule
> + * the process.
> */
> if (fsindex)
> prev->fs = 0;
> }
> - /* when next process has a 64bit base use it */
> if (next->fs)
> wrmsrl(MSR_FS_BASE, next->fs);
> prev->fsindex = fsindex;
>
> if (unlikely(gsindex | next->gsindex | prev->gs)) {
> load_gs_index(next->gsindex);
> +
> + /* This works (and fails) the same way as fsindex above. */
> if (gsindex)
> prev->gs = 0;
> }
>
>
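
For anyone skimming the quoted diff rather than the whole file: the heart of
the change is ordering. A segment register write copies the referenced
descriptor from the GDT/LDT into the CPU's hidden part of the register at
that instant (the new comments above say exactly this), so DS/ES must only
be reloaded once load_TLS() has installed next's TLS entries. Below is a
small user-space toy model of that argument -- my illustration, not kernel
code; the names (gdt_tls_slot, seg_reg, load_segment) are made up:

/* toy model of the __switch_to() reordering; illustration only */
#include <stdio.h>

struct gdt_tls_slot { unsigned long base; };   /* one per-CPU TLS descriptor */
struct seg_reg { unsigned long cached_base; }; /* hidden part of DS/ES */

/* A segment load snapshots whatever the descriptor holds right now. */
static void load_segment(struct seg_reg *seg, const struct gdt_tls_slot *slot)
{
	seg->cached_base = slot->base;
}

int main(void)
{
	struct gdt_tls_slot tls = { .base = 0x1000 }; /* still prev's TLS base */
	struct seg_reg es = { 0 };
	unsigned long next_base = 0x2000;             /* next's TLS base */

	/* Old order: reload ES, then load_TLS() rewrites the GDT slot. */
	load_segment(&es, &tls);
	tls.base = next_base;                         /* models load_TLS(next, cpu) */
	printf("old order: ES caches base %#lx (stale)\n", es.cached_base);

	/* New order (this patch): load_TLS() first, then reload ES. */
	tls.base = next_base;
	load_segment(&es, &tls);
	printf("new order: ES caches base %#lx (correct)\n", es.cached_base);
	return 0;
}

This only condenses the ordering argument from the new comments; the real
code additionally has to deal with the LDT (switch_mm), lazy hypercall
flushing (arch_end_context_switch) and the FS/GS base MSRs.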


--
js
suse labs
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/