Re: [RFC][PATCH] x86, kaiser: do not require mapping process kernel stacks

From: Andy Lutomirski
Date: Sat Nov 04 2017 - 07:25:15 EST


On Fri, Nov 3, 2017 at 3:03 PM, Dave Hansen <dave.hansen@xxxxxxxxx> wrote:
>
> With the KAISER code that I posted a few days ago, we map and unmap
> each of the kernel stacks when they are created. That's slow and
> it is also the single largest thing still mapped into the user
> address space.
>
> This patch is on top of Andy's new trampoline stack code[1] plus
> the previous KAISER patches I posted. I figured I'd post this now
> so folks can take a look before I go re-spin the whole thing.
>
> Andy, I think the do_double_fault() handling is arguably buggy
> with your trampoline stacks: it points the iret frame to the task
> stack instead of the trampoline stack. It survives that normally,
> but it breaks with KAISER of course.

That is, indeed, a bug. Will fix.

>
> The other oddity in this is the SWITCH_TO_KERNEL_CR3 in the
> entry_SYSCALL_64 path. We do not have a stack to use or any
> regs to clobber, so we have to fall back to a per-cpu area to
> stash a register. I'm open to any better ideas.

I think we have no choice but to use *two* words of temporary storage,
one for rsp and one for whatever scratch reg we use. Unless we're
sneaky and use rsp as our scratch reg. *snicker*

>
> 1. https://git.kernel.org/pub/scm/linux/kernel/git/luto/linux.git/log/?h=x86/entry_consolidation
>
> ---
>
> b/arch/x86/entry/calling.h | 15 +++++++++++++
> b/arch/x86/entry/entry_64.S | 8 ++++--
> b/arch/x86/include/asm/kaiser.h | 2 -
> b/arch/x86/kernel/traps.c | 46 +++++++++++++++++++++++++++++++++-------
> b/arch/x86/mm/kaiser.c | 12 +---------
> b/include/linux/kaiser.h | 5 ----
> b/kernel/fork.c | 5 ----
> 7 files changed, 60 insertions(+), 33 deletions(-)
>
> diff -puN arch/x86/mm/kaiser.c~kaiser-no-process-stacks arch/x86/mm/kaiser.c
> --- a/arch/x86/mm/kaiser.c~kaiser-no-process-stacks 2017-11-02 11:07:35.624523990 -0700
> +++ b/arch/x86/mm/kaiser.c 2017-11-03 14:22:34.878872556 -0700
> @@ -30,6 +30,8 @@
> #include <asm/tlbflush.h>
> #include <asm/desc.h>
> +DEFINE_PER_CPU_USER_MAPPED(unsigned long, syscall_r11_tmp);
> +
> /*
> * At runtime, the only things we map are some things for CPU
> * hotplug, and stacks for new processes. No two CPUs will ever
> @@ -251,16 +253,6 @@ int kaiser_add_user_map(const void *__st
> return 0;
> }
> -/*
> - * The stack mapping is called in generic code and can't use
> - * __PAGE_KERNEL
> - */
> -int kaiser_map_stack(struct task_struct *tsk)
> -{
> - return kaiser_add_mapping((unsigned long)tsk->stack, THREAD_SIZE,
> - __PAGE_KERNEL);
> -}
> -
> int kaiser_add_user_map_ptrs(const void *__start_addr,
> const void *__end_addr,
> unsigned long flags)
> diff -puN kernel/fork.c~kaiser-no-process-stacks kernel/fork.c
> --- a/kernel/fork.c~kaiser-no-process-stacks 2017-11-02 11:08:33.103722857 -0700
> +++ b/kernel/fork.c 2017-11-03 14:19:35.720775925 -0700
> @@ -248,8 +248,6 @@ static unsigned long *alloc_thread_stack
> static inline void free_thread_stack(struct task_struct *tsk)
> {
> - kaiser_remove_mapping((unsigned long)tsk->stack, THREAD_SIZE);
> -
> #ifdef CONFIG_VMAP_STACK
> if (task_stack_vm_area(tsk)) {
> int i;
> @@ -539,9 +537,6 @@ static struct task_struct *dup_task_stru
> * functions again.
> */
> tsk->stack = stack;
> - err = kaiser_map_stack(tsk);
> - if (err)
> - goto free_stack;
> #ifdef CONFIG_VMAP_STACK
> tsk->stack_vm_area = stack_vm_area;
> #endif
> diff -puN arch/x86/entry/entry_64.S~kaiser-no-process-stacks arch/x86/entry/entry_64.S
> --- a/arch/x86/entry/entry_64.S~kaiser-no-process-stacks 2017-11-02 11:31:30.904900271 -0700
> +++ b/arch/x86/entry/entry_64.S 2017-11-03 11:20:26.825735849 -0700
> @@ -145,6 +145,9 @@ ENTRY(entry_SYSCALL_64)
> swapgs
> movq %rsp, PER_CPU_VAR(rsp_scratch)
> +
> + SWITCH_TO_KERNEL_CR3_SYSCALL
> +
> movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
> /* Construct struct pt_regs on stack */
> @@ -169,8 +172,6 @@ GLOBAL(entry_SYSCALL_64_after_hwframe)
> /* NB: right here, all regs except r11 are live. */
> - SWITCH_TO_KERNEL_CR3 scratch_reg=%r11
> -
> /* Must wait until we have the kernel CR3 to call C functions: */
> TRACE_IRQS_OFF
> @@ -1270,6 +1271,7 @@ ENTRY(error_entry)
> * gsbase and proceed. We'll fix up the exception and land in
> * .Lgs_change's error handler with kernel gsbase.
> */
> + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax
> SWAPGS
> jmp .Lerror_entry_done
> @@ -1379,6 +1381,7 @@ ENTRY(nmi)
> swapgs
> cld
> + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx
> movq %rsp, %rdx
> movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp
> UNWIND_HINT_IRET_REGS base=%rdx offset=8
> @@ -1407,7 +1410,6 @@ ENTRY(nmi)
> UNWIND_HINT_REGS
> ENCODE_FRAME_POINTER
> - SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi
> /*
> * At this point we no longer need to worry about stack damage
> * due to nesting -- we're on the normal thread stack and we're
> diff -puN arch/x86/entry/calling.h~kaiser-no-process-stacks arch/x86/entry/calling.h
> --- a/arch/x86/entry/calling.h~kaiser-no-process-stacks 2017-11-02 12:19:37.999595414 -0700
> +++ b/arch/x86/entry/calling.h 2017-11-02 12:24:14.784007027 -0700
> @@ -3,6 +3,7 @@
> #include <asm/cpufeatures.h>
> #include <asm/page_types.h>
> #include <asm/pgtable_types.h>
> +#include <asm/percpu.h>
> /*
> @@ -214,6 +215,18 @@ For 32-bit we have the following convent
> mov \scratch_reg, %cr3
> .endm
> +/*
> + * At syscall entry, we do not have a register to clobber and
> + * do not yet have a stack. Even if we did, we no longer map
> + * the process stack, so it does us no good. Just use some
> + * percpu space to stash r11.
> + */
> +.macro SWITCH_TO_KERNEL_CR3_SYSCALL
> + movq %r11, PER_CPU_VAR(syscall_r11_tmp)
> + SWITCH_TO_KERNEL_CR3 scratch_reg=%r11
> + movq PER_CPU_VAR(syscall_r11_tmp), %r11
> +.endm
> +
> .macro SWITCH_TO_USER_CR3 scratch_reg:req
> mov %cr3, \scratch_reg
> ADJUST_USER_CR3 \scratch_reg
> @@ -254,6 +267,8 @@ For 32-bit we have the following convent
> .macro SWITCH_TO_KERNEL_CR3 scratch_reg:req
> .endm
> +.macro SWITCH_TO_KERNEL_CR3_SYSCALL
> +.endm
> .macro SWITCH_TO_USER_CR3 scratch_reg:req
> .endm
> .macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req
> diff -puN arch/x86/kernel/traps.c~kaiser-no-process-stacks arch/x86/kernel/traps.c
> --- a/arch/x86/kernel/traps.c~kaiser-no-process-stacks 2017-11-02 12:46:27.978440257 -0700
> +++ b/arch/x86/kernel/traps.c 2017-11-03 14:34:21.658546483 -0700
> @@ -329,6 +329,43 @@ __visible void __noreturn handle_stack_o
> }
> #endif
> +/*
> + * This "fakes" a #GP from userspace upon returning (iret'ing)
> + * from this double fault.
> + */
> +void setup_fake_gp_at_iret(struct pt_regs *regs)
> +{
> + unsigned long *new_stack_top = (unsigned long *)
> + (this_cpu_read(cpu_tss.x86_tss.ist[0]) - 0x1500);
> +
> + /*
> + * Set up a stack just like the hardware would for a #GP.
> + *
> + * This format is an "iret frame", plus the error code
> + * that the hardware puts on the stack for us for
> + * exceptions. (see struct pt_regs).
> + */
> + new_stack_top[-1] = regs->ss;
> + new_stack_top[-2] = regs->sp;
> + new_stack_top[-3] = regs->flags;
> + new_stack_top[-4] = regs->cs;
> + new_stack_top[-5] = regs->ip;
> + new_stack_top[-6] = 0; /* faked #GP error code */
> +
> + /*
> + * 'regs' points to the "iret frame" for *this*
> + * exception, *not* the #GP we are faking. Here,
> + * we are telling 'iret' to jump to general_protection
> + * when returning from this double fault.
> + */
> + regs->ip = (unsigned long)general_protection;
> + /*
> + * Make iret move the stack to the "fake #GP" stack
> + * we created above.
> + */
> + regs->sp = (unsigned long)&new_stack_top[-6];
> +}
> +
> #ifdef CONFIG_X86_64
> /* Runs on IST stack */
> dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
> @@ -354,14 +391,7 @@ dotraplinkage void do_double_fault(struc
> regs->cs == __KERNEL_CS &&
> regs->ip == (unsigned long)native_irq_return_iret)
> {
> - struct pt_regs *normal_regs = task_pt_regs(current);
> -
> - /* Fake a #GP(0) from userspace. */
> - memmove(&normal_regs->ip, (void *)regs->sp, 5*8);
> - normal_regs->orig_ax = 0; /* Missing (lost) #GP error code */
> - regs->ip = (unsigned long)general_protection;
> - regs->sp = (unsigned long)&normal_regs->orig_ax;
> -
> + setup_fake_gp_at_iret(regs);
> return;
> }
> #endif
> diff -puN arch/x86/include/asm/kaiser.h~kaiser-no-process-stacks arch/x86/include/asm/kaiser.h
> --- a/arch/x86/include/asm/kaiser.h~kaiser-no-process-stacks 2017-11-03 14:22:37.594995310 -0700
> +++ b/arch/x86/include/asm/kaiser.h 2017-11-03 14:22:40.848142333 -0700
> @@ -33,8 +33,6 @@
> extern int kaiser_add_mapping(unsigned long addr, unsigned long size,
> unsigned long flags);
> -extern int kaiser_map_stack(struct task_struct *tsk);
> -
> /**
> * kaiser_remove_mapping - remove a kernel mapping from the userpage tables
> * @addr: the start address of the range
> diff -puN include/linux/kaiser.h~kaiser-no-process-stacks include/linux/kaiser.h
> --- a/include/linux/kaiser.h~kaiser-no-process-stacks 2017-11-03 14:22:37.654998020 -0700
> +++ b/include/linux/kaiser.h 2017-11-03 14:22:48.646494777 -0700
> @@ -11,11 +11,6 @@
> * disabled.
> */
> -static inline int kaiser_map_stack(struct task_struct *tsk)
> -{
> - return 0;
> -}
> -
> static inline void kaiser_init(void)
> {
> }
> _