Re: [PATCH 16/33] riscv/shstk: If needed allocate a new shadow stack on clone
From: Zong Li
Date: Mon Oct 07 2024 - 04:18:17 EST
On Wed, Oct 2, 2024 at 12:20 AM Deepak Gupta <debug@xxxxxxxxxxxx> wrote:
>
> Userspace specifies CLONE_VM to share the address space and spawn a new
> thread. `clone` allows userspace to specify a new stack for the new thread.
> However, there is no way to specify a new shadow stack base address without
> changing the API. This patch allocates a new shadow stack whenever CLONE_VM
> is given.
>
> In case of CLONE_VFORK, the parent is suspended until the child finishes, and
> thus the child can use the parent's shadow stack. In case of !CLONE_VM, COW
> kicks in because the entire address space is copied from parent to child.
>
> `clone3` is extensible and can provide a mechanism by which a shadow stack
> can be passed as an input parameter. This is not settled yet and is being
> extensively discussed on the mailing list. Once that's settled, this commit
> will adapt to that.
>
> Signed-off-by: Deepak Gupta <debug@xxxxxxxxxxxx>
> ---
> arch/riscv/include/asm/usercfi.h | 25 ++++++++
> arch/riscv/kernel/process.c | 11 +++-
> arch/riscv/kernel/usercfi.c | 121 +++++++++++++++++++++++++++++++++++++++
> 3 files changed, 156 insertions(+), 1 deletion(-)
>
> diff --git a/arch/riscv/include/asm/usercfi.h b/arch/riscv/include/asm/usercfi.h
> index 4fa201b4fc4e..719e28e043c8 100644
> --- a/arch/riscv/include/asm/usercfi.h
> +++ b/arch/riscv/include/asm/usercfi.h
> @@ -8,6 +8,9 @@
> #ifndef __ASSEMBLY__
> #include <linux/types.h>
>
> +struct task_struct;
> +struct kernel_clone_args;
> +
> #ifdef CONFIG_RISCV_USER_CFI
> struct cfi_status {
> unsigned long ubcfi_en : 1; /* Enable for backward cfi. */
> @@ -17,6 +20,28 @@ struct cfi_status {
> unsigned long shdw_stk_size; /* size of shadow stack */
> };
>
> +unsigned long shstk_alloc_thread_stack(struct task_struct *tsk,
> + const struct kernel_clone_args *args);
> +void shstk_release(struct task_struct *tsk);
> +void set_shstk_base(struct task_struct *task, unsigned long shstk_addr, unsigned long size);
> +unsigned long get_shstk_base(struct task_struct *task, unsigned long *size);
> +void set_active_shstk(struct task_struct *task, unsigned long shstk_addr);
> +bool is_shstk_enabled(struct task_struct *task);
> +
> +#else
> +
> +#define shstk_alloc_thread_stack(tsk, args) 0
> +
> +#define shstk_release(tsk)
> +
> +#define get_shstk_base(task, size) 0
> +
> +#define set_shstk_base(task, shstk_addr, size)
> +
> +#define set_active_shstk(task, shstk_addr)
> +
> +#define is_shstk_enabled(task) false
> +
> #endif /* CONFIG_RISCV_USER_CFI */
>
> #endif /* __ASSEMBLY__ */
> diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c
> index 1f2574fb2edb..f6f58b1ed905 100644
> --- a/arch/riscv/kernel/process.c
> +++ b/arch/riscv/kernel/process.c
> @@ -28,6 +28,7 @@
> #include <asm/vector.h>
> #include <asm/cpufeature.h>
> #include <asm/exec.h>
> +#include <asm/usercfi.h>
>
> #if defined(CONFIG_STACKPROTECTOR) && !defined(CONFIG_STACKPROTECTOR_PER_TASK)
> #include <linux/stackprotector.h>
> @@ -203,7 +204,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
>
> void exit_thread(struct task_struct *tsk)
> {
> -
> + shstk_release(tsk);
> }
>
> int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
> @@ -211,6 +212,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
> unsigned long clone_flags = args->flags;
> unsigned long usp = args->stack;
> unsigned long tls = args->tls;
> + unsigned long ssp = 0;
> struct pt_regs *childregs = task_pt_regs(p);
>
> memset(&p->thread.s, 0, sizeof(p->thread.s));
> @@ -225,11 +227,18 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args)
> p->thread.s[0] = (unsigned long)args->fn;
> p->thread.s[1] = (unsigned long)args->fn_arg;
> } else {
> + /* Allocate a new shadow stack if needed (CLONE_VM requires a separate one). */
> + ssp = shstk_alloc_thread_stack(p, args);
> + if (IS_ERR_VALUE(ssp))
> + return PTR_ERR((void *)ssp);
> +
> *childregs = *(current_pt_regs());
> /* Turn off status.VS */
> riscv_v_vstate_off(childregs);
> if (usp) /* User fork */
> childregs->sp = usp;
> + /* if needed, set new ssp */
> + ssp ? set_active_shstk(p, ssp) : 0;
> if (clone_flags & CLONE_SETTLS)
> childregs->tp = tls;
> childregs->a0 = 0; /* Return value of fork() */
> diff --git a/arch/riscv/kernel/usercfi.c b/arch/riscv/kernel/usercfi.c
> index ce002eabbdc1..7a7f0b57b2d4 100644
> --- a/arch/riscv/kernel/usercfi.c
> +++ b/arch/riscv/kernel/usercfi.c
> @@ -19,6 +19,41 @@
>
> #define SHSTK_ENTRY_SIZE sizeof(void *)
>
> +bool is_shstk_enabled(struct task_struct *task)
> +{
> + return task->thread_info.user_cfi_state.ubcfi_en ? true : false;
> +}
> +
> +void set_shstk_base(struct task_struct *task, unsigned long shstk_addr, unsigned long size)
> +{
> + task->thread_info.user_cfi_state.shdw_stk_base = shstk_addr;
> + task->thread_info.user_cfi_state.shdw_stk_size = size;
> +}
> +
> +unsigned long get_shstk_base(struct task_struct *task, unsigned long *size)
> +{
> + if (size)
> + *size = task->thread_info.user_cfi_state.shdw_stk_size;
> + return task->thread_info.user_cfi_state.shdw_stk_base;
> +}
> +
> +void set_active_shstk(struct task_struct *task, unsigned long shstk_addr)
> +{
> + task->thread_info.user_cfi_state.user_shdw_stk = shstk_addr;
> +}
> +
> +/*
> + * If size is 0, then to be compatible with the regular stack we want the shadow
> + * stack to be as big as the regular stack limit (RLIMIT_STACK), capped at 4 GiB.
> + * Otherwise PAGE_ALIGN the requested size and return it.
> + */
> +static unsigned long calc_shstk_size(unsigned long size)
> +{
> + if (size)
> + return PAGE_ALIGN(size);
> +
> + return PAGE_ALIGN(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G));
> +}
> +
> /*
> * Writes on shadow stack can either be `sspush` or `ssamoswap`. `sspush` can happen
> * implicitly on current shadow stack pointed to by CSR_SSP. `ssamoswap` takes pointer to
> @@ -143,3 +178,89 @@ SYSCALL_DEFINE3(map_shadow_stack, unsigned long, addr, unsigned long, size, unsi
>
> return allocate_shadow_stack(addr, aligned_size, size, set_tok);
> }
> +
> +/*
> + * This gets called during clone/clone3/fork. It is needed to allocate a shadow stack
> + * for cases where CLONE_VM is specified and thus a different stack is specified by the
> + * user; we then need a separate shadow stack too. How a separate shadow stack is
> + * specified by the user is still being debated. Once that's settled, remove this part
> + * of the comment. This function simply returns 0 if shadow stacks are not supported or
> + * if a separate shadow stack allocation is not needed (as in the case of !CLONE_VM).
> + */
> +unsigned long shstk_alloc_thread_stack(struct task_struct *tsk,
> + const struct kernel_clone_args *args)
> +{
> + unsigned long addr, size;
> +
> + /* If shadow stack is not supported, return 0 */
> + if (!cpu_supports_shadow_stack())
> + return 0;
> +
> + /*
> + * If shadow stack is not enabled on the new thread, skip any
> + * switch to a new shadow stack.
> + */
> + if (is_shstk_enabled(tsk))
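Hi Deepak,
Should it be '!is_shstk_enabled(tsk)'? The comment above says to skip when
shadow stack is not enabled, but this condition returns early when it is enabled.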
> + return 0;
> +
> + /*
> + * For CLONE_VFORK the child will share the parent's shadow stack.
> + * Set base = 0 and size = 0; this is a special marker to track this state
> + * so that the freeing logic run for the child knows to leave it alone.
> + */
> + if (args->flags & CLONE_VFORK) {
> + set_shstk_base(tsk, 0, 0);
> + return 0;
> + }
> +
> + /*
> + * For !CLONE_VM the child will use a copy of the parent's shadow
> + * stack.
> + */
> + if (!(args->flags & CLONE_VM))
> + return 0;
> +
> + /*
> + * Reaching here means CLONE_VM was specified and thus a separate shadow
> + * stack is needed for the newly cloned thread. Note: the allocation below
> + * happens using the current mm.
> + */
> + size = calc_shstk_size(args->stack_size);
> + addr = allocate_shadow_stack(0, size, 0, false);
> + if (IS_ERR_VALUE(addr))
> + return addr;
> +
> + set_shstk_base(tsk, addr, size);
> +
> + return addr + size;
> +}
> +
> +void shstk_release(struct task_struct *tsk)
> +{
> + unsigned long base = 0, size = 0;
> + /* If shadow stack is not supported or not enabled, nothing to release */
> + if (!cpu_supports_shadow_stack() ||
> + !is_shstk_enabled(tsk))
> + return;
> +
> + /*
> + * When fork() with CLONE_VM fails, the child (tsk) already has a
> + * shadow stack allocated, and exit_thread() calls this function to
> + * free it. In this case the parent (current) and the child share
> + * the same mm struct. Proceed only when they are the same.
> + */
> + if (!tsk->mm || tsk->mm != current->mm)
> + return;
> +
> + /*
> + * We know the shadow stack is enabled, but if base is 0 then
> + * this task is not managing its own shadow stack (CLONE_VFORK), so
> + * skip freeing it.
> + */
> + base = get_shstk_base(tsk, &size);
> + if (!base)
> + return;
> +
> + vm_munmap(base, size);
> + set_shstk_base(tsk, 0, 0);
> +}
>
> --
> 2.45.0
>
>
> _______________________________________________
> linux-riscv mailing list
> linux-riscv@xxxxxxxxxxxxxxxxxxx
> http://lists.infradead.org/mailman/listinfo/linux-riscv