Re: [PATCH 1/2] clone: Support passing tls argument via C rather than pt_regs magic

From: Vineet Gupta
Date: Mon May 11 2015 - 10:31:56 EST


On Tuesday 21 April 2015 11:17 PM, Josh Triplett wrote:
> clone with CLONE_SETTLS accepts an argument to set the thread-local
> storage area for the new thread. sys_clone declares an int argument
> tls_val in the appropriate point in the argument list (based on the
> various CLONE_BACKWARDS variants), but doesn't actually use or pass
> along that argument. Instead, sys_clone calls do_fork, which calls
> copy_process, which calls the arch-specific copy_thread, and copy_thread
> pulls the corresponding syscall argument out of the pt_regs captured at
> kernel entry (knowing what argument of clone that architecture passes
> tls in).
>
> Apart from being awful and inscrutable, that also only works because
> only one code path into copy_thread can pass the CLONE_SETTLS flag, and
> that code path comes from sys_clone with its architecture-specific
> argument-passing order. This prevents introducing a new version of the
> clone system call without propagating the same architecture-specific
> position of the tls argument.
>
> However, there's no reason to pull the argument out of pt_regs when
> sys_clone could just pass it down via C function call arguments.
>
> Introduce a new CONFIG_HAVE_COPY_THREAD_TLS for architectures to opt
> into, and a new copy_thread_tls that accepts the tls parameter as an
> additional unsigned long (syscall-argument-sized) argument.
> Change sys_clone's tls argument to an unsigned long (which does
> not change the ABI), and pass that down to copy_thread_tls.
>
> Architectures that don't opt into copy_thread_tls will continue to
> ignore the C argument to sys_clone in favor of the pt_regs captured at
> kernel entry, and thus will be unable to introduce new versions of the
> clone syscall.
>
> Signed-off-by: Josh Triplett <josh@xxxxxxxxxxxxxxxx>
> Signed-off-by: Thiago Macieira <thiago.macieira@xxxxxxxxx>
> Acked-by: Andy Lutomirski <luto@xxxxxxxxxx>
> ---
> arch/Kconfig | 7 ++++++
> include/linux/sched.h | 14 ++++++++++++
> include/linux/syscalls.h | 6 +++---
> kernel/fork.c | 55 +++++++++++++++++++++++++++++++-----------------
> 4 files changed, 60 insertions(+), 22 deletions(-)
>
> diff --git a/arch/Kconfig b/arch/Kconfig
> index 05d7a8a..4834a58 100644
> --- a/arch/Kconfig
> +++ b/arch/Kconfig
> @@ -484,6 +484,13 @@ config HAVE_IRQ_EXIT_ON_IRQ_STACK
> This spares a stack switch and improves cache usage on softirq
> processing.
>
> +config HAVE_COPY_THREAD_TLS
> + bool
> + help
> + Architecture provides copy_thread_tls to accept tls argument via
> + normal C parameter passing, rather than extracting the syscall
> + argument from pt_regs.
> +
> #
> # ABI hall of shame
> #
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index a419b65..2cc88c6 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -2480,8 +2480,22 @@ extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode);
> /* Remove the current tasks stale references to the old mm_struct */
> extern void mm_release(struct task_struct *, struct mm_struct *);
>
> +#ifdef CONFIG_HAVE_COPY_THREAD_TLS
> +extern int copy_thread_tls(unsigned long, unsigned long, unsigned long,
> + struct task_struct *, unsigned long);
> +#else
> extern int copy_thread(unsigned long, unsigned long, unsigned long,
> struct task_struct *);
> +
> +/* Architectures that haven't opted into copy_thread_tls get the tls argument
> + * via pt_regs, so ignore the tls argument passed via C. */
> +static inline int copy_thread_tls(
> + unsigned long clone_flags, unsigned long sp, unsigned long arg,
> + struct task_struct *p, unsigned long tls)
> +{
> + return copy_thread(clone_flags, sp, arg, p);
> +}
> +#endif

Is this detour really needed. Can we not update copy_thread() of all arches in one
go and add the tls arg, w/o using it.

And then arch maintainers can micro-optimize their code to use that arg vs.
pt_regs->rxx version at their own leisure. The only downside I see with that is
bigger churn (touches all arches), and a interim unused arg warning ?


-Vineet

> extern void flush_thread(void);
> extern void exit_thread(void);
>
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index 76d1e38..bb51bec 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -827,15 +827,15 @@ asmlinkage long sys_syncfs(int fd);
> asmlinkage long sys_fork(void);
> asmlinkage long sys_vfork(void);
> #ifdef CONFIG_CLONE_BACKWARDS
> -asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, int,
> +asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, unsigned long,
> int __user *);
> #else
> #ifdef CONFIG_CLONE_BACKWARDS3
> asmlinkage long sys_clone(unsigned long, unsigned long, int, int __user *,
> - int __user *, int);
> + int __user *, unsigned long);
> #else
> asmlinkage long sys_clone(unsigned long, unsigned long, int __user *,
> - int __user *, int);
> + int __user *, unsigned long);
> #endif
> #endif
>
> diff --git a/kernel/fork.c b/kernel/fork.c
> index cf65139..b3dadf4 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1192,7 +1192,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
> unsigned long stack_size,
> int __user *child_tidptr,
> struct pid *pid,
> - int trace)
> + int trace,
> + unsigned long tls)
> {
> int retval;
> struct task_struct *p;
> @@ -1401,7 +1402,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
> retval = copy_io(clone_flags, p);
> if (retval)
> goto bad_fork_cleanup_namespaces;
> - retval = copy_thread(clone_flags, stack_start, stack_size, p);
> + retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
> if (retval)
> goto bad_fork_cleanup_io;
>
> @@ -1613,7 +1614,7 @@ static inline void init_idle_pids(struct pid_link *links)
> struct task_struct *fork_idle(int cpu)
> {
> struct task_struct *task;
> - task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0);
> + task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0);
> if (!IS_ERR(task)) {
> init_idle_pids(task->pids);
> init_idle(task, cpu);
> @@ -1628,11 +1629,13 @@ struct task_struct *fork_idle(int cpu)
> * It copies the process, and if successful kick-starts
> * it and waits for it to finish using the VM if required.
> */
> -long do_fork(unsigned long clone_flags,
> - unsigned long stack_start,
> - unsigned long stack_size,
> - int __user *parent_tidptr,
> - int __user *child_tidptr)
> +static long _do_fork(
> + unsigned long clone_flags,
> + unsigned long stack_start,
> + unsigned long stack_size,
> + int __user *parent_tidptr,
> + int __user *child_tidptr,
> + unsigned long tls)
> {
> struct task_struct *p;
> int trace = 0;
> @@ -1657,7 +1660,7 @@ long do_fork(unsigned long clone_flags,
> }
>
> p = copy_process(clone_flags, stack_start, stack_size,
> - child_tidptr, NULL, trace);
> + child_tidptr, NULL, trace, tls);
> /*
> * Do this prior waking up the new thread - the thread pointer
> * might get invalid after that point, if the thread exits quickly.
> @@ -1698,20 +1701,34 @@ long do_fork(unsigned long clone_flags,
> return nr;
> }
>
> +#ifndef CONFIG_HAVE_COPY_THREAD_TLS
> +/* For compatibility with architectures that call do_fork directly rather than
> + * using the syscall entry points below. */
> +long do_fork(unsigned long clone_flags,
> + unsigned long stack_start,
> + unsigned long stack_size,
> + int __user *parent_tidptr,
> + int __user *child_tidptr)
> +{
> + return _do_fork(clone_flags, stack_start, stack_size,
> + parent_tidptr, child_tidptr, 0);
> +}
> +#endif
> +
> /*
> * Create a kernel thread.
> */
> pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
> {
> - return do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
> - (unsigned long)arg, NULL, NULL);
> + return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
> + (unsigned long)arg, NULL, NULL, 0);
> }
>
> #ifdef __ARCH_WANT_SYS_FORK
> SYSCALL_DEFINE0(fork)
> {
> #ifdef CONFIG_MMU
> - return do_fork(SIGCHLD, 0, 0, NULL, NULL);
> + return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
> #else
> /* can not support in nommu mode */
> return -EINVAL;
> @@ -1722,8 +1739,8 @@ SYSCALL_DEFINE0(fork)
> #ifdef __ARCH_WANT_SYS_VFORK
> SYSCALL_DEFINE0(vfork)
> {
> - return do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
> - 0, NULL, NULL);
> + return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
> + 0, NULL, NULL, 0);
> }
> #endif
>
> @@ -1731,27 +1748,27 @@ SYSCALL_DEFINE0(vfork)
> #ifdef CONFIG_CLONE_BACKWARDS
> SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
> int __user *, parent_tidptr,
> - int, tls_val,
> + unsigned long, tls,
> int __user *, child_tidptr)
> #elif defined(CONFIG_CLONE_BACKWARDS2)
> SYSCALL_DEFINE5(clone, unsigned long, newsp, unsigned long, clone_flags,
> int __user *, parent_tidptr,
> int __user *, child_tidptr,
> - int, tls_val)
> + unsigned long, tls)
> #elif defined(CONFIG_CLONE_BACKWARDS3)
> SYSCALL_DEFINE6(clone, unsigned long, clone_flags, unsigned long, newsp,
> int, stack_size,
> int __user *, parent_tidptr,
> int __user *, child_tidptr,
> - int, tls_val)
> + unsigned long, tls)
> #else
> SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
> int __user *, parent_tidptr,
> int __user *, child_tidptr,
> - int, tls_val)
> + unsigned long, tls)
> #endif
> {
> - return do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr);
> + return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
> }
> #endif
>
>

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/