Re: [PATCH 1/2] exec: don't wait for zombie threads with cred_guard_mutex held

From: Mika Penttilä
Date: Mon Feb 13 2017 - 12:28:15 EST



On 13.02.2017 16:15, Oleg Nesterov wrote:
> + retval = de_thread(current);
> + if (retval)
> + return retval;
>
> if (N_MAGIC(ex) == OMAGIC) {
> unsigned long text_addr, map_size;
> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> index 4223702..79508f7 100644
> --- a/fs/binfmt_elf.c
> +++ b/fs/binfmt_elf.c
> @@ -855,13 +855,17 @@ static int load_elf_binary(struct linux_binprm *bprm)
> setup_new_exec(bprm);
> install_exec_creds(bprm);
>
> + retval = de_thread(current);
> + if (retval)
> + goto out_free_dentry;
> +
> /* Do this so that we can load the interpreter, if need be. We will
> change some of these later */
> retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
> executable_stack);
> if (retval < 0)
> goto out_free_dentry;
> -
> +
> current->mm->start_stack = bprm->p;
>
> /* Now we do a little grungy work by mmapping the ELF image into
> diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
> index d2e36f8..75fd6d8 100644
> --- a/fs/binfmt_elf_fdpic.c
> +++ b/fs/binfmt_elf_fdpic.c
> @@ -430,6 +430,10 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
> #endif
>
> install_exec_creds(bprm);
> + retval = de_thread(current);
> + if (retval)
> + goto error;
> +
> if (create_elf_fdpic_tables(bprm, current->mm,
> &exec_params, &interp_params) < 0)
> goto error;
> diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
> index 9b2917a..a0ad9a3 100644
> --- a/fs/binfmt_flat.c
> +++ b/fs/binfmt_flat.c
> @@ -953,6 +953,9 @@ static int load_flat_binary(struct linux_binprm *bprm)
> }
>
> install_exec_creds(bprm);
> + res = de_thread(current);
> + if (res)
> + return res;
>
> set_binfmt(&flat_format);
>
> diff --git a/fs/exec.c b/fs/exec.c
> index e579466..8591c56 100644
> --- a/fs/exec.c
> +++ b/fs/exec.c
> @@ -1036,13 +1036,62 @@ static int exec_mmap(struct mm_struct *mm)
> return 0;
> }
>
> +static int wait_for_notify_count(struct task_struct *tsk, struct signal_struct *sig)
> +{
> + for (;;) {
> + if (unlikely(__fatal_signal_pending(tsk)))
> + goto killed;
> + set_current_state(TASK_KILLABLE);
> + if (!sig->notify_count)
> + break;
> + schedule();
> + }
> + __set_current_state(TASK_RUNNING);
> + return 0;
> +
> +killed:
> + /* protects against exit_notify() and __exit_signal() */
> + read_lock(&tasklist_lock);
> + sig->group_exit_task = NULL;
> + sig->notify_count = 0;
> + read_unlock(&tasklist_lock);
> + return -EINTR;
> +}
> +
> +/*
> + * Kill all the sub-threads and wait until they all pass exit_notify().
> + */
> +static int kill_sub_threads(struct task_struct *tsk)
> +{
> + struct signal_struct *sig = tsk->signal;
> + int err = -EINTR;
> +
> + if (thread_group_empty(tsk))
> + return 0;
> +
> + read_lock(&tasklist_lock);
> + spin_lock_irq(&tsk->sighand->siglock);
> + if (!signal_group_exit(sig)) {
> + sig->group_exit_task = tsk;
> + sig->notify_count = -zap_other_threads(tsk);
> + err = 0;
> + }
> + spin_unlock_irq(&tsk->sighand->siglock);
> + read_unlock(&tasklist_lock);
> +
> + if (!err)
> + err = wait_for_notify_count(tsk, sig);
> + return err;
> +
> +}
> +
> /*
> - * This function makes sure the current process has its own signal table,
> - * so that flush_signal_handlers can later reset the handlers without
> - * disturbing other processes. (Other processes might share the signal
> - * table via the CLONE_SIGHAND option to clone().)
> + * This function makes sure the current process has no other threads and
> + * has a private signal table so that flush_signal_handlers() can reset
> + * the handlers without disturbing other processes which might share the
> + * signal table via the CLONE_SIGHAND option to clone().
> */
> -static int de_thread(struct task_struct *tsk)
> +int de_thread(struct task_struct *tsk)
> {
> struct signal_struct *sig = tsk->signal;
> struct sighand_struct *oldsighand = tsk->sighand;
> @@ -1051,60 +1100,24 @@ static int de_thread(struct task_struct *tsk)
> if (thread_group_empty(tsk))
> goto no_thread_group;
>
> - /*
> - * Kill all other threads in the thread group.
> - */
> spin_lock_irq(lock);
> - if (signal_group_exit(sig)) {
> - /*
> - * Another group action in progress, just
> - * return so that the signal is processed.
> - */
> - spin_unlock_irq(lock);
> - return -EAGAIN;
> - }
> -
> - sig->group_exit_task = tsk;
> - sig->notify_count = zap_other_threads(tsk);
> + sig->notify_count = sig->nr_threads;


Maybe this should be nr_threads - 1, since nr_threads includes us?

+ sig->notify_count = sig->nr_threads - 1;



> if (!thread_group_leader(tsk))
> sig->notify_count--;
> -
> - while (sig->notify_count) {
> - __set_current_state(TASK_KILLABLE);
> - spin_unlock_irq(lock);
> - schedule();
> - if (unlikely(__fatal_signal_pending(tsk)))
> - goto killed;
> - spin_lock_irq(lock);
> - }
> spin_unlock_irq(lock);
>
> + if (wait_for_notify_count(tsk, sig))
> + return -EINTR;
> +
> /*
> * At this point all other threads have exited, all we have to
> - * do is to wait for the thread group leader to become inactive,
> - * and to assume its PID:
> + * do is to reap the old leader and assume its PID.
> */
> if (!thread_group_leader(tsk)) {
> struct task_struct *leader = tsk->group_leader;
>
> - for (;;) {
> - threadgroup_change_begin(tsk);
> - write_lock_irq(&tasklist_lock);
> - /*
> - * Do this under tasklist_lock to ensure that
> - * exit_notify() can't miss ->group_exit_task
> - */
> - sig->notify_count = -1;
> - if (likely(leader->exit_state))
> - break;
> - __set_current_state(TASK_KILLABLE);
> - write_unlock_irq(&tasklist_lock);
> - threadgroup_change_end(tsk);
> - schedule();
> - if (unlikely(__fatal_signal_pending(tsk)))
> - goto killed;
> - }
> -
> + threadgroup_change_begin(tsk);
> + write_lock_irq(&tasklist_lock);
> /*
> * The only record we have of the real-time age of a
> * process, regardless of execs it's done, is start_time.
> @@ -1162,10 +1175,9 @@ static int de_thread(struct task_struct *tsk)
> release_task(leader);
> }
>
> +no_thread_group:
> sig->group_exit_task = NULL;
> sig->notify_count = 0;
> -
> -no_thread_group:
> /* we have changed execution domain */
> tsk->exit_signal = SIGCHLD;
>
> @@ -1198,15 +1210,8 @@ static int de_thread(struct task_struct *tsk)
> }
>
> BUG_ON(!thread_group_leader(tsk));
> + flush_signal_handlers(current, 0);
> return 0;
> -
> -killed:
> - /* protects against exit_notify() and __exit_signal() */
> - read_lock(&tasklist_lock);
> - sig->group_exit_task = NULL;
> - sig->notify_count = 0;
> - read_unlock(&tasklist_lock);
> - return -EAGAIN;
> }
>
> char *get_task_comm(char *buf, struct task_struct *tsk)
> @@ -1237,11 +1242,7 @@ int flush_old_exec(struct linux_binprm * bprm)
> {
> int retval;
>
> - /*
> - * Make sure we have a private signal table and that
> - * we are unassociated from the previous thread group.
> - */
> - retval = de_thread(current);
> + retval = kill_sub_threads(current);
> if (retval)
> goto out;
>
> @@ -1336,7 +1337,6 @@ void setup_new_exec(struct linux_binprm * bprm)
> /* An exec changes our domain. We are no longer part of the thread
> group */
> current->self_exec_id++;
> - flush_signal_handlers(current, 0);
> }
> EXPORT_SYMBOL(setup_new_exec);
>
> diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
> index 1303b57..06a5a7b 100644
> --- a/include/linux/binfmts.h
> +++ b/include/linux/binfmts.h
> @@ -101,6 +101,7 @@ extern int __must_check remove_arg_zero(struct linux_binprm *);
> extern int search_binary_handler(struct linux_binprm *);
> extern int flush_old_exec(struct linux_binprm * bprm);
> extern void setup_new_exec(struct linux_binprm * bprm);
> +extern int de_thread(struct task_struct *tsk);
> extern void would_dump(struct linux_binprm *, struct file *);
>
> extern int suid_dumpable;
> diff --git a/kernel/exit.c b/kernel/exit.c
> index 8f14b86..169d9f2 100644
> --- a/kernel/exit.c
> +++ b/kernel/exit.c
> @@ -699,8 +699,9 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
> if (tsk->exit_state == EXIT_DEAD)
> list_add(&tsk->ptrace_entry, &dead);
>
> - /* mt-exec, de_thread() is waiting for group leader */
> - if (unlikely(tsk->signal->notify_count < 0))
> + /* mt-exec, kill_sub_threads() is waiting for group exit */
> + if (unlikely(tsk->signal->notify_count < 0) &&
> + !++tsk->signal->notify_count)
> wake_up_process(tsk->signal->group_exit_task);
> write_unlock_irq(&tasklist_lock);
>
> diff --git a/kernel/signal.c b/kernel/signal.c
> index 3603d93..b78ce63 100644
> --- a/kernel/signal.c
> +++ b/kernel/signal.c
> @@ -1200,13 +1200,12 @@ int zap_other_threads(struct task_struct *p)
>
> while_each_thread(p, t) {
> task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
> - count++;
> -
> /* Don't bother with already dead threads */
> if (t->exit_state)
> continue;
> sigaddset(&t->pending.signal, SIGKILL);
> signal_wake_up(t, 1);
> + count++;
> }
>
> return count;