[PATCH 1/2] exec: don't wait for zombie threads with cred_guard_mutex held

From: Oleg Nesterov
Date: Mon Feb 13 2017 - 09:15:30 EST


de_thread() waits for the other threads with ->cred_guard_mutex held, which is
really bad: the wait time is unbounded (a debugger can delay the exit
indefinitely), and this lock has a lot of users (mostly abusers, imo) in
fs/proc and elsewhere. Worse, it leads to a deadlock if the debugger itself
tries to take the same mutex:

#include <unistd.h>
#include <signal.h>
#include <sys/ptrace.h>
#include <pthread.h>

void *thread(void *arg)
{
	ptrace(PTRACE_TRACEME, 0, 0, 0);
	return NULL;
}

int main(void)
{
	int pid = fork();

	if (!pid) {
		pthread_t pt;
		pthread_create(&pt, NULL, thread, NULL);
		pthread_join(pt, NULL);
		execlp("echo", "echo", "passed", NULL);
	}

	sleep(1);
	// or anything else which needs ->cred_guard_mutex,
	// say open(/proc/$pid/mem)
	ptrace(PTRACE_ATTACH, pid, 0, 0);
	kill(pid, SIGCONT);

	return 0;
}
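
(Compile with "gcc -pthread". With the fix applied the child execs and prints
"passed"; without it the execve hangs, see below.)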

This hangs because de_thread() sleeps with cred_guard_mutex held, waiting for
the debugger to release the killed thread, while the debugger sleeps waiting
for the same mutex. Not really that bad in itself, since the tracer can still
be killed, but it is a bug nonetheless and people hit it in practice.

The patch changes flush_old_exec() to only wait until all the other threads
have passed exit_notify(), using the new kill_sub_threads() helper instead of
de_thread(). kill_sub_threads() sets ->notify_count to minus the number of
killed sub-threads; each of them increments the counter in exit_notify(), and
the last one wakes up the execing thread.

However, we still need to wait until the zombies actually disappear. The main
reason is that we can not unshare ->sighand until then: the execing thread and
the zombies must use the same sighand->siglock to serialize access to
->thread_head and friends. So the patch changes the ->load_binary() methods to
call de_thread() right after install_exec_creds() drops cred_guard_mutex.

Note that de_thread() is simplified: it no longer needs to send SIGKILL to the
sub-threads or wait for the group leader to become a zombie. But since it now
runs after flush_old_exec(), we also need to move flush_signal_handlers() from
setup_new_exec() into de_thread().
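
To make the resulting ordering concrete, here is a condensed sketch of the
control flow in a ->load_binary() method after this patch (just the diff
below in miniature, modelled on load_elf_binary()):

	retval = flush_old_exec(bprm);	/* kill_sub_threads(): zombies may remain */
	if (retval)
		goto out_free_dentry;
	...
	setup_new_exec(bprm);
	install_exec_creds(bprm);	/* drops cred_guard_mutex */
	retval = de_thread(current);	/* wait for the zombies, unshare ->sighand */
	if (retval)
		goto out_free_dentry;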

TODO:

- Move install_exec_creds() and de_thread() into setup_new_exec(), and
  unexport de_thread().

- Avoid tasklist_lock in kill_sub_threads(); we can change exit_notify()
  to set ->exit_state under siglock.

- Simplify/cleanup the ->notify_count logic, it is really ugly. I think we
  should just turn this counter into signal->nr_live_threads, and then
  __exit_signal() can use signal->nr_threads instead; see the sketch after
  this list.

- In any case we should limit the scope of cred_guard_mutex in the execve
  paths. It is not clear why we take it at the start of execve, and worse,
  it is not clear why we actually overload this mutex to exclude other
  threads (except for check_unsafe_exec(), but that is solvable). The
  original motivation was signal->in_exec_mm protection, but that idea was
  replaced by 3c77f8457221 ("exec: make argv/envp memory visible to
  oom-killer"). It is just ugly to call copy_strings() etc with this
  mutex held.
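
For the ->notify_count item above, a purely illustrative sketch of the
direction, assuming a hypothetical signal->nr_live_threads counter which
nothing in this patch actually adds:

	/* in exit_notify(): every exiting thread leaves the live count;
	 * the last sub-thread (leaving only the execing one) does the
	 * wakeup, so kill_sub_threads() would wait for nr_live_threads == 1.
	 */
	if (--sig->nr_live_threads == 1 && unlikely(sig->group_exit_task))
		wake_up_process(sig->group_exit_task);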

Reported-by: Ulrich Obergfell <uobergfe@xxxxxxxxxx>
Reported-by: Attila Fazekas <afazekas@xxxxxxxxxx>
Reported-by: Aleksa Sarai <asarai@xxxxxxxx>
Signed-off-by: Oleg Nesterov <oleg@xxxxxxxxxx>
---
arch/x86/ia32/ia32_aout.c | 3 ++
fs/binfmt_aout.c | 3 ++
fs/binfmt_elf.c | 6 ++-
fs/binfmt_elf_fdpic.c | 4 ++
fs/binfmt_flat.c | 3 ++
fs/exec.c | 128 +++++++++++++++++++++++-----------------------
include/linux/binfmts.h | 1 +
kernel/exit.c | 5 +-
kernel/signal.c | 3 +-
9 files changed, 87 insertions(+), 69 deletions(-)

diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 7c0a711..a6b9cc9 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -312,6 +312,9 @@ static int load_aout_binary(struct linux_binprm *bprm)
 		return retval;
 
 	install_exec_creds(bprm);
+	retval = de_thread(current);
+	if (retval)
+		return retval;
 
 	if (N_MAGIC(ex) == OMAGIC) {
 		unsigned long text_addr, map_size;
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 2a59139..edd1335 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -256,6 +256,9 @@ static int load_aout_binary(struct linux_binprm * bprm)
 		return retval;
 
 	install_exec_creds(bprm);
+	retval = de_thread(current);
+	if (retval)
+		return retval;
 
 	if (N_MAGIC(ex) == OMAGIC) {
 		unsigned long text_addr, map_size;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 4223702..79508f7 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -855,13 +855,17 @@ static int load_elf_binary(struct linux_binprm *bprm)
 	setup_new_exec(bprm);
 	install_exec_creds(bprm);
 
+	retval = de_thread(current);
+	if (retval)
+		goto out_free_dentry;
+
 	/* Do this so that we can load the interpreter, if need be. We will
 	   change some of these later */
 	retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
 				 executable_stack);
 	if (retval < 0)
 		goto out_free_dentry;
-	
+
 	current->mm->start_stack = bprm->p;
 
 	/* Now we do a little grungy work by mmapping the ELF image into
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index d2e36f8..75fd6d8 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -430,6 +430,10 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm)
 #endif
 
 	install_exec_creds(bprm);
+	retval = de_thread(current);
+	if (retval)
+		goto error;
+
 	if (create_elf_fdpic_tables(bprm, current->mm,
 				    &exec_params, &interp_params) < 0)
 		goto error;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 9b2917a..a0ad9a3 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -953,6 +953,9 @@ static int load_flat_binary(struct linux_binprm *bprm)
 	}
 
 	install_exec_creds(bprm);
+	res = de_thread(current);
+	if (res)
+		return res;
 
 	set_binfmt(&flat_format);
 
diff --git a/fs/exec.c b/fs/exec.c
index e579466..8591c56 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1036,13 +1036,62 @@ static int exec_mmap(struct mm_struct *mm)
 	return 0;
 }
 
+static int wait_for_notify_count(struct task_struct *tsk, struct signal_struct *sig)
+{
+	for (;;) {
+		if (unlikely(__fatal_signal_pending(tsk)))
+			goto killed;
+		set_current_state(TASK_KILLABLE);
+		if (!sig->notify_count)
+			break;
+		schedule();
+	}
+	__set_current_state(TASK_RUNNING);
+	return 0;
+
+killed:
+	/* protects against exit_notify() and __exit_signal() */
+	read_lock(&tasklist_lock);
+	sig->group_exit_task = NULL;
+	sig->notify_count = 0;
+	read_unlock(&tasklist_lock);
+	return -EINTR;
+}
+
+/*
+ * Kill all the sub-threads and wait until they all pass exit_notify().
+ */
+static int kill_sub_threads(struct task_struct *tsk)
+{
+	struct signal_struct *sig = tsk->signal;
+	int err = -EINTR;
+
+	if (thread_group_empty(tsk))
+		return 0;
+
+	read_lock(&tasklist_lock);
+	spin_lock_irq(&tsk->sighand->siglock);
+	if (!signal_group_exit(sig)) {
+		sig->group_exit_task = tsk;
+		sig->notify_count = -zap_other_threads(tsk);
+		err = 0;
+	}
+	spin_unlock_irq(&tsk->sighand->siglock);
+	read_unlock(&tasklist_lock);
+
+	if (!err)
+		err = wait_for_notify_count(tsk, sig);
+	return err;
+
+}
+
 /*
- * This function makes sure the current process has its own signal table,
- * so that flush_signal_handlers can later reset the handlers without
- * disturbing other processes. (Other processes might share the signal
- * table via the CLONE_SIGHAND option to clone().)
+ * This function makes sure the current process has no other threads and
+ * has a private signal table so that flush_signal_handlers() can reset
+ * the handlers without disturbing other processes which might share the
+ * signal table via the CLONE_SIGHAND option to clone().
  */
-static int de_thread(struct task_struct *tsk)
+int de_thread(struct task_struct *tsk)
 {
 	struct signal_struct *sig = tsk->signal;
 	struct sighand_struct *oldsighand = tsk->sighand;
@@ -1051,60 +1100,24 @@ static int de_thread(struct task_struct *tsk)
 	if (thread_group_empty(tsk))
 		goto no_thread_group;
 
-	/*
-	 * Kill all other threads in the thread group.
-	 */
 	spin_lock_irq(lock);
-	if (signal_group_exit(sig)) {
-		/*
-		 * Another group action in progress, just
-		 * return so that the signal is processed.
-		 */
-		spin_unlock_irq(lock);
-		return -EAGAIN;
-	}
-
-	sig->group_exit_task = tsk;
-	sig->notify_count = zap_other_threads(tsk);
+	sig->notify_count = sig->nr_threads;
 	if (!thread_group_leader(tsk))
 		sig->notify_count--;
-
-	while (sig->notify_count) {
-		__set_current_state(TASK_KILLABLE);
-		spin_unlock_irq(lock);
-		schedule();
-		if (unlikely(__fatal_signal_pending(tsk)))
-			goto killed;
-		spin_lock_irq(lock);
-	}
 	spin_unlock_irq(lock);
 
+	if (wait_for_notify_count(tsk, sig))
+		return -EINTR;
+
 	/*
 	 * At this point all other threads have exited, all we have to
-	 * do is to wait for the thread group leader to become inactive,
-	 * and to assume its PID:
+	 * do is to reap the old leader and assume its PID.
 	 */
 	if (!thread_group_leader(tsk)) {
 		struct task_struct *leader = tsk->group_leader;
 
-		for (;;) {
-			threadgroup_change_begin(tsk);
-			write_lock_irq(&tasklist_lock);
-			/*
-			 * Do this under tasklist_lock to ensure that
-			 * exit_notify() can't miss ->group_exit_task
-			 */
-			sig->notify_count = -1;
-			if (likely(leader->exit_state))
-				break;
-			__set_current_state(TASK_KILLABLE);
-			write_unlock_irq(&tasklist_lock);
-			threadgroup_change_end(tsk);
-			schedule();
-			if (unlikely(__fatal_signal_pending(tsk)))
-				goto killed;
-		}
-
+		threadgroup_change_begin(tsk);
+		write_lock_irq(&tasklist_lock);
 		/*
 		 * The only record we have of the real-time age of a
 		 * process, regardless of execs it's done, is start_time.
@@ -1162,10 +1175,9 @@ static int de_thread(struct task_struct *tsk)
 		release_task(leader);
 	}
 
+no_thread_group:
 	sig->group_exit_task = NULL;
 	sig->notify_count = 0;
-
-no_thread_group:
 	/* we have changed execution domain */
 	tsk->exit_signal = SIGCHLD;
 
@@ -1198,15 +1210,8 @@ static int de_thread(struct task_struct *tsk)
 	}
 
 	BUG_ON(!thread_group_leader(tsk));
+	flush_signal_handlers(current, 0);
 	return 0;
-
-killed:
-	/* protects against exit_notify() and __exit_signal() */
-	read_lock(&tasklist_lock);
-	sig->group_exit_task = NULL;
-	sig->notify_count = 0;
-	read_unlock(&tasklist_lock);
-	return -EAGAIN;
 }
 
 char *get_task_comm(char *buf, struct task_struct *tsk)
@@ -1237,11 +1242,7 @@ int flush_old_exec(struct linux_binprm * bprm)
 {
 	int retval;
 
-	/*
-	 * Make sure we have a private signal table and that
-	 * we are unassociated from the previous thread group.
-	 */
-	retval = de_thread(current);
+	retval = kill_sub_threads(current);
 	if (retval)
 		goto out;
 
@@ -1336,7 +1337,6 @@ void setup_new_exec(struct linux_binprm * bprm)
 	/* An exec changes our domain. We are no longer part of the thread
 	   group */
 	current->self_exec_id++;
-	flush_signal_handlers(current, 0);
 }
 EXPORT_SYMBOL(setup_new_exec);
 
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 1303b57..06a5a7b 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -101,6 +101,7 @@ extern int __must_check remove_arg_zero(struct linux_binprm *);
 extern int search_binary_handler(struct linux_binprm *);
 extern int flush_old_exec(struct linux_binprm * bprm);
 extern void setup_new_exec(struct linux_binprm * bprm);
+extern int de_thread(struct task_struct *tsk);
 extern void would_dump(struct linux_binprm *, struct file *);
 
 extern int suid_dumpable;
diff --git a/kernel/exit.c b/kernel/exit.c
index 8f14b86..169d9f2 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -699,8 +699,9 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	if (tsk->exit_state == EXIT_DEAD)
 		list_add(&tsk->ptrace_entry, &dead);
 
-	/* mt-exec, de_thread() is waiting for group leader */
-	if (unlikely(tsk->signal->notify_count < 0))
+	/* mt-exec, kill_sub_threads() is waiting for group exit */
+	if (unlikely(tsk->signal->notify_count < 0) &&
+	    !++tsk->signal->notify_count)
 		wake_up_process(tsk->signal->group_exit_task);
 	write_unlock_irq(&tasklist_lock);
 
diff --git a/kernel/signal.c b/kernel/signal.c
index 3603d93..b78ce63 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1200,13 +1200,12 @@ int zap_other_threads(struct task_struct *p)
 
 	while_each_thread(p, t) {
 		task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
-		count++;
-
 		/* Don't bother with already dead threads */
 		if (t->exit_state)
 			continue;
 		sigaddset(&t->pending.signal, SIGKILL);
 		signal_wake_up(t, 1);
+		count++;
 	}
 
 	return count;
--
2.5.0