[PATCH 6/6] coredump: Limit coredumps to a single thread group

From: Eric W. Biederman
Date: Thu Sep 23 2021 - 20:12:47 EST



Today when a signal is delivered with a handler of SIG_DFL whose
default behavior is to generate a core dump not only that process but
every process that shares the mm is killed.

In the case of vfork this looks like a real world problem. Consider
the following well defined sequence.

if (vfork() == 0) {
execve(...);
_exit(EXIT_FAILURE);
}

If a signal that generates a core dump is received after vfork but
before the execve changes the mm the process that called vfork will
also be killed (as the mm is shared).

Similarly if the execve fails after the point of no return the kernel
delivers SIGSEGV which will kill both the exec'ing process and because
the mm is shared the process that called vfork as well.

As far as I can tell this behavior is a violation of people's
reasonable expectations, POSIX, and is unnecessarily fragile when the
system is low on memory.

Solve this by making a userspace visible change to only kill a single
process/thread group. This is possible because Jann Horn recently
modified[1] the coredump code so that the mm can safely be modified
while the coredump is happening. With LinuxThreads long gone I don't
expect anyone to have a notice this behavior change in practice.

To accomplish this move the core_state pointer from mm_struct to
signal_struct, which allows different thread groups to coredump
simultatenously.

In zap_threads remove the work to kill anything except for the current
thread group.

[1] a07279c9a8cd ("binfmt_elf, binfmt_elf_fdpic: use a VMA list snapshot")
Fixes: d89f3847def4 ("[PATCH] thread-aware coredumps, 2.5.43-C3")
History-tree: git://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git
Signed-off-by: "Eric W. Biederman" <ebiederm@xxxxxxxxxxxx>
---
fs/binfmt_elf.c | 4 +-
fs/binfmt_elf_fdpic.c | 2 +-
fs/coredump.c | 84 ++++--------------------------------
fs/proc/array.c | 6 +--
include/linux/mm_types.h | 13 ------
include/linux/sched/signal.h | 13 ++++++
kernel/exit.c | 13 ++----
kernel/fork.c | 1 -
8 files changed, 32 insertions(+), 104 deletions(-)

diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 69d900a8473d..796e5327ee7d 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1834,7 +1834,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
/*
* Allocate a structure for each thread.
*/
- for (ct = &dump_task->mm->core_state->dumper; ct; ct = ct->next) {
+ for (ct = &dump_task->signal->core_state->dumper; ct; ct = ct->next) {
t = kzalloc(offsetof(struct elf_thread_core_info,
notes[info->thread_notes]),
GFP_KERNEL);
@@ -2024,7 +2024,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
if (!elf_note_info_init(info))
return 0;

- for (ct = current->mm->core_state->dumper.next;
+ for (ct = current->signal->core_state->dumper.next;
ct; ct = ct->next) {
ets = kzalloc(sizeof(*ets), GFP_KERNEL);
if (!ets)
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 6d8fd6030cbb..c6f588dc4a9d 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1494,7 +1494,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size))
goto end_coredump;

- for (ct = current->mm->core_state->dumper.next;
+ for (ct = current->signal->core_state->dumper.next;
ct; ct = ct->next) {
tmp = elf_dump_thread_status(cprm->siginfo->si_signo,
ct->task, &thread_status_size);
diff --git a/fs/coredump.c b/fs/coredump.c
index d576287fb88b..a6b3c196cdef 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -369,99 +369,34 @@ static int zap_process(struct task_struct *start, int exit_code, int flags)
return nr;
}

-static int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
+static int zap_threads(struct task_struct *tsk,
struct core_state *core_state, int exit_code)
{
- struct task_struct *g, *p;
- unsigned long flags;
int nr = -EAGAIN;

spin_lock_irq(&tsk->sighand->siglock);
if (!signal_group_exit(tsk->signal)) {
- mm->core_state = core_state;
+ tsk->signal->core_state = core_state;
tsk->signal->group_exit_task = tsk;
nr = zap_process(tsk, exit_code, 0);
clear_tsk_thread_flag(tsk, TIF_SIGPENDING);
+ tsk->flags |= PF_DUMPCORE;
+ atomic_set(&core_state->nr_threads, nr);
}
spin_unlock_irq(&tsk->sighand->siglock);
- if (unlikely(nr < 0))
- return nr;
-
- tsk->flags |= PF_DUMPCORE;
- if (atomic_read(&mm->mm_users) == nr + 1)
- goto done;
- /*
- * We should find and kill all tasks which use this mm, and we should
- * count them correctly into ->nr_threads. We don't take tasklist
- * lock, but this is safe wrt:
- *
- * fork:
- * None of sub-threads can fork after zap_process(leader). All
- * processes which were created before this point should be
- * visible to zap_threads() because copy_process() adds the new
- * process to the tail of init_task.tasks list, and lock/unlock
- * of ->siglock provides a memory barrier.
- *
- * do_exit:
- * The caller holds mm->mmap_lock. This means that the task which
- * uses this mm can't pass coredump_task_exit(), so it can't exit
- * or clear its ->mm.
- *
- * de_thread:
- * It does list_replace_rcu(&leader->tasks, &current->tasks),
- * we must see either old or new leader, this does not matter.
- * However, it can change p->sighand, so lock_task_sighand(p)
- * must be used. Since p->mm != NULL and we hold ->mmap_lock
- * it can't fail.
- *
- * Note also that "g" can be the old leader with ->mm == NULL
- * and already unhashed and thus removed from ->thread_group.
- * This is OK, __unhash_process()->list_del_rcu() does not
- * clear the ->next pointer, we will find the new leader via
- * next_thread().
- */
- rcu_read_lock();
- for_each_process(g) {
- if (g == tsk->group_leader)
- continue;
- if (g->flags & PF_KTHREAD)
- continue;
-
- for_each_thread(g, p) {
- if (unlikely(!p->mm))
- continue;
- if (unlikely(p->mm == mm)) {
- lock_task_sighand(p, &flags);
- nr += zap_process(p, exit_code,
- SIGNAL_GROUP_EXIT);
- unlock_task_sighand(p, &flags);
- }
- break;
- }
- }
- rcu_read_unlock();
-done:
- atomic_set(&core_state->nr_threads, nr);
return nr;
}

static int coredump_wait(int exit_code, struct core_state *core_state)
{
struct task_struct *tsk = current;
- struct mm_struct *mm = tsk->mm;
int core_waiters = -EBUSY;

init_completion(&core_state->startup);
core_state->dumper.task = tsk;
core_state->dumper.next = NULL;

- if (mmap_write_lock_killable(mm))
- return -EINTR;
-
- if (!mm->core_state)
- core_waiters = zap_threads(tsk, mm, core_state, exit_code);
- mmap_write_unlock(mm);
-
+ core_waiters = zap_threads(tsk, core_state, exit_code);
if (core_waiters > 0) {
struct core_thread *ptr;

@@ -483,7 +418,7 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
return core_waiters;
}

-static void coredump_finish(struct mm_struct *mm, bool core_dumped)
+static void coredump_finish(bool core_dumped)
{
struct core_thread *curr, *next;
struct task_struct *task;
@@ -493,9 +428,10 @@ static void coredump_finish(struct mm_struct *mm, bool core_dumped)
current->signal->group_exit_code |= 0x80;
current->signal->group_exit_task = NULL;
current->signal->flags = SIGNAL_GROUP_EXIT;
+ next = current->signal->core_state->dumper.next;
+ current->signal->core_state = NULL;
spin_unlock_irq(&current->sighand->siglock);

- next = mm->core_state->dumper.next;
while ((curr = next) != NULL) {
next = curr->next;
task = curr->task;
@@ -507,8 +443,6 @@ static void coredump_finish(struct mm_struct *mm, bool core_dumped)
curr->task = NULL;
wake_up_process(task);
}
-
- mm->core_state = NULL;
}

static bool dump_interrupted(void)
@@ -839,7 +773,7 @@ void do_coredump(const kernel_siginfo_t *siginfo)
fail_unlock:
kfree(argv);
kfree(cn.corename);
- coredump_finish(mm, core_dumped);
+ coredump_finish(core_dumped);
revert_creds(old_cred);
fail_creds:
put_cred(cred);
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 49be8c8ef555..520c51be1e57 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -408,9 +408,9 @@ static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
cpumask_pr_args(&task->cpus_mask));
}

-static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm)
+static inline void task_core_dumping(struct seq_file *m, struct task_struct *task)
{
- seq_put_decimal_ull(m, "CoreDumping:\t", !!mm->core_state);
+ seq_put_decimal_ull(m, "CoreDumping:\t", !!task->signal->core_state);
seq_putc(m, '\n');
}

@@ -436,7 +436,7 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,

if (mm) {
task_mem(m, mm);
- task_core_dumping(m, mm);
+ task_core_dumping(m, task);
task_thp_status(m, mm);
mmput(mm);
}
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 7f8ee09c711f..1039f6ae922c 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -387,17 +387,6 @@ struct vm_area_struct {
struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
} __randomize_layout;

-struct core_thread {
- struct task_struct *task;
- struct core_thread *next;
-};
-
-struct core_state {
- atomic_t nr_threads;
- struct core_thread dumper;
- struct completion startup;
-};
-
struct kioctx_table;
struct mm_struct {
struct {
@@ -518,8 +507,6 @@ struct mm_struct {

unsigned long flags; /* Must use atomic bitops to access */

- struct core_state *core_state; /* coredumping support */
-
#ifdef CONFIG_AIO
spinlock_t ioctx_lock;
struct kioctx_table __rcu *ioctx_table;
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index e5f4ce622ee6..a8fe2a593a3a 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -72,6 +72,17 @@ struct multiprocess_signals {
struct hlist_node node;
};

+struct core_thread {
+ struct task_struct *task;
+ struct core_thread *next;
+};
+
+struct core_state {
+ atomic_t nr_threads;
+ struct core_thread dumper;
+ struct completion startup;
+};
+
/*
* NOTE! "signal_struct" does not have its own
* locking, because a shared signal_struct always
@@ -110,6 +121,8 @@ struct signal_struct {
int group_stop_count;
unsigned int flags; /* see SIGNAL_* flags below */

+ struct core_state *core_state; /* coredumping support */
+
/*
* PR_SET_CHILD_SUBREAPER marks a process, like a service
* manager, to re-parent orphan (double-forking) child processes
diff --git a/kernel/exit.c b/kernel/exit.c
index 774e6b5061b8..2b355e926c13 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -342,23 +342,18 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
static void coredump_task_exit(struct task_struct *tsk)
{
struct core_state *core_state;
- struct mm_struct *mm;
-
- mm = tsk->mm;
- if (!mm)
- return;

/*
* Serialize with any possible pending coredump.
- * We must hold mmap_lock around checking core_state
+ * We must hold siglock around checking core_state
* and setting PF_POSTCOREDUMP. The core-inducing thread
* will increment ->nr_threads for each thread in the
* group without PF_POSTCOREDUMP set.
*/
- mmap_read_lock(mm);
+ spin_lock_irq(&tsk->sighand->siglock);
tsk->flags |= PF_POSTCOREDUMP;
- core_state = mm->core_state;
- mmap_read_unlock(mm);
+ core_state = tsk->signal->core_state;
+ spin_unlock_irq(&tsk->sighand->siglock);
if (core_state) {
struct core_thread self;

diff --git a/kernel/fork.c b/kernel/fork.c
index 9bd9f2da9e41..c8adb76982f7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1044,7 +1044,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
seqcount_init(&mm->write_protect_seq);
mmap_init_lock(mm);
INIT_LIST_HEAD(&mm->mmlist);
- mm->core_state = NULL;
mm_pgtables_bytes_init(mm);
mm->map_count = 0;
mm->locked_vm = 0;
--
2.20.1