[RFC PATCH 1/6] ptrace: Keep mm around after exit_mm() for __ptrace_may_access()

From: Jann Horn
Date: Thu Oct 15 2020 - 22:40:38 EST


__ptrace_may_access() checks can happen on target tasks that are in the
middle of do_exit(), past exit_mm(). At that point, the ->mm pointer has
been NULLed out, and the task's reference on the mm_struct has been
dropped with mmput().

Unfortunately, the mm_struct contains the dumpability and the user_ns in
which the task last went through execve(), and we need those for
__ptrace_may_access(). Currently, that problem is handled by failing open:
if the ->mm is gone, we assume that the task was dumpable. In some edge
cases, this could potentially grant access to things like
/proc/$pid/fd/$fd of originally non-dumpable processes.
(exit_files() comes after exit_mm(), so the file descriptor table is still
there when we've gone through exit_mm().)

One way to fix this would be to move mm->user_ns and the dumpability state
over into the task_struct. However, that gets quite ugly if we want to
preserve existing semantics because e.g. PR_SET_DUMPABLE and commit_creds()
would then have to scan through all tasks sharing the mm_struct and keep
them in sync manually - that'd be a bit error-prone and overcomplicated.

(Moving these things into the signal_struct is not an option because that
is kept across executions, and pre-execve co-threads will share the
signal_struct that is also used by the task that has gone through
execve().)

I believe that this patch may be the least bad option to fix this - keep
the mm_struct (but not process memory) around with an mmgrab() reference
from exit_mm() until the task goes away completely.

Note that this moves free_task() down in order to make mmdrop_async()
available without a forward declaration.

Cc: stable@xxxxxxxxxxxxxxx
Fixes: bfedb589252c ("mm: Add a user_ns owner to mm_struct and fix ptrace permission checks")
Signed-off-by: Jann Horn <jannh@xxxxxxxxxx>
---
include/linux/sched.h | 8 +++++++
kernel/exit.c | 2 ++
kernel/fork.c | 54 ++++++++++++++++++++++---------------------
kernel/ptrace.c | 10 ++++++++
4 files changed, 48 insertions(+), 26 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index afe01e232935..55bec6ff5626 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -747,6 +747,14 @@ struct task_struct {

struct mm_struct *mm;
struct mm_struct *active_mm;
+ /*
+ * When we exit and ->mm (the reference pinning ->mm's address space)
+ * goes away, we stash a reference to the mm_struct itself (counted via
+ * exit_mm->mm_count) in this member.
+ * This allows us to continue using the mm_struct for security checks
+ * and such even after the task has started exiting.
+ */
+ struct mm_struct *exit_mm;

/* Per-thread vma caching: */
struct vmacache vmacache;
diff --git a/kernel/exit.c b/kernel/exit.c
index 733e80f334e7..97253ef33486 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -476,6 +476,8 @@ static void exit_mm(void)
/* more a memory barrier than a real lock */
task_lock(current);
current->mm = NULL;
+ mmgrab(mm); /* for current->exit_mm */
+ current->exit_mm = mm;
mmap_read_unlock(mm);
enter_lazy_tlb(mm, current);
task_unlock(current);
diff --git a/kernel/fork.c b/kernel/fork.c
index da8d360fb032..4942428a217c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -438,32 +438,6 @@ void put_task_stack(struct task_struct *tsk)
}
#endif

-void free_task(struct task_struct *tsk)
-{
- scs_release(tsk);
-
-#ifndef CONFIG_THREAD_INFO_IN_TASK
- /*
- * The task is finally done with both the stack and thread_info,
- * so free both.
- */
- release_task_stack(tsk);
-#else
- /*
- * If the task had a separate stack allocation, it should be gone
- * by now.
- */
- WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
-#endif
- rt_mutex_debug_task_free(tsk);
- ftrace_graph_exit_task(tsk);
- arch_release_task_struct(tsk);
- if (tsk->flags & PF_KTHREAD)
- free_kthread_struct(tsk);
- free_task_struct(tsk);
-}
-EXPORT_SYMBOL(free_task);
-
#ifdef CONFIG_MMU
static __latent_entropy int dup_mmap(struct mm_struct *mm,
struct mm_struct *oldmm)
@@ -722,6 +696,34 @@ static inline void put_signal_struct(struct signal_struct *sig)
free_signal_struct(sig);
}

+void free_task(struct task_struct *tsk)
+{
+ scs_release(tsk);
+
+#ifndef CONFIG_THREAD_INFO_IN_TASK
+ /*
+ * The task is finally done with both the stack and thread_info,
+ * so free both.
+ */
+ release_task_stack(tsk);
+#else
+ /*
+ * If the task had a separate stack allocation, it should be gone
+ * by now.
+ */
+ WARN_ON_ONCE(refcount_read(&tsk->stack_refcount) != 0);
+#endif
+ rt_mutex_debug_task_free(tsk);
+ ftrace_graph_exit_task(tsk);
+ arch_release_task_struct(tsk);
+ if (tsk->flags & PF_KTHREAD)
+ free_kthread_struct(tsk);
+ if (tsk->exit_mm)
+ mmdrop_async(tsk->exit_mm);
+ free_task_struct(tsk);
+}
+EXPORT_SYMBOL(free_task);
+
void __put_task_struct(struct task_struct *tsk)
{
WARN_ON(!tsk->exit_state);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 43d6179508d6..0aedc6cf5bdc 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -342,7 +342,17 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
* Pairs with a write barrier in commit_creds().
*/
smp_rmb();
+ /*
+ * Look up the target task's mm_struct. If it fails because the task is
+ * exiting and has gone through exit_mm(), we can instead use ->exit_mm
+ * as long as we only use members that are preserved by an mmgrab()
+ * reference.
+ * The only case in which both ->mm and ->exit_mm can be NULL should be
+ * kernel threads.
+ */
mm = task->mm;
+ if (!mm)
+ mm = task->exit_mm;
if (mm &&
((get_dumpable(mm) != SUID_DUMP_USER) &&
!ptrace_has_cap(cred, mm->user_ns, mode)))
--
2.29.0.rc1.297.gfa9743e501-goog