Re: [PATCH v3 5/5] exec: Add a exec_update_mutex to replace cred_guard_mutex

From: Bernd Edlinger
Date: Tue Mar 17 2020 - 17:54:04 EST


On 3/17/20 9:56 AM, Kirill Tkhai wrote:
> On 14.03.2020 12:11, Bernd Edlinger wrote:
>> The cred_guard_mutex is problematic. The cred_guard_mutex is held
>> over the userspace accesses as the arguments from userspace are read.
>> The cred_guard_mutex is held over PTRACE_EVENT_EXIT as the other
>> threads are killed. The cred_guard_mutex is held over
>> "put_user(0, tsk->clear_child_tid)" in exit_mm().
>>
>> Any of those can result in deadlock, as the cred_guard_mutex is held
>> over a possibly indefinite wait for userspace.
>>
>> Add exec_update_mutex that is only held while exec updates the process
>> with the new contents of exec, so that code that must not be confused
>> by exec changing the mm and the cred in ways that cannot happen during
>> ordinary execution of a process has a lock it can take.
>>
>> The plan is to switch the users of cred_guard_mutex to
>> exec_update_mutex one by one. This lets us move forward while still
>> being careful and not introducing any regressions.
>>
>> Link: https://lore.kernel.org/lkml/20160921152946.GA24210@xxxxxxxxxxxxxx/
>> Link: https://lore.kernel.org/lkml/AM6PR03MB5170B06F3A2B75EFB98D071AE4E60@xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx/
>> Link: https://lore.kernel.org/linux-fsdevel/20161102181806.GB1112@xxxxxxxxxx/
>> Link: https://lore.kernel.org/lkml/20160923095031.GA14923@xxxxxxxxxx/
>> Link: https://lore.kernel.org/lkml/20170213141452.GA30203@xxxxxxxxxx/
>> Ref: 45c1a159b85b ("Add PTRACE_O_TRACEVFORKDONE and PTRACE_O_TRACEEXIT facilities.")
>> Ref: 456f17cd1a28 ("[PATCH] user-vm-unlock-2.5.31-A2")
>> Signed-off-by: "Eric W. Biederman" <ebiederm@xxxxxxxxxxxx>
>> Signed-off-by: Bernd Edlinger <bernd.edlinger@xxxxxxxxxx>
>> ---
>> fs/exec.c | 17 ++++++++++++++---
>> include/linux/binfmts.h | 8 +++++++-
>> include/linux/sched/signal.h | 9 ++++++++-
>> init/init_task.c | 1 +
>> kernel/fork.c | 1 +
>> 5 files changed, 31 insertions(+), 5 deletions(-)
>>
>> v3: this update fixes lock-order and adds an explicit data member in linux_binprm
>>
>> diff --git a/fs/exec.c b/fs/exec.c
>> index d820a72..11974a1 100644
>> --- a/fs/exec.c
>> +++ b/fs/exec.c
>> @@ -1014,12 +1014,17 @@ static int exec_mmap(struct mm_struct *mm)
>> {
>> struct task_struct *tsk;
>> struct mm_struct *old_mm, *active_mm;
>> + int ret;
>>
>> /* Notify parent that we're no longer interested in the old VM */
>> tsk = current;
>> old_mm = current->mm;
>> exec_mm_release(tsk, old_mm);
>>
>> + ret = mutex_lock_killable(&tsk->signal->exec_update_mutex);
>> + if (ret)
>> + return ret;
>> +
>> if (old_mm) {
>> sync_mm_rss(old_mm);
>> /*
>> @@ -1031,9 +1036,11 @@ static int exec_mmap(struct mm_struct *mm)
>> down_read(&old_mm->mmap_sem);
>> if (unlikely(old_mm->core_state)) {
>> up_read(&old_mm->mmap_sem);
>> + mutex_unlock(&tsk->signal->exec_update_mutex);
>> return -EINTR;
>> }
>> }
>> +
>> task_lock(tsk);
>> active_mm = tsk->active_mm;
>> membarrier_exec_mmap(mm);
>> @@ -1288,11 +1295,12 @@ int flush_old_exec(struct linux_binprm * bprm)
>> goto out;
>>
>> /*
>> - * After clearing bprm->mm (to mark that current is using the
>> - * prepared mm now), we have nothing left of the original
>> + * After setting bprm->called_exec_mmap (to mark that current is
>> + * using the prepared mm now), we have nothing left of the original
>> * process. If anything from here on returns an error, the check
>> * in search_binary_handler() will SEGV current.
>> */
>> + bprm->called_exec_mmap = 1;
>
> The two lines below form an inseparable pair:
>
> exec_mmap(bprm->mm);
> bprm->called_exec_mmap = 1;
>
> Why not move this into exec_mmap(), so that nobody can accidentally insert
> something between them?
>

Hmm, could be done, but then I would probably need a different name than
"called_exec_mmap".

How about adding a nice function comment to exec_mmap that calls out the
changed behaviour that the exec_update_mutex is taken unless the function
fails?


Bernd.


>> bprm->mm = NULL;
>>
>> #ifdef CONFIG_POSIX_TIMERS
>> @@ -1438,6 +1446,8 @@ static void free_bprm(struct linux_binprm *bprm)
>> {
>> free_arg_pages(bprm);
>> if (bprm->cred) {
>> + if (bprm->called_exec_mmap)
>> + mutex_unlock(&current->signal->exec_update_mutex);
>> mutex_unlock(&current->signal->cred_guard_mutex);
>> abort_creds(bprm->cred);
>> }
>> @@ -1487,6 +1497,7 @@ void install_exec_creds(struct linux_binprm *bprm)
>> * credentials; any time after this it may be unlocked.
>> */
>> security_bprm_committed_creds(bprm);
>> + mutex_unlock(&current->signal->exec_update_mutex);
>> mutex_unlock(&current->signal->cred_guard_mutex);
>> }
>> EXPORT_SYMBOL(install_exec_creds);
>> @@ -1678,7 +1689,7 @@ int search_binary_handler(struct linux_binprm *bprm)
>>
>> read_lock(&binfmt_lock);
>> put_binfmt(fmt);
>> - if (retval < 0 && !bprm->mm) {
>> + if (retval < 0 && bprm->called_exec_mmap) {
>> /* we got to flush_old_exec() and failed after it */
>> read_unlock(&binfmt_lock);
>> force_sigsegv(SIGSEGV);
>> diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
>> index b40fc63..a345d9f 100644
>> --- a/include/linux/binfmts.h
>> +++ b/include/linux/binfmts.h
>> @@ -44,7 +44,13 @@ struct linux_binprm {
>> * exec has happened. Used to sanitize execution environment
>> * and to set AT_SECURE auxv for glibc.
>> */
>> - secureexec:1;
>> + secureexec:1,
>> + /*
>> + * Set by flush_old_exec, when exec_mmap has been called.
>> + * This is past the point of no return, when the
>> + * exec_update_mutex has been taken.
>> + */
>> + called_exec_mmap:1;
>> #ifdef __alpha__
>> unsigned int taso:1;
>> #endif
>> diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
>> index 8805025..a29df79 100644
>> --- a/include/linux/sched/signal.h
>> +++ b/include/linux/sched/signal.h
>> @@ -224,7 +224,14 @@ struct signal_struct {
>>
>> struct mutex cred_guard_mutex; /* guard against foreign influences on
>> * credential calculations
>> - * (notably. ptrace) */
>> + * (notably. ptrace)
>> + * Deprecated do not use in new code.
>> + * Use exec_update_mutex instead.
>> + */
>> + struct mutex exec_update_mutex; /* Held while task_struct is being
>> + * updated during exec, and may have
>> + * inconsistent permissions.
>> + */
>> } __randomize_layout;
>>
>> /*
>> diff --git a/init/init_task.c b/init/init_task.c
>> index 9e5cbe5..bd403ed 100644
>> --- a/init/init_task.c
>> +++ b/init/init_task.c
>> @@ -26,6 +26,7 @@
>> .multiprocess = HLIST_HEAD_INIT,
>> .rlim = INIT_RLIMITS,
>> .cred_guard_mutex = __MUTEX_INITIALIZER(init_signals.cred_guard_mutex),
>> + .exec_update_mutex = __MUTEX_INITIALIZER(init_signals.exec_update_mutex),
>> #ifdef CONFIG_POSIX_TIMERS
>> .posix_timers = LIST_HEAD_INIT(init_signals.posix_timers),
>> .cputimer = {
>> diff --git a/kernel/fork.c b/kernel/fork.c
>> index 8642530..036b692 100644
>> --- a/kernel/fork.c
>> +++ b/kernel/fork.c
>> @@ -1594,6 +1594,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
>> sig->oom_score_adj_min = current->signal->oom_score_adj_min;
>>
>> mutex_init(&sig->cred_guard_mutex);
>> + mutex_init(&sig->exec_update_mutex);
>>
>> return 0;
>> }
>>
>