[PATCH 1/2] fs/exec: Explicitly unshare fs_struct on exec

From: Kees Cook
Date: Thu Oct 06 2022 - 04:28:52 EST


The check_unsafe_exec() counting of n_fs would not add up under a heavily
threaded process trying to perform a suid exec, causing the suid portion
to fail. This counting error appears to be unneeded, but to catch any
possible conditions, explicitly unshare fs_struct on exec, if it ends up
needing to happen. This means fs->in_exec must be removed (since it would
never get cleared in the old copy), and is specifically no longer needed.

See also commit 498052bba55e ("New locking/refcounting for fs_struct").

This additionally allows the "in_exec" member to be dropped, showing an
8 bytes savings per fs_struct on 64-bit. Before:

struct fs_struct {
int users; /* 0 4 */
spinlock_t lock; /* 4 4 */
seqcount_spinlock_t seq; /* 8 4 */
int umask; /* 12 4 */
int in_exec; /* 16 4 */

/* XXX 4 bytes hole, try to pack */

struct path root; /* 24 16 */
struct path pwd; /* 40 16 */

/* size: 56, cachelines: 1, members: 7 */
/* sum members: 52, holes: 1, sum holes: 4 */
/* last cacheline: 56 bytes */
};

After:

struct fs_struct {
int users; /* 0 4 */
spinlock_t lock; /* 4 4 */
seqcount_spinlock_t seq; /* 8 4 */
int umask; /* 12 4 */
struct path root; /* 16 16 */
struct path pwd; /* 32 16 */

/* size: 48, cachelines: 1, members: 6 */
/* last cacheline: 48 bytes */
};

Reported-by: Jorge Merlino <jorge.merlino@xxxxxxxxxxxxx>
Link: https://lore.kernel.org/lkml/20220910211215.140270-1-jorge.merlino@xxxxxxxxxxxxx/
Cc: Eric Biederman <ebiederm@xxxxxxxxxxxx>
Cc: Alexander Viro <viro@xxxxxxxxxxxxxxxxxx>
Cc: "Christian Brauner (Microsoft)" <brauner@xxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Andy Lutomirski <luto@xxxxxxxxxx>
Cc: Sebastian Andrzej Siewior <bigeasy@xxxxxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: linux-mm@xxxxxxxxx
Cc: linux-fsdevel@xxxxxxxxxxxxxxx
Signed-off-by: Kees Cook <keescook@xxxxxxxxxxxx>
---
fs/exec.c | 9 +++---
fs/fs_struct.c | 1 -
include/linux/fdtable.h | 1 +
include/linux/fs_struct.h | 1 -
kernel/fork.c | 62 ++++++++++++++++++++++++++-------------
5 files changed, 48 insertions(+), 26 deletions(-)

diff --git a/fs/exec.c b/fs/exec.c
index 902bce45b116..7d5f63f03c58 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1272,6 +1272,11 @@ int begin_new_exec(struct linux_binprm * bprm)
if (retval)
goto out;

+ /* Ensure the fs_struct is not shared. */
+ retval = unshare_fs();
+ if (retval)
+ goto out;
+
/*
* Must be called _before_ exec_mmap() as bprm->mm is
* not visible until then. This also enables the update
@@ -1583,8 +1588,6 @@ static void check_unsafe_exec(struct linux_binprm *bprm)

if (p->fs->users > n_fs)
bprm->unsafe |= LSM_UNSAFE_SHARE;
- else
- p->fs->in_exec = 1;
spin_unlock(&p->fs->lock);
}

@@ -1843,7 +1846,6 @@ static int bprm_execve(struct linux_binprm *bprm,
goto out;

/* execve succeeded */
- current->fs->in_exec = 0;
current->in_execve = 0;
rseq_execve(current);
acct_update_integrals(current);
@@ -1861,7 +1863,6 @@ static int bprm_execve(struct linux_binprm *bprm,
force_fatal_sig(SIGSEGV);

out_unmark:
- current->fs->in_exec = 0;
current->in_execve = 0;

return retval;
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 04b3f5b9c629..c27c67023d01 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -115,7 +115,6 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
/* We don't need to lock fs - think why ;-) */
if (fs) {
fs->users = 1;
- fs->in_exec = 0;
spin_lock_init(&fs->lock);
seqcount_spinlock_init(&fs->seq, &fs->lock);
fs->umask = old->umask;
diff --git a/include/linux/fdtable.h b/include/linux/fdtable.h
index e066816f3519..fbfb89ee3bda 100644
--- a/include/linux/fdtable.h
+++ b/include/linux/fdtable.h
@@ -117,6 +117,7 @@ struct task_struct;

void put_files_struct(struct files_struct *fs);
int unshare_files(void);
+int unshare_fs(void);
struct files_struct *dup_fd(struct files_struct *, unsigned, int *) __latent_entropy;
void do_close_on_exec(struct files_struct *);
int iterate_fd(struct files_struct *, unsigned,
diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h
index 783b48dedb72..08b82a90ceae 100644
--- a/include/linux/fs_struct.h
+++ b/include/linux/fs_struct.h
@@ -11,7 +11,6 @@ struct fs_struct {
spinlock_t lock;
seqcount_spinlock_t seq;
int umask;
- int in_exec;
struct path root, pwd;
} __randomize_layout;

diff --git a/kernel/fork.c b/kernel/fork.c
index b4a799d9c50f..53b7248f7a4b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1589,10 +1589,6 @@ static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
if (clone_flags & CLONE_FS) {
/* tsk->fs is already what we want */
spin_lock(&fs->lock);
- if (fs->in_exec) {
- spin_unlock(&fs->lock);
- return -EAGAIN;
- }
fs->users++;
spin_unlock(&fs->lock);
return 0;
@@ -3070,10 +3066,9 @@ static int check_unshare_flags(unsigned long unshare_flags)
return 0;
}

-/*
- * Unshare the filesystem structure if it is being shared
- */
-static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
+/* Allocate an fs_struct if it is currently shared and CLONE_FS requested. */
+static int unshare_fs_alloc(unsigned long unshare_flags,
+ struct fs_struct **new_fsp)
{
struct fs_struct *fs = current->fs;

@@ -3091,6 +3086,41 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
return 0;
}

+/* Swap out fs_struct, returning old fs if it needs freeing. */
+static void unshare_fs_finalize(struct fs_struct **new_fsp)
+{
+ struct task_struct *task = current;
+ struct fs_struct *fs = task->fs;
+
+ spin_lock(&fs->lock);
+ task->fs = *new_fsp;
+ if (--fs->users)
+ *new_fsp = NULL;
+ else
+ *new_fsp = fs;
+ spin_unlock(&fs->lock);
+}
+
+/*
+ * Unshare the filesystem structure if it is being shared
+ */
+int unshare_fs(void)
+{
+ struct fs_struct *new_fs = NULL;
+ int error;
+
+ error = unshare_fs_alloc(CLONE_FS, &new_fs);
+ if (error || !new_fs)
+ return error;
+
+ unshare_fs_finalize(&new_fs);
+
+ if (new_fs)
+ free_fs_struct(new_fs);
+
+ return 0;
+}
+
/*
* Unshare file descriptor table if it is being shared
*/
@@ -3120,7 +3150,7 @@ int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
*/
int ksys_unshare(unsigned long unshare_flags)
{
- struct fs_struct *fs, *new_fs = NULL;
+ struct fs_struct *new_fs = NULL;
struct files_struct *new_fd = NULL;
struct cred *new_cred = NULL;
struct nsproxy *new_nsproxy = NULL;
@@ -3159,7 +3189,7 @@ int ksys_unshare(unsigned long unshare_flags)
*/
if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
do_sysvsem = 1;
- err = unshare_fs(unshare_flags, &new_fs);
+ err = unshare_fs_alloc(unshare_flags, &new_fs);
if (err)
goto bad_unshare_out;
err = unshare_fd(unshare_flags, NR_OPEN_MAX, &new_fd);
@@ -3197,16 +3227,8 @@ int ksys_unshare(unsigned long unshare_flags)

task_lock(current);

- if (new_fs) {
- fs = current->fs;
- spin_lock(&fs->lock);
- current->fs = new_fs;
- if (--fs->users)
- new_fs = NULL;
- else
- new_fs = fs;
- spin_unlock(&fs->lock);
- }
+ if (new_fs)
+ unshare_fs_finalize(&new_fs);

if (new_fd)
swap(current->files, new_fd);
--
2.34.1