Re: [PATCH] fs/exec.c: Add fast path for ENOENT on PATH search before allocating mm

From: Kees Cook
Date: Fri Sep 16 2022 - 20:11:26 EST


[Hi Peter, apologies for dumping you into the middle of this thread.
I've got a question about sched_exec() below...]

On Fri, Sep 16, 2022 at 09:13:44PM +0100, Josh Triplett wrote:
> musl does the same thing, as do python and perl (likely via execvp or
> posix_spawnp). As does gcc when it executes `as`. And I've seen more
> than a few programs hand-implement a PATH search the same way. Seems
> worth optimizing for.

Yeah, it does seem like a simple way to eliminate needless work, though
I'd really like to see some kind of perf count of "in a given kernel
build, how many execve() system calls fail due to path search vs succeed",
just to get a better sense of the scale of the problem.

I don't like the idea of penalizing the _succeeding_ case, though, which
happens if we do the path walk twice. So, I went and refactoring the setup
order, moving the do_open_execat() up into alloc_bprm() instead of where
it was in bprm_exec(). The result makes it so it is, as you observed,
before the mm creation and generally expensive argument copying. The
difference to your patch seems to only be the allocation of the file
table entry, but avoids the double lookup, so I'm hoping the result is
actually even faster.

This cleanup is actually quite satisfying organizationally too -- the
fd and filename were passed around rather oddly.

The interaction with sched_exec() should be no worse (the file is opened
before it in either case), but in reading that function, it talks about
taking the opportunity to move the process to another CPU (IIUC) since,
paraphrasing, "it is at its lowest memory/cache size." But I wonder if
there is an existing accidental pessimistic result in that the process
stack has already been allocated. I am only passingly familiar with how
tasks get moved around under NUMA -- is the scheduler going to move
this process onto a different NUMA node and now it will be forced to
have the userspace process stack on one node and the program text and
heap on another? Or is that totally lost in the noise?

More specifically, I was wondering if processes would benefit from having
sched_exec() moved before the mm creation?

Regardless, here's a very lightly tested patch. Can you take this for a
spin and check your benchmark? Thanks!

-Kees

diff --git a/fs/exec.c b/fs/exec.c
index 9a5ca7b82bfc..5534301d67ca 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -898,6 +898,10 @@ EXPORT_SYMBOL(transfer_args_to_stack);

#endif /* CONFIG_MMU */

+/*
+ * On success, callers must call do_close_execat() on the returned
+ * struct file.
+ */
static struct file *do_open_execat(int fd, struct filename *name, int flags)
{
struct file *file;
@@ -945,6 +949,16 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
return ERR_PTR(err);
}

+/**
+ * open_exec - Open a path name for execution
+ *
+ * @name: path name to open with the intent of executing it.
+ *
+ * Returns ERR_PTR on failure or allocated struct file on success.
+ *
+ * As this is a wrapper for the internal do_open_execat(), callers
+ * must call allow_write_access() before fput() on release.
+ */
struct file *open_exec(const char *name)
{
struct filename *filename = getname_kernel(name);
@@ -1485,6 +1499,15 @@ static int prepare_bprm_creds(struct linux_binprm *bprm)
return -ENOMEM;
}

+/* Matches do_open_execat() */
+static void do_close_execat(struct file *file)
+{
+ if (!file)
+ return;
+ allow_write_access(file);
+ fput(file);
+}
+
static void free_bprm(struct linux_binprm *bprm)
{
if (bprm->mm) {
@@ -1496,10 +1519,7 @@ static void free_bprm(struct linux_binprm *bprm)
mutex_unlock(&current->signal->cred_guard_mutex);
abort_creds(bprm->cred);
}
- if (bprm->file) {
- allow_write_access(bprm->file);
- fput(bprm->file);
- }
+ do_close_execat(bprm->file);
if (bprm->executable)
fput(bprm->executable);
/* If a binfmt changed the interp, free it. */
@@ -1509,12 +1529,26 @@ static void free_bprm(struct linux_binprm *bprm)
kfree(bprm);
}

-static struct linux_binprm *alloc_bprm(int fd, struct filename *filename)
+static struct linux_binprm *alloc_bprm(int fd, struct filename *filename,
+ int flags)
{
- struct linux_binprm *bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
- int retval = -ENOMEM;
- if (!bprm)
+ struct linux_binprm *bprm;
+ struct file *file;
+ int retval;
+
+ file = do_open_execat(fd, filename, flags);
+ if (IS_ERR(file)) {
+ retval = PTR_ERR(file);
goto out;
+ }
+
+ retval = -ENOMEM;
+ bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
+ if (!bprm) {
+ do_close_execat(file);
+ goto out;
+ }
+ bprm->file = file;

if (fd == AT_FDCWD || filename->name[0] == '/') {
bprm->filename = filename->name;
@@ -1531,6 +1565,18 @@ static struct linux_binprm *alloc_bprm(int fd, struct filename *filename)
}
bprm->interp = bprm->filename;

+ /*
+ * Record that a name derived from an O_CLOEXEC fd will be
+ * inaccessible after exec. This allows the code in exec to
+ * choose to fail when the executable is not mmaped into the
+ * interpreter and an open file descriptor is not passed to
+ * the interpreter. This makes for a better user experience
+ * than having the interpreter start and then immediately fail
+ * when it finds the executable is inaccessible.
+ */
+ if (bprm->fdpath && get_close_on_exec(fd))
+ bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
+
retval = bprm_mm_init(bprm);
if (retval)
goto out_free;
@@ -1803,10 +1849,8 @@ static int exec_binprm(struct linux_binprm *bprm)
/*
* sys_execve() executes a new program.
*/
-static int bprm_execve(struct linux_binprm *bprm,
- int fd, struct filename *filename, int flags)
+static int bprm_execve(struct linux_binprm *bprm)
{
- struct file *file;
int retval;

retval = prepare_bprm_creds(bprm);
@@ -1816,26 +1860,8 @@ static int bprm_execve(struct linux_binprm *bprm,
check_unsafe_exec(bprm);
current->in_execve = 1;

- file = do_open_execat(fd, filename, flags);
- retval = PTR_ERR(file);
- if (IS_ERR(file))
- goto out_unmark;
-
sched_exec();

- bprm->file = file;
- /*
- * Record that a name derived from an O_CLOEXEC fd will be
- * inaccessible after exec. This allows the code in exec to
- * choose to fail when the executable is not mmaped into the
- * interpreter and an open file descriptor is not passed to
- * the interpreter. This makes for a better user experience
- * than having the interpreter start and then immediately fail
- * when it finds the executable is inaccessible.
- */
- if (bprm->fdpath && get_close_on_exec(fd))
- bprm->interp_flags |= BINPRM_FLAGS_PATH_INACCESSIBLE;
-
/* Set the unchanging part of bprm->cred */
retval = security_bprm_creds_for_exec(bprm);
if (retval)
@@ -1863,7 +1889,6 @@ static int bprm_execve(struct linux_binprm *bprm,
if (bprm->point_of_no_return && !fatal_signal_pending(current))
force_fatal_sig(SIGSEGV);

-out_unmark:
current->fs->in_exec = 0;
current->in_execve = 0;

@@ -1897,7 +1922,7 @@ static int do_execveat_common(int fd, struct filename *filename,
* further execve() calls fail. */
current->flags &= ~PF_NPROC_EXCEEDED;

- bprm = alloc_bprm(fd, filename);
+ bprm = alloc_bprm(fd, filename, flags);
if (IS_ERR(bprm)) {
retval = PTR_ERR(bprm);
goto out_ret;
@@ -1946,7 +1971,7 @@ static int do_execveat_common(int fd, struct filename *filename,
bprm->argc = 1;
}

- retval = bprm_execve(bprm, fd, filename, flags);
+ retval = bprm_execve(bprm);
out_free:
free_bprm(bprm);

@@ -1971,7 +1996,7 @@ int kernel_execve(const char *kernel_filename,
if (IS_ERR(filename))
return PTR_ERR(filename);

- bprm = alloc_bprm(fd, filename);
+ bprm = alloc_bprm(fd, filename, 0);
if (IS_ERR(bprm)) {
retval = PTR_ERR(bprm);
goto out_ret;
@@ -2006,7 +2031,7 @@ int kernel_execve(const char *kernel_filename,
if (retval < 0)
goto out_free;

- retval = bprm_execve(bprm, fd, filename, 0);
+ retval = bprm_execve(bprm);
out_free:
free_bprm(bprm);
out_ret:


--
Kees Cook