[PATCH v4 1/3] exec: add PR_HIDE_SELF_EXE prctl

From: Giuseppe Scrivano
Date: Fri Mar 10 2023 - 06:27:12 EST


This patch adds a new prctl called PR_HIDE_SELF_EXE which allows
processes to hide their own /proc/self/exe file. When this prctl is
used, every access to /proc/self/exe for the calling process will
fail with EPERM.

This is useful for preventing issues like CVE-2019-5736, where an
attacker can gain host root access by overwriting the binary
in OCI runtimes through file-descriptor mishandling in containers.

The current fix for CVE-2019-5736 is to create a read-only copy or
a bind-mount of the current executable, and then re-exec the current
process. With the new prctl, the read-only copy or bind-mount copy is
not needed anymore.

Once prctl(PR_HIDE_SELF_EXE) is set, it will blocks PR_SET_DUMPABLE to
be set again. It is not done automatically so that PR_SET_DUMPABLE
can be set at a later time if needed, e.g. a container runtime might
want to do that after the user namespace is joined.

It is currently kind of expected that a container uses execve to pass
control to a binary in the container chroot, since the /proc/self/exe
and /proc/self/map_files refer to files that are not part of the
container itself; so if the process can be compromised in any way
(e.g. an interpreter running a script in the container without calling
execve), it could potentially lead to host compromise. It is
necessary to ensure that the current process cannot be set to dumpable
anymore, since that would allow another process in the container to
get access to its proc files.

A container runtime could use the new prctl in the following way:

- run prctl(PR_HIDE_SELF_EXE) as soon as possible.
- if prctl(PR_HIDE_SELF_EXE) fails then use the more expensive current
fallbacks mechanism.
- perform all operations that require the process to be dumpable.
- prctl(PR_SET_DUMPABLE, 0), as it is already done today.
- execve the new binary or run directly the interpreter.

Signed-off-by: Giuseppe Scrivano <gscrivan@xxxxxxxxxx>
---
v3: https://lkml.org/lkml/2023/1/20/434

Differences from v3:

- rebased on top of the latest upstream master.
- now prctl(PR_HIDE_SELF_EXE) blocks access only from the same thread group, it
won't break CRIU as v3 did.
- extend the prctl to also cover files under /proc/self/map_files.
- once prctl(PR_HIDE_SELF_EXE) is used, it is not possible to unset
SUID_DUMP_DISABLE anymore. This is useful for the case where the container
doesn't use execve to pass the control to a binary in the container chroot, and it
prevents the process to be set to dumpable again since it would allow a different
process in the container to get access to its proc files.
- cleaned up the selftest and added more checks.

v2: https://lkml.org/lkml/2023/1/19/849

Differences from v2:

- fixed the test to check PR_SET_HIDE_SELF_EXE after fork

v1: https://lkml.org/lkml/2023/1/4/334

Differences from v1:

- amended more information in the commit message wrt map_files not
requiring the same protection.
- changed the test to verify PR_HIDE_SELF_EXE cannot be unset after
a fork.

fs/exec.c | 1 +
fs/proc/base.c | 9 +++++++++
include/linux/sched.h | 5 +++++
include/uapi/linux/prctl.h | 3 +++
kernel/sys.c | 13 +++++++++++++
tools/include/uapi/linux/prctl.h | 3 +++
6 files changed, 34 insertions(+)

diff --git a/fs/exec.c b/fs/exec.c
index 7c44d0c65b1b..d40e019a5ac5 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1858,6 +1858,7 @@ static int bprm_execve(struct linux_binprm *bprm,
/* execve succeeded */
current->fs->in_exec = 0;
current->in_execve = 0;
+ task_clear_hide_self_exe(current);
rseq_execve(current);
acct_update_integrals(current);
task_numa_free(current, false);
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 5e0e0ccd47aa..14102a133655 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1719,16 +1719,25 @@ static const struct file_operations proc_pid_set_comm_operations = {
.release = single_release,
};

+static inline bool is_hide_self_exe_blocked(struct task_struct *task)
+{
+ return task_hide_self_exe(task) && same_thread_group(current, task);
+}
+
static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
{
struct task_struct *task;
struct file *exe_file;
+ bool hide_self_exe;

task = get_proc_task(d_inode(dentry));
if (!task)
return -ENOENT;
exe_file = get_task_exe_file(task);
+ hide_self_exe = is_hide_self_exe_blocked(task);
put_task_struct(task);
+ if (hide_self_exe)
+ return -EPERM;
if (exe_file) {
*exe_path = exe_file->f_path;
path_get(&exe_file->f_path);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 63d242164b1a..fb1faaabe795 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1796,6 +1796,7 @@ static __always_inline bool is_percpu_thread(void)
#define PFA_SPEC_IB_DISABLE 5 /* Indirect branch speculation restricted */
#define PFA_SPEC_IB_FORCE_DISABLE 6 /* Indirect branch speculation permanently restricted */
#define PFA_SPEC_SSB_NOEXEC 7 /* Speculative Store Bypass clear on execve() */
+#define PFA_HIDE_SELF_EXE 8 /* Hide /proc/self/exe for the process */

#define TASK_PFA_TEST(name, func) \
static inline bool task_##func(struct task_struct *p) \
@@ -1838,6 +1839,10 @@ TASK_PFA_CLEAR(SPEC_IB_DISABLE, spec_ib_disable)
TASK_PFA_TEST(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable)
TASK_PFA_SET(SPEC_IB_FORCE_DISABLE, spec_ib_force_disable)

+TASK_PFA_TEST(HIDE_SELF_EXE, hide_self_exe)
+TASK_PFA_SET(HIDE_SELF_EXE, hide_self_exe)
+TASK_PFA_CLEAR(HIDE_SELF_EXE, hide_self_exe)
+
static inline void
current_restore_flags(unsigned long orig_flags, unsigned long flags)
{
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 1312a137f7fb..715c601b4fed 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -290,4 +290,7 @@ struct prctl_mm_map {
#define PR_SET_VMA 0x53564d41
# define PR_SET_VMA_ANON_NAME 0

+#define PR_SET_HIDE_SELF_EXE 67
+#define PR_GET_HIDE_SELF_EXE 68
+
#endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 495cd87d9bf4..6e5aae861ae7 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2408,6 +2408,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = -EINVAL;
break;
}
+ if (task_hide_self_exe(me) && arg2 != SUID_DUMP_DISABLE) {
+ error = -EPERM;
+ break;
+ }
set_dumpable(me->mm, arg2);
break;

@@ -2661,6 +2665,15 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_SET_VMA:
error = prctl_set_vma(arg2, arg3, arg4, arg5);
break;
+ case PR_SET_HIDE_SELF_EXE:
+ if (arg2 != 1 || arg3 || arg4 || arg5)
+ return -EINVAL;
+ task_set_hide_self_exe(current);
+ break;
+ case PR_GET_HIDE_SELF_EXE:
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+ return task_hide_self_exe(current) ? 1 : 0;
default:
error = -EINVAL;
break;
diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h
index a5e06dcbba13..b4c7a6c248a9 100644
--- a/tools/include/uapi/linux/prctl.h
+++ b/tools/include/uapi/linux/prctl.h
@@ -284,4 +284,7 @@ struct prctl_mm_map {
#define PR_SET_VMA 0x53564d41
# define PR_SET_VMA_ANON_NAME 0

+#define PR_SET_HIDE_SELF_EXE 67
+#define PR_GET_HIDE_SELF_EXE 68
+
#endif /* _LINUX_PRCTL_H */
--
2.39.2