[RFC PATCH v4 1/2] mm: restrictedmem: Allow userspace to specify mount for memfd_restricted

From: Ackerley Tng
Date: Mon Apr 10 2023 - 21:29:48 EST


By default, the backing shmem file for a restrictedmem fd is created
on shmem's kernel space mount.

With this patch, userspace can optionally pass the MEMFD_RSTD_USERMNT
flag together with an fd that refers to the root of a tmpfs mount; the
shmem file backing the restrictedmem fd is then created on that mount.

This will help restrictedmem fds inherit the properties of the
provided tmpfs mounts, for example, hugepage (THP) allocation hints,
NUMA binding hints, etc.

Permissions for the fd passed to memfd_restricted() are modeled after
the openat() syscall, since both of these allow creation of a file
upon a mount/directory.

Permission to reference the mount the fd represents is checked upon fd
creation by other syscalls (e.g. fsmount(), open(), or open_tree()),
and any process that can present memfd_restricted() with a valid
fd is expected to have obtained permission to use the mount
represented by the fd. This behavior is intended to parallel that of
the openat() syscall.

memfd_restricted() will check that the tmpfs superblock is
writable, and that the mount is also writable, before attempting to
create a restrictedmem file on the mount.

Signed-off-by: Ackerley Tng <ackerleytng@xxxxxxxxxx>
---
include/linux/syscalls.h | 2 +-
include/uapi/linux/restrictedmem.h | 8 ++++
mm/restrictedmem.c | 73 ++++++++++++++++++++++++++++--
3 files changed, 77 insertions(+), 6 deletions(-)
create mode 100644 include/uapi/linux/restrictedmem.h

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 660be0bf89d5..90c73b9e14e5 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1058,7 +1058,7 @@ asmlinkage long sys_memfd_secret(unsigned int flags);
asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len,
unsigned long home_node,
unsigned long flags);
-asmlinkage long sys_memfd_restricted(unsigned int flags);
+asmlinkage long sys_memfd_restricted(unsigned int flags, int mount_fd);

/*
* Architecture-specific system calls
diff --git a/include/uapi/linux/restrictedmem.h b/include/uapi/linux/restrictedmem.h
new file mode 100644
index 000000000000..73e31bce73dc
--- /dev/null
+++ b/include/uapi/linux/restrictedmem.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_RESTRICTEDMEM_H
+#define _UAPI_LINUX_RESTRICTEDMEM_H
+
+/* flags for memfd_restricted */
+#define MEMFD_RSTD_USERMNT 0x0001U
+
+#endif /* _UAPI_LINUX_RESTRICTEDMEM_H */
diff --git a/mm/restrictedmem.c b/mm/restrictedmem.c
index 55e99e6c09a1..032ad1f15138 100644
--- a/mm/restrictedmem.c
+++ b/mm/restrictedmem.c
@@ -6,6 +6,7 @@
#include <linux/syscalls.h>
#include <uapi/linux/falloc.h>
#include <uapi/linux/magic.h>
+#include <uapi/linux/restrictedmem.h>
#include <linux/restrictedmem.h>

struct restrictedmem {
@@ -250,19 +251,20 @@ static struct address_space_operations restricted_aops = {
#endif
};

-SYSCALL_DEFINE1(memfd_restricted, unsigned int, flags)
+static int restrictedmem_create(struct vfsmount *mount)
{
struct file *file, *restricted_file;
int fd, err;

- if (flags)
- return -EINVAL;
-
fd = get_unused_fd_flags(0);
if (fd < 0)
return fd;

- file = shmem_file_setup("memfd:restrictedmem", 0, VM_NORESERVE);
+ if (mount)
+ file = shmem_file_setup_with_mnt(mount, "memfd:restrictedmem", 0, VM_NORESERVE);
+ else
+ file = shmem_file_setup("memfd:restrictedmem", 0, VM_NORESERVE);
+
if (IS_ERR(file)) {
err = PTR_ERR(file);
goto err_fd;
@@ -286,6 +288,67 @@ SYSCALL_DEFINE1(memfd_restricted, unsigned int, flags)
return err;
}

+static struct vfsmount *restrictedmem_get_user_mount(struct file *file)
+{
+ int ret;
+ struct vfsmount *mnt;
+ struct path *path;
+
+ path = &file->f_path;
+ if (path->dentry != path->mnt->mnt_root)
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * Disallow bind-mounts that aren't bind-mounts of the whole
+ * filesystem
+ */
+ mnt = path->mnt;
+ if (mnt->mnt_root != mnt->mnt_sb->s_root)
+ return ERR_PTR(-EINVAL);
+
+ if (mnt->mnt_sb->s_magic != TMPFS_MAGIC)
+ return ERR_PTR(-EINVAL);
+
+ ret = mnt_want_write(mnt);
+ if (ret)
+ return ERR_PTR(ret);
+
+ return mnt;
+}
+
+SYSCALL_DEFINE2(memfd_restricted, unsigned int, flags, int, mount_fd)
+{
+ int ret;
+ struct fd f = {};
+ struct vfsmount *mnt = NULL;
+
+ if (flags & ~MEMFD_RSTD_USERMNT)
+ return -EINVAL;
+
+ if (flags & MEMFD_RSTD_USERMNT) {
+ f = fdget_raw(mount_fd);
+ if (!f.file)
+ return -EBADF;
+
+ mnt = restrictedmem_get_user_mount(f.file);
+ if (IS_ERR(mnt)) {
+ ret = PTR_ERR(mnt);
+ goto out;
+ }
+ }
+
+ ret = restrictedmem_create(mnt);
+
+ if (mnt)
+ mnt_drop_write(mnt);
+
+out:
+ if (f.file)
+ fdput(f);
+
+ return ret;
+}
+
int restrictedmem_bind(struct file *file, pgoff_t start, pgoff_t end,
struct restrictedmem_notifier *notifier, bool exclusive)
{
--
2.40.0.577.gac1e443424-goog