Re: [RFC PATCH v3 1/2] mm: restrictedmem: Allow userspace to specify mount for memfd_restricted

From: David Hildenbrand
Date: Mon Apr 03 2023 - 04:23:23 EST


On 01.04.23 01:50, Ackerley Tng wrote:
By default, the backing shmem file for a restrictedmem fd is created
on shmem's kernel space mount.

With this patch, an optional tmpfs mount can be specified via an fd,
which will be used as the mountpoint for backing the shmem file
associated with a restrictedmem fd.

This will help restrictedmem fds inherit the properties of the
provided tmpfs mounts, for example, hugepage allocation hints, NUMA
binding hints, etc.

Permissions for the fd passed to memfd_restricted() is modeled after
the openat() syscall, since both of these allow creation of a file
upon a mount/directory.

Permission to reference the mount the fd represents is checked upon fd
creation by other syscalls (e.g. fsmount(), open(), or open_tree(),
etc) and any process that can present memfd_restricted() with a valid
fd is expected to have obtained permission to use the mount
represented by the fd. This behavior is intended to parallel that of
the openat() syscall.

memfd_restricted() will check that the tmpfs superblock is
writable, and that the mount is also writable, before attempting to
create a restrictedmem file on the mount.

Signed-off-by: Ackerley Tng <ackerleytng@xxxxxxxxxx>
---
include/linux/syscalls.h | 2 +-
include/uapi/linux/restrictedmem.h | 8 ++++
mm/restrictedmem.c | 74 +++++++++++++++++++++++++++---
3 files changed, 77 insertions(+), 7 deletions(-)
create mode 100644 include/uapi/linux/restrictedmem.h

diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index f9e9e0c820c5..a23c4c385cd3 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1056,7 +1056,7 @@ asmlinkage long sys_memfd_secret(unsigned int flags);
asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len,
unsigned long home_node,
unsigned long flags);
-asmlinkage long sys_memfd_restricted(unsigned int flags);
+asmlinkage long sys_memfd_restricted(unsigned int flags, int mount_fd);

/*
* Architecture-specific system calls
diff --git a/include/uapi/linux/restrictedmem.h b/include/uapi/linux/restrictedmem.h
new file mode 100644
index 000000000000..22d6f2285f6d
--- /dev/null
+++ b/include/uapi/linux/restrictedmem.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_RESTRICTEDMEM_H
+#define _UAPI_LINUX_RESTRICTEDMEM_H
+
+/* flags for memfd_restricted */
+#define RMFD_USERMNT 0x0001U

I wonder if we can come up with a more expressive prefix than RMFD. Sounds more like "rm fd" ;) Maybe it should better match the "memfd_restricted" syscall name, like "MEMFD_RSTD_USERMNT".


+
+#endif /* _UAPI_LINUX_RESTRICTEDMEM_H */
diff --git a/mm/restrictedmem.c b/mm/restrictedmem.c
index c5d869d8c2d8..f7b62364a31a 100644
--- a/mm/restrictedmem.c
+++ b/mm/restrictedmem.c
@@ -1,11 +1,12 @@
// SPDX-License-Identifier: GPL-2.0
-#include "linux/sbitmap.h"

Looks like an unrelated change?

+#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/pseudo_fs.h>
#include <linux/shmem_fs.h>
#include <linux/syscalls.h>
#include <uapi/linux/falloc.h>
#include <uapi/linux/magic.h>
+#include <uapi/linux/restrictedmem.h>
#include <linux/restrictedmem.h>

struct restrictedmem {
@@ -189,19 +190,20 @@ static struct file *restrictedmem_file_create(struct file *memfd)
return file;
}

-SYSCALL_DEFINE1(memfd_restricted, unsigned int, flags)
+static int restrictedmem_create(struct vfsmount *mount)
{
struct file *file, *restricted_file;
int fd, err;

- if (flags)
- return -EINVAL;
-
fd = get_unused_fd_flags(0);
if (fd < 0)
return fd;

- file = shmem_file_setup("memfd:restrictedmem", 0, VM_NORESERVE);
+ if (mount)
+ file = shmem_file_setup_with_mnt(mount, "memfd:restrictedmem", 0, VM_NORESERVE);
+ else
+ file = shmem_file_setup("memfd:restrictedmem", 0, VM_NORESERVE);
+
if (IS_ERR(file)) {
err = PTR_ERR(file);
goto err_fd;
@@ -223,6 +225,66 @@ SYSCALL_DEFINE1(memfd_restricted, unsigned int, flags)
return err;
}

+static bool is_shmem_mount(struct vfsmount *mnt)
+{
+ return mnt && mnt->mnt_sb && mnt->mnt_sb->s_magic == TMPFS_MAGIC;
+}
+
+static bool is_mount_root(struct file *file)
+{
+ return file->f_path.dentry == file->f_path.mnt->mnt_root;
+}

I'd inline at least that function, pretty self-explaining.

+
+static int restrictedmem_create_on_user_mount(int mount_fd)
+{
+ int ret;
+ struct fd f;
+ struct vfsmount *mnt;
+
+ f = fdget_raw(mount_fd);
+ if (!f.file)
+ return -EBADF;
+
+ ret = -EINVAL;
+ if (!is_mount_root(f.file))
+ goto out;
+
+ mnt = f.file->f_path.mnt;
+ if (!is_shmem_mount(mnt))
+ goto out;
+
+ ret = file_permission(f.file, MAY_WRITE | MAY_EXEC);
+ if (ret)
+ goto out;
+
+ ret = mnt_want_write(mnt);
+ if (unlikely(ret))
+ goto out;
+
+ ret = restrictedmem_create(mnt);
+
+ mnt_drop_write(mnt);
+out:
+ fdput(f);
+
+ return ret;
+}
+
+SYSCALL_DEFINE2(memfd_restricted, unsigned int, flags, int, mount_fd)
+{
+ if (flags & ~RMFD_USERMNT)
+ return -EINVAL;
+
+ if (flags == RMFD_USERMNT) {
+ if (mount_fd < 0)
+ return -EINVAL;
+
+ return restrictedmem_create_on_user_mount(mount_fd);
+ } else {
+ return restrictedmem_create(NULL);
+ }


You can drop the else case:

if (flags == RMFD_USERMNT) {
...
return restrictedmem_create_on_user_mount(mount_fd);
}
return restrictedmem_create(NULL);


I do wonder if you want to properly check for a flag instead of comparing values. Results in a more natural way to deal with flags:

if (flags & RMFD_USERMNT) {

}

+}
+
int restrictedmem_bind(struct file *file, pgoff_t start, pgoff_t end,
struct restrictedmem_notifier *notifier, bool exclusive)
{

The "memfd_restricted" vs. "restrictedmem" terminology is a bit unfortunate, but not your fault here.


I'm not a FS person, but it does look good to me.

--
Thanks,

David / dhildenb