Re: [RFC PATCH v3 1/2] mm: restrictedmem: Allow userspace to specify mount for memfd_restricted

From: Kirill A. Shutemov
Date: Tue Apr 04 2023 - 04:25:38 EST


On Fri, Mar 31, 2023 at 11:50:39PM +0000, Ackerley Tng wrote:
> By default, the backing shmem file for a restrictedmem fd is created
> on shmem's kernel space mount.
>
> With this patch, an optional tmpfs mount can be specified via an fd,
> which will be used as the mountpoint for backing the shmem file
> associated with a restrictedmem fd.
>
> This will help restrictedmem fds inherit the properties of the
> provided tmpfs mounts, for example, hugepage allocation hints, NUMA
> binding hints, etc.
>
> Permissions for the fd passed to memfd_restricted() is modeled after
> the openat() syscall, since both of these allow creation of a file
> upon a mount/directory.
>
> Permission to reference the mount the fd represents is checked upon fd
> creation by other syscalls (e.g. fsmount(), open(), or open_tree(),
> etc) and any process that can present memfd_restricted() with a valid
> fd is expected to have obtained permission to use the mount
> represented by the fd. This behavior is intended to parallel that of
> the openat() syscall.
>
> memfd_restricted() will check that the tmpfs superblock is
> writable, and that the mount is also writable, before attempting to
> create a restrictedmem file on the mount.
>
> Signed-off-by: Ackerley Tng <ackerleytng@xxxxxxxxxx>
> ---
> include/linux/syscalls.h | 2 +-
> include/uapi/linux/restrictedmem.h | 8 ++++
> mm/restrictedmem.c | 74 +++++++++++++++++++++++++++---
> 3 files changed, 77 insertions(+), 7 deletions(-)
> create mode 100644 include/uapi/linux/restrictedmem.h
>
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index f9e9e0c820c5..a23c4c385cd3 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -1056,7 +1056,7 @@ asmlinkage long sys_memfd_secret(unsigned int flags);
> asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len,
> unsigned long home_node,
> unsigned long flags);
> -asmlinkage long sys_memfd_restricted(unsigned int flags);
> +asmlinkage long sys_memfd_restricted(unsigned int flags, int mount_fd);
>
> /*
> * Architecture-specific system calls
> diff --git a/include/uapi/linux/restrictedmem.h b/include/uapi/linux/restrictedmem.h
> new file mode 100644
> index 000000000000..22d6f2285f6d
> --- /dev/null
> +++ b/include/uapi/linux/restrictedmem.h
> @@ -0,0 +1,8 @@
> +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
> +#ifndef _UAPI_LINUX_RESTRICTEDMEM_H
> +#define _UAPI_LINUX_RESTRICTEDMEM_H
> +
> +/* flags for memfd_restricted */
> +#define RMFD_USERMNT 0x0001U
> +
> +#endif /* _UAPI_LINUX_RESTRICTEDMEM_H */
> diff --git a/mm/restrictedmem.c b/mm/restrictedmem.c
> index c5d869d8c2d8..f7b62364a31a 100644
> --- a/mm/restrictedmem.c
> +++ b/mm/restrictedmem.c
> @@ -1,11 +1,12 @@
> // SPDX-License-Identifier: GPL-2.0
> -#include "linux/sbitmap.h"
> +#include <linux/namei.h>
> #include <linux/pagemap.h>
> #include <linux/pseudo_fs.h>
> #include <linux/shmem_fs.h>
> #include <linux/syscalls.h>
> #include <uapi/linux/falloc.h>
> #include <uapi/linux/magic.h>
> +#include <uapi/linux/restrictedmem.h>
> #include <linux/restrictedmem.h>
>
> struct restrictedmem {
> @@ -189,19 +190,20 @@ static struct file *restrictedmem_file_create(struct file *memfd)
> return file;
> }
>
> -SYSCALL_DEFINE1(memfd_restricted, unsigned int, flags)
> +static int restrictedmem_create(struct vfsmount *mount)
> {
> struct file *file, *restricted_file;
> int fd, err;
>
> - if (flags)
> - return -EINVAL;
> -
> fd = get_unused_fd_flags(0);
> if (fd < 0)
> return fd;
>
> - file = shmem_file_setup("memfd:restrictedmem", 0, VM_NORESERVE);
> + if (mount)
> + file = shmem_file_setup_with_mnt(mount, "memfd:restrictedmem", 0, VM_NORESERVE);
> + else
> + file = shmem_file_setup("memfd:restrictedmem", 0, VM_NORESERVE);
> +
> if (IS_ERR(file)) {
> err = PTR_ERR(file);
> goto err_fd;
> @@ -223,6 +225,66 @@ SYSCALL_DEFINE1(memfd_restricted, unsigned int, flags)
> return err;
> }
>
> +static bool is_shmem_mount(struct vfsmount *mnt)
> +{
> + return mnt && mnt->mnt_sb && mnt->mnt_sb->s_magic == TMPFS_MAGIC;
> +}
> +
> +static bool is_mount_root(struct file *file)
> +{
> + return file->f_path.dentry == file->f_path.mnt->mnt_root;
> +}
> +
> +static int restrictedmem_create_on_user_mount(int mount_fd)
> +{
> + int ret;
> + struct fd f;
> + struct vfsmount *mnt;
> +
> + f = fdget_raw(mount_fd);
> + if (!f.file)
> + return -EBADF;
> +
> + ret = -EINVAL;
> + if (!is_mount_root(f.file))
> + goto out;
> +
> + mnt = f.file->f_path.mnt;
> + if (!is_shmem_mount(mnt))
> + goto out;
> +
> + ret = file_permission(f.file, MAY_WRITE | MAY_EXEC);

Why MAY_EXEC?

> + if (ret)
> + goto out;
> +
> + ret = mnt_want_write(mnt);
> + if (unlikely(ret))
> + goto out;
> +
> + ret = restrictedmem_create(mnt);
> +
> + mnt_drop_write(mnt);
> +out:
> + fdput(f);
> +
> + return ret;
> +}

We need review from fs folks. Look mostly sensible, but I have no
experience in fs.

> +
> +SYSCALL_DEFINE2(memfd_restricted, unsigned int, flags, int, mount_fd)
> +{
> + if (flags & ~RMFD_USERMNT)
> + return -EINVAL;
> +
> + if (flags == RMFD_USERMNT) {
> + if (mount_fd < 0)
> + return -EINVAL;
> +
> + return restrictedmem_create_on_user_mount(mount_fd);
> + } else {
> + return restrictedmem_create(NULL);
> + }

Maybe restructure with single restrictedmem_create() call?

struct vfsmount *mnt = NULL;

if (flags == RMFD_USERMNT) {
...
mnt = ...();
}

return restrictedmem_create(mnt);
> +}
> +
> int restrictedmem_bind(struct file *file, pgoff_t start, pgoff_t end,
> struct restrictedmem_notifier *notifier, bool exclusive)
> {
> --
> 2.40.0.348.gf938b09366-goog

--
Kiryl Shutsemau / Kirill A. Shutemov