Re: [RFC PATCH v3 1/2] mm: restrictedmem: Allow userspace to specify mount for memfd_restricted

From: Ackerley Tng
Date: Wed Apr 05 2023 - 18:29:07 EST



Thanks for your review!

David Hildenbrand <david@xxxxxxxxxx> writes:

On 01.04.23 01:50, Ackerley Tng wrote:

...

diff --git a/include/uapi/linux/restrictedmem.h b/include/uapi/linux/restrictedmem.h
new file mode 100644
index 000000000000..22d6f2285f6d
--- /dev/null
+++ b/include/uapi/linux/restrictedmem.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+#ifndef _UAPI_LINUX_RESTRICTEDMEM_H
+#define _UAPI_LINUX_RESTRICTEDMEM_H
+
+/* flags for memfd_restricted */
+#define RMFD_USERMNT 0x0001U

I wonder if we can come up with a more expressive prefix than RMFD.
Sounds more like "rm fd" ;) Maybe it should better match the
"memfd_restricted" syscall name, like "MEMFD_RSTD_USERMNT".


RMFD did actually sound vulgar, I'm good with MEMFD_RSTD_USERMNT!

+
+#endif /* _UAPI_LINUX_RESTRICTEDMEM_H */
diff --git a/mm/restrictedmem.c b/mm/restrictedmem.c
index c5d869d8c2d8..f7b62364a31a 100644
--- a/mm/restrictedmem.c
+++ b/mm/restrictedmem.c
@@ -1,11 +1,12 @@
// SPDX-License-Identifier: GPL-2.0
-#include "linux/sbitmap.h"

Looks like an unrelated change?


Will remove this in the next revision.

+#include <linux/namei.h>
#include <linux/pagemap.h>
#include <linux/pseudo_fs.h>
#include <linux/shmem_fs.h>
#include <linux/syscalls.h>
#include <uapi/linux/falloc.h>
#include <uapi/linux/magic.h>
+#include <uapi/linux/restrictedmem.h>
#include <linux/restrictedmem.h>

struct restrictedmem {
@@ -189,19 +190,20 @@ static struct file *restrictedmem_file_create(struct file *memfd)
return file;
}

-SYSCALL_DEFINE1(memfd_restricted, unsigned int, flags)
+static int restrictedmem_create(struct vfsmount *mount)
{
struct file *file, *restricted_file;
int fd, err;

- if (flags)
- return -EINVAL;
-
fd = get_unused_fd_flags(0);
if (fd < 0)
return fd;

- file = shmem_file_setup("memfd:restrictedmem", 0, VM_NORESERVE);
+ if (mount)
+ file = shmem_file_setup_with_mnt(mount, "memfd:restrictedmem", 0, VM_NORESERVE);
+ else
+ file = shmem_file_setup("memfd:restrictedmem", 0, VM_NORESERVE);
+
if (IS_ERR(file)) {
err = PTR_ERR(file);
goto err_fd;
@@ -223,6 +225,66 @@ SYSCALL_DEFINE1(memfd_restricted, unsigned int, flags)
return err;
}

+static bool is_shmem_mount(struct vfsmount *mnt)
+{
+ return mnt && mnt->mnt_sb && mnt->mnt_sb->s_magic == TMPFS_MAGIC;
+}
+
+static bool is_mount_root(struct file *file)
+{
+ return file->f_path.dentry == file->f_path.mnt->mnt_root;
+}

I'd inline at least that function, pretty self-explaining.


Will inline this in the next revision.

+
+static int restrictedmem_create_on_user_mount(int mount_fd)
+{
+ int ret;
+ struct fd f;
+ struct vfsmount *mnt;
+
+ f = fdget_raw(mount_fd);
+ if (!f.file)
+ return -EBADF;
+
+ ret = -EINVAL;
+ if (!is_mount_root(f.file))
+ goto out;
+
+ mnt = f.file->f_path.mnt;
+ if (!is_shmem_mount(mnt))
+ goto out;
+
+ ret = file_permission(f.file, MAY_WRITE | MAY_EXEC);
+ if (ret)
+ goto out;
+
+ ret = mnt_want_write(mnt);
+ if (unlikely(ret))
+ goto out;
+
+ ret = restrictedmem_create(mnt);
+
+ mnt_drop_write(mnt);
+out:
+ fdput(f);
+
+ return ret;
+}
+
+SYSCALL_DEFINE2(memfd_restricted, unsigned int, flags, int, mount_fd)
+{
+ if (flags & ~RMFD_USERMNT)
+ return -EINVAL;
+
+ if (flags == RMFD_USERMNT) {
+ if (mount_fd < 0)
+ return -EINVAL;
+
+ return restrictedmem_create_on_user_mount(mount_fd);
+ } else {
+ return restrictedmem_create(NULL);
+ }


You can drop the else case:

if (flags == RMFD_USERMNT) {
...
return restrictedmem_create_on_user_mount(mount_fd);
}
return restrictedmem_create(NULL);


I'll be refactoring this to adopt Kirill's suggestion of using a single
restrictedmem_create(mnt) call.


I do wonder if you want to properly check for a flag instead of
comparing values. Results in a more natural way to deal with flags:

if (flags & RMFD_USERMNT) {

}


Will use this in the next revision.

+}
+
int restrictedmem_bind(struct file *file, pgoff_t start, pgoff_t end,
struct restrictedmem_notifier *notifier, bool exclusive)
{

The "memfd_restricted" vs. "restrictedmem" terminology is a bit
unfortunate, but not your fault here.


I'm not a FS person, but it does look good to me.