Re: Sealed memfd & no-fault mmap

From: Lin, Ming
Date: Fri May 28 2021 - 13:10:37 EST


On 5/5/2021 11:42 AM, Linus Torvalds wrote:
On Wed, May 5, 2021 at 3:21 AM Simon Ser <contact@xxxxxxxxxxx> wrote:

Is there some very specific and targeted pattern for that "shared
mapping" case? For example, if it's always a shared anonymous mapping
with no filesystem backing, then that would possibly be a simpler case
than the "random arbitrary shared file descriptor".

Yes. I don't know of any Wayland client using buffers with real
filesystem backing. I think the main cases are:

- shm_open(3) immediately followed by shm_unlink(3). On Linux, this is
implemented with /dev/shm which is a tmpfs.
- Abusing /tmp or /run's tmpfs by creating a file there and unlinking
it immediately afterwards. Kind of similar to the first case.
- memfd_create(2) on Linux.

Is this enough to make it work on shared memory mappings? Is it
important that the mapping is anonymous?

All of those should be anonymous in the sense that the backing store
is all the kernel's notion of anonymous pages, and there is no actual
file backing. The mappings may then be shared, of course.

So that does make Peter's idea to have some inode flag for "don't
SIGBUS on fault" be more reasonable, because there isn't some random
actual filesystem involved, only the core VM layer.

I'm not going to write the patch, though, but maybe you can convince
somebody else to try it..

Does something like following draft patch on the right track?

1. Application set S_NOFAULT flag on shm mmap fd

#define S_NOFAULT (1 << 17)
fd = shm_open(shmpath, O_RDONLY, S_IRUSR | S_IWUSR);
ioctl(fd, FS_IOC_GETFLAGS, &flags);
flags |= S_NOFAULT;
ioctl(fd, FS_IOC_SETFLAGS, &flags)

2. Don't SIGBUS on read beyond i_size if S_NOFAULT flag set in inode.
Use zero page instead.

---

[RFC DRAFT PATCH] shm: no SIGBUS fault on out-of-band mmap read
---
include/linux/fs.h | 2 ++
mm/shmem.c | 44 +++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/include/linux/fs.h b/include/linux/fs.h
index c3c88fdb9b2a..a9be7cd71b94 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2202,6 +2202,7 @@ struct super_operations {
#define S_ENCRYPTED (1 << 14) /* Encrypted file (using fs/crypto/) */
#define S_CASEFOLD (1 << 15) /* Casefolded file */
#define S_VERITY (1 << 16) /* Verity file (using fs/verity/) */
+#define S_NOFAULT (1 << 17) /* No SIGBUS fault on out-of-band mmap read */
/*
* Note that nosuid etc flags are inode-specific: setting some file-system
@@ -2244,6 +2245,7 @@ static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags
#define IS_ENCRYPTED(inode) ((inode)->i_flags & S_ENCRYPTED)
#define IS_CASEFOLDED(inode) ((inode)->i_flags & S_CASEFOLD)
#define IS_VERITY(inode) ((inode)->i_flags & S_VERITY)
+#define IS_NOFAULT(inode) ((inode)->i_flags & S_NOFAULT)
#define IS_WHITEOUT(inode) (S_ISCHR(inode->i_mode) && \
(inode)->i_rdev == WHITEOUT_DEV)
diff --git a/mm/shmem.c b/mm/shmem.c
index 5d46611cba8d..856d2d8d4cdf 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -38,8 +38,11 @@
#include <linux/hugetlb.h>
#include <linux/frontswap.h>
#include <linux/fs_parser.h>
+#include <linux/fs.h>
+#include <linux/fileattr.h>
#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
+#include <asm/pgalloc.h>
static struct vfsmount *shm_mnt;
@@ -1812,7 +1815,27 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
repeat:
if (sgp <= SGP_CACHE &&
((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
- return -EINVAL;
+ unsigned long dst_addr = vmf->address;
+ pte_t _dst_pte, *dst_pte;
+ spinlock_t *ptl;
+ int ret;
+
+ if (!IS_NOFAULT(inode))
+ return -EINVAL;
+
+ _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
+ vma->vm_page_prot));
+ dst_pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, dst_addr, &ptl);
+ ret = -EEXIST;
+ if (!pte_none(*dst_pte))
+ goto out_unlock;
+ set_pte_at(vma->vm_mm, dst_addr, dst_pte, _dst_pte);
+ update_mmu_cache(vma, dst_addr, dst_pte);
+ *fault_type = VM_FAULT_NOPAGE;
+ ret = 0;
+out_unlock:
+ pte_unmap_unlock(dst_pte, ptl);
+ return ret;
}
sbinfo = SHMEM_SB(inode->i_sb);
@@ -3819,6 +3842,23 @@ const struct address_space_operations shmem_aops = {
};
EXPORT_SYMBOL(shmem_aops);
+static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
+
+ fileattr_fill_flags(fa, inode->i_flags);
+
+ return 0;
+}
+
+static int shmem_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
+ inode->i_flags = fa->flags;
+ return 0;
+}
+
static const struct file_operations shmem_file_operations = {
.mmap = shmem_mmap,
.get_unmapped_area = shmem_get_unmapped_area,
@@ -3836,6 +3876,8 @@ static const struct file_operations shmem_file_operations = {
static const struct inode_operations shmem_inode_operations = {
.getattr = shmem_getattr,
.setattr = shmem_setattr,
+ .fileattr_get = shmem_fileattr_get,
+ .fileattr_set = shmem_fileattr_set,
#ifdef CONFIG_TMPFS_XATTR
.listxattr = shmem_listxattr,
.set_acl = simple_set_acl,