[RFC PATCH 6/6] mm: Add basic page table sharing using mshare
From: Khalid Aziz
Date: Tue Jan 18 2022 - 16:21:21 EST
This patch adds basic page table sharing across tasks by making
mshare syscall. It does this by creating a new mm_struct which
hosts the shared vmas and page tables. This mm_struct is
maintained as long as there is at least one task using the mshare'd
range. It is cleaned up by the last mshare_unlink syscall.
NOTE: WORK IN PROGRESS. This is only a working prototype and has
bugs and many missing pieces. mm/mshare.c documents these bugs and
issues that should be addressed in a more complete implementation.
Signed-off-by: Khalid Aziz <khalid.aziz@xxxxxxxxxx>
Signed-off-by: Matthew Wilcox (Oracle) <willy@xxxxxxxxxxxxx>
---
include/linux/mm.h | 8 +
include/trace/events/mmflags.h | 3 +-
mm/internal.h | 7 +
mm/memory.c | 35 ++++-
mm/mshare.c | 265 +++++++++++++++++++++++++++++++--
5 files changed, 299 insertions(+), 19 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a7e4a9e7d807..63128f6c83cd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -308,11 +308,13 @@ extern unsigned int kobjsize(const void *objp);
#define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */
+#define VM_HIGH_ARCH_BIT_5 37 /* bit only usable on 64-bit architectures */
#define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0)
#define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1)
#define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2)
#define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3)
#define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4)
+#define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5)
#endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
#ifdef CONFIG_ARCH_HAS_PKEYS
@@ -354,6 +356,12 @@ extern unsigned int kobjsize(const void *objp);
# define VM_MTE_ALLOWED VM_NONE
#endif
+#ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS
+#define VM_SHARED_PT VM_HIGH_ARCH_5
+#else
+#define VM_SHARED_PT 0
+#endif
+
#ifndef VM_GROWSUP
# define VM_GROWSUP VM_NONE
#endif
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 116ed4d5d0f8..002dbf2711c5 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -184,7 +184,8 @@ IF_HAVE_VM_SOFTDIRTY(VM_SOFTDIRTY, "softdirty" ) \
{VM_MIXEDMAP, "mixedmap" }, \
{VM_HUGEPAGE, "hugepage" }, \
{VM_NOHUGEPAGE, "nohugepage" }, \
- {VM_MERGEABLE, "mergeable" } \
+ {VM_MERGEABLE, "mergeable" }, \
+ {VM_SHARED_PT, "sharedpt" } \
#define show_vma_flags(flags) \
(flags) ? __print_flags(flags, "|", \
diff --git a/mm/internal.h b/mm/internal.h
index 3b79a5c9427a..9bfc4dde7d70 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -713,4 +713,11 @@ void vunmap_range_noflush(unsigned long start, unsigned long end);
int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
unsigned long addr, int page_nid, int *flags);
+extern vm_fault_t find_shared_vma(struct vm_area_struct **vma,
+			unsigned long *addrp);
+
+/* Does this VMA belong to an mshare'd region (VM_SHARED_PT set)? */
+static inline bool vma_is_shared(const struct vm_area_struct *vma)
+{
+	return vma->vm_flags & VM_SHARED_PT;
+}
+
#endif /* __MM_INTERNAL_H */
diff --git a/mm/memory.c b/mm/memory.c
index 8f1de811a1dc..b506bbbfae60 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -418,16 +418,25 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
} else {
/*
* Optimization: gather nearby vmas into one call down
+ * as long as they all belong to the same mm (that
+ * may not be the case if a vma is part of mshare'd
+ * range)
*/
while (next && next->vm_start <= vma->vm_end + PMD_SIZE
- && !is_vm_hugetlb_page(next)) {
+ && !is_vm_hugetlb_page(next)
+ && vma->vm_mm == tlb->mm) {
vma = next;
next = vma->vm_next;
unlink_anon_vmas(vma);
unlink_file_vma(vma);
}
- free_pgd_range(tlb, addr, vma->vm_end,
- floor, next ? next->vm_start : ceiling);
+ /*
+ * Free pgd only if pgd is not allocated for an
+ * mshare'd range
+ */
+ if (vma->vm_mm == tlb->mm)
+ free_pgd_range(tlb, addr, vma->vm_end,
+ floor, next ? next->vm_start : ceiling);
}
vma = next;
}
@@ -1528,6 +1537,13 @@ void unmap_page_range(struct mmu_gather *tlb,
pgd_t *pgd;
unsigned long next;
+ /*
+ * If this is an mshare'd page, do not unmap it since it might
+ * still be in use.
+ */
+ if (vma->vm_mm != tlb->mm)
+ return;
+
BUG_ON(addr >= end);
tlb_start_vma(tlb, vma);
pgd = pgd_offset(vma->vm_mm, addr);
@@ -4757,6 +4773,7 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags, struct pt_regs *regs)
{
vm_fault_t ret;
+ bool shared = false;
__set_current_state(TASK_RUNNING);
@@ -4766,6 +4783,15 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
/* do counter updates before entering really critical section. */
check_sync_rss_stat(current);
+ if (unlikely(vma_is_shared(vma))) {
+ ret = find_shared_vma(&vma, &address);
+ if (ret)
+ return ret;
+ if (!vma)
+ return VM_FAULT_SIGSEGV;
+ shared = true;
+ }
+
if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
flags & FAULT_FLAG_INSTRUCTION,
flags & FAULT_FLAG_REMOTE))
@@ -4783,6 +4809,9 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
else
ret = __handle_mm_fault(vma, address, flags);
+ if (shared)
+ mmap_read_unlock(vma->vm_mm);
+
if (flags & FAULT_FLAG_USER) {
mem_cgroup_exit_user_fault();
/*
diff --git a/mm/mshare.c b/mm/mshare.c
index adfd5a280e5b..ffdc72963b6b 100644
--- a/mm/mshare.c
+++ b/mm/mshare.c
@@ -6,8 +6,37 @@
*
*
* Copyright (C) 2021 Oracle Corp. All rights reserved.
- * Authors: Matthew Wilcox
- * Khalid Aziz
+ * Authors: Matthew Wilcox <willy@xxxxxxxxxxxxx>
+ * Khalid Aziz <khalid.aziz@xxxxxxxxxx>
+ *
+ * Current issues/questions:
+ * - mshare_unlink should unmap all shared VMAs for the calling task
+ * - If a task that had called mshare dies, make sure its shared VMAs
+ * are cleaned up properly and refcount to shared region is
+ * updated correctly.
+ * - Should mshare_unlink be allowed only for the tasks that called
+ * mshare() originally so the two calls are matched up? If yes,
+ * should there still be a way to clean up stale shared regions?
+ * - Allow already mapped in VMAs to be mshare'd by the task creating
+ * mshare region. This will potentially require splitting VMAs
+ * - What happens when a task tries to attach to an existing mshare
+ * region and it already has VMAs mapped into that address range?
+ * Possibilities - (1) mshare() fails, or (2) unmap current VMAs
+ * and create new ones with new mapping. If (2), what happens
+ * if existing VMA is larger than mshare region - split the
+ * VMA and leave partial original mapping intact, or unmap all
+ * overlapping VMAs
+ * - Can the tasks using mshare region mmap/mremap things into sections
+ * of the range? If yes, that will require additional work. Which
+ * mmaps should be supported - anonymous, files??
+ * - How does this work with hugepages?
+ * - How does this work with OOM killer?
+ * - BUG: PTEs no longer work once the task that created mshare
+ * range dies.
+ * - mmu_notifier uses vma->vm_mm. That very likely breaks with
+ * this code
+ * - Should consumer processes be allowed to only map the entire
+ * mshare'd region or should they be allowed to map subset of it?
*/
#include <linux/fs.h>
@@ -17,17 +46,50 @@
#include <linux/pseudo_fs.h>
#include <linux/fileattr.h>
#include <linux/refcount.h>
+#include <linux/mman.h>
#include <uapi/linux/magic.h>
#include <uapi/linux/limits.h>
+#include "internal.h"
struct mshare_data {
- struct mm_struct *mm;
+ struct mm_struct *mm, *host_mm;
mode_t mode;
refcount_t refcnt;
};
static struct super_block *msharefs_sb;
+/*
+ * Translate a fault address inside a guest (mshare'd) VMA into the
+ * host mm's address space and look up the host VMA covering it.
+ *
+ * Lock/return contract:
+ *  - VM_FAULT_NOPAGE: the shared pgd entry was installed into the
+ *    faulting task's page table; no lock is held.
+ *  - 0 with *vmap != NULL: the host mm's mmap lock is held for read
+ *    and must be released by the caller.
+ *  - 0 with *vmap == NULL: no host VMA covers the address; no lock
+ *    is held (caller treats this as SIGSEGV).
+ */
+vm_fault_t
+find_shared_vma(struct vm_area_struct **vmap, unsigned long *addrp)
+{
+	struct vm_area_struct *vma, *guest = *vmap;
+	struct mshare_data *info = guest->vm_private_data;
+	struct mm_struct *host_mm = info->mm;
+	unsigned long host_addr;
+	pgd_t *pgd, *guest_pgd;
+
+	/* Rebase the fault offset within the guest VMA onto the host mm */
+	host_addr = *addrp - guest->vm_start + host_mm->mmap_base;
+	pgd = pgd_offset(host_mm, host_addr);
+	guest_pgd = pgd_offset(current->mm, *addrp);
+	if (!pgd_same(*guest_pgd, *pgd)) {
+		set_pgd(guest_pgd, *pgd);
+		return VM_FAULT_NOPAGE;
+	}
+
+	*addrp = host_addr;
+	mmap_read_lock(host_mm);
+	vma = find_vma(host_mm, host_addr);
+
+	/* XXX: expand stack? */
+	if (vma && vma->vm_start > host_addr)
+		vma = NULL;
+
+	if (!vma) {
+		/*
+		 * No host VMA covers this address. Drop the lock here:
+		 * the caller only unlocks on the success path, and the
+		 * original code would have dereferenced a NULL vma below.
+		 */
+		mmap_read_unlock(host_mm);
+		*vmap = NULL;
+		return 0;
+	}
+
+	*vmap = vma;
+	vma->vm_mm = host_mm;
+	return 0;
+}
+
static void
msharefs_evict_inode(struct inode *inode)
{
@@ -168,13 +230,23 @@ SYSCALL_DEFINE5(mshare, const char __user *, name, unsigned long, addr,
unsigned long, len, int, oflag, mode_t, mode)
{
struct mshare_data *info;
- struct mm_struct *mm;
struct filename *fname = getname(name);
struct dentry *dentry;
struct inode *inode;
struct qstr namestr;
+ struct vm_area_struct *vma, *next, *new_vma;
+ struct mm_struct *new_mm;
+ unsigned long end;
int err = PTR_ERR(fname);
+
+ /*
+ * Is msharefs mounted? TODO: If not mounted, return error
+ * or automount?
+ */
+ if (msharefs_sb == NULL)
+ return -ENOENT;
+
/*
* Address range being shared must be aligned to pgdir
* boundary and its size must be a multiple of pgdir size
@@ -185,6 +257,8 @@ SYSCALL_DEFINE5(mshare, const char __user *, name, unsigned long, addr,
if (IS_ERR(fname))
goto err_out;
+ end = addr + len;
+
/*
* Does this mshare entry exist already? If it does, calling
* mshare with O_EXCL|O_CREAT is an error
@@ -197,49 +271,183 @@ SYSCALL_DEFINE5(mshare, const char __user *, name, unsigned long, addr,
inode_lock(d_inode(msharefs_sb->s_root));
dentry = d_lookup(msharefs_sb->s_root, &namestr);
if (dentry && (oflag & (O_EXCL|O_CREAT))) {
+ inode = d_inode(dentry);
err = -EEXIST;
dput(dentry);
goto err_unlock_inode;
}
if (dentry) {
+ unsigned long mapaddr, prot = PROT_NONE;
+
+ /*
+ * TODO: Address the following comment
+ *
+ * For now, we do not allow mshare mapping an existing mshare
+ * region if any overlapping mappings exist in calling
+ * process already
+ */
+ mmap_write_lock(current->mm);
+ vma = find_vma_intersection(current->mm, addr, end);
+ if (vma) {
+ mmap_write_unlock(current->mm);
+ err = -EINVAL;
+ goto err_out;
+ }
+
inode = d_inode(dentry);
if (inode == NULL) {
+ mmap_write_unlock(current->mm);
err = -EINVAL;
goto err_out;
}
info = inode->i_private;
- refcount_inc(&info->refcnt);
dput(dentry);
+
+ /*
+ * Map in the address range as anonymous mappings
+ */
+ mmap_write_unlock(current->mm);
+ oflag &= (O_RDONLY | O_WRONLY | O_RDWR);
+ if (oflag & O_RDONLY)
+ prot |= PROT_READ;
+ else if (oflag & O_WRONLY)
+ prot |= PROT_WRITE;
+ else if (oflag & O_RDWR)
+ prot |= (PROT_READ | PROT_WRITE);
+ mapaddr = vm_mmap(NULL, addr, len, prot,
+ MAP_FIXED | MAP_SHARED | MAP_ANONYMOUS, 0);
+ if (IS_ERR((void *)mapaddr)) {
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ refcount_inc(&info->refcnt);
+
+ /*
+ * Now that we have mmap'd the mshare'd range, update vma
+ * flags and vm_mm pointer for this mshare'd range.
+ */
+ mmap_write_lock(current->mm);
+ vma = find_vma(current->mm, addr);
+ if (vma && vma->vm_start < addr) {
+ mmap_write_unlock(current->mm);
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ while (vma && vma->vm_start < (addr + len)) {
+ vma->vm_private_data = info;
+ vma->vm_mm = info->mm;
+ vma->vm_flags |= VM_SHARED_PT;
+ next = vma->vm_next;
+ vma = next;
+ }
} else {
- mm = mm_alloc();
- if (!mm)
+ unsigned long myaddr;
+ struct mm_struct *old_mm;
+
+ old_mm = current->mm;
+ new_mm = mm_alloc();
+ if (!new_mm)
return -ENOMEM;
info = kzalloc(sizeof(*info), GFP_KERNEL);
if (!info) {
err = -ENOMEM;
goto err_relmm;
}
- mm->mmap_base = addr;
- mm->task_size = addr + len;
- if (!mm->task_size)
- mm->task_size--;
- info->mm = mm;
+ new_mm->mmap_base = addr;
+ new_mm->task_size = addr + len;
+ if (!new_mm->task_size)
+ new_mm->task_size--;
+ info->mm = new_mm;
+ info->host_mm = old_mm;
info->mode = mode;
refcount_set(&info->refcnt, 1);
+
+ /*
+ * VMAs for this address range may or may not exist.
+ * If VMAs exist, they should be marked as shared at
+ * this point and page table info should be copied
+ * over to newly created mm_struct. TODO: If VMAs do not
+ * exist, create them and mark them as shared.
+ */
+ mmap_write_lock(old_mm);
+ vma = find_vma_intersection(old_mm, addr, end);
+ if (!vma) {
+ err = -EINVAL;
+ goto unlock;
+ }
+ /*
+ * TODO: If the currently allocated VMA goes beyond the
+ * mshare'd range, this VMA needs to be split.
+ *
+ * Double check that source VMAs do not extend outside
+ * the range
+ */
+ vma = find_vma(old_mm, addr + len);
+ if (vma && vma->vm_start < (addr + len)) {
+ err = -EINVAL;
+ goto unlock;
+ }
+
+ vma = find_vma(old_mm, addr);
+ if (vma && vma->vm_start < addr) {
+ err = -EINVAL;
+ goto unlock;
+ }
+
+ mmap_write_lock(new_mm);
+ while (vma && vma->vm_start < (addr + len)) {
+ /*
+ * Copy this vma over to host mm
+ */
+ new_vma = vm_area_dup(vma);
+ if (!new_vma) {
+ err = -ENOMEM;
+ goto unlock;
+ }
+ vma->vm_mm = new_mm;
+ err = insert_vm_struct(new_mm, new_vma);
+ if (err)
+ goto unlock;
+
+ vma->vm_private_data = info;
+ vma->vm_mm = new_mm;
+ vma->vm_flags |= VM_SHARED_PT;
+ vma = vma->vm_next;
+ }
+ mmap_write_unlock(new_mm);
+
+
err = mshare_file_create(fname, oflag, info);
if (err)
- goto err_relinfo;
+ goto unlock;
+
+ /*
+ * Copy over current PTEs
+ */
+ myaddr = addr;
+ while (myaddr < new_mm->task_size) {
+ *pgd_offset(new_mm, myaddr) = *pgd_offset(old_mm, myaddr);
+ myaddr += PGDIR_SIZE;
+ }
+ /*
+ * TODO: Free the corresponding page table in calling
+ * process
+ */
}
+ mmap_write_unlock(current->mm);
inode_unlock(d_inode(msharefs_sb->s_root));
putname(fname);
return 0;
-err_relinfo:
+unlock:
+ mmap_write_unlock(current->mm);
kfree(info);
err_relmm:
- mmput(mm);
+ mmput(new_mm);
err_unlock_inode:
inode_unlock(d_inode(msharefs_sb->s_root));
err_out:
@@ -259,6 +467,9 @@ SYSCALL_DEFINE1(mshare_unlink, const char *, name)
struct mshare_data *info;
struct qstr namestr;
+ if (msharefs_sb == NULL)
+ return -ENOENT;
+
if (IS_ERR(fname))
goto err_out;
@@ -283,14 +494,38 @@ SYSCALL_DEFINE1(mshare_unlink, const char *, name)
/*
* Is this the last reference?
+ * TODO: permission checks are needed before proceeding
*/
if (refcount_dec_and_test(&info->refcnt)) {
simple_unlink(d_inode(msharefs_sb->s_root), dentry);
d_drop(dentry);
d_delete(dentry);
+ /*
+ * TODO: Release all physical pages allocated for this
+ * mshare range and release associated page table. If
+ * the final unlink happens from the process that created
+ * mshare'd range, do we return page tables and pages to
+ * that process so the creating process can continue using
+ * the address range it had chosen to mshare at some
+ * point?
+ *
+ * TODO: unmap shared vmas from every task that is using
+ * this mshare'd range.
+ */
mmput(info->mm);
kfree(info);
} else {
+ /*
+ * TODO: If mshare'd range is still mapped in the process,
+ * it should be unmapped. Following is minimal code and
+ * might need fix up
+ */
+ unsigned long tmp;
+
+ tmp = info->mm->task_size - info->mm->mmap_base;
+ if (info->host_mm != current->mm)
+ vm_munmap(info->mm->mmap_base, tmp);
+
dput(dentry);
}
--
2.32.0