[RFC PATCH v1 1/4] mm/remote_mapping: mirror a process address space
From: Mircea CIRJALIU - MELIU
Date: Wed Dec 11 2019 - 04:29:26 EST
Add a device that lets one process inspect another process's address
space via page table mirroring.
Give this device a source process PID via an ioctl(), then use mmap()
to access the source process's address space like an ordinary file.
Address space mirroring is limited to anonymous VMAs.
The device mirrors page tables on demand (at fault time) and
invalidates them by listening to MMU notifier events.
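
A minimal usage sketch from userspace (the /dev/mirror-proc node name is
an assumption based on the misc device registered below; the mmap()
offset selects the source virtual address and must be page-aligned):

  #include <fcntl.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <unistd.h>
  #include <sys/ioctl.h>
  #include <sys/mman.h>
  #include <linux/remote_mapping.h>

  int main(int argc, char **argv)
  {
      long psize = sysconf(_SC_PAGESIZE);
      unsigned long addr;
      int fd, pid;
      char *p;

      if (argc < 3)
          return 1;
      pid = atoi(argv[1]);                  /* source process */
      addr = strtoul(argv[2], NULL, 0);     /* source virtual address */

      fd = open("/dev/mirror-proc", O_RDWR);
      if (fd < 0)
          return 1;

      /* attach to the source process; the PID is passed by value */
      if (ioctl(fd, REMOTE_PROC_MAP, pid))
          return 1;

      /* mirror one page of the source address space */
      p = mmap(NULL, psize, PROT_READ, MAP_SHARED, fd,
               addr & ~(unsigned long)(psize - 1));
      if (p == MAP_FAILED)
          return 1;

      printf("byte at %#lx in pid %d: %02x\n", addr, pid,
             (unsigned char)p[addr & (psize - 1)]);

      munmap(p, psize);
      close(fd);
      return 0;
  }
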
Signed-off-by: Mircea Cirjaliu <mcirjaliu@xxxxxxxxxxxxxxx>
---
include/linux/remote_mapping.h | 33 ++
include/uapi/linux/remote_mapping.h | 12 +
mm/Kconfig | 9 +
mm/Makefile | 1 +
mm/remote_mapping.c | 615 ++++++++++++++++++++++++++++++++++++
5 files changed, 670 insertions(+)
create mode 100644 include/linux/remote_mapping.h
create mode 100644 include/uapi/linux/remote_mapping.h
create mode 100644 mm/remote_mapping.c
diff --git a/include/linux/remote_mapping.h b/include/linux/remote_mapping.h
new file mode 100644
index 0000000..ad0995d
--- /dev/null
+++ b/include/linux/remote_mapping.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _REMOTE_MAPPING_H
+#define _REMOTE_MAPPING_H
+
+#include <linux/mm.h>
+#include <linux/mmu_notifier.h>
+#include <linux/rbtree.h>
+#include <linux/srcu.h>
+#include <linux/rwsem.h>
+
+struct remote_file_context {
+ struct srcu_struct mm_srcu;
+ spinlock_t mm_lock;
+ struct mm_struct __rcu *mm;
+ struct mmu_notifier mn;
+
+ // interval tree for mapped ranges
+ struct rb_root_cached rb_root;
+ struct rw_semaphore tree_lock;
+};
+
+// describes a mapped range
+// mirror VMA points here
+struct remote_vma_context {
+ // all information about the mapped interval is found in the VMA
+ struct vm_area_struct *vma;
+
+ // interval tree link
+ struct rb_node target_rb;
+ unsigned long rb_subtree_last;
+};
+
+#endif /* _REMOTE_MAPPING_H */
diff --git a/include/uapi/linux/remote_mapping.h b/include/uapi/linux/remote_mapping.h
new file mode 100644
index 0000000..eb0eec3
--- /dev/null
+++ b/include/uapi/linux/remote_mapping.h
@@ -0,0 +1,12 @@
+/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
+
+#ifndef __UAPI_REMOTE_MAPPING_H__
+#define __UAPI_REMOTE_MAPPING_H__
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#define REMOTE_PROC_MAP _IOW('r', 0x01, int)
+// TODO: also ioctl for pidfd
+
+#endif /* __UAPI_REMOTE_MAPPING_H__ */
diff --git a/mm/Kconfig b/mm/Kconfig
index ab80933..c10dd5c 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -739,4 +739,13 @@ config ARCH_HAS_HUGEPD
config MAPPING_DIRTY_HELPERS
bool
+config REMOTE_MAPPING
+ bool "Remote memory mapping"
+ depends on MMU && MMU_NOTIFIER
+ default n
+
+ help
+ Allows a given process to map pages of another process in its own
+ address space.
+
endmenu
diff --git a/mm/Makefile b/mm/Makefile
index 1937cc2..595f1a8c 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -108,3 +108,4 @@ obj-$(CONFIG_ZONE_DEVICE) += memremap.o
obj-$(CONFIG_HMM_MIRROR) += hmm.o
obj-$(CONFIG_MEMFD_CREATE) += memfd.o
obj-$(CONFIG_MAPPING_DIRTY_HELPERS) += mapping_dirty_helpers.o
+obj-$(CONFIG_REMOTE_MAPPING) += remote_mapping.o
diff --git a/mm/remote_mapping.c b/mm/remote_mapping.c
new file mode 100644
index 0000000..358b1f5
--- /dev/null
+++ b/mm/remote_mapping.c
@@ -0,0 +1,615 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Remote memory mapping.
+ *
+ * Copyright (C) 2017-2018 Bitdefender S.R.L.
+ *
+ * Author:
+ * Mircea Cirjaliu <mcirjaliu@xxxxxxxxxxxxxxx>
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/rmap.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/rbtree.h>
+#include <linux/pagemap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/mm.h>
+#include <linux/pid.h>
+#include <linux/oom.h>
+#include <linux/huge_mm.h>
+#include <linux/mmu_notifier.h>
+#include <linux/sched/mm.h>
+#include <linux/interval_tree_generic.h>
+#include <linux/refcount.h>
+#include <linux/debugfs.h>
+#include <linux/miscdevice.h>
+#include <linux/remote_mapping.h>
+#include <uapi/linux/remote_mapping.h>
+#include <asm/pgalloc.h>
+#include <linux/mman.h>
+#include <linux/pfn_t.h>
+#include <linux/errno.h>
+#include <linux/lockdep.h>
+
+#include "internal.h"
+
+#include <linux/kgdb.h>
+
+#define ASSERT(exp) BUG_ON(!(exp))
+
+
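+/*
+ * The file offset of a mirror VMA encodes the source virtual address:
+ * ctx_start()/ctx_end() return the source-process address range that a
+ * given mirror VMA covers.
+ */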
+static inline unsigned long ctx_start(struct remote_vma_context *ctx)
+{
+ return ctx->vma->vm_pgoff << PAGE_SHIFT;
+}
+
+static inline unsigned long ctx_end(struct remote_vma_context *ctx)
+{
+ return (ctx->vma->vm_pgoff << PAGE_SHIFT) +
+ (ctx->vma->vm_end - ctx->vma->vm_start);
+}
+
+static inline unsigned long range_start(struct remote_vma_context *ctx)
+{
+ return ctx_start(ctx) + 1;
+}
+
+static inline unsigned long range_last(struct remote_vma_context *ctx)
+{
+ return ctx_end(ctx) - 1;
+}
+
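+/*
+ * The interval tree works with inclusive endpoints, so shrink each mapped
+ * [start, end) byte range by one at both ends. Overlap queries then behave
+ * like intersections of half-open ranges: VMAs that merely abut are not
+ * reported as overlapping.
+ */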
+INTERVAL_TREE_DEFINE(struct remote_vma_context, target_rb, unsigned long,
+ rb_subtree_last, range_start, range_last,
+ static inline, range_interval_tree)
+
+#define range_tree_foreach(ctx, root, start, last) \
+ for (ctx = range_interval_tree_iter_first(root, start, last);\
+ ctx; ctx = range_interval_tree_iter_next(ctx, start, last))
+
+static inline bool
+range_interval_tree_overlaps(struct remote_vma_context *ctx,
+ struct remote_file_context *fctx)
+{
+ struct remote_vma_context *iter;
+
+ range_tree_foreach(iter, &fctx->rb_root, ctx_start(ctx), ctx_end(ctx))
+ return true;
+
+ return false;
+}
+
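+/*
+ * A source range is about to be invalidated: zap the page table entries of
+ * every mirror VMA that overlaps it, so the next access faults again.
+ */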
+static int
+mirror_invalidate_range_start(struct mmu_notifier *mn,
+ const struct mmu_notifier_range *range)
+{
+ struct remote_file_context *fctx =
+ container_of(mn, struct remote_file_context, mn);
+ struct remote_vma_context *ctx;
+ unsigned long src_start, src_end;
+ unsigned long vma_start, vma_end;
+
+ /* quick filter - we only map pages from anon VMAs */
+ if (!vma_is_anonymous(range->vma))
+ return 0;
+
+ /*
+ * If ctx + VMA were found here, then the VMA + its address space
+ * haven't been unmapped. See comments in mirror_vm_close().
+ */
+ down_read(&fctx->tree_lock);
+
+ range_tree_foreach(ctx, &fctx->rb_root, range->start, range->end) {
+ pr_debug("%s: %lx-%lx found %lx-%lx\n",
+ __func__, range->start, range->end,
+ ctx_start(ctx), ctx_end(ctx));
+
+ // intersect these intervals (source process address range)
+ src_start = max(range->start, ctx_start(ctx));
+ src_end = min(range->end, ctx_end(ctx));
+
+ // translate to destination process address range
+ vma_start = ctx->vma->vm_start + (src_start - ctx_start(ctx));
+ vma_end = ctx->vma->vm_end + (src_end - ctx_end(ctx));
+
+ zap_vma_ptes(ctx->vma, vma_start, vma_end - vma_start);
+ }
+
+ up_read(&fctx->tree_lock);
+
+ return 0;
+}
+
+/* get notified when source MM is shutting down, so we avoid faulting in vain */
+static void
+mirror_release(struct mmu_notifier *mn, struct mm_struct *mm)
+{
+ struct remote_file_context *fctx =
+ container_of(mn, struct remote_file_context, mn);
+
+ spin_lock(&fctx->mm_lock);
+ rcu_assign_pointer(fctx->mm, NULL);
+ spin_unlock(&fctx->mm_lock);
+
+ /* delay address space closing until local faults finish */
+ synchronize_srcu(&fctx->mm_srcu);
+}
+
+static const struct mmu_notifier_ops mirror_notifier_ops = {
+ .invalidate_range_start = mirror_invalidate_range_start,
+ .release = mirror_release,
+};
+
+
+static void remote_file_context_init(struct remote_file_context *ctx)
+{
+ ctx->mm = NULL;
+ ctx->mn.ops = &mirror_notifier_ops;
+ init_srcu_struct(&ctx->mm_srcu);
+ spin_lock_init(&ctx->mm_lock);
+
+ ctx->rb_root = RB_ROOT_CACHED;
+ init_rwsem(&ctx->tree_lock);
+}
+
+static struct remote_file_context *remote_file_context_alloc(void)
+{
+ struct remote_file_context *ctx;
+
+ ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+ if (ctx)
+ remote_file_context_init(ctx);
+
+ return ctx;
+}
+
+static void remote_file_context_free(struct remote_file_context *ctx)
+{
+ kfree(ctx);
+}
+
+
+static void remote_vma_context_init(struct remote_vma_context *ctx)
+{
+ ctx->vma = NULL;
+}
+
+static struct remote_vma_context *remote_vma_context_alloc(void)
+{
+ struct remote_vma_context *ctx;
+
+ ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
+ if (ctx)
+ remote_vma_context_init(ctx);
+
+ return ctx;
+}
+
+static void remote_vma_context_free(struct remote_vma_context *ctx)
+{
+ kfree(ctx);
+}
+
+
+static struct page *mm_remote_get_page(struct mm_struct *req_mm,
+ unsigned long req_hva, unsigned int flags)
+{
+ struct page *req_page = NULL;
+ long nrpages;
+
+ might_sleep();
+
+ flags |= FOLL_ANON | FOLL_MIGRATION;
+
+ /* get host page corresponding to requested address */
+ nrpages = get_user_pages_remote(NULL, req_mm, req_hva, 1,
+ flags, &req_page, NULL, NULL);
+ if (unlikely(nrpages == 0)) {
+ pr_err("no page for req_hva %016lx\n", req_hva);
+ return ERR_PTR(-ENOENT);
+ } else if (IS_ERR_VALUE(nrpages)) {
+ pr_err("get_user_pages_remote() failed: %d\n", (int)nrpages);
+ return ERR_PTR(nrpages);
+ }
+
+ /* limit introspection to anon memory (this also excludes zero-page) */
+ if (!PageAnon(req_page)) {
+ put_page(req_page);
+ pr_err("page at req_hva %016lx not anon\n", req_hva);
+ return ERR_PTR(-EINVAL);
+ }
+
+ return req_page;
+}
+
+static int mirror_dev_open(struct inode *inode, struct file *file)
+{
+ struct remote_file_context *fctx;
+
+ fctx = remote_file_context_alloc();
+ if (!fctx)
+ return -ENOMEM;
+ file->private_data = fctx;
+
+ return 0;
+}
+
+static int do_remote_proc_map(struct file *file, int pid)
+{
+ struct task_struct *req_task;
+ struct mm_struct *req_mm;
+ struct remote_file_context *fctx = file->private_data;
+ int result;
+
+ /* this function may race with mirror_release() notifier */
+ spin_lock(&fctx->mm_lock);
+ if (fctx->mm) {
+ spin_unlock(&fctx->mm_lock);
+ return -EALREADY;
+ }
+ spin_unlock(&fctx->mm_lock);
+
+ // find task
+ req_task = find_get_task_by_vpid(pid);
+ if (!req_task)
+ return -ESRCH;
+
+ // find + get mm
+ req_mm = get_task_mm(req_task);
+ put_task_struct(req_task);
+ if (!req_mm)
+ return -EINVAL;
+
+ /* there should be no mirror VMA faults at the moment */
+ spin_lock(&fctx->mm_lock);
+ rcu_assign_pointer(fctx->mm, req_mm);
+ spin_unlock(&fctx->mm_lock);
+
+ // register MMU notifier
+ result = mmu_notifier_register(&fctx->mn, req_mm);
+ if (result) {
+ mmput(req_mm);
+ pr_err("unable to register MMU notifier\n");
+
+ return result;
+ }
+
+ mmput(req_mm);
+
+ return 0;
+}
+
+static long mirror_dev_ioctl(struct file *file, unsigned int ioctl,
+ unsigned long arg)
+{
+ long result;
+
+ switch (ioctl) {
+ case REMOTE_PROC_MAP: {
+ int pid = (int)arg;
+
+ result = do_remote_proc_map(file, pid);
+ break;
+ }
+
+ default:
+ pr_err("ioctl %d not implemented\n", ioctl);
+ result = -ENOTTY;
+ }
+
+ return result;
+}
+
+static int mirror_dev_release(struct inode *inode, struct file *file)
+{
+ struct remote_file_context *fctx = file->private_data;
+ struct mm_struct *src_mm = NULL;
+
+ /* this function may race with mirror_release() notifier */
+ spin_lock(&fctx->mm_lock);
+ if (fctx->mm) {
+ mmgrab(fctx->mm);
+ src_mm = fctx->mm;
+ }
+ spin_unlock(&fctx->mm_lock);
+
+ /* attempt unregistering if pointer found to be valid */
+ if (src_mm) {
+ mmu_notifier_unregister(&fctx->mn, src_mm);
+ mmdrop(src_mm);
+ }
+
+ /*
+ * the synchronization inside mmu_notifier_unregister() makes sure no
+ * notifier will run after the call
+ */
+ remote_file_context_free(fctx);
+
+ return 0;
+}
+
+/*
+ * We end up here if the local PMD is NULL.
+ * Doesn't matter if the address is aligned to huge page boundary or not.
+ * We look for a huge page mapped at the target equivalent address and try to
+ * map it in our page tables without splitting it.
+ */
+static vm_fault_t
+mirror_vm_hugefault(struct vm_fault *vmf, enum page_entry_size pe_size)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct file *file = vma->vm_file;
+ struct remote_file_context *fctx = file->private_data;
+ unsigned long req_address;
+ unsigned int gup_flags;
+ struct page *req_page;
+ vm_fault_t result;
+ struct mm_struct *src_mm;
+ int idx;
+
+ pr_debug("%s: pe_size %d, address %016lx\n",
+ __func__, pe_size, vmf->address);
+
+ /* No support for anonymous transparent PUD pages yet */
+ if (pe_size == PE_SIZE_PUD)
+ return VM_FAULT_FALLBACK;
+
+ idx = srcu_read_lock(&fctx->mm_srcu);
+
+ /* check if source mm still exists */
+ src_mm = srcu_dereference(fctx->mm, &fctx->mm_srcu);
+ if (!src_mm) {
+ result = VM_FAULT_SIGBUS;
+ goto out;
+ }
+
+ /* avoid a deadlock: don't block on the source mmap_sem while holding our own */
+ if (!down_read_trylock(&src_mm->mmap_sem)) {
+ srcu_read_unlock(&fctx->mm_srcu, idx);
+ up_read(&current->mm->mmap_sem);
+
+ return VM_FAULT_RETRY;
+ }
+
+ /* set GUP flags depending on the VMA */
+ gup_flags = FOLL_HUGE;
+ if (vma->vm_flags & VM_WRITE)
+ gup_flags |= FOLL_WRITE | FOLL_FORCE;
+
+ req_address = vmf->pgoff << PAGE_SHIFT;
+ req_page = mm_remote_get_page(src_mm, req_address, gup_flags);
+
+ /* check for validity of the page */
+ if (IS_ERR_OR_NULL(req_page)) {
+ up_read(&src_mm->mmap_sem);
+
+ if (PTR_ERR(req_page) == -ERESTARTSYS) {
+ srcu_read_unlock(&fctx->mm_srcu, idx);
+ up_read(&current->mm->mmap_sem);
+
+ return VM_FAULT_RETRY;
+ }
+
+ result = VM_FAULT_FALLBACK;
+ goto out;
+ }
+
+ /* shouldn't reach this case, but check anyway */
+ if (unlikely(!PageCompound(req_page))) {
+ result = VM_FAULT_FALLBACK;
+ goto out_page;
+ }
+
+ result = vmf_insert_pfn_pmd(vmf, page_to_pfn_t(req_page),
+ vmf->flags & FAULT_FLAG_WRITE);
+
+out_page:
+ put_page(req_page);
+ up_read(&src_mm->mmap_sem);
+
+out:
+ srcu_read_unlock(&fctx->mm_srcu, idx);
+
+ return result;
+}
+
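+/*
+ * Handle a fault in a mirror VMA: look up the page mapped at the equivalent
+ * source address and insert its PFN into the local page tables.
+ */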
+static vm_fault_t mirror_vm_fault(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct file *file = vma->vm_file;
+ struct remote_file_context *fctx = file->private_data;
+ unsigned long req_address;
+ unsigned int gup_flags;
+ struct page *req_page;
+ vm_fault_t result;
+ struct mm_struct *src_mm;
+ int idx;
+
+ pr_debug("%s: address %016lx\n", __func__, vmf->address);
+
+ idx = srcu_read_lock(&fctx->mm_srcu);
+
+ /* check if source mm still exists */
+ src_mm = srcu_dereference(fctx->mm, &fctx->mm_srcu);
+ if (!src_mm) {
+ result = VM_FAULT_SIGBUS;
+ goto out;
+ }
+
+ /* avoid a deadlock: don't block on the source mmap_sem while holding our own */
+ if (!down_read_trylock(&src_mm->mmap_sem)) {
+ srcu_read_unlock(&fctx->mm_srcu, idx);
+ up_read(&current->mm->mmap_sem);
+
+ return VM_FAULT_RETRY;
+ }
+
+ /* set GUP flags depending on the VMA */
+ gup_flags = FOLL_SPLIT;
+ if (vma->vm_flags & VM_WRITE)
+ gup_flags |= FOLL_WRITE | FOLL_FORCE;
+
+ req_address = vmf->pgoff << PAGE_SHIFT;
+ req_page = mm_remote_get_page(src_mm, req_address, gup_flags);
+
+ /* check for validity of the page */
+ if (IS_ERR_OR_NULL(req_page)) {
+ up_read(&src_mm->mmap_sem);
+
+ if (PTR_ERR(req_page) == -ERESTARTSYS ||
+ PTR_ERR(req_page) == -EBUSY) {
+ srcu_read_unlock(&fctx->mm_srcu, idx);
+ up_read(&current->mm->mmap_sem);
+
+ return VM_FAULT_RETRY;
+ }
+
+ result = VM_FAULT_SIGBUS;
+ goto out;
+ }
+
+ result = vmf_insert_pfn(vmf->vma, vmf->address, page_to_pfn(req_page));
+
+ put_page(req_page);
+ up_read(&src_mm->mmap_sem);
+
+out:
+ srcu_read_unlock(&fctx->mm_srcu, idx);
+
+ return result;
+}
+
+/*
+ * This is called in remove_vma() at the end of __do_munmap() after the address
+ * space has been unmapped and the page tables have been freed.
+ */
+static void mirror_vm_close(struct vm_area_struct *vma)
+{
+ struct remote_vma_context *ctx = vma->vm_private_data;
+ struct remote_file_context *fctx = vma->vm_file->private_data;
+
+ pr_debug("%s: %016lx - %016lx (%lu bytes)\n",
+ __func__, vma->vm_start, vma->vm_end,
+ vma->vm_end - vma->vm_start);
+
+ /* will wait for any running invalidate notifiers to finish */
+ down_write(&fctx->tree_lock);
+ range_interval_tree_remove(ctx, &fctx->rb_root);
+ up_write(&fctx->tree_lock);
+
+ remote_vma_context_free(ctx);
+ vma->vm_private_data = NULL;
+}
+
+// disallow splits; this prevents partial unmap of the destination VMA
+static int mirror_vm_split(struct vm_area_struct *area, unsigned long addr)
+{
+ return -EINVAL;
+}
+
+static const struct vm_operations_struct mirror_vm_ops = {
+ .close = mirror_vm_close,
+ .fault = mirror_vm_fault,
+ .huge_fault = mirror_vm_hugefault,
+ .split = mirror_vm_split,
+};
+
+
+static int mirror_dev_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct remote_file_context *fctx = file->private_data;
+ struct remote_vma_context *ctx;
+
+ pr_debug("%s: %016lx - %016lx (%lu bytes)\n",
+ __func__, vma->vm_start, vma->vm_end,
+ vma->vm_end - vma->vm_start);
+
+ if (!(vma->vm_flags & VM_SHARED))
+ return -EINVAL;
+
+ /* prepare the context */
+ ctx = remote_vma_context_alloc();
+ if (!ctx)
+ return -ENOMEM;
+
+ vma->vm_private_data = ctx;
+ ctx->vma = vma;
+
+ down_write(&fctx->tree_lock);
+ if (range_interval_tree_overlaps(ctx, fctx)) {
+ up_write(&fctx->tree_lock);
+
+ pr_err("part of range already mirrored\n");
+ remote_vma_context_free(ctx);
+ return -EALREADY;
+ }
+
+ range_interval_tree_insert(ctx, &fctx->rb_root);
+ up_write(&fctx->tree_lock);
+
+ /* set basic VMA properties */
+ vma->vm_flags |= VM_DONTCOPY | VM_DONTDUMP | VM_PFNMAP;
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+ vma->vm_ops = &mirror_vm_ops;
+
+ return 0;
+}
+
+/*
+ * We must have the same alignment relative to a huge page boundary
+ * as the requested source address, so that huge pages can be
+ * mirrored without splitting them.
+ */
+static unsigned long
+mirror_get_unmapped_area(struct file *file, const unsigned long addr0,
+ const unsigned long len, const unsigned long pgoff,
+ const unsigned long flags)
+{
+ struct vm_unmapped_area_info info;
+ unsigned long address = pgoff << PAGE_SHIFT;
+ bool huge_align = !(address & ~HPAGE_PMD_MASK);
+
+ pr_debug("%s: len %lu, pgoff 0x%016lx, %s alignment.\n",
+ __func__, len, pgoff, huge_align ? "PMD" : "page");
+
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ info.length = len;
+ info.low_limit = PAGE_SIZE;
+ info.high_limit = get_mmap_base(0);
+ info.align_mask = ~HPAGE_PMD_MASK;
+ info.align_offset = address & ~HPAGE_PMD_MASK;
+
+ address = vm_unmapped_area(&info);
+
+ pr_debug("%s: address 0x%016lx\n", __func__, address);
+
+ return address;
+}
+
+static const struct file_operations mirror_ops = {
+ .open = mirror_dev_open,
+ .unlocked_ioctl = mirror_dev_ioctl,
+ .compat_ioctl = mirror_dev_ioctl,
+ .get_unmapped_area = mirror_get_unmapped_area,
+ .llseek = no_llseek,
+ .mmap = mirror_dev_mmap,
+ .release = mirror_dev_release,
+};
+
+static struct miscdevice mirror_dev = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "mirror-proc",
+ .fops = &mirror_ops,
+};
+
+builtin_misc_device(mirror_dev);
+