[RFC PATCH] proc: Add mmap callback for /proc/<pid>/mem

From: Haider Miraj
Date: Fri Sep 13 2024 - 13:40:36 EST


This patch introduces memory mapping (mmap) support for the /proc/<pid>/mem
interface. The new functionality allows users to map the memory of another
process into their own address space, reusing the same physical pages rather
than copying them.

The idea is to mmap another process's memory by first pinning its pages in
memory and then using `remap_pfn_range` to map them into the caller's VMA as
if they were device memory, so the same physical pages are shared. A list of
the pinned pages is maintained, and the references are dropped when the VMA
is closed. This design has certain limitations, noted below.
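
For context, here is a minimal userspace sketch of the intended usage. The
pid and target address are made up, and it assumes (as in this patch) that
the mmap offset encodes the page-aligned target virtual address; the patch
does not yet validate the mapping flags, so MAP_SHARED/PROT_READ here is
only one plausible combination:

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <unistd.h>

int main(void)
{
        pid_t pid = 1234;                  /* hypothetical target pid */
        off_t target = 0x7f0000000000;     /* page-aligned address in the target */
        char path[64];
        void *p;
        int fd;

        snprintf(path, sizeof(path), "/proc/%d/mem", (int)pid);
        fd = open(path, O_RDONLY);
        if (fd < 0)
                return 1;

        /* The offset selects the target address (vm_pgoff << PAGE_SHIFT) */
        p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, target);
        if (p == MAP_FAILED)
                return 1;

        /* p now aliases one page of the target process's memory */
        write(STDOUT_FILENO, p, 16);
        munmap(p, 4096);
        close(fd);
        return 0;
}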

I am seeking comments and advice on the following:
- Given that read access to `/proc/<pid>/mem` is already allowed for
privileged users, are there specific reasons or concerns that have prevented
the implementation of `mmap` for this interface?
- Is there a way to insert anonymous pages into a file-backed VMA so that it
honors reverse mapping, eliminating the need to keep track of pinned pages?
- I plan to implement a page fault handler as well; a rough sketch of what I
have in mind follows this list.
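
To illustrate the last point, here is a rough sketch of the shape the fault
handler could take. It is only a sketch: it assumes the target mm is
additionally stashed in struct vma_info (the current patch does not do this,
so info->mm below is hypothetical), and the pinned page would have to be
tracked exactly as in mem_mmap():

static vm_fault_t mem_vma_fault(struct vm_fault *vmf)
{
        struct vm_area_struct *vma = vmf->vma;
        struct vma_info *info = vma->vm_private_data;
        /* Target address backing the faulting offset */
        unsigned long addr = (vma->vm_pgoff << PAGE_SHIFT) +
                             (vmf->address - vma->vm_start);
        struct page *page;
        long pinned;

        /* Pin the target page on demand instead of at mmap time */
        down_read(&info->mm->mmap_lock);
        pinned = get_user_pages_remote(info->mm, addr, 1, FOLL_GET,
                                       &page, NULL, NULL);
        up_read(&info->mm->mmap_lock);
        if (pinned <= 0)
                return VM_FAULT_SIGBUS;

        /* A page_list_item would be added here, as in mem_mmap(), so the
         * reference is dropped on close. */

        /* Install the PFN; the VMA is VM_PFNMAP after remap_pfn_range() */
        return vmf_insert_pfn(vma, vmf->address, page_to_pfn(page));
}

This would be wired up with .fault = mem_vma_fault in mem_vm_ops. Whether
vmf_insert_pfn() is the right installation primitive for a mapping populated
with remap_pfn_range() is part of what I am asking about.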

I am looking for feedback on how to improve this implementation and what
additional considerations are necessary for it to be accepted by the community.

Cc: xe-linux-external@xxxxxxxxx
Signed-off-by: Haider Miraj <hmiraj@xxxxxxxxx>
---
fs/proc/base.c | 143 ++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 143 insertions(+)

diff --git a/fs/proc/base.c b/fs/proc/base.c
index 72a1acd03675..405de47d0c1c 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -117,6 +117,17 @@
static u8 nlink_tid __ro_after_init;
static u8 nlink_tgid __ro_after_init;

+struct vma_info {
+ struct list_head page_list_head;
+ uintptr_t vma_start_addr;
+ uintptr_t vma_end_addr;
+};
+
+struct page_list_item {
+ struct list_head list;
+ struct page *page;
+};
+
struct pid_entry {
const char *name;
unsigned int len;
@@ -926,12 +937,144 @@ static int mem_release(struct inode *inode, struct file *file)
return 0;
}

+static void mem_vma_close(struct vm_area_struct *vma)
+{
+ struct vma_info *info;
+ struct page_list_item *item, *tmp;
+
+ info = vma->vm_private_data;
+
+ if (info) {
+ /* If the VMA has been split, skip the cleanup and warn instead */
+ if (info->vma_start_addr == vma->vm_start &&
+ info->vma_end_addr == vma->vm_end) {
+ /* Walk the list, dropping each page reference and freeing its entry */
+ list_for_each_entry_safe(item, tmp,
+ &info->page_list_head, list) {
+ list_del(&item->list);
+ put_page(item->page);
+ kfree(item);
+ }
+
+ kfree(info);
+ vma->vm_private_data = NULL;
+ } else {
+ pr_warn("%s: VMA has been split, operation not supported\n", __func__);
+ }
+ }
+}
+
+static const struct vm_operations_struct mem_vm_ops = {
+ .close = mem_vma_close,
+};
+
+/**
+ * mem_mmap - mmap callback for /proc/<pid>/mem
+ * @file: the opened /proc/<pid>/mem file
+ * @vma: user VMA to populate; vm_pgoff selects the target address
+ *
+ * Assumptions and limitations:
+ * - Reverse mapping is not handled, so the mapped pages cannot be swapped.
+ * - The VMA is not expected to be split by a partial munmap.
+ */
+static int mem_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ uintptr_t addr, target_start_addr, target_end_addr;
+ struct page_list_item *item, *tmp;
+ struct page *page, *zero_page;
+ unsigned long zero_page_pfn;
+ struct vma_info *info;
+ long pinned;
+ int ret = 0;
+
+ /* Retrieve the mm of the target process */
+ struct mm_struct *mm = (struct mm_struct *)file->private_data;
+ size_t size = vma->vm_end - vma->vm_start;
+ uintptr_t start_addr = vma->vm_start;
+
+ /* The file offset encodes the target virtual address */
+ target_start_addr = vma->vm_pgoff << PAGE_SHIFT;
+ target_end_addr = target_start_addr + size;
+
+ if (!mm)
+ return -EINVAL;
+
+ info = kmalloc(sizeof(*info), GFP_KERNEL);
+ if (!info)
+ return -ENOMEM;
+ INIT_LIST_HEAD(&info->page_list_head);
+ info->vma_start_addr = vma->vm_start;
+ info->vma_end_addr = vma->vm_end;
+
+ vma->vm_private_data = info;
+ vma->vm_ops = &mem_vm_ops;
+
+ zero_page = ZERO_PAGE(0);
+ zero_page_pfn = page_to_pfn(zero_page);
+
+ /* Acquire the mmap_lock before pinning pages (get_user_pages_remote) */
+ down_read(&mm->mmap_lock);
+
+ for (addr = target_start_addr; addr < target_end_addr; addr += PAGE_SIZE) {
+ unsigned long pfn;
+
+ /* Pin the user page */
+ pinned = get_user_pages_remote(mm, addr, 1, FOLL_GET | FOLL_NOFAULT,
+ &page, NULL, NULL);
+ /* Page not resident (FOLL_NOFAULT): back this range with the zero page */
+ if (pinned <= 0) {
+ ret = remap_pfn_range(vma, start_addr, zero_page_pfn, PAGE_SIZE,
+ vma->vm_page_prot);
+ if (ret)
+ goto err_unlock;
+ start_addr += PAGE_SIZE;
+ continue;
+ }
+
+ /* Track the pinned page so it can be released on VMA close */
+ item = kmalloc(sizeof(*item), GFP_KERNEL);
+ if (!item) {
+ put_page(page);
+ ret = -ENOMEM;
+ goto err_unlock;
+ }
+
+ item->page = page;
+ list_add(&item->list, &info->page_list_head);
+ pfn = page_to_pfn(page);
+
+ /* Remap the page frame under the current VMA */
+ ret = remap_pfn_range(vma, start_addr, pfn, PAGE_SIZE,
+ vma->vm_page_prot);
+ if (ret)
+ goto err_unlock;
+
+ start_addr += PAGE_SIZE;
+ }
+
+ up_read(&mm->mmap_lock);
+ return 0;
+
+err_unlock:
+ up_read(&mm->mmap_lock);
+ /* ->close() is not called when ->mmap() fails; drop pages pinned so far */
+ list_for_each_entry_safe(item, tmp, &info->page_list_head, list) {
+ list_del(&item->list);
+ put_page(item->page);
+ kfree(item);
+ }
+ kfree(info);
+ vma->vm_private_data = NULL;
+ return ret;
+}
+
static const struct file_operations proc_mem_operations = {
.llseek = mem_lseek,
.read = mem_read,
.write = mem_write,
.open = mem_open,
.release = mem_release,
+ .mmap = mem_mmap,
};

static int environ_open(struct inode *inode, struct file *file)
--
2.35.6