[RFC 8/8] p2pmem: Added char device user interface

From: Logan Gunthorpe
Date: Thu Mar 30 2017 - 18:14:52 EST


This creates a userspace interface to use p2pmemory. A user can use
mmap on the p2pmem char device to get buffers from the corresponding
device. This allows a user to use p2p memory with existing
interfaces like RDMA and O_DIRECT.

This patch is a bit more controversial because people don't want to
expose these interfaces to userspace without more consideration.
However, this patch is _very_ useful for experimenting with p2p memory.

For example, with this patch, you can test with commands like:

ib_write_bw -R --mmap=/dev/p2pmem0 -D 30

or use an fio script like:

[rdma-server]
rw=read
mem=mmapshared:/dev/p2pmem0
ioengine=rdma
port=14242
bs=64k
size=10G
iodepth=2

which would test the bandwidth of RDMA to/from the specified p2p memory.

Signed-off-by: Logan Gunthorpe <logang@xxxxxxxxxxxx>
Signed-off-by: Stephen Bates <sbates@xxxxxxxxxxxx>
Signed-off-by: Steve Wise <swise@xxxxxxxxxxxxxxxxxxxxx>
---
drivers/memory/p2pmem.c | 184 +++++++++++++++++++++++++++++++++++++++++++++++-
include/linux/p2pmem.h | 4 ++
2 files changed, 186 insertions(+), 2 deletions(-)

diff --git a/drivers/memory/p2pmem.c b/drivers/memory/p2pmem.c
index 499d42c..129c49c 100644
--- a/drivers/memory/p2pmem.c
+++ b/drivers/memory/p2pmem.c
@@ -19,14 +19,20 @@
#include <linux/genalloc.h>
#include <linux/memremap.h>
#include <linux/debugfs.h>
+#include <linux/pfn_t.h>

MODULE_DESCRIPTION("Peer 2 Peer Memory Device");
MODULE_VERSION("0.1");
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Microsemi Corporation");

+static int max_devices = 16;
+module_param(max_devices, int, 0444);
+MODULE_PARM_DESC(max_devices, "Maximum number of char devices");
+
static struct class *p2pmem_class;
static DEFINE_IDA(p2pmem_ida);
+static dev_t p2pmem_devt;

static struct dentry *p2pmem_debugfs_root;

@@ -67,6 +73,144 @@ static struct p2pmem_dev *to_p2pmem(struct device *dev)
return container_of(dev, struct p2pmem_dev, dev);
}

+/*
+ * Per-mapping state for one mmap() of a p2pmem char device.  Allocated
+ * in p2pmem_mmap() and shared by all forked copies of the VMA; freed by
+ * p2pmem_vma_close() when the last copy goes away.
+ */
+struct p2pmem_vma {
+ struct p2pmem_dev *p2pmem_dev;
+ atomic_t mmap_count;
+ size_t nr_pages;
+
+ /* Protects the used_pages array */
+ struct mutex mutex;
+ struct page *used_pages[];
+};
+
+/* VMA open callback: a new copy of the mapping (e.g. fork) takes a ref. */
+static void p2pmem_vma_open(struct vm_area_struct *vma)
+{
+ struct p2pmem_vma *pv = vma->vm_private_data;
+
+ atomic_inc(&pv->mmap_count);
+}
+
+/*
+ * Return every page that was faulted into this mapping to the device's
+ * pool.  Called from the last close and from device removal, so entries
+ * are NULLed under pv->mutex to make the walk idempotent.
+ */
+static void p2pmem_vma_free_pages(struct vm_area_struct *vma)
+{
+ int i;
+ struct p2pmem_vma *pv = vma->vm_private_data;
+
+ mutex_lock(&pv->mutex);
+
+ for (i = 0; i < pv->nr_pages; i++) {
+ if (pv->used_pages[i]) {
+ p2pmem_free_page(pv->p2pmem_dev, pv->used_pages[i]);
+ pv->used_pages[i] = NULL;
+ }
+ }
+
+ mutex_unlock(&pv->mutex);
+}
+
+/*
+ * VMA close callback: drop one reference; the final closer returns all
+ * pages to the pool and frees the per-mapping state.
+ */
+static void p2pmem_vma_close(struct vm_area_struct *vma)
+{
+ struct p2pmem_vma *pv = vma->vm_private_data;
+
+ if (!atomic_dec_and_test(&pv->mmap_count))
+ return;
+
+ p2pmem_vma_free_pages(vma);
+
+ dev_dbg(&pv->p2pmem_dev->dev, "vma close");
+ kfree(pv);
+}
+
+/*
+ * Fault handler: lazily allocate one p2p page per faulting page offset
+ * and insert it into the user mapping.  Faults after the backing device
+ * has gone away get SIGBUS.
+ *
+ * Fix vs. v1: the allocation-failure path returned VM_FAULT_OOM while
+ * still holding pv->mutex, deadlocking every subsequent fault or free
+ * on this mapping.  Unlock before returning.
+ */
+static int p2pmem_vma_fault(struct vm_fault *vmf)
+{
+ struct p2pmem_vma *pv = vmf->vma->vm_private_data;
+ unsigned int pg_idx;
+ struct page *pg;
+ pfn_t pfn;
+ int rc;
+
+ if (!pv->p2pmem_dev->alive)
+ return VM_FAULT_SIGBUS;
+
+ pg_idx = (vmf->address - vmf->vma->vm_start) / PAGE_SIZE;
+
+ mutex_lock(&pv->mutex);
+
+ if (pv->used_pages[pg_idx])
+ pg = pv->used_pages[pg_idx];
+ else
+ pg = p2pmem_alloc_page(pv->p2pmem_dev);
+
+ if (!pg) {
+ /* must not leak pv->mutex on the OOM path */
+ mutex_unlock(&pv->mutex);
+ return VM_FAULT_OOM;
+ }
+
+ pv->used_pages[pg_idx] = pg;
+
+ pfn = phys_to_pfn_t(page_to_phys(pg), PFN_DEV | PFN_MAP);
+ rc = vm_insert_mixed(vmf->vma, vmf->address, pfn);
+
+ mutex_unlock(&pv->mutex);
+
+ /* -EBUSY means another thread raced us and already inserted the pfn */
+ if (rc == -ENOMEM)
+ return VM_FAULT_OOM;
+ if (rc < 0 && rc != -EBUSY)
+ return VM_FAULT_SIGBUS;
+
+ return VM_FAULT_NOPAGE;
+}
+
+/* VMA callbacks for p2pmem mappings.  File-local, so make it static
+ * instead of exporting an unprefixed symbol into the global namespace.
+ */
+static const struct vm_operations_struct p2pmem_vmops = {
+ .open = p2pmem_vma_open,
+ .close = p2pmem_vma_close,
+ .fault = p2pmem_vma_fault,
+};
+
+/*
+ * Char-device open: stash the owning p2pmem_dev for mmap() and remember
+ * the inode so p2pmem_remove() can unmap all user mappings.
+ *
+ * NOTE(review): p->inode is overwritten on every open with no locking —
+ * appears to assume all opens of a given cdev share one inode; confirm.
+ */
+static int p2pmem_open(struct inode *inode, struct file *filp)
+{
+ struct p2pmem_dev *p;
+
+ p = container_of(inode->i_cdev, struct p2pmem_dev, cdev);
+ filp->private_data = p;
+ p->inode = inode;
+
+ return 0;
+}
+
+/*
+ * Set up a lazily-populated shared mapping of p2p memory.  Pages are
+ * allocated on fault (see p2pmem_vma_fault), so only the bookkeeping
+ * array is allocated here.  Private (COW) mappings make no sense for
+ * device memory and are rejected.
+ *
+ * Fix vs. v1: nr_pages is user-controlled via the mmap length; guard
+ * the sizeof(*pv) + n * sizeof(ptr) computation against size_t
+ * overflow before handing it to kzalloc.
+ */
+static int p2pmem_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ struct p2pmem_dev *p = filp->private_data;
+ struct p2pmem_vma *pv;
+ size_t nr_pages = (vma->vm_end - vma->vm_start) / PAGE_SIZE;
+
+ if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) {
+ dev_warn(&p->dev, "mmap failed: can't create private mapping\n");
+ return -EINVAL;
+ }
+
+ if (nr_pages > (SIZE_MAX - sizeof(*pv)) / sizeof(pv->used_pages[0]))
+ return -ENOMEM;
+
+ dev_dbg(&p->dev, "Allocating mmap with %zd pages.\n", nr_pages);
+
+ pv = kzalloc(sizeof(*pv) + sizeof(pv->used_pages[0]) * nr_pages,
+ GFP_KERNEL);
+ if (!pv)
+ return -ENOMEM;
+
+ mutex_init(&pv->mutex);
+ pv->nr_pages = nr_pages;
+ pv->p2pmem_dev = p;
+ atomic_set(&pv->mmap_count, 1);
+
+ vma->vm_private_data = pv;
+ vma->vm_ops = &p2pmem_vmops;
+ vma->vm_flags |= VM_MIXEDMAP;
+
+ return 0;
+}
+
+/* File operations for /dev/p2pmemN: open + mmap only; no read/write. */
+static const struct file_operations p2pmem_fops = {
+ .owner = THIS_MODULE,
+ .open = p2pmem_open,
+ .mmap = p2pmem_mmap,
+};
+
static void p2pmem_percpu_release(struct percpu_ref *ref)
{
struct p2pmem_dev *p = container_of(ref, struct p2pmem_dev, ref);
@@ -114,10 +258,23 @@ struct remove_callback {
static void p2pmem_remove(struct p2pmem_dev *p)
{
struct remove_callback *remove_call, *tmp;
+ struct vm_area_struct *vma;

p->alive = false;
list_for_each_entry_safe(remove_call, tmp, &p->remove_list, list)
remove_call->callback(remove_call->context);
+
+ /* p->inode is only set once the char device has been opened;
+ * with no opens there can be no user mappings to tear down.
+ */
+ if (!p->inode)
+ return;
+
+ /* Zap all user PTEs first (new faults now see !alive and SIGBUS),
+ * then return every faulted-in page to the pool.
+ */
+ unmap_mapping_range(p->inode->i_mapping, 0, 0, 1);
+
+ i_mmap_lock_write(p->inode->i_mapping);
+ vma_interval_tree_foreach(vma, &p->inode->i_mapping->i_mmap, 0,
+ ULONG_MAX) {
+ p2pmem_vma_free_pages(vma);
+ }
+ i_mmap_unlock_write(p->inode->i_mapping);
}

/**
@@ -147,6 +304,10 @@ struct p2pmem_dev *p2pmem_create(struct device *parent)
p->dev.parent = parent;
p->dev.release = p2pmem_release;

+ cdev_init(&p->cdev, &p2pmem_fops);
+ p->cdev.owner = THIS_MODULE;
+ p->cdev.kobj.parent = &p->dev.kobj;
+
p->id = ida_simple_get(&p2pmem_ida, 0, 0, GFP_KERNEL);
if (p->id < 0) {
rc = p->id;
@@ -154,6 +315,7 @@ struct p2pmem_dev *p2pmem_create(struct device *parent)
}

dev_set_name(&p->dev, "p2pmem%d", p->id);
+ p->dev.devt = MKDEV(MAJOR(p2pmem_devt), p->id);

p->pool = gen_pool_create(PAGE_SHIFT, nid);
if (!p->pool) {
@@ -177,14 +339,20 @@ struct p2pmem_dev *p2pmem_create(struct device *parent)
setup_debugfs(p);
}

- rc = device_add(&p->dev);
+ rc = cdev_add(&p->cdev, p->dev.devt, 1);
if (rc)
goto err_id;

- dev_info(&p->dev, "registered");
+ rc = device_add(&p->dev);
+ if (rc)
+ goto err_cdev;

+ dev_info(&p->dev, "registered");
return p;

+err_cdev:
+ cdev_del(&p->cdev);
+ p2pmem_remove(p);
err_id:
ida_simple_remove(&p2pmem_ida, p->id);
err_free:
@@ -206,6 +374,7 @@ void p2pmem_unregister(struct p2pmem_dev *p)

dev_info(&p->dev, "unregistered");
device_del(&p->dev);
+ cdev_del(&p->cdev);
p2pmem_remove(p);
ida_simple_remove(&p2pmem_ida, p->id);
put_device(&p->dev);
@@ -495,21 +664,32 @@ EXPORT_SYMBOL(p2pmem_put);

static int __init p2pmem_init(void)
{
+ int rc;
+
p2pmem_class = class_create(THIS_MODULE, "p2pmem");
if (IS_ERR(p2pmem_class))
return PTR_ERR(p2pmem_class);

+ /* Register under "p2pmem", not the stale "iopmemc" name carried
+ * over from the earlier iopmem driver — keeps /proc/devices
+ * consistent with the class and device names.
+ */
+ rc = alloc_chrdev_region(&p2pmem_devt, 0, max_devices, "p2pmem");
+ if (rc)
+ goto err_chrdev;
+
p2pmem_debugfs_root = debugfs_create_dir("p2pmem", NULL);
if (!p2pmem_debugfs_root)
pr_info("could not create debugfs entry, continuing\n");

return 0;
+
+err_chrdev:
+ class_destroy(p2pmem_class);
+ return rc;
}
module_init(p2pmem_init);

static void __exit p2pmem_exit(void)
{
debugfs_remove_recursive(p2pmem_debugfs_root);
+ /* release the dev_t range acquired in p2pmem_init() */
+ unregister_chrdev_region(p2pmem_devt, max_devices);
class_destroy(p2pmem_class);

pr_info(KBUILD_MODNAME ": unloaded.\n");
diff --git a/include/linux/p2pmem.h b/include/linux/p2pmem.h
index 9365b02..aeee60d 100644
--- a/include/linux/p2pmem.h
+++ b/include/linux/p2pmem.h
@@ -18,6 +18,7 @@

#include <linux/device.h>
#include <linux/pci.h>
+#include <linux/cdev.h>

struct p2pmem_dev {
struct device dev;
@@ -32,6 +33,9 @@ struct p2pmem_dev {

struct mutex remove_mutex; /* protects the remove callback list */
struct list_head remove_list;
+
+ struct cdev cdev;
+ struct inode *inode;
};

#ifdef CONFIG_P2PMEM
--
2.1.4