[PATCH 11/11] hmm/dummy_driver: add support for fake remote memory using pages.

From: j.glisse
Date: Fri May 02 2014 - 09:53:45 EST


From: Jérôme Glisse <jglisse@xxxxxxxxxx>

Fake the existence of remote memory using preallocated system pages and
demonstrate how to use the HMM API for remote memory.

Signed-off-by: Jérôme Glisse <jglisse@xxxxxxxxxx>
---
drivers/char/hmm_dummy.c | 450 ++++++++++++++++++++++++++++++++++++++++-
include/uapi/linux/hmm_dummy.h | 8 +-
2 files changed, 453 insertions(+), 5 deletions(-)

diff --git a/drivers/char/hmm_dummy.c b/drivers/char/hmm_dummy.c
index e87dc7c..2443374 100644
--- a/drivers/char/hmm_dummy.c
+++ b/drivers/char/hmm_dummy.c
@@ -48,6 +48,8 @@

#define HMM_DUMMY_DEVICE_NAME "hmm_dummy_device"
#define HMM_DUMMY_DEVICE_MAX_MIRRORS 4
+#define HMM_DUMMY_DEVICE_RMEM_SIZE (32UL << 20UL)
+#define HMM_DUMMY_DEVICE_RMEM_NBITS (HMM_DUMMY_DEVICE_RMEM_SIZE >> PAGE_SHIFT)

struct hmm_dummy_device;

@@ -73,8 +75,16 @@ struct hmm_dummy_device {
/* device file mapping tracking (keep track of all vma) */
struct hmm_dummy_mirror *dmirrors[HMM_DUMMY_DEVICE_MAX_MIRRORS];
struct address_space *fmapping[HMM_DUMMY_DEVICE_MAX_MIRRORS];
+ struct page **rmem_pages;
+ unsigned long *rmem_bitmap;
};

+struct hmm_dummy_rmem {
+ struct hmm_rmem rmem;
+ unsigned long fuid;
+ unsigned long luid;
+ uint16_t *rmem_idx;
+};

/* We only create 2 device to show the inter device rmem sharing/migration
* capabilities.
@@ -482,6 +492,51 @@ static void hmm_dummy_pt_free(struct hmm_dummy_mirror *dmirror,
}


+/* hmm_dummy_rmem - dummy remote memory using system memory pages
+ *
+ * Helper functions to allocate and free fake remote memory out of the
+ * device rmem_pages pool.
+ */
+static void hmm_dummy_rmem_free(struct hmm_dummy_rmem *drmem)
+{
+ struct hmm_dummy_device *ddevice;
+ struct hmm_rmem *rmem = &drmem->rmem;
+ unsigned long i, npages;
+
+ npages = (rmem->luid - rmem->fuid);
+ ddevice = container_of(rmem->device, struct hmm_dummy_device, device);
+ mutex_lock(&ddevice->mutex);
+ for (i = 0; i < npages; ++i) {
+ clear_bit(drmem->rmem_idx[i], ddevice->rmem_bitmap);
+ }
+ mutex_unlock(&ddevice->mutex);
+
+ kfree(drmem->rmem_idx);
+ drmem->rmem_idx = NULL;
+}
+
+static struct hmm_dummy_rmem *hmm_dummy_rmem_new(void)
+{
+ struct hmm_dummy_rmem *drmem;
+
+ drmem = kzalloc(sizeof(*drmem), GFP_KERNEL);
+ return drmem;
+}
+
+static int hmm_dummy_mirror_lmem_to_rmem(struct hmm_dummy_mirror *dmirror,
+ unsigned long faddr,
+ unsigned long laddr)
+{
+ struct hmm_mirror *mirror = &dmirror->mirror;
+ struct hmm_fault fault;
+
+ fault.faddr = faddr & PAGE_MASK;
+ fault.laddr = PAGE_ALIGN(laddr);
+ return hmm_migrate_lmem_to_rmem(&fault, mirror);
+}
+
/* hmm_ops - hmm callback for the hmm dummy driver.
*
* Below are the various callback that the hmm api require for a device. The
@@ -574,7 +629,7 @@ static struct hmm_fence *hmm_dummy_lmem_update(struct hmm_mirror *mirror,

page = hmm_dummy_pte_to_page(*pldp);
if (page) {
- set_page_dirty(page);
+ set_page_dirty_lock(page);
}
}
*pldp &= ~HMM_DUMMY_PTE_DIRTY;
@@ -631,6 +686,318 @@ static int hmm_dummy_lmem_fault(struct hmm_mirror *mirror,
return 0;
}

+static struct hmm_rmem *hmm_dummy_rmem_alloc(struct hmm_device *device,
+ struct hmm_fault *fault)
+{
+ struct hmm_dummy_device *ddevice;
+ struct hmm_dummy_rmem *drmem;
+ struct hmm_rmem *rmem;
+ unsigned long i, npages;
+
+ ddevice = container_of(device, struct hmm_dummy_device, device);
+
+ drmem = hmm_dummy_rmem_new();
+ if (drmem == NULL) {
+ return ERR_PTR(-ENOMEM);
+ }
+ rmem = &drmem->rmem;
+
+ npages = (fault->laddr - fault->faddr) >> PAGE_SHIFT;
+ drmem->rmem_idx = kmalloc(npages * sizeof(uint16_t), GFP_KERNEL);
+ if (drmem->rmem_idx == NULL) {
+ kfree(drmem);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ mutex_lock(&ddevice->mutex);
+ for (i = 0; i < npages; ++i) {
+ unsigned long r;
+
+ r = find_first_zero_bit(ddevice->rmem_bitmap,
+ HMM_DUMMY_DEVICE_RMEM_NBITS);
+ /* find_first_zero_bit() returns the bitmap size, not a negative
+ * value, when no zero bit is found.
+ */
+ if (r >= HMM_DUMMY_DEVICE_RMEM_NBITS) {
+ while (i--) {
+ clear_bit(drmem->rmem_idx[i],
+ ddevice->rmem_bitmap);
+ }
+ kfree(drmem->rmem_idx);
+ kfree(drmem);
+ mutex_unlock(&ddevice->mutex);
+ return ERR_PTR(-ENOMEM);
+ }
+ /* Mark the page as used so it is not handed out twice. */
+ set_bit(r, ddevice->rmem_bitmap);
+ drmem->rmem_idx[i] = r;
+ }
+ mutex_unlock(&ddevice->mutex);
+
+ return rmem;
+}
+
+static struct hmm_fence *hmm_dummy_rmem_update(struct hmm_mirror *mirror,
+ struct hmm_rmem *rmem,
+ unsigned long faddr,
+ unsigned long laddr,
+ unsigned long fuid,
+ enum hmm_etype etype,
+ bool dirty)
+{
+ struct hmm_dummy_mirror *dmirror;
+ struct hmm_dummy_pt_map pt_map = {0};
+ unsigned long addr, i, mask, or, idx;
+
+ dmirror = container_of(mirror, struct hmm_dummy_mirror, mirror);
+ pt_map.dmirror = dmirror;
+ idx = fuid - rmem->fuid;
+
+ /* Sanity check for debugging; a real device driver does not have to do this. */
+ switch (etype) {
+ case HMM_UNREGISTER:
+ case HMM_UNMAP:
+ case HMM_MUNMAP:
+ case HMM_MPROT_WONLY:
+ case HMM_MIGRATE_TO_RMEM:
+ case HMM_MIGRATE_TO_LMEM:
+ mask = 0;
+ or = 0;
+ break;
+ case HMM_MPROT_RONLY:
+ case HMM_WRITEBACK:
+ mask = ~HMM_DUMMY_PTE_WRITE;
+ or = 0;
+ break;
+ case HMM_MPROT_RANDW:
+ mask = -1L;
+ or = HMM_DUMMY_PTE_WRITE;
+ break;
+ default:
+ printk(KERN_ERR "%4d:%s invalid event type %d\n",
+ __LINE__, __func__, etype);
+ return ERR_PTR(-EIO);
+ }
+
+ mutex_lock(&dmirror->mutex);
+ for (i = 0, addr = faddr; addr < laddr; ++i, addr += PAGE_SIZE, ++idx) {
+ unsigned long *pldp;
+
+ pldp = hmm_dummy_pt_pld_map(&pt_map, addr);
+ if (!pldp) {
+ continue;
+ }
+ if (dirty && ((*pldp) & HMM_DUMMY_PTE_DIRTY)) {
+ hmm_pfn_set_dirty(&rmem->pfns[idx]);
+ }
+ *pldp &= ~HMM_DUMMY_PTE_DIRTY;
+ *pldp &= mask;
+ *pldp |= or;
+ }
+ hmm_dummy_pt_unmap(&pt_map);
+
+ switch (etype) {
+ case HMM_UNREGISTER:
+ case HMM_MUNMAP:
+ hmm_dummy_pt_free(dmirror, faddr, laddr);
+ break;
+ default:
+ break;
+ }
+ mutex_unlock(&dmirror->mutex);
+ return NULL;
+}
+
+static int hmm_dummy_rmem_fault(struct hmm_mirror *mirror,
+ struct hmm_rmem *rmem,
+ unsigned long faddr,
+ unsigned long laddr,
+ unsigned long fuid,
+ struct hmm_fault *fault)
+{
+ struct hmm_dummy_mirror *dmirror;
+ struct hmm_dummy_device *ddevice;
+ struct hmm_dummy_pt_map pt_map = {0};
+ struct hmm_dummy_rmem *drmem;
+ unsigned long i;
+ bool write = fault ? !!(fault->flags & HMM_FAULT_WRITE) : false;
+
+ dmirror = container_of(mirror, struct hmm_dummy_mirror, mirror);
+ drmem = container_of(rmem, struct hmm_dummy_rmem, rmem);
+ ddevice = dmirror->ddevice;
+ pt_map.dmirror = dmirror;
+
+ mutex_lock(&dmirror->mutex);
+ for (i = fuid; faddr < laddr; ++i, faddr += PAGE_SIZE) {
+ unsigned long *pldp, pld_idx, pfn, idx = i - rmem->fuid;
+
+ pldp = hmm_dummy_pt_pld_map(&pt_map, faddr);
+ if (!pldp) {
+ continue;
+ }
+ pfn = page_to_pfn(ddevice->rmem_pages[drmem->rmem_idx[idx]]);
+ pld_idx = hmm_dummy_pld_index(faddr);
+ pldp[pld_idx] = (pfn << HMM_DUMMY_PFN_SHIFT);
+ if (test_bit(HMM_PFN_WRITE, &rmem->pfns[idx])) {
+ pldp[pld_idx] |= HMM_DUMMY_PTE_WRITE;
+ hmm_pfn_clear_lmem_uptodate(&rmem->pfns[idx]);
+ }
+ pldp[pld_idx] |= HMM_DUMMY_PTE_VALID_PAGE;
+ if (write && !test_bit(HMM_PFN_WRITE, &rmem->pfns[idx])) {
+ /* Fall back to using the system memory page directly. Another
+ * solution would be to migrate the whole range back to system
+ * memory.
+ */
+ hmm_pfn_clear_rmem_uptodate(&rmem->pfns[idx]);
+ if (!test_bit(HMM_PFN_LMEM_UPTODATE, &rmem->pfns[idx])) {
+ struct page *spage, *dpage;
+
+ dpage = hmm_pfn_to_page(rmem->pfns[idx]);
+ spage = ddevice->rmem_pages[drmem->rmem_idx[idx]];
+ copy_highpage(dpage, spage);
+ hmm_pfn_set_lmem_uptodate(&rmem->pfns[idx]);
+ }
+ pfn = rmem->pfns[idx] >> HMM_PFN_SHIFT;
+ pldp[pld_idx] = (pfn << HMM_DUMMY_PFN_SHIFT);
+ pldp[pld_idx] |= HMM_DUMMY_PTE_WRITE;
+ pldp[pld_idx] |= HMM_DUMMY_PTE_VALID_PAGE;
+ }
+ }
+ hmm_dummy_pt_unmap(&pt_map);
+ mutex_unlock(&dmirror->mutex);
+ return 0;
+}
+
+static struct hmm_fence *hmm_dummy_rmem_to_lmem(struct hmm_rmem *rmem,
+ unsigned long fuid,
+ unsigned long luid)
+{
+ struct hmm_dummy_device *ddevice;
+ struct hmm_dummy_rmem *drmem;
+ unsigned long i;
+
+ ddevice = container_of(rmem->device, struct hmm_dummy_device, device);
+ drmem = container_of(rmem, struct hmm_dummy_rmem, rmem);
+
+ for (i = fuid; i < luid; ++i) {
+ unsigned long idx = i - rmem->fuid;
+ struct page *spage, *dpage;
+
+ if (test_bit(HMM_PFN_LMEM_UPTODATE, &rmem->pfns[idx])) {
+ /* This lmem page is already uptodate. */
+ continue;
+ }
+ spage = ddevice->rmem_pages[drmem->rmem_idx[idx]];
+ dpage = hmm_pfn_to_page(rmem->pfns[idx]);
+ if (!dpage) {
+ return ERR_PTR(-EINVAL);
+ }
+ copy_highpage(dpage, spage);
+ hmm_pfn_set_lmem_uptodate(&rmem->pfns[idx]);
+ }
+
+ return NULL;
+}
+
+static struct hmm_fence *hmm_dummy_lmem_to_rmem(struct hmm_rmem *rmem,
+ unsigned long fuid,
+ unsigned long luid)
+{
+ struct hmm_dummy_device *ddevice;
+ struct hmm_dummy_rmem *drmem;
+ unsigned long i;
+
+ ddevice = container_of(rmem->device, struct hmm_dummy_device, device);
+ drmem = container_of(rmem, struct hmm_dummy_rmem, rmem);
+
+ for (i = fuid; i < luid; ++i) {
+ unsigned long idx = i - rmem->fuid;
+ struct page *spage, *dpage;
+
+ if (test_bit(HMM_PFN_RMEM_UPTODATE, &rmem->pfns[idx])) {
+ /* This rmem page is already uptodate. */
+ continue;
+ }
+ dpage = ddevice->rmem_pages[drmem->rmem_idx[idx]];
+ spage = hmm_pfn_to_page(rmem->pfns[idx]);
+ if (!spage) {
+ return ERR_PTR(-EINVAL);
+ }
+ copy_highpage(dpage, spage);
+ hmm_pfn_set_rmem_uptodate(&rmem->pfns[idx]);
+ }
+
+ return NULL;
+}
+
+static int hmm_dummy_rmem_do_split(struct hmm_rmem *rmem,
+ unsigned long fuid,
+ unsigned long luid)
+{
+ struct hmm_dummy_rmem *drmem, *dnew;
+ struct hmm_fault fault;
+ struct hmm_rmem *new;
+ unsigned long i, pgoff, npages;
+ int ret;
+
+ drmem = container_of(rmem, struct hmm_dummy_rmem, rmem);
+ npages = (luid - fuid);
+ pgoff = fuid - rmem->fuid;
+ fault.faddr = 0;
+ fault.laddr = npages << PAGE_SHIFT;
+ new = hmm_dummy_rmem_alloc(rmem->device, &fault);
+ if (IS_ERR(new)) {
+ return PTR_ERR(new);
+ }
+ dnew = container_of(new, struct hmm_dummy_rmem, rmem);
+
+ new->fuid = fuid;
+ new->luid = luid;
+ ret = hmm_rmem_split_new(rmem, new);
+ if (ret) {
+ /* Do not leak the newly allocated rmem on failure. */
+ hmm_dummy_rmem_free(dnew);
+ kfree(dnew);
+ return ret;
+ }
+
+ /* Update the rmem. It is fine to hold no lock here as no one else can
+ * access either of these rmem objects as long as the ranges are
+ * reserved.
+ */
+ for (i = 0; i < npages; ++i) {
+ dnew->rmem_idx[i] = drmem->rmem_idx[i + pgoff];
+ }
+ if (!pgoff) {
+ for (i = 0; i < (rmem->luid - rmem->fuid); ++i) {
+ drmem->rmem_idx[i] = drmem->rmem_idx[i + npages];
+ }
+ }
+
+ return 0;
+}
+
+static int hmm_dummy_rmem_split(struct hmm_rmem *rmem,
+ unsigned long fuid,
+ unsigned long luid)
+{
+ int ret;
+
+ if (fuid > rmem->fuid) {
+ ret = hmm_dummy_rmem_do_split(rmem, rmem->fuid, fuid);
+ if (ret) {
+ return ret;
+ }
+ }
+ if (luid < rmem->luid) {
+ ret = hmm_dummy_rmem_do_split(rmem, luid, rmem->luid);
+ if (ret) {
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static void hmm_dummy_rmem_destroy(struct hmm_rmem *rmem)
+{
+ struct hmm_dummy_rmem *drmem;
+
+ drmem = container_of(rmem, struct hmm_dummy_rmem, rmem);
+ hmm_dummy_rmem_free(drmem);
+ kfree(drmem);
+}
+
static const struct hmm_device_ops hmm_dummy_ops = {
.device_destroy = &hmm_dummy_device_destroy,
.mirror_release = &hmm_dummy_mirror_release,
@@ -638,6 +1005,14 @@ static const struct hmm_device_ops hmm_dummy_ops = {
.fence_wait = &hmm_dummy_fence_wait,
.lmem_update = &hmm_dummy_lmem_update,
.lmem_fault = &hmm_dummy_lmem_fault,
+ .rmem_alloc = &hmm_dummy_rmem_alloc,
+ .rmem_update = &hmm_dummy_rmem_update,
+ .rmem_fault = &hmm_dummy_rmem_fault,
+ .rmem_to_lmem = &hmm_dummy_rmem_to_lmem,
+ .lmem_to_rmem = &hmm_dummy_lmem_to_rmem,
+ .rmem_split = &hmm_dummy_rmem_split,
+ .rmem_split_adjust = &hmm_dummy_rmem_split,
+ .rmem_destroy = &hmm_dummy_rmem_destroy,
};


@@ -880,7 +1255,7 @@ static ssize_t hmm_dummy_fops_write(struct file *filp,
if (!(pldp[pld_idx] & HMM_DUMMY_PTE_WRITE)) {
hmm_dummy_pt_unmap(&pt_map);
mutex_unlock(&dmirror->mutex);
- goto fault;
+ goto fault;
}
pldp[pld_idx] |= HMM_DUMMY_PTE_DIRTY;
page = hmm_dummy_pte_to_page(pldp[pld_idx]);
@@ -964,8 +1339,11 @@ static long hmm_dummy_fops_unlocked_ioctl(struct file *filp,
unsigned int command,
unsigned long arg)
{
+ struct hmm_dummy_migrate dmigrate;
struct hmm_dummy_device *ddevice;
struct hmm_dummy_mirror *dmirror;
+ struct hmm_mirror *mirror;
+ void __user *uarg = (void __user *)arg;
unsigned minor;
int ret;

@@ -1011,6 +1389,31 @@ static long hmm_dummy_fops_unlocked_ioctl(struct file *filp,
"mirroring address space of %d\n",
dmirror->pid);
return 0;
+ case HMM_DUMMY_MIGRATE_TO_RMEM:
+ mutex_lock(&ddevice->mutex);
+ dmirror = ddevice->dmirrors[minor];
+ if (!dmirror) {
+ mutex_unlock(&ddevice->mutex);
+ return -EINVAL;
+ }
+ mirror = &dmirror->mirror;
+ mutex_unlock(&ddevice->mutex);
+
+ if (copy_from_user(&dmigrate, uarg, sizeof(dmigrate))) {
+ return -EFAULT;
+ }
+
+ ret = hmm_dummy_pt_alloc(dmirror,
+ dmigrate.faddr,
+ dmigrate.laddr);
+ if (ret) {
+ return ret;
+ }
+
+ ret = hmm_dummy_mirror_lmem_to_rmem(dmirror,
+ dmigrate.faddr,
+ dmigrate.laddr);
+ return ret;
default:
return -EINVAL;
}
@@ -1034,7 +1437,31 @@ static const struct file_operations hmm_dummy_fops = {
*/
static int hmm_dummy_device_init(struct hmm_dummy_device *ddevice)
{
- int ret, i;
+ struct page **pages;
+ unsigned long *bitmap;
+ int ret, i, npages;
+
+ npages = HMM_DUMMY_DEVICE_RMEM_SIZE >> PAGE_SHIFT;
+ bitmap = kzalloc(BITS_TO_LONGS(npages) * sizeof(long), GFP_KERNEL);
+ if (!bitmap) {
+ return -ENOMEM;
+ }
+ pages = kzalloc(npages * sizeof(void *), GFP_KERNEL);
+ if (!pages) {
+ kfree(bitmap);
+ return -ENOMEM;
+ }
+ for (i = 0; i < npages; ++i) {
+ pages[i] = alloc_page(GFP_KERNEL);
+ if (!pages[i]) {
+ while (i--) {
+ __free_page(pages[i]);
+ }
+ kfree(bitmap);
+ kfree(pages);
+ return -ENOMEM;
+ }
+ }

ret = alloc_chrdev_region(&ddevice->dev, 0,
HMM_DUMMY_DEVICE_MAX_MIRRORS,
@@ -1066,15 +1493,23 @@ static int hmm_dummy_device_init(struct hmm_dummy_device *ddevice)
goto error;
}

+ ddevice->rmem_bitmap = bitmap;
+ ddevice->rmem_pages = pages;
+
return 0;

error:
+ for (i = 0; i < npages; ++i) {
+ __free_page(pages[i]);
+ }
+ kfree(bitmap);
+ kfree(pages);
return ret;
}

static void hmm_dummy_device_fini(struct hmm_dummy_device *ddevice)
{
- unsigned i;
+ unsigned i, npages;

/* First finish hmm. */
for (i = 0; i < HMM_DUMMY_DEVICE_MAX_MIRRORS; i++) {
@@ -1092,6 +1527,13 @@ static void hmm_dummy_device_fini(struct hmm_dummy_device *ddevice)
cdev_del(&ddevice->cdev);
unregister_chrdev_region(ddevice->dev,
HMM_DUMMY_DEVICE_MAX_MIRRORS);
+
+ npages = HMM_DUMMY_DEVICE_RMEM_SIZE >> PAGE_SHIFT;
+ for (i = 0; i < npages; ++i) {
+ __free_page(ddevice->rmem_pages[i]);
+ }
+ kfree(ddevice->rmem_bitmap);
+ kfree(ddevice->rmem_pages);
}

static int __init hmm_dummy_init(void)
diff --git a/include/uapi/linux/hmm_dummy.h b/include/uapi/linux/hmm_dummy.h
index 16ae0d3..027c453 100644
--- a/include/uapi/linux/hmm_dummy.h
+++ b/include/uapi/linux/hmm_dummy.h
@@ -29,6 +29,12 @@
#include <linux/irqnr.h>
+#include <linux/types.h>

/* Expose the address space of the calling process through hmm dummy dev file */
-#define HMM_DUMMY_EXPOSE_MM _IO( 'R', 0x00 )
+#define HMM_DUMMY_EXPOSE_MM _IO( 'R', 0x00 )
+#define HMM_DUMMY_MIGRATE_TO_RMEM _IOW( 'R', 0x01, struct hmm_dummy_migrate )
+
+struct hmm_dummy_migrate {
+ __u64 faddr;
+ __u64 laddr;
+};

#endif /* _UAPI_LINUX_RANDOM_H */
--
1.9.0
