[PATCH 15/15] HMM/dummy: add fake device memory to dummy HMM device driver.

From: JÃrÃme Glisse
Date: Thu Aug 13 2015 - 15:38:38 EST


This patch add fake device memory by simply using regular system memory
page and pretending they are not accessible by the CPU directly. This
serve to showcase how migration to device memory can be impemented inside
a real device driver.

Signed-off-by: JÃrÃme Glisse <jglisse@xxxxxxxxxx>
---
drivers/char/hmm_dummy.c | 395 +++++++++++++++++++++++++++++++++++++++--
include/uapi/linux/hmm_dummy.h | 17 +-
2 files changed, 391 insertions(+), 21 deletions(-)

diff --git a/drivers/char/hmm_dummy.c b/drivers/char/hmm_dummy.c
index 52843cb..a4af5b1 100644
--- a/drivers/char/hmm_dummy.c
+++ b/drivers/char/hmm_dummy.c
@@ -43,6 +43,9 @@
#define HMM_DUMMY_MAX_DEVICES 4
#define HMM_DUMMY_MAX_MIRRORS 4

+#define HMM_DUMMY_RMEM_SIZE (32UL << 20UL)
+#define HMM_DUMMY_RMEM_NBITS (HMM_DUMMY_RMEM_SIZE >> PAGE_SHIFT)
+
struct dummy_device;

struct dummy_mirror {
@@ -70,6 +73,8 @@ struct dummy_device {
/* device file mapping tracking (keep track of all vma) */
struct dummy_mirror *dmirrors[HMM_DUMMY_MAX_MIRRORS];
struct address_space *fmapping[HMM_DUMMY_MAX_MIRRORS];
+ struct page **rmem_pages;
+ unsigned long *rmem_bitmap;
};

struct dummy_event {
@@ -77,11 +82,30 @@ struct dummy_event {
struct list_head list;
uint64_t nsys_pages;
uint64_t nfaulted_sys_pages;
+ uint64_t ndev_pages;
+ uint64_t nfaulted_dev_pages;
+ unsigned *dpfn;
+ unsigned npages;
bool backoff;
};

static struct dummy_device ddevices[HMM_DUMMY_MAX_DEVICES];

+/* dummy_device_pfn_to_page() - Return struct page of fake device memory.
+ *
+ * @ddevice: The dummy device.
+ * @pfn: The fake device page frame number.
+ * Return: The pointer to the struct page of the fake device memory.
+ *
+ * For the dummy device remote memory we simply allocate regular page and
+ * pretend they are not accessible directly by the CPU.
+ */
+struct page *dummy_device_pfn_to_page(struct dummy_device *ddevice,
+ unsigned pfn)
+{
+ return ddevice->rmem_pages[pfn];
+}
+

static void dummy_mirror_release(struct hmm_mirror *mirror)
{
@@ -233,9 +257,11 @@ static int dummy_mirror_pt_invalidate(struct hmm_mirror *mirror,
unsigned long addr = event->start;
struct hmm_pt_iter miter, diter;
struct dummy_mirror *dmirror;
+ struct dummy_device *ddevice;
int ret = 0;

dmirror = container_of(mirror, struct dummy_mirror, mirror);
+ ddevice = dmirror->ddevice;

hmm_pt_iter_init(&diter, &dmirror->pt);
hmm_pt_iter_init(&miter, &mirror->pt);
@@ -259,6 +285,24 @@ static int dummy_mirror_pt_invalidate(struct hmm_mirror *mirror,
*/
hmm_pt_iter_directory_lock(&diter);

+ /* Handle the fake device memory page table entry case. */
+ if (hmm_pte_test_valid_dev(dpte)) {
+ unsigned dpfn = hmm_pte_dev_addr(*dpte) >> PAGE_SHIFT;
+
+ *dpte &= event->pte_mask;
+ if (!hmm_pte_test_valid_dev(dpte)) {
+ /*
+ * Just directly free the fake device memory.
+ */
+ clear_bit(dpfn, ddevice->rmem_bitmap);
+ hmm_pt_iter_directory_unref(&diter);
+ }
+ hmm_pt_iter_directory_unlock(&diter);
+
+ addr += PAGE_SIZE;
+ continue;
+ }
+
/*
* Just skip this entry if it is not valid inside the dummy
* mirror page table.
@@ -341,10 +385,178 @@ static int dummy_mirror_update(struct hmm_mirror *mirror,
}
}

+static int dummy_copy_from_device(struct hmm_mirror *mirror,
+ const struct hmm_event *event,
+ dma_addr_t *dst,
+ unsigned long start,
+ unsigned long end)
+{
+ struct hmm_pt_iter miter, diter;
+ struct dummy_device *ddevice;
+ struct dummy_mirror *dmirror;
+ struct dummy_event *devent;
+ unsigned long addr = start;
+ int ret = 0, i = 0;
+
+ dmirror = container_of(mirror, struct dummy_mirror, mirror);
+ devent = container_of(event, struct dummy_event, hevent);
+ ddevice = dmirror->ddevice;
+
+ hmm_pt_iter_init(&diter, &dmirror->pt);
+ hmm_pt_iter_init(&miter, &mirror->pt);
+
+ do {
+ struct page *spage, *dpage;
+ unsigned long dpfn, next = end;
+ dma_addr_t *mpte, *dpte;
+
+ mpte = hmm_pt_iter_lookup(&miter, addr, &next);
+ if (!mpte || !hmm_pte_test_valid_dev(mpte) ||
+ !hmm_pte_test_select(&dst[i])) {
+ i++;
+ continue;
+ }
+
+ dpte = hmm_pt_iter_lookup(&diter, addr, &next);
+ /*
+ * Sanity check, that that device driver page table is a valid
+ * entry pointing to device memory.
+ */
+ if (!dpte || !hmm_pte_test_valid_dev(dpte) ||
+ !hmm_pte_test_select(&dst[i])) {
+ ret = -EINVAL;
+ break;
+ }
+
+ dpfn = hmm_pte_dev_addr(*mpte) >> PAGE_SHIFT;
+ spage = dummy_device_pfn_to_page(ddevice, dpfn);
+ dpage = pfn_to_page(hmm_pte_pfn(dst[i]));
+ copy_highpage(dpage, spage);
+
+ /* Directly free the fake device memory. */
+ clear_bit(dpfn, ddevice->rmem_bitmap);
+
+ if (hmm_pte_test_and_clear_dirty(dpte))
+ hmm_pte_set_dirty(&dst[i]);
+
+ /*
+ * This is bit inefficient to lock directoy per entry instead
+ * of locking directory and going over all its entry. But this
+ * is a dummy driver and we do not care about efficiency here.
+ */
+ hmm_pt_iter_directory_lock(&diter);
+ *dpte = dst[i];
+ hmm_pte_clear_dirty(dpte);
+ hmm_pt_iter_directory_unlock(&diter);
+
+ i++;
+ } while (addr += PAGE_SIZE, addr < end);
+
+ hmm_pt_iter_fini(&miter);
+ hmm_pt_iter_fini(&diter);
+
+ return ret;
+}
+
+static int dummy_copy_to_device(struct hmm_mirror *mirror,
+ const struct hmm_event *event,
+ struct vm_area_struct *vma,
+ dma_addr_t *dst,
+ unsigned long start,
+ unsigned long end)
+{
+ struct hmm_pt_iter miter, diter;
+ struct dummy_device *ddevice;
+ struct dummy_mirror *dmirror;
+ struct dummy_event *devent;
+ unsigned long addr = start;
+ int ret = 0, i = 0;
+
+ dmirror = container_of(mirror, struct dummy_mirror, mirror);
+ devent = container_of(event, struct dummy_event, hevent);
+ ddevice = dmirror->ddevice;
+
+ hmm_pt_iter_init(&diter, &dmirror->pt);
+ hmm_pt_iter_init(&miter, &mirror->pt);
+
+ do {
+ struct page *spage, *dpage;
+ dma_addr_t *mpte, *dpte;
+ unsigned long next = end;
+
+ mpte = hmm_pt_iter_lookup(&miter, addr, &next);
+ /*
+ * Sanity check, this is only important for debugging HMM, a
+ * device driver can ignore those test and assume everything
+ * below is false (ie mpte is not NULL and it is a valid pfn
+ * entry with the select bit set).
+ */
+ if (!mpte || !hmm_pte_test_valid_pfn(mpte) ||
+ !hmm_pte_test_select(mpte)) {
+ pr_debug("(%s:%4d) (HMM FATAL) empty pt at 0x%lX\n",
+ __FILE__, __LINE__, addr);
+ ret = -EINVAL;
+ break;
+ }
+
+ dpte = hmm_pt_iter_populate(&diter, addr, &next);
+ if (!dpte) {
+ ret = -ENOMEM;
+ break;
+ }
+ /*
+ * Sanity check, this is only important for debugging HMM, a
+ * device driver can ignore those test and assume everything
+ * below is false (ie dpte is not a valid device entry).
+ */
+ if (hmm_pte_test_valid_dev(dpte)) {
+ pr_debug("(%s:%4d) (DUMMY FATAL) existing device entry %pad at 0x%lX\n",
+ __FILE__, __LINE__, dpte, addr);
+ ret = -EINVAL;
+ break;
+ }
+
+ spage = pfn_to_page(hmm_pte_pfn(*mpte));
+ dpage = dummy_device_pfn_to_page(ddevice, devent->dpfn[i]);
+ dst[i] = hmm_pte_from_dev_addr(devent->dpfn[i] << PAGE_SHIFT);
+ copy_highpage(dpage, spage);
+ devent->dpfn[i] = -1;
+ devent->nfaulted_dev_pages++;
+
+ /*
+ * This is bit inefficient to lock directoy per entry instead
+ * of locking directory and going over all its entry. But this
+ * is a dummy driver and we do not care about efficiency here.
+ */
+ hmm_pt_iter_directory_lock(&diter);
+ if (hmm_pte_test_and_clear_dirty(dpte))
+ hmm_pte_set_dirty(&dst[i]);
+ if (vma->vm_flags & VM_WRITE)
+ hmm_pte_set_write(&dst[i]);
+ /*
+ * Increment ref count of dummy page table directory if the
+ * previous entry was not valid. Note that previous entry
+ * can not be a valid device memory entry.
+ */
+ if (!hmm_pte_test_valid_pfn(dpte))
+ hmm_pt_iter_directory_ref(&diter);
+ *dpte = dst[i];
+ hmm_pt_iter_directory_unlock(&diter);
+
+ } while (i++, addr += PAGE_SIZE, addr < end);
+
+ hmm_pt_iter_fini(&miter);
+ hmm_pt_iter_fini(&diter);
+
+ return ret;
+}
+
static const struct hmm_device_ops hmm_dummy_ops = {
.release = &dummy_mirror_release,
.free = &dummy_mirror_free,
.update = &dummy_mirror_update,
+ .copy_from_device = &dummy_copy_from_device,
+ .copy_to_device = &dummy_copy_to_device,
};


@@ -443,6 +655,7 @@ static int dummy_read(struct dummy_mirror *dmirror,
char __user *buf,
size_t size)
{
+ struct dummy_device *ddevice = dmirror->ddevice;
struct hmm_event *event = &devent->hevent;
long r = 0;

@@ -483,14 +696,21 @@ static int dummy_read(struct dummy_mirror *dmirror,
* coherent value for each page table entry.
*/
dpte = ACCESS_ONCE(*dptep);
- if (!hmm_pte_test_valid_pfn(&dpte)) {
+
+ if (hmm_pte_test_valid_dev(&dpte)) {
+ dma_addr_t dpfn;
+
+ dpfn = hmm_pte_dev_addr(dpte) >> PAGE_SHIFT;
+ page = dummy_device_pfn_to_page(ddevice, dpfn);
+ devent->ndev_pages++;
+ } else if (hmm_pte_test_valid_pfn(&dpte)) {
+ page = pfn_to_page(hmm_pte_pfn(dpte));
+ devent->nsys_pages++;
+ } else {
dummy_mirror_access_stop(dmirror, devent);
break;
}

- devent->nsys_pages++;
-
- page = pfn_to_page(hmm_pte_pfn(dpte));
ptr = kmap(page);
r = copy_to_user(buf, ptr + offset, count);

@@ -515,6 +735,7 @@ static int dummy_write(struct dummy_mirror *dmirror,
char __user *buf,
size_t size)
{
+ struct dummy_device *ddevice = dmirror->ddevice;
struct hmm_event *event = &devent->hevent;
long r = 0;

@@ -555,15 +776,25 @@ static int dummy_write(struct dummy_mirror *dmirror,
* coherent value for each page table entry.
*/
dpte = ACCESS_ONCE(*dptep);
- if (!hmm_pte_test_valid_pfn(&dpte) ||
- !hmm_pte_test_write(&dpte)) {
+ if (!hmm_pte_test_write(&dpte)) {
+ dummy_mirror_access_stop(dmirror, devent);
+ break;
+ }
+
+ if (hmm_pte_test_valid_dev(&dpte)) {
+ dma_addr_t dpfn;
+
+ dpfn = hmm_pte_dev_addr(dpte) >> PAGE_SHIFT;
+ page = dummy_device_pfn_to_page(ddevice, dpfn);
+ devent->ndev_pages++;
+ } else if (hmm_pte_test_valid_pfn(&dpte)) {
+ page = pfn_to_page(hmm_pte_pfn(dpte));
+ devent->nsys_pages++;
+ } else {
dummy_mirror_access_stop(dmirror, devent);
break;
}

- devent->nsys_pages++;
-
- page = pfn_to_page(hmm_pte_pfn(dpte));
ptr = kmap(page);
r = copy_from_user(ptr + offset, buf, count);

@@ -583,6 +814,58 @@ static int dummy_write(struct dummy_mirror *dmirror,
return r;
}

+static int dummy_lmem_to_rmem(struct dummy_mirror *dmirror,
+ struct dummy_event *devent)
+{
+ struct dummy_device *ddevice = dmirror->ddevice;
+ struct hmm_mirror *mirror = &dmirror->mirror;
+ int i, ret;
+
+ devent->hevent.start = PAGE_MASK & devent->hevent.start;
+ devent->hevent.end = PAGE_ALIGN(devent->hevent.end);
+ devent->hevent.etype = HMM_COPY_TO_DEVICE;
+
+ /* Simple bitmap allocator for fake device memory. */
+ devent->dpfn = kcalloc(devent->npages, sizeof(unsigned), GFP_KERNEL);
+ if (devent->dpfn == NULL) {
+ return -ENOMEM;
+ }
+
+ /*
+ * Pre-allocate device memory. Device driver is free to pre-allocate
+ * memory or to allocate it inside the copy callback.
+ */
+ mutex_lock(&ddevice->mutex);
+ for (i = 0; i < devent->npages; ++i) {
+ int idx;
+
+ idx = find_first_zero_bit(ddevice->rmem_bitmap,
+ HMM_DUMMY_RMEM_NBITS);
+ if (idx < 0) {
+ while ((--i) > 0) {
+ idx = devent->dpfn[i];
+ clear_bit(idx, ddevice->rmem_bitmap);
+ }
+ mutex_unlock(&ddevice->mutex);
+ kfree(devent->dpfn);
+ return -ENOMEM;
+ }
+ devent->dpfn[i] = idx;
+ set_bit(idx, ddevice->rmem_bitmap);
+ }
+ mutex_unlock(&ddevice->mutex);
+
+ ret = hmm_mirror_fault(mirror, &devent->hevent);
+ for (i = 0; i < devent->npages; ++i) {
+ if (devent->dpfn[i] == -1U)
+ continue;
+ clear_bit(devent->dpfn[i], ddevice->rmem_bitmap);
+ }
+ kfree(devent->dpfn);
+
+ return ret;
+}
+

/*
* Below are the vm operation for the dummy device file. Sadly we can not allow
@@ -695,11 +978,26 @@ static int dummy_fops_release(struct inode *inode, struct file *filp)
return 0;
}

+struct dummy_ioctlp {
+ uint64_t address;
+ uint64_t size;
+};
+
+static void dummy_event_init(struct dummy_event *devent,
+ const struct dummy_ioctlp *ioctlp)
+{
+ memset(devent, 0, sizeof(*devent));
+ devent->hevent.start = ioctlp->address;
+ devent->hevent.end = ioctlp->address + ioctlp->size;
+ devent->npages = PAGE_ALIGN(ioctlp->size) >> PAGE_SHIFT;
+}
+
static long dummy_fops_unlocked_ioctl(struct file *filp,
unsigned int command,
unsigned long arg)
{
void __user *uarg = (void __user *)arg;
+ struct hmm_dummy_migrate dmigrate;
struct dummy_device *ddevice;
struct dummy_mirror *dmirror;
struct hmm_dummy_write dwrite;
@@ -765,15 +1063,15 @@ static long dummy_fops_unlocked_ioctl(struct file *filp,
return -EFAULT;
}

- memset(&devent, 0, sizeof(devent));
- devent.hevent.start = dread.address;
- devent.hevent.end = dread.address + dread.size;
+ dummy_event_init(&devent, (struct dummy_ioctlp*)&dread);
ret = dummy_read(dmirror, &devent,
(void __user *)dread.ptr,
dread.size);

dread.nsys_pages = devent.nsys_pages;
dread.nfaulted_sys_pages = devent.nfaulted_sys_pages;
+ dread.ndev_pages = devent.ndev_pages;
+ dread.nfaulted_dev_pages = devent.nfaulted_dev_pages;
if (copy_to_user(uarg, &dread, sizeof(dread))) {
dummy_mirror_worker_thread_stop(dmirror);
return -EFAULT;
@@ -787,15 +1085,15 @@ static long dummy_fops_unlocked_ioctl(struct file *filp,
return -EFAULT;
}

- memset(&devent, 0, sizeof(devent));
- devent.hevent.start = dwrite.address;
- devent.hevent.end = dwrite.address + dwrite.size;
+ dummy_event_init(&devent, (struct dummy_ioctlp*)&dwrite);
ret = dummy_write(dmirror, &devent,
(void __user *)dwrite.ptr,
dwrite.size);

dwrite.nsys_pages = devent.nsys_pages;
dwrite.nfaulted_sys_pages = devent.nfaulted_sys_pages;
+ dwrite.ndev_pages = devent.ndev_pages;
+ dwrite.nfaulted_dev_pages = devent.nfaulted_dev_pages;
if (copy_to_user(uarg, &dwrite, sizeof(dwrite))) {
dummy_mirror_worker_thread_stop(dmirror);
return -EFAULT;
@@ -803,6 +1101,23 @@ static long dummy_fops_unlocked_ioctl(struct file *filp,

dummy_mirror_worker_thread_stop(dmirror);
return ret;
+ case HMM_DUMMY_MIGRATE_TO:
+ if (copy_from_user(&dmigrate, uarg, sizeof(dmigrate))) {
+ dummy_mirror_worker_thread_stop(dmirror);
+ return -EFAULT;
+ }
+
+ dummy_event_init(&devent, (struct dummy_ioctlp*)&dmigrate);
+ ret = dummy_lmem_to_rmem(dmirror, &devent);
+
+ dmigrate.nfaulted_dev_pages = devent.nfaulted_dev_pages;
+ if (copy_to_user(uarg, &dmigrate, sizeof(dmigrate))) {
+ dummy_mirror_worker_thread_stop(dmirror);
+ return -EFAULT;
+ }
+
+ dummy_mirror_worker_thread_stop(dmirror);
+ return ret;
default:
return -EINVAL;
}
@@ -826,20 +1141,44 @@ static const struct file_operations hmm_dummy_fops = {
*/
static int dummy_device_init(struct dummy_device *ddevice)
{
- int ret, i;
+ struct page **pages;
+ unsigned long *bitmap;
+ int ret, i, npages;
+
+ npages = HMM_DUMMY_RMEM_SIZE >> PAGE_SHIFT;
+ bitmap = kzalloc(BITS_TO_LONGS(npages) * sizeof(long), GFP_KERNEL);
+ if (!bitmap) {
+ return -ENOMEM;
+ }
+ pages = kzalloc(npages * sizeof(void*), GFP_KERNEL);
+ if (!pages) {
+ kfree(bitmap);
+ return -ENOMEM;
+ }
+ for (i = 0; i < npages; ++i) {
+ pages[i] = alloc_page(GFP_KERNEL);
+ if (!pages[i]) {
+ while ((--i)) {
+ __free_page(pages[i]);
+ }
+ kfree(bitmap);
+ kfree(pages);
+ return -ENOMEM;
+ }
+ }

ret = alloc_chrdev_region(&ddevice->dev, 0,
HMM_DUMMY_MAX_DEVICES,
ddevice->name);
if (ret < 0)
- return ret;
+ goto error;
ddevice->major = MAJOR(ddevice->dev);

cdev_init(&ddevice->cdevice, &hmm_dummy_fops);
ret = cdev_add(&ddevice->cdevice, ddevice->dev, HMM_DUMMY_MAX_MIRRORS);
if (ret) {
unregister_chrdev_region(ddevice->dev, HMM_DUMMY_MAX_MIRRORS);
- return ret;
+ goto error;
}

/* Register the hmm device. */
@@ -853,14 +1192,25 @@ static int dummy_device_init(struct dummy_device *ddevice)
if (ret) {
cdev_del(&ddevice->cdevice);
unregister_chrdev_region(ddevice->dev, HMM_DUMMY_MAX_MIRRORS);
+ goto error;
}
+ ddevice->rmem_bitmap = bitmap;
+ ddevice->rmem_pages = pages;
+ return 0;
+
+error:
+ for (i = 0; i < npages; ++i) {
+ __free_page(pages[i]);
+ }
+ kfree(bitmap);
+ kfree(pages);
return ret;
}

static void dummy_device_fini(struct dummy_device *ddevice)
{
struct dummy_mirror *dmirror;
- unsigned i;
+ unsigned i, npages;

/* First unregister all mirror. */
do {
@@ -880,6 +1230,13 @@ static void dummy_device_fini(struct dummy_device *ddevice)

cdev_del(&ddevice->cdevice);
unregister_chrdev_region(ddevice->dev, HMM_DUMMY_MAX_MIRRORS);
+
+ npages = HMM_DUMMY_RMEM_SIZE >> PAGE_SHIFT;
+ for (i = 0; i < npages; ++i) {
+ __free_page(ddevice->rmem_pages[i]);
+ }
+ kfree(ddevice->rmem_bitmap);
+ kfree(ddevice->rmem_pages);
}

static int __init hmm_dummy_init(void)
diff --git a/include/uapi/linux/hmm_dummy.h b/include/uapi/linux/hmm_dummy.h
index 3af71d4..a98b03d 100644
--- a/include/uapi/linux/hmm_dummy.h
+++ b/include/uapi/linux/hmm_dummy.h
@@ -31,7 +31,9 @@ struct hmm_dummy_read {
uint64_t ptr;
uint64_t nsys_pages;
uint64_t nfaulted_sys_pages;
- uint64_t reserved[11];
+ uint64_t ndev_pages;
+ uint64_t nfaulted_dev_pages;
+ uint64_t reserved[9];
};

struct hmm_dummy_write {
@@ -40,12 +42,23 @@ struct hmm_dummy_write {
uint64_t ptr;
uint64_t nsys_pages;
uint64_t nfaulted_sys_pages;
- uint64_t reserved[11];
+ uint64_t ndev_pages;
+ uint64_t nfaulted_dev_pages;
+ uint64_t reserved[9];
+};
+
+struct hmm_dummy_migrate {
+ uint64_t address;
+ uint64_t size;
+ uint64_t nfaulted_sys_pages;
+ uint64_t nfaulted_dev_pages;
+ uint64_t reserved[12];
};

/* Expose the address space of the calling process through hmm dummy dev file */
#define HMM_DUMMY_EXPOSE_MM _IO('H', 0x00)
#define HMM_DUMMY_READ _IOWR('H', 0x01, struct hmm_dummy_read)
#define HMM_DUMMY_WRITE _IOWR('H', 0x02, struct hmm_dummy_write)
+#define HMM_DUMMY_MIGRATE_TO _IOWR('H', 0x03, struct hmm_dummy_migrate)

#endif /* _UAPI_LINUX_HMM_DUMMY_H */
--
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/