[PATCH 10/15] IB/core: disable memory registration of filesystem-dax vmas

From: Dan Williams
Date: Tue Oct 31 2017 - 19:29:03 EST


Until there is a solution to the dma-to-dax vs truncate problem it is
not safe to allow RDMA to create long-standing memory registrations
against filesystem-dax vmas. Device-dax vmas do not have this problem
and are explicitly allowed.

This restriction is temporary, until a "memory registration with
layout-lease" mechanism can be implemented. It is limited to non-ODP
(On Demand Paging) capable RDMA devices, since ODP-capable devices do
not require long-term page pinning and can re-fault pages after an
invalidation.
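
For illustration, this is roughly what a non-ODP consumer that trips
over the new restriction looks like from userspace. A minimal sketch,
not part of the patch: the mount path is hypothetical, "pd" comes from
the usual libibverbs setup (ibv_open_device()/ibv_alloc_pd()), and the
kernel's -EOPNOTSUPP should surface as errno from ibv_reg_mr():

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <infiniband/verbs.h>

static struct ibv_mr *reg_fsdax_buf(struct ibv_pd *pd, size_t len)
{
	/* map a file living on a hypothetical fsdax mount... */
	int fd = open("/mnt/pmem/data", O_RDWR);
	void *buf;
	struct ibv_mr *mr;

	if (fd < 0)
		return NULL;
	buf = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (buf == MAP_FAILED)
		return NULL;

	/*
	 * ...and try to register it. With this patch a non-ODP device
	 * refuses to pin filesystem-dax pages, so the registration
	 * fails rather than creating a long-standing pin.
	 */
	mr = ibv_reg_mr(pd, buf, len, IBV_ACCESS_LOCAL_WRITE);
	if (!mr)
		perror("ibv_reg_mr"); /* expect EOPNOTSUPP */
	return mr;
}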

Cc: Sean Hefty <sean.hefty@xxxxxxxxx>
Cc: Doug Ledford <dledford@xxxxxxxxxx>
Cc: Hal Rosenstock <hal.rosenstock@xxxxxxxxx>
Cc: Jeff Moyer <jmoyer@xxxxxxxxxx>
Cc: Christoph Hellwig <hch@xxxxxx>
Cc: Ross Zwisler <ross.zwisler@xxxxxxxxxxxxxxx>
Cc: Jason Gunthorpe <jgunthorpe@xxxxxxxxxxxxxxxxxxxx>
Cc: <linux-rdma@xxxxxxxxxxxxxxx>
Cc: <stable@xxxxxxxxxxxxxxx>
Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>
---
drivers/infiniband/core/umem.c | 49 +++++++++++++++++++++++++++++++---------
1 file changed, 38 insertions(+), 11 deletions(-)

diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index 21e60b1e2ff4..c30d286c1f24 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -147,19 +147,21 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
umem->hugetlb = 1;

page_list = (struct page **) __get_free_page(GFP_KERNEL);
- if (!page_list) {
- put_pid(umem->pid);
- kfree(umem);
- return ERR_PTR(-ENOMEM);
- }
+ if (!page_list)
+ goto err_pagelist;

/*
- * if we can't alloc the vma_list, it's not so bad;
- * just assume the memory is not hugetlb memory
+ * If DAX is enabled we need the vma to protect against
+ * registering filesystem-dax memory. Otherwise we can tolerate
+ * a failure to allocate the vma_list and just assume that all
+ * vmas are not hugetlb-vmas.
*/
vma_list = (struct vm_area_struct **) __get_free_page(GFP_KERNEL);
- if (!vma_list)
+ if (!vma_list) {
+ if (IS_ENABLED(CONFIG_FS_DAX))
+ goto err_vmalist;
umem->hugetlb = 0;
+ }

npages = ib_umem_num_pages(umem);

@@ -199,15 +201,34 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
if (ret < 0)
goto out;

- umem->npages += ret;
cur_base += ret * PAGE_SIZE;
npages -= ret;

for_each_sg(sg_list_start, sg, ret, i) {
- if (vma_list && !is_vm_hugetlb_page(vma_list[i]))
- umem->hugetlb = 0;
+ struct vm_area_struct *vma;
+ struct inode *inode;

sg_set_page(sg, page_list[i], PAGE_SIZE, 0);
+ umem->npages++;
+
+ if (!vma_list)
+ continue;
+ vma = vma_list[i];
+
+ if (!is_vm_hugetlb_page(vma))
+ umem->hugetlb = 0;
+
+ if (!vma_is_dax(vma))
+ continue;
+
+ /* device-dax is safe for rdma... */
+ inode = file_inode(vma->vm_file);
+ if (S_ISCHR(inode->i_mode))
+ continue;
+
+ /* ...filesystem-dax is not. */
+ ret = -EOPNOTSUPP;
+ goto out;
}

/* preparing for next loop */
@@ -242,6 +263,12 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
free_page((unsigned long) page_list);

return ret < 0 ? ERR_PTR(ret) : umem;
+err_vmalist:
+ free_page((unsigned long) page_list);
+err_pagelist:
+ put_pid(umem->pid);
+ kfree(umem);
+ return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(ib_umem_get);
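
For reviewers: the device-dax vs filesystem-dax distinction above keys
off the type of the inode backing the vma. A device-dax mapping is
backed by a character-device inode, while a filesystem-dax mapping is
backed by a regular-file inode, hence the S_ISCHR() test (comparing
i_mode directly against S_IFCHR would never match, since i_mode also
carries the permission bits). A standalone sketch of the same check;
no such helper is defined by this patch, it just restates the inline
logic:

/* sketch only: equivalent of the inline fsdax test in ib_umem_get() */
static bool umem_vma_is_fsdax(struct vm_area_struct *vma)
{
	struct inode *inode;

	if (!vma_is_dax(vma))
		return false;
	inode = file_inode(vma->vm_file);
	/* device-dax inodes are character devices; fsdax inodes are not */
	return !S_ISCHR(inode->i_mode);
}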