At the moment registering an MR breaks COW. This breaks memory
overcommit for users such as KVM: we have a lot of COW pages, e.g.
instances of the zero page or pages shared using KSM.
If the application does not care that adapter sees stale data (for
example, it tracks writes reregisters and resends), it can use a new
IBV_ACCESS_GIFT flag to prevent registration from breaking COW.
The semantics are similar to that of SPLICE_F_GIFT thus the name.
Signed-off-by: Michael S. Tsirkin <mst@xxxxxxxxxx>
---
Please review and consider for 3.10.
Changes from v1:
rename APP_READONLY to _GIFT: similar to vmsplice's F_GIFT.
drivers/infiniband/core/umem.c | 21 ++++++++++++---------
include/rdma/ib_verbs.h | 9 ++++++++-
2 files changed, 20 insertions(+), 10 deletions(-)
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
index a841123..5dee86d 100644
--- a/drivers/infiniband/core/umem.c
+++ b/drivers/infiniband/core/umem.c
@@ -89,6 +89,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
int ret;
int off;
int i;
+ bool gift, writable;
DEFINE_DMA_ATTRS(attrs);
if (dmasync)
@@ -96,6 +97,15 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
if (!can_do_mlock())
return ERR_PTR(-EPERM);
+ /*
+ * We ask for writable memory if any access flags other than
+ * "remote read" or "gift" are set. "Local write" and "remote write"
+ * obviously require write access. "Remote atomic" can do
+ * things like fetch and add, which will modify memory, and
+ * "MW bind" can change permissions by binding a window.
+ */
+ gift = access & IB_ACCESS_GIFT;
+ writable = access & ~(IB_ACCESS_REMOTE_READ | IB_ACCESS_GIFT);
umem = kmalloc(sizeof *umem, GFP_KERNEL);
if (!umem)
@@ -105,14 +115,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
umem->length = size;
umem->offset = addr & ~PAGE_MASK;
umem->page_size = PAGE_SIZE;
- /*
- * We ask for writable memory if any access flags other than
- * "remote read" are set. "Local write" and "remote write"
- * obviously require write access. "Remote atomic" can do
- * things like fetch and add, which will modify memory, and
- * "MW bind" can change permissions by binding a window.
- */
- umem->writable = !!(access & ~IB_ACCESS_REMOTE_READ);
+ umem->writable = writable;
/* We assume the memory is from hugetlb until proved otherwise */
umem->hugetlb = 1;
@@ -152,7 +155,7 @@ struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
ret = get_user_pages(current, current->mm, cur_base,
min_t(unsigned long, npages,
PAGE_SIZE / sizeof (struct page *)),
- 1, !umem->writable, page_list, vma_list);
+ !gift, !umem->writable, page_list, vma_list);
if (ret < 0)
goto out;
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 98cc4b2..2e6e13c 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -871,7 +871,14 @@ enum ib_access_flags {
IB_ACCESS_REMOTE_READ = (1<<2),
IB_ACCESS_REMOTE_ATOMIC = (1<<3),
IB_ACCESS_MW_BIND = (1<<4),
- IB_ZERO_BASED = (1<<5)
+ IB_ZERO_BASED = (1<<5),
+ /*
+ * IB_ACCESS_GIFT: This memory is a gift to the adapter. If memory is
+ * modified after registration, the local version and data seen by the
+ * adapter through this region rkey may differ.
+ * Only legal with IB_ACCESS_REMOTE_READ or no permissions.
+ */
+ IB_ACCESS_GIFT = (1<<6)
};
struct ib_phys_buf {