Re: [PATCH for-next v5 6/7] RDMA/rxe: Add support for Send/Recv/Write/Read with ODP

From: Bob Pearson
Date: Fri May 19 2023 - 13:10:53 EST


On 5/18/23 03:21, Daisuke Matsuda wrote:
> rxe_mr_copy() is used widely to copy data to/from a user MR. requester uses
> it to load payloads of requesting packets; responder uses it to process
> Send, Write, and Read operations; completer uses it to copy data from
> response packets of Read and Atomic operations to a user MR.
>
> Allow these operations to be used with ODP by adding a subordinate function
> rxe_odp_mr_copy(). It consists of the following steps:
> 1. Check the driver page table (umem_odp->dma_list) to see if the pages being
> accessed are present with the appropriate permission.
> 2. If necessary, trigger a page fault to map the pages.
> 3. Update the MR xarray using PFNs in umem_odp->pfn_list.
> 4. Execute the data copy to/from the pages.
>
> umem_mutex is used to ensure that dma_list (an array of addresses of an MR)
> is not changed while it is being checked and that mapped pages are not
> invalidated before data copy completes.
>
> Signed-off-by: Daisuke Matsuda <matsuda-daisuke@xxxxxxxxxxx>
> ---
> drivers/infiniband/sw/rxe/rxe.c | 10 +++
> drivers/infiniband/sw/rxe/rxe_loc.h | 8 ++
> drivers/infiniband/sw/rxe/rxe_mr.c | 2 +-
> drivers/infiniband/sw/rxe/rxe_odp.c | 109 ++++++++++++++++++++++++++++
> 4 files changed, 128 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/infiniband/sw/rxe/rxe.c b/drivers/infiniband/sw/rxe/rxe.c
> index f2284d27229b..207a022156f0 100644
> --- a/drivers/infiniband/sw/rxe/rxe.c
> +++ b/drivers/infiniband/sw/rxe/rxe.c
> @@ -79,6 +79,16 @@ static void rxe_init_device_param(struct rxe_dev *rxe)
>
> /* IB_ODP_SUPPORT_IMPLICIT is not supported right now. */
> rxe->attr.odp_caps.general_caps |= IB_ODP_SUPPORT;
> +
> + rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SEND;
> + rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_RECV;
> + rxe->attr.odp_caps.per_transport_caps.ud_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
> +
> + rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SEND;
> + rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_RECV;
> + rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_WRITE;
> + rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_READ;
> + rxe->attr.odp_caps.per_transport_caps.rc_odp_caps |= IB_ODP_SUPPORT_SRQ_RECV;
> }
> }
>
> diff --git a/drivers/infiniband/sw/rxe/rxe_loc.h b/drivers/infiniband/sw/rxe/rxe_loc.h
> index 93247d123642..4b95c8c46bdc 100644
> --- a/drivers/infiniband/sw/rxe/rxe_loc.h
> +++ b/drivers/infiniband/sw/rxe/rxe_loc.h
> @@ -206,6 +206,8 @@ static inline unsigned int wr_opcode_mask(int opcode, struct rxe_qp *qp)
> #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
> int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length,
> u64 iova, int access_flags, struct rxe_mr *mr);
> +int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length,
> + enum rxe_mr_copy_dir dir);
> #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
> static inline int
> rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
> @@ -213,6 +215,12 @@ rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length, u64 iova,
> {
> return -EOPNOTSUPP;
> }
> +static inline int
> +rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr,
> + int length, enum rxe_mr_copy_dir dir)
> +{
> + return -EOPNOTSUPP;
> +}
>
> #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
>
> diff --git a/drivers/infiniband/sw/rxe/rxe_mr.c b/drivers/infiniband/sw/rxe/rxe_mr.c
> index cd368cd096c8..0e3cda59d702 100644
> --- a/drivers/infiniband/sw/rxe/rxe_mr.c
> +++ b/drivers/infiniband/sw/rxe/rxe_mr.c
> @@ -319,7 +319,7 @@ int rxe_mr_copy(struct rxe_mr *mr, u64 iova, void *addr,
> }
>
> if (mr->odp_enabled)
> - return -EOPNOTSUPP;
> + return rxe_odp_mr_copy(mr, iova, addr, length, dir);
> else
> return rxe_mr_copy_xarray(mr, iova, addr, length, dir);
> }
> diff --git a/drivers/infiniband/sw/rxe/rxe_odp.c b/drivers/infiniband/sw/rxe/rxe_odp.c
> index e5497d09c399..cbe5d0c3fcc4 100644
> --- a/drivers/infiniband/sw/rxe/rxe_odp.c
> +++ b/drivers/infiniband/sw/rxe/rxe_odp.c
> @@ -174,3 +174,112 @@ int rxe_odp_mr_init_user(struct rxe_dev *rxe, u64 start, u64 length,
>
> return err;
> }
> +
> +static inline bool rxe_is_pagefault_neccesary(struct ib_umem_odp *umem_odp,
> + u64 iova, int length, u32 perm)
> +{
> + int idx;
> + u64 addr;
> + bool need_fault = false;
> +
> + addr = iova & (~(BIT(umem_odp->page_shift) - 1));
> +
> + /* Skim through all pages that are to be accessed. */
> + while (addr < iova + length) {
> + idx = (addr - ib_umem_start(umem_odp)) >> umem_odp->page_shift;
> +
> + if (!(umem_odp->dma_list[idx] & perm)) {
> + need_fault = true;
> + break;
> + }
> +
> + addr += BIT(umem_odp->page_shift);
> + }
> + return need_fault;
> +}
> +
> +/* umem mutex must be locked before entering this function. */
> +static int rxe_odp_map_range(struct rxe_mr *mr, u64 iova, int length, u32 flags)
> +{
> + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
> + const int max_tries = 3;
> + int cnt = 0;
> +
> + int err;
> + u64 perm;
> + bool need_fault;
> +
> + if (unlikely(length < 1)) {
> + mutex_unlock(&umem_odp->umem_mutex);
> + return -EINVAL;
> + }
> +
> + perm = ODP_READ_ALLOWED_BIT;
> + if (!(flags & RXE_PAGEFAULT_RDONLY))
> + perm |= ODP_WRITE_ALLOWED_BIT;
> +
> + /*
> + * A successful return from rxe_odp_do_pagefault() does not guarantee
> + * that all pages in the range became present. Recheck the DMA address
> + * array, allowing max 3 tries for pagefault.
> + */
> + while ((need_fault = rxe_is_pagefault_neccesary(umem_odp,
> + iova, length, perm))) {
> + if (cnt >= max_tries)
> + break;
> +
> + mutex_unlock(&umem_odp->umem_mutex);
> +
> + /* umem_mutex is locked on success. */
> + err = rxe_odp_do_pagefault(mr, iova, length, flags);
> + if (err < 0)
> + return err;
> +
> + cnt++;
> + }
> +
> + if (need_fault)
> + return -EFAULT;
> +
> + return 0;
> +}
> +
> +int rxe_odp_mr_copy(struct rxe_mr *mr, u64 iova, void *addr, int length,
> + enum rxe_mr_copy_dir dir)
> +{
> + struct ib_umem_odp *umem_odp = to_ib_umem_odp(mr->umem);
> + u32 flags = 0;
> + int err;
> +
> + if (unlikely(!mr->odp_enabled))
> + return -EOPNOTSUPP;
> +
> + switch (dir) {
> + case RXE_TO_MR_OBJ:
> + break;
> +
> + case RXE_FROM_MR_OBJ:
> + flags = RXE_PAGEFAULT_RDONLY;
> + break;
> +
> + default:
> + return -EINVAL;
> + }
> +
> + /* If pagefault is not required, umem mutex will be held until data
> + * copy to the MR completes. Otherwise, it is released and locked
> + * again in rxe_odp_map_range() to let invalidation handler do its
> + * work meanwhile.
> + */
> + mutex_lock(&umem_odp->umem_mutex);
> +
> + err = rxe_odp_map_range(mr, iova, length, flags);
> + if (err)
> + return err;
> +
> + err = rxe_mr_copy_xarray(mr, iova, addr, length, dir);
> +
> + mutex_unlock(&umem_odp->umem_mutex);
> +
> + return err;
> +}

Reviewed-by: Bob Pearson <rpearsonhpe@xxxxxxxxx>