[PATCH for-next] RDMA/hns: Support flexible WQE buffer page size

From: Junxian Huang
Date: Tue Apr 30 2024 - 05:33:44 EST


From: Chengchang Tang <tangchengchang@xxxxxxxxxx>

Currently, the driver always allocates 4K pages for the userspace WQE
buffer, so HW reads WQEs with a granularity of 4K even on a 64K-page
system. HW has to switch pages every 4K, leading to a loss of
performance.

To improve performance, add support for userspace to choose a WQE
buffer page size between 4K and the system PAGE_SIZE.

For old userspace drivers that do not support this feature, the
kernel driver keeps using a fixed 4K page size.

Signed-off-by: Chengchang Tang <tangchengchang@xxxxxxxxxx>
Signed-off-by: Junxian Huang <huangjunxian6@xxxxxxxxxxxxx>
---
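Note: below is a minimal sketch of the intended userspace-side usage.
The helper choose_wqe_page_shift() and the variables wqe_buf_size and
sys_page_shift are made up for illustration; the flags and ABI fields
are the ones added by this patch. The provider first negotiates the
feature at ucontext allocation, then passes its chosen page shift in
the create_qp command:

	/* 1. Request the feature when allocating the ucontext. */
	struct hns_roce_ib_alloc_ucontext cmd = {};
	struct hns_roce_ib_alloc_ucontext_resp resp = {};

	cmd.config |= HNS_ROCE_UCTX_DYN_QP_PGSZ_FLAGS;
	/*
	 * ... issue the command; use the feature only if the kernel
	 * echoes HNS_ROCE_RSP_UCTX_DYN_QP_PGSZ_FLAGS in resp.config.
	 */

	/*
	 * 2. One possible policy: pick the largest page shift in
	 * [4K, system page size] whose page the WQE buffer can still
	 * fill. set_wqe_buf_attr() rejects shifts outside this range
	 * with -EOPNOTSUPP.
	 */
	static __u8 choose_wqe_page_shift(size_t wqe_buf_size,
					  unsigned int sys_page_shift)
	{
		unsigned int shift = 12;	/* HNS_HW_PAGE_SHIFT, 4K */

		while (shift < sys_page_shift &&
		       (1UL << (shift + 1)) <= wqe_buf_size)
			shift++;

		return shift;
	}

	/* sys_page_shift = log2(sysconf(_SC_PAGESIZE)) */
	struct hns_roce_ib_create_qp ucmd = {};

	ucmd.pageshift = choose_wqe_page_shift(wqe_buf_size, sys_page_shift);

Old providers never set HNS_ROCE_UCTX_DYN_QP_PGSZ_FLAGS, and the new
pageshift byte was part of their zero-initialized reserved[] array, so
the kernel falls back to the fixed 4K page size for them.
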
 drivers/infiniband/hw/hns/hns_roce_main.c |  5 ++++
 drivers/infiniband/hw/hns/hns_roce_qp.c   | 32 ++++++++++++++---------
 include/uapi/rdma/hns-abi.h               |  5 +++-
 3 files changed, 29 insertions(+), 13 deletions(-)
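
As an illustration of the new sizing in set_wqe_buf_attr() (a sketch
that mirrors the semantics of the kernel's ALIGN() macro, not new
driver code): each WQE region is now rounded up to the chosen page
size rather than always to the 4K HW page, trading a little memory
for fewer HW page switches.

	#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

	/* e.g. an SQ with 512 WQEs of 64 bytes each */
	unsigned int sq_bytes = 512 << 6;	/* 32K */

	/*
	 * page_shift = 12 (4K):  ALIGN(32K, 4K)  = 32K, i.e. eight
	 *                        4K pages for HW to walk.
	 * page_shift = 16 (64K): ALIGN(32K, 64K) = 64K, a single
	 *                        page, so no mid-SQ page switch.
	 */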

diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c b/drivers/infiniband/hw/hns/hns_roce_main.c
index 4cb0af733587..19b13c79b67b 100644
--- a/drivers/infiniband/hw/hns/hns_roce_main.c
+++ b/drivers/infiniband/hw/hns/hns_roce_main.c
@@ -405,6 +405,11 @@ static int hns_roce_alloc_ucontext(struct ib_ucontext *uctx,
 	if (hr_dev->pci_dev->revision >= PCI_REVISION_ID_HIP09)
 		resp.congest_type = hr_dev->caps.cong_cap;
 
+	if (ucmd.config & HNS_ROCE_UCTX_DYN_QP_PGSZ_FLAGS) {
+		context->config |= HNS_ROCE_UCTX_DYN_QP_PGSZ_FLAGS;
+		resp.config |= HNS_ROCE_RSP_UCTX_DYN_QP_PGSZ_FLAGS;
+	}
+
 	ret = hns_roce_uar_alloc(hr_dev, &context->uar);
 	if (ret)
 		goto error_out;
diff --git a/drivers/infiniband/hw/hns/hns_roce_qp.c b/drivers/infiniband/hw/hns/hns_roce_qp.c
index db34665d1dfb..df8aba6a7840 100644
--- a/drivers/infiniband/hw/hns/hns_roce_qp.c
+++ b/drivers/infiniband/hw/hns/hns_roce_qp.c
@@ -643,18 +643,21 @@ static int set_user_sq_size(struct hns_roce_dev *hr_dev,
 }
 
 static int set_wqe_buf_attr(struct hns_roce_dev *hr_dev,
-			    struct hns_roce_qp *hr_qp,
+			    struct hns_roce_qp *hr_qp, u8 page_shift,
 			    struct hns_roce_buf_attr *buf_attr)
 {
+	unsigned int page_size = BIT(page_shift);
 	int buf_size;
 	int idx = 0;
 
 	hr_qp->buff_size = 0;
 
+	if (page_shift > PAGE_SHIFT || page_shift < HNS_HW_PAGE_SHIFT)
+		return -EOPNOTSUPP;
+
 	/* SQ WQE */
 	hr_qp->sq.offset = 0;
-	buf_size = to_hr_hem_entries_size(hr_qp->sq.wqe_cnt,
-					  hr_qp->sq.wqe_shift);
+	buf_size = ALIGN(hr_qp->sq.wqe_cnt << hr_qp->sq.wqe_shift, page_size);
 	if (buf_size > 0 && idx < ARRAY_SIZE(buf_attr->region)) {
 		buf_attr->region[idx].size = buf_size;
 		buf_attr->region[idx].hopnum = hr_dev->caps.wqe_sq_hop_num;
@@ -664,8 +667,7 @@ static int set_wqe_buf_attr(struct hns_roce_dev *hr_dev,
 
 	/* extend SGE WQE in SQ */
 	hr_qp->sge.offset = hr_qp->buff_size;
-	buf_size = to_hr_hem_entries_size(hr_qp->sge.sge_cnt,
-					  hr_qp->sge.sge_shift);
+	buf_size = ALIGN(hr_qp->sge.sge_cnt << hr_qp->sge.sge_shift, page_size);
 	if (buf_size > 0 && idx < ARRAY_SIZE(buf_attr->region)) {
 		buf_attr->region[idx].size = buf_size;
 		buf_attr->region[idx].hopnum = hr_dev->caps.wqe_sge_hop_num;
@@ -675,8 +677,7 @@ static int set_wqe_buf_attr(struct hns_roce_dev *hr_dev,
 
 	/* RQ WQE */
 	hr_qp->rq.offset = hr_qp->buff_size;
-	buf_size = to_hr_hem_entries_size(hr_qp->rq.wqe_cnt,
-					  hr_qp->rq.wqe_shift);
+	buf_size = ALIGN(hr_qp->rq.wqe_cnt << hr_qp->rq.wqe_shift, page_size);
 	if (buf_size > 0 && idx < ARRAY_SIZE(buf_attr->region)) {
 		buf_attr->region[idx].size = buf_size;
 		buf_attr->region[idx].hopnum = hr_dev->caps.wqe_rq_hop_num;
@@ -687,8 +688,8 @@ static int set_wqe_buf_attr(struct hns_roce_dev *hr_dev,
 	if (hr_qp->buff_size < 1)
 		return -EINVAL;
 
-	buf_attr->page_shift = HNS_HW_PAGE_SHIFT + hr_dev->caps.mtt_buf_pg_sz;
 	buf_attr->region_count = idx;
+	buf_attr->page_shift = page_shift;
 
 	return 0;
 }
@@ -744,20 +745,27 @@ static int hns_roce_qp_has_rq(struct ib_qp_init_attr *attr)
 
 static int alloc_qp_buf(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp,
 			struct ib_qp_init_attr *init_attr,
-			struct ib_udata *udata, unsigned long addr)
+			struct ib_udata *udata,
+			struct hns_roce_ib_create_qp *ucmd)
 {
+	struct hns_roce_ucontext *uctx = rdma_udata_to_drv_context(udata,
+					struct hns_roce_ucontext, ibucontext);
 	struct ib_device *ibdev = &hr_dev->ib_dev;
 	struct hns_roce_buf_attr buf_attr = {};
+	u8 page_shift = HNS_HW_PAGE_SHIFT;
 	int ret;
 
-	ret = set_wqe_buf_attr(hr_dev, hr_qp, &buf_attr);
+	if (uctx && (uctx->config & HNS_ROCE_UCTX_DYN_QP_PGSZ_FLAGS))
+		page_shift = ucmd->pageshift;
+
+	ret = set_wqe_buf_attr(hr_dev, hr_qp, page_shift, &buf_attr);
 	if (ret) {
 		ibdev_err(ibdev, "failed to split WQE buf, ret = %d.\n", ret);
 		goto err_inline;
 	}
 	ret = hns_roce_mtr_create(hr_dev, &hr_qp->mtr, &buf_attr,
 				  PAGE_SHIFT + hr_dev->caps.mtt_ba_pg_sz,
-				  udata, addr);
+				  udata, ucmd->buf_addr);
 	if (ret) {
 		ibdev_err(ibdev, "failed to create WQE mtr, ret = %d.\n", ret);
 		goto err_inline;
@@ -1152,7 +1160,7 @@ static int hns_roce_create_qp_common(struct hns_roce_dev *hr_dev,
 		}
 	}
 
-	ret = alloc_qp_buf(hr_dev, hr_qp, init_attr, udata, ucmd.buf_addr);
+	ret = alloc_qp_buf(hr_dev, hr_qp, init_attr, udata, &ucmd);
 	if (ret) {
 		ibdev_err(ibdev, "failed to alloc QP buffer, ret = %d.\n", ret);
 		goto err_buf;
diff --git a/include/uapi/rdma/hns-abi.h b/include/uapi/rdma/hns-abi.h
index 94e861870e27..c5211b8dbf91 100644
--- a/include/uapi/rdma/hns-abi.h
+++ b/include/uapi/rdma/hns-abi.h
@@ -90,7 +90,8 @@ struct hns_roce_ib_create_qp {
 	__u8    log_sq_bb_count;
 	__u8    log_sq_stride;
 	__u8    sq_no_prefetch;
-	__u8    reserved[5];
+	__u8    pageshift;
+	__u8    reserved[4];
 	__aligned_u64 sdb_addr;
 	__aligned_u64 comp_mask; /* Use enum hns_roce_create_qp_comp_mask */
 	__aligned_u64 create_flags;
@@ -119,12 +120,14 @@ enum {
 	HNS_ROCE_EXSGE_FLAGS = 1 << 0,
 	HNS_ROCE_RQ_INLINE_FLAGS = 1 << 1,
 	HNS_ROCE_CQE_INLINE_FLAGS = 1 << 2,
+	HNS_ROCE_UCTX_DYN_QP_PGSZ_FLAGS = 1 << 3,
 };
 
 enum {
 	HNS_ROCE_RSP_EXSGE_FLAGS = 1 << 0,
 	HNS_ROCE_RSP_RQ_INLINE_FLAGS = 1 << 1,
 	HNS_ROCE_RSP_CQE_INLINE_FLAGS = 1 << 2,
+	HNS_ROCE_RSP_UCTX_DYN_QP_PGSZ_FLAGS = 1 << 3,
 };

struct hns_roce_ib_alloc_ucontext_resp {
--
2.30.0