Re: [Patch v7 12/12] RDMA/mana_ib: Add a driver for Microsoft Azure Network Adapter
From: Yunsheng Lin
Date: Tue Oct 18 2022 - 21:17:16 EST
On 2022/10/18 3:20, longli@xxxxxxxxxxxxxxxxx wrote:
> From: Long Li <longli@xxxxxxxxxxxxx>
>
> Add a RDMA VF driver for Microsoft Azure Network Adapter (MANA).
>
> Co-developed-by: Ajay Sharma <sharmaajay@xxxxxxxxxxxxx>
> Signed-off-by: Ajay Sharma <sharmaajay@xxxxxxxxxxxxx>
> Reviewed-by: Dexuan Cui <decui@xxxxxxxxxxxxx>
> Signed-off-by: Long Li <longli@xxxxxxxxxxxxx>
> ---
> Change log:
> v2:
> Changed coding styles/formats
> Checked undersize for udata length
> Changed all logging to use ibdev_xxx()
> Avoided page array copy when doing MR
> Sorted driver ops
> Fixed warnings reported by kernel test robot <lkp@xxxxxxxxx>
>
> v3:
> More coding style/format changes
>
> v4:
> Process error on hardware vport configuration
>
> v5:
> Change licenses to GPL-2.0-only
> Fix error handling in mana_ib_gd_create_dma_region()
>
> v6:
> rebased to rdma-next
> removed redundant initialization to return value in mana_ib_probe()
> added missing tabs at the end of mana_ib_gd_create_dma_region()
>
> v7:
> move mana_gd_destroy_doorbell_page() and mana_gd_allocate_doorbell_page() from GDMA to this driver
> use ib_umem_find_best_pgsz() for finding page size for registering dma regions with hardware
> fix a bug that may double free mana_ind_table in mana_ib_create_qp_rss()
> add Ajay Sharma <sharmaajay@xxxxxxxxxxxxx> to maintainer list
> add details to description in drivers/infiniband/hw/mana/Kconfig
> change multiple lines comments to use RDMA style from NETDEV style
> change mana_ib_dev_ops to static
> use module_auxiliary_driver() in place of module_init and module_exit
> move all user-triggerable error messages to debug messages
> check for ind_tbl_size overflow in mana_ib_create_qp_rss()
>
> MAINTAINERS | 9 +
> drivers/infiniband/Kconfig | 1 +
> drivers/infiniband/hw/Makefile | 1 +
> drivers/infiniband/hw/mana/Kconfig | 10 +
> drivers/infiniband/hw/mana/Makefile | 4 +
> drivers/infiniband/hw/mana/cq.c | 79 ++++
> drivers/infiniband/hw/mana/device.c | 117 ++++++
> drivers/infiniband/hw/mana/main.c | 508 ++++++++++++++++++++++++
> drivers/infiniband/hw/mana/mana_ib.h | 156 ++++++++
> drivers/infiniband/hw/mana/mr.c | 200 ++++++++++
> drivers/infiniband/hw/mana/qp.c | 505 +++++++++++++++++++++++
> drivers/infiniband/hw/mana/wq.c | 115 ++++++
> include/net/mana/mana.h | 3 +
> include/uapi/rdma/ib_user_ioctl_verbs.h | 1 +
> include/uapi/rdma/mana-abi.h | 66 +++
> 15 files changed, 1775 insertions(+)
> create mode 100644 drivers/infiniband/hw/mana/Kconfig
> create mode 100644 drivers/infiniband/hw/mana/Makefile
> create mode 100644 drivers/infiniband/hw/mana/cq.c
> create mode 100644 drivers/infiniband/hw/mana/device.c
> create mode 100644 drivers/infiniband/hw/mana/main.c
> create mode 100644 drivers/infiniband/hw/mana/mana_ib.h
> create mode 100644 drivers/infiniband/hw/mana/mr.c
> create mode 100644 drivers/infiniband/hw/mana/qp.c
> create mode 100644 drivers/infiniband/hw/mana/wq.c
> create mode 100644 include/uapi/rdma/mana-abi.h
>
[...]
> +
> +int mana_ib_cfg_vport(struct mana_ib_dev *dev, u32 port, struct mana_ib_pd *pd,
> + u32 doorbell_id)
> +{
> + struct gdma_dev *mdev = dev->gdma_dev;
> + struct mana_port_context *mpc;
> + struct mana_context *mc;
> + struct net_device *ndev;
> + int err;
> +
> + mc = mdev->driver_data;
> + ndev = mc->ports[port];
> + mpc = netdev_priv(ndev);
> +
> + mutex_lock(&pd->vport_mutex);
> +
> + pd->vport_use_count++;
> + if (pd->vport_use_count > 1) {
> + ibdev_dbg(&dev->ib_dev,
> + "Skip as this PD is already configured vport\n");
> + mutex_unlock(&pd->vport_mutex);
> + return 0;
> + }
> + mutex_unlock(&pd->vport_mutex);
> +
> + err = mana_cfg_vport(mpc, pd->pdn, doorbell_id);
> + if (err) {
> + mutex_lock(&pd->vport_mutex);
> + pd->vport_use_count--;
> + mutex_unlock(&pd->vport_mutex);
There seems to be a race between the "pd->vport_use_count > 1" check above
and this error path: the mutex is released before mana_cfg_vport() is called,
so a concurrent caller can observe vport_use_count > 1 and return success
(assuming the vport is already configured) while mana_cfg_vport() fails here,
before "pd->vport_use_count--" runs. That caller would then proceed with an
unconfigured vport. Holding the mutex across mana_cfg_vport(), or only
incrementing the count after it succeeds, would close the window.
> +
> + ibdev_dbg(&dev->ib_dev, "Failed to configure vPort %d\n", err);
> + return err;
> + }
> +
> + pd->tx_shortform_allowed = mpc->tx_shortform_allowed;
> + pd->tx_vp_offset = mpc->tx_vp_offset;
> +
> + ibdev_dbg(&dev->ib_dev, "vport handle %llx pdid %x doorbell_id %x\n",
> + mpc->port_handle, pd->pdn, doorbell_id);
> +
> + return 0;
> +}
> +
[...]
> +
> +static int mana_gd_allocate_doorbell_page(struct gdma_context *gc,
> + int *doorbell_page)
> +{
> + struct gdma_allocate_resource_range_req req = {};
> + struct gdma_allocate_resource_range_resp resp = {};
> + int err;
> +
> + mana_gd_init_req_hdr(&req.hdr, GDMA_ALLOCATE_RESOURCE_RANGE,
> + sizeof(req), sizeof(resp));
> +
> + req.resource_type = GDMA_RESOURCE_DOORBELL_PAGE;
> + req.num_resources = 1;
> + req.alignment = 1;
> +
> + /* Have GDMA start searching from 0 */
> + req.allocated_resources = 0;
> +
> + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
> + if (err || resp.hdr.status) {
> + dev_err(gc->dev,
> + "Failed to allocate doorbell page: ret %d, 0x%x\n",
> + err, resp.hdr.status);
> + return err ? err : -EPROTO;
> + }
> +
> + *doorbell_page = resp.allocated_resources;
> +
> + return 0;
> +}
> +
> +int mana_ib_alloc_ucontext(struct ib_ucontext *ibcontext,
> + struct ib_udata *udata)
> +{
> + struct mana_ib_ucontext *ucontext =
> + container_of(ibcontext, struct mana_ib_ucontext, ibucontext);
> + struct ib_device *ibdev = ibcontext->device;
> + struct mana_ib_dev *mdev;
> + struct gdma_context *gc;
> + struct gdma_dev *dev;
> + int doorbell_page;
> + int ret;
> +
> + mdev = container_of(ibdev, struct mana_ib_dev, ib_dev);
> + dev = mdev->gdma_dev;
> + gc = dev->gdma_context;
> +
> + /* Allocate a doorbell page index */
> + ret = mana_gd_allocate_doorbell_page(gc, &doorbell_page);
> + if (ret) {
> + ibdev_dbg(ibdev, "Failed to allocate doorbell page %d\n", ret);
> + return -ENOMEM;
It does not make much sense for mana_gd_allocate_doorbell_page() to
carefully return "err ? err : -EPROTO" when the result is unconditionally
replaced with -ENOMEM here. Either propagate 'ret' to the caller or drop
the distinction in the helper.
> + }
> +
> + ibdev_dbg(ibdev, "Doorbell page allocated %d\n", doorbell_page);
> +
> + ucontext->doorbell = doorbell_page;
> +
> + return 0;
> +}
> +
[...]
> diff --git a/drivers/infiniband/hw/mana/mana_ib.h b/drivers/infiniband/hw/mana/mana_ib.h
> new file mode 100644
> index 000000000000..2225a6d6f8e1
> --- /dev/null
> +++ b/drivers/infiniband/hw/mana/mana_ib.h
> @@ -0,0 +1,156 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (c) 2022 Microsoft Corporation. All rights reserved.
> + */
> +
> +#ifndef _MANA_IB_H_
> +#define _MANA_IB_H_
> +
> +#include <rdma/ib_verbs.h>
> +#include <rdma/ib_mad.h>
> +#include <rdma/ib_umem.h>
> +#include <rdma/mana-abi.h>
> +#include <rdma/uverbs_ioctl.h>
> +
> +#include <net/mana/mana.h>
> +
> +#define PAGE_SZ_BM \
> + (SZ_4K | SZ_8K | SZ_16K | SZ_32K | SZ_64K | SZ_128K | SZ_256K | \
> + SZ_512K | SZ_1M | SZ_2M)
> +
> +/* MANA doesn't have any limit for MR size */
> +#define MANA_IB_MAX_MR_SIZE ((u64)(~(0ULL)))
Consider using U64_MAX instead of open-coding ((u64)(~(0ULL))).
> +
> +struct mana_ib_dev {
> + struct ib_device ib_dev;
> + struct gdma_dev *gdma_dev;
> +};
> +
[...]
> diff --git a/drivers/infiniband/hw/mana/mr.c b/drivers/infiniband/hw/mana/mr.c
> new file mode 100644
> index 000000000000..09124dd1792d
> --- /dev/null
> +++ b/drivers/infiniband/hw/mana/mr.c
> @@ -0,0 +1,200 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Copyright (c) 2022, Microsoft Corporation. All rights reserved.
> + */
> +
> +#include "mana_ib.h"
> +
> +#define VALID_MR_FLAGS \
> + (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ)
> +
> +static enum gdma_mr_access_flags
> +mana_ib_verbs_to_gdma_access_flags(int access_flags)
> +{
> + enum gdma_mr_access_flags flags = GDMA_ACCESS_FLAG_LOCAL_READ;
> +
> + if (access_flags & IB_ACCESS_LOCAL_WRITE)
> + flags |= GDMA_ACCESS_FLAG_LOCAL_WRITE;
> +
> + if (access_flags & IB_ACCESS_REMOTE_WRITE)
> + flags |= GDMA_ACCESS_FLAG_REMOTE_WRITE;
> +
> + if (access_flags & IB_ACCESS_REMOTE_READ)
> + flags |= GDMA_ACCESS_FLAG_REMOTE_READ;
> +
> + return flags;
> +}
> +
> +static int mana_ib_gd_create_mr(struct mana_ib_dev *dev, struct mana_ib_mr *mr,
> + struct gdma_create_mr_params *mr_params)
> +{
> + struct gdma_create_mr_response resp = {};
> + struct gdma_create_mr_request req = {};
> + struct gdma_dev *mdev = dev->gdma_dev;
> + struct gdma_context *gc;
> + int err;
> +
> + gc = mdev->gdma_context;
> +
> + mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_MR, sizeof(req),
> + sizeof(resp));
> + req.pd_handle = mr_params->pd_handle;
> + req.mr_type = mr_params->mr_type;
> +
> + switch (mr_params->mr_type) {
> + case GDMA_MR_TYPE_GVA:
> + req.gva.dma_region_handle = mr_params->gva.dma_region_handle;
> + req.gva.virtual_address = mr_params->gva.virtual_address;
> + req.gva.access_flags = mr_params->gva.access_flags;
> + break;
> +
> + default:
> + ibdev_dbg(&dev->ib_dev,
> + "invalid param (GDMA_MR_TYPE) passed, type %d\n",
> + req.mr_type);
> + err = -EINVAL;
> + goto error;
> + }
> +
> + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
> +
> + if (err || resp.hdr.status) {
> + ibdev_dbg(&dev->ib_dev, "Failed to create mr %d, %u", err,
> + resp.hdr.status);
> + if (!err)
> + err = -EPROTO;
> +
> + goto error;
> + }
> +
> + mr->ibmr.lkey = resp.lkey;
> + mr->ibmr.rkey = resp.rkey;
> + mr->mr_handle = resp.mr_handle;
> +
> + return 0;
> +error:
> + return err;
The "error" label performs no cleanup, so the goto adds nothing here —
just return the error directly at each failure site.
> +}
> +
> +static int mana_ib_gd_destroy_mr(struct mana_ib_dev *dev, gdma_obj_handle_t mr_handle)
> +{
> + struct gdma_destroy_mr_response resp = {};
> + struct gdma_destroy_mr_request req = {};
> + struct gdma_dev *mdev = dev->gdma_dev;
> + struct gdma_context *gc;
> + int err;
> +
> + gc = mdev->gdma_context;
> +
> + mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_MR, sizeof(req),
> + sizeof(resp));
> +
> + req.mr_handle = mr_handle;
> +
> + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp), &resp);
> + if (err || resp.hdr.status) {
> + dev_err(gc->dev, "Failed to destroy MR: %d, 0x%x\n", err,
> + resp.hdr.status);
> + if (!err)
> + err = -EPROTO;
> + return err;
> + }
> +
> + return 0;
> +}
> +
[...]
> +
> +static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd,
> + struct ib_qp_init_attr *attr,
> + struct ib_udata *udata)
> +{
> + struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp, ibqp);
> + struct mana_ib_dev *mdev =
> + container_of(pd->device, struct mana_ib_dev, ib_dev);
> + struct ib_rwq_ind_table *ind_tbl = attr->rwq_ind_tbl;
> + struct mana_ib_create_qp_rss_resp resp = {};
> + struct mana_ib_create_qp_rss ucmd = {};
> + struct gdma_dev *gd = mdev->gdma_dev;
> + mana_handle_t *mana_ind_table;
> + struct mana_port_context *mpc;
> + struct mana_context *mc;
> + struct net_device *ndev;
> + struct mana_ib_cq *cq;
> + struct mana_ib_wq *wq;
> + unsigned int ind_tbl_size;
> + struct ib_cq *ibcq;
> + struct ib_wq *ibwq;
> + u32 port;
> + int ret;
> + int i;
> +
> + mc = gd->driver_data;
> +
> + if (!udata || udata->inlen < sizeof(ucmd))
> + return -EINVAL;
> +
> + ret = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata->inlen));
> + if (ret) {
> + ibdev_dbg(&mdev->ib_dev,
> + "Failed copy from udata for create rss-qp, err %d\n",
> + ret);
> + return -EFAULT;
Why not return 'ret' directly instead of hard-coding -EFAULT? That would
preserve the actual error code from ib_copy_from_udata().