RE: [Patch v7 12/12] RDMA/mana_ib: Add a driver for Microsoft Azure Network Adapter
From: Long Li
Date: Wed Oct 19 2022 - 15:36:57 EST
> Subject: Re: [Patch v7 12/12] RDMA/mana_ib: Add a driver for Microsoft
> Azure Network Adapter
>
> On 2022/10/18 3:20, longli@xxxxxxxxxxxxxxxxx wrote:
> > From: Long Li <longli@xxxxxxxxxxxxx>
> >
> > Add a RDMA VF driver for Microsoft Azure Network Adapter (MANA).
> >
> > Co-developed-by: Ajay Sharma <sharmaajay@xxxxxxxxxxxxx>
> > Signed-off-by: Ajay Sharma <sharmaajay@xxxxxxxxxxxxx>
> > Reviewed-by: Dexuan Cui <decui@xxxxxxxxxxxxx>
> > Signed-off-by: Long Li <longli@xxxxxxxxxxxxx>
> > ---
> > Change log:
> > v2:
> > Changed coding styles/formats
> > Checked undersize for udata length
> > Changed all logging to use ibdev_xxx() Avoided page array copy when
> > doing MR Sorted driver ops Fixed warnings reported by kernel test
> > robot <lkp@xxxxxxxxx>
> >
> > v3:
> > More coding style/format changes
> >
> > v4:
> > Process error on hardware vport configuration
> >
> > v5:
> > Change licenses to GPL-2.0-only
> > Fix error handling in mana_ib_gd_create_dma_region()
> >
> > v6:
> > rebased to rdma-next
> > removed redundant initialization to return value in mana_ib_probe()
> > added missing tabs at the end of mana_ib_gd_create_dma_region()
> >
> > v7:
> > move mana_gd_destroy_doorbell_page() and
> > mana_gd_allocate_doorbell_page() from GDMA to this driver use
> > ib_umem_find_best_pgsz() for finding page size for registering dma
> > regions with hardware fix a bug that may double free mana_ind_table in
> > mana_ib_create_qp_rss() add Ajay Sharma <sharmaajay@xxxxxxxxxxxxx>
> to
> > maintainer list add details to description in
> > drivers/infiniband/hw/mana/Kconfig
> > change multiple lines comments to use RDMA style from NETDEV style
> > change mana_ib_dev_ops to static use module_auxiliary_driver() in
> > place of module_init and module_exit move all user-triggerable error
> > messages to debug messages check for ind_tbl_size overflow in
> > mana_ib_create_qp_rss()
> >
> > MAINTAINERS | 9 +
> > drivers/infiniband/Kconfig | 1 +
> > drivers/infiniband/hw/Makefile | 1 +
> > drivers/infiniband/hw/mana/Kconfig | 10 +
> > drivers/infiniband/hw/mana/Makefile | 4 +
> > drivers/infiniband/hw/mana/cq.c | 79 ++++
> > drivers/infiniband/hw/mana/device.c | 117 ++++++
> > drivers/infiniband/hw/mana/main.c | 508
> ++++++++++++++++++++++++
> > drivers/infiniband/hw/mana/mana_ib.h | 156 ++++++++
> > drivers/infiniband/hw/mana/mr.c | 200 ++++++++++
> > drivers/infiniband/hw/mana/qp.c | 505 +++++++++++++++++++++++
> > drivers/infiniband/hw/mana/wq.c | 115 ++++++
> > include/net/mana/mana.h | 3 +
> > include/uapi/rdma/ib_user_ioctl_verbs.h | 1 +
> > include/uapi/rdma/mana-abi.h | 66 +++
> > 15 files changed, 1775 insertions(+)
> > create mode 100644 drivers/infiniband/hw/mana/Kconfig
> > create mode 100644 drivers/infiniband/hw/mana/Makefile
> > create mode 100644 drivers/infiniband/hw/mana/cq.c create mode
> > 100644 drivers/infiniband/hw/mana/device.c
> > create mode 100644 drivers/infiniband/hw/mana/main.c create mode
> > 100644 drivers/infiniband/hw/mana/mana_ib.h
> > create mode 100644 drivers/infiniband/hw/mana/mr.c create mode
> > 100644 drivers/infiniband/hw/mana/qp.c create mode 100644
> > drivers/infiniband/hw/mana/wq.c create mode 100644
> > include/uapi/rdma/mana-abi.h
> >
>
> [...]
>
> > +
> > +int mana_ib_cfg_vport(struct mana_ib_dev *dev, u32 port, struct
> mana_ib_pd *pd,
> > + u32 doorbell_id)
> > +{
> > + struct gdma_dev *mdev = dev->gdma_dev;
> > + struct mana_port_context *mpc;
> > + struct mana_context *mc;
> > + struct net_device *ndev;
> > + int err;
> > +
> > + mc = mdev->driver_data;
> > + ndev = mc->ports[port];
> > + mpc = netdev_priv(ndev);
> > +
> > + mutex_lock(&pd->vport_mutex);
> > +
> > + pd->vport_use_count++;
> > + if (pd->vport_use_count > 1) {
> > + ibdev_dbg(&dev->ib_dev,
> > + "Skip as this PD is already configured vport\n");
> > + mutex_unlock(&pd->vport_mutex);
> > + return 0;
> > + }
> > + mutex_unlock(&pd->vport_mutex);
> > +
> > + err = mana_cfg_vport(mpc, pd->pdn, doorbell_id);
> > + if (err) {
> > + mutex_lock(&pd->vport_mutex);
> > + pd->vport_use_count--;
> > + mutex_unlock(&pd->vport_mutex);
>
> It seems there might be a race between the "pd->vport_use_count > 1"
> check above and the error handling here: it may cause another user to use an
> unconfigured vport, if that user is checking "pd->vport_use_count > 1"
> while mana_cfg_vport() fails before doing "pd->vport_use_count--".
Thank you. You are correct about the possible race. I'm fixing it.
>
> > +
> > + ibdev_dbg(&dev->ib_dev, "Failed to configure vPort %d\n",
> err);
> > + return err;
> > + }
> > +
> > + pd->tx_shortform_allowed = mpc->tx_shortform_allowed;
> > + pd->tx_vp_offset = mpc->tx_vp_offset;
> > +
> > + ibdev_dbg(&dev->ib_dev, "vport handle %llx pdid %x
> doorbell_id %x\n",
> > + mpc->port_handle, pd->pdn, doorbell_id);
> > +
> > + return 0;
> > +}
> > +
>
> [...]
>
> > +
> > +static int mana_gd_allocate_doorbell_page(struct gdma_context *gc,
> > + int *doorbell_page)
> > +{
> > + struct gdma_allocate_resource_range_req req = {};
> > + struct gdma_allocate_resource_range_resp resp = {};
> > + int err;
> > +
> > + mana_gd_init_req_hdr(&req.hdr,
> GDMA_ALLOCATE_RESOURCE_RANGE,
> > + sizeof(req), sizeof(resp));
> > +
> > + req.resource_type = GDMA_RESOURCE_DOORBELL_PAGE;
> > + req.num_resources = 1;
> > + req.alignment = 1;
> > +
> > + /* Have GDMA start searching from 0 */
> > + req.allocated_resources = 0;
> > +
> > + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp),
> &resp);
> > + if (err || resp.hdr.status) {
> > + dev_err(gc->dev,
> > + "Failed to allocate doorbell page: ret %d, 0x%x\n",
> > + err, resp.hdr.status);
> > + return err ? err : -EPROTO;
> > + }
> > +
> > + *doorbell_page = resp.allocated_resources;
> > +
> > + return 0;
> > +}
> > +
> > +int mana_ib_alloc_ucontext(struct ib_ucontext *ibcontext,
> > + struct ib_udata *udata)
> > +{
> > + struct mana_ib_ucontext *ucontext =
> > + container_of(ibcontext, struct mana_ib_ucontext,
> ibucontext);
> > + struct ib_device *ibdev = ibcontext->device;
> > + struct mana_ib_dev *mdev;
> > + struct gdma_context *gc;
> > + struct gdma_dev *dev;
> > + int doorbell_page;
> > + int ret;
> > +
> > + mdev = container_of(ibdev, struct mana_ib_dev, ib_dev);
> > + dev = mdev->gdma_dev;
> > + gc = dev->gdma_context;
> > +
> > + /* Allocate a doorbell page index */
> > + ret = mana_gd_allocate_doorbell_page(gc, &doorbell_page);
> > + if (ret) {
> > + ibdev_dbg(ibdev, "Failed to allocate doorbell page %d\n",
> ret);
> > + return -ENOMEM;
>
> It does not make much sense to do "err ? err : -EPROTO" in
> mana_gd_allocate_doorbell_page() if -ENOMEM is returned unconditionally
> here.
I will change it to return "ret", to be consistent with other usage of the mana_gd_xxx functions.
>
> > + }
> > +
> > + ibdev_dbg(ibdev, "Doorbell page allocated %d\n", doorbell_page);
> > +
> > + ucontext->doorbell = doorbell_page;
> > +
> > + return 0;
> > +}
> > +
>
> [...]
>
> > diff --git a/drivers/infiniband/hw/mana/mana_ib.h
> > b/drivers/infiniband/hw/mana/mana_ib.h
> > new file mode 100644
> > index 000000000000..2225a6d6f8e1
> > --- /dev/null
> > +++ b/drivers/infiniband/hw/mana/mana_ib.h
> > @@ -0,0 +1,156 @@
> > +/* SPDX-License-Identifier: GPL-2.0-only */
> > +/*
> > + * Copyright (c) 2022 Microsoft Corporation. All rights reserved.
> > + */
> > +
> > +#ifndef _MANA_IB_H_
> > +#define _MANA_IB_H_
> > +
> > +#include <rdma/ib_verbs.h>
> > +#include <rdma/ib_mad.h>
> > +#include <rdma/ib_umem.h>
> > +#include <rdma/mana-abi.h>
> > +#include <rdma/uverbs_ioctl.h>
> > +
> > +#include <net/mana/mana.h>
> > +
> > +#define PAGE_SZ_BM \
> > + (SZ_4K | SZ_8K | SZ_16K | SZ_32K | SZ_64K | SZ_128K | SZ_256K |
> \
> > + SZ_512K | SZ_1M | SZ_2M)
> > +
> > +/* MANA doesn't have any limit for MR size */ #define
> > +MANA_IB_MAX_MR_SIZE ((u64)(~(0ULL)))
>
> Use U64_MAX?
Good idea. Will change it.
>
> > +
> > +struct mana_ib_dev {
> > + struct ib_device ib_dev;
> > + struct gdma_dev *gdma_dev;
> > +};
> > +
>
> [...]
>
> > diff --git a/drivers/infiniband/hw/mana/mr.c
> > b/drivers/infiniband/hw/mana/mr.c new file mode 100644 index
> > 000000000000..09124dd1792d
> > --- /dev/null
> > +++ b/drivers/infiniband/hw/mana/mr.c
> > @@ -0,0 +1,200 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * Copyright (c) 2022, Microsoft Corporation. All rights reserved.
> > + */
> > +
> > +#include "mana_ib.h"
> > +
> > +#define VALID_MR_FLAGS \
> > + (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
> > +IB_ACCESS_REMOTE_READ)
> > +
> > +static enum gdma_mr_access_flags
> > +mana_ib_verbs_to_gdma_access_flags(int access_flags) {
> > + enum gdma_mr_access_flags flags =
> GDMA_ACCESS_FLAG_LOCAL_READ;
> > +
> > + if (access_flags & IB_ACCESS_LOCAL_WRITE)
> > + flags |= GDMA_ACCESS_FLAG_LOCAL_WRITE;
> > +
> > + if (access_flags & IB_ACCESS_REMOTE_WRITE)
> > + flags |= GDMA_ACCESS_FLAG_REMOTE_WRITE;
> > +
> > + if (access_flags & IB_ACCESS_REMOTE_READ)
> > + flags |= GDMA_ACCESS_FLAG_REMOTE_READ;
> > +
> > + return flags;
> > +}
> > +
> > +static int mana_ib_gd_create_mr(struct mana_ib_dev *dev, struct
> mana_ib_mr *mr,
> > + struct gdma_create_mr_params *mr_params)
> {
> > + struct gdma_create_mr_response resp = {};
> > + struct gdma_create_mr_request req = {};
> > + struct gdma_dev *mdev = dev->gdma_dev;
> > + struct gdma_context *gc;
> > + int err;
> > +
> > + gc = mdev->gdma_context;
> > +
> > + mana_gd_init_req_hdr(&req.hdr, GDMA_CREATE_MR, sizeof(req),
> > + sizeof(resp));
> > + req.pd_handle = mr_params->pd_handle;
> > + req.mr_type = mr_params->mr_type;
> > +
> > + switch (mr_params->mr_type) {
> > + case GDMA_MR_TYPE_GVA:
> > + req.gva.dma_region_handle = mr_params-
> >gva.dma_region_handle;
> > + req.gva.virtual_address = mr_params->gva.virtual_address;
> > + req.gva.access_flags = mr_params->gva.access_flags;
> > + break;
> > +
> > + default:
> > + ibdev_dbg(&dev->ib_dev,
> > + "invalid param (GDMA_MR_TYPE) passed,
> type %d\n",
> > + req.mr_type);
> > + err = -EINVAL;
> > + goto error;
> > + }
> > +
> > + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp),
> > +&resp);
> > +
> > + if (err || resp.hdr.status) {
> > + ibdev_dbg(&dev->ib_dev, "Failed to create mr %d, %u", err,
> > + resp.hdr.status);
> > + if (!err)
> > + err = -EPROTO;
> > +
> > + goto error;
> > + }
> > +
> > + mr->ibmr.lkey = resp.lkey;
> > + mr->ibmr.rkey = resp.rkey;
> > + mr->mr_handle = resp.mr_handle;
> > +
> > + return 0;
> > +error:
> > + return err;
>
> There is no error handling here, maybe just return error directly instead of a
> goto.
Yes, will remove the goto.
>
> > +}
> > +
> > +static int mana_ib_gd_destroy_mr(struct mana_ib_dev *dev,
> > +gdma_obj_handle_t mr_handle) {
> > + struct gdma_destroy_mr_response resp = {};
> > + struct gdma_destroy_mr_request req = {};
> > + struct gdma_dev *mdev = dev->gdma_dev;
> > + struct gdma_context *gc;
> > + int err;
> > +
> > + gc = mdev->gdma_context;
> > +
> > + mana_gd_init_req_hdr(&req.hdr, GDMA_DESTROY_MR, sizeof(req),
> > + sizeof(resp));
> > +
> > + req.mr_handle = mr_handle;
> > +
> > + err = mana_gd_send_request(gc, sizeof(req), &req, sizeof(resp),
> &resp);
> > + if (err || resp.hdr.status) {
> > + dev_err(gc->dev, "Failed to destroy MR: %d, 0x%x\n", err,
> > + resp.hdr.status);
> > + if (!err)
> > + err = -EPROTO;
> > + return err;
> > + }
> > +
> > + return 0;
> > +}
> > +
>
> [...]
>
> > +
> > +static int mana_ib_create_qp_rss(struct ib_qp *ibqp, struct ib_pd *pd,
> > + struct ib_qp_init_attr *attr,
> > + struct ib_udata *udata)
> > +{
> > + struct mana_ib_qp *qp = container_of(ibqp, struct mana_ib_qp,
> ibqp);
> > + struct mana_ib_dev *mdev =
> > + container_of(pd->device, struct mana_ib_dev, ib_dev);
> > + struct ib_rwq_ind_table *ind_tbl = attr->rwq_ind_tbl;
> > + struct mana_ib_create_qp_rss_resp resp = {};
> > + struct mana_ib_create_qp_rss ucmd = {};
> > + struct gdma_dev *gd = mdev->gdma_dev;
> > + mana_handle_t *mana_ind_table;
> > + struct mana_port_context *mpc;
> > + struct mana_context *mc;
> > + struct net_device *ndev;
> > + struct mana_ib_cq *cq;
> > + struct mana_ib_wq *wq;
> > + unsigned int ind_tbl_size;
> > + struct ib_cq *ibcq;
> > + struct ib_wq *ibwq;
> > + u32 port;
> > + int ret;
> > + int i;
> > +
> > + mc = gd->driver_data;
> > +
> > + if (!udata || udata->inlen < sizeof(ucmd))
> > + return -EINVAL;
> > +
> > + ret = ib_copy_from_udata(&ucmd, udata, min(sizeof(ucmd), udata-
> >inlen));
> > + if (ret) {
> > + ibdev_dbg(&mdev->ib_dev,
> > + "Failed copy from udata for create rss-qp, err %d\n",
> > + ret);
> > + return -EFAULT;
>
> Why not just return 'ret' directly?
Yes, it's better to just return "ret". Will fix this.
Thank you.