RE: [Patch v9 12/12] RDMA/mana_ib: Add a driver for Microsoft Azure Network Adapter
From: Long Li
Date: Mon Oct 31 2022 - 15:32:34 EST
> > +int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct
> ib_umem *umem,
> > + mana_handle_t *gdma_region)
> > +{
> > + struct gdma_dma_region_add_pages_req *add_req = NULL;
> > + struct gdma_create_dma_region_resp create_resp = {};
> > + struct gdma_create_dma_region_req *create_req;
> > + size_t num_pages_cur, num_pages_to_handle;
> > + unsigned int create_req_msg_size;
> > + struct hw_channel_context *hwc;
> > + struct ib_block_iter biter;
> > + size_t max_pgs_create_cmd;
> > + struct gdma_context *gc;
> > + size_t num_pages_total;
> > + struct gdma_dev *mdev;
> > + unsigned long page_sz;
> > + void *request_buf;
> > + unsigned int i;
> > + int err;
> > +
> > + mdev = dev->gdma_dev;
> > + gc = mdev->gdma_context;
> > + hwc = gc->hwc.driver_data;
> > +
> > + /* Hardware requires dma region to align to chosen page size */
> > + page_sz = ib_umem_find_best_pgsz(umem, PAGE_SZ_BM, 0);
>
> Does your HW support arbitary MR offsets in the IOVA?
Yes, the HW supports arbitrary MR offsets. I'm checking with hardware guys to confirm.
>
> struct ib_mr *mana_ib_reg_user_mr(struct ib_pd *ibpd, u64 start, u64
> length,
> u64 iova, int access_flags,
> struct ib_udata *udata)
> {
> [..]
>
> err = mana_ib_gd_create_dma_region(dev, mr-
> >umem,&dma_region_handle);
> ..
> mr_params.gva.virtual_address = iova;
>
> Eg if I set iova to 1 and length to PAGE_SIZE and pass in a umem which is fully
> page aligned, will the HW work, or will it DMA to the wrong locations?
>
> All other RDMA HW requires passing iova to the
> ib_umem_find_best_pgsz() specifically to reject/adjust the misalignment of
> the IOVA relative to the selected pagesize.
>
> > + __rdma_umem_block_iter_start(&biter, umem, page_sz);
> > +
> > + for (i = 0; i < num_pages_to_handle; ++i) {
> > + dma_addr_t cur_addr;
> > +
> > + __rdma_block_iter_next(&biter);
> > + cur_addr = rdma_block_iter_dma_address(&biter);
> > +
> > + create_req->page_addr_list[i] = cur_addr;
> > + }
>
> This loop is still a mess, why can you not write it as I said for v6?
>
> Usually the way these loops are structured is to fill the array and then check
> for fullness, trigger an action to drain the array, and reset the indexes back
> to the start.
>
> so do the usual
>
> rdma_umem_for_each_dma_block() {
> page_addr_list[tail++] = rdma_block_iter_dma_address(&biter);
> if (tail >= num_pages_to_handle) {
> mana_gd_send_request()
> reset buffer
> tail = 0
> }
> }
>
> if (tail)
> mana_gd_send_request()
I tried to recode this section; the new code looks like the following.
It's 30 lines longer than the previous version.
The difficulty is that there are two PF messages involved: the 1st one is
for creating the initial pages, the 2nd (and subsequent) ones are for adding
more pages. So going through the page sequence as seen from the hardware
side makes it shorter.
What do you think of this version?
static int
mana_ib_gd_first_dma_region(struct mana_ib_dev *dev,
struct gdma_context *gc,
struct gdma_create_dma_region_req *create_req,
size_t num_pages, mana_handle_t *gdma_region)
{
struct gdma_create_dma_region_resp create_resp = {};
unsigned int create_req_msg_size;
int err;
create_req_msg_size =
struct_size(create_req, page_addr_list, num_pages);
create_req->page_addr_list_len = num_pages;
err = mana_gd_send_request(gc, create_req_msg_size, create_req,
sizeof(create_resp), &create_resp);
if (err || create_resp.hdr.status) {
ibdev_dbg(&dev->ib_dev,
"Failed to create DMA region: %d, 0x%x\n",
err, create_resp.hdr.status);
if (!err)
err = -EPROTO;
return err;
}
*gdma_region = create_resp.dma_region_handle;
ibdev_dbg(&dev->ib_dev, "Created DMA region handle 0x%llx\n",
*gdma_region);
return 0;
}
static int mana_ib_gd_add_dma_region(struct mana_ib_dev *dev,
struct gdma_context *gc,
struct gdma_dma_region_add_pages_req *add_req,
unsigned int num_pages, u32 expected_status)
{
unsigned int add_req_msg_size =
struct_size(add_req, page_addr_list, num_pages);
struct gdma_general_resp add_resp = {};
int err;
mana_gd_init_req_hdr(&add_req->hdr, GDMA_DMA_REGION_ADD_PAGES,
add_req_msg_size, sizeof(add_resp));
add_req->page_addr_list_len = num_pages;
err = mana_gd_send_request(gc, add_req_msg_size, add_req,
sizeof(add_resp), &add_resp);
if (err || add_resp.hdr.status != expected_status) {
ibdev_dbg(&dev->ib_dev,
"Failed to create DMA region: %d, 0x%x\n",
err, add_resp.hdr.status);
if (!err)
err = -EPROTO;
return err;
}
return 0;
}
int mana_ib_gd_create_dma_region(struct mana_ib_dev *dev, struct ib_umem *umem,
mana_handle_t *gdma_region)
{
struct gdma_dma_region_add_pages_req *add_req = NULL;
size_t num_pages_processed = 0, num_pages_to_handle;
struct gdma_create_dma_region_resp create_resp = {};
struct gdma_create_dma_region_req *create_req;
unsigned int create_req_msg_size;
struct hw_channel_context *hwc;
size_t max_pgs_create_cmd;
size_t max_pgs_add_cmd;
struct ib_block_iter biter;
struct gdma_context *gc;
size_t num_pages_total;
struct gdma_dev *mdev;
unsigned long page_sz;
unsigned int tail = 0;
u64 *page_addr_list;
void *request_buf;
int err;
mdev = dev->gdma_dev;
gc = mdev->gdma_context;
hwc = gc->hwc.driver_data;
/* Hardware requires dma region to align to chosen page size */
page_sz = ib_umem_find_best_pgsz(umem, PAGE_SZ_BM, 0);
if (!page_sz) {
ibdev_dbg(&dev->ib_dev, "failed to find page size.\n");
return -ENOMEM;
}
num_pages_total = ib_umem_num_dma_blocks(umem, page_sz);
max_pgs_create_cmd =
(hwc->max_req_msg_size - sizeof(*create_req)) / sizeof(u64);
num_pages_to_handle =
min_t(size_t, num_pages_total, max_pgs_create_cmd);
create_req_msg_size =
struct_size(create_req, page_addr_list, num_pages_to_handle);
request_buf = kzalloc(hwc->max_req_msg_size, GFP_KERNEL);
if (!request_buf)
return -ENOMEM;
create_req = request_buf;
mana_gd_init_req_hdr(&create_req->hdr, GDMA_CREATE_DMA_REGION,
create_req_msg_size, sizeof(create_resp));
create_req->length = umem->length;
create_req->offset_in_page = umem->address & (page_sz - 1);
create_req->gdma_page_type = order_base_2(page_sz) - PAGE_SHIFT;
create_req->page_count = num_pages_total;
ibdev_dbg(&dev->ib_dev, "size_dma_region %lu num_pages_total %lu\n",
umem->length, num_pages_total);
ibdev_dbg(&dev->ib_dev, "page_sz %lu offset_in_page %u\n",
page_sz, create_req->offset_in_page);
ibdev_dbg(&dev->ib_dev, "num_pages_to_handle %lu, gdma_page_type %u",
num_pages_to_handle, create_req->gdma_page_type);
add_req = request_buf;
max_pgs_add_cmd =
(hwc->max_req_msg_size - sizeof(*add_req)) /
sizeof(u64);
page_addr_list = create_req->page_addr_list;
rdma_umem_for_each_dma_block(umem, &biter, page_sz) {
page_addr_list[tail++] = rdma_block_iter_dma_address(&biter);
if (tail >= num_pages_to_handle) {
u32 expected_s = 0;
if (num_pages_processed &&
num_pages_processed + num_pages_to_handle <
num_pages_total) {
/* Status indicating more pages are needed */
expected_s = GDMA_STATUS_MORE_ENTRIES;
}
if (!num_pages_processed) {
/* First message */
err = mana_ib_gd_first_dma_region(dev, gc,
create_req,
tail,
gdma_region);
if (err)
goto out;
page_addr_list = add_req->page_addr_list;
} else {
err = mana_ib_gd_add_dma_region(dev, gc,
add_req, tail,
expected_s);
if (err) {
tail = 0;
break;
}
}
num_pages_processed += tail;
/* Prepare to send ADD_PAGE requests */
num_pages_to_handle =
min_t(size_t,
num_pages_total - num_pages_processed,
max_pgs_add_cmd);
tail = 0;
}
}
if (tail) {
if (!num_pages_processed) {
err = mana_ib_gd_first_dma_region(dev, gc, create_req,
tail, gdma_region);
if (err)
goto out;
} else {
err = mana_ib_gd_add_dma_region(dev, gc, add_req,
tail, 0);
}
}
if (err)
mana_ib_gd_destroy_dma_region(dev, create_resp.dma_region_handle);
out:
kfree(request_buf);
return err;
}