Re: [PATCH for-next 2/4] RDMA/hns: Add IOMMU enable support in hip08

From: Robin Murphy
Date: Thu Oct 12 2017 - 08:59:23 EST


On 12/10/17 13:31, Wei Hu (Xavier) wrote:
>
>
> On 2017/10/1 0:10, Leon Romanovsky wrote:
>> On Sat, Sep 30, 2017 at 05:28:59PM +0800, Wei Hu (Xavier) wrote:
>>> If the IOMMU is enabled, the length of sg obtained from
>>> __iommu_map_sg_attrs is not 4kB. When the IOVA is set with the sg
>>> dma address, the IOVA will not be page-contiguous, and the VA
>>> returned from dma_alloc_coherent is a vmalloc address. However,
>>> the VA obtained by page_address is a discontiguous VA. Under
>>> these circumstances, the IOVA should be calculated based on the
>>> sg length, and the VA returned from dma_alloc_coherent should be
>>> recorded in the hem struct.
>>>
>>> Signed-off-by: Wei Hu (Xavier) <xavier.huwei@xxxxxxxxxx>
>>> Signed-off-by: Shaobo Xu <xushaobo2@xxxxxxxxxx>
>>> Signed-off-by: Lijun Ou <oulijun@xxxxxxxxxx>
>>> ---
>> Doug,
>>
>> I didn't invest time in reviewing it, but having "is_vmalloc_addr" in
>> driver code to deal with dma_alloc_coherent is most probably wrong.
>>
>> Thanks
> Hi, Leon & Doug
>     We referred to the function named __ttm_dma_alloc_page in the kernel
> code as below:
>     And there are similar methods in the bch_bio_map and mem_to_page
> functions in current 4.14-rcx.
>
>         static struct dma_page *__ttm_dma_alloc_page(struct dma_pool *pool)
>         {
>             struct dma_page *d_page;
>
>             d_page = kmalloc(sizeof(struct dma_page), GFP_KERNEL);
>             if (!d_page)
>                 return NULL;
>
>             d_page->vaddr = dma_alloc_coherent(pool->dev, pool->size,
>                                                &d_page->dma,
>                                                pool->gfp_flags);
>             if (d_page->vaddr) {
>                 if (is_vmalloc_addr(d_page->vaddr))
>                     d_page->p = vmalloc_to_page(d_page->vaddr);
>                 else
>                     d_page->p = virt_to_page(d_page->vaddr);

There are cases on various architectures where neither of those is
right. Whether those actually intersect with TTM or RDMA use-cases is
another matter, of course.

What definitely is a problem is if you ever take that page and end up
accessing it through any virtual address other than the one explicitly
returned by dma_alloc_coherent(). That can blow the coherency wide open
and invite data loss, right up to killing the whole system with a
machine check on certain architectures.

Robin.

>             } else {
>                 kfree(d_page);
>                 d_page = NULL;
>             }
>             return d_page;
>         }
>
>     Regards
> Wei Hu
>>
>>>   drivers/infiniband/hw/hns/hns_roce_alloc.c |  5 ++++-
>>>   drivers/infiniband/hw/hns/hns_roce_hem.c   | 30
>>> +++++++++++++++++++++++++++---
>>>   drivers/infiniband/hw/hns/hns_roce_hem.h   |  6 ++++++
>>>   drivers/infiniband/hw/hns/hns_roce_hw_v2.c | 22 +++++++++++++++-------
>>>   4 files changed, 52 insertions(+), 11 deletions(-)
>>>
>>> diff --git a/drivers/infiniband/hw/hns/hns_roce_alloc.c
>>> b/drivers/infiniband/hw/hns/hns_roce_alloc.c
>>> index 3e4c525..a69cd4b 100644
>>> --- a/drivers/infiniband/hw/hns/hns_roce_alloc.c
>>> +++ b/drivers/infiniband/hw/hns/hns_roce_alloc.c
>>> @@ -243,7 +243,10 @@ int hns_roce_buf_alloc(struct hns_roce_dev
>>> *hr_dev, u32 size, u32 max_direct,
>>>                   goto err_free;
>>>
>>>               for (i = 0; i < buf->nbufs; ++i)
>>> -                pages[i] = virt_to_page(buf->page_list[i].buf);
>>> +                pages[i] =
>>> +                    is_vmalloc_addr(buf->page_list[i].buf) ?
>>> +                    vmalloc_to_page(buf->page_list[i].buf) :
>>> +                    virt_to_page(buf->page_list[i].buf);
>>>
>>>               buf->direct.buf = vmap(pages, buf->nbufs, VM_MAP,
>>>                              PAGE_KERNEL);
>>> diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.c
>>> b/drivers/infiniband/hw/hns/hns_roce_hem.c
>>> index 8388ae2..4a3d1d4 100644
>>> --- a/drivers/infiniband/hw/hns/hns_roce_hem.c
>>> +++ b/drivers/infiniband/hw/hns/hns_roce_hem.c
>>> @@ -200,6 +200,7 @@ static struct hns_roce_hem
>>> *hns_roce_alloc_hem(struct hns_roce_dev *hr_dev,
>>>                              gfp_t gfp_mask)
>>>   {
>>>       struct hns_roce_hem_chunk *chunk = NULL;
>>> +    struct hns_roce_vmalloc *vmalloc;
>>>       struct hns_roce_hem *hem;
>>>       struct scatterlist *mem;
>>>       int order;
>>> @@ -227,6 +228,7 @@ static struct hns_roce_hem
>>> *hns_roce_alloc_hem(struct hns_roce_dev *hr_dev,
>>>               sg_init_table(chunk->mem, HNS_ROCE_HEM_CHUNK_LEN);
>>>               chunk->npages = 0;
>>>               chunk->nsg = 0;
>>> +            memset(chunk->vmalloc, 0, sizeof(chunk->vmalloc));
>>>               list_add_tail(&chunk->list, &hem->chunk_list);
>>>           }
>>>
>>> @@ -243,7 +245,15 @@ static struct hns_roce_hem
>>> *hns_roce_alloc_hem(struct hns_roce_dev *hr_dev,
>>>           if (!buf)
>>>               goto fail;
>>>
>>> -        sg_set_buf(mem, buf, PAGE_SIZE << order);
>>> +        if (is_vmalloc_addr(buf)) {
>>> +            vmalloc = &chunk->vmalloc[chunk->npages];
>>> +            vmalloc->is_vmalloc_addr = true;
>>> +            vmalloc->vmalloc_addr = buf;
>>> +            sg_set_page(mem, vmalloc_to_page(buf),
>>> +                    PAGE_SIZE << order, offset_in_page(buf));
>>> +        } else {
>>> +            sg_set_buf(mem, buf, PAGE_SIZE << order);
>>> +        }
>>>           WARN_ON(mem->offset);
>>>           sg_dma_len(mem) = PAGE_SIZE << order;
>>>
>>> @@ -262,17 +272,25 @@ static struct hns_roce_hem
>>> *hns_roce_alloc_hem(struct hns_roce_dev *hr_dev,
>>>   void hns_roce_free_hem(struct hns_roce_dev *hr_dev, struct
>>> hns_roce_hem *hem)
>>>   {
>>>       struct hns_roce_hem_chunk *chunk, *tmp;
>>> +    void *cpu_addr;
>>>       int i;
>>>
>>>       if (!hem)
>>>           return;
>>>
>>>       list_for_each_entry_safe(chunk, tmp, &hem->chunk_list, list) {
>>> -        for (i = 0; i < chunk->npages; ++i)
>>> +        for (i = 0; i < chunk->npages; ++i) {
>>> +            if (chunk->vmalloc[i].is_vmalloc_addr)
>>> +                cpu_addr = chunk->vmalloc[i].vmalloc_addr;
>>> +            else
>>> +                cpu_addr =
>>> +                   lowmem_page_address(sg_page(&chunk->mem[i]));
>>> +
>>>               dma_free_coherent(hr_dev->dev,
>>>                      chunk->mem[i].length,
>>> -                   lowmem_page_address(sg_page(&chunk->mem[i])),
>>> +                   cpu_addr,
>>>                      sg_dma_address(&chunk->mem[i]));
>>> +        }
>>>           kfree(chunk);
>>>       }
>>>
>>> @@ -774,6 +792,12 @@ void *hns_roce_table_find(struct hns_roce_dev
>>> *hr_dev,
>>>
>>>               if (chunk->mem[i].length > (u32)offset) {
>>>                   page = sg_page(&chunk->mem[i]);
>>> +                if (chunk->vmalloc[i].is_vmalloc_addr) {
>>> +                    mutex_unlock(&table->mutex);
>>> +                    return page ?
>>> +                        chunk->vmalloc[i].vmalloc_addr
>>> +                        + offset : NULL;
>>> +                }
>>>                   goto out;
>>>               }
>>>               offset -= chunk->mem[i].length;
>>> diff --git a/drivers/infiniband/hw/hns/hns_roce_hem.h
>>> b/drivers/infiniband/hw/hns/hns_roce_hem.h
>>> index af28bbf..62d712a 100644
>>> --- a/drivers/infiniband/hw/hns/hns_roce_hem.h
>>> +++ b/drivers/infiniband/hw/hns/hns_roce_hem.h
>>> @@ -72,11 +72,17 @@ enum {
>>>        HNS_ROCE_HEM_PAGE_SIZE  = 1 << HNS_ROCE_HEM_PAGE_SHIFT,
>>>   };
>>>
>>> +struct hns_roce_vmalloc {
>>> +    bool    is_vmalloc_addr;
>>> +    void    *vmalloc_addr;
>>> +};
>>> +
>>>   struct hns_roce_hem_chunk {
>>>       struct list_head     list;
>>>       int             npages;
>>>       int             nsg;
>>>       struct scatterlist     mem[HNS_ROCE_HEM_CHUNK_LEN];
>>> +    struct hns_roce_vmalloc     vmalloc[HNS_ROCE_HEM_CHUNK_LEN];
>>>   };
>>>
>>> Â struct hns_roce_hem {
>>> diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
>>> b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
>>> index b99d70a..9e19bf1 100644
>>> --- a/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
>>> +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v2.c
>>> @@ -1093,9 +1093,11 @@ static int hns_roce_v2_write_mtpt(void
>>> *mb_buf, struct hns_roce_mr *mr,
>>>   {
>>>       struct hns_roce_v2_mpt_entry *mpt_entry;
>>>       struct scatterlist *sg;
>>> +    u64 page_addr = 0;
>>>       u64 *pages;
>>> +    int i = 0, j = 0;
>>> +    int len = 0;
>>>       int entry;
>>> -    int i;
>>>
>>>       mpt_entry = mb_buf;
>>>       memset(mpt_entry, 0, sizeof(*mpt_entry));
>>> @@ -1153,14 +1155,20 @@ static int hns_roce_v2_write_mtpt(void
>>> *mb_buf, struct hns_roce_mr *mr,
>>>
>>>       i = 0;
>>>       for_each_sg(mr->umem->sg_head.sgl, sg, mr->umem->nmap, entry) {
>>> -        pages[i] = ((u64)sg_dma_address(sg)) >> 6;
>>> -
>>> -        /* Record the first 2 entry directly to MTPT table */
>>> -        if (i >= HNS_ROCE_V2_MAX_INNER_MTPT_NUM - 1)
>>> -            break;
>>> -        i++;
>>> +        len = sg_dma_len(sg) >> PAGE_SHIFT;
>>> +        for (j = 0; j < len; ++j) {
>>> +            page_addr = sg_dma_address(sg) +
>>> +                    (j << mr->umem->page_shift);
>>> +            pages[i] = page_addr >> 6;
>>> +
>>> +            /* Record the first 2 entry directly to MTPT table */
>>> +            if (i >= HNS_ROCE_V2_MAX_INNER_MTPT_NUM - 1)
>>> +                goto found;
>>> +            i++;
>>> +        }
>>>       }
>>>
>>> +found:
>>>       mpt_entry->pa0_l = cpu_to_le32(lower_32_bits(pages[0]));
>>>       roce_set_field(mpt_entry->byte_56_pa0_h, V2_MPT_BYTE_56_PA0_H_M,
>>>                  V2_MPT_BYTE_56_PA0_H_S,
>>> --
>>> 1.9.1
>>>
>
>
> _______________________________________________
> iommu mailing list
> iommu@xxxxxxxxxxxxxxxxxxxxxxxxxx
> https://lists.linuxfoundation.org/mailman/listinfo/iommu