Re: [PATCH v2] vfio iommu type1: Improve vfio_iommu_type1_pin_pages performance

From: xuxiaoyang (C)
Date: Tue Dec 08 2020 - 08:57:34 EST




On 2020/11/21 15:58, xuxiaoyang (C) wrote:
> vfio_pin_pages() accepts an array of unrelated iova pfns and processes
> each to return the physical pfn. When dealing with large arrays of
> contiguous iovas, vfio_iommu_type1_pin_pages is very inefficient because
> it is processed page by page.In this case, we can divide the iova pfn
> array into multiple continuous ranges and optimize them. For example,
> when the iova pfn array is {1,5,6,7,9}, it will be divided into three
> groups {1}, {5,6,7}, {9} for processing. When processing {5,6,7}, the
> number of calls to pin_user_pages_remote is reduced from 3 times to once.
> For single page or large array of discontinuous iovas, we still use
> vfio_pin_page_external to deal with it to reduce the performance loss
> caused by refactoring.
>
> Signed-off-by: Xiaoyang Xu <xuxiaoyang2@xxxxxxxxxx>
> ---
> v1 -> v2:
> * make vfio_iommu_type1_pin_contiguous_pages use vfio_pin_page_external
> to pin single page when npage=1
> * make vfio_pin_contiguous_pages_external use set npage to mark
> consecutive pages as dirty. simplify the processing logic of unwind
> * remove unnecessary checks in vfio_get_contiguous_pages_length, put
> the least costly judgment logic at the top, and replace
> vfio_iova_get_vfio_pfn with vfio_find_vpfn
>
> drivers/vfio/vfio_iommu_type1.c | 231 ++++++++++++++++++++++++++++----
> 1 file changed, 204 insertions(+), 27 deletions(-)
>
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index 67e827638995..080727b531c6 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -628,6 +628,196 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
> return unlocked;
> }
>
> +static int contiguous_vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
> + int prot, long npage, unsigned long *phys_pfn)
> +{
> + struct page **pages = NULL;
> + unsigned int flags = 0;
> + int i, ret;
> +
> + pages = kvmalloc_array(npage, sizeof(struct page *), GFP_KERNEL);
> + if (!pages)
> + return -ENOMEM;
> +
> + if (prot & IOMMU_WRITE)
> + flags |= FOLL_WRITE;
> +
> + mmap_read_lock(mm);
> + ret = pin_user_pages_remote(mm, vaddr, npage, flags | FOLL_LONGTERM,
> + pages, NULL, NULL);
> + mmap_read_unlock(mm);
> +
> + for (i = 0; i < ret; i++)
> + *(phys_pfn + i) = page_to_pfn(pages[i]);
> +
> + kvfree(pages);
> +
> + return ret;
> +}
> +
> +static int vfio_pin_contiguous_pages_external(struct vfio_iommu *iommu,
> + struct vfio_dma *dma,
> + unsigned long *user_pfn,
> + int npage, unsigned long *phys_pfn,
> + bool do_accounting)
> +{
> + int ret, i, j, lock_acct = 0;
> + unsigned long remote_vaddr;
> + dma_addr_t iova;
> + struct mm_struct *mm;
> + struct vfio_pfn *vpfn;
> +
> + mm = get_task_mm(dma->task);
> + if (!mm)
> + return -ENODEV;
> +
> + iova = user_pfn[0] << PAGE_SHIFT;
> + remote_vaddr = dma->vaddr + iova - dma->iova;
> + ret = contiguous_vaddr_get_pfn(mm, remote_vaddr, dma->prot,
> + npage, phys_pfn);
> + mmput(mm);
> + if (ret <= 0)
> + return ret;
> +
> + npage = ret;
> + for (i = 0; i < npage; i++) {
> + iova = user_pfn[i] << PAGE_SHIFT;
> + ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
> + if (ret)
> + goto unwind;
> +
> + if (!is_invalid_reserved_pfn(phys_pfn[i]))
> + lock_acct++;
> + }
> +
> + if (do_accounting) {
> + ret = vfio_lock_acct(dma, lock_acct, true);
> + if (ret) {
> + if (ret == -ENOMEM)
> + pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK (%ld) exceeded\n",
> + __func__, dma->task->comm, task_pid_nr(dma->task),
> + task_rlimit(dma->task, RLIMIT_MEMLOCK));
> + goto unwind;
> + }
> + }
> +
> + if (iommu->dirty_page_tracking) {
> + unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
> +
> + /*
> + * Bitmap populated with the smallest supported page
> + * size
> + */
> + bitmap_set(dma->bitmap,
> + ((user_pfn[0] << PAGE_SHIFT) - dma->iova) >> pgshift, npage);
> + }
> +
> + return i;
> +unwind:
> + for (j = 0; j < npage; j++) {
> + if (j < i) {
> + iova = user_pfn[j] << PAGE_SHIFT;
> + vpfn = vfio_find_vpfn(dma, iova);
> + vfio_iova_put_vfio_pfn(dma, vpfn);
> + } else {
> + put_pfn(phys_pfn[j], dma->prot);
> + }
> +
> + phys_pfn[j] = 0;
> + }
> +
> + return ret;
> +}
> +
> +static int vfio_iommu_type1_pin_contiguous_pages(struct vfio_iommu *iommu,
> + struct vfio_dma *dma,
> + unsigned long *user_pfn,
> + int npage, unsigned long *phys_pfn,
> + bool do_accounting)
> +{
> + int ret = 0, i, j;
> + unsigned long remote_vaddr;
> + dma_addr_t iova;
> +
> + if (npage == 1)
> + goto pin_single_page;
> +
> + ret = vfio_pin_contiguous_pages_external(iommu, dma, user_pfn, npage,
> + phys_pfn, do_accounting);
> + if (ret == npage)
> + return ret;
> +
> + if (ret < 0)
> + ret = 0;
> +
> +pin_single_page:
> + for (i = ret; i < npage; i++) {
> + iova = user_pfn[i] << PAGE_SHIFT;
> + remote_vaddr = dma->vaddr + iova - dma->iova;
> +
> + ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
> + do_accounting);
> + if (ret)
> + goto pin_unwind;
> +
> + ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
> + if (ret) {
> + if (put_pfn(phys_pfn[i], dma->prot) && do_accounting)
> + vfio_lock_acct(dma, -1, true);
> + goto pin_unwind;
> + }
> +
> + if (iommu->dirty_page_tracking) {
> + unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
> +
> + /*
> + * Bitmap populated with the smallest supported page
> + * size
> + */
> + bitmap_set(dma->bitmap,
> + (iova - dma->iova) >> pgshift, 1);
> + }
> + }
> +
> + return i;
> +
> +pin_unwind:
> + phys_pfn[i] = 0;
> + for (j = 0; j < i; j++) {
> + iova = user_pfn[j] << PAGE_SHIFT;
> + vfio_unpin_page_external(dma, iova, do_accounting);
> + phys_pfn[j] = 0;
> + }
> +
> + return ret;
> +}
> +
> +static int vfio_get_contiguous_pages_length(struct vfio_dma *dma,
> + unsigned long *user_pfn, int npage)
> +{
> + int i;
> + dma_addr_t iova = user_pfn[0] << PAGE_SHIFT;
> + struct vfio_pfn *vpfn;
> +
> + if (npage <= 1)
> + return npage;
> +
> + for (i = 1; i < npage; i++) {
> + if (user_pfn[i] != user_pfn[0] + i)
> + break;
> +
> + iova = user_pfn[i] << PAGE_SHIFT;
> + if (iova >= dma->iova + dma->size ||
> + iova + PAGE_SIZE <= dma->iova)
> + break;
> +
> + vpfn = vfio_find_vpfn(dma, iova);
> + if (vpfn)
> + break;
> + }
> + return i;
> +}
> +
> static int vfio_iommu_type1_pin_pages(void *iommu_data,
> struct iommu_group *iommu_group,
> unsigned long *user_pfn,
> @@ -637,9 +827,9 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
> struct vfio_iommu *iommu = iommu_data;
> struct vfio_group *group;
> int i, j, ret;
> - unsigned long remote_vaddr;
> struct vfio_dma *dma;
> bool do_accounting;
> + int contiguous_npage;
>
> if (!iommu || !user_pfn || !phys_pfn)
> return -EINVAL;
> @@ -663,7 +853,7 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
> */
> do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
>
> - for (i = 0; i < npage; i++) {
> + for (i = 0; i < npage; i += contiguous_npage) {
> dma_addr_t iova;
> struct vfio_pfn *vpfn;
>
> @@ -682,31 +872,18 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data,
> vpfn = vfio_iova_get_vfio_pfn(dma, iova);
> if (vpfn) {
> phys_pfn[i] = vpfn->pfn;
> - continue;
> - }
> -
> - remote_vaddr = dma->vaddr + (iova - dma->iova);
> - ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
> - do_accounting);
> - if (ret)
> - goto pin_unwind;
> -
> - ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
> - if (ret) {
> - if (put_pfn(phys_pfn[i], dma->prot) && do_accounting)
> - vfio_lock_acct(dma, -1, true);
> - goto pin_unwind;
> - }
> -
> - if (iommu->dirty_page_tracking) {
> - unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
> -
> - /*
> - * Bitmap populated with the smallest supported page
> - * size
> - */
> - bitmap_set(dma->bitmap,
> - (iova - dma->iova) >> pgshift, 1);
> + contiguous_npage = 1;
> + } else {
> + ret = vfio_get_contiguous_pages_length(dma,
> + &user_pfn[i], npage - i);
> + if (ret < 0)
> + goto pin_unwind;
> +
> + ret = vfio_iommu_type1_pin_contiguous_pages(iommu,
> + dma, &user_pfn[i], ret, &phys_pfn[i], do_accounting);
> + if (ret < 0)
> + goto pin_unwind;
> + contiguous_npage = ret;
> }
> }
> ret = i;
> --
> 2.19.1
> .
>

hi Cornelia Huck, Eric Farman, Zhenyu Wang, Zhi Wang

vfio_pin_pages() accepts an array of unrelated iova pfns and processes
each to return the physical pfn. When dealing with large arrays of
contiguous iovas, vfio_iommu_type1_pin_pages is very inefficient because
it is processed page by page. In this case, we can divide the iova pfn
array into multiple continuous ranges and optimize them. I have a set
of performance test data for reference.

The patch was not applied
1 page 512 pages
no huge pages: 1638ns 223651ns
THP: 1668ns 222330ns
HugeTLB: 1526ns 208151ns

The patch was applied
1 page 512 pages
no huge pages 1735ns 167286ns
THP: 1934ns 126900ns
HugeTLB: 1713ns 102188ns

As Alex Williamson said, this patch lacks proof that it works in the
real world. I think you will have some valuable opinions.

Regards,
Xu