Re: [PATCH] vfio/type1: Restore mapping performance with mdev support

From: Alex Williamson
Date: Thu Dec 15 2016 - 03:14:46 EST


On Thu, 15 Dec 2016 12:05:35 +0530
Kirti Wankhede <kwankhede@xxxxxxxxxx> wrote:

> On 12/14/2016 2:28 AM, Alex Williamson wrote:
> > As part of the mdev support, type1 now gets a task reference per
> > vfio_dma and uses that to get an mm reference for the task while
> > working on accounting. That's the correct thing to do for paths
> > where we can't rely on using current, but there are still hot paths
> > where we can optimize because we know we're invoked by the user.
> >
> > Specifically, vfio_pin_pages_remote() is only called when the user
> > does DMA mapping (vfio_dma_do_map) or if an IOMMU group is added to
> > a container with existing mappings (vfio_iommu_replay). We can
> > therefore use current->mm as well as rlimit() and capable() directly
> > rather than going through the high overhead path via the stored
> > task_struct. We also know that vfio_dma_do_unmap() is only called
> > via user ioctl, so we can also tune that path to be more lightweight.
> >
> > In a synthetic guest mapping test emulating a 1TB VM backed by a
> > single 4GB range remapped multiple times across the address space,
> > the mdev changes to the type1 backend introduced a roughly 25% hit
> > in runtime of this test. These changes restore it to nearly the
> > previous performance for the interfaces exercised here,
> > VFIO_IOMMU_MAP_DMA and release on close.
> >
> > Signed-off-by: Alex Williamson <alex.williamson@xxxxxxxxxx>
> > ---
> > drivers/vfio/vfio_iommu_type1.c | 145 +++++++++++++++++++++------------------
> > 1 file changed, 79 insertions(+), 66 deletions(-)
> >
> > diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> > index 9815e45..8dfeafb 100644
> > --- a/drivers/vfio/vfio_iommu_type1.c
> > +++ b/drivers/vfio/vfio_iommu_type1.c
> > @@ -103,6 +103,10 @@ struct vfio_pfn {
> > #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \
> > (!list_empty(&iommu->domain_list))
> >
> > +/* Make function bool options readable */
> > +#define IS_CURRENT (true)
> > +#define DO_ACCOUNTING (true)
> > +
> > static int put_pfn(unsigned long pfn, int prot);
> >
> > /*
> > @@ -264,7 +268,8 @@ static void vfio_lock_acct_bg(struct work_struct *work)
> > kfree(vwork);
> > }
> >
> > -static void vfio_lock_acct(struct task_struct *task, long npage)
> > +static void vfio_lock_acct(struct task_struct *task,
> > + long npage, bool is_current)
> > {
> > struct vwork *vwork;
> > struct mm_struct *mm;
> > @@ -272,24 +277,31 @@ static void vfio_lock_acct(struct task_struct *task, long npage)
> > if (!npage)
> > return;
> >
> > - mm = get_task_mm(task);
> > + mm = is_current ? task->mm : get_task_mm(task);
> > if (!mm)
> > - return; /* process exited or nothing to do */
> > + return; /* process exited */
> >
> > if (down_write_trylock(&mm->mmap_sem)) {
> > mm->locked_vm += npage;
> > up_write(&mm->mmap_sem);
> > - mmput(mm);
> > + if (!is_current)
> > + mmput(mm);
> > return;
> > }
> >
> > + if (is_current) {
> > + mm = get_task_mm(task);
> > + if (!mm)
> > + return;
> > + }
> > +
> > /*
> > * Couldn't get mmap_sem lock, so must setup to update
> > * mm->locked_vm later. If locked_vm were atomic, we
> > * wouldn't need this silliness
> > */
> > vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL);
> > - if (!vwork) {
> > + if (WARN_ON(!vwork)) {
> > mmput(mm);
> > return;
> > }
> > @@ -345,13 +357,13 @@ static int put_pfn(unsigned long pfn, int prot)
> > }
> >
> > static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
> > - int prot, unsigned long *pfn)
> > + int prot, unsigned long *pfn, bool is_current)
> > {
> > struct page *page[1];
> > struct vm_area_struct *vma;
> > int ret;
> >
> > - if (mm == current->mm) {
> > + if (is_current) {
>
> With this change, if vfio_pin_page_external() gets called from QEMU
> process context, for example in response to some BAR0 register access,
> it will still fallback to slow path, get_user_pages_remote(). We don't
> have to change this function. This path already takes care of taking
> best possible path.
>
> That also makes me think, vfio_pin_page_external() uses task structure
> to get mlock limit and capability. Expectation is mdev vendor driver
> shouldn't pin all system memory, but if any mdev driver does that, then
> that driver might see such performance impact. Should we optimize this
> path if (dma->task == current)?

Hi Kirti,

I was actually trying to avoid the (task == current) test with this
change because I wasn't sure how reliable it is. Is there a
possibility that this test generates a false positive if current
coincidentally matches our task and does that allow us the same
opportunities for making use of current that we have when we know in a
process context execution path? The above change makes this a more
direct association. Can you show that inferring the process context is
correct? Thanks,

Alex

> > ret = get_user_pages_fast(vaddr, 1, !!(prot & IOMMU_WRITE),
> > page);
> > } else {
> > @@ -393,96 +405,92 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
> > static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
> > long npage, unsigned long *pfn_base)
> > {
> > - unsigned long limit;
> > - bool lock_cap = ns_capable(task_active_pid_ns(dma->task)->user_ns,
> > - CAP_IPC_LOCK);
> > - struct mm_struct *mm;
> > - long ret, i = 0, lock_acct = 0;
> > + unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> > + bool lock_cap = capable(CAP_IPC_LOCK);
> > + long ret, pinned = 0, lock_acct = 0;
> > bool rsvd;
> > dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
> >
> > - mm = get_task_mm(dma->task);
> > - if (!mm)
> > + /* This code path is only user initiated */
> > + if (!current->mm)
> > return -ENODEV;
> >
> > - ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base);
> > + ret = vaddr_get_pfn(current->mm, vaddr,
> > + dma->prot, pfn_base, IS_CURRENT);
> > if (ret)
> > - goto pin_pg_remote_exit;
> > + return ret;
> >
> > + pinned++;
> > rsvd = is_invalid_reserved_pfn(*pfn_base);
> > - limit = task_rlimit(dma->task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
> >
> > /*
> > * Reserved pages aren't counted against the user, externally pinned
> > * pages are already counted against the user.
> > */
> > if (!rsvd && !vfio_find_vpfn(dma, iova)) {
> > - if (!lock_cap && mm->locked_vm + 1 > limit) {
> > + if (!lock_cap && current->mm->locked_vm + 1 > limit) {
> > put_pfn(*pfn_base, dma->prot);
> > pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
> > limit << PAGE_SHIFT);
> > - ret = -ENOMEM;
> > - goto pin_pg_remote_exit;
> > + return -ENOMEM;
> > }
> > lock_acct++;
> > }
> >
> > - i++;
> > - if (likely(!disable_hugepages)) {
> > - /* Lock all the consecutive pages from pfn_base */
> > - for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; i < npage;
> > - i++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
> > - unsigned long pfn = 0;
> > + if (unlikely(disable_hugepages))
> > + goto out;
> >
> > - ret = vaddr_get_pfn(mm, vaddr, dma->prot, &pfn);
> > - if (ret)
> > - break;
> > + /* Lock all the consecutive pages from pfn_base */
> > + for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage;
> > + pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
> > + unsigned long pfn = 0;
> >
> > - if (pfn != *pfn_base + i ||
> > - rsvd != is_invalid_reserved_pfn(pfn)) {
> > + ret = vaddr_get_pfn(current->mm, vaddr,
> > + dma->prot, &pfn, IS_CURRENT);
> > + if (ret)
> > + break;
> > +
> > + if (pfn != *pfn_base + pinned ||
> > + rsvd != is_invalid_reserved_pfn(pfn)) {
> > + put_pfn(pfn, dma->prot);
> > + break;
> > + }
> > +
> > + if (!rsvd && !vfio_find_vpfn(dma, iova)) {
> > + if (!lock_cap &&
> > + current->mm->locked_vm + lock_acct + 1 > limit) {
> > put_pfn(pfn, dma->prot);
> > + pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
> > + __func__, limit << PAGE_SHIFT);
> > break;
> > }
> > -
> > - if (!rsvd && !vfio_find_vpfn(dma, iova)) {
> > - if (!lock_cap &&
> > - mm->locked_vm + lock_acct + 1 > limit) {
> > - put_pfn(pfn, dma->prot);
> > - pr_warn("%s: RLIMIT_MEMLOCK (%ld) "
> > - "exceeded\n", __func__,
> > - limit << PAGE_SHIFT);
> > - break;
> > - }
> > - lock_acct++;
> > - }
> > + lock_acct++;
> > }
> > }
> >
> > - vfio_lock_acct(dma->task, lock_acct);
> > - ret = i;
> > +out:
> > + vfio_lock_acct(current, lock_acct, IS_CURRENT);
> >
> > -pin_pg_remote_exit:
> > - mmput(mm);
> > - return ret;
> > + return pinned;
> > }
> >
> > static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
> > unsigned long pfn, long npage,
> > - bool do_accounting)
> > + bool do_accounting, bool is_current)
> > {
> > long unlocked = 0, locked = 0;
> > long i;
> >
> > - for (i = 0; i < npage; i++) {
> > + for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
> > if (put_pfn(pfn++, dma->prot)) {
> > unlocked++;
> > - if (vfio_find_vpfn(dma, iova + (i << PAGE_SHIFT)))
> > + if (vfio_find_vpfn(dma, iova))
> > locked++;
> > }
> > }
> >
> > if (do_accounting)
> > - vfio_lock_acct(dma->task, locked - unlocked);
> > + vfio_lock_acct(dma->task, locked - unlocked, is_current);
> >
> > return unlocked;
> > }
> > @@ -501,7 +509,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
> > if (!mm)
> > return -ENODEV;
> >
> > - ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base);
> > + ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base, !IS_CURRENT);
> > if (ret)
> > goto pin_page_exit;
> >
> > @@ -518,7 +526,7 @@ static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
> > }
> >
> > if (!rsvd && do_accounting)
> > - vfio_lock_acct(dma->task, 1);
> > + vfio_lock_acct(dma->task, 1, !IS_CURRENT);
> > ret = 1;
> >
> > pin_page_exit:
> > @@ -538,7 +546,7 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
> > unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
> >
> > if (do_accounting)
> > - vfio_lock_acct(dma->task, -unlocked);
> > + vfio_lock_acct(dma->task, -unlocked, !IS_CURRENT);
> >
> > return unlocked;
> > }
> > @@ -671,7 +679,7 @@ static int vfio_iommu_type1_unpin_pages(void *iommu_data,
> > }
> >
> > static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
> > - bool do_accounting)
> > + bool do_accounting, bool is_current)
> > {
> > dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
> > struct vfio_domain *domain, *d;
> > @@ -727,7 +735,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
> > unlocked += vfio_unpin_pages_remote(dma, iova,
> > phys >> PAGE_SHIFT,
> > unmapped >> PAGE_SHIFT,
> > - false);
> > + !DO_ACCOUNTING, is_current);
> > iova += unmapped;
> >
> > cond_resched();
> > @@ -735,15 +743,16 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
> >
> > dma->iommu_mapped = false;
> > if (do_accounting) {
> > - vfio_lock_acct(dma->task, -unlocked);
> > + vfio_lock_acct(dma->task, -unlocked, is_current);
> > return 0;
> > }
> > return unlocked;
> > }
> >
> > -static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
> > +static void vfio_remove_dma(struct vfio_iommu *iommu,
> > + struct vfio_dma *dma, bool is_current)
> > {
> > - vfio_unmap_unpin(iommu, dma, true);
> > + vfio_unmap_unpin(iommu, dma, DO_ACCOUNTING, is_current);
> > vfio_unlink_dma(iommu, dma);
> > put_task_struct(dma->task);
> > kfree(dma);
> > @@ -874,7 +883,7 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
> > goto again;
> > }
> > unmapped += dma->size;
> > - vfio_remove_dma(iommu, dma);
> > + vfio_remove_dma(iommu, dma, IS_CURRENT);
> > }
> >
> > unlock:
> > @@ -964,7 +973,8 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
> > dma->prot);
> > if (ret) {
> > vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
> > - npage, true);
> > + npage, DO_ACCOUNTING,
> > + IS_CURRENT);
> > break;
> > }
> >
> > @@ -975,7 +985,7 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
> > dma->iommu_mapped = true;
> >
> > if (ret)
> > - vfio_remove_dma(iommu, dma);
> > + vfio_remove_dma(iommu, dma, IS_CURRENT);
> >
> > return ret;
> > }
> > @@ -1322,7 +1332,9 @@ static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
> > struct rb_node *node;
> >
> > while ((node = rb_first(&iommu->dma_list)))
> > - vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
> > + vfio_remove_dma(iommu,
> > + rb_entry(node, struct vfio_dma, node),
> > + !IS_CURRENT);
> > }
> >
> > static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
> > @@ -1335,7 +1347,8 @@ static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
> > long locked = 0, unlocked = 0;
> >
> > dma = rb_entry(n, struct vfio_dma, node);
> > - unlocked += vfio_unmap_unpin(iommu, dma, false);
> > + unlocked += vfio_unmap_unpin(iommu, dma,
> > + !DO_ACCOUNTING, !IS_CURRENT);
> > p = rb_first(&dma->pfn_list);
> > for (; p; p = rb_next(p)) {
> > struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
> > @@ -1344,7 +1357,7 @@ static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
> > if (!is_invalid_reserved_pfn(vpfn->pfn))
> > locked++;
> > }
> > - vfio_lock_acct(dma->task, locked - unlocked);
> > + vfio_lock_acct(dma->task, locked - unlocked, !IS_CURRENT);
> > }
> > }
> >
> >