Re: [PATCH v1 4/4] staging: media: tegra-vde: Defer dmabuf's unmapping

From: Hans Verkuil
Date: Mon Jun 17 2019 - 09:38:55 EST


On 6/2/19 11:37 PM, Dmitry Osipenko wrote:
> Frequent IOMMU remappings take about 50% of CPU usage because there is
> quite a lot to remap. Defer dmabuf's unmapping by 5 seconds in order to
> mitigate the mapping overhead which goes away completely and driver works
> as fast as in a case of a disabled IOMMU. The case of a disabled IOMMU
> should also benefit a tad from the caching since CPU cache maintenance
> that happens on dmabuf's attaching takes some resources.
>
> Signed-off-by: Dmitry Osipenko <digetx@xxxxxxxxx>
> ---
> drivers/staging/media/tegra-vde/Makefile | 2 +-
> .../staging/media/tegra-vde/dmabuf-cache.c | 223 ++++++++++++++++++
> drivers/staging/media/tegra-vde/iommu.c | 2 -
> drivers/staging/media/tegra-vde/vde.c | 143 +++--------
> drivers/staging/media/tegra-vde/vde.h | 18 +-
> 5 files changed, 273 insertions(+), 115 deletions(-)
> create mode 100644 drivers/staging/media/tegra-vde/dmabuf-cache.c
>
> diff --git a/drivers/staging/media/tegra-vde/Makefile b/drivers/staging/media/tegra-vde/Makefile
> index c11867e28233..2827f7601de8 100644
> --- a/drivers/staging/media/tegra-vde/Makefile
> +++ b/drivers/staging/media/tegra-vde/Makefile
> @@ -1,3 +1,3 @@
> # SPDX-License-Identifier: GPL-2.0
> -tegra-vde-y := vde.o iommu.o
> +tegra-vde-y := vde.o iommu.o dmabuf-cache.o
> obj-$(CONFIG_TEGRA_VDE) += tegra-vde.o
> diff --git a/drivers/staging/media/tegra-vde/dmabuf-cache.c b/drivers/staging/media/tegra-vde/dmabuf-cache.c
> new file mode 100644
> index 000000000000..fcde8d1c37e7
> --- /dev/null
> +++ b/drivers/staging/media/tegra-vde/dmabuf-cache.c
> @@ -0,0 +1,223 @@
> +// SPDX-License-Identifier: GPL-2.0+
> +/*
> + * NVIDIA Tegra Video decoder driver
> + *
> + * Copyright (C) 2016-2019 GRATE-DRIVER project
> + */
> +
> +#include <linux/dma-buf.h>
> +#include <linux/iova.h>
> +#include <linux/kernel.h>
> +#include <linux/list.h>
> +#include <linux/sched.h>
> +#include <linux/slab.h>
> +#include <linux/workqueue.h>
> +
> +#include "vde.h"
> +
> +struct tegra_vde_cache_entry {
> + enum dma_data_direction dma_dir;
> + struct dma_buf_attachment *a;
> + struct delayed_work dwork;
> + struct tegra_vde *vde;
> + struct list_head list;
> + struct sg_table *sgt;
> + struct iova *iova;
> + unsigned int refcnt;
> +};
> +
> +static void tegra_vde_release_entry(struct tegra_vde_cache_entry *entry)
> +{
> + struct dma_buf *dmabuf = entry->a->dmabuf;
> +
> + WARN_ON_ONCE(entry->refcnt);
> +
> + if (entry->vde->domain)
> + tegra_vde_iommu_unmap(entry->vde, entry->iova);
> +
> + dma_buf_unmap_attachment(entry->a, entry->sgt, entry->dma_dir);
> + dma_buf_detach(dmabuf, entry->a);
> + dma_buf_put(dmabuf);
> +
> + list_del(&entry->list);
> + kfree(entry);
> +}
> +
> +static void tegra_vde_delayed_unmap(struct work_struct *work)
> +{
> + struct tegra_vde_cache_entry *entry;
> +
> + entry = container_of(work, struct tegra_vde_cache_entry,
> + dwork.work);
> +
> + mutex_lock(&entry->vde->map_lock);
> + tegra_vde_release_entry(entry);
> + mutex_unlock(&entry->vde->map_lock);

>From smatch:

drivers/staging/media/tegra-vde/dmabuf-cache.c:55 tegra_vde_delayed_unmap() error: dereferencing freed memory 'entry'

> +}
> +
> +int tegra_vde_dmabuf_cache_map(struct tegra_vde *vde,
> + struct dma_buf *dmabuf,
> + enum dma_data_direction dma_dir,
> + struct dma_buf_attachment **ap,
> + dma_addr_t *addrp)
> +{
> + struct device *dev = vde->miscdev.parent;
> + struct tegra_vde_cache_entry *entry;
> + struct dma_buf_attachment *attachment;
> + struct sg_table *sgt;
> + struct iova *iova;
> + int err;
> +
> + mutex_lock(&vde->map_lock);
> +
> + list_for_each_entry(entry, &vde->map_list, list) {
> + if (entry->a->dmabuf != dmabuf)
> + continue;
> +
> + if (!cancel_delayed_work(&entry->dwork))
> + continue;
> +
> + if (entry->dma_dir != dma_dir)
> + entry->dma_dir = DMA_BIDIRECTIONAL;
> +
> + dma_buf_put(dmabuf);
> +
> + if (vde->domain)
> + *addrp = iova_dma_addr(&vde->iova, entry->iova);
> + else
> + *addrp = sg_dma_address(entry->sgt->sgl);
> +
> + goto ref;
> + }
> +
> + attachment = dma_buf_attach(dmabuf, dev);
> + if (IS_ERR(attachment)) {
> + dev_err(dev, "Failed to attach dmabuf\n");
> + err = PTR_ERR(attachment);
> + goto err_unlock;
> + }
> +
> + sgt = dma_buf_map_attachment(attachment, dma_dir);
> + if (IS_ERR(sgt)) {
> + dev_err(dev, "Failed to get dmabufs sg_table\n");
> + err = PTR_ERR(sgt);
> + goto err_detach;
> + }
> +
> + if (!vde->domain && sgt->nents > 1) {
> + dev_err(dev, "Sparse DMA region is unsupported, please enable IOMMU\n");
> + err = -EINVAL;
> + goto err_unmap;
> + }
> +
> + entry = kzalloc(sizeof(*entry), GFP_KERNEL);
> + if (!entry) {
> + err = -ENOMEM;
> + goto err_unmap;
> + }
> +
> + if (vde->domain) {
> + err = tegra_vde_iommu_map(vde, sgt, &iova, dmabuf->size);
> + if (err)
> + goto err_free;
> +
> + *addrp = iova_dma_addr(&vde->iova, iova);
> + } else {
> + *addrp = sg_dma_address(sgt->sgl);
> + }
> +
> + INIT_DELAYED_WORK(&entry->dwork, tegra_vde_delayed_unmap);
> + list_add(&entry->list, &vde->map_list);
> +
> + entry->dma_dir = dma_dir;
> + entry->iova = iova;

>From smatch:

drivers/staging/media/tegra-vde/dmabuf-cache.c:133 tegra_vde_dmabuf_cache_map() error: uninitialized symbol 'iova'.

Regards,

Hans

> + entry->vde = vde;
> + entry->sgt = sgt;
> + entry->a = attachment;
> +ref:
> + entry->refcnt++;
> +
> + *ap = entry->a;
> +
> + mutex_unlock(&vde->map_lock);
> +
> + return 0;
> +
> +err_free:
> + kfree(entry);
> +err_unmap:
> + dma_buf_unmap_attachment(attachment, sgt, dma_dir);
> +err_detach:
> + dma_buf_detach(dmabuf, attachment);
> +err_unlock:
> + mutex_unlock(&vde->map_lock);
> +
> + return err;
> +}
> +
> +void tegra_vde_dmabuf_cache_unmap(struct tegra_vde *vde,
> + struct dma_buf_attachment *a,
> + bool release)
> +{
> + struct tegra_vde_cache_entry *entry;
> +
> + mutex_lock(&vde->map_lock);
> +
> + list_for_each_entry(entry, &vde->map_list, list) {
> + if (entry->a != a)
> + continue;
> +
> + WARN_ON_ONCE(!entry->refcnt);
> +
> + if (--entry->refcnt == 0) {
> + if (release)
> + tegra_vde_release_entry(entry);
> + else
> + schedule_delayed_work(&entry->dwork, 5 * HZ);
> + }
> + break;
> + }
> +
> + mutex_unlock(&vde->map_lock);
> +}
> +
> +void tegra_vde_dmabuf_cache_unmap_sync(struct tegra_vde *vde)
> +{
> + struct tegra_vde_cache_entry *entry, *tmp;
> +
> + mutex_lock(&vde->map_lock);
> +
> + list_for_each_entry_safe(entry, tmp, &vde->map_list, list) {
> + if (entry->refcnt)
> + continue;
> +
> + if (!cancel_delayed_work(&entry->dwork))
> + continue;
> +
> + tegra_vde_release_entry(entry);
> + }
> +
> + mutex_unlock(&vde->map_lock);
> +}
> +
> +void tegra_vde_dmabuf_cache_unmap_all(struct tegra_vde *vde)
> +{
> + struct tegra_vde_cache_entry *entry, *tmp;
> +
> + mutex_lock(&vde->map_lock);
> +
> + do {
> + list_for_each_entry_safe(entry, tmp, &vde->map_list, list) {
> + if (!cancel_delayed_work(&entry->dwork))
> + continue;
> +
> + tegra_vde_release_entry(entry);
> + }
> +
> + mutex_unlock(&vde->map_lock);
> + schedule();
> + mutex_lock(&vde->map_lock);
> + } while (!list_empty(&vde->map_list));
> +
> + mutex_unlock(&vde->map_lock);
> +}
> diff --git a/drivers/staging/media/tegra-vde/iommu.c b/drivers/staging/media/tegra-vde/iommu.c
> index 295c3d7cccd3..6d635332e0ec 100644
> --- a/drivers/staging/media/tegra-vde/iommu.c
> +++ b/drivers/staging/media/tegra-vde/iommu.c
> @@ -19,7 +19,6 @@
> int tegra_vde_iommu_map(struct tegra_vde *vde,
> struct sg_table *sgt,
> struct iova **iovap,
> - dma_addr_t *addrp,
> size_t size)
> {
> struct iova *iova;
> @@ -45,7 +44,6 @@ int tegra_vde_iommu_map(struct tegra_vde *vde,
> }
>
> *iovap = iova;
> - *addrp = addr;
>
> return 0;
> }
> diff --git a/drivers/staging/media/tegra-vde/vde.c b/drivers/staging/media/tegra-vde/vde.c
> index cbcdbfef072d..3466daddf663 100644
> --- a/drivers/staging/media/tegra-vde/vde.c
> +++ b/drivers/staging/media/tegra-vde/vde.c
> @@ -11,6 +11,7 @@
> #include <linux/genalloc.h>
> #include <linux/interrupt.h>
> #include <linux/iopoll.h>
> +#include <linux/list.h>
> #include <linux/miscdevice.h>
> #include <linux/module.h>
> #include <linux/of_device.h>
> @@ -37,18 +38,10 @@
> #define BSE_DMA_BUSY BIT(23)
>
> struct video_frame {
> - struct iova *y_iova;
> - struct iova *cb_iova;
> - struct iova *cr_iova;
> - struct iova *aux_iova;
> struct dma_buf_attachment *y_dmabuf_attachment;
> struct dma_buf_attachment *cb_dmabuf_attachment;
> struct dma_buf_attachment *cr_dmabuf_attachment;
> struct dma_buf_attachment *aux_dmabuf_attachment;
> - struct sg_table *y_sgt;
> - struct sg_table *cb_sgt;
> - struct sg_table *cr_sgt;
> - struct sg_table *aux_sgt;
> dma_addr_t y_addr;
> dma_addr_t cb_addr;
> dma_addr_t cr_addr;
> @@ -494,22 +487,6 @@ static void tegra_vde_decode_frame(struct tegra_vde *vde,
> vde->sxe, 0x00);
> }
>
> -static void tegra_vde_detach_and_put_dmabuf(struct tegra_vde *vde,
> - enum dma_data_direction dma_dir,
> - struct dma_buf_attachment *a,
> - struct sg_table *sgt,
> - struct iova *iova)
> -{
> - struct dma_buf *dmabuf = a->dmabuf;
> -
> - if (vde->domain)
> - tegra_vde_iommu_unmap(vde, iova);
> -
> - dma_buf_unmap_attachment(a, sgt, dma_dir);
> - dma_buf_detach(dmabuf, a);
> - dma_buf_put(dmabuf);
> -}
> -
> static int tegra_vde_attach_dmabuf(struct tegra_vde *vde,
> int fd,
> unsigned long offset,
> @@ -517,15 +494,11 @@ static int tegra_vde_attach_dmabuf(struct tegra_vde *vde,
> size_t align_size,
> struct dma_buf_attachment **a,
> dma_addr_t *addrp,
> - struct sg_table **s,
> - struct iova **iovap,
> size_t *size,
> enum dma_data_direction dma_dir)
> {
> struct device *dev = vde->miscdev.parent;
> - struct dma_buf_attachment *attachment;
> struct dma_buf *dmabuf;
> - struct sg_table *sgt;
> int err;
>
> dmabuf = dma_buf_get(fd);
> @@ -546,49 +519,17 @@ static int tegra_vde_attach_dmabuf(struct tegra_vde *vde,
> return -EINVAL;
> }
>
> - attachment = dma_buf_attach(dmabuf, dev);
> - if (IS_ERR(attachment)) {
> - dev_err(dev, "Failed to attach dmabuf\n");
> - err = PTR_ERR(attachment);
> + err = tegra_vde_dmabuf_cache_map(vde, dmabuf, dma_dir, a, addrp);
> + if (err)
> goto err_put;
> - }
> -
> - sgt = dma_buf_map_attachment(attachment, dma_dir);
> - if (IS_ERR(sgt)) {
> - dev_err(dev, "Failed to get dmabufs sg_table\n");
> - err = PTR_ERR(sgt);
> - goto err_detach;
> - }
> -
> - if (!vde->domain && sgt->nents > 1) {
> - dev_err(dev, "Sparse DMA region is unsupported, please enable IOMMU\n");
> - err = -EINVAL;
> - goto err_unmap;
> - }
> -
> - if (vde->domain) {
> - err = tegra_vde_iommu_map(vde, sgt, iovap, addrp, dmabuf->size);
> - if (err) {
> - dev_err(dev, "IOMMU mapping failed: %d\n", err);
> - goto err_unmap;
> - }
> - } else {
> - *addrp = sg_dma_address(sgt->sgl);
> - }
>
> *addrp = *addrp + offset;
> - *a = attachment;
> - *s = sgt;
>
> if (size)
> *size = dmabuf->size - offset;
>
> return 0;
>
> -err_unmap:
> - dma_buf_unmap_attachment(attachment, sgt, dma_dir);
> -err_detach:
> - dma_buf_detach(dmabuf, attachment);
> err_put:
> dma_buf_put(dmabuf);
>
> @@ -608,8 +549,6 @@ static int tegra_vde_attach_dmabufs_to_frame(struct tegra_vde *vde,
> src->y_offset, lsize, SZ_256,
> &frame->y_dmabuf_attachment,
> &frame->y_addr,
> - &frame->y_sgt,
> - &frame->y_iova,
> NULL, dma_dir);
> if (err)
> return err;
> @@ -618,8 +557,6 @@ static int tegra_vde_attach_dmabufs_to_frame(struct tegra_vde *vde,
> src->cb_offset, csize, SZ_256,
> &frame->cb_dmabuf_attachment,
> &frame->cb_addr,
> - &frame->cb_sgt,
> - &frame->cb_iova,
> NULL, dma_dir);
> if (err)
> goto err_release_y;
> @@ -628,8 +565,6 @@ static int tegra_vde_attach_dmabufs_to_frame(struct tegra_vde *vde,
> src->cr_offset, csize, SZ_256,
> &frame->cr_dmabuf_attachment,
> &frame->cr_addr,
> - &frame->cr_sgt,
> - &frame->cr_iova,
> NULL, dma_dir);
> if (err)
> goto err_release_cb;
> @@ -643,8 +578,6 @@ static int tegra_vde_attach_dmabufs_to_frame(struct tegra_vde *vde,
> src->aux_offset, csize, SZ_256,
> &frame->aux_dmabuf_attachment,
> &frame->aux_addr,
> - &frame->aux_sgt,
> - &frame->aux_iova,
> NULL, dma_dir);
> if (err)
> goto err_release_cr;
> @@ -652,20 +585,11 @@ static int tegra_vde_attach_dmabufs_to_frame(struct tegra_vde *vde,
> return 0;
>
> err_release_cr:
> - tegra_vde_detach_and_put_dmabuf(vde, dma_dir,
> - frame->cr_dmabuf_attachment,
> - frame->cr_sgt,
> - frame->cr_iova);
> + tegra_vde_dmabuf_cache_unmap(vde, frame->cr_dmabuf_attachment, true);
> err_release_cb:
> - tegra_vde_detach_and_put_dmabuf(vde, dma_dir,
> - frame->cb_dmabuf_attachment,
> - frame->cb_sgt,
> - frame->cb_iova);
> + tegra_vde_dmabuf_cache_unmap(vde, frame->cb_dmabuf_attachment, true);
> err_release_y:
> - tegra_vde_detach_and_put_dmabuf(vde, dma_dir,
> - frame->y_dmabuf_attachment,
> - frame->y_sgt,
> - frame->y_iova);
> + tegra_vde_dmabuf_cache_unmap(vde, frame->y_dmabuf_attachment, true);
>
> return err;
> }
> @@ -673,28 +597,16 @@ static int tegra_vde_attach_dmabufs_to_frame(struct tegra_vde *vde,
> static void tegra_vde_release_frame_dmabufs(struct tegra_vde *vde,
> struct video_frame *frame,
> enum dma_data_direction dma_dir,
> - bool baseline_profile)
> + bool baseline_profile,
> + bool release)
> {
> if (!baseline_profile)
> - tegra_vde_detach_and_put_dmabuf(vde, dma_dir,
> - frame->aux_dmabuf_attachment,
> - frame->aux_sgt,
> - frame->aux_iova);
> -
> - tegra_vde_detach_and_put_dmabuf(vde, dma_dir,
> - frame->cr_dmabuf_attachment,
> - frame->cr_sgt,
> - frame->cr_iova);
> -
> - tegra_vde_detach_and_put_dmabuf(vde, dma_dir,
> - frame->cb_dmabuf_attachment,
> - frame->cb_sgt,
> - frame->cb_iova);
> -
> - tegra_vde_detach_and_put_dmabuf(vde, dma_dir,
> - frame->y_dmabuf_attachment,
> - frame->y_sgt,
> - frame->y_iova);
> + tegra_vde_dmabuf_cache_unmap(vde, frame->aux_dmabuf_attachment,
> + release);
> +
> + tegra_vde_dmabuf_cache_unmap(vde, frame->cr_dmabuf_attachment, release);
> + tegra_vde_dmabuf_cache_unmap(vde, frame->cb_dmabuf_attachment, release);
> + tegra_vde_dmabuf_cache_unmap(vde, frame->y_dmabuf_attachment, release);
> }
>
> static int tegra_vde_validate_frame(struct device *dev,
> @@ -786,8 +698,6 @@ static int tegra_vde_ioctl_decode_h264(struct tegra_vde *vde,
> struct tegra_vde_h264_frame __user *frames_user;
> struct video_frame *dpb_frames;
> struct dma_buf_attachment *bitstream_data_dmabuf_attachment;
> - struct sg_table *bitstream_sgt;
> - struct iova *bitstream_iova;
> enum dma_data_direction dma_dir;
> dma_addr_t bitstream_data_addr;
> dma_addr_t bsev_ptr;
> @@ -812,8 +722,6 @@ static int tegra_vde_ioctl_decode_h264(struct tegra_vde *vde,
> SZ_16K, SZ_16K,
> &bitstream_data_dmabuf_attachment,
> &bitstream_data_addr,
> - &bitstream_sgt,
> - &bitstream_iova,
> &bitstream_data_size,
> DMA_TO_DEVICE);
> if (ret)
> @@ -944,7 +852,7 @@ static int tegra_vde_ioctl_decode_h264(struct tegra_vde *vde,
> dma_dir = (i == 0) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
>
> tegra_vde_release_frame_dmabufs(vde, &dpb_frames[i], dma_dir,
> - ctx.baseline_profile);
> + ctx.baseline_profile, ret != 0);
> }
>
> free_dpb_frames:
> @@ -954,10 +862,8 @@ static int tegra_vde_ioctl_decode_h264(struct tegra_vde *vde,
> kfree(frames);
>
> release_bitstream_dmabuf:
> - tegra_vde_detach_and_put_dmabuf(vde, DMA_TO_DEVICE,
> - bitstream_data_dmabuf_attachment,
> - bitstream_sgt,
> - bitstream_iova);
> + tegra_vde_dmabuf_cache_unmap(vde, bitstream_data_dmabuf_attachment,
> + ret != 0);
>
> return ret;
> }
> @@ -979,9 +885,21 @@ static long tegra_vde_unlocked_ioctl(struct file *filp,
> return -ENOTTY;
> }
>
> +static int tegra_vde_release_file(struct inode *inode, struct file *filp)
> +{
> + struct miscdevice *miscdev = filp->private_data;
> + struct tegra_vde *vde = container_of(miscdev, struct tegra_vde,
> + miscdev);
> +
> + tegra_vde_dmabuf_cache_unmap_sync(vde);
> +
> + return 0;
> +}
> +
> static const struct file_operations tegra_vde_fops = {
> .owner = THIS_MODULE,
> .unlocked_ioctl = tegra_vde_unlocked_ioctl,
> + .release = tegra_vde_release_file,
> };
>
> static irqreturn_t tegra_vde_isr(int irq, void *data)
> @@ -1159,6 +1077,8 @@ static int tegra_vde_probe(struct platform_device *pdev)
> return -ENOMEM;
> }
>
> + INIT_LIST_HEAD(&vde->map_list);
> + mutex_init(&vde->map_lock);
> mutex_init(&vde->lock);
> init_completion(&vde->decode_completion);
>
> @@ -1221,6 +1141,7 @@ static int tegra_vde_remove(struct platform_device *pdev)
>
> misc_deregister(&vde->miscdev);
>
> + tegra_vde_dmabuf_cache_unmap_all(vde);
> tegra_vde_iommu_deinit(vde);
>
> gen_pool_free(vde->iram_pool, (unsigned long)vde->iram,
> diff --git a/drivers/staging/media/tegra-vde/vde.h b/drivers/staging/media/tegra-vde/vde.h
> index 37414b7fdee1..e138878e8e14 100644
> --- a/drivers/staging/media/tegra-vde/vde.h
> +++ b/drivers/staging/media/tegra-vde/vde.h
> @@ -9,16 +9,20 @@
> #define TEGRA_VDE_H
>
> #include <linux/completion.h>
> +#include <linux/dma-direction.h>
> +#include <linux/list.h>
> #include <linux/miscdevice.h>
> #include <linux/mutex.h>
> #include <linux/types.h>
> #include <linux/iova.h>
>
> struct clk;
> +struct dma_buf;
> struct gen_pool;
> struct iommu_group;
> struct iommu_domain;
> struct reset_control;
> +struct dma_buf_attachment;
>
> struct tegra_vde {
> void __iomem *sxe;
> @@ -31,6 +35,8 @@ struct tegra_vde {
> void __iomem *vdma;
> void __iomem *frameid;
> struct mutex lock;
> + struct mutex map_lock;
> + struct list_head map_list;
> struct miscdevice miscdev;
> struct reset_control *rst;
> struct reset_control *rst_mc;
> @@ -49,10 +55,20 @@ void tegra_vde_iommu_deinit(struct tegra_vde *vde);
> int tegra_vde_iommu_map(struct tegra_vde *vde,
> struct sg_table *sgt,
> struct iova **iovap,
> - dma_addr_t *addrp,
> size_t size);
> void tegra_vde_iommu_unmap(struct tegra_vde *vde, struct iova *iova);
>
> +int tegra_vde_dmabuf_cache_map(struct tegra_vde *vde,
> + struct dma_buf *dmabuf,
> + enum dma_data_direction dma_dir,
> + struct dma_buf_attachment **ap,
> + dma_addr_t *addrp);
> +void tegra_vde_dmabuf_cache_unmap(struct tegra_vde *vde,
> + struct dma_buf_attachment *a,
> + bool release);
> +void tegra_vde_dmabuf_cache_unmap_sync(struct tegra_vde *vde);
> +void tegra_vde_dmabuf_cache_unmap_all(struct tegra_vde *vde);
> +
> static __maybe_unused char const *
> tegra_vde_reg_base_name(struct tegra_vde *vde, void __iomem *base)
> {
>