Re: [PATCH 11/15] iio: buffer-dma: Boost performance using write-combine cache setting
From: Jonathan Cameron
Date: Sun Nov 21 2021 - 09:55:53 EST
On Mon, 15 Nov 2021 14:19:21 +0000
Paul Cercueil <paul@xxxxxxxxxxxxxxx> wrote:
> We can be certain that the input buffers will only be accessed by
> userspace for reading, and output buffers will mostly be accessed by
> userspace for writing.
Mostly? Perhaps a little more info on why that's not 'only'.
>
> Therefore, it makes more sense to use only fully cached input buffers,
> and to use the write-combine cache coherency setting for output buffers.
>
> This boosts performance, as the data written to the output buffers does
> not have to be sync'd for coherency. It will halve performance if the
> userspace application tries to read from the output buffer, but this
> should never happen.
>
> Since we don't need to sync the cache when disabling CPU access either
> for input buffers or output buffers, the .end_cpu_access() callback can
> be dropped completely.
We have an odd mix of coherent and non coherent DMA in here as you noted,
but are you sure this is safe on all platforms?
>
> Signed-off-by: Paul Cercueil <paul@xxxxxxxxxxxxxxx>
Any numbers to support this patch? The mapping types are performance
optimisations so nice to know how much of a difference they make.
> ---
> drivers/iio/buffer/industrialio-buffer-dma.c | 82 +++++++++++++-------
> 1 file changed, 54 insertions(+), 28 deletions(-)
>
> diff --git a/drivers/iio/buffer/industrialio-buffer-dma.c b/drivers/iio/buffer/industrialio-buffer-dma.c
> index 92356ee02f30..fb39054d8c15 100644
> --- a/drivers/iio/buffer/industrialio-buffer-dma.c
> +++ b/drivers/iio/buffer/industrialio-buffer-dma.c
> @@ -229,8 +229,33 @@ static int iio_buffer_dma_buf_mmap(struct dma_buf *dbuf,
> if (vma->vm_ops->open)
> vma->vm_ops->open(vma);
>
> - return dma_mmap_pages(dev, vma, vma->vm_end - vma->vm_start,
> - virt_to_page(block->vaddr));
> + if (block->queue->buffer.direction == IIO_BUFFER_DIRECTION_IN) {
> + /*
> + * With an input buffer, userspace will only read the data and
> + * never write. We can mmap the buffer fully cached.
> + */
> + return dma_mmap_pages(dev, vma, vma->vm_end - vma->vm_start,
> + virt_to_page(block->vaddr));
> + } else {
> + /*
> + * With an output buffer, userspace will only write the data
> + * and should rarely (if never) read from it. It is better to
> + * use write-combine in this case.
> + */
> + return dma_mmap_wc(dev, vma, block->vaddr, block->phys_addr,
> + vma->vm_end - vma->vm_start);
> + }
> +}
> +
> +static void iio_dma_buffer_free_dmamem(struct iio_dma_buffer_block *block)
> +{
> + struct device *dev = block->queue->dev;
> + size_t size = PAGE_ALIGN(block->size);
> +
> + if (block->queue->buffer.direction == IIO_BUFFER_DIRECTION_IN)
> + dma_free_coherent(dev, size, block->vaddr, block->phys_addr);
> + else
> + dma_free_wc(dev, size, block->vaddr, block->phys_addr);
> }
>
> static void iio_buffer_dma_buf_release(struct dma_buf *dbuf)
> @@ -243,9 +268,7 @@ static void iio_buffer_dma_buf_release(struct dma_buf *dbuf)
>
> mutex_lock(&queue->lock);
>
> - dma_free_coherent(queue->dev, PAGE_ALIGN(block->size),
> - block->vaddr, block->phys_addr);
> -
> + iio_dma_buffer_free_dmamem(block);
> kfree(block);
>
> queue->num_blocks--;
> @@ -268,19 +291,6 @@ static int iio_buffer_dma_buf_begin_cpu_access(struct dma_buf *dbuf,
> return 0;
> }
>
> -static int iio_buffer_dma_buf_end_cpu_access(struct dma_buf *dbuf,
> - enum dma_data_direction dma_dir)
> -{
> - struct iio_dma_buffer_block *block = dbuf->priv;
> - struct device *dev = block->queue->dev;
> -
> - /* We only need to sync the cache for output buffers */
> - if (block->queue->buffer.direction == IIO_BUFFER_DIRECTION_OUT)
> - dma_sync_single_for_device(dev, block->phys_addr, block->size, dma_dir);
> -
> - return 0;
> -}
> -
> static const struct dma_buf_ops iio_dma_buffer_dmabuf_ops = {
> .attach = iio_buffer_dma_buf_attach,
> .map_dma_buf = iio_buffer_dma_buf_map,
> @@ -288,9 +298,28 @@ static const struct dma_buf_ops iio_dma_buffer_dmabuf_ops = {
> .mmap = iio_buffer_dma_buf_mmap,
> .release = iio_buffer_dma_buf_release,
> .begin_cpu_access = iio_buffer_dma_buf_begin_cpu_access,
> - .end_cpu_access = iio_buffer_dma_buf_end_cpu_access,
> };
>
> +static int iio_dma_buffer_alloc_dmamem(struct iio_dma_buffer_block *block)
> +{
> + struct device *dev = block->queue->dev;
> + size_t size = PAGE_ALIGN(block->size);
> +
> + if (block->queue->buffer.direction == IIO_BUFFER_DIRECTION_IN) {
> + block->vaddr = dma_alloc_coherent(dev, size,
> + &block->phys_addr,
> + GFP_KERNEL);
> + } else {
> + block->vaddr = dma_alloc_wc(dev, size,
> + &block->phys_addr,
> + GFP_KERNEL);
> + }
> + if (!block->vaddr)
> + return -ENOMEM;
> +
> + return 0;
> +}
> +
> static struct iio_dma_buffer_block *iio_dma_buffer_alloc_block(
> struct iio_dma_buffer_queue *queue, size_t size, bool fileio)
> {
> @@ -303,12 +332,12 @@ static struct iio_dma_buffer_block *iio_dma_buffer_alloc_block(
> if (!block)
> return ERR_PTR(-ENOMEM);
>
> - block->vaddr = dma_alloc_coherent(queue->dev, PAGE_ALIGN(size),
> - &block->phys_addr, GFP_KERNEL);
> - if (!block->vaddr) {
> - err = -ENOMEM;
> + block->size = size;
> + block->queue = queue;
> +
> + err = iio_dma_buffer_alloc_dmamem(block);
> + if (err)
> goto err_free_block;
> - }
>
> einfo.ops = &iio_dma_buffer_dmabuf_ops;
> einfo.size = PAGE_ALIGN(size);
> @@ -322,10 +351,8 @@ static struct iio_dma_buffer_block *iio_dma_buffer_alloc_block(
> }
>
> block->dmabuf = dmabuf;
> - block->size = size;
> block->bytes_used = size;
> block->state = IIO_BLOCK_STATE_DONE;
> - block->queue = queue;
> block->fileio = fileio;
> INIT_LIST_HEAD(&block->head);
>
> @@ -338,8 +365,7 @@ static struct iio_dma_buffer_block *iio_dma_buffer_alloc_block(
> return block;
>
> err_free_dma:
> - dma_free_coherent(queue->dev, PAGE_ALIGN(size),
> - block->vaddr, block->phys_addr);
> + iio_dma_buffer_free_dmamem(block);
> err_free_block:
> kfree(block);
> return ERR_PTR(err);