RE: [PATCH v4 1/2] swiotlb: extend buffer pre-padding to alloc_align_mask if necessary

From: Michael Kelley
Date: Mon Mar 25 2024 - 15:16:31 EST


From: Petr Tesarik <petrtesarik@xxxxxxxxxxxxxxx> Sent: Monday, March 25, 2024 1:31 AM
>
> Allow a buffer pre-padding of up to alloc_align_mask, even if it requires
> allocating additional IO TLB slots.
>
> If the allocation alignment is bigger than IO_TLB_SIZE and min_align_mask
> covers any non-zero bits in the original address between IO_TLB_SIZE and
> alloc_align_mask, these bits are not preserved in the swiotlb buffer
> address.
>
> To fix this case, increase the allocation size and use a larger offset
> within the allocated buffer. As a result, extra padding slots may be
> allocated before the mapping start address.
>
> Leave orig_addr in these padding slots initialized to INVALID_PHYS_ADDR.
> These slots do not correspond to any CPU buffer, so attempts to sync the
> data should be ignored.
>
> The padding slots should be automatically released when the buffer is
> unmapped. However, swiotlb_tbl_unmap_single() takes only the address of the
> DMA buffer slot, not the first padding slot. Save the number of padding
> slots in struct io_tlb_slot and use it to adjust the slot index in
> swiotlb_release_slots(), so all allocated slots are properly freed.
>
> Fixes: 2fd4fa5d3fb5 ("swiotlb: Fix alignment checks when both allocation and
> DMA masks are present")
> Link: https://lore.kernel.org/linux-iommu/20240311210507.217daf8b@xxxxxxxxxxxxxxxxxxxx/
> Signed-off-by: Petr Tesarik <petr.tesarik1@xxxxxxxxxxxxxxxxxxx>
> ---
> kernel/dma/swiotlb.c | 59 ++++++++++++++++++++++++++++++++++-------
> ---
> 1 file changed, 46 insertions(+), 13 deletions(-)
>
> diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
> index 86fe172b5958..d7a8cb93ef2d 100644
> --- a/kernel/dma/swiotlb.c
> +++ b/kernel/dma/swiotlb.c
> @@ -69,11 +69,14 @@
> * @alloc_size: Size of the allocated buffer.
> * @list: The free list describing the number of free entries available
> * from each index.
> + * @pad_slots: Number of preceding padding slots. Valid only in the first
> + * allocated non-padding slot.
> */
> struct io_tlb_slot {
> phys_addr_t orig_addr;
> size_t alloc_size;
> - unsigned int list;
> + unsigned short list;
> + unsigned short pad_slots;
> };
>
> static bool swiotlb_force_bounce;
> @@ -287,6 +290,7 @@ static void swiotlb_init_io_tlb_pool(struct io_tlb_pool *mem, phys_addr_t start,
> mem->nslabs - i);
> mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
> mem->slots[i].alloc_size = 0;
> + mem->slots[i].pad_slots = 0;
> }
>
> memset(vaddr, 0, bytes);
> @@ -821,12 +825,30 @@ void swiotlb_dev_init(struct device *dev)
> #endif
> }
>
> -/*
> - * Return the offset into a iotlb slot required to keep the device happy.
> +/**
> + * swiotlb_align_offset() - Get required offset into an IO TLB allocation.
> + * @dev: Owning device.
> + * @align_mask: Allocation alignment mask.
> + * @addr: DMA address.
> + *
> + * Return the minimum offset from the start of an IO TLB allocation which is
> + * required for a given buffer address and allocation alignment to keep the
> + * device happy.
> + *
> + * First, the address bits covered by min_align_mask must be identical in the
> + * original address and the bounce buffer address. High bits are preserved by
> + * choosing a suitable IO TLB slot, but bits below IO_TLB_SHIFT require extra
> + * padding bytes before the bounce buffer.
> + *
> + * Second, @align_mask specifies which bits of the first allocated slot must
> + * be zero. This may require allocating additional padding slots, and then the
> + * offset (in bytes) from the first such padding slot is returned.
> */
> -static unsigned int swiotlb_align_offset(struct device *dev, u64 addr)
> +static unsigned int swiotlb_align_offset(struct device *dev,
> + unsigned int align_mask, u64 addr)
> {
> - return addr & dma_get_min_align_mask(dev) & (IO_TLB_SIZE - 1);
> + return addr & dma_get_min_align_mask(dev) &
> + (align_mask | (IO_TLB_SIZE - 1));
> }
>
> /*
> @@ -847,7 +869,7 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size
> return;
>
> tlb_offset = tlb_addr & (IO_TLB_SIZE - 1);
> - orig_addr_offset = swiotlb_align_offset(dev, orig_addr);
> + orig_addr_offset = swiotlb_align_offset(dev, 0, orig_addr);
> if (tlb_offset < orig_addr_offset) {
> dev_WARN_ONCE(dev, 1,
> "Access before mapping start detected. orig offset %u, requested offset %u.\n",
> @@ -1005,7 +1027,7 @@ static int swiotlb_search_pool_area(struct device *dev, struct io_tlb_pool *pool
> unsigned long max_slots = get_max_slots(boundary_mask);
> unsigned int iotlb_align_mask = dma_get_min_align_mask(dev);
> unsigned int nslots = nr_slots(alloc_size), stride;
> - unsigned int offset = swiotlb_align_offset(dev, orig_addr);
> + unsigned int offset = swiotlb_align_offset(dev, 0, orig_addr);
> unsigned int index, slots_checked, count = 0, i;
> unsigned long flags;
> unsigned int slot_base;
> @@ -1328,11 +1350,12 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
> unsigned long attrs)
> {
> struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
> - unsigned int offset = swiotlb_align_offset(dev, orig_addr);
> + unsigned int offset;
> struct io_tlb_pool *pool;
> unsigned int i;
> int index;
> phys_addr_t tlb_addr;
> + unsigned short pad_slots;
>
> if (!mem || !mem->nslabs) {
> dev_warn_ratelimited(dev,
> @@ -1349,6 +1372,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
> return (phys_addr_t)DMA_MAPPING_ERROR;
> }
>
> + offset = swiotlb_align_offset(dev, alloc_align_mask, orig_addr);
> index = swiotlb_find_slots(dev, orig_addr,
> alloc_size + offset, alloc_align_mask, &pool);
> if (index == -1) {
> @@ -1364,6 +1388,10 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
> * This is needed when we sync the memory. Then we sync the buffer if
> * needed.
> */
> + pad_slots = offset >> IO_TLB_SHIFT;
> + offset &= (IO_TLB_SIZE - 1);
> + index += pad_slots;
> + pool->slots[index].pad_slots = pad_slots;
> for (i = 0; i < nr_slots(alloc_size + offset); i++)
> pool->slots[index + i].orig_addr = slot_addr(orig_addr, i);
> tlb_addr = slot_addr(pool->start, index) + offset;
> @@ -1384,13 +1412,17 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
> {
> struct io_tlb_pool *mem = swiotlb_find_pool(dev, tlb_addr);
> unsigned long flags;
> - unsigned int offset = swiotlb_align_offset(dev, tlb_addr);
> - int index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT;
> - int nslots = nr_slots(mem->slots[index].alloc_size + offset);
> - int aindex = index / mem->area_nslabs;
> - struct io_tlb_area *area = &mem->areas[aindex];
> + unsigned int offset = swiotlb_align_offset(dev, 0, tlb_addr);
> + int index, nslots, aindex;
> + struct io_tlb_area *area;
> int count, i;
>
> + index = (tlb_addr - offset - mem->start) >> IO_TLB_SHIFT;
> + index -= mem->slots[index].pad_slots;
> + nslots = nr_slots(mem->slots[index].alloc_size + offset);
> + aindex = index / mem->area_nslabs;
> + area = &mem->areas[aindex];
> +
> /*
> * Return the buffer to the free list by setting the corresponding
> * entries to indicate the number of contiguous entries available.
> @@ -1413,6 +1445,7 @@ static void swiotlb_release_slots(struct device *dev, phys_addr_t tlb_addr)
> mem->slots[i].list = ++count;
> mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
> mem->slots[i].alloc_size = 0;
> + mem->slots[i].pad_slots = 0;
> }
>
> /*
> --
> 2.34.1

Did the same testing as on v3, covering both x86/x64 and arm64.
All looks good.

Tested-by: Michael Kelley <mhklinux@xxxxxxxxxxx>
Reviewed-by: Michael Kelley <mhklinux@xxxxxxxxxxx>