[RFC v2 2/2] iommu/iova: Add support for best-fit algorithm

From: Georgi Djakov
Date: Mon Dec 12 2022 - 16:53:16 EST


In some multimedia use-cases that involve allocating buffers less than
PAGE_SIZE, combined with allocating ~400MB buffers, the IOVA space is
getting fragmented and allocation failures are observed.

In these scenarios, there are hundreds of randomly distributed non-power
of two allocations, which in some cases leave behind holes of up to 100MB
in the IOVA space.

To reduce the memory fragmentation in these use-cases, introduce the
"best-fit" algorithm, which can be enabled by individual devices with
a device-tree property. If the DT property is not present, the default
"first-fit" algorithm will be used.

Example usage:

device {
compatible = "example-device";
iommus = <&smmu 0x21 0>;
iova-best-fit;
};

Signed-off-by: Georgi Djakov <quic_c_gdjako@xxxxxxxxxxx>
---

RFC v1: https://lore.kernel.org/r/1581721602-17010-1-git-send-email-isaacm@xxxxxxxxxxxxxx/
- Use a DT property instead of an API to enable the best-fit algorithm for the device
- Rename __alloc_and_insert_iova_range() to __alloc_and_insert_iova_first_fit()


drivers/iommu/dma-iommu.c | 5 +++
drivers/iommu/iova.c | 79 ++++++++++++++++++++++++++++++++++++---
include/linux/iova.h | 1 +
3 files changed, 80 insertions(+), 5 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index f798c44e0903..1be25e54d050 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -582,6 +582,11 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base,
if (ret)
goto done_unlock;

+ if (dev && dev->of_node) {
+ if (of_property_read_bool(dev->of_node, "iommu-best-fit"))
+ iovad->best_fit = true;
+ }
+
/* If the FQ fails we can simply fall back to strict mode */
if (domain->type == IOMMU_DOMAIN_DMA_FQ && iommu_dma_init_fq(domain))
domain->type = IOMMU_DOMAIN_DMA;
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index a44ad92fc5eb..34ad05a4095f 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -64,6 +64,7 @@ init_iova_domain(struct iova_domain *iovad, unsigned long granule,
iovad->cached_node = &iovad->anchor.node;
iovad->cached32_node = &iovad->anchor.node;
iovad->granule = granule;
+ iovad->best_fit = false;
iovad->start_pfn = start_pfn;
iovad->dma_32bit_pfn = 1UL << (32 - iova_shift(iovad));
iovad->max32_alloc_size = iovad->dma_32bit_pfn;
@@ -175,9 +176,10 @@ iova_insert_rbtree(struct rb_root *root, struct iova *iova,
rb_insert_color(&iova->node, root);
}

-static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
- unsigned long size, unsigned long limit_pfn,
- struct iova *new, bool size_aligned)
+static int __alloc_and_insert_iova_first_fit(struct iova_domain *iovad,
+ unsigned long size,
+ unsigned long limit_pfn,
+ struct iova *new, bool size_aligned)
{
struct rb_node *curr, *prev;
struct iova *curr_iova;
@@ -236,6 +238,68 @@ static int __alloc_and_insert_iova_range(struct iova_domain *iovad,
return -ENOMEM;
}

+static int __alloc_and_insert_iova_best_fit(struct iova_domain *iovad,
+ unsigned long size,
+ unsigned long limit_pfn,
+ struct iova *new, bool size_aligned)
+{
+ struct rb_node *curr, *prev;
+ struct iova *curr_iova, *prev_iova;
+ unsigned long flags;
+ unsigned long align_mask = ~0UL;
+ struct rb_node *candidate_rb_parent;
+ unsigned long new_pfn, candidate_pfn = ~0UL;
+ unsigned long gap, candidate_gap = ~0UL;
+
+ if (size_aligned)
+ align_mask <<= fls_long(size - 1);
+
+ /* Walk the tree backwards */
+ spin_lock_irqsave(&iovad->iova_rbtree_lock, flags);
+ curr = &iovad->anchor.node;
+ prev = rb_prev(curr);
+ for (; prev; curr = prev, prev = rb_prev(curr)) {
+ curr_iova = rb_entry(curr, struct iova, node);
+ prev_iova = rb_entry(prev, struct iova, node);
+
+ limit_pfn = min(limit_pfn, curr_iova->pfn_lo);
+ new_pfn = (limit_pfn - size) & align_mask;
+ gap = curr_iova->pfn_lo - prev_iova->pfn_hi - 1;
+ if (limit_pfn >= size && new_pfn > prev_iova->pfn_hi && gap < candidate_gap) {
+ candidate_gap = gap;
+ candidate_pfn = new_pfn;
+ candidate_rb_parent = curr;
+ if (gap == size)
+ goto insert;
+ }
+ }
+
+ curr_iova = rb_entry(curr, struct iova, node);
+ limit_pfn = min(limit_pfn, curr_iova->pfn_lo);
+ new_pfn = (limit_pfn - size) & align_mask;
+ gap = curr_iova->pfn_lo - iovad->start_pfn;
+ if (limit_pfn >= size && new_pfn >= iovad->start_pfn && gap < candidate_gap) {
+ candidate_gap = gap;
+ candidate_pfn = new_pfn;
+ candidate_rb_parent = curr;
+ }
+
+insert:
+ if (candidate_pfn == ~0UL) {
+ spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+ return -ENOMEM;
+ }
+
+ /* pfn_lo will point to size aligned address if size_aligned is set */
+ new->pfn_lo = candidate_pfn;
+ new->pfn_hi = new->pfn_lo + size - 1;
+
+ /* If we have 'prev', it's a valid place to start the insertion. */
+ iova_insert_rbtree(&iovad->rbroot, new, candidate_rb_parent);
+ spin_unlock_irqrestore(&iovad->iova_rbtree_lock, flags);
+ return 0;
+}
+
static struct kmem_cache *iova_cache;
static unsigned int iova_cache_users;
static DEFINE_MUTEX(iova_cache_mutex);
@@ -322,8 +386,13 @@ alloc_iova(struct iova_domain *iovad, unsigned long size,
if (!new_iova)
return NULL;

- ret = __alloc_and_insert_iova_range(iovad, size, limit_pfn + 1,
- new_iova, size_aligned);
+ if (iovad->best_fit) {
+ ret = __alloc_and_insert_iova_best_fit(iovad, size, limit_pfn + 1,
+ new_iova, size_aligned);
+ } else {
+ ret = __alloc_and_insert_iova_first_fit(iovad, size, limit_pfn + 1,
+ new_iova, size_aligned);
+ }

if (ret) {
free_iova_mem(new_iova);
diff --git a/include/linux/iova.h b/include/linux/iova.h
index 83c00fac2acb..c0c77f676e17 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -38,6 +38,7 @@ struct iova_domain {

struct iova_rcache *rcaches;
struct hlist_node cpuhp_dead;
+ bool best_fit; /* best-fit algorithm is enabled */
};

static inline unsigned long iova_size(struct iova *iova)