[PATCH] iommu/iova: Better utilize cpu_rcaches in non-strict mode

From: Zhang Zekun
Date: Mon Jun 24 2024 - 04:51:14 EST


Currently, when the IOMMU works in non-strict mode, fq_flush_timeout()
always frees IOVAs on a single CPU. Freeing the IOVAs from all CPUs on
one CPU is not friendly to the iova_rcache: it first fills up that CPU's
cpu_rcache and then pushes IOVAs into the depot, and IOVAs in the depot
eventually fall back to the underlying rbtree once the depot size grows
beyond num_online_cpus(). Letting fq_flush_timeout() free IOVAs on the
CPUs that called the dma_unmap* APIs reduces the overall time spent in
fq_flush_timeout() by better utilizing the iova_rcache and by minimizing
contention on the iova_rbtree_lock.
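
To illustrate the overflow path described above, here is a rough sketch
of the rcache insert logic (heavily simplified; the real code is
__iova_rcache_insert() in drivers/iommu/iova.c, and the helpers shown
are approximations of its internals):

	if (!iova_magazine_full(cpu_rcache->loaded)) {
		/* fast path: stash the pfn in this CPU's loaded magazine */
		iova_magazine_push(cpu_rcache->loaded, iova_pfn);
	} else {
		/* magazine full: retire it to the shared depot */
		iova_depot_push(rcache, cpu_rcache->loaded);
		/*
		 * Once the depot holds more magazines than
		 * num_online_cpus(), the excess is drained back into the
		 * rbtree under iova_rbtree_lock (the slow path).
		 */
	}

Draining every CPU's flush queue through a single cpu_rcache funnels all
frees into one magazine and reaches that slow path much sooner than
spreading the frees across the per-CPU caches.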

Signed-off-by: Zhang Zekun <zhangzekun11@xxxxxxxxxx>
---
drivers/iommu/dma-iommu.c | 22 +++++++++++++---------
drivers/iommu/iova.c | 21 ++++++++++++++-------
include/linux/iova.h | 7 +++++++
3 files changed, 34 insertions(+), 16 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 43520e7275cc..217b7c70d06c 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -145,7 +145,8 @@ static inline unsigned int fq_ring_add(struct iova_fq *fq)
return idx;
}

-static void fq_ring_free_locked(struct iommu_dma_cookie *cookie, struct iova_fq *fq)
+static void fq_ring_free_locked(struct iommu_dma_cookie *cookie, struct iova_fq *fq,
+ int cpu)
{
u64 counter = atomic64_read(&cookie->fq_flush_finish_cnt);
unsigned int idx;
@@ -158,20 +159,21 @@ static void fq_ring_free_locked(struct iommu_dma_cookie *cookie, struct iova_fq
break;

iommu_put_pages_list(&fq->entries[idx].freelist);
- free_iova_fast(&cookie->iovad,
+ free_iova_fast_cpu(&cookie->iovad,
fq->entries[idx].iova_pfn,
- fq->entries[idx].pages);
+ fq->entries[idx].pages, cpu);

fq->head = (fq->head + 1) & fq->mod_mask;
}
}

-static void fq_ring_free(struct iommu_dma_cookie *cookie, struct iova_fq *fq)
+static void fq_ring_free(struct iommu_dma_cookie *cookie, struct iova_fq *fq,
+ int cpu)
{
unsigned long flags;

spin_lock_irqsave(&fq->lock, flags);
- fq_ring_free_locked(cookie, fq);
+ fq_ring_free_locked(cookie, fq, cpu);
spin_unlock_irqrestore(&fq->lock, flags);
}

@@ -191,10 +193,11 @@ static void fq_flush_timeout(struct timer_list *t)
fq_flush_iotlb(cookie);

if (cookie->options.qt == IOMMU_DMA_OPTS_SINGLE_QUEUE) {
- fq_ring_free(cookie, cookie->single_fq);
+ cpu = smp_processor_id();
+ fq_ring_free(cookie, cookie->single_fq, cpu);
} else {
for_each_possible_cpu(cpu)
- fq_ring_free(cookie, per_cpu_ptr(cookie->percpu_fq, cpu));
+ fq_ring_free(cookie, per_cpu_ptr(cookie->percpu_fq, cpu), cpu);
}
}

@@ -205,6 +208,7 @@ static void queue_iova(struct iommu_dma_cookie *cookie,
struct iova_fq *fq;
unsigned long flags;
unsigned int idx;
+ int cpu = smp_processor_id();

/*
* Order against the IOMMU driver's pagetable update from unmapping
@@ -227,11 +231,11 @@ static void queue_iova(struct iommu_dma_cookie *cookie,
* flushed out on another CPU. This makes the fq_full() check below less
* likely to be true.
*/
- fq_ring_free_locked(cookie, fq);
+ fq_ring_free_locked(cookie, fq, cpu);

if (fq_full(fq)) {
fq_flush_iotlb(cookie);
- fq_ring_free_locked(cookie, fq);
+ fq_ring_free_locked(cookie, fq, cpu);
}

idx = fq_ring_add(fq);
diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c
index d59d0ea2fd21..46a2188c263b 100644
--- a/drivers/iommu/iova.c
+++ b/drivers/iommu/iova.c
@@ -20,7 +20,7 @@

static bool iova_rcache_insert(struct iova_domain *iovad,
unsigned long pfn,
- unsigned long size);
+ unsigned long size, int cpu);
static unsigned long iova_rcache_get(struct iova_domain *iovad,
unsigned long size,
unsigned long limit_pfn);
@@ -423,12 +423,19 @@ EXPORT_SYMBOL_GPL(alloc_iova_fast);
void
free_iova_fast(struct iova_domain *iovad, unsigned long pfn, unsigned long size)
{
- if (iova_rcache_insert(iovad, pfn, size))
+ free_iova_fast_cpu(iovad, pfn, size, smp_processor_id());
+}
+EXPORT_SYMBOL_GPL(free_iova_fast);
+
+void
+free_iova_fast_cpu(struct iova_domain *iovad, unsigned long pfn,
+ unsigned long size, int cpu)
+{
+ if (iova_rcache_insert(iovad, pfn, size, cpu))
return;

free_iova(iovad, pfn);
}
-EXPORT_SYMBOL_GPL(free_iova_fast);

static void iova_domain_free_rcaches(struct iova_domain *iovad)
{
@@ -762,13 +769,13 @@ EXPORT_SYMBOL_GPL(iova_domain_init_rcaches);
*/
static bool __iova_rcache_insert(struct iova_domain *iovad,
struct iova_rcache *rcache,
- unsigned long iova_pfn)
+ unsigned long iova_pfn, int cpu)
{
struct iova_cpu_rcache *cpu_rcache;
bool can_insert = false;
unsigned long flags;

- cpu_rcache = raw_cpu_ptr(rcache->cpu_rcaches);
+ cpu_rcache = per_cpu_ptr(rcache->cpu_rcaches, cpu);
spin_lock_irqsave(&cpu_rcache->lock, flags);

if (!iova_magazine_full(cpu_rcache->loaded)) {
@@ -799,14 +806,14 @@ static bool __iova_rcache_insert(struct iova_domain *iovad,
}

static bool iova_rcache_insert(struct iova_domain *iovad, unsigned long pfn,
- unsigned long size)
+ unsigned long size, int cpu)
{
unsigned int log_size = order_base_2(size);

if (log_size >= IOVA_RANGE_CACHE_MAX_SIZE)
return false;

- return __iova_rcache_insert(iovad, &iovad->rcaches[log_size], pfn);
+ return __iova_rcache_insert(iovad, &iovad->rcaches[log_size], pfn, cpu);
}

/*
diff --git a/include/linux/iova.h b/include/linux/iova.h
index d2c4fd923efa..91e4e3d5bcb3 100644
--- a/include/linux/iova.h
+++ b/include/linux/iova.h
@@ -93,6 +93,8 @@ struct iova *alloc_iova(struct iova_domain *iovad, unsigned long size,
bool size_aligned);
void free_iova_fast(struct iova_domain *iovad, unsigned long pfn,
unsigned long size);
+void free_iova_fast_cpu(struct iova_domain *iovad, unsigned long pfn,
+ unsigned long size, int cpu);
unsigned long alloc_iova_fast(struct iova_domain *iovad, unsigned long size,
unsigned long limit_pfn, bool flush_rcache);
struct iova *reserve_iova(struct iova_domain *iovad, unsigned long pfn_lo,
@@ -134,6 +136,11 @@ static inline void free_iova_fast(struct iova_domain *iovad,
{
}

+static inline void free_iova_fast_cpu(struct iova_domain *iovad, unsigned long pfn,
+ unsigned long size, int cpu)
+{
+}
+
static inline unsigned long alloc_iova_fast(struct iova_domain *iovad,
unsigned long size,
unsigned long limit_pfn,
--
2.17.1