[RFC 1/3] iommu/intel: Use page->refcount to count number of entries in IOMMU

From: Pasha Tatashin
Date: Wed Dec 20 2023 - 22:19:37 EST


In order to be able to efficiently free empty page table levels, count the
number of entries in each page table my incremeanting and decremeanting
refcount every time a PTE is inserted or removed form the page table.

For this to work correctly, add two helper function:
dma_clear_pte and dma_set_pte where counting is performed,

Also, modify the code so every page table entry is always updated using the
two new functions.

Signed-off-by: Pasha Tatashin <pasha.tatashin@xxxxxxxxxx>
---
drivers/iommu/intel/iommu.c | 40 +++++++++++++++++++++---------------
drivers/iommu/intel/iommu.h | 41 +++++++++++++++++++++++++++++++------
2 files changed, 58 insertions(+), 23 deletions(-)

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 897159dba47d..4688ef797161 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -949,7 +949,7 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
if (domain->use_first_level)
pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;

- if (cmpxchg64(&pte->val, 0ULL, pteval))
+ if (dma_set_pte(pte, pteval))
/* Someone else set it while we were thinking; use theirs. */
free_pgtable_page(tmp_page);
else
@@ -1021,7 +1021,8 @@ static void dma_pte_clear_range(struct dmar_domain *domain,
continue;
}
do {
- dma_clear_pte(pte);
+ if (dma_pte_present(pte))
+ dma_clear_pte(pte);
start_pfn += lvl_to_nr_pages(large_page);
pte++;
} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
@@ -1062,7 +1063,8 @@ static void dma_pte_free_level(struct dmar_domain *domain, int level,
*/
if (level < retain_level && !(start_pfn > level_pfn ||
last_pfn < level_pfn + level_size(level) - 1)) {
- dma_clear_pte(pte);
+ if (dma_pte_present(pte))
+ dma_clear_pte(pte);
domain_flush_cache(domain, pte, sizeof(*pte));
free_pgtable_page(level_pte);
}
@@ -1093,12 +1095,13 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain,
}
}

-/* When a page at a given level is being unlinked from its parent, we don't
- need to *modify* it at all. All we need to do is make a list of all the
- pages which can be freed just as soon as we've flushed the IOTLB and we
- know the hardware page-walk will no longer touch them.
- The 'pte' argument is the *parent* PTE, pointing to the page that is to
- be freed. */
+/*
+ * A given page at a given level is being unlinked from its parent.
+ * We need to make a list of all the pages which can be freed just as soon as
+ * we've flushed the IOTLB and we know the hardware page-walk will no longer
+ * touch them. The 'pte' argument is the *parent* PTE, pointing to the page
+ * that is to be freed.
+ */
static void dma_pte_list_pagetables(struct dmar_domain *domain,
int level, struct dma_pte *pte,
struct list_head *freelist)
@@ -1106,17 +1109,20 @@ static void dma_pte_list_pagetables(struct dmar_domain *domain,
struct page *pg;

pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
- list_add_tail(&pg->lru, freelist);
-
- if (level == 1)
- return;
-
pte = page_address(pg);
+
do {
- if (dma_pte_present(pte) && !dma_pte_superpage(pte))
- dma_pte_list_pagetables(domain, level - 1, pte, freelist);
+ if (dma_pte_present(pte)) {
+ if (level > 1 && !dma_pte_superpage(pte)) {
+ dma_pte_list_pagetables(domain, level - 1, pte,
+ freelist);
+ }
+ dma_clear_pte(pte);
+ }
pte++;
} while (!first_pte_in_page(pte));
+
+ list_add_tail(&pg->lru, freelist);
}

static void dma_pte_clear_level(struct dmar_domain *domain, int level,
@@ -2244,7 +2250,7 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
/* We don't need lock here, nobody else
* touches the iova range
*/
- tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
+ tmp = dma_set_pte(pte, pteval);
if (tmp) {
static int dumps = 5;
pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index ce030c5b5772..f1ea508f45bd 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -802,11 +802,6 @@ struct dma_pte {
u64 val;
};

-static inline void dma_clear_pte(struct dma_pte *pte)
-{
- pte->val = 0;
-}
-
static inline u64 dma_pte_addr(struct dma_pte *pte)
{
#ifdef CONFIG_64BIT
@@ -818,9 +813,43 @@ static inline u64 dma_pte_addr(struct dma_pte *pte)
#endif
}

+#define DMA_PTEVAL_PRESENT(pteval) (((pteval) & 3) != 0)
static inline bool dma_pte_present(struct dma_pte *pte)
{
- return (pte->val & 3) != 0;
+ return DMA_PTEVAL_PRESENT(pte->val);
+}
+
+static inline void dma_clear_pte(struct dma_pte *pte)
+{
+ u64 old_pteval;
+
+ old_pteval = xchg(&pte->val, 0ULL);
+ if (DMA_PTEVAL_PRESENT(old_pteval)) {
+ struct page *pg = virt_to_page(pte);
+ int rc = page_ref_dec_return(pg);
+
+ WARN_ON_ONCE(rc > 512 || rc < 1);
+ } else {
+ /* Ensure that we cleared a valid entry from the page table */
+ WARN_ON(1);
+ }
+}
+
+static inline u64 dma_set_pte(struct dma_pte *pte, u64 pteval)
+{
+ u64 old_pteval;
+
+ /* Ensure we about to set a valid entry to the page table */
+ WARN_ON(!DMA_PTEVAL_PRESENT(pteval));
+ old_pteval = cmpxchg64(&pte->val, 0ULL, pteval);
+ if (old_pteval == 0) {
+ struct page *pg = virt_to_page(pte);
+ int rc = page_ref_inc_return(pg);
+
+ WARN_ON_ONCE(rc > 513 || rc < 2);
+ }
+
+ return old_pteval;
}

static inline bool dma_sl_pte_test_and_clear_dirty(struct dma_pte *pte,
--
2.43.0.472.g3155946c3a-goog