Re: [PATCH v5 01/11] iommu/vt-d: add wrapper functions for page allocations

From: Linu Cherian
Date: Thu Apr 04 2024 - 08:20:35 EST


Hi Pasha,

On 2024-02-22 at 23:09:27, Pasha Tatashin (pasha.tatashin@xxxxxxxxxx) wrote:
> In order to improve observability and accountability of IOMMU layer, we
> must account the number of pages that are allocated by functions that
> are calling directly into buddy allocator.
>
> This is achieved by first wrapping the allocation related functions into a
> separate inline functions in new file:
>
> drivers/iommu/iommu-pages.h
>
> Convert all page allocation calls under iommu/intel to use these new
> functions.
>
> Signed-off-by: Pasha Tatashin <pasha.tatashin@xxxxxxxxxx>
> Acked-by: David Rientjes <rientjes@xxxxxxxxxx>
> Tested-by: Bagas Sanjaya <bagasdotme@xxxxxxxxx>
> ---
> drivers/iommu/intel/dmar.c | 16 +--
> drivers/iommu/intel/iommu.c | 47 +++------
> drivers/iommu/intel/iommu.h | 2 -
> drivers/iommu/intel/irq_remapping.c | 16 +--
> drivers/iommu/intel/pasid.c | 18 ++--
> drivers/iommu/intel/svm.c | 11 +-
> drivers/iommu/iommu-pages.h | 154 ++++++++++++++++++++++++++++
> 7 files changed, 201 insertions(+), 63 deletions(-)
> create mode 100644 drivers/iommu/iommu-pages.h


Few minor nits.

>
> diff --git a/drivers/iommu/intel/dmar.c b/drivers/iommu/intel/dmar.c
> index 23cb80d62a9a..ff6045ae8e97 100644
> --- a/drivers/iommu/intel/dmar.c
> +++ b/drivers/iommu/intel/dmar.c
> @@ -32,6 +32,7 @@
>
> #include "iommu.h"
> #include "../irq_remapping.h"
> +#include "../iommu-pages.h"
> #include "perf.h"
> #include "trace.h"
> #include "perfmon.h"
> @@ -1185,7 +1186,7 @@ static void free_iommu(struct intel_iommu *iommu)
> }
>
> if (iommu->qi) {
> - free_page((unsigned long)iommu->qi->desc);
> + iommu_free_page(iommu->qi->desc);
> kfree(iommu->qi->desc_status);
> kfree(iommu->qi);
> }
> @@ -1731,7 +1732,8 @@ static void __dmar_enable_qi(struct intel_iommu *iommu)
> int dmar_enable_qi(struct intel_iommu *iommu)
> {
> struct q_inval *qi;
> - struct page *desc_page;
> + void *desc;
> + int order;
>
> if (!ecap_qis(iommu->ecap))
> return -ENOENT;
> @@ -1752,19 +1754,19 @@ int dmar_enable_qi(struct intel_iommu *iommu)
> * Need two pages to accommodate 256 descriptors of 256 bits each
> * if the remapping hardware supports scalable mode translation.
> */
> - desc_page = alloc_pages_node(iommu->node, GFP_ATOMIC | __GFP_ZERO,
> - !!ecap_smts(iommu->ecap));
> - if (!desc_page) {
> + order = ecap_smts(iommu->ecap) ? 1 : 0;
> + desc = iommu_alloc_pages_node(iommu->node, GFP_ATOMIC, order);
> + if (!desc) {
> kfree(qi);
> iommu->qi = NULL;
> return -ENOMEM;
> }
>
> - qi->desc = page_address(desc_page);
> + qi->desc = desc;
>
> qi->desc_status = kcalloc(QI_LENGTH, sizeof(int), GFP_ATOMIC);
> if (!qi->desc_status) {
> - free_page((unsigned long) qi->desc);
> + iommu_free_page(qi->desc);
> kfree(qi);
> iommu->qi = NULL;
> return -ENOMEM;
> diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
> index 6fb5f6fceea1..2c676f46e38c 100644
> --- a/drivers/iommu/intel/iommu.c
> +++ b/drivers/iommu/intel/iommu.c
> @@ -28,6 +28,7 @@
> #include "../dma-iommu.h"
> #include "../irq_remapping.h"
> #include "../iommu-sva.h"
> +#include "../iommu-pages.h"
> #include "pasid.h"
> #include "cap_audit.h"
> #include "perfmon.h"
> @@ -224,22 +225,6 @@ static int __init intel_iommu_setup(char *str)
> }
> __setup("intel_iommu=", intel_iommu_setup);
>
> -void *alloc_pgtable_page(int node, gfp_t gfp)
> -{
> - struct page *page;
> - void *vaddr = NULL;
> -
> - page = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
> - if (page)
> - vaddr = page_address(page);
> - return vaddr;
> -}
> -
> -void free_pgtable_page(void *vaddr)
> -{
> - free_page((unsigned long)vaddr);
> -}
> -
> static int domain_type_is_si(struct dmar_domain *domain)
> {
> return domain->domain.type == IOMMU_DOMAIN_IDENTITY;
> @@ -473,7 +458,7 @@ struct context_entry *iommu_context_addr(struct intel_iommu *iommu, u8 bus,
> if (!alloc)
> return NULL;
>
> - context = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
> + context = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
> if (!context)
> return NULL;
>
> @@ -647,17 +632,17 @@ static void free_context_table(struct intel_iommu *iommu)
> for (i = 0; i < ROOT_ENTRY_NR; i++) {
> context = iommu_context_addr(iommu, i, 0, 0);
> if (context)
> - free_pgtable_page(context);
> + iommu_free_page(context);
>
> if (!sm_supported(iommu))
> continue;
>
> context = iommu_context_addr(iommu, i, 0x80, 0);
> if (context)
> - free_pgtable_page(context);
> + iommu_free_page(context);
> }
>
> - free_pgtable_page(iommu->root_entry);
> + iommu_free_page(iommu->root_entry);
> iommu->root_entry = NULL;
> }
>
> @@ -795,7 +780,7 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
> if (!dma_pte_present(pte)) {
> uint64_t pteval;
>
> - tmp_page = alloc_pgtable_page(domain->nid, gfp);
> + tmp_page = iommu_alloc_page_node(domain->nid, gfp);
>
> if (!tmp_page)
> return NULL;
> @@ -807,7 +792,7 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
>
> if (cmpxchg64(&pte->val, 0ULL, pteval))
> /* Someone else set it while we were thinking; use theirs. */
> - free_pgtable_page(tmp_page);
> + iommu_free_page(tmp_page);
> else
> domain_flush_cache(domain, pte, sizeof(*pte));
> }
> @@ -920,7 +905,7 @@ static void dma_pte_free_level(struct dmar_domain *domain, int level,
> last_pfn < level_pfn + level_size(level) - 1)) {
> dma_clear_pte(pte);
> domain_flush_cache(domain, pte, sizeof(*pte));
> - free_pgtable_page(level_pte);
> + iommu_free_page(level_pte);
> }
> next:
> pfn += level_size(level);
> @@ -944,7 +929,7 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain,
>
> /* free pgd */
> if (start_pfn == 0 && last_pfn == DOMAIN_MAX_PFN(domain->gaw)) {
> - free_pgtable_page(domain->pgd);
> + iommu_free_page(domain->pgd);
> domain->pgd = NULL;
> }
> }
> @@ -1046,7 +1031,7 @@ static int iommu_alloc_root_entry(struct intel_iommu *iommu)
> {
> struct root_entry *root;
>
> - root = alloc_pgtable_page(iommu->node, GFP_ATOMIC);
> + root = iommu_alloc_page_node(iommu->node, GFP_ATOMIC);
> if (!root) {
> pr_err("Allocating root entry for %s failed\n",
> iommu->name);
> @@ -1718,7 +1703,7 @@ static void domain_exit(struct dmar_domain *domain)
> LIST_HEAD(freelist);
>
> domain_unmap(domain, 0, DOMAIN_MAX_PFN(domain->gaw), &freelist);
> - put_pages_list(&freelist);
> + iommu_put_pages_list(&freelist);
> }
>
> if (WARN_ON(!list_empty(&domain->devices)))
> @@ -2452,7 +2437,7 @@ static int copy_context_table(struct intel_iommu *iommu,
> if (!old_ce)
> goto out;
>
> - new_ce = alloc_pgtable_page(iommu->node, GFP_KERNEL);
> + new_ce = iommu_alloc_page_node(iommu->node, GFP_KERNEL);
> if (!new_ce)
> goto out_unmap;
>
> @@ -3385,7 +3370,7 @@ static int intel_iommu_memory_notifier(struct notifier_block *nb,
> start_vpfn, mhp->nr_pages,
> list_empty(&freelist), 0);
> rcu_read_unlock();
> - put_pages_list(&freelist);
> + iommu_put_pages_list(&freelist);
> }
> break;
> }
> @@ -3816,7 +3801,7 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width)
> domain->max_addr = 0;
>
> /* always allocate the top pgd */
> - domain->pgd = alloc_pgtable_page(domain->nid, GFP_ATOMIC);
> + domain->pgd = iommu_alloc_page_node(domain->nid, GFP_ATOMIC);
> if (!domain->pgd)
> return -ENOMEM;
> domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
> @@ -3960,7 +3945,7 @@ int prepare_domain_attach_device(struct iommu_domain *domain,
> pte = dmar_domain->pgd;
> if (dma_pte_present(pte)) {
> dmar_domain->pgd = phys_to_virt(dma_pte_addr(pte));
> - free_pgtable_page(pte);
> + iommu_free_page(pte);
> }
> dmar_domain->agaw--;
> }
> @@ -4107,7 +4092,7 @@ static void intel_iommu_tlb_sync(struct iommu_domain *domain,
> start_pfn, nrpages,
> list_empty(&gather->freelist), 0);
>
> - put_pages_list(&gather->freelist);
> + iommu_put_pages_list(&gather->freelist);
> }
>
> static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
> diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
> index d02f916d8e59..9fe04cea29c4 100644
> --- a/drivers/iommu/intel/iommu.h
> +++ b/drivers/iommu/intel/iommu.h
> @@ -1069,8 +1069,6 @@ void domain_update_iommu_cap(struct dmar_domain *domain);
>
> int dmar_ir_support(void);
>
> -void *alloc_pgtable_page(int node, gfp_t gfp);
> -void free_pgtable_page(void *vaddr);
> void iommu_flush_write_buffer(struct intel_iommu *iommu);
> struct iommu_domain *intel_nested_domain_alloc(struct iommu_domain *parent,
> const struct iommu_user_data *user_data);
> diff --git a/drivers/iommu/intel/irq_remapping.c b/drivers/iommu/intel/irq_remapping.c
> index 566297bc87dd..39cd9626eb8d 100644
> --- a/drivers/iommu/intel/irq_remapping.c
> +++ b/drivers/iommu/intel/irq_remapping.c
> @@ -22,6 +22,7 @@
>
> #include "iommu.h"
> #include "../irq_remapping.h"
> +#include "../iommu-pages.h"
> #include "cap_audit.h"
>
> enum irq_mode {
> @@ -527,7 +528,7 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu)
> struct ir_table *ir_table;
> struct fwnode_handle *fn;
> unsigned long *bitmap;
> - struct page *pages;
> + void *ir_table_base;
>
> if (iommu->ir_table)
> return 0;
> @@ -536,9 +537,9 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu)
> if (!ir_table)
> return -ENOMEM;
>
> - pages = alloc_pages_node(iommu->node, GFP_KERNEL | __GFP_ZERO,
> - INTR_REMAP_PAGE_ORDER);
> - if (!pages) {
> + ir_table_base = iommu_alloc_pages_node(iommu->node, GFP_KERNEL,
> + INTR_REMAP_PAGE_ORDER);
> + if (!ir_table_base) {
> pr_err("IR%d: failed to allocate pages of order %d\n",
> iommu->seq_id, INTR_REMAP_PAGE_ORDER);
> goto out_free_table;
> @@ -573,7 +574,7 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu)
> else
> iommu->ir_domain->msi_parent_ops = &dmar_msi_parent_ops;
>
> - ir_table->base = page_address(pages);
> + ir_table->base = ir_table_base;
> ir_table->bitmap = bitmap;
> iommu->ir_table = ir_table;
>
> @@ -622,7 +623,7 @@ static int intel_setup_irq_remapping(struct intel_iommu *iommu)
> out_free_bitmap:
> bitmap_free(bitmap);
> out_free_pages:
> - __free_pages(pages, INTR_REMAP_PAGE_ORDER);
> + iommu_free_pages(ir_table_base, INTR_REMAP_PAGE_ORDER);
> out_free_table:
> kfree(ir_table);
>
> @@ -643,8 +644,7 @@ static void intel_teardown_irq_remapping(struct intel_iommu *iommu)
> irq_domain_free_fwnode(fn);
> iommu->ir_domain = NULL;
> }
> - free_pages((unsigned long)iommu->ir_table->base,
> - INTR_REMAP_PAGE_ORDER);
> + iommu_free_pages(iommu->ir_table->base, INTR_REMAP_PAGE_ORDER);
> bitmap_free(iommu->ir_table->bitmap);
> kfree(iommu->ir_table);
> iommu->ir_table = NULL;
> diff --git a/drivers/iommu/intel/pasid.c b/drivers/iommu/intel/pasid.c
> index 3239cefa4c33..d46f661dd971 100644
> --- a/drivers/iommu/intel/pasid.c
> +++ b/drivers/iommu/intel/pasid.c
> @@ -20,6 +20,7 @@
>
> #include "iommu.h"
> #include "pasid.h"
> +#include "../iommu-pages.h"
>
> /*
> * Intel IOMMU system wide PASID name space:
> @@ -38,7 +39,7 @@ int intel_pasid_alloc_table(struct device *dev)
> {
> struct device_domain_info *info;
> struct pasid_table *pasid_table;
> - struct page *pages;
> + struct pasid_dir_entry *dir;
> u32 max_pasid = 0;
> int order, size;
>
> @@ -59,14 +60,13 @@ int intel_pasid_alloc_table(struct device *dev)
>
> size = max_pasid >> (PASID_PDE_SHIFT - 3);
> order = size ? get_order(size) : 0;
> - pages = alloc_pages_node(info->iommu->node,
> - GFP_KERNEL | __GFP_ZERO, order);
> - if (!pages) {
> + dir = iommu_alloc_pages_node(info->iommu->node, GFP_KERNEL, order);
> + if (!dir) {
> kfree(pasid_table);
> return -ENOMEM;
> }
>
> - pasid_table->table = page_address(pages);
> + pasid_table->table = dir;
> pasid_table->order = order;
> pasid_table->max_pasid = 1 << (order + PAGE_SHIFT + 3);
> info->pasid_table = pasid_table;
> @@ -97,10 +97,10 @@ void intel_pasid_free_table(struct device *dev)
> max_pde = pasid_table->max_pasid >> PASID_PDE_SHIFT;
> for (i = 0; i < max_pde; i++) {
> table = get_pasid_table_from_pde(&dir[i]);
> - free_pgtable_page(table);
> + iommu_free_page(table);
> }
>
> - free_pages((unsigned long)pasid_table->table, pasid_table->order);
> + iommu_free_pages(pasid_table->table, pasid_table->order);
> kfree(pasid_table);
> }
>
> @@ -146,7 +146,7 @@ static struct pasid_entry *intel_pasid_get_entry(struct device *dev, u32 pasid)
> retry:
> entries = get_pasid_table_from_pde(&dir[dir_index]);
> if (!entries) {
> - entries = alloc_pgtable_page(info->iommu->node, GFP_ATOMIC);
> + entries = iommu_alloc_page_node(info->iommu->node, GFP_ATOMIC);
> if (!entries)
> return NULL;
>
> @@ -158,7 +158,7 @@ static struct pasid_entry *intel_pasid_get_entry(struct device *dev, u32 pasid)
> */
> if (cmpxchg64(&dir[dir_index].val, 0ULL,
> (u64)virt_to_phys(entries) | PASID_PTE_PRESENT)) {
> - free_pgtable_page(entries);
> + iommu_free_page(entries);
> goto retry;
> }
> if (!ecap_coherent(info->iommu->ecap)) {
> diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c
> index 40edd282903f..a691f917456c 100644
> --- a/drivers/iommu/intel/svm.c
> +++ b/drivers/iommu/intel/svm.c
> @@ -23,6 +23,7 @@
> #include "pasid.h"
> #include "perf.h"
> #include "../iommu-sva.h"
> +#include "../iommu-pages.h"
> #include "trace.h"
>
> static irqreturn_t prq_event_thread(int irq, void *d);
> @@ -64,16 +65,14 @@ svm_lookup_device_by_dev(struct intel_svm *svm, struct device *dev)
> int intel_svm_enable_prq(struct intel_iommu *iommu)
> {
> struct iopf_queue *iopfq;
> - struct page *pages;
> int irq, ret;
>
> - pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, PRQ_ORDER);
> - if (!pages) {
> + iommu->prq = iommu_alloc_pages(GFP_KERNEL, PRQ_ORDER);
> + if (!iommu->prq) {
> pr_warn("IOMMU: %s: Failed to allocate page request queue\n",
> iommu->name);
> return -ENOMEM;
> }
> - iommu->prq = page_address(pages);
>
> irq = dmar_alloc_hwirq(IOMMU_IRQ_ID_OFFSET_PRQ + iommu->seq_id, iommu->node, iommu);
> if (irq <= 0) {
> @@ -118,7 +117,7 @@ int intel_svm_enable_prq(struct intel_iommu *iommu)
> dmar_free_hwirq(irq);
> iommu->pr_irq = 0;
> free_prq:
> - free_pages((unsigned long)iommu->prq, PRQ_ORDER);
> + iommu_free_pages(iommu->prq, PRQ_ORDER);
> iommu->prq = NULL;
>
> return ret;
> @@ -141,7 +140,7 @@ int intel_svm_finish_prq(struct intel_iommu *iommu)
> iommu->iopf_queue = NULL;
> }
>
> - free_pages((unsigned long)iommu->prq, PRQ_ORDER);
> + iommu_free_pages(iommu->prq, PRQ_ORDER);
> iommu->prq = NULL;
>
> return 0;
> diff --git a/drivers/iommu/iommu-pages.h b/drivers/iommu/iommu-pages.h
> new file mode 100644
> index 000000000000..35bfa369b134
> --- /dev/null
> +++ b/drivers/iommu/iommu-pages.h
> @@ -0,0 +1,154 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (c) 2024, Google LLC.
> + * Pasha Tatashin <pasha.tatashin@xxxxxxxxxx>
> + */
> +
> +#ifndef __IOMMU_PAGES_H
> +#define __IOMMU_PAGES_H
> +
> +#include <linux/vmstat.h>
> +#include <linux/gfp.h>
> +#include <linux/mm.h>
> +
> +/*
> + * All page allocations that should be reported to as "iommu-pagetables" to
> + * userspace must use on of the functions below. This includes allocations of
> + * page-tables and other per-iommu_domain configuration structures.

/s/use on/use one/?

> + *
> + * This is necessary for the proper accounting as IOMMU state can be rather
> + * large, i.e. multiple gigabytes in size.
> + */
> +
> +/**
> + * __iommu_alloc_pages - allocate a zeroed page of a given order.
> + * @gfp: buddy allocator flags

Shall we keep the comments generic here(avoid reference to allocator
algo) ?

> + * @order: page order
> + *
> + * returns the head struct page of the allocated page.
> + */
> +static inline struct page *__iommu_alloc_pages(gfp_t gfp, int order)
> +{
> + struct page *page;
> +
> + page = alloc_pages(gfp | __GFP_ZERO, order);
> + if (unlikely(!page))
> + return NULL;
> +
> + return page;
> +}
> +
> +/**
> + * __iommu_free_pages - free page of a given order
> + * @page: head struct page of the page
> + * @order: page order
> + */
> +static inline void __iommu_free_pages(struct page *page, int order)
> +{
> + if (!page)
> + return;
> +
> + __free_pages(page, order);
> +}
> +
> +/**
> + * iommu_alloc_pages_node - allocate a zeroed page of a given order from
> + * specific NUMA node.
> + * @nid: memory NUMA node id
> + * @gfp: buddy allocator flags

Same here for this one and other references below.

> + * @order: page order
> + *
> + * returns the virtual address of the allocated page
> + */
> +static inline void *iommu_alloc_pages_node(int nid, gfp_t gfp, int order)
> +{
> + struct page *page = alloc_pages_node(nid, gfp | __GFP_ZERO, order);
> +
> + if (unlikely(!page))
> + return NULL;
> +
> + return page_address(page);
> +}
> +
> +/**
> + * iommu_alloc_pages - allocate a zeroed page of a given order
> + * @gfp: buddy allocator flags
> + * @order: page order
> + *
> + * returns the virtual address of the allocated page
> + */
> +static inline void *iommu_alloc_pages(gfp_t gfp, int order)
> +{
> + struct page *page = __iommu_alloc_pages(gfp, order);
> +
> + if (unlikely(!page))
> + return NULL;
> +
> + return page_address(page);
> +}
> +
> +/**
> + * iommu_alloc_page_node - allocate a zeroed page at specific NUMA node.
> + * @nid: memory NUMA node id
> + * @gfp: buddy allocator flags
> + *
> + * returns the virtual address of the allocated page
> + */
> +static inline void *iommu_alloc_page_node(int nid, gfp_t gfp)
> +{
> + return iommu_alloc_pages_node(nid, gfp, 0);
> +}
> +
> +/**
> + * iommu_alloc_page - allocate a zeroed page
> + * @gfp: buddy allocator flags
> + *
> + * returns the virtual address of the allocated page
> + */
> +static inline void *iommu_alloc_page(gfp_t gfp)
> +{
> + return iommu_alloc_pages(gfp, 0);
> +}
> +
> +/**
> + * iommu_free_pages - free page of a given order
> + * @virt: virtual address of the page to be freed.
> + * @order: page order
> + */
> +static inline void iommu_free_pages(void *virt, int order)
> +{
> + if (!virt)
> + return;
> +
> + __iommu_free_pages(virt_to_page(virt), order);
> +}
> +
> +/**
> + * iommu_free_page - free page
> + * @virt: virtual address of the page to be freed.
> + */
> +static inline void iommu_free_page(void *virt)
> +{
> + iommu_free_pages(virt, 0);
> +}
> +
> +/**
> + * iommu_put_pages_list - free a list of pages.
> + * @page: the head of the lru list to be freed.
> + *
> + * There are no locking requirement for these pages, as they are going to be
> + * put on a free list as soon as refcount reaches 0. Pages are put on this LRU
> + * list once they are removed from the IOMMU page tables. However, they can
> + * still be access through debugfs.
> + */
> +static inline void iommu_put_pages_list(struct list_head *page)
> +{
> + while (!list_empty(page)) {
> + struct page *p = list_entry(page->prev, struct page, lru);
> +
> + list_del(&p->lru);
> + put_page(p);
> + }
> +}
> +
> +#endif /* __IOMMU_PAGES_H */
> --
> 2.44.0.rc0.258.g7320e95886-goog
>