[PATCH v2 21/26] iommu/amd: Map kvmfd to shared translate device ID for vIOMMU
From: Suravee Suthikulpanit
Date: Thu May 28 2026 - 01:23:30 EST
Add a per-PCI-segment xarray, indexed by kvmfd, of refcounted
struct amd_iommu_kvmfd_trans_entry so that all vIOMMUs for one VM share
one translate-device-id and GPA->SPA DTE.
Expose helper functions:
* amd_iommu_get_trans_devid_by_kvmfd()
* amd_iommu_free_trans_devid_by_kvmfd()
which are backed by the file-local trans_devid_alloc() and
trans_devid_free(), and extend segment init/fini for the kvmfd map.
Wire iommufd vIOMMU init/destroy to obtain the ID, program the translation
DTE and VFctrl TransDevID field, and tear down on error or uninit. Clear
translation state in amd_viommu_uninit_one().
Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@xxxxxxx>
---
drivers/iommu/amd/amd_iommu.h | 4 +
drivers/iommu/amd/amd_iommu_types.h | 13 ++
drivers/iommu/amd/iommu.c | 2 +-
drivers/iommu/amd/iommufd.c | 18 ++-
drivers/iommu/amd/trans_devid.c | 194 +++++++++++++++++++++++++++-
drivers/iommu/amd/viommu.c | 3 +
6 files changed, 230 insertions(+), 4 deletions(-)
diff --git a/drivers/iommu/amd/amd_iommu.h b/drivers/iommu/amd/amd_iommu.h
index ddfc6329d235..cf2b051948a3 100644
--- a/drivers/iommu/amd/amd_iommu.h
+++ b/drivers/iommu/amd/amd_iommu.h
@@ -223,6 +223,10 @@ void amd_iommu_pci_seg_trans_devid_fini(struct amd_iommu_pci_seg *pci_seg);
int amd_iommu_trans_devid_reserve(struct amd_iommu_pci_seg *pci_seg, u16 id);
int amd_iommu_trans_devid_reserve_pci_aliases(struct amd_iommu *iommu,
struct device *dev);
+int amd_iommu_get_trans_devid_by_kvmfd(struct amd_iommu_pci_seg *pci_seg,
+ u32 kvmfd, u16 *trans_devid);
+void amd_iommu_free_trans_devid_by_kvmfd(struct amd_iommu_pci_seg *pci_seg,
+ u32 kvmfd);
#else
static inline void
amd_iommu_pci_seg_trans_devid_init(struct amd_iommu_pci_seg *pci_seg) { }
diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index ffa338c8735f..92c83c68b2b1 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -559,6 +559,7 @@ struct amd_iommu_viommu {
u64 *devid_table;
u64 *domid_table;
+ u16 trans_devid;
/* Offset for mmap() of guest VF MMIO; set after iommufd_viommu_alloc_mmap(). */
unsigned long vfmmio_mmap_offset;
@@ -621,6 +622,11 @@ enum trans_devid_state {
TRANS_DEVID_RESERVED,
TRANS_DEVID_ALLOCATED,
};
+
+/*
+ * One entry of the per-segment kvmfd map: the translate-device-id assigned
+ * to a kvmfd together with the number of users currently sharing it.
+ */
+struct amd_iommu_kvmfd_trans_entry {
+ /* Set to 1 when the entry is created, bumped on every later lookup. */
+ refcount_t refs;
+ /* Translate-device-id allocated for this kvmfd. */
+ u16 trans_devid;
+};
#endif
/*
@@ -692,6 +698,13 @@ struct amd_iommu_pci_seg {
*/
struct mutex trans_devid_mutex;
struct xarray trans_devid_xa;
+
+ /*
+ * Per-segment kvmfd mapping. The xarray is indexed by the kvmfd.
+ * Values are struct amd_iommu_kvmfd_trans_entry *. An xa_mk_value(trans_devid)
+ * entry is only handled at segment teardown; the get/free helpers reject it.
+ */
+ struct mutex kvmfd_xa_mutex;
+ struct xarray kvmfd_xa;
#endif
};
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 2f9ca8f2d3c6..2530c8e5490c 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -3058,7 +3058,7 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, struct device *dev,
#if IS_ENABLED(CONFIG_AMD_IOMMU_IOMMUFD)
/* Translate-device-id reservation must be done before setting up
* the DTE for the device to make sure that the id has not been allocated
- * yet. (See amd_iommu_trans_devid_alloc().)
+ * yet. (See trans_devid_alloc() in trans_devid.c.)
*/
ret = amd_iommu_trans_devid_reserve(iommu->pci_seg, dev_data->devid);
if (ret) {
diff --git a/drivers/iommu/amd/iommufd.c b/drivers/iommu/amd/iommufd.c
index 7c7ef267088b..23a2c8c20365 100644
--- a/drivers/iommu/amd/iommufd.c
+++ b/drivers/iommu/amd/iommufd.c
@@ -64,6 +64,7 @@ int amd_iommufd_viommu_init(struct iommufd_viommu *viommu, struct iommu_domain *
int ret;
phys_addr_t page_base;
unsigned long flags;
+ u16 trans_devid;
struct iommu_viommu_amd data = {};
struct protection_domain *pdom = to_pdomain(parent);
struct amd_iommu *iommu = container_of(viommu->iommu_dev, struct amd_iommu, iommu);
@@ -104,9 +105,17 @@ int amd_iommufd_viommu_init(struct iommufd_viommu *viommu, struct iommu_domain *
goto err_kvmfd;
}
+ ret = amd_iommu_get_trans_devid_by_kvmfd(iommu->pci_seg, data.kvmfd,
+ &trans_devid);
+ if (ret)
+ goto err_kvmfd;
+
/* Reset vIOMMU MMIOs to initialize the vIOMMU */
iommu_reset_vmmio(iommu, aviommu->gid);
+ amd_iommu_set_translate_dte(iommu, aviommu->gid, pdom, trans_devid);
+ amd_iommu_update_vfctrl_mmio_translate_devid(iommu, aviommu->gid, trans_devid);
+
ret = amd_viommu_init_one(iommu, aviommu);
if (ret)
goto err_init;
@@ -117,6 +126,7 @@ int amd_iommufd_viommu_init(struct iommufd_viommu *viommu, struct iommu_domain *
if (ret)
goto err_init;
+ aviommu->trans_devid = trans_devid;
aviommu->kvmfd = data.kvmfd;
viommu->ops = &amd_viommu_ops;
@@ -126,6 +136,9 @@ int amd_iommufd_viommu_init(struct iommufd_viommu *viommu, struct iommu_domain *
return 0;
err_init:
+ amd_iommu_update_vfctrl_mmio_translate_devid(iommu, aviommu->gid, 0);
+ amd_iommu_clear_translate_dte(iommu, aviommu->gid, trans_devid);
+ amd_iommu_free_trans_devid_by_kvmfd(iommu->pci_seg, data.kvmfd);
err_kvmfd:
iommufd_viommu_destroy_mmap(&aviommu->core, aviommu->vfmmio_mmap_offset);
err_mmap:
@@ -147,10 +160,11 @@ static void amd_iommufd_viommu_destroy(struct iommufd_viommu *viommu)
list_del(&aviommu->pdom_list);
spin_unlock_irqrestore(&pdom->lock, flags);
xa_destroy(&aviommu->gdomid_array);
- if (aviommu->vfmmio_mmap_offset)
- iommufd_viommu_destroy_mmap(&aviommu->core, aviommu->vfmmio_mmap_offset);
amd_iommu_gid_free(iommu, aviommu->gid);
amd_viommu_uninit_one(iommu, aviommu);
+ if (aviommu->vfmmio_mmap_offset)
+ iommufd_viommu_destroy_mmap(&aviommu->core, aviommu->vfmmio_mmap_offset);
+ amd_iommu_free_trans_devid_by_kvmfd(iommu->pci_seg, aviommu->kvmfd);
}
/*
diff --git a/drivers/iommu/amd/trans_devid.c b/drivers/iommu/amd/trans_devid.c
index 76450d1735e2..712e0aabcd94 100644
--- a/drivers/iommu/amd/trans_devid.c
+++ b/drivers/iommu/amd/trans_devid.c
@@ -2,15 +2,22 @@
/*
* Copyright (C) 2025 Advanced Micro Devices, Inc.
*
- * AMD vIOMMU translate-device-id pool per PCI segment.
+ * AMD vIOMMU translate-device-id management.
+ *
+ * The id must be allocated from an unused range. It is programmed into the vIOMMU VF Control
+ * register to select the DTE that holds the GPA->SPA mapping (v1 page table).
*/
#include <linux/kernel.h>
#include <linux/pci.h>
+#include <linux/refcount.h>
+#include <linux/slab.h>
#include <linux/xarray.h>
#include "amd_iommu.h"
+static void trans_devid_free(struct amd_iommu_pci_seg *pci_seg, u16 id);
+
static inline enum trans_devid_state trans_devid_xa_get_state(void *entry)
{
if (!entry)
@@ -29,10 +36,32 @@ void amd_iommu_pci_seg_trans_devid_init(struct amd_iommu_pci_seg *pci_seg)
{
mutex_init(&pci_seg->trans_devid_mutex);
xa_init(&pci_seg->trans_devid_xa);
+ mutex_init(&pci_seg->kvmfd_xa_mutex);
+ xa_init(&pci_seg->kvmfd_xa);
}
+/*
+ * Tear down the per-segment translate-device-id state: give back the id held
+ * by every entry still present in kvmfd_xa, free pointer entries, then destroy
+ * both xarrays.
+ */
void amd_iommu_pci_seg_trans_devid_fini(struct amd_iommu_pci_seg *pci_seg)
{
+ unsigned long index = 0;
+ void *e;
+
+ /* xa_find() leaves index at the position of the entry it returns. */
+ while ((e = xa_find(&pci_seg->kvmfd_xa, &index, ULONG_MAX, XA_PRESENT))) {
+ unsigned long cur = index;
+
+ /* A value entry carries the id itself; a pointer entry must also be freed. */
+ if (xa_is_value(e))
+ trans_devid_free(pci_seg, (u16)xa_to_value(e));
+ else {
+ struct amd_iommu_kvmfd_trans_entry *entry = e;
+
+ trans_devid_free(pci_seg, entry->trans_devid);
+ kfree(entry);
+ }
+ xa_erase(&pci_seg->kvmfd_xa, cur);
+ /* Stop here rather than let cur + 1 wrap around to 0. */
+ if (cur == ULONG_MAX)
+ break;
+ index = cur + 1;
+ }
+ xa_destroy(&pci_seg->kvmfd_xa);
xa_destroy(&pci_seg->trans_devid_xa);
}
@@ -123,3 +152,166 @@ int amd_iommu_trans_devid_reserve_pci_aliases(struct amd_iommu *iommu,
return pci_for_each_dma_alias(pdev, reserve_trans_devid_each_dma_alias,
pci_seg);
}
+
+/**
+ * trans_devid_alloc - pick an unused translate-device-id in @pci_seg
+ * @pci_seg: PCI segment whose trans_devid_xa is searched and updated
+ *
+ * Ids are searched from U16_MAX downwards. PCI devices are generally
+ * enumerated from the beginning of the bus range, so ids at the high end
+ * are the least likely to be in use.
+ *
+ * The first id that has no entry in trans_devid_xa is marked
+ * TRANS_DEVID_ALLOCATED while trans_devid_mutex is held.
+ *
+ * Return: the allocated id on success, the xa_store() error if marking the
+ * id fails, or -ENOSPC if every id already has an entry.
+ */
+static int trans_devid_alloc(struct amd_iommu_pci_seg *pci_seg)
+{
+	int candidate;
+	int ret = -ENOSPC;
+
+	mutex_lock(&pci_seg->trans_devid_mutex);
+	for (candidate = U16_MAX; candidate >= 0; candidate--) {
+		void *prev;
+
+		/* Any existing entry, whatever its state, means the id is taken. */
+		if (xa_load(&pci_seg->trans_devid_xa, candidate))
+			continue;
+
+		prev = xa_store(&pci_seg->trans_devid_xa, candidate,
+				trans_devid_xa_mk_state(TRANS_DEVID_ALLOCATED),
+				GFP_KERNEL);
+		if (xa_is_err(prev)) {
+			ret = xa_err(prev);
+		} else {
+			WARN_ON_ONCE(prev);
+			ret = candidate;
+		}
+		break;
+	}
+
+	/* The loop only runs off the end (candidate < 0) when no id was free. */
+	if (candidate < 0)
+		pr_err("%s: No free trans_devid found (seg %#x)\n", __func__, pci_seg->id);
+	mutex_unlock(&pci_seg->trans_devid_mutex);
+
+	if (ret >= 0)
+		pr_debug("%s: Allocated trans_devid %#x (seg %#x)\n", __func__, ret,
+			 pci_seg->id);
+	return ret;
+}
+
+/*
+ * trans_devid_free - return an allocated translate-device-id to the pool
+ * @pci_seg: PCI segment whose trans_devid_xa holds the id
+ * @id: translate-device-id previously returned by trans_devid_alloc()
+ *
+ * The entry is inspected before it is erased, so an id that is not in the
+ * TRANS_DEVID_ALLOCATED state (no entry at all, or an entry in another state
+ * such as TRANS_DEVID_RESERVED) is left untouched and only triggers a
+ * one-time warning.
+ */
+static void trans_devid_free(struct amd_iommu_pci_seg *pci_seg, u16 id)
+{
+	void *cur;
+
+	mutex_lock(&pci_seg->trans_devid_mutex);
+	cur = xa_load(&pci_seg->trans_devid_xa, id);
+	/* Never erase an entry this allocator did not create. */
+	if (WARN_ON_ONCE(trans_devid_xa_get_state(cur) != TRANS_DEVID_ALLOCATED))
+		goto out;
+	xa_erase(&pci_seg->trans_devid_xa, id);
+	pr_debug("%s: Freed trans_devid %#x (seg %#x)\n", __func__, id, pci_seg->id);
+out:
+	mutex_unlock(&pci_seg->trans_devid_mutex);
+}
+
+/**
+ * amd_iommu_get_trans_devid_by_kvmfd - look up or allocate trans_devid for @kvmfd
+ * @pci_seg: PCI segment that owns the kvmfd map and the id pool
+ * @kvmfd: key identifying the VM; used verbatim as the kvmfd_xa index
+ * @trans_devid: written with the translate-device-id on success only
+ *
+ * If @kvmfd already has an entry, its refcount is incremented and the stored
+ * id is returned. Otherwise a new id is allocated, a new entry with refcount 1
+ * is inserted, and that id is returned. The whole operation runs under
+ * kvmfd_xa_mutex.
+ *
+ * Note: one translate-device-id is used per VM (kvmfd) because there is one
+ * GPA->SPA mapping per VM; multiple vIOMMUs of the same VM share the id.
+ *
+ * NOTE(review): if @kvmfd is a per-process file descriptor number, unrelated
+ * VMs in different processes could map to the same key - confirm with callers.
+ *
+ * Return: 0 on success, %-EIO if the map holds an unexpected value entry,
+ * %-ENOMEM if the entry cannot be allocated, otherwise the error returned by
+ * trans_devid_alloc() (e.g. %-ENOSPC) or by xa_store().
+ */
+int amd_iommu_get_trans_devid_by_kvmfd(struct amd_iommu_pci_seg *pci_seg, u32 kvmfd,
+				       u16 *trans_devid)
+{
+	struct amd_iommu_kvmfd_trans_entry *ent;
+	void *old;
+	int new_id;
+	int ret = 0;
+
+	mutex_lock(&pci_seg->kvmfd_xa_mutex);
+
+	/* Fast path: the VM already has an id, just take another reference. */
+	ent = xa_load(&pci_seg->kvmfd_xa, kvmfd);
+	if (ent) {
+		if (WARN_ON_ONCE(xa_is_value(ent))) {
+			ret = -EIO;
+		} else {
+			refcount_inc(&ent->refs);
+			*trans_devid = ent->trans_devid;
+			pr_debug("%s: Got trans_devid %#x for kvmfd %#x (seg %#x)\n",
+				 __func__, *trans_devid, kvmfd, pci_seg->id);
+		}
+		goto out_unlock;
+	}
+
+	new_id = trans_devid_alloc(pci_seg);
+	if (new_id < 0) {
+		pr_err("%s: Failed to allocate trans_devid (kvmfd=%#x seg=%#x err=%d)\n",
+		       __func__, kvmfd, pci_seg->id, new_id);
+		ret = new_id;
+		goto out_unlock;
+	}
+
+	ent = kzalloc(sizeof(*ent), GFP_KERNEL);
+	if (!ent) {
+		ret = -ENOMEM;
+		goto err_free_id;
+	}
+	ent->trans_devid = new_id;
+	refcount_set(&ent->refs, 1);
+
+	old = xa_store(&pci_seg->kvmfd_xa, kvmfd, ent, GFP_KERNEL);
+	if (xa_is_err(old)) {
+		ret = xa_err(old);
+		goto err_free_entry;
+	}
+	/* The slot was empty when loaded above under the same mutex. */
+	WARN_ON_ONCE(old);
+
+	*trans_devid = new_id;
+	pr_debug("%s: Allocated trans_devid %#x for kvmfd %#x (seg %#x)\n",
+		 __func__, new_id, kvmfd, pci_seg->id);
+	goto out_unlock;
+
+err_free_entry:
+	kfree(ent);
+err_free_id:
+	trans_devid_free(pci_seg, new_id);
+out_unlock:
+	mutex_unlock(&pci_seg->kvmfd_xa_mutex);
+	return ret;
+}
+
+/**
+ * amd_iommu_free_trans_devid_by_kvmfd - drop one reference for @kvmfd
+ * @pci_seg: PCI segment that owns the kvmfd map and the id pool
+ * @kvmfd: key previously passed to amd_iommu_get_trans_devid_by_kvmfd()
+ *
+ * Decrements the refcount of the @kvmfd entry. Only when it reaches zero is
+ * the translate-device-id returned to the segment pool, the map entry erased
+ * and the entry freed. A missing entry is ignored; an unexpected value entry
+ * triggers a one-time warning and is left in place.
+ */
+void amd_iommu_free_trans_devid_by_kvmfd(struct amd_iommu_pci_seg *pci_seg, u32 kvmfd)
+{
+	struct amd_iommu_kvmfd_trans_entry *ent;
+	bool released = false;
+	u16 freed_id = 0;
+
+	mutex_lock(&pci_seg->kvmfd_xa_mutex);
+
+	ent = xa_load(&pci_seg->kvmfd_xa, kvmfd);
+	if (!ent || WARN_ON_ONCE(xa_is_value(ent)))
+		goto out_unlock;
+
+	if (refcount_dec_and_test(&ent->refs)) {
+		/* Last user: give the id back before dropping the map entry. */
+		freed_id = ent->trans_devid;
+		trans_devid_free(pci_seg, freed_id);
+		xa_erase(&pci_seg->kvmfd_xa, kvmfd);
+		kfree(ent);
+		released = true;
+	} else {
+		pr_debug("%s: kvmfd %#x, trans_devid %#x (seg %#x)\n",
+			 __func__, kvmfd, ent->trans_devid, pci_seg->id);
+	}
+
+out_unlock:
+	mutex_unlock(&pci_seg->kvmfd_xa_mutex);
+
+	if (released)
+		pr_debug("%s: Freed trans_devid %#x for kvmfd %#x (seg %#x)\n", __func__,
+			 freed_id, kvmfd, pci_seg->id);
+}
diff --git a/drivers/iommu/amd/viommu.c b/drivers/iommu/amd/viommu.c
index b3150f7bcec3..d2c883e314f8 100644
--- a/drivers/iommu/amd/viommu.c
+++ b/drivers/iommu/amd/viommu.c
@@ -449,6 +449,9 @@ void amd_viommu_uninit_one(struct amd_iommu *iommu, struct amd_iommu_viommu *avi
VIOMMU_DOMID_MAPPING_BASE,
VIOMMU_DOMID_MAPPING_ENTRY_SIZE,
aviommu->gid);
+
+ amd_iommu_update_vfctrl_mmio_translate_devid(iommu, aviommu->gid, 0);
+ amd_iommu_clear_translate_dte(iommu, aviommu->gid, aviommu->trans_devid);
viommu_clear_mapping(iommu, aviommu);
}
--
2.34.1