[PATCH] iommu/amd: Modify set_dte_entry() to use 128-bit cmpxchg operation

From: Suravee Suthikulpanit
Date: Mon Aug 19 2024 - 12:19:23 EST


The current implementation does not follow the 128-bit write
requirement to update DTE as specified in the AMD I/O Virtualization
Technology (IOMMU) Specification.

In addition, the function is used to program several DTE fields
(e.g. stage1 table, stage2 table, domain id, etc.), which are
difficult to keep track of with the current implementation.

Therefore, introduce a new struct dte256 data type and a helper function
update_dte256(), which uses two try_cmpxchg128 operations to update
the 256-bit DTE.

Also, separate logic for setting up the GCR3 Table Root Pointer, GIOV, GV,
GLX, and GuestPagingMode into another helper function set_dte_gcr3_table().

Signed-off-by: Suravee Suthikulpanit <suravee.suthikulpanit@xxxxxxx>
---
drivers/iommu/amd/amd_iommu_types.h | 17 ++++
drivers/iommu/amd/iommu.c | 143 +++++++++++++++++-----------
2 files changed, 107 insertions(+), 53 deletions(-)

diff --git a/drivers/iommu/amd/amd_iommu_types.h b/drivers/iommu/amd/amd_iommu_types.h
index c9f9a598eb82..295138447476 100644
--- a/drivers/iommu/amd/amd_iommu_types.h
+++ b/drivers/iommu/amd/amd_iommu_types.h
@@ -886,6 +886,23 @@ struct dev_table_entry {
u64 data[4];
};

+struct dte256 {
+ union {
+ struct {
+ u64 lo;
+ u64 hi;
+ };
+ u128 data;
+ } qw_lo;
+ union {
+ struct {
+ u64 lo;
+ u64 hi;
+ };
+ u128 data;
+ } qw_hi;
+};
+
/*
* One entry for unity mappings parsed out of the ACPI table.
*/
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 87c5385ce3f2..189f65af45fe 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -1851,90 +1851,127 @@ int amd_iommu_clear_gcr3(struct iommu_dev_data *dev_data, ioasid_t pasid)
return ret;
}

+static void set_dte_gcr3_table(struct amd_iommu *iommu,
+ struct iommu_dev_data *dev_data,
+ struct dte256 *target)
+{
+ struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
+ u64 tmp, gcr3;
+
+ if (!gcr3_info->gcr3_tbl)
+ return;
+
+ pr_debug("%s: devid=%#x, glx=%#x, gcr3_tbl=%#llx\n",
+ __func__, dev_data->devid, gcr3_info->glx,
+ (unsigned long long)gcr3_info->gcr3_tbl);
+
+ tmp = gcr3_info->glx;
+ target->qw_lo.lo |= (tmp & DTE_GLX_MASK) << DTE_GLX_SHIFT;
+ if (pdom_is_v2_pgtbl_mode(dev_data->domain))
+ target->qw_lo.lo |= DTE_FLAG_GIOV;
+ target->qw_lo.lo |= DTE_FLAG_GV;
+
+ /* First mask out possible old values for GCR3 table */
+ tmp = DTE_GCR3_VAL_A(~0ULL) << DTE_GCR3_SHIFT_A;
+ target->qw_lo.lo &= ~tmp;
+ tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B;
+ tmp |= DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C;
+ target->qw_lo.hi &= ~tmp;
+
+ gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl);
+
+ /* Encode GCR3 table into DTE */
+ tmp = DTE_GCR3_VAL_A(gcr3) << DTE_GCR3_SHIFT_A;
+ target->qw_lo.lo |= tmp;
+ tmp = DTE_GCR3_VAL_B(gcr3) << DTE_GCR3_SHIFT_B;
+ tmp |= DTE_GCR3_VAL_C(gcr3) << DTE_GCR3_SHIFT_C;
+ target->qw_lo.hi |= tmp;
+
+ /* Mask out old values for GuestPagingMode */
+ target->qw_hi.lo &= ~(0x3ULL << DTE_GPT_LEVEL_SHIFT);
+ /* Guest page table can only support 4 and 5 levels */
+ if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL)
+ target->qw_hi.lo |= ((u64)GUEST_PGTABLE_5_LEVEL << DTE_GPT_LEVEL_SHIFT);
+}
+
+static void update_dte256(struct amd_iommu *iommu, u16 devid, struct dte256 *new)
+{
+ struct dev_table_entry *dev_table = get_dev_table(iommu);
+ struct dte256 *ptr = (struct dte256 *)&dev_table[devid];
+ struct dte256 old = {
+ .qw_lo.data = ptr->qw_lo.data,
+ .qw_hi.data = ptr->qw_hi.data,
+ };
+
+ /* Update qw_lo */
+ if (!try_cmpxchg128(&ptr->qw_lo.data, &old.qw_lo.data, new->qw_lo.data))
+ goto err_out;
+
+ /* Update qw_hi */
+ if (!try_cmpxchg128(&ptr->qw_hi.data, &old.qw_hi.data, new->qw_hi.data)) {
+ /* Restore qw_lo */
+ try_cmpxchg128(&ptr->qw_lo.data, &new->qw_lo.data, old.qw_lo.data);
+ goto err_out;
+ }
+ return;
+err_out:
+ pr_err("%s: Failed to update DTE for devid %#x\n", __func__, devid);
+}
+
static void set_dte_entry(struct amd_iommu *iommu,
struct iommu_dev_data *dev_data)
{
- u64 pte_root = 0;
- u64 flags = 0;
- u32 old_domid;
- u16 devid = dev_data->devid;
u16 domid;
+ struct dte256 new = { .qw_lo.data = 0, .qw_hi.data = 0 };
+ u16 devid = dev_data->devid;
struct protection_domain *domain = dev_data->domain;
struct dev_table_entry *dev_table = get_dev_table(iommu);
struct gcr3_tbl_info *gcr3_info = &dev_data->gcr3_info;
+ u32 old_domid = dev_table[devid].data[1] & DEV_DOMID_MASK;

- if (gcr3_info && gcr3_info->gcr3_tbl)
+ if (gcr3_info->gcr3_tbl)
domid = dev_data->gcr3_info.domid;
else
domid = domain->id;

+ /*
+ * Need to get the current values in dte[1..3] because they contain
+ * settings (e.g. interrupt remapping) which have been programmed earlier.
+ */
+ new.qw_lo.hi = dev_table[devid].data[1];
+ new.qw_hi.lo = dev_table[devid].data[2];
+ new.qw_hi.hi = dev_table[devid].data[3];
+
if (domain->iop.mode != PAGE_MODE_NONE)
- pte_root = iommu_virt_to_phys(domain->iop.root);
+ new.qw_lo.lo = iommu_virt_to_phys(domain->iop.root);

- pte_root |= (domain->iop.mode & DEV_ENTRY_MODE_MASK)
+ new.qw_lo.lo |= (domain->iop.mode & DEV_ENTRY_MODE_MASK)
<< DEV_ENTRY_MODE_SHIFT;

- pte_root |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V;
+ new.qw_lo.lo |= DTE_FLAG_IR | DTE_FLAG_IW | DTE_FLAG_V;

/*
* When SNP is enabled, Only set TV bit when IOMMU
* page translation is in use.
*/
if (!amd_iommu_snp_en || (domid != 0))
- pte_root |= DTE_FLAG_TV;
-
- flags = dev_table[devid].data[1];
+ new.qw_lo.lo |= DTE_FLAG_TV;

if (dev_data->ats_enabled)
- flags |= DTE_FLAG_IOTLB;
+ new.qw_lo.hi |= DTE_FLAG_IOTLB;

if (dev_data->ppr)
- pte_root |= 1ULL << DEV_ENTRY_PPR;
+ new.qw_lo.lo |= 1ULL << DEV_ENTRY_PPR;

if (domain->dirty_tracking)
- pte_root |= DTE_FLAG_HAD;
-
- if (gcr3_info && gcr3_info->gcr3_tbl) {
- u64 gcr3 = iommu_virt_to_phys(gcr3_info->gcr3_tbl);
- u64 glx = gcr3_info->glx;
- u64 tmp;
+ new.qw_lo.lo |= DTE_FLAG_HAD;

- pte_root |= DTE_FLAG_GV;
- pte_root |= (glx & DTE_GLX_MASK) << DTE_GLX_SHIFT;
-
- /* First mask out possible old values for GCR3 table */
- tmp = DTE_GCR3_VAL_B(~0ULL) << DTE_GCR3_SHIFT_B;
- flags &= ~tmp;
-
- tmp = DTE_GCR3_VAL_C(~0ULL) << DTE_GCR3_SHIFT_C;
- flags &= ~tmp;
-
- /* Encode GCR3 table into DTE */
- tmp = DTE_GCR3_VAL_A(gcr3) << DTE_GCR3_SHIFT_A;
- pte_root |= tmp;
-
- tmp = DTE_GCR3_VAL_B(gcr3) << DTE_GCR3_SHIFT_B;
- flags |= tmp;
-
- tmp = DTE_GCR3_VAL_C(gcr3) << DTE_GCR3_SHIFT_C;
- flags |= tmp;
-
- if (amd_iommu_gpt_level == PAGE_MODE_5_LEVEL) {
- dev_table[devid].data[2] |=
- ((u64)GUEST_PGTABLE_5_LEVEL << DTE_GPT_LEVEL_SHIFT);
- }
-
- /* GIOV is supported with V2 page table mode only */
- if (pdom_is_v2_pgtbl_mode(domain))
- pte_root |= DTE_FLAG_GIOV;
- }
+ new.qw_lo.hi &= ~DEV_DOMID_MASK;
+ new.qw_lo.hi |= domid;

- flags &= ~DEV_DOMID_MASK;
- flags |= domid;
+ set_dte_gcr3_table(iommu, dev_data, &new);

- old_domid = dev_table[devid].data[1] & DEV_DOMID_MASK;
- dev_table[devid].data[1] = flags;
- dev_table[devid].data[0] = pte_root;
+ update_dte256(iommu, devid, &new);

/*
* A kdump kernel might be replacing a domain ID that was copied from
--
2.34.1