[PATCH v3 04/18] iommupt: Implement preserve/unpreserve/restore callbacks

From: Samiullah Khawaja

Date: Sun Jun 14 2026 - 19:38:34 EST


Add iommupt ops for preservation, unpreservation and restoration of iommu
page tables for liveupdate. Use the existing page walker to preserve the
ioptdesc of the top_table and the lower tables.

Preserve top_level, VASZ and the PT_FEAT_SIGN_EXTEND feature so that the
domain can be restored in the next kernel. On restore, the domain has only
the preserved features enabled and all other features are zeroed. This is
OK since the restored domain is made immutable and can only be freed. A
kunit test is added to verify that the IOMMU domain can be freed with the
trimmed features.

Signed-off-by: Samiullah Khawaja <skhawaja@xxxxxxxxxx>
---
drivers/iommu/generic_pt/iommu_pt.h | 122 ++++++++++++++++++++++
drivers/iommu/generic_pt/kunit_iommu_pt.h | 28 +++++
include/linux/generic_pt/iommu.h | 30 ++++++
3 files changed, 180 insertions(+)

diff --git a/drivers/iommu/generic_pt/iommu_pt.h b/drivers/iommu/generic_pt/iommu_pt.h
index 19b6daf88f2a..7759bbf12d10 100644
--- a/drivers/iommu/generic_pt/iommu_pt.h
+++ b/drivers/iommu/generic_pt/iommu_pt.h
@@ -16,6 +16,7 @@
#include "../iommu-pages.h"
#include <linux/cleanup.h>
#include <linux/dma-mapping.h>
+#include <linux/iommu-liveupdate.h>

enum {
SW_BIT_CACHE_FLUSH_DONE = 0,
@@ -961,6 +962,118 @@ static int NS(map_range)(struct pt_iommu *iommu_table, dma_addr_t iova,
return ret;
}

+#ifdef CONFIG_IOMMU_LIVEUPDATE
+static void NS(unpreserve)(struct pt_iommu *iommu_table, struct iommu_domain_ser *ser)
+{
+ struct pt_common *common = common_from_iommu(iommu_table);
+ struct pt_range range = pt_all_range(common);
+ struct pt_iommu_collect_args collect = {
+ .free_list = IOMMU_PAGES_LIST_INIT(collect.free_list),
+ };
+
+ iommu_pages_list_add(&collect.free_list, range.top_table);
+ pt_walk_range(&range, __collect_tables, &collect);
+ /*
+ * free_list is used purely as a container here: it holds the top table
+ * plus whatever __collect_tables adds during the walk, and the whole
+ * list is handed to the unpreserve helper. This function makes no
+ * iommu_free_pages*() call itself and does not read @ser.
+ */
+ iommu_unpreserve_pages_list(&collect.free_list);
+}
+/*
+ * Preserve the table pages of @iommu_table for liveupdate and fill @ser with
+ * the state recorded below. Returns 0 on success, or the error from
+ * iommu_preserve_pages_list(), in which case @ser is not written.
+ * NOTE(review): whether a failed iommu_preserve_pages_list() undoes a partial
+ * preservation is not visible here - confirm.
+ */
+static int NS(preserve)(struct pt_iommu *iommu_table, struct iommu_domain_ser *ser)
+{
+ struct pt_common *common = common_from_iommu(iommu_table);
+ struct pt_range range = pt_all_range(common);
+ struct pt_iommu_collect_args collect = {
+ .free_list = IOMMU_PAGES_LIST_INIT(collect.free_list),
+ };
+ int ret;
+
+ iommu_pages_list_add(&collect.free_list, range.top_table);
+ pt_walk_range(&range, __collect_tables, &collect);
+
+ ret = iommu_preserve_pages_list(&collect.free_list);
+ if (ret)
+ return ret;
+
+ ser->top_table_phys = virt_to_phys(range.top_table);
+ ser->top_level = range.top_level;
+
+ /*
+ * VASZ and SIGN_EXTEND are needed by the next kernel so that its
+ * collector page table walk can restore and free the pages.
+ *
+ * Use max_vasz_lg2 from the range: that is the current value when
+ * DYNAMIC_TOP is supported.
+ */
+ ser->vasz = range.max_vasz_lg2;
+ ser->sign_extend = pt_feature(common, PT_FEAT_SIGN_EXTEND);
+
+ return 0;
+}
+/*
+ * Walker callback used by restore: for every entry at this level that points
+ * to a lower table, restore that table's page and then descend into it.
+ * Returns 0, or the first non-zero value returned by pt_descend().
+ * NOTE(review): any result of iommu_restore_pages() is ignored here; confirm
+ * that the helper cannot fail or that failure is handled elsewhere.
+ */
+static int __restore_tables(struct pt_range *range, void *arg,
+ unsigned int level, struct pt_table_p *table)
+{
+ struct pt_state pts = pt_init(range, level, table);
+ int ret;
+
+ for_each_pt_level_entry(&pts) {
+ if (pts.type == PT_ENTRY_TABLE) {
+ iommu_restore_pages(virt_to_phys(pts.table_lower));
+
+ /*
+ * pt_descend() can only fail when pts.table_lower is
+ * not initialized, so the error check below is not
+ * expected to trigger (effectively dead code).
+ */
+ ret = pt_descend(&pts, arg, __restore_tables);
+ if (ret)
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static const struct pt_iommu_ops NS(ops_immutable);
+/*
+ * Rebuild @iommu_table around the page table described by @ser: switch to the
+ * deinit-only ops, keep only SIGN_EXTEND out of the features, swap the current
+ * top table for the preserved one and restore the lower tables by walking it.
+ * Returns the result of pt_walk_range().
+ * NOTE(review): the values read from @ser are used without validation in this
+ * function; presumably the caller vets them - confirm.
+ */
+static int NS(restore)(struct pt_iommu *iommu_table, struct iommu_domain_ser *ser)
+{
+ struct pt_common *common = common_from_iommu(iommu_table);
+ struct pt_range range;
+
+ common->max_vasz_lg2 = ser->vasz;
+
+ /*
+ * Restored page tables are strictly transient and only permitted to be
+ * destroyed via deinit() op. Because we only preserve user-assigned
+ * devices utilizing pass-through frameworks (VFIO / IOMMUFD), any
+ * concurrent or subsequent map/unmap operations on a restored domain
+ * are explicitly blocked at the subsystem boundary (e.g., via IOAS
+ * immutability).
+ */
+ iommu_table->ops = &NS(ops_immutable);
+
+ /*
+ * It is safe to override this here since this domain is immutable and
+ * can only be freed.
+ */
+ common->features = 0;
+ if (ser->sign_extend)
+ common->features |= BIT(PT_FEAT_SIGN_EXTEND);
+
+ range = pt_all_range(common);
+ iommu_restore_pages(ser->top_table_phys);
+
+ /* Free the new (current) top table; the preserved one replaces it below */
+ iommu_free_pages(range.top_table);
+
+ /* Set the restored top table */
+ pt_top_set(common, phys_to_virt(ser->top_table_phys), ser->top_level);
+
+ /* Re-read the range after pt_top_set(), then restore the lower tables */
+ range = pt_all_range(common);
+ return pt_walk_range(&range, __restore_tables, NULL);
+}
+#endif
+
struct pt_unmap_args {
struct iommu_pages_list free_list;
pt_vaddr_t unmapped;
@@ -1136,6 +1249,15 @@ static const struct pt_iommu_ops NS(ops) = {
#endif
.get_info = NS(get_info),
.deinit = NS(deinit),
+#ifdef CONFIG_IOMMU_LIVEUPDATE
+ .preserve = NS(preserve),
+ .unpreserve = NS(unpreserve),
+ .restore = NS(restore),
+#endif
+};
+/*
+ * Ops installed on a restored table: only deinit is provided, every other op
+ * is left unset. NOTE(review): this definition sits outside the
+ * CONFIG_IOMMU_LIVEUPDATE guard while its only user visible in this patch,
+ * the restore op, is inside it - check for an unused-variable warning when
+ * the option is disabled.
+ */
+static const struct pt_iommu_ops NS(ops_immutable) = {
+ .deinit = NS(deinit),
};

static int pt_init_common(struct pt_common *common)
diff --git a/drivers/iommu/generic_pt/kunit_iommu_pt.h b/drivers/iommu/generic_pt/kunit_iommu_pt.h
index e8a63c8ea850..af1918d693ed 100644
--- a/drivers/iommu/generic_pt/kunit_iommu_pt.h
+++ b/drivers/iommu/generic_pt/kunit_iommu_pt.h
@@ -426,6 +426,33 @@ static void test_mixed(struct kunit *test)
check_iova(test, start, oa, len);
}

+static void test_restore_free(struct kunit *test)
+{
+ struct kunit_iommu_priv *priv = test->priv;
+ struct pt_range top_range = pt_top_range(priv->common);
+ u64 start = 0x3fe400ULL << 12;
+ u64 end = 0x4c0600ULL << 12;
+ pt_vaddr_t len = end - start;
+
+ if (top_range.last_va <= start || sizeof(unsigned long) == 4)
+ kunit_skip(test, "range is too small");
+ if ((priv->safe_pgsize_bitmap & GENMASK(30, 21)) != (BIT(30) | BIT(21)))
+ kunit_skip(test, "incompatible psize");
+
+ /* Map a large mixed range to populate multiple levels of page tables */
+ do_map(test, start, start, len);
+
+ /*
+ * Simulate a restored state by clearing all features except
+ * SIGN_EXTEND. This verifies that the generic page table free walker
+ * can correctly tear down a populated domain when other features are
+ * zeroed.
+ */
+ priv->common->features &= BIT(PT_FEAT_SIGN_EXTEND);
+
+ /*
+ * Nothing is asserted here: the domain is freed when the test exits,
+ * and that teardown with the trimmed features is what is being tested.
+ */
+}
+
static struct kunit_case iommu_test_cases[] = {
KUNIT_CASE_FMT(test_increase_level),
KUNIT_CASE_FMT(test_map_simple),
@@ -434,6 +461,7 @@ static struct kunit_case iommu_test_cases[] = {
KUNIT_CASE_FMT(test_random_map),
KUNIT_CASE_FMT(test_pgsize_boundary),
KUNIT_CASE_FMT(test_mixed),
+ KUNIT_CASE_FMT(test_restore_free),
{},
};

diff --git a/include/linux/generic_pt/iommu.h b/include/linux/generic_pt/iommu.h
index dd0edd02a48a..faa41d8032fe 100644
--- a/include/linux/generic_pt/iommu.h
+++ b/include/linux/generic_pt/iommu.h
@@ -13,6 +13,7 @@ struct iommu_iotlb_gather;
struct pt_iommu_ops;
struct pt_iommu_driver_ops;
struct iommu_dirty_bitmap;
+struct iommu_domain_ser;

/**
* DOC: IOMMU Radix Page Table
@@ -166,6 +167,35 @@ struct pt_iommu_ops {
* table from all HW access and all caches.
*/
void (*deinit)(struct pt_iommu *iommu_table);
+
+ /**
+ * @preserve: Preserve the iommu page table for liveupdate
+ * @iommu_table: Table to preserve
+ * @ser: Serialization struct to fill with preserved state
+ *
+ * Preserve iommu page table and the relevant state for liveupdate. The
+ * caller must make sure that the page table is not updated during and
+ * after preservation.
+ */
+ int (*preserve)(struct pt_iommu *iommu_table, struct iommu_domain_ser *ser);
+
+ /**
+ * @unpreserve: Unpreserve the iommu page table
+ * @iommu_table: Table to unpreserve
+ * @ser: Serialization struct that contains preserved state
+ */
+ void (*unpreserve)(struct pt_iommu *iommu_table, struct iommu_domain_ser *ser);
+
+ /**
+ * @restore: Restore the iommu page table after liveupdate
+ * @iommu_table: Table to restore the state into
+ * @ser: Serialization struct that contains preserved state
+ *
+ * The iommu_table is back filled with the restored state that was
+ * preserved in the serialization struct.
+ */
+ int (*restore)(struct pt_iommu *iommu_table, struct iommu_domain_ser *ser);
+
};

/**
--
2.54.0.1136.gdb2ca164c4-goog