[PATCH v9 4/6] x86/sev: Add support to perform RMP optimizations asynchronously

From: Ashish Kalra

Date: Wed Jun 24 2026 - 17:58:38 EST

From: Ashish Kalra <ashish.kalra@xxxxxxx>

When SEV-SNP is enabled, all writes to memory are checked to ensure
integrity of SNP guest memory. This imposes performance overhead on the
whole system.

RMPOPT is a new instruction that minimizes the performance overhead of
RMP checks on the hypervisor and on non-SNP guests by allowing RMP
checks to be skipped for 1GB regions of memory that are known not to
contain any SEV-SNP guest memory.

Add support for performing RMP optimizations asynchronously using a
dedicated workqueue.

Enable RMPOPT optimizations for up to 2TB of system RAM starting from
the lowest physical memory address aligned down to a 1GB boundary at
RMP initialization time. RMP checks can initially be skipped for 1GB
memory ranges that do not contain SEV-SNP guest memory (excluding
preassigned pages such as the RMP table and firmware pages). As SNP
guests are launched, RMPUPDATE will disable the corresponding RMPOPT
optimizations.

Suggested-by: Thomas Lendacky <thomas.lendacky@xxxxxxx>
Suggested-by: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
Reviewed-by: Ackerley Tng <ackerleytng@xxxxxxxxxx>
Signed-off-by: Ashish Kalra <ashish.kalra@xxxxxxx>
---
arch/x86/virt/svm/sev.c | 165 +++++++++++++++++++++++++++++++++++++++-
1 file changed, 162 insertions(+), 3 deletions(-)

diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c
index 60984f76b4e9..5f99cbbc6cbd 100644
--- a/arch/x86/virt/svm/sev.c
+++ b/arch/x86/virt/svm/sev.c
@@ -19,6 +19,7 @@
#include <linux/iommu.h>
#include <linux/amd-iommu.h>
#include <linux/nospec.h>
+#include <linux/workqueue.h>

#include <asm/sev.h>
#include <asm/processor.h>
@@ -125,9 +126,20 @@ static void *rmp_bookkeeping __ro_after_init;
static u64 probed_rmp_base, probed_rmp_size;

static cpumask_var_t rmpopt_cpumask;
-static phys_addr_t rmpopt_pa_start;
+static phys_addr_t rmpopt_pa_start, rmpopt_pa_end;
static bool rmpopt_capable;

+enum rmpopt_function {
+ RMPOPT_FUNC_VERIFY_AND_REPORT_STATUS,
+ RMPOPT_FUNC_REPORT_STATUS
+};
+
+#define RMPOPT_WORK_TIMEOUT 10000
+
+static struct workqueue_struct *rmpopt_wq;
+static struct delayed_work rmpopt_delayed_work;
+static DEFINE_MUTEX(rmpopt_wq_mutex);
+
static LIST_HEAD(snp_leaked_pages_list);
static DEFINE_SPINLOCK(snp_leaked_pages_list_lock);

@@ -571,13 +583,22 @@ static void rmpopt_cleanup(void)
{
int cpu;

+ guard(mutex)(&rmpopt_wq_mutex);
+
+ if (!rmpopt_wq)
+ return;
+
+ cancel_delayed_work_sync(&rmpopt_delayed_work);
+ destroy_workqueue(rmpopt_wq);
+
scoped_guard(cpus_read_lock) {
for_each_cpu(cpu, rmpopt_cpumask)
wrmsrq_on_cpu(cpu, MSR_AMD64_RMPOPT_BASE, 0);
}

free_cpumask_var(rmpopt_cpumask);
- rmpopt_pa_start = 0;
+ rmpopt_pa_start = rmpopt_pa_end = 0;
+ rmpopt_wq = NULL;
}

/*
@@ -627,6 +648,101 @@ void snp_clear_rmpopt_capable(void)
rmpopt_capable = false;
}

+/*
+ * RMPOPT: F2 0F 01 FC
+ * Input: RAX = system physical address (1GB aligned)
+ * RCX = operation type
+ * Output: CF set if the range was optimized
+ */
+static inline bool __rmpopt(u64 pa_start, u64 op_type)
+{
+ bool optimized;
+
+ asm volatile(".byte 0xf2, 0x0f, 0x01, 0xfc"
+ : "=@ccc" (optimized)
+ : "a" (pa_start), "c" (op_type)
+ : "memory", "cc");
+
+ return optimized;
+}
+
+static void rmpopt(u64 pa)
+{
+ u64 pa_start = ALIGN_DOWN(pa, SZ_1G);
+ u64 op_type = RMPOPT_FUNC_VERIFY_AND_REPORT_STATUS;
+
+ __rmpopt(pa_start, op_type);
+}
+
+/*
+ * 'val' is a system physical address.
+ */
+static void rmpopt_smp(void *val)
+{
+ rmpopt((u64)val);
+}
+
+/*
+ * RMPOPT optimizations skip RMP checks at 1GB granularity if this
+ * range of memory does not contain any SNP guest memory.
+ */
+static void rmpopt_work_handler(struct work_struct *work)
+{
+ cpumask_var_t follower_mask;
+ phys_addr_t pa;
+ int this_cpu;
+
+ pr_info("Attempt RMP optimizations on physical address range @1GB alignment [0x%016llx - 0x%016llx]\n",
+ rmpopt_pa_start, rmpopt_pa_end);
+
+ if (!alloc_cpumask_var(&follower_mask, GFP_KERNEL))
+ return;
+
+ /*
+ * RMPOPT scans the RMP table, stores the result of the scan in the
+ * reserved processor memory. The RMP scan is the most expensive
+ * part. If a second RMPOPT occurs, it can skip the expensive scan
+ * if they can see a cached result in the reserved processor memory.
+ *
+ * Do RMPOPT on one CPU alone. Then, follow that up with RMPOPT
+ * on every other primary thread. Followers are "designed to"
+ * skip the scan if they see the "cached" scan results.
+ *
+ * Pin the worker to the current CPU for the leader loop so that
+ * this_cpu remains valid and the RMPOPT instruction executes on
+ * the correct CPU. Use migrate_disable() rather than get_cpu() to
+ * prevent migration while still allowing preemption.
+ */
+ migrate_disable();
+ this_cpu = smp_processor_id();
+
+ cpumask_andnot(follower_mask, rmpopt_cpumask,
+ topology_sibling_cpumask(this_cpu));
+
+ for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G) {
+ rmpopt(pa);
+ cond_resched();
+ }
+ migrate_enable();
+
+ /*
+ * Followers: run RMPOPT on remaining cores. CPUs cannot go offline
+ * while SNP is active, so the follower set stays valid across the
+ * scan and cpus_read_lock() is uncontended.
+ */
+ scoped_guard(cpus_read_lock) {
+ for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G) {
+ on_each_cpu_mask(follower_mask, rmpopt_smp,
+ (void *)pa, true);
+
+ /* Give a chance for other threads to run */
+ cond_resched();
+ }
+ }
+
+ free_cpumask_var(follower_mask);
+}
+
void snp_setup_rmpopt(void)
{
u64 rmpopt_base;
@@ -635,14 +751,42 @@ void snp_setup_rmpopt(void)
if (!cpu_feature_enabled(X86_FEATURE_RMPOPT) || !rmpopt_capable)
return;

+ guard(mutex)(&rmpopt_wq_mutex);
+
+ /*
+ * Guard against re-initialization. When SNP_SHUTDOWN_EX is issued
+ * with x86_snp_shutdown=0, snp_shutdown() is not called and
+ * rmpopt_cleanup() is skipped, but snp_initialized is still cleared.
+ * A subsequent __sev_snp_init_locked() would call snp_setup_rmpopt()
+ * again, leaking the existing workqueue, delayed work, and cpumask
+ * state.
+ */
+ if (rmpopt_wq)
+ return;
+
+ /*
+ * Create an RMPOPT-specific workqueue to avoid scheduling
+ * RMPOPT workitem on the global system workqueue.
+ */
+ rmpopt_wq = alloc_workqueue("rmpopt_wq", WQ_UNBOUND, 1);
+ if (!rmpopt_wq) {
+ pr_err("Failed to allocate RMPOPT workqueue\n");
+ return;
+ }
+
+ INIT_DELAYED_WORK(&rmpopt_delayed_work, rmpopt_work_handler);
+
if (!zalloc_cpumask_var(&rmpopt_cpumask, GFP_KERNEL)) {
pr_err("Failed to allocate RMPOPT cpumask\n");
+ destroy_workqueue(rmpopt_wq);
+ rmpopt_wq = NULL;
return;
}

/*
* The RMPOPT_BASE MSR is per-core, so only one thread per core needs
- * to set up the RMPOPT_BASE MSR.
+ * to set up the RMPOPT_BASE MSR. Likewise, only one thread per core
+ * needs to issue the RMPOPT instruction.
*
* Note: only online primary threads are included. If a core's
* primary thread is offline, that core is not covered. CPU hotplug
@@ -665,6 +809,21 @@ void snp_setup_rmpopt(void)
for_each_cpu(cpu, rmpopt_cpumask)
wrmsrq_on_cpu(cpu, MSR_AMD64_RMPOPT_BASE, rmpopt_base);
}
+
+ rmpopt_pa_end = ALIGN(PFN_PHYS(max_pfn), SZ_1G);
+
+ /* Limit memory scanning to 2TB of RAM */
+ if ((rmpopt_pa_end - rmpopt_pa_start) > SZ_2T) {
+ pr_info("RMPOPT coverage limited to 2TB; memory above 0x%llx not optimized\n",
+ rmpopt_pa_start + SZ_2T);
+ rmpopt_pa_end = rmpopt_pa_start + SZ_2T;
+ }
+
+ /*
+ * Once all per-CPU RMPOPT tables have been configured, enable RMPOPT
+ * optimizations on all physical memory.
+ */
+ queue_delayed_work(rmpopt_wq, &rmpopt_delayed_work, 0);
}
EXPORT_SYMBOL_FOR_MODULES(snp_setup_rmpopt, "ccp");

--
2.43.0