Re: [PATCH v5 4/7] x86/sev: Add support to perform RMP optimizations asynchronously

From: Kalra, Ashish

Date: Thu May 28 2026 - 19:53:15 EST

Hello Ackerley,

On 5/28/2026 9:45 AM, Ackerley Tng wrote:
> Ashish Kalra <Ashish.Kalra@xxxxxxx> writes:
>
> Thank you Ashish!
>
>> From: Ashish Kalra <ashish.kalra@xxxxxxx>
>>
>> When SEV-SNP is enabled, all writes to memory are checked to ensure
>> integrity of SNP guest memory. This imposes performance overhead on the
>> whole system.
>>
>> RMPOPT is a new instruction that minimizes the performance overhead of
>> RMP checks on the hypervisor and on non-SNP guests by allowing RMP
>> checks to be skipped for 1GB regions of memory that are known not to
>> contain any SEV-SNP guest memory.
>>
>> Add support for performing RMP optimizations asynchronously using a
>> dedicated workqueue.
>>
>> Enable RMPOPT optimizations globally for all system RAM up to 2TB at
>
> This should also be updated to say "Enable RMPOPT optimizations for up
> to 2TB worth of system RAM at..."
>
> The current phrasing sounds like only addresses [0, 2TB) are allowed to
> be optimized, but actually any address [start, start + 2TB) can be
> optimized?

Yes, i will update it.

>
>> RMP initialization time. RMP checks can initially be skipped for 1GB
>> memory ranges that do not contain SEV-SNP guest memory (excluding
>> preassigned pages such as the RMP table and firmware pages). As SNP
>> guests are launched, RMPUPDATE will disable the corresponding RMPOPT
>> optimizations.
>>
>> Suggested-by: Thomas Lendacky <thomas.lendacky@xxxxxxx>
>> Suggested-by: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
>> Reviewed-by: Ackerley Tng <ackerleytng@xxxxxxxxxx>
>> Signed-off-by: Ashish Kalra <ashish.kalra@xxxxxxx>
>> ---
>> arch/x86/virt/svm/sev.c | 167 +++++++++++++++++++++++++++++++++++++++-
>> 1 file changed, 164 insertions(+), 3 deletions(-)
>>
>> diff --git a/arch/x86/virt/svm/sev.c b/arch/x86/virt/svm/sev.c
>> index 82f9dc7a57c3..8876cac052d5 100644
>> --- a/arch/x86/virt/svm/sev.c
>> +++ b/arch/x86/virt/svm/sev.c
>> @@ -19,6 +19,7 @@
>> #include <linux/iommu.h>
>> #include <linux/amd-iommu.h>
>> #include <linux/nospec.h>
>> +#include <linux/workqueue.h>
>>
>> #include <asm/sev.h>
>> #include <asm/processor.h>
>> @@ -125,7 +126,18 @@ static void *rmp_bookkeeping __ro_after_init;
>> static u64 probed_rmp_base, probed_rmp_size;
>>
>> static cpumask_t rmpopt_cpumask;
>> -static phys_addr_t rmpopt_pa_start;
>> +static phys_addr_t rmpopt_pa_start, rmpopt_pa_end;
>> +
>> +enum rmpopt_function {
>> + RMPOPT_FUNC_VERIFY_AND_REPORT_STATUS,
>> + RMPOPT_FUNC_REPORT_STATUS
>> +};
>> +
>> +#define RMPOPT_WORK_TIMEOUT 10000
>> +
>> +static struct workqueue_struct *rmpopt_wq;
>> +static struct delayed_work rmpopt_delayed_work;
>> +static DEFINE_MUTEX(rmpopt_wq_mutex);
>>
>> static LIST_HEAD(snp_leaked_pages_list);
>> static DEFINE_SPINLOCK(snp_leaked_pages_list_lock);
>> @@ -564,12 +576,21 @@ EXPORT_SYMBOL_FOR_MODULES(snp_prepare, "ccp");
>>
>> static void rmpopt_cleanup(void)
>> {
>> + guard(mutex)(&rmpopt_wq_mutex);
>> +
>> + if (!rmpopt_wq)
>> + return;
>> +
>> + cancel_delayed_work_sync(&rmpopt_delayed_work);
>> + destroy_workqueue(rmpopt_wq);
>> +
>> cpus_read_lock();
>> wrmsrq_on_cpus(&rmpopt_cpumask, MSR_AMD64_RMPOPT_BASE, 0);
>> cpus_read_unlock();
>>
>> cpumask_clear(&rmpopt_cpumask);
>> - rmpopt_pa_start = 0;
>> + rmpopt_pa_start = rmpopt_pa_end = 0;
>> + rmpopt_wq = NULL;
>> }
>>
>> void snp_shutdown(void)
>> @@ -587,6 +608,105 @@ void snp_shutdown(void)
>> }
>> EXPORT_SYMBOL_FOR_MODULES(snp_shutdown, "ccp");
>>
>> +static inline bool __rmpopt(u64 rax, u64 rcx)
>
> Perhaps use pa_start instead of rax and op_type for rcx?
>

I used these parameters to align with the RMPOPT specifications (rax and rcx)
which i think makes more sense.

>> +{
>> + bool optimized;
>> +
>> + asm volatile(".byte 0xf2, 0x0f, 0x01, 0xfc"
>> + : "=@ccc" (optimized)
>> + : "a" (rax), "c" (rcx)
>> + : "memory", "cc");
>> +
>> + return optimized;
>> +}
>> +
>> +static void rmpopt(u64 pa)
>> +{
>> + u64 rax = ALIGN_DOWN(pa, SZ_1G);
>> + u64 rcx = RMPOPT_FUNC_VERIFY_AND_REPORT_STATUS;
>> +
>
> And pa_start and op_type here too.
>
>> + __rmpopt(rax, rcx);
>> +}
>> +
>> +/*
>> + * 'val' is a system physical address.
>> + */
>> +static void rmpopt_smp(void *val)
>> +{
>> + rmpopt((u64)val);
>> +}
>> +
>> +/*
>> + * RMPOPT optimizations skip RMP checks at 1GB granularity if this
>> + * range of memory does not contain any SNP guest memory.
>> + */
>> +static void rmpopt_work_handler(struct work_struct *work)
>> +{
>> + bool current_cpu_cleared = false;
>> + phys_addr_t pa;
>> + int this_cpu;
>> +
>> + pr_info("Attempt RMP optimizations on physical address range @1GB alignment [0x%016llx - 0x%016llx]\n",
>> + rmpopt_pa_start, rmpopt_pa_end);
>> +
>> + /*
>> + * RMPOPT scans the RMP table, stores the result of the scan in the
>> + * reserved processor memory. The RMP scan is the most expensive
>> + * part. If a second RMPOPT occurs, it can skip the expensive scan
>> + * if they can see a cached result in the reserved processor memory.
>> + *
>> + * Do RMPOPT on one CPU alone. Then, follow that up with RMPOPT
>> + * on every other primary thread. This potentially allows the
>
> I like the leader and follower comments below, thanks! With this
> leader/follower setup, will the followers definitely see the cached scan
> results, or might the followers still potentially not benefit from the
> caching? If it's still only "potentially", why?

I am verifying with the H/W architects if this is always going to be true or not,
will the followers always benefit from the scan results cached by the leader (first CPU)
or there is a possibility that the followers cannot see/access/get the cached results
and instead do full RMP scanning ?

>
>> + * followers to use the "cached" scan results to avoid repeating
>> + * full scans.
>> + */
>> +
>> + /*
>> + * Pin the worker to the current CPU for the leader loop so that
>> + * this_cpu remains valid and the RMPOPT instruction executes on
>> + * the CPU that was cleared from the cpumask. The workqueue is
>> + * WQ_UNBOUND, so without pinning, the scheduler could migrate
>> + * the worker between the cpumask manipulation and the leader
>> + * loop, causing the leader to run on a different CPU while
>> + * this_cpu's core is skipped entirely.
>> + *
>> + * Use migrate_disable() rather than get_cpu() to prevent
>> + * migration while still allowing preemption.
>> + *
>> + * Note: rmpopt_cpumask is modified here without holding
>> + * rmpopt_wq_mutex. This is safe because the delayed_work
>> + * mechanism guarantees single-threaded execution of this
>> + * handler, and rmpopt_cleanup() calls cancel_delayed_work_sync()
>> + * to ensure handler completion before tearing down the cpumask.
>> + */
>> + migrate_disable();
>> + this_cpu = smp_processor_id();
>> + if (cpumask_test_cpu(this_cpu, &rmpopt_cpumask)) {
>> + cpumask_clear_cpu(this_cpu, &rmpopt_cpumask);
>> + current_cpu_cleared = true;
>> + }
>> +
>
> Instead of reusing the global rmpopt_cpumask, why not make a copy of
> rmpopt_cpumask for this function? Then this function won't have to
> figure out current_cpu_cleared or restore rmpopt_cpumask at the end.
>
> I'm thinking to also drop the test and clear, this function can just
> always clear, like
>
> cpumask_clear_cpu(smp_processor_id(), followers_cpumask);
>
> and later
>
> on_each_cpu_mask(&followers_cpumask, ...);

That's surely a much cleaner approach. Instead of modifying global
rmpopt_cpumask and using a local copy:

cpumask_var_t follower_mask;
alloc_cpumask_var(&follower_mask, GFP_KERNEL);
cpumask_copy(follower_mask, &rmpopt_cpumask);

migrate_disable();
this_cpu = smp_processor_id();
cpumask_clear_cpu(this_cpu, follower_mask); // modify local only

// leader loop on this_cpu...
migrate_enable();

// follower loop with follower_mask...
on_each_cpu_mask(follower_mask, rmpopt_smp, ...);

free_cpumask_var(follower_mask);

This eliminates:
- current_cpu_cleared variable
- The restore at the end

Additionally, the global rmpopt_cpumask is never modified, so no concurrency concerns with debugfs or other readers.

>
> Actually, if for whatever reason cpumask_test_cpu(this_cpu,
> &rmpopt_cpumask) above returns false, would that mean somehow some cpu
> exists that wasn't enabled right when rmpopt was initialized?

The work handler can always execute on a cpu which is not in the rmpopt_cpumask, so i believe the
cpumask_test_cpu() needs to be there.

The leader loop must only run on a CPU that has RMPOPT_BASE MSR programmed. If the WQ_UNBOUND scheduler puts the
handler on a CPU not in rmpopt_cpumask, that CPU's core never had RMPOPT enabled -> RMPOPT instruction causes #UD.

So the leader should be conditional:

cpumask_copy(follower_mask, &rmpopt_cpumask);

migrate_disable();
this_cpu = smp_processor_id();

if (cpumask_test_cpu(this_cpu, follower_mask)) {
cpumask_clear_cpu(this_cpu, follower_mask);

/* Leader: prime the RMPOPT cache on this CPU */
for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G)
rmpopt(pa);
}

migrate_enable();

/* Followers: run RMPOPT on remaining cores */
cpus_read_lock();
for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G) {
on_each_cpu_mask(follower_mask, rmpopt_smp, (void *)pa, true);
cond_resched();
}
cpus_read_unlock();

If the current CPU isn't in rmpopt_cpumask, the leader is skipped and all cores run as followers — they lose the caching
optimization from a leader priming pass, but correctness is maintained.

Alternatively, i could pick the first CPU from rmpopt_cpumask as the explicit leader instead of relying on whichever CPU the
scheduler chose.

cpumask_copy(follower_mask, &rmpopt_cpumask);

migrate_disable();
this_cpu = smp_processor_id();

if (cpumask_test_cpu(this_cpu, follower_mask)) {
/* Fast path: leader runs locally, no IPIs */
cpumask_clear_cpu(this_cpu, follower_mask);

for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G)
rmpopt(pa);
} else {
/*
* Current CPU does not have RMPOPT_BASE MSR programmed.
* Pick an explicit leader from the cpumask to avoid #UD.
*/
int leader_cpu = cpumask_first(follower_mask);

cpumask_clear_cpu(leader_cpu, follower_mask);

for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G)
smp_call_function_single(leader_cpu, rmpopt_smp,
(void *)pa, true);
}

migrate_enable();

/* Followers: run RMPOPT on remaining cores */
cpus_read_lock();
for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G) {
on_each_cpu_mask(follower_mask, rmpopt_smp,
(void *)pa, true);

/* Give a chance for other threads to run */
cond_resched();
}
cpus_read_unlock();

free_cpumask_var(follower_mask);
}

> If yes, what happens if we call rmpopt() on a cpu where it wasn't initialized?

That will cause a #UD exception, if RMPOPT instruction is issued on
a CPU where RMPOPT is not enabled (RMPOPT_BASE.RMPOPT_EN==0), so
it is essential to issue RMPOPT instruction only on the cpumask (covers both
primary and secondary threads) which was setup initially when rmpopt was
initialized and on which the RMPOPT_BASE MSR was setup and RMPOPT enabled.

I believe, there are actually three cases to be considered here:

1. Current CPU is in rmpopt_cpumask -> primary thread, run leader locally, remove from followers
2. Current CPU's sibling is in rmpopt_cpumask -> sibling thread, RMPOPT_BASE per-core is programmed, run leader locally,
remove the sibling's primary from the follower mask
3. Neither -> new/unknown CPU, RMPOPT_BASE never programmed on this core, fall back to explicit leader via IPI.

So this seems to the *correct* implementation of the RMPOPT loop:

cpumask_copy(follower_mask, &rmpopt_cpumask);

migrate_disable();
this_cpu = smp_processor_id();

if (cpumask_test_cpu(this_cpu, follower_mask)) {
/*
* Current CPU is a primary thread in rmpopt_cpumask.
* Run leader locally and remove from follower mask.
*/
cpumask_clear_cpu(this_cpu, follower_mask);

for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G)
rmpopt(pa);
} else if (cpumask_intersects(topology_sibling_cpumask(this_cpu),
follower_mask)) {
/*
* Current CPU is a sibling thread whose primary is in
* rmpopt_cpumask. RMPOPT_BASE MSR is per-core, so it
* is safe to run the leader locally. Remove the sibling's
* primary from the follower mask as this core is already
* covered by the leader.
*/
cpumask_andnot(follower_mask, follower_mask,
topology_sibling_cpumask(this_cpu));

for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G)
rmpopt(pa);
} else {
/*
* Current CPU's core does not have RMPOPT_BASE MSR
* programmed. Pick an explicit leader from the cpumask
* to avoid #UD.
*/
int leader_cpu = cpumask_first(follower_mask);

cpumask_clear_cpu(leader_cpu, follower_mask);

for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G)
smp_call_function_single(leader_cpu, rmpopt_smp,
(void *)pa, true);
}

migrate_enable();

/* Followers: run RMPOPT on remaining cores */
cpus_read_lock();
for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G) {
on_each_cpu_mask(follower_mask, rmpopt_smp,
(void *)pa, true);

/* Give a chance for other threads to run */
cond_resched();
}
cpus_read_unlock();

free_cpumask_var(follower_mask);
}

Thanks,
Ashish

>
>> + /* Leader: prime the RMPOPT cache on this CPU */
>> + for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G)
>> + rmpopt(pa);
>> +
>> + migrate_enable();
>> +
>> + /* Followers: run RMPOPT on all other cores */
>> + cpus_read_lock();
>> + for (pa = rmpopt_pa_start; pa < rmpopt_pa_end; pa += SZ_1G) {
>> + on_each_cpu_mask(&rmpopt_cpumask, rmpopt_smp,
>> + (void *)pa, true);
>> +
>> + /* Give a chance for other threads to run */
>> + cond_resched();
>> + }
>> + cpus_read_unlock();
>> +
>> + if (current_cpu_cleared)
>> + cpumask_set_cpu(this_cpu, &rmpopt_cpumask);
>> +}
>> +
>>
>> [...snip...]
>>