Re: [PATCH 2/6] x86/sev: add support for enabling RMPOPT
From: Kalra, Ashish
Date: Wed Feb 18 2026 - 17:18:14 EST
Hello Dave,
On 2/17/2026 4:06 PM, Dave Hansen wrote:
>> +#define RMPOPT_TABLE_MAX_LIMIT_IN_TB 2
>> +#define NUM_TB(pfn_min, pfn_max) \
>> + (((pfn_max) - (pfn_min)) / (1 << (40 - PAGE_SHIFT)))
>
> IMNHO, you should just keep these in bytes. No reason to keep them in TB.
>
>> +struct rmpopt_socket_config {
>> + unsigned long start_pfn, end_pfn;
>> + cpumask_var_t cpulist;
>> + int *node_id;
>> + int current_node_idx;
>> +};
>
> This looks like optimization complexity before the groundwork is in
> place. Also, don't we *have* CPU lists for NUMA nodes? This seems rather
> redundant.
>
Yes, we do have CPU lists for NUMA nodes, but we need a socket-specific
cpumask; let me explain more about that below.
>> +/*
>> + * Build a cpumask of online primary threads, accounting for primary threads
>> + * that have been offlined while their secondary threads are still online.
>> + */
>> +static void get_cpumask_of_primary_threads(cpumask_var_t cpulist)
>> +{
>> + cpumask_t cpus;
>> + int cpu;
>> +
>> + cpumask_copy(&cpus, cpu_online_mask);
>> + for_each_cpu(cpu, &cpus) {
>> + cpumask_set_cpu(cpu, cpulist);
>> + cpumask_andnot(&cpus, &cpus, cpu_smt_mask(cpu));
>> + }
>> +}
>
> Don't we have a primary thread mask already? I thought we did.
>
Already discussed this.
>> +static void __configure_rmpopt(void *val)
>> +{
>> + u64 rmpopt_base = ((u64)val & PUD_MASK) | MSR_AMD64_RMPOPT_ENABLE;
>> +
>> + wrmsrq(MSR_AMD64_RMPOPT_BASE, rmpopt_base);
>> +}
>
> I'd honestly just make the callers align the address..
>
>> +static void configure_rmpopt_non_numa(cpumask_var_t primary_threads_cpulist)
>> +{
>> + on_each_cpu_mask(primary_threads_cpulist, __configure_rmpopt, (void *)0, true);
>> +}
>> +
>> +static void free_rmpopt_socket_config(struct rmpopt_socket_config *socket)
>> +{
>> + int i;
>> +
>> + if (!socket)
>> + return;
>> +
>> + for (i = 0; i < topology_max_packages(); i++) {
>> + free_cpumask_var(socket[i].cpulist);
>> + kfree(socket[i].node_id);
>> + }
>> +
>> + kfree(socket);
>> +}
>> +DEFINE_FREE(free_rmpopt_socket_config, struct rmpopt_socket_config *, free_rmpopt_socket_config(_T))
>
> Looking at all this, I really think you need a more organized series.
>
> Make something that's _functional_ and works for all <2TB configs. Then,
> go add all this NUMA complexity in a follow-on patch or patches. There's
> too much going on here.
Sure.
>
>> +static void configure_rmpopt_large_physmem(cpumask_var_t primary_threads_cpulist)
>> +{
>> + struct rmpopt_socket_config *socket __free(free_rmpopt_socket_config) = NULL;
>> + int max_packages = topology_max_packages();
>> + struct rmpopt_socket_config *sc;
>> + int cpu, i;
>> +
>> + socket = kcalloc(max_packages, sizeof(struct rmpopt_socket_config), GFP_KERNEL);
>> + if (!socket)
>> + return;
>> +
>> + for (i = 0; i < max_packages; i++) {
>> + sc = &socket[i];
>> + if (!zalloc_cpumask_var(&sc->cpulist, GFP_KERNEL))
>> + return;
>> + sc->node_id = kcalloc(nr_node_ids, sizeof(int), GFP_KERNEL);
>> + if (!sc->node_id)
>> + return;
>> + sc->current_node_idx = -1;
>> + }
>> +
>> + /*
>> + * Handle case of virtualized NUMA software domains, such as AMD Nodes Per Socket(NPS)
>> + * configurations. The kernel does not have an abstraction for physical sockets,
>> + * therefore, enumerate the physical sockets and Nodes Per Socket(NPS) information by
>> + * walking the online CPU list.
>> + */
>
> By this point, I've forgotten why sockets are important here.
>
> Why are they important?
Because the Nodes per Socket (NPS) configuration is enabled by default, we have to
look at sockets instead of simply NUMA nodes, collect/aggregate all the node data per socket,
and then set up the RMPOPT tables accordingly, so that the 2TB limit of the RMPOPT tables is covered
appropriately and we try to map the maximum possible memory in the RMPOPT tables per socket rather
than per node.
And as there is no per-socket information available in the kernel, we walk through the online
CPU list and collect all of this per-socket information (including each socket's start and end
addresses, the NUMA nodes in the socket, the cpumask of the socket, etc.).
>
>> + for_each_cpu(cpu, primary_threads_cpulist) {
>> + int socket_id, nid;
>> +
>> + socket_id = topology_logical_package_id(cpu);
>> + nid = cpu_to_node(cpu);
>> + sc = &socket[socket_id];
>> +
>> + /*
>> + * For each socket, determine the corresponding nodes and the socket's start
>> + * and end PFNs.
>> + * Record the node and the start and end PFNs of the first node found on the
>> + * socket, then record each subsequent node and update the end PFN for that
>> + * socket as additional nodes are found.
>> + */
>> + if (sc->current_node_idx == -1) {
>> + sc->current_node_idx = 0;
>> + sc->node_id[sc->current_node_idx] = nid;
>> + sc->start_pfn = node_start_pfn(nid);
>> + sc->end_pfn = node_end_pfn(nid);
>> + } else if (sc->node_id[sc->current_node_idx] != nid) {
>> + sc->current_node_idx++;
>> + sc->node_id[sc->current_node_idx] = nid;
>> + sc->end_pfn = node_end_pfn(nid);
>> + }
>> +
>> + cpumask_set_cpu(cpu, sc->cpulist);
>> + }
>> +
>> + /*
>> + * If the "physical" socket has up to 2TB of memory, the per-CPU RMPOPT tables are
>> + * configured to the starting physical address of the socket, otherwise the tables
>> + * are configured per-node.
>> + */
>> + for (i = 0; i < max_packages; i++) {
>> + int num_tb_socket;
>> + phys_addr_t pa;
>> + int j;
>> +
>> + sc = &socket[i];
>> + num_tb_socket = NUM_TB(sc->start_pfn, sc->end_pfn) + 1;
>> +
>> + pr_debug("socket start_pfn 0x%lx, end_pfn 0x%lx, socket cpu mask %*pbl\n",
>> + sc->start_pfn, sc->end_pfn, cpumask_pr_args(sc->cpulist));
>> +
>> + if (num_tb_socket <= RMPOPT_TABLE_MAX_LIMIT_IN_TB) {
>> + pa = PFN_PHYS(sc->start_pfn);
>> + on_each_cpu_mask(sc->cpulist, __configure_rmpopt, (void *)pa, true);
>> + continue;
>> + }
>> +
>> + for (j = 0; j <= sc->current_node_idx; j++) {
>> + int nid = sc->node_id[j];
>> + struct cpumask node_mask;
>> +
>> + cpumask_and(&node_mask, cpumask_of_node(nid), sc->cpulist);
>> + pa = PFN_PHYS(node_start_pfn(nid));
>> +
>> + pr_debug("RMPOPT_BASE MSR on nodeid %d cpu mask %*pbl set to 0x%llx\n",
>> + nid, cpumask_pr_args(&node_mask), pa);
>> + on_each_cpu_mask(&node_mask, __configure_rmpopt, (void *)pa, true);
>> + }
>> + }
>> +}
>
> Ahh, so you're not optimizing by NUMA itself: you're assuming that there
> are groups of NUMA nodes in a socket and then optimizing for those groups.
>
Yes, the Venice platform has the NPS2 configuration enabled by default,
so we have 'X' nodes per socket and we have to consider this NPSx configuration
and optimize for those groups.
> It would have been nice to say that. It would make great material for
> the changelog for your broken out patches.
Ok.
>
> I have the feeling that the structure here could be one of these in a patch:
>
> 1. Support systems with <2TB of memory
> 2. Support a RMPOPT range per NUMA node
> 3. Group NUMA nodes at socket boundaries and have them share a common
> RMPOPT config.
>
> Right?
Yes, sure.
>
>> +static __init void configure_and_enable_rmpopt(void)
>> +{
>> + cpumask_var_t primary_threads_cpulist;
>> + int num_tb;
>> +
>> + if (!cpu_feature_enabled(X86_FEATURE_RMPOPT)) {
>> + pr_debug("RMPOPT not supported on this platform\n");
>> + return;
>> + }
>> +
>> + if (!cc_platform_has(CC_ATTR_HOST_SEV_SNP)) {
>> + pr_debug("RMPOPT optimizations not enabled as SNP support is not enabled\n");
>> + return;
>> + }
>> +
>> + if (!(rmp_cfg & MSR_AMD64_SEG_RMP_ENABLED)) {
>> + pr_info("RMPOPT optimizations not enabled, segmented RMP required\n");
>> + return;
>> + }
>> +
>> + if (!zalloc_cpumask_var(&primary_threads_cpulist, GFP_KERNEL))
>> + return;
>> +
>> + num_tb = NUM_TB(min_low_pfn, max_pfn) + 1;
>> + pr_debug("NUM_TB pages in system %d\n", num_tb);
>
> This looks wrong. Earlier, you program 0 as the base RMPOPT address into
> the MSR. But this uses 'min_low_pfn'. Why not 0?
You are right, we should have used min_low_pfn earlier to program the
base RMPOPT address into the MSR.
>
>> + /* Only one thread per core needs to set RMPOPT_BASE MSR as it is per-core */
>> + get_cpumask_of_primary_threads(primary_threads_cpulist);
>> +
>> + /*
>> + * Per-CPU RMPOPT tables support at most 2 TB of addressable memory for RMP optimizations.
>> + *
>> + * Fastpath RMPOPT configuration and setup:
>> + * For systems with <= 2 TB of RAM, configure each per-core RMPOPT base to 0,
>> + * ensuring all system RAM is RMP-optimized on all CPUs.
>> + */
>> + if (num_tb <= RMPOPT_TABLE_MAX_LIMIT_IN_TB)
>> + configure_rmpopt_non_numa(primary_threads_cpulist);
>
> this part:
>
>> + else
>> + configure_rmpopt_large_physmem(primary_threads_cpulist);
>
> ^^ needs to be broken out into a separate optimization patch.
>
Ok.
Thanks,
Ashish