Re: [PATCH V5 6/7] KVM: x86/pmu: Emulate RDPMC on performance metrics

From: Chen, Zide

Date: Thu Jun 25 2026 - 12:31:39 EST




On 6/25/2026 4:17 AM, Mi, Dapeng wrote:
>
> On 6/25/2026 11:45 AM, Zide Chen wrote:
>> If the host has the PERF_METRICS capability but it's not present on
>> the guest, RDPMC interception must be enabled and KVM should inject
>> an #GP when the guest attempts to read it.
>>
>> If the guest has PERF_METRICS, but RDPMC interception is enabled for
>> other reasons, KVM needs to emulate RDPMC with type 2000H.
>>
>> For simplicity, Metrics Clear Mode is not supported.
>>
>> Signed-off-by: Zide Chen <zide.chen@xxxxxxxxx>
>> ---
>> v5: new patch.
>> ---
>> arch/x86/kvm/pmu.c | 30 ++++++++++++++++++++++++++----
>> 1 file changed, 26 insertions(+), 4 deletions(-)
>>
>> diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
>> index 7aafc5db1346..af5b14f44e4b 100644
>> --- a/arch/x86/kvm/pmu.c
>> +++ b/arch/x86/kvm/pmu.c
>> @@ -755,6 +755,16 @@ static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
>> return 0;
>> }
>>
>> +static int kvm_pmu_rdpmc_metrics(struct kvm_vcpu *vcpu,
>> + unsigned idx, u64 *data)
>> +{
>> + if (!kvm_vcpu_has_perf_metrics(vcpu))
>> + return 1;
>> +
>> + *data = vcpu_to_pmu(vcpu)->perf_metrics;
>> + return 0;
>> +}
>> +
>> int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
>> {
>> struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
>> @@ -767,15 +777,18 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
>> if (is_vmware_backdoor_pmc(idx))
>> return kvm_pmu_rdpmc_vmware(vcpu, idx, data);
>>
>> - pmc = kvm_pmu_call(rdpmc_ecx_to_pmc)(vcpu, idx, &mask);
>> - if (!pmc)
>> - return 1;
>> -
>> if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCE) &&
>> (kvm_x86_call(get_cpl)(vcpu) != 0) &&
>> kvm_is_cr0_bit_set(vcpu, X86_CR0_PE))
>> return 1;
>>
>> + if (idx & INTEL_PMC_FIXED_RDPMC_METRICS)
>
> I'm not quite sure if this check is good enough. Although the SDM vol.2
> says "Performance metrics use type 2000H. This type can be used only if
> IA32_PERF_CAPABILITIES.PERF_METRICS_AVAILABLE[bit 15]=1. For this type, the
> index in ECX[15:0] is implementation specific." It doesn't say the PMC
> index must be 0, but we always set the PMC index to 0 when reading topdown
> metrics.

Simply AND the idx is problematic for two reasons: 1) it doesn't catch
invalid type. for example 6000H; 2) fundamentally,
INTEL_PMC_FIXED_RDPMC_METRICS is Intel specific and it can't be compared
in common code.

The proposal is to change the callback rdpmc_ecx_to_pmc() to
emu_rdpmc(), and mode the RDPMC metrics code into vendor. Part of the
proposed code:

diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 7aafc5db1346..af6fc8c96037 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -758,8 +758,6 @@ static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu
*vcpu, unsigned idx, u64 *data)
int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
- struct kvm_pmc *pmc;
- u64 mask = ~0ull;

if (!pmu->version)
return 1;
@@ -767,17 +765,12 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned
idx, u64 *data)
if (is_vmware_backdoor_pmc(idx))
return kvm_pmu_rdpmc_vmware(vcpu, idx, data);

- pmc = kvm_pmu_call(rdpmc_ecx_to_pmc)(vcpu, idx, &mask);
- if (!pmc)
- return 1;
-
if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCE) &&
(kvm_x86_call(get_cpl)(vcpu) != 0) &&
kvm_is_cr0_bit_set(vcpu, X86_CR0_PE))
return 1;

- *data = pmc_read_counter(pmc) & mask;
- return 0;
+ return kvm_pmu_call(emu_rdpmc)(vcpu, idx, data);
}

static bool kvm_need_any_pmc_intercept(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 3066cade5790..24634aa92875 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -24,8 +24,7 @@
#define KVM_FIXED_PMC_BASE_IDX INTEL_PMC_IDX_FIXED

struct kvm_pmu_ops {
- struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu,
- unsigned int idx, u64 *mask);
+ int (*emu_rdpmc)(struct kvm_vcpu *vcpu, unsigned idx, u64 *data);
struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, u32 msr);
int (*check_rdpmc_early)(struct kvm_vcpu *vcpu, unsigned int idx);
bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr);
@@ -84,14 +85,12 @@ static void reprogram_fixed_counters(struct kvm_pmu
*pmu, u64 data)
}
}

-static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
- unsigned int idx, u64 *mask)
+static int intel_emu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
unsigned int type = idx & INTEL_RDPMC_TYPE_MASK;
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
- struct kvm_pmc *counters;
+ struct kvm_pmc *counters, *pmc;
unsigned int num_counters;
- u64 bitmask;

/*
* The encoding of ECX for RDPMC is different for architectural
versus
@@ -104,7 +103,7 @@ static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct
kvm_vcpu *vcpu,
* as KVM doesn't support such PMUs.
*/
if (WARN_ON_ONCE(!pmu->version))
- return NULL;
+ return 1;

/*
* General Purpose (GP) PMCs are supported on all PMUs, and
fixed PMCs
@@ -118,23 +117,28 @@ static struct kvm_pmc
*intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
case INTEL_RDPMC_FIXED:
counters = pmu->fixed_counters;
num_counters = pmu->nr_arch_fixed_counters;
- bitmask = pmu->counter_bitmask[KVM_PMC_FIXED];
break;
case INTEL_RDPMC_GP:
counters = pmu->gp_counters;
num_counters = pmu->nr_arch_gp_counters;
- bitmask = pmu->counter_bitmask[KVM_PMC_GP];
break;
+ case INTEL_RDPMC_METRICS:
+ if (!kvm_vcpu_has_perf_metrics(vcpu))
+ return 1;
+
+ *data = vcpu_to_pmu(vcpu)->perf_metrics;
+ return 0;
default:
- return NULL;
+ return 1;
}

idx &= INTEL_RDPMC_INDEX_MASK;
if (idx >= num_counters)
- return NULL;
+ return 1;

- *mask &= bitmask;
- return &counters[array_index_nospec(idx, num_counters)];
+ pmc = &counters[array_index_nospec(idx, num_counters)];
+ *data = pmc_read_counter(pmc);
+ return 0;
}

static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr)


> Could you please test to run the rdpmc instruction to read topdown metrics
> with valid and invalid PMC index on real HW and look what would happen? Thanks.

RDPMC with invalid type, for example 0x6000 got an #GP. non-zero index
(ECX[15:0]) with type 2000H got an #GP on SPR/NVL/GNR as well.

I haven't found any documentation talking about non-zero index on type
2000H. Since it's implementation specific, for now seems it's OK to
reject (#GP) non-zero index.


>> + return kvm_pmu_rdpmc_metrics(vcpu, idx, data);
>> +
>> + pmc = kvm_pmu_call(rdpmc_ecx_to_pmc)(vcpu, idx, &mask);
>> + if (!pmc)
>> + return 1;
>> +
>> *data = pmc_read_counter(pmc) & mask;
>> return 0;
>> }
>> @@ -803,6 +816,14 @@ bool kvm_need_perf_global_ctrl_intercept(struct kvm_vcpu *vcpu)
>> }
>> EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_need_perf_global_ctrl_intercept);
>>
>> +static bool kvm_need_perf_metrics_intercept(struct kvm_vcpu *vcpu)
>> +{
>> + if (!(kvm_host.perf_capabilities & PERF_CAP_PERF_METRICS))
>> + return false;
>> +
>> + return !kvm_vcpu_has_perf_metrics(vcpu);
>> +}
>> +
>> bool kvm_need_rdpmc_intercept(struct kvm_vcpu *vcpu)
>> {
>> struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
>> @@ -815,6 +836,7 @@ bool kvm_need_rdpmc_intercept(struct kvm_vcpu *vcpu)
>> return true;
>>
>> return kvm_need_any_pmc_intercept(vcpu) ||
>> + kvm_need_perf_metrics_intercept(vcpu) ||
>> pmu->counter_bitmask[KVM_PMC_GP] != (BIT_ULL(kvm_host_pmu.bit_width_gp) - 1) ||
>> pmu->counter_bitmask[KVM_PMC_FIXED] != (BIT_ULL(kvm_host_pmu.bit_width_fixed) - 1);
>> }