Re: [PATCH V5 6/7] KVM: x86/pmu: Emulate RDPMC on performance metrics

From: Chen, Zide

Date: Thu Jun 25 2026 - 21:41:23 EST




On 6/25/2026 7:28 PM, Mi, Dapeng wrote:
>
> On 6/26/2026 12:30 AM, Chen, Zide wrote:
>>
>> On 6/25/2026 4:17 AM, Mi, Dapeng wrote:
>>> On 6/25/2026 11:45 AM, Zide Chen wrote:
>>>> If the host has the PERF_METRICS capability but it's not present on
>>>> the guest, RDPMC interception must be enabled and KVM should inject
>>>> an #GP when the guest attempts to read it.
>>>>
>>>> If the guest has PERF_METRICS, but RDPMC interception is enabled for
>>>> other reasons, KVM needs to emulate RDPMC with type 2000H.
>>>>
>>>> For simplicity, Metrics Clear Mode is not supported.
>>>>
>>>> Signed-off-by: Zide Chen <zide.chen@xxxxxxxxx>
>>>> ---
>>>> v5: new patch.
>>>> ---
>>>> arch/x86/kvm/pmu.c | 30 ++++++++++++++++++++++++++----
>>>> 1 file changed, 26 insertions(+), 4 deletions(-)
>>>>
>>>> diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
>>>> index 7aafc5db1346..af5b14f44e4b 100644
>>>> --- a/arch/x86/kvm/pmu.c
>>>> +++ b/arch/x86/kvm/pmu.c
>>>> @@ -755,6 +755,16 @@ static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
>>>> return 0;
>>>> }
>>>>
>>>> +static int kvm_pmu_rdpmc_metrics(struct kvm_vcpu *vcpu,
>>>> + unsigned idx, u64 *data)
>>>> +{
>>>> + if (!kvm_vcpu_has_perf_metrics(vcpu))
>>>> + return 1;
>>>> +
>>>> + *data = vcpu_to_pmu(vcpu)->perf_metrics;
>>>> + return 0;
>>>> +}
>>>> +
>>>> int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
>>>> {
>>>> struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
>>>> @@ -767,15 +777,18 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
>>>> if (is_vmware_backdoor_pmc(idx))
>>>> return kvm_pmu_rdpmc_vmware(vcpu, idx, data);
>>>>
>>>> - pmc = kvm_pmu_call(rdpmc_ecx_to_pmc)(vcpu, idx, &mask);
>>>> - if (!pmc)
>>>> - return 1;
>>>> -
>>>> if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCE) &&
>>>> (kvm_x86_call(get_cpl)(vcpu) != 0) &&
>>>> kvm_is_cr0_bit_set(vcpu, X86_CR0_PE))
>>>> return 1;
>>>>
>>>> + if (idx & INTEL_PMC_FIXED_RDPMC_METRICS)
>>> I'm not quite sure if this check is good enough. Although the SDM vol.2
>>> says "Performance metrics use type 2000H. This type can be used only if
>>> IA32_PERF_CAPABILITIES.PERF_METRICS_AVAILABLE[bit 15]=1. For this type, the
>>> index in ECX[15:0] is implementation specific." It doesn't say the PMC
>>> index must be 0, but we always set the PMC index to 0 when reading topdown
>>> metrics.
>> Simply AND the idx is problematic for two reasons: 1) it doesn't catch
>> invalid type. for example 6000H; 2) fundamentally,
>> INTEL_PMC_FIXED_RDPMC_METRICS is Intel specific and it can't be compared
>> in common code.
>>
>> The proposal is to change the callback rdpmc_ecx_to_pmc() to
>> emu_rdpmc(), and mode the RDPMC metrics code into vendor. Part of the
>> proposed code:
>>
>> diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
>> index 7aafc5db1346..af6fc8c96037 100644
>> --- a/arch/x86/kvm/pmu.c
>> +++ b/arch/x86/kvm/pmu.c
>> @@ -758,8 +758,6 @@ static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu
>> *vcpu, unsigned idx, u64 *data)
>> int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
>> {
>> struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
>> - struct kvm_pmc *pmc;
>> - u64 mask = ~0ull;
>>
>> if (!pmu->version)
>> return 1;
>> @@ -767,17 +765,12 @@ int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned
>> idx, u64 *data)
>> if (is_vmware_backdoor_pmc(idx))
>> return kvm_pmu_rdpmc_vmware(vcpu, idx, data);
>>
>> - pmc = kvm_pmu_call(rdpmc_ecx_to_pmc)(vcpu, idx, &mask);
>> - if (!pmc)
>> - return 1;
>> -
>> if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCE) &&
>> (kvm_x86_call(get_cpl)(vcpu) != 0) &&
>> kvm_is_cr0_bit_set(vcpu, X86_CR0_PE))
>> return 1;
>>
>> - *data = pmc_read_counter(pmc) & mask;
>> - return 0;
>> + return kvm_pmu_call(emu_rdpmc)(vcpu, idx, data);
>> }
>>
>> static bool kvm_need_any_pmc_intercept(struct kvm_vcpu *vcpu)
>> diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
>> index 3066cade5790..24634aa92875 100644
>> --- a/arch/x86/kvm/pmu.h
>> +++ b/arch/x86/kvm/pmu.h
>> @@ -24,8 +24,7 @@
>> #define KVM_FIXED_PMC_BASE_IDX INTEL_PMC_IDX_FIXED
>>
>> struct kvm_pmu_ops {
>> - struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu,
>> - unsigned int idx, u64 *mask);
>> + int (*emu_rdpmc)(struct kvm_vcpu *vcpu, unsigned idx, u64 *data);
>> struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, u32 msr);
>> int (*check_rdpmc_early)(struct kvm_vcpu *vcpu, unsigned int idx);
>> bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr);
>> @@ -84,14 +85,12 @@ static void reprogram_fixed_counters(struct kvm_pmu
>> *pmu, u64 data)
>> }
>> }
>>
>> -static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
>> - unsigned int idx, u64 *mask)
>> +static int intel_emu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
>
> What does "emu" mean? Is it a typo?

Short for emulate. Or we can use emulate_rdpmc().


>
>> {
>> unsigned int type = idx & INTEL_RDPMC_TYPE_MASK;
>> struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
>> - struct kvm_pmc *counters;
>> + struct kvm_pmc *counters, *pmc;
>> unsigned int num_counters;
>> - u64 bitmask;
>>
>> /*
>> * The encoding of ECX for RDPMC is different for architectural
>> versus
>> @@ -104,7 +103,7 @@ static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct
>> kvm_vcpu *vcpu,
>> * as KVM doesn't support such PMUs.
>> */
>> if (WARN_ON_ONCE(!pmu->version))
>> - return NULL;
>> + return 1;
>>
>> /*
>> * General Purpose (GP) PMCs are supported on all PMUs, and
>> fixed PMCs
>> @@ -118,23 +117,28 @@ static struct kvm_pmc
>> *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
>> case INTEL_RDPMC_FIXED:
>> counters = pmu->fixed_counters;
>> num_counters = pmu->nr_arch_fixed_counters;
>> - bitmask = pmu->counter_bitmask[KVM_PMC_FIXED];
>> break;
>> case INTEL_RDPMC_GP:
>> counters = pmu->gp_counters;
>> num_counters = pmu->nr_arch_gp_counters;
>> - bitmask = pmu->counter_bitmask[KVM_PMC_GP];
>> break;
>> + case INTEL_RDPMC_METRICS:
>> + if (!kvm_vcpu_has_perf_metrics(vcpu))
>> + return 1;
>> +
>> + *data = vcpu_to_pmu(vcpu)->perf_metrics;
>> + return 0;
>> default:
>> - return NULL;
>> + return 1;
>> }
>>
>> idx &= INTEL_RDPMC_INDEX_MASK;
>> if (idx >= num_counters)
>> - return NULL;
>> + return 1;
>>
>> - *mask &= bitmask;
>> - return &counters[array_index_nospec(idx, num_counters)];
>> + pmc = &counters[array_index_nospec(idx, num_counters)];
>> + *data = pmc_read_counter(pmc);
>> + return 0;
>> }
>>
>> static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr)
>>
>>
>>> Could you please test to run the rdpmc instruction to read topdown metrics
>>> with valid and invalid PMC index on real HW and look what would happen? Thanks.
>> RDPMC with invalid type, for example 0x6000 got an #GP. non-zero index
>> (ECX[15:0]) with type 2000H got an #GP on SPR/NVL/GNR as well.
>>
>> I haven't found any documentation talking about non-zero index on type
>> 2000H. Since it's implementation specific, for now seems it's OK to
>> reject (#GP) non-zero index.
>
> Yeah, this is consistent with what we ever know. Let us follow the HW's
> behavior here. Thanks.
>
>
>>
>>
>>>> + return kvm_pmu_rdpmc_metrics(vcpu, idx, data);
>>>> +
>>>> + pmc = kvm_pmu_call(rdpmc_ecx_to_pmc)(vcpu, idx, &mask);
>>>> + if (!pmc)
>>>> + return 1;
>>>> +
>>>> *data = pmc_read_counter(pmc) & mask;
>>>> return 0;
>>>> }
>>>> @@ -803,6 +816,14 @@ bool kvm_need_perf_global_ctrl_intercept(struct kvm_vcpu *vcpu)
>>>> }
>>>> EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_need_perf_global_ctrl_intercept);
>>>>
>>>> +static bool kvm_need_perf_metrics_intercept(struct kvm_vcpu *vcpu)
>>>> +{
>>>> + if (!(kvm_host.perf_capabilities & PERF_CAP_PERF_METRICS))
>>>> + return false;
>>>> +
>>>> + return !kvm_vcpu_has_perf_metrics(vcpu);
>>>> +}
>>>> +
>>>> bool kvm_need_rdpmc_intercept(struct kvm_vcpu *vcpu)
>>>> {
>>>> struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
>>>> @@ -815,6 +836,7 @@ bool kvm_need_rdpmc_intercept(struct kvm_vcpu *vcpu)
>>>> return true;
>>>>
>>>> return kvm_need_any_pmc_intercept(vcpu) ||
>>>> + kvm_need_perf_metrics_intercept(vcpu) ||
>>>> pmu->counter_bitmask[KVM_PMC_GP] != (BIT_ULL(kvm_host_pmu.bit_width_gp) - 1) ||
>>>> pmu->counter_bitmask[KVM_PMC_FIXED] != (BIT_ULL(kvm_host_pmu.bit_width_fixed) - 1);
>>>> }