Re: [PATCH v4 11/12] KVM: x86/svm/pmu: Add AMD PerfMonV2 support

From: Sean Christopherson
Date: Thu Apr 06 2023 - 21:35:14 EST


On Tue, Feb 14, 2023, Like Xu wrote:
> + case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
> + if (!msr_info->host_initiated)
> + return 0; /* Writes are ignored */

Where is the "writes ignored" behavior documented? I can't find anything in the
APM that defines write behavior.

>
> pmu->global_status = data;
> return 0;
> case MSR_CORE_PERF_GLOBAL_CTRL:
> if (!kvm_valid_perf_global_ctrl(pmu, data))
> return 1;
> -
> + fallthrough;

This _definitely_ needs a comment. Hmm, and I would prefer to reverse these, i.e.

case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
data &= ~pmu->global_ctrl_mask;
fallthrough;
case MSR_CORE_PERF_GLOBAL_CTRL:
if (!kvm_valid_perf_global_ctrl(pmu, data))
return 1;

It's a bit arbitrary, but either Intel or AMD is going to end up with extra code,
and IMO skipping a validity check is more alarming than skipping clearing of
reserved bits, i.e. will look like a bug to future readers.

> + case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
> + data &= ~pmu->global_ctrl_mask;
> if (pmu->global_ctrl != data) {
> diff = pmu->global_ctrl ^ data;
> pmu->global_ctrl = data;
> @@ -616,7 +625,8 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
> if (data & pmu->global_ovf_ctrl_mask)
> return 1;
> -
> + fallthrough;

Here too. Argh, the APM doesn't actually define what happens on reserved bits,
it just says "WO". I vote to be conservative and ignore writes to reserved bits.
And then we can have one comment for the whole block, e.g.

/*
* Note, AMD ignores writes to read-only PMU MSRs/bits, whereas Intel
* generates #GP on attempts to write reserved bits or RO MSRs.
*/
switch (msr) {
case MSR_CORE_PERF_GLOBAL_STATUS:
if (!msr_info->host_initiated)
return 1; /* RO MSR */
fallthrough;
case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
if (!msr_info->host_initiated)
break;

pmu->global_status = data;
break;
case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
data &= ~pmu->global_ctrl_mask;
fallthrough;
case MSR_CORE_PERF_GLOBAL_CTRL:
if (!kvm_valid_perf_global_ctrl(pmu, data))
return 1;

if (pmu->global_ctrl != data) {
diff = pmu->global_ctrl ^ data;
pmu->global_ctrl = data;
reprogram_counters(pmu, diff);
}
break;
case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
fallthrough;
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
if (data & pmu->global_ovf_ctrl_mask)
return 1;

if (!msr_info->host_initiated)
pmu->global_status &= ~data;
break;
default:
kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info);
}

return 0;

> @@ -164,20 +181,34 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
> static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
> {
> struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
> + struct kvm_cpuid_entry2 *entry;
> + union cpuid_0x80000022_ebx ebx;
>
> - if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE))
> + pmu->version = 1;
> + if (guest_cpuid_has(vcpu, X86_FEATURE_PERFMON_V2)) {
> + pmu->version = 2;
> + entry = kvm_find_cpuid_entry_index(vcpu, 0x80000022, 0);

No need for the intermediate "entry" variable.

> + ebx.full = entry->ebx;

Oof, at first glance this looks like a potential null-pointer deref bug. I
believe we can do

/*
* Note, PERFMON_V2 is also in 0x80000022.0x0, i.e. the guest
* CPUID entry is guaranteed to be non-NULL.
*/
BUILD_BUG_ON(x86_feature_cpuid(X86_FEATURE_PERFMON_V2).function != 0x80000022 ||
x86_feature_cpuid(X86_FEATURE_PERFMON_V2).index);
ebx.full = kvm_find_cpuid_entry_index(vcpu, 0x80000022, 0)->ebx;

> + pmu->nr_arch_gp_counters = min_t(unsigned int,
> + ebx.split.num_core_pmc,
> + kvm_pmu_cap.num_counters_gp);
> + } else if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) {
> pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE;

This needs to be sanitized, no? E.g. if KVM only has access to 4 counters, but
userspace sets X86_FEATURE_PERFCTR_CORE anyways. Hrm, unless I'm missing something,
that's a pre-existing bug.

If I'm right, can you add a patch to cap nr_arch_gp_counters at
kvm_pmu_cap.num_counters_gp in the common flow, i.e. after this if-else block?
Then there is no change needed in this patch, e.g. we'll naturally end up with:

union cpuid_0x80000022_ebx ebx;

pmu->version = 1;
if (guest_cpuid_has(vcpu, X86_FEATURE_PERFMON_V2)) {
pmu->version = 2;
/*
* Note, PERFMON_V2 is also in 0x80000022.0x0, i.e. the guest
* CPUID entry is guaranteed to be non-NULL.
*/
BUILD_BUG_ON(x86_feature_cpuid(X86_FEATURE_PERFMON_V2).function != 0x80000022 ||
x86_feature_cpuid(X86_FEATURE_PERFMON_V2).index);
ebx.full = kvm_find_cpuid_entry_index(vcpu, 0x80000022, 0)->ebx;
pmu->nr_arch_gp_counters = ebx.split.num_core_pmc;
} else if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) {
pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE;
} else {
pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS;
}

pmu->nr_arch_gp_counters = min_t(unsigned int,
pmu->nr_arch_gp_counters,
kvm_pmu_cap.num_counters_gp);