Re: [PATCH v4 11/12] KVM: x86/svm/pmu: Add AMD PerfMonV2 support

From: Like Xu
Date: Fri Apr 07 2023 - 03:08:45 EST


On 7/4/2023 9:35 am, Sean Christopherson wrote:
On Tue, Feb 14, 2023, Like Xu wrote:
+	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
+		if (!msr_info->host_initiated)
+			return 0; /* Writes are ignored */

Where is the "writes ignored" behavior documented? I can't find anything in the
APM that defines write behavior.

KVM follows the real hardware behavior when the specification stays silent on such details or keeps them secret.
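
To spell out the intent here, below is an annotated copy of the snippet above (illustration only, the comments are mine and not part of the patch):

	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
		/*
		 * The MSR is read-only from the guest's point of view, and the
		 * observed hardware behavior is to drop such writes rather
		 * than fault, so emulate that instead of injecting #GP.
		 */
		if (!msr_info->host_initiated)
			return 0; /* Writes are ignored */

		/*
		 * Host-initiated writes, e.g. userspace restoring PMU state
		 * after live migration, do update the vCPU's global status.
		 */
		pmu->global_status = data;
		return 0;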


		pmu->global_status = data;
		return 0;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (!kvm_valid_perf_global_ctrl(pmu, data))
			return 1;
-
+		fallthrough;

This _definitely_ needs a comment. Hmm, and I would prefer to reverse these, i.e.

	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
		data &= ~pmu->global_ctrl_mask;
		fallthrough;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (!kvm_valid_perf_global_ctrl(pmu, data))
			return 1;

It's a bit arbitrary, but either Intel or AMD is going to end up with extra code,
and IMO skipping a validity check is more alarming than skipping clearing of
reserved bits, i.e. will look like a bug to future readers.

+	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
+		data &= ~pmu->global_ctrl_mask;
		if (pmu->global_ctrl != data) {
			diff = pmu->global_ctrl ^ data;
			pmu->global_ctrl = data;
@@ -616,7 +625,8 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
		if (data & pmu->global_ovf_ctrl_mask)
			return 1;
-
+		fallthrough;

Here too. Argh, the APM doesn't actually define what happens on reserved bits,
it just says "WO". I vote to be conservative and ignore writes to reserved bits.
And then we can have one comment for the whole block, e.g.

	/*
	 * Note, AMD ignores writes to read-only PMU MSRs/bits, whereas Intel
	 * generates #GP on attempts to write reserved bits or RO MSRs.
	 */
	switch (msr) {
	case MSR_CORE_PERF_GLOBAL_STATUS:
		if (!msr_info->host_initiated)
			return 1; /* RO MSR */
		fallthrough;
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
		if (!msr_info->host_initiated)
			break;

		pmu->global_status = data;
		break;
	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
		data &= ~pmu->global_ctrl_mask;
		fallthrough;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (!kvm_valid_perf_global_ctrl(pmu, data))
			return 1;

		if (pmu->global_ctrl != data) {
			diff = pmu->global_ctrl ^ data;
			pmu->global_ctrl = data;
			reprogram_counters(pmu, diff);
		}
		break;
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
		fallthrough;
	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
		if (data & pmu->global_ovf_ctrl_mask)
			return 1;

		if (!msr_info->host_initiated)
			pmu->global_status &= ~data;
		break;
	default:
		kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
		return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info);
	}

	return 0;

AMD doesn't generate #GP on attempts to write PMU RO MSRs or reserved bits.

How about this:

	/*
	 * Note, AMD ignores writes to reserved bits and read-only PMU MSRs,
	 * whereas Intel generates #GP on attempts to write reserved/RO MSRs.
	 */
	switch (msr) {
	case MSR_CORE_PERF_GLOBAL_STATUS:
		if (!msr_info->host_initiated || (data & pmu->global_ovf_ctrl_mask))
			return 1; /* RO MSR */
		fallthrough;
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
		if (!msr_info->host_initiated)
			break;

		pmu->global_status = data;
		break;
	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
		data &= ~pmu->global_ctrl_mask;
		fallthrough;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (!kvm_valid_perf_global_ctrl(pmu, data))
			return 1;

		if (pmu->global_ctrl != data) {
			diff = pmu->global_ctrl ^ data;
			pmu->global_ctrl = data;
			reprogram_counters(pmu, diff);
		}
		break;
	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
		if (data & pmu->global_ovf_ctrl_mask)
			return 1;
		fallthrough;
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
		if (!msr_info->host_initiated)
			pmu->global_status &= ~data;
		break;
	default:
		kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
		return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info);
	}

	return 0;
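
For completeness, the read side of these aliases is expected to stay symmetric; a rough sketch of the corresponding kvm_pmu_get_msr() cases (illustrative, not quoted verbatim from the patch):

	case MSR_CORE_PERF_GLOBAL_STATUS:
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
		msr_info->data = pmu->global_status;
		break;
	case MSR_CORE_PERF_GLOBAL_CTRL:
	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
		msr_info->data = pmu->global_ctrl;
		break;
	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
		/* Write-only from the guest's perspective, reads back as 0. */
		msr_info->data = 0;
		break;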


@@ -164,20 +181,34 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+	struct kvm_cpuid_entry2 *entry;
+	union cpuid_0x80000022_ebx ebx;
-	if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE))
+	pmu->version = 1;
+	if (guest_cpuid_has(vcpu, X86_FEATURE_PERFMON_V2)) {
+		pmu->version = 2;
+		entry = kvm_find_cpuid_entry_index(vcpu, 0x80000022, 0);

No need for the intermediate "entry".
+		ebx.full = entry->ebx;

Oof, at first glance this looks like a potential null-pointer deref bug. I
believe we can do

	/*
	 * Note, PERFMON_V2 is also in 0x80000022.0x0, i.e. the guest
	 * CPUID entry is guaranteed to be non-NULL.
	 */
	BUILD_BUG_ON(x86_feature_cpuid(X86_FEATURE_PERFMON_V2).function != 0x80000022 ||
		     x86_feature_cpuid(X86_FEATURE_PERFMON_V2).index);

	ebx.full = kvm_find_cpuid_entry_index(vcpu, 0x80000022, 0)->ebx;

+		pmu->nr_arch_gp_counters = min_t(unsigned int,
+						 ebx.split.num_core_pmc,
+						 kvm_pmu_cap.num_counters_gp);
+	} else if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) {
		pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE;

This needs to be sanitized, no? E.g. if KVM only has access to 4 counters, but
userspace sets X86_FEATURE_PERFCTR_CORE anyways. Hrm, unless I'm missing something,
that's a pre-existing bug.

Now your point is that if user space exposes more capability than KVM can support, KVM should constrain it.
Your previous preference was that user space may set capabilities even if KVM doesn't support them,
as long as doing so doesn't break KVM or the host, and the guest will eat its own bad configuration.


If I'm right, can you add a patch to cap nr_arch_gp_counters at
kvm_pmu_cap.num_counters_gp in the common flow, i.e. after this if-else block?
Then there is no change needed in this patch, e.g. we'll naturally end up with:

	union cpuid_0x80000022_ebx ebx;

	pmu->version = 1;
	if (guest_cpuid_has(vcpu, X86_FEATURE_PERFMON_V2)) {
		pmu->version = 2;
		/*
		 * Note, PERFMON_V2 is also in 0x80000022.0x0, i.e. the guest
		 * CPUID entry is guaranteed to be non-NULL.
		 */
		BUILD_BUG_ON(x86_feature_cpuid(X86_FEATURE_PERFMON_V2).function != 0x80000022 ||
			     x86_feature_cpuid(X86_FEATURE_PERFMON_V2).index);
		ebx.full = kvm_find_cpuid_entry_index(vcpu, 0x80000022, 0)->ebx;
		pmu->nr_arch_gp_counters = ebx.split.num_core_pmc;
	} else if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) {
		pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE;
	} else {
		pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS;
	}

	pmu->nr_arch_gp_counters = min_t(unsigned int,
					 pmu->nr_arch_gp_counters,
					 kvm_pmu_cap.num_counters_gp);