On Tue, Feb 14, 2023, Like Xu wrote:
+ case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
+ if (!msr_info->host_initiated)
+ return 0; /* Writes are ignored */
Where is the "writes ignored" behavior documented? I can't find anything in the
APM that defines write behavior.
pmu->global_status = data;
return 0;
case MSR_CORE_PERF_GLOBAL_CTRL:
if (!kvm_valid_perf_global_ctrl(pmu, data))
return 1;
-
+ fallthrough;
This _definitely_ needs a comment. Hmm, and I would prefer to reverse these, i.e.
case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
data &= ~pmu->global_ctrl_mask;
fallthrough;
case MSR_CORE_PERF_GLOBAL_CTRL:
if (!kvm_valid_perf_global_ctrl(pmu, data))
return 1;
It's a bit arbitrary, but either Intel or AMD is going to end up with extra code,
and IMO skipping a validity check is more alarming than skipping clearing of
reserved bits, i.e. will look like a bug to future readers.
+ case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
+ data &= ~pmu->global_ctrl_mask;
if (pmu->global_ctrl != data) {
diff = pmu->global_ctrl ^ data;
pmu->global_ctrl = data;
@@ -616,7 +625,8 @@ int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
if (data & pmu->global_ovf_ctrl_mask)
return 1;
-
+ fallthrough;
Here too. Argh, the APM doesn't actually define what happens on reserved bits,
it just says "WO". I vote to be conservative and ignore writes to reserved bits.
And then we can have one comment for the whole block, e.g.
/*
* Note, AMD ignores writes to read-only PMU MSRs/bits, whereas Intel
* generates #GP on attempts to write reserved bits or RO MSRs.
*/
switch (msr) {
case MSR_CORE_PERF_GLOBAL_STATUS:
if (!msr_info->host_initiated)
return 1; /* RO MSR */
fallthrough;
case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
if (!msr_info->host_initiated)
break;
pmu->global_status = data;
break;
case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
data &= ~pmu->global_ctrl_mask;
fallthrough;
case MSR_CORE_PERF_GLOBAL_CTRL:
if (!kvm_valid_perf_global_ctrl(pmu, data))
return 1;
if (pmu->global_ctrl != data) {
diff = pmu->global_ctrl ^ data;
pmu->global_ctrl = data;
reprogram_counters(pmu, diff);
}
break;
case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
fallthrough;
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
if (data & pmu->global_ovf_ctrl_mask)
return 1;
if (!msr_info->host_initiated)
pmu->global_status &= ~data;
break;
default:
kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info);
}
return 0;
@@ -164,20 +181,34 @@ static int amd_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
static void amd_pmu_refresh(struct kvm_vcpu *vcpu)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_cpuid_entry2 *entry;
+ union cpuid_0x80000022_ebx ebx;
- if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE))
+ pmu->version = 1;
+ if (guest_cpuid_has(vcpu, X86_FEATURE_PERFMON_V2)) {
+ pmu->version = 2;
+ entry = kvm_find_cpuid_entry_index(vcpu, 0x80000022, 0);
No need for the intermediate "entry".
+ ebx.full = entry->ebx;
Oof, at first glance this looks like a potential null-pointer deref bug. I
believe we can do
/*
* Note, PERFMON_V2 is also in 0x80000022.0x0, i.e. the guest
* CPUID entry is guaranteed to be non-NULL.
*/
BUILD_BUG_ON(x86_feature_cpuid(X86_FEATURE_PERFMON_V2).function != 0x80000022 ||
x86_feature_cpuid(X86_FEATURE_PERFMON_V2).index != 0x80000022);
ebx.full = kvm_find_cpuid_entry_index(vcpu, 0x80000022, 0)->ebx;
+ pmu->nr_arch_gp_counters = min_t(unsigned int,
+ ebx.split.num_core_pmc,
+ kvm_pmu_cap.num_counters_gp);
+ } else if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) {
pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE;
This needs to be sanitized, no? E.g. if KVM only has access to 4 counters, but
userspace sets X86_FEATURE_PERFCTR_CORE anyways. Hrm, unless I'm missing something,
that's a pre-existing bug.
If I'm right, can you add a patch to cap nr_arch_gp_counters at
kvm_pmu_cap.num_counters_gp in the common flow, i.e. after this if-else block?
Then there is no change needed in this patch, e.g. we'll naturally end up with:
union cpuid_0x80000022_ebx ebx;
pmu->version = 1;
if (guest_cpuid_has(vcpu, X86_FEATURE_PERFMON_V2)) {
pmu->version = 2;
/*
* Note, PERFMON_V2 is also in 0x80000022.0x0, i.e. the guest
* CPUID entry is guaranteed to be non-NULL.
*/
BUILD_BUG_ON(x86_feature_cpuid(X86_FEATURE_PERFMON_V2).function != 0x80000022 ||
x86_feature_cpuid(X86_FEATURE_PERFMON_V2).index);
ebx.full = kvm_find_cpuid_entry_index(vcpu, 0x80000022, 0)->ebx;
pmu->nr_arch_gp_counters = ebx.split.num_core_pmc;
} else if (guest_cpuid_has(vcpu, X86_FEATURE_PERFCTR_CORE)) {
pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS_CORE;
} else {
pmu->nr_arch_gp_counters = AMD64_NUM_COUNTERS;
}
pmu->nr_arch_gp_counters = min_t(unsigned int,
pmu->nr_arch_gp_counters,
kvm_pmu_cap.num_counters_gp);