[PATCH] KVM: X86: Emulate APERF/MPERF to report actual VCPU frequency

From: Like Xu
Date: Tue Jun 23 2020 - 02:36:48 EST


Since commit 7d5905dc14a ("x86 / CPU: Always show current CPU frequency in
/proc/cpuinfo"), APERF/MPERF are used to report the current CPU frequency.
However, a guest kernel always reports a fixed vCPU frequency in
/proc/cpuinfo, which may confuse users, especially when turbo is enabled
on the host.

Emulate the guest's APERF/MPERF capability, deriving the guest-visible
counter values from the host's APERF/MPERF counters.

Co-developed-by: Li RongQing <lirongqing@xxxxxxxxx>
Signed-off-by: Li RongQing <lirongqing@xxxxxxxxx>
Reviewed-by: Chai Wen <chaiwen@xxxxxxxxx>
Reviewed-by: Jia Lina <jialina01@xxxxxxxxx>
Signed-off-by: Like Xu <like.xu@xxxxxxxxxxxxxxx>
---
arch/x86/include/asm/kvm_host.h | 12 ++++++
arch/x86/kvm/cpuid.c | 8 +++-
arch/x86/kvm/x86.c | 76 ++++++++++++++++++++++++++++++++-
3 files changed, 94 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index f852ee350beb..c48b9a0a086e 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -539,6 +539,16 @@ struct kvm_vcpu_hv_stimer {
bool msg_pending;
};

+/*
+ * vCPU thermal and power context.
+ *
+ * Per-vCPU state backing the emulated MSR_IA32_MPERF/MSR_IA32_APERF
+ * counters.
+ */
+struct kvm_vcpu_hwp {
+ /*
+  * Hardware Coordination Feedback Capability (Presence of APERF/MPERF).
+  * Set by kvm_update_cpuid() when the guest's CPUID leaf 0x6 has ECX
+  * bit 0 set and the host has X86_FEATURE_APERFMPERF.  While false,
+  * guest (non host-initiated) accesses to the two MSRs are rejected and
+  * vcpu_enter_guest() does not sample the host counters.
+  */
+ bool hw_coord_fb_cap;
+ /*
+  * MPERF increases with a fixed frequency.
+  * Emulated value: advanced by vcpu_update_amperf() with the host MPERF
+  * delta measured around each guest run, and returned for reads of
+  * MSR_IA32_MPERF.
+  */
+ u64 mperf;
+ /*
+  * APERF increases with the current/actual frequency.
+  * Emulated value: advanced by vcpu_update_amperf() with the host APERF
+  * delta measured around each guest run, and returned for reads of
+  * MSR_IA32_APERF.
+  */
+ u64 aperf;
+};
+
/* Hyper-V synthetic interrupt controller (SynIC)*/
struct kvm_vcpu_hv_synic {
u64 version;
@@ -829,6 +839,8 @@ struct kvm_vcpu_arch {

/* AMD MSRC001_0015 Hardware Configuration */
u64 msr_hwcr;
+
+ struct kvm_vcpu_hwp hwp;
};

struct kvm_lpage_info {
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 8a294f9747aa..7057809e7cfd 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -78,6 +78,11 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
apic->lapic_timer.timer_mode_mask = 1 << 17;
}

+ best = kvm_find_cpuid_entry(vcpu, 0x6, 0);
+ if (best && best->function == 0x6 &&
+ boot_cpu_has(X86_FEATURE_APERFMPERF) && (best->ecx & 0x1))
+ vcpu->arch.hwp.hw_coord_fb_cap = true;
+
best = kvm_find_cpuid_entry(vcpu, 7, 0);
if (best && boot_cpu_has(X86_FEATURE_PKU) && best->function == 0x7)
cpuid_entry_change(best, X86_FEATURE_OSPKE,
@@ -561,7 +566,8 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function)
case 6: /* Thermal management */
entry->eax = 0x4; /* allow ARAT */
entry->ebx = 0;
- entry->ecx = 0;
+ /* allow aperf/mperf to report the true VCPU frequency. */
+ entry->ecx = boot_cpu_has(X86_FEATURE_APERFMPERF) ? 0x1 : 0;
entry->edx = 0;
break;
/* function 7 has additional index. */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 00c88c2f34e4..d220d9cc904a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3056,6 +3056,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
vcpu->arch.msr_misc_features_enables = data;
break;
+ case MSR_IA32_MPERF:
+ if (!msr_info->host_initiated && !vcpu->arch.hwp.hw_coord_fb_cap)
+ return 1;
+ vcpu->arch.hwp.mperf = 0;
+ return 0;
+ case MSR_IA32_APERF:
+ if (!msr_info->host_initiated && !vcpu->arch.hwp.hw_coord_fb_cap)
+ return 1;
+ vcpu->arch.hwp.aperf = 0;
+ return 0;
default:
if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
return xen_hvm_config(vcpu, data);
@@ -3323,6 +3333,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_K7_HWCR:
msr_info->data = vcpu->arch.msr_hwcr;
break;
+ case MSR_IA32_MPERF:
+ if (!msr_info->host_initiated && !vcpu->arch.hwp.hw_coord_fb_cap)
+ return 1;
+ msr_info->data = vcpu->arch.hwp.mperf;
+ break;
+ case MSR_IA32_APERF:
+ if (!msr_info->host_initiated && !vcpu->arch.hwp.hw_coord_fb_cap)
+ return 1;
+ msr_info->data = vcpu->arch.hwp.aperf;
+ break;
default:
if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
return kvm_pmu_get_msr(vcpu, msr_info);
@@ -8300,6 +8320,50 @@ void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
}
EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);

+/*
+ * Sample the host's MPERF/APERF pair.
+ *
+ * Reads MSR_IA32_MPERF first and MSR_IA32_APERF second via rdmsrl(), then
+ * stores the two raw values through @mperf and @aperf respectively.
+ */
+static inline void get_host_amperf(u64 *mperf, u64 *aperf)
+{
+	u64 host_mperf, host_aperf;
+
+	rdmsrl(MSR_IA32_MPERF, host_mperf);
+	rdmsrl(MSR_IA32_APERF, host_aperf);
+
+	*mperf = host_mperf;
+	*aperf = host_aperf;
+}
+
+/*
+ * Return how many ticks a 64-bit counter advanced between the sample taken
+ * before guest entry (@enter) and the sample taken after guest exit (@exit).
+ *
+ * u64 subtraction is performed modulo 2^64, so the result is also correct
+ * when @exit is numerically smaller than @enter because the counter wrapped
+ * once in between.  An explicit "ULONG_MAX - enter + exit" would be one tick
+ * short in that case, and ULONG_MAX is only 32 bits wide on 32-bit builds.
+ */
+static inline u64 get_amperf_delta(u64 enter, u64 exit)
+{
+	return exit - enter;
+}
+
+/*
+ * Advance the vCPU's emulated APERF/MPERF counters by the deltas measured
+ * across one guest run.
+ *
+ * @vcpu:   vCPU whose vcpu->arch.hwp.aperf/mperf are updated
+ * @adelta: APERF ticks to add
+ * @mdelta: MPERF ticks to add
+ *
+ * If neither emulated counter would overflow, the deltas are simply added.
+ * Otherwise the rule "when either MSR overflows, both MSRs are reset to zero
+ * and continue to increment" is approximated as described at each branch.
+ */
+static inline void vcpu_update_amperf(struct kvm_vcpu *vcpu, u64 adelta, u64 mdelta)
+{
+	/* Both counters are 64 bits wide regardless of the host word size. */
+	const u64 counter_max = ~(u64)0;
+	u64 aperf_left, mperf_left, delta, tmp;
+
+	/* Headroom of each counter before it overflows. */
+	aperf_left = counter_max - vcpu->arch.hwp.aperf;
+	mperf_left = counter_max - vcpu->arch.hwp.mperf;
+
+	/* fast path when neither MSR overflows */
+	if (adelta <= aperf_left && mdelta <= mperf_left) {
+		vcpu->arch.hwp.aperf += adelta;
+		vcpu->arch.hwp.mperf += mdelta;
+		return;
+	}
+
+	/*
+	 * When either MSR overflows, both MSRs are reset to zero and continue
+	 * to increment.
+	 *
+	 * First case: the overflow already happens within the part common to
+	 * both deltas.  Model the counters as advancing in lock-step for
+	 * 'delta' ticks: the larger counter wraps first, both restart from
+	 * zero for the rest of the common part (tmp), and each counter then
+	 * receives whatever remains of its own delta.
+	 */
+	delta = min(adelta, mdelta);
+	if (delta > aperf_left || delta > mperf_left) {
+		tmp = max(vcpu->arch.hwp.aperf, vcpu->arch.hwp.mperf);
+		tmp = delta - (counter_max - tmp) - 1;
+		vcpu->arch.hwp.aperf = tmp + adelta - delta;
+		vcpu->arch.hwp.mperf = tmp + mdelta - delta;
+		return;
+	}
+
+	/*
+	 * Second case: min(adelta, mdelta) fits into both counters, so exactly
+	 * one counter overflows, and it is the one whose own delta exceeds its
+	 * own headroom.  That counter keeps the ticks past the wrap; the other
+	 * one restarts from zero.
+	 *
+	 * Each delta must be compared against the headroom of its own counter:
+	 * testing mdelta against aperf_left could pick the APERF arm while
+	 * only MPERF overflows, making "adelta - aperf_left - 1" underflow.
+	 */
+	if (mdelta > mperf_left) {
+		vcpu->arch.hwp.mperf = mdelta - mperf_left - 1;
+		vcpu->arch.hwp.aperf = 0;
+	} else {
+		vcpu->arch.hwp.mperf = 0;
+		vcpu->arch.hwp.aperf = adelta - aperf_left - 1;
+	}
+}
+
/*
* Returns 1 to let vcpu_run() continue the guest execution loop without
* exiting to the userspace. Otherwise, the value will be returned to the
@@ -8312,7 +8376,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
dm_request_for_irq_injection(vcpu) &&
kvm_cpu_accept_dm_intr(vcpu);
fastpath_t exit_fastpath;
-
+ u64 enter_mperf = 0, enter_aperf = 0, exit_mperf = 0, exit_aperf = 0;
bool req_immediate_exit = false;

if (kvm_request_pending(vcpu)) {
@@ -8516,8 +8580,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
}

+ if (unlikely(vcpu->arch.hwp.hw_coord_fb_cap))
+ get_host_amperf(&enter_mperf, &enter_aperf);
+
exit_fastpath = kvm_x86_ops.run(vcpu);

+ if (unlikely(vcpu->arch.hwp.hw_coord_fb_cap)) {
+ get_host_amperf(&exit_mperf, &exit_aperf);
+ vcpu_update_amperf(vcpu, get_amperf_delta(enter_aperf, exit_aperf),
+ get_amperf_delta(enter_mperf, exit_mperf));
+ }
+
/*
* Do this here before restoring debug registers on the host. And
* since we do this before handling the vmexit, a DR access vmexit
@@ -9482,6 +9555,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)

vcpu->arch.pending_external_vector = -1;
vcpu->arch.preempted_in_kernel = false;
+ vcpu->arch.hwp.hw_coord_fb_cap = false;

kvm_hv_vcpu_init(vcpu);

--
2.21.3