[RFC 3/3] arch/x86/kvm: SVM: Introduce pause loop exit logic in SVM

From: Babu Moger
Date: Fri Mar 02 2018 - 11:18:41 EST


Bring the PLE(pause loop exit) logic to AMD svm driver.
We have noticed it help in situations where numerous pauses are generated
due to spinlock or other scenarios. Tested it with idle=poll and noticed
pause interceptions go down considerably.

Signed-off-by: Babu Moger <babu.moger@xxxxxxx>
---
arch/x86/kvm/svm.c | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
arch/x86/kvm/x86.h | 1 +
2 files changed, 114 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 50a4e95..30bc851 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -263,6 +263,55 @@ struct amd_svm_iommu_ir {
static bool npt_enabled;
#endif

+/*
+ * These 2 parameters are used to config the controls for Pause-Loop Exiting:
+ * pause_filter_thresh: On processors that support Pause filtering(indicated
+ * by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
+ * count value. On VMRUN this value is loaded into an internal counter.
+ * Each time a pause instruction is executed, this counter is decremented
+ * until it reaches zero at which time a #VMEXIT is generated if pause
+ * intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
+ * Intercept Filtering for more details.
+ * This also indicate if ple logic enabled.
+ *
+ * pause_filter_count: In addition, some processor families support advanced
+ * pause filtering (indicated by CPUID Fn8000_000A_EDX) upper bound on
+ * the amount of time a guest is allowed to execute in a pause loop.
+ * In this mode, a 16-bit pause filter threshold field is added in the
+ * VMCB. The threshold value is a cycle count that is used to reset the
+ * pause counter. As with simple pause filtering, VMRUN loads the pause
+ * count value from VMCB into an internal counter. Then, on each pause
+ * instruction the hardware checks the elapsed number of cycles since
+ * the most recent pause instruction against the pause filter threshold.
+ * If the elapsed cycle count is greater than the pause filter threshold,
+ * then the internal pause count is reloaded from the VMCB and execution
+ * continues. If the elapsed cycle count is less than the pause filter
+ * threshold, then the internal pause count is decremented. If the count
+ * value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
+ * triggered. If advanced pause filtering is supported and pause filter
+ * threshold field is set to zero, the filter will operate in the simpler,
+ * count only mode.
+ */
+
+static int pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
+module_param(pause_filter_thresh, int, S_IRUGO);
+
+static int pause_filter_count = KVM_DEFAULT_PLE_WINDOW;
+module_param(pause_filter_count, int, S_IRUGO);
+
+/* Default doubles per-vcpu window every exit. */
+static int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
+module_param(ple_window_grow, int, S_IRUGO);
+
+/* Default resets per-vcpu window every exit to ple_window. */
+static int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
+module_param(ple_window_shrink, int, S_IRUGO);
+
+/* Default is to compute the maximum so we can never overflow. */
+static int ple_window_actual_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
+static int ple_window_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
+module_param(ple_window_max, int, S_IRUGO);
+
/* allow nested paging (virtualized MMU) for all guests */
static int npt = true;
module_param(npt, int, S_IRUGO);
@@ -1046,6 +1095,58 @@ static int avic_ga_log_notifier(u32 ga_tag)
return 0;
}

+static void grow_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ int old = control->pause_filter_count;
+
+ control->pause_filter_count = __grow_ple_window(old,
+ pause_filter_count,
+ ple_window_grow,
+ ple_window_actual_max);
+
+ if (control->pause_filter_count != old)
+ mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+
+ trace_kvm_ple_window_grow(vcpu->vcpu_id,
+ control->pause_filter_count, old);
+}
+
+static void shrink_ple_window(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ struct vmcb_control_area *control = &svm->vmcb->control;
+ int old = control->pause_filter_count;
+
+ control->pause_filter_count = __shrink_ple_window(old,
+ pause_filter_count,
+ ple_window_shrink,
+ pause_filter_count);
+
+ if (control->pause_filter_count != old)
+ mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
+
+ trace_kvm_ple_window_shrink(vcpu->vcpu_id,
+ control->pause_filter_count, old);
+}
+
+/*
+ * ple_window_actual_max is computed to be one grow_ple_window() below
+ * ple_window_max. (See __grow_ple_window for the reason.)
+ * This prevents overflows, because ple_window_max is int.
+ * ple_window_max effectively rounded down to a multiple of ple_window_grow in
+ * this process.
+ * ple_window_max is also prevented from setting control->pause_filter_count <
+ * pause_filter_count.
+ */
+static void update_ple_window_actual_max(void)
+{
+ ple_window_actual_max =
+ __shrink_ple_window(max(ple_window_max, pause_filter_count),
+ pause_filter_count,
+ ple_window_grow, SHRT_MIN);
+}
static __init int svm_hardware_setup(void)
{
int cpu;
@@ -1101,6 +1202,8 @@ static __init int svm_hardware_setup(void)
} else
kvm_disable_tdp();

+ update_ple_window_actual_max();
+
if (avic) {
if (!npt_enabled ||
!boot_cpu_has(X86_FEATURE_AVIC) ||
@@ -1309,7 +1412,11 @@ static void init_vmcb(struct vcpu_svm *svm)
svm->vcpu.arch.hflags = 0;

if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
- control->pause_filter_count = 3000;
+ control->pause_filter_count = pause_filter_count;
+ if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD))
+ control->pause_filter_thresh = pause_filter_thresh;
+ else
+ pause_filter_thresh = 0;
set_intercept(svm, INTERCEPT_PAUSE);
}

@@ -3802,6 +3909,9 @@ static int pause_interception(struct vcpu_svm *svm)
struct kvm_vcpu *vcpu = &svm->vcpu;
bool in_kernel = (svm_get_cpl(vcpu) == 0);

+ if (pause_filter_thresh)
+ grow_ple_window(vcpu);
+
kvm_vcpu_on_spin(vcpu, in_kernel);
return 1;
}
@@ -5424,6 +5534,8 @@ static void svm_handle_external_intr(struct kvm_vcpu *vcpu)

static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
+ if (pause_filter_thresh)
+ shrink_ple_window(vcpu);
}

static inline void avic_post_state_restore(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index d1fb7bb..4c4f6b8 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -15,6 +15,7 @@
#define KVM_DEFAULT_PLE_WINDOW_SHRINK 0
#define KVM_VMX_DEFAULT_PLE_WINDOW_MAX \
(INT_MAX / KVM_DEFAULT_PLE_WINDOW_GROW)
+#define KVM_SVM_DEFAULT_PLE_WINDOW_MAX USHRT_MAX

static inline int __grow_ple_window(int val, int base, int modifier, int max)
{
--
1.8.3.1