[PATCH 2/3] KVM: x86: Fix Intel PT Host/Guest mode when host tracing also

From: Adrian Hunter
Date: Fri Sep 06 2024 - 09:01:44 EST


Ensure Intel PT tracing is disabled before VM-Entry in Intel PT Host/Guest
mode.

Intel PT has 2 modes for tracing virtual machines. The default is System
mode whereby host and guest output to the host trace buffer. The other is
Host/Guest mode whereby host and guest output to their own buffers.
Host/Guest mode is selected by kvm_intel module parameter pt_mode=1.

In Host/Guest mode, the following rule must be followed:

If the logical processor is operating with Intel PT enabled
(if IA32_RTIT_CTL.TraceEn = 1) at the time of VM entry, the
"load IA32_RTIT_CTL" VM-entry control must be 0.

However, "load IA32_RTIT_CTL" VM-entry control is always 1 in Host/Guest
mode, so IA32_RTIT_CTL.TraceEn must always be 0 at VM entry, irrespective
of whether guest IA32_RTIT_CTL.TraceEn is 1.

Fix by stopping host Intel PT tracing always at VM entry in Host/Guest
mode. That also fixes the issue whereby the Intel PT NMI handler would
set IA32_RTIT_CTL.TraceEn back to 1 after KVM has just set it to 0.

Fixes: 2ef444f1600b ("KVM: x86: Add Intel PT context switch for each vcpu")
Cc: stable@xxxxxxxxxxxxxxx
Signed-off-by: Adrian Hunter <adrian.hunter@xxxxxxxxx>
---
arch/x86/events/intel/pt.c | 131 +++++++++++++++++++++++++++++++-
arch/x86/events/intel/pt.h | 10 +++
arch/x86/include/asm/intel_pt.h | 4 +
arch/x86/kvm/vmx/vmx.c | 23 ++----
arch/x86/kvm/vmx/vmx.h | 1 -
5 files changed, 147 insertions(+), 22 deletions(-)

diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c
index fd4670a6694e..a4c8feb94040 100644
--- a/arch/x86/events/intel/pt.c
+++ b/arch/x86/events/intel/pt.c
@@ -480,16 +480,20 @@ static u64 pt_config_filters(struct perf_event *event)
*/

/* avoid redundant msr writes */
- if (pt->filters.filter[range].msr_a != filter->msr_a) {
+ if (pt->filters.filter[range].msr_a != filter->msr_a ||
+ pt->write_filter_msrs[range]) {
wrmsrl(pt_address_ranges[range].msr_a, filter->msr_a);
pt->filters.filter[range].msr_a = filter->msr_a;
}

- if (pt->filters.filter[range].msr_b != filter->msr_b) {
+ if (pt->filters.filter[range].msr_b != filter->msr_b ||
+ pt->write_filter_msrs[range]) {
wrmsrl(pt_address_ranges[range].msr_b, filter->msr_b);
pt->filters.filter[range].msr_b = filter->msr_b;
}

+ pt->write_filter_msrs[range] = false;
+
rtit_ctl |= (u64)filter->config << pt_address_ranges[range].reg_off;
}

@@ -534,6 +538,11 @@ static void pt_config(struct perf_event *event)
reg |= (event->attr.config & PT_CONFIG_MASK);

event->hw.aux_config = reg;
+
+ /* Configuration is complete, it is now OK to handle an NMI */
+ barrier();
+ WRITE_ONCE(pt->handle_nmi, 1);
+
pt_config_start(event);
}

@@ -945,6 +954,7 @@ static void pt_handle_status(struct pt *pt)
pt_buffer_advance(buf);

wrmsrl(MSR_IA32_RTIT_STATUS, status);
+ pt->status = status;
}

/**
@@ -1583,7 +1593,6 @@ static void pt_event_start(struct perf_event *event, int mode)
goto fail_end_stop;
}

- WRITE_ONCE(pt->handle_nmi, 1);
hwc->state = 0;

pt_config_buffer(buf);
@@ -1638,6 +1647,104 @@ static void pt_event_stop(struct perf_event *event, int mode)
}
}

+#define PT_VM_NO_TRANSITION 0
+#define PT_VM_ENTRY 1
+#define PT_VM_EXIT 2
+
+void intel_pt_vm_entry(bool guest_trace_enable)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ struct perf_event *event;
+
+ pt->restart_event = NULL;
+ pt->stashed_buf_sz = 0;
+
+ WRITE_ONCE(pt->vm_transition, PT_VM_ENTRY);
+ barrier();
+
+ if (READ_ONCE(pt->handle_nmi)) {
+ /* Must stop handler before reading pt->handle.event */
+ WRITE_ONCE(pt->handle_nmi, 0);
+ barrier();
+ event = pt->handle.event;
+ if (event && !event->hw.state) {
+ struct pt_buffer *buf = perf_get_aux(&pt->handle);
+
+ if (buf && buf->snapshot)
+ pt->stashed_buf_sz = buf->nr_pages << PAGE_SHIFT;
+ pt->restart_event = event;
+ pt_event_stop(event, PERF_EF_UPDATE);
+ }
+ }
+
+ /*
+ * If guest_trace_enable, MSRs need to be saved, but the values are
+ * either already cached or not needed:
+ * MSR_IA32_RTIT_CTL event->hw.aux_config
+ * MSR_IA32_RTIT_STATUS pt->status
+ * MSR_IA32_RTIT_CR3_MATCH not used
+ * MSR_IA32_RTIT_OUTPUT_BASE pt->output_base
+ * MSR_IA32_RTIT_OUTPUT_MASK pt->output_mask
+ * MSR_IA32_RTIT_ADDR... pt->filters
+ */
+}
+EXPORT_SYMBOL_GPL(intel_pt_vm_entry);
+
+void intel_pt_vm_exit(bool guest_trace_enable)
+{
+ struct pt *pt = this_cpu_ptr(&pt_ctx);
+ u64 base = pt->output_base;
+ u64 mask = pt->output_mask;
+
+ WRITE_ONCE(pt->vm_transition, PT_VM_EXIT);
+ barrier();
+
+ /*
+ * If guest_trace_enable, MSRs need to be restored, but that is handled
+ * in different ways:
+ * MSR_IA32_RTIT_CTL written next start
+ * MSR_IA32_RTIT_STATUS restored below
+ * MSR_IA32_RTIT_CR3_MATCH not used
+ * MSR_IA32_RTIT_OUTPUT_BASE written next start or restored
+ * further below
+ * MSR_IA32_RTIT_OUTPUT_MASK written next start or restored
+ * further below
+ * MSR_IA32_RTIT_ADDR... flagged to be written when
+ * needed
+ */
+ if (guest_trace_enable) {
+ wrmsrl(MSR_IA32_RTIT_STATUS, pt->status);
+ /*
+ * Force address filter MSR writes during reconfiguration,
+ * refer pt_config_filters().
+ */
+ for (int range = 0; range < PT_FILTERS_NUM; range++)
+ pt->write_filter_msrs[range] = true;
+ }
+
+ if (pt->restart_event) {
+ if (guest_trace_enable) {
+ /* Invalidate to force buffer reconfiguration */
+ pt->output_base = ~0ULL;
+ pt->output_mask = 0;
+ }
+ pt_event_start(pt->restart_event, 0);
+ pt->restart_event = NULL;
+ }
+
+ /* If tracing wasn't started, restore buffer configuration */
+ if (guest_trace_enable && !READ_ONCE(pt->handle_nmi)) {
+ wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base);
+ wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, mask);
+ pt->output_base = base;
+ pt->output_mask = mask;
+ }
+
+ barrier();
+ WRITE_ONCE(pt->vm_transition, PT_VM_NO_TRANSITION);
+}
+EXPORT_SYMBOL_GPL(intel_pt_vm_exit);
+
static long pt_event_snapshot_aux(struct perf_event *event,
struct perf_output_handle *handle,
unsigned long size)
@@ -1646,6 +1753,24 @@ static long pt_event_snapshot_aux(struct perf_event *event,
struct pt_buffer *buf = perf_get_aux(&pt->handle);
unsigned long from = 0, to;
long ret;
+ int tr;
+
+ /*
+ * Special handling during VM transition. At VM-Entry stage, once
+ * tracing is stopped, as indicated by buf == NULL, snapshot using the
+ * saved head position. At VM-Exit do that also until tracing is
+ * reconfigured as indicated by handle_nmi.
+ */
+ tr = READ_ONCE(pt->vm_transition);
+ if ((tr == PT_VM_ENTRY && !buf) || (tr == PT_VM_EXIT && !READ_ONCE(pt->handle_nmi))) {
+ if (WARN_ON_ONCE(!pt->stashed_buf_sz))
+ return 0;
+ to = pt->handle.head;
+ if (to < size)
+ from = pt->stashed_buf_sz;
+ from += to - size;
+ return perf_output_copy_aux(&pt->handle, handle, from, to);
+ }

if (WARN_ON_ONCE(!buf))
return 0;
diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h
index f5e46c04c145..ecaaf112b923 100644
--- a/arch/x86/events/intel/pt.h
+++ b/arch/x86/events/intel/pt.h
@@ -119,6 +119,11 @@ struct pt_filters {
* @vmx_on: 1 if VMX is ON on this cpu
* @output_base: cached RTIT_OUTPUT_BASE MSR value
* @output_mask: cached RTIT_OUTPUT_MASK MSR value
+ * @status: cached RTIT_STATUS MSR value
+ * @vm_transition: VM transition (snapshot_aux needs special handling)
+ * @write_filter_msrs: write address filter MSRs during configuration
+ * @stashed_buf_sz: buffer size during VM transition
+ * @restart_event: event to restart after VM-Exit
*/
struct pt {
struct perf_output_handle handle;
@@ -127,6 +132,11 @@ struct pt {
int vmx_on;
u64 output_base;
u64 output_mask;
+ u64 status;
+ int vm_transition;
+ bool write_filter_msrs[PT_FILTERS_NUM];
+ unsigned long stashed_buf_sz;
+ struct perf_event *restart_event;
};

#endif /* __INTEL_PT_H__ */
diff --git a/arch/x86/include/asm/intel_pt.h b/arch/x86/include/asm/intel_pt.h
index c796e9bc98b6..a673ac3a825e 100644
--- a/arch/x86/include/asm/intel_pt.h
+++ b/arch/x86/include/asm/intel_pt.h
@@ -30,11 +30,15 @@ enum pt_capabilities {
void cpu_emergency_stop_pt(void);
extern u32 intel_pt_validate_hw_cap(enum pt_capabilities cap);
extern u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities cap);
+extern void intel_pt_vm_entry(bool guest_trace_enable);
+extern void intel_pt_vm_exit(bool guest_trace_enable);
extern int is_intel_pt_event(struct perf_event *event);
#else
static inline void cpu_emergency_stop_pt(void) {}
static inline u32 intel_pt_validate_hw_cap(enum pt_capabilities cap) { return 0; }
static inline u32 intel_pt_validate_cap(u32 *caps, enum pt_capabilities capability) { return 0; }
+static inline void intel_pt_vm_entry(bool guest_trace_enable) {}
+static inline void intel_pt_vm_exit(bool guest_trace_enable) {}
static inline int is_intel_pt_event(struct perf_event *event) { return 0; }
#endif

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 3f1e3be552c0..d20458d83829 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1224,16 +1224,10 @@ static void pt_guest_enter(struct vcpu_vmx *vmx)
if (vmx_pt_mode_is_system())
return;

- /*
- * GUEST_IA32_RTIT_CTL is already set in the VMCS.
- * Save host state before VM entry.
- */
- rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
- if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
- wrmsrl(MSR_IA32_RTIT_CTL, 0);
- pt_save_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
+ intel_pt_vm_entry(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
+
+ if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN)
pt_load_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
- }
}

static void pt_guest_exit(struct vcpu_vmx *vmx)
@@ -1241,17 +1235,10 @@ static void pt_guest_exit(struct vcpu_vmx *vmx)
if (vmx_pt_mode_is_system())
return;

- if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
+ if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN)
pt_save_msr(&vmx->pt_desc.guest, vmx->pt_desc.num_address_ranges);
- pt_load_msr(&vmx->pt_desc.host, vmx->pt_desc.num_address_ranges);
- }

- /*
- * KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest,
- * i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary.
- */
- if (vmx->pt_desc.host.ctl)
- wrmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
+ intel_pt_vm_exit(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
}

void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 42498fa63abb..e1616282d97f 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -67,7 +67,6 @@ struct pt_desc {
u64 ctl_bitmask;
u32 num_address_ranges;
u32 caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
- struct pt_ctx host;
struct pt_ctx guest;
};

--
2.43.0