RE: [PATCH 4/4] KVM: selftests: Add test case for guest and host LBR preemption
From: Zhang, Xiong Y
Date: Wed Jun 28 2023 - 22:39:38 EST
> This direction could be a vPMU "coexistence" feature, please feel free to test it.
> But the first step might be to test the behavior of vPMC when its event is
> preempted.
It is harder to construct vPMC preemption as several counters exist in processor, while only one LBR and one PEBS exist in processor.
> Then expand to Guest LBR and Guest PEBS etc.
>
> On 16/6/2023 7:33 pm, Xiong Zhang wrote:
> > When guest access LBR msr at the first time, kvm will create a vLBR
> > event, vLBR event joins perf scheduler and occupy physical LBR for guest usage.
> > Once vLBR event is active and own LBR, guest could access LBR msr.
> >
> > But vLBR event is per process pinned event, perf has higher priority event:
> > per cpu pinned LBR event, perf has lower priority events also: per cpu
> > LBR event and per process LBR event.
> > So if host doesn't have higher priority per cpu pinned LBR event, vLBR
> > event could occupy physical LBR always. But once per cpu pinned LBR
> > event is active, vLBR event couldn't be active anymore, then guest
> > couldn't access LBR msr.
> >
> > This commit adds test case to cover guest and host lbr contend.
> >
> > Signed-off-by: Xiong Zhang <xiong.y.zhang@xxxxxxxxx>
> > ---
> > tools/testing/selftests/kvm/Makefile | 1 +
> > .../selftests/kvm/include/ucall_common.h | 17 ++
> > .../kvm/x86_64/pmu_event_filter_test.c | 16 --
> > .../kvm/x86_64/vmx_pmu_lbr_contend.c | 171 ++++++++++++++++++
> > 4 files changed, 189 insertions(+), 16 deletions(-)
> > create mode 100644
> > tools/testing/selftests/kvm/x86_64/vmx_pmu_lbr_contend.c
> >
> > diff --git a/tools/testing/selftests/kvm/Makefile
> > b/tools/testing/selftests/kvm/Makefile
> > index 4761b768b773..422bbc16ba2a 100644
> > --- a/tools/testing/selftests/kvm/Makefile
> > +++ b/tools/testing/selftests/kvm/Makefile
> > @@ -100,6 +100,7 @@ TEST_GEN_PROGS_x86_64 +=
> x86_64/vmx_dirty_log_test
> > TEST_GEN_PROGS_x86_64 +=
> x86_64/vmx_exception_with_invalid_guest_state
> > TEST_GEN_PROGS_x86_64 += x86_64/vmx_msrs_test
> > TEST_GEN_PROGS_x86_64 += x86_64/vmx_invalid_nested_guest_state
> > +TEST_GEN_PROGS_x86_64 += x86_64/vmx_pmu_lbr_contend
>
> x86_64/vmx_pmu_coexistence ?
Yes, once this expand to vPMC and PEBS
>
> > TEST_GEN_PROGS_x86_64 += x86_64/vmx_set_nested_state_test
> > TEST_GEN_PROGS_x86_64 += x86_64/vmx_tsc_adjust_test
> > TEST_GEN_PROGS_x86_64 += x86_64/vmx_nested_tsc_scaling_test
> > diff --git a/tools/testing/selftests/kvm/include/ucall_common.h
> > b/tools/testing/selftests/kvm/include/ucall_common.h
> > index 1a6aaef5ccae..c1bb0cacf390 100644
> > --- a/tools/testing/selftests/kvm/include/ucall_common.h
> > +++ b/tools/testing/selftests/kvm/include/ucall_common.h
> > @@ -35,6 +35,23 @@ void ucall(uint64_t cmd, int nargs, ...);
> > uint64_t get_ucall(struct kvm_vcpu *vcpu, struct ucall *uc);
> > void ucall_init(struct kvm_vm *vm, vm_paddr_t mmio_gpa);
> >
> > +/*
> > + * Run the VM to the next GUEST_SYNC(value), and return the value
> > +passed
> > + * to the sync. Any other exit from the guest is fatal.
> > + */
> > +static inline uint64_t run_vcpu_to_sync(struct kvm_vcpu *vcpu) {
> > + struct ucall uc;
> > +
> > + vcpu_run(vcpu);
> > + TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
> > + get_ucall(vcpu, &uc);
> > + TEST_ASSERT(uc.cmd == UCALL_SYNC,
> > + "Received ucall other than UCALL_SYNC: %lu", uc.cmd);
> > +
> > + return uc.args[1];
> > +}
> > +
> > /*
> > * Perform userspace call without any associated data. This bare call avoids
> > * allocating a ucall struct, which can be useful if the atomic
> > operations in diff --git
> > a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
> > b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
> > index 40507ed9fe8a..8c68029cfb4b 100644
> > --- a/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
> > +++ b/tools/testing/selftests/kvm/x86_64/pmu_event_filter_test.c
> > @@ -177,22 +177,6 @@ static void amd_guest_code(void)
> > }
> > }
> >
> > -/*
> > - * Run the VM to the next GUEST_SYNC(value), and return the value
> > passed
> > - * to the sync. Any other exit from the guest is fatal.
> > - */
> > -static uint64_t run_vcpu_to_sync(struct kvm_vcpu *vcpu) -{
> > - struct ucall uc;
> > -
> > - vcpu_run(vcpu);
> > - TEST_ASSERT_KVM_EXIT_REASON(vcpu, KVM_EXIT_IO);
> > - get_ucall(vcpu, &uc);
> > - TEST_ASSERT(uc.cmd == UCALL_SYNC,
> > - "Received ucall other than UCALL_SYNC: %lu", uc.cmd);
> > - return uc.args[1];
> > -}
>
> Can this part be a separate patch ?
OK.
>
> > -
> > static void run_vcpu_and_sync_pmc_results(struct kvm_vcpu *vcpu)
> > {
> > uint64_t r;
> > diff --git a/tools/testing/selftests/kvm/x86_64/vmx_pmu_lbr_contend.c
> > b/tools/testing/selftests/kvm/x86_64/vmx_pmu_lbr_contend.c
> > new file mode 100644
> > index 000000000000..a6a793f08515
> > --- /dev/null
> > +++ b/tools/testing/selftests/kvm/x86_64/vmx_pmu_lbr_contend.c
> > @@ -0,0 +1,171 @@
> > +// SPDX-License-Identifier: GPL-2.0
> > +/*
> > + * Test for host and guest LBR preemption
> > + *
> > + * Copyright (C) 2021 Intel Corporation
> > + *
> > + */
> > +
> > +#define _GNU_SOURCEGG
> > +
> > +#include <linux/perf_event.h>
> > +#include <sys/syscall.h>
> > +#include <sys/sysinfo.h>
> > +#include <unistd.h>
> > +
> > +#include "test_util.h"
> > +#include "kvm_util.h"
> > +#include "processor.h"
> > +
> > +static void create_perf_events(int *fds, int cpu_num, bool pinned) {
> > + struct perf_event_attr attr = {
> > + .type = PERF_TYPE_HARDWARE,
> > + .size = sizeof(attr),
> > + .config = PERF_COUNT_HW_CPU_CYCLES,
> > + .sample_type = PERF_SAMPLE_BRANCH_STACK,
> > + .sample_period = 1000,
> > + .pinned = pinned,
> > + .branch_sample_type = PERF_SAMPLE_BRANCH_CALL_STACK |
> > + PERF_SAMPLE_BRANCH_USER |
> > + PERF_SAMPLE_BRANCH_KERNEL,
> > + };
> > + int i;
> > +
> > + for (i = 0; i < cpu_num; i++) {
> > + fds[i] = syscall(__NR_perf_event_open, &attr, -1, i, -1,
> > +PERF_FLAG_FD_CLOEXEC);
>
> Which field can point to the generation of a "per cpu pinned" event ?
> More comments are required.
Yes, I should add more comments. This function create pinned and flexible event, it is controlled by input parameter "bool pinned".
>
> > + TEST_ASSERT(fds[i] != -1, "Failed to create lbr event on cpu%d",
> i);
> > + }
> > +}
> > +
> > +static void release_perf_events(int *fds, int cpu_num) {
> > + int i;
> > +
> > + for (i = 0; i < cpu_num; i++)
> > + close(fds[i]);
> > +}
> > +
> > +#define PERF_CAP_LBR_FMT_MASK 0x1F
> > +
> > +#define LBR_NOT_SUPPORTED 0xFFFE
> > +#define LBR_MSR_WRITE_ERROR 0xFFFD
> > +
> > +#define LBR_MODE_CHECK_PASS 0x0
> > +#define LBR_MSR_WRITE_SUCC 0x1
> > +
> > +static bool check_lbr_msr(void)
> > +{
> > + uint64_t v, old_val;
> > +
> > + old_val = rdmsr(MSR_LBR_TOS);
>
> Why focus only on MSR_LBR_TOS ?
MSR_LBR_FROMx/TOx could be used also, I choose TOS without special reason.
>
> > +
> > + v = old_val ^ 0x3UL;
> > +
> > + wrmsr(MSR_LBR_TOS, v);
> > + if (rdmsr(MSR_LBR_TOS) != v)
> > + return false;
> > +
> > + wrmsr(MSR_LBR_TOS, old_val);
> > + if (rdmsr(MSR_LBR_TOS) != old_val)
> > + return false;
> > +
> > + return true;
> > +}
> > +
> > +static void guest_code(void)
> > +{
> > + uint64_t v;
> > +
> > + v = rdmsr(MSR_IA32_PERF_CAPABILITIES);
> > + if ((v & PERF_CAP_LBR_FMT_MASK) == 0)
> > + GUEST_SYNC(LBR_NOT_SUPPORTED);
> > +
> > + GUEST_SYNC(LBR_MODE_CHECK_PASS);
> > +
> > + while (1) {
> > + if (!check_lbr_msr()) {
> > + GUEST_SYNC(LBR_MSR_WRITE_ERROR);
> > + continue;
> > + }
> > +
> > + /* Enable LBR to avoid KVM recyling LBR. */
> > + v = rdmsr(MSR_IA32_DEBUGCTLMSR);
> > + v |= DEBUGCTLMSR_LBR;
> > + wrmsr(MSR_IA32_DEBUGCTLMSR, v);
> > +
> > + GUEST_SYNC(LBR_MSR_WRITE_SUCC);
> > + }
> > +}
> > +
> > +int main(int argc, char *argv[])
> > +{
> > + int *fds, ncpus;
> > + struct kvm_vcpu *vcpu;
> > + struct kvm_vm *vm;
> > + uint64_t r;
> > +
> > + TEST_REQUIRE(get_kvm_param_bool("enable_pmu"));
> > + TEST_REQUIRE(host_cpu_is_intel);
> > + TEST_REQUIRE(kvm_cpu_property(X86_PROPERTY_PMU_VERSION));
> > +
> > + vm = vm_create_with_one_vcpu(&vcpu, guest_code);
> > + r = run_vcpu_to_sync(vcpu);
> > + TEST_ASSERT(r == LBR_MODE_CHECK_PASS,
> > + "LBR format in guest PERF_CAP msr isn't correct");
> > +
> > + ncpus = get_nprocs();
>
> Could we limit the test to a specific cpu, since it will affect the load on other
> cpus?
Yes, If selftest or vcpu could be bind to a specific cpu, only one perf event could be created on the target cpu. But I don't know how to specify cpu.
>
> > + fds = malloc(sizeof(int) * ncpus);
> > + TEST_ASSERT(fds != NULL, "Failed to create fds for all cpus");
> > +
> > + /* Create per cpu pinned LBR event, then it will own LBR. */
> > + create_perf_events(fds, ncpus, true);
> > +
> > + /* Since LBR is owned by per cpu pinned LBR event, guest couldn't get it,
> > + * so guest couldn't access LBR_TOS msr.
> > + */
> > + r = run_vcpu_to_sync(vcpu);
> > + TEST_ASSERT(r == LBR_MSR_WRITE_ERROR,
> > + "1. Unexpected successfully read/write guest LBR_TO msr");
> > +
> > + release_perf_events(fds, ncpus);
>
> Obviously there are duplicate calls on release_perf_events() that can be omitted.
>
> > +
> > + /* Since per cpu pinned event is closed and LBR is free, guest could get it,
> > + * so guest could access LBR_TOS msr.
> > + */
> > + r = run_vcpu_to_sync(vcpu);
> > + TEST_ASSERT(r == LBR_MSR_WRITE_SUCC,
> > + "2. Failed to read/write guest LBR_TO msr");
> > +
> > + /* Create per cpu LBR event, its priority is lower than vLBR event, and it
> > + * couldn't get LBR back from vLBR
> > + */
> > + create_perf_events(fds, ncpus, false);
> > +
> > + /* LBR is still owned by guest, So guest could access LBR_TOS
> successfully. */
> > + r = run_vcpu_to_sync(vcpu);
> > + TEST_ASSERT(r == LBR_MSR_WRITE_SUCC,
> > + "3. Failed read/write guest LBR_TO msr");
> > +
> > + release_perf_events(fds, ncpus);
> > +
> > + /* Create per cpu pinned LBR event, its priority is higher than vLBR event,
> > + * so it will get LBR back from vLBR.
> > + */
> > + create_perf_events(fds, ncpus, true);
> > +
> > + /* LBR is preepmted by per cpu pinned LBR event, guest couldn't access
> > + * LBR_TOS msr.
> > + */
> > + r = run_vcpu_to_sync(vcpu);
> > + TEST_ASSERT(r == LBR_MSR_WRITE_ERROR,
> > + "4. Unexpected successfully read/write guest LBR_TO msr");
> > +
> > + release_perf_events(fds, ncpus);
>
> Why not add more tests to cover all possibilities ?
>
> per cpu pinned event
> per process pinned event
> per cpu event
> per process event
Per cpu pinned/flexible event have been covered here.
Per process, I think it means attaching perf onto qemu's process, I will add it.
Anyway, without the previous commits, vLBR has highest priority, this test result will change a lot.
>
> > +
> > + kvm_vm_free(vm);
> > +
> > + free(fds);
> > +
> > + return 0;
> > +}