Re: [PATCH 2/5] KVM: selftests: access_tracking_perf_test: Add option to skip the sanity check
From: Maxim Levitsky
Date: Fri Mar 28 2025 - 15:32:52 EST
On Thu, 2025-03-27 at 01:23 +0000, James Houghton wrote:
> From: Maxim Levitsky <mlevitsk@xxxxxxxxxx>
>
> Add an option to skip sanity check of number of still idle pages,
> and set it by default to skip, in case hypervisor or NUMA balancing
> is detected.
>
> Signed-off-by: Maxim Levitsky <mlevitsk@xxxxxxxxxx>
> Co-developed-by: James Houghton <jthoughton@xxxxxxxxxx>
> Signed-off-by: James Houghton <jthoughton@xxxxxxxxxx>
> ---
> .../selftests/kvm/access_tracking_perf_test.c | 61 ++++++++++++++++---
> .../testing/selftests/kvm/include/test_util.h | 1 +
> tools/testing/selftests/kvm/lib/test_util.c | 7 +++
> 3 files changed, 60 insertions(+), 9 deletions(-)
>
> diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c
> index 447e619cf856e..0e594883ec13e 100644
> --- a/tools/testing/selftests/kvm/access_tracking_perf_test.c
> +++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c
> @@ -65,6 +65,16 @@ static int vcpu_last_completed_iteration[KVM_MAX_VCPUS];
> /* Whether to overlap the regions of memory vCPUs access. */
> static bool overlap_memory_access;
>
> +/*
> + * If the test should only warn if there are too many idle pages (i.e., it is
> + * expected).
> + * -1: Not yet set.
> + * 0: We do not expect too many idle pages, so FAIL if too many idle pages.
> + * 1: Having too many idle pages is expected, so merely print a warning if
> + * too many idle pages are found.
> + */
> +static int idle_pages_warn_only = -1;
> +
> struct test_params {
> /* The backing source for the region of memory. */
> enum vm_mem_backing_src_type backing_src;
> @@ -177,18 +187,12 @@ static void mark_vcpu_memory_idle(struct kvm_vm *vm,
> * arbitrary; high enough that we ensure most memory access went through
> * access tracking but low enough as to not make the test too brittle
> * over time and across architectures.
> - *
> - * When running the guest as a nested VM, "warn" instead of asserting
> - * as the TLB size is effectively unlimited and the KVM doesn't
> - * explicitly flush the TLB when aging SPTEs. As a result, more pages
> - * are cached and the guest won't see the "idle" bit cleared.
> */
> if (still_idle >= pages / 10) {
> -#ifdef __x86_64__
> - TEST_ASSERT(this_cpu_has(X86_FEATURE_HYPERVISOR),
> + TEST_ASSERT(idle_pages_warn_only,
> "vCPU%d: Too many pages still idle (%lu out of %lu)",
> vcpu_idx, still_idle, pages);
> -#endif
> +
> printf("WARNING: vCPU%d: Too many pages still idle (%lu out of %lu), "
> "this will affect performance results.\n",
> vcpu_idx, still_idle, pages);
> @@ -328,6 +332,31 @@ static void run_test(enum vm_guest_mode mode, void *arg)
> memstress_destroy_vm(vm);
> }
>
> +static int access_tracking_unreliable(void)
> +{
> +#ifdef __x86_64__
> + /*
> + * When running nested, the TLB size is effectively unlimited and the
> + * KVM doesn't explicitly flush the TLB when aging SPTEs. As a result,
> + * more pages are cached and the guest won't see the "idle" bit cleared.
> + */
Tiny nitpick: this is specific to running nested on KVM — other hypervisors might behave
differently, although most of them probably suffer from the same problem.
So it's probably better to say something like this:
'When running nested, the TLB size might be effectively unlimited; for example,
this is the case when running on top of a KVM L0.'
> + if (this_cpu_has(X86_FEATURE_HYPERVISOR)) {
> + puts("Skipping idle page count sanity check, because the test is run nested");
> + return 1;
> + }
> +#endif
> + /*
> + * When NUMA balancing is enabled, guest memory can be mapped
> + * PROT_NONE, and the Accessed bits won't be queriable.
Tiny nitpick: the accessed bits in this case are lost, because KVM no longer propagates
them from the secondary to the primary paging structures, and the secondary mapping might
be lost due to zapping; the biggest offender here is NUMA balancing.
> + */
> + if (is_numa_balancing_enabled()) {
> + puts("Skipping idle page count sanity check, because NUMA balancing is enabled");
> + return 1;
> + }
> +
> + return 0;
> +}
Extracting this logic into a function and documenting it is a very good idea.
> +
> static void help(char *name)
> {
> puts("");
> @@ -342,6 +371,12 @@ static void help(char *name)
> printf(" -v: specify the number of vCPUs to run.\n");
> printf(" -o: Overlap guest memory accesses instead of partitioning\n"
> " them into a separate region of memory for each vCPU.\n");
> + printf(" -w: Control whether the test warns or fails if more than 10%\n"
> + " of pages are still seen as idle/old after accessing guest\n"
> + " memory. >0 == warn only, 0 == fail, <0 == auto. For auto\n"
> + " mode, the test fails by default, but switches to warn only\n"
> + " if NUMA balancing is enabled or the test detects it's running\n"
> + " in a VM.\n");
> backing_src_help("-s");
> puts("");
> exit(0);
> @@ -359,7 +394,7 @@ int main(int argc, char *argv[])
>
> guest_modes_append_default();
>
> - while ((opt = getopt(argc, argv, "hm:b:v:os:")) != -1) {
> + while ((opt = getopt(argc, argv, "hm:b:v:os:w:")) != -1) {
> switch (opt) {
> case 'm':
> guest_modes_cmdline(optarg);
> @@ -376,6 +411,11 @@ int main(int argc, char *argv[])
> case 's':
> params.backing_src = parse_backing_src_type(optarg);
> break;
> + case 'w':
> + idle_pages_warn_only =
> + atoi_non_negative("Idle pages warning",
> + optarg);
> + break;
> case 'h':
> default:
> help(argv[0]);
> @@ -388,6 +428,9 @@ int main(int argc, char *argv[])
> "CONFIG_IDLE_PAGE_TRACKING is not enabled");
> close(page_idle_fd);
>
> + if (idle_pages_warn_only == -1)
> + idle_pages_warn_only = access_tracking_unreliable();
> +
> for_each_guest_mode(run_test, ¶ms);
>
> return 0;
> diff --git a/tools/testing/selftests/kvm/include/test_util.h b/tools/testing/selftests/kvm/include/test_util.h
> index 77d13d7920cb8..c6ef895fbd9ab 100644
> --- a/tools/testing/selftests/kvm/include/test_util.h
> +++ b/tools/testing/selftests/kvm/include/test_util.h
> @@ -153,6 +153,7 @@ bool is_backing_src_hugetlb(uint32_t i);
> void backing_src_help(const char *flag);
> enum vm_mem_backing_src_type parse_backing_src_type(const char *type_name);
> long get_run_delay(void);
> +bool is_numa_balancing_enabled(void);
>
> /*
> * Whether or not the given source type is shared memory (as opposed to
> diff --git a/tools/testing/selftests/kvm/lib/test_util.c b/tools/testing/selftests/kvm/lib/test_util.c
> index 3dc8538f5d696..03eb99af9b8de 100644
> --- a/tools/testing/selftests/kvm/lib/test_util.c
> +++ b/tools/testing/selftests/kvm/lib/test_util.c
> @@ -176,6 +176,13 @@ size_t get_trans_hugepagesz(void)
> return get_sysfs_val("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size");
> }
>
> +bool is_numa_balancing_enabled(void)
> +{
> + if (!test_sysfs_path("/proc/sys/kernel/numa_balancing"))
> + return false;
> + return get_sysfs_val("/proc/sys/kernel/numa_balancing") == 1;
> +}
> +
> size_t get_def_hugetlb_pagesz(void)
> {
> char buf[64];
Reviewed-by: Maxim Levitsky <mlevitsk@xxxxxxxxxx>
Best regards,
Maxim Levitsky