[PATCH v4 7/7] KVM: selftests: Add multi-gen LRU aging to access_tracking_perf_test

From: James Houghton
Date: Wed May 29 2024 - 14:08:01 EST


This test now has two modes of operation:
1. (default) To check how much vCPU performance was affected by access
tracking (previously existed, now supports MGLRU aging).
2. (-p) To also benchmark how fast MGLRU can do aging while vCPUs are
faulting in memory.

Mode (1) also serves as a way to verify that aging is working properly
for pages only accessed by KVM. It will fail if one does not have the
0x8 lru_gen feature bit.

To support MGLRU, the test creates a memory cgroup, moves itself into
it, then uses the lru_gen debugfs output to track memory in that cgroup.
The logic to parse the lru_gen debugfs output has been put into
selftests/kvm/lib/lru_gen_util.c.

Co-developed-by: Axel Rasmussen <axelrasmussen@xxxxxxxxxx>
Signed-off-by: Axel Rasmussen <axelrasmussen@xxxxxxxxxx>
Signed-off-by: James Houghton <jthoughton@xxxxxxxxxx>
---
tools/testing/selftests/kvm/Makefile | 1 +
.../selftests/kvm/access_tracking_perf_test.c | 365 ++++++++++++++--
.../selftests/kvm/include/lru_gen_util.h | 55 +++
.../testing/selftests/kvm/lib/lru_gen_util.c | 391 ++++++++++++++++++
4 files changed, 782 insertions(+), 30 deletions(-)
create mode 100644 tools/testing/selftests/kvm/include/lru_gen_util.h
create mode 100644 tools/testing/selftests/kvm/lib/lru_gen_util.c

diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile
index ce8ff8e8ce3a..86415f524c48 100644
--- a/tools/testing/selftests/kvm/Makefile
+++ b/tools/testing/selftests/kvm/Makefile
@@ -22,6 +22,7 @@ LIBKVM += lib/elf.c
LIBKVM += lib/guest_modes.c
LIBKVM += lib/io.c
LIBKVM += lib/kvm_util.c
+LIBKVM += lib/lru_gen_util.c
LIBKVM += lib/memstress.c
LIBKVM += lib/guest_sprintf.c
LIBKVM += lib/rbtree.c
diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c
index 3c7defd34f56..15be99ff3bdc 100644
--- a/tools/testing/selftests/kvm/access_tracking_perf_test.c
+++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c
@@ -38,6 +38,7 @@
#include <inttypes.h>
#include <limits.h>
#include <pthread.h>
+#include <stdio.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <sys/stat.h>
@@ -47,6 +48,20 @@
#include "memstress.h"
#include "guest_modes.h"
#include "processor.h"
+#include "lru_gen_util.h"
+
+static const char *TEST_MEMCG_NAME = "access_tracking_perf_test";
+static const int LRU_GEN_ENABLED = 0x1;
+static const int LRU_GEN_MM_WALK = 0x2;
+static const int LRU_GEN_SECONDARY_MMU_WALK = 0x8;
+static const char *CGROUP_PROCS = "cgroup.procs";
+/*
+ * If using MGLRU, this test assumes a cgroup v2 or cgroup v1 memory hierarchy
+ * is mounted at cgroup_root.
+ *
+ * Can be changed with -r.
+ */
+static const char *cgroup_root = "/sys/fs/cgroup";

/* Global variable used to synchronize all of the vCPU threads. */
static int iteration;
@@ -62,6 +77,9 @@ static enum {
/* The iteration that was last completed by each vCPU. */
static int vcpu_last_completed_iteration[KVM_MAX_VCPUS];

+/* The time at which the last iteration was completed */
+static struct timespec vcpu_last_completed_time[KVM_MAX_VCPUS];
+
/* Whether to overlap the regions of memory vCPUs access. */
static bool overlap_memory_access;

@@ -74,6 +92,12 @@ struct test_params {

/* The number of vCPUs to create in the VM. */
int nr_vcpus;
+
+ /* Whether to use lru_gen aging instead of idle page tracking. */
+ bool lru_gen;
+
+ /* Whether to test the performance of aging itself. */
+ bool benchmark_lru_gen;
};

static uint64_t pread_uint64(int fd, const char *filename, uint64_t index)
@@ -89,6 +113,50 @@ static uint64_t pread_uint64(int fd, const char *filename, uint64_t index)

}

+static void write_file_long(const char *path, long v)
+{
+ FILE *f;
+
+ f = fopen(path, "w");
+ TEST_ASSERT(f, "fopen(%s) failed", path);
+ TEST_ASSERT(fprintf(f, "%ld\n", v) > 0,
+ "fprintf to %s failed", path);
+ TEST_ASSERT(!fclose(f), "fclose(%s) failed", path);
+}
+
+static char *path_join(const char *parent, const char *child)
+{
+ char *out = NULL;
+
+ return asprintf(&out, "%s/%s", parent, child) >= 0 ? out : NULL;
+}
+
+static char *memcg_path(const char *memcg)
+{
+ return path_join(cgroup_root, memcg);
+}
+
+static char *memcg_file_path(const char *memcg, const char *file)
+{
+ char *mp = memcg_path(memcg);
+ char *fp;
+
+ if (!mp)
+ return NULL;
+ fp = path_join(mp, file);
+ free(mp);
+ return fp;
+}
+
+static void move_to_memcg(const char *memcg, pid_t pid)
+{
+ char *procs = memcg_file_path(memcg, CGROUP_PROCS);
+
+ TEST_ASSERT(procs, "Failed to construct cgroup.procs path");
+ write_file_long(procs, pid);
+ free(procs);
+}
+
#define PAGEMAP_PRESENT (1ULL << 63)
#define PAGEMAP_PFN_MASK ((1ULL << 55) - 1)

@@ -242,6 +310,8 @@ static void vcpu_thread_main(struct memstress_vcpu_args *vcpu_args)
};

vcpu_last_completed_iteration[vcpu_idx] = current_iteration;
+ clock_gettime(CLOCK_MONOTONIC,
+ &vcpu_last_completed_time[vcpu_idx]);
}
}

@@ -253,38 +323,68 @@ static void spin_wait_for_vcpu(int vcpu_idx, int target_iteration)
}
}

+static bool all_vcpus_done(int target_iteration, int nr_vcpus)
+{
+ for (int i = 0; i < nr_vcpus; ++i)
+ if (READ_ONCE(vcpu_last_completed_iteration[i]) !=
+ target_iteration)
+ return false;
+
+ return true;
+}
+
/* The type of memory accesses to perform in the VM. */
enum access_type {
ACCESS_READ,
ACCESS_WRITE,
};

-static void run_iteration(struct kvm_vm *vm, int nr_vcpus, const char *description)
+static void run_iteration(struct kvm_vm *vm, int nr_vcpus, const char *description,
+ bool wait)
{
- struct timespec ts_start;
- struct timespec ts_elapsed;
int next_iteration, i;

/* Kick off the vCPUs by incrementing iteration. */
next_iteration = ++iteration;

- clock_gettime(CLOCK_MONOTONIC, &ts_start);
-
/* Wait for all vCPUs to finish the iteration. */
- for (i = 0; i < nr_vcpus; i++)
- spin_wait_for_vcpu(i, next_iteration);
+ if (wait) {
+ struct timespec ts_start;
+ struct timespec ts_elapsed;
+
+ clock_gettime(CLOCK_MONOTONIC, &ts_start);

- ts_elapsed = timespec_elapsed(ts_start);
- pr_info("%-30s: %ld.%09lds\n",
- description, ts_elapsed.tv_sec, ts_elapsed.tv_nsec);
+ for (i = 0; i < nr_vcpus; i++)
+ spin_wait_for_vcpu(i, next_iteration);
+
+ ts_elapsed = timespec_elapsed(ts_start);
+
+ pr_info("%-30s: %ld.%09lds\n",
+ description, ts_elapsed.tv_sec, ts_elapsed.tv_nsec);
+ } else
+ pr_info("%-30s\n", description);
}

-static void access_memory(struct kvm_vm *vm, int nr_vcpus,
- enum access_type access, const char *description)
+static void _access_memory(struct kvm_vm *vm, int nr_vcpus,
+ enum access_type access, const char *description,
+ bool wait)
{
memstress_set_write_percent(vm, (access == ACCESS_READ) ? 0 : 100);
iteration_work = ITERATION_ACCESS_MEMORY;
- run_iteration(vm, nr_vcpus, description);
+ run_iteration(vm, nr_vcpus, description, wait);
+}
+
+static void access_memory(struct kvm_vm *vm, int nr_vcpus,
+ enum access_type access, const char *description)
+{
+ return _access_memory(vm, nr_vcpus, access, description, true);
+}
+
+static void access_memory_async(struct kvm_vm *vm, int nr_vcpus,
+ enum access_type access,
+ const char *description)
+{
+ return _access_memory(vm, nr_vcpus, access, description, false);
}

static void mark_memory_idle(struct kvm_vm *vm, int nr_vcpus)
@@ -297,19 +397,111 @@ static void mark_memory_idle(struct kvm_vm *vm, int nr_vcpus)
*/
pr_debug("Marking VM memory idle (slow)...\n");
iteration_work = ITERATION_MARK_IDLE;
- run_iteration(vm, nr_vcpus, "Mark memory idle");
+ run_iteration(vm, nr_vcpus, "Mark memory idle", true);
}

-static void run_test(enum vm_guest_mode mode, void *arg)
+static void create_memcg(const char *memcg)
+{
+ const char *full_memcg_path = memcg_path(memcg);
+ int ret;
+
+ TEST_ASSERT(full_memcg_path, "Failed to construct full memcg path");
+retry:
+ ret = mkdir(full_memcg_path, 0755);
+ if (ret && errno == EEXIST) {
+ TEST_ASSERT(!rmdir(full_memcg_path),
+ "Found existing memcg at %s, but rmdir failed",
+ full_memcg_path);
+ goto retry;
+ }
+ TEST_ASSERT(!ret, "Creating the memcg failed: mkdir(%s) failed",
+ full_memcg_path);
+
+ pr_info("Created memcg at %s\n", full_memcg_path);
+}
+
+/*
+ * Test lru_gen aging speed while vCPUs are faulting memory in.
+ *
+ * This test will run lru_gen aging until the vCPUs have finished all of
+ * the faulting work, reporting:
+ * - vcpu wall time (wall time for slowest vCPU)
+ * - average aging pass duration
+ * - total number of aging passes
+ * - total time spent aging
+ *
+ * This test produces the most useful results when the vcpu wall time and the
+ * total time spent aging are similar (i.e., we want to avoid timing aging
+ * while the vCPUs aren't doing any work).
+ */
+static void run_benchmark(enum vm_guest_mode mode, struct kvm_vm *vm,
+ struct test_params *params)
{
- struct test_params *params = arg;
- struct kvm_vm *vm;
int nr_vcpus = params->nr_vcpus;
+ struct memcg_stats stats;
+ struct timespec ts_start, ts_max, ts_vcpus_elapsed,
+ ts_aging_elapsed, ts_aging_elapsed_avg;
+ int num_passes = 0;

- vm = memstress_create_vm(mode, nr_vcpus, params->vcpu_memory_bytes, 1,
- params->backing_src, !overlap_memory_access);
+ printf("Running lru_gen benchmark...\n");

- memstress_start_vcpu_threads(nr_vcpus, vcpu_thread_main);
+ clock_gettime(CLOCK_MONOTONIC, &ts_start);
+ access_memory_async(vm, nr_vcpus, ACCESS_WRITE,
+ "Populating memory (async)");
+ while (!all_vcpus_done(iteration, nr_vcpus)) {
+ lru_gen_do_aging_quiet(&stats, TEST_MEMCG_NAME);
+ ++num_passes;
+ }
+
+ ts_aging_elapsed = timespec_elapsed(ts_start);
+ ts_aging_elapsed_avg = timespec_div(ts_aging_elapsed, num_passes);
+
+ /* Find out when the slowest vCPU finished. */
+ ts_max = ts_start;
+ for (int i = 0; i < nr_vcpus; ++i) {
+ struct timespec *vcpu_ts = &vcpu_last_completed_time[i];
+
+ if (ts_max.tv_sec < vcpu_ts->tv_sec ||
+ (ts_max.tv_sec == vcpu_ts->tv_sec &&
+ ts_max.tv_nsec < vcpu_ts->tv_nsec))
+ ts_max = *vcpu_ts;
+ }
+
+ ts_vcpus_elapsed = timespec_sub(ts_max, ts_start);
+
+ pr_info("%-30s: %ld.%09lds\n", "vcpu wall time",
+ ts_vcpus_elapsed.tv_sec, ts_vcpus_elapsed.tv_nsec);
+
+ pr_info("%-30s: %ld.%09lds, (passes:%d, total:%ld.%09lds)\n",
+ "lru_gen avg pass duration",
+ ts_aging_elapsed_avg.tv_sec,
+ ts_aging_elapsed_avg.tv_nsec,
+ num_passes,
+ ts_aging_elapsed.tv_sec,
+ ts_aging_elapsed.tv_nsec);
+}
+
+/*
+ * Test how much access tracking affects vCPU performance.
+ *
+ * Supports two modes of access tracking:
+ * - idle page tracking
+ * - lru_gen aging
+ *
+ * When using lru_gen, this test additionally verifies that the pages are in
+ * fact getting younger and older, otherwise the performance data would be
+ * invalid.
+ *
+ * The forced lru_gen aging can race with aging that occurs naturally.
+ */
+static void run_test(enum vm_guest_mode mode, struct kvm_vm *vm,
+ struct test_params *params)
+{
+ int nr_vcpus = params->nr_vcpus;
+ bool lru_gen = params->lru_gen;
+ struct memcg_stats stats;
+ long total_pages = nr_vcpus * params->vcpu_memory_bytes / getpagesize();
+ int found_gens[5];

pr_info("\n");
access_memory(vm, nr_vcpus, ACCESS_WRITE, "Populating memory");
@@ -319,11 +511,83 @@ static void run_test(enum vm_guest_mode mode, void *arg)
access_memory(vm, nr_vcpus, ACCESS_READ, "Reading from populated memory");

/* Repeat on memory that has been marked as idle. */
- mark_memory_idle(vm, nr_vcpus);
+ if (lru_gen) {
+ /* Do an initial page table scan */
+ lru_gen_do_aging(&stats, TEST_MEMCG_NAME);
+ TEST_ASSERT(sum_memcg_stats(&stats) >= total_pages,
+ "Not all pages tracked in lru_gen stats.\n"
+ "Is lru_gen enabled? Did the memcg get created properly?");
+
+ /* Find the generation we're currently in (probably youngest) */
+ found_gens[0] = lru_gen_find_generation(&stats, total_pages);
+
+ /* Do an aging pass now */
+ lru_gen_do_aging(&stats, TEST_MEMCG_NAME);
+
+ /* Same generation, but a newer generation has been made */
+ found_gens[1] = lru_gen_find_generation(&stats, total_pages);
+ TEST_ASSERT(found_gens[1] == found_gens[0],
+ "unexpected gen change: %d vs. %d",
+ found_gens[1], found_gens[0]);
+ } else
+ mark_memory_idle(vm, nr_vcpus);
+
access_memory(vm, nr_vcpus, ACCESS_WRITE, "Writing to idle memory");
- mark_memory_idle(vm, nr_vcpus);
+
+ if (lru_gen) {
+ /* Scan the page tables again */
+ lru_gen_do_aging(&stats, TEST_MEMCG_NAME);
+
+ /* The pages should now be young again, so in a newer generation */
+ found_gens[2] = lru_gen_find_generation(&stats, total_pages);
+ TEST_ASSERT(found_gens[2] > found_gens[1],
+ "pages did not get younger");
+
+ /* Do another aging pass */
+ lru_gen_do_aging(&stats, TEST_MEMCG_NAME);
+
+ /* Same generation; new generation has been made */
+ found_gens[3] = lru_gen_find_generation(&stats, total_pages);
+ TEST_ASSERT(found_gens[3] == found_gens[2],
+ "unexpected gen change: %d vs. %d",
+ found_gens[3], found_gens[2]);
+ } else
+ mark_memory_idle(vm, nr_vcpus);
+
access_memory(vm, nr_vcpus, ACCESS_READ, "Reading from idle memory");

+ if (lru_gen) {
+ /* Scan the pages tables again */
+ lru_gen_do_aging(&stats, TEST_MEMCG_NAME);
+
+ /* The pages should now be young again, so in a newer generation */
+ found_gens[4] = lru_gen_find_generation(&stats, total_pages);
+ TEST_ASSERT(found_gens[4] > found_gens[3],
+ "pages did not get younger");
+ }
+}
+
+static void setup_vm_and_run(enum vm_guest_mode mode, void *arg)
+{
+ struct test_params *params = arg;
+ int nr_vcpus = params->nr_vcpus;
+ struct kvm_vm *vm;
+
+ if (params->lru_gen) {
+ create_memcg(TEST_MEMCG_NAME);
+ move_to_memcg(TEST_MEMCG_NAME, getpid());
+ }
+
+ vm = memstress_create_vm(mode, nr_vcpus, params->vcpu_memory_bytes, 1,
+ params->backing_src, !overlap_memory_access);
+
+ memstress_start_vcpu_threads(nr_vcpus, vcpu_thread_main);
+
+ if (params->benchmark_lru_gen)
+ run_benchmark(mode, vm, params);
+ else
+ run_test(mode, vm, params);
+
memstress_join_vcpu_threads(nr_vcpus);
memstress_destroy_vm(vm);
}
@@ -331,8 +595,8 @@ static void run_test(enum vm_guest_mode mode, void *arg)
static void help(char *name)
{
puts("");
- printf("usage: %s [-h] [-m mode] [-b vcpu_bytes] [-v vcpus] [-o] [-s mem_type]\n",
- name);
+ printf("usage: %s [-h] [-m mode] [-b vcpu_bytes] [-v vcpus] [-o]"
+ " [-s mem_type] [-l] [-r memcg_root]\n", name);
puts("");
printf(" -h: Display this help message.");
guest_modes_help();
@@ -342,6 +606,9 @@ static void help(char *name)
printf(" -v: specify the number of vCPUs to run.\n");
printf(" -o: Overlap guest memory accesses instead of partitioning\n"
" them into a separate region of memory for each vCPU.\n");
+ printf(" -l: Use MGLRU aging instead of idle page tracking\n");
+ printf(" -p: Benchmark MGLRU aging while faulting memory in\n");
+ printf(" -r: The memory cgroup hierarchy root to use (when -l is given)\n");
backing_src_help("-s");
puts("");
exit(0);
@@ -353,13 +620,15 @@ int main(int argc, char *argv[])
.backing_src = DEFAULT_VM_MEM_SRC,
.vcpu_memory_bytes = DEFAULT_PER_VCPU_MEM_SIZE,
.nr_vcpus = 1,
+ .lru_gen = false,
+ .benchmark_lru_gen = false,
};
int page_idle_fd;
int opt;

guest_modes_append_default();

- while ((opt = getopt(argc, argv, "hm:b:v:os:")) != -1) {
+ while ((opt = getopt(argc, argv, "hm:b:v:os:lr:p")) != -1) {
switch (opt) {
case 'm':
guest_modes_cmdline(optarg);
@@ -376,6 +645,15 @@ int main(int argc, char *argv[])
case 's':
params.backing_src = parse_backing_src_type(optarg);
break;
+ case 'l':
+ params.lru_gen = true;
+ break;
+ case 'p':
+ params.benchmark_lru_gen = true;
+ break;
+ case 'r':
+ cgroup_root = strdup(optarg);
+ break;
case 'h':
default:
help(argv[0]);
@@ -383,12 +661,39 @@ int main(int argc, char *argv[])
}
}

- page_idle_fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR);
- __TEST_REQUIRE(page_idle_fd >= 0,
- "CONFIG_IDLE_PAGE_TRACKING is not enabled");
- close(page_idle_fd);
+ if (!params.lru_gen) {
+ page_idle_fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR);
+ __TEST_REQUIRE(page_idle_fd >= 0,
+ "CONFIG_IDLE_PAGE_TRACKING is not enabled");
+ close(page_idle_fd);
+ } else {
+ int lru_gen_fd, lru_gen_debug_fd;
+ long mglru_features;
+ char mglru_feature_str[8] = {};
+
+ lru_gen_fd = open("/sys/kernel/mm/lru_gen/enabled", O_RDONLY);
+ __TEST_REQUIRE(lru_gen_fd >= 0,
+ "CONFIG_LRU_GEN is not enabled");
+ TEST_ASSERT(read(lru_gen_fd, &mglru_feature_str, 7) > 0,
+ "couldn't read lru_gen features");
+ mglru_features = strtol(mglru_feature_str, NULL, 16);
+ __TEST_REQUIRE(mglru_features & LRU_GEN_ENABLED,
+ "lru_gen is not enabled");
+ __TEST_REQUIRE(mglru_features & LRU_GEN_MM_WALK,
+ "lru_gen does not support MM_WALK");
+ __TEST_REQUIRE(mglru_features & LRU_GEN_SECONDARY_MMU_WALK,
+ "lru_gen does not support SECONDARY_MMU_WALK");
+
+ lru_gen_debug_fd = open(DEBUGFS_LRU_GEN, O_RDWR);
+ __TEST_REQUIRE(lru_gen_debug_fd >= 0,
+ "Cannot access %s", DEBUGFS_LRU_GEN);
+ close(lru_gen_debug_fd);
+ }
+
+ TEST_ASSERT(!params.benchmark_lru_gen || params.lru_gen,
+ "-p specified without -l");

- for_each_guest_mode(run_test, &params);
+ for_each_guest_mode(setup_vm_and_run, &params);

return 0;
}
diff --git a/tools/testing/selftests/kvm/include/lru_gen_util.h b/tools/testing/selftests/kvm/include/lru_gen_util.h
new file mode 100644
index 000000000000..4eef8085a3cb
--- /dev/null
+++ b/tools/testing/selftests/kvm/include/lru_gen_util.h
@@ -0,0 +1,55 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Tools for integrating with lru_gen, like parsing the lru_gen debugfs output.
+ *
+ * Copyright (C) 2024, Google LLC.
+ */
+#ifndef SELFTEST_KVM_LRU_GEN_UTIL_H
+#define SELFTEST_KVM_LRU_GEN_UTIL_H
+
+#include <inttypes.h>
+#include <limits.h>
+#include <stdlib.h>
+
+#include "test_util.h"
+
+#define MAX_NR_GENS 16 /* MAX_NR_GENS in include/linux/mmzone.h */
+#define MAX_NR_NODES 4 /* Maximum number of nodes we support */
+
+static const char *DEBUGFS_LRU_GEN = "/sys/kernel/debug/lru_gen";
+
+struct generation_stats {
+ int gen;
+ long age_ms;
+ long nr_anon;
+ long nr_file;
+};
+
+struct node_stats {
+ int node;
+ int nr_gens; /* Number of populated gens entries. */
+ struct generation_stats gens[MAX_NR_GENS];
+};
+
+struct memcg_stats {
+ unsigned long memcg_id;
+ int nr_nodes; /* Number of populated nodes entries. */
+ struct node_stats nodes[MAX_NR_NODES];
+};
+
+void print_memcg_stats(const struct memcg_stats *stats, const char *name);
+
+void read_memcg_stats(struct memcg_stats *stats, const char *memcg);
+
+void read_print_memcg_stats(struct memcg_stats *stats, const char *memcg);
+
+long sum_memcg_stats(const struct memcg_stats *stats);
+
+void lru_gen_do_aging(struct memcg_stats *stats, const char *memcg);
+
+void lru_gen_do_aging_quiet(struct memcg_stats *stats, const char *memcg);
+
+int lru_gen_find_generation(const struct memcg_stats *stats,
+ unsigned long total_pages);
+
+#endif /* SELFTEST_KVM_LRU_GEN_UTIL_H */
diff --git a/tools/testing/selftests/kvm/lib/lru_gen_util.c b/tools/testing/selftests/kvm/lib/lru_gen_util.c
new file mode 100644
index 000000000000..3c02a635a9f7
--- /dev/null
+++ b/tools/testing/selftests/kvm/lib/lru_gen_util.c
@@ -0,0 +1,391 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2024, Google LLC.
+ */
+
+#include <time.h>
+
+#include "lru_gen_util.h"
+
+/*
+ * Tracks state while we parse memcg lru_gen stats. The file we're parsing is
+ * structured like this (some extra whitespace elided):
+ *
+ * memcg (id) (path)
+ * node (id)
+ * (gen_nr) (age_in_ms) (nr_anon_pages) (nr_file_pages)
+ */
+struct memcg_stats_parse_context {
+ bool consumed; /* Whether or not this line was consumed */
+ /* Next parse handler to invoke */
+ void (*next_handler)(struct memcg_stats *,
+ struct memcg_stats_parse_context *, char *);
+ int current_node_idx; /* Current index in nodes array */
+ const char *name; /* The name of the memcg we're looking for */
+};
+
+static void memcg_stats_handle_searching(struct memcg_stats *stats,
+ struct memcg_stats_parse_context *ctx,
+ char *line);
+static void memcg_stats_handle_in_memcg(struct memcg_stats *stats,
+ struct memcg_stats_parse_context *ctx,
+ char *line);
+static void memcg_stats_handle_in_node(struct memcg_stats *stats,
+ struct memcg_stats_parse_context *ctx,
+ char *line);
+
+struct split_iterator {
+ char *str;
+ char *save;
+};
+
+static char *split_next(struct split_iterator *it)
+{
+ char *ret = strtok_r(it->str, " \t\n\r", &it->save);
+
+ it->str = NULL;
+ return ret;
+}
+
+static void memcg_stats_handle_searching(struct memcg_stats *stats,
+ struct memcg_stats_parse_context *ctx,
+ char *line)
+{
+ struct split_iterator it = { .str = line };
+ char *prefix = split_next(&it);
+ char *memcg_id = split_next(&it);
+ char *memcg_name = split_next(&it);
+ char *end;
+
+ ctx->consumed = true;
+
+ if (!prefix || strcmp("memcg", prefix))
+ return; /* Not a memcg line (maybe empty), skip */
+
+ TEST_ASSERT(memcg_id && memcg_name,
+ "malformed memcg line; no memcg id or memcg_name");
+
+ if (strcmp(memcg_name + 1, ctx->name))
+ return; /* Wrong memcg, skip */
+
+ /* Found it! */
+
+ stats->memcg_id = strtoul(memcg_id, &end, 10);
+ TEST_ASSERT(*end == '\0', "malformed memcg id '%s'", memcg_id);
+ if (!stats->memcg_id)
+ return; /* Removed memcg? */
+
+ ctx->next_handler = memcg_stats_handle_in_memcg;
+}
+
+static void memcg_stats_handle_in_memcg(struct memcg_stats *stats,
+ struct memcg_stats_parse_context *ctx,
+ char *line)
+{
+ struct split_iterator it = { .str = line };
+ char *prefix = split_next(&it);
+ char *id = split_next(&it);
+ long found_node_id;
+ char *end;
+
+ ctx->consumed = true;
+ ctx->current_node_idx = -1;
+
+ if (!prefix)
+ return; /* Skip empty lines */
+
+ if (!strcmp("memcg", prefix)) {
+ /* Memcg done, found next one; stop. */
+ ctx->next_handler = NULL;
+ return;
+ } else if (strcmp("node", prefix))
+ TEST_ASSERT(false, "found malformed line after 'memcg ...',"
+ "token: '%s'", prefix);
+
+ /* At this point we know we have a node line. Parse the ID. */
+
+ TEST_ASSERT(id, "malformed node line; no node id");
+
+ found_node_id = strtol(id, &end, 10);
+ TEST_ASSERT(*end == '\0', "malformed node id '%s'", id);
+
+ ctx->current_node_idx = stats->nr_nodes++;
+ TEST_ASSERT(ctx->current_node_idx < MAX_NR_NODES,
+ "memcg has stats for too many nodes, max is %d",
+ MAX_NR_NODES);
+ stats->nodes[ctx->current_node_idx].node = found_node_id;
+
+ ctx->next_handler = memcg_stats_handle_in_node;
+}
+
+static void memcg_stats_handle_in_node(struct memcg_stats *stats,
+ struct memcg_stats_parse_context *ctx,
+ char *line)
+{
+ /* Have to copy since we might not consume */
+ char *my_line = strdup(line);
+ struct split_iterator it = { .str = my_line };
+ char *gen, *age, *nr_anon, *nr_file;
+ struct node_stats *node_stats;
+ struct generation_stats *gen_stats;
+ char *end;
+
+ TEST_ASSERT(it.str, "failed to copy input line");
+
+ gen = split_next(&it);
+
+ /* Skip empty lines */
+ if (!gen)
+ goto out_consume; /* Skip empty lines */
+
+ if (!strcmp("memcg", gen) || !strcmp("node", gen)) {
+ /*
+ * Reached next memcg or node section. Don't consume, let the
+ * other handler deal with this.
+ */
+ ctx->next_handler = memcg_stats_handle_in_memcg;
+ goto out;
+ }
+
+ node_stats = &stats->nodes[ctx->current_node_idx];
+ TEST_ASSERT(node_stats->nr_gens < MAX_NR_GENS,
+ "found too many generation lines; max is %d",
+ MAX_NR_GENS);
+ gen_stats = &node_stats->gens[node_stats->nr_gens++];
+
+ age = split_next(&it);
+ nr_anon = split_next(&it);
+ nr_file = split_next(&it);
+
+ TEST_ASSERT(age && nr_anon && nr_file,
+ "malformed generation line; not enough tokens");
+
+ gen_stats->gen = (int)strtol(gen, &end, 10);
+ TEST_ASSERT(*end == '\0', "malformed generation number '%s'", gen);
+
+ gen_stats->age_ms = strtol(age, &end, 10);
+ TEST_ASSERT(*end == '\0', "malformed generation age '%s'", age);
+
+ gen_stats->nr_anon = strtol(nr_anon, &end, 10);
+ TEST_ASSERT(*end == '\0', "malformed anonymous page count '%s'",
+ nr_anon);
+
+ gen_stats->nr_file = strtol(nr_file, &end, 10);
+ TEST_ASSERT(*end == '\0', "malformed file page count '%s'", nr_file);
+
+out_consume:
+ ctx->consumed = true;
+out:
+ free(my_line);
+}
+
+/* Pretty-print lru_gen @stats. */
+void print_memcg_stats(const struct memcg_stats *stats, const char *name)
+{
+ int node, gen;
+
+ fprintf(stderr, "stats for memcg %s (id %lu):\n",
+ name, stats->memcg_id);
+ for (node = 0; node < stats->nr_nodes; ++node) {
+ fprintf(stderr, "\tnode %d\n", stats->nodes[node].node);
+ for (gen = 0; gen < stats->nodes[node].nr_gens; ++gen) {
+ const struct generation_stats *gstats =
+ &stats->nodes[node].gens[gen];
+
+ fprintf(stderr,
+ "\t\tgen %d\tage_ms %ld"
+ "\tnr_anon %ld\tnr_file %ld\n",
+ gstats->gen, gstats->age_ms, gstats->nr_anon,
+ gstats->nr_file);
+ }
+ }
+}
+
+/* Re-read lru_gen debugfs information for @memcg into @stats. */
+void read_memcg_stats(struct memcg_stats *stats, const char *memcg)
+{
+ FILE *f;
+ ssize_t read = 0;
+ char *line = NULL;
+ size_t bufsz;
+ struct memcg_stats_parse_context ctx = {
+ .next_handler = memcg_stats_handle_searching,
+ .name = memcg,
+ };
+
+ memset(stats, 0, sizeof(struct memcg_stats));
+
+ f = fopen(DEBUGFS_LRU_GEN, "r");
+ TEST_ASSERT(f, "fopen(%s) failed", DEBUGFS_LRU_GEN);
+
+ while (ctx.next_handler && (read = getline(&line, &bufsz, f)) > 0) {
+ ctx.consumed = false;
+
+ do {
+ ctx.next_handler(stats, &ctx, line);
+ if (!ctx.next_handler)
+ break;
+ } while (!ctx.consumed);
+ }
+
+ if (read < 0 && !feof(f))
+ TEST_ASSERT(false, "getline(%s) failed", DEBUGFS_LRU_GEN);
+
+ TEST_ASSERT(stats->memcg_id > 0, "Couldn't find memcg: %s\n"
+ "Did the memcg get created in the proper mount?",
+ memcg);
+ if (line)
+ free(line);
+ TEST_ASSERT(!fclose(f), "fclose(%s) failed", DEBUGFS_LRU_GEN);
+}
+
+/*
+ * Find all pages tracked by lru_gen for this memcg in generation @target_gen.
+ *
+ * If @target_gen is negative, look for all generations.
+ */
+static long sum_memcg_stats_for_gen(int target_gen,
+ const struct memcg_stats *stats)
+{
+ int node, gen;
+ long total_nr = 0;
+
+ for (node = 0; node < stats->nr_nodes; ++node) {
+ const struct node_stats *node_stats = &stats->nodes[node];
+
+ for (gen = 0; gen < node_stats->nr_gens; ++gen) {
+ const struct generation_stats *gen_stats =
+ &node_stats->gens[gen];
+
+ if (target_gen >= 0 && gen_stats->gen != target_gen)
+ continue;
+
+ total_nr += gen_stats->nr_anon + gen_stats->nr_file;
+ }
+ }
+
+ return total_nr;
+}
+
+/* Find all pages tracked by lru_gen for this memcg. */
+long sum_memcg_stats(const struct memcg_stats *stats)
+{
+ return sum_memcg_stats_for_gen(-1, stats);
+}
+
+/* Read the memcg stats and optionally print if this is a debug build. */
+void read_print_memcg_stats(struct memcg_stats *stats, const char *memcg)
+{
+ read_memcg_stats(stats, memcg);
+#ifdef DEBUG
+ print_memcg_stats(stats, memcg);
+#endif
+}
+
+/*
+ * If lru_gen aging should force page table scanning.
+ *
+ * If you want to set this to false, you will need to do eviction
+ * before doing extra aging passes.
+ */
+static const bool force_scan = true;
+
+static void run_aging_impl(unsigned long memcg_id, int node_id, int max_gen)
+{
+ FILE *f = fopen(DEBUGFS_LRU_GEN, "w");
+ char *command;
+ size_t sz;
+
+ TEST_ASSERT(f, "fopen(%s) failed", DEBUGFS_LRU_GEN);
+ sz = asprintf(&command, "+ %lu %d %d 1 %d\n",
+ memcg_id, node_id, max_gen, force_scan);
+ TEST_ASSERT(sz > 0, "creating aging command failed");
+
+ pr_debug("Running aging command: %s", command);
+ if (fwrite(command, sizeof(char), sz, f) < sz) {
+ TEST_ASSERT(false, "writing aging command %s to %s failed",
+ command, DEBUGFS_LRU_GEN);
+ }
+
+ TEST_ASSERT(!fclose(f), "fclose(%s) failed", DEBUGFS_LRU_GEN);
+}
+
+static void _lru_gen_do_aging(struct memcg_stats *stats, const char *memcg,
+ bool verbose)
+{
+ int node, gen;
+ struct timespec ts_start;
+ struct timespec ts_elapsed;
+
+ pr_debug("lru_gen: invoking aging...\n");
+
+ /* Must read memcg stats to construct the proper aging command. */
+ read_print_memcg_stats(stats, memcg);
+
+ if (verbose)
+ clock_gettime(CLOCK_MONOTONIC, &ts_start);
+
+ for (node = 0; node < stats->nr_nodes; ++node) {
+ int max_gen = 0;
+
+ for (gen = 0; gen < stats->nodes[node].nr_gens; ++gen) {
+ int this_gen = stats->nodes[node].gens[gen].gen;
+
+ max_gen = max_gen > this_gen ? max_gen : this_gen;
+ }
+
+ run_aging_impl(stats->memcg_id, stats->nodes[node].node,
+ max_gen);
+ }
+
+ if (verbose) {
+ ts_elapsed = timespec_elapsed(ts_start);
+ pr_info("%-30s: %ld.%09lds\n", "lru_gen: Aging",
+ ts_elapsed.tv_sec, ts_elapsed.tv_nsec);
+ }
+
+ /* Re-read so callers get updated information */
+ read_print_memcg_stats(stats, memcg);
+}
+
+/* Do aging, and print how long it took. */
+void lru_gen_do_aging(struct memcg_stats *stats, const char *memcg)
+{
+ return _lru_gen_do_aging(stats, memcg, true);
+}
+
+/* Do aging, don't print anything. */
+void lru_gen_do_aging_quiet(struct memcg_stats *stats, const char *memcg)
+{
+ return _lru_gen_do_aging(stats, memcg, false);
+}
+
+/*
+ * Find which generation contains more than half of @total_pages, assuming that
+ * such a generation exists.
+ */
+int lru_gen_find_generation(const struct memcg_stats *stats,
+ unsigned long total_pages)
+{
+ int node, gen, gen_idx, min_gen = INT_MAX, max_gen = -1;
+
+ for (node = 0; node < stats->nr_nodes; ++node)
+ for (gen_idx = 0; gen_idx < stats->nodes[node].nr_gens;
+ ++gen_idx) {
+ gen = stats->nodes[node].gens[gen_idx].gen;
+ max_gen = gen > max_gen ? gen : max_gen;
+ min_gen = gen < min_gen ? gen : min_gen;
+ }
+
+ for (gen = min_gen; gen < max_gen; ++gen)
+ /* See if the most pages are in this generation. */
+ if (sum_memcg_stats_for_gen(gen, stats) >
+ total_pages / 2)
+ return gen;
+
+ TEST_ASSERT(false, "No generation includes majority of %lu pages.",
+ total_pages);
+
+ /* unreachable, but make the compiler happy */
+ return -1;
+}
--
2.45.1.288.g0e0cd299f1-goog