[PATCH RFC 12/15] KVM: selftests: Add dirty ring buffer test

From: Peter Xu
Date: Fri Nov 29 2019 - 16:35:37 EST


Add the initial dirty ring buffer test.

The current test implements the userspace dirty ring collection, by
only reaping the dirty ring when the ring is full.

So it's still running synchronously, like this:

  vcpu                               main thread

  1. vcpu dirties pages
  2. vcpu gets dirty ring full
     (userspace exit)

                                     3. main thread waits until full
                                        (so hardware buffers flushed)
                                     4. main thread collects
                                     5. main thread continues vcpu

  6. vcpu continues, goes back to 1

We can't directly collect dirty bits during vcpu execution, because
then we couldn't guarantee that the hardware dirty bits had been
flushed by the time we collect them. The test is very strict about
dirty bits, so missing any of them would make the later verify
procedure fail. A follow-up patch will make this test support async
collection, just like the existing dirty log test, by adding a vcpu
kick mechanism.

Signed-off-by: Peter Xu <peterx@xxxxxxxxxx>
---
tools/testing/selftests/kvm/dirty_log_test.c | 148 ++++++++++++++++++
.../testing/selftests/kvm/include/kvm_util.h | 5 +
tools/testing/selftests/kvm/lib/kvm_util.c | 95 +++++++++++
.../selftests/kvm/lib/kvm_util_internal.h | 5 +
4 files changed, 253 insertions(+)

diff --git a/tools/testing/selftests/kvm/dirty_log_test.c b/tools/testing/selftests/kvm/dirty_log_test.c
index 3542311f56ff..968e35c5d380 100644
--- a/tools/testing/selftests/kvm/dirty_log_test.c
+++ b/tools/testing/selftests/kvm/dirty_log_test.c
@@ -12,8 +12,10 @@
#include <unistd.h>
#include <time.h>
#include <pthread.h>
+#include <semaphore.h>
#include <linux/bitmap.h>
#include <linux/bitops.h>
+#include <asm/barrier.h>

#include "test_util.h"
#include "kvm_util.h"
@@ -57,6 +59,8 @@
# define test_and_clear_bit_le test_and_clear_bit
#endif

+#define TEST_DIRTY_RING_COUNT 1024
+
/*
* Guest/Host shared variables. Ensure addr_gva2hva() and/or
* sync_global_to/from_guest() are used when accessing from
@@ -128,6 +132,10 @@ static uint64_t host_dirty_count;
static uint64_t host_clear_count;
static uint64_t host_track_next_count;

+/* Whether dirty ring reset is requested, or finished */
+static sem_t dirty_ring_vcpu_stop;
+static sem_t dirty_ring_vcpu_cont;
+
enum log_mode_t {
/* Only use KVM_GET_DIRTY_LOG for logging */
LOG_MODE_DIRTY_LOG = 0,
@@ -135,6 +143,9 @@ enum log_mode_t {
/* Use both KVM_[GET|CLEAR]_DIRTY_LOG for logging */
LOG_MODE_CLERA_LOG = 1,

+ /* Use dirty ring for logging */
+ LOG_MODE_DIRTY_RING = 2,
+
LOG_MODE_NUM,
};

@@ -177,6 +188,123 @@ static void default_after_vcpu_run(struct kvm_vm *vm)
exit_reason_str(run->exit_reason));
}

+/*
+ * create_vm_done hook for dirty-ring mode: enables the dirty ring with
+ * room for TEST_DIRTY_RING_COUNT entries.
+ */
+static void dirty_ring_create_vm_done(struct kvm_vm *vm)
+{
+	/* vm_enable_dirty_ring() takes the ring size in bytes, not entries. */
+	uint32_t ring_bytes = TEST_DIRTY_RING_COUNT *
+			      sizeof(struct kvm_dirty_gfn);
+
+	/*
+	 * The switch to dirty ring mode has to happen once the VM exists
+	 * but before any of the vcpus are created.
+	 */
+	vm_enable_dirty_ring(vm, ring_bytes);
+}
+
+/*
+ * Harvest every pending entry of one dirty ring into @bitmap.
+ *
+ * @dirty_gfns: base of the mapped ring; the test sizes every ring to
+ *              TEST_DIRTY_RING_COUNT entries
+ * @indexes:    the ring's fetch_index/avail_index pair
+ * @slot:       the only memslot number an entry is allowed to carry
+ * @bitmap:     dirty bitmap; one bit is set per collected page offset
+ * @num_pages:  number of pages in the slot; every offset must be below it
+ * @index:      ring label, only used in the debug output
+ *
+ * Walks the ring from fetch_index up to avail_index, sanity checks each
+ * entry, and finally stores the advanced fetch_index back.
+ *
+ * Returns the number of entries consumed.
+ */
+static uint32_t dirty_ring_collect_one(struct kvm_dirty_gfn *dirty_gfns,
+				       struct kvm_dirty_ring_indexes *indexes,
+				       int slot, void *bitmap,
+				       uint32_t num_pages, int index)
+{
+	struct kvm_dirty_gfn *cur;
+	uint32_t avail, fetch, count = 0;
+
+	/*
+	 * We should keep it somewhere, but to be simple we read
+	 * fetch_index too.
+	 */
+	fetch = READ_ONCE(indexes->fetch_index);
+	avail = READ_ONCE(indexes->avail_index);
+
+	/* Make sure we read valid entries always */
+	rmb();
+
+	DEBUG("ring %d: fetch: 0x%x, avail: 0x%x\n", index, fetch, avail);
+
+	while (fetch != avail) {
+		/*
+		 * The indexes are incremented without wrapping, so reduce
+		 * them modulo the ring's entry count on every access.
+		 */
+		cur = &dirty_gfns[fetch % TEST_DIRTY_RING_COUNT];
+		TEST_ASSERT(cur->pad == 0, "Padding is non-zero: 0x%x", cur->pad);
+		TEST_ASSERT(cur->slot == slot, "Slot number didn't match: "
+			    "%u != %u", cur->slot, slot);
+		/* num_pages is 32-bit, so it must not be printed with %llx. */
+		TEST_ASSERT(cur->offset < num_pages, "Offset overflow: "
+			    "0x%llx >= 0x%x", cur->offset, num_pages);
+		test_and_set_bit(cur->offset, bitmap);
+		fetch++;
+		count++;
+	}
+	WRITE_ONCE(indexes->fetch_index, fetch);
+
+	return count;
+}
+
+static void dirty_ring_collect_dirty_pages(struct kvm_vm *vm, int slot,
+ void *bitmap, uint32_t num_pages)
+{
+ /*
+ * collect_dirty_pages hook for dirty-ring mode: waits until the vcpu
+ * thread reports a full ring, drains the per-vm ring and the vcpu's
+ * ring into @bitmap, resets the rings and then lets the vcpu go on.
+ *
+ * We only have one vcpu
+ */
+ struct kvm_run *state = vcpu_state(vm, VCPU_ID);
+ struct kvm_vm_run *vm_run = vm_state(vm);
+ uint32_t count = 0, cleared;
+
+ /*
+ * Before fetching the dirty pages, we need a vmexit of the
+ * worker vcpu to make sure the hardware dirty buffers were
+ * flushed. This is not needed for dirty-log/clear-log tests
+ * because get dirty log will naturally do so.
+ *
+ * For now we do it in the simple way - we simply wait until
+ * the vcpu uses up the soft dirty ring, then it'll always
+ * do a vmexit to make sure that PML buffers will be flushed.
+ * In real hypervisors, we probably need a vcpu kick or to
+ * stop the vcpus (before the final sync) to make sure we'll
+ * get all the existing dirty PFNs even cached in hardware.
+ */
+ sem_wait(&dirty_ring_vcpu_stop); /* posted by dirty_ring_after_vcpu_run() on a ring-full exit */
+
+ count += dirty_ring_collect_one(kvm_map_dirty_ring(vm),
+ &vm_run->vm_ring_indexes,
+ slot, bitmap, num_pages, -1); /* -1 only labels the per-vm ring in the debug output */
+
+ /* Only have one vcpu, so this is the only per-vcpu ring to drain */
+ count += dirty_ring_collect_one(vcpu_map_dirty_ring(vm, VCPU_ID),
+ &state->vcpu_ring_indexes,
+ slot, bitmap, num_pages, VCPU_ID);
+
+ cleared = kvm_vm_reset_dirty_ring(vm);
+
+ /*
+ * Cleared pages should be the same as collected, i.e. the reset
+ * must cover exactly the entries harvested from both rings above.
+ */
+ TEST_ASSERT(cleared == count, "Reset dirty pages (%u) mismatch "
+ "with collected (%u)", cleared, count);
+
+ DEBUG("Notifying vcpu to continue\n");
+ sem_post(&dirty_ring_vcpu_cont); /* releases the sem_wait() in dirty_ring_after_vcpu_run() */
+
+ DEBUG("Iteration %ld collected %u pages\n", iteration, count);
+}
+
+/*
+ * after_vcpu_run hook for dirty-ring mode.
+ *
+ * Two outcomes of a vcpu run are accepted: a UCALL_SYNC from the guest,
+ * which needs no action, and a KVM_EXIT_DIRTY_RING_FULL exit, on which
+ * the vcpu thread posts dirty_ring_vcpu_stop and then blocks on
+ * dirty_ring_vcpu_cont.  Any other exit fails the test.
+ */
+static void dirty_ring_after_vcpu_run(struct kvm_vm *vm)
+{
+	struct kvm_run *kvm_run = vcpu_state(vm, VCPU_ID);
+
+	/* Guest sync point: simply let the vcpu continue. */
+	if (get_ucall(vm, VCPU_ID, NULL) == UCALL_SYNC)
+		return;
+
+	if (kvm_run->exit_reason != KVM_EXIT_DIRTY_RING_FULL) {
+		TEST_ASSERT(false, "Invalid guest sync status: "
+			    "exit_reason=%s\n",
+			    exit_reason_str(kvm_run->exit_reason));
+		return;
+	}
+
+	/* Ring full: report it, then block until told to continue. */
+	sem_post(&dirty_ring_vcpu_stop);
+	DEBUG("vcpu stops because dirty ring full...\n");
+	sem_wait(&dirty_ring_vcpu_cont);
+	DEBUG("vcpu continues now.\n");
+}
+
+static void dirty_ring_before_vcpu_join(void)
+{
+ /*
+ * before_vcpu_join hook for dirty-ring mode.
+ *
+ * Kick another round of vcpu just to make sure it will quit: after a
+ * ring-full exit the vcpu thread blocks in
+ * sem_wait(&dirty_ring_vcpu_cont), and this extra post releases it.
+ */
+ sem_post(&dirty_ring_vcpu_cont);
+}
+
struct log_mode {
const char *name;
/* Hook when the vm creation is done (before vcpu creation) */
@@ -186,6 +314,7 @@ struct log_mode {
void *bitmap, uint32_t num_pages);
/* Hook to call when after each vcpu run */
void (*after_vcpu_run)(struct kvm_vm *vm);
+ void (*before_vcpu_join) (void);
} log_modes[LOG_MODE_NUM] = {
{
.name = "dirty-log",
@@ -199,6 +328,13 @@ struct log_mode {
.collect_dirty_pages = clear_log_collect_dirty_pages,
.after_vcpu_run = default_after_vcpu_run,
},
+ {
+ .name = "dirty-ring",
+ .create_vm_done = dirty_ring_create_vm_done,
+ .collect_dirty_pages = dirty_ring_collect_dirty_pages,
+ .before_vcpu_join = dirty_ring_before_vcpu_join,
+ .after_vcpu_run = dirty_ring_after_vcpu_run,
+ },
};

/*
@@ -245,6 +381,14 @@ static void log_mode_after_vcpu_run(struct kvm_vm *vm)
mode->after_vcpu_run(vm);
}

+/*
+ * Run the before_vcpu_join hook of the currently selected log mode.
+ * The hook is optional; modes that leave it unset are skipped.
+ */
+static void log_mode_before_vcpu_join(void)
+{
+	void (*hook)(void) = log_modes[host_log_mode].before_vcpu_join;
+
+	if (!hook)
+		return;
+
+	hook();
+}
+
static void generate_random_array(uint64_t *guest_array, uint64_t size)
{
uint64_t i;
@@ -460,6 +604,7 @@ static void run_test(enum vm_guest_mode mode, unsigned long iterations,

/* Tell the vcpu thread to quit */
host_quit = true;
+ log_mode_before_vcpu_join();
pthread_join(vcpu_thread, NULL);

DEBUG("Total bits checked: dirty (%"PRIu64"), clear (%"PRIu64"), "
@@ -524,6 +669,9 @@ int main(int argc, char *argv[])
unsigned int host_ipa_limit;
#endif

+ sem_init(&dirty_ring_vcpu_stop, 0, 0);
+ sem_init(&dirty_ring_vcpu_cont, 0, 0);
+
#ifdef __x86_64__
vm_guest_mode_params_init(VM_MODE_PXXV48_4K, true, true);
#endif
diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h
index 29cccaf96baf..5ad52f38af8d 100644
--- a/tools/testing/selftests/kvm/include/kvm_util.h
+++ b/tools/testing/selftests/kvm/include/kvm_util.h
@@ -67,6 +67,7 @@ enum vm_mem_backing_src_type {

int kvm_check_cap(long cap);
int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap);
+void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size);

struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm);
struct kvm_vm *_vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm);
@@ -76,6 +77,7 @@ void kvm_vm_release(struct kvm_vm *vmp);
void kvm_vm_get_dirty_log(struct kvm_vm *vm, int slot, void *log);
void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log,
uint64_t first_page, uint32_t num_pages);
+uint32_t kvm_vm_reset_dirty_ring(struct kvm_vm *vm);

int kvm_memcmp_hva_gva(void *hva, struct kvm_vm *vm, const vm_vaddr_t gva,
size_t len);
@@ -111,6 +113,7 @@ vm_paddr_t addr_hva2gpa(struct kvm_vm *vm, void *hva);
vm_paddr_t addr_gva2gpa(struct kvm_vm *vm, vm_vaddr_t gva);

struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid);
+struct kvm_vm_run *vm_state(struct kvm_vm *vm);
void vcpu_run(struct kvm_vm *vm, uint32_t vcpuid);
int _vcpu_run(struct kvm_vm *vm, uint32_t vcpuid);
void vcpu_run_complete_io(struct kvm_vm *vm, uint32_t vcpuid);
@@ -137,6 +140,8 @@ void vcpu_nested_state_get(struct kvm_vm *vm, uint32_t vcpuid,
int vcpu_nested_state_set(struct kvm_vm *vm, uint32_t vcpuid,
struct kvm_nested_state *state, bool ignore_error);
#endif
+void *vcpu_map_dirty_ring(struct kvm_vm *vm, uint32_t vcpuid);
+void *kvm_map_dirty_ring(struct kvm_vm *vm);

const char *exit_reason_str(unsigned int exit_reason);

diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c
index 41cf45416060..3a71e66a0b58 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util.c
+++ b/tools/testing/selftests/kvm/lib/kvm_util.c
@@ -85,6 +85,26 @@ int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap)
return ret;
}

+/*
+ * VM Enable Dirty Ring
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   ring_size - Size of each dirty ring, in bytes
+ *
+ * Output Args: None
+ *
+ * Return: None
+ *
+ * Enables KVM_CAP_DIRTY_LOG_RING on the VM with the given ring size and
+ * records that size in the vm.  When the capability is not supported the
+ * whole test program exits with KSFT_SKIP.
+ */
+void vm_enable_dirty_ring(struct kvm_vm *vm, uint32_t ring_size)
+{
+	struct kvm_enable_cap enable_cap = {
+		.cap = KVM_CAP_DIRTY_LOG_RING,
+		.args[0] = ring_size,
+	};
+	int ret;
+
+	ret = kvm_check_cap(KVM_CAP_DIRTY_LOG_RING);
+	TEST_ASSERT(ret >= 0, "KVM_CAP_DIRTY_LOG_RING");
+
+	if (!ret) {
+		fprintf(stderr, "KVM does not support dirty ring, skipping tests\n");
+		exit(KSFT_SKIP);
+	}
+
+	vm_enable_cap(vm, &enable_cap);
+
+	/* Remembered so the rings can be mapped and unmapped later. */
+	vm->dirty_ring_size = ring_size;
+}
+
static void vm_open(struct kvm_vm *vm, int perm)
{
vm->kvm_fd = open(KVM_DEV_PATH, perm);
@@ -297,6 +317,11 @@ void kvm_vm_clear_dirty_log(struct kvm_vm *vm, int slot, void *log,
strerror(-ret));
}

+/*
+ * VM Reset Dirty Rings
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *
+ * Output Args: None
+ *
+ * Return:
+ *   The value returned by the KVM_RESET_DIRTY_RINGS ioctl, which the
+ *   dirty ring test compares against the number of entries it collected.
+ *
+ * Issues KVM_RESET_DIRTY_RINGS on the VM fd.  A failing ioctl is a test
+ * failure: returning -1 through the unsigned return type would otherwise
+ * show up at the caller as a count of 0xffffffff.
+ */
+uint32_t kvm_vm_reset_dirty_ring(struct kvm_vm *vm)
+{
+	int ret = ioctl(vm->fd, KVM_RESET_DIRTY_RINGS);
+
+	TEST_ASSERT(ret >= 0, "KVM_RESET_DIRTY_RINGS IOCTL failed, "
+		    "rc: %i errno: %i (%s)", ret, errno, strerror(errno));
+
+	return ret;
+}
+
/*
* Userspace Memory Region Find
*
@@ -408,6 +433,13 @@ static void vm_vcpu_rm(struct kvm_vm *vm, uint32_t vcpuid)
struct vcpu *vcpu = vcpu_find(vm, vcpuid);
int ret;

+ if (vcpu->dirty_gfns) {
+ ret = munmap(vcpu->dirty_gfns, vm->dirty_ring_size);
+ TEST_ASSERT(ret == 0, "munmap of VCPU dirty ring failed, "
+ "rc: %i errno: %i", ret, errno);
+ vcpu->dirty_gfns = NULL;
+ }
+
ret = munmap(vcpu->state, sizeof(*vcpu->state));
TEST_ASSERT(ret == 0, "munmap of VCPU fd failed, rc: %i "
"errno: %i", ret, errno);
@@ -447,6 +479,16 @@ void kvm_vm_free(struct kvm_vm *vmp)
{
int ret;

+	/*
+	 * This function accepts a NULL vm (see the pre-existing check right
+	 * below), so bail out before the dirty ring teardown dereferences
+	 * vmp.
+	 */
+	if (vmp == NULL)
+		return;
+
+	/* Drop the kvm_vm_run mapping created lazily by vm_state(). */
+	if (vmp->vm_run) {
+		munmap(vmp->vm_run, sizeof(struct kvm_vm_run));
+		vmp->vm_run = NULL;
+	}
+
+	/* Drop the per-vm dirty ring mapped by kvm_map_dirty_ring(). */
+	if (vmp->vm_dirty_gfns) {
+		munmap(vmp->vm_dirty_gfns, vmp->dirty_ring_size);
+		vmp->vm_dirty_gfns = NULL;
+	}
+
if (vmp == NULL)
return;

@@ -1122,6 +1164,18 @@ struct kvm_run *vcpu_state(struct kvm_vm *vm, uint32_t vcpuid)
return vcpu->state;
}

+/*
+ * VM State
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *
+ * Output Args: None
+ *
+ * Return:
+ *   Pointer to the VM's struct kvm_vm_run mapping
+ *
+ * On first use, maps a struct kvm_vm_run from offset 0 of the VM fd and
+ * caches the mapping in the vm; later calls return the cached pointer.
+ */
+struct kvm_vm_run *vm_state(struct kvm_vm *vm)
+{
+	/* Already mapped by an earlier call. */
+	if (vm->vm_run)
+		return vm->vm_run;
+
+	vm->vm_run = mmap(NULL, sizeof(*vm->vm_run), PROT_READ | PROT_WRITE,
+			  MAP_SHARED, vm->fd, 0);
+	TEST_ASSERT(vm->vm_run != MAP_FAILED,
+		    "kvm vm run mapping failed");
+
+	return vm->vm_run;
+}
+
/*
* VM VCPU Run
*
@@ -1409,6 +1463,46 @@ int _vcpu_ioctl(struct kvm_vm *vm, uint32_t vcpuid,
return ret;
}

+/*
+ * VCPU Map Dirty Ring
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *   vcpuid - VCPU ID
+ *
+ * Output Args: None
+ *
+ * Return:
+ *   Pointer to the mapped dirty ring of the given vcpu
+ *
+ * On first use, maps vm->dirty_ring_size bytes of the vcpu fd at page
+ * offset KVM_DIRTY_LOG_PAGE_OFFSET and caches the mapping in the vcpu.
+ * The dirty ring must have been enabled on the VM beforehand.
+ */
+void *vcpu_map_dirty_ring(struct kvm_vm *vm, uint32_t vcpuid)
+{
+	uint32_t size = vm->dirty_ring_size;
+	struct vcpu *vcpu;
+
+	TEST_ASSERT(size > 0, "Should enable dirty ring first");
+
+	vcpu = vcpu_find(vm, vcpuid);
+	TEST_ASSERT(vcpu, "Cannot find vcpu %u", vcpuid);
+
+	/* Reuse the mapping set up by an earlier call. */
+	if (vcpu->dirty_gfns)
+		return vcpu->dirty_gfns;
+
+	vcpu->dirty_gfns_count = size / sizeof(struct kvm_dirty_gfn);
+	vcpu->dirty_gfns = mmap(NULL, size, PROT_READ | PROT_WRITE,
+				MAP_SHARED, vcpu->fd,
+				vm->page_size * KVM_DIRTY_LOG_PAGE_OFFSET);
+	TEST_ASSERT(vcpu->dirty_gfns != MAP_FAILED,
+		    "Dirty ring map failed");
+
+	return vcpu->dirty_gfns;
+}
+
+/*
+ * VM Map Dirty Ring
+ *
+ * Input Args:
+ *   vm - Virtual Machine
+ *
+ * Output Args: None
+ *
+ * Return:
+ *   Pointer to the mapped per-vm dirty ring
+ *
+ * On first use, maps vm->dirty_ring_size bytes of the VM fd at page
+ * offset KVM_DIRTY_LOG_PAGE_OFFSET and caches the mapping in the vm.
+ * The dirty ring must have been enabled on the VM beforehand.
+ */
+void *kvm_map_dirty_ring(struct kvm_vm *vm)
+{
+	uint32_t size = vm->dirty_ring_size;
+
+	TEST_ASSERT(size > 0, "Should enable dirty ring first");
+
+	/* Reuse the mapping set up by an earlier call. */
+	if (vm->vm_dirty_gfns)
+		return vm->vm_dirty_gfns;
+
+	vm->vm_dirty_gfns = mmap(NULL, size, PROT_READ | PROT_WRITE,
+				 MAP_SHARED, vm->fd,
+				 vm->page_size * KVM_DIRTY_LOG_PAGE_OFFSET);
+	TEST_ASSERT(vm->vm_dirty_gfns != MAP_FAILED,
+		    "Dirty ring map failed");
+
+	return vm->vm_dirty_gfns;
+}
+
/*
* VM Ioctl
*
@@ -1503,6 +1597,7 @@ static struct exit_reason {
{KVM_EXIT_INTERNAL_ERROR, "INTERNAL_ERROR"},
{KVM_EXIT_OSI, "OSI"},
{KVM_EXIT_PAPR_HCALL, "PAPR_HCALL"},
+ {KVM_EXIT_DIRTY_RING_FULL, "DIRTY_RING_FULL"},
#ifdef KVM_EXIT_MEMORY_NOT_PRESENT
{KVM_EXIT_MEMORY_NOT_PRESENT, "MEMORY_NOT_PRESENT"},
#endif
diff --git a/tools/testing/selftests/kvm/lib/kvm_util_internal.h b/tools/testing/selftests/kvm/lib/kvm_util_internal.h
index ac50c42750cf..3423d78d7993 100644
--- a/tools/testing/selftests/kvm/lib/kvm_util_internal.h
+++ b/tools/testing/selftests/kvm/lib/kvm_util_internal.h
@@ -39,6 +39,8 @@ struct vcpu {
uint32_t id;
int fd;
struct kvm_run *state;
+ struct kvm_dirty_gfn *dirty_gfns;
+ uint32_t dirty_gfns_count;
};

struct kvm_vm {
@@ -61,6 +63,9 @@ struct kvm_vm {
vm_paddr_t pgd;
vm_vaddr_t gdt;
vm_vaddr_t tss;
+ uint32_t dirty_ring_size;
+ struct kvm_vm_run *vm_run;
+ struct kvm_dirty_gfn *vm_dirty_gfns;
};

struct vcpu *vcpu_find(struct kvm_vm *vm, uint32_t vcpuid);
--
2.21.0