[PATCH Part2 RFC v4 35/40] KVM: Add arch hooks to track the host write to guest memory

From: Brijesh Singh
Date: Wed Jul 07 2021 - 14:40:31 EST


The kvm_write_guest{_page} and kvm_vcpu_write_guest{_page} are used by
the hypevisor to write to the guest memory. The kvm_vcpu_map() and
kvm_map_gfn() are used by the hypervisor to map the guest memory and
and access it later.

When SEV-SNP is enabled in the guest VM, the guest memory pages can
either be a private or shared. A write from the hypervisor goes through
the RMP checks. If hardware sees that hypervisor is attempting to write
to a guest private page, then it triggers an RMP violation (i.e, #PF with
RMP bit set).

Enhance the KVM guest write helpers to invoke an architecture specific
hooks (kvm_arch_write_gfn_{begin,end}) to track the write access from the
hypervisor.

When SEV-SNP is enabled, the guest uses the PAGE_STATE vmgexit to ask the
hypervisor to change the page state from shared to private or vice versa.
While changing the page state to private, use the
kvm_host_write_track_is_active() to check whether the page is being
tracked for the host write access (i.e either mapped or kvm_write_guest
is in progress). If its tracked, then do not change the page state.

Signed-off-by: Brijesh Singh <brijesh.singh@xxxxxxx>
---
arch/x86/include/asm/kvm_host.h | 6 +++
arch/x86/kvm/svm/sev.c | 51 +++++++++++++++++++++
arch/x86/kvm/svm/svm.c | 2 +
arch/x86/kvm/svm/svm.h | 1 +
arch/x86/kvm/x86.c | 78 +++++++++++++++++++++++++++++++++
include/linux/kvm_host.h | 3 ++
virt/kvm/kvm_main.c | 21 +++++++--
7 files changed, 159 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 59185b6bc82a..678992e9966a 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -865,10 +865,13 @@ struct kvm_lpage_info {
int disallow_lpage;
};

+bool kvm_host_write_track_is_active(struct kvm *kvm, gfn_t gfn);
+
struct kvm_arch_memory_slot {
struct kvm_rmap_head *rmap[KVM_NR_PAGE_SIZES];
struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1];
unsigned short *gfn_track[KVM_PAGE_TRACK_MAX];
+ unsigned short *host_write_track[KVM_PAGE_TRACK_MAX];
};

/*
@@ -1393,6 +1396,9 @@ struct kvm_x86_ops {
void (*vcpu_deliver_sipi_vector)(struct kvm_vcpu *vcpu, u8 vector);
void *(*alloc_apic_backing_page)(struct kvm_vcpu *vcpu);
int (*get_tdp_max_page_level)(struct kvm_vcpu *vcpu, gpa_t gpa, int max_level);
+
+ void (*write_page_begin)(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn);
+ void (*write_page_end)(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn);
};

struct kvm_x86_nested_ops {
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 0155d9b3127d..839cf321c6dd 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -2884,6 +2884,19 @@ static int snp_make_page_shared(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_pfn_t pfn,
return rmpupdate(pfn_to_page(pfn), &val);
}

+static inline bool kvm_host_write_track_gpa_range_is_active(struct kvm *kvm,
+ gpa_t start, gpa_t end)
+{
+ while (start < end) {
+ if (kvm_host_write_track_is_active(kvm, gpa_to_gfn(start)))
+ return 1;
+
+ start += PAGE_SIZE;
+ }
+
+ return false;
+}
+
static int snp_make_page_private(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_pfn_t pfn, int level)
{
struct kvm_sev_info *sev = &to_kvm_svm(vcpu->kvm)->sev_info;
@@ -2895,6 +2908,14 @@ static int snp_make_page_private(struct kvm_vcpu *vcpu, gpa_t gpa, kvm_pfn_t pfn
if (!e)
return -EINVAL;

+ /*
+ * If the GPA is tracked for the write access then do not change the
+ * page state from shared to private.
+ */
+ if (kvm_host_write_track_gpa_range_is_active(vcpu->kvm,
+ gpa, gpa + page_level_size(level)))
+ return -EBUSY;
+
/* Log if the entry is validated */
if (rmpentry_validated(e))
pr_warn_ratelimited("Asked to make a pre-validated gpa %llx private\n", gpa);
@@ -3468,3 +3489,33 @@ int sev_get_tdp_max_page_level(struct kvm_vcpu *vcpu, gpa_t gpa, int max_level)

return min_t(uint32_t, level, max_level);
}
+
+void sev_snp_write_page_begin(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ struct rmpentry *e;
+ int level, rc;
+ kvm_pfn_t pfn;
+
+ if (!sev_snp_guest(kvm))
+ return;
+
+ pfn = gfn_to_pfn(kvm, gfn);
+ if (is_error_noslot_pfn(pfn))
+ return;
+
+ e = snp_lookup_page_in_rmptable(pfn_to_page(pfn), &level);
+ if (unlikely(!e))
+ return;
+
+ /*
+ * A hypervisor should never write to the guest private page. A write to the
+ * guest private will cause an RMP violation. If the guest page is private,
+ * then make it shared.
+ */
+ if (rmpentry_assigned(e)) {
+ pr_err("SEV-SNP: write to guest private gfn %llx\n", gfn);
+ rc = snp_make_page_shared(kvm_get_vcpu(kvm, 0),
+ gfn << PAGE_SHIFT, pfn, PG_LEVEL_4K);
+ BUG_ON(rc != 0);
+ }
+}
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 2632eae52aa3..4ff6fc86dd18 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -4577,6 +4577,8 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {

.alloc_apic_backing_page = svm_alloc_apic_backing_page,
.get_tdp_max_page_level = sev_get_tdp_max_page_level,
+
+ .write_page_begin = sev_snp_write_page_begin,
};

static struct kvm_x86_init_ops svm_init_ops __initdata = {
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index af4cce39b30f..e0276ad8a1ae 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -576,6 +576,7 @@ void sev_es_prepare_guest_switch(struct vcpu_svm *svm, unsigned int cpu);
void sev_es_unmap_ghcb(struct vcpu_svm *svm);
struct page *snp_safe_alloc_page(struct kvm_vcpu *vcpu);
int sev_get_tdp_max_page_level(struct kvm_vcpu *vcpu, gpa_t gpa, int max_level);
+void sev_snp_write_page_begin(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn);

/* vmenter.S */

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bbc4e04e67ad..1398b8021982 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -9076,6 +9076,48 @@ void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
}

+static void update_gfn_track(struct kvm_memory_slot *slot, gfn_t gfn,
+ enum kvm_page_track_mode mode, short count)
+{
+ int index, val;
+
+ index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K);
+
+ val = slot->arch.host_write_track[mode][index];
+
+ if (WARN_ON(val + count < 0 || val + count > USHRT_MAX))
+ return;
+
+ slot->arch.host_write_track[mode][index] += count;
+}
+
+bool kvm_host_write_track_is_active(struct kvm *kvm, gfn_t gfn)
+{
+ struct kvm_memory_slot *slot;
+ int index;
+
+ slot = gfn_to_memslot(kvm, gfn);
+ if (!slot)
+ return false;
+
+ index = gfn_to_index(gfn, slot->base_gfn, PG_LEVEL_4K);
+ return !!READ_ONCE(slot->arch.host_write_track[KVM_PAGE_TRACK_WRITE][index]);
+}
+EXPORT_SYMBOL_GPL(kvm_host_write_track_is_active);
+
+void kvm_arch_write_gfn_begin(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ update_gfn_track(slot, gfn, KVM_PAGE_TRACK_WRITE, 1);
+
+ if (kvm_x86_ops.write_page_begin)
+ kvm_x86_ops.write_page_begin(kvm, slot, gfn);
+}
+
+void kvm_arch_write_gfn_end(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn)
+{
+ update_gfn_track(slot, gfn, KVM_PAGE_TRACK_WRITE, -1);
+}
+
void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
{
if (!lapic_in_kernel(vcpu))
@@ -10896,6 +10938,36 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kvm_hv_destroy_vm(kvm);
}

+static void kvm_write_page_track_free_memslot(struct kvm_memory_slot *slot)
+{
+ int i;
+
+ for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) {
+ kvfree(slot->arch.host_write_track[i]);
+ slot->arch.host_write_track[i] = NULL;
+ }
+}
+
+static int kvm_write_page_track_create_memslot(struct kvm_memory_slot *slot,
+ unsigned long npages)
+{
+ int i;
+
+ for (i = 0; i < KVM_PAGE_TRACK_MAX; i++) {
+ slot->arch.host_write_track[i] =
+ kvcalloc(npages, sizeof(*slot->arch.host_write_track[i]),
+ GFP_KERNEL_ACCOUNT);
+ if (!slot->arch.host_write_track[i])
+ goto track_free;
+ }
+
+ return 0;
+
+track_free:
+ kvm_write_page_track_free_memslot(slot);
+ return -ENOMEM;
+}
+
void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
int i;
@@ -10969,8 +11041,14 @@ static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
if (kvm_page_track_create_memslot(slot, npages))
goto out_free;

+ if (kvm_write_page_track_create_memslot(slot, npages))
+ goto e_free_page_track;
+
return 0;

+e_free_page_track:
+ kvm_page_track_free_memslot(slot);
+
out_free:
for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
kvfree(slot->arch.rmap[i]);
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 2f34487e21f2..f22e22cd2179 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -1550,6 +1550,9 @@ static inline long kvm_arch_vcpu_async_ioctl(struct file *filp,
void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
unsigned long start, unsigned long end);

+void kvm_arch_write_gfn_begin(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn);
+void kvm_arch_write_gfn_end(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn);
+
#ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE
int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu);
#else
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 6b4feb92dc79..bc805c15d0de 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -160,6 +160,14 @@ __weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
{
}

+__weak void kvm_arch_write_gfn_begin(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn)
+{
+}
+
+__weak void kvm_arch_write_gfn_end(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn)
+{
+}
+
bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
{
/*
@@ -2309,7 +2317,8 @@ static void kvm_cache_gfn_to_pfn(struct kvm_memory_slot *slot, gfn_t gfn,
cache->generation = gen;
}

-static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn,
+static int __kvm_map_gfn(struct kvm *kvm,
+ struct kvm_memslots *slots, gfn_t gfn,
struct kvm_host_map *map,
struct gfn_to_pfn_cache *cache,
bool atomic)
@@ -2361,20 +2370,22 @@ static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn,
map->pfn = pfn;
map->gfn = gfn;

+ kvm_arch_write_gfn_begin(kvm, slot, map->gfn);
+
return 0;
}

int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map,
struct gfn_to_pfn_cache *cache, bool atomic)
{
- return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map,
+ return __kvm_map_gfn(vcpu->kvm, kvm_memslots(vcpu->kvm), gfn, map,
cache, atomic);
}
EXPORT_SYMBOL_GPL(kvm_map_gfn);

int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
{
- return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map,
+ return __kvm_map_gfn(vcpu->kvm, kvm_vcpu_memslots(vcpu), gfn, map,
NULL, false);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_map);
@@ -2412,6 +2423,8 @@ static void __kvm_unmap_gfn(struct kvm *kvm,
else
kvm_release_pfn(map->pfn, dirty, NULL);

+ kvm_arch_write_gfn_end(kvm, memslot, map->gfn);
+
map->hva = NULL;
map->page = NULL;
}
@@ -2612,7 +2625,9 @@ static int __kvm_write_guest_page(struct kvm *kvm,
addr = gfn_to_hva_memslot(memslot, gfn);
if (kvm_is_error_hva(addr))
return -EFAULT;
+ kvm_arch_write_gfn_begin(kvm, memslot, gfn);
r = __copy_to_user((void __user *)addr + offset, data, len);
+ kvm_arch_write_gfn_end(kvm, memslot, gfn);
if (r)
return -EFAULT;
mark_page_dirty_in_slot(kvm, memslot, gfn);
--
2.17.1