[PATCH v2 17/18] KVM: arm64: Introduce the EL1 pKVM MMU

From: Quentin Perret
Date: Tue Dec 03 2024 - 06:24:20 EST


Introduce a set of helper functions allowing to manipulate the pKVM
guest stage-2 page-tables from EL1 using pKVM's HVC interface.

Each helper has an exact one-to-one correspondance with the traditional
kvm_pgtable_stage2_*() functions from pgtable.c, with a strictly
matching prototype. This will ease plumbing later on in mmu.c.

These callbacks track the gfn->pfn mappings in a simple rb_tree indexed
by IPA in lieu of a page-table. This rb-tree is kept in sync with pKVM's
state and is protected by a new rwlock -- the existing mmu_lock
protection does not suffice in the map() path where the tree must be
modified while user_mem_abort() only acquires a read_lock.

Signed-off-by: Quentin Perret <qperret@xxxxxxxxxx>
---

The embedded union inside struct kvm_pgtable is arguably a bit horrible
currently... I considered making the pgt argument to all kvm_pgtable_*()
functions an opaque void * ptr, and moving the definition of
struct kvm_pgtable to pgtable.c and the pkvm version into pkvm.c. Given
that the allocation of that data-structure is done by the caller, that
means we'd need to expose kvm_pgtable_get_pgd_size() or something that
each MMU (pgtable.c and pkvm.c) would have to implement and things like
that. But that felt like a bigger surgery, so I went with the simpler
option. Thoughts welcome :-)

Similarly, happy to drop the mappings_lock if we want to teach
user_mem_abort() about taking a write lock on the mmu_lock in the pKVM
case, but again this implementation is the least invasive into normal
KVM so that felt like a reasonable starting point.
---
arch/arm64/include/asm/kvm_host.h | 1 +
arch/arm64/include/asm/kvm_pgtable.h | 27 ++--
arch/arm64/include/asm/kvm_pkvm.h | 28 ++++
arch/arm64/kvm/pkvm.c | 195 +++++++++++++++++++++++++++
4 files changed, 242 insertions(+), 9 deletions(-)

diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index f75988e3515b..05936b57a3a4 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -85,6 +85,7 @@ void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu);
struct kvm_hyp_memcache {
phys_addr_t head;
unsigned long nr_pages;
+ struct pkvm_mapping *mapping; /* only used from EL1 */
};

static inline void push_hyp_memcache(struct kvm_hyp_memcache *mc,
diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h
index 04418b5e3004..d24d18874015 100644
--- a/arch/arm64/include/asm/kvm_pgtable.h
+++ b/arch/arm64/include/asm/kvm_pgtable.h
@@ -412,15 +412,24 @@ static inline bool kvm_pgtable_walk_lock_held(void)
* be used instead of block mappings.
*/
struct kvm_pgtable {
- u32 ia_bits;
- s8 start_level;
- kvm_pteref_t pgd;
- struct kvm_pgtable_mm_ops *mm_ops;
-
- /* Stage-2 only */
- struct kvm_s2_mmu *mmu;
- enum kvm_pgtable_stage2_flags flags;
- kvm_pgtable_force_pte_cb_t force_pte_cb;
+ union {
+ struct {
+ u32 ia_bits;
+ s8 start_level;
+ kvm_pteref_t pgd;
+ struct kvm_pgtable_mm_ops *mm_ops;
+
+ /* Stage-2 only */
+ struct kvm_s2_mmu *mmu;
+ enum kvm_pgtable_stage2_flags flags;
+ kvm_pgtable_force_pte_cb_t force_pte_cb;
+ };
+ struct {
+ struct kvm *kvm;
+ struct rb_root mappings;
+ rwlock_t mappings_lock;
+ } pkvm;
+ };
};

/**
diff --git a/arch/arm64/include/asm/kvm_pkvm.h b/arch/arm64/include/asm/kvm_pkvm.h
index cd56acd9a842..84211d5daf87 100644
--- a/arch/arm64/include/asm/kvm_pkvm.h
+++ b/arch/arm64/include/asm/kvm_pkvm.h
@@ -11,6 +11,12 @@
#include <linux/scatterlist.h>
#include <asm/kvm_pgtable.h>

+struct pkvm_mapping {
+ u64 gfn;
+ u64 pfn;
+ struct rb_node node;
+};
+
/* Maximum number of VMs that can co-exist under pKVM. */
#define KVM_MAX_PVMS 255

@@ -137,4 +143,26 @@ static inline size_t pkvm_host_sve_state_size(void)
SVE_SIG_REGS_SIZE(sve_vq_from_vl(kvm_host_sve_max_vl)));
}

+static inline pkvm_handle_t pkvm_pgt_to_handle(struct kvm_pgtable *pgt)
+{
+ return pgt->pkvm.kvm->arch.pkvm.handle;
+}
+
+int pkvm_pgtable_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, struct kvm_pgtable_mm_ops *mm_ops);
+void pkvm_pgtable_destroy(struct kvm_pgtable *pgt);
+int pkvm_pgtable_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
+ u64 phys, enum kvm_pgtable_prot prot,
+ void *mc, enum kvm_pgtable_walk_flags flags);
+int pkvm_pgtable_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size);
+int pkvm_pgtable_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size);
+int pkvm_pgtable_flush(struct kvm_pgtable *pgt, u64 addr, u64 size);
+bool pkvm_pgtable_test_clear_young(struct kvm_pgtable *pgt, u64 addr, u64 size, bool mkold);
+int pkvm_pgtable_relax_perms(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_prot prot,
+ enum kvm_pgtable_walk_flags flags);
+void pkvm_pgtable_mkyoung(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_walk_flags flags);
+int pkvm_pgtable_split(struct kvm_pgtable *pgt, u64 addr, u64 size, struct kvm_mmu_memory_cache *mc);
+void pkvm_pgtable_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level);
+kvm_pte_t *pkvm_pgtable_create_unlinked(struct kvm_pgtable *pgt, u64 phys, s8 level,
+ enum kvm_pgtable_prot prot, void *mc, bool force_pte);
+
#endif /* __ARM64_KVM_PKVM_H__ */
diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c
index 85117ea8f351..9c648a510671 100644
--- a/arch/arm64/kvm/pkvm.c
+++ b/arch/arm64/kvm/pkvm.c
@@ -7,6 +7,7 @@
#include <linux/init.h>
#include <linux/kmemleak.h>
#include <linux/kvm_host.h>
+#include <asm/kvm_mmu.h>
#include <linux/memblock.h>
#include <linux/mutex.h>
#include <linux/sort.h>
@@ -268,3 +269,197 @@ static int __init finalize_pkvm(void)
return ret;
}
device_initcall_sync(finalize_pkvm);
+
+static int cmp_mappings(struct rb_node *node, const struct rb_node *parent)
+{
+ struct pkvm_mapping *a = rb_entry(node, struct pkvm_mapping, node);
+ struct pkvm_mapping *b = rb_entry(parent, struct pkvm_mapping, node);
+
+ if (a->gfn < b->gfn)
+ return -1;
+ if (a->gfn > b->gfn)
+ return 1;
+ return 0;
+}
+
+static struct rb_node *find_first_mapping_node(struct rb_root *root, u64 gfn)
+{
+ struct rb_node *node = root->rb_node, *prev = NULL;
+ struct pkvm_mapping *mapping;
+
+ while (node) {
+ mapping = rb_entry(node, struct pkvm_mapping, node);
+ if (mapping->gfn == gfn)
+ return node;
+ prev = node;
+ node = (gfn < mapping->gfn) ? node->rb_left : node->rb_right;
+ }
+
+ return prev;
+}
+
+#define for_each_mapping_in_range(pgt, start_ipa, end_ipa, mapping, tmp) \
+ for (tmp = find_first_mapping_node(&pgt->pkvm.mappings, ((start_ipa) >> PAGE_SHIFT)); \
+ tmp && ({ mapping = rb_entry(tmp, struct pkvm_mapping, node); tmp = rb_next(tmp); 1; });) \
+ if (mapping->gfn < ((start_ipa) >> PAGE_SHIFT)) \
+ continue; \
+ else if (mapping->gfn >= ((end_ipa) >> PAGE_SHIFT)) \
+ break; \
+ else
+
+int pkvm_pgtable_init(struct kvm_pgtable *pgt, struct kvm_s2_mmu *mmu, struct kvm_pgtable_mm_ops *mm_ops)
+{
+ pgt->pkvm.kvm = kvm_s2_mmu_to_kvm(mmu);
+ pgt->pkvm.mappings = RB_ROOT;
+ rwlock_init(&pgt->pkvm.mappings_lock);
+
+ return 0;
+}
+
+void pkvm_pgtable_destroy(struct kvm_pgtable *pgt)
+{
+ pkvm_handle_t handle = pkvm_pgt_to_handle(pgt);
+ struct pkvm_mapping *mapping;
+ struct rb_node *node;
+
+ if (!handle)
+ return;
+
+ node = rb_first(&pgt->pkvm.mappings);
+ while (node) {
+ mapping = rb_entry(node, struct pkvm_mapping, node);
+ kvm_call_hyp_nvhe(__pkvm_host_unshare_guest, handle, mapping->gfn);
+ node = rb_next(node);
+ rb_erase(&mapping->node, &pgt->pkvm.mappings);
+ kfree(mapping);
+ }
+}
+
+int pkvm_pgtable_map(struct kvm_pgtable *pgt, u64 addr, u64 size,
+ u64 phys, enum kvm_pgtable_prot prot,
+ void *mc, enum kvm_pgtable_walk_flags flags)
+{
+ struct pkvm_mapping *mapping = NULL;
+ struct kvm_hyp_memcache *cache = mc;
+ u64 gfn = addr >> PAGE_SHIFT;
+ u64 pfn = phys >> PAGE_SHIFT;
+ int ret;
+
+ if (size != PAGE_SIZE)
+ return -EINVAL;
+
+ write_lock(&pgt->pkvm.mappings_lock);
+ ret = kvm_call_hyp_nvhe(__pkvm_host_share_guest, pfn, gfn, prot);
+ if (ret) {
+ /* Is the gfn already mapped due to a racing vCPU? */
+ if (ret == -EPERM)
+ ret = -EAGAIN;
+ goto unlock;
+ }
+
+ swap(mapping, cache->mapping);
+ mapping->gfn = gfn;
+ mapping->pfn = pfn;
+ WARN_ON(rb_find_add(&mapping->node, &pgt->pkvm.mappings, cmp_mappings));
+unlock:
+ write_unlock(&pgt->pkvm.mappings_lock);
+
+ return ret;
+}
+
+int pkvm_pgtable_unmap(struct kvm_pgtable *pgt, u64 addr, u64 size)
+{
+ pkvm_handle_t handle = pkvm_pgt_to_handle(pgt);
+ struct pkvm_mapping *mapping;
+ struct rb_node *tmp;
+ int ret = 0;
+
+ write_lock(&pgt->pkvm.mappings_lock);
+ for_each_mapping_in_range(pgt, addr, addr + size, mapping, tmp) {
+ ret = kvm_call_hyp_nvhe(__pkvm_host_unshare_guest, handle, mapping->gfn);
+ if (WARN_ON(ret))
+ break;
+
+ rb_erase(&mapping->node, &pgt->pkvm.mappings);
+ kfree(mapping);
+ }
+ write_unlock(&pgt->pkvm.mappings_lock);
+
+ return ret;
+}
+
+int pkvm_pgtable_wrprotect(struct kvm_pgtable *pgt, u64 addr, u64 size)
+{
+ pkvm_handle_t handle = pkvm_pgt_to_handle(pgt);
+ struct pkvm_mapping *mapping;
+ struct rb_node *tmp;
+ int ret = 0;
+
+ read_lock(&pgt->pkvm.mappings_lock);
+ for_each_mapping_in_range(pgt, addr, addr + size, mapping, tmp) {
+ ret = kvm_call_hyp_nvhe(__pkvm_host_wrprotect_guest, handle, mapping->gfn);
+ if (WARN_ON(ret))
+ break;
+ }
+ read_unlock(&pgt->pkvm.mappings_lock);
+
+ return ret;
+}
+
+int pkvm_pgtable_flush(struct kvm_pgtable *pgt, u64 addr, u64 size)
+{
+ struct pkvm_mapping *mapping;
+ struct rb_node *tmp;
+
+ read_lock(&pgt->pkvm.mappings_lock);
+ for_each_mapping_in_range(pgt, addr, addr + size, mapping, tmp)
+ __clean_dcache_guest_page(pfn_to_kaddr(mapping->pfn), PAGE_SIZE);
+ read_unlock(&pgt->pkvm.mappings_lock);
+
+ return 0;
+}
+
+bool pkvm_pgtable_test_clear_young(struct kvm_pgtable *pgt, u64 addr, u64 size, bool mkold)
+{
+ pkvm_handle_t handle = pkvm_pgt_to_handle(pgt);
+ struct pkvm_mapping *mapping;
+ struct rb_node *tmp;
+ bool young = false;
+
+ read_lock(&pgt->pkvm.mappings_lock);
+ for_each_mapping_in_range(pgt, addr, addr + size, mapping, tmp)
+ young |= kvm_call_hyp_nvhe(__pkvm_host_test_clear_young_guest, handle, mapping->gfn,
+ mkold);
+ read_unlock(&pgt->pkvm.mappings_lock);
+
+ return young;
+}
+
+int pkvm_pgtable_relax_perms(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_prot prot,
+ enum kvm_pgtable_walk_flags flags)
+{
+ return kvm_call_hyp_nvhe(__pkvm_host_relax_guest_perms, addr >> PAGE_SHIFT, prot);
+}
+
+void pkvm_pgtable_mkyoung(struct kvm_pgtable *pgt, u64 addr, enum kvm_pgtable_walk_flags flags)
+{
+ WARN_ON(kvm_call_hyp_nvhe(__pkvm_host_mkyoung_guest, addr >> PAGE_SHIFT));
+}
+
+void pkvm_pgtable_free_unlinked(struct kvm_pgtable_mm_ops *mm_ops, void *pgtable, s8 level)
+{
+ WARN_ON(1);
+}
+
+kvm_pte_t *pkvm_pgtable_create_unlinked(struct kvm_pgtable *pgt, u64 phys, s8 level,
+ enum kvm_pgtable_prot prot, void *mc, bool force_pte)
+{
+ WARN_ON(1);
+ return NULL;
+}
+
+int pkvm_pgtable_split(struct kvm_pgtable *pgt, u64 addr, u64 size, struct kvm_mmu_memory_cache *mc)
+{
+ WARN_ON(1);
+ return -EINVAL;
+}
--
2.47.0.338.g60cca15819-goog