[PATCH v2 04/18] KVM: arm64: Move host page ownership tracking to the hyp vmemmap

From: Quentin Perret
Date: Tue Dec 03 2024 - 05:38:45 EST


We currently store part of the page-tracking state in PTE software bits
for the host, guests and the hypervisor. This is sub-optimal when e.g.
sharing pages as this forces to break block mappings purely to support
this software tracking. This causes an unnecessarily fragmented stage-2
page-table for the host in particular when it shares pages with Secure,
which can lead to measurable regressions. Moreover, having this state
stored in the page-table forces us to do multiple costly walks on the
page transition path, hence causing overhead.

In order to work around these problems, move the host-side page-tracking
logic from SW bits in its stage-2 PTEs to the hypervisor's vmemmap.

Signed-off-by: Quentin Perret <qperret@xxxxxxxxxx>
---
arch/arm64/kvm/hyp/include/nvhe/memory.h | 6 +-
arch/arm64/kvm/hyp/nvhe/mem_protect.c | 94 ++++++++++++++++--------
arch/arm64/kvm/hyp/nvhe/setup.c | 7 +-
3 files changed, 71 insertions(+), 36 deletions(-)

diff --git a/arch/arm64/kvm/hyp/include/nvhe/memory.h b/arch/arm64/kvm/hyp/include/nvhe/memory.h
index 88cb8ff9e769..08f3a0416d4c 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/memory.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/memory.h
@@ -8,7 +8,7 @@
#include <linux/types.h>

/*
- * SW bits 0-1 are reserved to track the memory ownership state of each page:
+ * Bits 0-1 are reserved to track the memory ownership state of each page:
* 00: The page is owned exclusively by the page-table owner.
* 01: The page is owned by the page-table owner, but is shared
* with another entity.
@@ -44,7 +44,9 @@ static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot)
struct hyp_page {
u16 refcount;
u8 order;
- u8 reserved;
+
+ /* Host (non-meta) state. Guarded by the host stage-2 lock. */
+ enum pkvm_page_state host_state : 8;
};

extern u64 __hyp_vmemmap;
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index caba3e4bd09e..1595081c4f6b 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -201,8 +201,8 @@ static void *guest_s2_zalloc_page(void *mc)

memset(addr, 0, PAGE_SIZE);
p = hyp_virt_to_page(addr);
- memset(p, 0, sizeof(*p));
p->refcount = 1;
+ p->order = 0;

return addr;
}
@@ -268,6 +268,7 @@ int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd)

void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc)
{
+ struct hyp_page *page;
void *addr;

/* Dump all pgtable pages in the hyp_pool */
@@ -279,7 +280,9 @@ void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc)
/* Drain the hyp_pool into the memcache */
addr = hyp_alloc_pages(&vm->pool, 0);
while (addr) {
- memset(hyp_virt_to_page(addr), 0, sizeof(struct hyp_page));
+ page = hyp_virt_to_page(addr);
+ page->refcount = 0;
+ page->order = 0;
push_hyp_memcache(mc, addr, hyp_virt_to_phys);
WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(addr), 1));
addr = hyp_alloc_pages(&vm->pool, 0);
@@ -382,19 +385,25 @@ bool addr_is_memory(phys_addr_t phys)
return !!find_mem_range(phys, &range);
}

-static bool addr_is_allowed_memory(phys_addr_t phys)
+static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range)
+{
+ return range->start <= addr && addr < range->end;
+}
+
+static int range_is_allowed_memory(u64 start, u64 end)
{
struct memblock_region *reg;
struct kvm_mem_range range;

- reg = find_mem_range(phys, &range);
+ /* Can't check the state of both MMIO and memory regions at once */
+ reg = find_mem_range(start, &range);
+ if (!is_in_mem_range(end - 1, &range))
+ return -EINVAL;

- return reg && !(reg->flags & MEMBLOCK_NOMAP);
-}
+ if (!reg || reg->flags & MEMBLOCK_NOMAP)
+ return -EPERM;

-static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range)
-{
- return range->start <= addr && addr < range->end;
+ return 0;
}

static bool range_is_memory(u64 start, u64 end)
@@ -454,8 +463,11 @@ static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range)
if (kvm_pte_valid(pte))
return -EAGAIN;

- if (pte)
+ if (pte) {
+ WARN_ON(addr_is_memory(addr) &&
+ !(hyp_phys_to_page(addr)->host_state & PKVM_NOPAGE));
return -EPERM;
+ }

do {
u64 granule = kvm_granule_size(level);
@@ -477,10 +489,29 @@ int host_stage2_idmap_locked(phys_addr_t addr, u64 size,
return host_stage2_try(__host_stage2_idmap, addr, addr + size, prot);
}

+static void __host_update_page_state(phys_addr_t addr, u64 size, enum pkvm_page_state state)
+{
+ phys_addr_t end = addr + size;
+ for (; addr < end; addr += PAGE_SIZE)
+ hyp_phys_to_page(addr)->host_state = state;
+}
+
int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id)
{
- return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt,
- addr, size, &host_s2_pool, owner_id);
+ int ret;
+
+ ret = host_stage2_try(kvm_pgtable_stage2_set_owner, &host_mmu.pgt,
+ addr, size, &host_s2_pool, owner_id);
+ if (ret || !addr_is_memory(addr))
+ return ret;
+
+ /* Don't forget to update the vmemmap tracking for the host */
+ if (owner_id == PKVM_ID_HOST)
+ __host_update_page_state(addr, size, PKVM_PAGE_OWNED);
+ else
+ __host_update_page_state(addr, size, PKVM_NOPAGE);
+
+ return 0;
}

static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot)
@@ -604,35 +635,38 @@ static int check_page_state_range(struct kvm_pgtable *pgt, u64 addr, u64 size,
return kvm_pgtable_walk(pgt, addr, size, &walker);
}

-static enum pkvm_page_state host_get_page_state(kvm_pte_t pte, u64 addr)
-{
- if (!addr_is_allowed_memory(addr))
- return PKVM_NOPAGE;
-
- if (!kvm_pte_valid(pte) && pte)
- return PKVM_NOPAGE;
-
- return pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte));
-}
-
static int __host_check_page_state_range(u64 addr, u64 size,
enum pkvm_page_state state)
{
- struct check_walk_data d = {
- .desired = state,
- .get_page_state = host_get_page_state,
- };
+ u64 end = addr + size;
+ int ret;
+
+ ret = range_is_allowed_memory(addr, end);
+ if (ret)
+ return ret;

hyp_assert_lock_held(&host_mmu.lock);
- return check_page_state_range(&host_mmu.pgt, addr, size, &d);
+ for (; addr < end; addr += PAGE_SIZE) {
+ if (hyp_phys_to_page(addr)->host_state != state)
+ return -EPERM;
+ }
+
+ return 0;
}

static int __host_set_page_state_range(u64 addr, u64 size,
enum pkvm_page_state state)
{
- enum kvm_pgtable_prot prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, state);
+ if (hyp_phys_to_page(addr)->host_state & PKVM_NOPAGE) {
+ int ret = host_stage2_idmap_locked(addr, size, PKVM_HOST_MEM_PROT);

- return host_stage2_idmap_locked(addr, size, prot);
+ if (ret)
+ return ret;
+ }
+
+ __host_update_page_state(addr, size, state);
+
+ return 0;
}

static int host_request_owned_transition(u64 *completer_addr,
diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c
index cbdd18cd3f98..7e04d1c2a03d 100644
--- a/arch/arm64/kvm/hyp/nvhe/setup.c
+++ b/arch/arm64/kvm/hyp/nvhe/setup.c
@@ -180,7 +180,6 @@ static void hpool_put_page(void *addr)
static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx,
enum kvm_pgtable_walk_flags visit)
{
- enum kvm_pgtable_prot prot;
enum pkvm_page_state state;
phys_addr_t phys;

@@ -203,16 +202,16 @@ static int fix_host_ownership_walker(const struct kvm_pgtable_visit_ctx *ctx,
case PKVM_PAGE_OWNED:
return host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HYP);
case PKVM_PAGE_SHARED_OWNED:
- prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_BORROWED);
+ hyp_phys_to_page(phys)->host_state = PKVM_PAGE_SHARED_BORROWED;
break;
case PKVM_PAGE_SHARED_BORROWED:
- prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_OWNED);
+ hyp_phys_to_page(phys)->host_state = PKVM_PAGE_SHARED_OWNED;
break;
default:
return -EINVAL;
}

- return host_stage2_idmap_locked(phys, PAGE_SIZE, prot);
+ return 0;
}

static int fix_hyp_pgtable_refcnt_walker(const struct kvm_pgtable_visit_ctx *ctx,
--
2.47.0.338.g60cca15819-goog