Re: [RFC PATCH 17/28] arm64: RME: Runtime faulting of memory

From: Zhi Wang
Date: Mon Mar 06 2023 - 13:21:32 EST


On Fri, 27 Jan 2023 11:29:21 +0000
Steven Price <steven.price@xxxxxxx> wrote:

> At runtime if the realm guest accesses memory which hasn't yet been
> mapped then KVM needs to either populate the region or fault the guest.
>
> For memory in the lower (protected) region of IPA a fresh page is
> provided to the RMM which will zero the contents. For memory in the
> upper (shared) region of IPA, the memory from the memslot is mapped
> into the realm VM non secure.
>
> Signed-off-by: Steven Price <steven.price@xxxxxxx>
> ---
> arch/arm64/include/asm/kvm_emulate.h | 10 +++++
> arch/arm64/include/asm/kvm_rme.h | 12 ++++++
> arch/arm64/kvm/mmu.c | 64 +++++++++++++++++++++++++---
> arch/arm64/kvm/rme.c | 48 +++++++++++++++++++++
> 4 files changed, 128 insertions(+), 6 deletions(-)
>
> diff --git a/arch/arm64/include/asm/kvm_emulate.h b/arch/arm64/include/asm/kvm_emulate.h
> index 285e62914ca4..3a71b3d2e10a 100644
> --- a/arch/arm64/include/asm/kvm_emulate.h
> +++ b/arch/arm64/include/asm/kvm_emulate.h
> @@ -502,6 +502,16 @@ static inline enum realm_state kvm_realm_state(struct kvm *kvm)
> return READ_ONCE(kvm->arch.realm.state);
> }
>
> +static inline gpa_t kvm_gpa_stolen_bits(struct kvm *kvm)
> +{
> + if (kvm_is_realm(kvm)) {
> + struct realm *realm = &kvm->arch.realm;
> +
> + return BIT(realm->ia_bits - 1);
> + }
> + return 0;
> +}
> +

"stolen" seems a little bit vague. Maybe "shared" bit would be better as
SEV-SNP has C bit and TDX has shared bit. It would be nice to align with
the common knowledge.

Also, it would be nice to change the name of gpa_stolen_mask accordingly.

> static inline bool vcpu_is_rec(struct kvm_vcpu *vcpu)
> {
> if (static_branch_unlikely(&kvm_rme_is_available))
> diff --git a/arch/arm64/include/asm/kvm_rme.h b/arch/arm64/include/asm/kvm_rme.h
> index 9d1583c44a99..303e4a5e5704 100644
> --- a/arch/arm64/include/asm/kvm_rme.h
> +++ b/arch/arm64/include/asm/kvm_rme.h
> @@ -50,6 +50,18 @@ void kvm_destroy_rec(struct kvm_vcpu *vcpu);
> int kvm_rec_enter(struct kvm_vcpu *vcpu);
> int handle_rme_exit(struct kvm_vcpu *vcpu, int rec_run_status);
>
> +void kvm_realm_unmap_range(struct kvm *kvm, unsigned long ipa, u64 size);
> +int realm_map_protected(struct realm *realm,
> + unsigned long hva,
> + unsigned long base_ipa,
> + struct page *dst_page,
> + unsigned long map_size,
> + struct kvm_mmu_memory_cache *memcache);
> +int realm_map_non_secure(struct realm *realm,
> + unsigned long ipa,
> + struct page *page,
> + unsigned long map_size,
> + struct kvm_mmu_memory_cache *memcache);
> int realm_set_ipa_state(struct kvm_vcpu *vcpu,
> unsigned long addr, unsigned long end,
> unsigned long ripas);
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index f29558c5dcbc..5417c273861b 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -235,8 +235,13 @@ static void __unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64
>
> lockdep_assert_held_write(&kvm->mmu_lock);
> WARN_ON(size & ~PAGE_MASK);
> - WARN_ON(stage2_apply_range(kvm, start, end, kvm_pgtable_stage2_unmap,
> - may_block));
> +
> + if (kvm_is_realm(kvm))
> + kvm_realm_unmap_range(kvm, start, size);
> + else
> + WARN_ON(stage2_apply_range(kvm, start, end,
> + kvm_pgtable_stage2_unmap,
> + may_block));
> }
>
> static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 size)
> @@ -250,7 +255,11 @@ static void stage2_flush_memslot(struct kvm *kvm,
> phys_addr_t addr = memslot->base_gfn << PAGE_SHIFT;
> phys_addr_t end = addr + PAGE_SIZE * memslot->npages;
>
> - stage2_apply_range_resched(kvm, addr, end, kvm_pgtable_stage2_flush);
> + if (kvm_is_realm(kvm))
> + kvm_realm_unmap_range(kvm, addr, end - addr);
> + else
> + stage2_apply_range_resched(kvm, addr, end,
> + kvm_pgtable_stage2_flush);
> }
>
> /**
> @@ -818,6 +827,10 @@ void stage2_unmap_vm(struct kvm *kvm)
> struct kvm_memory_slot *memslot;
> int idx, bkt;
>
> + /* For realms this is handled by the RMM so nothing to do here */
> + if (kvm_is_realm(kvm))
> + return;
> +
> idx = srcu_read_lock(&kvm->srcu);
> mmap_read_lock(current->mm);
> write_lock(&kvm->mmu_lock);
> @@ -840,6 +853,7 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
> pgt = mmu->pgt;
> if (kvm_is_realm(kvm) &&
> kvm_realm_state(kvm) != REALM_STATE_DYING) {
> + unmap_stage2_range(mmu, 0, (~0ULL) & PAGE_MASK);
> write_unlock(&kvm->mmu_lock);
> kvm_realm_destroy_rtts(&kvm->arch.realm, pgt->ia_bits,
> pgt->start_level);
> @@ -1190,6 +1204,24 @@ static bool kvm_vma_mte_allowed(struct vm_area_struct *vma)
> return vma->vm_flags & VM_MTE_ALLOWED;
> }
>
> +static int realm_map_ipa(struct kvm *kvm, phys_addr_t ipa, unsigned long hva,
> + kvm_pfn_t pfn, unsigned long map_size,
> + enum kvm_pgtable_prot prot,
> + struct kvm_mmu_memory_cache *memcache)
> +{
> + struct realm *realm = &kvm->arch.realm;
> + struct page *page = pfn_to_page(pfn);
> +
> + if (WARN_ON(!(prot & KVM_PGTABLE_PROT_W)))
> + return -EFAULT;
> +
> + if (!realm_is_addr_protected(realm, ipa))
> + return realm_map_non_secure(realm, ipa, page, map_size,
> + memcache);
> +
> + return realm_map_protected(realm, hva, ipa, page, map_size, memcache);
> +}
> +
> static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> struct kvm_memory_slot *memslot, unsigned long hva,
> unsigned long fault_status)
> @@ -1210,9 +1242,15 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> unsigned long vma_pagesize, fault_granule;
> enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
> struct kvm_pgtable *pgt;
> + gpa_t gpa_stolen_mask = kvm_gpa_stolen_bits(vcpu->kvm);
>
> fault_granule = 1UL << ARM64_HW_PGTABLE_LEVEL_SHIFT(fault_level);
> write_fault = kvm_is_write_fault(vcpu);
> +
> + /* Realms cannot map read-only */

Out of curiosity, why? It would be nice to have more explanation in the
comment.

> + if (vcpu_is_rec(vcpu))
> + write_fault = true;
> +
> exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
> VM_BUG_ON(write_fault && exec_fault);
>
> @@ -1272,7 +1310,7 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> if (vma_pagesize == PMD_SIZE || vma_pagesize == PUD_SIZE)
> fault_ipa &= ~(vma_pagesize - 1);
>
> - gfn = fault_ipa >> PAGE_SHIFT;
> + gfn = (fault_ipa & ~gpa_stolen_mask) >> PAGE_SHIFT;
> mmap_read_unlock(current->mm);
>
> /*
> @@ -1345,7 +1383,8 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> * If we are not forced to use page mapping, check if we are
> * backed by a THP and thus use block mapping if possible.
> */
> - if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) {
> + /* FIXME: We shouldn't need to disable this for realms */
> + if (vma_pagesize == PAGE_SIZE && !(force_pte || device || kvm_is_realm(kvm))) {

Why do we have to disable this temporarily?

> if (fault_status == FSC_PERM && fault_granule > PAGE_SIZE)
> vma_pagesize = fault_granule;
> else
> @@ -1382,6 +1421,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> */
> if (fault_status == FSC_PERM && vma_pagesize == fault_granule)
> ret = kvm_pgtable_stage2_relax_perms(pgt, fault_ipa, prot);
> + else if (kvm_is_realm(kvm))
> + ret = realm_map_ipa(kvm, fault_ipa, hva, pfn, vma_pagesize,
> + prot, memcache);
> else
> ret = kvm_pgtable_stage2_map(pgt, fault_ipa, vma_pagesize,
> __pfn_to_phys(pfn), prot,
> @@ -1437,6 +1479,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
> struct kvm_memory_slot *memslot;
> unsigned long hva;
> bool is_iabt, write_fault, writable;
> + gpa_t gpa_stolen_mask = kvm_gpa_stolen_bits(vcpu->kvm);
> gfn_t gfn;
> int ret, idx;
>
> @@ -1491,7 +1534,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
>
> idx = srcu_read_lock(&vcpu->kvm->srcu);
>
> - gfn = fault_ipa >> PAGE_SHIFT;
> + gfn = (fault_ipa & ~gpa_stolen_mask) >> PAGE_SHIFT;
> memslot = gfn_to_memslot(vcpu->kvm, gfn);
> hva = gfn_to_hva_memslot_prot(memslot, gfn, &writable);
> write_fault = kvm_is_write_fault(vcpu);
> @@ -1536,6 +1579,7 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
> * of the page size.
> */
> fault_ipa |= kvm_vcpu_get_hfar(vcpu) & ((1 << 12) - 1);
> + fault_ipa &= ~gpa_stolen_mask;
> ret = io_mem_abort(vcpu, fault_ipa);
> goto out_unlock;
> }
> @@ -1617,6 +1661,10 @@ bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
> if (!kvm->arch.mmu.pgt)
> return false;
>

Does the unprotected (shared) region of a realm support aging?

> + /* We don't support aging for Realms */
> + if (kvm_is_realm(kvm))
> + return true;
> +
> WARN_ON(size != PAGE_SIZE && size != PMD_SIZE && size != PUD_SIZE);
>
> kpte = kvm_pgtable_stage2_mkold(kvm->arch.mmu.pgt,
> @@ -1630,6 +1678,10 @@ bool kvm_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
> if (!kvm->arch.mmu.pgt)
> return false;
>
> + /* We don't support aging for Realms */
> + if (kvm_is_realm(kvm))
> + return true;
> +
> return kvm_pgtable_stage2_is_young(kvm->arch.mmu.pgt,
> range->start << PAGE_SHIFT);
> }
> diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c
> index 3405b43e1421..3d46191798e5 100644
> --- a/arch/arm64/kvm/rme.c
> +++ b/arch/arm64/kvm/rme.c
> @@ -608,6 +608,54 @@ int realm_map_protected(struct realm *realm,
> return -ENXIO;
> }
>
> +int realm_map_non_secure(struct realm *realm,
> + unsigned long ipa,
> + struct page *page,
> + unsigned long map_size,
> + struct kvm_mmu_memory_cache *memcache)
> +{
> + phys_addr_t rd = virt_to_phys(realm->rd);
> + int map_level;
> + int ret = 0;
> + unsigned long desc = page_to_phys(page) |
> + PTE_S2_MEMATTR(MT_S2_FWB_NORMAL) |
> + /* FIXME: Read+Write permissions for now */
Why can't we handle the prot from the realm_map_ipa()? Working in progress? :)
> + (3 << 6) |
> + PTE_SHARED;
> +
> + if (WARN_ON(!IS_ALIGNED(ipa, map_size)))
> + return -EINVAL;
> +
> + switch (map_size) {
> + case PAGE_SIZE:
> + map_level = 3;
> + break;
> + case RME_L2_BLOCK_SIZE:
> + map_level = 2;
> + break;
> + default:
> + return -EINVAL;
> + }
> +
> + ret = rmi_rtt_map_unprotected(rd, ipa, map_level, desc);
> +
> + if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) {
> + /* Create missing RTTs and retry */
> + int level = RMI_RETURN_INDEX(ret);
> +
> + ret = realm_create_rtt_levels(realm, ipa, level, map_level,
> + memcache);
> + if (WARN_ON(ret))
> + return -ENXIO;
> +
> + ret = rmi_rtt_map_unprotected(rd, ipa, map_level, desc);
> + }
> + if (WARN_ON(ret))
> + return -ENXIO;
> +
> + return 0;
> +}
> +
> static int populate_par_region(struct kvm *kvm,
> phys_addr_t ipa_base,
> phys_addr_t ipa_end)