Re: [PATCH v6 19/43] arm64: RME: Allow populating initial contents
From: Steven Price
Date: Mon Feb 03 2025 - 11:53:09 EST
On 30/01/2025 04:38, Gavin Shan wrote:
> On 12/13/24 1:55 AM, Steven Price wrote:
>> The VMM needs to populate the realm with some data before starting (e.g.
>> a kernel and initrd). This is measured by the RMM and used as part of
>> the attestation later on.
>>
>> For now only 4k mappings are supported, future work may add support for
>> larger mappings.
>>
>> Co-developed-by: Suzuki K Poulose <suzuki.poulose@xxxxxxx>
>> Signed-off-by: Suzuki K Poulose <suzuki.poulose@xxxxxxx>
>> Signed-off-by: Steven Price <steven.price@xxxxxxx>
>> ---
>> Changes since v5:
>> * Refactor to use PFNs rather than tracking struct page in
>> realm_create_protected_data_page().
>> * Pull changes from a later patch (in the v5 series) for accessing
>> pages from a guest memfd.
>> * Do the populate in chunks to avoid holding locks for too long and
>> triggering RCU stall warnings.
>> ---
>> arch/arm64/kvm/rme.c | 243 +++++++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 243 insertions(+)
>>
>> diff --git a/arch/arm64/kvm/rme.c b/arch/arm64/kvm/rme.c
>> index 22f0c74455af..d4561e368cd5 100644
>> --- a/arch/arm64/kvm/rme.c
>> +++ b/arch/arm64/kvm/rme.c
>> @@ -4,6 +4,7 @@
>> */
>> #include <linux/kvm_host.h>
>> +#include <linux/hugetlb.h>
>>
>
> This wouldn't be needed since the huge pages, especially hugetlb part,
> isn't
> supported yet.
Indeed, thanks for spotting.
>> #include <asm/kvm_emulate.h>
>> #include <asm/kvm_mmu.h>
>> @@ -545,6 +546,236 @@ void kvm_realm_unmap_range(struct kvm *kvm,
>> unsigned long start, u64 size,
>> realm_unmap_private_range(kvm, start, end);
>> }
>> +static int realm_create_protected_data_page(struct realm *realm,
>> + unsigned long ipa,
>> + kvm_pfn_t dst_pfn,
>> + kvm_pfn_t src_pfn,
>> + unsigned long flags)
>> +{
>> + phys_addr_t dst_phys, src_phys;
>> + int ret;
>> +
>> + dst_phys = __pfn_to_phys(dst_pfn);
>> + src_phys = __pfn_to_phys(src_pfn);
>> +
>> + if (rmi_granule_delegate(dst_phys))
>> + return -ENXIO;
>> +
>> + ret = rmi_data_create(virt_to_phys(realm->rd), dst_phys, ipa,
>> src_phys,
>> + flags);
>> +
>> + if (RMI_RETURN_STATUS(ret) == RMI_ERROR_RTT) {
>> + /* Create missing RTTs and retry */
>> + int level = RMI_RETURN_INDEX(ret);
>> +
>> + ret = realm_create_rtt_levels(realm, ipa, level,
>> + RMM_RTT_MAX_LEVEL, NULL);
>> + if (ret)
>> + goto err;
>> +
>> + ret = rmi_data_create(virt_to_phys(realm->rd), dst_phys, ipa,
>> + src_phys, flags);
>> + }
>> +
>> + if (!ret)
>> + return 0;
>> +
>> +err:
>> + if (WARN_ON(rmi_granule_undelegate(dst_phys))) {
>> + /* Page can't be returned to NS world so is lost */
>> + get_page(pfn_to_page(dst_pfn));
>> + }
>> + return -ENXIO;
>> +}
>> +
>> +static int fold_rtt(struct realm *realm, unsigned long addr, int level)
>> +{
>> + phys_addr_t rtt_addr;
>> + int ret;
>> +
>> + ret = realm_rtt_fold(realm, addr, level + 1, &rtt_addr);
>> + if (ret)
>> + return ret;
>> +
>> + free_delegated_granule(rtt_addr);
>> +
>> + return 0;
>> +}
>> +
>> +static int populate_par_region(struct kvm *kvm,
>> + phys_addr_t ipa_base,
>> + phys_addr_t ipa_end,
>> + u32 flags)
>> +{
>
> At the first glance, I was wandering what's meant by 'par' in the
> function name.
> It turns to be a 2MB region and I guess it represents 'part'. I think
> this may
> be renamed to populate_sub_region() or populate_region() directly.
Actually 'par' is outdated and refers to "protected address range" which
is a concept which existed in an older version of CCA realms. It's
definitely in need of a rename! ;) I'll just drop the 'par' and call it
populate_region().
>> + struct realm *realm = &kvm->arch.realm;
>> + struct kvm_memory_slot *memslot;
>> + gfn_t base_gfn, end_gfn;
>> + int idx;
>> + phys_addr_t ipa;
>> + int ret = 0;
>> + unsigned long data_flags = 0;
>> +
>> + base_gfn = gpa_to_gfn(ipa_base);
>> + end_gfn = gpa_to_gfn(ipa_end);
>> +
>> + if (flags & KVM_ARM_RME_POPULATE_FLAGS_MEASURE)
>> + data_flags = RMI_MEASURE_CONTENT;
>> +
>
> The 'data_flags' can be sorted out by its caller kvm_populate_realm(),
> and passed
> to populate_par_region(). In that way, we needn't to figure out
> 'data_flags' in
> every call to populate_par_region().
Ack
>> + idx = srcu_read_lock(&kvm->srcu);
>> + memslot = gfn_to_memslot(kvm, base_gfn);
>> + if (!memslot) {
>> + ret = -EFAULT;
>> + goto out;
>> + }
>> +
>> + /* We require the region to be contained within a single memslot */
>> + if (memslot->base_gfn + memslot->npages < end_gfn) {
>> + ret = -EINVAL;
>> + goto out;
>> + }
>> +
>> + if (!kvm_slot_can_be_private(memslot)) {
>> + ret = -EINVAL;
>> + goto out;
>> + }
>> +
>> + write_lock(&kvm->mmu_lock);
>> +
>> + ipa = ipa_base;
>> + while (ipa < ipa_end) {
>> + struct vm_area_struct *vma;
>> + unsigned long map_size;
>> + unsigned int vma_shift;
>> + unsigned long offset;
>> + unsigned long hva;
>> + struct page *page;
>> + bool writeable;
>> + kvm_pfn_t pfn;
>> + int level, i;
>> +
>> + hva = gfn_to_hva_memslot(memslot, gpa_to_gfn(ipa));
>> + vma = vma_lookup(current->mm, hva);
>> + if (!vma) {
>> + ret = -EFAULT;
>> + break;
>> + }
>> +
>> + /* FIXME: Currently we only support 4k sized mappings */
>> + vma_shift = PAGE_SHIFT;
>> +
>> + map_size = 1 << vma_shift;
>> +
>> + ipa = ALIGN_DOWN(ipa, map_size);
>> +
>
> The blank lines in above 5 lines can be dropped :)
:) Agreed
>> + switch (map_size) {
>> + case RMM_L2_BLOCK_SIZE:
>> + level = 2;
>> + break;
>> + case PAGE_SIZE:
>> + level = 3;
>> + break;
>> + default:
>> + WARN_ONCE(1, "Unsupported vma_shift %d", vma_shift);
>> + ret = -EFAULT;
>> + break;
>> + }
>> +
>> + pfn = __kvm_faultin_pfn(memslot, gpa_to_gfn(ipa), FOLL_WRITE,
>> + &writeable, &page);
>> +
>> + if (is_error_pfn(pfn)) {
>> + ret = -EFAULT;
>> + break;
>> + }
>> +
>> + if (level < RMM_RTT_MAX_LEVEL) {
>> + /*
>> + * A temporary RTT is needed during the map, precreate
>> + * it, however if there is an error (e.g. missing
>> + * parent tables) this will be handled in the
>> + * realm_create_protected_data_page() call.
>> + */
>> + realm_create_rtt_levels(realm, ipa, level,
>> + RMM_RTT_MAX_LEVEL, NULL);
>> + }
>> +
>
> This block of code to create the temporary RTT can be removed. With it
> removed,
> we're going to rely on realm_create_protected_data_page() to create the
> needed
> RTT in its failing path. If the temporary RTT has been existing, the
> function
> call to realm_create_rtt_levels() doesn't nothing except multiple RMI
> calls are
> issued. RMI calls aren't cheap and it causes performance lost if you agree.
Actually this is to avoid extra RMI calls - albeit this is irrelevant
with the current FIXME above because everything is 4k.
For a block mapping it's still required to create RTTs to the full
depth, the individual 4k pages are then mapped, and finally the RTTs are
"folded" to remove the bottom table.
So in the usual case where RTTs exist to level 2 (because there are
other mappings nearby) then attempting to create the final level avoids
the first call to rmi_data_create() (in
realm_create_protected_data_page()) failing.
We don't expect tables to be created to RMM_RTT_MAX_LEVEL - because that
implies that there is something already mapped there (although this can
also happen in the situation where there was previously something mapped
but it has been unmapped without the RTTs being destroyed).
The comment is there because rather than adding another path for the
situation where parent tables don't exist, this is handled by the
rmi_data_create() failing in realm_create_protected_data_page() - the
error code lets us know which levels are needed. This is (of course)
inefficient, but it will be rare.
But given that I haven't got huge pages supported yet it's hard to make
any real judgements about the trade-offs and which is going to be most
performant.
>> + for (offset = 0, i = 0; offset < map_size && !ret;
>> + offset += PAGE_SIZE, i++) {
>> + phys_addr_t page_ipa = ipa + offset;
>> + kvm_pfn_t priv_pfn;
>> + struct page *gmem_page;
>> + int order;
>> +
>> + ret = kvm_gmem_get_pfn(kvm, memslot,
>> + page_ipa >> PAGE_SHIFT,
>> + &priv_pfn, &gmem_page, &order);
>> + if (ret)
>> + break;
>> +
>> + ret = realm_create_protected_data_page(realm, page_ipa,
>> + priv_pfn,
>> + pfn + i,
>> + data_flags);
>> + }
>> +
>> + kvm_release_faultin_page(kvm, page, false, false);
>> +
>> + if (ret)
>> + break;
>> +
>> + if (level == 2)
>> + fold_rtt(realm, ipa, level);
>> +
>> + ipa += map_size;
>> + }
>> +
>> + write_unlock(&kvm->mmu_lock);
>> +
>> +out:
>> + srcu_read_unlock(&kvm->srcu, idx);
>> + return ret;
>> +}
>> +
>> +static int kvm_populate_realm(struct kvm *kvm,
>> + struct kvm_cap_arm_rme_populate_realm_args *args)
>> +{
>> + phys_addr_t ipa_base, ipa_end;
>> +
>> + if (kvm_realm_state(kvm) != REALM_STATE_NEW)
>> + return -EINVAL;
>> +
>> + if (!IS_ALIGNED(args->populate_ipa_base, PAGE_SIZE) ||
>> + !IS_ALIGNED(args->populate_ipa_size, PAGE_SIZE))
>> + return -EINVAL;
>> +
>> + if (args->flags & ~RMI_MEASURE_CONTENT)
>> + return -EINVAL;
>> +
>> + ipa_base = args->populate_ipa_base;
>> + ipa_end = ipa_base + args->populate_ipa_size;
>> +
>> + if (ipa_end < ipa_base)
>> + return -EINVAL;
>> +
>> + /*
>> + * Perform the populate in parts to ensure locks are not held for
>> too
>> + * long
>> + */
>> + while (ipa_base < ipa_end) {
>> + phys_addr_t end = min(ipa_end, ipa_base + SZ_2M);
>> +
>> + int ret = populate_par_region(kvm, ipa_base, end,
>> + args->flags);
>> +
>> + if (ret)
>> + return ret;
>> +
>
> cond_resched() seems nice to have here so that those pending tasks can
> be run immediately.
Ack
Thanks,
Steve
>> + ipa_base = end;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> int realm_set_ipa_state(struct kvm_vcpu *vcpu,
>> unsigned long start,
>> unsigned long end,
>> @@ -794,6 +1025,18 @@ int kvm_realm_enable_cap(struct kvm *kvm, struct
>> kvm_enable_cap *cap)
>> r = kvm_init_ipa_range_realm(kvm, &args);
>> break;
>> }
>> + case KVM_CAP_ARM_RME_POPULATE_REALM: {
>> + struct kvm_cap_arm_rme_populate_realm_args args;
>> + void __user *argp = u64_to_user_ptr(cap->args[1]);
>> +
>> + if (copy_from_user(&args, argp, sizeof(args))) {
>> + r = -EFAULT;
>> + break;
>> + }
>> +
>> + r = kvm_populate_realm(kvm, &args);
>> + break;
>> + }
>> default:
>> r = -EINVAL;
>> break;
>
> Thanks,
> Gavin
>