[PATCH 19/21] KVM: TDX: Add an ioctl to create initial guest memory

From: Rick Edgecombe
Date: Tue Sep 03 2024 - 23:20:28 EST


From: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>

Add a new ioctl for the user space VMM to initialize guest memory with the
specified memory contents.

Because TDX protects the guest's memory, the creation of the initial guest
memory requires a dedicated TDX module API, TDH.MEM.PAGE.ADD(), instead of
directly copying the memory contents into the guest's memory in the case of
the default VM type.

Define a new subcommand, KVM_TDX_INIT_MEM_REGION, of vCPU-scoped
KVM_MEMORY_ENCRYPT_OP. Check if the GFN is already pre-allocated, assign
the guest page in Secure-EPT, copy the initial memory contents into the
guest memory, and encrypt the guest memory. Optionally, extend the memory
measurement of the TDX guest.

Discussion history:
- Originally, KVM_TDX_INIT_MEM_REGION used the callback of the TDP MMU of
the KVM page fault handler. It issues TDX SEAMCALL deep in the call
stack, and the ioctl passes down the necessary parameters. [2] rejected
it. [3] suggests that the call to the TDX module should be invoked in a
shallow call stack.

- Instead, introduce guest memory pre-population [1] that doesn't update
vendor-specific part (Secure-EPT in TDX case) and the vendor-specific
code (KVM_TDX_INIT_MEM_REGION) updates only vendor-specific parts without
modifying the KVM TDP MMU suggested at [4]

Crazy idea. For TDX S-EPT, what if KVM_MAP_MEMORY does all of the
SEPT.ADD stuff, which doesn't affect the measurement, and even fills in
KVM's copy of the leaf EPTE, but tdx_sept_set_private_spte() doesn't do
anything if the TD isn't finalized?

Then KVM provides a dedicated TDX ioctl(), i.e. what is/was
KVM_TDX_INIT_MEM_REGION, to do PAGE.ADD. KVM_TDX_INIT_MEM_REGION
wouldn't need to map anything, it would simply need to verify that the
pfn from guest_memfd() is the same as what's in the TDP MMU.

- Use the common guest_memfd population function, kvm_gmem_populate()
instead of a custom function. It should check whether the PFN
from TDP MMU is the same as the one from guest_memfd. [1]

- Instead of forcing userspace to do two passes, pre-map the guest
initial memory in tdx_gmem_post_populate. [5]

Link: https://lore.kernel.org/kvm/20240419085927.3648704-1-pbonzini@xxxxxxxxxx/ [1]
Link: https://lore.kernel.org/kvm/Zbrj5WKVgMsUFDtb@xxxxxxxxxx/ [2]
Link: https://lore.kernel.org/kvm/Zh8DHbb8FzoVErgX@xxxxxxxxxx/ [3]
Link: https://lore.kernel.org/kvm/Ze-TJh0BBOWm9spT@xxxxxxxxxx/ [4]
Link: https://lore.kernel.org/kvm/CABgObfa=a3cKcKJHQRrCs-3Ty8ppSRou=dhi6Q+KdZnom0Zegw@xxxxxxxxxxxxxx/ [5]
Suggested-by: Sean Christopherson <seanjc@xxxxxxxxxx>
Signed-off-by: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
Co-developed-by: Yan Zhao <yan.y.zhao@xxxxxxxxx>
Signed-off-by: Yan Zhao <yan.y.zhao@xxxxxxxxx>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@xxxxxxxxx>
---
TDX MMU part 2 v1:
- Update the code according to latest gmem update.
https://lore.kernel.org/kvm/CABgObfa=a3cKcKJHQRrCs-3Ty8ppSRou=dhi6Q+KdZnom0Zegw@xxxxxxxxxxxxxx/
- Fixup a aligment bug reported by Binbin.
- Rename KVM_MEMORY_MAPPING => KVM_MAP_MEMORY (Sean)
- Drop issueing TDH.MEM.PAGE.ADD() on KVM_MAP_MEMORY(), defer it to
KVM_TDX_INIT_MEM_REGION. (Sean)
- Added nr_premapped to track the number of premapped pages
- Drop tdx_post_mmu_map_page().
- Drop kvm_slot_can_be_private() check (Paolo)
- Use kvm_tdp_mmu_gpa_is_mapped() (Paolo)

v19:
- Switched to use KVM_MEMORY_MAPPING
- Dropped measurement extension
- updated commit message. private_page_add() => set_private_spte()
---
arch/x86/include/uapi/asm/kvm.h | 9 ++
arch/x86/kvm/vmx/tdx.c | 150 ++++++++++++++++++++++++++++++++
virt/kvm/kvm_main.c | 1 +
3 files changed, 160 insertions(+)

diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 39636be5c891..789d1d821b4f 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -931,6 +931,7 @@ enum kvm_tdx_cmd_id {
KVM_TDX_CAPABILITIES = 0,
KVM_TDX_INIT_VM,
KVM_TDX_INIT_VCPU,
+ KVM_TDX_INIT_MEM_REGION,
KVM_TDX_GET_CPUID,

KVM_TDX_CMD_NR_MAX,
@@ -996,4 +997,12 @@ struct kvm_tdx_init_vm {
struct kvm_cpuid2 cpuid;
};

+#define KVM_TDX_MEASURE_MEMORY_REGION _BITULL(0)
+
+struct kvm_tdx_init_mem_region {
+ __u64 source_addr;
+ __u64 gpa;
+ __u64 nr_pages;
+};
+
#endif /* _ASM_X86_KVM_H */
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 50ce24905062..796d1a495a66 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -8,6 +8,7 @@
#include "tdx_ops.h"
#include "vmx.h"
#include "mmu/spte.h"
+#include "common.h"

#undef pr_fmt
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -1586,6 +1587,152 @@ static int tdx_vcpu_init(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
return 0;
}

+struct tdx_gmem_post_populate_arg {
+ struct kvm_vcpu *vcpu;
+ __u32 flags;
+};
+
+static int tdx_gmem_post_populate(struct kvm *kvm, gfn_t gfn, kvm_pfn_t pfn,
+ void __user *src, int order, void *_arg)
+{
+ u64 error_code = PFERR_GUEST_FINAL_MASK | PFERR_PRIVATE_ACCESS;
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ struct tdx_gmem_post_populate_arg *arg = _arg;
+ struct kvm_vcpu *vcpu = arg->vcpu;
+ gpa_t gpa = gfn_to_gpa(gfn);
+ u8 level = PG_LEVEL_4K;
+ struct page *page;
+ int ret, i;
+ u64 err, entry, level_state;
+
+ /*
+ * Get the source page if it has been faulted in. Return failure if the
+ * source page has been swapped out or unmapped in primary memory.
+ */
+ ret = get_user_pages_fast((unsigned long)src, 1, 0, &page);
+ if (ret < 0)
+ return ret;
+ if (ret != 1)
+ return -ENOMEM;
+
+ if (!kvm_mem_is_private(kvm, gfn)) {
+ ret = -EFAULT;
+ goto out_put_page;
+ }
+
+ ret = kvm_tdp_map_page(vcpu, gpa, error_code, &level);
+ if (ret < 0)
+ goto out_put_page;
+
+ read_lock(&kvm->mmu_lock);
+
+ if (!kvm_tdp_mmu_gpa_is_mapped(vcpu, gpa)) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ ret = 0;
+ do {
+ err = tdh_mem_page_add(kvm_tdx, gpa, pfn_to_hpa(pfn),
+ pfn_to_hpa(page_to_pfn(page)),
+ &entry, &level_state);
+ } while (err == TDX_ERROR_SEPT_BUSY);
+ if (err) {
+ ret = -EIO;
+ goto out;
+ }
+
+ WARN_ON_ONCE(!atomic64_read(&kvm_tdx->nr_premapped));
+ atomic64_dec(&kvm_tdx->nr_premapped);
+
+ if (arg->flags & KVM_TDX_MEASURE_MEMORY_REGION) {
+ for (i = 0; i < PAGE_SIZE; i += TDX_EXTENDMR_CHUNKSIZE) {
+ err = tdh_mr_extend(kvm_tdx, gpa + i, &entry,
+ &level_state);
+ if (err) {
+ ret = -EIO;
+ break;
+ }
+ }
+ }
+
+out:
+ read_unlock(&kvm->mmu_lock);
+out_put_page:
+ put_page(page);
+ return ret;
+}
+
+static int tdx_vcpu_init_mem_region(struct kvm_vcpu *vcpu, struct kvm_tdx_cmd *cmd)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
+ struct kvm_tdx_init_mem_region region;
+ struct tdx_gmem_post_populate_arg arg;
+ long gmem_ret;
+ int ret;
+
+ if (!to_tdx(vcpu)->initialized)
+ return -EINVAL;
+
+ /* Once TD is finalized, the initial guest memory is fixed. */
+ if (is_td_finalized(kvm_tdx))
+ return -EINVAL;
+
+ if (cmd->flags & ~KVM_TDX_MEASURE_MEMORY_REGION)
+ return -EINVAL;
+
+ if (copy_from_user(&region, u64_to_user_ptr(cmd->data), sizeof(region)))
+ return -EFAULT;
+
+ if (!PAGE_ALIGNED(region.source_addr) || !PAGE_ALIGNED(region.gpa) ||
+ !region.nr_pages ||
+ region.gpa + (region.nr_pages << PAGE_SHIFT) <= region.gpa ||
+ !kvm_is_private_gpa(kvm, region.gpa) ||
+ !kvm_is_private_gpa(kvm, region.gpa + (region.nr_pages << PAGE_SHIFT) - 1))
+ return -EINVAL;
+
+ mutex_lock(&kvm->slots_lock);
+
+ kvm_mmu_reload(vcpu);
+ ret = 0;
+ while (region.nr_pages) {
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ break;
+ }
+
+ arg = (struct tdx_gmem_post_populate_arg) {
+ .vcpu = vcpu,
+ .flags = cmd->flags,
+ };
+ gmem_ret = kvm_gmem_populate(kvm, gpa_to_gfn(region.gpa),
+ u64_to_user_ptr(region.source_addr),
+ 1, tdx_gmem_post_populate, &arg);
+ if (gmem_ret < 0) {
+ ret = gmem_ret;
+ break;
+ }
+
+ if (gmem_ret != 1) {
+ ret = -EIO;
+ break;
+ }
+
+ region.source_addr += PAGE_SIZE;
+ region.gpa += PAGE_SIZE;
+ region.nr_pages--;
+
+ cond_resched();
+ }
+
+ mutex_unlock(&kvm->slots_lock);
+
+ if (copy_to_user(u64_to_user_ptr(cmd->data), &region, sizeof(region)))
+ ret = -EFAULT;
+ return ret;
+}
+
int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
{
struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
@@ -1605,6 +1752,9 @@ int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
case KVM_TDX_INIT_VCPU:
ret = tdx_vcpu_init(vcpu, &cmd);
break;
+ case KVM_TDX_INIT_MEM_REGION:
+ ret = tdx_vcpu_init_mem_region(vcpu, &cmd);
+ break;
case KVM_TDX_GET_CPUID:
ret = tdx_vcpu_get_cpuid(vcpu, &cmd);
break;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 73fc3334721d..0822db480719 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2639,6 +2639,7 @@ struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn

return NULL;
}
+EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot);

bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
{
--
2.34.1