Re: [PATCH v19 044/130] KVM: TDX: Do TDX specific vcpu initialization

From: Chao Gao
Date: Thu Mar 21 2024 - 01:43:45 EST


>+/* VMM can pass one 64bit auxiliary data to vcpu via RCX for guest BIOS. */
>+static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
>+{
>+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
>+ struct vcpu_tdx *tdx = to_tdx(vcpu);
>+ unsigned long *tdvpx_pa = NULL;
>+ unsigned long tdvpr_pa;
>+ unsigned long va;
>+ int ret, i;
>+ u64 err;
>+
>+ if (is_td_vcpu_created(tdx))
>+ return -EINVAL;
>+
>+ /*
>+ * vcpu_free method frees allocated pages. Avoid partial setup so
>+ * that the method can't handle it.
>+ */
>+ va = __get_free_page(GFP_KERNEL_ACCOUNT);
>+ if (!va)
>+ return -ENOMEM;
>+ tdvpr_pa = __pa(va);
>+
>+ tdvpx_pa = kcalloc(tdx_info->nr_tdvpx_pages, sizeof(*tdx->tdvpx_pa),
>+ GFP_KERNEL_ACCOUNT);
>+ if (!tdvpx_pa) {
>+ ret = -ENOMEM;
>+ goto free_tdvpr;
>+ }
>+ for (i = 0; i < tdx_info->nr_tdvpx_pages; i++) {
>+ va = __get_free_page(GFP_KERNEL_ACCOUNT);
>+ if (!va) {
>+ ret = -ENOMEM;
>+ goto free_tdvpx;
>+ }
>+ tdvpx_pa[i] = __pa(va);
>+ }
>+
>+ err = tdh_vp_create(kvm_tdx->tdr_pa, tdvpr_pa);
>+ if (KVM_BUG_ON(err, vcpu->kvm)) {
>+ ret = -EIO;
>+ pr_tdx_error(TDH_VP_CREATE, err, NULL);
>+ goto free_tdvpx;
>+ }
>+ tdx->tdvpr_pa = tdvpr_pa;
>+
>+ tdx->tdvpx_pa = tdvpx_pa;
>+ for (i = 0; i < tdx_info->nr_tdvpx_pages; i++) {

Can you merge the for-loop above into this one? then ...

>+ err = tdh_vp_addcx(tdx->tdvpr_pa, tdvpx_pa[i]);
>+ if (KVM_BUG_ON(err, vcpu->kvm)) {
>+ pr_tdx_error(TDH_VP_ADDCX, err, NULL);

>+ for (; i < tdx_info->nr_tdvpx_pages; i++) {
>+ free_page((unsigned long)__va(tdvpx_pa[i]));
>+ tdvpx_pa[i] = 0;
>+ }

.. no need to free remaining pages.

>+ /* vcpu_free method frees TDVPX and TDR donated to TDX */
>+ return -EIO;
>+ }
>+ }
>+
>+ err = tdh_vp_init(tdx->tdvpr_pa, vcpu_rcx);
>+ if (KVM_BUG_ON(err, vcpu->kvm)) {
>+ pr_tdx_error(TDH_VP_INIT, err, NULL);
>+ return -EIO;
>+ }
>+
>+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
>+ tdx->td_vcpu_created = true;
>+ return 0;
>+
>+free_tdvpx:
>+ for (i = 0; i < tdx_info->nr_tdvpx_pages; i++) {
>+ if (tdvpx_pa[i])
>+ free_page((unsigned long)__va(tdvpx_pa[i]));
>+ tdvpx_pa[i] = 0;
>+ }
>+ kfree(tdvpx_pa);
>+ tdx->tdvpx_pa = NULL;
>+free_tdvpr:
>+ if (tdvpr_pa)
>+ free_page((unsigned long)__va(tdvpr_pa));
>+ tdx->tdvpr_pa = 0;
>+
>+ return ret;
>+}
>+
>+int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
>+{
>+ struct msr_data apic_base_msr;
>+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
>+ struct vcpu_tdx *tdx = to_tdx(vcpu);
>+ struct kvm_tdx_cmd cmd;
>+ int ret;
>+
>+ if (tdx->initialized)
>+ return -EINVAL;
>+
>+ if (!is_hkid_assigned(kvm_tdx) || is_td_finalized(kvm_tdx))

These checks look random e.g., I am not sure why is_td_created() isn't check here.

A few helper functions and boolean variables are added to track which stage the
TD or TD vCPU is in. e.g.,

is_hkid_assigned()
is_td_finalized()
is_td_created()
tdx->initialized
td_vcpu_created

Insteading of doing this, I am wondering if adding two state machines for
TD and TD vCPU would make the implementation clear and easy to extend.

>+ return -EINVAL;
>+
>+ if (copy_from_user(&cmd, argp, sizeof(cmd)))
>+ return -EFAULT;
>+
>+ if (cmd.error)
>+ return -EINVAL;
>+
>+ /* Currently only KVM_TDX_INTI_VCPU is defined for vcpu operation. */
>+ if (cmd.flags || cmd.id != KVM_TDX_INIT_VCPU)
>+ return -EINVAL;

Even though KVM_TD_INIT_VCPU is the only supported command, it is worthwhile to
use a switch-case statement. New commands can be added easily without the need
to refactor this function first.

>+
>+ /*
>+ * As TDX requires X2APIC, set local apic mode to X2APIC. User space
>+ * VMM, e.g. qemu, is required to set CPUID[0x1].ecx.X2APIC=1 by
>+ * KVM_SET_CPUID2. Otherwise kvm_set_apic_base() will fail.
>+ */
>+ apic_base_msr = (struct msr_data) {
>+ .host_initiated = true,
>+ .data = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC |
>+ (kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0),
>+ };
>+ if (kvm_set_apic_base(vcpu, &apic_base_msr))
>+ return -EINVAL;

Exporting kvm_vcpu_is_reset_bsp() and kvm_set_apic_base() should be done
here (rather than in a previous patch).

>+
>+ ret = tdx_td_vcpu_init(vcpu, (u64)cmd.data);
>+ if (ret)
>+ return ret;
>+
>+ tdx->initialized = true;
>+ return 0;
>+}
>+

>diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>index c002761bb662..2bd4b7c8fa51 100644
>--- a/arch/x86/kvm/x86.c
>+++ b/arch/x86/kvm/x86.c
>@@ -6274,6 +6274,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
> case KVM_SET_DEVICE_ATTR:
> r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp);
> break;
>+ case KVM_MEMORY_ENCRYPT_OP:
>+ r = -ENOTTY;

Maybe -EINVAL is better. Because previously trying to call this on vCPU fd
failed with -EINVAL given ...

>+ if (!kvm_x86_ops.vcpu_mem_enc_ioctl)
>+ goto out;
>+ r = kvm_x86_ops.vcpu_mem_enc_ioctl(vcpu, argp);
>+ break;
> default:
> r = -EINVAL;

.. this.

> }
>--
>2.25.1
>
>