Re: [PATCH v10 021/108] KVM: TDX: initialize VM with TDX specific parameters

From: Wang, Lei
Date: Wed Jan 04 2023 - 02:59:24 EST


On 10/30/2022 2:22 PM, isaku.yamahata@xxxxxxxxx wrote:
> From: Xiaoyao Li <xiaoyao.li@xxxxxxxxx>
>
> TDX requires additional parameters for TDX VM for confidential execution to
> protect its confidentiality of its memory contents and its CPU state from
> any other software, including VMM. When creating guest TD VM before
> creating vcpu, the number of vcpu, TSC frequency (that is same among
> vcpus. and it can't be changed.) CPUIDs which is emulated by the TDX
> module. It means guest can trust those CPUIDs. and sha384 values for
> measurement.
>
> Add new subcommand, KVM_TDX_INIT_VM, to pass parameters for TDX guest. It
> assigns encryption key to the TDX guest for memory encryption. TDX
> encrypts memory per-guest bases. It assigns device model passes per-VM
> parameters for the TDX guest. The maximum number of vcpus, tsc frequency
> (TDX guest has fised VM-wide TSC frequency. not per-vcpu. The TDX guest
> can not change it.), attributes (production or debug), available extended
> features (which is reflected into guest XCR0, IA32_XSS MSR), cpuids, sha384
> measurements, and etc.
>
> This subcommand is called before creating vcpu and KVM_SET_CPUID2, i.e.
> cpuids configurations aren't available yet. So CPUIDs configuration values
> needs to be passed in struct kvm_init_vm. It's device model responsibility

I suppose this should be kvm_tdx_init_vm.

> to make this cpuid config for KVM_TDX_INIT_VM and KVM_SET_CPUID2.
>
> Signed-off-by: Xiaoyao Li <xiaoyao.li@xxxxxxxxx>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
> ---
> arch/x86/include/asm/tdx.h | 3 +
> arch/x86/include/uapi/asm/kvm.h | 31 +++
> arch/x86/kvm/vmx/tdx.c | 296 ++++++++++++++++++++++----
> arch/x86/kvm/vmx/tdx.h | 22 ++
> tools/arch/x86/include/uapi/asm/kvm.h | 33 +++
> 5 files changed, 347 insertions(+), 38 deletions(-)
>
> diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
> index cd304d323d33..05ac4bfc8f8a 100644
> --- a/arch/x86/include/asm/tdx.h
> +++ b/arch/x86/include/asm/tdx.h
> @@ -131,6 +131,9 @@ static inline long tdx_kvm_hypercall(unsigned int nr, unsigned long p1,
> #endif /* CONFIG_INTEL_TDX_GUEST && CONFIG_KVM_GUEST */
>
> #ifdef CONFIG_INTEL_TDX_HOST
> +
> +/* -1 indicates CPUID leaf with no sub-leaves. */
> +#define TDX_CPUID_NO_SUBLEAF ((u32)-1)
> struct tdx_cpuid_config {
> u32 leaf;
> u32 sub_leaf;
> diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
> index 2ad9666e02a5..26661879c031 100644
> --- a/arch/x86/include/uapi/asm/kvm.h
> +++ b/arch/x86/include/uapi/asm/kvm.h
> @@ -538,6 +538,7 @@ struct kvm_pmu_event_filter {
> /* Trust Domain eXtension sub-ioctl() commands. */
> enum kvm_tdx_cmd_id {
> KVM_TDX_CAPABILITIES = 0,
> + KVM_TDX_INIT_VM,
>
> KVM_TDX_CMD_NR_MAX,
> };
> @@ -583,4 +584,34 @@ struct kvm_tdx_capabilities {
> struct kvm_tdx_cpuid_config cpuid_configs[0];
> };
>
> +struct kvm_tdx_init_vm {
> + __u64 attributes;
> + __u64 mrconfigid[6]; /* sha384 digest */
> + __u64 mrowner[6]; /* sha384 digest */
> + __u64 mrownerconfig[6]; /* sha348 digest */
> + union {
> + /*
> + * KVM_TDX_INIT_VM is called before vcpu creation, thus before
> + * KVM_SET_CPUID2. CPUID configurations needs to be passed.
> + *
> + * This configuration supersedes KVM_SET_CPUID{,2}.
> + * The user space VMM, e.g. qemu, should make them consistent
> + * with this values.
> + * sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES(256)
> + * = 8KB.
> + */
> + struct {
> + struct kvm_cpuid2 cpuid;
> + /* 8KB with KVM_MAX_CPUID_ENTRIES. */
> + struct kvm_cpuid_entry2 entries[];
> + };
> + /*
> + * For future extensibility.
> + * The size(struct kvm_tdx_init_vm) = 16KB.
> + * This should be enough given sizeof(TD_PARAMS) = 1024
> + */
> + __u64 reserved[2029];
> + };
> +};
> +
> #endif /* _ASM_X86_KVM_H */
> diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
> index d77709a6da51..54045e0576e7 100644
> --- a/arch/x86/kvm/vmx/tdx.c
> +++ b/arch/x86/kvm/vmx/tdx.c
> @@ -284,6 +284,205 @@ static int tdx_do_tdh_mng_key_config(void *param)
> int tdx_vm_init(struct kvm *kvm)
> {
> struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> +
> + kvm_tdx->hkid = -1;
> +
> + /*
> + * This function initializes only KVM software construct. It doesn't
> + * initialize TDX stuff, e.g. TDCS, TDR, TDCX, HKID etc.
> + * It is handled by KVM_TDX_INIT_VM, __tdx_td_init().
> + */
> +
> + return 0;
> +}
> +
> +int tdx_dev_ioctl(void __user *argp)
> +{
> + struct kvm_tdx_capabilities __user *user_caps;
> + struct kvm_tdx_capabilities caps;
> + struct kvm_tdx_cmd cmd;
> +
> + BUILD_BUG_ON(sizeof(struct kvm_tdx_cpuid_config) !=
> + sizeof(struct tdx_cpuid_config));
> +
> + if (copy_from_user(&cmd, argp, sizeof(cmd)))
> + return -EFAULT;
> + if (cmd.flags || cmd.error || cmd.unused)
> + return -EINVAL;
> + /*
> + * Currently only KVM_TDX_CAPABILITIES is defined for system-scoped
> + * mem_enc_ioctl().
> + */
> + if (cmd.id != KVM_TDX_CAPABILITIES)
> + return -EINVAL;
> +
> + user_caps = (void __user *)cmd.data;
> + if (copy_from_user(&caps, user_caps, sizeof(caps)))
> + return -EFAULT;
> +
> + if (caps.nr_cpuid_configs < tdx_caps.nr_cpuid_configs)
> + return -E2BIG;
> +
> + caps = (struct kvm_tdx_capabilities) {
> + .attrs_fixed0 = tdx_caps.attrs_fixed0,
> + .attrs_fixed1 = tdx_caps.attrs_fixed1,
> + .xfam_fixed0 = tdx_caps.xfam_fixed0,
> + .xfam_fixed1 = tdx_caps.xfam_fixed1,
> + .nr_cpuid_configs = tdx_caps.nr_cpuid_configs,
> + .padding = 0,
> + };
> +
> + if (copy_to_user(user_caps, &caps, sizeof(caps)))
> + return -EFAULT;
> + if (copy_to_user(user_caps->cpuid_configs, &tdx_caps.cpuid_configs,
> + tdx_caps.nr_cpuid_configs *
> + sizeof(struct tdx_cpuid_config)))
> + return -EFAULT;
> +
> + return 0;
> +}

tdx_dev_ioctl() is introduced in previous patch with the same code added here,
which means this is just a place change and it will confuse reviewers. Is it
neccesary to do so?

> +
> +/*
> + * cpuid entry lookup in TDX cpuid config way.
> + * The difference is how to specify index(subleaves).
> + * Specify index to TDX_CPUID_NO_SUBLEAF for CPUID leaf with no-subleaves.
> + */
> +static const struct kvm_cpuid_entry2 *tdx_find_cpuid_entry(const struct kvm_cpuid2 *cpuid,
> + u32 function, u32 index)
> +{
> + int i;
> +
> + /* In TDX CPU CONFIG, TDX_CPUID_NO_SUBLEAF means index = 0. */
> + if (index == TDX_CPUID_NO_SUBLEAF)
> + index = 0;
> +
> + for (i = 0; i < cpuid->nent; i++) {
> + const struct kvm_cpuid_entry2 *e = &cpuid->entries[i];
> +
> + if (e->function == function &&
> + (e->index == index ||
> + !(e->flags & KVM_CPUID_FLAG_SIGNIFCANT_INDEX)))
> + return e;
> + }
> + return NULL;
> +}
> +
> +static int setup_tdparams(struct kvm *kvm, struct td_params *td_params,
> + struct kvm_tdx_init_vm *init_vm)

Indentation is broken here, please fix it.

> +{
> + const struct kvm_cpuid2 *cpuid = &init_vm->cpuid;
> + const struct kvm_cpuid_entry2 *entry;
> + u64 guest_supported_xcr0;
> + u64 guest_supported_xss;
> + int max_pa;
> + int i;
> +
> + if (kvm->created_vcpus)
> + return -EBUSY;
> + td_params->max_vcpus = kvm->max_vcpus;
> + td_params->attributes = init_vm->attributes;
> + if (td_params->attributes & TDX_TD_ATTRIBUTE_PERFMON) {
> + /*
> + * TODO: save/restore PMU related registers around TDENTER.
> + * Once it's done, remove this guard.
> + */
> + pr_warn("TD doesn't support perfmon yet. KVM needs to save/restore "
> + "host perf registers properly.\n");
> + return -EOPNOTSUPP;
> + }
> +
> + for (i = 0; i < tdx_caps.nr_cpuid_configs; i++) {
> + const struct tdx_cpuid_config *config = &tdx_caps.cpuid_configs[i];
> + const struct kvm_cpuid_entry2 *entry =
> + tdx_find_cpuid_entry(cpuid, config->leaf, config->sub_leaf);
> + struct tdx_cpuid_value *value = &td_params->cpuid_values[i];
> +
> + if (!entry)
> + continue;
> +
> + value->eax = entry->eax & config->eax;
> + value->ebx = entry->ebx & config->ebx;
> + value->ecx = entry->ecx & config->ecx;
> + value->edx = entry->edx & config->edx;
> + }
> +
> + max_pa = 36;
> + entry = tdx_find_cpuid_entry(cpuid, 0x80000008, 0);
> + if (entry)
> + max_pa = entry->eax & 0xff;
> +
> + td_params->eptp_controls = VMX_EPTP_MT_WB;
> + /*
> + * No CPU supports 4-level && max_pa > 48.
> + * "5-level paging and 5-level EPT" section 4.1 4-level EPT
> + * "4-level EPT is limited to translating 48-bit guest-physical
> + * addresses."
> + * cpu_has_vmx_ept_5levels() check is just in case.
> + */
> + if (cpu_has_vmx_ept_5levels() && max_pa > 48) {
> + td_params->eptp_controls |= VMX_EPTP_PWL_5;
> + td_params->exec_controls |= TDX_EXEC_CONTROL_MAX_GPAW;
> + } else {
> + td_params->eptp_controls |= VMX_EPTP_PWL_4;
> + }
> +
> + /* Setup td_params.xfam */
> + entry = tdx_find_cpuid_entry(cpuid, 0xd, 0);
> + if (entry)
> + guest_supported_xcr0 = (entry->eax | ((u64)entry->edx << 32));
> + else
> + guest_supported_xcr0 = 0;
> + guest_supported_xcr0 &= kvm_caps.supported_xcr0;
> +
> + entry = tdx_find_cpuid_entry(cpuid, 0xd, 1);
> + if (entry)
> + guest_supported_xss = (entry->ecx | ((u64)entry->edx << 32));
> + else
> + guest_supported_xss = 0;
> + /* PT can be exposed to TD guest regardless of KVM's XSS support */
> + guest_supported_xss &= (kvm_caps.supported_xss | XFEATURE_MASK_PT);
> +
> + td_params->xfam = guest_supported_xcr0 | guest_supported_xss;
> + if (td_params->xfam & XFEATURE_MASK_LBR) {
> + /*
> + * TODO: once KVM supports LBR(save/restore LBR related
> + * registers around TDENTER), remove this guard.
> + */
> + pr_warn("TD doesn't support LBR yet. KVM needs to save/restore "
> + "IA32_LBR_DEPTH properly.\n");
> + return -EOPNOTSUPP;
> + }
> +
> + if (td_params->xfam & XFEATURE_MASK_XTILE) {
> + /*
> + * TODO: once KVM supports AMX(save/restore AMX related
> + * registers around TDENTER), remove this guard.
> + */
> + pr_warn("TD doesn't support AMX yet. KVM needs to save/restore "
> + "IA32_XFD, IA32_XFD_ERR properly.\n");
> + return -EOPNOTSUPP;
> + }
> +
> + td_params->tsc_frequency =
> + TDX_TSC_KHZ_TO_25MHZ(kvm->arch.default_tsc_khz);
> +
> +#define MEMCPY_SAME_SIZE(dst, src) \
> + do { \
> + BUILD_BUG_ON(sizeof(dst) != sizeof(src)); \
> + memcpy((dst), (src), sizeof(dst)); \
> + } while (0)
> +
> + MEMCPY_SAME_SIZE(td_params->mrconfigid, init_vm->mrconfigid);
> + MEMCPY_SAME_SIZE(td_params->mrowner, init_vm->mrowner);
> + MEMCPY_SAME_SIZE(td_params->mrownerconfig, init_vm->mrownerconfig);
> +
> + return 0;
> +}
> +
> +static int __tdx_td_init(struct kvm *kvm, struct td_params *td_params)
> +{
> + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> + struct tdx_module_output out;
> cpumask_var_t packages;
> int ret, i;
> u64 err;
> @@ -390,10 +589,13 @@ int tdx_vm_init(struct kvm *kvm)
> tdx_mark_td_page_added(&kvm_tdx->tdcs[i]);
> }
>
> - /*
> - * Note, TDH_MNG_INIT cannot be invoked here. TDH_MNG_INIT requires a dedicated
> - * ioctl() to define the configure CPUID values for the TD.
> - */
> + err = tdh_mng_init(kvm_tdx->tdr.pa, __pa(td_params), &out);
> + if (WARN_ON_ONCE(err)) {
> + pr_tdx_error(TDH_MNG_INIT, err, &out);
> + ret = -EIO;
> + goto teardown;
> + }
> +
> return 0;
>
> /*
> @@ -431,50 +633,65 @@ int tdx_vm_init(struct kvm *kvm)
> return ret;
> }
>
> -int tdx_dev_ioctl(void __user *argp)
> +static int tdx_td_init(struct kvm *kvm, struct kvm_tdx_cmd *cmd)
> {
> - struct kvm_tdx_capabilities __user *user_caps;
> - struct kvm_tdx_capabilities caps;
> - struct kvm_tdx_cmd cmd;
> + struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> + struct kvm_tdx_init_vm *init_vm = NULL;
> + struct td_params *td_params = NULL;
> + void *entries_end;
> + int ret;
>
> - BUILD_BUG_ON(sizeof(struct kvm_tdx_cpuid_config) !=
> - sizeof(struct tdx_cpuid_config));
> + BUILD_BUG_ON(sizeof(*init_vm) != 16 * 1024);
> + BUILD_BUG_ON((sizeof(*init_vm) - offsetof(typeof(*init_vm), entries)) /
> + sizeof(init_vm->entries[0]) < KVM_MAX_CPUID_ENTRIES);
> + BUILD_BUG_ON(sizeof(struct td_params) != 1024);
>
> - if (copy_from_user(&cmd, argp, sizeof(cmd)))
> - return -EFAULT;
> - if (cmd.flags || cmd.error || cmd.unused)
> + if (is_td_initialized(kvm))
> return -EINVAL;
> - /*
> - * Currently only KVM_TDX_CAPABILITIES is defined for system-scoped
> - * mem_enc_ioctl().
> - */
> - if (cmd.id != KVM_TDX_CAPABILITIES)
> +
> + if (cmd->flags)
> return -EINVAL;
>
> - user_caps = (void __user *)cmd.data;
> - if (copy_from_user(&caps, user_caps, sizeof(caps)))
> - return -EFAULT;
> + init_vm = kzalloc(sizeof(*init_vm), GFP_KERNEL);
> + if (copy_from_user(init_vm, (void __user *)cmd->data, sizeof(*init_vm))) {
> + ret = -EFAULT;
> + goto out;
> + }
>
> - if (caps.nr_cpuid_configs < tdx_caps.nr_cpuid_configs)
> - return -E2BIG;
> + ret = -EINVAL;
> + if (init_vm->cpuid.padding)
> + goto out;
> + /* init_vm->entries shouldn't overrun. */
> + entries_end = init_vm->entries + init_vm->cpuid.nent;
> + if (entries_end > (void *)(init_vm + 1))
> + goto out;
> + /* Unused part must be zero. */
> + if (memchr_inv(entries_end, 0, (void *)(init_vm + 1) - entries_end))
> + goto out;
>
> - caps = (struct kvm_tdx_capabilities) {
> - .attrs_fixed0 = tdx_caps.attrs_fixed0,
> - .attrs_fixed1 = tdx_caps.attrs_fixed1,
> - .xfam_fixed0 = tdx_caps.xfam_fixed0,
> - .xfam_fixed1 = tdx_caps.xfam_fixed1,
> - .nr_cpuid_configs = tdx_caps.nr_cpuid_configs,
> - .padding = 0,
> - };
> + td_params = kzalloc(sizeof(struct td_params), GFP_KERNEL);
> + if (!td_params) {
> + ret = -ENOMEM;
> + goto out;
> + }
>
> - if (copy_to_user(user_caps, &caps, sizeof(caps)))
> - return -EFAULT;
> - if (copy_to_user(user_caps->cpuid_configs, &tdx_caps.cpuid_configs,
> - tdx_caps.nr_cpuid_configs *
> - sizeof(struct tdx_cpuid_config)))
> - return -EFAULT;
> + ret = setup_tdparams(kvm, td_params, init_vm);
> + if (ret)
> + goto out;
>
> - return 0;
> + ret = __tdx_td_init(kvm, td_params);
> + if (ret)
> + goto out;
> +
> + kvm_tdx->tsc_offset = td_tdcs_exec_read64(kvm_tdx, TD_TDCS_EXEC_TSC_OFFSET);
> + kvm_tdx->attributes = td_params->attributes;
> + kvm_tdx->xfam = td_params->xfam;
> +
> +out:
> + /* kfree() accepts NULL. */
> + kfree(init_vm);
> + kfree(td_params);
> + return ret;
> }
>
> int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
> @@ -490,6 +707,9 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
> mutex_lock(&kvm->lock);
>
> switch (tdx_cmd.id) {
> + case KVM_TDX_INIT_VM:
> + r = tdx_td_init(kvm, &tdx_cmd);
> + break;
> default:
> r = -EINVAL;
> goto out;
> diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
> index 938314635b47..ff0ea9cad347 100644
> --- a/arch/x86/kvm/vmx/tdx.h
> +++ b/arch/x86/kvm/vmx/tdx.h
> @@ -18,7 +18,11 @@ struct kvm_tdx {
> struct tdx_td_page tdr;
> struct tdx_td_page *tdcs;
>
> + u64 attributes;
> + u64 xfam;
> int hkid;
> +
> + u64 tsc_offset;
> };
>
> struct vcpu_tdx {
> @@ -48,6 +52,11 @@ static inline struct vcpu_tdx *to_tdx(struct kvm_vcpu *vcpu)
> return container_of(vcpu, struct vcpu_tdx, vcpu);
> }
>
> +static inline bool is_td_initialized(struct kvm *kvm)
> +{
> + return to_kvm_tdx(kvm)->hkid > 0;
> +}

There is a similar function, is_hkid_assigned(), which is previously defined. Do
you think redefining a new function here will bring code redundency?

> +
> static __always_inline void tdvps_vmcs_check(u32 field, u8 bits)
> {
> #define VMCS_ENC_ACCESS_TYPE_MASK 0x1UL
> @@ -148,6 +157,19 @@ TDX_BUILD_TDVPS_ACCESSORS(64, VMCS, vmcs);
> TDX_BUILD_TDVPS_ACCESSORS(64, STATE_NON_ARCH, state_non_arch);
> TDX_BUILD_TDVPS_ACCESSORS(8, MANAGEMENT, management);
>
> +static __always_inline u64 td_tdcs_exec_read64(struct kvm_tdx *kvm_tdx, u32 field)
> +{
> + struct tdx_module_output out;
> + u64 err;
> +
> + err = tdh_mng_rd(kvm_tdx->tdr.pa, TDCS_EXEC(field), &out);
> + if (unlikely(err)) {
> + pr_err("TDH_MNG_RD[EXEC.0x%x] failed: 0x%llx\n", field, err);
> + return 0;
> + }
> + return out.r8;
> +}
> +
> #else
> struct kvm_tdx {
> struct kvm kvm;
> diff --git a/tools/arch/x86/include/uapi/asm/kvm.h b/tools/arch/x86/include/uapi/asm/kvm.h
> index 2ad9666e02a5..531a0033e530 100644
> --- a/tools/arch/x86/include/uapi/asm/kvm.h
> +++ b/tools/arch/x86/include/uapi/asm/kvm.h
> @@ -538,6 +538,7 @@ struct kvm_pmu_event_filter {
> /* Trust Domain eXtension sub-ioctl() commands. */
> enum kvm_tdx_cmd_id {
> KVM_TDX_CAPABILITIES = 0,
> + KVM_TDX_INIT_VM,
>
> KVM_TDX_CMD_NR_MAX,
> };
> @@ -583,4 +584,36 @@ struct kvm_tdx_capabilities {
> struct kvm_tdx_cpuid_config cpuid_configs[0];
> };
>
> +struct kvm_tdx_init_vm {
> + __u64 attributes;
> + __u32 max_vcpus;
> + __u32 padding;
> + __u64 mrconfigid[6]; /* sha384 digest */
> + __u64 mrowner[6]; /* sha384 digest */
> + __u64 mrownerconfig[6]; /* sha348 digest */
> + union {
> + /*
> + * KVM_TDX_INIT_VM is called before vcpu creation, thus before
> + * KVM_SET_CPUID2. CPUID configurations needs to be passed.
> + *
> + * This configuration supersedes KVM_SET_CPUID{,2}.
> + * The user space VMM, e.g. qemu, should make them consistent
> + * with this values.
> + * sizeof(struct kvm_cpuid_entry2) * KVM_MAX_CPUID_ENTRIES(256)
> + * = 8KB.
> + */
> + struct {
> + struct kvm_cpuid2 cpuid;
> + /* 8KB with KVM_MAX_CPUID_ENTRIES. */
> + struct kvm_cpuid_entry2 entries[];
> + };
> + /*
> + * For future extensibility.
> + * The size(struct kvm_tdx_init_vm) = 16KB.
> + * This should be enough given sizeof(TD_PARAMS) = 1024
> + */
> + __u64 reserved[2028];
> + };
> +};
> +
> #endif /* _ASM_X86_KVM_H */