Re: [PATCH v17 071/116] KVM: TDX: handle vcpu migration over logical processor

From: Yuan Yao
Date: Wed Nov 15 2023 - 01:50:11 EST


On Tue, Nov 07, 2023 at 06:56:37AM -0800, isaku.yamahata@xxxxxxxxx wrote:
> From: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
>
> For vcpu migration, in the case of VMX, VMCS is flushed on the source pcpu,
> and load it on the target pcpu. There are corresponding TDX SEAMCALL APIs,
> call them on vcpu migration. The logic is mostly same as VMX except the
> TDX SEAMCALLs are used.
>
> When shutting down the machine, (VMX or TDX) vcpus needs to be shutdown on
> each pcpu. Do the similar for TDX with TDX SEAMCALL APIs.
>
> Signed-off-by: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>
> ---
> arch/x86/kvm/vmx/main.c | 32 ++++++-
> arch/x86/kvm/vmx/tdx.c | 190 ++++++++++++++++++++++++++++++++++++-
> arch/x86/kvm/vmx/tdx.h | 2 +
> arch/x86/kvm/vmx/x86_ops.h | 4 +
> 4 files changed, 221 insertions(+), 7 deletions(-)
>
> diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
> index e7c570686736..8b109d0fe764 100644
> --- a/arch/x86/kvm/vmx/main.c
> +++ b/arch/x86/kvm/vmx/main.c
> @@ -44,6 +44,14 @@ static int vt_hardware_enable(void)
> return ret;
> }
>
......
> -void tdx_mmu_release_hkid(struct kvm *kvm)
> +static int __tdx_mmu_release_hkid(struct kvm *kvm)
> {
> bool packages_allocated, targets_allocated;
> struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> cpumask_var_t packages, targets;
> + struct kvm_vcpu *vcpu;
> + unsigned long j;
> + int i, ret = 0;
> u64 err;
> - int i;
>
> if (!is_hkid_assigned(kvm_tdx))
> - return;
> + return 0;
>
> if (!is_td_created(kvm_tdx)) {
> tdx_hkid_free(kvm_tdx);
> - return;
> + return 0;
> }
>
> packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
> targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
> cpus_read_lock();
>
> + kvm_for_each_vcpu(j, vcpu, kvm)
> + tdx_flush_vp_on_cpu(vcpu);
> +
> /*
> * We can destroy multiple the guest TDs simultaneously. Prevent
> * tdh_phymem_cache_wb from returning TDX_BUSY by serialization.
> @@ -236,6 +361,19 @@ void tdx_mmu_release_hkid(struct kvm *kvm)
> */
> write_lock(&kvm->mmu_lock);
>
> + err = tdh_mng_vpflushdone(kvm_tdx->tdr_pa);
> + if (err == TDX_FLUSHVP_NOT_DONE) {

Not sure IIUC, The __tdx_mmu_release_hkid() is called in MMU release
callback, which means all threads of the process have dropped mm by
do_exit() so they won't run kvm code anymore, and tdx_flush_vp_on_cpu()
is called for each pcpu they run last time, so will this error really
happen ?

> + ret = -EBUSY;
> + goto out;
> + }
> + if (WARN_ON_ONCE(err)) {
> + pr_tdx_error(TDH_MNG_VPFLUSHDONE, err, NULL);
> + pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
> + kvm_tdx->hkid);
> + ret = -EIO;
> + goto out;
> + }
> +
> for_each_online_cpu(i) {
> if (packages_allocated &&
> cpumask_test_and_set_cpu(topology_physical_package_id(i),
> @@ -258,14 +396,24 @@ void tdx_mmu_release_hkid(struct kvm *kvm)
> pr_tdx_error(TDH_MNG_KEY_FREEID, err, NULL);
> pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
> kvm_tdx->hkid);
> + ret = -EIO;
> } else
> tdx_hkid_free(kvm_tdx);
>
> +out:
> write_unlock(&kvm->mmu_lock);
> mutex_unlock(&tdx_lock);
> cpus_read_unlock();
> free_cpumask_var(targets);
> free_cpumask_var(packages);
> +
> + return ret;
> +}
> +
> +void tdx_mmu_release_hkid(struct kvm *kvm)
> +{
> + while (__tdx_mmu_release_hkid(kvm) == -EBUSY)
> + ;
> }
>
> void tdx_vm_free(struct kvm *kvm)
> @@ -429,6 +577,26 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu)
> return 0;
> }
>
> +void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> +{
> + struct vcpu_tdx *tdx = to_tdx(vcpu);
> +
> + if (vcpu->cpu == cpu)
> + return;
> +
> + tdx_flush_vp_on_cpu(vcpu);
> +
> + local_irq_disable();
> + /*
> + * Pairs with the smp_wmb() in tdx_disassociate_vp() to ensure
> + * vcpu->cpu is read before tdx->cpu_list.
> + */
> + smp_rmb();
> +
> + list_add(&tdx->cpu_list, &per_cpu(associated_tdvcpus, cpu));
> + local_irq_enable();
> +}
> +
> void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
> {
> struct vcpu_tdx *tdx = to_tdx(vcpu);
> @@ -469,6 +637,16 @@ void tdx_vcpu_free(struct kvm_vcpu *vcpu)
> struct vcpu_tdx *tdx = to_tdx(vcpu);
> int i;
>
> + /*
> + * When destroying VM, kvm_unload_vcpu_mmu() calls vcpu_load() for every
> + * vcpu after they already disassociated from the per cpu list by
> + * tdx_mmu_release_hkid(). So we need to disassociate them again,
> + * otherwise the freed vcpu data will be accessed when do
> + * list_{del,add}() on associated_tdvcpus list later.
> + */
> + tdx_disassociate_vp_on_cpu(vcpu);
> + WARN_ON_ONCE(vcpu->cpu != -1);
> +
> /*
> * This methods can be called when vcpu allocation/initialization
> * failed. So it's possible that hkid, tdvpx and tdvpr are not assigned
> @@ -1873,6 +2051,10 @@ int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops)
> return -EINVAL;
> }
>
> + /* tdx_hardware_disable() uses associated_tdvcpus. */
> + for_each_possible_cpu(i)
> + INIT_LIST_HEAD(&per_cpu(associated_tdvcpus, i));
> +
> for (i = 0; i < ARRAY_SIZE(tdx_uret_msrs); i++) {
> /*
> * Here it checks if MSRs (tdx_uret_msrs) can be saved/restored
> diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
> index c700792c08e2..4f803814126a 100644
> --- a/arch/x86/kvm/vmx/tdx.h
> +++ b/arch/x86/kvm/vmx/tdx.h
> @@ -70,6 +70,8 @@ struct vcpu_tdx {
> unsigned long tdvpr_pa;
> unsigned long *tdvpx_pa;
>
> + struct list_head cpu_list;
> +
> union tdx_exit_reason exit_reason;
>
> bool initialized;
> diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
> index 4c9793b5b30d..911ef1e8eeda 100644
> --- a/arch/x86/kvm/vmx/x86_ops.h
> +++ b/arch/x86/kvm/vmx/x86_ops.h
> @@ -137,6 +137,7 @@ void vmx_setup_mce(struct kvm_vcpu *vcpu);
> #ifdef CONFIG_INTEL_TDX_HOST
> int __init tdx_hardware_setup(struct kvm_x86_ops *x86_ops);
> void tdx_hardware_unsetup(void);
> +void tdx_hardware_disable(void);
> bool tdx_is_vm_type_supported(unsigned long type);
> int tdx_offline_cpu(void);
>
> @@ -153,6 +154,7 @@ void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
> fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu);
> void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu);
> void tdx_vcpu_put(struct kvm_vcpu *vcpu);
> +void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
> u8 tdx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
>
> int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
> @@ -164,6 +166,7 @@ void tdx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level);
> #else
> static inline int tdx_hardware_setup(struct kvm_x86_ops *x86_ops) { return -EOPNOTSUPP; }
> static inline void tdx_hardware_unsetup(void) {}
> +static inline void tdx_hardware_disable(void) {}
> static inline bool tdx_is_vm_type_supported(unsigned long type) { return false; }
> static inline int tdx_offline_cpu(void) { return 0; }
>
> @@ -183,6 +186,7 @@ static inline void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) {}
> static inline fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu) { return EXIT_FASTPATH_NONE; }
> static inline void tdx_prepare_switch_to_guest(struct kvm_vcpu *vcpu) {}
> static inline void tdx_vcpu_put(struct kvm_vcpu *vcpu) {}
> +static inline void tdx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) {}
> static inline u8 tdx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) { return 0; }
>
> static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { return -EOPNOTSUPP; }
> --
> 2.25.1
>
>