Re: [PATCH v19 087/130] KVM: TDX: handle vcpu migration over logical processor

From: Reinette Chatre
Date: Fri Apr 12 2024 - 12:16:18 EST


Hi Isaku,

On 2/26/2024 12:26 AM, isaku.yamahata@xxxxxxxxx wrote:
> From: Isaku Yamahata <isaku.yamahata@xxxxxxxxx>

..

> @@ -218,6 +257,87 @@ static void tdx_reclaim_control_page(unsigned long td_page_pa)
> free_page((unsigned long)__va(td_page_pa));
> }
>
> +struct tdx_flush_vp_arg {
> + struct kvm_vcpu *vcpu;
> + u64 err;
> +};
> +
> +static void tdx_flush_vp(void *arg_)
> +{
> + struct tdx_flush_vp_arg *arg = arg_;
> + struct kvm_vcpu *vcpu = arg->vcpu;
> + u64 err;
> +
> + arg->err = 0;
> + lockdep_assert_irqs_disabled();
> +
> + /* Task migration can race with CPU offlining. */
> + if (unlikely(vcpu->cpu != raw_smp_processor_id()))
> + return;
> +
> + /*
> + * No need to do TDH_VP_FLUSH if the vCPU hasn't been initialized. The
> + * list tracking still needs to be updated so that it's correct if/when
> + * the vCPU does get initialized.
> + */
> + if (is_td_vcpu_created(to_tdx(vcpu))) {
> + /*
> + * No need to retry. TDX Resources needed for TDH.VP.FLUSH are,
> + * TDVPR as exclusive, TDR as shared, and TDCS as shared. This
> + * vp flush function is called when destructing vcpu/TD or vcpu
> + * migration. No other thread uses TDVPR in those cases.
> + */

(I have comment later that refer back to this comment about needing retry.)

..

> @@ -233,26 +353,31 @@ static void tdx_do_tdh_phymem_cache_wb(void *unused)
> pr_tdx_error(TDH_PHYMEM_CACHE_WB, err, NULL);
> }
>
> -void tdx_mmu_release_hkid(struct kvm *kvm)
> +static int __tdx_mmu_release_hkid(struct kvm *kvm)
> {
> bool packages_allocated, targets_allocated;
> struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);
> cpumask_var_t packages, targets;
> + struct kvm_vcpu *vcpu;
> + unsigned long j;
> + int i, ret = 0;
> u64 err;
> - int i;
>
> if (!is_hkid_assigned(kvm_tdx))
> - return;
> + return 0;
>
> if (!is_td_created(kvm_tdx)) {
> tdx_hkid_free(kvm_tdx);
> - return;
> + return 0;
> }
>
> packages_allocated = zalloc_cpumask_var(&packages, GFP_KERNEL);
> targets_allocated = zalloc_cpumask_var(&targets, GFP_KERNEL);
> cpus_read_lock();
>
> + kvm_for_each_vcpu(j, vcpu, kvm)
> + tdx_flush_vp_on_cpu(vcpu);
> +
> /*
> * We can destroy multiple guest TDs simultaneously. Prevent
> * tdh_phymem_cache_wb from returning TDX_BUSY by serialization.
> @@ -270,6 +395,19 @@ void tdx_mmu_release_hkid(struct kvm *kvm)
> */
> write_lock(&kvm->mmu_lock);
>
> + err = tdh_mng_vpflushdone(kvm_tdx->tdr_pa);
> + if (err == TDX_FLUSHVP_NOT_DONE) {
> + ret = -EBUSY;
> + goto out;
> + }
> + if (WARN_ON_ONCE(err)) {
> + pr_tdx_error(TDH_MNG_VPFLUSHDONE, err, NULL);
> + pr_err("tdh_mng_vpflushdone() failed. HKID %d is leaked.\n",
> + kvm_tdx->hkid);
> + ret = -EIO;
> + goto out;
> + }
> +
> for_each_online_cpu(i) {
> if (packages_allocated &&
> cpumask_test_and_set_cpu(topology_physical_package_id(i),
> @@ -291,14 +429,24 @@ void tdx_mmu_release_hkid(struct kvm *kvm)
> pr_tdx_error(TDH_MNG_KEY_FREEID, err, NULL);
> pr_err("tdh_mng_key_freeid() failed. HKID %d is leaked.\n",
> kvm_tdx->hkid);
> + ret = -EIO;
> } else
> tdx_hkid_free(kvm_tdx);
>
> +out:
> write_unlock(&kvm->mmu_lock);
> mutex_unlock(&tdx_lock);
> cpus_read_unlock();
> free_cpumask_var(targets);
> free_cpumask_var(packages);
> +
> + return ret;
> +}
> +
> +void tdx_mmu_release_hkid(struct kvm *kvm)
> +{
> + while (__tdx_mmu_release_hkid(kvm) == -EBUSY)
> + ;
> }

As I understand, __tdx_mmu_release_hkid() returns -EBUSY
after TDH.VP.FLUSH has been sent for every vCPU followed by
TDH.MNG.VPFLUSHDONE, which returns TDX_FLUSHVP_NOT_DONE.

Considering earlier comment that a retry of TDH.VP.FLUSH is not
needed, why is this while() loop here that sends the
TDH.VP.FLUSH again to all vCPUs instead of just a loop within
__tdx_mmu_release_hkid() to _just_ resend TDH.MNG.VPFLUSHDONE?

Could it be possible for a vCPU to appear during this time, thus
be missed in one TDH.VP.FLUSH cycle, to require a new cycle of
TDH.VP.FLUSH?

I note that TDX_FLUSHVP_NOT_DONE is distinct from TDX_OPERAND_BUSY
that can also be returned from TDH.MNG.VPFLUSHDONE and
wonder if a retry may be needed in that case also/instead? It looks like
TDH.MNG.VPFLUSHDONE needs exclusive access to all operands and I
do not know enough yet if this is the case here.

Reinette