Re: [PATCH v2 1/2] KVM: x86: Add KVM_[GS]ET_CLOCK_GUEST for accurate KVM clock migration
From: David Woodhouse
Date: Mon Apr 15 2024 - 03:17:19 EST
On Wed, 2024-04-10 at 09:52 +0000, Jack Allister wrote:
>
> +static int kvm_vcpu_ioctl_get_clock_guest(struct kvm_vcpu *v, void __user *argp)
> +{
> + struct pvclock_vcpu_time_info *vcpu_pvti = &v->arch.hv_clock;
> + struct pvclock_vcpu_time_info local_pvti = { 0 };
> + struct kvm_arch *ka = &v->kvm->arch;
> + uint64_t host_tsc, guest_tsc;
> + bool use_master_clock;
> + uint64_t kernel_ns;
> + unsigned int seq;
> +
> + /*
> + * CLOCK_MONOTONIC_RAW is not suitable for GET/SET API,
> + * see kvm_vcpu_ioctl_set_clock_guest equivalent comment.
> + */
> + if (!static_cpu_has(X86_FEATURE_CONSTANT_TSC))
> + return -EINVAL;
> +
> + /*
> + * Querying needs to be performed in a seqcount loop as it's possible
> + * another vCPU has triggered an update of the master clock. If so we
> + * should store the host TSC & time at this point.
> + */
> + do {
> + seq = read_seqcount_begin(&ka->pvclock_sc);
> + use_master_clock = ka->use_master_clock;
> + if (use_master_clock) {
> + host_tsc = ka->master_cycle_now;
> + kernel_ns = ka->master_kernel_ns;
> + }
> + } while (read_seqcount_retry(&ka->pvclock_sc, seq));
> +
> + if (!use_master_clock)
> + return -EINVAL;
> +
> + /*
> + * It's possible that this vCPU doesn't have a HVCLOCK configured
> + * but the other vCPUs may. If this is the case calculate based
> + * upon the time gathered in the seqcount but do not update the
> + * vCPU specific PVTI. If we have one, then use that.
> + */
> + if (!vcpu_pvti->tsc_timestamp && !vcpu_pvti->system_time) {
|| !kvm_check_request(KVM_REQ_CLOCK_UPDATE, v)
Otherwise you may be using out-of-date information.
> + guest_tsc = kvm_read_l1_tsc(v, host_tsc);
> +
> + local_pvti.tsc_timestamp = guest_tsc;
> + local_pvti.system_time = kernel_ns + ka->kvmclock_offset;
This is missing the scale information in tsc_to_system_mul and
tsc_shift. Is there a reason we can't just call kvm_guest_time_update()
from here? (I think we looked at using it for the *SET* function, but
did we look at doing so for GET?)
> + } else {
> + local_pvti = *vcpu_pvti;
> + }
> +
> + if (copy_to_user(argp, &local_pvti, sizeof(local_pvti)))
> + return -EFAULT;
> +
> + return 0;
> +}
> +
> +static int kvm_vcpu_ioctl_set_clock_guest(struct kvm_vcpu *v, void __user *argp)
> +{
> + struct pvclock_vcpu_time_info dummy_pvti;
> + struct pvclock_vcpu_time_info orig_pvti;
> + struct kvm *kvm = v->kvm;
> + struct kvm_arch *ka = &kvm->arch;
> + uint64_t clock_orig, clock_dummy;
> + uint64_t host_tsc, guest_tsc;
> + int64_t kernel_ns;
> + int64_t correction;
> + int rc = 0;
> +
> + /*
> + * If a constant TSC is not provided by the host then KVM will
> + * be using CLOCK_MONOTONIC_RAW, correction using this is not
> + * precise and as such we can never sync to the precision we
> + * are requiring.
> + */
> + if (!static_cpu_has(X86_FEATURE_CONSTANT_TSC))
> + return -EINVAL;
> +
> + if (copy_from_user(&orig_pvti, argp, sizeof(orig_pvti)))
> + return -EFAULT;
> +
> + kvm_hv_request_tsc_page_update(kvm);
> + kvm_start_pvclock_update(kvm);
> + pvclock_update_vm_gtod_copy(kvm);
> +
> + if (!ka->use_master_clock) {
> + rc = -EINVAL;
> + goto out;
> + }
> +
> + /*
> + * Sample the kernel time and host TSC at a singular point.
> + * We then calculate the guest TSC using this exact point in time.
> + * From here we can then determine the delta using the
> + * PV time info requested from the user and what we currently have
> + * using the fixed point in time. This delta is then used as a
> + * correction factor to subtract from the clock offset.
> + */
> + if (!kvm_get_time_and_clockread(&kernel_ns, &host_tsc)) {
> + rc = -EFAULT;
> + goto out;
> + }
> +
> + guest_tsc = kvm_read_l1_tsc(v, host_tsc);
> +
> + dummy_pvti = orig_pvti;
> + dummy_pvti.tsc_timestamp = guest_tsc;
> + dummy_pvti.system_time = kernel_ns + ka->kvmclock_offset;
> +
> + clock_orig = __pvclock_read_cycles(&orig_pvti, guest_tsc);
> + clock_dummy = __pvclock_read_cycles(&dummy_pvti, guest_tsc);
>
In both cases here you're using the scale information directly from
userspace... that you forgot to fill in for them earlier. I think we
should have a sanity check on it, to ensure that it matches the TSC
frequency of the vCPU?
> + correction = clock_orig - clock_dummy;
> + ka->kvmclock_offset += correction;
> +
> +out:
> + kvm_end_pvclock_update(kvm);
> + return rc;
> +}
> +
> long kvm_arch_vcpu_ioctl(struct file *filp,
> unsigned int ioctl, unsigned long arg)
> {
> @@ -6239,6 +6357,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
> srcu_read_unlock(&vcpu->kvm->srcu, idx);
> break;
> }
> + case KVM_SET_CLOCK_GUEST:
> + r = kvm_vcpu_ioctl_set_clock_guest(vcpu, argp);
> + break;
> + case KVM_GET_CLOCK_GUEST:
> + r = kvm_vcpu_ioctl_get_clock_guest(vcpu, argp);
> + break;
> #ifdef CONFIG_KVM_HYPERV
> case KVM_GET_SUPPORTED_HV_CPUID:
> r = kvm_ioctl_get_supported_hv_cpuid(vcpu, argp);
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 2190adbe3002..0d306311e4d6 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -1548,4 +1548,7 @@ struct kvm_create_guest_memfd {
> __u64 reserved[6];
> };
>
> +#define KVM_SET_CLOCK_GUEST _IOW(KVMIO, 0xd5, struct pvclock_vcpu_time_info)
> +#define KVM_GET_CLOCK_GUEST _IOR(KVMIO, 0xd6, struct pvclock_vcpu_time_info)
> +
> #endif /* __LINUX_KVM_H */
Attachment:
smime.p7s
Description: S/MIME cryptographic signature