Re: [PATCH v2 1/2] KVM: x86: Add KVM_[GS]ET_CLOCK_GUEST for accurate KVM clock migration

From: David Woodhouse
Date: Mon Apr 15 2024 - 03:17:19 EST


On Wed, 2024-04-10 at 09:52 +0000, Jack Allister wrote:
>
> +static int kvm_vcpu_ioctl_get_clock_guest(struct kvm_vcpu *v, void __user *argp)
> +{
> +       struct pvclock_vcpu_time_info *vcpu_pvti = &v->arch.hv_clock;
> +       struct pvclock_vcpu_time_info local_pvti = { 0 };
> +       struct kvm_arch *ka = &v->kvm->arch;
> +       uint64_t host_tsc, guest_tsc;
> +       bool use_master_clock;
> +       uint64_t kernel_ns;
> +       unsigned int seq;
> +
> +       /*
> +        * CLOCK_MONOTONIC_RAW is not suitable for GET/SET API,
> +        * see kvm_vcpu_ioctl_set_clock_guest equivalent comment.
> +        */
> +       if (!static_cpu_has(X86_FEATURE_CONSTANT_TSC))
> +               return -EINVAL;
> +
> +       /*
> +        * Querying needs to be performed in a seqcount loop as it's possible
> +        * another vCPU has triggered an update of the master clock. If so we
> +        * should store the host TSC & time at this point.
> +        */
> +       do {
> +               seq = read_seqcount_begin(&ka->pvclock_sc);
> +               use_master_clock = ka->use_master_clock;
> +               if (use_master_clock) {
> +                       host_tsc = ka->master_cycle_now;
> +                       kernel_ns = ka->master_kernel_ns;
> +               }
> +       } while (read_seqcount_retry(&ka->pvclock_sc, seq));
> +
> +       if (!use_master_clock)
> +               return -EINVAL;
> +
> +       /*
> +        * It's possible that this vCPU doesn't have a HVCLOCK configured
> +        * but the other vCPUs may. If this is the case calculate based
> +        * upon the time gathered in the seqcount but do not update the
> +        * vCPU specific PVTI. If we have one, then use that.
> +        */
> +       if (!vcpu_pvti->tsc_timestamp && !vcpu_pvti->system_time) {

|| !kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)

Otherwise you may be using out of date information.

> +               guest_tsc = kvm_read_l1_tsc(v, host_tsc);
> +
> +               local_pvti.tsc_timestamp = guest_tsc;
> +               local_pvti.system_time = kernel_ns + ka->kvmclock_offset;

This is missing the scale information in tsc_to_system_mul and
tsc_shift. Is there a reason we can't just call kvm_guest_time_update()
from here? (I think we looked at using it for the *SET* function, but
did we look at doing so for GET?)


> +       } else {
> +               local_pvti = *vcpu_pvti;
> +       }
> +
> +       if (copy_to_user(argp, &local_pvti, sizeof(local_pvti)))
> +               return -EFAULT;
> +
> +       return 0;
> +}
> +
> +static int kvm_vcpu_ioctl_set_clock_guest(struct kvm_vcpu *v, void __user *argp)
> +{
> +       struct pvclock_vcpu_time_info dummy_pvti;
> +       struct pvclock_vcpu_time_info orig_pvti;
> +       struct kvm *kvm = v->kvm;
> +       struct kvm_arch *ka = &kvm->arch;
> +       uint64_t clock_orig, clock_dummy;
> +       uint64_t host_tsc, guest_tsc;
> +       int64_t kernel_ns;
> +       int64_t correction;
> +       int rc = 0;
> +
> +       /*
> +        * If a constant TSC is not provided by the host then KVM will
> +        * be using CLOCK_MONOTONIC_RAW, correction using this is not
> +        * precise and as such we can never sync to the precision we
> +        * are requiring.
> +        */
> +       if (!static_cpu_has(X86_FEATURE_CONSTANT_TSC))
> +               return -EINVAL;
> +
> +       if (copy_from_user(&orig_pvti, argp, sizeof(orig_pvti)))
> +               return -EFAULT;
> +
> +       kvm_hv_request_tsc_page_update(kvm);
> +       kvm_start_pvclock_update(kvm);
> +       pvclock_update_vm_gtod_copy(kvm);
> +
> +       if (!ka->use_master_clock) {
> +               rc = -EINVAL;
> +               goto out;
> +       }
> +
> +       /*
> +        * Sample the kernel time and host TSC at a singular point.
> +        * We then calculate the guest TSC using this exact point in time.
> +        * From here we can then determine the delta using the
> +        * PV time info requested from the user and what we currently have
> +        * using the fixed point in time. This delta is then used as a
> +        * correction factor to subtract from the clock offset.
> +        */
> +       if (!kvm_get_time_and_clockread(&kernel_ns, &host_tsc)) {
> +               rc = -EFAULT;
> +               goto out;
> +       }
> +
> +       guest_tsc = kvm_read_l1_tsc(v, host_tsc);
> +
> +       dummy_pvti = orig_pvti;
> +       dummy_pvti.tsc_timestamp = guest_tsc;
> +       dummy_pvti.system_time = kernel_ns + ka->kvmclock_offset;
> +
> +       clock_orig = __pvclock_read_cycles(&orig_pvti, guest_tsc);
> +       clock_dummy = __pvclock_read_cycles(&dummy_pvti, guest_tsc);
>

In both cases here you're using the scale information directly from
userspace... that you forgot to fill in for them earlier. I think we
should we have a sanity check on it, to ensure that it matches the TSC
frequency of the vCPU?

> +       correction = clock_orig - clock_dummy;
> +       ka->kvmclock_offset += correction;
> +
> +out:
> +       kvm_end_pvclock_update(kvm);
> +       return rc;
> +}
> +
>  long kvm_arch_vcpu_ioctl(struct file *filp,
>                          unsigned int ioctl, unsigned long arg)
>  {
> @@ -6239,6 +6357,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
>                 srcu_read_unlock(&vcpu->kvm->srcu, idx);
>                 break;
>         }
> +       case KVM_SET_CLOCK_GUEST:
> +               r = kvm_vcpu_ioctl_set_clock_guest(vcpu, argp);
> +               break;
> +       case KVM_GET_CLOCK_GUEST:
> +               r = kvm_vcpu_ioctl_get_clock_guest(vcpu, argp);
> +               break;
>  #ifdef CONFIG_KVM_HYPERV
>         case KVM_GET_SUPPORTED_HV_CPUID:
>                 r = kvm_ioctl_get_supported_hv_cpuid(vcpu, argp);
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index 2190adbe3002..0d306311e4d6 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -1548,4 +1548,7 @@ struct kvm_create_guest_memfd {
>         __u64 reserved[6];
>  };
>  
> +#define KVM_SET_CLOCK_GUEST       _IOW(KVMIO,  0xd5, struct pvclock_vcpu_time_info)
> +#define KVM_GET_CLOCK_GUEST       _IOR(KVMIO,  0xd6, struct pvclock_vcpu_time_info)
> +
>  #endif /* __LINUX_KVM_H */

Attachment: smime.p7s
Description: S/MIME cryptographic signature