Re: [PATCH 3/4] KVM: x86: introduce get_kvmclock_ns

From: Paolo Bonzini
Date: Fri Sep 02 2016 - 10:09:57 EST




On 02/09/2016 15:52, Roman Kagan wrote:
> On Thu, Sep 01, 2016 at 05:26:14PM +0200, Paolo Bonzini wrote:
>> Introduce a function that reads the exact nanoseconds value that is
>> provided to the guest in kvmclock. This crystallizes the notion of
>> kvmclock as a thin veneer over a stable TSC, that the guest will
>> (hopefully) convert with NTP. In other words, kvmclock is *not* a
>> paravirtualized host-to-guest NTP.
>>
>> Drop the get_kernel_ns() function, that was used both to get the base
>> value of the master clock and to get the current value of kvmclock.
>> The former use is replaced by ktime_get_boot_ns(), the latter is
>> the purpose of the new function get_kvmclock_ns().
>>
>> This also allows KVM to provide a Hyper-V time reference counter that
>> is synchronized with the time that is computed from the TSC page.
>>
>> Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>
>> ---
>> arch/x86/entry/vdso/vclock_gettime.c | 2 +-
>> arch/x86/include/asm/pvclock.h | 5 ++--
>> arch/x86/kernel/pvclock.c | 2 +-
>> arch/x86/kvm/hyperv.c | 2 +-
>> arch/x86/kvm/x86.c | 48 +++++++++++++++++++++++++++---------
>> arch/x86/kvm/x86.h | 6 +----
>> 6 files changed, 43 insertions(+), 22 deletions(-)
>>
>> diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
>> index 94d54d0defa7..02223cb4bcfd 100644
>> --- a/arch/x86/entry/vdso/vclock_gettime.c
>> +++ b/arch/x86/entry/vdso/vclock_gettime.c
>> @@ -129,7 +129,7 @@ static notrace cycle_t vread_pvclock(int *mode)
>> return 0;
>> }
>>
>> - ret = __pvclock_read_cycles(pvti);
>> + ret = __pvclock_read_cycles(pvti, rdtsc_ordered());
>> } while (pvclock_read_retry(pvti, version));
>>
>> /* refer to vread_tsc() comment for rationale */
>> diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
>> index d019f0cc80ec..3ad741b84072 100644
>> --- a/arch/x86/include/asm/pvclock.h
>> +++ b/arch/x86/include/asm/pvclock.h
>> @@ -87,9 +87,10 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
>> }
>>
>> static __always_inline
>> -cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src)
>> +cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
>> + u64 tsc)
>> {
>> - u64 delta = rdtsc_ordered() - src->tsc_timestamp;
>> + u64 delta = tsc - src->tsc_timestamp;
>> cycle_t offset = pvclock_scale_delta(delta, src->tsc_to_system_mul,
>> src->tsc_shift);
>> return src->system_time + offset;
>> diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
>> index 3599404e3089..5b2cc889ce34 100644
>> --- a/arch/x86/kernel/pvclock.c
>> +++ b/arch/x86/kernel/pvclock.c
>> @@ -80,7 +80,7 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
>>
>> do {
>> version = pvclock_read_begin(src);
>> - ret = __pvclock_read_cycles(src);
>> + ret = __pvclock_read_cycles(src, rdtsc_ordered());
>> flags = src->flags;
>> } while (pvclock_read_retry(src, version));
>>
>
> Perhaps adding an argument to __pvclock_read_cycles deserves a separate
> patch, to get timekeeping folks' ack on it?
>
>> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
>> index 01bd7b7a6866..ed5b77f39ffb 100644
>> --- a/arch/x86/kvm/hyperv.c
>> +++ b/arch/x86/kvm/hyperv.c
>> @@ -386,7 +386,7 @@ static void synic_init(struct kvm_vcpu_hv_synic *synic)
>>
>> static u64 get_time_ref_counter(struct kvm *kvm)
>> {
>> - return div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100);
>> + return div_u64(get_kvmclock_ns(kvm), 100);
>
> Since this does slightly different calculation than the real hyperv tsc
> ref page clock is supposed to, I wonder if we are safe WRT precision
> errors leading to occasional monotonicity violations?

The Hyper-V scale is

tsc_to_system_mul * 2^(32+tsc_shift) / 100

and the only source of error could be from doing here

(tsc * tsc_to_system_mul >> (32-tsc_shift)) / 100

vs

tsc * ((tsc_to_system_mul >> (32-tsc_shift)) / 100)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
this is scale / 2^64

in the TSC ref page clock. If my reasoning is correct the error will be
at most 100 units of the scale value, which is a relative error of
around 1 part per 2^49.

Likewise for the offset, the improvement from

(tsc - base_tsc) * tsc_to_system_mul >> (32-tsc_shift)
+ base_system_time

vs. using a single offset as in the TSC ref page is one nanosecond---and
the ref page only has a resolution of 100 ns.


Paolo

>> }
>>
>> static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
>> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>> index b5853b86b67d..2edcfa054cbe 100644
>> --- a/arch/x86/kvm/x86.c
>> +++ b/arch/x86/kvm/x86.c
>> @@ -1425,7 +1425,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
>>
>> raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
>> offset = kvm_compute_tsc_offset(vcpu, data);
>> - ns = get_kernel_ns();
>> + ns = ktime_get_boot_ns();
>> elapsed = ns - kvm->arch.last_tsc_nsec;
>>
>> if (vcpu->arch.virtual_tsc_khz) {
>> @@ -1716,6 +1716,34 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
>> #endif
>> }
>>
>> +static u64 __get_kvmclock_ns(struct kvm *kvm)
>> +{
>> + struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, 0);
>> + struct kvm_arch *ka = &kvm->arch;
>> + s64 ns;
>> +
>> + if (vcpu->arch.hv_clock.flags & PVCLOCK_TSC_STABLE_BIT) {
>> + u64 tsc = kvm_read_l1_tsc(vcpu, rdtsc());
>> + ns = __pvclock_read_cycles(&vcpu->arch.hv_clock, tsc);
>> + } else {
>> + ns = ktime_get_boot_ns() + ka->kvmclock_offset;
>> + }
>> +
>> + return ns;
>> +}
>> +
>> +u64 get_kvmclock_ns(struct kvm *kvm)
>> +{
>> + unsigned long flags;
>> + s64 ns;
>> +
>> + local_irq_save(flags);
>> + ns = __get_kvmclock_ns(kvm);
>> + local_irq_restore(flags);
>> +
>> + return ns;
>> +}
>> +
>> static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
>> {
>> struct kvm_vcpu_arch *vcpu = &v->arch;
>> @@ -1805,7 +1833,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
>> }
>> if (!use_master_clock) {
>> host_tsc = rdtsc();
>> - kernel_ns = get_kernel_ns();
>> + kernel_ns = ktime_get_boot_ns();
>> }
>>
>> tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
>> @@ -4048,7 +4076,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
>> case KVM_SET_CLOCK: {
>> struct kvm_clock_data user_ns;
>> u64 now_ns;
>> - s64 delta;
>>
>> r = -EFAULT;
>> if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
>> @@ -4060,10 +4087,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
>>
>> r = 0;
>> local_irq_disable();
>> - now_ns = get_kernel_ns();
>> - delta = user_ns.clock - now_ns;
>> + now_ns = __get_kvmclock_ns(kvm);
>> + kvm->arch.kvmclock_offset += user_ns.clock - now_ns;
>> local_irq_enable();
>> - kvm->arch.kvmclock_offset = delta;
>> kvm_gen_update_masterclock(kvm);
>> break;
>> }
>
> I'm curious why explicit irq disable/enable is left here, unlike the
> next hunk where it's moved into get_kvmclock_ns?
>
>> @@ -4071,10 +4097,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
>> struct kvm_clock_data user_ns;
>> u64 now_ns;
>>
>> - local_irq_disable();
>> - now_ns = get_kernel_ns();
>> - user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
>> - local_irq_enable();
>> + now_ns = get_kvmclock_ns(kvm);
>> + user_ns.clock = now_ns;
>
> Nitpick: now_ns is even less useful now than it was before the patch.
>
>> user_ns.flags = 0;
>> memset(&user_ns.pad, 0, sizeof(user_ns.pad));
>>
>> @@ -7539,7 +7563,7 @@ int kvm_arch_hardware_enable(void)
>> * before any KVM threads can be running. Unfortunately, we can't
>> * bring the TSCs fully up to date with real time, as we aren't yet far
>> * enough into CPU bringup that we know how much real time has actually
>> - * elapsed; our helper function, get_kernel_ns() will be using boot
>> + * elapsed; our helper function, ktime_get_boot_ns() will be using boot
>> * variables that haven't been updated yet.
>> *
>> * So we simply find the maximum observed TSC above, then record the
>> @@ -7774,7 +7798,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
>> mutex_init(&kvm->arch.apic_map_lock);
>> spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
>>
>> - kvm->arch.kvmclock_offset = -get_kernel_ns();
>> + kvm->arch.kvmclock_offset = -ktime_get_boot_ns();
>> pvclock_update_vm_gtod_copy(kvm);
>>
>> INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
>> diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
>> index a82ca466b62e..e8ff3e4ce38a 100644
>> --- a/arch/x86/kvm/x86.h
>> +++ b/arch/x86/kvm/x86.h
>> @@ -148,11 +148,6 @@ static inline void kvm_register_writel(struct kvm_vcpu *vcpu,
>> return kvm_register_write(vcpu, reg, val);
>> }
>>
>> -static inline u64 get_kernel_ns(void)
>> -{
>> - return ktime_get_boot_ns();
>> -}
>> -
>> static inline bool kvm_check_has_quirk(struct kvm *kvm, u64 quirk)
>> {
>> return !(kvm->arch.disabled_quirks & quirk);
>> @@ -164,6 +159,7 @@ void kvm_set_pending_timer(struct kvm_vcpu *vcpu);
>> int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
>>
>> void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
>> +u64 get_kvmclock_ns(struct kvm *kvm);
>>
>> int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
>> gva_t addr, void *val, unsigned int bytes,
>> --
>> 1.8.3.1
>>
>>