Re: [PATCH 4/4] KVM: x86: Hyper-V tsc page setup

From: Roman Kagan
Date: Fri Sep 02 2016 - 15:12:56 EST


On Thu, Sep 01, 2016 at 05:26:15PM +0200, Paolo Bonzini wrote:
> Lately tsc page was implemented but filled with empty
> values. This patch setup tsc page scale and offset based
> on vcpu tsc, tsc_khz and HV_X64_MSR_TIME_REF_COUNT value.
>
> The valid tsc page drops HV_X64_MSR_TIME_REF_COUNT msr
> reads count to zero which potentially improves performance.
>
> Signed-off-by: Andrey Smetanin <asmetanin@xxxxxxxxxxxxx>
> Reviewed-by: Peter Hornyack <peterhornyack@xxxxxxxxxx>
> CC: Paolo Bonzini <pbonzini@xxxxxxxxxx>
> CC: Radim KrÄmÃÅ <rkrcmar@xxxxxxxxxx>
> CC: Roman Kagan <rkagan@xxxxxxxxxxxxx>
> CC: Denis V. Lunev <den@xxxxxxxxxx>
> [Computation of TSC page parameters rewritten to use the Linux timekeeper
> parameters. - Paolo]
> Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>
> ---
> arch/x86/kvm/hyperv.c | 141 ++++++++++++++++++++++++++++++++++++++++++++------
> arch/x86/kvm/hyperv.h | 3 ++
> arch/x86/kvm/x86.c | 8 +--
> 3 files changed, 133 insertions(+), 19 deletions(-)
>
> diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
> index ed5b77f39ffb..e089d1f52dc0 100644
> --- a/arch/x86/kvm/hyperv.c
> +++ b/arch/x86/kvm/hyperv.c
> @@ -756,6 +756,129 @@ static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu,
> return 0;
> }
>
> +/*
> + * The kvmclock and Hyper-V TSC page use similar formulas. Because the KVM
> + * one is more precise, it is a little more complex. However, converting

I'm not sure you're right regarding which one is more precise :)
Hyper-V uses a right shift of 64 which is higher precision than typical
kvmclock shift of around 22.

> + * between them is possible:
> + *
> + * kvmclock formula:
> + * nsec = (ticks - tsc_timestamp) * tsc_to_system_mul * 2^(tsc_shift-32)
> + * + system_time
> + *
> + * Hyper-V formula:
> + * nsec/100 = ticks * scale / 2^64 + offset
> + *
> + * When tsc_timestamp = system_time = 0, offset is zero in the Hyper-V formula.
> + * By dividing the kvmclock formula by 100 and equating what's left we get:
> + * ticks * scale / 2^64 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
> + * scale / 2^64 = tsc_to_system_mul * 2^(tsc_shift-32) / 100
> + * scale = tsc_to_system_mul * 2^(32+tsc_shift) / 100
> + *
> + * Now expand the kvmclock formula and divide by 100:
> + * nsec = ticks * tsc_to_system_mul * 2^(tsc_shift-32)
> + * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32)
> + * + system_time
> + * nsec/100 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
> + * - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) / 100
> + * + system_time / 100
> + *
> + * Replace tsc_to_system_mul * 2^(tsc_shift-32) / 100 by scale / 2^64:
> + * nsec/100 = ticks * scale / 2^64
> + * - tsc_timestamp * scale / 2^64
> + * + system_time / 100
> + *
> + * Equate with the Hyper-V formula so that ticks * scale / 2^64 cancels out:
> + * offset = system_time / 100 - tsc_timestamp * scale / 2^64
> + *
> + * These two equivalencies are implemented in this function.
> + */
> +static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock,
> + HV_REFERENCE_TSC_PAGE *tsc_ref)
> +{
> + u64 max_mul;
> +
> + if (!(hv_clock->flags & PVCLOCK_TSC_STABLE_BIT))
> + return false;
> +
> + /*
> + * check if scale would overflow, if so we use the time ref counter
> + * tsc_to_system_mul * 2^(tsc_shift+32) / 100 >= 2^64
> + * tsc_to_system_mul / 100 >= 2^(32-tsc_shift)
> + * tsc_to_system_mul >= 100 * 2^(32-tsc_shift)
> + */
> + max_mul = 100ull << (32 - hv_clock->tsc_shift);
> + if (hv_clock->tsc_to_system_mul >= max_mul)
> + return false;
> +
> + /*
> + * Otherwise compute the scale and offset according to the formulas
> + * derived above.
> + */
> + tsc_ref->tsc_scale =
> + mul_u64_u32_div(1ULL << (32 + hv_clock->tsc_shift),
> + hv_clock->tsc_to_system_mul,
> + 100);
> +
> + tsc_ref->tsc_offset = hv_clock->system_time;
> + do_div(tsc_ref->tsc_offset, 100);
> + tsc_ref->tsc_offset -=
> + mul_u64_u64_shr(hv_clock->tsc_timestamp, tsc_ref->tsc_scale, 64);
> + return true;

Although this is correct, we may want to make it slightly easier to
follow: note that the hv_clock contents is essentially populated using
vcpu->hw_tsc_khz and vcpu->last_guest_tsc, so at this point we can just
directly calculate ->tsc_scale and ->tsc_offset from them. If we also
stash them somewhere on vcpu we can make the reference counter use
exactly the same procedure as the guest would with tsc page, and
guarantee against precision errors.

Dunno if that really matters much, though.

> +}
> +
> +void kvm_hv_setup_tsc_page(struct kvm *kvm,
> + struct pvclock_vcpu_time_info *hv_clock)
> +{
> + struct kvm_hv *hv = &kvm->arch.hyperv;
> + HV_REFERENCE_TSC_PAGE tsc_ref = { 0 };
> + u32 tsc_seq;
> + u64 gfn;
> +
> + BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(tsc_ref.tsc_sequence));
> + BUILD_BUG_ON(offsetof(HV_REFERENCE_TSC_PAGE, tsc_sequence) != 0);
> +
> + if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
> + return;
> +
> + gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
> + /*
> + * Because the TSC parameters only vary when there is a
> + * change in the master clock, do not bother with caching.
> + */
> + if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn),
> + &tsc_seq, sizeof(tsc_seq))))
> + return;
> +
> + /*
> + * While we're computing and writing the parameters, force the
> + * guest to use the time reference count MSR.
> + */
> + if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
> + &tsc_ref, sizeof(tsc_ref.tsc_sequence)))
> + return;
> +
> + if (!compute_tsc_page_parameters(hv_clock, &tsc_ref))
> + return;
> +
> + /* Ensure sequence is zero before writing the rest of the struct. */
> + smp_wmb();
> + if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &tsc_ref, sizeof(tsc_ref)))
> + return;
> +
> + /*
> + * Now switch to the TSC page mechanism by writing the sequence.
> + */
> + if (tsc_seq == 0xFFFFFFFF || tsc_seq == 0)
> + tsc_seq = 1;
> + else
> + tsc_seq++;

If tsc_seq = 0xFFFFFFFE you end up with invalid tsc_seq.
You rather need to unconditionally do the increment first, and compare
it to invalid values after that.

> +
> + /* Write the struct entirely before the non-zero sequence. */
> + smp_wmb();
> +
> + kvm_write_guest(kvm, gfn_to_gpa(gfn), &tsc_seq, sizeof(tsc_seq));
> +}
> +
> static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
> bool host)
> {
> @@ -793,23 +916,11 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
> mark_page_dirty(kvm, gfn);
> break;
> }
> - case HV_X64_MSR_REFERENCE_TSC: {
> - u64 gfn;
> - HV_REFERENCE_TSC_PAGE tsc_ref;
> -
> - memset(&tsc_ref, 0, sizeof(tsc_ref));
> + case HV_X64_MSR_REFERENCE_TSC:
> hv->hv_tsc_page = data;
> - if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE))
> - break;
> - gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
> - if (kvm_write_guest(
> - kvm,
> - gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT,
> - &tsc_ref, sizeof(tsc_ref)))
> - return 1;
> - mark_page_dirty(kvm, gfn);
> + if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)
> + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
> break;
> - }
> case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
> return kvm_hv_msr_set_crash_data(vcpu,
> msr - HV_X64_MSR_CRASH_P0,
> diff --git a/arch/x86/kvm/hyperv.h b/arch/x86/kvm/hyperv.h
> index 60eccd4bd1d3..cd1119538add 100644
> --- a/arch/x86/kvm/hyperv.h
> +++ b/arch/x86/kvm/hyperv.h
> @@ -84,4 +84,7 @@ static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)
>
> void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);
>
> +void kvm_hv_setup_tsc_page(struct kvm *kvm,
> + struct pvclock_vcpu_time_info *hv_clock);
> +
> #endif
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 2edcfa054cbe..2e792bf80912 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1881,10 +1881,10 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
>
> vcpu->hv_clock.flags = pvclock_flags;
>
> - if (!vcpu->pv_time_enabled)
> - return 0;
> -
> - kvm_setup_pvclock_page(v);
> + if (vcpu->pv_time_enabled)
> + kvm_setup_pvclock_page(v);
> + if (v == kvm_get_vcpu(v->kvm, 0))
> + kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
> return 0;
> }
>
> --
> 1.8.3.1
>