Re: [PATCH 1/4] KVM: x86: always fill in vcpu->arch.hv_clock
From: Roman Kagan
Date: Fri Sep 02 2016 - 12:03:13 EST
On Thu, Sep 01, 2016 at 05:26:12PM +0200, Paolo Bonzini wrote:
> We will use it in the next patches for KVM_GET_CLOCK and as a basis for the
> contents of the Hyper-V TSC page. Get the values from the Linux
> timekeeper even if kvmclock is not enabled.
>
> Signed-off-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>
> ---
> arch/x86/kvm/x86.c | 109 +++++++++++++++++++++++++++++------------------------
> 1 file changed, 59 insertions(+), 50 deletions(-)
>
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 19f9f9e05c2a..65974dd0565f 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -1716,6 +1716,60 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
> #endif
> }
>
> +static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
> +{
> + struct kvm_vcpu_arch *vcpu = &v->arch;
> + struct pvclock_vcpu_time_info guest_hv_clock;
> +
> + if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
> + &guest_hv_clock, sizeof(guest_hv_clock))))
> + return;
> +
> + /* This VCPU is paused, but it's legal for a guest to read another
> + * VCPU's kvmclock, so we really have to follow the specification where
> + * it says that version is odd if data is being modified, and even after
> + * it is consistent.
> + *
> + * Version field updates must be kept separate. This is because
> + * kvm_write_guest_cached might use a "rep movs" instruction, and
> + * writes within a string instruction are weakly ordered. So there
> + * are three writes overall.
> + *
> + * As a small optimization, only write the version field in the first
> + * and third write. The vcpu->pv_time cache is still valid, because the
> + * version field is the first in the struct.
> + */
> + BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
> +
> + vcpu->hv_clock.version = guest_hv_clock.version + 1;
> + kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
> + &vcpu->hv_clock,
> + sizeof(vcpu->hv_clock.version));
> +
> + smp_wmb();
> +
> + /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
> + vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
> +
> + if (vcpu->pvclock_set_guest_stopped_request) {
> + vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
> + vcpu->pvclock_set_guest_stopped_request = false;
> + }
> +
> + trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
> +
> + kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
> + &vcpu->hv_clock,
> + sizeof(vcpu->hv_clock));
> +
> + smp_wmb();
> +
> + vcpu->hv_clock.version++;
> + kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
> + &vcpu->hv_clock,
> + sizeof(vcpu->hv_clock.version));
> +}
> +
> static int kvm_guest_time_update(struct kvm_vcpu *v)
> {
> unsigned long flags, tgt_tsc_khz;
> @@ -1723,7 +1777,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
> struct kvm_arch *ka = &v->kvm->arch;
> s64 kernel_ns;
> u64 tsc_timestamp, host_tsc;
> - struct pvclock_vcpu_time_info guest_hv_clock;
> u8 pvclock_flags;
> bool use_master_clock;
>
> @@ -1777,8 +1830,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
>
> local_irq_restore(flags);
>
> - if (!vcpu->pv_time_enabled)
> - return 0;
Strictly speaking, you only need .hv_clock updated if either kvmclock or
tsc_ref_page is enabled, so you may want to still skip the calculations
otherwise.
> + /* With all the info we got, fill in the values */
>
> if (kvm_has_tsc_control)
> tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
> @@ -1790,64 +1842,21 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
> vcpu->hw_tsc_khz = tgt_tsc_khz;
> }
>
> - /* With all the info we got, fill in the values */
> vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
> vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
> vcpu->last_guest_tsc = tsc_timestamp;
>
> - if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
> - &guest_hv_clock, sizeof(guest_hv_clock))))
> - return 0;
> -
> - /* This VCPU is paused, but it's legal for a guest to read another
> - * VCPU's kvmclock, so we really have to follow the specification where
> - * it says that version is odd if data is being modified, and even after
> - * it is consistent.
> - *
> - * Version field updates must be kept separate. This is because
> - * kvm_write_guest_cached might use a "rep movs" instruction, and
> - * writes within a string instruction are weakly ordered. So there
> - * are three writes overall.
> - *
> - * As a small optimization, only write the version field in the first
> - * and third write. The vcpu->pv_time cache is still valid, because the
> - * version field is the first in the struct.
> - */
> - BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
> -
> - vcpu->hv_clock.version = guest_hv_clock.version + 1;
> - kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
> - &vcpu->hv_clock,
> - sizeof(vcpu->hv_clock.version));
> -
> - smp_wmb();
> -
> - /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
> - pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
> -
> - if (vcpu->pvclock_set_guest_stopped_request) {
> - pvclock_flags |= PVCLOCK_GUEST_STOPPED;
> - vcpu->pvclock_set_guest_stopped_request = false;
> - }
> -
> /* If the host uses TSC clocksource, then it is stable */
> + pvclock_flags = 0;
> if (use_master_clock)
> pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
>
> vcpu->hv_clock.flags = pvclock_flags;
A nitpick: the above five lines can be collapsed into two:
vcpu->hv_clock.flags = use_master_clock ?
PVCLOCK_TSC_STABLE_BIT : 0;
and pvclock_flags can be killed.
>
> - trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
> -
> - kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
> - &vcpu->hv_clock,
> - sizeof(vcpu->hv_clock));
> -
> - smp_wmb();
> + if (!vcpu->pv_time_enabled)
> + return 0;
>
> - vcpu->hv_clock.version++;
> - kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
> - &vcpu->hv_clock,
> - sizeof(vcpu->hv_clock.version));
> + kvm_setup_pvclock_page(v);
> return 0;
> }
>
> --
> 1.8.3.1
>
>