Re: [PATCH RFC 1/1] KVM: x86: add param to update master clock periodically

From: Sean Christopherson
Date: Fri Oct 13 2023 - 14:07:52 EST


On Wed, Oct 11, 2023, David Woodhouse wrote:
> On Tue, 2023-10-10 at 17:20 -0700, Sean Christopherson wrote:
> > On Wed, Oct 04, 2023, Dongli Zhang wrote:
> > > > -static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
> > > > -{
> > > > -       struct kvm *kvm = v->kvm;
> > > > -
> > > > -       kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
> > > > -       schedule_delayed_work(&kvm->arch.kvmclock_update_work,
> > > > -                                       KVMCLOCK_UPDATE_DELAY);
> > > > -}
> > > > -
> > > >  #define KVMCLOCK_SYNC_PERIOD (300 * HZ)
> > >
> > > While David mentioned "maximum delta", how about to turn above into a module
> > > param with the default 300HZ.
> > >
> > > BTW, 300HZ should be enough for vCPU hotplug case, unless people prefer 1-hour
> > > or 1-day.
> >
> > Hmm, I think I agree with David that it would be better if KVM can take care of
> > the gory details and promise a certain level of accuracy.  I'm usually a fan of
> > punting complexity to userspace, but requiring every userspace to figure out the
> > ideal sync frequency on every platform is more than a bit unfriendly.  And it
> > might not even be realistic unless userspace makes assumptions about how the kernel
> > computes CLOCK_MONOTONIC_RAW from TSC cycles.
> >
>
> I think perhaps I would rather save up my persuasiveness on the topic
> of "let's not make things too awful for userspace to cope with" for the
> live update/migration mess. I think I need to dust off that attempt at
> fixing our 'how to migrate with clocks intact' documentation from
> https://lore.kernel.org/kvm/13f256ad95de186e3b6bcfcc1f88da5d0ad0cb71.camel@xxxxxxxxxxxxx/
> The changes we're discussing here obviously have an effect on migration
> too.
>
> Where the host TSC is actually reliable, I would really prefer for the
> kvmclock to just be a fixed function of the guest TSC and *not* to be
> arbitrarily yanked back[1] to the host's CLOCK_MONOTONIC periodically.

CLOCK_MONOTONIC_RAW! Just wanted to clarify because if kvmclock were tied to the
non-raw clock, then we'd have to somehow reconcile host NTP updates.

I generally support the idea, but I think it needs to an opt-in from userspace.
Essentially a "I pinky swear to give all vCPUs the same TSC frequency, to not
suspend the host, and to not run software/firmware that writes IA32_TSC_ADJUST".
AFAICT, there are too many edge cases and assumptions about userspace for KVM to
safely couple kvmclock to guest TSC by default.

> [1] Yes, I believe "back" does happen. I have test failures in my queue
> to look at, where guests see the "Xen" clock going backwards.

Yeah, I assume "back" can happen based purely on the wierdness of the pvclock math.o

What if we add a module param to disable KVM's TSC synchronization craziness
entirely? If we first clean up the peroidic sync mess, then it seems like it'd
be relatively straightforward to let kill off all of the synchronization, including
the synchronization of kvmclock to the host's TSC-based CLOCK_MONOTONIC_RAW.

Not intended to be a functional patch...

---
arch/x86/kvm/x86.c | 35 ++++++++++++++++++++++++++++++++---
1 file changed, 32 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5b2104bdd99f..75fc6cbaef0d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -157,6 +157,9 @@ module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR);
static bool __read_mostly kvmclock_periodic_sync = true;
module_param(kvmclock_periodic_sync, bool, S_IRUGO);

+static bool __read_mostly enable_tsc_sync = true;
+module_param_named(tsc_synchronization, enable_tsc_sync, bool, 0444);
+
/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
static u32 __read_mostly tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);
@@ -2722,6 +2725,12 @@ static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
bool matched = false;
bool synchronizing = false;

+ if (!enable_tsc_sync) {
+ offset = kvm_compute_l1_tsc_offset(vcpu, data);
+ kvm_vcpu_write_tsc_offset(vcpu, offset);
+ return;
+ }
+
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
offset = kvm_compute_l1_tsc_offset(vcpu, data);
ns = get_kvmclock_base_ns();
@@ -2967,9 +2976,12 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
&ka->master_kernel_ns,
&ka->master_cycle_now);

- ka->use_master_clock = host_tsc_clocksource && vcpus_matched
- && !ka->backwards_tsc_observed
- && !ka->boot_vcpu_runs_old_kvmclock;
+ WARN_ON_ONCE(!host_tsc_clocksource && !enable_tsc_sync);
+
+ ka->use_master_clock = host_tsc_clocksource &&
+ (vcpus_matched || !enable_tsc_sync) &&
+ !ka->backwards_tsc_observed &&
+ !ka->boot_vcpu_runs_old_kvmclock;

if (ka->use_master_clock)
atomic_set(&kvm_guest_has_master_clock, 1);
@@ -3278,6 +3290,9 @@ static void kvmclock_sync_fn(struct work_struct *work)

void kvm_adjust_pv_clock_users(struct kvm *kvm, bool add_user)
{
+ if (!enable_tsc_sync)
+ return;
+
/*
* Doesn't need to be a spinlock, but can't be kvm->lock as this is
* call while holding a vCPU's mutext.
@@ -5528,6 +5543,11 @@ static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu,
if (get_user(offset, uaddr))
break;

+ if (!enable_tsc_sync) {
+ kvm_vcpu_write_tsc_offset(vcpu, offset);
+ break;
+ }
+
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);

matched = (vcpu->arch.virtual_tsc_khz &&
@@ -12188,6 +12208,9 @@ int kvm_arch_hardware_enable(void)
if (ret != 0)
return ret;

+ if (!enable_tsc_sync)
+ return 0;
+
local_tsc = rdtsc();
stable = !kvm_check_tsc_unstable();
list_for_each_entry(kvm, &vm_list, vm_list) {
@@ -13670,6 +13693,12 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit);

static int __init kvm_x86_init(void)
{
+ if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+ enable_tsc_sync = true;
+
+ if (!enable_tsc_sync)
+ kvmclock_periodic_sync = false;
+
kvm_mmu_x86_module_init();
mitigate_smt_rsb &= boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible();
return 0;

base-commit: 7d2edad0beb2a6f07f6e6c2d477d5874f5417d6c
--