[PATCH RFC: kvm tsc virtualization 12/20] Higher accuracy TSC offset computation

From: Zachary Amsden
Date: Mon Dec 14 2009 - 23:11:09 EST


Use per-cpu delta counters and statistically eliminate outliers which
might happen due to platform anomalies such as NMI.

Signed-off-by: Zachary Amsden <zamsden@xxxxxxxxxx>
---
arch/x86/kvm/x86.c | 100 +++++++++++++++++++++++++++++++++++++---------------
1 files changed, 71 insertions(+), 29 deletions(-)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3c4266f..c66dede 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -840,6 +840,11 @@ again:
EXPORT_SYMBOL_GPL(kvm_get_ref_tsc);

#define SYNC_TRIES 64
+struct cpu_delta_array {
+ s64 delta[SYNC_TRIES];
+};
+static DEFINE_PER_CPU(struct cpu_delta_array, delta_array);
+

/*
* sync_tsc_helper is a dual-entry coroutine meant to be run by only
@@ -849,25 +854,16 @@ EXPORT_SYMBOL_GPL(kvm_get_ref_tsc);
*
* To discount cache latency effects, this routine will be called
* twice, one with the measure / recording CPUs reversed. In addition,
- * the first 4 and last 2 results will be discarded to allow branch
- * predicition to become established (and to discount effects from
- * a potentially early predicted loop exit).
+ * the first and last results will be discarded to allow branch predicition
+ * to become established (and to discount effects from a potentially early
+ * predicted loop exit).
*
- * Because of this, we must be extra careful to guard the entrance
- * and exit against the CPU switch. I chose to use atomic instructions
- * only at the end of the measure loop and use the same routine for
- * both CPUs, with symmetric comparisons, and a statically placed
- * recording array, hopefully maximizing the branch predicition and
- * cache locality. The results appear quite good; on known to be
- * synchronized CPUs, I typically get < 10 TSC delta measured, with
- * maximum observed error on the order of 100 cycles.
- *
- * This doesn't account for NUMA cache effects, and could potentially
- * be improved there by moving the delta[] array to the stack of the
- * measuring CPU. In fact, this modification might be worth trying
- * for non-NUMA systems as well, but this appears to be fine for now.
+ * We allocate the delta array as a per-cpu variable to get local cache
+ * allocation, and also take steps to statistically ignore outliers which
+ * might be caused by SMIs. It may seem like overkill, but it is very
+ * accurate, which is what we're aiming for.
*/
-static void sync_tsc_helper(int measure_cpu, u64 *delta, atomic_t *ready)
+static void sync_tsc_helper(int measure_cpu, s64 *delta, atomic_t *ready)
{
int tries;
static u64 tsc_other;
@@ -915,12 +911,59 @@ static void sync_tsc_helper(int measure_cpu, u64 *delta, atomic_t *ready)
mb();
}

+/*
+ * Average and trim the samples of any outliers; we use > 2 x sigma
+ */
+static s64 average_samples(s64 *samples, unsigned num_samples)
+{
+ unsigned i, j;
+ s64 mean;
+ u64 stddev;
+ s64 average;
+
+ /* First get the mean */
+ mean = 0;
+ for (i = 0; i < num_samples; i++)
+ mean += samples[i];
+ mean = mean / num_samples;
+
+ /* Now the deviation */
+ stddev = 0;
+ for (i = 0; i < num_samples; i++) {
+ s64 dev = samples[i] - mean;
+ stddev += dev * dev;
+ }
+ stddev = stddev / (num_samples - 1);
+ stddev = int_sqrt(stddev);
+
+ /* Throw out anything outside 2x stddev */
+ stddev <<= 1;
+ average = 0, j = 0;
+ for (i = 0; i < num_samples; i++) {
+ s64 dev = samples[i] - mean;
+ if (dev < 0)
+ dev = -dev;
+ if (dev <= stddev) {
+ average += samples[i];
+ j++;
+ }
+ }
+ if (j > 0)
+ average = average / j;
+ else
+ printk(KERN_ERR "kvm: TSC samples failed to converge!\n");
+ pr_debug("%s: mean = %lld, stddev = %llu, average = %lld\n",
+ __func__, mean, stddev, average);
+
+ return average;
+}
+
static void kvm_sync_tsc(void *cpup)
{
int new_cpu = *(int *)cpup;
unsigned long flags;
- static s64 delta[SYNC_TRIES*2];
- static atomic_t ready = ATOMIC_INIT(1);
+ s64 *delta1, *delta2;
+ static atomic_t ready ____cacheline_aligned = ATOMIC_INIT(1);

BUG_ON(tsc_base_cpu == -1);
pr_debug("%s: IN, cpu = %d, freq = %ldkHz, tsc_base_cpu = %d\n", __func__, raw_smp_processor_id(), per_cpu(cpu_tsc_khz, raw_smp_processor_id()) , tsc_base_cpu);
@@ -933,27 +976,26 @@ static void kvm_sync_tsc(void *cpup)
&per_cpu(cpu_tsc_multiplier, new_cpu),
&per_cpu(cpu_tsc_shift, new_cpu));
}
- sync_tsc_helper(tsc_base_cpu, delta, &ready);
- sync_tsc_helper(new_cpu, &delta[SYNC_TRIES], &ready);
+ delta1 = per_cpu(delta_array, tsc_base_cpu).delta;
+ delta2 = per_cpu(delta_array, new_cpu).delta;
+ sync_tsc_helper(tsc_base_cpu, delta1, &ready);
+ sync_tsc_helper(new_cpu, delta2, &ready);
if (raw_smp_processor_id() == new_cpu) {
- int i;
s64 accumulator = 0;

/*
- * accumulate [SYNC_TRIES+4,-2) of tsc{base} - tsc{new}
- * subtract [SYNC_TRIES+4,-2) of tsc{new} - tsc{base}
+ * accumulate [2,SYNC_TRIES-1) of tsc{base} - tsc{new}
+ * subtract [SYNC_TRIES+2,-1) of tsc{new} - tsc{base}
*
* this allows instruction cycle and cache differences to
* cancel each other out and drops warm up/cool down variation
*
* Note the arithmatic must be signed because of the divide
*/
+ accumulator += average_samples(&delta1[2], SYNC_TRIES-3);
+ accumulator -= average_samples(&delta2[2], SYNC_TRIES-3);
+ accumulator /= 2;

- for (i = 4; i < SYNC_TRIES - 2; i++)
- accumulator += delta[i];
- for (i = 4; i < SYNC_TRIES - 2; i++)
- accumulator -= delta[i+SYNC_TRIES];
- accumulator = accumulator / (SYNC_TRIES*2-12);
per_cpu(cpu_tsc_offset, new_cpu) = accumulator;
++per_cpu(cpu_tsc_generation, new_cpu);
atomic_set(&per_cpu(cpu_tsc_synchronized, new_cpu), 1);
--
1.6.5.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/