Re: 'perf test tsc' failing, bisected to "sched/clock: Provide better clock continuity"

From: Peter Zijlstra
Date: Thu Mar 16 2017 - 14:12:09 EST


On Thu, Mar 16, 2017 at 03:53:11PM +0100, Peter Zijlstra wrote:
> On Thu, Mar 16, 2017 at 11:01:03AM -0300, Arnaldo Carvalho de Melo wrote:
> > Hi, this entry is failing for a while:
> >
> > [root@jouet ~]# perf test -v tsc
> > 55: Convert perf time to TSC :
> > --- start ---
> > test child forked, pid 3008
> > mmap size 528384B
> > 1st event perf time 93133455486631 tsc 15369449468752
> > rdtsc time 93133464598760 tsc 15369473104358
> > 2nd event perf time 93133455506961 tsc 15369449521485
> > test child finished with -1
> > ---- end ----
> > Convert perf time to TSC: FAILED!
> > [root@jouet ~]#
> >
> > I bisected it to the following kernel change, ideas?
> >
> > [acme@felicio linux]$ git bisect good
> > 5680d8094ffa9e5cfc81afdd865027ee6417c263 is the first bad commit
> > commit 5680d8094ffa9e5cfc81afdd865027ee6417c263
> > Author: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> > Date: Thu Dec 15 13:36:17 2016 +0100
> >
> > sched/clock: Provide better clock continuity
>
> Right, ahunter also complained about this. I had a half-arsed, fugly
> patch in the works. Let me see if I can improve and finish it.

The below seems to cure things for me. Since 5680d8094ffa the stable
clock is sched_clock() + __sched_clock_offset, so the userpage
conversion parameters must include that same offset; and when
sched_clock() isn't the native TSC clock, the cyc2ns data doesn't
describe it at all, so don't advertise the user time capabilities in
that case.


---
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2244,6 +2244,7 @@ void arch_perf_update_userpage(struct pe
struct perf_event_mmap_page *userpg, u64 now)
{
struct cyc2ns_data *data;
+ u64 offset;

userpg->cap_user_time = 0;
userpg->cap_user_time_zero = 0;
@@ -2251,11 +2252,13 @@ void arch_perf_update_userpage(struct pe
!!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED);
userpg->pmc_width = x86_pmu.cntval_bits;

- if (!sched_clock_stable())
+ if (!using_native_sched_clock() || !sched_clock_stable())
return;

data = cyc2ns_read_begin();

+ offset = data->cyc2ns_offset + __sched_clock_offset;
+
/*
* Internal timekeeping for enabled/running/stopped times
* is always in the local_clock domain.
@@ -2263,7 +2266,7 @@ void arch_perf_update_userpage(struct pe
userpg->cap_user_time = 1;
userpg->time_mult = data->cyc2ns_mul;
userpg->time_shift = data->cyc2ns_shift;
- userpg->time_offset = data->cyc2ns_offset - now;
+ userpg->time_offset = offset - now;

/*
* cap_user_time_zero doesn't make sense when we're using a different
@@ -2271,7 +2274,7 @@ void arch_perf_update_userpage(struct pe
*/
if (!event->attr.use_clockid) {
userpg->cap_user_time_zero = 1;
- userpg->time_zero = data->cyc2ns_offset;
+ userpg->time_zero = offset;
}

cyc2ns_read_end(data);
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -12,6 +12,8 @@ extern int recalibrate_cpu_khz(void);

extern int no_timer_check;

+extern bool using_native_sched_clock(void);
+
/*
* We use the full linear equation: f(x) = a + b*x, in order to allow
* a continuous function in the face of dynamic freq changes.
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -328,7 +328,7 @@ unsigned long long sched_clock(void)
return paravirt_sched_clock();
}

-static inline bool using_native_sched_clock(void)
+bool using_native_sched_clock(void)
{
return pv_time_ops.sched_clock == native_sched_clock;
}
@@ -336,7 +336,7 @@ static inline bool using_native_sched_cl
unsigned long long
sched_clock(void) __attribute__((alias("native_sched_clock")));

-static inline bool using_native_sched_clock(void) { return true; }
+bool using_native_sched_clock(void) { return true; }
#endif

int check_tsc_unstable(void)
--- a/include/linux/sched/clock.h
+++ b/include/linux/sched/clock.h
@@ -54,15 +54,16 @@ static inline u64 local_clock(void)
}
#else
extern void sched_clock_init_late(void);
-/*
- * Architectures can set this to 1 if they have specified
- * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
- * but then during bootup it turns out that sched_clock()
- * is reliable after all:
- */
extern int sched_clock_stable(void);
extern void clear_sched_clock_stable(void);

+/*
+ * When sched_clock_stable(), __sched_clock_offset provides the offset
+ * between local_clock() and sched_clock().
+ */
+extern u64 __sched_clock_offset;
+
+
extern void sched_clock_tick(void);
extern void sched_clock_idle_sleep_event(void);
extern void sched_clock_idle_wakeup_event(u64 delta_ns);
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -96,10 +96,10 @@ static DEFINE_STATIC_KEY_FALSE(__sched_c
static int __sched_clock_stable_early = 1;

/*
- * We want: ktime_get_ns() + gtod_offset == sched_clock() + raw_offset
+ * We want: ktime_get_ns() + __gtod_offset == sched_clock() + __sched_clock_offset
*/
-static __read_mostly u64 raw_offset;
-static __read_mostly u64 gtod_offset;
+__read_mostly u64 __sched_clock_offset;
+static __read_mostly u64 __gtod_offset;

struct sched_clock_data {
u64 tick_raw;
@@ -131,11 +131,11 @@ static void __set_sched_clock_stable(voi
/*
* Attempt to make the (initial) unstable->stable transition continuous.
*/
- raw_offset = (scd->tick_gtod + gtod_offset) - (scd->tick_raw);
+ __sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw);

printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n",
- scd->tick_gtod, gtod_offset,
- scd->tick_raw, raw_offset);
+ scd->tick_gtod, __gtod_offset,
+ scd->tick_raw, __sched_clock_offset);

static_branch_enable(&__sched_clock_stable);
tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
@@ -161,11 +161,11 @@ static void __clear_sched_clock_stable(v
*
* Still do what we can.
*/
- gtod_offset = (scd->tick_raw + raw_offset) - (scd->tick_gtod);
+ __gtod_offset = (scd->tick_raw + __sched_clock_offset) - (scd->tick_gtod);

printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n",
- scd->tick_gtod, gtod_offset,
- scd->tick_raw, raw_offset);
+ scd->tick_gtod, __gtod_offset,
+ scd->tick_raw, __sched_clock_offset);

tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE);

@@ -238,7 +238,7 @@ static u64 sched_clock_local(struct sche
* scd->tick_gtod + TICK_NSEC);
*/

- clock = scd->tick_gtod + gtod_offset + delta;
+ clock = scd->tick_gtod + __gtod_offset + delta;
min_clock = wrap_max(scd->tick_gtod, old_clock);
max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);

@@ -324,7 +324,7 @@ u64 sched_clock_cpu(int cpu)
u64 clock;

if (sched_clock_stable())
- return sched_clock() + raw_offset;
+ return sched_clock() + __sched_clock_offset;

if (unlikely(!sched_clock_running))
return 0ull;
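
For reference, the userspace side of the conversion the test exercises
(roughly what tools/perf's tsc_to_perf_time() does, following the
perf_event_mmap_page documentation) looks like the sketch below; the
seqcount loop around reading the userpage fields is elided for brevity:

#include <stdint.h>

/*
 * TSC -> perf time, per the perf_event_mmap_page documentation.
 * time_zero, time_mult and time_shift must be sampled under the
 * userpage seqcount loop; that loop is elided here.
 */
static uint64_t tsc_to_perf_time(uint64_t cyc, uint64_t time_zero,
				 uint32_t time_mult, uint16_t time_shift)
{
	uint64_t quot = cyc >> time_shift;
	uint64_t rem  = cyc & (((uint64_t)1 << time_shift) - 1);

	/*
	 * With the patch above, time_zero now includes
	 * __sched_clock_offset, so this result lands in the same
	 * domain as the kernel-generated perf timestamps again.
	 */
	return time_zero + quot * time_mult +
	       ((rem * time_mult) >> time_shift);
}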