Re: [PATCH] x86/tsc: use real seqcount_latch in cyc2ns_read_begin()

From: Peter Zijlstra
Date: Thu Oct 11 2018 - 04:41:03 EST


On Thu, Oct 11, 2018 at 09:31:33AM +0200, Peter Zijlstra wrote:
> On Wed, Oct 10, 2018 at 05:33:36PM -0700, Eric Dumazet wrote:
> > While looking at the native_sched_clock() disassembly I was
> > surprised to see that the compiler (gcc 7.3 here) had
> > optimized out the loop, meaning the code is broken.
> >
> > Using the documented and approved API not only fixes the bug,
> > it also makes the code more readable.
> >
> > Replacing five this_cpu_read() calls with one this_cpu_ptr()
> > makes the generated code smaller.
>
> It does not for me; the resulting asm is actually larger.
>
> You're quite right that the loop went missing; no idea wth that
> compiler is smoking (gcc-8.2 for me). In order to eliminate that loop
> it needs to think that two consecutive loads of
> this_cpu_read(cyc2ns.seq.sequence) will return the same value. But
> this_cpu_read() is an asm() statement; it _should_ not assume that.
>
> We assume that this_cpu_read() implies READ_ONCE() in a number of
> locations; this really should not happen.
>
> The reason it was written using this_cpu_read() is so that it can use
> %gs: prefixed instructions and avoid ever loading that percpu offset and
> doing manual address computation.
>
> Let me prod at this with a sharp stick.
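
(Aside, to make the failure mode concrete: GCC may treat two non-volatile
asm() statements with identical operands as producing the same value, which
is exactly what lets it collapse the retry loop. A minimal user-space sketch
follows; this is not kernel code and the function names are made up.)

/*
 * Minimal user-space sketch of the failure mode; not kernel code, names
 * made up.  asm_load() mimics this_cpu_read(): a load hidden inside
 * inline asm.  Because the asm is not volatile and its "m" input does
 * not change between the two calls, GCC may assume both calls return
 * the same value, merge the loads, and fold the comparison in
 * seq_retry_needed() to a constant -- the same way it folded the
 * cyc2ns sequence check away.
 */
static inline unsigned int asm_load(const unsigned int *p)
{
	unsigned int val;

	asm("movl %1, %0" : "=r" (val) : "m" (*p));
	return val;
}

static inline unsigned int asm_load_volatile(const unsigned int *p)
{
	unsigned int val;

	/* volatile: every execution matters, so the load cannot be merged */
	asm volatile("movl %1, %0" : "=r" (val) : "m" (*p));
	return val;
}

int seq_retry_needed(const unsigned int *seq)
{
	return asm_load(seq) != asm_load(seq);		/* may fold to 0 */
}

int seq_retry_needed_fixed(const unsigned int *seq)
{
	return asm_load_volatile(seq) != asm_load_volatile(seq);	/* two real loads */
}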

OK, so apart from the inlining being crap, which is fixed by:

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 6490f618e096..638491062fea 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -57,7 +57,7 @@ struct cyc2ns {

static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);

-void cyc2ns_read_begin(struct cyc2ns_data *data)
+void __always_inline cyc2ns_read_begin(struct cyc2ns_data *data)
{
int seq, idx;

@@ -74,7 +74,7 @@ void cyc2ns_read_begin(struct cyc2ns_data *data)
} while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence)));
}

-void cyc2ns_read_end(void)
+void __always_inline cyc2ns_read_end(void)
{
preempt_enable_notrace();
}
@@ -103,7 +103,7 @@ void cyc2ns_read_end(void)
* -johnstul@xxxxxxxxxx "math is hard, lets go shopping!"
*/

-static inline unsigned long long cycles_2_ns(unsigned long long cyc)
+static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
struct cyc2ns_data data;
unsigned long long ns;


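(For reference, so the asm below is easier to map back to C: the read side
being force-inlined looks roughly like this in the tsc.c of that era --
paraphrased, with field names taken from the hunks above.)

/* Paraphrased for reference; not an exact copy of arch/x86/kernel/tsc.c. */
void __always_inline cyc2ns_read_begin(struct cyc2ns_data *data)
{
	int seq, idx;

	preempt_disable_notrace();

	do {
		seq = this_cpu_read(cyc2ns.seq.sequence);
		idx = seq & 1;	/* latch: low bit selects which data[] copy to read */

		data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset);
		data->cyc2ns_mul    = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul);
		data->cyc2ns_shift  = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift);

	} while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence)));
}

static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc)
{
	struct cyc2ns_data data;
	unsigned long long ns;

	cyc2ns_read_begin(&data);

	ns = data.cyc2ns_offset;
	ns += mul_u64_u32_shr(cyc, data.cyc2ns_mul, data.cyc2ns_shift);

	cyc2ns_read_end();

	return ns;
}
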
With that force-inlining applied, native_sched_clock() compiles to:

native_sched_clock:
pushq %rbp #
movq %rsp, %rbp #,

... jump label ...

rdtsc
salq $32, %rdx #, tmp110
orq %rax, %rdx # low, tmp110
movl %gs:cyc2ns+32(%rip),%ecx # cyc2ns.seq.sequence, pfo_ret__
andl $1, %ecx #, idx
salq $4, %rcx #, tmp116
movl %gs:cyc2ns(%rcx),%esi # cyc2ns.data[idx_14].cyc2ns_mul, pfo_ret__
movl %esi, %esi # pfo_ret__, pfo_ret__
movq %rsi, %rax # pfo_ret__, tmp133
mulq %rdx # _23
movq %gs:cyc2ns+8(%rcx),%rdi # cyc2ns.data[idx_14].cyc2ns_offset, pfo_ret__
addq $cyc2ns, %rcx #, tmp117
movl %gs:4(%rcx),%ecx # cyc2ns.data[idx_14].cyc2ns_shift, pfo_ret__
shrdq %rdx, %rax # pfo_ret__,, tmp134
shrq %cl, %rdx # pfo_ret__,
testb $64, %cl #, pfo_ret__
cmovne %rdx, %rax #,, tmp134
addq %rdi, %rax # pfo_ret__, <retval>
popq %rbp #
ret


Note that the sequence-retry loop is still missing above: there is only one
load of cyc2ns.seq.sequence and no backward branch. If we then also fix the
percpu mess, with:

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index e9202a0de8f0..1a19d11cfbbd 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -185,22 +185,22 @@ do { \
typeof(var) pfo_ret__; \
switch (sizeof(var)) { \
case 1: \
- asm(op "b "__percpu_arg(1)",%0" \
+ asm volatile(op "b "__percpu_arg(1)",%0"\
: "=q" (pfo_ret__) \
: "m" (var)); \
break; \
case 2: \
- asm(op "w "__percpu_arg(1)",%0" \
+ asm volatile(op "w "__percpu_arg(1)",%0"\
: "=r" (pfo_ret__) \
: "m" (var)); \
break; \
case 4: \
- asm(op "l "__percpu_arg(1)",%0" \
+ asm volatile(op "l "__percpu_arg(1)",%0"\
: "=r" (pfo_ret__) \
: "m" (var)); \
break; \
case 8: \
- asm(op "q "__percpu_arg(1)",%0" \
+ asm volatile(op "q "__percpu_arg(1)",%0"\
: "=r" (pfo_ret__) \
: "m" (var)); \
break; \


With the percpu reads no longer subject to CSE, that turns into:

native_sched_clock:
pushq %rbp #
movq %rsp, %rbp #,

... jump label ...

rdtsc
salq $32, %rdx #, tmp110
orq %rax, %rdx # low, tmp110
movq %rdx, %r10 # tmp110, _23
movq $cyc2ns, %r9 #, tmp136
.L235:
movl %gs:cyc2ns+32(%rip),%eax # cyc2ns.seq.sequence, pfo_ret__
movl %eax, %ecx # pfo_ret__, idx
andl $1, %ecx #, idx
salq $4, %rcx #, tmp116
addq %r9, %rcx # tmp136, tmp117
movq %gs:8(%rcx),%rdi # cyc2ns.data[idx_14].cyc2ns_offset, pfo_ret__
movl %gs:(%rcx),%esi # cyc2ns.data[idx_14].cyc2ns_mul, pfo_ret__
movl %gs:4(%rcx),%ecx # cyc2ns.data[idx_14].cyc2ns_shift, pfo_ret__
movl %gs:cyc2ns+32(%rip),%r8d # cyc2ns.seq.sequence, pfo_ret__
cmpl %r8d, %eax # pfo_ret__, pfo_ret__
jne .L235 #,
movl %esi, %esi # pfo_ret__, pfo_ret__
movq %rsi, %rax # pfo_ret__, tmp133
mulq %r10 # _23
shrdq %rdx, %rax # pfo_ret__,, tmp134
shrq %cl, %rdx # pfo_ret__,
testb $64, %cl #, pfo_ret__
cmovne %rdx, %rax #,, tmp134
addq %rdi, %rax # pfo_ret__, <retval>
popq %rbp #
ret

which is exactly right, except perhaps for the mess that
mul_u64_u32_shr() turns into.
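
(For reference: the shrdq/shrq/testb $64/cmovne tail is the variable-count
128-bit shift GCC generates for the generic mul_u64_u32_shr(), which on
64-bit with __int128 support is roughly the following -- paraphrased from
include/linux/math64.h.)

/*
 * Roughly the generic 64-bit implementation from include/linux/math64.h
 * (the CONFIG_ARCH_SUPPORTS_INT128 variant); the variable-count shift of
 * an unsigned __int128 is what becomes the shrdq/shrq/testb/cmovne tail
 * in the asm above.
 */
static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift)
{
	return (u64)(((unsigned __int128)a * mul) >> shift);
}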