Re: [PATCH] x86/tsc: use real seqcount_latch in cyc2ns_read_begin()
From: Eric Dumazet
Date: Thu Oct 11 2018 - 11:17:59 EST
On Thu, Oct 11, 2018 at 12:31 AM Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
>
> On Wed, Oct 10, 2018 at 05:33:36PM -0700, Eric Dumazet wrote:
> > While looking at native_sched_clock() disassembly I had
> > the surprise to see the compiler (gcc 7.3 here) had
> > optimized out the loop, meaning the code is broken.
> >
> > Using the documented and approved API not only fixes the bug,
> > it also makes the code more readable.
> >
> > Replacing five this_cpu_read() by one this_cpu_ptr() makes
> > the generated code smaller.
>
> Does not for me, that is, the resulting asm is actually larger
[resent in non html]
I counted the number of bytes of text, and really, after my patch,
the code size is smaller.
%gs prefixes are one-byte, but the 32bit offsets are adding up.
Prior version (with the loop reinstated, thanks to one smp_rmb()):
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index b52bd2b6cdb443ba0c89d78aaa52b02b82a10b6e..d4ad30bbaa0cc8bf81da0272829da26f229734bf
100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -71,7 +71,7 @@ void cyc2ns_read_begin(struct cyc2ns_data *data)
data->cyc2ns_offset = this_cpu_read(cyc2ns.data[idx].cyc2ns_offset);
data->cyc2ns_mul = this_cpu_read(cyc2ns.data[idx].cyc2ns_mul);
data->cyc2ns_shift = this_cpu_read(cyc2ns.data[idx].cyc2ns_shift);
-
+ smp_rmb();
} while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence)));
}
Total length : 0xA7 bytes
00000000000002a0 <native_sched_clock>:
2a0: 4c 8d 54 24 08 lea 0x8(%rsp),%r10
2a5: 48 83 e4 f0 and $0xfffffffffffffff0,%rsp
2a9: 41 ff 72 f8 pushq -0x8(%r10)
2ad: 55 push %rbp
2ae: 48 89 e5 mov %rsp,%rbp
2b1: 41 52 push %r10
2b3: e9 60 00 00 00 jmpq 318 <native_sched_clock+0x78>
2b8: 0f 31 rdtsc
2ba: 48 c1 e2 20 shl $0x20,%rdx
2be: 48 09 c2 or %rax,%rdx
2c1: 49 89 d2 mov %rdx,%r10
2c4: 49 c7 c1 00 00 00 00 mov $0x0,%r9
2c7: R_X86_64_32S .data..percpu..shared_aligned
2cb: 65 8b 05 00 00 00 00 mov %gs:0x0(%rip),%eax # 2d2
<native_sched_clock+0x32>
2ce: R_X86_64_PC32 .data..percpu..shared_aligned+0x1c
2d2: 89 c1 mov %eax,%ecx
2d4: 83 e1 01 and $0x1,%ecx
2d7: 48 c1 e1 04 shl $0x4,%rcx
2db: 4c 01 c9 add %r9,%rcx
2de: 65 48 8b 79 08 mov %gs:0x8(%rcx),%rdi
2e3: 65 8b 31 mov %gs:(%rcx),%esi
2e6: 65 8b 49 04 mov %gs:0x4(%rcx),%ecx
2ea: 65 44 8b 05 00 00 00 mov %gs:0x0(%rip),%r8d # 2f2
<native_sched_clock+0x52>
2f1: 00
2ee: R_X86_64_PC32 .data..percpu..shared_aligned+0x1c
2f2: 44 39 c0 cmp %r8d,%eax
2f5: 75 d4 jne 2cb <native_sched_clock+0x2b>
2f7: 89 f6 mov %esi,%esi
2f9: 48 89 f0 mov %rsi,%rax
2fc: 49 f7 e2 mul %r10
2ff: 48 0f ad d0 shrd %cl,%rdx,%rax
303: 48 d3 ea shr %cl,%rdx
306: f6 c1 40 test $0x40,%cl
309: 48 0f 45 c2 cmovne %rdx,%rax
30d: 48 01 f8 add %rdi,%rax
310: 41 5a pop %r10
312: 5d pop %rbp
313: 49 8d 62 f8 lea -0x8(%r10),%rsp
317: c3 retq
318: 48 69 05 00 00 00 00 imul $0x3d0900,0x0(%rip),%rax
# 323 <native_sched_clock+0x83>
31f: 00 09 3d 00
31b: R_X86_64_PC32 jiffies_64-0x8
323: 41 5a pop %r10
325: 48 ba 00 b8 64 d9 45 movabs $0xffc2f745d964b800,%rdx
32c: f7 c2 ff
32f: 5d pop %rbp
330: 49 8d 62 f8 lea -0x8(%r10),%rsp
334: 48 01 d0 add %rdx,%rax
337: c3 retq
New version (my cleanup patch)
Total length = 0x91 bytes
00000000000002a0 <native_sched_clock>:
2a0: 4c 8d 54 24 08 lea 0x8(%rsp),%r10
2a5: 48 83 e4 f0 and $0xfffffffffffffff0,%rsp
2a9: 41 ff 72 f8 pushq -0x8(%r10)
2ad: 55 push %rbp
2ae: 48 89 e5 mov %rsp,%rbp
2b1: 41 52 push %r10
2b3: e9 59 00 00 00 jmpq 311 <native_sched_clock+0x71>
2b8: 0f 31 rdtsc
2ba: 48 c1 e2 20 shl $0x20,%rdx
2be: 48 09 c2 or %rax,%rdx
2c1: 49 89 d1 mov %rdx,%r9
2c4: 49 c7 c0 00 00 00 00 mov $0x0,%r8
2c7: R_X86_64_32S .data..percpu..shared_aligned
2cb: 65 4c 03 05 00 00 00 add %gs:0x0(%rip),%r8 # 2d3
<native_sched_clock+0x33>
2d2: 00
2cf: R_X86_64_PC32 this_cpu_off-0x4
2d3: 41 8b 40 20 mov 0x20(%r8),%eax
2d7: 89 c6 mov %eax,%esi
2d9: 83 e6 01 and $0x1,%esi
2dc: 48 c1 e6 04 shl $0x4,%rsi
2e0: 4c 01 c6 add %r8,%rsi
2e3: 8b 3e mov (%rsi),%edi
2e5: 8b 4e 04 mov 0x4(%rsi),%ecx
2e8: 48 8b 76 08 mov 0x8(%rsi),%rsi
2ec: 41 3b 40 20 cmp 0x20(%r8),%eax
2f0: 75 e1 jne 2d3 <native_sched_clock+0x33>
2f2: 48 89 f8 mov %rdi,%rax
2f5: 49 f7 e1 mul %r9
2f8: 48 0f ad d0 shrd %cl,%rdx,%rax
2fc: 48 d3 ea shr %cl,%rdx
2ff: f6 c1 40 test $0x40,%cl
302: 48 0f 45 c2 cmovne %rdx,%rax
306: 48 01 f0 add %rsi,%rax
309: 41 5a pop %r10
30b: 5d pop %rbp
30c: 49 8d 62 f8 lea -0x8(%r10),%rsp
310: c3 retq
311: 48 69 05 00 00 00 00 imul $0x3d0900,0x0(%rip),%rax
# 31c <native_sched_clock+0x7c>
318: 00 09 3d 00
314: R_X86_64_PC32 jiffies_64-0x8
31c: 41 5a pop %r10
31e: 48 ba 00 b8 64 d9 45 movabs $0xffc2f745d964b800,%rdx
325: f7 c2 ff
328: 5d pop %rbp
329: 49 8d 62 f8 lea -0x8(%r10),%rsp
32d: 48 01 d0 add %rdx,%rax
330: c3 retq
331: