Re: [PATCH] x86/tsc: use real seqcount_latch in cyc2ns_read_begin()

From: Peter Zijlstra
Date: Thu Oct 11 2018 - 11:24:52 EST


On Thu, Oct 11, 2018 at 08:00:42AM -0700, Eric Dumazet wrote:
> Yes, but the code size is bigger (I have looked at the disassembly)
>
> All these %gs plus offset add up

> Total length : 0xA7 bytes

effective length: 0x78 bytes

> 00000000000002a0 <native_sched_clock>:
> 2a0: 4c 8d 54 24 08 lea 0x8(%rsp),%r10
> 2a5: 48 83 e4 f0 and $0xfffffffffffffff0,%rsp
> 2a9: 41 ff 72 f8 pushq -0x8(%r10)
> 2ad: 55 push %rbp
> 2ae: 48 89 e5 mov %rsp,%rbp
> 2b1: 41 52 push %r10

2b3: 66 66 66 66 90 k8_nop5_atomic

> 2b8: 0f 31 rdtsc
> 2ba: 48 c1 e2 20 shl $0x20,%rdx
> 2be: 48 09 c2 or %rax,%rdx
> 2c1: 49 89 d2 mov %rdx,%r10
> 2c4: 49 c7 c1 00 00 00 00 mov $0x0,%r9
> 2c7: R_X86_64_32S .data..percpu..shared_aligned
> 2cb: 65 8b 05 00 00 00 00 mov %gs:0x0(%rip),%eax # 2d2
> <native_sched_clock+0x32>
> 2ce: R_X86_64_PC32 .data..percpu..shared_aligned+0x1c
> 2d2: 89 c1 mov %eax,%ecx
> 2d4: 83 e1 01 and $0x1,%ecx
> 2d7: 48 c1 e1 04 shl $0x4,%rcx
> 2db: 4c 01 c9 add %r9,%rcx
> 2de: 65 48 8b 79 08 mov %gs:0x8(%rcx),%rdi
> 2e3: 65 8b 31 mov %gs:(%rcx),%esi
> 2e6: 65 8b 49 04 mov %gs:0x4(%rcx),%ecx
> 2ea: 65 44 8b 05 00 00 00 mov %gs:0x0(%rip),%r8d # 2f2
> <native_sched_clock+0x52>
> 2f1: 00
> 2ee: R_X86_64_PC32 .data..percpu..shared_aligned+0x1c
> 2f2: 44 39 c0 cmp %r8d,%eax
> 2f5: 75 d4 jne 2cb <native_sched_clock+0x2b>
> 2f7: 89 f6 mov %esi,%esi
> 2f9: 48 89 f0 mov %rsi,%rax
> 2fc: 49 f7 e2 mul %r10
> 2ff: 48 0f ad d0 shrd %cl,%rdx,%rax
> 303: 48 d3 ea shr %cl,%rdx
> 306: f6 c1 40 test $0x40,%cl
> 309: 48 0f 45 c2 cmovne %rdx,%rax
> 30d: 48 01 f8 add %rdi,%rax
> 310: 41 5a pop %r10
> 312: 5d pop %rbp
> 313: 49 8d 62 f8 lea -0x8(%r10),%rsp
> 317: c3 retq
>
> New version :
>
> Total length = 0x91 bytes

effective: 0x71

>
> 00000000000002a0 <native_sched_clock>:
> 2a0: 4c 8d 54 24 08 lea 0x8(%rsp),%r10
> 2a5: 48 83 e4 f0 and $0xfffffffffffffff0,%rsp
> 2a9: 41 ff 72 f8 pushq -0x8(%r10)
> 2ad: 55 push %rbp
> 2ae: 48 89 e5 mov %rsp,%rbp
> 2b1: 41 52 push %r10

2b3: 66 66 66 66 90 k8_nop5_atomic

> 2b8: 0f 31 rdtsc
> 2ba: 48 c1 e2 20 shl $0x20,%rdx
> 2be: 48 09 c2 or %rax,%rdx
> 2c1: 49 89 d1 mov %rdx,%r9
> 2c4: 49 c7 c0 00 00 00 00 mov $0x0,%r8
> 2c7: R_X86_64_32S .data..percpu..shared_aligned
> 2cb: 65 4c 03 05 00 00 00 add %gs:0x0(%rip),%r8 # 2d3
> <native_sched_clock+0x33>
> 2d2: 00
> 2cf: R_X86_64_PC32 this_cpu_off-0x4
> 2d3: 41 8b 40 20 mov 0x20(%r8),%eax
> 2d7: 89 c6 mov %eax,%esi
> 2d9: 83 e6 01 and $0x1,%esi
> 2dc: 48 c1 e6 04 shl $0x4,%rsi
> 2e0: 4c 01 c6 add %r8,%rsi
> 2e3: 8b 3e mov (%rsi),%edi
> 2e5: 8b 4e 04 mov 0x4(%rsi),%ecx
> 2e8: 48 8b 76 08 mov 0x8(%rsi),%rsi
> 2ec: 41 3b 40 20 cmp 0x20(%r8),%eax
> 2f0: 75 e1 jne 2d3 <native_sched_clock+0x33>
> 2f2: 48 89 f8 mov %rdi,%rax
> 2f5: 49 f7 e1 mul %r9
> 2f8: 48 0f ad d0 shrd %cl,%rdx,%rax
> 2fc: 48 d3 ea shr %cl,%rdx
> 2ff: f6 c1 40 test $0x40,%cl
> 302: 48 0f 45 c2 cmovne %rdx,%rax
> 306: 48 01 f0 add %rsi,%rax
> 309: 41 5a pop %r10
> 30b: 5d pop %rbp
> 30c: 49 8d 62 f8 lea -0x8(%r10),%rsp
> 310: c3 retq

Ah, right you are. But my version only touches the one cacheline,
whereas yours will do that extra cpu offset load, which might or might
not be hot.

Difficult..

You have some weird stack setup though.. mine doesn't have that:

$ objdump -dr defconfig-build/arch/x86/kernel/tsc.o | awk '/<native_sched_clock>:$/ {p=1} /^$/ {p=0} {if (p) print $0}'
0000000000000b00 <native_sched_clock>:
b00: 55 push %rbp
b01: 48 89 e5 mov %rsp,%rbp
b04: 66 66 66 66 90 k8_nop5_atomic
b09: 0f 31 rdtsc
b0b: 48 c1 e2 20 shl $0x20,%rdx
b0f: 48 09 c2 or %rax,%rdx
b12: 49 89 d2 mov %rdx,%r10
b15: 49 c7 c1 00 00 00 00 mov $0x0,%r9
b18: R_X86_64_32S .data..percpu..shared_aligned
b1c: 65 8b 05 00 00 00 00 mov %gs:0x0(%rip),%eax # b23 <native_sched_clock+0x23>
b1f: R_X86_64_PC32 .data..percpu..shared_aligned+0x1c
b23: 89 c1 mov %eax,%ecx
b25: 83 e1 01 and $0x1,%ecx
b28: 48 c1 e1 04 shl $0x4,%rcx
b2c: 4c 01 c9 add %r9,%rcx
b2f: 65 48 8b 79 08 mov %gs:0x8(%rcx),%rdi
b34: 65 8b 31 mov %gs:(%rcx),%esi
b37: 65 8b 49 04 mov %gs:0x4(%rcx),%ecx
b3b: 65 44 8b 05 00 00 00 mov %gs:0x0(%rip),%r8d # b43 <native_sched_clock+0x43>
b42: 00
b3f: R_X86_64_PC32 .data..percpu..shared_aligned+0x1c
b43: 44 39 c0 cmp %r8d,%eax
b46: 75 d4 jne b1c <native_sched_clock+0x1c>
b48: 89 f6 mov %esi,%esi
b4a: 48 89 f0 mov %rsi,%rax
b4d: 49 f7 e2 mul %r10
b50: 48 0f ad d0 shrd %cl,%rdx,%rax
b54: 48 d3 ea shr %cl,%rdx
b57: f6 c1 40 test $0x40,%cl
b5a: 48 0f 45 c2 cmovne %rdx,%rax
b5e: 48 01 f8 add %rdi,%rax
b61: 5d pop %rbp
b62: c3 retq

Which gets me to 0x62 effective bytes.