Re: [RFC] locking/mutex: Optimize __mutex_trylock_fast
From: Will Deacon
Date: Thu Apr 05 2018 - 05:55:37 EST
Hi Peter,
On Thu, Apr 05, 2018 at 11:37:16AM +0200, Peter Zijlstra wrote:
> Subject: locking/mutex: Optimize __mutex_trylock_fast()
> From: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> Date: Thu Apr 5 11:05:35 CEST 2018
>
> Use try_cmpxchg to avoid the pointless TEST instruction..
> And add the (missing) atomic_long_try_cmpxchg*() wrappery.
>
> On x86_64 this gives:
>
> 0000000000000710 <mutex_lock>: 0000000000000710 <mutex_lock>:
> 710: 65 48 8b 14 25 00 00 mov %gs:0x0,%rdx 710: 65 48 8b 14 25 00 00 mov %gs:0x0,%rdx
> 717: 00 00 717: 00 00
> 715: R_X86_64_32S current_task 715: R_X86_64_32S current_task
> 719: 31 c0 xor %eax,%eax 719: 31 c0 xor %eax,%eax
> 71b: f0 48 0f b1 17 lock cmpxchg %rdx,(%rdi) 71b: f0 48 0f b1 17 lock cmpxchg %rdx,(%rdi)
> 720: 48 85 c0 test %rax,%rax 720: 75 02 jne 724 <mutex_lock+0x14>
> 723: 75 02 jne 727 <mutex_lock+0x17> 722: f3 c3 repz retq
> 725: f3 c3 repz retq 724: eb da jmp 700 <__mutex_lock_slowpath>
> 727: eb d7 jmp 700 <__mutex_lock_slowpath>
>
> On ARM64 this gives:
>
> 000000000000638 <mutex_lock>: 0000000000000638 <mutex_lock>:
> 638: d5384101 mrs x1, sp_el0 638: d5384101 mrs x1, sp_el0
> 63c: d2800002 mov x2, #0x0 63c: d2800002 mov x2, #0x0
> 640: f9800011 prfm pstl1strm, [x0] 640: f9800011 prfm pstl1strm, [x0]
> 644: c85ffc03 ldaxr x3, [x0] 644: c85ffc03 ldaxr x3, [x0]
> 648: ca020064 eor x4, x3, x2 648: ca020064 eor x4, x3, x2
> 64c: b5000064 cbnz x4, 658 <mutex_lock+0x20> 64c: b5000064 cbnz x4, 658 <mutex_lock+0x20>
> 650: c8047c01 stxr w4, x1, [x0] 650: c8047c01 stxr w4, x1, [x0]
> 654: 35ffff84 cbnz w4, 644 <mutex_lock+0xc> 654: 35ffff84 cbnz w4, 644 <mutex_lock+0xc>
> 658: b40000c3 cbz x3, 670 <mutex_lock+0x38> 658: b5000043 cbnz x3, 660 <mutex_lock+0x28>
> 65c: a9bf7bfd stp x29, x30, [sp,#-16]! 65c: d65f03c0 ret
> 660: 910003fd mov x29, sp 660: a9bf7bfd stp x29, x30, [sp,#-16]!
> 664: 97ffffef bl 620 <__mutex_lock_slowpath> 664: 910003fd mov x29, sp
> 668: a8c17bfd ldp x29, x30, [sp],#16 668: 97ffffee bl 620 <__mutex_lock_slowpath>
> 66c: d65f03c0 ret 66c: a8c17bfd ldp x29, x30, [sp],#16
> 670: d65f03c0 ret 670: d65f03c0 ret
>
> Which to my untrained eye just looks different, not worse. Will?
Yeah, looks fine. The compiler has just decided to move the slowpath out of
line. One comment on the patch:
>
> Reported-by: Matthew Wilcox <mawilcox@xxxxxxxxxxxxx>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
> ---
> include/asm-generic/atomic-long.h | 17 +++++++++++++++++
> kernel/locking/mutex.c | 3 ++-
> 2 files changed, 19 insertions(+), 1 deletion(-)
>
> --- a/include/asm-generic/atomic-long.h
> +++ b/include/asm-generic/atomic-long.h
> @@ -25,6 +25,7 @@ typedef atomic64_t atomic_long_t;
>
> #define ATOMIC_LONG_INIT(i) ATOMIC64_INIT(i)
> #define ATOMIC_LONG_PFX(x) atomic64 ## x
> +#define ATOMIC_LONG_TYPE s64
>
> #else
>
> @@ -32,6 +33,7 @@ typedef atomic_t atomic_long_t;
>
> #define ATOMIC_LONG_INIT(i) ATOMIC_INIT(i)
> #define ATOMIC_LONG_PFX(x) atomic ## x
> +#define ATOMIC_LONG_TYPE int
>
> #endif
>
> @@ -90,6 +92,21 @@ ATOMIC_LONG_ADD_SUB_OP(sub, _release)
> #define atomic_long_cmpxchg(l, old, new) \
> (ATOMIC_LONG_PFX(_cmpxchg)((ATOMIC_LONG_PFX(_t) *)(l), (old), (new)))
>
> +
> +#define atomic_long_try_cmpxchg_relaxed(l, old, new) \
> + (ATOMIC_LONG_PFX(_try_cmpxchg_relaxed)((ATOMIC_LONG_PFX(_t) *)(l), \
> + (ATOMIC_LONG_TYPE *)(old), (ATOMIC_LONG_TYPE)(new)))
Are the casts on old and new strictly needed? We don't have them for the
non-try versions...
Will