Re: locking/atomic: Introduce atomic_try_cmpxchg()

From: Dmitry Vyukov
Date: Fri Mar 24 2017 - 14:19:28 EST


On Fri, Mar 24, 2017 at 7:08 PM, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
> On Fri, Mar 24, 2017 at 06:51:15PM +0100, Dmitry Vyukov wrote:
>> On Fri, Mar 24, 2017 at 6:23 PM, Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
>> > On Fri, Mar 24, 2017 at 09:54:46AM -0700, Andy Lutomirski wrote:
>> >> > So the first snipped I tested regressed like so:
>> >> >
>> >> >
>> >> > 0000000000000000 <T_refcount_inc>: 0000000000000000 <T_refcount_inc>:
>> >> > 0: 8b 07 mov (%rdi),%eax 0: 8b 17 mov (%rdi),%edx
>> >> > 2: 83 f8 ff cmp $0xffffffff,%eax 2: 83 fa ff cmp $0xffffffff,%edx
>> >> > 5: 74 13 je 1a <T_refcount_inc+0x1a> 5: 74 1a je 21 <T_refcount_inc+0x21>
>> >> > 7: 85 c0 test %eax,%eax 7: 85 d2 test %edx,%edx
>> >> > 9: 74 0d je 18 <T_refcount_inc+0x18> 9: 74 13 je 1e <T_refcount_inc+0x1e>
>> >> > b: 8d 50 01 lea 0x1(%rax),%edx b: 8d 4a 01 lea 0x1(%rdx),%ecx
>> >> > e: f0 0f b1 17 lock cmpxchg %edx,(%rdi) e: 89 d0 mov %edx,%eax
>> >> > 12: 75 ee jne 2 <T_refcount_inc+0x2> 10: f0 0f b1 0f lock cmpxchg %ecx,(%rdi)
>> >> > 14: ff c2 inc %edx 14: 74 04 je 1a <T_refcount_inc+0x1a>
>> >> > 16: 75 02 jne 1a <T_refcount_inc+0x1a> 16: 89 c2 mov %eax,%edx
>> >> > 18: 0f 0b ud2 18: eb e8 jmp 2 <T_refcount_inc+0x2>
>> >> > 1a: c3 retq 1a: ff c1 inc %ecx
>> >> > 1c: 75 03 jne 21 <T_refcount_inc+0x21>
>> >> > 1e: 0f 0b ud2
>> >> > 20: c3 retq
>> >> > 21: c3 retq
>> >>
>
>> This seems to help ;)
>>
>> #define try_cmpxchg(ptr, pold, new) __atomic_compare_exchange_n(ptr, pold, new, 0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)
>
> That gets me:
>
> 0000000000000000 <T_refcount_inc>:
> 0: 8b 07 mov (%rdi),%eax
> 2: 89 44 24 fc mov %eax,-0x4(%rsp)
> 6: 8b 44 24 fc mov -0x4(%rsp),%eax
> a: 83 f8 ff cmp $0xffffffff,%eax
> d: 74 1c je 2b <T_refcount_inc+0x2b>
> f: 85 c0 test %eax,%eax
> 11: 75 07 jne 1a <T_refcount_inc+0x1a>
> 13: 8b 44 24 fc mov -0x4(%rsp),%eax
> 17: 0f 0b ud2
> 19: c3 retq
> 1a: 8d 50 01 lea 0x1(%rax),%edx
> 1d: 8b 44 24 fc mov -0x4(%rsp),%eax
> 21: f0 0f b1 17 lock cmpxchg %edx,(%rdi)
> 25: 75 db jne 2 <T_refcount_inc+0x2>
> 27: ff c2 inc %edx
> 29: 74 e8 je 13 <T_refcount_inc+0x13>
> 2b: c3 retq
>
>
> Which is even worse... (I did double check it actually compiled)


For me gcc 7.0.1 generates:

0000000000000330 <T_refcount_inc1>:
330: 55 push %rbp
331: 8b 07 mov (%rdi),%eax
333: 48 89 e5 mov %rsp,%rbp
336: 83 f8 ff cmp $0xffffffff,%eax
339: 74 12 je 34d <T_refcount_inc1+0x1d>
33b: 85 c0 test %eax,%eax
33d: 74 10 je 34f <T_refcount_inc1+0x1f>
33f: 8d 50 01 lea 0x1(%rax),%edx
342: f0 0f b1 17 lock cmpxchg %edx,(%rdi)
346: 75 ee jne 336 <T_refcount_inc1+0x6>
348: 83 fa ff cmp $0xffffffff,%edx
34b: 74 04 je 351 <T_refcount_inc1+0x21>
34d: 5d pop %rbp
34e: c3 retq
34f: 31 c0 xor %eax,%eax
351: 0f 0b ud2
353: 5d pop %rbp
354: c3 retq

with:

if (!success) \
*_old = __old; \

0000000000000320 <T_refcount_inc1>:
320: 8b 0f mov (%rdi),%ecx
322: 55 push %rbp
323: 48 89 e5 mov %rsp,%rbp
326: 83 f9 ff cmp $0xffffffff,%ecx
329: 74 2d je 358 <T_refcount_inc1+0x38>
32b: 85 c9 test %ecx,%ecx
32d: 74 25 je 354 <T_refcount_inc1+0x34>
32f: 8d 71 01 lea 0x1(%rcx),%esi
332: 89 c8 mov %ecx,%eax
334: f0 0f b1 37 lock cmpxchg %esi,(%rdi)
338: 89 c2 mov %eax,%edx
33a: 74 20 je 35c <T_refcount_inc1+0x3c>
33c: 83 fa ff cmp $0xffffffff,%edx
33f: 74 17 je 358 <T_refcount_inc1+0x38>
341: 85 d2 test %edx,%edx
343: 74 0f je 354 <T_refcount_inc1+0x34>
345: 8d 72 01 lea 0x1(%rdx),%esi
348: 89 d0 mov %edx,%eax
34a: f0 0f b1 37 lock cmpxchg %esi,(%rdi)
34e: 74 0a je 35a <T_refcount_inc1+0x3a>
350: 89 c2 mov %eax,%edx
352: eb e8 jmp 33c <T_refcount_inc1+0x1c>
354: 31 c9 xor %ecx,%ecx
356: 0f 0b ud2
358: 5d pop %rbp
359: c3 retq
35a: 89 d1 mov %edx,%ecx
35c: 83 fe ff cmp $0xffffffff,%esi
35f: 74 f5 je 356 <T_refcount_inc1+0x36>
361: 5d pop %rbp
362: c3 retq


with __atomic_compare_exchange_n:

exactly the same as the original code.


But I don't have an answer for runtime patching of LOCK.
Looks like something to fix in gcc.