[PATCH v3 3/4] locking/atomic/x86: Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions

From: Uros Bizjak
Date: Wed Apr 10 2024 - 02:30:33 EST


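Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions, as well
as arch_atomic64_fetch_add(), to use arch_atomic64_try_cmpxchg(). The
old loops started with an expected value of 0, so the first cmpxchg
usually failed and served only to fetch the current value. This
implementation avoids that one extra trip through the cmpxchg loop.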

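The value preload before the cmpxchg loop does not need to be atomic:
a torn read merely makes the first arch_atomic64_try_cmpxchg() fail,
which updates the expected value with the current one. Use
arch_atomic64_read_nonatomic(v) to load the value from the atomic64_t
location in a non-atomic way.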

The generated code improves from:

1917d5: 31 c9 xor %ecx,%ecx
1917d7: 31 db xor %ebx,%ebx
1917d9: 89 4c 24 3c mov %ecx,0x3c(%esp)
1917dd: 8b 74 24 24 mov 0x24(%esp),%esi
1917e1: 89 c8 mov %ecx,%eax
1917e3: 89 5c 24 34 mov %ebx,0x34(%esp)
1917e7: 8b 7c 24 28 mov 0x28(%esp),%edi
1917eb: 21 ce and %ecx,%esi
1917ed: 89 74 24 4c mov %esi,0x4c(%esp)
1917f1: 21 df and %ebx,%edi
1917f3: 89 de mov %ebx,%esi
1917f5: 89 7c 24 50 mov %edi,0x50(%esp)
1917f9: 8b 54 24 4c mov 0x4c(%esp),%edx
1917fd: 8b 7c 24 2c mov 0x2c(%esp),%edi
191801: 8b 4c 24 50 mov 0x50(%esp),%ecx
191805: 89 d3 mov %edx,%ebx
191807: 89 f2 mov %esi,%edx
191809: f0 0f c7 0f lock cmpxchg8b (%edi)
19180d: 89 c1 mov %eax,%ecx
19180f: 8b 74 24 34 mov 0x34(%esp),%esi
191813: 89 d3 mov %edx,%ebx
191815: 89 44 24 4c mov %eax,0x4c(%esp)
191819: 8b 44 24 3c mov 0x3c(%esp),%eax
19181d: 89 df mov %ebx,%edi
19181f: 89 54 24 44 mov %edx,0x44(%esp)
191823: 89 ca mov %ecx,%edx
191825: 31 de xor %ebx,%esi
191827: 31 c8 xor %ecx,%eax
191829: 09 f0 or %esi,%eax
19182b: 75 ac jne 1917d9 <...>

to:

1912ba: 8b 06 mov (%esi),%eax
1912bc: 8b 56 04 mov 0x4(%esi),%edx
1912bf: 89 44 24 3c mov %eax,0x3c(%esp)
1912c3: 89 c1 mov %eax,%ecx
1912c5: 23 4c 24 34 and 0x34(%esp),%ecx
1912c9: 89 d3 mov %edx,%ebx
1912cb: 23 5c 24 38 and 0x38(%esp),%ebx
1912cf: 89 54 24 40 mov %edx,0x40(%esp)
1912d3: 89 4c 24 2c mov %ecx,0x2c(%esp)
1912d7: 89 5c 24 30 mov %ebx,0x30(%esp)
1912db: 8b 5c 24 2c mov 0x2c(%esp),%ebx
1912df: 8b 4c 24 30 mov 0x30(%esp),%ecx
1912e3: f0 0f c7 0e lock cmpxchg8b (%esi)
1912e7: 0f 85 f3 02 00 00 jne 1915e0 <...>

Signed-off-by: Uros Bizjak <ubizjak@xxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Borislav Petkov <bp@xxxxxxxxx>
Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
Cc: "H. Peter Anvin" <hpa@xxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
---
v2: Use arch_atomic64_read_nonatomic().
---
arch/x86/include/asm/atomic64_32.h | 43 +++++++++++++-----------------
1 file changed, 18 insertions(+), 25 deletions(-)

diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
index bc76a88ae481..8db2ec4d6cda 100644
--- a/arch/x86/include/asm/atomic64_32.h
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -227,69 +227,62 @@ static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v)

static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_nonatomic(v);

- while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i));
}

static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_nonatomic(v);

- while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i));

- return old;
+ return val;
}
#define arch_atomic64_fetch_and arch_atomic64_fetch_and

static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_nonatomic(v);

- while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i));
}

static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_nonatomic(v);

- while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i));

- return old;
+ return val;
}
#define arch_atomic64_fetch_or arch_atomic64_fetch_or

static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_nonatomic(v);

- while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i));
}

static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_nonatomic(v);

- while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i));

- return old;
+ return val;
}
#define arch_atomic64_fetch_xor arch_atomic64_fetch_xor

static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
{
- s64 old, c = 0;
+ s64 val = arch_atomic64_read_nonatomic(v);

- while ((old = arch_atomic64_cmpxchg(v, c, c + i)) != c)
- c = old;
+ do { } while (!arch_atomic64_try_cmpxchg(v, &val, val + i));

- return old;
+ return val;
}
#define arch_atomic64_fetch_add arch_atomic64_fetch_add

--
2.44.0