Re: [PATCH 2/6] locking/atomic/x86: Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions

From: Mark Rutland
Date: Tue Apr 09 2024 - 07:13:24 EST

Next message: Asbjørn Sloth Tønnesen: "Re: [PATCH net-next 1/6] flow_offload: add flow_rule_no_unsupp_control_flags()"
Previous message: Oleg Nesterov: "Re: [PATCH] selftests/timers/posix_timers: reimplement check_timer_distribution()"
In reply to: Uros Bizjak: "[PATCH 2/6] locking/atomic/x86: Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions"
Next in thread: Uros Bizjak: "Re: [PATCH 2/6] locking/atomic/x86: Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

On Tue, Apr 09, 2024 at 12:03:53PM +0200, Uros Bizjak wrote:
> Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions to
> use arch_atomic64_try_cmpxchg. This implementation avoids one extra
> trip through the cmpxchg loop.
>
> The value preload before the cmpxchg loop does not need to be atomic,
> but should use READ_ONCE to prevent the compiler from merging, refetching
> or reordering the read.
>
> The generated code improves from:
>
> 1917d5: 31 c9 xor %ecx,%ecx
> 1917d7: 31 db xor %ebx,%ebx
> 1917d9: 89 4c 24 3c mov %ecx,0x3c(%esp)
> 1917dd: 8b 74 24 24 mov 0x24(%esp),%esi
> 1917e1: 89 c8 mov %ecx,%eax
> 1917e3: 89 5c 24 34 mov %ebx,0x34(%esp)
> 1917e7: 8b 7c 24 28 mov 0x28(%esp),%edi
> 1917eb: 21 ce and %ecx,%esi
> 1917ed: 89 74 24 4c mov %esi,0x4c(%esp)
> 1917f1: 21 df and %ebx,%edi
> 1917f3: 89 de mov %ebx,%esi
> 1917f5: 89 7c 24 50 mov %edi,0x50(%esp)
> 1917f9: 8b 54 24 4c mov 0x4c(%esp),%edx
> 1917fd: 8b 7c 24 2c mov 0x2c(%esp),%edi
> 191801: 8b 4c 24 50 mov 0x50(%esp),%ecx
> 191805: 89 d3 mov %edx,%ebx
> 191807: 89 f2 mov %esi,%edx
> 191809: f0 0f c7 0f lock cmpxchg8b (%edi)
> 19180d: 89 c1 mov %eax,%ecx
> 19180f: 8b 74 24 34 mov 0x34(%esp),%esi
> 191813: 89 d3 mov %edx,%ebx
> 191815: 89 44 24 4c mov %eax,0x4c(%esp)
> 191819: 8b 44 24 3c mov 0x3c(%esp),%eax
> 19181d: 89 df mov %ebx,%edi
> 19181f: 89 54 24 44 mov %edx,0x44(%esp)
> 191823: 89 ca mov %ecx,%edx
> 191825: 31 de xor %ebx,%esi
> 191827: 31 c8 xor %ecx,%eax
> 191829: 09 f0 or %esi,%eax
> 19182b: 75 ac jne 1917d9 <...>
>
> to:
>
> 1912ba: 8b 06 mov (%esi),%eax
> 1912bc: 8b 56 04 mov 0x4(%esi),%edx
> 1912bf: 89 44 24 3c mov %eax,0x3c(%esp)
> 1912c3: 89 c1 mov %eax,%ecx
> 1912c5: 23 4c 24 34 and 0x34(%esp),%ecx
> 1912c9: 89 d3 mov %edx,%ebx
> 1912cb: 23 5c 24 38 and 0x38(%esp),%ebx
> 1912cf: 89 54 24 40 mov %edx,0x40(%esp)
> 1912d3: 89 4c 24 2c mov %ecx,0x2c(%esp)
> 1912d7: 89 5c 24 30 mov %ebx,0x30(%esp)
> 1912db: 8b 5c 24 2c mov 0x2c(%esp),%ebx
> 1912df: 8b 4c 24 30 mov 0x30(%esp),%ecx
> 1912e3: f0 0f c7 0e lock cmpxchg8b (%esi)
> 1912e7: 0f 85 f3 02 00 00 jne 1915e0 <...>
>
> Signed-off-by: Uros Bizjak <ubizjak@xxxxxxxxx>
> Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> Cc: Ingo Molnar <mingo@xxxxxxxxxx>
> Cc: Borislav Petkov <bp@xxxxxxxxx>
> Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
> Cc: "H. Peter Anvin" <hpa@xxxxxxxxx>
> Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> ---
> arch/x86/include/asm/atomic64_32.h | 44 ++++++++++++------------------
> 1 file changed, 18 insertions(+), 26 deletions(-)
>
> diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
> index 11e817dab44a..84affd7a5d1c 100644
> --- a/arch/x86/include/asm/atomic64_32.h
> +++ b/arch/x86/include/asm/atomic64_32.h
> @@ -201,69 +201,61 @@ static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v)
>
> static __always_inline void arch_atomic64_and(s64 i, atomic64_t *v)
> {
> - s64 old, c = 0;
> + s64 val = __READ_ONCE(v->counter);

I reckon it's worth placing this in a helper with a big comment, e.g.

static __always_inline s64 arch_atomic64_read_tearable(atomic64_t *v)
{
/*
* TODO: explain that this might be torn, but it occurs *once*, and can
* safely be consumed by atomic64_try_cmpxchg().
*
* TODO: point to the existing commentary regarding why we use
* __READ_ONCE() for KASAN reasons.
*/
return __READ_ONCE(v->counter);
}

... and then use that in each of the instances below.

That way the subtlety is clearly documented, and it'd more clearly align with
the x86_64 versions.

Mark.

>
> - while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c)
> - c = old;
> + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i));
> }
>
> static __always_inline s64 arch_atomic64_fetch_and(s64 i, atomic64_t *v)
> {
> - s64 old, c = 0;
> + s64 val = __READ_ONCE(v->counter);
>
> - while ((old = arch_atomic64_cmpxchg(v, c, c & i)) != c)
> - c = old;
> + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val & i));
>
> - return old;
> + return val;
> }
> #define arch_atomic64_fetch_and arch_atomic64_fetch_and
>
> static __always_inline void arch_atomic64_or(s64 i, atomic64_t *v)
> {
> - s64 old, c = 0;
> + s64 val = __READ_ONCE(v->counter);
>
> - while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c)
> - c = old;
> + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i));
> }
>
> static __always_inline s64 arch_atomic64_fetch_or(s64 i, atomic64_t *v)
> {
> - s64 old, c = 0;
> + s64 val = __READ_ONCE(v->counter);
>
> - while ((old = arch_atomic64_cmpxchg(v, c, c | i)) != c)
> - c = old;
> + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val | i));
>
> - return old;
> + return val;
> }
> #define arch_atomic64_fetch_or arch_atomic64_fetch_or
>
> static __always_inline void arch_atomic64_xor(s64 i, atomic64_t *v)
> {
> - s64 old, c = 0;
> + s64 val = __READ_ONCE(v->counter);
>
> - while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c)
> - c = old;
> + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i));
> }
>
> static __always_inline s64 arch_atomic64_fetch_xor(s64 i, atomic64_t *v)
> {
> - s64 old, c = 0;
> + s64 val = __READ_ONCE(v->counter);
>
> - while ((old = arch_atomic64_cmpxchg(v, c, c ^ i)) != c)
> - c = old;
> + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val ^ i));
>
> - return old;
> + return val;
> }
> #define arch_atomic64_fetch_xor arch_atomic64_fetch_xor
>
> static __always_inline s64 arch_atomic64_fetch_add(s64 i, atomic64_t *v)
> {
> - s64 old, c = 0;
> + s64 val = __READ_ONCE(v->counter);
>
> - while ((old = arch_atomic64_cmpxchg(v, c, c + i)) != c)
> - c = old;
> -
> - return old;
> + do { } while (!arch_atomic64_try_cmpxchg(v, &val, val + i));
> + return val;
> }
> #define arch_atomic64_fetch_add arch_atomic64_fetch_add
>
> --
> 2.44.0
>
>

Next message: Asbjørn Sloth Tønnesen: "Re: [PATCH net-next 1/6] flow_offload: add flow_rule_no_unsupp_control_flags()"
Previous message: Oleg Nesterov: "Re: [PATCH] selftests/timers/posix_timers: reimplement check_timer_distribution()"
In reply to: Uros Bizjak: "[PATCH 2/6] locking/atomic/x86: Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions"
Next in thread: Uros Bizjak: "Re: [PATCH 2/6] locking/atomic/x86: Rewrite x86_32 arch_atomic64_{,fetch}_{and,or,xor}() functions"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]