Re: [PATCH 3/3] powerpc: rewrite atomics to use ARCH_ATOMIC

From: Nicholas Piggin
Date: Mon Dec 21 2020 - 22:53:55 EST


Excerpts from Boqun Feng's message of November 14, 2020 1:30 am:
> Hi Nicholas,
>
> On Wed, Nov 11, 2020 at 09:07:23PM +1000, Nicholas Piggin wrote:
>> All the cool kids are doing it.
>>
>> Signed-off-by: Nicholas Piggin <npiggin@xxxxxxxxx>
>> ---
>> arch/powerpc/include/asm/atomic.h | 681 ++++++++++-------------------
>> arch/powerpc/include/asm/cmpxchg.h | 62 +--
>> 2 files changed, 248 insertions(+), 495 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/atomic.h b/arch/powerpc/include/asm/atomic.h
>> index 8a55eb8cc97b..899aa2403ba7 100644
>> --- a/arch/powerpc/include/asm/atomic.h
>> +++ b/arch/powerpc/include/asm/atomic.h
>> @@ -11,185 +11,285 @@
>> #include <asm/cmpxchg.h>
>> #include <asm/barrier.h>
>>
>> +#define ARCH_ATOMIC
>> +
>> +#ifndef CONFIG_64BIT
>> +#include <asm-generic/atomic64.h>
>> +#endif
>> +
>> /*
>> * Since *_return_relaxed and {cmp}xchg_relaxed are implemented with
>> * a "bne-" instruction at the end, so an isync is enough as a acquire barrier
>> * on the platform without lwsync.
>> */
>> #define __atomic_acquire_fence() \
>> - __asm__ __volatile__(PPC_ACQUIRE_BARRIER "" : : : "memory")
>> + asm volatile(PPC_ACQUIRE_BARRIER "" : : : "memory")
>>
>> #define __atomic_release_fence() \
>> - __asm__ __volatile__(PPC_RELEASE_BARRIER "" : : : "memory")
>> + asm volatile(PPC_RELEASE_BARRIER "" : : : "memory")
>>
>> -static __inline__ int atomic_read(const atomic_t *v)
>> -{
>> - int t;
>> +#define __atomic_pre_full_fence smp_mb
>>
>> - __asm__ __volatile__("lwz%U1%X1 %0,%1" : "=r"(t) : "m"(v->counter));
>> +#define __atomic_post_full_fence smp_mb
>>

Thanks for the review.

> Do you need to define __atomic_{pre,post}_full_fence for PPC? IIRC, they
> are default smp_mb__{before,atomic}_atomic(), so are smp_mb() defautly
> on PPC.

Okay I didn't realise that's not required.

>> - return t;
>> +#define arch_atomic_read(v) __READ_ONCE((v)->counter)
>> +#define arch_atomic_set(v, i) __WRITE_ONCE(((v)->counter), (i))
>> +#ifdef CONFIG_64BIT
>> +#define ATOMIC64_INIT(i) { (i) }
>> +#define arch_atomic64_read(v) __READ_ONCE((v)->counter)
>> +#define arch_atomic64_set(v, i) __WRITE_ONCE(((v)->counter), (i))
>> +#endif
>> +
> [...]
>>
>> +#define ATOMIC_FETCH_OP_UNLESS_RELAXED(name, type, dtype, width, asm_op) \
>> +static inline int arch_##name##_relaxed(type *v, dtype a, dtype u) \
>
> I don't think we have atomic_fetch_*_unless_relaxed() at atomic APIs,
> ditto for:
>
> atomic_fetch_add_unless_relaxed()
> atomic_inc_not_zero_relaxed()
> atomic_dec_if_positive_relaxed()
>
> , and we don't have the _acquire() and _release() variants for them
> either, and if you don't define their fully-ordered version (e.g.
> atomic_inc_not_zero()), atomic-arch-fallback.h will use read and cmpxchg
> to implement them, and I think not what we want.

Okay. How can those be added? The atoimc generation is pretty
complicated.

> [...]
>>
>> #endif /* __KERNEL__ */
>> #endif /* _ASM_POWERPC_ATOMIC_H_ */
>> diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h
>> index cf091c4c22e5..181f7e8b3281 100644
>> --- a/arch/powerpc/include/asm/cmpxchg.h
>> +++ b/arch/powerpc/include/asm/cmpxchg.h
>> @@ -192,7 +192,7 @@ __xchg_relaxed(void *ptr, unsigned long x, unsigned int size)
>> (unsigned long)_x_, sizeof(*(ptr))); \
>> })
>>
>> -#define xchg_relaxed(ptr, x) \
>> +#define arch_xchg_relaxed(ptr, x) \
>> ({ \
>> __typeof__(*(ptr)) _x_ = (x); \
>> (__typeof__(*(ptr))) __xchg_relaxed((ptr), \
>> @@ -448,35 +448,7 @@ __cmpxchg_relaxed(void *ptr, unsigned long old, unsigned long new,
>> return old;
>> }
>>
>> -static __always_inline unsigned long
>> -__cmpxchg_acquire(void *ptr, unsigned long old, unsigned long new,
>> - unsigned int size)
>> -{
>> - switch (size) {
>> - case 1:
>> - return __cmpxchg_u8_acquire(ptr, old, new);
>> - case 2:
>> - return __cmpxchg_u16_acquire(ptr, old, new);
>> - case 4:
>> - return __cmpxchg_u32_acquire(ptr, old, new);
>> -#ifdef CONFIG_PPC64
>> - case 8:
>> - return __cmpxchg_u64_acquire(ptr, old, new);
>> -#endif
>> - }
>> - BUILD_BUG_ON_MSG(1, "Unsupported size for __cmpxchg_acquire");
>> - return old;
>> -}
>> -#define cmpxchg(ptr, o, n) \
>> - ({ \
>> - __typeof__(*(ptr)) _o_ = (o); \
>> - __typeof__(*(ptr)) _n_ = (n); \
>> - (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_, \
>> - (unsigned long)_n_, sizeof(*(ptr))); \
>> - })
>> -
>> -
>
> If you remove {atomic_}_cmpxchg_{,_acquire}() and use the version
> provided by atomic-arch-fallback.h, then a fail cmpxchg or
> cmpxchg_acquire() will still result into a full barrier or a acquire
> barrier after the RMW operation, the barrier is not necessary and
> probably this is not what we want?

Why is that done? That seems like a very subtle difference. Shouldn't
the fallback version skip the barrier?

Thanks,
Nick