Re: [PATCH v5 1/3] powerpc/bitops: Use immediate operand when possible

From: Michael Ellerman
Date: Tue Nov 30 2021 - 06:40:26 EST


LEROY Christophe <christophe.leroy@xxxxxxxxxx> writes:
> Hi Michael,
>
> Any chance to get this series merged this cycle ?

Yeah. I hesitated a bit at the changes which make the atomic asm even
more complicated, but I guess it's worth it.

cheers

> Le 21/09/2021 à 17:09, Christophe Leroy a écrit :
>> Today we get the following code generation for bitops like
>> set or clear bit:
>>
>> c0009fe0: 39 40 08 00 li r10,2048
>> c0009fe4: 7c e0 40 28 lwarx r7,0,r8
>> c0009fe8: 7c e7 53 78 or r7,r7,r10
>> c0009fec: 7c e0 41 2d stwcx. r7,0,r8
>>
>> c000d568: 39 00 18 00 li r8,6144
>> c000d56c: 7c c0 38 28 lwarx r6,0,r7
>> c000d570: 7c c6 40 78 andc r6,r6,r8
>> c000d574: 7c c0 39 2d stwcx. r6,0,r7
>>
>> Most set bits are constant on lower 16 bits, so it can easily
>> be replaced by the "immediate" version of the operation. Allow
>> GCC to choose between the normal or immediate form.
>>
>> For clear bits, on 32 bits 'rlwinm' can be used instead of 'andc' for
>> when all bits to be cleared are consecutive.
>>
>> On 64 bits we don't have any equivalent single operation for clearing,
>> single bits or a few bits, we'd need two 'rldicl' so it is not
>> worth it, the li/andc sequence is doing the same.
>>
>> With this patch we get:
>>
>> c0009fe0: 7d 00 50 28 lwarx r8,0,r10
>> c0009fe4: 61 08 08 00 ori r8,r8,2048
>> c0009fe8: 7d 00 51 2d stwcx. r8,0,r10
>>
>> c000d558: 7c e0 40 28 lwarx r7,0,r8
>> c000d55c: 54 e7 05 64 rlwinm r7,r7,0,21,18
>> c000d560: 7c e0 41 2d stwcx. r7,0,r8
>>
>> On pmac32_defconfig, it reduces the text by approx 10 kbytes.
>>
>> Signed-off-by: Christophe Leroy <christophe.leroy@xxxxxxxxxx>
>> Reviewed-by: Segher Boessenkool <segher@xxxxxxxxxxxxxxxxxxx>
>> ---
>> v5: Fixed the argument of is_rlwinm_mask_valid() in test_and_clear_bits()
>>
>> v4: Rebased
>>
>> v3:
>> - Using the mask validation proposed by Segher
>>
>> v2:
>> - Use "n" instead of "i" as constraint for the rlwinm mask
>> - Improve mask verification to handle more than single bit masks
>>
>> Signed-off-by: Christophe Leroy <christophe.leroy@xxxxxxxxxx>
>> ---
>> arch/powerpc/include/asm/bitops.h | 89 ++++++++++++++++++++++++++++---
>> 1 file changed, 81 insertions(+), 8 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h
>> index 11847b6a244e..a05d8c62cbea 100644
>> --- a/arch/powerpc/include/asm/bitops.h
>> +++ b/arch/powerpc/include/asm/bitops.h
>> @@ -71,19 +71,61 @@ static inline void fn(unsigned long mask, \
>> __asm__ __volatile__ ( \
>> prefix \
>> "1:" PPC_LLARX "%0,0,%3,0\n" \
>> - stringify_in_c(op) "%0,%0,%2\n" \
>> + #op "%I2 %0,%0,%2\n" \
>> PPC_STLCX "%0,0,%3\n" \
>> "bne- 1b\n" \
>> : "=&r" (old), "+m" (*p) \
>> - : "r" (mask), "r" (p) \
>> + : "rK" (mask), "r" (p) \
>> : "cc", "memory"); \
>> }
>>
>> DEFINE_BITOP(set_bits, or, "")
>> -DEFINE_BITOP(clear_bits, andc, "")
>> -DEFINE_BITOP(clear_bits_unlock, andc, PPC_RELEASE_BARRIER)
>> DEFINE_BITOP(change_bits, xor, "")
>>
>> +static __always_inline bool is_rlwinm_mask_valid(unsigned long x)
>> +{
>> + if (!x)
>> + return false;
>> + if (x & 1)
>> + x = ~x; // make the mask non-wrapping
>> + x += x & -x; // adding the low set bit results in at most one bit set
>> +
>> + return !(x & (x - 1));
>> +}
>> +
>> +#define DEFINE_CLROP(fn, prefix) \
>> +static inline void fn(unsigned long mask, volatile unsigned long *_p) \
>> +{ \
>> + unsigned long old; \
>> + unsigned long *p = (unsigned long *)_p; \
>> + \
>> + if (IS_ENABLED(CONFIG_PPC32) && \
>> + __builtin_constant_p(mask) && is_rlwinm_mask_valid(~mask)) {\
>> + asm volatile ( \
>> + prefix \
>> + "1:" "lwarx %0,0,%3\n" \
>> + "rlwinm %0,%0,0,%2\n" \
>> + "stwcx. %0,0,%3\n" \
>> + "bne- 1b\n" \
>> + : "=&r" (old), "+m" (*p) \
>> + : "n" (~mask), "r" (p) \
>> + : "cc", "memory"); \
>> + } else { \
>> + asm volatile ( \
>> + prefix \
>> + "1:" PPC_LLARX "%0,0,%3,0\n" \
>> + "andc %0,%0,%2\n" \
>> + PPC_STLCX "%0,0,%3\n" \
>> + "bne- 1b\n" \
>> + : "=&r" (old), "+m" (*p) \
>> + : "r" (mask), "r" (p) \
>> + : "cc", "memory"); \
>> + } \
>> +}
>> +
>> +DEFINE_CLROP(clear_bits, "")
>> +DEFINE_CLROP(clear_bits_unlock, PPC_RELEASE_BARRIER)
>> +
>> static inline void arch_set_bit(int nr, volatile unsigned long *addr)
>> {
>> set_bits(BIT_MASK(nr), addr + BIT_WORD(nr));
>> @@ -116,12 +158,12 @@ static inline unsigned long fn( \
>> __asm__ __volatile__ ( \
>> prefix \
>> "1:" PPC_LLARX "%0,0,%3,%4\n" \
>> - stringify_in_c(op) "%1,%0,%2\n" \
>> + #op "%I2 %1,%0,%2\n" \
>> PPC_STLCX "%1,0,%3\n" \
>> "bne- 1b\n" \
>> postfix \
>> : "=&r" (old), "=&r" (t) \
>> - : "r" (mask), "r" (p), "i" (IS_ENABLED(CONFIG_PPC64) ? eh : 0) \
>> + : "rK" (mask), "r" (p), "i" (IS_ENABLED(CONFIG_PPC64) ? eh : 0) \
>> : "cc", "memory"); \
>> return (old & mask); \
>> }
>> @@ -130,11 +172,42 @@ DEFINE_TESTOP(test_and_set_bits, or, PPC_ATOMIC_ENTRY_BARRIER,
>> PPC_ATOMIC_EXIT_BARRIER, 0)
>> DEFINE_TESTOP(test_and_set_bits_lock, or, "",
>> PPC_ACQUIRE_BARRIER, 1)
>> -DEFINE_TESTOP(test_and_clear_bits, andc, PPC_ATOMIC_ENTRY_BARRIER,
>> - PPC_ATOMIC_EXIT_BARRIER, 0)
>> DEFINE_TESTOP(test_and_change_bits, xor, PPC_ATOMIC_ENTRY_BARRIER,
>> PPC_ATOMIC_EXIT_BARRIER, 0)
>>
>> +static inline unsigned long test_and_clear_bits(unsigned long mask, volatile unsigned long *_p)
>> +{
>> + unsigned long old, t;
>> + unsigned long *p = (unsigned long *)_p;
>> +
>> + if (IS_ENABLED(CONFIG_PPC32) &&
>> + __builtin_constant_p(mask) && is_rlwinm_mask_valid(~mask)) {
>> + asm volatile (
>> + PPC_ATOMIC_ENTRY_BARRIER
>> + "1:" "lwarx %0,0,%3\n"
>> + "rlwinm %1,%0,0,%2\n"
>> + "stwcx. %1,0,%3\n"
>> + "bne- 1b\n"
>> + PPC_ATOMIC_EXIT_BARRIER
>> + : "=&r" (old), "=&r" (t)
>> + : "n" (~mask), "r" (p)
>> + : "cc", "memory");
>> + } else {
>> + asm volatile (
>> + PPC_ATOMIC_ENTRY_BARRIER
>> + "1:" PPC_LLARX "%0,0,%3,0\n"
>> + "andc %1,%0,%2\n"
>> + PPC_STLCX "%1,0,%3\n"
>> + "bne- 1b\n"
>> + PPC_ATOMIC_EXIT_BARRIER
>> + : "=&r" (old), "=&r" (t)
>> + : "r" (mask), "r" (p)
>> + : "cc", "memory");
>> + }
>> +
>> + return (old & mask);
>> +}
>> +
>> static inline int arch_test_and_set_bit(unsigned long nr,
>> volatile unsigned long *addr)
>> {
>>