[RFC PATCH 08/15] Provide an implementation of bitops using C++11 atomics
From: David Howells
Date: Wed May 18 2016 - 11:11:45 EST
Provide an implementation of various atomic bit functions using the
__atomic_fetch_{and,or,xor}() C++11 atomics. This involves creating a mask
and adjusting the address in each function, since the bit number can be greater
than the number of bits in a word and can also be negative.
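Each function therefore boils down to the following pattern (condensed from
the functions added below; __atomic_fetch_or() stands in for whichever of
_or/_and/_xor a given function needs):

    unsigned long mask = 1UL << (bit & (BITS_PER_LONG - 1));

    /* The signed shift steps the word pointer backwards for negative bits */
    addr += bit >> _BITOPS_LONG_SHIFT;
    old = __atomic_fetch_or(addr, mask, memorder);
    return old & mask;    /* only the test_and_*() variants use the result */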
On some arches, such as those that use LL/SC type constructs or CMPXCHG,
AND/OR/XOR are normally used anyway so this is not a problem. On
powerpc64, for example:
    .test_and_set_bit:
            clrlwi  r10,r3,26
            rldicl  r3,r3,58,6
            rldicr  r3,r3,3,60
            li      r9,1
            sld     r9,r9,r10
            add     r4,r4,r3
            hwsync
    retry:  ldarx   r3,0,r4
            or      r10,r3,r9               <--- OR is used here
            stdcx.  r10,0,r4
            bne-    retry
            hwsync
            and     r9,r9,r3
            addic   r3,r9,-1
            subfe   r3,r3,r9
            blr
However, on some arches such as x86, there are bit wangling instructions
that take a bit number and may even correctly handle bit numbers outside
the range 0...BITS_PER_LONG-1. Even on x86, if the result isn't of
interest, it is apparently faster to use LOCKed AND/OR/XOR than to use
LOCKed BTR/BTS/BTC.
However, the current gcc implementation always uses a CMPXCHG loop when the
result is needed, rather than using BTR/BTS/BTC, though it will still use
AND/OR/XOR if it can:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=49244
With this fixed,
    bool foo_test_and_change_bit(unsigned long *p)
    {
            return test_and_change_bit(83, p);
    }

becomes:

    foo_test_and_change_bit:
            lock btcq $0x13,0x8(%rdi)
            setb   %al
            retq
There are three things to note here:
(1) The SETB instruction is generated by the compiler.

(2) Because the memory variable is a long, BTCQ is used - which uses an
    extra byte of opcode. We could use BTCL instead, as the inline asm
    does... except that this generates technically broken code, since the
    bit number is a *64-bit* number and BTCL only takes a 32-bit bit
    number. I'm not sure that is worth worrying about, however.

(3) The compiler divided the constant bit number between the immediate
    value of the BTCQ instruction and the displacement on the memory
    address. The immediate value is, however, an imm8, so this splitting
    should only be necessary for bit numbers outside the range -128..127,
    I think (see the breakdown below).
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=49244#c20
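For reference, the split for bit 83 works out as:

    83 = 1 * 64 + 19

so the compiler adds one 8-byte word to the address (the 0x8 displacement)
and passes the remaining 19 (0x13) as the immediate bit offset - which is
exactly what appears in the BTCQ instruction above.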
Next there's:
    const char *foo(unsigned long *p);

    const char *foo_test_and_change_bit_2(unsigned long *p)
    {
            return test_and_change_bit(83, p) ? foo(p) : "bar";
    }
becomes:
    foo_test_and_change_bit_2:
            lock btcq $0x13,0x8(%rdi)
            jae    .L5
            jmpq   foo
    .L5:    mov    $.LC0,%eax
            retq
with the compiler generating a conditional jump around a tail-call to
foo().
Next, we can use a variable as the bit number rather than a constant.
    bool foo_test_and_change_bit_3(unsigned long *p, long n)
    {
            return test_and_change_bit(n, p);
    }
becomes:
    foo_test_and_change_bit_3:
            mov    %rsi,%rax                }
            and    $0x3f,%esi               } Splitting the bit number
            sar    $0x6,%rax                }
            lock btc %rsi,(%rdi,%rax,8)
            setb   %al
            retq
As can be seen, the compiler again splits the bit number up unnecessarily,
since the CPU will do that itself (with a register bit offset, BTC indexes
the full bit string, so a single "lock btc %rsi,(%rdi)" should suffice).
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=49244#c13
Finally, if we ignore the result:
    void foo_test_and_change_bit_4(unsigned long *p)
    {
            test_and_change_bit(83, p);
    }

    void foo_test_and_change_bit_5(unsigned long *p, long n)
    {
            test_and_change_bit(n, p);
    }
the compiler will use XOR instead of BTC:
    foo_test_and_change_bit_4:
            lock xorq $0x80000,0x8(%rdi)
            retq

    foo_test_and_change_bit_5:
            mov    %rsi,%rdx
            mov    %esi,%ecx
            mov    $0x1,%eax
            sar    $0x6,%rdx
            shl    %cl,%rax
            lock xor %rax,(%rdi,%rdx,8)
            retq
Also, the compiler won't optimise __atomic_load[_n]() to either a TEST or a
BT instruction:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=49244#c13
For arm64, if compiled with -march=armv8-a+lse this will generate LDCLR,
LDSET and LDEOR instructions for __atomic_fetch_{and,or,xor}() with
appropriate {,A,L,AL} suffixes. However, because LDCLR is actually an
AND-NOT rather than an AND, the compiler negates the operand to undo the
negation done by the CPU. This, combined with the ~ operator used in the
code to generate the mask:
    static __always_inline
    bool iso_test_and_clear_bit(long bit, volatile unsigned long *addr,
                                int memorder)
    {
            unsigned long mask = 1UL << (bit & (BITS_PER_LONG - 1));
            unsigned long old;

            addr += bit >> _BITOPS_LONG_SHIFT;
    --->    old = __atomic_fetch_and(addr, ~mask, memorder);
            return old & mask;
    }
means that the mask gets effectively negated three times:
    foo_clear_bit_unlock:
            and     w3, w0, #0x3f
            asr     x2, x0, #6
            mov     x0, #0x1
            add     x1, x1, x2, lsl #3
            lsl     x0, x0, x3
            mvn     x0, x0                  <--- Negate from my code
            mvn     x2, x0                  <--- Negate from atomic
            ldclrl  x2, x2, [x1]            <--- Negate from CPU
            ret
including two redundant instructions.
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71153
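For completeness, the familiar API maps onto the new memorder-taking helpers
like this (a usage sketch only, condensed from the macros in the patch; how
an arch would actually pull the header in is not shown here):

    #include <asm-generic/iso-bitops.h>

    static unsigned long flags[2];

    static void example(void)
    {
            /* set_bit() expands to iso_set_bit(..., __ATOMIC_ACQ_REL) */
            set_bit(83, flags);

            /* test_and_set_bit_lock() uses __ATOMIC_ACQUIRE */
            if (!test_and_set_bit_lock(0, flags)) {
                    /* ...critical section... */

                    /* clear_bit_unlock() uses __ATOMIC_RELEASE */
                    clear_bit_unlock(0, flags);
            }
    }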
Signed-off-by: David Howells <dhowells@xxxxxxxxxx>
---
include/asm-generic/iso-bitops.h | 181 ++++++++++++++++++++++++++++++++++++++
1 file changed, 181 insertions(+)
create mode 100644 include/asm-generic/iso-bitops.h
diff --git a/include/asm-generic/iso-bitops.h b/include/asm-generic/iso-bitops.h
new file mode 100644
index 000000000000..64d5067e3a67
--- /dev/null
+++ b/include/asm-generic/iso-bitops.h
@@ -0,0 +1,181 @@
+/* Use ISO C++11 intrinsics to implement bitops.
+ *
+ * Copyright (C) 2016 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@xxxxxxxxxx)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#ifndef _ASM_GENERIC_ISO_BITOPS_H
+#define _ASM_GENERIC_ISO_BITOPS_H
+
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+static __always_inline
+bool test_bit(long bit, const volatile unsigned long *addr)
+{
+ unsigned long mask = 1UL << (bit & (BITS_PER_LONG - 1));
+ unsigned long old;
+
+ addr += bit >> _BITOPS_LONG_SHIFT;
+ old = __atomic_load_n(addr, __ATOMIC_RELAXED);
+ return old & mask;
+}
+
+/**
+ * set_bit - Atomically set a bit in memory
+ * @bit: the bit to set
+ * @addr: the address to start counting from
+ *
+ * This function is atomic and may not be reordered. See __set_bit() if you do
+ * not require the atomic guarantees.
+ *
+ * Note: there are no guarantees that this function will not be reordered on
+ * non-x86 architectures, so if you are writing portable code, make sure not to
+ * rely on its reordering guarantees.
+ *
+ * Note that @bit may be almost arbitrarily large; this function is not
+ * restricted to acting on a single-word quantity.
+ */
+static __always_inline
+void iso_set_bit(long bit, volatile unsigned long *addr, int memorder)
+{
+ unsigned long mask = 1UL << (bit & (BITS_PER_LONG - 1));
+
+ addr += bit >> _BITOPS_LONG_SHIFT;
+ __atomic_fetch_or(addr, mask, memorder);
+}
+
+#define set_bit(b, a) iso_set_bit((b), (a), __ATOMIC_ACQ_REL)
+
+/**
+ * set_bit_unlock - Sets a bit in memory with release semantics
+ * @bit: Bit to set
+ * @addr: Address to start counting from
+ *
+ * This function is atomic and implies release semantics before the memory
+ * operation. It can be used for an unlock.
+ */
+#define set_bit_unlock(b, a) iso_set_bit((b), (a), __ATOMIC_RELEASE)
+
+/**
+ * clear_bit - Clears a bit in memory
+ * @bit: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * clear_bit() is atomic and may not be reordered. However, it does not
+ * contain a memory barrier, so if it is used for locking purposes, you should
+ * call smp_mb__before_atomic() and/or smp_mb__after_atomic() in order to
+ * ensure changes are visible on other processors.
+ */
+static __always_inline
+void iso_clear_bit(long bit, volatile unsigned long *addr, int memorder)
+{
+ unsigned long mask = 1UL << (bit & (BITS_PER_LONG - 1));
+
+ addr += bit >> _BITOPS_LONG_SHIFT;
+ __atomic_fetch_and(addr, ~mask, memorder);
+}
+
+#define clear_bit(b, a) iso_clear_bit((b), (a), __ATOMIC_ACQ_REL)
+
+/**
+ * clear_bit_unlock - Clears a bit in memory with release semantics
+ * @bit: Bit to clear
+ * @addr: Address to start counting from
+ *
+ * This function is atomic and implies release semantics before the memory
+ * operation. It can be used for an unlock.
+ */
+#define clear_bit_unlock(b, a) iso_clear_bit((b), (a), __ATOMIC_RELEASE)
+
+/**
+ * change_bit - Toggle a bit in memory
+ * @bit: Bit to change
+ * @addr: Address to start counting from
+ *
+ * change_bit() is atomic and may not be reordered. Note that @bit may be
+ * almost arbitrarily large; this function is not restricted to acting on a
+ * single-word quantity.
+ */
+static __always_inline
+void iso_change_bit(long bit, volatile unsigned long *addr, int memorder)
+{
+ unsigned long mask = 1UL << (bit & (BITS_PER_LONG - 1));
+
+ addr += bit >> _BITOPS_LONG_SHIFT;
+ __atomic_fetch_xor(addr, mask, memorder);
+}
+
+#define change_bit(b, a) iso_change_bit((b), (a), __ATOMIC_ACQ_REL)
+
+/**
+ * test_and_set_bit - Set a bit and return its old value
+ * @bit: Bit to set
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered. It also implies a
+ * memory barrier on both sides.
+ */
+static __always_inline
+bool iso_test_and_set_bit(long bit, volatile unsigned long *addr, int memorder)
+{
+ unsigned long mask = 1UL << (bit & (BITS_PER_LONG - 1));
+ unsigned long old;
+
+ addr += bit >> _BITOPS_LONG_SHIFT;
+ old = __atomic_fetch_or(addr, mask, memorder);
+ return old & mask;
+}
+
+#define test_and_set_bit(b, a) iso_test_and_set_bit((b), (a), __ATOMIC_ACQ_REL)
+#define test_and_set_bit_lock(b, a) iso_test_and_set_bit((b), (a), __ATOMIC_ACQUIRE)
+
+/**
+ * test_and_clear_bit - Clear a bit and return its old value
+ * @bit: Bit to clear
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered. It also implies a
+ * memory barrier on both sides.
+ */
+static __always_inline
+bool iso_test_and_clear_bit(long bit, volatile unsigned long *addr, int memorder)
+{
+ unsigned long mask = 1UL << (bit & (BITS_PER_LONG - 1));
+ unsigned long old;
+
+ addr += bit >> _BITOPS_LONG_SHIFT;
+ old = __atomic_fetch_and(addr, ~mask, memorder);
+ return old & mask;
+}
+
+#define test_and_clear_bit(b, a) iso_test_and_clear_bit((b), (a), __ATOMIC_ACQ_REL)
+#define test_and_clear_bit_lock(b, a) iso_test_and_clear_bit((b), (a), __ATOMIC_ACQUIRE)
+
+/**
+ * test_and_change_bit - Toggle a bit and return its old value
+ * @bit: Bit to toggle
+ * @addr: Address to count from
+ *
+ * This operation is atomic and cannot be reordered. It also implies a
+ * memory barrier on both sides.
+ */
+static __always_inline
+bool iso_test_and_change_bit(long bit, volatile unsigned long *addr, int memorder)
+{
+ unsigned long mask = 1UL << (bit & (BITS_PER_LONG - 1));
+ unsigned long old;
+
+ addr += bit >> _BITOPS_LONG_SHIFT;
+ old = __atomic_fetch_xor(addr, mask, memorder);
+ return old & mask;
+}
+
+#define test_and_change_bit(b, a) iso_test_and_change_bit((b), (a), __ATOMIC_ACQ_REL)
+
+#endif /* _ASM_GENERIC_ISO_BITOPS_H */