[tip: x86/asm] x86/percpu: Define {raw,this}_cpu_try_cmpxchg{64,128}

From: tip-bot2 for Uros Bizjak
Date: Fri Sep 15 2023 - 07:25:25 EST


The following commit has been merged into the x86/asm branch of tip:

Commit-ID: 54cd971c6f4461fb6b178579751788bf4f64dfca
Gitweb: https://git.kernel.org/tip/54cd971c6f4461fb6b178579751788bf4f64dfca
Author: Uros Bizjak <ubizjak@xxxxxxxxx>
AuthorDate: Wed, 06 Sep 2023 20:58:44 +02:00
Committer: Ingo Molnar <mingo@xxxxxxxxxx>
CommitterDate: Fri, 15 Sep 2023 13:16:35 +02:00

x86/percpu: Define {raw,this}_cpu_try_cmpxchg{64,128}

Define target-specific {raw,this}_cpu_try_cmpxchg64() and
{raw,this}_cpu_try_cmpxchg128() macros. These definitions override
the generic fallback definitions and enable target-specific
optimized implementations.

Several places in mm/slub.o improve from e.g.:

53bc: 48 8d 4f 40 lea 0x40(%rdi),%rcx
53c0: 48 89 fa mov %rdi,%rdx
53c3: 49 8b 5c 05 00 mov 0x0(%r13,%rax,1),%rbx
53c8: 4c 89 e8 mov %r13,%rax
53cb: 49 8d 30 lea (%r8),%rsi
53ce: e8 00 00 00 00 call 53d3 <...>
53cf: R_X86_64_PLT32 this_cpu_cmpxchg16b_emu-0x4
53d3: 48 31 d7 xor %rdx,%rdi
53d6: 4c 31 e8 xor %r13,%rax
53d9: 48 09 c7 or %rax,%rdi
53dc: 75 ae jne 538c <...>

to:

53bc: 48 8d 4a 40 lea 0x40(%rdx),%rcx
53c0: 49 8b 1c 07 mov (%r15,%rax,1),%rbx
53c4: 4c 89 f8 mov %r15,%rax
53c7: 48 8d 37 lea (%rdi),%rsi
53ca: e8 00 00 00 00 call 53cf <...>
53cb: R_X86_64_PLT32 this_cpu_cmpxchg16b_emu-0x4
53cf: 75 bb jne 538c <...>

reducing the size of mm/slub.o by 80 bytes:

text data bss dec hex filename
39758 5337 4208 49303 c097 slub-new.o
39838 5337 4208 49383 c0e7 slub-old.o

Signed-off-by: Uros Bizjak <ubizjak@xxxxxxxxx>
Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Link: https://lore.kernel.org/r/20230906185941.53527-1-ubizjak@xxxxxxxxx
---
arch/x86/include/asm/percpu.h | 67 ++++++++++++++++++++++++++++++++++-
1 file changed, 67 insertions(+)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 34734d7..4c36419 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -237,12 +237,47 @@ do { \

#define raw_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg64_op(8, , pcp, oval, nval)
#define this_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg64_op(8, volatile, pcp, oval, nval)
+
+#define percpu_try_cmpxchg64_op(size, qual, _var, _ovalp, _nval) \
+({ \
+ bool success; \
+ u64 *_oval = (u64 *)(_ovalp); \
+ union { \
+ u64 var; \
+ struct { \
+ u32 low, high; \
+ }; \
+ } old__, new__; \
+ \
+ old__.var = *_oval; \
+ new__.var = _nval; \
+ \
+ asm qual (ALTERNATIVE("leal %P[var], %%esi; call this_cpu_cmpxchg8b_emu", \
+ "cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \
+ CC_SET(z) \
+ : CC_OUT(z) (success), \
+ [var] "+m" (_var), \
+ "+a" (old__.low), \
+ "+d" (old__.high) \
+ : "b" (new__.low), \
+ "c" (new__.high) \
+ : "memory", "esi"); \
+ if (unlikely(!success)) \
+ *_oval = old__.var; \
+ likely(success); \
+})
+
+#define raw_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg64_op(8, , pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg64_op(8, volatile, pcp, ovalp, nval)
#endif

#ifdef CONFIG_X86_64
#define raw_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval);
#define this_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval);

+#define raw_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, , pcp, ovalp, nval);
+#define this_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval);
+
#define percpu_cmpxchg128_op(size, qual, _var, _oval, _nval) \
({ \
union { \
@@ -269,6 +304,38 @@ do { \

#define raw_cpu_cmpxchg128(pcp, oval, nval) percpu_cmpxchg128_op(16, , pcp, oval, nval)
#define this_cpu_cmpxchg128(pcp, oval, nval) percpu_cmpxchg128_op(16, volatile, pcp, oval, nval)
+
+#define percpu_try_cmpxchg128_op(size, qual, _var, _ovalp, _nval) \
+({ \
+ bool success; \
+ u128 *_oval = (u128 *)(_ovalp); \
+ union { \
+ u128 var; \
+ struct { \
+ u64 low, high; \
+ }; \
+ } old__, new__; \
+ \
+ old__.var = *_oval; \
+ new__.var = _nval; \
+ \
+ asm qual (ALTERNATIVE("leaq %P[var], %%rsi; call this_cpu_cmpxchg16b_emu", \
+ "cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \
+ CC_SET(z) \
+ : CC_OUT(z) (success), \
+ [var] "+m" (_var), \
+ "+a" (old__.low), \
+ "+d" (old__.high) \
+ : "b" (new__.low), \
+ "c" (new__.high) \
+ : "memory", "rsi"); \
+ if (unlikely(!success)) \
+ *_oval = old__.var; \
+ likely(success); \
+})
+
+#define raw_cpu_try_cmpxchg128(pcp, ovalp, nval) percpu_try_cmpxchg128_op(16, , pcp, ovalp, nval)
+#define this_cpu_try_cmpxchg128(pcp, ovalp, nval) percpu_try_cmpxchg128_op(16, volatile, pcp, ovalp, nval)
#endif

/*