[PATCH] x86/asm: Use CC_SET/CC_OUT in percpu_cmpxchg16b_double

From: Uros Bizjak
Date: Fri Dec 28 2018 - 10:38:19 EST


Use CC_SET(z)/CC_OUT(z) instead of explicit setz instruction.
With these two defines, a compiler that supports condition-code outputs
from inline assembly flags generates a single conditional jump
instruction, e.g.:

[lea (%rdi),%rsi
callq this_cpu_cmpxchg16b_emu]
-- or --
[cmpxchg16b %gs:(%rdi)]
jne 199764 <kmem_cache_alloc+0x44>

instead of

[lea (%rdi),%rsi
callq this_cpu_cmpxchg16b_emu]
-- or --
[cmpxchg16b %gs:(%rdi)]
sete %cl
test %cl,%cl
je 19ae04 <kmem_cache_alloc+0x44>

The complication with percpu_cmpxchg16b_double is that its definition
defaults to a call to the this_cpu_cmpxchg16b_emu library function, which
(depending on the X86_FEATURE_CX16 flag) is later patched with the real
cmpxchg16b instruction. To solve this complication, the patch changes the
this_cpu_cmpxchg16b_emu library function to return its result in the ZF flag
of the %rflags register instead of in the %al register. Please also note that
instead of a popf instruction (which restores the flags register to a
previously saved state), the patched function uses sti followed by a nop,
which ends the one-instruction interrupt inhibition of sti early.

The patch also introduces the alternative_io_tail definition. This definition
can take a tail instruction that is common to all alternatives. By using this
definition, it is possible to remove the setz from the cmpxchg16b alternatives
stored in the .altinstr_replacement section, thus saving a few bytes in the
binary.

Signed-off-by: Uros Bizjak <ubizjak@xxxxxxxxx>
Cc: x86@xxxxxxxxxx
---
arch/x86/include/asm/alternative.h | 6 ++++++
arch/x86/include/asm/percpu.h | 14 ++++++++------
arch/x86/lib/cmpxchg16b_emu.S | 16 ++++++----------
3 files changed, 20 insertions(+), 16 deletions(-)

diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 0660e14690c8..49b29990950a 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -205,6 +205,12 @@ static inline int alternatives_text_reserved(void *start, void *end)
asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
: output : "i" (0), ## input)

+/* Like alternative_io, but with a common tail instruction. */
+#define alternative_io_tail(oldinstr, newinstr, feature, tail, \
+ output, input...) \
+ asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) tail \
+ : output : "i" (0), ## input)
+
/* Like alternative_io, but for replacing a direct call with another one. */
#define alternative_call(oldfunc, newfunc, feature, output, input...) \
asm volatile (ALTERNATIVE("call %P[old]", "call %P[new]", feature) \
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 1a19d11cfbbd..9cf2e78eb4b3 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -495,12 +495,14 @@ do { \
bool __ret; \
typeof(pcp1) __o1 = (o1), __n1 = (n1); \
typeof(pcp2) __o2 = (o2), __n2 = (n2); \
- alternative_io("leaq %P1,%%rsi\n\tcall this_cpu_cmpxchg16b_emu\n\t", \
- "cmpxchg16b " __percpu_arg(1) "\n\tsetz %0\n\t", \
- X86_FEATURE_CX16, \
- ASM_OUTPUT2("=a" (__ret), "+m" (pcp1), \
- "+m" (pcp2), "+d" (__o2)), \
- "b" (__n1), "c" (__n2), "a" (__o1) : "rsi"); \
+ alternative_io_tail("leaq %P1,%%rsi\n\tcall this_cpu_cmpxchg16b_emu", \
+ "cmpxchg16b "__percpu_arg(1), \
+ X86_FEATURE_CX16, \
+ CC_SET(z), \
+ ASM_OUTPUT2(CC_OUT(z) (__ret), \
+ "+m" (pcp1), "+m" (pcp2), \
+ "+a" (__o1), "+d" (__o2)), \
+ "b" (__n1), "c" (__n2) : "rsi"); \
__ret; \
})

diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S
index 9b330242e740..d8c1ae48e0d9 100644
--- a/arch/x86/lib/cmpxchg16b_emu.S
+++ b/arch/x86/lib/cmpxchg16b_emu.S
@@ -17,20 +17,20 @@
* %rdx : high 64 bits of old value
* %rbx : low 64 bits of new value
* %rcx : high 64 bits of new value
- * %al : Operation successful
+ *
+ * Outputs:
+ * %rflags.zf: set if the destination operand and %rdx:%rax are equal
*/
ENTRY(this_cpu_cmpxchg16b_emu)

#
-# Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in %al not
-# via the ZF. Caller will access %al to get result.
+# Emulate 'cmpxchg16b %gs:(%rsi)'
#
# Note that this is only useful for a cpuops operation. Meaning that we
# do *not* have a fully atomic operation but just an operation that is
# *atomic* on a single cpu (as provided by the this_cpu_xx class of
# macros).
#
- pushfq
cli

cmpq PER_CPU_VAR((%rsi)), %rax
@@ -41,13 +41,9 @@ ENTRY(this_cpu_cmpxchg16b_emu)
movq %rbx, PER_CPU_VAR((%rsi))
movq %rcx, PER_CPU_VAR(8(%rsi))

- popfq
- mov $1, %al
- ret
-
.Lnot_same:
- popfq
- xor %al,%al
+ sti
+ nop
ret

ENDPROC(this_cpu_cmpxchg16b_emu)
--
2.20.1