[PATCH 5/5] -march=native: MOVBE support

From: Alexey Dobriyan
Date: Thu Dec 07 2017 - 17:42:15 EST


Use MOVBE if it is available.

This doesn't save code size, as MOVBE seems to be as long as MOV+BSWAP.
It is not clear whether it saves any uops; maybe it will in the future.

Do it because it is easy, I guess.
---
arch/x86/crypto/des3_ede-asm_64.S | 28 ++++++++++++++++++++++++++++
arch/x86/net/bpf_jit.S | 12 ++++++++++++
scripts/kconfig/cpuid.c | 4 ++++
scripts/march-native.sh | 3 ++-
4 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/arch/x86/crypto/des3_ede-asm_64.S b/arch/x86/crypto/des3_ede-asm_64.S
index 8e49ce117494..007319ea1f62 100644
--- a/arch/x86/crypto/des3_ede-asm_64.S
+++ b/arch/x86/crypto/des3_ede-asm_64.S
@@ -159,6 +159,15 @@

#define dummy2(a, b) /*_*/

+#ifdef CONFIG_MARCH_NATIVE_MOVBE
+#define read_block(io, left, right) \
+ movbe (io), left##d; \
+ movbe 4(io), right##d;
+
+#define write_block(io, left, right) \
+ movbe left##d, (io); \
+ movbe right##d, 4(io);
+#else
#define read_block(io, left, right) \
movl (io), left##d; \
movl 4(io), right##d; \
@@ -170,6 +179,7 @@
bswapl right##d; \
movl left##d, (io); \
movl right##d, 4(io);
+#endif

ENTRY(des3_ede_x86_64_crypt_blk)
/* input:
@@ -443,6 +453,14 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
pushq %rsi /* dst */

/* load input */
+#ifdef CONFIG_MARCH_NATIVE_MOVBE
+ movbe 0 * 4(%rdx), RL0d;
+ movbe 1 * 4(%rdx), RR0d;
+ movbe 2 * 4(%rdx), RL1d;
+ movbe 3 * 4(%rdx), RR1d;
+ movbe 4 * 4(%rdx), RL2d;
+ movbe 5 * 4(%rdx), RR2d;
+#else
movl 0 * 4(%rdx), RL0d;
movl 1 * 4(%rdx), RR0d;
movl 2 * 4(%rdx), RL1d;
@@ -456,6 +474,7 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
bswapl RR1d;
bswapl RL2d;
bswapl RR2d;
+#endif

initial_permutation3(RL, RR);

@@ -516,6 +535,14 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)

final_permutation3(RR, RL);

+#ifdef CONFIG_MARCH_NATIVE_MOVBE
+ movbe RR0d, 0 * 4(%rsi);
+ movbe RL0d, 1 * 4(%rsi);
+ movbe RR1d, 2 * 4(%rsi);
+ movbe RL1d, 3 * 4(%rsi);
+ movbe RR2d, 4 * 4(%rsi);
+ movbe RL2d, 5 * 4(%rsi);
+#else
bswapl RR0d;
bswapl RL0d;
bswapl RR1d;
@@ -530,6 +557,7 @@ ENTRY(des3_ede_x86_64_crypt_blk_3way)
movl RL1d, 3 * 4(%rsi);
movl RR2d, 4 * 4(%rsi);
movl RL2d, 5 * 4(%rsi);
+#endif

popq %r15;
popq %r14;
diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S
index b33093f84528..17fe33750298 100644
--- a/arch/x86/net/bpf_jit.S
+++ b/arch/x86/net/bpf_jit.S
@@ -34,8 +34,12 @@ FUNC(sk_load_word_positive_offset)
sub %esi,%eax # hlen - offset
cmp $3,%eax
jle bpf_slow_path_word
+#ifdef CONFIG_MARCH_NATIVE_MOVBE
+ movbe (SKBDATA,%rsi),%eax
+#else
mov (SKBDATA,%rsi),%eax
bswap %eax /* ntohl() */
+#endif
ret

FUNC(sk_load_half)
@@ -80,8 +84,12 @@ FUNC(sk_load_byte_positive_offset)
bpf_slow_path_word:
bpf_slow_path_common(4)
js bpf_error
+#ifdef CONFIG_MARCH_NATIVE_MOVBE
+ movbe 32(%rbp),%eax
+#else
mov 32(%rbp),%eax
bswap %eax
+#endif
ret

bpf_slow_path_half:
@@ -118,8 +126,12 @@ bpf_slow_path_word_neg:

FUNC(sk_load_word_negative_offset)
sk_negative_common(4)
+#ifdef CONFIG_MARCH_NATIVE_MOVBE
+ movbe (%rax), %eax
+#else
mov (%rax), %eax
bswap %eax
+#endif
ret

bpf_slow_path_half_neg:
diff --git a/scripts/kconfig/cpuid.c b/scripts/kconfig/cpuid.c
index ecb285183581..2c23c8699ae6 100644
--- a/scripts/kconfig/cpuid.c
+++ b/scripts/kconfig/cpuid.c
@@ -42,6 +42,7 @@ static inline void cpuid2(uint32_t eax0, uint32_t ecx0, uint32_t *eax, uint32_t
);
}

+static bool movbe = false;
static bool popcnt = false;
static bool rep_movsb = false;
static bool rep_stosb = false;
@@ -56,6 +57,8 @@ static void intel(void)
cpuid(1, &eax, &ecx, &edx, &ebx);
// printf("%08x %08x %08x %08x\n", eax, ecx, edx, ebx);

+ if (ecx & (1 << 22))
+ movbe = true;
if (ecx & (1 << 23))
popcnt = true;
}
@@ -86,6 +89,7 @@ int main(int argc, char *argv[])
intel();

#define _(x) if (streq(opt, #x)) return x ? EXIT_SUCCESS : EXIT_FAILURE
+ _(movbe);
_(popcnt);
_(rep_movsb);
_(rep_stosb);
diff --git a/scripts/march-native.sh b/scripts/march-native.sh
index d3adf0edb2be..93f6a9bd4a6c 100755
--- a/scripts/march-native.sh
+++ b/scripts/march-native.sh
@@ -30,6 +30,7 @@ option() {
}

if test -x "$CPUID"; then
+ "$CPUID" movbe && option "CONFIG_MARCH_NATIVE_MOVBE"
"$CPUID" popcnt && option "CONFIG_MARCH_NATIVE_POPCNT"
"$CPUID" rep_movsb && option "CONFIG_MARCH_NATIVE_REP_MOVSB"
"$CPUID" rep_stosb && option "CONFIG_MARCH_NATIVE_REP_STOSB"
@@ -76,7 +77,7 @@ for i in $COLLECT_GCC_OPTIONS; do
-mhle) option "CONFIG_MARCH_NATIVE_HLE" ;;
-mlzcnt) option "CONFIG_MARCH_NATIVE_LZCNT" ;;
-mmmx) option "CONFIG_MARCH_NATIVE_MMX" ;;
- -mmovbe) option "CONFIG_MARCH_NATIVE_MOVBE" ;;
+ -mmovbe);;
-mpclmul) option "CONFIG_MARCH_NATIVE_PCLMUL" ;;
-mpopcnt);;
-mprfchw) option "CONFIG_MARCH_NATIVE_PREFETCHW" ;;
--
2.13.6