[PATCH 3/8] perf bench: add memset_movnti()

From: Ankur Arora
Date: Wed Oct 14 2020 - 04:33:26 EST


Clone memset_movnti() from arch/x86/lib/memset_64.S.

perf bench mem memset on -f x86-64-movnt on Intel Broadwellx, Skylakex
and AMD Rome:

Intel Broadwellx:
$ for i in 2 8 32 128 512; do
perf bench mem memset -f x86-64-movnt -s ${i}MB
done

# Output pruned.
# Running 'mem/memset' benchmark:
# function 'x86-64-movnt' (movnt-based memset() in arch/x86/lib/memset_64.S)
# Copying 2MB bytes ...
11.837121 GB/sec
# Copying 8MB bytes ...
11.783560 GB/sec
# Copying 32MB bytes ...
11.868591 GB/sec
# Copying 128MB bytes ...
11.865211 GB/sec
# Copying 512MB bytes ...
11.864085 GB/sec

Intel Skylakex:
$ for i in 2 8 32 128 512; do
perf bench mem memset -f x86-64-movnt -s ${i}MB
done
# Running 'mem/memset' benchmark:
# function 'x86-64-movnt' (movnt-based memset() in arch/x86/lib/memset_64.S)
# Copying 2MB bytes ...
6.361971 GB/sec
# Copying 8MB bytes ...
6.300403 GB/sec
# Copying 32MB bytes ...
6.288992 GB/sec
# Copying 128MB bytes ...
6.328793 GB/sec
# Copying 512MB bytes ...
6.324471 GB/sec

AMD Rome:
$ for i in 2 8 32 128 512; do
perf bench mem memset -f x86-64-movnt -s ${i}MB
done
# Running 'mem/memset' benchmark:
# function 'x86-64-movnt' (movnt-based memset() in arch/x86/lib/memset_64.S)
# Copying 2MB bytes ...
10.993199 GB/sec
# Copying 8MB bytes ...
14.221784 GB/sec
# Copying 32MB bytes ...
14.293337 GB/sec
# Copying 128MB bytes ...
15.238947 GB/sec
# Copying 512MB bytes ...
16.476093 GB/sec

Signed-off-by: Ankur Arora <ankur.a.arora@xxxxxxxxxx>
---
tools/arch/x86/lib/memset_64.S | 68 ++++++++++++++++------------
tools/perf/bench/mem-memset-x86-64-asm-def.h | 6 ++-
2 files changed, 43 insertions(+), 31 deletions(-)

diff --git a/tools/arch/x86/lib/memset_64.S b/tools/arch/x86/lib/memset_64.S
index fd5d25a474b7..bfbf6d06f81e 100644
--- a/tools/arch/x86/lib/memset_64.S
+++ b/tools/arch/x86/lib/memset_64.S
@@ -26,7 +26,7 @@ SYM_FUNC_START(__memset)
*
* Otherwise, use original memset function.
*/
- ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
+ ALTERNATIVE_2 "jmp memset_movq", "", X86_FEATURE_REP_GOOD, \
"jmp memset_erms", X86_FEATURE_ERMS

movq %rdi,%r9
@@ -65,7 +65,8 @@ SYM_FUNC_START(memset_erms)
ret
SYM_FUNC_END(memset_erms)

-SYM_FUNC_START(memset_orig)
+.macro MEMSET_MOV OP fence
+SYM_FUNC_START(memset_\OP)
movq %rdi,%r10

/* expand byte value */
@@ -76,64 +77,71 @@ SYM_FUNC_START(memset_orig)
/* align dst */
movl %edi,%r9d
andl $7,%r9d
- jnz .Lbad_alignment
-.Lafter_bad_alignment:
+ jnz .Lbad_alignment_\@
+.Lafter_bad_alignment_\@:

movq %rdx,%rcx
shrq $6,%rcx
- jz .Lhandle_tail
+ jz .Lhandle_tail_\@

.p2align 4
-.Lloop_64:
+.Lloop_64_\@:
decq %rcx
- movq %rax,(%rdi)
- movq %rax,8(%rdi)
- movq %rax,16(%rdi)
- movq %rax,24(%rdi)
- movq %rax,32(%rdi)
- movq %rax,40(%rdi)
- movq %rax,48(%rdi)
- movq %rax,56(%rdi)
+ \OP %rax,(%rdi)
+ \OP %rax,8(%rdi)
+ \OP %rax,16(%rdi)
+ \OP %rax,24(%rdi)
+ \OP %rax,32(%rdi)
+ \OP %rax,40(%rdi)
+ \OP %rax,48(%rdi)
+ \OP %rax,56(%rdi)
leaq 64(%rdi),%rdi
- jnz .Lloop_64
+ jnz .Lloop_64_\@

/* Handle tail in loops. The loops should be faster than hard
to predict jump tables. */
.p2align 4
-.Lhandle_tail:
+.Lhandle_tail_\@:
movl %edx,%ecx
andl $63&(~7),%ecx
- jz .Lhandle_7
+ jz .Lhandle_7_\@
shrl $3,%ecx
.p2align 4
-.Lloop_8:
+.Lloop_8_\@:
decl %ecx
- movq %rax,(%rdi)
+ \OP %rax,(%rdi)
leaq 8(%rdi),%rdi
- jnz .Lloop_8
+ jnz .Lloop_8_\@

-.Lhandle_7:
+.Lhandle_7_\@:
andl $7,%edx
- jz .Lende
+ jz .Lende_\@
.p2align 4
-.Lloop_1:
+.Lloop_1_\@:
decl %edx
movb %al,(%rdi)
leaq 1(%rdi),%rdi
- jnz .Lloop_1
+ jnz .Lloop_1_\@

-.Lende:
+.Lende_\@:
+ .if \fence
+ sfence
+ .endif
movq %r10,%rax
ret

-.Lbad_alignment:
+.Lbad_alignment_\@:
cmpq $7,%rdx
- jbe .Lhandle_7
+ jbe .Lhandle_7_\@
movq %rax,(%rdi) /* unaligned store */
movq $8,%r8
subq %r9,%r8
addq %r8,%rdi
subq %r8,%rdx
- jmp .Lafter_bad_alignment
-.Lfinal:
-SYM_FUNC_END(memset_orig)
+ jmp .Lafter_bad_alignment_\@
+.Lfinal_\@:
+SYM_FUNC_END(memset_\OP)
+.endm
+
+MEMSET_MOV OP=movq fence=0
+MEMSET_MOV OP=movnti fence=1
diff --git a/tools/perf/bench/mem-memset-x86-64-asm-def.h b/tools/perf/bench/mem-memset-x86-64-asm-def.h
index dac6d2b7c39b..53ead7f91313 100644
--- a/tools/perf/bench/mem-memset-x86-64-asm-def.h
+++ b/tools/perf/bench/mem-memset-x86-64-asm-def.h
@@ -1,6 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */

-MEMSET_FN(memset_orig,
+MEMSET_FN(memset_movq,
"x86-64-unrolled",
"unrolled memset() in arch/x86/lib/memset_64.S")

@@ -11,3 +11,7 @@ MEMSET_FN(__memset,
MEMSET_FN(memset_erms,
"x86-64-stosb",
"movsb-based memset() in arch/x86/lib/memset_64.S")
+
+MEMSET_FN(memset_movnti,
+ "x86-64-movnt",
+ "movnt-based memset() in arch/x86/lib/memset_64.S")
--
2.9.3