[PATCH v3 07/21] x86/asm: add memset_movnti()

From: Ankur Arora
Date: Mon Jun 06 2022 - 16:48:20 EST

Next message: Matthew Wilcox (Oracle): "[PATCH 01/20] fs: Add aops->migrate_folio"
Previous message: Matthew Wilcox (Oracle): "[PATCH 04/20] mm/migrate: Convert buffer_migrate_page() to buffer_migrate_folio()"
In reply to: Ankur Arora: "[PATCH v3 19/21] gup: hint non-caching if clearing large regions"
Next in thread: Ankur Arora: "[PATCH v3 05/21] mm/huge_page: generalize process_huge_page()"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

Add a MOVNTI-based, non-caching implementation of memset().

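memset_movnti() only needs to differ from memset_orig() in the opcode
used for the 8-byte stores in the 64-byte and 8-byte loops, plus a
trailing SFENCE that is emitted only for the MOVNTI variant. The
unaligned head store and the sub-8-byte tail keep using regular
MOVQ/MOVB stores. So move the memset_orig() logic into a macro, and use
that to generate memset_movq() (the renamed memset_orig()) and
memset_movnti().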

Signed-off-by: Ankur Arora <ankur.a.arora@xxxxxxxxxx>
---
arch/x86/lib/memset_64.S | 68 ++++++++++++++++++++++------------------
1 file changed, 38 insertions(+), 30 deletions(-)

diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index fc9ffd3ff3b2..307b753ca03a 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -24,7 +24,7 @@ SYM_FUNC_START(__memset)
*
* Otherwise, use original memset function.
*/
- ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
+ ALTERNATIVE_2 "jmp memset_movq", "", X86_FEATURE_REP_GOOD, \
"jmp memset_erms", X86_FEATURE_ERMS

movq %rdi,%r9
@@ -66,7 +66,8 @@ SYM_FUNC_START_LOCAL(memset_erms)
RET
SYM_FUNC_END(memset_erms)

-SYM_FUNC_START_LOCAL(memset_orig)
+.macro MEMSET_MOV OP fence
+SYM_FUNC_START_LOCAL(memset_\OP)
movq %rdi,%r10

/* expand byte value */
@@ -77,64 +78,71 @@ SYM_FUNC_START_LOCAL(memset_orig)
/* align dst */
movl %edi,%r9d
andl $7,%r9d
- jnz .Lbad_alignment
-.Lafter_bad_alignment:
+ jnz .Lbad_alignment_\@
+.Lafter_bad_alignment_\@:

movq %rdx,%rcx
shrq $6,%rcx
- jz .Lhandle_tail
+ jz .Lhandle_tail_\@

.p2align 4
-.Lloop_64:
+.Lloop_64_\@:
decq %rcx
- movq %rax,(%rdi)
- movq %rax,8(%rdi)
- movq %rax,16(%rdi)
- movq %rax,24(%rdi)
- movq %rax,32(%rdi)
- movq %rax,40(%rdi)
- movq %rax,48(%rdi)
- movq %rax,56(%rdi)
+ \OP %rax,(%rdi)
+ \OP %rax,8(%rdi)
+ \OP %rax,16(%rdi)
+ \OP %rax,24(%rdi)
+ \OP %rax,32(%rdi)
+ \OP %rax,40(%rdi)
+ \OP %rax,48(%rdi)
+ \OP %rax,56(%rdi)
leaq 64(%rdi),%rdi
- jnz .Lloop_64
+ jnz .Lloop_64_\@

/* Handle tail in loops. The loops should be faster than hard
to predict jump tables. */
.p2align 4
-.Lhandle_tail:
+.Lhandle_tail_\@:
movl %edx,%ecx
andl $63&(~7),%ecx
- jz .Lhandle_7
+ jz .Lhandle_7_\@
shrl $3,%ecx
.p2align 4
-.Lloop_8:
+.Lloop_8_\@:
decl %ecx
- movq %rax,(%rdi)
+ \OP %rax,(%rdi)
leaq 8(%rdi),%rdi
- jnz .Lloop_8
+ jnz .Lloop_8_\@

-.Lhandle_7:
+.Lhandle_7_\@:
andl $7,%edx
- jz .Lende
+ jz .Lende_\@
.p2align 4
-.Lloop_1:
+.Lloop_1_\@:
decl %edx
movb %al,(%rdi)
leaq 1(%rdi),%rdi
- jnz .Lloop_1
+ jnz .Lloop_1_\@

-.Lende:
+.Lende_\@:
+ .if \fence
+ sfence
+ .endif
movq %r10,%rax
RET

-.Lbad_alignment:
+.Lbad_alignment_\@:
cmpq $7,%rdx
- jbe .Lhandle_7
+ jbe .Lhandle_7_\@
movq %rax,(%rdi) /* unaligned store */
movq $8,%r8
subq %r9,%r8
addq %r8,%rdi
subq %r8,%rdx
- jmp .Lafter_bad_alignment
-.Lfinal:
-SYM_FUNC_END(memset_orig)
+ jmp .Lafter_bad_alignment_\@
+.Lfinal_\@:
+SYM_FUNC_END(memset_\OP)
+.endm
+
+MEMSET_MOV OP=movq fence=0
+MEMSET_MOV OP=movnti fence=1
--
2.31.1

Next message: Matthew Wilcox (Oracle): "[PATCH 01/20] fs: Add aops->migrate_folio"
Previous message: Matthew Wilcox (Oracle): "[PATCH 04/20] mm/migrate: Convert buffer_migrate_page() to buffer_migrate_folio()"
In reply to: Ankur Arora: "[PATCH v3 19/21] gup: hint non-caching if clearing large regions"
Next in thread: Ankur Arora: "[PATCH v3 05/21] mm/huge_page: generalize process_huge_page()"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]