Re: Question Regarding ERMS memcpy

From: Borislav Petkov
Date: Sun Mar 05 2017 - 06:18:46 EST


On Sun, Mar 05, 2017 at 10:50:59AM +0100, Borislav Petkov wrote:
> On Sat, Mar 04, 2017 at 04:56:38PM -0800, hpa@xxxxxxxxx wrote:
> > That's what the -march= and -mtune= option do!
>
> How does that even help with a distro kernel built with -mtune=generic?
>
> gcc can't possibly know what targets that kernel is going to be booted
> on. So it probably does some universally optimal things, like in the
> dmi_scan_machine() case:
>
> memcpy_fromio(buf, p, 32);
>
> turns into:
>
> .loc 3 219 0
> movl $8, %ecx #, tmp79
> movq %rax, %rsi # p, p
> movq %rsp, %rdi #, tmp77
> rep movsl
>
> Apparently it thinks it is fine to do 8*4-byte MOVS. But why not
> 4*8-byte MOVS?
>
> That's half the loops.
>
> [ What the machine actually does underneath is a whole different story. The
> copy being only half a cacheline probably doesn't help and it really does
> the separate MOVs - but then it would be cheaper if it did 4 8-byte ones. ]
>
> One thing's for sure - both variants are certainly cheaper than a CALL
> to a memcpy variant.
>
> What we probably should try to do, though, is simply patch the body of
> REP; MOVSQ or REP; MOVSB into the call sites and only have a call to
> memcpy_orig(), because that last one is fat.
>
> I remember we did talk about it at some point but don't remember why we
> didn't do it.
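
(To make the quoted 4*8-byte variant concrete - hand-rolled illustration,
not actual gcc output:

	movl	$4, %ecx	# 4 iterations instead of 8
	movq	%rax, %rsi
	movq	%rsp, %rdi
	rep movsq		# 8 bytes per iteration

i.e. the same setup, half the iterations.)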

Right, and I believe it was Linus who mentioned we should simply patch
the small REP_GOOD and ERMS memcpy bodies into the call sites. CCed.

Anyway, something like below.

It almost builds here - I still need to figure out
arch/x86/boot/compressed/misc.c: it pulls in the decompression logic,
which does memcpy, and that memcpy now turns into the alternatives
version, which we don't want in arch/x86/boot/compressed/.
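
Maybe the way out is to let the decompressor opt out of the inline
altogether - completely untested sketch, and the _BOOT_COMPRESSED_MEMCPY
guard name is made up (it would have to be set by the
arch/x86/boot/compressed/ Makefile):

	/* arch/x86/include/asm/string_64.h - sketch only */
	#ifdef _BOOT_COMPRESSED_MEMCPY
	/* No alternatives patching in the decompressor - use the dumb loop. */
	static inline void *memcpy(void *to, const void *from, size_t len)
	{
		return __inline_memcpy(to, from, len);
	}
	#else
	/* the alternative_io_2() version from the diff below goes here */
	#endif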

Also, I need to check how much vmlinuz size bloat we're talking about:
with the diff below, the original "call memcpy_orig" gets padded with
NOPs up to the length of the longest replacement, which looks like this:

3d: 90 nop
3e: 90 nop
3f: 90 nop
40: 90 nop
41: 90 nop
42: 90 nop
43: 90 nop
44: 90 nop
45: 90 nop
46: 90 nop
47: 90 nop
48: 90 nop
49: 90 nop
4a: 90 nop
4b: 90 nop
4c: 90 nop
4d: 90 nop
4e: 90 nop
4f: 90 nop

and that at every call site.

But at least we save ourselves the call overhead and use the optimal
memcpy variant on each machine.
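
IOW, what every call site then amounts to - illustration only, caller
name made up, not actual compiler output:

	/* Some random 64-bit kernel code calling memcpy() as usual. */
	static void copy_hdr(void *dst, const void *src)
	{
		/*
		 * With the diff below this inlines the alternative:
		 *   build-time default:   call memcpy_orig, padded with NOPs
		 *   X86_FEATURE_REP_GOOD: REP; MOVSQ body, patched in at boot
		 *   X86_FEATURE_ERMS:     REP; MOVSB body, patched in at boot
		 */
		memcpy(dst, src, 32);
	}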

---
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 2d449337a360..24711ca4033b 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -143,9 +143,7 @@ endif
export CONFIG_X86_X32_ABI

# Don't unroll struct assignments with kmemcheck enabled
-ifeq ($(CONFIG_KMEMCHECK),y)
- KBUILD_CFLAGS += $(call cc-option,-fno-builtin-memcpy)
-endif
+KBUILD_CFLAGS += $(call cc-option,-fno-builtin)

# Stackpointer is addressed different for 32 bit and 64 bit x86
sp-$(CONFIG_X86_32) := esp
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 1b020381ab38..8216c2e610ae 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -204,6 +204,12 @@ static inline int alternatives_text_reserved(void *start, void *end)
asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
: output : "i" (0), ## input)

+#define alternative_io_2(oldinstr, newinstr1, feature1, newinstr2, feature2, \
+ output, input...) \
+ asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \
+ : output \
+ : "i" (0), ## input)
+
/* Like alternative_io, but for replacing a direct call with another one. */
#define alternative_call(oldfunc, newfunc, feature, output, input...) \
asm volatile (ALTERNATIVE("call %P[old]", "call %P[new]", feature) \
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index a164862d77e3..a529a12a35ed 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -3,6 +3,8 @@

#ifdef __KERNEL__
#include <linux/jump_label.h>
+#include <asm/cpufeatures.h>
+#include <asm/alternative.h>

/* Written 2002 by Andi Kleen */

@@ -28,8 +30,37 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t
function. */

#define __HAVE_ARCH_MEMCPY 1
-extern void *memcpy(void *to, const void *from, size_t len);
-extern void *__memcpy(void *to, const void *from, size_t len);
+void *memcpy_orig(void *to, const void *from, size_t len);
+
+/*
+ * We build a call to memcpy_orig by default which gets replaced on
+ * the majority of x86 CPUs which set REP_GOOD. In addition, on CPUs
+ * which have the enhanced REP MOVSB/STOSB feature (ERMS), the REP_GOOD
+ * routine is replaced with a REP; MOVSB mem copy.
+ */
+static inline void *memcpy(void *to, const void *from, size_t len)
+{
+ void *ret;
+
+ alternative_io_2("call memcpy_orig\n\t",
+ "movq %%rdi, %%rax\n\t"
+ "movq %%rdx, %%rcx\n\t"
+ "shrq $3, %%rcx\n\t"
+ "andl $7, %%edx\n\t"
+ "rep movsq\n\t"
+ "movl %%edx, %%ecx\n\t"
+ "rep movsb\n\t",
+ X86_FEATURE_REP_GOOD,
+ "movq %%rdi, %%rax\n\t"
+ "movq %%rdx, %%rcx\n\t"
+ "rep movsb\n\t",
+ X86_FEATURE_ERMS,
+ ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from), "=d" (len)),
+ "1" (to), "2" (from), "3" (len)
+ : "memory", "rcx");
+
+ return ret;
+}

#ifndef CONFIG_KMEMCHECK
#if (__GNUC__ == 4 && __GNUC_MINOR__ < 3) || __GNUC__ < 4
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 779782f58324..c3c8890b155c 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -6,13 +6,6 @@
#include <asm/alternative-asm.h>
#include <asm/export.h>

-/*
- * We build a jump to memcpy_orig by default which gets NOPped out on
- * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
- * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
- * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
- */
-
.weak memcpy

/*
@@ -26,36 +19,12 @@
* Output:
* rax original destination
*/
-ENTRY(__memcpy)
-ENTRY(memcpy)
- ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
- "jmp memcpy_erms", X86_FEATURE_ERMS
-
- movq %rdi, %rax
- movq %rdx, %rcx
- shrq $3, %rcx
- andl $7, %edx
- rep movsq
- movl %edx, %ecx
- rep movsb
- ret
-ENDPROC(memcpy)
-ENDPROC(__memcpy)
-EXPORT_SYMBOL(memcpy)
-EXPORT_SYMBOL(__memcpy)
-
-/*
- * memcpy_erms() - enhanced fast string memcpy. This is faster and
- * simpler than memcpy. Use memcpy_erms when possible.
- */
-ENTRY(memcpy_erms)
- movq %rdi, %rax
- movq %rdx, %rcx
- rep movsb
- ret
-ENDPROC(memcpy_erms)
-
ENTRY(memcpy_orig)
+ pushq %r8
+ pushq %r9
+ pushq %r10
+ pushq %r11
+
movq %rdi, %rax

cmpq $0x20, %rdx
@@ -179,8 +148,14 @@ ENTRY(memcpy_orig)
movb %cl, (%rdi)

.Lend:
+ popq %r11
+ popq %r10
+ popq %r9
+ popq %r8
+
retq
ENDPROC(memcpy_orig)
+EXPORT_SYMBOL_GPL(memcpy_orig)

#ifndef CONFIG_UML
/*

--
Regards/Gruss,
Boris.

SUSE Linux GmbH, GF: Felix Imendörffer, Jane Smithard, Graham Norton, HRB 21284 (AG Nürnberg)
--