[tip:x86/asm] x86/asm: Optimize clear_page()

From: tip-bot for Borislav Petkov
Date: Tue Mar 07 2017 - 03:51:19 EST


Commit-ID: f25d384755191690b1196776d319cb6a4e899f28
Gitweb: http://git.kernel.org/tip/f25d384755191690b1196776d319cb6a4e899f28
Author: Borislav Petkov <bp@xxxxxxx>
AuthorDate: Thu, 9 Feb 2017 01:34:49 +0100
Committer: Ingo Molnar <mingo@xxxxxxxxxx>
CommitDate: Tue, 7 Mar 2017 08:28:00 +0100

x86/asm: Optimize clear_page()

Currently, we CALL clear_page() which then JMPs to the proper function
chosen by the alternatives.

What we should do instead is CALL the proper function directly. (This
was something Ingo suggested a while ago). So let's do that.

Measuring our favourite kernel build workload shows that there are no
significant changes in performance.

AMD
===
-- /tmp/before 2017-02-09 18:01:46.451961188 +0100
++ /tmp/after 2017-02-09 18:01:54.883961175 +0100
@@ -1,15 +1,15 @@
Performance counter stats for 'system wide' (5 runs):

- 1028960.373643 cpu-clock (msec) # 6.000 CPUs utilized ( +- 1.41% )
+ 1023086.018961 cpu-clock (msec) # 6.000 CPUs utilized ( +- 1.20% )
- 518,744 context-switches # 0.504 K/sec ( +- 1.04% )
+ 518,254 context-switches # 0.507 K/sec ( +- 1.01% )
- 38,112 cpu-migrations # 0.037 K/sec ( +- 1.95% )
+ 37,917 cpu-migrations # 0.037 K/sec ( +- 1.02% )
- 20,874,266 page-faults # 0.020 M/sec ( +- 0.07% )
+ 20,918,897 page-faults # 0.020 M/sec ( +- 0.18% )
- 2,043,646,230,667 cycles # 1.986 GHz ( +- 0.14% ) (66.67%)
+ 2,045,305,584,032 cycles # 1.999 GHz ( +- 0.16% ) (66.67%)
- 553,698,855,431 stalled-cycles-frontend # 27.09% frontend cycles idle ( +- 0.07% ) (66.67%)
+ 555,099,401,413 stalled-cycles-frontend # 27.14% frontend cycles idle ( +- 0.13% ) (66.67%)
- 621,544,286,390 stalled-cycles-backend # 30.41% backend cycles idle ( +- 0.39% ) (66.67%)
+ 621,371,430,254 stalled-cycles-backend # 30.38% backend cycles idle ( +- 0.32% ) (66.67%)
- 1,738,364,431,659 instructions # 0.85 insn per cycle
+ 1,739,895,771,901 instructions # 0.85 insn per cycle
- # 0.36 stalled cycles per insn ( +- 0.11% ) (66.67%)
+ # 0.36 stalled cycles per insn ( +- 0.13% ) (66.67%)
- 391,170,943,850 branches # 380.161 M/sec ( +- 0.13% ) (66.67%)
+ 391,398,551,757 branches # 382.567 M/sec ( +- 0.13% ) (66.67%)
- 22,567,810,411 branch-misses # 5.77% of all branches ( +- 0.11% ) (66.67%)
+ 22,574,726,683 branch-misses # 5.77% of all branches ( +- 0.13% ) (66.67%)

- 171.480741921 seconds time elapsed ( +- 1.41% )
+ 170.509229451 seconds time elapsed ( +- 1.20% )

Intel
=====

-- /tmp/before 2017-02-09 20:36:19.851947473 +0100
++ /tmp/after 2017-02-09 20:36:30.151947458 +0100
@@ -1,15 +1,15 @@
Performance counter stats for 'system wide' (5 runs):

- 2207248.598126 cpu-clock (msec) # 8.000 CPUs utilized ( +- 0.69% )
+ 2213300.106631 cpu-clock (msec) # 8.000 CPUs utilized ( +- 0.73% )
- 899,342 context-switches # 0.407 K/sec ( +- 0.68% )
+ 898,381 context-switches # 0.406 K/sec ( +- 0.79% )
- 80,553 cpu-migrations # 0.036 K/sec ( +- 1.13% )
+ 80,979 cpu-migrations # 0.037 K/sec ( +- 1.11% )
- 36,171,148 page-faults # 0.016 M/sec ( +- 0.02% )
+ 36,179,791 page-faults # 0.016 M/sec ( +- 0.02% )
- 6,665,288,826,484 cycles # 3.020 GHz ( +- 0.07% ) (83.33%)
+ 6,671,638,410,799 cycles # 3.014 GHz ( +- 0.06% ) (83.33%)
- 5,065,975,115,197 stalled-cycles-frontend # 76.01% frontend cycles idle ( +- 0.11% ) (83.33%)
+ 5,076,835,183,223 stalled-cycles-frontend # 76.10% frontend cycles idle ( +- 0.11% ) (83.33%)
- 3,841,556,350,614 stalled-cycles-backend # 57.64% backend cycles idle ( +- 0.13% ) (66.67%)
+ 3,852,823,974,333 stalled-cycles-backend # 57.75% backend cycles idle ( +- 0.12% ) (66.67%)
- 4,148,398,171,079 instructions # 0.62 insn per cycle
+ 4,148,997,156,059 instructions # 0.62 insn per cycle
- # 1.22 stalled cycles per insn ( +- 0.10% ) (83.33%)
+ # 1.22 stalled cycles per insn ( +- 0.11% ) (83.33%)
- 887,187,118,591 branches # 401.943 M/sec ( +- 0.09% ) (83.33%)
+ 887,271,341,121 branches # 400.882 M/sec ( +- 0.11% ) (83.33%)
- 30,139,439,034 branch-misses # 3.40% of all branches ( +- 0.09% ) (83.33%)
+ 30,134,864,997 branch-misses # 3.40% of all branches ( +- 0.06% ) (83.33%)

- 275.904405540 seconds time elapsed ( +- 0.69% )
+ 276.660352016 seconds time elapsed ( +- 0.73% )

allmodconfig vmlinux size grows by a ~1Kb but that's fine - we optimize
our calling of the clear_page variants.

text data bss dec hex filename
9051979 23067670 27009024 59128673 3863b61 vmlinux
9053000 23067670 27009024 59129694 3863f5e vmlinux.clear_page

Reported-by: kernel test robot <fengguang.wu@xxxxxxxxx>
Tested-by: Fengguang Wu <fengguang.wu@xxxxxxxxx>
Signed-off-by: Borislav Petkov <bp@xxxxxxx>
Cc: Andy Lutomirski <luto@xxxxxxxxxx>
Cc: Borislav Petkov <bp@xxxxxxxxx>
Cc: Brian Gerst <brgerst@xxxxxxxxx>
Cc: Denys Vlasenko <dvlasenk@xxxxxxxxxx>
Cc: H. Peter Anvin <hpa@xxxxxxxxx>
Cc: Josh Poimboeuf <jpoimboe@xxxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Link: http://lkml.kernel.org/r/20170215111927.emdgxf2pide3kwro@xxxxxxx
Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx>
---
arch/x86/include/asm/page_64.h | 16 +++++++++++++++-
arch/x86/lib/clear_page_64.S | 17 +++++++----------
2 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
index b3bebf9..b4a0d43 100644
--- a/arch/x86/include/asm/page_64.h
+++ b/arch/x86/include/asm/page_64.h
@@ -4,6 +4,7 @@
#include <asm/page_64_types.h>

#ifndef __ASSEMBLY__
+#include <asm/alternative.h>

/* duplicated to the one in bootmem.h */
extern unsigned long max_pfn;
@@ -34,7 +35,20 @@ extern unsigned long __phys_addr_symbol(unsigned long);
#define pfn_valid(pfn) ((pfn) < max_pfn)
#endif

-void clear_page(void *page);
+void clear_page_orig(void *page);
+void clear_page_rep(void *page);
+void clear_page_erms(void *page);
+
+static inline void clear_page(void *page)
+{
+ alternative_call_2(clear_page_orig,
+ clear_page_rep, X86_FEATURE_REP_GOOD,
+ clear_page_erms, X86_FEATURE_ERMS,
+ "=D" (page),
+ "0" (page)
+ : "memory", "rax", "rcx");
+}
+
void copy_page(void *to, void *from);

#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index 5e2af3a..81b1635 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -14,20 +14,15 @@
* Zero a page.
* %rdi - page
*/
-ENTRY(clear_page)
-
- ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \
- "jmp clear_page_c_e", X86_FEATURE_ERMS
-
+ENTRY(clear_page_rep)
movl $4096/8,%ecx
xorl %eax,%eax
rep stosq
ret
-ENDPROC(clear_page)
-EXPORT_SYMBOL(clear_page)
+ENDPROC(clear_page_rep)
+EXPORT_SYMBOL_GPL(clear_page_rep)

ENTRY(clear_page_orig)
-
xorl %eax,%eax
movl $4096/64,%ecx
.p2align 4
@@ -47,10 +42,12 @@ ENTRY(clear_page_orig)
nop
ret
ENDPROC(clear_page_orig)
+EXPORT_SYMBOL_GPL(clear_page_orig)

-ENTRY(clear_page_c_e)
+ENTRY(clear_page_erms)
movl $4096,%ecx
xorl %eax,%eax
rep stosb
ret
-ENDPROC(clear_page_c_e)
+ENDPROC(clear_page_erms)
+EXPORT_SYMBOL_GPL(clear_page_erms)