[RFC 7/7] x86/current: Aggressive caching of current

From: Nadav Amit
Date: Thu Jul 18 2019 - 21:04:04 EST


The current_task is supposed to be constant in each thread and therefore
does not need to be reread. There is already an attempt to cache it
using inline assembly, using this_cpu_read_stable(), which hides the
dependency on the read memory address.

However, this caching is not working very well. For example,
sync_mm_rss() still reads current_task twice for no reason.

Allow more aggressive caching by aliasing current_task to
into a constant const_current_task and reading from the constant copy.
Doing so requires the compiler to support x86 segment qualifiers.
Hide const_current_task in a different compilation unit to avoid the
compiler from assuming that the value is constant during compilation.

Signed-off-by: Nadav Amit <namit@xxxxxxxxxx>
---
arch/x86/include/asm/current.h | 30 ++++++++++++++++++++++++++++++
arch/x86/kernel/cpu/Makefile | 1 +
arch/x86/kernel/cpu/common.c | 7 +------
arch/x86/kernel/cpu/current.c | 16 ++++++++++++++++
include/linux/compiler.h | 2 +-
5 files changed, 49 insertions(+), 7 deletions(-)
create mode 100644 arch/x86/kernel/cpu/current.c

diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h
index 3e204e6140b5..7f093e81a647 100644
--- a/arch/x86/include/asm/current.h
+++ b/arch/x86/include/asm/current.h
@@ -10,11 +10,41 @@ struct task_struct;

DECLARE_PER_CPU(struct task_struct *, current_task);

+#if USE_X86_SEG_SUPPORT
+
+/*
+ * Hold a constant alias for current_task, which would allow to avoid caching of
+ * current task.
+ *
+ * We must mark const_current_task with the segment qualifiers, as otherwise gcc
+ * would do redundant reads of const_current_task.
+ */
+DECLARE_PER_CPU(struct task_struct * const __percpu_seg_override, const_current_task);
+
+static __always_inline struct task_struct *get_current(void)
+{
+
+ /*
+ * GCC is missing functionality of removing segment qualifiers, which
+ * messes with per-cpu infrastructure that holds local copies. Use
+ * __raw_cpu_read to avoid holding any copy.
+ */
+ return __raw_cpu_read(, const_current_task);
+}
+
+#else /* USE_X86_SEG_SUPPORT */
+
+/*
+ * Without segment qualifier support, the per-cpu infrastrucutre is not
+ * suitable for reading constants, so use this_cpu_read_stable() in this case.
+ */
static __always_inline struct task_struct *get_current(void)
{
return this_cpu_read_stable(current_task);
}

+#endif /* USE_X86_SEG_SUPPORT */
+
#define current get_current()

#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index d7a1e5a9331c..d816f03a37d7 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -19,6 +19,7 @@ CFLAGS_common.o := $(nostackp)

obj-y := cacheinfo.o scattered.o topology.o
obj-y += common.o
+obj-y += current.o
obj-y += rdrand.o
obj-y += match.o
obj-y += bugs.o
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index e0489d2860d3..33a6b51e8059 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1607,13 +1607,8 @@ DEFINE_PER_CPU_FIRST(struct fixed_percpu_data,
EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data);

/*
- * The following percpu variables are hot. Align current_task to
- * cacheline size such that they fall in the same cacheline.
+ * The following percpu variables are hot.
*/
-DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
- &init_task;
-EXPORT_PER_CPU_SYMBOL(current_task);
-
DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;

diff --git a/arch/x86/kernel/cpu/current.c b/arch/x86/kernel/cpu/current.c
new file mode 100644
index 000000000000..3238c6e34984
--- /dev/null
+++ b/arch/x86/kernel/cpu/current.c
@@ -0,0 +1,16 @@
+#include <linux/sched/task.h>
+#include <asm/current.h>
+
+/*
+ * Align current_task to cacheline size such that they fall in the same
+ * cacheline.
+ */
+DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
+ &init_task;
+EXPORT_PER_CPU_SYMBOL(current_task);
+
+#if USE_X86_SEG_SUPPORT
+DECLARE_PER_CPU(struct task_struct * const __percpu_seg_override, const_current_task)
+ __attribute__((alias("current_task")));
+EXPORT_PER_CPU_SYMBOL(const_current_task);
+#endif
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index f0fd5636fddb..1b6ee9ab6373 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -299,7 +299,7 @@ unsigned long read_word_at_a_time(const void *addr)
*/
#define __ADDRESSABLE(sym) \
static void * __section(".discard.addressable") __used \
- __PASTE(__addressable_##sym, __LINE__) = (void *)&sym;
+ __PASTE(__addressable_##sym, __LINE__) = (void *)(uintptr_t)&sym;

/**
* offset_to_ptr - convert a relative memory offset to an absolute pointer
--
2.17.1