[PATCH RFC 3/3] x86/fpu: defer FPU state load until return to userspace

From: riel
Date: Mon Oct 17 2016 - 16:13:25 EST


From: Rik van Riel <riel@xxxxxxxxxx>

Defer loading of FPU state until return to userspace. This allows
the kernel to skip loading FPU state for tasks that stay in
kernel mode, and for tasks that end up with repeated invocations
of kernel_fpu_begin().
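
A minimal sketch of the repeated kernel_fpu_begin() case this helps
(illustrative only, not part of the diff below):

	static void repeated_kernel_fpu_use(void)
	{
		kernel_fpu_begin();	/* saves the user FPU state to fpu->state */
		/* ... kernel FPU work ... */
		kernel_fpu_end();	/* old: reload user state now; new: just set TIF_LOAD_FPU */

		kernel_fpu_begin();	/* new: nothing to save, registers only hold kernel scratch */
		/* ... more kernel FPU work ... */
		kernel_fpu_end();	/* the user state is loaded once, on exit to userspace */
	}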

It also increases the chances that a task's FPU state will remain
valid in the FPU registers until it is scheduled back in, allowing
us to skip restoring that task's FPU state altogether.
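
That skip relies on the pre-existing lazy-restore tracking (fpu->last_cpu
and the per-cpu fpu_fpregs_owner_ctx pointer). Roughly, the reload can be
avoided when the check below holds; fpregs_still_valid() is an
illustrative name, not a helper added by this patch:

	static bool fpregs_still_valid(struct fpu *fpu, unsigned int cpu)
	{
		/* registers were last loaded from this task's fpstate ... */
		return fpu == this_cpu_read(fpu_fpregs_owner_ctx) &&
		       /* ... on this CPU, and the task has not migrated since */
		       cpu == fpu->last_cpu;
	}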

This also lays the groundwork for not having to restore QEMU
userspace FPU state in KVM VCPU threads, when merely returning
to the host kernel because the guest went idle, or is running a
kernel thread. That functionality will come in a later patch.

Signed-off-by: Rik van Riel <riel@xxxxxxxxxx>
---
arch/x86/entry/common.c | 9 +++++++++
arch/x86/include/asm/fpu/api.h | 5 +++++
arch/x86/include/asm/fpu/internal.h | 13 +++++--------
arch/x86/include/asm/thread_info.h | 4 +++-
arch/x86/kernel/fpu/core.c | 28 ++++++++++++++++++++++++----
arch/x86/kernel/process_32.c | 5 ++---
arch/x86/kernel/process_64.c | 5 ++---
7 files changed, 50 insertions(+), 19 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index bdd9cc59d20f..0c11ee22f90b 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -27,6 +27,7 @@
#include <asm/vdso.h>
#include <asm/uaccess.h>
#include <asm/cpufeature.h>
+#include <asm/fpu/api.h>

#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>
@@ -189,6 +190,14 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
exit_to_usermode_loop(regs, cached_flags);

+ /* Reload ti->flags; we may have rescheduled above. */
+ cached_flags = READ_ONCE(ti->flags);
+
+ if (unlikely(cached_flags & _TIF_LOAD_FPU)) {
+ clear_thread_flag(TIF_LOAD_FPU);
+ switch_fpu_return();
+ }
+
#ifdef CONFIG_COMPAT
/*
* Compat syscalls set TS_COMPAT. Make sure we clear it before
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index 1429a7c736db..d7ef49a03b51 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -37,6 +37,11 @@ extern int irq_ts_save(void);
extern void irq_ts_restore(int TS_state);

/*
+ * Load the task FPU state before returning to userspace.
+ */
+extern void switch_fpu_return(void);
+
+/*
* Query the presence of one or more xfeatures. Works on any legacy CPU as well.
*
* If 'feature_name' is set then put a human-readable description of
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index d40deb337807..cccc0c059b41 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -601,8 +601,8 @@ static inline void make_fpregs_active_loadnew(void)
* - switch_fpu_prepare() saves the old state.
* This is done within the context of the old process.
*
- * - switch_fpu_finish() restores the new state as
- * necessary.
+ * - switch_fpu_finish() sets TIF_LOAD_FPU; the floating point state
+ * will get loaded on return to userspace, or when the kernel needs it.
*/
static inline void
switch_fpu_prepare(struct fpu *old_fpu, int cpu)
@@ -628,13 +628,10 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu)
* Set up the userspace FPU context for the new task, if the task
* has used the FPU.
*/
-static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu)
+static inline void switch_fpu_finish(void)
{
- bool preload = static_cpu_has(X86_FEATURE_FPU) &&
- new_fpu->fpstate_active;
-
- if (preload)
- __make_fpregs_active(new_fpu, cpu);
+ if (static_cpu_has(X86_FEATURE_FPU))
+ set_thread_flag(TIF_LOAD_FPU);
}

/*
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 2aaca53c0974..9941d118f2cc 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -90,6 +90,7 @@ struct task_struct;
#define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */
#define TIF_ADDR32 29 /* 32-bit address space on 64 bits */
#define TIF_X32 30 /* 32-bit native x86-64 binary */
+#define TIF_LOAD_FPU 31 /* load FPU on return to userspace */

#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
@@ -112,6 +113,7 @@ struct task_struct;
#define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT)
#define _TIF_ADDR32 (1 << TIF_ADDR32)
#define _TIF_X32 (1 << TIF_X32)
+#define _TIF_LOAD_FPU (1 << TIF_LOAD_FPU)

/*
* work to do in syscall_trace_enter(). Also includes TIF_NOHZ for
@@ -125,7 +127,7 @@ struct task_struct;
/* work to do on any return to user space */
#define _TIF_ALLWORK_MASK \
((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT | \
- _TIF_NOHZ)
+ _TIF_NOHZ | _TIF_LOAD_FPU)

/* flags to check in __switch_to() */
#define _TIF_WORK_CTXSW \
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 34ba9d47c20f..09c4254a6e26 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -99,14 +99,14 @@ void __kernel_fpu_begin(void)

kernel_fpu_disable();

+ __cpu_invalidate_fpregs_state();
+
if (fpu->fpregs_active) {
/*
* Ignore return value -- we don't care if reg state
* is clobbered.
*/
copy_fpregs_to_fpstate(fpu);
- } else {
- __cpu_invalidate_fpregs_state();
}
}
EXPORT_SYMBOL(__kernel_fpu_begin);
@@ -115,8 +115,10 @@ void __kernel_fpu_end(void)
{
struct fpu *fpu = &current->thread.fpu;

- if (fpu->fpregs_active)
- copy_kernel_to_fpregs(&fpu->state);
+ if (fpu->fpregs_active) {
+ switch_fpu_finish();
+ fpu->fpregs_active = 0;
+ }

kernel_fpu_enable();
}
@@ -501,6 +503,24 @@ void fpu__clear(struct fpu *fpu)
}

/*
+ * Load FPU context before returning to userspace.
+ */
+void switch_fpu_return(void)
+{
+ if (!static_cpu_has(X86_FEATURE_FPU))
+ return;
+
+ /*
+ * We should never return to user space without the task's
+ * own FPU contents loaded into the registers. That makes it
+ * a bug to not have the task's FPU state set up.
+ */
+ WARN_ON_FPU(!fpstate_active());
+
+ make_fpregs_active();
+}
+
+/*
* x87 math exception handling:
*/

diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 7dc8c9c3d801..9103871f80f5 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -229,7 +229,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct thread_struct *prev = &prev_p->thread,
*next = &next_p->thread;
struct fpu *prev_fpu = &prev->fpu;
- struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(cpu_tss, cpu);

@@ -294,9 +293,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
if (prev->gs | next->gs)
lazy_load_gs(next->gs);

- switch_fpu_finish(next_fpu, cpu);
-
this_cpu_write(current_task, next_p);

+ switch_fpu_finish();
+
return prev_p;
}
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 705669efb762..4b228e1e4423 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -260,7 +260,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct thread_struct *prev = &prev_p->thread;
struct thread_struct *next = &next_p->thread;
struct fpu *prev_fpu = &prev->fpu;
- struct fpu *next_fpu = &next->fpu;
int cpu = smp_processor_id();
struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
unsigned prev_fsindex, prev_gsindex;
@@ -415,8 +414,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
prev->gsbase = 0;
prev->gsindex = prev_gsindex;

- switch_fpu_finish(next_fpu, cpu);
-
/*
* Switch the PDA and FPU contexts.
*/
@@ -425,6 +422,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/* Reload esp0 and ss1. This changes current_thread_info(). */
load_sp0(tss, next);

+ switch_fpu_finish();
+
/*
* Now maybe reload the debug registers and handle I/O bitmaps
*/
--
2.7.4