[RFC PATCH 10/10] x86/fpu: defer FPU state load until return to userspace
From: Sebastian Andrzej Siewior
Date: Wed Sep 12 2018 - 09:36:09 EST
From: Rik van Riel <riel@xxxxxxxxxxx>
Defer loading of FPU state until return to userspace. This gives
the kernel the potential to skip loading FPU state for tasks that
stay in kernel mode, or for tasks that end up with repeated
invocations of kernel_fpu_begin.
It also increases the chances that a task's FPU state will remain
valid in the FPU registers until it is scheduled back in, allowing
us to skip restoring that task's FPU state altogether.
Signed-off-by: Rik van Riel <riel@xxxxxxxxxxx>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@xxxxxxxxxxxxx>
---
arch/x86/entry/common.c | 9 +++++++++
arch/x86/include/asm/fpu/api.h | 5 +++++
arch/x86/include/asm/fpu/internal.h | 16 +++++++++------
arch/x86/kernel/fpu/core.c | 31 ++++++++++++++++++++++-------
arch/x86/kernel/process_32.c | 7 ++++---
arch/x86/kernel/process_64.c | 7 ++++---
arch/x86/kvm/x86.c | 10 +++++++---
arch/x86/mm/pkeys.c | 2 +-
8 files changed, 64 insertions(+), 23 deletions(-)
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 3b2490b819181..180e16a8177d2 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -31,6 +31,7 @@
#include <asm/vdso.h>
#include <linux/uaccess.h>
#include <asm/cpufeature.h>
+#include <asm/fpu/api.h>
#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>
@@ -196,6 +197,14 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
exit_to_usermode_loop(regs, cached_flags);
+ /* Reload ti->flags; we may have rescheduled above. */
+ cached_flags = READ_ONCE(ti->flags);
+
+ if (unlikely(cached_flags & _TIF_LOAD_FPU)) {
+ clear_thread_flag(TIF_LOAD_FPU);
+ switch_fpu_return();
+ }
+
#ifdef CONFIG_COMPAT
/*
* Compat syscalls set TS_COMPAT. Make sure we clear it before
diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index a9caac9d4a729..308e66a7fcd62 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -27,6 +27,11 @@ extern void kernel_fpu_begin(void);
extern void kernel_fpu_end(void);
extern bool irq_fpu_usable(void);
+/*
+ * Load the task FPU state before returning to userspace.
+ */
+extern void switch_fpu_return(void);
+
/*
* Query the presence of one or more xfeatures. Works on any legacy CPU as well.
*
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 184d76c6470b1..370686972592e 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -472,9 +472,11 @@ static inline void fpregs_activate(struct fpu *fpu)
/*
* Load the FPU state for the current task. Call with preemption disabled.
*/
-static inline void __fpregs_load_activate(struct fpu *fpu, int cpu)
+static inline void __fpregs_load_activate(void)
{
- if (!fpregs_state_valid(fpu, cpu))
+ struct fpu *fpu = ¤t->thread.fpu;
+
+ if (!fpregs_state_valid(fpu, smp_processor_id()))
copy_kernel_to_fpregs(&fpu->state);
fpregs_activate(fpu);
}
@@ -482,11 +484,13 @@ static inline void __fpregs_load_activate(struct fpu *fpu, int cpu)
static inline void __fpregs_changes_begin(void)
{
preempt_disable();
+ local_bh_disable();
}
static inline void __fpregs_changes_end(void)
{
preempt_enable();
+ local_bh_enable();
}
/*
@@ -497,8 +501,8 @@ static inline void __fpregs_changes_end(void)
* - switch_fpu_prepare() saves the old state.
* This is done within the context of the old process.
*
- * - switch_fpu_finish() restores the new state as
- * necessary.
+ * - switch_fpu_finish() sets TIF_LOAD_FPU; the floating point state
+ * will get loaded on return to userspace, or when the kernel needs it.
*/
static inline void
switch_fpu_prepare(struct fpu *old_fpu, int cpu)
@@ -523,7 +527,7 @@ switch_fpu_prepare(struct fpu *old_fpu, int cpu)
* Set up the userspace FPU context for the new task, if the task
* has used the FPU.
*/
-static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu)
+static inline void switch_fpu_finish(struct fpu *new_fpu)
{
bool preload = static_cpu_has(X86_FEATURE_FPU) &&
new_fpu->initialized;
@@ -531,7 +535,7 @@ static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu)
if (!preload)
return;
- __fpregs_load_activate(new_fpu, cpu);
+ set_thread_flag(TIF_LOAD_FPU);
/* Protection keys need to be in place right at context switch time. */
if (boot_cpu_has(X86_FEATURE_OSPKE)) {
if (new_fpu->pkru != __read_pkru())
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 8564086c217fc..cbbd20515cf46 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -101,14 +101,14 @@ void __kernel_fpu_begin(void)
kernel_fpu_disable();
- if (fpu->initialized) {
+ __cpu_invalidate_fpregs_state();
+
+ if (!test_and_set_thread_flag(TIF_LOAD_FPU)) {
/*
* Ignore return value -- we don't care if reg state
* is clobbered.
*/
copy_fpregs_to_fpstate(fpu);
- } else {
- __cpu_invalidate_fpregs_state();
}
}
EXPORT_SYMBOL(__kernel_fpu_begin);
@@ -117,8 +117,7 @@ void __kernel_fpu_end(void)
{
struct fpu *fpu = ¤t->thread.fpu;
- if (fpu->initialized)
- copy_kernel_to_fpregs(&fpu->state);
+ switch_fpu_finish(fpu);
kernel_fpu_enable();
}
@@ -147,7 +146,7 @@ void fpu__save(struct fpu *fpu)
{
WARN_ON_FPU(fpu != ¤t->thread.fpu);
- preempt_disable();
+ __fpregs_changes_begin();
trace_x86_fpu_before_save(fpu);
if (fpu->initialized) {
if (!copy_fpregs_to_fpstate(fpu)) {
@@ -155,7 +154,7 @@ void fpu__save(struct fpu *fpu)
}
}
trace_x86_fpu_after_save(fpu);
- preempt_enable();
+ __fpregs_changes_end();
}
EXPORT_SYMBOL_GPL(fpu__save);
@@ -403,6 +402,24 @@ void fpu__clear(struct fpu *fpu)
}
}
+/*
+ * Load FPU context before returning to userspace.
+ */
+void switch_fpu_return(void)
+{
+ if (!static_cpu_has(X86_FEATURE_FPU))
+ return;
+
+ /*
+ * We should never return to user space without the task's
+ * own FPU contents loaded into the registers. That makes it
+ * a bug to not have the task's FPU state set up.
+ */
+ WARN_ON_FPU(!current->thread.fpu.initialized);
+
+ __fpregs_load_activate();
+}
+
/*
* x87 math exception handling:
*/
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 5046a3c9dec2f..a65f8ce36379b 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -236,7 +236,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
- switch_fpu_prepare(prev_fpu, cpu);
+ if (prev_fpu->initialized && !test_thread_flag(TIF_LOAD_FPU))
+ switch_fpu_prepare(prev_fpu, cpu);
/*
* Save away %gs. No need to save %fs, as it was saved on the
@@ -297,10 +298,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
if (prev->gs | next->gs)
lazy_load_gs(next->gs);
- switch_fpu_finish(next_fpu, cpu);
-
this_cpu_write(current_task, next_p);
+ switch_fpu_finish(next_fpu);
+
/* Load the Intel cache allocation PQR MSR. */
intel_rdt_sched_in();
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index ea5ea850348da..66b763f3da6a0 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -427,7 +427,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
this_cpu_read(irq_count) != -1);
- switch_fpu_prepare(prev_fpu, cpu);
+ if (prev_fpu->initialized && !test_thread_flag(TIF_LOAD_FPU))
+ switch_fpu_prepare(prev_fpu, cpu);
/* We must save %fs and %gs before load_TLS() because
* %fs and %gs may be cleared by load_TLS().
@@ -478,8 +479,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
load_seg_legacy(prev->gsindex, prev->gsbase,
next->gsindex, next->gsbase, GS);
- switch_fpu_finish(next_fpu, cpu);
-
/*
* Switch the PDA and FPU contexts.
*/
@@ -489,6 +488,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/* Reload sp0. */
update_task_stack(next_p);
+ switch_fpu_finish(next_fpu);
+
/*
* Now maybe reload the debug registers and handle I/O bitmaps
*/
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2d8b3c257e769..d9a8c5ec8d6fe 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -7573,6 +7573,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
wait_lapic_expire(vcpu);
guest_enter_irqoff();
+ if (test_and_clear_thread_flag(TIF_LOAD_FPU))
+ switch_fpu_return();
+
if (unlikely(vcpu->arch.switch_db_regs)) {
set_debugreg(0, 7);
set_debugreg(vcpu->arch.eff_db[0], 0);
@@ -7832,22 +7835,23 @@ static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
/* Swap (qemu) user FPU context for the guest FPU context. */
static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
- preempt_disable();
+ __fpregs_changes_begin();
copy_fpregs_to_fpstate(&vcpu->arch.user_fpu);
/* PKRU is separately restored in kvm_x86_ops->run. */
__copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state,
~XFEATURE_MASK_PKRU);
- preempt_enable();
+ __fpregs_changes_end();
trace_kvm_fpu(1);
}
/* When vcpu_run ends, restore user space FPU context. */
static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
{
- preempt_disable();
+ __fpregs_changes_begin();
copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu);
copy_kernel_to_fpregs(&vcpu->arch.user_fpu.state);
preempt_enable();
+ __fpregs_changes_end();
++vcpu->stat.fpu_reload;
trace_kvm_fpu(0);
}
diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c
index 6e91529fe2a30..83835abd526d7 100644
--- a/arch/x86/mm/pkeys.c
+++ b/arch/x86/mm/pkeys.c
@@ -28,7 +28,7 @@ void write_pkru(u32 pkru)
current->thread.fpu.pkru = pkru;
__fpregs_changes_begin();
- __fpregs_load_activate(¤t->thread.fpu, smp_processor_id());
+ __fpregs_load_activate();
__write_pkru(pkru);
__fpregs_changes_end();
}
--
2.19.0