[PATCH] x86/fpu: Allow nested in-kernel use of FPU

From: Xiao Liang
Date: Wed Apr 03 2024 - 10:03:29 EST


When a softirq preempts a task that has the kernel FPU in use, the
softirq is not allowed to use the FPU in the current implementation.
This has performance drawbacks, e.g. for SIMD crypto algorithms.

To enable nested in-kernel use of the FPU, the preempting softirq or
hardirq saves the kernel FPU state to a per-cpu variable when entering
a nested FPU section (kernel_fpu_begin_mask) and restores it on exit
(kernel_fpu_end).
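
As an illustration (not part of the patch), a minimal sketch of the
nesting this enables, where task_level_work() is a hypothetical
in-kernel FPU user:

#include <asm/fpu/api.h>

static void task_level_work(void)
{
	kernel_fpu_begin();	/* outer section, task context */
	/*
	 * Preemption is disabled here, but a softirq or hardirq can
	 * still interrupt. With this patch, the handler's own
	 * kernel_fpu_begin() saves the live FPU state to the per-cpu
	 * kernel_fpu buffer instead of being refused, and its
	 * kernel_fpu_end() restores that state before we resume.
	 */
	kernel_fpu_end();
}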

Signed-off-by: Xiao Liang <shaw.leon@xxxxxxxxx>
---

We observed a performance drop when testing IPsec AES crypto with the aesni
driver. When the FPU is not available in softirq context, crypto_simd defers
the cipher to async processing in cryptd, causing throughput to drop from
~600 Mbps to ~200 Mbps on our testbox, and it cannot recover from this state
until the cryptd queue is drained. This patch is intended to improve
performance in such cases.
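
As a rough sketch of the glue-code pattern involved (loosely modeled on
the aesni glue code; defer_to_cryptd() is a hypothetical stand-in for
the driver's actual async fallback path):

#include <crypto/internal/simd.h>
#include <crypto/skcipher.h>
#include <asm/fpu/api.h>

static int defer_to_cryptd(struct skcipher_request *req);

static int encrypt_one(struct skcipher_request *req)
{
	/*
	 * Before this patch, crypto_simd_usable() is false in a
	 * softirq that preempted an in-kernel FPU user, forcing the
	 * slow asynchronous cryptd path below.
	 */
	if (!crypto_simd_usable())
		return defer_to_cryptd(req);

	kernel_fpu_begin();
	/* ... AES-NI instructions on the request's data ... */
	kernel_fpu_end();
	return 0;
}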

For the referenced arm64 implementation, see

aefbab8e77eb ("arm64: fpsimd: Preserve/restore kernel mode NEON at context switch")

The main difference is that the FPU is allowed in hardirq context on x86,
and the FPU context is saved in per-cpu variables so as not to touch
task_struct.

arch/x86/kernel/fpu/core.c | 49 ++++++++++++++++++++++++++++----------
1 file changed, 36 insertions(+), 13 deletions(-)

diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 520deb411a70..7f21e70fcceb 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -43,9 +43,15 @@ struct fpu_state_config fpu_user_cfg __ro_after_init;
*/
struct fpstate init_fpstate __ro_after_init;

-/* Track in-kernel FPU usage */
+/*
+ * Task can be preempted by softirq or hardirq even when kernel FPU is in use.
+ * The flag in_kernel_fpu tracks such nestable kernel FPU usage.
+ */
static DEFINE_PER_CPU(bool, in_kernel_fpu);

+/* Save/restore fpstate when beginning/ending a nested kernel FPU section. */
+static DEFINE_PER_CPU(struct fpu, kernel_fpu);
+
/*
* Track which context is using the FPU on the CPU:
*/
@@ -60,10 +66,6 @@ bool irq_fpu_usable(void)
if (WARN_ON_ONCE(in_nmi()))
return false;

- /* In kernel FPU usage already active? */
- if (this_cpu_read(in_kernel_fpu))
- return false;
-
/*
* When not in NMI or hard interrupt context, FPU can be used in:
*
@@ -423,14 +425,28 @@ void kernel_fpu_begin_mask(unsigned int kfpu_mask)
preempt_disable();

WARN_ON_FPU(!irq_fpu_usable());
- WARN_ON_FPU(this_cpu_read(in_kernel_fpu));

- this_cpu_write(in_kernel_fpu, true);
+ if (this_cpu_read(in_kernel_fpu)) {
+ struct fpu *fpu = this_cpu_ptr(&kernel_fpu);
+
+		/* Tasks cannot preempt a kernel FPU section. */
+ WARN_ON_ONCE(in_task());
+
+ if (unlikely(!fpu->fpstate))
+ fpstate_reset(fpu);

- if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER)) &&
- !test_thread_flag(TIF_NEED_FPU_LOAD)) {
- set_thread_flag(TIF_NEED_FPU_LOAD);
- save_fpregs_to_fpstate(&current->thread.fpu);
+		/* Save kernel FPU state when beginning a nested section. */
+ save_fpregs_to_fpstate(fpu);
+ } else {
+ fpregs_lock();
+ if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER)) &&
+ !test_thread_flag(TIF_NEED_FPU_LOAD)) {
+ set_thread_flag(TIF_NEED_FPU_LOAD);
+ save_fpregs_to_fpstate(&current->thread.fpu);
+ }
+ if (in_task())
+ this_cpu_write(in_kernel_fpu, true);
+ fpregs_unlock();
}
__cpu_invalidate_fpregs_state();

@@ -445,9 +461,16 @@ EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask);

void kernel_fpu_end(void)
{
- WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));
+	/*
+	 * When returning from a nested kernel FPU section, restore the
+	 * previously saved fpstate.
+	 */
+ if (!in_task() && this_cpu_read(in_kernel_fpu))
+ restore_fpregs_from_fpstate(this_cpu_ptr(&kernel_fpu)->fpstate,
+ XFEATURE_MASK_FPSTATE);
+ else
+ this_cpu_write(in_kernel_fpu, false);

- this_cpu_write(in_kernel_fpu, false);
preempt_enable();
}
EXPORT_SYMBOL_GPL(kernel_fpu_end);
--
2.44.0