[patch 2.6.13-rc3] i386: semi-lazy i387 context switching

From: Chuck Ebbert
Date: Fri Jul 29 2005 - 15:47:17 EST



This patch is not for inclusion -- I just want to see if the idea
is sound. It's based on suggestions from <linux@xxxxxxxxxxx>

When saving FP context, the current CPU number is saved in the
task's thread structure, and a pointer to that structure is saved
in a per-cpu data area.

On loading an FP context, the per-cpu pointer is cleared. (But
the CPU number in the task is untouched.)

Upon task switch, the CPU number in the per-task area is compared to
the current CPU and the per-CPU pointer is checked. If everything
matches, loading of the FPU context will be skipped.

To prevent extra overhead when a task does short bursts of FP math
and then switches to integer, a normal FPU context load will be forced
after 100 skipped loads.

Problems:
- As posted, the code only works on machines with fxsr.
  GCC internal errors prevent the commented-out code
  from compiling; I guess a conditional jump is needed.

- May not be preempt-safe (but AFAICT it is).

Volanomark profile results are promising:

Before    After
  8304     8176   device_not_available
 11809    12334   math_state_restore
-----------------
 20114    20500

So it seems to be reducing the number of traps but each trap takes
a bit longer. This is a good result from a worst-case scenario.

The other worst-case test is for systems not using FP math at all.
This is untested, and best-case results are still pending as well.

Signed-off-by: Chuck Ebbert <76306.1226@xxxxxxxxxxxxxx>

Index: 2.6.13-rc3-mm3/arch/i386/kernel/i387.c
===================================================================
--- 2.6.13-rc3-mm3.orig/arch/i386/kernel/i387.c 2005-07-29 02:26:39.000000000 -0400
+++ 2.6.13-rc3-mm3/arch/i386/kernel/i387.c 2005-07-29 14:41:34.000000000 -0400
@@ -27,6 +27,8 @@

static unsigned long mxcsr_feature_mask = 0xffffffff;

+DEFINE_PER_CPU(struct thread_struct *, current_i387_thread);
+
void mxcsr_feature_mask_init(void)
{
unsigned long mask = 0;
Index: 2.6.13-rc3-mm3/arch/i386/kernel/process.c
===================================================================
--- 2.6.13-rc3-mm3.orig/arch/i386/kernel/process.c 2005-07-29 02:26:39.000000000 -0400
+++ 2.6.13-rc3-mm3/arch/i386/kernel/process.c 2005-07-29 14:41:34.000000000 -0400
@@ -475,6 +475,8 @@

p->thread.eip = (unsigned long) ret_from_fork;

+ p->thread.current_i387_cpu = -1;
+
savesegment(fs,p->thread.fs);
savesegment(gs,p->thread.gs);

@@ -679,8 +681,29 @@

/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */

- __unlazy_fpu(prev_p);
+ if (prev_p->thread_info->status & TS_USEDFPU) {
+ save_init_fpu(prev_p);
+ goto lazy_load;
+ }
+
+ /* This breaks GCC 3.3 and 4.0.1 (internal compiler error) */
+// alternative_input( /* do lazy restore if fxsr unsupported */
+// "jmp %1",
+// "",
+// X86_FEATURE_FXSR,
+// "a" (*&&lazy_load));
+
+ if (next->current_i387_cpu == smp_processor_id()
+ && next == per_cpu(current_i387_thread, smp_processor_id())) {
+
+ if (likely(++next->lazy_i387_switches < 100)) {
+ next_p->thread_info->status |= TS_USEDFPU;
+ clts();
+ } else
+ next->lazy_i387_switches = 0;
+ }

+lazy_load:
/*
* Reload esp0, LDT and the page table pointer:
*/
Index: 2.6.13-rc3-mm3/include/asm-i386/i387.h
===================================================================
--- 2.6.13-rc3-mm3.orig/include/asm-i386/i387.h 2005-07-29 14:32:03.000000000 -0400
+++ 2.6.13-rc3-mm3/include/asm-i386/i387.h 2005-07-29 14:41:34.000000000 -0400
@@ -17,6 +17,8 @@
#include <asm/sigcontext.h>
#include <asm/user.h>

+DECLARE_PER_CPU(struct thread_struct *, current_i387_thread);
+
extern void mxcsr_feature_mask_init(void);
extern void init_fpu(struct task_struct *);

@@ -24,16 +26,31 @@
* FPU lazy state save handling...
*/

-/*
- * The "nop" is needed to make the instructions the same
- * length.
- */
-#define restore_fpu(tsk) \
- alternative_input( \
- "nop ; frstor %1", \
- "fxrstor %1", \
- X86_FEATURE_FXSR, \
- "m" ((tsk)->thread.i387.fxsave))
+static inline void restore_fpu( struct task_struct *tsk )
+{
+ /*
+ * The "nop" is needed to make the instructions the same
+ * length.
+ */
+ alternative_input(
+ "frstor %1 ; nop",
+ "fxrstor %1",
+ X86_FEATURE_FXSR,
+ "m" (tsk->thread.i387.fxsave));
+
+ /* This breaks GCC 3.3 and 4.0.1 (internal compiler error) */
+// alternative_input( /* skip ahead if fxsr unsupported */
+// "jmp %1",
+// "",
+// X86_FEATURE_FXSR,
+// "a" (*&&no_fxsr));
+
+ /* ??? is preempt disabled when this is called? */
+ per_cpu(current_i387_thread, smp_processor_id()) = 0;
+no_fxsr:
+ __attribute__((unused))
+ return; /* required to avoid gcc error */
+}

extern void kernel_fpu_begin(void);
#define kernel_fpu_end() do { stts(); preempt_enable(); } while(0)
@@ -49,6 +66,18 @@
X86_FEATURE_FXSR,
"m" (tsk->thread.i387.fxsave)
:"memory");
+
+ /* This breaks GCC 3.3 and 4.0.1 (internal compiler error) */
+// alternative_input( /* skip ahead if fxsr unsupported */
+// "jmp %1",
+// "",
+// X86_FEATURE_FXSR,
+// "a" (*&&no_fxsr));
+
+ tsk->thread.current_i387_cpu = smp_processor_id();
+ per_cpu(current_i387_thread, smp_processor_id()) = &tsk->thread;
+no_fxsr:
+ __attribute__((unused))
tsk->thread_info->status &= ~TS_USEDFPU;
}

Index: 2.6.13-rc3-mm3/include/asm-i386/processor.h
===================================================================
--- 2.6.13-rc3-mm3.orig/include/asm-i386/processor.h 2005-07-13 16:20:26.000000000 -0400
+++ 2.6.13-rc3-mm3/include/asm-i386/processor.h 2005-07-29 14:41:34.000000000 -0400
@@ -447,6 +447,7 @@
unsigned long cr2, trap_no, error_code;
/* floating point info */
union i387_union i387;
+ int current_i387_cpu, lazy_i387_switches;
/* virtual 86 mode info */
struct vm86_struct __user * vm86_info;
unsigned long screen_bitmap;
__
Chuck
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/