[PATCHv1] x86: don't schedule when handling #NM exception

From: David Vrabel
Date: Mon Mar 10 2014 - 12:18:32 EST


math_state_restore() is called from the #NM exception handler. It may
do a GFP_KERNEL allocation (in init_fpu()) which may schedule.
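
Roughly, the problematic path in the unpatched kernel is:

    #NM exception
      -> do_device_not_available()
         -> math_state_restore()
            -> init_fpu()
               -> fpu_alloc()
                  -> kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL)

and kmem_cache_alloc() with GFP_KERNEL may enter direct reclaim and
sleep.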

Change this allocation to GFP_ATOMIC (which cannot sleep, but is more
likely to fail under memory pressure), but leave all the other callers
of init_fpu() or fpu_alloc() using GFP_KERNEL.

do_group_exit() will also call schedule() (via do_exit()), so replace
that call with force_sig(SIGKILL, tsk) instead.
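
To sketch the difference (annotations mine, not part of the patch):

    do_group_exit(SIGKILL);  /* -> do_exit() -> schedule(); may sleep */
    force_sig(SIGKILL, tsk); /* only queues the signal; it is delivered
                                when the task next returns to userspace,
                                so nothing schedules in the handler */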

Scheduling in math_state_restore() is particularly bad in Xen PV
guests since Xen clears CR0.TS before raising the #NM exception (in
the expectation that the #NM handler always clears TS). If task A is
descheduled and task B is scheduled, task B may end up with CR0.TS
unexpectedly clear; any FPU instructions it executes will then not
raise #NM and will corrupt task A's FPU state instead.
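
For illustration, one possible bad interleaving on Xen PV:

  1. Task A executes an FPU instruction while CR0.TS is set; Xen
     clears TS and raises #NM.
  2. The #NM handler sleeps in init_fpu()'s GFP_KERNEL allocation and
     task B is scheduled.
  3. Task B runs with CR0.TS still clear, so its FPU instructions do
     not raise #NM and silently operate on (and corrupt) task A's FPU
     state.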

Reported-by: Zhu Yanhai <zhu.yanhai@xxxxxxxxx>
Signed-off-by: David Vrabel <david.vrabel@xxxxxxxxxx>
---
 arch/x86/include/asm/fpu-internal.h |    4 ++--
 arch/x86/include/asm/i387.h         |    2 +-
 arch/x86/include/asm/xsave.h        |    2 +-
 arch/x86/kernel/i387.c              |   16 ++++++++--------
 arch/x86/kernel/process.c           |    2 +-
 arch/x86/kernel/traps.c             |    9 ++-------
 arch/x86/kernel/xsave.c             |    4 ++--
 arch/x86/kvm/x86.c                  |    2 +-
 8 files changed, 18 insertions(+), 23 deletions(-)

diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h
index cea1c76..e32a73c 100644
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -567,11 +567,11 @@ static bool fpu_allocated(struct fpu *fpu)
return fpu->state != NULL;
}

-static inline int fpu_alloc(struct fpu *fpu)
+static inline int fpu_alloc(struct fpu *fpu, gfp_t gfp)
{
if (fpu_allocated(fpu))
return 0;
- fpu->state = kmem_cache_alloc(task_xstate_cachep, GFP_KERNEL);
+ fpu->state = kmem_cache_alloc(task_xstate_cachep, gfp);
if (!fpu->state)
return -ENOMEM;
WARN_ON((unsigned long)fpu->state & 15);
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index ed8089d..5559c80 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -18,7 +18,7 @@
struct pt_regs;
struct user_i387_struct;

-extern int init_fpu(struct task_struct *child);
+extern int init_fpu(struct task_struct *child, gfp_t gfp);
extern void fpu_finit(struct fpu *fpu);
extern int dump_fpu(struct pt_regs *, struct user_i387_struct *);
extern void math_state_restore(void);
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 5547389..e6d6610 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -44,7 +44,7 @@ extern struct xsave_struct *init_xstate_buf;

extern void xsave_init(void);
extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask);
-extern int init_fpu(struct task_struct *child);
+extern int init_fpu(struct task_struct *child, gfp_t gfp);

static inline int fpu_xrstor_checking(struct xsave_struct *fx)
{
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index e8368c6..baf94ad 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -217,7 +217,7 @@ EXPORT_SYMBOL_GPL(fpu_finit);
* value at reset if we support XMM instructions and then
* remember the current task has used the FPU.
*/
-int init_fpu(struct task_struct *tsk)
+int init_fpu(struct task_struct *tsk, gfp_t gfp)
{
int ret;

@@ -231,7 +231,7 @@ int init_fpu(struct task_struct *tsk)
/*
* Memory allocation at the first usage of the FPU and other state.
*/
- ret = fpu_alloc(&tsk->thread.fpu);
+ ret = fpu_alloc(&tsk->thread.fpu, gfp);
if (ret)
return ret;

@@ -266,7 +266,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
if (!cpu_has_fxsr)
return -ENODEV;

- ret = init_fpu(target);
+ ret = init_fpu(target, GFP_KERNEL);
if (ret)
return ret;

@@ -285,7 +285,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
if (!cpu_has_fxsr)
return -ENODEV;

- ret = init_fpu(target);
+ ret = init_fpu(target, GFP_KERNEL);
if (ret)
return ret;

@@ -318,7 +318,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
if (!cpu_has_xsave)
return -ENODEV;

- ret = init_fpu(target);
+ ret = init_fpu(target, GFP_KERNEL);
if (ret)
return ret;

@@ -348,7 +348,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
if (!cpu_has_xsave)
return -ENODEV;

- ret = init_fpu(target);
+ ret = init_fpu(target, GFP_KERNEL);
if (ret)
return ret;

@@ -515,7 +515,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
struct user_i387_ia32_struct env;
int ret;

- ret = init_fpu(target);
+ ret = init_fpu(target, GFP_KERNEL);
if (ret)
return ret;

@@ -546,7 +546,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
struct user_i387_ia32_struct env;
int ret;

- ret = init_fpu(target);
+ ret = init_fpu(target, GFP_KERNEL);
if (ret)
return ret;

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 3fb8d95..10705a6 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -69,7 +69,7 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
*dst = *src;
if (fpu_allocated(&src->thread.fpu)) {
memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu));
- ret = fpu_alloc(&dst->thread.fpu);
+ ret = fpu_alloc(&dst->thread.fpu, GFP_KERNEL);
if (ret)
return ret;
fpu_copy(dst, src);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 57409f6..c8078d2 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -624,18 +624,13 @@ void math_state_restore(void)
struct task_struct *tsk = current;

if (!tsk_used_math(tsk)) {
- local_irq_enable();
- /*
- * does a slab alloc which can sleep
- */
- if (init_fpu(tsk)) {
+ if (init_fpu(tsk, GFP_ATOMIC)) {
/*
* ran out of memory!
*/
- do_group_exit(SIGKILL);
+ force_sig(SIGKILL, tsk);
return;
}
- local_irq_disable();
}

__thread_fpu_begin(tsk);
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index a4b451c..0512744 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -347,7 +347,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size)
if (!access_ok(VERIFY_READ, buf, size))
return -EACCES;

- if (!used_math() && init_fpu(tsk))
+ if (!used_math() && init_fpu(tsk, GFP_KERNEL))
return -1;

if (!static_cpu_has(X86_FEATURE_FPU))
@@ -628,7 +628,7 @@ void eager_fpu_init(void)
* This is same as math_state_restore(). But use_xsave() is
* not yet patched to use math_state_restore().
*/
- init_fpu(current);
+ init_fpu(current, GFP_KERNEL);
__thread_fpu_begin(current);
if (cpu_has_xsave)
xrstor_state(init_xstate_buf, -1);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2b85784..fc74b68 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6619,7 +6619,7 @@ int fx_init(struct kvm_vcpu *vcpu)
{
int err;

- err = fpu_alloc(&vcpu->arch.guest_fpu);
+ err = fpu_alloc(&vcpu->arch.guest_fpu, GFP_KERNEL);
if (err)
return err;

--
1.7.2.5
