[PATCH] x86/fpu: move FPU state into separate cache

From: Kees Cook
Date: Wed Mar 29 2017 - 16:39:20 EST


This removes ARCH_WANTS_DYNAMIC_TASK_STRUCT from x86, leaving s390 as the
only architecture still selecting this option.

In order to support future structure layout randomization of task_struct,
no field may require a fixed position or have a dynamic size. To enable
randomization of task_struct on x86, the FPU state must therefore move
into its own dynamically sized cache and be reached through a pointer
from the task_struct.

This change is nearly identical to what was done in grsecurity to support
structure layout randomization. Hopefully I found all the needed changes.
This passes an allyesconfig build and boot tests.
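
The per-task register state then follows this lifetime (condensed from
the process.c hunks below; KVM's guest_fpu gets the same treatment in
x86.c):

	/* Boot: create the slab once fpu_kernel_xstate_size is known. */
	fpregs_state_cachep = kmem_cache_create("fpregs_state",
			fpu_kernel_xstate_size, ARCH_MIN_TASKALIGN,
			SLAB_PANIC | SLAB_NOTRACK, NULL);

	/* Fork: allocate a fresh fpregs_state for the child. */
	dst->thread.fpu.state = kmem_cache_alloc_node(fpregs_state_cachep,
			GFP_KERNEL, tsk_fork_get_node(src));

	/* Release: free the state along with the task_struct. */
	kmem_cache_free(fpregs_state_cachep, tsk->thread.fpu.state);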

Signed-off-by: Kees Cook <keescook@xxxxxxxxxxxx>
---
 arch/x86/Kconfig                    |  1 -
 arch/x86/include/asm/fpu/internal.h | 16 ++++++------
 arch/x86/include/asm/fpu/types.h    |  6 +----
 arch/x86/include/asm/processor.h    | 10 +++-----
 arch/x86/include/asm/trace/fpu.h    |  4 +--
 arch/x86/kernel/fpu/core.c          | 31 +++++++++++------------
 arch/x86/kernel/fpu/init.c          | 50 ++-----------------------------------
 arch/x86/kernel/fpu/regset.c        | 24 +++++++++---------
 arch/x86/kernel/fpu/signal.c        | 12 ++++-----
 arch/x86/kernel/fpu/xstate.c        |  6 ++---
 arch/x86/kernel/process.c           | 24 ++++++++++++++++--
 arch/x86/kvm/x86.c                  | 36 ++++++++++++++++----------
 include/linux/sched.h               |  5 ++--
 13 files changed, 100 insertions(+), 125 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 9a5af1e1cd61..13b54a5ddfde 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -71,7 +71,6 @@ config X86
select ARCH_USE_QUEUED_SPINLOCKS
select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP
select ARCH_WANT_FRAME_POINTERS
- select ARCH_WANTS_DYNAMIC_TASK_STRUCT
select BUILDTIME_EXTABLE_SORT
select CLKEVT_I8253
select CLOCKSOURCE_VALIDATE_LAST_CYCLE
diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 255645f60ca2..f564c29d5194 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -196,9 +196,9 @@ static inline int copy_user_to_fregs(struct fregs_state __user *fx)
static inline void copy_fxregs_to_kernel(struct fpu *fpu)
{
if (IS_ENABLED(CONFIG_X86_32))
- asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state.fxsave));
+ asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state->fxsave));
else if (IS_ENABLED(CONFIG_AS_FXSAVEQ))
- asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state.fxsave));
+ asm volatile("fxsaveq %[fx]" : [fx] "=m" (fpu->state->fxsave));
else {
/* Using "rex64; fxsave %0" is broken because, if the memory
* operand uses any extended registers for addressing, a second
@@ -215,15 +215,15 @@ static inline void copy_fxregs_to_kernel(struct fpu *fpu)
* an extended register is needed for addressing (fix submitted
* to mainline 2005-11-21).
*
- * asm volatile("rex64/fxsave %0" : "=m" (fpu->state.fxsave));
+ * asm volatile("rex64/fxsave %0" : "=m" (fpu->state->fxsave));
*
* This, however, we can work around by forcing the compiler to
* select an addressing mode that doesn't require extended
* registers.
*/
asm volatile( "rex64/fxsave (%[fx])"
- : "=m" (fpu->state.fxsave)
- : [fx] "R" (&fpu->state.fxsave));
+ : "=m" (fpu->state->fxsave)
+ : [fx] "R" (&fpu->state->fxsave));
}
}

@@ -432,7 +432,7 @@ static inline int copy_user_to_xregs(struct xregs_state __user *buf, u64 mask)
static inline int copy_fpregs_to_fpstate(struct fpu *fpu)
{
if (likely(use_xsave())) {
- copy_xregs_to_kernel(&fpu->state.xsave);
+ copy_xregs_to_kernel(&fpu->state->xsave);
return 1;
}

@@ -445,7 +445,7 @@ static inline int copy_fpregs_to_fpstate(struct fpu *fpu)
* Legacy FPU register saving, FNSAVE always clears FPU registers,
* so we have to mark them inactive:
*/
- asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave));
+ asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state->fsave));

return 0;
}
@@ -599,7 +599,7 @@ static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu)

if (preload) {
if (!fpregs_state_valid(new_fpu, cpu))
- copy_kernel_to_fpregs(&new_fpu->state);
+ copy_kernel_to_fpregs(new_fpu->state);
fpregs_activate(new_fpu);
}
}
diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index 3c80f5b9c09d..c828fefc2133 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -330,11 +330,7 @@ struct fpu {
* copy. If the task context-switches away then they get
* saved here and represent the FPU state.
*/
- union fpregs_state state;
- /*
- * WARNING: 'state' is dynamically-sized. Do not put
- * anything after it here.
- */
+ union fpregs_state *state;
};

#endif /* _ASM_X86_FPU_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index e2335edb9fc5..fcf76cb0ae1c 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -440,6 +440,8 @@ struct thread_struct {
unsigned long gs;
#endif

+ /* Floating point and extended processor state */
+ struct fpu fpu;
/* Save middle states of ptrace breakpoints */
struct perf_event *ptrace_bps[HBP_NUM];
/* Debug status used for traps, single steps, etc... */
@@ -464,13 +466,6 @@ struct thread_struct {

unsigned int sig_on_uaccess_err:1;
unsigned int uaccess_err:1; /* uaccess failed */
-
- /* Floating point and extended processor state */
- struct fpu fpu;
- /*
- * WARNING: 'fpu' is dynamically-sized. It *MUST* be at
- * the end.
- */
};

/*
@@ -803,6 +798,7 @@ static inline void spin_lock_prefetch(const void *x)
.sysenter_cs = __KERNEL_CS, \
.io_bitmap_ptr = NULL, \
.addr_limit = KERNEL_DS, \
+ .fpu.state = &init_fpregs_state, \
}

/*
diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h
index 342e59789fcd..4c07f7b49773 100644
--- a/arch/x86/include/asm/trace/fpu.h
+++ b/arch/x86/include/asm/trace/fpu.h
@@ -23,8 +23,8 @@ DECLARE_EVENT_CLASS(x86_fpu,
__entry->fpregs_active = fpu->fpregs_active;
__entry->fpstate_active = fpu->fpstate_active;
if (boot_cpu_has(X86_FEATURE_OSXSAVE)) {
- __entry->xfeatures = fpu->state.xsave.header.xfeatures;
- __entry->xcomp_bv = fpu->state.xsave.header.xcomp_bv;
+ __entry->xfeatures = fpu->state->xsave.header.xfeatures;
+ __entry->xcomp_bv = fpu->state->xsave.header.xcomp_bv;
}
),
TP_printk("x86/fpu: %p fpregs_active: %d fpstate_active: %d xfeatures: %llx xcomp_bv: %llx",
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index e1114f070c2d..f935effa0b69 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -117,7 +117,7 @@ void __kernel_fpu_end(void)
struct fpu *fpu = &current->thread.fpu;

if (fpu->fpregs_active)
- copy_kernel_to_fpregs(&fpu->state);
+ copy_kernel_to_fpregs(fpu->state);

kernel_fpu_enable();
}
@@ -150,7 +150,7 @@ void fpu__save(struct fpu *fpu)
trace_x86_fpu_before_save(fpu);
if (fpu->fpregs_active) {
if (!copy_fpregs_to_fpstate(fpu)) {
- copy_kernel_to_fpregs(&fpu->state);
+ copy_kernel_to_fpregs(fpu->state);
}
}
trace_x86_fpu_after_save(fpu);
@@ -201,7 +201,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
* Don't let 'init optimized' areas of the XSAVE area
* leak into the child task:
*/
- memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size);
+ memset(&dst_fpu->state->xsave, 0, fpu_kernel_xstate_size);

/*
* Save current FPU registers directly into the child
@@ -220,10 +220,9 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu)
*/
preempt_disable();
if (!copy_fpregs_to_fpstate(dst_fpu)) {
- memcpy(&src_fpu->state, &dst_fpu->state,
- fpu_kernel_xstate_size);
+ memcpy(src_fpu->state, dst_fpu->state, fpu_kernel_xstate_size);

- copy_kernel_to_fpregs(&src_fpu->state);
+ copy_kernel_to_fpregs(src_fpu->state);
}
preempt_enable();

@@ -242,7 +241,7 @@ void fpu__activate_curr(struct fpu *fpu)
WARN_ON_FPU(fpu != &current->thread.fpu);

if (!fpu->fpstate_active) {
- fpstate_init(&fpu->state);
+ fpstate_init(fpu->state);
trace_x86_fpu_init_state(fpu);

trace_x86_fpu_activate_state(fpu);
@@ -270,7 +269,7 @@ void fpu__activate_fpstate_read(struct fpu *fpu)
fpu__save(fpu);
} else {
if (!fpu->fpstate_active) {
- fpstate_init(&fpu->state);
+ fpstate_init(fpu->state);
trace_x86_fpu_init_state(fpu);

trace_x86_fpu_activate_state(fpu);
@@ -305,7 +304,7 @@ void fpu__activate_fpstate_write(struct fpu *fpu)
/* Invalidate any lazy state: */
__fpu_invalidate_fpregs_state(fpu);
} else {
- fpstate_init(&fpu->state);
+ fpstate_init(fpu->state);
trace_x86_fpu_init_state(fpu);

trace_x86_fpu_activate_state(fpu);
@@ -368,7 +367,7 @@ void fpu__current_fpstate_write_end(void)
* an XRSTOR if they are active.
*/
if (fpregs_active())
- copy_kernel_to_fpregs(&fpu->state);
+ copy_kernel_to_fpregs(fpu->state);

/*
* Our update is done and the fpregs/fpstate are in sync
@@ -395,7 +394,7 @@ void fpu__restore(struct fpu *fpu)
kernel_fpu_disable();
trace_x86_fpu_before_restore(fpu);
fpregs_activate(fpu);
- copy_kernel_to_fpregs(&fpu->state);
+ copy_kernel_to_fpregs(fpu->state);
trace_x86_fpu_after_restore(fpu);
kernel_fpu_enable();
}
@@ -489,11 +488,11 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr)
* fully reproduce the context of the exception.
*/
if (boot_cpu_has(X86_FEATURE_FXSR)) {
- cwd = fpu->state.fxsave.cwd;
- swd = fpu->state.fxsave.swd;
+ cwd = fpu->state->fxsave.cwd;
+ swd = fpu->state->fxsave.swd;
} else {
- cwd = (unsigned short)fpu->state.fsave.cwd;
- swd = (unsigned short)fpu->state.fsave.swd;
+ cwd = (unsigned short)fpu->state->fsave.cwd;
+ swd = (unsigned short)fpu->state->fsave.swd;
}

err = swd & ~cwd;
@@ -507,7 +506,7 @@ int fpu__exception_code(struct fpu *fpu, int trap_nr)
unsigned short mxcsr = MXCSR_DEFAULT;

if (boot_cpu_has(X86_FEATURE_XMM))
- mxcsr = fpu->state.fxsave.mxcsr;
+ mxcsr = fpu->state->fxsave.mxcsr;

err = ~(mxcsr >> 7) & mxcsr;
}
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index c2f8dde3255c..74a0fb816351 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -34,7 +34,7 @@ static void fpu__init_cpu_generic(void)
/* Flush out any pending x87 state: */
#ifdef CONFIG_MATH_EMULATION
if (!boot_cpu_has(X86_FEATURE_FPU))
- fpstate_init_soft(&current->thread.fpu.state.soft);
+ fpstate_init_soft(&current->thread.fpu.state->soft);
else
#endif
asm volatile ("fninit");
@@ -137,51 +137,7 @@ static void __init fpu__init_system_generic(void)
unsigned int fpu_kernel_xstate_size;
EXPORT_SYMBOL_GPL(fpu_kernel_xstate_size);

-/* Get alignment of the TYPE. */
-#define TYPE_ALIGN(TYPE) offsetof(struct { char x; TYPE test; }, test)
-
-/*
- * Enforce that 'MEMBER' is the last field of 'TYPE'.
- *
- * Align the computed size with alignment of the TYPE,
- * because that's how C aligns structs.
- */
-#define CHECK_MEMBER_AT_END_OF(TYPE, MEMBER) \
- BUILD_BUG_ON(sizeof(TYPE) != ALIGN(offsetofend(TYPE, MEMBER), \
- TYPE_ALIGN(TYPE)))
-
-/*
- * We append the 'struct fpu' to the task_struct:
- */
-static void __init fpu__init_task_struct_size(void)
-{
- int task_size = sizeof(struct task_struct);
-
- /*
- * Subtract off the static size of the register state.
- * It potentially has a bunch of padding.
- */
- task_size -= sizeof(((struct task_struct *)0)->thread.fpu.state);
-
- /*
- * Add back the dynamically-calculated register state
- * size.
- */
- task_size += fpu_kernel_xstate_size;
-
- /*
- * We dynamically size 'struct fpu', so we require that
- * it be at the end of 'thread_struct' and that
- * 'thread_struct' be at the end of 'task_struct'. If
- * you hit a compile error here, check the structure to
- * see if something got added to the end.
- */
- CHECK_MEMBER_AT_END_OF(struct fpu, state);
- CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu);
- CHECK_MEMBER_AT_END_OF(struct task_struct, thread);
-
- arch_task_struct_size = task_size;
-}
+union fpregs_state init_fpregs_state;

/*
* Set up the user and kernel xstate sizes based on the legacy FPU context size.
@@ -285,7 +241,5 @@ void __init fpu__init_system(struct cpuinfo_x86 *c)
fpu__init_system_generic();
fpu__init_system_xstate_size_legacy();
fpu__init_system_xstate();
- fpu__init_task_struct_size();
-
fpu__init_system_ctx_switch();
}
diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c
index b188b16841e3..c75bed318070 100644
--- a/arch/x86/kernel/fpu/regset.c
+++ b/arch/x86/kernel/fpu/regset.c
@@ -42,7 +42,7 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
fpstate_sanitize_xstate(fpu);

return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- &fpu->state.fxsave, 0, -1);
+ &fpu->state->fxsave, 0, -1);
}

int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
@@ -59,19 +59,19 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
fpstate_sanitize_xstate(fpu);

ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
- &fpu->state.fxsave, 0, -1);
+ &fpu->state->fxsave, 0, -1);

/*
* mxcsr reserved bits must be masked to zero for security reasons.
*/
- fpu->state.fxsave.mxcsr &= mxcsr_feature_mask;
+ fpu->state->fxsave.mxcsr &= mxcsr_feature_mask;

/*
* update the header bits in the xsave header, indicating the
* presence of FP and SSE state.
*/
if (boot_cpu_has(X86_FEATURE_XSAVE))
- fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FPSSE;
+ fpu->state->xsave.header.xfeatures |= XFEATURE_MASK_FPSSE;

return ret;
}
@@ -87,7 +87,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
if (!boot_cpu_has(X86_FEATURE_XSAVE))
return -ENODEV;

- xsave = &fpu->state.xsave;
+ xsave = &fpu->state->xsave;

fpu__activate_fpstate_read(fpu);

@@ -127,7 +127,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
if ((pos != 0) || (count < fpu_user_xstate_size))
return -EFAULT;

- xsave = &fpu->state.xsave;
+ xsave = &fpu->state->xsave;

fpu__activate_fpstate_write(fpu);

@@ -140,7 +140,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
* In case of failure, mark all states as init:
*/
if (ret)
- fpstate_init(&fpu->state);
+ fpstate_init(fpu->state);

/*
* mxcsr reserved bits must be masked to zero for security reasons.
@@ -230,7 +230,7 @@ static inline u32 twd_fxsr_to_i387(struct fxregs_state *fxsave)
void
convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
{
- struct fxregs_state *fxsave = &tsk->thread.fpu.state.fxsave;
+ struct fxregs_state *fxsave = &tsk->thread.fpu.state->fxsave;
struct _fpreg *to = (struct _fpreg *) &env->st_space[0];
struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0];
int i;
@@ -268,7 +268,7 @@ void convert_to_fxsr(struct task_struct *tsk,
const struct user_i387_ia32_struct *env)

{
- struct fxregs_state *fxsave = &tsk->thread.fpu.state.fxsave;
+ struct fxregs_state *fxsave = &tsk->thread.fpu.state->fxsave;
struct _fpreg *from = (struct _fpreg *) &env->st_space[0];
struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0];
int i;
@@ -306,7 +306,7 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,

if (!boot_cpu_has(X86_FEATURE_FXSR))
return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- &fpu->state.fsave, 0,
+ &fpu->state->fsave, 0,
-1);

fpstate_sanitize_xstate(fpu);
@@ -337,7 +337,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,

if (!boot_cpu_has(X86_FEATURE_FXSR))
return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
- &fpu->state.fsave, 0,
+ &fpu->state->fsave, 0,
-1);

if (pos > 0 || count < sizeof(env))
@@ -352,7 +352,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
* presence of FP.
*/
if (boot_cpu_has(X86_FEATURE_XSAVE))
- fpu->state.xsave.header.xfeatures |= XFEATURE_MASK_FP;
+ fpu->state->xsave.header.xfeatures |= XFEATURE_MASK_FP;
return ret;
}

diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 83c23c230b4c..d943bfa48e83 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -56,7 +56,7 @@ static inline int check_for_xstate(struct fxregs_state __user *buf,
static inline int save_fsave_header(struct task_struct *tsk, void __user *buf)
{
if (use_fxsr()) {
- struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
+ struct xregs_state *xsave = &tsk->thread.fpu.state->xsave;
struct user_i387_ia32_struct env;
struct _fpstate_32 __user *fp = buf;

@@ -155,7 +155,7 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
*/
int copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
{
- struct xregs_state *xsave = &current->thread.fpu.state.xsave;
+ struct xregs_state *xsave = &current->thread.fpu.state->xsave;
struct task_struct *tsk = current;
int ia32_fxstate = (buf != buf_fx);

@@ -209,7 +209,7 @@ sanitize_restored_xstate(struct task_struct *tsk,
struct user_i387_ia32_struct *ia32_env,
u64 xfeatures, int fx_only)
{
- struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
+ struct xregs_state *xsave = &tsk->thread.fpu.state->xsave;
struct xstate_header *header = &xsave->header;

if (use_xsave()) {
@@ -325,14 +325,14 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size)

if (using_compacted_format()) {
err = copyin_to_xsaves(NULL, buf_fx,
- &fpu->state.xsave);
+ &fpu->state->xsave);
} else {
- err = __copy_from_user(&fpu->state.xsave,
+ err = __copy_from_user(&fpu->state->xsave,
buf_fx, state_size);
}

if (err || __copy_from_user(&env, buf, sizeof(env))) {
- fpstate_init(&fpu->state);
+ fpstate_init(fpu->state);
trace_x86_fpu_init_state(fpu);
err = -1;
} else {
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index c24ac1efb12d..2ba5e6f18775 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -157,14 +157,14 @@ static int xfeature_is_user(int xfeature_nr)
*/
void fpstate_sanitize_xstate(struct fpu *fpu)
{
- struct fxregs_state *fx = &fpu->state.fxsave;
+ struct fxregs_state *fx = &fpu->state->fxsave;
int feature_bit;
u64 xfeatures;

if (!use_xsaveopt())
return;

- xfeatures = fpu->state.xsave.header.xfeatures;
+ xfeatures = fpu->state->xsave.header.xfeatures;

/*
* None of the feature bits are in init state. So nothing else
@@ -875,7 +875,7 @@ const void *get_xsave_field_ptr(int xsave_state)
*/
fpu__save(fpu);

- return get_xsave_addr(&fpu->state.xsave, xsave_state);
+ return get_xsave_addr(&fpu->state->xsave, xsave_state);
}

#ifdef CONFIG_ARCH_HAS_PKEYS
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 0bb88428cbf2..60129943a064 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -20,8 +20,8 @@
#include <linux/dmi.h>
#include <linux/utsname.h>
#include <linux/stackprotector.h>
-#include <linux/tick.h>
#include <linux/cpuidle.h>
+#include <linux/kthread.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
#include <asm/cpu.h>
@@ -73,20 +73,40 @@ EXPORT_PER_CPU_SYMBOL(cpu_tss);
DEFINE_PER_CPU(bool, __tss_limit_invalid);
EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);

+struct kmem_cache *fpregs_state_cachep;
+EXPORT_SYMBOL(fpregs_state_cachep);
+
+void __init arch_task_cache_init(void)
+{
+ /* create a slab on which fpregs_states can be allocated */
+ fpregs_state_cachep = kmem_cache_create("fpregs_state",
+ fpu_kernel_xstate_size,
+ ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK,
+ NULL);
+}
+
/*
* this gets called so that we can store lazy state into memory and copy the
* current task into the new thread.
*/
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
- memcpy(dst, src, arch_task_struct_size);
+ *dst = *src;
#ifdef CONFIG_VM86
dst->thread.vm86 = NULL;
#endif
+ dst->thread.fpu.state = kmem_cache_alloc_node(fpregs_state_cachep,
+ GFP_KERNEL, tsk_fork_get_node(src));

return fpu__copy(&dst->thread.fpu, &src->thread.fpu);
}

+void arch_release_task_struct(struct task_struct *tsk)
+{
+ kmem_cache_free(fpregs_state_cachep, tsk->thread.fpu.state);
+ tsk->thread.fpu.state = NULL;
+}
+
/*
* Free current thread data structures etc..
*/
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ee22226e3807..17d2cbc838d6 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -3213,7 +3213,7 @@ static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,

static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)
{
- struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave;
+ struct xregs_state *xsave = &vcpu->arch.guest_fpu.state->xsave;
u64 xstate_bv = xsave->header.xfeatures;
u64 valid;

@@ -3250,7 +3250,7 @@ static void fill_xsave(u8 *dest, struct kvm_vcpu *vcpu)

static void load_xsave(struct kvm_vcpu *vcpu, u8 *src)
{
- struct xregs_state *xsave = &vcpu->arch.guest_fpu.state.xsave;
+ struct xregs_state *xsave = &vcpu->arch.guest_fpu.state->xsave;
u64 xstate_bv = *(u64 *)(src + XSAVE_HDR_OFFSET);
u64 valid;

@@ -3294,7 +3294,7 @@ static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
fill_xsave((u8 *) guest_xsave->region, vcpu);
} else {
memcpy(guest_xsave->region,
- &vcpu->arch.guest_fpu.state.fxsave,
+ &vcpu->arch.guest_fpu.state->fxsave,
sizeof(struct fxregs_state));
*(u64 *)&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
XFEATURE_MASK_FPSSE;
@@ -3319,7 +3319,7 @@ static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
} else {
if (xstate_bv & ~XFEATURE_MASK_FPSSE)
return -EINVAL;
- memcpy(&vcpu->arch.guest_fpu.state.fxsave,
+ memcpy(&vcpu->arch.guest_fpu.state->fxsave,
guest_xsave->region, sizeof(struct fxregs_state));
}
return 0;
@@ -7545,7 +7545,7 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
struct fxregs_state *fxsave =
- &vcpu->arch.guest_fpu.state.fxsave;
+ &vcpu->arch.guest_fpu.state->fxsave;

memcpy(fpu->fpr, fxsave->st_space, 128);
fpu->fcw = fxsave->cwd;
@@ -7562,7 +7562,7 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)
{
struct fxregs_state *fxsave =
- &vcpu->arch.guest_fpu.state.fxsave;
+ &vcpu->arch.guest_fpu.state->fxsave;

memcpy(fxsave->st_space, fpu->fpr, 128);
fxsave->cwd = fpu->fcw;
@@ -7578,9 +7578,9 @@ int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu)

static void fx_init(struct kvm_vcpu *vcpu)
{
- fpstate_init(&vcpu->arch.guest_fpu.state);
+ fpstate_init(vcpu->arch.guest_fpu.state);
if (boot_cpu_has(X86_FEATURE_XSAVES))
- vcpu->arch.guest_fpu.state.xsave.header.xcomp_bv =
+ vcpu->arch.guest_fpu.state->xsave.header.xcomp_bv =
host_xcr0 | XSTATE_COMPACTION_ENABLED;

/*
@@ -7603,7 +7603,7 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
*/
vcpu->guest_fpu_loaded = 1;
__kernel_fpu_begin();
- __copy_kernel_to_fpregs(&vcpu->arch.guest_fpu.state);
+ __copy_kernel_to_fpregs(vcpu->arch.guest_fpu.state);
trace_kvm_fpu(1);
}

@@ -7891,6 +7891,8 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
struct static_key kvm_no_apic_vcpu __read_mostly;
EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);

+extern struct kmem_cache *fpregs_state_cachep;
+
int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
{
struct page *page;
@@ -7908,11 +7910,15 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
else
vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;

- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
- if (!page) {
- r = -ENOMEM;
+ r = -ENOMEM;
+ vcpu->arch.guest_fpu.state = kmem_cache_alloc(fpregs_state_cachep,
+ GFP_KERNEL);
+ if (!vcpu->arch.guest_fpu.state)
goto fail;
- }
+
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page)
+ goto fail_free_fpregs;
vcpu->arch.pio_data = page_address(page);

kvm_set_tsc_khz(vcpu, max_tsc_khz);
@@ -7970,6 +7976,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
kvm_mmu_destroy(vcpu);
fail_free_pio_data:
free_page((unsigned long)vcpu->arch.pio_data);
+fail_free_fpregs:
+ kmem_cache_free(fpregs_state_cachep, vcpu->arch.guest_fpu.state);
fail:
return r;
}
@@ -7988,6 +7996,8 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
free_page((unsigned long)vcpu->arch.pio_data);
if (!lapic_in_kernel(vcpu))
static_key_slow_dec(&kvm_no_apic_vcpu);
+ kmem_cache_free(fpregs_state_cachep, vcpu->arch.guest_fpu.state);
+ vcpu->arch.guest_fpu.state = NULL;
}

void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d31a8095237b..a7b239a87160 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1053,8 +1053,9 @@ struct task_struct {
struct thread_struct thread;

/*
- * WARNING: on x86, 'thread_struct' contains a variable-sized
- * structure. It *MUST* be at the end of 'task_struct'.
+ * WARNING: Under CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT,
+ * 'thread_struct' contains a variable-sized structure.
+ * It *MUST* be at the end of 'task_struct'.
*
* Do not put anything below here!
*/
--
2.7.4


--
Kees Cook
Pixel Security