Re: [PATCH v3 00/13] Virtually mapped stacks with guard pages (x86, core)

From: Linus Torvalds
Date: Fri Jun 24 2016 - 02:17:40 EST


On Thu, Jun 23, 2016 at 12:17 PM, Linus Torvalds
<torvalds@xxxxxxxxxxxxxxxxxxxx> wrote:
>
> With the goal being that I'm hoping that we can then actually get rid
> of this (at least on x86-64, even if we leave it in some other
> architectures) in 4.8.

The context here was that we could almost get rid of thread-info
entirely, at least for x86-64, by moving it into struct task_struct.

It turns out that we're not *that* far off after the obvious cleanups
I already committed, but I couldn't get things quite to work.

I'm attaching a patch that I wrote today that doesn't boot, but "looks
right". The reason I'm attaching it is because I'm hoping somebody
wants to take a look and maybe see what else I missed, but mostly
because I think the patch is interesting in a couple of cases where we
just do incredibly ugly things.

First off, some code that Andy wrote when he re-organized the entry path.

Oh Gods, Andy. That pt_regs_to_thread_info() thing made me want to do
unspeakable acts on a poor innocent wax figure that looked _exactly_
like you.

I just got rid of pt_regs_to_thread_info() entirely, and just replaced
it with current_thread_info(). I'm not at all convinced that trying
to be that clever was really a good idea.

Secondly, the x86-64 ret_from_fork calling convention was documented
wrongly. It says %rdi contains the previous task pointer. Yes it does,
but it doesn't mention that %r8 is supposed to contain the new
thread_info. That was fun to find.

And thirdly, the stack size games that asm/kprobes.h plays are just
disgusting. I stared at that code for much too long. I may in fact be
going blind as a result.

The rest was fairly straightforward, although since the end result
doesn't actually work, that "straightforward" may be broken too. But
the basic approach _looks_ sane.

Comments? Anybody want to play with this and see where I went wrong?

(Note - this patch was written on top of the two thread-info removal
patches I committed in

  da01e18a37a5 x86: avoid avoid passing around 'thread_info' in stack dumping code
  6720a305df74 locking: avoid passing around 'thread_info' in mutex debugging code

and depends on them, since "ti->task" no longer exists with
CONFIG_THREAD_INFO_IN_TASK. "ti" and "task" will have the same value).

Linus

This is a non-working attempt at moving the thread_info into the
task_struct

arch/x86/Kconfig | 1 +
arch/x86/entry/common.c | 21 +++++++--------------
arch/x86/entry/entry_64.S | 9 ++++++---
arch/x86/include/asm/kprobes.h | 12 ++++++------
arch/x86/include/asm/switch_to.h | 6 ++----
arch/x86/include/asm/thread_info.h | 38 ++++----------------------------------
arch/x86/kernel/dumpstack.c | 2 +-
arch/x86/kernel/irq_32.c | 2 --
arch/x86/kernel/irq_64.c | 3 +--
arch/x86/kernel/process.c | 6 ++----
arch/x86/um/ptrace_32.c | 8 ++++----
include/linux/init_task.h | 9 +++++++++
include/linux/sched.h | 14 +++++++++++++-
init/Kconfig | 3 +++
init/init_task.c | 7 +++++--
15 files changed, 64 insertions(+), 77 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d9a94da0c29f..f33bc80577c5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -154,6 +154,7 @@ config X86
select SPARSE_IRQ
select SRCU
select SYSCTL_EXCEPTION_TRACE
+ select THREAD_INFO_IN_TASK
select USER_STACKTRACE_SUPPORT
select VIRT_TO_BUS
select X86_DEV_DMA_OPS if X86_64
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index ec138e538c44..d5feac5f252d 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -31,13 +31,6 @@
#define CREATE_TRACE_POINTS
#include <trace/events/syscalls.h>

-static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs)
-{
- unsigned long top_of_stack =
- (unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING;
- return (struct thread_info *)(top_of_stack - THREAD_SIZE);
-}
-
#ifdef CONFIG_CONTEXT_TRACKING
/* Called on entry from user mode with IRQs off. */
__visible void enter_from_user_mode(void)
@@ -78,7 +71,7 @@ static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
*/
unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
{
- struct thread_info *ti = pt_regs_to_thread_info(regs);
+ struct thread_info *ti = current_thread_info();
unsigned long ret = 0;
u32 work;

@@ -156,7 +149,7 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
unsigned long phase1_result)
{
- struct thread_info *ti = pt_regs_to_thread_info(regs);
+ struct thread_info *ti = current_thread_info();
long ret = 0;
u32 work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY;

@@ -239,7 +232,7 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
/* Disable IRQs and retry */
local_irq_disable();

- cached_flags = READ_ONCE(pt_regs_to_thread_info(regs)->flags);
+ cached_flags = READ_ONCE(current_thread_info()->flags);

if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS))
break;
@@ -250,7 +243,7 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags)
/* Called with IRQs disabled. */
__visible inline void prepare_exit_to_usermode(struct pt_regs *regs)
{
- struct thread_info *ti = pt_regs_to_thread_info(regs);
+ struct thread_info *ti = current_thread_info();
u32 cached_flags;

if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled()))
@@ -309,7 +302,7 @@ static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags)
*/
__visible inline void syscall_return_slowpath(struct pt_regs *regs)
{
- struct thread_info *ti = pt_regs_to_thread_info(regs);
+ struct thread_info *ti = current_thread_info();
u32 cached_flags = READ_ONCE(ti->flags);

CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
@@ -332,7 +325,7 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs)
#ifdef CONFIG_X86_64
__visible void do_syscall_64(struct pt_regs *regs)
{
- struct thread_info *ti = pt_regs_to_thread_info(regs);
+ struct thread_info *ti = current_thread_info();
unsigned long nr = regs->orig_ax;

enter_from_user_mode();
@@ -365,7 +358,7 @@ __visible void do_syscall_64(struct pt_regs *regs)
*/
static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs)
{
- struct thread_info *ti = pt_regs_to_thread_info(regs);
+ struct thread_info *ti = current_thread_info();
unsigned int nr = (unsigned int)regs->orig_ax;

#ifdef CONFIG_IA32_EMULATION
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 9ee0da1807ed..f49742de2c65 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -179,7 +179,8 @@ GLOBAL(entry_SYSCALL_64_after_swapgs)
* If we need to do entry work or if we guess we'll need to do
* exit work, go straight to the slow path.
*/
- testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+ GET_THREAD_INFO(%r11)
+ testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, TI_flags(%r11)
jnz entry_SYSCALL64_slow_path

entry_SYSCALL_64_fastpath:
@@ -217,7 +218,8 @@ entry_SYSCALL_64_fastpath:
*/
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+ GET_THREAD_INFO(%r11)
+ testl $_TIF_ALLWORK_MASK, TI_flags(%r11)
jnz 1f

LOCKDEP_SYS_EXIT
@@ -368,9 +370,10 @@ END(ptregs_\func)
* A newly forked process directly context switches into this address.
*
* rdi: prev task we switched from
+ * rsi: task we're switching to
*/
ENTRY(ret_from_fork)
- LOCK ; btr $TIF_FORK, TI_flags(%r8)
+ LOCK ; btr $TIF_FORK, TI_flags(%rsi) /* rsi: this newly forked task */

call schedule_tail /* rdi: 'prev' task parameter */

diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 4421b5da409d..1d2997e74b08 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -38,12 +38,12 @@ typedef u8 kprobe_opcode_t;
#define RELATIVECALL_OPCODE 0xe8
#define RELATIVE_ADDR_SIZE 4
#define MAX_STACK_SIZE 64
-#define MIN_STACK_SIZE(ADDR) \
- (((MAX_STACK_SIZE) < (((unsigned long)current_thread_info()) + \
- THREAD_SIZE - (unsigned long)(ADDR))) \
- ? (MAX_STACK_SIZE) \
- : (((unsigned long)current_thread_info()) + \
- THREAD_SIZE - (unsigned long)(ADDR)))
+
+#define current_stack_top() ((unsigned long)task_stack_page(current)+THREAD_SIZE)
+#define current_stack_size(ADDR) (current_stack_top() - (unsigned long)(ADDR))
+
+#define MIN_STACK_SIZE(ADDR) \
+ (MAX_STACK_SIZE < current_stack_size(ADDR) ? MAX_STACK_SIZE : current_stack_size(ADDR))

#define flush_insn_slot(p) do { } while (0)

diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 8f321a1b03a1..ae0aa0612c67 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -110,18 +110,16 @@ do { \
"call __switch_to\n\t" \
"movq "__percpu_arg([current_task])",%%rsi\n\t" \
__switch_canary \
- "movq %P[thread_info](%%rsi),%%r8\n\t" \
"movq %%rax,%%rdi\n\t" \
- "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \
+ "testl %[_tif_fork],%P[ti_flags](%%rsi)\n\t" \
"jnz ret_from_fork\n\t" \
RESTORE_CONTEXT \
: "=a" (last) \
__switch_canary_oparam \
: [next] "S" (next), [prev] "D" (prev), \
[threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \
- [ti_flags] "i" (offsetof(struct thread_info, flags)), \
+ [ti_flags] "i" (offsetof(struct task_struct, thread_info.flags)), \
[_tif_fork] "i" (_TIF_FORK), \
- [thread_info] "i" (offsetof(struct task_struct, stack)), \
[current_task] "m" (current_task) \
__switch_canary_iparam \
: "memory", "cc" __EXTRA_CLOBBER)
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 30c133ac05cd..eef687fdc90d 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -53,24 +53,22 @@ struct task_struct;
#include <linux/atomic.h>

struct thread_info {
- struct task_struct *task; /* main task structure */
__u32 flags; /* low level flags */
__u32 status; /* thread synchronous flags */
__u32 cpu; /* current CPU */
- mm_segment_t addr_limit;
unsigned int sig_on_uaccess_error:1;
unsigned int uaccess_err:1; /* uaccess failed */
+ mm_segment_t addr_limit;
};

#define INIT_THREAD_INFO(tsk) \
{ \
- .task = &tsk, \
.flags = 0, \
.cpu = 0, \
.addr_limit = KERNEL_DS, \
}

-#define init_thread_info (init_thread_union.thread_info)
+#define init_thread_info (init_task.thread_info)
#define init_stack (init_thread_union.stack)

#else /* !__ASSEMBLY__ */
@@ -166,7 +164,7 @@ struct thread_info {

static inline struct thread_info *current_thread_info(void)
{
- return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE);
+ return (struct thread_info *)current;
}

static inline unsigned long current_stack_pointer(void)
@@ -188,35 +186,7 @@ static inline unsigned long current_stack_pointer(void)

/* Load thread_info address into "reg" */
#define GET_THREAD_INFO(reg) \
- _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \
- _ASM_SUB $(THREAD_SIZE),reg ;
-
-/*
- * ASM operand which evaluates to a 'thread_info' address of
- * the current task, if it is known that "reg" is exactly "off"
- * bytes below the top of the stack currently.
- *
- * ( The kernel stack's size is known at build time, it is usually
- * 2 or 4 pages, and the bottom of the kernel stack contains
- * the thread_info structure. So to access the thread_info very
- * quickly from assembly code we can calculate down from the
- * top of the kernel stack to the bottom, using constant,
- * build-time calculations only. )
- *
- * For example, to fetch the current thread_info->flags value into %eax
- * on x86-64 defconfig kernels, in syscall entry code where RSP is
- * currently at exactly SIZEOF_PTREGS bytes away from the top of the
- * stack:
- *
- * mov ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS), %eax
- *
- * will translate to:
- *
- * 8b 84 24 b8 c0 ff ff mov -0x3f48(%rsp), %eax
- *
- * which is below the current RSP by almost 16K.
- */
-#define ASM_THREAD_INFO(field, reg, off) ((field)+(off)-THREAD_SIZE)(reg)
+ _ASM_MOV PER_CPU_VAR(current_task),reg

#endif

diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index d6209f3a69cb..ef8017ca5ba9 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -80,7 +80,7 @@ print_ftrace_graph_addr(unsigned long addr, void *data,
static inline int valid_stack_ptr(struct task_struct *task,
void *p, unsigned int size, void *end)
{
- void *t = task_thread_info(task);
+ void *t = task_stack_page(task);
if (end) {
if (p < end && p >= (end-THREAD_SIZE))
return 1;
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 38da8f29a9c8..c627bf8d98ad 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -130,11 +130,9 @@ void irq_ctx_init(int cpu)

void do_softirq_own_stack(void)
{
- struct thread_info *curstk;
struct irq_stack *irqstk;
u32 *isp, *prev_esp;

- curstk = current_stack();
irqstk = __this_cpu_read(softirq_stack);

/* build the stack frame on the softirq stack */
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 206d0b90a3ab..38f9f5678dc8 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -41,8 +41,7 @@ static inline void stack_overflow_check(struct pt_regs *regs)
if (user_mode(regs))
return;

- if (regs->sp >= curbase + sizeof(struct thread_info) +
- sizeof(struct pt_regs) + STACK_TOP_MARGIN &&
+ if (regs->sp >= curbase + sizeof(struct pt_regs) + STACK_TOP_MARGIN &&
regs->sp <= curbase + THREAD_SIZE)
return;

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 96becbbb52e0..8f60f810a9e7 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -536,9 +536,7 @@ unsigned long get_wchan(struct task_struct *p)
* PADDING
* ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING
* stack
- * ----------- bottom = start + sizeof(thread_info)
- * thread_info
- * ----------- start
+ * ----------- bottom = start
*
* The tasks stack pointer points at the location where the
* framepointer is stored. The data on the stack is:
@@ -549,7 +547,7 @@ unsigned long get_wchan(struct task_struct *p)
*/
top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
top -= 2 * sizeof(unsigned long);
- bottom = start + sizeof(struct thread_info);
+ bottom = start;

sp = READ_ONCE(p->thread.sp);
if (sp < bottom || sp > top)
diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c
index ebd4dd6ef73b..14e8f6a628c2 100644
--- a/arch/x86/um/ptrace_32.c
+++ b/arch/x86/um/ptrace_32.c
@@ -191,7 +191,7 @@ int peek_user(struct task_struct *child, long addr, long data)

static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *child)
{
- int err, n, cpu = ((struct thread_info *) child->stack)->cpu;
+ int err, n, cpu = task_thread_info(child)->cpu;
struct user_i387_struct fpregs;

err = save_i387_registers(userspace_pid[cpu],
@@ -208,7 +208,7 @@ static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *c

static int set_fpregs(struct user_i387_struct __user *buf, struct task_struct *child)
{
- int n, cpu = ((struct thread_info *) child->stack)->cpu;
+ int n, cpu = task_thread_info(child)->cpu;
struct user_i387_struct fpregs;

n = copy_from_user(&fpregs, buf, sizeof(fpregs));
@@ -221,7 +221,7 @@ static int set_fpregs(struct user_i387_struct __user *buf, struct task_struct *c

static int get_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct *child)
{
- int err, n, cpu = ((struct thread_info *) child->stack)->cpu;
+ int err, n, cpu = task_thread_info(child)->cpu;
struct user_fxsr_struct fpregs;

err = save_fpx_registers(userspace_pid[cpu], (unsigned long *) &fpregs);
@@ -237,7 +237,7 @@ static int get_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct *

static int set_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct *child)
{
- int n, cpu = ((struct thread_info *) child->stack)->cpu;
+ int n, cpu = task_thread_info(child)->cpu;
struct user_fxsr_struct fpregs;

n = copy_from_user(&fpregs, buf, sizeof(fpregs));
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index f2cb8d45513d..a00f53b64c09 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -15,6 +15,8 @@
#include <net/net_namespace.h>
#include <linux/sched/rt.h>

+#include <asm/thread_info.h>
+
#ifdef CONFIG_SMP
# define INIT_PUSHABLE_TASKS(tsk) \
.pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO),
@@ -183,12 +185,19 @@ extern struct task_group root_task_group;
# define INIT_KASAN(tsk)
#endif

+#ifdef CONFIG_THREAD_INFO_IN_TASK
+# define INIT_TASK_TI(tsk) .thread_info = INIT_THREAD_INFO(tsk),
+#else
+# define INIT_TASK_TI(tsk)
+#endif
+
/*
* INIT_TASK is used to set up the first task table, touch at
* your own risk!. Base=0, limit=0x1fffff (=2MB)
*/
#define INIT_TASK(tsk) \
{ \
+ INIT_TASK_TI(tsk) \
.state = 0, \
.stack = &init_thread_info, \
.usage = ATOMIC_INIT(2), \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6e42ada26345..06236a36ba17 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1456,6 +1456,9 @@ struct tlbflush_unmap_batch {
};

struct task_struct {
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+ struct thread_info thread_info;
+#endif
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
void *stack;
atomic_t usage;
@@ -2539,7 +2542,9 @@ extern void set_curr_task(int cpu, struct task_struct *p);
void yield(void);

union thread_union {
+#ifndef CONFIG_THREAD_INFO_IN_TASK
struct thread_info thread_info;
+#endif
unsigned long stack[THREAD_SIZE/sizeof(long)];
};

@@ -2967,7 +2972,14 @@ static inline void threadgroup_change_end(struct task_struct *tsk)
cgroup_threadgroup_change_end(tsk);
}

-#ifndef __HAVE_THREAD_FUNCTIONS
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+
+#define task_thread_info(task) (&(task)->thread_info)
+#define task_stack_page(task) ((task)->stack)
+#define setup_thread_stack(new,old) do { } while(0)
+#define end_of_stack(task) ((unsigned long *)task_stack_page(task))
+
+#elif !defined(__HAVE_THREAD_FUNCTIONS)

#define task_thread_info(task) ((struct thread_info *)(task)->stack)
#define task_stack_page(task) ((task)->stack)
diff --git a/init/Kconfig b/init/Kconfig
index f755a602d4a1..0c83af6d3753 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -26,6 +26,9 @@ config IRQ_WORK
config BUILDTIME_EXTABLE_SORT
bool

+config THREAD_INFO_IN_TASK
+ bool
+
menu "General setup"

config BROKEN
diff --git a/init/init_task.c b/init/init_task.c
index ba0a7f362d9e..11f83be1fa79 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -22,5 +22,8 @@ EXPORT_SYMBOL(init_task);
* Initial thread structure. Alignment of this is handled by a special
* linker map entry.
*/
-union thread_union init_thread_union __init_task_data =
- { INIT_THREAD_INFO(init_task) };
+union thread_union init_thread_union __init_task_data = {
+#ifndef CONFIG_THREAD_INFO_IN_TASK
+ INIT_THREAD_INFO(init_task)
+#endif
+};