[PATCH v2 36/44] x86/entry/unwind: encode pt_regs pointer in frame pointer

From: Josh Poimboeuf
Date: Thu Aug 04 2016 - 18:26:34 EST


With frame pointers, when a task is interrupted, its stack is no longer
completely reliable because the function could have been interrupted
before it had a chance to save the previous frame pointer on the stack.
So the caller of the interrupted function could get skipped by a stack
trace.

This is problematic for live patching, which needs to know whether a
stack trace of a sleeping task can be relied upon. There's currently no
way to detect if a sleeping task was interrupted by a page fault
exception or preemption before it went to sleep.

Another issue is that when dumping the stack of an interrupted task, the
unwinder has no way of knowing where the saved pt_regs registers are, so
it can't print them.

This solves those issues by encoding the pt_regs pointer in the frame
pointer on entry from an interrupt or an exception. The frame pointer
unwinder is also updated to decode it.

Suggested-by: Andy Lutomirski <luto@xxxxxxxxxxxxxx>
Signed-off-by: Josh Poimboeuf <jpoimboe@xxxxxxxxxx>
---
arch/x86/entry/calling.h | 21 +++++++++++
arch/x86/entry/entry_64.S | 10 ++++--
arch/x86/include/asm/unwind.h | 11 ++++++
arch/x86/kernel/unwind_frame.c | 80 ++++++++++++++++++++++++++++++++++++++++++
4 files changed, 119 insertions(+), 3 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 9a9e588..ff5a5a3 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -201,6 +201,27 @@ For 32-bit we have the following conventions - kernel is built with
.byte 0xf1
.endm

+ /*
+ * This is a sneaky trick to help the unwinder find pt_regs on the
+ * stack. The frame pointer is replaced with an encoded pointer to
+ * pt_regs. The encoding is just a clearing of the highest-order bit,
+ * which makes it an invalid address and is also a signal to the
+ * unwinder that it's a pt_regs pointer in disguise.
+ *
+ * NOTE: This must be called *after* SAVE_EXTRA_REGS because it
+ * corrupts rbp.
+ */
+.macro ENCODE_FRAME_POINTER ptregs_offset=0
+#ifdef CONFIG_FRAME_POINTER
+ .if \ptregs_offset
+ leaq \ptregs_offset(%rsp), %rbp
+ .else
+ mov %rsp, %rbp
+ .endif
+ btr $63, %rbp
+#endif
+.endm
+
#endif /* CONFIG_X86_64 */

/*
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 8956eae..0ee48c6 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -430,6 +430,7 @@ END(irq_entries_start)
ALLOC_PT_GPREGS_ON_STACK
SAVE_C_REGS
SAVE_EXTRA_REGS
+ ENCODE_FRAME_POINTER

testb $3, CS(%rsp)
jz 1f
@@ -892,6 +893,7 @@ ENTRY(xen_failsafe_callback)
ALLOC_PT_GPREGS_ON_STACK
SAVE_C_REGS
SAVE_EXTRA_REGS
+ ENCODE_FRAME_POINTER
jmp error_exit
END(xen_failsafe_callback)

@@ -935,6 +937,7 @@ ENTRY(paranoid_entry)
cld
SAVE_C_REGS 8
SAVE_EXTRA_REGS 8
+ ENCODE_FRAME_POINTER 8
movl $1, %ebx
movl $MSR_GS_BASE, %ecx
rdmsr
@@ -982,6 +985,7 @@ ENTRY(error_entry)
cld
SAVE_C_REGS 8
SAVE_EXTRA_REGS 8
+ ENCODE_FRAME_POINTER 8
xorl %ebx, %ebx
testb $3, CS+8(%rsp)
jz .Lerror_kernelspace
@@ -1164,6 +1168,7 @@ ENTRY(nmi)
pushq %r13 /* pt_regs->r13 */
pushq %r14 /* pt_regs->r14 */
pushq %r15 /* pt_regs->r15 */
+ ENCODE_FRAME_POINTER

/*
* At this point we no longer need to worry about stack damage
@@ -1177,11 +1182,10 @@ ENTRY(nmi)

/*
* Return back to user mode. We must *not* do the normal exit
- * work, because we don't want to enable interrupts. Fortunately,
- * do_nmi doesn't modify pt_regs.
+ * work, because we don't want to enable interrupts.
*/
SWAPGS
- jmp restore_c_regs_and_iret
+ jmp restore_regs_and_iret

.Lnmi_from_kernel:
/*
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
index c4fdd58..5ba7f3c 100644
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -13,6 +13,7 @@ struct unwind_state {
int graph_idx;
#ifdef CONFIG_FRAME_POINTER
unsigned long *bp;
+ struct pt_regs *regs;
#else
unsigned long *sp;
#endif
@@ -43,6 +44,11 @@ static inline unsigned long *unwind_get_stack_ptr(struct unwind_state *state)
return state->bp;
}

+static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
+{
+ return state->regs;
+}
+
unsigned long unwind_get_return_address(struct unwind_state *state);

#else /* !CONFIG_FRAME_POINTER */
@@ -61,6 +67,11 @@ static inline unsigned long *unwind_get_stack_ptr(struct unwind_state *state)
return state->sp;
}

+static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
+{
+ return NULL;
+}
+
static inline
unsigned long unwind_get_return_address(struct unwind_state *state)
{
diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c
index f28f1b5..d24c192 100644
--- a/arch/x86/kernel/unwind_frame.c
+++ b/arch/x86/kernel/unwind_frame.c
@@ -21,6 +21,52 @@ unsigned long unwind_get_return_address(struct unwind_state *state)
}
EXPORT_SYMBOL_GPL(unwind_get_return_address);

+#ifdef CONFIG_X86_64
+/*
+ * This determines if the frame pointer actually contains an encoded pointer to
+ * pt_regs on the stack. See ENCODE_FRAME_POINTER.
+ */
+static struct pt_regs *decode_frame_pointer(struct unwind_state *state,
+ unsigned long bp)
+{
+ struct pt_regs *regs;
+ unsigned long *task_begin = task_stack_page(state->task);
+ unsigned long *task_end = task_stack_page(state->task) + THREAD_SIZE;
+
+ /* if the MSB is set, it's not an encoded pointer */
+ if (bp & (1UL << (BITS_PER_LONG - 1)))
+ return NULL;
+
+ /* decode it by setting the MSB */
+ bp |= 1UL << (BITS_PER_LONG - 1);
+ regs = (struct pt_regs *)bp;
+
+ /* make sure the regs are on the current unwind_state stack */
+ if (on_stack(&state->stack_info, regs, sizeof(*regs)))
+ return regs;
+
+ /*
+ * The regs might have been placed on the task stack before entry code
+ * switched to the irq stack.
+ */
+ if (state->stack_info.type == STACK_TYPE_IRQ &&
+ state->stack_info.next_sp >= task_begin &&
+ state->stack_info.next_sp < task_end &&
+ (unsigned long *)regs >= task_begin &&
+ (unsigned long *)regs < task_end &&
+ (unsigned long *)(regs + 1) <= task_end)
+ return regs;
+
+ return NULL;
+}
+#else
+static struct pt_regs *decode_frame_pointer(struct unwind_state *state,
+ unsigned long bp)
+{
+ return NULL;
+}
+#endif
+
static bool update_stack_state(struct unwind_state *state, void *addr,
size_t len)
{
@@ -45,14 +91,47 @@ unknown:

bool unwind_next_frame(struct unwind_state *state)
{
+ struct pt_regs *regs;
unsigned long *next_bp;

+ state->regs = NULL;
+
if (unwind_done(state))
return false;

next_bp = (unsigned long *)*state->bp;

/*
+ * Check if the next frame pointer is really an encoded pt_regs
+ * pointer.
+ */
+ regs = decode_frame_pointer(state, (unsigned long)next_bp);
+ if (regs) {
+ /*
+ * We may need to switch to the next stack to access the regs.
+ * This can happen when switching from the IRQ stack: the
+ * encoded regs pointer is on the IRQ stack but the regs
+ * themselves are on the task stack.
+ */
+ if (!update_stack_state(state, regs, sizeof(*regs)))
+ return false;
+
+ /*
+ * The regs are now safe to access and are made available to
+ * the user even if we've reached the end.
+ */
+ state->regs = regs;
+
+ if (user_mode(regs)) {
+ /* reached the end */
+ state->stack_info.type = STACK_TYPE_UNKNOWN;
+ return false;
+ }
+
+ next_bp = (unsigned long *)regs->bp;
+ }
+
+ /*
* Make sure the next frame is on a valid stack and can be accessed
* safely.
*/
@@ -72,6 +151,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,

state->task = task;
state->bp = get_frame_pointer(task, regs);
+ state->regs = NULL;

get_stack_info(state->bp, state->task, &state->stack_info,
&state->stack_mask);
--
2.7.4