[PATCH 4.1 35/84] x86/nmi/64: Improve nested NMI comments

From: Greg Kroah-Hartman
Date: Fri Aug 14 2015 - 14:21:57 EST


4.1-stable review patch. If anyone has any objections, please let me know.

------------------

From: Andy Lutomirski <luto@xxxxxxxxxx>

commit 0b22930ebad563ae97ff3f8d7b9f12060b4c6e6b upstream.

I found the nested NMI documentation to be difficult to follow.
Improve the comments.

Signed-off-by: Andy Lutomirski <luto@xxxxxxxxxx>
Reviewed-by: Steven Rostedt <rostedt@xxxxxxxxxxx>
Cc: Borislav Petkov <bp@xxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Signed-off-by: Ingo Molnar <mingo@xxxxxxxxxx>
Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx>

---
arch/x86/kernel/entry_64.S | 160 +++++++++++++++++++++++++--------------------
arch/x86/kernel/nmi.c | 4 -
2 files changed, 94 insertions(+), 70 deletions(-)

--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1411,11 +1411,12 @@ ENTRY(nmi)
* If the variable is not set and the stack is not the NMI
* stack then:
* o Set the special variable on the stack
- * o Copy the interrupt frame into a "saved" location on the stack
- * o Copy the interrupt frame into a "copy" location on the stack
+ * o Copy the interrupt frame into an "outermost" location on the
+ * stack
+ * o Copy the interrupt frame into an "iret" location on the stack
* o Continue processing the NMI
* If the variable is set or the previous stack is the NMI stack:
- * o Modify the "copy" location to jump to the repeate_nmi
+ * o Modify the "iret" location to jump to the repeat_nmi
* o return back to the first NMI
*
* Now on exit of the first NMI, we first clear the stack variable
@@ -1491,18 +1492,60 @@ ENTRY(nmi)

.Lnmi_from_kernel:
/*
- * Check the special variable on the stack to see if NMIs are
- * executing.
+ * Here's what our stack frame will look like:
+ * +---------------------------------------------------------+
+ * | original SS |
+ * | original Return RSP |
+ * | original RFLAGS |
+ * | original CS |
+ * | original RIP |
+ * +---------------------------------------------------------+
+ * | temp storage for rdx |
+ * +---------------------------------------------------------+
+ * | "NMI executing" variable |
+ * +---------------------------------------------------------+
+ * | iret SS } Copied from "outermost" frame |
+ * | iret Return RSP } on each loop iteration; overwritten |
+ * | iret RFLAGS } by a nested NMI to force another |
+ * | iret CS } iteration if needed. |
+ * | iret RIP } |
+ * +---------------------------------------------------------+
+ * | outermost SS } initialized in first_nmi; |
+ * | outermost Return RSP } will not be changed before |
+ * | outermost RFLAGS } NMI processing is done. |
+ * | outermost CS } Copied to "iret" frame on each |
+ * | outermost RIP } iteration. |
+ * +---------------------------------------------------------+
+ * | pt_regs |
+ * +---------------------------------------------------------+
+ *
+ * The "original" frame is used by hardware. Before re-enabling
+ * NMIs, we need to be done with it, and we need to leave enough
+ * space for the asm code here.
+ *
+ * We return by executing IRET while RSP points to the "iret" frame.
+ * That will either return for real or it will loop back into NMI
+ * processing.
+ *
+ * The "outermost" frame is copied to the "iret" frame on each
+ * iteration of the loop, so each iteration starts with the "iret"
+ * frame pointing to the final return target.
+ */
+
+ /*
+ * Determine whether we're a nested NMI.
+ *
+ * First check "NMI executing". If it's set, then we're nested.
+ * This will not detect if we interrupted an outer NMI just
+ * before IRET.
*/
cmpl $1, -8(%rsp)
je nested_nmi

/*
- * Now test if the previous stack was an NMI stack.
- * We need the double check. We check the NMI stack to satisfy the
- * race when the first NMI clears the variable before returning.
- * We check the variable because the first NMI could be in a
- * breakpoint routine using a breakpoint stack.
+ * Now test if the previous stack was an NMI stack. This covers
+ * the case where we interrupt an outer NMI after it clears
+ * "NMI executing" but before IRET.
*/
lea 6*8(%rsp), %rdx
/* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
@@ -1519,9 +1562,11 @@ ENTRY(nmi)

nested_nmi:
/*
- * Do nothing if we interrupted the fixup in repeat_nmi.
- * It's about to repeat the NMI handler, so we are fine
- * with ignoring this one.
+ * If we interrupted an NMI that is between repeat_nmi and
+ * end_repeat_nmi, then we must not modify the "iret" frame
+ * because it's being written by the outer NMI. That's okay;
+ * the outer NMI handler is about to call do_nmi anyway,
+ * so we can just resume the outer NMI.
*/
movq $repeat_nmi, %rdx
cmpq 8(%rsp), %rdx
@@ -1531,7 +1576,10 @@ nested_nmi:
ja nested_nmi_out

1:
- /* Set up the interrupted NMIs stack to jump to repeat_nmi */
+ /*
+ * Modify the "iret" frame to point to repeat_nmi, forcing another
+ * iteration of NMI handling.
+ */
leaq -1*8(%rsp), %rdx
movq %rdx, %rsp
CFI_ADJUST_CFA_OFFSET 1*8
@@ -1550,60 +1598,23 @@ nested_nmi_out:
popq_cfi %rdx
CFI_RESTORE rdx

- /* No need to check faults here */
+ /* We are returning to kernel mode, so this cannot result in a fault. */
INTERRUPT_RETURN

CFI_RESTORE_STATE
first_nmi:
- /*
- * Because nested NMIs will use the pushed location that we
- * stored in rdx, we must keep that space available.
- * Here's what our stack frame will look like:
- * +-------------------------+
- * | original SS |
- * | original Return RSP |
- * | original RFLAGS |
- * | original CS |
- * | original RIP |
- * +-------------------------+
- * | temp storage for rdx |
- * +-------------------------+
- * | NMI executing variable |
- * +-------------------------+
- * | copied SS |
- * | copied Return RSP |
- * | copied RFLAGS |
- * | copied CS |
- * | copied RIP |
- * +-------------------------+
- * | Saved SS |
- * | Saved Return RSP |
- * | Saved RFLAGS |
- * | Saved CS |
- * | Saved RIP |
- * +-------------------------+
- * | pt_regs |
- * +-------------------------+
- *
- * The saved stack frame is used to fix up the copied stack frame
- * that a nested NMI may change to make the interrupted NMI iret jump
- * to the repeat_nmi. The original stack frame and the temp storage
- * is also used by nested NMIs and can not be trusted on exit.
- */
- /* Do not pop rdx, nested NMIs will corrupt that part of the stack */
+ /* Restore rdx. */
movq (%rsp), %rdx
CFI_RESTORE rdx

- /* Set the NMI executing variable on the stack. */
+ /* Set "NMI executing" on the stack. */
pushq_cfi $1

- /*
- * Leave room for the "copied" frame
- */
+ /* Leave room for the "iret" frame */
subq $(5*8), %rsp
CFI_ADJUST_CFA_OFFSET 5*8

- /* Copy the stack frame to the Saved frame */
+ /* Copy the "original" frame to the "outermost" frame */
.rept 5
pushq_cfi 11*8(%rsp)
.endr
@@ -1611,6 +1622,7 @@ first_nmi:

/* Everything up to here is safe from nested NMIs */

+repeat_nmi:
/*
* If there was a nested NMI, the first NMI's iret will return
* here. But NMIs are still enabled and we can take another
@@ -1619,16 +1631,21 @@ first_nmi:
* it will just return, as we are about to repeat an NMI anyway.
* This makes it safe to copy to the stack frame that a nested
* NMI will update.
- */
-repeat_nmi:
- /*
- * Update the stack variable to say we are still in NMI (the update
- * is benign for the non-repeat case, where 1 was pushed just above
- * to this very stack slot).
+ *
+ * RSP is pointing to "outermost RIP". gsbase is unknown, but, if
+ * we're repeating an NMI, gsbase has the same value that it had on
+ * the first iteration. paranoid_entry will load the kernel
+ * gsbase if needed before we call do_nmi.
+ *
+ * Set "NMI executing" in case we came back here via IRET.
*/
movq $1, 10*8(%rsp)

- /* Make another copy, this one may be modified by nested NMIs */
+ /*
+ * Copy the "outermost" frame to the "iret" frame. NMIs that nest
+ * here must not modify the "iret" frame while we're writing to
+ * it or it will end up containing garbage.
+ */
addq $(10*8), %rsp
CFI_ADJUST_CFA_OFFSET -10*8
.rept 5
@@ -1639,9 +1656,9 @@ repeat_nmi:
end_repeat_nmi:

/*
- * Everything below this point can be preempted by a nested
- * NMI if the first NMI took an exception and reset our iret stack
- * so that we repeat another NMI.
+ * Everything below this point can be preempted by a nested NMI.
+ * If this happens, then the inner NMI will change the "iret"
+ * frame to point back to repeat_nmi.
*/
pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
ALLOC_PT_GPREGS_ON_STACK
@@ -1668,11 +1685,18 @@ nmi_swapgs:
nmi_restore:
RESTORE_EXTRA_REGS
RESTORE_C_REGS
- /* Pop the extra iret frame at once */
+
+ /* Point RSP at the "iret" frame. */
REMOVE_PT_GPREGS_FROM_STACK 6*8

- /* Clear the NMI executing stack variable */
+ /* Clear "NMI executing". */
movq $0, 5*8(%rsp)
+
+ /*
+ * INTERRUPT_RETURN reads the "iret" frame and exits the NMI
+ * stack in a single instruction. We are returning to kernel
+ * mode, so this cannot result in a fault.
+ */
INTERRUPT_RETURN
CFI_ENDPROC
END(nmi)
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -408,8 +408,8 @@ static void default_do_nmi(struct pt_reg
NOKPROBE_SYMBOL(default_do_nmi);

/*
- * NMIs can hit breakpoints which will cause it to lose its NMI context
- * with the CPU when the breakpoint or page fault does an IRET.
+ * NMIs can page fault or hit breakpoints which will cause it to lose
+ * its NMI context with the CPU when the breakpoint or page fault does an IRET.
*
* As a result, NMIs can nest if NMIs get unmasked due an IRET during
* NMI processing. On x86_64, the asm glue protects us from nested NMIs


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/