Re: [PATCH 3/3] x86/mm, tracing: Fix CR2 corruption

From: Andy Lutomirski
Date: Wed Jul 03 2019 - 16:27:25 EST


On Wed, Jul 3, 2019 at 3:28 AM root <peterz@xxxxxxxxxxxxx> wrote:
>
> Despire the current efforts to read CR2 before tracing happens there
> still exist a number of possible holes:
>
> idtentry page_fault do_page_fault has_error_code=1
> call error_entry
> TRACE_IRQS_OFF
> call trace_hardirqs_off*
> #PF // modifies CR2
>
> CALL_enter_from_user_mode
> __context_tracking_exit()
> trace_user_exit(0)
> #PF // modifies CR2
>
> call do_page_fault
> address = read_cr2(); /* whoopsie */
>
> And similar for i386.
>
> Fix it by pulling the CR2 read into the entry code, before any of that
> stuff gets a chance to run and ruin things.
>
> Ideally we'll clean up the entry code by moving this tracing and
> context tracking nonsense into C some day, but let's not delay fixing
> this longer.
>
> Reported-by: He Zhe <zhe.he@xxxxxxxxxxxxx>
> Reported-by: Eiichi Tsukata <devel@xxxxxxxxxxxx>
> Debugged-by: Steven Rostedt <rostedt@xxxxxxxxxxx>
> Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
> ---
> arch/x86/entry/entry_32.S | 25 ++++++++++++++++++++++---
> arch/x86/entry/entry_64.S | 28 ++++++++++++++--------------
> arch/x86/include/asm/kvm_para.h | 2 +-
> arch/x86/include/asm/traps.h | 2 +-
> arch/x86/kernel/kvm.c | 8 ++++----
> arch/x86/mm/fault.c | 28 ++++++++++------------------
> 6 files changed, 52 insertions(+), 41 deletions(-)
>
> --- a/arch/x86/entry/entry_32.S
> +++ b/arch/x86/entry/entry_32.S
> @@ -1443,9 +1443,28 @@ BUILD_INTERRUPT3(hv_stimer0_callback_vec
>
> ENTRY(page_fault)
> ASM_CLAC
> - pushl $do_page_fault
> - ALIGN
> - jmp common_exception
> + pushl $0; /* %gs's slot on the stack */
> +
> + SAVE_ALL switch_stacks=1 skip_gs=1
> +
> + ENCODE_FRAME_POINTER
> + UNWIND_ESPFIX_STACK
> +
> + /* fixup %gs */
> + GS_TO_REG %ecx
> + REG_TO_PTGS %ecx
> + SET_KERNEL_GS %ecx
> +
> + GET_CR2_INTO(%ecx) # might clobber %eax
> +
> + /* fixup orig %eax */
> + movl PT_ORIG_EAX(%esp), %edx # get the error code
> + movl $-1, PT_ORIG_EAX(%esp) # no syscall to restart
> +
> + TRACE_IRQS_OFF
> + movl %esp, %eax # pt_regs pointer
> + call do_page_fault
> + jmp ret_from_exception
> END(page_fault)
>
> common_exception:
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -901,7 +901,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work
> * @paranoid == 2 is special: the stub will never switch stacks. This is for
> * #DF: if the thread stack is somehow unusable, we'll still get a useful OOPS.
> */
> -.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0
> +.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ist_offset=0 create_gap=0 read_cr2=0
> ENTRY(\sym)
> UNWIND_HINT_IRET_REGS offset=\has_error_code*8
>
> @@ -937,18 +937,27 @@ ENTRY(\sym)
>
> .if \paranoid
> call paranoid_entry
> + /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
> .else
> call error_entry
> .endif
> UNWIND_HINT_REGS
> - /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
>
> - .if \paranoid
> + .if \read_cr2
> + GET_CR2_INTO(%rdx); /* can clobber %rax */
> + .endif
> +
> .if \shift_ist != -1
> TRACE_IRQS_OFF_DEBUG /* reload IDT in case of recursion */
> .else
> TRACE_IRQS_OFF
> .endif
> +
> + .if \paranoid == 0
> + testb $3, CS(%rsp)
> + jz .Lfrom_kernel_no_context_tracking_\@
> + CALL_enter_from_user_mode
> +.Lfrom_kernel_no_context_tracking_\@:
> .endif
>
> movq %rsp, %rdi /* pt_regs pointer */
> @@ -1180,10 +1189,10 @@ idtentry xenint3 do_int3 has_error_co
> #endif
>
> idtentry general_protection do_general_protection has_error_code=1
> -idtentry page_fault do_page_fault has_error_code=1
> +idtentry page_fault do_page_fault has_error_code=1 read_cr2=1
>
> #ifdef CONFIG_KVM_GUEST
> -idtentry async_page_fault do_async_page_fault has_error_code=1
> +idtentry async_page_fault do_async_page_fault has_error_code=1 read_cr2=1
> #endif
>
> #ifdef CONFIG_X86_MCE
> @@ -1338,18 +1347,9 @@ ENTRY(error_entry)
> movq %rax, %rsp /* switch stack */
> ENCODE_FRAME_POINTER
> pushq %r12
> -
> - /*
> - * We need to tell lockdep that IRQs are off. We can't do this until
> - * we fix gsbase, and we should do it before enter_from_user_mode
> - * (which can take locks).
> - */
> - TRACE_IRQS_OFF

This hunk looks wrong. Am I missing some other place that handles the
case where we enter from kernel mode and IRQs were on?