Re: [patch V4 part 2 17/18] x86/kvm/vmx: Move guest enter/exit into .noinstr.text
From: Paolo Bonzini
Date: Wed May 06 2020 - 04:17:17 EST
On 05/05/20 15:41, Thomas Gleixner wrote:
> Move the functions which are inside the RCU-off region into the
> non-instrumentable text section.
>
> Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
> Cc: Paolo Bonzini <pbonzini@xxxxxxxxxx>
> Cc: Sean Christopherson <sean.j.christopherson@xxxxxxxxx>
> ---
> arch/x86/include/asm/hardirq.h | 4 -
> arch/x86/include/asm/kvm_host.h | 8 +++
> arch/x86/kvm/vmx/ops.h | 4 +
> arch/x86/kvm/vmx/vmenter.S | 5 +
> arch/x86/kvm/vmx/vmx.c | 105 ++++++++++++++++++++++------------------
> arch/x86/kvm/x86.c | 2
> 6 files changed, 79 insertions(+), 49 deletions(-)
>
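
For the archives, since "non-instrumentable" does a lot of work in this
patch: if I read part 1 of the series correctly, noinstr boils down to
roughly this (sketch, not quoted from the patch):

    /* Section for code which can't be instrumented at all */
    #define noinstr                                                 \
            noinline notrace __attribute((__section__(".noinstr.text")))

i.e. the annotated functions are forced out of line, are not traceable,
and are grouped into .noinstr.text, where objtool can verify that they
only call other noinstr code or code inside an explicit instrumentation
window.
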
> --- a/arch/x86/include/asm/hardirq.h
> +++ b/arch/x86/include/asm/hardirq.h
> @@ -67,12 +67,12 @@ static inline void kvm_set_cpu_l1tf_flus
> __this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1);
> }
>
> -static inline void kvm_clear_cpu_l1tf_flush_l1d(void)
> +static __always_inline void kvm_clear_cpu_l1tf_flush_l1d(void)
> {
> __this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 0);
> }
>
> -static inline bool kvm_get_cpu_l1tf_flush_l1d(void)
> +static __always_inline bool kvm_get_cpu_l1tf_flush_l1d(void)
> {
> return __this_cpu_read(irq_stat.kvm_cpu_l1tf_flush_l1d);
> }
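
The __always_inline change makes sense to me: these two helpers end up
being called from vmx_l1d_flush(), which becomes noinstr below, so they
must be guaranteed to inline instead of possibly being emitted out of
line in .text. A made-up caller, just to illustrate the constraint
(l1tf_example() is not from this patch):

    /* an out-of-line copy of the helper would land in .text, and the
     * call from .noinstr.text would then be flagged by objtool */
    static noinstr void l1tf_example(void)
    {
            if (kvm_get_cpu_l1tf_flush_l1d())
                    kvm_clear_cpu_l1tf_flush_l1d();
    }
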
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1601,7 +1601,15 @@ asmlinkage void kvm_spurious_fault(void)
> insn "\n\t" \
> "jmp 668f \n\t" \
> "667: \n\t" \
> + "1: \n\t" \
> + ".pushsection .discard.instr_begin \n\t" \
> + ".long 1b - . \n\t" \
> + ".popsection \n\t" \
> "call kvm_spurious_fault \n\t" \
> + "1: \n\t" \
> + ".pushsection .discard.instr_end \n\t" \
> + ".long 1b - . \n\t" \
> + ".popsection \n\t" \
> "668: \n\t" \
> _ASM_EXTABLE(666b, 667b)
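
The open-coded .discard.instr_begin/.discard.instr_end markers around
the call look right; they can't use instr_begin()/instr_end() directly
because this is already inside an asm() template. For reference, I
assume the C macros expand to something like this sketch:

    #define instr_begin() ({                                        \
            asm volatile("%c0:\n\t"                                 \
                         ".pushsection .discard.instr_begin\n\t"    \
                         ".long %c0b - .\n\t"                       \
                         ".popsection\n\t" : : "i" (__COUNTER__));  \
    })

so both forms merely record an address in a section that is discarded
at link time; objtool reads those addresses to know where calls into
instrumentable code are permitted.
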
>
> --- a/arch/x86/kvm/vmx/ops.h
> +++ b/arch/x86/kvm/vmx/ops.h
> @@ -146,7 +146,9 @@ do { \
> : : op1 : "cc" : error, fault); \
> return; \
> error: \
> + instr_begin(); \
> insn##_error(error_args); \
> + instr_end(); \
> return; \
> fault: \
> kvm_spurious_fault(); \
> @@ -161,7 +163,9 @@ do { \
> : : op1, op2 : "cc" : error, fault); \
> return; \
> error: \
> + instr_begin(); \
> insn##_error(error_args); \
> + instr_end(); \
> return; \
> fault: \
> kvm_spurious_fault(); \
> --- a/arch/x86/kvm/vmx/vmenter.S
> +++ b/arch/x86/kvm/vmx/vmenter.S
> @@ -27,7 +27,7 @@
> #define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE
> #endif
>
> - .text
> +.section .noinstr.text, "ax"
>
> /**
> * vmx_vmenter - VM-Enter the current loaded VMCS
> @@ -231,6 +231,9 @@ SYM_FUNC_START(__vmx_vcpu_run)
> jmp 1b
> SYM_FUNC_END(__vmx_vcpu_run)
>
> +
> +.section .text, "ax"
> +
> /**
> * vmread_error_trampoline - Trampoline from inline asm to vmread_error()
> * @field: VMCS field encoding that failed
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -6000,7 +6000,7 @@ static int vmx_handle_exit(struct kvm_vc
> * information but as all relevant affected CPUs have 32KiB L1D cache size
> * there is no point in doing so.
> */
> -static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
> +static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
> {
> int size = PAGE_SIZE << L1D_CACHE_ORDER;
>
> @@ -6033,7 +6033,7 @@ static void vmx_l1d_flush(struct kvm_vcp
> vcpu->stat.l1d_flush++;
>
> if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
> - wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
> + native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
> return;
> }
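
The wrmsrl() -> native_wrmsrl() switch follows from the noinstr
annotation: wrmsrl() can go through paravirt and the MSR write
tracepoint, both of which count as instrumentation, while
native_wrmsrl() is the bare WRMSR wrapper. Illustration only
(l1d_flush_msr_example() is made up):

    static noinstr void l1d_flush_msr_example(void)
    {
            /* fine inside .noinstr.text */
            native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);

            /* wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH) would be flagged
             * by objtool due to the tracepoint/paravirt indirection */
    }
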
>
> @@ -6514,7 +6514,7 @@ static void vmx_update_hv_timer(struct k
> }
> }
>
> -void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
> +void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
> {
> if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
> vmx->loaded_vmcs->host_state.rsp = host_rsp;
> @@ -6524,6 +6524,61 @@ void vmx_update_host_rsp(struct vcpu_vmx
>
> bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
>
> +static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
> + struct vcpu_vmx *vmx)
> +{
> + instr_begin();
> + /*
> + * VMENTER enables interrupts (host state), but the kernel state is
> + * interrupts disabled when this is invoked. Also tell RCU about
> + * it. This is the same logic as for exit_to_user_mode().
> + *
> + * 1) Trace interrupts on state
> + * 2) Prepare lockdep with RCU on
> + * 3) Invoke context tracking if enabled to adjust RCU state
> + * 4) Tell lockdep that interrupts are enabled
> + */
> + trace_hardirqs_on_prepare();
> + lockdep_hardirqs_on_prepare(CALLER_ADDR0);
> + instr_end();
> +
> + guest_enter_irqoff();
> + lockdep_hardirqs_on(CALLER_ADDR0);
> +
> + /* L1D Flush includes CPU buffer clear to mitigate MDS */
> + if (static_branch_unlikely(&vmx_l1d_should_flush))
> + vmx_l1d_flush(vcpu);
> + else if (static_branch_unlikely(&mds_user_clear))
> + mds_clear_cpu_buffers();
> +
> + if (vcpu->arch.cr2 != read_cr2())
> + write_cr2(vcpu->arch.cr2);
> +
> + vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
> + vmx->loaded_vmcs->launched);
> +
> + vcpu->arch.cr2 = read_cr2();
> +
> + /*
> + * VMEXIT disables interrupts (host state), but tracing and lockdep
> + * have them in state 'on'. Same as enter_from_user_mode().
> + *
> + * 1) Tell lockdep that interrupts are disabled
> + * 2) Invoke context tracking if enabled to reactivate RCU
> + * 3) Trace interrupts off state
> + *
> + * This needs to be done before the below as native_read_msr()
> + * contains a tracepoint and x86_spec_ctrl_restore_host() calls
> + * into world and some more.
> + */
> + lockdep_hardirqs_off(CALLER_ADDR0);
> + guest_exit_irqoff();
> +
> + instr_begin();
> + trace_hardirqs_off_prepare();
> + instr_end();
> +}
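
Worth spelling out for future readers: the instr_begin()/instr_end()
pairs only mark windows where instrumentation is permitted, they do not
make it safe by themselves. Between guest_enter_irqoff() and
guest_exit_irqoff() RCU may not be watching (context tracking /
nohz_full), so a marked window would not help there, which is why
everything in that span has to be noinstr or __always_inline. Rough
shape of the constraint (sketch, not real code):

    static noinstr void enter_exit_shape(void)
    {
            instr_begin();
            /* tracing / lockdep prep: RCU still watching */
            instr_end();

            guest_enter_irqoff();   /* RCU may stop watching here */

            /* only noinstr / __always_inline code in this span */

            guest_exit_irqoff();    /* RCU watching again */

            instr_begin();
            /* tracing allowed again */
            instr_end();
    }
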
> +
> static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
> {
> struct vcpu_vmx *vmx = to_vmx(vcpu);
> @@ -6604,49 +6659,9 @@ static void vmx_vcpu_run(struct kvm_vcpu
> x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
>
> /*
> - * VMENTER enables interrupts (host state), but the kernel state is
> - * interrupts disabled when this is invoked. Also tell RCU about
> - * it. This is the same logic as for exit_to_user_mode().
> - *
> - * 1) Trace interrupts on state
> - * 2) Prepare lockdep with RCU on
> - * 3) Invoke context tracking if enabled to adjust RCU state
> - * 4) Tell lockdep that interrupts are enabled
> + * The actual VMENTER/EXIT is in the .noinstr.text section.
> */
> - trace_hardirqs_on_prepare();
> - lockdep_hardirqs_on_prepare(CALLER_ADDR0);
> - guest_enter_irqoff();
> - lockdep_hardirqs_on(CALLER_ADDR0);
> -
> - /* L1D Flush includes CPU buffer clear to mitigate MDS */
> - if (static_branch_unlikely(&vmx_l1d_should_flush))
> - vmx_l1d_flush(vcpu);
> - else if (static_branch_unlikely(&mds_user_clear))
> - mds_clear_cpu_buffers();
> -
> - if (vcpu->arch.cr2 != read_cr2())
> - write_cr2(vcpu->arch.cr2);
> -
> - vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
> - vmx->loaded_vmcs->launched);
> -
> - vcpu->arch.cr2 = read_cr2();
> -
> - /*
> - * VMEXIT disables interrupts (host state), but tracing and lockdep
> - * have them in state 'on'. Same as enter_from_user_mode().
> - *
> - * 1) Tell lockdep that interrupts are disabled
> - * 2) Invoke context tracking if enabled to reactivate RCU
> - * 3) Trace interrupts off state
> - *
> - * This needs to be done before the below as native_read_msr()
> - * contains a tracepoint and x86_spec_ctrl_restore_host() calls
> - * into world and some more.
> - */
> - lockdep_hardirqs_off(CALLER_ADDR0);
> - guest_exit_irqoff();
> - trace_hardirqs_off_prepare();
> + vmx_vcpu_enter_exit(vcpu, vmx);
>
> /*
> * We do not use IBRS in the kernel. If this vCPU has used the
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -381,7 +381,7 @@ int kvm_set_apic_base(struct kvm_vcpu *v
> }
> EXPORT_SYMBOL_GPL(kvm_set_apic_base);
>
> -asmlinkage __visible void kvm_spurious_fault(void)
> +asmlinkage __visible noinstr void kvm_spurious_fault(void)
> {
> /* Fault while not rebooting. We want the trace. */
> BUG_ON(!kvm_rebooting);
>
Acked-by: Paolo Bonzini <pbonzini@xxxxxxxxxx>