RE: [PATCH RESEND v9 33/36] KVM: VMX: Add VMX_DO_FRED_EVENT_IRQOFF for IRQ/NMI handling

From: Li, Xin3
Date: Tue Aug 01 2023 - 19:19:17 EST



> > +#include "../../entry/calling.h"
>
> Rather than do the low level PUSH_REGS and POP_REGS, I vote to have core code
> expose a FRED-specific wrapper for invoking external_interrupt(). More below.

Nice idea!

> > + /*
> > + * A FRED stack frame has extra 16 bytes of information pushed at the
> > + * regular stack top compared to an IDT stack frame.
>
> There is pretty much no chance that anyone remembers the layout of an IDT stack
> frame off the top of their head. I.e. saying "FRED has 16 bytes more" isn't all
> that useful. It also fails to capture the fact that FRED stuff a hell of a lot
> more information in those "common" 48 bytes.
>
> It'll be hard/impossible to capture all of the overload info in a comment, but
> showing the actual layout of the frame would be super helpful, e.g. something
> like
> this
>
> /*
> * FRED stack frames are always 64 bytes:
> *
> * ------------------------------
> * | Bytes | Usage |
> * -----------------------------|
> * | 63:56 | Reserved |
> * | 55:48 | Event Data |
> * | 47:40 | SS + Event Info |
> * | 39:32 | RSP |
> * | 31:24 | RFLAGS |
> * | 23:16 | CS + Aux Info |
> * | 15:8 | RIP |
> * | 7:0 | Error Code |
> * ------------------------------
> */
> *
> * Use LEA to get the return RIP and push it, then push an error code.
> * Note, only NMI handling does an ERETS to the target! IRQ handling
> * doesn't need to unmask NMIs and so simply uses CALL+RET, i.e. the
> * RIP pushed here is only truly consumed for NMIs!

I take these as ASM code does need more love, i.e., nice comments :/

Otherwise only more confusion.


>
> Jumping way back to providing a wrapper for FRED, if we do that, then there's no
> need to include calling.h, and the weird wrinkle about the ERET target kinda goes
> away too. E.g. provide this in arch/x86/entry/entry_64_fred.S
>
> .section .text, "ax"
>
> /* Note, this is instrumentation safe, and returns via RET, not ERETS! */
> #if IS_ENABLED(CONFIG_KVM_INTEL)
> SYM_CODE_START(fred_irq_entry_from_kvm)
> FRED_ENTER
> call external_interrupt
> FRED_EXIT
> RET
> SYM_CODE_END(fred_irq_entry_from_kvm)
> EXPORT_SYMBOL_GPL(fred_irq_entry_from_kvm);
> #endif
>
> and then the KVM side for this particular chunk is more simply:
>
> lea 1f(%rip), %rax
> push %rax
> push $0 /* FRED error code, 0 for NMI and external
> interrupts */
>
> \branch_insn \branch_target
> 1:
> VMX_DO_EVENT_FRAME_END
> RET
>
>
> Alternatively, the whole thing could be shoved into
> arch/x86/entry/entry_64_fred.S,
> but at a glance I don't think that would be a net positive due to the need to
> handle
> IRQs vs. NMIs.
>
> > + \branch_insn \branch_target
> > +
> > + .if \nmi == 0
> > + POP_REGS
> > + .endif
> > +
> > +1:
> > + /*
> > + * "Restore" RSP from RBP, even though IRET has already unwound RSP to
>
> As mentioned above, this is incorrect on two fronts.
>
> > diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> > index 0ecf4be2c6af..4e90c69a92bf 100644
> > --- a/arch/x86/kvm/vmx/vmx.c
> > +++ b/arch/x86/kvm/vmx/vmx.c
> > @@ -6890,6 +6890,14 @@ static void vmx_apicv_post_state_restore(struct
> kvm_vcpu *vcpu)
> > memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
> > }
> >
> > +#ifdef CONFIG_X86_FRED
> > +void vmx_do_fred_interrupt_irqoff(unsigned int vector);
> > +void vmx_do_fred_nmi_irqoff(unsigned int vector);
> > +#else
> > +#define vmx_do_fred_interrupt_irqoff(x) BUG()
> > +#define vmx_do_fred_nmi_irqoff(x) BUG()
> > +#endif
>
> My slight preference is to open code the BUG() as a ud2 in assembly, purely to
> avoid more #ifdefs.
>
> > +
> > void vmx_do_interrupt_irqoff(unsigned long entry);
> > void vmx_do_nmi_irqoff(void);
> >
> > @@ -6932,14 +6940,16 @@ static void handle_external_interrupt_irqoff(struct
> kvm_vcpu *vcpu)
> > {
> > u32 intr_info = vmx_get_intr_info(vcpu);
> > unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
> > - gate_desc *desc = (gate_desc *)host_idt_base + vector;
> >
> > if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
> > "unexpected VM-Exit interrupt info: 0x%x", intr_info))
> > return;
> >
> > kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
> > - vmx_do_interrupt_irqoff(gate_offset(desc));
> > + if (cpu_feature_enabled(X86_FEATURE_FRED))
> > + vmx_do_fred_interrupt_irqoff(vector); /* Event type is 0 */
>
> I strongly prefer to use code to document what's going on. E.g. the tail comment
> just left me wondering, what event type is 0? Whereas this makes it quite clear
> that KVM is signaling a hardware interrupt. The fact that it's a nop as far as
> code generation goes is irrelevant.
>
> vmx_do_fred_interrupt_irqoff((EVENT_TYPE_HWINT << 16) | vector);

Better readability.

>
> > + else
> > + vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base
> + vector));
> > kvm_after_interrupt(vcpu);
> >
> > vcpu->arch.at_instruction_boundary = true;
>
> Here's a diff for (hopefully) everything I've suggested above.

Thanks a lot! I will test it and update the patch in this mail thread.

>
> ---
> arch/x86/entry/entry_64_fred.S | 17 ++++++-
> arch/x86/kernel/traps.c | 5 --
> arch/x86/kvm/vmx/vmenter.S | 84 +++++++++++++++-------------------
> arch/x86/kvm/vmx/vmx.c | 7 +--
> 4 files changed, 55 insertions(+), 58 deletions(-)
>
> diff --git a/arch/x86/entry/entry_64_fred.S b/arch/x86/entry/entry_64_fred.S
> index 12063267d2ac..a973c0bd29f6 100644
> --- a/arch/x86/entry/entry_64_fred.S
> +++ b/arch/x86/entry/entry_64_fred.S
> @@ -10,7 +10,6 @@
> #include "calling.h"
>
> .code64
> - .section ".noinstr.text", "ax"
>
> .macro FRED_ENTER
> UNWIND_HINT_END_OF_STACK
> @@ -24,6 +23,22 @@
> POP_REGS
> .endm
>
> + .section .text, "ax"
> +
> +/* Note, this is instrumentation safe, and returns via RET, not ERETS! */
> +#if IS_ENABLED(CONFIG_KVM_INTEL)
> +SYM_CODE_START(fred_irq_entry_from_kvm)
> + FRED_ENTER
> + call external_interrupt
> + FRED_EXIT
> + RET
> +SYM_CODE_END(fred_irq_entry_from_kvm)
> +EXPORT_SYMBOL_GPL(fred_irq_entry_from_kvm);
> +#endif
> +
> + .section ".noinstr.text", "ax"
> +
> +
> /*
> * The new RIP value that FRED event delivery establishes is
> * IA32_FRED_CONFIG & ~FFFH for events that occur in ring 3.
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index 21eeba7b188f..cbcb83c71dab 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -1566,11 +1566,6 @@ int external_interrupt(struct pt_regs *regs)
> return 0;
> }
>
> -#if IS_ENABLED(CONFIG_KVM_INTEL)
> -/* For KVM VMX to handle IRQs in IRQ induced VM exits. */
> -EXPORT_SYMBOL_GPL(external_interrupt);
> -#endif
> -
> #endif /* CONFIG_X86_64 */
>
> void __init trap_init(void)
> diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
> index 79a4c91d9434..e25df565c3f8 100644
> --- a/arch/x86/kvm/vmx/vmenter.S
> +++ b/arch/x86/kvm/vmx/vmenter.S
> @@ -9,7 +9,6 @@
> #include <asm/segment.h>
> #include "kvm-asm-offsets.h"
> #include "run_flags.h"
> -#include "../../entry/calling.h"
>
> #define WORD_SIZE (BITS_PER_LONG / 8)
>
> @@ -33,15 +32,31 @@
> #define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE
> #endif
>
> +.macro VMX_DO_EVENT_FRAME_BEGIN
> + /*
> + * Unconditionally create a stack frame, getting the correct RSP on the
> + * stack (for x86-64) would take two instructions anyways, and RBP can
> + * be used to restore RSP to make objtool happy (see below).
> + */
> + push %_ASM_BP
> + mov %_ASM_SP, %_ASM_BP
> +.endm
> +
> +.macro VMX_DO_EVENT_FRAME_END
> + /*
> + * "Restore" RSP from RBP, even though {E,I}RET has already unwound RSP
> + * to the correct value *in most cases*. KVM's IRQ handling with FRED
> + * doesn't do ERETS, and objtool doesn't know the callee will IRET/ERET
> + * and, without the explicit restore, thinks the stack is getting walloped.
> + * Using an unwind hint is problematic due to x86-64's dynamic alignment.
> + */
> + mov %_ASM_BP, %_ASM_SP
> + pop %_ASM_BP
> +.endm
> +
> #ifdef CONFIG_X86_FRED
> .macro VMX_DO_FRED_EVENT_IRQOFF branch_insn branch_target nmi=0
> - /*
> - * Unconditionally create a stack frame, getting the correct RSP on the
> - * stack (for x86-64) would take two instructions anyways, and RBP can
> - * be used to restore RSP to make objtool happy (see below).
> - */
> - push %_ASM_BP
> - mov %_ASM_SP, %_ASM_BP
> + VMX_DO_EVENT_FRAME_BEGIN
>
> /*
> * Don't check the FRED stack level, the call stack leading to this
> @@ -78,43 +93,23 @@
> * push an error code before invoking the IRQ/NMI handler.
> *
> * Use LEA to get the return RIP and push it, then push an error code.
> + * Note, only NMI handling does an ERETS to the target! IRQ handling
> + * doesn't need to unmask NMIs and so simply uses CALL+RET, i.e. the
> + * RIP pushed here is only truly consumed for NMIs!
> */
> lea 1f(%rip), %rax
> push %rax
> push $0 /* FRED error code, 0 for NMI and external
> interrupts */
>
> - .if \nmi == 0
> - PUSH_REGS
> - mov %rsp, %rdi
> - .endif
> -
> \branch_insn \branch_target
> -
> - .if \nmi == 0
> - POP_REGS
> - .endif
> -
> 1:
> - /*
> - * "Restore" RSP from RBP, even though IRET has already unwound RSP to
> - * the correct value. objtool doesn't know the callee will IRET and,
> - * without the explicit restore, thinks the stack is getting walloped.
> - * Using an unwind hint is problematic due to x86-64's dynamic alignment.
> - */
> - mov %_ASM_BP, %_ASM_SP
> - pop %_ASM_BP
> + VMX_DO_EVENT_FRAME_END
> RET
> .endm
> #endif
>
> .macro VMX_DO_EVENT_IRQOFF call_insn call_target
> - /*
> - * Unconditionally create a stack frame, getting the correct RSP on the
> - * stack (for x86-64) would take two instructions anyways, and RBP can
> - * be used to restore RSP to make objtool happy (see below).
> - */
> - push %_ASM_BP
> - mov %_ASM_SP, %_ASM_BP
> + VMX_DO_EVENT_FRAME_BEGIN
>
> #ifdef CONFIG_X86_64
> /*
> @@ -129,14 +124,7 @@
> push $__KERNEL_CS
> \call_insn \call_target
>
> - /*
> - * "Restore" RSP from RBP, even though IRET has already unwound RSP to
> - * the correct value. objtool doesn't know the callee will IRET and,
> - * without the explicit restore, thinks the stack is getting walloped.
> - * Using an unwind hint is problematic due to x86-64's dynamic alignment.
> - */
> - mov %_ASM_BP, %_ASM_SP
> - pop %_ASM_BP
> + VMX_DO_EVENT_FRAME_END
> RET
> .endm
>
> @@ -375,11 +363,13 @@ SYM_INNER_LABEL_ALIGN(vmx_vmexit,
> SYM_L_GLOBAL)
>
> SYM_FUNC_END(__vmx_vcpu_run)
>
> -#ifdef CONFIG_X86_FRED
> SYM_FUNC_START(vmx_do_fred_nmi_irqoff)
> +#ifdef CONFIG_X86_FRED
> VMX_DO_FRED_EVENT_IRQOFF jmp fred_entrypoint_kernel nmi=1
> +#else
> + ud2
> +#endif
> SYM_FUNC_END(vmx_do_fred_nmi_irqoff)
> -#endif
>
> SYM_FUNC_START(vmx_do_nmi_irqoff)
> VMX_DO_EVENT_IRQOFF call asm_exc_nmi_kvm_vmx
> @@ -438,11 +428,13 @@ SYM_FUNC_END(vmread_error_trampoline)
> #endif
>
> .section .text, "ax"
> -#ifdef CONFIG_X86_FRED
> SYM_FUNC_START(vmx_do_fred_interrupt_irqoff)
> - VMX_DO_FRED_EVENT_IRQOFF call external_interrupt
> +#ifdef CONFIG_X86_FRED
> + VMX_DO_FRED_EVENT_IRQOFF call fred_irq_entry_from_kvm
> +#else
> + ud2
> +#endif
> SYM_FUNC_END(vmx_do_fred_interrupt_irqoff)
> -#endif
>
> SYM_FUNC_START(vmx_do_interrupt_irqoff)
> VMX_DO_EVENT_IRQOFF CALL_NOSPEC _ASM_ARG1
> diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
> index bf757f5071e4..cb4675dd87df 100644
> --- a/arch/x86/kvm/vmx/vmx.c
> +++ b/arch/x86/kvm/vmx/vmx.c
> @@ -6919,13 +6919,8 @@ static void vmx_apicv_post_state_restore(struct
> kvm_vcpu *vcpu)
> memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
> }
>
> -#ifdef CONFIG_X86_FRED
> void vmx_do_fred_interrupt_irqoff(unsigned int vector);
> void vmx_do_fred_nmi_irqoff(unsigned int vector);
> -#else
> -#define vmx_do_fred_interrupt_irqoff(x) BUG()
> -#define vmx_do_fred_nmi_irqoff(x) BUG()
> -#endif
>
> void vmx_do_interrupt_irqoff(unsigned long entry);
> void vmx_do_nmi_irqoff(void);
> @@ -6976,7 +6971,7 @@ static void handle_external_interrupt_irqoff(struct
> kvm_vcpu *vcpu)
>
> kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
> if (cpu_feature_enabled(X86_FEATURE_FRED))
> - vmx_do_fred_interrupt_irqoff(vector); /* Event type is 0 */
> + vmx_do_fred_interrupt_irqoff((EVENT_TYPE_HWINT << 16) |
> vector);
> else
> vmx_do_interrupt_irqoff(gate_offset((gate_desc *)host_idt_base
> + vector));
> kvm_after_interrupt(vcpu);
>
> base-commit: 8961078ffe509a97ec7803b17912e57c47b93fa2
> --