Re: [RFC v2 08/32] x86/traps: Add #VE support for TDX guest
From: Dave Hansen
Date: Fri May 07 2021 - 17:36:40 EST
On 4/26/21 11:01 AM, Kuppuswamy Sathyanarayanan wrote:
...
> The #VE cannot be nested before TDGETVEINFO is called, if there is any
> reason for it to nest the TD would shut down. The TDX module guarantees
> that no NMIs (or #MC or similar) can happen in this window. After
> TDGETVEINFO the #VE handler can nest if needed, although we don’t expect
> it to happen normally.
I think this description really needs some work. Does "The #VE cannot
be nested" mean that "hardware guarantees that a nested #VE will not be
generated", or that "software must not cause a nested #VE"?
What does "the TD would shut down" mean? I think you mean that, instead
of delivering a nested #VE, the hardware would actually exit to the host
and TDX would prevent the guest from being reentered. Right?
> diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
> index 5eb3bdf36a41..41a0732d5f68 100644
> --- a/arch/x86/include/asm/idtentry.h
> +++ b/arch/x86/include/asm/idtentry.h
> @@ -619,6 +619,10 @@ DECLARE_IDTENTRY_XENCB(X86_TRAP_OTHER, exc_xen_hypervisor_callback);
> DECLARE_IDTENTRY_RAW(X86_TRAP_OTHER, exc_xen_unknown_trap);
> #endif
>
> +#ifdef CONFIG_INTEL_TDX_GUEST
> +DECLARE_IDTENTRY(X86_TRAP_VE, exc_virtualization_exception);
> +#endif
> +
> /* Device interrupts common/spurious */
> DECLARE_IDTENTRY_IRQ(X86_TRAP_OTHER, common_interrupt);
> #ifdef CONFIG_X86_LOCAL_APIC
> diff --git a/arch/x86/include/asm/tdx.h b/arch/x86/include/asm/tdx.h
> index c5a870cef0ae..1ca55d8e9963 100644
> --- a/arch/x86/include/asm/tdx.h
> +++ b/arch/x86/include/asm/tdx.h
> @@ -11,6 +11,7 @@
> #include <linux/types.h>
>
> #define TDINFO 1
> +#define TDGETVEINFO 3
>
> struct tdcall_output {
> u64 rcx;
> @@ -29,6 +30,20 @@ struct tdvmcall_output {
> u64 r15;
> };
>
> +struct ve_info {
> + u64 exit_reason;
> + u64 exit_qual;
> + u64 gla;
> + u64 gpa;
> + u32 instr_len;
> + u32 instr_info;
> +};
Is this an architectural structure or some software construct?
> +unsigned long tdg_get_ve_info(struct ve_info *ve);
> +
> +int tdg_handle_virtualization_exception(struct pt_regs *regs,
> + struct ve_info *ve);
> +
> /* Common API to check TDX support in decompression and common kernel code. */
> bool is_tdx_guest(void);
>
> diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
> index ee1a283f8e96..546b6b636c7d 100644
> --- a/arch/x86/kernel/idt.c
> +++ b/arch/x86/kernel/idt.c
> @@ -64,6 +64,9 @@ static const __initconst struct idt_data early_idts[] = {
> */
> INTG(X86_TRAP_PF, asm_exc_page_fault),
> #endif
> +#ifdef CONFIG_INTEL_TDX_GUEST
> + INTG(X86_TRAP_VE, asm_exc_virtualization_exception),
> +#endif
> };
>
> /*
> @@ -87,6 +90,9 @@ static const __initconst struct idt_data def_idts[] = {
> INTG(X86_TRAP_MF, asm_exc_coprocessor_error),
> INTG(X86_TRAP_AC, asm_exc_alignment_check),
> INTG(X86_TRAP_XF, asm_exc_simd_coprocessor_error),
> +#ifdef CONFIG_INTEL_TDX_GUEST
> + INTG(X86_TRAP_VE, asm_exc_virtualization_exception),
> +#endif
>
> #ifdef CONFIG_X86_32
> TSKG(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS),
> diff --git a/arch/x86/kernel/tdx.c b/arch/x86/kernel/tdx.c
> index b63275db1db9..ccfcb07bfb2c 100644
> --- a/arch/x86/kernel/tdx.c
> +++ b/arch/x86/kernel/tdx.c
> @@ -82,6 +82,44 @@ static void tdg_get_info(void)
> td_info.attributes = out.rdx;
> }
>
> +unsigned long tdg_get_ve_info(struct ve_info *ve)
> +{
> + u64 ret;
> + struct tdcall_output out = {0};
> +
> + /*
> + * The #VE cannot be nested before TDGETVEINFO is called,
> + * if there is any reason for it to nest the TD would shut
> + * down. The TDX module guarantees that no NMIs (or #MC or
> + * similar) can happen in this window. After TDGETVEINFO
> + * the #VE handler can nest if needed, although we don’t
> + * expect it to happen normally.
> + */
I find that description a bit unsatisfying. Could we make it more
concrete? By the way, what about *normal* interrupts?
Maybe we should talk about this in terms of *rules* that folks need to
follow. Maybe:
Before this point, NMIs and machine checks are suppressed and
any #VE is fatal. After this point, NMIs and additional #VEs
are permitted.
> + ret = __tdcall(TDGETVEINFO, 0, 0, 0, 0, &out);
> +
> + ve->exit_reason = out.rcx;
> + ve->exit_qual = out.rdx;
> + ve->gla = out.r8;
> + ve->gpa = out.r9;
> + ve->instr_len = out.r10 & UINT_MAX;
> + ve->instr_info = out.r10 >> 32;
> +
> + return ret;
> +}
> +
> +int tdg_handle_virtualization_exception(struct pt_regs *regs,
> + struct ve_info *ve)
> +{
> + /*
> + * TODO: Add handler support for various #VE exit
> + * reasons. It will be added by other patches in
> + * the series.
> + */
> + pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
> + return -EFAULT;
> +}
> +
> void __init tdx_early_init(void)
> {
> if (!cpuid_has_tdx_guest())
> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
> index 213d4aa8e337..64869aa88a5a 100644
> --- a/arch/x86/kernel/traps.c
> +++ b/arch/x86/kernel/traps.c
> @@ -61,6 +61,7 @@
> #include <asm/insn.h>
> #include <asm/insn-eval.h>
> #include <asm/vdso.h>
> +#include <asm/tdx.h>
>
> #ifdef CONFIG_X86_64
> #include <asm/x86_init.h>
> @@ -1140,6 +1141,35 @@ DEFINE_IDTENTRY(exc_device_not_available)
> }
> }
>
> +#ifdef CONFIG_INTEL_TDX_GUEST
> +DEFINE_IDTENTRY(exc_virtualization_exception)
> +{
> + struct ve_info ve;
> + int ret;
> +
> + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
> +
> + /*
> + * Consume #VE info before re-enabling interrupts. It will be
> + * re-enabled after executing the TDGETVEINFO TDCALL.
> + */
"It" is nebulous here. Is this talking about NMIs, or the
cond_local_irq_enable() that is "after" TDGETVEINFO?
> + ret = tdg_get_ve_info(&ve);
> +
> + cond_local_irq_enable(regs);
> +
> + if (!ret)
> + ret = tdg_handle_virtualization_exception(regs, &ve);
> + /*
> + * If tdg_handle_virtualization_exception() could not process
> + * it successfully, treat it as #GP(0) and handle it.
> + */
> + if (ret)
> + do_general_protection(regs, 0);
> +
> + cond_local_irq_disable(regs);
> +}
> +#endif
> +
> #ifdef CONFIG_X86_32
> DEFINE_IDTENTRY_SW(iret_error)
> {
>