[PATCH v2] xen: get rid of paravirt op adjust_exception_frame

From: Juergen Gross
Date: Tue Aug 01 2017 - 06:40:14 EST


When running as Xen pv-guest the exception frame on the stack contains
%r11 and %rcx additional to the other data pushed by the processor.

Instead of having a paravirt op being called for each exception type
prepend the Xen specific code to each exception entry. When running as
Xen pv-guest just use the exception entry with prepended instructions,
otherwise use the entry without the Xen specific code.

Signed-off-by: Juergen Gross <jgross@xxxxxxxx>
---
arch/x86/entry/entry_64.S | 22 ++------------
arch/x86/entry/entry_64_compat.S | 1 -
arch/x86/include/asm/desc.h | 16 ++++++++++
arch/x86/include/asm/paravirt.h | 5 ---
arch/x86/include/asm/paravirt_types.h | 4 ---
arch/x86/include/asm/proto.h | 3 ++
arch/x86/include/asm/traps.h | 51 +++++++++++++++++++++++++++++--
arch/x86/kernel/asm-offsets_64.c | 1 -
arch/x86/kernel/paravirt.c | 3 --
arch/x86/kernel/traps.c | 57 ++++++++++++++++++-----------------
arch/x86/xen/enlighten_pv.c | 16 +++++-----
arch/x86/xen/irq.c | 3 --
arch/x86/xen/xen-asm_64.S | 45 ++++++++++++++++++++++++---
arch/x86/xen/xen-ops.h | 1 -
14 files changed, 147 insertions(+), 81 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index a9a8027a6c0e..602bcf68a32c 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -745,7 +745,6 @@ ENTRY(\sym)
.endif

ASM_CLAC
- PARAVIRT_ADJUST_EXCEPTION_FRAME

.ifeq \has_error_code
pushq $-1 /* ORIG_RAX: no syscall to restart */
@@ -901,7 +900,7 @@ ENTRY(do_softirq_own_stack)
END(do_softirq_own_stack)

#ifdef CONFIG_XEN
-idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0
+idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0

/*
* A note on the "critical region" in our callback handler.
@@ -967,8 +966,6 @@ ENTRY(xen_failsafe_callback)
movq 8(%rsp), %r11
addq $0x30, %rsp
pushq $0 /* RIP */
- pushq %r11
- pushq %rcx
jmp general_protection
1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
movq (%rsp), %rcx
@@ -997,9 +994,8 @@ idtentry int3 do_int3 has_error_code=0 paranoid=1 shift_ist=DEBUG_STACK
idtentry stack_segment do_stack_segment has_error_code=1

#ifdef CONFIG_XEN
-idtentry xen_debug do_debug has_error_code=0
-idtentry xen_int3 do_int3 has_error_code=0
-idtentry xen_stack_segment do_stack_segment has_error_code=1
+idtentry xendebug do_debug has_error_code=0
+idtentry xenint3 do_int3 has_error_code=0
#endif

idtentry general_protection do_general_protection has_error_code=1
@@ -1161,18 +1157,6 @@ END(error_exit)
/* Runs on exception stack */
ENTRY(nmi)
/*
- * Fix up the exception frame if we're on Xen.
- * PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most
- * one value to the stack on native, so it may clobber the rdx
- * scratch slot, but it won't clobber any of the important
- * slots past it.
- *
- * Xen is a different story, because the Xen frame itself overlaps
- * the "NMI executing" variable.
- */
- PARAVIRT_ADJUST_EXCEPTION_FRAME
-
- /*
* We allow breakpoints in NMIs. If a breakpoint occurs, then
* the iretq it performs will take us out of NMI context.
* This means that we can have nested NMIs where the next
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index e1721dafbcb1..843d2f08397b 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -294,7 +294,6 @@ ENTRY(entry_INT80_compat)
/*
* Interrupts are off on entry.
*/
- PARAVIRT_ADJUST_EXCEPTION_FRAME
ASM_CLAC /* Do this early to minimize exposure */
SWAPGS

diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index d0a21b12dd58..f581e61ffcaa 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -9,6 +9,8 @@
#include <linux/smp.h>
#include <linux/percpu.h>

+#include <xen/xen.h>
+
static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *info)
{
desc->limit0 = info->limit & 0x0ffff;
@@ -83,6 +85,12 @@ static inline phys_addr_t get_cpu_gdt_paddr(unsigned int cpu)
return per_cpu_ptr_to_phys(get_cpu_gdt_rw(cpu));
}

+#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV)
+#define pv_trap_entry(name) (void *)(xen_pv_domain() ? xen_ ## name : name)
+#else
+#define pv_trap_entry(name) (void *)(name)
+#endif
+
#ifdef CONFIG_X86_64

static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
@@ -482,6 +490,14 @@ static inline void _set_gate(int gate, unsigned type, void *addr,
0, 0, __KERNEL_CS); \
} while (0)

+#define set_intr_gate_pv(n, addr) \
+ do { \
+ set_intr_gate_notrace(n, pv_trap_entry(addr)); \
+ _trace_set_gate(n, GATE_INTERRUPT, \
+ pv_trap_entry(trace_##addr), \
+ 0, 0, __KERNEL_CS); \
+ } while (0)
+
extern int first_system_vector;
/* used_vectors is BITMAP for irq is not managed by percpu vector_irq */
extern unsigned long used_vectors[];
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 9ccac1926587..c25dd22f7c70 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -960,11 +960,6 @@ extern void default_banner(void);
#define GET_CR2_INTO_RAX \
call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2)

-#define PARAVIRT_ADJUST_EXCEPTION_FRAME \
- PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_adjust_exception_frame), \
- CLBR_NONE, \
- call PARA_INDIRECT(pv_irq_ops+PV_IRQ_adjust_exception_frame))
-
#define USERGS_SYSRET64 \
PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \
CLBR_NONE, \
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 9ffc36bfe4cd..c55106726938 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -195,10 +195,6 @@ struct pv_irq_ops {

void (*safe_halt)(void);
void (*halt)(void);
-
-#ifdef CONFIG_X86_64
- void (*adjust_exception_frame)(void);
-#endif
} __no_randomize_layout;

struct pv_mmu_ops {
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 8d3964fc5f91..b408b1886195 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -24,6 +24,9 @@ void entry_SYSENTER_compat(void);
void __end_entry_SYSENTER_compat(void);
void entry_SYSCALL_compat(void);
void entry_INT80_compat(void);
+#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV)
+void xen_entry_INT80_compat(void);
+#endif
#endif

void x86_configure_nx(void);
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index 01fd0a7f48cd..e2ae5f3b9c2c 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -13,9 +13,8 @@ asmlinkage void divide_error(void);
asmlinkage void debug(void);
asmlinkage void nmi(void);
asmlinkage void int3(void);
-asmlinkage void xen_debug(void);
-asmlinkage void xen_int3(void);
-asmlinkage void xen_stack_segment(void);
+asmlinkage void xendebug(void);
+asmlinkage void xenint3(void);
asmlinkage void overflow(void);
asmlinkage void bounds(void);
asmlinkage void invalid_op(void);
@@ -38,6 +37,34 @@ asmlinkage void machine_check(void);
#endif /* CONFIG_X86_MCE */
asmlinkage void simd_coprocessor_error(void);

+#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV)
+asmlinkage void xen_divide_error(void);
+asmlinkage void xen_debug(void);
+asmlinkage void xen_xendebug(void);
+asmlinkage void xen_int3(void);
+asmlinkage void xen_xenint3(void);
+asmlinkage void xen_nmi(void);
+asmlinkage void xen_overflow(void);
+asmlinkage void xen_bounds(void);
+asmlinkage void xen_invalid_op(void);
+asmlinkage void xen_device_not_available(void);
+asmlinkage void xen_double_fault(void);
+asmlinkage void xen_coprocessor_segment_overrun(void);
+asmlinkage void xen_invalid_TSS(void);
+asmlinkage void xen_segment_not_present(void);
+asmlinkage void xen_stack_segment(void);
+asmlinkage void xen_general_protection(void);
+asmlinkage void xen_page_fault(void);
+asmlinkage void xen_async_page_fault(void);
+asmlinkage void xen_spurious_interrupt_bug(void);
+asmlinkage void xen_coprocessor_error(void);
+asmlinkage void xen_alignment_check(void);
+#ifdef CONFIG_X86_MCE
+asmlinkage void xen_machine_check(void);
+#endif /* CONFIG_X86_MCE */
+asmlinkage void xen_simd_coprocessor_error(void);
+#endif
+
#ifdef CONFIG_TRACING
asmlinkage void trace_page_fault(void);
#define trace_stack_segment stack_segment
@@ -54,6 +81,24 @@ asmlinkage void trace_page_fault(void);
#define trace_alignment_check alignment_check
#define trace_simd_coprocessor_error simd_coprocessor_error
#define trace_async_page_fault async_page_fault
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV)
+asmlinkage void xen_trace_page_fault(void);
+#define xen_trace_stack_segment xen_stack_segment
+#define xen_trace_divide_error xen_divide_error
+#define xen_trace_bounds xen_bounds
+#define xen_trace_invalid_op xen_invalid_op
+#define xen_trace_device_not_available xen_device_not_available
+#define xen_trace_coprocessor_segment_overrun xen_coprocessor_segment_overrun
+#define xen_trace_invalid_TSS xen_invalid_TSS
+#define xen_trace_segment_not_present xen_segment_not_present
+#define xen_trace_general_protection xen_general_protection
+#define xen_trace_spurious_interrupt_bug xen_spurious_interrupt_bug
+#define xen_trace_coprocessor_error xen_coprocessor_error
+#define xen_trace_alignment_check xen_alignment_check
+#define xen_trace_simd_coprocessor_error xen_simd_coprocessor_error
+#define xen_trace_async_page_fault xen_async_page_fault
+#endif
#endif

dotraplinkage void do_divide_error(struct pt_regs *, long);
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 99332f550c48..cf42206926af 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -20,7 +20,6 @@ static char syscalls_ia32[] = {
int main(void)
{
#ifdef CONFIG_PARAVIRT
- OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame);
OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
BLANK();
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index bc0a849589bb..a14df9eecfed 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -319,9 +319,6 @@ __visible struct pv_irq_ops pv_irq_ops = {
.irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable),
.safe_halt = native_safe_halt,
.halt = native_halt,
-#ifdef CONFIG_X86_64
- .adjust_exception_frame = paravirt_nop,
-#endif
};

__visible struct pv_cpu_ops pv_cpu_ops = {
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index bf54309b85da..a79a97a46a59 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -946,15 +946,15 @@ void __init early_trap_init(void)
* early stage. DEBUG_STACK will be equipped after cpu_init() in
* trap_init().
*
- * We don't need to set trace_idt_table like set_intr_gate(),
+ * We don't need to set trace_idt_table like set_intr_gate_pv(),
* since we don't have trace_debug and it will be reset to
* 'debug' in trap_init() by set_intr_gate_ist().
*/
set_intr_gate_notrace(X86_TRAP_DB, debug);
/* int3 can be called from all */
- set_system_intr_gate(X86_TRAP_BP, &int3);
+ set_system_intr_gate(X86_TRAP_BP, pv_trap_entry(int3));
#ifdef CONFIG_X86_32
- set_intr_gate(X86_TRAP_PF, page_fault);
+ set_intr_gate_pv(X86_TRAP_PF, page_fault);
#endif
load_idt(&idt_descr);
}
@@ -962,7 +962,7 @@ void __init early_trap_init(void)
void __init early_trap_pf_init(void)
{
#ifdef CONFIG_X86_64
- set_intr_gate(X86_TRAP_PF, page_fault);
+ set_intr_gate_pv(X86_TRAP_PF, page_fault);
#endif
}

@@ -978,42 +978,45 @@ void __init trap_init(void)
early_iounmap(p, 4);
#endif

- set_intr_gate(X86_TRAP_DE, divide_error);
- set_intr_gate_ist(X86_TRAP_NMI, &nmi, NMI_STACK);
+ set_intr_gate_pv(X86_TRAP_DE, divide_error);
+ set_intr_gate_ist(X86_TRAP_NMI, pv_trap_entry(nmi), NMI_STACK);
/* int4 can be called from all */
- set_system_intr_gate(X86_TRAP_OF, &overflow);
- set_intr_gate(X86_TRAP_BR, bounds);
- set_intr_gate(X86_TRAP_UD, invalid_op);
- set_intr_gate(X86_TRAP_NM, device_not_available);
+ set_system_intr_gate(X86_TRAP_OF, pv_trap_entry(overflow));
+ set_intr_gate_pv(X86_TRAP_BR, bounds);
+ set_intr_gate_pv(X86_TRAP_UD, invalid_op);
+ set_intr_gate_pv(X86_TRAP_NM, device_not_available);
#ifdef CONFIG_X86_32
set_task_gate(X86_TRAP_DF, GDT_ENTRY_DOUBLEFAULT_TSS);
#else
- set_intr_gate_ist(X86_TRAP_DF, &double_fault, DOUBLEFAULT_STACK);
+ set_intr_gate_ist(X86_TRAP_DF, pv_trap_entry(double_fault),
+ DOUBLEFAULT_STACK);
#endif
- set_intr_gate(X86_TRAP_OLD_MF, coprocessor_segment_overrun);
- set_intr_gate(X86_TRAP_TS, invalid_TSS);
- set_intr_gate(X86_TRAP_NP, segment_not_present);
- set_intr_gate(X86_TRAP_SS, stack_segment);
- set_intr_gate(X86_TRAP_GP, general_protection);
- set_intr_gate(X86_TRAP_SPURIOUS, spurious_interrupt_bug);
- set_intr_gate(X86_TRAP_MF, coprocessor_error);
- set_intr_gate(X86_TRAP_AC, alignment_check);
+ set_intr_gate_pv(X86_TRAP_OLD_MF, coprocessor_segment_overrun);
+ set_intr_gate_pv(X86_TRAP_TS, invalid_TSS);
+ set_intr_gate_pv(X86_TRAP_NP, segment_not_present);
+ set_intr_gate_pv(X86_TRAP_SS, stack_segment);
+ set_intr_gate_pv(X86_TRAP_GP, general_protection);
+ set_intr_gate_pv(X86_TRAP_SPURIOUS, spurious_interrupt_bug);
+ set_intr_gate_pv(X86_TRAP_MF, coprocessor_error);
+ set_intr_gate_pv(X86_TRAP_AC, alignment_check);
#ifdef CONFIG_X86_MCE
- set_intr_gate_ist(X86_TRAP_MC, &machine_check, MCE_STACK);
+ set_intr_gate_ist(X86_TRAP_MC, pv_trap_entry(machine_check), MCE_STACK);
#endif
- set_intr_gate(X86_TRAP_XF, simd_coprocessor_error);
+ set_intr_gate_pv(X86_TRAP_XF, simd_coprocessor_error);

/* Reserve all the builtin and the syscall vector: */
for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
set_bit(i, used_vectors);

#ifdef CONFIG_IA32_EMULATION
- set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_compat);
+ set_system_intr_gate(IA32_SYSCALL_VECTOR,
+ pv_trap_entry(entry_INT80_compat));
set_bit(IA32_SYSCALL_VECTOR, used_vectors);
#endif

#ifdef CONFIG_X86_32
- set_system_intr_gate(IA32_SYSCALL_VECTOR, entry_INT80_32);
+ set_system_intr_gate(IA32_SYSCALL_VECTOR,
+ pv_trap_entry(entry_INT80_32));
set_bit(IA32_SYSCALL_VECTOR, used_vectors);
#endif

@@ -1035,15 +1038,15 @@ void __init trap_init(void)
* in early_trap_init(). However, ITS works only after
* cpu_init() loads TSS. See comments in early_trap_init().
*/
- set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
+ set_intr_gate_ist(X86_TRAP_DB, pv_trap_entry(debug), DEBUG_STACK);
/* int3 can be called from all */
- set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
+ set_system_intr_gate_ist(X86_TRAP_BP, pv_trap_entry(int3), DEBUG_STACK);

x86_init.irqs.trap_init();

#ifdef CONFIG_X86_64
memcpy(&debug_idt_table, &idt_table, IDT_ENTRIES * 16);
- set_nmi_gate(X86_TRAP_DB, &debug);
- set_nmi_gate(X86_TRAP_BP, &int3);
+ set_nmi_gate(X86_TRAP_DB, pv_trap_entry(debug));
+ set_nmi_gate(X86_TRAP_BP, pv_trap_entry(int3));
#endif
}
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index 811e4ddb3f37..7e107142bc4f 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -598,24 +598,22 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
* so we should never see them. Warn if
* there's an unexpected IST-using fault handler.
*/
- if (addr == (unsigned long)debug)
- addr = (unsigned long)xen_debug;
- else if (addr == (unsigned long)int3)
- addr = (unsigned long)xen_int3;
- else if (addr == (unsigned long)stack_segment)
- addr = (unsigned long)xen_stack_segment;
- else if (addr == (unsigned long)double_fault) {
+ if (addr == (unsigned long)xen_debug)
+ addr = (unsigned long)xen_xendebug;
+ else if (addr == (unsigned long)xen_int3)
+ addr = (unsigned long)xen_xenint3;
+ else if (addr == (unsigned long)xen_double_fault) {
/* Don't need to handle these */
return 0;
#ifdef CONFIG_X86_MCE
- } else if (addr == (unsigned long)machine_check) {
+ } else if (addr == (unsigned long)xen_machine_check) {
/*
* when xen hypervisor inject vMCE to guest,
* use native mce handler to handle it
*/
;
#endif
- } else if (addr == (unsigned long)nmi)
+ } else if (addr == (unsigned long)xen_nmi)
/*
* Use the native version as well.
*/
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 33e92955e09d..d4eff5676cfa 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -123,9 +123,6 @@ static const struct pv_irq_ops xen_irq_ops __initconst = {

.safe_halt = xen_safe_halt,
.halt = xen_halt,
-#ifdef CONFIG_X86_64
- .adjust_exception_frame = xen_adjust_exception_frame,
-#endif
};

void __init xen_init_irq_ops(void)
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index c3df43141e70..f72ff71cc897 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -22,11 +22,46 @@

#include "xen-asm.h"

-ENTRY(xen_adjust_exception_frame)
- mov 8+0(%rsp), %rcx
- mov 8+8(%rsp), %r11
- ret $16
-ENDPROC(xen_adjust_exception_frame)
+.macro xen_pv_trap name
+ENTRY(xen_\name)
+ pop %rcx
+ pop %r11
+ jmp \name
+END(xen_\name)
+.endm
+
+xen_pv_trap divide_error
+xen_pv_trap debug
+xen_pv_trap xendebug
+xen_pv_trap int3
+xen_pv_trap xenint3
+xen_pv_trap nmi
+xen_pv_trap overflow
+xen_pv_trap bounds
+xen_pv_trap invalid_op
+xen_pv_trap device_not_available
+xen_pv_trap double_fault
+xen_pv_trap coprocessor_segment_overrun
+xen_pv_trap invalid_TSS
+xen_pv_trap segment_not_present
+xen_pv_trap stack_segment
+xen_pv_trap general_protection
+xen_pv_trap page_fault
+xen_pv_trap async_page_fault
+xen_pv_trap spurious_interrupt_bug
+xen_pv_trap coprocessor_error
+xen_pv_trap alignment_check
+#ifdef CONFIG_X86_MCE
+xen_pv_trap machine_check
+#endif /* CONFIG_X86_MCE */
+xen_pv_trap simd_coprocessor_error
+#ifdef CONFIG_IA32_EMULATION
+xen_pv_trap entry_INT80_compat
+#endif
+#ifdef CONFIG_TRACING
+xen_pv_trap trace_page_fault
+#endif
+xen_pv_trap hypervisor_callback

hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
/*
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 0d5004477db6..18e8729d42bd 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -145,7 +145,6 @@ DECL_ASM(void, xen_restore_fl_direct, unsigned long);
__visible void xen_iret(void);
__visible void xen_sysret32(void);
__visible void xen_sysret64(void);
-__visible void xen_adjust_exception_frame(void);

extern int xen_panic_handler_init(void);

--
2.12.3