Re: [PATCH 06/11] x86/xen: Add i386 kexec/kdump implementation

From: Jan Beulich
Date: Fri Sep 28 2012 - 04:11:43 EST


>>> On 27.09.12 at 20:06, Daniel Kiper <daniel.kiper@xxxxxxxxxx> wrote:
> Add i386 kexec/kdump implementation.

So this as well as the subsequent patch introduces quite a bit of
duplicate code. The old 2.6.18 kernel had an initial pair of cleanup
patches (attached in their forward ported form for 3.6-rc6) that
would allow reducing the amount of duplication, particularly by
eliminating the need to clone relocate_kernel_??.S altogether.

Additionally, in the PAE case (which is the only relevant one for
a 32-bit Xen kernel) I'm missing the address restriction
enforcement for the PGD, without which the __ma() conversion
result may not fit into the field it gets stored into.

Finally, as noticed in an earlier patch already, you appear to
re-introduce stuff long dropped from the kernel - the forward
ported kernels get away with just setting PA_CONTROL_PAGE,
PA_PGD, and PA_SWAP_PAGE in the page list. Since the number
and purpose of the pages is established entirely by the guest
kernel, all you need to obey is that the hypervisor expects
alternating PA_/VA_ pairs (where the VA_ ones can be left
unpopulated). Perhaps taking a look at a recent SLES kernel
would help...

Jan

Subject: kexec: Move asm segment handling code to the assembly file (x86_64)
From: http://xenbits.xensource.com/xen-unstable.hg (tip 13816)
Patch-mainline: n/a

This patch moves the idt, gdt, and segment handling code from machine_kexec.c
to relocate_kernel.S. The main reason behind this move is to avoid code
duplication in the Xen hypervisor. With this patch all code required to kexec
is put on the control page.

On top of that this patch also counts as a cleanup - I think it is much
nicer to write assembly directly in assembly files than wrap inline assembly
in C functions for no apparent reason.

Signed-off-by: Magnus Damm <magnus@xxxxxxxxxxxxx>
Acked-by: jbeulich@xxxxxxxxxx

Applies to 2.6.19-rc1.
jb: fixed up register usage for 2.6.30 (bnc#545206)

--- head-2011-08-09.orig/arch/x86/kernel/machine_kexec_64.c 2011-08-09 10:21:04.000000000 +0200
+++ head-2011-08-09/arch/x86/kernel/machine_kexec_64.c 2010-04-15 09:38:56.000000000 +0200
@@ -203,47 +203,6 @@ static int init_pgtable(struct kimage *i
return init_transition_pgtable(image, level4p);
}

-static void set_idt(void *newidt, u16 limit)
-{
- struct desc_ptr curidt;
-
- /* x86-64 supports unaliged loads & stores */
- curidt.size = limit;
- curidt.address = (unsigned long)newidt;
-
- __asm__ __volatile__ (
- "lidtq %0\n"
- : : "m" (curidt)
- );
-};
-
-
-static void set_gdt(void *newgdt, u16 limit)
-{
- struct desc_ptr curgdt;
-
- /* x86-64 supports unaligned loads & stores */
- curgdt.size = limit;
- curgdt.address = (unsigned long)newgdt;
-
- __asm__ __volatile__ (
- "lgdtq %0\n"
- : : "m" (curgdt)
- );
-};
-
-static void load_segments(void)
-{
- __asm__ __volatile__ (
- "\tmovl %0,%%ds\n"
- "\tmovl %0,%%es\n"
- "\tmovl %0,%%ss\n"
- "\tmovl %0,%%fs\n"
- "\tmovl %0,%%gs\n"
- : : "a" (__KERNEL_DS) : "memory"
- );
-}
-
int machine_kexec_prepare(struct kimage *image)
{
unsigned long start_pgtable;
@@ -311,24 +270,6 @@ void machine_kexec(struct kimage *image)
page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
<< PAGE_SHIFT);

- /*
- * The segment registers are funny things, they have both a
- * visible and an invisible part. Whenever the visible part is
- * set to a specific selector, the invisible part is loaded
- * with from a table in memory. At no other time is the
- * descriptor table in memory accessed.
- *
- * I take advantage of this here by force loading the
- * segments, before I zap the gdt with an invalid value.
- */
- load_segments();
- /*
- * The gdt & idt are now invalid.
- * If you want to load them you must set up your own idt & gdt.
- */
- set_gdt(phys_to_virt(0), 0);
- set_idt(phys_to_virt(0), 0);
-
/* now call it */
image->start = relocate_kernel((unsigned long)image->head,
(unsigned long)page_list,
--- head-2011-08-09.orig/arch/x86/kernel/relocate_kernel_64.S 2011-08-09 10:19:07.000000000 +0200
+++ head-2011-08-09/arch/x86/kernel/relocate_kernel_64.S 2011-08-09 10:21:24.000000000 +0200
@@ -91,13 +91,30 @@ relocate_kernel:
/* Switch to the identity mapped page tables */
movq %r9, %cr3

+ /* setup idt */
+ lidtq idt_80 - relocate_kernel(%r8)
+
+ /* setup gdt */
+ leaq gdt - relocate_kernel(%r8), %rax
+ movq %rax, (gdt_80 - relocate_kernel) + 2(%r8)
+ lgdtq gdt_80 - relocate_kernel(%r8)
+
+ /* setup data segment registers */
+ xorl %eax, %eax
+ movl %eax, %ds
+ movl %eax, %es
+ movl %eax, %fs
+ movl %eax, %gs
+ movl %eax, %ss
+
/* setup a new stack at the end of the physical control page */
lea PAGE_SIZE(%r8), %rsp

- /* jump to identity mapped page */
+ /* load new code segment and jump to identity mapped page */
addq $(identity_mapped - relocate_kernel), %r8
+ pushq $(gdt_cs - gdt)
pushq %r8
- ret
+ lretq

identity_mapped:
/* set return address to 0 if not preserving context */
@@ -264,5 +281,20 @@ swap_pages:
3:
ret

+ .align 16
+gdt:
+ .quad 0x0000000000000000 /* NULL descriptor */
+gdt_cs:
+ .quad 0x00af9a000000ffff
+gdt_end:
+
+gdt_80:
+ .word gdt_end - gdt - 1 /* limit */
+ .quad 0 /* base - filled in by code above */
+
+idt_80:
+ .word 0 /* limit */
+ .quad 0 /* base */
+
.globl kexec_control_code_size
.set kexec_control_code_size, . - relocate_kernel
Subject: kexec: Move asm segment handling code to the assembly file (i386)
From: http://xenbits.xensource.com/xen-unstable.hg (tip 13816)
Patch-mainline: n/a

This patch moves the idt, gdt, and segment handling code from machine_kexec.c
to relocate_kernel.S. The main reason behind this move is to avoid code
duplication in the Xen hypervisor. With this patch all code required to kexec
is put on the control page.

On top of that this patch also counts as a cleanup - I think it is much
nicer to write assembly directly in assembly files than wrap inline assembly
in C functions for no apparent reason.

Signed-off-by: Magnus Damm <magnus@xxxxxxxxxxxxx>
Acked-by: jbeulich@xxxxxxxxxx

Applies to 2.6.19-rc1.
jb: fixed up register usage (paralleling what's needed for 2.6.30 on x86-64)

--- head.orig/arch/x86/kernel/machine_kexec_32.c 2012-04-10 14:24:22.000000000 +0200
+++ head/arch/x86/kernel/machine_kexec_32.c 2012-04-10 14:50:08.000000000 +0200
@@ -26,48 +26,6 @@
#include <asm/cacheflush.h>
#include <asm/debugreg.h>

-static void set_idt(void *newidt, __u16 limit)
-{
- struct desc_ptr curidt;
-
- /* ia32 supports unaliged loads & stores */
- curidt.size = limit;
- curidt.address = (unsigned long)newidt;
-
- load_idt(&curidt);
-}
-
-
-static void set_gdt(void *newgdt, __u16 limit)
-{
- struct desc_ptr curgdt;
-
- /* ia32 supports unaligned loads & stores */
- curgdt.size = limit;
- curgdt.address = (unsigned long)newgdt;
-
- load_gdt(&curgdt);
-}
-
-static void load_segments(void)
-{
-#define __STR(X) #X
-#define STR(X) __STR(X)
-
- __asm__ __volatile__ (
- "\tljmp $"STR(__KERNEL_CS)",$1f\n"
- "\t1:\n"
- "\tmovl $"STR(__KERNEL_DS)",%%eax\n"
- "\tmovl %%eax,%%ds\n"
- "\tmovl %%eax,%%es\n"
- "\tmovl %%eax,%%fs\n"
- "\tmovl %%eax,%%gs\n"
- "\tmovl %%eax,%%ss\n"
- : : : "eax", "memory");
-#undef STR
-#undef __STR
-}
-
static void machine_kexec_free_page_tables(struct kimage *image)
{
free_page((unsigned long)image->arch.pgd);
@@ -227,24 +185,6 @@ void machine_kexec(struct kimage *image)
page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page)
<< PAGE_SHIFT);

- /*
- * The segment registers are funny things, they have both a
- * visible and an invisible part. Whenever the visible part is
- * set to a specific selector, the invisible part is loaded
- * with from a table in memory. At no other time is the
- * descriptor table in memory accessed.
- *
- * I take advantage of this here by force loading the
- * segments, before I zap the gdt with an invalid value.
- */
- load_segments();
- /*
- * The gdt & idt are now invalid.
- * If you want to load them you must set up your own idt & gdt.
- */
- set_gdt(phys_to_virt(0), 0);
- set_idt(phys_to_virt(0), 0);
-
/* now call it */
image->start = relocate_kernel_ptr((unsigned long)image->head,
(unsigned long)page_list,
--- head.orig/arch/x86/kernel/relocate_kernel_32.S 2011-10-24 09:10:05.000000000 +0200
+++ head/arch/x86/kernel/relocate_kernel_32.S 2011-08-09 10:21:17.000000000 +0200
@@ -87,14 +87,32 @@ relocate_kernel:
movl PTR(PA_PGD)(%ebp), %eax
movl %eax, %cr3

+ /* setup idt */
+ lidtl idt_48 - relocate_kernel(%edi)
+
+ /* setup gdt */
+ leal gdt - relocate_kernel(%edi), %eax
+ movl %eax, (gdt_48 - relocate_kernel) + 2(%edi)
+ lgdtl gdt_48 - relocate_kernel(%edi)
+
+ /* setup data segment registers */
+ mov $(gdt_ds - gdt), %eax
+ mov %eax, %ds
+ mov %eax, %es
+ mov %eax, %fs
+ mov %eax, %gs
+ mov %eax, %ss
+
/* setup a new stack at the end of the physical control page */
lea PAGE_SIZE(%edi), %esp

- /* jump to identity mapped page */
+ /* load new code segment and jump to identity mapped page */
+ pushl $0
+ pushl $(gdt_cs - gdt)
movl %edi, %eax
addl $(identity_mapped - relocate_kernel), %eax
pushl %eax
- ret
+ iretl

identity_mapped:
/* set return address to 0 if not preserving context */
@@ -273,5 +291,22 @@ swap_pages:
popl %ebp
ret

+ .align 16
+gdt:
+ .quad 0x0000000000000000 /* NULL descriptor */
+gdt_cs:
+ .quad 0x00cf9a000000ffff /* kernel 4GB code at 0x00000000 */
+gdt_ds:
+ .quad 0x00cf92000000ffff /* kernel 4GB data at 0x00000000 */
+gdt_end:
+
+gdt_48:
+ .word gdt_end - gdt - 1 /* limit */
+ .long 0 /* base - filled in by code above */
+
+idt_48:
+ .word 0 /* limit */
+ .long 0 /* base */
+
.globl kexec_control_code_size
.set kexec_control_code_size, . - relocate_kernel