Initial working version (Re: [PATCH 1/2] x86-32: Fix crashes withCPU hotplug on AMD machines)

From: Borislav Petkov
Date: Tue Aug 24 2010 - 03:34:00 EST


From: "H. Peter Anvin" <hpa@xxxxxxxxx>
Date: Thu, Aug 12, 2010 at 08:47:55AM -0700

> > I've been working on what you suggested and the patch boots on a 32-bit
> > PAE kernel but keeps rebooting on non-PAE after setting the stack_start
> > on the AP. I could send it out later for you to look at, if you like -
> > you should be able to spot what I'm missing :)...
> >
>
> Sounds good. This is obviously .37-or-higher material.

Ok, here we go, the version below boots a !PAE and a PAE kernel fine
in kvm. The approach taken is, IMHO, the least intrusive in trying
to touch code assuming swapper_pg_dir as little as possible and use
initial_page_table for boot/acpi sleep/reboot only.

For example, paths like init_memory_mapping ->
kernel_physical_mapping_init -> ... which fill in the kernel mappings
into swapper_pg_dir are left unchanged by switching to swapper_pg_dir
at the beginning of setup_arch() BTW, I guess this could be done even
earlier since I copy the mappings from initial_page_table so there's no
difference between the two at the switch point.

Later, after paging_init(), synching back all swapper_pg_dir changes
into initial_page_table is done for the APs to boot and have a stack
mapping in the initial_page_table so that they don't pagefault into a
triple fault (this was the problem with the previous version, btw.)

Please take a look and let me know if I've forgotten something before I
start hammering on it on a real hardware.

Thanks.

---
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index f686f49..8abde9e 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -26,7 +26,7 @@ struct mm_struct;
struct vm_area_struct;

extern pgd_t swapper_pg_dir[1024];
-extern pgd_t trampoline_pg_dir[1024];
+extern pgd_t initial_page_table[1024];

static inline void pgtable_cache_init(void) { }
static inline void check_pgt_cache(void) { }
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 7f3eba0..169be89 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -172,6 +172,4 @@ static inline void flush_tlb_kernel_range(unsigned long start,
flush_tlb_all();
}

-extern void zap_low_mappings(bool early);
-
#endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h
index 4dde797..f4500fb 100644
--- a/arch/x86/include/asm/trampoline.h
+++ b/arch/x86/include/asm/trampoline.h
@@ -13,16 +13,13 @@ extern unsigned char *trampoline_base;

extern unsigned long init_rsp;
extern unsigned long initial_code;
-extern unsigned long initial_page_table;
extern unsigned long initial_gs;

#define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE)

extern unsigned long setup_trampoline(void);
-extern void __init setup_trampoline_page_table(void);
extern void __init reserve_trampoline_memory(void);
#else
-static inline void setup_trampoline_page_table(void) {}
static inline void reserve_trampoline_memory(void) {}
#endif /* CONFIG_X86_TRAMPOLINE */

diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 33cec15..d04500e 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -12,6 +12,11 @@
#include <asm/segment.h>
#include <asm/desc.h>

+#ifdef CONFIG_X86_32
+#include <asm/pgtable.h>
+#include <asm/pgtable_32.h>
+#endif
+
#include "realmode/wakeup.h"
#include "sleep.h"

@@ -90,7 +95,7 @@ int acpi_save_state_mem(void)

#ifndef CONFIG_64BIT
header->pmode_entry = (u32)&wakeup_pmode_return;
- header->pmode_cr3 = (u32)(swsusp_pg_dir - __PAGE_OFFSET);
+ header->pmode_cr3 = (u32)(initial_page_table - __PAGE_OFFSET);
saved_magic = 0x12345678;
#else /* CONFIG_64BIT */
header->trampoline_segment = setup_trampoline() >> 4;
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 784360c..8b9c201 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -17,6 +17,7 @@
#include <asm/apic.h>
#include <asm/io_apic.h>
#include <asm/bios_ebda.h>
+#include <asm/tlbflush.h>

static void __init i386_default_early_setup(void)
{
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index fa8c1b8..005646a 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -183,7 +183,7 @@ default_entry:
#ifdef CONFIG_X86_PAE

/*
- * In PAE mode swapper_pg_dir is statically defined to contain enough
+ * In PAE mode initial_page_table is statically defined to contain enough
* entries to cover the VMSPLIT option (that is the top 1, 2 or 3
* entries). The identity mapping is handled by pointing two PGD
* entries to the first kernel PMD.
@@ -197,7 +197,7 @@ default_entry:
xorl %ebx,%ebx /* %ebx is kept at zero */

movl $pa(__brk_base), %edi
- movl $pa(swapper_pg_pmd), %edx
+ movl $pa(initial_pg_pmd), %edx
movl $PTE_IDENT_ATTR, %eax
10:
leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */
@@ -226,14 +226,14 @@ default_entry:
movl %eax, pa(max_pfn_mapped)

/* Do early initialization of the fixmap area */
- movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
- movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8)
+ movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
+ movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8)
#else /* Not PAE */

page_pde_offset = (__PAGE_OFFSET >> 20);

movl $pa(__brk_base), %edi
- movl $pa(swapper_pg_dir), %edx
+ movl $pa(initial_page_table), %edx
movl $PTE_IDENT_ATTR, %eax
10:
leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */
@@ -257,8 +257,8 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
movl %eax, pa(max_pfn_mapped)

/* Do early initialization of the fixmap area */
- movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
- movl %eax,pa(swapper_pg_dir+0xffc)
+ movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
+ movl %eax,pa(initial_page_table+0xffc)
#endif
jmp 3f
/*
@@ -334,7 +334,7 @@ ENTRY(startup_32_smp)
/*
* Enable paging
*/
- movl pa(initial_page_table), %eax
+ movl $pa(initial_page_table), %eax
movl %eax,%cr3 /* set the page table pointer.. */
movl %cr0,%eax
orl $X86_CR0_PG,%eax
@@ -614,8 +614,6 @@ ignore_int:
.align 4
ENTRY(initial_code)
.long i386_start_kernel
-ENTRY(initial_page_table)
- .long pa(swapper_pg_dir)

/*
* BSS section
@@ -623,20 +621,18 @@ ENTRY(initial_page_table)
__PAGE_ALIGNED_BSS
.align PAGE_SIZE_asm
#ifdef CONFIG_X86_PAE
-swapper_pg_pmd:
+initial_pg_pmd:
.fill 1024*KPMDS,4,0
#else
-ENTRY(swapper_pg_dir)
+ENTRY(initial_page_table)
.fill 1024,4,0
#endif
-swapper_pg_fixmap:
+initial_pg_fixmap:
.fill 1024,4,0
-#ifdef CONFIG_X86_TRAMPOLINE
-ENTRY(trampoline_pg_dir)
- .fill 1024,4,0
-#endif
ENTRY(empty_zero_page)
.fill 4096,1,0
+ENTRY(swapper_pg_dir)
+ .fill 1024,4,0

/*
* This starts the data section.
@@ -645,20 +641,20 @@ ENTRY(empty_zero_page)
__PAGE_ALIGNED_DATA
/* Page-aligned for the benefit of paravirt? */
.align PAGE_SIZE_asm
-ENTRY(swapper_pg_dir)
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
+ENTRY(initial_page_table)
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
# if KPMDS == 3
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x2000),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x2000),0
# elif KPMDS == 2
.long 0,0
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
# elif KPMDS == 1
.long 0,0
.long 0,0
- .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
# else
# error "Kernel PMDs should be 1, 2 or 3"
# endif
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index e3af342..ce5566e 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -371,16 +371,10 @@ void machine_real_restart(const unsigned char *code, int length)
CMOS_WRITE(0x00, 0x8f);
spin_unlock(&rtc_lock);

- /* Remap the kernel at virtual address zero, as well as offset zero
- from the kernel segment. This assumes the kernel segment starts at
- virtual address PAGE_OFFSET. */
- memcpy(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY,
- sizeof(swapper_pg_dir [0]) * KERNEL_PGD_PTRS);
-
/*
- * Use `swapper_pg_dir' as our page directory.
+ * Switch back to the initial page table.
*/
- load_cr3(swapper_pg_dir);
+ load_cr3(initial_page_table);

/* Write 0x1234 to absolute memory location 0x472. The BIOS reads
this on booting to tell it to "Bypass memory test (also warm
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index c3a4fbb..32322df 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -730,6 +730,17 @@ void __init setup_arch(char **cmdline_p)
#ifdef CONFIG_X86_32
memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
visws_early_detect();
+
+ /*
+ * copy kernel address range established so far and switch
+ * to the proper swapper page table
+ */
+ clone_pgd_range(swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+ initial_page_table + KERNEL_PGD_BOUNDARY,
+ min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
+
+ load_cr3(swapper_pg_dir);
+ __flush_tlb_all();
#else
printk(KERN_INFO "Command line: %s\n", boot_command_line);
#endif
@@ -1014,7 +1025,12 @@ void __init setup_arch(char **cmdline_p)
paging_init();
x86_init.paging.pagetable_setup_done(swapper_pg_dir);

- setup_trampoline_page_table();
+#ifdef CONFIG_X86_32
+ /* sync back kernel address range */
+ clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY,
+ swapper_pg_dir + KERNEL_PGD_BOUNDARY,
+ min_t(unsigned long, KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY));
+#endif

tboot_probe();

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 8b3bfc4..0d86d0b 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -300,22 +300,17 @@ notrace static void __cpuinit start_secondary(void *unused)
* most necessary things.
*/

-#ifdef CONFIG_X86_32
- /*
- * Switch away from the trampoline page-table
- *
- * Do this before cpu_init() because it needs to access per-cpu
- * data which may not be mapped in the trampoline page-table.
- */
- load_cr3(swapper_pg_dir);
- __flush_tlb_all();
-#endif
-
vmi_bringup();
cpu_init();
preempt_disable();
smp_callin();

+#ifdef CONFIG_X86_32
+ /* switch away from the initial page table */
+ load_cr3(swapper_pg_dir);
+ __flush_tlb_all();
+#endif
+
/* otherwise gcc will move up smp_processor_id before the cpu_init */
barrier();
/*
@@ -774,7 +769,6 @@ do_rest:
#ifdef CONFIG_X86_32
/* Stack for startup_32 can be just as for start_secondary onwards */
irq_ctx_init(cpu);
- initial_page_table = __pa(&trampoline_pg_dir);
#else
clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
initial_gs = per_cpu_offset(cpu);
@@ -923,7 +917,6 @@ int __cpuinit native_cpu_up(unsigned int cpu)
per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;

err = do_boot_cpu(apicid, cpu);
-
if (err) {
pr_debug("do_boot_cpu failed %d\n", err);
return -EIO;
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index a874495..f1488a3 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -38,20 +38,3 @@ unsigned long __trampinit setup_trampoline(void)
memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE);
return virt_to_phys(trampoline_base);
}
-
-void __init setup_trampoline_page_table(void)
-{
-#ifdef CONFIG_X86_32
- /* Copy kernel address range */
- clone_pgd_range(trampoline_pg_dir + KERNEL_PGD_BOUNDARY,
- swapper_pg_dir + KERNEL_PGD_BOUNDARY,
- min_t(unsigned long, KERNEL_PGD_PTRS,
- KERNEL_PGD_BOUNDARY));
-
- /* Initialize low mappings */
- clone_pgd_range(trampoline_pg_dir,
- swapper_pg_dir + KERNEL_PGD_BOUNDARY,
- min_t(unsigned long, KERNEL_PGD_PTRS,
- KERNEL_PGD_BOUNDARY));
-#endif
-}
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index bca7909..adad48b 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -548,48 +548,6 @@ static void __init pagetable_init(void)
permanent_kmaps_init(pgd_base);
}

-#ifdef CONFIG_ACPI_SLEEP
-/*
- * ACPI suspend needs this for resume, because things like the intel-agp
- * driver might have split up a kernel 4MB mapping.
- */
-char swsusp_pg_dir[PAGE_SIZE]
- __attribute__ ((aligned(PAGE_SIZE)));
-
-static inline void save_pg_dir(void)
-{
- memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
-}
-#else /* !CONFIG_ACPI_SLEEP */
-static inline void save_pg_dir(void)
-{
-}
-#endif /* !CONFIG_ACPI_SLEEP */
-
-void zap_low_mappings(bool early)
-{
- int i;
-
- /*
- * Zap initial low-memory mappings.
- *
- * Note that "pgd_clear()" doesn't do it for
- * us, because pgd_clear() is a no-op on i386.
- */
- for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) {
-#ifdef CONFIG_X86_PAE
- set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
-#else
- set_pgd(swapper_pg_dir+i, __pgd(0));
-#endif
- }
-
- if (early)
- __flush_tlb();
- else
- flush_tlb_all();
-}
-
pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
EXPORT_SYMBOL_GPL(__supported_pte_mask);

@@ -958,9 +916,6 @@ void __init mem_init(void)

if (boot_cpu_data.wp_works_ok < 0)
test_wp_bit();
-
- save_pg_dir();
- zap_low_mappings(true);
}

#ifdef CONFIG_MEMORY_HOTPLUG

--
Regards/Gruss,
Boris.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/