Re: [tip:x86/microcode] x86/microcode_intel_early.c: Early update ucode on Intel's CPU

From: H. Peter Anvin
Date: Wed Dec 12 2012 - 01:59:16 EST


On 12/11/2012 04:27 PM, Yinghai Lu wrote:
> On Tue, Dec 11, 2012 at 3:57 PM, H. Peter Anvin <hpa@xxxxxxxxx> wrote:
>> Well, we could invoke it on the bootloader page tables, but as you say
>> it may not be a good idea... depending on how much memory we may be
>> talking about. One solution -- which I have to admit is starting to
>> sound really good -- is to set up a #PF handler which cycles through a
>> set of page tables and creates a "virtual identity map"... it does have
>> the advantage of making the entire physical address space available
>> without any additional funnies.
>
> so that #PF handler will have to work before
> arch/x86/kernel/setup.c::setup_arch()/early_trap_init()
>
> early_trap_init() will install another handler there for #PF
>
> for 64-bit, moving early_ioremap_init ahead is very simple, like the attached patch
>
> but for 32-bit it looks like it is not that easy.


Here is an incomplete patch, for illustration purposes only, of what I mean by an early-mapping #PF handler. This creates a set of transient page tables on demand, which allows us to access memory as if it were all mapped, but using only O(1) storage. The replacement policy is trivial: if we run out, we start over from scratch.
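Purely as a sketch of that replacement policy -- this is not the patch below, and handle_early_fault()/alloc_pgt_page() are made-up names; only the pool size mirrors EARLY_DYNAMIC_PAGE_TABLES -- the idea boils down to something like this userspace model:

/*
 * Illustrative userspace model of the O(1) early page-table pool:
 * table pages come from a fixed pool; when the pool runs dry, the
 * handler wipes everything except the kernel-symbol slot and starts
 * over from scratch.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define POOL_PAGES	64		/* mirrors EARLY_DYNAMIC_PAGE_TABLES */
#define ENTRIES		512

static uint64_t top[ENTRIES];			/* stands in for early_level4_pgt */
static uint64_t pool[POOL_PAGES][ENTRIES];	/* stands in for early_dynamic_pgts */
static unsigned int next_pgt, resets;

static void reset_early_tables(void)
{
	/* keep the last slot: that is where the kernel symbol map lives */
	memset(top, 0, sizeof(top[0]) * (ENTRIES - 1));
	next_pgt = 0;
	resets++;
}

static uint64_t *alloc_pgt_page(void)
{
	if (next_pgt >= POOL_PAGES - 1)
		reset_early_tables();	/* trivial policy: start from scratch */
	memset(pool[next_pgt], 0, sizeof(pool[next_pgt]));
	return pool[next_pgt++];
}

/* One "page fault": make sure the top-level slot for this address is filled */
static void handle_early_fault(uint64_t address)
{
	unsigned int i = (address >> 39) & (ENTRIES - 1);

	if (!top[i])
		top[i] = (uint64_t)(uintptr_t)alloc_pgt_page() | 0x3;
	/* a real handler would fill the PUD/PMD levels the same way */
}

int main(void)
{
	/* touch one address in every 512 GiB slice of a 48-bit space */
	for (uint64_t a = 0; a < (1ULL << 48); a += 1ULL << 39)
		handle_early_fault(a);
	printf("pool slots in use: %u, resets so far: %u\n", next_pgt, resets);
	return 0;
}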

The "identity page tables" used during the transition to high virtual addresses are kind of magic; there is a bunch of extra aliases created, but the way it is done guarantees that the range we actually cares about is mapped correctly. The aliases don't matter and get scrubbed shortly thereafter anyway.

This should, obviously, be used on native only -- in particular Xen should instead rely on the initial page tables provided by the domain builder, which should map all physical memory.

Once the proper memory-map-aware page tables have been built, we should turn this off by switching to the newly built, real init_level4_pgt instead.
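The hand-off itself should be nothing more exotic than loading the new tables, along the lines of (sketch only, not part of the patch below):

	/* once the real init_level4_pgt has been fully populated */
	load_cr3(init_level4_pgt);	/* the early tables and the #PF mapper are now dead */
	__flush_tlb_all();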

-hpa

--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel. I don't speak on their behalf.

diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 766ea16..2d88344 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -1,6 +1,8 @@
#ifndef _ASM_X86_PGTABLE_64_DEFS_H
#define _ASM_X86_PGTABLE_64_DEFS_H

+#include <asm/sparsemem.h>
+
#ifndef __ASSEMBLY__
#include <linux/types.h>

@@ -60,4 +62,6 @@ typedef struct { pteval_t pte; } pte_t;
#define MODULES_END _AC(0xffffffffff000000, UL)
#define MODULES_LEN (MODULES_END - MODULES_VADDR)

+#define EARLY_DYNAMIC_PAGE_TABLES 64
+
#endif /* _ASM_X86_PGTABLE_64_DEFS_H */
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 037df57..9443c77 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -26,11 +26,73 @@
#include <asm/e820.h>
#include <asm/bios_ebda.h>

-static void __init zap_identity_mappings(void)
+/*
+ * Manage page tables very early on.
+ */
+extern pgd_t early_level4_pgt[PTRS_PER_PGD];
+extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
+static unsigned int __initdata next_early_pgt = 2, early_pgt_resets = 0;
+
+/* Wipe all early page tables except for the kernel symbol map */
+static void __init reset_early_page_tables(void)
{
- pgd_t *pgd = pgd_offset_k(0UL);
- pgd_clear(pgd);
- __flush_tlb_all();
+ unsigned long i;
+
+ for (i = 0; i < PTRS_PER_PGD-1; i++)
+ early_level4_pgt[i].pgd = 0;
+
+ next_early_pgt = 0;
+ early_pgt_resets++;
+
+ __native_flush_tlb();
+}
+
+/* Create a new PMD entry */
+int __init early_make_pgtable(unsigned long address)
+{
+ unsigned long physaddr = address - __PAGE_OFFSET;
+ unsigned long i;
+ pgdval_t pgd, *pgd_p;
+ pudval_t *pud_p;
+ pmdval_t pmd, *pmd_p;
+
+ if (physaddr >= MAXMEM)
+ return -1; /* Invalid address - puke */
+
+ i = (address >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1);
+ pgd_p = &early_level4_pgt[i].pgd;
+ pgd = *pgd_p;
+
+ /*
+ * The use of __START_KERNEL_map rather than __PAGE_OFFSET here is
+ * critical -- __PAGE_OFFSET would point us back into the dynamic
+ * range and we might end up looping forever...
+ */
+ if (pgd && next_early_pgt < EARLY_DYNAMIC_PAGE_TABLES) {
+ pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map);
+ } else {
+ if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES-1)
+ reset_early_page_tables();
+
+ pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
+ for (i = 0; i < PTRS_PER_PUD; i++)
+ pud_p[i] = 0;
+
+ *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + _KERNPG_TABLE;
+ }
+ i = (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
+ pud_p += i;
+
+ pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++];
+ pmd = (physaddr & PUD_MASK) + (__PAGE_KERNEL_LARGE & ~_PAGE_GLOBAL);
+ for (i = 0; i < PTRS_PER_PMD; i++) {
+ pmd_p[i] = pmd;
+ pmd += PMD_SIZE;
+ }
+
+ *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + _KERNPG_TABLE;
+
+ return 0;
}

/* Don't add a printk in there. printk relies on the PDA which is not initialized
@@ -70,12 +132,13 @@ void __init x86_64_start_kernel(char * real_mode_data)
(__START_KERNEL & PGDIR_MASK)));
BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);

+ /* Kill off the identity-map trampoline */
+ reset_early_page_tables();
+
/* clear bss before set_intr_gate with early_idt_handler */
clear_bss();

- /* Make NULL pointers segfault */
- zap_identity_mappings();
-
+ /* XXX - this is wrong... we need to build page tables from scratch */
max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;

for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 94bf9cc..e13ff91 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -47,14 +47,13 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map)
.code64
.globl startup_64
startup_64:
-
/*
* At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
* and someone has loaded an identity mapped page table
* for us. These identity mapped page tables map all of the
* kernel pages and possibly all of memory.
*
- * %esi holds a physical pointer to real_mode_data.
+ * %rsi holds a physical pointer to real_mode_data.
*
* We come here either directly from a 64bit bootloader, or from
* arch/x86_64/boot/compressed/head.S.
@@ -66,7 +65,8 @@ startup_64:
* tables and then reload them.
*/

- /* Compute the delta between the address I am compiled to run at and the
+ /*
+ * Compute the delta between the address I am compiled to run at and the
* address I am actually running at.
*/
leaq _text(%rip), %rbp
@@ -78,53 +78,66 @@ startup_64:
testl %eax, %eax
jnz bad_address

- /* Is the address too large? */
- leaq _text(%rip), %rdx
- movq $PGDIR_SIZE, %rax
- cmpq %rax, %rdx
- jae bad_address
-
- /* Fixup the physical addresses in the page table
+ /*
+ * Is the address too large?
*/
- addq %rbp, init_level4_pgt + 0(%rip)
- addq %rbp, init_level4_pgt + (L4_PAGE_OFFSET*8)(%rip)
- addq %rbp, init_level4_pgt + (L4_START_KERNEL*8)(%rip)
+ leaq _text(%rip), %rax
+ shrq $MAX_PHYSMEM_BITS, %rax
+ jnz bad_address

- addq %rbp, level3_ident_pgt + 0(%rip)
+ /*
+ * Fixup the physical addresses in the page table
+ */
+ addq %rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip)

addq %rbp, level3_kernel_pgt + (510*8)(%rip)
addq %rbp, level3_kernel_pgt + (511*8)(%rip)

addq %rbp, level2_fixmap_pgt + (506*8)(%rip)

- /* Add an Identity mapping if I am above 1G */
+ /*
+ * Set up the identity mapping for the switchover. These
+ * entries should *NOT* have the global bit set! This also
+ * creates a bunch of nonsense entries but that is fine --
+ * it avoids problems around wraparound.
+ */
leaq _text(%rip), %rdi
- andq $PMD_PAGE_MASK, %rdi
+ leaq early_level4_pgt(%rip), %rbx

movq %rdi, %rax
- shrq $PUD_SHIFT, %rax
- andq $(PTRS_PER_PUD - 1), %rax
- jz ident_complete
+ shrq $PGDIR_SHIFT, %rax
+
+ leaq (4096 + _KERNPG_TABLE)(%rbx), %rdx
+ movq %rdx, 0(%rbx,%rax,8)
+ movq %rdx, 8(%rbx,%rax,8)

- leaq (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
- leaq level3_ident_pgt(%rip), %rbx
- movq %rdx, 0(%rbx, %rax, 8)
+ addq $4096, %rdx
+ movq %rdi, %rax
+ shrq $PUD_SHIFT, %rax
+ andl $(PTRS_PER_PUD-1), %eax
+ movq %rdx, (4096+0)(%rbx,%rax,8)
+ movq %rdx, (4096+8)(%rbx,%rax,8)

+ addq $8192, %rbx
movq %rdi, %rax
- shrq $PMD_SHIFT, %rax
- andq $(PTRS_PER_PMD - 1), %rax
- leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx
- leaq level2_spare_pgt(%rip), %rbx
- movq %rdx, 0(%rbx, %rax, 8)
-ident_complete:
+ shrq $PMD_SHIFT, %rdi
+ addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax
+ movl $PTRS_PER_PMD, %ecx

+1:
+ andq $(PTRS_PER_PMD - 1), %rdi
+ movq %rax, (%rbx,%rdi,8)
+ incq %rdi
+ addq $PMD_SIZE, %rax
+ decl %ecx
+ jnz 1b
+
/*
* Fixup the kernel text+data virtual addresses. Note that
* we might write invalid pmds, when the kernel is relocated
* cleanup_highmap() fixes this up along with the mappings
* beyond _end.
*/
-
leaq level2_kernel_pgt(%rip), %rdi
leaq 4096(%rdi), %r8
/* See if it is a valid page table entry */
@@ -149,7 +162,7 @@ ENTRY(secondary_startup_64)
* At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
* and someone has loaded a mapped page table.
*
- * %esi holds a physical pointer to real_mode_data.
+ * %rsi holds a physical pointer to real_mode_data.
*
* We come here either from startup_64 (using physical addresses)
* or from trampoline.S (using virtual addresses).
@@ -196,7 +209,7 @@ ENTRY(secondary_startup_64)
movq %rax, %cr0

/* Setup a boot time stack */
- movq stack_start(%rip),%rsp
+ movq stack_start(%rip), %rsp

/* zero EFLAGS after setting rsp */
pushq $0
@@ -236,31 +249,31 @@ ENTRY(secondary_startup_64)
movl initial_gs+4(%rip),%edx
wrmsr

- /* esi is pointer to real mode structure with interesting info.
+ /* rsi is pointer to real mode structure with interesting info.
pass it to C */
- movl %esi, %edi
+ movq %rsi, %rdi

/* Finally jump to run C code and to be on real kernel address
* Since we are running on identity-mapped space we have to jump
* to the full 64bit address, this is only possible as indirect
* jump. In addition we need to ensure %cs is set so we make this
- * a far return.
+ * a far jump.
*/
- movq initial_code(%rip),%rax
pushq $0 # fake return address to stop unwinder
- pushq $__KERNEL_CS # set correct cs
- pushq %rax # target address in negative space
- lretq
+ /* gas 2.22 is buggy and mis-assembles ljmpq */
+ rex64 ljmp *initial_code(%rip)

/* SMP bootup changes these two */
__REFDATA
- .align 8
- ENTRY(initial_code)
+ .balign 8
+ GLOBAL(initial_code)
.quad x86_64_start_kernel
- ENTRY(initial_gs)
+ .word __KERNEL_CS
+ .balign 8
+ GLOBAL(initial_gs)
.quad INIT_PER_CPU_VAR(irq_stack_union)

- ENTRY(stack_start)
+ GLOBAL(stack_start)
.quad init_thread_union+THREAD_SIZE-8
.word 0
__FINITDATA
@@ -268,7 +281,7 @@ ENTRY(secondary_startup_64)
bad_address:
jmp bad_address

- .section ".init.text","ax"
+ __INIT
.globl early_idt_handlers
early_idt_handlers:
# 104(%rsp) %rflags
@@ -305,14 +318,22 @@ ENTRY(early_idt_handler)
pushq %r11 # 0(%rsp)

cmpl $__KERNEL_CS,96(%rsp)
- jne 10f
+ jne 11f

+ cmpl $14,72(%rsp) # Page fault?
+ jnz 10f
+ GET_CR2_INTO(%rdi) # can clobber any volatile register if pv
+ call early_make_pgtable
+ andl %eax,%eax
+ jz 20f # All good
+
+10:
leaq 88(%rsp),%rdi # Pointer to %rip
call early_fixup_exception
andl %eax,%eax
jnz 20f # Found an exception entry

-10:
+11:
#ifdef CONFIG_EARLY_PRINTK
GET_CR2_INTO(%r9) # can clobber any volatile register if pv
movl 80(%rsp),%r8d # error code
@@ -334,7 +355,7 @@ ENTRY(early_idt_handler)
1: hlt
jmp 1b

-20: # Exception table entry found
+20: # Exception table entry found or page table generated
popq %r11
popq %r10
popq %r9
@@ -348,6 +369,8 @@ ENTRY(early_idt_handler)
decl early_recursion_flag(%rip)
INTERRUPT_RETURN

+ __INITDATA
+
.balign 4
early_recursion_flag:
.long 0
@@ -358,11 +381,10 @@ early_idt_msg:
early_idt_ripmsg:
.asciz "RIP %s\n"
#endif /* CONFIG_EARLY_PRINTK */
- .previous

#define NEXT_PAGE(name) \
.balign PAGE_SIZE; \
-ENTRY(name)
+GLOBAL(name)

/* Automate the creation of 1 to 1 mapping pmd entries */
#define PMDS(START, PERM, COUNT) \
@@ -372,46 +394,21 @@ ENTRY(name)
i = i + 1 ; \
.endr

- .data
- /*
- * This default setting generates an ident mapping at address 0x100000
- * and a mapping for the kernel that precisely maps virtual address
- * 0xffffffff80000000 to physical address 0x000000. (always using
- * 2Mbyte large pages provided by PAE mode)
- */
-NEXT_PAGE(init_level4_pgt)
- .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .org init_level4_pgt + L4_PAGE_OFFSET*8, 0
- .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .org init_level4_pgt + L4_START_KERNEL*8, 0
- /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
+ __INITDATA
+NEXT_PAGE(early_level4_pgt)
+ .fill 511,8,0
.quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE

-NEXT_PAGE(level3_ident_pgt)
- .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
- .fill 511,8,0
+NEXT_PAGE(early_dynamic_pgts)
+ .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0

+ .data
NEXT_PAGE(level3_kernel_pgt)
.fill L3_START_KERNEL,8,0
/* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
.quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
.quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE

-NEXT_PAGE(level2_fixmap_pgt)
- .fill 506,8,0
- .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
- /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
- .fill 5,8,0
-
-NEXT_PAGE(level1_fixmap_pgt)
- .fill 512,8,0
-
-NEXT_PAGE(level2_ident_pgt)
- /* Since I easily can, map the first 1G.
- * Don't set NX because code runs from these pages.
- */
- PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
-
NEXT_PAGE(level2_kernel_pgt)
/*
* 512 MB kernel mapping. We spend a full page on this pagetable
@@ -426,11 +423,13 @@ NEXT_PAGE(level2_kernel_pgt)
PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
KERNEL_IMAGE_SIZE/PMD_SIZE)

-NEXT_PAGE(level2_spare_pgt)
- .fill 512, 8, 0
+NEXT_PAGE(level2_fixmap_pgt)
+ .fill 506,8,0
+ .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
+ /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
+ .fill 5,8,0

#undef PMDS
-#undef NEXT_PAGE

.data
.align 16
@@ -456,6 +455,7 @@ ENTRY(nmi_idt_table)
.skip IDT_ENTRIES * 16

__PAGE_ALIGNED_BSS
- .align PAGE_SIZE
-ENTRY(empty_zero_page)
+NEXT_PAGE(empty_zero_page)
+ .skip PAGE_SIZE
+NEXT_PAGE(init_level4_pgt)
.skip PAGE_SIZE