Re: [PATCH 2/8] x86/head: Refactor 32-bit pgtable setup

From: hpa
Date: Fri Oct 14 2016 - 14:31:45 EST


On October 14, 2016 11:05:12 AM PDT, Boris Ostrovsky <boris.ostrovsky@xxxxxxxxxx> wrote:
>From: Matt Fleming <matt@xxxxxxxxxxxxxxxxxxx>
>
>The new Xen PVH entry point requires page tables to be setup by the
>kernel since it is entered with paging disabled.
>
>Pull the common code out of head_32.S and into pgtable_32.S so that
>setup_pgtable_32 can be invoked from both the new Xen entry point and
>the existing startup_32 code.
>
>Cc: Boris Ostrovsky <boris.ostrovsky@xxxxxxxxxx>
>Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
>Cc: Ingo Molnar <mingo@xxxxxxxxxx>
>Cc: "H. Peter Anvin" <hpa@xxxxxxxxx>
>Cc: x86@xxxxxxxxxx
>Signed-off-by: Matt Fleming <matt@xxxxxxxxxxxxxxxxxxx>
>---
> arch/x86/Makefile | 2 +
> arch/x86/kernel/Makefile | 2 +
>arch/x86/kernel/head_32.S | 168
>+------------------------------------
>arch/x86/kernel/pgtable_32.S | 196
>+++++++++++++++++++++++++++++++++++++++++++
> 4 files changed, 201 insertions(+), 167 deletions(-)
> create mode 100644 arch/x86/kernel/pgtable_32.S
>
>diff --git a/arch/x86/Makefile b/arch/x86/Makefile
>index 2d44933..67cc771 100644
>--- a/arch/x86/Makefile
>+++ b/arch/x86/Makefile
>@@ -204,6 +204,8 @@ head-y += arch/x86/kernel/head$(BITS).o
> head-y += arch/x86/kernel/ebda.o
> head-y += arch/x86/kernel/platform-quirks.o
>
>+head-$(CONFIG_X86_32) += arch/x86/kernel/pgtable_32.o
>+
> libs-y += arch/x86/lib/
>
> # See arch/x86/Kbuild for content of core part of the kernel
>diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
>index 4dd5d50..eae85a5 100644
>--- a/arch/x86/kernel/Makefile
>+++ b/arch/x86/kernel/Makefile
>@@ -8,6 +8,8 @@ extra-y += ebda.o
> extra-y += platform-quirks.o
> extra-y += vmlinux.lds
>
>+extra-$(CONFIG_X86_32) += pgtable_32.o
>+
> CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
>
> ifdef CONFIG_FUNCTION_TRACER
>diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
>index 5f40126..0db066e 100644
>--- a/arch/x86/kernel/head_32.S
>+++ b/arch/x86/kernel/head_32.S
>@@ -41,51 +41,6 @@
> #define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id
>
> /*
>- * This is how much memory in addition to the memory covered up to
>- * and including _end we need mapped initially.
>- * We need:
>- * (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE)
>- * (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE)
>- *
>- * Modulo rounding, each megabyte assigned here requires a kilobyte of
>- * memory, which is currently unreclaimed.
>- *
>- * This should be a multiple of a page.
>- *
>- * KERNEL_IMAGE_SIZE should be greater than pa(_end)
>- * and small than max_low_pfn, otherwise will waste some page table
>entries
>- */
>-
>-#if PTRS_PER_PMD > 1
>-#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) +
>PTRS_PER_PGD)
>-#else
>-#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
>-#endif
>-
>-/*
>- * Number of possible pages in the lowmem region.
>- *
>- * We shift 2 by 31 instead of 1 by 32 to the left in order to avoid a
>- * gas warning about overflowing shift count when gas has been
>compiled
>- * with only a host target support using a 32-bit type for internal
>- * representation.
>- */
>-LOWMEM_PAGES = (((2<<31) - __PAGE_OFFSET) >> PAGE_SHIFT)
>-
>-/* Enough space to fit pagetables for the low memory linear map */
>-MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT
>-
>-/*
>- * Worst-case size of the kernel mapping we need to make:
>- * a relocatable kernel can live anywhere in lowmem, so we need to be
>able
>- * to map all of lowmem.
>- */
>-KERNEL_PAGES = LOWMEM_PAGES
>-
>-INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE
>-RESERVE_BRK(pagetables, INIT_MAP_SIZE)
>-
>-/*
> * 32-bit kernel entrypoint; only used by the boot CPU. On entry,
> * %esi points to the real-mode code as a 32-bit pointer.
> * CS and DS must be 4 GB flat segments, but we don't depend on
>@@ -157,92 +112,7 @@ ENTRY(startup_32)
> call load_ucode_bsp
> #endif
>
>-/*
>- * Initialize page tables. This creates a PDE and a set of page
>- * tables, which are located immediately beyond __brk_base. The
>variable
>- * _brk_end is set up to point to the first "safe" location.
>- * Mappings are created both at virtual address 0 (identity mapping)
>- * and PAGE_OFFSET for up to _end.
>- */
>-#ifdef CONFIG_X86_PAE
>-
>- /*
>- * In PAE mode initial_page_table is statically defined to contain
>- * enough entries to cover the VMSPLIT option (that is the top 1, 2
>or 3
>- * entries). The identity mapping is handled by pointing two PGD
>entries
>- * to the first kernel PMD.
>- *
>- * Note the upper half of each PMD or PTE are always zero at this
>stage.
>- */
>-
>-#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs
>*/
>-
>- xorl %ebx,%ebx /* %ebx is kept at zero */
>-
>- movl $pa(__brk_base), %edi
>- movl $pa(initial_pg_pmd), %edx
>- movl $PTE_IDENT_ATTR, %eax
>-10:
>- leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */
>- movl %ecx,(%edx) /* Store PMD entry */
>- /* Upper half already zero */
>- addl $8,%edx
>- movl $512,%ecx
>-11:
>- stosl
>- xchgl %eax,%ebx
>- stosl
>- xchgl %eax,%ebx
>- addl $0x1000,%eax
>- loop 11b
>-
>- /*
>- * End condition: we must map up to the end + MAPPING_BEYOND_END.
>- */
>- movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
>- cmpl %ebp,%eax
>- jb 10b
>-1:
>- addl $__PAGE_OFFSET, %edi
>- movl %edi, pa(_brk_end)
>- shrl $12, %eax
>- movl %eax, pa(max_pfn_mapped)
>-
>- /* Do early initialization of the fixmap area */
>- movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
>- movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8)
>-#else /* Not PAE */
>-
>-page_pde_offset = (__PAGE_OFFSET >> 20);
>-
>- movl $pa(__brk_base), %edi
>- movl $pa(initial_page_table), %edx
>- movl $PTE_IDENT_ATTR, %eax
>-10:
>- leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */
>- movl %ecx,(%edx) /* Store identity PDE entry */
>- movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */
>- addl $4,%edx
>- movl $1024, %ecx
>-11:
>- stosl
>- addl $0x1000,%eax
>- loop 11b
>- /*
>- * End condition: we must map up to the end + MAPPING_BEYOND_END.
>- */
>- movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
>- cmpl %ebp,%eax
>- jb 10b
>- addl $__PAGE_OFFSET, %edi
>- movl %edi, pa(_brk_end)
>- shrl $12, %eax
>- movl %eax, pa(max_pfn_mapped)
>-
>- /* Do early initialization of the fixmap area */
>- movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
>- movl %eax,pa(initial_page_table+0xffc)
>-#endif
>+ call setup_pgtable_32
>
> #ifdef CONFIG_PARAVIRT
> /* This is can only trip for a broken bootloader... */
>@@ -660,47 +530,11 @@ ENTRY(setup_once_ref)
> */
> __PAGE_ALIGNED_BSS
> .align PAGE_SIZE
>-#ifdef CONFIG_X86_PAE
>-initial_pg_pmd:
>- .fill 1024*KPMDS,4,0
>-#else
>-ENTRY(initial_page_table)
>- .fill 1024,4,0
>-#endif
>-initial_pg_fixmap:
>- .fill 1024,4,0
> ENTRY(empty_zero_page)
> .fill 4096,1,0
> ENTRY(swapper_pg_dir)
> .fill 1024,4,0
>
>-/*
>- * This starts the data section.
>- */
>-#ifdef CONFIG_X86_PAE
>-__PAGE_ALIGNED_DATA
>- /* Page-aligned for the benefit of paravirt? */
>- .align PAGE_SIZE
>-ENTRY(initial_page_table)
>- .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
>-# if KPMDS == 3
>- .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
>- .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
>- .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x2000),0
>-# elif KPMDS == 2
>- .long 0,0
>- .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
>- .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
>-# elif KPMDS == 1
>- .long 0,0
>- .long 0,0
>- .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
>-# else
>-# error "Kernel PMDs should be 1, 2 or 3"
>-# endif
>- .align PAGE_SIZE /* needs to be page-sized too */
>-#endif
>-
> .data
> .balign 4
> ENTRY(initial_stack)
>diff --git a/arch/x86/kernel/pgtable_32.S
>b/arch/x86/kernel/pgtable_32.S
>new file mode 100644
>index 0000000..aded718
>--- /dev/null
>+++ b/arch/x86/kernel/pgtable_32.S
>@@ -0,0 +1,196 @@
>+#include <linux/threads.h>
>+#include <linux/init.h>
>+#include <linux/linkage.h>
>+#include <asm/segment.h>
>+#include <asm/page_types.h>
>+#include <asm/pgtable_types.h>
>+#include <asm/cache.h>
>+#include <asm/thread_info.h>
>+#include <asm/asm-offsets.h>
>+#include <asm/setup.h>
>+#include <asm/processor-flags.h>
>+#include <asm/msr-index.h>
>+#include <asm/cpufeatures.h>
>+#include <asm/percpu.h>
>+#include <asm/nops.h>
>+#include <asm/bootparam.h>
>+
>+/* Physical address */
>+#define pa(X) ((X) - __PAGE_OFFSET)
>+
>+/*
>+ * This is how much memory in addition to the memory covered up to
>+ * and including _end we need mapped initially.
>+ * We need:
>+ * (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE)
>+ * (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE)
>+ *
>+ * Modulo rounding, each megabyte assigned here requires a kilobyte of
>+ * memory, which is currently unreclaimed.
>+ *
>+ * This should be a multiple of a page.
>+ *
>+ * KERNEL_IMAGE_SIZE should be greater than pa(_end)
>+ * and smaller than max_low_pfn, otherwise it will waste some page
>+ * table entries
>+ */
>+
>+#if PTRS_PER_PMD > 1
>+#define PAGE_TABLE_SIZE(pages) (((pages) / PTRS_PER_PMD) +
>PTRS_PER_PGD)
>+#else
>+#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
>+#endif
>+
>+/*
>+ * Number of possible pages in the lowmem region.
>+ *
>+ * We shift 2 by 31 instead of 1 by 32 to the left in order to avoid a
>+ * gas warning about overflowing shift count when gas has been
>compiled
>+ * with only a host target support using a 32-bit type for internal
>+ * representation.
>+ */
>+LOWMEM_PAGES = (((2<<31) - __PAGE_OFFSET) >> PAGE_SHIFT)
>+
>+/* Enough space to fit pagetables for the low memory linear map */
>+MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT
>+
>+/*
>+ * Worst-case size of the kernel mapping we need to make:
>+ * a relocatable kernel can live anywhere in lowmem, so we need to be
>able
>+ * to map all of lowmem.
>+ */
>+KERNEL_PAGES = LOWMEM_PAGES
>+
>+INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE
>+RESERVE_BRK(pagetables, INIT_MAP_SIZE)
>+
>+/*
>+ * Initialize page tables. This creates a PDE and a set of page
>+ * tables, which are located immediately beyond __brk_base. The
>variable
>+ * _brk_end is set up to point to the first "safe" location.
>+ * Mappings are created both at virtual address 0 (identity mapping)
>+ * and PAGE_OFFSET for up to _end.
>+ */
>+ .text
>+ENTRY(setup_pgtable_32)
>+#ifdef CONFIG_X86_PAE
>+ /*
>+ * In PAE mode initial_page_table is statically defined to contain
>+ * enough entries to cover the VMSPLIT option (that is the top 1, 2
>or 3
>+ * entries). The identity mapping is handled by pointing two PGD
>entries
>+ * to the first kernel PMD.
>+ *
>+ * Note the upper half of each PMD or PTE are always zero at this
>stage.
>+ */
>+
>+#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs
>*/
>+
>+ xorl %ebx,%ebx /* %ebx is kept at zero */
>+
>+ movl $pa(__brk_base), %edi
>+ movl $pa(initial_pg_pmd), %edx
>+ movl $PTE_IDENT_ATTR, %eax
>+10:
>+ leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */
>+ movl %ecx,(%edx) /* Store PMD entry */
>+ /* Upper half already zero */
>+ addl $8,%edx
>+ movl $512,%ecx
>+11:
>+ stosl
>+ xchgl %eax,%ebx
>+ stosl
>+ xchgl %eax,%ebx
>+ addl $0x1000,%eax
>+ loop 11b
>+
>+ /*
>+ * End condition: we must map up to the end + MAPPING_BEYOND_END.
>+ */
>+ movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
>+ cmpl %ebp,%eax
>+ jb 10b
>+1:
>+ addl $__PAGE_OFFSET, %edi
>+ movl %edi, pa(_brk_end)
>+ shrl $12, %eax
>+ movl %eax, pa(max_pfn_mapped)
>+
>+ /* Do early initialization of the fixmap area */
>+ movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
>+ movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8)
>+#else /* Not PAE */
>+
>+page_pde_offset = (__PAGE_OFFSET >> 20);
>+
>+ movl $pa(__brk_base), %edi
>+ movl $pa(initial_page_table), %edx
>+ movl $PTE_IDENT_ATTR, %eax
>+10:
>+ leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */
>+ movl %ecx,(%edx) /* Store identity PDE entry */
>+ movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */
>+ addl $4,%edx
>+ movl $1024, %ecx
>+11:
>+ stosl
>+ addl $0x1000,%eax
>+ loop 11b
>+ /*
>+ * End condition: we must map up to the end + MAPPING_BEYOND_END.
>+ */
>+ movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
>+ cmpl %ebp,%eax
>+ jb 10b
>+ addl $__PAGE_OFFSET, %edi
>+ movl %edi, pa(_brk_end)
>+ shrl $12, %eax
>+ movl %eax, pa(max_pfn_mapped)
>+
>+ /* Do early initialization of the fixmap area */
>+ movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
>+ movl %eax,pa(initial_page_table+0xffc)
>+#endif
>+ ret
>+ENDPROC(setup_pgtable_32)
>+
>+/*
>+ * BSS section
>+ */
>+__PAGE_ALIGNED_BSS
>+ .align PAGE_SIZE
>+#ifdef CONFIG_X86_PAE
>+initial_pg_pmd:
>+ .fill 1024*KPMDS,4,0
>+#else
>+ENTRY(initial_page_table)
>+ .fill 1024,4,0
>+#endif
>+initial_pg_fixmap:
>+ .fill 1024,4,0
>+
>+/*
>+ * This starts the data section.
>+ */
>+#ifdef CONFIG_X86_PAE
>+__PAGE_ALIGNED_DATA
>+ /* Page-aligned for the benefit of paravirt? */
>+ .align PAGE_SIZE
>+ENTRY(initial_page_table)
>+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
>+# if KPMDS == 3
>+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
>+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
>+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x2000),0
>+# elif KPMDS == 2
>+ .long 0,0
>+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
>+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
>+# elif KPMDS == 1
>+ .long 0,0
>+ .long 0,0
>+ .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
>+# else
>+# error "Kernel PMDs should be 1, 2 or 3"
>+# endif
>+ .align PAGE_SIZE /* needs to be page-sized too */
>+#endif

And why does it need a separate entry point as opposed to the plain one?
--
Sent from my Android device with K-9 Mail. Please excuse my brevity.