Re: [PATCH] x86: put initial_pg_tables into bss -v2

From: Yinghai Lu
Date: Mon Mar 09 2009 - 03:45:40 EST


On Sun, Mar 1, 2009 at 4:29 PM, H. Peter Anvin <hpa@xxxxxxxxx> wrote:
> It's still really ugly, though.  A much easier and cleaner way would seem to
> be to calculate a far limit on the brk and then marking it as a formal
> (non-alloc) section in the linker script and vmlinux file.  That way
> anything that examines the vmlinux file will see it as an exclusion section.
>  We can (and should) even verify that we don't overflow the brk and panic if
> we do.

please check attached patch.

[PATCH] x86: put initial_pg_tables into .bss -v4

Impact: cleanup

Don't use ram after _end blindly for pagetables. aka init pages is before _end
put those pg table into .bss

v2: keep initial page table up to 512M only.
v4: put initial page tables just before _end

Signed-off-by: Yinghai Lu <yinghai@xxxxxxxxxx>


YH
[PATCH] x86: put initial_pg_tables into .bss -v4

Impact: cleanup

Don't use ram after _end blindly for pagetables. aka init pages is before _end
put those pg table into .bss

v2: keep initial page table up to 512M only.
v4: put initial page tables just before _end

Signed-off-by: Yinghai Lu <yinghai@xxxxxxxxxx>

---
arch/x86/include/asm/page_32_types.h | 5 +++
arch/x86/kernel/head32.c | 3 +
arch/x86/kernel/head_32.S | 55 ++++++++++++++---------------------
arch/x86/kernel/vmlinux_32.lds.S | 11 ++++++-
4 files changed, 40 insertions(+), 34 deletions(-)

Index: linux-2.6/arch/x86/kernel/head32.c
===================================================================
--- linux-2.6.orig/arch/x86/kernel/head32.c
+++ linux-2.6/arch/x86/kernel/head32.c
@@ -18,7 +18,8 @@ void __init i386_start_kernel(void)
{
reserve_trampoline_memory();

- reserve_early(__pa_symbol(&_text), __pa_symbol(&_end), "TEXT DATA BSS");
+ reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop),
+ "TEXT DATA BSS");

#ifdef CONFIG_BLK_DEV_INITRD
/* Reserve INITRD */
Index: linux-2.6/arch/x86/kernel/head_32.S
===================================================================
--- linux-2.6.orig/arch/x86/kernel/head_32.S
+++ linux-2.6/arch/x86/kernel/head_32.S
@@ -38,42 +38,30 @@
#define X86_VENDOR_ID new_cpu_data+CPUINFO_x86_vendor_id

/*
- * This is how much memory *in addition to the memory covered up to
- * and including _end* we need mapped initially.
- * We need:
- * - one bit for each possible page, but only in low memory, which means
- * 2^32/4096/8 = 128K worst case (4G/4G split.)
+ * This is how much memory for page table to and including _end
+ * we need mapped initially.
* - enough space to map all low memory, which means
- * (2^32/4096) / 1024 pages (worst case, non PAE)
- * (2^32/4096) / 512 + 4 pages (worst case for PAE)
- * - a few pages for allocator use before the kernel pagetable has
- * been set up
+ * (KERNEL_IMAGE_SIZE/4096) / 1024 pages (worst case, non PAE)
+ * (KERNEL_IMAGE_SIZE/4096) / 512 + 4 pages (worst case for PAE)
*
* Modulo rounding, each megabyte assigned here requires a kilobyte of
* memory, which is currently unreclaimed.
*
* This should be a multiple of a page.
+ *
+ * KERNEL_IMAGE_SIZE should be greater than pa(_end)
+ * and small than max_low_pfn, otherwise will waste some page table entries
*/
-LOW_PAGES = 1<<(32-PAGE_SHIFT_asm)
-
-/*
- * To preserve the DMA pool in PAGEALLOC kernels, we'll allocate
- * pagetables from above the 16MB DMA limit, so we'll have to set
- * up pagetables 16MB more (worst-case):
- */
-#if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KMEMCHECK)
-LOW_PAGES = LOW_PAGES + 0x1000000
-#endif
+LOW_PAGES = (KERNEL_IMAGE_SIZE + PAGE_SIZE_asm - 1)>>PAGE_SHIFT

#if PTRS_PER_PMD > 1
PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PMD) + PTRS_PER_PGD
#else
PAGE_TABLE_SIZE = (LOW_PAGES / PTRS_PER_PGD)
#endif
-BOOTBITMAP_SIZE = LOW_PAGES / 8
ALLOCATOR_SLOP = 4

-INIT_MAP_BEYOND_END = BOOTBITMAP_SIZE + (PAGE_TABLE_SIZE + ALLOCATOR_SLOP)*PAGE_SIZE_asm
+INIT_MAP_SIZE = (PAGE_TABLE_SIZE + ALLOCATOR_SLOP) * PAGE_SIZE_asm

/*
* 32-bit kernel entrypoint; only used by the boot CPU. On entry,
@@ -166,10 +154,9 @@ num_subarch_entries = (. - subarch_entri

/*
* Initialize page tables. This creates a PDE and a set of page
- * tables, which are located immediately beyond _end. The variable
- * init_pg_tables_end is set up to point to the first "safe" location.
+ * tables, which are located immediately beyond _end.
* Mappings are created both at virtual address 0 (identity mapping)
- * and PAGE_OFFSET for up to _end+sizeof(page tables)+INIT_MAP_BEYOND_END.
+ * and PAGE_OFFSET for up to _end
*
* Note that the stack is not yet set up!
*/
@@ -209,14 +196,14 @@ default_entry:
loop 11b

/*
- * End condition: we must map up to and including INIT_MAP_BEYOND_END
- * bytes beyond the end of our own page tables.
+ * End condition: we must map up to the end.
*/
- leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
+ movl $pa(_end), %ebp
+ addl PTE_IDENT_ATTR, %ebp
cmpl %ebp,%eax
jb 10b
1:
- movl %edi,pa(init_pg_tables_end)
+ movl %edi, pa(init_pg_tables_end)
shrl $12, %eax
movl %eax, pa(max_pfn_mapped)

@@ -242,14 +229,14 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
addl $0x1000,%eax
loop 11b
/*
- * End condition: we must map up to and including INIT_MAP_BEYOND_END
- * bytes beyond the end of our own page tables; the +0x007 is
+ * End condition: we must map up to end, the +0x007 is
* the attribute bits
*/
- leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
+ movl $pa(_end), %ebp
+ addl PTE_IDENT_ATTR, %ebp
cmpl %ebp,%eax
jb 10b
- movl %edi,pa(init_pg_tables_end)
+ movl %edi, pa(init_pg_tables_end)
shrl $12, %eax
movl %eax, pa(max_pfn_mapped)

@@ -636,6 +623,10 @@ swapper_pg_fixmap:
.fill 1024,4,0
ENTRY(empty_zero_page)
.fill 4096,1,0
+
+.section ".bss.extra_page_aligned","wa"
+ .align PAGE_SIZE_asm
+ .fill INIT_MAP_SIZE,1,0
/*
* This starts the data section.
*/
Index: linux-2.6/arch/x86/kernel/vmlinux_32.lds.S
===================================================================
--- linux-2.6.orig/arch/x86/kernel/vmlinux_32.lds.S
+++ linux-2.6/arch/x86/kernel/vmlinux_32.lds.S
@@ -189,10 +189,13 @@ SECTIONS
*(.bss)
. = ALIGN(4);
__bss_stop = .;
- _end = . ;
+ /* extra_page_aligned must be last one before end*/
/* This is where the kernel creates the early boot page tables */
. = ALIGN(PAGE_SIZE);
pg0 = . ;
+ *(.bss.extra_page_aligned)
+ . = ALIGN(8);
+ _end = . ;
}

/* Sections to be discarded */
@@ -205,6 +208,12 @@ SECTIONS
DWARF_DEBUG
}

+/*
+ * Build-time check on the image size:
+ */
+ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
+ "kernel image bigger than KERNEL_IMAGE_SIZE")
+
#ifdef CONFIG_KEXEC
/* Link time checks */
#include <asm/kexec.h>
Index: linux-2.6/arch/x86/include/asm/page_32_types.h
===================================================================
--- linux-2.6.orig/arch/x86/include/asm/page_32_types.h
+++ linux-2.6/arch/x86/include/asm/page_32_types.h
@@ -39,6 +39,11 @@
#define __VIRTUAL_MASK_SHIFT 32
#endif /* CONFIG_X86_PAE */

+/*
+ * Kernel image size is limited to 512 MB (see in arch/x86/kernel/head_32.S)
+ */
+#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
+
#ifndef __ASSEMBLY__

/*