Re: [PATCH v2 3/9] make kernel be able to load above 4G in boot stage

From: Baoquan He
Date: Mon Mar 02 2015 - 10:58:39 EST


Oops, I didn't copy the subject of Yinghai's patch; it should be as
below. I will change it back when I repost.

x86, boot: Enable ident_mapping for kasl above 4G on 64bit

On 03/02/15 at 10:58pm, Baoquan He wrote:
> From: Yinghai Lu <yinghai@xxxxxxxxxx>
>
> Split out kernel_ident_mapping_init() and call it in the
> boot::decompress_kernel stage. It will cover the new range that is above 4G.
>
> -v2: fix one typo, use round_up/round_down and use MACRO for size.
>
> Signed-off-by: Yinghai Lu <yinghai@xxxxxxxxxx>
> ---
> arch/x86/boot/compressed/misc.c | 10 +++++
> arch/x86/boot/compressed/misc_pgt.c | 61 ++++++++++++++++++++++++++++++
> arch/x86/include/asm/page.h | 5 +++
> arch/x86/mm/ident_map.c | 74 +++++++++++++++++++++++++++++++++++++
> arch/x86/mm/init_64.c | 74 +------------------------------------
> 5 files changed, 151 insertions(+), 73 deletions(-)
> create mode 100644 arch/x86/boot/compressed/misc_pgt.c
> create mode 100644 arch/x86/mm/ident_map.c
>
> diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
> index ac5c05e..c9d8187 100644
> --- a/arch/x86/boot/compressed/misc.c
> +++ b/arch/x86/boot/compressed/misc.c
> @@ -9,6 +9,11 @@
> * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
> */
>
> +#ifdef CONFIG_X86_64
> +#define __pa(x) ((unsigned long)(x))
> +#define __va(x) ((void *)((unsigned long)(x)))
> +#endif
> +
> #include "misc.h"
> #include "../string.h"
>
> @@ -366,6 +371,8 @@ static void parse_elf(void *output)
> free(phdrs);
> }
>
> +#include "misc_pgt.c"
> +
> asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
> unsigned char *input_data,
> unsigned long input_len,
> @@ -421,6 +428,9 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap,
> error("Wrong destination address");
> #endif
>
> + if (output != output_orig)
> + fill_linux64_pagetable((unsigned long)output, output_len);
> +
> debug_putstr("\nDecompressing Linux... ");
> decompress(input_data, input_len, NULL, NULL, output, NULL, error);
> parse_elf(output);
> diff --git a/arch/x86/boot/compressed/misc_pgt.c b/arch/x86/boot/compressed/misc_pgt.c
> new file mode 100644
> index 0000000..2783f0f
> --- /dev/null
> +++ b/arch/x86/boot/compressed/misc_pgt.c
> @@ -0,0 +1,61 @@
> +
> +#ifdef CONFIG_X86_64
> +#include <asm/init.h>
> +#include <asm/pgtable.h>
> +
> +#include "../../mm/ident_map.c"
> +
> +struct alloc_pgt_data {
> + unsigned char *pgt_buf;
> + unsigned long pgt_buf_size;
> + unsigned long pgt_buf_offset;
> +};
> +
> +static void *alloc_pgt_page(void *context)
> +{
> + struct alloc_pgt_data *d = (struct alloc_pgt_data *)context;
> + unsigned char *p = (unsigned char *)d->pgt_buf;
> +
> + if (d->pgt_buf_offset >= d->pgt_buf_size) {
> + debug_putstr("out of pgt_buf in misc.c\n");
> + return NULL;
> + }
> +
> + p += d->pgt_buf_offset;
> + d->pgt_buf_offset += PAGE_SIZE;
> +
> + return p;
> +}
> +
> +/* 4 pages to cover cross 512G boundary */
> +#define PGT_BUF_SIZE (PAGE_SIZE*4)
> +
> +unsigned long __force_order;
> +static unsigned char pgt_buf[PGT_BUF_SIZE] __aligned(PAGE_SIZE);
> +
> +static void fill_linux64_pagetable(unsigned long start, unsigned long size)
> +{
> + struct alloc_pgt_data data = {
> + .pgt_buf = (unsigned char *) pgt_buf,
> + .pgt_buf_size = sizeof(pgt_buf),
> + .pgt_buf_offset = 0,
> + };
> + struct x86_mapping_info mapping_info = {
> + .alloc_pgt_page = alloc_pgt_page,
> + .context = &data,
> + .pmd_flag = __PAGE_KERNEL_LARGE_EXEC,
> + };
> + unsigned long end = start + size;
> + pgd_t *level4p = (pgd_t *)read_cr3();
> +
> + /* align boundary to 2M */
> + start = round_down(start, PMD_SIZE);
> + end = round_up(end, PMD_SIZE);
> + if (start >= (1UL<<32))
> + kernel_ident_mapping_init(&mapping_info, level4p, start, end);
> +}
> +#else
> +static void fill_linux64_pagetable(unsigned long start, unsigned long size)
> +{
> +}
> +#endif
> diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
> index 802dde3..cf8f619 100644
> --- a/arch/x86/include/asm/page.h
> +++ b/arch/x86/include/asm/page.h
> @@ -37,7 +37,10 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
> alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
> #define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
>
> +#ifndef __pa
> #define __pa(x) __phys_addr((unsigned long)(x))
> +#endif
> +
> #define __pa_nodebug(x) __phys_addr_nodebug((unsigned long)(x))
> /* __pa_symbol should be used for C visible symbols.
> This seems to be the official gcc blessed way to do such arithmetic. */
> @@ -51,7 +54,9 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
> #define __pa_symbol(x) \
> __phys_addr_symbol(__phys_reloc_hide((unsigned long)(x)))
>
> +#ifndef __va
> #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
> +#endif
>
> #define __boot_va(x) __va(x)
> #define __boot_pa(x) __pa(x)
> diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c
> new file mode 100644
> index 0000000..751ca92
> --- /dev/null
> +++ b/arch/x86/mm/ident_map.c
> @@ -0,0 +1,74 @@
> +
> +static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page,
> + unsigned long addr, unsigned long end)
> +{
> + addr &= PMD_MASK;
> + for (; addr < end; addr += PMD_SIZE) {
> + pmd_t *pmd = pmd_page + pmd_index(addr);
> +
> + if (!pmd_present(*pmd))
> + set_pmd(pmd, __pmd(addr | pmd_flag));
> + }
> +}
> +static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
> + unsigned long addr, unsigned long end)
> +{
> + unsigned long next;
> +
> + for (; addr < end; addr = next) {
> + pud_t *pud = pud_page + pud_index(addr);
> + pmd_t *pmd;
> +
> + next = (addr & PUD_MASK) + PUD_SIZE;
> + if (next > end)
> + next = end;
> +
> + if (pud_present(*pud)) {
> + pmd = pmd_offset(pud, 0);
> + ident_pmd_init(info->pmd_flag, pmd, addr, next);
> + continue;
> + }
> + pmd = (pmd_t *)info->alloc_pgt_page(info->context);
> + if (!pmd)
> + return -ENOMEM;
> + ident_pmd_init(info->pmd_flag, pmd, addr, next);
> + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
> + }
> +
> + return 0;
> +}
> +
> +int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
> + unsigned long addr, unsigned long end)
> +{
> + unsigned long next;
> + int result;
> + int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0;
> +
> + for (; addr < end; addr = next) {
> + pgd_t *pgd = pgd_page + pgd_index(addr) + off;
> + pud_t *pud;
> +
> + next = (addr & PGDIR_MASK) + PGDIR_SIZE;
> + if (next > end)
> + next = end;
> +
> + if (pgd_present(*pgd)) {
> + pud = pud_offset(pgd, 0);
> + result = ident_pud_init(info, pud, addr, next);
> + if (result)
> + return result;
> + continue;
> + }
> +
> + pud = (pud_t *)info->alloc_pgt_page(info->context);
> + if (!pud)
> + return -ENOMEM;
> + result = ident_pud_init(info, pud, addr, next);
> + if (result)
> + return result;
> + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
> + }
> +
> + return 0;
> +}
> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> index 30eb05a..c30efb6 100644
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -56,79 +56,7 @@
>
> #include "mm_internal.h"
>
> -static void ident_pmd_init(unsigned long pmd_flag, pmd_t *pmd_page,
> - unsigned long addr, unsigned long end)
> -{
> - addr &= PMD_MASK;
> - for (; addr < end; addr += PMD_SIZE) {
> - pmd_t *pmd = pmd_page + pmd_index(addr);
> -
> - if (!pmd_present(*pmd))
> - set_pmd(pmd, __pmd(addr | pmd_flag));
> - }
> -}
> -static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page,
> - unsigned long addr, unsigned long end)
> -{
> - unsigned long next;
> -
> - for (; addr < end; addr = next) {
> - pud_t *pud = pud_page + pud_index(addr);
> - pmd_t *pmd;
> -
> - next = (addr & PUD_MASK) + PUD_SIZE;
> - if (next > end)
> - next = end;
> -
> - if (pud_present(*pud)) {
> - pmd = pmd_offset(pud, 0);
> - ident_pmd_init(info->pmd_flag, pmd, addr, next);
> - continue;
> - }
> - pmd = (pmd_t *)info->alloc_pgt_page(info->context);
> - if (!pmd)
> - return -ENOMEM;
> - ident_pmd_init(info->pmd_flag, pmd, addr, next);
> - set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
> - }
> -
> - return 0;
> -}
> -
> -int kernel_ident_mapping_init(struct x86_mapping_info *info, pgd_t *pgd_page,
> - unsigned long addr, unsigned long end)
> -{
> - unsigned long next;
> - int result;
> - int off = info->kernel_mapping ? pgd_index(__PAGE_OFFSET) : 0;
> -
> - for (; addr < end; addr = next) {
> - pgd_t *pgd = pgd_page + pgd_index(addr) + off;
> - pud_t *pud;
> -
> - next = (addr & PGDIR_MASK) + PGDIR_SIZE;
> - if (next > end)
> - next = end;
> -
> - if (pgd_present(*pgd)) {
> - pud = pud_offset(pgd, 0);
> - result = ident_pud_init(info, pud, addr, next);
> - if (result)
> - return result;
> - continue;
> - }
> -
> - pud = (pud_t *)info->alloc_pgt_page(info->context);
> - if (!pud)
> - return -ENOMEM;
> - result = ident_pud_init(info, pud, addr, next);
> - if (result)
> - return result;
> - set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
> - }
> -
> - return 0;
> -}
> +#include "ident_map.c"
>
> static int __init parse_direct_gbpages_off(char *arg)
> {
> --
> 1.9.3
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/