Re: [PATCH resend] kexec/x86_64: Use one page table in x86_64 machine_kexec
From: Simon Horman
Date: Tue Feb 03 2009 - 21:29:26 EST
On Tue, Feb 03, 2009 at 02:22:48PM +0800, Huang Ying wrote:
> Impact: reduce kernel BSS size by 7 pages, improve code readability
>
> Two page tables are used in the current x86_64 kexec implementation. One
> is used to jump from the kernel virtual address to the identity-mapped
> address, the other is used to map all physical memory. In fact, on x86_64,
> there is no conflict between the kernel virtual address space and the
> physical memory space, so just one page table is sufficient. The page table
> pages used to map the control page are dynamically allocated, to save
> memory when no kexec image is loaded. The ASM code used to map the control
> page is replaced by C code too.
Hi Huang,
This patch looks quite nice to me. I am CCing my former colleague Magnus
Damm for comment, as he did some work in this area a little while ago.
> Signed-off-by: Huang Ying <ying.huang@xxxxxxxxx>
>
> ---
> arch/x86/include/asm/kexec.h | 27 ++-----
> arch/x86/kernel/machine_kexec_64.c | 82 +++++++++++++++-------
> arch/x86/kernel/relocate_kernel_64.S | 125 -----------------------------------
> 3 files changed, 67 insertions(+), 167 deletions(-)
>
> --- a/arch/x86/include/asm/kexec.h
> +++ b/arch/x86/include/asm/kexec.h
> @@ -9,23 +9,8 @@
> # define PAGES_NR 4
> #else
> # define PA_CONTROL_PAGE 0
> -# define VA_CONTROL_PAGE 1
> -# define PA_PGD 2
> -# define VA_PGD 3
> -# define PA_PUD_0 4
> -# define VA_PUD_0 5
> -# define PA_PMD_0 6
> -# define VA_PMD_0 7
> -# define PA_PTE_0 8
> -# define VA_PTE_0 9
> -# define PA_PUD_1 10
> -# define VA_PUD_1 11
> -# define PA_PMD_1 12
> -# define VA_PMD_1 13
> -# define PA_PTE_1 14
> -# define VA_PTE_1 15
> -# define PA_TABLE_PAGE 16
> -# define PAGES_NR 17
> +# define PA_TABLE_PAGE 1
> +# define PAGES_NR 2
> #endif
>
> #ifdef CONFIG_X86_32
> @@ -157,9 +142,9 @@ relocate_kernel(unsigned long indirectio
> unsigned long start_address) ATTRIB_NORET;
> #endif
>
> -#ifdef CONFIG_X86_32
> #define ARCH_HAS_KIMAGE_ARCH
>
> +#ifdef CONFIG_X86_32
> struct kimage_arch {
> pgd_t *pgd;
> #ifdef CONFIG_X86_PAE
> @@ -169,6 +154,12 @@ struct kimage_arch {
> pte_t *pte0;
> pte_t *pte1;
> };
> +#else
> +struct kimage_arch {
> + pud_t *pud;
> + pmd_t *pmd;
> + pte_t *pte;
> +};
> #endif
>
> #endif /* __ASSEMBLY__ */
> --- a/arch/x86/kernel/machine_kexec_64.c
> +++ b/arch/x86/kernel/machine_kexec_64.c
> @@ -18,15 +18,6 @@
> #include <asm/mmu_context.h>
> #include <asm/io.h>
>
> -#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
> -static u64 kexec_pgd[512] PAGE_ALIGNED;
> -static u64 kexec_pud0[512] PAGE_ALIGNED;
> -static u64 kexec_pmd0[512] PAGE_ALIGNED;
> -static u64 kexec_pte0[512] PAGE_ALIGNED;
> -static u64 kexec_pud1[512] PAGE_ALIGNED;
> -static u64 kexec_pmd1[512] PAGE_ALIGNED;
> -static u64 kexec_pte1[512] PAGE_ALIGNED;
> -
> static void init_level2_page(pmd_t *level2p, unsigned long addr)
> {
> unsigned long end_addr;
> @@ -107,12 +98,65 @@ out:
> return result;
> }
>
> +static void free_transition_pgtable(struct kimage *image)
> +{
> + free_page((unsigned long)image->arch.pud);
> + free_page((unsigned long)image->arch.pmd);
> + free_page((unsigned long)image->arch.pte);
> +}
> +
> +static int init_transition_pgtable(struct kimage *image, pgd_t *pgd)
> +{
> + pud_t *pud;
> + pmd_t *pmd;
> + pte_t *pte;
> + unsigned long vaddr, paddr;
> + int result = -ENOMEM;
> +
> + vaddr = (unsigned long)relocate_kernel;
> + paddr = __pa(page_address(image->control_code_page)+PAGE_SIZE);
> + pgd += pgd_index(vaddr);
> + if (!pgd_present(*pgd)) {
> + pud = (pud_t *)get_zeroed_page(GFP_KERNEL);
> + if (!pud)
> + goto err;
> + image->arch.pud = pud;
> + set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
> + }
> + pud = pud_offset(pgd, vaddr);
> + if (!pud_present(*pud)) {
> + pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL);
> + if (!pmd)
> + goto err;
> + image->arch.pmd = pmd;
> + set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
> + }
> + pmd = pmd_offset(pud, vaddr);
> + if (!pmd_present(*pmd)) {
> + pte = (pte_t *)get_zeroed_page(GFP_KERNEL);
> + if (!pte)
> + goto err;
> + image->arch.pte = pte;
> + set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE));
> + }
> + pte = pte_offset_kernel(pmd, vaddr);
> + set_pte(pte, pfn_pte(paddr >> PAGE_SHIFT, PAGE_KERNEL_EXEC));
> + return 0;
> +err:
> + free_transition_pgtable(image);
> + return result;
> +}
> +
>
> static int init_pgtable(struct kimage *image, unsigned long start_pgtable)
> {
> pgd_t *level4p;
> + int result;
> level4p = (pgd_t *)__va(start_pgtable);
> - return init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT);
> + result = init_level4_page(image, level4p, 0, max_pfn << PAGE_SHIFT);
> + if (result)
> + return result;
> + return init_transition_pgtable(image, level4p);
> }
>
> static void set_idt(void *newidt, u16 limit)
> @@ -174,7 +218,7 @@ int machine_kexec_prepare(struct kimage
>
> void machine_kexec_cleanup(struct kimage *image)
> {
> - return;
> + free_transition_pgtable(image);
> }
>
> /*
> @@ -195,22 +239,6 @@ void machine_kexec(struct kimage *image)
> memcpy(control_page, relocate_kernel, PAGE_SIZE);
>
> page_list[PA_CONTROL_PAGE] = virt_to_phys(control_page);
> - page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel;
> - page_list[PA_PGD] = virt_to_phys(&kexec_pgd);
> - page_list[VA_PGD] = (unsigned long)kexec_pgd;
> - page_list[PA_PUD_0] = virt_to_phys(&kexec_pud0);
> - page_list[VA_PUD_0] = (unsigned long)kexec_pud0;
> - page_list[PA_PMD_0] = virt_to_phys(&kexec_pmd0);
> - page_list[VA_PMD_0] = (unsigned long)kexec_pmd0;
> - page_list[PA_PTE_0] = virt_to_phys(&kexec_pte0);
> - page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
> - page_list[PA_PUD_1] = virt_to_phys(&kexec_pud1);
> - page_list[VA_PUD_1] = (unsigned long)kexec_pud1;
> - page_list[PA_PMD_1] = virt_to_phys(&kexec_pmd1);
> - page_list[VA_PMD_1] = (unsigned long)kexec_pmd1;
> - page_list[PA_PTE_1] = virt_to_phys(&kexec_pte1);
> - page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
> -
> page_list[PA_TABLE_PAGE] =
> (unsigned long)__pa(page_address(image->control_code_page));
>
> --- a/arch/x86/kernel/relocate_kernel_64.S
> +++ b/arch/x86/kernel/relocate_kernel_64.S
> @@ -29,122 +29,6 @@ relocate_kernel:
> * %rdx start address
> */
>
> - /* map the control page at its virtual address */
> -
> - movq $0x0000ff8000000000, %r10 /* mask */
> - mov $(39 - 3), %cl /* bits to shift */
> - movq PTR(VA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
> -
> - movq %r11, %r9
> - andq %r10, %r9
> - shrq %cl, %r9
> -
> - movq PTR(VA_PGD)(%rsi), %r8
> - addq %r8, %r9
> - movq PTR(PA_PUD_0)(%rsi), %r8
> - orq $PAGE_ATTR, %r8
> - movq %r8, (%r9)
> -
> - shrq $9, %r10
> - sub $9, %cl
> -
> - movq %r11, %r9
> - andq %r10, %r9
> - shrq %cl, %r9
> -
> - movq PTR(VA_PUD_0)(%rsi), %r8
> - addq %r8, %r9
> - movq PTR(PA_PMD_0)(%rsi), %r8
> - orq $PAGE_ATTR, %r8
> - movq %r8, (%r9)
> -
> - shrq $9, %r10
> - sub $9, %cl
> -
> - movq %r11, %r9
> - andq %r10, %r9
> - shrq %cl, %r9
> -
> - movq PTR(VA_PMD_0)(%rsi), %r8
> - addq %r8, %r9
> - movq PTR(PA_PTE_0)(%rsi), %r8
> - orq $PAGE_ATTR, %r8
> - movq %r8, (%r9)
> -
> - shrq $9, %r10
> - sub $9, %cl
> -
> - movq %r11, %r9
> - andq %r10, %r9
> - shrq %cl, %r9
> -
> - movq PTR(VA_PTE_0)(%rsi), %r8
> - addq %r8, %r9
> - movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
> - orq $PAGE_ATTR, %r8
> - movq %r8, (%r9)
> -
> - /* identity map the control page at its physical address */
> -
> - movq $0x0000ff8000000000, %r10 /* mask */
> - mov $(39 - 3), %cl /* bits to shift */
> - movq PTR(PA_CONTROL_PAGE)(%rsi), %r11 /* address to map */
> -
> - movq %r11, %r9
> - andq %r10, %r9
> - shrq %cl, %r9
> -
> - movq PTR(VA_PGD)(%rsi), %r8
> - addq %r8, %r9
> - movq PTR(PA_PUD_1)(%rsi), %r8
> - orq $PAGE_ATTR, %r8
> - movq %r8, (%r9)
> -
> - shrq $9, %r10
> - sub $9, %cl
> -
> - movq %r11, %r9
> - andq %r10, %r9
> - shrq %cl, %r9
> -
> - movq PTR(VA_PUD_1)(%rsi), %r8
> - addq %r8, %r9
> - movq PTR(PA_PMD_1)(%rsi), %r8
> - orq $PAGE_ATTR, %r8
> - movq %r8, (%r9)
> -
> - shrq $9, %r10
> - sub $9, %cl
> -
> - movq %r11, %r9
> - andq %r10, %r9
> - shrq %cl, %r9
> -
> - movq PTR(VA_PMD_1)(%rsi), %r8
> - addq %r8, %r9
> - movq PTR(PA_PTE_1)(%rsi), %r8
> - orq $PAGE_ATTR, %r8
> - movq %r8, (%r9)
> -
> - shrq $9, %r10
> - sub $9, %cl
> -
> - movq %r11, %r9
> - andq %r10, %r9
> - shrq %cl, %r9
> -
> - movq PTR(VA_PTE_1)(%rsi), %r8
> - addq %r8, %r9
> - movq PTR(PA_CONTROL_PAGE)(%rsi), %r8
> - orq $PAGE_ATTR, %r8
> - movq %r8, (%r9)
> -
> -relocate_new_kernel:
> - /* %rdi indirection_page
> - * %rsi page_list
> - * %rdx start address
> - */
> -
> /* zero out flags, and disable interrupts */
> pushq $0
> popfq
> @@ -156,9 +40,8 @@ relocate_new_kernel:
> /* get physical address of page table now too */
> movq PTR(PA_TABLE_PAGE)(%rsi), %rcx
>
> - /* switch to new set of page tables */
> - movq PTR(PA_PGD)(%rsi), %r9
> - movq %r9, %cr3
> + /* Switch to the identity mapped page tables */
> + movq %rcx, %cr3
>
> /* setup a new stack at the end of the physical control page */
> lea PAGE_SIZE(%r8), %rsp
> @@ -194,9 +77,7 @@ identity_mapped:
> jmp 1f
> 1:
>
> - /* Switch to the identity mapped page tables,
> - * and flush the TLB.
> - */
> + /* Flush the TLB (needed?) */
> movq %rcx, %cr3
>
> /* Do the copies */
>
--
Simon Horman
VA Linux Systems Japan K.K., Sydney, Australia Satellite Office
H: www.vergenet.net/~horms/ W: www.valinux.co.jp/en
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/