Re:Re: [PATCH V1] riscv: mm: Support > 1GB kernel image size when creating early page table

From: yang.zhang
Date: Sun May 05 2024 - 22:31:38 EST




At 2024-04-30 20:08:18, "Alexandre Ghiti" <alexghiti@xxxxxxxxxxxx> wrote:
>Hi Yang,
>
>On Tue, Apr 30, 2024 at 11:24 AM yang.zhang <gaoshanliukou@xxxxxxx> wrote:
>>
>> From: "yang.zhang" <yang.zhang@xxxxxxxxxxxx>
>>
>> By default, when creating early page table, only one PMD page table, but
>> if kernel image size exceeds 1GB, it need two PMD page table, otherwise,
>> it would BUG_ON in create_kernel_page_table.
>>
>> In addition, if trap earlier, trap vector doesn't yet set properly, current
>> value maybe set by previous firmwire, typically it's the _start of kernel,
>> it's confused and difficult to debug, so set it earlier.
>
>Totally agree with that, I'm used to debugging this part of the code
>and I know stvec points to __start and not handle_exception at this
>point...But that deserves a fix indeed, it's confusing. I don't see
>this fix in the patch below though?
Sorry, i missed it.
I send patch v2 for this.
>> ---
>> I'm not sure whether hugesize kernel is reasonable, if not, ignore this patch.
>
>Unless you have a good use case for this, I don't think that's
>necessary, we never encountered such situations so I'd rather not
>complicate this code even more.
I just see the kernel space is reserved 2GB(which is 0xffffffff80000000 -0xffffffffffffffff ).
Normally, kernel image size is just some MBs, so this patch maybe unnecessary.
Just ignore.
I add the patch for set trap vector earlier for helping debug.
Thanks.

>Thanks,
>
>Alex
>
>> This issue can be reproduced simpily by changing '_end' in vmlinux.lds.S
>> such as _end = . + 0x40000000;
>>
>> Signed-off-by: yang.zhang <yang.zhang@xxxxxxxxxxxx>
>> ---
>> arch/riscv/mm/init.c | 41 +++++++++++++++++++++++++++++++++++------
>> 1 file changed, 35 insertions(+), 6 deletions(-)
>>
>> diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c
>> index fe8e159394d8..094b39f920d3 100644
>> --- a/arch/riscv/mm/init.c
>> +++ b/arch/riscv/mm/init.c
>> @@ -386,11 +386,13 @@ static void __init create_pte_mapping(pte_t *ptep,
>> static pmd_t trampoline_pmd[PTRS_PER_PMD] __page_aligned_bss;
>> static pmd_t fixmap_pmd[PTRS_PER_PMD] __page_aligned_bss;
>> static pmd_t early_pmd[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE);
>> +static pmd_t early_pmd2[PTRS_PER_PMD] __initdata __aligned(PAGE_SIZE);
>>
>> #ifdef CONFIG_XIP_KERNEL
>> #define trampoline_pmd ((pmd_t *)XIP_FIXUP(trampoline_pmd))
>> #define fixmap_pmd ((pmd_t *)XIP_FIXUP(fixmap_pmd))
>> #define early_pmd ((pmd_t *)XIP_FIXUP(early_pmd))
>> +#define early_pmd2 ((pmd_t *)XIP_FIXUP(early_pmd2))
>> #endif /* CONFIG_XIP_KERNEL */
>>
>> static p4d_t trampoline_p4d[PTRS_PER_P4D] __page_aligned_bss;
>> @@ -432,9 +434,14 @@ static pmd_t *__init get_pmd_virt_late(phys_addr_t pa)
>>
>> static phys_addr_t __init alloc_pmd_early(uintptr_t va)
>> {
>> - BUG_ON((va - kernel_map.virt_addr) >> PUD_SHIFT);
>> + uintptr_t end_pud_idx = pud_index(kernel_map.virt_addr + kernel_map.size - 1);
>> + uintptr_t current_pud_idx = pud_index(va);
>>
>> - return (uintptr_t)early_pmd;
>> + BUG_ON(current_pud_idx > end_pud_idx);
>> + if (current_pud_idx == end_pud_idx)
>> + return (uintptr_t)early_pmd2;
>> + else
>> + return (uintptr_t)early_pmd;
>> }
>>
>> static phys_addr_t __init alloc_pmd_fixmap(uintptr_t va)
>> @@ -769,6 +776,18 @@ static void __init set_mmap_rnd_bits_max(void)
>> mmap_rnd_bits_max = MMAP_VA_BITS - PAGE_SHIFT - 3;
>> }
>>
>> +static pmd_t *__init select_pmd_early(uintptr_t pa)
>> +{
>> + uintptr_t end_pud_idx = pud_index(kernel_map.phys_addr + kernel_map.size - 1);
>> + uintptr_t current_pud_idx = pud_index(pa);
>> +
>> + BUG_ON(current_pud_idx > end_pud_idx);
>> + if (current_pud_idx == end_pud_idx)
>> + return early_pmd2;
>> + else
>> + return early_pmd;
>> +}
>> +
>> /*
>> * There is a simple way to determine if 4-level is supported by the
>> * underlying hardware: establish 1:1 mapping in 4-level page table mode
>> @@ -780,6 +799,7 @@ static __init void set_satp_mode(uintptr_t dtb_pa)
>> u64 identity_satp, hw_satp;
>> uintptr_t set_satp_mode_pmd = ((unsigned long)set_satp_mode) & PMD_MASK;
>> u64 satp_mode_cmdline = __pi_set_satp_mode_from_cmdline(dtb_pa);
>> + pmd_t *target_pmd, *target_pmd2;
>>
>> if (satp_mode_cmdline == SATP_MODE_57) {
>> disable_pgtable_l5();
>> @@ -789,17 +809,24 @@ static __init void set_satp_mode(uintptr_t dtb_pa)
>> return;
>> }
>>
>> + target_pmd = select_pmd_early(set_satp_mode_pmd);
>> + target_pmd2 = select_pmd_early(set_satp_mode_pmd + PMD_SIZE);
>> create_p4d_mapping(early_p4d,
>> set_satp_mode_pmd, (uintptr_t)early_pud,
>> P4D_SIZE, PAGE_TABLE);
>> create_pud_mapping(early_pud,
>> - set_satp_mode_pmd, (uintptr_t)early_pmd,
>> + set_satp_mode_pmd, (uintptr_t)target_pmd,
>> + PUD_SIZE, PAGE_TABLE);
>> + /* Handle the case where set_satp_mode straddles 2 PUDs */
>> + if (target_pmd2 != target_pmd)
>> + create_pud_mapping(early_pud,
>> + set_satp_mode_pmd + PMD_SIZE, (uintptr_t)target_pmd2,
>> PUD_SIZE, PAGE_TABLE);
>> /* Handle the case where set_satp_mode straddles 2 PMDs */
>> - create_pmd_mapping(early_pmd,
>> + create_pmd_mapping(target_pmd,
>> set_satp_mode_pmd, set_satp_mode_pmd,
>> PMD_SIZE, PAGE_KERNEL_EXEC);
>> - create_pmd_mapping(early_pmd,
>> + create_pmd_mapping(target_pmd2,
>> set_satp_mode_pmd + PMD_SIZE,
>> set_satp_mode_pmd + PMD_SIZE,
>> PMD_SIZE, PAGE_KERNEL_EXEC);
>> @@ -829,7 +856,9 @@ static __init void set_satp_mode(uintptr_t dtb_pa)
>> memset(early_pg_dir, 0, PAGE_SIZE);
>> memset(early_p4d, 0, PAGE_SIZE);
>> memset(early_pud, 0, PAGE_SIZE);
>> - memset(early_pmd, 0, PAGE_SIZE);
>> + memset(target_pmd, 0, PAGE_SIZE);
>> + if (target_pmd2 != target_pmd)
>> + memset(target_pmd2, 0, PAGE_SIZE);
>> }
>> #endif
>>
>> --
>> 2.25.1
>>