Re: [RFC 4/5] vmalloc_exec: share a huge page with kernel text

From: Song Liu
Date: Fri Oct 07 2022 - 02:54:03 EST

> On Oct 6, 2022, at 4:44 PM, Luis Chamberlain <mcgrof@xxxxxxxxxx> wrote:
>
> On Thu, Aug 18, 2022 at 03:42:17PM -0700, Song Liu wrote:
>> On x86 kernel, we allocate 2MB pages for kernel text up to
>> round_down(_etext, 2MB). Therefore, some of the kernel text is still
>> on 4kB pages. With vmalloc_exec, we can allocate 2MB pages up to
>> round_up(_etext, 2MB), and use the rest of the page for modules and
>> BPF programs.
>>
>> Here is an example:
>>
>> [root@eth50-1 ~]# grep _etext /proc/kallsyms
>> ffffffff82202a08 T _etext
>>
>> [root@eth50-1 ~]# grep bpf_prog_ /proc/kallsyms | tail -n 3
>> ffffffff8220f920 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup [bpf]
>> ffffffff8220fa28 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup_new [bpf]
>> ffffffff8220fad4 t bpf_prog_3bf73fa16f5e3d92_handle__sched_switch [bpf]
>>
>> [root@eth50-1 ~]# grep 0xffffffff82200000 /sys/kernel/debug/page_tables/kernel
>> 0xffffffff82200000-0xffffffff82400000 2M ro PSE x pmd
>>
>> [root@eth50-1 ~]# grep xfs_flush_inodes /proc/kallsyms
>> ffffffff822ba910 t xfs_flush_inodes_worker [xfs]
>> ffffffff822bc580 t xfs_flush_inodes [xfs]
>>
>> ffffffff82200000-ffffffff82400000 is a 2MB page, serving kernel text, xfs
>> module, and bpf programs.
>
> This is pretty rad. I'm not sure how you were able to squeeze xfs and
> *more* into one 2 MiB huge page, though; at least on Debian 5.17.0-1-amd64,
> xfs is 3.6847 MiB. How big is your XFS module?

In my build, xfs.ko is 50MB before stripping and 3.1MB after. However, the
text section is only about 1.3MB, so it fits in the tail of the shared 2MB
page.
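
For illustration, here is a minimal userspace sketch (not from the patch;
it just mirrors the PMD_ALIGN macro below, with x86_64's 4kB PAGE_SIZE and
2MB PMD_SIZE hard-coded) that computes how much of the page is left after
_etext, using the address from the kallsyms dump above:

#include <stdio.h>

#define PMD_SIZE	(1UL << 21)
#define PMD_MASK	(~(PMD_SIZE - 1))
#define PAGE_SIZE	(1UL << 12)
#define PFN_ALIGN(x)	(((unsigned long)(x) + (PAGE_SIZE - 1)) & ~(PAGE_SIZE - 1))
#define PMD_ALIGN(x)	(((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK)

int main(void)
{
	/* _etext from the kallsyms dump above */
	unsigned long etext = 0xffffffff82202a08UL;
	/* first whole page after static kernel text */
	unsigned long start = PFN_ALIGN(etext);
	/* end of the shared 2MB page */
	unsigned long end = PMD_ALIGN(etext);

	printf("tail: 0x%lx-0x%lx, %lu kB available\n",
	       start, end, (end - start) >> 10);
	return 0;
}

This prints a tail of 0xffffffff82203000-0xffffffff82400000, about 2036 kB,
which is why xfs's ~1.3MB of text (plus the BPF programs) fits.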

>
> I don't grok mm stuff, but I'd like to understand why we gain the ability
> to re-use the same 2 MiB page with this patch; from the code I really
> can't tell. Any pointers?

I don't quite follow the question here. In this case, we allocate one more
2MB page so that some static kernel text can use it, and share it with
dynamic kernel text (modules and BPF programs). Does this answer your
question?
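
Concretely, with the addresses from the example in the patch, the shared
page ends up looking roughly like this (not to scale):

ffffffff82200000 +---------------------------+ \
                 | static kernel text        |  |
ffffffff82202a08 | _etext                    |  |
ffffffff82203000 +---------------------------+  | one 2MB page, mapped
                 | tail, registered by       |  | as a single RO+X PSE
                 | register_text_tail_vm():  |  | entry
                 | BPF programs, xfs text    |  |
ffffffff82400000 +---------------------------+ /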

I am working on a newer version of this and plan to resend once it is
stable for BPF programs. For modules, I think we will need more discussion
about the interface with the arch code.

Thanks,
Song

>
> But I'm still concerned about the freeing case in terms of fragmentation
> of contiguous memory, when free huge pages are available.
>
> Luis
>
>> ---
>> arch/x86/mm/init_64.c | 3 ++-
>> mm/vmalloc.c | 27 +++++++++++++++++++++++++++
>> 2 files changed, 29 insertions(+), 1 deletion(-)
>>
>> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
>> index 39c5246964a9..d27d0af5beb5 100644
>> --- a/arch/x86/mm/init_64.c
>> +++ b/arch/x86/mm/init_64.c
>> @@ -1367,12 +1367,13 @@ int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
>>
>> int kernel_set_to_readonly;
>>
>> +#define PMD_ALIGN(x) (((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK)
>> void mark_rodata_ro(void)
>> {
>> unsigned long start = PFN_ALIGN(_text);
>> unsigned long rodata_start = PFN_ALIGN(__start_rodata);
>> unsigned long end = (unsigned long)__end_rodata_hpage_align;
>> - unsigned long text_end = PFN_ALIGN(_etext);
>> + unsigned long text_end = PMD_ALIGN(_etext);
>> unsigned long rodata_end = PFN_ALIGN(__end_rodata);
>> unsigned long all_end;
>>
>> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
>> index 472287e71bf1..5f3b5df9313f 100644
>> --- a/mm/vmalloc.c
>> +++ b/mm/vmalloc.c
>> @@ -72,6 +72,11 @@ early_param("nohugevmalloc", set_nohugevmalloc);
>> static const bool vmap_allow_huge = false;
>> #endif /* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
>>
>> +#define PMD_ALIGN(x) (((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK)
>> +
>> +static struct vm_struct text_tail_vm;
>> +static struct vmap_area text_tail_va;
>> +
>> bool is_vmalloc_addr(const void *x)
>> {
>> unsigned long addr = (unsigned long)kasan_reset_tag(x);
>> @@ -634,6 +639,8 @@ int is_vmalloc_or_module_addr(const void *x)
>> unsigned long addr = (unsigned long)kasan_reset_tag(x);
>> if (addr >= MODULES_VADDR && addr < MODULES_END)
>> return 1;
>> + if (addr >= text_tail_va.va_start && addr < text_tail_va.va_end)
>> + return 1;
>> #endif
>> return is_vmalloc_addr(x);
>> }
>> @@ -2371,6 +2378,25 @@ static void vmap_init_free_space(void)
>> }
>> }
>>
>> +static void register_text_tail_vm(void)
>> +{
>> + unsigned long start = PFN_ALIGN(_etext);
>> + unsigned long end = PMD_ALIGN(_etext);
>> + struct vmap_area *va;
>> +
>> + va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
>> + if (WARN_ON_ONCE(!va))
>> + return;
>> + text_tail_vm.addr = (void *)start;
>> + text_tail_vm.size = end - start;
>> + text_tail_vm.flags = VM_KERNEL_EXEC;
>> + text_tail_va.va_start = start;
>> + text_tail_va.va_end = end;
>> + text_tail_va.vm = &text_tail_vm;
>> + memcpy(va, &text_tail_va, sizeof(*va));
>> + insert_vmap_area(va, &free_text_area_root, &free_text_area_list);
>> +}
>> +
>> void __init vmalloc_init(void)
>> {
>> struct vmap_area *va;
>> @@ -2381,6 +2407,7 @@ void __init vmalloc_init(void)
>> * Create the cache for vmap_area objects.
>> */
>> vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
>> + register_text_tail_vm();
>>
>> for_each_possible_cpu(i) {
>> struct vmap_block_queue *vbq;
>> --
>> 2.30.2
>>