Re: [RFC v2 4/4] vmalloc_exec: share a huge page with kernel text

From: Edgecombe, Rick P
Date: Mon Oct 10 2022 - 14:32:25 EST


On Fri, 2022-10-07 at 16:43 -0700, Song Liu wrote:
> On x86 kernel, we allocate 2MB pages for kernel text up to
> round_down(_etext, 2MB). Therefore, some of the kernel text is still
> on 4kB pages. With vmalloc_exec, we can allocate 2MB pages up to
> round_up(_etext, 2MB), and use the rest of the page for modules and
> BPF programs.
>
> Here is an example:
>
> [root@eth50-1 ~]# grep _etext /proc/kallsyms
> ffffffff82202a08 T _etext
>
> [root@eth50-1 ~]# grep bpf_prog_ /proc/kallsyms | tail -n 3
> ffffffff8220f920 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup [bpf]
> ffffffff8220fa28 t bpf_prog_cc61a5364ac11d93_handle__sched_wakeup_new [bpf]
> ffffffff8220fad4 t bpf_prog_3bf73fa16f5e3d92_handle__sched_switch [bpf]
>
> [root@eth50-1 ~]# grep 0xffffffff82200000 /sys/kernel/debug/page_tables/kernel
> 0xffffffff82200000-0xffffffff82400000 2M ro PSE x pmd
>
> [root@eth50-1 ~]# grep xfs_flush_inodes /proc/kallsyms
> ffffffff822ba910 t xfs_flush_inodes_worker [xfs]
> ffffffff822bc580 t xfs_flush_inodes [xfs]
>
> ffffffff82200000-ffffffff82400000 is a 2MB page, serving kernel text,
> the xfs module, and bpf programs.

Can this memory range be freed as part of a vfree_exec() call then?
Does vmalloc actually try to unmap it? If so, it could get complicated
with PTI.

It probably should be a special case that never gets fully freed.

>
> Signed-off-by: Song Liu <song@xxxxxxxxxx>
> ---
> arch/x86/mm/init_64.c | 3 ++-
> mm/vmalloc.c | 24 ++++++++++++++++++++++++
> 2 files changed, 26 insertions(+), 1 deletion(-)
>
> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> index 0fe690ebc269..d94f196c541a 100644
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -1367,12 +1367,13 @@ int __init deferred_page_init_max_threads(const struct cpumask *node_cpumask)
>
> int kernel_set_to_readonly;
>
> +#define PMD_ALIGN(x) (((unsigned long)(x) + (PMD_SIZE - 1)) & PMD_MASK)
> void mark_rodata_ro(void)
> {
> unsigned long start = PFN_ALIGN(_text);
> unsigned long rodata_start = PFN_ALIGN(__start_rodata);
> unsigned long end = (unsigned long)__end_rodata_hpage_align;
> - unsigned long text_end = PFN_ALIGN(_etext);
> + unsigned long text_end = PMD_ALIGN(_etext);

This should probably have more logic and adjustments. If _etext is
already PMD aligned, some of the code outside this diff won't do anything.

Also, if a kernel doesn't have modules or the BPF JIT, the extra space
up to the PMD boundary would be a waste of memory.

> unsigned long rodata_end = PFN_ALIGN(__end_rodata);
> unsigned long all_end;
>
> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> index 9212ff96b871..41509bbec583 100644
> --- a/mm/vmalloc.c
> +++ b/mm/vmalloc.c
> @@ -75,6 +75,9 @@ static const bool vmap_allow_huge = false;
> #define PMD_ALIGN(addr) ALIGN(addr, PMD_SIZE)
> #define PMD_ALIGN_DOWN(addr) ALIGN_DOWN(addr, PMD_SIZE)
>
> +static struct vm_struct text_tail_vm;
> +static struct vmap_area text_tail_va;
> +
> bool is_vmalloc_addr(const void *x)
> {
> unsigned long addr = (unsigned long)kasan_reset_tag(x);
> @@ -637,6 +640,8 @@ int is_vmalloc_or_module_addr(const void *x)
> unsigned long addr = (unsigned long)kasan_reset_tag(x);
> if (addr >= MODULES_VADDR && addr < MODULES_END)
> return 1;
> + if (addr >= text_tail_va.va_start && addr < text_tail_va.va_end)
> + return 1;
> #endif
> return is_vmalloc_addr(x);
> }
> @@ -2422,6 +2427,24 @@ static void vmap_init_free_space(void)
> }
> }
>
> +static void register_text_tail_vm(void)
> +{
> + unsigned long start = PFN_ALIGN((unsigned long)_etext);
> + unsigned long end = PMD_ALIGN((unsigned long)_etext);
> + struct vmap_area *va;
> +
> + va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
> + if (WARN_ON_ONCE(!va))
> + return;
> + text_tail_vm.addr = (void *)start;
> + text_tail_vm.size = end - start;
> + text_tail_va.va_start = start;
> + text_tail_va.va_end = end;
> + text_tail_va.vm = &text_tail_vm;
> + memcpy(va, &text_tail_va, sizeof(*va));
> + insert_vmap_area_augment(va, NULL, &free_text_area_root, &free_text_area_list);
> +}
> +
> void __init vmalloc_init(void)
> {
> struct vmap_area *va;
> @@ -2432,6 +2455,7 @@ void __init vmalloc_init(void)
> * Create the cache for vmap_area objects.
> */
> vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
> + register_text_tail_vm();
>
> for_each_possible_cpu(i) {
> struct vmap_block_queue *vbq;