Re: [RFC 1/5] vmalloc: introduce vmalloc_exec and vfree_exec

From: Song Liu
Date: Fri Oct 07 2022 - 02:39:39 EST

> On Oct 6, 2022, at 4:15 PM, Luis Chamberlain <mcgrof@xxxxxxxxxx> wrote:
>
> On Thu, Aug 18, 2022 at 03:42:14PM -0700, Song Liu wrote:
>> --- a/mm/nommu.c
>> +++ b/mm/nommu.c
>> @@ -372,6 +372,13 @@ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
>> }
>> EXPORT_SYMBOL(vm_map_pages_zero);
>>
>> +void *vmalloc_exec(unsigned long size, unsigned long align)
>> +{
>> +	return NULL;
>> +}
>
> Well that's not so nice for no-mmu systems. Shouldn't we have a
> fallback?

This is still an early version, so I am not quite sure whether we
need a fallback for no-MMU systems.
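If we do add one, a minimal sketch (untested) could simply fall back
to plain vmalloc(), since without an MMU there are no page
permissions to flip anyway:

void *vmalloc_exec(unsigned long size, unsigned long align)
{
	/*
	 * No-MMU: memory is not mapped with restrictive
	 * permissions, so a plain allocation is already
	 * executable. Alignment beyond what vmalloc()
	 * provides is ignored in this sketch.
	 */
	return vmalloc(size);
}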

>
>> diff --git a/mm/vmalloc.c b/mm/vmalloc.c
>> index effd1ff6a4b4..472287e71bf1 100644
>> --- a/mm/vmalloc.c
>> +++ b/mm/vmalloc.c
>> @@ -1583,9 +1592,17 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
>> 	va->va_end = addr + size;
>> 	va->vm = NULL;
>>
>> -	spin_lock(&vmap_area_lock);
>> -	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
>> -	spin_unlock(&vmap_area_lock);
>> +	if (vm_flags & VM_KERNEL_EXEC) {
>> +		spin_lock(&free_text_area_lock);
>> +		insert_vmap_area(va, &free_text_area_root, &free_text_area_list);
>> +		/* update subtree_max_size now as we need this soon */
>> +		augment_tree_propagate_from(va);
>
> Sorry, it is not clear to me why it's needed only for exec. Can you
> elaborate a bit more?

This version was wrong; we should use insert_vmap_area_augment()
here. I changed this in the latest version.
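Roughly, the latest version does something like this instead (a
sketch from memory, details may differ):

if (vm_flags & VM_KERNEL_EXEC) {
	spin_lock(&free_text_area_lock);
	/* insert_vmap_area_augment() keeps subtree_max_size up to date */
	insert_vmap_area_augment(va, NULL, &free_text_area_root,
				 &free_text_area_list);
	spin_unlock(&free_text_area_lock);
}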

>
>> +		spin_unlock(&free_text_area_lock);
>> +	} else {
>> +		spin_lock(&vmap_area_lock);
>> +		insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
>> +		spin_unlock(&vmap_area_lock);
>> +	}
>>
>> 	BUG_ON(!IS_ALIGNED(va->va_start, align));
>> 	BUG_ON(va->va_start < vstart);
>
> <-- snip -->
>
>> @@ -3265,6 +3282,97 @@ void *vmalloc(unsigned long size)
>> }
>> EXPORT_SYMBOL(vmalloc);
>>
>> +void *vmalloc_exec(unsigned long size, unsigned long align)
>> +{
>> +	struct vmap_area *va, *tmp;
>> +	unsigned long addr;
>> +	enum fit_type type;
>> +	int ret;
>> +
>> +	va = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, NUMA_NO_NODE);
>> +	if (unlikely(!va))
>> +		return ERR_PTR(-ENOMEM);
>> +
>> +again:
>> +	preload_this_cpu_lock(&free_text_area_lock, GFP_KERNEL, NUMA_NO_NODE);
>> +	tmp = find_vmap_lowest_match(free_text_area_root.rb_node,
>> +				     size, align, 1, false);
>> +
>> +	if (!tmp) {
>> +		unsigned long alloc_size;
>> +		void *ptr;
>> +
>> +		spin_unlock(&free_text_area_lock);
>> +
>> +		alloc_size = roundup(size, PMD_SIZE * num_online_nodes());
>> +		ptr = __vmalloc_node_range(alloc_size, PMD_SIZE, MODULES_VADDR,
>> +					   MODULES_END, GFP_KERNEL, PAGE_KERNEL,
>> +					   VM_KERNEL_EXEC | VM_ALLOW_HUGE_VMAP | VM_NO_GUARD,
>> +					   NUMA_NO_NODE, __builtin_return_address(0));
>
> We can review the guard stuff on the other thread with Peter.
>
>> +		if (unlikely(!ptr)) {
>> +			ret = -ENOMEM;
>> +			goto err_out;
>> +		}
>> +		memset(ptr, 0, alloc_size);
>> +		set_memory_ro((unsigned long)ptr, alloc_size >> PAGE_SHIFT);
>> +		set_memory_x((unsigned long)ptr, alloc_size >> PAGE_SHIFT);
>
> I *really* like that this is now not something users have to muck with, thanks!

Well, this pushes some of the complexity to the user side instead,
for example all the text_poke hacks in 3/5.
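For example, since the area comes back read-only, the user cannot
memcpy() into it any more and has to go through the text poking
interface, roughly like this (x86 sketch; insns and size are just
placeholders):

void *image = vmalloc_exec(size, align);

if (!IS_ERR(image))
	/* a plain memcpy() would fault on the RO mapping */
	text_poke_copy(image, insns, size);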

>
>> +
>> +		goto again;
>> +	}
>> +
>> +	addr = roundup(tmp->va_start, align);
>> +	type = classify_va_fit_type(tmp, addr, size);
>> +	if (WARN_ON_ONCE(type == NOTHING_FIT)) {
>> +		addr = -ENOMEM;
>> +		goto err_out;
>> +	}
>> +
>> +	ret = adjust_va_to_fit_type(&free_text_area_root, &free_text_area_list,
>> +				    tmp, addr, size, type);
>> +	if (ret) {
>> +		addr = ret;
>> +		goto err_out;
>> +	}
>> +	spin_unlock(&free_text_area_lock);
>> +
>> +	va->va_start = addr;
>> +	va->va_end = addr + size;
>> +	va->vm = tmp->vm;
>> +
>> +	spin_lock(&vmap_area_lock);
>> +	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
>> +	spin_unlock(&vmap_area_lock);
>> +
>> +	return (void *)addr;
>> +
>> +err_out:
>> +	spin_unlock(&free_text_area_lock);
>> +	return ERR_PTR(ret);
>> +}
>> +
>> +void vfree_exec(const void *addr)
>> +{
>> +	struct vmap_area *va;
>> +
>> +	might_sleep();
>> +
>> +	spin_lock(&vmap_area_lock);
>> +	va = __find_vmap_area((unsigned long)addr, vmap_area_root.rb_node);
>> +	if (WARN_ON_ONCE(!va)) {
>> +		spin_unlock(&vmap_area_lock);
>> +		return;
>> +	}
>> +
>> +	unlink_va(va, &vmap_area_root);
>
> Curious why we don't memset to 0 before merge_or_add_vmap_area_augment()?
> I realize other code doesn't seem to do it, though.

We should do the memset here. Since the mapping is read-only, we
will need the text_poke version of it.
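Something like the following (a sketch, using the x86 text_poke_set()
helper) before the area goes back to the free tree:

/* a plain memset() would fault on the RO mapping */
text_poke_set((void *)va->va_start, 0, va->va_end - va->va_start);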

>
>> +	spin_unlock(&vmap_area_lock);
>> +
>> +	spin_lock(&free_text_area_lock);
>> +	merge_or_add_vmap_area_augment(va,
>> +			&free_text_area_root, &free_text_area_list);
>
> I have a concern that we can be using precious physically contiguous
> memory from huge pages to then end up in a situation where we create
> our own pool and allow things to be non-contiguous afterwards.
>
> I'm starting to suspect that if the allocation is > PAGE_SIZE we just
> give it back generally. Otherwise wouldn't the fragmentation cause us
> to eventually just eat up most huge pages available? Probably not for
> eBPF, but if we use this on a system with tons of module insertions /
> deletions this seems like it could happen?

Currently, bpf_prog_pack doesn't let allocations larger than PMD_SIZE
share with smaller allocations, which I guess is similar to the idea
here. I am not sure what the proper threshold for modules would be;
we can discuss this later.
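For reference, bpf_prog_pack_alloc() does roughly the following
(paraphrased from kernel/bpf/core.c, details may differ):

if (size > BPF_PROG_PACK_SIZE) {
	/* too big to share a pack with small programs:
	 * round up and give it a dedicated allocation
	 */
	size = round_up(size, PAGE_SIZE);
	return module_alloc(size);
}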

Thanks,
Song

>
> Luis
>
>> +	spin_unlock(&free_text_area_lock);
>> +	/* TODO: when the whole vm_struct is not in use, free it */
>> +}
>> +
>> /**
>>  * vmalloc_huge - allocate virtually contiguous memory, allow huge pages
>>  * @size: allocation size
>> @@ -3851,7 +3959,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
>> 			/* It is a BUG(), but trigger recovery instead. */
>> 			goto recovery;
>>
>> -		ret = adjust_va_to_fit_type(va, start, size, type);
>> +		ret = adjust_va_to_fit_type(&free_vmap_area_root, &free_vmap_area_list,
>> +					    va, start, size, type);
>> 		if (unlikely(ret))
>> 			goto recovery;
>>
>> --
>> 2.30.2
>>