Re: [RFC PATCH v2 1/1] mm/vmap: keep track of free blocks for vmap allocation

From: Uladzislau Rezki
Date: Mon Mar 25 2019 - 13:20:25 EST


Hello, Roman!

> Hi, Uladzislau!
>
> Definitely a clever idea and very promising numbers!
>
> The overall approach makes total sense to me.
>
> I tried to go through the code, but it was a bit challenging.
> I wonder, if you can split it into smaller parts to simplify the review?
>
I appreciate that you spent your time on it. Thank you :)

> Idk how easy is to split the core (maybe the area merging code can be
> separated?), but at least the optionally compiled debug code can
> be moved into separate patches.
>
At least the debug code should be split out, I totally agree with you. As for
the "merging" part, we can split it, but then the intermediate patches will be
kind of broken: they will not work on their own, because one part/patch depends
on another.

Another approach is to make "prepare" patches plus one final patch that switches
over and removes the old code. But I am not sure whether that is worth it or not.

> Some small nits/questions below.
>
> >
> > Signed-off-by: Uladzislau Rezki (Sony) <urezki@xxxxxxxxx>
> > ---
> > include/linux/vmalloc.h | 6 +-
> > mm/vmalloc.c | 1109 ++++++++++++++++++++++++++++++++++++-----------
> > 2 files changed, 871 insertions(+), 244 deletions(-)
> >
> > diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
> > index 398e9c95cd61..ad483378fdd1 100644
> > --- a/include/linux/vmalloc.h
> > +++ b/include/linux/vmalloc.h
> > @@ -45,12 +45,16 @@ struct vm_struct {
> > struct vmap_area {
> > unsigned long va_start;
> > unsigned long va_end;
> > +
> > + /*
> > + * Largest available free size in subtree.
> > + */
> > + unsigned long subtree_max_size;
> > unsigned long flags;
> > struct rb_node rb_node; /* address sorted rbtree */
> > struct list_head list; /* address sorted list */
> > struct llist_node purge_list; /* "lazy purge" list */
> > struct vm_struct *vm;
> > - struct rcu_head rcu_head;
> > };
> >
> > /*
> > diff --git a/mm/vmalloc.c b/mm/vmalloc.c
> > index 755b02983d8d..29e9786299cf 100644
> > --- a/mm/vmalloc.c
> > +++ b/mm/vmalloc.c
> > @@ -31,6 +31,7 @@
> > #include <linux/compiler.h>
> > #include <linux/llist.h>
> > #include <linux/bitops.h>
> > +#include <linux/rbtree_augmented.h>
> >
> > #include <linux/uaccess.h>
> > #include <asm/tlbflush.h>
> > @@ -320,8 +321,9 @@ unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
> > }
> > EXPORT_SYMBOL(vmalloc_to_pfn);
> >
> > -
> > /*** Global kva allocator ***/
> > +#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
> > +#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
> >
> > #define VM_LAZY_FREE 0x02
> > #define VM_VM_AREA 0x04
> > @@ -331,14 +333,76 @@ static DEFINE_SPINLOCK(vmap_area_lock);
> > LIST_HEAD(vmap_area_list);
> > static LLIST_HEAD(vmap_purge_list);
> > static struct rb_root vmap_area_root = RB_ROOT;
> > +static bool vmap_initialized __read_mostly;
> > +
> > +/*
> > + * This kmem_cache is used for vmap_area objects. Instead of
> > + * allocating from slab we reuse an object from this cache to
> > + * make things faster. Especially in "no edge" splitting of
> > + * free block.
> > + */
> > +static struct kmem_cache *vmap_area_cachep;
> > +
> > +/*
> > + * This linked list is used in pair with free_vmap_area_root.
> > + * It gives O(1) access to prev/next to perform fast coalescing.
> > + */
> > +static LIST_HEAD(free_vmap_area_list);
> > +
> > +/*
> > + * This augment red-black tree represents the free vmap space.
> > + * All vmap_area objects in this tree are sorted by va->va_start
> > + * address. It is used for allocation and merging when a vmap
> > + * object is released.
> > + *
> > + * Each vmap_area node contains a maximum available free block
> > + * of its sub-tree, right or left. Therefore it is possible to
> > + * find a lowest match of free area.
> > + */
> > +static struct rb_root free_vmap_area_root = RB_ROOT;
> > +
> > +static inline unsigned long
> > +__va_size(struct vmap_area *va)
> > +{
> > + return (va->va_end - va->va_start);
> > +}
> > +
> > +static unsigned long
> > +get_subtree_max_size(struct rb_node *node)
> > +{
> > + struct vmap_area *va;
> > +
> > + va = rb_entry_safe(node, struct vmap_area, rb_node);
> > + return va ? va->subtree_max_size : 0;
> > +}
> > +
> > +/*
> > + * Gets called when remove the node and rotate.
> > + */
> > +static unsigned long
> > +compute_subtree_max_size(struct vmap_area *va)
> > +{
> > + unsigned long max_size = __va_size(va);
> > + unsigned long child_max_size;
> > +
> > + child_max_size = get_subtree_max_size(va->rb_node.rb_right);
> > + if (child_max_size > max_size)
> > + max_size = child_max_size;
> >
> > -/* The vmap cache globals are protected by vmap_area_lock */
> > -static struct rb_node *free_vmap_cache;
> > -static unsigned long cached_hole_size;
> > -static unsigned long cached_vstart;
> > -static unsigned long cached_align;
> > + child_max_size = get_subtree_max_size(va->rb_node.rb_left);
> > + if (child_max_size > max_size)
> > + max_size = child_max_size;
> >
> > -static unsigned long vmap_area_pcpu_hole;
> > + return max_size;
> > +}
> > +
> > +RB_DECLARE_CALLBACKS(static, free_vmap_area_rb_augment_cb,
> > + struct vmap_area, rb_node, unsigned long, subtree_max_size,
> > + compute_subtree_max_size)
> > +
> > +static void purge_vmap_area_lazy(void);
> > +static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
> > +static unsigned long lazy_max_pages(void);
> >
> > static struct vmap_area *__find_vmap_area(unsigned long addr)
> > {
> > @@ -359,41 +423,623 @@ static struct vmap_area *__find_vmap_area(unsigned long addr)
> > return NULL;
> > }
> >
> > -static void __insert_vmap_area(struct vmap_area *va)
> > +/*
> > + * This function returns back addresses of parent node
> > + * and its left or right link for further processing.
> > + */
> > +static inline void
> > +__find_va_links(struct vmap_area *va,
> > + struct rb_root *root, struct rb_node *from,
> > + struct rb_node **parent, struct rb_node ***link)
>
> This function returns a pointer to parent, and it doesn't use
> the initial value of (*parent). Can't it just return it?
> I mean something like:
>
> static struct rb_node *__find_va_links(struct vmap_area *va,
> struct rb_root *root, struct rb_node *from,
> struct rb_node ***link) { ... }
>
> It would simplify things a bit.
Yes, we can do that. I do not see any issue with returning "parent" instead.

>
> Also this triple pointer looks scary.
> If it returns a parent and one of two children, can't it return
> a desired child instead? Or it can be NULL with a non-NULL parent?
>
We use a triple pointer because we want to return to the caller the address of
either the rb_left or the rb_right pointer of the "parent" node, so that a new
rb_node can be bound to the tree. parent->rb_left/parent->rb_right themselves
are NULL, whereas their addresses ("&") are not:

struct rb_node **p = &parent->rb_left;
*link = p;

<snip rbtree.h>
static inline void rb_link_node(struct rb_node *node,
				struct rb_node *parent, struct rb_node **rb_link)
	...
	*rb_link = node;
<snip rbtree.h>

I think we should simplify it by just returning the pointer-to-pointer ("link")
and keep returning "parent" the way it is done now:

static inline struct rb_node **
__find_va_links(struct vmap_area *va,
	struct rb_root *root, struct rb_node *from,
	struct rb_node **parent)
{
	struct vmap_area *tmp_va;
	struct rb_node **link;

	if (root) {
		link = &root->rb_node;
		if (unlikely(!*link)) {
			*parent = NULL;
			return link;
		}
	} else {
		link = &from;
	}

	/*
	 * Go to the bottom of the tree.
	 */
	do {
		tmp_va = rb_entry(*link, struct vmap_area, rb_node);

		/*
		 * During the traversal we also do some sanity check.
		 * Trigger the BUG() if there are sides(left/right)
		 * or full overlaps.
		 */
		if (va->va_start < tmp_va->va_end &&
				va->va_end <= tmp_va->va_start)
			link = &(*link)->rb_left;
		else if (va->va_end > tmp_va->va_start &&
				va->va_start >= tmp_va->va_end)
			link = &(*link)->rb_right;
		else
			BUG();
	} while (*link);

	*parent = &tmp_va->rb_node;
	return link;
}

What do you think?
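
For illustration, a caller such as __insert_vmap_area() from this patch would
then become something like below (just a sketch adapted to the prototype above,
behavior unchanged):

static void
__insert_vmap_area(struct vmap_area *va,
	struct rb_root *root, struct list_head *head)
{
	struct rb_node **link;
	struct rb_node *parent;

	link = __find_va_links(va, root, NULL, &parent);
	__link_va(va, root, parent, link, head);
}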

> > {
> > - struct rb_node **p = &vmap_area_root.rb_node;
> > - struct rb_node *parent = NULL;
> > - struct rb_node *tmp;
> > + struct vmap_area *tmp_va;
> >
> > - while (*p) {
> > - struct vmap_area *tmp_va;
> > + if (root) {
> > + *link = &root->rb_node;
> > + if (unlikely(!**link)) {
> > + *parent = NULL;
> > + return;
> > + }
> > + } else {
> > + *link = &from;
> > + }
> >
> > - parent = *p;
> > - tmp_va = rb_entry(parent, struct vmap_area, rb_node);
> > - if (va->va_start < tmp_va->va_end)
> > - p = &(*p)->rb_left;
> > - else if (va->va_end > tmp_va->va_start)
> > - p = &(*p)->rb_right;
> > + /*
> > + * Go to the bottom of the tree.
> > + */
> > + do {
> > + tmp_va = rb_entry(**link, struct vmap_area, rb_node);
> > +
> > + /*
> > + * During the traversal we also do some sanity check.
> > + * Trigger the BUG() if there are sides(left/right)
> > + * or full overlaps.
> > + */
> > + if (va->va_start < tmp_va->va_end &&
> > + va->va_end <= tmp_va->va_start)
> > + *link = &(**link)->rb_left;
> > + else if (va->va_end > tmp_va->va_start &&
> > + va->va_start >= tmp_va->va_end)
> > + *link = &(**link)->rb_right;
> > else
> > BUG();
> > + } while (**link);
> > +
> > + *parent = &tmp_va->rb_node;
> > +}
> > +
> > +static inline void
> > +__find_va_free_siblings(struct rb_node *parent, struct rb_node **link,
> > + struct list_head **prev, struct list_head **next)
> > +{
> > + struct list_head *list;
> > +
> > + if (likely(parent)) {
> > + list = &rb_entry(parent, struct vmap_area, rb_node)->list;
> > + if (&parent->rb_right == link) {
> > + *next = list->next;
> > + *prev = list;
> > + } else {
> > + *prev = list->prev;
> > + *next = list;
>
> So, does it mean that this function always returns two following elements?
> Can't it return a single element using the return statement instead?
> The second one can be calculated as ->next?
>
Yes, they follow each other, and if you return "prev", for example, you can
easily refer to "next". But you will need to access "next" anyway. I would
rather keep the current implementation, because it is strictly clear what the
function returns when you look at it.

But if there are some objections and we can simplify it, let's discuss :)
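
For the sake of discussion, the variant you describe could look roughly like
below (only a sketch, not what the patch does; it relies on the fact that, the
way the siblings are picked, "next" is always prev->next):

static inline struct list_head *
__find_va_free_siblings(struct rb_node *parent, struct rb_node **link)
{
	struct list_head *list;

	/*
	 * An empty free tree means an empty free list, so the caller
	 * ends up with prev == next == &free_vmap_area_list.
	 */
	if (unlikely(!parent))
		return &free_vmap_area_list;

	list = &rb_entry(parent, struct vmap_area, rb_node)->list;
	return (&parent->rb_right == link) ? list : list->prev;
}

with callers doing:

	prev = __find_va_free_siblings(parent, link);
	next = prev->next;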

> > + }
> > + } else {
> > + /*
> > + * The red-black tree where we try to find VA neighbors
> > + * before merging or inserting is empty, i.e. it means
> > + * there is no free vmap space. Normally it does not
> > + * happen but we handle this case anyway.
> > + */
> > + *prev = *next = &free_vmap_area_list;
>
> And for example, return NULL in this case.
>
Then we would need to check in __merge_or_add_vmap_area() that next/prev are
not NULL and not the list head. But I do not like the current implementation
either, since it is hardcoded to a specific list head.
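
Just to show what I mean, with a NULL return for the empty-tree case the merge
path would need checks along these lines (a sketch only, merging bodies omitted):

	if (next && next != head) {
		sibling = list_entry(next, struct vmap_area, list);
		if (sibling->va_start == va->va_end) {
			/* merge with the next free area, as in the patch */
		}
	}

	if (prev && prev != head) {
		sibling = list_entry(prev, struct vmap_area, list);
		if (sibling->va_end == va->va_start) {
			/* merge with the previous free area, as in the patch */
		}
	}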

> > +}
> >
> > - rb_link_node(&va->rb_node, parent, p);
> > - rb_insert_color(&va->rb_node, &vmap_area_root);
> > +static inline void
> > +__link_va(struct vmap_area *va, struct rb_root *root,
> > + struct rb_node *parent, struct rb_node **link, struct list_head *head)
> > +{
> > + /*
> > + * VA is still not in the list, but we can
> > + * identify its future previous list_head node.
> > + */
> > + if (likely(parent)) {
> > + head = &rb_entry(parent, struct vmap_area, rb_node)->list;
> > + if (&parent->rb_right != link)
> > + head = head->prev;
> > + }
> >
> > - /* address-sort this list */
> > - tmp = rb_prev(&va->rb_node);
> > - if (tmp) {
> > - struct vmap_area *prev;
> > - prev = rb_entry(tmp, struct vmap_area, rb_node);
> > - list_add_rcu(&va->list, &prev->list);
> > - } else
> > - list_add_rcu(&va->list, &vmap_area_list);
> > + /* Insert to the rb-tree */
> > + rb_link_node(&va->rb_node, parent, link);
> > + if (root == &free_vmap_area_root) {
> > + /*
> > + * Some explanation here. Just perform simple insertion
> > + * to the tree. We do not set va->subtree_max_size to
> > + * its current size before calling rb_insert_augmented().
> > + * It is because of we populate the tree from the bottom
> > + * to parent levels when the node _is_ in the tree.
> > + *
> > + * Therefore we set subtree_max_size to zero after insertion,
> > + * to let __augment_tree_propagate_from() puts everything to
> > + * the correct order later on.
> > + */
> > + rb_insert_augmented(&va->rb_node,
> > + root, &free_vmap_area_rb_augment_cb);
> > + va->subtree_max_size = 0;
> > + } else {
> > + rb_insert_color(&va->rb_node, root);
> > + }
> > +
> > + /* Address-sort this list */
> > + list_add(&va->list, head);
> > }
> >
> > -static void purge_vmap_area_lazy(void);
> > +static inline void
> > +__unlink_va(struct vmap_area *va, struct rb_root *root)
> > +{
> > + /*
> > + * During merging a VA node can be empty, therefore
> > + * not linked with the tree nor list. Just check it.
> > + */
> > + if (!RB_EMPTY_NODE(&va->rb_node)) {
> > + if (root == &free_vmap_area_root)
> > + rb_erase_augmented(&va->rb_node,
> > + root, &free_vmap_area_rb_augment_cb);
> > + else
> > + rb_erase(&va->rb_node, root);
> >
> > -static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
> > + list_del(&va->list);
> > + }
> > +}
> > +
> > +#if DEBUG_AUGMENT_PROPAGATE_CHECK
> > +static void
> > +augment_tree_propagate_do_check(struct rb_node *n)
> > +{
> > + struct vmap_area *va;
> > + struct rb_node *node;
> > + unsigned long size;
> > + bool found = false;
> > +
> > + if (n == NULL)
> > + return;
> > +
> > + va = rb_entry(n, struct vmap_area, rb_node);
> > + size = va->subtree_max_size;
> > + node = n;
> > +
> > + while (node) {
> > + va = rb_entry(node, struct vmap_area, rb_node);
> > +
> > + if (get_subtree_max_size(node->rb_left) == size) {
> > + node = node->rb_left;
> > + } else {
> > + if (__va_size(va) == size) {
> > + found = true;
> > + break;
> > + }
> > +
> > + node = node->rb_right;
> > + }
> > + }
> > +
> > + if (!found) {
> > + va = rb_entry(n, struct vmap_area, rb_node);
> > + pr_emerg("tree is corrupted: %lu, %lu\n",
> > + __va_size(va), va->subtree_max_size);
> > + }
> > +
> > + augment_tree_propagate_do_check(n->rb_left);
> > + augment_tree_propagate_do_check(n->rb_right);
> > +}
> > +
> > +static void augment_tree_propagate_from_check(void)
> > +{
> > + augment_tree_propagate_do_check(free_vmap_area_root.rb_node);
> > +}
> > +#endif
> > +
> > +/*
> > + * This function populates subtree_max_size from bottom to upper
> > + * levels starting from VA point. The propagation must be done
> > + * when VA size is modified by changing its va_start/va_end. Or
> > + * in case of newly inserting of VA to the tree.
> > + *
> > + * It means that __augment_tree_propagate_from() must be called:
> > + * - After VA has been inserted to the tree(free path);
> > + * - After VA has been shrunk(allocation path);
> > + * - After VA has been increased(merging path).
> > + *
> > + * Please note that, it does not mean that upper parent nodes
> > + * and their subtree_max_size are recalculated all the time up
> > + * to the root node.
> > + *
> > + * 4--8
> > + * /\
> > + * / \
> > + * / \
> > + * 2--2 8--8
> > + *
> > + * For example if we modify the node 4, shrinking it to 2, then
> > + * no any modification is required. If we shrink the node 2 to 1
> > + * its subtree_max_size is updated only, and set to 1. If we shrink
> > + * the node 8 to 6, then its subtree_max_size is set to 6 and parent
> > + * node becomes 4--6.
> > + */
> > +static inline void
> > +__augment_tree_propagate_from(struct vmap_area *va)
> > +{
> > + struct rb_node *node = &va->rb_node;
> > + unsigned long new_va_sub_max_size;
> > +
> > + while (node) {
> > + va = rb_entry(node, struct vmap_area, rb_node);
> > + new_va_sub_max_size = compute_subtree_max_size(va);
> > +
> > + /*
> > + * If the newly calculated maximum available size of the
> > + * subtree is equal to the current one, then it means that
> > + * the tree is propagated correctly. So we have to stop at
> > + * this point to save cycles.
> > + */
> > + if (va->subtree_max_size == new_va_sub_max_size)
> > + break;
> > +
> > + va->subtree_max_size = new_va_sub_max_size;
> > + node = rb_parent(&va->rb_node);
> > + }
> > +
> > +#if DEBUG_AUGMENT_PROPAGATE_CHECK
> > + augment_tree_propagate_from_check();
> > +#endif
> > +}
> > +
> > +static void
> > +__insert_vmap_area(struct vmap_area *va,
> > + struct rb_root *root, struct list_head *head)
> > +{
> > + struct rb_node **link;
> > + struct rb_node *parent;
> > +
> > + __find_va_links(va, root, NULL, &parent, &link);
> > + __link_va(va, root, parent, link, head);
> > +}
> > +
> > +static void
> > +__insert_vmap_area_augment(struct vmap_area *va,
> > + struct rb_node *from, struct rb_root *root,
> > + struct list_head *head)
> > +{
> > + struct rb_node **link;
> > + struct rb_node *parent;
> > +
> > + if (from)
> > + __find_va_links(va, NULL, from, &parent, &link);
> > + else
> > + __find_va_links(va, root, NULL, &parent, &link);
> > +
> > + __link_va(va, root, parent, link, head);
> > + __augment_tree_propagate_from(va);
> > +}
> > +
> > +static inline void
> > +__remove_vmap_area_common(struct vmap_area *va,
> > + struct rb_root *root)
> > +{
> > + __unlink_va(va, root);
> > +}
> > +
> > +/*
> > + * Merge de-allocated chunk of VA memory with previous
> > + * and next free blocks. If coalesce is not done a new
> > + * free area is inserted. If VA has been merged, it is
> > + * freed.
> > + */
> > +static inline void
> > +__merge_or_add_vmap_area(struct vmap_area *va,
> > + struct rb_root *root, struct list_head *head)
> > +{
> > + struct vmap_area *sibling;
> > + struct list_head *next, *prev;
> > + struct rb_node **link;
> > + struct rb_node *parent;
> > + bool merged = false;
> > +
> > + /*
> > + * Find a place in the tree where VA potentially will be
> > + * inserted, unless it is merged with its sibling/siblings.
> > + */
> > + __find_va_links(va, root, NULL, &parent, &link);
> > +
> > + /*
> > + * Get next/prev nodes of VA to check if merging can be done.
> > + */
> > + __find_va_free_siblings(parent, link, &prev, &next);
> > +
> > + /*
> > + * start end
> > + * | |
> > + * |<------VA------>|<-----Next----->|
> > + * | |
> > + * start end
> > + */
> > + if (next != head) {
> > + sibling = list_entry(next, struct vmap_area, list);
> > + if (sibling->va_start == va->va_end) {
> > + sibling->va_start = va->va_start;
> > +
> > + /* Check and update the tree if needed. */
> > + __augment_tree_propagate_from(sibling);
> > +
> > + /* Remove this VA, it has been merged. */
> > + __remove_vmap_area_common(va, root);
> > +
> > + /* Free vmap_area object. */
> > + kmem_cache_free(vmap_area_cachep, va);
> > +
> > + /* Point to the new merged area. */
> > + va = sibling;
> > + merged = true;
> > + }
> > + }
> > +
> > + /*
> > + * start end
> > + * | |
> > + * |<-----Prev----->|<------VA------>|
> > + * | |
> > + * start end
> > + */
> > + if (prev != head) {
> > + sibling = list_entry(prev, struct vmap_area, list);
> > + if (sibling->va_end == va->va_start) {
> > + sibling->va_end = va->va_end;
> > +
> > + /* Check and update the tree if needed. */
> > + __augment_tree_propagate_from(sibling);
> > +
> > + /* Remove this VA, it has been merged. */
> > + __remove_vmap_area_common(va, root);
> > +
> > + /* Free vmap_area object. */
> > + kmem_cache_free(vmap_area_cachep, va);
> > +
> > + return;
> > + }
> > + }
> > +
> > + if (!merged) {
> > + __link_va(va, root, parent, link, head);
> > + __augment_tree_propagate_from(va);
> > + }
> > +}
> > +
> > +static inline bool
> > +is_within_this_va(struct vmap_area *va, unsigned long size,
> > + unsigned long align, unsigned long vstart)
> > +{
> > + unsigned long nva_start_addr;
> > +
> > + if (va->va_start > vstart)
> > + nva_start_addr = ALIGN(va->va_start, align);
> > + else
> > + nva_start_addr = ALIGN(vstart, align);
> > +
> > + /* Can be overflowed due to big size or alignment. */
> > + if (nva_start_addr + size < nva_start_addr ||
> > + nva_start_addr < vstart)
> > + return false;
> > +
> > + return (nva_start_addr + size <= va->va_end);
> > +}
> > +
> > +/*
> > + * Find the first free block(lowest start address) in the tree,
> > + * that will accomplish the request corresponding to passing
> > + * parameters.
> > + */
> > +static inline struct vmap_area *
> > +__find_vmap_lowest_match(unsigned long size,
> > + unsigned long align, unsigned long vstart)
> > +{
> > + struct vmap_area *va;
> > + struct rb_node *node;
> > + unsigned long length;
> > +
> > + /* Start from the root. */
> > + node = free_vmap_area_root.rb_node;
> > +
> > + /* Adjust the search size for alignment overhead. */
> > + length = size + align - 1;
> > +
> > + while (node) {
> > + va = rb_entry(node, struct vmap_area, rb_node);
> > +
> > + if (get_subtree_max_size(node->rb_left) >= length &&
> > + vstart < va->va_start) {
> > + node = node->rb_left;
> > + } else {
> > + if (is_within_this_va(va, size, align, vstart))
> > + return va;
> > +
> > + /*
> > + * Does not make sense to go deeper towards the right
> > + * sub-tree if it does not have a free block that is
> > + * equal or bigger to the requested search length.
> > + */
> > + if (get_subtree_max_size(node->rb_right) >= length) {
> > + node = node->rb_right;
> > + continue;
> > + }
> > +
> > + /*
> > + * OK. We roll back and find the fist right sub-tree,
> > + * that will satisfy the search criteria. It can happen
> > + * only once due to "vstart" restriction.
> > + */
> > + while ((node = rb_parent(node))) {
> > + va = rb_entry(node, struct vmap_area, rb_node);
> > + if (is_within_this_va(va, size, align, vstart))
> > + return va;
> > +
> > + if (get_subtree_max_size(node->rb_right) >= length &&
> > + vstart <= va->va_start) {
> > + node = node->rb_right;
> > + break;
> > + }
> > + }
> > + }
> > + }
> > +
> > + return NULL;
> > +}
> > +
> > +#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
> > +#include <linux/random.h>
> > +
> > +static struct vmap_area *
> > +__find_vmap_lowest_linear_match(unsigned long size,
> > + unsigned long align, unsigned long vstart)
> > +{
> > + struct vmap_area *va;
> > +
> > + list_for_each_entry(va, &free_vmap_area_list, list) {
> > + if (!is_within_this_va(va, size, align, vstart))
> > + continue;
> > +
> > + return va;
> > + }
> > +
> > + return NULL;
> > +}
> > +
> > +static void
> > +__find_vmap_lowest_match_check(unsigned long size)
> > +{
> > + struct vmap_area *va_1, *va_2;
> > + unsigned long vstart;
> > + unsigned int rnd;
> > +
> > + get_random_bytes(&rnd, sizeof(rnd));
> > + vstart = VMALLOC_START + rnd;
> > +
> > + va_1 = __find_vmap_lowest_match(size, 1, vstart);
> > + va_2 = __find_vmap_lowest_linear_match(size, 1, vstart);
> > +
> > + if (va_1 != va_2)
> > + pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
> > + va_1, va_2, vstart);
> > +}
> > +#endif
> > +
> > +enum alloc_fit_type {
> > + NOTHING_FIT = 0,
> > + FL_FIT_TYPE = 1, /* full fit */
> > + LE_FIT_TYPE = 2, /* left edge fit */
> > + RE_FIT_TYPE = 3, /* right edge fit */
> > + NE_FIT_TYPE = 4 /* no edge fit */
> > +};
> > +
> > +static inline u8
> > +__classify_va_fit_type(struct vmap_area *va,
> > + unsigned long nva_start_addr, unsigned long size)
> > +{
> > + u8 fit_type;
> > +
> > + /* Check if it is within VA. */
> > + if (nva_start_addr < va->va_start ||
> > + nva_start_addr + size > va->va_end)
> > + return NOTHING_FIT;
> > +
> > + /* Now classify. */
> > + if (va->va_start == nva_start_addr) {
> > + if (va->va_end == nva_start_addr + size)
> > + fit_type = FL_FIT_TYPE;
> > + else
> > + fit_type = LE_FIT_TYPE;
> > + } else if (va->va_end == nva_start_addr + size) {
> > + fit_type = RE_FIT_TYPE;
> > + } else {
> > + fit_type = NE_FIT_TYPE;
> > + }
> > +
> > + return fit_type;
> > +}
> > +
> > +static inline int
> > +__adjust_va_to_fit_type(struct vmap_area *va,
> > + unsigned long nva_start_addr, unsigned long size, u8 fit_type)
> > +{
> > + struct vmap_area *lva;
> > +
> > + if (fit_type == FL_FIT_TYPE) {
> > + /*
> > + * No need to split VA, it fully fits.
> > + *
> > + * | |
> > + * V NVA V
> > + * |---------------|
> > + */
> > + __remove_vmap_area_common(va, &free_vmap_area_root);
> > + kmem_cache_free(vmap_area_cachep, va);
> > + } else if (fit_type == LE_FIT_TYPE) {
> > + /*
> > + * Split left edge of fit VA.
> > + *
> > + * | |
> > + * V NVA V R
> > + * |-------|-------|
> > + */
> > + va->va_start += size;
> > + } else if (fit_type == RE_FIT_TYPE) {
> > + /*
> > + * Split right edge of fit VA.
> > + *
> > + * | |
> > + * L V NVA V
> > + * |-------|-------|
> > + */
> > + va->va_end = nva_start_addr;
> > + } else if (fit_type == NE_FIT_TYPE) {
> > + /*
> > + * Split no edge of fit VA.
> > + *
> > + * | |
> > + * L V NVA V R
> > + * |---|-------|---|
> > + */
> > + lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
> > + if (unlikely(!lva))
> > + return -1;
> > +
> > + /*
> > + * Build the remainder.
> > + */
> > + lva->va_start = va->va_start;
> > + lva->va_end = nva_start_addr;
> > +
> > + /*
> > + * Shrink this VA to remaining size.
> > + */
> > + va->va_start = nva_start_addr + size;
> > + } else {
> > + return -1;
> > + }
> > +
> > + if (fit_type != FL_FIT_TYPE) {
> > + __augment_tree_propagate_from(va);
> > +
> > + if (fit_type == NE_FIT_TYPE)
> > + __insert_vmap_area_augment(lva, &va->rb_node,
> > + &free_vmap_area_root, &free_vmap_area_list);
> > + }
> > +
> > + return 0;
> > +}
> > +
> > +/*
> > + * Returns a start address of the newly allocated area, if success.
> > + * Otherwise a vend is returned that indicates failure.
> > + */
> > +static inline unsigned long
> > +__alloc_vmap_area(unsigned long size, unsigned long align,
> > + unsigned long vstart, unsigned long vend, int node)
> > +{
> > + unsigned long nva_start_addr;
> > + struct vmap_area *va;
> > + u8 fit_type;
> > + int ret;
> > +
> > + va = __find_vmap_lowest_match(size, align, vstart);
> > + if (unlikely(!va))
> > + return vend;
> > +
> > + if (va->va_start > vstart)
> > + nva_start_addr = ALIGN(va->va_start, align);
> > + else
> > + nva_start_addr = ALIGN(vstart, align);
> > +
> > + /* Check the "vend" restriction. */
> > + if (nva_start_addr + size > vend)
> > + return vend;
> > +
> > + /* Classify what we have found. */
> > + fit_type = __classify_va_fit_type(va, nva_start_addr, size);
> > + if (unlikely(fit_type == NOTHING_FIT)) {
> > + WARN_ON_ONCE(true);
>
> Nit: WARN_ON_ONCE() has unlikely() built-in and returns the value,
> so it can be something like:
>
> if (WARN_ON_ONCE(fit_type == NOTHING_FIT))
> return vend;
>
> The same comment applies for a couple of other place, where is a check
> for NOTHING_FIT.
>
Agree. Will fix that!
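
Applied to the place quoted above (and similarly to the other NOTHING_FIT
checks), it would then collapse to:

	/* Classify what we have found. */
	fit_type = __classify_va_fit_type(va, nva_start_addr, size);
	if (WARN_ON_ONCE(fit_type == NOTHING_FIT))
		return vend;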

>
> I do not see any issues so far, just some places in the code are a bit
> hard to follow. I'll try to spend more time on the patch in the next
> couple of weeks.
>
Thanks again for your time!

--
Vlad Rezki