[RFC] mm/vmalloc: keep track of free blocks for allocation

From: Uladzislau Rezki (Sony)
Date: Fri Jun 15 2018 - 09:02:05 EST


Hello,

please RFC.

Initial discussion was here: https://patchwork.kernel.org/patch/10244733/

Currently an allocation of the new VA area is done over
busy list iteration until a suitable hole is found between
two busy areas. Therefore each new allocation causes the
list being grown. Due to long list and different permissive
parameters an allocation can take a long time on embedded
devices(milliseconds).

This patch organizes the vmalloc memory layout into free
areas of the VMALLOC_START-VMALLOC_END range. It uses a
red-black tree that keeps blocks sorted by their offsets
in pair with linked list keeping the free space in order
of increasing addresses.

Allocation: to allocate a new block a search is done over
free list areas until a suitable block is large enough to
encompass the requested size. If the block is bigger than
requested size - it is split.

De-allocation: red-black tree allows efficiently find a
spot in the tree whereas a linked list allows fast merge
of de-allocated memory chunks with existing free blocks
creating large coalesced areas.

model name: QEMU Virtual CPU version 2.5+

test_1:
<measure this loop time>
for (n = 0; n < 1000000; n++) {
void *ptr_1 = vmalloc(3 * PAGE_SIZE);
*((__u8 *)ptr_1) = 0; /* Pretend we used the mem */
vfree(ptr_1);
}
<measure this loop time>

938007(us) vs 939222(us) +0.129%
932760(us) vs 932565(us) -0.020%
929691(us) vs 935795(us) +0.652%
932767(us) vs 932683(us) -0.009%
937520(us) vs 935457(us) -0.220%

test_2:
for (n = 0; n < 15000; n++)
ptr[n] = vmalloc(1 * PAGE_SIZE);

<measure this loop time>
for (n = 0; n < 1000000; n++) {
void *ptr_1 = vmalloc(100 * PAGE_SIZE);
void *ptr_2 = vmalloc(1 * PAGE_SIZE);
*((__u8 *)ptr_1) = 0; /* Pretend we used the mem */
*((__u8 *)ptr_2) = 1; /* Pretend we used the mem */

vfree(ptr_1);
vfree(ptr_2);
}
<measure this loop time>

33590880(us) vs 11027121(us) -67.172%
34503307(us) vs 11696023(us) -66.101%
44198667(us) vs 11849005(us) -73.191%
19377377(us) vs 12026349(us) -37.936%
29511186(us) vs 11757217(us) -60.160%

Signed-off-by: Uladzislau Rezki (Sony) <urezki@xxxxxxxxx>
---
mm/vmalloc.c | 420 +++++++++++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 393 insertions(+), 27 deletions(-)

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ebff729cc956..2ab7ec93b199 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -332,6 +332,29 @@ LIST_HEAD(vmap_area_list);
static LLIST_HEAD(vmap_purge_list);
static struct rb_root vmap_area_root = RB_ROOT;

+/*
+ * This linked list is used in pair with free_vmap_area_root.
+ * It makes it possible of fast accessing to next/prev nodes
+ * to perform coalescing.
+ */
+static LIST_HEAD(free_vmap_area_list);
+
+/*
+ * This red-black tree is used for storing address-sorted
+ * vmap areas during free operation. Sorting is done using
+ * va_start address. We make use of it to merge a VA with
+ * its prev/next neighbors.
+ */
+static struct rb_root free_vmap_area_root = RB_ROOT;
+
+/*
+ * For vmalloc specific area allocation.
+ */
+static struct vmap_area *last_free_va_chunk;
+static unsigned long last_alloc_vstart;
+static unsigned long last_alloc_align;
+static unsigned long free_va_max_size;
+
/* The vmap cache globals are protected by vmap_area_lock */
static struct rb_node *free_vmap_cache;
static unsigned long cached_hole_size;
@@ -359,27 +382,53 @@ static struct vmap_area *__find_vmap_area(unsigned long addr)
return NULL;
}

-static void __insert_vmap_area(struct vmap_area *va)
+static inline void __find_va_slot(struct vmap_area *va,
+ struct rb_root *root, struct rb_node **parent, struct rb_node ***link)
{
- struct rb_node **p = &vmap_area_root.rb_node;
- struct rb_node *parent = NULL;
- struct rb_node *tmp;
+ *link = &root->rb_node;
+ *parent = NULL;

- while (*p) {
+ while (**link) {
struct vmap_area *tmp_va;

- parent = *p;
- tmp_va = rb_entry(parent, struct vmap_area, rb_node);
+ *parent = **link;
+ tmp_va = rb_entry(*parent, struct vmap_area, rb_node);
if (va->va_start < tmp_va->va_end)
- p = &(*p)->rb_left;
+ *link = &(**link)->rb_left;
else if (va->va_end > tmp_va->va_start)
- p = &(*p)->rb_right;
+ *link = &(**link)->rb_right;
else
BUG();
}
+}
+
+static inline void __find_va_siblings(struct rb_node *p_rb_node,
+ struct rb_node **p_rb_link, struct list_head **next, struct list_head **prev)
+{
+ struct list_head *p_list_head;
+
+ if (likely(p_rb_node)) {
+ p_list_head = &rb_entry(p_rb_node, struct vmap_area, rb_node)->list;
+ if (&p_rb_node->rb_right == p_rb_link) {
+ *next = p_list_head->next;
+ *prev = p_list_head;
+ } else {
+ *prev = p_list_head->prev;
+ *next = p_list_head;
+ }
+ } else {
+ /* Suppose it may ever happen. */
+ *next = *prev = &free_vmap_area_list;
+ }
+}

- rb_link_node(&va->rb_node, parent, p);
- rb_insert_color(&va->rb_node, &vmap_area_root);
+static inline void __link_va(struct vmap_area *va, struct rb_root *root,
+ struct rb_node *parent, struct rb_node **p_link, struct list_head *head)
+{
+ struct rb_node *tmp;
+
+ rb_link_node(&va->rb_node, parent, p_link);
+ rb_insert_color(&va->rb_node, root);

/* address-sort this list */
tmp = rb_prev(&va->rb_node);
@@ -388,13 +437,239 @@ static void __insert_vmap_area(struct vmap_area *va)
prev = rb_entry(tmp, struct vmap_area, rb_node);
list_add_rcu(&va->list, &prev->list);
} else
- list_add_rcu(&va->list, &vmap_area_list);
+ list_add_rcu(&va->list, head);
+}
+
+static void __insert_vmap_area(struct vmap_area *va,
+ struct rb_root *root, struct list_head *head)
+{
+ struct rb_node **p_link;
+ struct rb_node *parent;
+
+ __find_va_slot(va, root, &parent, &p_link);
+ __link_va(va, root, parent, p_link, head);
}

static void purge_vmap_area_lazy(void);

static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);

+static inline unsigned long
+__va_size(struct vmap_area *va)
+{
+ return va->va_end - va->va_start;
+}
+
+static inline void
+__remove_free_va_area(struct vmap_area *va)
+{
+ /*
+ * Remove VA from the address-sorted tree/list.
+ * Do check if its rb_node is empty or not, since
+ * we use this function as common interface to
+ * destroy a vmap_area.
+ */
+ if (!RB_EMPTY_NODE(&va->rb_node)) {
+ rb_erase(&va->rb_node, &free_vmap_area_root);
+ list_del_rcu(&va->list);
+ }
+
+ /*
+ * Lazy free.
+ */
+ kfree_rcu(va, rcu_head);
+}
+
+/*
+ * Merge de-allocated chunk of VA memory with previous
+ * and next free blocks. Either a pointer to the new
+ * merged area is returned if coalesce is done or VA
+ * area if inserting is done.
+ */
+static inline struct vmap_area *
+__merge_add_free_va_area(struct vmap_area *va,
+ struct rb_root *root, struct list_head *head)
+{
+ struct vmap_area *sibling;
+ struct list_head *next, *prev;
+ struct rb_node **p_link;
+ struct rb_node *parent;
+ bool merged = false;
+
+ /*
+ * Find a place in the tree where VA potentially will be
+ * inserted, unless it is merged with its sibling/siblings.
+ */
+ __find_va_slot(va, root, &parent, &p_link);
+
+ /*
+ * Get next/prev nodes of VA to check if merging can be done.
+ */
+ __find_va_siblings(parent, p_link, &next, &prev);
+
+ /*
+ * start end
+ * | |
+ * |<------VA------>|<-----Next----->|
+ * | |
+ * start end
+ */
+ if (next != head) {
+ sibling = list_entry(next, struct vmap_area, list);
+ if (sibling->va_start == va->va_end) {
+ sibling->va_start = va->va_start;
+ __remove_free_va_area(va);
+
+ /* Point to the new merged area. */
+ va = sibling;
+ merged = true;
+ }
+ }
+
+ /*
+ * start end
+ * | |
+ * |<-----Prev----->|<------VA------>|
+ * | |
+ * start end
+ */
+ if (prev != head) {
+ sibling = list_entry(prev, struct vmap_area, list);
+ if (sibling->va_end == va->va_start) {
+ sibling->va_end = va->va_end;
+ __remove_free_va_area(va);
+
+ /* Point to the new merged area. */
+ va = sibling;
+ merged = true;
+ }
+ }
+
+ if (!merged)
+ __link_va(va, root, parent, p_link, head);
+
+ return va;
+}
+
+static inline unsigned long
+alloc_vmalloc_area(unsigned long size, unsigned long align,
+ unsigned long vstart, unsigned long vend,
+ int node, gfp_t gfp_mask)
+{
+ struct vmap_area *b_fit = NULL; /* best fit */
+ struct vmap_area *le_fit = NULL; /* left-edge fit */
+ struct vmap_area *re_fit = NULL; /* right-edge fit */
+ struct vmap_area *ne_fit = NULL; /* no edge fit */
+ struct vmap_area *va = last_free_va_chunk;
+ unsigned long nva_start_addr;
+
+ if (!last_free_va_chunk || size <= free_va_max_size ||
+ vstart < last_alloc_vstart || align < last_alloc_align) {
+ va = list_first_entry(&free_vmap_area_list, struct vmap_area, list);
+ free_va_max_size = 0;
+ last_free_va_chunk = NULL;
+ }
+
+ nva_start_addr = ALIGN(vstart, align);
+ list_for_each_entry_from(va, &free_vmap_area_list, list) {
+ if (va->va_start > vstart)
+ nva_start_addr = ALIGN(va->va_start, align);
+
+ /* VA does not fit to requested parameters. */
+ if (nva_start_addr + size > va->va_end) {
+ free_va_max_size = max(free_va_max_size, __va_size(va));
+ continue;
+ }
+
+ /* Nothing has been found, give up. */
+ if (nva_start_addr + size > vend)
+ break;
+
+ /* Classify what we have found. */
+ if (va->va_start == nva_start_addr) {
+ if (va->va_end == nva_start_addr + size)
+ b_fit = va;
+ le_fit = va;
+ } else if (va->va_end == nva_start_addr + size) {
+ re_fit = va;
+ } else {
+ ne_fit = va;
+ }
+
+ last_free_va_chunk = va;
+ last_alloc_vstart = vstart;
+ last_alloc_align = align;
+ break;
+ }
+
+ if (b_fit) {
+ /*
+ * No need to split VA, it fully fits.
+ *
+ * | |
+ * V NVA V
+ * |---------------|
+ */
+ if (b_fit->list.prev != &free_vmap_area_list)
+ last_free_va_chunk = list_prev_entry(b_fit, list);
+ else
+ last_free_va_chunk = NULL;
+
+ __remove_free_va_area(b_fit);
+ } else if (le_fit) {
+ /*
+ * Split left edge fit VA.
+ *
+ * | |
+ * V NVA V
+ * |-------|-------|
+ */
+ le_fit->va_start += size;
+ } else if (re_fit) {
+ /*
+ * Split right edge fit VA.
+ *
+ * | |
+ * V NVA V
+ * |-------|-------|
+ */
+ re_fit->va_end = nva_start_addr;
+ } else if (ne_fit) {
+ /*
+ * Split no edge fit VA.
+ *
+ * | |
+ * V NVA V
+ * |---|-------|---|
+ */
+ va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
+ if (unlikely(!va))
+ return VMALLOC_END;
+
+ /*
+ * Build right area of VA.
+ */
+ va->va_start = nva_start_addr + size;
+ va->va_end = ne_fit->va_end;
+
+ /*
+ * Build left area of VA.
+ */
+ ne_fit->va_end = nva_start_addr;
+
+ /*
+ * Add newly built right area to the address sorted list.
+ */
+ __insert_vmap_area(va,
+ &free_vmap_area_root, &free_vmap_area_list);
+ } else {
+ /* Not found. */
+ nva_start_addr = VMALLOC_END;
+ }
+
+ return nva_start_addr;
+}
+
/*
* Allocate a region of KVA of the specified size and alignment, within the
* vstart and vend.
@@ -409,6 +684,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
unsigned long addr;
int purged = 0;
struct vmap_area *first;
+ bool is_vmalloc_allocation;

BUG_ON(!size);
BUG_ON(offset_in_page(size));
@@ -426,9 +702,22 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
* to avoid false negatives.
*/
kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
+ is_vmalloc_allocation = is_vmalloc_addr((void *)vstart);

retry:
spin_lock(&vmap_area_lock);
+ if (is_vmalloc_allocation) {
+ addr = alloc_vmalloc_area(size, align,
+ vstart, vend, node, gfp_mask);
+
+ /*
+ * If an allocation fails, the VMALLOC_END address is
+ * returned. Therefore, an overflow path will be triggered
+ * below.
+ */
+ goto found;
+ }
+
/*
* Invalidate cache if we have more permissive parameters.
* cached_hole_size notes the largest hole noticed _below_
@@ -504,8 +793,11 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
va->va_start = addr;
va->va_end = addr + size;
va->flags = 0;
- __insert_vmap_area(va);
- free_vmap_cache = &va->rb_node;
+ __insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+
+ if (!is_vmalloc_allocation)
+ free_vmap_cache = &va->rb_node;
+
spin_unlock(&vmap_area_lock);

BUG_ON(!IS_ALIGNED(va->va_start, align));
@@ -552,9 +844,14 @@ EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);

static void __free_vmap_area(struct vmap_area *va)
{
+ unsigned long last_free_va_start = 0;
+ bool is_vmalloc_area;
+
BUG_ON(RB_EMPTY_NODE(&va->rb_node));
+ is_vmalloc_area = (va->va_end > VMALLOC_START &&
+ va->va_end <= VMALLOC_END);

- if (free_vmap_cache) {
+ if (!is_vmalloc_area && free_vmap_cache) {
if (va->va_end < cached_vstart) {
free_vmap_cache = NULL;
} else {
@@ -573,16 +870,39 @@ static void __free_vmap_area(struct vmap_area *va)
RB_CLEAR_NODE(&va->rb_node);
list_del_rcu(&va->list);

- /*
- * Track the highest possible candidate for pcpu area
- * allocation. Areas outside of vmalloc area can be returned
- * here too, consider only end addresses which fall inside
- * vmalloc area proper.
- */
- if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
+ if (is_vmalloc_area) {
+ /*
+ * Track the highest possible candidate for pcpu area
+ * allocation. Areas outside of vmalloc area can be returned
+ * here too, consider only end addresses which fall inside
+ * vmalloc area proper.
+ */
vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);

- kfree_rcu(va, rcu_head);
+ if (last_free_va_chunk)
+ last_free_va_start = last_free_va_chunk->va_start;
+
+ /*
+ * Merge VA with its neighbors, otherwise add it.
+ */
+ va = __merge_add_free_va_area(va,
+ &free_vmap_area_root, &free_vmap_area_list);
+
+ /*
+ * Update a search criteria if merging/inserting is
+ * done before last_free_va_chunk va_start address.
+ */
+ if (last_free_va_start) {
+ if (va->va_start <= last_free_va_start) {
+ if (va->list.prev != &free_vmap_area_list)
+ last_free_va_chunk = list_prev_entry(va, list);
+ else
+ last_free_va_chunk = NULL;
+ }
+ }
+ } else {
+ kfree_rcu(va, rcu_head);
+ }
}

/*
@@ -1253,7 +1573,7 @@ void __init vm_area_register_early(struct vm_struct *vm, size_t align)

void __init vmalloc_init(void)
{
- struct vmap_area *va;
+ struct vmap_area *va, *prev_va;
struct vm_struct *tmp;
int i;

@@ -1269,16 +1589,62 @@ void __init vmalloc_init(void)
INIT_WORK(&p->wq, free_work);
}

+ /*
+ * Build free areas.
+ */
+ va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
+ va->va_start = (unsigned long) VMALLOC_START;
+
+ if (!vmlist)
+ va->va_end = (unsigned long) VMALLOC_END;
+ else
+ va->va_end = (unsigned long) vmlist->addr;
+
+ __insert_vmap_area(va,
+ &free_vmap_area_root, &free_vmap_area_list);
+
+ if (!vmlist)
+ goto build_free_area_done;
+
/* Import existing vmlist entries. */
- for (tmp = vmlist; tmp; tmp = tmp->next) {
+ for (tmp = vmlist, prev_va = NULL; tmp; tmp = tmp->next) {
+ struct vmap_area *free_area;
+
va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
va->flags = VM_VM_AREA;
va->va_start = (unsigned long)tmp->addr;
va->va_end = va->va_start + tmp->size;
va->vm = tmp;
- __insert_vmap_area(va);
+ __insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+
+ /*
+ * Check if there is a padding between previous/current.
+ */
+ if (prev_va && (va->va_start - prev_va->va_end) > 0) {
+ free_area = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
+ free_area->va_start = prev_va->va_end;
+ free_area->va_end = va->va_start;
+
+ __insert_vmap_area(free_area,
+ &free_vmap_area_root, &free_vmap_area_list);
+ }
+
+ /*
+ * Handle last case building the remaining space.
+ */
+ if (!tmp->next) {
+ free_area = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
+ free_area->va_start = va->va_end;
+ free_area->va_end = (unsigned long) VMALLOC_END;
+
+ __insert_vmap_area(free_area,
+ &free_vmap_area_root, &free_vmap_area_list);
+ }
+
+ prev_va = va;
}

+build_free_area_done:
vmap_area_pcpu_hole = VMALLOC_END;

vmap_initialized = true;
@@ -2604,7 +2970,7 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,

va->va_start = base + offsets[area];
va->va_end = va->va_start + sizes[area];
- __insert_vmap_area(va);
+ __insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
}

vmap_area_pcpu_hole = base + offsets[last_area];
--
2.11.0