[RFC v2] percpu: Add a separate function to merge free areas

From: Leonard Crestez
Date: Tue Dec 02 2014 - 17:50:10 EST


Hello,

It seems that free_percpu performance is very bad when working with small
objects. The easiest way to reproduce this is to allocate and then free a large
number of percpu int counters in order. Small objects (reference counters and
pointers) are common users of alloc_percpu and I think this should be fast.
This particular issue can be encountered with very large number of net_device
structs.

The problem seems to be that pcpu_chunk keeps an array of all alocated areas. At
free time pcpu_free_area will delete items from that array linearly, using
memmove. This has worst-case quadratic complexity in the number of areas per
chunk. This gets really bad if the areas are small and are allocated and freed in order.

One way to fix this is to merge free areas in a separate function and handle
multiple frees at once. There is a patch below which does this, but
I'm not sure it's correct.

Instead of merging free areas on every free we only do it when 2/3 of
the slots in the map are used. This should hopefully amortize to linear
complexity instead of quadratic.

I've posted an earlier version of this to lkml earlier but got no response. That
was probably because I only posted to lkml. I now sent to linux-mm and the people
reported by "get_maintainer.pl". Here's a link to the older post:

https://www.mail-archive.com/linux-kernel@xxxxxxxxxxxxxxx/msg777188.html

That version of the patch is incorrect. Never updating contig_hint on free means
that once a chunk's contig_hint reaches 0 (when completely allocated) that chunk
will never be considered for allocation again. It also doesn't amortize the frees.

Entirely different solutions could be considered. For example it might make sense to
make chunks smaller (they are about ~40K on the system I care about). Or maybe even
write an entirely custom allocator for very small fixed-size percpu objects.

Signed-off-by: Crestez Dan Leonard <lcrestez@xxxxxxxxxxx>
---
mm/percpu.c | 100 +++++++++++++++++++++++++++++++++++++++++++++++-------------
1 file changed, 78 insertions(+), 22 deletions(-)

diff --git a/mm/percpu.c b/mm/percpu.c
index 014bab6..9d85198 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -109,6 +109,7 @@ struct pcpu_chunk {

int map_used; /* # of map entries used before the sentry */
int map_alloc; /* # of map entries allocated */
+ int map_free_count; /* # of map entries freed but not merged */
int *map; /* allocation map */
struct work_struct map_extend_work;/* async ->map[] extension */

@@ -375,6 +376,69 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
}

/**
+ * percpu_merge_free_spaces - merge spaces in a chunk
+ * @chunk: chunk of interest
+ *
+ * This function should merge a continous region of free
+ * spaces into a single one.
+ *
+ * CONTEXT:
+ * pcpu_lock.
+ */
+static void pcpu_merge_free_spaces(struct pcpu_chunk *chunk)
+{
+ int start;
+ int i, j;
+
+ chunk->map_free_count = 0;
+
+ /* We copy from map[i] into map[j] while merging free spaces. */
+ i = j = chunk->first_free;
+ /* In case first_free points to something busy */
+ if (chunk->map[i] & 1)
+ goto eat_busy;
+
+eat_free:
+ /* Look for busy space
+ * Last chunk is always busy, no need to check map_used
+ */
+ for (start = i; !(chunk->map[i] & 1); ++i);
+
+ /* Collapse */
+ if (j != start)
+ chunk->map[j] = chunk->map[start];
+ ++chunk->map_free_count;
+ ++j;
+
+ chunk->contig_hint = max(chunk->contig_hint,
+ (chunk->map[i] & ~1) - chunk->map[start]);
+
+eat_busy:
+ /* Look for free space */
+ for (start = i; i <= chunk->map_used && (chunk->map[i] & 1); ++i);
+
+ /* Move stuff if appropriate */
+ if (j != start)
+ memmove(chunk->map + j, chunk->map + start, (i - start) * sizeof(*chunk->map));
+ j += i - start;
+
+ if (i > chunk->map_used)
+ goto end;
+ else
+ goto eat_free;
+
+end:
+ /* Done */
+ chunk->map_used = j - 1;
+}
+
+static inline void pcpu_check_merge_free_spaces(struct pcpu_chunk *chunk)
+{
+ if (chunk->map_free_count * 3 >= chunk->map_used * 2)
+ pcpu_merge_free_spaces(chunk);
+}
+
+/**
* pcpu_need_to_extend - determine whether chunk area map needs to be extended
* @chunk: chunk of interest
* @is_atomic: the allocation context
@@ -623,10 +687,12 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align,
*++p = off += head;
++i;
max_contig = max(head, max_contig);
+ chunk->map_free_count++;
}
if (tail) {
p[1] = off + size;
max_contig = max(tail, max_contig);
+ chunk->map_free_count++;
}
}

@@ -641,6 +707,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align,
max_contig);

chunk->free_size -= size;
+ chunk->map_free_count--;
*p |= 1;

*occ_pages_p = pcpu_count_occupied_pages(chunk, i);
@@ -674,8 +741,7 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme,
int oslot = pcpu_chunk_slot(chunk);
int off = 0;
unsigned i, j;
- int to_free = 0;
- int *p;
+ int this_size;

freeme |= 1; /* we are searching for <given offset, in use> pair */

@@ -696,28 +762,15 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme,
if (i < chunk->first_free)
chunk->first_free = i;

- p = chunk->map + i;
- *p = off &= ~1;
- chunk->free_size += (p[1] & ~1) - off;
-
+ /* Mark this area as free */
+ chunk->map[i] &= ~1;
+ this_size = (chunk->map[i + 1] & ~1) - chunk->map[i];
+ chunk->free_size += this_size;
*occ_pages_p = pcpu_count_occupied_pages(chunk, i);
+ chunk->contig_hint = max(chunk->contig_hint, this_size);
+ chunk->map_free_count++;

- /* merge with next? */
- if (!(p[1] & 1))
- to_free++;
- /* merge with previous? */
- if (i > 0 && !(p[-1] & 1)) {
- to_free++;
- i--;
- p--;
- }
- if (to_free) {
- chunk->map_used -= to_free;
- memmove(p + 1, p + 1 + to_free,
- (chunk->map_used - i) * sizeof(chunk->map[0]));
- }
-
- chunk->contig_hint = max(chunk->map[i + 1] - chunk->map[i] - 1, chunk->contig_hint);
+ pcpu_check_merge_free_spaces(chunk);
pcpu_chunk_relocate(chunk, oslot);
}

@@ -740,6 +793,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
chunk->map[0] = 0;
chunk->map[1] = pcpu_unit_size | 1;
chunk->map_used = 1;
+ chunk->map_free_count = 1;

INIT_LIST_HEAD(&chunk->list);
INIT_WORK(&chunk->map_extend_work, pcpu_map_extend_workfn);
@@ -1669,6 +1723,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
schunk->map[0] = 1;
schunk->map[1] = ai->static_size;
schunk->map_used = 1;
+ schunk->map_free_count = 1;
if (schunk->free_size)
schunk->map[++schunk->map_used] = 1 | (ai->static_size + schunk->free_size);
else
@@ -1691,6 +1746,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
dchunk->map[1] = pcpu_reserved_chunk_limit;
dchunk->map[2] = (pcpu_reserved_chunk_limit + dchunk->free_size) | 1;
dchunk->map_used = 2;
+ dchunk->map_free_count = 1;
}

/* link the first chunk in */
--
2.1.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/