[PATCH 08/15] percpu: restructure locking

From: Tejun Heo
Date: Fri Aug 22 2014 - 12:53:48 EST


At first, the percpu allocator required a sleepable context for both
alloc and free paths and used pcpu_alloc_mutex to protect everything.
Later, pcpu_lock was introduced to protect the index data structure so
that the free path can be invoked from atomic contexts. The
conversion only updated what's necessary and left most of the
allocation path under pcpu_alloc_mutex.

Support for atomic allocation is planned for the percpu allocator,
and this patch restructures locking so that the coverage of
pcpu_alloc_mutex is further reduced.

* pcpu_alloc() now grabs pcpu_alloc_mutex only while creating a new
chunk and populating the allocated area. Everything else is now
protected solely by pcpu_lock.

After this change, multiple instances of pcpu_extend_area_map() may
race but the function already implements sufficient synchronization
using pcpu_lock.

This also allows multiple allocators to arrive at new chunk
creation. To avoid creating multiple empty chunks back-to-back, a
new chunk is created iff there is no other empty chunk after
grabbing pcpu_alloc_mutex.

* pcpu_lock is now held while modifying chunk->populated bitmap.
After this, all data structures are protected by pcpu_lock.

Signed-off-by: Tejun Heo <tj@xxxxxxxxxx>
---
mm/percpu-km.c | 2 ++
mm/percpu.c | 75 +++++++++++++++++++++++++++-------------------------------
2 files changed, 37 insertions(+), 40 deletions(-)

diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index 67a971b..e662b49 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -68,7 +68,9 @@ static struct pcpu_chunk *pcpu_create_chunk(void)
chunk->data = pages;
chunk->base_addr = page_address(pages) - pcpu_group_offsets[0];

+ spin_lock_irq(&pcpu_lock);
bitmap_fill(chunk->populated, nr_pages);
+ spin_unlock_irq(&pcpu_lock);

return chunk;
}
diff --git a/mm/percpu.c b/mm/percpu.c
index c8fe482..507afc0 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -152,31 +152,12 @@ static struct pcpu_chunk *pcpu_reserved_chunk;
static int pcpu_reserved_chunk_limit;

/*
- * Synchronization rules.
- *
- * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
- * protects allocation/reclaim paths, chunks, populated bitmap and
- * vmalloc mapping. The latter is a spinlock and protects the index
- * data structures - chunk slots, chunks and area maps in chunks.
- *
- * During allocation, pcpu_alloc_mutex is kept locked all the time and
- * pcpu_lock is grabbed and released as necessary. All actual memory
- * allocations are done using GFP_KERNEL with pcpu_lock released. In
- * general, percpu memory can't be allocated with irq off but
- * irqsave/restore are still used in alloc path so that it can be used
- * from early init path - sched_init() specifically.
- *
- * Free path accesses and alters only the index data structures, so it
- * can be safely called from atomic context. When memory needs to be
- * returned to the system, free path schedules reclaim_work which
- * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be
- * reclaimed, release both locks and frees the chunks. Note that it's
- * necessary to grab both locks to remove a chunk from circulation as
- * allocation path might be referencing the chunk with only
- * pcpu_alloc_mutex locked.
+ * Free path accesses and alters only the index data structures and can be
+ * safely called from atomic context. When memory needs to be returned to
+ * the system, free path schedules reclaim_work.
*/
-static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */
-static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */
+static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */
+static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */

static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */

@@ -709,7 +690,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)
static int warn_limit = 10;
struct pcpu_chunk *chunk;
const char *err;
- int slot, off, new_alloc, cpu;
+ int slot, off, new_alloc, cpu, ret;
int page_start, page_end, rs, re;
unsigned long flags;
void __percpu *ptr;
@@ -729,7 +710,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)
return NULL;
}

- mutex_lock(&pcpu_alloc_mutex);
spin_lock_irqsave(&pcpu_lock, flags);

/* serve reserved allocations from the reserved chunk if available */
@@ -745,7 +725,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)
spin_unlock_irqrestore(&pcpu_lock, flags);
if (pcpu_extend_area_map(chunk, new_alloc) < 0) {
err = "failed to extend area map of reserved chunk";
- goto fail_unlock_mutex;
+ goto fail;
}
spin_lock_irqsave(&pcpu_lock, flags);
}
@@ -771,7 +751,7 @@ restart:
if (pcpu_extend_area_map(chunk,
new_alloc) < 0) {
err = "failed to extend area map";
- goto fail_unlock_mutex;
+ goto fail;
}
spin_lock_irqsave(&pcpu_lock, flags);
/*
@@ -787,37 +767,53 @@ restart:
}
}

- /* hmmm... no space left, create a new chunk */
spin_unlock_irqrestore(&pcpu_lock, flags);

- chunk = pcpu_create_chunk();
- if (!chunk) {
- err = "failed to allocate new chunk";
- goto fail_unlock_mutex;
+ /*
+ * No space left. Create a new chunk. We don't want multiple
+ * tasks to create chunks simultaneously. Serialize and create iff
+ * there's still no empty chunk after grabbing the mutex.
+ */
+ mutex_lock(&pcpu_alloc_mutex);
+
+ if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
+ chunk = pcpu_create_chunk();
+ if (!chunk) {
+ err = "failed to allocate new chunk";
+ goto fail;
+ }
+
+ spin_lock_irqsave(&pcpu_lock, flags);
+ pcpu_chunk_relocate(chunk, -1);
+ } else {
+ spin_lock_irqsave(&pcpu_lock, flags);
}

- spin_lock_irqsave(&pcpu_lock, flags);
- pcpu_chunk_relocate(chunk, -1);
+ mutex_unlock(&pcpu_alloc_mutex);
goto restart;

area_found:
spin_unlock_irqrestore(&pcpu_lock, flags);

/* populate if not all pages are already there */
+ mutex_lock(&pcpu_alloc_mutex);
page_start = PFN_DOWN(off);
page_end = PFN_UP(off + size);

pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
WARN_ON(chunk->immutable);

- if (pcpu_populate_chunk(chunk, rs, re)) {
- spin_lock_irqsave(&pcpu_lock, flags);
+ ret = pcpu_populate_chunk(chunk, rs, re);
+
+ spin_lock_irqsave(&pcpu_lock, flags);
+ if (ret) {
+ mutex_unlock(&pcpu_alloc_mutex);
pcpu_free_area(chunk, off);
err = "failed to populate";
goto fail_unlock;
}
-
bitmap_set(chunk->populated, rs, re - rs);
+ spin_unlock_irqrestore(&pcpu_lock, flags);
}

mutex_unlock(&pcpu_alloc_mutex);
@@ -832,8 +828,7 @@ area_found:

fail_unlock:
spin_unlock_irqrestore(&pcpu_lock, flags);
-fail_unlock_mutex:
- mutex_unlock(&pcpu_alloc_mutex);
+fail:
if (warn_limit) {
pr_warning("PERCPU: allocation failed, size=%zu align=%zu, "
"%s\n", size, align, err);
--
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/