[RFC PATCH 2/3] mm: qmempool - quick queue based memory pool

From: Jesper Dangaard Brouer
Date: Wed Dec 10 2014 - 09:21:43 EST


A quick queue-based memory pool, that functions as a cache in-front
of kmem_cache SLAB/SLUB allocators. Which allows faster than
SLAB/SLUB reuse/caching of fixed size memory elements

The speed gain comes from, the shared storage, using a Lock-Free
queue that supports bulk refilling elements (to a percpu cache)
with a single cmpxchg. Thus, the (lock-prefixed) cmpxchg cost is
amortize over the bulk size.

Qmempool cannot easily replace all kmem_cache usage, because it is
restricted in which contexts is can be used in, as the Lock-Free
queue is not preemption safe. E.g. only supports GFP_ATOMIC allocations
from SLAB.

This version is optimized for usage from softirq context, and cannot
be used from hardirq context. Usage from none-softirq requires usage
of local_bh_{disable,enable}, which have a fairly high cost.

Performance micro benchmarks against SLUB. First test is fast-path
reuse of same element. Second test is allocating 256 element before
freeing elements again, this pattern comes from how NIC ring queue
cleanups often run.

On CPU E5-2695, CONFIG_PREEMPT=y, showing cost of alloc+free:

SLUB - softirq - none-softirq
fastpath-reuse: 19.563 ns - 7.837 ns - 18.536 ns
N(256)-pattern: 45.039 ns - 11.782 ns - 24.186 ns

A significant win for usage from softirq, and a smaller win for
none-softirq which requires taking local_bh_{disable,enable}.

Signed-off-by: Jesper Dangaard Brouer <brouer@xxxxxxxxxx>
---

include/linux/qmempool.h | 205 +++++++++++++++++++++++++++++
mm/Kconfig | 12 ++
mm/Makefile | 1
mm/qmempool.c | 322 ++++++++++++++++++++++++++++++++++++++++++++++
4 files changed, 540 insertions(+), 0 deletions(-)
create mode 100644 include/linux/qmempool.h
create mode 100644 mm/qmempool.c

diff --git a/include/linux/qmempool.h b/include/linux/qmempool.h
new file mode 100644
index 0000000..922ed27
--- /dev/null
+++ b/include/linux/qmempool.h
@@ -0,0 +1,205 @@
+/*
+ * qmempool - a quick queue based mempool
+ *
+ * A quick queue-based memory pool, that functions as a cache in-front
+ * of kmem_cache SLAB/SLUB allocators. Which allows faster than
+ * SLAB/SLUB reuse/caching of fixed size memory elements
+ *
+ * The speed gain comes from, the shared storage, using a Lock-Free
+ * queue that supports bulk refilling elements (to a percpu cache)
+ * with a single cmpxchg. Thus, the lock-prefixed cmpxchg cost is
+ * amortize over the bulk size.
+ *
+ * The Lock-Free queue is based on an array (of pointer to elements).
+ * This make access more cache optimal, as e.g. on 64bit 8 pointers
+ * can be stored per cache-line (which is superior to a linked list
+ * approach). Only storing the pointers to elements, is also
+ * beneficial as we don't touch the elements data.
+ *
+ * Qmempool cannot easily replace all kmem_cache usage, because it is
+ * restricted in which contexts is can be used in, as the Lock-Free
+ * queue is not preemption safe. This version is optimized for usage
+ * from softirq context, and cannot be used from hardirq context.
+ *
+ * Only support GFP_ATOMIC allocations from SLAB.
+ *
+ * Copyright (C) 2014, Red Hat, Inc., Jesper Dangaard Brouer
+ * for licensing details see kernel-base/COPYING
+ */
+
+#ifndef _LINUX_QMEMPOOL_H
+#define _LINUX_QMEMPOOL_H
+
+#include <linux/alf_queue.h>
+#include <linux/prefetch.h>
+#include <linux/hardirq.h>
+
+/* Bulking is an essential part of the performance gains as this
+ * amortize the cost of cmpxchg ops used when accessing sharedq
+ */
+#define QMEMPOOL_BULK 16
+#define QMEMPOOL_REFILL_MULTIPLIER 2
+
+struct qmempool_percpu {
+ struct alf_queue *localq;
+};
+
+struct qmempool {
+ /* The shared queue (sharedq) is a Multi-Producer-Multi-Consumer
+ * queue where access is protected by an atomic cmpxchg operation.
+ * The queue support bulk transfers, which amortize the cost
+ * of the atomic cmpxchg operation.
+ */
+ struct alf_queue *sharedq;
+
+ /* Per CPU local "cache" queues for faster atomic free access.
+ * The local queues (localq) are Single-Producer-Single-Consumer
+ * queues as they are per CPU.
+ */
+ struct qmempool_percpu __percpu *percpu;
+
+ /* Backed by some SLAB kmem_cache */
+ struct kmem_cache *kmem;
+
+ /* Setup */
+ uint32_t prealloc;
+ gfp_t gfp_mask;
+};
+
+extern void qmempool_destroy(struct qmempool *pool);
+extern struct qmempool *qmempool_create(
+ uint32_t localq_sz, uint32_t sharedq_sz, uint32_t prealloc,
+ struct kmem_cache *kmem, gfp_t gfp_mask);
+
+extern void *__qmempool_alloc_from_sharedq(
+ struct qmempool *pool, gfp_t gfp_mask, struct alf_queue *localq);
+extern void __qmempool_free_to_sharedq(void *elem, struct qmempool *pool,
+ struct alf_queue *localq);
+
+/* The percpu variables (SPSC queues) needs preempt protection, and
+ * the shared MPMC queue also needs protection against the same CPU
+ * access the same queue.
+ *
+ * Specialize and optimize the qmempool to run from softirq.
+ * Don't allow qmempool to be used from interrupt context.
+ *
+ * IDEA: When used from softirq, take advantage of the protection
+ * softirq gives. A softirq will never preempt another softirq,
+ * running on the same CPU. The only event that can preempt a softirq
+ * is an interrupt handler (and perhaps we don't need to support
+ * calling qmempool from an interrupt). Another softirq, even the
+ * same one, can run on another CPU however, but these helpers are
+ * only protecting our percpu variables.
+ *
+ * Thus, our percpu variables are safe if current the CPU is the one
+ * serving the softirq (tested via in_serving_softirq()), like:
+ *
+ * if (!in_serving_softirq())
+ * local_bh_disable();
+ *
+ * This makes qmempool very fast, when accesses from softirq, but
+ * slower when accessed outside softirq. The other contexts need to
+ * disable bottom-halves "bh" via local_bh_{disable,enable} (which on
+ * have been measured add cost if 7.5ns on CPU E5-2695).
+ *
+ * MUST not be used from interrupt context, when relying on softirq usage.
+ */
+static inline int __qmempool_preempt_disable(void)
+{
+ int in_serving_softirq = in_serving_softirq();
+
+ if (!in_serving_softirq)
+ local_bh_disable();
+
+ return in_serving_softirq;
+}
+
+static inline void __qmempool_preempt_enable(int in_serving_softirq)
+{
+ if (!in_serving_softirq)
+ local_bh_enable();
+}
+
+/* Elements - alloc and free functions are inlined here for
+ * performance reasons, as the per CPU lockless access should be as
+ * fast as possible.
+ */
+
+/* Main allocation function
+ *
+ * Caller must make sure this is called from a preemptive safe context
+ */
+static inline void * main_qmempool_alloc(struct qmempool *pool, gfp_t gfp_mask)
+{
+ /* NUMA considerations, for now the numa node is not handles,
+ * this could be handled via e.g. numa_mem_id()
+ */
+ void *elem;
+ struct qmempool_percpu *cpu;
+ int num;
+
+ /* 1. attempt get element from local per CPU queue */
+ cpu = this_cpu_ptr(pool->percpu);
+ num = alf_sc_dequeue(cpu->localq, (void **)&elem, 1);
+ if (num == 1) /* Succes: alloc elem by deq from localq cpu cache */
+ return elem;
+
+ /* 2. attempt get element from shared queue. This involves
+ * refilling the localq for next round. Side-effect can be
+ * alloc from SLAB.
+ */
+ elem = __qmempool_alloc_from_sharedq(pool, gfp_mask, cpu->localq);
+ return elem;
+}
+
+static inline void *__qmempool_alloc(struct qmempool *pool, gfp_t gfp_mask)
+{
+ void *elem;
+ int state;
+
+ state = __qmempool_preempt_disable();
+ elem = main_qmempool_alloc(pool, gfp_mask);
+ __qmempool_preempt_enable(state);
+ return elem;
+}
+
+static inline void *__qmempool_alloc_softirq(struct qmempool *pool,
+ gfp_t gfp_mask)
+{
+ return main_qmempool_alloc(pool, gfp_mask);
+}
+
+/* Main free function */
+static inline void __qmempool_free(struct qmempool *pool, void *elem)
+{
+ struct qmempool_percpu *cpu;
+ int num;
+ int state;
+
+ /* NUMA considerations, how do we make sure to avoid caching
+ * elements from a different NUMA node.
+ */
+ state = __qmempool_preempt_disable();
+
+ /* 1. attempt to free/return element to local per CPU queue */
+ cpu = this_cpu_ptr(pool->percpu);
+ num = alf_sp_enqueue(cpu->localq, &elem, 1);
+ if (num == 1) /* success: element free'ed by enqueue to localq */
+ goto done;
+
+ /* 2. localq cannot store more elements, need to return some
+ * from localq to sharedq, to make room. Side-effect can be
+ * free to SLAB.
+ */
+ __qmempool_free_to_sharedq(elem, pool, cpu->localq);
+
+done:
+ __qmempool_preempt_enable(state);
+}
+
+/* API users can choose to use "__" prefixed versions for inlining */
+extern void *qmempool_alloc(struct qmempool *pool, gfp_t gfp_mask);
+extern void *qmempool_alloc_softirq(struct qmempool *pool, gfp_t gfp_mask);
+extern void qmempool_free(struct qmempool *pool, void *elem);
+
+#endif /* _LINUX_QMEMPOOL_H */
diff --git a/mm/Kconfig b/mm/Kconfig
index 1d1ae6b..abaa94c 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -618,3 +618,15 @@ config MAX_STACK_SIZE_MB
changed to a smaller value in which case that is used.

A sane initial value is 80 MB.
+
+config QMEMPOOL
+ bool "Quick queue based mempool (qmempool)"
+ default y
+ select ALF_QUEUE
+ help
+ A mempool designed for faster than SLAB/kmem_cache
+ reuse/caching of fixed size memory elements. Works as a
+ caching layer in-front of existing kmem_cache SLABs. Speed
+ is achieved by _bulk_ refilling percpu local cache, from a
+ Lock-Free queue requiring a single (locked) cmpxchg per bulk
+ transfer, thus amortizing the cost of the cmpxchg.
diff --git a/mm/Makefile b/mm/Makefile
index 8405eb0..49c1e18 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -69,3 +69,4 @@ obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
obj-$(CONFIG_CMA) += cma.o
obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
+obj-$(CONFIG_QMEMPOOL) += qmempool.o
diff --git a/mm/qmempool.c b/mm/qmempool.c
new file mode 100644
index 0000000..d6debcc
--- /dev/null
+++ b/mm/qmempool.c
@@ -0,0 +1,322 @@
+/*
+ * qmempool - a quick queue based mempool
+ *
+ * Copyright (C) 2014, Red Hat, Inc., Jesper Dangaard Brouer
+ * for licensing details see kernel-base/COPYING
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/percpu.h>
+#include <linux/qmempool.h>
+#include <linux/log2.h>
+
+/* Due to hotplug CPU support, we need access to all qmempools
+ * in-order to cleanup elements in localq for the CPU going offline.
+ *
+ * TODO: implement HOTPLUG_CPU
+#ifdef CONFIG_HOTPLUG_CPU
+static LIST_HEAD(qmempool_list);
+static DEFINE_SPINLOCK(qmempool_list_lock);
+#endif
+ */
+
+void qmempool_destroy(struct qmempool *pool)
+{
+ void *elem = NULL;
+ int j;
+
+ if (pool->percpu) {
+ for_each_possible_cpu(j) {
+ struct qmempool_percpu *cpu =
+ per_cpu_ptr(pool->percpu, j);
+
+ while (alf_mc_dequeue(cpu->localq, &elem, 1) == 1)
+ kmem_cache_free(pool->kmem, elem);
+ BUG_ON(!alf_queue_empty(cpu->localq));
+ alf_queue_free(cpu->localq);
+ }
+ free_percpu(pool->percpu);
+ }
+
+ if (pool->sharedq) {
+ while (alf_mc_dequeue(pool->sharedq, &elem, 1) == 1)
+ kmem_cache_free(pool->kmem, elem);
+ BUG_ON(!alf_queue_empty(pool->sharedq));
+ alf_queue_free(pool->sharedq);
+ }
+
+ kfree(pool);
+}
+EXPORT_SYMBOL(qmempool_destroy);
+
+struct qmempool *
+qmempool_create(uint32_t localq_sz, uint32_t sharedq_sz, uint32_t prealloc,
+ struct kmem_cache *kmem, gfp_t gfp_mask)
+{
+ struct qmempool *pool;
+ int i, j, num;
+ void *elem;
+
+ /* Validate constraints, e.g. due to bulking */
+ if (localq_sz < QMEMPOOL_BULK) {
+ pr_err("%s() localq size(%d) too small for bulking\n",
+ __func__, localq_sz);
+ return NULL;
+ }
+ if (sharedq_sz < (QMEMPOOL_BULK * QMEMPOOL_REFILL_MULTIPLIER)) {
+ pr_err("%s() sharedq size(%d) too small for bulk refill\n",
+ __func__, sharedq_sz);
+ return NULL;
+ }
+ if (!is_power_of_2(localq_sz) || !is_power_of_2(sharedq_sz)) {
+ pr_err("%s() queue sizes (%d/%d) must be power-of-2\n",
+ __func__, localq_sz, sharedq_sz);
+ return NULL;
+ }
+ if (prealloc > sharedq_sz) {
+ pr_err("%s() prealloc(%d) req > sharedq size(%d)\n",
+ __func__, prealloc, sharedq_sz);
+ return NULL;
+ }
+ if ((prealloc % QMEMPOOL_BULK) != 0) {
+ pr_warn("%s() prealloc(%d) should be div by BULK size(%d)\n",
+ __func__, prealloc, QMEMPOOL_BULK);
+ }
+ if (!kmem) {
+ pr_err("%s() kmem_cache is a NULL ptr\n", __func__);
+ return NULL;
+ }
+
+ pool = kzalloc(sizeof(*pool), gfp_mask);
+ if (!pool)
+ return NULL;
+ pool->kmem = kmem;
+ pool->gfp_mask = gfp_mask;
+
+ /* MPMC (Multi-Producer-Multi-Consumer) queue */
+ pool->sharedq = alf_queue_alloc(sharedq_sz, gfp_mask);
+ if (IS_ERR_OR_NULL(pool->sharedq)) {
+ pr_err("%s() failed to create shared queue(%d) ERR_PTR:0x%p\n",
+ __func__, sharedq_sz, pool->sharedq);
+ qmempool_destroy(pool);
+ return NULL;
+ }
+
+ pool->prealloc = prealloc;
+ for (i = 0; i < prealloc; i++) {
+ elem = kmem_cache_alloc(pool->kmem, gfp_mask);
+ if (!elem) {
+ pr_err("%s() kmem_cache out of memory?!\n", __func__);
+ qmempool_destroy(pool);
+ return NULL;
+ }
+ /* Could use the SP version given it is not visible yet */
+ num = alf_mp_enqueue(pool->sharedq, &elem, 1);
+ BUG_ON(num <= 0);
+ }
+
+ pool->percpu = alloc_percpu(struct qmempool_percpu);
+ if (pool->percpu == NULL) {
+ pr_err("%s() failed to alloc percpu\n", __func__);
+ qmempool_destroy(pool);
+ return NULL;
+ }
+
+ /* SPSC (Single-Consumer-Single-Producer) queue per CPU */
+ for_each_possible_cpu(j) {
+ struct qmempool_percpu *cpu = per_cpu_ptr(pool->percpu, j);
+
+ cpu->localq = alf_queue_alloc(localq_sz, gfp_mask);
+ if (IS_ERR_OR_NULL(cpu->localq)) {
+ pr_err("%s() failed alloc localq(sz:%d) on cpu:%d\n",
+ __func__, localq_sz, j);
+ qmempool_destroy(pool);
+ return NULL;
+ }
+ }
+
+ return pool;
+}
+EXPORT_SYMBOL(qmempool_create);
+
+/* Element handling
+ */
+
+/* This function is called when sharedq runs-out of elements.
+ * Thus, sharedq needs to be refilled (enq) with elems from slab.
+ *
+ * Caller must assure this is called in an preemptive safe context due
+ * to alf_mp_enqueue() call.
+ */
+void *__qmempool_alloc_from_slab(struct qmempool *pool, gfp_t gfp_mask)
+{
+ void *elems[QMEMPOOL_BULK]; /* on stack variable */
+ void *elem;
+ int num, i, j;
+
+ /* Cannot use SLAB that can sleep if (gfp_mask & __GFP_WAIT),
+ * else preemption disable/enable scheme becomes too complicated
+ */
+ BUG_ON(gfp_mask & __GFP_WAIT);
+
+ elem = kmem_cache_alloc(pool->kmem, gfp_mask);
+ if (elem == NULL) /* slab depleted, no reason to call below allocs */
+ return NULL;
+
+ /* SLAB considerations, we need a kmem_cache interface that
+ * supports allocating a bulk of elements.
+ */
+
+ for (i = 0; i < QMEMPOOL_REFILL_MULTIPLIER; i++) {
+ for (j = 0; j < QMEMPOOL_BULK; j++) {
+ elems[j] = kmem_cache_alloc(pool->kmem, gfp_mask);
+ /* Handle if slab gives us NULL elem */
+ if (elems[j] == NULL) {
+ pr_err("%s() ARGH - slab returned NULL",
+ __func__);
+ num = alf_mp_enqueue(pool->sharedq, elems, j-1);
+ BUG_ON(num == 0); //FIXME handle
+ return elem;
+ }
+ }
+ num = alf_mp_enqueue(pool->sharedq, elems, QMEMPOOL_BULK);
+ /* FIXME: There is a theoretical chance that multiple
+ * CPU enter here, refilling sharedq at the same time,
+ * thus we must handle "full" situation, for now die
+ * hard so someone will need to fix this.
+ */
+ BUG_ON(num == 0); /* sharedq should have room */
+ }
+
+ /* What about refilling localq here? (else it will happen on
+ * next cycle, and will cost an extra cmpxchg).
+ */
+ return elem;
+}
+
+/* This function is called when the localq runs out-of elements.
+ * Thus, localq is refilled (enq) with elements (deq) from sharedq.
+ *
+ * Caller must assure this is called in an preemptive safe context due
+ * to alf_mp_dequeue() call.
+ */
+void *__qmempool_alloc_from_sharedq(struct qmempool *pool, gfp_t gfp_mask,
+ struct alf_queue *localq)
+{
+ void *elems[QMEMPOOL_BULK]; /* on stack variable */
+ void *elem;
+ int num;
+
+ /* Costs atomic "cmpxchg", but amortize cost by bulk dequeue */
+ num = alf_mc_dequeue(pool->sharedq, elems, QMEMPOOL_BULK);
+ if (likely(num > 0)) {
+ /* Consider prefetching data part of elements here, it
+ * should be an optimal place to hide memory prefetching.
+ * Especially given the localq is known to be an empty FIFO
+ * which guarantees the order objs are accessed in.
+ */
+ elem = elems[0]; /* extract one element */
+ if (num > 1) {
+ num = alf_sp_enqueue(localq, &elems[1], num-1);
+ /* Refill localq, should be empty, must succeed */
+ BUG_ON(num == 0);
+ }
+ return elem;
+ }
+ /* Use slab if sharedq runs out of elements */
+ elem = __qmempool_alloc_from_slab(pool, gfp_mask);
+ return elem;
+}
+EXPORT_SYMBOL(__qmempool_alloc_from_sharedq);
+
+/* Called when sharedq is full. Thus also make room in sharedq,
+ * besides also freeing the "elems" given.
+ */
+bool __qmempool_free_to_slab(struct qmempool *pool, void **elems, int n)
+{
+ int num, i, j;
+ /* SLAB considerations, we could use kmem_cache interface that
+ * supports returning a bulk of elements.
+ */
+
+ /* free these elements for real */
+ for (i = 0; i < n; i++)
+ kmem_cache_free(pool->kmem, elems[i]);
+
+ /* Make room in sharedq for next round */
+ for (i = 0; i < QMEMPOOL_REFILL_MULTIPLIER; i++) {
+ num = alf_mc_dequeue(pool->sharedq, elems, QMEMPOOL_BULK);
+ for (j = 0; j < num; j++)
+ kmem_cache_free(pool->kmem, elems[j]);
+ }
+ return true;
+}
+
+/* This function is called when the localq is full. Thus, elements
+ * from localq needs to be (dequeued) and returned (enqueued) to
+ * sharedq (or if shared is full, need to be free'ed to slab)
+ *
+ * MUST be called from a preemptive safe context.
+ */
+void __qmempool_free_to_sharedq(void *elem, struct qmempool *pool,
+ struct alf_queue *localq)
+{
+ void *elems[QMEMPOOL_BULK]; /* on stack variable */
+ int num_enq, num_deq;
+
+ elems[0] = elem;
+ /* Make room in localq */
+ num_deq = alf_sc_dequeue(localq, &elems[1], QMEMPOOL_BULK-1);
+ if (unlikely(num_deq == 0))
+ goto failed;
+ num_deq++; /* count first 'elem' */
+
+ /* Successful dequeued 'num_deq' elements from localq, "free"
+ * these elems by enqueuing to sharedq
+ */
+ num_enq = alf_mp_enqueue(pool->sharedq, elems, num_deq);
+ if (likely(num_enq == num_deq)) /* Success enqueued to sharedq */
+ return;
+
+ /* If sharedq is full (num_enq == 0) dequeue elements will be
+ * returned directly to the SLAB allocator.
+ *
+ * Note: This usage of alf_queue API depend on enqueue is
+ * fixed, by only enqueueing if all elements could fit, this
+ * is an API that might change.
+ */
+
+ __qmempool_free_to_slab(pool, elems, num_deq);
+ return;
+failed:
+ /* dequeing from a full localq should always be possible */
+ BUG();
+}
+EXPORT_SYMBOL(__qmempool_free_to_sharedq);
+
+/* API users can choose to use "__" prefixed versions for inlining */
+void *qmempool_alloc(struct qmempool *pool, gfp_t gfp_mask)
+{
+ return __qmempool_alloc(pool, gfp_mask);
+}
+EXPORT_SYMBOL(qmempool_alloc);
+
+void *qmempool_alloc_softirq(struct qmempool *pool, gfp_t gfp_mask)
+{
+ return __qmempool_alloc_softirq(pool, gfp_mask);
+}
+EXPORT_SYMBOL(qmempool_alloc_softirq);
+
+void qmempool_free(struct qmempool *pool, void *elem)
+{
+ return __qmempool_free(pool, elem);
+}
+EXPORT_SYMBOL(qmempool_free);
+
+MODULE_DESCRIPTION("Quick queue based mempool (qmempool)");
+MODULE_AUTHOR("Jesper Dangaard Brouer <netoptimizer@xxxxxxxxxx>");
+MODULE_LICENSE("GPL");

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/