Re: [RT LATENCY] 249 microsecond latency caused by slub's unfreeze_partials() code.
From: JoonSoo Kim
Date: Mon Jun 03 2013 - 11:29:08 EST
Hello, Christoph.
2013/5/29 Christoph Lameter <cl@xxxxxxxxx>:
> I just ran some quick tests and the following seems to work.
>
> Critical portions that need additional review (Joonsoo?) are the
> modifications to get_partial_node() and __slab_free().
IMO, your code works fine.
But this modification adds a lot of "#ifdef"s and makes the code less clean.
How about *not* removing struct page *partial from struct kmem_cache_cpu?
This adds some memory overhead and makes the code slightly bigger
for !CONFIG_SLUB_CPU_PARTIAL, but it would keep the code clean
and easier to maintain.
I also re-read Steven's initial problem report on the RT kernel and
noticed that unfreeze_partials() takes and releases the node list_lock
several times. This means that the pages on the cpu partial list do not
all come from the same node. Why do we add another node's slab to this
cpu partial list? That is not good for the general case either. How
about considering node affinity in __slab_free()?
IMHO, if we implement this, Steven's problem can be solved.
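Roughly, I am thinking of something like the following in __slab_free()
(only a sketch; using numa_mem_id() as the "local node" test is my
assumption, and the exact condition and stat accounting would need more
thought):

	/*
	 * Only freeze the slab and feed it to the per cpu partial list
	 * when it belongs to the local node. A remote slab goes back
	 * onto its own node partial list instead, so a later
	 * unfreeze_partials() does not have to bounce between the
	 * list_locks of several nodes.
	 */
	if (!kmem_cache_debug(s) && !prior &&
			page_to_nid(page) == numa_mem_id())
		new.frozen = 1;
	else {
		/* Needs to be taken off a list */
		n = get_node(s, page_to_nid(page));
		spin_lock_irqsave(&n->list_lock, flags);
	}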
Thanks.
> Subject: slub: Make cpu partial slab support configurable V2
>
> cpu partial support can introduce a level of indeterminism that is not wanted
> in certain contexts (like a realtime kernel). Make it configurable.
>
> Signed-off-by: Christoph Lameter <cl@xxxxxxxxx>
>
> Index: linux/include/linux/slub_def.h
> ===================================================================
> --- linux.orig/include/linux/slub_def.h 2013-05-20 15:21:26.925704505 -0500
> +++ linux/include/linux/slub_def.h 2013-05-20 15:27:40.827613445 -0500
> @@ -47,12 +47,24 @@ struct kmem_cache_cpu {
> void **freelist; /* Pointer to next available object */
> unsigned long tid; /* Globally unique transaction id */
> struct page *page; /* The slab from which we are allocating */
> +#ifdef CONFIG_SLUB_CPU_PARTIAL
> struct page *partial; /* Partially allocated frozen slabs */
> +#endif
> #ifdef CONFIG_SLUB_STATS
> unsigned stat[NR_SLUB_STAT_ITEMS];
> #endif
> };
>
> +static inline struct page *kmem_cache_cpu_partial_pages
> + (struct kmem_cache_cpu *c)
> +{
> +#ifdef CONFIG_SLUB_CPU_PARTIAL
> + return c->partial;
> +#else
> + return NULL;
> +#endif
> +}
> +
> /*
> * Word size structure that can be atomically updated or read and that
> * contains both the order and the number of objects that a slab of the
> @@ -73,7 +85,9 @@ struct kmem_cache {
> int size; /* The size of an object including meta data */
> int object_size; /* The size of an object without meta data */
> int offset; /* Free pointer offset. */
> +#ifdef CONFIG_SLUB_CPU_PARTIAL
> int cpu_partial; /* Number of per cpu partial objects to keep around */
> +#endif
> struct kmem_cache_order_objects oo;
>
> /* Allocation and freeing of slabs */
> @@ -104,6 +118,15 @@ struct kmem_cache {
> struct kmem_cache_node *node[MAX_NUMNODES];
> };
>
> +static inline int kmem_cache_cpu_partial(struct kmem_cache *s)
> +{
> +#ifdef CONFIG_SLUB_CPU_PARTIAL
> + return s->cpu_partial;
> +#else
> + return 0;
> +#endif
> +}
> +
> void *kmem_cache_alloc(struct kmem_cache *, gfp_t);
> void *__kmalloc(size_t size, gfp_t flags);
>
> Index: linux/mm/slub.c
> ===================================================================
> --- linux.orig/mm/slub.c 2013-05-20 15:21:26.925704505 -0500
> +++ linux/mm/slub.c 2013-05-20 15:38:00.681437630 -0500
> @@ -1530,7 +1530,9 @@ static inline void *acquire_slab(struct
> return freelist;
> }
>
> +#ifdef CONFIG_SLUB_CPU_PARTIAL
> static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain);
> +#endif
> static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags);
>
> /*
> @@ -1570,10 +1572,15 @@ static void *get_partial_node(struct kme
> stat(s, ALLOC_FROM_PARTIAL);
> object = t;
> } else {
> +#ifdef CONFIG_SLUB_CPU_PARTIAL
> put_cpu_partial(s, page, 0);
> stat(s, CPU_PARTIAL_NODE);
> +#else
> + BUG();
> +#endif
> }
> - if (kmem_cache_debug(s) || available > s->cpu_partial / 2)
> + if (kmem_cache_debug(s) ||
> + available > kmem_cache_cpu_partial(s) / 2)
> break;
>
> }
> @@ -1874,6 +1881,7 @@ redo:
> }
> }
>
> +#ifdef CONFIG_SLUB_CPU_PARTIAL
> /*
> * Unfreeze all the cpu partial slabs.
> *
> @@ -1988,6 +1996,7 @@ static void put_cpu_partial(struct kmem_
>
> } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage);
> }
> +#endif
>
> static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
> {
> @@ -2012,7 +2021,9 @@ static inline void __flush_cpu_slab(stru
> if (c->page)
> flush_slab(s, c);
>
> +#ifdef CONFIG_SLUB_CPU_PARTIAL
> unfreeze_partials(s, c);
> +#endif
> }
> }
>
> @@ -2028,7 +2039,7 @@ static bool has_cpu_slab(int cpu, void *
> struct kmem_cache *s = info;
> struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
>
> - return c->page || c->partial;
> + return c->page || kmem_cache_cpu_partial_pages(c);
> }
>
> static void flush_all(struct kmem_cache *s)
> @@ -2224,6 +2235,7 @@ static void *__slab_alloc(struct kmem_ca
> page = c->page;
> if (!page)
> goto new_slab;
> +
> redo:
>
> if (unlikely(!node_match(page, node))) {
> @@ -2277,10 +2289,12 @@ load_freelist:
>
> new_slab:
>
> - if (c->partial) {
> + if (kmem_cache_cpu_partial_pages(c)) {
> +#ifdef CONFIG_SLUB_CPU_PARTIAL
> page = c->page = c->partial;
> c->partial = page->next;
> stat(s, CPU_PARTIAL_ALLOC);
> +#endif
> c->freelist = NULL;
> goto redo;
> }
> @@ -2495,6 +2509,7 @@ static void __slab_free(struct kmem_cach
> new.inuse--;
> if ((!new.inuse || !prior) && !was_frozen) {
>
> +#ifdef CONFIG_SLUB_CPU_PARTIAL
> if (!kmem_cache_debug(s) && !prior)
>
> /*
> @@ -2503,7 +2518,9 @@ static void __slab_free(struct kmem_cach
> */
> new.frozen = 1;
>
> - else { /* Needs to be taken off a list */
> + else
> +#endif
> + { /* Needs to be taken off a list */
>
> n = get_node(s, page_to_nid(page));
> /*
> @@ -2525,6 +2542,7 @@ static void __slab_free(struct kmem_cach
> "__slab_free"));
>
> if (likely(!n)) {
> +#ifdef CONFIG_SLUB_CPU_PARTIAL
>
> /*
> * If we just froze the page then put it onto the
> @@ -2534,6 +2552,7 @@ static void __slab_free(struct kmem_cach
> put_cpu_partial(s, page, 1);
> stat(s, CPU_PARTIAL_FREE);
> }
> +#endif
> /*
> * The list lock was not taken therefore no list
> * activity can be necessary.
> @@ -3041,7 +3060,7 @@ static int kmem_cache_open(struct kmem_c
> * list to avoid pounding the page allocator excessively.
> */
> set_min_partial(s, ilog2(s->size) / 2);
> -
> +#ifdef CONFIG_SLUB_CPU_PARTIAL
> /*
> * cpu_partial determined the maximum number of objects kept in the
> * per cpu partial lists of a processor.
> @@ -3069,6 +3088,7 @@ static int kmem_cache_open(struct kmem_c
> s->cpu_partial = 13;
> else
> s->cpu_partial = 30;
> +#endif
>
> #ifdef CONFIG_NUMA
> s->remote_node_defrag_ratio = 1000;
> @@ -4283,14 +4303,15 @@ static ssize_t show_slab_objects(struct
> total += x;
> nodes[node] += x;
>
> +#ifdef CONFIG_SLUB_CPU_PARTIAL
> page = ACCESS_ONCE(c->partial);
> if (page) {
> x = page->pobjects;
> total += x;
> nodes[node] += x;
> }
> -
> per_cpu[node]++;
> +#endif
> }
> }
>
> @@ -4442,6 +4463,7 @@ static ssize_t min_partial_store(struct
> }
> SLAB_ATTR(min_partial);
>
> +#ifdef CONFIG_SLUB_CPU_PARTIAL
> static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
> {
> return sprintf(buf, "%u\n", s->cpu_partial);
> @@ -4464,6 +4486,7 @@ static ssize_t cpu_partial_store(struct
> return length;
> }
> SLAB_ATTR(cpu_partial);
> +#endif
>
> static ssize_t ctor_show(struct kmem_cache *s, char *buf)
> {
> @@ -4503,6 +4526,7 @@ static ssize_t objects_partial_show(stru
> }
> SLAB_ATTR_RO(objects_partial);
>
> +#ifdef CONFIG_SLUB_CPU_PARTIAL
> static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
> {
> int objects = 0;
> @@ -4533,6 +4557,7 @@ static ssize_t slabs_cpu_partial_show(st
> return len + sprintf(buf + len, "\n");
> }
> SLAB_ATTR_RO(slabs_cpu_partial);
> +#endif
>
> static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
> {
> @@ -4856,11 +4881,13 @@ STAT_ATTR(DEACTIVATE_BYPASS, deactivate_
> STAT_ATTR(ORDER_FALLBACK, order_fallback);
> STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
> STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
> +#ifdef CONFIG_SLUB_CPU_PARTIAL
> STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
> STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
> STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
> STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
> #endif
> +#endif
>
> static struct attribute *slab_attrs[] = {
> &slab_size_attr.attr,
> @@ -4868,7 +4895,9 @@ static struct attribute *slab_attrs[] =
> &objs_per_slab_attr.attr,
> &order_attr.attr,
> &min_partial_attr.attr,
> +#ifdef CONFIG_SLUB_CPU_PARTIAL
> &cpu_partial_attr.attr,
> +#endif
> &objects_attr.attr,
> &objects_partial_attr.attr,
> &partial_attr.attr,
> @@ -4881,7 +4910,9 @@ static struct attribute *slab_attrs[] =
> &destroy_by_rcu_attr.attr,
> &shrink_attr.attr,
> &reserved_attr.attr,
> +#ifdef CONFIG_SLUB_CPU_PARTIAL
> &slabs_cpu_partial_attr.attr,
> +#endif
> #ifdef CONFIG_SLUB_DEBUG
> &total_objects_attr.attr,
> &slabs_attr.attr,
> @@ -4923,11 +4954,13 @@ static struct attribute *slab_attrs[] =
> &order_fallback_attr.attr,
> &cmpxchg_double_fail_attr.attr,
> &cmpxchg_double_cpu_fail_attr.attr,
> +#ifdef CONFIG_SLUB_CPU_PARTIAL
> &cpu_partial_alloc_attr.attr,
> &cpu_partial_free_attr.attr,
> &cpu_partial_node_attr.attr,
> &cpu_partial_drain_attr.attr,
> #endif
> +#endif
> #ifdef CONFIG_FAILSLAB
> &failslab_attr.attr,
> #endif
> Index: linux/init/Kconfig
> ===================================================================
> --- linux.orig/init/Kconfig 2013-05-20 15:21:26.925704505 -0500
> +++ linux/init/Kconfig 2013-05-20 15:21:26.921704441 -0500
> @@ -1558,6 +1558,17 @@ config SLOB
>
> endchoice
>
> +config SLUB_CPU_PARTIAL
> + default y
> + depends on SLUB
> + bool "SLUB per cpu partial cache"
> + help
> + Per cpu partial caches accelerate object allocation and freeing
> + that is local to a processor at the price of more indeterminism
> + in the latency of the free. On overflow these caches will be cleared
> + which requires the taking of locks that may cause latency spikes.
> + Typically one would choose no for a realtime system.
> +
> config MMAP_ALLOW_UNINITIALIZED
> bool "Allow mmapped anonymous memory to be uninitialized"
> depends on EXPERT && !MMU