[PATCH -mm 3/3] slab: lock_memory_hotplug for kmem_cache_{create,destroy,shrink}

From: Vladimir Davydov
Date: Sun Apr 06 2014 - 11:34:37 EST


When we create a sl[au]b cache, we allocate kmem_cache_node structures
for each online NUMA node. To handle nodes taken online/offline, we
register memory hotplug notifier and allocate/free kmem_cache_node
corresponding to the node that changes its state for each kmem cache.

To synchronize between the two paths we hold the slab_mutex during both
the cache creationg/destruction path and while tuning per-node parts of
kmem caches in memory hotplug handler, but that's not quite right,
because it does not guarantee that a newly created cache will have all
kmem_cache_nodes initialized in case it races with memory hotplug. For
instance, in case of slub:

CPU0 CPU1
---- ----
kmem_cache_create: online_pages:
__kmem_cache_create: slab_memory_callback:
slab_mem_going_online_callback:
lock slab_mutex
for each slab_caches list entry
allocate kmem_cache node
unlock slab_mutex
lock slab_mutex
init_kmem_cache_nodes:
for_each_node_state(node, N_NORMAL_MEMORY)
allocate kmem_cache node
add kmem_cache to slab_caches list
unlock slab_mutex
online_pages (continued):
node_states_set_node

As a result we'll get a kmem cache with not all kmem_cache_nodes
allocated.

To avoid issues like that we should hold the memory hotplug lock during
the whole kmem cache creation/destruction/shrink paths, just like we do
with cpu hotplug. This patch does the trick.

Signed-off-by: Vladimir Davydov <vdavydov@xxxxxxxxxxxxx>
Cc: Christoph Lameter <cl@xxxxxxxxxxxxxxxxxxxx>
Cc: Pekka Enberg <penberg@xxxxxxxxxx>
---
mm/slab.c | 26 ++------------------------
mm/slab.h | 1 +
mm/slab_common.c | 35 +++++++++++++++++++++++++++++++++--
mm/slob.c | 3 +--
mm/slub.c | 5 ++---
5 files changed, 39 insertions(+), 31 deletions(-)

diff --git a/mm/slab.c b/mm/slab.c
index eebc619ae33c..b33aae64e5d7 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2442,8 +2442,7 @@ out:
return nr_freed;
}

-/* Called with slab_mutex held to protect against cpu hotplug */
-static int __cache_shrink(struct kmem_cache *cachep)
+int __kmem_cache_shrink(struct kmem_cache *cachep)
{
int ret = 0, i = 0;
struct kmem_cache_node *n;
@@ -2464,32 +2463,11 @@ static int __cache_shrink(struct kmem_cache *cachep)
return (ret ? 1 : 0);
}

-/**
- * kmem_cache_shrink - Shrink a cache.
- * @cachep: The cache to shrink.
- *
- * Releases as many slabs as possible for a cache.
- * To help debugging, a zero exit status indicates all slabs were released.
- */
-int kmem_cache_shrink(struct kmem_cache *cachep)
-{
- int ret;
- BUG_ON(!cachep || in_interrupt());
-
- get_online_cpus();
- mutex_lock(&slab_mutex);
- ret = __cache_shrink(cachep);
- mutex_unlock(&slab_mutex);
- put_online_cpus();
- return ret;
-}
-EXPORT_SYMBOL(kmem_cache_shrink);
-
int __kmem_cache_shutdown(struct kmem_cache *cachep)
{
int i;
struct kmem_cache_node *n;
- int rc = __cache_shrink(cachep);
+ int rc = __kmem_cache_shrink(cachep);

if (rc)
return rc;
diff --git a/mm/slab.h b/mm/slab.h
index 3045316b7c9d..00a47de7631f 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -91,6 +91,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)

int __kmem_cache_shutdown(struct kmem_cache *);
+int __kmem_cache_shrink(struct kmem_cache *);

struct seq_file;
struct file;
diff --git a/mm/slab_common.c b/mm/slab_common.c
index f3cfccf76dda..321953cae074 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -205,6 +205,8 @@ kmem_cache_create(const char *name, size_t size, size_t align,
int err;

get_online_cpus();
+ lock_memory_hotplug();
+
mutex_lock(&slab_mutex);

err = kmem_cache_sanity_check(name, size);
@@ -239,6 +241,8 @@ kmem_cache_create(const char *name, size_t size, size_t align,

out_unlock:
mutex_unlock(&slab_mutex);
+
+ unlock_memory_hotplug();
put_online_cpus();

if (err) {
@@ -272,6 +276,8 @@ void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_c
char *cache_name;

get_online_cpus();
+ lock_memory_hotplug();
+
mutex_lock(&slab_mutex);

/*
@@ -299,6 +305,8 @@ void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_c

out_unlock:
mutex_unlock(&slab_mutex);
+
+ unlock_memory_hotplug();
put_online_cpus();
}

@@ -326,6 +334,8 @@ static int kmem_cache_destroy_memcg_children(struct kmem_cache *s)
void kmem_cache_destroy(struct kmem_cache *s)
{
get_online_cpus();
+ lock_memory_hotplug();
+
mutex_lock(&slab_mutex);

s->refcount--;
@@ -354,15 +364,36 @@ void kmem_cache_destroy(struct kmem_cache *s)
memcg_free_cache_params(s);
kfree(s->name);
kmem_cache_free(kmem_cache, s);
- goto out_put_cpus;
+ goto out;

out_unlock:
mutex_unlock(&slab_mutex);
-out_put_cpus:
+out:
+ unlock_memory_hotplug();
put_online_cpus();
}
EXPORT_SYMBOL(kmem_cache_destroy);

+/**
+ * kmem_cache_shrink - Shrink a cache.
+ * @cachep: The cache to shrink.
+ *
+ * Releases as many slabs as possible for a cache.
+ * To help debugging, a zero exit status indicates all slabs were released.
+ */
+int kmem_cache_shrink(struct kmem_cache *cachep)
+{
+ int ret;
+
+ get_online_cpus();
+ lock_memory_hotplug();
+ ret = __kmem_cache_shrink(cachep);
+ unlock_memory_hotplug();
+ put_online_cpus();
+ return ret;
+}
+EXPORT_SYMBOL(kmem_cache_shrink);
+
int slab_is_available(void)
{
return slab_state >= UP;
diff --git a/mm/slob.c b/mm/slob.c
index 730cad45d4be..21980e0f39a8 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -620,11 +620,10 @@ int __kmem_cache_shutdown(struct kmem_cache *c)
return 0;
}

-int kmem_cache_shrink(struct kmem_cache *d)
+int __kmem_cache_shrink(struct kmem_cache *d)
{
return 0;
}
-EXPORT_SYMBOL(kmem_cache_shrink);

struct kmem_cache kmem_cache_boot = {
.name = "kmem_cache",
diff --git a/mm/slub.c b/mm/slub.c
index 5e234f1f8853..26d185d8cd64 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3412,7 +3412,7 @@ EXPORT_SYMBOL(kfree);
* being allocated from last increasing the chance that the last objects
* are freed in them.
*/
-int kmem_cache_shrink(struct kmem_cache *s)
+int __kmem_cache_shrink(struct kmem_cache *s)
{
int node;
int i;
@@ -3468,7 +3468,6 @@ int kmem_cache_shrink(struct kmem_cache *s)
kfree(slabs_by_inuse);
return 0;
}
-EXPORT_SYMBOL(kmem_cache_shrink);

static int slab_mem_going_offline_callback(void *arg)
{
@@ -3476,7 +3475,7 @@ static int slab_mem_going_offline_callback(void *arg)

mutex_lock(&slab_mutex);
list_for_each_entry(s, &slab_caches, list)
- kmem_cache_shrink(s);
+ __kmem_cache_shrink(s);
mutex_unlock(&slab_mutex);

return 0;
--
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/