[PATCH -mm v2 8/9] list_lru: introduce per-memcg lists

From: Vladimir Davydov
Date: Fri Oct 24 2014 - 06:38:54 EST


There are several FS shrinkers, including super_block::s_shrink, that
keep reclaimable objects in the list_lru structure. Hence to turn them
to memcg-aware shrinkers, it is enough to make list_lru per-memcg.

This patch does the trick. It adds an array of lru lists to the
list_lru_node structure (per-node part of the list_lru), one for each
kmem-active memcg, and dispatches every item addition or removal to the
list corresponding to the memcg which the item is accounted to. So now
the list_lru structure is not just per node, but per node and per memcg.

Not all list_lrus need this feature, so this patch also adds a new
method, list_lru_init_memcg, which initializes a list_lru as memcg
aware. Otherwise (i.e. if initialized with old list_lru_init), the
list_lru won't have per memcg lists.

Just like per memcg caches arrays, the arrays of per-memcg lists are
indexed by memcg_cache_id, so we must grow them whenever
memcg_max_cache_ids is increased. So we introduce a callback,
memcg_update_all_list_lrus, invoked by memcg_alloc_cache_id if the id
space is full.

The locking is implemented in a manner similar to lruvecs, i.e. we have
one lock per node that protects all lists (both global and per cgroup)
on the node.

Signed-off-by: Vladimir Davydov <vdavydov@xxxxxxxxxxxxx>
---
include/linux/list_lru.h | 54 ++++++--
include/linux/memcontrol.h | 7 +
mm/list_lru.c | 325 ++++++++++++++++++++++++++++++++++++++++----
mm/memcontrol.c | 27 ++++
4 files changed, 377 insertions(+), 36 deletions(-)

diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index ee9486ac0621..731acd3bd6e6 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -11,6 +11,8 @@
#include <linux/nodemask.h>
#include <linux/shrinker.h>

+struct mem_cgroup;
+
/* list_lru_walk_cb has to always return one of those */
enum lru_status {
LRU_REMOVED, /* item removed from list */
@@ -22,11 +24,26 @@ enum lru_status {
internally, but has to return locked. */
};

-struct list_lru_node {
- spinlock_t lock;
+struct list_lru_one {
struct list_head list;
/* kept as signed so we can catch imbalance bugs */
long nr_items;
+};
+
+struct list_lru_memcg {
+ /* array of per cgroup lists, indexed by memcg_cache_id */
+ struct list_lru_one *lru[0];
+};
+
+struct list_lru_node {
+ /* protects all lists on the node, including per cgroup */
+ spinlock_t lock;
+ /* global list, used for the root cgroup in cgroup aware lrus */
+ struct list_lru_one lru;
+#ifdef CONFIG_MEMCG_KMEM
+ /* for cgroup aware lrus points to per cgroup lists, otherwise NULL */
+ struct list_lru_memcg *memcg_lrus;
+#endif
} ____cacheline_aligned_in_smp;

struct list_lru {
@@ -36,12 +53,17 @@ struct list_lru {
#endif
};

+#ifdef CONFIG_MEMCG_KMEM
+int memcg_update_all_list_lrus(int num_memcgs);
+#endif
+
void list_lru_destroy(struct list_lru *lru);
-int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key);
-static inline int list_lru_init(struct list_lru *lru)
-{
- return list_lru_init_key(lru, NULL);
-}
+int __list_lru_init(struct list_lru *lru, bool memcg_aware,
+ struct lock_class_key *key);
+
+#define list_lru_init(lru) __list_lru_init((lru), false, NULL)
+#define list_lru_init_key(lru, key) __list_lru_init((lru), false, (key))
+#define list_lru_init_memcg(lru) __list_lru_init((lru), true, NULL)

/**
* list_lru_add: add an element to the lru list's tail
@@ -75,20 +97,23 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item);
bool list_lru_del(struct list_lru *lru, struct list_head *item);

/**
- * list_lru_count_node: return the number of objects currently held by @lru
+ * list_lru_count_one: return the number of objects currently held by @lru
* @lru: the lru pointer.
* @nid: the node id to count from.
+ * @memcg: the cgroup to count from.
*
* Always return a non-negative number, 0 for empty lists. There is no
* guarantee that the list is not updated while the count is being computed.
* Callers that want such a guarantee need to provide an outer lock.
*/
+unsigned long list_lru_count_one(struct list_lru *lru,
+ int nid, struct mem_cgroup *memcg);
unsigned long list_lru_count_node(struct list_lru *lru, int nid);

static inline unsigned long list_lru_shrink_count(struct list_lru *lru,
struct shrink_control *sc)
{
- return list_lru_count_node(lru, sc->nid);
+ return list_lru_count_one(lru, sc->nid, sc->memcg);
}

static inline unsigned long list_lru_count(struct list_lru *lru)
@@ -105,9 +130,10 @@ static inline unsigned long list_lru_count(struct list_lru *lru)
typedef enum lru_status
(*list_lru_walk_cb)(struct list_head *item, spinlock_t *lock, void *cb_arg);
/**
- * list_lru_walk_node: walk a list_lru, isolating and disposing freeable items.
+ * list_lru_walk_one: walk a list_lru, isolating and disposing freeable items.
* @lru: the lru pointer.
* @nid: the node id to scan from.
+ * @memcg: the cgroup to scan from.
* @isolate: callback function that is resposible for deciding what to do with
* the item currently being scanned
* @cb_arg: opaque type that will be passed to @isolate
@@ -125,6 +151,10 @@ typedef enum lru_status
*
* Return value: the number of objects effectively removed from the LRU.
*/
+unsigned long list_lru_walk_one(struct list_lru *lru,
+ int nid, struct mem_cgroup *memcg,
+ list_lru_walk_cb isolate, void *cb_arg,
+ unsigned long *nr_to_walk);
unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
list_lru_walk_cb isolate, void *cb_arg,
unsigned long *nr_to_walk);
@@ -133,8 +163,8 @@ static inline unsigned long
list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
list_lru_walk_cb isolate, void *cb_arg)
{
- return list_lru_walk_node(lru, sc->nid, isolate, cb_arg,
- &sc->nr_to_scan);
+ return list_lru_walk_one(lru, sc->nid, sc->memcg, isolate, cb_arg,
+ &sc->nr_to_scan);
}

static inline unsigned long
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index eebb56b94d23..fd1219cb8ee5 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -419,6 +419,8 @@ static inline bool memcg_kmem_enabled(void)
bool memcg_kmem_is_active(struct mem_cgroup *memcg);
bool memcg_kmem_is_active_subtree(struct mem_cgroup *memcg);

+struct mem_cgroup *mem_cgroup_from_kmem(void *ptr);
+
/*
* In general, we'll do everything in our power to not incur in any overhead
* for non-memcg users for the kmem functions. Not even a function call, if we
@@ -554,6 +556,11 @@ static inline bool memcg_kmem_is_active_subtree(struct mem_cgroup *memcg)
return false;
}

+static inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr)
+{
+ return NULL;
+}
+
static inline bool
memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
{
diff --git a/mm/list_lru.c b/mm/list_lru.c
index a9021cb3ccde..4041e5a1569b 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -10,6 +10,7 @@
#include <linux/list_lru.h>
#include <linux/slab.h>
#include <linux/mutex.h>
+#include <linux/memcontrol.h>

#ifdef CONFIG_MEMCG_KMEM
static LIST_HEAD(list_lrus);
@@ -38,16 +39,56 @@ static void list_lru_unregister(struct list_lru *lru)
}
#endif /* CONFIG_MEMCG_KMEM */

+#ifdef CONFIG_MEMCG_KMEM
+static inline bool list_lru_memcg_aware(struct list_lru *lru)
+{
+ return !!lru->node[0].memcg_lrus;
+}
+
+/*
+ * For the given list_lru_node return the list corresponding to the memory
+ * cgroup whose memcg_cache_id equals @idx. If @idx < 0 or the list_lru is not
+ * cgroup aware, the global list is returned.
+ */
+static inline struct list_lru_one *
+lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
+{
+ /*
+ * The lock protects the array of per cgroup lists from relocation
+ * (see update_memcg_lrus).
+ */
+ lockdep_assert_held(&nlru->lock);
+ if (nlru->memcg_lrus && idx >= 0)
+ return nlru->memcg_lrus->lru[idx];
+
+ return &nlru->lru;
+}
+#else
+static inline bool list_lru_memcg_aware(struct list_lru *lru)
+{
+ return false;
+}
+
+static inline struct list_lru_one *
+lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
+{
+ return &nlru->lru;
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
bool list_lru_add(struct list_lru *lru, struct list_head *item)
{
int nid = page_to_nid(virt_to_page(item));
struct list_lru_node *nlru = &lru->node[nid];
+ struct mem_cgroup *memcg = mem_cgroup_from_kmem(item);
+ struct list_lru_one *l;

spin_lock(&nlru->lock);
- WARN_ON_ONCE(nlru->nr_items < 0);
+ l = lru_from_memcg_idx(nlru, memcg_cache_id(memcg));
+ WARN_ON_ONCE(l->nr_items < 0);
if (list_empty(item)) {
- list_add_tail(item, &nlru->list);
- nlru->nr_items++;
+ list_add_tail(item, &l->list);
+ l->nr_items++;
spin_unlock(&nlru->lock);
return true;
}
@@ -60,12 +101,15 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
{
int nid = page_to_nid(virt_to_page(item));
struct list_lru_node *nlru = &lru->node[nid];
+ struct mem_cgroup *memcg = mem_cgroup_from_kmem(item);
+ struct list_lru_one *l;

spin_lock(&nlru->lock);
+ l = lru_from_memcg_idx(nlru, memcg_cache_id(memcg));
if (!list_empty(item)) {
list_del_init(item);
- nlru->nr_items--;
- WARN_ON_ONCE(nlru->nr_items < 0);
+ l->nr_items--;
+ WARN_ON_ONCE(l->nr_items < 0);
spin_unlock(&nlru->lock);
return true;
}
@@ -74,33 +118,58 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
}
EXPORT_SYMBOL_GPL(list_lru_del);

-unsigned long
-list_lru_count_node(struct list_lru *lru, int nid)
+static unsigned long __list_lru_count_one(struct list_lru *lru,
+ int nid, int memcg_idx)
{
- unsigned long count = 0;
struct list_lru_node *nlru = &lru->node[nid];
+ struct list_lru_one *l;
+ unsigned long count;

spin_lock(&nlru->lock);
- WARN_ON_ONCE(nlru->nr_items < 0);
- count += nlru->nr_items;
+ l = lru_from_memcg_idx(nlru, memcg_idx);
+ WARN_ON_ONCE(l->nr_items < 0);
+ count = l->nr_items;
spin_unlock(&nlru->lock);

return count;
}
+
+unsigned long list_lru_count_one(struct list_lru *lru,
+ int nid, struct mem_cgroup *memcg)
+{
+ return __list_lru_count_one(lru, nid, memcg_cache_id(memcg));
+}
+EXPORT_SYMBOL_GPL(list_lru_count_one);
+
+unsigned long list_lru_count_node(struct list_lru *lru, int nid)
+{
+ long count = 0;
+ int memcg_idx;
+
+ count += __list_lru_count_one(lru, nid, -1);
+ if (list_lru_memcg_aware(lru)) {
+ for_each_memcg_cache_index(memcg_idx)
+ count += __list_lru_count_one(lru, nid, memcg_idx);
+ }
+ return count;
+}
EXPORT_SYMBOL_GPL(list_lru_count_node);

-unsigned long
-list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate,
- void *cb_arg, unsigned long *nr_to_walk)
+static unsigned long
+__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx,
+ list_lru_walk_cb isolate, void *cb_arg,
+ unsigned long *nr_to_walk)
{

- struct list_lru_node *nlru = &lru->node[nid];
+ struct list_lru_node *nlru = &lru->node[nid];
+ struct list_lru_one *l;
struct list_head *item, *n;
unsigned long isolated = 0;

spin_lock(&nlru->lock);
+ l = lru_from_memcg_idx(nlru, memcg_idx);
restart:
- list_for_each_safe(item, n, &nlru->list) {
+ list_for_each_safe(item, n, &l->list) {
enum lru_status ret;

/*
@@ -116,8 +185,8 @@ restart:
case LRU_REMOVED_RETRY:
assert_spin_locked(&nlru->lock);
case LRU_REMOVED:
- nlru->nr_items--;
- WARN_ON_ONCE(nlru->nr_items < 0);
+ l->nr_items--;
+ WARN_ON_ONCE(l->nr_items < 0);
isolated++;
/*
* If the lru lock has been dropped, our list
@@ -128,7 +197,7 @@ restart:
goto restart;
break;
case LRU_ROTATE:
- list_move_tail(item, &nlru->list);
+ list_move_tail(item, &l->list);
break;
case LRU_SKIP:
break;
@@ -147,12 +216,205 @@ restart:
spin_unlock(&nlru->lock);
return isolated;
}
+
+unsigned long
+list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
+ list_lru_walk_cb isolate, void *cb_arg,
+ unsigned long *nr_to_walk)
+{
+ return __list_lru_walk_one(lru, nid, memcg_cache_id(memcg),
+ isolate, cb_arg, nr_to_walk);
+}
+EXPORT_SYMBOL_GPL(list_lru_walk_one);
+
+unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
+ list_lru_walk_cb isolate, void *cb_arg,
+ unsigned long *nr_to_walk)
+{
+ long isolated = 0;
+ int memcg_idx;
+
+ isolated += __list_lru_walk_one(lru, nid, -1, isolate, cb_arg,
+ nr_to_walk);
+ if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) {
+ for_each_memcg_cache_index(memcg_idx) {
+ isolated += __list_lru_walk_one(lru, nid, memcg_idx,
+ isolate, cb_arg, nr_to_walk);
+ if (*nr_to_walk <= 0)
+ break;
+ }
+ }
+ return isolated;
+}
EXPORT_SYMBOL_GPL(list_lru_walk_node);

-int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key)
+static void init_one_lru(struct list_lru_one *l)
+{
+ INIT_LIST_HEAD(&l->list);
+ l->nr_items = 0;
+}
+
+#ifdef CONFIG_MEMCG_KMEM
+/*
+ * Free the given array of per cgroup lists.
+ */
+static void list_lru_memcg_free(struct list_lru_memcg *p)
+{
+ int i, nr;
+
+ if (p) {
+ nr = ksize(p) / sizeof(void *);
+ for (i = 0; i < nr; i++)
+ kfree(p->lru[i]);
+ kfree(p);
+ }
+}
+
+/*
+ * Allocate an array of per cgroup lists that may store up to @nr lists and
+ * initialize it starting from @init_from.
+ */
+static struct list_lru_memcg *list_lru_memcg_alloc(int nr, int init_from)
+{
+ int i;
+ struct list_lru_memcg *p;
+ struct list_lru_one *l;
+
+ p = kmalloc(nr * sizeof(void *), GFP_KERNEL);
+ if (!p)
+ return NULL;
+
+ /*
+ * Instead of storing the array size along with the array, we employ
+ * ksize(). Therefore we must zero the whole structure to make sure
+ * list_lru_memcg_free() won't dereference crap.
+ */
+ memset(p, 0, ksize(p));
+
+ for (i = init_from; i < nr; i++) {
+ l = kmalloc(sizeof(struct list_lru_one), GFP_KERNEL);
+ if (!l) {
+ list_lru_memcg_free(p);
+ return NULL;
+ }
+ init_one_lru(l);
+ p->lru[i] = l;
+ }
+ return p;
+}
+
+/*
+ * Destroy per cgroup lists on each node for the given list_lru.
+ */
+static void list_lru_destroy_memcg_lrus(struct list_lru *lru)
+{
+ int i;
+
+ for (i = 0; i < nr_node_ids; i++)
+ list_lru_memcg_free(lru->node[i].memcg_lrus);
+}
+
+/*
+ * Initialize per cgroup lists on each node for the given list_lru.
+ */
+static int list_lru_init_memcg_lrus(struct list_lru *lru)
+{
+ int i;
+ struct list_lru_memcg *p;
+
+ for (i = 0; i < nr_node_ids; i++) {
+ /*
+ * If memcg_max_cache_ids equals 0 (i.e. kmem accounting is
+ * inactive), kmalloc will return ZERO_SIZE_PTR (not NULL), so
+ * that the lru will still be cgroup aware.
+ */
+ p = list_lru_memcg_alloc(memcg_max_cache_ids, 0);
+ if (!p) {
+ list_lru_destroy_memcg_lrus(lru);
+ return -ENOMEM;
+ }
+ lru->node[i].memcg_lrus = p;
+ }
+ return 0;
+}
+
+/*
+ * Update per cgroup list arrays on each node for the given list_lru to be able
+ * to store up to @num_memcgs elements.
+ */
+static int list_lru_update_memcg_lrus(struct list_lru *lru, int num_memcgs)
+{
+ int i;
+ struct list_lru_node *nlru;
+ struct list_lru_memcg *old, *new;
+
+ for (i = 0; i < nr_node_ids; i++) {
+ nlru = &lru->node[i];
+ old = nlru->memcg_lrus;
+
+ new = list_lru_memcg_alloc(num_memcgs, memcg_max_cache_ids);
+ if (!new)
+ return -ENOMEM;
+
+ memcpy(new, old, memcg_max_cache_ids * sizeof(void *));
+
+ /*
+ * The lock guarantees that we won't race with a reader
+ * (see also lru_from_memcg_idx).
+ *
+ * Since list_lru functions may be called under an IRQ-safe
+ * lock, we have to use IRQ-safe primitives here to avoid
+ * deadlock.
+ */
+ spin_lock_irq(&nlru->lock);
+ nlru->memcg_lrus = new;
+ spin_unlock_irq(&nlru->lock);
+
+ kfree(old);
+ }
+ return 0;
+}
+
+/*
+ * This function is called from the memory cgroup core before increasing
+ * memcg_max_cache_ids. We must update all lrus' arrays of per cgroup lists to
+ * conform to the new size. The memcg_cache_id_space_sem is held for writing.
+ */
+int memcg_update_all_list_lrus(int num_memcgs)
+{
+ int ret = 0;
+ struct list_lru *lru;
+
+ mutex_lock(&list_lrus_mutex);
+ list_for_each_entry(lru, &list_lrus, list) {
+ ret = list_lru_update_memcg_lrus(lru, num_memcgs);
+ /*
+ * It isn't worth the trouble to revert to the old size if we
+ * fail, so we just leave the lrus updated to this point.
+ */
+ if (ret)
+ break;
+ }
+ mutex_unlock(&list_lrus_mutex);
+ return ret;
+}
+#else
+static int list_lru_init_memcg_lrus(struct list_lru *lru)
+{
+ return 0;
+}
+
+static void list_lru_destroy_memcg_lrus(struct list_lru *lru)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
+int __list_lru_init(struct list_lru *lru, bool memcg_aware,
+ struct lock_class_key *key)
{
int i;
size_t size = sizeof(*lru->node) * nr_node_ids;
+ int err = 0;

lru->node = kzalloc(size, GFP_KERNEL);
if (!lru->node)
@@ -162,13 +424,27 @@ int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key)
spin_lock_init(&lru->node[i].lock);
if (key)
lockdep_set_class(&lru->node[i].lock, key);
- INIT_LIST_HEAD(&lru->node[i].list);
- lru->node[i].nr_items = 0;
+ init_one_lru(&lru->node[i].lru);
}
- list_lru_register(lru);
- return 0;
+
+ /*
+ * Note, memcg_max_cache_ids must remain stable while we are
+ * allocating per cgroup lrus *and* registering the list_lru,
+ * otherwise memcg_update_all_list_lrus can skip our list_lru.
+ */
+ memcg_lock_cache_id_space();
+ if (memcg_aware)
+ err = list_lru_init_memcg_lrus(lru);
+
+ if (!err)
+ list_lru_register(lru);
+ else
+ kfree(lru->node);
+
+ memcg_unlock_cache_id_space();
+ return err;
}
-EXPORT_SYMBOL_GPL(list_lru_init_key);
+EXPORT_SYMBOL_GPL(__list_lru_init);

void list_lru_destroy(struct list_lru *lru)
{
@@ -176,6 +452,7 @@ void list_lru_destroy(struct list_lru *lru)
if (!lru->node)
return;
list_lru_unregister(lru);
+ list_lru_destroy_memcg_lrus(lru);
kfree(lru->node);
lru->node = NULL;
}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 444bf8fe5f1d..81f4d2485fbc 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2625,6 +2625,9 @@ static int memcg_alloc_cache_id(void)
mutex_unlock(&memcg_slab_mutex);

if (!err)
+ err = memcg_update_all_list_lrus(size);
+
+ if (!err)
memcg_max_cache_ids = size;
up_write(&memcg_cache_id_space_sem);

@@ -3013,6 +3016,30 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
memcg_uncharge_kmem(memcg, 1 << order);
pc->mem_cgroup = NULL;
}
+
+struct mem_cgroup *mem_cgroup_from_kmem(void *ptr)
+{
+ struct mem_cgroup *memcg = NULL;
+ struct page_cgroup *pc;
+ struct kmem_cache *cachep;
+ struct page *page;
+
+ if (!memcg_kmem_enabled())
+ return NULL;
+
+ page = virt_to_head_page(ptr);
+ if (PageSlab(page)) {
+ cachep = page->slab_cache;
+ if (!is_root_cache(cachep))
+ memcg = cachep->memcg_params->memcg;
+ } else {
+ /* page allocated with alloc_kmem_pages */
+ pc = lookup_page_cgroup(page);
+ if (pc->mem_cgroup)
+ memcg = pc->mem_cgroup;
+ }
+ return memcg;
+}
#else
static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg)
{
--
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/