Re: [PATCH v5 03/13] mm: Assign memcg-aware shrinkers bitmap to memcg
From: Vladimir Davydov
Date: Sun May 13 2018 - 12:52:40 EST
On Thu, May 10, 2018 at 12:52:36PM +0300, Kirill Tkhai wrote:
> Imagine a big node with many cpus, memory cgroups and containers.
> Let we have 200 containers, every container has 10 mounts,
> and 10 cgroups. All container tasks don't touch foreign
> containers mounts. If there is intensive pages write,
> and global reclaim happens, a writing task has to iterate
> over all memcgs to shrink slab, before it's able to go
> to shrink_page_list().
>
> Iteration over all the memcg slabs is very expensive:
> the task has to visit 200 * 10 = 2000 shrinkers
> for every memcg, and since there are 2000 memcgs,
> the total calls are 2000 * 2000 = 4000000.
>
> So, the shrinker makes 4 million do_shrink_slab() calls
> just to try to isolate SWAP_CLUSTER_MAX pages in one
> of the actively writing memcg via shrink_page_list().
> I've observed a node spending almost 100% in kernel,
> making useless iteration over already shrinked slab.
>
> This patch adds bitmap of memcg-aware shrinkers to memcg.
> The size of the bitmap depends on bitmap_nr_ids, and during
> memcg life it's maintained to be enough to fit bitmap_nr_ids
> shrinkers. Every bit in the map is related to corresponding
> shrinker id.
>
> Next patches will maintain set bit only for really charged
> memcg. This will allow shrink_slab() to increase its
> performance in significant way. See the last patch for
> the numbers.
>
> Signed-off-by: Kirill Tkhai <ktkhai@xxxxxxxxxxxxx>
> ---
> include/linux/memcontrol.h | 21 ++++++++
> mm/memcontrol.c | 116 ++++++++++++++++++++++++++++++++++++++++++++
> mm/vmscan.c | 16 ++++++
> 3 files changed, 152 insertions(+), 1 deletion(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 6cbea2f25a87..e5e7e0fc7158 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -105,6 +105,17 @@ struct lruvec_stat {
> long count[NR_VM_NODE_STAT_ITEMS];
> };
>
> +#ifdef CONFIG_MEMCG_SHRINKER
> +/*
> + * Bitmap of shrinker::id corresponding to memcg-aware shrinkers,
> + * which have elements charged to this memcg.
> + */
> +struct memcg_shrinker_map {
> + struct rcu_head rcu;
> + unsigned long map[0];
> +};
> +#endif /* CONFIG_MEMCG_SHRINKER */
> +
AFAIR we don't normally ifdef structure definitions.
> /*
> * per-zone information in memory controller.
> */
> @@ -118,6 +129,9 @@ struct mem_cgroup_per_node {
>
> struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1];
>
> +#ifdef CONFIG_MEMCG_SHRINKER
> + struct memcg_shrinker_map __rcu *shrinker_map;
> +#endif
> struct rb_node tree_node; /* RB tree node */
> unsigned long usage_in_excess;/* Set to the value by which */
> /* the soft limit is exceeded*/
> @@ -1255,4 +1269,11 @@ static inline void memcg_put_cache_ids(void)
>
> #endif /* CONFIG_MEMCG && !CONFIG_SLOB */
>
> +#ifdef CONFIG_MEMCG_SHRINKER
> +#define MEMCG_SHRINKER_MAP(memcg, nid) (memcg->nodeinfo[nid]->shrinker_map)
I don't really like this helper macro. Accessing shrinker_map directly
looks cleaner IMO.
> +
> +extern int memcg_shrinker_nr_max;
As I've mentioned before, the capacity of shrinker map should be a
private business of memcontrol.c IMHO. We shouldn't use it in vmscan.c
as max shrinker id, instead we should introduce another variable for
this, private to vmscan.c.
> +extern int memcg_expand_shrinker_maps(int old_id, int id);
... Then this function would take just one argument, max id, and would
update shrinker_map capacity if necessary in memcontrol.c under the
corresponding mutex, which would look much more readable IMHO as all
shrinker_map related manipulations would be isolated in memcontrol.c.
> +#endif /* CONFIG_MEMCG_SHRINKER */
> +
> #endif /* _LINUX_MEMCONTROL_H */
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 3df3efa7ff40..18e0fdf302a9 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -322,6 +322,116 @@ struct workqueue_struct *memcg_kmem_cache_wq;
>
> #endif /* !CONFIG_SLOB */
>
> +#ifdef CONFIG_MEMCG_SHRINKER
> +int memcg_shrinker_nr_max;
memcg_shrinker_map_capacity, may be?
> +static DEFINE_MUTEX(shrinkers_nr_max_mutex);
memcg_shrinker_map_mutex?
> +
> +static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
> +{
> + kvfree(container_of(head, struct memcg_shrinker_map, rcu));
> +}
> +
> +static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
> + int size, int old_size)
If you followed my advice and made the shrinker_map_capacity private to
memcontrol.c, you wouldn't need to pass old_size here either, just max
shrinker id.
> +{
> + struct memcg_shrinker_map *new, *old;
> + int nid;
> +
> + lockdep_assert_held(&shrinkers_nr_max_mutex);
> +
> + for_each_node(nid) {
> + old = rcu_dereference_protected(MEMCG_SHRINKER_MAP(memcg, nid), true);
> + /* Not yet online memcg */
> + if (old_size && !old)
> + return 0;
> +
> + new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
> + if (!new)
> + return -ENOMEM;
> +
> + /* Set all old bits, clear all new bits */
> + memset(new->map, (int)0xff, old_size);
> + memset((void *)new->map + old_size, 0, size - old_size);
> +
> + rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
> + if (old)
> + call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
> + }
> +
> + return 0;
> +}
> +
> +static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
> +{
> + struct mem_cgroup_per_node *pn;
> + struct memcg_shrinker_map *map;
> + int nid;
> +
> + if (memcg == root_mem_cgroup)
> + return;
Nit: there's mem_cgroup_is_root() helper.
> +
> + mutex_lock(&shrinkers_nr_max_mutex);
Why do you need to take the mutex here? You don't access shrinker map
capacity here AFAICS.
> + for_each_node(nid) {
> + pn = mem_cgroup_nodeinfo(memcg, nid);
> + map = rcu_dereference_protected(pn->shrinker_map, true);
> + if (map)
> + call_rcu(&map->rcu, memcg_free_shrinker_map_rcu);
> + rcu_assign_pointer(pn->shrinker_map, NULL);
> + }
> + mutex_unlock(&shrinkers_nr_max_mutex);
> +}
> +
> +static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
> +{
> + int ret, size = memcg_shrinker_nr_max/BITS_PER_BYTE;
> +
> + if (memcg == root_mem_cgroup)
> + return 0;
Nit: mem_cgroup_is_root().
> +
> + mutex_lock(&shrinkers_nr_max_mutex);
> + ret = memcg_expand_one_shrinker_map(memcg, size, 0);
I don't think it's worth reusing the function designed for reallocating
shrinker maps for initial allocation. Please just fold the code here -
it will make both 'alloc' and 'expand' easier to follow IMHO.
> + mutex_unlock(&shrinkers_nr_max_mutex);
> +
> + if (ret)
> + memcg_free_shrinker_maps(memcg);
> +
> + return ret;
> +}
> +
> +static struct idr mem_cgroup_idr;
Stray change.
> +
> +int memcg_expand_shrinker_maps(int old_nr, int nr)
> +{
> + int size, old_size, ret = 0;
> + struct mem_cgroup *memcg;
> +
> + old_size = old_nr / BITS_PER_BYTE;
> + size = nr / BITS_PER_BYTE;
> +
> + mutex_lock(&shrinkers_nr_max_mutex);
> +
> + if (!root_mem_cgroup)
> + goto unlock;
This wants a comment.
> +
> + for_each_mem_cgroup(memcg) {
> + if (memcg == root_mem_cgroup)
Nit: mem_cgroup_is_root().
> + continue;
> + ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
> + if (ret)
> + goto unlock;
> + }
> +unlock:
> + mutex_unlock(&shrinkers_nr_max_mutex);
> + return ret;
> +}
> +#else /* CONFIG_MEMCG_SHRINKER */
> +static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
> +{
> + return 0;
> +}
> +static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
> +#endif /* CONFIG_MEMCG_SHRINKER */
> +
> /**
> * mem_cgroup_css_from_page - css of the memcg associated with a page
> * @page: page of interest
> @@ -4471,6 +4581,11 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
> {
> struct mem_cgroup *memcg = mem_cgroup_from_css(css);
>
> + if (memcg_alloc_shrinker_maps(memcg)) {
> + mem_cgroup_id_remove(memcg);
> + return -ENOMEM;
> + }
> +
> /* Online state pins memcg ID, memcg ID pins CSS */
> atomic_set(&memcg->id.ref, 1);
> css_get(css);
> @@ -4522,6 +4637,7 @@ static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
> vmpressure_cleanup(&memcg->vmpressure);
> cancel_work_sync(&memcg->high_work);
> mem_cgroup_remove_from_trees(memcg);
> + memcg_free_shrinker_maps(memcg);
> memcg_free_kmem(memcg);
> mem_cgroup_free(memcg);
> }
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index d691beac1048..d8a2870710e0 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -174,12 +174,26 @@ static DEFINE_IDR(shrinker_idr);
>
> static int prealloc_memcg_shrinker(struct shrinker *shrinker)
> {
> - int id, ret;
> + int id, nr, ret;
>
> down_write(&shrinker_rwsem);
> ret = id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
> if (ret < 0)
> goto unlock;
> +
> + if (id >= memcg_shrinker_nr_max) {
> + nr = memcg_shrinker_nr_max * 2;
> + if (nr == 0)
> + nr = BITS_PER_BYTE;
> + BUG_ON(id >= nr);
The logic defining shrinker map capacity growth should be private to
memcontrol.c IMHO.
> +
> + if (memcg_expand_shrinker_maps(memcg_shrinker_nr_max, nr)) {
> + idr_remove(&shrinker_idr, id);
> + goto unlock;
> + }
> + memcg_shrinker_nr_max = nr;
> + }
> +
> shrinker->id = id;
> ret = 0;
> unlock:
>