Re: [rfc patch 3/6] mm: memcg-aware global reclaim

From: KAMEZAWA Hiroyuki
Date: Thu May 12 2011 - 20:47:42 EST


On Thu, 12 May 2011 16:53:55 +0200
Johannes Weiner <hannes@xxxxxxxxxxx> wrote:

> A page charged to a memcg is linked to a lru list specific to that
> memcg. At the same time, traditional global reclaim is obvlivious to
> memcgs, and all the pages are also linked to a global per-zone list.
>
> This patch changes traditional global reclaim to iterate over all
> existing memcgs, so that it no longer relies on the global list being
> present.
>
> This is one step forward in integrating memcg code better into the
> rest of memory management. It is also a prerequisite to get rid of
> the global per-zone lru lists.
>
> RFC:
>
> The algorithm implemented in this patch is very naive. For each zone
> scanned at each priority level, it iterates over all existing memcgs
> and considers them for scanning.
>
> This is just a prototype and I did not optimize it yet because I am
> unsure about the maximum number of memcgs that still constitute a sane
> configuration in comparison to the machine size.
>
> It is perfectly fair since all memcgs are scanned at each priority
> level.
>
> On my 4G quadcore laptop with 1000 memcgs, a significant amount of CPU
> time was spent just iterating memcgs during reclaim. But it can not
> really be claimed that the old code was much better, either: global
> LRU reclaim could mean that a few hundred memcgs would have been
> emptied out completely, while others stayed untouched.
>
> I am open to solutions that trade fairness against CPU-time but don't
> want to have an extreme in either direction. Maybe break out early if
> a number of memcgs has been successfully reclaimed from and remember
> the last one scanned.
>
> Signed-off-by: Johannes Weiner <hannes@xxxxxxxxxxx>
> ---
> include/linux/memcontrol.h | 7 ++
> mm/memcontrol.c | 148 +++++++++++++++++++++++++++++---------------
> mm/vmscan.c | 21 +++++--
> 3 files changed, 120 insertions(+), 56 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 5e9840f5..58728c7 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -104,6 +104,7 @@ extern void mem_cgroup_end_migration(struct mem_cgroup *mem,
> /*
> * For memory reclaim.
> */
> +void mem_cgroup_hierarchy_walk(struct mem_cgroup *, struct mem_cgroup **);
> int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
> int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
> unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
> @@ -289,6 +290,12 @@ static inline bool mem_cgroup_disabled(void)
> return true;
> }
>
> +static inline void mem_cgroup_hierarchy_walk(struct mem_cgroup *start,
> + struct mem_cgroup **iter)
> +{
> + *iter = start;
> +}
> +
> static inline int
> mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
> {
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index bf5ab87..edcd55a 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -313,7 +313,7 @@ static bool move_file(void)
> }
>
> /*
> - * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
> + * Maximum loops in mem_cgroup_soft_reclaim(), used for soft
> * limit reclaim to prevent infinite loops, if they ever occur.
> */
> #define MEM_CGROUP_MAX_RECLAIM_LOOPS (100)
> @@ -339,16 +339,6 @@ enum charge_type {
> /* Used for OOM nofiier */
> #define OOM_CONTROL (0)
>
> -/*
> - * Reclaim flags for mem_cgroup_hierarchical_reclaim
> - */
> -#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0
> -#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT)
> -#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1
> -#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT)
> -#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2
> -#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT)
> -
> static void mem_cgroup_get(struct mem_cgroup *mem);
> static void mem_cgroup_put(struct mem_cgroup *mem);
> static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
> @@ -1381,6 +1371,86 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
> return min(limit, memsw);
> }
>
> +void mem_cgroup_hierarchy_walk(struct mem_cgroup *start,
> + struct mem_cgroup **iter)
> +{
> + struct mem_cgroup *mem = *iter;
> + int id;
> +
> + if (!start)
> + start = root_mem_cgroup;
> + /*
> + * Even without hierarchy explicitely enabled in the root
> + * memcg, it is the ultimate parent of all memcgs.
> + */
> + if (!(start == root_mem_cgroup || start->use_hierarchy)) {
> + *iter = start;
> + return;
> + }
> +
> + if (!mem)
> + id = css_id(&start->css);
> + else {
> + id = css_id(&mem->css);
> + css_put(&mem->css);
> + mem = NULL;
> + }
> +
> + do {
> + struct cgroup_subsys_state *css;
> +
> + rcu_read_lock();
> + css = css_get_next(&mem_cgroup_subsys, id+1, &start->css, &id);
> + /*
> + * The caller must already have a reference to the
> + * starting point of this hierarchy walk, do not grab
> + * another one. This way, the loop can be finished
> + * when the hierarchy root is returned, without any
> + * further cleanup required.
> + */
> + if (css && (css == &start->css || css_tryget(css)))
> + mem = container_of(css, struct mem_cgroup, css);
> + rcu_read_unlock();
> + if (!css)
> + id = 0;
> + } while (!mem);
> +
> + if (mem == root_mem_cgroup)
> + mem = NULL;
> +
> + *iter = mem;
> +}
> +
> +static unsigned long mem_cgroup_target_reclaim(struct mem_cgroup *mem,
> + gfp_t gfp_mask,
> + bool noswap,
> + bool shrink)
> +{
> + unsigned long total = 0;
> + int loop;
> +
> + if (mem->memsw_is_minimum)
> + noswap = true;
> +
> + for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
> + drain_all_stock_async();
> + total += try_to_free_mem_cgroup_pages(mem, gfp_mask, noswap,
> + get_swappiness(mem));
> + if (total && shrink)
> + break;
> + if (mem_cgroup_margin(mem))
> + break;
> + /*
> + * If we have not been able to reclaim anything after
> + * two reclaim attempts, there may be no reclaimable
> + * pages under this hierarchy.
> + */
> + if (loop && !total)
> + break;
> + }
> + return total;
> +}
> +
> /*
> * Visit the first child (need not be the first child as per the ordering
> * of the cgroup list, since we track last_scanned_child) of @mem and use
> @@ -1427,21 +1497,16 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
> *
> * We give up and return to the caller when we visit root_mem twice.
> * (other groups can be removed while we're walking....)
> - *
> - * If shrink==true, for avoiding to free too much, this returns immedieately.
> */
> -static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
> - struct zone *zone,
> - gfp_t gfp_mask,
> - unsigned long reclaim_options)
> +static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_mem,
> + struct zone *zone,
> + gfp_t gfp_mask)
> {
> struct mem_cgroup *victim;
> int ret, total = 0;
> int loop = 0;
> - bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP;
> - bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK;
> - bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT;
> unsigned long excess;
> + bool noswap = false;
>
> excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT;
>
> @@ -1461,7 +1526,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
> * anything, it might because there are
> * no reclaimable pages under this hierarchy
> */
> - if (!check_soft || !total) {
> + if (!total) {
> css_put(&victim->css);
> break;
> }
> @@ -1484,25 +1549,11 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
> continue;
> }
> /* we use swappiness of local cgroup */
> - if (check_soft)
> - ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
> + ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
> noswap, get_swappiness(victim), zone);
> - else
> - ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
> - noswap, get_swappiness(victim));
> css_put(&victim->css);
> - /*
> - * At shrinking usage, we can't check we should stop here or
> - * reclaim more. It's depends on callers. last_scanned_child
> - * will work enough for keeping fairness under tree.
> - */
> - if (shrink)
> - return ret;
> total += ret;
> - if (check_soft) {
> - if (!res_counter_soft_limit_excess(&root_mem->res))
> - return total;
> - } else if (mem_cgroup_margin(root_mem))
> + if (!res_counter_soft_limit_excess(&root_mem->res))
> return total;
> }
> return total;
> @@ -1897,7 +1948,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
> unsigned long csize = nr_pages * PAGE_SIZE;
> struct mem_cgroup *mem_over_limit;
> struct res_counter *fail_res;
> - unsigned long flags = 0;
> + bool noswap = false;
> int ret;
>
> ret = res_counter_charge(&mem->res, csize, &fail_res);
> @@ -1911,7 +1962,7 @@ static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
>
> res_counter_uncharge(&mem->res, csize);
> mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
> - flags |= MEM_CGROUP_RECLAIM_NOSWAP;
> + noswap = true;
> } else
> mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
> /*
> @@ -1927,8 +1978,8 @@ static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
> if (!(gfp_mask & __GFP_WAIT))
> return CHARGE_WOULDBLOCK;
>
> - ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
> - gfp_mask, flags);
> + ret = mem_cgroup_target_reclaim(mem_over_limit, gfp_mask,
> + noswap, false);
> if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
> return CHARGE_RETRY;
> /*
> @@ -3085,7 +3136,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
>
> /*
> * A call to try to shrink memory usage on charge failure at shmem's swapin.
> - * Calling hierarchical_reclaim is not enough because we should update
> + * Calling target_reclaim is not enough because we should update
> * last_oom_jiffies to prevent pagefault_out_of_memory from invoking global OOM.
> * Moreover considering hierarchy, we should reclaim from the mem_over_limit,
> * not from the memcg which this page would be charged to.
> @@ -3167,7 +3218,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
> int enlarge;
>
> /*
> - * For keeping hierarchical_reclaim simple, how long we should retry
> + * For keeping target_reclaim simple, how long we should retry
> * is depends on callers. We set our retry-count to be function
> * of # of children which we should visit in this loop.
> */
> @@ -3210,8 +3261,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
> if (!ret)
> break;
>
> - mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
> - MEM_CGROUP_RECLAIM_SHRINK);
> + mem_cgroup_target_reclaim(memcg, GFP_KERNEL, false, false);
> curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
> /* Usage is reduced ? */
> if (curusage >= oldusage)
> @@ -3269,9 +3319,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
> if (!ret)
> break;
>
> - mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
> - MEM_CGROUP_RECLAIM_NOSWAP |
> - MEM_CGROUP_RECLAIM_SHRINK);
> + mem_cgroup_target_reclaim(memcg, GFP_KERNEL, true, false);
> curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
> /* Usage is reduced ? */
> if (curusage >= oldusage)
> @@ -3311,9 +3359,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
> if (!mz)
> break;
>
> - reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone,
> - gfp_mask,
> - MEM_CGROUP_RECLAIM_SOFT);
> + reclaimed = mem_cgroup_soft_reclaim(mz->mem, zone, gfp_mask);
> nr_reclaimed += reclaimed;
> spin_lock(&mctz->lock);
>
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index ceeb2a5..e2a3647 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -1900,8 +1900,8 @@ static inline bool should_continue_reclaim(struct zone *zone,
> /*
> * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
> */
> -static void shrink_zone(int priority, struct zone *zone,
> - struct scan_control *sc)
> +static void do_shrink_zone(int priority, struct zone *zone,
> + struct scan_control *sc)
> {
> unsigned long nr[NR_LRU_LISTS];
> unsigned long nr_to_scan;
> @@ -1914,8 +1914,6 @@ restart:
> nr_scanned = sc->nr_scanned;
> get_scan_count(zone, sc, nr, priority);
>
> - sc->current_memcg = sc->memcg;
> -
> while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
> nr[LRU_INACTIVE_FILE]) {
> for_each_evictable_lru(l) {
> @@ -1954,6 +1952,19 @@ restart:
> goto restart;
>
> throttle_vm_writeout(sc->gfp_mask);
> +}
> +
> +static void shrink_zone(int priority, struct zone *zone,
> + struct scan_control *sc)
> +{
> + struct mem_cgroup *root = sc->memcg;
> + struct mem_cgroup *mem = NULL;
> +
> + do {
> + mem_cgroup_hierarchy_walk(root, &mem);
> + sc->current_memcg = mem;
> + do_shrink_zone(priority, zone, sc);

If I don't miss something, css_put() against mem->css will be required somewhere.

Thanks,
-Kame

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/