Re: [RFC][PATCH v3 1/10] check reclaimable in hierarchy walk

From: Ying Han
Date: Fri May 27 2011 - 01:48:05 EST


On Wed, May 25, 2011 at 10:15 PM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@xxxxxxxxxxxxxx> wrote:
>
> I may post this patch as stand alone, later.
> ==
> Check memcg has reclaimable pages at select_victim().
>
> Now, with help of bitmap as memcg->scan_node, we can check whether memcg has
> reclaimable pages with easy test of node_empty(&mem->scan_nodes).
>
> mem->scan_nodes is a bitmap to show whether memcg contains reclaimable
> memory or not, which is updated periodically.
>
> This patch makes use of scan_nodes and modify hierarchy walk at memory
> shrinking in following way.
>
>  - check scan_nodes in mem_cgroup_select_victim()
>  - mem_cgroup_select_victim() returns NULL if no memcg is reclaimable.
>  - force update of scan_nodes.
>  - rename mem_cgroup_select_victim() to be mem_cgroup_select_get_victim()
>    to show refcnt is +1.
>
> This will make hierarchy walk better.
>
> And this allows to remove mem_cgroup_local_pages() check which was used for
> the same purpose. But this function was wrong because it cannot handle
> information of unevictable pages and tmpfs v.s. swapless information.
>
> Changelog:
>  - added since v3.
>
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
> ---
>  mm/memcontrol.c |  165 +++++++++++++++++++++++++++++++++++++-------------------
>  1 file changed, 110 insertions(+), 55 deletions(-)
>
> Index: memcg_async/mm/memcontrol.c
> ===================================================================
> --- memcg_async.orig/mm/memcontrol.c
> +++ memcg_async/mm/memcontrol.c
> @@ -584,15 +584,6 @@ static long mem_cgroup_read_stat(struct
>        return val;
>  }
>
> -static long mem_cgroup_local_usage(struct mem_cgroup *mem)
> -{
> -       long ret;
> -
> -       ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
> -       ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
> -       return ret;
> -}
> -
>  static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
>                                         bool charge)
>  {
> @@ -1555,43 +1546,6 @@ u64 mem_cgroup_get_limit(struct mem_cgro
>        return min(limit, memsw);
>  }
>
> -/*
> - * Visit the first child (need not be the first child as per the ordering
> - * of the cgroup list, since we track last_scanned_child) of @mem and use
> - * that to reclaim free pages from.
> - */
> -static struct mem_cgroup *
> -mem_cgroup_select_victim(struct mem_cgroup *root_mem)
> -{
> -       struct mem_cgroup *ret = NULL;
> -       struct cgroup_subsys_state *css;
> -       int nextid, found;
> -
> -       if (!root_mem->use_hierarchy) {
> -               css_get(&root_mem->css);
> -               ret = root_mem;
> -       }
> -
> -       while (!ret) {
> -               rcu_read_lock();
> -               nextid = root_mem->last_scanned_child + 1;
> -               css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
> -                                  &found);
> -               if (css && css_tryget(css))
> -                       ret = container_of(css, struct mem_cgroup, css);
> -
> -               rcu_read_unlock();
> -               /* Updates scanning parameter */
> -               if (!css) {
> -                       /* this means start scan from ID:1 */
> -                       root_mem->last_scanned_child = 0;
> -               } else
> -                       root_mem->last_scanned_child = found;
> -       }
> -
> -       return ret;
> -}
> -
>  #if MAX_NUMNODES > 1
>
>  /*
> @@ -1600,11 +1554,11 @@ mem_cgroup_select_victim(struct mem_cgro
>  * nodes based on the zonelist. So update the list loosely once per 10 secs.
>  *
>  */
> -static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
> +static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem, bool force)
>  {
>        int nid;
>
> -       if (time_after(mem->next_scan_node_update, jiffies))
> +       if (!force && time_after(mem->next_scan_node_update, jiffies))
>                return;
>
>        mem->next_scan_node_update = jiffies + 10*HZ;
> @@ -1641,7 +1595,7 @@ int mem_cgroup_select_victim_node(struct
>  {
>        int node;
>
> -       mem_cgroup_may_update_nodemask(mem);
> +       mem_cgroup_may_update_nodemask(mem, false);
>        node = mem->last_scanned_node;
>
>        node = next_node(node, mem->scan_nodes);
> @@ -1660,13 +1614,117 @@ int mem_cgroup_select_victim_node(struct
>        return node;
>  }
>
> +/**
> + * mem_cgroup_has_reclaimable
> + * @mem_cgroup : the mem_cgroup
> + *
> + * The caller can test whether the memcg has reclaimable pages.
> + *
> + * This function checks memcg has reclaimable pages or not with bitmap of
> + * memcg->scan_nodes. This bitmap is updated periodically and indicates
> + * which node has reclaimable memcg memory or not.
> + * Although this is a rough test and result is not very precise but we don't
> + * have to scan all nodes and don't have to use locks.
> + *
> + * For non-NUMA, this cheks reclaimable pages on zones because we don't
> + * update scan_nodes.(see below)
> + */
> +static bool mem_cgroup_has_reclaimable(struct mem_cgroup *memcg)
> +{
> +       return !nodes_empty(memcg->scan_nodes);
> +}
> +
>  #else
> +
> +static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem, bool force)
> +{
> +}
> +
>  int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
>  {
>        return 0;
>  }
> +
> +static bool mem_cgroup_has_reclaimable(struct mem_cgroup *memcg)
> +{
> +       unsigned long nr;
> +       int zid;
> +
> +       for (zid = NODE_DATA(0)->nr_zones - 1; zid >= 0; zid--)
> +               if (mem_cgroup_zone_reclaimable_pages(memcg, 0, zid))
> +                       break;
> +       if (zid < 0)
> +               return false;
> +       return true;
> +}
>  #endif

unused variable "nr".

--Ying
>
> +/**
> + * mem_cgroup_select_get_victim
> + * @root_mem: the root memcg of hierarchy which should be shrinked.
> + *
> + * Visit children of root_mem ony by one. If the routine finds a memcg
> + * which contains reclaimable pages, returns it with refcnt +1. The
> + * scan is done in round-robin and 'the next start point' is saved into
> + * mem->last_scanned_child. If no reclaimable memcg are found, returns NULL.
> + */
> +static struct mem_cgroup *
> +mem_cgroup_select_get_victim(struct mem_cgroup *root_mem)
> +{
> +       struct mem_cgroup *ret = NULL;
> +       struct cgroup_subsys_state *css;
> +       int nextid, found;
> +       bool second_visit = false;
> +
> +       if (!root_mem->use_hierarchy)
> +               goto return_root;
> +
> +       while (!ret) {
> +               rcu_read_lock();
> +               nextid = root_mem->last_scanned_child + 1;
> +               css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
> +                                  &found);
> +               if (css && css_tryget(css))
> +                       ret = container_of(css, struct mem_cgroup, css);
> +
> +               rcu_read_unlock();
> +               /* Updates scanning parameter */
> +               if (!css) { /* Indicates we scanned the last node of tree */
> +                       /*
> +                        * If all memcg has no reclaimable pages, we may enter
> +                        * an infinite loop. Exit here if we reached the end
> +                        * of hierarchy tree twice.
> +                        */
> +                       if (second_visit)
> +                               return NULL;
> +                       /* this means start scan from ID:1 */
> +                       root_mem->last_scanned_child = 0;
> +                       second_visit = true;
> +               } else
> +                       root_mem->last_scanned_child = found;
> +               if (css && ret) {
> +                       /*
> +                        * check memcg has reclaimable memory or not. Update
> +                        * information carefully if we might fail with cached
> +                        * bitmask information.
> +                        */
> +                       if (second_visit)
> +                               mem_cgroup_may_update_nodemask(ret, true);
> +
> +                       if (!mem_cgroup_has_reclaimable(ret)) {
> +                               css_put(css);
> +                               ret = NULL;
> +                       }
> +               }
> +       }
> +
> +       return ret;
> +return_root:
> +       css_get(&root_mem->css);
> +       return root_mem;
> +}
> +
> +
>  /*
>  * Scan the hierarchy if needed to reclaim memory. We remember the last child
>  * we reclaimed from, so that we don't end up penalizing one child extensively
> @@ -1705,7 +1763,9 @@ static int mem_cgroup_hierarchical_recla
>                is_kswapd = true;
>
>        while (1) {
> -               victim = mem_cgroup_select_victim(root_mem);
> +               victim = mem_cgroup_select_get_victim(root_mem);
> +               if (!victim)
> +                       return total;
>                if (victim == root_mem) {
>                        loop++;
>                        if (loop >= 1)
> @@ -1733,11 +1793,6 @@ static int mem_cgroup_hierarchical_recla
>                                }
>                        }
>                }
> -               if (!mem_cgroup_local_usage(victim)) {
> -                       /* this cgroup's local usage == 0 */
> -                       css_put(&victim->css);
> -                       continue;
> -               }
>                /* we use swappiness of local cgroup */
>                if (check_soft) {
>                        ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/