Re: [RFC][PATCH v3 1/10] check reclaimable in hierarchy walk
From: Ying Han
Date: Fri May 27 2011 - 01:48:05 EST
On Wed, May 25, 2011 at 10:15 PM, KAMEZAWA Hiroyuki
<kamezawa.hiroyu@xxxxxxxxxxxxxx> wrote:
>
> I may post this patch as stand alone, later.
> ==
> Check memcg has reclaimable pages at select_victim().
>
> Now, with help of bitmap as memcg->scan_node, we can check whether memcg has
> reclaimable pages with easy test of node_empty(&mem->scan_nodes).
>
> mem->scan_nodes is a bitmap to show whether memcg contains reclaimable
> memory or not, which is updated periodically.
>
> This patch makes use of scan_nodes and modify hierarchy walk at memory
> shrinking in following way.
>
> - check scan_nodes in mem_cgroup_select_victim()
> - mem_cgroup_select_victim() returns NULL if no memcg is reclaimable.
> - force update of scan_nodes.
> - rename mem_cgroup_select_victim() to be mem_cgroup_select_get_victim()
> to show refcnt is +1.
>
> This will make hierarchy walk better.
>
> And this allows to remove mem_cgroup_local_pages() check which was used for
> the same purpose. But this function was wrong because it cannot handle
> information of unevictable pages and tmpfs v.s. swapless information.
>
> Changelog:
> - added since v3.
>
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
> ---
> mm/memcontrol.c | 165 +++++++++++++++++++++++++++++++++++++-------------------
> 1 file changed, 110 insertions(+), 55 deletions(-)
>
> Index: memcg_async/mm/memcontrol.c
> ===================================================================
> --- memcg_async.orig/mm/memcontrol.c
> +++ memcg_async/mm/memcontrol.c
> @@ -584,15 +584,6 @@ static long mem_cgroup_read_stat(struct
> return val;
> }
>
> -static long mem_cgroup_local_usage(struct mem_cgroup *mem)
> -{
> - long ret;
> -
> - ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
> - ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
> - return ret;
> -}
> -
> static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
> bool charge)
> {
> @@ -1555,43 +1546,6 @@ u64 mem_cgroup_get_limit(struct mem_cgro
> return min(limit, memsw);
> }
>
> -/*
> - * Visit the first child (need not be the first child as per the ordering
> - * of the cgroup list, since we track last_scanned_child) of @mem and use
> - * that to reclaim free pages from.
> - */
> -static struct mem_cgroup *
> -mem_cgroup_select_victim(struct mem_cgroup *root_mem)
> -{
> - struct mem_cgroup *ret = NULL;
> - struct cgroup_subsys_state *css;
> - int nextid, found;
> -
> - if (!root_mem->use_hierarchy) {
> - css_get(&root_mem->css);
> - ret = root_mem;
> - }
> -
> - while (!ret) {
> - rcu_read_lock();
> - nextid = root_mem->last_scanned_child + 1;
> - css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
> - &found);
> - if (css && css_tryget(css))
> - ret = container_of(css, struct mem_cgroup, css);
> -
> - rcu_read_unlock();
> - /* Updates scanning parameter */
> - if (!css) {
> - /* this means start scan from ID:1 */
> - root_mem->last_scanned_child = 0;
> - } else
> - root_mem->last_scanned_child = found;
> - }
> -
> - return ret;
> -}
> -
> #if MAX_NUMNODES > 1
>
> /*
> @@ -1600,11 +1554,11 @@ mem_cgroup_select_victim(struct mem_cgro
> * nodes based on the zonelist. So update the list loosely once per 10 secs.
> *
> */
> -static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
> +static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem, bool force)
> {
> int nid;
>
> - if (time_after(mem->next_scan_node_update, jiffies))
> + if (!force && time_after(mem->next_scan_node_update, jiffies))
> return;
>
> mem->next_scan_node_update = jiffies + 10*HZ;
> @@ -1641,7 +1595,7 @@ int mem_cgroup_select_victim_node(struct
> {
> int node;
>
> - mem_cgroup_may_update_nodemask(mem);
> + mem_cgroup_may_update_nodemask(mem, false);
> node = mem->last_scanned_node;
>
> node = next_node(node, mem->scan_nodes);
> @@ -1660,13 +1614,117 @@ int mem_cgroup_select_victim_node(struct
> return node;
> }
>
> +/**
> + * mem_cgroup_has_reclaimable
> + * @mem_cgroup : the mem_cgroup
> + *
> + * The caller can test whether the memcg has reclaimable pages.
> + *
> + * This function checks memcg has reclaimable pages or not with bitmap of
> + * memcg->scan_nodes. This bitmap is updated periodically and indicates
> + * which node has reclaimable memcg memory or not.
> + * Although this is a rough test and result is not very precise but we don't
> + * have to scan all nodes and don't have to use locks.
> + *
> + * For non-NUMA, this cheks reclaimable pages on zones because we don't
> + * update scan_nodes.(see below)
> + */
> +static bool mem_cgroup_has_reclaimable(struct mem_cgroup *memcg)
> +{
> + return !nodes_empty(memcg->scan_nodes);
> +}
> +
> #else
> +
> +static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem, bool force)
> +{
> +}
> +
> int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
> {
> return 0;
> }
> +
> +static bool mem_cgroup_has_reclaimable(struct mem_cgroup *memcg)
> +{
> + unsigned long nr;
> + int zid;
> +
> + for (zid = NODE_DATA(0)->nr_zones - 1; zid >= 0; zid--)
> + if (mem_cgroup_zone_reclaimable_pages(memcg, 0, zid))
> + break;
> + if (zid < 0)
> + return false;
> + return true;
> +}
> #endif
unused variable "nr".
--Ying
>
> +/**
> + * mem_cgroup_select_get_victim
> + * @root_mem: the root memcg of hierarchy which should be shrinked.
> + *
> + * Visit children of root_mem ony by one. If the routine finds a memcg
> + * which contains reclaimable pages, returns it with refcnt +1. The
> + * scan is done in round-robin and 'the next start point' is saved into
> + * mem->last_scanned_child. If no reclaimable memcg are found, returns NULL.
> + */
> +static struct mem_cgroup *
> +mem_cgroup_select_get_victim(struct mem_cgroup *root_mem)
> +{
> + struct mem_cgroup *ret = NULL;
> + struct cgroup_subsys_state *css;
> + int nextid, found;
> + bool second_visit = false;
> +
> + if (!root_mem->use_hierarchy)
> + goto return_root;
> +
> + while (!ret) {
> + rcu_read_lock();
> + nextid = root_mem->last_scanned_child + 1;
> + css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
> + &found);
> + if (css && css_tryget(css))
> + ret = container_of(css, struct mem_cgroup, css);
> +
> + rcu_read_unlock();
> + /* Updates scanning parameter */
> + if (!css) { /* Indicates we scanned the last node of tree */
> + /*
> + * If all memcg has no reclaimable pages, we may enter
> + * an infinite loop. Exit here if we reached the end
> + * of hierarchy tree twice.
> + */
> + if (second_visit)
> + return NULL;
> + /* this means start scan from ID:1 */
> + root_mem->last_scanned_child = 0;
> + second_visit = true;
> + } else
> + root_mem->last_scanned_child = found;
> + if (css && ret) {
> + /*
> + * check memcg has reclaimable memory or not. Update
> + * information carefully if we might fail with cached
> + * bitmask information.
> + */
> + if (second_visit)
> + mem_cgroup_may_update_nodemask(ret, true);
> +
> + if (!mem_cgroup_has_reclaimable(ret)) {
> + css_put(css);
> + ret = NULL;
> + }
> + }
> + }
> +
> + return ret;
> +return_root:
> + css_get(&root_mem->css);
> + return root_mem;
> +}
> +
> +
> /*
> * Scan the hierarchy if needed to reclaim memory. We remember the last child
> * we reclaimed from, so that we don't end up penalizing one child extensively
> @@ -1705,7 +1763,9 @@ static int mem_cgroup_hierarchical_recla
> is_kswapd = true;
>
> while (1) {
> - victim = mem_cgroup_select_victim(root_mem);
> + victim = mem_cgroup_select_get_victim(root_mem);
> + if (!victim)
> + return total;
> if (victim == root_mem) {
> loop++;
> if (loop >= 1)
> @@ -1733,11 +1793,6 @@ static int mem_cgroup_hierarchical_recla
> }
> }
> }
> - if (!mem_cgroup_local_usage(victim)) {
> - /* this cgroup's local usage == 0 */
> - css_put(&victim->css);
> - continue;
> - }
> /* we use swappiness of local cgroup */
> if (check_soft) {
> ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/