Re: [RFC][PATCH] A scan node selection logic for memcg rather thanround-robin.

From: KAMEZAWA Hiroyuki
Date: Thu Jun 09 2011 - 20:05:54 EST


On Thu, 9 Jun 2011 19:10:31 +0900
KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx> wrote:

>
> A scan node selection logic for memcg rather than round-robin.
>
> This patch is what I'm testing now but I don't have big NUMA.
> please review if you have time. This is against linux-3.0-rc2.
>
I'm sorry that I noticed I can remove all "noswap" code then...
the size of patch will be half. I'll post more tested version in the next week.

Thanks,
-Kame


> ==
> From fedf64c8b7e42a8d03974456f01fa3774df94afe Mon Sep 17 00:00:00 2001
> From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
> Date: Thu, 9 Jun 2011 18:53:14 +0900
> Subject: [PATCH] weighted node scanning for memory cgroup.
>
> When I pushed 889976dbcb1218119fdd950fb7819084e37d7d37
> " memcg: reclaim memory from nodes in round-robin order "
>
> I mentioned " a better algorithm is needed."
>
> This patch is a better algorithm. This implements..
>
> 1. per node reclaim target possibilty (called as weight.)
> 2. select a node from nodes by proportional fair way with the weight
>
> For 1, this patch simply check the number of pages on LRU with
> some bias of active/inactive ratio and swappiness. A weight for
> node is calculated as
>
> weight = (inactive_files + active_files * active_ratio) * (200-swappines)
> (inactive_anon + active_files * active_ratio) * swappiness
>
> In general, if node contains more inactive files, the weight goes high.
> This weight can be seen by numa_stat.
>
> [root@bluextal linux-2.6]# cat /cgroup/memory/A/memory.numa_stat
> total=31863 N0=14235 N1=17628
> file=27912 N0=12814 N1=15098
> anon=3951 N0=1421 N1=2530
> unevictable=0 N0=0 N1=0
> lru_scan_weight=35798 N0=19886 N1=15912
>
> For 2. this patch uses lottery scheduling using node's weight.
> This is porportionally fair logic.
>
> For example, assume Node 0,1,2,3 and their weights are 1000,2000,1000,6000.
> The node selection possibility is 1:2:1:6. So, even if a node contains
> small amount of memory, it has possiblity to be reclaimed. In long run,
> there will prevent starvation....where the system ignores small umount of
> file caches in a node.....
>
> I used bsearch() for finding a victim node..so, you may think the code is
> complicated.
>
> TO-BE-CHECKED:
> - of course, calculation of "weight" is the heart of the logic.
> - what part is hard to read ?
> - cleverer implemenation ?
> - need more tests.
> - choose better variable names and add more comments ;)
>
> Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
> ---
> include/linux/memcontrol.h | 2 +-
> mm/memcontrol.c | 226 +++++++++++++++++++++++++++++++++++++------
> mm/vmscan.c | 2 +-
> 3 files changed, 196 insertions(+), 34 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index 9724a38..95d18b6 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -108,7 +108,7 @@ extern void mem_cgroup_end_migration(struct mem_cgroup *mem,
> */
> int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
> int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
> -int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
> +int mem_cgroup_select_victim_node(struct mem_cgroup *memcg, bool noswap);
> unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
> struct zone *zone,
> enum lru_list lru);
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 06825be..2ed3749 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -48,6 +48,8 @@
> #include <linux/page_cgroup.h>
> #include <linux/cpu.h>
> #include <linux/oom.h>
> +#include <linux/random.h>
> +#include <linux/bsearch.h>
> #include "internal.h"
>
> #include <asm/uaccess.h>
> @@ -141,10 +143,22 @@ struct mem_cgroup_per_zone {
>
> struct mem_cgroup_per_node {
> struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
> + unsigned long weight;
> +};
> +
> +struct node_schedinfo {
> + unsigned long weight_start, weight_end;
> + unsigned long weight_noswap_start, weight_noswap_end;
> + int nid;
> };
>
> struct mem_cgroup_lru_info {
> struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
> + struct node_schedinfo *schedinfo;
> + struct rw_semaphore updating;
> + int nr_nodes;
> + unsigned long total_weight;
> + unsigned long total_weight_noswap;
> };
>
> /*
> @@ -1559,36 +1573,157 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
> }
>
> #if MAX_NUMNODES > 1
> +/*
> + * Weight for node scanning calculation.
> + * Basically, node includes inactive pages are guilty. But if we never scan
> + * nodes only with active pages, LRU rotation on the node never occurs.
> + * So, we need to take into account active pages to some extent.
> + */
> +static void mem_cgroup_node_sched_weight(struct mem_cgroup *mem,
> + int nid, unsigned long usage,
> + int file_active_ratio, int anon_active_ratio,
> + unsigned long *weight, unsigned long *weight_noswap)
> +{
> + unsigned long file, anon;
> +
> +
> + file = mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE);
> + /* reduce weight of node active pages with regard to the total ratio */
> + file = file * file_active_ratio / 100;
> + file += mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE);
> +
> +
> + anon = mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON);
> + /* reduce weight of node active pages with regard to the total ratio */
> + anon = anon * anon_active_ratio / 100;
> + anon += mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON);
> +
> + *weight_noswap = file;
> + *weight = file * (200 - mem->swappiness) + anon * (mem->swappiness + 1);
> + *weight /= 200;
> +
> + return;
> +}
> +
> +/*
> + * calculate a ratio for active/inactive ratio with regard to curret
> + * total inactive/active ratio.
> + */
> +static unsigned int
> +mem_cgroup_active_ratio(struct mem_cgroup *mem, enum lru_list l)
> +{
> + unsigned long active, inactive;
> +
> + inactive = mem_cgroup_get_local_zonestat(mem, l);
> + active = mem_cgroup_get_local_zonestat(mem, l + LRU_ACTIVE);
> +
> + return (active * 100)/(active + inactive);
> +}
>
> /*
> * Always updating the nodemask is not very good - even if we have an empty
> * list or the wrong list here, we can start from some node and traverse all
> * nodes based on the zonelist. So update the list loosely once per 10 secs.
> - *
> */
> -static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
> +static void mem_cgroup_may_update_nodeweight(struct mem_cgroup *mem)
> {
> int nid;
> + int file_active_ratio, anon_active_ratio;
> + unsigned long total_weight, total_weight_noswap;
> + struct node_schedinfo *ns;
> + unsigned long usage;
>
> if (time_after(mem->next_scan_node_update, jiffies))
> return;
> -
> + down_write(&mem->info.updating);
> + /* double check for race. */
> + if (time_after(mem->next_scan_node_update, jiffies)) {
> + up_write(&mem->info.updating);
> + return;
> + }
> mem->next_scan_node_update = jiffies + 10*HZ;
> +
> /* make a nodemask where this memcg uses memory from */
> - mem->scan_nodes = node_states[N_HIGH_MEMORY];
> + total_weight = 0;
> + total_weight_noswap = 0;
>
> - for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
>
> - if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
> - mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
> - continue;
> + file_active_ratio = mem_cgroup_active_ratio(mem, LRU_INACTIVE_FILE);
> + anon_active_ratio = mem_cgroup_active_ratio(mem, LRU_INACTIVE_ANON);
> + usage = res_counter_read_u64(&mem->res, RES_USAGE) >> PAGE_SHIFT;
> +
> + /*
> + * Calculate each node's weight and put the score into array of
> + * node_scheinfo. Eech node schedinfo contains nid and
> + * [start_weight, end_weight). Later, a thread will get a ticket
> + * with a number < total_weight and find a node_schedinfo which
> + * has start_weight <= ticket number < end_weight.
> + *
> + * If ticket number is enough random, we can do proportional fair
> + * victim selection with regard to "weight" of each nodes.
> + *
> + * Why do we need to use "schedinfo" rather than adding member to
> + * per-node structure ? There are 2 reasons. 1) node array
> + * can have holes and not good for bsearch() 2) We can skip nodes
> + * which has no memory. Using other structure like prio-tree is a
> + * possible idea, of course.
> + */
> + ns = &mem->info.schedinfo[0];
> +
> + for_each_node_mask (nid, node_states[N_HIGH_MEMORY]) {
> + unsigned long weight, weight_noswap;
>
> - if (total_swap_pages &&
> - (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
> - mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
> + mem_cgroup_node_sched_weight(mem, nid, usage,
> + file_active_ratio,
> + anon_active_ratio,
> + &weight, &weight_noswap);
> +
> + if (weight == 0)
> continue;
> - node_clear(nid, mem->scan_nodes);
> + ns->nid = nid;
> + /* For noswap scan, we take care of file caches */
> + ns->weight_noswap_start = total_weight_noswap;
> + ns->weight_noswap_end = ns->weight_noswap_start + weight_noswap;
> + total_weight_noswap += weight_noswap;
> +
> + ns->weight_start = total_weight;
> + ns->weight_end = ns->weight_start + weight;
> + total_weight += weight;
> + ns++;
> + /* for numa stat */
> + mem->info.nodeinfo[nid]->weight = weight;
> }
> +
> + mem->info.total_weight = total_weight;
> + mem->info.total_weight_noswap = total_weight_noswap;
> + mem->info.nr_nodes = ns - mem->info.schedinfo;
> +
> + up_write(&mem->info.updating);
> +}
> +
> +/* routine for finding lottery winner by bsearch. */
> +static int cmp_node_lottery_weight(const void *key, const void *elt)
> +{
> + struct node_schedinfo *ns = (struct node_schedinfo *)elt;
> + unsigned long lottery = (unsigned long)key;
> +
> + if (lottery < ns->weight_start)
> + return -1;
> + if (lottery >= ns->weight_end)
> + return 1;
> + return 0;
> +}
> +
> +static int cmp_node_lottery_weight_noswap(const void *key, const void *elt)
> +{
> + struct node_schedinfo *ns = (struct node_schedinfo *)elt;
> + unsigned long lottery = (unsigned long)key;
> +
> + if (lottery < ns->weight_noswap_start)
> + return -1;
> + if (lottery >= ns->weight_noswap_end)
> + return 1;
> + return 0;
> }
>
> /*
> @@ -1601,29 +1736,42 @@ static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
> * hit limits, it will see a contention on a node. But freeing from remote
> * node means more costs for memory reclaim because of memory latency.
> *
> - * Now, we use round-robin. Better algorithm is welcomed.
> + * This selects a node by an lottery scheduling algorithm with regard to
> + * each node's weight under the whole memcg usage. By this we'll scan
> + * nodes in a way proportional fair to the number of reclaim candidate
> + * pages per node.
> */
> -int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
> +int mem_cgroup_select_victim_node(struct mem_cgroup *mem, bool noswap)
> {
> - int node;
> -
> - mem_cgroup_may_update_nodemask(mem);
> - node = mem->last_scanned_node;
> -
> - node = next_node(node, mem->scan_nodes);
> - if (node == MAX_NUMNODES)
> - node = first_node(mem->scan_nodes);
> - /*
> - * We call this when we hit limit, not when pages are added to LRU.
> - * No LRU may hold pages because all pages are UNEVICTABLE or
> - * memcg is too small and all pages are not on LRU. In that case,
> - * we use curret node.
> - */
> - if (unlikely(node == MAX_NUMNODES))
> - node = numa_node_id();
> + struct node_schedinfo *ns;
> + unsigned long lottery;
> + int nid;
>
> - mem->last_scanned_node = node;
> - return node;
> + mem_cgroup_may_update_nodeweight(mem);
> + /* use pseudo nid while updating schedule information. */
> + if (!down_read_trylock(&mem->info.updating))
> + return numa_node_id();
> +
> + if (!mem->info.nr_nodes) {
> + /* if all nodes has very small memory for each... */
> + ns = NULL;
> + } else if (noswap) {
> + lottery = random32() % mem->info.total_weight_noswap;
> + ns = bsearch((void*)lottery, mem->info.schedinfo,
> + mem->info.nr_nodes,
> + sizeof(*ns), cmp_node_lottery_weight_noswap);
> + } else {
> + lottery = random32() % mem->info.total_weight;
> + ns = bsearch((void*)lottery, mem->info.schedinfo,
> + mem->info.nr_nodes,
> + sizeof(*ns), cmp_node_lottery_weight);
> + }
> + if (ns)
> + nid = ns->nid;
> + else
> + nid = numa_node_id();
> + up_read(&mem->info.updating);
> + return nid;
> }
>
> #else
> @@ -4135,6 +4283,11 @@ static int mem_control_numa_stat_show(struct seq_file *m, void *arg)
> seq_printf(m, " N%d=%lu", nid, node_nr);
> }
> seq_putc(m, '\n');
> + seq_printf(m, "lru_scan_weight=%lu", mem_cont->info.total_weight);
> + for_each_node_state(nid, N_HIGH_MEMORY)
> + seq_printf(m, " N%d=%lu", nid,
> + mem_cont->info.nodeinfo[nid]->weight);
> + seq_putc(m, '\n');
> return 0;
> }
> #endif /* CONFIG_NUMA */
> @@ -4789,6 +4942,8 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
> for_each_node_state(node, N_POSSIBLE)
> free_mem_cgroup_per_zone_info(mem, node);
>
> + kfree(mem->info.schedinfo);
> +
> free_percpu(mem->stat);
> if (sizeof(struct mem_cgroup) < PAGE_SIZE)
> kfree(mem);
> @@ -4878,6 +5033,13 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
> if (alloc_mem_cgroup_per_zone_info(mem, node))
> goto free_out;
>
> + node = nodes_weight(node_states[N_POSSIBLE]);
> + mem->info.schedinfo =
> + kzalloc(sizeof(struct node_schedinfo) * node, GFP_KERNEL);
> + if (!mem->info.schedinfo)
> + goto free_out;
> + init_rwsem(&mem->info.updating);
> +
> /* root ? */
> if (cont->parent == NULL) {
> int cpu;
> diff --git a/mm/vmscan.c b/mm/vmscan.c
> index faa0a08..dd1823b 100644
> --- a/mm/vmscan.c
> +++ b/mm/vmscan.c
> @@ -2254,7 +2254,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
> * take care of from where we get pages. So the node where we start the
> * scan does not need to be the current node.
> */
> - nid = mem_cgroup_select_victim_node(mem_cont);
> + nid = mem_cgroup_select_victim_node(mem_cont, noswap);
>
> zonelist = NODE_DATA(nid)->node_zonelists;
>
> --
> 1.7.4.1
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/