[PATCH v5 5/6] memcg: vmscan select victim node by weight

From: KAMEZAWA Hiroyuki
Date: Tue Aug 09 2011 - 06:19:26 EST



This patch implements a node selection logic based on each node's weight.

This patch adds a new array, nodescan_tickets[]. This array holds
each node's scan weight as a tuple of 2 values, built as follows:

for (i = 0, total_weight = 0; i < nodes; i++) {
	weight = node->weight;
	nodescan_tickets[i].start = total_weight;
	nodescan_tickets[i].length = weight;
	total_weight += weight;
}

After this, a lottery draw, 'ticket = random32() % total_weight',
produces a ticket, and bsearch(ticket, nodescan_tickets[])
finds the node whose range [start, start + length) contains the ticket.
(This is lottery scheduling.)

By this, nodes will be selected in a fair manner, proportional to
their weights.

This patch improves the scan time. The following is a test result
of a kernel make on 4-node fake-numa under a 500M limit, with 8 cpus,
2 cpus per node.

[Before patch]
772.52user 305.67system 4:11.48elapsed 428%CPU
(0avgtext+0avgdata 1457264maxresident)k
4797592inputs+5483240outputs (12550major+35707629minor)pagefaults 0swaps

[After patch]
773.73user 305.09system 3:51.28elapsed 466%CPU
(0avgtext+0avgdata 1458464maxresident)k
4400264inputs+4797056outputs (5578major+35690202minor)pagefaults 0swaps

elapsed time and major faults are reduced.

Here, vmscan_stat shows

[Before patch]
scanned_pages_by_limit 3926782
scanned_anon_pages_by_limit 1511090
scanned_file_pages_by_limit 2415692
elapsed_ns_by_limit 69528714562

[After patch]
scanned_pages_by_limit 4326462
scanned_anon_pages_by_limit 1310619
scanned_file_pages_by_limit 3015843
elapsed_ns_by_limit 42495200307

This patch helps to scan file caches rather than anon pages,
and elapsed time is much reduced.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@xxxxxxxxxxxxxx>
---
include/linux/memcontrol.h | 3
mm/memcontrol.c | 150 ++++++++++++++++++++++++++++++++++++++-------
mm/vmscan.c | 4 -
3 files changed, 131 insertions(+), 26 deletions(-)

Index: mmotm-Aug3/mm/memcontrol.c
===================================================================
--- mmotm-Aug3.orig/mm/memcontrol.c
+++ mmotm-Aug3/mm/memcontrol.c
@@ -49,6 +49,9 @@
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
+#include <linux/random.h>
+#include <linux/bsearch.h>
+#include <linux/cpuset.h>
#include "internal.h"

#include <asm/uaccess.h>
@@ -151,6 +154,11 @@ struct mem_cgroup_lru_info {
struct mem_cgroup_per_node *nodeinfo[MAX_NUMNODES];
};

+struct numascan_ticket {
+ int nid; /* node that owns this ticket range */
+ unsigned int start, tickets; /* first ticket of the range, and how many tickets it holds */
+};
+
/*
* Cgroups above their limits are maintained in a RB-Tree, independent of
* their hierarchy representation
@@ -287,7 +295,10 @@ struct mem_cgroup {
atomic_t numainfo_events;
atomic_t numainfo_updating;
struct work_struct numainfo_update_work;
- unsigned long total_weight;
+ unsigned long total_weight;
+ int numascan_generation;
+ int numascan_tickets_num[2];
+ struct numascan_ticket *numascan_tickets[2];
#endif
/*
* Should the accounting and control be hierarchical, per subtree?
@@ -1660,6 +1671,46 @@ mem_cgroup_calc_numascan_weight(struct m
}

/*
+ * For lottery scheduling, this routine distributes "tickets" for
+ * scanning to each node. Tickets are recorded in the numascan_ticket
+ * array, and this array is used for scheduling later.
+ * To make the lottery fair, we limit the sum of tickets to almost 0xffff.
+ * Later, random() & 0xffff will do a proportionally fair lottery.
+ */
+#define NUMA_TICKET_SHIFT (16)
+#define NUMA_TICKET_FACTOR ((1 << NUMA_TICKET_SHIFT) - 1)
+static void mem_cgroup_update_numascan_tickets(struct mem_cgroup *memcg)
+{
+ struct numascan_ticket *nt;
+ unsigned int node_ticket, assigned_tickets;
+ u64 weight;
+ int nid, assigned_num, generation;
+
+ /* Double buffering: fill the buffer that is not the currently published generation. */
+ generation = memcg->numascan_generation ^ 0x1;
+
+ nt = memcg->numascan_tickets[generation];
+ assigned_tickets = 0;
+ assigned_num = 0;
+ for_each_node_mask(nid, memcg->scan_nodes) {
+ weight = memcg->info.nodeinfo[nid]->weight;
+ node_ticket = div64_u64(weight << NUMA_TICKET_SHIFT,
+ memcg->total_weight + 1); /* scale the node's share of total_weight to 16 bits; +1 keeps the divisor non-zero */
+ if (!node_ticket) /* every node in scan_nodes gets at least one ticket */
+ node_ticket = 1;
+ nt->nid = nid;
+ nt->start = assigned_tickets; /* ranges are laid out back to back, in ascending order */
+ nt->tickets = node_ticket;
+ assigned_tickets += node_ticket;
+ nt++;
+ assigned_num++;
+ }
+ memcg->numascan_tickets_num[generation] = assigned_num;
+ smp_wmb(); /* order the buffer and count writes before the generation flip below */
+ memcg->numascan_generation = generation;
+}
+
+/*
* Update all node's scan weight in background.
*/
static void mem_cgroup_numainfo_update_work(struct work_struct *work)
@@ -1672,6 +1723,8 @@ static void mem_cgroup_numainfo_update_w

memcg->total_weight = mem_cgroup_calc_numascan_weight(memcg);

+ synchronize_rcu();
+ mem_cgroup_update_numascan_tickets(memcg);
atomic_set(&memcg->numainfo_updating, 0);
css_put(&memcg->css);
}
@@ -1698,6 +1751,18 @@ static void mem_cgroup_may_update_nodema
schedule_work(&mem->numainfo_update_work);
}

+static int node_weight_compare(const void *key, const void *elt)
+{
+ unsigned long lottery = (unsigned long)key; /* the ticket value is carried in the key pointer itself */
+ struct numascan_ticket *nt = (struct numascan_ticket *)elt;
+
+ if (lottery < nt->start) /* ticket lies before this node's range */
+ return -1;
+ if (lottery >= (nt->start + nt->tickets)) /* range is half-open: [start, start + tickets) */
+ return 1;
+ return 0;
+}
+
/*
* Selecting a node where we start reclaim from. Because what we need is just
* reducing usage counter, start from anywhere is O,K. Considering
@@ -1707,32 +1772,38 @@ static void mem_cgroup_may_update_nodema
* we'll use or we've used. So, it may make LRU bad. And if several threads
* hit limits, it will see a contention on a node. But freeing from remote
* node means more costs for memory reclaim because of memory latency.
- *
- * Now, we use round-robin. Better algorithm is welcomed.
*/
-int mem_cgroup_select_victim_node(struct mem_cgroup *mem, nodemask_t **mask)
+int mem_cgroup_select_victim_node(struct mem_cgroup *memcg, nodemask_t **mask,
+ struct memcg_scanrecord *rec)
{
- int node;
+ int node = MAX_NUMNODES;
+ struct numascan_ticket *nt;
+ unsigned long lottery;
+ int generation;

+ if (rec->context == SCAN_BY_SHRINK) /* no lottery for shrink: falls back to the local node below */
+ goto out;
+
+ mem_cgroup_may_update_nodemask(memcg);
*mask = NULL;
- mem_cgroup_may_update_nodemask(mem);
- node = mem->last_scanned_node;
+ lottery = random32() & NUMA_TICKET_FACTOR; /* draw a ticket in [0, 0xffff] */

- node = next_node(node, mem->scan_nodes);
- if (node == MAX_NUMNODES)
- node = first_node(mem->scan_nodes);
- /*
- * We call this when we hit limit, not when pages are added to LRU.
- * No LRU may hold pages because all pages are UNEVICTABLE or
- * memcg is too small and all pages are not on LRU. In that case,
- * we use curret node.
- */
- if (unlikely(node == MAX_NUMNODES))
+ rcu_read_lock();
+ generation = memcg->numascan_generation;
+ nt = bsearch((void *)lottery,
+ memcg->numascan_tickets[generation],
+ memcg->numascan_tickets_num[generation],
+ sizeof(struct numascan_ticket), node_weight_compare);
+ if (nt) /* read the entry while still inside the RCU read-side section */
+ node = nt->nid;
+ rcu_read_unlock();
+out:
+ if (unlikely(node == MAX_NUMNODES)) { /* lottery skipped, or no range held the ticket */
node = numa_node_id();
- else
- *mask = &mem->scan_nodes;
+ *mask = NULL;
+ } else
+ *mask = &memcg->scan_nodes;

- mem->last_scanned_node = node;
return node;
}

@@ -1771,14 +1842,42 @@ bool mem_cgroup_reclaimable(struct mem_c
return false;
}

-static void mem_cgroup_numascan_init(struct mem_cgroup *memcg)
+static bool mem_cgroup_numascan_init(struct mem_cgroup *memcg)
{
+ struct numascan_ticket *nt;
+ int nr_nodes;
+
INIT_WORK(&memcg->numainfo_update_work,
mem_cgroup_numainfo_update_work);
+
+ nr_nodes = num_possible_nodes(); /* each buffer gets one slot per possible node */
+ nt = kmalloc(sizeof(struct numascan_ticket) * nr_nodes,
+ GFP_KERNEL); /* ticket buffer for generation 0 */
+ if (!nt)
+ return false;
+ memcg->numascan_tickets[0] = nt;
+ nt = kmalloc(sizeof(struct numascan_ticket) * nr_nodes,
+ GFP_KERNEL); /* ticket buffer for generation 1 */
+ if (!nt) {
+ kfree(memcg->numascan_tickets[0]);
+ memcg->numascan_tickets[0] = NULL; /* the caller's error path ends in mem_cgroup_numascan_free(); avoid a second kfree */
+ return false;
+ }
+ memcg->numascan_tickets[1] = nt;
+ memcg->numascan_tickets_num[0] = 0; /* both buffers start empty until the first ticket update */
+ memcg->numascan_tickets_num[1] = 0;
+ return true; /* false above means an allocation failed */
+}
+
+static void mem_cgroup_numascan_free(struct mem_cgroup *memcg)
+{
+ kfree(memcg->numascan_tickets[0]); /* release both ticket buffers allocated by mem_cgroup_numascan_init() */
+ kfree(memcg->numascan_tickets[1]);
}

#else
-int mem_cgroup_select_victim_node(struct mem_cgroup *mem, nodemask_t **mask)
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem, nodemask_t **mask,
+ struct memcg_scanrecord *rec)
{
*mask = NULL;
return 0;
@@ -1791,6 +1890,10 @@ bool mem_cgroup_reclaimable(struct mem_c
-static void mem_cgroup_numascan_init(struct mem_cgroup *memcg)
+static bool mem_cgroup_numascan_init(struct mem_cgroup *memcg)
{
+ return true; /* nothing to allocate here; the caller treats false as failure */
}
+static void mem_cgroup_numascan_free(struct mem_cgroup *memcg)
+{
+}
#endif

static void __mem_cgroup_record_scanstat(unsigned long *stats,
@@ -5080,6 +5182,7 @@ static void __mem_cgroup_free(struct mem
int node;

mem_cgroup_remove_from_trees(mem);
+ mem_cgroup_numascan_free(mem);
free_css_id(&mem_cgroup_subsys, &mem->css);

for_each_node_state(node, N_POSSIBLE)
@@ -5218,7 +5321,8 @@ mem_cgroup_create(struct cgroup_subsys *
mem->move_charge_at_immigrate = 0;
mutex_init(&mem->thresholds_lock);
spin_lock_init(&mem->scanstat.lock);
- mem_cgroup_numascan_init(mem);
+ if (!mem_cgroup_numascan_init(mem))
+ goto free_out;
return &mem->css;
free_out:
__mem_cgroup_free(mem);
Index: mmotm-Aug3/mm/vmscan.c
===================================================================
--- mmotm-Aug3.orig/mm/vmscan.c
+++ mmotm-Aug3/mm/vmscan.c
@@ -2378,9 +2378,9 @@ unsigned long try_to_free_mem_cgroup_pag
* take care of from where we get pages. So the node where we start the
* scan does not need to be the current node.
*/
- nid = mem_cgroup_select_victim_node(mem_cont, &sc.nodemask);
+ nid = mem_cgroup_select_victim_node(mem_cont, &sc.nodemask, rec);

- zonelist = NODE_DATA(nid)->node_zonelists;
+ zonelist = &NODE_DATA(nid)->node_zonelists[0];

trace_mm_vmscan_memcg_reclaim_begin(0,
sc.may_writepage,
Index: mmotm-Aug3/include/linux/memcontrol.h
===================================================================
--- mmotm-Aug3.orig/include/linux/memcontrol.h
+++ mmotm-Aug3/include/linux/memcontrol.h
@@ -118,7 +118,8 @@ extern void mem_cgroup_end_migration(str
*/
int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
-int mem_cgroup_select_victim_node(struct mem_cgroup *memcg, nodemask_t **mask);
+int mem_cgroup_select_victim_node(struct mem_cgroup *memcg, nodemask_t **mask,
+ struct memcg_scanrecord *rec);
unsigned long mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg,
int nid, int zid, unsigned int lrumask);
struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/