[PATCH 2/2] memcg: make memory.swappiness file. take2

From: KOSAKI Motohiro
Date: Tue Dec 02 2008 - 02:45:28 EST


Currently, /proc/sys/vm/swappiness can change swappiness ratio for global reclaim.
However, memcg reclaim doesn't have tuning parameter for itself.

In general, the optimal swappiness depend on workload.
(e.g. hpc workload need to low swappiness than the others.)

Then, per cgroup swappiness improve administrator tunability.



Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@xxxxxxxxxxxxxx>
---
Documentation/controllers/memory.txt | 6 ++
include/linux/swap.h | 3 -
mm/memcontrol.c | 72 ++++++++++++++++++++++++++++++++---
mm/vmscan.c | 7 +--
4 files changed, 78 insertions(+), 10 deletions(-)

Index: b/mm/memcontrol.c
===================================================================
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -161,6 +161,9 @@ struct mem_cgroup {
unsigned long last_oom_jiffies;
int obsolete;
atomic_t refcnt;
+
+ unsigned int swappiness;
+
/*
* statistics. This must be placed at the end of memcg.
*/
@@ -586,6 +589,22 @@ done:
return ret;
}

+static unsigned int get_swappiness(struct mem_cgroup *memcg)
+{
+ struct cgroup *cgrp = memcg->css.cgroup;
+ unsigned int swappiness;
+
+ /* root ? */
+ if (cgrp->parent == NULL)
+ return vm_swappiness;
+
+ spin_lock(&memcg->reclaim_param_lock);
+ swappiness = memcg->swappiness;
+ spin_unlock(&memcg->reclaim_param_lock);
+
+ return swappiness;
+}
+
/*
* Dance down the hierarchy if needed to reclaim memory. We remember the
* last child we reclaimed from, so that we don't end up penalizing
@@ -606,7 +625,8 @@ static int mem_cgroup_hierarchical_recla
* but there might be left over accounting, even after children
* have left.
*/
- ret = try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap);
+ ret = try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap,
+ get_swappiness(root_mem));
if (res_counter_check_under_limit(&root_mem->res))
return 0;

@@ -620,7 +640,8 @@ static int mem_cgroup_hierarchical_recla
cgroup_unlock();
continue;
}
- ret = try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap);
+ ret = try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap,
+ get_swappiness(next_mem));
if (res_counter_check_under_limit(&root_mem->res))
return 0;
cgroup_lock();
@@ -1348,7 +1369,8 @@ int mem_cgroup_shrink_usage(struct mm_st
rcu_read_unlock();

do {
- progress = try_to_free_mem_cgroup_pages(mem, gfp_mask, true);
+ progress = try_to_free_mem_cgroup_pages(mem, gfp_mask, true,
+ get_swappiness(mem));
progress += res_counter_check_under_limit(&mem->res);
} while (!progress && --retry);

@@ -1393,7 +1415,9 @@ static int mem_cgroup_resize_limit(struc
break;

progress = try_to_free_mem_cgroup_pages(memcg,
- GFP_HIGHUSER_MOVABLE, false);
+ GFP_HIGHUSER_MOVABLE,
+ false,
+ get_swappiness(memcg));
if (!progress) retry_count--;
}
return ret;
@@ -1433,7 +1457,8 @@ int mem_cgroup_resize_memsw_limit(struct
break;

oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
- try_to_free_mem_cgroup_pages(memcg, GFP_HIGHUSER_MOVABLE, true);
+ try_to_free_mem_cgroup_pages(memcg, GFP_HIGHUSER_MOVABLE, true,
+ get_swappiness(memcg));
curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
if (curusage >= oldusage)
retry_count--;
@@ -1565,7 +1590,9 @@ try_to_free:
goto out;
}
progress = try_to_free_mem_cgroup_pages(mem,
- GFP_HIGHUSER_MOVABLE, false);
+ GFP_HIGHUSER_MOVABLE,
+ false,
+ get_swappiness(mem));
if (!progress) {
nr_retries--;
/* maybe some writeback is necessary */
@@ -1755,6 +1782,31 @@ static int mem_control_stat_show(struct
return 0;
}

+static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+
+ return get_swappiness(memcg);
+}
+
+static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
+ u64 val)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+
+ if (val > 100)
+ return -EINVAL;
+
+ if (cgrp->parent == NULL)
+ return -EBUSY;
+
+ spin_lock(&memcg->reclaim_param_lock);
+ memcg->swappiness = val;
+ spin_unlock(&memcg->reclaim_param_lock);
+
+ return 0;
+}
+

static struct cftype mem_cgroup_files[] = {
{
@@ -1793,6 +1845,11 @@ static struct cftype mem_cgroup_files[]
.write_u64 = mem_cgroup_hierarchy_write,
.read_u64 = mem_cgroup_hierarchy_read,
},
+ {
+ .name = "swappiness",
+ .read_u64 = mem_cgroup_swappiness_read,
+ .write_u64 = mem_cgroup_swappiness_write,
+ },
};

#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
@@ -1984,6 +2041,9 @@ mem_cgroup_create(struct cgroup_subsys *
mem->last_scanned_child = NULL;
spin_lock_init(&mem->reclaim_param_lock);

+ if (parent)
+ mem->swappiness = get_swappiness(parent);
+
return &mem->css;
free_out:
for_each_node_state(node, N_POSSIBLE)
Index: b/mm/vmscan.c
===================================================================
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1716,14 +1716,15 @@ unsigned long try_to_free_pages(struct z
#ifdef CONFIG_CGROUP_MEM_RES_CTLR

unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
- gfp_t gfp_mask,
- bool noswap)
+ gfp_t gfp_mask,
+ bool noswap,
+ unsigned int swappiness)
{
struct scan_control sc = {
.may_writepage = !laptop_mode,
.may_swap = 1,
.swap_cluster_max = SWAP_CLUSTER_MAX,
- .swappiness = vm_swappiness,
+ .swappiness = swappiness,
.order = 0,
.mem_cgroup = mem_cont,
.isolate_pages = mem_cgroup_isolate_pages,
Index: b/include/linux/swap.h
===================================================================
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -214,7 +214,8 @@ static inline void lru_cache_add_active_
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
gfp_t gfp_mask);
extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
- gfp_t gfp_mask, bool noswap);
+ gfp_t gfp_mask, bool noswap,
+ unsigned int swappiness);
extern int __isolate_lru_page(struct page *page, int mode, int file);
extern unsigned long shrink_all_memory(unsigned long nr_pages);
extern int vm_swappiness;
Index: b/Documentation/controllers/memory.txt
===================================================================
--- a/Documentation/controllers/memory.txt
+++ b/Documentation/controllers/memory.txt
@@ -289,6 +289,12 @@ will be charged as a new owner of it.
Because rmdir() moves all pages to parent, some out-of-use page caches can be
moved to the parent. If you want to avoid that, force_empty will be useful.

+5.2 swappiness
+ Similar to /proc/sys/vm/swappiness, but affecting one group only.
+
+ The root cgroup can't be changed this parameter.
+ it always use /proc/sys/vm/swappiness internally.
+
6. Hierarchy support

The memory controller supports a deep hierarchy and hierarchical accounting.


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/