[PATCH 2/6] memcg: dirty-set limiting and filtered writeback

From: Konstantin Khlebnikov
Date: Thu Jan 15 2015 - 13:56:46 EST


From: Konstantin Khlebnikov <khlebnikov@xxxxxxxxxxxxxx>

mem_cgroup_dirty_limits() checks the memcg thresholds and schedules per-bdi
writeback work (with ->for_memcg set). Such work writes only inodes whose
owner memcg, or the whole bdi, has exceeded its dirty limit.

Interface: memory.dirty_ratio is the percentage of the memory limit used as
the dirty threshold (0 = unlimited, default 50). The background threshold is
half of that. The fs_dirty_threshold line in memory.stat shows the current
threshold.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@xxxxxxxxxxxxxx>
---
fs/fs-writeback.c | 18 ++++-
include/linux/backing-dev.h | 1
include/linux/memcontrol.h | 6 ++
include/linux/writeback.h | 1
include/trace/events/writeback.h | 1
mm/memcontrol.c | 145 ++++++++++++++++++++++++++++++++++++++
mm/page-writeback.c | 25 ++++++-
7 files changed, 190 insertions(+), 7 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 2d609a5..9034768 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -20,6 +20,7 @@
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
+#include <linux/memcontrol.h>
#include <linux/pagemap.h>
#include <linux/kthread.h>
#include <linux/writeback.h>
@@ -47,6 +48,7 @@ struct wb_writeback_work {
unsigned int range_cyclic:1;
unsigned int for_background:1;
unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */
+ unsigned int for_memcg:1;
enum wb_reason reason; /* why was writeback initiated? */

struct list_head list; /* pending work list */
@@ -137,6 +139,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
work->nr_pages = nr_pages;
work->range_cyclic = range_cyclic;
work->reason = reason;
+ work->for_memcg = reason == WB_REASON_FOR_MEMCG;

bdi_queue_work(bdi, work);
}
@@ -258,15 +261,17 @@ static int move_expired_inodes(struct list_head *delaying_queue,
LIST_HEAD(tmp);
struct list_head *pos, *node;
struct super_block *sb = NULL;
- struct inode *inode;
+ struct inode *inode, *next;
int do_sb_sort = 0;
int moved = 0;

- while (!list_empty(delaying_queue)) {
- inode = wb_inode(delaying_queue->prev);
+ /* Walk tail-first like the old ->prev loop, so the break below still stops at the first non-expired inode. */
+ list_for_each_entry_safe_reverse(inode, next, delaying_queue, i_wb_list) {
if (work->older_than_this &&
inode_dirtied_after(inode, *work->older_than_this))
break;
+ if (work->for_memcg && !mem_cgroup_dirty_exceeded(inode))
+ continue;
list_move(&inode->i_wb_list, &tmp);
moved++;
if (sb_is_blkdev_sb(inode->i_sb))
@@ -650,6 +654,11 @@ static long writeback_sb_inodes(struct super_block *sb,
break;
}

+ if (work->for_memcg && !mem_cgroup_dirty_exceeded(inode)) {
+ redirty_tail(inode, wb);
+ continue;
+ }
+
/*
* Don't bother with new inodes or inodes being freed, first
* kind does not need periodic writeout yet, and for the latter
@@ -1014,6 +1023,9 @@ static long wb_do_writeback(struct bdi_writeback *wb)

wrote += wb_writeback(wb, work);

+ if (work->for_memcg)
+ clear_bit(BDI_memcg_writeback_running, &bdi->state);
+
/*
* Notify the caller of completion if this is a synchronous
* work item, otherwise just free it.
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 5da6012..91b55d8 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -32,6 +32,7 @@ enum bdi_state {
BDI_sync_congested, /* The sync queue is getting full */
BDI_registered, /* bdi_register() was done */
BDI_writeback_running, /* Writeback is in progress */
+ BDI_memcg_writeback_running,
};

typedef int (congested_fn)(void *, int);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b281333..ae05563 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -178,6 +178,9 @@ void mem_cgroup_dec_page_dirty(struct address_space *mapping);
void mem_cgroup_inc_page_writeback(struct address_space *mapping);
void mem_cgroup_dec_page_writeback(struct address_space *mapping);
void mem_cgroup_forget_mapping(struct address_space *mapping);
+bool mem_cgroup_dirty_limits(struct address_space *mapping, unsigned long *dirty,
+ unsigned long *thresh, unsigned long *bg_thresh);
+bool mem_cgroup_dirty_exceeded(struct inode *inode);

#else /* CONFIG_MEMCG */
struct mem_cgroup;
@@ -352,6 +355,9 @@ static inline void mem_cgroup_dec_page_dirty(struct address_space *mapping) {}
static inline void mem_cgroup_inc_page_writeback(struct address_space *mapping) {}
static inline void mem_cgroup_dec_page_writeback(struct address_space *mapping) {}
static inline void mem_cgroup_forget_mapping(struct address_space *mapping) {}
+static inline bool mem_cgroup_dirty_limits(struct address_space *mapping, unsigned long *dirty,
+ unsigned long *thresh, unsigned long *bg_thresh) { return false; }
+static inline bool mem_cgroup_dirty_exceeded(struct inode *inode) { return false; }

#endif /* CONFIG_MEMCG */

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 0004833..1239fa6 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -47,6 +47,7 @@ enum wb_reason {
WB_REASON_LAPTOP_TIMER,
WB_REASON_FREE_MORE_MEM,
WB_REASON_FS_FREE_SPACE,
+ WB_REASON_FOR_MEMCG,
/*
* There is no bdi forker thread any more and works are done
* by emergency worker, however, this is TPs userland visible
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index cee02d6..106a8d7 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -29,6 +29,7 @@
{WB_REASON_LAPTOP_TIMER, "laptop_timer"}, \
{WB_REASON_FREE_MORE_MEM, "free_more_memory"}, \
{WB_REASON_FS_FREE_SPACE, "fs_free_space"}, \
+ {WB_REASON_FOR_MEMCG, "for_memcg"}, \
{WB_REASON_FORKER_THREAD, "forker_thread"}

struct wb_writeback_work;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c5655f1..17d966a3b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -363,6 +363,10 @@ struct mem_cgroup {

struct percpu_counter nr_dirty;
struct percpu_counter nr_writeback;
+ unsigned long dirty_threshold;
+ unsigned long dirty_background;
+ unsigned int dirty_exceeded;
+ unsigned int dirty_ratio;

struct mem_cgroup_per_node *nodeinfo[0];
/* WARNING: nodeinfo must be the last member here */
@@ -3060,6 +3064,8 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,

static DEFINE_MUTEX(memcg_limit_mutex);

+static void mem_cgroup_update_dirty_thresh(struct mem_cgroup *memcg);
+
static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
unsigned long limit)
{
@@ -3112,6 +3118,9 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
if (!ret && enlarge)
memcg_oom_recover(memcg);

+ if (!ret)
+ mem_cgroup_update_dirty_thresh(memcg);
+
return ret;
}

@@ -3750,6 +3759,8 @@ static int memcg_stat_show(struct seq_file *m, void *v)
percpu_counter_sum_positive(&memcg->nr_dirty));
seq_printf(m, "fs_writeback %llu\n", PAGE_SIZE *
percpu_counter_sum_positive(&memcg->nr_writeback));
+ seq_printf(m, "fs_dirty_threshold %llu\n", (u64)PAGE_SIZE *
+ memcg->dirty_threshold);

#ifdef CONFIG_DEBUG_VM
{
@@ -3803,6 +3814,25 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
return 0;
}

+/*
+ * Read handler for the "dirty_ratio" control file: report the dirty_ratio
+ * value currently stored in the memcg that owns @css.
+ */
+static u64 mem_cgroup_dirty_ratio_read(struct cgroup_subsys_state *css,
+				       struct cftype *cft)
+{
+	return mem_cgroup_from_css(css)->dirty_ratio;
+}
+
+/*
+ * Write handler for the "dirty_ratio" control file.
+ *
+ * @val is the percentage of the memory limit to use as the dirty threshold;
+ * 0 makes mem_cgroup_update_dirty_thresh() drop the limit for this memcg.
+ * The new ratio is stored and the thresholds are recomputed at once.
+ *
+ * Returns 0 on success, -EINVAL if @val is not a valid percentage.
+ */
+static int mem_cgroup_dirty_ratio_write(struct cgroup_subsys_state *css,
+					struct cftype *cft, u64 val)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+	/*
+	 * dirty_ratio is an unsigned int that gets multiplied by the page
+	 * limit.  Reject out-of-range input instead of silently truncating
+	 * the u64 or overflowing the threshold computation.
+	 */
+	if (val > 100)
+		return -EINVAL;
+
+	memcg->dirty_ratio = val;
+	mem_cgroup_update_dirty_thresh(memcg);
+
+	return 0;
+}
+
static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
{
struct mem_cgroup_threshold_ary *t;
@@ -4454,6 +4484,11 @@ static struct cftype mem_cgroup_files[] = {
.write_u64 = mem_cgroup_swappiness_write,
},
{
+ .name = "dirty_ratio",
+ .read_u64 = mem_cgroup_dirty_ratio_read,
+ .write_u64 = mem_cgroup_dirty_ratio_write,
+ },
+ {
.name = "move_charge_at_immigrate",
.read_u64 = mem_cgroup_move_charge_read,
.write_u64 = mem_cgroup_move_charge_write,
@@ -4686,6 +4721,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
memcg->soft_limit = PAGE_COUNTER_MAX;
page_counter_init(&memcg->memsw, NULL);
page_counter_init(&memcg->kmem, NULL);
+ memcg->dirty_ratio = 50; /* default value for cgroups */
}

memcg->last_scanned_node = MAX_NUMNODES;
@@ -4750,6 +4786,10 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
if (parent != root_mem_cgroup)
memory_cgrp_subsys.broken_hierarchy = true;
}
+
+ memcg->dirty_ratio = parent->dirty_ratio;
+ mem_cgroup_update_dirty_thresh(memcg);
+
mutex_unlock(&memcg_create_mutex);

ret = memcg_init_kmem(memcg, &memory_cgrp_subsys);
@@ -5939,6 +5979,111 @@ void mem_cgroup_forget_mapping(struct address_space *mapping)
}
}

+/*
+ * Recompute the dirty thresholds of @memcg and refresh the "inherit"
+ * markers of the online groups in its subtree.
+ *
+ * dirty_threshold encoding, as consumed by mem_cgroup_dirty_limits():
+ *   0          - no limit here, stop looking
+ *   ULONG_MAX  - no limit here, but the parent must be checked
+ *   otherwise  - limit in pages: memory.limit * dirty_ratio / 100
+ *
+ * dirty_background is half of the threshold, or ULONG_MAX when @memcg has
+ * no limit of its own.
+ */
+static void mem_cgroup_update_dirty_thresh(struct mem_cgroup *memcg)
+{
+	struct cgroup_subsys_state *css;
+
+	if (memcg->memory.limit > totalram_pages || !memcg->dirty_ratio) {
+		/* Limit larger than RAM, or ratio 0: no own threshold. */
+		memcg->dirty_threshold = 0;
+		memcg->dirty_background = ULONG_MAX;
+	} else {
+		memcg->dirty_threshold = memcg->memory.limit *
+					 memcg->dirty_ratio / 100;
+		memcg->dirty_background = memcg->dirty_threshold / 2;
+	}
+
+	/* Propagate the "check your parent" marker to the children. */
+	rcu_read_lock();
+	css_for_each_descendant_pre(css, &memcg->css) {
+		struct mem_cgroup *child = mem_cgroup_from_css(css);
+		struct mem_cgroup *parent;
+		bool inherit;
+
+		if (!(css->flags & CSS_ONLINE))
+			continue;
+
+		/* Groups with a real threshold of their own stay untouched. */
+		if (child->dirty_threshold != 0 &&
+		    child->dirty_threshold != ULONG_MAX)
+			continue;
+
+		parent = parent_mem_cgroup(child);
+		inherit = parent && parent->use_hierarchy &&
+			  parent->dirty_threshold;
+		child->dirty_threshold = inherit ? ULONG_MAX : 0;
+	}
+	rcu_read_unlock();
+}
+
+/*
+ * Check the dirty thresholds of the current task's memcg and its ancestors.
+ *
+ * Walks from the memcg of 'current' towards the root and stops at the first
+ * group whose dirty + writeback page count is above its background
+ * threshold.  That group is flagged as dirty_exceeded and, once the count
+ * is also above the midpoint between background and threshold, a
+ * WB_REASON_FOR_MEMCG writeback is queued on @mapping's bdi unless
+ * BDI_memcg_writeback_running is already set.
+ *
+ * Returns true and fills *@pdirty, *@pthresh and *@pbg_thresh with that
+ * group's numbers; returns false, leaving the outputs untouched, when no
+ * group on the path is over its background threshold.
+ */
+bool mem_cgroup_dirty_limits(struct address_space *mapping,
+			     unsigned long *pdirty,
+			     unsigned long *pthresh,
+			     unsigned long *pbg_thresh)
+{
+	struct backing_dev_info *bdi = mapping->backing_dev_info;
+	unsigned long nr, limit, bg;
+	struct mem_cgroup *iter;
+
+	rcu_read_lock();
+	for (iter = mem_cgroup_from_task(current); iter;
+	     iter = parent_mem_cgroup(iter)) {
+		/* No limit at all: nothing above needs checking either. */
+		if (iter->dirty_threshold == 0)
+			break;
+		/* No limit here, but the parent must be checked. */
+		if (iter->dirty_threshold == ULONG_MAX)
+			continue;
+
+		nr = percpu_counter_read_positive(&iter->nr_dirty) +
+		     percpu_counter_read_positive(&iter->nr_writeback);
+		limit = iter->dirty_threshold;
+		bg = iter->dirty_background;
+		if (nr <= bg)
+			continue;
+
+		if (!iter->dirty_exceeded)
+			iter->dirty_exceeded = 1;
+		/* iter is not touched past this point. */
+		rcu_read_unlock();
+
+		if (nr > (bg + limit) / 2 &&
+		    !test_and_set_bit(BDI_memcg_writeback_running,
+				      &bdi->state))
+			bdi_start_writeback(bdi, nr - bg,
+					    WB_REASON_FOR_MEMCG);
+
+		*pdirty = nr;
+		*pthresh = limit;
+		*pbg_thresh = bg;
+		return true;
+	}
+	rcu_read_unlock();
+
+	return false;
+}
+
+/*
+ * Tell memcg-filtered writeback whether @inode should be written.
+ *
+ * Returns true when the inode's bdi has dirty_exceeded set, or when the
+ * memcg recorded in the inode's mapping (or one of its ancestors) is
+ * flagged dirty_exceeded and is still above its background threshold.
+ * A flagged group found at or below its background threshold has the flag
+ * cleared on the way.  The walk ends with "false" at the first group whose
+ * dirty_threshold is 0.
+ */
+bool mem_cgroup_dirty_exceeded(struct inode *inode)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct mem_cgroup *iter;
+	unsigned long nr;
+	bool over = false;
+
+	if (mapping->backing_dev_info->dirty_exceeded)
+		return true;
+
+	rcu_read_lock();
+	for (iter = rcu_dereference(mapping->i_memcg);
+	     iter && iter->dirty_threshold;
+	     iter = parent_mem_cgroup(iter)) {
+		if (!iter->dirty_exceeded)
+			continue;
+
+		nr = percpu_counter_read_positive(&iter->nr_dirty) +
+		     percpu_counter_read_positive(&iter->nr_writeback);
+		if (nr > iter->dirty_background) {
+			over = true;
+			break;
+		}
+		/* Dropped back under the background threshold. */
+		iter->dirty_exceeded = 0;
+	}
+	rcu_read_unlock();
+
+	return over;
+}
+
/*
* subsys_initcall() for memory controller.
*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index afaf263..325510f 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1328,6 +1328,17 @@ static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
}
}

+/*
+ * Throttling position ratio for a memcg that is over its background
+ * threshold.
+ *
+ * Returns 0 when @dirty is above @thresh; otherwise returns
+ * pos_ratio_polynom() evaluated at @dirty, with the setpoint taken from
+ * dirty_freerun_ceiling(@thresh, @bg_thresh) and @thresh as the limit.
+ */
+static unsigned long mem_cgroup_position_ratio(unsigned long dirty,
+		unsigned long thresh, unsigned long bg_thresh)
+{
+	if (dirty > thresh)
+		return 0;
+
+	return pos_ratio_polynom(dirty_freerun_ceiling(thresh, bg_thresh),
+				 dirty, thresh);
+}
+
/*
* balance_dirty_pages() must be called by processes which are generating dirty
* data. It looks at the number of dirty pages in the machine and will force
@@ -1362,6 +1373,7 @@ static void balance_dirty_pages(struct address_space *mapping,
unsigned long uninitialized_var(bdi_dirty);
unsigned long dirty;
unsigned long bg_thresh;
+ bool memcg;

/*
* Unstable writes are a feature of certain networked
@@ -1387,6 +1399,8 @@ static void balance_dirty_pages(struct address_space *mapping,
bg_thresh = background_thresh;
}

+ memcg = mem_cgroup_dirty_limits(mapping, &dirty, &thresh, &bg_thresh);
+
/*
* Throttle it only when the background writeback cannot
* catch-up. This avoids (excessively) small writeouts
@@ -1404,7 +1418,7 @@ static void balance_dirty_pages(struct address_space *mapping,
break;
}

- if (unlikely(!writeback_in_progress(bdi)))
+ if (unlikely(!writeback_in_progress(bdi) && !memcg))
bdi_start_background_writeback(bdi);

if (!strictlimit)
@@ -1421,9 +1435,12 @@ static void balance_dirty_pages(struct address_space *mapping,
start_time);

dirty_ratelimit = bdi->dirty_ratelimit;
- pos_ratio = bdi_position_ratio(bdi, dirty_thresh,
- background_thresh, nr_dirty,
- bdi_thresh, bdi_dirty);
+ if (memcg)
+ pos_ratio = mem_cgroup_position_ratio(dirty, thresh, bg_thresh);
+ else
+ pos_ratio = bdi_position_ratio(bdi, dirty_thresh,
+ background_thresh, nr_dirty,
+ bdi_thresh, bdi_dirty);
task_ratelimit = ((u64)dirty_ratelimit * pos_ratio) >>
RATELIMIT_CALC_SHIFT;
max_pause = bdi_max_pause(bdi, bdi_dirty);

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/