[RFC] [PATCH -mm 2/2] use collected memory cgroup statistics for page writeback

From: Andrea Righi
Date: Fri Sep 12 2008 - 11:10:53 EST


Use per-cgroup memory statistics to evaluate the dirty limits and the amount
of dirtyable memory, and to start background writeout via pdflush.

Also add an argument to pdflush_operation() to pass the memory cgroup that
requested the background writeout, so that pdflush can check that cgroup's
dirty limits against its own statistics. Callers that do not act on behalf of
a particular cgroup (emergency remount/sync, laptop-mode flush) pass NULL.

Signed-off-by: Andrea Righi <righi.andrea@xxxxxxxxx>
---
fs/super.c | 4 +-
fs/sync.c | 7 ++-
include/linux/writeback.h | 11 +++--
kernel/trace/trace.c | 2 +-
mm/backing-dev.c | 3 +-
mm/page-writeback.c | 115 +++++++++++++++++++++++++++-----------------
mm/pdflush.c | 10 +++-
7 files changed, 95 insertions(+), 57 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index f31ef82..33fbcaa 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -646,7 +646,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
return 0;
}

-static void do_emergency_remount(unsigned long foo)
+static void do_emergency_remount(struct mem_cgroup *unused, unsigned long foo)
{
struct super_block *sb;

@@ -674,7 +674,7 @@ static void do_emergency_remount(unsigned long foo)

void emergency_remount(void)
{
- pdflush_operation(do_emergency_remount, 0);
+ pdflush_operation(do_emergency_remount, NULL, 0);
}

/*
diff --git a/fs/sync.c b/fs/sync.c
index 2967562..aac77c3 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -42,9 +42,14 @@ asmlinkage long sys_sync(void)
return 0;
}

+static void memcg_do_sync(struct mem_cgroup *unused, unsigned long wait)
+{
+ do_sync(wait);
+}
+
void emergency_sync(void)
{
- pdflush_operation(do_sync, 0);
+ pdflush_operation(memcg_do_sync, NULL, 0);
}

/*
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 12b15c5..dd5bc8a 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -5,6 +5,7 @@
#define WRITEBACK_H

#include <linux/sched.h>
+#include <linux/memcontrol.h>
#include <linux/fs.h>

struct backing_dev_info;
@@ -106,7 +107,7 @@ extern int vm_highmem_is_dirtyable;
extern int block_dump;
extern int laptop_mode;

-extern unsigned long determine_dirtyable_memory(void);
+extern unsigned long determine_dirtyable_memory(struct mem_cgroup *mem);

extern int dirty_ratio_handler(struct ctl_table *table, int write,
struct file *filp, void __user *buffer, size_t *lenp,
@@ -117,8 +118,9 @@ struct file;
int dirty_writeback_centisecs_handler(struct ctl_table *, int, struct file *,
void __user *, size_t *, loff_t *);

-void get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
- struct backing_dev_info *bdi);
+void get_dirty_limits(struct mem_cgroup *mem, long *pbackground,
+ long *pdirty, long *pbdi_dirty,
+ struct backing_dev_info *bdi);

void page_writeback_init(void);
void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
@@ -133,7 +135,8 @@ balance_dirty_pages_ratelimited(struct address_space *mapping)
typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
void *data);

-int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
+int pdflush_operation(void (*fn)(struct mem_cgroup *, unsigned long),
+ struct mem_cgroup *mem, unsigned long arg0);
int generic_writepages(struct address_space *mapping,
struct writeback_control *wbc);
int write_cache_pages(struct address_space *mapping,
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bc6a22a..ec64004 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2877,7 +2877,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
goto out;
}

- freeable_pages = determine_dirtyable_memory();
+ freeable_pages = determine_dirtyable_memory(NULL);

/* we only allow to request 1/4 of useable memory */
if (pages_requested >
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index f2e574d..df6a01c 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -28,7 +28,8 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
long dirty_thresh;
long bdi_thresh;

- get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
+ get_dirty_limits(NULL, &background_thresh, &dirty_thresh,
+ &bdi_thresh, bdi);

#define K(x) ((x) << (PAGE_SHIFT - 10))
seq_printf(m,
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 17c6141..1a9b602 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -106,7 +106,8 @@ EXPORT_SYMBOL(laptop_mode);
/* End of sysctl-exported parameters */


-static void background_writeout(unsigned long _min_pages);
+static void background_writeout(struct mem_cgroup *mem,
+ unsigned long _min_pages);

/*
* Scale the writeback cache size proportional to the relative writeout speeds.
@@ -136,7 +137,9 @@ static int calc_period_shift(void)
{
unsigned long dirty_total;

- dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100;
+ dirty_total = (mem_cgroup_dirty_ratio(NULL)
+ * determine_dirtyable_memory(NULL))
+ / 100;
return 2 + ilog2(dirty_total - 1);
}

@@ -147,9 +150,9 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
struct file *filp, void __user *buffer, size_t *lenp,
loff_t *ppos)
{
- int old_ratio = vm_dirty_ratio;
+ int old_ratio = mem_cgroup_dirty_ratio(NULL);
int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
- if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
+ if (ret == 0 && write && mem_cgroup_dirty_ratio(NULL) != old_ratio) {
int shift = calc_period_shift();
prop_change_shift(&vm_completions, shift);
prop_change_shift(&vm_dirties, shift);
@@ -350,30 +353,35 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
* Returns the numebr of pages that can currently be freed and used
* by the kernel for direct mappings.
*/
-unsigned long determine_dirtyable_memory(void)
+unsigned long determine_dirtyable_memory(struct mem_cgroup *memcg)
{
- unsigned long x;
+ unsigned long mem_memory, memcg_memory;

- x = global_page_state(NR_FREE_PAGES) + global_lru_pages();
+ memcg_memory = mem_cgroup_get_free_pages(memcg) +
+ mem_cgroup_global_lru_pages(memcg);
+ mem_memory = global_page_state(NR_FREE_PAGES) + global_lru_pages();
+ if (memcg_memory && (memcg_memory < mem_memory))
+ return memcg_memory;

if (!vm_highmem_is_dirtyable)
- x -= highmem_dirtyable_memory(x);
+ mem_memory -= highmem_dirtyable_memory(mem_memory);

- return x + 1; /* Ensure that we never return 0 */
+ return mem_memory + 1; /* Ensure that we never return 0 */
}

void
-get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
+get_dirty_limits(struct mem_cgroup *mem, long *pbackground,
+ long *pdirty, long *pbdi_dirty,
struct backing_dev_info *bdi)
{
int background_ratio; /* Percentages */
int dirty_ratio;
long background;
long dirty;
- unsigned long available_memory = determine_dirtyable_memory();
+ unsigned long available_memory = determine_dirtyable_memory(mem);
struct task_struct *tsk;

- dirty_ratio = vm_dirty_ratio;
+ dirty_ratio = mem_cgroup_dirty_ratio(mem);
if (dirty_ratio < 5)
dirty_ratio = 5;

@@ -383,10 +391,12 @@ get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,

background = (background_ratio * available_memory) / 100;
dirty = (dirty_ratio * available_memory) / 100;
- tsk = current;
- if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
- background += background / 4;
- dirty += dirty / 4;
+ if (mem == NULL) {
+ tsk = current;
+ if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
+ background += background / 4;
+ dirty += dirty / 4;
+ }
}
*pbackground = background;
*pdirty = dirty;
@@ -409,16 +419,17 @@ get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,

*pbdi_dirty = bdi_dirty;
clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
- task_dirty_limit(current, pbdi_dirty);
+ if (mem == NULL)
+ task_dirty_limit(current, pbdi_dirty);
}
}

/*
* balance_dirty_pages() must be called by processes which are generating dirty
* data. It looks at the number of dirty pages in the machine and will force
- * the caller to perform writeback if the system is over `vm_dirty_ratio'.
- * If we're over `background_thresh' then pdflush is woken to perform some
- * writeout.
+ * the caller to perform writeback if the system is over
+ * `mem_cgroup_dirty_ratio()'. If we're over `background_thresh' then pdflush
+ * is woken to perform some writeout.
*/
static void balance_dirty_pages(struct address_space *mapping)
{
@@ -441,12 +452,11 @@ static void balance_dirty_pages(struct address_space *mapping)
.range_cyclic = 1,
};

- get_dirty_limits(&background_thresh, &dirty_thresh,
+ get_dirty_limits(NULL, &background_thresh, &dirty_thresh,
&bdi_thresh, bdi);

- nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS);
- nr_writeback = global_page_state(NR_WRITEBACK);
+ nr_reclaimable = mem_cgroup_nr_file_dirty(NULL);
+ nr_writeback = mem_cgroup_nr_writeback(NULL);

bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
@@ -475,8 +485,9 @@ static void balance_dirty_pages(struct address_space *mapping)
if (bdi_nr_reclaimable) {
writeback_inodes(&wbc);
pages_written += write_chunk - wbc.nr_to_write;
- get_dirty_limits(&background_thresh, &dirty_thresh,
- &bdi_thresh, bdi);
+ get_dirty_limits(NULL,
+ &background_thresh, &dirty_thresh,
+ &bdi_thresh, bdi);
}

/*
@@ -521,10 +532,13 @@ static void balance_dirty_pages(struct address_space *mapping)
* background_thresh, to keep the amount of dirty memory low.
*/
if ((laptop_mode && pages_written) ||
- (!laptop_mode && (global_page_state(NR_FILE_DIRTY)
- + global_page_state(NR_UNSTABLE_NFS)
- > background_thresh)))
- pdflush_operation(background_writeout, 0);
+ (!laptop_mode &&
+ (mem_cgroup_nr_file_dirty(NULL) > background_thresh))) {
+ struct mem_cgroup *mem = get_current_mem_cgroup();
+
+ if (pdflush_operation(background_writeout, mem, 0))
+ put_mem_cgroup(mem);
+ }
}

void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -585,8 +599,8 @@ void throttle_vm_writeout(gfp_t gfp_mask)
long dirty_thresh;

for ( ; ; ) {
- get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
-
+ get_dirty_limits(NULL, &background_thresh, &dirty_thresh,
+ NULL, NULL);
/*
* Boost the allowable dirty threshold a bit for page
* allocators so they don't get DoS'ed by heavy writers
@@ -612,7 +626,8 @@ void throttle_vm_writeout(gfp_t gfp_mask)
* writeback at least _min_pages, and keep writing until the amount of dirty
* memory is less than the background threshold, or until we're all clean.
*/
-static void background_writeout(unsigned long _min_pages)
+static void background_writeout(struct mem_cgroup *mem,
+ unsigned long _min_pages)
{
long min_pages = _min_pages;
struct writeback_control wbc = {
@@ -628,9 +643,9 @@ static void background_writeout(unsigned long _min_pages)
long background_thresh;
long dirty_thresh;

- get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
- if (global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS) < background_thresh
+ get_dirty_limits(mem, &background_thresh, &dirty_thresh,
+ NULL, NULL);
+ if (mem_cgroup_nr_file_dirty(mem) < background_thresh
&& min_pages <= 0)
break;
wbc.more_io = 0;
@@ -647,6 +662,7 @@ static void background_writeout(unsigned long _min_pages)
break;
}
}
+ put_mem_cgroup(mem);
}

/*
@@ -656,10 +672,15 @@ static void background_writeout(unsigned long _min_pages)
*/
int wakeup_pdflush(long nr_pages)
{
+ struct mem_cgroup *mem = get_current_mem_cgroup();
+ int ret;
+
if (nr_pages == 0)
- nr_pages = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS);
- return pdflush_operation(background_writeout, nr_pages);
+ nr_pages = mem_cgroup_nr_file_dirty(NULL);
+ ret = pdflush_operation(background_writeout, mem, nr_pages);
+ if (ret)
+ put_mem_cgroup(mem);
+ return ret;
}

static void wb_timer_fn(unsigned long unused);
@@ -683,7 +704,7 @@ static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0);
* older_than_this takes precedence over nr_to_write. So we'll only write back
* all dirty pages if they are all attached to "old" mappings.
*/
-static void wb_kupdate(unsigned long arg)
+static void wb_kupdate(struct mem_cgroup *mem, unsigned long arg)
{
unsigned long oldest_jif;
unsigned long start_jif;
@@ -704,8 +725,7 @@ static void wb_kupdate(unsigned long arg)
oldest_jif = jiffies - dirty_expire_interval;
start_jif = jiffies;
next_jif = start_jif + dirty_writeback_interval;
- nr_to_write = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_UNSTABLE_NFS) +
+ nr_to_write = mem_cgroup_nr_file_dirty(mem) +
(inodes_stat.nr_inodes - inodes_stat.nr_unused);
while (nr_to_write > 0) {
wbc.more_io = 0;
@@ -724,6 +744,7 @@ static void wb_kupdate(unsigned long arg)
next_jif = jiffies + HZ;
if (dirty_writeback_interval)
mod_timer(&wb_timer, next_jif);
+ put_mem_cgroup(mem);
}

/*
@@ -742,18 +763,22 @@ int dirty_writeback_centisecs_handler(ctl_table *table, int write,

static void wb_timer_fn(unsigned long unused)
{
- if (pdflush_operation(wb_kupdate, 0) < 0)
+ struct mem_cgroup *mem = get_current_mem_cgroup();
+
+ if (pdflush_operation(wb_kupdate, mem, 0) < 0) {
+ put_mem_cgroup(mem);
mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
+ }
}

-static void laptop_flush(unsigned long unused)
+static void laptop_flush(struct mem_cgroup *mem, unsigned long unused)
{
sys_sync();
}

static void laptop_timer_fn(unsigned long unused)
{
- pdflush_operation(laptop_flush, 0);
+ pdflush_operation(laptop_flush, NULL, 0);
}

/*
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 0cbe0c6..27f05b6 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -83,7 +83,9 @@ static unsigned long last_empty_jifs;
*/
struct pdflush_work {
struct task_struct *who; /* The thread */
- void (*fn)(unsigned long); /* A callback function */
+ void (*fn)(struct mem_cgroup *,
+ unsigned long); /* A callback function */
+ struct mem_cgroup *mem; /* callback memory cgroup argument */
unsigned long arg0; /* An argument to the callback */
struct list_head list; /* On pdflush_list, when idle */
unsigned long when_i_went_to_sleep;
@@ -124,7 +126,7 @@ static int __pdflush(struct pdflush_work *my_work)
}
spin_unlock_irq(&pdflush_lock);

- (*my_work->fn)(my_work->arg0);
+ (*my_work->fn)(my_work->mem, my_work->arg0);

/*
* Thread creation: For how long have there been zero
@@ -198,7 +200,8 @@ static int pdflush(void *dummy)
* Returns zero if it indeed managed to find a worker thread, and passed your
* payload to it.
*/
-int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
+int pdflush_operation(void (*fn)(struct mem_cgroup *, unsigned long),
+ struct mem_cgroup *mem, unsigned long arg0)
{
unsigned long flags;
int ret = 0;
@@ -216,6 +219,7 @@ int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
if (list_empty(&pdflush_list))
last_empty_jifs = jiffies;
pdf->fn = fn;
+ pdf->mem = mem;
pdf->arg0 = arg0;
wake_up_process(pdf->who);
}
--
1.5.4.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/