async write IO controllers

From: Wu Fengguang
Date: Mon Apr 04 2011 - 14:12:29 EST


Hi Vivek,

To explore the possibility of an integrated async write cgroup IO
controller in balance_dirty_pages(), I did the attached patches.
They should serve it well to illustrate the basic ideas.

It's based on Andrea's two supporting patches and a slightly
simplified and improved version of this v6 patchset.

root@fat ~# cat test-blkio-cgroup.sh
#!/bin/sh

mount /dev/sda7 /fs

rmdir /cgroup/async_write
mkdir /cgroup/async_write
echo $$ > /cgroup/async_write/tasks
# echo "8:16 1048576" > /cgroup/async_write/blkio.throttle.read_bps_device

dd if=/dev/zero of=/fs/zero1 bs=1M count=100 &
dd if=/dev/zero of=/fs/zero2 bs=1M count=100 &

2-dd case:

root@fat ~# 100+0 records in
100+0 records out
104857600 bytes (105 MB) copied100+0 records in
100+0 records out
, 11.9477 s, 8.8 MB/s
104857600 bytes (105 MB) copied, 11.9496 s, 8.8 MB/s

1-dd case:

root@fat ~# 100+0 records in
100+0 records out
104857600 bytes (105 MB) copied, 6.21919 s, 16.9 MB/s

The patch hard codes a limit of 16MiB/s or 16.8MB/s. So the 1-dd case
is pretty accurate, and the 2-dd case is a bit leaked due to the time
to take the throttle bandwidth from its initial value 16MiB/s to
8MiB/s. This could be compensated by some position control in future,
so that it won't leak in normal cases.

The main bits, blkcg_update_throttle_bandwidth() is in fact a minimal
version of bdi_update_throttle_bandwidth(); blkcg_update_bandwidth()
is also a cut-down version of bdi_update_bandwidth().

Thanks,
Fengguang
Subject: blkcg: dirty rate accounting
Date: Sat Apr 02 20:15:28 CST 2011

To be used by the balance_dirty_pages() async write IO controller.

Signed-off-by: Wu Fengguang <fengguang.wu@xxxxxxxxx>
---
block/blk-cgroup.c | 4 ++++
include/linux/blk-cgroup.h | 1 +
mm/page-writeback.c | 4 ++++
3 files changed, 9 insertions(+)

--- linux-next.orig/block/blk-cgroup.c 2011-04-02 20:17:08.000000000 +0800
+++ linux-next/block/blk-cgroup.c 2011-04-02 21:59:24.000000000 +0800
@@ -1458,6 +1458,7 @@ static void blkiocg_destroy(struct cgrou

free_css_id(&blkio_subsys, &blkcg->css);
rcu_read_unlock();
+ percpu_counter_destroy(&blkcg->nr_dirtied);
if (blkcg != &blkio_root_cgroup)
kfree(blkcg);
}
@@ -1483,6 +1484,9 @@ done:
INIT_HLIST_HEAD(&blkcg->blkg_list);

INIT_LIST_HEAD(&blkcg->policy_list);
+
+ percpu_counter_init(&blkcg->nr_dirtied, 0);
+
return &blkcg->css;
}

--- linux-next.orig/include/linux/blk-cgroup.h 2011-04-02 20:17:08.000000000 +0800
+++ linux-next/include/linux/blk-cgroup.h 2011-04-02 21:59:02.000000000 +0800
@@ -111,6 +111,7 @@ struct blkio_cgroup {
spinlock_t lock;
struct hlist_head blkg_list;
struct list_head policy_list; /* list of blkio_policy_node */
+ struct percpu_counter nr_dirtied;
};

struct blkio_group_stats {
--- linux-next.orig/mm/page-writeback.c 2011-04-02 20:17:08.000000000 +0800
+++ linux-next/mm/page-writeback.c 2011-04-02 21:59:02.000000000 +0800
@@ -34,6 +34,7 @@
#include <linux/syscalls.h>
#include <linux/buffer_head.h>
#include <linux/pagevec.h>
+#include <linux/blk-cgroup.h>
#include <trace/events/writeback.h>

/*
@@ -221,6 +222,9 @@ EXPORT_SYMBOL_GPL(bdi_writeout_inc);

void task_dirty_inc(struct task_struct *tsk)
{
+ struct blkio_cgroup *blkcg = task_to_blkio_cgroup(tsk);
+ if (blkcg)
+ __percpu_counter_add(&blkcg->nr_dirtied, 1, BDI_STAT_BATCH);
prop_inc_single(&vm_dirties, &tsk->dirties);
}

Subject: writeback: async write IO controllers
Date: Fri Mar 04 10:38:04 CST 2011

- a bare per-task async write IO controller
- a bare per-cgroup async write IO controller

XXX: the per-task user interface is reusing RLIMIT_RSS for now.
XXX: the per-cgroup user interface is missing

CC: Vivek Goyal <vgoyal@xxxxxxxxxx>
CC: Andrea Righi <arighi@xxxxxxxxxxx>
Signed-off-by: Wu Fengguang <fengguang.wu@xxxxxxxxx>
---
block/blk-cgroup.c | 2
include/linux/blk-cgroup.h | 4 +
mm/page-writeback.c | 86 +++++++++++++++++++++++++++++++----
3 files changed, 84 insertions(+), 8 deletions(-)

--- linux-next.orig/mm/page-writeback.c 2011-04-05 01:26:38.000000000 +0800
+++ linux-next/mm/page-writeback.c 2011-04-05 01:26:53.000000000 +0800
@@ -1117,6 +1117,49 @@ static unsigned long max_pause(struct ba
return clamp_val(t, MIN_PAUSE, MAX_PAUSE);
}

+static void blkcg_update_throttle_bandwidth(struct blkio_cgroup *blkcg,
+ unsigned long dirtied,
+ unsigned long elapsed)
+{
+ unsigned long bw = blkcg->throttle_bandwidth;
+ unsigned long long ref_bw;
+ unsigned long dirty_bw;
+
+ ref_bw = blkcg->async_write_bps >> (3 + PAGE_SHIFT - RATIO_SHIFT);
+ dirty_bw = ((dirtied - blkcg->dirtied_stamp)*HZ + elapsed/2) / elapsed;
+ do_div(ref_bw, dirty_bw | 1);
+ ref_bw = bw * ref_bw >> RATIO_SHIFT;
+
+ blkcg->throttle_bandwidth = (bw + ref_bw) / 2;
+}
+
+void blkcg_update_bandwidth(struct blkio_cgroup *blkcg)
+{
+ unsigned long now = jiffies;
+ unsigned long dirtied;
+ unsigned long elapsed;
+
+ if (!blkcg)
+ return;
+ if (!spin_trylock(&blkcg->lock))
+ return;
+
+ elapsed = now - blkcg->bw_time_stamp;
+ dirtied = percpu_counter_read(&blkcg->nr_dirtied);
+
+ if (elapsed > MAX_PAUSE * 2)
+ goto snapshot;
+ if (elapsed <= MAX_PAUSE)
+ goto unlock;
+
+ blkcg_update_throttle_bandwidth(blkcg, dirtied, elapsed);
+snapshot:
+ blkcg->dirtied_stamp = dirtied;
+ blkcg->bw_time_stamp = now;
+unlock:
+ spin_unlock(&blkcg->lock);
+}
+
/*
* balance_dirty_pages() must be called by processes which are generating dirty
* data. It looks at the number of dirty pages in the machine and will force
@@ -1139,6 +1182,10 @@ static void balance_dirty_pages(struct a
unsigned long pause_max;
struct backing_dev_info *bdi = mapping->backing_dev_info;
unsigned long start_time = jiffies;
+ struct blkio_cgroup *blkcg = task_to_blkio_cgroup(current);
+
+ if (blkcg == &blkio_root_cgroup)
+ blkcg = NULL;

for (;;) {
unsigned long now = jiffies;
@@ -1178,6 +1225,15 @@ static void balance_dirty_pages(struct a
* when the bdi limits are ramping up.
*/
if (nr_dirty <= (background_thresh + dirty_thresh) / 2) {
+ if (blkcg) {
+ pause_max = max_pause(bdi, 0);
+ goto cgroup_ioc;
+ }
+ if (current->signal->rlim[RLIMIT_RSS].rlim_cur !=
+ RLIM_INFINITY) {
+ pause_max = max_pause(bdi, 0);
+ goto task_ioc;
+ }
current->paused_when = now;
current->nr_dirtied = 0;
break;
@@ -1190,21 +1246,35 @@ static void balance_dirty_pages(struct a
bdi_start_background_writeback(bdi);

pause_max = max_pause(bdi, bdi_dirty);
-
base_bw = bdi->throttle_bandwidth;
- /*
- * Double the bandwidth for PF_LESS_THROTTLE (ie. nfsd) and
- * real-time tasks.
- */
- if (current->flags & PF_LESS_THROTTLE || rt_task(current))
- base_bw *= 2;
bw = position_ratio(bdi, dirty_thresh, nr_dirty, bdi_dirty);
if (unlikely(bw == 0)) {
period = pause_max;
pause = pause_max;
goto pause;
}
- bw = base_bw * (u64)bw >> RATIO_SHIFT;
+ bw = (u64)base_bw * bw >> RATIO_SHIFT;
+ if (blkcg && bw > blkcg->throttle_bandwidth) {
+cgroup_ioc:
+ blkcg_update_bandwidth(blkcg);
+ bw = blkcg->throttle_bandwidth;
+ base_bw = bw;
+ }
+ if (bw > current->signal->rlim[RLIMIT_RSS].rlim_cur >>
+ PAGE_SHIFT) {
+task_ioc:
+ bw = current->signal->rlim[RLIMIT_RSS].rlim_cur >>
+ PAGE_SHIFT;
+ base_bw = bw;
+ }
+ /*
+ * Double the bandwidth for PF_LESS_THROTTLE (ie. nfsd) and
+ * real-time tasks.
+ */
+ if (current->flags & PF_LESS_THROTTLE || rt_task(current)) {
+ bw *= 2;
+ base_bw = bw;
+ }
period = (HZ * pages_dirtied + bw / 2) / (bw | 1);
pause = current->paused_when + period - now;
/*
--- linux-next.orig/block/blk-cgroup.c 2011-04-05 01:26:38.000000000 +0800
+++ linux-next/block/blk-cgroup.c 2011-04-05 01:26:39.000000000 +0800
@@ -1486,6 +1486,8 @@ done:
INIT_LIST_HEAD(&blkcg->policy_list);

percpu_counter_init(&blkcg->nr_dirtied, 0);
+ blkcg->async_write_bps = 16 << 23; /* XXX: tunable interface */
+ blkcg->throttle_bandwidth = 16 << (20 - PAGE_SHIFT);

return &blkcg->css;
}
--- linux-next.orig/include/linux/blk-cgroup.h 2011-04-05 01:26:38.000000000 +0800
+++ linux-next/include/linux/blk-cgroup.h 2011-04-05 01:26:39.000000000 +0800
@@ -112,6 +112,10 @@ struct blkio_cgroup {
struct hlist_head blkg_list;
struct list_head policy_list; /* list of blkio_policy_node */
struct percpu_counter nr_dirtied;
+ unsigned long bw_time_stamp;
+ unsigned long dirtied_stamp;
+ unsigned long throttle_bandwidth;
+ unsigned long async_write_bps;
};

struct blkio_group_stats {