Re: [RFC][PATCH -mm 0/5] cgroup: block device i/o controller (v9)

From: Andrea Righi
Date: Tue Sep 02 2008 - 16:50:31 EST


Vivek Goyal wrote:
> On Wed, Aug 27, 2008 at 06:07:32PM +0200, Andrea Righi wrote:
>> The objective of the i/o controller is to improve i/o performance
>> predictability of different cgroups sharing the same block devices.
>>
>> Compared to other priority/weight-based solutions, the approach used by
>> this controller is to explicitly choke the requests of the applications
>> that directly (or indirectly) generate i/o activity in the system.
>>
>
> Hi Andrea,
>
> I was checking out the past discussion on this topic and there seemed to
> be two kinds of people: those who wanted to control max bandwidth and
> those who liked the proportional bandwidth approach (dm-ioband folks).
>
> I was just wondering: is it possible to have both approaches and let
> users decide at run time which one they want to use (something like
> the way users can choose i/o schedulers)?
>
> Thanks
> Vivek

Hi Vivek,

yes, that sounds reasonable (I'm adding proportional bandwidth control
to my TODO list).

Right now I have a totally experimental patch that adds ionice-like
functionality (it's not the same, but it's quite similar to a
proportional bandwidth feature) on top of my i/o controller. See below.

The patch is not very well tested: I don't even know if it applies
cleanly to the latest io-throttle patch I posted, or if it has runtime
failures; it needs more testing.

Anyway, it adds the file blockio.ionice, which can be used to set
per-cgroup i/o priorities, just like ionice; the difference is that it
works per cgroup instead of per task (it could easily be extended to
also support per-device priorities).
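
For example, to put a cgroup into the best-effort class with priority 4
(the file expects the raw ioprio value, encoded as in
include/linux/ioprio.h; mount point and cgroup name below are just
examples):

  # mount -t cgroup -o blockio blockio /mnt/cgroup
  # mkdir /mnt/cgroup/foo
  # echo $(((2 << 13) | 4)) > /mnt/cgroup/foo/blockio.ionice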

The solution I've used is really trivial: all the tasks belonging to a
cgroup share the same io_context, which means they also share the same
disk time given by the i/o scheduler, and the requests coming from a
cgroup are treated as if they were issued by a single task. This works
only for CFQ and AS, because deadline and noop have no concept of i/o
contexts.
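
For reference, linking a task to the shared io_context just takes an
extra reference on it via ioc_task_link(); this is (roughly) the
existing helper in include/linux/iocontext.h:

static inline struct io_context *ioc_task_link(struct io_context *ioc)
{
	/*
	 * if ref count is zero, don't allow sharing (ioc is going away,
	 * it's a race).
	 */
	if (ioc && atomic_inc_not_zero(&ioc->refcount)) {
		atomic_inc(&ioc->nr_tasks);
		return ioc;
	}

	return NULL;
}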

I would also like to merge Satoshi's cfq-cgroup functionality to provide
"fairness" also within each cgroup, but the drawback is that it would
work only for CFQ.

So, in conclusion, I'd really like to implement a more generic
weighted/priority cgroup-based policy to schedule bios, like dm-ioband
does, maybe implementing the hook directly in submit_bio() or
generic_make_request(), independently of the dm infrastructure.
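
Just to give an idea of what I mean, something like the sketch below
(totally hypothetical: cgroup_bio_schedule() doesn't exist anywhere,
it's just a placeholder for the weight-based scheduling logic):

/*
 * Hypothetical hook in the generic block layer: delay or dispatch the
 * bio according to the weight of the cgroup that generated it, before
 * it reaches the elevator. The real submit_bio() also does i/o
 * accounting, omitted here.
 */
void submit_bio(int rw, struct bio *bio)
{
	bio->bi_rw |= rw;

	/* placeholder: apply the per-cgroup weight/priority policy */
	cgroup_bio_schedule(bio);

	generic_make_request(bio);
}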

-Andrea

Signed-off-by: Andrea Righi <righi.andrea@xxxxxxxxx>
---
 block/blk-io-throttle.c         |   72 +++++++++++++++++++++++++++++++++++++--
 block/blk-ioc.c                 |   16 +-------
 include/linux/blk-io-throttle.h |    7 ++++
 include/linux/iocontext.h       |   15 ++++++++
 kernel/fork.c                   |    3 +-
 5 files changed, 95 insertions(+), 18 deletions(-)

diff --git a/block/blk-io-throttle.c b/block/blk-io-throttle.c
index 0fa235d..2a52e8d 100644
--- a/block/blk-io-throttle.c
+++ b/block/blk-io-throttle.c
@@ -29,6 +29,8 @@
 #include <linux/err.h>
 #include <linux/sched.h>
 #include <linux/genhd.h>
+#include <linux/iocontext.h>
+#include <linux/ioprio.h>
 #include <linux/fs.h>
 #include <linux/jiffies.h>
 #include <linux/hardirq.h>
@@ -129,8 +131,10 @@ struct iothrottle_node {
 struct iothrottle {
 	struct cgroup_subsys_state css;
 	struct list_head list;
+	struct io_context *ioc;
 };
 static struct iothrottle init_iothrottle;
+static struct io_context init_ioc;
 
 static inline struct iothrottle *cgroup_to_iothrottle(struct cgroup *cgrp)
 {
@@ -197,12 +201,17 @@ iothrottle_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
 {
 	struct iothrottle *iot;
 
-	if (unlikely((cgrp->parent) == NULL))
+	if (unlikely((cgrp->parent) == NULL)) {
 		iot = &init_iothrottle;
-	else {
+		init_io_context(&init_ioc);
+		iot->ioc = &init_ioc;
+	} else {
 		iot = kmalloc(sizeof(*iot), GFP_KERNEL);
 		if (unlikely(!iot))
 			return ERR_PTR(-ENOMEM);
+		iot->ioc = alloc_io_context(GFP_KERNEL, -1);
+		if (unlikely(!iot->ioc))
+			return ERR_PTR(-ENOMEM);
 	}
 	INIT_LIST_HEAD(&iot->list);
 
@@ -223,6 +232,7 @@ static void iothrottle_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
 	 */
 	list_for_each_entry_safe(n, p, &iot->list, node)
 		kfree(n);
+	put_io_context(iot->ioc);
 	kfree(iot);
 }
 
@@ -470,6 +480,27 @@ out1:
 	return ret;
 }
 
+static u64 ionice_read_u64(struct cgroup *cgrp, struct cftype *cft)
+{
+	struct iothrottle *iot = cgroup_to_iothrottle(cgrp);
+
+	return iot->ioc->ioprio;
+}
+
+static int ionice_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
+{
+	struct iothrottle *iot;
+
+	if (!cgroup_lock_live_group(cgrp))
+		return -ENODEV;
+	iot = cgroup_to_iothrottle(cgrp);
+	iot->ioc->ioprio = (int)val;
+	iot->ioc->ioprio_changed = 1;
+	cgroup_unlock();
+
+	return 0;
+}
+
 static struct cftype files[] = {
 	{
 		.name = "bandwidth-max",
@@ -486,6 +517,11 @@ static struct cftype files[] = {
 		.private = IOTHROTTLE_IOPS,
 	},
 	{
+		.name = "ionice",
+		.read_u64 = ionice_read_u64,
+		.write_u64 = ionice_write_u64,
+	},
+	{
 		.name = "throttlecnt",
 		.read_seq_string = iothrottle_read,
 		.private = IOTHROTTLE_FAILCNT,
@@ -497,15 +533,45 @@ static int iothrottle_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
 	return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
 }
 
+static void iothrottle_move_task(struct cgroup_subsys *ss,
+		struct cgroup *cgrp, struct cgroup *old_cgrp,
+		struct task_struct *tsk)
+{
+	struct iothrottle *iot;
+
+	iot = cgroup_to_iothrottle(cgrp);
+
+	task_lock(tsk);
+	put_io_context(tsk->io_context);
+	tsk->io_context = ioc_task_link(iot->ioc);
+	BUG_ON(!tsk->io_context);
+	task_unlock(tsk);
+}
+
 struct cgroup_subsys iothrottle_subsys = {
 	.name = "blockio",
 	.create = iothrottle_create,
 	.destroy = iothrottle_destroy,
 	.populate = iothrottle_populate,
+	.attach = iothrottle_move_task,
 	.subsys_id = iothrottle_subsys_id,
-	.early_init = 1,
+	.early_init = 0,
 };
 
+int cgroup_copy_io(struct task_struct *tsk)
+{
+	struct iothrottle *iot;
+
+	rcu_read_lock();
+	iot = task_to_iothrottle(current);
+	BUG_ON(!iot);
+	tsk->io_context = ioc_task_link(iot->ioc);
+	rcu_read_unlock();
+	BUG_ON(!tsk->io_context);
+
+	return 0;
+}
+
 /*
  * NOTE: called with rcu_read_lock() held.
  */
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 012f065..629a80b 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -89,20 +89,8 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
 	struct io_context *ret;
 
 	ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node);
-	if (ret) {
-		atomic_set(&ret->refcount, 1);
-		atomic_set(&ret->nr_tasks, 1);
-		spin_lock_init(&ret->lock);
-		ret->ioprio_changed = 0;
-		ret->ioprio = 0;
-		ret->last_waited = jiffies; /* doesn't matter... */
-		ret->nr_batch_requests = 0; /* because this is 0 */
-		ret->aic = NULL;
-		INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH);
-		INIT_HLIST_HEAD(&ret->cic_list);
-		ret->ioc_data = NULL;
-	}
-
+	if (ret)
+		init_io_context(ret);
 	return ret;
 }
 
diff --git a/include/linux/blk-io-throttle.h b/include/linux/blk-io-throttle.h
index e901818..bee3975 100644
--- a/include/linux/blk-io-throttle.h
+++ b/include/linux/blk-io-throttle.h
@@ -14,6 +14,8 @@ extern unsigned long long
 cgroup_io_throttle(struct page *page, struct block_device *bdev,
 		ssize_t bytes, int can_sleep);
 
+extern int cgroup_copy_io(struct task_struct *tsk);
+
 static inline void set_in_aio(void)
 {
 	atomic_set(&current->in_aio, 1);
@@ -51,6 +53,11 @@ cgroup_io_throttle(struct page *page, struct block_device *bdev,
 	return 0;
 }
 
+static inline int cgroup_copy_io(struct task_struct *tsk)
+{
+	return -1;
+}
+
 static inline void set_in_aio(void) { }
 
 static inline void unset_in_aio(void) { }
diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h
index 08b987b..d06af02 100644
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -85,6 +85,21 @@ struct io_context {
 	void *ioc_data;
 };
 
+static inline void init_io_context(struct io_context *ioc)
+{
+	atomic_set(&ioc->refcount, 1);
+	atomic_set(&ioc->nr_tasks, 1);
+	spin_lock_init(&ioc->lock);
+	ioc->ioprio_changed = 0;
+	ioc->ioprio = 0;
+	ioc->last_waited = jiffies; /* doesn't matter... */
+	ioc->nr_batch_requests = 0; /* because this is 0 */
+	ioc->aic = NULL;
+	INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH);
+	INIT_HLIST_HEAD(&ioc->cic_list);
+	ioc->ioc_data = NULL;
+}
+
 static inline struct io_context *ioc_task_link(struct io_context *ioc)
 {
 	/*
diff --git a/kernel/fork.c b/kernel/fork.c
index 9ee7408..cf38989 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -41,6 +41,7 @@
 #include <linux/tracehook.h>
 #include <linux/futex.h>
 #include <linux/task_io_accounting_ops.h>
+#include <linux/blk-io-throttle.h>
 #include <linux/rcupdate.h>
 #include <linux/ptrace.h>
 #include <linux/mount.h>
@@ -733,7 +734,7 @@ static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
 #ifdef CONFIG_BLOCK
 	struct io_context *ioc = current->io_context;
 
-	if (!ioc)
+	if (!ioc || !cgroup_copy_io(tsk))
 		return 0;
 	/*
 	 * Share io context with parent, if CLONE_IO is set
