Re: [PATCH 2/3] i/o bandwidth controller infrastructure

From: Andrea Righi
Date: Wed Jun 18 2008 - 18:31:40 EST


Carl Henrik Lunde wrote:
On Sat, Jun 07, 2008 at 12:27:29AM +0200, Andrea Righi wrote:
This is the core io-throttle kernel infrastructure. It creates the basic
interfaces to cgroups and implements the I/O measurement and throttling
functions.
[...]
+void cgroup_io_account(struct block_device *bdev, size_t bytes)
[...]
+ /* Account the I/O activity */
+ node->req += bytes;
+
+ /* Evaluate if we need to throttle the current process */
+ delta = (long)jiffies - (long)node->last_request;
+ if (!delta)
+ goto out;
+
+ t = msecs_to_jiffies(node->req / node->iorate);
+ if (!t)
+ goto out;
+
+ sleep = t - delta;
+ if (unlikely(sleep > 0)) {
+ spin_unlock_irq(&iot->lock);
+ if (__cant_sleep())
+ return;
+ pr_debug("io-throttle: task %p (%s) must sleep %lu jiffies\n",
+ current, current->comm, sleep);
+ schedule_timeout_killable(sleep);
+ return;
+ }
+
+ /* Reset I/O accounting */
+ node->req = 0;
+ node->last_request = jiffies;
[...]

Did you consider using token bucket instead of this (leaky bucket?)?

I've attached a patch which implements a token bucket. Although it is not
as precise as the leaky bucket, the performance is better under
high-bandwidth streaming loads.

Interesting! It would be great to have both available at runtime and
switch between leaky and token bucket, e.g. by echoing "leaky" or
"token" to a file in the cgroup filesystem, ummm, block.limiting-algorithm?


The leaky bucket tops out at around 53 MB/s while the token bucket
reaches 64 MB/s. The baseline (no cgroups) is 66 MB/s.

benchmark:
two streaming readers (fio) with block size 128k, bucket size 4 MB;
90% of the bandwidth was allocated to one process, the other got 10%

Thanks for posting the results; I'll look closely at your patch, test
it as well, and try to merge your work.

I also made some scalability improvements in v2: in particular, I've
replaced the rbtree with a linked list, so the spinlocks can be removed
in favor of RCU to protect the list structure. I need to run some
stress tests first, but I'll post a v3 soon.
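
For reference, the lookup path I have in mind looks roughly like this (a
sketch only; the list head and member names are from my work-in-progress
tree and may still change):

	/* RCU-protected linked list instead of rbtree + spinlock */
	static struct iothrottle_node *
	iothrottle_search_node(struct iothrottle *iot, dev_t dev)
	{
		struct iothrottle_node *node;

		list_for_each_entry_rcu(node, &iot->list, node)
			if (node->dev == dev)
				return node;
		return NULL;
	}

Readers (i.e. cgroup_io_account()) do the lookup under
rcu_read_lock()/rcu_read_unlock(), while writers (the cgroup filesystem
handlers) serialize on a mutex and update the list with
list_add_rcu()/list_del_rcu() plus call_rcu() to free nodes.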

Some minor comments below for now.

diff --git a/block/blk-io-throttle.c b/block/blk-io-throttle.c
index 804df88..9ed0c7c 100644
--- a/block/blk-io-throttle.c
+++ b/block/blk-io-throttle.c
@@ -40,7 +40,8 @@ struct iothrottle_node {
struct rb_node node;
dev_t dev;
unsigned long iorate;
- unsigned long req;
+ long bucket_size; /* Max value for t */
+ long t;
unsigned long last_request;
};
@@ -180,18 +181,20 @@ static ssize_t iothrottle_read(struct cgroup *cont,
iothrottle_for_each(n, &iot->tree) {
struct iothrottle_node *node =
rb_entry(n, struct iothrottle_node, node);
- unsigned long delta = (long)jiffies - (long)node->last_request;
+ unsigned long delta = (((long)jiffies - (long)node->last_request) * 1000) / HZ;

Better to use jiffies_to_msecs() here.
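i.e. something like:

	unsigned long delta = jiffies_to_msecs((long)jiffies -
					       (long)node->last_request);

which avoids open-coding the jiffies -> ms conversion.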

BUG_ON(!node->dev);
s += snprintf(s, nbytes - (s - buffer),
"=== device (%u,%u) ===\n"
"bandwidth-max: %lu KiB/sec\n"
- " requested: %lu bytes\n"
- " last request: %lu jiffies\n"
- " delta: %lu jiffies\n",
+ "bucket size : %ld KiB\n"
+ "bucket fill : %ld KiB (after last request)\n"
+ "last request : %lu ms ago\n",
MAJOR(node->dev), MINOR(node->dev),
- node->iorate, node->req,
- node->last_request, delta);
+ node->iorate,
+ node->bucket_size / 1024,
+ node->t / 1024,
+ delta);
}
spin_unlock_irq(&iot->lock);
buffer[nbytes] = '\0';
@@ -220,21 +223,33 @@ static inline dev_t devname2dev_t(const char *buf)
return ret;
}
-static inline int iothrottle_parse_args(char *buf, size_t nbytes,
- dev_t *dev, unsigned long *val)
+static inline int iothrottle_parse_args(char *buf, size_t nbytes, dev_t *dev,
+ unsigned long *iorate,
+ unsigned long *bucket_size)
{
- char *p;
+ char *ioratep, *bucket_sizep;
- p = memchr(buf, ':', nbytes);
- if (!p)
+ ioratep = memchr(buf, ':', nbytes);
+ if (!ioratep)
return -EINVAL;
- *p++ = '\0';
+ *ioratep++ = '\0';
+
+ bucket_sizep = memchr(ioratep, ':', nbytes + ioratep - buf);
+ if (!bucket_sizep)
+ return -EINVAL;
+ *bucket_sizep++ = '\0';
*dev = devname2dev_t(buf);
if (!*dev)
return -ENOTBLK;
- return strict_strtoul(p, 10, val);
+ if (strict_strtoul(ioratep, 10, iorate))
+ return -EINVAL;
+
+ if (strict_strtoul(bucket_sizep, 10, bucket_size))
+ return -EINVAL;
+
+ return 0;
}
static ssize_t iothrottle_write(struct cgroup *cont,
@@ -247,7 +262,7 @@ static ssize_t iothrottle_write(struct cgroup *cont,
struct iothrottle_node *node, *tmpn = NULL;
char *buffer, *tmpp;
dev_t dev;
- unsigned long val;
+ unsigned long iorate, bucket_size;
int ret;
if (unlikely(!nbytes))
@@ -265,7 +280,7 @@ static ssize_t iothrottle_write(struct cgroup *cont,
buffer[nbytes] = '\0';
tmpp = strstrip(buffer);
- ret = iothrottle_parse_args(tmpp, nbytes, &dev, &val);
+ ret = iothrottle_parse_args(tmpp, nbytes, &dev, &iorate, &bucket_size);
if (ret)
goto out1;
@@ -284,7 +299,7 @@ static ssize_t iothrottle_write(struct cgroup *cont,
iot = cgroup_to_iothrottle(cont);
spin_lock_irq(&iot->lock);
- if (!val) {
+ if (!iorate) {
/* Delete a block device limiting rule */
iothrottle_delete_node(iot, dev);
ret = nbytes;
@@ -293,8 +308,9 @@ static ssize_t iothrottle_write(struct cgroup *cont,
node = iothrottle_search_node(iot, dev);
if (node) {
/* Update a block device limiting rule */
- node->iorate = val;
- node->req = 0;
+ node->iorate = iorate;
+ node->bucket_size = bucket_size * 1024;
+ node->t = 0;
node->last_request = jiffies;
ret = nbytes;
goto out3;
@@ -307,8 +323,9 @@ static ssize_t iothrottle_write(struct cgroup *cont,
node = tmpn;
tmpn = NULL;
- node->iorate = val;
- node->req = 0;
+ node->iorate = iorate;
+ node->bucket_size = bucket_size * 1024;
+ node->t = 0;
node->last_request = jiffies;
node->dev = dev;
ret = iothrottle_insert_node(iot, node);
@@ -355,7 +372,7 @@ void cgroup_io_account(struct block_device *bdev, size_t bytes)
{
struct iothrottle *iot;
struct iothrottle_node *node;
- unsigned long delta, t;
+ unsigned long delta;
long sleep;
if (unlikely(!bdev))
@@ -370,36 +387,37 @@ void cgroup_io_account(struct block_device *bdev, size_t bytes)
spin_lock_irq(&iot->lock);
node = iothrottle_search_node(iot, bdev->bd_inode->i_rdev);
- if (!node || !node->iorate)
- goto out;
-
- /* Account the I/O activity */
- node->req += bytes;
+ if (!node || !node->iorate) {
+ spin_unlock_irq(&iot->lock);
+ return;
+ }
- /* Evaluate if we need to throttle the current process */
+ /* Add tokens for time elapsed since last read */
delta = (long)jiffies - (long)node->last_request;
- if (!delta)
- goto out;
+ if (delta) {
+ node->last_request = jiffies;
+ node->t += (node->iorate * 1024 * delta) / HZ;

The same here:

	node->t += node->iorate * 1024
		 * jiffies_to_msecs(delta) / MSEC_PER_SEC;

(dividing by MSEC_PER_SEC, since iorate is per second and
jiffies_to_msecs() returns milliseconds).

- t = msecs_to_jiffies(node->req / node->iorate);
- if (!t)
- goto out;
+ if (node->t > node->bucket_size)
+ node->t = node->bucket_size;
+ }
- sleep = t - delta;
- if (unlikely(sleep > 0)) {
- spin_unlock_irq(&iot->lock);
- if (__cant_sleep())
- return;
- pr_debug("io-throttle: task %p (%s) must sleep %lu jiffies\n",
- current, current->comm, sleep);
- schedule_timeout_killable(sleep);
- return;
+ /* Account the I/O activity */
+ node->t -= bytes;
+
+ if (node->t < 0) {
+ sleep = (-node->t) * HZ / (node->iorate * 1024);

And again:

	sleep = msecs_to_jiffies((-node->t) * MSEC_PER_SEC
				 / (node->iorate * 1024));

(multiplying before dividing, so the sub-second part of the sleep is not
lost to integer truncation).

+ } else {
+ sleep = 0;
}
- /* Reset I/O accounting */
- node->req = 0;
- node->last_request = jiffies;
-out:
spin_unlock_irq(&iot->lock);
+
+ if (sleep && !__cant_sleep()) {
+ pr_debug("io-throttle: %s[%d] must sleep %ld jiffies\n",
+ current->comm, current->pid, sleep);
+
+ schedule_timeout_killable(sleep);
+ }
}
EXPORT_SYMBOL(cgroup_io_account);
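
BTW, with this change the configuration string gains a third
colon-separated field: if I read iothrottle_parse_args() correctly the
format is now <device>:<iorate>:<bucket_size>, with the bucket size in
KiB like iorate. For example:

	/dev/sda:4096:4096

would set a 4 MiB/s limit with a 4 MiB bucket on /dev/sda. Worth
spelling out in the documentation of the next version.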

Thanks,
-Andrea