Re: [PATCH 2/3] i/o bandwidth controller infrastructure

From: Carl Henrik Lunde
Date: Wed Jun 18 2008 - 14:06:34 EST


On Sat, Jun 07, 2008 at 12:27:29AM +0200, Andrea Righi wrote:
> This is the core io-throttle kernel infrastructure. It creates the basic
> interfaces to cgroups and implements the I/O measurement and throttling
> functions.
[...]
> +void cgroup_io_account(struct block_device *bdev, size_t bytes)
[...]
> + /* Account the I/O activity */
> + node->req += bytes;
> +
> + /* Evaluate if we need to throttle the current process */
> + delta = (long)jiffies - (long)node->last_request;
> + if (!delta)
> + goto out;
> +
> + t = msecs_to_jiffies(node->req / node->iorate);
> + if (!t)
> + goto out;
> +
> + sleep = t - delta;
> + if (unlikely(sleep > 0)) {
> + spin_unlock_irq(&iot->lock);
> + if (__cant_sleep())
> + return;
> + pr_debug("io-throttle: task %p (%s) must sleep %lu jiffies\n",
> + current, current->comm, sleep);
> + schedule_timeout_killable(sleep);
> + return;
> + }
> +
> + /* Reset I/O accounting */
> + node->req = 0;
> + node->last_request = jiffies;
[...]

Did you consider using a token bucket instead of this (leaky bucket?) algorithm?

I've attached a patch which implements a token bucket. Although not as
precise as the leaky bucket, the performance is better under
high-bandwidth streaming loads.

The leaky bucket stops at around 53 MB/s, while the token bucket works
at up to 64 MB/s. The baseline (no cgroups) is 66 MB/s.

benchmark:
two streaming readers (fio) with block size 128k, bucket size 4 MB
90% of the bandwidth was allocated to one process; the other got 10%

bw-limit: actual bw algorithm bw1 bw2
5 MiB/s: 5.0 MiB/s leaky_bucket 0.5 4.5
5 MiB/s: 5.2 MiB/s token_bucket 0.6 4.6
10 MiB/s: 10.0 MiB/s leaky_bucket 1.0 9.0
10 MiB/s: 10.3 MiB/s token_bucket 1.0 9.2
15 MiB/s: 15.0 MiB/s leaky_bucket 1.5 13.5
15 MiB/s: 15.4 MiB/s token_bucket 1.5 13.8
20 MiB/s: 19.9 MiB/s leaky_bucket 2.0 17.9
20 MiB/s: 20.5 MiB/s token_bucket 2.1 18.4
25 MiB/s: 24.4 MiB/s leaky_bucket 2.5 21.9
25 MiB/s: 25.6 MiB/s token_bucket 2.6 23.0
30 MiB/s: 29.2 MiB/s leaky_bucket 3.0 26.2
30 MiB/s: 30.7 MiB/s token_bucket 3.1 27.7
35 MiB/s: 34.3 MiB/s leaky_bucket 3.4 30.9
35 MiB/s: 35.9 MiB/s token_bucket 3.6 32.3
40 MiB/s: 39.7 MiB/s leaky_bucket 3.9 35.8
40 MiB/s: 41.0 MiB/s token_bucket 4.1 36.9
45 MiB/s: 44.0 MiB/s leaky_bucket 4.3 39.7
45 MiB/s: 46.1 MiB/s token_bucket 4.6 41.5
50 MiB/s: 47.9 MiB/s leaky_bucket 4.7 43.2
50 MiB/s: 51.0 MiB/s token_bucket 5.1 45.9
55 MiB/s: 50.5 MiB/s leaky_bucket 5.0 45.5
55 MiB/s: 56.2 MiB/s token_bucket 5.6 50.5
60 MiB/s: 52.9 MiB/s leaky_bucket 5.2 47.7
60 MiB/s: 61.0 MiB/s token_bucket 6.1 54.9
65 MiB/s: 53.0 MiB/s leaky_bucket 5.4 47.6
65 MiB/s: 63.7 MiB/s token_bucket 6.6 57.1
70 MiB/s: 53.8 MiB/s leaky_bucket 5.5 48.4
70 MiB/s: 64.1 MiB/s token_bucket 7.1 57.0


diff --git a/block/blk-io-throttle.c b/block/blk-io-throttle.c
index 804df88..9ed0c7c 100644
--- a/block/blk-io-throttle.c
+++ b/block/blk-io-throttle.c
@@ -40,7 +40,8 @@ struct iothrottle_node {
struct rb_node node;
dev_t dev;
unsigned long iorate;
- unsigned long req;
+ long bucket_size; /* Max value for t */
+ long t;
unsigned long last_request;
};

@@ -180,18 +181,20 @@ static ssize_t iothrottle_read(struct cgroup *cont,
iothrottle_for_each(n, &iot->tree) {
struct iothrottle_node *node =
rb_entry(n, struct iothrottle_node, node);
- unsigned long delta = (long)jiffies - (long)node->last_request;
+ unsigned long delta = (((long)jiffies - (long)node->last_request) * 1000) / HZ;

BUG_ON(!node->dev);
s += snprintf(s, nbytes - (s - buffer),
"=== device (%u,%u) ===\n"
"bandwidth-max: %lu KiB/sec\n"
- " requested: %lu bytes\n"
- " last request: %lu jiffies\n"
- " delta: %lu jiffies\n",
+ "bucket size : %ld KiB\n"
+ "bucket fill : %ld KiB (after last request)\n"
+ "last request : %lu ms ago\n",
MAJOR(node->dev), MINOR(node->dev),
- node->iorate, node->req,
- node->last_request, delta);
+ node->iorate,
+ node->bucket_size / 1024,
+ node->t / 1024,
+ delta);
}
spin_unlock_irq(&iot->lock);
buffer[nbytes] = '\0';
@@ -220,21 +223,33 @@ static inline dev_t devname2dev_t(const char *buf)
return ret;
}

-static inline int iothrottle_parse_args(char *buf, size_t nbytes,
- dev_t *dev, unsigned long *val)
+static inline int iothrottle_parse_args(char *buf, size_t nbytes, dev_t *dev,
+ unsigned long *iorate,
+ unsigned long *bucket_size)
{
- char *p;
+ char *ioratep, *bucket_sizep;

- p = memchr(buf, ':', nbytes);
- if (!p)
+ ioratep = memchr(buf, ':', nbytes);
+ if (!ioratep)
return -EINVAL;
- *p++ = '\0';
+ *ioratep++ = '\0';
+
+ bucket_sizep = memchr(ioratep, ':', nbytes + ioratep - buf);
+ if (!bucket_sizep)
+ return -EINVAL;
+ *bucket_sizep++ = '\0';

*dev = devname2dev_t(buf);
if (!*dev)
return -ENOTBLK;

- return strict_strtoul(p, 10, val);
+ if (strict_strtoul(ioratep, 10, iorate))
+ return -EINVAL;
+
+ if (strict_strtoul(bucket_sizep, 10, bucket_size))
+ return -EINVAL;
+
+ return 0;
}

static ssize_t iothrottle_write(struct cgroup *cont,
@@ -247,7 +262,7 @@ static ssize_t iothrottle_write(struct cgroup *cont,
struct iothrottle_node *node, *tmpn = NULL;
char *buffer, *tmpp;
dev_t dev;
- unsigned long val;
+ unsigned long iorate, bucket_size;
int ret;

if (unlikely(!nbytes))
@@ -265,7 +280,7 @@ static ssize_t iothrottle_write(struct cgroup *cont,
buffer[nbytes] = '\0';
tmpp = strstrip(buffer);

- ret = iothrottle_parse_args(tmpp, nbytes, &dev, &val);
+ ret = iothrottle_parse_args(tmpp, nbytes, &dev, &iorate, &bucket_size);
if (ret)
goto out1;

@@ -284,7 +299,7 @@ static ssize_t iothrottle_write(struct cgroup *cont,
iot = cgroup_to_iothrottle(cont);

spin_lock_irq(&iot->lock);
- if (!val) {
+ if (!iorate) {
/* Delete a block device limiting rule */
iothrottle_delete_node(iot, dev);
ret = nbytes;
@@ -293,8 +308,9 @@ static ssize_t iothrottle_write(struct cgroup *cont,
node = iothrottle_search_node(iot, dev);
if (node) {
/* Update a block device limiting rule */
- node->iorate = val;
- node->req = 0;
+ node->iorate = iorate;
+ node->bucket_size = bucket_size * 1024;
+ node->t = 0;
node->last_request = jiffies;
ret = nbytes;
goto out3;
@@ -307,8 +323,9 @@ static ssize_t iothrottle_write(struct cgroup *cont,
node = tmpn;
tmpn = NULL;

- node->iorate = val;
- node->req = 0;
+ node->iorate = iorate;
+ node->bucket_size = bucket_size * 1024;
+ node->t = 0;
node->last_request = jiffies;
node->dev = dev;
ret = iothrottle_insert_node(iot, node);
@@ -355,7 +372,7 @@ void cgroup_io_account(struct block_device *bdev, size_t bytes)
{
struct iothrottle *iot;
struct iothrottle_node *node;
- unsigned long delta, t;
+ unsigned long delta;
long sleep;

if (unlikely(!bdev))
@@ -370,36 +387,37 @@ void cgroup_io_account(struct block_device *bdev, size_t bytes)
spin_lock_irq(&iot->lock);

node = iothrottle_search_node(iot, bdev->bd_inode->i_rdev);
- if (!node || !node->iorate)
- goto out;
-
- /* Account the I/O activity */
- node->req += bytes;
+ if (!node || !node->iorate) {
+ spin_unlock_irq(&iot->lock);
+ return;
+ }

- /* Evaluate if we need to throttle the current process */
+ /* Add tokens for time elapsed since last read */
delta = (long)jiffies - (long)node->last_request;
- if (!delta)
- goto out;
+ if (delta) {
+ node->last_request = jiffies;
+ node->t += (node->iorate * 1024 * delta) / HZ;

- t = msecs_to_jiffies(node->req / node->iorate);
- if (!t)
- goto out;
+ if (node->t > node->bucket_size)
+ node->t = node->bucket_size;
+ }

- sleep = t - delta;
- if (unlikely(sleep > 0)) {
- spin_unlock_irq(&iot->lock);
- if (__cant_sleep())
- return;
- pr_debug("io-throttle: task %p (%s) must sleep %lu jiffies\n",
- current, current->comm, sleep);
- schedule_timeout_killable(sleep);
- return;
+ /* Account the I/O activity */
+ node->t -= bytes;
+
+ if (node->t < 0) {
+ sleep = (-node->t) * HZ / (node->iorate * 1024);
+ } else {
+ sleep = 0;
}

- /* Reset I/O accounting */
- node->req = 0;
- node->last_request = jiffies;
-out:
spin_unlock_irq(&iot->lock);
+
+ if (sleep && !__cant_sleep()) {
+ pr_debug("io-throttle: %s[%d] must sleep %ld jiffies\n",
+ current->comm, current->pid, sleep);
+
+ schedule_timeout_killable(sleep);
+ }
}
EXPORT_SYMBOL(cgroup_io_account);

--
Carl Henrik
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/