[RFC PATCH] cfq-iosched: limit slice_idle when many busy queues are in idle window

From: Tomoki Sekiyama
Date: Tue Jul 30 2013 - 15:36:58 EST


Hi,

When an application launches several hundred processes that each issue
only a few small sync I/O requests, CFQ may cause heavy latencies
(10+ seconds in the worst case), even though the request rate is low enough
for the disk to handle it without waiting. This is because CFQ waits for
slice_idle (default: 8ms) every time before processing each request, until
the queues' think times have been evaluated.

This scenario can be reproduced using fio with parameters below:
fio -filename=/tmp/test -rw=randread -size=5G -runtime=15 -name=file1 \
-bs=4k -numjobs=500 -thinktime=1000000
In this case, 500 processes issue a random read request every second.

This problem can be avoided by setting slice_idle to 0, but doing so
risks hurting throughput on S-ATA disks.

This patch tries to reduce the effect of slice_idle automatically when a
lot of busy queues are waiting in the idle window.
It adds to cfq_data a counter (busy_idle_queues) of queues in the idle
window that have I/O requests. If (busy_idle_queues * slice_idle) exceeds
the slice allocated to the group, the idle wait time is limited to
(group_slice / busy_idle_queues).

Without this patch, the fio benchmark with the parameters above, run against
an ext4 partition on an S-ATA HDD, results in:
read : io=20140KB, bw=1258.5KB/s, iops=314 , runt= 16004msec
clat (usec): min=4 , max=6494.9K, avg=541264.54, stdev=993834.12

With this patch:
read : io=28040KB, bw=1750.1KB/s, iops=437 , runt= 16014msec
clat (usec): min=4 , max=2837.2K, avg=110236.79, stdev=303351.72

Average latency is reduced by 80%, and max is also reduced by 56%.

Any comments are appreciated.

Signed-off-by: Tomoki Sekiyama <tomoki.sekiyama@xxxxxxx>
---
block/cfq-iosched.c | 36 +++++++++++++++++++++++++++++++-----
1 file changed, 31 insertions(+), 5 deletions(-)

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index d5cd313..77ac27e80 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -329,6 +329,7 @@ struct cfq_data {

unsigned int busy_queues;
unsigned int busy_sync_queues;
+ unsigned int busy_idle_queues; /* busy but with idle window */

int rq_in_driver;
int rq_in_flight[2];
@@ -446,6 +447,20 @@ CFQ_CFQQ_FNS(deep);
CFQ_CFQQ_FNS(wait_busy);
#undef CFQ_CFQQ_FNS

+static inline void cfq_set_cfqq_idle_window(struct cfq_data *cfqd,
+ struct cfq_queue *cfqq, bool idle)
+{ /* set (idle) or clear (!idle) cfqq's idle_window flag, keeping cfqd->busy_idle_queues in step */
+ if (idle) {
+ cfq_mark_cfqq_idle_window(cfqq);
+ if (cfq_cfqq_on_rr(cfqq)) /* queues not on rr are skipped here; cfq_add_cfqq_rr() counts flagged queues itself */
+ cfqd->busy_idle_queues++;
+ } else {
+ cfq_clear_cfqq_idle_window(cfqq);
+ if (cfq_cfqq_on_rr(cfqq)) /* cfq_del_cfqq_rr() only decrements for queues that still carry the flag */
+ cfqd->busy_idle_queues--; /* NOTE(review): unsigned counter, previous flag state is not checked here - call only on a real transition */
+ }
+}
+
static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd)
{
return pd ? container_of(pd, struct cfq_group, pd) : NULL;
@@ -2164,6 +2179,8 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
cfqd->busy_queues++;
if (cfq_cfqq_sync(cfqq))
cfqd->busy_sync_queues++;
+ if (cfq_cfqq_idle_window(cfqq))
+ cfqd->busy_idle_queues++;

cfq_resort_rr_list(cfqd, cfqq);
}
@@ -2192,6 +2209,8 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
cfqd->busy_queues--;
if (cfq_cfqq_sync(cfqq))
cfqd->busy_sync_queues--;
+ if (cfq_cfqq_idle_window(cfqq))
+ cfqd->busy_idle_queues--;
}

/*
@@ -2761,6 +2780,16 @@ static void cfq_arm_slice_timer(struct cfq_data *cfqd)
else
sl = cfqd->cfq_slice_idle;

+ /*
+ * If there are too many queues with idle window, slice idle can cause
+ * unacceptable latency. Then we reduce slice idle here.
+ */
+ if (cfqd->busy_idle_queues) {
+ unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
+ unsigned long limit = group_slice / cfqd->busy_idle_queues;
+ sl = min(sl, limit);
+ }
+
mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
cfqg_stats_set_start_idle_time(cfqq->cfqg);
cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
@@ -3091,7 +3120,7 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
(cfq_cfqq_slice_new(cfqq) ||
(cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) {
cfq_clear_cfqq_deep(cfqq);
- cfq_clear_cfqq_idle_window(cfqq);
+ cfq_set_cfqq_idle_window(cfqd, cfqq, false);
}

if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
@@ -3742,10 +3771,7 @@ cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,

if (old_idle != enable_idle) {
cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
- if (enable_idle)
- cfq_mark_cfqq_idle_window(cfqq);
- else
- cfq_clear_cfqq_idle_window(cfqq);
+ cfq_set_cfqq_idle_window(cfqd, cfqq, enable_idle);
}
}


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/