Re: [PATCH 04/10] block: initial patch for on-stack per-task plugging

From: Jens Axboe
Date: Fri Mar 18 2011 - 08:55:08 EST


On 2011-03-18 07:36, Shaohua Li wrote:
> On Thu, 2011-03-17 at 17:43 +0800, Jens Axboe wrote:
>> On 2011-03-17 02:00, Shaohua Li wrote:
>>> On Thu, 2011-03-17 at 01:31 +0800, Vivek Goyal wrote:
>>>> On Wed, Mar 16, 2011 at 04:18:30PM +0800, Shaohua Li wrote:
>>>>> 2011/1/22 Jens Axboe <jaxboe@xxxxxxxxxxxx>:
>>>>>> Signed-off-by: Jens Axboe <jaxboe@xxxxxxxxxxxx>
>>>>>> ---
>>>>>> block/blk-core.c | 357 ++++++++++++++++++++++++++++++++------------
>>>>>> block/elevator.c | 6 +-
>>>>>> include/linux/blk_types.h | 2 +
>>>>>> include/linux/blkdev.h | 30 ++++
>>>>>> include/linux/elevator.h | 1 +
>>>>>> include/linux/sched.h | 6 +
>>>>>> kernel/exit.c | 1 +
>>>>>> kernel/fork.c | 3 +
>>>>>> kernel/sched.c | 11 ++-
>>>>>> 9 files changed, 317 insertions(+), 100 deletions(-)
>>>>>>
>>>>>> diff --git a/block/blk-core.c b/block/blk-core.c
>>>>>> index 960f12c..42dbfcc 100644
>>>>>> --- a/block/blk-core.c
>>>>>> +++ b/block/blk-core.c
>>>>>> @@ -27,6 +27,7 @@
>>>>>> #include <linux/writeback.h>
>>>>>> #include <linux/task_io_accounting_ops.h>
>>>>>> #include <linux/fault-inject.h>
>>>>>> +#include <linux/list_sort.h>
>>>>>>
>>>>>> #define CREATE_TRACE_POINTS
>>>>>> #include <trace/events/block.h>
>>>>>> @@ -213,7 +214,7 @@ static void blk_delay_work(struct work_struct *work)
>>>>>>
>>>>>> q = container_of(work, struct request_queue, delay_work.work);
>>>>>> spin_lock_irq(q->queue_lock);
>>>>>> - q->request_fn(q);
>>>>>> + __blk_run_queue(q);
>>>>>> spin_unlock_irq(q->queue_lock);
>>>>>> }
>>>>> Hi Jens,
>>>>> I have some questions about the per-task plugging. Since the request
>>>>> list is per-task, and each task delivers its requests at finish flush
>>>>> or schedule. But when one cpu delivers requests to global queue, other
>>>>> cpus don't know. This seems to have problem. For example:
>>>>> 1. get_request_wait() can only flush current task's request list,
>>>>> other cpus/tasks might still have a lot of requests, which aren't sent
>>>>> to request_queue.
>>>>
>>>> But very soon these requests will be sent to request queue as soon task
>>>> is either scheduled out or task explicitly flushes the plug? So we might
>>>> wait a bit longer but that might not matter in general, i guess.
>>> Yes, I understand there is just a bit delay. I don't know how severe it
>>> is, but this still could be a problem, especially for fast storage or
>>> random I/O. My current tests show slight regression (3% or so) with
>>> Jens's for 2.6.39/core branch. I'm still checking if it's caused by the
>>> per-task plug, but the per-task plug is highly suspected.
>>
>> To check this particular case, you can always just bump the request
>> limit. What test is showing a slowdown?
> this is a simple multi-threaded seq read. The issue tends to be request
> merge related (not verified yet). The merge reduces about 60% with stack
> plug from fio reported data. From trace, without stack plug, requests
> from different threads get merged. But with it, such merge is impossible
> because flush_plug doesn't check merge, I thought we need add it again.

What we could try is have the plug flush insert be
ELEVATOR_INSERT_SORT_MERGE and have it lookup potential backmerges.

Here's a quick hack that does that, I have not tested it at all.

diff --git a/block/blk-core.c b/block/blk-core.c
index e1fcf7a..5256932 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2685,7 +2685,7 @@ static void flush_plug_list(struct blk_plug *plug)
/*
* rq is already accounted, so use raw insert
*/
- __elv_add_request(q, rq, ELEVATOR_INSERT_SORT);
+ __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE);
}

if (q) {
diff --git a/block/blk-merge.c b/block/blk-merge.c
index ea85e20..cfcc37c 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -465,3 +465,9 @@ int attempt_front_merge(struct request_queue *q, struct request *rq)

return 0;
}
+
+int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
+ struct request *next)
+{
+ return attempt_merge(q, rq, next);
+}
diff --git a/block/blk.h b/block/blk.h
index 49d21af..c8db371 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -103,6 +103,8 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
struct bio *bio);
int attempt_back_merge(struct request_queue *q, struct request *rq);
int attempt_front_merge(struct request_queue *q, struct request *rq);
+int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
+ struct request *next);
void blk_recalc_rq_segments(struct request *rq);
void blk_rq_set_mixed_merge(struct request *rq);

diff --git a/block/elevator.c b/block/elevator.c
index 542ce82..f493e18 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -521,6 +521,33 @@ int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
return ELEVATOR_NO_MERGE;
}

+/*
+ * Returns true if we merged, false otherwise
+ */
+static bool elv_attempt_insert_merge(struct request_queue *q,
+ struct request *rq)
+{
+ struct request *__rq;
+
+ if (blk_queue_nomerges(q) || blk_queue_noxmerges(q))
+ return false;
+
+ /*
+ * First try one-hit cache.
+ */
+ if (q->last_merge && blk_attempt_req_merge(q, rq, q->last_merge))
+ return true;
+
+ /*
+ * See if our hash lookup can find a potential backmerge.
+ */
+ __rq = elv_rqhash_find(q, blk_rq_pos(rq));
+ if (__rq && blk_attempt_req_merge(q, rq, __rq))
+ return true;
+
+ return false;
+}
+
void elv_merged_request(struct request_queue *q, struct request *rq, int type)
{
struct elevator_queue *e = q->elevator;
@@ -647,6 +674,9 @@ void elv_insert(struct request_queue *q, struct request *rq, int where)
__blk_run_queue(q, false);
break;

+ case ELEVATOR_INSERT_SORT_MERGE:
+ if (elv_attempt_insert_merge(q, rq))
+ break;
case ELEVATOR_INSERT_SORT:
BUG_ON(rq->cmd_type != REQ_TYPE_FS &&
!(rq->cmd_flags & REQ_DISCARD));
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index ec6f72b..d93efcc445 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -166,6 +166,7 @@ extern struct request *elv_rb_find(struct rb_root *, sector_t);
#define ELEVATOR_INSERT_SORT 3
#define ELEVATOR_INSERT_REQUEUE 4
#define ELEVATOR_INSERT_FLUSH 5
+#define ELEVATOR_INSERT_SORT_MERGE 6

/*
* return values from elevator_may_queue_fn


--
Jens Axboe

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/