Re: [PATCH 09/14] blk-mq: ensure that plug lists don't straddle hardware queues
From: Ming Lei
Date: Tue Oct 30 2018 - 04:09:14 EST
On Mon, Oct 29, 2018 at 01:49:18PM -0600, Jens Axboe wrote:
> On 10/29/18 1:30 PM, Jens Axboe wrote:
> > On 10/29/18 1:27 PM, Bart Van Assche wrote:
> >> On Mon, 2018-10-29 at 10:37 -0600, Jens Axboe wrote:
> >>> void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
> >>> {
> >>> struct blk_mq_ctx *this_ctx;
> >>> @@ -1628,7 +1649,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
> >>> struct request *rq;
> >>> LIST_HEAD(list);
> >>> LIST_HEAD(ctx_list);
> >>> - unsigned int depth;
> >>> + unsigned int depth, this_flags;
> >>>
> >>> list_splice_init(&plug->mq_list, &list);
> >>>
> >>> @@ -1636,13 +1657,14 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
> >>>
> >>> this_q = NULL;
> >>> this_ctx = NULL;
> >>> + this_flags = 0;
> >>> depth = 0;
> >>>
> >>> while (!list_empty(&list)) {
> >>> rq = list_entry_rq(list.next);
> >>> list_del_init(&rq->queuelist);
> >>> BUG_ON(!rq->q);
> >>> - if (rq->mq_ctx != this_ctx) {
> >>> + if (!ctx_match(rq, this_ctx, this_flags)) {
> >>> if (this_ctx) {
> >>> trace_block_unplug(this_q, depth, !from_schedule);
> >>> blk_mq_sched_insert_requests(this_q, this_ctx,
> >>> @@ -1650,6 +1672,7 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
> >>> from_schedule);
> >>> }
> >>>
> >>> + this_flags = rq->cmd_flags;
> >>> this_ctx = rq->mq_ctx;
> >>> this_q = rq->q;
> >>> depth = 0;
> >>
> >> This patch will cause the function stored in the flags_to_type pointer to be
> >> called 2 * (n - 1) times, where n is the number of elements in 'list' when
> >> blk_mq_sched_insert_requests() is called. Have you considered rearranging
> >> the code so that the number of calls is reduced to n?
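
Just to illustrate the "reduce to n" idea with a rough, untested sketch
against the loop quoted above: resolve the hw queue once per request and
key the batch on both the ctx and the resolved hctx, with the final flush
after the loop left as-is. 'hctx' and 'this_hctx' are hypothetical locals
here; the updated patch further below avoids the lookup entirely by
caching the hctx in the request.

	while (!list_empty(&list)) {
		rq = list_entry_rq(list.next);
		list_del_init(&rq->queuelist);
		/* one mapping lookup per request, instead of two per ctx_match() */
		hctx = blk_mq_map_queue(rq->q, rq->cmd_flags, rq->mq_ctx->cpu);
		if (rq->mq_ctx != this_ctx || hctx != this_hctx) {
			if (this_ctx) {
				trace_block_unplug(this_q, depth, !from_schedule);
				blk_mq_sched_insert_requests(this_q, this_ctx,
							     &ctx_list,
							     from_schedule);
			}
			this_ctx = rq->mq_ctx;
			this_hctx = hctx;
			this_q = rq->q;
			depth = 0;
		}
		depth++;
		list_add_tail(&rq->queuelist, &ctx_list);
	}
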
> >
> > One alternative is to improve the sorting, but then we'd need to call
> > the queue mapping in there instead. My longer term plan is something
> > like the below, which will reduce the number of mapping calls in
> > general, not just for this path. But that is a separate change and
> > should not be mixed with this one.
>
> Here's an updated one that applies on top of the current tree, and
> also uses the cached hctx to sort efficiently. This eliminates the
> extra mapping lookups, and also makes the whole thing clearer.
>
> I'll split this into two patches; I just didn't want to include it in
> the series yet.
>
>
> diff --git a/block/blk-flush.c b/block/blk-flush.c
> index 7922dba81497..397985808b75 100644
> --- a/block/blk-flush.c
> +++ b/block/blk-flush.c
> @@ -219,7 +219,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
>
> /* release the tag's ownership to the req cloned from */
> spin_lock_irqsave(&fq->mq_flush_lock, flags);
> - hctx = blk_mq_map_queue(q, flush_rq->cmd_flags, flush_rq->mq_ctx->cpu);
> + hctx = flush_rq->mq_hctx;
> if (!q->elevator) {
> blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
> flush_rq->tag = -1;
> @@ -307,13 +307,13 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
> if (!q->elevator) {
> fq->orig_rq = first_rq;
> flush_rq->tag = first_rq->tag;
> - hctx = blk_mq_map_queue(q, first_rq->cmd_flags,
> - first_rq->mq_ctx->cpu);
> + hctx = flush_rq->mq_hctx;
> blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq);
> } else {
> flush_rq->internal_tag = first_rq->internal_tag;
> }
>
> + flush_rq->mq_hctx = first_rq->mq_hctx;
> flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
> flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK);
> flush_rq->rq_flags |= RQF_FLUSH_SEQ;
> @@ -326,13 +326,11 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
> static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
> {
> struct request_queue *q = rq->q;
> - struct blk_mq_hw_ctx *hctx;
> + struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
> struct blk_mq_ctx *ctx = rq->mq_ctx;
> unsigned long flags;
> struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx);
>
> - hctx = blk_mq_map_queue(q, rq->cmd_flags, ctx->cpu);
> -
> if (q->elevator) {
> WARN_ON(rq->tag < 0);
> blk_mq_put_driver_tag_hctx(hctx, rq);
> diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
> index fac70c81b7de..cde19be36135 100644
> --- a/block/blk-mq-debugfs.c
> +++ b/block/blk-mq-debugfs.c
> @@ -427,10 +427,8 @@ struct show_busy_params {
> static void hctx_show_busy_rq(struct request *rq, void *data, bool reserved)
> {
> const struct show_busy_params *params = data;
> - struct blk_mq_hw_ctx *hctx;
>
> - hctx = blk_mq_map_queue(rq->q, rq->cmd_flags, rq->mq_ctx->cpu);
> - if (hctx == params->hctx)
> + if (rq->mq_hctx == params->hctx)
> __blk_mq_debugfs_rq_show(params->m,
> list_entry_rq(&rq->queuelist));
> }
> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> index d232ecf3290c..25c558358255 100644
> --- a/block/blk-mq-sched.c
> +++ b/block/blk-mq-sched.c
> @@ -367,9 +367,7 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head,
> struct request_queue *q = rq->q;
> struct elevator_queue *e = q->elevator;
> struct blk_mq_ctx *ctx = rq->mq_ctx;
> - struct blk_mq_hw_ctx *hctx;
> -
> - hctx = blk_mq_map_queue(q, rq->cmd_flags, ctx->cpu);
> + struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
>
> /* flush rq in flush machinery need to be dispatched directly */
> if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) {
> @@ -399,16 +397,10 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head,
> }
>
> void blk_mq_sched_insert_requests(struct request_queue *q,
> - struct blk_mq_ctx *ctx,
> + struct blk_mq_hw_ctx *hctx,
> struct list_head *list, bool run_queue_async)
> {
> - struct blk_mq_hw_ctx *hctx;
> struct elevator_queue *e;
> - struct request *rq;
> -
> - /* For list inserts, requests better be on the same hw queue */
> - rq = list_first_entry(list, struct request, queuelist);
> - hctx = blk_mq_map_queue(q, rq->cmd_flags, ctx->cpu);
>
> e = hctx->queue->elevator;
> if (e && e->type->ops.mq.insert_requests)
> @@ -424,7 +416,7 @@ void blk_mq_sched_insert_requests(struct request_queue *q,
> if (list_empty(list))
> return;
> }
> - blk_mq_insert_requests(hctx, ctx, list);
> + blk_mq_insert_requests(hctx, list);
> }
>
> blk_mq_run_hw_queue(hctx, run_queue_async);
> diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
> index 8a9544203173..a42547213f58 100644
> --- a/block/blk-mq-sched.h
> +++ b/block/blk-mq-sched.h
> @@ -20,7 +20,7 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
> void blk_mq_sched_insert_request(struct request *rq, bool at_head,
> bool run_queue, bool async);
> void blk_mq_sched_insert_requests(struct request_queue *q,
> - struct blk_mq_ctx *ctx,
> + struct blk_mq_hw_ctx *hctx,
> struct list_head *list, bool run_queue_async);
>
> void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index 478a959357f5..fb836d818b80 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -527,14 +527,7 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
> */
> u32 blk_mq_unique_tag(struct request *rq)
> {
> - struct request_queue *q = rq->q;
> - struct blk_mq_hw_ctx *hctx;
> - int hwq = 0;
> -
> - hctx = blk_mq_map_queue(q, rq->cmd_flags, rq->mq_ctx->cpu);
> - hwq = hctx->queue_num;
> -
> - return (hwq << BLK_MQ_UNIQUE_TAG_BITS) |
> + return (rq->mq_hctx->queue_num << BLK_MQ_UNIQUE_TAG_BITS) |
> (rq->tag & BLK_MQ_UNIQUE_TAG_MASK);
> }
> EXPORT_SYMBOL(blk_mq_unique_tag);
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 37310cc55733..17ea522bd7c1 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -300,6 +300,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
> /* csd/requeue_work/fifo_time is initialized before use */
> rq->q = data->q;
> rq->mq_ctx = data->ctx;
> + rq->mq_hctx = data->hctx;
> rq->rq_flags = rq_flags;
> rq->cpu = -1;
> rq->cmd_flags = op;
> @@ -473,10 +474,11 @@ static void __blk_mq_free_request(struct request *rq)
> {
> struct request_queue *q = rq->q;
> struct blk_mq_ctx *ctx = rq->mq_ctx;
> - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->cmd_flags, ctx->cpu);
> + struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
> const int sched_tag = rq->internal_tag;
>
> blk_pm_mark_last_busy(rq);
> + rq->mq_hctx = NULL;
> if (rq->tag != -1)
> blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
> if (sched_tag != -1)
> @@ -490,7 +492,7 @@ void blk_mq_free_request(struct request *rq)
> struct request_queue *q = rq->q;
> struct elevator_queue *e = q->elevator;
> struct blk_mq_ctx *ctx = rq->mq_ctx;
> - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->cmd_flags, ctx->cpu);
> + struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
>
> if (rq->rq_flags & RQF_ELVPRIV) {
> if (e && e->type->ops.mq.finish_request)
> @@ -982,7 +984,7 @@ bool blk_mq_get_driver_tag(struct request *rq)
> {
> struct blk_mq_alloc_data data = {
> .q = rq->q,
> - .hctx = blk_mq_map_queue(rq->q, rq->cmd_flags, rq->mq_ctx->cpu),
> + .hctx = rq->mq_hctx,
> .flags = BLK_MQ_REQ_NOWAIT,
> .cmd_flags = rq->cmd_flags,
> };
> @@ -1148,7 +1150,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
>
> rq = list_first_entry(list, struct request, queuelist);
>
> - hctx = blk_mq_map_queue(rq->q, rq->cmd_flags, rq->mq_ctx->cpu);
> + hctx = rq->mq_hctx;
> if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
> break;
>
> @@ -1578,9 +1580,7 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
> */
> void blk_mq_request_bypass_insert(struct request *rq, bool run_queue)
> {
> - struct blk_mq_ctx *ctx = rq->mq_ctx;
> - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, rq->cmd_flags,
> - ctx->cpu);
> + struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
>
> spin_lock(&hctx->lock);
> list_add_tail(&rq->queuelist, &hctx->dispatch);
> @@ -1590,10 +1590,10 @@ void blk_mq_request_bypass_insert(struct request *rq, bool run_queue)
> blk_mq_run_hw_queue(hctx, false);
> }
>
> -void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
> - struct list_head *list)
> +void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct list_head *list)
>
> {
> + struct blk_mq_ctx *ctx = NULL;
> struct request *rq;
>
> /*
> @@ -1601,7 +1601,8 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
> * offline now
> */
> list_for_each_entry(rq, list, queuelist) {
> - BUG_ON(rq->mq_ctx != ctx);
> + BUG_ON(ctx && rq->mq_ctx != ctx);
> + ctx = rq->mq_ctx;
> trace_block_rq_insert(hctx->queue, rq);
> }
>
> @@ -1611,84 +1612,61 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
> spin_unlock(&ctx->lock);
> }
>
> -static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
> +static int plug_hctx_cmp(void *priv, struct list_head *a, struct list_head *b)
> {
> struct request *rqa = container_of(a, struct request, queuelist);
> struct request *rqb = container_of(b, struct request, queuelist);
>
> - return !(rqa->mq_ctx < rqb->mq_ctx ||
> - (rqa->mq_ctx == rqb->mq_ctx &&
> + return !(rqa->mq_hctx < rqb->mq_hctx ||
> + (rqa->mq_hctx == rqb->mq_hctx &&
> blk_rq_pos(rqa) < blk_rq_pos(rqb)));
> }
>
> -/*
> - * Need to ensure that the hardware queue matches, so we don't submit
> - * a list of requests that end up on different hardware queues.
> - */
> -static bool ctx_match(struct request *req, struct blk_mq_ctx *ctx,
> - unsigned int flags)
> -{
> - if (req->mq_ctx != ctx)
> - return false;
> -
> - /*
> - * If we just have one map, then we know the hctx will match
> - * if the ctx matches
> - */
> - if (req->q->tag_set->nr_maps == 1)
> - return true;
> -
> - return blk_mq_map_queue(req->q, req->cmd_flags, ctx->cpu) ==
> - blk_mq_map_queue(req->q, flags, ctx->cpu);
> -}
> -
> void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
> {
> - struct blk_mq_ctx *this_ctx;
> + struct blk_mq_hw_ctx *this_hctx;
> struct request_queue *this_q;
> struct request *rq;
> LIST_HEAD(list);
> - LIST_HEAD(ctx_list);
> - unsigned int depth, this_flags;
> + LIST_HEAD(hctx_list);
> + unsigned int depth;
>
> list_splice_init(&plug->mq_list, &list);
>
> - list_sort(NULL, &list, plug_ctx_cmp);
> + list_sort(NULL, &list, plug_hctx_cmp);
>
> this_q = NULL;
> - this_ctx = NULL;
> - this_flags = 0;
> + this_hctx = NULL;
> depth = 0;
>
> while (!list_empty(&list)) {
> rq = list_entry_rq(list.next);
> list_del_init(&rq->queuelist);
> BUG_ON(!rq->q);
> - if (!ctx_match(rq, this_ctx, this_flags)) {
> - if (this_ctx) {
> + if (rq->mq_hctx != this_hctx) {
> + if (this_hctx) {
> trace_block_unplug(this_q, depth, !from_schedule);
> - blk_mq_sched_insert_requests(this_q, this_ctx,
> - &ctx_list,
> + blk_mq_sched_insert_requests(this_q, this_hctx,
> + &hctx_list,
> from_schedule);
> }
Requests can be added to the plug list from different ctxs because of
preemption, so grouping only by hctx can put requests from different
ctxs into the same 'hctx_list'. However, blk_mq_sched_insert_requests()
requires that all requests in 'hctx_list' belong to the same ctx.
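
Just a sketch of one way to keep that assumption while still batching per
hctx: key the comparison on both pointers, keeping this_ctx as a local
next to this_hctx (and extending plug_hctx_cmp() to also order by mq_ctx
so the batches stay contiguous). Untested, only to illustrate the point:

		if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx) {
			if (this_hctx) {
				trace_block_unplug(this_q, depth, !from_schedule);
				blk_mq_sched_insert_requests(this_q, this_hctx,
							     &hctx_list,
							     from_schedule);
			}
			/* start a new batch for this (ctx, hctx) pair */
			this_hctx = rq->mq_hctx;
			this_ctx = rq->mq_ctx;
			this_q = rq->q;
			depth = 0;
		}
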
Thanks,
Ming