Re: [PATCH 08/10] blk-mq-sched: add framework for MQ capable IO schedulers

From: Bart Van Assche
Date: Thu Jan 12 2017 - 16:45:37 EST


On Wed, 2017-01-11 at 14:40 -0700, Jens Axboe wrote:
> @@ -451,11 +456,11 @@ void blk_insert_flush(struct request *rq)
> * processed directly without going through flush machinery. Queue
> * for normal execution.
> */
> - if ((policy & REQ_FSEQ_DATA) &&
> - !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
> - if (q->mq_ops) {
> - blk_mq_insert_request(rq, false, true, false);
> - } else
> + if (((policy & REQ_FSEQ_DATA) &&
> + !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH)))) {
> + if (q->mq_ops)
> + blk_mq_sched_insert_request(rq, false, true, false);
> + else
> list_add_tail(&rq->queuelist, &q->queue_head);
> return;
> }

Not that it really matters, but this change adds a pair of parentheses --
"if (e)" is changed into "if ((e))". Is this necessary?

> +void blk_mq_sched_free_hctx_data(struct request_queue *q,
> + void (*exit)(struct blk_mq_hw_ctx *))
> +{
> + struct blk_mq_hw_ctx *hctx;
> + int i;
> +
> + queue_for_each_hw_ctx(q, hctx, i) {
> + if (exit)
> + exit(hctx);
> + kfree(hctx->sched_data);
> + hctx->sched_data = NULL;
> + }
> +}
> +EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
> +
> +int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
> + int (*init)(struct blk_mq_hw_ctx *),
> + void (*exit)(struct blk_mq_hw_ctx *))
> +{
> + struct blk_mq_hw_ctx *hctx;
> + int ret;
> + int i;
> +
> + queue_for_each_hw_ctx(q, hctx, i) {
> + hctx->sched_data = kmalloc_node(size, GFP_KERNEL, hctx->numa_node);
> + if (!hctx->sched_data) {
> + ret = -ENOMEM;
> + goto error;
> + }
> +
> + if (init) {
> + ret = init(hctx);
> + if (ret) {
> + /*
> + * We don't want to give exit() a partially
> + * initialized sched_data. init() must clean up
> + * if it fails.
> + */
> + kfree(hctx->sched_data);
> + hctx->sched_data = NULL;
> + goto error;
> + }
> + }
> + }
> +
> + return 0;
> +error:
> + blk_mq_sched_free_hctx_data(q, exit);
> + return ret;
> +}

If one of the init() calls made by blk_mq_sched_init_hctx_data() fails, then
blk_mq_sched_free_hctx_data() will call exit() even for hctxs for which
init() has not been called. How about changing "if (exit)" into "if (exit &&
hctx->sched_data)" so that exit() is only called for hctxs for which
init() has been called?

> +struct request *blk_mq_sched_get_request(struct request_queue *q,
> + struct bio *bio,
> + unsigned int op,
> + struct blk_mq_alloc_data *data)
> +{
> + struct elevator_queue *e = q->elevator;
> + struct blk_mq_hw_ctx *hctx;
> + struct blk_mq_ctx *ctx;
> + struct request *rq;
> +
> + blk_queue_enter_live(q);
> + ctx = blk_mq_get_ctx(q);
> + hctx = blk_mq_map_queue(q, ctx->cpu);
> +
> + blk_mq_set_alloc_data(data, q, 0, ctx, hctx);
> +
> + if (e) {
> + data->flags |= BLK_MQ_REQ_INTERNAL;
> + if (e->type->ops.mq.get_request)
> + rq = e->type->ops.mq.get_request(q, op, data);
> + else
> + rq = __blk_mq_alloc_request(data, op);
> + } else {
> + rq = __blk_mq_alloc_request(data, op);
> + if (rq) {
> + rq->tag = rq->internal_tag;
> + rq->internal_tag = -1;
> + }
> + }
> +
> + if (rq) {
> + rq->elv.icq = NULL;
> + if (e && e->type->icq_cache)
> + blk_mq_sched_assign_ioc(q, rq, bio);
> + data->hctx->queued++;
> + return rq;
> + }
> +
> + blk_queue_exit(q);
> + return NULL;
> +}

The sequence "rq->tag = rq->internal_tag; rq->internal_tag = -1;" occurs not
only here but also in blk_mq_alloc_request_hctx(). Has moving that code into
__blk_mq_alloc_request() been considered?

> @@ -223,14 +225,17 @@ struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
>
> tag = blk_mq_get_tag(data);
> if (tag != BLK_MQ_TAG_FAIL) {
> - rq = data->hctx->tags->rqs[tag];
> + struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
> +
> + rq = tags->rqs[tag];
>
> if (blk_mq_tag_busy(data->hctx)) {
> rq->rq_flags = RQF_MQ_INFLIGHT;
> atomic_inc(&data->hctx->nr_active);
> }
>
> - rq->tag = tag;
> + rq->tag = -1;
> + rq->internal_tag = tag;
> blk_mq_rq_ctx_init(data->q, data->ctx, rq, op);
> return rq;
> }

How about using the following code for tag assignment instead of "rq->tag =
-1; rq->internal_tag = tag"?

if (data->flags & BLK_MQ_REQ_INTERNAL) {
rq->tag = -1;
rq->internal_tag = tag;
} else {
rq->tag = tag;
rq->internal_tag = -1;
}

> @@ -313,6 +313,9 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
> goto out_queue_exit;
> }
>
> + rq->tag = rq->internal_tag;
> + rq->internal_tag = -1;
> +
> return rq;
>
> out_queue_exit:
> @@ -321,10 +324,10 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw,
> }
> EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);

Should something like "WARN_ON_ONCE(flags & BLK_MQ_REQ_INTERNAL)" be added
at the start of this function to catch BLK_MQ_REQ_INTERNAL being passed in
from outside the block layer?

Bart.