Re: [PATCH] blk-mq: Use request queue-wide tags for tagset-wide sbitmap

From: Ming Lei
Date: Thu May 06 2021 - 04:33:04 EST


On Mon, May 03, 2021 at 06:22:13PM +0800, John Garry wrote:
> The tags used for an IO scheduler are currently per hctx.
>
> As such, when q->nr_hw_queues grows, so does the request queue total IO
> scheduler tag depth.
>
> This may cause problems for SCSI MQ HBAs whose total driver depth is
> fixed.
>
> Ming and Yanhui report higher CPU usage and lower throughput in scenarios
> where the fixed total driver tag depth is appreciably lower than the total
> scheduler tag depth:
> https://lore.kernel.org/linux-block/440dfcfc-1a2c-bd98-1161-cec4d78c6dfc@xxxxxxxxxx/T/#mc0d6d4f95275a2743d1c8c3e4dc9ff6c9aa3a76b
>
> In that scenario, since the scheduler tag is allocated first, much
> contention is introduced, because a driver tag may not be available
> after we have got the sched tag.
>
> Improve this scenario by introducing request queue-wide tags for when
> a tagset-wide sbitmap is used. The static sched requests are still
> allocated per hctx, as requests are initialised per hctx, as in
> blk_mq_init_request(..., hctx_idx, ...) ->
> set->ops->init_request(.., hctx_idx, ...).
>
> For simplicity of resizing the request queue sbitmap when updating the
> request queue depth, just init at the max possible size, so we don't
> need to deal with the possibility of swapping out the old sbitmap for a
> new one if we need to grow.
>
> Signed-off-by: John Garry <john.garry@xxxxxxxxxx>
>
> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> index e1e997af89a0..121207abc026 100644
> --- a/block/blk-mq-sched.c
> +++ b/block/blk-mq-sched.c
> @@ -497,11 +497,9 @@ static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
> struct blk_mq_hw_ctx *hctx,
> unsigned int hctx_idx)
> {
> - unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
> -
> if (hctx->sched_tags) {
> blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
> - blk_mq_free_rq_map(hctx->sched_tags, flags);
> + blk_mq_free_rq_map(hctx->sched_tags, set->flags);
> hctx->sched_tags = NULL;
> }
> }
> @@ -511,12 +509,10 @@ static int blk_mq_sched_alloc_tags(struct request_queue *q,
> unsigned int hctx_idx)
> {
> struct blk_mq_tag_set *set = q->tag_set;
> - /* Clear HCTX_SHARED so tags are init'ed */
> - unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
> int ret;
>
> hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
> - set->reserved_tags, flags);
> + set->reserved_tags, set->flags);
> if (!hctx->sched_tags)
> return -ENOMEM;
>
> @@ -534,11 +530,8 @@ static void blk_mq_sched_tags_teardown(struct request_queue *q)
> int i;
>
> queue_for_each_hw_ctx(q, hctx, i) {
> - /* Clear HCTX_SHARED so tags are freed */
> - unsigned int flags = hctx->flags & ~BLK_MQ_F_TAG_HCTX_SHARED;
> -
> if (hctx->sched_tags) {
> - blk_mq_free_rq_map(hctx->sched_tags, flags);
> + blk_mq_free_rq_map(hctx->sched_tags, hctx->flags);
> hctx->sched_tags = NULL;
> }
> }
> @@ -568,12 +561,25 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
> queue_for_each_hw_ctx(q, hctx, i) {
> ret = blk_mq_sched_alloc_tags(q, hctx, i);
> if (ret)
> - goto err;
> + goto err_free_tags;
> + }
> +
> + if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) {
> + ret = blk_mq_init_sched_shared_sbitmap(q);
> + if (ret)
> + goto err_free_tags;
> +
> + queue_for_each_hw_ctx(q, hctx, i) {
> + hctx->sched_tags->bitmap_tags =
> + q->sched_bitmap_tags;
> + hctx->sched_tags->breserved_tags =
> + q->sched_breserved_tags;
> + }
> }
>
> ret = e->ops.init_sched(q, e);
> if (ret)
> - goto err;
> + goto err_free_sbitmap;
>
> blk_mq_debugfs_register_sched(q);
>
> @@ -584,6 +590,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
> eq = q->elevator;
> blk_mq_sched_free_requests(q);
> blk_mq_exit_sched(q, eq);
> + blk_mq_exit_sched_shared_sbitmap(q);

blk_mq_exit_sched_shared_sbitmap() has already been called in blk_mq_exit_sched() just above, so this extra call runs the shared sbitmap teardown twice.
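
Something like the below should be enough for that failure path, i.e. rely
on blk_mq_exit_sched() (with your change to it further down) to free the
shared sched sbitmap (just a sketch):

	eq = q->elevator;
	blk_mq_sched_free_requests(q);
	blk_mq_exit_sched(q, eq);	/* also frees the shared sched sbitmap */
	kobject_put(&eq->kobj);
	return ret;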

> kobject_put(&eq->kobj);
> return ret;
> }
> @@ -593,7 +600,10 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
>
> return 0;
>
> -err:
> +err_free_sbitmap:
> + if (blk_mq_is_sbitmap_shared(q->tag_set->flags))
> + blk_mq_exit_sched_shared_sbitmap(q);
> +err_free_tags:
> blk_mq_sched_free_requests(q);
> blk_mq_sched_tags_teardown(q);
> q->elevator = NULL;
> @@ -631,5 +641,7 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
> if (e->type->ops.exit_sched)
> e->type->ops.exit_sched(e);
> blk_mq_sched_tags_teardown(q);
> + if (blk_mq_is_sbitmap_shared(q->tag_set->flags))
> + blk_mq_exit_sched_shared_sbitmap(q);
> q->elevator = NULL;
> }
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index 2a37731e8244..734fedceca7d 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -466,19 +466,40 @@ static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
> return -ENOMEM;
> }
>
> -int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int flags)
> +static int __blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
> + struct sbitmap_queue *breserved_tags,
> + struct blk_mq_tag_set *set,
> + unsigned int queue_depth,
> + unsigned int reserved)
> {
> - unsigned int depth = set->queue_depth - set->reserved_tags;
> + unsigned int depth = queue_depth - reserved;
> int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
> bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR;
> - int i, node = set->numa_node;
>
> - if (bt_alloc(&set->__bitmap_tags, depth, round_robin, node))
> + if (bt_alloc(bitmap_tags, depth, round_robin, set->numa_node))
> return -ENOMEM;
> - if (bt_alloc(&set->__breserved_tags, set->reserved_tags,
> - round_robin, node))
> + if (bt_alloc(breserved_tags, set->reserved_tags,
> + round_robin, set->numa_node))
> goto free_bitmap_tags;
>
> + return 0;
> +
> +free_bitmap_tags:
> + sbitmap_queue_free(bitmap_tags);
> + return -ENOMEM;
> +}
> +
> +int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set)

IMO, this function should be named blk_mq_init_shared_tags and moved to
blk-mq-sched.c.
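
Something like the below is what I have in mind (a sketch only, with the
body kept as in your patch; the hctx loop body is from my memory of the
current code, so please double check it):

int blk_mq_init_shared_tags(struct blk_mq_tag_set *set)
{
	int i, ret;

	ret = __blk_mq_init_bitmaps(&set->__bitmap_tags,
				    &set->__breserved_tags,
				    set, set->queue_depth,
				    set->reserved_tags);
	if (ret)
		return ret;

	/* point every hctx's tags at the tagset-wide bitmaps */
	for (i = 0; i < set->nr_hw_queues; i++) {
		struct blk_mq_tags *tags = set->tags[i];

		tags->bitmap_tags = &set->__bitmap_tags;
		tags->breserved_tags = &set->__breserved_tags;
	}

	return 0;
}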

> +{
> + int i, ret;
> +
> + ret = __blk_mq_init_bitmaps(&set->__bitmap_tags,
> + &set->__breserved_tags,
> + set, set->queue_depth,
> + set->reserved_tags);
> + if (ret)
> + return ret;
> +
> for (i = 0; i < set->nr_hw_queues; i++) {
> struct blk_mq_tags *tags = set->tags[i];
>
> @@ -487,9 +508,6 @@ int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int flags)
> }
>
> return 0;
> -free_bitmap_tags:
> - sbitmap_queue_free(&set->__bitmap_tags);
> - return -ENOMEM;
> }
>
> void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set)
> @@ -498,6 +516,52 @@ void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set)
> sbitmap_queue_free(&set->__breserved_tags);
> }
>
> +#define MAX_SCHED_RQ (16 * BLKDEV_MAX_RQ)
> +
> +int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue)
> +{
> + struct blk_mq_tag_set *set = queue->tag_set;
> + int ret;
> +
> + queue->sched_bitmap_tags =
> + kmalloc(sizeof(*queue->sched_bitmap_tags), GFP_KERNEL);
> + queue->sched_breserved_tags =
> + kmalloc(sizeof(*queue->sched_breserved_tags), GFP_KERNEL);
> + if (!queue->sched_bitmap_tags || !queue->sched_breserved_tags)
> + goto err;

The two sbitmap queues can be embedded into 'struct request_queue', so
that we can avoid re-allocating them on every elevator switch.
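
Something like the following is what I mean (a sketch only; I am reusing
the field names from your patch, and MAX_SCHED_RQ as the init depth is
just an assumption):

struct request_queue {
	...
	/* allocated with the queue, reused across elevator switches */
	struct sbitmap_queue	sched_bitmap_tags;
	struct sbitmap_queue	sched_breserved_tags;
	...
};

Then blk_mq_init_sched_shared_sbitmap() only needs to initialise them in
place:

	ret = __blk_mq_init_bitmaps(&queue->sched_bitmap_tags,
				    &queue->sched_breserved_tags,
				    set, MAX_SCHED_RQ,
				    set->reserved_tags);

so the kmalloc()/kfree() pair and the NULL checks go away, and the
per-hctx assignments in blk_mq_init_sched() just take the address of the
embedded fields, e.g. hctx->sched_tags->bitmap_tags = &q->sched_bitmap_tags.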

I will ask Yanhui to test the patch and see if it can make a difference.


Thanks,
Ming