Re: [PATCH v2 10/11] blk-mq: Use shared tags for shared sbitmap support

From: Ming Lei
Date: Wed Aug 18 2021 - 04:17:20 EST


On Mon, Aug 09, 2021 at 10:29:37PM +0800, John Garry wrote:
> Currently we use separate sbitmap pairs and active_queues atomic_t for
> shared sbitmap support.
>
> However a full set of static requests are used per HW queue, which is
> quite wasteful, considering that the total number of requests usable at
> any given time across all HW queues is limited by the shared sbitmap depth.
>
> As such, it is considerably more memory efficient in the case of shared
> sbitmap to allocate a set of static rqs per tag set or request queue, and
> not per HW queue.
>
> So replace the sbitmap pairs and active_queues atomic_t with a shared
> tags per tagset and request queue.

This idea looks good, and it simplifies the current implementation a
bit too.

>
> Continue to use term "shared sbitmap" for now, as the meaning is known.

I guess "shared tags" is the better term.

>
> Signed-off-by: John Garry <john.garry@xxxxxxxxxx>
> ---
> block/blk-mq-sched.c | 77 ++++++++++++++++++++-----------------
> block/blk-mq-tag.c | 65 ++++++++++++-------------------
> block/blk-mq-tag.h | 4 +-
> block/blk-mq.c | 86 +++++++++++++++++++++++++-----------------
> block/blk-mq.h | 8 ++--
> include/linux/blk-mq.h | 13 +++----
> include/linux/blkdev.h | 3 +-
> 7 files changed, 131 insertions(+), 125 deletions(-)
>
> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> index ac0408ebcd47..1101a2e4da9a 100644
> --- a/block/blk-mq-sched.c
> +++ b/block/blk-mq-sched.c
> @@ -522,14 +522,19 @@ static int blk_mq_sched_alloc_map_and_request(struct request_queue *q,
> struct blk_mq_tag_set *set = q->tag_set;
> int ret;
>
> + if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) {
> + hctx->sched_tags = q->shared_sbitmap_tags;
> + return 0;
> + }
> +
> hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
> - set->reserved_tags, set->flags);
> + set->reserved_tags);
> if (!hctx->sched_tags)
> return -ENOMEM;
>
> ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
> if (ret) {
> - blk_mq_free_rq_map(hctx->sched_tags, set->flags);
> + blk_mq_free_rq_map(hctx->sched_tags);
> hctx->sched_tags = NULL;
> }
>
> @@ -544,35 +549,39 @@ static void blk_mq_sched_tags_teardown(struct request_queue *q)
>
> queue_for_each_hw_ctx(q, hctx, i) {
> if (hctx->sched_tags) {
> - blk_mq_free_rq_map(hctx->sched_tags, hctx->flags);
> + if (!blk_mq_is_sbitmap_shared(q->tag_set->flags))
> + blk_mq_free_rq_map(hctx->sched_tags);
> hctx->sched_tags = NULL;
> }
> }
> }
>
> +static void blk_mq_exit_sched_shared_sbitmap(struct request_queue *queue)
> +{
> + blk_mq_free_rq_map(queue->shared_sbitmap_tags);
> + queue->shared_sbitmap_tags = NULL;
> +}
> +
> static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue)
> {
> struct blk_mq_tag_set *set = queue->tag_set;
> - int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
> - struct blk_mq_hw_ctx *hctx;
> - int ret, i;
> + struct blk_mq_tags *tags;
> + int ret;
>
> /*
> * Set initial depth at max so that we don't need to reallocate for
> * updating nr_requests.
> */
> - ret = blk_mq_init_bitmaps(&queue->sched_bitmap_tags,
> - &queue->sched_breserved_tags,
> - MAX_SCHED_RQ, set->reserved_tags,
> - set->numa_node, alloc_policy);
> - if (ret)
> - return ret;
> + tags = queue->shared_sbitmap_tags = blk_mq_alloc_rq_map(set, 0,
> + set->queue_depth,
> + set->reserved_tags);
> + if (!queue->shared_sbitmap_tags)
> + return -ENOMEM;
>
> - queue_for_each_hw_ctx(queue, hctx, i) {
> - hctx->sched_tags->bitmap_tags =
> - &queue->sched_bitmap_tags;
> - hctx->sched_tags->breserved_tags =
> - &queue->sched_breserved_tags;
> + ret = blk_mq_alloc_rqs(set, tags, 0, set->queue_depth);
> + if (ret) {
> + blk_mq_exit_sched_shared_sbitmap(queue);
> + return ret;

There are two such patterns that allocate the rq map and the request
pool together; please factor them into one helper (such as
blk_mq_alloc_map_and_rqs()) which returns the allocated tags and
handles the failure cleanup inline, as in the sketch below. We may also
convert the current users over to this helper.
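
A rough sketch of what I mean (the name and exact signature are just a
suggestion):

struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
					     unsigned int hctx_idx,
					     unsigned int depth)
{
	struct blk_mq_tags *tags;
	int ret;

	tags = blk_mq_alloc_rq_map(set, hctx_idx, depth,
				   set->reserved_tags);
	if (!tags)
		return NULL;

	/* free the map again if the request pool allocation fails */
	ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth);
	if (ret) {
		blk_mq_free_rq_map(tags);
		return NULL;
	}
	return tags;
}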

> }
>
> blk_mq_tag_update_sched_shared_sbitmap(queue);
> @@ -580,12 +589,6 @@ static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue)
> return 0;
> }
>
> -static void blk_mq_exit_sched_shared_sbitmap(struct request_queue *queue)
> -{
> - sbitmap_queue_free(&queue->sched_bitmap_tags);
> - sbitmap_queue_free(&queue->sched_breserved_tags);
> -}
> -
> int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
> {
> struct blk_mq_hw_ctx *hctx;
> @@ -607,21 +610,21 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
> q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
> BLKDEV_DEFAULT_RQ);
>
> - queue_for_each_hw_ctx(q, hctx, i) {
> - ret = blk_mq_sched_alloc_map_and_request(q, hctx, i);
> + if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) {
> + ret = blk_mq_init_sched_shared_sbitmap(q);
> if (ret)
> - goto err_free_map_and_request;
> + return ret;
> }
>
> - if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) {
> - ret = blk_mq_init_sched_shared_sbitmap(q);
> + queue_for_each_hw_ctx(q, hctx, i) {
> + ret = blk_mq_sched_alloc_map_and_request(q, hctx, i);
> if (ret)
> goto err_free_map_and_request;
> }
>
> ret = e->ops.init_sched(q, e);
> if (ret)
> - goto err_free_sbitmap;
> + goto err_free_map_and_request;
>
> blk_mq_debugfs_register_sched(q);
>
> @@ -641,12 +644,12 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
>
> return 0;
>
> -err_free_sbitmap:
> - if (blk_mq_is_sbitmap_shared(q->tag_set->flags))
> - blk_mq_exit_sched_shared_sbitmap(q);
> err_free_map_and_request:
> blk_mq_sched_free_requests(q);
> blk_mq_sched_tags_teardown(q);
> + if (blk_mq_is_sbitmap_shared(q->tag_set->flags))
> + blk_mq_exit_sched_shared_sbitmap(q);
> +
> q->elevator = NULL;
> return ret;
> }
> @@ -660,9 +663,13 @@ void blk_mq_sched_free_requests(struct request_queue *q)
> struct blk_mq_hw_ctx *hctx;
> int i;
>
> - queue_for_each_hw_ctx(q, hctx, i) {
> - if (hctx->sched_tags)
> - blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);
> + if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) {
> + blk_mq_free_rqs(q->tag_set, q->shared_sbitmap_tags, 0);
> + } else {
> + queue_for_each_hw_ctx(q, hctx, i) {
> + if (hctx->sched_tags)
> + blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);
> + }
> }
> }
>
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index 5f06ad6efc8f..e97bbf477b96 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -27,10 +27,11 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
> if (blk_mq_is_sbitmap_shared(hctx->flags)) {
> struct request_queue *q = hctx->queue;
> struct blk_mq_tag_set *set = q->tag_set;
> + struct blk_mq_tags *tags = set->shared_sbitmap_tags;
>
> if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) &&
> !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
> - atomic_inc(&set->active_queues_shared_sbitmap);
> + atomic_inc(&tags->active_queues);
> } else {
> if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
> !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
> @@ -61,10 +62,12 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
> struct blk_mq_tag_set *set = q->tag_set;
>
> if (blk_mq_is_sbitmap_shared(hctx->flags)) {
> + struct blk_mq_tags *tags = set->shared_sbitmap_tags;
> +
> if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE,
> &q->queue_flags))
> return;
> - atomic_dec(&set->active_queues_shared_sbitmap);
> + atomic_dec(&tags->active_queues);
> } else {
> if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
> return;
> @@ -510,38 +513,16 @@ static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
> return 0;
> }
>
> -int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set)
> -{
> - int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
> - int i, ret;
> -
> - ret = blk_mq_init_bitmaps(&set->__bitmap_tags, &set->__breserved_tags,
> - set->queue_depth, set->reserved_tags,
> - set->numa_node, alloc_policy);
> - if (ret)
> - return ret;
> -
> - for (i = 0; i < set->nr_hw_queues; i++) {
> - struct blk_mq_tags *tags = set->tags[i];
> -
> - tags->bitmap_tags = &set->__bitmap_tags;
> - tags->breserved_tags = &set->__breserved_tags;
> - }
> -
> - return 0;
> -}
> -
> void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set)
> {
> - sbitmap_queue_free(&set->__bitmap_tags);
> - sbitmap_queue_free(&set->__breserved_tags);
> + blk_mq_free_rq_map(set->shared_sbitmap_tags);
> + set->shared_sbitmap_tags = NULL;
> }
>
> struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
> unsigned int reserved_tags,
> - int node, unsigned int flags)
> + int node, int alloc_policy)
> {
> - int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(flags);
> struct blk_mq_tags *tags;
>
> if (total_tags > BLK_MQ_TAG_MAX) {
> @@ -557,9 +538,6 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
> tags->nr_reserved_tags = reserved_tags;
> spin_lock_init(&tags->lock);
>
> - if (blk_mq_is_sbitmap_shared(flags))
> - return tags;
> -
> if (blk_mq_init_bitmap_tags(tags, node, alloc_policy) < 0) {
> kfree(tags);
> return NULL;
> @@ -567,12 +545,10 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
> return tags;
> }
>
> -void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags)
> +void blk_mq_free_tags(struct blk_mq_tags *tags)
> {
> - if (!blk_mq_is_sbitmap_shared(flags)) {
> - sbitmap_queue_free(tags->bitmap_tags);
> - sbitmap_queue_free(tags->breserved_tags);
> - }
> + sbitmap_queue_free(tags->bitmap_tags);
> + sbitmap_queue_free(tags->breserved_tags);
> kfree(tags);
> }
>
> @@ -604,18 +580,25 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
> if (tdepth > MAX_SCHED_RQ)
> return -EINVAL;
>
> + if (blk_mq_is_sbitmap_shared(set->flags)) {
> + /* No point in allowing this to happen */
> + if (tdepth > set->queue_depth)
> + return -EINVAL;
> + return 0;
> + }

The above looks wrong: it isn't unusual to see hardware with a small
queue depth, and meanwhile the default scheduler queue depth is
2 * set->queue_depth (e.g. a device with a queue depth of 32 gets a
default scheduler depth of 64), which this check would reject.

> +
> new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth,
> - tags->nr_reserved_tags, set->flags);
> + tags->nr_reserved_tags);
> if (!new)
> return -ENOMEM;
> ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
> if (ret) {
> - blk_mq_free_rq_map(new, set->flags);
> + blk_mq_free_rq_map(new);
> return -ENOMEM;
> }
>
> blk_mq_free_rqs(set, *tagsptr, hctx->queue_num);
> - blk_mq_free_rq_map(*tagsptr, set->flags);
> + blk_mq_free_rq_map(*tagsptr);
> *tagsptr = new;
> } else {
> /*
> @@ -631,12 +614,14 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
>
> void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int size)
> {
> - sbitmap_queue_resize(&set->__bitmap_tags, size - set->reserved_tags);
> + struct blk_mq_tags *tags = set->shared_sbitmap_tags;
> +
> + sbitmap_queue_resize(&tags->__bitmap_tags, size - set->reserved_tags);
> }
>
> void blk_mq_tag_update_sched_shared_sbitmap(struct request_queue *q)
> {
> - sbitmap_queue_resize(&q->sched_bitmap_tags,
> + sbitmap_queue_resize(q->shared_sbitmap_tags->bitmap_tags,
> q->nr_requests - q->tag_set->reserved_tags);
> }
>
> diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
> index 88f3c6485543..c9fc52ee07c4 100644
> --- a/block/blk-mq-tag.h
> +++ b/block/blk-mq-tag.h
> @@ -30,8 +30,8 @@ struct blk_mq_tags {
>
> extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
> unsigned int reserved_tags,
> - int node, unsigned int flags);
> -extern void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags);
> + int node, int alloc_policy);
> +extern void blk_mq_free_tags(struct blk_mq_tags *tags);
> extern int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
> struct sbitmap_queue *breserved_tags,
> unsigned int queue_depth,
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 4d6723cfa582..d3dd5fab3426 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2348,6 +2348,9 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
> struct blk_mq_tags *drv_tags;
> struct page *page;
>
> + if (blk_mq_is_sbitmap_shared(set->flags))
> + drv_tags = set->shared_sbitmap_tags;
> + else
> drv_tags = set->tags[hctx_idx];

Here I guess you need to avoid calling ->exit_request() twice?

>
> if (tags->static_rqs && set->ops->exit_request) {
> @@ -2377,21 +2380,20 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
> }
> }
>
> -void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags)
> +void blk_mq_free_rq_map(struct blk_mq_tags *tags)
> {
> kfree(tags->rqs);
> tags->rqs = NULL;
> kfree(tags->static_rqs);
> tags->static_rqs = NULL;
>
> - blk_mq_free_tags(tags, flags);
> + blk_mq_free_tags(tags);
> }
>
> struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
> unsigned int hctx_idx,
> unsigned int nr_tags,
> - unsigned int reserved_tags,
> - unsigned int flags)
> + unsigned int reserved_tags)
> {
> struct blk_mq_tags *tags;
> int node;
> @@ -2400,7 +2402,8 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
> if (node == NUMA_NO_NODE)
> node = set->numa_node;
>
> - tags = blk_mq_init_tags(nr_tags, reserved_tags, node, flags);
> + tags = blk_mq_init_tags(nr_tags, reserved_tags, node,
> + BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
> if (!tags)
> return NULL;
>
> @@ -2408,7 +2411,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
> GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
> node);
> if (!tags->rqs) {
> - blk_mq_free_tags(tags, flags);
> + blk_mq_free_tags(tags);
> return NULL;
> }
>
> @@ -2417,7 +2420,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
> node);
> if (!tags->static_rqs) {
> kfree(tags->rqs);
> - blk_mq_free_tags(tags, flags);
> + blk_mq_free_tags(tags);
> return NULL;
> }
>
> @@ -2859,8 +2862,14 @@ static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,
> unsigned int flags = set->flags;
> int ret = 0;
>
> + if (blk_mq_is_sbitmap_shared(flags)) {
> + set->tags[hctx_idx] = set->shared_sbitmap_tags;
> +
> + return true;
> + }
> +
> set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
> - set->queue_depth, set->reserved_tags, flags);
> + set->queue_depth, set->reserved_tags);
> if (!set->tags[hctx_idx])
> return false;
>
> @@ -2869,7 +2878,7 @@ static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,
> if (!ret)
> return true;
>
> - blk_mq_free_rq_map(set->tags[hctx_idx], flags);
> + blk_mq_free_rq_map(set->tags[hctx_idx]);
> set->tags[hctx_idx] = NULL;
> return false;
> }
> @@ -2877,11 +2886,11 @@ static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,
> static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
> unsigned int hctx_idx)
> {
> - unsigned int flags = set->flags;
> -
> if (set->tags && set->tags[hctx_idx]) {
> - blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
> - blk_mq_free_rq_map(set->tags[hctx_idx], flags);
> + if (!blk_mq_is_sbitmap_shared(set->flags)) {

I remember you hate negative checks, :-)

> + blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
> + blk_mq_free_rq_map(set->tags[hctx_idx]);

We could add a blk_mq_free_map_and_rqs() helper here, as in the sketch
below; there seem to be several instances of this pattern.
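
Something like this (again just a sketch):

static void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
				    struct blk_mq_tags *tags,
				    unsigned int hctx_idx)
{
	blk_mq_free_rqs(set, tags, hctx_idx);
	blk_mq_free_rq_map(tags);
}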

> + }
> set->tags[hctx_idx] = NULL;
> }
> }
> @@ -3348,6 +3357,21 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
> {
> int i;
>
> + if (blk_mq_is_sbitmap_shared(set->flags)) {
> + int ret;
> +
> + set->shared_sbitmap_tags = blk_mq_alloc_rq_map(set, 0,
> + set->queue_depth,
> + set->reserved_tags);
> + if (!set->shared_sbitmap_tags)
> + return -ENOMEM;
> +
> + ret = blk_mq_alloc_rqs(set, set->shared_sbitmap_tags, 0,
> + set->queue_depth);
> + if (ret)
> + goto out_free_sbitmap_tags;
> + }
> +
> for (i = 0; i < set->nr_hw_queues; i++) {
> if (!__blk_mq_alloc_map_and_request(set, i))
> goto out_unwind;
> @@ -3359,6 +3383,11 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
> out_unwind:
> while (--i >= 0)
> blk_mq_free_map_and_requests(set, i);
> + if (blk_mq_is_sbitmap_shared(set->flags))
> + blk_mq_free_rqs(set, set->shared_sbitmap_tags, 0);
> +out_free_sbitmap_tags:
> + if (blk_mq_is_sbitmap_shared(set->flags))
> + blk_mq_exit_shared_sbitmap(set);

Once the blk_mq_alloc_map_and_rqs() helper suggested above is added,
the failure handling here can be simplified too.
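
With those two helpers, the shared-sbitmap branch above could become
something like (sketch, untested):

	if (blk_mq_is_sbitmap_shared(set->flags)) {
		set->shared_sbitmap_tags = blk_mq_alloc_map_and_rqs(set, 0,
							set->queue_depth);
		if (!set->shared_sbitmap_tags)
			return -ENOMEM;
	}

and the unwind path then only needs a single blk_mq_free_map_and_rqs()
call for the shared tags.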


--
Ming