Re: [PATCH -next V5] blk-mq: fix tag_get wait task can't be awakened
From: Andy Shevchenko
Date: Thu Jan 13 2022 - 05:52:50 EST
On Thu, Jan 13, 2022 at 10:55:36AM +0800, Laibin Qiu wrote:
> In case of shared tags, there might be more than one hctx which
> allocates from the same tags, and each hctx is limited to allocate at
> most:
> hctx_max_depth = max((bt->sb.depth + users - 1) / users, 4U);
>
> tag idle detection is lazy, and may be delayed for 30sec, so there
> could be just one real active hctx(queue) but all others are actually
> idle and still accounted as active because of the lazy idle detection.
> Then if wake_batch is > hctx_max_depth, driver tag allocation may wait
> forever on this real active hctx.
>
> Fix this by recalculating wake_batch when inc or dec active_queues.
FWIW,
Reviewed-by: Andy Shevchenko <andriy.shevchenko@xxxxxxxxxxxxxxx>
Thanks!
> Fixes: 0d2602ca30e41 ("blk-mq: improve support for shared tags maps")
> Suggested-by: Ming Lei <ming.lei@xxxxxxxxxx>
> Suggested-by: John Garry <john.garry@xxxxxxxxxx>
> Signed-off-by: Laibin Qiu <qiulaibin@xxxxxxxxxx>
> ---
> block/blk-mq-tag.c | 40 +++++++++++++++++++++++++++++++++-------
> include/linux/sbitmap.h | 11 +++++++++++
> lib/sbitmap.c | 25 ++++++++++++++++++++++---
> 3 files changed, 66 insertions(+), 10 deletions(-)
>
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index e55a6834c9a6..845f74e8dd7b 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -16,6 +16,21 @@
> #include "blk-mq-sched.h"
> #include "blk-mq-tag.h"
>
> +/*
> + * Recalculate wakeup batch when tag is shared by hctx.
> + */
> +static void blk_mq_update_wake_batch(struct blk_mq_tags *tags,
> + unsigned int users)
> +{
> + if (!users)
> + return;
> +
> + sbitmap_queue_recalculate_wake_batch(&tags->bitmap_tags,
> + users);
> + sbitmap_queue_recalculate_wake_batch(&tags->breserved_tags,
> + users);
> +}
> +
> /*
> * If a previously inactive queue goes active, bump the active user count.
> * We need to do this before try to allocate driver tag, then even if fail
> @@ -24,18 +39,26 @@
> */
> bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
> {
> + unsigned int users;
> +
> if (blk_mq_is_shared_tags(hctx->flags)) {
> struct request_queue *q = hctx->queue;
>
> - if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) &&
> - !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
> - atomic_inc(&hctx->tags->active_queues);
> + if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) ||
> + test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) {
> + return true;
> + }
> } else {
> - if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
> - !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
> - atomic_inc(&hctx->tags->active_queues);
> + if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) ||
> + test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) {
> + return true;
> + }
> }
>
> + users = atomic_inc_return(&hctx->tags->active_queues);
> +
> + blk_mq_update_wake_batch(hctx->tags, users);
> +
> return true;
> }
>
> @@ -56,6 +79,7 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
> void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
> {
> struct blk_mq_tags *tags = hctx->tags;
> + unsigned int users;
>
> if (blk_mq_is_shared_tags(hctx->flags)) {
> struct request_queue *q = hctx->queue;
> @@ -68,7 +92,9 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
> return;
> }
>
> - atomic_dec(&tags->active_queues);
> + users = atomic_dec_return(&tags->active_queues);
> +
> + blk_mq_update_wake_batch(tags, users);
>
> blk_mq_tag_wakeup_all(tags, false);
> }
> diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
> index fc0357a6e19b..95df357ec009 100644
> --- a/include/linux/sbitmap.h
> +++ b/include/linux/sbitmap.h
> @@ -415,6 +415,17 @@ static inline void sbitmap_queue_free(struct sbitmap_queue *sbq)
> sbitmap_free(&sbq->sb);
> }
>
> +/**
> + * sbitmap_queue_recalculate_wake_batch() - Recalculate wake batch
> + * @sbq: Bitmap queue to recalculate wake batch.
> + * @users: Number of shares.
> + *
> + * Like sbitmap_queue_update_wake_batch(), this will calculate wake batch
> + * by depth. This interface is for HCTX shared tags or queue shared tags.
> + */
> +void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq,
> + unsigned int users);
> +
> /**
> * sbitmap_queue_resize() - Resize a &struct sbitmap_queue.
> * @sbq: Bitmap queue to resize.
> diff --git a/lib/sbitmap.c b/lib/sbitmap.c
> index 2709ab825499..6220fa67fb7e 100644
> --- a/lib/sbitmap.c
> +++ b/lib/sbitmap.c
> @@ -457,10 +457,9 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
> }
> EXPORT_SYMBOL_GPL(sbitmap_queue_init_node);
>
> -static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
> - unsigned int depth)
> +static inline void __sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
> + unsigned int wake_batch)
> {
> - unsigned int wake_batch = sbq_calc_wake_batch(sbq, depth);
> int i;
>
> if (sbq->wake_batch != wake_batch) {
> @@ -476,6 +475,26 @@ static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
> }
> }
>
> +static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
> + unsigned int depth)
> +{
> + unsigned int wake_batch;
> +
> + wake_batch = sbq_calc_wake_batch(sbq, depth);
> + __sbitmap_queue_update_wake_batch(sbq, wake_batch);
> +}
> +
> +void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq,
> + unsigned int users)
> +{
> + unsigned int wake_batch;
> +
> + wake_batch = clamp_val((sbq->sb.depth + users - 1) /
> + users, 4, SBQ_WAKE_BATCH);
> + __sbitmap_queue_update_wake_batch(sbq, wake_batch);
> +}
> +EXPORT_SYMBOL_GPL(sbitmap_queue_recalculate_wake_batch);
> +
> void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth)
> {
> sbitmap_queue_update_wake_batch(sbq, depth);
> --
> 2.22.0
>
--
With Best Regards,
Andy Shevchenko