Re: [PATCH v2 4/6] blk-mq: avoid inserting requests before establishing new mapping

From: Ming Lei
Date: Thu Jul 09 2015 - 03:04:46 EST


On Thu, Jul 2, 2015 at 10:29 PM, Akinobu Mita <akinobu.mita@xxxxxxxxx> wrote:
> Notifier callbacks for CPU_ONLINE action can be run on the other CPU
> than the CPU which was just onlined. So it is possible for the
> process running on the just onlined CPU to insert request and run
> hw queue before establishing new mapping which is done by
> blk_mq_queue_reinit_notify().
>
> This can cause a problem when the CPU has just been onlined first time
> since the request queue was initialized. At this time ctx->index_hw
> for the CPU, which is the index in hctx->ctxs[] for this ctx, is still
> zero before blk_mq_queue_reinit_notify() is called by notifier
> callbacks for CPU_ONLINE action.
>
> For example, there is a single hw queue (hctx) and two CPU queues
> (ctx0 for CPU0, and ctx1 for CPU1). Now CPU1 is just onlined and
> a request is inserted into ctx1->rq_list and set bit0 in pending
> bitmap as ctx1->index_hw is still zero.
>
> And then while running hw queue, flush_busy_ctxs() finds bit0 is set
> in pending bitmap and tries to retrieve requests in
> hctx->ctxs[0]->rq_list. But htx->ctxs[0] is a pointer to ctx0, so the
> request in ctx1->rq_list is ignored.
>
> Fix it by ensuring that new mapping is established before onlined cpu
> starts running.
>
> Signed-off-by: Akinobu Mita <akinobu.mita@xxxxxxxxx>
> Cc: Jens Axboe <axboe@xxxxxxxxx>
> Cc: Ming Lei <tom.leiming@xxxxxxxxx>
> ---
> block/blk-mq-cpumap.c | 9 +++++----
> block/blk-mq.c | 41 ++++++++++++++++++++++++++---------------
> block/blk-mq.h | 3 ++-
> 3 files changed, 33 insertions(+), 20 deletions(-)
>
> diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
> index 1e28ddb..8764c24 100644
> --- a/block/blk-mq-cpumap.c
> +++ b/block/blk-mq-cpumap.c
> @@ -31,7 +31,8 @@ static int get_first_sibling(unsigned int cpu)
> return cpu;
> }
>
> -int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
> +int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
> + const struct cpumask *online_mask)
> {
> unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling;
> cpumask_var_t cpus;
> @@ -41,7 +42,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
>
> cpumask_clear(cpus);
> nr_cpus = nr_uniq_cpus = 0;
> - for_each_online_cpu(i) {
> + for_each_cpu(i, online_mask) {
> nr_cpus++;
> first_sibling = get_first_sibling(i);
> if (!cpumask_test_cpu(first_sibling, cpus))
> @@ -51,7 +52,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues)
>
> queue = 0;
> for_each_possible_cpu(i) {
> - if (!cpu_online(i)) {
> + if (!cpumask_test_cpu(i, online_mask)) {
> map[i] = 0;
> continue;
> }
> @@ -95,7 +96,7 @@ unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set)
> if (!map)
> return NULL;
>
> - if (!blk_mq_update_queue_map(map, set->nr_hw_queues))
> + if (!blk_mq_update_queue_map(map, set->nr_hw_queues, cpu_online_mask))
> return map;
>
> kfree(map);
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index ff72e18..ad07373 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -1799,7 +1799,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
> }
> }
>
> -static void blk_mq_map_swqueue(struct request_queue *q)
> +static void blk_mq_map_swqueue(struct request_queue *q,
> + const struct cpumask *online_mask)
> {
> unsigned int i;
> struct blk_mq_hw_ctx *hctx;
> @@ -1816,7 +1817,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
> */
> queue_for_each_ctx(q, ctx, i) {
> /* If the cpu isn't online, the cpu is mapped to first hctx */
> - if (!cpu_online(i))
> + if (!cpumask_test_cpu(i, online_mask))
> continue;
>
> hctx = q->mq_ops->map_queue(q, i);
> @@ -2046,7 +2047,10 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
>
> blk_mq_add_queue_tag_set(set, q);
>
> - blk_mq_map_swqueue(q);
> + get_online_cpus();
> + blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, cpu_online_mask);
> + blk_mq_map_swqueue(q, cpu_online_mask);
> + put_online_cpus();
>
> return q;
>
> @@ -2083,13 +2087,14 @@ void blk_mq_free_queue(struct request_queue *q)
> }
>
> /* Basically redo blk_mq_init_queue with queue frozen */
> -static void blk_mq_queue_reinit(struct request_queue *q)
> +static void blk_mq_queue_reinit(struct request_queue *q,
> + const struct cpumask *online_mask)
> {
> WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
>
> blk_mq_sysfs_unregister(q);
>
> - blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
> + blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, online_mask);
>
> /*
> * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
> @@ -2097,25 +2102,31 @@ static void blk_mq_queue_reinit(struct request_queue *q)
> * involves free and re-allocate memory, worthy doing?)
> */
>
> - blk_mq_map_swqueue(q);
> + blk_mq_map_swqueue(q, online_mask);
>
> blk_mq_sysfs_register(q);
> }
>
> +static struct cpumask online_new;

Better to comment on why there isn't race on the single
global variable.

> +
> static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
> unsigned long action, void *hcpu)
> {
> struct request_queue *q;
> + int cpu = (unsigned long)hcpu;
>
> - /*
> - * Before new mappings are established, hotadded cpu might already
> - * start handling requests. This doesn't break anything as we map
> - * offline CPUs to first hardware queue. We will re-init the queue
> - * below to get optimal settings.
> - */

You should put some comment here about why the change is introduced
because the race isn't very obvious.

> - if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
> - action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
> + switch (action & ~CPU_TASKS_FROZEN) {
> + case CPU_DEAD:
> + case CPU_UP_CANCELED:
> + cpumask_copy(&online_new, cpu_online_mask);
> + break;
> + case CPU_UP_PREPARE:
> + cpumask_copy(&online_new, cpu_online_mask);
> + cpumask_set_cpu(cpu, &online_new);
> + break;
> + default:
> return NOTIFY_OK;
> + }
>
> mutex_lock(&all_q_mutex);
>
> @@ -2139,7 +2150,7 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
> }
>
> list_for_each_entry(q, &all_q_list, all_q_node)
> - blk_mq_queue_reinit(q);
> + blk_mq_queue_reinit(q, &online_new);
>
> list_for_each_entry(q, &all_q_list, all_q_node)
> blk_mq_unfreeze_queue(q);
> diff --git a/block/blk-mq.h b/block/blk-mq.h
> index 6a48c4c..f4fea79 100644
> --- a/block/blk-mq.h
> +++ b/block/blk-mq.h
> @@ -51,7 +51,8 @@ void blk_mq_disable_hotplug(void);
> * CPU -> queue mappings
> */
> extern unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set);
> -extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues);
> +extern int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
> + const struct cpumask *online_mask);
> extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int);
>
> /*
> --
> 1.9.1
>



--
Ming Lei
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/