Re: 4.14: WARNING: CPU: 4 PID: 2895 at block/blk-mq.c:1144 with virtio-blk (also 4.12 stable)

From: Christian Borntraeger
Date: Tue Nov 21 2017 - 15:12:32 EST




On 11/21/2017 08:30 PM, Jens Axboe wrote:
> On 11/21/2017 12:15 PM, Christian Borntraeger wrote:
>>
>>
>> On 11/21/2017 07:39 PM, Jens Axboe wrote:
>>> On 11/21/2017 11:27 AM, Jens Axboe wrote:
>>>> On 11/21/2017 11:12 AM, Christian Borntraeger wrote:
>>>>>
>>>>>
>>>>> On 11/21/2017 07:09 PM, Jens Axboe wrote:
>>>>>> On 11/21/2017 10:27 AM, Jens Axboe wrote:
>>>>>>> On 11/21/2017 03:14 AM, Christian Borntraeger wrote:
>>>>>>>> Bisect points to
>>>>>>>>
>>>>>>>> 1b5a7455d345b223d3a4658a9e5fce985b7998c1 is the first bad commit
>>>>>>>> commit 1b5a7455d345b223d3a4658a9e5fce985b7998c1
>>>>>>>> Author: Christoph Hellwig <hch@xxxxxx>
>>>>>>>> Date: Mon Jun 26 12:20:57 2017 +0200
>>>>>>>>
>>>>>>>> blk-mq: Create hctx for each present CPU
>>>>>>>>
>>>>>>>> commit 4b855ad37194f7bdbb200ce7a1c7051fecb56a08 upstream.
>>>>>>>>
>>>>>>>> Currently we only create hctx for online CPUs, which can lead to a lot
>>>>>>>> of churn due to frequent soft offline / online operations. Instead
>>>>>>>> allocate one for each present CPU to avoid this and dramatically simplify
>>>>>>>> the code.
>>>>>>>>
>>>>>>>> Signed-off-by: Christoph Hellwig <hch@xxxxxx>
>>>>>>>> Reviewed-by: Jens Axboe <axboe@xxxxxxxxx>
>>>>>>>> Cc: Keith Busch <keith.busch@xxxxxxxxx>
>>>>>>>> Cc: linux-block@xxxxxxxxxxxxxxx
>>>>>>>> Cc: linux-nvme@xxxxxxxxxxxxxxxxxxx
>>>>>>>> Link: http://lkml.kernel.org/r/20170626102058.10200-3-hch@xxxxxx
>>>>>>>> Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
>>>>>>>> Cc: Oleksandr Natalenko <oleksandr@xxxxxxxxxxxxxx>
>>>>>>>> Cc: Mike Galbraith <efault@xxxxxx>
>>>>>>>> Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx>
>>>>>>>
>>>>>>> I wonder if we're simply not getting the masks updated correctly. I'll
>>>>>>> take a look.
>>>>>>
>>>>>> Can't make it trigger here. We do init for each present CPU, which means
>>>>>> that if I offline a few CPUs here and register a queue, those still show
>>>>>> up as present (just offline) and get mapped accordingly.
>>>>>>
>>>>>> From the looks of it, your setup is different. If the CPU doesn't show
>>>>>> up as present and it gets hotplugged, then I can see how this condition
>>>>>> would trigger. What environment are you running this in? We might have
>>>>>> to re-introduce the cpu hotplug notifier, right now we just monitor
>>>>>> for a dead cpu and handle that.
>>>>>
>>>>> I am not doing a hot unplug and the replug, I use KVM and add a previously
>>>>> not available CPU.
>>>>>
>>>>> in libvirt/virsh speak:
>>>>> <vcpu placement='static' current='1'>4</vcpu>
>>>>
>>>> So that's why we run into problems. It's not present when we load the device,
>>>> but becomes present and online afterwards.
>>>>
>>>> Christoph, we used to handle this just fine, your patch broke it.
>>>>
>>>> I'll see if I can come up with an appropriate fix.
>>>
>>> Can you try the below?
>>
>>
>> It does prevent the crash but it seems that the new CPU is not "used " after the hotplug for mq:
>>
>>
>> output with 2 cpus:
>> /sys/kernel/debug/block/vda
>> /sys/kernel/debug/block/vda/hctx0
>> /sys/kernel/debug/block/vda/hctx0/cpu0
>> /sys/kernel/debug/block/vda/hctx0/cpu0/completed
>> /sys/kernel/debug/block/vda/hctx0/cpu0/merged
>> /sys/kernel/debug/block/vda/hctx0/cpu0/dispatched
>> /sys/kernel/debug/block/vda/hctx0/cpu0/rq_list
>> /sys/kernel/debug/block/vda/hctx0/active
>> /sys/kernel/debug/block/vda/hctx0/run
>> /sys/kernel/debug/block/vda/hctx0/queued
>> /sys/kernel/debug/block/vda/hctx0/dispatched
>> /sys/kernel/debug/block/vda/hctx0/io_poll
>> /sys/kernel/debug/block/vda/hctx0/sched_tags_bitmap
>> /sys/kernel/debug/block/vda/hctx0/sched_tags
>> /sys/kernel/debug/block/vda/hctx0/tags_bitmap
>> /sys/kernel/debug/block/vda/hctx0/tags
>> /sys/kernel/debug/block/vda/hctx0/ctx_map
>> /sys/kernel/debug/block/vda/hctx0/busy
>> /sys/kernel/debug/block/vda/hctx0/dispatch
>> /sys/kernel/debug/block/vda/hctx0/flags
>> /sys/kernel/debug/block/vda/hctx0/state
>> /sys/kernel/debug/block/vda/sched
>> /sys/kernel/debug/block/vda/sched/dispatch
>> /sys/kernel/debug/block/vda/sched/starved
>> /sys/kernel/debug/block/vda/sched/batching
>> /sys/kernel/debug/block/vda/sched/write_next_rq
>> /sys/kernel/debug/block/vda/sched/write_fifo_list
>> /sys/kernel/debug/block/vda/sched/read_next_rq
>> /sys/kernel/debug/block/vda/sched/read_fifo_list
>> /sys/kernel/debug/block/vda/write_hints
>> /sys/kernel/debug/block/vda/state
>> /sys/kernel/debug/block/vda/requeue_list
>> /sys/kernel/debug/block/vda/poll_stat
>
> Try this, basically just a revert.

Yes, seems to work.

Tested-by: Christian Borntraeger <borntraeger@xxxxxxxxxx>

Do you know why the original commit made it into 4.12 stable? After all
it has no Fixes tag and no cc stable-


>
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 11097477eeab..bc1950fa9ef6 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -37,6 +37,9 @@
> #include "blk-wbt.h"
> #include "blk-mq-sched.h"
>
> +static DEFINE_MUTEX(all_q_mutex);
> +static LIST_HEAD(all_q_list);
> +
> static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie);
> static void blk_mq_poll_stats_start(struct request_queue *q);
> static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
> @@ -2114,8 +2117,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
> INIT_LIST_HEAD(&__ctx->rq_list);
> __ctx->queue = q;
>
> - /* If the cpu isn't present, the cpu is mapped to first hctx */
> - if (!cpu_present(i))
> + /* If the cpu isn't online, the cpu is mapped to first hctx */
> + if (!cpu_online(i))
> continue;
>
> hctx = blk_mq_map_queue(q, i);
> @@ -2158,7 +2161,8 @@ static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
> }
> }
>
> -static void blk_mq_map_swqueue(struct request_queue *q)
> +static void blk_mq_map_swqueue(struct request_queue *q,
> + const struct cpumask *online_mask)
> {
> unsigned int i, hctx_idx;
> struct blk_mq_hw_ctx *hctx;
> @@ -2176,11 +2180,13 @@ static void blk_mq_map_swqueue(struct request_queue *q)
> }
>
> /*
> - * Map software to hardware queues.
> - *
> - * If the cpu isn't present, the cpu is mapped to first hctx.
> + * Map software to hardware queues
> */
> - for_each_present_cpu(i) {
> + for_each_possible_cpu(i) {
> + /* If the cpu isn't online, the cpu is mapped to first hctx */
> + if (!cpumask_test_cpu(i, online_mask))
> + continue;
> +
> hctx_idx = q->mq_map[i];
> /* unmapped hw queue can be remapped after CPU topo changed */
> if (!set->tags[hctx_idx] &&
> @@ -2495,8 +2501,16 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
> blk_queue_softirq_done(q, set->ops->complete);
>
> blk_mq_init_cpu_queues(q, set->nr_hw_queues);
> +
> + get_online_cpus();
> + mutex_lock(&all_q_mutex);
> +
> + list_add_tail(&q->all_q_node, &all_q_list);
> blk_mq_add_queue_tag_set(set, q);
> - blk_mq_map_swqueue(q);
> + blk_mq_map_swqueue(q, cpu_online_mask);
> +
> + mutex_unlock(&all_q_mutex);
> + put_online_cpus();
>
> if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
> int ret;
> @@ -2522,12 +2536,18 @@ void blk_mq_free_queue(struct request_queue *q)
> {
> struct blk_mq_tag_set *set = q->tag_set;
>
> + mutex_lock(&all_q_mutex);
> + list_del_init(&q->all_q_node);
> + mutex_unlock(&all_q_mutex);
> +
> blk_mq_del_queue_tag_set(q);
> +
> blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
> }
>
> /* Basically redo blk_mq_init_queue with queue frozen */
> -static void blk_mq_queue_reinit(struct request_queue *q)
> +static void blk_mq_queue_reinit(struct request_queue *q,
> + const struct cpumask *online_mask)
> {
> WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
>
> @@ -2539,12 +2559,76 @@ static void blk_mq_queue_reinit(struct request_queue *q)
> * we should change hctx numa_node according to the new topology (this
> * involves freeing and re-allocating memory, worth doing?)
> */
> - blk_mq_map_swqueue(q);
> + blk_mq_map_swqueue(q, online_mask);
>
> blk_mq_sysfs_register(q);
> blk_mq_debugfs_register_hctxs(q);
> }
>
> +/*
> + * New online cpumask which is going to be set in this hotplug event.
> + * Declare this cpumasks as global as cpu-hotplug operation is invoked
> + * one-by-one and dynamically allocating this could result in a failure.
> + */
> +static struct cpumask cpuhp_online_new;
> +
> +static void blk_mq_queue_reinit_work(void)
> +{
> + struct request_queue *q;
> +
> + mutex_lock(&all_q_mutex);
> + /*
> + * We need to freeze and reinit all existing queues. Freezing
> + * involves synchronous wait for an RCU grace period and doing it
> + * one by one may take a long time. Start freezing all queues in
> + * one swoop and then wait for the completions so that freezing can
> + * take place in parallel.
> + */
> + list_for_each_entry(q, &all_q_list, all_q_node)
> + blk_freeze_queue_start(q);
> + list_for_each_entry(q, &all_q_list, all_q_node)
> + blk_mq_freeze_queue_wait(q);
> +
> + list_for_each_entry(q, &all_q_list, all_q_node)
> + blk_mq_queue_reinit(q, &cpuhp_online_new);
> +
> + list_for_each_entry(q, &all_q_list, all_q_node)
> + blk_mq_unfreeze_queue(q);
> +
> + mutex_unlock(&all_q_mutex);
> +}
> +
> +static int blk_mq_queue_reinit_dead(unsigned int cpu)
> +{
> + cpumask_copy(&cpuhp_online_new, cpu_online_mask);
> + blk_mq_queue_reinit_work();
> + return 0;
> +}
> +
> +/*
> + * Before hotadded cpu starts handling requests, new mappings must be
> + * established. Otherwise, these requests in hw queue might never be
> + * dispatched.
> + *
> + * For example, there is a single hw queue (hctx) and two CPU queues (ctx0
> + * for CPU0, and ctx1 for CPU1).
> + *
> + * Now CPU1 is just onlined and a request is inserted into ctx1->rq_list
> + * and set bit0 in pending bitmap as ctx1->index_hw is still zero.
> + *
> + * And then while running hw queue, blk_mq_flush_busy_ctxs() finds bit0 is set
> + * in pending bitmap and tries to retrieve requests in hctx->ctxs[0]->rq_list.
> + * But htx->ctxs[0] is a pointer to ctx0, so the request in ctx1->rq_list is
> + * ignored.
> + */
> +static int blk_mq_queue_reinit_prepare(unsigned int cpu)
> +{
> + cpumask_copy(&cpuhp_online_new, cpu_online_mask);
> + cpumask_set_cpu(cpu, &cpuhp_online_new);
> + blk_mq_queue_reinit_work();
> + return 0;
> +}
> +
> static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
> {
> int i;
> @@ -2757,7 +2841,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
> blk_mq_update_queue_map(set);
> list_for_each_entry(q, &set->tag_list, tag_set_list) {
> blk_mq_realloc_hw_ctxs(set, q);
> - blk_mq_queue_reinit(q);
> + blk_mq_queue_reinit(q, cpu_online_mask);
> }
>
> list_for_each_entry(q, &set->tag_list, tag_set_list)
> @@ -2966,6 +3050,16 @@ static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
> return __blk_mq_poll(hctx, rq);
> }
>
> +void blk_mq_disable_hotplug(void)
> +{
> + mutex_lock(&all_q_mutex);
> +}
> +
> +void blk_mq_enable_hotplug(void)
> +{
> + mutex_unlock(&all_q_mutex);
> +}
> +
> static int __init blk_mq_init(void)
> {
> /*
> @@ -2976,6 +3070,10 @@ static int __init blk_mq_init(void)
>
> cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
> blk_mq_hctx_notify_dead);
> +
> + cpuhp_setup_state_nocalls(CPUHP_BLK_MQ_PREPARE, "block/mq:prepare",
> + blk_mq_queue_reinit_prepare,
> + blk_mq_queue_reinit_dead);
> return 0;
> }
> subsys_initcall(blk_mq_init);
> diff --git a/block/blk-mq.h b/block/blk-mq.h
> index 6c7c3ff5bf62..83b13ef1915e 100644
> --- a/block/blk-mq.h
> +++ b/block/blk-mq.h
> @@ -59,6 +59,11 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
> void blk_mq_request_bypass_insert(struct request *rq, bool run_queue);
> void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
> struct list_head *list);
> +/*
> + * CPU hotplug helpers
> + */
> +void blk_mq_enable_hotplug(void);
> +void blk_mq_disable_hotplug(void);
>
> /*
> * CPU -> queue mappings
> diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
> index 201ab7267986..c31d4e3bf6d0 100644
> --- a/include/linux/cpuhotplug.h
> +++ b/include/linux/cpuhotplug.h
> @@ -76,6 +76,7 @@ enum cpuhp_state {
> CPUHP_XEN_EVTCHN_PREPARE,
> CPUHP_ARM_SHMOBILE_SCU_PREPARE,
> CPUHP_SH_SH3X_PREPARE,
> + CPUHP_BLK_MQ_PREPARE,
> CPUHP_NET_FLOW_PREPARE,
> CPUHP_TOPOLOGY_PREPARE,
> CPUHP_NET_IUCV_PREPARE,
>