Re: [Query] increased latency observed in cpu hotplug path

From: Khan, Imran
Date: Thu Sep 22 2016 - 06:07:01 EST


On 9/21/2016 9:26 PM, Akinobu Mita wrote:
> 2016-09-21 23:06 GMT+09:00 Khan, Imran <kimran@xxxxxxxxxxxxxx>:
>
>>>> commit 084ee793ec1ff4e2171b481642bfbdaa2e10664c
>>>> Author: Imran Khan <kimran@xxxxxxxxxxxxxx>
>>>> Date: Thu Sep 15 16:44:02 2016 +0530
>>>>
>>>> blk-mq: use static mapping
>>>>
>>>> blk-mq layer performs a remapping between s/w and h/w contexts and also
>>>> between h/w contexts and CPUs, whenever a CPU hotplug event happens.
>>>> This remapping has to wait for queue freezing which may take tens of
>>>> miliseconds, resulting in a high latency in CPU hotplug path.
>>>> This patch makes the above mentioned mappings static so that we can
>>>> avoid remapping when CPU hotplug event happens and this results in
>>>> improved CPU hotplug latency.
>>>>
>>>> Change-Id: Idf38cb6c4e78c91fda3c86608c6d0441f01ab435
>>>> Signed-off-by: Imran Khan <kimran@xxxxxxxxxxxxxx>
>>>>
>>>> diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
>>>> index 8764c24..85fdb88 100644
>>>> --- a/block/blk-mq-cpumap.c
>>>> +++ b/block/blk-mq-cpumap.c
>>>> @@ -52,18 +52,13 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
>>>>
>>>> queue = 0;
>>>> for_each_possible_cpu(i) {
>>>> - if (!cpumask_test_cpu(i, online_mask)) {
>>>> - map[i] = 0;
>>>> - continue;
>>>> - }
>>>> -
>>>> /*
>>>> * Easy case - we have equal or more hardware queues. Or
>>>> * there are no thread siblings to take into account. Do
>>>> * 1:1 if enough, or sequential mapping if less.
>>>> */
>>>> - if (nr_queues >= nr_cpus || nr_cpus == nr_uniq_cpus) {
>>>> - map[i] = cpu_to_queue_index(nr_cpus, nr_queues, queue);
>>>> + if (nr_queues >= nr_cpu_ids) {
>>>> + map[i] = cpu_to_queue_index(nr_cpu_ids, nr_queues, queue);
>>>> queue++;
>>>> continue;
>>>> }
>>>> @@ -75,7 +70,7 @@ int blk_mq_update_queue_map(unsigned int *map, unsigned int nr_queues,
>>>> */
>>>> first_sibling = get_first_sibling(i);
>>>> if (first_sibling == i) {
>>>> - map[i] = cpu_to_queue_index(nr_uniq_cpus, nr_queues,
>>>> + map[i] = cpu_to_queue_index(nr_cpu_ids, nr_queues,
>>>> queue);
>>>> queue++;
>>>> } else
>>>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>>>> index 6d6f8fe..0753fdf 100644
>>>> --- a/block/blk-mq.c
>>>> +++ b/block/blk-mq.c
>>>> @@ -1783,10 +1783,6 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
>>>> INIT_LIST_HEAD(&__ctx->rq_list);
>>>> __ctx->queue = q;
>>>>
>>>> - /* If the cpu isn't online, the cpu is mapped to first hctx */
>>>> - if (!cpu_online(i))
>>>> - continue;
>>>> -
>>>> hctx = q->mq_ops->map_queue(q, i);
>>>>
>>>> /*
>>>> @@ -1820,12 +1816,9 @@ static void blk_mq_map_swqueue(struct request_queue *q,
>>>> * Map software to hardware queues
>>>> */
>>>> queue_for_each_ctx(q, ctx, i) {
>>>> - /* If the cpu isn't online, the cpu is mapped to first hctx */
>>>> - if (!cpumask_test_cpu(i, online_mask))
>>>> - continue;
>>>> -
>>>> hctx = q->mq_ops->map_queue(q, i);
>>>> - cpumask_set_cpu(i, hctx->cpumask);
>>>> + if (cpumask_test_cpu(i, online_mask))
>>>> + cpumask_set_cpu(i, hctx->cpumask);
>>>> ctx->index_hw = hctx->nr_ctx;
>>>> hctx->ctxs[hctx->nr_ctx++] = ctx;
>>>> }
>>>> @@ -1863,17 +1856,22 @@ static void blk_mq_map_swqueue(struct request_queue *q,
>>>>
>>>> /*
>>>> * Initialize batch roundrobin counts
>>>> + * Set next_cpu for only those hctxs that have an online CPU
>>>> + * in their cpumask field. For hctxs that belong to few online
>>>> + * and few offline CPUs, this will always provide one CPU from
>>>> + * online ones. For hctxs belonging to all offline CPUs, their
>>>> + * cpumask will be updated in reinit_notify.
>>>> */
>>>> - hctx->next_cpu = cpumask_first(hctx->cpumask);
>>>> - hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
>>>> + if(cpumask_first(hctx->cpumask) < nr_cpu_ids) {
>>>> + hctx->next_cpu = cpumask_first(hctx->cpumask);
>>>> + hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
>>>> + }
>>>> }
>>>>
>>>> queue_for_each_ctx(q, ctx, i) {
>>>> - if (!cpumask_test_cpu(i, online_mask))
>>>> - continue;
>>>> -
>>>> hctx = q->mq_ops->map_queue(q, i);
>>>> - cpumask_set_cpu(i, hctx->tags->cpumask);
>>>> + if (cpumask_test_cpu(i, online_mask))
>>>> + cpumask_set_cpu(i, hctx->tags->cpumask);
>>>> }
>>>> }
>>>>
>>>> @@ -2101,31 +2099,12 @@ void blk_mq_free_queue(struct request_queue *q)
>>>> blk_mq_free_hw_queues(q, set);
>>>> }
>>>>
>>>> -/* Basically redo blk_mq_init_queue with queue frozen */
>>>> -static void blk_mq_queue_reinit(struct request_queue *q,
>>>> - const struct cpumask *online_mask)
>>>> -{
>>>> - WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));
>>>> -
>>>> - blk_mq_sysfs_unregister(q);
>>>> -
>>>> - blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues, online_mask);
>
> Now blk_mq_update_queue_map() is only used in block/blk-mq-cpumap.c,
> so you can make blk_mq_update_queue_map() static function.
>
>>>> -
>>>> - /*
>>>> - * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
>>>> - * we should change hctx numa_node according to new topology (this
>>>> - * involves free and re-allocate memory, worthy doing?)
>>>> - */
>>>> -
>>>> - blk_mq_map_swqueue(q, online_mask);
>>>> -
>>>> - blk_mq_sysfs_register(q);
>>>> -}
>>>> -
>>>> static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
>>>> unsigned long action, void *hcpu)
>>>> {
>>>> struct request_queue *q;
>>>> + struct blk_mq_hw_ctx *hctx;
>>>> + int i;
>>>> int cpu = (unsigned long)hcpu;
>>>> /*
>>>> * New online cpumask which is going to be set in this hotplug event.
>>>> @@ -2155,43 +2134,29 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
>>>> case CPU_DEAD:
>>>> case CPU_UP_CANCELED:
>>>> cpumask_copy(&online_new, cpu_online_mask);
>>>> + list_for_each_entry(q, &all_q_list, all_q_node) {
>>>> + queue_for_each_hw_ctx(q, hctx, i) {
>>>> + cpumask_clear_cpu(cpu, hctx->cpumask);
>>>> + cpumask_clear_cpu(cpu, hctx->tags->cpumask);
>>>> + }
>>>> + }
>
> Do we need to hold all_q_mutex while iterating through all_q_list?
>
Yes, we need to hold all_q_mutex here.
>>>> break;
>>>> case CPU_UP_PREPARE:
>>>> cpumask_copy(&online_new, cpu_online_mask);
>>>> cpumask_set_cpu(cpu, &online_new);
>>>> + /* Update hctx->cpumask for newly onlined CPUs */
>>>> + list_for_each_entry(q, &all_q_list, all_q_node) {
>>>> + queue_for_each_hw_ctx(q, hctx, i) {
>>>> + cpumask_set_cpu(cpu, hctx->cpumask);
>>>> + hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
>>>> + cpumask_set_cpu(cpu, hctx->tags->cpumask);
>>>> + }
>>>> + }
>>>> break;
>>>> default:
>>>> return NOTIFY_OK;
>>>> }
>>>>
>>>> - mutex_lock(&all_q_mutex);
>>>> -
>>>> - /*
>>>> - * We need to freeze and reinit all existing queues. Freezing
>>>> - * involves synchronous wait for an RCU grace period and doing it
>>>> - * one by one may take a long time. Start freezing all queues in
>>>> - * one swoop and then wait for the completions so that freezing can
>>>> - * take place in parallel.
>>>> - */
>>>> - list_for_each_entry(q, &all_q_list, all_q_node)
>>>> - blk_mq_freeze_queue_start(q);
>>>> - list_for_each_entry(q, &all_q_list, all_q_node) {
>>>> - blk_mq_freeze_queue_wait(q);
>>>> -
>>>> - /*
>>>> - * timeout handler can't touch hw queue during the
>>>> - * reinitialization
>>>> - */
>>>> - del_timer_sync(&q->timeout);
>>>> - }
>>>> -
>>>> - list_for_each_entry(q, &all_q_list, all_q_node)
>>>> - blk_mq_queue_reinit(q, &online_new);
>>>> -
>>>> - list_for_each_entry(q, &all_q_list, all_q_node)
>>>> - blk_mq_unfreeze_queue(q);
>>>> -
>>>> - mutex_unlock(&all_q_mutex);
>
> This change makes 'online_new' cpumask useless in blk_mq_queue_reinit_notify(),
> so you can remove it completely.
>

Will remove this in the new patch set.
Thanks for the review comments. I will take into account the review comments and
send a new patch set.

>>>> return NOTIFY_OK;
>>>> }
>>>>
>>>>
>>>>
>>>>
>>> With the patch I have collected some numbers, running some android monkey tests along with
>>> an app that does random hootplugging of CPUs. The numbers obtained are as below:
>>>
>>> CPU UP time(in micro seconds):
>>> default blk-mq: 58629 54207 61792 58458 59286
>>> blk-mq with above patch: 2755 2588 3412 3239 3010
>>>
>>> CPU DOWN time(in micros seconds):
>>> default blk-mq: 95990 103379 103686 93214 128274
>>> blk-mq with above patch: 67178 49876 70027 61261 59656
>>>
>> Could someone please review the patch and provide some feedback?
>> Even if the feedback is about the approach we are trying here or some
>> suggestions to move ahead in that direction.
>>>>>> Did you tried Tejun's percpu_ref patchset that I said in the last email?
>>>>>> (http://www.mail-archive.com/linux-kernel@xxxxxxxxxxxxxxx/msg984823.html)
>>>>>>
>>>>>>
>>>>> I have tried the suggested patch but did not see any improvements in cpu
>>>>> hotplug latencies
>>>>>
>>>>
>>>>
>>>
>>>
>>
>>
>> --
>> Imran Khan
>> QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a\nmember of the Code Aurora Forum, hosted by The Linux Foundation


--
Imran Khan
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a\nmember of the Code Aurora Forum, hosted by The Linux Foundation