Re: 4.14: WARNING: CPU: 4 PID: 2895 at block/blk-mq.c:1144 with virtio-blk (also 4.12 stable)
From: Hannes Reinecke
Date: Thu Nov 23 2017 - 09:43:14 EST
On 11/23/2017 03:34 PM, Christoph Hellwig wrote:
> FYI, the patch below changes both the irq and block mappings to
> always use the cpu possible map (should be split in two in due time).
>
> I think this is the right way forward. For every normal machine
> those two are the same, but for VMs with maxcpus above their normal
> count or some big iron that can grow more cpus it means we waste
> a few more resources for the not present but reserved cpus. It
> fixes the reported issue for me:
>
> diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
> index 9f8cffc8a701..3eb169f15842 100644
> --- a/block/blk-mq-cpumap.c
> +++ b/block/blk-mq-cpumap.c
> @@ -16,11 +16,6 @@
>
> static int cpu_to_queue_index(unsigned int nr_queues, const int cpu)
> {
> - /*
> - * Non present CPU will be mapped to queue index 0.
> - */
> - if (!cpu_present(cpu))
> - return 0;
> return cpu % nr_queues;
> }
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 11097477eeab..612ce1fb7c4e 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -2114,16 +2114,11 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
> INIT_LIST_HEAD(&__ctx->rq_list);
> __ctx->queue = q;
>
> - /* If the cpu isn't present, the cpu is mapped to first hctx */
> - if (!cpu_present(i))
> - continue;
> -
> - hctx = blk_mq_map_queue(q, i);
> -
> /*
> * Set local node, IFF we have more than one hw queue. If
> * not, we remain on the home node of the device
> */
> + hctx = blk_mq_map_queue(q, i);
> if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
> hctx->numa_node = local_memory_node(cpu_to_node(i));
> }
> @@ -2180,7 +2175,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
> *
> * If the cpu isn't present, the cpu is mapped to first hctx.
> */
> - for_each_present_cpu(i) {
> + for_each_possible_cpu(i) {
> hctx_idx = q->mq_map[i];
> /* unmapped hw queue can be remapped after CPU topo changed */
> if (!set->tags[hctx_idx] &&
> diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
> index e12d35108225..a37a3b4b6342 100644
> --- a/kernel/irq/affinity.c
> +++ b/kernel/irq/affinity.c
> @@ -39,7 +39,7 @@ static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
> }
> }
>
> -static cpumask_var_t *alloc_node_to_present_cpumask(void)
> +static cpumask_var_t *alloc_node_to_possible_cpumask(void)
> {
> cpumask_var_t *masks;
> int node;
> @@ -62,7 +62,7 @@ static cpumask_var_t *alloc_node_to_present_cpumask(void)
> return NULL;
> }
>
> -static void free_node_to_present_cpumask(cpumask_var_t *masks)
> +static void free_node_to_possible_cpumask(cpumask_var_t *masks)
> {
> int node;
>
> @@ -71,22 +71,22 @@ static void free_node_to_present_cpumask(cpumask_var_t *masks)
> kfree(masks);
> }
>
> -static void build_node_to_present_cpumask(cpumask_var_t *masks)
> +static void build_node_to_possible_cpumask(cpumask_var_t *masks)
> {
> int cpu;
>
> - for_each_present_cpu(cpu)
> + for_each_possible_cpu(cpu)
> cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
> }
>
> -static int get_nodes_in_cpumask(cpumask_var_t *node_to_present_cpumask,
> +static int get_nodes_in_cpumask(cpumask_var_t *node_to_possible_cpumask,
> const struct cpumask *mask, nodemask_t *nodemsk)
> {
> int n, nodes = 0;
>
> /* Calculate the number of nodes in the supplied affinity mask */
> for_each_node(n) {
> - if (cpumask_intersects(mask, node_to_present_cpumask[n])) {
> + if (cpumask_intersects(mask, node_to_possible_cpumask[n])) {
> node_set(n, *nodemsk);
> nodes++;
> }
> @@ -109,7 +109,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
> int last_affv = affv + affd->pre_vectors;
> nodemask_t nodemsk = NODE_MASK_NONE;
> struct cpumask *masks;
> - cpumask_var_t nmsk, *node_to_present_cpumask;
> + cpumask_var_t nmsk, *node_to_possible_cpumask;
>
> /*
> * If there aren't any vectors left after applying the pre/post
> @@ -125,8 +125,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
> if (!masks)
> goto out;
>
> - node_to_present_cpumask = alloc_node_to_present_cpumask();
> - if (!node_to_present_cpumask)
> + node_to_possible_cpumask = alloc_node_to_possible_cpumask();
> + if (!node_to_possible_cpumask)
> goto out;
>
> /* Fill out vectors at the beginning that don't need affinity */
> @@ -135,8 +135,8 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
>
> /* Stabilize the cpumasks */
> get_online_cpus();
> - build_node_to_present_cpumask(node_to_present_cpumask);
> - nodes = get_nodes_in_cpumask(node_to_present_cpumask, cpu_present_mask,
> + build_node_to_possible_cpumask(node_to_possible_cpumask);
> + nodes = get_nodes_in_cpumask(node_to_possible_cpumask, cpu_possible_mask,
> &nodemsk);
>
> /*
> @@ -146,7 +146,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
> if (affv <= nodes) {
> for_each_node_mask(n, nodemsk) {
> cpumask_copy(masks + curvec,
> - node_to_present_cpumask[n]);
> + node_to_possible_cpumask[n]);
> if (++curvec == last_affv)
> break;
> }
> @@ -160,7 +160,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
> vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes;
>
> /* Get the cpus on this node which are in the mask */
> - cpumask_and(nmsk, cpu_present_mask, node_to_present_cpumask[n]);
> + cpumask_and(nmsk, cpu_possible_mask, node_to_possible_cpumask[n]);
>
> /* Calculate the number of cpus per vector */
> ncpus = cpumask_weight(nmsk);
> @@ -192,7 +192,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd)
> /* Fill out vectors at the end that don't need affinity */
> for (; curvec < nvecs; curvec++)
> cpumask_copy(masks + curvec, irq_default_affinity);
> - free_node_to_present_cpumask(node_to_present_cpumask);
> + free_node_to_possible_cpumask(node_to_possible_cpumask);
> out:
> free_cpumask_var(nmsk);
> return masks;
> @@ -214,7 +214,7 @@ int irq_calc_affinity_vectors(int minvec, int maxvec, const struct irq_affinity
> return 0;
>
> get_online_cpus();
> - ret = min_t(int, cpumask_weight(cpu_present_mask), vecs) + resv;
> + ret = min_t(int, cpumask_weight(cpu_possible_mask), vecs) + resv;
> put_online_cpus();
> return ret;
> }
>
What will happen for the CPU hotplug case?
Wouldn't we route I/O to a disabled CPU with this patch?
Cheers,
Hannes
--
Dr. Hannes Reinecke Teamlead Storage & Networking
hare@xxxxxxx +49 911 74053 688
SUSE LINUX GmbH, Maxfeldstr. 5, 90409 Nürnberg
GF: F. Imendörffer, J. Smithard, J. Guild, D. Upmanyu, G. Norton
HRB 21284 (AG Nürnberg)