Re: [tip:x86/numa] x86: Unify cpu/apicid <-> NUMA node mapping between32 and 64bit

From: Yinghai Lu
Date: Fri Jan 28 2011 - 15:35:18 EST


On 01/28/2011 06:41 AM, tip-bot for Tejun Heo wrote:
> Commit-ID: bbc9e2f452d9c4b166d1f9a78d941d80173312fe
> Gitweb: http://git.kernel.org/tip/bbc9e2f452d9c4b166d1f9a78d941d80173312fe
> Author: Tejun Heo <tj@xxxxxxxxxx>
> AuthorDate: Sun, 23 Jan 2011 14:37:39 +0100
> Committer: Ingo Molnar <mingo@xxxxxxx>
> CommitDate: Fri, 28 Jan 2011 14:54:09 +0100
>
> x86: Unify cpu/apicid <-> NUMA node mapping between 32 and 64bit
>
> The mapping between cpu/apicid and node is done via
> apicid_to_node[] on 64bit and apicid_2_node[] +
> apic->x86_32_numa_cpu_node() on 32bit. This difference makes it
> difficult to further unify 32 and 64bit NUMA handling.
>
> This patch unifies it by replacing both apicid_to_node[] and
> apicid_2_node[] with __apicid_to_node[] array, which is accessed
> by two accessors - set_apicid_to_node() and numa_cpu_node(). On
> 64bit, numa_cpu_node() always consults __apicid_to_node[]
> directly while 32bit goes through apic->numa_cpu_node() method
> to allow apic implementations to override it.
>
> srat_detect_node() for amd cpus contains workaround for broken
> NUMA configuration which assumes relationship between APIC ID,
> HT node ID and NUMA topology. Leave it to access
> __apicid_to_node[] directly as mapping through CPU might result
> in undesirable behavior change. The comment is reformatted and
> updated to note the ugliness.
>
> Signed-off-by: Tejun Heo <tj@xxxxxxxxxx>
> Reviewed-by: Pekka Enberg <penberg@xxxxxxxxxx>
> Cc: eric.dumazet@xxxxxxxxx
> Cc: yinghai@xxxxxxxxxx
> Cc: brgerst@xxxxxxxxx
> Cc: gorcunov@xxxxxxxxx
> Cc: shaohui.zheng@xxxxxxxxx
> Cc: rientjes@xxxxxxxxxx
> LKML-Reference: <1295789862-25482-14-git-send-email-tj@xxxxxxxxxx>
> Signed-off-by: Ingo Molnar <mingo@xxxxxxx>
> Cc: David Rientjes <rientjes@xxxxxxxxxx>
> ---
> arch/x86/include/asm/mpspec.h | 1 -
> arch/x86/include/asm/numa.h | 28 +++++++++++++++++++++++
> arch/x86/include/asm/numa_32.h | 6 +++++
> arch/x86/include/asm/numa_64.h | 5 +--
> arch/x86/kernel/acpi/boot.c | 3 +-
> arch/x86/kernel/apic/apic.c | 2 +-
> arch/x86/kernel/cpu/amd.c | 47 +++++++++++++++++++++++++--------------
> arch/x86/kernel/cpu/intel.c | 3 +-
> arch/x86/kernel/smpboot.c | 6 +----
> arch/x86/mm/amdtopology_64.c | 4 +-
> arch/x86/mm/numa.c | 6 ++++-
> arch/x86/mm/numa_32.c | 6 +++++
> arch/x86/mm/numa_64.c | 26 +++++++++------------
> arch/x86/mm/srat_32.c | 2 +-
> arch/x86/mm/srat_64.c | 12 +++++-----
> 15 files changed, 101 insertions(+), 56 deletions(-)
>
> diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
> index edc2a45..9c7d95f 100644
> --- a/arch/x86/include/asm/mpspec.h
> +++ b/arch/x86/include/asm/mpspec.h
> @@ -25,7 +25,6 @@ extern int pic_mode;
> #define MAX_IRQ_SOURCES 256
>
> extern unsigned int def_to_bigsmp;
> -extern u8 apicid_2_node[];
>
> #ifdef CONFIG_X86_NUMAQ
> extern int mp_bus_id_to_node[MAX_MP_BUSSES];
> diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
> index 27da400..5e01c76 100644
> --- a/arch/x86/include/asm/numa.h
> +++ b/arch/x86/include/asm/numa.h
> @@ -1,5 +1,33 @@
> +#ifndef _ASM_X86_NUMA_H
> +#define _ASM_X86_NUMA_H
> +
> +#include <asm/apicdef.h>
> +
> +#ifdef CONFIG_NUMA
> +/*
> + * __apicid_to_node[] stores the raw mapping between physical apicid and
> + * node and is used to initialize cpu_to_node mapping.
> + *
> + * The mapping may be overridden by apic->numa_cpu_node() on 32bit and thus
> + * should be accessed by the accessors - set_apicid_to_node() and
> + * numa_cpu_node().
> + */
> +extern s16 __apicid_to_node[MAX_LOCAL_APIC];
> +
> +static inline void set_apicid_to_node(int apicid, s16 node)
> +{
> + __apicid_to_node[apicid] = node;
> +}
> +#else /* CONFIG_NUMA */
> +static inline void set_apicid_to_node(int apicid, s16 node)
> +{
> +}
> +#endif /* CONFIG_NUMA */
> +
> #ifdef CONFIG_X86_32
> # include "numa_32.h"
> #else
> # include "numa_64.h"
> #endif
> +
> +#endif /* _ASM_X86_NUMA_H */
> diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h
> index b0ef2b4..cdf8043 100644
> --- a/arch/x86/include/asm/numa_32.h
> +++ b/arch/x86/include/asm/numa_32.h
> @@ -6,6 +6,12 @@ extern int numa_off;
> extern int pxm_to_nid(int pxm);
> extern void numa_remove_cpu(int cpu);
>
> +#ifdef CONFIG_NUMA
> +extern int __cpuinit numa_cpu_node(int apicid);

cpu or apicid?

> +#else /* CONFIG_NUMA */
> +static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; }
> +#endif /* CONFIG_NUMA */
> +
> #ifdef CONFIG_HIGHMEM
> extern void set_highmem_pages_init(void);
> #else
> diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
> index 0493be3..4982a9c 100644
> --- a/arch/x86/include/asm/numa_64.h
> +++ b/arch/x86/include/asm/numa_64.h
> @@ -2,7 +2,6 @@
> #define _ASM_X86_NUMA_64_H
>
> #include <linux/nodemask.h>
> -#include <asm/apicdef.h>
>
> struct bootnode {
> u64 start;
> @@ -17,8 +16,6 @@ extern int compute_hash_shift(struct bootnode *nodes, int numblks,
> extern void numa_init_array(void);
> extern int numa_off;
>
> -extern s16 apicid_to_node[MAX_LOCAL_APIC];
> -
> extern unsigned long numa_free_all_bootmem(void);
> extern void setup_node_bootmem(int nodeid, unsigned long start,
> unsigned long end);
> @@ -32,6 +29,7 @@ extern void setup_node_bootmem(int nodeid, unsigned long start,
> #define NODE_MIN_SIZE (4*1024*1024)
>
> extern void __init init_cpu_to_node(void);
> +extern int __cpuinit numa_cpu_node(int cpu);
> extern void __cpuinit numa_set_node(int cpu, int node);
> extern void __cpuinit numa_clear_node(int cpu);
> extern void __cpuinit numa_add_cpu(int cpu);
> @@ -44,6 +42,7 @@ void numa_emu_cmdline(char *);
> #endif /* CONFIG_NUMA_EMU */
> #else
> static inline void init_cpu_to_node(void) { }
> +static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; }
> static inline void numa_set_node(int cpu, int node) { }
> static inline void numa_clear_node(int cpu) { }
> static inline void numa_add_cpu(int cpu, int node) { }
> diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
> index b3a7113..a7bca59 100644
> --- a/arch/x86/kernel/acpi/boot.c
> +++ b/arch/x86/kernel/acpi/boot.c
> @@ -589,11 +589,10 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
> nid = acpi_get_node(handle);
> if (nid == -1 || !node_online(nid))
> return;
> + set_apicid_to_node(physid, nid);
> #ifdef CONFIG_X86_64
> - apicid_to_node[physid] = nid;
> numa_set_node(cpu, nid);
> #else /* CONFIG_X86_32 */
> - apicid_2_node[physid] = nid;
> cpu_to_node_map[cpu] = nid;
> #endif
>
> diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
> index 0f4f3c1..4686ea5 100644
> --- a/arch/x86/kernel/apic/apic.c
> +++ b/arch/x86/kernel/apic/apic.c
> @@ -2026,7 +2026,7 @@ int default_x86_32_numa_cpu_node(int cpu)
> int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
>
> if (apicid != BAD_APICID)
> - return apicid_2_node[apicid];
> + return __apicid_to_node[apicid];
> return NUMA_NO_NODE;
> #else
> return 0;
> diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
> index 7c7bedb..3cce8f2 100644
> --- a/arch/x86/kernel/cpu/amd.c
> +++ b/arch/x86/kernel/cpu/amd.c
> @@ -234,17 +234,21 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
> #endif
>
> #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
> +/*
> + * To workaround broken NUMA config. Read the comment in
> + * srat_detect_node().
> + */
> static int __cpuinit nearby_node(int apicid)
> {
> int i, node;
>
> for (i = apicid - 1; i >= 0; i--) {
> - node = apicid_to_node[i];
> + node = __apicid_to_node[i];
> if (node != NUMA_NO_NODE && node_online(node))
> return node;
> }
> for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
> - node = apicid_to_node[i];
> + node = __apicid_to_node[i];
> if (node != NUMA_NO_NODE && node_online(node))
> return node;
> }
> @@ -339,26 +343,35 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
> int node;
> unsigned apicid = c->apicid;
>
> - node = per_cpu(cpu_llc_id, cpu);
> + node = numa_cpu_node(cpu);
> + if (node == NUMA_NO_NODE)
> + node = per_cpu(cpu_llc_id, cpu);
>
> - if (apicid_to_node[apicid] != NUMA_NO_NODE)
> - node = apicid_to_node[apicid];
> if (!node_online(node)) {
> - /* Two possibilities here:
> - - The CPU is missing memory and no node was created.
> - In that case try picking one from a nearby CPU
> - - The APIC IDs differ from the HyperTransport node IDs
> - which the K8 northbridge parsing fills in.
> - Assume they are all increased by a constant offset,
> - but in the same order as the HT nodeids.
> - If that doesn't result in a usable node fall back to the
> - path for the previous case. */
> -
> + /*
> + * Two possibilities here:
> + *
> + * - The CPU is missing memory and no node was created. In
> + * that case try picking one from a nearby CPU.
> + *
> + * - The APIC IDs differ from the HyperTransport node IDs
> + * which the K8 northbridge parsing fills in. Assume
> + * they are all increased by a constant offset, but in
> + * the same order as the HT nodeids. If that doesn't
> + * result in a usable node fall back to the path for the
> + * previous case.
> + *
> + * This workaround operates directly on the mapping between
> + * APIC ID and NUMA node, assuming certain relationship
> + * between APIC ID, HT node ID and NUMA topology. As going
> + * through CPU mapping may alter the outcome, directly
> + * access __apicid_to_node[].
> + */
> int ht_nodeid = c->initial_apicid;
>
> if (ht_nodeid >= 0 &&
> - apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
> - node = apicid_to_node[ht_nodeid];
> + __apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
> + node = __apicid_to_node[ht_nodeid];
> /* Pick a nearby node */
> if (!node_online(node))
> node = nearby_node(apicid);
> diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
> index d16c2c5..6052004 100644
> --- a/arch/x86/kernel/cpu/intel.c
> +++ b/arch/x86/kernel/cpu/intel.c
> @@ -279,11 +279,10 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
> #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
> unsigned node;
> int cpu = smp_processor_id();
> - int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
>
> /* Don't do the funky fallback heuristics the AMD version employs
> for now. */
> - node = apicid_to_node[apicid];
> + node = numa_cpu_node(cpu);
> if (node == NUMA_NO_NODE || !node_online(node)) {
> /* reuse the value from init_cpu_to_node() */
> node = cpu_to_node(cpu);
> diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
> index 5319cdd..b7cfce5 100644
> --- a/arch/x86/kernel/smpboot.c
> +++ b/arch/x86/kernel/smpboot.c
> @@ -71,10 +71,6 @@
> #include <asm/smpboot_hooks.h>
> #include <asm/i8259.h>
>
> -#ifdef CONFIG_X86_32
> -u8 apicid_2_node[MAX_LOCAL_APIC];
> -#endif
> -
> /* State of each CPU */
> DEFINE_PER_CPU(int, cpu_state) = { 0 };
>
> @@ -170,7 +166,7 @@ static void map_cpu_to_logical_apicid(void)
> int cpu = smp_processor_id();
> int node;
>
> - node = apic->x86_32_numa_cpu_node(cpu);
> + node = numa_cpu_node(cpu);
> if (!node_online(node))
> node = first_online_node;
>
> diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology_64.c
> index f21962c..c7fae38 100644
> --- a/arch/x86/mm/amdtopology_64.c
> +++ b/arch/x86/mm/amdtopology_64.c
> @@ -247,7 +247,7 @@ void __init amd_fake_nodes(const struct bootnode *nodes, int nr_nodes)
> __acpi_map_pxm_to_node(nid, i);
> #endif
> }
> - memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
> + memcpy(__apicid_to_node, fake_apicid_to_node, sizeof(__apicid_to_node));
> }
> #endif /* CONFIG_NUMA_EMU */
>
> @@ -285,7 +285,7 @@ int __init amd_scan_nodes(void)
> nodes[i].start >> PAGE_SHIFT,
> nodes[i].end >> PAGE_SHIFT);
> for (j = apicid_base; j < cores + apicid_base; j++)
> - apicid_to_node[(i << bits) + j] = i;
> + set_apicid_to_node((i << bits) + j, i);
> setup_node_bootmem(i, nodes[i].start, nodes[i].end);
> }
>
> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> index ebf6d78..480b357 100644
> --- a/arch/x86/mm/numa.c
> +++ b/arch/x86/mm/numa.c
> @@ -26,8 +26,12 @@ static __init int numa_setup(char *opt)
> early_param("numa", numa_setup);
>
> /*
> - * Which logical CPUs are on which nodes
> + * apicid, cpu, node mappings
> */
> +s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
> + [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
> +};
> +
> cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
> EXPORT_SYMBOL(node_to_cpumask_map);
>
> diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
> index 84a3e4c..8d91d22 100644
> --- a/arch/x86/mm/numa_32.c
> +++ b/arch/x86/mm/numa_32.c
> @@ -110,6 +110,12 @@ void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
>
> static unsigned long kva_start_pfn;
> static unsigned long kva_pages;
> +
> +int __cpuinit numa_cpu_node(int cpu)
> +{
> + return apic->x86_32_numa_cpu_node(cpu);
> +}
> +
> /*
> * FLAT - support for basic PC memory model with discontig enabled, essentially
> * a single node with all available processors in it with a flat
> diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
> index 95ea155..1e1026f 100644
> --- a/arch/x86/mm/numa_64.c
> +++ b/arch/x86/mm/numa_64.c
> @@ -26,10 +26,6 @@ EXPORT_SYMBOL(node_data);
>
> struct memnode memnode;
>
> -s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
> - [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
> -};
> -
> static unsigned long __initdata nodemap_addr;
> static unsigned long __initdata nodemap_size;
>
> @@ -716,12 +712,8 @@ void __init init_cpu_to_node(void)
> BUG_ON(cpu_to_apicid == NULL);
>
> for_each_possible_cpu(cpu) {
> - int node;
> - u16 apicid = cpu_to_apicid[cpu];
> + int node = numa_cpu_node(cpu);
>
> - if (apicid == BAD_APICID)
> - continue;
> - node = apicid_to_node[apicid];
> if (node == NUMA_NO_NODE)
> continue;
> if (!node_online(node))
> @@ -731,6 +723,14 @@ void __init init_cpu_to_node(void)
> }
> #endif
>
> +int __cpuinit numa_cpu_node(int cpu)
> +{
> + int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
> +
> + if (apicid != BAD_APICID)
> + return __apicid_to_node[apicid];
> + return NUMA_NO_NODE;
> +}

it should be changed to cpu_to_node_via_apicid(), it could return not onlined node, aka node without memory.

So don't mess it up with cpu_to_node()

Yinghai

>
> void __cpuinit numa_set_node(int cpu, int node)
> {
> @@ -776,13 +776,9 @@ void __cpuinit numa_remove_cpu(int cpu)
> void __cpuinit numa_add_cpu(int cpu)
> {
> unsigned long addr;
> - u16 apicid;
> - int physnid;
> - int nid = NUMA_NO_NODE;
> + int physnid, nid;
>
> - apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
> - if (apicid != BAD_APICID)
> - nid = apicid_to_node[apicid];
> + nid = numa_cpu_node(cpu);
> if (nid == NUMA_NO_NODE)
> nid = early_cpu_to_node(cpu);
> BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
> diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
> index 6027a48..48651c6 100644
> --- a/arch/x86/mm/srat_32.c
> +++ b/arch/x86/mm/srat_32.c
> @@ -255,7 +255,7 @@ int __init get_memcfg_from_srat(void)
> num_memory_chunks);
>
> for (i = 0; i < MAX_LOCAL_APIC; i++)
> - apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]);
> + set_apicid_to_node(i, pxm_to_node(apicid_to_pxm[i]));
>
> for (j = 0; j < num_memory_chunks; j++){
> struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
> diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
> index 603d285..9a97261 100644
> --- a/arch/x86/mm/srat_64.c
> +++ b/arch/x86/mm/srat_64.c
> @@ -79,7 +79,7 @@ static __init void bad_srat(void)
> printk(KERN_ERR "SRAT: SRAT not used.\n");
> acpi_numa = -1;
> for (i = 0; i < MAX_LOCAL_APIC; i++)
> - apicid_to_node[i] = NUMA_NO_NODE;
> + set_apicid_to_node(i, NUMA_NO_NODE);
> for (i = 0; i < MAX_NUMNODES; i++) {
> nodes[i].start = nodes[i].end = 0;
> nodes_add[i].start = nodes_add[i].end = 0;
> @@ -138,7 +138,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
> printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
> return;
> }
> - apicid_to_node[apic_id] = node;
> + set_apicid_to_node(apic_id, node);
> node_set(node, cpu_nodes_parsed);
> acpi_numa = 1;
> printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
> @@ -178,7 +178,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
> return;
> }
>
> - apicid_to_node[apic_id] = node;
> + set_apicid_to_node(apic_id, node);
> node_set(node, cpu_nodes_parsed);
> acpi_numa = 1;
> printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
> @@ -521,7 +521,7 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
> * node, it must now point to the fake node ID.
> */
> for (j = 0; j < MAX_LOCAL_APIC; j++)
> - if (apicid_to_node[j] == nid &&
> + if (__apicid_to_node[j] == nid &&
> fake_apicid_to_node[j] == NUMA_NO_NODE)
> fake_apicid_to_node[j] = i;
> }
> @@ -532,13 +532,13 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
> * value.
> */
> for (i = 0; i < MAX_LOCAL_APIC; i++)
> - if (apicid_to_node[i] != NUMA_NO_NODE &&
> + if (__apicid_to_node[i] != NUMA_NO_NODE &&
> fake_apicid_to_node[i] == NUMA_NO_NODE)
> fake_apicid_to_node[i] = 0;
>
> for (i = 0; i < num_nodes; i++)
> __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
> - memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
> + memcpy(__apicid_to_node, fake_apicid_to_node, sizeof(__apicid_to_node));
>
> nodes_clear(nodes_parsed);
> for (i = 0; i < num_nodes; i++)
> --
> To unsubscribe from this list: send the line "unsubscribe linux-tip-commits" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/