Re: 2.6.24 git2/mm1: cpu_to_node mapping to non-existent nodes causing boot failure

From: Mel Gorman
Date: Thu Feb 14 2008 - 15:17:56 EST


On (13/02/08 10:45), Mike Travis didst pronounce:
> Mel Gorman wrote:
> > On (03/02/08 17:16), Andrew Morton didst pronounce:
> >> ftp://ftp.kernel.org/pub/linux/kernel/people/akpm/patches/2.6/2.6.24/2.6.24-mm1/
> >>
> >
> > bl6-13 (4-way x86_64 machine) from test.kernel.org is failing to boot recent
> > -mm and mainline trees. I noticed it when testing -mm before rebasing other
> > patches but the oops on mainline looks the same. The full console log is
> > below but the important difference between a working and non-working kernel
> > is the following
> >
> > -PERCPU: Allocating 62512 bytes of per cpu data
> > -Built 1 zonelists in Node order, mobility grouping on. Total pages: 255875
> > +PERCPU: Allocating 65560 bytes of per cpu data
> > +cpu with no node 2, num_online_nodes 1
> > +cpu with no node 3, num_online_nodes 1
> > +Built 1 zonelists in Node order, mobility grouping on. Total pages:
> > 251257
> >
> > "cpu with no node 2" is actually saying that cpu 2 has no node and the
> > message is just misleading. The number of online nodes and cpu mappings
> > are not adding up as I got this from a debugging patch
>
> I'll take a closer look though I've not been able to duplicate your
> error yet. It does appear from the message text that the code is
> out-of-date. The latest "setup_per_cpu_areas()" should say:
>
> "cpu %d has no node, num_online_nodes %d\n",
> i, num_online_nodes());
>
> There are a number of backed up patches in the queue. I'm resubmitting
> the whole set re-based on 2.6.25-rc1 shortly. (I don't know though, that
> any will address this problem.)
>

According to git-bisect, the problem patch is below. It doesn't back out
cleanly so I haven't verified for sure the bisect is correct yet.

commit ef97001f3d869d7cc1956e0cc0d89e514e3f7db0
Author: travis@xxxxxxx <travis@xxxxxxx>
Date: Wed Jan 30 13:33:10 2008 +0100

x86: change size of APICIDs from u8 to u16

Change the size of APICIDs from u8 to u16. This partially
supports the new x2apic mode that will be present on future
processor chips. (Chips actually support 32-bit APICIDs, but that
change is more intrusive. Supporting 16-bit is sufficient for now).

Signed-off-by: Jack Steiner <steiner@xxxxxxx>

I've included just the partial change from u8 to u16 apicids. The
remaining x2apic changes will be in a separate patch.

In addition, the fake_node_to_pxm_map[] and fake_apicid_to_node[]
tables have been moved from local data to the __initdata section
reducing stack pressure when MAX_NUMNODES and MAX_LOCAL_APIC are
increased in size.

Signed-off-by: Mike Travis <travis@xxxxxxx>
Reviewed-by: Christoph Lameter <clameter@xxxxxxx>
Signed-off-by: Ingo Molnar <mingo@xxxxxxx>
Signed-off-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>

diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index ce703e2..ac2b78f 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -32,10 +32,10 @@
* array during this time. Is it zeroed when the per_cpu
* data area is removed.
*/
-u8 x86_cpu_to_apicid_init[NR_CPUS] __initdata
+u16 x86_cpu_to_apicid_init[NR_CPUS] __initdata
= { [0 ... NR_CPUS-1] = BAD_APICID };
void *x86_cpu_to_apicid_ptr;
-DEFINE_PER_CPU(u8, x86_cpu_to_apicid) = BAD_APICID;
+DEFINE_PER_CPU(u16, x86_cpu_to_apicid) = BAD_APICID;
EXPORT_PER_CPU_SYMBOL(x86_cpu_to_apicid);

struct genapic __read_mostly *genapic = &apic_flat;
diff --git a/arch/x86/kernel/mpparse_64.c b/arch/x86/kernel/mpparse_64.c
index ef4aab1..17d21e5 100644
--- a/arch/x86/kernel/mpparse_64.c
+++ b/arch/x86/kernel/mpparse_64.c
@@ -67,7 +67,7 @@ unsigned disabled_cpus __cpuinitdata;
/* Bitmask of physically existing CPUs */
physid_mask_t phys_cpu_present_map = PHYSID_MASK_NONE;

-u8 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };
+u16 bios_cpu_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID };


/*
@@ -132,7 +132,7 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
* area is created.
*/
if (x86_cpu_to_apicid_ptr) {
- u8 *x86_cpu_to_apicid = (u8 *)x86_cpu_to_apicid_ptr;
+ u16 *x86_cpu_to_apicid = (u16 *)x86_cpu_to_apicid_ptr;
x86_cpu_to_apicid[cpu] = m->mpc_apicid;
} else {
per_cpu(x86_cpu_to_apicid, cpu) = m->mpc_apicid;
diff --git a/arch/x86/kernel/smpboot_64.c b/arch/x86/kernel/smpboot_64.c
index 5bd42ce..1fea185 100644
--- a/arch/x86/kernel/smpboot_64.c
+++ b/arch/x86/kernel/smpboot_64.c
@@ -65,7 +65,7 @@ int smp_num_siblings = 1;
EXPORT_SYMBOL(smp_num_siblings);

/* Last level cache ID of each logical CPU */
-DEFINE_PER_CPU(u8, cpu_llc_id) = BAD_APICID;
+DEFINE_PER_CPU(u16, cpu_llc_id) = BAD_APICID;

/* Bitmask of currently online CPUs */
cpumask_t cpu_online_map __read_mostly;
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 551e359..650001a 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -626,7 +626,7 @@ void __init init_cpu_to_node(void)
int i;

for (i = 0; i < NR_CPUS; i++) {
- u8 apicid = x86_cpu_to_apicid_init[i];
+ u16 apicid = x86_cpu_to_apicid_init[i];

if (apicid == BAD_APICID)
continue;
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 5c0637e..b367bc3 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -130,6 +130,9 @@ void __init
acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
{
int pxm, node;
+ int apic_id;
+
+ apic_id = pa->apic_id;
if (srat_disabled())
return;
if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
@@ -145,10 +148,10 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
bad_srat();
return;
}
- apicid_to_node[pa->apic_id] = node;
+ apicid_to_node[apic_id] = node;
acpi_numa = 1;
printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n",
- pxm, pa->apic_id, node);
+ pxm, apic_id, node);
}

int update_end_of_memory(unsigned long end) {return -1;}
@@ -343,7 +346,8 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
/* First clean up the node list */
for (i = 0; i < MAX_NUMNODES; i++) {
cutoff_node(i, start, end);
- if ((nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
+ /* ZZZ why was this needed. At least add a comment */
+ if (nodes[i].end && (nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
unparse_node(i);
node_set_offline(i);
}
@@ -384,6 +388,12 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
}

#ifdef CONFIG_NUMA_EMU
+static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
+ [0 ... MAX_NUMNODES-1] = PXM_INVAL
+};
+static unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
+ [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
+};
static int __init find_node_by_addr(unsigned long addr)
{
int ret = NUMA_NO_NODE;
@@ -414,12 +424,6 @@ static int __init find_node_by_addr(unsigned long addr)
void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
{
int i, j;
- int fake_node_to_pxm_map[MAX_NUMNODES] = {
- [0 ... MAX_NUMNODES-1] = PXM_INVAL
- };
- unsigned char fake_apicid_to_node[MAX_LOCAL_APIC] = {
- [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
- };

printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
"topology.\n");
diff --git a/include/asm-x86/processor.h b/include/asm-x86/processor.h
index e701ac5..81ecfed 100644
--- a/include/asm-x86/processor.h
+++ b/include/asm-x86/processor.h
@@ -90,14 +90,14 @@ struct cpuinfo_x86 {
#ifdef CONFIG_SMP
cpumask_t llc_shared_map; /* cpus sharing the last level cache */
#endif
- unsigned char x86_max_cores; /* cpuid returned max cores value */
- unsigned char apicid;
- unsigned short x86_clflush_size;
+ u16 x86_max_cores; /* cpuid returned max cores value */
+ u16 apicid;
+ u16 x86_clflush_size;
#ifdef CONFIG_SMP
- unsigned char booted_cores; /* number of cores as seen by OS */
- __u8 phys_proc_id; /* Physical processor id. */
- __u8 cpu_core_id; /* Core id */
- __u8 cpu_index; /* index into per_cpu list */
+ u16 booted_cores; /* number of cores as seen by OS */
+ u16 phys_proc_id; /* Physical processor id. */
+ u16 cpu_core_id; /* Core id */
+ u16 cpu_index; /* index into per_cpu list */
#endif
} __attribute__((__aligned__(SMP_CACHE_BYTES)));

diff --git a/include/asm-x86/smp_64.h b/include/asm-x86/smp_64.h
index 2feddda..b1d5381 100644
--- a/include/asm-x86/smp_64.h
+++ b/include/asm-x86/smp_64.h
@@ -26,14 +26,14 @@ extern void unlock_ipi_call_lock(void);
extern int smp_call_function_mask(cpumask_t mask, void (*func)(void *),
void *info, int wait);

-extern u8 __initdata x86_cpu_to_apicid_init[];
+extern u16 __initdata x86_cpu_to_apicid_init[];
extern void *x86_cpu_to_apicid_ptr;
-extern u8 bios_cpu_apicid[];
+extern u16 bios_cpu_apicid[];

DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
DECLARE_PER_CPU(cpumask_t, cpu_core_map);
-DECLARE_PER_CPU(u8, cpu_llc_id);
-DECLARE_PER_CPU(u8, x86_cpu_to_apicid);
+DECLARE_PER_CPU(u16, cpu_llc_id);
+DECLARE_PER_CPU(u16, x86_cpu_to_apicid);

static inline int cpu_present_to_apicid(int mps_cpu)
{
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/