Re: [PATCH] mm/alloc: fallback to first node if the wanted node offline

From: Michal Hocko
Date: Mon Dec 10 2018 - 07:37:45 EST


On Fri 07-12-18 16:56:27, Michal Hocko wrote:
> On Fri 07-12-18 22:27:13, Pingfan Liu wrote:
> [...]
> > diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> > index 1308f54..4dc497d 100644
> > --- a/arch/x86/mm/numa.c
> > +++ b/arch/x86/mm/numa.c
> > @@ -754,18 +754,23 @@ void __init init_cpu_to_node(void)
> > {
> > int cpu;
> > u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
> > + int node, nr;
> >
> > BUG_ON(cpu_to_apicid == NULL);
> > + nr = cpumask_weight(cpu_possible_mask);
> > +
> > + /* bring up all possible node, since dev->numa_node */
> > + //should check acpi works for node possible,
> > + for_each_node(node)
> > + if (!node_online(node))
> > + init_memory_less_node(node);
>
> I suspect there is no change if you replace for_each_node by
> for_each_node_mask(nid, node_possible_map)
>
> here. If that is the case then we are probably calling
> free_area_init_node too early. I do not see it yet though.

OK, so it is not about calling it too late or too early. It is just that
node_possible_map is a misnomer and it has different semantics than
I expected. numa_nodemask_from_meminfo simply considers only nodes
with some memory. So my patch didn't really make any difference and the
node stayed uninitialized.

In other words, does the following work? I am sorry to make a wild guess
like this, but I am not able to recreate your setup to play with this myself.

diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 1308f5408bf7..d51643e10d00 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -216,8 +216,6 @@ static void __init alloc_node_data(int nid)

node_data[nid] = nd;
memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
-
- node_set_online(nid);
}

/**
@@ -527,6 +525,19 @@ static void __init numa_clear_kernel_node_hotplug(void)
}
}

+static void __init init_memory_less_node(int nid)
+{
+ unsigned long zones_size[MAX_NR_ZONES] = {0};
+ unsigned long zholes_size[MAX_NR_ZONES] = {0};
+
+ free_area_init_node(nid, zones_size, 0, zholes_size);
+
+ /*
+ * All zonelists will be built later in start_kernel() after per cpu
+ * areas are initialized.
+ */
+}
+
static int __init numa_register_memblks(struct numa_meminfo *mi)
{
unsigned long uninitialized_var(pfn_align);
@@ -570,7 +581,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
return -EINVAL;

/* Finally register nodes. */
- for_each_node_mask(nid, node_possible_map) {
+ for_each_node(nid) {
u64 start = PFN_PHYS(max_pfn);
u64 end = 0;

@@ -592,6 +603,10 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
continue;

alloc_node_data(nid);
+ if (!end)
+ init_memory_less_node(nid);
+ else
+ node_set_online(nid);
}

/* Dump memblock with node info and return. */
@@ -721,21 +736,6 @@ void __init x86_numa_init(void)
numa_init(dummy_numa_init);
}

-static void __init init_memory_less_node(int nid)
-{
- unsigned long zones_size[MAX_NR_ZONES] = {0};
- unsigned long zholes_size[MAX_NR_ZONES] = {0};
-
- /* Allocate and initialize node data. Memory-less node is now online.*/
- alloc_node_data(nid);
- free_area_init_node(nid, zones_size, 0, zholes_size);
-
- /*
- * All zonelists will be built later in start_kernel() after per cpu
- * areas are initialized.
- */
-}
-
/*
* Setup early cpu_to_node.
*
@@ -763,9 +763,6 @@ void __init init_cpu_to_node(void)
if (node == NUMA_NO_NODE)
continue;

- if (!node_online(node))
- init_memory_less_node(node);
-
numa_set_node(cpu, node);
}
}
--
Michal Hocko
SUSE Labs