Re: [PATCH] mm/alloc: fallback to first node if the wanted node offline

From: Pingfan Liu
Date: Thu Jan 10 2019 - 22:13:00 EST


On Tue, Jan 8, 2019 at 10:34 PM Michal Hocko <mhocko@xxxxxxxxxx> wrote:
>
> On Thu 20-12-18 10:19:34, Michal Hocko wrote:
> > On Thu 20-12-18 15:19:39, Pingfan Liu wrote:
> > > Hi Michal,
> > >
> > > WIth this patch applied on the old one, I got the following message.
> > > Please get it from attachment.
> > [...]
> > > [ 0.409637] NUMA: Node 1 [mem 0x00000000-0x0009ffff] + [mem 0x00100000-0x7fffffff] -> [mem 0x00000000-0x7fffffff]
> > > [ 0.419858] NUMA: Node 1 [mem 0x00000000-0x7fffffff] + [mem 0x100000000-0x47fffffff] -> [mem 0x00000000-0x47fffffff]
> > > [ 0.430356] NODE_DATA(0) allocated [mem 0x87efd4000-0x87effefff]
> > > [ 0.436325] NODE_DATA(0) on node 5
> > > [ 0.440092] Initmem setup node 0 [mem 0x0000000000000000-0x0000000000000000]
> > > [ 0.447078] node[0] zonelist:
> > > [ 0.450106] NODE_DATA(1) allocated [mem 0x47ffd5000-0x47fffffff]
> > > [ 0.456114] NODE_DATA(2) allocated [mem 0x87efa9000-0x87efd3fff]
> > > [ 0.462064] NODE_DATA(2) on node 5
> > > [ 0.465852] Initmem setup node 2 [mem 0x0000000000000000-0x0000000000000000]
> > > [ 0.472813] node[2] zonelist:
> > > [ 0.475846] NODE_DATA(3) allocated [mem 0x87ef7e000-0x87efa8fff]
> > > [ 0.481827] NODE_DATA(3) on node 5
> > > [ 0.485590] Initmem setup node 3 [mem 0x0000000000000000-0x0000000000000000]
> > > [ 0.492575] node[3] zonelist:
> > > [ 0.495608] NODE_DATA(4) allocated [mem 0x87ef53000-0x87ef7dfff]
> > > [ 0.501587] NODE_DATA(4) on node 5
> > > [ 0.505349] Initmem setup node 4 [mem 0x0000000000000000-0x0000000000000000]
> > > [ 0.512334] node[4] zonelist:
> > > [ 0.515370] NODE_DATA(5) allocated [mem 0x87ef28000-0x87ef52fff]
> > > [ 0.521384] NODE_DATA(6) allocated [mem 0x87eefd000-0x87ef27fff]
> > > [ 0.527329] NODE_DATA(6) on node 5
> > > [ 0.531091] Initmem setup node 6 [mem 0x0000000000000000-0x0000000000000000]
> > > [ 0.538076] node[6] zonelist:
> > > [ 0.541109] NODE_DATA(7) allocated [mem 0x87eed2000-0x87eefcfff]
> > > [ 0.547090] NODE_DATA(7) on node 5
> > > [ 0.550851] Initmem setup node 7 [mem 0x0000000000000000-0x0000000000000000]
> > > [ 0.557836] node[7] zonelist:
> >
> > OK, so it is clear that building zonelists this early is not going to
> > fly. We do not have the complete information yet. I am not sure when do
> > we get that at this moment but I suspect the we either need to move that
> > initialization to a sooner stage or we have to reconsider whether the
> > phase when we build zonelists really needs to consider only online numa
> > nodes.
> >
> > [...]
> > > [ 1.067658] percpu: Embedded 46 pages/cpu @(____ptrval____) s151552 r8192 d28672 u262144
> > > [ 1.075692] node[1] zonelist: 1:Normal 1:DMA32 1:DMA 5:Normal
> > > [ 1.081376] node[5] zonelist: 5:Normal 1:Normal 1:DMA32 1:DMA
> >
> > I hope to get to this before I leave for christmas vacation, if not I
> > will stare into it after then.
>
> I am sorry but I didn't get to this sooner. But I've got another idea. I
> concluded that the whole dance is simply bogus and we should treat
> memory less nodes, well, as nodes with no memory ranges rather than
> special case them. Could you give the following a spin please?
>
> ---
> diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
> index 1308f5408bf7..0e79445cfd85 100644
> --- a/arch/x86/mm/numa.c
> +++ b/arch/x86/mm/numa.c
> @@ -216,8 +216,6 @@ static void __init alloc_node_data(int nid)
>
> node_data[nid] = nd;
> memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
> -
> - node_set_online(nid);
> }
>
> /**
> @@ -535,6 +533,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> /* Account for nodes with cpus and no memory */
> node_possible_map = numa_nodes_parsed;
> numa_nodemask_from_meminfo(&node_possible_map, mi);
> + pr_info("parsed=%*pbl, possible=%*pbl\n", nodemask_pr_args(&numa_nodes_parsed), nodemask_pr_args(&node_possible_map));
> if (WARN_ON(nodes_empty(node_possible_map)))
> return -EINVAL;
>
> @@ -570,7 +569,7 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> return -EINVAL;
>
> /* Finally register nodes. */
> - for_each_node_mask(nid, node_possible_map) {
> + for_each_node_mask(nid, numa_nodes_parsed) {
> u64 start = PFN_PHYS(max_pfn);
> u64 end = 0;
>
> @@ -581,9 +580,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> end = max(mi->blk[i].end, end);
> }
>
> - if (start >= end)
> - continue;
> -
> /*
> * Don't confuse VM with a node that doesn't have the
> * minimum amount of memory:
> @@ -592,6 +588,8 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
> continue;
>
> alloc_node_data(nid);
> + if (end)
> + node_set_online(nid);
> }
>
> /* Dump memblock with node info and return. */
> @@ -721,21 +719,6 @@ void __init x86_numa_init(void)
> numa_init(dummy_numa_init);
> }
>
> -static void __init init_memory_less_node(int nid)
> -{
> - unsigned long zones_size[MAX_NR_ZONES] = {0};
> - unsigned long zholes_size[MAX_NR_ZONES] = {0};
> -
> - /* Allocate and initialize node data. Memory-less node is now online.*/
> - alloc_node_data(nid);
> - free_area_init_node(nid, zones_size, 0, zholes_size);
> -
> - /*
> - * All zonelists will be built later in start_kernel() after per cpu
> - * areas are initialized.
> - */
> -}
> -
> /*
> * Setup early cpu_to_node.
> *
> @@ -763,9 +746,6 @@ void __init init_cpu_to_node(void)
> if (node == NUMA_NO_NODE)
> continue;
>
> - if (!node_online(node))
> - init_memory_less_node(node);
> -
> numa_set_node(cpu, node);
> }
> }
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 2ec9cc407216..52e54d16662a 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -5234,6 +5234,8 @@ static void build_zonelists(pg_data_t *pgdat)
> int node, load, nr_nodes = 0;
> nodemask_t used_mask;
> int local_node, prev_node;
> + struct zone *zone;
> + struct zoneref *z;
>
> /* NUMA-aware ordering of nodes */
> local_node = pgdat->node_id;
> @@ -5259,6 +5261,11 @@ static void build_zonelists(pg_data_t *pgdat)
>
> build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
> build_thisnode_zonelists(pgdat);
> +
> + pr_info("node[%d] zonelist: ", pgdat->node_id);
> + for_each_zone_zonelist(zone, z, &pgdat->node_zonelists[ZONELIST_FALLBACK], MAX_NR_ZONES-1)
> + pr_cont("%d:%s ", zone_to_nid(zone), zone->name);
> + pr_cont("\n");
> }
>
> #ifdef CONFIG_HAVE_MEMORYLESS_NODES
> @@ -5361,10 +5368,11 @@ static void __build_all_zonelists(void *data)
> if (self && !node_online(self->node_id)) {
> build_zonelists(self);
> } else {
> - for_each_online_node(nid) {
> + for_each_node(nid) {
> pg_data_t *pgdat = NODE_DATA(nid);
>
> - build_zonelists(pgdat);
> + if (pgdat)
> + build_zonelists(pgdat);
> }
>
> #ifdef CONFIG_HAVE_MEMORYLESS_NODES
> @@ -6644,10 +6652,8 @@ static unsigned long __init find_min_pfn_for_node(int nid)
> for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
> min_pfn = min(min_pfn, start_pfn);
>
> - if (min_pfn == ULONG_MAX) {
> - pr_warn("Could not find start_pfn for node %d\n", nid);
> + if (min_pfn == ULONG_MAX)
> return 0;
> - }
>
> return min_pfn;
> }
> @@ -6991,8 +6997,12 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
> mminit_verify_pageflags_layout();
> setup_nr_node_ids();
> zero_resv_unavail();
> - for_each_online_node(nid) {
> + for_each_node(nid) {
> pg_data_t *pgdat = NODE_DATA(nid);
> +
> + if (!pgdat)
> + continue;
> +
> free_area_init_node(nid, NULL,
> find_min_pfn_for_node(nid), NULL);
>
Hi, this patch works! Feel free to use tested-by me

Best Regards
Pingfan