[PATCH 08/14] mm: meminit: Initialise remaining memory in parallel with kswapd

From: Mel Gorman
Date: Mon Apr 13 2015 - 06:19:45 EST


Memory is only partially initialised at the moment. When this patch is
applied the first task kswapd does is initialise all remaining memmap in
parallel. This has two advantages. The first is that it's parallelised,
should take less time and boot faster on large machines. The second is
that the initialisation is taking place on a CPU local to the memory being
initialised. The user-visible effect on large machines is that free memory
will appear to rapidly increase early in the lifetime of the system until
kswapd reports that all memory is initialised in the kernel log. It'll
report the time to initialise so that the speedup can be estimated. Once
initialised there should be no other user-visibile effects. Potentially
the initialisation could be further parallelised using all CPUs local to
a node but it's expected that a single CPU will be able to saturate the
memory bus during initialisation and the additional complexity is not
justified.

Signed-off-by: Mel Gorman <mgorman@xxxxxxx>
---
mm/internal.h | 6 +++
mm/mm_init.c | 1 +
mm/page_alloc.c | 116 ++++++++++++++++++++++++++++++++++++++++++++++++++++++--
mm/vmscan.c | 6 ++-
4 files changed, 123 insertions(+), 6 deletions(-)

diff --git a/mm/internal.h b/mm/internal.h
index c6718e2c051e..2fad7c066e5c 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -388,9 +388,15 @@ static inline void mminit_verify_zonelist(void)
#ifdef CONFIG_DEFERRED_MEM_INIT
#define __defermem_init __meminit
#define __defer_init __meminit
+
+void deferred_init_memmap(int nid);
#else
#define __defermem_init
#define __defer_init __init
+
+static inline void deferred_init_memmap(int nid)
+{
+}
#endif

/* mminit_validate_memmodel_limits is independent of CONFIG_DEBUG_MEMORY_INIT */
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 5f420f7fafa1..28fbf87b20aa 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -11,6 +11,7 @@
#include <linux/export.h>
#include <linux/memory.h>
#include <linux/notifier.h>
+#include <linux/sched.h>
#include "internal.h"

#ifdef CONFIG_DEBUG_MEMORY_INIT
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e15e74da31a5..10ba841c7609 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -252,6 +252,14 @@ static inline bool __defermem_init early_page_uninitialised(unsigned long pfn)
return false;
}

+static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid)
+{
+ if (pfn >= NODE_DATA(nid)->first_deferred_pfn)
+ return true;
+
+ return false;
+}
+
/*
* Returns false when the remaining initialisation should be deferred until
* later in the boot cycle when it can be parallelised.
@@ -283,6 +291,11 @@ static inline bool early_page_uninitialised(unsigned long pfn)
return false;
}

+static inline bool early_page_nid_uninitialised(unsigned long pfn, int nid)
+{
+ return false;
+}
+
static inline bool update_defer_init(pg_data_t *pgdat,
unsigned long pfn,
unsigned long *nr_initialised)
@@ -879,14 +892,45 @@ static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone,
return __init_single_page(pfn_to_page(pfn), pfn, zone, nid);
}

-void reserve_bootmem_region(unsigned long start, unsigned long end)
+#ifdef CONFIG_DEFERRED_MEM_INIT
+static void init_reserved_page(unsigned long pfn)
+{
+ pg_data_t *pgdat;
+ int nid, zid;
+
+ if (!early_page_uninitialised(pfn))
+ return;
+
+ nid = early_pfn_to_nid(pfn);
+ pgdat = NODE_DATA(nid);
+
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ struct zone *zone = &pgdat->node_zones[zid];
+
+ if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
+ break;
+ }
+ __init_single_pfn(pfn, zid, nid);
+}
+#else
+static inline void init_reserved_page(unsigned long pfn)
+{
+}
+#endif /* CONFIG_DEFERRED_MEM_INIT */
+
+void __meminit reserve_bootmem_region(unsigned long start, unsigned long end)
{
unsigned long start_pfn = PFN_DOWN(start);
unsigned long end_pfn = PFN_UP(end);

- for (; start_pfn < end_pfn; start_pfn++)
- if (pfn_valid(start_pfn))
- SetPageReserved(pfn_to_page(start_pfn));
+ for (; start_pfn < end_pfn; start_pfn++) {
+ if (pfn_valid(start_pfn)) {
+ struct page *page = pfn_to_page(start_pfn);
+
+ init_reserved_page(start_pfn);
+ SetPageReserved(page);
+ }
+ }
}

static bool free_pages_prepare(struct page *page, unsigned int order)
@@ -1007,6 +1051,67 @@ void __defer_init __free_pages_bootmem(struct page *page, unsigned long pfn,
return __free_pages_boot_core(page, pfn, order);
}

+#ifdef CONFIG_DEFERRED_MEM_INIT
+/* Initialise remaining memory on a node */
+void __defermem_init deferred_init_memmap(int nid)
+{
+ unsigned long start = jiffies;
+ struct mminit_pfnnid_cache nid_init_state = { };
+
+ pg_data_t *pgdat = NODE_DATA(nid);
+ int zid;
+ unsigned long first_init_pfn = pgdat->first_deferred_pfn;
+
+ if (first_init_pfn == ULONG_MAX)
+ return;
+
+ /* Sanity check boundaries */
+ BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn);
+ BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
+ pgdat->first_deferred_pfn = ULONG_MAX;
+
+ for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+ struct zone *zone = pgdat->node_zones + zid;
+ unsigned long walk_start, walk_end;
+ int i;
+
+ for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) {
+ unsigned long pfn, end_pfn;
+
+ end_pfn = min(walk_end, zone_end_pfn(zone));
+ pfn = first_init_pfn;
+ if (pfn < walk_start)
+ pfn = walk_start;
+ if (pfn < zone->zone_start_pfn)
+ pfn = zone->zone_start_pfn;
+
+ for (; pfn < end_pfn; pfn++) {
+ struct page *page;
+
+ if (!pfn_valid(pfn))
+ continue;
+
+ if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state))
+ continue;
+
+ if (page->flags) {
+ VM_BUG_ON(page_zone(page) != zone);
+ continue;
+ }
+
+ __init_single_page(page, pfn, zid, nid);
+ __free_pages_boot_core(page, pfn, 0);
+ cond_resched();
+ }
+ first_init_pfn = max(end_pfn, first_init_pfn);
+ }
+ }
+
+ pr_info("kswapd %d initialised deferred memory in %ums\n", nid,
+ jiffies_to_msecs(jiffies - start));
+}
+#endif /* CONFIG_DEFERRED_MEM_INIT */
+
#ifdef CONFIG_CMA
/* Free whole pageblock and set its migration type to MIGRATE_CMA. */
void __init init_cma_reserved_pageblock(struct page *page)
@@ -4217,6 +4322,9 @@ static void setup_zone_migrate_reserve(struct zone *zone)
zone->nr_migrate_reserve_block = reserve;

for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
+ if (!early_page_nid_uninitialised(pfn, zone_to_nid(zone)))
+ return;
+
if (!pfn_valid(pfn))
continue;
page = pfn_to_page(pfn);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5e8eadd71bac..c4895d26d036 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -3348,7 +3348,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
* If there are applications that are active memory-allocators
* (most normal use), this basically shouldn't matter.
*/
-static int kswapd(void *p)
+static int __defermem_init kswapd(void *p)
{
unsigned long order, new_order;
unsigned balanced_order;
@@ -3383,6 +3383,8 @@ static int kswapd(void *p)
tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
set_freezable();

+ deferred_init_memmap(pgdat->node_id);
+
order = new_order = 0;
balanced_order = 0;
classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
@@ -3538,7 +3540,7 @@ static int cpu_callback(struct notifier_block *nfb, unsigned long action,
* This kswapd start function will be called by init and node-hot-add.
* On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
*/
-int kswapd_run(int nid)
+int __defermem_init kswapd_run(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
int ret = 0;
--
2.1.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/