[RFC PATCH] mm: support CONFIG_ZONE_DEVICE + CONFIG_ZONE_DMA

From: Dan Williams
Date: Mon Jan 25 2016 - 19:07:13 EST


It appears devices requiring ZONE_DMA are still prevalent (see link
below). For this reason the proposal to require turning off ZONE_DMA to
enable ZONE_DEVICE is untenable in the short term. We want a single
kernel image to be able to support legacy devices as well as next
generation persistent memory platforms.

Towards this end, alias ZONE_DMA and ZONE_DEVICE to work around needing
to maintain a unique zone number for ZONE_DEVICE. Record the geometry
of ZONE_DMA at init (->init_spanned_pages) and use that information in
is_zone_device_page() to differentiate pages allocated via
devm_memremap_pages() from true ZONE_DMA pages. When ZONE_DMA is turned
off, keep the simpler definition of is_zone_device_page().

Note that this also teaches the memory hot remove path that the zone may
not have sections for all pfn spans (->zone_dyn_start_pfn).

A user-visible implication of this change is that the DMA zone may
report an unexpectedly high "spanned" value in /proc/zoneinfo.

Cc: H. Peter Anvin <hpa@xxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Rik van Riel <riel@xxxxxxxxxx>
Cc: Mel Gorman <mgorman@xxxxxxx>
Cc: Jerome Glisse <j.glisse@xxxxxxxxx>
Cc: Christoph Hellwig <hch@xxxxxx>
Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=110931
Fixes: 033fbae988fc ("mm: ZONE_DEVICE for "device memory"")
Reported-by: Sudip Mukherjee <sudipm.mukherjee@xxxxxxxxx>
Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>
---
include/linux/mm.h | 46 ++++++++++++++++++++++++++++++++--------------
include/linux/mmzone.h | 24 ++++++++++++++++++++----
mm/Kconfig | 1 -
mm/memory_hotplug.c | 15 +++++++++++----
mm/page_alloc.c | 9 ++++++---
5 files changed, 69 insertions(+), 26 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f1cd22f2df1a..b4bccd3d3c41 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -664,12 +664,44 @@ static inline enum zone_type page_zonenum(const struct page *page)
return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
}

+#ifdef NODE_NOT_IN_PAGE_FLAGS
+extern int page_to_nid(const struct page *page);
+#else
+static inline int page_to_nid(const struct page *page)
+{
+ return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
+}
+#endif
+
+static inline struct zone *page_zone(const struct page *page)
+{
+ return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
+}
+
#ifdef CONFIG_ZONE_DEVICE
void get_zone_device_page(struct page *page);
void put_zone_device_page(struct page *page);
static inline bool is_zone_device_page(const struct page *page)
{
+#ifndef CONFIG_ZONE_DMA
return page_zonenum(page) == ZONE_DEVICE;
+#else /* ZONE_DEVICE == ZONE_DMA */
+ struct zone *zone;
+
+ if (page_zonenum(page) != ZONE_DEVICE)
+ return false;
+
+ /*
+ * If ZONE_DEVICE is aliased with ZONE_DMA, a page came from
+ * devm_memremap_pages() only if it lies beyond the boot-time span
+ * of ZONE_DMA. zone_end_pfn_boot() is an exclusive end, so a pfn
+ * equal to it is already outside the boot-time zone.
+ */
+ zone = page_zone(page);
+ if (page_to_pfn(page) < zone_end_pfn_boot(zone))
+ return false;
+ return true;
+#endif
}
#else
static inline void get_zone_device_page(struct page *page)
@@ -735,15 +767,6 @@ static inline int zone_to_nid(struct zone *zone)
#endif
}

-#ifdef NODE_NOT_IN_PAGE_FLAGS
-extern int page_to_nid(const struct page *page);
-#else
-static inline int page_to_nid(const struct page *page)
-{
- return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
-}
-#endif
-
#ifdef CONFIG_NUMA_BALANCING
static inline int cpu_pid_to_cpupid(int cpu, int pid)
{
@@ -857,11 +880,6 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
}
#endif /* CONFIG_NUMA_BALANCING */

-static inline struct zone *page_zone(const struct page *page)
-{
- return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
-}
-
#ifdef SECTION_IN_PAGE_FLAGS
static inline void set_page_section(struct page *page, unsigned long section)
{
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 33bb1b19273e..a0ef09b7f893 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -288,6 +288,13 @@ enum zone_type {
*/
ZONE_DMA,
#endif
+#ifdef CONFIG_ZONE_DEVICE
+#ifndef CONFIG_ZONE_DMA
+ ZONE_DEVICE,
+#else
+ ZONE_DEVICE = ZONE_DMA,
+#endif
+#endif
#ifdef CONFIG_ZONE_DMA32
/*
* x86_64 needs two ZONE_DMAs because it supports devices that are
@@ -314,11 +321,7 @@ enum zone_type {
ZONE_HIGHMEM,
#endif
ZONE_MOVABLE,
-#ifdef CONFIG_ZONE_DEVICE
- ZONE_DEVICE,
-#endif
__MAX_NR_ZONES
-
};

#ifndef __GENERATING_BOUNDS_H
@@ -379,12 +382,19 @@ struct zone {

/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
unsigned long zone_start_pfn;
+ /* first dynamically added pfn of the zone */
+ unsigned long zone_dyn_start_pfn;

/*
* spanned_pages is the total pages spanned by the zone, including
* holes, which is calculated as:
* spanned_pages = zone_end_pfn - zone_start_pfn;
*
+ * init_spanned_pages is the boot/init time total pages spanned
+ * by the zone for differentiating statically assigned vs
+ * dynamically hot added memory to a zone.
+ * init_spanned_pages = zone_end_pfn_boot - zone_start_pfn;
+ *
* present_pages is physical pages existing within the zone, which
* is calculated as:
* present_pages = spanned_pages - absent_pages(pages in holes);
@@ -423,6 +433,7 @@ struct zone {
*/
unsigned long managed_pages;
unsigned long spanned_pages;
+ unsigned long init_spanned_pages;
unsigned long present_pages;

const char *name;
@@ -546,6 +557,11 @@ static inline unsigned long zone_end_pfn(const struct zone *zone)
return zone->zone_start_pfn + zone->spanned_pages;
}

+static inline unsigned long zone_end_pfn_boot(const struct zone *zone)
+{
+ return zone->zone_start_pfn + zone->init_spanned_pages;
+}
+
static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
{
return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone);
diff --git a/mm/Kconfig b/mm/Kconfig
index 97a4e06b15c0..08a92a9c8fbd 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -652,7 +652,6 @@ config IDLE_PAGE_TRACKING
config ZONE_DEVICE
bool "Device memory (pmem, etc...) hotplug support" if EXPERT
default !ZONE_DMA
- depends on !ZONE_DMA
depends on MEMORY_HOTPLUG
depends on MEMORY_HOTREMOVE
depends on X86_64 #arch_add_memory() comprehends device memory
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 4af58a3a8ffa..c3f0ff45bd47 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -300,6 +300,8 @@ static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn,

zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
zone->zone_start_pfn;
+ if (!zone->zone_dyn_start_pfn || start_pfn < zone->zone_dyn_start_pfn)
+ zone->zone_dyn_start_pfn = start_pfn;

zone_span_writeunlock(zone);
}
@@ -601,8 +603,9 @@ static int find_biggest_section_pfn(int nid, struct zone *zone,
static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
unsigned long end_pfn)
{
- unsigned long zone_start_pfn = zone->zone_start_pfn;
+ unsigned long zone_start_pfn = zone->zone_dyn_start_pfn;
unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
+ bool dyn_zone = zone->zone_start_pfn == zone_start_pfn;
unsigned long zone_end_pfn = z;
unsigned long pfn;
struct mem_section *ms;
@@ -619,7 +622,9 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
pfn = find_smallest_section_pfn(nid, zone, end_pfn,
zone_end_pfn);
if (pfn) {
- zone->zone_start_pfn = pfn;
+ if (dyn_zone)
+ zone->zone_start_pfn = pfn;
+ zone->zone_dyn_start_pfn = pfn;
zone->spanned_pages = zone_end_pfn - pfn;
}
} else if (zone_end_pfn == end_pfn) {
@@ -661,8 +666,10 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
}

/* The zone has no valid section */
- zone->zone_start_pfn = 0;
- zone->spanned_pages = 0;
+ if (dyn_zone)
+ zone->zone_start_pfn = 0;
+ zone->zone_dyn_start_pfn = 0;
+ zone->spanned_pages = zone->init_spanned_pages;
zone_span_writeunlock(zone);
}

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 63358d9f9aa9..2d8b1d602ff3 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -209,6 +209,10 @@ EXPORT_SYMBOL(totalram_pages);
static char * const zone_names[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
"DMA",
+#else
+#ifdef CONFIG_ZONE_DEVICE
+ "Device",
+#endif
#endif
#ifdef CONFIG_ZONE_DMA32
"DMA32",
@@ -218,9 +222,6 @@ static char * const zone_names[MAX_NR_ZONES] = {
"HighMem",
#endif
"Movable",
-#ifdef CONFIG_ZONE_DEVICE
- "Device",
-#endif
};

compound_page_dtor * const compound_page_dtors[] = {
@@ -5082,6 +5083,8 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
node_start_pfn, node_end_pfn,
zholes_size);
zone->spanned_pages = size;
+ zone->init_spanned_pages = size;
+ zone->zone_dyn_start_pfn = 0;
zone->present_pages = real_size;

totalpages += size;