Re: [RFC PATCH] mm: support CONFIG_ZONE_DEVICE + CONFIG_ZONE_DMA

From: Vlastimil Babka
Date: Tue Jan 26 2016 - 16:42:41 EST


On 26.1.2016 1:06, Dan Williams wrote:
> It appears devices requiring ZONE_DMA are still prevalent (see link
> below). For this reason the proposal to require turning off ZONE_DMA to
> enable ZONE_DEVICE is untenable in the short term. We want a single
> kernel image to be able to support legacy devices as well as next
> generation persistent memory platforms.
>
> Towards this end, alias ZONE_DMA and ZONE_DEVICE to work around needing
> to maintain a unique zone number for ZONE_DEVICE. Record the geometry
> of ZONE_DMA at init (->init_spanned_pages) and use that information in
> is_zone_device_page() to differentiate pages allocated via
> devm_memremap_pages() vs true ZONE_DMA pages. Otherwise, use the
> simpler definition of is_zone_device_page() when ZONE_DMA is turned off.
>
> Note that this also teaches the memory hot remove path that the zone may
> not have sections for all pfn spans (->zone_dyn_start_pfn).
>
> A user visible implication of this change is potentially an unexpectedly
> high "spanned" value in /proc/zoneinfo for the DMA zone.

[+CC Joonsoo, Laura]

This sounds like quite a hack :( Would it be possible to instead extend the
number of bits in page->flags that encode the zone? Potentially, ZONE_CMA
could be added one day as well...

> Cc: H. Peter Anvin <hpa@xxxxxxxxx>
> Cc: Ingo Molnar <mingo@xxxxxxxxxx>
> Cc: Rik van Riel <riel@xxxxxxxxxx>
> Cc: Mel Gorman <mgorman@xxxxxxx>
> Cc: Jerome Glisse <j.glisse@xxxxxxxxx>
> Cc: Christoph Hellwig <hch@xxxxxx>
> Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=110931
> Fixes: 033fbae988fc ("mm: ZONE_DEVICE for "device memory"")
> Reported-by: Sudip Mukherjee <sudipm.mukherjee@xxxxxxxxx>
> Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>
> ---
> include/linux/mm.h | 46 ++++++++++++++++++++++++++++++++--------------
> include/linux/mmzone.h | 24 ++++++++++++++++++++----
> mm/Kconfig | 1 -
> mm/memory_hotplug.c | 15 +++++++++++----
> mm/page_alloc.c | 9 ++++++---
> 5 files changed, 69 insertions(+), 26 deletions(-)
>
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index f1cd22f2df1a..b4bccd3d3c41 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -664,12 +664,44 @@ static inline enum zone_type page_zonenum(const struct page *page)
> return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK;
> }
>
> +#ifdef NODE_NOT_IN_PAGE_FLAGS
> +extern int page_to_nid(const struct page *page);
> +#else
> +static inline int page_to_nid(const struct page *page)
> +{
> + return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
> +}
> +#endif
> +
> +static inline struct zone *page_zone(const struct page *page)
> +{
> + return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
> +}
> +
> #ifdef CONFIG_ZONE_DEVICE
> void get_zone_device_page(struct page *page);
> void put_zone_device_page(struct page *page);
> static inline bool is_zone_device_page(const struct page *page)
> {
> +#ifndef CONFIG_ZONE_DMA
> return page_zonenum(page) == ZONE_DEVICE;
> +#else /* ZONE_DEVICE == ZONE_DMA */
> + struct zone *zone;
> +
> + if (page_zonenum(page) != ZONE_DEVICE)
> + return false;
> +
> + /*
> + * If ZONE_DEVICE is aliased with ZONE_DMA we need to check
> + * whether this was a dynamically allocated page from
> + * devm_memremap_pages() by checking against the size of
> + * ZONE_DMA at boot.
> + */
> + zone = page_zone(page);
> + if (page_to_pfn(page) <= zone_end_pfn_boot(zone))
> + return false;
> + return true;
> +#endif
> }
> #else
> static inline void get_zone_device_page(struct page *page)
> @@ -735,15 +767,6 @@ static inline int zone_to_nid(struct zone *zone)
> #endif
> }
>
> -#ifdef NODE_NOT_IN_PAGE_FLAGS
> -extern int page_to_nid(const struct page *page);
> -#else
> -static inline int page_to_nid(const struct page *page)
> -{
> - return (page->flags >> NODES_PGSHIFT) & NODES_MASK;
> -}
> -#endif
> -
> #ifdef CONFIG_NUMA_BALANCING
> static inline int cpu_pid_to_cpupid(int cpu, int pid)
> {
> @@ -857,11 +880,6 @@ static inline bool cpupid_match_pid(struct task_struct *task, int cpupid)
> }
> #endif /* CONFIG_NUMA_BALANCING */
>
> -static inline struct zone *page_zone(const struct page *page)
> -{
> - return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
> -}
> -
> #ifdef SECTION_IN_PAGE_FLAGS
> static inline void set_page_section(struct page *page, unsigned long section)
> {
> diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
> index 33bb1b19273e..a0ef09b7f893 100644
> --- a/include/linux/mmzone.h
> +++ b/include/linux/mmzone.h
> @@ -288,6 +288,13 @@ enum zone_type {
> */
> ZONE_DMA,
> #endif
> +#ifdef CONFIG_ZONE_DEVICE
> +#ifndef CONFIG_ZONE_DMA
> + ZONE_DEVICE,
> +#else
> + ZONE_DEVICE = ZONE_DMA,
> +#endif
> +#endif
> #ifdef CONFIG_ZONE_DMA32
> /*
> * x86_64 needs two ZONE_DMAs because it supports devices that are
> @@ -314,11 +321,7 @@ enum zone_type {
> ZONE_HIGHMEM,
> #endif
> ZONE_MOVABLE,
> -#ifdef CONFIG_ZONE_DEVICE
> - ZONE_DEVICE,
> -#endif
> __MAX_NR_ZONES
> -
> };
>
> #ifndef __GENERATING_BOUNDS_H
> @@ -379,12 +382,19 @@ struct zone {
>
> /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
> unsigned long zone_start_pfn;
> + /* first dynamically added pfn of the zone */
> + unsigned long zone_dyn_start_pfn;
>
> /*
> * spanned_pages is the total pages spanned by the zone, including
> * holes, which is calculated as:
> * spanned_pages = zone_end_pfn - zone_start_pfn;
> *
> + * init_spanned_pages is the boot/init time total pages spanned
> + * by the zone for differentiating statically assigned vs
> + * dynamically hot added memory to a zone.
> + * init_spanned_pages = init_zone_end_pfn - zone_start_pfn;
> + *
> * present_pages is physical pages existing within the zone, which
> * is calculated as:
> * present_pages = spanned_pages - absent_pages(pages in holes);
> @@ -423,6 +433,7 @@ struct zone {
> */
> unsigned long managed_pages;
> unsigned long spanned_pages;
> + unsigned long init_spanned_pages;
> unsigned long present_pages;
>
> const char *name;
> @@ -546,6 +557,11 @@ static inline unsigned long zone_end_pfn(const struct zone *zone)
> return zone->zone_start_pfn + zone->spanned_pages;
> }
>
> +static inline unsigned long zone_end_pfn_boot(const struct zone *zone)
> +{
> + return zone->zone_start_pfn + zone->init_spanned_pages;
> +}
> +
> static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn)
> {
> return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone);
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 97a4e06b15c0..08a92a9c8fbd 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -652,7 +652,6 @@ config IDLE_PAGE_TRACKING
> config ZONE_DEVICE
> bool "Device memory (pmem, etc...) hotplug support" if EXPERT
> default !ZONE_DMA
> - depends on !ZONE_DMA
> depends on MEMORY_HOTPLUG
> depends on MEMORY_HOTREMOVE
> depends on X86_64 #arch_add_memory() comprehends device memory
> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
> index 4af58a3a8ffa..c3f0ff45bd47 100644
> --- a/mm/memory_hotplug.c
> +++ b/mm/memory_hotplug.c
> @@ -300,6 +300,8 @@ static void __meminit grow_zone_span(struct zone *zone, unsigned long start_pfn,
>
> zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
> zone->zone_start_pfn;
> + if (!zone->zone_dyn_start_pfn || start_pfn < zone->zone_dyn_start_pfn)
> + zone->zone_dyn_start_pfn = start_pfn;
>
> zone_span_writeunlock(zone);
> }
> @@ -601,8 +603,9 @@ static int find_biggest_section_pfn(int nid, struct zone *zone,
> static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
> unsigned long end_pfn)
> {
> - unsigned long zone_start_pfn = zone->zone_start_pfn;
> + unsigned long zone_start_pfn = zone->zone_dyn_start_pfn;
> unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
> + bool dyn_zone = zone->zone_start_pfn == zone_start_pfn;
> unsigned long zone_end_pfn = z;
> unsigned long pfn;
> struct mem_section *ms;
> @@ -619,7 +622,9 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
> pfn = find_smallest_section_pfn(nid, zone, end_pfn,
> zone_end_pfn);
> if (pfn) {
> - zone->zone_start_pfn = pfn;
> + if (dyn_zone)
> + zone->zone_start_pfn = pfn;
> + zone->zone_dyn_start_pfn = pfn;
> zone->spanned_pages = zone_end_pfn - pfn;
> }
> } else if (zone_end_pfn == end_pfn) {
> @@ -661,8 +666,10 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
> }
>
> /* The zone has no valid section */
> - zone->zone_start_pfn = 0;
> - zone->spanned_pages = 0;
> + if (dyn_zone)
> + zone->zone_start_pfn = 0;
> + zone->zone_dyn_start_pfn = 0;
> + zone->spanned_pages = zone->init_spanned_pages;
> zone_span_writeunlock(zone);
> }
>
> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> index 63358d9f9aa9..2d8b1d602ff3 100644
> --- a/mm/page_alloc.c
> +++ b/mm/page_alloc.c
> @@ -209,6 +209,10 @@ EXPORT_SYMBOL(totalram_pages);
> static char * const zone_names[MAX_NR_ZONES] = {
> #ifdef CONFIG_ZONE_DMA
> "DMA",
> +#else
> +#ifdef CONFIG_ZONE_DEVICE
> + "Device",
> +#endif
> #endif
> #ifdef CONFIG_ZONE_DMA32
> "DMA32",
> @@ -218,9 +222,6 @@ static char * const zone_names[MAX_NR_ZONES] = {
> "HighMem",
> #endif
> "Movable",
> -#ifdef CONFIG_ZONE_DEVICE
> - "Device",
> -#endif
> };
>
> compound_page_dtor * const compound_page_dtors[] = {
> @@ -5082,6 +5083,8 @@ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
> node_start_pfn, node_end_pfn,
> zholes_size);
> zone->spanned_pages = size;
> + zone->init_spanned_pages = size;
> + zone->zone_dyn_start_pfn = 0;
> zone->present_pages = real_size;
>
> totalpages += size;
>
> --
> To unsubscribe, send a message with 'unsubscribe linux-mm' in
> the body to majordomo@xxxxxxxxxx For more info on Linux MM,
> see: http://www.linux-mm.org/ .
> Don't email: <a href="mailto:dont@xxxxxxxxx">email@xxxxxxxxx</a>
>