[RFC PATCH 2/3] mm/memory_hotplug: Create __shrink_pages and move it to offline_pages

From: osalvador
Date: Tue Aug 07 2018 - 09:38:33 EST


From: Oscar Salvador <osalvador@xxxxxxx>

Currently, we decrement zone/node spanned_pages when we
__remove__ the memory.
This is not really great.

Incrementing of spanned pages is done in online_pages() path,
decrementing spanned pages should be moved to offline_pages().

This, besides making the core more consistent, helps in fixing
the bug explained in the cover letter, since from now on,
we will not touch any pages in remove_memory path.

This does all the work to accomplish this.
It removes __remove_zone() and creates __shrink_pages(), which does pretty much the same.

It also changes find_smallest/biggest_section_pfn to check for:

!online_section(ms)

instead of

!valid_section(ms)

as we offline the section in:

__offline_pages
offline_isolated_pages
offline_isolated_pages_cb
__offline_isolated_pages
offline_mem_sections

right before shrinking the pages.

Signed-off-by: Oscar Salvador <osalvador@xxxxxxx>
---
arch/ia64/mm/init.c | 4 +--
arch/powerpc/mm/mem.c | 11 +------
arch/sh/mm/init.c | 4 +--
arch/x86/mm/init_32.c | 4 +--
arch/x86/mm/init_64.c | 8 +-----
include/linux/memory_hotplug.h | 6 ++--
kernel/memremap.c | 5 ++++
mm/hmm.c | 2 +-
mm/memory_hotplug.c | 65 +++++++++++++++++++++---------------------
mm/sparse.c | 4 +--
10 files changed, 50 insertions(+), 63 deletions(-)

diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index bc5e15045cee..0029c4d3f574 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -664,11 +664,9 @@ int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
- struct zone *zone;
int ret;

- zone = page_zone(pfn_to_page(start_pfn));
- ret = __remove_pages(zone, start_pfn, nr_pages, altmap);
+ ret = __remove_pages(nid, start_pfn, nr_pages, altmap);
if (ret)
pr_warn("%s: Problem encountered in __remove_pages() as"
" ret=%d\n", __func__, ret);
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 9e17d57a9948..12879191bc6b 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -143,18 +143,9 @@ int __meminit arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altma
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
- struct page *page;
int ret;

- /*
- * If we have an altmap then we need to skip over any reserved PFNs
- * when querying the zone.
- */
- page = pfn_to_page(start_pfn);
- if (altmap)
- page += vmem_altmap_offset(altmap);
-
- ret = __remove_pages(page_zone(page), start_pfn, nr_pages, altmap);
+ ret = __remove_pages(nid, start_pfn, nr_pages, altmap);
if (ret)
return ret;

diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 55c740ab861b..4f183857b522 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -458,11 +458,9 @@ int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap)
{
unsigned long start_pfn = PFN_DOWN(start);
unsigned long nr_pages = size >> PAGE_SHIFT;
- struct zone *zone;
int ret;

- zone = page_zone(pfn_to_page(start_pfn));
- ret = __remove_pages(zone, start_pfn, nr_pages, altmap);
+ ret = __remove_pages(nid, start_pfn, nr_pages, altmap);
if (unlikely(ret))
pr_warn("%s: Failed, __remove_pages() == %d\n", __func__,
ret);
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 9fa503f2f56c..88909e88c873 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -865,10 +865,8 @@ int arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *altmap)
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
- struct zone *zone;

- zone = page_zone(pfn_to_page(start_pfn));
- return __remove_pages(zone, start_pfn, nr_pages, altmap);
+ return __remove_pages(nid, start_pfn, nr_pages, altmap);
}
#endif
#endif
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 26728df07072..1aad5ea2e274 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1152,15 +1152,9 @@ int __ref arch_remove_memory(int nid, u64 start, u64 size, struct vmem_altmap *a
{
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
- struct page *page = pfn_to_page(start_pfn);
- struct zone *zone;
int ret;

- /* With altmap the first mapped page is offset from @start */
- if (altmap)
- page += vmem_altmap_offset(altmap);
- zone = page_zone(page);
- ret = __remove_pages(zone, start_pfn, nr_pages, altmap);
+ ret = __remove_pages(nid, start_pfn, nr_pages, altmap);
WARN_ON_ONCE(ret);
kernel_physical_mapping_remove(start, start + size);

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index c68b03fd87bd..eff460ba3ab1 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -109,8 +109,10 @@ static inline bool movable_node_is_enabled(void)
#ifdef CONFIG_MEMORY_HOTREMOVE
extern int arch_remove_memory(int nid, u64 start, u64 size,
struct vmem_altmap *altmap);
-extern int __remove_pages(struct zone *zone, unsigned long start_pfn,
+extern int __remove_pages(int nid, unsigned long start_pfn,
unsigned long nr_pages, struct vmem_altmap *altmap);
+extern void shrink_pages(struct zone *zone, unsigned long start_pfn,
+ unsigned long nr_pages);
#endif /* CONFIG_MEMORY_HOTREMOVE */

/* reasonably generic interface to expand the physical pages */
@@ -333,7 +335,7 @@ extern bool is_memblock_offlined(struct memory_block *mem);
extern void remove_memory(int nid, u64 start, u64 size);
extern int sparse_add_one_section(struct pglist_data *pgdat,
unsigned long start_pfn, struct vmem_altmap *altmap);
-extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
+extern void sparse_remove_one_section(int nid, struct mem_section *ms,
unsigned long map_offset, struct vmem_altmap *altmap);
extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map,
unsigned long pnum);
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 7a832b844f24..2057af5c1c4f 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -121,6 +121,7 @@ static void devm_memremap_pages_release(void *data)
struct resource *res = &pgmap->res;
resource_size_t align_start, align_size;
unsigned long pfn;
+ struct page *page;
int nid;

for_each_device_pfn(pfn, pgmap)
@@ -138,6 +139,10 @@ static void devm_memremap_pages_release(void *data)
nid = dev_to_node(dev);

mem_hotplug_begin();
+ page = pfn_to_page(pfn);
+ if (pgmap->altmap_valid)
+ page += vmem_altmap_offset(&pgmap->altmap);
+ shrink_pages(page_zone(page), pfn, align_size >> PAGE_SHIFT);
arch_remove_memory(nid, align_start, align_size, pgmap->altmap_valid ?
&pgmap->altmap : NULL);
kasan_remove_zero_shadow(__va(align_start), align_size);
diff --git a/mm/hmm.c b/mm/hmm.c
index 21787e480b4a..b2f2dcadb5fb 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1012,7 +1012,7 @@ static void hmm_devmem_release(struct device *dev, void *data)

mem_hotplug_begin();
if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY)
- __remove_pages(zone, start_pfn, npages, NULL);
+ __remove_pages(nid, start_pfn, npages, NULL);
else
arch_remove_memory(nid, start_pfn << PAGE_SHIFT,
npages << PAGE_SHIFT, NULL);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 9bd629944c91..e33555651e46 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -320,12 +320,10 @@ static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
unsigned long start_pfn,
unsigned long end_pfn)
{
- struct mem_section *ms;
-
for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
- ms = __pfn_to_section(start_pfn);
+ struct mem_section *ms = __pfn_to_section(start_pfn);

- if (unlikely(!valid_section(ms)))
+ if (unlikely(!online_section(ms)))
continue;

if (unlikely(pfn_to_nid(start_pfn) != nid))
@@ -345,15 +343,14 @@ static unsigned long find_biggest_section_pfn(int nid, struct zone *zone,
unsigned long start_pfn,
unsigned long end_pfn)
{
- struct mem_section *ms;
unsigned long pfn;

/* pfn is the end pfn of a memory section. */
pfn = end_pfn - 1;
for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
- ms = __pfn_to_section(pfn);
+ struct mem_section *ms = __pfn_to_section(pfn);

- if (unlikely(!valid_section(ms)))
+ if (unlikely(!online_section(ms)))
continue;

if (unlikely(pfn_to_nid(pfn) != nid))
@@ -415,7 +412,7 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
ms = __pfn_to_section(pfn);

- if (unlikely(!valid_section(ms)))
+ if (unlikely(!online_section(ms)))
continue;

if (page_zone(pfn_to_page(pfn)) != zone)
@@ -483,7 +480,7 @@ static void shrink_pgdat_span(struct pglist_data *pgdat,
for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
ms = __pfn_to_section(pfn);

- if (unlikely(!valid_section(ms)))
+ if (unlikely(!online_section(ms)))
continue;

if (pfn_to_nid(pfn) != nid)
@@ -502,19 +499,32 @@ static void shrink_pgdat_span(struct pglist_data *pgdat,
pgdat->node_spanned_pages = 0;
}

-static void __remove_zone(struct zone *zone, unsigned long start_pfn)
+static void __shrink_pages(struct zone *zone, unsigned long start_pfn,
+ unsigned long end_pfn,
+ unsigned long offlined_pages)
{
struct pglist_data *pgdat = zone->zone_pgdat;
int nr_pages = PAGES_PER_SECTION;
unsigned long flags;
+ unsigned long pfn;

- pgdat_resize_lock(zone->zone_pgdat, &flags);
- shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
- shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
- pgdat_resize_unlock(zone->zone_pgdat, &flags);
+ clear_zone_contiguous(zone);
+ pgdat_resize_lock(pgdat, &flags);
+ for(pfn = start_pfn; pfn < end_pfn; pfn += nr_pages) {
+ shrink_zone_span(zone, pfn, pfn + nr_pages);
+ shrink_pgdat_span(pgdat, pfn, pfn + nr_pages);
+ }
+ pgdat->node_present_pages -= offlined_pages;
+ pgdat_resize_unlock(pgdat, &flags);
+ set_zone_contiguous(zone);
}

-static int __remove_section(struct zone *zone, struct mem_section *ms,
+void shrink_pages(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages)
+{
+ __shrink_pages(zone, start_pfn, start_pfn + nr_pages, nr_pages);
+}
+
+static int __remove_section(int nid, struct mem_section *ms,
unsigned long map_offset, struct vmem_altmap *altmap)
{
unsigned long start_pfn;
@@ -530,15 +540,14 @@ static int __remove_section(struct zone *zone, struct mem_section *ms,

scn_nr = __section_nr(ms);
start_pfn = section_nr_to_pfn((unsigned long)scn_nr);
- __remove_zone(zone, start_pfn);

- sparse_remove_one_section(zone, ms, map_offset, altmap);
+ sparse_remove_one_section(nid, ms, map_offset, altmap);
return 0;
}

/**
* __remove_pages() - remove sections of pages from a zone
- * @zone: zone from which pages need to be removed
+ * @nid: node which pages belong to
* @phys_start_pfn: starting pageframe (must be aligned to start of a section)
* @nr_pages: number of pages to remove (must be multiple of section size)
* @altmap: alternative device page map or %NULL if default memmap is used
@@ -548,7 +557,7 @@ static int __remove_section(struct zone *zone, struct mem_section *ms,
* sure that pages are marked reserved and zones are adjust properly by
* calling offline_pages().
*/
-int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
+int __remove_pages(int nid, unsigned long phys_start_pfn,
unsigned long nr_pages, struct vmem_altmap *altmap)
{
unsigned long i;
@@ -556,10 +565,9 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
int sections_to_remove, ret = 0;

/* In the ZONE_DEVICE case device driver owns the memory region */
- if (is_dev_zone(zone)) {
- if (altmap)
- map_offset = vmem_altmap_offset(altmap);
- } else {
+ if (altmap)
+ map_offset = vmem_altmap_offset(altmap);
+ else {
resource_size_t start, size;

start = phys_start_pfn << PAGE_SHIFT;
@@ -575,8 +583,6 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
}
}

- clear_zone_contiguous(zone);
-
/*
* We can only remove entire sections
*/
@@ -587,15 +593,13 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
for (i = 0; i < sections_to_remove; i++) {
unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;

- ret = __remove_section(zone, __pfn_to_section(pfn), map_offset,
+ ret = __remove_section(nid, __pfn_to_section(pfn), map_offset,
altmap);
map_offset = 0;
if (ret)
break;
}

- set_zone_contiguous(zone);
-
return ret;
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
@@ -1595,7 +1599,6 @@ static int __ref __offline_pages(unsigned long start_pfn,
unsigned long pfn, nr_pages;
long offlined_pages;
int ret, node;
- unsigned long flags;
unsigned long valid_start, valid_end;
struct zone *zone;
struct memory_notify arg;
@@ -1667,9 +1670,7 @@ static int __ref __offline_pages(unsigned long start_pfn,
adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
zone->present_pages -= offlined_pages;

- pgdat_resize_lock(zone->zone_pgdat, &flags);
- zone->zone_pgdat->node_present_pages -= offlined_pages;
- pgdat_resize_unlock(zone->zone_pgdat, &flags);
+ __shrink_pages(zone, valid_start, valid_end, offlined_pages);

init_per_zone_wmark_min();

diff --git a/mm/sparse.c b/mm/sparse.c
index 10b07eea9a6e..016020bd20b5 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -766,12 +766,12 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap,
free_map_bootmem(memmap);
}

-void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
+void sparse_remove_one_section(int nid, struct mem_section *ms,
unsigned long map_offset, struct vmem_altmap *altmap)
{
struct page *memmap = NULL;
unsigned long *usemap = NULL, flags;
- struct pglist_data *pgdat = zone->zone_pgdat;
+ struct pglist_data *pgdat = NODE_DATA(nid);

pgdat_resize_lock(pgdat, &flags);
if (ms->section_mem_map) {
--
2.13.6