[PATCH RFC v1] mm: is_mem_section_removable() overhaul

From: David Hildenbrand
Date: Fri Jan 17 2020 - 05:58:18 EST


Let's refactor that code. We want to check if we can offline memory
blocks. Add a new function is_mem_section_offlineable() for that and
make it call is_mem_section_offlineable() for each contained section.
Within is_mem_section_offlineable(), add some more sanity checks and
directly bail out if the section contains holes or if it spans multiple
zones.

The old code was inherently racy with concurrent offlining/memory
unplug. Let's avoid that and grab the device_hotplug_lock. Luckily
we are already holding it when calling from powerpc code.

Note1: If somebody wants to export this function for use in driver code, we
need a variant that takes the device_hotplug_lock()

Note2: If we could have a zombie device (not clear yet), the present
section checks would properly bail out early.

Note3: I'd prefer the mem_hotplug_lock in read, but as we are about to
change the locking on the removal path (IOW, don't hold it when removing
memory block devices), I do not want to go down that path.

Note4: For now we would have returned "removable" although we would
block offlining due to memory holes, multiple zones, or missing
sections.

Tested with DIMMs on x86-64. Compile-tested on Power.

Cc: Benjamin Herrenschmidt <benh@xxxxxxxxxxxxxxxxxxx>
Cc: Paul Mackerras <paulus@xxxxxxxxx>
Cc: Michael Ellerman <mpe@xxxxxxxxxxxxxx>
Cc: Greg Kroah-Hartman <gregkh@xxxxxxxxxxxxxxxxxxx>
Cc: "Rafael J. Wysocki" <rafael@xxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: Leonardo Bras <leonardo@xxxxxxxxxxxxx>
Cc: Nathan Lynch <nathanl@xxxxxxxxxxxxx>
Cc: Allison Randal <allison@xxxxxxxxxxx>
Cc: Nathan Fontenot <nfont@xxxxxxxxxxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Michal Hocko <mhocko@xxxxxxxx>
Cc: Dan Williams <dan.j.williams@xxxxxxxxx>
Cc: Stephen Rothwell <sfr@xxxxxxxxxxxxxxxx>
Cc: Anshuman Khandual <anshuman.khandual@xxxxxxx>
Cc: lantianyu1986@xxxxxxxxx
Cc: linuxppc-dev@xxxxxxxxxxxxxxxx
Signed-off-by: David Hildenbrand <david@xxxxxxxxxx>
---
.../platforms/pseries/hotplug-memory.c | 24 ++-----
drivers/base/memory.c | 37 ++++++----
include/linux/memory.h | 1 +
include/linux/memory_hotplug.h | 5 +-
mm/memory_hotplug.c | 68 +++++++++----------
5 files changed, 67 insertions(+), 68 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c
index c126b94d1943..8d80159465e4 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -337,34 +337,24 @@ static int pseries_remove_mem_node(struct device_node *np)

static bool lmb_is_removable(struct drmem_lmb *lmb)
{
- int i, scns_per_block;
- bool rc = true;
- unsigned long pfn, block_sz;
- u64 phys_addr;
+ struct memory_block *mem;
+ bool rc = false;

if (!(lmb->flags & DRCONF_MEM_ASSIGNED))
return false;

- block_sz = memory_block_size_bytes();
- scns_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE;
- phys_addr = lmb->base_addr;
-
#ifdef CONFIG_FA_DUMP
/*
* Don't hot-remove memory that falls in fadump boot memory area
* and memory that is reserved for capturing old kernel memory.
*/
- if (is_fadump_memory_area(phys_addr, block_sz))
+ if (is_fadump_memory_area(lmb->base_addr, memory_block_size_bytes()))
return false;
#endif
-
- for (i = 0; i < scns_per_block; i++) {
- pfn = PFN_DOWN(phys_addr);
- if (!pfn_present(pfn))
- continue;
-
- rc = rc && is_mem_section_removable(pfn, PAGES_PER_SECTION);
- phys_addr += MIN_MEMORY_BLOCK_SIZE;
+ mem = lmb_to_memblock(lmb);
+ if (mem) {
+ rc = is_memory_block_offlineable(mem);
+ put_device(&mem->dev);
}

return rc;
diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index c6d288fad493..f744250c34d0 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -104,6 +104,25 @@ static ssize_t phys_index_show(struct device *dev,
return sprintf(buf, "%08lx\n", phys_index);
}

+/*
+ * Test if a memory block is likely to be offlineable. Returns true if
+ * the block is already offline.
+ *
+ * Called under device_hotplug_lock.
+ */
+bool is_memory_block_offlineable(struct memory_block *mem)
+{
+ int i;
+
+ if (mem->state != MEM_ONLINE)
+ return true;
+
+ for (i = 0; i < sections_per_block; i++)
+ if (!is_mem_section_offlineable(mem->start_section_nr + i))
+ return false;
+ return true;
+}
+
/*
* Show whether the memory block is likely to be offlineable (or is already
* offline). Once offline, the memory block could be removed. The return
@@ -114,20 +133,14 @@ static ssize_t removable_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct memory_block *mem = to_memory_block(dev);
- unsigned long pfn;
- int ret = 1, i;
-
- if (mem->state != MEM_ONLINE)
- goto out;
+ int ret;

- for (i = 0; i < sections_per_block; i++) {
- if (!present_section_nr(mem->start_section_nr + i))
- continue;
- pfn = section_nr_to_pfn(mem->start_section_nr + i);
- ret &= is_mem_section_removable(pfn, PAGES_PER_SECTION);
- }
+ ret = lock_device_hotplug_sysfs();
+ if (ret)
+ return ret;
+ ret = is_memory_block_offlineable(mem);
+ unlock_device_hotplug();

-out:
return sprintf(buf, "%d\n", ret);
}

diff --git a/include/linux/memory.h b/include/linux/memory.h
index 0b8d791b6669..faf03eb64ecc 100644
--- a/include/linux/memory.h
+++ b/include/linux/memory.h
@@ -91,6 +91,7 @@ typedef int (*walk_memory_blocks_func_t)(struct memory_block *, void *);
extern int walk_memory_blocks(unsigned long start, unsigned long size,
void *arg, walk_memory_blocks_func_t func);
extern int for_each_memory_block(void *arg, walk_memory_blocks_func_t func);
+extern bool is_memory_block_offlineable(struct memory_block *mem);
#define CONFIG_MEM_BLOCK_SIZE (PAGES_PER_SECTION<<PAGE_SHIFT)
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index f4d59155f3d4..8d087772f748 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -306,15 +306,14 @@ static inline void pgdat_resize_init(struct pglist_data *pgdat) {}

#ifdef CONFIG_MEMORY_HOTREMOVE

-extern bool is_mem_section_removable(unsigned long pfn, unsigned long nr_pages);
+extern bool is_mem_section_offlineable(unsigned long nr);
extern void try_offline_node(int nid);
extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
extern int remove_memory(int nid, u64 start, u64 size);
extern void __remove_memory(int nid, u64 start, u64 size);

#else
-static inline bool is_mem_section_removable(unsigned long pfn,
- unsigned long nr_pages)
+static inline bool is_mem_section_offlineable(unsigned long nr)
{
return false;
}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 7a6de9b0dcab..a6d14d2b7f0c 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1128,46 +1128,51 @@ static unsigned long next_active_pageblock(unsigned long pfn)
return pfn + pageblock_nr_pages;
}

-static bool is_pageblock_removable_nolock(unsigned long pfn)
+static int count_system_ram_pages_cb(unsigned long start_pfn,
+ unsigned long nr_pages, void *data)
{
- struct page *page = pfn_to_page(pfn);
+ unsigned long *nr_system_ram_pages = data;
+
+ *nr_system_ram_pages += nr_pages;
+ return 0;
+}
+
+/*
+ * Check if a section is likely to be offlineable.
+ *
+ * Called with device_hotplug_lock.
+ */
+bool is_mem_section_offlineable(unsigned long nr)
+{
+ const unsigned long start_pfn = section_nr_to_pfn(nr);
+ const unsigned long end_pfn = start_pfn + PAGES_PER_SECTION;
+ unsigned long pfn, nr_pages = 0;
struct zone *zone;

- /*
- * We have to be careful here because we are iterating over memory
- * sections which are not zone aware so we might end up outside of
- * the zone but still within the section.
- * We have to take care about the node as well. If the node is offline
- * its NODE_DATA will be NULL - see page_zone.
- */
- if (!node_online(page_to_nid(page)))
+ if (!present_section_nr(nr))
return false;
-
- zone = page_zone(page);
- pfn = page_to_pfn(page);
- if (!zone_spans_pfn(zone, pfn))
+ if (!online_section_nr(nr))
return false;

- return !has_unmovable_pages(zone, page, MIGRATE_MOVABLE,
- MEMORY_OFFLINE);
-}
-
-/* Checks if this range of memory is likely to be hot-removable. */
-bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
-{
- unsigned long end_pfn, pfn;
+ /* we don't allow to offline sections with holes */
+ walk_system_ram_range(start_pfn, PAGES_PER_SECTION, &nr_pages,
+ count_system_ram_pages_cb);
+ if (nr_pages != PAGES_PER_SECTION)
+ return false;

- end_pfn = min(start_pfn + nr_pages,
- zone_end_pfn(page_zone(pfn_to_page(start_pfn))));
+ /* we don't allow to offline sections with mixed zones/nodes */
+ zone = test_pages_in_a_zone(start_pfn, end_pfn);
+ if (!zone)
+ return false;

- /* Check the starting page of each pageblock within the range */
+ /* check each pageblock if it contains unmovable pages */
for (pfn = start_pfn; pfn < end_pfn; pfn = next_active_pageblock(pfn)) {
- if (!is_pageblock_removable_nolock(pfn))
+ if (has_unmovable_pages(zone, pfn_to_page(pfn), MIGRATE_MOVABLE,
+ MEMORY_OFFLINE))
return false;
cond_resched();
}

- /* All pageblocks in the memory block are likely to be hot-removable */
return true;
}

@@ -1436,15 +1441,6 @@ static void node_states_clear_node(int node, struct memory_notify *arg)
node_clear_state(node, N_MEMORY);
}

-static int count_system_ram_pages_cb(unsigned long start_pfn,
- unsigned long nr_pages, void *data)
-{
- unsigned long *nr_system_ram_pages = data;
-
- *nr_system_ram_pages += nr_pages;
- return 0;
-}
-
static int __ref __offline_pages(unsigned long start_pfn,
unsigned long end_pfn)
{
--
2.24.1