Re: [PATCH v5 1/5] mm,memory_hotplug: Allocate memmap from the added memory range

From: David Hildenbrand
Date: Wed Mar 24 2021 - 10:53:50 EST


On 24.03.21 15:42, Michal Hocko wrote:
On Wed 24-03-21 13:03:29, Michal Hocko wrote:
On Wed 24-03-21 11:12:59, Oscar Salvador wrote:
[...]
I kind of understand the reluctance to use the vmemmap_pages terminology
here, but unfortunately we need to know about it.
We could rename nr_vmemmap_pages to offset_buddy_pages or something like that.

I am not convinced. It seems you are just trying to graft the new
functionality in. But I still believe that {on,off}lining shouldn't care
about where their vmemmaps come from at all. It should be the
responsibility of the code which reserves that space to compensate in
the accounting. Otherwise we will end up with hard-to-maintain code,
because the expectations would be spread across way too many places. Not
to mention the different PFNs the code would have to care about.

The below is a quick hack on top of this patch to illustrate my
thinking. I have pulled all the vmemmap pieces out of the {on,off}lining
paths and hooked the accounting in at the point where the space is
reserved. This only compiles and hasn't seen a deeper look, so there are
likely some minor problems, but I haven't encountered any major problems
or hacks that would have to be introduced into the code. The separation
seems to be possible, and the diffstat also looks promising. Am I
missing something fundamental here?


From a quick glance, this touches on two things discussed in the past:

1. If the underlying memory block is offline, all of its sections are offline. The zone shrinking code will then happily skip over the vmemmap pages, so you can end up with pages that are assigned to a zone but lie outside of that zone's span. This can happen in corner cases, and there is no way to tell that the memmap of these pages was initialized and still holds anything of value.

2. You heavily fragment the zone layout, even though you could otherwise end up with contiguous zones (e.g., when onlining all hotplugged memory to ZONE_MOVABLE); a sketch of the resulting layout follows below.
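To make point 2 more concrete, here is a minimal userspace sketch (not kernel code) of the layout I mean;
the 128 MiB block size and the 2 MiB of vmemmap per block are assumptions that depend on the architecture
and configuration. With the quick hack below, reserve_vmemmap_space() accounts the vmemmap pages of every
hotplugged block to ZONE_NORMAL, while the remainder of each block can still be onlined to ZONE_MOVABLE,
so the two zones interleave at every memory block boundary:

#include <stdio.h>

/*
 * Hypothetical numbers for illustration only: 128 MiB memory blocks and
 * 2 MiB of vmemmap per block (64 bytes of struct page per 4 KiB page).
 */
#define BLOCK_MIB	128UL
#define VMEMMAP_MIB	2UL

int main(void)
{
	unsigned long base_mib = 4096;	/* start of the hotplugged range */
	int block;

	for (block = 0; block < 4; block++) {
		unsigned long start = base_mib + block * BLOCK_MIB;

		/* vmemmap pages accounted to ZONE_NORMAL at add time ... */
		printf("[%6lu MiB - %6lu MiB]  ZONE_NORMAL  (vmemmap)\n",
		       start, start + VMEMMAP_MIB);
		/* ... while the rest of the block is onlined MOVABLE */
		printf("[%6lu MiB - %6lu MiB]  ZONE_MOVABLE\n",
		       start + VMEMMAP_MIB, start + BLOCK_MIB);
	}
	printf("-> ZONE_NORMAL and ZONE_MOVABLE interleave at every block\n");
	return 0;
}

With the original patch, the vmemmap pages are moved into the same zone as the rest of the block (merely as
MIGRATE_UNMOVABLE), so onlining everything movable keeps ZONE_MOVABLE contiguous.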

---
drivers/base/memory.c | 8 +--
include/linux/memory_hotplug.h | 6 +-
mm/memory_hotplug.c | 151 ++++++++++++++++++++---------------------
3 files changed, 80 insertions(+), 85 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 5ea2b3fbce02..9697acfe96eb 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -181,15 +181,15 @@ memory_block_action(unsigned long start_section_nr, unsigned long action,
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
int ret;
- start_pfn = section_nr_to_pfn(start_section_nr);
+ start_pfn = section_nr_to_pfn(start_section_nr) + nr_vmemmap_pages;
+ nr_pages -= nr_vmemmap_pages;
switch (action) {
case MEM_ONLINE:
- ret = online_pages(start_pfn, nr_pages, nr_vmemmap_pages,
- online_type, nid);
+ ret = online_pages(start_pfn, nr_pages, online_type, nid);
break;
case MEM_OFFLINE:
- ret = offline_pages(start_pfn, nr_pages, nr_vmemmap_pages);
+ ret = offline_pages(start_pfn, nr_pages);
break;
default:
WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index a85d4b7d15c2..673d2d4a8443 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -109,8 +109,7 @@ extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages);
extern int add_one_highpage(struct page *page, int pfn, int bad_ppro);
/* VM interface that may be used by firmware interface */
extern int online_pages(unsigned long pfn, unsigned long nr_pages,
- unsigned long nr_vmemmap_pages, int online_type,
- int nid);
+ int online_type, int nid);
extern struct zone *test_pages_in_a_zone(unsigned long start_pfn,
unsigned long end_pfn);
extern void __offline_isolated_pages(unsigned long start_pfn,
@@ -317,8 +316,7 @@ static inline void pgdat_resize_init(struct pglist_data *pgdat) {}
#ifdef CONFIG_MEMORY_HOTREMOVE
extern void try_offline_node(int nid);
-extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
- unsigned long nr_vmemmap_pages);
+extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
extern int remove_memory(int nid, u64 start, u64 size);
extern void __remove_memory(int nid, u64 start, u64 size);
extern int offline_and_remove_memory(int nid, u64 start, u64 size);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0c3a98cb8cde..754026a9164d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -844,30 +844,19 @@ struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
}
int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
- unsigned long nr_vmemmap_pages, int online_type, int nid)
+ int online_type, int nid)
{
- unsigned long flags, buddy_start_pfn, buddy_nr_pages;
+ unsigned long flags;
struct zone *zone;
int need_zonelists_rebuild = 0;
int ret;
struct memory_notify arg;
- /* We can only online full sections (e.g., SECTION_IS_ONLINE) */
- if (WARN_ON_ONCE(!nr_pages ||
- !IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION)))
- return -EINVAL;
-
- buddy_start_pfn = pfn + nr_vmemmap_pages;
- buddy_nr_pages = nr_pages - nr_vmemmap_pages;
-
mem_hotplug_begin();
/* associate pfn range with the zone */
zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
- if (nr_vmemmap_pages)
- move_pfn_range_to_zone(zone, pfn, nr_vmemmap_pages, NULL,
- MIGRATE_UNMOVABLE);
- move_pfn_range_to_zone(zone, buddy_start_pfn, buddy_nr_pages, NULL,
+ move_pfn_range_to_zone(zone, pfn, nr_pages, NULL,
MIGRATE_ISOLATE);
arg.start_pfn = pfn;
@@ -884,7 +873,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
* onlining, such that undo_isolate_page_range() works correctly.
*/
spin_lock_irqsave(&zone->lock, flags);
- zone->nr_isolate_pageblock += buddy_nr_pages / pageblock_nr_pages;
+ zone->nr_isolate_pageblock += nr_pages / pageblock_nr_pages;
spin_unlock_irqrestore(&zone->lock, flags);
/*
@@ -897,7 +886,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
setup_zone_pageset(zone);
}
- online_pages_range(pfn, nr_pages, buddy_start_pfn);
+ online_pages_range(pfn, nr_pages, pfn);
zone->present_pages += nr_pages;
pgdat_resize_lock(zone->zone_pgdat, &flags);
@@ -910,9 +899,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
zone_pcp_update(zone);
/* Basic onlining is complete, allow allocation of onlined pages. */
- undo_isolate_page_range(buddy_start_pfn,
- buddy_start_pfn + buddy_nr_pages,
- MIGRATE_MOVABLE);
+ undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);
/*
* Freshly onlined pages aren't shuffled (e.g., all pages are placed to
@@ -1126,6 +1113,59 @@ bool mhp_supports_memmap_on_memory(unsigned long size)
IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
}
+static void reserve_vmemmap_space(int nid, unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap)
+{
+ struct zone *zone = &NODE_DATA(nid)->node_zones[ZONE_NORMAL];
+
+ altmap->free = nr_pages;
+ altmap->base_pfn = pfn;
+
+ /* initialize struct pages and account for this space */
+ move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
+}
+
+static void unaccount_vmemmap_space(int nid, unsigned long start_pfn, unsigned long nr_pages)
+{
+ struct zone *zone = &NODE_DATA(nid)->node_zones[ZONE_NORMAL];
+ unsigned long flags;
+
+ adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
+ zone->present_pages -= nr_pages;
+
+ pgdat_resize_lock(zone->zone_pgdat, &flags);
+ zone->zone_pgdat->node_present_pages -= nr_pages;
+ pgdat_resize_unlock(zone->zone_pgdat, &flags);
+
+ remove_pfn_range_from_zone(zone, start_pfn, nr_pages);
+}
+
+static int remove_memory_block_cb(struct memory_block *mem, void *arg)
+{
+ unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
+ struct vmem_altmap mhp_altmap = {};
+ struct vmem_altmap *altmap = NULL;
+ u64 start = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
+ u64 size = memory_block_size_bytes();
+
+ if (!mem->nr_vmemmap_pages) {
+ arch_remove_memory(mem->nid, start, size, NULL);
+ return 0;
+ }
+
+ /*
+ * Let remove_pmd_table->free_hugepage_table
+ * do the right thing if we used vmem_altmap
+ * when hot-adding the range.
+ */
+ mhp_altmap.alloc = nr_vmemmap_pages;
+ altmap = &mhp_altmap;
+
+ unaccount_vmemmap_space(mem->nid, PHYS_PFN(start), nr_vmemmap_pages);
+ arch_remove_memory(mem->nid, start, size, altmap);
+
+ return 0;
+}
+
/*
* NOTE: The caller must call lock_device_hotplug() to serialize hotplug
* and online/offline operations (triggered e.g. by sysfs).
@@ -1170,8 +1210,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
ret = -EINVAL;
goto error;
}
- mhp_altmap.free = PHYS_PFN(size);
- mhp_altmap.base_pfn = PHYS_PFN(start);
+ reserve_vmemmap_space(nid, PHYS_PFN(start), PHYS_PFN(size), &mhp_altmap);
params.altmap = &mhp_altmap;
}
@@ -1639,25 +1678,16 @@ static int count_system_ram_pages_cb(unsigned long start_pfn,
return 0;
}
-int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
- unsigned long nr_vmemmap_pages)
+int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
const unsigned long end_pfn = start_pfn + nr_pages;
- unsigned long pfn, buddy_start_pfn, buddy_nr_pages, system_ram_pages = 0;
+ unsigned long pfn, system_ram_pages = 0;
unsigned long flags;
struct zone *zone;
struct memory_notify arg;
int ret, node;
char *reason;
- /* We can only offline full sections (e.g., SECTION_IS_ONLINE) */
- if (WARN_ON_ONCE(!nr_pages ||
- !IS_ALIGNED(start_pfn | nr_pages, PAGES_PER_SECTION)))
- return -EINVAL;
-
- buddy_start_pfn = start_pfn + nr_vmemmap_pages;
- buddy_nr_pages = nr_pages - nr_vmemmap_pages;
-
mem_hotplug_begin();
/*
@@ -1693,7 +1723,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
zone_pcp_disable(zone);
/* set above range as isolated */
- ret = start_isolate_page_range(buddy_start_pfn, end_pfn,
+ ret = start_isolate_page_range(start_pfn, end_pfn,
MIGRATE_MOVABLE,
MEMORY_OFFLINE | REPORT_FAILURE);
if (ret) {
@@ -1713,7 +1743,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
}
do {
- pfn = buddy_start_pfn;
+ pfn = start_pfn;
do {
if (signal_pending(current)) {
ret = -EINTR;
@@ -1744,18 +1774,18 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
* offlining actually in order to make hugetlbfs's object
* counting consistent.
*/
- ret = dissolve_free_huge_pages(buddy_start_pfn, end_pfn);
+ ret = dissolve_free_huge_pages(start_pfn, end_pfn);
if (ret) {
reason = "failure to dissolve huge pages";
goto failed_removal_isolated;
}
- ret = test_pages_isolated(buddy_start_pfn, end_pfn, MEMORY_OFFLINE);
+ ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE);
} while (ret);
/* Mark all sections offline and remove free pages from the buddy. */
- __offline_isolated_pages(start_pfn, end_pfn, buddy_start_pfn);
+ __offline_isolated_pages(start_pfn, end_pfn, start_pfn);
pr_debug("Offlined Pages %ld\n", nr_pages);
/*
@@ -1764,13 +1794,13 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
* of isolated pageblocks, memory onlining will properly revert this.
*/
spin_lock_irqsave(&zone->lock, flags);
- zone->nr_isolate_pageblock -= buddy_nr_pages / pageblock_nr_pages;
+ zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
spin_unlock_irqrestore(&zone->lock, flags);
zone_pcp_enable(zone);
/* removal success */
- adjust_managed_page_count(pfn_to_page(start_pfn), -buddy_nr_pages);
+ adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
zone->present_pages -= nr_pages;
pgdat_resize_lock(zone->zone_pgdat, &flags);
@@ -1799,7 +1829,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
return 0;
failed_removal_isolated:
- undo_isolate_page_range(buddy_start_pfn, end_pfn, MIGRATE_MOVABLE);
+ undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
memory_notify(MEM_CANCEL_OFFLINE, &arg);
failed_removal_pcplists_disabled:
zone_pcp_enable(zone);
@@ -1830,14 +1860,6 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
return 0;
}
-static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg)
-{
- /*
- * If not set, continue with the next block.
- */
- return mem->nr_vmemmap_pages;
-}
-
static int check_cpu_on_node(pg_data_t *pgdat)
{
int cpu;
@@ -1912,9 +1934,6 @@ EXPORT_SYMBOL(try_offline_node);
static int __ref try_remove_memory(int nid, u64 start, u64 size)
{
int rc = 0;
- struct vmem_altmap mhp_altmap = {};
- struct vmem_altmap *altmap = NULL;
- unsigned long nr_vmemmap_pages = 0;
BUG_ON(check_hotplug_memory_range(start, size));
@@ -1927,31 +1946,6 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
if (rc)
return rc;
- /*
- * We only support removing memory added with MHP_MEMMAP_ON_MEMORY in
- * the same granularity it was added - a single memory block.
- */
- if (memmap_on_memory) {
- nr_vmemmap_pages = walk_memory_blocks(start, size, NULL,
- get_nr_vmemmap_pages_cb);
- if (nr_vmemmap_pages) {
- if (size != memory_block_size_bytes()) {
- pr_warn("Refuse to remove %#llx - %#llx,"
- "wrong granularity\n",
- start, start + size);
- return -EINVAL;
- }
-
- /*
- * Let remove_pmd_table->free_hugepage_table
- * do the right thing if we used vmem_altmap
- * when hot-adding the range.
- */
- mhp_altmap.alloc = nr_vmemmap_pages;
- altmap = &mhp_altmap;
- }
- }
-
/* remove memmap entry */
firmware_map_remove(start, start + size, "System RAM");
@@ -1963,7 +1957,10 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
mem_hotplug_begin();
- arch_remove_memory(nid, start, size, altmap);
+ if (!memmap_on_memory)
+ arch_remove_memory(nid, start, size, NULL);
+ else
+ walk_memory_blocks(start, size, NULL, remove_memory_block_cb);
if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
memblock_free(start, size);



--
Thanks,

David / dhildenb