Re: [PATCH v5 1/5] mm,memory_hotplug: Allocate memmap from the added memory range

From: Michal Hocko
Date: Wed Mar 24 2021 - 10:43:04 EST


On Wed 24-03-21 13:03:29, Michal Hocko wrote:
> On Wed 24-03-21 11:12:59, Oscar Salvador wrote:
[...]
> > I kind of understand to be reluctant to use vmemmap_pages terminology here, but
> > unfortunately we need to know about it.
> > We could rename nr_vmemmap_pages to offset_buddy_pages or something like that.
>
> I am not convinced. It seems you are justr trying to graft the new
> functionality in. But I still believe that {on,off}lining shouldn't care
> about where their vmemmaps come from at all. It should be a
> responsibility of the code which reserves that space to compansate for
> accounting. Otherwise we will end up with a hard to maintain code
> because expectations would be spread at way too many places. Not to
> mention different pfns that the code should care about.

The below is a quick hack on top of this patch to illustrate my
thinking. I have dug out all the vmemmap pieces out of the
{on,off}lining and hooked all the accounting when the space is reserved.
This just compiles without any deeper look so there are likely some
minor problems but I haven't really encountered any major problems or
hacks to introduce into the code. The separation seems to be possible.
The diffstat also looks promising. Am I missing something fundamental in
this?

---
drivers/base/memory.c | 8 +--
include/linux/memory_hotplug.h | 6 +-
mm/memory_hotplug.c | 151 ++++++++++++++++++++---------------------
3 files changed, 80 insertions(+), 85 deletions(-)

diff --git a/drivers/base/memory.c b/drivers/base/memory.c
index 5ea2b3fbce02..9697acfe96eb 100644
--- a/drivers/base/memory.c
+++ b/drivers/base/memory.c
@@ -181,15 +181,15 @@ memory_block_action(unsigned long start_section_nr, unsigned long action,
unsigned long nr_pages = PAGES_PER_SECTION * sections_per_block;
int ret;

- start_pfn = section_nr_to_pfn(start_section_nr);
+ start_pfn = section_nr_to_pfn(start_section_nr) + nr_vmemmap_pages;
+ nr_pages -= nr_vmemmap_pages;

switch (action) {
case MEM_ONLINE:
- ret = online_pages(start_pfn, nr_pages, nr_vmemmap_pages,
- online_type, nid);
+ ret = online_pages(start_pfn, nr_pages, online_type, nid);
break;
case MEM_OFFLINE:
- ret = offline_pages(start_pfn, nr_pages, nr_vmemmap_pages);
+ ret = offline_pages(start_pfn, nr_pages);
break;
default:
WARN(1, KERN_WARNING "%s(%ld, %ld) unknown action: "
diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index a85d4b7d15c2..673d2d4a8443 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -109,8 +109,7 @@ extern int zone_grow_waitqueues(struct zone *zone, unsigned long nr_pages);
extern int add_one_highpage(struct page *page, int pfn, int bad_ppro);
/* VM interface that may be used by firmware interface */
extern int online_pages(unsigned long pfn, unsigned long nr_pages,
- unsigned long nr_vmemmap_pages, int online_type,
- int nid);
+ int online_type, int nid);
extern struct zone *test_pages_in_a_zone(unsigned long start_pfn,
unsigned long end_pfn);
extern void __offline_isolated_pages(unsigned long start_pfn,
@@ -317,8 +316,7 @@ static inline void pgdat_resize_init(struct pglist_data *pgdat) {}
#ifdef CONFIG_MEMORY_HOTREMOVE

extern void try_offline_node(int nid);
-extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
- unsigned long nr_vmemmap_pages);
+extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages);
extern int remove_memory(int nid, u64 start, u64 size);
extern void __remove_memory(int nid, u64 start, u64 size);
extern int offline_and_remove_memory(int nid, u64 start, u64 size);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0c3a98cb8cde..754026a9164d 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -844,30 +844,19 @@ struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
}

int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
- unsigned long nr_vmemmap_pages, int online_type, int nid)
+ int online_type, int nid)
{
- unsigned long flags, buddy_start_pfn, buddy_nr_pages;
+ unsigned long flags;
struct zone *zone;
int need_zonelists_rebuild = 0;
int ret;
struct memory_notify arg;

- /* We can only online full sections (e.g., SECTION_IS_ONLINE) */
- if (WARN_ON_ONCE(!nr_pages ||
- !IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION)))
- return -EINVAL;
-
- buddy_start_pfn = pfn + nr_vmemmap_pages;
- buddy_nr_pages = nr_pages - nr_vmemmap_pages;
-
mem_hotplug_begin();

/* associate pfn range with the zone */
zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
- if (nr_vmemmap_pages)
- move_pfn_range_to_zone(zone, pfn, nr_vmemmap_pages, NULL,
- MIGRATE_UNMOVABLE);
- move_pfn_range_to_zone(zone, buddy_start_pfn, buddy_nr_pages, NULL,
+ move_pfn_range_to_zone(zone, pfn, nr_pages, NULL,
MIGRATE_ISOLATE);

arg.start_pfn = pfn;
@@ -884,7 +873,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
* onlining, such that undo_isolate_page_range() works correctly.
*/
spin_lock_irqsave(&zone->lock, flags);
- zone->nr_isolate_pageblock += buddy_nr_pages / pageblock_nr_pages;
+ zone->nr_isolate_pageblock += nr_pages / pageblock_nr_pages;
spin_unlock_irqrestore(&zone->lock, flags);

/*
@@ -897,7 +886,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
setup_zone_pageset(zone);
}

- online_pages_range(pfn, nr_pages, buddy_start_pfn);
+ online_pages_range(pfn, nr_pages, pfn);
zone->present_pages += nr_pages;

pgdat_resize_lock(zone->zone_pgdat, &flags);
@@ -910,9 +899,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
zone_pcp_update(zone);

/* Basic onlining is complete, allow allocation of onlined pages. */
- undo_isolate_page_range(buddy_start_pfn,
- buddy_start_pfn + buddy_nr_pages,
- MIGRATE_MOVABLE);
+ undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);

/*
* Freshly onlined pages aren't shuffled (e.g., all pages are placed to
@@ -1126,6 +1113,59 @@ bool mhp_supports_memmap_on_memory(unsigned long size)
IS_ALIGNED(remaining_size, (pageblock_nr_pages << PAGE_SHIFT));
}

+static void reserve_vmmemmap_space(int nid, unsigned long pfn, unsigned long nr_pages, struct vmem_altmap *altmap)
+{
+ struct zone *zone = &NODE_DATA(nid)->node_zones[ZONE_NORMAL];
+
+ altmap->free = nr_pages;
+ altmap->base_pfn = pfn;
+
+ /* initialize struct pages and account for this space */
+ move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE);
+}
+
+static void unaccount_vmemmap_space(int nid, unsigned long start_pfn, unsigned long nr_pages)
+{
+ struct zone *zone = &NODE_DATA(nid)->node_zones[ZONE_NORMAL];
+ unsigned long flags;
+
+ adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
+ zone->present_pages -= nr_pages;
+
+ pgdat_resize_lock(zone->zone_pgdat, &flags);
+ zone->zone_pgdat->node_present_pages -= nr_pages;
+ pgdat_resize_unlock(zone->zone_pgdat, &flags);
+
+ remove_pfn_range_from_zone(zone, start_pfn, nr_pages);
+}
+
+static int remove_memory_block_cb(struct memory_block *mem, void *arg)
+{
+ unsigned long nr_vmemmap_pages = mem->nr_vmemmap_pages;
+ struct vmem_altmap mhp_altmap = {};
+ struct vmem_altmap *altmap = NULL;
+ u64 start = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
+ u64 size = memory_block_size_bytes();
+
+ if (!mem->nr_vmemmap_pages) {
+ arch_remove_memory(mem->nid, start, size, NULL);
+ return 0;
+ }
+
+ /*
+ * Let remove_pmd_table->free_hugepage_table
+ * do the right thing if we used vmem_altmap
+ * when hot-adding the range.
+ */
+ mhp_altmap.alloc = nr_vmemmap_pages;
+ altmap = &mhp_altmap;
+
+ unaccount_vmemmap_space(mem->nid, PHYS_PFN(start), nr_vmemmap_pages);
+ arch_remove_memory(mem->nid, start, size, altmap);
+
+ return 0;
+}
+
/*
* NOTE: The caller must call lock_device_hotplug() to serialize hotplug
* and online/offline operations (triggered e.g. by sysfs).
@@ -1170,8 +1210,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
ret = -EINVAL;
goto error;
}
- mhp_altmap.free = PHYS_PFN(size);
- mhp_altmap.base_pfn = PHYS_PFN(start);
+ reserve_vmmemmap_space(nid, PHYS_PFN(start), PHYS_PFN(size), &mhp_altmap);
params.altmap = &mhp_altmap;
}

@@ -1639,25 +1678,16 @@ static int count_system_ram_pages_cb(unsigned long start_pfn,
return 0;
}

-int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
- unsigned long nr_vmemmap_pages)
+int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
const unsigned long end_pfn = start_pfn + nr_pages;
- unsigned long pfn, buddy_start_pfn, buddy_nr_pages, system_ram_pages = 0;
+ unsigned long pfn, system_ram_pages = 0;
unsigned long flags;
struct zone *zone;
struct memory_notify arg;
int ret, node;
char *reason;

- /* We can only offline full sections (e.g., SECTION_IS_ONLINE) */
- if (WARN_ON_ONCE(!nr_pages ||
- !IS_ALIGNED(start_pfn | nr_pages, PAGES_PER_SECTION)))
- return -EINVAL;
-
- buddy_start_pfn = start_pfn + nr_vmemmap_pages;
- buddy_nr_pages = nr_pages - nr_vmemmap_pages;
-
mem_hotplug_begin();

/*
@@ -1693,7 +1723,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
zone_pcp_disable(zone);

/* set above range as isolated */
- ret = start_isolate_page_range(buddy_start_pfn, end_pfn,
+ ret = start_isolate_page_range(start_pfn, end_pfn,
MIGRATE_MOVABLE,
MEMORY_OFFLINE | REPORT_FAILURE);
if (ret) {
@@ -1713,7 +1743,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
}

do {
- pfn = buddy_start_pfn;
+ pfn = start_pfn;
do {
if (signal_pending(current)) {
ret = -EINTR;
@@ -1744,18 +1774,18 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
* offlining actually in order to make hugetlbfs's object
* counting consistent.
*/
- ret = dissolve_free_huge_pages(buddy_start_pfn, end_pfn);
+ ret = dissolve_free_huge_pages(start_pfn, end_pfn);
if (ret) {
reason = "failure to dissolve huge pages";
goto failed_removal_isolated;
}

- ret = test_pages_isolated(buddy_start_pfn, end_pfn, MEMORY_OFFLINE);
+ ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE);

} while (ret);

/* Mark all sections offline and remove free pages from the buddy. */
- __offline_isolated_pages(start_pfn, end_pfn, buddy_start_pfn);
+ __offline_isolated_pages(start_pfn, end_pfn, start_pfn);
pr_debug("Offlined Pages %ld\n", nr_pages);

/*
@@ -1764,13 +1794,13 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
* of isolated pageblocks, memory onlining will properly revert this.
*/
spin_lock_irqsave(&zone->lock, flags);
- zone->nr_isolate_pageblock -= buddy_nr_pages / pageblock_nr_pages;
+ zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
spin_unlock_irqrestore(&zone->lock, flags);

zone_pcp_enable(zone);

/* removal success */
- adjust_managed_page_count(pfn_to_page(start_pfn), -buddy_nr_pages);
+ adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
zone->present_pages -= nr_pages;

pgdat_resize_lock(zone->zone_pgdat, &flags);
@@ -1799,7 +1829,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
return 0;

failed_removal_isolated:
- undo_isolate_page_range(buddy_start_pfn, end_pfn, MIGRATE_MOVABLE);
+ undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
memory_notify(MEM_CANCEL_OFFLINE, &arg);
failed_removal_pcplists_disabled:
zone_pcp_enable(zone);
@@ -1830,14 +1860,6 @@ static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
return 0;
}

-static int get_nr_vmemmap_pages_cb(struct memory_block *mem, void *arg)
-{
- /*
- * If not set, continue with the next block.
- */
- return mem->nr_vmemmap_pages;
-}
-
static int check_cpu_on_node(pg_data_t *pgdat)
{
int cpu;
@@ -1912,9 +1934,6 @@ EXPORT_SYMBOL(try_offline_node);
static int __ref try_remove_memory(int nid, u64 start, u64 size)
{
int rc = 0;
- struct vmem_altmap mhp_altmap = {};
- struct vmem_altmap *altmap = NULL;
- unsigned long nr_vmemmap_pages = 0;

BUG_ON(check_hotplug_memory_range(start, size));

@@ -1927,31 +1946,6 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
if (rc)
return rc;

- /*
- * We only support removing memory added with MHP_MEMMAP_ON_MEMORY in
- * the same granularity it was added - a single memory block.
- */
- if (memmap_on_memory) {
- nr_vmemmap_pages = walk_memory_blocks(start, size, NULL,
- get_nr_vmemmap_pages_cb);
- if (nr_vmemmap_pages) {
- if (size != memory_block_size_bytes()) {
- pr_warn("Refuse to remove %#llx - %#llx,"
- "wrong granularity\n",
- start, start + size);
- return -EINVAL;
- }
-
- /*
- * Let remove_pmd_table->free_hugepage_table
- * do the right thing if we used vmem_altmap
- * when hot-adding the range.
- */
- mhp_altmap.alloc = nr_vmemmap_pages;
- altmap = &mhp_altmap;
- }
- }
-
/* remove memmap entry */
firmware_map_remove(start, start + size, "System RAM");

@@ -1963,7 +1957,10 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)

mem_hotplug_begin();

- arch_remove_memory(nid, start, size, altmap);
+ if (!memmap_on_memory)
+ arch_remove_memory(nid, start, size, NULL);
+ else
+ walk_memory_blocks(start, size, NULL, remove_memory_block_cb);

if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
memblock_free(start, size);
--
Michal Hocko
SUSE Labs