[PATCH 1/1] mm: proof-of-concept for balancing node zones occupancy

From: Charan Teja Reddy
Date: Thu Feb 18 2021 - 14:03:19 EST


This is proof-of-concept code for balancing the occupancy of a node's
zones, which can become imbalanced as a result of memory hotplug.

Signed-off-by: Charan Teja Reddy <charante@xxxxxxxxxxxxxx>
---
include/linux/migrate.h | 8 +-
include/linux/mm.h | 3 +
include/linux/mmzone.h | 2 +
kernel/sysctl.c | 11 ++
mm/compaction.c | 4 +-
mm/memory_hotplug.c | 265 ++++++++++++++++++++++++++++++++++++++++++++++++
6 files changed, 290 insertions(+), 3 deletions(-)

diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 4594838..b7dc259 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -53,6 +53,8 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page);
extern int migrate_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page, int extra_count);
+extern void split_map_pages(struct list_head *list);
+extern unsigned long release_freepages(struct list_head *freelist);
#else

static inline void putback_movable_pages(struct list_head *l) {}
@@ -81,7 +83,11 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
{
return -ENOSYS;
}
-
+static inline void split_map_pages(struct list_head *list) { } /* no-op stub for !CONFIG_MIGRATION; NOTE(review): confirm mm/compaction.c (real definition) is built whenever CONFIG_MIGRATION is set */
+static inline unsigned long release_freepages(struct list_head *freelist) /* stub used when CONFIG_MIGRATION is off */
+{
+ return 0; /* nothing is released, so no pfn to report */
+}
#endif /* CONFIG_MIGRATION */

#ifdef CONFIG_COMPACTION
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ecdf8a8..1014139 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2465,6 +2465,9 @@ extern int watermark_boost_factor;
extern int watermark_scale_factor;
extern bool arch_has_descending_max_zone_pfns(void);

+/* memory_hotplug.c */
+extern int balance_node_occupancy_pages;
+
/* nommu.c */
extern atomic_long_t mmap_pages_allocated;
extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t);
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b593316..ce417c3 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -977,6 +977,8 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
void *, size_t *, loff_t *);
int numa_zonelist_order_handler(struct ctl_table *, int,
void *, size_t *, loff_t *);
+extern int sysctl_balance_node_occupancy_handler(struct ctl_table *tbl,
+ int write, void *buf, size_t *len, loff_t *pos);
extern int percpu_pagelist_fraction;
extern char numa_zonelist_order[];
#define NUMA_ZONELIST_ORDER_LEN 16
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c9fbdd8..4b95a90 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -3140,6 +3140,17 @@ static struct ctl_table vm_table[] = {
.extra2 = SYSCTL_ONE,
},
#endif
+#ifdef CONFIG_MEMORY_HOTPLUG
+ {
+ .procname = "balance_node_occupancy_pages",
+ .data = &balance_node_occupancy_pages,
+ .maxlen = sizeof(balance_node_occupancy_pages),
+ .mode = 0200,
+ .proc_handler = sysctl_balance_node_occupancy_handler,
+ .extra1 = SYSCTL_ZERO,
+ },
+
+#endif
{ }
};

diff --git a/mm/compaction.c b/mm/compaction.c
index 190ccda..da3c015 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -68,7 +68,7 @@ static const unsigned int HPAGE_FRAG_CHECK_INTERVAL_MSEC = 500;
#define COMPACTION_HPAGE_ORDER (PMD_SHIFT - PAGE_SHIFT)
#endif

-static unsigned long release_freepages(struct list_head *freelist)
+unsigned long release_freepages(struct list_head *freelist)
{
struct page *page, *next;
unsigned long high_pfn = 0;
@@ -84,7 +84,7 @@ static unsigned long release_freepages(struct list_head *freelist)
return high_pfn;
}

-static void split_map_pages(struct list_head *list)
+void split_map_pages(struct list_head *list)
{
unsigned int i, order, nr_pages;
struct page *page, *next;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f9d57b9..2780c91 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -97,6 +97,271 @@ void mem_hotplug_done(void)

u64 max_mem_size = U64_MAX;

+int balance_node_occupancy_pages; /* storage for the "balance_node_occupancy_pages" sysctl entry in vm_table */
+static atomic_t target_migrate_pages = ATOMIC_INIT(0); /* pages requested through the sysctl and not yet claimed by fill_movable_zone_fn() */
+
+struct movable_zone_fill_control { /* state of one zone-fill run; passed to the migrate_pages() callbacks as their private data */
+ struct list_head freepages; /* isolated free pages, handed out by movable_page_alloc() */
+ unsigned long start_pfn; /* pfn at which isolate_free_pages() resumes scanning */
+ unsigned long end_pfn; /* end of the free-page scan range (exclusive) */
+ unsigned long nr_migrate_pages; /* pages isolated by the latest get_anon_movable_pages() call */
+ unsigned long nr_free_pages; /* number of pages accounted on @freepages */
+ unsigned long limit; /* managed page count of @zone, sampled in prepare_fc() */
+ int target; /* pages still to be migrated in this run */
+ struct zone *zone; /* zone the free pages are taken from; set by prepare_fc() */
+};
+
+static void fill_movable_zone_fn(struct work_struct *work); /* forward declaration for DECLARE_WORK() below */
+static DECLARE_WORK(fill_movable_zone_work, fill_movable_zone_fn); /* queued by sysctl_balance_node_occupancy_handler() */
+static DEFINE_MUTEX(page_migrate_lock); /* taken with mutex_trylock() for the duration of fill_movable_zone_fn() */
+
+static inline void reset_page_order(struct page *page) /* clear the buddy marker and zero the page's private value */
+{
+ __ClearPageBuddy(page);
+ set_page_private(page, 0); /* isolate_free_pages() reads the order before calling in and stores it back afterwards */
+}
+
+/*
+ * Unlink @page, a free block of the given @order, from its zone's free list
+ * and drop that order's free-block count.
+ *
+ * Returns the number of base pages in the block (1 << @order).
+ */
+static int isolate_free_page(struct page *page, unsigned int order)
+{
+	struct free_area *area = &page_zone(page)->free_area[order];
+
+	list_del(&page->lru);
+	area->nr_free--;
+	reset_page_order(page);
+
+	return 1UL << order;
+}
+
+/*
+ * Pull free buddy blocks out of fc->zone, scanning from fc->start_pfn up to
+ * fc->end_pfn, and append them to fc->freepages.
+ *
+ * The scan stops once at least SWAP_CLUSTER_MAX pages are accounted in
+ * fc->nr_free_pages, or when the range is exhausted. fc->start_pfn is
+ * advanced so that the next call resumes after the last block taken.
+ * The collected blocks are passed to split_map_pages() after the zone lock
+ * has been dropped.
+ */
+static void isolate_free_pages(struct movable_zone_fill_control *fc)
+{
+	unsigned long pfn = fc->start_pfn;
+	unsigned long end_pfn = fc->end_pfn;
+	unsigned long flags;
+
+	spin_lock_irqsave(&fc->zone->lock, flags);
+	for (; pfn < end_pfn; pfn++) {
+		unsigned long isolated;
+		unsigned int order;
+		struct page *page;
+
+		if (!pfn_valid(pfn))
+			continue;
+
+		page = pfn_to_page(pfn);
+		if (PageCompound(page)) {
+			struct page *head = compound_head(page);
+			int skip;
+
+			/* Jump to the last page of the compound page. */
+			skip = (1 << compound_order(head)) - (page - head);
+			pfn += skip - 1;
+			continue;
+		}
+
+		if (!PageBuddy(page))
+			continue;
+
+		/*
+		 * Only fc->zone->lock is held and only fc->zone's counters
+		 * are adjusted below, so never touch a free block that
+		 * belongs to another zone whose span overlaps this range.
+		 */
+		if (page_zone(page) != fc->zone)
+			continue;
+
+		order = page_private(page);
+		isolated = isolate_free_page(page, order);
+		/*
+		 * isolate_free_page() zeroed the private value; store the
+		 * order back for the split_map_pages() call below, which is
+		 * expected to read it.
+		 */
+		set_page_private(page, order);
+		list_add_tail(&page->lru, &fc->freepages);
+		fc->nr_free_pages += isolated;
+		__mod_zone_page_state(fc->zone, NR_FREE_PAGES, -isolated);
+		pfn += isolated - 1;
+
+		/*
+		 * Make sure that the zone->lock is not held for long by
+		 * returning once we have SWAP_CLUSTER_MAX pages in the
+		 * free list for migration.
+		 */
+		if (fc->nr_free_pages >= SWAP_CLUSTER_MAX) {
+			pfn++;	/* resume right after the block just taken */
+			break;
+		}
+	}
+	fc->start_pfn = pfn;
+	spin_unlock_irqrestore(&fc->zone->lock, flags);
+
+	split_map_pages(&fc->freepages);
+}
+
+/*
+ * Allocation callback passed to migrate_pages(): hand out one page from the
+ * private free list carried in @data (a struct movable_zone_fill_control).
+ *
+ * The list is refilled through isolate_free_pages() when it runs dry.
+ * Returns NULL if it is still empty after the refill attempt. @page (the
+ * page being migrated) is not used.
+ */
+static struct page *movable_page_alloc(struct page *page, unsigned long data)
+{
+	struct movable_zone_fill_control *fc = (void *)data;
+	struct page *dst;
+
+	if (list_empty(&fc->freepages))
+		isolate_free_pages(fc);
+	if (list_empty(&fc->freepages))
+		return NULL;
+
+	dst = list_first_entry(&fc->freepages, struct page, lru);
+	list_del(&dst->lru);
+	fc->nr_free_pages--;
+
+	return dst;
+}
+
+/*
+ * Free callback passed to migrate_pages(): put an unused target @page back
+ * at the head of the private free list carried in @data and account for it.
+ */
+static void movable_page_free(struct page *page, unsigned long data)
+{
+	struct movable_zone_fill_control *fc = (void *)data;
+
+	fc->nr_free_pages++;
+	list_add(&page->lru, &fc->freepages);
+}
+
+/*
+ * Scan [@start_pfn, @end_pfn) for anonymous pages on the LRU and isolate
+ * them onto @list.
+ *
+ * Invalid pfns, compound pages, free buddy blocks and pages that are not
+ * anonymous LRU pages are skipped. The scan stops after
+ * min(fc->target, pageblock_nr_pages) candidates have been tried; a
+ * candidate counts even when isolate_lru_page() fails for it.
+ * fc->nr_migrate_pages is reset and then set to the number of pages that
+ * were actually placed on @list.
+ *
+ * Returns the pfn at which the scan stopped, to be used as @start_pfn of
+ * the next call. It may be >= @end_pfn.
+ */
+static unsigned long get_anon_movable_pages(
+			struct movable_zone_fill_control *fc,
+			unsigned long start_pfn,
+			unsigned long end_pfn, struct list_head *list)
+{
+	/* pfns are unsigned long; an int cursor would truncate them */
+	unsigned long pfn;
+	int found = 0;
+	int limit = min_t(int, fc->target, (int)pageblock_nr_pages);
+
+	fc->nr_migrate_pages = 0;
+	for (pfn = start_pfn; pfn < end_pfn && found < limit; ++pfn) {
+		struct page *page;
+
+		/* Validate the pfn before looking up its struct page. */
+		if (!pfn_valid(pfn))
+			continue;
+
+		page = pfn_to_page(pfn);
+		if (PageCompound(page)) {
+			struct page *head = compound_head(page);
+			int skip;
+
+			/* Jump to the last page of the compound page. */
+			skip = (1 << compound_order(head)) - (page - head);
+			pfn += skip - 1;
+			continue;
+		}
+
+		if (PageBuddy(page)) {
+			unsigned long freepage_order;
+
+			/*
+			 * No zone lock is held here, so the order can change
+			 * under us: read it once and use that same value for
+			 * both the sanity check and the skip.
+			 */
+			freepage_order = READ_ONCE(page_private(page));
+			if (freepage_order > 0 && freepage_order < MAX_ORDER)
+				pfn += (1UL << freepage_order) - 1;
+			continue;
+		}
+
+		if (!PageLRU(page) || !PageAnon(page))
+			continue;
+
+		/* Hold a reference while the page is taken off the LRU. */
+		if (!get_page_unless_zero(page))
+			continue;
+
+		found++;
+		if (!isolate_lru_page(page)) {
+			list_add_tail(&page->lru, list);
+			inc_node_page_state(page, NR_ISOLATED_ANON +
+					    page_is_file_lru(page));
+			++fc->nr_migrate_pages;
+		}
+
+		put_page(page);
+	}
+
+	return pfn;
+}
+
+/*
+ * Initialise @fc for a fill run targeting ZONE_MOVABLE of node 0: record
+ * the zone, its pfn span and its managed page count, and start with an
+ * empty free-page list.
+ *
+ * Other fields of @fc are left untouched.
+ */
+static void prepare_fc(struct movable_zone_fill_control *fc)
+{
+	struct zone *zone = &NODE_DATA(0)->node_zones[ZONE_MOVABLE];
+
+	fc->zone = zone;
+	fc->start_pfn = zone->zone_start_pfn;
+	fc->end_pfn = zone_end_pfn(zone);
+	/*
+	 * managed_pages is an atomic_long_t; go through its accessor rather
+	 * than atomic64_read(), which has the wrong type on 32-bit.
+	 */
+	fc->limit = zone_managed_pages(zone);
+	INIT_LIST_HEAD(&fc->freepages);
+}
+
+#define MIGRATE_TIMEOUT_SEC	(20)
+/*
+ * Work function: migrate anonymous pages out of node 0's ZONE_NORMAL into
+ * free pages isolated from node 0's ZONE_MOVABLE.
+ *
+ * The number of pages to move is claimed from target_migrate_pages. On every
+ * pass it is clamped so that ZONE_MOVABLE is not filled beyond its high
+ * watermark. The run ends when the request is satisfied and no new request
+ * has arrived, when ZONE_NORMAL has been scanned to its end, when
+ * migrate_pages() returns -ENOMEM, or after MIGRATE_TIMEOUT_SEC seconds.
+ * Free pages that were isolated but not consumed are released on exit.
+ */
+static void fill_movable_zone_fn(struct work_struct *work)
+{
+	unsigned long start_pfn, end_pfn;
+	unsigned long movable_highmark;
+	struct zone *normal_zone = &(NODE_DATA(0)->node_zones[ZONE_NORMAL]);
+	struct zone *movable_zone = &(NODE_DATA(0)->node_zones[ZONE_MOVABLE]);
+	LIST_HEAD(source);
+	long free, room;
+	int ret;
+	struct movable_zone_fill_control fc = { {0} };
+	unsigned long timeout = MIGRATE_TIMEOUT_SEC * HZ, expire;
+
+	start_pfn = normal_zone->zone_start_pfn;
+	end_pfn = zone_end_pfn(normal_zone);
+	movable_highmark = high_wmark_pages(movable_zone);
+
+	lru_add_drain_all();
+	drain_all_pages(normal_zone);
+	if (!mutex_trylock(&page_migrate_lock))
+		return;
+	prepare_fc(&fc);
+	if (!fc.limit)
+		goto out;
+	expire = jiffies + timeout;
+restart:
+	/* Claim everything requested so far. */
+	fc.target = atomic_xchg(&target_migrate_pages, 0);
+	if (!fc.target)
+		goto out;
+repeat:
+	cond_resched();
+	if (time_after(jiffies, expire))
+		goto out;
+
+	/*
+	 * Never take ZONE_MOVABLE below its high watermark. The headroom is
+	 * computed in signed arithmetic: comparing a negative int against
+	 * the unsigned watermark would promote it to a huge value and skip
+	 * the clamp exactly when the request exceeds the free pages.
+	 */
+	free = zone_page_state(movable_zone, NR_FREE_PAGES);
+	room = free - (long)movable_highmark;
+	if (fc.target > room)
+		fc.target = max(room, 0L);
+	if (fc.target <= 0)
+		goto out;
+
+	start_pfn = get_anon_movable_pages(&fc, start_pfn, end_pfn, &source);
+	if (list_empty(&source) && start_pfn < end_pfn)
+		goto repeat;
+
+	ret = migrate_pages(&source, movable_page_alloc, movable_page_free,
+			    (unsigned long) &fc,
+			    MIGRATE_ASYNC, MR_MEMORY_HOTPLUG);
+	/* Pages left on the list were not migrated; put them back. */
+	if (ret)
+		putback_movable_pages(&source);
+
+	fc.target -= fc.nr_migrate_pages;
+	if (ret == -ENOMEM || start_pfn >= end_pfn)
+		goto out;
+	else if (fc.target <= 0)
+		goto restart;
+
+	goto repeat;
+out:
+	mutex_unlock(&page_migrate_lock);
+	if (fc.nr_free_pages > 0)
+		release_freepages(&fc.freepages);
+}
+
+
+
+/*
+ * Handler for the "balance_node_occupancy_pages" sysctl.
+ *
+ * The value is parsed by proc_dointvec_minmax() into
+ * balance_node_occupancy_pages. On a successful write it is added to the
+ * pending request count and the fill worker is queued on system_unbound_wq.
+ *
+ * @buffer is a kernel pointer, matching the prototype in
+ * include/linux/mmzone.h and the argument type of proc_dointvec_minmax().
+ *
+ * Returns 0 on success or the error returned by proc_dointvec_minmax().
+ */
+int sysctl_balance_node_occupancy_handler(struct ctl_table *table, int write,
+		void *buffer, size_t *length, loff_t *ppos)
+{
+	int rc;
+
+	rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+	if (rc || !write)
+		return rc;
+
+	atomic_add(balance_node_occupancy_pages, &target_migrate_pages);
+	/* queue_work() does nothing if the work item is already pending. */
+	queue_work(system_unbound_wq, &fill_movable_zone_work);
+
+	return 0;
+}
+
/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size,
const char *resource_name)
--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a
member of the Code Aurora Forum, hosted by The Linux Foundation