Re: [RFC PATCH v4 8/9] mm: sched: Move hot page promotion from NUMAB=2 to pghot tracking
From: Alok Rathore
Date: Mon Dec 22 2025 - 05:40:10 EST
On 06/12/25 03:44PM, Bharata B Rao wrote:
Currently hot page promotion (NUMA_BALANCING_MEMORY_TIERING
mode of NUMA Balancing) does hot page detection (via hint faults),
hot page classification and eventual promotion, all by itself and
sits within the scheduler.
With the new hot page tracking and promotion mechanism being
available, NUMA Balancing can limit itself to detection of
hot pages (via hint faults) and off-load rest of the
functionality to the common hot page tracking system.
pghot_record_access(PGHOT_HINT_FAULT) API is used to feed the
hot page info. In addition, the migration rate limiting and
dynamic threshold logic are moved to kmigrated so that the same
can be used for hot pages reported by other sources too.
Signed-off-by: Bharata B Rao <bharata@xxxxxxx>
<snip>
--- a/mm/pghot.c
+++ b/mm/pghot.c
@@ -12,6 +12,9 @@
* the hot pages. kmigrated runs for each lower tier node. It iterates
* over the node's PFNs and migrates pages marked for migration into
* their targeted nodes.
+ *
+ * Migration rate-limiting and dynamic threshold logic implementations
+ * were moved from NUMA Balancing mode 2.
*/
#include <linux/mm.h>
#include <linux/migrate.h>
@@ -25,6 +28,8 @@ static unsigned int pghot_freq_threshold = PGHOT_DEFAULT_FREQ_THRESHOLD;
static unsigned int kmigrated_sleep_ms = KMIGRATED_DEFAULT_SLEEP_MS;
static unsigned int kmigrated_batch_nr = KMIGRATED_DEFAULT_BATCH_NR;
+/* Restrict the NUMA promotion throughput (MB/s) for each target node. */
+static unsigned int sysctl_pghot_promote_rate_limit = 65536;
static unsigned int sysctl_pghot_freq_window = PGHOT_DEFAULT_FREQ_WINDOW;
static DEFINE_STATIC_KEY_FALSE(pghot_src_hwhints);
@@ -43,6 +48,14 @@ static const struct ctl_table pghot_sysctls[] = {
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
},
+ {
+ .procname = "pghot_promote_rate_limit_MBps",
+ .data = &sysctl_pghot_promote_rate_limit,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = SYSCTL_ZERO,
+ },
};
#endif
@@ -137,8 +150,13 @@ int pghot_record_access(unsigned long pfn, int nid, int src, unsigned long now)
old_freq = (hotness >> PGHOT_FREQ_SHIFT) & PGHOT_FREQ_MASK;
old_time = (hotness >> PGHOT_TIME_SHIFT) & PGHOT_TIME_MASK;
- if (((time - old_time) > msecs_to_jiffies(sysctl_pghot_freq_window))
- || (nid != NUMA_NO_NODE && old_nid != nid))
+ /*
+ * Bypass the new window logic for NUMA hint fault source
+ * as it is too slow in reporting accesses.
+ * TODO: Fix this.
+ */
+ if ((((time - old_time) > msecs_to_jiffies(sysctl_pghot_freq_window))
+ && (src != PGHOT_HINT_FAULT)) || (nid != NUMA_NO_NODE && old_nid != nid))
new_window = true;
if (new_window)
@@ -166,6 +184,110 @@ int pghot_record_access(unsigned long pfn, int nid, int src, unsigned long now)
return 0;
}
+/*
+ * For memory tiering mode, if there are enough free pages (more than
+ * enough watermark defined here) in fast memory node, to take full
+ * advantage of fast memory capacity, all recently accessed slow
+ * memory pages will be migrated to fast memory node without
+ * considering hot threshold.
+ */
+static bool pgdat_free_space_enough(struct pglist_data *pgdat)
+{
+ int z;
+ unsigned long enough_wmark;
+
+ enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT,
+ pgdat->node_present_pages >> 4);
+ for (z = pgdat->nr_zones - 1; z >= 0; z--) {
+ struct zone *zone = pgdat->node_zones + z;
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (zone_watermark_ok(zone, 0,
+ promo_wmark_pages(zone) + enough_wmark,
+ ZONE_MOVABLE, 0))
+ return true;
+ }
+ return false;
+}
+
+/*
+ * For memory tiering mode, too high promotion/demotion throughput may
+ * hurt application latency. So we provide a mechanism to rate limit
+ * the number of pages that are tried to be promoted.
+ */
+static bool kmigrated_promotion_rate_limit(struct pglist_data *pgdat, unsigned long rate_limit,
+ int nr, unsigned long now_ms)
+{
+ unsigned long nr_cand;
+ unsigned int start;
+
+ mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr);
+ nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+ start = pgdat->nbp_rl_start;
+ if (now_ms - start > MSEC_PER_SEC &&
+ cmpxchg(&pgdat->nbp_rl_start, start, now_ms) == start)
+ pgdat->nbp_rl_nr_cand = nr_cand;
+ if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit)
+ return true;
+ return false;
+}
+
+static void kmigrated_promotion_adjust_threshold(struct pglist_data *pgdat,
+ unsigned long rate_limit, unsigned int ref_th,
+ unsigned long now_ms)
+{
+ unsigned int start, th_period, unit_th, th;
+ unsigned long nr_cand, ref_cand, diff_cand;
+
+ th_period = KMIGRATED_PROMOTION_THRESHOLD_WINDOW;
+ start = pgdat->nbp_th_start;
+ if (now_ms - start > th_period &&
+ cmpxchg(&pgdat->nbp_th_start, start, now_ms) == start) {
+ ref_cand = rate_limit *
+ KMIGRATED_PROMOTION_THRESHOLD_WINDOW / MSEC_PER_SEC;
+ nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE);
+ diff_cand = nr_cand - pgdat->nbp_th_nr_cand;
+ unit_th = ref_th * 2 / KMIGRATED_MIGRATION_ADJUST_STEPS;
+ th = pgdat->nbp_threshold ? : ref_th;
+ if (diff_cand > ref_cand * 11 / 10)
+ th = max(th - unit_th, unit_th);
+ else if (diff_cand < ref_cand * 9 / 10)
+ th = min(th + unit_th, ref_th * 2);
+ pgdat->nbp_th_nr_cand = nr_cand;
+ pgdat->nbp_threshold = th;
+ }
+}
+
+static bool kmigrated_should_migrate_memory(unsigned long nr_pages, unsigned long nid,
+ unsigned long time)
+{
+ struct pglist_data *pgdat;
+ unsigned long rate_limit;
+ unsigned int th, def_th;
+ unsigned long now = jiffies;
now = jiffies & PGHOT_TIME_MASK;
+ unsigned long now_ms = jiffies_to_msecs(now);
+
+ pgdat = NODE_DATA(nid);
+ if (pgdat_free_space_enough(pgdat)) {
+ /* workload changed, reset hot threshold */
+ pgdat->nbp_threshold = 0;
+ mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE_NRL, nr_pages);
+ return true;
+ }
+
+ def_th = sysctl_pghot_freq_window;
+ rate_limit = MB_TO_PAGES(sysctl_pghot_promote_rate_limit);
+ kmigrated_promotion_adjust_threshold(pgdat, rate_limit, def_th, now_ms);
+
+ th = pgdat->nbp_threshold ? : def_th;
+ if (jiffies_to_msecs(now - time) >= th)
The time stored in the pfn hotness value is masked with PGHOT_TIME_MASK in pghot_record_access().
Therefore 'now' should also be calculated using PGHOT_TIME_MASK here, so that the comparison is correct.
Regards,
Alok Rathore