On 3/6/25 16:45, Bharata B Rao wrote:
kpromoted is a kernel daemon that accumulates hot page info
from different sources and tries to promote pages from slow
tiers to top tiers. One instance of this thread runs on each
node that has CPUs.
Could you please elaborate on what is meant by slow vs top tier? Top-tier
membership is determined by adist (which is a combination of bandwidth and
latency), so I am not sure the terminology here holds.
Subsystems that generate hot page access info can report that
to kpromoted via this API:
int kpromoted_record_access(u64 pfn, int nid, int src,
unsigned long time)
@pfn: The PFN of the memory accessed
@nid: The accessing NUMA node ID
@src: The temperature source (subsystem) that generated the
access info
@time: The access time in jiffies
Some temperature sources may not provide the nid from which
What is a temperature source?
the page was accessed. This is true for sources that use
page table scanning for PTE Accessed bit. Currently the toptier
node to which such pages should be promoted is hard coded.
What would it take to make this flexible?
Also, the access time provided by some sources may at best be
considered approximate. This is especially true for hot pages
detected by PTE A bit scanning.
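To check my understanding of the intended usage, here is a minimal sketch of
how I imagine a temperature source would report accesses (the helper below is
mine, not part of this patch, and assumes <linux/kpromoted.h>, <linux/numa.h>
and <linux/jiffies.h> are included):

static void example_report_access(u64 pfn, int accessing_nid)
{
	/* A hardware-hint style source that knows the accessing node */
	kpromoted_record_access(pfn, accessing_nid, KPROMOTED_HW_HINTS, jiffies);

	/*
	 * A page table scanning source typically does not know the
	 * accessing node and would pass NUMA_NO_NODE instead.
	 */
	kpromoted_record_access(pfn, NUMA_NO_NODE, KPROMOTED_PGTABLE_SCAN, jiffies);
}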
kpromoted currently maintains the hot PFN records in hash lists
hashed by PFN value. Each record stores the following info:
struct page_hotness_info {
unsigned long pfn;
/* Time when this record was updated last */
unsigned long last_update;
/*
* Number of times this page was accessed in the
* current window
*/
int frequency;
/* Most recent access time */
unsigned long recency;
/* Most recent access from this node */
int hot_node;
struct hlist_node hnode;
};
The way in which a page is categorized as hot enough to be
promoted is pretty primitive now.
Signed-off-by: Bharata B Rao <bharata@xxxxxxx>
---
include/linux/kpromoted.h | 54 ++++++
include/linux/mmzone.h | 4 +
include/linux/vm_event_item.h | 13 ++
mm/Kconfig | 7 +
mm/Makefile | 1 +
mm/kpromoted.c | 305 ++++++++++++++++++++++++++++++++++
mm/mm_init.c | 10 ++
mm/vmstat.c | 13 ++
8 files changed, 407 insertions(+)
create mode 100644 include/linux/kpromoted.h
create mode 100644 mm/kpromoted.c
diff --git a/include/linux/kpromoted.h b/include/linux/kpromoted.h
new file mode 100644
index 000000000000..2bef3d74f03a
--- /dev/null
+++ b/include/linux/kpromoted.h
@@ -0,0 +1,54 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_KPROMOTED_H
+#define _LINUX_KPROMOTED_H
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/workqueue_types.h>
+
+/* Page hotness temperature sources */
+enum kpromoted_src {
+ KPROMOTED_HW_HINTS,
+ KPROMOTED_PGTABLE_SCAN,
+};
+
+#ifdef CONFIG_KPROMOTED
+
+#define KPROMOTED_FREQ_WINDOW (5 * MSEC_PER_SEC)
+
+/* 2 accesses within a window will make the page a promotion candidate */
+#define KPRMOTED_FREQ_THRESHOLD 2
+
Were these values derived empirically?
+#define KPROMOTED_HASH_ORDER 16
+
+struct page_hotness_info {
+ unsigned long pfn;
+
+ /* Time when this record was updated last */
+ unsigned long last_update;
+
+ /*
+ * Number of times this page was accessed in the
+ * current window
+ */
+ int frequency;
+
+ /* Most recent access time */
+ unsigned long recency;
+
+ /* Most recent access from this node */
+ int hot_node;
+ struct hlist_node hnode;
+};
+
+#define KPROMOTE_DELAY MSEC_PER_SEC
+
+int kpromoted_record_access(u64 pfn, int nid, int src, unsigned long now);
+#else
+static inline int kpromoted_record_access(u64 pfn, int nid, int src,
+ unsigned long now)
+{
+ return 0;
+}
+#endif /* CONFIG_KPROMOTED */
+#endif /* _LINUX_KPROMOTED_H */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 9540b41894da..a5c4e789aa55 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -1459,6 +1459,10 @@ typedef struct pglist_data {
#ifdef CONFIG_MEMORY_FAILURE
struct memory_failure_stats mf_stats;
#endif
+#ifdef CONFIG_KPROMOTED
+ struct task_struct *kpromoted;
+ wait_queue_head_t kpromoted_wait;
+#endif
} pg_data_t;
#define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages)
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index f70d0958095c..b5823b037883 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -182,6 +182,19 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
KSTACK_REST,
#endif
#endif /* CONFIG_DEBUG_STACK_USAGE */
+ KPROMOTED_RECORDED_ACCESSES,
+ KPROMOTED_RECORD_HWHINTS,
+ KPROMOTED_RECORD_PGTSCANS,
+ KPROMOTED_RECORD_TOPTIER,
+ KPROMOTED_RECORD_ADDED,
+ KPROMOTED_RECORD_EXISTS,
+ KPROMOTED_MIG_RIGHT_NODE,
+ KPROMOTED_MIG_NON_LRU,
+ KPROMOTED_MIG_COLD_OLD,
+ KPROMOTED_MIG_COLD_NOT_ACCESSED,
+ KPROMOTED_MIG_CANDIDATE,
+ KPROMOTED_MIG_PROMOTED,
+ KPROMOTED_MIG_DROPPED,
NR_VM_EVENT_ITEMS
};
diff --git a/mm/Kconfig b/mm/Kconfig
index 1b501db06417..ceaa462a0ce6 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1358,6 +1358,13 @@ config PT_RECLAIM
Note: now only empty user PTE page table pages will be reclaimed.
+config KPROMOTED
+ bool "Kernel hot page promotion daemon"
+ def_bool y
+ depends on NUMA && MIGRATION && MMU
+ help
+ Promote hot pages from lower tier to top tier by using the
+ memory access information provided by various sources.
source "mm/damon/Kconfig"
diff --git a/mm/Makefile b/mm/Makefile
index 850386a67b3e..bf4f5f18f1f9 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -147,3 +147,4 @@ obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
obj-$(CONFIG_EXECMEM) += execmem.o
obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o
obj-$(CONFIG_PT_RECLAIM) += pt_reclaim.o
+obj-$(CONFIG_KPROMOTED) += kpromoted.o
diff --git a/mm/kpromoted.c b/mm/kpromoted.c
new file mode 100644
index 000000000000..2a8b8495b6b3
--- /dev/null
+++ b/mm/kpromoted.c
@@ -0,0 +1,305 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * kpromoted is a kernel thread that runs on each node that has CPUs, i.e.,
+ * on regular nodes.
+ *
+ * Maintains list of hot pages from lower tiers and promotes them.
+ */
+#include <linux/kpromoted.h>
+#include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/mmzone.h>
+#include <linux/migrate.h>
+#include <linux/memory-tiers.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/cpuhotplug.h>
+#include <linux/hashtable.h>
+
+static DEFINE_HASHTABLE(page_hotness_hash, KPROMOTED_HASH_ORDER);
+static struct mutex page_hotness_lock[1UL << KPROMOTED_HASH_ORDER];
+
+static int kpromote_page(struct page_hotness_info *phi)
+{
Why not just call it kpromote_folio?
+ struct page *page = pfn_to_page(phi->pfn);
+ struct folio *folio;
+ int ret;
+
+ if (!page)
+ return 1;
Do we need to check for is_zone_device_page() here?
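For instance, mirroring what page_should_be_promoted() does below (just a
suggestion):

	struct page *page = pfn_to_online_page(phi->pfn);

	if (!page || is_zone_device_page(page))
		return 1;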
+
+ folio = page_folio(page);
+ ret = migrate_misplaced_folio_prepare(folio, NULL, phi->hot_node);
+ if (ret)
+ return 1;
+
+ return migrate_misplaced_folio(folio, phi->hot_node);
+}
Could you please document the assumptions for kpromote_page()? What locks
should be held? Does the ref count need to be incremented?
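Something along these lines is what I have in mind (only a sketch; please fill
in whatever the actual rules are):

/**
 * kpromote_page - attempt to promote a tracked hot page to its hot node
 * @phi: the page_hotness_info record for the page
 *
 * Context: is page_hotness_lock[bkt] expected to be held by the caller?
 * Refcounting: does migrate_misplaced_folio_prepare() take the folio
 * reference, or must the caller hold one?
 *
 * Return: 0 if the folio was promoted, non-zero otherwise.
 */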
+
+static int page_should_be_promoted(struct page_hotness_info *phi)
+{
+ struct page *page = pfn_to_online_page(phi->pfn);
+ unsigned long now = jiffies;
+ struct folio *folio;
+
+ if (!page || is_zone_device_page(page))
+ return false;
+
+ folio = page_folio(page);
+ if (!folio_test_lru(folio)) {
+ count_vm_event(KPROMOTED_MIG_NON_LRU);
+ return false;
+ }
+ if (folio_nid(folio) == phi->hot_node) {
+ count_vm_event(KPROMOTED_MIG_RIGHT_NODE);
+ return false;
+ }
+
+ /* If the page was hot a while ago, don't promote */
+ if ((now - phi->last_update) > 2 * msecs_to_jiffies(KPROMOTED_FREQ_WINDOW)) {
+ count_vm_event(KPROMOTED_MIG_COLD_OLD);
Shouldn't we update phi->last_update here?
+ return false;
+ }
+
+ /* If the page hasn't been accessed enough number of times, don't promote */
+ if (phi->frequency < KPRMOTED_FREQ_THRESHOLD) {
+ count_vm_event(KPROMOTED_MIG_COLD_NOT_ACCESSED);
+ return false;
+ }
+ return true;
+}
+
+/*
+ * Go thro' page hotness information and migrate pages if required.
+ *
+ * Promoted pages are no longer tracked in the hot list.
+ * Cold pages are pruned from the list as well.
+ *
+ * TODO: Batching could be done
+ */
+static void kpromoted_migrate(pg_data_t *pgdat)
+{
+ int nid = pgdat->node_id;
+ struct page_hotness_info *phi;
+ struct hlist_node *tmp;
+ int nr_bkts = HASH_SIZE(page_hotness_hash);
+ int bkt;
+
+ for (bkt = 0; bkt < nr_bkts; bkt++) {
+ mutex_lock(&page_hotness_lock[bkt]);
+ hlist_for_each_entry_safe(phi, tmp, &page_hotness_hash[bkt], hnode) {
+ if (phi->hot_node != nid)
+ continue;
+
+ if (page_should_be_promoted(phi)) {
+ count_vm_event(KPROMOTED_MIG_CANDIDATE);
+ if (!kpromote_page(phi)) {
+ count_vm_event(KPROMOTED_MIG_PROMOTED);
+ hlist_del_init(&phi->hnode);
+ kfree(phi);
+ }
+ } else {
+ /*
+ * Not a suitable page or cold page, stop tracking it.
+ * TODO: Identify cold pages and drive demotion?
+ */
+ count_vm_event(KPROMOTED_MIG_DROPPED);
+ hlist_del_init(&phi->hnode);
+ kfree(phi);
Won't existing demotion already handle this?
+ }
+ }
+ mutex_unlock(&page_hotness_lock[bkt]);
+ }
+}
+
It sounds like NUMA balancing, promotion and demotion can all act in parallel
on these folios; if not, could you clarify their relationship and dependency?
+static struct page_hotness_info *__kpromoted_lookup(unsigned long pfn, int bkt)
+{
+ struct page_hotness_info *phi;
+
+ hlist_for_each_entry(phi, &page_hotness_hash[bkt], hnode) {
+ if (phi->pfn == pfn)
+ return phi;
+ }
+ return NULL;
+}
+
+static struct page_hotness_info *kpromoted_lookup(unsigned long pfn, int bkt, unsigned long now)
+{
+ struct page_hotness_info *phi;
+
+ phi = __kpromoted_lookup(pfn, bkt);
+ if (!phi) {
+ phi = kzalloc(sizeof(struct page_hotness_info), GFP_KERNEL);
+ if (!phi)
+ return ERR_PTR(-ENOMEM);
+
+ phi->pfn = pfn;
+ phi->frequency = 1;
+ phi->last_update = now;
+ phi->recency = now;
+ hlist_add_head(&phi->hnode, &page_hotness_hash[bkt]);
+ count_vm_event(KPROMOTED_RECORD_ADDED);
+ } else {
+ count_vm_event(KPROMOTED_RECORD_EXISTS);
+ }
+ return phi;
+}
+
+/*
+ * Called by subsystems that generate page hotness/access information.
+ *
+ * Records the memory access info for further action by kpromoted.
+ */
+int kpromoted_record_access(u64 pfn, int nid, int src, unsigned long now)
+{
+ struct page_hotness_info *phi;
+ struct page *page;
+ struct folio *folio;
+ int ret, bkt;
+
+ count_vm_event(KPROMOTED_RECORDED_ACCESSES);
+
+ switch (src) {
+ case KPROMOTED_HW_HINTS:
+ count_vm_event(KPROMOTED_RECORD_HWHINTS);
+ break;
+ case KPROMOTED_PGTABLE_SCAN:
+ count_vm_event(KPROMOTED_RECORD_PGTSCANS);
+ break;
+ default:
+ break;
+ }
+
+ /*
+ * Record only accesses from lower tiers.
+ * Assuming node having CPUs as toptier for now.
+ */
+ if (node_is_toptier(pfn_to_nid(pfn))) {
+ count_vm_event(KPROMOTED_RECORD_TOPTIER);
+ return 0;
+ }
+
+ page = pfn_to_online_page(pfn);
+ if (!page || is_zone_device_page(page))
+ return 0;
+
+ folio = page_folio(page);
+ if (!folio_test_lru(folio))
+ return 0;
+
+ bkt = hash_min(pfn, KPROMOTED_HASH_ORDER);
+ mutex_lock(&page_hotness_lock[bkt]);
+ phi = kpromoted_lookup(pfn, bkt, now);
+ if (!phi) {
+ ret = PTR_ERR(phi);
+ goto out;
+ }
+
+ if ((now - phi->last_update) > msecs_to_jiffies(KPROMOTED_FREQ_WINDOW)) {
+ /* New window */
+ phi->frequency = 1; /* TODO: Factor in the history */
+ phi->last_update = now;
+ } else {
+ phi->frequency++;
+ }
+ phi->recency = now;
+
+ /*
+ * TODOs:
+ * 1. Source nid is hard-coded for some temperature sources
+ * 2. Take action if hot_node changes - may be a shared page?
+ * 3. Maintain node info for every access within the window?
+ */
+ phi->hot_node = (nid == NUMA_NO_NODE) ? 1 : nid;
I don't understand why hot_node needs to be 1 when nid is NUMA_NO_NODE. Does
it mean the page is being promoted to the top tier? The mix of hot_node,
tier and nid is not very clear here.
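If the intent is "promote to some default top-tier node", something like the
sketch below (helper name and fallback policy are made up) would be clearer
than a literal 1:

/* Sketch only: pick a default promotion target instead of hard-coding node 1 */
static int kpromoted_default_target(void)
{
	int nid;

	/* The patch elsewhere treats nodes with CPUs as top tier */
	for_each_node_state(nid, N_CPU) {
		if (node_is_toptier(nid))
			return nid;
	}
	/* Fall back to any node that has CPUs */
	return first_node(node_states[N_CPU]);
}

and then in kpromoted_record_access():

	phi->hot_node = (nid == NUMA_NO_NODE) ? kpromoted_default_target() : nid;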
+ mutex_unlock(&page_hotness_lock[bkt]);
+out:
+ return 0;
+}
+
+/*
+ * Go through the accumulated mem_access_info and migrate
+ * pages if required.
+ */
+static void kpromoted_do_work(pg_data_t *pgdat)
+{
+ kpromoted_migrate(pgdat);
+}
+
+static inline bool kpromoted_work_requested(pg_data_t *pgdat)
+{
+ return false;
+}
+
+static int kpromoted(void *p)
+{
+ pg_data_t *pgdat = (pg_data_t *)p;
+ struct task_struct *tsk = current;
+ long timeout = msecs_to_jiffies(KPROMOTE_DELAY);
+
+ const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+
+ if (!cpumask_empty(cpumask))
+ set_cpus_allowed_ptr(tsk, cpumask);
+
+ while (!kthread_should_stop()) {
+ wait_event_timeout(pgdat->kpromoted_wait,
+ kpromoted_work_requested(pgdat), timeout);
+ kpromoted_do_work(pgdat);
+ }
+ return 0;
+}
+
+static void kpromoted_run(int nid)
+{
+ pg_data_t *pgdat = NODE_DATA(nid);
+
+ if (pgdat->kpromoted)
+ return;
+
+ pgdat->kpromoted = kthread_run(kpromoted, pgdat, "kpromoted%d", nid);
+ if (IS_ERR(pgdat->kpromoted)) {
+ pr_err("Failed to start kpromoted on node %d\n", nid);
+ pgdat->kpromoted = NULL;
+ }
+}
+
+static int kpromoted_cpu_online(unsigned int cpu)
+{
+ int nid;
+
+ for_each_node_state(nid, N_CPU) {
+ pg_data_t *pgdat = NODE_DATA(nid);
+ const struct cpumask *mask;
+
+ mask = cpumask_of_node(pgdat->node_id);
+
+ if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
+ /* One of our CPUs online: restore mask */
+ if (pgdat->kpromoted)
+ set_cpus_allowed_ptr(pgdat->kpromoted, mask);
+ }
+ return 0;
+}
+
+static int __init kpromoted_init(void)
+{
+ int nid, ret, i;
+
+ ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
+ "mm/promotion:online",
+ kpromoted_cpu_online, NULL);
+ if (ret < 0) {
+ pr_err("kpromoted: failed to register hotplug callbacks.\n");
+ return ret;
+ }
+
+ for (i = 0; i < (1UL << KPROMOTED_HASH_ORDER); i++)
+ mutex_init(&page_hotness_lock[i]);
+
+ for_each_node_state(nid, N_CPU)
+ kpromoted_run(nid);
+
I think we need a dynamic way of disabling promotion at run time
as well, right?
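For example, a vm sysctl roughly in the style of numa_balancing (entirely a
sketch, names made up):

static unsigned int sysctl_kpromoted_enabled __read_mostly = 1;

static struct ctl_table kpromoted_sysctls[] = {
	{
		.procname	= "kpromoted_enabled",
		.data		= &sysctl_kpromoted_enabled,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_douintvec_minmax,
		.extra1		= SYSCTL_ZERO,
		.extra2		= SYSCTL_ONE,
	},
};

This could be registered from kpromoted_init() with
register_sysctl_init("vm", kpromoted_sysctls), and kpromoted() as well as
kpromoted_record_access() would bail out early when it is 0.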