[RFC PATCH 1/1] sched/numa: Hot VMA and shared VMA optimization

From: Raghavendra K T
Date: Fri Mar 22 2024 - 09:45:53 EST


Optimizations are based on the history of PIDs accessing a VMA.

- Increase the tasks' access history window (PeterZ) from 2 to 4.
  (This change is from Peter Zijlstra <peterz@xxxxxxxxxxxxx>.)

Idea: a task is allowed to scan a VMA if:
- the VMA was very recently accessed, as indicated by the latest
access PID information (hot VMA), or
- the VMA is shared by more than two tasks; here the whole history of the
VMA's access PIDs is considered, using bitmap_weight(). (A toy model of
the scheme is sketched below.)
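
To make the scheme concrete, here is a rough standalone userspace model
of the rotating PID-history tracking (illustrative only: the kernel code
operates on vma->numab_state, hashes PIDs with hash_32() and tests with
bitmap_weight(); the struct and helper names below are made up for this
sketch):

#include <stdbool.h>

#define NR_ACCESS_PID_HIST 4
#define SHARED_VMA_THRESH  3
#define BITS_PER_LONG      (8 * (int)sizeof(unsigned long))

/* hypothetical stand-in for the PID tracking in vma->numab_state */
struct pid_hist {
	unsigned long active[NR_ACCESS_PID_HIST];
	unsigned long idx;	/* slot currently being filled */
};

/* record an access; pid_bit is the hashed PID (hash collisions give
 * false positives, which the tracking tolerates, as in the kernel) */
static void record_access(struct pid_hist *h, unsigned int pid_bit)
{
	h->active[h->idx] |= 1UL << (pid_bit % BITS_PER_LONG);
}

/* window rotation every reset period: recycle the oldest slot */
static void rotate_window(struct pid_hist *h)
{
	h->idx = (h->idx + 1) % NR_ACCESS_PID_HIST;
	h->active[h->idx] = 0;
}

/* "hot": at least one access recorded in the current window */
static bool accessed_recent(const struct pid_hist *h)
{
	return __builtin_popcountl(h->active[h->idx]) >= 1;
}

/* "shared": at least SHARED_VMA_THRESH distinct hashed PIDs seen
 * over the whole history */
static bool shared_access(const struct pid_hist *h)
{
	unsigned long all = 0;
	int i;

	for (i = 0; i < NR_ACCESS_PID_HIST; i++)
		all |= h->active[i];
	return __builtin_popcountl(all) >= SHARED_VMA_THRESH;
}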

Signed-off-by: Raghavendra K T <raghavendra.kt@xxxxxxx>
---
I will split the patchset and post it if this approach is found useful
going forward. The first change is from PeterZ.
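
As a quick sanity check of the wider window, this self-contained toy
program (userspace, purely illustrative) shows that a single recorded
access survives three window rotations after the window in which it was
recorded (four windows in total), double the old two-window history:

#include <stdio.h>

#define NR 4	/* matches NR_ACCESS_PID_HIST in the patch */

int main(void)
{
	unsigned long hist[NR] = { 1UL };	/* one access in slot 0 */
	unsigned long idx = 0, all;
	int reset, i;

	for (reset = 1; reset <= 4; reset++) {
		idx = (idx + 1) % NR;
		hist[idx] = 0;			/* recycle the oldest slot */
		for (all = 0, i = 0; i < NR; i++)
			all |= hist[i];
		printf("after reset %d: union=%#lx\n", reset, all);
	}
	return 0;
}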

include/linux/mm.h | 12 ++++++---
include/linux/mm_types.h | 11 +++++---
kernel/sched/fair.c | 58 ++++++++++++++++++++++++++++++++++++----
3 files changed, 69 insertions(+), 12 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index f5a97dec5169..1bf1df064b60 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1744,10 +1744,14 @@ static inline int folio_xchg_access_time(struct folio *folio, int time)
static inline void vma_set_access_pid_bit(struct vm_area_struct *vma)
{
unsigned int pid_bit;
-
- pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
- if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->pids_active[1])) {
- __set_bit(pid_bit, &vma->numab_state->pids_active[1]);
+ unsigned long *pids, pid_idx;
+
+ if (vma->numab_state) {
+ pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
+ pid_idx = READ_ONCE(vma->numab_state->pids_active_idx);
+ pids = vma->numab_state->pids_active + pid_idx;
+ if (!test_bit(pid_bit, pids))
+ __set_bit(pid_bit, pids);
}
}
#else /* !CONFIG_NUMA_BALANCING */
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 8b611e13153e..050ceef1e9d5 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -574,6 +574,7 @@ struct vma_lock {
struct rw_semaphore lock;
};

+#define NR_ACCESS_PID_HIST 4
struct vma_numab_state {
/*
* Initialised as time in 'jiffies' after which VMA
@@ -588,17 +589,21 @@ struct vma_numab_state {
*/
unsigned long pids_active_reset;

+ /* Index of the currently active PID tracking window. */
+ unsigned long pids_active_idx;
+
/*
* Approximate tracking of PIDs that trapped a NUMA hinting
* fault. May produce false positives due to hash collisions.
*
- * [0] Previous PID tracking
- * [1] Current PID tracking
+ * [pids_active_idx - 1] Previous PID tracking
+ * [pids_active_idx] Current PID tracking
*
+ * The whole array is used in a rotating manner to track the latest PIDs.
* Window moves after next_pid_reset has expired approximately
* every VMA_PID_RESET_PERIOD jiffies:
*/
- unsigned long pids_active[2];
+ unsigned long pids_active[NR_ACCESS_PID_HIST];

/* MM scan sequence ID when scan first started after VMA creation */
int start_scan_seq;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6a16129f9a5c..ed329b2f4d53 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3157,9 +3157,44 @@ static void reset_ptenuma_scan(struct task_struct *p)
p->mm->numa_scan_offset = 0;
}

+static inline bool vma_test_access_pid_history(struct vm_area_struct *vma)
+{
+ unsigned int i, pid_bit;
+ unsigned long pids = 0;
+
+ pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG));
+
+ for (i = 0; i < NR_ACCESS_PID_HIST; i++)
+ pids |= vma->numab_state->pids_active[i];
+
+ return test_bit(pid_bit, &pids);
+}
+
+static inline bool vma_accessed_recent(struct vm_area_struct *vma)
+{
+ unsigned long *pids, pid_idx;
+
+ pid_idx = vma->numab_state->pids_active_idx;
+ pids = vma->numab_state->pids_active + pid_idx;
+
+ return (bitmap_weight(pids, BITS_PER_LONG) >= 1);
+}
+
+#define SHARED_VMA_THRESH 3
+
+static inline bool vma_shared_access(struct vm_area_struct *vma)
+{
+ int i;
+ unsigned long pids = 0;
+
+ for (i = 0; i < NR_ACCESS_PID_HIST; i++)
+ pids |= vma->numab_state->pids_active[i];
+
+ return (bitmap_weight(&pids, BITS_PER_LONG) >= SHARED_VMA_THRESH);
+}
+
static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
{
- unsigned long pids;
/*
* Allow unconditional access first two times, so that all the (pages)
* of VMAs get prot_none fault introduced irrespective of accesses.
@@ -3169,8 +3204,16 @@ static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma)
if ((READ_ONCE(current->mm->numa_scan_seq) - vma->numab_state->start_scan_seq) < 2)
return true;

- pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1];
- if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids))
+ /* Check if the current task has historically accessed the VMA. */
+ if (vma_test_access_pid_history(vma))
+ return true;
+
+ /* Check whether at least one task has accessed the VMA recently. */
+ if (vma_accessed_recent(vma))
+ return true;
+
+ /* Check if the VMA is shared by many tasks. */
+ if (vma_shared_access(vma))
return true;

/*
@@ -3202,6 +3245,7 @@ static void task_numa_work(struct callback_head *work)
unsigned long nr_pte_updates = 0;
long pages, virtpages;
struct vma_iterator vmi;
+ unsigned long pid_idx;
bool vma_pids_skipped;
bool vma_pids_forced = false;

@@ -3341,8 +3385,12 @@ static void task_numa_work(struct callback_head *work)
time_after(jiffies, vma->numab_state->pids_active_reset)) {
vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset +
msecs_to_jiffies(VMA_PID_RESET_PERIOD);
- vma->numab_state->pids_active[0] = READ_ONCE(vma->numab_state->pids_active[1]);
- vma->numab_state->pids_active[1] = 0;
+
+ pid_idx = vma->numab_state->pids_active_idx;
+ pid_idx = (pid_idx + 1) % NR_ACCESS_PID_HIST;
+
+ vma->numab_state->pids_active_idx = pid_idx;
+ vma->numab_state->pids_active[pid_idx] = 0;
}

/* Do not rescan VMAs twice within the same sequence. */
--
2.34.1