[RFC PATCH 4/5] x86/ibs: Adjust access faults sampling period
From: Bharata B Rao
Date: Wed Feb 08 2023 - 02:37:26 EST
Adjust the access faults sampling period of a thread to be within
the fixed minimum and maximum values. The adjustment logic uses the
private/shared and local/remote access faults stats. The algorithm
is the same as the logic followed to adjust the scan period.
Unlike hinting faults, the min and max sampling period aren't
adjusted (yet) for access based sampling.
Signed-off-by: Bharata B Rao <bharata@xxxxxxx>
---
include/linux/sched.h | 2 +
kernel/sched/debug.c | 8 +++
kernel/sched/fair.c | 130 +++++++++++++++++++++++++++++++++++++-----
kernel/sched/sched.h | 4 ++
4 files changed, 130 insertions(+), 14 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 66c532418d38..101c6377abbc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1257,6 +1257,8 @@ struct task_struct {
unsigned int numa_sample_period;
int numa_preferred_nid;
unsigned long numa_migrate_retry;
+ unsigned int numa_access_faults;
+ unsigned int numa_access_faults_window;
/* Migration stamp: */
u64 node_stamp;
u64 last_task_numa_placement;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 1637b65ba07a..1cf19778a232 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -334,6 +334,14 @@ static __init int sched_init_debug(void)
debugfs_create_u32("scan_period_max_ms", 0644, numa, &sysctl_numa_balancing_scan_period_max);
debugfs_create_u32("scan_size_mb", 0644, numa, &sysctl_numa_balancing_scan_size);
debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold);
+ debugfs_create_u32("sample_period_def", 0644, numa,
+ &sysctl_numa_balancing_sample_period_def);
+ debugfs_create_u32("sample_period_min", 0644, numa,
+ &sysctl_numa_balancing_sample_period_min);
+ debugfs_create_u32("sample_period_max", 0644, numa,
+ &sysctl_numa_balancing_sample_period_max);
+ debugfs_create_u32("access_faults_threshold", 0644, numa,
+ &sysctl_numa_balancing_access_faults_threshold);
#endif
debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3f617c799821..1b0665b034d0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1093,6 +1093,11 @@ adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr)
#endif /* CONFIG_NUMA */
#ifdef CONFIG_NUMA_BALANCING
+unsigned int sysctl_numa_balancing_sample_period_def = 10000;
+unsigned int sysctl_numa_balancing_sample_period_min = 5000;
+unsigned int sysctl_numa_balancing_sample_period_max = 20000;
+unsigned int sysctl_numa_balancing_access_faults_threshold = 250;
+
/*
* Approximate time to scan a full NUMA task in ms. The task scan period is
* calculated based on the tasks virtual memory size and
@@ -1572,6 +1577,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
struct numa_group *ng = deref_curr_numa_group(p);
int dst_nid = cpu_to_node(dst_cpu);
int last_cpupid, this_cpupid;
+ bool early = false;
/*
* The pages in slow memory node should be migrated according
@@ -1611,13 +1617,21 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
!node_is_toptier(src_nid) && !cpupid_valid(last_cpupid))
return false;
+ if (static_branch_unlikely(&hw_access_hints)) {
+ if (p->numa_access_faults < sysctl_numa_balancing_access_faults_threshold * 4)
+ early = true;
+ } else {
+ if (p->numa_scan_seq <= 4)
+ early = true;
+ }
+
/*
* Allow first faults or private faults to migrate immediately early in
* the lifetime of a task. The magic number 4 is based on waiting for
* two full passes of the "multi-stage node selection" test that is
* executed below.
*/
- if ((p->numa_preferred_nid == NUMA_NO_NODE || p->numa_scan_seq <= 4) &&
+ if ((p->numa_preferred_nid == NUMA_NO_NODE || early) &&
(cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
return true;
@@ -2305,7 +2319,11 @@ static void numa_migrate_preferred(struct task_struct *p)
return;
/* Periodically retry migrating the task to the preferred node */
- interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
+ if (static_branch_unlikely(&hw_access_hints))
+ interval = min(interval, msecs_to_jiffies(p->numa_sample_period) / 16);
+ else
+ interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
+
p->numa_migrate_retry = jiffies + interval;
/* Success if task is already running on preferred CPU */
@@ -2430,6 +2448,77 @@ static void update_task_scan_period(struct task_struct *p,
memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
}
+static void update_task_sample_period(struct task_struct *p,
+ unsigned long shared, unsigned long private)
+{
+ unsigned int period_slot;
+ int lr_ratio, ps_ratio;
+ int diff;
+
+ unsigned long remote = p->numa_faults_locality[0];
+ unsigned long local = p->numa_faults_locality[1];
+
+ /*
+ * If there were no access faults then either the task is
+ * completely idle or all activity is in areas that are not of interest
+ * to automatic numa balancing. Related to that, if there were failed
+ * migrations then it implies we are migrating too quickly or the local
+ * node is overloaded. In either case, increase the sampling period
+ * (i.e. sample less often).
+ */
+ if (local + shared == 0 || p->numa_faults_locality[2]) {
+ p->numa_sample_period = min(sysctl_numa_balancing_sample_period_max,
+ p->numa_sample_period << 1);
+ return;
+ }
+
+ /*
+ * Prepare to scale the sample period relative to the current period.
+ * == NUMA_PERIOD_THRESHOLD sample period stays the same
+ * < NUMA_PERIOD_THRESHOLD sample period decreases
+ * >= NUMA_PERIOD_THRESHOLD sample period increases
+ */
+ period_slot = DIV_ROUND_UP(p->numa_sample_period, NUMA_PERIOD_SLOTS);
+ lr_ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
+ ps_ratio = (private * NUMA_PERIOD_SLOTS) / (private + shared);
+
+ if (ps_ratio >= NUMA_PERIOD_THRESHOLD) {
+ /*
+ * Most memory accesses are local. There is no need to
+ * do fast access sampling, since memory is already local.
+ */
+ int slot = ps_ratio - NUMA_PERIOD_THRESHOLD;
+
+ if (!slot)
+ slot = 1;
+ diff = slot * period_slot;
+ } else if (lr_ratio >= NUMA_PERIOD_THRESHOLD) {
+ /*
+ * Most memory accesses are shared with other tasks.
+ * There is no point in continuing fast access sampling,
+ * since other tasks may just move the memory elsewhere.
+ */
+ int slot = lr_ratio - NUMA_PERIOD_THRESHOLD;
+
+ if (!slot)
+ slot = 1;
+ diff = slot * period_slot;
+ } else {
+ /*
+ * Private memory faults exceed (SLOTS-THRESHOLD)/SLOTS,
+ * yet they are not on the local NUMA node. Speed up
+ * access sampling to get the memory moved over.
+ */
+ int ratio = max(lr_ratio, ps_ratio);
+
+ diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
+ }
+
+ p->numa_sample_period = clamp(p->numa_sample_period + diff,
+ sysctl_numa_balancing_sample_period_min,
+ sysctl_numa_balancing_sample_period_max);
+ memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
+}
+
/*
* Get the fraction of time the task has been running since the last
* NUMA placement cycle. The scheduler keeps similar statistics, but
@@ -2560,16 +2649,24 @@ static void task_numa_placement(struct task_struct *p)
spinlock_t *group_lock = NULL;
struct numa_group *ng;
- /*
- * The p->mm->numa_scan_seq field gets updated without
- * exclusive access. Use READ_ONCE() here to ensure
- * that the field is read in a single access:
- */
- seq = READ_ONCE(p->mm->numa_scan_seq);
- if (p->numa_scan_seq == seq)
- return;
- p->numa_scan_seq = seq;
- p->numa_scan_period_max = task_scan_max(p);
+ if (static_branch_unlikely(&hw_access_hints)) {
+ p->numa_access_faults_window++;
+ p->numa_access_faults++;
+ if (p->numa_access_faults_window < sysctl_numa_balancing_access_faults_threshold)
+ return;
+ p->numa_access_faults_window = 0;
+ } else {
+ /*
+ * The p->mm->numa_scan_seq field gets updated without
+ * exclusive access. Use READ_ONCE() here to ensure
+ * that the field is read in a single access:
+ */
+ seq = READ_ONCE(p->mm->numa_scan_seq);
+ if (p->numa_scan_seq == seq)
+ return;
+ p->numa_scan_seq = seq;
+ p->numa_scan_period_max = task_scan_max(p);
+ }
total_faults = p->numa_faults_locality[0] +
p->numa_faults_locality[1];
@@ -2672,7 +2769,10 @@ static void task_numa_placement(struct task_struct *p)
sched_setnuma(p, max_nid);
}
- update_task_scan_period(p, fault_types[0], fault_types[1]);
+ if (static_branch_unlikely(&hw_access_hints))
+ update_task_sample_period(p, fault_types[0], fault_types[1]);
+ else
+ update_task_scan_period(p, fault_types[0], fault_types[1]);
}
static inline int get_numa_group(struct numa_group *grp)
@@ -3094,7 +3194,9 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
p->node_stamp = 0;
p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
p->numa_scan_period = sysctl_numa_balancing_scan_delay;
- p->numa_sample_period = 0;
+ p->numa_sample_period = sysctl_numa_balancing_sample_period_def;
+ p->numa_access_faults = 0;
+ p->numa_access_faults_window = 0;
p->numa_migrate_retry = 0;
/* Protect against double add, see task_tick_numa and task_numa_work */
p->numa_work.next = &p->numa_work;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 953d16c802d6..0367dc727cc4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2473,6 +2473,10 @@ extern unsigned int sysctl_numa_balancing_scan_period_min;
extern unsigned int sysctl_numa_balancing_scan_period_max;
extern unsigned int sysctl_numa_balancing_scan_size;
extern unsigned int sysctl_numa_balancing_hot_threshold;
+extern unsigned int sysctl_numa_balancing_sample_period_def;
+extern unsigned int sysctl_numa_balancing_sample_period_min;
+extern unsigned int sysctl_numa_balancing_sample_period_max;
+extern unsigned int sysctl_numa_balancing_access_faults_threshold;
#endif
#ifdef CONFIG_SCHED_HRTICK
--
2.25.1