[RFC PATCH 2/2] sched: Implement adaptive rate limiting of task migrations

From: Mathieu Desnoyers
Date: Tue Sep 05 2023 - 14:58:19 EST


Define a per-task migration quota (initially 10) within a 2ms window.
Whenever a task reaches its quota, subsequent affine wakeups keep the
task on its previous CPU for the rest of the window. When the quota has
been entirely used within a window, it is halved for the next window,
down to a minimum of 1 per window. When the quota has not been entirely
used within a window, it is replenished by 1 for the next window, up to
the initial maximum of 10.
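
For illustration, the following stand-alone userspace sketch mimics the
per-window accounting with the constants and field names introduced by
this patch (it is a sketch for exposition only, not part of the patch):

#include <stdio.h>

typedef unsigned long long u64;		/* stand-in for the kernel's u64 */

#define SCHED_MIGRATION_RATELIMIT_WINDOW	2000000ULL	/* 2 ms, in ns */
#define SCHED_MIGRATION_QUOTA			10
#define SCHED_MIGRATION_QUOTA_REPLENISH		1

struct window_state {
        u64 next_migration_time;	/* end of the current window */
        int nr_migrations_per_window;	/* migrations seen in the window */
        int quota_migrations_per_window;	/* current per-window quota */
};

/* Return 1 if a migration at time @now is allowed, 0 otherwise. */
static int migration_allowed(const struct window_state *w, u64 now)
{
        if (w->nr_migrations_per_window < w->quota_migrations_per_window)
                return 1;
        return now > w->next_migration_time;
}

/* Account for a migration at @now, adapting the quota on window rollover. */
static void account_migration(struct window_state *w, u64 now)
{
        if (now < w->next_migration_time) {
                w->nr_migrations_per_window++;
                return;
        }
        w->next_migration_time = now + SCHED_MIGRATION_RATELIMIT_WINDOW;
        if (w->nr_migrations_per_window >= w->quota_migrations_per_window) {
                /* Quota entirely used: halve it, down to a minimum of 1. */
                w->quota_migrations_per_window >>= 1;
                if (w->quota_migrations_per_window < 1)
                        w->quota_migrations_per_window = 1;
        } else {
                /* Quota not entirely used: replenish, up to the initial value. */
                w->quota_migrations_per_window += SCHED_MIGRATION_QUOTA_REPLENISH;
                if (w->quota_migrations_per_window > SCHED_MIGRATION_QUOTA)
                        w->quota_migrations_per_window = SCHED_MIGRATION_QUOTA;
        }
        w->nr_migrations_per_window = 1;
}

int main(void)
{
        struct window_state w = { .quota_migrations_per_window = SCHED_MIGRATION_QUOTA };
        u64 now = 0;
        int i;

        /* 100 migration attempts, 100us apart: 20 attempts per 2ms window. */
        for (i = 0; i < 100; i++, now += 100000) {
                if (migration_allowed(&w, now))
                        account_migration(&w, now);
        }
        /* Sustained pressure halves the quota from 10 down to 1. */
        printf("adapted quota: %d\n", w.quota_migrations_per_window);
        return 0;
}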

One advantage of this approach is that it leaves extra room for
sporadic migration bursts, while limiting the migration rate more
strictly for tasks that repeatedly exhaust their quota.

Another advantage is that it reduces the number of sched_clock calls
on the wakeup path while tasks are below their quota.

The resulting benchmark numbers for the hackbench and perf bench sched
messaging workloads are similar to those achieved with the
non-adaptive approach.

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@xxxxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Valentin Schneider <vschneid@xxxxxxxxxx>
Cc: Steven Rostedt <rostedt@xxxxxxxxxxx>
Cc: Ben Segall <bsegall@xxxxxxxxxx>
Cc: Mel Gorman <mgorman@xxxxxxx>
Cc: Daniel Bristot de Oliveira <bristot@xxxxxxxxxx>
Cc: Vincent Guittot <vincent.guittot@xxxxxxxxxx>
Cc: Juri Lelli <juri.lelli@xxxxxxxxxx>
Cc: Swapnil Sapkal <Swapnil.Sapkal@xxxxxxx>
Cc: Aaron Lu <aaron.lu@xxxxxxxxx>
Cc: Julien Desfossez <jdesfossez@xxxxxxxxxxxxxxxx>
Cc: x86@xxxxxxxxxx
---
include/linux/sched.h | 2 ++
kernel/sched/core.c | 2 ++
kernel/sched/fair.c | 36 +++++++++++++++++++++++++++++++-----
kernel/sched/sched.h | 2 ++
4 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1111d04255cc..2190e59ebc99 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -565,6 +565,8 @@ struct sched_entity {
u64 nr_migrations;

u64 next_migration_time;
+ int nr_migrations_per_window;
+ int quota_migrations_per_window;

#ifdef CONFIG_FAIR_GROUP_SCHED
int depth;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0d294fce261d..834e37231c36 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4511,6 +4511,8 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->se.vlag = 0;
p->se.slice = sysctl_sched_base_slice;
p->se.next_migration_time = 0;
+ p->se.nr_migrations_per_window = 0;
+ p->se.quota_migrations_per_window = SCHED_MIGRATION_QUOTA;
INIT_LIST_HEAD(&p->se.group_node);

#ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 24ac69913005..e2f4c30483a1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -963,9 +963,11 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se);
static bool should_migrate_task(struct task_struct *p, int prev_cpu)
{
/* Rate limit task migration. */
- if (sched_clock_cpu(prev_cpu) < p->se.next_migration_time)
- return false;
- return true;
+ if (p->se.nr_migrations_per_window < p->se.quota_migrations_per_window)
+ return true;
+ if (sched_clock_cpu(prev_cpu) > p->se.next_migration_time)
+ return true;
+ return false;
}

/*
@@ -7946,6 +7948,31 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
return new_cpu;
}

+static void migrate_task_ratelimit_fair(struct task_struct *p, int new_cpu)
+{
+ struct sched_entity *se = &p->se;
+ u64 now;
+
+ /* Rate limit task migration. */
+ now = sched_clock_cpu(task_cpu(p));
+ if (now < se->next_migration_time) {
+ se->nr_migrations_per_window++;
+ if (!sched_clock_stable())
+ se->next_migration_time += sched_clock_cpu(new_cpu) - now;
+ } else {
+ if (!sched_clock_stable())
+ now = sched_clock_cpu(new_cpu);
+ se->next_migration_time = now + SCHED_MIGRATION_RATELIMIT_WINDOW;
+ if (se->nr_migrations_per_window >= se->quota_migrations_per_window)
+ se->quota_migrations_per_window = max(1, se->quota_migrations_per_window >> 1);
+ else
+ se->quota_migrations_per_window =
+ min(SCHED_MIGRATION_QUOTA,
+ se->quota_migrations_per_window + SCHED_MIGRATION_QUOTA_REPLENISH);
+ se->nr_migrations_per_window = 1;
+ }
+}
+
/*
* Called immediately before a task is migrated to a new CPU; task_cpu(p) and
* cfs_rq_of(p) references at time of call are still valid and identify the
@@ -7955,8 +7982,7 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
{
struct sched_entity *se = &p->se;

- /* Rate limit task migration. */
- se->next_migration_time = sched_clock_cpu(new_cpu) + SCHED_MIGRATION_RATELIMIT_WINDOW;
+ migrate_task_ratelimit_fair(p, new_cpu);

if (!task_on_rq_migrating(p)) {
remove_entity_load_avg(se);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c9b1a5976761..04529661976c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -105,6 +105,8 @@ struct cpuidle_state;
#define TASK_ON_RQ_MIGRATING 2

#define SCHED_MIGRATION_RATELIMIT_WINDOW 2000000 /* 2 ms */
+#define SCHED_MIGRATION_QUOTA 10
+#define SCHED_MIGRATION_QUOTA_REPLENISH 1

extern __read_mostly int scheduler_running;

--
2.39.2