[RFC PATCH 3/3] sched/fair: Add a per-shard overload flag

From: K Prateek Nayak
Date: Thu Aug 31 2023 - 06:47:54 EST


Even with the two patches, I still observe the following lock
contention when profiling the tbench 128-clients run with IBS:

- 12.61%  swapper  [kernel.vmlinux]  [k] native_queued_spin_lock_slowpath
   - 10.94% native_queued_spin_lock_slowpath
      - 10.73% _raw_spin_lock
         - 9.57% __schedule
              schedule_idle
              do_idle
            + cpu_startup_entry
         - 0.82% task_rq_lock
              newidle_balance
              pick_next_task_fair
              __schedule
              schedule_idle
              do_idle
            + cpu_startup_entry

Since David mentioned that the rq->avg_idle check is probably not the right
step towards the solution, this experiment introduces a per-shard
"overload" flag. Similar to "rq->rd->overload", the per-shard overload flag
signals that one or more of the rqs covered by the shard's domain may have
a queued task. The shard's overload flag is set at the same time as
"rq->rd->overload", and is cleared when the shard's list is found to be
empty.
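
The flag is only a hint: the consumer does a cheap racy read of it before
touching the shard lock, and drops the hint once the list turns out to be
empty, so an idle shard costs a single cache-line read instead of a lock
acquisition. Below is a minimal userspace sketch of that set/clear/check
pattern, not kernel code; the shard is reduced to a queued-task counter and
the helpers shard_mark_overloaded()/shard_try_pop() are invented purely for
this illustration:

/*
 * Userspace model of the per-shard "overload" hint. The relaxed atomics
 * stand in for READ_ONCE()/WRITE_ONCE(); the mutex stands in for the
 * shard's raw spinlock. Names and layout are made up for the sketch.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct shard {
	pthread_mutex_t lock;
	int nr_queued;		/* stand-in for the shard's task list */
	atomic_int overload;	/* hint: the list may hold a queued task */
};

/* Enqueue path: set the hint once an rq in the shard holds >= 2 tasks. */
static void shard_mark_overloaded(struct shard *s)
{
	if (!atomic_load_explicit(&s->overload, memory_order_relaxed))
		atomic_store_explicit(&s->overload, 1, memory_order_relaxed);
}

/* Pop path: bail out cheaply when the hint says there is nothing to pull. */
static bool shard_try_pop(struct shard *s)
{
	bool pulled = false;

	if (!atomic_load_explicit(&s->overload, memory_order_relaxed))
		return false;		/* fast path: lock never taken */

	pthread_mutex_lock(&s->lock);
	if (s->nr_queued > 0) {
		s->nr_queued--;		/* "pull" a task */
		pulled = true;
	} else {
		/* List drained under us: clear the stale hint. */
		atomic_store_explicit(&s->overload, 0, memory_order_relaxed);
	}
	pthread_mutex_unlock(&s->lock);
	return pulled;
}

int main(void)
{
	struct shard s = { .lock = PTHREAD_MUTEX_INITIALIZER };

	printf("pop on idle shard: %d\n", shard_try_pop(&s)); /* 0, no lock */
	s.nr_queued = 2;
	shard_mark_overloaded(&s);
	printf("pop after enqueue: %d\n", shard_try_pop(&s)); /* 1 */
	printf("pop again:         %d\n", shard_try_pop(&s)); /* 1 */
	printf("pop on empty:      %d\n", shard_try_pop(&s)); /* 0, clears hint */
	return 0;
}

Because the hint is advisory, it needs no locking: a stale "1" costs one
extra lock acquisition, and a missed "1" only delays a pull until the next
enqueue sets it again. The actual patch below follows the same shape, with
the additional list_empty() check clearing the flag before the lock is
taken.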

With these changes, the results for tbench 128-clients are as follows:

tip : 1.00 (var: 1.00%)
tip + v3 + series till patch 2 : 0.41 (var: 1.15%) (diff: -58.81%)
tip + v3 + full series : 1.01 (var: 0.36%) (diff: +00.92%)

Signed-off-by: K Prateek Nayak <kprateek.nayak@xxxxxxx>
---
kernel/sched/fair.c | 13 +++++++++++--
kernel/sched/sched.h | 17 +++++++++++++++++
2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 446ffdad49e1..31fe109fdaf0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -186,6 +186,7 @@ static void shared_runq_reassign_domains(void)
rq->cfs.shared_runq = shared_runq;
rq->cfs.shard = &shared_runq->shards[shard_idx];
rq_unlock(rq, &rf);
+ WRITE_ONCE(rq->cfs.shard->overload, 0);
}
}

@@ -202,6 +203,7 @@ static void __shared_runq_drain(struct shared_runq *shared_runq)
list_for_each_entry_safe(p, tmp, &shard->list, shared_runq_node)
list_del_init(&p->shared_runq_node);
raw_spin_unlock(&shard->lock);
+ WRITE_ONCE(shard->overload, 0);
}
}

@@ -258,13 +260,20 @@ shared_runq_pop_task(struct shared_runq_shard *shard, int target)
{
struct task_struct *p;

- if (list_empty(&shard->list))
+ if (!READ_ONCE(shard->overload))
return NULL;

+ if (list_empty(&shard->list)) {
+ WRITE_ONCE(shard->overload, 0);
+ return NULL;
+ }
+
raw_spin_lock(&shard->lock);
p = list_first_entry_or_null(&shard->list, struct task_struct,
shared_runq_node);
- if (p && is_cpu_allowed(p, target))
+ if (!p)
+ WRITE_ONCE(shard->overload, 0);
+ else if (is_cpu_allowed(p, target))
list_del_init(&p->shared_runq_node);
else
p = NULL;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f50176f720b1..e8d4d948f742 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -601,6 +601,20 @@ do { \
struct shared_runq_shard {
struct list_head list;
raw_spinlock_t lock;
+ /*
+ * shared_runq_shard can contain running tasks.
+ * In such cases where all the tasks are running,
+ * it is futile to attempt to pull tasks from the
+ * list. Overload flag is used to indicate case
+ * where one or more rq in the shard domain may
+ * have a queued task. If the flag is 0, it is
+ * very likely that all tasks in the shard are
+ * running and cannot be migrated. This is not
+ * guarded by the shard lock, and since it may
+ * be updated often, it is placed into its own
+ * cacheline.
+ */
+ int overload ____cacheline_aligned;
} ____cacheline_aligned;

/* This would likely work better as a configurable knob via debugfs */
@@ -2585,6 +2599,9 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
if (prev_nr < 2 && rq->nr_running >= 2) {
if (!READ_ONCE(rq->rd->overload))
WRITE_ONCE(rq->rd->overload, 1);
+
+ if (rq->cfs.shard && !READ_ONCE(rq->cfs.shard->overload))
+ WRITE_ONCE(rq->cfs.shard->overload, 1);
}
#endif

--
2.34.1