[PATCH 09/24] sched: Prepare generic code for delayed dequeue
From: Peter Zijlstra
Date: Sat Jul 27 2024 - 07:03:51 EST
While most of the delayed dequeue code can be done inside the
sched_class itself, there is one location where we do not have an
appropriate hook, namely ttwu_runnable().
Add an ENQUEUE_DELAYED call to the on_rq path to deal with waking
delayed dequeue tasks.
Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
include/linux/sched.h | 1 +
kernel/sched/core.c | 17 ++++++++++++++++-
kernel/sched/sched.h | 2 ++
3 files changed, 19 insertions(+), 1 deletion(-)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -542,6 +542,7 @@ struct sched_entity {
struct list_head group_node;
unsigned int on_rq;
+ unsigned int sched_delayed;
u64 exec_start;
u64 sum_exec_runtime;
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2036,6 +2036,8 @@ void activate_task(struct rq *rq, struct
void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
+ SCHED_WARN_ON(flags & DEQUEUE_SLEEP);
+
WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING);
ASSERT_EXCLUSIVE_WRITER(p->on_rq);
@@ -3677,12 +3679,14 @@ static int ttwu_runnable(struct task_str
rq = __task_rq_lock(p, &rf);
if (task_on_rq_queued(p)) {
+ update_rq_clock(rq);
+ if (p->se.sched_delayed)
+ enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED);
if (!task_on_cpu(rq, p)) {
/*
* When on_rq && !on_cpu the task is preempted, see if
* it should preempt the task that is current now.
*/
- update_rq_clock(rq);
wakeup_preempt(rq, p, wake_flags);
}
ttwu_do_wakeup(p);
@@ -4062,11 +4069,16 @@ int try_to_wake_up(struct task_struct *p
* case the whole 'p->on_rq && ttwu_runnable()' case below
* without taking any locks.
*
+ * Specifically, given current runs ttwu() we must be before
+ * schedule()'s block_task(), as such this must not observe
+ * sched_delayed.
+ *
* In particular:
* - we rely on Program-Order guarantees for all the ordering,
* - we're serialized against set_special_state() by virtue of
* it disabling IRQs (this allows not taking ->pi_lock).
*/
+ SCHED_WARN_ON(p->se.sched_delayed);
if (!ttwu_state_match(p, state, &success))
goto out;
@@ -4358,6 +4370,9 @@ static void __sched_fork(unsigned long c
p->se.slice = sysctl_sched_base_slice;
INIT_LIST_HEAD(&p->se.group_node);
+ /* A delayed task cannot be in clone(). */
+ SCHED_WARN_ON(p->se.sched_delayed);
+
#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = NULL;
#endif
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2258,6 +2258,7 @@ extern const u32 sched_prio_to_wmult[40
#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */
#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */
#define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */
+#define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */
#define ENQUEUE_WAKEUP 0x01
#define ENQUEUE_RESTORE 0x02
@@ -2273,6 +2274,7 @@ extern const u32 sched_prio_to_wmult[40
#endif
#define ENQUEUE_INITIAL 0x80
#define ENQUEUE_MIGRATING 0x100
+#define ENQUEUE_DELAYED 0x200
#define RETRY_TASK ((void *)-1UL)