[PATCH 1/2] sched/core: switch struct rq->nr_iowait to an unsigned int

From: Jens Axboe
Date: Tue Feb 27 2024 - 16:12:14 EST


In 3 of the 4 spots where we modify rq->nr_iowait we already hold the
rq lock, and in the 4th one we can just grab it. This avoids an atomic
in the scheduler fast path when we're in iowait; the tradeoff is that
we now grab the rq lock in the case where a task is scheduled out in
iowait mode on one CPU and scheduled in on another CPU.

This obviously leaves the read side potentially racy, but that should
be OK: iowait states change all the time anyway, and the count can just
as easily change while it is being read or summed up.
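
For instance, the system-wide total remains a plain, lockless sum of
the per-CPU counts, along the lines of the existing nr_iowait() helper
(sketch; that helper is unchanged by this patch):

	unsigned int nr_iowait(void)
	{
		unsigned int i, sum = 0;

		/* plain per-CPU loads, may race with concurrent updates */
		for_each_possible_cpu(i)
			sum += nr_iowait_cpu(i);

		return sum;
	}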

Signed-off-by: Jens Axboe <axboe@xxxxxxxxx>
---
kernel/sched/core.c | 15 ++++++++++-----
kernel/sched/cputime.c | 2 +-
kernel/sched/sched.h | 2 +-
3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9116bcc90346..ecc6c26096e5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3789,7 +3789,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
#endif
if (p->in_iowait) {
delayacct_blkio_end(p);
- atomic_dec(&task_rq(p)->nr_iowait);
+ task_rq(p)->nr_iowait--;
}

activate_task(rq, p, en_flags);
@@ -4354,8 +4354,13 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU);
if (task_cpu(p) != cpu) {
if (p->in_iowait) {
+ struct rq *rq = task_rq(p);
+ struct rq_flags rf;
+
+ rq_lock(rq, &rf);
+ rq->nr_iowait--;
+ rq_unlock(rq, &rf);
delayacct_blkio_end(p);
- atomic_dec(&task_rq(p)->nr_iowait);
}

wake_flags |= WF_MIGRATED;
@@ -5463,7 +5468,7 @@ unsigned long long nr_context_switches(void)

unsigned int nr_iowait_cpu(int cpu)
{
- return atomic_read(&cpu_rq(cpu)->nr_iowait);
+ return cpu_rq(cpu)->nr_iowait;
}

/*
@@ -6681,7 +6686,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);

if (prev->in_iowait) {
- atomic_inc(&rq->nr_iowait);
+ rq->nr_iowait++;
delayacct_blkio_start();
}
}
@@ -10029,7 +10034,7 @@ void __init sched_init(void)
#endif
#endif /* CONFIG_SMP */
hrtick_rq_init(rq);
- atomic_set(&rq->nr_iowait, 0);
+ rq->nr_iowait = 0;

#ifdef CONFIG_SCHED_CORE
rq->core = rq;
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index af7952f12e6c..b970b6c6151e 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -224,7 +224,7 @@ void account_idle_time(u64 cputime)
u64 *cpustat = kcpustat_this_cpu->cpustat;
struct rq *rq = this_rq();

- if (atomic_read(&rq->nr_iowait) > 0)
+ if (rq->nr_iowait)
cpustat[CPUTIME_IOWAIT] += cputime;
else
cpustat[CPUTIME_IDLE] += cputime;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 001fe047bd5d..a1222a4bdc7b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1049,7 +1049,7 @@ struct rq {
u64 clock_idle_copy;
#endif

- atomic_t nr_iowait;
+ unsigned int nr_iowait;

#ifdef CONFIG_SCHED_DEBUG
u64 last_seen_need_resched_ns;
--
2.43.0