[PATCH 1/2] sched/core: switch struct rq->nr_iowait to a normal int

From: Jens Axboe
Date: Wed Feb 28 2024 - 14:24:17 EST


In 3 of the 4 spots where we modify rq->nr_iowait, we already hold the
rq lock and hence don't need atomics to modify the current per-rq
iowait count. In the 4th case, where a task is woken to run on a
different CPU than the one it was previously on, we do not hold the
previous rq lock and hence still need an atomic to adjust the iowait
count.

Rename the existing atomic_t nr_iowait to nr_iowait_remote, used only
for that 4th case, and add a plain unsigned int nr_iowait for the
other three cases, which can then inc/dec non-atomically under the
held rq lock.

The per-rq iowait count now becomes the difference between the two:
the local count minus the remote count.
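
To illustrate the scheme (a minimal userspace sketch, not part of the
patch: C11 atomics stand in for the kernel's atomic_t, and the
rq_sketch/iowait_*() names are made up, loosely mirroring the call
sites touched below):

#include <stdatomic.h>

struct rq_sketch {
	unsigned int nr_iowait;		/* modified under the rq lock only */
	atomic_uint nr_iowait_remote;	/* modified without the rq lock */
};

/* __schedule(): task goes to sleep in iowait, rq lock held */
static void iowait_start(struct rq_sketch *rq)
{
	rq->nr_iowait++;
}

/* ttwu_do_activate(): task wakes up on the same CPU, rq lock held */
static void iowait_end_local(struct rq_sketch *rq)
{
	rq->nr_iowait--;
}

/*
 * try_to_wake_up(): task wakes up on another CPU; the previous rq lock
 * is not held, so bump the atomic remote counter instead.
 */
static void iowait_end_remote(struct rq_sketch *rq)
{
	atomic_fetch_add(&rq->nr_iowait_remote, 1);
}

/* nr_iowait_cpu(): tasks currently in iowait on this runqueue */
static unsigned int iowait_count(struct rq_sketch *rq)
{
	return rq->nr_iowait - atomic_load(&rq->nr_iowait_remote);
}

Note that neither counter is meaningful on its own: a sleep that ends
in a remote wakeup leaves nr_iowait permanently elevated, matched by
an increment of nr_iowait_remote. Both may eventually wrap, but the
unsigned subtraction keeps their difference correct.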

Signed-off-by: Jens Axboe <axboe@xxxxxxxxx>
---
 kernel/sched/core.c    | 15 ++++++++++-----
 kernel/sched/cputime.c |  3 +--
 kernel/sched/sched.h   |  8 +++++++-
 3 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9116bcc90346..48d15529a777 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3789,7 +3789,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
 #endif
 	if (p->in_iowait) {
 		delayacct_blkio_end(p);
-		atomic_dec(&task_rq(p)->nr_iowait);
+		task_rq(p)->nr_iowait--;
 	}
 
 	activate_task(rq, p, en_flags);
@@ -4354,8 +4354,10 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU);
 	if (task_cpu(p) != cpu) {
 		if (p->in_iowait) {
+			struct rq *__rq = task_rq(p);
+
 			delayacct_blkio_end(p);
-			atomic_dec(&task_rq(p)->nr_iowait);
+			atomic_inc(&__rq->nr_iowait_remote);
 		}
 
 		wake_flags |= WF_MIGRATED;
@@ -5463,7 +5465,9 @@ unsigned long long nr_context_switches(void)

 unsigned int nr_iowait_cpu(int cpu)
 {
-	return atomic_read(&cpu_rq(cpu)->nr_iowait);
+	struct rq *rq = cpu_rq(cpu);
+
+	return rq->nr_iowait - atomic_read(&rq->nr_iowait_remote);
 }
 
 /*
@@ -6681,7 +6685,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
 			deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
 
 			if (prev->in_iowait) {
-				atomic_inc(&rq->nr_iowait);
+				rq->nr_iowait++;
 				delayacct_blkio_start();
 			}
 		}
@@ -10029,7 +10033,8 @@ void __init sched_init(void)
 #endif
 #endif /* CONFIG_SMP */
 		hrtick_rq_init(rq);
-		atomic_set(&rq->nr_iowait, 0);
+		rq->nr_iowait = 0;
+		atomic_set(&rq->nr_iowait_remote, 0);
 
 #ifdef CONFIG_SCHED_CORE
 		rq->core = rq;
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index af7952f12e6c..0ed81c2d3c3b 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -222,9 +222,8 @@ void account_steal_time(u64 cputime)
 void account_idle_time(u64 cputime)
 {
 	u64 *cpustat = kcpustat_this_cpu->cpustat;
-	struct rq *rq = this_rq();
 
-	if (atomic_read(&rq->nr_iowait) > 0)
+	if (nr_iowait_cpu(smp_processor_id()) > 0)
 		cpustat[CPUTIME_IOWAIT] += cputime;
 	else
 		cpustat[CPUTIME_IDLE] += cputime;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 001fe047bd5d..91fa5b4d45ed 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1049,7 +1049,13 @@ struct rq {
 	u64			clock_idle_copy;
 #endif
 
-	atomic_t		nr_iowait;
+	/*
+	 * Total per-cpu iowait is the difference of the two below. One is
+	 * modified under the rq lock (nr_iowait), and if we don't have the rq
+	 * lock, then nr_iowait_remote is used.
+	 */
+	unsigned int		nr_iowait;
+	atomic_t		nr_iowait_remote;
 
 #ifdef CONFIG_SCHED_DEBUG
 	u64			last_seen_need_resched_ns;
--
2.43.0