Re: [PATCH 1/2] sched/core: switch struct rq->nr_iowait to an unsigned int

From: David Wei
Date: Tue Feb 27 2024 - 18:07:32 EST


On 2024-02-27 21:06, Jens Axboe wrote:
> In 3 of the 4 spots where we modify rq->nr_iowait we already hold the
> rq lock, and in the 4th one we can just grab it. This avoids an atomic
> in the scheduler fast path if we're in iowait, with the tradeoff being
> that we'll grab the rq lock for the case where a task is scheduled out
> in iowait mode on one CPU, and scheduled in on another CPU.
>
> This obviously leaves the read side potentially racy, but that should
> be OK. The iowait state changes all the time, and can change while it's
> being read or summed up as well.
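
If I'm reading this right, the invariant after this patch is roughly:

	/*
	 * Writers: modify rq->nr_iowait only with the rq lock held, so a
	 * plain unsigned int is enough:
	 *   __schedule()       - rq->nr_iowait++      (rq lock already held)
	 *   ttwu_do_activate() - rq->nr_iowait--      (rq lock already held)
	 *   try_to_wake_up()   - rq->nr_iowait--      (takes the rq lock)
	 *   sched_init()       - rq->nr_iowait = 0    (early boot, see below)
	 * Readers: lockless and racy by design:
	 *   nr_iowait_cpu(), account_idle_time()
	 */

Is that accurate?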
>
> Signed-off-by: Jens Axboe <axboe@xxxxxxxxx>
> ---
>  kernel/sched/core.c    | 15 ++++++++++-----
>  kernel/sched/cputime.c |  2 +-
>  kernel/sched/sched.h   |  2 +-
>  3 files changed, 12 insertions(+), 7 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 9116bcc90346..ecc6c26096e5 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3789,7 +3789,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
>  #endif
>  	if (p->in_iowait) {
>  		delayacct_blkio_end(p);
> -		atomic_dec(&task_rq(p)->nr_iowait);
> +		task_rq(p)->nr_iowait--;
>  	}
>
>  	activate_task(rq, p, en_flags);
> @@ -4354,8 +4354,13 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
>  	cpu = select_task_rq(p, p->wake_cpu, wake_flags | WF_TTWU);
>  	if (task_cpu(p) != cpu) {
>  		if (p->in_iowait) {
> +			struct rq *rq = task_rq(p);
> +			struct rq_flags rf;
> +
> +			rq_lock(rq, &rf);
> +			task_rq(p)->nr_iowait--;

Could this use the local rq variable directly instead of calling
task_rq(p) again, or does it not matter?
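
i.e. something like:

	rq_lock(rq, &rf);
	rq->nr_iowait--;
	rq_unlock(rq, &rf);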

> +			rq_unlock(rq, &rf);
>  			delayacct_blkio_end(p);
> -			atomic_dec(&task_rq(p)->nr_iowait);
>  		}
>
>  		wake_flags |= WF_MIGRATED;
> @@ -5463,7 +5468,7 @@ unsigned long long nr_context_switches(void)
>
>  unsigned int nr_iowait_cpu(int cpu)
>  {
> -	return atomic_read(&cpu_rq(cpu)->nr_iowait);
> +	return cpu_rq(cpu)->nr_iowait;
>  }
>
>  /*
> @@ -6681,7 +6686,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
>  			deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
>
>  			if (prev->in_iowait) {
> -				atomic_inc(&rq->nr_iowait);
> +				rq->nr_iowait++;
>  				delayacct_blkio_start();
>  			}
>  		}
> @@ -10029,7 +10034,7 @@ void __init sched_init(void)
>  #endif
>  #endif /* CONFIG_SMP */
>  		hrtick_rq_init(rq);
> -		atomic_set(&rq->nr_iowait, 0);
> +		rq->nr_iowait = 0;

I checked that both ttwu_do_activate() and __schedule() modify
nr_iowait with the rq lock held, but I couldn't find the lock being
taken for this one. Is the assumption that the rq is still in a
pre-init state (maybe because scheduler_running = 0?) so no locking is
needed?
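
(For reference, the surrounding per-CPU loop in sched_init() looks
roughly like this, with the rq lock itself only initialized a few
lines earlier:

	for_each_possible_cpu(i) {
		struct rq *rq;

		rq = cpu_rq(i);
		raw_spin_lock_init(&rq->__lock);	/* lock only comes to life here */
		...
		rq->nr_iowait = 0;
		...
	}

so presumably nothing else can be touching these rqs this early in
boot anyway.)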

>
>  #ifdef CONFIG_SCHED_CORE
>  		rq->core = rq;
> diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
> index af7952f12e6c..b970b6c6151e 100644
> --- a/kernel/sched/cputime.c
> +++ b/kernel/sched/cputime.c
> @@ -224,7 +224,7 @@ void account_idle_time(u64 cputime)
>  	u64 *cpustat = kcpustat_this_cpu->cpustat;
>  	struct rq *rq = this_rq();
>
> -	if (atomic_read(&rq->nr_iowait) > 0)
> +	if (rq->nr_iowait)
>  		cpustat[CPUTIME_IOWAIT] += cputime;
>  	else
>  		cpustat[CPUTIME_IDLE] += cputime;
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 001fe047bd5d..a1222a4bdc7b 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -1049,7 +1049,7 @@ struct rq {
>  	u64			clock_idle_copy;
>  #endif
>
> -	atomic_t		nr_iowait;
> +	unsigned int		nr_iowait;
>
>  #ifdef CONFIG_SCHED_DEBUG
>  	u64			last_seen_need_resched_ns;