Re: [PATCH v2] psi: Remove the redundant psi_task_tick

From: Johannes Weiner
Date: Tue Feb 09 2021 - 10:49:13 EST


Hello Chengming,

On Tue, Feb 09, 2021 at 03:10:33PM +0800, Chengming Zhou wrote:
> When the current task in a cgroup is in_memstall, the corresponding groupc
> on that cpu is in PSI_MEM_FULL state, so we can exploit that to remove the
> redundant psi_task_tick from scheduler_tick to save this periodic cost.

Can you please update the patch name and the changelog to the new
version of the patch? It's not removing the redundant tick, it's
moving the reclaim detection from the timer tick to the task state
tracking machinery using the recently added ONCPU state.

> Signed-off-by: Muchun Song <songmuchun@xxxxxxxxxxxxx>
> Signed-off-by: Chengming Zhou <zhouchengming@xxxxxxxxxxxxx>
> ---
> include/linux/psi.h | 1 -
> kernel/sched/core.c | 1 -
> kernel/sched/psi.c | 49 ++++++++++++++-----------------------------------
> kernel/sched/stats.h | 9 ---------
> 4 files changed, 14 insertions(+), 46 deletions(-)
>
> diff --git a/include/linux/psi.h b/include/linux/psi.h
> index 7361023f3fdd..65eb1476ac70 100644
> --- a/include/linux/psi.h
> +++ b/include/linux/psi.h
> @@ -20,7 +20,6 @@ void psi_task_change(struct task_struct *task, int clear, int set);
> void psi_task_switch(struct task_struct *prev, struct task_struct *next,
> bool sleep);
>
> -void psi_memstall_tick(struct task_struct *task, int cpu);
> void psi_memstall_enter(unsigned long *flags);
> void psi_memstall_leave(unsigned long *flags);
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 15d2562118d1..31788a9b335b 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4533,7 +4533,6 @@ void scheduler_tick(void)
> update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
> curr->sched_class->task_tick(rq, curr, 0);
> calc_global_load_tick(rq);
> - psi_task_tick(rq);
>
> rq_unlock(rq, &rf);
>
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index 2293c45d289d..6e46d9eb279b 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -644,8 +644,7 @@ static void poll_timer_fn(struct timer_list *t)
> wake_up_interruptible(&group->poll_wait);
> }
>
> -static void record_times(struct psi_group_cpu *groupc, int cpu,
> - bool memstall_tick)
> +static void record_times(struct psi_group_cpu *groupc, int cpu)
> {
> u32 delta;
> u64 now;
> @@ -664,23 +663,6 @@ static void record_times(struct psi_group_cpu *groupc, int cpu,
> groupc->times[PSI_MEM_SOME] += delta;
> if (groupc->state_mask & (1 << PSI_MEM_FULL))
> groupc->times[PSI_MEM_FULL] += delta;
> - else if (memstall_tick) {
> - u32 sample;
> - /*
> - * Since we care about lost potential, a
> - * memstall is FULL when there are no other
> - * working tasks, but also when the CPU is
> - * actively reclaiming and nothing productive
> - * could run even if it were runnable.
> - *
> - * When the timer tick sees a reclaiming CPU,
> - * regardless of runnable tasks, sample a FULL
> - * tick (or less if it hasn't been a full tick
> - * since the last state change).
> - */
> - sample = min(delta, (u32)jiffies_to_nsecs(1));
> - groupc->times[PSI_MEM_FULL] += sample;
> - }
> }
>
> if (groupc->state_mask & (1 << PSI_CPU_SOME)) {
> @@ -714,7 +696,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
> */
> write_seqcount_begin(&groupc->seq);
>
> - record_times(groupc, cpu, false);
> + record_times(groupc, cpu);
>
> for (t = 0, m = clear; m; m &= ~(1 << t), t++) {
> if (!(m & (1 << t)))
> @@ -738,6 +720,18 @@ static void psi_group_change(struct psi_group *group, int cpu,
> if (test_state(groupc->tasks, s))
> state_mask |= (1 << s);
> }
> +
> + /*
> + * Since we care about lost potential, a memstall is FULL
> + * when there are no other working tasks, but also when
> + * the CPU is actively reclaiming and nothing productive
> + * could run even if it were runnable. So when the current
> + * task in a cgroup is in_memstall, the corresponding groupc
> + * on that cpu is in PSI_MEM_FULL state.
> + */
> + if (groupc->tasks[NR_ONCPU] && cpu_curr(cpu)->in_memstall)
> + state_mask |= (1 << PSI_MEM_FULL);

This doesn't really work with the psi_task_switch() optimization. If
we switch between two tasks inside a leaf group, where one is memstall
and the other is not, we don't update the parents properly. So you
need another branch in there as well for checking memstall. At which
point the timer tick implementation is likely cheaper and simpler...

> @@ -144,14 +144,6 @@ static inline void psi_sched_switch(struct task_struct *prev,
> psi_task_switch(prev, next, sleep);
> }
>
> -static inline void psi_task_tick(struct rq *rq)
> -{
> - if (static_branch_likely(&psi_disabled))
> - return;
> -
> - if (unlikely(rq->curr->in_memstall))
> - psi_memstall_tick(rq->curr, cpu_of(rq));
> -}
> #else /* CONFIG_PSI */
> static inline void psi_enqueue(struct task_struct *p, bool wakeup) {}
> static inline void psi_dequeue(struct task_struct *p, bool sleep) {}