Re: [RFC PATCH] psi : calc psi memstall time more precisely

From: Vlastimil Babka
Date: Thu Sep 09 2021 - 09:32:07 EST


On 9/9/21 14:00, Huangzhaoyang wrote:
> From: Zhaoyang Huang <zhaoyang.huang@xxxxxxxxxx>
>
> psi's memstall time is counted as simple as exit - entry so far, which ignore
> the task's off cpu time. Fix it by calc the percentage of off time via task and
> rq's util and runq load.

Wouldn't this also filter out IO wait time as that means sleeping, thus
again defeat the purpose of observing real stalls due to memory shortage?
If cpu starvation (due to overcommited system?) affects pci memstall
accuracy then that's suboptimal, but IMHO fixing it this way would do more
harm than good?

> Signed-off-by: Zhaoyang Huang <zhaoyang.huang@xxxxxxxxxx>
> ---
> kernel/sched/psi.c | 22 +++++++++++++++++++++-
> 1 file changed, 21 insertions(+), 1 deletion(-)
>
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index cc25a3c..6812c7e 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -182,6 +182,8 @@ struct psi_group psi_system = {
>
> static void psi_avgs_work(struct work_struct *work);
>
> +static unsigned long psi_memtime_fixup(u32 growth);
> +
> static void group_init(struct psi_group *group)
> {
> int cpu;
> @@ -492,6 +494,23 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value)
> return growth;
> }
>
> +static unsigned long psi_memtime_fixup(u32 growth)
> +{
> + struct rq *rq = task_rq(current);
> + unsigned long growth_fixed = (unsigned long)growth;
> +
> + if !(current->policy == SCHED_NORMAL || current->policy == SCHED_BATCH)
> + return growth_fixed;
> +
> + if ((current->in_memstall)
> + && (current->se.avg.util_avg <= rq->cfs.avg.util_avg)
> + && current->se.avg.util_avg != 0)
> + growth_fixed = div64_ul((current->se.avg.util_avg + 1) * growth,
> + rq->cfs.avg.util_avg + rq->avg_rt.util_avg + 1);
> +
> + return growth_fixed;
> +}
> +
> static void init_triggers(struct psi_group *group, u64 now)
> {
> struct psi_trigger *t;
> @@ -658,6 +677,7 @@ static void record_times(struct psi_group_cpu *groupc, u64 now)
> }
>
> if (groupc->state_mask & (1 << PSI_MEM_SOME)) {
> + delta = psi_memtime_fixup(delta);
> groupc->times[PSI_MEM_SOME] += delta;
> if (groupc->state_mask & (1 << PSI_MEM_FULL))
> groupc->times[PSI_MEM_FULL] += delta;
> @@ -928,8 +948,8 @@ void psi_memstall_leave(unsigned long *flags)
> */
> rq = this_rq_lock_irq(&rf);
>
> - current->in_memstall = 0;
> psi_task_change(current, TSK_MEMSTALL, 0);
> + current->in_memstall = 0;
>
> rq_unlock_irq(rq, &rf);
> }
>