Re: [PATCH v2 09/10] sched/psi: per-cgroup PSI stats disable/re-enable interface
From: Johannes Weiner
Date: Mon Aug 15 2022 - 11:50:11 EST
On Mon, Aug 08, 2022 at 07:03:40PM +0800, Chengming Zhou wrote:
> +static ssize_t cgroup_psi_write(struct kernfs_open_file *of,
> + char *buf, size_t nbytes, loff_t off)
> +{
> + ssize_t ret;
> + int enable;
> + struct cgroup *cgrp;
> + struct psi_group *psi;
> +
> + ret = kstrtoint(strstrip(buf), 0, &enable);
> + if (ret)
> + return ret;
> +
> + if (enable < 0 || enable > 1)
> + return -ERANGE;
> +
> + cgrp = cgroup_kn_lock_live(of->kn, false);
> + if (!cgrp)
> + return -ENOENT;
> +
> + psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
> + psi_cgroup_enable(psi, enable);
I think it should also add/remove the pressure files when enabling and
disabling the aggregation, since their contents would be stale and
misleading.
Take a look at cgroup_add_dfl_cftypes() and cgroup_rm_cftypes()
> @@ -5115,6 +5152,12 @@ static struct cftype cgroup_base_files[] = {
> .release = cgroup_pressure_release,
> },
> #endif
> + {
> + .name = "cgroup.psi",
> + .flags = CFTYPE_PRESSURE,
> + .seq_show = cgroup_psi_show,
> + .write = cgroup_psi_write,
> + },
> #endif /* CONFIG_PSI */
> { } /* terminate */
> };
> diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> index 58f8092c938f..9df1686ee02d 100644
> --- a/kernel/sched/psi.c
> +++ b/kernel/sched/psi.c
> @@ -181,6 +181,7 @@ static void group_init(struct psi_group *group)
> {
> int cpu;
>
> + group->enabled = true;
> for_each_possible_cpu(cpu)
> seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
> group->avg_last_update = sched_clock();
> @@ -700,17 +701,16 @@ static void psi_group_change(struct psi_group *group, int cpu,
> groupc = per_cpu_ptr(group->pcpu, cpu);
>
> /*
> - * First we assess the aggregate resource states this CPU's
> - * tasks have been in since the last change, and account any
> - * SOME and FULL time these may have resulted in.
> - *
> - * Then we update the task counts according to the state
> + * First we update the task counts according to the state
> * change requested through the @clear and @set bits.
> + *
> + * Then if the cgroup PSI stats accounting enabled, we
> + * assess the aggregate resource states this CPU's tasks
> + * have been in since the last change, and account any
> + * SOME and FULL time these may have resulted in.
> */
> write_seqcount_begin(&groupc->seq);
>
> - record_times(groupc, now);
> -
> /*
> * Start with TSK_ONCPU, which doesn't have a corresponding
> * task count - it's just a boolean flag directly encoded in
> @@ -750,6 +750,14 @@ static void psi_group_change(struct psi_group *group, int cpu,
> if (set & (1 << t))
> groupc->tasks[t]++;
>
> + if (!group->enabled) {
> + if (groupc->state_mask & (1 << PSI_NONIDLE))
> + record_times(groupc, now);
Why record the nonidle time? It's only used for aggregation, which is
stopped as well.
> @@ -1088,6 +1097,23 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to)
>
> task_rq_unlock(rq, task, &rf);
> }
> +
> +void psi_cgroup_enable(struct psi_group *group, bool enable)
> +{
> + struct psi_group_cpu *groupc;
> + int cpu;
> + u64 now;
> +
> + if (group->enabled == enable)
> + return;
> + group->enabled = enable;
> +
> + for_each_possible_cpu(cpu) {
> + groupc = per_cpu_ptr(group->pcpu, cpu);
> + now = cpu_clock(cpu);
> + psi_group_change(group, cpu, 0, 0, now, true);
This loop deserves a comment, IMO.