Re: [PATCH v3 5/5] psi: introduce psi monitor

From: Johannes Weiner
Date: Tue Jan 29 2019 - 13:35:41 EST


Hi Minchan,

good to see your name on the lists again :)

On Tue, Jan 29, 2019 at 08:53:58AM +0900, Minchan Kim wrote:
> On Thu, Jan 24, 2019 at 01:15:18PM -0800, Suren Baghdasaryan wrote:
> > @@ -68,6 +69,50 @@ struct psi_group_cpu {
> > u32 times_prev[NR_PSI_STATES] ____cacheline_aligned_in_smp;
> > };
> >
> > +/* PSI growth tracking window */
> > +struct psi_window {
> > + /* Window size in ns */
> > + u64 size;
>
> As rest of field are time, how about "interval" instead of size?

If it were "size" on its own, I would agree, but "window size" is an
existing term that works pretty well here. "window interval" wouldn't.

> > + /* Start time of the current window in ns */
> > + u64 start_time;
> > +
> > + /* Value at the start of the window */
> > + u64 start_value;
>
> "value" is rather vague. starting_stall?

I'm not a fan of using stall here, because it reads like an event,
when it's really a time interval we're interested in.

For an abstraction that samples time intervals, value seems like a
pretty good, straight-forward name...

> > +
> > + /* Value growth per previous window(s) */
> > + u64 per_win_growth;
>
> Rather than per, prev would be more meaningful, I think.
> How about prev_win_stall?

Agreed on the "per", but still not loving the stall. How about
prev_delta? prev_growth?

> > +struct psi_trigger {
> > + /* PSI state being monitored by the trigger */
> > + enum psi_states state;
> > +
> > + /* User-specified threshold in ns */
> > + u64 threshold;
> > +
> > + /* List node inside triggers list */
> > + struct list_head node;
> > +
> > + /* Backpointer needed during trigger destruction */
> > + struct psi_group *group;
> > +
> > + /* Wait queue for polling */
> > + wait_queue_head_t event_wait;
> > +
> > + /* Pending event flag */
> > + int event;
> > +
> > + /* Tracking window */
> > + struct psi_window win;
> > +
> > + /*
> > + * Time last event was generated. Used for rate-limiting
> > + * events to one per window
> > + */
> > + u64 last_event_time;
> > +};
> > +
> > struct psi_group {
> > /* Protects data used by the aggregator */
> > struct mutex update_lock;
> > @@ -75,6 +120,8 @@ struct psi_group {
> > /* Per-cpu task state & time tracking */
> > struct psi_group_cpu __percpu *pcpu;
> >
> > + /* Periodic work control */
> > + atomic_t polling;
> > struct delayed_work clock_work;
> >
> > /* Total stall times observed */
> > @@ -85,6 +132,18 @@ struct psi_group {
> > u64 avg_last_update;
> > u64 avg_next_update;
> > unsigned long avg[NR_PSI_STATES - 1][3];
> > +
> > + /* Configured polling triggers */
> > + struct list_head triggers;
> > + u32 nr_triggers[NR_PSI_STATES - 1];
> > + u32 trigger_mask;
>
> This is a state we have an interest.
> How about trigger_states?

Sounds good to me, I'd also switch change_mask below to
changed_states:

if (changed_states & trigger_states)
/* engage! */

[ After reading the rest, I see Minchan proposed the same. ]

> > + u64 trigger_min_period;
> > +
> > + /* Polling state */
> > + /* Total stall times at the start of monitor activation */
> > + u64 polling_total[NR_PSI_STATES - 1];
> > + u64 polling_next_update;
> > + u64 polling_until;
> > };
> >
> > #else /* CONFIG_PSI */
> > diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> > index e8cd12c6a553..de3ac22a5e23 100644
> > --- a/kernel/cgroup/cgroup.c
> > +++ b/kernel/cgroup/cgroup.c
> > @@ -3464,7 +3464,101 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
> > {
> > return psi_show(seq, &seq_css(seq)->cgroup->psi, PSI_CPU);
> > }
> > -#endif
> > +
> > +static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
> > + size_t nbytes, enum psi_res res)
> > +{
> > + enum psi_states state;
> > + struct psi_trigger *old;
> > + struct psi_trigger *new;
> > + struct cgroup *cgrp;
> > + u32 threshold_us;
> > + u32 win_sz_us;
>
> window_us?

We don't really encode units in variables in the rest of the code,
maybe we can drop it here as well.

Btw, it looks like the original reason for splitting up trigger_parse
and trigger_create seems gone from the code. Can we merge them again
and keep all those details out of the filesystem ->write methods?

new = psi_trigger_create(group, buf, nbytes, res);

> > + ssize_t ret;
> > +
> > + cgrp = cgroup_kn_lock_live(of->kn, false);
> > + if (!cgrp)
> > + return -ENODEV;
> > +
> > + cgroup_get(cgrp);
> > + cgroup_kn_unlock(of->kn);
> > +
> > + ret = psi_trigger_parse(buf, nbytes, res,
> > + &state, &threshold_us, &win_sz_us);
> > + if (ret) {
> > + cgroup_put(cgrp);
> > + return ret;
> > + }
> > +
> > + new = psi_trigger_create(&cgrp->psi,
> > + state, threshold_us, win_sz_us);
> > + if (IS_ERR(new)) {
> > + cgroup_put(cgrp);
> > + return PTR_ERR(new);
> > + }
> > +
> > + old = of->priv;
> > + rcu_assign_pointer(of->priv, new);
> > + if (old) {
> > + synchronize_rcu();
> > + psi_trigger_destroy(old);
> > + }
> > +
> > + cgroup_put(cgrp);
> > +
> > + return nbytes;
> > +}
> > +
> > +static ssize_t cgroup_io_pressure_write(struct kernfs_open_file *of,
> > + char *buf, size_t nbytes,
> > + loff_t off)
> > +{
> > + return cgroup_pressure_write(of, buf, nbytes, PSI_IO);
> > +}
> > +
> > +static ssize_t cgroup_memory_pressure_write(struct kernfs_open_file *of,
> > + char *buf, size_t nbytes,
> > + loff_t off)
> > +{
> > + return cgroup_pressure_write(of, buf, nbytes, PSI_MEM);
> > +}
> > +
> > +static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
> > + char *buf, size_t nbytes,
> > + loff_t off)
> > +{
> > + return cgroup_pressure_write(of, buf, nbytes, PSI_CPU);
> > +}
> > +
> > +static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
> > + poll_table *pt)
> > +{
> > + struct psi_trigger *t;
> > + __poll_t ret;
> > +
> > + rcu_read_lock();
> > + t = rcu_dereference(of->priv);
> > + if (t)
> > + ret = psi_trigger_poll(t, of->file, pt);
> > + else
> > + ret = DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
> > + rcu_read_unlock();
> > +
> > + return ret;
> > +}
> > +
> > +static void cgroup_pressure_release(struct kernfs_open_file *of)
> > +{
> > + struct psi_trigger *t = of->priv;
> > +
> > + if (!t)
> > + return;
> > +
> > + rcu_assign_pointer(of->priv, NULL);
> > + synchronize_rcu();
> > + psi_trigger_destroy(t);
> > +}
> > +#endif /* CONFIG_PSI */
> >
> > static int cgroup_file_open(struct kernfs_open_file *of)
> > {
> > @@ -4619,18 +4713,27 @@ static struct cftype cgroup_base_files[] = {
> > .name = "io.pressure",
> > .flags = CFTYPE_NOT_ON_ROOT,
> > .seq_show = cgroup_io_pressure_show,
> > + .write = cgroup_io_pressure_write,
> > + .poll = cgroup_pressure_poll,
> > + .release = cgroup_pressure_release,
> > },
> > {
> > .name = "memory.pressure",
> > .flags = CFTYPE_NOT_ON_ROOT,
> > .seq_show = cgroup_memory_pressure_show,
> > + .write = cgroup_memory_pressure_write,
> > + .poll = cgroup_pressure_poll,
> > + .release = cgroup_pressure_release,
> > },
> > {
> > .name = "cpu.pressure",
> > .flags = CFTYPE_NOT_ON_ROOT,
> > .seq_show = cgroup_cpu_pressure_show,
> > + .write = cgroup_cpu_pressure_write,
> > + .poll = cgroup_pressure_poll,
> > + .release = cgroup_pressure_release,
> > },
> > -#endif
> > +#endif /* CONFIG_PSI */
> > { } /* terminate */
> > };
> >
> > diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
> > index c366503ba135..fefb98f19a80 100644
> > --- a/kernel/sched/psi.c
> > +++ b/kernel/sched/psi.c
> > @@ -4,6 +4,9 @@
> > * Copyright (c) 2018 Facebook, Inc.
> > * Author: Johannes Weiner <hannes@xxxxxxxxxxx>
> > *
> > + * Polling support by Suren Baghdasaryan <surenb@xxxxxxxxxx>
> > + * Copyright (c) 2018 Google, Inc.
> > + *
> > * When CPU, memory and IO are contended, tasks experience delays that
> > * reduce throughput and introduce latencies into the workload. Memory
> > * and IO contention, in addition, can cause a full loss of forward
> > @@ -126,11 +129,16 @@
> >
> > #include <linux/sched/loadavg.h>
> > #include <linux/seq_file.h>
> > +#include <linux/eventfd.h>
> > #include <linux/proc_fs.h>
> > #include <linux/seqlock.h>
> > +#include <linux/uaccess.h>
> > #include <linux/cgroup.h>
> > #include <linux/module.h>
> > #include <linux/sched.h>
> > +#include <linux/ctype.h>
> > +#include <linux/file.h>
> > +#include <linux/poll.h>
> > #include <linux/psi.h>
> > #include "sched.h"
> >
> > @@ -150,11 +158,16 @@ static int __init setup_psi(char *str)
> > __setup("psi=", setup_psi);
> >
> > /* Running averages - we need to be higher-res than loadavg */
> > -#define PSI_FREQ (2*HZ+1) /* 2 sec intervals */
> > +#define PSI_FREQ (2*HZ+1UL) /* 2 sec intervals */
> > #define EXP_10s 1677 /* 1/exp(2s/10s) as fixed-point */
> > #define EXP_60s 1981 /* 1/exp(2s/60s) */
> > #define EXP_300s 2034 /* 1/exp(2s/300s) */
> >
> > +/* PSI trigger definitions */
> > +#define PSI_TRIG_MIN_WIN_US 500000 /* Min window size is 500ms */
> > +#define PSI_TRIG_MAX_WIN_US 10000000 /* Max window size is 10s */
> > +#define PSI_TRIG_UPDATES_PER_WIN 10 /* 10 updates per window */
>
> To me, it's rather long.
> How about WINDOW_MIN_US, WINDOW_MAX_US, UPDATES_PER_WINDOW?

Sounds good to me too. I'm just ambivalent on the _US suffix. Dealer's
choice, though.

> > +
> > /* Sampling frequency in nanoseconds */
> > static u64 psi_period __read_mostly;
> >
> > @@ -173,8 +186,17 @@ static void group_init(struct psi_group *group)
> > for_each_possible_cpu(cpu)
> > seqcount_init(&per_cpu_ptr(group->pcpu, cpu)->seq);
> > group->avg_next_update = sched_clock() + psi_period;
> > + atomic_set(&group->polling, 0);
> > INIT_DELAYED_WORK(&group->clock_work, psi_update_work);
> > mutex_init(&group->update_lock);
> > + /* Init trigger-related members */
> > + INIT_LIST_HEAD(&group->triggers);
> > + memset(group->nr_triggers, 0, sizeof(group->nr_triggers));
> > + group->trigger_mask = 0;
> > + group->trigger_min_period = U32_MAX;
> > + memset(group->polling_total, 0, sizeof(group->polling_total));
> > + group->polling_next_update = ULLONG_MAX;
> > + group->polling_until = 0;
> > }
> >
> > void __init psi_init(void)
> > @@ -209,10 +231,11 @@ static bool test_state(unsigned int *tasks, enum psi_states state)
> > }
> > }
> >
> > -static void get_recent_times(struct psi_group *group, int cpu, u32 *times)
> > +static u32 get_recent_times(struct psi_group *group, int cpu, u32 *times)
>
> Rather awkward to return change_mask when we consider function name as
> get_recent_times. It would be better to add additional parameter
> instead of return value.

Good suggestion, I have to agree this would be nicer.

> > {
> > struct psi_group_cpu *groupc = per_cpu_ptr(group->pcpu, cpu);
> > u64 now, state_start;
> > + u32 change_mask = 0;
> > enum psi_states s;
> > unsigned int seq;
> > u32 state_mask;
> > @@ -245,7 +268,11 @@ static void get_recent_times(struct psi_group *group, int cpu, u32 *times)
> > groupc->times_prev[s] = times[s];
> >
> > times[s] = delta;
> > + if (delta)
> > + change_mask |= (1 << s);
> > }
> > +
> > + return change_mask;
> > }
> >
> > static void calc_avgs(unsigned long avg[3], int missed_periods,
> > @@ -268,17 +295,14 @@ static void calc_avgs(unsigned long avg[3], int missed_periods,
> > avg[2] = calc_load(avg[2], EXP_300s, pct);
> > }
> >
> > -static bool update_stats(struct psi_group *group)
> > +static u32 collect_percpu_times(struct psi_group *group)
>
> Not sure it's a good idea to add "implementation facility" in here.
> How about update_stall_time with additional parameter of
> "[changed|updated]_states?
>
> IOW,
> static void update_stall_times(struct psi_group *group, u32 *changed_states)

I disagree on this one. collect_percpu_times() isn't too detailed of a
name, but it does reflect the complexity/cost of the function and the
structure that is being aggregated, which is a good thing.

But the return-by-parameter is a good idea.

> > u64 deltas[NR_PSI_STATES - 1] = { 0, };
> > - unsigned long missed_periods = 0;
> > unsigned long nonidle_total = 0;
> > - u64 now, expires, period;
> > + u32 change_mask = 0;
> > int cpu;
> > int s;
> >
> > - mutex_lock(&group->update_lock);
> > -
> > /*
> > * Collect the per-cpu time buckets and average them into a
> > * single time sample that is normalized to wallclock time.
> > @@ -291,7 +315,7 @@ static bool update_stats(struct psi_group *group)
> > u32 times[NR_PSI_STATES];
> > u32 nonidle;
> >
> > - get_recent_times(group, cpu, times);
> > + change_mask |= get_recent_times(group, cpu, times);
> >
> > nonidle = nsecs_to_jiffies(times[PSI_NONIDLE]);
> > nonidle_total += nonidle;
> > @@ -316,11 +340,18 @@ static bool update_stats(struct psi_group *group)
> > for (s = 0; s < NR_PSI_STATES - 1; s++)
> > group->total[s] += div_u64(deltas[s], max(nonidle_total, 1UL));
> >
> > + return change_mask;
> > +}
> > +
> > +static u64 calculate_averages(struct psi_group *group, u64 now)
>
> time?
>
> The function name is still awkward to me.
> If someone see this function, he will expect return value as "average", not next_update.
> If we want to have next_update as return value, it would better to rename *update_avgs*.

update_averages() would be nice, agreed.

But I disagree on the now -> time. time is really vague - could be a
random timestamp or a period. We use "now" everywhere in this code to
mean the current time (cpu clock in cpu-local paths, sched clock for
global stuff), so let's keep it here as well.

> > +{
> > + unsigned long missed_periods = 0;
> > + u64 expires, period;
> > + u64 avg_next_update;
> > + int s;
> > +
> > /* avgX= */
> > - now = sched_clock();
> > expires = group->avg_next_update;
> > - if (now < expires)
> > - goto out;
> > if (now - expires > psi_period)
> > missed_periods = div_u64(now - expires, psi_period);
> >
> > @@ -331,7 +362,7 @@ static bool update_stats(struct psi_group *group)
> > * But the deltas we sample out of the per-cpu buckets above
> > * are based on the actual time elapsing between clock ticks.
> > */
> > - group->avg_next_update = expires + ((1 + missed_periods) * psi_period);
> > + avg_next_update = expires + ((1 + missed_periods) * psi_period);
> > period = now - (group->avg_last_update + (missed_periods * psi_period));
> > group->avg_last_update = now;
> >
> > @@ -361,20 +392,237 @@ static bool update_stats(struct psi_group *group)
> > group->avg_total[s] += sample;
> > calc_avgs(group->avg[s], missed_periods, sample, period);
> > }
> > -out:
> > - mutex_unlock(&group->update_lock);
> > - return nonidle_total;
> > +
> > + return avg_next_update;
> > +}
> > +
> > +/* Trigger tracking window manipulations */
> > +static void window_init(struct psi_window *win, u64 now, u64 value)
> > +{
> > + win->start_value = value;
> > + win->start_time = now;
> > + win->per_win_growth = 0;
> > +}
> > +
> > +/*
> > + * PSI growth tracking window update and growth calculation routine.
>
> Let's add empty line here.

Agreed.

> > + * This approximates a sliding tracking window by interpolating
> > + * partially elapsed windows using historical growth data from the
> > + * previous intervals. This minimizes memory requirements (by not storing
> > + * all the intermediate values in the previous window) and simplifies
> > + * the calculations. It works well because PSI signal changes only in
> > + * positive direction and over relatively small window sizes the growth
> > + * is close to linear.
> > + */
> > +static u64 window_update(struct psi_window *win, u64 now, u64 value)
>
> Hope to change now as just time for function.
>
> Instead of value, couldn't we use more concrete naming?
> Maybe stall_time or just stall?

Disagreed on both :)

> > +{
> > + u64 interval;
>
> elapsed?

Hm, elapsed is a bit better, but how about period? We use that in the
averages code for the same functionality.

> > + u64 growth;
> > +
> > + interval = now - win->start_time;
> > + growth = value - win->start_value;
> > + /*
> > + * After each tracking window passes win->start_value and
> > + * win->start_time get reset and win->per_win_growth stores
> > + * the average per-window growth of the previous window.
> > + * win->per_win_growth is then used to interpolate additional
> > + * growth from the previous window assuming it was linear.
> > + */
> > + if (interval > win->size) {
> > + win->per_win_growth = growth;
> > + win->start_value = value;
> > + win->start_time = now;
>
> We can use window_init via adding per_win_growth in the function
> parameter. Maybe, window_reset would be better function name.
>
> > + } else {
> > + u32 unelapsed;
>
> remaining? remained?

Yup, much better.

> > +
> > + unelapsed = win->size - interval;
> > + growth += div_u64(win->per_win_growth * unelapsed, win->size);
> > + }
> > +
> > + return growth;
> > +}
> > +
> > +static void init_triggers(struct psi_group *group, u64 now)
> > +{
> > + struct psi_trigger *t;
> > +
> > + list_for_each_entry(t, &group->triggers, node)
> > + window_init(&t->win, now, group->total[t->state]);
> > + memcpy(group->polling_total, group->total,
> > + sizeof(group->polling_total));
> > + group->polling_next_update = now + group->trigger_min_period;
> > +}
> > +
> > +static u64 poll_triggers(struct psi_group *group, u64 now)
>
> How about update_[poll|trigger]_stat?

update_triggers()? The signature already matches the update_averages()
one, so we might as well do the same thing there I guess.

> > +{
> > + struct psi_trigger *t;
> > + bool new_stall = false;
> > +
> > + /*
> > + * On subsequent updates, calculate growth deltas and let
> > + * watchers know when their specified thresholds are exceeded.
> > + */
> > + list_for_each_entry(t, &group->triggers, node) {
> > + u64 growth;
> > +
> > + /* Check for stall activity */
> > + if (group->polling_total[t->state] == group->total[t->state])
> > + continue;
> > +
> > + /*
> > + * Multiple triggers might be looking at the same state,
> > + * remember to update group->polling_total[] once we've
> > + * been through all of them. Also remember to extend the
> > + * polling time if we see new stall activity.
> > + */
> > + new_stall = true;
> > +
> > + /* Calculate growth since last update */
> > + growth = window_update(&t->win, now, group->total[t->state]);
> > + if (growth < t->threshold)
> > + continue;
> > +
> > + /* Limit event signaling to once per window */
> > + if (now < t->last_event_time + t->win.size)
> > + continue;
> > +
> > + /* Generate an event */
> > + if (cmpxchg(&t->event, 0, 1) == 0)
> > + wake_up_interruptible(&t->event_wait);
> > + t->last_event_time = now;
> > + }
> > +
> > + if (new_stall) {
> > + memcpy(group->polling_total, group->total,
> > + sizeof(group->polling_total));
> > + }
> > +
> > + return now + group->trigger_min_period;
> > }
> >
> > +/*
> > + * psi_update_work represents slowpath accounting part while psi_group_change
> > + * represents hotpath part. There are two potential races between them:
> > + * 1. Changes to group->polling when slowpath checks for new stall, then hotpath
> > + * records new stall and then slowpath resets group->polling flag. This leads
> > + * to the exit from the polling mode while monitored state is still changing.
> > + * 2. Slowpath overwriting an immediate update scheduled from the hotpath with
> > + * a regular update further in the future and missing the immediate update.
> > + * Both races are handled with a retry cycle in the slowpath:
> > + *
> > + * HOTPATH: | SLOWPATH:
> > + * |
> > + * A) times[cpu] += delta | E) delta = times[*]
> > + * B) start_poll = (delta[poll_mask] &&| if delta[poll_mask]:
> > + * cmpxchg(g->polling, 0, 1) == 0)| F) polling_until = now + grace_period
> > + * if start_poll: | if now > polling_until:
> > + * C) mod_delayed_work(1) | if g->polling:
> > + * else if !delayed_work_pending():| G) g->polling = polling = 0
> > + * D) schedule_delayed_work(PSI_FREQ)| smp_mb
> > + * | H) goto SLOWPATH
> > + * | else:
> > + * | if !g->polling:
> > + * | I) g->polling = polling = 1
> > + * | J) if delta && first_pass:
> > + * | next_avg = calculate_averages()
> > + * | if polling:
> > + * | next_poll = poll_triggers()
> > + * | if (delta && first_pass) || polling:
> > + * | K) mod_delayed_work(
> > + * | min(next_avg, next_poll))
> > + * | if !polling:
> > + * | first_pass = false
> > + * | L) goto SLOWPATH
> > + *
> > + * Race #1 is represented by (EABGD) sequence in which case slowpath deactivates
> > + * polling mode because it misses new monitored stall and hotpath doesn't
> > + * activate it because at (B) g->polling is not yet reset by slowpath in (G).
> > + * This race is handled by the (H) retry, which in the race described above
> > + * results in the new sequence of (EABGDHEIK) that reactivates polling mode.
> > + *
> > + * Race #2 is represented by polling==false && (JABCK) sequence which overwrites
> > + * immediate update scheduled at (C) with a later (next_avg) update scheduled at
> > + * (K). This race is handled by the (L) retry which results in the new sequence
> > + * of polling==false && (JABCKLEIK) that reactivates polling mode and
> > + * reschedules the next polling update (next_poll).
> > + *
> > + * Note that retries can't result in an infinite loop because retry #1 happens
> > + * only during polling reactivation and retry #2 happens only on the first pass.
> > + * Constant reactivations are impossible because polling will stay active for at
> > + * least grace_period. Worst case scenario involves two retries (HEJKLE)
> > + */
> > static void psi_update_work(struct work_struct *work)
> > {
> > struct delayed_work *dwork;
> > struct psi_group *group;
> > + bool first_pass = true;
> > + u64 next_update;
> > + u32 change_mask;
>
> How about [changed|updated]_states?

changed_states sounds good to me.