Re: [PATCH 2/2] sched/debug: add sched_update_nr_running tracepoint

From: Qais Yousef
Date: Wed Sep 04 2019 - 10:37:18 EST


On 09/03/19 17:43, Radim Krčmář wrote:
> The paper "The Linux Scheduler: a Decade of Wasted Cores" used several
> custom data gathering points to better understand what was going on in
> the scheduler.
> Red Hat adapted one of them for the tracepoint framework and created a
> tool to plot a heatmap of nr_running, where the sched_update_nr_running
> tracepoint is being used for fine grained monitoring of scheduling
> imbalance.
> The tool is available from https://github.com/jirvoz/plot-nr-running.
>
> The best place for the tracepoints is inside the add/sub_nr_running,
> which requires some shenanigans to make it work as they are defined
> inside sched.h.

I managed to hook into sched_switch to get the nr_running of cfs tasks via
eBPF.

```
/*
 * bcc/eBPF probe attached to the sched:sched_switch tracepoint.
 * Emits the nr_running count of the outgoing task's cfs_rq, tagged with
 * the name of its cpuset cgroup ("/" when the task is in the root cgroup).
 */
int on_switch(struct sched_switch_args *args) {
/* NOTE(review): assumes the probe context still runs as the task being
 * switched out, so bpf_get_current_task() returns "prev" — confirm. */
struct task_struct *prev = (struct task_struct *)bpf_get_current_task();
/* Walk prev's css_set to the cpuset subsystem's cgroup, then its kernfs name. */
struct cgroup *prev_cgroup = prev->cgroups->subsys[cpuset_cgrp_id]->cgroup;
const char *prev_cgroup_name = prev_cgroup->kn->name;

/* A kernfs node with a parent is a non-root cgroup; print its name.
 * The root cgroup's kn->parent is NULL, so it is printed as "/" below. */
if (prev_cgroup->kn->parent) {
bpf_trace_printk("sched_switch_ext: nr_running=%d prev_cgroup=%s\\n",
prev->se.cfs_rq->nr_running,
prev_cgroup_name);
} else {
bpf_trace_printk("sched_switch_ext: nr_running=%d prev_cgroup=/\\n",
prev->se.cfs_rq->nr_running);
}
return 0;
};
```

You can do something similar by attaching to the sched_switch tracepoint from
a module and create a new event to get the nr_running.

Now this is not as accurate as your proposed new tracepoint in terms of where
you sample nr_running, but should be good enough?

Cheers

--
Qais Yousef

> The tracepoints have to be included from sched.h, which means that
> CREATE_TRACE_POINTS has to be defined for the whole header and this
> might cause problems if tree-wide headers expose tracepoints in sched.h
> dependencies, but I'd argue it's the other side's misuse of tracepoints.
>
> Moving the import sched.h line lower would require fixes in s390 and ppc
> headers, because they don't include dependencies properly and expect
> sched.h to do it, so it is simpler to keep sched.h there and
> preventively undefine CREATE_TRACE_POINTS right after.
>
> Exports of the pelt tracepoints remain because they don't need to be
> protected by CREATE_TRACE_POINTS and moving them closer would be
> unsightly.
>
> Tested-by: Jirka Hladká <jhladky@xxxxxxxxxx>
> Tested-by: Jiří Vozár <jvozar@xxxxxxxxxx>
> Signed-off-by: Radim Krčmář <rkrcmar@xxxxxxxxxx>
> ---
> include/trace/events/sched.h | 22 ++++++++++++++++++++++
> kernel/sched/core.c | 7 ++-----
> kernel/sched/fair.c | 2 --
> kernel/sched/sched.h | 7 +++++++
> 4 files changed, 31 insertions(+), 7 deletions(-)
>
> diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
> index 420e80e56e55..1527fc695609 100644
> --- a/include/trace/events/sched.h
> +++ b/include/trace/events/sched.h
> @@ -625,6 +625,28 @@ DECLARE_TRACE(sched_overutilized_tp,
> TP_PROTO(struct root_domain *rd, bool overutilized),
> TP_ARGS(rd, overutilized));
>
> +TRACE_EVENT(sched_update_nr_running,
> +
> + TP_PROTO(int cpu, int change, unsigned int nr_running),
> +
> + TP_ARGS(cpu, change, nr_running),
> +
> + TP_STRUCT__entry(
> + __field(int, cpu)
> + __field(int, change)
> + __field(unsigned int, nr_running)
> + ),
> +
> + TP_fast_assign(
> + __entry->cpu = cpu;
> + __entry->change = change;
> + __entry->nr_running = nr_running;
> + ),
> +
> + TP_printk("cpu=%u nr_running=%u (%d)",
> + __entry->cpu, __entry->nr_running, __entry->change)
> +);
> +
> #endif /* _TRACE_SCHED_H */
>
> /* This part must be outside protection */
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 71981ce84231..31ac37b9f6f7 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -6,7 +6,9 @@
> *
> * Copyright (C) 1991-2002 Linus Torvalds
> */
> +#define CREATE_TRACE_POINTS
> #include "sched.h"
> +#undef CREATE_TRACE_POINTS
>
> #include <linux/nospec.h>
>
> @@ -20,9 +22,6 @@
>
> #include "pelt.h"
>
> -#define CREATE_TRACE_POINTS
> -#include <trace/events/sched.h>
> -
> /*
> * Export tracepoints that act as a bare tracehook (ie: have no trace event
> * associated with them) to allow external modules to probe them.
> @@ -7618,5 +7617,3 @@ const u32 sched_prio_to_wmult[40] = {
> /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
> /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
> };
> -
> -#undef CREATE_TRACE_POINTS
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 84959d3285d1..421236d39902 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -22,8 +22,6 @@
> */
> #include "sched.h"
>
> -#include <trace/events/sched.h>
> -
> /*
> * Targeted preemption latency for CPU-bound tasks:
> *
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index c4915f46035a..b89d7786109a 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -75,6 +75,8 @@
> #include "cpupri.h"
> #include "cpudeadline.h"
>
> +#include <trace/events/sched.h>
> +
> #ifdef CONFIG_SCHED_DEBUG
> # define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
> #else
> @@ -1887,6 +1889,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
>
> rq->nr_running = prev_nr + count;
>
> + trace_sched_update_nr_running(cpu_of(rq), count, rq->nr_running);
> +
> #ifdef CONFIG_SMP
> if (prev_nr < 2 && rq->nr_running >= 2) {
> if (!READ_ONCE(rq->rd->overload))
> @@ -1900,6 +1904,9 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
> static inline void sub_nr_running(struct rq *rq, unsigned count)
> {
> rq->nr_running -= count;
> +
> + trace_sched_update_nr_running(cpu_of(rq), -count, rq->nr_running);
> +
> /* Check if we still need preemption */
> sched_update_tick_dependency(rq);
> }
> --
> 2.23.0
>