Re: [RFC PATCH v4 5/7] sched, tracing: add to report task state in symbolic chars

From: Steven Rostedt
Date: Wed Aug 02 2023 - 11:08:13 EST


On Wed, 2 Aug 2023 08:10:00 -0400
Ze Gao <zegao2021@xxxxxxxxx> wrote:

> From: Ze Gao <zegao2021@xxxxxxxxx>
>
> Internal representations of task state are likely to be changed
> or reordered, and reporting them to userspace without exporting
> them as part of the API is basically wrong, which can easily break
> a userspace observability tool as the kernel evolves. For example,
> perf suffers from this and still reports wrong states as of this
> writing.
>
> OTOH, some masqueraded states like TASK_REPORT_IDLE and
> TASK_REPORT_MAX are also reported inadvertently, which confuses
> things even more and most userspace tools do not even take them
> into consideration.
>
> So add a new variable alongside the old raw value to
> report task state in symbolic chars, which are self-explanatory
> and need no further translation. Of course this does not
> break any userspace tool.
>
> Note for PREEMPT_ACTIVE, we introduce 'p' to report it and use
> the old conventions for the rest.
>
> Signed-off-by: Ze Gao <zegao@xxxxxxxxxxx>
> Reviewed-by: Masami Hiramatsu (Google) <mhiramat@xxxxxxxxxx>
> Acked-by: Ian Rogers <irogers@xxxxxxxxxx>
> ---
> include/trace/events/sched.h | 44 ++++++++++++++++++++++--------------
> 1 file changed, 27 insertions(+), 17 deletions(-)
>
> diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
> index 7d34db20b2c6..1c7b94793495 100644
> --- a/include/trace/events/sched.h
> +++ b/include/trace/events/sched.h
> @@ -6,6 +6,7 @@
> #define _TRACE_SCHED_H
>
> #include <linux/kthread.h>
> +#include <linux/sched.h>
> #include <linux/sched/numa_balancing.h>
> #include <linux/tracepoint.h>
> #include <linux/binfmts.h>
> @@ -214,6 +215,27 @@ static inline int __trace_sched_switch_state(bool preempt,
>
> return state ? (1 << (state - 1)) : state;
> }
> +
> +static inline char __trace_sched_switch_state_char(bool preempt,
> + unsigned int prev_state,
> + struct task_struct *p)
> +{
> + long state;
> +
> +#ifdef CONFIG_SCHED_DEBUG
> + BUG_ON(p != current);

BUG_ON()? Why not WARN_ON()?

-- Steve

> +#endif /* CONFIG_SCHED_DEBUG */
> +
> + /*
> + * For PREEMPT_ACTIVE, we introduce 'p' to report it and use the old
> + * conventions for the rest.
> + */
> + if (preempt)
> + return 'p';
> +
> + state = __task_state_index(prev_state, p->exit_state);
> + return task_index_to_char(state);
> +}
> #endif /* CREATE_TRACE_POINTS */
>
> /*
> @@ -236,6 +258,7 @@ TRACE_EVENT(sched_switch,
> __field( int, prev_state )
> __array( char, prev_comm, TASK_COMM_LEN )
> __array( char, next_comm, TASK_COMM_LEN )
> + __field( char, prev_state_char )
> ),
>
> TP_fast_assign(
> @@ -246,26 +269,13 @@ TRACE_EVENT(sched_switch,
> __entry->prev_state = __trace_sched_switch_state(preempt, prev_state, prev);
> memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
> memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
> + __entry->prev_state_char = __trace_sched_switch_state_char(preempt, prev_state, prev);
> /* XXX SCHED_DEADLINE */
> ),
>
> - TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d",
> - __entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
> -
> - (__entry->prev_state & (TASK_REPORT_MAX - 1)) ?
> - __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|",
> - { TASK_INTERRUPTIBLE, "S" },
> - { TASK_UNINTERRUPTIBLE, "D" },
> - { __TASK_STOPPED, "T" },
> - { __TASK_TRACED, "t" },
> - { EXIT_DEAD, "X" },
> - { EXIT_ZOMBIE, "Z" },
> - { TASK_PARKED, "P" },
> - { TASK_DEAD, "I" }) :
> - "R",
> -
> - __entry->prev_state & TASK_REPORT_MAX ? "+" : "",
> - __entry->next_comm, __entry->next_pid, __entry->next_prio)
> + TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%c ==> next_comm=%s next_pid=%d next_prio=%d",
> + __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, __entry->prev_state_char, __entry->next_comm,
> + __entry->next_pid, __entry->next_prio)
> );
>
> /*