[RFC][PATCH] sched: Cleanup task->state printing

From: Peter Zijlstra
Date: Fri Sep 22 2017 - 07:54:47 EST


On Fri, Sep 22, 2017 at 11:35:33AM +0200, Markus Trippelsdorf wrote:
> > It seems to work. Simply returning "I (idle)" from get_task_state() in
> > fs/proc/array.c when the state is TASK_IDLE does the trick.
> > I've tested top, htop and ps.

I ended up with the below; there was quite a lot of inconsistent state
printing around it seems.

I should probably split this thing into a bunch of patches :/

Alongside an explicit idle state, this also exposes TASK_PARKED,
although arguably we could map that to idle too. Opinions?

---
fs/proc/array.c | 35 ++++++++++++-----------
include/linux/sched.h | 58 +++++++++++++++++++++++----------------
include/trace/events/sched.h | 24 +++++++++++-----
kernel/sched/core.c | 22 ++++++++++++++-
kernel/sched/debug.c | 2 --
kernel/trace/trace_output.c | 21 ++++----------
kernel/trace/trace_sched_wakeup.c | 12 ++++----
7 files changed, 103 insertions(+), 71 deletions(-)

diff --git a/fs/proc/array.c b/fs/proc/array.c
index 88c355574aa0..5a076854857f 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -118,28 +118,31 @@ static inline void task_name(struct seq_file *m, struct task_struct *p)
* simple bit tests.
*/
static const char * const task_state_array[] = {
- "R (running)", /* 0 */
- "S (sleeping)", /* 1 */
- "D (disk sleep)", /* 2 */
- "T (stopped)", /* 4 */
- "t (tracing stop)", /* 8 */
- "X (dead)", /* 16 */
- "Z (zombie)", /* 32 */
+ /* states inside TASK_REPORT */
+
+ "R (running)", /* 0x00 */
+ "S (sleeping)", /* 0x01 */
+ "D (disk sleep)", /* 0x02 */
+ "T (stopped)", /* 0x04 */
+ "t (tracing stop)", /* 0x08 */
+ "X (dead)", /* 0x10 */
+ "Z (zombie)", /* 0x20 */
+ "P (parked)", /* 0x40 */
+
+ /* extra states, beyond TASK_REPORT */
+
+ "I (idle)", /* 0x80 */
};

static inline const char *get_task_state(struct task_struct *tsk)
{
- unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT;
+ unsigned int tsk_state = READ_ONCE(tsk->state);
+ unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT;

- /*
- * Parked tasks do not run; they sit in __kthread_parkme().
- * Without this check, we would report them as running, which is
- * clearly wrong, so we report them as sleeping instead.
- */
- if (tsk->state == TASK_PARKED)
- state = TASK_INTERRUPTIBLE;
+ if (tsk_state == TASK_IDLE)
+ state = TASK_REPORT_MAX;

- BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1);
+ BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-2);

return task_state_array[fls(state)];
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 68b38335d33c..7ae81efb17bd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -65,25 +65,27 @@ struct task_group;
*/

/* Used in tsk->state: */
-#define TASK_RUNNING 0
-#define TASK_INTERRUPTIBLE 1
-#define TASK_UNINTERRUPTIBLE 2
-#define __TASK_STOPPED 4
-#define __TASK_TRACED 8
+#define TASK_RUNNING 0x0000
+#define TASK_INTERRUPTIBLE 0x0001
+#define TASK_UNINTERRUPTIBLE 0x0002
+#define __TASK_STOPPED 0x0004
+#define __TASK_TRACED 0x0008
/* Used in tsk->exit_state: */
-#define EXIT_DEAD 16
-#define EXIT_ZOMBIE 32
+#define EXIT_DEAD 0x0010
+#define EXIT_ZOMBIE 0x0020
#define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD)
/* Used in tsk->state again: */
-#define TASK_DEAD 64
-#define TASK_WAKEKILL 128
-#define TASK_WAKING 256
-#define TASK_PARKED 512
-#define TASK_NOLOAD 1024
-#define TASK_NEW 2048
-#define TASK_STATE_MAX 4096
+#define TASK_PARKED 0x0040
+#define TASK_REPORT_MAX 0x0080

-#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPNn"
+/* Not in TASK_REPORT: */
+#define TASK_DEAD 0x0080
+#define TASK_WAKEKILL 0x0100
+#define TASK_WAKING 0x0200
+#define TASK_NOLOAD 0x0400
+#define TASK_NEW 0x0800
+
+#define TASK_STATE_MAX 0x1000

/* Convenience macros for the sake of set_current_state: */
#define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
@@ -96,10 +98,11 @@ struct task_group;
#define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
#define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)

-/* get_task_state(): */
+/* task_state_to_char(), get_task_state(), trace_sched_switch() */
#define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \
TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
- __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD)
+ __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
+ TASK_PARKED)

#define task_is_traced(task) ((task->state & __TASK_TRACED) != 0)

@@ -1244,17 +1247,24 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk)
return task_pgrp_nr_ns(tsk, &init_pid_ns);
}

-static inline char task_state_to_char(struct task_struct *task)
+static inline char __task_state_to_char(unsigned int state)
{
- const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
- unsigned long state = task->state;
+ static const char state_char[] = "RSDTtXZPI";

- state = state ? __ffs(state) + 1 : 0;
+ BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != sizeof(state_char) - 3);
+
+ return state_char[fls(state)];
+}
+
+static inline char task_state_to_char(struct task_struct *task)
+{
+ unsigned int tsk_state = READ_ONCE(task->state);
+ unsigned int state = (tsk_state | task->exit_state) & TASK_REPORT;

- /* Make sure the string lines up properly with the number of task states: */
- BUILD_BUG_ON(sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1);
+ if (tsk_state == TASK_IDLE)
+ state = TASK_REPORT_MAX;

- return state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?';
+ return __task_state_to_char(state);
}

/**
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index ae1409ffe99a..af1858a01335 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -106,6 +106,8 @@ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new,
#ifdef CREATE_TRACE_POINTS
static inline long __trace_sched_switch_state(bool preempt, struct task_struct *p)
{
+ unsigned int state = READ_ONCE(p->state);
+
#ifdef CONFIG_SCHED_DEBUG
BUG_ON(p != current);
#endif /* CONFIG_SCHED_DEBUG */
@@ -114,10 +116,18 @@ static inline long __trace_sched_switch_state(bool preempt, struct task_struct *
* Preemption ignores task state, therefore preempted tasks are always
* RUNNING (we will not have dequeued if state != RUNNING).
*/
- return preempt ? TASK_RUNNING | TASK_STATE_MAX : p->state;
+ if (preempt)
+ return TASK_REPORT_MAX << 1;
+
+ if (state == TASK_IDLE)
+ return TASK_REPORT_MAX;
+
+ return (state | p->exit_state) & TASK_REPORT;
}
#endif /* CREATE_TRACE_POINTS */

+#define TRACE_REPORT_MASK ((TASK_REPORT_MAX << 1) - 1)
+
/*
* Tracepoint for task switches, performed by the scheduler:
*/
@@ -152,13 +162,13 @@ TRACE_EVENT(sched_switch,

TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d",
__entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
- __entry->prev_state & (TASK_STATE_MAX-1) ?
- __print_flags(__entry->prev_state & (TASK_STATE_MAX-1), "|",
+
+ __entry->prev_state & TRACE_REPORT_MASK ?
+ __print_flags(__entry->prev_state & TRACE_REPORT_MASK, "|",
{ 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" },
- { 16, "Z" }, { 32, "X" }, { 64, "x" },
- { 128, "K" }, { 256, "W" }, { 512, "P" },
- { 1024, "N" }) : "R",
- __entry->prev_state & TASK_STATE_MAX ? "+" : "",
+ { 16, "X" }, { 32, "Z" }, { 64, "P" },
+ { 128, "I" }) : "R",
+ __entry->prev_state & (TASK_REPORT_MAX << 1) ? "+" : "",
__entry->next_comm, __entry->next_pid, __entry->next_prio)
);

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 703f5831738e..431e2d6c709e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5164,6 +5164,26 @@ void sched_show_task(struct task_struct *p)
put_task_stack(p);
}

+static inline bool state_filter_match(unsigned long state_filter, struct task_struct *p)
+{
+ /* no filter, everything matches */
+ if (!state_filter)
+ return true;
+
+ /* filter, but doesn't match */
+ if (!(p->state & state_filter))
+ return false;
+
+ /*
+ * When looking for TASK_UNINTERRUPTIBLE, skip TASK_IDLE, but allow
+ * TASK_KILLABLE.
+ */
+ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE)
+ return false;
+
+ return true;
+}
+
void show_state_filter(unsigned long state_filter)
{
struct task_struct *g, *p;
@@ -5186,7 +5206,7 @@ void show_state_filter(unsigned long state_filter)
*/
touch_nmi_watchdog();
touch_all_softlockup_watchdogs();
- if (!state_filter || (p->state & state_filter))
+ if (state_filter_match(state_filter, p))
sched_show_task(p);
}

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4a23bbc3111b..244619e402cc 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -461,8 +461,6 @@ static char *task_group_path(struct task_group *tg)
}
#endif

-static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
-
static void
print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
{
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index bac629af2285..c738e764e2a5 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -656,15 +656,6 @@ int trace_print_lat_context(struct trace_iterator *iter)
return !trace_seq_has_overflowed(s);
}

-static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
-
-static int task_state_char(unsigned long state)
-{
- int bit = state ? __ffs(state) + 1 : 0;
-
- return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
-}
-
/**
* ftrace_find_event - find a registered event
* @type: the type of event to look for
@@ -930,8 +921,8 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,

trace_assign_type(field, iter->ent);

- T = task_state_char(field->next_state);
- S = task_state_char(field->prev_state);
+ T = __task_state_to_char(field->next_state);
+ S = __task_state_to_char(field->prev_state);
trace_find_cmdline(field->next_pid, comm);
trace_seq_printf(&iter->seq,
" %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
@@ -966,8 +957,8 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
trace_assign_type(field, iter->ent);

if (!S)
- S = task_state_char(field->prev_state);
- T = task_state_char(field->next_state);
+ S = __task_state_to_char(field->prev_state);
+ T = __task_state_to_char(field->next_state);
trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
field->prev_pid,
field->prev_prio,
@@ -1002,8 +993,8 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
trace_assign_type(field, iter->ent);

if (!S)
- S = task_state_char(field->prev_state);
- T = task_state_char(field->next_state);
+ S = __task_state_to_char(field->prev_state);
+ T = __task_state_to_char(field->next_state);

SEQ_PUT_HEX_FIELD(s, field->prev_pid);
SEQ_PUT_HEX_FIELD(s, field->prev_prio);
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index ddec53b67646..b14caa0afd35 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -380,7 +380,7 @@ probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu)
}

static void
-tracing_sched_switch_trace(struct trace_array *tr,
+tracing_sched_switch_trace(bool preempt, struct trace_array *tr,
struct task_struct *prev,
struct task_struct *next,
unsigned long flags, int pc)
@@ -397,10 +397,10 @@ tracing_sched_switch_trace(struct trace_array *tr,
entry = ring_buffer_event_data(event);
entry->prev_pid = prev->pid;
entry->prev_prio = prev->prio;
- entry->prev_state = prev->state;
+ entry->prev_state = __trace_sched_switch_state(preempt, prev);
entry->next_pid = next->pid;
entry->next_prio = next->prio;
- entry->next_state = next->state;
+ entry->next_state = __trace_sched_switch_state(false, next);
entry->next_cpu = task_cpu(next);

if (!call_filter_check_discard(call, entry, buffer, event))
@@ -425,10 +425,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
entry = ring_buffer_event_data(event);
entry->prev_pid = curr->pid;
entry->prev_prio = curr->prio;
- entry->prev_state = curr->state;
+ entry->prev_state = __trace_sched_switch_state(false, curr);
entry->next_pid = wakee->pid;
entry->next_prio = wakee->prio;
- entry->next_state = wakee->state;
+ entry->next_state = __trace_sched_switch_state(false, wakee);
entry->next_cpu = task_cpu(wakee);

if (!call_filter_check_discard(call, entry, buffer, event))
@@ -482,7 +482,7 @@ probe_wakeup_sched_switch(void *ignore, bool preempt,
data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);

__trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
- tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
+ tracing_sched_switch_trace(preempt, wakeup_trace, prev, next, flags, pc);

T0 = data->preempt_timestamp;
T1 = ftrace_now(cpu);