Re: [PATCH v4] sched/debug: Use sched_debug_lock to serialize use of cgroup_path[] only

From: Peter Zijlstra
Date: Tue Apr 06 2021 - 05:17:05 EST


On Mon, Apr 05, 2021 at 07:42:03PM -0400, Waiman Long wrote:
> The handling of sysrq key can be activated by echoing the key to
> /proc/sysrq-trigger or via the magic key sequence typed into a terminal
> that is connected to the system in some way (serial, USB or other means).
> In the former case, the handling is done in a user context. In the
> latter case, it is likely to be in an interrupt context.

> [ 7809.796281] </NMI>
> [ 7809.796282] _raw_spin_lock_irqsave+0x32/0x40
> [ 7809.796283] print_cpu+0x261/0x7c0
> [ 7809.796283] sysrq_sched_debug_show+0x34/0x50
> [ 7809.796284] sysrq_handle_showstate+0xc/0x20
> [ 7809.796284] __handle_sysrq.cold.11+0x48/0xfb
> [ 7809.796285] write_sysrq_trigger+0x2b/0x30
> [ 7809.796285] proc_reg_write+0x39/0x60
> [ 7809.796286] vfs_write+0xa5/0x1a0
> [ 7809.796286] ksys_write+0x4f/0xb0
> [ 7809.796287] do_syscall_64+0x5b/0x1a0
> [ 7809.796287] entry_SYSCALL_64_after_hwframe+0x65/0xca
> [ 7809.796288] RIP: 0033:0x7fabe4ceb648
>
> The purpose of sched_debug_lock is to serialize the use of the global
> cgroup_path[] buffer in print_cpu(). The rest of the printk calls don't
> need serialization from sched_debug_lock.

> The print_cpu() function has two callers - sched_debug_show() and
> sysrq_sched_debug_show().

So what idiot is doing sysrq and that proc file at the same time? Why is
it a problem now?

> @@ -470,16 +468,49 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
> #endif
>
> #ifdef CONFIG_CGROUP_SCHED
> +static DEFINE_SPINLOCK(sched_debug_lock);
> static char group_path[PATH_MAX];
> +static enum {
> + TOKEN_NONE,
> + TOKEN_ACQUIRED,
> + TOKEN_NA /* Not applicable */
> +} console_token = TOKEN_ACQUIRED;

> +/*
> + * All the print_cpu() callers from sched_debug_show() will be allowed
> + * to contend for sched_debug_lock and use group_path[] as their SEQ_printf()
> + * calls will be much faster. However only one print_cpu() caller from
> + * sysrq_sched_debug_show() which outputs to the console will be allowed
> + * to use group_path[]. Another parallel console writer will have to use
> + * a shorter stack buffer instead. Since the console output will be garbled
> + * anyway, truncation of some cgroup paths shouldn't be a big issue.
> + */
> +#define SEQ_printf_task_group_path(m, tg, fmt...) \
> +{ \
> + unsigned long flags; \
> + int token = m ? TOKEN_NA \
> + : xchg_acquire(&console_token, TOKEN_NONE); \
> + \
> + if (token == TOKEN_NONE) { \
> + char buf[128]; \
> + task_group_path(tg, buf, sizeof(buf)); \
> + SEQ_printf(m, fmt, buf); \
> + } else { \
> + spin_lock_irqsave(&sched_debug_lock, flags); \
> + task_group_path(tg, group_path, sizeof(group_path)); \
> + SEQ_printf(m, fmt, group_path); \
> + spin_unlock_irqrestore(&sched_debug_lock, flags); \
> + if (token == TOKEN_ACQUIRED) \
> + smp_store_release(&console_token, token); \
> + } \
> }

This is disgusting... you have an open-coded test-and-set lock like
thing *AND* a spinlock, what gives?


What's wrong with something simple like this?

---
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4b49cc2af5c4..2ac2977f3b96 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -8,8 +8,6 @@
*/
#include "sched.h"

-static DEFINE_SPINLOCK(sched_debug_lock);
-
/*
* This allows printing both to /proc/sched_debug and
* to the console
@@ -470,6 +468,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
#endif

#ifdef CONFIG_CGROUP_SCHED
+static DEFINE_SPINLOCK(group_path_lock);
static char group_path[PATH_MAX];

static char *task_group_path(struct task_group *tg)
@@ -481,6 +480,22 @@ static char *task_group_path(struct task_group *tg)

return group_path;
}
+
+#define SEQ_printf_task_group_path(m, tg) \
+do { \
+ if (spin_trylock(&group_path_lock)) { \
+ task_group_path(tg, group_path, sizeof(group_path)); \
+ SEQ_printf(m, "%s", group_path); \
+ spin_unlock(&group_path_lock); \
+ } else { \
+ SEQ_printf(m, "looser!"); \
+ }
+} while (0)
+
+#else
+
+#define SEQ_printf_task_group_path(m, tg) do { } while (0)
+
#endif

static void
@@ -505,9 +520,8 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
#ifdef CONFIG_NUMA_BALANCING
SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
#endif
-#ifdef CONFIG_CGROUP_SCHED
- SEQ_printf(m, " %s", task_group_path(task_group(p)));
-#endif
+ SEQ_printf(m, " ");
+ SEQ_printf_task_group_path(m, task_group(p));

SEQ_printf(m, "\n");
}
@@ -541,13 +555,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
struct sched_entity *last;
unsigned long flags;

-#ifdef CONFIG_FAIR_GROUP_SCHED
SEQ_printf(m, "\n");
- SEQ_printf(m, "cfs_rq[%d]:%s\n", cpu, task_group_path(cfs_rq->tg));
-#else
+ SEQ_printf(m, "cfs_rq[%d]:", cpu);
+ SEQ_printf_task_group_path(m, cfs_rq->tg);
SEQ_printf(m, "\n");
- SEQ_printf(m, "cfs_rq[%d]:\n", cpu);
-#endif
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
SPLIT_NS(cfs_rq->exec_clock));

@@ -612,13 +623,10 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)

void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
{
-#ifdef CONFIG_RT_GROUP_SCHED
SEQ_printf(m, "\n");
- SEQ_printf(m, "rt_rq[%d]:%s\n", cpu, task_group_path(rt_rq->tg));
-#else
+ SEQ_printf(m, "rt_rq[%d]:", cpu);
+ SEQ_printf_task_group_path(m, rt_rq->tg);
SEQ_printf(m, "\n");
- SEQ_printf(m, "rt_rq[%d]:\n", cpu);
-#endif

#define P(x) \
SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
@@ -666,7 +674,6 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
static void print_cpu(struct seq_file *m, int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long flags;

#ifdef CONFIG_X86
{
@@ -717,13 +724,11 @@ do { \
}
#undef P

- spin_lock_irqsave(&sched_debug_lock, flags);
print_cfs_stats(m, cpu);
print_rt_stats(m, cpu);
print_dl_stats(m, cpu);

print_rq(m, rq, cpu);
- spin_unlock_irqrestore(&sched_debug_lock, flags);
SEQ_printf(m, "\n");
}