On Mon, Apr 05, 2021 at 07:42:03PM -0400, Waiman Long wrote:
> We got a customer bug report about a watchdog panic because a process
> somehow stayed within sched_debug_lock for too long. I don't know
> exactly what the customer was doing, but we can reproduce the panic
> just by running two parallel "echo t > /proc/sysrq-trigger" commands.
> This happens on certain selected systems; I was using some Dell
> systems for my testing. Of course, it is not sensible to have more
> than one sysrq happening at the same time, but a parallel thread
> accessing /proc/sched_debug is certainly possible, and that invokes a
> similar code path.
>
> The handling of the sysrq key can be activated by echoing the key to
> /proc/sysrq-trigger or via the magic key sequence typed into a terminal
> that is connected to the system in some way (serial, USB or other
> means). In the former case, the handling is done in a user context. In
> the latter case, it is likely to be in an interrupt context.
>
> The panic backtrace looks like this:
>
> [ 7809.796281] </NMI>
> [ 7809.796282] _raw_spin_lock_irqsave+0x32/0x40
> [ 7809.796283] print_cpu+0x261/0x7c0
> [ 7809.796283] sysrq_sched_debug_show+0x34/0x50
> [ 7809.796284] sysrq_handle_showstate+0xc/0x20
> [ 7809.796284] __handle_sysrq.cold.11+0x48/0xfb
> [ 7809.796285] write_sysrq_trigger+0x2b/0x30
> [ 7809.796285] proc_reg_write+0x39/0x60
> [ 7809.796286] vfs_write+0xa5/0x1a0
> [ 7809.796286] ksys_write+0x4f/0xb0
> [ 7809.796287] do_syscall_64+0x5b/0x1a0
> [ 7809.796287] entry_SYSCALL_64_after_hwframe+0x65/0xca
> [ 7809.796288] RIP: 0033:0x7fabe4ceb648
>
> The purpose of sched_debug_lock is to serialize the use of the global
> group_path[] buffer in print_cpu(). The rest of the printk() calls
> don't need serialization by sched_debug_lock.
>
> The print_cpu() function has two callers - sched_debug_show() and
> sysrq_sched_debug_show().
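
For reference, the reproducer boils down to two concurrent sysrq-t
writers. A minimal C equivalent of the two parallel
"echo t > /proc/sysrq-trigger" commands above (an untested sketch,
needs root) could look like:

	#include <fcntl.h>
	#include <sys/wait.h>
	#include <unistd.h>

	int main(void)
	{
		int i;

		for (i = 0; i < 2; i++) {
			if (fork() == 0) {
				/*
				 * Same path as the backtrace above:
				 * write() -> write_sysrq_trigger() ->
				 * __handle_sysrq() -> sysrq_handle_showstate()
				 */
				int fd = open("/proc/sysrq-trigger", O_WRONLY);

				if (fd >= 0) {
					write(fd, "t", 1);
					close(fd);
				}
				_exit(0);
			}
		}
		while (wait(NULL) > 0)	/* reap both writers */
			;
		return 0;
	}

Both writers end up in sysrq_handle_showstate() dumping every task to
the console, so they pile up on sched_debug_lock.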

So what idiot is doing sysrq and that proc file at the same time? Why is
it a problem now?

> @@ -470,16 +468,49 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
> #endif
>
> #ifdef CONFIG_CGROUP_SCHED
> +static DEFINE_SPINLOCK(sched_debug_lock);
> static char group_path[PATH_MAX];
> +static enum {
> +	TOKEN_NONE,
> +	TOKEN_ACQUIRED,
> +	TOKEN_NA	/* Not applicable */
> +} console_token = TOKEN_ACQUIRED;
>
> +/*
> + * All the print_cpu() callers from sched_debug_show() will be allowed
> + * to contend for sched_debug_lock and use group_path[] as their SEQ_printf()
> + * calls will be much faster. However only one print_cpu() caller from
> + * sysrq_sched_debug_show() which outputs to the console will be allowed
> + * to use group_path[]. Another parallel console writer will have to use
> + * a shorter stack buffer instead. Since the console output will be garbled
> + * anyway, truncation of some cgroup paths shouldn't be a big issue.
> + */
> +#define SEQ_printf_task_group_path(m, tg, fmt...)			\
> +{									\
> +	unsigned long flags;						\
> +	int token = m ? TOKEN_NA					\
> +		      : xchg_acquire(&console_token, TOKEN_NONE);	\
> +									\
> +	if (token == TOKEN_NONE) {					\
> +		char buf[128];						\
> +		task_group_path(tg, buf, sizeof(buf));			\
> +		SEQ_printf(m, fmt, buf);				\
> +	} else {							\
> +		spin_lock_irqsave(&sched_debug_lock, flags);		\
> +		task_group_path(tg, group_path, sizeof(group_path));	\
> +		SEQ_printf(m, fmt, group_path);				\
> +		spin_unlock_irqrestore(&sched_debug_lock, flags);	\
> +		if (token == TOKEN_ACQUIRED)				\
> +			smp_store_release(&console_token, token);	\
> +	}								\
> +}

This is disgusting... you have an open-coded test-and-set lock like
thing *AND* a spinlock, what gives?
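
To spell out the "test-and-set lock like thing": stripped of the
seq_file special case, the token dance above is equivalent to this
user-space analogue (illustrative only; C11 atomics standing in for
xchg_acquire()/smp_store_release(), and the helper names are made up):

	#include <stdatomic.h>
	#include <stdbool.h>

	enum { TOKEN_NONE, TOKEN_ACQUIRED };

	static _Atomic int console_token = TOKEN_ACQUIRED;

	/* "trylock": exchange-acquire, true iff we took the token */
	static bool console_token_tryacquire(void)
	{
		return atomic_exchange_explicit(&console_token, TOKEN_NONE,
				memory_order_acquire) == TOKEN_ACQUIRED;
	}

	/* "unlock": a store-release hands the token back */
	static void console_token_release(void)
	{
		atomic_store_explicit(&console_token, TOKEN_ACQUIRED,
				memory_order_release);
	}

That is a trylock-only spinlock in all but name, wrapped around an
actual spinlock.
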
What's wrong with something simple like this?
---
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 4b49cc2af5c4..2ac2977f3b96 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -8,8 +8,6 @@
*/
#include "sched.h"
-static DEFINE_SPINLOCK(sched_debug_lock);
-
/*
* This allows printing both to /proc/sched_debug and
* to the console
@@ -470,6 +468,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
#endif
#ifdef CONFIG_CGROUP_SCHED
+static DEFINE_SPINLOCK(group_path_lock);
static char group_path[PATH_MAX];
static char *task_group_path(struct task_group *tg)
@@ -481,6 +480,22 @@ static char *task_group_path(struct task_group *tg)
return group_path;
}
+
+#define SEQ_printf_task_group_path(m, tg)			\
+do {								\
+	if (spin_trylock(&group_path_lock)) {			\
+		SEQ_printf(m, "%s", task_group_path(tg));	\
+		spin_unlock(&group_path_lock);			\
+	} else {						\
+		SEQ_printf(m, "looser!");			\
+	}							\
+} while (0)
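
Callers then split the path out of the surrounding print. Assuming
print_cfs_rq() keeps its current one-shot SEQ_printf() of the cgroup
path, it would become something like (sketch):

	/*
	 * was:
	 *	SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu,
	 *		   task_group_path(cfs_rq->tg));
	 */
	SEQ_printf(m, "\ncfs_rq[%d]:", cpu);
	SEQ_printf_task_group_path(m, cfs_rq->tg);
	SEQ_printf(m, "\n");

With the trylock, a contending dump just prints the "looser!"
placeholder instead of waiting on the lock with interrupts disabled.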