Re: 5.8-rc*: kernel BUG at kernel/signal.c:1917

From: Oleg Nesterov
Date: Fri Jul 17 2020 - 08:40:29 EST


On 07/17, Oleg Nesterov wrote:
>
> On 07/17, Jiri Slaby wrote:
> >
> > On 17. 07. 20, 12:45, Jiri Slaby wrote:
> > > Hi,
> > >
> > > the strace testsuite triggers this on 5.8-rc4 and -rc5 both on x86_64
> > > and i586:
> >
> > make check needs -jsomething, running it sequentially (-j1) doesn't
> > trigger it. After the error, I cannot run anything. Like ps to find out
> > what test caused the crash...
>
> Strange... I'll try to reproduce but I can't do this till Monday.
>
> Meanwhile, could you try the patch below? It needs CONFIG_DEBUG_ATOMIC_SLEEP.

Please see the updated patch below; let's check ptrace_unfreeze_traced() too.

Oleg.

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 43d6179508d6..71c76bc7dec6 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -180,6 +180,7 @@ static bool ptrace_freeze_traced(struct task_struct *task)

spin_lock_irq(&task->sighand->siglock);
if (task_is_traced(task) && !__fatal_signal_pending(task)) {
+ task->task_state_change = _THIS_IP_;
task->state = __TASK_TRACED;
ret = true;
}
@@ -203,8 +204,10 @@ static void ptrace_unfreeze_traced(struct task_struct *task)
if (task->state == __TASK_TRACED) {
if (__fatal_signal_pending(task))
wake_up_state(task, __TASK_TRACED);
- else
+ else {
+ task->task_state_change = _THIS_IP_;
task->state = TASK_TRACED;
+ }
}
spin_unlock_irq(&task->sighand->siglock);
}
diff --git a/kernel/signal.c b/kernel/signal.c
index ee22ec78fd6d..ba4c4c9ed9b5 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1914,7 +1914,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
BUG_ON(sig == -1);

/* do_notify_parent_cldstop should have been called instead. */
- BUG_ON(task_is_stopped_or_traced(tsk));
+ WARN(task_is_stopped_or_traced(tsk),
+ "exist with state=%ld set at %pS",
+ tsk->state, (void*)tsk->task_state_change);

BUG_ON(!tsk->ptrace &&
(tsk->group_leader != tsk || !thread_group_empty(tsk)));
@@ -2214,6 +2216,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t
preempt_enable_no_resched();
freezable_schedule();
cgroup_leave_frozen(true);
+ WARN_ON(current->state);
} else {
/*
* By the time we got the lock, our tracer went away.
@@ -2393,6 +2396,7 @@ static bool do_signal_stop(int signr)
/* Now we don't run again until woken by SIGCONT or SIGKILL */
cgroup_enter_frozen();
freezable_schedule();
+ WARN_ON(current->state);
return true;
} else {
/*