[PATCH] rcu: detect the preempt_rcu hang for triage jing's board

From: he, bo
Date: Sun Dec 09 2018 - 05:11:33 EST


Change-Id: I2ffceec2ae4847867753609e45c99afc66956003
Tracked-On:
Signed-off-by: he, bo <bo.he@xxxxxxxxx>
---
kernel/rcu/tree.c | 20 ++++++++++++++++++--
1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 78c0cf2..d6de363 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2192,8 +2192,13 @@ static int __noreturn rcu_gp_kthread(void *arg)
int ret;
struct rcu_state *rsp = arg;
struct rcu_node *rnp = rcu_get_root(rsp);
+ pid_t rcu_preempt_pid;

rcu_bind_gp_kthread();
+ if(!strcmp(rsp->name, "rcu_preempt")) {
+ rcu_preempt_pid = rsp->gp_kthread->pid;
+ }
+
for (;;) {

/* Handle grace-period start. */
@@ -2202,8 +2207,19 @@ static int __noreturn rcu_gp_kthread(void *arg)
READ_ONCE(rsp->gp_seq),
TPS("reqwait"));
rsp->gp_state = RCU_GP_WAIT_GPS;
- swait_event_idle_exclusive(rsp->gp_wq, READ_ONCE(rsp->gp_flags) &
- RCU_GP_FLAG_INIT);
+ if (current->pid != rcu_preempt_pid) {
+ swait_event_idle_exclusive(rsp->gp_wq, READ_ONCE(rsp->gp_flags) &
+ RCU_GP_FLAG_INIT);
+ } else {
+ ret = swait_event_idle_timeout_exclusive(rsp->gp_wq, READ_ONCE(rsp->gp_flags) &
+ RCU_GP_FLAG_INIT, 2*HZ);
+
+ if(!ret) {

We get here if ret==0. Therefore, the above "if" statement needs to
instead be "if (ret == 1) {".

In addition, in order to get event traces dumped, we also need:

rcu_ftrace_dump(DUMP_ALL);

+ show_rcu_gp_kthreads();
+ panic("hung_task: blocked in rcu_gp_kthread init");
+ }
+ }
+
rsp->gp_state = RCU_GP_DONE_GPS;
/* Locking provides needed memory barrier. */
if (rcu_gp_init(rsp))
--
2.7.4

------------------------------------------------------------------------

So, again, please change the "if(!ret) {" to "if (ret == 1) {", and
please add "rcu_ftrace_dump(DUMP_ALL);" right after this "if" statement,
as shown above.

With that change, I bet that you will again see failures.

> Compared with the swait_event_idle_timeout_exclusive will do 3 times to check the condition, while swait_event_idle_ exclusive will do 2 times check the condition.
>
> so today I will do another experiment, only change as below:
> - swait_event_idle_exclusive(rsp->gp_wq, READ_ONCE(rsp->gp_flags) &
> - RCU_GP_FLAG_INIT);
> + ret = swait_event_idle_timeout_exclusive(rsp->gp_wq, READ_ONCE(rsp->gp_flags) &
> + RCU_GP_FLAG_INIT, MAX_SCHEDULE_TIMEOUT);
> +
>
> Can you get some clues from the experiment?

Again, please instead make the changes that I called out above, with
the replacement for your patch 0001 shown below.

Thanx, Paul

PS. I have been testing for quite some time, but am still unable
to reproduce this. So we must depend on you to reproduce it.

------------------------------------------------------------------------

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 0b760c1369f7..86152af1a580 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2153,8 +2153,13 @@ static int __noreturn rcu_gp_kthread(void *arg)
int ret;
struct rcu_state *rsp = arg;
struct rcu_node *rnp = rcu_get_root(rsp);
+ pid_t rcu_preempt_pid;

rcu_bind_gp_kthread();
+ if(!strcmp(rsp->name, "rcu_preempt")) {
+ rcu_preempt_pid = rsp->gp_kthread->pid;
+ }
+
for (;;) {

/* Handle grace-period start. */
@@ -2163,8 +2168,20 @@ static int __noreturn rcu_gp_kthread(void *arg)
READ_ONCE(rsp->gp_seq),
TPS("reqwait"));
rsp->gp_state = RCU_GP_WAIT_GPS;
- swait_event_idle_exclusive(rsp->gp_wq, READ_ONCE(rsp->gp_flags) &
- RCU_GP_FLAG_INIT);
+ if (current->pid != rcu_preempt_pid) {
+ swait_event_idle_exclusive(rsp->gp_wq, READ_ONCE(rsp->gp_flags) &
+ RCU_GP_FLAG_INIT);
+ } else {
+ ret = swait_event_idle_timeout_exclusive(rsp->gp_wq, READ_ONCE(rsp->gp_flags) &
+ RCU_GP_FLAG_INIT, 2*HZ);
+
+ if (ret == 1) {
+ rcu_ftrace_dump(DUMP_ALL);
+ show_rcu_gp_kthreads();
+ panic("hung_task: blocked in rcu_gp_kthread init");
+ }
+ }
+
rsp->gp_state = RCU_GP_DONE_GPS;
/* Locking provides needed memory barrier. */
if (rcu_gp_init(rsp))