[PATCH] tipc: fix stall during bclink wakeup procedure

From: Kolmakov Dmitriy
Date: Wed Sep 02 2015 - 11:33:34 EST


From: Dmitry S Kolmakov <kolmakov.dmitriy@xxxxxxxxxx>

If an attempt to wake up users of the broadcast link is made when there is not enough space in the send queue, it may hang inside the tipc_sk_rcv() function, since the loop there breaks only after the wakeup queue becomes empty. This can lead to a complete CPU stall, with the following message generated by RCU (a simplified model of the loop follows the trace):

Aug 17 15:11:28 [kernel] INFO: rcu_sched self-detected stall on CPU { 0} (t=2101 jiffies g=54225 c=54224 q=11465)
Aug 17 15:11:28 [kernel] Task dump for CPU 0:
Aug 17 15:11:28 [kernel] tpch R running task 0 39949 39948 0x0000000a
Aug 17 15:11:28 [kernel] ffffffff818536c0 ffff88181fa037a0 ffffffff8106a4be 0000000000000000
Aug 17 15:11:28 [kernel] ffffffff818536c0 ffff88181fa037c0 ffffffff8106d8a8 ffff88181fa03800
Aug 17 15:11:28 [kernel] 0000000000000001 ffff88181fa037f0 ffffffff81094a50 ffff88181fa15680
Aug 17 15:11:28 [kernel] Call Trace:
Aug 17 15:11:28 [kernel] <IRQ> [<ffffffff8106a4be>] sched_show_task+0xae/0x120
Aug 17 15:11:28 [kernel] [<ffffffff8106d8a8>] dump_cpu_task+0x38/0x40
Aug 17 15:11:28 [kernel] [<ffffffff81094a50>] rcu_dump_cpu_stacks+0x90/0xd0
Aug 17 15:11:28 [kernel] [<ffffffff81097c3b>] rcu_check_callbacks+0x3eb/0x6e0
Aug 17 15:11:28 [kernel] [<ffffffff8106e53f>] ? account_system_time+0x7f/0x170
Aug 17 15:11:28 [kernel] [<ffffffff81099e64>] update_process_times+0x34/0x60
Aug 17 15:11:28 [kernel] [<ffffffff810a84d1>] tick_sched_handle.isra.18+0x31/0x40
Aug 17 15:11:28 [kernel] [<ffffffff810a851c>] tick_sched_timer+0x3c/0x70
Aug 17 15:11:28 [kernel] [<ffffffff8109a43d>] __run_hrtimer.isra.34+0x3d/0xc0
Aug 17 15:11:28 [kernel] [<ffffffff8109aa95>] hrtimer_interrupt+0xc5/0x1e0
Aug 17 15:11:28 [kernel] [<ffffffff81030d52>] ? native_smp_send_reschedule+0x42/0x60
Aug 17 15:11:28 [kernel] [<ffffffff81032f04>] local_apic_timer_interrupt+0x34/0x60
Aug 17 15:11:28 [kernel] [<ffffffff810335bc>] smp_apic_timer_interrupt+0x3c/0x60
Aug 17 15:11:28 [kernel] [<ffffffff8165a3fb>] apic_timer_interrupt+0x6b/0x70
Aug 17 15:11:28 [kernel] [<ffffffff81659129>] ? _raw_spin_unlock_irqrestore+0x9/0x10
Aug 17 15:11:28 [kernel] [<ffffffff8107eb9f>] __wake_up_sync_key+0x4f/0x60
Aug 17 15:11:28 [kernel] [<ffffffffa313ddd1>] tipc_write_space+0x31/0x40 [tipc]
Aug 17 15:11:28 [kernel] [<ffffffffa313dadf>] filter_rcv+0x31f/0x520 [tipc]
Aug 17 15:11:28 [kernel] [<ffffffffa313d699>] ? tipc_sk_lookup+0xc9/0x110 [tipc]
Aug 17 15:11:28 [kernel] [<ffffffff81659259>] ? _raw_spin_lock_bh+0x19/0x30
Aug 17 15:11:28 [kernel] [<ffffffffa314122c>] tipc_sk_rcv+0x2dc/0x3e0 [tipc]
Aug 17 15:11:28 [kernel] [<ffffffffa312e7ff>] tipc_bclink_wakeup_users+0x2f/0x40 [tipc]
Aug 17 15:11:28 [kernel] [<ffffffffa313ce26>] tipc_node_unlock+0x186/0x190 [tipc]
Aug 17 15:11:28 [kernel] [<ffffffff81597c1c>] ? kfree_skb+0x2c/0x40
Aug 17 15:11:28 [kernel] [<ffffffffa313475c>] tipc_rcv+0x2ac/0x8c0 [tipc]
Aug 17 15:11:28 [kernel] [<ffffffffa312ff58>] tipc_l2_rcv_msg+0x38/0x50 [tipc]
Aug 17 15:11:28 [kernel] [<ffffffff815a76d3>] __netif_receive_skb_core+0x5a3/0x950
Aug 17 15:11:28 [kernel] [<ffffffff815a98d3>] __netif_receive_skb+0x13/0x60
Aug 17 15:11:28 [kernel] [<ffffffff815a993e>] netif_receive_skb_internal+0x1e/0x90
Aug 17 15:11:28 [kernel] [<ffffffff815aa138>] napi_gro_receive+0x78/0xa0
Aug 17 15:11:28 [kernel] [<ffffffffa07f93f4>] tg3_poll_work+0xc54/0xf40 [tg3]
Aug 17 15:11:28 [kernel] [<ffffffff81597c8c>] ? consume_skb+0x2c/0x40
Aug 17 15:11:28 [kernel] [<ffffffffa07f9721>] tg3_poll_msix+0x41/0x160 [tg3]
Aug 17 15:11:28 [kernel] [<ffffffff815ab0f2>] net_rx_action+0xe2/0x290
Aug 17 15:11:28 [kernel] [<ffffffff8104b92a>] __do_softirq+0xda/0x1f0
Aug 17 15:11:28 [kernel] [<ffffffff8104bc26>] irq_exit+0x76/0xa0
Aug 17 15:11:28 [kernel] [<ffffffff81004355>] do_IRQ+0x55/0xf0
Aug 17 15:11:28 [kernel] [<ffffffff8165a12b>] common_interrupt+0x6b/0x6b
Aug 17 15:12:31 [kernel] <EOI>
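
To picture the failure mode, here is a minimal userspace model of the loop described above (illustrative only - the names and the integer "queues" are stand-ins, not the literal kernel code): the consumer breaks out only when the queue it was handed is empty, but a wakeup entry that finds no room in the send queue is not consumed, so once the send queue is full no forward progress is ever made.

#include <stdio.h>

#define WAITING_USERS	4	/* entries on the wakeup queue */
#define SEND_QUEUE_ROOM	2	/* space left in the send queue */

int main(void)
{
	int queued = WAITING_USERS;
	int room = SEND_QUEUE_ROOM;
	long spins = 0;

	/* tipc_sk_rcv()-like loop: breaks only when the queue is empty */
	while (queued > 0) {
		if (room > 0) {			/* user fits - wake it up */
			room--;
			queued--;
		}
		/* else: the entry stays queued and we go round again */
		if (++spins > 1000000) {	/* cap added for demonstration */
			printf("stalled: %d users still queued\n", queued);
			return 1;
		}
	}
	printf("all users woken after %ld iterations\n", spins);
	return 0;
}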

The issue occurred on fairly large setups of 32-64 sockets sending several multicast messages all-to-all at the same time. The patch fixes the issue by reusing the link_prepare_wakeup() procedure, which moves only as many users as the available space in the send queue permits to a separate queue, which is in turn handed to tipc_sk_rcv().
The link_prepare_wakeup() procedure was also modified slightly:
1. First, to enable its reuse, the actions related to the unicast link were moved out of the function.
2. Second, the internal loop no longer breaks when a single send queue is exhausted; it continues to the end of the wakeup queue, so the send queues for all importance levels can be refilled (see the sketch after this list).
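
And here is the same toy model with the patched flow (again illustrative; link_prepare_wakeup() operates on real sk_buff queues, not counters): the users that fit are first moved to a separate result queue, and the consumer then drains that bounded queue, so it always terminates.

#include <stdio.h>

#define WAITING_USERS	4
#define SEND_QUEUE_ROOM	2

int main(void)
{
	int wakeupq = WAITING_USERS;
	int room = SEND_QUEUE_ROOM;
	int resultq = 0;
	int woken = 0;

	/* link_prepare_wakeup(): move only the users that fit */
	while (wakeupq > 0 && room > 0) {
		wakeupq--;
		room--;
		resultq++;
	}

	/* tipc_sk_rcv(): drains a bounded queue, so it is finite */
	while (resultq > 0) {
		resultq--;
		woken++;
	}

	printf("%d users woken, %d left waiting for space\n", woken, wakeupq);
	return 0;
}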

Signed-off-by: Dmitry S Kolmakov <kolmakov.dmitriy@xxxxxxxxxx>
---
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index c5cbdcb..b56f74a 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -176,8 +176,12 @@ static void bclink_retransmit_pkt(struct tipc_net *tn, u32 after, u32 to)
void tipc_bclink_wakeup_users(struct net *net)
{
struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_link *bcl = tn->bcl;
+ struct sk_buff_head resultq;

- tipc_sk_rcv(net, &tn->bclink->link.wakeupq);
+ skb_queue_head_init(&resultq);
+ link_prepare_wakeup(bcl, &resultq);
+ tipc_sk_rcv(net, &resultq);
}

/**
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 43a515d..467edbc 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -372,10 +372,11 @@ err:
/**
* link_prepare_wakeup - prepare users for wakeup after congestion
* @link: congested link
+ * @resultq: queue for users which can be woken up
* Move a number of waiting users, as permitted by available space in
- * the send queue, from link wait queue to node wait queue for wakeup
+ * the send queue, from link wait queue to specified queue for wakeup
*/
-void link_prepare_wakeup(struct tipc_link *l)
+void link_prepare_wakeup(struct tipc_link *l, struct sk_buff_head *resultq)
{
int pnd[TIPC_SYSTEM_IMPORTANCE + 1] = {0,};
int imp, lim;
@@ -386,11 +387,9 @@ void link_prepare_wakeup(struct tipc_link *l)
lim = l->window + l->backlog[imp].limit;
pnd[imp] += TIPC_SKB_CB(skb)->chain_sz;
if ((pnd[imp] + l->backlog[imp].len) >= lim)
- break;
+ continue;
skb_unlink(skb, &l->wakeupq);
- skb_queue_tail(&l->inputq, skb);
- l->owner->inputq = &l->inputq;
- l->owner->action_flags |= TIPC_MSG_EVT;
+ skb_queue_tail(resultq, skb);
}
}

@@ -1112,8 +1111,11 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr)
if (unlikely(skb_queue_len(&l_ptr->backlogq)))
tipc_link_push_packets(l_ptr);

- if (released && !skb_queue_empty(&l_ptr->wakeupq))
- link_prepare_wakeup(l_ptr);
+ if (released && !skb_queue_empty(&l_ptr->wakeupq)) {
+ link_prepare_wakeup(l_ptr, &l_ptr->inputq);
+ l_ptr->owner->inputq = &l_ptr->inputq;
+ l_ptr->owner->action_flags |= TIPC_MSG_EVT;
+ }

/* Process the incoming packet */
if (unlikely(!link_working_working(l_ptr))) {
diff --git a/net/tipc/link.h b/net/tipc/link.h
index b5b4e35..4520d4c 100644
--- a/net/tipc/link.h
+++ b/net/tipc/link.h
@@ -245,7 +245,7 @@ int tipc_nl_link_get(struct sk_buff *skb, struct genl_info *info);
int tipc_nl_link_set(struct sk_buff *skb, struct genl_info *info);
int tipc_nl_link_reset_stats(struct sk_buff *skb, struct genl_info *info);
int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]);
-void link_prepare_wakeup(struct tipc_link *l);
+void link_prepare_wakeup(struct tipc_link *l, struct sk_buff_head *resultq);

/*
* Link sequence number manipulation routines (uses modulo 2**16 arithmetic)
--