[PATCH net v5 1/3] net: sched: fix packet stuck problem for lockless qdisc

From: Yunsheng Lin
Date: Wed May 05 2021 - 21:57:50 EST


Lockless qdisc has below concurrent problem:
cpu0 cpu1
. .
q->enqueue .
. .
qdisc_run_begin() .
. .
dequeue_skb() .
. .
sch_direct_xmit() .
. .
. q->enqueue
. qdisc_run_begin()
. return and do nothing
. .
qdisc_run_end() .

cpu1 enqueue a skb without calling __qdisc_run() because cpu0
has not released the lock yet and spin_trylock() return false
for cpu1 in qdisc_run_begin(), and cpu0 do not see the skb
enqueued by cpu1 when calling dequeue_skb() because cpu1 may
enqueue the skb after cpu0 calling dequeue_skb() and before
cpu0 calling qdisc_run_end().

Lockless qdisc has below another concurrent problem when
tx_action is involved:

cpu0(serving tx_action) cpu1 cpu2
. . .
. q->enqueue .
. qdisc_run_begin() .
. dequeue_skb() .
. . q->enqueue
. . .
. sch_direct_xmit() .
. . qdisc_run_begin()
. . return and do nothing
. . .
clear __QDISC_STATE_SCHED . .
qdisc_run_begin() . .
return and do nothing . .
. . .
. qdisc_run_end() .

This patch fixes the above data race by:
1. Test STATE_MISSED before doing spin_trylock().
2. If the first spin_trylock() return false and STATE_MISSED is
not set before the first spin_trylock(), Set STATE_MISSED and
retry another spin_trylock() in case other CPU may not see
STATE_MISSED after it releases the lock.
3. reschedule if STATE_MISSED is set after the lock is released
at the end of qdisc_run_end().

For tx_action case, STATE_MISSED is also set when cpu1 is at the
end if qdisc_run_end(), so tx_action will be rescheduled again
to dequeue the skb enqueued by cpu2.

Clear STATE_MISSED before retrying a dequeuing when dequeuing
returns NULL in order to reduce the overhead of the above double
spin_trylock() and __netif_schedule() calling.

The performance impact of this patch, tested using pktgen and
dummy netdev with pfifo_fast qdisc attached:

threads without+this_patch with+this_patch delta
1 2.61Mpps 2.60Mpps -0.3%
2 3.97Mpps 3.82Mpps -3.7%
4 5.62Mpps 5.59Mpps -0.5%
8 2.78Mpps 2.77Mpps -0.3%
16 2.22Mpps 2.22Mpps -0.0%

Fixes: 6b3ba9146fe6 ("net: sched: allow qdiscs to handle locking")
Signed-off-by: Yunsheng Lin <linyunsheng@xxxxxxxxxx>
Tested-by: Juergen Gross <jgross@xxxxxxxx>
---
V4: Change STATE_NEED_RESCHEDULE to STATE_MISSED mirroring
NAPI's NAPIF_STATE_MISSED, and add Juergen's "Tested-by"
tag for there is only renaming and typo fixing between
V4 and V3.
V3: Fix a compile error and a few comment typo, remove the
__QDISC_STATE_DEACTIVATED checking, and update the
performance data.
V2: Avoid the overhead of fixing the data race as much as
possible.
---
include/net/sch_generic.h | 37 ++++++++++++++++++++++++++++++++++++-
net/sched/sch_generic.c | 12 ++++++++++++
2 files changed, 48 insertions(+), 1 deletion(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index f7a6e14..b85b8ea 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -36,6 +36,7 @@ struct qdisc_rate_table {
enum qdisc_state_t {
__QDISC_STATE_SCHED,
__QDISC_STATE_DEACTIVATED,
+ __QDISC_STATE_MISSED,
};

struct qdisc_size_table {
@@ -159,8 +160,37 @@ static inline bool qdisc_is_empty(const struct Qdisc *qdisc)
static inline bool qdisc_run_begin(struct Qdisc *qdisc)
{
if (qdisc->flags & TCQ_F_NOLOCK) {
+ bool dont_retry = test_bit(__QDISC_STATE_MISSED,
+ &qdisc->state);
+
+ if (spin_trylock(&qdisc->seqlock))
+ goto nolock_empty;
+
+ /* If the flag is set before doing the spin_trylock() and
+ * the above spin_trylock() return false, it means other cpu
+ * holding the lock will do dequeuing for us, or it wil see
+ * the flag set after releasing lock and reschedule the
+ * net_tx_action() to do the dequeuing.
+ */
+ if (dont_retry)
+ return false;
+
+ /* We could do set_bit() before the first spin_trylock(),
+ * and avoid doing second spin_trylock() completely, then
+ * we could have multi cpus doing the set_bit(). Here use
+ * dont_retry to avoid doing the set_bit() and the second
+ * spin_trylock(), which has 5% performance improvement than
+ * doing the set_bit() before the first spin_trylock().
+ */
+ set_bit(__QDISC_STATE_MISSED, &qdisc->state);
+
+ /* Retry again in case other CPU may not see the new flag
+ * after it releases the lock at the end of qdisc_run_end().
+ */
if (!spin_trylock(&qdisc->seqlock))
return false;
+
+nolock_empty:
WRITE_ONCE(qdisc->empty, false);
} else if (qdisc_is_running(qdisc)) {
return false;
@@ -176,8 +206,13 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc)
static inline void qdisc_run_end(struct Qdisc *qdisc)
{
write_seqcount_end(&qdisc->running);
- if (qdisc->flags & TCQ_F_NOLOCK)
+ if (qdisc->flags & TCQ_F_NOLOCK) {
spin_unlock(&qdisc->seqlock);
+
+ if (unlikely(test_bit(__QDISC_STATE_MISSED,
+ &qdisc->state)))
+ __netif_schedule(qdisc);
+ }
}

static inline bool qdisc_may_bulk(const struct Qdisc *qdisc)
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 44991ea..9bc73ea 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -640,8 +640,10 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
{
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
struct sk_buff *skb = NULL;
+ bool need_retry = true;
int band;

+retry:
for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
struct skb_array *q = band2list(priv, band);

@@ -652,6 +654,16 @@ static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
}
if (likely(skb)) {
qdisc_update_stats_at_dequeue(qdisc, skb);
+ } else if (need_retry &&
+ test_and_clear_bit(__QDISC_STATE_MISSED,
+ &qdisc->state)) {
+ /* do another dequeuing after clearing the flag to
+ * avoid calling __netif_schedule().
+ */
+ smp_mb__after_atomic();
+ need_retry = false;
+
+ goto retry;
} else {
WRITE_ONCE(qdisc->empty, true);
}
--
2.7.4