[PATCH v2 4.19] tcp: fix TCP socks left unreleased in BBR mode

From: kerneljasonxing
Date: Thu Jun 04 2020 - 05:01:34 EST


From: Jason Xing <kerneljasonxing@xxxxxxxxx>

When BBR is in use, TCP socks cannot be released because
tcp_internal_pacing() takes a duplicate sock_hold() when an RTO
happens: the retransmission path re-arms the pacing hrtimer while it
is still queued, so the reference taken for the previous arming is
never dropped. The leaked socks steadily inflate the TCP slab caches
and keep triggering the OOM killer until the machine crashes.

Besides BBR, any congestion control that relies on the internal
pacing function can trigger the same leak.
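
A trivial userspace model shows the imbalance (illustrative only, not
kernel code; all names below are made up). Arming the timer models the
hrtimer_start()/sock_hold() pair, firing it models the single put done
when the timer expires, so arming twice but firing once leaves one
reference behind:

/* leak_model.c - build with: gcc -Wall leak_model.c */
#include <stdbool.h>
#include <stdio.h>

struct sock_model {
	int refcnt;		/* models sk->sk_refcnt */
	bool timer_queued;	/* models hrtimer_is_queued() */
};

static void arm_pacing_timer(struct sock_model *sk)
{
	sk->timer_queued = true;
	sk->refcnt++;			/* models sock_hold(sk) */
}

static void pacing_timer_fires(struct sock_model *sk)
{
	sk->timer_queued = false;
	sk->refcnt--;			/* models the put in the handler */
}

int main(void)
{
	struct sock_model sk = { .refcnt = 1, .timer_queued = false };

	arm_pacing_timer(&sk);		/* normal transmit arms the timer */
	arm_pacing_timer(&sk);		/* RTO path re-arms it: +1 again */
	pacing_timer_fires(&sk);	/* ...but the timer fires only once */

	/* One reference too many: the sock can never be freed. */
	printf("refcnt = %d, expected 1\n", sk.refcnt);
	return 0;
}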

Reproduction procedure:
0) cat /proc/slabinfo | grep TCP
1) switch net.ipv4.tcp_congestion_control to bbr
2) use a load generator such as wrk to open many TCP connections and
keep traffic flowing (see the example commands after this list)
3) use tc to add delay and packet loss so that RTOs occur
4) cat /proc/slabinfo | grep TCP
5) kill the wrk command and watch the number of objects and slabs in
TCP
6) note that the counts never drop back to the baseline from step 0)
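
For example, something like the following can drive the test (a
sketch only: the device name, server address, and netem values are
illustrative and must be adapted to the test setup):

sysctl -w net.ipv4.tcp_congestion_control=bbr
tc qdisc add dev eth0 root netem delay 200ms loss 10%
wrk -t4 -c400 -d300s http://<server-under-test>/
cat /proc/slabinfo | grep TCP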

v2: when the pacing timer is already queued, extend it from its
current expiry instead of blindly restarting it, so that all the
related risky paths are covered, not just the RTO one
(suggested by Eric Dumazet and Neal Cardwell)

Signed-off-by: Jason Xing <kerneljasonxing@xxxxxxxxx>
Signed-off-by: liweishi <liweishi@xxxxxxxxxxxx>
Signed-off-by: Shujin Li <lishujin@xxxxxxxxxxxx>
---
net/ipv4/tcp_output.c | 31 +++++++++++++++++++++++--------
1 file changed, 23 insertions(+), 8 deletions(-)

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index cc4ba42..4626f4e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -966,6 +966,8 @@ enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer)
 
 static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
 {
+	struct tcp_sock *tp = tcp_sk(sk);
+	ktime_t expire, now;
 	u64 len_ns;
 	u32 rate;
 
@@ -977,12 +979,28 @@ static void tcp_internal_pacing(struct sock *sk, const struct sk_buff *skb)
 
 	len_ns = (u64)skb->len * NSEC_PER_SEC;
 	do_div(len_ns, rate);
-	hrtimer_start(&tcp_sk(sk)->pacing_timer,
-		      ktime_add_ns(ktime_get(), len_ns),
+	now = ktime_get();
+	/* If hrtimer is already armed, then our caller has not
+	 * used tcp_pacing_check().
+	 */
+	if (unlikely(hrtimer_is_queued(&tp->pacing_timer))) {
+		expire = hrtimer_get_softexpires(&tp->pacing_timer);
+		if (ktime_after(expire, now))
+			now = expire;
+		if (hrtimer_try_to_cancel(&tp->pacing_timer) == 1)
+			__sock_put(sk);
+	}
+	hrtimer_start(&tp->pacing_timer, ktime_add_ns(now, len_ns),
 		      HRTIMER_MODE_ABS_PINNED_SOFT);
 	sock_hold(sk);
 }
 
+static bool tcp_pacing_check(const struct sock *sk)
+{
+	return tcp_needs_internal_pacing(sk) &&
+	       hrtimer_is_queued(&tcp_sk(sk)->pacing_timer);
+}
+
 static void tcp_update_skb_after_send(struct tcp_sock *tp, struct sk_buff *skb)
 {
 	skb->skb_mstamp = tp->tcp_mstamp;
@@ -2117,6 +2135,9 @@ static int tcp_mtu_probe(struct sock *sk)
 	if (!tcp_can_coalesce_send_queue_head(sk, probe_size))
 		return -1;
 
+	if (tcp_pacing_check(sk))
+		return -1;
+
 	/* We're allowed to probe. Build it now. */
 	nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC, false);
 	if (!nskb)
@@ -2190,12 +2211,6 @@ static int tcp_mtu_probe(struct sock *sk)
 	return -1;
 }
 
-static bool tcp_pacing_check(const struct sock *sk)
-{
-	return tcp_needs_internal_pacing(sk) &&
-	       hrtimer_is_queued(&tcp_sk(sk)->pacing_timer);
-}
-
 /* TCP Small Queues :
  * Control number of packets in qdisc/devices to two packets / or ~1 ms.
  * (These limits are doubled for retransmits)
--
1.8.3.1