[PATCH RFC v2 net-next 1/5] net: Introduce Qdisc backpressure infrastructure
From: Peilin Ye
Date: Mon Aug 22 2022 - 05:13:19 EST
From: Peilin Ye <peilin.ye@xxxxxxxxxxxxx>
Currently sockets (especially UDP ones) can drop a lot of traffic at TC
egress when rate limited by shaper Qdiscs like HTB. Improve this by
introducing a Qdisc backpressure infrastructure:
a. A new 'struct sock' field, @sk_overlimits, which keeps track of the
number of bytes in socket send buffer that are currently
unavailable due to TC egress congestion. The size of an overlimit
socket's "effective" send buffer is represented by @sk_sndbuf minus
@sk_overlimits, with a lower limit of SOCK_MIN_SNDBUF:
max(@sk_sndbuf - @sk_overlimits, SOCK_MIN_SNDBUF)
b. A new (*backpressure) 'struct proto' callback, which is the
protocol's private algorithm for Qdisc backpressure.
Working together:
1. When a shaper Qdisc (TBF, HTB, CBQ, etc.) drops a packet that
belongs to a local socket, it calls qdisc_backpressure().
2. qdisc_backpressure() eventually invokes the socket protocol's
(*backpressure) callback, which should increase @sk_overlimits.
3. The transport layer then sees a smaller "effective" send buffer and
will send slower.
4. It is the per-protocol (*backpressure) implementation's
responsibility to decrease @sk_overlimits when TC egress becomes
idle again, potentially by using a timer.
Suggested-by: Cong Wang <cong.wang@xxxxxxxxxxxxx>
Signed-off-by: Peilin Ye <peilin.ye@xxxxxxxxxxxxx>
---
include/net/sch_generic.h | 11 +++++++++++
include/net/sock.h | 21 +++++++++++++++++++++
net/core/sock.c | 1 +
3 files changed, 33 insertions(+)
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index ec693fe7c553..afdf4bf64936 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -19,6 +19,7 @@
#include <net/gen_stats.h>
#include <net/rtnetlink.h>
#include <net/flow_offload.h>
+#include <net/sock.h>
struct Qdisc_ops;
struct qdisc_walker;
@@ -1188,6 +1189,16 @@ static inline int qdisc_drop_all(struct sk_buff *skb, struct Qdisc *sch,
return NET_XMIT_DROP;
}
+static inline void qdisc_backpressure(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+
+ if (!sk || !sk_fullsock(sk))
+ return;
+
+ sk_backpressure(sk);
+}
+
/* Length to Time (L2T) lookup in a qdisc_rate_table, to determine how
long it will take to send a packet given its size.
*/
diff --git a/include/net/sock.h b/include/net/sock.h
index 05a1bbdf5805..ef10ca66cf26 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -277,6 +277,7 @@ struct sk_filter;
* @sk_pacing_status: Pacing status (requested, handled by sch_fq)
* @sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE)
* @sk_sndbuf: size of send buffer in bytes
+ * @sk_overlimits: size of temporarily unavailable send buffer in bytes
* @__sk_flags_offset: empty field used to determine location of bitfield
* @sk_padding: unused element for alignment
* @sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets
@@ -439,6 +440,7 @@ struct sock {
struct dst_entry __rcu *sk_dst_cache;
atomic_t sk_omem_alloc;
int sk_sndbuf;
+ int sk_overlimits;
/* ===== cache line for TX ===== */
int sk_wmem_queued;
@@ -1264,6 +1266,7 @@ struct proto {
bool (*stream_memory_free)(const struct sock *sk, int wake);
bool (*sock_is_readable)(struct sock *sk);
+ void (*backpressure)(struct sock *sk);
/* Memory pressure */
void (*enter_memory_pressure)(struct sock *sk);
void (*leave_memory_pressure)(struct sock *sk);
@@ -2499,6 +2502,24 @@ static inline void sk_stream_moderate_sndbuf(struct sock *sk)
WRITE_ONCE(sk->sk_sndbuf, max_t(u32, val, SOCK_MIN_SNDBUF));
}
+static inline int sk_sndbuf_avail(struct sock *sk)
+{
+ int overlimits, sndbuf = READ_ONCE(sk->sk_sndbuf);
+
+ if (!sk->sk_prot->backpressure)
+ return sndbuf;
+
+ overlimits = READ_ONCE(sk->sk_overlimits);
+
+ return max_t(int, sndbuf - overlimits, SOCK_MIN_SNDBUF);
+}
+
+static inline void sk_backpressure(struct sock *sk)
+{
+ if (sk->sk_prot->backpressure)
+ sk->sk_prot->backpressure(sk);
+}
+
/**
* sk_page_frag - return an appropriate page_frag
* @sk: socket
diff --git a/net/core/sock.c b/net/core/sock.c
index 4cb957d934a2..167d471b176f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2194,6 +2194,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
/* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
refcount_set(&newsk->sk_wmem_alloc, 1);
+ newsk->sk_overlimits = 0;
atomic_set(&newsk->sk_omem_alloc, 0);
sk_init_common(newsk);
--
2.20.1