[RFCv2 1/3] tcp: Use smaller mtu probes if RACK is enabled

From: Leonard Crestez
Date: Wed May 26 2021 - 06:38:51 EST


RACK allows detecting a loss in rtt + min_rtt / 4 based on just one
extra packet. If enabled, use this instead of relying on fast retransmit.

Suggested-by: Neal Cardwell <ncardwell@xxxxxxxxxx>
Signed-off-by: Leonard Crestez <cdleonard@xxxxxxxxx>
---
Documentation/networking/ip-sysctl.rst | 5 +++++
include/net/netns/ipv4.h | 1 +
net/ipv4/sysctl_net_ipv4.c | 7 +++++++
net/ipv4/tcp_ipv4.c | 1 +
net/ipv4/tcp_output.c | 26 +++++++++++++++++++++++++-
5 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index a5c250044500..7ab52a105a5d 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -349,10 +349,15 @@ tcp_mtu_probe_floor - INTEGER
If MTU probing is enabled this caps the minimum MSS used for search_low
for the connection.

Default : 48

+tcp_mtu_probe_rack - BOOLEAN
+ If RACK is also enabled, require fewer follow-up packets behind each MTU probe
+
+ Default: 1
+
tcp_min_snd_mss - INTEGER
TCP SYN and SYNACK messages usually advertise an ADVMSS option,
as described in RFC 1122 and RFC 6691.

If this ADVMSS option is smaller than tcp_min_snd_mss,
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 746c80cd4257..b4ff12f25a7f 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -112,10 +112,11 @@ struct netns_ipv4 {
#ifdef CONFIG_NET_L3_MASTER_DEV
u8 sysctl_tcp_l3mdev_accept;
#endif
u8 sysctl_tcp_mtu_probing;
int sysctl_tcp_mtu_probe_floor;
+ int sysctl_tcp_mtu_probe_rack;
int sysctl_tcp_base_mss;
int sysctl_tcp_min_snd_mss;
int sysctl_tcp_probe_threshold;
u32 sysctl_tcp_probe_interval;

diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 4fa77f182dcb..275c91fb9cf8 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -847,10 +847,17 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &tcp_min_snd_mss_min,
.extra2 = &tcp_min_snd_mss_max,
},
+ {
+ .procname = "tcp_mtu_probe_rack",
+ .data = &init_net.ipv4.sysctl_tcp_mtu_probe_rack,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
{
.procname = "tcp_probe_threshold",
.data = &init_net.ipv4.sysctl_tcp_probe_threshold,
.maxlen = sizeof(int),
.mode = 0644,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 4f5b68a90be9..ed8af4a7325b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2892,10 +2892,11 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
net->ipv4.sysctl_tcp_min_snd_mss = TCP_MIN_SND_MSS;
net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
net->ipv4.sysctl_tcp_mtu_probe_floor = TCP_MIN_SND_MSS;
+ net->ipv4.sysctl_tcp_mtu_probe_rack = 1;

net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;

diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index bde781f46b41..9691f435477b 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2311,10 +2311,19 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)
}

return true;
}

+/* Return 1 if sk's netns has both RACK loss detection and tcp_mtu_probe_rack enabled */
+static int tcp_mtu_probe_is_rack(const struct sock *sk)
+{
+ struct net *net = sock_net(sk);
+
+ return (net->ipv4.sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION &&
+ net->ipv4.sysctl_tcp_mtu_probe_rack);
+}
+
/* Create a new MTU probe if we are ready.
* MTU probe is regularly attempting to increase the path MTU by
* deliberately sending larger packets. This discovers routing
* changes resulting in larger path MTUs.
*
@@ -2351,11 +2360,26 @@ static int tcp_mtu_probe(struct sock *sk)
* smaller than a threshold, backoff from probing.
*/
mss_now = tcp_current_mss(sk);
probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
icsk->icsk_mtup.search_low) >> 1);
- size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
+ /* Probing the MTU requires one packet which is larger than the current MSS as well
+ * as enough following mtu-sized packets to ensure that a probe loss can be
+ * detected without a full Retransmit Time Out.
+ */
+ if (tcp_mtu_probe_is_rack(sk)) {
+ /* RACK allows detecting a loss in rtt + min_rtt / 4 based on just one extra packet.
+ * Use two to account for unrelated losses.
+ */
+ size_needed = probe_size + 2 * tp->mss_cache;
+ } else {
+ /* Without RACK, send enough extra packets to trigger fast retransmit.
+ * This is the dynamic DupThresh (tp->reordering) + 1.
+ */
+ size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
+ }
+
interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
/* When misfortune happens, we are reprobing actively,
* and then reprobe timer has expired. We stick with current
* probing process by not resetting search range to its orignal.
*/
--
2.25.1