Re: [PATCH net v3 2/2] net/smc: Use correct buffer sizes when switching between TCP and SMC

From: Wen Gu
Date: Thu May 30 2024 - 05:20:43 EST


Tuning of the effective buffer size through setsockopts was working for
SMC traffic only but not for TCP fall-back connections even before
commit 0227f058aa29 ("net/smc: Unbind r/w buffer size from clcsock and
make them tunable"). That change made it apparent that TCP fall-back
connections would use net.smc.[rw]mem as buffer size instead of
net.ipv4.tcp_[rw]mem.

Amend the code that copies attributes between the (TCP) clcsock and the
SMC socket and adjust buffer sizes appropriately:
- Copy over sk_userlocks so that both sockets agree on whether tuning
via setsockopt is active.
- When falling back to TCP use sk_sndbuf or sk_rcvbuf as specified with
setsockopt. Otherwise, use the sysctl value for TCP/IPv4.
- Likewise, use either values from setsockopt or from sysctl for SMC
(duplicated) on successful SMC connect.

In smc_tcp_listen_work() drop the explicit copy of buffer sizes as that
is taken care of by the attribute copy.

[...]
+/* if set, use value set by setsockopt() - else use IPv4 or SMC sysctl value */
+static void smc_adjust_sock_bufsizes(struct sock *nsk, struct sock *osk,
+ unsigned long mask)
+{
+ struct net *nnet = sock_net(nsk);
+
+ nsk->sk_userlocks = osk->sk_userlocks;
+ if (osk->sk_userlocks & SOCK_SNDBUF_LOCK) {
+ nsk->sk_sndbuf = osk->sk_sndbuf;
+ } else {
+ if (mask == SK_FLAGS_SMC_TO_CLC)
+ WRITE_ONCE(nsk->sk_sndbuf,
+ READ_ONCE(nnet->ipv4.sysctl_tcp_wmem[1]));

Hi Gerd,

I noticed that during TCP connection establishment, tcp_sndbuf_expand()
tunes sk->sk_sndbuf, so clcsock's sk_sndbuf is no longer equal to
sysctl_tcp_wmem[1]. But here we set it back to sysctl_tcp_wmem[1].

So I did some tests to see whether the values of sk_sndbuf and sk_rcvbuf are
as expected in the SMC and fallback cases (see the attached server.c and
client.c for the reproducer). Here are the sysctl values in my environment:

net.ipv4.tcp_wmem = 4096 4096 16777216
net.ipv4.tcp_rmem = 4096 4096 16777216
net.smc.wmem = 65536
net.smc.rmem = 65536


1. No additional sk_{snd|rcv}buf settings

1.1 TCP

./server
./client -i <serv_ip>

results:
- server: sndbuf_size 87040, rcvbuf_size 4096
- client: sndbuf_size 87040, rcvbuf_size 4096

1.2 SMC

smc_run ./server
smc_run ./client -i <serv_ip>

results:
- server: sndbuf_size 131072, rcvbuf_size 131072
- client: sndbuf_size 131072, rcvbuf_size 131072

1.3 SMC, but server fallback

smc_run ./server
./client -i <serv_ip>

results:
- server: sndbuf_size 87040, rcvbuf_size 4096
- client: sndbuf_size 87040, rcvbuf_size 4096

1.4 SMC, but client fallback

./server
smc_run ./client -i <serv_ip>

results:
- server: sndbuf_size 87040, rcvbuf_size 4096
- client: sndbuf_size 4096, rcvbuf_size 4096 <--- I think clcsock's sk_sndbuf should
be the same as 1.1 after fallback?


2. Set server listen sock's and client sock's sk_{snd|rcv}buf
as 16KB by setsockopt() before connection establishment.

2.1 TCP

./server -s 16384
./client -i <serv_ip> -s 16384

results:
- server: sndbuf_size 32768, rcvbuf_size 32768
- client: sndbuf_size 32768, rcvbuf_size 32768

2.2 SMC

smc_run ./server -s 16384
smc_run ./client -i <serv_ip> -s 16384

results:
- server: sndbuf_size 32768, rcvbuf_size 32768
- client: sndbuf_size 32768, rcvbuf_size 32768

2.3 SMC, but server fallback

smc_run ./server -s 16384
./client -i <serv_ip> -s 16384

results:
- server: sndbuf_size 32768, rcvbuf_size 32768
- client: sndbuf_size 32768, rcvbuf_size 32768

2.4 SMC, but client fallback

./server -s 16384
smc_run ./client -i <serv_ip> -s 16384

results:
- server: sndbuf_size 32768, rcvbuf_size 32768
- client: sndbuf_size 32768, rcvbuf_size 32768


In the above 8 sets of tests, 1.4 does not seem to meet expectations.
It is because we reset clcsock's sk_sndbuf to sysctl_tcp_wmem[1] in
smc_copy_sock_settings_to_clc(). I think it should be like 1.1 TCP values
after fallback. What do you think?

If so, we may need to avoid setting clcsock's sk_sndbuf to the sysctl value
in smc_adjust_sock_bufsizes(). Furthermore, maybe all of the assignments of
sysctl values can be omitted, since the smc sock's and clcsock's
sk_{snd|rcv}buf have already been set to the sysctl values during their sock
initialization (smc_sock_alloc() and tcp_init_sock()).


Another question is why 1.3 behaves as expected. The direct cause is that the
server does not call smc_copy_sock_settings_to_clc() on fallback, whereas the
client does so in smc_connect_fallback(). But I could not figure out the
reason for this difference in behavior. Do you have any information? Thanks a lot!


Best regards,
Wen Gu

+ else
+ WRITE_ONCE(nsk->sk_sndbuf,
+ 2 * READ_ONCE(nnet->smc.sysctl_wmem));
+ }
+ if (osk->sk_userlocks & SOCK_RCVBUF_LOCK) {
+ nsk->sk_rcvbuf = osk->sk_rcvbuf;
+ } else {
+ if (mask == SK_FLAGS_SMC_TO_CLC)
+ WRITE_ONCE(nsk->sk_rcvbuf,
+ READ_ONCE(nnet->ipv4.sysctl_tcp_rmem[1]));
+ else
+ WRITE_ONCE(nsk->sk_rcvbuf,
+ 2 * READ_ONCE(nnet->smc.sysctl_rmem));
+ }
+}
+

[...]
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <stdbool.h>
#include <errno.h>
#include <netinet/tcp.h>

/* Fallback definition for toolchains whose headers do not provide AF_SMC. */
#ifndef AF_SMC
#define AF_SMC 43
#endif
/* Address family passed to socket() and stored in sin_family. */
#define NET_PROTOCAL AF_INET
/* Server address used when no -i option was given (ip is NULL). */
#define SERV_IP "11.213.5.33"
/* Server port used when the requested port is 0. */
#define SERV_PORT 10012

/* Server address from the -i option (strdup'ed in main()); NULL if not given. */
char *ip;

/*
 * Connect to the test server, print the socket buffer sizes reported by
 * getsockopt() after the connection is established, then receive one
 * message and send "Response" back.
 *
 * buf_size: if non-zero, value handed to setsockopt() for both SO_SNDBUF
 *           and SO_RCVBUF before connecting; if zero, nothing is set.
 * port:     server port; 0 selects SERV_PORT.
 *
 * The server address is the global ip if it is set, SERV_IP otherwise.
 *
 * Returns 0 in all cases; failures are reported on stdout.
 */
int net_clnt(int buf_size, int port)
{
	int sndbuf_size, rcvbuf_size;
	struct sockaddr_in s_addr;
	char msg[128] = { 0 };
	socklen_t optlen;
	ssize_t len;
	int sock;

	if (!port)
		port = SERV_PORT;

	sock = socket(NET_PROTOCAL, SOCK_STREAM, 0);
	if (sock < 0) {
		printf("create socket failed\n");
		return 0;
	}

	if (buf_size) {
		sndbuf_size = rcvbuf_size = buf_size;
		/* set sndbuf and rcvbuf */
		if (setsockopt(sock, SOL_SOCKET, SO_SNDBUF,
			       &sndbuf_size, sizeof(sndbuf_size))) {
			printf("set sndbuf failed\n");
			goto out;
		}
		if (setsockopt(sock, SOL_SOCKET, SO_RCVBUF,
			       &rcvbuf_size, sizeof(rcvbuf_size))) {
			printf("set rcvbuf failed\n");
			goto out;
		}
	}

	memset(&s_addr, 0, sizeof(s_addr));
	s_addr.sin_family = NET_PROTOCAL;
	s_addr.sin_addr.s_addr = inet_addr(ip ? ip : SERV_IP);
	s_addr.sin_port = htons(port);
	if (connect(sock, (struct sockaddr *)&s_addr, sizeof(s_addr))) {
		printf("connect fail\n");
		goto out;
	}

	/* Start from 0 so a failed getsockopt() never prints garbage. */
	sndbuf_size = 0;
	rcvbuf_size = 0;
	/* optlen is value-result, so re-arm it before every call. */
	optlen = sizeof(sndbuf_size);
	if (getsockopt(sock, SOL_SOCKET, SO_SNDBUF, &sndbuf_size, &optlen))
		printf("get sndbuf failed\n");
	optlen = sizeof(rcvbuf_size);
	if (getsockopt(sock, SOL_SOCKET, SO_RCVBUF, &rcvbuf_size, &optlen))
		printf("get rcvbuf failed\n");
	printf("client: sndbuf_size %d, rcvbuf_size %d\n", sndbuf_size, rcvbuf_size);

	/* Leave room for the terminator: msg is printed with %s below. */
	len = recv(sock, msg, sizeof(msg) - 1, 0);
	if (len < 0) {
		printf("recv fail\n");
	} else {
		msg[len] = '\0';
		printf("get msg: %s\n", msg);
	}
	if (send(sock, "Response", sizeof("Response"), MSG_NOSIGNAL) < 0)
		printf("send fail\n");

out:
	close(sock);
	return 0;
}

/*
 * Test client entry point.
 *
 * Options:
 *   -s <bufsize>  value for SO_SNDBUF/SO_RCVBUF (0 = do not set)
 *   -i <ip>       server address, stored in the global ip
 *   -p <port>     server port (0 = default)
 *
 * When getopt() reports '?', the usage line is printed and the client is
 * not started. The exit status is 0 either way.
 */
int main(int argc, char **argv)
{
	int sockbuf = 0;
	int dport = 0;
	int opt;

	for (;;) {
		opt = getopt(argc, argv, "p:s:i:");
		if (opt == -1)
			break;

		if (opt == '?') {
			printf("usage: ./client -s <bufsize> -i <ip> -p <port>\n");
			return 0;
		}

		if (opt == 's')
			sockbuf = atoi(optarg);
		else if (opt == 'i')
			ip = strdup(optarg);
		else if (opt == 'p')
			dport = atoi(optarg);
	}

	net_clnt(sockbuf, dport);
	return 0;
}
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <arpa/inet.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <errno.h>
#include <stdbool.h>
#include <netinet/tcp.h>

/* Fallback definition for toolchains whose headers do not provide AF_SMC. */
#ifndef AF_SMC
#define AF_SMC 43
#endif
/* Address family passed to socket(), stored in sin_family and used by inet_ntop(). */
#define NET_PROTOCAL AF_INET
/* Local address the listening socket is bound to. */
#define SERV_IP "0.0.0.0"
/* Listening port used when the requested port is 0. */
#define SERV_PORT 10012

/*
 * Accept a single connection on SERV_IP:<port>, print the socket buffer
 * sizes that getsockopt() reports for the accepted socket, then send
 * "Request" and receive one message from the peer.
 *
 * buf_size: if non-zero, value handed to setsockopt() for both SO_SNDBUF
 *           and SO_RCVBUF on the listening socket before bind(); if zero,
 *           nothing is set.
 * port:     listening port; 0 selects SERV_PORT.
 *
 * Returns 0 in all cases; failures are reported on stdout.
 */
int net_serv(int buf_size, int port)
{
	/* Start from 0 so a failed getsockopt() never prints garbage. */
	int sndbuf_size = 0, rcvbuf_size = 0;
	struct sockaddr_in s_addr;
	struct sockaddr_in c_addr;
	char peer_ip[INET_ADDRSTRLEN] = { 0 };
	char msg[128] = "Request";
	socklen_t c_addr_len = sizeof(c_addr);
	socklen_t optlen;
	int l_sock, s_sock;
	ssize_t len;

	if (!port)
		port = SERV_PORT;

	l_sock = socket(NET_PROTOCAL, SOCK_STREAM, 0);
	if (l_sock < 0) {
		printf("create listen socket failed\n");
		return 0;
	}

	if (buf_size) {
		sndbuf_size = rcvbuf_size = buf_size;
		/* set sndbuf and rcvbuf */
		if (setsockopt(l_sock, SOL_SOCKET, SO_SNDBUF,
			       &sndbuf_size, sizeof(sndbuf_size))) {
			printf("set sndbuf failed\n");
			goto out_listen;
		}
		if (setsockopt(l_sock, SOL_SOCKET, SO_RCVBUF,
			       &rcvbuf_size, sizeof(rcvbuf_size))) {
			printf("set rcvbuf failed\n");
			goto out_listen;
		}
	}

	memset(&s_addr, 0, sizeof(s_addr));
	s_addr.sin_family = NET_PROTOCAL;
	s_addr.sin_addr.s_addr = inet_addr(SERV_IP);
	s_addr.sin_port = htons(port);
	if (bind(l_sock, (struct sockaddr *)&s_addr, sizeof(s_addr))) {
		printf("bind listen socket error %d\n", errno);
		goto out_listen;
	}
	if (listen(l_sock, 20)) {
		printf("listen error\n");
		goto out_listen;
	}

	s_sock = accept(l_sock, (struct sockaddr *)&c_addr, &c_addr_len);
	if (s_sock < 0) {
		printf("accept fail\n");
		goto out_listen;
	}
	inet_ntop(NET_PROTOCAL, &c_addr.sin_addr, peer_ip, sizeof(peer_ip));
	/* sin_port is in network byte order; convert before printing. */
	printf("accept connection: ip %s port %d\n",
	       peer_ip, ntohs(c_addr.sin_port));

	/* optlen is value-result, so re-arm it before every call. */
	sndbuf_size = 0;
	rcvbuf_size = 0;
	optlen = sizeof(sndbuf_size);
	if (getsockopt(s_sock, SOL_SOCKET, SO_SNDBUF, &sndbuf_size, &optlen))
		printf("get sndbuf failed\n");
	optlen = sizeof(rcvbuf_size);
	if (getsockopt(s_sock, SOL_SOCKET, SO_RCVBUF, &rcvbuf_size, &optlen))
		printf("get rcvbuf failed\n");
	printf("server: sndbuf_size %d, rcvbuf_size %d\n", sndbuf_size, rcvbuf_size);

	if (send(s_sock, "Request", sizeof("Request"), MSG_NOSIGNAL) < 0)
		printf("send fail\n");
	/* Leave room for the terminator: msg is printed with %s below. */
	len = recv(s_sock, msg, sizeof(msg) - 1, 0);
	if (len < 0) {
		printf("recv fail\n");
	} else {
		msg[len] = '\0';
		printf("get msg: %s\n", msg);
	}

	close(s_sock);
out_listen:
	close(l_sock);
	return 0;
}

/*
 * Test server entry point.
 *
 * Options:
 *   -s <bufsize>  value for SO_SNDBUF/SO_RCVBUF (0 = do not set)
 *   -p <port>     listening port (0 = default)
 *
 * When getopt() reports '?', the usage line is printed and the server is
 * not started. The exit status is 0 either way.
 */
int main(int argc, char **argv)
{
	int sockbuf = 0;
	int lport = 0;
	int opt = getopt(argc, argv, "p:s:");

	while (opt != -1) {
		if (opt == '?') {
			printf("usage: ./server -s <bufsize> -p <port>\n");
			return 0;
		}

		if (opt == 's')
			sockbuf = atoi(optarg);
		else if (opt == 'p')
			lport = atoi(optarg);

		opt = getopt(argc, argv, "p:s:");
	}

	net_serv(sockbuf, lport);
	return 0;
}