Re: [net-next] net/smc: transition to RDMA core CQ pooling
From: Jakub Kicinski
Date: Thu Feb 26 2026 - 22:53:27 EST
This is an AI-generated review of your patch. The human sending this
email has considered the AI review valid, or at least plausible.
For local reproduction steps see https://netdev-ai.bots.linux.dev/ai-local.html
---
net/smc: transition to RDMA core CQ pooling
This commit refactors SMC-R completion handling to use the ib_cqe API
and RDMA core CQ pooling, replacing the previous per-device global CQs
and manual tasklet polling. The refactoring aims to eliminate scalability
bottlenecks by enabling multi-CQ parallelization and O(1) context retrieval.
> diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
> index 5feafa98ab1a..4a4924c5cade 100644
> --- a/net/smc/smc_wr.c
> +++ b/net/smc/smc_wr.c
[ ... ]
> @@ -63,55 +60,52 @@ void smc_wr_tx_wait_no_pending_sends(struct smc_link *link)
> wait_event(link->wr_tx_wait, !smc_wr_is_tx_pend(link));
> }
>
> -static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id)
> +static void smc_wr_tx_rdma_process_cqe(struct ib_cq *cq, struct ib_wc *wc)
> {
> - u32 i;
> + struct smc_link *link = wc->qp->qp_context;
>
> - for (i = 0; i < link->wr_tx_cnt; i++) {
> - if (link->wr_tx_pends[i].wr_id == wr_id)
> - return i;
> - }
> - return link->wr_tx_cnt;
> + /* terminate link */
> + if (wc->status)
> + smcr_link_down_cond_sched(link);
> +}
> +
> +static void smc_wr_reg_process_cqe(struct ib_cq *cq, struct ib_wc *wc)
> +{
> + struct smc_link *link = wc->qp->qp_context;
> +
> + if (wc->status)
> + link->wr_reg_state = FAILED;
> + else
> + link->wr_reg_state = CONFIRMED;
> + smc_wr_wakeup_reg_wait(link);
> }
>
> -static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
> +static void smc_wr_tx_process_cqe(struct ib_cq *cq, struct ib_wc *wc)
> {
> - struct smc_wr_tx_pend pnd_snd;
> + struct smc_wr_tx_pend *tx_pend, pnd_snd;
> + struct smc_ib_send_wr *send_wr;
> struct smc_link *link;
> u32 pnd_snd_idx;
>
> link = wc->qp->qp_context;
>
> - if (wc->opcode == IB_WC_REG_MR) {
> - if (wc->status)
> - link->wr_reg_state = FAILED;
> - else
> - link->wr_reg_state = CONFIRMED;
> - smc_wr_wakeup_reg_wait(link);
> - return;
> - }
> + send_wr = container_of(wc->wr_cqe, struct smc_ib_send_wr, cqe);
> + pnd_snd_idx = send_wr->idx;
> +
> + tx_pend = (pnd_snd_idx == link->wr_tx_cnt) ? link->wr_tx_v2_pend :
> + &link->wr_tx_pends[pnd_snd_idx];
> +
> + tx_pend->wc_status = wc->status;
> + memcpy(&pnd_snd, tx_pend, sizeof(pnd_snd));
> + /* clear the full struct smc_wr_tx_pend including .priv */
> + memset(tx_pend, 0, sizeof(*tx_pend));
>
> - pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id);
> if (pnd_snd_idx == link->wr_tx_cnt) {
> - if (link->lgr->smc_version != SMC_V2 ||
> - link->wr_tx_v2_pend->wr_id != wc->wr_id)
> - return;
> - link->wr_tx_v2_pend->wc_status = wc->status;
> - memcpy(&pnd_snd, link->wr_tx_v2_pend, sizeof(pnd_snd));
> - /* clear the full struct smc_wr_tx_pend including .priv */
> - memset(link->wr_tx_v2_pend, 0,
> - sizeof(*link->wr_tx_v2_pend));
> memset(link->lgr->wr_tx_buf_v2, 0,
> sizeof(*link->lgr->wr_tx_buf_v2));
> } else {
> - link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status;
> - if (link->wr_tx_pends[pnd_snd_idx].compl_requested)
> + if (tx_pend->compl_requested)
^^^^^^
Does this read the correct field? The code above memsets tx_pend to zero,
so tx_pend->compl_requested is always 0 at this point and complete() is never called.
The refactoring created a local copy pnd_snd before the memset:
memcpy(&pnd_snd, tx_pend, sizeof(pnd_snd));
memset(tx_pend, 0, sizeof(*tx_pend));
But then it reads from the already-zeroed tx_pend rather than pnd_snd. Should this be
pnd_snd.compl_requested instead?
This affects smc_wr_tx_send_wait(), which sets compl_requested and waits for
the completion that never arrives, causing LLC message timeouts.
> complete(&link->wr_tx_compl[pnd_snd_idx]);
> - memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx],
> - sizeof(pnd_snd));
> - /* clear the full struct smc_wr_tx_pend including .priv */
> - memset(&link->wr_tx_pends[pnd_snd_idx], 0,
> - sizeof(link->wr_tx_pends[pnd_snd_idx]));
> memset(&link->wr_tx_bufs[pnd_snd_idx], 0,
> sizeof(link->wr_tx_bufs[pnd_snd_idx]));
> if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
[ ... ]
> @@ -550,7 +472,7 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk)
> lnk->wr_tx_cnt = min_t(size_t, lnk->max_send_wr,
> lnk->qp_attr.cap.max_send_wr);
> lnk->wr_rx_cnt = min_t(size_t, lnk->max_recv_wr,
> - lnk->qp_attr.cap.max_recv_wr);
> + lnk->qp_attr.cap.max_recv_wr - 1); /* -1 for ib_draib_rq() */
^^^^^^^^^^^^^^
This isn't a bug, but there's a typo in the comment: ib_draib_rq() should be
ib_drain_rq().
> }
[ ... ]
--
pw-bot: cr