Re: [net-next] net/smc: transition to RDMA core CQ pooling
From: D. Wythe
Date: Fri Feb 27 2026 - 04:35:15 EST
On Thu, Feb 26, 2026 at 07:53:00PM -0800, Jakub Kicinski wrote:
> This is an AI-generated review of your patch. The human sending this
> email has considered the AI review valid, or at least plausible.
>
> For local reproduction steps see https://netdev-ai.bots.linux.dev/ai-local.html
> ---
> net/smc: transition to RDMA core CQ pooling
>
> This commit refactors SMC-R completion handling to use the ib_cqe API
> and RDMA core CQ pooling, replacing the previous per-device global CQs
> and manual tasklet polling. The refactoring aims to eliminate scalability
> bottlenecks by enabling multi-CQ parallelization and O(1) context retrieval.
>
> > diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c
> > index 5feafa98ab1a..4a4924c5cade 100644
> > --- a/net/smc/smc_wr.c
> > +++ b/net/smc/smc_wr.c
>
> [ ... ]
>
> > @@ -63,55 +60,52 @@ void smc_wr_tx_wait_no_pending_sends(struct smc_link *link)
> > wait_event(link->wr_tx_wait, !smc_wr_is_tx_pend(link));
> > }
> >
> > -static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id)
> > +static void smc_wr_tx_rdma_process_cqe(struct ib_cq *cq, struct ib_wc *wc)
> > {
> > - u32 i;
> > + struct smc_link *link = wc->qp->qp_context;
> >
> > - for (i = 0; i < link->wr_tx_cnt; i++) {
> > - if (link->wr_tx_pends[i].wr_id == wr_id)
> > - return i;
> > - }
> > - return link->wr_tx_cnt;
> > + /* terminate link */
> > + if (wc->status)
> > + smcr_link_down_cond_sched(link);
> > +}
> > +
> > +static void smc_wr_reg_process_cqe(struct ib_cq *cq, struct ib_wc *wc)
> > +{
> > + struct smc_link *link = wc->qp->qp_context;
> > +
> > + if (wc->status)
> > + link->wr_reg_state = FAILED;
> > + else
> > + link->wr_reg_state = CONFIRMED;
> > + smc_wr_wakeup_reg_wait(link);
> > }
> >
> > -static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
> > +static void smc_wr_tx_process_cqe(struct ib_cq *cq, struct ib_wc *wc)
> > {
> > - struct smc_wr_tx_pend pnd_snd;
> > + struct smc_wr_tx_pend *tx_pend, pnd_snd;
> > + struct smc_ib_send_wr *send_wr;
> > struct smc_link *link;
> > u32 pnd_snd_idx;
> >
> > link = wc->qp->qp_context;
> >
> > - if (wc->opcode == IB_WC_REG_MR) {
> > - if (wc->status)
> > - link->wr_reg_state = FAILED;
> > - else
> > - link->wr_reg_state = CONFIRMED;
> > - smc_wr_wakeup_reg_wait(link);
> > - return;
> > - }
> > + send_wr = container_of(wc->wr_cqe, struct smc_ib_send_wr, cqe);
> > + pnd_snd_idx = send_wr->idx;
> > +
> > + tx_pend = (pnd_snd_idx == link->wr_tx_cnt) ? link->wr_tx_v2_pend :
> > + &link->wr_tx_pends[pnd_snd_idx];
> > +
> > + tx_pend->wc_status = wc->status;
> > + memcpy(&pnd_snd, tx_pend, sizeof(pnd_snd));
> > + /* clear the full struct smc_wr_tx_pend including .priv */
> > + memset(tx_pend, 0, sizeof(*tx_pend));
> >
> > - pnd_snd_idx = smc_wr_tx_find_pending_index(link, wc->wr_id);
> > if (pnd_snd_idx == link->wr_tx_cnt) {
> > - if (link->lgr->smc_version != SMC_V2 ||
> > - link->wr_tx_v2_pend->wr_id != wc->wr_id)
> > - return;
> > - link->wr_tx_v2_pend->wc_status = wc->status;
> > - memcpy(&pnd_snd, link->wr_tx_v2_pend, sizeof(pnd_snd));
> > - /* clear the full struct smc_wr_tx_pend including .priv */
> > - memset(link->wr_tx_v2_pend, 0,
> > - sizeof(*link->wr_tx_v2_pend));
> > memset(link->lgr->wr_tx_buf_v2, 0,
> > sizeof(*link->lgr->wr_tx_buf_v2));
> > } else {
> > - link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status;
> > - if (link->wr_tx_pends[pnd_snd_idx].compl_requested)
> > + if (tx_pend->compl_requested)
> ^^^^^^
>
> Does this read the correct field? The code above memsets tx_pend to zero,
> so compl_requested is always 0 and complete() is never called.
>
> The refactoring created a local copy pnd_snd before the memset:
>
> memcpy(&pnd_snd, tx_pend, sizeof(pnd_snd));
> memset(tx_pend, 0, sizeof(*tx_pend));
>
> But then it reads from the zeroed tx_pend instead of pnd_snd. Should this be
> pnd_snd.compl_requested instead?
>
> This affects smc_wr_tx_send_wait(), which sets compl_requested and waits for
> the completion that never arrives, causing LLC message timeouts.
>
Nice catch! It should indeed read pnd_snd.compl_requested here, since tx_pend
has already been zeroed by the memset above.
Will fix in v2.
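For clarity, a small userspace sketch of the intended ordering (the struct and
helper name are simplified stand-ins, not the actual kernel code): the flag
must be read from the saved copy pnd_snd, because tx_pend is zeroed first.

```c
#include <stdbool.h>
#include <string.h>

/* Hypothetical, cut-down stand-in for struct smc_wr_tx_pend. */
struct smc_wr_tx_pend {
	bool compl_requested;
	int wc_status;
};

/*
 * Save the pending entry, clear the slot, then decide from the saved
 * copy whether a completion should be signalled. Reading
 * tx_pend->compl_requested after the memset would always see 0.
 */
static bool tx_slot_compl_requested(struct smc_wr_tx_pend *tx_pend)
{
	struct smc_wr_tx_pend pnd_snd;

	memcpy(&pnd_snd, tx_pend, sizeof(pnd_snd));
	/* clear the full slot, as in the patch */
	memset(tx_pend, 0, sizeof(*tx_pend));

	return pnd_snd.compl_requested;	/* not tx_pend->compl_requested */
}
```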
> > complete(&link->wr_tx_compl[pnd_snd_idx]);
> > - memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx],
> > - sizeof(pnd_snd));
> > - /* clear the full struct smc_wr_tx_pend including .priv */
> > - memset(&link->wr_tx_pends[pnd_snd_idx], 0,
> > - sizeof(link->wr_tx_pends[pnd_snd_idx]));
> > memset(&link->wr_tx_bufs[pnd_snd_idx], 0,
> > sizeof(link->wr_tx_bufs[pnd_snd_idx]));
> > if (!test_and_clear_bit(pnd_snd_idx, link->wr_tx_mask))
>
> [ ... ]
>
> > @@ -550,7 +472,7 @@ void smc_wr_remember_qp_attr(struct smc_link *lnk)
> > lnk->wr_tx_cnt = min_t(size_t, lnk->max_send_wr,
> > lnk->qp_attr.cap.max_send_wr);
> > lnk->wr_rx_cnt = min_t(size_t, lnk->max_recv_wr,
> > - lnk->qp_attr.cap.max_recv_wr);
> > + lnk->qp_attr.cap.max_recv_wr - 1); /* -1 for ib_draib_rq() */
> ^^^^^^^^^^^^^^
> This isn't a bug, but there's a typo in the comment: ib_draib_rq() should be
> ib_drain_rq().
Will fix in v2. Thanks.
D. Wythe
>
> > }
>
> [ ... ]
> --
> pw-bot: cr