[PATCH net-next v2] net/mlx5e: Precompute xdpsq assignments for mlx5e_xdp_xmit()

From: Finn Dayton

Date: Mon Mar 02 2026 - 00:59:54 EST


mlx5e_xdp_xmit() currently selects the xdpsq (send queue) using
smp_processor_id() (i.e. cpu id). When doing XDP_REDIRECT from a cpu
with id >= priv->channels.num, however, mlx5e_xdp_xmit() returns -ENXIO
and the redirect fails.

Previous discussion proposed mapping the cpu id to a send queue in
mlx5e_xdp_xmit() with a modulo or a while-loop subtraction, but these
approaches add hot-path overhead on modern systems where the number of
logical cores greatly exceeds the number of XDP send queues (xdpsqs).

This patch instead precomputes a per-CPU cpu -> xdpsq mapping when
channels are (re)configured, so mlx5e_xdp_xmit() can select its send
queue with a constant-time lookup.

Because multiple CPUs may now map to the same xdpsq (whenever the CPU
count exceeds the channel count), transmissions in mlx5e_xdp_xmit() are
serialized with a per-queue tx_lock.

Link: https://lore.kernel.org/all/610D8F9E-0038-46D9-AD8A-1D596236B1EF@xxxxxxxxxx/
Link: https://lore.kernel.org/all/474c1f71-3a5c-4fe5-a01e-80f2ba95fd7e@xxxxxxxxxxxxx/
Signed-off-by: Finn Dayton <finnius.dayton@xxxxxxxxxx>
---
v2:
- Removed unnecessary guards
- Improved variable naming and placement
- Change mapping from cpu -> index to cpu -> xdpsq
- Call smp_wmb() after updates to mapping

drivers/net/ethernet/mellanox/mlx5/core/en.h | 4 +++
.../net/ethernet/mellanox/mlx5/core/en/xdp.c | 17 ++++++-------
.../net/ethernet/mellanox/mlx5/core/en_main.c | 25 +++++++++++++++++++
3 files changed, 36 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index ea2cd1f5d1d0..713dc7f9bae3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -493,6 +493,8 @@ struct mlx5e_xdpsq {
u16 pc;
struct mlx5_wqe_ctrl_seg *doorbell_cseg;
struct mlx5e_tx_mpwqe mpwqe;
+ /* serialize writes by multiple CPUs to this send queue */
+ spinlock_t tx_lock;

struct mlx5e_cq cq;

@@ -898,6 +900,8 @@ struct mlx5e_priv {
struct mlx5e_selq selq;
struct mlx5e_txqsq **txq2sq;
struct mlx5e_sq_stats **txq2sq_stats;
+ /* selects the xdpsq during mlx5e_xdp_xmit() */
+ struct mlx5e_xdpsq * __percpu *send_queue_ptr;

#ifdef CONFIG_MLX5_CORE_EN_DCB
struct mlx5e_dcbx_dp dcbx_dp;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
index 80f9fc10877a..1db83a69055c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
@@ -845,7 +845,6 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
struct mlx5e_priv *priv = netdev_priv(dev);
struct mlx5e_xdpsq *sq;
int nxmit = 0;
- int sq_num;
int i;

/* this flag is sufficient, no need to test internal sq state */
@@ -854,14 +853,12 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,

if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
return -EINVAL;
-
- sq_num = smp_processor_id();
-
- if (unlikely(sq_num >= priv->channels.num))
- return -ENXIO;
-
- sq = priv->channels.c[sq_num]->xdpsq;
-
+ /* Per-CPU xdpsq mapping, rebuilt on channel (re)configuration while XDP TX is disabled */
+ sq = *this_cpu_ptr(priv->send_queue_ptr);
+ /* The number of queues configured on a netdev may be smaller than the
+ * CPU pool, so two CPUs might map to this queue. We must serialize writes.
+ */
+ spin_lock(&sq->tx_lock);
for (i = 0; i < n; i++) {
struct mlx5e_xmit_data_frags xdptxdf = {};
struct xdp_frame *xdpf = frames[i];
@@ -941,7 +938,7 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,

if (flags & XDP_XMIT_FLUSH)
mlx5e_xmit_xdp_doorbell(sq);
-
+ spin_unlock(&sq->tx_lock);
return nxmit;
}

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index b6c12460b54a..434db74f096b 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1505,6 +1505,7 @@ static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c,
sq->stop_room = param->is_mpw ? mlx5e_stop_room_for_mpwqe(mdev) :
mlx5e_stop_room_for_max_wqe(mdev);
sq->max_sq_mpw_wqebbs = mlx5e_get_max_sq_aligned_wqebbs(mdev);
+ spin_lock_init(&sq->tx_lock);

param->wq.db_numa_node = cpu_to_node(c->cpu);
err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, wq, &sq->wq_ctrl);
@@ -3283,10 +3284,27 @@ static void mlx5e_build_txq_maps(struct mlx5e_priv *priv)
smp_wmb();
}

+static void mlx5e_build_xdpsq_maps(struct mlx5e_priv *priv)
+{
+ /* Build mapping from CPU id to XDP send queue, used by
+ * mlx5e_xdp_xmit() to determine which send queue to transmit packet on.
+ */
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ int send_queue_idx = cpu % priv->channels.num;
+ struct mlx5e_xdpsq *sq = priv->channels.c[send_queue_idx]->xdpsq;
+ *per_cpu_ptr(priv->send_queue_ptr, cpu) = sq;
+ }
+ /* Publish the new CPU->xdpsq map before re-enabling XDP TX */
+ smp_wmb();
+}
+
void mlx5e_activate_priv_channels(struct mlx5e_priv *priv)
{
mlx5e_build_txq_maps(priv);
mlx5e_activate_channels(priv, &priv->channels);
+ mlx5e_build_xdpsq_maps(priv);
mlx5e_xdp_tx_enable(priv);

/* dev_watchdog() wants all TX queues to be started when the carrier is
@@ -6262,8 +6280,14 @@ int mlx5e_priv_init(struct mlx5e_priv *priv,
if (!priv->fec_ranges)
goto err_free_channel_stats;

+ priv->send_queue_ptr = alloc_percpu(struct mlx5e_xdpsq *);
+ if (!priv->send_queue_ptr)
+ goto err_free_fec_ranges;
+
return 0;

+err_free_fec_ranges:
+ kfree(priv->fec_ranges);
err_free_channel_stats:
kfree(priv->channel_stats);
err_free_tx_rates:
@@ -6290,6 +6314,7 @@ void mlx5e_priv_cleanup(struct mlx5e_priv *priv)
if (!priv->mdev)
return;

+ free_percpu(priv->send_queue_ptr);
kfree(priv->fec_ranges);
for (i = 0; i < priv->stats_nch; i++)
kvfree(priv->channel_stats[i]);
--
2.43.0