Re: [PATCH net] net/mlx5e: Precompute xdpsq assignments for mlx5e_xdp_xmit()
From: Tariq Toukan
Date: Tue Feb 24 2026 - 02:36:48 EST
On 24/02/2026 6:32, Finn Dayton wrote:
On 2/23/26, 5:46 AM, "Tariq Toukan" <ttoukan.linux@xxxxxxxxx <mailto:ttoukan.linux@xxxxxxxxx>> wrote:
On 23/02/2026 2:05, Finn Dayton wrote:
mlx5e_xdp_xmit() selects an XDP SQ (Send Queue) using smp_processor_id()
(CPU ID). When doing XDP_REDIRECT from a CPU whose ID is
>= priv->channels.num, mlx5e_xdp_xmit() returns -ENXIO and the redirect fails.
Previous discussion proposed using modulo in mlx5e_xdp_xmit() to map
CPU IDs into the channel range, but modulo/division is too costly in
the hot path.
That discussion reached an agreement to use a while loop with
subtraction. It's expected to be fast, and optimizes the common case.
Instead, this solution precomputes per-cpu priv->xdpsq assignments when
channels are (re)configured and does a single lookup in mlx5e_xdp_xmit().
Because multiple CPUs map to the same xdpsq when CPU count exceeds
channel count, serialize xdp_xmit on the ring with xdp_tx_lock.
What's the advantage of this solution over the one we already agreed to?
Subtraction in a while loop is great for the common case (cpu_id < channels.num).
But in worst case, i.e. CPUs >> XDP SQs, the while-subtraction approach
is O(cpu_id / channels.num) iterations. With 256 CPUs and 8 channels, cpu_id=255 does
31 subtractions; with 2 channels it can be 127 iterations in the mlx5e_xdp_xmit() hot path.
Precomputing moves the expensive operation out of the hot path into (re)configuration
time and makes runtime lookups constant. The tradeoff is added per-cpu state that needs
to be rebuilt whenever channels are reconfigured.
Given modern systems where CPU count can greatly exceed the number of XDP SQs, constant-
time lookups avoid a large/variable loop in the hot path.
Valid point.
The while loop solution is significantly simpler.
We can go with this one, but must make sure we take care of all needed syncs, for safety.
See comments on original mail.
Also, is the performance impact measurable?
Do you see the regression in your tests?
Thank you,
Finn
Fixes: 58b99ee3e3eb ("net/mlx5e: Add support for XDP_REDIRECT in device-out side")
Link: https://urldefense.us/v3/__https://lore.kernel.org/netdev/20251031231038.1092673-1-zijianzhang@xxxxxxxxxxxxx <mailto:20251031231038.1092673-1-zijianzhang@xxxxxxxxxxxxx>/__;!!Fqb0NABsjhF0Kh8I!Y83cZX7QBDtN-kWQrfMsOLg2GhV5biqKsT4Dkj0Zk677rm_78pu6_4ij7TtkVuh859B_YQf8n0yDiBMAtI0Caie0lpk$
Link: https://urldefense.us/v3/__https://lore.kernel.org/netdev/44f69955-b566-4fb1-904d-f551046ff2d4@xxxxxxxxx <mailto:44f69955-b566-4fb1-904d-f551046ff2d4@xxxxxxxxx>__;!!Fqb0NABsjhF0Kh8I!Y83cZX7QBDtN-kWQrfMsOLg2GhV5biqKsT4Dkj0Zk677rm_78pu6_4ij7TtkVuh859B_YQf8n0yDiBMAtI0CNvBcoVA$
Cc: stable@xxxxxxxxxxxxxxx <mailto:stable@xxxxxxxxxxxxxxx> # 6.12+
Signed-off-by: Finn Dayton <finnius.dayton@xxxxxxxxxx <mailto:finnius.dayton@xxxxxxxxxx>>
---
Testing:
- XDP forwarding / XDP_REDIRECT verified with both low CPU ids and
CPU ids greater than the number of send queues.
- No -ENXIO observed, successful forwarding.
drivers/net/ethernet/mellanox/mlx5/core/en.h | 4 +++
.../net/ethernet/mellanox/mlx5/core/en/xdp.c | 16 +++++++----
.../net/ethernet/mellanox/mlx5/core/en_main.c | 28 +++++++++++++++++++
3 files changed, 43 insertions(+), 5 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index ea2cd1f5d1d0..387954201640 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -519,6 +519,8 @@ struct mlx5e_xdpsq {
/* control path */
struct mlx5_wq_ctrl wq_ctrl;
struct mlx5e_channel *channel;
+ /* serialize writes by multiple CPUs to this send queue */
+ spinlock_t xdp_tx_lock;
} ____cacheline_aligned_in_smp;
struct mlx5e_xdp_buff {
@@ -909,6 +911,8 @@ struct mlx5e_priv {
struct mlx5e_rq drop_rq;
struct mlx5e_channels channels;
+ /* selects the xdpsq during mlx5e_xdp_xmit() */
+ int __percpu *send_queue_idx_ptr;
struct mlx5e_rx_res *rx_res;
u32 *tx_rates;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
index 80f9fc10877a..2dd44ad873a1 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xdp.c
@@ -845,7 +845,7 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
struct mlx5e_priv *priv = netdev_priv(dev);
struct mlx5e_xdpsq *sq;
int nxmit = 0;
- int sq_num;
+ int send_queue_idx = 0;
int i;
/* this flag is sufficient, no need to test internal sq state */
@@ -855,13 +855,19 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
return -EINVAL;
- sq_num = smp_processor_id();
- if (unlikely(sq_num >= priv->channels.num))
+ if (unlikely(!priv->send_queue_idx_ptr))
return -ENXIO;
- sq = priv->channels.c[sq_num]->xdpsq;
+ send_queue_idx = *this_cpu_ptr(priv->send_queue_idx_ptr);
+ if (unlikely(send_queue_idx >= priv->channels.num || send_queue_idx < 0))
+ return -ENXIO;
+ sq = priv->channels.c[send_queue_idx]->xdpsq;
+ /* The number of queues configured on a netdev may be smaller than the
+ * CPU pool, so two CPUs might map to this queue. We must serialize writes.
+ */
+ spin_lock(&sq->xdp_tx_lock);
for (i = 0; i < n; i++) {
struct mlx5e_xmit_data_frags xdptxdf = {};
struct xdp_frame *xdpf = frames[i];
@@ -941,7 +947,7 @@ int mlx5e_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames,
if (flags & XDP_XMIT_FLUSH)
mlx5e_xmit_xdp_doorbell(sq);
-
+ spin_unlock(&sq->xdp_tx_lock);
return nxmit;
}
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 7eb691c2a1bd..adef35d06b89 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -1492,6 +1492,7 @@ static int mlx5e_alloc_xdpsq(struct mlx5e_channel *c,
sq->pdev = c->pdev;
sq->mkey_be = c->mkey_be;
sq->channel = c;
+ spin_lock_init(&sq->xdp_tx_lock);
sq->uar_map = c->bfreg->map;
sq->min_inline_mode = params->tx_min_inline_mode;
sq->hw_mtu = MLX5E_SW2HW_MTU(params, params->sw_mtu) - ETH_FCS_LEN;
@@ -3283,10 +3284,30 @@ static void mlx5e_build_txq_maps(struct mlx5e_priv *priv)
smp_wmb();
}
+static void build_priv_to_xdpsq_associations(struct mlx5e_priv *priv)
+{
+ /*
+ * Build the mapping from CPU to XDP send queue index for priv.
+ * This is used by mlx5e_xdp_xmit() to determine which xdpsq (send queue)
+ * should handle the xdptx data, based on the CPU running mlx5e_xdp_xmit()
+ * and the target priv (netdev).
+ */
+ int send_queue_idx, cpu;
+
+ if (unlikely(priv->channels.num == 0))
+ return;
+
+ for_each_possible_cpu(cpu) {
+ send_queue_idx = cpu % priv->channels.num;
+ *per_cpu_ptr(priv->send_queue_idx_ptr, cpu) = send_queue_idx;
+ }
+}
+
void mlx5e_activate_priv_channels(struct mlx5e_priv *priv)
{
mlx5e_build_txq_maps(priv);
mlx5e_activate_channels(priv, &priv->channels);
+ build_priv_to_xdpsq_associations(priv);
mlx5e_xdp_tx_enable(priv);
/* dev_watchdog() wants all TX queues to be started when the carrier is
@@ -6263,8 +6284,14 @@ int mlx5e_priv_init(struct mlx5e_priv *priv,
if (!priv->fec_ranges)
goto err_free_channel_stats;
+ priv->send_queue_idx_ptr = alloc_percpu(int);
+ if (!priv->send_queue_idx_ptr)
+ goto err_free_fec_ranges;
+
return 0;
+err_free_fec_ranges:
+ kfree(priv->fec_ranges);
err_free_channel_stats:
kfree(priv->channel_stats);
err_free_tx_rates:
@@ -6295,6 +6322,7 @@ void mlx5e_priv_cleanup(struct mlx5e_priv *priv)
for (i = 0; i < priv->stats_nch; i++)
kvfree(priv->channel_stats[i]);
kfree(priv->channel_stats);
+ free_percpu(priv->send_queue_idx_ptr);
kfree(priv->tx_rates);
kfree(priv->txq2sq_stats);
kfree(priv->txq2sq);