Re: [PATCH net-next 12/15] net/mlx5: LAG, add MPESW over SD LAG support
From: Shay Drori
Date: Sun Jun 07 2026 - 07:26:45 EST
On 04/06/2026 14:44, Tariq Toukan wrote:
From: Shay Drory <shayd@xxxxxxxxxx>
Enable MPESW LAG creation over SD LAG members, forming a composite LAG
hierarchy. This allows bonding multiple SD groups together under a
single MPESW configuration with shared FDB.
When enabling composite MPESW, the individual SD LAG shared FDB
configurations are temporarily torn down and recreated when the
composite LAG is disabled.
Signed-off-by: Shay Drory <shayd@xxxxxxxxxx>
Reviewed-by: Mark Bloch <mbloch@xxxxxxxxxx>
Signed-off-by: Tariq Toukan <tariqt@xxxxxxxxxx>
---
.../net/ethernet/mellanox/mlx5/core/lag/lag.c | 6 ++
.../net/ethernet/mellanox/mlx5/core/lag/lag.h | 8 ++
.../ethernet/mellanox/mlx5/core/lag/mpesw.c | 95 +++++++++++++++++--
.../ethernet/mellanox/mlx5/core/lag/mpesw.h | 4 +
4 files changed, 105 insertions(+), 8 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
index 9566fbf59fdb..25a9012e3014 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
@@ -2545,6 +2545,7 @@ void mlx5_lag_disable_change(struct mlx5_core_dev *dev)
struct mlx5_core_dev *primary;
struct mlx5_lag *ldev;
struct lag_func *pf;
+ bool mpesw;
int i;
ldev = mlx5_lag_dev(dev);
@@ -2553,6 +2554,9 @@ void mlx5_lag_disable_change(struct mlx5_core_dev *dev)
primary = mlx5_sd_get_primary(dev) ?: dev;
mlx5_devcom_comp_lock(primary->priv.hca_devcom_comp);
+ mpesw = ldev->mode == MLX5_LAG_MODE_MPESW;
+ if (mpesw)
+ mlx5_mpesw_sd_devcoms_lock(ldev);
mutex_lock(&ldev->lock);
ldev->mode_changes_in_progress++;
@@ -2564,6 +2568,8 @@ void mlx5_lag_disable_change(struct mlx5_core_dev *dev)
}
mutex_unlock(&ldev->lock);
+ if (mpesw)
+ mlx5_mpesw_sd_devcoms_unlock(ldev);
mlx5_devcom_comp_unlock(primary->priv.hca_devcom_comp);
if (!sd_devcom)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
index 34350b0a7307..3a90d360d724 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
@@ -157,6 +157,14 @@ __mlx5_lag_is_sd(struct mlx5_lag *ldev, struct mlx5_core_dev *dev)
return pf && pf->group_id != 0;
}
+static inline bool
+__mlx5_lag_dev_is_port(struct mlx5_lag *ldev, struct mlx5_core_dev *dev)
+{
+ struct lag_func *pf = mlx5_lag_pf_by_dev(ldev, dev);
+
+ return pf && xa_get_mark(&ldev->pfs, pf->idx, MLX5_LAG_XA_MARK_PORT);
+}
+
static inline bool
__mlx5_lag_is_active(struct mlx5_lag *ldev)
{
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c
index 2cb44084e239..50bfb450c71e 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.c
@@ -15,7 +15,7 @@ static void mlx5_mpesw_metadata_cleanup(struct mlx5_lag *ldev)
u32 pf_metadata;
int i;
- mlx5_ldev_for_each(i, 0, ldev) {
+ mlx5_lag_for_each(i, 0, ldev, MLX5_LAG_FILTER_ALL) {
dev = mlx5_lag_pf(ldev, i)->dev;
esw = dev->priv.eswitch;
pf_metadata = ldev->lag_mpesw.pf_metadata[i];
@@ -36,7 +36,7 @@ static int mlx5_mpesw_metadata_set(struct mlx5_lag *ldev)
u32 pf_metadata;
int i, err;
- mlx5_ldev_for_each(i, 0, ldev) {
+ mlx5_lag_for_each(i, 0, ldev, MLX5_LAG_FILTER_ALL) {
dev = mlx5_lag_pf(ldev, i)->dev;
esw = dev->priv.eswitch;
pf_metadata = mlx5_esw_match_metadata_alloc(esw);
@@ -52,7 +52,7 @@ static int mlx5_mpesw_metadata_set(struct mlx5_lag *ldev)
goto err_metadata;
}
- mlx5_ldev_for_each(i, 0, ldev) {
+ mlx5_lag_for_each(i, 0, ldev, MLX5_LAG_FILTER_ALL) {
dev = mlx5_lag_pf(ldev, i)->dev;
mlx5_notifier_call_chain(dev->priv.events, MLX5_DEV_EVENT_MULTIPORT_ESW,
(void *)0);
@@ -65,6 +65,48 @@ static int mlx5_mpesw_metadata_set(struct mlx5_lag *ldev)
return err;
}
+static void mlx5_mpesw_restore_sd_fdb(struct mlx5_lag *ldev)
+{
+ struct lag_func *pf;
+ int err, i;
+
+ mlx5_ldev_for_each(i, 0, ldev) {
+ pf = mlx5_lag_pf(ldev, i);
+ err = mlx5_lag_shared_fdb_create(ldev, NULL, 0, pf->group_id);
sashiko.dev says:
Does this accidentally activate global FW LAG for non-SD ports?
This loop unconditionally calls mlx5_lag_shared_fdb_create() using
pf->group_id. For standalone non-SD ports, pf->group_id is 0.
Inside mlx5_lag_shared_fdb_create(), passing group_id = 0 sets the filter
to MLX5_LAG_FILTER_ALL, which bypasses the SD-specific single-FDB path and
incorrectly triggers a full global LAG activation via mlx5_activate_lag().
Should there be a check for pf->group_id != 0 or pf->sd_fdb_active here,
similar to the check in mlx5_mpesw_teardown_sd_fdb()?
pf->sd_fdb_active is false here by definition-we are re-creating it.
And this API is already gated by pf->group_id check.
When MPESW is destroyed, all PFs have group ID (if this is SD NIC)
+ if (err)
+ mlx5_core_warn(pf->dev,
+ "Failed to restore SD shared FDB (%d)\n",
+ err);
+ }
+}
+
+static int mlx5_mpesw_teardown_sd_fdb(struct mlx5_lag *ldev)
+{
+ struct lag_func *pf;
+ int i;
+
+ mlx5_ldev_for_each(i, 0, ldev) {
+ pf = mlx5_lag_pf(ldev, i);
+ if (!pf->sd_fdb_active)
+ continue;
+ mlx5_lag_shared_fdb_destroy(ldev, pf->group_id);
+ }
+ return 0;
+}
+
+static bool mlx5_lag_has_sd_group(struct mlx5_lag *ldev)
+{
+ struct lag_func *pf;
+ int i;
+
+ mlx5_ldev_for_each(i, 0, ldev) {
+ pf = mlx5_lag_pf(ldev, i);
+ if (pf->group_id)
+ return true;
+ }
+ return false;
+}
+
static int mlx5_lag_enable_mpesw(struct mlx5_lag *ldev)
{
int idx = mlx5_lag_get_dev_index_by_seq(ldev, MLX5_LAG_P1);
@@ -92,10 +134,17 @@ static int mlx5_lag_enable_mpesw(struct mlx5_lag *ldev)
if (err)
return err;
+ if (mlx5_lag_has_sd_group(ldev))
+ mlx5_mpesw_teardown_sd_fdb(ldev);
+
err = mlx5_lag_shared_fdb_create(ldev, NULL, MLX5_LAG_MODE_MPESW,
MLX5_LAG_FILTER_ALL);
if (err) {
- mlx5_core_warn(dev0, "Failed to create LAG in MPESW mode (%d)\n", err);
+ mlx5_core_warn(dev0,
+ "Failed to create LAG in MPESW mode (%d)\n",
+ err);
+ if (mlx5_lag_has_sd_group(ldev))
+ mlx5_mpesw_restore_sd_fdb(ldev);
mlx5_mpesw_metadata_cleanup(ldev);
return err;
}
@@ -105,9 +154,36 @@ static int mlx5_lag_enable_mpesw(struct mlx5_lag *ldev)
void mlx5_lag_disable_mpesw(struct mlx5_lag *ldev)
{
- if (ldev->mode == MLX5_LAG_MODE_MPESW) {
- mlx5_mpesw_metadata_cleanup(ldev);
- mlx5_lag_shared_fdb_destroy(ldev, MLX5_LAG_FILTER_ALL);
+ if (ldev->mode != MLX5_LAG_MODE_MPESW)
+ return;
+
+ mlx5_mpesw_metadata_cleanup(ldev);
+ mlx5_lag_shared_fdb_destroy(ldev, MLX5_LAG_FILTER_ALL);
+ if (mlx5_lag_has_sd_group(ldev))
+ mlx5_mpesw_restore_sd_fdb(ldev);
+}
+
+void mlx5_mpesw_sd_devcoms_lock(struct mlx5_lag *ldev)
+{
+ struct mlx5_devcom_comp_dev *sd_devcom;
+ int i;
+
+ mlx5_ldev_for_each(i, 0, ldev) {
+ sd_devcom = mlx5_sd_get_devcom(mlx5_lag_pf(ldev, i)->dev);
sashiko.dev says:
Can this lead to a use-after-free on the pf pointer?
The mlx5_ldev_for_each() loop iterates over the ldev->pfs xarray, fetching pf
pointers via mlx5_lag_pf() and dereferencing pf->dev.
However, this helper is called in mlx5_lag_disable_change() before acquiring
ldev->lock and without an explicit rcu_read_lock(). Because xa_load() drops
its internal RCU locks before returning, the returned pf pointer is completely
unprotected.
If a concurrent device unload via mlx5_lag_remove_mdev() acquires ldev->lock,
calls xa_erase(), and frees the pf object with kfree(pf), would this access
to pf->dev trigger a use-after-free?
No. mlx5_mpesw_sd_devcoms_lock() runs only when ldev->mode == MPESW
(checked under the held HCA devcom_comp). mlx5_lag_remove_mdev() is
reachable only after mlx5_eswitch_disable()/mlx5_lag_disable_change()
has torn down MPESW for that LAG (setting ldev->mode = NONE).
The shared HCA devcom_comp serializes the two paths, so any thread
that reaches remove_mdev's xa_erase has already moved ldev->mode out
of MPESW, and any concurrent disable_change waiting on HCA devcom
will then see mpesw=false and skip the iteration.
+ if (sd_devcom)
+ mlx5_devcom_comp_lock(sd_devcom);
+ }
+}
+
+void mlx5_mpesw_sd_devcoms_unlock(struct mlx5_lag *ldev)
+{
+ struct mlx5_devcom_comp_dev *sd_devcom;
+ int i;
+
+ mlx5_ldev_for_each_reverse(i, MLX5_MAX_PORTS, 0, ldev) {
+ sd_devcom = mlx5_sd_get_devcom(mlx5_lag_pf(ldev, i)->dev);
+ if (sd_devcom)
+ mlx5_devcom_comp_unlock(sd_devcom);
}
}
@@ -122,6 +198,7 @@ static void mlx5_mpesw_work(struct work_struct *work)
return;
mlx5_devcom_comp_lock(devcom);
+ mlx5_mpesw_sd_devcoms_lock(ldev);
mutex_lock(&ldev->lock);
if (ldev->mode_changes_in_progress) {
mpesww->result = -EAGAIN;
@@ -134,6 +211,7 @@ static void mlx5_mpesw_work(struct work_struct *work)
mlx5_lag_disable_mpesw(ldev);
unlock:
mutex_unlock(&ldev->lock);
+ mlx5_mpesw_sd_devcoms_unlock(ldev);
mlx5_devcom_comp_unlock(devcom);
complete(&mpesww->comp);
}
@@ -199,7 +277,8 @@ bool mlx5_lag_is_mpesw(struct mlx5_core_dev *dev)
{
struct mlx5_lag *ldev = mlx5_lag_dev(dev);
- return ldev && ldev->mode == MLX5_LAG_MODE_MPESW;
+ return ldev && ldev->mode == MLX5_LAG_MODE_MPESW &&
+ __mlx5_lag_dev_is_port(ldev, dev);
}
EXPORT_SYMBOL(mlx5_lag_is_mpesw);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h
index b767dbb4f457..5099723ba0f7 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/mpesw.h
@@ -33,8 +33,12 @@ void mlx5_lag_mpesw_disable(struct mlx5_core_dev *dev);
int mlx5_lag_mpesw_enable(struct mlx5_core_dev *dev);
#ifdef CONFIG_MLX5_ESWITCH
void mlx5_lag_disable_mpesw(struct mlx5_lag *ldev);
+void mlx5_mpesw_sd_devcoms_lock(struct mlx5_lag *ldev);
+void mlx5_mpesw_sd_devcoms_unlock(struct mlx5_lag *ldev);
#else
static inline void mlx5_lag_disable_mpesw(struct mlx5_lag *ldev) {}
+static inline void mlx5_mpesw_sd_devcoms_lock(struct mlx5_lag *ldev) {}
+static inline void mlx5_mpesw_sd_devcoms_unlock(struct mlx5_lag *ldev) {}
#endif /* CONFIG_MLX5_ESWITCH */
#ifdef CONFIG_MLX5_ESWITCH