Re: [PATCH net-next V3 13/15] net/mlx5: E-Switch, Tie rep load/unload to SD LAG state

From: Shay Drori

Date: Sun Jun 14 2026 - 12:46:35 EST

On 12/06/2026 14:39, Tariq Toukan wrote:

From: Shay Drory <shayd@xxxxxxxxxx>

On an SD device, vport representors are not functional until the SD
group is combined and shared FDB is active. Skip the initial load and
the reload paths in that window; reps are loaded as part of the SD LAG
activation flow once it becomes active.

In addition, explicitly unload representors when SD LAG is destroyed.

Signed-off-by: Shay Drory <shayd@xxxxxxxxxx>
Reviewed-by: Mark Bloch <mbloch@xxxxxxxxxx>
Signed-off-by: Tariq Toukan <tariqt@xxxxxxxxxx>
---
.../net/ethernet/mellanox/mlx5/core/eswitch.h | 4 +++
.../mellanox/mlx5/core/eswitch_offloads.c | 26 +++++++++++++++++++
.../net/ethernet/mellanox/mlx5/core/lag/lag.c | 26 +++++++++++++++++++
.../net/ethernet/mellanox/mlx5/core/lag/lag.h | 1 +
.../mellanox/mlx5/core/lag/shared_fdb.c | 1 +
5 files changed, 58 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index a5f0774834fe..b2b3150f1f04 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -959,6 +959,7 @@ int mlx5_eswitch_offloads_single_fdb_add_one(struct mlx5_eswitch *master_esw,
void mlx5_eswitch_offloads_single_fdb_del_one(struct mlx5_eswitch *master_esw,
struct mlx5_eswitch *slave_esw);
int mlx5_eswitch_reload_ib_reps(struct mlx5_eswitch *esw);
+void mlx5_eswitch_unload_reps(struct mlx5_eswitch *esw);
bool mlx5_eswitch_is_peer(struct mlx5_eswitch *esw,
struct mlx5_eswitch *peer_esw);
@@ -1063,6 +1064,9 @@ mlx5_eswitch_reload_ib_reps(struct mlx5_eswitch *esw)
return 0;
}
+static inline void
+mlx5_eswitch_unload_reps(struct mlx5_eswitch *esw) {}
+
static inline bool
mlx5_eswitch_block_encap(struct mlx5_core_dev *dev, bool from_fdb)
{
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index a24719cfba34..4dc190a4e7b2 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -2863,6 +2863,10 @@ static int mlx5_esw_offloads_rep_load(struct mlx5_eswitch *esw, u16 vport_num)
int rep_type;
int err;
+ if (vport_num != MLX5_VPORT_UPLINK &&
+ mlx5_get_sd(esw->dev) && !mlx5_lag_is_active(esw->dev))
+ return 0;

sashiko.dev says:
If we intentionally skip the initial load of the Ethernet representors
(REP_ETH) here, will secondary devices permanently fail to load them later?
Looking at mlx5_lag_shared_fdb_create(), the code triggers an asynchronous
rescan exclusively for the primary device (dev0):
mlx5_lag_rescan_dev_locked(ldev, dev0, true);
err = mlx5_lag_reload_ib_reps_from_locked(ldev, 0, filter, false);
Because secondary devices are not rescanned, their auxiliary eth-rep drivers
are never re-probed, and their asynchronous reload is never queued. Since the
synchronous reload depends on REP_ETH already being loaded, skipping it here
seems to leave secondary devices permanently without network interfaces.

[SD] primary will invoke mlx5_esw_add_work() for secondary devices as
well via mlx5_eswitch_register_vport_reps_nested().

+
rep = mlx5_eswitch_get_rep(esw, vport_num);
for (rep_type = 0; rep_type < NUM_REP_TYPES; rep_type++) {
err = __esw_offloads_load_rep(esw, rep, rep_type,
@@ -3779,6 +3783,21 @@ static void esw_destroy_offloads_acl_tables(struct mlx5_eswitch *esw)
esw_vport_destroy_offloads_acl_tables(esw, vport);
}
+void mlx5_eswitch_unload_reps(struct mlx5_eswitch *esw)
+{
+ struct mlx5_eswitch_rep *rep;
+ unsigned long i;
+
+ if (!esw || esw->mode != MLX5_ESWITCH_OFFLOADS)
+ return;
+
+ mlx5_esw_for_each_rep(esw, i, rep) {
+ if (rep->vport == MLX5_VPORT_UPLINK)
+ continue;
+ mlx5_esw_offloads_rep_unload(esw, rep->vport);
+ }
+}
+
int mlx5_eswitch_reload_ib_reps(struct mlx5_eswitch *esw)
{
struct mlx5_eswitch_rep *rep;
@@ -3805,6 +3824,10 @@ int mlx5_eswitch_reload_ib_reps(struct mlx5_eswitch *esw)
if (!mlx5_sd_is_primary(esw->dev) &&
rep->vport == MLX5_VPORT_UPLINK)
continue;
+ if (rep->vport != MLX5_VPORT_UPLINK &&
+ mlx5_get_sd(esw->dev) && !mlx5_lag_is_active(esw->dev))
+ continue;
+

Is there a race condition here during SD LAG activation that bypasses the
synchronous load of primary device representors?
In mlx5_lag_shared_fdb_create(), the unbind/rebind of auxiliary drivers for
the primary device queues an asynchronous work item to load REP_ETH.
Immediately following this, mlx5_lag_reload_ib_reps_from_locked() executes
synchronously.
Because the asynchronous work hasn't run yet, REP_ETH is not loaded.
Consequently, this synchronous loop will evaluate the REP_LOADED check as
false and silently skip loading REP_IB.

[SD] The async reload loads both REP_ETH and REP_IB for VF/SF. The
synchronous reload_ib_reps only re-adds IB for reps whose ETH is already
loaded; skipping IB when ETH isn't up yet is not a loss — the async path
loads both. No race.

if (atomic_read(&rep->rep_data[REP_ETH].state) == REP_LOADED)
__esw_offloads_load_rep(esw, rep, REP_IB, NULL);
}
@@ -4764,6 +4787,9 @@ static void mlx5_eswitch_reload_reps_blocked(struct mlx5_eswitch *esw)
return;
}
+ if (mlx5_get_sd(esw->dev) && !mlx5_lag_is_active(esw->dev))
+ return;
+
mlx5_esw_for_each_vport(esw, i, vport) {
if (!vport)
continue;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
index 424478e649ef..28d16fdc3f06 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.c
@@ -1312,6 +1312,32 @@ int mlx5_lag_reload_ib_reps_from_locked(struct mlx5_lag *ldev, u32 flags,
return mlx5_lag_reload_ib_reps(ldev, flags, filter, cont_on_fail);
}
+static void mlx5_lag_unload_reps_unlocked(struct mlx5_lag *ldev, u32 filter)
+{
+ struct lag_func *pf;
+ int i;
+
+ mlx5_lag_for_each(i, 0, ldev, filter) {
+ struct mlx5_eswitch *esw;
+
+ pf = mlx5_lag_pf(ldev, i);
+ esw = pf->dev->priv.eswitch;
+ mlx5_esw_reps_block(esw);
+ mlx5_eswitch_unload_reps(esw);
+ mlx5_esw_reps_unblock(esw);
+ }
+}
+
+void mlx5_lag_unload_reps_from_locked(struct mlx5_lag *ldev, u32 filter)
+{
+ /* Same lock dance as mlx5_lag_reload_ib_reps: drop ldev->lock around
+ * the per-eswitch reps_lock to keep the reps_lock -> ldev->lock order.
+ */
+ mlx5_lag_drop_lock_for_reps(ldev, filter);
+ mlx5_lag_unload_reps_unlocked(ldev, filter);
+ mlx5_lag_retake_lock_after_reps(ldev);
+}
+
void mlx5_disable_lag(struct mlx5_lag *ldev)
{
bool shared_fdb = test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &ldev->mode_flags);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
index 8481ce55c10a..e9f0ef83ce1d 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/lag.h
@@ -310,6 +310,7 @@ int mlx5_lag_num_devs(struct mlx5_lag *ldev);
int mlx5_lag_num_netdevs(struct mlx5_lag *ldev);
int mlx5_lag_reload_ib_reps_from_locked(struct mlx5_lag *ldev, u32 flags,
u32 filter, bool cont_on_fail);
+void mlx5_lag_unload_reps_from_locked(struct mlx5_lag *ldev, u32 filter);
int mlx5_ldev_add_mdev(struct mlx5_lag *ldev, struct mlx5_core_dev *dev,
u32 group_id);
void mlx5_ldev_remove_mdev(struct mlx5_lag *ldev, struct mlx5_core_dev *dev);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lag/shared_fdb.c b/drivers/net/ethernet/mellanox/mlx5/core/lag/shared_fdb.c
index 8d4f2903a101..113866494d16 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lag/shared_fdb.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lag/shared_fdb.c
@@ -296,6 +296,7 @@ void mlx5_lag_shared_fdb_destroy(struct mlx5_lag *ldev, u32 group_id)
pf->sd_fdb_active = false;
}
mlx5_lag_destroy_single_fdb_filter(ldev, group_id);
+ mlx5_lag_unload_reps_from_locked(ldev, filter);

sashiko.dev says:
Does explicitly unloading all representors here render the subsequent IB reload
a dead code no-op?
Immediately after mlx5_lag_unload_reps_from_locked() forcefully unloads REP_ETH
and other representors, this function calls
mlx5_lag_reload_ib_reps_from_locked().
Because REP_ETH was just unloaded, the condition checking if the state is
REP_LOADED inside mlx5_eswitch_reload_ib_reps() will evaluate to false,
silently skipping all IB representors.

[SD] this is intended

}
mlx5_lag_add_devices_filter(ldev, filter);