Re: [PATCH net-next 07/15] net/mlx5: SD, support switchdev mode transition with shared FDB

From: Shay Drori

Date: Sun Jun 07 2026 - 07:03:52 EST




On 04/06/2026 14:44, Tariq Toukan wrote:
From: Shay Drory <shayd@xxxxxxxxxx>

When the eswitch transitions, propagate the change to SD: secondaries
get their TX flow table root reconfigured for the new mode, and when
all group devices move to switchdev, the per-group shared FDB is
activated.

Shared FDB activation is best-effort - failure does not block the
eswitch transition; the next transition retries.

Note: the existing mlx5_get_sd() guard that blocks switchdev for SD
devices is intentionally retained. It will be removed once all
supporting patches are in place.

Signed-off-by: Shay Drory <shayd@xxxxxxxxxx>
Reviewed-by: Mark Bloch <mbloch@xxxxxxxxxx>
Signed-off-by: Tariq Toukan <tariqt@xxxxxxxxxx>
---
.../mellanox/mlx5/core/eswitch_offloads.c | 24 +++-
.../net/ethernet/mellanox/mlx5/core/lib/sd.c | 133 +++++++++++++++++-
.../net/ethernet/mellanox/mlx5/core/lib/sd.h | 7 +
3 files changed, 156 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 366531d8ef02..1133267a53fb 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -46,6 +46,7 @@
#include "fs_core.h"
#include "lib/mlx5.h"
#include "lib/devcom.h"
+#include "lib/sd.h"
#include "lib/eq.h"
#include "lib/fs_chains.h"
#include "en_tc.h"
@@ -3164,6 +3165,9 @@ static void esw_unset_master_egress_rule(struct mlx5_core_dev *dev,
vport = mlx5_eswitch_get_vport(dev->priv.eswitch,
dev->priv.eswitch->manager_vport);
+ if (!vport->egress.acl)
+ return;
+
esw_acl_egress_ofld_bounce_rule_destroy(vport, MLX5_CAP_GEN(slave_dev, vhca_id));
if (xa_empty(&vport->egress.offloads.bounce_rules)) {
@@ -3182,6 +3186,9 @@ int mlx5_eswitch_offloads_single_fdb_add_one(struct mlx5_eswitch *master_esw,
if (err)
return err;
+ if (!mlx5_sd_is_primary(slave_esw->dev))
+ return 0;
+
err = esw_set_master_egress_rule(master_esw->dev,
slave_esw->dev, max_slaves);
if (err)
@@ -3401,7 +3408,7 @@ void mlx5_esw_offloads_devcom_init(struct mlx5_eswitch *esw,
return;
if ((MLX5_VPORT_MANAGER(esw->dev) || mlx5_core_is_ecpf_esw_manager(esw->dev)) &&
- !mlx5_lag_is_supported(esw->dev))
+ (!mlx5_lag_is_supported(esw->dev) && !mlx5_get_sd(esw->dev)))
return;
xa_init(&esw->paired);
@@ -4219,11 +4226,6 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode,
if (IS_ERR(esw))
return PTR_ERR(esw);
- if (mlx5_fw_reset_in_progress(esw->dev)) {
- NL_SET_ERR_MSG_MOD(extack, "Can't change eswitch mode during firmware reset");
- return -EBUSY;
- }
-
if (esw_mode_from_devlink(mode, &mlx5_mode))
return -EINVAL;
@@ -4233,11 +4235,18 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode,
return -EPERM;
}
+ if (mlx5_fw_reset_in_progress(esw->dev)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "Can't change eswitch mode during firmware reset");
+ return -EBUSY;
+ }
+
/* Avoid try_lock, active/inactive mode change is not restricted */
if (mlx5_devlink_switchdev_active_mode_change(esw, mode))
return 0;
mlx5_lag_disable_change(esw->dev);
+
err = mlx5_esw_try_lock(esw);
if (err < 0) {
NL_SET_ERR_MSG_MOD(extack, "Can't change mode, E-Switch is busy");
@@ -4304,6 +4313,9 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode,
esw->eswitch_operation_in_progress = false;
unlock:
mlx5_esw_unlock(esw);
+ /* Shared FDB activation is creating LAG which is changing reps. */
+ if (!err)
+ mlx5_sd_eswitch_mode_set(esw->dev, mlx5_mode);
enable_lag:
mlx5_lag_enable_change(esw->dev);
return err;
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c
index 8b1f3a25d80d..d2ed156ed1c6 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c
@@ -5,6 +5,8 @@
#include "../lag/lag.h"
#include "mlx5_core.h"
#include "lib/mlx5.h"
+#include "devlink.h"
+#include "eswitch.h"
#include "fs_cmd.h"
#include <linux/mlx5/eswitch.h>
#include <linux/mlx5/vport.h>
@@ -33,6 +35,8 @@ struct mlx5_sd {
struct { /* secondary */
struct mlx5_core_dev *primary_dev;
u32 alias_obj_id;
+ /* TX flow table root in switchdev (silent) config */
+ bool tx_root_silent;
};
};
};
@@ -669,6 +673,29 @@ static void sd_secondary_destroy_alias_ft(struct mlx5_core_dev *secondary)
MLX5_GENERAL_OBJECT_TYPES_FLOW_TABLE_ALIAS);
}
+static int mlx5_sd_secondary_conf_tx_root(struct mlx5_core_dev *secondary,
+ bool disconnect)
+{
+ struct mlx5_sd *sd = mlx5_get_sd(secondary);
+ int err;
+
+ /* Idempotent: skip if TX root is already in the requested state. */
+ if (sd->tx_root_silent == disconnect)
+ return 0;
+
+ if (disconnect)
+ err = mlx5_fs_cmd_set_tx_flow_table_root(secondary, 0, true);
+ else
+ err = mlx5_fs_cmd_set_tx_flow_table_root(secondary,
+ sd->alias_obj_id,
+ false);
+ if (err)
+ return err;
+
+ sd->tx_root_silent = disconnect;
+ return 0;
+}
+
static int sd_cmd_set_secondary(struct mlx5_core_dev *secondary,
struct mlx5_core_dev *primary,
u8 *alias_key)
@@ -688,7 +715,8 @@ static int sd_cmd_set_secondary(struct mlx5_core_dev *secondary,
if (err)
goto err_unset_silent;
- err = mlx5_fs_cmd_set_tx_flow_table_root(secondary, sd->alias_obj_id, false);
+ err = mlx5_fs_cmd_set_tx_flow_table_root(secondary, sd->alias_obj_id,
+ false);
if (err)
goto err_destroy_alias_ft;
@@ -707,7 +735,7 @@ static void sd_cmd_unset_secondary(struct mlx5_core_dev *secondary)
struct mlx5_sd *primary_sd;
primary_sd = mlx5_get_sd(mlx5_sd_get_primary(secondary));
- mlx5_fs_cmd_set_tx_flow_table_root(secondary, 0, true);
+ mlx5_sd_secondary_conf_tx_root(secondary, true);
sd_secondary_destroy_alias_ft(secondary);
if (!primary_sd->fw_silents_secondaries)
mlx5_fs_cmd_set_l2table_entry_silent(secondary, 0);
@@ -936,6 +964,107 @@ struct auxiliary_device *mlx5_sd_get_adev(struct mlx5_core_dev *dev,
return &primary_adev->adev;
}
+#ifdef CONFIG_MLX5_ESWITCH
+/* All SD members must have completed esw_offloads_enable (i.e., reached
+ * mlx5_esw_offloads_devcom_init) and become eswitch-peers of the primary.
+ * Until then, mlx5_eswitch_is_peer() returns false for the not-yet-paired
+ * member and shared_fdb_supported_filter would reject. When all PFs transition
+ * in parallel, only the last one to finish satisfies this gate; the earlier
+ * ones return 0 silently here.
+ */
+static bool mlx5_sd_all_paired(struct mlx5_core_dev *primary)
+{
+ struct mlx5_eswitch *primary_esw = primary->priv.eswitch;
+ struct mlx5_core_dev *pos;
+ int i;
+
+ mlx5_sd_for_each_secondary(i, primary, pos) {
+ if (!mlx5_eswitch_is_peer(primary_esw, pos->priv.eswitch))
+ return false;
+ }
+ return true;
+}
+
+static void mlx5_sd_activate_shared_fdb(struct mlx5_core_dev *primary)
+{
+ struct mlx5_sd *sd = mlx5_get_sd(primary);
+ struct mlx5_lag *ldev;
+ struct lag_func *pf;
+ int err;
+ int i;
+
+ if (!mlx5_sd_all_paired(primary))
+ return;
+
+ ldev = mlx5_lag_dev(primary);
+ if (!ldev) {
+ sd_warn(primary, "Shared FDB MUST have ldev\n");
+ return;
+ }
+
+ mutex_lock(&ldev->lock);
+ /* Check if SD FDB is already active for this group */
+ mlx5_lag_for_each(i, 0, ldev, sd->group_id) {
+ pf = mlx5_lag_pf(ldev, i);
+ if (pf->sd_fdb_active)
+ goto unlock;
+ break;
+ }
+
+ if (!mlx5_lag_shared_fdb_supported_filter(ldev, sd->group_id)) {
+ sd_warn(primary, "Shared FDB not supported\n");
+ goto unlock;
+ }
+
+ err = mlx5_lag_shared_fdb_create(ldev, NULL, 0, sd->group_id);

sashiko.dev says:

Does this setup race with concurrent eswitch teardown when deferred metadata
initialization runs?
In the final state of the series, mlx5_sd_activate_shared_fdb() iterates over
the SD group devices and calls mlx5_esw_offloads_init_deferred_metadata(esw)
on peer eswitches without acquiring their mode_lock:
mlx5_sd_for_each_dev(i, primary, pos) {
struct mlx5_eswitch *esw = pos->priv.eswitch;
err = mlx5_esw_offloads_init_deferred_metadata(esw);
...
Concurrently, a peer device transitioning to LEGACY mode executes
esw_offloads_disable() while holding its mode_lock. The initialization thread
only holds the sd->devcom lock, which the teardown thread briefly holds and
releases before entering esw_offloads_disable().
Could this concurrent execution allow the initialization thread to allocate
metadata or update ACLs on a peer eswitch while the teardown thread is actively
destroying them, leading to memory leaks or a use-after-free?

Indeed, it can race.
Will fix in V2.


+ if (err)
+ sd_warn(primary, "Failed to create shared FDB: %d\n", err);
+ else
+ sd_info(primary, "Shared FDB created\n");
+
+unlock:
+ mutex_unlock(&ldev->lock);
+}
+
+void mlx5_sd_eswitch_mode_set(struct mlx5_core_dev *dev, u16 mlx5_mode)
+{
+ struct mlx5_core_dev *primary;
+ struct mlx5_sd *sd;
+ int err;
+
+ sd = mlx5_get_sd(dev);
+ if (!sd || !mlx5_devcom_comp_is_ready(sd->devcom))
+ return;
+
+ mlx5_devcom_comp_lock(sd->devcom);
+ if (!mlx5_devcom_comp_is_ready(sd->devcom))
+ goto unlock;
+
+ primary = mlx5_sd_get_primary(dev);
+
+ /* Secondary devices need TX root reconfiguration */
+ if (dev != primary) {
+ bool disconnect = (mlx5_mode == MLX5_ESWITCH_OFFLOADS);
+
+ err = mlx5_sd_secondary_conf_tx_root(dev, disconnect);
+ if (err) {
+ sd_warn(dev, "Failed to set TX root: %d\n", err);
+ goto unlock;
+ }
+ }
+
+ /* Try to activate shared FDB when all devices are in switchdev.
+ * Shared FDB is optional - failure here doesn't fail the transition.
+ */
+ if (mlx5_mode == MLX5_ESWITCH_OFFLOADS)
+ mlx5_sd_activate_shared_fdb(primary);
+
+unlock:
+ mlx5_devcom_comp_unlock(sd->devcom);
+}
+
+#endif /* CONFIG_MLX5_ESWITCH */
+
void mlx5_sd_put_adev(struct auxiliary_device *actual_adev,
struct auxiliary_device *adev)
{
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h
index 7a41adbcee71..cb88bf34079a 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.h
@@ -45,6 +45,13 @@ mlx5_sd_get_devcom(struct mlx5_core_dev *dev)
}
#endif
+#ifdef CONFIG_MLX5_ESWITCH
+void mlx5_sd_eswitch_mode_set(struct mlx5_core_dev *dev, u16 mlx5_mode);
+#else
+static inline void
+mlx5_sd_eswitch_mode_set(struct mlx5_core_dev *dev, u16 mlx5_mode) { return; }
+#endif
+
#define mlx5_sd_for_each_dev_from_to(i, primary, ix_from, to, pos) \
for (i = ix_from; \
(pos = mlx5_sd_primary_get_peer(primary, i)) && pos != (to); i++)