Re: [PATCH net-next V3 14/15] net/mlx5: SD, defer vport metadata init until SD is ready
From: Shay Drori
Date: Sun Jun 14 2026 - 12:46:37 EST
On 12/06/2026 14:39, Tariq Toukan wrote:
From: Shay Drory <shayd@xxxxxxxxxx>
Allow SD devices to transition to switchdev before the SD group is
fully up. Metadata allocation requires the SD group to be ready, so
defer it from esw_offloads_enable() until SD shared-FDB activation.
Add mlx5_esw_offloads_init_deferred_metadata() which allocates per-vport
metadata and refreshes the ingress ACLs that were previously programmed
with metadata=0. The helper is idempotent and can be called multiple
times.
Signed-off-by: Shay Drory <shayd@xxxxxxxxxx>
Reviewed-by: Mark Bloch <mbloch@xxxxxxxxxx>
Signed-off-by: Tariq Toukan <tariqt@xxxxxxxxxx>
---
.../net/ethernet/mellanox/mlx5/core/eswitch.h | 1 +
.../mellanox/mlx5/core/eswitch_offloads.c | 79 ++++++++++++++++++-
.../net/ethernet/mellanox/mlx5/core/lib/sd.c | 16 ++++
3 files changed, 93 insertions(+), 3 deletions(-)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
index b2b3150f1f04..fea72b1dedab 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.h
@@ -440,6 +440,7 @@ struct mlx5_eswitch {
void esw_offloads_disable(struct mlx5_eswitch *esw);
int esw_offloads_enable(struct mlx5_eswitch *esw);
+int mlx5_esw_offloads_init_deferred_metadata(struct mlx5_eswitch *esw);
void esw_offloads_cleanup(struct mlx5_eswitch *esw);
int esw_offloads_init(struct mlx5_eswitch *esw);
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 4dc190a4e7b2..8fa7e633451c 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -3675,6 +3675,7 @@ static void esw_offloads_vport_metadata_cleanup(struct mlx5_eswitch *esw,
WARN_ON(vport->metadata != vport->default_metadata);
mlx5_esw_match_metadata_free(esw, vport->default_metadata);
+ vport->default_metadata = 0;
}
static void esw_offloads_metadata_uninit(struct mlx5_eswitch *esw)
@@ -3711,6 +3712,73 @@ static int esw_offloads_metadata_init(struct mlx5_eswitch *esw)
return err;
}
+/* Deferred metadata init for SD devices: allocate vport metadata and
+ * refresh the ingress ACL for every vport whose ACL was created with
+ * metadata=0 in esw_create_offloads_acl_tables() / esw_vport_setup().
+ *
+ * No Rep is loaded at this point ==> no Rep net-dev exists, so no need
+ * to take rtnl lock.
+ *
+ * Safe to call multiple times - subsequent calls are no-ops.
+ */
+int mlx5_esw_offloads_init_deferred_metadata(struct mlx5_eswitch *esw)
+{
+ struct mlx5_vport *manager, *vport;
+ unsigned long i;
+ int err;
+
+ if (!mlx5_eswitch_vport_match_metadata_enabled(esw))
+ return 0;
+
+ manager = mlx5_eswitch_get_vport(esw, esw->manager_vport);
+ if (IS_ERR(manager))
+ return PTR_ERR(manager);
+
+ /* Sanity check: skip if metadata was already initialized */
+ if (manager->default_metadata)
+ return 0;
+
+ err = esw_offloads_metadata_init(esw);
sashiko.dev says:
Does unconditionally allocating new metadata here overwrite any dynamically
assigned vport metadata?
[SD] No. no REPs are loaded until SD LAG is created, so user can't do
any REPs bonding
+ if (err)
+ return err;
+
+ mutex_lock(&esw->state_lock);
+ /* Manager vport doesn't have a rep/netdev loaded but its ingress ACL
+ * was programmed with metadata=0 - refresh it explicitly.
+ */
+ err = mlx5_esw_acl_ingress_vport_metadata_update(esw,
+ esw->manager_vport,
+ 0);
+ if (err)
+ goto err_acl;
+
+ /* UPLINK is never marked enabled but its ACL is programmed in
+ * esw_create_offloads_acl_tables(); refresh it explicitly.
+ */
+ err = mlx5_esw_acl_ingress_vport_metadata_update(esw, MLX5_VPORT_UPLINK,
+ 0);
sashiko.dev says:
Is the rtnl_lock required during these deferred ACL updates?
The code comment assumes no representor netdev exists at this point. However,
for the primary SD device, representor netdevs were loaded and exposed to
user-space much earlier when its switchdev transition completed.
[SD] same as above, no REPs net-dev exists at this point
+ if (err)
+ goto err_acl;
+
+ mlx5_esw_for_each_vport(esw, i, vport) {
+ if (!vport || !vport->enabled)
+ continue;
+ err = mlx5_esw_acl_ingress_vport_metadata_update(esw,
+ vport->vport,
+ 0);
+ if (err)
+ goto err_acl;
+ }
+
+ mutex_unlock(&esw->state_lock);
+ return 0;
+
+err_acl:
+ esw_offloads_metadata_uninit(esw);
+ mutex_unlock(&esw->state_lock);
+ return err;
+}
+
int
esw_vport_create_offloads_acl_tables(struct mlx5_eswitch *esw,
struct mlx5_vport *vport)
@@ -4072,9 +4140,14 @@ int esw_offloads_enable(struct mlx5_eswitch *esw)
if (err)
goto err_roce;
- err = esw_offloads_metadata_init(esw);
- if (err)
- goto err_metadata;
+ /* SD devices defer metadata init until SD is ready and
+ * mlx5_sd_pf_num_get() can return the correct pf_num.
+ */
+ if (!mlx5_get_sd(esw->dev)) {
+ err = esw_offloads_metadata_init(esw);
sashiko.dev says:
Does skipping metadata initialization here prematurely expose representors
with uninitialized metadata?
esw_offloads_enable() skips metadata initialization for SD devices, leaving
vport->metadata = 0. Later in this function, representor netdevs are loaded
and exposed to userspace, which allows userspace to add TC rules matching
metadata=0 before the metadata is actually initialized.
[SD] same as above
+ if (err)
+ goto err_metadata;
+ }
err = esw_set_passing_vport_metadata(esw, true);
if (err)
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c
index b35795bac098..2fcccd329eb5 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/sd.c
@@ -992,6 +992,7 @@ static bool mlx5_sd_all_paired(struct mlx5_core_dev *primary)
static void mlx5_sd_activate_shared_fdb(struct mlx5_core_dev *primary)
{
struct mlx5_sd *sd = mlx5_get_sd(primary);
+ struct mlx5_core_dev *pos;
struct mlx5_lag *ldev;
struct lag_func *pf;
int err;
@@ -1024,6 +1025,21 @@ static void mlx5_sd_activate_shared_fdb(struct mlx5_core_dev *primary)
goto unlock;
}
+ /* Initialize vport metadata for all group devices. This is deferred
+ * from esw_offloads_enable() because mlx5_sd_pf_num_get() requires
+ * the SD group to be ready.
+ */
+ mlx5_sd_for_each_dev(i, primary, pos) {
+ struct mlx5_eswitch *esw = pos->priv.eswitch;
+
+ err = mlx5_esw_offloads_init_deferred_metadata(esw);
+ if (err) {
+ sd_warn(primary, "Failed to init metadata for %s: %d\n",
+ dev_name(pos->device), err);
+ goto unlock;
+ }
+ }
+
err = mlx5_lag_shared_fdb_create(ldev, NULL, 0, sd->group_id);
if (err)
sd_warn(primary, "Failed to create shared FDB: %d\n", err);