[PATCH net-next 4/7] net/mlx5: E-Switch, fix deadlock between devlink lock and esw->wq

From: Tariq Toukan

Date: Thu Apr 09 2026 - 08:02:03 EST


From: Mark Bloch <mbloch@xxxxxxxxxx>

mlx5_eswitch_cleanup() calls destroy_workqueue() while holding the
devlink lock (via mlx5_uninit_one()). Workers on the queue call
devl_lock() before checking whether their work is stale, which
deadlocks:

mlx5_uninit_one (holds devlink lock)
mlx5_eswitch_cleanup()
destroy_workqueue() <- waits for workers to finish
worker: devl_lock() <- blocked on
devlink lock held above

The same pattern affects mlx5_devlink_eswitch_mode_set(), which can
drain the queue while holding devlink lock.

Fix by making esw_wq_handler() check the generation counter BEFORE
acquiring the devlink lock, using devl_trylock() in a loop with
cond_resched(). If the work is stale the handler exits immediately
without ever contending for the lock.

To guarantee stale detection, increment the generation counter at
every E-Switch operation boundary:

- mlx5_eswitch_cleanup(): increment before destroy_workqueue() so
any in-flight worker sees stale and drains without blocking. Also
move mlx5_esw_qos_cleanup() to after destroy_workqueue() so it
runs only once all workers have finished.
- mlx5_devlink_eswitch_mode_set(): increment before starting the
mode change so workers from the previous mode are discarded.
- mlx5_eswitch_disable(): increment so workers queued before the
disable see stale and exit.
- mlx5_eswitch_enable() and mlx5_eswitch_disable_sriov(): increment
so in-flight work against an old VF count or mode is discarded
when these operations begin.

Remove the conditional atomic_inc() in
mlx5_eswitch_event_handler_unregister(); the mlx5_eswitch_disable()
increment now covers it unconditionally and earlier in the call chain.

Signed-off-by: Mark Bloch <mbloch@xxxxxxxxxx>
Reviewed-by: Cosmin Ratiu <cratiu@xxxxxxxxxx>
Signed-off-by: Tariq Toukan <tariqt@xxxxxxxxxx>
---
.../net/ethernet/mellanox/mlx5/core/eswitch.c | 11 +++++++----
.../mellanox/mlx5/core/eswitch_offloads.c | 18 +++++++++++++++++-
2 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
index 1986d4d0e886..d315484390c8 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
@@ -1073,10 +1073,8 @@ static void mlx5_eswitch_event_handler_register(struct mlx5_eswitch *esw)
static void mlx5_eswitch_event_handler_unregister(struct mlx5_eswitch *esw)
{
if (esw->mode == MLX5_ESWITCH_OFFLOADS &&
- mlx5_eswitch_is_funcs_handler(esw->dev)) {
+ mlx5_eswitch_is_funcs_handler(esw->dev))
mlx5_eq_notifier_unregister(esw->dev, &esw->esw_funcs.nb);
- atomic_inc(&esw->generation);
- }
}

static void mlx5_eswitch_clear_vf_vports_info(struct mlx5_eswitch *esw)
@@ -1701,6 +1699,8 @@ int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int num_vfs)
if (toggle_lag)
mlx5_lag_disable_change(esw->dev);

+ atomic_inc(&esw->generation);
+
if (!mlx5_esw_is_fdb_created(esw)) {
ret = mlx5_eswitch_enable_locked(esw, num_vfs);
} else {
@@ -1745,6 +1745,7 @@ void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw, bool clear_vf)
esw_info(esw->dev, "Unload vfs: mode(%s), nvfs(%d), necvfs(%d), active vports(%d)\n",
esw->mode == MLX5_ESWITCH_LEGACY ? "LEGACY" : "OFFLOADS",
esw->esw_funcs.num_vfs, esw->esw_funcs.num_ec_vfs, esw->enabled_vports);
+ atomic_inc(&esw->generation);

if (!mlx5_core_is_ecpf(esw->dev)) {
mlx5_eswitch_unload_vf_vports(esw, esw->esw_funcs.num_vfs);
@@ -1809,6 +1810,7 @@ void mlx5_eswitch_disable(struct mlx5_eswitch *esw)
return;

devl_assert_locked(priv_to_devlink(esw->dev));
+ atomic_inc(&esw->generation);
mlx5_lag_disable_change(esw->dev);
mlx5_eswitch_disable_locked(esw);
esw->mode = MLX5_ESWITCH_LEGACY;
@@ -2110,8 +2112,9 @@ void mlx5_eswitch_cleanup(struct mlx5_eswitch *esw)

esw_info(esw->dev, "cleanup\n");

- mlx5_esw_qos_cleanup(esw);
+ atomic_inc(&esw->generation);
destroy_workqueue(esw->work_queue);
+ mlx5_esw_qos_cleanup(esw);
WARN_ON(refcount_read(&esw->qos.refcnt));
mutex_destroy(&esw->state_lock);
WARN_ON(!xa_empty(&esw->offloads.vhca_map));
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
index 23af5a12dc07..988595e1b425 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
@@ -3699,7 +3699,20 @@ static void esw_wq_handler(struct work_struct *work)
esw = host_work->esw;
devlink = priv_to_devlink(esw->dev);

- devl_lock(devlink);
+ /* Check for stale work BEFORE acquiring devlink lock.
+ * mlx5_eswitch_cleanup() increments the generation counter
+ * before destroy_workqueue() while holding devlink lock,
+ * so acquiring devlink lock here would deadlock.
+ */
+ for (;;) {
+ if (host_work->work_gen != atomic_read(&esw->generation))
+ goto free;
+
+ if (devl_trylock(devlink))
+ break;
+
+ cond_resched();
+ }

/* Stale work from one or more mode changes ago. Bail out. */
if (host_work->work_gen != atomic_read(&esw->generation))
@@ -3709,6 +3722,7 @@ static void esw_wq_handler(struct work_struct *work)

unlock:
devl_unlock(devlink);
+free:
kfree(host_work);
}

@@ -4161,6 +4175,8 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode,
goto skip;
}

+ atomic_inc(&esw->generation);
+
if (mlx5_mode == MLX5_ESWITCH_LEGACY)
esw->dev->priv.flags |= MLX5_PRIV_FLAGS_SWITCH_LEGACY;
if (mlx5_mode == MLX5_ESWITCH_OFFLOADS)
--
2.44.0