Re: [PATCH net-next V2 4/7] net/mlx5: E-Switch, serialize representor lifecycle
From: Mark Bloch
Date: Sat May 02 2026 - 16:05:36 EST
On 01/05/2026 7:16, Tariq Toukan wrote:
> From: Mark Bloch <mbloch@xxxxxxxxxx>
>
> Representor callbacks can be registered and unregistered while the
> E-Switch is already in switchdev mode, and the same E-Switch may also be
> reconfigured by devlink, VF changes and SF changes. Serialize these paths
> with the per-E-Switch representor mutex instead of relying on ad-hoc bit
> state and wait queues.
>
> Take the representor lock around the mode transition, VF/SF representor
> changes and representor ops registration. Keep mode_lock and the
> representor lock unnested by using the operation flag while the mode lock
> is dropped. During mode changes, drop the representor lock around the
> auxiliary bus rescan because driver bind/unbind may register or unregister
> representor ops.
>
> Split representor ops registration into locked public wrappers and blocked
> internal helpers, clear the ops pointer on unregister, and add nested
> wrappers for the shared-FDB master IB path that registers peer
> representor ops while another E-Switch representor lock is already held.
>
> Signed-off-by: Mark Bloch <mbloch@xxxxxxxxxx>
> Signed-off-by: Tariq Toukan <tariqt@xxxxxxxxxx>
> ---
> drivers/infiniband/hw/mlx5/ib_rep.c | 6 +-
> .../net/ethernet/mellanox/mlx5/core/eswitch.c | 10 ++
> .../mellanox/mlx5/core/eswitch_offloads.c | 102 ++++++++++++++++--
> .../ethernet/mellanox/mlx5/core/sf/devlink.c | 5 +
> include/linux/mlx5/eswitch.h | 6 ++
> 5 files changed, 119 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/infiniband/hw/mlx5/ib_rep.c b/drivers/infiniband/hw/mlx5/ib_rep.c
> index 1709b628702e..65d8767d1830 100644
> --- a/drivers/infiniband/hw/mlx5/ib_rep.c
> +++ b/drivers/infiniband/hw/mlx5/ib_rep.c
> @@ -262,9 +262,10 @@ mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep)
> struct mlx5_core_dev *peer_mdev;
> struct mlx5_eswitch *esw;
>
> + /* Called while the master E-Switch reps_lock is held. */
> mlx5_lag_for_each_peer_mdev(mdev, peer_mdev, i) {
> esw = peer_mdev->priv.eswitch;
> - mlx5_eswitch_unregister_vport_reps(esw, REP_IB);
> + mlx5_eswitch_unregister_vport_reps_nested(esw, REP_IB);
> }
> mlx5_ib_release_transport(mdev);
> }
> @@ -284,9 +285,10 @@ static void mlx5_ib_register_peer_vport_reps(struct mlx5_core_dev *mdev)
> struct mlx5_eswitch *esw;
> int i;
>
> + /* Called while the master E-Switch reps_lock is held. */
> mlx5_lag_for_each_peer_mdev(mdev, peer_mdev, i) {
> esw = peer_mdev->priv.eswitch;
> - mlx5_eswitch_register_vport_reps(esw, &rep_ops, REP_IB);
> + mlx5_eswitch_register_vport_reps_nested(esw, &rep_ops, REP_IB);
> }
> }
>
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
> index 66a773a99876..f70737437954 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
> @@ -1712,6 +1712,7 @@ int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int num_vfs)
> mlx5_lag_disable_change(esw->dev);
>
> mlx5_eswitch_invalidate_wq(esw);
> + mlx5_esw_reps_block(esw);
>
> if (!mlx5_esw_is_fdb_created(esw)) {
> ret = mlx5_eswitch_enable_locked(esw, num_vfs);
> @@ -1735,6 +1736,8 @@ int mlx5_eswitch_enable(struct mlx5_eswitch *esw, int num_vfs)
> }
> }
>
> + mlx5_esw_reps_unblock(esw);
> +
> if (toggle_lag)
> mlx5_lag_enable_change(esw->dev);
>
> @@ -1759,6 +1762,7 @@ void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw, bool clear_vf)
> esw->esw_funcs.num_vfs, esw->esw_funcs.num_ec_vfs, esw->enabled_vports);
>
> mlx5_eswitch_invalidate_wq(esw);
> + mlx5_esw_reps_block(esw);
>
> if (!mlx5_core_is_ecpf(esw->dev)) {
> mlx5_eswitch_unload_vf_vports(esw, esw->esw_funcs.num_vfs);
> @@ -1770,6 +1774,8 @@ void mlx5_eswitch_disable_sriov(struct mlx5_eswitch *esw, bool clear_vf)
> mlx5_eswitch_clear_ec_vf_vports_info(esw);
> }
>
> + mlx5_esw_reps_unblock(esw);
> +
> if (esw->mode == MLX5_ESWITCH_OFFLOADS) {
> struct devlink *devlink = priv_to_devlink(esw->dev);
>
> @@ -1825,7 +1831,11 @@ void mlx5_eswitch_disable(struct mlx5_eswitch *esw)
>
> devl_assert_locked(priv_to_devlink(esw->dev));
> mlx5_lag_disable_change(esw->dev);
> +
> + mlx5_esw_reps_block(esw);
> mlx5_eswitch_disable_locked(esw);
> + mlx5_esw_reps_unblock(esw);
> +
> esw->mode = MLX5_ESWITCH_LEGACY;
> mlx5_lag_enable_change(esw->dev);
> }
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
> index 6a5143b63dfd..d4ac07c995b9 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
> @@ -36,6 +36,7 @@
> #include <linux/mlx5/mlx5_ifc.h>
> #include <linux/mlx5/vport.h>
> #include <linux/mlx5/fs.h>
> +#include <linux/lockdep.h>
> #include "mlx5_core.h"
> #include "eswitch.h"
> #include "esw/indir_table.h"
> @@ -2413,11 +2414,21 @@ static int esw_create_restore_table(struct mlx5_eswitch *esw)
> return err;
> }
>
> +static void mlx5_esw_assert_reps_locked(struct mlx5_eswitch *esw)
> +{
> + lockdep_assert_held(&esw->offloads.reps_lock);
> +}
> +
> void mlx5_esw_reps_block(struct mlx5_eswitch *esw)
> {
> mutex_lock(&esw->offloads.reps_lock);
> }
>
> +static void mlx5_esw_reps_block_nested(struct mlx5_eswitch *esw)
> +{
> + mutex_lock_nested(&esw->offloads.reps_lock, SINGLE_DEPTH_NESTING);
> +}
> +
> void mlx5_esw_reps_unblock(struct mlx5_eswitch *esw)
> {
> mutex_unlock(&esw->offloads.reps_lock);
> @@ -2425,21 +2436,22 @@ void mlx5_esw_reps_unblock(struct mlx5_eswitch *esw)
>
> static void esw_mode_change(struct mlx5_eswitch *esw, u16 mode)
> {
> + mlx5_esw_reps_unblock(esw);
> mlx5_devcom_comp_lock(esw->dev->priv.hca_devcom_comp);
> if (esw->dev->priv.flags & MLX5_PRIV_FLAGS_DISABLE_IB_ADEV ||
> mlx5_core_mp_enabled(esw->dev)) {
> esw->mode = mode;
> - mlx5_rescan_drivers_locked(esw->dev);
> - mlx5_devcom_comp_unlock(esw->dev->priv.hca_devcom_comp);
> - return;
> + goto out;
> }
>
> esw->dev->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
> mlx5_rescan_drivers_locked(esw->dev);
> esw->mode = mode;
> esw->dev->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
> +out:
> mlx5_rescan_drivers_locked(esw->dev);
> mlx5_devcom_comp_unlock(esw->dev->priv.hca_devcom_comp);
> + mlx5_esw_reps_block(esw);
> }
>
> static void mlx5_esw_fdb_drop_destroy(struct mlx5_eswitch *esw)
> @@ -2776,6 +2788,8 @@ void esw_offloads_cleanup(struct mlx5_eswitch *esw)
> static int __esw_offloads_load_rep(struct mlx5_eswitch *esw,
> struct mlx5_eswitch_rep *rep, u8 rep_type)
> {
> + mlx5_esw_assert_reps_locked(esw);
> +
> if (atomic_cmpxchg(&rep->rep_data[rep_type].state,
> REP_REGISTERED, REP_LOADED) == REP_REGISTERED)
> return esw->offloads.rep_ops[rep_type]->load(esw->dev, rep);
> @@ -2786,6 +2800,8 @@ static int __esw_offloads_load_rep(struct mlx5_eswitch *esw,
> static void __esw_offloads_unload_rep(struct mlx5_eswitch *esw,
> struct mlx5_eswitch_rep *rep, u8 rep_type)
> {
> + mlx5_esw_assert_reps_locked(esw);
> +
> if (atomic_cmpxchg(&rep->rep_data[rep_type].state,
> REP_LOADED, REP_REGISTERED) == REP_LOADED) {
> if (rep_type == REP_ETH)
> @@ -3691,6 +3707,7 @@ static void esw_vfs_changed_event_handler(struct mlx5_eswitch *esw)
> if (new_num_vfs == esw->esw_funcs.num_vfs || host_pf_disabled)
> goto free;
>
> + mlx5_esw_reps_block(esw);
> /* Number of VFs can only change from "0 to x" or "x to 0". */
> if (esw->esw_funcs.num_vfs > 0) {
> mlx5_eswitch_unload_vf_vports(esw, esw->esw_funcs.num_vfs);
> @@ -3700,9 +3717,11 @@ static void esw_vfs_changed_event_handler(struct mlx5_eswitch *esw)
> err = mlx5_eswitch_load_vf_vports(esw, new_num_vfs,
> MLX5_VPORT_UC_ADDR_CHANGE);
> if (err)
> - goto free;
> + goto unblock;
> }
> esw->esw_funcs.num_vfs = new_num_vfs;
> +unblock:
> + mlx5_esw_reps_unblock(esw);
> free:
> kvfree(out);
> }
> @@ -4188,9 +4207,14 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode,
> goto unlock;
> }
>
> + /* Keep mode_lock and reps_lock unnested. The operation flag excludes
> + * mode users while mode_lock is dropped before taking reps_lock.
> + */
> esw->eswitch_operation_in_progress = true;
> up_write(&esw->mode_lock);
>
> + mlx5_esw_reps_block(esw);
> +
> if (mlx5_mode == MLX5_ESWITCH_OFFLOADS &&
> !mlx5_devlink_netdev_netns_immutable_set(devlink, true)) {
> NL_SET_ERR_MSG_MOD(extack,
> @@ -4223,6 +4247,10 @@ int mlx5_devlink_eswitch_mode_set(struct devlink *devlink, u16 mode,
> skip:
> if (mlx5_mode == MLX5_ESWITCH_OFFLOADS && err)
> mlx5_devlink_netdev_netns_immutable_set(devlink, false);
> + /* Reconfiguration is done; drop reps_lock before taking mode_lock again
> + * to clear the operation flag.
> + */
> + mlx5_esw_reps_unblock(esw);
> down_write(&esw->mode_lock);
> esw->eswitch_operation_in_progress = false;
> unlock:
> @@ -4496,9 +4524,10 @@ mlx5_eswitch_vport_has_rep(const struct mlx5_eswitch *esw, u16 vport_num)
> return true;
> }
>
> -void mlx5_eswitch_register_vport_reps(struct mlx5_eswitch *esw,
> - const struct mlx5_eswitch_rep_ops *ops,
> - u8 rep_type)
> +static void
> +mlx5_eswitch_register_vport_reps_blocked(struct mlx5_eswitch *esw,
> + const struct mlx5_eswitch_rep_ops *ops,
> + u8 rep_type)
> {
> struct mlx5_eswitch_rep_data *rep_data;
> struct mlx5_eswitch_rep *rep;
> @@ -4513,9 +4542,40 @@ void mlx5_eswitch_register_vport_reps(struct mlx5_eswitch *esw,
> }
> }
> }
> +
> +static void
> +mlx5_eswitch_register_vport_reps_locked(struct mlx5_eswitch *esw,
> + const struct mlx5_eswitch_rep_ops *ops,
> + u8 rep_type, bool nested)
> +{
> + if (nested)
> + mlx5_esw_reps_block_nested(esw);
> + else
> + mlx5_esw_reps_block(esw);
> + mlx5_eswitch_register_vport_reps_blocked(esw, ops, rep_type);
> + mlx5_esw_reps_unblock(esw);
> +}
> +
> +void mlx5_eswitch_register_vport_reps(struct mlx5_eswitch *esw,
> + const struct mlx5_eswitch_rep_ops *ops,
> + u8 rep_type)
> +{
> + mlx5_eswitch_register_vport_reps_locked(esw, ops, rep_type, false);
> +}
> EXPORT_SYMBOL(mlx5_eswitch_register_vport_reps);
>
> -void mlx5_eswitch_unregister_vport_reps(struct mlx5_eswitch *esw, u8 rep_type)
> +void
> +mlx5_eswitch_register_vport_reps_nested(struct mlx5_eswitch *esw,
> + const struct mlx5_eswitch_rep_ops *ops,
> + u8 rep_type)
> +{
> + mlx5_eswitch_register_vport_reps_locked(esw, ops, rep_type, true);
> +}
> +EXPORT_SYMBOL(mlx5_eswitch_register_vport_reps_nested);
> +
> +static void
> +mlx5_eswitch_unregister_vport_reps_blocked(struct mlx5_eswitch *esw,
> + u8 rep_type)
> {
> struct mlx5_eswitch_rep *rep;
> unsigned long i;
> @@ -4525,9 +4585,35 @@ void mlx5_eswitch_unregister_vport_reps(struct mlx5_eswitch *esw, u8 rep_type)
>
> mlx5_esw_for_each_rep(esw, i, rep)
> atomic_set(&rep->rep_data[rep_type].state, REP_UNREGISTERED);
> +
> + esw->offloads.rep_ops[rep_type] = NULL;
sashiko.dev says:
"
Could this assignment cause a NULL pointer dereference in concurrent readers?
In mlx5_eswitch_get_proto_dev(), the state is checked before accessing the ops
pointer without holding reps_lock:
if (atomic_read(&rep->rep_data[rep_type].state) == REP_LOADED &&
esw->offloads.rep_ops[rep_type]->get_proto_dev)
return esw->offloads.rep_ops[rep_type]->get_proto_dev(rep);
If a thread in mlx5_eswitch_get_proto_dev() evaluates the state check to true
and is then preempted, can the unregister path execute and set rep_ops to NULL?
When the preempted thread resumes, it might dereference the now-NULL pointer.
Also, since the ops pointer isn't fetched into a local variable using
READ_ONCE(), could the compiler emit multiple loads, further widening the
race window?
"
The REP_LOADED check is not the only protection here; get_proto_dev()
is only reached from representor-owned contexts, and unregister first
unloads all reps under reps_lock. That unload tears down the users
that can call into this helper before the state is set to REP_UNREGISTERED
and before rep_ops is cleared. So clearing rep_ops does not create a new
live-reader window; it only removes the stale ops pointer after the
representor lifecycle is already quiesced.
Mark
> +}
> +
> +static void
> +mlx5_eswitch_unregister_vport_reps_locked(struct mlx5_eswitch *esw,
> + u8 rep_type, bool nested)
> +{
> + if (nested)
> + mlx5_esw_reps_block_nested(esw);
> + else
> + mlx5_esw_reps_block(esw);
> + mlx5_eswitch_unregister_vport_reps_blocked(esw, rep_type);
> + mlx5_esw_reps_unblock(esw);
> +}
> +
> +void mlx5_eswitch_unregister_vport_reps(struct mlx5_eswitch *esw, u8 rep_type)
> +{
> + mlx5_eswitch_unregister_vport_reps_locked(esw, rep_type, false);
> }
> EXPORT_SYMBOL(mlx5_eswitch_unregister_vport_reps);
>
> +void mlx5_eswitch_unregister_vport_reps_nested(struct mlx5_eswitch *esw,
> + u8 rep_type)
> +{
> + mlx5_eswitch_unregister_vport_reps_locked(esw, rep_type, true);
> +}
> +EXPORT_SYMBOL(mlx5_eswitch_unregister_vport_reps_nested);
> +
> void *mlx5_eswitch_get_uplink_priv(struct mlx5_eswitch *esw, u8 rep_type)
> {
> struct mlx5_eswitch_rep *rep;
> diff --git a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
> index 8503e532f423..2fc69897e35b 100644
> --- a/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
> +++ b/drivers/net/ethernet/mellanox/mlx5/core/sf/devlink.c
> @@ -245,8 +245,10 @@ static int mlx5_sf_add(struct mlx5_core_dev *dev, struct mlx5_sf_table *table,
> if (IS_ERR(sf))
> return PTR_ERR(sf);
>
> + mlx5_esw_reps_block(esw);
> err = mlx5_eswitch_load_sf_vport(esw, sf->hw_fn_id, MLX5_VPORT_UC_ADDR_CHANGE,
> &sf->dl_port, new_attr->controller, new_attr->sfnum);
> + mlx5_esw_reps_unblock(esw);
> if (err)
> goto esw_err;
> *dl_port = &sf->dl_port.dl_port;
> @@ -367,7 +369,10 @@ int mlx5_devlink_sf_port_del(struct devlink *devlink,
> struct mlx5_sf_table *table = dev->priv.sf_table;
> struct mlx5_sf *sf = mlx5_sf_by_dl_port(dl_port);
>
> + mlx5_esw_reps_block(dev->priv.eswitch);
> mlx5_sf_del(table, sf);
> + mlx5_esw_reps_unblock(dev->priv.eswitch);
> +
> return 0;
> }
>
> diff --git a/include/linux/mlx5/eswitch.h b/include/linux/mlx5/eswitch.h
> index 3b29a3c6794d..a0dd162baa78 100644
> --- a/include/linux/mlx5/eswitch.h
> +++ b/include/linux/mlx5/eswitch.h
> @@ -63,7 +63,13 @@ struct mlx5_eswitch_rep {
> void mlx5_eswitch_register_vport_reps(struct mlx5_eswitch *esw,
> const struct mlx5_eswitch_rep_ops *ops,
> u8 rep_type);
> +void
> +mlx5_eswitch_register_vport_reps_nested(struct mlx5_eswitch *esw,
> + const struct mlx5_eswitch_rep_ops *ops,
> + u8 rep_type);
> void mlx5_eswitch_unregister_vport_reps(struct mlx5_eswitch *esw, u8 rep_type);
> +void mlx5_eswitch_unregister_vport_reps_nested(struct mlx5_eswitch *esw,
> + u8 rep_type);
> void *mlx5_eswitch_get_proto_dev(struct mlx5_eswitch *esw,
> u16 vport_num,
> u8 rep_type);