[PATCH v5] IB/mlx5: Fix loopback enablement state and resource leaks

From: Prathamesh Deshpande

Date: Sat Apr 04 2026 - 17:54:20 EST


In mlx5_ib_alloc_transport_domain(), an early success path was
incorrectly returning a variable instead of a literal 0. Additionally,
the loopback (LB) infrastructure required updates to address
initialization and concurrency issues:

1. Initialization: The LB mutex was initialized conditionally, but
paths like create_raw_packet_qp_tir() in qp.c call LB functions
regardless of those conditions. Move initialization to
stage_init_init() to ensure the lock is always valid.
2. State Integrity: Update mlx5_ib_enable_lb() to roll back reference
counters if the hardware command fails, ensuring software state
matches hardware.
3. Concurrency: Move 'force_enable' checks inside the mutex in both
enable/disable paths to prevent potential counter underflows.
4. Resource Leak: Deallocate the transport domain if LB enablement
fails in mlx5_ib_alloc_transport_domain().

Signed-off-by: Prathamesh Deshpande <prathameshdeshpande7@xxxxxxxxx>
---
v5:
- Moved mutex_init to stage_init_init to prevent crashes on non-ETH hardware.
- Implemented 'goto unlock' for concurrency safety in enable/disable paths.
- Added atomic rollback and fixed tdn leak.
v4:
- Moved rollback logic into mlx5_ib_enable_lb() to ensure atomicity
within the mutex and prevent race conditions [Sashiko].
v3:
- Also call mlx5_ib_disable_lb() on failure to roll back software state/counters
[Sashiko].
v2:
- Added deallocation of tdn if mlx5_ib_enable_lb() fails [Sashiko].
- Reworded commit message to reflect the functional fix and credit the tool.

drivers/infiniband/hw/mlx5/main.c | 33 ++++++++++++++++++++-----------
1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
index 635002e684a5..2d546d3b1dfe 100644
--- a/drivers/infiniband/hw/mlx5/main.c
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -2009,10 +2009,10 @@ int mlx5_ib_enable_lb(struct mlx5_ib_dev *dev, bool td, bool qp)
{
int err = 0;

+ mutex_lock(&dev->lb.mutex);
if (dev->lb.force_enable)
- return 0;
+ goto unlock;

- mutex_lock(&dev->lb.mutex);
if (td)
dev->lb.user_td++;
if (qp)
@@ -2022,10 +2022,18 @@ int mlx5_ib_enable_lb(struct mlx5_ib_dev *dev, bool td, bool qp)
dev->lb.qps == 1) {
if (!dev->lb.enabled) {
err = mlx5_nic_vport_update_local_lb(dev->mdev, true);
- dev->lb.enabled = true;
+ if (err) {
+ if (td)
+ dev->lb.user_td--;
+ if (qp)
+ dev->lb.qps--;
+ } else {
+ dev->lb.enabled = true;
+ }
}
}

+unlock:
mutex_unlock(&dev->lb.mutex);

return err;
@@ -2033,10 +2041,10 @@ int mlx5_ib_enable_lb(struct mlx5_ib_dev *dev, bool td, bool qp)

void mlx5_ib_disable_lb(struct mlx5_ib_dev *dev, bool td, bool qp)
{
+ mutex_lock(&dev->lb.mutex);
if (dev->lb.force_enable)
- return;
+ goto unlock;

- mutex_lock(&dev->lb.mutex);
if (td)
dev->lb.user_td--;
if (qp)
@@ -2050,6 +2058,7 @@ void mlx5_ib_disable_lb(struct mlx5_ib_dev *dev, bool td, bool qp)
}
}

+unlock:
mutex_unlock(&dev->lb.mutex);
}

@@ -2068,9 +2077,13 @@ static int mlx5_ib_alloc_transport_domain(struct mlx5_ib_dev *dev, u32 *tdn,
if ((MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) ||
(!MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) &&
!MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
- return err;
+ return 0;

- return mlx5_ib_enable_lb(dev, true, false);
+ err = mlx5_ib_enable_lb(dev, true, false);
+ if (err)
+ mlx5_cmd_dealloc_transport_domain(dev->mdev, *tdn, uid);
+
+ return err;
}

static void mlx5_ib_dealloc_transport_domain(struct mlx5_ib_dev *dev, u32 tdn,
@@ -4473,6 +4486,7 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
mutex_init(&dev->data_direct_lock);
INIT_LIST_HEAD(&dev->qp_list);
spin_lock_init(&dev->reset_flow_resource_lock);
+ mutex_init(&dev->lb.mutex);
xa_init(&dev->odp_mkeys);
xa_init(&dev->sig_mrs);
atomic_set(&dev->mkey_var, 0);
@@ -4713,11 +4727,6 @@ static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
if (err)
return err;

- if ((MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) &&
- (MLX5_CAP_GEN(dev->mdev, disable_local_lb_uc) ||
- MLX5_CAP_GEN(dev->mdev, disable_local_lb_mc)))
- mutex_init(&dev->lb.mutex);
-
if (MLX5_CAP_GEN_64(dev->mdev, general_obj_types) &
MLX5_GENERAL_OBJ_TYPES_CAP_VIRTIO_NET_Q) {
err = mlx5_ib_init_var_table(dev);
--
2.43.0