[PATCH 6.19 481/844] RDMA/core: Fix stale RoCE GIDs during netdev events at registration
From: Sasha Levin
Date: Sat Feb 28 2026 - 15:21:08 EST
From: Jiri Pirko <jiri@xxxxxxxxxx>
[ Upstream commit 9af0feae8016ba58ad7ff784a903404986b395b1 ]
RoCE GID entries become stale when netdev properties change during the
IB device registration window. This is reproducible with a udev rule
that sets a MAC address when a VF netdev appears:
ACTION=="add", SUBSYSTEM=="net", KERNEL=="eth4", \
RUN+="/sbin/ip link set eth4 address 88:22:33:44:55:66"
After VF creation, show_gids displays GIDs derived from the original
random MAC rather than the configured one.
The root cause is a race between netdev event processing and device
registration:
CPU 0 (driver) CPU 1 (udev/workqueue)
────────────── ──────────────────────
ib_register_device()
ib_cache_setup_one()
gid_table_setup_one()
_gid_table_setup_one()
← GID table allocated
rdma_roce_rescan_device()
← GIDs populated with
OLD MAC
ip link set eth4 addr NEW_MAC
NETDEV_CHANGEADDR queued
netdevice_event_work_handler()
ib_enum_all_roce_netdevs()
← Iterates DEVICE_REGISTERED
← Device NOT marked yet, SKIP!
enable_device_and_get()
xa_set_mark(DEVICE_REGISTERED)
← Too late, event was lost
The netdev event handler uses ib_enum_all_roce_netdevs() which only
iterates devices marked DEVICE_REGISTERED. However, this mark is set
late in the registration process, after the GID cache is already
populated. Events arriving in this window are silently dropped.
Fix this by introducing a new xarray mark DEVICE_GID_UPDATES that is
set immediately after the GID table is allocated and initialized. Use
the new mark in ib_enum_all_roce_netdevs() function to iterate devices
instead of DEVICE_REGISTERED.
This is safe because:
- After _gid_table_setup_one(), all required structures exist (port_data,
immutable, cache.gid)
- The GID table mutex serializes concurrent access between the initial
rescan and event handlers
- Event handlers correctly update stale GIDs even when racing with rescan
- The mark is cleared in ib_cache_cleanup_one() before teardown
This also fixes similar races for IP address events (inetaddr_event,
inet6addr_event) which use the same enumeration path.
Fixes: 0df91bb67334 ("RDMA/devices: Use xarray to store the client_data")
Signed-off-by: Jiri Pirko <jiri@xxxxxxxxxx>
Link: https://patch.msgid.link/20260127093839.126291-1-jiri@xxxxxxxxxxx
Reported-by: syzbot+881d65229ca4f9ae8c84@xxxxxxxxxxxxxxxxxxxxxxxxx
Closes: https://syzkaller.appspot.com/bug?extid=881d65229ca4f9ae8c84
Signed-off-by: Leon Romanovsky <leon@xxxxxxxxxx>
Signed-off-by: Sasha Levin <sashal@xxxxxxxxxx>
---
drivers/infiniband/core/cache.c | 13 +++++++++++
drivers/infiniband/core/core_priv.h | 3 +++
drivers/infiniband/core/device.c | 34 ++++++++++++++++++++++++++++-
3 files changed, 49 insertions(+), 1 deletion(-)
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
index 0fc1c5bce2f0d..78bc7d83edc65 100644
--- a/drivers/infiniband/core/cache.c
+++ b/drivers/infiniband/core/cache.c
@@ -927,6 +927,13 @@ static int gid_table_setup_one(struct ib_device *ib_dev)
if (err)
return err;
+ /*
+ * Mark the device as ready for GID cache updates. This allows netdev
+ * event handlers to update the GID cache even before the device is
+ * fully registered.
+ */
+ ib_device_enable_gid_updates(ib_dev);
+
rdma_roce_rescan_device(ib_dev);
return err;
@@ -1639,6 +1646,12 @@ void ib_cache_release_one(struct ib_device *device)
void ib_cache_cleanup_one(struct ib_device *device)
{
+ /*
+ * Clear the GID updates mark first to prevent event handlers from
+ * accessing the device while it's being torn down.
+ */
+ ib_device_disable_gid_updates(device);
+
/* The cleanup function waits for all in-progress workqueue
* elements and cleans up the GID cache. This function should be
* called after the device was removed from the devices list and
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
index 05102769a918a..a2c36666e6fcb 100644
--- a/drivers/infiniband/core/core_priv.h
+++ b/drivers/infiniband/core/core_priv.h
@@ -100,6 +100,9 @@ void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
roce_netdev_callback cb,
void *cookie);
+void ib_device_enable_gid_updates(struct ib_device *device);
+void ib_device_disable_gid_updates(struct ib_device *device);
+
typedef int (*nldev_callback)(struct ib_device *device,
struct sk_buff *skb,
struct netlink_callback *cb,
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index 1174ab7da6295..87eaefd3794bb 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -93,6 +93,7 @@ static struct workqueue_struct *ib_unreg_wq;
static DEFINE_XARRAY_FLAGS(devices, XA_FLAGS_ALLOC);
static DECLARE_RWSEM(devices_rwsem);
#define DEVICE_REGISTERED XA_MARK_1
+#define DEVICE_GID_UPDATES XA_MARK_2
static u32 highest_client_id;
#define CLIENT_REGISTERED XA_MARK_1
@@ -2441,11 +2442,42 @@ void ib_enum_all_roce_netdevs(roce_netdev_filter filter,
unsigned long index;
down_read(&devices_rwsem);
- xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED)
+ xa_for_each_marked(&devices, index, dev, DEVICE_GID_UPDATES)
ib_enum_roce_netdev(dev, filter, filter_cookie, cb, cookie);
up_read(&devices_rwsem);
}
+/**
+ * ib_device_enable_gid_updates - Mark device as ready for GID cache updates
+ * @device: Device to mark
+ *
+ * Called after GID table is allocated and initialized. After this mark is set,
+ * netdevice event handlers can update the device's GID cache. This allows
+ * events that arrive during device registration to be processed, avoiding
+ * stale GID entries when netdev properties change during the device
+ * registration process.
+ */
+void ib_device_enable_gid_updates(struct ib_device *device)
+{
+ down_write(&devices_rwsem);
+ xa_set_mark(&devices, device->index, DEVICE_GID_UPDATES);
+ up_write(&devices_rwsem);
+}
+
+/**
+ * ib_device_disable_gid_updates - Clear the GID updates mark
+ * @device: Device to unmark
+ *
+ * Called before GID table cleanup to prevent event handlers from accessing
+ * the device while it's being torn down.
+ */
+void ib_device_disable_gid_updates(struct ib_device *device)
+{
+ down_write(&devices_rwsem);
+ xa_clear_mark(&devices, device->index, DEVICE_GID_UPDATES);
+ up_write(&devices_rwsem);
+}
+
/*
* ib_enum_all_devs - enumerate all ib_devices
* @cb: Callback to call for each found ib_device
--
2.51.0