Re: [Bug report] NULL pointer dereference in frwr_unmap_sync()
From: Dai Ngo
Date: Sun Mar 09 2025 - 15:41:31 EST
Hi Nan,
Can you try the attached patch with 6.14-rc4?
This patch adds a spinlock to protect the rl_free_mrs and rl_registered lists.
I have seen list corruption in our RDMA testing, but the problem is hard to
reproduce, so I have not submitted this patch upstream.
Thanks,
-Dai
On 3/5/25 6:40 PM, Li Nan wrote:
On 2025/3/5 22:02, Chuck Lever wrote:
On 3/4/25 9:43 PM, Li Nan wrote:
We found the following problem in kernel 5.10, and the same problem should
also exist in mainline:
During an NFS mount using the 'soft' option over a RoCE network, we observed
a kernel crash with the trace below when network issues
(congestion/disconnect) occur:
nfs: server 10.10.253.211 not responding, timed out
BUG: kernel NULL pointer dereference, address: 00000000000000a0
RIP: 0010:frwr_unmap_sync+0x77/0x200 [rpcrdma]
Call Trace:
? __die_body.cold+0x8/0xd
? no_context+0x155/0x230
? __bad_area_nosemaphore+0x52/0x1a0
? exc_page_fault+0x2dc/0x550
? asm_exc_page_fault+0x1e/0x30
? frwr_unmap_sync+0x77/0x200 [rpcrdma]
xprt_release+0x9e/0x1a0 [sunrpc]
rpc_release_resources_task+0xe/0x50 [sunrpc]
rpc_release_task+0x19/0xa0 [sunrpc]
rpc_async_schedule+0x29/0x40 [sunrpc]
process_one_work+0x1b2/0x350
worker_thread+0x49/0x310
? rescuer_thread+0x380/0x380
kthread+0xfb/0x140
Problem analysis:
The crash happens in frwr_unmap_sync() when accessing the req->rl_registered
list, and is caused by either a NULL pointer dereference or access to freed
MR resources. There is a race condition between:
T1
__ib_process_cq
wc->wr_cqe->done (frwr_wc_localinv)
rpcrdma_flush_disconnect
rpcrdma_force_disconnect
xprt_force_disconnect
xprt_autoclose
xprt_rdma_close
rpcrdma_xprt_disconnect
rpcrdma_reqs_reset
frwr_reset
rpcrdma_mr_pop(&req->rl_registered)
T2
rpc_async_schedule
rpc_release_task
rpc_release_resources_task
xprt_release
xprt_rdma_free
frwr_unmap_sync
rpcrdma_mr_pop(&req->rl_registered)
This problem also exists in the function rpcrdma_mrs_destroy().
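To make the window concrete, here is a small userspace sketch (not kernel
code; the struct and helper names are made up) of the unlocked list handling:
T1 models frwr_reset() draining req->rl_registered on disconnect, and T2
models frwr_unmap_sync(), which dereferences its first pop without a NULL
check because the body is a do/while loop. If T1 empties the list first, T2
dereferences NULL, matching the oops above.

#include <pthread.h>
#include <stdlib.h>

struct fake_mr {			/* stand-in for struct rpcrdma_mr */
	struct fake_mr *next;
	int handle;
};

static struct fake_mr *registered;	/* stand-in for req->rl_registered, no lock */

static struct fake_mr *fake_mr_pop(struct fake_mr **head)
{
	struct fake_mr *mr = *head;	/* racy read of the list head */

	if (mr)
		*head = mr->next;	/* racy unlink */
	return mr;
}

static void *reset_path(void *arg)	/* T1: frwr_reset() on disconnect */
{
	struct fake_mr *mr;

	(void)arg;
	while ((mr = fake_mr_pop(&registered)))
		free(mr);		/* MR is released / returned to a free list */
	return NULL;
}

static void *unmap_sync_path(void *arg)	/* T2: frwr_unmap_sync() on xprt_release */
{
	struct fake_mr *mr = fake_mr_pop(&registered);

	(void)arg;
	do {
		mr->handle = 0;		/* NULL deref or use-after-free when racing with T1 */
	} while ((mr = fake_mr_pop(&registered)));
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;
	int i;

	for (i = 0; i < 1024; i++) {
		struct fake_mr *mr = calloc(1, sizeof(*mr));

		mr->next = registered;
		registered = mr;
	}
	pthread_create(&t1, NULL, reset_path, NULL);
	pthread_create(&t2, NULL, unmap_sync_path, NULL);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	return 0;
}

(Build with gcc -pthread; the outcome is timing dependent, which is exactly
why the real crash is hard to reproduce.)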
Dai, is this the same as the system test problem you've been looking at?
Thank you for looking into it. Is there a patch that needs to be tested?
We are happy to help with the testing.
From ff8d664c9606135d52fd2dc4778dd75a56a3b38e Mon Sep 17 00:00:00 2001
From: Dai Ngo <dai.ngo@xxxxxxxxxx>
Date: Thu, 4 Apr 2024 11:11:21 -0600
Subject: [PATCH] xprtrdma: add spinlock in rpcrdma_req to protect rl_free_mrs
and rl_registered list
In some rare conditions, the top half and the bottom half of the xprtrdma
module access the same rl_free_mrs and rl_registered lists of the request.
One of the cases is when rpcrdma_marshal_req calls frwr_reset while
rpcrdma_reply_handler is executing on the same rpcrdma_req. When this
happens, the rl_free_mrs and rl_registered lists are corrupted.
This patch adds a spinlock to rpcrdma_req to protect the rl_free_mrs and
rl_registered lists. Since the top and bottom halves of the xprtrdma module
execute at the same time on the same request only rarely, in error
conditions, contention on this lock is expected to be very low. (A small
standalone sketch of the resulting locking pattern is appended after the
patch.)
Signed-off-by: Dai Ngo <dai.ngo@xxxxxxxxxx>
---
net/sunrpc/xprtrdma/frwr_ops.c | 27 +++++++++++++++++++++++----
net/sunrpc/xprtrdma/rpc_rdma.c | 9 ++++++---
net/sunrpc/xprtrdma/verbs.c | 6 ++++++
net/sunrpc/xprtrdma/xprt_rdma.h | 4 +++-
4 files changed, 38 insertions(+), 8 deletions(-)
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index ffbf99894970..e4e0f5532e8c 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -87,9 +87,11 @@ static void frwr_mr_put(struct rpcrdma_mr *mr)
frwr_mr_unmap(mr->mr_xprt, mr);
/* The MR is returned to the req's MR free list instead
- * of to the xprt's MR free list. No spinlock is needed.
+ * of to the xprt's MR free list.
*/
+ spin_lock(&mr->mr_req->rl_mr_list_lock);
rpcrdma_mr_push(mr, &mr->mr_req->rl_free_mrs);
+ spin_unlock(&mr->mr_req->rl_mr_list_lock);
}
/* frwr_reset - Place MRs back on the free list
@@ -106,8 +108,13 @@ void frwr_reset(struct rpcrdma_req *req)
{
struct rpcrdma_mr *mr;
- while ((mr = rpcrdma_mr_pop(&req->rl_registered)))
+ spin_lock(&req->rl_mr_list_lock);
+ while ((mr = rpcrdma_mr_pop(&req->rl_registered))) {
+ spin_unlock(&req->rl_mr_list_lock);
frwr_mr_put(mr);
+ spin_lock(&req->rl_mr_list_lock);
+ }
+ spin_unlock(&req->rl_mr_list_lock);
}
/**
@@ -390,6 +397,7 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
num_wrs = 1;
post_wr = send_wr;
+ spin_lock(&req->rl_mr_list_lock);
list_for_each_entry(mr, &req->rl_registered, mr_list) {
trace_xprtrdma_mr_fastreg(mr);
@@ -402,6 +410,7 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
post_wr = &mr->mr_regwr.wr;
++num_wrs;
}
+ spin_unlock(&req->rl_mr_list_lock);
if ((kref_read(&req->rl_kref) > 1) || num_wrs > ep->re_send_count) {
send_wr->send_flags |= IB_SEND_SIGNALED;
@@ -425,17 +434,23 @@ int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
* @mrs: list of MRs to check
*
*/
-void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
+void frwr_reminv(struct rpcrdma_rep *rep, struct rpcrdma_req *req)
{
struct rpcrdma_mr *mr;
+ struct list_head *mrs = &req->rl_registered;
- list_for_each_entry(mr, mrs, mr_list)
+ spin_lock(&req->rl_mr_list_lock);
+ list_for_each_entry(mr, mrs, mr_list) {
if (mr->mr_handle == rep->rr_inv_rkey) {
list_del_init(&mr->mr_list);
trace_xprtrdma_mr_reminv(mr);
+ spin_unlock(&req->rl_mr_list_lock);
frwr_mr_put(mr);
+ spin_lock(&req->rl_mr_list_lock);
break; /* only one invalidated MR per RPC */
}
+ }
+ spin_unlock(&req->rl_mr_list_lock);
}
static void frwr_mr_done(struct ib_wc *wc, struct rpcrdma_mr *mr)
@@ -507,6 +522,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
* a single ib_post_send() call.
*/
prev = &first;
+ spin_lock(&req->rl_mr_list_lock);
mr = rpcrdma_mr_pop(&req->rl_registered);
do {
trace_xprtrdma_mr_localinv(mr);
@@ -526,6 +542,7 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
*prev = last;
prev = &last->next;
} while ((mr = rpcrdma_mr_pop(&req->rl_registered)));
+ spin_unlock(&req->rl_mr_list_lock);
mr = container_of(last, struct rpcrdma_mr, mr_invwr);
@@ -610,6 +627,7 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
* a single ib_post_send() call.
*/
prev = &first;
+ spin_lock(&req->rl_mr_list_lock);
mr = rpcrdma_mr_pop(&req->rl_registered);
do {
trace_xprtrdma_mr_localinv(mr);
@@ -629,6 +647,7 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
*prev = last;
prev = &last->next;
} while ((mr = rpcrdma_mr_pop(&req->rl_registered)));
+ spin_unlock(&req->rl_mr_list_lock);
/* Strong send queue ordering guarantees that when the
* last WR in the chain completes, all WRs in the chain
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 190a4de239c8..29b10f6eb8b0 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -298,15 +298,18 @@ static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt,
int nsegs, bool writing,
struct rpcrdma_mr **mr)
{
+ spin_lock(&req->rl_mr_list_lock);
*mr = rpcrdma_mr_pop(&req->rl_free_mrs);
if (!*mr) {
*mr = rpcrdma_mr_get(r_xprt);
- if (!*mr)
+ if (!*mr) {
+ spin_unlock(&req->rl_mr_list_lock);
goto out_getmr_err;
+ }
(*mr)->mr_req = req;
}
-
rpcrdma_mr_push(*mr, &req->rl_registered);
+ spin_unlock(&req->rl_mr_list_lock);
return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr);
out_getmr_err:
@@ -1485,7 +1488,7 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
trace_xprtrdma_reply(rqst->rq_task, rep, credits);
if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE)
- frwr_reminv(rep, &req->rl_registered);
+ frwr_reminv(rep, req);
if (!list_empty(&req->rl_registered))
frwr_unmap_async(r_xprt, req);
/* LocalInv completion will complete the RPC */
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 4f8d7efa469f..a8663f104729 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -825,6 +825,8 @@ struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt,
INIT_LIST_HEAD(&req->rl_free_mrs);
INIT_LIST_HEAD(&req->rl_registered);
+
+ spin_lock_init(&req->rl_mr_list_lock);
spin_lock(&buffer->rb_lock);
list_add(&req->rl_all, &buffer->rb_allreqs);
spin_unlock(&buffer->rb_lock);
@@ -1084,15 +1086,19 @@ void rpcrdma_req_destroy(struct rpcrdma_req *req)
list_del(&req->rl_all);
+ spin_lock(&req->rl_mr_list_lock);
while ((mr = rpcrdma_mr_pop(&req->rl_free_mrs))) {
struct rpcrdma_buffer *buf = &mr->mr_xprt->rx_buf;
+ spin_unlock(&req->rl_mr_list_lock);
spin_lock(&buf->rb_lock);
list_del(&mr->mr_all);
spin_unlock(&buf->rb_lock);
frwr_mr_release(mr);
+ spin_lock(&req->rl_mr_list_lock);
}
+ spin_unlock(&req->rl_mr_list_lock);
rpcrdma_regbuf_free(req->rl_recvbuf);
rpcrdma_regbuf_free(req->rl_sendbuf);
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index da409450dfc0..12db0250c9ce 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -327,6 +327,8 @@ struct rpcrdma_req {
struct list_head rl_all;
struct kref rl_kref;
+ /* rl_mr_list_lock protects the rl_free_mrs and rl_registered lists */
+ spinlock_t rl_mr_list_lock;
struct list_head rl_free_mrs;
struct list_head rl_registered;
struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
@@ -539,7 +541,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
int nsegs, bool writing, __be32 xid,
struct rpcrdma_mr *mr);
int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
-void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs);
+void frwr_reminv(struct rpcrdma_rep *rep, struct rpcrdma_req *req);
void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
int frwr_wp_create(struct rpcrdma_xprt *r_xprt);
--
2.43.0
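Not part of the patch: a minimal userspace sketch (illustrative names only)
of the drop/re-take pattern the patch uses in frwr_reset(), and similarly in
rpcrdma_req_destroy(). rl_mr_list_lock cannot be held across frwr_mr_put(),
because frwr_mr_put() takes the same per-request lock to push the MR back
onto rl_free_mrs, so the loop releases the lock around the call and re-takes
it before the next pop.

#include <pthread.h>
#include <stdlib.h>

struct fake_mr {
	struct fake_mr *next;
};

struct fake_req {			/* stand-in for struct rpcrdma_req */
	pthread_mutex_t mr_list_lock;	/* stand-in for rl_mr_list_lock */
	struct fake_mr *registered;	/* stand-in for rl_registered */
	struct fake_mr *free_mrs;	/* stand-in for rl_free_mrs */
};

static struct fake_mr *pop(struct fake_mr **head)
{
	struct fake_mr *mr = *head;

	if (mr)
		*head = mr->next;
	return mr;
}

/* Models frwr_mr_put(): it takes the owning request's list lock itself. */
static void fake_mr_put(struct fake_req *req, struct fake_mr *mr)
{
	pthread_mutex_lock(&req->mr_list_lock);
	mr->next = req->free_mrs;
	req->free_mrs = mr;
	pthread_mutex_unlock(&req->mr_list_lock);
}

/* Models frwr_reset() with the patch applied: the lock is dropped around
 * fake_mr_put(), which would otherwise deadlock on the same (non-recursive)
 * lock, and re-taken before the next pop so the list is never walked unlocked.
 */
static void fake_reset(struct fake_req *req)
{
	struct fake_mr *mr;

	pthread_mutex_lock(&req->mr_list_lock);
	while ((mr = pop(&req->registered))) {
		pthread_mutex_unlock(&req->mr_list_lock);
		fake_mr_put(req, mr);
		pthread_mutex_lock(&req->mr_list_lock);
	}
	pthread_mutex_unlock(&req->mr_list_lock);
}

int main(void)
{
	static struct fake_req req = { .mr_list_lock = PTHREAD_MUTEX_INITIALIZER };
	int i;

	for (i = 0; i < 8; i++) {
		struct fake_mr *mr = calloc(1, sizeof(*mr));

		mr->next = req.registered;
		req.registered = mr;
	}
	fake_reset(&req);		/* all MRs end up on req.free_mrs */
	return 0;
}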