Re: [PATCH] fs/ceph/mds_client: ignore responses for waiting requests

From: Xiubo Li
Date: Thu Mar 09 2023 - 00:32:39 EST



On 08/03/2023 23:17, Max Kellermann wrote:
On Wed, Mar 8, 2023 at 4:42 AM Xiubo Li <xiubli@xxxxxxxxxx> wrote:
How could this happen ?

Since the req hasn't been submitted yet, how could it receive a reply
normally ?
I have no idea. We have frequent problems with MDS closing the
connection (once or twice a week), and sometimes, this leads to the
WARNING problem which leaves the server hanging. This seems to be some
timing problem, but that MDS connection problem is a different
problem.
My patch just attempts to address the WARNING; not knowing much about
Ceph internals, my idea was that even if the server sends bad reply
packets, the client shouldn't panic.

It should be a corrupted reply and it lead us to get a incorrect req,
which hasn't been submitted yet.

BTW, do you have the dump of the corrupted msg by 'ceph_msg_dump(msg)' ?
Unfortunately not - we have already scrubbed the server that had this
problem and rebooted it with a fresh image including my patch. It
seems I don't have a full copy of the kernel log anymore.

Coincidentally, the patch has prevented another kernel hang just a few
minutes ago:

Mar 08 15:48:53 sweb1 kernel: ceph: mds0 caps stale
Mar 08 15:49:13 sweb1 kernel: ceph: mds0 caps stale
Mar 08 15:49:35 sweb1 kernel: ceph: mds0 caps went stale, renewing
Mar 08 15:49:35 sweb1 kernel: ceph: mds0 caps stale
Mar 08 15:49:35 sweb1 kernel: libceph: mds0 (1)10.41.2.11:6801 socket
error on write
Mar 08 15:49:35 sweb1 kernel: libceph: mds0 (1)10.41.2.11:6801 session reset
Mar 08 15:49:35 sweb1 kernel: ceph: mds0 closed our session
Mar 08 15:49:35 sweb1 kernel: ceph: mds0 reconnect start
Mar 08 15:49:36 sweb1 kernel: ceph: mds0 reconnect success
Mar 08 15:49:36 sweb1 kernel: ceph: dropping dirty+flushing Fx state
for 0000000064778286 2199046848012
Mar 08 15:49:40 sweb1 kernel: ceph: mdsc_handle_reply on waiting
request tid 1106187
Mar 08 15:49:53 sweb1 kernel: ceph: mds0 caps renewed

Since my patch is already in place, the kernel hasn't checked the
unexpected packet and thus hasn't dumped it....

If you need more information and have a patch with more logging, I
could easily boot those servers with your patch and post that data
next time it happens.

Hi Max,

I figured out one possible case:

For example when mds closes our session the kclient will call 'cleanup_session_requests()', which will drop the unsafe requests and then set 'req->r_attempts' to 0, that means for the requests if they were sent out without receiving unsafe reply they will be retried and then they will be possibly added to the wait list in '__do_request()'. If these requests will get a reply later just after being retried we could see this warning.

I attached one testing patch based yours, just added more debug logs, which won't be introduce perf issue since all the logs should be printed in corner cases.

Could you help test it ?

Thanks,

- Xiubo

Max
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index b8d6cca16005..52f5e40a341d 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1754,7 +1754,7 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req;
struct rb_node *p;

- dout("cleanup_session_requests mds%d\n", session->s_mds);
+ pr_info("cleanup_session_requests mds%d\n", session->s_mds);
mutex_lock(&mdsc->mutex);
while (!list_empty(&session->s_unsafe)) {
req = list_first_entry(&session->s_unsafe,
@@ -1773,8 +1773,12 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc,
req = rb_entry(p, struct ceph_mds_request, r_node);
p = rb_next(p);
if (req->r_session &&
- req->r_session->s_mds == session->s_mds)
+ req->r_session->s_mds == session->s_mds) {
+ if (req->r_attempts)
+ pr_info("mds%d req tid %llu reattempted\n",
+ session->s_mds, req->r_tid);
req->r_attempts = 0;
+ }
}
mutex_unlock(&mdsc->mutex);
}
@@ -3235,7 +3239,8 @@ static void __do_request(struct ceph_mds_client *mdsc,
goto finish;
}
if (mdsc->mdsmap->m_epoch == 0) {
- dout("do_request no mdsmap, waiting for map\n");
+ pr_info("do_request no mdsmap, waiting for map req tid %llu\n",
+ req->r_tid);
list_add(&req->r_wait, &mdsc->waiting_for_map);
return;
}
@@ -3256,7 +3261,8 @@ static void __do_request(struct ceph_mds_client *mdsc,
err = -EJUKEBOX;
goto finish;
}
- dout("do_request no mds or not active, waiting for map\n");
+ pr_info("do_request no mds or not active, waiting for map req tid %llu\n",
+ req->r_tid);
list_add(&req->r_wait, &mdsc->waiting_for_map);
return;
}
@@ -3302,8 +3308,10 @@ static void __do_request(struct ceph_mds_client *mdsc,
* it to the mdsc queue.
*/
if (session->s_state == CEPH_MDS_SESSION_REJECTED) {
- if (ceph_test_mount_opt(mdsc->fsc, CLEANRECOVER))
+ if (ceph_test_mount_opt(mdsc->fsc, CLEANRECOVER)) {
+ pr_info(" CLEANRECOVER req tid %llu\n", req->r_tid);
list_add(&req->r_wait, &mdsc->waiting_for_map);
+ }
else
err = -EACCES;
goto out_session;
@@ -3318,6 +3326,7 @@ static void __do_request(struct ceph_mds_client *mdsc,
if (random)
req->r_resend_mds = mds;
}
+ pr_info(" session is not opened, req tid %llu\n", req->r_tid);
list_add(&req->r_wait, &session->s_waiting);
goto out_session;
}
@@ -3621,6 +3630,14 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
}
dout("handle_reply %p\n", req);

+ /* waiting, not yet submitted? */
+ if (!list_empty(&req->r_wait)) {
+ pr_err("mdsc_handle_reply on waiting request tid %llu\n", tid);
+ mutex_unlock(&mdsc->mutex);
+ ceph_msg_dump(msg);
+ goto out;
+ }
+
/* correct session? */
if (req->r_session != session) {
pr_err("mdsc_handle_reply got %llu on session mds%d"
@@ -4019,6 +4036,7 @@ static void handle_session(struct ceph_mds_session *session,
case CEPH_SESSION_CLOSE:
if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
pr_info("mds%d reconnect denied\n", session->s_mds);
+ pr_info("mds%d closed our session\n", session->s_mds);
session->s_state = CEPH_MDS_SESSION_CLOSED;
cleanup_session_requests(mdsc, session);
remove_session_caps(session);
@@ -4727,6 +4745,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
__wake_requests(mdsc, &s->s_waiting);
mutex_unlock(&mdsc->mutex);

+ pr_info("check_new_map exceed max rank mds%d\n", i);
mutex_lock(&s->s_mutex);
cleanup_session_requests(mdsc, s);
remove_session_caps(s);
@@ -5483,6 +5502,7 @@ void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc)
mutex_lock(&session->s_mutex);
__close_session(mdsc, session);
if (session->s_state == CEPH_MDS_SESSION_CLOSING) {
+ pr_info("ceph_mdsc_force_umount mds%d\n", mds);
cleanup_session_requests(mdsc, session);
remove_session_caps(session);
}