[PATCH v2 3/7] ceph: harden send_mds_reconnect and handle active-MDS peer reset
From: Alex Markuze
Date: Wed Apr 15 2026 - 13:26:01 EST
Change send_mds_reconnect() to return an error code so callers can detect
and report reconnect failures instead of silently ignoring them. Add early
bailout checks for sessions that are already closed, rejected, or
unregistered; this avoids sending reconnect messages for sessions that
can no longer be recovered.
The early -ESTALE and -ENOENT bailouts use a separate fail_return label
that skips the pr_err_client diagnostic, since these codes indicate
expected concurrent-teardown races rather than genuine reconnect build
failures.
Save the prior session state before transitioning to RECONNECTING,
and restore it in the failure path. Without this, a transient
build or encoding failure (-ENOMEM, -ENOSPC) strands the session
in RECONNECTING indefinitely because check_new_map() only retries
sessions in RESTARTING state.
Rewrite mds_peer_reset() to handle the case where the MDS is past its
RECONNECT phase (i.e. active). An active MDS rejects CLIENT_RECONNECT
messages because it only accepts them during its own RECONNECT window
after restart. Previously, the client would send a doomed reconnect
that the MDS would reject or ignore. Now, the client tears the session
down locally and lets new requests re-open a fresh session, which is
the correct recovery for this scenario. A session in the RECONNECTING
state is handled on the same teardown path, since an active MDS will
reject the client's reconnect attempts regardless of the session's
local state.
The session teardown path in mds_peer_reset() follows the established
drop-and-reacquire locking pattern from check_new_map(): take
mdsc->mutex for session unregistration, release it, then take s->s_mutex
separately for cleanup. This avoids introducing a new simultaneous lock
nesting pattern.
Log reconnect failures from check_new_map() and mds_peer_reset() at
pr_warn level rather than pr_err, since return codes like -ESTALE
(closed/rejected session) and -ENOENT (unregistered session) are
expected during concurrent teardown. Log dropped messages for
unregistered sessions via doutc() (dynamic debug) rather than
pr_info, as post-reset message arrival is routine and does not
warrant unconditional logging.
Signed-off-by: Alex Markuze <amarkuze@xxxxxxxxxx>
---
fs/ceph/mds_client.c | 163 +++++++++++++++++++++++++++++++++++++++----
1 file changed, 151 insertions(+), 12 deletions(-)
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 871f0eef468d..b14ede808436 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -4416,9 +4416,14 @@ static void handle_session(struct ceph_mds_session *session,
break;
case CEPH_SESSION_REJECT:
- WARN_ON(session->s_state != CEPH_MDS_SESSION_OPENING);
- pr_info_client(cl, "mds%d rejected session\n",
- session->s_mds);
+ WARN_ON(session->s_state != CEPH_MDS_SESSION_OPENING &&
+ session->s_state != CEPH_MDS_SESSION_RECONNECTING);
+ if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
+ pr_info_client(cl, "mds%d reconnect rejected\n",
+ session->s_mds);
+ else
+ pr_info_client(cl, "mds%d rejected session\n",
+ session->s_mds);
session->s_state = CEPH_MDS_SESSION_REJECTED;
cleanup_session_requests(mdsc, session);
remove_session_caps(session);
@@ -4678,6 +4683,14 @@ static int reconnect_caps_cb(struct inode *inode, int mds, void *arg)
cap->mseq = 0; /* and migrate_seq */
cap->cap_gen = atomic_read(&cap->session->s_cap_gen);
+ /*
+ * Note: CEPH_I_ERROR_FILELOCK is not set during reconnect.
+ * Instead, locks are submitted for best-effort MDS reclaim
+ * via the flock_len field below. If reclaim fails (e.g.,
+ * another client grabbed a conflicting lock), future lock
+ * operations will fail and set the error flag at that point.
+ */
+
/* These are lost when the session goes away */
if (S_ISDIR(inode->i_mode)) {
if (cap->issued & CEPH_CAP_DIR_CREATE) {
@@ -4892,13 +4905,14 @@ static int encode_snap_realms(struct ceph_mds_client *mdsc,
*
* This is a relatively heavyweight operation, but it's rare.
*/
-static void send_mds_reconnect(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
+static int send_mds_reconnect(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
{
struct ceph_client *cl = mdsc->fsc->client;
struct ceph_msg *reply;
int mds = session->s_mds;
int err = -ENOMEM;
+ int old_state;
struct ceph_reconnect_state recon_state = {
.session = session,
};
@@ -4917,6 +4931,31 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
xa_destroy(&session->s_delegated_inos);
mutex_lock(&session->s_mutex);
+ if (session->s_state == CEPH_MDS_SESSION_CLOSED ||
+ session->s_state == CEPH_MDS_SESSION_REJECTED) {
+ pr_info_client(cl, "mds%d skipping reconnect, session %s\n",
+ mds,
+ ceph_session_state_name(session->s_state));
+ mutex_unlock(&session->s_mutex);
+ ceph_msg_put(reply);
+ err = -ESTALE;
+ goto fail_return;
+ }
+
+ mutex_lock(&mdsc->mutex);
+ if (mds >= mdsc->max_sessions || mdsc->sessions[mds] != session) {
+ mutex_unlock(&mdsc->mutex);
+ pr_info_client(cl,
+ "mds%d skipping reconnect, session unregistered\n",
+ mds);
+ mutex_unlock(&session->s_mutex);
+ ceph_msg_put(reply);
+ err = -ENOENT;
+ goto fail_return;
+ }
+ mutex_unlock(&mdsc->mutex);
+
+ old_state = session->s_state;
session->s_state = CEPH_MDS_SESSION_RECONNECTING;
session->s_seq = 0;
@@ -5046,18 +5085,34 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
up_read(&mdsc->snap_rwsem);
ceph_pagelist_release(recon_state.pagelist);
- return;
+ return 0;
fail:
ceph_msg_put(reply);
up_read(&mdsc->snap_rwsem);
+ /*
+ * Restore prior session state so map-driven reconnect logic
+ * (check_new_map) can retry. Without this, a transient build
+ * failure strands the session in RECONNECTING indefinitely.
+ */
+ session->s_state = old_state;
mutex_unlock(&session->s_mutex);
fail_nomsg:
ceph_pagelist_release(recon_state.pagelist);
fail_nopagelist:
pr_err_client(cl, "error %d preparing reconnect for mds%d\n",
err, mds);
- return;
+ return err;
+
+fail_return:
+ /*
+ * Early-exit path for expected concurrent-teardown races
+ * (-ESTALE for closed/rejected sessions, -ENOENT for
+ * unregistered sessions). Skip the pr_err_client diagnostic
+ * since these are not genuine reconnect build failures.
+ */
+ ceph_pagelist_release(recon_state.pagelist);
+ return err;
}
@@ -5138,9 +5193,15 @@ static void check_new_map(struct ceph_mds_client *mdsc,
*/
if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
newstate >= CEPH_MDS_STATE_RECONNECT) {
+ int rc;
+
mutex_unlock(&mdsc->mutex);
clear_bit(i, targets);
- send_mds_reconnect(mdsc, s);
+ rc = send_mds_reconnect(mdsc, s);
+ if (rc)
+ pr_warn_client(cl,
+ "mds%d reconnect failed: %d\n",
+ i, rc);
mutex_lock(&mdsc->mutex);
}
@@ -5204,7 +5265,11 @@ static void check_new_map(struct ceph_mds_client *mdsc,
}
doutc(cl, "send reconnect to export target mds.%d\n", i);
mutex_unlock(&mdsc->mutex);
- send_mds_reconnect(mdsc, s);
+ err = send_mds_reconnect(mdsc, s);
+ if (err)
+ pr_warn_client(cl,
+ "mds%d export target reconnect failed: %d\n",
+ i, err);
ceph_put_mds_session(s);
mutex_lock(&mdsc->mutex);
}
@@ -6284,12 +6349,84 @@ static void mds_peer_reset(struct ceph_connection *con)
{
struct ceph_mds_session *s = con->private;
struct ceph_mds_client *mdsc = s->s_mdsc;
+ int session_state;
pr_warn_client(mdsc->fsc->client, "mds%d closed our session\n",
s->s_mds);
- if (READ_ONCE(mdsc->fsc->mount_state) != CEPH_MOUNT_FENCE_IO &&
- ceph_mdsmap_get_state(mdsc->mdsmap, s->s_mds) >= CEPH_MDS_STATE_RECONNECT)
- send_mds_reconnect(mdsc, s);
+
+ if (READ_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_FENCE_IO ||
+ ceph_mdsmap_get_state(mdsc->mdsmap, s->s_mds) < CEPH_MDS_STATE_RECONNECT)
+ return;
+
+ if (ceph_mdsmap_get_state(mdsc->mdsmap, s->s_mds) == CEPH_MDS_STATE_RECONNECT) {
+ int rc = send_mds_reconnect(mdsc, s);
+
+ if (rc)
+ pr_warn_client(mdsc->fsc->client,
+ "mds%d reconnect failed: %d\n",
+ s->s_mds, rc);
+ return;
+ }
+
+ /*
+ * MDS is active (past RECONNECT). It will not accept a
+ * CLIENT_RECONNECT from us, so tear the session down locally
+ * and let new requests re-open a fresh session.
+ *
+ * Snapshot session state with READ_ONCE, then revalidate under
+ * mdsc->mutex before acting. The subsequent mdsc->mutex
+ * section rechecks s_state to catch concurrent transitions, so
+ * the lockless snapshot here is safe. s->s_mutex is taken
+ * separately for cleanup after unregistration, which avoids
+ * introducing a new s->s_mutex + mdsc->mutex nesting.
+ */
+ session_state = READ_ONCE(s->s_state);
+
+ switch (session_state) {
+ case CEPH_MDS_SESSION_RESTARTING:
+ case CEPH_MDS_SESSION_RECONNECTING:
+ case CEPH_MDS_SESSION_CLOSING:
+ case CEPH_MDS_SESSION_OPEN:
+ case CEPH_MDS_SESSION_HUNG:
+ case CEPH_MDS_SESSION_OPENING:
+ mutex_lock(&mdsc->mutex);
+ if (s->s_mds >= mdsc->max_sessions ||
+ mdsc->sessions[s->s_mds] != s ||
+ s->s_state != session_state) {
+ pr_info_client(mdsc->fsc->client,
+ "mds%d state changed to %s during peer reset\n",
+ s->s_mds,
+ ceph_session_state_name(s->s_state));
+ mutex_unlock(&mdsc->mutex);
+ return;
+ }
+
+ ceph_get_mds_session(s);
+ s->s_state = CEPH_MDS_SESSION_CLOSED;
+ __unregister_session(mdsc, s);
+ __wake_requests(mdsc, &s->s_waiting);
+ mutex_unlock(&mdsc->mutex);
+
+ mutex_lock(&s->s_mutex);
+ cleanup_session_requests(mdsc, s);
+ remove_session_caps(s);
+ mutex_unlock(&s->s_mutex);
+
+ wake_up_all(&mdsc->session_close_wq);
+
+ mutex_lock(&mdsc->mutex);
+ kick_requests(mdsc, s->s_mds);
+ mutex_unlock(&mdsc->mutex);
+
+ ceph_put_mds_session(s);
+ break;
+ default:
+ pr_warn_client(mdsc->fsc->client,
+ "mds%d peer reset in unexpected state %s\n",
+ s->s_mds,
+ ceph_session_state_name(session_state));
+ break;
+ }
}
static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg)
@@ -6301,6 +6438,8 @@ static void mds_dispatch(struct ceph_connection *con, struct ceph_msg *msg)
mutex_lock(&mdsc->mutex);
if (__verify_registered_session(mdsc, s) < 0) {
+ doutc(cl, "dropping tid %llu from unregistered session %d\n",
+ le64_to_cpu(msg->hdr.tid), s->s_mds);
mutex_unlock(&mdsc->mutex);
goto out;
}
--
2.34.1