[PATCH v1 10/13] ceph: force mdsmap refresh on persistent MDS connection failures
From: Ionut Nechita (Wind River)
Date: Thu Mar 12 2026 - 04:21:06 EST
From: Ionut Nechita <ionut.nechita@xxxxxxxxxxxxx>
During rolling upgrades in containerized environments (e.g.,
rook-ceph deployments), MDS daemons are restarted and
may receive new IP addresses from the CNI plugin. The kernel CephFS
client (libceph) maintains a cached mdsmap with the old MDS address
and attempts to reconnect indefinitely.
The monitor client subscribes to mdsmap updates with
start=current_epoch+1, expecting the monitor to push new maps.
However, if the monitor connection was also disrupted during the
upgrade (e.g., due to EADDRNOTAVAIL from IPv6 DAD), the subscription
may not be properly re-established, leaving the client with a stale
mdsmap.
This results in an indefinite hang:
- The kernel client retries connecting to the old MDS address forever
- The MDS connection has no .fault callback, so the MDS client is
never notified of persistent connection failures
- The stale mdsmap is never refreshed because the client believes
its subscription is active
- New pod mounts via CSI hang in ContainerCreating state
- The rook-ceph upgrade cannot complete
Observed in production (kernel 6.12.0-1-rt-amd64,
Ceph Reef 18.2.2->18.2.5 upgrade):
- mdsmap stuck at epoch 53 while cluster was at epoch 68
- MDS session state: hung
- monc showed: have mdsmap 53 want 54+
- MDS address changed from dead:beef::...eb75 to dead:beef::...bc76
- Client kept retrying on old address for 30+ minutes
Fix this by:
1. Adding a .fault callback to the MDS connection operations
(mds_con_ops) so the MDS client is notified when connections fail
2. Tracking consecutive connection failures per MDS session via a
new s_con_failures counter
3. When failures exceed MDS_CON_FAIL_REFRESH_MDSMAP (10 consecutive
failures, ~2.5-15 seconds depending on backoff), forcing a fresh
mdsmap subscription with start=0 to get the complete current map
4. Resetting the failure counter when a session message is
successfully received (in handle_session)
Signed-off-by: Ionut Nechita <ionut.nechita@xxxxxxxxxxxxx>
---
fs/ceph/mds_client.c | 73 ++++++++++++++++++++++++++++++++++++++++++++
fs/ceph/mds_client.h | 2 ++
2 files changed, 75 insertions(+)
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index ac86225595b5f..0e766880056c0 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -66,6 +66,12 @@ static void ceph_cap_release_work(struct work_struct *work);
static void ceph_cap_reclaim_work(struct work_struct *work);
static const struct ceph_connection_operations mds_con_ops;
+/*
+ * Number of consecutive MDS connection failures before forcing
+ * a fresh mdsmap subscription. This handles stale mdsmap scenarios
+ * during rolling upgrades where MDS addresses change.
+ */
+#define MDS_CON_FAIL_REFRESH_MDSMAP 10
/*
@@ -997,6 +1003,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
s->s_mdsc = mdsc;
s->s_mds = mds;
s->s_state = CEPH_MDS_SESSION_NEW;
+ s->s_con_failures = 0;
mutex_init(&s->s_mutex);
ceph_con_init(&s->s_con, s, &mds_con_ops, &mdsc->fsc->client->msgr);
@@ -4341,6 +4348,9 @@ static void handle_session(struct ceph_mds_session *session,
ceph_session_op_name(op), session,
ceph_session_state_name(session->s_state), seq);
+ /* Reset connection failure counter on successful session message */
+ session->s_con_failures = 0;
+
if (session->s_state == CEPH_MDS_SESSION_HUNG) {
session->s_state = CEPH_MDS_SESSION_OPEN;
pr_info_client(cl, "mds%d came back\n", session->s_mds);
@@ -5427,6 +5437,22 @@ bool check_session_state(struct ceph_mds_session *s)
if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
s->s_state = CEPH_MDS_SESSION_HUNG;
pr_info_client(cl, "mds%d hung\n", s->s_mds);
+
+ /*
+ * Force a fresh mdsmap subscription when a session
+ * becomes hung. The MDS may have restarted with a
+ * new address during a rolling upgrade, and the
+ * connection may have entered STANDBY state (no
+ * .fault callback) rather than generating connect
+ * errors. Requesting mdsmap from epoch 0 ensures
+ * we get the current map with updated addresses.
+ */
+ pr_warn_client(cl,
+ "mds%d hung, requesting fresh mdsmap\n",
+ s->s_mds);
+ if (ceph_monc_want_map(&s->s_mdsc->fsc->client->monc,
+ CEPH_SUB_MDSMAP, 0, true))
+ ceph_monc_renew_subs(&s->s_mdsc->fsc->client->monc);
}
break;
case CEPH_MDS_SESSION_CLOSING:
@@ -6528,12 +6554,59 @@ static int mds_check_message_signature(struct ceph_msg *msg)
return ceph_auth_check_message_signature(auth, msg);
}
+/*
+ * Handle MDS connection fault.
+ *
+ * Track consecutive connection failures and force a fresh mdsmap
+ * subscription when failures exceed the threshold. This handles the
+ * case where the MDS address has changed (e.g., during a rolling
+ * upgrade) but the client has a stale mdsmap and keeps retrying
+ * on the old address.
+ */
+static void mds_fault(struct ceph_connection *con)
+{
+ struct ceph_mds_session *s = con->private;
+ struct ceph_mds_client *mdsc = s->s_mdsc;
+ struct ceph_client *cl = mdsc->fsc->client;
+ int failures;
+
+ failures = ++s->s_con_failures;
+
+ if (failures == MDS_CON_FAIL_REFRESH_MDSMAP) {
+ pr_warn_client(cl,
+ "mds%d connection failed %d times, requesting fresh mdsmap\n",
+ s->s_mds, failures);
+
+ /*
+ * Force a fresh mdsmap subscription by requesting from
+ * epoch 0. This ensures we get the complete current map
+ * with up-to-date MDS addresses, rather than waiting for
+ * an incremental update that may never arrive if our
+ * subscription was lost during a monitor reconnection.
+ */
+ if (ceph_monc_want_map(&mdsc->fsc->client->monc,
+ CEPH_SUB_MDSMAP, 0, true))
+ ceph_monc_renew_subs(&mdsc->fsc->client->monc);
+ } else if (failures > MDS_CON_FAIL_REFRESH_MDSMAP &&
+ failures % MDS_CON_FAIL_REFRESH_MDSMAP == 0) {
+ /*
+ * Periodically retry the fresh mdsmap request in case
+ * the previous one was lost or the monitor was also
+ * temporarily unavailable.
+ */
+ if (ceph_monc_want_map(&mdsc->fsc->client->monc,
+ CEPH_SUB_MDSMAP, 0, true))
+ ceph_monc_renew_subs(&mdsc->fsc->client->monc);
+ }
+}
+
static const struct ceph_connection_operations mds_con_ops = {
.get = mds_get_con,
.put = mds_put_con,
.alloc_msg = mds_alloc_msg,
.dispatch = mds_dispatch,
.peer_reset = mds_peer_reset,
+ .fault = mds_fault,
.get_authorizer = mds_get_authorizer,
.add_authorizer_challenge = mds_add_authorizer_challenge,
.verify_authorizer_reply = mds_verify_authorizer_reply,
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 695c5a9c94026..44585b1cb4485 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -251,6 +251,8 @@ struct ceph_mds_session {
struct list_head s_waiting; /* waiting requests */
struct list_head s_unsafe; /* unsafe requests */
struct xarray s_delegated_inos;
+
+ int s_con_failures; /* consecutive connection failures */
};
/*
--
2.53.0