[PATCH v1 11/13] libceph: reset source address on persistent EADDRNOTAVAIL

From: Ionut Nechita (Wind River)

Date: Thu Mar 12 2026 - 04:20:07 EST

From: Ionut Nechita <ionut.nechita@xxxxxxxxxxxxx>

In containerized environments (e.g., Rook-Ceph with Calico CNI), the
kernel CephFS client's source address (msgr->inst.addr) is learned from
the first successful monitor connection via process_hello() (msgr2) or
process_banner() (msgr1). If that initial connection was made through a
transient CNI pod address (e.g., a Calico-assigned dead:beef::...
address from a CSI plugin pod), that address is stored permanently in
inst.addr.

When the pod is later rescheduled or the CNI reconfigures networking,
the original pod address is removed and Calico installs a blackhole
route for the old address range. All subsequent kernel socket
connections fail with EADDRNOTAVAIL at ip6_dst_lookup_flow() before
even sending a TCP SYN, because the IPv6 source address selection
finds the blackhole route for the old address range.

This creates a permanent failure loop from which the client cannot recover:
- All connections (mon, mds, osd) fail with EADDRNOTAVAIL
- The client cannot reach any monitor to re-learn its address
- inst.addr is never blank again (set once, never cleared)
- The only recovery is force-unmounting and remounting

Fix this by tracking consecutive EADDRNOTAVAIL failures across all
connections using an atomic counter in struct ceph_messenger. After
ADDRNOTAVAIL_RESET_THRESHOLD (30) consecutive failures (~3 seconds for a
single connection at the 100ms retry interval, sooner when several
connections are failing at once), zero the address portion of
inst.addr.in_addr so that it is blank, while preserving the address
family, nonce and type. This allows process_hello() (msgr2) or
process_banner() (msgr1) to re-learn the source address from the next
successful monitor connection, which will use the current stable host
address instead of the defunct pod address.

The counter is reset to zero when:
- A TCP connection succeeds (in ceph_tcp_connect)
- The address is successfully re-learned (in process_hello/
process_banner)

Observed in production (kernel 6.12.0-1-rt-amd64, Ceph Reef
18.2.2->18.2.5 upgrade, IPv6-only cluster):
- Client instance: client.55136 [dead:beef::a2bf:c94c:345d:bc66]:0
- Address dead:beef::a2bf:c94c:345d:bc66 was a Calico pod address
- After pod reschedule: blackhole dead:beef::a2bf:c94c:345d:bc40/122
- All connections stuck in EADDRNOTAVAIL loop for 16+ hours
- After force-unmount + remount: new client got stable host address
[aefd::2b93:d245:fd09:127e]:0 and worked immediately

Signed-off-by: Ionut Nechita <ionut.nechita@xxxxxxxxxxxxx>
---
include/linux/ceph/messenger.h | 20 +++++++++++++
net/ceph/messenger.c | 51 ++++++++++++++++++++++++++++++++++
net/ceph/messenger_v1.c | 7 +++++
net/ceph/messenger_v2.c | 12 ++++++++
4 files changed, 90 insertions(+)

diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 730a754353aed..d8f7946d85a68 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -113,6 +113,17 @@ struct ceph_messenger {
*/
u32 global_seq;
spinlock_t global_seq_lock;
+
+ /*
+ * Track consecutive EADDRNOTAVAIL failures across all
+ * connections. When this exceeds a threshold, the client's
+ * inst.addr is reset to blank so that process_hello() will
+ * re-learn the source address from the next successful
+ * monitor connection. This handles the case where the
+ * original source address was a transient CNI pod address
+ * that no longer exists.
+ */
+ atomic_t addr_notavail_count;
};

enum ceph_msg_data_type {
@@ -328,6 +339,15 @@ struct ceph_msg {
*/
#define ADDRNOTAVAIL_DELAY (HZ / 10)

+/*
+ * Number of consecutive EADDRNOTAVAIL failures (summed across all
+ * connections of a messenger) before its source address is reset.
+ * A single connection retrying every ~100ms reaches 30 in ~3 seconds;
+ * several failing connections reach it sooner. At that point the
+ * address is presumed permanently gone (e.g., a removed CNI pod address).
+ */
+#define ADDRNOTAVAIL_RESET_THRESHOLD 30
+
struct ceph_connection_v1_info {
struct kvec out_kvec[8], /* sending header/footer data */
*out_kvec_cur;
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index c40c7c332e7f4..8165e6a8fe092 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -497,6 +497,10 @@ int ceph_tcp_connect(struct ceph_connection *con)
else
con->v1.addr_notavail = false;

+ /* Reset the persistent EADDRNOTAVAIL counter on success */
+ if (atomic_read(&con->msgr->addr_notavail_count) > 0)
+ atomic_set(&con->msgr->addr_notavail_count, 0);
+
return 0;
}

@@ -1663,6 +1667,52 @@ static void con_fault(struct ceph_connection *con)
}
}

+ /*
+ * Track persistent EADDRNOTAVAIL across all connections.
+ * If the source address stored in msgr->inst.addr is no longer
+ * valid (e.g., it was a transient CNI pod address that has been
+ * removed), all connections will fail with EADDRNOTAVAIL at
+ * ip6_dst_lookup_flow() before even sending a SYN.
+ *
+ * After ADDRNOTAVAIL_RESET_THRESHOLD consecutive failures,
+ * reset inst.addr to blank so that process_hello() will
+ * re-learn the source address from the next successful
+ * monitor connection. The nonce is preserved.
+ */
+ if (addr_issue) {
+ int count = atomic_inc_return(&con->msgr->addr_notavail_count);
+
+ if (count == ADDRNOTAVAIL_RESET_THRESHOLD) {
+ struct ceph_entity_addr *my_addr =
+ &con->msgr->inst.addr;
+
+ pr_warn("libceph: %d consecutive EADDRNOTAVAIL errors, resetting source address %s (will re-learn from monitor)\n",
+ count, ceph_pr_addr(my_addr));
+
+ /*
+ * Zero out the address portion of in_addr but
+ * preserve ss_family, nonce, and type so the
+ * client identity is maintained and debug output
+ * remains readable. process_hello() checks
+ * ceph_addr_is_blank() and will fill in the new
+ * address from the monitor's addr_for_me response.
+ *
+ * We preserve ss_family so that ceph_pr_addr()
+ * shows e.g. "[::]:0" instead of
+ * "(unknown sockaddr family 0)".
+ */
+ {
+ sa_family_t family =
+ get_unaligned(&my_addr->in_addr.ss_family);
+ memset(&my_addr->in_addr, 0,
+ sizeof(my_addr->in_addr));
+ put_unaligned(family,
+ &my_addr->in_addr.ss_family);
+ }
+ ceph_encode_my_addr(con->msgr);
+ }
+ }
+
WARN_ON(con->state == CEPH_CON_S_STANDBY ||
con->state == CEPH_CON_S_CLOSED);

@@ -1740,6 +1790,7 @@ void ceph_messenger_init(struct ceph_messenger *msgr,
ceph_encode_my_addr(msgr);

atomic_set(&msgr->stopping, 0);
+ atomic_set(&msgr->addr_notavail_count, 0);
write_pnet(&msgr->net, get_net(current->nsproxy->net_ns));

dout("%s %p\n", __func__, msgr);
diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c
index 0cb61c76b9b87..4f3868f296c06 100644
--- a/net/ceph/messenger_v1.c
+++ b/net/ceph/messenger_v1.c
@@ -736,6 +736,13 @@ static int process_banner(struct ceph_connection *con)
ceph_encode_my_addr(con->msgr);
dout("process_banner learned my addr is %s\n",
ceph_pr_addr(my_addr));
+
+ if (atomic_read(&con->msgr->addr_notavail_count) > 0) {
+ pr_info("libceph: re-learned source address %s from peer %s\n",
+ ceph_pr_addr(my_addr),
+ ceph_pr_addr(&con->peer_addr));
+ atomic_set(&con->msgr->addr_notavail_count, 0);
+ }
}

return 0;
diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c
index bd608ffa06279..12ad9f571dcca 100644
--- a/net/ceph/messenger_v2.c
+++ b/net/ceph/messenger_v2.c
@@ -2260,6 +2260,18 @@ static int process_hello(struct ceph_connection *con, void *p, void *end)
dout("%s con %p set my addr %s, as seen by peer %s\n",
__func__, con, ceph_pr_addr(my_addr),
ceph_pr_addr(&con->peer_addr));
+
+ /*
+ * If we re-learned the address after a reset due to
+ * persistent EADDRNOTAVAIL, log it and clear the
+ * failure counter.
+ */
+ if (atomic_read(&con->msgr->addr_notavail_count) > 0) {
+ pr_info("libceph: re-learned source address %s from monitor %s\n",
+ ceph_pr_addr(my_addr),
+ ceph_pr_addr(&con->peer_addr));
+ atomic_set(&con->msgr->addr_notavail_count, 0);
+ }
} else {
dout("%s con %p my addr already set %s\n",
__func__, con, ceph_pr_addr(my_addr));
--
2.53.0