[PATCH v2 4/7] ceph: add diagnostic timeout loop to wait_caps_flush()
From: Alex Markuze
Date: Wed Apr 15 2026 - 13:08:40 EST
Convert wait_caps_flush() from a silent indefinite wait into a diagnostic
wait loop that periodically dumps pending cap flush state.
The underlying wait semantics remain intact: callers still wait until the
requested cap flushes complete. The difference is that long stalls now
produce actionable diagnostics instead of looking like a silent hang.
CEPH_CAP_FLUSH_MAX_DUMP_COUNT bounds the diagnostics in two ways:
it limits the number of entries emitted per diagnostic dump, and it
limits the number of timed diagnostic dumps before the wait continues
silently.
WRITE_ONCE/READ_ONCE pairing is used for the i_last_cap_flush_ack field:
it is written under the inode lock in handle_cap_flush_ack() and read
outside the inode lock domain by the diagnostic dump.
Add a ci pointer to struct ceph_cap_flush so that the diagnostic
dump can identify which inode each pending flush belongs to. The
new i_last_cap_flush_ack field tracks the latest acknowledged flush
tid per inode for diagnostic correlation.
This improves reset-drain observability and is also useful for
existing sync and writeback troubleshooting paths.
Signed-off-by: Alex Markuze <amarkuze@xxxxxxxxxx>
---
fs/ceph/caps.c | 5 ++++
fs/ceph/inode.c | 1 +
fs/ceph/mds_client.c | 56 ++++++++++++++++++++++++++++++++++++++++----
fs/ceph/super.h | 6 +++++
4 files changed, 64 insertions(+), 4 deletions(-)
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index cb9e78b713d9..c40175dd77ae 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1648,6 +1648,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
spin_lock(&mdsc->cap_dirty_lock);
capsnap->cap_flush.tid = ++mdsc->last_cap_flush_tid;
+ capsnap->cap_flush.ci = ci;
list_add_tail(&capsnap->cap_flush.g_list,
&mdsc->cap_flush_list);
if (oldest_flush_tid == 0)
@@ -1846,6 +1847,7 @@ struct ceph_cap_flush *ceph_alloc_cap_flush(void)
return NULL;
cf->is_capsnap = false;
+ cf->ci = NULL;
return cf;
}
@@ -1931,6 +1933,7 @@ static u64 __mark_caps_flushing(struct inode *inode,
doutc(cl, "%p %llx.%llx now !dirty\n", inode, ceph_vinop(inode));
swap(cf, ci->i_prealloc_cap_flush);
+ cf->ci = ci;
cf->caps = flushing;
cf->wake = wake;
@@ -3826,6 +3829,8 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
bool wake_ci = false;
bool wake_mdsc = false;
+ WRITE_ONCE(ci->i_last_cap_flush_ack, flush_tid);
+
list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) {
/* Is this the one that was flushed? */
if (cf->tid == flush_tid)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index f75d66760d54..de465c7e96e8 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -670,6 +670,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
INIT_LIST_HEAD(&ci->i_cap_snaps);
ci->i_head_snapc = NULL;
ci->i_snap_caps = 0;
+ ci->i_last_cap_flush_ack = 0;
ci->i_last_rd = ci->i_last_wr = jiffies - 3600 * HZ;
for (i = 0; i < CEPH_FILE_MODE_BITS; i++)
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index b14ede808436..7d17332d72d7 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -27,6 +27,8 @@
#include <trace/events/ceph.h>
#define RECONNECT_MAX_SIZE (INT_MAX - PAGE_SIZE)
+#define CEPH_CAP_FLUSH_WAIT_TIMEOUT_SEC 60
+#define CEPH_CAP_FLUSH_MAX_DUMP_COUNT 5
/*
* A cluster of MDS (metadata server) daemons is responsible for
@@ -2286,19 +2288,65 @@ static int check_caps_flush(struct ceph_mds_client *mdsc,
}
/*
- * flush all dirty inode data to disk.
+ * Dump pending cap flushes for diagnostic purposes.
*
- * returns true if we've flushed through want_flush_tid
+ * cf->ci is safe to dereference here because the cap_dirty_lock is
+ * held, and cap_flush entries are removed from the global
+ * cap_flush_list under the same lock in the purge path.
+ */
+static void dump_cap_flushes(struct ceph_mds_client *mdsc, u64 want_tid)
+{
+ struct ceph_client *cl = mdsc->fsc->client;
+ struct ceph_cap_flush *cf;
+ int dumped = 0;
+
+ pr_info_client(cl, "still waiting for cap flushes through %llu:\n",
+ want_tid);
+ spin_lock(&mdsc->cap_dirty_lock);
+ list_for_each_entry(cf, &mdsc->cap_flush_list, g_list) {
+ if (cf->tid > want_tid)
+ break;
+ if (++dumped > CEPH_CAP_FLUSH_MAX_DUMP_COUNT)
+ break;
+ if (!cf->ci) {
+ pr_info_client(cl,
+ "(null ci) %s tid=%llu wake=%d%s\n",
+ ceph_cap_string(cf->caps), cf->tid,
+ cf->wake,
+ cf->is_capsnap ? " is_capsnap" : "");
+ continue;
+ }
+ pr_info_client(cl,
+ "%llx:%llx %s tid=%llu last_ack=%llu wake=%d%s\n",
+ ceph_vinop(&cf->ci->netfs.inode),
+ ceph_cap_string(cf->caps), cf->tid,
+ READ_ONCE(cf->ci->i_last_cap_flush_ack),
+ cf->wake,
+ cf->is_capsnap ? " is_capsnap" : "");
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+}
+
+/*
+ * Wait for all cap flushes through @want_flush_tid to complete.
+ * Periodically dumps pending cap flush state for diagnostics.
*/
static void wait_caps_flush(struct ceph_mds_client *mdsc,
u64 want_flush_tid)
{
struct ceph_client *cl = mdsc->fsc->client;
+ int i = 0;
+ long ret;
doutc(cl, "want %llu\n", want_flush_tid);
- wait_event(mdsc->cap_flushing_wq,
- check_caps_flush(mdsc, want_flush_tid));
+ do {
+ ret = wait_event_timeout(mdsc->cap_flushing_wq,
+ check_caps_flush(mdsc, want_flush_tid),
+ CEPH_CAP_FLUSH_WAIT_TIMEOUT_SEC * HZ);
+ if (ret == 0 && i++ < CEPH_CAP_FLUSH_MAX_DUMP_COUNT)
+ dump_cap_flushes(mdsc, want_flush_tid);
+ } while (ret == 0);
doutc(cl, "ok, flushed thru %llu\n", want_flush_tid);
}
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index c89ad8dcc969..1f901b1647e6 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -238,6 +238,7 @@ struct ceph_cap_flush {
bool is_capsnap; /* true means capsnap */
struct list_head g_list; // global
struct list_head i_list; // per inode
+ struct ceph_inode_info *ci;
};
/*
@@ -443,6 +444,11 @@ struct ceph_inode_info {
struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or
dirty|flushing caps */
unsigned i_snap_caps; /* cap bits for snapped files */
+ /*
+ * Written under i_ceph_lock, read via READ_ONCE()
+ * from diagnostic paths.
+ */
+ u64 i_last_cap_flush_ack;
unsigned long i_last_rd;
unsigned long i_last_wr;
--
2.34.1