Re: [EXTERNAL] [PATCH v2 4/7] ceph: add diagnostic timeout loop to wait_caps_flush()

From: Viacheslav Dubeyko

Date: Thu Apr 16 2026 - 15:56:26 EST


On Wed, 2026-04-15 at 17:00 +0000, Alex Markuze wrote:
> Convert wait_caps_flush() from a silent indefinite wait into a diagnostic
> wait loop that periodically dumps pending cap flush state.
>
> The underlying wait semantics remain intact: callers still wait until the
> requested cap flushes complete. The difference is that long stalls now
> produce actionable diagnostics instead of looking like a silent hang.
>
> CEPH_CAP_FLUSH_MAX_DUMP_COUNT bounds the diagnostics in two ways:
> it limits the number of entries emitted per diagnostic dump, and it
> limits the number of timed diagnostic dumps before the wait continues
> silently.
>
> READ_ONCE is used for the i_last_cap_flush_ack field, which is read
> outside the inode lock domain.
>
> Add a ci pointer to struct ceph_cap_flush so that the diagnostic
> dump can identify which inode each pending flush belongs to. The
> new i_last_cap_flush_ack field tracks the latest acknowledged flush
> tid per inode for diagnostic correlation.
>
> This improves reset-drain observability and is also useful for
> existing sync and writeback troubleshooting paths.
>
> Signed-off-by: Alex Markuze <amarkuze@xxxxxxxxxx>
> ---
> fs/ceph/caps.c | 5 ++++
> fs/ceph/inode.c | 1 +
> fs/ceph/mds_client.c | 56 ++++++++++++++++++++++++++++++++++++++++----
> fs/ceph/super.h | 6 +++++
> 4 files changed, 64 insertions(+), 4 deletions(-)
>
> diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
> index cb9e78b713d9..c40175dd77ae 100644
> --- a/fs/ceph/caps.c
> +++ b/fs/ceph/caps.c
> @@ -1648,6 +1648,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
>
> spin_lock(&mdsc->cap_dirty_lock);
> capsnap->cap_flush.tid = ++mdsc->last_cap_flush_tid;
> + capsnap->cap_flush.ci = ci;
> list_add_tail(&capsnap->cap_flush.g_list,
> &mdsc->cap_flush_list);
> if (oldest_flush_tid == 0)
> @@ -1846,6 +1847,7 @@ struct ceph_cap_flush *ceph_alloc_cap_flush(void)
> return NULL;
>
> cf->is_capsnap = false;
> + cf->ci = NULL;
> return cf;
> }
>
> @@ -1931,6 +1933,7 @@ static u64 __mark_caps_flushing(struct inode *inode,
> doutc(cl, "%p %llx.%llx now !dirty\n", inode, ceph_vinop(inode));
>
> swap(cf, ci->i_prealloc_cap_flush);
> + cf->ci = ci;
> cf->caps = flushing;
> cf->wake = wake;
>
> @@ -3826,6 +3829,8 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
> bool wake_ci = false;
> bool wake_mdsc = false;
>
> + WRITE_ONCE(ci->i_last_cap_flush_ack, flush_tid);

Could acks ever arrive out of order? If so, then i_last_cap_flush_ack might not
contain the maximum value. Do we need to use max() here?

> +
> list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) {
> /* Is this the one that was flushed? */
> if (cf->tid == flush_tid)
> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> index f75d66760d54..de465c7e96e8 100644
> --- a/fs/ceph/inode.c
> +++ b/fs/ceph/inode.c
> @@ -670,6 +670,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
> INIT_LIST_HEAD(&ci->i_cap_snaps);
> ci->i_head_snapc = NULL;
> ci->i_snap_caps = 0;
> + ci->i_last_cap_flush_ack = 0;
>
> ci->i_last_rd = ci->i_last_wr = jiffies - 3600 * HZ;
> for (i = 0; i < CEPH_FILE_MODE_BITS; i++)
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index b14ede808436..7d17332d72d7 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -27,6 +27,8 @@
> #include <trace/events/ceph.h>
>
> #define RECONNECT_MAX_SIZE (INT_MAX - PAGE_SIZE)
> +#define CEPH_CAP_FLUSH_WAIT_TIMEOUT_SEC 60
> +#define CEPH_CAP_FLUSH_MAX_DUMP_COUNT 5
>
> /*
> * A cluster of MDS (metadata server) daemons is responsible for
> @@ -2286,19 +2288,65 @@ static int check_caps_flush(struct ceph_mds_client *mdsc,
> }
>
> /*
> - * flush all dirty inode data to disk.
> + * Dump pending cap flushes for diagnostic purposes.
> *
> - * returns true if we've flushed through want_flush_tid
> + * cf->ci is safe to dereference here because the cap_dirty_lock is
> + * held, and cap_flush entries are removed from the global
> + * cap_flush_list under the same lock in the purge path.
> + */
> +static void dump_cap_flushes(struct ceph_mds_client *mdsc, u64 want_tid)
> +{
> + struct ceph_client *cl = mdsc->fsc->client;
> + struct ceph_cap_flush *cf;
> + int dumped = 0;
> +
> + pr_info_client(cl, "still waiting for cap flushes through %llu:\n",
> + want_tid);
> + spin_lock(&mdsc->cap_dirty_lock);
> + list_for_each_entry(cf, &mdsc->cap_flush_list, g_list) {
> + if (cf->tid > want_tid)
> + break;
> + if (++dumped > CEPH_CAP_FLUSH_MAX_DUMP_COUNT)
> + break;

We break silently if dumped exceeds CEPH_CAP_FLUSH_MAX_DUMP_COUNT, but there
could be more items in the list. Should we indicate that the dump is truncated?

> + if (!cf->ci) {

Is this a normal situation? Could such a condition occur as part of the regular
workflow? Maybe we need WARN_ON_ONCE(!cf->ci) here instead?

> + pr_info_client(cl,
> + "(null ci) %s tid=%llu wake=%d%s\n",
> + ceph_cap_string(cf->caps), cf->tid,
> + cf->wake,
> + cf->is_capsnap ? " is_capsnap" : "");
> + continue;
> + }
> + pr_info_client(cl,
> + "%llx:%llx %s tid=%llu last_ack=%llu wake=%d%s\n",
> + ceph_vinop(&cf->ci->netfs.inode),
> + ceph_cap_string(cf->caps), cf->tid,
> + READ_ONCE(cf->ci->i_last_cap_flush_ack),
> + cf->wake,
> + cf->is_capsnap ? " is_capsnap" : "");

Multiple pr_info_client() calls happen while cap_dirty_lock is held. I don't
think that is a good idea at all. Could we rework this somehow?

> + }
> + spin_unlock(&mdsc->cap_dirty_lock);
> +}
> +
> +/*
> + * Wait for all cap flushes through @want_flush_tid to complete.
> + * Periodically dumps pending cap flush state for diagnostics.
> */
> static void wait_caps_flush(struct ceph_mds_client *mdsc,
> u64 want_flush_tid)
> {
> struct ceph_client *cl = mdsc->fsc->client;
> + int i = 0;
> + long ret;
>
> doutc(cl, "want %llu\n", want_flush_tid);
>
> - wait_event(mdsc->cap_flushing_wq,
> - check_caps_flush(mdsc, want_flush_tid));
> + do {
> + ret = wait_event_timeout(mdsc->cap_flushing_wq,
> + check_caps_flush(mdsc, want_flush_tid),
> + CEPH_CAP_FLUSH_WAIT_TIMEOUT_SEC * HZ);

Are we safe here? Could the CEPH_CAP_FLUSH_WAIT_TIMEOUT_SEC * HZ calculation
overflow?

Thanks,
Slava.

> + if (ret == 0 && i++ < CEPH_CAP_FLUSH_MAX_DUMP_COUNT)
> + dump_cap_flushes(mdsc, want_flush_tid);
> + } while (ret == 0);
>
> doutc(cl, "ok, flushed thru %llu\n", want_flush_tid);
> }
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index c89ad8dcc969..1f901b1647e6 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -238,6 +238,7 @@ struct ceph_cap_flush {
> bool is_capsnap; /* true means capsnap */
> struct list_head g_list; // global
> struct list_head i_list; // per inode
> + struct ceph_inode_info *ci;
> };
>
> /*
> @@ -443,6 +444,11 @@ struct ceph_inode_info {
> struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 or
> dirty|flushing caps */
> unsigned i_snap_caps; /* cap bits for snapped files */
> + /*
> + * Written under i_ceph_lock, read via READ_ONCE()
> + * from diagnostic paths.
> + */
> + u64 i_last_cap_flush_ack;
>
> unsigned long i_last_rd;
> unsigned long i_last_wr;