Re: [PATCH v1 02/13] ceph: add timeout protection to ceph_mdsc_sync() path

From: Viacheslav Dubeyko

Date: Thu Mar 12 2026 - 15:31:18 EST


On Thu, 2026-03-12 at 10:16 +0200, Ionut Nechita (Wind River) wrote:
> From: Ionut Nechita <ionut.nechita@xxxxxxxxxxxxx>
>
> When Ceph MDS becomes unreachable (e.g., due to IPv6 EADDRNOTAVAIL
> during DAD or network transitions), the sync syscall can block
> indefinitely in ceph_mdsc_sync(). The hung_task detector fires
> repeatedly (122s, 245s, 368s... up to 983+ seconds) with traces like:
> INFO: task sync:12345 blocked for more than 122 seconds.
> Call Trace:
> ceph_mdsc_sync+0x4d6/0x5a0 [ceph]
> ceph_sync_fs+0x31/0x130 [ceph]
> iterate_supers+0x97/0x100
> ksys_sync+0x32/0xb0
> Three functions in the MDS sync path use indefinite waits:
> 1. wait_caps_flush() uses wait_event() with no timeout
> 2. flush_mdlog_and_wait_mdsc_unsafe_requests() uses
> wait_for_completion() with no timeout
> 3. ceph_mdsc_sync() returns void, cannot propagate errors
> This is particularly problematic in containerized environments with
> PREEMPT_RT kernels where Ceph storage pods undergo rolling updates
> and IPv6 network reconfigurations cause temporary MDS unavailability.
> Fix this by adding mount_timeout-based timeouts (default 60s) to the
> blocking waits, following the existing pattern used by wait_requests()
> and ceph_mdsc_close_sessions() in the same file:
> - wait_caps_flush(): use wait_event_timeout() with mount_timeout
> - flush_mdlog_and_wait_mdsc_unsafe_requests(): use
> wait_for_completion_timeout() with mount_timeout
> - ceph_mdsc_sync(): change return type to int, propagate -ETIMEDOUT
> - ceph_sync_fs(): propagate error from ceph_mdsc_sync() to VFS
> On timeout, dirty caps and pending requests are NOT discarded - they
> remain in memory and are re-synced when MDS reconnects. The timeout
> simply unblocks the calling task. If mount_timeout is set to 0,
> ceph_timeout_jiffies() returns MAX_SCHEDULE_TIMEOUT, preserving the
> original infinite-wait behavior.
> Real-world impact: In production logs showing 'task sync blocked for
> more than 983 seconds', this patch limits the block to mount_timeout
> (60s default), returning -ETIMEDOUT to the VFS layer instead of
> hanging indefinitely.
> Fixes: 1b2ba3c5616e ("ceph: flush the mdlog for filesystem sync")
> Signed-off-by: Ionut Nechita <ionut.nechita@xxxxxxxxxxxxx>
> ---
> fs/ceph/mds_client.c | 50 ++++++++++++++++++++++++++++++++++----------
> fs/ceph/mds_client.h | 2 +-
> fs/ceph/super.c | 5 +++--
> 3 files changed, 43 insertions(+), 14 deletions(-)
>
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index df89d45f33a1f..37899464101f7 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -2296,17 +2296,26 @@ static int check_caps_flush(struct ceph_mds_client *mdsc,
> *
> * returns true if we've flushed through want_flush_tid
> */
> -static void wait_caps_flush(struct ceph_mds_client *mdsc,
> - u64 want_flush_tid)
> +static int wait_caps_flush(struct ceph_mds_client *mdsc,
> + u64 want_flush_tid)
> {
> struct ceph_client *cl = mdsc->fsc->client;
> + struct ceph_options *opts = mdsc->fsc->client->options;
> + long ret;
>
> doutc(cl, "want %llu\n", want_flush_tid);
>
> - wait_event(mdsc->cap_flushing_wq,
> - check_caps_flush(mdsc, want_flush_tid));
> + ret = wait_event_timeout(mdsc->cap_flushing_wq,
> + check_caps_flush(mdsc, want_flush_tid),
> + ceph_timeout_jiffies(opts->mount_timeout));

Technically speaking, opts->mount_timeout is a configurable option, and it can be
set to an arbitrarily long value. As a result, you could still hit the same issue
even with your solution. Maybe we need some sanity check on opts->mount_timeout?

> + if (!ret) {
> + pr_warn_client(cl, "cap flush timeout waiting for tid %llu\n",
> + want_flush_tid);

Now we will have these messages instead of the "process has been blocked" messages.
:) Do we really need to inform the user about this? Maybe a debug message here instead?

> + return -ETIMEDOUT;
> + }
>
> doutc(cl, "ok, flushed thru %llu\n", want_flush_tid);
> + return 0;
> }
>
> /*
> @@ -5838,13 +5847,15 @@ void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
> /*
> * flush the mdlog and wait for all write mds requests to flush.
> */
> -static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc,
> - u64 want_tid)
> +static int flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *mdsc,
> + u64 want_tid)
> {
> struct ceph_client *cl = mdsc->fsc->client;
> + struct ceph_options *opts = mdsc->fsc->client->options;
> struct ceph_mds_request *req = NULL, *nextreq;
> struct ceph_mds_session *last_session = NULL;
> struct rb_node *n;
> + unsigned long left;
>
> mutex_lock(&mdsc->mutex);
> doutc(cl, "want %lld\n", want_tid);
> @@ -5883,7 +5894,19 @@ static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *md
> }
> doutc(cl, "wait on %llu (want %llu)\n",
> req->r_tid, want_tid);
> - wait_for_completion(&req->r_safe_completion);
> + left = wait_for_completion_timeout(
> + &req->r_safe_completion,
> + ceph_timeout_jiffies(opts->mount_timeout));
> + if (!left) {
> + pr_warn_client(cl,
> + "flush mdlog request tid %llu timed out\n",
> + req->r_tid);
> + ceph_mdsc_put_request(req);
> + if (nextreq)
> + ceph_mdsc_put_request(nextreq);
> + ceph_put_mds_session(last_session);
> + return -ETIMEDOUT;

The same concerns apply here.

> + }
>
> mutex_lock(&mdsc->mutex);
> ceph_mdsc_put_request(req);
> @@ -5901,15 +5924,17 @@ static void flush_mdlog_and_wait_mdsc_unsafe_requests(struct ceph_mds_client *md
> mutex_unlock(&mdsc->mutex);
> ceph_put_mds_session(last_session);
> doutc(cl, "done\n");
> + return 0;
> }
>
> -void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
> +int ceph_mdsc_sync(struct ceph_mds_client *mdsc)
> {
> struct ceph_client *cl = mdsc->fsc->client;
> u64 want_tid, want_flush;
> + int ret;
>
> if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN)
> - return;
> + return -EIO;

Why -EIO here? As far as I can follow, we will retry the sync operation. Am I
correct? If so, it's not an I/O failure yet.

Thanks,
Slava.

>
> doutc(cl, "sync\n");
> mutex_lock(&mdsc->mutex);
> @@ -5930,8 +5955,11 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
>
> doutc(cl, "sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
>
> - flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid);
> - wait_caps_flush(mdsc, want_flush);
> + ret = flush_mdlog_and_wait_mdsc_unsafe_requests(mdsc, want_tid);
> + if (ret)
> + return ret;
> +
> + return wait_caps_flush(mdsc, want_flush);
> }
>
> /*
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index 0a602080d8ef6..695c5a9c94026 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -564,7 +564,7 @@ extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
> extern void ceph_mdsc_force_umount(struct ceph_mds_client *mdsc);
> extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
>
> -extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
> +extern int ceph_mdsc_sync(struct ceph_mds_client *mdsc);
>
> extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
> extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
> diff --git a/fs/ceph/super.c b/fs/ceph/super.c
> index b61074b377ac5..b52960402d68e 100644
> --- a/fs/ceph/super.c
> +++ b/fs/ceph/super.c
> @@ -122,6 +122,7 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
> {
> struct ceph_fs_client *fsc = ceph_sb_to_fs_client(sb);
> struct ceph_client *cl = fsc->client;
> + int ret;
>
> if (!wait) {
> doutc(cl, "(non-blocking)\n");
> @@ -133,9 +134,9 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
>
> doutc(cl, "(blocking)\n");
> ceph_osdc_sync(&fsc->client->osdc);
> - ceph_mdsc_sync(fsc->mdsc);
> + ret = ceph_mdsc_sync(fsc->mdsc);
> doutc(cl, "(blocking) done\n");
> - return 0;
> + return ret;
> }
>
> /*