Re: [PATCH 01/14] f2fs: add lock elapsed time trace facility for f2fs rwsemphore
From: Jaegeuk Kim
Date: Sun Jan 04 2026 - 00:28:12 EST
Stuck when disabling checkpoint:
[ 403.438233][ T739] __switch_to+0x164/0x324
[ 403.442532][ T739] __schedule+0xbd0/0x1d28
[ 403.446822][ T739] schedule+0x4c/0x118
[ 403.450758][ T739] schedule_preempt_disabled+0x24/0x44
[ 403.456083][ T739] rwsem_down_write_slowpath+0x438/0x988
[ 403.461588][ T739] down_write+0x10c/0x16c
[ 403.465785][ T739] f2fs_down_write_trace+0x1c/0x70
[ 403.470797][ T739] f2fs_disable_checkpoint+0x114/0x2b8
[ 403.476126][ T739] f2fs_fill_super+0x22c8/0x2728
[ 403.480924][ T739] get_tree_bdev_flags+0x144/0x1dc
[ 403.485894][ T739] get_tree_bdev+0x14/0x24
[ 403.490176][ T739] f2fs_get_tree+0x18/0x28
[ 403.494448][ T739] vfs_get_tree+0x48/0x10c
[ 403.498724][ T739] path_mount+0x62c/0xb74
On 01/04, Chao Yu wrote:
> This patch adds lock elapsed time trace facility for f2fs rwsemphore.
>
> If total elapsed time of critical region covered by lock exceeds a
> threshold, it will print tracepoint to dump information of lock related
> context, including:
> - thread information
> - CPU/IO priority
> - lock information
> - elapsed time
> - total time
> - running time (depend on CONFIG_64BIT)
> - runnable time (depend on CONFIG_SCHED_INFO and CONFIG_SCHEDSTATS)
> - io sleep time (depend on CONFIG_TASK_DELAY_ACCT and
> /proc/sys/kernel/task_delayacct)
> - other time (by default other time will account nonio sleep time,
> but, if above kconfig is not defined, other time will
> include runnable time and/or io sleep time as wll)
>
> output:
> f2fs_lock_elapsed_time: dev = (254,52), comm: sh, pid: 13855, prio: 120, ioprio_class: 2, ioprio_data: 4, lock_name: cp_rwsem, lock_type: rlock, total: 1000, running: 993, runnable: 2, io_sleep: 0, other: 5
>
> Signed-off-by: Chao Yu <chao@xxxxxxxxxx>
> ---
> fs/f2fs/checkpoint.c | 106 ++++++++++++++++++++++++++++++++++++
> fs/f2fs/f2fs.h | 51 +++++++++++++++--
> include/trace/events/f2fs.h | 68 +++++++++++++++++++++++
> 3 files changed, 221 insertions(+), 4 deletions(-)
>
> diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
> index 300664269eb6..bc6058a3122b 100644
> --- a/fs/f2fs/checkpoint.c
> +++ b/fs/f2fs/checkpoint.c
> @@ -14,6 +14,9 @@
> #include <linux/pagevec.h>
> #include <linux/swap.h>
> #include <linux/kthread.h>
> +#include <linux/delayacct.h>
> +#include <linux/ioprio.h>
> +#include <linux/math64.h>
>
> #include "f2fs.h"
> #include "node.h"
> @@ -21,6 +24,109 @@
> #include "iostat.h"
> #include <trace/events/f2fs.h>
>
> +static inline void get_lock_elapsed_time(struct f2fs_time_stat *ts)
> +{
> + ts->total_time = ktime_get();
> +#ifdef CONFIG_64BIT
> + ts->running_time = current->se.sum_exec_runtime;
> +#endif
> +#if defined(CONFIG_SCHED_INFO) && defined(CONFIG_SCHEDSTATS)
> + ts->runnable_time = current->sched_info.run_delay;
> +#endif
> +#ifdef CONFIG_TASK_DELAY_ACCT
> + if (current->delays)
> + ts->io_sleep_time = current->delays->blkio_delay;
> +#endif
> +}
> +
> +static inline void trace_lock_elapsed_time_start(struct f2fs_rwsem *sem,
> + struct f2fs_lock_context *lc)
> +{
> + lc->lock_trace = trace_f2fs_lock_elapsed_time_enabled();
> + if (!lc->lock_trace)
> + return;
> +
> + get_lock_elapsed_time(&lc->ts);
> +}
> +
> +static inline void trace_lock_elapsed_time_end(struct f2fs_rwsem *sem,
> + struct f2fs_lock_context *lc, bool is_write)
> +{
> + struct f2fs_time_stat tts;
> + unsigned long long total_time;
> + unsigned long long running_time = 0;
> + unsigned long long runnable_time = 0;
> + unsigned long long io_sleep_time = 0;
> + unsigned long long other_time = 0;
> + unsigned npm = NSEC_PER_MSEC;
> +
> + if (!lc->lock_trace)
> + return;
> +
> + get_lock_elapsed_time(&tts);
> +
> + total_time = div_u64(tts.total_time - lc->ts.total_time, npm);
> + if (total_time <= MAX_LOCK_ELAPSED_TIME)
> + return;
> +
> +#ifdef CONFIG_64BIT
> + running_time = div_u64(tts.running_time - lc->ts.running_time, npm);
> +#endif
> +#if defined(CONFIG_SCHED_INFO) && defined(CONFIG_SCHEDSTATS)
> + runnable_time = div_u64(tts.runnable_time - lc->ts.runnable_time, npm);
> +#endif
> +#ifdef CONFIG_TASK_DELAY_ACCT
> + io_sleep_time = div_u64(tts.io_sleep_time - lc->ts.io_sleep_time, npm);
> +#endif
> + if (total_time > running_time + io_sleep_time + runnable_time)
> + other_time = total_time - running_time -
> + io_sleep_time - runnable_time;
> +
> + trace_f2fs_lock_elapsed_time(sem->sbi, sem->name, is_write, current,
> + get_current_ioprio(), total_time, running_time,
> + runnable_time, io_sleep_time, other_time);
> +}
> +
> +void f2fs_down_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
> +{
> + f2fs_down_read(sem);
> + trace_lock_elapsed_time_start(sem, lc);
> +}
> +
> +int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
> +{
> + if (!f2fs_down_read_trylock(sem))
> + return 0;
> + trace_lock_elapsed_time_start(sem, lc);
> + return 1;
> +}
> +
> +void f2fs_up_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
> +{
> + f2fs_up_read(sem);
> + trace_lock_elapsed_time_end(sem, lc, false);
> +}
> +
> +void f2fs_down_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
> +{
> + f2fs_down_write(sem);
> + trace_lock_elapsed_time_start(sem, lc);
> +}
> +
> +int f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
> +{
> + if (!f2fs_down_write_trylock(sem))
> + return 0;
> + trace_lock_elapsed_time_start(sem, lc);
> + return 1;
> +}
> +
> +void f2fs_up_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc)
> +{
> + f2fs_up_write(sem);
> + trace_lock_elapsed_time_end(sem, lc, true);
> +}
> +
> #define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 3))
>
> static struct kmem_cache *ino_entry_slab;
> diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
> index c3e1611fce18..7e22315dbedd 100644
> --- a/fs/f2fs/f2fs.h
> +++ b/fs/f2fs/f2fs.h
> @@ -173,6 +173,10 @@ enum device_allocation_policy {
> ALLOCATE_FORWARD_FROM_HINT,
> };
>
> +enum f2fs_lock_name {
> + LOCK_NAME_NONE,
> +};
> +
> /*
> * An implementation of an rwsem that is explicitly unfair to readers. This
> * prevents priority inversion when a low-priority reader acquires the read lock
> @@ -181,6 +185,8 @@ enum device_allocation_policy {
> */
>
> struct f2fs_rwsem {
> + struct f2fs_sb_info *sbi;
> + enum f2fs_lock_name name;
> struct rw_semaphore internal_rwsem;
> #ifdef CONFIG_F2FS_UNFAIR_RWSEM
> wait_queue_head_t read_waiters;
> @@ -1409,6 +1415,24 @@ struct f2fs_gc_control {
> unsigned int nr_free_secs; /* # of free sections to do GC */
> };
>
> +struct f2fs_time_stat {
> + unsigned long long total_time; /* total wall clock time */
> +#ifdef CONFIG_64BIT
> + unsigned long long running_time; /* running time */
> +#endif
> +#if defined(CONFIG_SCHED_INFO) && defined(CONFIG_SCHEDSTATS)
> + unsigned long long runnable_time; /* runnable(including preempted) time */
> +#endif
> +#ifdef CONFIG_TASK_DELAY_ACCT
> + unsigned long long io_sleep_time; /* IO sleep time */
> +#endif
> +};
> +
> +struct f2fs_lock_context {
> + struct f2fs_time_stat ts;
> + bool lock_trace;
> +};
> +
> /*
> * For s_flag in struct f2fs_sb_info
> * Modification on enum should be synchronized with s_flag array
> @@ -1525,6 +1549,9 @@ enum f2fs_lookup_mode {
> LOOKUP_AUTO,
> };
>
> +/* a threshold of maximum elapsed time in critical region to print tracepoint */
> +#define MAX_LOCK_ELAPSED_TIME 500
> +
> static inline int f2fs_test_bit(unsigned int nr, char *addr);
> static inline void f2fs_set_bit(unsigned int nr, char *addr);
> static inline void f2fs_clear_bit(unsigned int nr, char *addr);
> @@ -2263,16 +2290,22 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f)
> spin_unlock_irqrestore(&sbi->cp_lock, flags);
> }
>
> -#define init_f2fs_rwsem(sem) \
> +#define init_f2fs_rwsem(sem) __init_f2fs_rwsem(sem, NULL, LOCK_NAME_NONE)
> +#define init_f2fs_rwsem_trace __init_f2fs_rwsem
> +
> +#define __init_f2fs_rwsem(sem, sbi, name) \
> do { \
> static struct lock_class_key __key; \
> \
> - __init_f2fs_rwsem((sem), #sem, &__key); \
> + do_init_f2fs_rwsem((sem), #sem, &__key, sbi, name); \
> } while (0)
>
> -static inline void __init_f2fs_rwsem(struct f2fs_rwsem *sem,
> - const char *sem_name, struct lock_class_key *key)
> +static inline void do_init_f2fs_rwsem(struct f2fs_rwsem *sem,
> + const char *sem_name, struct lock_class_key *key,
> + struct f2fs_sb_info *sbi, enum f2fs_lock_name name)
> {
> + sem->sbi = sbi;
> + sem->name = name;
> __init_rwsem(&sem->internal_rwsem, sem_name, key);
> #ifdef CONFIG_F2FS_UNFAIR_RWSEM
> init_waitqueue_head(&sem->read_waiters);
> @@ -2341,6 +2374,16 @@ static inline void f2fs_up_write(struct f2fs_rwsem *sem)
> #endif
> }
>
> +void f2fs_down_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc);
> +int f2fs_down_read_trylock_trace(struct f2fs_rwsem *sem,
> + struct f2fs_lock_context *lc);
> +void f2fs_up_read_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc);
> +void f2fs_down_write_trace(struct f2fs_rwsem *sem,
> + struct f2fs_lock_context *lc);
> +int f2fs_down_write_trylock_trace(struct f2fs_rwsem *sem,
> + struct f2fs_lock_context *lc);
> +void f2fs_up_write_trace(struct f2fs_rwsem *sem, struct f2fs_lock_context *lc);
> +
> static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock)
> {
> unsigned long flags;
> diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
> index 635dcabcf1c7..9a852a16df9c 100644
> --- a/include/trace/events/f2fs.h
> +++ b/include/trace/events/f2fs.h
> @@ -184,6 +184,10 @@ TRACE_DEFINE_ENUM(CP_PHASE_FINISH_CHECKPOINT);
> { CP_PHASE_FINISH_BLOCK_OPS, "finish block_ops" }, \
> { CP_PHASE_FINISH_CHECKPOINT, "finish checkpoint" })
>
> +#define show_lock_name(lock) \
> + __print_symbolic(lock, \
> + { LOCK_NAME_NONE, "none" })
> +
> struct f2fs_sb_info;
> struct f2fs_io_info;
> struct extent_info;
> @@ -2452,6 +2456,70 @@ DEFINE_EVENT(f2fs__rw_end, f2fs_datawrite_end,
> TP_ARGS(inode, offset, bytes)
> );
>
> +TRACE_EVENT(f2fs_lock_elapsed_time,
> +
> + TP_PROTO(struct f2fs_sb_info *sbi, enum f2fs_lock_name lock_name,
> + bool is_write, struct task_struct *p, int ioprio,
> + unsigned long long total_time,
> + unsigned long long running_time,
> + unsigned long long runnable_time,
> + unsigned long long io_sleep_time,
> + unsigned long long other_time),
> +
> + TP_ARGS(sbi, lock_name, is_write, p, ioprio, total_time, running_time,
> + runnable_time, io_sleep_time, other_time),
> +
> + TP_STRUCT__entry(
> + __field(dev_t, dev)
> + __array(char, comm, TASK_COMM_LEN)
> + __field(pid_t, pid)
> + __field(int, prio)
> + __field(int, ioprio_class)
> + __field(int, ioprio_data)
> + __field(unsigned int, lock_name)
> + __field(bool, is_write)
> + __field(unsigned long long, total_time)
> + __field(unsigned long long, running_time)
> + __field(unsigned long long, runnable_time)
> + __field(unsigned long long, io_sleep_time)
> + __field(unsigned long long, other_time)
> + ),
> +
> + TP_fast_assign(
> + __entry->dev = sbi->sb->s_dev;
> + memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
> + __entry->pid = p->pid;
> + __entry->prio = p->prio;
> + __entry->ioprio_class = IOPRIO_PRIO_CLASS(ioprio);
> + __entry->ioprio_data = IOPRIO_PRIO_DATA(ioprio);
> + __entry->lock_name = lock_name;
> + __entry->is_write = is_write;
> + __entry->total_time = total_time;
> + __entry->running_time = running_time;
> + __entry->runnable_time = runnable_time;
> + __entry->io_sleep_time = io_sleep_time;
> + __entry->other_time = other_time;
> + ),
> +
> + TP_printk("dev = (%d,%d), comm: %s, pid: %d, prio: %d, "
> + "ioprio_class: %d, ioprio_data: %d, lock_name: %s, "
> + "lock_type: %s, total: %llu, running: %llu, "
> + "runnable: %llu, io_sleep: %llu, other: %llu",
> + show_dev(__entry->dev),
> + __entry->comm,
> + __entry->pid,
> + __entry->prio,
> + __entry->ioprio_class,
> + __entry->ioprio_data,
> + show_lock_name(__entry->lock_name),
> + __entry->is_write ? "wlock" : "rlock",
> + __entry->total_time,
> + __entry->running_time,
> + __entry->runnable_time,
> + __entry->io_sleep_time,
> + __entry->other_time)
> +);
> +
> #endif /* _TRACE_F2FS_H */
>
> /* This part must be outside protection */
> --
> 2.49.0