Re: [PATCH block/for-4.3] writeback: update writeback tracepoints to report cgroup

From: Jan Kara
Date: Wed Jul 08 2015 - 04:21:01 EST


On Mon 06-07-15 15:36:42, Tejun Heo wrote:
> The following tracepoints are updated to report the cgroup used during
> cgroup writeback.
>
> * writeback_write_inode[_start]
> * writeback_queue
> * writeback_exec
> * writeback_start
> * writeback_written
> * writeback_wait
> * writeback_nowork
> * writeback_wake_background
> * wbc_writepage
> * writeback_queue_io
> * bdi_dirty_ratelimit
> * balance_dirty_pages
> * writeback_sb_inodes_requeue
> * writeback_single_inode[_start]
>
> Note that writeback_bdi_register is separated out from writeback_class
> as reporting cgroup doesn't make sense to it. Tracepoints which take
> bdi are updated to take bdi_writeback instead.
>
> Signed-off-by: Tejun Heo <tj@xxxxxxxxxx>
> Cc: Jan Kara <jack@xxxxxxx>
> ---
> Hello,
>
> Will soon post this as part of a patch series of cgroup writeback
> updates.

Thanks. The patch looks good. You can add:

Reviewed-by: Jan Kara <jack@xxxxxxxx>

Honza


> fs/fs-writeback.c | 14 +--
> include/trace/events/writeback.h | 180 ++++++++++++++++++++++++++++++---------
> mm/page-writeback.c | 6 -
> 3 files changed, 151 insertions(+), 49 deletions(-)
>
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -176,7 +176,7 @@ static void wb_wakeup(struct bdi_writeba
> static void wb_queue_work(struct bdi_writeback *wb,
> struct wb_writeback_work *work)
> {
> - trace_writeback_queue(wb->bdi, work);
> + trace_writeback_queue(wb, work);
>
> spin_lock_bh(&wb->work_lock);
> if (!test_bit(WB_registered, &wb->state))
> @@ -882,7 +882,7 @@ void wb_start_writeback(struct bdi_write
> */
> work = kzalloc(sizeof(*work), GFP_ATOMIC);
> if (!work) {
> - trace_writeback_nowork(wb->bdi);
> + trace_writeback_nowork(wb);
> wb_wakeup(wb);
> return;
> }
> @@ -912,7 +912,7 @@ void wb_start_background_writeback(struc
> * We just wake up the flusher thread. It will perform background
> * writeback as soon as there is no other work to do.
> */
> - trace_writeback_wake_background(wb->bdi);
> + trace_writeback_wake_background(wb);
> wb_wakeup(wb);
> }
>
> @@ -1615,14 +1615,14 @@ static long wb_writeback(struct bdi_writ
> } else if (work->for_background)
> oldest_jif = jiffies;
>
> - trace_writeback_start(wb->bdi, work);
> + trace_writeback_start(wb, work);
> if (list_empty(&wb->b_io))
> queue_io(wb, work);
> if (work->sb)
> progress = writeback_sb_inodes(work->sb, wb, work);
> else
> progress = __writeback_inodes_wb(wb, work);
> - trace_writeback_written(wb->bdi, work);
> + trace_writeback_written(wb, work);
>
> wb_update_bandwidth(wb, wb_start);
>
> @@ -1647,7 +1647,7 @@ static long wb_writeback(struct bdi_writ
> * we'll just busyloop.
> */
> if (!list_empty(&wb->b_more_io)) {
> - trace_writeback_wait(wb->bdi, work);
> + trace_writeback_wait(wb, work);
> inode = wb_inode(wb->b_more_io.prev);
> spin_lock(&inode->i_lock);
> spin_unlock(&wb->list_lock);
> @@ -1753,7 +1753,7 @@ static long wb_do_writeback(struct bdi_w
> while ((work = get_next_work_item(wb)) != NULL) {
> struct wb_completion *done = work->done;
>
> - trace_writeback_exec(wb->bdi, work);
> + trace_writeback_exec(wb, work);
>
> wrote += wb_writeback(wb, work);
>
> --- a/include/trace/events/writeback.h
> +++ b/include/trace/events/writeback.h
> @@ -131,6 +131,66 @@ DEFINE_EVENT(writeback_dirty_inode_templ
> TP_ARGS(inode, flags)
> );
>
> +#ifdef CREATE_TRACE_POINTS
> +#ifdef CONFIG_CGROUP_WRITEBACK
> +
> +static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb)
> +{
> + return kernfs_path_len(wb->memcg_css->cgroup->kn) + 1;
> +}
> +
> +static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb)
> +{
> + struct cgroup *cgrp = wb->memcg_css->cgroup;
> + char *path;
> +
> + path = cgroup_path(cgrp, buf, kernfs_path_len(cgrp->kn) + 1);
> + WARN_ON_ONCE(path != buf);
> +}
> +
> +static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc)
> +{
> + if (wbc->wb)
> + return __trace_wb_cgroup_size(wbc->wb);
> + else
> + return 2;
> +}
> +
> +static inline void __trace_wbc_assign_cgroup(char *buf,
> + struct writeback_control *wbc)
> +{
> + if (wbc->wb)
> + __trace_wb_assign_cgroup(buf, wbc->wb);
> + else
> + strcpy(buf, "/");
> +}
> +
> +#else /* CONFIG_CGROUP_WRITEBACK */
> +
> +static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb)
> +{
> + return 2;
> +}
> +
> +static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb)
> +{
> + strcpy(buf, "/");
> +}
> +
> +static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc)
> +{
> + return 2;
> +}
> +
> +static inline void __trace_wbc_assign_cgroup(char *buf,
> + struct writeback_control *wbc)
> +{
> + strcpy(buf, "/");
> +}
> +
> +#endif /* CONFIG_CGROUP_WRITEBACK */
> +#endif /* CREATE_TRACE_POINTS */
> +
> DECLARE_EVENT_CLASS(writeback_write_inode_template,
>
> TP_PROTO(struct inode *inode, struct writeback_control *wbc),
> @@ -141,6 +201,7 @@ DECLARE_EVENT_CLASS(writeback_write_inod
> __array(char, name, 32)
> __field(unsigned long, ino)
> __field(int, sync_mode)
> + __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
> ),
>
> TP_fast_assign(
> @@ -148,12 +209,14 @@ DECLARE_EVENT_CLASS(writeback_write_inod
> dev_name(inode_to_bdi(inode)->dev), 32);
> __entry->ino = inode->i_ino;
> __entry->sync_mode = wbc->sync_mode;
> + __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
> ),
>
> - TP_printk("bdi %s: ino=%lu sync_mode=%d",
> + TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup=%s",
> __entry->name,
> __entry->ino,
> - __entry->sync_mode
> + __entry->sync_mode,
> + __get_str(cgroup)
> )
> );
>
> @@ -172,8 +235,8 @@ DEFINE_EVENT(writeback_write_inode_templ
> );
>
> DECLARE_EVENT_CLASS(writeback_work_class,
> - TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work),
> - TP_ARGS(bdi, work),
> + TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work),
> + TP_ARGS(wb, work),
> TP_STRUCT__entry(
> __array(char, name, 32)
> __field(long, nr_pages)
> @@ -183,10 +246,11 @@ DECLARE_EVENT_CLASS(writeback_work_class
> __field(int, range_cyclic)
> __field(int, for_background)
> __field(int, reason)
> + __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
> ),
> TP_fast_assign(
> strncpy(__entry->name,
> - bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32);
> + wb->bdi->dev ? dev_name(wb->bdi->dev) : "(unknown)", 32);
> __entry->nr_pages = work->nr_pages;
> __entry->sb_dev = work->sb ? work->sb->s_dev : 0;
> __entry->sync_mode = work->sync_mode;
> @@ -194,9 +258,10 @@ DECLARE_EVENT_CLASS(writeback_work_class
> __entry->range_cyclic = work->range_cyclic;
> __entry->for_background = work->for_background;
> __entry->reason = work->reason;
> + __trace_wb_assign_cgroup(__get_str(cgroup), wb);
> ),
> TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d "
> - "kupdate=%d range_cyclic=%d background=%d reason=%s",
> + "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup=%s",
> __entry->name,
> MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev),
> __entry->nr_pages,
> @@ -204,13 +269,14 @@ DECLARE_EVENT_CLASS(writeback_work_class
> __entry->for_kupdate,
> __entry->range_cyclic,
> __entry->for_background,
> - __print_symbolic(__entry->reason, WB_WORK_REASON)
> + __print_symbolic(__entry->reason, WB_WORK_REASON),
> + __get_str(cgroup)
> )
> );
> #define DEFINE_WRITEBACK_WORK_EVENT(name) \
> DEFINE_EVENT(writeback_work_class, name, \
> - TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \
> - TP_ARGS(bdi, work))
> + TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), \
> + TP_ARGS(wb, work))
> DEFINE_WRITEBACK_WORK_EVENT(writeback_queue);
> DEFINE_WRITEBACK_WORK_EVENT(writeback_exec);
> DEFINE_WRITEBACK_WORK_EVENT(writeback_start);
> @@ -230,26 +296,42 @@ TRACE_EVENT(writeback_pages_written,
> );
>
> DECLARE_EVENT_CLASS(writeback_class,
> - TP_PROTO(struct backing_dev_info *bdi),
> - TP_ARGS(bdi),
> + TP_PROTO(struct bdi_writeback *wb),
> + TP_ARGS(wb),
> TP_STRUCT__entry(
> __array(char, name, 32)
> + __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
> ),
> TP_fast_assign(
> - strncpy(__entry->name, dev_name(bdi->dev), 32);
> + strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
> + __trace_wb_assign_cgroup(__get_str(cgroup), wb);
> ),
> - TP_printk("bdi %s",
> - __entry->name
> + TP_printk("bdi %s: cgroup=%s",
> + __entry->name,
> + __get_str(cgroup)
> )
> );
> #define DEFINE_WRITEBACK_EVENT(name) \
> DEFINE_EVENT(writeback_class, name, \
> - TP_PROTO(struct backing_dev_info *bdi), \
> - TP_ARGS(bdi))
> + TP_PROTO(struct bdi_writeback *wb), \
> + TP_ARGS(wb))
>
> DEFINE_WRITEBACK_EVENT(writeback_nowork);
> DEFINE_WRITEBACK_EVENT(writeback_wake_background);
> -DEFINE_WRITEBACK_EVENT(writeback_bdi_register);
> +
> +TRACE_EVENT(writeback_bdi_register,
> + TP_PROTO(struct backing_dev_info *bdi),
> + TP_ARGS(bdi),
> + TP_STRUCT__entry(
> + __array(char, name, 32)
> + ),
> + TP_fast_assign(
> + strncpy(__entry->name, dev_name(bdi->dev), 32);
> + ),
> + TP_printk("bdi %s",
> + __entry->name
> + )
> +);
>
> DECLARE_EVENT_CLASS(wbc_class,
> TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi),
> @@ -265,6 +347,7 @@ DECLARE_EVENT_CLASS(wbc_class,
> __field(int, range_cyclic)
> __field(long, range_start)
> __field(long, range_end)
> + __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
> ),
>
> TP_fast_assign(
> @@ -278,11 +361,12 @@ DECLARE_EVENT_CLASS(wbc_class,
> __entry->range_cyclic = wbc->range_cyclic;
> __entry->range_start = (long)wbc->range_start;
> __entry->range_end = (long)wbc->range_end;
> + __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
> ),
>
> TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
> "bgrd=%d reclm=%d cyclic=%d "
> - "start=0x%lx end=0x%lx",
> + "start=0x%lx end=0x%lx cgroup=%s",
> __entry->name,
> __entry->nr_to_write,
> __entry->pages_skipped,
> @@ -292,7 +376,9 @@ DECLARE_EVENT_CLASS(wbc_class,
> __entry->for_reclaim,
> __entry->range_cyclic,
> __entry->range_start,
> - __entry->range_end)
> + __entry->range_end,
> + __get_str(cgroup)
> + )
> )
>
> #define DEFINE_WBC_EVENT(name) \
> @@ -312,6 +398,7 @@ TRACE_EVENT(writeback_queue_io,
> __field(long, age)
> __field(int, moved)
> __field(int, reason)
> + __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
> ),
> TP_fast_assign(
> unsigned long *older_than_this = work->older_than_this;
> @@ -321,13 +408,15 @@ TRACE_EVENT(writeback_queue_io,
> (jiffies - *older_than_this) * 1000 / HZ : -1;
> __entry->moved = moved;
> __entry->reason = work->reason;
> + __trace_wb_assign_cgroup(__get_str(cgroup), wb);
> ),
> - TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s",
> + TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup=%s",
> __entry->name,
> __entry->older, /* older_than_this in jiffies */
> __entry->age, /* older_than_this in relative milliseconds */
> __entry->moved,
> - __print_symbolic(__entry->reason, WB_WORK_REASON)
> + __print_symbolic(__entry->reason, WB_WORK_REASON),
> + __get_str(cgroup)
> )
> );
>
> @@ -381,11 +470,11 @@ TRACE_EVENT(global_dirty_state,
>
> TRACE_EVENT(bdi_dirty_ratelimit,
>
> - TP_PROTO(struct backing_dev_info *bdi,
> + TP_PROTO(struct bdi_writeback *wb,
> unsigned long dirty_rate,
> unsigned long task_ratelimit),
>
> - TP_ARGS(bdi, dirty_rate, task_ratelimit),
> + TP_ARGS(wb, dirty_rate, task_ratelimit),
>
> TP_STRUCT__entry(
> __array(char, bdi, 32)
> @@ -395,36 +484,39 @@ TRACE_EVENT(bdi_dirty_ratelimit,
> __field(unsigned long, dirty_ratelimit)
> __field(unsigned long, task_ratelimit)
> __field(unsigned long, balanced_dirty_ratelimit)
> + __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
> ),
>
> TP_fast_assign(
> - strlcpy(__entry->bdi, dev_name(bdi->dev), 32);
> - __entry->write_bw = KBps(bdi->wb.write_bandwidth);
> - __entry->avg_write_bw = KBps(bdi->wb.avg_write_bandwidth);
> + strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32);
> + __entry->write_bw = KBps(wb->write_bandwidth);
> + __entry->avg_write_bw = KBps(wb->avg_write_bandwidth);
> __entry->dirty_rate = KBps(dirty_rate);
> - __entry->dirty_ratelimit = KBps(bdi->wb.dirty_ratelimit);
> + __entry->dirty_ratelimit = KBps(wb->dirty_ratelimit);
> __entry->task_ratelimit = KBps(task_ratelimit);
> __entry->balanced_dirty_ratelimit =
> - KBps(bdi->wb.balanced_dirty_ratelimit);
> + KBps(wb->balanced_dirty_ratelimit);
> + __trace_wb_assign_cgroup(__get_str(cgroup), wb);
> ),
>
> TP_printk("bdi %s: "
> "write_bw=%lu awrite_bw=%lu dirty_rate=%lu "
> "dirty_ratelimit=%lu task_ratelimit=%lu "
> - "balanced_dirty_ratelimit=%lu",
> + "balanced_dirty_ratelimit=%lu cgroup=%s",
> __entry->bdi,
> __entry->write_bw, /* write bandwidth */
> __entry->avg_write_bw, /* avg write bandwidth */
> __entry->dirty_rate, /* bdi dirty rate */
> __entry->dirty_ratelimit, /* base ratelimit */
> __entry->task_ratelimit, /* ratelimit with position control */
> - __entry->balanced_dirty_ratelimit /* the balanced ratelimit */
> + __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */
> + __get_str(cgroup)
> )
> );
>
> TRACE_EVENT(balance_dirty_pages,
>
> - TP_PROTO(struct backing_dev_info *bdi,
> + TP_PROTO(struct bdi_writeback *wb,
> unsigned long thresh,
> unsigned long bg_thresh,
> unsigned long dirty,
> @@ -437,7 +529,7 @@ TRACE_EVENT(balance_dirty_pages,
> long pause,
> unsigned long start_time),
>
> - TP_ARGS(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty,
> + TP_ARGS(wb, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty,
> dirty_ratelimit, task_ratelimit,
> dirtied, period, pause, start_time),
>
> @@ -456,11 +548,12 @@ TRACE_EVENT(balance_dirty_pages,
> __field( long, pause)
> __field(unsigned long, period)
> __field( long, think)
> + __dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
> ),
>
> TP_fast_assign(
> unsigned long freerun = (thresh + bg_thresh) / 2;
> - strlcpy(__entry->bdi, dev_name(bdi->dev), 32);
> + strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32);
>
> __entry->limit = global_wb_domain.dirty_limit;
> __entry->setpoint = (global_wb_domain.dirty_limit +
> @@ -478,6 +571,7 @@ TRACE_EVENT(balance_dirty_pages,
> __entry->period = period * 1000 / HZ;
> __entry->pause = pause * 1000 / HZ;
> __entry->paused = (jiffies - start_time) * 1000 / HZ;
> + __trace_wb_assign_cgroup(__get_str(cgroup), wb);
> ),
>
>
> @@ -486,7 +580,7 @@ TRACE_EVENT(balance_dirty_pages,
> "bdi_setpoint=%lu bdi_dirty=%lu "
> "dirty_ratelimit=%lu task_ratelimit=%lu "
> "dirtied=%u dirtied_pause=%u "
> - "paused=%lu pause=%ld period=%lu think=%ld",
> + "paused=%lu pause=%ld period=%lu think=%ld cgroup=%s",
> __entry->bdi,
> __entry->limit,
> __entry->setpoint,
> @@ -500,7 +594,8 @@ TRACE_EVENT(balance_dirty_pages,
> __entry->paused, /* ms */
> __entry->pause, /* ms */
> __entry->period, /* ms */
> - __entry->think /* ms */
> + __entry->think, /* ms */
> + __get_str(cgroup)
> )
> );
>
> @@ -514,6 +609,8 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
> __field(unsigned long, ino)
> __field(unsigned long, state)
> __field(unsigned long, dirtied_when)
> + __dynamic_array(char, cgroup,
> + __trace_wb_cgroup_size(inode_to_wb(inode)))
> ),
>
> TP_fast_assign(
> @@ -522,14 +619,16 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
> __entry->ino = inode->i_ino;
> __entry->state = inode->i_state;
> __entry->dirtied_when = inode->dirtied_when;
> + __trace_wb_assign_cgroup(__get_str(cgroup), inode_to_wb(inode));
> ),
>
> - TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu",
> + TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup=%s",
> __entry->name,
> __entry->ino,
> show_inode_state(__entry->state),
> __entry->dirtied_when,
> - (jiffies - __entry->dirtied_when) / HZ
> + (jiffies - __entry->dirtied_when) / HZ,
> + __get_str(cgroup)
> )
> );
>
> @@ -585,6 +684,7 @@ DECLARE_EVENT_CLASS(writeback_single_ino
> __field(unsigned long, writeback_index)
> __field(long, nr_to_write)
> __field(unsigned long, wrote)
> + __dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
> ),
>
> TP_fast_assign(
> @@ -596,10 +696,11 @@ DECLARE_EVENT_CLASS(writeback_single_ino
> __entry->writeback_index = inode->i_mapping->writeback_index;
> __entry->nr_to_write = nr_to_write;
> __entry->wrote = nr_to_write - wbc->nr_to_write;
> + __trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
> ),
>
> TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu "
> - "index=%lu to_write=%ld wrote=%lu",
> + "index=%lu to_write=%ld wrote=%lu cgroup=%s",
> __entry->name,
> __entry->ino,
> show_inode_state(__entry->state),
> @@ -607,7 +708,8 @@ DECLARE_EVENT_CLASS(writeback_single_ino
> (jiffies - __entry->dirtied_when) / HZ,
> __entry->writeback_index,
> __entry->nr_to_write,
> - __entry->wrote
> + __entry->wrote,
> + __get_str(cgroup)
> )
> );
>
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -1289,7 +1289,7 @@ static void wb_update_dirty_ratelimit(st
> wb->dirty_ratelimit = max(dirty_ratelimit, 1UL);
> wb->balanced_dirty_ratelimit = balanced_dirty_ratelimit;
>
> - trace_bdi_dirty_ratelimit(wb->bdi, dirty_rate, task_ratelimit);
> + trace_bdi_dirty_ratelimit(wb, dirty_rate, task_ratelimit);
> }
>
> static void __wb_update_bandwidth(struct dirty_throttle_control *gdtc,
> @@ -1683,7 +1683,7 @@ static void balance_dirty_pages(struct a
> * do a reset, as it may be a light dirtier.
> */
> if (pause < min_pause) {
> - trace_balance_dirty_pages(bdi,
> + trace_balance_dirty_pages(wb,
> sdtc->thresh,
> sdtc->bg_thresh,
> sdtc->dirty,
> @@ -1712,7 +1712,7 @@ static void balance_dirty_pages(struct a
> }
>
> pause:
> - trace_balance_dirty_pages(bdi,
> + trace_balance_dirty_pages(wb,
> sdtc->thresh,
> sdtc->bg_thresh,
> sdtc->dirty,
--
Jan Kara <jack@xxxxxxx>
SUSE Labs, CR
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/