Re: [PATCH v2 3/6] writeback: support retrieving per group debug writeback stats of bdi
From: Brian Foster
Date: Fri Mar 29 2024 - 11:02:14 EST
On Wed, Mar 27, 2024 at 11:57:48PM +0800, Kemeng Shi wrote:
> Add /sys/kernel/debug/bdi/xxx/wb_stats to show per group writeback stats
> of bdi.
>
Hi Kemeng,
Just a few random thoughts/comments..
> Following domain hierarchy is tested:
> global domain (320G)
> / \
> cgroup domain1(10G) cgroup domain2(10G)
> | |
> bdi wb1 wb2
>
> /* per wb writeback info of bdi is collected */
> cat /sys/kernel/debug/bdi/252:16/wb_stats
> WbCgIno: 1
> WbWriteback: 0 kB
> WbReclaimable: 0 kB
> WbDirtyThresh: 0 kB
> WbDirtied: 0 kB
> WbWritten: 0 kB
> WbWriteBandwidth: 102400 kBps
> b_dirty: 0
> b_io: 0
> b_more_io: 0
> b_dirty_time: 0
> state: 1
Maybe some whitespace or something between entries would improve
readability?
> WbCgIno: 4094
> WbWriteback: 54432 kB
> WbReclaimable: 766080 kB
> WbDirtyThresh: 3094760 kB
> WbDirtied: 1656480 kB
> WbWritten: 837088 kB
> WbWriteBandwidth: 132772 kBps
> b_dirty: 1
> b_io: 1
> b_more_io: 0
> b_dirty_time: 0
> state: 7
> WbCgIno: 4135
> WbWriteback: 15232 kB
> WbReclaimable: 786688 kB
> WbDirtyThresh: 2909984 kB
> WbDirtied: 1482656 kB
> WbWritten: 681408 kB
> WbWriteBandwidth: 124848 kBps
> b_dirty: 0
> b_io: 1
> b_more_io: 0
> b_dirty_time: 0
> state: 7
>
> Signed-off-by: Kemeng Shi <shikemeng@xxxxxxxxxxxxxxx>
> ---
> include/linux/writeback.h | 1 +
> mm/backing-dev.c | 88 +++++++++++++++++++++++++++++++++++++++
> mm/page-writeback.c | 19 +++++++++
> 3 files changed, 108 insertions(+)
>
..
> diff --git a/mm/backing-dev.c b/mm/backing-dev.c
> index 8daf950e6855..e3953db7d88d 100644
> --- a/mm/backing-dev.c
> +++ b/mm/backing-dev.c
> @@ -103,6 +103,91 @@ static void collect_wb_stats(struct wb_stats *stats,
> }
>
> #ifdef CONFIG_CGROUP_WRITEBACK
..
> +static int cgwb_debug_stats_show(struct seq_file *m, void *v)
> +{
> + struct backing_dev_info *bdi;
> + unsigned long background_thresh;
> + unsigned long dirty_thresh;
> + struct bdi_writeback *wb;
> + struct wb_stats stats;
> +
> + rcu_read_lock();
> + bdi = lookup_bdi(m);
> + if (!bdi) {
> + rcu_read_unlock();
> + return -EEXIST;
> + }
> +
> + global_dirty_limits(&background_thresh, &dirty_thresh);
> +
> + list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node) {
> + memset(&stats, 0, sizeof(stats));
> + stats.dirty_thresh = dirty_thresh;
If you did something like the following here, wouldn't that also zero
the rest of the structure?
struct wb_stats stats = { .dirty_thresh = dirty_thresh };
> + collect_wb_stats(&stats, wb);
> +
Also, similar question as before on whether you'd want to check
WB_registered or something here..
> + if (mem_cgroup_wb_domain(wb) == NULL) {
> + wb_stats_show(m, wb, &stats);
> + continue;
> + }
Can you explain what this logic is about? Is the cgwb_calc_thresh()
thing not needed in this case? A comment might help for those less
familiar with the implementation details.
BTW, I'm also wondering if something like the following is correct
and/or roughly equivalent:
list_for_each_*(wb, ...) {
struct wb_stats stats = ...;
if (!wb_tryget(wb))
continue;
collect_wb_stats(&stats, wb);
/*
* Extra wb_thresh magic. Drop rcu lock because ... . We
* can do so here because we have a ref.
*/
if (mem_cgroup_wb_domain(wb)) {
rcu_read_unlock();
stats.wb_thresh = min(stats.wb_thresh, cgwb_calc_thresh(wb));
rcu_read_lock();
}
wb_stats_show(m, wb, &stats)
wb_put(wb);
}
> +
> + /*
> + * cgwb_release will destroy wb->memcg_completions which
> + * will be ued in cgwb_calc_thresh. Use wb_tryget to prevent
> + * memcg_completions destruction from cgwb_release.
> + */
> + if (!wb_tryget(wb))
> + continue;
> +
> + rcu_read_unlock();
> + /* cgwb_calc_thresh may sleep in cgroup_rstat_flush */
> + stats.wb_thresh = min(stats.wb_thresh, cgwb_calc_thresh(wb));
> + wb_stats_show(m, wb, &stats);
> + rcu_read_lock();
> + wb_put(wb);
> + }
> + rcu_read_unlock();
> +
> + return 0;
> +}
> +DEFINE_SHOW_ATTRIBUTE(cgwb_debug_stats);
> +
> +static void cgwb_debug_register(struct backing_dev_info *bdi)
> +{
> + debugfs_create_file("wb_stats", 0444, bdi->debug_dir, bdi,
> + &cgwb_debug_stats_fops);
> +}
> +
> static void bdi_collect_stats(struct backing_dev_info *bdi,
> struct wb_stats *stats)
> {
> @@ -117,6 +202,8 @@ static void bdi_collect_stats(struct backing_dev_info *bdi,
> {
> collect_wb_stats(stats, &bdi->wb);
> }
> +
> +static inline void cgwb_debug_register(struct backing_dev_info *bdi) { }
Could we just create the wb_stats file regardless of whether cgwb is
enabled? Obviously theres only one wb in the !CGWB case and it's
somewhat duplicative with the bdi stats file, but that seems harmless if
the same code can be reused..? Maybe there's also a small argument for
dropping the state info from the bdi stats file and moving it to
wb_stats.
Brian
> #endif
>
> static int bdi_debug_stats_show(struct seq_file *m, void *v)
> @@ -182,6 +269,7 @@ static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
>
> debugfs_create_file("stats", 0444, bdi->debug_dir, bdi,
> &bdi_debug_stats_fops);
> + cgwb_debug_register(bdi);
> }
>
> static void bdi_debug_unregister(struct backing_dev_info *bdi)
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index 0e20467367fe..3724c7525316 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -893,6 +893,25 @@ unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh)
> return __wb_calc_thresh(&gdtc, thresh);
> }
>
> +unsigned long cgwb_calc_thresh(struct bdi_writeback *wb)
> +{
> + struct dirty_throttle_control gdtc = { GDTC_INIT_NO_WB };
> + struct dirty_throttle_control mdtc = { MDTC_INIT(wb, &gdtc) };
> + unsigned long filepages, headroom, writeback;
> +
> + gdtc.avail = global_dirtyable_memory();
> + gdtc.dirty = global_node_page_state(NR_FILE_DIRTY) +
> + global_node_page_state(NR_WRITEBACK);
> +
> + mem_cgroup_wb_stats(wb, &filepages, &headroom,
> + &mdtc.dirty, &writeback);
> + mdtc.dirty += writeback;
> + mdtc_calc_avail(&mdtc, filepages, headroom);
> + domain_dirty_limits(&mdtc);
> +
> + return __wb_calc_thresh(&mdtc, mdtc.thresh);
> +}
> +
> /*
> * setpoint - dirty 3
> * f(dirty) := 1.0 + (----------------)
> --
> 2.30.0
>