[PATCH 13/13] blk: introduce iostat per cgroup module

From: Wang Jianchao
Date: Mon Jan 10 2022 - 04:14:29 EST


From: Wang Jianchao <wangjianchao@xxxxxxxxxxxx>

iostat can only track IO statistics for the device as a whole. This
patch introduces per-cgroup iostat based on the blk-rq-qos framework.
It can track bw, iops, queue latency and device latency, and
distinguishes regular data from metadata. The per-cgroup blkio.iostat
output has the following format:
 vda-data  bytes iops queue_lat dev_lat  [ditto]  [ditto]
 vda-meta  \____________ _____________/     |        |
                        v                   v        v
                      read                write   discard
In particular, the blkio.iostat of the root cgroup only outputs the
statistics of IOs issued from the root cgroup itself, while a non-root
blkio.iostat aggregates the statistics of all of its children cgroups.
With the meta stats in the root cgroup, we hope to be able to observe
the performance of fs metadata.
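
For illustration, reading the file could look like this (cgroup v1
path; all numbers are made up):

  $ cat /sys/fs/cgroup/blkio/blkio.iostat
  vda-data 8192 2 1200 5600 4096 1 1500 8000 0 0 0 0
  vda-meta 4096 1 900 4100 0 0 0 0 0 0 0 0

Each line carries four counters per op class (bytes, ios, queue_lat,
dev_lat) for read, write and discard in that order. The latencies are
cumulative nanoseconds, so userspace derives averages as
total_lat / ios, e.g. 5600 / 2 = 2800ns average device latency for
reads above.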

Signed-off-by: Wang Jianchao <wangjianchao@xxxxxxxxxxxx>
---
block/Kconfig | 9 ++
block/Makefile | 2 +
block/blk-iostat.c | 356 +++++++++++++++++++++++++++++++++++++++++
include/linux/blkdev.h | 2 +-
4 files changed, 368 insertions(+), 1 deletion(-)
create mode 100644 block/blk-iostat.c

diff --git a/block/Kconfig b/block/Kconfig
index ea612cb5c8ee..35f24db3ec92 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -156,6 +156,15 @@ config BLK_CGROUP_IOPRIO
scheduler and block devices process requests. Only some I/O schedulers
and some block devices support I/O priorities.

+config BLK_CGROUP_IOSTAT
+ tristate "IO statistics monitor per cgroup"
+ select BLK_RQ_BLKCG_GQ
+ select BLK_RQ_ALLOC_TIME
+ depends on BLK_CGROUP
+ help
+ Monitor per-cgroup IO statistics, including bandwidth, iops, queue
+ latency and device latency.
+
config BLK_DEBUG_FS
bool "Block layer debugging information in debugfs"
default y
diff --git a/block/Makefile b/block/Makefile
index 3f76836076b2..ad89015e37ce 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -23,6 +23,8 @@ iolat-y := blk-iolatency.o
obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += iolat.o
iocost-y := blk-iocost.o
obj-$(CONFIG_BLK_CGROUP_IOCOST) += iocost.o
+iostat-y := blk-iostat.o
+obj-$(CONFIG_BLK_CGROUP_IOSTAT) += iostat.o
obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o
obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o
bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
diff --git a/block/blk-iostat.c b/block/blk-iostat.c
new file mode 100644
index 000000000000..3c6bcb6ab055
--- /dev/null
+++ b/block/blk-iostat.c
@@ -0,0 +1,356 @@
+#include <linux/kernel.h>
+#include <linux/blk_types.h>
+#include <linux/module.h>
+#include <linux/blk-cgroup.h>
+#include <linux/bio.h>
+#include <linux/spinlock.h>
+
+#include "blk.h"
+#include "blk-rq-qos.h"
+
+enum {
+ IOSTAT_READ = 0,
+ IOSTAT_WRITE,
+ IOSTAT_DISCARD,
+ IOSTAT_MAX,
+};
+
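+/*
+ * Per-cpu counters, one slot per op class (read/write/discard).
+ * Latencies are accumulated in nanoseconds; averaging is left to
+ * userspace (total latency / ios).
+ */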
+struct iostat_data {
+ u64 bytes[IOSTAT_MAX];
+ u64 ios[IOSTAT_MAX];
+ u64 queue_lat[IOSTAT_MAX];
+ u64 dev_lat[IOSTAT_MAX];
+};
+
+struct iostat_queue {
+ struct rq_qos rqos;
+};
+
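+/*
+ * Per-(cgroup, queue) state: the disk name for output plus separate
+ * per-cpu counter sets for regular data and metadata IO.
+ */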
+struct iostat_gq {
+ struct blkg_policy_data pd;
+ char disk_name[DISK_NAME_LEN];
+ struct {
+ struct iostat_data __percpu *data;
+ struct iostat_data __percpu *meta;
+ } stat;
+};
+
+struct iostat_cgrp {
+ struct blkcg_policy_data cpd;
+};
+
+static DEFINE_MUTEX(iostat_mutex);
+
+static struct blkcg_policy blkcg_policy_iostat;
+
+static inline struct iostat_gq *pd_to_ist(struct blkg_policy_data *pd)
+{
+ return pd ? container_of(pd, struct iostat_gq, pd) : NULL;
+}
+
+static inline struct iostat_gq *blkg_to_ist(struct blkcg_gq *blkg)
+{
+ return pd_to_ist(blkg_to_pd(blkg, &blkcg_policy_iostat));
+}
+
+static inline bool req_is_meta(struct request *req)
+{
+ return req->cmd_flags & REQ_META;
+}
+
+static inline int iostat_op(struct request *req)
+{
+ int op;
+
+ if (unlikely(op_is_discard(req_op(req))))
+ op = IOSTAT_DISCARD;
+ else if (op_is_write(req_op(req)))
+ op = IOSTAT_WRITE;
+ else
+ op = IOSTAT_READ;
+
+ return op;
+}
+
+static void __iostat_issue(struct rq_qos *rqos,
+ struct iostat_gq *is, struct request *req)
+{
+ struct iostat_data *stat;
+ int op = iostat_op(req);
+
+ /*
+ * When BLK_CGROUP is enabled, blk_mq_start_request() inherits
+ * bio_issue_time() to avoid the overhead of another rdtsc, so
+ * take a fresh timestamp here instead.
+ */
+ req->io_start_time_ns = ktime_get_ns();
+ if (req_is_meta(req))
+ stat = get_cpu_ptr(is->stat.meta);
+ else
+ stat = get_cpu_ptr(is->stat.data);
+ /*
+ * alloc_time_ns is taken before getting a tag, so the delta to
+ * io_start_time_ns covers tag waits and in-queue time.
+ */
+ stat->queue_lat[op] += req->io_start_time_ns - req->alloc_time_ns;
+ stat->ios[op]++;
+ stat->bytes[op] += blk_rq_bytes(req);
+ put_cpu_ptr(stat);
+}
+
+static void iostat_issue(struct rq_qos *rqos, struct request *req)
+{
+ struct iostat_gq *is;
+
+ if (unlikely(!req->bio))
+ return;
+
+ is = blkg_to_ist(req->blkg);
+ /*
+ * Most of the time, bios from submit_bio() carry a valid blkg;
+ * requests from blk_execute_rq() are an exception.
+ */
+ if (is)
+ __iostat_issue(rqos, is, req);
+}
+
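+/*
+ * Completion side: accumulate the on-device latency, i.e. the time
+ * from dispatch (io_start_time_ns) to completion.
+ */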
+static void __iostat_done(struct rq_qos *rq_qos,
+ struct iostat_gq *is, struct request *req)
+{
+ struct iostat_data *stat;
+ int op = iostat_op(req);
+
+ if (req_is_meta(req))
+ stat = get_cpu_ptr(is->stat.meta);
+ else
+ stat = get_cpu_ptr(is->stat.data);
+ if (req->io_start_time_ns)
+ stat->dev_lat[op] += ktime_get_ns() - req->io_start_time_ns;
+ put_cpu_ptr(stat);
+}
+
+static void iostat_done(struct rq_qos *rqos, struct request *req)
+{
+ struct iostat_gq *is = blkg_to_ist(req->blkg);
+
+ if (is)
+ __iostat_done(rqos, is, req);
+}
+
+static void iostat_exit(struct rq_qos *rqos)
+{
+ struct iostat_queue *isq = container_of(rqos, struct iostat_queue, rqos);
+
+ blkcg_deactivate_policy(rqos->q, &blkcg_policy_iostat);
+ rq_qos_deactivate(rqos);
+ kfree(isq);
+}
+
+static int iostat_init(struct request_queue *q);
+
+static struct rq_qos_ops iostat_rq_ops = {
+#if IS_MODULE(CONFIG_BLK_CGROUP_IOSTAT)
+ .owner = THIS_MODULE,
+#endif
+ .name = "iostat",
+ .flags = RQOS_FLAG_CGRP_POL | RQOS_FLAG_RQ_ALLOC_TIME,
+ .issue = iostat_issue,
+ .done = iostat_done,
+ .exit = iostat_exit,
+ .init = iostat_init,
+};
+
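+/*
+ * Attach iostat to a request_queue: allocate the per-queue rq_qos
+ * instance, enable rq alloc-time stamping and activate the blkcg
+ * policy.
+ */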
+static int iostat_init(struct request_queue *q)
+{
+ struct iostat_queue *isq;
+ struct rq_qos *rqos;
+ int ret;
+
+ isq = kzalloc_node(sizeof(*isq), GFP_KERNEL, q->node);
+ if (!isq) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, q);
+ rqos = &isq->rqos;
+ rq_qos_activate(q, rqos, &iostat_rq_ops);
+
+ ret = blkcg_activate_policy(q, &blkcg_policy_iostat);
+ if (ret) {
+ rq_qos_deactivate(rqos);
+ kfree(isq);
+ }
+out:
+ return ret;
+}
+
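+/*
+ * Fold the per-cpu counters of one blkg into *sum.
+ */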
+static void iostat_sum(struct blkcg_gq *blkg,
+ struct iostat_data *sum, bool meta)
+{
+ struct iostat_gq *is = blkg_to_ist(blkg);
+ struct iostat_data *stat;
+ int cpu, i;
+
+ for_each_possible_cpu(cpu) {
+ if (meta)
+ stat = per_cpu_ptr(is->stat.meta, cpu);
+ else
+ stat = per_cpu_ptr(is->stat.data, cpu);
+ for (i = 0; i < IOSTAT_MAX; i++) {
+ sum->bytes[i] += stat->bytes[i];
+ sum->ios[i] += stat->ios[i];
+ sum->dev_lat[i] += stat->dev_lat[i];
+ sum->queue_lat[i] += stat->queue_lat[i];
+ }
+ }
+}
+
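+/*
+ * Emit one "-data" and one "-meta" line per device. The root blkg
+ * reports only its own IOs; any other blkg aggregates all of its
+ * online descendants.
+ */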
+static int iostat_show(struct seq_file *sf, void *v)
+{
+ struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+ struct cgroup_subsys_state *pos_css;
+ struct iostat_gq *is;
+ struct blkcg_gq *blkg, *pos_blkg;
+ struct iostat_data data_sum, meta_sum;
+ int i;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+ is = blkg_to_ist(blkg);
+ /*
+ * The iostat policy is activated on demand, so "is" may
+ * still be NULL here.
+ */
+ if (!is)
+ continue;
+
+ memset(&data_sum, 0, sizeof(data_sum));
+ memset(&meta_sum, 0, sizeof(meta_sum));
+ if (blkg == blkg->q->root_blkg) {
+ iostat_sum(blkg, &data_sum, false);
+ iostat_sum(blkg, &meta_sum, true);
+ } else {
+ /*
+ * Iterate over all descendant blkgs to aggregate
+ * their statistics.
+ */
+ blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
+ if (!pos_blkg->online)
+ continue;
+ iostat_sum(pos_blkg, &data_sum, false);
+ iostat_sum(pos_blkg, &meta_sum, true);
+ }
+ }
+
+ seq_printf(sf, "%s-data ", is->disk_name);
+ for (i = 0; i < IOSTAT_MAX; i++)
+ seq_printf(sf, "%llu %llu %llu %llu ",
+ data_sum.bytes[i], data_sum.ios[i],
+ data_sum.queue_lat[i], data_sum.dev_lat[i]);
+ seq_putc(sf, '\n');
+ seq_printf(sf, "%s-meta ", is->disk_name);
+ for (i = 0; i < IOSTAT_MAX; i++)
+ seq_printf(sf, "%llu %llu %llu %llu ",
+ meta_sum.bytes[i], meta_sum.ios[i],
+ meta_sum.queue_lat[i], meta_sum.dev_lat[i]);
+ seq_putc(sf, '\n');
+ }
+ rcu_read_unlock();
+
+ return 0;
+}
+
+static struct cftype iostat_files[] = {
+ {
+ .name = "iostat",
+ .seq_show = iostat_show,
+ },
+ {}
+};
+
+static struct cftype iostat_legacy_files[] = {
+ {
+ .name = "iostat",
+ .seq_show = iostat_show,
+ },
+ {}
+};
+
+static void iostat_pd_free(struct blkg_policy_data *pd)
+{
+ struct iostat_gq *is = pd_to_ist(pd);
+
+ if (is->stat.data)
+ free_percpu(is->stat.data);
+
+ if (is->stat.meta)
+ free_percpu(is->stat.meta);
+
+ kfree(is);
+}
+
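+/*
+ * Allocate the per-blkg state together with its two per-cpu counter
+ * sets.
+ */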
+static struct blkg_policy_data *iostat_pd_alloc(gfp_t gfp,
+ struct request_queue *q, struct blkcg *blkcg)
+{
+ struct iostat_gq *is;
+
+ is = kzalloc_node(sizeof(*is), gfp, q->node);
+ if (!is)
+ return NULL;
+
+ is->stat.data = __alloc_percpu_gfp(sizeof(struct iostat_data),
+ __alignof__(struct iostat_data), gfp);
+ if (!is->stat.data)
+ goto out_free;
+
+ is->stat.meta = __alloc_percpu_gfp(sizeof(struct iostat_data),
+ __alignof__(struct iostat_data), gfp);
+ if (!is->stat.meta)
+ goto out_free;
+ /*
+ * The parent of request_queue->kobj is the gendisk's kobject,
+ * so its name is the disk name.
+ */
+ strlcpy(is->disk_name,
+ kobject_name(q->kobj.parent),
+ DISK_NAME_LEN);
+ return &is->pd;
+out_free:
+ /* iostat_pd_free() handles the partially allocated cases */
+ iostat_pd_free(&is->pd);
+ return NULL;
+}
+
+static struct blkcg_policy blkcg_policy_iostat = {
+ .dfl_cftypes = iostat_files,
+ .legacy_cftypes = iostat_legacy_files,
+ .pd_alloc_fn = iostat_pd_alloc,
+ .pd_free_fn = iostat_pd_free,
+};
+
+static int __init iostat_mod_init(void)
+{
+ int ret;
+
+ ret = rq_qos_register(&iostat_rq_ops);
+ if (ret)
+ return ret;
+
+ ret = blkcg_policy_register(&blkcg_policy_iostat);
+ if (ret) {
+ rq_qos_unregister(&iostat_rq_ops);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void __exit iostat_mod_exit(void)
+{
+ rq_qos_unregister(&iostat_rq_ops);
+ blkcg_policy_unregister(&blkcg_policy_iostat);
+}
+
+module_init(iostat_mod_init);
+module_exit(iostat_mod_exit);
+MODULE_AUTHOR("Wang Jianchao");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Block Statistics per Cgroup");
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ed30b3c3fee7..75026cf54384 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -42,7 +42,7 @@ struct blk_crypto_profile;
* Maximum number of blkcg policies allowed to be registered concurrently.
* Defined here to simplify include dependency.
*/
-#define BLKCG_MAX_POLS 6
+#define BLKCG_MAX_POLS 7
/*
* Non blk-rq-qos blkcg policies include blk-throttle and bfq
*/
--
2.17.1