[PATCH 6/8] block: add scalable completion tracking of requests
From: Jens Axboe
Date: Wed Oct 26 2016 - 16:52:56 EST
For legacy block, we simply track them in the request queue. For
blk-mq, we track them on a per-sw queue basis, which we can then
sum up through the hardware queues and finally to a per device
state.
The stats are tracked in, roughly, 0.1s interval windows.
Add sysfs files to display the stats.
Signed-off-by: Jens Axboe <axboe@xxxxxx>
---
block/Makefile | 2 +-
block/blk-core.c | 4 +
block/blk-mq-sysfs.c | 47 ++++++++++
block/blk-mq.c | 14 +++
block/blk-mq.h | 3 +
block/blk-stat.c | 226 ++++++++++++++++++++++++++++++++++++++++++++++
block/blk-stat.h | 37 ++++++++
block/blk-sysfs.c | 26 ++++++
include/linux/blk_types.h | 16 ++++
include/linux/blkdev.h | 4 +
10 files changed, 378 insertions(+), 1 deletion(-)
create mode 100644 block/blk-stat.c
create mode 100644 block/blk-stat.h
diff --git a/block/Makefile b/block/Makefile
index 36acdd7545be..ff6edd5593d1 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,7 @@
obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
- blk-lib.o blk-mq.o blk-mq-tag.o \
+ blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
blk-mq-sysfs.o blk-mq-cpumap.o ioctl.o \
genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
badblocks.o partitions/
diff --git a/block/blk-core.c b/block/blk-core.c
index 14d7c0740dc0..a822383e858e 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2475,6 +2475,8 @@ void blk_start_request(struct request *req)
{
blk_dequeue_request(req);
+ blk_stat_set_issue_time(&req->issue_stat);
+
/*
* We are now handing the request to the hardware, initialize
* resid_len to full count and add the timeout handler.
@@ -2542,6 +2544,8 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
trace_block_rq_complete(req->q, req, nr_bytes);
+ blk_stat_add(&req->q->rq_stats[rq_data_dir(req)], req);
+
if (!req->bio)
return false;
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 01fb455d3377..633c79a538ea 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -259,6 +259,47 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
return ret;
}
+static void blk_mq_stat_clear(struct blk_mq_hw_ctx *hctx)
+{
+ struct blk_mq_ctx *ctx;
+ unsigned int i;
+
+ hctx_for_each_ctx(hctx, ctx, i) {
+ blk_stat_init(&ctx->stat[0]);
+ blk_stat_init(&ctx->stat[1]);
+ }
+}
+
+static ssize_t blk_mq_hw_sysfs_stat_store(struct blk_mq_hw_ctx *hctx,
+ const char *page, size_t count)
+{
+ blk_mq_stat_clear(hctx);
+ return count;
+}
+
+static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
+{
+ return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
+ pre, (long long) stat->nr_samples,
+ (long long) stat->mean, (long long) stat->min,
+ (long long) stat->max);
+}
+
+static ssize_t blk_mq_hw_sysfs_stat_show(struct blk_mq_hw_ctx *hctx, char *page)
+{
+ struct blk_rq_stat stat[2];
+ ssize_t ret;
+
+ blk_stat_init(&stat[0]);
+ blk_stat_init(&stat[1]);
+
+ blk_hctx_stat_get(hctx, stat);
+
+ ret = print_stat(page, &stat[0], "read :");
+ ret += print_stat(page + ret, &stat[1], "write:");
+ return ret;
+}
+
static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = {
.attr = {.name = "dispatched", .mode = S_IRUGO },
.show = blk_mq_sysfs_dispatched_show,
@@ -317,6 +358,11 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = {
.show = blk_mq_hw_sysfs_poll_show,
.store = blk_mq_hw_sysfs_poll_store,
};
+static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_stat = {
+ .attr = {.name = "stats", .mode = S_IRUGO | S_IWUSR },
+ .show = blk_mq_hw_sysfs_stat_show,
+ .store = blk_mq_hw_sysfs_stat_store,
+};
static struct attribute *default_hw_ctx_attrs[] = {
&blk_mq_hw_sysfs_queued.attr,
@@ -327,6 +373,7 @@ static struct attribute *default_hw_ctx_attrs[] = {
&blk_mq_hw_sysfs_cpus.attr,
&blk_mq_hw_sysfs_active.attr,
&blk_mq_hw_sysfs_poll.attr,
+ &blk_mq_hw_sysfs_stat.attr,
NULL,
};
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ddc2eed64771..dc668353e228 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -30,6 +30,7 @@
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
+#include "blk-stat.h"
static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);
@@ -378,10 +379,19 @@ static void blk_mq_ipi_complete_request(struct request *rq)
put_cpu();
}
+static void blk_mq_stat_add(struct request *rq)
+{
+ struct blk_rq_stat *stat = &rq->mq_ctx->stat[rq_data_dir(rq)];
+
+ blk_stat_add(stat, rq);
+}
+
static void __blk_mq_complete_request(struct request *rq)
{
struct request_queue *q = rq->q;
+ blk_mq_stat_add(rq);
+
if (!q->softirq_done_fn)
blk_mq_end_request(rq, rq->errors);
else
@@ -425,6 +435,8 @@ void blk_mq_start_request(struct request *rq)
if (unlikely(blk_bidi_rq(rq)))
rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
+ blk_stat_set_issue_time(&rq->issue_stat);
+
blk_add_timer(rq);
/*
@@ -1723,6 +1735,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
spin_lock_init(&__ctx->lock);
INIT_LIST_HEAD(&__ctx->rq_list);
__ctx->queue = q;
+ blk_stat_init(&__ctx->stat[0]);
+ blk_stat_init(&__ctx->stat[1]);
/* If the cpu isn't online, the cpu is mapped to first hctx */
if (!cpu_online(i))
diff --git a/block/blk-mq.h b/block/blk-mq.h
index e5d25249028c..8cf16cb69f64 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -1,6 +1,8 @@
#ifndef INT_BLK_MQ_H
#define INT_BLK_MQ_H
+#include "blk-stat.h"
+
struct blk_mq_tag_set;
struct blk_mq_ctx {
@@ -18,6 +20,7 @@ struct blk_mq_ctx {
/* incremented at completion time */
unsigned long ____cacheline_aligned_in_smp rq_completed[2];
+ struct blk_rq_stat stat[2];
struct request_queue *queue;
struct kobject kobj;
diff --git a/block/blk-stat.c b/block/blk-stat.c
new file mode 100644
index 000000000000..642afdc6d0f8
--- /dev/null
+++ b/block/blk-stat.c
@@ -0,0 +1,226 @@
+/*
+ * Block stat tracking code
+ *
+ * Copyright (C) 2016 Jens Axboe
+ */
+#include <linux/kernel.h>
+#include <linux/blk-mq.h>
+
+#include "blk-stat.h"
+#include "blk-mq.h"
+
+static void blk_stat_flush_batch(struct blk_rq_stat *stat)
+{
+ if (!stat->nr_batch)
+ return;
+ if (!stat->nr_samples)
+ stat->mean = div64_s64(stat->batch, stat->nr_batch);
+ else {
+ stat->mean = div64_s64((stat->mean * stat->nr_samples) +
+ stat->batch,
+ stat->nr_samples + stat->nr_batch);
+ }
+
+ stat->nr_samples += stat->nr_batch;
+ stat->nr_batch = stat->batch = 0;
+}
+
+void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
+{
+ if (!src->nr_samples)
+ return;
+
+ blk_stat_flush_batch(src);
+
+ dst->min = min(dst->min, src->min);
+ dst->max = max(dst->max, src->max);
+
+ if (!dst->nr_samples)
+ dst->mean = src->mean;
+ else {
+ dst->mean = div64_s64((src->mean * src->nr_samples) +
+ (dst->mean * dst->nr_samples),
+ dst->nr_samples + src->nr_samples);
+ }
+ dst->nr_samples += src->nr_samples;
+}
+
+static void blk_mq_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
+{
+ struct blk_mq_hw_ctx *hctx;
+ struct blk_mq_ctx *ctx;
+ uint64_t latest = 0;
+ int i, j, nr;
+
+ blk_stat_init(&dst[0]);
+ blk_stat_init(&dst[1]);
+
+ nr = 0;
+ do {
+ uint64_t newest = 0;
+
+ queue_for_each_hw_ctx(q, hctx, i) {
+ hctx_for_each_ctx(hctx, ctx, j) {
+ if (!ctx->stat[0].nr_samples &&
+ !ctx->stat[1].nr_samples)
+ continue;
+ if (ctx->stat[0].time > newest)
+ newest = ctx->stat[0].time;
+ if (ctx->stat[1].time > newest)
+ newest = ctx->stat[1].time;
+ }
+ }
+
+ /*
+ * No samples
+ */
+ if (!newest)
+ break;
+
+ if (newest > latest)
+ latest = newest;
+
+ queue_for_each_hw_ctx(q, hctx, i) {
+ hctx_for_each_ctx(hctx, ctx, j) {
+ if (ctx->stat[0].time == newest) {
+ blk_stat_sum(&dst[0], &ctx->stat[0]);
+ nr++;
+ }
+ if (ctx->stat[1].time == newest) {
+ blk_stat_sum(&dst[1], &ctx->stat[1]);
+ nr++;
+ }
+ }
+ }
+ /*
+ * If we race on finding an entry, just loop back again.
+ * Should be very rare.
+ */
+ } while (!nr);
+
+ dst[0].time = dst[1].time = latest;
+}
+
+void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst)
+{
+ if (q->mq_ops)
+ blk_mq_stat_get(q, dst);
+ else {
+ memcpy(&dst[0], &q->rq_stats[0], sizeof(struct blk_rq_stat));
+ memcpy(&dst[1], &q->rq_stats[1], sizeof(struct blk_rq_stat));
+ }
+}
+
+void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst)
+{
+ struct blk_mq_ctx *ctx;
+ unsigned int i, nr;
+
+ nr = 0;
+ do {
+ uint64_t newest = 0;
+
+ hctx_for_each_ctx(hctx, ctx, i) {
+ if (!ctx->stat[0].nr_samples &&
+ !ctx->stat[1].nr_samples)
+ continue;
+
+ if (ctx->stat[0].time > newest)
+ newest = ctx->stat[0].time;
+ if (ctx->stat[1].time > newest)
+ newest = ctx->stat[1].time;
+ }
+
+ if (!newest)
+ break;
+
+ hctx_for_each_ctx(hctx, ctx, i) {
+ if (ctx->stat[0].time == newest) {
+ blk_stat_sum(&dst[0], &ctx->stat[0]);
+ nr++;
+ }
+ if (ctx->stat[1].time == newest) {
+ blk_stat_sum(&dst[1], &ctx->stat[1]);
+ nr++;
+ }
+ }
+ /*
+ * If we race on finding an entry, just loop back again.
+ * Should be very rare, as the window is only updated
+ * occasionally
+ */
+ } while (!nr);
+}
+
+static void __blk_stat_init(struct blk_rq_stat *stat, s64 time_now)
+{
+ stat->min = -1ULL;
+ stat->max = stat->nr_samples = stat->mean = 0;
+ stat->batch = stat->nr_batch = 0;
+ stat->time = time_now & BLK_STAT_NSEC_MASK;
+}
+
+void blk_stat_init(struct blk_rq_stat *stat)
+{
+ __blk_stat_init(stat, ktime_to_ns(ktime_get()));
+}
+
+static bool __blk_stat_is_current(struct blk_rq_stat *stat, s64 now)
+{
+ return (now & BLK_STAT_NSEC_MASK) == (stat->time & BLK_STAT_NSEC_MASK);
+}
+
+bool blk_stat_is_current(struct blk_rq_stat *stat)
+{
+ return __blk_stat_is_current(stat, ktime_to_ns(ktime_get()));
+}
+
+void blk_stat_add(struct blk_rq_stat *stat, struct request *rq)
+{
+ s64 now, value;
+
+ now = __blk_stat_time(ktime_to_ns(ktime_get()));
+ if (now < blk_stat_time(&rq->issue_stat))
+ return;
+
+ if (!__blk_stat_is_current(stat, now))
+ __blk_stat_init(stat, now);
+
+ value = now - blk_stat_time(&rq->issue_stat);
+ if (value > stat->max)
+ stat->max = value;
+ if (value < stat->min)
+ stat->min = value;
+
+ if (stat->batch + value < stat->batch ||
+ stat->nr_batch + 1 == BLK_RQ_STAT_BATCH)
+ blk_stat_flush_batch(stat);
+
+ stat->batch += value;
+ stat->nr_batch++;
+}
+
+void blk_stat_clear(struct request_queue *q)
+{
+ if (q->mq_ops) {
+ struct blk_mq_hw_ctx *hctx;
+ struct blk_mq_ctx *ctx;
+ int i, j;
+
+ queue_for_each_hw_ctx(q, hctx, i) {
+ hctx_for_each_ctx(hctx, ctx, j) {
+ blk_stat_init(&ctx->stat[0]);
+ blk_stat_init(&ctx->stat[1]);
+ }
+ }
+ } else {
+ blk_stat_init(&q->rq_stats[0]);
+ blk_stat_init(&q->rq_stats[1]);
+ }
+}
+
+void blk_stat_set_issue_time(struct blk_issue_stat *stat)
+{
+ stat->time = (stat->time & BLK_STAT_MASK) |
+ (ktime_to_ns(ktime_get()) & BLK_STAT_TIME_MASK);
+}
diff --git a/block/blk-stat.h b/block/blk-stat.h
new file mode 100644
index 000000000000..26b1f45dff26
--- /dev/null
+++ b/block/blk-stat.h
@@ -0,0 +1,37 @@
+#ifndef BLK_STAT_H
+#define BLK_STAT_H
+
+/*
+ * ~0.13s window as a power-of-2 (2^27 nsecs)
+ */
+#define BLK_STAT_NSEC 134217728ULL
+#define BLK_STAT_NSEC_MASK ~(BLK_STAT_NSEC - 1)
+
+/*
+ * Upper 3 bits can be used elsewhere
+ */
+#define BLK_STAT_RES_BITS 3
+#define BLK_STAT_SHIFT (64 - BLK_STAT_RES_BITS)
+#define BLK_STAT_TIME_MASK ((1ULL << BLK_STAT_SHIFT) - 1)
+#define BLK_STAT_MASK ~BLK_STAT_TIME_MASK
+
+void blk_stat_add(struct blk_rq_stat *, struct request *);
+void blk_hctx_stat_get(struct blk_mq_hw_ctx *, struct blk_rq_stat *);
+void blk_queue_stat_get(struct request_queue *, struct blk_rq_stat *);
+void blk_stat_clear(struct request_queue *q);
+void blk_stat_init(struct blk_rq_stat *);
+void blk_stat_sum(struct blk_rq_stat *, struct blk_rq_stat *);
+bool blk_stat_is_current(struct blk_rq_stat *);
+void blk_stat_set_issue_time(struct blk_issue_stat *);
+
+static inline u64 __blk_stat_time(u64 time)
+{
+ return time & BLK_STAT_TIME_MASK;
+}
+
+static inline u64 blk_stat_time(struct blk_issue_stat *stat)
+{
+ return __blk_stat_time(stat->time);
+}
+
+#endif
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 9cc8d7c5439a..3228ae396e3e 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -384,6 +384,26 @@ static ssize_t queue_dax_show(struct request_queue *q, char *page)
return queue_var_show(blk_queue_dax(q), page);
}
+static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre)
+{
+ return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n",
+ pre, (long long) stat->nr_samples,
+ (long long) stat->mean, (long long) stat->min,
+ (long long) stat->max);
+}
+
+static ssize_t queue_stats_show(struct request_queue *q, char *page)
+{
+ struct blk_rq_stat stat[2];
+ ssize_t ret;
+
+ blk_queue_stat_get(q, stat);
+
+ ret = print_stat(page, &stat[0], "read :");
+ ret += print_stat(page + ret, &stat[1], "write:");
+ return ret;
+}
+
static struct queue_sysfs_entry queue_requests_entry = {
.attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
.show = queue_requests_show,
@@ -526,6 +546,11 @@ static struct queue_sysfs_entry queue_dax_entry = {
.show = queue_dax_show,
};
+static struct queue_sysfs_entry queue_stats_entry = {
+ .attr = {.name = "stats", .mode = S_IRUGO },
+ .show = queue_stats_show,
+};
+
static struct attribute *default_attrs[] = {
&queue_requests_entry.attr,
&queue_ra_entry.attr,
@@ -553,6 +578,7 @@ static struct attribute *default_attrs[] = {
&queue_poll_entry.attr,
&queue_wc_entry.attr,
&queue_dax_entry.attr,
+ &queue_stats_entry.attr,
NULL,
};
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 765994c5b2c5..cb5d746001d5 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -273,4 +273,20 @@ static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie)
return cookie & ((1u << BLK_QC_T_SHIFT) - 1);
}
+struct blk_issue_stat {
+ u64 time;
+};
+
+#define BLK_RQ_STAT_BATCH 64
+
+struct blk_rq_stat {
+ s64 mean;
+ u64 min;
+ u64 max;
+ s32 nr_samples;
+ s32 nr_batch;
+ u64 batch;
+ s64 time;
+};
+
#endif /* __LINUX_BLK_TYPES_H */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 86b15f9fc9b2..8a0056b5772e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -151,6 +151,7 @@ struct request {
struct gendisk *rq_disk;
struct hd_struct *part;
unsigned long start_time;
+ struct blk_issue_stat issue_stat;
#ifdef CONFIG_BLK_CGROUP
struct request_list *rl; /* rl this rq is alloced from */
unsigned long long start_time_ns;
@@ -414,6 +415,9 @@ struct request_queue {
unsigned int nr_sorted;
unsigned int in_flight[2];
+
+ struct blk_rq_stat rq_stats[2];
+
/*
* Number of active block driver functions for which blk_drain_queue()
* must wait. Must be incremented around functions that unlock the
--
2.7.4