[PATCH v3 1/3] block: track per-node I/O latency

From: John Meneghini
Date: Thu May 09 2024 - 16:47:55 EST


From: Hannes Reinecke <hare@xxxxxxxxxx>

Add a new option 'BLK_NODE_LATENCY' to track per-node I/O latency.
This can be used by I/O schedulers to determine the 'best' queue
to send I/O to.

Signed-off-by: Hannes Reinecke <hare@xxxxxxxxxx>

Cleaned up checkpatch warnings and updated MAINTAINERS.

Signed-off-by: John Meneghini <jmeneghi@xxxxxxxxxx>
---
MAINTAINERS | 1 +
block/Kconfig | 9 +
block/Makefile | 1 +
block/blk-mq-debugfs.c | 2 +
block/blk-nlatency.c | 389 +++++++++++++++++++++++++++++++++++++++++
block/blk-rq-qos.h | 6 +
include/linux/blk-mq.h | 11 ++
7 files changed, 419 insertions(+)
create mode 100644 block/blk-nlatency.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 7c121493f43d..a4634365c82f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5405,6 +5405,7 @@ F: block/bfq-cgroup.c
F: block/blk-cgroup.c
F: block/blk-iocost.c
F: block/blk-iolatency.c
+F: block/blk-nlatency.c
F: block/blk-throttle.c
F: include/linux/blk-cgroup.h

diff --git a/block/Kconfig b/block/Kconfig
index 1de4682d48cc..641ed39d609c 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -186,6 +186,15 @@ config BLK_CGROUP_IOPRIO
scheduler and block devices process requests. Only some I/O schedulers
and some block devices support I/O priorities.

+config BLK_NODE_LATENCY
+ bool "Track per-node I/O latency"
+ help
+ Enable per-node I/O latency tracking for multipathing. This uses the
+ blk-nodelat latency tracker to provide latencies for each node, and schedules
+ I/O on the path with the least latency for the submitting node. This can be
+ used by I/O schedulers to determine the node with the least latency. Currently
+ only supports nvme over fabrics devices.
+
config BLK_DEBUG_FS
bool "Block layer debugging information in debugfs"
default y
diff --git a/block/Makefile b/block/Makefile
index 46ada9dc8bbf..9d2e71a3e36f 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -21,6 +21,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
obj-$(CONFIG_BLK_CGROUP_IOPRIO) += blk-ioprio.o
obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o
obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o
+obj-$(CONFIG_BLK_NODE_LATENCY) += blk-nlatency.o
obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o
obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o
bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 94668e72ab09..cb38228b95d8 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -762,6 +762,8 @@ static const char *rq_qos_id_to_name(enum rq_qos_id id)
return "latency";
case RQ_QOS_COST:
return "cost";
+ case RQ_QOS_NLAT:
+ return "node-latency";
}
return "unknown";
}
diff --git a/block/blk-nlatency.c b/block/blk-nlatency.c
new file mode 100644
index 000000000000..219c3f636d76
--- /dev/null
+++ b/block/blk-nlatency.c
@@ -0,0 +1,389 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Per-node request latency tracking.
+ *
+ * Copyright (C) 2023 Hannes Reinecke
+ *
+ * A simple per-node latency tracker for use by I/O scheduler.
+ * Latencies are measures over 'win_usec' microseconds and stored per node.
+ * If the number of measurements falls below 'lowat' the measurement is
+ * assumed to be unreliable and will become 'stale'.
+ * These 'stale' latencies can be 'decayed', where during each measurement
+ * interval the 'stale' latency value is decreased by 'decay' percent.
+ * Once the 'stale' latency reaches zero it will be updated by the
+ * measured latency.
+ */
+#include <linux/kernel.h>
+#include <linux/blk_types.h>
+#include <linux/slab.h>
+
+#include "blk-stat.h"
+#include "blk-rq-qos.h"
+#include "blk.h"
+
+#define NLAT_DEFAULT_LOWAT 2
+#define NLAT_DEFAULT_DECAY 50
+
+struct rq_nlat {
+ struct rq_qos rqos;
+
+ u64 win_usec; /* latency measurement window in microseconds */
+ unsigned int lowat; /* Low Watermark latency measurement */
+ unsigned int decay; /* Percentage for 'decaying' latencies */
+ bool enabled;
+
+ struct blk_stat_callback *cb;
+
+ unsigned int num;
+ u64 *latency;
+ unsigned int *samples;
+};
+
+static inline struct rq_nlat *RQNLAT(struct rq_qos *rqos)
+{
+ return container_of(rqos, struct rq_nlat, rqos);
+}
+
+static u64 nlat_default_latency_usec(struct request_queue *q)
+{
+ /*
+ * We default to 2msec for non-rotational storage, and 75msec
+ * for rotational storage.
+ */
+ if (blk_queue_nonrot(q))
+ return 2000ULL;
+ else
+ return 75000ULL;
+}
+
+static void nlat_timer_fn(struct blk_stat_callback *cb)
+{
+ struct rq_nlat *nlat = cb->data;
+ int n;
+
+ for (n = 0; n < cb->buckets; n++) {
+ if (cb->stat[n].nr_samples < nlat->lowat) {
+ /*
+ * 'decay' the latency by the specified
+ * percentage to ensure the queues are
+ * being tested to balance out temporary
+ * latency spikes.
+ */
+ nlat->latency[n] =
+ div64_u64(nlat->latency[n] * nlat->decay, 100);
+ } else
+ nlat->latency[n] = cb->stat[n].mean;
+ nlat->samples[n] = cb->stat[n].nr_samples;
+ }
+ if (nlat->enabled)
+ blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
+}
+
+static int nlat_bucket_node(const struct request *rq)
+{
+ if (!rq->mq_ctx)
+ return -1;
+ return cpu_to_node(blk_mq_rq_cpu((struct request *)rq));
+}
+
+static void nlat_exit(struct rq_qos *rqos)
+{
+ struct rq_nlat *nlat = RQNLAT(rqos);
+
+ blk_stat_remove_callback(nlat->rqos.disk->queue, nlat->cb);
+ blk_stat_free_callback(nlat->cb);
+ kfree(nlat->samples);
+ kfree(nlat->latency);
+ kfree(nlat);
+}
+
+#ifdef CONFIG_BLK_DEBUG_FS
+static int nlat_win_usec_show(void *data, struct seq_file *m)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+
+ seq_printf(m, "%llu\n", nlat->win_usec);
+ return 0;
+}
+
+static ssize_t nlat_win_usec_write(void *data, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+ char val[16] = { };
+ u64 usec;
+ int err;
+
+ if (blk_queue_dying(nlat->rqos.disk->queue))
+ return -ENOENT;
+
+ if (count >= sizeof(val))
+ return -EINVAL;
+
+ if (copy_from_user(val, buf, count))
+ return -EFAULT;
+
+ err = kstrtoull(val, 10, &usec);
+ if (err)
+ return err;
+ blk_stat_deactivate(nlat->cb);
+ nlat->win_usec = usec;
+ blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
+
+ return count;
+}
+
+static int nlat_lowat_show(void *data, struct seq_file *m)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+
+ seq_printf(m, "%u\n", nlat->lowat);
+ return 0;
+}
+
+static ssize_t nlat_lowat_write(void *data, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+ char val[16] = { };
+ unsigned int lowat;
+ int err;
+
+ if (blk_queue_dying(nlat->rqos.disk->queue))
+ return -ENOENT;
+
+ if (count >= sizeof(val))
+ return -EINVAL;
+
+ if (copy_from_user(val, buf, count))
+ return -EFAULT;
+
+ err = kstrtouint(val, 10, &lowat);
+ if (err)
+ return err;
+ blk_stat_deactivate(nlat->cb);
+ nlat->lowat = lowat;
+ blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
+
+ return count;
+}
+
+static int nlat_decay_show(void *data, struct seq_file *m)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+
+ seq_printf(m, "%u\n", nlat->decay);
+ return 0;
+}
+
+static ssize_t nlat_decay_write(void *data, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+ char val[16] = { };
+ unsigned int decay;
+ int err;
+
+ if (blk_queue_dying(nlat->rqos.disk->queue))
+ return -ENOENT;
+
+ if (count >= sizeof(val))
+ return -EINVAL;
+
+ if (copy_from_user(val, buf, count))
+ return -EFAULT;
+
+ err = kstrtouint(val, 10, &decay);
+ if (err)
+ return err;
+ if (decay > 100)
+ return -EINVAL;
+ blk_stat_deactivate(nlat->cb);
+ nlat->decay = decay;
+ blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
+
+ return count;
+}
+
+static int nlat_enabled_show(void *data, struct seq_file *m)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+
+ seq_printf(m, "%d\n", nlat->enabled);
+ return 0;
+}
+
+static int nlat_id_show(void *data, struct seq_file *m)
+{
+ struct rq_qos *rqos = data;
+
+ seq_printf(m, "%u\n", rqos->id);
+ return 0;
+}
+
+static int nlat_latency_show(void *data, struct seq_file *m)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+ int n;
+
+ if (!nlat->enabled)
+ return 0;
+
+ for (n = 0; n < nlat->num; n++) {
+ if (n > 0)
+ seq_puts(m, " ");
+ seq_printf(m, "%llu", nlat->latency[n]);
+ }
+ seq_puts(m, "\n");
+ return 0;
+}
+
+static int nlat_samples_show(void *data, struct seq_file *m)
+{
+ struct rq_qos *rqos = data;
+ struct rq_nlat *nlat = RQNLAT(rqos);
+ int n;
+
+ if (!nlat->enabled)
+ return 0;
+
+ for (n = 0; n < nlat->num; n++) {
+ if (n > 0)
+ seq_puts(m, " ");
+ seq_printf(m, "%u", nlat->samples[n]);
+ }
+ seq_puts(m, "\n");
+ return 0;
+}
+
+static const struct blk_mq_debugfs_attr nlat_debugfs_attrs[] = {
+ {"win_usec", 0600, nlat_win_usec_show, nlat_win_usec_write},
+ {"lowat", 0600, nlat_lowat_show, nlat_lowat_write},
+ {"decay", 0600, nlat_decay_show, nlat_decay_write},
+ {"enabled", 0400, nlat_enabled_show},
+ {"id", 0400, nlat_id_show},
+ {"latency", 0400, nlat_latency_show},
+ {"samples", 0400, nlat_samples_show},
+ {},
+};
+#endif
+
+static const struct rq_qos_ops nlat_rqos_ops = {
+ .exit = nlat_exit,
+#ifdef CONFIG_BLK_DEBUG_FS
+ .debugfs_attrs = nlat_debugfs_attrs,
+#endif
+};
+
+u64 blk_nlat_latency(struct gendisk *disk, int node)
+{
+ struct rq_qos *rqos;
+ struct rq_nlat *nlat;
+
+ rqos = nlat_rq_qos(disk->queue);
+ if (!rqos)
+ return 0;
+ nlat = RQNLAT(rqos);
+ if (node > nlat->num)
+ return 0;
+
+ return div64_u64(nlat->latency[node], 1000);
+}
+EXPORT_SYMBOL_GPL(blk_nlat_latency);
+
+int blk_nlat_enable(struct gendisk *disk)
+{
+ struct rq_qos *rqos;
+ struct rq_nlat *nlat;
+
+ /* Latency tracking not enabled? */
+ rqos = nlat_rq_qos(disk->queue);
+ if (!rqos)
+ return -EINVAL;
+ nlat = RQNLAT(rqos);
+ if (nlat->enabled)
+ return 0;
+
+ /* Queue not registered? Maybe shutting down... */
+ if (!blk_queue_registered(disk->queue))
+ return -EAGAIN;
+
+ nlat->enabled = true;
+ memset(nlat->latency, 0, sizeof(u64) * nlat->num);
+ memset(nlat->samples, 0, sizeof(unsigned int) * nlat->num);
+ blk_stat_activate_nsecs(nlat->cb, nlat->win_usec * 1000);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(blk_nlat_enable);
+
+void blk_nlat_disable(struct gendisk *disk)
+{
+ struct rq_qos *rqos = nlat_rq_qos(disk->queue);
+ struct rq_nlat *nlat;
+
+ if (!rqos)
+ return;
+ nlat = RQNLAT(rqos);
+ if (nlat->enabled) {
+ blk_stat_deactivate(nlat->cb);
+ nlat->enabled = false;
+ }
+}
+EXPORT_SYMBOL_GPL(blk_nlat_disable);
+
+int blk_nlat_init(struct gendisk *disk)
+{
+ struct rq_nlat *nlat;
+ int ret = -ENOMEM;
+
+ nlat = kzalloc(sizeof(*nlat), GFP_KERNEL);
+ if (!nlat)
+ return -ENOMEM;
+
+ nlat->num = num_possible_nodes();
+ nlat->lowat = NLAT_DEFAULT_LOWAT;
+ nlat->decay = NLAT_DEFAULT_DECAY;
+ nlat->win_usec = nlat_default_latency_usec(disk->queue);
+
+ nlat->latency = kcalloc(nlat->num, sizeof(u64), GFP_KERNEL);
+ if (!nlat->latency)
+ goto err_free;
+ nlat->samples = kcalloc(nlat->num, sizeof(unsigned int), GFP_KERNEL);
+ if (!nlat->samples)
+ goto err_free;
+ nlat->cb = blk_stat_alloc_callback(nlat_timer_fn, nlat_bucket_node,
+ nlat->num, nlat);
+ if (!nlat->cb)
+ goto err_free;
+
+ /*
+ * Assign rwb and add the stats callback.
+ */
+ mutex_lock(&disk->queue->rq_qos_mutex);
+ ret = rq_qos_add(&nlat->rqos, disk, RQ_QOS_NLAT, &nlat_rqos_ops);
+ mutex_unlock(&disk->queue->rq_qos_mutex);
+ if (ret)
+ goto err_free_cb;
+
+ blk_stat_add_callback(disk->queue, nlat->cb);
+
+ return 0;
+
+err_free_cb:
+ blk_stat_free_callback(nlat->cb);
+err_free:
+ kfree(nlat->samples);
+ kfree(nlat->latency);
+ kfree(nlat);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(blk_nlat_init);
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 37245c97ee61..2fc11ced0c00 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -17,6 +17,7 @@ enum rq_qos_id {
RQ_QOS_WBT,
RQ_QOS_LATENCY,
RQ_QOS_COST,
+ RQ_QOS_NLAT,
};

struct rq_wait {
@@ -79,6 +80,11 @@ static inline struct rq_qos *iolat_rq_qos(struct request_queue *q)
return rq_qos_id(q, RQ_QOS_LATENCY);
}

+static inline struct rq_qos *nlat_rq_qos(struct request_queue *q)
+{
+ return rq_qos_id(q, RQ_QOS_NLAT);
+}
+
static inline void rq_wait_init(struct rq_wait *rq_wait)
{
atomic_set(&rq_wait->inflight, 0);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index d3d8fd8e229b..1f3829627f1b 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -1231,4 +1231,15 @@ static inline bool blk_req_can_dispatch_to_zone(struct request *rq)
}
#endif /* CONFIG_BLK_DEV_ZONED */

+#ifdef CONFIG_BLK_NODE_LATENCY
+int blk_nlat_enable(struct gendisk *disk);
+void blk_nlat_disable(struct gendisk *disk);
+u64 blk_nlat_latency(struct gendisk *disk, int node);
+int blk_nlat_init(struct gendisk *disk);
+#else
+static inline int blk_nlat_enable(struct gendisk *disk) { return 0; }
+static inline void blk_nlat_disable(struct gendisk *disk) {}
+static inline u64 blk_nlat_latency(struct gendisk *disk, int node) { return 0; }
+static inline int blk_nlat_init(struct gendisk *disk) { return -EOPNOTSUPP; }
+#endif
#endif /* BLK_MQ_H */
--
2.39.3