[PATCH 10/10] blkcg: implement BPF_PROG_TYPE_IO_COST

From: Tejun Heo
Date: Thu Jun 13 2019 - 22:01:56 EST


Currently, blkcg implements one builtin IO cost model - linear. To
allow customization and experimentation, allow a bpf program to
override the IO cost model.
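
A bpf cost model is attached per block device with the new
BLKBPFIOCOST ioctl. A minimal sketch of the attach flow (mirroring
tools/testing/selftests/bpf/iocost_ctrl.c added below; error handling
omitted, "/dev/sdX" is a placeholder device):

  #include <fcntl.h>
  #include <sys/ioctl.h>
  #include <linux/fs.h>
  #include <bpf/bpf.h>
  #include <bpf/libbpf.h>

  struct bpf_object *obj;
  int dev_fd, prog_fd;

  /* load the IO_COST program built from progs/iocost_linear_prog.c */
  bpf_prog_load("iocost_linear_prog.o", BPF_PROG_TYPE_IO_COST,
                &obj, &prog_fd);

  /* attach to the device; passing -1 as the fd detaches instead */
  dev_fd = open("/dev/sdX", O_RDONLY);
  ioctl(dev_fd, BLKBPFIOCOST, (long)prog_fd);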

Signed-off-by: Tejun Heo <tj@xxxxxxxxxx>
---
block/Kconfig | 3 +
block/blk-ioweight.c | 148 +++++++++++++++++-
block/blk.h | 8 +
block/ioctl.c | 4 +
include/linux/bpf_types.h | 3 +
include/uapi/linux/bpf.h | 11 ++
include/uapi/linux/fs.h | 2 +
tools/bpf/bpftool/feature.c | 3 +
tools/bpf/bpftool/main.h | 1 +
tools/include/uapi/linux/bpf.h | 11 ++
tools/include/uapi/linux/fs.h | 2 +
tools/lib/bpf/libbpf.c | 2 +
tools/lib/bpf/libbpf_probes.c | 1 +
tools/testing/selftests/bpf/Makefile | 2 +-
tools/testing/selftests/bpf/iocost_ctrl.c | 43 +++++
.../selftests/bpf/progs/iocost_linear_prog.c | 52 ++++++
16 files changed, 287 insertions(+), 9 deletions(-)
create mode 100644 tools/testing/selftests/bpf/iocost_ctrl.c
create mode 100644 tools/testing/selftests/bpf/progs/iocost_linear_prog.c

diff --git a/block/Kconfig b/block/Kconfig
index 15b3de28a264..2882fdd573ca 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -204,4 +204,7 @@ config BLK_MQ_RDMA
config BLK_PM
def_bool BLOCK && PM

+config BLK_BPF_IO_COST
+ def_bool BLK_CGROUP_IOWEIGHT && BPF_SYSCALL
+
source "block/Kconfig.iosched"
diff --git a/block/blk-ioweight.c b/block/blk-ioweight.c
index 3d9fc1a631be..de4fc57bb77c 100644
--- a/block/blk-ioweight.c
+++ b/block/blk-ioweight.c
@@ -43,6 +43,10 @@
* parameters can be configured from userspace via
* /sys/block/DEV/queue/io_cost_model.
*
+ * For experimentation and refinement, the IO cost model can also be
+ * replaced by an IO_COST bpf program. See progs/iocost_linear_prog.c and
+ * iocost_ctrl.c under tools/testing/selftests/bpf for usage details.
+ *
* 2. Control Strategy
*
* The device virtual time (vtime) is used as the primary control metric.
@@ -176,6 +180,7 @@
#include <linux/parser.h>
#include <linux/sched/signal.h>
#include <linux/blk-cgroup.h>
+#include <linux/filter.h>
#include "blk-rq-qos.h"
#include "blk-stat.h"
#include "blk-wbt.h"
@@ -387,6 +392,10 @@ struct iow {
bool enabled;

struct iow_params params;
+#ifdef CONFIG_BPF_SYSCALL
+ /* if non-NULL, bpf cost model is being used */
+ struct bpf_prog __rcu *cost_prog;
+#endif
u32 period_us;
u32 margin_us;
u64 vrate_min;
@@ -1565,6 +1574,45 @@ static void iow_timer_fn(struct timer_list *timer)
spin_unlock_irq(&iow->lock);
}

+#ifdef CONFIG_BLK_BPF_IO_COST
+static bool calc_vtime_cost_bpf(struct bio *bio, struct iow_gq *iowg,
+ bool is_merge, u64 *costp)
+{
+ struct iow *iow = iowg->iow;
+ struct bpf_prog *prog;
+ bool ret = false;
+
+ if (!iow->cost_prog)
+ return ret;
+
+ rcu_read_lock();
+ prog = rcu_dereference(iow->cost_prog);
+ if (prog) {
+ struct bpf_io_cost ctx = {
+ .cost = 0,
+ .opf = bio->bi_opf,
+ .nr_sectors = bio_sectors(bio),
+ .sector = bio->bi_iter.bi_sector,
+ .last_sector = iowg->cursor,
+ .is_merge = is_merge,
+ };
+
+ BPF_PROG_RUN(prog, &ctx);
+ *costp = ctx.cost;
+ ret = true;
+ }
+ rcu_read_unlock();
+
+ return ret;
+}
+#else
+static bool calc_vtime_cost_bpf(struct bio *bio, struct iow_gq *iowg,
+ bool is_merge, u64 *costp)
+{
+ return false;
+}
+#endif
+
static void calc_vtime_cost_builtin(struct bio *bio, struct iow_gq *iowg,
bool is_merge, u64 *costp)
{
@@ -1610,6 +1658,9 @@ static u64 calc_vtime_cost(struct bio *bio, struct iow_gq *iowg, bool is_merge)
{
u64 cost;

+ if (calc_vtime_cost_bpf(bio, iowg, is_merge, &cost))
+ return cost;
+
calc_vtime_cost_builtin(bio, iowg, is_merge, &cost);
return cost;
}
@@ -2214,14 +2265,17 @@ static u64 iow_cost_model_prfill(struct seq_file *sf,
if (!dname)
return 0;

- seq_printf(sf, "%s ctrl=%s model=linear "
- "rbps=%llu rseqiops=%llu rrandiops=%llu "
- "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
- dname, iow->user_cost_model ? "user" : "auto",
- u[I_LCOEF_RBPS],
- u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
- u[I_LCOEF_WBPS],
- u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
+ if (iow->cost_prog)
+ seq_printf(sf, "%s ctrl=bpf\n", dname);
+ else
+ seq_printf(sf, "%s ctrl=%s model=linear "
+ "rbps=%llu rseqiops=%llu rrandiops=%llu "
+ "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
+ dname, iow->user_cost_model ? "user" : "auto",
+ u[I_LCOEF_RBPS],
+ u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
+ u[I_LCOEF_WBPS],
+ u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
return 0;
}

@@ -2363,6 +2417,84 @@ static struct blkcg_policy blkcg_policy_iow = {
.pd_free_fn = iow_pd_free,
};

+#ifdef CONFIG_BLK_BPF_IO_COST
+static bool io_cost_is_valid_access(int off, int size,
+ enum bpf_access_type type,
+ const struct bpf_prog *prog,
+ struct bpf_insn_access_aux *info)
+{
+ if (off < 0 || off >= sizeof(struct bpf_io_cost) || off % size)
+ return false;
+
+ if (off != offsetof(struct bpf_io_cost, cost) && type != BPF_READ)
+ return false;
+
+ switch (off) {
+ case bpf_ctx_range(struct bpf_io_cost, opf):
+ bpf_ctx_record_field_size(info, sizeof(__u32));
+ return bpf_ctx_narrow_access_ok(off, size, sizeof(__u32));
+ case offsetof(struct bpf_io_cost, nr_sectors):
+ return size == sizeof(__u32);
+ case offsetof(struct bpf_io_cost, cost):
+ case offsetof(struct bpf_io_cost, sector):
+ case offsetof(struct bpf_io_cost, last_sector):
+ return size == sizeof(__u64);
+ case offsetof(struct bpf_io_cost, is_merge):
+ return size == sizeof(__u8);
+ }
+
+ return false;
+}
+
+const struct bpf_prog_ops io_cost_prog_ops = {
+};
+
+const struct bpf_verifier_ops io_cost_verifier_ops = {
+ .is_valid_access = io_cost_is_valid_access,
+};
+
+int blk_bpf_io_cost_ioctl(struct block_device *bdev, unsigned cmd,
+ char __user *arg)
+{
+ int prog_fd = (int)(long)arg;
+ struct bpf_prog *prog = NULL;
+ struct request_queue *q;
+ struct iow *iow;
+ int ret = 0;
+
+ q = bdev_get_queue(bdev);
+ if (!q)
+ return -ENXIO;
+ iow = q_to_iow(q);
+
+ if (prog_fd >= 0) {
+ prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_IO_COST);
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ spin_lock_irq(&iow->lock);
+ if (!iow->cost_prog) {
+ rcu_assign_pointer(iow->cost_prog, prog);
+ prog = NULL;
+ } else {
+ ret = -EEXIST;
+ }
+ spin_unlock_irq(&iow->lock);
+ } else {
+ spin_lock_irq(&iow->lock);
+ if (iow->cost_prog) {
+ prog = iow->cost_prog;
+ rcu_assign_pointer(iow->cost_prog, NULL);
+ }
+ spin_unlock_irq(&iow->lock);
+ }
+
+ if (prog)
+ bpf_prog_put(prog);
+ return ret;
+}
+#endif /* CONFIG_BLK_BPF_IO_COST */
+
static int __init iow_init(void)
{
return blkcg_policy_register(&blkcg_policy_iow);
diff --git a/block/blk.h b/block/blk.h
index 7814aa207153..98fa2283534f 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -317,6 +317,14 @@ static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio)
}
#endif /* CONFIG_BOUNCE */

+#ifdef CONFIG_BLK_BPF_IO_COST
+int blk_bpf_io_cost_ioctl(struct block_device *bdev, unsigned cmd,
+ char __user *arg);
+#else
+static inline int blk_bpf_io_cost_ioctl(struct block_device *bdev, unsigned cmd,
+ char __user *arg) { return -ENOTTY; }
+#endif
+
#ifdef CONFIG_BLK_CGROUP_IOLATENCY
extern int blk_iolatency_init(struct request_queue *q);
#else
diff --git a/block/ioctl.c b/block/ioctl.c
index 15a0eb80ada9..89d48d7dea0f 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -11,6 +11,8 @@
#include <linux/pr.h>
#include <linux/uaccess.h>

+#include "blk.h"
+
static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user *arg)
{
struct block_device *bdevp;
@@ -590,6 +592,8 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
case BLKTRACESETUP:
case BLKTRACETEARDOWN:
return blk_trace_ioctl(bdev, cmd, argp);
+ case BLKBPFIOCOST:
+ return blk_bpf_io_cost_ioctl(bdev, cmd, argp);
case IOC_PR_REGISTER:
return blkdev_pr_register(bdev, argp);
case IOC_PR_RESERVE:
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 5a9975678d6f..fb0a91c655c2 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -37,6 +37,9 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2)
#ifdef CONFIG_INET
BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport)
#endif
+#ifdef CONFIG_BLK_BPF_IO_COST
+BPF_PROG_TYPE(BPF_PROG_TYPE_IO_COST, io_cost)
+#endif

BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 63e0cf66f01a..1664ef4ccc79 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -170,6 +170,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_FLOW_DISSECTOR,
BPF_PROG_TYPE_CGROUP_SYSCTL,
BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
+ BPF_PROG_TYPE_IO_COST,
};

enum bpf_attach_type {
@@ -3472,6 +3473,16 @@ struct bpf_flow_keys {
};
};

+struct bpf_io_cost {
+ __u64 cost; /* output */
+
+ __u32 opf;
+ __u32 nr_sectors;
+ __u64 sector;
+ __u64 last_sector;
+ __u8 is_merge;
+};
+
struct bpf_func_info {
__u32 insn_off;
__u32 type_id;
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 59c71fa8c553..ddf3c80c9407 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -181,6 +181,8 @@ struct fsxattr {
#define BLKSECDISCARD _IO(0x12,125)
#define BLKROTATIONAL _IO(0x12,126)
#define BLKZEROOUT _IO(0x12,127)
+#define BLKBPFIOCOST _IO(0x12, 128)
+
/*
* A jump here: 130-131 are reserved for zoned block devices
* (see uapi/linux/blkzoned.h)
diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c
index d672d9086fff..beeac8ac48f3 100644
--- a/tools/bpf/bpftool/feature.c
+++ b/tools/bpf/bpftool/feature.c
@@ -383,6 +383,9 @@ static void probe_kernel_image_config(void)
/* bpftilter module with "user mode helper" */
"CONFIG_BPFILTER_UMH",

+ /* Block */
+ "CONFIG_BLK_IO_COST",
+
/* test_bpf module for BPF tests */
"CONFIG_TEST_BPF",
};
diff --git a/tools/bpf/bpftool/main.h b/tools/bpf/bpftool/main.h
index 3d63feb7f852..298e53f35573 100644
--- a/tools/bpf/bpftool/main.h
+++ b/tools/bpf/bpftool/main.h
@@ -74,6 +74,7 @@ static const char * const prog_type_name[] = {
[BPF_PROG_TYPE_SK_REUSEPORT] = "sk_reuseport",
[BPF_PROG_TYPE_FLOW_DISSECTOR] = "flow_dissector",
[BPF_PROG_TYPE_CGROUP_SYSCTL] = "cgroup_sysctl",
+ [BPF_PROG_TYPE_IO_COST] = "io_cost",
};

extern const char * const map_type_name[];
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 63e0cf66f01a..1664ef4ccc79 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -170,6 +170,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_FLOW_DISSECTOR,
BPF_PROG_TYPE_CGROUP_SYSCTL,
BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE,
+ BPF_PROG_TYPE_IO_COST,
};

enum bpf_attach_type {
@@ -3472,6 +3473,16 @@ struct bpf_flow_keys {
};
};

+struct bpf_io_cost {
+ __u64 cost; /* output */
+
+ __u32 opf;
+ __u32 nr_sectors;
+ __u64 sector;
+ __u64 last_sector;
+ __u8 is_merge;
+};
+
struct bpf_func_info {
__u32 insn_off;
__u32 type_id;
diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h
index 59c71fa8c553..ddf3c80c9407 100644
--- a/tools/include/uapi/linux/fs.h
+++ b/tools/include/uapi/linux/fs.h
@@ -181,6 +181,8 @@ struct fsxattr {
#define BLKSECDISCARD _IO(0x12,125)
#define BLKROTATIONAL _IO(0x12,126)
#define BLKZEROOUT _IO(0x12,127)
+#define BLKBPFIOCOST _IO(0x12, 128)
+
/*
* A jump here: 130-131 are reserved for zoned block devices
* (see uapi/linux/blkzoned.h)
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 197b574406b3..6dbee409f3b0 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -2266,6 +2266,7 @@ static bool bpf_prog_type__needs_kver(enum bpf_prog_type type)
case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE:
case BPF_PROG_TYPE_PERF_EVENT:
case BPF_PROG_TYPE_CGROUP_SYSCTL:
+ case BPF_PROG_TYPE_IO_COST:
return false;
case BPF_PROG_TYPE_KPROBE:
default:
@@ -3168,6 +3169,7 @@ static const struct {
BPF_PROG_SEC("lwt_out", BPF_PROG_TYPE_LWT_OUT),
BPF_PROG_SEC("lwt_xmit", BPF_PROG_TYPE_LWT_XMIT),
BPF_PROG_SEC("lwt_seg6local", BPF_PROG_TYPE_LWT_SEG6LOCAL),
+ BPF_PROG_SEC("io_cost", BPF_PROG_TYPE_IO_COST),
BPF_APROG_SEC("cgroup_skb/ingress", BPF_PROG_TYPE_CGROUP_SKB,
BPF_CGROUP_INET_INGRESS),
BPF_APROG_SEC("cgroup_skb/egress", BPF_PROG_TYPE_CGROUP_SKB,
diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c
index 5e2aa83f637a..024831756151 100644
--- a/tools/lib/bpf/libbpf_probes.c
+++ b/tools/lib/bpf/libbpf_probes.c
@@ -101,6 +101,7 @@ probe_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns,
case BPF_PROG_TYPE_SK_REUSEPORT:
case BPF_PROG_TYPE_FLOW_DISSECTOR:
case BPF_PROG_TYPE_CGROUP_SYSCTL:
+ case BPF_PROG_TYPE_IO_COST:
default:
break;
}
diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile
index 66f2dca1dee1..c28f308c9575 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -23,7 +23,7 @@ TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test
test_align test_verifier_log test_dev_cgroup test_tcpbpf_user \
test_sock test_btf test_sockmap test_lirc_mode2_user get_cgroup_id_user \
test_socket_cookie test_cgroup_storage test_select_reuseport test_section_names \
- test_netcnt test_tcpnotify_user test_sock_fields test_sysctl
+ test_netcnt test_tcpnotify_user test_sock_fields test_sysctl iocost_ctrl

BPF_OBJ_FILES = $(patsubst %.c,%.o, $(notdir $(wildcard progs/*.c)))
TEST_GEN_FILES = $(BPF_OBJ_FILES)
diff --git a/tools/testing/selftests/bpf/iocost_ctrl.c b/tools/testing/selftests/bpf/iocost_ctrl.c
new file mode 100644
index 000000000000..d9d3eb70d0ac
--- /dev/null
+++ b/tools/testing/selftests/bpf/iocost_ctrl.c
@@ -0,0 +1,43 @@
+#include <stdio.h>
+#include <errno.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <fcntl.h>
+
+#include <linux/bpf.h>
+#include <bpf/bpf.h>
+#include <bpf/libbpf.h>
+
+#include <linux/fs.h>
+
+int main(int argc, char **argv)
+{
+ struct bpf_object *obj;
+ int dev_fd, prog_fd = -1;
+
+ if (argc < 2) {
+ fprintf(stderr, "Usage: iocost_ctrl BLKDEV [BPF_PROG]\n");
+ return 1;
+ }
+
+ dev_fd = open(argv[1], O_RDONLY);
+ if (dev_fd < 0) {
+ perror("open(BLKDEV)");
+ return 1;
+ }
+
+ if (argc > 2) {
+ if (bpf_prog_load(argv[2], BPF_PROG_TYPE_IO_COST,
+ &obj, &prog_fd)) {
+ perror("bpf_prog_load(BPF_PROG)");
+ return 1;
+ }
+ }
+
+ if (ioctl(dev_fd, BLKBPFIOCOST, (long)prog_fd)) {
+ perror("ioctl(BLKBPFIOCOST)");
+ return 1;
+ }
+ return 0;
+}
diff --git a/tools/testing/selftests/bpf/progs/iocost_linear_prog.c b/tools/testing/selftests/bpf/progs/iocost_linear_prog.c
new file mode 100644
index 000000000000..4e202c595658
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/iocost_linear_prog.c
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/version.h>
+#include <linux/bpf.h>
+#include "bpf_helpers.h"
+
+#define REQ_OP_READ 0
+#define REQ_OP_WRITE 1
+#define REQ_OP_BITS 8
+#define REQ_OP_MASK ((1 << REQ_OP_BITS) - 1)
+
+#define LCOEF_RSEQIO 14663889
+#define LCOEF_RRANDIO 248752010
+#define LCOEF_RPAGE 28151808
+#define LCOEF_WSEQIO 32671670
+#define LCOEF_WRANDIO 63150006
+#define LCOEF_WPAGE 7323648
+
+#define RAND_IO_CUTOFF 10
+
+SEC("io_cost")
+int func(struct bpf_io_cost *ctx)
+{
+ int op;
+ __u64 seqio, randio, page;
+ __s64 delta;
+
+ switch (ctx->opf & REQ_OP_MASK) {
+ case REQ_OP_READ:
+ seqio = LCOEF_RSEQIO;
+ randio = LCOEF_RRANDIO;
+ page = LCOEF_RPAGE;
+ break;
+ case REQ_OP_WRITE:
+ seqio = LCOEF_WSEQIO;
+ randio = LCOEF_WRANDIO;
+ page = LCOEF_WPAGE;
+ break;
+ default:
+ return 0;
+ }
+
+ delta = ctx->sector - ctx->last_sector;
+ if (delta >= -RAND_IO_CUTOFF && delta <= RAND_IO_CUTOFF)
+ ctx->cost += seqio;
+ else
+ ctx->cost += randio;
+ if (!ctx->is_merge)
+ ctx->cost += page * (ctx->nr_sectors >> 3);
+
+ return 0;
+}
--
2.17.1