[PATCH 1/1] block io layer filters api

From: Sergei Shtepa
Date: Thu Aug 27 2020 - 15:23:00 EST


To register a filter, a user of the API calls blk_filter_register()
with a struct blk_filter whose blk_filter_ops callbacks allow it to
intercept the following events in the system:
* bio requests
* addition of a disk
* removal of a disk

To unregister a filter, a user of the API calls blk_filter_unregister().
Multiple filters can be stacked at different altitudes. When a bio
request is intercepted, it can be passed on to the filter at the next
lower altitude or it can be sent for completion.

Signed-off-by: Sergei Shtepa <sergei.shtepa@xxxxxxxxx>
---
block/Kconfig | 11 ++
block/Makefile | 1 +
block/blk-core.c | 11 +-
block/blk-filter-internal.h | 34 +++++
block/blk-filter.c | 288 ++++++++++++++++++++++++++++++++++++
block/genhd.c | 24 +++
include/linux/blk-filter.h | 41 +++++
include/linux/genhd.h | 2 +
8 files changed, 410 insertions(+), 2 deletions(-)
create mode 100644 block/blk-filter-internal.h
create mode 100644 block/blk-filter.c
create mode 100644 include/linux/blk-filter.h

diff --git a/block/Kconfig b/block/Kconfig
index bbad5e8bbffe..a308801b4376 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -204,6 +204,17 @@ config BLK_INLINE_ENCRYPTION_FALLBACK
by falling back to the kernel crypto API when inline
encryption hardware is not present.

+config BLK_FILTER
+	bool "Enable support for block layer filters"
+	default n
+	depends on MODULES
+	help
+	  Enabling this lets third-party kernel modules intercept
+	  bio requests for any block device. This allows them to implement
+	  changed block tracking and snapshots without any reconfiguration of
+	  the existing setup. For example, this option allows snapshotting of
+	  a block device without adding it to LVM.
+
menu "Partition Types"

source "block/partitions/Kconfig"
diff --git a/block/Makefile b/block/Makefile
index 8d841f5f986f..b8ee50b8e031 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -38,3 +38,4 @@ obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o
obj-$(CONFIG_BLK_PM) += blk-pm.o
obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += keyslot-manager.o blk-crypto.o
obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o
+obj-$(CONFIG_BLK_FILTER) += blk-filter.o
diff --git a/block/blk-core.c b/block/blk-core.c
index d9d632639bd1..3421ddeb69e5 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -50,6 +50,7 @@
#include "blk-mq-sched.h"
#include "blk-pm.h"
#include "blk-rq-qos.h"
+#include "blk-filter-internal.h"

struct dentry *blk_debugfs_root;

@@ -1273,13 +1274,19 @@ blk_qc_t submit_bio(struct bio *bio)
blk_qc_t ret;

psi_memstall_enter(&pflags);
- ret = submit_bio_noacct(bio);
+ if (IS_ENABLED(CONFIG_BLK_FILTER))
+ ret = blk_filter_submit_bio(bio);
+ else
+ ret = submit_bio_noacct(bio);
psi_memstall_leave(&pflags);

return ret;
}

- return submit_bio_noacct(bio);
+ if (IS_ENABLED(CONFIG_BLK_FILTER))
+ return blk_filter_submit_bio(bio);
+ else
+ return submit_bio_noacct(bio);
}
EXPORT_SYMBOL(submit_bio);

diff --git a/block/blk-filter-internal.h b/block/blk-filter-internal.h
new file mode 100644
index 000000000000..942066f3fecb
--- /dev/null
+++ b/block/blk-filter-internal.h
@@ -0,0 +1,34 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ *
+ * Block device filters internal declarations
+ */
+
+#ifndef BLK_FILTER_INTERNAL_H
+#define BLK_FILTER_INTERNAL_H
+
+/* struct gendisk, struct bio, blk_qc_t and submit_bio_noacct() */
+#include <linux/blkdev.h>
+
+#ifdef CONFIG_BLK_FILTER
+#include <linux/blk-filter.h>
+
+/* Disk life-cycle notifications, called from block/genhd.c. */
+void blk_filter_disk_add(struct gendisk *disk);
+
+void blk_filter_disk_del(struct gendisk *disk);
+
+void blk_filter_disk_release(struct gendisk *disk);
+
+/* Entry point used by submit_bio() to route a new bio through the filters. */
+blk_qc_t blk_filter_submit_bio(struct bio *bio);
+
+#else /* CONFIG_BLK_FILTER */
+
+/* No filter can be registered, so the disk notifications do nothing. */
+static inline void blk_filter_disk_add(struct gendisk *disk) { }
+
+static inline void blk_filter_disk_del(struct gendisk *disk) { }
+
+static inline void blk_filter_disk_release(struct gendisk *disk) { }
+
+/*
+ * With no filters the bio goes straight to the block layer. Returning a
+ * constant here would leave the bio neither submitted nor completed if a
+ * caller ever used this stub without an IS_ENABLED() guard.
+ */
+static inline blk_qc_t blk_filter_submit_bio(struct bio *bio)
+{
+	return submit_bio_noacct(bio);
+}
+
+#endif /* CONFIG_BLK_FILTER */
+
+#endif /* BLK_FILTER_INTERNAL_H */
diff --git a/block/blk-filter.c b/block/blk-filter.c
new file mode 100644
index 000000000000..edb30f342a3d
--- /dev/null
+++ b/block/blk-filter.c
@@ -0,0 +1,288 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/genhd.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include "blk-filter-internal.h"
+#include <linux/rwsem.h>
+
+/*
+ * Bookkeeping for one registered filter. A context is allocated by
+ * _blk_ctx_new(), stored in blk_filter_ctx_list at the filter's altitude
+ * and freed again when the filter is unregistered.
+ */
+struct blk_filter_ctx {
+ /* The filter description supplied to blk_filter_register(). */
+ struct blk_filter *filter;
+ /*
+ * Reserved for extension
+ */
+};
+
+/*
+ * Protects blk_filter_ctx_list: taken for writing when a filter is linked
+ * or unlinked, and for reading while the table is walked.
+ */
+static DECLARE_RWSEM(blk_filter_ctx_list_lock);
+
+/*
+ * Registered filter contexts, indexed by (altitude - 1). A NULL slot means
+ * no filter is registered at that altitude. The table is private to this
+ * file; static storage is zero-initialized, so no initializer is needed.
+ */
+static struct blk_filter_ctx *blk_filter_ctx_list[BLK_FILTER_ALTITUDE_MAX];
+
+/*
+ * Return the context stored for @altitude, or NULL if the slot is empty.
+ * Altitudes are 1-based while the table is 0-based. No range checking is
+ * done here; callers must pass a valid altitude.
+ */
+static inline struct blk_filter_ctx *_get_ctx(size_t altitude)
+{
+	const size_t slot = altitude - 1;
+
+	return blk_filter_ctx_list[slot];
+}
+
+/*
+ * Store @ctx (or NULL to clear) in the slot for the 1-based @altitude.
+ * No range checking is done here; callers must pass a valid altitude.
+ */
+static inline void _set_ctx(size_t altitude, struct blk_filter_ctx *ctx)
+{
+	const size_t slot = altitude - 1;
+
+	blk_filter_ctx_list[slot] = ctx;
+}
+
+/*
+ * Allocate a zeroed context and bind it to @filter.
+ * Returns the new context, or NULL if the allocation failed.
+ */
+static struct blk_filter_ctx *_blk_ctx_new(struct blk_filter *filter)
+{
+	struct blk_filter_ctx *new_ctx;
+
+	new_ctx = kzalloc(sizeof(*new_ctx), GFP_KERNEL);
+	if (new_ctx)
+		new_ctx->filter = filter;
+
+	return new_ctx;
+}
+
+/*
+ * Publish @ctx in the filter table at @altitude.
+ *
+ * Returns 0 on success, -ENOENT if @altitude is outside
+ * [BLK_FILTER_ALTITUDE_MIN, BLK_FILTER_ALTITUDE_MAX], or -EEXIST if another
+ * filter already occupies that altitude.
+ */
+static int _blk_ctx_link(struct blk_filter_ctx *ctx, size_t altitude)
+{
+	int result = 0;
+
+	/*
+	 * Altitudes are 1-based. Without the lower bound, altitude 0 would
+	 * make _get_ctx()/_set_ctx() index the table at (size_t)-1.
+	 */
+	if (altitude < BLK_FILTER_ALTITUDE_MIN ||
+	    altitude > BLK_FILTER_ALTITUDE_MAX)
+		return -ENOENT;
+
+	down_write(&blk_filter_ctx_list_lock);
+
+	if (_get_ctx(altitude))
+		result = -EEXIST;
+	else
+		_set_ctx(altitude, ctx);
+
+	up_write(&blk_filter_ctx_list_lock);
+
+	return result;
+}
+
+/*
+ * Remove @ctx from the filter table, whichever altitude it occupies.
+ *
+ * Returns 0 if the context was found and its slot cleared, or -EEXIST if
+ * no slot holds @ctx.
+ */
+static int _blk_ctx_unlink(struct blk_filter_ctx *ctx)
+{
+	size_t alt;
+	int ret = -EEXIST;
+
+	down_write(&blk_filter_ctx_list_lock);
+
+	for (alt = BLK_FILTER_ALTITUDE_MIN;
+	     alt <= BLK_FILTER_ALTITUDE_MAX; ++alt) {
+		struct blk_filter_ctx *cur = _get_ctx(alt);
+
+		/* Empty slots never match, even when @ctx itself is NULL. */
+		if (!cur || cur != ctx)
+			continue;
+
+		_set_ctx(alt, NULL);
+		ret = 0;
+		break;
+	}
+
+	up_write(&blk_filter_ctx_list_lock);
+
+	return ret;
+}
+
+/**
+ * blk_filter_disk_add() - Tell every registered filter about a new disk.
+ * @disk: The disk that has been added.
+ *
+ * Walks the altitudes from lowest to highest and calls the disk_add
+ * callback of each filter that provides one. The filter table is held
+ * read-locked while the callbacks run.
+ */
+void blk_filter_disk_add(struct gendisk *disk)
+{
+	size_t alt;
+
+	pr_warn("blk-filter: add disk [%s].\n", disk->disk_name);
+
+	down_read(&blk_filter_ctx_list_lock);
+
+	for (alt = BLK_FILTER_ALTITUDE_MIN;
+	     alt <= BLK_FILTER_ALTITUDE_MAX; ++alt) {
+		struct blk_filter_ctx *ctx = _get_ctx(alt);
+		const struct blk_filter_ops *ops;
+
+		if (!ctx)
+			continue;
+
+		ops = ctx->filter->ops;
+		if (ops && ops->disk_add)
+			ops->disk_add(disk);
+	}
+
+	up_read(&blk_filter_ctx_list_lock);
+}
+
+/**
+ * blk_filter_disk_del() - Tell every registered filter a disk is being deleted.
+ * @disk: The disk being deleted.
+ *
+ * Walks the altitudes from lowest to highest and calls the disk_del
+ * callback of each filter that provides one. The filter table is held
+ * read-locked while the callbacks run.
+ */
+void blk_filter_disk_del(struct gendisk *disk)
+{
+	size_t alt;
+
+	pr_warn("blk-filter: del disk [%s].\n", disk->disk_name);
+
+	down_read(&blk_filter_ctx_list_lock);
+
+	for (alt = BLK_FILTER_ALTITUDE_MIN;
+	     alt <= BLK_FILTER_ALTITUDE_MAX; ++alt) {
+		struct blk_filter_ctx *ctx = _get_ctx(alt);
+		const struct blk_filter_ops *ops;
+
+		if (!ctx)
+			continue;
+
+		ops = ctx->filter->ops;
+		if (ops && ops->disk_del)
+			ops->disk_del(disk);
+	}
+
+	up_read(&blk_filter_ctx_list_lock);
+}
+
+/**
+ * blk_filter_disk_release() - Notify filters when the disk is released.
+ * @disk: The disk to release.
+ *
+ * Walks the altitudes from highest to lowest (the reverse of the add/del
+ * notifications) and calls the disk_release callback of each filter that
+ * provides one. The filter table is held read-locked while the callbacks
+ * run.
+ */
+void blk_filter_disk_release(struct gendisk *disk)
+{
+	size_t altitude;
+
+	pr_warn("blk-filter: release disk [%s].\n", disk->disk_name);
+
+	down_read(&blk_filter_ctx_list_lock);
+
+	/*
+	 * Count down from MAX to MIN. The previous "altitude <= MIN" test
+	 * was false on the first iteration, so no filter was ever notified.
+	 * Termination relies on BLK_FILTER_ALTITUDE_MIN being at least 1:
+	 * the unsigned counter reaches 0 and fails the ">= MIN" test.
+	 */
+	for (altitude = BLK_FILTER_ALTITUDE_MAX;
+	     altitude >= BLK_FILTER_ALTITUDE_MIN; --altitude) {
+		struct blk_filter_ctx *ctx = _get_ctx(altitude);
+
+		if (ctx && ctx->filter->ops && ctx->filter->ops->disk_release)
+			ctx->filter->ops->disk_release(disk);
+	}
+
+	up_read(&blk_filter_ctx_list_lock);
+}
+
+/**
+ * blk_filter_submit_bio_altitude() - Send bio for processing to a specific filter.
+ * @altitude: The altitude at which to start looking for a filter.
+ * @bio: The new bio for block I/O layer.
+ *
+ * Searches downwards from @altitude for the first registered filter that
+ * has a submit_bio callback and hands the bio to it. If no such filter
+ * exists, the bio is passed to submit_bio_noacct().
+ *
+ * The filter callback runs with the filter table read-locked.
+ * NOTE(review): a filter that calls blk_filter_submit_bio_next() from its
+ * callback takes the read lock again while already holding it - confirm
+ * this is safe against a writer waiting in register/unregister.
+ *
+ * Return: Bio submitting result, like for submit_bio function.
+ */
+blk_qc_t blk_filter_submit_bio_altitude(size_t altitude, struct bio *bio)
+{
+	blk_qc_t ret;
+	bool handled = false;
+	size_t alt;
+
+	down_read(&blk_filter_ctx_list_lock);
+	for (alt = altitude; alt >= BLK_FILTER_ALTITUDE_MIN; --alt) {
+		struct blk_filter_ctx *ctx = _get_ctx(alt);
+		const struct blk_filter_ops *ops;
+
+		if (!ctx)
+			continue;
+
+		ops = ctx->filter->ops;
+		if (!ops || !ops->submit_bio)
+			continue;
+
+		/* The first filter found at or below @altitude takes the bio. */
+		ret = ops->submit_bio(bio);
+		handled = true;
+		break;
+	}
+	up_read(&blk_filter_ctx_list_lock);
+
+	if (!handled)
+		ret = submit_bio_noacct(bio);
+
+	return ret;
+}
+
+/**
+ * blk_filter_submit_bio() - Hand a new bio to the filter stack.
+ * @bio: The new bio for block I/O layer.
+ *
+ * Starts the filter search at the highest altitude.
+ *
+ * Return: Bio submitting result, like for submit_bio function.
+ */
+blk_qc_t blk_filter_submit_bio(struct bio *bio)
+{
+	const size_t top = BLK_FILTER_ALTITUDE_MAX;
+
+	return blk_filter_submit_bio_altitude(top, bio);
+}
+
+/**
+ * blk_filter_register() - Create new block I/O layer filter.
+ * @filter: The filter description structure.
+ *
+ * Allocates a context for @filter and links it at @filter->altitude. On
+ * success the context is stored in @filter->blk_filter_ctx.
+ *
+ * Return: Zero on success, -ENOMEM if the context cannot be allocated, or
+ * the error from linking (for example -EEXIST when the altitude is
+ * already taken).
+ */
+int blk_filter_register(struct blk_filter *filter)
+{
+	struct blk_filter_ctx *new_ctx;
+	int err;
+
+	pr_warn("blk-filter: register filter [%s].\n", filter->name);
+
+	new_ctx = _blk_ctx_new(filter);
+	if (!new_ctx)
+		return -ENOMEM;
+
+	err = _blk_ctx_link(new_ctx, filter->altitude);
+	if (err) {
+		/* Never published, so it can simply be freed. */
+		kfree(new_ctx);
+		return err;
+	}
+
+	filter->blk_filter_ctx = (void *)new_ctx;
+	return 0;
+}
+EXPORT_SYMBOL(blk_filter_register);
+
+/**
+ * blk_filter_unregister() - Remove existing block I/O layer filter.
+ * @filter: The filter description structure.
+ *
+ * Unlinks the context stored in @filter->blk_filter_ctx from the filter
+ * table and frees it. On success @filter->blk_filter_ctx is reset to NULL.
+ *
+ * Return: Zero if the filter was removed successfully, or -EEXIST if the
+ * filter is not currently registered.
+ */
+int blk_filter_unregister(struct blk_filter *filter)
+{
+	int result;
+	struct blk_filter_ctx *ctx;
+
+	pr_warn("blk-filter: unregister filter [%s].\n", filter->name);
+
+	ctx = (struct blk_filter_ctx *)filter->blk_filter_ctx;
+
+	result = _blk_ctx_unlink(ctx);
+	if (result == 0) {
+		/*
+		 * Drop the stale pointer before freeing. Otherwise a second
+		 * unregister of the same filter could match a recycled
+		 * allocation that now belongs to a different filter.
+		 */
+		filter->blk_filter_ctx = NULL;
+		kfree(ctx);
+	}
+
+	return result;
+}
+EXPORT_SYMBOL(blk_filter_unregister);
+
+/**
+ * blk_filter_check_altitude() - Checking that altitude is free.
+ * @altitude: The altitude to check.
+ *
+ * Takes the filter table read lock, so it may sleep.
+ *
+ * Return: NULL if no filter is registered at @altitude (this includes
+ * altitudes outside [BLK_FILTER_ALTITUDE_MIN, BLK_FILTER_ALTITUDE_MAX],
+ * which can never hold a filter), or the name of the filter registered at
+ * this altitude.
+ */
+const char *blk_filter_check_altitude(size_t altitude)
+{
+	const char *name = NULL;
+	struct blk_filter_ctx *ctx;
+
+	/* An out-of-range altitude would index outside the filter table. */
+	if (altitude < BLK_FILTER_ALTITUDE_MIN ||
+	    altitude > BLK_FILTER_ALTITUDE_MAX)
+		return NULL;
+
+	/* Keep the context alive while it is dereferenced. */
+	down_read(&blk_filter_ctx_list_lock);
+	ctx = _get_ctx(altitude);
+	if (ctx)
+		name = ctx->filter->name;
+	up_read(&blk_filter_ctx_list_lock);
+
+	return name;
+}
+EXPORT_SYMBOL(blk_filter_check_altitude);
+
+/*
+ * disk_enumerate() callback: @_ctx is the struct blk_filter being
+ * attached. Calls its disk_add callback for @disk if it has one.
+ */
+static void _attach_fn(struct gendisk *disk, void *_ctx)
+{
+	struct blk_filter *filter = _ctx;
+	const struct blk_filter_ops *ops = filter->ops;
+
+	if (!ops || !ops->disk_add)
+		return;
+
+	ops->disk_add(disk);
+}
+
+/**
+ * blk_filter_attach_disks() - Call the disk_add callback for every existing disk.
+ * @filter: The filter description structure.
+ *
+ * Return: The result of disk_enumerate(): zero if the existing disks were
+ * enumerated successfully or an error code if it failed.
+ */
+int blk_filter_attach_disks(struct blk_filter *filter)
+{
+	int err = disk_enumerate(_attach_fn, filter);
+
+	return err;
+}
+EXPORT_SYMBOL(blk_filter_attach_disks);
+
+/**
+ * blk_filter_submit_bio_next() - Send a bio to the lower filters for processing.
+ * @filter: The filter passing the bio on; the search for the next filter
+ *          starts one altitude below @filter->altitude.
+ * @bio: The bio for block I/O layer.
+ *
+ * Return: Bio submitting result, like for submit_bio function.
+ */
+blk_qc_t blk_filter_submit_bio_next(struct blk_filter *filter, struct bio *bio)
+{
+	const size_t below = filter->altitude - 1;
+
+	return blk_filter_submit_bio_altitude(below, bio);
+}
+EXPORT_SYMBOL(blk_filter_submit_bio_next);
diff --git a/block/genhd.c b/block/genhd.c
index 99c64641c314..c5604415e772 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -25,6 +25,7 @@
#include <linux/badblocks.h>

#include "blk.h"
+#include "blk-filter-internal.h"

static DEFINE_MUTEX(block_class_lock);
static struct kobject *block_depr;
@@ -837,6 +838,7 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk,
*/
WARN_ON_ONCE(!blk_get_queue(disk->queue));

+ blk_filter_disk_add(disk);
disk_add_events(disk);
blk_integrity_add(disk);
}
@@ -900,6 +902,7 @@ void del_gendisk(struct gendisk *disk)

might_sleep();

+ blk_filter_disk_del(disk);
blk_integrity_del(disk);
disk_del_events(disk);

@@ -1562,6 +1565,7 @@ static void disk_release(struct device *dev)

might_sleep();

+ blk_filter_disk_release(disk);
blk_free_devt(dev->devt);
disk_release_events(disk);
kfree(disk->random);
@@ -2339,3 +2343,23 @@ static void disk_release_events(struct gendisk *disk)
WARN_ON_ONCE(disk->ev && disk->ev->block != 1);
kfree(disk->ev);
}
+
+/**
+ * disk_enumerate() - Call a function for every disk in the block class.
+ * @fn: Callback invoked once for each disk-type device found.
+ * @ctx: Opaque pointer handed to @fn unchanged.
+ *
+ * Return: Always zero; the int return type is kept for existing callers.
+ */
+int disk_enumerate(void (*fn)(struct gendisk *disk, void *ctx), void *ctx)
+{
+	struct class_dev_iter iter;
+	struct device *dev;
+
+	/* The iterator lives only for this call, so it can sit on the stack. */
+	class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
+	while ((dev = class_dev_iter_next(&iter)))
+		fn(dev_to_disk(dev), ctx);
+	/* Every class_dev_iter_init() must be paired with an exit call. */
+	class_dev_iter_exit(&iter);
+
+	return 0;
+}
diff --git a/include/linux/blk-filter.h b/include/linux/blk-filter.h
new file mode 100644
index 000000000000..201613168864
--- /dev/null
+++ b/include/linux/blk-filter.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * API declarations for kernel modules utilizing block device filters
+ */
+
+#ifndef BLK_FILTER_H
+#define BLK_FILTER_H
+
+#ifdef CONFIG_BLK_FILTER
+
+/* size_t, plus struct bio and blk_qc_t; keeps this header self-contained. */
+#include <linux/types.h>
+#include <linux/blk_types.h>
+
+struct gendisk;
+
+/* Valid filter altitudes are BLK_FILTER_ALTITUDE_MIN..BLK_FILTER_ALTITUDE_MAX. */
+#define BLK_FILTER_ALTITUDE_MAX 4
+#define BLK_FILTER_ALTITUDE_MIN 1
+
+/**
+ * struct blk_filter_ops - Callbacks implemented by a block layer filter.
+ * @disk_add: Disk-added notification; also called once per existing disk
+ *            by blk_filter_attach_disks().
+ * @disk_del: Disk-deleted notification.
+ * @disk_release: Disk-released notification.
+ * @submit_bio: Receives an intercepted bio. Its return value is returned
+ *              to the submitter.
+ *
+ * Any callback may be NULL, in which case that event is not delivered to
+ * the filter.
+ */
+struct blk_filter_ops {
+	void (*disk_add)(struct gendisk *disk);
+	void (*disk_del)(struct gendisk *disk);
+	void (*disk_release)(struct gendisk *disk);
+	blk_qc_t (*submit_bio)(struct bio *bio);
+};
+
+/**
+ * struct blk_filter - Description of a block layer filter.
+ * @name: Filter name; printed in log messages and returned by
+ *        blk_filter_check_altitude().
+ * @ops: Callbacks of the filter.
+ * @altitude: Position of the filter in the stack. Bios are offered to the
+ *            highest altitude first.
+ * @blk_filter_ctx: Set by blk_filter_register() and consumed by
+ *                  blk_filter_unregister(); not for use by the filter.
+ */
+struct blk_filter {
+	const char *name;
+	const struct blk_filter_ops *ops;
+	size_t altitude;
+	void *blk_filter_ctx;
+};
+
+
+/* Register @filter at @filter->altitude; returns 0 or a negative errno. */
+int blk_filter_register(struct blk_filter *filter);
+
+/* Remove a filter registered with blk_filter_register(). */
+int blk_filter_unregister(struct blk_filter *filter);
+
+/* Name of the filter registered at @altitude, or NULL if there is none. */
+const char *blk_filter_check_altitude(size_t altitude);
+
+/* Call @filter's disk_add callback for every disk that already exists. */
+int blk_filter_attach_disks(struct blk_filter *filter);
+
+/* Pass @bio on to the filters below @filter, or to the block layer. */
+blk_qc_t blk_filter_submit_bio_next(struct blk_filter *filter, struct bio *bio);
+
+#endif /* CONFIG_BLK_FILTER */
+
+#endif /* BLK_FILTER_H */
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index 4ab853461dff..5f065f8989b4 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -319,6 +319,8 @@ extern void set_capacity_revalidate_and_notify(struct gendisk *disk,
sector_t size, bool revalidate);
extern unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask);

+/* Call @fn(disk, @ctx) for every disk in the block class. */
+extern int disk_enumerate(void (*fn)(struct gendisk *disk, void *ctx), void *ctx);
+
/* drivers/char/random.c */
extern void add_disk_randomness(struct gendisk *disk) __latent_entropy;
extern void rand_initialize_disk(struct gendisk *disk);
--
2.20.1