[PATCH v8 11/11] dm: add power-of-2 zoned target for non-power-of-2 zoned devices

From: Pankaj Raghav
Date: Wed Jul 27 2022 - 12:26:08 EST


Only power-of-2 (po2) zoned devices were supported in Linux, but support for
non-power-of-2 (npo2) zoned devices has now been added to the block layer.

Filesystems such as F2FS and btrfs support zoned devices under the
assumption of a po2 zone size. Before adding native npo2 support to these
filesystems, it was suggested to create a dm target that makes an npo2
zoned device appear as a po2 zoned device, so that filesystems can
initially work on such devices without any explicit changes.

The design of this target is simple: introduce a gap between the
underlying device's zone size and the nearest po2 zone size. The device's
actual zone size becomes the target's zone capacity, and the device's
nearest po2 zone size becomes the target's zone size.

For example, a device with a zone size/capacity of 3M will have the
following equivalent target layout:

Device layout :-
zone capacity = 3M
zone size = 3M

|--------------|--------------|
0              3M             6M

Target layout :-
zone capacity=3M
zone size = 4M

|--------------|---|--------------|---|
0              3M  4M             7M  8M
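
To make the remapping concrete, here is a small illustrative user-space
sketch (not part of the patch; the names and the fixed 3M/4M geometry are
made up for this example) of the sector arithmetic implied by the layout
above: a target sector is shifted down by one gap per preceding po2 zone,
and a device sector is shifted up by one gap per preceding device zone.
It mirrors what target_to_device_sect()/device_to_target_sect() do in the
patch below.

#include <stdio.h>
#include <stdint.h>

#define SECTORS_PER_MIB	2048ULL
#define ZONE_SIZE	(3 * SECTORS_PER_MIB)	/* device zone size == zone capacity */
#define ZONE_SIZE_PO2	(4 * SECTORS_PER_MIB)	/* nearest power-of-2 zone size */
#define ZONE_SIZE_DIFF	(ZONE_SIZE_PO2 - ZONE_SIZE)

static uint64_t target_to_device(uint64_t sect)
{
	/* One gap of ZONE_SIZE_DIFF sectors is subtracted per preceding po2 zone. */
	return sect - (sect / ZONE_SIZE_PO2) * ZONE_SIZE_DIFF;
}

static uint64_t device_to_target(uint64_t sect)
{
	/* One gap of ZONE_SIZE_DIFF sectors is added per preceding device zone. */
	return sect + (sect / ZONE_SIZE) * ZONE_SIZE_DIFF;
}

int main(void)
{
	/* A target sector at 5M lies in po2 zone 1, so it maps to 5M - 1M = 4M. */
	printf("target 5M -> device sector %llu (= 4M)\n",
	       (unsigned long long)target_to_device(5 * SECTORS_PER_MIB));
	/* A device sector at 4M lies in device zone 1, so it maps back to 5M. */
	printf("device 4M -> target sector %llu (= 5M)\n",
	       (unsigned long long)device_to_target(4 * SECTORS_PER_MIB));
	return 0;
}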

All IOs are remapped from the target to the corresponding location on the
actual device.

The area between the target's zone capacity and zone size is emulated in
the target. Read IOs that fall entirely within the emulated gap area
return a zero-filled bio; all other IOs in that area result in an error.
If a read IO spans the emulated area boundary, the bio is split at that
boundary. All other IO operations that span the emulated area boundary
result in an error.
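
As a rough sketch of those rules for reads (illustrative user-space code
under the same hypothetical 3M/4M geometry, not the patch itself; the
in-kernel logic lives in dm_po2z_map_read() further down), a read can be
classified by where it falls relative to the emulated gap:

#include <stdint.h>

#define SECTORS_PER_MIB	2048ULL
#define ZONE_SIZE	(3 * SECTORS_PER_MIB)	/* zone capacity on the target */
#define ZONE_SIZE_PO2	(4 * SECTORS_PER_MIB)	/* zone size on the target */

enum read_action {
	READ_REMAP,		/* entirely below the gap: remap to the device */
	READ_SPLIT_AT_CAP,	/* crosses into the gap: split at zone capacity */
	READ_ZERO_FILL,		/* starts inside the gap: complete with zeroes */
};

static enum read_action classify_read(uint64_t sect, uint64_t nr_sects)
{
	uint64_t off = sect % ZONE_SIZE_PO2;	/* offset within the po2 zone */

	if (off + nr_sects <= ZONE_SIZE)
		return READ_REMAP;
	if (off < ZONE_SIZE)
		return READ_SPLIT_AT_CAP;
	/*
	 * A read starting in the gap is zero-filled; if it also crosses into
	 * the next zone, it is first split at the po2 zone boundary.
	 */
	return READ_ZERO_FILL;
}

int main(void)
{
	/* A 2M read starting at target sector 2M crosses the gap: split at 3M. */
	return classify_read(2 * SECTORS_PER_MIB,
			     2 * SECTORS_PER_MIB) == READ_SPLIT_AT_CAP ? 0 : 1;
}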

The target can be easily created as follows:
dmsetup create <label> --table '0 <size_sects> po2z /dev/nvme<id>'

Signed-off-by: Pankaj Raghav <p.raghav@xxxxxxxxxxx>
Suggested-by: Johannes Thumshirn <johannes.thumshirn@xxxxxxx>
Suggested-by: Damien Le Moal <damien.lemoal@xxxxxxx>
Suggested-by: Hannes Reinecke <hare@xxxxxxx>
---
 drivers/md/Kconfig            |   9 ++
 drivers/md/Makefile           |   2 +
 drivers/md/dm-po2z-target.c   | 261 ++++++++++++++++++++++++++++++++++
 drivers/md/dm-table.c         |  13 +-
 drivers/md/dm-zone.c          |   1 +
 include/linux/device-mapper.h |   9 ++
 6 files changed, 292 insertions(+), 3 deletions(-)
create mode 100644 drivers/md/dm-po2z-target.c

diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 998a5cfdbc4e..d58ccfee765b 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -518,6 +518,15 @@ config DM_FLAKEY
help
A target that intermittently fails I/O for debugging purposes.

+config DM_PO2Z
+ tristate "Power-of-2 target for non power-of-2 zoned devices"
+ depends on BLK_DEV_DM
+ depends on BLK_DEV_ZONED
+ help
+ A target that converts a zoned device with a non-power-of-2 zone size into
+ a device with a power-of-2 zone size. This is done by introducing gaps
+ between the zone capacity and the power-of-2 zone size.
+
config DM_VERITY
tristate "Verity target support"
depends on BLK_DEV_DM
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 270f694850ec..98af4ed98f73 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -26,6 +26,7 @@ dm-era-y += dm-era-target.o
dm-clone-y += dm-clone-target.o dm-clone-metadata.o
dm-verity-y += dm-verity-target.o
dm-zoned-y += dm-zoned-target.o dm-zoned-metadata.o dm-zoned-reclaim.o
+dm-po2z-y += dm-po2z-target.o

md-mod-y += md.o md-bitmap.o
raid456-y += raid5.o raid5-cache.o raid5-ppl.o
@@ -60,6 +61,7 @@ obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
obj-$(CONFIG_DM_DELAY) += dm-delay.o
obj-$(CONFIG_DM_DUST) += dm-dust.o
obj-$(CONFIG_DM_FLAKEY) += dm-flakey.o
+obj-$(CONFIG_DM_PO2Z) += dm-po2z.o
obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
obj-$(CONFIG_DM_MULTIPATH_QL) += dm-queue-length.o
obj-$(CONFIG_DM_MULTIPATH_ST) += dm-service-time.o
diff --git a/drivers/md/dm-po2z-target.c b/drivers/md/dm-po2z-target.c
new file mode 100644
index 000000000000..e9c5b7d00eda
--- /dev/null
+++ b/drivers/md/dm-po2z-target.c
@@ -0,0 +1,261 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2022 Samsung Electronics Co., Ltd.
+ */
+
+#include <linux/device-mapper.h>
+
+#define DM_MSG_PREFIX "po2z"
+
+struct dm_po2z_target {
+ struct dm_dev *dev;
+ sector_t zone_size; /* Actual zone size of the underlying dev */
+ sector_t zone_size_po2; /* zone_size rounded to the nearest po2 value */
+ sector_t zone_size_diff; /* diff between zone_size_po2 and zone_size */
+ u32 nr_zones;
+};
+
+static inline u32 npo2_zone_no(struct dm_po2z_target *dmh, sector_t sect)
+{
+ return div64_u64(sect, dmh->zone_size);
+}
+
+static inline u32 po2_zone_no(struct dm_po2z_target *dmh, sector_t sect)
+{
+ return sect >> ilog2(dmh->zone_size_po2);
+}
+
+static inline sector_t target_to_device_sect(struct dm_po2z_target *dmh,
+ sector_t sect)
+{
+ u32 zone_idx = po2_zone_no(dmh, sect);
+
+ return sect - (zone_idx * dmh->zone_size_diff);
+}
+
+static inline sector_t device_to_target_sect(struct dm_po2z_target *dmh,
+ sector_t sect)
+{
+ u32 zone_idx = npo2_zone_no(dmh, sect);
+
+ return sect + (zone_idx * dmh->zone_size_diff);
+}
+
+/*
+ * This target works on the complete zoned device. Partial mapping is not
+ * supported.
+ * Construct a zoned po2 logical device: <dev-path>
+ */
+static int dm_po2z_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+{
+ struct dm_po2z_target *dmh = NULL;
+ int ret;
+ sector_t zone_size;
+ sector_t dev_capacity;
+
+ if (argc < 1)
+ return -EINVAL;
+
+ dmh = kmalloc(sizeof(*dmh), GFP_KERNEL);
+ if (!dmh)
+ return -ENOMEM;
+
+ ret = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
+ &dmh->dev);
+
+ if (ret) {
+ ti->error = "Device lookup failed";
+ return ret;
+ }
+
+ zone_size = bdev_zone_sectors(dmh->dev->bdev);
+ dev_capacity = get_capacity(dmh->dev->bdev->bd_disk);
+ if (ti->len != dev_capacity || ti->begin) {
+ DMERR("%pg Partial mapping of the target not supported",
+ dmh->dev->bdev);
+ return -EINVAL;
+ }
+
+ if (is_power_of_2(zone_size)) {
+ DMERR("%pg: this target is not useful for power-of-2 zoned devices",
+ dmh->dev->bdev);
+ return -EINVAL;
+ }
+
+ dmh->zone_size = zone_size;
+ dmh->zone_size_po2 = 1 << get_count_order_long(zone_size);
+ dmh->zone_size_diff = dmh->zone_size_po2 - dmh->zone_size;
+ ti->private = dmh;
+ dmh->nr_zones = npo2_zone_no(dmh, ti->len);
+ ti->len = dmh->zone_size_po2 * dmh->nr_zones;
+
+ return 0;
+}
+
+static int dm_po2z_report_zones_cb(struct blk_zone *zone, unsigned int idx,
+ void *data)
+{
+ struct dm_report_zones_args *args = data;
+ struct dm_po2z_target *dmh = args->tgt->private;
+
+ zone->start = device_to_target_sect(dmh, zone->start);
+ zone->wp = device_to_target_sect(dmh, zone->wp);
+ zone->len = dmh->zone_size_po2;
+ args->next_sector = zone->start + zone->len;
+
+ return args->orig_cb(zone, args->zone_idx++, args->orig_data);
+}
+
+static int dm_po2z_report_zones(struct dm_target *ti,
+ struct dm_report_zones_args *args,
+ unsigned int nr_zones)
+{
+ struct dm_po2z_target *dmh = ti->private;
+ sector_t sect = po2_zone_no(dmh, args->next_sector) * dmh->zone_size;
+
+ return blkdev_report_zones(dmh->dev->bdev, sect, nr_zones,
+ dm_po2z_report_zones_cb, args);
+}
+
+static int dm_po2z_end_io(struct dm_target *ti, struct bio *bio,
+ blk_status_t *error)
+{
+ struct dm_po2z_target *dmh = ti->private;
+
+ if (bio->bi_status == BLK_STS_OK && bio_op(bio) == REQ_OP_ZONE_APPEND)
+ bio->bi_iter.bi_sector =
+ device_to_target_sect(dmh, bio->bi_iter.bi_sector);
+
+ return DM_ENDIO_DONE;
+}
+
+static void dm_po2z_io_hints(struct dm_target *ti, struct queue_limits *limits)
+{
+ struct dm_po2z_target *dmh = ti->private;
+
+ limits->chunk_sectors = dmh->zone_size_po2;
+}
+
+static bool bio_across_emulated_zone_area(struct dm_po2z_target *dmh,
+ struct bio *bio)
+{
+ u32 zone_idx = po2_zone_no(dmh, bio->bi_iter.bi_sector);
+ sector_t size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
+
+ return (bio->bi_iter.bi_sector - (zone_idx * dmh->zone_size_po2) +
+ size) > dmh->zone_size;
+}
+
+static int dm_po2z_map_read(struct dm_po2z_target *dmh, struct bio *bio)
+{
+ sector_t start_sect, size, relative_sect_in_zone, split_io_pos;
+ u32 zone_idx;
+
+ /*
+ * Read does not extend into the emulated zone area (area between
+ * zone capacity and zone size)
+ */
+ if (!bio_across_emulated_zone_area(dmh, bio)) {
+ bio->bi_iter.bi_sector =
+ target_to_device_sect(dmh, bio->bi_iter.bi_sector);
+ return DM_MAPIO_REMAPPED;
+ }
+
+ start_sect = bio->bi_iter.bi_sector;
+ size = bio->bi_iter.bi_size >> SECTOR_SHIFT;
+ zone_idx = po2_zone_no(dmh, start_sect);
+ relative_sect_in_zone = start_sect - (zone_idx * dmh->zone_size_po2);
+
+ /*
+ * If the starting sector is in the emulated area then fill
+ * all the bio with zeros. If bio is across emulated zone boundary,
+ * split the bio across boundaries and fill zeros only for the
+ * bio that is between the zone capacity and the zone size.
+ */
+ if (relative_sect_in_zone < dmh->zone_size &&
+ ((relative_sect_in_zone + size) > dmh->zone_size)) {
+ split_io_pos = (zone_idx * dmh->zone_size_po2) + dmh->zone_size;
+ dm_accept_partial_bio(bio, split_io_pos - start_sect);
+ bio->bi_iter.bi_sector = target_to_device_sect(dmh, start_sect);
+
+ return DM_MAPIO_REMAPPED;
+ } else if (relative_sect_in_zone >= dmh->zone_size &&
+ ((relative_sect_in_zone + size) > dmh->zone_size_po2)) {
+ split_io_pos = (zone_idx + 1) * dmh->zone_size_po2;
+ dm_accept_partial_bio(bio, split_io_pos - start_sect);
+ }
+
+ zero_fill_bio(bio);
+ bio_endio(bio);
+ return DM_MAPIO_SUBMITTED;
+}
+
+static int dm_po2z_map(struct dm_target *ti, struct bio *bio)
+{
+ struct dm_po2z_target *dmh = ti->private;
+
+ bio_set_dev(bio, dmh->dev->bdev);
+ if (bio_sectors(bio) || op_is_zone_mgmt(bio_op(bio))) {
+ /*
+ * Read operation on the emulated zone area (between zone capacity
+ * and zone size) will fill the bio with zeroes. Any other operation
+ * in the emulated area should return an error.
+ */
+ if (bio_op(bio) == REQ_OP_READ)
+ return dm_po2z_map_read(dmh, bio);
+
+ if (!bio_across_emulated_zone_area(dmh, bio)) {
+ bio->bi_iter.bi_sector = target_to_device_sect(dmh,
+ bio->bi_iter.bi_sector);
+ return DM_MAPIO_REMAPPED;
+ }
+ return DM_MAPIO_KILL;
+ }
+ return DM_MAPIO_REMAPPED;
+}
+
+static int dm_po2z_iterate_devices(struct dm_target *ti,
+ iterate_devices_callout_fn fn, void *data)
+{
+ struct dm_po2z_target *dmh = ti->private;
+ sector_t len = dmh->nr_zones * dmh->zone_size;
+
+ return fn(ti, dmh->dev, 0, len, data);
+}
+
+static struct target_type dm_po2z_target = {
+ .name = "po2z",
+ .version = { 1, 0, 0 },
+ .features = DM_TARGET_ZONED_HM | DM_TARGET_EMULATED_ZONES,
+ .map = dm_po2z_map,
+ .end_io = dm_po2z_end_io,
+ .report_zones = dm_po2z_report_zones,
+ .iterate_devices = dm_po2z_iterate_devices,
+ .module = THIS_MODULE,
+ .io_hints = dm_po2z_io_hints,
+ .ctr = dm_po2z_ctr,
+};
+
+static int __init dm_po2z_init(void)
+{
+ int r = dm_register_target(&dm_po2z_target);
+
+ if (r < 0)
+ DMERR("register failed %d", r);
+
+ return r;
+}
+
+static void __exit dm_po2z_exit(void)
+{
+ dm_unregister_target(&dm_po2z_target);
+}
+
+/* Module hooks */
+module_init(dm_po2z_init);
+module_exit(dm_po2z_exit);
+
+MODULE_DESCRIPTION(DM_NAME "power-of-2 zoned target");
+MODULE_AUTHOR("Pankaj Raghav <p.raghav@xxxxxxxxxxx>");
+MODULE_LICENSE("GPL");
+
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 534fddfc2b42..d77feff124af 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1614,13 +1614,20 @@ static bool dm_table_supports_zoned_model(struct dm_table *t,
return true;
}

-static int device_not_matches_zone_sectors(struct dm_target *ti, struct dm_dev *dev,
+/*
+ * Callback function to check that zone sectors are consistent across
+ * devices. If the DM_TARGET_EMULATED_ZONES target feature flag is not set,
+ * then the target should have the same zone sectors as the underlying devices.
+ */
+static int check_valid_device_zone_sectors(struct dm_target *ti, struct dm_dev *dev,
sector_t start, sector_t len, void *data)
{
unsigned int *zone_sectors = data;

- if (!bdev_is_zoned(dev->bdev))
+ if (!bdev_is_zoned(dev->bdev) ||
+ dm_target_supports_emulated_zones(ti->type))
return 0;
+
return bdev_zone_sectors(dev->bdev) != *zone_sectors;
}

@@ -1646,7 +1653,7 @@ static int validate_hardware_zoned_model(struct dm_table *t,
if (!zone_sectors)
return -EINVAL;

- if (dm_table_any_dev_attr(t, device_not_matches_zone_sectors, &zone_sectors)) {
+ if (dm_table_any_dev_attr(t, check_valid_device_zone_sectors, &zone_sectors)) {
DMERR("%s: zone sectors is not consistent across all zoned devices",
dm_device_name(t->md));
return -EINVAL;
diff --git a/drivers/md/dm-zone.c b/drivers/md/dm-zone.c
index 31c16aafdbfc..2b6b3883471f 100644
--- a/drivers/md/dm-zone.c
+++ b/drivers/md/dm-zone.c
@@ -302,6 +302,7 @@ int dm_set_zones_restrictions(struct dm_table *t, struct request_queue *q)
if (dm_table_supports_zone_append(t)) {
clear_bit(DMF_EMULATE_ZONE_APPEND, &md->flags);
dm_cleanup_zoned_dev(md);
+
return 0;
}

diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 920085dd7f3b..7dbd28b8de01 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -294,6 +294,15 @@ struct target_type {
#define dm_target_supports_mixed_zoned_model(type) (false)
#endif

+#ifdef CONFIG_BLK_DEV_ZONED
+#define DM_TARGET_EMULATED_ZONES 0x00000400
+#define dm_target_supports_emulated_zones(type) \
+ ((type)->features & DM_TARGET_EMULATED_ZONES)
+#else
+#define DM_TARGET_EMULATED_ZONES 0x00000000
+#define dm_target_supports_emulated_zones(type) (false)
+#endif
+
struct dm_target {
struct dm_table *table;
struct target_type *type;
--
2.25.1