Re: [PATCH v2] null_blk: add support for max open/active zone limit for zoned devices

From: Damien Le Moal
Date: Fri Aug 28 2020 - 03:06:50 EST


On 2020/08/27 22:50, Niklas Cassel wrote:
> Add support for user space to set a max open zone and a max active zone
> limit via configfs. By default, the default values are 0 == no limit.
>
> Call the block layer API functions used for exposing the configured
> limits to sysfs.
>
> Add accounting in null_blk_zoned so that these new limits are respected.
> Performing an operating that would exceed these limits results in a

Performing a write operation that would result in exceeding these...

> standard I/O error.
>
> A max open zone limit exists in the ZBC standard.
> While null_blk_zoned is used to test the Zoned Block Device model in
> Linux, when it comes to differences between ZBC and ZNS, null_blk_zoned
> mostly follows ZBC.
>
> Therefore, implement the manage open zone resources function from ZBC,
> but additionally add support for max active zones.
> This enables user space not only to test against a device with an open
> zone limit, but also to test against a device with an active zone limit.
>
> Signed-off-by: Niklas Cassel <niklas.cassel@xxxxxxx>
> ---
> Changes since v1:
> -Fixed review comments by Damien Le Moal.
>
> drivers/block/null_blk.h | 5 +
> drivers/block/null_blk_main.c | 16 +-
> drivers/block/null_blk_zoned.c | 319 +++++++++++++++++++++++++++------
> 3 files changed, 282 insertions(+), 58 deletions(-)
>
> diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h
> index daed4a9c34367..d2e7db43a52a7 100644
> --- a/drivers/block/null_blk.h
> +++ b/drivers/block/null_blk.h
> @@ -42,6 +42,9 @@ struct nullb_device {
> struct badblocks badblocks;
>
> unsigned int nr_zones;
> + unsigned int nr_zones_imp_open;
> + unsigned int nr_zones_exp_open;
> + unsigned int nr_zones_closed;
> struct blk_zone *zones;
> sector_t zone_size_sects;
>
> @@ -51,6 +54,8 @@ struct nullb_device {
> unsigned long zone_size; /* zone size in MB if device is zoned */
> unsigned long zone_capacity; /* zone capacity in MB if device is zoned */
> unsigned int zone_nr_conv; /* number of conventional zones */
> + unsigned int zone_max_open; /* max number of open zones */
> + unsigned int zone_max_active; /* max number of active zones */
> unsigned int submit_queues; /* number of submission queues */
> unsigned int home_node; /* home node for the device */
> unsigned int queue_mode; /* block interface */
> diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c
> index d74443a9c8fa2..53161a418611b 100644
> --- a/drivers/block/null_blk_main.c
> +++ b/drivers/block/null_blk_main.c
> @@ -208,6 +208,14 @@ static unsigned int g_zone_nr_conv;
> module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444);
> MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. Default: 0");
>
> +static unsigned int g_zone_max_open;
> +module_param_named(zone_max_open, g_zone_max_open, uint, 0444);
> +MODULE_PARM_DESC(zone_max_open, "Maximum number of open zones when block device is zoned. Default: 0 (no limit)");
> +
> +static unsigned int g_zone_max_active;
> +module_param_named(zone_max_active, g_zone_max_active, uint, 0444);
> +MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)");
> +
> static struct nullb_device *null_alloc_dev(void);
> static void null_free_dev(struct nullb_device *dev);
> static void null_del_dev(struct nullb *nullb);
> @@ -347,6 +355,8 @@ NULLB_DEVICE_ATTR(zoned, bool, NULL);
> NULLB_DEVICE_ATTR(zone_size, ulong, NULL);
> NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL);
> NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL);
> +NULLB_DEVICE_ATTR(zone_max_open, uint, NULL);
> +NULLB_DEVICE_ATTR(zone_max_active, uint, NULL);
>
> static ssize_t nullb_device_power_show(struct config_item *item, char *page)
> {
> @@ -464,6 +474,8 @@ static struct configfs_attribute *nullb_device_attrs[] = {
> &nullb_device_attr_zone_size,
> &nullb_device_attr_zone_capacity,
> &nullb_device_attr_zone_nr_conv,
> + &nullb_device_attr_zone_max_open,
> + &nullb_device_attr_zone_max_active,
> NULL,
> };
>
> @@ -517,7 +529,7 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item)
> static ssize_t memb_group_features_show(struct config_item *item, char *page)
> {
> return snprintf(page, PAGE_SIZE,
> - "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv\n");
> + "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv,zone_max_open,zone_max_active\n");
> }
>
> CONFIGFS_ATTR_RO(memb_group_, features);
> @@ -580,6 +592,8 @@ static struct nullb_device *null_alloc_dev(void)
> dev->zone_size = g_zone_size;
> dev->zone_capacity = g_zone_capacity;
> dev->zone_nr_conv = g_zone_nr_conv;
> + dev->zone_max_open = g_zone_max_open;
> + dev->zone_max_active = g_zone_max_active;
> return dev;
> }
>
> diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c
> index 3d25c9ad23831..e7e341e811fbf 100644
> --- a/drivers/block/null_blk_zoned.c
> +++ b/drivers/block/null_blk_zoned.c
> @@ -51,6 +51,22 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q)
> dev->zone_nr_conv);
> }
>
> + /* Max active zones has to be < nbr of seq zones in order to be enforceable */
> + if (dev->zone_max_active >= dev->nr_zones - dev->zone_nr_conv) {
> + dev->zone_max_active = 0;
> + pr_info("zone_max_active limit disabled, limit >= zone count\n");
> + }
> +
> + /* Max open zones has to be <= max active zones */
> + if (dev->zone_max_active && dev->zone_max_open > dev->zone_max_active) {
> + dev->zone_max_open = dev->zone_max_active;
> + pr_info("changed the maximum number of open zones to %u\n",
> + dev->nr_zones);
> + } else if (dev->zone_max_open >= dev->nr_zones - dev->zone_nr_conv) {
> + dev->zone_max_open = 0;
> + pr_info("zone_max_open limit disabled, limit >= zone count\n");
> + }
> +
> for (i = 0; i < dev->zone_nr_conv; i++) {
> struct blk_zone *zone = &dev->zones[i];
>
> @@ -99,6 +115,8 @@ int null_register_zoned_dev(struct nullb *nullb)
> }
>
> blk_queue_max_zone_append_sectors(q, dev->zone_size_sects);
> + blk_queue_max_open_zones(q, dev->zone_max_open);
> + blk_queue_max_active_zones(q, dev->zone_max_active);
>
> return 0;
> }
> @@ -159,6 +177,103 @@ size_t null_zone_valid_read_len(struct nullb *nullb,
> return (zone->wp - sector) << SECTOR_SHIFT;
> }
>
> +static blk_status_t null_close_zone(struct nullb_device *dev, struct blk_zone *zone)
> +{
> + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
> + return BLK_STS_IOERR;
> +
> + switch (zone->cond) {
> + case BLK_ZONE_COND_CLOSED:
> + /* close operation on closed is not an error */
> + return BLK_STS_OK;
> + case BLK_ZONE_COND_IMP_OPEN:
> + dev->nr_zones_imp_open--;
> + break;
> + case BLK_ZONE_COND_EXP_OPEN:
> + dev->nr_zones_exp_open--;
> + break;
> + case BLK_ZONE_COND_EMPTY:
> + case BLK_ZONE_COND_FULL:
> + default:
> + return BLK_STS_IOERR;
> + }
> +
> + if (zone->wp == zone->start) {
> + zone->cond = BLK_ZONE_COND_EMPTY;
> + } else {
> + zone->cond = BLK_ZONE_COND_CLOSED;
> + dev->nr_zones_closed++;
> + }
> +
> + return BLK_STS_OK;
> +}
> +
> +static void null_close_first_imp_zone(struct nullb_device *dev)
> +{
> + unsigned int i;
> +
> + for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) {
> + if (dev->zones[i].cond == BLK_ZONE_COND_IMP_OPEN) {
> + null_close_zone(dev, &dev->zones[i]);
> + return;
> + }
> + }
> +}
> +
> +static bool null_can_set_active(struct nullb_device *dev)
> +{
> + if (!dev->zone_max_active)
> + return true;
> +
> + return dev->nr_zones_exp_open + dev->nr_zones_imp_open +
> + dev->nr_zones_closed < dev->zone_max_active;
> +}
> +
> +static bool null_can_open(struct nullb_device *dev)
> +{
> + if (!dev->zone_max_open)
> + return true;
> +
> + if (dev->nr_zones_exp_open + dev->nr_zones_imp_open < dev->zone_max_open)
> + return true;
> +
> + if (dev->nr_zones_imp_open && null_can_set_active(dev)) {
> + null_close_first_imp_zone(dev);
> + return true;
> + }
> +
> + return false;
> +}
> +
> +/*
> + * This function matches the manage open zone resources function in the ZBC standard,
> + * with the addition of max active zones support (added in the ZNS standard).
> + *
> + * The function determines if a zone can transition to implicit open or explicit open,
> + * while maintaining the max open zone (and max active zone) limit(s). It may close an
> + * implicit open zone in order to make additional zone resources available.
> + *
> + * ZBC states that an implicit open zone shall be closed only if there is not
> + * room within the open limit. However, with the addition of an active limit,
> + * it is not certain that closing an implicit open zone will allow a new zone
> + * to be opened, since we might already be at the active limit capacity.
> + */
> +static bool null_manage_zone_resources(struct nullb_device *dev, struct blk_zone *zone)

I still do not like the name. Since this return a bool, what about
null_has_zone_resources() ?

> +{
> + switch (zone->cond) {
> + case BLK_ZONE_COND_EMPTY:
> + if (!null_can_set_active(dev))
> + return false;
> + fallthrough;
> + case BLK_ZONE_COND_CLOSED:
> + return null_can_open(dev);
> + default:
> + /* Should never be called for other states */
> + WARN_ON(1);
> + return false;
> + }
> +}
> +
> static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
> unsigned int nr_sectors, bool append)
> {
> @@ -177,43 +292,155 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
> /* Cannot write to a full zone */
> return BLK_STS_IOERR;
> case BLK_ZONE_COND_EMPTY:
> + case BLK_ZONE_COND_CLOSED:
> + if (!null_manage_zone_resources(dev, zone))
> + return BLK_STS_IOERR;
> + break;
> case BLK_ZONE_COND_IMP_OPEN:
> case BLK_ZONE_COND_EXP_OPEN:
> + break;
> + default:
> + /* Invalid zone condition */
> + return BLK_STS_IOERR;
> + }
> +
> + /*
> + * Regular writes must be at the write pointer position.
> + * Zone append writes are automatically issued at the write
> + * pointer and the position returned using the request or BIO
> + * sector.
> + */
> + if (append) {
> + sector = zone->wp;
> + if (cmd->bio)
> + cmd->bio->bi_iter.bi_sector = sector;
> + else
> + cmd->rq->__sector = sector;
> + } else if (sector != zone->wp) {
> + return BLK_STS_IOERR;
> + }
> +
> + if (zone->wp + nr_sectors > zone->start + zone->capacity)
> + return BLK_STS_IOERR;
> +
> + if (zone->cond == BLK_ZONE_COND_CLOSED) {
> + dev->nr_zones_closed--;
> + dev->nr_zones_imp_open++;
> + } else if (zone->cond == BLK_ZONE_COND_EMPTY) {
> + dev->nr_zones_imp_open++;
> + }
> + if (zone->cond != BLK_ZONE_COND_EXP_OPEN)
> + zone->cond = BLK_ZONE_COND_IMP_OPEN;
> +
> + ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors);
> + if (ret != BLK_STS_OK)
> + return ret;
> +
> + zone->wp += nr_sectors;
> + if (zone->wp == zone->start + zone->capacity) {
> + if (zone->cond == BLK_ZONE_COND_EXP_OPEN)
> + dev->nr_zones_exp_open--;
> + else if (zone->cond == BLK_ZONE_COND_IMP_OPEN)
> + dev->nr_zones_imp_open--;
> + zone->cond = BLK_ZONE_COND_FULL;
> + }
> + return BLK_STS_OK;
> +}
> +
> +static blk_status_t null_open_zone(struct nullb_device *dev, struct blk_zone *zone)
> +{
> + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
> + return BLK_STS_IOERR;
> +
> + switch (zone->cond) {
> + case BLK_ZONE_COND_EXP_OPEN:
> + /* open operation on exp open is not an error */
> + return BLK_STS_OK;
> + case BLK_ZONE_COND_EMPTY:
> + if (!null_manage_zone_resources(dev, zone))
> + return BLK_STS_IOERR;
> + break;
> + case BLK_ZONE_COND_IMP_OPEN:
> + dev->nr_zones_imp_open--;
> + break;
> case BLK_ZONE_COND_CLOSED:
> - /*
> - * Regular writes must be at the write pointer position.
> - * Zone append writes are automatically issued at the write
> - * pointer and the position returned using the request or BIO
> - * sector.
> - */
> - if (append) {
> - sector = zone->wp;
> - if (cmd->bio)
> - cmd->bio->bi_iter.bi_sector = sector;
> - else
> - cmd->rq->__sector = sector;
> - } else if (sector != zone->wp) {
> + if (!null_manage_zone_resources(dev, zone))
> return BLK_STS_IOERR;
> - }
> + dev->nr_zones_closed--;
> + break;
> + case BLK_ZONE_COND_FULL:
> + default:
> + return BLK_STS_IOERR;
> + }
> +
> + zone->cond = BLK_ZONE_COND_EXP_OPEN;
> + dev->nr_zones_exp_open++;
>
> - if (zone->wp + nr_sectors > zone->start + zone->capacity)
> + return BLK_STS_OK;
> +}
> +
> +static blk_status_t null_finish_zone(struct nullb_device *dev, struct blk_zone *zone)
> +{
> + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
> + return BLK_STS_IOERR;
> +
> + switch (zone->cond) {
> + case BLK_ZONE_COND_FULL:
> + /* finish operation on full is not an error */
> + return BLK_STS_OK;
> + case BLK_ZONE_COND_EMPTY:
> + if (!null_manage_zone_resources(dev, zone))

OK. So you are hitting a fuzzy case here that is not actually well described in
the standards. That is, does finishing an empty zone necessarilly imply a
temporary transition through imp open ? Which you are assuming is a yes here.
Personally, I would say that is not necessary, but no strong feeling either way.

> return BLK_STS_IOERR;
> + break;
> + case BLK_ZONE_COND_IMP_OPEN:
> + dev->nr_zones_imp_open--;
> + break;
> + case BLK_ZONE_COND_EXP_OPEN:
> + dev->nr_zones_exp_open--;
> + break;
> + case BLK_ZONE_COND_CLOSED:
> + if (!null_manage_zone_resources(dev, zone))
> + return BLK_STS_IOERR;
> + dev->nr_zones_closed--;
> + break;
> + default:
> + return BLK_STS_IOERR;
> + }
>
> - if (zone->cond != BLK_ZONE_COND_EXP_OPEN)
> - zone->cond = BLK_ZONE_COND_IMP_OPEN;
> + zone->cond = BLK_ZONE_COND_FULL;
> + zone->wp = zone->start + zone->len;
>
> - ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors);
> - if (ret != BLK_STS_OK)
> - return ret;
> + return BLK_STS_OK;
> +}
> +
> +static blk_status_t null_reset_zone(struct nullb_device *dev, struct blk_zone *zone)
> +{
> + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
> + return BLK_STS_IOERR;
>
> - zone->wp += nr_sectors;
> - if (zone->wp == zone->start + zone->capacity)
> - zone->cond = BLK_ZONE_COND_FULL;
> + switch (zone->cond) {
> + case BLK_ZONE_COND_EMPTY:
> + /* reset operation on empty is not an error */
> return BLK_STS_OK;
> + case BLK_ZONE_COND_IMP_OPEN:
> + dev->nr_zones_imp_open--;
> + break;
> + case BLK_ZONE_COND_EXP_OPEN:
> + dev->nr_zones_exp_open--;
> + break;
> + case BLK_ZONE_COND_CLOSED:
> + dev->nr_zones_closed--;
> + break;
> + case BLK_ZONE_COND_FULL:
> + break;
> default:
> - /* Invalid zone condition */
> return BLK_STS_IOERR;
> }
> +
> + zone->cond = BLK_ZONE_COND_EMPTY;
> + zone->wp = zone->start;
> +
> + return BLK_STS_OK;
> }
>
> static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op,
> @@ -222,56 +449,34 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op,
> struct nullb_device *dev = cmd->nq->dev;
> unsigned int zone_no = null_zone_no(dev, sector);
> struct blk_zone *zone = &dev->zones[zone_no];
> + blk_status_t ret = BLK_STS_OK;
> size_t i;
>
> switch (op) {
> case REQ_OP_ZONE_RESET_ALL:
> - for (i = 0; i < dev->nr_zones; i++) {
> - if (zone[i].type == BLK_ZONE_TYPE_CONVENTIONAL)
> - continue;
> - zone[i].cond = BLK_ZONE_COND_EMPTY;
> - zone[i].wp = zone[i].start;
> - }
> + for (i = dev->zone_nr_conv; i < dev->nr_zones; i++)
> + null_reset_zone(dev, &dev->zones[i]);
> break;
> case REQ_OP_ZONE_RESET:
> - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
> - return BLK_STS_IOERR;
> -
> - zone->cond = BLK_ZONE_COND_EMPTY;
> - zone->wp = zone->start;
> + ret = null_reset_zone(dev, zone);
> break;
> case REQ_OP_ZONE_OPEN:
> - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
> - return BLK_STS_IOERR;
> - if (zone->cond == BLK_ZONE_COND_FULL)
> - return BLK_STS_IOERR;
> -
> - zone->cond = BLK_ZONE_COND_EXP_OPEN;
> + ret = null_open_zone(dev, zone);
> break;
> case REQ_OP_ZONE_CLOSE:
> - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
> - return BLK_STS_IOERR;
> - if (zone->cond == BLK_ZONE_COND_FULL)
> - return BLK_STS_IOERR;
> -
> - if (zone->wp == zone->start)
> - zone->cond = BLK_ZONE_COND_EMPTY;
> - else
> - zone->cond = BLK_ZONE_COND_CLOSED;
> + ret = null_close_zone(dev, zone);
> break;
> case REQ_OP_ZONE_FINISH:
> - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
> - return BLK_STS_IOERR;
> -
> - zone->cond = BLK_ZONE_COND_FULL;
> - zone->wp = zone->start + zone->len;
> + ret = null_finish_zone(dev, zone);
> break;
> default:
> return BLK_STS_NOTSUPP;
> }
>
> - trace_nullb_zone_op(cmd, zone_no, zone->cond);
> - return BLK_STS_OK;
> + if (ret == BLK_STS_OK)
> + trace_nullb_zone_op(cmd, zone_no, zone->cond);
> +
> + return ret;
> }
>
> blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_opf op,
>

Apart from the commit message and function name nits above, this looks good to me.

Reviewed-by: Damien Le Moal <damien.lemoal@xxxxxxx>

--
Damien Le Moal
Western Digital Research