Re: [PATCH] null_blk: add support for max open/active zone limit for zoned devices

From: Damien Le Moal
Date: Tue Aug 25 2020 - 19:04:07 EST


On 2020/08/26 7:52, Damien Le Moal wrote:
> On 2020/08/25 21:22, Niklas Cassel wrote:
>> Add support for user space to set a max open zone and a max active zone
>> limit via configfs. By default, the default value is 0 == no limit.
>
> s/value is/values are/
>
>>
>> Call the block layer API functions used for exposing the configured
>> limits to sysfs.
>>
>> Add accounting in null_blk_zoned so that these new limits are respected.
>> Performing an operating that would exceed these limits results is a
>
> s/is a/in a/
>
>> standard I/O error.
>>
>> A max open zone limit exists in the ZBC standard.
>> While null_blk_zoned is used to test the Zoned Block Device model in
>> Linux, when it comes to differences between ZBC and ZNS, null_blk_zoned
>> mostly follows ZBC.
>>
>> Therefore, implement the manage open zone resources function from ZBC,
>> but additionally add support for max active zones.
>> This enables user space not only to test against a device with an open
>> zone limit, but also to test against a device with an active zone limit.
>>
>> Signed-off-by: Niklas Cassel <niklas.cassel@xxxxxxx>
>> ---
>> drivers/block/null_blk.h | 5 +
>> drivers/block/null_blk_main.c | 16 +-
>> drivers/block/null_blk_zoned.c | 316 +++++++++++++++++++++++++++------
>> 3 files changed, 283 insertions(+), 54 deletions(-)
>>
>> diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h
>> index daed4a9c34367..d2e7db43a52a7 100644
>> --- a/drivers/block/null_blk.h
>> +++ b/drivers/block/null_blk.h
>> @@ -42,6 +42,9 @@ struct nullb_device {
>> struct badblocks badblocks;
>>
>> unsigned int nr_zones;
>> + unsigned int nr_zones_imp_open;
>> + unsigned int nr_zones_exp_open;
>> + unsigned int nr_zones_closed;
>> struct blk_zone *zones;
>> sector_t zone_size_sects;
>>
>> @@ -51,6 +54,8 @@ struct nullb_device {
>> unsigned long zone_size; /* zone size in MB if device is zoned */
>> unsigned long zone_capacity; /* zone capacity in MB if device is zoned */
>> unsigned int zone_nr_conv; /* number of conventional zones */
>> + unsigned int zone_max_open; /* max number of open zones */
>> + unsigned int zone_max_active; /* max number of active zones */
>> unsigned int submit_queues; /* number of submission queues */
>> unsigned int home_node; /* home node for the device */
>> unsigned int queue_mode; /* block interface */
>> diff --git a/drivers/block/null_blk_main.c b/drivers/block/null_blk_main.c
>> index d74443a9c8fa2..53161a418611b 100644
>> --- a/drivers/block/null_blk_main.c
>> +++ b/drivers/block/null_blk_main.c
>> @@ -208,6 +208,14 @@ static unsigned int g_zone_nr_conv;
>> module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444);
>> MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. Default: 0");
>>
>> +static unsigned int g_zone_max_open;
>> +module_param_named(zone_max_open, g_zone_max_open, uint, 0444);
>> +MODULE_PARM_DESC(zone_max_open, "Maximum number of open zones when block device is zoned. Default: 0 (no limit)");
>> +
>> +static unsigned int g_zone_max_active;
>> +module_param_named(zone_max_active, g_zone_max_active, uint, 0444);
>> +MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)");
>> +
>> static struct nullb_device *null_alloc_dev(void);
>> static void null_free_dev(struct nullb_device *dev);
>> static void null_del_dev(struct nullb *nullb);
>> @@ -347,6 +355,8 @@ NULLB_DEVICE_ATTR(zoned, bool, NULL);
>> NULLB_DEVICE_ATTR(zone_size, ulong, NULL);
>> NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL);
>> NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL);
>> +NULLB_DEVICE_ATTR(zone_max_open, uint, NULL);
>> +NULLB_DEVICE_ATTR(zone_max_active, uint, NULL);
>>
>> static ssize_t nullb_device_power_show(struct config_item *item, char *page)
>> {
>> @@ -464,6 +474,8 @@ static struct configfs_attribute *nullb_device_attrs[] = {
>> &nullb_device_attr_zone_size,
>> &nullb_device_attr_zone_capacity,
>> &nullb_device_attr_zone_nr_conv,
>> + &nullb_device_attr_zone_max_open,
>> + &nullb_device_attr_zone_max_active,
>> NULL,
>> };
>>
>> @@ -517,7 +529,7 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item)
>> static ssize_t memb_group_features_show(struct config_item *item, char *page)
>> {
>> return snprintf(page, PAGE_SIZE,
>> - "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv\n");
>> + "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size,zone_capacity,zone_nr_conv,zone_max_open,zone_max_active\n");
>> }
>>
>> CONFIGFS_ATTR_RO(memb_group_, features);
>> @@ -580,6 +592,8 @@ static struct nullb_device *null_alloc_dev(void)
>> dev->zone_size = g_zone_size;
>> dev->zone_capacity = g_zone_capacity;
>> dev->zone_nr_conv = g_zone_nr_conv;
>> + dev->zone_max_open = g_zone_max_open;
>> + dev->zone_max_active = g_zone_max_active;
>> return dev;
>> }
>>
>> diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c
>> index 3d25c9ad23831..5fb38c9bdd4ae 100644
>> --- a/drivers/block/null_blk_zoned.c
>> +++ b/drivers/block/null_blk_zoned.c
>> @@ -51,6 +51,24 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q)
>> dev->zone_nr_conv);
>> }
>>
>> + /* Max active zones has to be <= number of sequential zones */
>> + if (dev->zone_max_active > dev->nr_zones - dev->zone_nr_conv) {
>> + dev->zone_max_active = dev->nr_zones - dev->zone_nr_conv;
>> + pr_info("changed the maximum number of active zones to %u\n",
>> + dev->nr_zones);
>> + }
>
> if dev->zone_max_active == dev->nr_zones - dev->zone_nr_conv, you could also
> change dev->zone_max_active to 0, since that is equivalent. Not a blocker
> though, I think.
>
>> +
>> + /* Max open zones has to be <= max active zones */
>> + if (dev->zone_max_active && dev->zone_max_open > dev->zone_max_active) {
>> + dev->zone_max_open = dev->zone_max_active;
>> + pr_info("changed the maximum number of open zones to %u\n",
>> + dev->nr_zones);
>> + } else if (dev->zone_max_open > dev->nr_zones - dev->zone_nr_conv) {
>> + dev->zone_max_open = dev->nr_zones - dev->zone_nr_conv;
>> + pr_info("changed the maximum number of open zones to %u\n",
>> + dev->nr_zones);
>> + }
>
> And same here for zone_max_open.
>
>> +
>> for (i = 0; i < dev->zone_nr_conv; i++) {
>> struct blk_zone *zone = &dev->zones[i];
>>
>> @@ -99,6 +117,8 @@ int null_register_zoned_dev(struct nullb *nullb)
>> }
>>
>> blk_queue_max_zone_append_sectors(q, dev->zone_size_sects);
>> + blk_queue_max_open_zones(q, dev->zone_max_open);
>> + blk_queue_max_active_zones(q, dev->zone_max_active);
>>
>> return 0;
>> }
>> @@ -159,6 +179,99 @@ size_t null_zone_valid_read_len(struct nullb *nullb,
>> return (zone->wp - sector) << SECTOR_SHIFT;
>> }
>>
>> +static blk_status_t null_close_zone(struct nullb_device *dev, struct blk_zone *zone)
>> +{
>> + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
>> + return BLK_STS_IOERR;
>> +
>> + switch (zone->cond) {
>> + case BLK_ZONE_COND_CLOSED:
>> + /* close operation on closed is not an error */
>> + return BLK_STS_OK;
>> + case BLK_ZONE_COND_IMP_OPEN:
>> + dev->nr_zones_imp_open--;
>> + break;
>> + case BLK_ZONE_COND_EXP_OPEN:
>> + dev->nr_zones_exp_open--;
>> + break;
>> + case BLK_ZONE_COND_EMPTY:
>> + case BLK_ZONE_COND_FULL:
>> + default:
>> + return BLK_STS_IOERR;
>> + }
>> +
>> + if (zone->wp == zone->start) {
>> + zone->cond = BLK_ZONE_COND_EMPTY;
>> + } else {
>> + zone->cond = BLK_ZONE_COND_CLOSED;
>> + dev->nr_zones_closed++;
>> + }
>> +
>> + return BLK_STS_OK;
>> +}
>> +
>> +static void null_close_first_imp_zone(struct nullb_device *dev)
>> +{
>> + unsigned int i;
>> +
>> + for (i = 0; i < dev->nr_zones; i++) {
>
> You can start the loop from dev->nr_conv_zones, that will avoid the first if below.
>
>> + if (dev->zones[i].type == BLK_ZONE_TYPE_CONVENTIONAL)
>> + continue;
>> + if (dev->zones[i].cond == BLK_ZONE_COND_IMP_OPEN) {
>> + null_close_zone(dev, &dev->zones[i]);
>> + return;
>> + }
>> + }
>> +}
>> +
>> +static bool null_room_in_active(struct nullb_device *dev)
>
> The name is a little strange. What about null_can_set_active() ?
>
>> +{
>> + if (!dev->zone_max_active)
>> + return true;
>> +
>> + return dev->nr_zones_exp_open + dev->nr_zones_imp_open +
>> + dev->nr_zones_closed < dev->zone_max_active;
>> +}
>> +
>> +static bool null_room_in_open(struct nullb_device *dev)
>
> Same here: null_can_open() ?
>
>> +{
>> + if (!dev->zone_max_open)
>> + return true;
>> +
>> + return dev->nr_zones_exp_open + dev->nr_zones_imp_open < dev->zone_max_open;
>> +}
>> +
>> +/*
>> + * This function matches the manage open zone resources function in the ZBC standard,
>> + * with the addition of max active zones support (added in the ZNS standard).
>> + *
>> + * ZBC states that an implicitly open zone shall be closed only if there is not
>> + * room within the open limit. However, if an active limit is set, it is not certain
>> + * that an implicitly opened zone can be closed without exceeding the active limit.
>
> Hu... imp open and close both being an active state, closing an imp open zone
> will never result in exceeding the max active limit. Did you mean something like
> "closing an imp open zone to open another one" ? or something else ? This needs
> clarification as the function name does not really tell us what this is trying
> to do. It looks like this is checking if a zone can be open... So why not merge
> that into null_room_in_open(), renamed null_can_open() ?
>
>> + */
>> +static bool null_manage_zone_resources(struct nullb_device *dev, struct blk_zone *zone)
>> +{
>> + switch (zone->cond) {
>> + case BLK_ZONE_COND_EMPTY:
>> + if (!null_room_in_active(dev))
>> + return false;
>> + fallthrough;
>> + case BLK_ZONE_COND_CLOSED:
>> + if (null_room_in_open(dev)) {
>> + return true;
>> + } else if (dev->nr_zones_imp_open && null_room_in_active(dev)) {
>> + null_close_first_imp_zone(dev);
>> + return true;
>> + } else {
>
> else is not needed here.
>
>> + return false;
>> + }
>> + default:
>> + /* Should never be called for other states */
>> + WARN_ON(1);
>> + return false;
>> + }
>> +}
>> +
>> static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
>> unsigned int nr_sectors, bool append)
>> {
>> @@ -177,43 +290,155 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
>> /* Cannot write to a full zone */
>> return BLK_STS_IOERR;
>> case BLK_ZONE_COND_EMPTY:
>> + case BLK_ZONE_COND_CLOSED:
>> + if (!null_manage_zone_resources(dev, zone))
>> + return BLK_STS_IOERR;
>> + break;
>> case BLK_ZONE_COND_IMP_OPEN:
>> case BLK_ZONE_COND_EXP_OPEN:
>> + break;
>> + default:
>> + /* Invalid zone condition */
>> + return BLK_STS_IOERR;
>> + }
>> +
>> + /*
>> + * Regular writes must be at the write pointer position.
>> + * Zone append writes are automatically issued at the write
>> + * pointer and the position returned using the request or BIO
>> + * sector.
>> + */
>> + if (append) {
>> + sector = zone->wp;
>> + if (cmd->bio)
>> + cmd->bio->bi_iter.bi_sector = sector;
>> + else
>> + cmd->rq->__sector = sector;
>> + } else if (sector != zone->wp) {
>> + return BLK_STS_IOERR;
>> + }
>> +
>> + if (zone->wp + nr_sectors > zone->start + zone->capacity)
>> + return BLK_STS_IOERR;
>> +
>> + if (zone->cond == BLK_ZONE_COND_CLOSED) {
>> + dev->nr_zones_closed--;
>> + dev->nr_zones_imp_open++;
>> + } else if (zone->cond == BLK_ZONE_COND_EMPTY) {
>> + dev->nr_zones_imp_open++;
>> + }
>> + if (zone->cond != BLK_ZONE_COND_EXP_OPEN)
>> + zone->cond = BLK_ZONE_COND_IMP_OPEN;
>> +
>> + ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors);
>> + if (ret != BLK_STS_OK)
>> + return ret;
>> +
>> + zone->wp += nr_sectors;
>> + if (zone->wp == zone->start + zone->capacity) {
>> + if (zone->cond == BLK_ZONE_COND_EXP_OPEN)
>> + dev->nr_zones_exp_open--;
>> + else if (zone->cond == BLK_ZONE_COND_IMP_OPEN)
>> + dev->nr_zones_imp_open--;
>> + zone->cond = BLK_ZONE_COND_FULL;
>> + }
>> + return BLK_STS_OK;
>> +}
>> +
>> +static blk_status_t null_open_zone(struct nullb_device *dev, struct blk_zone *zone)
>> +{
>> + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
>> + return BLK_STS_IOERR;
>> +
>> + switch (zone->cond) {
>> + case BLK_ZONE_COND_EXP_OPEN:
>> + /* open operation on exp open is not an error */
>> + return BLK_STS_OK;
>> + case BLK_ZONE_COND_EMPTY:
>> + if (!null_manage_zone_resources(dev, zone))
>> + return BLK_STS_IOERR;
>> + break;
>> + case BLK_ZONE_COND_IMP_OPEN:
>> + dev->nr_zones_imp_open--;
>> + break;
>> case BLK_ZONE_COND_CLOSED:
>> - /*
>> - * Regular writes must be at the write pointer position.
>> - * Zone append writes are automatically issued at the write
>> - * pointer and the position returned using the request or BIO
>> - * sector.
>> - */
>> - if (append) {
>> - sector = zone->wp;
>> - if (cmd->bio)
>> - cmd->bio->bi_iter.bi_sector = sector;
>> - else
>> - cmd->rq->__sector = sector;
>> - } else if (sector != zone->wp) {
>> + if (!null_manage_zone_resources(dev, zone))
>> return BLK_STS_IOERR;
>> - }
>> + dev->nr_zones_closed--;
>> + break;
>> + case BLK_ZONE_COND_FULL:
>> + default:
>> + return BLK_STS_IOERR;
>> + }
>> +
>> + zone->cond = BLK_ZONE_COND_EXP_OPEN;
>> + dev->nr_zones_exp_open++;
>> +
>> + return BLK_STS_OK;
>> +}
>> +
>> +static blk_status_t null_finish_zone(struct nullb_device *dev, struct blk_zone *zone)
>> +{
>> + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
>> + return BLK_STS_IOERR;
>>
>> - if (zone->wp + nr_sectors > zone->start + zone->capacity)
>> + switch (zone->cond) {
>> + case BLK_ZONE_COND_FULL:
>> + /* finish operation on full is not an error */
>> + return BLK_STS_OK;
>> + case BLK_ZONE_COND_EMPTY:
>> + if (!null_manage_zone_resources(dev, zone))
>> + return BLK_STS_IOERR;
>> + break;
>> + case BLK_ZONE_COND_IMP_OPEN:
>> + dev->nr_zones_imp_open--;
>> + break;
>> + case BLK_ZONE_COND_EXP_OPEN:
>> + dev->nr_zones_exp_open--;
>> + break;
>> + case BLK_ZONE_COND_CLOSED:
>> + if (!null_manage_zone_resources(dev, zone))
>> return BLK_STS_IOERR;
>> + dev->nr_zones_closed--;
>> + break;
>> + default:
>> + return BLK_STS_IOERR;
>> + }
>>
>> - if (zone->cond != BLK_ZONE_COND_EXP_OPEN)
>> - zone->cond = BLK_ZONE_COND_IMP_OPEN;
>> + zone->cond = BLK_ZONE_COND_FULL;
>> + zone->wp = zone->len;
>
> zone->wp = zone=>start + zone->len;
>
>>
>> - ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors);
>> - if (ret != BLK_STS_OK)
>> - return ret;
>> + return BLK_STS_OK;
>> +}
>> +
>> +static blk_status_t null_reset_zone(struct nullb_device *dev, struct blk_zone *zone)
>> +{
>> + if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
>> + return BLK_STS_IOERR;
>>
>> - zone->wp += nr_sectors;
>> - if (zone->wp == zone->start + zone->capacity)
>> - zone->cond = BLK_ZONE_COND_FULL;
>> + switch (zone->cond) {
>> + case BLK_ZONE_COND_EMPTY:
>> + /* reset operation on empty is not an error */
>> return BLK_STS_OK;
>> + case BLK_ZONE_COND_IMP_OPEN:
>> + dev->nr_zones_imp_open--;
>> + break;
>> + case BLK_ZONE_COND_EXP_OPEN:
>> + dev->nr_zones_exp_open--;
>> + break;
>> + case BLK_ZONE_COND_CLOSED:
>> + dev->nr_zones_closed--;
>> + break;
>> + case BLK_ZONE_COND_FULL:
>> + break;
>> default:
>> - /* Invalid zone condition */
>> return BLK_STS_IOERR;
>> }
>> +
>> + zone->cond = BLK_ZONE_COND_EMPTY;
>> + zone->wp = zone->start;
>> +
>> + return BLK_STS_OK;
>> }
>>
>> static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op,
>> @@ -222,49 +447,34 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op,
>> struct nullb_device *dev = cmd->nq->dev;
>> unsigned int zone_no = null_zone_no(dev, sector);
>> struct blk_zone *zone = &dev->zones[zone_no];
>> + blk_status_t ret;
>> size_t i;
>>
>> switch (op) {
>> case REQ_OP_ZONE_RESET_ALL:
>> for (i = 0; i < dev->nr_zones; i++) {
>> - if (zone[i].type == BLK_ZONE_TYPE_CONVENTIONAL)
>> - continue;
>> - zone[i].cond = BLK_ZONE_COND_EMPTY;
>> - zone[i].wp = zone[i].start;
>> + null_reset_zone(dev, &dev->zones[i]);
>> }
>
> You can drop the curly brackets too, and start the loop from nr_conv_zones too.
>
>> break;
>> case REQ_OP_ZONE_RESET:
>> - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
>> - return BLK_STS_IOERR;
>> -
>> - zone->cond = BLK_ZONE_COND_EMPTY;
>> - zone->wp = zone->start;
>> + ret = null_reset_zone(dev, zone);
>> + if (ret != BLK_STS_OK)
>> + return ret;
>
> You can return directly here:
>
> return null_reset_zone(dev, zone);

Arg. No, you can't. There is the trace call after the switch. So please ignore
this comment :)

But you can still avoid repeating the "if (ret != BLK_STS_OK) return ret;" by
calling the trace only for BLK_STS_OK and returning ret at the end.

>
>> break;
>> case REQ_OP_ZONE_OPEN:
>> - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
>> - return BLK_STS_IOERR;
>> - if (zone->cond == BLK_ZONE_COND_FULL)
>> - return BLK_STS_IOERR;
>> -
>> - zone->cond = BLK_ZONE_COND_EXP_OPEN;
>> + ret = null_open_zone(dev, zone);
>> + if (ret != BLK_STS_OK)
>> + return ret;
>
> same here.
>
>> break;
>> case REQ_OP_ZONE_CLOSE:
>> - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
>> - return BLK_STS_IOERR;
>> - if (zone->cond == BLK_ZONE_COND_FULL)
>> - return BLK_STS_IOERR;
>> -
>> - if (zone->wp == zone->start)
>> - zone->cond = BLK_ZONE_COND_EMPTY;
>> - else
>> - zone->cond = BLK_ZONE_COND_CLOSED;
>> + ret = null_close_zone(dev, zone);
>> + if (ret != BLK_STS_OK)
>> + return ret;
>
> And here.
>
>> break;
>> case REQ_OP_ZONE_FINISH:
>> - if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
>> - return BLK_STS_IOERR;
>> -
>> - zone->cond = BLK_ZONE_COND_FULL;
>> - zone->wp = zone->start + zone->len;
>> + ret = null_finish_zone(dev, zone);
>> + if (ret != BLK_STS_OK)
>> + return ret;
>
> And here too.
>
>> break;
>> default:
>> return BLK_STS_NOTSUPP;
>>
>
>


--
Damien Le Moal
Western Digital Research