Re: [RFC PATCH 1/1] virtio-blk: process block layer timedout request

From: Chaitanya Kulkarni
Date: Mon Jan 08 2024 - 22:33:46 EST


On 11/30/23 17:25, Stefan Hajnoczi wrote:
> On Wed, Nov 29, 2023 at 11:01:33PM -0800, Chaitanya Kulkarni wrote:
>> Improve block layer request handling by implementing a timeout handler.
>> Current implementation assums that request will never timeout and will
>> be completed by underlaying transport. However, this assumption can
>> cause issues under heavy load especially when dealing with different
>> subsystems and real hardware.
>>
>> To solve this, add a block layer request timeout handler that will
>> complete timed-out requests in the same context if the virtio device
>> has a VIRTIO_CONFIG_S_DRIVER_OK status. If the device has any other
>> status, we'll stop the block layer request queue and proceed with the
>> teardown sequence, allowing applications waiting for I/O to exit
>> gracefully with appropriate error.
>>
>> Also, add two new module parameters that allows user to specify the
>> I/O timeout for the tagset when allocating the disk and a teardown limit
>> for the timed out requeets before we initiate device teardown from the
>> timeout handler. These changes will improve the stability and
>> reliability of our system under request timeout scenario.
>>
>> Signed-off-by: Chaitanya Kulkarni <kch@xxxxxxxxxx>
>> ---
>> drivers/block/virtio_blk.c | 122 ++++++++++++++++++++++++++++++++
>> include/uapi/linux/virtio_blk.h | 1 +
>> 2 files changed, 123 insertions(+)
> Hi,
> This looks interesting. There was a discussion about implementing
> ->timeout() in September:
> https://lore.kernel.org/io-uring/20230926145526.GA387951@fedora/

Thanks for pointing that out ...

>> diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
>> index 4689ac2e0c0e..da26c2bf933b 100644
>> --- a/drivers/block/virtio_blk.c
>> +++ b/drivers/block/virtio_blk.c
>> @@ -16,6 +16,7 @@
>> #include <linux/blk-mq-virtio.h>
>> #include <linux/numa.h>
>> #include <linux/vmalloc.h>
>> +#include <linux/xarray.h>
>> #include <uapi/linux/virtio_ring.h>
>>
>> #define PART_BITS 4
>> @@ -31,6 +32,15 @@
>> #define VIRTIO_BLK_INLINE_SG_CNT 2
>> #endif
>>
>> +static unsigned int io_timeout = 20;
>> +module_param(io_timeout, uint, 0644);
>> +MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O requests. Default:20");
>> +
>> +static unsigned int timeout_teardown_limit = 2;
>> +module_param(timeout_teardown_limit, uint, 0644);
>> +MODULE_PARM_DESC(timeout_teardown_limit,
>> + "request timeout teardown limit for stable dev. Default:2");
>> +
>> static unsigned int num_request_queues;
>> module_param(num_request_queues, uint, 0644);
>> MODULE_PARM_DESC(num_request_queues,
>> @@ -84,6 +94,20 @@ struct virtio_blk {
>>
>> /* For zoned device */
>> unsigned int zone_sectors;
>> +
>> + /*
>> + * Block layer Request timeout teardown limit when device is in the
>> + * stable state, i.e. it has VIRTIO_CONFIG_S_DRIVER_OK value for its
>> + * config status. Once this limit is reached issue
>> + * virtblk_teardown_work to teardown the device in the block lyaer
>> + * request timeout callback.
>> + */
>> + atomic_t rq_timeout_count;
>> + /* avoid tear down race between remove and teardown work */
>> + struct mutex teardown_mutex;
>> + /* tear down work to be scheduled from block layer request handler */
>> + struct work_struct teardown_work;
>> +
>> };
>>
>> struct virtblk_req {
>> @@ -117,6 +141,8 @@ static inline blk_status_t virtblk_result(u8 status)
>> case VIRTIO_BLK_S_OK:
>> return BLK_STS_OK;
>> case VIRTIO_BLK_S_UNSUPP:
>> + case VIRTIO_BLK_S_TIMEOUT:
>> + return BLK_STS_TIMEOUT;
>> return BLK_STS_NOTSUPP;
>> case VIRTIO_BLK_S_ZONE_OPEN_RESOURCE:
>> return BLK_STS_ZONE_OPEN_RESOURCE;
>> @@ -926,6 +952,7 @@ static void virtblk_free_disk(struct gendisk *disk)
>> struct virtio_blk *vblk = disk->private_data;
>>
>> ida_free(&vd_index_ida, vblk->index);
>> + mutex_destroy(&vblk->teardown_mutex);
>> mutex_destroy(&vblk->vdev_mutex);
>> kfree(vblk);
>> }
>> @@ -1287,6 +1314,86 @@ static int virtblk_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
>> return found;
>> }
>>
>> +static bool virtblk_cancel_request(struct request *rq, void *data)
>> +{
>> + struct virtblk_req *vbr = blk_mq_rq_to_pdu(rq);
>> +
>> + vbr->in_hdr.status = VIRTIO_BLK_S_TIMEOUT;
>> + if (blk_mq_request_started(rq) && !blk_mq_request_completed(rq))
>> + blk_mq_complete_request(rq);
>> +
>> + return true;
>> +}
>> +
>> +static void virtblk_teardown_work(struct work_struct *w)
>> +{
>> + struct virtio_blk *vblk =
>> + container_of(w, struct virtio_blk, teardown_work);
>> + struct request_queue *q = vblk->disk->queue;
>> + struct virtio_device *vdev = vblk->vdev;
>> + struct blk_mq_hw_ctx *hctx;
>> + unsigned long idx;
>> +
>> + mutex_lock(&vblk->teardown_mutex);
>> + if (!vblk->vdev)
>> + goto unlock;
>> +
>> + blk_mq_quiesce_queue(q);
>> +
>> + /* Process any outstanding request from device. */
>> + xa_for_each(&q->hctx_table, idx, hctx)
>> + virtblk_poll(hctx, NULL);
>> +
>> + blk_sync_queue(q);
>> + blk_mq_tagset_busy_iter(&vblk->tag_set, virtblk_cancel_request, vblk);
>> + blk_mq_tagset_wait_completed_request(&vblk->tag_set);
>> +
>> + /*
>> + * Unblock any pending dispatch I/Os before we destroy device. From
>> + * del_gendisk() -> __blk_mark_disk_dead(disk) will set GD_DEAD flag,
>> + * that will make sure any new I/O from bio_queue_enter() to fail.
>> + */
>> + blk_mq_unquiesce_queue(q);
>> + del_gendisk(vblk->disk);
>> + blk_mq_free_tag_set(&vblk->tag_set);
>> +
>> + mutex_lock(&vblk->vdev_mutex);
>> + flush_work(&vblk->config_work);
>> +
>> + virtio_reset_device(vdev);
>> +
>> + vblk->vdev = NULL;
>> +
>> + vdev->config->del_vqs(vdev);
>> + kfree(vblk->vqs);
>> +
>> + mutex_unlock(&vblk->vdev_mutex);
>> +
>> + put_disk(vblk->disk);
>> +
>> +unlock:
>> + mutex_unlock(&vblk->teardown_mutex);
>> +}
>> +
>> +static enum blk_eh_timer_return virtblk_timeout(struct request *req)
>> +{
>> + struct virtio_blk *vblk = req->mq_hctx->queue->queuedata;
>> + struct virtio_device *vdev = vblk->vdev;
>> + bool ok = vdev->config->get_status(vdev) & VIRTIO_CONFIG_S_DRIVER_OK;
> Please check VIRTIO_CONFIG_S_NEEDS_RESET first. When
> VIRTIO_CONFIG_S_NEEDS_RESET is set the device is not ok, regardless of
> whether DRIVER_OK is set. See 2.1.1 Driver Requirements: Device Status
> Field in the VIRTIO specification.

okay will add that to next version ...

>
>> +
>> + if ((atomic_dec_return(&vblk->rq_timeout_count) != 0) && ok) {
>> + virtblk_cancel_request(req, NULL);
> The driver cannot abandon the request here because the device still owns
> the request:
>
> 1. I/O buffer memory is corrupted for READ requests and disk contents
> are corrupted for WRITE requests when the device does finally process
> the request.
>
> 2. After virtblk_timeout() -> virtblk_cancel_request() ->
> blk_mq_complete_request(), a freed address is returned from
> virtblk_done() -> virtqueue_get_buf() -> blk_mq_rq_from_pdu() when
> the device finally completes the request.
>
> I suggest the following:
>
> (Optional) Add an ABORT/CANCEL request type to the VIRTIO specification
> and use it to safely cancel requests when the device supports it.

I really want to keep this simple without introducing a new command
so we don't have to modify the underlying transport to handle that.

> Otherwise reset the device so that all pending requests are canceled.

will issue reset here and see if that works ...

>> + return BLK_EH_DONE;
>> + }
>> +
>> + dev_err(&vdev->dev, "%s:%s initiating teardown\n", __func__,
>> + vblk->disk->disk_name);
>> +
>> + queue_work(virtblk_wq, &vblk->teardown_work);
>> +
>> + return BLK_EH_RESET_TIMER;
>> +}
>> +
>> static const struct blk_mq_ops virtio_mq_ops = {
>> .queue_rq = virtio_queue_rq,
>> .queue_rqs = virtio_queue_rqs,
>> @@ -1294,6 +1401,7 @@ static const struct blk_mq_ops virtio_mq_ops = {
>> .complete = virtblk_request_done,
>> .map_queues = virtblk_map_queues,
>> .poll = virtblk_poll,
>> + .timeout = virtblk_timeout,
>> };
>>
>> static unsigned int virtblk_queue_depth;
>> @@ -1365,6 +1473,7 @@ static int virtblk_probe(struct virtio_device *vdev)
>> memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));
>> vblk->tag_set.ops = &virtio_mq_ops;
>> vblk->tag_set.queue_depth = queue_depth;
>> + vblk->tag_set.timeout = io_timeout * HZ;
>> vblk->tag_set.numa_node = NUMA_NO_NODE;
>> vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
>> vblk->tag_set.cmd_size =
>> @@ -1387,6 +1496,10 @@ static int virtblk_probe(struct virtio_device *vdev)
>> }
>> q = vblk->disk->queue;
>>
>> + mutex_init(&vblk->teardown_mutex);
>> + INIT_WORK(&vblk->teardown_work, virtblk_teardown_work);
>> + atomic_set(&vblk->rq_timeout_count, timeout_teardown_limit);
>> +
>> virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
>>
>> vblk->disk->major = major;
>> @@ -1598,6 +1711,12 @@ static void virtblk_remove(struct virtio_device *vdev)
>> {
>> struct virtio_blk *vblk = vdev->priv;
>>
>> + mutex_lock(&vblk->teardown_mutex);
>> +
>> + /* we did the cleanup in the timeout handler */
>> + if (!vblk->vdev)
>> + goto unlock;
>> +
>> /* Make sure no work handler is accessing the device. */
>> flush_work(&vblk->config_work);
>>
>> @@ -1618,6 +1737,9 @@ static void virtblk_remove(struct virtio_device *vdev)
>> mutex_unlock(&vblk->vdev_mutex);
>>
>> put_disk(vblk->disk);
>> +
>> +unlock:
>> + mutex_unlock(&vblk->teardown_mutex);
>> }
>>
>> #ifdef CONFIG_PM_SLEEP
>> diff --git a/include/uapi/linux/virtio_blk.h b/include/uapi/linux/virtio_blk.h
>> index 3744e4da1b2a..ed864195ab26 100644
>> --- a/include/uapi/linux/virtio_blk.h
>> +++ b/include/uapi/linux/virtio_blk.h
>> @@ -317,6 +317,7 @@ struct virtio_scsi_inhdr {
>> #define VIRTIO_BLK_S_OK 0
>> #define VIRTIO_BLK_S_IOERR 1
>> #define VIRTIO_BLK_S_UNSUPP 2
>> +#define VIRTIO_BLK_S_TIMEOUT 3
> The structs and constants in this header file come from the VIRTIO
> specification. Anything changed in this file must first be accepted into
> the VIRTIO spec because this is the hardware interface definition.
>
> VIRTIO_BLK_S_TIMEOUT seems to be synthetic value that is purely used by
> software, not the device. Maybe there is no need to update the spec.
> Just avoid using in_hdr.status to signal timeouts and use a separate
> flag/field instead in a block layer or virtio_blk driver request struct.

It is a specific error hence I've added that on the similar lines,
do you have a specific field in mind that you would prefer ?

Thanks for the reply, just got back from time off, looking forward to
send V2 as soon as I get the reply.

-ck