Re: [PATCH v3] xen-blkfront: dynamic configuration of per-vbd resources

From: Roger Pau Monné
Date: Wed Jul 27 2016 - 04:08:10 EST


Hello,

Thanks for the fixes.

On Wed, Jul 27, 2016 at 11:21:25AM +0800, Bob Liu wrote:
> The current VBD layer reserves buffer space for each attached device based on
> three statically configured settings which are read at boot time.
> * max_indirect_segs: Maximum amount of segments.
> * max_ring_page_order: Maximum order of pages to be used for the shared ring.
> * max_queues: Maximum of queues(rings) to be used.
>
> But the storage backend, workload, and guest memory result in very different
> tuning requirements. It's impossible to centrally predict application
> characteristics so it's best to leave allow the settings can be dynamiclly
> adjusted based on workload inside the Guest.
>
> Usage:
> Show current values:
> cat /sys/devices/vbd-xxx/max_indirect_segs
> cat /sys/devices/vbd-xxx/max_ring_page_order
> cat /sys/devices/vbd-xxx/max_queues
>
> Write new values:
> echo <new value> > /sys/devices/vbd-xxx/max_indirect_segs
> echo <new value> > /sys/devices/vbd-xxx/max_ring_page_order
> echo <new value> > /sys/devices/vbd-xxx/max_queues
>
> Signed-off-by: Bob Liu <bob.liu@xxxxxxxxxx>
> --
> v3:
> * Remove new_max_indirect_segments.
> * Fix BUG_ON().
> v2:
> * Rename to max_ring_page_order.
> * Remove the waiting code suggested by Roger.
> ---
> drivers/block/xen-blkfront.c | 277 ++++++++++++++++++++++++++++++++++++++++--
> 1 file changed, 269 insertions(+), 8 deletions(-)
>
> diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
> index 1b4c380..57baa54 100644
> --- a/drivers/block/xen-blkfront.c
> +++ b/drivers/block/xen-blkfront.c
> @@ -212,6 +212,10 @@ struct blkfront_info
> /* Save uncomplete reqs and bios for migration. */
> struct list_head requests;
> struct bio_list bio_list;
> + /* For dynamic configuration. */
> + unsigned int reconfiguring:1;
> + unsigned int max_ring_page_order;
> + unsigned int max_queues;

max_{ring_page_order/queues} should be after max_indirect_segments, and
reconfigurng should be of type bool (moreover when below you assign 'true'
or 'false' to it).

> };
>
> static unsigned int nr_minors;
> @@ -1350,6 +1354,31 @@ static void blkif_free(struct blkfront_info *info, int suspend)
> for (i = 0; i < info->nr_rings; i++)
> blkif_free_ring(&info->rinfo[i]);
>
> + /* Remove old xenstore nodes. */
> + if (info->nr_ring_pages > 1)
> + xenbus_rm(XBT_NIL, info->xbdev->nodename, "ring-page-order");
> +
> + if (info->nr_rings == 1) {
> + if (info->nr_ring_pages == 1) {
> + xenbus_rm(XBT_NIL, info->xbdev->nodename, "ring-ref");
> + } else {
> + for (i = 0; i < info->nr_ring_pages; i++) {
> + char ring_ref_name[RINGREF_NAME_LEN];
> +
> + snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
> + xenbus_rm(XBT_NIL, info->xbdev->nodename, ring_ref_name);
> + }
> + }
> + } else {
> + xenbus_rm(XBT_NIL, info->xbdev->nodename, "multi-queue-num-queues");
> +
> + for (i = 0; i < info->nr_rings; i++) {
> + char queuename[QUEUE_NAME_LEN];
> +
> + snprintf(queuename, QUEUE_NAME_LEN, "queue-%u", i);
> + xenbus_rm(XBT_NIL, info->xbdev->nodename, queuename);
> + }
> + }
> kfree(info->rinfo);
> info->rinfo = NULL;
> info->nr_rings = 0;
> @@ -1763,15 +1792,20 @@ static int talk_to_blkback(struct xenbus_device *dev,
> const char *message = NULL;
> struct xenbus_transaction xbt;
> int err;
> - unsigned int i, max_page_order = 0;
> + unsigned int i, backend_max_order = 0;
> unsigned int ring_page_order = 0;
>
> err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
> - "max-ring-page-order", "%u", &max_page_order);
> + "max-ring-page-order", "%u", &backend_max_order);
> if (err != 1)
> info->nr_ring_pages = 1;
> else {
> - ring_page_order = min(xen_blkif_max_ring_order, max_page_order);
> + if (info->max_ring_page_order)
> + /* Dynamic configured through /sys. */
> + ring_page_order = min(info->max_ring_page_order, backend_max_order);
> + else
> + /* Default. */
> + ring_page_order = min(xen_blkif_max_ring_order, backend_max_order);
> info->nr_ring_pages = 1 << ring_page_order;

In order to avoid this 'if' here (and below), what do you think of assigning
the global values to the blkfront_info structure in blkfront_probe? Then you
would only have to do min(info->max_ring_page_order, backend_max_order) in
order to get the min value.

> }
>
> @@ -1894,7 +1928,13 @@ static int negotiate_mq(struct blkfront_info *info)
> if (err < 0)
> backend_max_queues = 1;
>
> - info->nr_rings = min(backend_max_queues, xen_blkif_max_queues);
> + if (info->max_queues)
> + /* Dynamic configured through /sys */
> + info->nr_rings = min(backend_max_queues, info->max_queues);
> + else
> + /* Default. */
> + info->nr_rings = min(backend_max_queues, xen_blkif_max_queues);
> +
> /* We need at least one ring. */
> if (!info->nr_rings)
> info->nr_rings = 1;
> @@ -2352,11 +2392,198 @@ static void blkfront_gather_backend_features(struct blkfront_info *info)
> NULL);
> if (err)
> info->max_indirect_segments = 0;
> - else
> - info->max_indirect_segments = min(indirect_segments,
> - xen_blkif_max_segments);
> + else {
> + if (info->max_indirect_segments)
> + /* Dynamic configured through /sys */
> + info->max_indirect_segments = min(indirect_segments,
> + info->max_indirect_segments);
> + else
> + info->max_indirect_segments = min(indirect_segments,
> + xen_blkif_max_segments);
> + }
> +}
> +
> +static ssize_t max_ring_page_order_show(struct device *dev,
> + struct device_attribute *attr, char *page)
> +{
> + struct blkfront_info *info = dev_get_drvdata(dev);
> +
> + return sprintf(page, "%u\n", get_order(info->nr_ring_pages * XEN_PAGE_SIZE));
> +}
> +
> +static ssize_t max_indirect_segs_show(struct device *dev,
> + struct device_attribute *attr, char *page)
> +{
> + struct blkfront_info *info = dev_get_drvdata(dev);
> +
> + return sprintf(page, "%u\n", info->max_indirect_segments);
> +}
> +
> +static ssize_t max_queues_show(struct device *dev,
> + struct device_attribute *attr, char *page)
> +{
> + struct blkfront_info *info = dev_get_drvdata(dev);
> +
> + return sprintf(page, "%u\n", info->nr_rings);
> +}
> +
> +static ssize_t dynamic_reconfig_device(struct blkfront_info *info, ssize_t count)
> +{
> + /*
> + * Prevent new requests even to software request queue.
> + */
> + blk_mq_freeze_queue(info->rq);
> +
> + /*
> + * Guarantee no uncompleted reqs.
> + */
> + if (part_in_flight(&info->gd->part0) || info->reconfiguring) {
> + blk_mq_unfreeze_queue(info->rq);
> + pr_err("Dev:%s busy, please retry later.\n", dev_name(&info->xbdev->dev));
> + return -EBUSY;
> + }
> +
> + /*
> + * Frontend: Backend:
> + * Freeze_queue()
> + * Switch to XenbusStateClosed
> + * frontend_changed(StateClosed)
> + * > xen_blkif_disconnect()
> + * > Switch to XenbusStateClosed
> + * blkback_changed(StateClosed)
> + * > blkfront_resume()
> + * > Switch to StateInitialised
> + * frontend_changed(StateInitialised):
> + * > reconnect
> + * > Switch to StateConnected
> + * blkback_changed(StateConnected)
> + * > blkif_recover()
> + * > Also switch to StateConnected
> + * > Unfreeze_queue()
> + */
> + info->reconfiguring = true;

This is racy AFAICT. You should use an atomic test_and_set in the condition
above or lock the blkfront_info mutex before and while setting
reconfiguring. Without this two (or more) concurrent calls to
dynamic_reconfig_device might succeed.

> + xenbus_switch_state(info->xbdev, XenbusStateClosed);
> +
> + return count;
> +}
> +
> +static ssize_t max_indirect_segs_store(struct device *dev,
> + struct device_attribute *attr, const char *buf, size_t count)
> +{
> + ssize_t ret;
> + unsigned int max_segs = 0, backend_max_segs = 0;
> + struct blkfront_info *info = dev_get_drvdata(dev);
> + int err;
> +
> + ret = kstrtouint(buf, 10, &max_segs);
> + if (ret < 0)
> + return ret;
> +
> + if (max_segs == info->max_indirect_segments)
> + return count;
> +
> + err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
> + "feature-max-indirect-segments", "%u", &backend_max_segs,
> + NULL);
> + if (err) {
> + pr_err("Backend %s doesn't support feature-indirect-segments.\n",
> + info->xbdev->otherend);
> + return -EOPNOTSUPP;
> + }
> +
> + if (max_segs > backend_max_segs) {
> + pr_err("Invalid max indirect segment (%u), backend-max: %u.\n",
> + max_segs, backend_max_segs);
> + return -EINVAL;
> + }
> +
> + info->max_indirect_segments = max_segs;
> +
> + return dynamic_reconfig_device(info, count);
> +}
> +
> +static ssize_t max_ring_page_order_store(struct device *dev,
> + struct device_attribute *attr,
> + const char *buf, size_t count)
> +{
> + ssize_t ret;
> + unsigned int max_order = 0, backend_max_order = 0;
> + struct blkfront_info *info = dev_get_drvdata(dev);
> + int err;
> +
> + ret = kstrtouint(buf, 10, &max_order);
> + if (ret < 0)
> + return ret;
> +
> + if ((1 << max_order) == info->nr_ring_pages)
> + return count;
> +
> + if (max_order > XENBUS_MAX_RING_GRANT_ORDER) {
> + pr_err("Invalid max_ring_page_order (%u), max: %u.\n",
> + max_order, XENBUS_MAX_RING_GRANT_ORDER);
> + return -EINVAL;
> + }
> +
> + err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
> + "max-ring-page-order", "%u", &backend_max_order);
> + if (err != 1) {
> + pr_err("Backend %s doesn't support feature multi-page-ring.\n",
> + info->xbdev->otherend);
> + return -EOPNOTSUPP;
> + }
> + if (max_order > backend_max_order) {
> + pr_err("Invalid max_ring_page_order (%u), backend supports max: %u.\n",
> + max_order, backend_max_order);
> + return -EINVAL;
> + }
> + info->max_ring_page_order = max_order;
> +
> + return dynamic_reconfig_device(info, count);
> +}
> +
> +static ssize_t max_queues_store(struct device *dev,
> + struct device_attribute *attr,
> + const char *buf, size_t count)
> +{
> + ssize_t ret;
> + unsigned int max_queues = 0, backend_max_queues = 0;
> + struct blkfront_info *info = dev_get_drvdata(dev);
> + int err;
> +
> + ret = kstrtouint(buf, 10, &max_queues);
> + if (ret < 0)
> + return ret;
> +
> + if (max_queues == info->nr_rings)
> + return count;
> +
> + if (max_queues > num_online_cpus()) {
> + pr_err("Invalid max_queues (%u), can't bigger than online cpus: %u.\n",
> + max_queues, num_online_cpus());
> + return -EINVAL;
> + }
> +
> + err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
> + "multi-queue-max-queues", "%u", &backend_max_queues);
> + if (err != 1) {
> + pr_err("Backend %s doesn't support block multi queue.\n",
> + info->xbdev->otherend);
> + return -EOPNOTSUPP;
> + }
> + if (max_queues > backend_max_queues) {
> + pr_err("Invalid max_queues (%u), backend supports max: %u.\n",
> + max_queues, backend_max_queues);
> + return -EINVAL;
> + }
> + info->max_queues = max_queues;
> +
> + return dynamic_reconfig_device(info, count);
> }
>
> +static DEVICE_ATTR_RW(max_queues);
> +static DEVICE_ATTR_RW(max_ring_page_order);
> +static DEVICE_ATTR_RW(max_indirect_segs);
> +
> /*
> * Invoked when the backend is finally 'ready' (and has told produced
> * the details about the physical device - #sectors, size, etc).
> @@ -2393,6 +2620,10 @@ static void blkfront_connect(struct blkfront_info *info)
> * supports indirect descriptors, and how many.
> */
> blkif_recover(info);
> + if (info->reconfiguring) {
> + blk_mq_unfreeze_queue(info->rq);
> + info->reconfiguring = false;
> + }
> return;
>
> default:
> @@ -2443,6 +2674,22 @@ static void blkfront_connect(struct blkfront_info *info)
> return;
> }
>
> + err = device_create_file(&info->xbdev->dev, &dev_attr_max_ring_page_order);
> + if (err)
> + goto fail;
> +
> + err = device_create_file(&info->xbdev->dev, &dev_attr_max_indirect_segs);
> + if (err) {
> + device_remove_file(&info->xbdev->dev, &dev_attr_max_ring_page_order);
> + goto fail;
> + }
> +
> + err = device_create_file(&info->xbdev->dev, &dev_attr_max_queues);
> + if (err) {
> + device_remove_file(&info->xbdev->dev, &dev_attr_max_ring_page_order);
> + device_remove_file(&info->xbdev->dev, &dev_attr_max_indirect_segs);
> + goto fail;
> + }
> xenbus_switch_state(info->xbdev, XenbusStateConnected);
>
> /* Kick pending requests. */
> @@ -2453,6 +2700,12 @@ static void blkfront_connect(struct blkfront_info *info)
> add_disk(info->gd);
>
> info->is_ready = 1;
> + return;
> +
> +fail:
> + blkif_free(info, 0);
> + xlvbd_release_gendisk(info);
> + return;

Hm, I'm not sure whether this chunk should be in a separate patch, it seems
like blkfront_connect doesn't properly cleanup on error (if
xlvbd_alloc_gendisk fails blkif_free will not be called). Do you think you
could send the addition of the 'fail' label as a separate patch and fix the
error path of xlvbd_alloc_gendisk?

> }
>
> /**
> @@ -2500,8 +2753,16 @@ static void blkback_changed(struct xenbus_device *dev,
> break;
>
> case XenbusStateClosed:
> - if (dev->state == XenbusStateClosed)
> + if (dev->state == XenbusStateClosed) {
> + if (info->reconfiguring)
> + if (blkfront_resume(info->xbdev)) {

Could you please join those two conditions:

if (info->reconfiguring && blkfront_resume(info->xbdev)) { ...

Also, I'm not sure this is correct, if blkfront sees the "Closing" state on
blkback it will try to close the frontend and destroy the block device (see
blkfront_closing), and this should be avoided. You should call
blkfront_resume as soon as you see the backend move to the Closed or Closing
states, without calling blkfront_closing.

> + /* Resume failed. */
> + info->reconfiguring = false;
> + xenbus_switch_state(info->xbdev, XenbusStateClosed);
> + pr_err("Resume from dynamic configuration failed\n");
> + }
> break;
> + }
> /* Missed the backend's Closing state -- fallthrough */
> case XenbusStateClosing:
> if (info)
> --
> 1.7.10.4