Re: [PATCH 3/3] xen-blkfront: dynamic configuration of per-vbd resources
From: Bob Liu
Date: Thu Jul 21 2016 - 06:09:14 EST
On 07/21/2016 04:57 PM, Roger Pau Monné wrote:
> On Fri, Jul 15, 2016 at 05:31:49PM +0800, Bob Liu wrote:
>> The current VBD layer reserves buffer space for each attached device based on
>> three statically configured settings which are read at boot time.
>> * max_indirect_segs: Maximum amount of segments.
>> * max_ring_page_order: Maximum order of pages to be used for the shared ring.
>> * max_queues: Maximum of queues(rings) to be used.
>>
>> But the storage backend, workload, and guest memory result in very different
>> tuning requirements. It's impossible to centrally predict application
>> characteristics so it's best to leave allow the settings can be dynamiclly
>> adjusted based on workload inside the Guest.
>>
>> Usage:
>> Show current values:
>> cat /sys/devices/vbd-xxx/max_indirect_segs
>> cat /sys/devices/vbd-xxx/max_ring_page_order
>> cat /sys/devices/vbd-xxx/max_queues
>>
>> Write new values:
>> echo <new value> > /sys/devices/vbd-xxx/max_indirect_segs
>> echo <new value> > /sys/devices/vbd-xxx/max_ring_page_order
>> echo <new value> > /sys/devices/vbd-xxx/max_queues
>>
>> Signed-off-by: Bob Liu <bob.liu@xxxxxxxxxx>
>> --
>> v2: Add device lock and other comments from Konrad.
>> ---
>> drivers/block/xen-blkfront.c | 285 ++++++++++++++++++++++++++++++++++++++++++-
>> 1 file changed, 283 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
>> index 10f46a8..9a5ed22 100644
>> --- a/drivers/block/xen-blkfront.c
>> +++ b/drivers/block/xen-blkfront.c
>> @@ -46,6 +46,7 @@
>> #include <linux/scatterlist.h>
>> #include <linux/bitmap.h>
>> #include <linux/list.h>
>> +#include <linux/delay.h>
>>
>> #include <xen/xen.h>
>> #include <xen/xenbus.h>
>> @@ -212,6 +213,11 @@ struct blkfront_info
>> /* Save uncomplete reqs and bios for migration. */
>> struct list_head requests;
>> struct bio_list bio_list;
>> + /* For dynamic configuration. */
>> + unsigned int reconfiguring:1;
>> + int new_max_indirect_segments;
>> + int new_max_ring_page_order;
>> + int new_max_queues;
>> };
>>
>> static unsigned int nr_minors;
>> @@ -1350,6 +1356,31 @@ static void blkif_free(struct blkfront_info *info, int suspend)
>> for (i = 0; i < info->nr_rings; i++)
>> blkif_free_ring(&info->rinfo[i]);
>>
>> + /* Remove old xenstore nodes. */
>> + if (info->nr_ring_pages > 1)
>> + xenbus_rm(XBT_NIL, info->xbdev->nodename, "ring-page-order");
>> +
>> + if (info->nr_rings == 1) {
>> + if (info->nr_ring_pages == 1) {
>> + xenbus_rm(XBT_NIL, info->xbdev->nodename, "ring-ref");
>> + } else {
>> + for (i = 0; i < info->nr_ring_pages; i++) {
>> + char ring_ref_name[RINGREF_NAME_LEN];
>> +
>> + snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
>> + xenbus_rm(XBT_NIL, info->xbdev->nodename, ring_ref_name);
>> + }
>> + }
>> + } else {
>> + xenbus_rm(XBT_NIL, info->xbdev->nodename, "multi-queue-num-queues");
>> +
>> + for (i = 0; i < info->nr_rings; i++) {
>> + char queuename[QUEUE_NAME_LEN];
>> +
>> + snprintf(queuename, QUEUE_NAME_LEN, "queue-%u", i);
>> + xenbus_rm(XBT_NIL, info->xbdev->nodename, queuename);
>> + }
>> + }
>> kfree(info->rinfo);
>> info->rinfo = NULL;
>> info->nr_rings = 0;
>> @@ -1772,6 +1803,10 @@ static int talk_to_blkback(struct xenbus_device *dev,
>> info->nr_ring_pages = 1;
>> else {
>> ring_page_order = min(xen_blkif_max_ring_order, max_page_order);
>> + if (info->new_max_ring_page_order) {
>
> Instead of calling this "new_max_ring_page_order", could you just call it
> max_ring_page_order, iniitalize it to xen_blkif_max_ring_order by default
Sure, I can do that.
> and use it everywhere instead of xen_blkif_max_ring_order?
But "xen_blkif_max_ring_order" still have to be used here, this is the only place "xen_blkif_max_ring_order" is used(except checking the value of it in xlblk_init()).
>
>> + BUG_ON(info->new_max_ring_page_order > max_page_order);
>> + ring_page_order = info->new_max_ring_page_order;
>> + }
>> info->nr_ring_pages = 1 << ring_page_order;
>> }
>>
>> @@ -1895,6 +1930,10 @@ static int negotiate_mq(struct blkfront_info *info)
>> backend_max_queues = 1;
>>
>> info->nr_rings = min(backend_max_queues, xen_blkif_max_queues);
>> + if (info->new_max_queues) {
>
> Same here IMHO, this is going to make the code flow slightly easier to
> understand.
>
>> + BUG_ON(info->new_max_queues > backend_max_queues);
>> + info->nr_rings = info->new_max_queues;
>> + }
>> /* We need at least one ring. */
>> if (!info->nr_rings)
>> info->nr_rings = 1;
>> @@ -2352,11 +2391,227 @@ static void blkfront_gather_backend_features(struct blkfront_info *info)
>> NULL);
>> if (err)
>> info->max_indirect_segments = 0;
>> - else
>> + else {
>> info->max_indirect_segments = min(indirect_segments,
>> xen_blkif_max_segments);
>> + if (info->new_max_indirect_segments) {
>> + BUG_ON(info->new_max_indirect_segments > indirect_segments);
>> + info->max_indirect_segments = info->new_max_indirect_segments;
>> + }
>> + }
>> +}
>> +
>> +static ssize_t max_ring_page_order_show(struct device *dev,
>> + struct device_attribute *attr, char *page)
>> +{
>> + struct blkfront_info *info = dev_get_drvdata(dev);
>> +
>> + return sprintf(page, "%u\n", get_order(info->nr_ring_pages * XEN_PAGE_SIZE));
>> +}
>> +
>> +static ssize_t max_indirect_segs_show(struct device *dev,
>> + struct device_attribute *attr, char *page)
>> +{
>> + struct blkfront_info *info = dev_get_drvdata(dev);
>> +
>> + return sprintf(page, "%u\n", info->max_indirect_segments);
>> +}
>> +
>> +static ssize_t max_queues_show(struct device *dev,
>> + struct device_attribute *attr, char *page)
>> +{
>> + struct blkfront_info *info = dev_get_drvdata(dev);
>> +
>> + return sprintf(page, "%u\n", info->nr_rings);
>> +}
>> +
>> +static ssize_t dynamic_reconfig_device(struct blkfront_info *info, ssize_t count)
>> +{
>> + unsigned int i;
>> + int err = -EBUSY;
>> +
>> + /*
>> + * Make sure no migration in parallel, device lock is actually a
>> + * mutex.
>> + */
>> + if (!device_trylock(&info->xbdev->dev)) {
>> + pr_err("Fail to acquire dev:%s lock, may be in migration.\n",
>> + dev_name(&info->xbdev->dev));
>> + return err;
>> + }
>> +
>> + /*
>> + * Prevent new requests and guarantee no uncompleted reqs.
>> + */
>> + blk_mq_freeze_queue(info->rq);
>> + if (part_in_flight(&info->gd->part0))
>> + goto out;
>> +
>> + /*
>> + * Front Backend
>> + * Switch to XenbusStateClosed
>> + * frontend_changed():
>> + * case XenbusStateClosed:
>> + * xen_blkif_disconnect()
>> + * Switch to XenbusStateClosed
>> + * blkfront_resume():
>> + * frontend_changed():
>> + * reconnect
>> + * Wait until XenbusStateConnected
>> + */
>> + info->reconfiguring = true;
>> + xenbus_switch_state(info->xbdev, XenbusStateClosed);
>> +
>> + /* Poll every 100ms, 1 minute timeout. */
>> + for (i = 0; i < 600; i++) {
>> + /*
>> + * Wait backend enter XenbusStateClosed, blkback_changed()
>> + * will clear reconfiguring.
>> + */
>> + if (!info->reconfiguring)
>> + goto resume;
>> + schedule_timeout_interruptible(msecs_to_jiffies(100));
>> + }
>
> Instead of having this wait, could you just set info->reconfiguring = 1, set
> the frontend state to XenbusStateClosed and mimic exactly what a resume from
> suspension does? blkback_changed would have to set the frontend state to
> InitWait when it detects that the backend has switched to Closed, and call
> blkfront_resume.
I think that won't work.
In the real "resume" case, the power management system will trigger all ->resume() path.
But there is no place for dynamic configuration.
Thanks,
Bob Liu