Re: [Xen-devel] [PATCH 2/2] drivers: xen/block: add multi-page ring support
From: Bob Liu
Date: Fri Feb 06 2015 - 05:43:21 EST
On 02/06/2015 01:41 AM, Felipe Franciosi wrote:
> Hi Bob,
>
> Can you elaborate on the environment where you measured such an improvement?
>
> I'm particularly interested in:
> What workload were you issuing? (e.g. 4K seq reads?)
8k writes.
> What backend were you using? (e.g. null driver? what parameters? some specific disk/array?)
Iscsi lun spread across about 100 physical drives, so the limit
outstanding requests(32) is really not enough to keep them busy.
With more inflight requests from DomU, we can see IOPS improved
significantly. E.g after using 4 pages as the ring, there could be 128
inflight requests instead of 32.
Thanks,
-Bob
> What was the host configuration for the test?
> What was the VM configuration for the test?
>
> Thanks,
> Felipe
>
>
> -----Original Message-----
> From: xen-devel-bounces@xxxxxxxxxxxxx [mailto:xen-devel-bounces@xxxxxxxxxxxxx] On Behalf Of Bob Liu
> Sent: 23 January 2015 10:15
> To: xen-devel@xxxxxxxxxxxxx
> Cc: Wei Liu; linux-kernel@xxxxxxxxxxxxxxx; Bob Liu; David Vrabel; boris.ostrovsky@xxxxxxxxxx; Roger Pau Monne
> Subject: [Xen-devel] [PATCH 2/2] drivers: xen/block: add multi-page ring support
>
> Extend xen/block to support multi-page ring.
> * xen-blkback notify blkfront with feature-multi-ring-pages
> * xen-blkfront write to xenstore about how many pages are used as the ring
>
> If using 4 pages as the ring, inflight requests inscreased from 32 to 128 and IOPS improved nearly 400% on our system.
>
> Signed-off-by: Bob Liu <bob.liu@xxxxxxxxxx>
> ---
> drivers/block/xen-blkback/xenbus.c | 86 +++++++++++++++++++++++++--------
> drivers/block/xen-blkfront.c | 94 ++++++++++++++++++++++++++----------
> 2 files changed, 133 insertions(+), 47 deletions(-)
>
> diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
> index cbd13c9..a5c9c62 100644
> --- a/drivers/block/xen-blkback/xenbus.c
> +++ b/drivers/block/xen-blkback/xenbus.c
> @@ -193,8 +193,8 @@ fail:
> return ERR_PTR(-ENOMEM);
> }
>
> -static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref,
> - unsigned int evtchn)
> +static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
> + unsigned int nr_grefs, unsigned int evtchn)
> {
> int err;
>
> @@ -202,7 +202,7 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref,
> if (blkif->irq)
> return 0;
>
> - err = xenbus_map_ring_valloc(blkif->be->dev, &gref, 1,
> + err = xenbus_map_ring_valloc(blkif->be->dev, gref, nr_grefs,
> &blkif->blk_ring);
> if (err < 0)
> return err;
> @@ -212,21 +212,21 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref,
> {
> struct blkif_sring *sring;
> sring = (struct blkif_sring *)blkif->blk_ring;
> - BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE);
> + BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE *
> +nr_grefs);
> break;
> }
> case BLKIF_PROTOCOL_X86_32:
> {
> struct blkif_x86_32_sring *sring_x86_32;
> sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring;
> - BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE);
> + BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE *
> +nr_grefs);
> break;
> }
> case BLKIF_PROTOCOL_X86_64:
> {
> struct blkif_x86_64_sring *sring_x86_64;
> sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring;
> - BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE);
> + BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE *
> +nr_grefs);
> break;
> }
> default:
> @@ -588,6 +588,13 @@ static int xen_blkbk_probe(struct xenbus_device *dev,
> if (err)
> goto fail;
>
> + err = xenbus_printf(XBT_NIL, dev->nodename,
> + "feature-multi-ring-pages", "%u", 1);
> + if (err) {
> + pr_debug("Error writing feature-multi-ring-pages\n");
> + goto fail;
> + }
> +
> err = xenbus_switch_state(dev, XenbusStateInitWait);
> if (err)
> goto fail;
> @@ -852,23 +859,61 @@ again:
> static int connect_ring(struct backend_info *be) {
> struct xenbus_device *dev = be->dev;
> - unsigned long ring_ref;
> - unsigned int evtchn;
> + unsigned int ring_ref[XENBUS_MAX_RING_PAGES];
> + unsigned int evtchn, nr_grefs;
> unsigned int pers_grants;
> char protocol[64] = "";
> int err;
>
> DPRINTK("%s", dev->otherend);
> -
> - err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu",
> - &ring_ref, "event-channel", "%u", &evtchn, NULL);
> - if (err) {
> - xenbus_dev_fatal(dev, err,
> - "reading %s/ring-ref and event-channel",
> - dev->otherend);
> + err = xenbus_scanf(XBT_NIL, dev->otherend, "event-channel", "%u",
> + &evtchn);
> + if (err != 1) {
> + err = -EINVAL;
> + xenbus_dev_fatal(dev, err, "reading %s/event-channel",
> + dev->otherend);
> return err;
> }
>
> + err = xenbus_scanf(XBT_NIL, dev->otherend, "max-ring-pages", "%u",
> + &nr_grefs);
> + if (err != 1) {
> + err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref",
> + "%u", &ring_ref[0]);
> + if (err != 1) {
> + err = -EINVAL;
> + xenbus_dev_fatal(dev, err, "reading %s/ring-ref",
> + dev->otherend);
> + return err;
> + }
> + nr_grefs = 1;
> + pr_debug("%s:using one-page ring with ref: %d\n",
> + dev->otherend, ring_ref[0]);
> + } else {
> + unsigned int i;
> +
> + if (nr_grefs > XENBUS_MAX_RING_PAGES) {
> + err = -EINVAL;
> + xenbus_dev_fatal(dev, err,
> + "%s/ring-pages:%d too big", dev->otherend, nr_grefs);
> + return err;
> + }
> + for (i = 0; i < nr_grefs; i++) {
> + char ring_ref_name[15];
> + snprintf(ring_ref_name, sizeof(ring_ref_name),
> + "ring-ref%u", i);
> + err = xenbus_scanf(XBT_NIL, dev->otherend,
> + ring_ref_name, "%d", &ring_ref[i]);
> + if (err != 1) {
> + err = -EINVAL;
> + xenbus_dev_fatal(dev, err, "reading %s/%s",
> + dev->otherend, ring_ref_name);
> + return err;
> + }
> + pr_debug("blkback: ring-ref%u: %d\n", i, ring_ref[i]);
> + }
> + }
> +
> be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
> err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
> "%63s", protocol, NULL);
> @@ -893,15 +938,14 @@ static int connect_ring(struct backend_info *be)
> be->blkif->vbd.feature_gnt_persistent = pers_grants;
> be->blkif->vbd.overflow_max_grants = 0;
>
> - pr_info(DRV_PFX "ring-ref %ld, event-channel %d, protocol %d (%s) %s\n",
> - ring_ref, evtchn, be->blkif->blk_protocol, protocol,
> - pers_grants ? "persistent grants" : "");
> + pr_info(DRV_PFX "ring-pages:%d, event-channel %d, protocol %d (%s) %s\n",
> + nr_grefs, evtchn, be->blkif->blk_protocol, protocol,
> + pers_grants ? "persistent grants" : "");
>
> /* Map the shared frame, irq etc. */
> - err = xen_blkif_map(be->blkif, ring_ref, evtchn);
> + err = xen_blkif_map(be->blkif, ring_ref, nr_grefs, evtchn);
> if (err) {
> - xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
> - ring_ref, evtchn);
> + xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn);
> return err;
> }
>
> diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 2fcf75d..dec9fe7 100644
> --- a/drivers/block/xen-blkfront.c
> +++ b/drivers/block/xen-blkfront.c
> @@ -98,7 +98,12 @@ static unsigned int xen_blkif_max_segments = 32; module_param_named(max, xen_blkif_max_segments, int, S_IRUGO); MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)");
>
> -#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
> +static unsigned int xen_blkif_max_ring_pages = 1;
> +module_param_named(max_ring_pages, xen_blkif_max_ring_pages, int, 0);
> +MODULE_PARM_DESC(max_ring_pages, "Maximum amount of pages to be used as
> +the ring");
> +
> +#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE *
> +info->nr_ring_pages) #define BLK_MAX_RING_SIZE __CONST_RING_SIZE(blkif,
> +PAGE_SIZE * XENBUS_MAX_RING_PAGES)
>
> /*
> * We have one of these per vbd, whether ide, scsi or 'other'. They @@ -114,13 +119,14 @@ struct blkfront_info
> int vdevice;
> blkif_vdev_t handle;
> enum blkif_state connected;
> - int ring_ref;
> + int ring_ref[XENBUS_MAX_RING_PAGES];
> + int nr_ring_pages;
> struct blkif_front_ring ring;
> unsigned int evtchn, irq;
> struct request_queue *rq;
> struct work_struct work;
> struct gnttab_free_callback callback;
> - struct blk_shadow shadow[BLK_RING_SIZE];
> + struct blk_shadow shadow[BLK_MAX_RING_SIZE];
> struct list_head grants;
> struct list_head indirect_pages;
> unsigned int persistent_gnts_c;
> @@ -139,8 +145,6 @@ static unsigned int nr_minors; static unsigned long *minors; static DEFINE_SPINLOCK(minor_lock);
>
> -#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
> - (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
> #define GRANT_INVALID_REF 0
>
> #define PARTS_PER_DISK 16
> @@ -1033,12 +1037,15 @@ free_shadow:
> flush_work(&info->work);
>
> /* Free resources associated with old device channel. */
> - if (info->ring_ref != GRANT_INVALID_REF) {
> - gnttab_end_foreign_access(info->ring_ref, 0,
> - (unsigned long)info->ring.sring);
> - info->ring_ref = GRANT_INVALID_REF;
> - info->ring.sring = NULL;
> + for (i = 0; i < info->nr_ring_pages; i++) {
> + if (info->ring_ref[i] != GRANT_INVALID_REF) {
> + gnttab_end_foreign_access(info->ring_ref[i], 0, 0);
> + info->ring_ref[i] = GRANT_INVALID_REF;
> + }
> }
> + free_pages((unsigned long)info->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE));
> + info->ring.sring = NULL;
> +
> if (info->irq)
> unbind_from_irqhandler(info->irq, info);
> info->evtchn = info->irq = 0;
> @@ -1245,26 +1252,30 @@ static int setup_blkring(struct xenbus_device *dev,
> struct blkfront_info *info)
> {
> struct blkif_sring *sring;
> - int err;
> - grant_ref_t gref;
> + int err, i;
> + unsigned long ring_size = info->nr_ring_pages * PAGE_SIZE;
> + grant_ref_t gref[XENBUS_MAX_RING_PAGES];
>
> - info->ring_ref = GRANT_INVALID_REF;
> + for (i = 0; i < XENBUS_MAX_RING_PAGES; i++)
> + info->ring_ref[i] = GRANT_INVALID_REF;
>
> - sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH);
> + sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH,
> + get_order(ring_size));
> if (!sring) {
> xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
> return -ENOMEM;
> }
> SHARED_RING_INIT(sring);
> - FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
> + FRONT_RING_INIT(&info->ring, sring, ring_size);
>
> - err = xenbus_grant_ring(dev, info->ring.sring, 1, &gref);
> + err = xenbus_grant_ring(dev, info->ring.sring, info->nr_ring_pages,
> +gref);
> if (err < 0) {
> free_page((unsigned long)sring);
> info->ring.sring = NULL;
> goto fail;
> }
> - info->ring_ref = gref;
> + for (i = 0; i < info->nr_ring_pages; i++)
> + info->ring_ref[i] = gref[i];
>
> err = xenbus_alloc_evtchn(dev, &info->evtchn);
> if (err)
> @@ -1292,7 +1303,14 @@ static int talk_to_blkback(struct xenbus_device *dev, {
> const char *message = NULL;
> struct xenbus_transaction xbt;
> - int err;
> + int err, i, multi_ring_pages = 0;
> +
> + err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
> + "feature-multi-ring-pages", "%u", &multi_ring_pages);
> + if (err != 1)
> + info->nr_ring_pages = 1;
> + else
> + info->nr_ring_pages = xen_blkif_max_ring_pages;
>
> /* Create shared ring, alloc event channel. */
> err = setup_blkring(dev, info);
> @@ -1306,12 +1324,33 @@ again:
> goto destroy_blkring;
> }
>
> - err = xenbus_printf(xbt, dev->nodename,
> - "ring-ref", "%u", info->ring_ref);
> - if (err) {
> - message = "writing ring-ref";
> - goto abort_transaction;
> + if (info->nr_ring_pages == 1) {
> + err = xenbus_printf(xbt, dev->nodename,
> + "ring-ref", "%u", info->ring_ref[0]);
> + if (err) {
> + message = "writing ring-ref";
> + goto abort_transaction;
> + }
> + } else {
> + err = xenbus_printf(xbt, dev->nodename,
> + "max-ring-pages", "%u", info->nr_ring_pages);
> + if (err) {
> + message = "writing max-ring-pages";
> + goto abort_transaction;
> + }
> +
> + for (i = 0; i < info->nr_ring_pages; i++) {
> + char ring_ref_name[15];
> + sprintf(ring_ref_name, "ring-ref%d", i);
> + err = xenbus_printf(xbt, dev->nodename,
> + ring_ref_name, "%d", info->ring_ref[i]);
> + if (err) {
> + message = "writing ring-ref";
> + goto abort_transaction;
> + }
> + }
> }
> +
> err = xenbus_printf(xbt, dev->nodename,
> "event-channel", "%u", info->evtchn);
> if (err) {
> @@ -1422,9 +1461,8 @@ static int blkfront_probe(struct xenbus_device *dev,
> info->connected = BLKIF_STATE_DISCONNECTED;
> INIT_WORK(&info->work, blkif_restart_queue);
>
> - for (i = 0; i < BLK_RING_SIZE; i++)
> + for (i = 0; i < BLK_MAX_RING_SIZE; i++)
> info->shadow[i].req.u.rw.id = i+1;
> - info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
>
> /* Front end dir is a number, which is used as the id. */
> info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0); @@ -1436,6 +1474,7 @@ static int blkfront_probe(struct xenbus_device *dev,
> dev_set_drvdata(&dev->dev, NULL);
> return err;
> }
> + info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
>
> return 0;
> }
> @@ -1476,7 +1515,7 @@ static int blkif_recover(struct blkfront_info *info)
>
> /* Stage 2: Set up free list. */
> memset(&info->shadow, 0, sizeof(info->shadow));
> - for (i = 0; i < BLK_RING_SIZE; i++)
> + for (i = 0; i < BLK_MAX_RING_SIZE; i++)
> info->shadow[i].req.u.rw.id = i+1;
> info->shadow_free = info->ring.req_prod_pvt;
> info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff; @@ -2088,6 +2127,9 @@ static int __init xlblk_init(void) {
> int ret;
>
> + if (xen_blkif_max_ring_pages > XENBUS_MAX_RING_PAGES)
> + return -EINVAL;
> +
> if (!xen_domain())
> return -ENODEV;
>
> --
> 1.7.10.4
>
>
> _______________________________________________
> Xen-devel mailing list
> Xen-devel@xxxxxxxxxxxxx
> http://lists.xen.org/xen-devel
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/