Re: [PATCH v4 2/2] xen/block: add multi-page ring support
From: Roger Pau Monné
Date: Thu May 21 2015 - 07:22:30 EST
El 20/05/15 a les 15.10, Bob Liu ha escrit:
> Extend xen/block to support multi-page ring, so that more requests can be issued
> by using more than one pages as the request ring between blkfront and backend.
> As a result, the performance can get improved significantly.
>
> We got some impressive improvements on our highend iscsi storage cluster backend.
> If using 64 pages as the ring, the IOPS increased about 15 times for the
> throughput testing and above doubled for the latency testing.
>
> The reason was the limit on outstanding requests is 32 if use only one-page
> ring, but in our case the iscsi lun was spread across about 100 physical drives,
> 32 was really not enough to keep them busy.
>
> Changes in v2:
> - Rebased to 4.0-rc6.
> - Document on how mutli-page ring feature working to linux io/blkif.h.
>
> Changes in v3:
> - Remove changes to linux io/blkif.h and follow the protocol defined
> in io/blkif.h of XEN tree.
> - Rebased to 4.1-rc3
>
> Changes in v4:
> - Turn to use 'ring-page-order' and 'max-ring-page-order'.
> - A few comments from Roger.
>
> Signed-off-by: Bob Liu <bob.liu@xxxxxxxxxx>
> ---
> drivers/block/xen-blkback/blkback.c | 12 ++++
> drivers/block/xen-blkback/common.h | 3 +-
> drivers/block/xen-blkback/xenbus.c | 85 +++++++++++++++++++++-------
> drivers/block/xen-blkfront.c | 110 ++++++++++++++++++++++++++----------
> 4 files changed, 161 insertions(+), 49 deletions(-)
>
> diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
> index 713fc9f..057890f 100644
> --- a/drivers/block/xen-blkback/blkback.c
> +++ b/drivers/block/xen-blkback/blkback.c
> @@ -84,6 +84,12 @@ MODULE_PARM_DESC(max_persistent_grants,
> "Maximum number of grants to map persistently");
>
> /*
> + * Maximum number of pages to be used as the ring between front and backend
> + */
> +unsigned int xen_blkif_max_ring_order = XENBUS_MAX_RING_PAGE_ORDER;
> +module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, S_IRUGO);
> +MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used as the ring");
I'm not a native speaker, but the following seems better:
"Maximum order of pages to be used for the shared ring"
> +/*
> * The LRU mechanism to clean the lists of persistent grants needs to
> * be executed periodically. The time interval between consecutive executions
> * of the purge mechanism is set in ms.
> @@ -1435,6 +1441,12 @@ static int __init xen_blkif_init(void)
> {
> int rc = 0;
>
> + if (xen_blkif_max_ring_order > XENBUS_MAX_RING_PAGE_ORDER) {
> + pr_info("Invalid max_ring_order (%d), will use default max: %d.\n",
> + xen_blkif_max_ring_order, XENBUS_MAX_RING_PAGE_ORDER);
> + xen_blkif_max_ring_order = XENBUS_MAX_RING_PAGE_ORDER;
> + }
This chunk should go after the !xen_domain check.
> if (!xen_domain())
> return -ENODEV;
>
> diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
> index f620b5d..edc0992 100644
> --- a/drivers/block/xen-blkback/common.h
> +++ b/drivers/block/xen-blkback/common.h
> @@ -44,6 +44,7 @@
> #include <xen/interface/io/blkif.h>
> #include <xen/interface/io/protocols.h>
>
> +extern unsigned int xen_blkif_max_ring_order;
> /*
> * This is the maximum number of segments that would be allowed in indirect
> * requests. This value will also be passed to the frontend.
> @@ -248,7 +249,7 @@ struct backend_info;
> #define PERSISTENT_GNT_WAS_ACTIVE 1
>
> /* Number of requests that we can fit in a ring */
> -#define XEN_BLKIF_REQS 32
> +#define XEN_BLKIF_REQS (32 * XENBUS_MAX_RING_PAGES)
This should be XEN_BLKIF_MAX_REQS, because the actual value might be lower.
>
> struct persistent_gnt {
> struct page *page;
> diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
> index 6ab69ad..1ec05eb 100644
> --- a/drivers/block/xen-blkback/xenbus.c
> +++ b/drivers/block/xen-blkback/xenbus.c
> @@ -25,6 +25,7 @@
>
> /* Enlarge the array size in order to fully show blkback name. */
> #define BLKBACK_NAME_LEN (20)
> +#define RINGREF_NAME_LEN (20)
>
> struct backend_info {
> struct xenbus_device *dev;
> @@ -198,8 +199,8 @@ fail:
> return ERR_PTR(-ENOMEM);
> }
>
> -static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref,
> - unsigned int evtchn)
> +static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref,
> + unsigned int nr_grefs, unsigned int evtchn)
> {
> int err;
>
> @@ -207,7 +208,7 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref,
> if (blkif->irq)
> return 0;
>
> - err = xenbus_map_ring_valloc(blkif->be->dev, &gref, 1,
> + err = xenbus_map_ring_valloc(blkif->be->dev, gref, nr_grefs,
> &blkif->blk_ring);
> if (err < 0)
> return err;
> @@ -217,21 +218,21 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t gref,
> {
> struct blkif_sring *sring;
> sring = (struct blkif_sring *)blkif->blk_ring;
> - BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE);
> + BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE * nr_grefs);
> break;
> }
> case BLKIF_PROTOCOL_X86_32:
> {
> struct blkif_x86_32_sring *sring_x86_32;
> sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring;
> - BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE);
> + BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE * nr_grefs);
> break;
> }
> case BLKIF_PROTOCOL_X86_64:
> {
> struct blkif_x86_64_sring *sring_x86_64;
> sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring;
> - BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE);
> + BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE * nr_grefs);
> break;
> }
> default:
> @@ -597,6 +598,11 @@ static int xen_blkbk_probe(struct xenbus_device *dev,
> if (err)
> goto fail;
>
> + err = xenbus_printf(XBT_NIL, dev->nodename, "max-ring-page-order", "%u",
> + xen_blkif_max_ring_order);
> + if (err)
> + pr_warn("%s write out 'max-ring-pages' failed\n", __func__);
> +
> err = xenbus_switch_state(dev, XenbusStateInitWait);
> if (err)
> goto fail;
> @@ -860,22 +866,64 @@ again:
> static int connect_ring(struct backend_info *be)
> {
> struct xenbus_device *dev = be->dev;
> - unsigned long ring_ref;
> - unsigned int evtchn;
> + unsigned int ring_ref[XENBUS_MAX_RING_PAGES];
> + unsigned int evtchn, nr_grefs, ring_page_order;
> unsigned int pers_grants;
> char protocol[64] = "";
> int err;
>
> pr_debug("%s %s\n", __func__, dev->otherend);
>
> - err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu",
> - &ring_ref, "event-channel", "%u", &evtchn, NULL);
> - if (err) {
> - xenbus_dev_fatal(dev, err,
> - "reading %s/ring-ref and event-channel",
> + err = xenbus_scanf(XBT_NIL, dev->otherend, "event-channel", "%u",
> + &evtchn);
> + if (err != 1) {
> + err = -EINVAL;
> + xenbus_dev_fatal(dev, err, "reading %s/event-channel",
> dev->otherend);
> return err;
> }
> + pr_info("event-channel %u\n", evtchn);
> +
> + err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u",
> + &ring_page_order);
> + if (err != 1) {
> + err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref",
> + "%u", &ring_ref[0]);
> + if (err != 1) {
> + err = -EINVAL;
> + xenbus_dev_fatal(dev, err, "reading %s/ring-ref",
> + dev->otherend);
> + return err;
> + }
> + nr_grefs = 1;
> + pr_info("%s:using single page: ring-ref %d\n", dev->otherend,
> + ring_ref[0]);
> + } else {
> + unsigned int i;
> +
> + if (ring_page_order > xen_blkif_max_ring_order) {
> + err = -EINVAL;
> + xenbus_dev_fatal(dev, err, "%s/request %d ring page order exceed max:%d",
> + dev->otherend, ring_page_order, xen_blkif_max_ring_order);
> + return err;
> + }
> +
> + nr_grefs = 1 << ring_page_order;
> + for (i = 0; i < nr_grefs; i++) {
> + char ring_ref_name[RINGREF_NAME_LEN];
> +
> + snprintf(ring_ref_name, sizeof(ring_ref_name), "ring-ref%u", i);
^ RINGREF_NAME_LEN
> + err = xenbus_scanf(XBT_NIL, dev->otherend,
> + ring_ref_name, "%u", &ring_ref[i]);
No need to split lines unless they are > 100 chars (here and elsewhere).
> + if (err != 1) {
> + err = -EINVAL;
> + xenbus_dev_fatal(dev, err, "reading %s/%s",
> + dev->otherend, ring_ref_name);
> + return err;
> + }
> + pr_info("ring-ref%u: %u\n", i, ring_ref[i]);
> + }
> + }
>
> be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
> err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
> @@ -901,15 +949,14 @@ static int connect_ring(struct backend_info *be)
> be->blkif->vbd.feature_gnt_persistent = pers_grants;
> be->blkif->vbd.overflow_max_grants = 0;
>
> - pr_info("ring-ref %ld, event-channel %d, protocol %d (%s) %s\n",
> - ring_ref, evtchn, be->blkif->blk_protocol, protocol,
> - pers_grants ? "persistent grants" : "");
> + pr_info("ring-pages:%d, event-channel %d, protocol %d (%s) %s\n",
> + nr_grefs, evtchn, be->blkif->blk_protocol, protocol,
> + pers_grants ? "persistent grants" : "");
>
> /* Map the shared frame, irq etc. */
> - err = xen_blkif_map(be->blkif, ring_ref, evtchn);
> + err = xen_blkif_map(be->blkif, ring_ref, nr_grefs, evtchn);
> if (err) {
> - xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
> - ring_ref, evtchn);
> + xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn);
> return err;
> }
>
> diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
> index 88e23fd..3d1c6fb 100644
> --- a/drivers/block/xen-blkfront.c
> +++ b/drivers/block/xen-blkfront.c
> @@ -98,7 +98,17 @@ static unsigned int xen_blkif_max_segments = 32;
> module_param_named(max, xen_blkif_max_segments, int, S_IRUGO);
> MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)");
>
> -#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
> +static unsigned int xen_blkif_max_ring_order;
> +module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, S_IRUGO);
> +MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used as the ring");
> +
> +#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE * info->nr_ring_pages)
Didn't we agreed that this macro should take a explicit info parameter?
> +#define BLK_MAX_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE * XENBUS_MAX_RING_PAGES)
> +/*
> + * ring-ref%i i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19 characters
> + * are enough. Define to 20 to keep consist with backend.
> + */
> +#define RINGREF_NAME_LEN (20)
>
> /*
> * We have one of these per vbd, whether ide, scsi or 'other'. They
> @@ -114,13 +124,14 @@ struct blkfront_info
> int vdevice;
> blkif_vdev_t handle;
> enum blkif_state connected;
> - int ring_ref;
> + int ring_ref[XENBUS_MAX_RING_PAGES];
> + unsigned int nr_ring_pages;
> struct blkif_front_ring ring;
> unsigned int evtchn, irq;
> struct request_queue *rq;
> struct work_struct work;
> struct gnttab_free_callback callback;
> - struct blk_shadow shadow[BLK_RING_SIZE];
> + struct blk_shadow shadow[BLK_MAX_RING_SIZE];
> struct list_head grants;
> struct list_head indirect_pages;
> unsigned int persistent_gnts_c;
> @@ -139,8 +150,6 @@ static unsigned int nr_minors;
> static unsigned long *minors;
> static DEFINE_SPINLOCK(minor_lock);
>
> -#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
> - (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
> #define GRANT_INVALID_REF 0
>
> #define PARTS_PER_DISK 16
> @@ -1033,12 +1042,15 @@ free_shadow:
> flush_work(&info->work);
>
> /* Free resources associated with old device channel. */
> - if (info->ring_ref != GRANT_INVALID_REF) {
> - gnttab_end_foreign_access(info->ring_ref, 0,
> - (unsigned long)info->ring.sring);
> - info->ring_ref = GRANT_INVALID_REF;
> - info->ring.sring = NULL;
> + for (i = 0; i < info->nr_ring_pages; i++) {
> + if (info->ring_ref[i] != GRANT_INVALID_REF) {
> + gnttab_end_foreign_access(info->ring_ref[i], 0, 0);
> + info->ring_ref[i] = GRANT_INVALID_REF;
> + }
> }
> + free_pages((unsigned long)info->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE));
> + info->ring.sring = NULL;
> +
> if (info->irq)
> unbind_from_irqhandler(info->irq, info);
> info->evtchn = info->irq = 0;
> @@ -1245,26 +1257,30 @@ static int setup_blkring(struct xenbus_device *dev,
> struct blkfront_info *info)
> {
> struct blkif_sring *sring;
> - grant_ref_t gref;
> - int err;
> + int err, i;
> + unsigned long ring_size = info->nr_ring_pages * PAGE_SIZE;
> + grant_ref_t gref[XENBUS_MAX_RING_PAGES];
>
> - info->ring_ref = GRANT_INVALID_REF;
> + for (i = 0; i < info->nr_ring_pages; i++)
> + info->ring_ref[i] = GRANT_INVALID_REF;
>
> - sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH);
> + sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH,
> + get_order(ring_size));
> if (!sring) {
> xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
> return -ENOMEM;
> }
> SHARED_RING_INIT(sring);
> - FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
> + FRONT_RING_INIT(&info->ring, sring, ring_size);
>
> - err = xenbus_grant_ring(dev, info->ring.sring, 1, &gref);
> + err = xenbus_grant_ring(dev, info->ring.sring, info->nr_ring_pages, gref);
> if (err < 0) {
> free_page((unsigned long)sring);
This should be:
free_pages((unsigned long)sring, get_order(info->nr_ring_pages *
PAGE_SIZE));
> info->ring.sring = NULL;
> goto fail;
> }
> - info->ring_ref = gref;
> + for (i = 0; i < info->nr_ring_pages; i++)
> + info->ring_ref[i] = gref[i];
>
> err = xenbus_alloc_evtchn(dev, &info->evtchn);
> if (err)
> @@ -1292,7 +1308,18 @@ static int talk_to_blkback(struct xenbus_device *dev,
> {
> const char *message = NULL;
> struct xenbus_transaction xbt;
> - int err;
> + int err, i;
> + unsigned int max_page_order = 0;
> + unsigned int ring_page_order = 0;
> +
> + err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
> + "max-ring-page-order", "%u", &max_page_order);
> + if (err != 1)
> + info->nr_ring_pages = 1;
> + else {
> + ring_page_order = min(xen_blkif_max_ring_order, max_page_order);
> + info->nr_ring_pages = 1 << ring_page_order;
> + }
>
> /* Create shared ring, alloc event channel. */
> err = setup_blkring(dev, info);
> @@ -1306,11 +1333,31 @@ again:
> goto destroy_blkring;
> }
>
> - err = xenbus_printf(xbt, dev->nodename,
> - "ring-ref", "%u", info->ring_ref);
> - if (err) {
> - message = "writing ring-ref";
> - goto abort_transaction;
> + if (info->nr_ring_pages == 1) {
> + err = xenbus_printf(xbt, dev->nodename,
> + "ring-ref", "%u", info->ring_ref[0]);
> + if (err) {
> + message = "writing ring-ref";
> + goto abort_transaction;
> + }
> + } else {
> + err = xenbus_printf(xbt, dev->nodename,
> + "ring-page-order", "%u", ring_page_order);
> + if (err) {
> + message = "writing ring-page-order";
> + goto abort_transaction;
> + }
> +
> + for (i = 0; i < info->nr_ring_pages; i++) {
> + char ring_ref_name[RINGREF_NAME_LEN];
Missing line break here.
> + snprintf(ring_ref_name, sizeof(ring_ref_name), "ring-ref%u", i);
I would also use RINGREF_NAME_LEN here directly instead of the sizeof.
> + err = xenbus_printf(xbt, dev->nodename,
> + ring_ref_name, "%u", info->ring_ref[i]);
> + if (err) {
> + message = "writing ring-ref";
> + goto abort_transaction;
> + }
> + }
> }
> err = xenbus_printf(xbt, dev->nodename,
> "event-channel", "%u", info->evtchn);
> @@ -1338,6 +1385,9 @@ again:
> goto destroy_blkring;
> }
>
> + for (i = 0; i < BLK_MAX_RING_SIZE; i++)
IMHO better use (BLK_RING_SIZE - 1) here instead of BLK_MAX_RING_SIZE.
> + info->shadow[i].req.u.rw.id = i+1;
> + info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
> xenbus_switch_state(dev, XenbusStateInitialised);
>
> return 0;
> @@ -1361,7 +1411,7 @@ again:
> static int blkfront_probe(struct xenbus_device *dev,
> const struct xenbus_device_id *id)
> {
> - int err, vdevice, i;
> + int err, vdevice;
> struct blkfront_info *info;
>
> /* FIXME: Use dynamic device id if this is not set. */
> @@ -1422,10 +1472,6 @@ static int blkfront_probe(struct xenbus_device *dev,
> info->connected = BLKIF_STATE_DISCONNECTED;
> INIT_WORK(&info->work, blkif_restart_queue);
>
> - for (i = 0; i < BLK_RING_SIZE; i++)
> - info->shadow[i].req.u.rw.id = i+1;
> - info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
> -
> /* Front end dir is a number, which is used as the id. */
> info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
> dev_set_drvdata(&dev->dev, info);
> @@ -1469,7 +1515,7 @@ static int blkif_recover(struct blkfront_info *info)
>
> /* Stage 2: Set up free list. */
> memset(&info->shadow, 0, sizeof(info->shadow));
> - for (i = 0; i < BLK_RING_SIZE; i++)
> + for (i = 0; i < BLK_MAX_RING_SIZE; i++)
Why do you have to use BLK_MAX_RING_SIZE instead of the previous
BLK_RING_SIZE?
> info->shadow[i].req.u.rw.id = i+1;
> info->shadow_free = info->ring.req_prod_pvt;
> info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
> @@ -2086,6 +2132,12 @@ static int __init xlblk_init(void)
> {
> int ret;
>
> + if (xen_blkif_max_ring_order > XENBUS_MAX_RING_PAGE_ORDER) {
> + pr_info("Invalid max_ring_order (%d), will use default max: %d.\n",
> + xen_blkif_max_ring_order, XENBUS_MAX_RING_PAGE_ORDER);
> + xen_blkif_max_ring_order = 0;
> + }
> +
> if (!xen_domain())
> return -ENODEV;
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/