Re: [PATCH v4 07/10] xen/blkback: pseudo support for multi hardware queues/rings

From: Konrad Rzeszutek Wilk
Date: Wed Nov 04 2015 - 22:25:07 EST


On November 4, 2015 10:02:26 PM EST, Bob Liu <bob.liu@xxxxxxxxxx> wrote:
>
>On 11/05/2015 10:30 AM, Konrad Rzeszutek Wilk wrote:
>> On Mon, Nov 02, 2015 at 12:21:43PM +0800, Bob Liu wrote:
>>> Preparatory patch for multiple hardware queues (rings). The number of
>>> rings is unconditionally set to 1, larger number will be enabled in next
>>> patch so as to make every single patch small and readable.
>>
>> Instead of saying 'next patch' - spell out the title of the patch.
>>
>>
>>>
>>> Signed-off-by: Arianna Avanzini <avanzini.arianna@xxxxxxxxx>
>>> Signed-off-by: Bob Liu <bob.liu@xxxxxxxxxx>
>>> ---
>>> drivers/block/xen-blkback/common.h | 3 +-
>>> drivers/block/xen-blkback/xenbus.c | 292 +++++++++++++++++++++++--------------
>>> 2 files changed, 185 insertions(+), 110 deletions(-)
>>>
>>> diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
>>> index f0dd69a..4de1326 100644
>>> --- a/drivers/block/xen-blkback/common.h
>>> +++ b/drivers/block/xen-blkback/common.h
>>> @@ -341,7 +341,8 @@ struct xen_blkif {
>>> struct work_struct free_work;
>>> unsigned int nr_ring_pages;
>>> /* All rings for this device */
>>> - struct xen_blkif_ring ring;
>>> + struct xen_blkif_ring *rings;
>>> + unsigned int nr_rings;
>>> };
>>>
>>> struct seg_buf {
>>> diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
>>> index 7bdd5fd..ac4b458 100644
>>> --- a/drivers/block/xen-blkback/xenbus.c
>>> +++ b/drivers/block/xen-blkback/xenbus.c
>>> @@ -84,11 +84,12 @@ static int blkback_name(struct xen_blkif *blkif, char *buf)
>>>
>>> static void xen_update_blkif_status(struct xen_blkif *blkif)
>>> {
>>> - int err;
>>> + int err, i;
>>
>> unsigned int.
>>> char name[BLKBACK_NAME_LEN];
>>> + struct xen_blkif_ring *ring;
>>>
>>> /* Not ready to connect? */
>>> - if (!blkif->ring.irq || !blkif->vbd.bdev)
>>> + if (!blkif->rings || !blkif->rings[0].irq || !blkif->vbd.bdev)
>>> return;
>>>
>>> /* Already connected? */
>>> @@ -113,19 +114,57 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
>>> }
>>> invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping);
>>>
>>> - blkif->ring.xenblkd = kthread_run(xen_blkif_schedule, &blkif->ring, "%s", name);
>>> - if (IS_ERR(blkif->ring.xenblkd)) {
>>> - err = PTR_ERR(blkif->ring.xenblkd);
>>> - blkif->ring.xenblkd = NULL;
>>> - xenbus_dev_error(blkif->be->dev, err, "start xenblkd");
>>> - return;
>>> + if (blkif->nr_rings == 1) {
>>> + blkif->rings[0].xenblkd = kthread_run(xen_blkif_schedule, &blkif->rings[0], "%s", name);
>>> + if (IS_ERR(blkif->rings[0].xenblkd)) {
>>> + err = PTR_ERR(blkif->rings[0].xenblkd);
>>> + blkif->rings[0].xenblkd = NULL;
>>> + xenbus_dev_error(blkif->be->dev, err, "start xenblkd");
>>> + return;
>>> + }
>>> + } else {
>>> + for (i = 0; i < blkif->nr_rings; i++) {
>>> + ring = &blkif->rings[i];
>>> + ring->xenblkd = kthread_run(xen_blkif_schedule, ring, "%s-%d", name, i);
>>> + if (IS_ERR(ring->xenblkd)) {
>>> + err = PTR_ERR(ring->xenblkd);
>>> + ring->xenblkd = NULL;
>>> + xenbus_dev_error(blkif->be->dev, err,
>>> + "start %s-%d xenblkd", name, i);
>>> + return;
>>> + }
>>> + }
>>
>> Please collapse this together and just have one loop.
>>
>> And just use the same name throughout ('%s-%d')
>>
>> Also this does not take care of actually freeing the allocated
>> ring->xenblkd if one of them fails to start.
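>>
>> Something along these lines perhaps (untested sketch) - one loop, one
>> name, and stopping whatever threads did start before bailing out:
>>
>>	for (i = 0; i < blkif->nr_rings; i++) {
>>		ring = &blkif->rings[i];
>>		ring->xenblkd = kthread_run(xen_blkif_schedule, ring,
>>					    "%s-%d", name, i);
>>		if (IS_ERR(ring->xenblkd)) {
>>			err = PTR_ERR(ring->xenblkd);
>>			ring->xenblkd = NULL;
>>			xenbus_dev_error(blkif->be->dev, err,
>>					 "start %s-%d xenblkd", name, i);
>>			/* Tear down the threads that did start. */
>>			while (i--) {
>>				kthread_stop(blkif->rings[i].xenblkd);
>>				blkif->rings[i].xenblkd = NULL;
>>			}
>>			return;
>>		}
>>	}
>>
>> (with i unsigned, hence the 'while (i--)' countdown.)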
>>
>> Hmm, looking at this function... we seem to forget to change the
>> state if something fails.
>>
>> As in, connect switches the state to Connected. And if anything blows
>> up after the connect() we don't switch the state. We do report an
>> error in XenBus, but the state is the same.
>>
>> We should be using xenbus_dev_fatal I believe - so at least the state
>> changes to Closing (and then the frontend can switch itself to
>> Closing->Closed) - and we will call 'xen_blkif_disconnect' on Closed.
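>>
>> I.e. in the error path above, a one-line change (sketch):
>>
>>	xenbus_dev_fatal(blkif->be->dev, err,
>>			 "start %s-%d xenblkd", name, i);
>>
>> so the state moves to Closing, the frontend can go Closing->Closed,
>> and we end up in xen_blkif_disconnect on Closed.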
>
>Will update.
>
>>> + }
>>> +}
>>> +
>>> +static int xen_blkif_alloc_rings(struct xen_blkif *blkif)
>>> +{
>>> + int r;
>>
>> unsigned int i;
>>
>>> +
>>> + blkif->rings = kzalloc(blkif->nr_rings * sizeof(struct xen_blkif_ring), GFP_KERNEL);
>>> + if (!blkif->rings)
>>> + return -ENOMEM;
>>> +
>>> + for (r = 0; r < blkif->nr_rings; r++) {
>>> + struct xen_blkif_ring *ring = &blkif->rings[r];
>>> +
>>> + spin_lock_init(&ring->blk_ring_lock);
>>> + init_waitqueue_head(&ring->wq);
>>> + INIT_LIST_HEAD(&ring->pending_free);
>>> +
>>> + spin_lock_init(&ring->pending_free_lock);
>>> + init_waitqueue_head(&ring->pending_free_wq);
>>> + init_waitqueue_head(&ring->shutdown_wq);
>>> + ring->blkif = blkif;
>>> + xen_blkif_get(blkif);
>>> }
>>> +
>>> + return 0;
>>> }
>>>
>>> static struct xen_blkif *xen_blkif_alloc(domid_t domid)
>>> {
>>> struct xen_blkif *blkif;
>>> - struct xen_blkif_ring *ring;
>>>
>>> BUILD_BUG_ON(MAX_INDIRECT_PAGES > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);
>>>
>>> @@ -136,27 +175,17 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
>>> blkif->domid = domid;
>>> atomic_set(&blkif->refcnt, 1);
>>> init_completion(&blkif->drain_complete);
>>> - atomic_set(&blkif->drain, 0);
>>> INIT_WORK(&blkif->free_work, xen_blkif_deferred_free);
>>> spin_lock_init(&blkif->free_pages_lock);
>>> INIT_LIST_HEAD(&blkif->free_pages);
>>> - blkif->free_pages_num = 0;
>>> - blkif->persistent_gnts.rb_node = NULL;
>>> INIT_LIST_HEAD(&blkif->persistent_purge_list);
>>> - atomic_set(&blkif->persistent_gnt_in_use, 0);
>>> INIT_WORK(&blkif->persistent_purge_work, xen_blkbk_unmap_purged_grants);
>>>
>>> - ring = &blkif->ring;
>>> - ring->blkif = blkif;
>>> - spin_lock_init(&ring->blk_ring_lock);
>>> - init_waitqueue_head(&ring->wq);
>>> - ring->st_print = jiffies;
>>> - atomic_set(&ring->inflight, 0);
>>> -
>>> - INIT_LIST_HEAD(&ring->pending_free);
>>> - spin_lock_init(&ring->pending_free_lock);
>>> - init_waitqueue_head(&ring->pending_free_wq);
>>> - init_waitqueue_head(&ring->shutdown_wq);
>>> + blkif->nr_rings = 1;
>>> + if (xen_blkif_alloc_rings(blkif)) {
>>> + kmem_cache_free(xen_blkif_cachep, blkif);
>>> + return ERR_PTR(-ENOMEM);
>>> + }
>>>
>>> return blkif;
>>> }
>>> @@ -218,50 +247,54 @@ static int xen_blkif_map(struct xen_blkif_ring *ring, grant_ref_t *gref,
>>> static int xen_blkif_disconnect(struct xen_blkif *blkif)
>>> {
>>> struct pending_req *req, *n;
>>> - int i = 0, j;
>>> - struct xen_blkif_ring *ring = &blkif->ring;
>>> + int j, r;
>>>
>>
>> unsigned int i;
>>> - if (ring->xenblkd) {
>>> - kthread_stop(ring->xenblkd);
>>> - wake_up(&ring->shutdown_wq);
>>> - ring->xenblkd = NULL;
>>> - }
>>> + for (r = 0; r < blkif->nr_rings; r++) {
>>> + struct xen_blkif_ring *ring = &blkif->rings[r];
>>> + int i = 0;
>>
>> unsigned int
>>>
>>> - /* The above kthread_stop() guarantees that at this point we
>>> - * don't have any discard_io or other_io requests. So, checking
>>> - * for inflight IO is enough.
>>> - */
>>> - if (atomic_read(&ring->inflight) > 0)
>>> - return -EBUSY;
>>> + if (ring->xenblkd) {
>>> + kthread_stop(ring->xenblkd);
>>> + wake_up(&ring->shutdown_wq);
>>> + ring->xenblkd = NULL;
>>> + }
>>>
>>> - if (ring->irq) {
>>> - unbind_from_irqhandler(ring->irq, ring);
>>> - ring->irq = 0;
>>> - }
>>> + /* The above kthread_stop() guarantees that at this point we
>>> + * don't have any discard_io or other_io requests. So, checking
>>> + * for inflight IO is enough.
>>> + */
>>> + if (atomic_read(&ring->inflight) > 0)
>>> + return -EBUSY;
>>>
>>> - if (ring->blk_rings.common.sring) {
>>> - xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring);
>>> - ring->blk_rings.common.sring = NULL;
>>> - }
>>> + if (ring->irq) {
>>> + unbind_from_irqhandler(ring->irq, ring);
>>> + ring->irq = 0;
>>> + }
>>>
>>> - /* Remove all persistent grants and the cache of ballooned pages. */
>>> - xen_blkbk_free_caches(ring);
>>> + if (ring->blk_rings.common.sring) {
>>> + xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring);
>>> + ring->blk_rings.common.sring = NULL;
>>> + }
>>>
>>> - /* Check that there is no request in use */
>>> - list_for_each_entry_safe(req, n, &ring->pending_free, free_list) {
>>> - list_del(&req->free_list);
>>> + /* Remove all persistent grants and the cache of ballooned pages. */
>>> + xen_blkbk_free_caches(ring);
>>>
>>> - for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++)
>>> - kfree(req->segments[j]);
>>> + /* Check that there is no request in use */
>>> + list_for_each_entry_safe(req, n, &ring->pending_free, free_list) {
>>> + list_del(&req->free_list);
>>>
>>> - for (j = 0; j < MAX_INDIRECT_PAGES; j++)
>>> - kfree(req->indirect_pages[j]);
>>> + for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++)
>>> + kfree(req->segments[j]);
>>>
>>> - kfree(req);
>>> - i++;
>>> - }
>>> + for (j = 0; j < MAX_INDIRECT_PAGES; j++)
>>> + kfree(req->indirect_pages[j]);
>>>
>>> - WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages));
>>> + kfree(req);
>>> + i++;
>>> + }
>>> +
>>> + WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages));
>>> + }
>>> blkif->nr_ring_pages = 0;
>>>
>>> return 0;
>>> @@ -281,6 +314,7 @@ static void xen_blkif_free(struct xen_blkif *blkif)
>>> BUG_ON(!list_empty(&blkif->free_pages));
>>> BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
>>>
>>> + kfree(blkif->rings);
>>> kmem_cache_free(xen_blkif_cachep, blkif);
>>> }
>>>
>>> @@ -307,15 +341,19 @@ int __init xen_blkif_interface_init(void)
>>> struct xenbus_device *dev = to_xenbus_device(_dev); \
>>> struct backend_info *be = dev_get_drvdata(&dev->dev); \
>>> struct xen_blkif *blkif = be->blkif; \
>>> - struct xen_blkif_ring *ring = &blkif->ring; \
>>> + int i; \
>>> + \
>>> + for (i = 0; i < blkif->nr_rings; i++) { \
>>> + struct xen_blkif_ring *ring = &blkif->rings[i]; \
>>> \
>>> - blkif->st_oo_req = ring->st_oo_req; \
>>> - blkif->st_rd_req = ring->st_rd_req; \
>>> - blkif->st_wr_req = ring->st_wr_req; \
>>> - blkif->st_f_req = ring->st_f_req; \
>>> - blkif->st_ds_req = ring->st_ds_req; \
>>> - blkif->st_rd_sect = ring->st_rd_sect; \
>>> - blkif->st_wr_sect = ring->st_wr_sect; \
>>> + blkif->st_oo_req += ring->st_oo_req; \
>>> + blkif->st_rd_req += ring->st_rd_req; \
>>> + blkif->st_wr_req += ring->st_wr_req; \
>>> + blkif->st_f_req += ring->st_f_req; \
>>> + blkif->st_ds_req += ring->st_ds_req; \
>>> + blkif->st_rd_sect += ring->st_rd_sect; \
>>> + blkif->st_wr_sect += ring->st_wr_sect; \
>>> + } \
>>
>> That needs fixing. You could alter the macro to only read the values
>> from the proper member.
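>>
>> Roughly something like this untested sketch (the VBD_SHOW_ALLRING name
>> is made up, and it assumes "%llu\n"-style formats) - sum into a local
>> instead of accumulating into blkif->st_*, which with the '+=' above
>> would otherwise keep growing on every sysfs read:
>>
>>	#define VBD_SHOW_ALLRING(name, format)				\
>>	static ssize_t show_##name(struct device *_dev,			\
>>				   struct device_attribute *attr,	\
>>				   char *buf)				\
>>	{								\
>>		struct xenbus_device *dev = to_xenbus_device(_dev);	\
>>		struct backend_info *be = dev_get_drvdata(&dev->dev);	\
>>		struct xen_blkif *blkif = be->blkif;			\
>>		unsigned long long result = 0;				\
>>		unsigned int i;						\
>>									\
>>		/* Sum the per-ring counter across all rings. */	\
>>		for (i = 0; i < blkif->nr_rings; i++)			\
>>			result += blkif->rings[i].st_##name;		\
>>									\
>>		return sprintf(buf, format, result);			\
>>	}								\
>>	static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)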
>
>Do you think we still need these debug values per-ring?
>I'm thinking of just leaving them per-device (blkif), but wouldn't that
>require an extra lock to protect them?

It should be per device. However, I think you are OK as far as locking goes - as long as the sysfs entries are torn down before we deallocate the rings.
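
(IIRC xen_blkbk_remove() already does xenvbd_sysfs_delif() before it
disconnects and frees the blkif, so by the time kfree(blkif->rings) runs
no show routine can still be walking the rings - but worth double-checking.)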


