[PATCH RFC 17/21] blk-mq: Introduce 1:N hardware contexts

From: Alexander Gordeev
Date: Fri Sep 16 2016 - 04:54:14 EST


This is the first change in a bid to enable mapping multiple
device hardware queues to a single CPU.

It introduces the concept of a 1:1 low-level hardware context
(one low-level hardware context per device hardware queue), as
opposed to a 1:N hardware context (one hardware context serving
N device hardware queues). The latter essentially replaces what
is currently a 1:1 hardware context.
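
As a condensed sketch of what the hunks against include/linux/blk-mq.h
below add (fields unrelated to the mapping are trimmed), the new
low-level context, the flexible array it lives in, and the helper that
recovers the parent hctx look like this:

    /* One low-level hardware context per device hardware queue. */
    struct blk_mq_llhw_ctx {
    	int	index;		/* position within the parent hctx's array */
    	int	queue_id;	/* device hardware queue this maps to */
    	void	*driver_data;	/* was blk_mq_hw_ctx::driver_data */
    };

    struct blk_mq_hw_ctx {
    	/* ... existing fields unchanged ... */
    	unsigned int		nr_llhw_ctx;
    	struct blk_mq_llhw_ctx	llhw_ctxs[0];	/* flexible array member */
    };

    /*
     * Walk back from any array element to the containing hctx: subtract
     * the element's index to reach llhw_ctxs[0], then subtract the offset
     * of the array within struct blk_mq_hw_ctx.
     */
    static inline
    struct blk_mq_hw_ctx *blk_mq_to_hctx(struct blk_mq_llhw_ctx *llhw_ctx)
    {
    	struct blk_mq_llhw_ctx *llhw_ctx_0 = llhw_ctx - llhw_ctx->index;

    	return (void *)llhw_ctx_0 - offsetof(struct blk_mq_hw_ctx, llhw_ctxs);
    }

The queue_rq, init_hctx, exit_hctx and poll callbacks now take the
low-level context, so drivers that still need the parent hctx (for
hctx->queue, hctx->tags and the like) go through blk_mq_to_hctx().
This patch keeps nr_llhw_ctx fixed at 1, so behaviour is unchanged.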

CC: Jens Axboe <axboe@xxxxxxxxx>
CC: linux-nvme@xxxxxxxxxxxxxxxxxxx
Signed-off-by: Alexander Gordeev <agordeev@xxxxxxxxxx>
---
block/blk-core.c | 3 ++-
block/blk-mq.c | 32 +++++++++++++++++++++++---------
drivers/block/loop.c | 2 +-
drivers/block/mtip32xx/mtip32xx.c | 3 ++-
drivers/block/null_blk.c | 11 +++++------
drivers/block/rbd.c | 2 +-
drivers/block/virtio_blk.c | 5 +++--
drivers/block/xen-blkfront.c | 5 +++--
drivers/md/dm-rq.c | 3 ++-
drivers/nvme/host/pci.c | 27 +++++++++++++++------------
drivers/scsi/scsi_lib.c | 3 ++-
include/linux/blk-mq.h | 27 +++++++++++++++++++++------
12 files changed, 80 insertions(+), 43 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 36c7ac3..bf4f196 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -3314,11 +3314,12 @@ bool blk_poll(struct request_queue *q, blk_qc_t cookie)
while (!need_resched()) {
unsigned int queue_num = blk_qc_t_to_queue_num(cookie);
struct blk_mq_hw_ctx *hctx = q->queue_hw_ctx[queue_num];
+ struct blk_mq_llhw_ctx *llhw_ctx = &hctx->llhw_ctxs[0];
int ret;

hctx->poll_invoked++;

- ret = q->mq_ops->poll(hctx, blk_qc_t_to_tag(cookie));
+ ret = q->mq_ops->poll(llhw_ctx, blk_qc_t_to_tag(cookie));
if (ret > 0) {
hctx->poll_success++;
set_current_state(TASK_RUNNING);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c27e64e..274eab8 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -838,7 +838,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
bd.list = dptr;
bd.last = list_empty(&rq_list);

- ret = q->mq_ops->queue_rq(hctx, &bd);
+ ret = q->mq_ops->queue_rq(&hctx->llhw_ctxs[0], &bd);
switch (ret) {
case BLK_MQ_RQ_QUEUE_OK:
queued++;
@@ -1266,7 +1266,7 @@ static int blk_mq_direct_issue_request(struct request *rq, blk_qc_t *cookie)
* error (busy), just add it to our list as we previously
* would have done
*/
- ret = q->mq_ops->queue_rq(hctx, &bd);
+ ret = q->mq_ops->queue_rq(&hctx->llhw_ctxs[0], &bd);
if (ret == BLK_MQ_RQ_QUEUE_OK) {
*cookie = new_cookie;
return 0;
@@ -1661,6 +1661,8 @@ static void blk_mq_exit_hctx(struct request_queue *q,
struct blk_mq_tag_set *set,
struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
+ int i;
+
blk_mq_tag_idle(hctx);

if (set->ops->exit_request)
@@ -1669,7 +1671,8 @@ static void blk_mq_exit_hctx(struct request_queue *q,
BLK_MQ_MAX_DEPTH + hctx_idx);

if (set->ops->exit_hctx)
- set->ops->exit_hctx(hctx, hctx_idx);
+ for (i = 0; i < hctx->nr_llhw_ctx; i++)
+ set->ops->exit_hctx(&hctx->llhw_ctxs[i]);

blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
blk_free_flush_queue(hctx->fq);
@@ -1696,13 +1699,16 @@ static struct blk_mq_hw_ctx *blk_mq_init_hctx(struct request_queue *q,
struct blk_mq_tag_set *set, unsigned hctx_idx)
{
struct blk_mq_hw_ctx *hctx;
+ unsigned int nr_llhw_ctx = 1;
int node;
+ int i;

node = blk_mq_hw_queue_to_node(q->mq_map, hctx_idx);
if (node == NUMA_NO_NODE)
node = set->numa_node;

- hctx = kzalloc_node(sizeof(*hctx), GFP_KERNEL, node);
+ hctx = kzalloc_node(sizeof(*hctx) +
+ nr_llhw_ctx * sizeof(hctx->llhw_ctxs[0]), GFP_KERNEL, node);
if (!hctx)
return NULL;

@@ -1734,6 +1740,7 @@ static struct blk_mq_hw_ctx *blk_mq_init_hctx(struct request_queue *q,
hctx->queue = q;
hctx->queue_num = hctx_idx;
hctx->nr_ctx = 0;
+ hctx->nr_llhw_ctx = nr_llhw_ctx;
hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
hctx->tags = set->tags[hctx_idx];

@@ -1741,9 +1748,16 @@ static struct blk_mq_hw_ctx *blk_mq_init_hctx(struct request_queue *q,
blk_mq_hctx_notify, hctx);
blk_mq_register_cpu_notifier(&hctx->cpu_notifier);

- if (set->ops->init_hctx &&
- set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
- goto unregister_cpu_notifier;
+ for (i = 0; i < hctx->nr_llhw_ctx; i++) {
+ struct blk_mq_llhw_ctx *llhw_ctx = &hctx->llhw_ctxs[i];
+
+ llhw_ctx->index = i;
+ llhw_ctx->queue_id = hctx_idx;
+
+ if (set->ops->init_hctx &&
+ set->ops->init_hctx(llhw_ctx, set->driver_data))
+ goto exit_hctx;
+ }

if (set->ops->init_request &&
set->ops->init_request(set->driver_data,
@@ -1755,8 +1769,8 @@ static struct blk_mq_hw_ctx *blk_mq_init_hctx(struct request_queue *q,

exit_hctx:
if (set->ops->exit_hctx)
- set->ops->exit_hctx(hctx, hctx_idx);
- unregister_cpu_notifier:
+ for (i--; i >= 0; i--)
+ set->ops->exit_hctx(&hctx->llhw_ctxs[i]);
blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
kfree(hctx->fq);
free_bitmap:
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index cbdb3b1..f290c64 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -1637,7 +1637,7 @@ int loop_unregister_transfer(int number)
EXPORT_SYMBOL(loop_register_transfer);
EXPORT_SYMBOL(loop_unregister_transfer);

-static int loop_queue_rq(struct blk_mq_hw_ctx *hctx,
+static int loop_queue_rq(struct blk_mq_llhw_ctx *llhw_ctx,
const struct blk_mq_queue_data *bd)
{
struct loop_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 3cc92e9..5d7c17d 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -3805,9 +3805,10 @@ static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx,
return false;
}

-static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx,
+static int mtip_queue_rq(struct blk_mq_llhw_ctx *llhw_ctx,
const struct blk_mq_queue_data *bd)
{
+ struct blk_mq_hw_ctx *hctx = blk_mq_to_hctx(llhw_ctx);
struct request *rq = bd->rq;
int ret;

diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index 7d3b7d6..1747040 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -351,7 +351,7 @@ static void null_request_fn(struct request_queue *q)
}
}

-static int null_queue_rq(struct blk_mq_hw_ctx *hctx,
+static int null_queue_rq(struct blk_mq_llhw_ctx *llhw_ctx,
const struct blk_mq_queue_data *bd)
{
struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
@@ -361,7 +361,7 @@ static int null_queue_rq(struct blk_mq_hw_ctx *hctx,
cmd->timer.function = null_cmd_timer_expired;
}
cmd->rq = bd->rq;
- cmd->nq = hctx->driver_data;
+ cmd->nq = llhw_ctx->driver_data;

blk_mq_start_request(bd->rq);

@@ -378,13 +378,12 @@ static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq)
nq->queue_depth = nullb->queue_depth;
}

-static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
- unsigned int index)
+static int null_init_hctx(struct blk_mq_llhw_ctx *llhw_ctx, void *data)
{
struct nullb *nullb = data;
- struct nullb_queue *nq = &nullb->queues[index];
+ struct nullb_queue *nq = &nullb->queues[llhw_ctx->queue_id];

- hctx->driver_data = nq;
+ llhw_ctx->driver_data = nq;
null_init_queue(nullb, nq);
nullb->nr_queues++;

diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index c1f84df..7dd5e0e 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -3383,7 +3383,7 @@ err:
blk_mq_end_request(rq, result);
}

-static int rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
+static int rbd_queue_rq(struct blk_mq_llhw_ctx *llhw_ctx,
const struct blk_mq_queue_data *bd)
{
struct request *rq = bd->rq;
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 2dc5c96..9cc26c7 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -157,15 +157,16 @@ static void virtblk_done(struct virtqueue *vq)
spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
}

-static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
+static int virtio_queue_rq(struct blk_mq_llhw_ctx *llhw_ctx,
const struct blk_mq_queue_data *bd)
{
+ struct blk_mq_hw_ctx *hctx = blk_mq_to_hctx(llhw_ctx);
struct virtio_blk *vblk = hctx->queue->queuedata;
struct request *req = bd->rq;
struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
unsigned long flags;
unsigned int num;
- int qid = hctx->queue_num;
+ int qid = llhw_ctx->queue_id;
int err;
bool notify = false;

diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 9908597..784c4d5 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -872,11 +872,12 @@ static inline bool blkif_request_flush_invalid(struct request *req,
!info->feature_fua));
}

-static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
+static int blkif_queue_rq(struct blk_mq_llhw_ctx *llhw_ctx,
const struct blk_mq_queue_data *qd)
{
unsigned long flags;
- int qid = hctx->queue_num;
+ int qid = llhw_ctx->queue_id;
+ struct blk_mq_hw_ctx *hctx = blk_mq_to_hctx(llhw_ctx);
struct blkfront_info *info = hctx->queue->queuedata;
struct blkfront_ring_info *rinfo = NULL;

diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index d1c3645..b074137 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -855,9 +855,10 @@ static int dm_mq_init_request(void *data, struct request *rq,
return 0;
}

-static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
+static int dm_mq_queue_rq(struct blk_mq_llhw_ctx *llhw_ctx,
const struct blk_mq_queue_data *bd)
{
+ struct blk_mq_hw_ctx *hctx = blk_mq_to_hctx(llhw_ctx);
struct request *rq = bd->rq;
struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
struct mapped_device *md = tio->md;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 086fd7e..eef2e40 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -201,9 +201,10 @@ static unsigned int nvme_cmd_size(struct nvme_dev *dev)
nvme_iod_alloc_size(dev, NVME_INT_BYTES(dev), NVME_INT_PAGES);
}

-static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
- unsigned int hctx_idx)
+static int nvme_admin_init_hctx(struct blk_mq_llhw_ctx *llhw_ctx, void *data)
{
+ struct blk_mq_hw_ctx *hctx = blk_mq_to_hctx(llhw_ctx);
+ unsigned int hctx_idx = llhw_ctx->queue_id;
struct nvme_dev *dev = data;
struct nvme_queue *nvmeq = dev->queues[0];

@@ -211,14 +212,14 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
WARN_ON(nvmeq->tags);

- hctx->driver_data = nvmeq;
+ llhw_ctx->driver_data = nvmeq;
nvmeq->tags = &dev->admin_tagset.tags[0];
return 0;
}

-static void nvme_admin_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
+static void nvme_admin_exit_hctx(struct blk_mq_llhw_ctx *llhw_ctx)
{
- struct nvme_queue *nvmeq = hctx->driver_data;
+ struct nvme_queue *nvmeq = llhw_ctx->driver_data;

nvmeq->tags = NULL;
}
@@ -236,9 +237,10 @@ static int nvme_admin_init_request(void *data, struct request *req,
return 0;
}

-static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
- unsigned int hctx_idx)
+static int nvme_init_hctx(struct blk_mq_llhw_ctx *llhw_ctx, void *data)
{
+ struct blk_mq_hw_ctx *hctx = blk_mq_to_hctx(llhw_ctx);
+ unsigned int hctx_idx = llhw_ctx->queue_id;
struct nvme_dev *dev = data;
struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];

@@ -246,7 +248,7 @@ static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
nvmeq->tags = &dev->tagset.tags[hctx_idx];

WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
- hctx->driver_data = nvmeq;
+ llhw_ctx->driver_data = nvmeq;
return 0;
}

@@ -558,11 +560,12 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
/*
* NOTE: ns is NULL when called on the admin queue.
*/
-static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
+static int nvme_queue_rq(struct blk_mq_llhw_ctx *llhw_ctx,
const struct blk_mq_queue_data *bd)
{
+ struct blk_mq_hw_ctx *hctx = blk_mq_to_hctx(llhw_ctx);
struct nvme_ns *ns = hctx->queue->queuedata;
- struct nvme_queue *nvmeq = hctx->driver_data;
+ struct nvme_queue *nvmeq = llhw_ctx->driver_data;
struct nvme_dev *dev = nvmeq->dev;
struct request *req = bd->rq;
struct nvme_command cmnd;
@@ -742,9 +745,9 @@ static irqreturn_t nvme_irq_check(int irq, void *data)
return IRQ_NONE;
}

-static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
+static int nvme_poll(struct blk_mq_llhw_ctx *llhw_ctx, unsigned int tag)
{
- struct nvme_queue *nvmeq = hctx->driver_data;
+ struct nvme_queue *nvmeq = llhw_ctx->driver_data;

if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) {
spin_lock_irq(&nvmeq->q_lock);
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 2cca9cf..0019213 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -1876,9 +1876,10 @@ static void scsi_mq_done(struct scsi_cmnd *cmd)
blk_mq_complete_request(cmd->request, cmd->request->errors);
}

-static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
+static int scsi_queue_rq(struct blk_mq_llhw_ctx *llhw_ctx,
const struct blk_mq_queue_data *bd)
{
+ struct blk_mq_hw_ctx *hctx = blk_mq_to_hctx(llhw_ctx);
struct request *req = bd->rq;
struct request_queue *q = req->q;
struct scsi_device *sdev = q->queuedata;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 6c7ee56..2c3392b 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -18,6 +18,12 @@ struct blk_mq_ctxmap {
struct blk_align_bitmap *map;
};

+struct blk_mq_llhw_ctx {
+ int index;
+ int queue_id;
+ void *driver_data;
+};
+
struct blk_mq_hw_ctx {
struct {
spinlock_t lock;
@@ -36,8 +42,6 @@ struct blk_mq_hw_ctx {
struct request_queue *queue;
struct blk_flush_queue *fq;

- void *driver_data;
-
struct blk_mq_ctxmap ctx_map;

unsigned int nr_ctx;
@@ -62,8 +66,19 @@ struct blk_mq_hw_ctx {

unsigned long poll_invoked;
unsigned long poll_success;
+
+ unsigned int nr_llhw_ctx;
+ struct blk_mq_llhw_ctx llhw_ctxs[0];
};

+static inline
+struct blk_mq_hw_ctx *blk_mq_to_hctx(struct blk_mq_llhw_ctx *llhw_ctx)
+{
+ struct blk_mq_llhw_ctx *llhw_ctx_0 = llhw_ctx - llhw_ctx->index;
+
+ return (void *)llhw_ctx_0 - offsetof(struct blk_mq_hw_ctx, llhw_ctxs);
+}
+
struct blk_mq_tag_set {
struct blk_mq_ops *ops;
unsigned int nr_hw_queues;
@@ -87,11 +102,11 @@ struct blk_mq_queue_data {
bool last;
};

-typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data *);
+typedef int (queue_rq_fn)(struct blk_mq_llhw_ctx *, const struct blk_mq_queue_data *);
typedef struct blk_mq_hw_ctx *(map_queue_fn)(struct request_queue *, const int);
typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool);
-typedef int (init_hctx_fn)(struct blk_mq_hw_ctx *, void *, unsigned int);
-typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int);
+typedef int (init_hctx_fn)(struct blk_mq_llhw_ctx *, void *);
+typedef void (exit_hctx_fn)(struct blk_mq_llhw_ctx *);
typedef int (init_request_fn)(void *, struct request *, unsigned int,
unsigned int, unsigned int);
typedef void (exit_request_fn)(void *, struct request *, unsigned int,
@@ -101,7 +116,7 @@ typedef int (reinit_request_fn)(void *, struct request *);
typedef void (busy_iter_fn)(struct blk_mq_hw_ctx *, struct request *, void *,
bool);
typedef void (busy_tag_iter_fn)(struct request *, void *, bool);
-typedef int (poll_fn)(struct blk_mq_hw_ctx *, unsigned int);
+typedef int (poll_fn)(struct blk_mq_llhw_ctx *, unsigned int);


struct blk_mq_ops {
--
1.8.3.1