[PATCH RFC 19/21] blk-mq: Enable combined hardware queues

From: Alexander Gordeev
Date: Fri Sep 16 2016 - 04:53:45 EST


This is the third step in a bid to enable mapping of multiple
device hardware queues to a single CPU.

It introduces a combined hardware context - one consisting of
multiple low-level hardware contexts. As a result, queue depths deeper
than the device hardware queue depth are made possible (but not
yet allowed).

CC: Jens Axboe <axboe@xxxxxxxxx>
CC: linux-nvme@xxxxxxxxxxxxxxxxxxx
Signed-off-by: Alexander Gordeev <agordeev@xxxxxxxxxx>
---
block/blk-mq-tag.c | 4 +-
block/blk-mq.c | 150 +++++++++++++++----------------------------------
include/linux/blk-mq.h | 5 ++
3 files changed, 51 insertions(+), 108 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 1602813..e987a6b 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -477,7 +477,7 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
{
int i;

- for (i = 0; i < tagset->nr_hw_queues; i++) {
+ for (i = 0; i < tagset->nr_co_queues; i++) {
if (tagset->tags && tagset->tags[i])
blk_mq_all_tag_busy_iter(tagset->tags[i], fn, priv);
}
@@ -491,7 +491,7 @@ int blk_mq_reinit_tagset(struct blk_mq_tag_set *set)
if (!set->ops->reinit_request)
goto out;

- for (i = 0; i < set->nr_hw_queues; i++) {
+ for (i = 0; i < set->nr_co_queues; i++) {
struct blk_mq_tags *tags = set->tags[i];

for (j = 0; j < tags->nr_tags; j++) {
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6d055ec..450a3ed 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1499,22 +1499,27 @@ static size_t order_to_size(unsigned int order)
return (size_t)PAGE_SIZE << order;
}

+static unsigned int queue_depth(struct blk_mq_tag_set *set)
+{
+ return set->queue_depth * set->co_queue_size;
+}
+
static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
unsigned int hctx_idx)
{
struct blk_mq_tags *tags;
unsigned int i, j, entries_per_page, max_order = 4;
size_t rq_size, left;
+ unsigned int depth = queue_depth(set);

- tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
- set->numa_node,
+ tags = blk_mq_init_tags(depth, set->reserved_tags, set->numa_node,
BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
if (!tags)
return NULL;

INIT_LIST_HEAD(&tags->page_list);

- tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
+ tags->rqs = kzalloc_node(depth * sizeof(struct request *),
GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
set->numa_node);
if (!tags->rqs) {
@@ -1528,9 +1533,9 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
*/
rq_size = round_up(sizeof(struct request) + set->cmd_size,
cache_line_size());
- left = rq_size * set->queue_depth;
+ left = rq_size * depth;

- for (i = 0; i < set->queue_depth; ) {
+ for (i = 0; i < depth; ) {
int this_order = max_order;
struct page *page;
int to_do;
@@ -1564,7 +1569,7 @@ static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
*/
kmemleak_alloc(p, order_to_size(this_order), 1, GFP_KERNEL);
entries_per_page = order_to_size(this_order) / rq_size;
- to_do = min(entries_per_page, set->queue_depth - i);
+ to_do = min(entries_per_page, depth - i);
left -= to_do * rq_size;
for (j = 0; j < to_do; j++) {
tags->rqs[i] = p;
@@ -1703,7 +1708,7 @@ static struct blk_mq_hw_ctx *blk_mq_init_hctx(struct request_queue *q,
struct blk_mq_tag_set *set, unsigned hctx_idx)
{
struct blk_mq_hw_ctx *hctx;
- unsigned int nr_llhw_ctx = 1;
+ unsigned int nr_llhw_ctx = set->co_queue_size;
int node;
int i;

@@ -1757,7 +1762,7 @@ static struct blk_mq_hw_ctx *blk_mq_init_hctx(struct request_queue *q,
struct blk_mq_llhw_ctx *llhw_ctx = &hctx->llhw_ctxs[i];

llhw_ctx->index = i;
- llhw_ctx->queue_id = hctx_idx;
+ llhw_ctx->queue_id = (hctx_idx * set->co_queue_size) + i;

if (set->ops->init_hctx &&
set->ops->init_hctx(llhw_ctx, set->driver_data))
@@ -2005,7 +2010,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;

blk_mq_sysfs_unregister(q);
- for (i = 0; i < set->nr_hw_queues; i++) {
+ for (i = 0; i < set->nr_co_queues; i++) {
if (hctxs[i])
continue;
if (!set->tags[i])
@@ -2050,7 +2055,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
if (!q->queue_ctx)
goto err_exit;

- q->queue_hw_ctx = kzalloc_node(set->nr_hw_queues *
+ q->queue_hw_ctx = kzalloc_node(set->nr_co_queues *
sizeof(*(q->queue_hw_ctx)), GFP_KERNEL, set->numa_node);
if (!q->queue_hw_ctx)
goto err_percpu;
@@ -2090,12 +2095,12 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
/*
* Do this after blk_queue_make_request() overrides it...
*/
- q->nr_requests = set->queue_depth;
+ q->nr_requests = queue_depth(set);

if (set->ops->complete)
blk_queue_softirq_done(q, set->ops->complete);

- blk_mq_init_cpu_queues(q, set->nr_hw_queues);
+ blk_mq_init_cpu_queues(q, set->nr_co_queues);

get_online_cpus();
mutex_lock(&all_q_mutex);
@@ -2232,7 +2237,7 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
{
int i;

- for (i = 0; i < set->nr_hw_queues; i++) {
+ for (i = 0; i < set->nr_co_queues; i++) {
set->tags[i] = blk_mq_init_rq_map(set, i);
if (!set->tags[i])
goto out_unwind;
@@ -2248,38 +2253,11 @@ out_unwind:
}

/*
- * Allocate the request maps associated with this tag_set. Note that this
- * may reduce the depth asked for, if memory is tight. set->queue_depth
- * will be updated to reflect the allocated depth.
+ * TODO Restore original functionality
*/
static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
{
- unsigned int depth;
- int err;
-
- depth = set->queue_depth;
- do {
- err = __blk_mq_alloc_rq_maps(set);
- if (!err)
- break;
-
- set->queue_depth >>= 1;
- if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
- err = -ENOMEM;
- break;
- }
- } while (set->queue_depth);
-
- if (!set->queue_depth || err) {
- pr_err("blk-mq: failed to allocate request map\n");
- return -ENOMEM;
- }
-
- if (depth != set->queue_depth)
- pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
- depth, set->queue_depth);
-
- return 0;
+ return __blk_mq_alloc_rq_maps(set);
}

struct cpumask *blk_mq_tags_cpumask(struct blk_mq_tags *tags)
@@ -2291,8 +2269,7 @@ EXPORT_SYMBOL_GPL(blk_mq_tags_cpumask);
/*
* Alloc a tag set to be associated with one or more request queues.
* May fail with EINVAL for various error conditions. May adjust the
- * requested depth down, if if it too large. In that case, the set
- * value will be stored in set->queue_depth.
+ * requested depth down, if it is too large.
*/
int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
{
@@ -2302,34 +2279,32 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
return -EINVAL;
if (!set->queue_depth)
return -EINVAL;
- if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
- return -EINVAL;
-
if (!set->ops->queue_rq)
return -EINVAL;

- if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
- pr_info("blk-mq: reduced tag depth to %u\n",
- BLK_MQ_MAX_DEPTH);
- set->queue_depth = BLK_MQ_MAX_DEPTH;
- }
+ /*
+ * TODO Restore original queue depth and count limits
+ */

/*
* If a crashdump is active, then we are potentially in a very
- * memory constrained environment. Limit us to 1 queue and
- * 64 tags to prevent using too much memory.
+ * memory constrained environment. Limit us to 1 queue.
*/
- if (is_kdump_kernel()) {
- set->nr_hw_queues = 1;
- set->queue_depth = min(64U, set->queue_depth);
- }
+ set->nr_co_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues;
+ set->co_queue_size = 1;
+
+ if (queue_depth(set) < set->reserved_tags + BLK_MQ_TAG_MIN)
+ return -EINVAL;
+ if (queue_depth(set) > BLK_MQ_MAX_DEPTH)
+ return -EINVAL;
+
/*
* There is no use for more h/w queues than cpus.
*/
- if (set->nr_hw_queues > nr_cpu_ids)
- set->nr_hw_queues = nr_cpu_ids;
+ if (set->nr_co_queues > nr_cpu_ids)
+ set->nr_co_queues = nr_cpu_ids;

- set->tags = kzalloc_node(set->nr_hw_queues * sizeof(*set->tags),
+ set->tags = kzalloc_node(set->nr_co_queues * sizeof(*set->tags),
GFP_KERNEL, set->numa_node);
if (!set->tags)
return -ENOMEM;
@@ -2352,7 +2327,7 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
{
int i;

- for (i = 0; i < set->nr_hw_queues; i++) {
+ for (i = 0; i < set->nr_co_queues; i++) {
if (set->tags[i])
blk_mq_free_rq_map(set, set->tags[i], i);
}
@@ -2362,56 +2337,19 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
}
EXPORT_SYMBOL(blk_mq_free_tag_set);

+/*
+ * TODO Restore original functionality
+ */
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
{
- struct blk_mq_tag_set *set = q->tag_set;
- struct blk_mq_hw_ctx *hctx;
- int i, ret;
-
- if (!set || nr > set->queue_depth)
- return -EINVAL;
-
- ret = 0;
- queue_for_each_hw_ctx(q, hctx, i) {
- if (!hctx->tags)
- continue;
- ret = blk_mq_tag_update_depth(hctx->tags, nr);
- if (ret)
- break;
- }
-
- if (!ret)
- q->nr_requests = nr;
-
- return ret;
+ return -EINVAL;
}

+/*
+ * TODO Restore original functionality
+ */
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
{
- struct request_queue *q;
-
- if (nr_hw_queues > nr_cpu_ids)
- nr_hw_queues = nr_cpu_ids;
- if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
- return;
-
- list_for_each_entry(q, &set->tag_list, tag_set_list)
- blk_mq_freeze_queue(q);
-
- set->nr_hw_queues = nr_hw_queues;
- list_for_each_entry(q, &set->tag_list, tag_set_list) {
- blk_mq_realloc_hw_ctxs(set, q);
-
- if (q->nr_hw_queues > 1)
- blk_queue_make_request(q, blk_mq_make_request);
- else
- blk_queue_make_request(q, blk_sq_make_request);
-
- blk_mq_queue_reinit(q, cpu_online_mask);
- }
-
- list_for_each_entry(q, &set->tag_list, tag_set_list)
- blk_mq_unfreeze_queue(q);
}
EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 52a9e7c..579dfaf 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -88,8 +88,13 @@ int blk_mq_tag_to_llhw_ctx_idx(struct blk_mq_hw_ctx *hctx, unsigned int tag)

struct blk_mq_tag_set {
struct blk_mq_ops *ops;
+
unsigned int nr_hw_queues;
unsigned int queue_depth; /* max hw supported */
+
+ unsigned int nr_co_queues; /* number of combined queues */
+ unsigned int co_queue_size; /* hw queues in one combined */
+
unsigned int reserved_tags;
unsigned int cmd_size; /* per-request extra data */
int numa_node;
--
1.8.3.1