Re: [PATCHSET v4] blk-mq-scheduling framework
From: Bart Van Assche
Date: Thu Dec 22 2016 - 11:37:49 EST
On Fri, 2016-12-16 at 17:12 -0700, Jens Axboe wrote:
> From the discussion last time, I looked into the feasibility of having
> two sets of tags for the same request pool, to avoid having to copy
> some of the request fields at dispatch and completion time. To do that,
> we'd have to replace the driver tag map(s) with our own, and augment
> that with tag map(s) on the side representing the device queue depth.
> Queuing IO with the scheduler would allocate from the new map, and
> dispatching would acquire the "real" tag. We would need to change
> drivers to do this, or add an extra indirection table to map a real
> tag to the scheduler tag. We would also need a 1:1 mapping between
> scheduler and hardware tag pools, or additional info to track it.
> Unless someone can convince me otherwise, I think the current approach
> is cleaner.
Hello Jens,
Can you have a look at the attached patches? These implement the "two tags
per request" approach without a table that maps one tag type to the other
or any other ugly construct. __blk_mq_alloc_request() is modified such that
it assigns rq->sched_tag and sched_tags->rqs[] instead of rq->tag and
tags->rqs[]. rq->tag and tags->rqs[] are assigned just before dispatch by
blk_mq_assign_drv_tag(). This approach results in significantly less code
than the approach proposed in v4 of your blk-mq-sched patch series. Memory
usage is lower because only a single set of requests is allocated. The
runtime overhead is lower because request fields no longer have to be
copied between the requests owned by the block driver and the requests
owned by the I/O scheduler. I can boot a VM that uses the virtio-blk driver,
but otherwise the attached patches have not yet been tested.
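To make the intended flow concrete, here is a minimal sketch (not part of the
patches; dispatch_one() is a made-up wrapper that only restates what
blk_mq_dispatch_rq_list() does in patch 3):

/*
 * A request is allocated against hctx->sched_tags and only gets a driver
 * tag from hctx->tags when it is actually handed to the driver.
 */
static bool dispatch_one(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
	/*
	 * rq->sched_tag was set by __blk_mq_alloc_request(); rq->tag is
	 * still -1 at this point.
	 */
	if (rq->tag < 0 && blk_mq_assign_drv_tag(rq) < 0)
		return false;	/* no driver tag free; leave rq queued */

	/*
	 * Now hctx->sched_tags->rqs[rq->sched_tag] == rq and
	 * hctx->tags->rqs[rq->tag] == rq. Both tags are released again
	 * by __blk_mq_finish_request().
	 */
	return true;
}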
Thanks,
Bart.

From 0fd04112850a73f5be9fa91a29bd1791179e1e80 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@xxxxxxxxxxx>
Date: Tue, 20 Dec 2016 12:53:54 +0100
Subject: [PATCH 1/3] blk-mq: Revert some of the blk-mq-sched framework changes
Remove the functions that allocate and free shadow requests.
Remove the get_request, put_request and completed_request callback
functions from struct elevator_type. Remove blk-mq I/O scheduling
functions that become superfluous due to these changes.
Note: this patch breaks blk-mq I/O scheduling. Later patches will make
blk-mq I/O scheduling work again.
---
block/blk-mq-sched.c | 295 +----------------------------------------------
block/blk-mq-sched.h | 55 +--------
block/blk-mq.c | 50 ++++++--
block/mq-deadline.c | 90 ++-------------
include/linux/elevator.h | 3 -
5 files changed, 58 insertions(+), 435 deletions(-)
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 265e4a9cce7e..e46769db3d57 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -15,196 +15,6 @@
#include "blk-mq-tag.h"
#include "blk-wbt.h"
-/*
- * Empty set
- */
-static const struct blk_mq_ops mq_sched_tag_ops = {
-};
-
-void blk_mq_sched_free_requests(struct blk_mq_tags *tags)
-{
- blk_mq_free_rq_map(NULL, tags, 0);
-}
-EXPORT_SYMBOL_GPL(blk_mq_sched_free_requests);
-
-struct blk_mq_tags *blk_mq_sched_alloc_requests(unsigned int depth,
- unsigned int numa_node)
-{
- struct blk_mq_tag_set set = {
- .ops = &mq_sched_tag_ops,
- .nr_hw_queues = 1,
- .queue_depth = depth,
- .numa_node = numa_node,
- };
-
- return blk_mq_init_rq_map(&set, 0);
-}
-EXPORT_SYMBOL_GPL(blk_mq_sched_alloc_requests);
-
-void blk_mq_sched_free_hctx_data(struct request_queue *q,
- void (*exit)(struct blk_mq_hw_ctx *))
-{
- struct blk_mq_hw_ctx *hctx;
- int i;
-
- queue_for_each_hw_ctx(q, hctx, i) {
- if (exit)
- exit(hctx);
- kfree(hctx->sched_data);
- hctx->sched_data = NULL;
- }
-}
-EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
-
-int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
- int (*init)(struct blk_mq_hw_ctx *),
- void (*exit)(struct blk_mq_hw_ctx *))
-{
- struct blk_mq_hw_ctx *hctx;
- int ret;
- int i;
-
- queue_for_each_hw_ctx(q, hctx, i) {
- hctx->sched_data = kmalloc_node(size, GFP_KERNEL, hctx->numa_node);
- if (!hctx->sched_data) {
- ret = -ENOMEM;
- goto error;
- }
-
- if (init) {
- ret = init(hctx);
- if (ret) {
- /*
- * We don't want to give exit() a partially
- * initialized sched_data. init() must clean up
- * if it fails.
- */
- kfree(hctx->sched_data);
- hctx->sched_data = NULL;
- goto error;
- }
- }
- }
-
- return 0;
-error:
- blk_mq_sched_free_hctx_data(q, exit);
- return ret;
-}
-EXPORT_SYMBOL_GPL(blk_mq_sched_init_hctx_data);
-
-struct request *blk_mq_sched_alloc_shadow_request(struct request_queue *q,
- struct blk_mq_alloc_data *data,
- struct blk_mq_tags *tags,
- atomic_t *wait_index)
-{
- struct sbq_wait_state *ws;
- DEFINE_WAIT(wait);
- struct request *rq;
- int tag;
-
- tag = __sbitmap_queue_get(&tags->bitmap_tags);
- if (tag != -1)
- goto done;
-
- if (data->flags & BLK_MQ_REQ_NOWAIT)
- return NULL;
-
- ws = sbq_wait_ptr(&tags->bitmap_tags, wait_index);
- do {
- prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE);
-
- tag = __sbitmap_queue_get(&tags->bitmap_tags);
- if (tag != -1)
- break;
-
- blk_mq_run_hw_queue(data->hctx, false);
-
- tag = __sbitmap_queue_get(&tags->bitmap_tags);
- if (tag != -1)
- break;
-
- blk_mq_put_ctx(data->ctx);
- io_schedule();
-
- data->ctx = blk_mq_get_ctx(data->q);
- data->hctx = blk_mq_map_queue(data->q, data->ctx->cpu);
- finish_wait(&ws->wait, &wait);
- ws = sbq_wait_ptr(&tags->bitmap_tags, wait_index);
- } while (1);
-
- finish_wait(&ws->wait, &wait);
-done:
- rq = tags->rqs[tag];
- rq->tag = tag;
- rq->rq_flags = RQF_ALLOCED;
- return rq;
-}
-EXPORT_SYMBOL_GPL(blk_mq_sched_alloc_shadow_request);
-
-void blk_mq_sched_free_shadow_request(struct blk_mq_tags *tags,
- struct request *rq)
-{
- WARN_ON_ONCE(!(rq->rq_flags & RQF_ALLOCED));
- sbitmap_queue_clear(&tags->bitmap_tags, rq->tag, rq->mq_ctx->cpu);
-}
-EXPORT_SYMBOL_GPL(blk_mq_sched_free_shadow_request);
-
-static void rq_copy(struct request *rq, struct request *src)
-{
-#define FIELD_COPY(dst, src, name) ((dst)->name = (src)->name)
- FIELD_COPY(rq, src, cpu);
- FIELD_COPY(rq, src, cmd_type);
- FIELD_COPY(rq, src, cmd_flags);
- rq->rq_flags |= (src->rq_flags & (RQF_PREEMPT | RQF_QUIET | RQF_PM | RQF_DONTPREP));
- rq->rq_flags &= ~RQF_IO_STAT;
- FIELD_COPY(rq, src, __data_len);
- FIELD_COPY(rq, src, __sector);
- FIELD_COPY(rq, src, bio);
- FIELD_COPY(rq, src, biotail);
- FIELD_COPY(rq, src, rq_disk);
- FIELD_COPY(rq, src, part);
- FIELD_COPY(rq, src, issue_stat);
- src->issue_stat.time = 0;
- FIELD_COPY(rq, src, nr_phys_segments);
-#if defined(CONFIG_BLK_DEV_INTEGRITY)
- FIELD_COPY(rq, src, nr_integrity_segments);
-#endif
- FIELD_COPY(rq, src, ioprio);
- FIELD_COPY(rq, src, timeout);
-
- if (src->cmd_type == REQ_TYPE_BLOCK_PC) {
- FIELD_COPY(rq, src, cmd);
- FIELD_COPY(rq, src, cmd_len);
- FIELD_COPY(rq, src, extra_len);
- FIELD_COPY(rq, src, sense_len);
- FIELD_COPY(rq, src, resid_len);
- FIELD_COPY(rq, src, sense);
- FIELD_COPY(rq, src, retries);
- }
-
- src->bio = src->biotail = NULL;
-}
-
-static void sched_rq_end_io(struct request *rq, int error)
-{
- struct request *sched_rq = rq->end_io_data;
-
- FIELD_COPY(sched_rq, rq, resid_len);
- FIELD_COPY(sched_rq, rq, extra_len);
- FIELD_COPY(sched_rq, rq, sense_len);
- FIELD_COPY(sched_rq, rq, errors);
- FIELD_COPY(sched_rq, rq, retries);
-
- blk_account_io_completion(sched_rq, blk_rq_bytes(sched_rq));
- blk_account_io_done(sched_rq);
-
- if (sched_rq->end_io)
- sched_rq->end_io(sched_rq, error);
-
- blk_mq_finish_request(rq);
-}
-
static inline struct request *
__blk_mq_sched_alloc_request(struct blk_mq_hw_ctx *hctx)
{
@@ -225,55 +35,6 @@ __blk_mq_sched_alloc_request(struct blk_mq_hw_ctx *hctx)
return rq;
}
-static inline void
-__blk_mq_sched_init_request_from_shadow(struct request *rq,
- struct request *sched_rq)
-{
- WARN_ON_ONCE(!(sched_rq->rq_flags & RQF_ALLOCED));
- rq_copy(rq, sched_rq);
- rq->end_io = sched_rq_end_io;
- rq->end_io_data = sched_rq;
-}
-
-struct request *
-blk_mq_sched_request_from_shadow(struct blk_mq_hw_ctx *hctx,
- struct request *(*get_sched_rq)(struct blk_mq_hw_ctx *))
-{
- struct request *rq, *sched_rq;
-
- rq = __blk_mq_sched_alloc_request(hctx);
- if (!rq)
- return NULL;
-
- sched_rq = get_sched_rq(hctx);
- if (sched_rq) {
- __blk_mq_sched_init_request_from_shadow(rq, sched_rq);
- return rq;
- }
-
- /*
- * __blk_mq_finish_request() drops a queue ref we already hold,
- * so grab an extra one.
- */
- blk_queue_enter_live(hctx->queue);
- __blk_mq_finish_request(hctx, rq->mq_ctx, rq);
- return NULL;
-}
-EXPORT_SYMBOL_GPL(blk_mq_sched_request_from_shadow);
-
-struct request *__blk_mq_sched_request_from_shadow(struct blk_mq_hw_ctx *hctx,
- struct request *sched_rq)
-{
- struct request *rq;
-
- rq = __blk_mq_sched_alloc_request(hctx);
- if (rq)
- __blk_mq_sched_init_request_from_shadow(rq, sched_rq);
-
- return rq;
-}
-EXPORT_SYMBOL_GPL(__blk_mq_sched_request_from_shadow);
-
static void __blk_mq_sched_assign_ioc(struct request_queue *q,
struct request *rq, struct io_context *ioc)
{
@@ -298,8 +59,8 @@ static void __blk_mq_sched_assign_ioc(struct request_queue *q,
rq->elv.icq = NULL;
}
-static void blk_mq_sched_assign_ioc(struct request_queue *q,
- struct request *rq, struct bio *bio)
+void blk_mq_sched_assign_ioc(struct request_queue *q, struct request *rq,
+ struct bio *bio)
{
struct io_context *ioc;
@@ -308,44 +69,9 @@ static void blk_mq_sched_assign_ioc(struct request_queue *q,
__blk_mq_sched_assign_ioc(q, rq, ioc);
}
-struct request *blk_mq_sched_get_request(struct request_queue *q,
- struct bio *bio,
- unsigned int op,
- struct blk_mq_alloc_data *data)
-{
- struct elevator_queue *e = q->elevator;
- struct blk_mq_hw_ctx *hctx;
- struct blk_mq_ctx *ctx;
- struct request *rq;
-
- blk_queue_enter_live(q);
- ctx = blk_mq_get_ctx(q);
- hctx = blk_mq_map_queue(q, ctx->cpu);
-
- blk_mq_set_alloc_data(data, q, 0, ctx, hctx);
-
- if (e && e->type->ops.mq.get_request)
- rq = e->type->ops.mq.get_request(q, op, data);
- else
- rq = __blk_mq_alloc_request(data, op);
-
- if (rq) {
- rq->elv.icq = NULL;
- if (e && e->type->icq_cache)
- blk_mq_sched_assign_ioc(q, rq, bio);
- data->hctx->queued++;
- return rq;
- }
-
- blk_queue_exit(q);
- return NULL;
-}
-
void blk_mq_sched_put_request(struct request *rq)
{
struct request_queue *q = rq->q;
- struct elevator_queue *e = q->elevator;
- bool has_queue_ref = false, do_free = false;
wbt_done(q->rq_wb, &rq->issue_stat);
@@ -357,22 +83,7 @@ void blk_mq_sched_put_request(struct request *rq)
}
}
- /*
- * If we are freeing a shadow that hasn't been started, then drop
- * our queue ref on it. This normally happens at IO completion
- * time, but if we merge request-to-request, then this 'rq' will
- * never get started or completed.
- */
- if (blk_mq_sched_rq_is_shadow(rq) && !(rq->rq_flags & RQF_STARTED))
- has_queue_ref = true;
-
- if (e && e->type->ops.mq.put_request)
- do_free = !e->type->ops.mq.put_request(rq);
-
- if (do_free)
- blk_mq_finish_request(rq);
- if (has_queue_ref)
- blk_queue_exit(q);
+ blk_mq_finish_request(rq);
}
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 8ff37f9782e9..6b8c314b5c20 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -3,30 +3,6 @@
#include "blk-mq.h"
-struct blk_mq_tags *blk_mq_sched_alloc_requests(unsigned int depth, unsigned int numa_node);
-void blk_mq_sched_free_requests(struct blk_mq_tags *tags);
-
-int blk_mq_sched_init_hctx_data(struct request_queue *q, size_t size,
- int (*init)(struct blk_mq_hw_ctx *),
- void (*exit)(struct blk_mq_hw_ctx *));
-
-void blk_mq_sched_free_hctx_data(struct request_queue *q,
- void (*exit)(struct blk_mq_hw_ctx *));
-
-void blk_mq_sched_free_shadow_request(struct blk_mq_tags *tags,
- struct request *rq);
-struct request *blk_mq_sched_alloc_shadow_request(struct request_queue *q,
- struct blk_mq_alloc_data *data,
- struct blk_mq_tags *tags,
- atomic_t *wait_index);
-struct request *
-blk_mq_sched_request_from_shadow(struct blk_mq_hw_ctx *hctx,
- struct request *(*get_sched_rq)(struct blk_mq_hw_ctx *));
-struct request *
-__blk_mq_sched_request_from_shadow(struct blk_mq_hw_ctx *hctx,
- struct request *sched_rq);
-
-struct request *blk_mq_sched_get_request(struct request_queue *q, struct bio *bio, unsigned int op, struct blk_mq_alloc_data *data);
void blk_mq_sched_put_request(struct request *rq);
void __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
@@ -35,6 +11,9 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio);
bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio);
bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
+void blk_mq_sched_assign_ioc(struct request_queue *q,
+ struct request *rq, struct bio *bio);
+
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
int blk_mq_sched_init(struct request_queue *q);
@@ -109,22 +88,6 @@ blk_mq_sched_insert_requests(struct request_queue *q, struct blk_mq_ctx *ctx,
blk_mq_run_hw_queue(hctx, run_queue_async);
}
-static inline void
-blk_mq_sched_dispatch_shadow_requests(struct blk_mq_hw_ctx *hctx,
- struct list_head *rq_list,
- struct request *(*get_sched_rq)(struct blk_mq_hw_ctx *))
-{
- do {
- struct request *rq;
-
- rq = blk_mq_sched_request_from_shadow(hctx, get_sched_rq);
- if (!rq)
- break;
-
- list_add_tail(&rq->queuelist, rq_list);
- } while (1);
-}
-
static inline bool
blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
struct bio *bio)
@@ -140,11 +103,6 @@ blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
static inline void
blk_mq_sched_completed_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
- struct elevator_queue *e = hctx->queue->elevator;
-
- if (e && e->type->ops.mq.completed_request)
- e->type->ops.mq.completed_request(hctx, rq);
-
if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) {
clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
blk_mq_run_hw_queue(hctx, true);
@@ -179,11 +137,4 @@ static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)
return false;
}
-/*
- * Returns true if this is an internal shadow request
- */
-static inline bool blk_mq_sched_rq_is_shadow(struct request *rq)
-{
- return (rq->rq_flags & RQF_ALLOCED) != 0;
-}
#endif
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 3a19834211b2..35e1162602f5 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -245,6 +245,8 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
unsigned int flags)
{
struct blk_mq_alloc_data alloc_data;
+ struct blk_mq_ctx *ctx;
+ struct blk_mq_hw_ctx *hctx;
struct request *rq;
int ret;
@@ -252,13 +254,16 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
if (ret)
return ERR_PTR(ret);
- rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data);
-
- blk_mq_put_ctx(alloc_data.ctx);
- blk_queue_exit(q);
+ ctx = blk_mq_get_ctx(q);
+ hctx = blk_mq_map_queue(q, ctx->cpu);
+ blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx);
+ rq = __blk_mq_alloc_request(&alloc_data, rw);
+ blk_mq_put_ctx(ctx);
- if (!rq)
+ if (!rq) {
+ blk_queue_exit(q);
return ERR_PTR(-EWOULDBLOCK);
+ }
rq->__data_len = 0;
rq->__sector = (sector_t) -1;
@@ -324,7 +329,7 @@ void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
const int tag = rq->tag;
struct request_queue *q = rq->q;
- blk_mq_sched_completed_request(hctx, rq);
+ ctx->rq_completed[rq_is_sync(rq)]++;
if (rq->rq_flags & RQF_MQ_INFLIGHT)
atomic_dec(&hctx->nr_active);
@@ -1246,6 +1251,34 @@ static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx,
}
}
+static struct request *blk_mq_get_request(struct request_queue *q,
+ struct bio *bio,
+ struct blk_mq_alloc_data *data)
+{
+ struct elevator_queue *e = q->elevator;
+ struct blk_mq_hw_ctx *hctx;
+ struct blk_mq_ctx *ctx;
+ struct request *rq;
+
+ blk_queue_enter_live(q);
+ ctx = blk_mq_get_ctx(q);
+ hctx = blk_mq_map_queue(q, ctx->cpu);
+
+ trace_block_getrq(q, bio, bio->bi_opf);
+ blk_mq_set_alloc_data(data, q, 0, ctx, hctx);
+ rq = __blk_mq_alloc_request(data, bio->bi_opf);
+
+ if (rq) {
+ rq->elv.icq = NULL;
+ if (e && e->type->icq_cache)
+ blk_mq_sched_assign_ioc(q, rq, bio);
+ data->hctx->queued++;
+ return rq;
+ }
+
+ return rq;
+}
+
static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
{
struct request_queue *q = rq->q;
@@ -1328,7 +1361,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
trace_block_getrq(q, bio, bio->bi_opf);
- rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data);
+ rq = blk_mq_get_request(q, bio, &data);
if (unlikely(!rq)) {
__wbt_done(q->rq_wb, wb_acct);
return BLK_QC_T_NONE;
@@ -1448,7 +1481,7 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
trace_block_getrq(q, bio, bio->bi_opf);
- rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data);
+ rq = blk_mq_get_request(q, bio, &data);
if (unlikely(!rq)) {
__wbt_done(q->rq_wb, wb_acct);
return BLK_QC_T_NONE;
@@ -1504,6 +1537,7 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
blk_mq_sched_insert_request(rq, false, true, true);
goto done;
}
+
if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
/*
* For a SYNC request, send it to the hardware immediately. For
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index e26c02798041..9a4039d9b4f0 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -63,8 +63,6 @@ struct deadline_data {
spinlock_t lock;
struct list_head dispatch;
- struct blk_mq_tags *tags;
- atomic_t wait_index;
};
static inline struct rb_root *
@@ -300,7 +298,13 @@ static struct request *__dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
static void dd_dispatch_requests(struct blk_mq_hw_ctx *hctx,
struct list_head *rq_list)
{
- blk_mq_sched_dispatch_shadow_requests(hctx, rq_list, __dd_dispatch_request);
+ for (;;) {
+ struct request *rq = __dd_dispatch_request(hctx);
+ if (!rq)
+ break;
+
+ list_add_tail(&rq->queuelist, rq_list);
+ }
}
static void dd_exit_queue(struct elevator_queue *e)
@@ -310,7 +314,6 @@ static void dd_exit_queue(struct elevator_queue *e)
BUG_ON(!list_empty(&dd->fifo_list[READ]));
BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
- blk_mq_sched_free_requests(dd->tags);
kfree(dd);
}
@@ -333,13 +336,6 @@ static int dd_init_queue(struct request_queue *q, struct elevator_type *e)
}
eq->elevator_data = dd;
- dd->tags = blk_mq_sched_alloc_requests(queue_depth, q->node);
- if (!dd->tags) {
- kfree(dd);
- kobject_put(&eq->kobj);
- return -ENOMEM;
- }
-
INIT_LIST_HEAD(&dd->fifo_list[READ]);
INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
dd->sort_list[READ] = RB_ROOT;
@@ -351,7 +347,6 @@ static int dd_init_queue(struct request_queue *q, struct elevator_type *e)
dd->fifo_batch = fifo_batch;
spin_lock_init(&dd->lock);
INIT_LIST_HEAD(&dd->dispatch);
- atomic_set(&dd->wait_index, 0);
q->elevator = eq;
return 0;
@@ -409,11 +404,10 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
blk_mq_sched_request_inserted(rq);
/*
- * If we're trying to insert a real request, just send it directly
- * to the hardware dispatch list. This only happens for a requeue,
- * or FUA/FLUSH requests.
+ * Send FUA and FLUSH requests directly to the hardware dispatch list.
+ * To do: also send requeued requests directly to the hw disp list.
*/
- if (!blk_mq_sched_rq_is_shadow(rq)) {
+ if (rq->cmd_flags & (REQ_PREFLUSH | REQ_FUA)) {
spin_lock(&hctx->lock);
list_add_tail(&rq->queuelist, &hctx->dispatch);
spin_unlock(&hctx->lock);
@@ -459,67 +453,6 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
spin_unlock(&dd->lock);
}
-static struct request *dd_get_request(struct request_queue *q, unsigned int op,
- struct blk_mq_alloc_data *data)
-{
- struct deadline_data *dd = q->elevator->elevator_data;
- struct request *rq;
-
- /*
- * The flush machinery intercepts before we insert the request. As
- * a work-around, just hand it back a real request.
- */
- if (unlikely(op & (REQ_PREFLUSH | REQ_FUA)))
- rq = __blk_mq_alloc_request(data, op);
- else {
- rq = blk_mq_sched_alloc_shadow_request(q, data, dd->tags, &dd->wait_index);
- if (rq)
- blk_mq_rq_ctx_init(q, data->ctx, rq, op);
- }
-
- return rq;
-}
-
-static bool dd_put_request(struct request *rq)
-{
- /*
- * If it's a real request, we just have to free it. Return false
- * to say we didn't handle it, and blk_mq_sched will take care of that.
- */
- if (!blk_mq_sched_rq_is_shadow(rq))
- return false;
-
- if (!(rq->rq_flags & RQF_STARTED)) {
- struct request_queue *q = rq->q;
- struct deadline_data *dd = q->elevator->elevator_data;
-
- /*
- * IO completion would normally do this, but if we merge
- * and free before we issue the request, we need to free
- * the shadow tag here.
- */
- blk_mq_sched_free_shadow_request(dd->tags, rq);
- }
-
- return true;
-}
-
-static void dd_completed_request(struct blk_mq_hw_ctx *hctx, struct request *rq)
-{
- struct request *sched_rq = rq->end_io_data;
-
- /*
- * sched_rq can be NULL, if we haven't setup the shadow yet
- * because we failed getting one.
- */
- if (sched_rq) {
- struct deadline_data *dd = hctx->queue->elevator->elevator_data;
-
- blk_mq_sched_free_shadow_request(dd->tags, sched_rq);
- blk_mq_start_stopped_hw_queue(hctx, true);
- }
-}
-
static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
{
struct deadline_data *dd = hctx->queue->elevator->elevator_data;
@@ -601,11 +534,8 @@ static struct elv_fs_entry deadline_attrs[] = {
static struct elevator_type mq_deadline = {
.ops.mq = {
- .get_request = dd_get_request,
- .put_request = dd_put_request,
.insert_requests = dd_insert_requests,
.dispatch_requests = dd_dispatch_requests,
- .completed_request = dd_completed_request,
.next_request = elv_rb_latter_request,
.former_request = elv_rb_former_request,
.bio_merge = dd_bio_merge,
diff --git a/include/linux/elevator.h b/include/linux/elevator.h
index 64224d39d707..312e6d3e89fa 100644
--- a/include/linux/elevator.h
+++ b/include/linux/elevator.h
@@ -89,12 +89,9 @@ struct elevator_mq_ops {
int (*request_merge)(struct request_queue *q, struct request **, struct bio *);
void (*request_merged)(struct request_queue *, struct request *, int);
void (*requests_merged)(struct request_queue *, struct request *, struct request *);
- struct request *(*get_request)(struct request_queue *, unsigned int, struct blk_mq_alloc_data *);
- bool (*put_request)(struct request *);
void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool);
void (*dispatch_requests)(struct blk_mq_hw_ctx *, struct list_head *);
bool (*has_work)(struct blk_mq_hw_ctx *);
- void (*completed_request)(struct blk_mq_hw_ctx *, struct request *);
void (*started_request)(struct request *);
void (*requeue_request)(struct request *);
struct request *(*former_request)(struct request_queue *, struct request *);
--
2.11.0
From ae72bb9f67d01b3a02cee80c81a712f775d13c32 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@xxxxxxxxxxx>
Date: Tue, 20 Dec 2016 12:00:47 +0100
Subject: [PATCH 2/3] blk-mq: Make the blk_mq_{get,put}_tag() callers specify
the tag set
This patch does not change any functionality.
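For clarity, the caller-visible difference is only that the tag set becomes an
explicit argument; the two helpers below are made-up names that illustrate how
a caller picks the set (the second one only becomes meaningful once patch 3
adds hctx->sched_tags):

static unsigned int get_driver_tag(struct blk_mq_alloc_data *data)
{
	/* before this patch, blk_mq_get_tag(data) implied data->hctx->tags */
	return blk_mq_get_tag(data, data->hctx->tags);
}

static unsigned int get_scheduler_tag(struct blk_mq_alloc_data *data)
{
	/* possible only after patch 3 introduces hctx->sched_tags */
	return blk_mq_get_tag(data, data->hctx->sched_tags);
}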
---
block/blk-mq-tag.c | 29 ++++++++++----------
block/blk-mq-tag.h | 7 +++--
block/blk-mq.c | 80 ++++++++++++++++++++++++++++++++++++------------------
block/blk-mq.h | 9 ++++--
4 files changed, 77 insertions(+), 48 deletions(-)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index dcf5ce3ba4bf..890d634db0ee 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -156,47 +156,46 @@ static int bt_get(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt,
return tag;
}
-static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data)
+static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
+ struct blk_mq_tags *tags)
{
int tag;
- tag = bt_get(data, &data->hctx->tags->bitmap_tags, data->hctx,
- data->hctx->tags);
+ tag = bt_get(data, &tags->bitmap_tags, data->hctx, tags);
if (tag >= 0)
- return tag + data->hctx->tags->nr_reserved_tags;
+ return tag + tags->nr_reserved_tags;
return BLK_MQ_TAG_FAIL;
}
-static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data)
+static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data,
+ struct blk_mq_tags *tags)
{
int tag;
- if (unlikely(!data->hctx->tags->nr_reserved_tags)) {
+ if (unlikely(!tags->nr_reserved_tags)) {
WARN_ON_ONCE(1);
return BLK_MQ_TAG_FAIL;
}
- tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL,
- data->hctx->tags);
+ tag = bt_get(data, &tags->breserved_tags, NULL, tags);
if (tag < 0)
return BLK_MQ_TAG_FAIL;
return tag;
}
-unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
+unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data,
+ struct blk_mq_tags *tags)
{
if (data->flags & BLK_MQ_REQ_RESERVED)
- return __blk_mq_get_reserved_tag(data);
- return __blk_mq_get_tag(data);
+ return __blk_mq_get_reserved_tag(data, tags);
+ return __blk_mq_get_tag(data, tags);
}
-void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
- unsigned int tag)
+void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags,
+ struct blk_mq_ctx *ctx, unsigned int tag)
{
- struct blk_mq_tags *tags = hctx->tags;
-
if (tag >= tags->nr_reserved_tags) {
const int real_tag = tag - tags->nr_reserved_tags;
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index d1662734dc53..84186a11d2e0 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -23,9 +23,10 @@ struct blk_mq_tags {
extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node, int alloc_policy);
extern void blk_mq_free_tags(struct blk_mq_tags *tags);
-extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
-extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
- unsigned int tag);
+extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data,
+ struct blk_mq_tags *tags);
+extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags,
+ struct blk_mq_ctx *ctx, unsigned int tag);
extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page);
extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 35e1162602f5..b68b7fc43e46 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -220,12 +220,13 @@ EXPORT_SYMBOL_GPL(blk_mq_rq_ctx_init);
struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
unsigned int op)
{
+ struct blk_mq_tags *tags = data->hctx->tags;
struct request *rq;
unsigned int tag;
- tag = blk_mq_get_tag(data);
+ tag = blk_mq_get_tag(data, tags);
if (tag != BLK_MQ_TAG_FAIL) {
- rq = data->hctx->tags->rqs[tag];
+ rq = tags->rqs[tag];
if (blk_mq_tag_busy(data->hctx)) {
rq->rq_flags = RQF_MQ_INFLIGHT;
@@ -339,7 +340,7 @@ void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
- blk_mq_put_tag(hctx, ctx, tag);
+ blk_mq_put_tag(hctx, hctx->tags, ctx, tag);
blk_queue_exit(q);
}
@@ -1554,8 +1555,8 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
return cookie;
}
-void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
- unsigned int hctx_idx)
+void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
+ unsigned int hctx_idx)
{
struct page *page;
@@ -1581,23 +1582,19 @@ void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
kmemleak_free(page_address(page));
__free_pages(page, page->private);
}
+}
+void blk_mq_free_rq_map(struct blk_mq_tags *tags)
+{
kfree(tags->rqs);
blk_mq_free_tags(tags);
}
-static size_t order_to_size(unsigned int order)
-{
- return (size_t)PAGE_SIZE << order;
-}
-
-struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
- unsigned int hctx_idx)
+struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
+ unsigned int hctx_idx)
{
struct blk_mq_tags *tags;
- unsigned int i, j, entries_per_page, max_order = 4;
- size_t rq_size, left;
tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
set->numa_node,
@@ -1605,8 +1602,6 @@ struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
if (!tags)
return NULL;
- INIT_LIST_HEAD(&tags->page_list);
-
tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
set->numa_node);
@@ -1615,6 +1610,22 @@ struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
return NULL;
}
+ return tags;
+}
+
+static size_t order_to_size(unsigned int order)
+{
+ return (size_t)PAGE_SIZE << order;
+}
+
+int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
+ unsigned int hctx_idx)
+{
+ unsigned int i, j, entries_per_page, max_order = 4;
+ size_t rq_size, left;
+
+ INIT_LIST_HEAD(&tags->page_list);
+
/*
* rq_size is the size of the request plus driver payload, rounded
* to the cacheline size
@@ -1674,11 +1685,11 @@ struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
i++;
}
}
- return tags;
+ return 0;
fail:
- blk_mq_free_rq_map(set, tags, hctx_idx);
- return NULL;
+ blk_mq_free_rqs(set, tags, hctx_idx);
+ return -ENOMEM;
}
/*
@@ -1899,7 +1910,13 @@ static void blk_mq_map_swqueue(struct request_queue *q,
hctx_idx = q->mq_map[i];
/* unmapped hw queue can be remapped after CPU topo changed */
if (!set->tags[hctx_idx]) {
- set->tags[hctx_idx] = blk_mq_init_rq_map(set, hctx_idx);
+ set->tags[hctx_idx] = blk_mq_alloc_rq_map(set,
+ hctx_idx);
+ if (blk_mq_alloc_rqs(set, set->tags[hctx_idx],
+ hctx_idx) < 0) {
+ blk_mq_free_rq_map(set->tags[hctx_idx]);
+ set->tags[hctx_idx] = NULL;
+ }
/*
* If tags initialization fail for some hctx,
@@ -1932,7 +1949,8 @@ static void blk_mq_map_swqueue(struct request_queue *q,
* allocation
*/
if (i && set->tags[i]) {
- blk_mq_free_rq_map(set, set->tags[i], i);
+ blk_mq_free_rqs(set, set->tags[i], i);
+ blk_mq_free_rq_map(set->tags[i]);
set->tags[i] = NULL;
}
hctx->tags = NULL;
@@ -2102,7 +2120,8 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
if (hctx) {
if (hctx->tags) {
- blk_mq_free_rq_map(set, hctx->tags, j);
+ blk_mq_free_rqs(set, set->tags[j], j);
+ blk_mq_free_rq_map(hctx->tags);
set->tags[j] = NULL;
}
blk_mq_exit_hctx(q, set, hctx, j);
@@ -2304,16 +2323,21 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
int i;
for (i = 0; i < set->nr_hw_queues; i++) {
- set->tags[i] = blk_mq_init_rq_map(set, i);
+ set->tags[i] = blk_mq_alloc_rq_map(set, i);
if (!set->tags[i])
goto out_unwind;
+ if (blk_mq_alloc_rqs(set, set->tags[i], i) < 0)
+ goto free_rq_map;
}
return 0;
out_unwind:
- while (--i >= 0)
- blk_mq_free_rq_map(set, set->tags[i], i);
+ while (--i >= 0) {
+ blk_mq_free_rqs(set, set->tags[i], i);
+free_rq_map:
+ blk_mq_free_rq_map(set->tags[i]);
+ }
return -ENOMEM;
}
@@ -2438,8 +2462,10 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
int i;
for (i = 0; i < nr_cpu_ids; i++) {
- if (set->tags[i])
- blk_mq_free_rq_map(set, set->tags[i], i);
+ if (set->tags[i]) {
+ blk_mq_free_rqs(set, set->tags[i], i);
+ blk_mq_free_rq_map(set->tags[i]);
+ }
}
kfree(set->mq_map);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 898c3c9a60ec..2e98dd8ccee2 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -37,10 +37,13 @@ void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
/*
* Internal helpers for allocating/freeing the request map
*/
-void blk_mq_free_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
- unsigned int hctx_idx);
-struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
+void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
+ unsigned int hctx_idx);
+void blk_mq_free_rq_map(struct blk_mq_tags *tags);
+struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
unsigned int hctx_idx);
+int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
+ unsigned int hctx_idx);
/*
* Internal helpers for request insertion into sw queues
--
2.11.0
From c49ec4e8b0e4135a87c9894597901539f3e3ca08 Mon Sep 17 00:00:00 2001
From: Bart Van Assche <bart.vanassche@xxxxxxxxxxx>
Date: Wed, 21 Dec 2016 12:39:33 +0100
Subject: [PATCH 3/3] blk-mq: Split driver and scheduler tags
Add 'sched_tags' next to 'tags' in struct blk_mq_hw_ctx and also
in struct blk_mq_tag_set. Add 'sched_tag' next to 'tag' in struct
request. Make __blk_mq_finish_request() free both tags. Make
blk_mq_alloc_tag_set() allocate both tag sets and make
blk_mq_free_tag_set() free both tag sets. Make
blk_mq_dispatch_rq_list() allocate the driver tag. Modify
blk_mq_update_nr_requests() such that it accepts a size that
exceeds the queue depth.
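On the completion side this boils down to the following (illustrative sketch
only; example_finish() is a made-up name for the logic added to
__blk_mq_finish_request()):

static void example_finish(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
			   struct request *rq)
{
	if (rq->tag >= 0) {
		/* a driver tag was assigned at dispatch time */
		hctx->tags->rqs[rq->tag] = NULL;
		blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
	}
	if (rq->sched_tag >= 0)
		blk_mq_put_tag(hctx, hctx->sched_tags, ctx, rq->sched_tag);
}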
---
block/blk-flush.c | 9 ++-
block/blk-mq.c | 160 +++++++++++++++++++++++++++++++++++--------------
block/blk-mq.h | 5 +-
block/blk-tag.c | 1 +
include/linux/blk-mq.h | 2 +
include/linux/blkdev.h | 1 +
6 files changed, 129 insertions(+), 49 deletions(-)
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 6a7c29d2eb3c..46d12bbfde85 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -170,6 +170,8 @@ static bool blk_flush_complete_seq(struct request *rq,
struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
bool queued = false, kicked;
+ BUG_ON(rq->tag < 0);
+
BUG_ON(rq->flush.seq & seq);
rq->flush.seq |= seq;
@@ -319,6 +321,8 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq)
if (q->mq_ops) {
struct blk_mq_hw_ctx *hctx;
+ BUG_ON(first_rq->tag < 0);
+
flush_rq->mq_ctx = first_rq->mq_ctx;
flush_rq->tag = first_rq->tag;
fq->orig_rq = first_rq;
@@ -452,8 +456,9 @@ void blk_insert_flush(struct request *rq)
* processed directly without going through flush machinery. Queue
* for normal execution.
*/
- if ((policy & REQ_FSEQ_DATA) &&
- !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
+ if (((policy & REQ_FSEQ_DATA) &&
+ !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) ||
+ (q->mq_ops && blk_mq_assign_drv_tag(rq) < 0)) {
if (q->mq_ops)
blk_mq_sched_insert_request(rq, false, true, false);
else
diff --git a/block/blk-mq.c b/block/blk-mq.c
index b68b7fc43e46..48d7968d4ed9 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -220,20 +220,21 @@ EXPORT_SYMBOL_GPL(blk_mq_rq_ctx_init);
struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data,
unsigned int op)
{
- struct blk_mq_tags *tags = data->hctx->tags;
+ struct blk_mq_tags *tags = data->hctx->sched_tags;
struct request *rq;
- unsigned int tag;
+ unsigned int sched_tag;
- tag = blk_mq_get_tag(data, tags);
- if (tag != BLK_MQ_TAG_FAIL) {
- rq = tags->rqs[tag];
+ sched_tag = blk_mq_get_tag(data, tags);
+ if (sched_tag != BLK_MQ_TAG_FAIL) {
+ rq = tags->rqs[sched_tag];
+ rq->tag = -1;
if (blk_mq_tag_busy(data->hctx)) {
rq->rq_flags = RQF_MQ_INFLIGHT;
atomic_inc(&data->hctx->nr_active);
}
- rq->tag = tag;
+ rq->sched_tag = sched_tag;
blk_mq_rq_ctx_init(data->q, data->ctx, rq, op);
return rq;
}
@@ -328,6 +329,7 @@ void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
struct request *rq)
{
const int tag = rq->tag;
+ const int sched_tag = rq->sched_tag;
struct request_queue *q = rq->q;
ctx->rq_completed[rq_is_sync(rq)]++;
@@ -340,7 +342,13 @@ void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);
- blk_mq_put_tag(hctx, hctx->tags, ctx, tag);
+ if (tag >= 0) {
+ WARN_ON_ONCE(hctx->tags->rqs[tag] != rq);
+ hctx->tags->rqs[tag] = NULL;
+ blk_mq_put_tag(hctx, hctx->tags, ctx, tag);
+ }
+ if (sched_tag >= 0)
+ blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
blk_queue_exit(q);
}
@@ -844,6 +852,26 @@ static inline unsigned int queued_to_index(unsigned int queued)
return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
}
+int blk_mq_assign_drv_tag(struct request *rq)
+{
+ struct request_queue *q = rq->q;
+ struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
+ struct blk_mq_alloc_data data = {
+ .q = rq->q,
+ .ctx = rq->mq_ctx,
+ .hctx = hctx,
+ };
+
+ rq->tag = blk_mq_get_tag(&data, hctx->tags);
+ if (rq->tag < 0)
+ goto out;
+ WARN_ON_ONCE(hctx->tags->rqs[rq->tag]);
+ hctx->tags->rqs[rq->tag] = rq;
+
+out:
+ return rq->tag;
+}
+
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
{
struct request_queue *q = hctx->queue;
@@ -866,6 +894,8 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list)
struct blk_mq_queue_data bd;
rq = list_first_entry(list, struct request, queuelist);
+ if (rq->tag < 0 && blk_mq_assign_drv_tag(rq) < 0)
+ break;
list_del_init(&rq->queuelist);
bd.rq = rq;
@@ -1296,7 +1326,8 @@ static void blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie)
goto insert;
hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu);
- if (blk_mq_hctx_stopped(hctx))
+ if (blk_mq_hctx_stopped(hctx) ||
+ (rq->tag < 0 && blk_mq_assign_drv_tag(rq) < 0))
goto insert;
new_cookie = blk_tag_to_qc_t(rq->tag, hctx->queue_num);
@@ -1592,17 +1623,19 @@ void blk_mq_free_rq_map(struct blk_mq_tags *tags)
}
struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
- unsigned int hctx_idx)
+ unsigned int hctx_idx,
+ unsigned int nr_tags,
+ unsigned int reserved_tags)
{
struct blk_mq_tags *tags;
- tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
+ tags = blk_mq_init_tags(nr_tags, reserved_tags,
set->numa_node,
BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags));
if (!tags)
return NULL;
- tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
+ tags->rqs = kzalloc_node(nr_tags * sizeof(struct request *),
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
set->numa_node);
if (!tags->rqs) {
@@ -1800,6 +1833,7 @@ static int blk_mq_init_hctx(struct request_queue *q,
cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
hctx->tags = set->tags[hctx_idx];
+ hctx->sched_tags = set->sched_tags[hctx_idx];
/*
* Allocate space for all possible cpus to avoid allocation at
@@ -1881,6 +1915,38 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
}
}
+static void __blk_mq_free_rq_map_i(struct blk_mq_tag_set *set, int hctx_idx)
+{
+ if (set->sched_tags[hctx_idx]) {
+ blk_mq_free_rqs(set, set->sched_tags[hctx_idx], hctx_idx);
+ blk_mq_free_rq_map(set->sched_tags[hctx_idx]);
+ set->sched_tags[hctx_idx] = NULL;
+ }
+ if (set->tags[hctx_idx]) {
+ blk_mq_free_rq_map(set->tags[hctx_idx]);
+ set->tags[hctx_idx] = NULL;
+ }
+}
+
+static bool __blk_mq_alloc_rq_map_i(struct blk_mq_tag_set *set, int hctx_idx,
+ unsigned int nr_requests)
+{
+ int ret = 0;
+
+ set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
+ set->queue_depth, set->reserved_tags);
+ set->sched_tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
+ nr_requests, 0);
+ if (set->sched_tags[hctx_idx])
+ ret = blk_mq_alloc_rqs(set, set->sched_tags[hctx_idx],
+ hctx_idx);
+ if (!set->tags[hctx_idx] || !set->sched_tags[hctx_idx] || ret < 0) {
+ __blk_mq_free_rq_map_i(set, hctx_idx);
+ return false;
+ }
+ return true;
+}
+
static void blk_mq_map_swqueue(struct request_queue *q,
const struct cpumask *online_mask)
{
@@ -1909,23 +1975,15 @@ static void blk_mq_map_swqueue(struct request_queue *q,
hctx_idx = q->mq_map[i];
/* unmapped hw queue can be remapped after CPU topo changed */
- if (!set->tags[hctx_idx]) {
- set->tags[hctx_idx] = blk_mq_alloc_rq_map(set,
- hctx_idx);
- if (blk_mq_alloc_rqs(set, set->tags[hctx_idx],
- hctx_idx) < 0) {
- blk_mq_free_rq_map(set->tags[hctx_idx]);
- set->tags[hctx_idx] = NULL;
- }
-
+ if (!set->tags[hctx_idx] &&
+ !__blk_mq_alloc_rq_map_i(set, hctx_idx, q->nr_requests)) {
/*
* If tags initialization fail for some hctx,
* that hctx won't be brought online. In this
* case, remap the current ctx to hctx[0] which
* is guaranteed to always have tags allocated
*/
- if (!set->tags[hctx_idx])
- q->mq_map[i] = 0;
+ q->mq_map[i] = 0;
}
ctx = per_cpu_ptr(q->queue_ctx, i);
@@ -2318,26 +2376,20 @@ static int blk_mq_queue_reinit_prepare(unsigned int cpu)
return 0;
}
-static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
+static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set,
+ unsigned int nr_requests)
{
int i;
- for (i = 0; i < set->nr_hw_queues; i++) {
- set->tags[i] = blk_mq_alloc_rq_map(set, i);
- if (!set->tags[i])
+ for (i = 0; i < set->nr_hw_queues; i++)
+ if (!__blk_mq_alloc_rq_map_i(set, i, nr_requests))
goto out_unwind;
- if (blk_mq_alloc_rqs(set, set->tags[i], i) < 0)
- goto free_rq_map;
- }
return 0;
out_unwind:
- while (--i >= 0) {
- blk_mq_free_rqs(set, set->tags[i], i);
-free_rq_map:
- blk_mq_free_rq_map(set->tags[i]);
- }
+ while (--i >= 0)
+ __blk_mq_free_rq_map_i(set, i);
return -ENOMEM;
}
@@ -2347,14 +2399,15 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
* may reduce the depth asked for, if memory is tight. set->queue_depth
* will be updated to reflect the allocated depth.
*/
-static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
+static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set,
+ unsigned int nr_requests)
{
unsigned int depth;
int err;
depth = set->queue_depth;
do {
- err = __blk_mq_alloc_rq_maps(set);
+ err = __blk_mq_alloc_rq_maps(set, nr_requests);
if (!err)
break;
@@ -2385,7 +2438,7 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
*/
int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
{
- int ret;
+ int ret = -ENOMEM;
BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);
@@ -2425,32 +2478,39 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
if (!set->tags)
return -ENOMEM;
- ret = -ENOMEM;
+ set->sched_tags = kzalloc_node(nr_cpu_ids * sizeof(struct blk_mq_tags *),
+ GFP_KERNEL, set->numa_node);
+ if (!set->sched_tags)
+ goto free_drv_tags;
+
set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids,
GFP_KERNEL, set->numa_node);
if (!set->mq_map)
- goto out_free_tags;
+ goto free_sched_tags;
if (set->ops->map_queues)
ret = set->ops->map_queues(set);
else
ret = blk_mq_map_queues(set);
if (ret)
- goto out_free_mq_map;
+ goto free_mq_map;
- ret = blk_mq_alloc_rq_maps(set);
+ ret = blk_mq_alloc_rq_maps(set, set->queue_depth/*q->nr_requests*/);
if (ret)
- goto out_free_mq_map;
+ goto free_mq_map;
mutex_init(&set->tag_list_lock);
INIT_LIST_HEAD(&set->tag_list);
return 0;
-out_free_mq_map:
+free_mq_map:
kfree(set->mq_map);
set->mq_map = NULL;
-out_free_tags:
+free_sched_tags:
+ kfree(set->sched_tags);
+ set->sched_tags = NULL;
+free_drv_tags:
kfree(set->tags);
set->tags = NULL;
return ret;
@@ -2465,12 +2525,16 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
if (set->tags[i]) {
blk_mq_free_rqs(set, set->tags[i], i);
blk_mq_free_rq_map(set->tags[i]);
+ blk_mq_free_rq_map(set->sched_tags[i]);
}
}
kfree(set->mq_map);
set->mq_map = NULL;
+ kfree(set->sched_tags);
+ set->sched_tags = NULL;
+
kfree(set->tags);
set->tags = NULL;
}
@@ -2482,14 +2546,18 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
struct blk_mq_hw_ctx *hctx;
int i, ret;
- if (!set || nr > set->queue_depth)
+ if (!set)
return -EINVAL;
ret = 0;
queue_for_each_hw_ctx(q, hctx, i) {
if (!hctx->tags)
continue;
- ret = blk_mq_tag_update_depth(hctx->tags, nr);
+ ret = blk_mq_tag_update_depth(hctx->tags,
+ min(nr, set->queue_depth));
+ if (ret)
+ break;
+ ret = blk_mq_tag_update_depth(hctx->sched_tags, nr);
if (ret)
break;
}
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 2e98dd8ccee2..0368c513c2ab 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -31,6 +31,7 @@ void blk_mq_freeze_queue(struct request_queue *q);
void blk_mq_free_queue(struct request_queue *q);
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
void blk_mq_wake_waiters(struct request_queue *q);
+int blk_mq_assign_drv_tag(struct request *rq);
bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *, struct list_head *);
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
@@ -41,7 +42,9 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx);
void blk_mq_free_rq_map(struct blk_mq_tags *tags);
struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
- unsigned int hctx_idx);
+ unsigned int hctx_idx,
+ unsigned int nr_tags,
+ unsigned int reserved_tags);
int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx);
diff --git a/block/blk-tag.c b/block/blk-tag.c
index bae1decb6ec3..319a3a3eb1d7 100644
--- a/block/blk-tag.c
+++ b/block/blk-tag.c
@@ -272,6 +272,7 @@ void blk_queue_end_tag(struct request_queue *q, struct request *rq)
list_del_init(&rq->queuelist);
rq->rq_flags &= ~RQF_QUEUED;
rq->tag = -1;
+ rq->sched_tag = -1;
if (unlikely(bqt->tag_index[tag] == NULL))
printk(KERN_ERR "%s: tag %d is missing\n",
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 9255ccb043f2..377594bcda8d 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -36,6 +36,7 @@ struct blk_mq_hw_ctx {
atomic_t wait_index;
struct blk_mq_tags *tags;
+ struct blk_mq_tags *sched_tags;
struct srcu_struct queue_rq_srcu;
@@ -72,6 +73,7 @@ struct blk_mq_tag_set {
void *driver_data;
struct blk_mq_tags **tags;
+ struct blk_mq_tags **sched_tags;
struct mutex tag_list_lock;
struct list_head tag_list;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 7c40fb838b44..112b57bce9e9 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -223,6 +223,7 @@ struct request {
void *special; /* opaque pointer available for LLD use */
int tag;
+ int sched_tag;
int errors;
/*
--
2.11.0