[WIP PATCHSET 4/4] Modify interface and operation to comply with blk-mq-sched

From: Paolo Valente
Date: Tue Feb 07 2017 - 12:25:26 EST


As for modifications of the operation, the major changes are the introduction
of a scheduler lock and the deferral of the body of the exit_icq hook to a
work item. The latter change avoids deadlocks caused by the combination of
the following facts: 1) the body takes the scheduler lock; 2) if not
deferred, it does so from inside the exit_icq hook, which is invoked with the
queue lock held; and 3) there is at least one code path, namely the one
starting from bfq_bio_merge, that takes these two locks in the opposite
order.

Signed-off-by: Paolo Valente <paolo.valente@xxxxxxxxxx>
---
block/bfq-cgroup.c | 4 -
block/bfq-mq-iosched.c | 791 ++++++++++++++++++++++++++++++-------------------
block/bfq-mq.h | 37 +--
3 files changed, 496 insertions(+), 336 deletions(-)

diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 36b68ec..7ecce47 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -472,8 +472,6 @@ static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
struct bfq_group *bfqg, *parent;
struct bfq_entity *entity;

- assert_spin_locked(bfqd->queue->queue_lock);
-
bfqg = bfq_lookup_bfqg(bfqd, blkcg);

if (unlikely(!bfqg))
@@ -602,8 +600,6 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
struct bfq_group *bfqg;
struct bfq_entity *entity;

- lockdep_assert_held(bfqd->queue->queue_lock);
-
bfqg = bfq_find_set_group(bfqd, blkcg);

if (unlikely(!bfqg))
diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c
index a8679de..05a12b6 100644
--- a/block/bfq-mq-iosched.c
+++ b/block/bfq-mq-iosched.c
@@ -76,9 +76,14 @@
#include <linux/jiffies.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
-#undef CONFIG_BFQ_GROUP_IOSCHED /* cgroups support not yet functional */
-#include "bfq.h"
+#include <linux/sbitmap.h>
+#include <linux/delay.h>
+
#include "blk.h"
+#include "blk-mq.h"
+#include "blk-mq-tag.h"
+#include "blk-mq-sched.h"
+#include "bfq-mq.h"

/* Expiration time of sync (0) and async (1) requests, in ns. */
static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
@@ -188,8 +193,6 @@ static int device_speed_thresh[2];
#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0])
#define RQ_BFQQ(rq) ((rq)->elv.priv[1])

-static void bfq_schedule_dispatch(struct bfq_data *bfqd);
-
/**
* icq_to_bic - convert iocontext queue structure to bfq_io_cq.
* @icq: the iocontext queue.
@@ -211,11 +214,12 @@ static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
struct request_queue *q)
{
if (ioc) {
+ unsigned long flags;
struct bfq_io_cq *icq;

- spin_lock_irq(q->queue_lock);
+ spin_lock_irqsave(q->queue_lock, flags);
icq = icq_to_bic(ioc_lookup_icq(ioc, q));
- spin_unlock_irq(q->queue_lock);
+ spin_unlock_irqrestore(q->queue_lock, flags);

return icq;
}
@@ -239,7 +243,7 @@ static void bfq_schedule_dispatch(struct bfq_data *bfqd)
{
if (bfqd->queued != 0) {
bfq_log(bfqd, "schedule dispatch");
- kblockd_schedule_work(&bfqd->unplug_work);
+ blk_mq_run_hw_queues(bfqd->queue, true);
}
}

@@ -728,9 +732,9 @@ static int bfqq_process_refs(struct bfq_queue *bfqq)
{
int process_refs, io_refs;

- lockdep_assert_held(bfqq->bfqd->queue->queue_lock);
+ lockdep_assert_held(&bfqq->bfqd->lock);

- io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE];
+ io_refs = bfqq->allocated;
process_refs = bfqq->ref - io_refs - bfqq->entity.on_st;
BUG_ON(process_refs < 0);
return process_refs;
@@ -1441,6 +1445,8 @@ static void bfq_add_request(struct request *rq)
bfqq->queued[rq_is_sync(rq)]++;
bfqd->queued++;

+ BUG_ON(!RQ_BFQQ(rq));
+ BUG_ON(RQ_BFQQ(rq) != bfqq);
elv_rb_add(&bfqq->sort_list, rq);

/*
@@ -1449,6 +1455,8 @@ static void bfq_add_request(struct request *rq)
prev = bfqq->next_rq;
next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position);
BUG_ON(!next_rq);
+ BUG_ON(!RQ_BFQQ(next_rq));
+ BUG_ON(RQ_BFQQ(next_rq) != bfqq);
bfqq->next_rq = next_rq;

/*
@@ -1544,6 +1552,7 @@ static sector_t get_sdist(sector_t last_pos, struct request *rq)
return sdist;
}

+#if 0 /* Still not clear if we can do without next two functions */
static void bfq_activate_request(struct request_queue *q, struct request *rq)
{
struct bfq_data *bfqd = q->elevator->elevator_data;
@@ -1557,8 +1566,10 @@ static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
BUG_ON(bfqd->rq_in_driver == 0);
bfqd->rq_in_driver--;
}
+#endif

-static void bfq_remove_request(struct request *rq)
+static void bfq_remove_request(struct request_queue *q,
+ struct request *rq)
{
struct bfq_queue *bfqq = RQ_BFQQ(rq);
struct bfq_data *bfqd = bfqq->bfqd;
@@ -1569,6 +1580,19 @@ static void bfq_remove_request(struct request *rq)

if (bfqq->next_rq == rq) {
bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq);
+ if (bfqq->next_rq && !RQ_BFQQ(bfqq->next_rq)) {
+ pr_crit("no bfqq! for next rq %p bfqq %p\n",
+ bfqq->next_rq, bfqq);
+ }
+
+ BUG_ON(bfqq->next_rq && !RQ_BFQQ(bfqq->next_rq));
+ if (bfqq->next_rq && RQ_BFQQ(bfqq->next_rq) != bfqq) {
+ pr_crit(
+ "wrong bfqq! for next rq %p, rq_bfqq %p bfqq %p\n",
+ bfqq->next_rq, RQ_BFQQ(bfqq->next_rq), bfqq);
+ }
+ BUG_ON(bfqq->next_rq && RQ_BFQQ(bfqq->next_rq) != bfqq);
+
bfq_updated_next_req(bfqd, bfqq);
}

@@ -1579,6 +1603,10 @@ static void bfq_remove_request(struct request *rq)
bfqd->queued--;
elv_rb_del(&bfqq->sort_list, rq);

+ elv_rqhash_del(q, rq);
+ if (q->last_merge == rq)
+ q->last_merge = NULL;
+
if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
bfqq->next_rq = NULL;

@@ -1616,22 +1644,47 @@ static void bfq_remove_request(struct request *rq)
bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags);
}

-static int bfq_merge(struct request_queue *q, struct request **req,
- struct bio *bio)
+static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
+{
+ struct request_queue *q = hctx->queue;
+ struct bfq_data *bfqd = q->elevator->elevator_data;
+ struct request *free = NULL;
+ bool ret;
+
+ spin_lock_irq(&bfqd->lock);
+ ret = blk_mq_sched_try_merge(q, bio, &free);
+
+ /*
+ * XXX Not yet freeing without lock held, to avoid an
+ * inconsistency with respect to the lock-protected invocation
+ * of blk_mq_sched_try_insert_merge in bfq_insert_request. Waiting
+ * for clarifications from Jens.
+ */
+ if (free)
+ blk_mq_free_request(free);
+ spin_unlock_irq(&bfqd->lock);
+
+ return ret;
+}
+
+static int bfq_request_merge(struct request_queue *q, struct request **req,
+ struct bio *bio)
{
struct bfq_data *bfqd = q->elevator->elevator_data;
struct request *__rq;

- __rq = bfq_find_rq_fmerge(bfqd, bio);
+ __rq = bfq_find_rq_fmerge(bfqd, bio, q);
if (__rq && elv_bio_merge_ok(__rq, bio)) {
*req = __rq;
+ bfq_log(bfqd, "request_merge: req %p", __rq);
+
return ELEVATOR_FRONT_MERGE;
}

return ELEVATOR_NO_MERGE;
}

-static void bfq_merged_request(struct request_queue *q, struct request *req,
+static void bfq_request_merged(struct request_queue *q, struct request *req,
int type)
{
if (type == ELEVATOR_FRONT_MERGE &&
@@ -1645,13 +1698,23 @@ static void bfq_merged_request(struct request_queue *q, struct request *req,

/* Reposition request in its sort_list */
elv_rb_del(&bfqq->sort_list, req);
+ BUG_ON(!RQ_BFQQ(req));
+ BUG_ON(RQ_BFQQ(req) != bfqq);
elv_rb_add(&bfqq->sort_list, req);
+
+ spin_lock_irq(&bfqd->lock);
/* Choose next request to be served for bfqq */
prev = bfqq->next_rq;
next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req,
bfqd->last_position);
BUG_ON(!next_rq);
+
bfqq->next_rq = next_rq;
+
+ bfq_log_bfqq(bfqd, bfqq,
+ "requests_merged: req %p prev %p next_rq %p bfqq %p",
+ req, prev, next_rq, bfqq);
+
/*
* If next_rq changes, update both the queue's budget to
* fit the new request and the queue's position in its
@@ -1661,22 +1724,27 @@ static void bfq_merged_request(struct request_queue *q, struct request *req,
bfq_updated_next_req(bfqd, bfqq);
bfq_pos_tree_add_move(bfqd, bfqq);
}
+ spin_unlock_irq(&bfqd->lock);
}
}

-#ifdef BFQ_GROUP_IOSCHED_ENABLED
-static void bfq_bio_merged(struct request_queue *q, struct request *req,
- struct bio *bio)
-{
- bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_opf);
-}
-#endif
-
-static void bfq_merged_requests(struct request_queue *q, struct request *rq,
+static void bfq_requests_merged(struct request_queue *q, struct request *rq,
struct request *next)
{
struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next);

+ BUG_ON(!RQ_BFQQ(rq));
+ BUG_ON(!RQ_BFQQ(next));
+
+ if (!RB_EMPTY_NODE(&rq->rb_node))
+ goto end;
+
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "requests_merged: rq %p next %p bfqq %p next_bfqq %p",
+ rq, next, bfqq, next_bfqq);
+
+ spin_lock_irq(&bfqq->bfqd->lock);
+
/*
* If next and rq belong to the same bfq_queue and next is older
* than rq, then reposition rq in the fifo (by substituting next
@@ -1697,7 +1765,10 @@ static void bfq_merged_requests(struct request_queue *q, struct request *rq,
if (bfqq->next_rq == next)
bfqq->next_rq = rq;

- bfq_remove_request(next);
+ bfq_remove_request(q, next);
+
+ spin_unlock_irq(&bfqq->bfqd->lock);
+end:
bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);
}

@@ -1741,7 +1812,7 @@ static void bfq_end_wr(struct bfq_data *bfqd)
{
struct bfq_queue *bfqq;

- spin_lock_irq(bfqd->queue->queue_lock);
+ spin_lock_irq(&bfqd->lock);

list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
bfq_bfqq_end_wr(bfqq);
@@ -1749,7 +1820,7 @@ static void bfq_end_wr(struct bfq_data *bfqd)
bfq_bfqq_end_wr(bfqq);
bfq_end_wr_async(bfqd);

- spin_unlock_irq(bfqd->queue->queue_lock);
+ spin_unlock_irq(&bfqd->lock);
}

static sector_t bfq_io_struct_pos(void *io_struct, bool request)
@@ -2131,8 +2202,8 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
bfq_put_queue(bfqq);
}

-static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
- struct bio *bio)
+static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
+ struct bio *bio)
{
struct bfq_data *bfqd = q->elevator->elevator_data;
bool is_sync = op_is_sync(bio->bi_opf);
@@ -2150,7 +2221,7 @@ static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
* merge only if rq is queued there.
* Queue lock is held here.
*/
- bic = bfq_bic_lookup(bfqd, current->io_context);
+ bic = bfq_bic_lookup(bfqd, current->io_context, q);
if (!bic)
return false;

@@ -2175,12 +2246,6 @@ static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
return bfqq == RQ_BFQQ(rq);
}

-static int bfq_allow_rq_merge(struct request_queue *q, struct request *rq,
- struct request *next)
-{
- return RQ_BFQQ(rq) == RQ_BFQQ(next);
-}
-
/*
* Set the maximum time for the in-service queue to consume its
* budget. This prevents seeky processes from lowering the throughput.
@@ -2211,7 +2276,6 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
{
if (bfqq) {
bfqg_stats_update_avg_queue_size(bfqq_group(bfqq));
- bfq_mark_bfqq_must_alloc(bfqq);
bfq_clear_bfqq_fifo_expire(bfqq);

bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8;
@@ -2650,27 +2714,28 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
}

/*
- * Move request from internal lists to the dispatch list of the request queue
+ * Remove request from internal lists.
*/
-static void bfq_dispatch_insert(struct request_queue *q, struct request *rq)
+static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
{
struct bfq_queue *bfqq = RQ_BFQQ(rq);

/*
- * For consistency, the next instruction should have been executed
- * after removing the request from the queue and dispatching it.
- * We execute instead this instruction before bfq_remove_request()
- * (and hence introduce a temporary inconsistency), for efficiency.
- * In fact, in a forced_dispatch, this prevents two counters related
- * to bfqq->dispatched to risk to be uselessly decremented if bfqq
- * is not in service, and then to be incremented again after
- * incrementing bfqq->dispatched.
+ * For consistency, the next instruction should have been
+ * executed after removing the request from the queue and
+ * dispatching it. We execute instead this instruction before
+ * bfq_remove_request() (and hence introduce a temporary
+ * inconsistency), for efficiency. In fact, should this
+ * dispatch occur for a non in-service bfqq, this anticipated
+ * increment prevents two counters related to bfqq->dispatched
+ * from risking to be, first, uselessly decremented, and then
+ * incremented again when the (new) value of bfqq->dispatched
+ * happens to be taken into account.
*/
bfqq->dispatched++;
bfq_update_peak_rate(q->elevator->elevator_data, rq);

- bfq_remove_request(rq);
- elv_dispatch_sort(q, rq);
+ bfq_remove_request(q, rq);
}

static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
@@ -3534,7 +3599,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue");

if (bfq_may_expire_for_budg_timeout(bfqq) &&
- !hrtimer_active(&bfqd->idle_slice_timer) &&
+ !bfq_bfqq_wait_request(bfqq) &&
!bfq_bfqq_must_idle(bfqq))
goto expire;

@@ -3570,7 +3635,6 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
* arrives.
*/
if (bfq_bfqq_wait_request(bfqq)) {
- BUG_ON(!hrtimer_active(&bfqd->idle_slice_timer));
/*
* If we get here: 1) at least a new request
* has arrived but we have not disabled the
@@ -3597,7 +3661,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
* for a new request, or has requests waiting for a completion and
* may idle after their completion, then keep it anyway.
*/
- if (hrtimer_active(&bfqd->idle_slice_timer) ||
+ if (bfq_bfqq_wait_request(bfqq) ||
(bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) {
bfqq = NULL;
goto keep_queue;
@@ -3676,13 +3740,11 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
}

/*
- * Dispatch one request from bfqq, moving it to the request queue
- * dispatch list.
+ * Dispatch next request from bfqq.
*/
-static int bfq_dispatch_request(struct bfq_data *bfqd,
- struct bfq_queue *bfqq)
+static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq)
{
- int dispatched = 0;
struct request *rq = bfqq->next_rq;
unsigned long service_to_charge;

@@ -3698,7 +3760,7 @@ static int bfq_dispatch_request(struct bfq_data *bfqd,

BUG_ON(bfqq->entity.budget < bfqq->entity.service);

- bfq_dispatch_insert(bfqd->queue, rq);
+ bfq_dispatch_remove(bfqd->queue, rq);

/*
* If weight raising has to terminate for bfqq, then next
@@ -3714,86 +3776,66 @@ static int bfq_dispatch_request(struct bfq_data *bfqd,
bfq_update_wr_data(bfqd, bfqq);

bfq_log_bfqq(bfqd, bfqq,
- "dispatched %u sec req (%llu), budg left %d",
+ "dispatched %u sec req (%llu), budg left %d, new disp_nr %d",
blk_rq_sectors(rq),
(unsigned long long) blk_rq_pos(rq),
- bfq_bfqq_budget_left(bfqq));
-
- dispatched++;
+ bfq_bfqq_budget_left(bfqq),
+ bfqq->dispatched);

if (!bfqd->in_service_bic) {
atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount);
bfqd->in_service_bic = RQ_BIC(rq);
}

+ /*
+ * Expire bfqq, pretending that its budget expired, if bfqq
+ * belongs to CLASS_IDLE and other queues are waiting for
+ * service.
+ */
if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq))
goto expire;

- return dispatched;
+ return rq;

expire:
bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED);
- return dispatched;
-}
-
-static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq)
-{
- int dispatched = 0;
-
- while (bfqq->next_rq) {
- bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq);
- dispatched++;
- }
-
- BUG_ON(!list_empty(&bfqq->fifo));
- return dispatched;
+ return rq;
}

-/*
- * Drain our current requests.
- * Used for barriers and when switching io schedulers on-the-fly.
- */
-static int bfq_forced_dispatch(struct bfq_data *bfqd)
+static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
{
- struct bfq_queue *bfqq, *n;
- struct bfq_service_tree *st;
- int dispatched = 0;
+ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;

- bfqq = bfqd->in_service_queue;
- if (bfqq)
- __bfq_bfqq_expire(bfqd, bfqq);
+ bfq_log(bfqd, "has_work, dispatch_non_empty %d busy_queues %d",
+ !list_empty_careful(&bfqd->dispatch), bfqd->busy_queues > 0);

/*
- * Loop through classes, and be careful to leave the scheduler
- * in a consistent state, as feedback mechanisms and vtime
- * updates cannot be disabled during the process.
+ * Avoiding lock: a race on bfqd->busy_queues should cause at
+ * most a call to dispatch for nothing
*/
- list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) {
- st = bfq_entity_service_tree(&bfqq->entity);
-
- dispatched += __bfq_forced_dispatch_bfqq(bfqq);
-
- bfqq->max_budget = bfq_max_budget(bfqd);
- bfq_forget_idle(st);
- }
-
- BUG_ON(bfqd->busy_queues != 0);
-
- return dispatched;
+ return !list_empty_careful(&bfqd->dispatch) ||
+ bfqd->busy_queues > 0;
}

-static int bfq_dispatch_requests(struct request_queue *q, int force)
+static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
- struct bfq_data *bfqd = q->elevator->elevator_data;
- struct bfq_queue *bfqq;
+ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
+ struct request *rq = NULL;
+ struct bfq_queue *bfqq = NULL;
+
+ if (!list_empty(&bfqd->dispatch)) {
+ rq = list_first_entry(&bfqd->dispatch, struct request,
+ queuelist);
+ list_del_init(&rq->queuelist);
+ bfq_log(bfqd,
+ "dispatch requests: picked %p from dispatch list", rq);
+ goto exit;
+ }

bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues);

if (bfqd->busy_queues == 0)
- return 0;
-
- if (unlikely(force))
- return bfq_forced_dispatch(bfqd);
+ goto exit;

/*
* Force device to serve one request at a time if
@@ -3808,25 +3850,53 @@ static int bfq_dispatch_requests(struct request_queue *q, int force)
* throughput.
*/
if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
- return 0;
+ goto exit;

bfqq = bfq_select_queue(bfqd);
if (!bfqq)
- return 0;
+ goto exit;

BUG_ON(bfqq->entity.budget < bfqq->entity.service);

BUG_ON(bfq_bfqq_wait_request(bfqq));

- if (!bfq_dispatch_request(bfqd, bfqq))
- return 0;
-
- bfq_log_bfqq(bfqd, bfqq, "dispatched %s request",
- bfq_bfqq_sync(bfqq) ? "sync" : "async");
+ rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq);

BUG_ON(bfqq->next_rq == NULL &&
bfqq->entity.budget < bfqq->entity.service);
- return 1;
+exit:
+ if (rq) {
+ rq->rq_flags |= RQF_STARTED;
+ bfqd->rq_in_driver++;
+ if (bfqq)
+ bfq_log_bfqq(bfqd, bfqq,
+ "dispatched %s request %p, rq_in_driver %d",
+ bfq_bfqq_sync(bfqq) ? "sync" : "async",
+ rq,
+ bfqd->rq_in_driver);
+ else
+ bfq_log(bfqd,
+ "dispatched request %p from dispatch list, rq_in_driver %d",
+ rq, bfqd->rq_in_driver);
+ } else
+ bfq_log(bfqd,
+ "returned NULL request, rq_in_driver %d",
+ bfqd->rq_in_driver);
+
+ return rq;
+}
+
+
+static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
+{
+ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
+ struct request *rq;
+
+ spin_lock_irq(&bfqd->lock);
+ rq = __bfq_dispatch_request(hctx);
+ spin_unlock_irq(&bfqd->lock);
+
+ return rq;
}

/*
@@ -3843,13 +3913,15 @@ static void bfq_put_queue(struct bfq_queue *bfqq)

BUG_ON(bfqq->ref <= 0);

- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref);
+ if (bfqq->bfqd)
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref);
+
bfqq->ref--;
if (bfqq->ref)
return;

BUG_ON(rb_first(&bfqq->sort_list));
- BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0);
+ BUG_ON(bfqq->allocated != 0);
BUG_ON(bfqq->entity.tree);
BUG_ON(bfq_bfqq_busy(bfqq));

@@ -3864,7 +3936,8 @@ static void bfq_put_queue(struct bfq_queue *bfqq)
*/
hlist_del_init(&bfqq->burst_list_node);

- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq);
+ if (bfqq->bfqd)
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq);

kmem_cache_free(bfq_pool, bfqq);
#ifdef BFQ_GROUP_IOSCHED_ENABLED
@@ -3905,29 +3978,53 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
bfq_put_queue(bfqq);
}

-static void bfq_exit_icq(struct io_cq *icq)
+static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync)
{
- struct bfq_io_cq *bic = icq_to_bic(icq);
- struct bfq_data *bfqd = bic_to_bfqd(bic);
+ struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);
+ struct bfq_data *bfqd;

- if (bic_to_bfqq(bic, false)) {
- bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false));
- bic_set_bfqq(bic, NULL, false);
- }
+ if (bfqq)
+ bfqd = bfqq->bfqd; /* NULL if scheduler already exited */

- if (bic_to_bfqq(bic, true)) {
+ if (bfqq && bfqd) {
+ spin_lock_irq(&bfqd->lock);
/*
* If the bic is using a shared queue, put the reference
* taken on the io_context when the bic started using a
* shared bfq_queue.
*/
- if (bfq_bfqq_coop(bic_to_bfqq(bic, true)))
- put_io_context(icq->ioc);
- bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true));
- bic_set_bfqq(bic, NULL, true);
+ if (is_sync && bfq_bfqq_coop(bfqq))
+ put_io_context(bic->icq.ioc);
+ bfq_exit_bfqq(bfqd, bfqq);
+ bic_set_bfqq(bic, NULL, is_sync);
+ spin_unlock_irq(&bfqd->lock);
}
}

+static void bfq_exit_icq_body(struct work_struct *work)
+{
+ struct bfq_io_cq *bic =
+ container_of(work, struct bfq_io_cq, exit_icq_work);
+
+ bfq_exit_icq_bfqq(bic, true);
+ bfq_exit_icq_bfqq(bic, false);
+}
+
+static void bfq_init_icq(struct io_cq *icq)
+{
+ struct bfq_io_cq *bic = icq_to_bic(icq);
+
+ INIT_WORK(&bic->exit_icq_work, bfq_exit_icq_body);
+}
+
+static void bfq_exit_icq(struct io_cq *icq)
+{
+ struct bfq_io_cq *bic = icq_to_bic(icq);
+
+ BUG_ON(!bic);
+ kblockd_schedule_work(&bic->exit_icq_work);
+}
+
/*
* Update the entity prio values; note that the new values will not
* be used until the next (re)activation.
@@ -3937,6 +4034,11 @@ static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq,
{
struct task_struct *tsk = current;
int ioprio_class;
+ struct bfq_data *bfqd = bfqq->bfqd;
+
+ WARN_ON(!bfqd);
+ if (!bfqd)
+ return;

ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
switch (ioprio_class) {
@@ -4017,6 +4119,8 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
INIT_HLIST_NODE(&bfqq->burst_list_node);
BUG_ON(!hlist_unhashed(&bfqq->burst_list_node));

+ spin_lock_init(&bfqq->lock);
+
bfqq->ref = 0;
bfqq->bfqd = bfqd;

@@ -4273,21 +4377,17 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
if (budget_timeout)
bfq_bfqq_expire(bfqd, bfqq, false,
BFQ_BFQQ_BUDGET_TIMEOUT);
-
- /*
- * Let the request rip immediately, or let a new queue be
- * selected if bfqq has just been expired.
- */
- __blk_run_queue(bfqd->queue);
}
}

-static void bfq_insert_request(struct request_queue *q, struct request *rq)
+
+static void __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
{
- struct bfq_data *bfqd = q->elevator->elevator_data;
struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq;

- assert_spin_locked(bfqd->queue->queue_lock);
+ assert_spin_locked(&bfqd->lock);
+
+ bfq_log_bfqq(bfqd, bfqq, "__insert_req: rq %p bfqq %p", rq, bfqq);

/*
* An unplug may trigger a requeue of a request from the device
@@ -4303,8 +4403,14 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq)
* Release the request's reference to the old bfqq
* and make sure one is taken to the shared queue.
*/
- new_bfqq->allocated[rq_data_dir(rq)]++;
- bfqq->allocated[rq_data_dir(rq)]--;
+ new_bfqq->allocated++;
+ bfqq->allocated--;
+ bfq_log_bfqq(bfqd, bfqq,
+ "insert_request: new allocated %d", bfqq->allocated);
+ bfq_log_bfqq(bfqd, new_bfqq,
+ "insert_request: new_bfqq new allocated %d",
+ bfqq->allocated);
+
new_bfqq->ref++;
bfq_clear_bfqq_just_created(bfqq);
bfq_put_queue(bfqq);
@@ -4324,6 +4430,55 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq)
bfq_rq_enqueued(bfqd, bfqq, rq);
}

+static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
+ bool at_head)
+{
+ struct request_queue *q = hctx->queue;
+ struct bfq_data *bfqd = q->elevator->elevator_data;
+
+ spin_lock_irq(&bfqd->lock);
+ if (blk_mq_sched_try_insert_merge(q, rq))
+ goto done;
+ spin_unlock_irq(&bfqd->lock);
+
+ blk_mq_sched_request_inserted(rq);
+
+ spin_lock_irq(&bfqd->lock);
+ if (at_head || blk_rq_is_passthrough(rq)) {
+ struct bfq_queue *bfqq = RQ_BFQQ(rq);
+
+ if (at_head)
+ list_add(&rq->queuelist, &bfqd->dispatch);
+ else
+ list_add_tail(&rq->queuelist, &bfqd->dispatch);
+
+ if (bfqq)
+ bfqq->dispatched++;
+ } else {
+ __bfq_insert_request(bfqd, rq);
+
+ if (rq_mergeable(rq)) {
+ elv_rqhash_add(q, rq);
+ if (!q->last_merge)
+ q->last_merge = rq;
+ }
+ }
+done:
+ spin_unlock_irq(&bfqd->lock);
+}
+
+static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
+ struct list_head *list, bool at_head)
+{
+ while (!list_empty(list)) {
+ struct request *rq;
+
+ rq = list_first_entry(list, struct request, queuelist);
+ list_del_init(&rq->queuelist);
+ bfq_insert_request(hctx, rq, at_head);
+ }
+}
+
static void bfq_update_hw_tag(struct bfq_data *bfqd)
{
bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
@@ -4349,27 +4504,21 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd)
bfqd->hw_tag_samples = 0;
}

-static void bfq_completed_request(struct request_queue *q, struct request *rq)
+static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
{
- struct bfq_queue *bfqq = RQ_BFQQ(rq);
- struct bfq_data *bfqd = bfqq->bfqd;
u64 now_ns;
u32 delta_us;

- bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left",
- blk_rq_sectors(rq));
-
- assert_spin_locked(bfqd->queue->queue_lock);
bfq_update_hw_tag(bfqd);

BUG_ON(!bfqd->rq_in_driver);
BUG_ON(!bfqq->dispatched);
bfqd->rq_in_driver--;
bfqq->dispatched--;
- bfqg_stats_update_completion(bfqq_group(bfqq),
- rq_start_time_ns(rq),
- rq_io_start_time_ns(rq),
- rq->cmd_flags);
+
+ bfq_log_bfqq(bfqd, bfqq,
+ "completed_requests: new disp %d, new rq_in_driver %d",
+ bfqq->dispatched, bfqd->rq_in_driver);

if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list));
@@ -4395,7 +4544,8 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq)
*/
delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC);

- bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu",
+ bfq_log_bfqq(bfqd, bfqq,
+ "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu",
delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size,
(USEC_PER_SEC*
(u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us))
@@ -4445,7 +4595,7 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq)
if (bfqd->in_service_queue == bfqq) {
if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) {
bfq_arm_slice_timer(bfqd);
- goto out;
+ return;
} else if (bfq_may_expire_for_budg_timeout(bfqq))
bfq_bfqq_expire(bfqd, bfqq, false,
BFQ_BFQQ_BUDGET_TIMEOUT);
@@ -4455,68 +4605,81 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq)
bfq_bfqq_expire(bfqd, bfqq, false,
BFQ_BFQQ_NO_MORE_REQUESTS);
}
-
- if (!bfqd->rq_in_driver)
- bfq_schedule_dispatch(bfqd);
-
-out:
- return;
}

-static int __bfq_may_queue(struct bfq_queue *bfqq)
+static void bfq_put_rq_priv_body(struct bfq_queue *bfqq)
{
- if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) {
- bfq_clear_bfqq_must_alloc(bfqq);
- return ELV_MQUEUE_MUST;
- }
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "put_request_body: allocated %d", bfqq->allocated);
+ BUG_ON(!bfqq->allocated);
+ bfqq->allocated--;

- return ELV_MQUEUE_MAY;
+ bfq_put_queue(bfqq);
}

-static int bfq_may_queue(struct request_queue *q, unsigned int op)
+static void bfq_put_rq_private(struct request_queue *q, struct request *rq)
{
- struct bfq_data *bfqd = q->elevator->elevator_data;
- struct task_struct *tsk = current;
- struct bfq_io_cq *bic;
struct bfq_queue *bfqq;
+ struct bfq_data *bfqd;
+ struct bfq_io_cq *bic;

- /*
- * Don't force setup of a queue from here, as a call to may_queue
- * does not necessarily imply that a request actually will be
- * queued. So just lookup a possibly existing queue, or return
- * 'may queue' if that fails.
- */
- bic = bfq_bic_lookup(bfqd, tsk->io_context);
- if (!bic)
- return ELV_MQUEUE_MAY;
+ BUG_ON(!rq);
+ bfqq = RQ_BFQQ(rq);
+ BUG_ON(!bfqq);

- bfqq = bic_to_bfqq(bic, op_is_sync(op));
- if (bfqq)
- return __bfq_may_queue(bfqq);
+ bic = RQ_BIC(rq);
+ BUG_ON(!bic);

- return ELV_MQUEUE_MAY;
-}
+ bfqd = bfqq->bfqd;
+ BUG_ON(!bfqd);

-/*
- * Queue lock held here.
- */
-static void bfq_put_request(struct request *rq)
-{
- struct bfq_queue *bfqq = RQ_BFQQ(rq);
+ BUG_ON(rq->rq_flags & RQF_QUEUED);
+ BUG_ON(!(rq->rq_flags & RQF_ELVPRIV));

- if (bfqq) {
- const int rw = rq_data_dir(rq);
+ bfq_log_bfqq(bfqd, bfqq,
+ "putting rq %p with %u sects left, STARTED %d",
+ rq, blk_rq_sectors(rq),
+ rq->rq_flags & RQF_STARTED);

- BUG_ON(!bfqq->allocated[rw]);
- bfqq->allocated[rw]--;
+ if (rq->rq_flags & RQF_STARTED)
+ bfqg_stats_update_completion(bfqq_group(bfqq),
+ rq_start_time_ns(rq),
+ rq_io_start_time_ns(rq),
+ rq->cmd_flags);

- rq->elv.priv[0] = NULL;
- rq->elv.priv[1] = NULL;
+ BUG_ON(blk_rq_sectors(rq) == 0 && !(rq->rq_flags & RQF_STARTED));

- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d",
- bfqq, bfqq->ref);
- bfq_put_queue(bfqq);
+ if (likely(rq->rq_flags & RQF_STARTED)) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&bfqd->lock, flags);
+
+ bfq_completed_request(bfqq, bfqd);
+ bfq_put_rq_priv_body(bfqq);
+
+ spin_unlock_irqrestore(&bfqd->lock, flags);
+ } else {
+ /*
+ * Request rq may be still/already in the scheduler,
+ * in which case we need to remove it. And we cannot
+ * defer such a check and removal, to avoid
+ * inconsistencies in the time interval from the end
+ * of this function to the start of the deferred work.
+ * Fortunately, this situation occurs only in process
+ * context, so taking the scheduler lock does not
+ * cause any deadlock, even if other locks are already
+ * (correctly) held by this process.
+ */
+ BUG_ON(in_interrupt());
+
+ assert_spin_locked(&bfqd->lock);
+ if (!RB_EMPTY_NODE(&rq->rb_node))
+ bfq_remove_request(q, rq);
+ bfq_put_rq_priv_body(bfqq);
}
+
+ rq->elv.priv[0] = NULL;
+ rq->elv.priv[1] = NULL;
}

/*
@@ -4548,18 +4711,17 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
/*
* Allocate bfq data structures associated with this request.
*/
-static int bfq_set_request(struct request_queue *q, struct request *rq,
- struct bio *bio, gfp_t gfp_mask)
+static int bfq_get_rq_private(struct request_queue *q, struct request *rq,
+ struct bio *bio)
{
struct bfq_data *bfqd = q->elevator->elevator_data;
struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq);
- const int rw = rq_data_dir(rq);
const int is_sync = rq_is_sync(rq);
struct bfq_queue *bfqq;
- unsigned long flags;
bool split = false;

- spin_lock_irqsave(q->queue_lock, flags);
+ spin_lock_irq(&bfqd->lock);
+
bfq_check_ioprio_change(bic, bio);

if (!bic)
@@ -4578,7 +4740,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
bic_set_bfqq(bic, bfqq, is_sync);
if (split && is_sync) {
bfq_log_bfqq(bfqd, bfqq,
- "set_request: was_in_list %d "
+ "get_request: was_in_list %d "
"was_in_large_burst %d "
"large burst in progress %d",
bic->was_in_burst_list,
@@ -4588,12 +4750,12 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
if ((bic->was_in_burst_list && bfqd->large_burst) ||
bic->saved_in_large_burst) {
bfq_log_bfqq(bfqd, bfqq,
- "set_request: marking in "
+ "get_request: marking in "
"large burst");
bfq_mark_bfqq_in_large_burst(bfqq);
} else {
bfq_log_bfqq(bfqd, bfqq,
- "set_request: clearing in "
+ "get_request: clearing in "
"large burst");
bfq_clear_bfqq_in_large_burst(bfqq);
if (bic->was_in_burst_list)
@@ -4618,9 +4780,12 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
}
}

- bfqq->allocated[rw]++;
+ bfqq->allocated++;
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "get_request: new allocated %d", bfqq->allocated);
+
bfqq->ref++;
- bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref);
+ bfq_log_bfqq(bfqd, bfqq, "get_request: bfqq %p, %d", bfqq, bfqq->ref);

rq->elv.priv[0] = bic;
rq->elv.priv[1] = bfqq;
@@ -4647,26 +4812,55 @@ static int bfq_set_request(struct request_queue *q, struct request *rq,
if (unlikely(bfq_bfqq_just_created(bfqq)))
bfq_handle_burst(bfqd, bfqq);

- spin_unlock_irqrestore(q->queue_lock, flags);
+ spin_unlock_irq(&bfqd->lock);

return 0;

queue_fail:
- bfq_schedule_dispatch(bfqd);
- spin_unlock_irqrestore(q->queue_lock, flags);
+ spin_unlock_irq(&bfqd->lock);

return 1;
}

-static void bfq_kick_queue(struct work_struct *work)
+static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq)
{
- struct bfq_data *bfqd =
- container_of(work, struct bfq_data, unplug_work);
- struct request_queue *q = bfqd->queue;
+ struct bfq_data *bfqd = bfqq->bfqd;
+ enum bfqq_expiration reason;
+ unsigned long flags;
+
+ BUG_ON(!bfqd);
+ spin_lock_irqsave(&bfqd->lock, flags);
+ bfq_log_bfqq(bfqd, bfqq, "handling slice_timer expiration");
+ bfq_clear_bfqq_wait_request(bfqq);

- spin_lock_irq(q->queue_lock);
- __blk_run_queue(q);
- spin_unlock_irq(q->queue_lock);
+ if (bfqq != bfqd->in_service_queue) {
+ spin_unlock_irqrestore(&bfqd->lock, flags);
+ return;
+ }
+
+ if (bfq_bfqq_budget_timeout(bfqq))
+ /*
+ * Also here the queue can be safely expired
+ * for budget timeout without wasting
+ * guarantees
+ */
+ reason = BFQ_BFQQ_BUDGET_TIMEOUT;
+ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
+ /*
+ * The queue may not be empty upon timer expiration,
+ * because we may not disable the timer when the
+ * first request of the in-service queue arrives
+ * during disk idling.
+ */
+ reason = BFQ_BFQQ_TOO_IDLE;
+ else
+ goto schedule_dispatch;
+
+ bfq_bfqq_expire(bfqd, bfqq, true, reason);
+
+schedule_dispatch:
+ spin_unlock_irqrestore(&bfqd->lock, flags);
+ bfq_schedule_dispatch(bfqd);
}

/*
@@ -4677,59 +4871,24 @@ static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer)
{
struct bfq_data *bfqd = container_of(timer, struct bfq_data,
idle_slice_timer);
- struct bfq_queue *bfqq;
- unsigned long flags;
- enum bfqq_expiration reason;
+ struct bfq_queue *bfqq = bfqd->in_service_queue;

- spin_lock_irqsave(bfqd->queue->queue_lock, flags);
+ bfq_log(bfqd, "slice_timer expired");

- bfqq = bfqd->in_service_queue;
/*
* Theoretical race here: the in-service queue can be NULL or
- * different from the queue that was idling if the timer handler
- * spins on the queue_lock and a new request arrives for the
- * current queue and there is a full dispatch cycle that changes
- * the in-service queue. This can hardly happen, but in the worst
- * case we just expire a queue too early.
+ * different from the queue that was idling if a new request
+ * arrives for the current queue and there is a full dispatch
+ * cycle that changes the in-service queue. This can hardly
+ * happen, but in the worst case we just expire a queue too
+ * early.
*/
- if (bfqq) {
- bfq_log_bfqq(bfqd, bfqq, "slice_timer expired");
- bfq_clear_bfqq_wait_request(bfqq);
-
- if (bfq_bfqq_budget_timeout(bfqq))
- /*
- * Also here the queue can be safely expired
- * for budget timeout without wasting
- * guarantees
- */
- reason = BFQ_BFQQ_BUDGET_TIMEOUT;
- else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0)
- /*
- * The queue may not be empty upon timer expiration,
- * because we may not disable the timer when the
- * first request of the in-service queue arrives
- * during disk idling.
- */
- reason = BFQ_BFQQ_TOO_IDLE;
- else
- goto schedule_dispatch;
-
- bfq_bfqq_expire(bfqd, bfqq, true, reason);
- }
-
-schedule_dispatch:
- bfq_schedule_dispatch(bfqd);
+ if (bfqq)
+ bfq_idle_slice_timer_body(bfqq);

- spin_unlock_irqrestore(bfqd->queue->queue_lock, flags);
return HRTIMER_NORESTART;
}

-static void bfq_shutdown_timer_wq(struct bfq_data *bfqd)
-{
- hrtimer_cancel(&bfqd->idle_slice_timer);
- cancel_work_sync(&bfqd->unplug_work);
-}
-
static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
struct bfq_queue **bfqq_ptr)
{
@@ -4766,30 +4925,44 @@ static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
static void bfq_exit_queue(struct elevator_queue *e)
{
struct bfq_data *bfqd = e->elevator_data;
- struct request_queue *q = bfqd->queue;
struct bfq_queue *bfqq, *n;

- bfq_shutdown_timer_wq(bfqd);
+ bfq_log(bfqd, "exit_queue: starting ...");

- spin_lock_irq(q->queue_lock);
+ hrtimer_cancel(&bfqd->idle_slice_timer);

BUG_ON(bfqd->in_service_queue);
- list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list)
- bfq_deactivate_bfqq(bfqd, bfqq, false, false);
+ BUG_ON(!list_empty(&bfqd->active_list));

- spin_unlock_irq(q->queue_lock);
+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) {
+ if (bfqq->bic) /* bfqqs without bic are handled below */
+ cancel_work_sync(&bfqq->bic->exit_icq_work);
+ }

- bfq_shutdown_timer_wq(bfqd);
+ spin_lock_irq(&bfqd->lock);
+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) {
+ bfq_deactivate_bfqq(bfqd, bfqq, false, false);
+ /*
+ * Make sure that deferred exit_icq_work completes
+ * without errors for bfq_queues without bic
+ */
+ if (!bfqq->bic)
+ bfqq->bfqd = NULL;
+ }
+ spin_unlock_irq(&bfqd->lock);
+
+ hrtimer_cancel(&bfqd->idle_slice_timer);

BUG_ON(hrtimer_active(&bfqd->idle_slice_timer));

#ifdef BFQ_GROUP_IOSCHED_ENABLED
- blkcg_deactivate_policy(q, &blkcg_policy_bfq);
+ blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
#else
bfq_put_async_queues(bfqd, bfqd->root_group);
kfree(bfqd->root_group);
#endif

+ bfq_log(bfqd, "exit_queue: finished ...");
kfree(bfqd);
}

@@ -4848,10 +5021,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)

bfqd->queue = q;

- spin_lock_irq(q->queue_lock);
- q->elevator = eq;
- spin_unlock_irq(q->queue_lock);
-
bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node);
if (!bfqd->root_group)
goto out_free;
@@ -4865,8 +5034,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
bfqd->queue_weights_tree = RB_ROOT;
bfqd->group_weights_tree = RB_ROOT;

- INIT_WORK(&bfqd->unplug_work, bfq_kick_queue);
-
INIT_LIST_HEAD(&bfqd->active_list);
INIT_LIST_HEAD(&bfqd->idle_list);
INIT_HLIST_HEAD(&bfqd->burst_list);
@@ -4915,6 +5082,11 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
bfqd->device_speed = BFQ_BFQD_FAST;

+ spin_lock_init(&bfqd->lock);
+ INIT_LIST_HEAD(&bfqd->dispatch);
+
+ q->elevator = eq;
+
return 0;

out_free:
@@ -4971,7 +5143,7 @@ static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n",
bfqd->queued);

- spin_lock_irq(bfqd->queue->queue_lock);
+ spin_lock_irq(&bfqd->lock);

num_char += sprintf(page + num_char, "Active:\n");
list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) {
@@ -5000,7 +5172,7 @@ static ssize_t bfq_weights_show(struct elevator_queue *e, char *page)
jiffies_to_msecs(bfqq->wr_cur_max_time));
}

- spin_unlock_irq(bfqd->queue->queue_lock);
+ spin_unlock_irq(&bfqd->lock);

return num_char;
}
@@ -5208,35 +5380,31 @@ static struct elv_fs_entry bfq_attrs[] = {
__ATTR_NULL
};

-static struct elevator_type iosched_bfq = {
- .ops.sq = {
- .elevator_merge_fn = bfq_merge,
- .elevator_merged_fn = bfq_merged_request,
- .elevator_merge_req_fn = bfq_merged_requests,
-#ifdef BFQ_GROUP_IOSCHED_ENABLED
- .elevator_bio_merged_fn = bfq_bio_merged,
-#endif
- .elevator_allow_bio_merge_fn = bfq_allow_bio_merge,
- .elevator_allow_rq_merge_fn = bfq_allow_rq_merge,
- .elevator_dispatch_fn = bfq_dispatch_requests,
- .elevator_add_req_fn = bfq_insert_request,
- .elevator_activate_req_fn = bfq_activate_request,
- .elevator_deactivate_req_fn = bfq_deactivate_request,
- .elevator_completed_req_fn = bfq_completed_request,
- .elevator_former_req_fn = elv_rb_former_request,
- .elevator_latter_req_fn = elv_rb_latter_request,
- .elevator_init_icq_fn = bfq_init_icq,
- .elevator_exit_icq_fn = bfq_exit_icq,
- .elevator_set_req_fn = bfq_set_request,
- .elevator_put_req_fn = bfq_put_request,
- .elevator_may_queue_fn = bfq_may_queue,
- .elevator_init_fn = bfq_init_queue,
- .elevator_exit_fn = bfq_exit_queue,
+static struct elevator_type iosched_bfq_mq = {
+ .ops.mq = {
+ .get_rq_priv = bfq_get_rq_private,
+ .put_rq_priv = bfq_put_rq_private,
+ .init_icq = bfq_init_icq,
+ .exit_icq = bfq_exit_icq,
+ .insert_requests = bfq_insert_requests,
+ .dispatch_request = bfq_dispatch_request,
+ .next_request = elv_rb_latter_request,
+ .former_request = elv_rb_former_request,
+ .allow_merge = bfq_allow_bio_merge,
+ .bio_merge = bfq_bio_merge,
+ .request_merge = bfq_request_merge,
+ .requests_merged = bfq_requests_merged,
+ .request_merged = bfq_request_merged,
+ .has_work = bfq_has_work,
+ .init_sched = bfq_init_queue,
+ .exit_sched = bfq_exit_queue,
},
+
+ .uses_mq = true,
.icq_size = sizeof(struct bfq_io_cq),
.icq_align = __alignof__(struct bfq_io_cq),
.elevator_attrs = bfq_attrs,
- .elevator_name = "bfq",
+ .elevator_name = "bfq-mq",
.elevator_owner = THIS_MODULE,
};

@@ -5261,7 +5429,7 @@ static struct blkcg_policy blkcg_policy_bfq = {
static int __init bfq_init(void)
{
int ret;
- char msg[60] = "BFQ I/O-scheduler: v8r8-rc2";
+ char msg[60] = "BFQ-MQ I/O-scheduler: v8r8-rc2";

#ifdef BFQ_GROUP_IOSCHED_ENABLED
ret = blkcg_policy_register(&blkcg_policy_bfq);
@@ -5306,7 +5474,7 @@ static int __init bfq_init(void)
device_speed_thresh[0] = (4 * R_slow[0]) / 3;
device_speed_thresh[1] = (4 * R_slow[1]) / 3;

- ret = elv_register(&iosched_bfq);
+ ret = elv_register(&iosched_bfq_mq);
if (ret)
goto err_pol_unreg;

@@ -5326,8 +5494,8 @@ static int __init bfq_init(void)

static void __exit bfq_exit(void)
{
- elv_unregister(&iosched_bfq);
-#ifdef BFQ_GROUP_IOSCHED_ENABLED
+ elv_unregister(&iosched_bfq_mq);
+#ifdef BFQ_GROUP_IOSCHED_ENABLED
blkcg_policy_unregister(&blkcg_policy_bfq);
#endif
bfq_slab_kill();
@@ -5336,5 +5504,6 @@ static void __exit bfq_exit(void)
module_init(bfq_init);
module_exit(bfq_exit);

-MODULE_AUTHOR("Arianna Avanzini, Fabio Checconi, Paolo Valente");
+MODULE_AUTHOR("Paolo Valente");
MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("MQ Budget Fair IO scheduler");
diff --git a/block/bfq-mq.h b/block/bfq-mq.h
index c6acee2..6e1c0d8 100644
--- a/block/bfq-mq.h
+++ b/block/bfq-mq.h
@@ -1,5 +1,5 @@
/*
- * BFQ v8r8-rc2 for 4.10.0: data structures and common functions prototypes.
+ * BFQ-MQ v8r8-rc2 for 4.10.0: data structures and common functions prototypes.
*
* Based on ideas and code from CFQ:
* Copyright (C) 2003 Jens Axboe <axboe@xxxxxxxxx>
@@ -21,15 +21,8 @@
#include <linux/rbtree.h>
#include <linux/blk-cgroup.h>

-/*
- * Define an alternative macro to compile cgroups support. This is one
- * of the steps needed to let bfq-mq share the files bfq-sched.c and
- * bfq-cgroup.c with bfq. For bfq-mq, the macro
- * BFQ_GROUP_IOSCHED_ENABLED will be defined as a function of whether
- * the configuration option CONFIG_BFQ_MQ_GROUP_IOSCHED, and not
- * CONFIG_BFQ_GROUP_IOSCHED, is defined.
- */
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
+/* see comments on CONFIG_BFQ_GROUP_IOSCHED in bfq.h */
+#ifdef CONFIG_BFQ_MQ_GROUP_IOSCHED
#define BFQ_GROUP_IOSCHED_ENABLED
#endif

@@ -248,8 +241,8 @@ struct bfq_queue {
struct request *next_rq;
/* number of sync and async requests queued */
int queued[2];
- /* number of sync and async requests currently allocated */
- int allocated[2];
+ /* number of requests currently allocated */
+ int allocated;
/* number of pending metadata requests */
int meta_pending;
/* fifo list of requests in sort_list */
@@ -334,6 +327,8 @@ struct bfq_queue {
unsigned long wr_start_at_switch_to_srt;

unsigned long split_time; /* time of last split */
+
+ spinlock_t lock;
};

/**
@@ -350,6 +345,9 @@ struct bfq_io_cq {
uint64_t blkcg_serial_nr; /* the current blkcg serial */
#endif

+ /* delayed work to exec the body of the exit_icq handler */
+ struct work_struct exit_icq_work;
+
/*
* Snapshot of the idle window before merging; taken to
* remember this value while the queue is merged, so as to be
@@ -391,11 +389,13 @@ enum bfq_device_speed {
/**
* struct bfq_data - per-device data structure.
*
- * All the fields are protected by the @queue lock.
+ * All the fields are protected by @lock.
*/
struct bfq_data {
- /* request queue for the device */
+ /* device request queue */
struct request_queue *queue;
+ /* dispatch queue */
+ struct list_head dispatch;

/* root bfq_group for the device */
struct bfq_group *root_group;
@@ -449,8 +449,6 @@ struct bfq_data {
* the queue in service.
*/
struct hrtimer idle_slice_timer;
- /* delayed work to restart dispatching on the request queue */
- struct work_struct unplug_work;

/* bfq_queue in service */
struct bfq_queue *in_service_queue;
@@ -603,6 +601,8 @@ struct bfq_data {

/* fallback dummy bfqq for extreme OOM conditions */
struct bfq_queue oom_bfqq;
+
+ spinlock_t lock;
};

enum bfqq_state_flags {
@@ -613,7 +613,6 @@ enum bfqq_state_flags {
* waiting for a request
* without idling the device
*/
- BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */
BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */
BFQ_BFQQ_FLAG_sync, /* synchronous queue */
@@ -652,7 +651,6 @@ BFQ_BFQQ_FNS(just_created);
BFQ_BFQQ_FNS(busy);
BFQ_BFQQ_FNS(wait_request);
BFQ_BFQQ_FNS(non_blocking_wait_rq);
-BFQ_BFQQ_FNS(must_alloc);
BFQ_BFQQ_FNS(fifo_expire);
BFQ_BFQQ_FNS(idle_window);
BFQ_BFQQ_FNS(sync);
@@ -672,7 +670,6 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
char __pbuf[128]; \
\
- assert_spin_locked((bfqd)->queue->queue_lock); \
blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
pr_crit("bfq%d%c %s " fmt "\n", \
(bfqq)->pid, \
@@ -708,7 +705,6 @@ static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
char __pbuf[128]; \
\
- assert_spin_locked((bfqd)->queue->queue_lock); \
blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \
blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \
(bfqq)->pid, \
@@ -935,7 +931,6 @@ static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)

static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio);
static void bfq_put_queue(struct bfq_queue *bfqq);
-static void bfq_dispatch_insert(struct request_queue *q, struct request *rq);
static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
struct bio *bio, bool is_sync,
struct bfq_io_cq *bic);
--
2.10.0