[PATCH v2 1/1] block: fix blk_queue_split() resource exhaustion

From: Lars Ellenberg
Date: Mon Jul 11 2016 - 10:10:53 EST


For a long time, generic_make_request() converts recursion into
iteration by queuing recursive arguments on current->bio_list.

This is convenient for stacking drivers,
the top-most driver would take the originally submitted bio,
and re-submit a re-mapped version of it, or one or more clones,
or one or more new allocated bios to its backend(s). Which
are then simply processed in turn, and each can again queue
more "backend-bios" until we reach the bottom of the driver stack,
and actually dispatch to the real backend device.

Any stacking driver ->make_request_fn() could expect that,
once it returns, any backend-bios it submitted via recursive calls
to generic_make_request() would now be processed and dispatched, before
the current task would call into this driver again.

This is changed by commit
54efd50 block: make generic_make_request handle arbitrarily sized bios

Drivers may call blk_queue_split() inside their ->make_request_fn(),
which may split the current bio into a front-part to be dealt with
immediately, and a remainder-part, which may need to be split even
further. That remainder-part will simply also be pushed to
current->bio_list, and would end up being head-of-queue, in front
of any backend-bios the current make_request_fn() might submit during
processing of the fron-part.

Which means the current task would immediately end up back in the same
make_request_fn() of the same driver again, before any of its backend
bios have even been processed.

This can lead to resource starvation deadlock.
Drivers could avoid this by learning to not need blk_queue_split(),
or by submitting their backend bios in a different context (dedicated
kernel thread, work_queue context, ...). Or by playing funny re-ordering
games with entries on current->bio_list.

Instead, I suggest to distinguish between recursive calls to
generic_make_request(), and pushing back the remainder part in
blk_queue_split(), by pointing current->bio_lists to a
struct recursion_to_iteration_bio_lists {
struct bio_list recursion;
struct bio_list queue;
}

By providing each q->make_request_fn() with an empty "recursion"
bio_list, then merging any recursively submitted bios to the
head of the "queue" list, we can make the recursion-to-iteration
logic in generic_make_request() process deepest level bios first,
and "sibling" bios of the same level in "natural" order.

Signed-off-by: Lars Ellenberg <lars.ellenberg@xxxxxxxxxx>
Signed-off-by: Roland Kammerer <roland.kammerer@xxxxxxxxxx>
---
block/bio.c | 20 +++++++++++--------
block/blk-core.c | 49 +++++++++++++++++++++++++----------------------
block/blk-merge.c | 5 ++++-
drivers/md/bcache/btree.c | 12 ++++++------
drivers/md/dm-bufio.c | 2 +-
drivers/md/raid1.c | 5 ++---
drivers/md/raid10.c | 5 ++---
include/linux/bio.h | 25 ++++++++++++++++++++++++
include/linux/sched.h | 4 ++--
9 files changed, 80 insertions(+), 47 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 848cd35..c2606fd 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -366,12 +366,16 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
*/

bio_list_init(&punt);
- bio_list_init(&nopunt);

- while ((bio = bio_list_pop(current->bio_list)))
+ bio_list_init(&nopunt);
+ while ((bio = bio_list_pop(&current->bio_lists->recursion)))
bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
+ current->bio_lists->recursion = nopunt;

- *current->bio_list = nopunt;
+ bio_list_init(&nopunt);
+ while ((bio = bio_list_pop(&current->bio_lists->queue)))
+ bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
+ current->bio_lists->queue = nopunt;

spin_lock(&bs->rescue_lock);
bio_list_merge(&bs->rescue_list, &punt);
@@ -453,13 +457,13 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
*
* We solve this, and guarantee forward progress, with a rescuer
* workqueue per bio_set. If we go to allocate and there are
- * bios on current->bio_list, we first try the allocation
- * without __GFP_DIRECT_RECLAIM; if that fails, we punt those
- * bios we would be blocking to the rescuer workqueue before
- * we retry with the original gfp_flags.
+ * bios on current->bio_lists->{recursion,queue}, we first try the
+ * allocation without __GFP_DIRECT_RECLAIM; if that fails, we
+ * punt those bios we would be blocking to the rescuer
+ * workqueue before we retry with the original gfp_flags.
*/

- if (current->bio_list && !bio_list_empty(current->bio_list))
+ if (current_has_pending_bios())
gfp_mask &= ~__GFP_DIRECT_RECLAIM;

p = mempool_alloc(bs->bio_pool, gfp_mask);
diff --git a/block/blk-core.c b/block/blk-core.c
index 3cfd67d..2886a59b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2040,7 +2040,7 @@ end_io:
*/
blk_qc_t generic_make_request(struct bio *bio)
{
- struct bio_list bio_list_on_stack;
+ struct recursion_to_iteration_bio_lists bio_lists_on_stack;
blk_qc_t ret = BLK_QC_T_NONE;

if (!generic_make_request_checks(bio))
@@ -2049,15 +2049,20 @@ blk_qc_t generic_make_request(struct bio *bio)
/*
* We only want one ->make_request_fn to be active at a time, else
* stack usage with stacked devices could be a problem. So use
- * current->bio_list to keep a list of requests submited by a
- * make_request_fn function. current->bio_list is also used as a
+ * current->bio_lists to keep a list of requests submited by a
+ * make_request_fn function. current->bio_lists is also used as a
* flag to say if generic_make_request is currently active in this
* task or not. If it is NULL, then no make_request is active. If
* it is non-NULL, then a make_request is active, and new requests
- * should be added at the tail
+ * should be added at the tail of current->bio_lists->recursion;
+ * bios resulting from a call to blk_queue_split() from
+ * within ->make_request_fn() should be pushed back to the head of
+ * current->bio_lists->queue.
+ * After the current ->make_request_fn() returns, .recursion will be
+ * merged back to the head of .queue.
*/
- if (current->bio_list) {
- bio_list_add(current->bio_list, bio);
+ if (current->bio_lists) {
+ bio_list_add(&current->bio_lists->recursion, bio);
goto out;
}

@@ -2066,35 +2071,33 @@ blk_qc_t generic_make_request(struct bio *bio)
* Before entering the loop, bio->bi_next is NULL (as all callers
* ensure that) so we have a list with a single bio.
* We pretend that we have just taken it off a longer list, so
- * we assign bio_list to a pointer to the bio_list_on_stack,
- * thus initialising the bio_list of new bios to be
- * added. ->make_request() may indeed add some more bios
- * through a recursive call to generic_make_request. If it
- * did, we find a non-NULL value in bio_list and re-enter the loop
- * from the top. In this case we really did just take the bio
- * of the top of the list (no pretending) and so remove it from
- * bio_list, and call into ->make_request() again.
+ * we assign bio_list to a pointer to the bio_lists_on_stack,
+ * thus initialising the bio_lists of new bios to be added.
+ * ->make_request() may indeed add some more bios to .recursion
+ * through a recursive call to generic_make_request. If it did,
+ * we find a non-NULL value in .recursion, merge .recursion back to the
+ * head of .queue, and re-enter the loop from the top. In this case we
+ * really did just take the bio of the top of the list (no pretending)
+ * and so remove it from .queue, and call into ->make_request() again.
*/
BUG_ON(bio->bi_next);
- bio_list_init(&bio_list_on_stack);
- current->bio_list = &bio_list_on_stack;
+ bio_list_init(&bio_lists_on_stack.queue);
+ current->bio_lists = &bio_lists_on_stack;
do {
struct request_queue *q = bdev_get_queue(bio->bi_bdev);

if (likely(blk_queue_enter(q, false) == 0)) {
+ bio_list_init(&bio_lists_on_stack.recursion);
ret = q->make_request_fn(q, bio);
-
blk_queue_exit(q);
-
- bio = bio_list_pop(current->bio_list);
+ bio_list_merge_head(&bio_lists_on_stack.queue,
+ &bio_lists_on_stack.recursion);
} else {
- struct bio *bio_next = bio_list_pop(current->bio_list);
-
bio_io_error(bio);
- bio = bio_next;
}
+ bio = bio_list_pop(&current->bio_lists->queue);
} while (bio);
- current->bio_list = NULL; /* deactivate */
+ current->bio_lists = NULL; /* deactivate */

out:
return ret;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index c265348..df96327 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -172,6 +172,7 @@ void blk_queue_split(struct request_queue *q, struct bio **bio,
struct bio *split, *res;
unsigned nsegs;

+ BUG_ON(!current->bio_lists);
if (bio_op(*bio) == REQ_OP_DISCARD)
split = blk_bio_discard_split(q, *bio, bs, &nsegs);
else if (bio_op(*bio) == REQ_OP_WRITE_SAME)
@@ -190,7 +191,9 @@ void blk_queue_split(struct request_queue *q, struct bio **bio,

bio_chain(split, *bio);
trace_block_split(q, split, (*bio)->bi_iter.bi_sector);
- generic_make_request(*bio);
+ /* push back remainder, it may later be split further */
+ bio_list_add_head(&current->bio_lists->queue, *bio);
+ /* and fake submission of a suitably sized piece */
*bio = split;
}
}
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 76f7534..731ec3b 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -450,7 +450,7 @@ void __bch_btree_node_write(struct btree *b, struct closure *parent)

trace_bcache_btree_write(b);

- BUG_ON(current->bio_list);
+ BUG_ON(current->bio_lists);
BUG_ON(b->written >= btree_blocks(b));
BUG_ON(b->written && !i->keys);
BUG_ON(btree_bset_first(b)->seq != i->seq);
@@ -544,7 +544,7 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)

/* Force write if set is too big */
if (set_bytes(i) > PAGE_SIZE - 48 &&
- !current->bio_list)
+ !current->bio_lists)
bch_btree_node_write(b, NULL);
}

@@ -889,7 +889,7 @@ static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op,
{
struct btree *b;

- BUG_ON(current->bio_list);
+ BUG_ON(current->bio_lists);

lockdep_assert_held(&c->bucket_lock);

@@ -976,7 +976,7 @@ retry:
b = mca_find(c, k);

if (!b) {
- if (current->bio_list)
+ if (current->bio_lists)
return ERR_PTR(-EAGAIN);

mutex_lock(&c->bucket_lock);
@@ -2127,7 +2127,7 @@ static int bch_btree_insert_node(struct btree *b, struct btree_op *op,

return 0;
split:
- if (current->bio_list) {
+ if (current->bio_lists) {
op->lock = b->c->root->level + 1;
return -EAGAIN;
} else if (op->lock <= b->c->root->level) {
@@ -2209,7 +2209,7 @@ int bch_btree_insert(struct cache_set *c, struct keylist *keys,
struct btree_insert_op op;
int ret = 0;

- BUG_ON(current->bio_list);
+ BUG_ON(current->bio_lists);
BUG_ON(bch_keylist_empty(keys));

bch_btree_op_init(&op.op, 0);
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 6571c81..ba0c325 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -174,7 +174,7 @@ static inline int dm_bufio_cache_index(struct dm_bufio_client *c)
#define DM_BUFIO_CACHE(c) (dm_bufio_caches[dm_bufio_cache_index(c)])
#define DM_BUFIO_CACHE_NAME(c) (dm_bufio_cache_names[dm_bufio_cache_index(c)])

-#define dm_bufio_in_request() (!!current->bio_list)
+#define dm_bufio_in_request() (!!current->bio_lists)

static void dm_bufio_lock(struct dm_bufio_client *c)
{
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 10e53cd..38790e3 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -876,8 +876,7 @@ static sector_t wait_barrier(struct r1conf *conf, struct bio *bio)
(!conf->barrier ||
((conf->start_next_window <
conf->next_resync + RESYNC_SECTORS) &&
- current->bio_list &&
- !bio_list_empty(current->bio_list))),
+ current_has_pending_bios())),
conf->resync_lock);
conf->nr_waiting--;
}
@@ -1014,7 +1013,7 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
struct r1conf *conf = mddev->private;
struct bio *bio;

- if (from_schedule || current->bio_list) {
+ if (from_schedule || current->bio_lists) {
spin_lock_irq(&conf->device_lock);
bio_list_merge(&conf->pending_bio_list, &plug->pending);
conf->pending_count += plug->pending_cnt;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 245640b..13a5341 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -945,8 +945,7 @@ static void wait_barrier(struct r10conf *conf)
wait_event_lock_irq(conf->wait_barrier,
!conf->barrier ||
(conf->nr_pending &&
- current->bio_list &&
- !bio_list_empty(current->bio_list)),
+ current_has_pending_bios()),
conf->resync_lock);
conf->nr_waiting--;
}
@@ -1022,7 +1021,7 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
struct r10conf *conf = mddev->private;
struct bio *bio;

- if (from_schedule || current->bio_list) {
+ if (from_schedule || current->bio_lists) {
spin_lock_irq(&conf->device_lock);
bio_list_merge(&conf->pending_bio_list, &plug->pending);
conf->pending_count += plug->pending_cnt;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index b7e1a008..2f8a361 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -541,6 +541,24 @@ struct bio_list {
struct bio *tail;
};

+/* for generic_make_request() */
+struct recursion_to_iteration_bio_lists {
+ /* For stacking drivers submitting to their respective backend,
+ * bios are added to the tail of .recursion, which is re-initialized
+ * before each call to ->make_request_fn() and after that returns,
+ * the whole .recursion queue is then merged back to the head of .queue.
+ *
+ * The recursion-to-iteration logic in generic_make_request() will
+ * peel off of .queue.head, processing bios in deepest-level-first
+ * "natural" order. */
+ struct bio_list recursion;
+
+ /* This keeps a list of to-be-processed bios.
+ * The "remainder" part resulting from calling blk_queue_split()
+ * will be pushed back to its head. */
+ struct bio_list queue;
+};
+
static inline int bio_list_empty(const struct bio_list *bl)
{
return bl->head == NULL;
@@ -551,6 +569,13 @@ static inline void bio_list_init(struct bio_list *bl)
bl->head = bl->tail = NULL;
}

+static inline bool current_has_pending_bios(void)
+{
+ return current->bio_lists &&
+ (!bio_list_empty(&current->bio_lists->queue) ||
+ !bio_list_empty(&current->bio_lists->recursion));
+}
+
#define BIO_EMPTY_LIST { NULL, NULL }

#define bio_list_for_each(bio, bl) \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6e42ada..146eedc 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -128,7 +128,7 @@ struct sched_attr {

struct futex_pi_state;
struct robust_list_head;
-struct bio_list;
+struct recursion_to_iteration_bio_lists;
struct fs_struct;
struct perf_event_context;
struct blk_plug;
@@ -1727,7 +1727,7 @@ struct task_struct {
void *journal_info;

/* stacked block device info */
- struct bio_list *bio_list;
+ struct recursion_to_iteration_bio_lists *bio_lists;

#ifdef CONFIG_BLOCK
/* stack plugging */
--
1.9.1