[PATCH 8/8] writeback: throttle buffered writeback

From: Jens Axboe
Date: Wed Mar 23 2016 - 11:26:35 EST


Test patch that throttles buffered writeback to make it a lot
more smooth, and has way less impact on other system activity.
Background writeback should be, by definition, background
activity. The fact that we flush huge bundles of it at the time
means that it potentially has heavy impacts on foreground workloads,
which isn't ideal. We can't easily limit the sizes of writes that
we do, since that would impact file system layout in the presence
of delayed allocation. So just throttle back buffered writeback,
unless someone is waiting for it.

Would likely need a dynamic adaption to the current device, this
one has only been tested on NVMe. But it brings down background
activity impact from 1-2s to tens of milliseconds instead.

This is just a test patch, and as such, it registers a queue sysfs
entry to both monitor the current state:

$ cat /sys/block/nvme0n1/queue/wb_stats
limit=4, batch=2, inflight=0, wait=0, timer=0

'limit' denotes how many requests we will allow inflight for buffered
writeback, this settings can be tweaked through writing to the
'wb_depth' file. Writing '0' turns this off completely. 'inflight' shows
how many requests are currently inflight for buffered writeback, 'wait'
shows if anyone is currently waiting for access, and 'timer' shows
if we have processes being deferred in write back cache timeout.

Background buffered writeback will be throttled at depth 'wb_depth',
and even lower (QD=1) if the device recently completed "competing" IO.
If we are doing reclaim or otherwise sync buffered writeback, the limit
is increased 4x to achieve full device bandwidth.

Finally, if the device has write back caching, 'wb_cache_delay' delays
by this amount of usecs when a write completes before allowing more.

Signed-off-by: Jens Axboe <axboe@xxxxxx>
---
block/Makefile | 2 +-
block/blk-core.c | 15 ++++
block/blk-mq.c | 32 ++++++-
block/blk-sysfs.c | 84 ++++++++++++++++++
block/blk-wb.c | 219 ++++++++++++++++++++++++++++++++++++++++++++++
block/blk-wb.h | 27 ++++++
include/linux/blk_types.h | 2 +
include/linux/blkdev.h | 3 +
8 files changed, 381 insertions(+), 3 deletions(-)
create mode 100644 block/blk-wb.c
create mode 100644 block/blk-wb.h

diff --git a/block/Makefile b/block/Makefile
index 9eda2322b2d4..9df911a3b569 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -5,7 +5,7 @@
obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
- blk-lib.o blk-mq.o blk-mq-tag.o \
+ blk-lib.o blk-mq.o blk-mq-tag.o blk-wb.o \
blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \
genhd.o scsi_ioctl.o partition-generic.o ioprio.o \
badblocks.o partitions/
diff --git a/block/blk-core.c b/block/blk-core.c
index 827f8badd143..887a9e64c6ef 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -39,6 +39,7 @@

#include "blk.h"
#include "blk-mq.h"
+#include "blk-wb.h"

EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
@@ -848,6 +849,9 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
if (blk_init_rl(&q->root_rl, q, GFP_KERNEL))
goto fail;

+ if (blk_buffered_writeback_init(q))
+ goto fail;
+
INIT_WORK(&q->timeout_work, blk_timeout_work);
q->request_fn = rfn;
q->prep_rq_fn = NULL;
@@ -880,6 +884,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,

fail:
blk_free_flush_queue(q->fq);
+ blk_buffered_writeback_exit(q);
return NULL;
}
EXPORT_SYMBOL(blk_init_allocated_queue);
@@ -1485,6 +1490,8 @@ void __blk_put_request(struct request_queue *q, struct request *req)
/* this is a bio leak */
WARN_ON(req->bio != NULL);

+ blk_buffered_writeback_done(q->rq_wb, req);
+
/*
* Request may not have originated from ll_rw_blk. if not,
* it didn't come out of our reserved rq pools
@@ -1714,6 +1721,7 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
struct request *req;
unsigned int request_count = 0;
+ bool wb_acct;

/*
* low level driver can indicate that it wants pages above a
@@ -1766,6 +1774,8 @@ static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio)
}

get_rq:
+ wb_acct = blk_buffered_writeback_wait(q->rq_wb, bio, q->queue_lock);
+
/*
* This sync check and mask will be re-done in init_request_from_bio(),
* but we need to set it earlier to expose the sync flag to the
@@ -1781,11 +1791,16 @@ get_rq:
*/
req = get_request(q, rw_flags, bio, GFP_NOIO);
if (IS_ERR(req)) {
+ if (wb_acct)
+ __blk_buffered_writeback_done(q->rq_wb);
bio->bi_error = PTR_ERR(req);
bio_endio(bio);
goto out_unlock;
}

+ if (wb_acct)
+ req->cmd_flags |= REQ_BUF_INFLIGHT;
+
/*
* After dropping the lock and possibly sleeping here, our request
* may now be mergeable after it had proven unmergeable (above).
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 050f7a13021b..55aace97fd35 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -29,6 +29,7 @@
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
+#include "blk-wb.h"

static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);
@@ -274,6 +275,9 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,

if (rq->cmd_flags & REQ_MQ_INFLIGHT)
atomic_dec(&hctx->nr_active);
+
+ blk_buffered_writeback_done(q->rq_wb, rq);
+
rq->cmd_flags = 0;

clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
@@ -1253,6 +1257,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
struct blk_plug *plug;
struct request *same_queue_rq = NULL;
blk_qc_t cookie;
+ bool wb_acct;

blk_queue_bounce(q, &bio);

@@ -1270,9 +1275,17 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
} else
request_count = blk_plug_queued_count(q);

+ wb_acct = blk_buffered_writeback_wait(q->rq_wb, bio, NULL);
+
rq = blk_mq_map_request(q, bio, &data);
- if (unlikely(!rq))
+ if (unlikely(!rq)) {
+ if (wb_acct)
+ __blk_buffered_writeback_done(q->rq_wb);
return BLK_QC_T_NONE;
+ }
+
+ if (wb_acct)
+ rq->cmd_flags |= REQ_BUF_INFLIGHT;

cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);

@@ -1349,6 +1362,7 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
struct blk_map_ctx data;
struct request *rq;
blk_qc_t cookie;
+ bool wb_acct;

blk_queue_bounce(q, &bio);

@@ -1363,9 +1377,17 @@ static blk_qc_t blk_sq_make_request(struct request_queue *q, struct bio *bio)
blk_attempt_plug_merge(q, bio, &request_count, NULL))
return BLK_QC_T_NONE;

+ wb_acct = blk_buffered_writeback_wait(q->rq_wb, bio, NULL);
+
rq = blk_mq_map_request(q, bio, &data);
- if (unlikely(!rq))
+ if (unlikely(!rq)) {
+ if (wb_acct)
+ __blk_buffered_writeback_done(q->rq_wb);
return BLK_QC_T_NONE;
+ }
+
+ if (wb_acct)
+ rq->cmd_flags |= REQ_BUF_INFLIGHT;

cookie = blk_tag_to_qc_t(rq->tag, data.hctx->queue_num);

@@ -2018,6 +2040,9 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
/* mark the queue as mq asap */
q->mq_ops = set->ops;

+ if (blk_buffered_writeback_init(q))
+ return ERR_PTR(-ENOMEM);
+
q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
if (!q->queue_ctx)
return ERR_PTR(-ENOMEM);
@@ -2084,6 +2109,7 @@ err_map:
kfree(q->queue_hw_ctx);
err_percpu:
free_percpu(q->queue_ctx);
+ blk_buffered_writeback_exit(q);
return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(blk_mq_init_allocated_queue);
@@ -2096,6 +2122,8 @@ void blk_mq_free_queue(struct request_queue *q)
list_del_init(&q->all_q_node);
mutex_unlock(&all_q_mutex);

+ blk_buffered_writeback_exit(q);
+
blk_mq_del_queue_tag_set(q);

blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 954e510452d7..9ac9be23e700 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -13,6 +13,7 @@

#include "blk.h"
#include "blk-mq.h"
+#include "blk-wb.h"

struct queue_sysfs_entry {
struct attribute attr;
@@ -347,6 +348,71 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page,
return ret;
}

+static ssize_t queue_wb_stats_show(struct request_queue *q, char *page)
+{
+ struct rq_wb *wb = q->rq_wb;
+
+ if (!q->rq_wb)
+ return -EINVAL;
+
+ return sprintf(page, "limit=%d, batch=%d, inflight=%d, wait=%d, timer=%d\n",
+ wb->limit, wb->batch, atomic_read(&wb->inflight),
+ waitqueue_active(&wb->wait), timer_pending(&wb->timer));
+}
+
+static ssize_t queue_wb_depth_show(struct request_queue *q, char *page)
+{
+ if (!q->rq_wb)
+ return -EINVAL;
+
+ return queue_var_show(q->rq_wb->limit, page);
+}
+
+static ssize_t queue_wb_depth_store(struct request_queue *q, const char *page,
+ size_t count)
+{
+ unsigned long var;
+ ssize_t ret;
+
+ if (!q->rq_wb)
+ return -EINVAL;
+
+ ret = queue_var_store(&var, page, count);
+ if (ret < 0)
+ return ret;
+ if (var != (unsigned int) var)
+ return -EINVAL;
+
+ blk_update_wb_limit(q->rq_wb, var);
+ return ret;
+}
+
+static ssize_t queue_wb_cache_delay_show(struct request_queue *q, char *page)
+{
+ if (!q->rq_wb)
+ return -EINVAL;
+
+ return queue_var_show(q->rq_wb->cache_delay_usecs, page);
+}
+
+static ssize_t queue_wb_cache_delay_store(struct request_queue *q,
+ const char *page, size_t count)
+{
+ unsigned long var;
+ ssize_t ret;
+
+ if (!q->rq_wb)
+ return -EINVAL;
+
+ ret = queue_var_store(&var, page, count);
+ if (ret < 0)
+ return ret;
+
+ q->rq_wb->cache_delay_usecs = var;
+ q->rq_wb->cache_delay = usecs_to_jiffies(var);
+ return ret;
+}
+
static ssize_t queue_wc_show(struct request_queue *q, char *page)
{
if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
@@ -516,6 +582,21 @@ static struct queue_sysfs_entry queue_wc_entry = {
.store = queue_wc_store,
};

+static struct queue_sysfs_entry queue_wb_stats_entry = {
+ .attr = {.name = "wb_stats", .mode = S_IRUGO },
+ .show = queue_wb_stats_show,
+};
+static struct queue_sysfs_entry queue_wb_cache_delay_entry = {
+ .attr = {.name = "wb_cache_usecs", .mode = S_IRUGO | S_IWUSR },
+ .show = queue_wb_cache_delay_show,
+ .store = queue_wb_cache_delay_store,
+};
+static struct queue_sysfs_entry queue_wb_depth_entry = {
+ .attr = {.name = "wb_depth", .mode = S_IRUGO | S_IWUSR },
+ .show = queue_wb_depth_show,
+ .store = queue_wb_depth_store,
+};
+
static struct attribute *default_attrs[] = {
&queue_requests_entry.attr,
&queue_ra_entry.attr,
@@ -542,6 +623,9 @@ static struct attribute *default_attrs[] = {
&queue_random_entry.attr,
&queue_poll_entry.attr,
&queue_wc_entry.attr,
+ &queue_wb_stats_entry.attr,
+ &queue_wb_cache_delay_entry.attr,
+ &queue_wb_depth_entry.attr,
NULL,
};

diff --git a/block/blk-wb.c b/block/blk-wb.c
new file mode 100644
index 000000000000..2aa3753a8e1e
--- /dev/null
+++ b/block/blk-wb.c
@@ -0,0 +1,219 @@
+/*
+ * buffered writeback throttling
+ *
+ * Copyright (C) 2016 Jens Axboe
+ *
+ * Things that need changing:
+ *
+ * - Auto-detection of most of this, no tunables. Cache type we can get,
+ * and most other settings we can tweak/gather based on time.
+ * - Better solution for rwb->bdp_wait?
+ * - Higher depth for WB_SYNC_ALL?
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+
+#include "blk.h"
+#include "blk-wb.h"
+
+void __blk_buffered_writeback_done(struct rq_wb *rwb)
+{
+ int inflight;
+
+ inflight = atomic_dec_return(&rwb->inflight);
+ if (inflight >= rwb->limit)
+ return;
+
+ /*
+ * If the device does caching, we can still flood it with IO
+ * even at a low depth. If caching is on, delay a bit before
+ * submitting the next, if we're still purely background
+ * activity.
+ */
+ if (test_bit(QUEUE_FLAG_WC, &rwb->q->queue_flags) && !*rwb->bdp_wait &&
+ time_before(jiffies, rwb->last_comp + rwb->cache_delay)) {
+ if (!timer_pending(&rwb->timer))
+ mod_timer(&rwb->timer, jiffies + rwb->cache_delay);
+ return;
+ }
+
+ if (waitqueue_active(&rwb->wait)) {
+ int diff = rwb->limit - inflight;
+
+ if (diff >= rwb->batch)
+ wake_up_nr(&rwb->wait, 1);
+ }
+}
+
+/*
+ * Called on completion of a request. Note that it's also called when
+ * a request is merged, when the request gets freed.
+ */
+void blk_buffered_writeback_done(struct rq_wb *rwb, struct request *rq)
+{
+ if (!(rq->cmd_flags & REQ_BUF_INFLIGHT)) {
+ const unsigned long cur = jiffies;
+
+ if (rwb->limit && cur != rwb->last_comp)
+ rwb->last_comp = cur;
+ } else
+ __blk_buffered_writeback_done(rwb);
+}
+
+/*
+ * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
+ * false if 'v' + 1 would be bigger than 'below'.
+ */
+static bool atomic_inc_below(atomic_t *v, int below)
+{
+ int cur = atomic_read(v);
+
+ for (;;) {
+ int old;
+
+ if (cur >= below)
+ return false;
+ old = atomic_cmpxchg(v, cur, cur + 1);
+ if (old == cur)
+ break;
+ cur = old;
+ }
+
+ return true;
+}
+
+/*
+ * Block if we will exceed our limit, or if we are currently waiting for
+ * the timer to kick off queuing again.
+ */
+static void __blk_buffered_writeback_wait(struct rq_wb *rwb, unsigned int limit,
+ spinlock_t *lock)
+{
+ DEFINE_WAIT(wait);
+
+ if (!timer_pending(&rwb->timer) &&
+ atomic_inc_below(&rwb->inflight, limit))
+ return;
+
+ do {
+ prepare_to_wait_exclusive(&rwb->wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+
+ if (!timer_pending(&rwb->timer) &&
+ atomic_inc_below(&rwb->inflight, limit))
+ break;
+
+ if (lock)
+ spin_unlock_irq(lock);
+
+ io_schedule();
+
+ if (lock)
+ spin_lock_irq(lock);
+ } while (1);
+
+ finish_wait(&rwb->wait, &wait);
+}
+
+/*
+ * Returns true if the IO request should be accounted, false if not.
+ * May sleep, if we have exceeded the writeback limits. Caller can pass
+ * in an irq held spinlock, if it holds one when calling this function.
+ * If we do sleep, we'll release and re-grab it.
+ */
+bool blk_buffered_writeback_wait(struct rq_wb *rwb, struct bio *bio,
+ spinlock_t *lock)
+{
+ unsigned int limit;
+
+ /*
+ * If disabled, or not a WRITE (or a discard), do nothing
+ */
+ if (!rwb->limit || !(bio->bi_rw & REQ_WRITE) ||
+ (bio->bi_rw & REQ_DISCARD))
+ return false;
+
+ /*
+ * Don't throttle WRITE_ODIRECT
+ */
+ if ((bio->bi_rw & (REQ_SYNC | REQ_NOIDLE)) == REQ_SYNC)
+ return false;
+
+ /*
+ * At this point we know it's a buffered write. If REQ_SYNC is
+ * set, then it's WB_SYNC_ALL writeback. Bump the limit 4x for
+ * those, since someone is (or will be) waiting on that.
+ */
+ limit = rwb->limit;
+ if (bio->bi_rw & REQ_SYNC)
+ limit <<= 2;
+ else if (limit != 1) {
+ /*
+ * If less than 100ms since we completed unrelated IO,
+ * limit us to a depth of 1 for background writeback.
+ */
+ if (time_before(jiffies, rwb->last_comp + HZ / 10))
+ limit = 1;
+ else if (!*rwb->bdp_wait)
+ limit >>= 1;
+ }
+
+ __blk_buffered_writeback_wait(rwb, limit, lock);
+ return true;
+}
+
+void blk_update_wb_limit(struct rq_wb *rwb, unsigned int limit)
+{
+ rwb->limit = limit;
+ rwb->batch = rwb->limit / 2;
+ if (!rwb->batch && rwb->limit)
+ rwb->batch = 1;
+ else if (rwb->batch > 4)
+ rwb->batch = 4;
+
+ wake_up_all(&rwb->wait);
+}
+
+static void blk_buffered_writeback_timer(unsigned long data)
+{
+ struct rq_wb *rwb = (struct rq_wb *) data;
+
+ if (waitqueue_active(&rwb->wait))
+ wake_up_nr(&rwb->wait, 1);
+}
+
+#define DEF_WB_LIMIT 4
+#define DEF_WB_CACHE_DELAY 10000
+
+int blk_buffered_writeback_init(struct request_queue *q)
+{
+ struct rq_wb *rwb;
+
+ rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
+ if (!rwb)
+ return -ENOMEM;
+
+ atomic_set(&rwb->inflight, 0);
+ init_waitqueue_head(&rwb->wait);
+ rwb->last_comp = jiffies;
+ rwb->bdp_wait = &q->backing_dev_info.wb.dirty_sleeping;
+ setup_timer(&rwb->timer, blk_buffered_writeback_timer,
+ (unsigned long) rwb);
+ rwb->cache_delay_usecs = DEF_WB_CACHE_DELAY;
+ rwb->cache_delay = usecs_to_jiffies(rwb->cache_delay);
+ rwb->q = q;
+ blk_update_wb_limit(rwb, DEF_WB_LIMIT);
+ q->rq_wb = rwb;
+ return 0;
+}
+
+void blk_buffered_writeback_exit(struct request_queue *q)
+{
+ if (q->rq_wb)
+ del_timer_sync(&q->rq_wb->timer);
+
+ kfree(q->rq_wb);
+ q->rq_wb = NULL;
+}
diff --git a/block/blk-wb.h b/block/blk-wb.h
new file mode 100644
index 000000000000..f3b4cd139815
--- /dev/null
+++ b/block/blk-wb.h
@@ -0,0 +1,27 @@
+#ifndef BLK_WB_H
+#define BLK_WB_H
+
+#include <linux/atomic.h>
+#include <linux/wait.h>
+
+struct rq_wb {
+ unsigned int limit;
+ unsigned int batch;
+ unsigned int cache_delay;
+ unsigned int cache_delay_usecs;
+ unsigned long last_comp;
+ unsigned int *bdp_wait;
+ struct request_queue *q;
+ atomic_t inflight;
+ wait_queue_head_t wait;
+ struct timer_list timer;
+};
+
+void __blk_buffered_writeback_done(struct rq_wb *);
+void blk_buffered_writeback_done(struct rq_wb *, struct request *);
+bool blk_buffered_writeback_wait(struct rq_wb *, struct bio *, spinlock_t *);
+int blk_buffered_writeback_init(struct request_queue *);
+void blk_buffered_writeback_exit(struct request_queue *);
+void blk_update_wb_limit(struct rq_wb *, unsigned int);
+
+#endif
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 86a38ea1823f..6f2a174b771c 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -188,6 +188,7 @@ enum rq_flag_bits {
__REQ_PM, /* runtime pm request */
__REQ_HASHED, /* on IO scheduler merge hash */
__REQ_MQ_INFLIGHT, /* track inflight for MQ */
+ __REQ_BUF_INFLIGHT, /* track inflight for buffered */
__REQ_NR_BITS, /* stops here */
};

@@ -241,6 +242,7 @@ enum rq_flag_bits {
#define REQ_PM (1ULL << __REQ_PM)
#define REQ_HASHED (1ULL << __REQ_HASHED)
#define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT)
+#define REQ_BUF_INFLIGHT (1ULL << __REQ_BUF_INFLIGHT)

typedef unsigned int blk_qc_t;
#define BLK_QC_T_NONE -1U
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 76e875159e52..8586685bf7b2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -37,6 +37,7 @@ struct bsg_job;
struct blkcg_gq;
struct blk_flush_queue;
struct pr_ops;
+struct rq_wb;

#define BLKDEV_MIN_RQ 4
#define BLKDEV_MAX_RQ 128 /* Default maximum */
@@ -290,6 +291,8 @@ struct request_queue {
int nr_rqs[2]; /* # allocated [a]sync rqs */
int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */

+ struct rq_wb *rq_wb;
+
/*
* If blkcg is not used, @q->root_rl serves all requests. If blkcg
* is used, root blkg allocates from @q->root_rl and all other
--
2.4.1.168.g1ea28e1