[PATCH V4 05/15] blk-throttle: add downgrade logic

From: Shaohua Li
Date: Mon Nov 14 2016 - 17:22:42 EST


When the queue state machine is in the LIMIT_MAX state but a cgroup has
been below its high limit for some time, the queue should be downgraded
to a lower state, because that cgroup's high limit isn't being met.

Signed-off-by: Shaohua Li <shli@xxxxxx>
---
block/blk-throttle.c | 188 +++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 188 insertions(+)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 34a75e5..d177252 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -136,6 +136,13 @@ struct throtl_grp {
/* Number of bio's dispatched in current slice */
unsigned int io_disp[2];

+ unsigned long last_high_overflow_time[2];
+
+ uint64_t last_bytes_disp[2];
+ unsigned int last_io_disp[2];
+
+ unsigned long last_check_time;
+
/* When did we start a new slice */
unsigned long slice_start[2];
unsigned long slice_end[2];
@@ -155,6 +162,9 @@ struct throtl_data
struct work_struct dispatch_work;
unsigned int limit_index;
bool limit_valid[LIMIT_CNT];
+
+ unsigned long high_upgrade_time;
+ unsigned long high_downgrade_time;
};

static void throtl_pending_timer_fn(unsigned long arg);
@@ -896,6 +906,8 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
/* Charge the bio to the group */
tg->bytes_disp[rw] += bio->bi_iter.bi_size;
tg->io_disp[rw]++;
+ tg->last_bytes_disp[rw] += bio->bi_iter.bi_size;
+ tg->last_io_disp[rw]++;

/*
* REQ_THROTTLED is used to prevent the same bio to be throttled
@@ -1510,6 +1522,65 @@ static struct blkcg_policy blkcg_policy_throtl = {
.pd_free_fn = throtl_pd_free,
};

+/*
+ * Return the smaller of @tg's READ and WRITE last_high_overflow_time
+ * values, considering only a direction that has at least one of its
+ * HIGH/MAX bps/iops limits configured (i.e. different from -1).
+ *
+ * Returns 0 when the result would be the "nothing selected" sentinel
+ * (all bits set), which is what happens when neither direction has a
+ * limit configured.
+ */
+static unsigned long __tg_last_high_overflow_time(struct throtl_grp *tg)
+{
+	bool read_limited = tg->bps[READ][LIMIT_HIGH] != -1 ||
+			    tg->iops[READ][LIMIT_HIGH] != -1 ||
+			    tg->bps[READ][LIMIT_MAX] != -1 ||
+			    tg->iops[READ][LIMIT_MAX] != -1;
+	bool write_limited = tg->bps[WRITE][LIMIT_HIGH] != -1 ||
+			     tg->iops[WRITE][LIMIT_HIGH] != -1 ||
+			     tg->bps[WRITE][LIMIT_MAX] != -1 ||
+			     tg->iops[WRITE][LIMIT_MAX] != -1;
+	/* all-ones doubles as the "no direction contributed" marker */
+	unsigned long oldest = -1;
+
+	if (read_limited)
+		oldest = tg->last_high_overflow_time[READ];
+	if (write_limited && tg->last_high_overflow_time[WRITE] < oldest)
+		oldest = tg->last_high_overflow_time[WRITE];
+
+	if (oldest == -1)
+		return 0;
+	return oldest;
+}
+
+/*
+ * Return the most recent overflow timestamp that applies to @tg, taking
+ * its ancestors into account.
+ *
+ * Starts from @tg's own __tg_last_high_overflow_time() and climbs the
+ * parent chain, keeping the latest timestamp seen.  The climb stops at
+ * the first ancestor whose effective limit (its HIGH limit when that is
+ * set, otherwise its MAX limit) is greater than or equal to @tg's HIGH
+ * limit for all four of read/write bps and read/write iops; that
+ * ancestor's timestamp is not taken into account.
+ */
+static unsigned long tg_last_high_overflow_time(struct throtl_grp *tg)
+{
+	unsigned long latest = __tg_last_high_overflow_time(tg);
+	struct throtl_grp *parent;
+
+	for (parent = sq_to_tg(tg->service_queue.parent_sq); parent;
+	     parent = sq_to_tg(parent->service_queue.parent_sq)) {
+		unsigned long parent_time;
+		bool rbps_covers, wbps_covers, riops_covers, wiops_covers;
+
+		rbps_covers = parent->bps[READ][LIMIT_HIGH] != -1 ?
+			parent->bps[READ][LIMIT_HIGH] >=
+				tg->bps[READ][LIMIT_HIGH] :
+			parent->bps[READ][LIMIT_MAX] >=
+				tg->bps[READ][LIMIT_HIGH];
+		wbps_covers = parent->bps[WRITE][LIMIT_HIGH] != -1 ?
+			parent->bps[WRITE][LIMIT_HIGH] >=
+				tg->bps[WRITE][LIMIT_HIGH] :
+			parent->bps[WRITE][LIMIT_MAX] >=
+				tg->bps[WRITE][LIMIT_HIGH];
+		riops_covers = parent->iops[READ][LIMIT_HIGH] != -1 ?
+			parent->iops[READ][LIMIT_HIGH] >=
+				tg->iops[READ][LIMIT_HIGH] :
+			parent->iops[READ][LIMIT_MAX] >=
+				tg->iops[READ][LIMIT_HIGH];
+		wiops_covers = parent->iops[WRITE][LIMIT_HIGH] != -1 ?
+			parent->iops[WRITE][LIMIT_HIGH] >=
+				tg->iops[WRITE][LIMIT_HIGH] :
+			parent->iops[WRITE][LIMIT_MAX] >=
+				tg->iops[WRITE][LIMIT_HIGH];
+
+		/* this ancestor cannot constrain tg below its high limit */
+		if (rbps_covers && wbps_covers && riops_covers && wiops_covers)
+			break;
+
+		parent_time = __tg_last_high_overflow_time(parent);
+		if (time_after(parent_time, latest))
+			latest = parent_time;
+	}
+	return latest;
+}
+
static bool throtl_upgrade_check_one(struct throtl_grp *tg)
{
struct throtl_service_queue *sq = &tg->service_queue;
@@ -1557,6 +1628,9 @@ static bool throtl_can_upgrade(struct throtl_data *td,
if (td->limit_index != LIMIT_HIGH)
return false;

+ if (time_before(jiffies, td->high_downgrade_time + throtl_slice))
+ return false;
+
rcu_read_lock();
blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
struct throtl_grp *tg = blkg_to_tg(blkg);
@@ -1580,6 +1654,7 @@ static void throtl_upgrade_state(struct throtl_data *td)
struct blkcg_gq *blkg;

td->limit_index = LIMIT_MAX;
+ td->high_upgrade_time = jiffies;
rcu_read_lock();
blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
struct throtl_grp *tg = blkg_to_tg(blkg);
@@ -1595,6 +1670,111 @@ static void throtl_upgrade_state(struct throtl_data *td)
queue_work(kthrotld_workqueue, &td->dispatch_work);
}

+/*
+ * Switch @td to limit index @new and record the current jiffies as the
+ * moment of the downgrade (throtl_can_upgrade() reads that timestamp).
+ */
+static void throtl_downgrade_state(struct throtl_data *td, int new)
+{
+	td->high_downgrade_time = jiffies;
+	td->limit_index = new;
+}
+
+/*
+ * Decide whether @tg on its own permits a downgrade.
+ *
+ * Returns true only when at least throtl_slice has elapsed both since
+ * the queue was last upgraded and since the last overflow time reported
+ * by tg_last_high_overflow_time() for @tg.
+ */
+static bool throtl_downgrade_check_one(struct throtl_grp *tg)
+{
+	unsigned long now = jiffies;
+
+	/* too soon after the last upgrade */
+	if (time_before(now, tg->td->high_upgrade_time + throtl_slice))
+		return false;
+
+	/* the group must also have gone a full slice without overflowing */
+	return time_after_eq(now,
+			     tg_last_high_overflow_time(tg) + throtl_slice);
+}
+
+/*
+ * Return true when @tg and every ancestor visited on the way up pass
+ * throtl_downgrade_check_one().
+ *
+ * The walk ends when there is no parent group or, on the default
+ * hierarchy, when the next group's blkg has no parent; such a group is
+ * only evaluated if it is @tg itself.
+ *
+ * Each group is evaluated exactly once, and the NULL test is done
+ * before the pointer is used rather than after.
+ *
+ * NOTE(review): relies on sq_to_tg() returning NULL above the topmost
+ * group, the same assumption tg_last_high_overflow_time() makes with
+ * its "!parent" test.
+ */
+static bool throtl_downgrade_check_hierarchy(struct throtl_grp *tg)
+{
+	while (true) {
+		if (!throtl_downgrade_check_one(tg))
+			return false;
+
+		tg = sq_to_tg(tg->service_queue.parent_sq);
+		if (!tg || (cgroup_subsys_on_dfl(io_cgrp_subsys) &&
+			    !tg_to_blkg(tg)->parent))
+			break;
+	}
+	return true;
+}
+
+/*
+ * Sample @tg's dispatch rate since the previous sample and downgrade the
+ * queue from LIMIT_MAX to LIMIT_HIGH when the hierarchy allows it.
+ *
+ * Nothing happens unless all of the following hold:
+ *  - the queue is currently at LIMIT_MAX and LIMIT_HIGH is valid;
+ *  - @tg's blkcg has no child css;
+ *  - at least throtl_slice has passed since the previous sample;
+ *  - at least throtl_slice has passed since the last overflow time
+ *    reported by tg_last_high_overflow_time().
+ *
+ * For every direction that has a HIGH or MAX limit configured, the
+ * average bps/iops over the elapsed interval is computed; reaching
+ * either limit stamps last_high_overflow_time[] for that direction.
+ * The per-sample counters are cleared at the end.
+ */
+static void throtl_downgrade_check(struct throtl_grp *tg)
+{
+	uint64_t bps;
+	uint64_t iops;
+	unsigned long elapsed_time;
+	unsigned long now = jiffies;
+
+	if (tg->td->limit_index != LIMIT_MAX ||
+	    !tg->td->limit_valid[LIMIT_HIGH])
+		return;
+	if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
+		return;
+	if (time_after(tg->last_check_time + throtl_slice, now))
+		return;
+
+	if (time_before(now, tg_last_high_overflow_time(tg) + throtl_slice))
+		return;
+
+	/* the time_after() guard above keeps this at >= throtl_slice */
+	elapsed_time = now - tg->last_check_time;
+	tg->last_check_time = now;
+
+	if (tg->bps[READ][LIMIT_HIGH] != -1 ||
+	    tg->bps[READ][LIMIT_MAX] != -1) {
+		bps = tg->last_bytes_disp[READ] * HZ;
+		do_div(bps, elapsed_time);
+		if (bps >= tg->bps[READ][LIMIT_HIGH] ||
+		    bps >= tg->bps[READ][LIMIT_MAX])
+			tg->last_high_overflow_time[READ] = now;
+	}
+
+	if (tg->bps[WRITE][LIMIT_HIGH] != -1 ||
+	    tg->bps[WRITE][LIMIT_MAX] != -1) {
+		bps = tg->last_bytes_disp[WRITE] * HZ;
+		do_div(bps, elapsed_time);
+		if (bps >= tg->bps[WRITE][LIMIT_HIGH] ||
+		    bps >= tg->bps[WRITE][LIMIT_MAX])
+			tg->last_high_overflow_time[WRITE] = now;
+	}
+
+	/*
+	 * last_io_disp[] is a 32-bit counter that keeps growing while the
+	 * early returns above are taken.  Widen it before scaling by HZ so
+	 * the product cannot wrap and under-report the rate.
+	 */
+	if (tg->iops[READ][LIMIT_HIGH] != -1 ||
+	    tg->iops[READ][LIMIT_MAX] != -1) {
+		iops = (uint64_t)tg->last_io_disp[READ] * HZ;
+		do_div(iops, elapsed_time);
+		if (iops >= tg->iops[READ][LIMIT_HIGH] ||
+		    iops >= tg->iops[READ][LIMIT_MAX])
+			tg->last_high_overflow_time[READ] = now;
+	}
+
+	if (tg->iops[WRITE][LIMIT_HIGH] != -1 ||
+	    tg->iops[WRITE][LIMIT_MAX] != -1) {
+		iops = (uint64_t)tg->last_io_disp[WRITE] * HZ;
+		do_div(iops, elapsed_time);
+		if (iops >= tg->iops[WRITE][LIMIT_HIGH] ||
+		    iops >= tg->iops[WRITE][LIMIT_MAX])
+			tg->last_high_overflow_time[WRITE] = now;
+	}
+
+	/*
+	 * If the cgroup (and its ancestors) stayed below the high limit,
+	 * downgrade so that other cgroups get throttled again.
+	 */
+	if (throtl_downgrade_check_hierarchy(tg))
+		throtl_downgrade_state(tg->td, LIMIT_HIGH);
+
+	/* start a fresh sampling window */
+	tg->last_bytes_disp[READ] = 0;
+	tg->last_bytes_disp[WRITE] = 0;
+	tg->last_io_disp[READ] = 0;
+	tg->last_io_disp[WRITE] = 0;
+}
+
bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
struct bio *bio)
{
@@ -1619,12 +1799,16 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,

again:
while (true) {
+ if (tg->last_high_overflow_time[rw] == 0)
+ tg->last_high_overflow_time[rw] = jiffies;
+ throtl_downgrade_check(tg);
/* throtl is FIFO - if bios are already queued, should queue */
if (sq->nr_queued[rw])
break;

/* if above limits, break to queue */
if (!tg_may_dispatch(tg, bio, NULL)) {
+ tg->last_high_overflow_time[rw] = jiffies;
if (throtl_can_upgrade(tg->td, tg)) {
throtl_upgrade_state(tg->td);
goto again;
@@ -1668,6 +1852,8 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
tg->io_disp[rw], tg_iops_limit(tg, rw),
sq->nr_queued[READ], sq->nr_queued[WRITE]);

+ tg->last_high_overflow_time[rw] = jiffies;
+
bio_associate_current(bio);
tg->td->nr_queued[rw]++;
throtl_add_bio_tg(bio, qn, tg);
@@ -1778,6 +1964,8 @@ int blk_throtl_init(struct request_queue *q)

td->limit_valid[LIMIT_MAX] = true;
td->limit_index = LIMIT_MAX;
+ td->high_upgrade_time = jiffies;
+ td->high_downgrade_time = jiffies;
/* activate policy */
ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
if (ret)
--
2.9.3