[PATCH 10/10] blkcg: implement interface for the unified hierarchy

From: Tejun Heo
Date: Fri Jul 24 2015 - 14:44:18 EST


The blkcg interface grew to be the biggest of all controllers and
unfortunately the most inconsistent too. The interface files are
inconsistent with a number of close duplicates. Some files have
recursive variants while others don't. There's a distinction between
normal and leaf weights which isn't intuitive and there are a lot of
stat knobs which don't make much sense outside of debugging and expose
too many implementation details to userland.

In the unified hierarchy, everything is always hierarchical and
internal nodes can't have tasks, rendering moot the two structural
issues twisting the current interface. The interface has to be updated
in a significant way anyway and this is a good chance to revamp it as a
whole. This patch implements the blkcg interface for the unified
hierarchy.

* (from a previous patch) blkcg is identified by "io" instead of
"blkio" on the unified hierarchy. Given that the whole interface is
updated anyway, the rename shouldn't carry noticeable conversion
overhead.

* The original interface, which consisted of 27 files, is replaced
with the following three files.

io.stat : per-blkcg stats
io.weight : per-cgroup and per-cgroup-queue weight settings
io.max : per-cgroup-queue bps and iops max limits

Documentation/cgroups/unified-hierarchy.txt updated accordingly.

Signed-off-by: Tejun Heo <tj@xxxxxxxxxx>
---
Documentation/cgroups/unified-hierarchy.txt | 57 +++++++++++++-
block/blk-cgroup.c | 51 +++++++++++++
block/blk-throttle.c | 112 ++++++++++++++++++++++++++++
block/cfq-iosched.c | 61 +++++++++++++--
include/linux/blk-cgroup.h | 1 +
5 files changed, 275 insertions(+), 7 deletions(-)

diff --git a/Documentation/cgroups/unified-hierarchy.txt b/Documentation/cgroups/unified-hierarchy.txt
index 86847a7..4e23d4c 100644
--- a/Documentation/cgroups/unified-hierarchy.txt
+++ b/Documentation/cgroups/unified-hierarchy.txt
@@ -374,9 +374,62 @@ supported and the interface files "release_agent" and

5-3. Per-Controller Changes

-5-3-1. blkio
+5-3-1. io

-- blk-throttle becomes properly hierarchical.
+- blkio is renamed to io. The interface is overhauled anyway. The
+ new name is more in line with the other two major controllers, cpu
+ and memory, and better suited given that it may be used for cgroup
+ writeback without involving block layer.
+
+- Everything including stat is always hierarchical making separate
+ recursive stat files pointless and, as no internal node can have
+ tasks, leaf weights are meaningless. The operation model is
+ simplified and the interface is overhauled accordingly.
+
+ io.stat
+
+ The stat file. The reported stats are from the point where
+ bio's are issued to request_queue. The stats are counted
+ independent of which policies are enabled. Each line in the
+ file follows the following format. More fields may later be
+ added at the end.
+
+ $MAJ:$MIN rbytes=$RBYTES wbytes=$WBYTES rios=$RIOS wios=$WIOS
+
+ io.weight
+
+ The weight setting, currently only available and effective if
+ cfq-iosched is in use for the target device. The weight is
+ between 10 and 1000 and defaults to 500. The first line
+ always contains the default weight in the following format to
+ use when per-device setting is missing.
+
+ default $WEIGHT
+
+ Subsequent lines list per-device weights of the following
+ format.
+
+ $MAJ:$MIN $WEIGHT
+
+ Writing "$WEIGHT" or "default $WEIGHT" changes the default
+ setting. Writing "$MAJ:$MIN $WEIGHT" sets per-device weight
+ while "$MAJ:$MIN default" clears it.
+
+ This file is available only on non-root cgroups.
+
+ io.max
+
+ The maximum bandwidth and/or iops setting, only available if
+ blk-throttle is enabled. The file is of the following format.
+
+ $MAJ:$MIN rbps=$RBPS wbps=$WBPS riops=$RIOPS wiops=$WIOPS
+
+ ${R|W}BPS are read/write bytes per second and ${R|W}IOPS are
+ read/write IOs per second. "max" indicates no limit. Writing
+ to the file follows the same format but the individual
+ settings may be omitted or specified in any order.
+
+ This file is available only on non-root cgroups.


5-3-2. cpuset
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index b9d511b..b97a075 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -854,6 +854,53 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx)
}
EXPORT_SYMBOL_GPL(blkg_conf_finish);

+static int blkcg_print_stat(struct seq_file *sf, void *v)
+{ /* seq_show handler for the stat file: one line of byte/IO counters per device */
+ struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+ struct blkcg_gq *blkg;
+
+ rcu_read_lock(); /* held across the RCU walk of blkcg->blkg_list */
+
+ hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+ const char *dname;
+ struct blkg_rwstat rwstat;
+ u64 rbytes, wbytes, rios, wios;
+
+ dname = blkg_dev_name(blkg);
+ if (!dname)
+ continue; /* no device name to report this blkg under */
+
+ spin_lock_irq(blkg->q->queue_lock); /* both sums below are taken under the queue lock */
+
+ rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
+ offsetof(struct blkcg_gq, stat_bytes)); /* byte counters */
+ rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
+ wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
+
+ rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
+ offsetof(struct blkcg_gq, stat_ios)); /* IO counters */
+ rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
+ wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
+
+ spin_unlock_irq(blkg->q->queue_lock);
+
+ if (rbytes || wbytes || rios || wios) /* devices with all-zero counters are left out */
+ seq_printf(sf, "%s rbytes=%llu wbytes=%llu rios=%llu wios=%llu\n",
+ dname, rbytes, wbytes, rios, wios);
+ }
+
+ rcu_read_unlock();
+ return 0; /* always reports success */
+}
+
+struct cftype blkcg_files[] = { /* installed as io_cgrp_subsys.dfl_cftypes */
+ {
+ .name = "stat", /* described as io.stat in unified-hierarchy.txt */
+ .seq_show = blkcg_print_stat,
+ },
+ { } /* terminate */
+};
+
struct cftype blkcg_legacy_files[] = {
{
.name = "reset_stats",
@@ -1100,6 +1147,7 @@ struct cgroup_subsys io_cgrp_subsys = {
.css_offline = blkcg_css_offline,
.css_free = blkcg_css_free,
.can_attach = blkcg_can_attach,
+ .dfl_cftypes = blkcg_files,
.legacy_cftypes = blkcg_legacy_files,
.legacy_name = "blkio",
#ifdef CONFIG_MEMCG
@@ -1271,6 +1319,9 @@ int blkcg_policy_register(struct blkcg_policy *pol)
mutex_unlock(&blkcg_pol_mutex);

/* everything is in place, add intf files for the new policy */
+ if (pol->dfl_cftypes)
+ WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys,
+ pol->dfl_cftypes));
if (pol->legacy_cftypes)
WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys,
pol->legacy_cftypes));
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index a8bb2fd..c75a263 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1265,6 +1265,117 @@ static struct cftype throtl_legacy_files[] = {
{ } /* terminate */
};

+static u64 tg_prfill_max(struct seq_file *sf, struct blkg_policy_data *pd,
+ int off) /* off is not used in this function */
+{ /* prints the bps/iops limits of pd's throtl_grp as a single line; always returns 0 */
+ struct throtl_grp *tg = pd_to_tg(pd);
+ const char *dname = blkg_dev_name(pd->blkg);
+ char bufs[4][21] = { "max", "max", "max", "max" }; /* each fits a 20-digit %llu value plus NUL */
+
+ if (!dname)
+ return 0;
+ if (tg->bps[READ] == -1 && tg->bps[WRITE] == -1 &&
+ tg->iops[READ] == -1 && tg->iops[WRITE] == -1)
+ return 0; /* all four values are -1: print no line for this device */
+
+ if (tg->bps[READ] != -1) /* a value of -1 keeps the preset "max" text */
+ snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps[READ]);
+ if (tg->bps[WRITE] != -1)
+ snprintf(bufs[1], sizeof(bufs[1]), "%llu", tg->bps[WRITE]);
+ if (tg->iops[READ] != -1)
+ snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops[READ]);
+ if (tg->iops[WRITE] != -1)
+ snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops[WRITE]);
+
+ seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s\n",
+ dname, bufs[0], bufs[1], bufs[2], bufs[3]);
+ return 0;
+}
+
+static int tg_print_max(struct seq_file *sf, void *v)
+{ /* seq_show handler of the max file: hands tg_prfill_max to blkcg_print_blkgs() */
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_max,
+ &blkcg_policy_throtl, seq_cft(sf)->private, false);
+ return 0;
+}
+
+static ssize_t tg_set_max(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{ /* write handler of the max file: parses key=value tokens and updates the throtl_grp limits */
+ struct blkcg *blkcg = css_to_blkcg(of_css(of));
+ struct blkg_conf_ctx ctx;
+ struct throtl_grp *tg;
+ u64 v[4]; /* rbps, wbps, riops, wiops in that order */
+ int ret;
+
+ ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
+ if (ret)
+ return ret;
+
+ tg = blkg_to_tg(ctx.blkg);
+
+ v[0] = tg->bps[READ]; /* start from the current values so keys not written stay unchanged */
+ v[1] = tg->bps[WRITE];
+ v[2] = tg->iops[READ];
+ v[3] = tg->iops[WRITE];
+
+ while (true) {
+ char tok[27]; /* wiops=18446744073709551615 */
+ char *p;
+ u64 val = -1; /* left at -1 when the value text is max; -1 is printed back as max */
+ int len;
+
+ if (sscanf(ctx.body, "%26s%n", tok, &len) != 1)
+ break; /* no further token */
+ if (tok[0] == '\0')
+ break;
+ ctx.body += len;
+
+ ret = -EINVAL;
+ p = tok;
+ strsep(&p, "="); /* tok now holds the key, p the value text (NULL when there is no '=') */
+ if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max")))
+ goto out_finish;
+
+ ret = -ERANGE;
+ if (!val)
+ goto out_finish; /* a value of zero is rejected */
+
+ ret = -EINVAL; /* returned by the unknown-key branch below */
+ if (!strcmp(tok, "rbps"))
+ v[0] = val;
+ else if (!strcmp(tok, "wbps"))
+ v[1] = val;
+ else if (!strcmp(tok, "riops"))
+ v[2] = min_t(u64, val, UINT_MAX); /* clamped to UINT_MAX */
+ else if (!strcmp(tok, "wiops"))
+ v[3] = min_t(u64, val, UINT_MAX);
+ else
+ goto out_finish;
+ }
+
+ tg->bps[READ] = v[0]; /* reached only when every token parsed without error */
+ tg->bps[WRITE] = v[1];
+ tg->iops[READ] = v[2];
+ tg->iops[WRITE] = v[3];
+
+ tg_conf_updated(tg);
+ ret = 0;
+out_finish:
+ blkg_conf_finish(&ctx);
+ return ret ?: nbytes; /* the error code, or the full write length on success */
+}
+
+static struct cftype throtl_files[] = { /* hooked up as blkcg_policy_throtl.dfl_cftypes */
+ {
+ .name = "max", /* described as io.max in unified-hierarchy.txt */
+ .flags = CFTYPE_NOT_ON_ROOT, /* io.max is documented as available only on non-root cgroups */
+ .seq_show = tg_print_max,
+ .write = tg_set_max,
+ },
+ { } /* terminate */
+};
+
static void throtl_shutdown_wq(struct request_queue *q)
{
struct throtl_data *td = q->td;
@@ -1273,6 +1384,7 @@ static void throtl_shutdown_wq(struct request_queue *q)
}

static struct blkcg_policy blkcg_policy_throtl = {
+ .dfl_cftypes = throtl_files,
.legacy_cftypes = throtl_legacy_files,

.pd_alloc_fn = throtl_pd_alloc,
diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 7a72301..97da571 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1740,7 +1740,7 @@ static int cfq_print_leaf_weight(struct seq_file *sf, void *v)

static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off,
- bool is_leaf_weight)
+ bool on_dfl, bool is_leaf_weight)
{
struct blkcg *blkcg = css_to_blkcg(of_css(of));
struct blkg_conf_ctx ctx;
@@ -1753,9 +1753,17 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
if (ret)
return ret;

- ret = -EINVAL;
- if (sscanf(ctx.body, "%llu", &v) != 1)
+ if (sscanf(ctx.body, "%llu", &v) == 1) {
+ /* require "default" on dfl */
+ ret = -ERANGE;
+ if (!v && on_dfl)
+ goto out_finish;
+ } else if (!strcmp(strim(ctx.body), "default")) {
+ v = 0;
+ } else {
+ ret = -EINVAL;
goto out_finish;
+ }

cfqg = blkg_to_cfqg(ctx.blkg);
cfqgd = blkcg_to_cfqgd(blkcg);
@@ -1779,13 +1787,13 @@ static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of,
static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
- return __cfqg_set_weight_device(of, buf, nbytes, off, false);
+ return __cfqg_set_weight_device(of, buf, nbytes, off, false, false);
}

static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
- return __cfqg_set_weight_device(of, buf, nbytes, off, true);
+ return __cfqg_set_weight_device(of, buf, nbytes, off, false, true);
}

static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val,
@@ -2103,6 +2111,48 @@ static struct cftype cfq_blkcg_legacy_files[] = {
#endif /* CONFIG_DEBUG_BLK_CGROUP */
{ } /* terminate */
};
+
+static int cfq_print_weight_on_dfl(struct seq_file *sf, void *v)
+{ /* seq_show handler of the weight file listed in cfq_blkcg_files */
+ struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+ struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg);
+
+ seq_printf(sf, "default %u\n", cgd->weight); /* first line: the blkcg-wide default weight */
+ blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device,
+ &blkcg_policy_cfq, 0, false); /* followed by whatever cfqg_prfill_weight_device emits */
+ return 0;
+}
+
+static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{ /* write handler of the weight file: sets the default weight or a per-device weight */
+ char *endp;
+ int ret;
+ u64 v;
+
+ buf = strim(buf);
+
+ /*
+ * "WEIGHT" or "default WEIGHT" sets the default weight.
+ * NOTE(review): an empty write also satisfies *endp == '\0' with v == 0;
+ * presumably __cfq_set_weight() rejects that value - confirm.
+ */
+ v = simple_strtoull(buf, &endp, 0);
+ if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) {
+ ret = __cfq_set_weight(of_css(of), v, false);
+ return ret ?: nbytes; /* the error code, or the full write length on success */
+ }
+
+ /* "MAJ:MIN WEIGHT" or "MAJ:MIN default"; passes on_dfl=true, is_leaf_weight=false */
+ return __cfqg_set_weight_device(of, buf, nbytes, off, true, false);
+}
+
+static struct cftype cfq_blkcg_files[] = { /* hooked up as blkcg_policy_cfq.dfl_cftypes */
+ {
+ .name = "weight", /* described as io.weight in unified-hierarchy.txt */
+ .flags = CFTYPE_NOT_ON_ROOT, /* io.weight is documented as available only on non-root cgroups */
+ .seq_show = cfq_print_weight_on_dfl,
+ .write = cfq_set_weight_on_dfl,
+ },
+ { } /* terminate */
+};
+
#else /* GROUP_IOSCHED */
static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd,
struct blkcg *blkcg)
@@ -4659,6 +4709,7 @@ static struct elevator_type iosched_cfq = {

#ifdef CONFIG_CFQ_GROUP_IOSCHED
static struct blkcg_policy blkcg_policy_cfq = {
+ .dfl_cftypes = cfq_blkcg_files,
.legacy_cftypes = cfq_blkcg_legacy_files,

.cpd_alloc_fn = cfq_cpd_alloc,
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index b270aef..9a7c4bd 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -148,6 +148,7 @@ typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd);
struct blkcg_policy {
int plid;
/* cgroup files for the policy */
+ struct cftype *dfl_cftypes;
struct cftype *legacy_cftypes;

/* operations */
--
2.4.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/