Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling anduse_hierarchy interface
From: Justin TerAvest
Date: Wed Feb 16 2011 - 19:32:04 EST
After a quick read,
It's sad that we have to have so many use_hierarchy checks; it seems
like we're asking for bugs, especially in the future when one codepath
gets updated but not the other.
CodingStyle says we should only have one declaration per line.
I feel like there is an implicit assumption that groups and tasks
should not be children of the same parent; that is, a group should
contain only groups, or only tasks, but I don't see this enforced;
there's just and assumption that BE:SYNC is "good enough" for that
comparison. This smells like something that will be tweaked/tuned for
fairness later. :( Why don't we just prevent this from happening?
The clean_up label in chain_alloc() is strange; I don't think the goto
is necessary at all. I found that method generally hard to understand.
It's doing a lot.
It's possible that some of these can't be worked around.
On Wed, Feb 9, 2011 at 11:47 PM, Gui Jianfeng
<guijianfeng@xxxxxxxxxxxxxx> wrote:
> CFQ group hierarchical scheduling and use_hierarchy interface.
>
> Signed-off-by: Gui Jianfeng <guijianfeng@xxxxxxxxxxxxxx>
> ---
> block/blk-cgroup.c | 61 +++++-
> block/blk-cgroup.h | 3 +
> block/cfq-iosched.c | 603 +++++++++++++++++++++++++++++++++++++--------------
> 3 files changed, 500 insertions(+), 167 deletions(-)
>
> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
> index 455768a..c55fecd 100644
> --- a/block/blk-cgroup.c
> +++ b/block/blk-cgroup.c
> @@ -25,7 +25,10 @@
> static DEFINE_SPINLOCK(blkio_list_lock);
> static LIST_HEAD(blkio_list);
>
> -struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
> +struct blkio_cgroup blkio_root_cgroup = {
> + .weight = 2*BLKIO_WEIGHT_DEFAULT,
> + .use_hierarchy = 0
> +};
> EXPORT_SYMBOL_GPL(blkio_root_cgroup);
>
> static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
> @@ -454,6 +457,7 @@ static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
> blkg->blkcg_id = 0;
> }
>
> +
> /*
> * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
> * indicating that blk_group was unhashed by the time we got to it.
> @@ -765,6 +769,12 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
> }
> EXPORT_SYMBOL_GPL(blkcg_get_weight);
>
> +unsigned int blkcg_get_use_hierarchy(struct blkio_cgroup *blkcg)
> +{
> + return blkcg->use_hierarchy;
> +}
> +EXPORT_SYMBOL_GPL(blkcg_get_use_hierarchy);
> +
> uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
> {
> struct blkio_policy_node *pn;
> @@ -1202,6 +1212,8 @@ static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
> switch(name) {
> case BLKIO_PROP_weight:
> return (u64)blkcg->weight;
> + case BLKIO_PROP_use_hierarchy:
> + return (u64)blkcg->use_hierarchy;
> }
> break;
> default:
> @@ -1210,6 +1222,36 @@ static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
> return 0;
> }
>
> +static int blkio_use_hierarchy_write(struct cgroup *cgrp, u64 val)
> +{
> + struct cgroup *parent = cgrp->parent;
> + struct blkio_cgroup *blkcg, *parent_blkcg = NULL;
> + int ret = 0;
> +
> + if (val != 0 && val != 1)
> + return -EINVAL;
> +
> + blkcg = cgroup_to_blkio_cgroup(cgrp);
> + if (parent)
> + parent_blkcg = cgroup_to_blkio_cgroup(parent);
> +
> + cgroup_lock();
> + /*
> + * If parent's use_hierarchy is set, we can't make any modifications
> + * in the child subtrees. If it is unset, then the change can occur,
> + * provided the current cgroup has no children.
> + */
> + if (!parent_blkcg || !parent_blkcg->use_hierarchy) {
> + if (list_empty(&cgrp->children))
> + blkcg->use_hierarchy = val;
> + else
> + ret = -EBUSY;
> + } else
> + ret = -EINVAL;
> + cgroup_unlock();
> + return ret;
> +}
> +
> static int
> blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
> {
> @@ -1224,6 +1266,8 @@ blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
> switch(name) {
> case BLKIO_PROP_weight:
> return blkio_weight_write(blkcg, val);
> + case BLKIO_PROP_use_hierarchy:
> + return blkio_use_hierarchy_write(cgrp, val);
> }
> break;
> default:
> @@ -1301,6 +1345,13 @@ struct cftype blkio_files[] = {
> .name = "reset_stats",
> .write_u64 = blkiocg_reset_stats,
> },
> + {
> + .name = "use_hierarchy",
> + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
> + BLKIO_PROP_use_hierarchy),
> + .read_u64 = blkiocg_file_read_u64,
> + .write_u64 = blkiocg_file_write_u64,
> + },
> #ifdef CONFIG_BLK_DEV_THROTTLING
> {
> .name = "throttle.read_bps_device",
> @@ -1444,7 +1495,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
> static struct cgroup_subsys_state *
> blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
> {
> - struct blkio_cgroup *blkcg;
> + struct blkio_cgroup *blkcg, *parent_blkcg = NULL;
> struct cgroup *parent = cgroup->parent;
>
> if (!parent) {
> @@ -1452,6 +1503,7 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
> goto done;
> }
>
> + parent_blkcg = cgroup_to_blkio_cgroup(parent);
> blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
> if (!blkcg)
> return ERR_PTR(-ENOMEM);
> @@ -1462,6 +1514,11 @@ done:
> INIT_HLIST_HEAD(&blkcg->blkg_list);
>
> INIT_LIST_HEAD(&blkcg->policy_list);
> + if (parent)
> + blkcg->use_hierarchy = parent_blkcg->use_hierarchy;
> + else
> + blkcg->use_hierarchy = 0;
> +
> return &blkcg->css;
> }
>
> diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
> index ea4861b..5b4b351 100644
> --- a/block/blk-cgroup.h
> +++ b/block/blk-cgroup.h
> @@ -90,6 +90,7 @@ enum blkcg_file_name_prop {
> BLKIO_PROP_idle_time,
> BLKIO_PROP_empty_time,
> BLKIO_PROP_dequeue,
> + BLKIO_PROP_use_hierarchy,
> };
>
> /* cgroup files owned by throttle policy */
> @@ -105,6 +106,7 @@ enum blkcg_file_name_throtl {
> struct blkio_cgroup {
> struct cgroup_subsys_state css;
> unsigned int weight;
> + bool use_hierarchy;
> spinlock_t lock;
> struct hlist_head blkg_list;
> struct list_head policy_list; /* list of blkio_policy_node */
> @@ -179,6 +181,7 @@ struct blkio_policy_node {
>
> extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
> dev_t dev);
> +extern unsigned int blkcg_get_use_hierarchy(struct blkio_cgroup *blkcg);
> extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg,
> dev_t dev);
> extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg,
> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
> index aa3eda8..0e21d27 100644
> --- a/block/cfq-iosched.c
> +++ b/block/cfq-iosched.c
> @@ -110,6 +110,9 @@ struct cfq_entity {
> u64 vdisktime;
> bool is_group_entity;
> unsigned int weight;
> + struct cfq_entity *parent;
> + /* Reposition time */
> + unsigned long reposition_time;
> };
>
> /*
> @@ -118,8 +121,6 @@ struct cfq_entity {
> struct cfq_queue {
> /* The schedule entity */
> struct cfq_entity cfqe;
> - /* Reposition time */
> - unsigned long reposition_time;
> /* reference count */
> int ref;
> /* various state flags, see below */
> @@ -199,6 +200,9 @@ struct cfq_group {
> /* number of cfqq currently on this group */
> int nr_cfqq;
>
> + /* number of sub cfq groups */
> + int nr_subgp;
> +
> /*
> * Per group busy queus average. Useful for workload slice calc. We
> * create the array for each prio class but at run time it is used
> @@ -234,10 +238,11 @@ struct cfq_group {
> */
> struct cfq_data {
> struct request_queue *queue;
> - /* Root service tree for cfq_groups */
> - struct cfq_rb_root grp_service_tree;
> struct cfq_group root_group;
>
> + /* cfq group schedule in flat or hierarchy manner. */
> + bool use_hierarchy;
> +
> /*
> * The priority currently being served
> */
> @@ -246,6 +251,9 @@ struct cfq_data {
> unsigned long workload_expires;
> struct cfq_group *serving_group;
>
> + /* Service tree for cfq group flat scheduling mode. */
> + struct cfq_rb_root grp_service_tree;
> +
> /*
> * Each priority tree is sorted by next_request position. These
> * trees are used when determining if two or more queues are
> @@ -355,8 +363,6 @@ cfqg_of_entity(struct cfq_entity *cfqe)
> }
>
>
> -static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
> -
> static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
> enum wl_prio_t prio,
> enum wl_type_t type)
> @@ -643,13 +649,50 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
> return cfqg->busy_queues_avg[rt];
> }
>
> +static inline unsigned int
> +cfq_group_get_total_weight(struct cfq_group *cfqg)
> +{
> + int i, j;
> + struct cfq_rb_root *st;
> + unsigned int total_weight = 0;
> +
> + for_each_cfqg_st(cfqg, i, j, st) {
> + total_weight += st->total_weight;
> + }
> +
> + return total_weight;
> +}
> +
> static inline unsigned
> cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
> {
> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
> struct cfq_entity *cfqe = &cfqg->cfqe;
> + struct cfq_rb_root *st;
> + int group_slice = cfq_target_latency;
> + unsigned int grp_total_weight;
> + struct cfq_group *p_cfqg;
> +
> + /*
> + * Calculate group slice in a hierarchical way.
> + * Note, the calculation is cross all service trees under a group.
> + */
> + do {
> + if (cfqe->parent) {
> + p_cfqg = cfqg_of_entity(cfqe->parent);
> + grp_total_weight = cfq_group_get_total_weight(p_cfqg);
> + group_slice = group_slice * cfqe->weight /
> + grp_total_weight;
> + } else {
> + /* For top level groups */
> + st = cfqe->service_tree;
> + group_slice = group_slice * cfqe->weight /
> + st->total_weight;
> + }
>
> - return cfq_target_latency * cfqe->weight / st->total_weight;
> + cfqe = cfqe->parent;
> + } while (cfqe);
> +
> + return group_slice;
> }
>
> static inline void
> @@ -672,7 +715,8 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
> /* scale low_slice according to IO priority
> * and sync vs async */
> unsigned low_slice =
> - min(slice, base_low_slice * slice / sync_slice);
> + min(slice, base_low_slice * slice /
> + sync_slice);
> /* the adapted slice value is scaled to fit all iqs
> * into the target latency */
> slice = max(slice * group_slice / expect_latency,
> @@ -812,17 +856,6 @@ static struct cfq_entity *cfq_rb_first(struct cfq_rb_root *root)
> return NULL;
> }
>
> -static struct cfq_entity *cfq_rb_first_entity(struct cfq_rb_root *root)
> -{
> - if (!root->left)
> - root->left = rb_first(&root->rb);
> -
> - if (root->left)
> - return rb_entry_entity(root->left);
> -
> - return NULL;
> -}
> -
> static void rb_erase_init(struct rb_node *n, struct rb_root *root)
> {
> rb_erase(n, root);
> @@ -896,12 +929,15 @@ __cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>
> rb_link_node(&cfqe->rb_node, parent, node);
> rb_insert_color(&cfqe->rb_node, &st->rb);
> +
> + update_min_vdisktime(st);
> }
>
> static void
> cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
> {
> __cfq_entity_service_tree_add(st, cfqe);
> + cfqe->reposition_time = jiffies;
> st->count++;
> st->total_weight += cfqe->weight;
> }
> @@ -909,34 +945,52 @@ cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
> static void
> cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
> {
> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
> struct cfq_entity *cfqe = &cfqg->cfqe;
> - struct cfq_entity *__cfqe;
> struct rb_node *n;
> + struct cfq_entity *entity;
> + struct cfq_rb_root *st;
> + struct cfq_group *__cfqg;
>
> cfqg->nr_cfqq++;
> +
> if (!RB_EMPTY_NODE(&cfqe->rb_node))
> return;
>
> /*
> - * Currently put the group at the end. Later implement something
> - * so that groups get lesser vtime based on their weights, so that
> - * if group does not loose all if it was not continously backlogged.
> + * Enqueue this group and its ancestors onto their service tree.
> */
> - n = rb_last(&st->rb);
> - if (n) {
> - __cfqe = rb_entry_entity(n);
> - cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
> - } else
> - cfqe->vdisktime = st->min_vdisktime;
> + while (cfqe) {
> + if (!RB_EMPTY_NODE(&cfqe->rb_node))
> + return;
>
> - cfq_entity_service_tree_add(st, cfqe);
> + /*
> + * Currently put the group at the end. Later implement
> + * something so that groups get lesser vtime based on
> + * their weights, so that if group does not loose all
> + * if it was not continously backlogged.
> + */
> + st = cfqe->service_tree;
> + n = rb_last(&st->rb);
> + if (n) {
> + entity = rb_entry_entity(n);
> + cfqe->vdisktime = entity->vdisktime +
> + CFQ_IDLE_DELAY;
> + } else
> + cfqe->vdisktime = st->min_vdisktime;
> +
> + cfq_entity_service_tree_add(st, cfqe);
> + cfqe = cfqe->parent;
> + __cfqg = cfqg_of_entity(cfqe);
> + if (__cfqg)
> + __cfqg->nr_subgp++;
> + }
> }
>
> static void
> __cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
> {
> cfq_rb_erase(&cfqe->rb_node, st);
> + update_min_vdisktime(st);
> }
>
> static void
> @@ -945,27 +999,43 @@ cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
> if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
> __cfq_entity_service_tree_del(st, cfqe);
> st->total_weight -= cfqe->weight;
> - cfqe->service_tree = NULL;
> }
> }
>
> static void
> cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
> {
> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
> struct cfq_entity *cfqe = &cfqg->cfqe;
> + struct cfq_group *__cfqg, *p_cfqg;
>
> BUG_ON(cfqg->nr_cfqq < 1);
> cfqg->nr_cfqq--;
>
> - /* If there are other cfq queues under this group, don't delete it */
> - if (cfqg->nr_cfqq)
> + /*
> + * If there are other cfq queues under this group, or there are other
> + * cfq groups under this group, don't delete it.
> + */
> + if (cfqg->nr_cfqq || cfqg->nr_subgp)
> return;
>
> - cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
> - cfq_entity_service_tree_del(st, cfqe);
> - cfqg->saved_workload_slice = 0;
> - cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
> + /*
> + * Dequeue this group and its ancestors from their service
> + * tree.
> + */
> + while (cfqe) {
> + __cfqg = cfqg_of_entity(cfqe);
> + p_cfqg = cfqg_of_entity(cfqe->parent);
> + cfq_entity_service_tree_del(cfqe->service_tree, cfqe);
> + cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1);
> + cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group");
> + __cfqg->saved_workload_slice = 0;
> + cfqe = cfqe->parent;
> + if (p_cfqg) {
> + p_cfqg->nr_subgp--;
> + if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp)
> + return;
> + }
> + }
> }
>
> static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
> @@ -997,7 +1067,6 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
> static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
> struct cfq_queue *cfqq)
> {
> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
> unsigned int used_sl, charge;
> int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
> - cfqg->service_tree_idle.count;
> @@ -1011,10 +1080,23 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
> else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
> charge = cfqq->allocated_slice;
>
> - /* Can't update vdisktime while group is on service tree */
> - __cfq_entity_service_tree_del(st, cfqe);
> - cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
> - __cfq_entity_service_tree_add(st, cfqe);
> + /*
> + * Update the vdisktime on the whole chain.
> + */
> + while (cfqe) {
> + struct cfq_rb_root *st = cfqe->service_tree;
> +
> + /*
> + * Can't update vdisktime while group is on service
> + * tree.
> + */
> + __cfq_entity_service_tree_del(st, cfqe);
> + cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
> + __cfq_entity_service_tree_add(st, cfqe);
> + st->count++;
> + cfqe->reposition_time = jiffies;
> + cfqe = cfqe->parent;
> + }
>
> /* This group is being expired. Save the context */
> if (time_after(cfqd->workload_expires, jiffies)) {
> @@ -1026,7 +1108,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
> cfqg->saved_workload_slice = 0;
>
> cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu",
> - cfqe->vdisktime, st->min_vdisktime);
> + cfqg->cfqe.vdisktime,
> + cfqg->cfqe.service_tree->min_vdisktime);
> cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
> " sect=%u", used_sl, cfqq->slice_dispatch, charge,
> iops_mode(cfqd), cfqq->nr_sectors);
> @@ -1048,35 +1131,27 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
> cfqg_of_blkg(blkg)->cfqe.weight = weight;
> }
>
> -static struct cfq_group *
> -cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
> +static void init_cfqe(struct blkio_cgroup *blkcg,
> + struct cfq_group *cfqg)
> +{
> + struct cfq_entity *cfqe = &cfqg->cfqe;
> +
> + cfqe->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
> + RB_CLEAR_NODE(&cfqe->rb_node);
> + cfqe->is_group_entity = true;
> + cfqe->parent = NULL;
> +}
> +
> +static void init_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg,
> + struct cfq_group *cfqg)
> {
> - struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
> - struct cfq_group *cfqg = NULL;
> - void *key = cfqd;
> int i, j;
> struct cfq_rb_root *st;
> - struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
> unsigned int major, minor;
> -
> - cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
> - if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
> - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
> - cfqg->blkg.dev = MKDEV(major, minor);
> - goto done;
> - }
> - if (cfqg || !create)
> - goto done;
> -
> - cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
> - if (!cfqg)
> - goto done;
> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>
> for_each_cfqg_st(cfqg, i, j, st)
> *st = CFQ_RB_ROOT;
> - RB_CLEAR_NODE(&cfqg->cfqe.rb_node);
> -
> - cfqg->cfqe.is_group_entity = true;
>
> /*
> * Take the initial reference that will be released on destroy
> @@ -1086,24 +1161,199 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
> */
> cfqg->ref = 1;
>
> + /* Add group onto cgroup list */
> + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
> + cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
> + MKDEV(major, minor));
> + /* Initiate group entity */
> + init_cfqe(blkcg, cfqg);
> + /* Add group on cfqd list */
> + hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
> +}
> +
> +static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg);
> +
> +static void uninit_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
> +{
> + if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
> + cfq_destroy_cfqg(cfqd, cfqg);
> +}
> +
> +static void cfqg_set_parent(struct cfq_data *cfqd, struct cfq_group *cfqg,
> + struct cfq_group *p_cfqg)
> +{
> + struct cfq_entity *cfqe, *p_cfqe;
> +
> + cfqe = &cfqg->cfqe;
> +
> /*
> - * Add group onto cgroup list. It might happen that bdi->dev is
> - * not initiliazed yet. Initialize this new group without major
> - * and minor info and this info will be filled in once a new thread
> - * comes for IO. See code above.
> + * 1. If use_hierarchy of the CGroup where cfqg's parent stays is not
> + * set, we put this cfqg onto global service tree.
> + * 2. If cfqg is root cfqg, put it onto global service tree.
> */
> - if (bdi->dev) {
> - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
> - cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
> - MKDEV(major, minor));
> - } else
> - cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
> - 0);
> + if (!p_cfqg) {
> + cfqe->service_tree = &cfqd->grp_service_tree;
> + cfqe->parent = NULL;
> + return;
> + }
>
> - cfqg->cfqe.weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
> + p_cfqe = &p_cfqg->cfqe;
>
> - /* Add group on cfqd list */
> - hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
> + cfqe->parent = p_cfqe;
> +
> + /*
> + * Currently, just put cfq group entity on "BE:SYNC" workload
> + * service tree.
> + */
> + cfqe->service_tree = service_tree_for(p_cfqg, BE_WORKLOAD,
> + SYNC_WORKLOAD);
> + /* child reference */
> + p_cfqg->ref++;
> +}
> +
> +static struct cfq_group *cfqg_get_parent(struct cfq_group * cfqg)
> +{
> + struct cfq_entity *cfqe, *p_cfqe;
> +
> + if (!cfqg)
> + return NULL;
> +
> + cfqe = &cfqg->cfqe;
> + p_cfqe = cfqe->parent;
> + if (!p_cfqe)
> + return NULL;
> +
> + return cfqg_of_entity(p_cfqe);
> +}
> +
> +static struct cfq_group *
> +cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup)
> +{
> + struct blkio_cgroup *blkcg;
> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
> + unsigned int major, minor;
> + struct cfq_group *cfqg, *leaf_cfqg, *child_cfqg, *tmp_cfqg;
> + void *key = cfqd;
> +
> + /*
> + * If CGroup's use_hierarchy is unset, we just need to allocate only
> + * one CFQ group, and this group will put onto the "grp_service_tree".
> + * We don't need to check whether the cfqg exists, the caller has
> + * already checked it.
> + */
> + blkcg = cgroup_to_blkio_cgroup(cgroup);
> + if (!blkcg_get_use_hierarchy(blkcg)) {
> + cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC,
> + cfqd->queue->node);
> + if (!cfqg)
> + return NULL;
> +
> + init_cfqg(cfqd, blkcg, cfqg);
> + cfqg_set_parent(cfqd, cfqg, NULL);
> + return cfqg;
> + }
> +
> + /*
> + * Allocate the CFQ group chain until we meet the group we'v already
> + * allocated before, or to the CGroup whose use_hierarchy is not set.
> + */
> + leaf_cfqg = NULL;
> + child_cfqg = NULL;
> + for (; cgroup != NULL; cgroup = cgroup->parent) {
> + blkcg = cgroup_to_blkio_cgroup(cgroup);
> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
> + if (cfqg) {
> + if (!cfqg->blkg.dev && bdi->dev &&
> + dev_name(bdi->dev)) {
> + sscanf(dev_name(bdi->dev), "%u:%u",
> + &major, &minor);
> + cfqg->blkg.dev = MKDEV(major, minor);
> + }
> +
> + /*
> + * Initialization of parent doesn't finish yet, get
> + * it done.
> + */
> + if (child_cfqg) {
> + if (blkcg_get_use_hierarchy(blkcg))
> + cfqg_set_parent(cfqd, child_cfqg,
> + cfqg);
> + else
> + cfqg_set_parent(cfqd, child_cfqg,
> + NULL);
> + }
> +
> + /* chain has already been built */
> + break;
> + }
> +
> + /*
> + * We only allocate a cfqg that the corresponding cgroup's
> + * use_hierarchy is set.
> + */
> + if (blkcg_get_use_hierarchy(blkcg)) {
> + cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC,
> + cfqd->queue->node);
> + if (!cfqg)
> + goto clean_up;
> +
> + if (!leaf_cfqg)
> + leaf_cfqg = cfqg;
> +
> + init_cfqg(cfqd, blkcg, cfqg);
> + } else {
> + cfqg = NULL;
> + }
> +
> + if (child_cfqg)
> + cfqg_set_parent(cfqd, child_cfqg, cfqg);
> +
> + /*
> + * This CGroup's use_hierarchy isn't set, this means the CFQ
> + * group chain has been built.
> + */
> + if (!blkcg_get_use_hierarchy(blkcg))
> + break;
> +
> + child_cfqg = cfqg;
> + }
> +
> + return leaf_cfqg;
> +
> +clean_up:
> + /* clean up the allocated cfq groups. */
> + while (leaf_cfqg) {
> + tmp_cfqg = leaf_cfqg;
> + leaf_cfqg = cfqg_get_parent(leaf_cfqg);
> + uninit_cfqg(cfqd, tmp_cfqg);
> + }
> +
> + return NULL;
> +}
> +
> +static struct cfq_group *
> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
> +{
> + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
> + struct cfq_group *cfqg = NULL;
> + void *key = cfqd;
> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
> + unsigned int major, minor;
> +
> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
> + if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
> + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
> + cfqg->blkg.dev = MKDEV(major, minor);
> + goto done;
> + }
> + if (cfqg || !create)
> + goto done;
> +
> + /*
> + * Allocate CFQ group chain to the root group or we meet the CGroup
> + * with use_hierarchy disabled.
> + */
> + cfqg = cfqg_chain_alloc(cfqd, cgroup);
>
> done:
> return cfqg;
> @@ -1148,6 +1398,7 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
> {
> struct cfq_rb_root *st;
> int i, j;
> + struct cfq_group *p_cfqg;
>
> BUG_ON(cfqg->ref <= 0);
> cfqg->ref--;
> @@ -1155,6 +1406,22 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
> return;
> for_each_cfqg_st(cfqg, i, j, st)
> BUG_ON(!RB_EMPTY_ROOT(&st->rb));
> +
> + do {
> + p_cfqg = cfqg_get_parent(cfqg);
> + kfree(cfqg);
> + cfqg = NULL;
> + /*
> + * Drop the reference taken by children, if nobody references
> + * parent group, we need delete the parent also.
> + */
> + if (p_cfqg) {
> + p_cfqg->ref--;
> + if (p_cfqg->ref == 0)
> + cfqg = p_cfqg;
> + }
> + } while (cfqg);
> +
> kfree(cfqg);
> }
>
> @@ -1321,9 +1588,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
> * ioprio.
> */
> pos_offset = cfq_get_boost(cfqd, cfqq);
> - /* Debug purpose, should remove. */
> - cfq_log_cfqq(cfqd, cfqq, "pos_offset: %llu",
> - pos_offset);
> cfqe->vdisktime = service_tree->min_vdisktime +
> pos_offset;
> } else
> @@ -1365,9 +1629,8 @@ insert:
> cfqe->service_tree = service_tree;
>
> /* Add cfqq onto service tree. */
> +
> cfq_entity_service_tree_add(service_tree, cfqe);
> - update_min_vdisktime(service_tree);
> - cfqq->reposition_time = jiffies;
> if ((add_front || !new_cfqq) && !group_changed)
> return;
> cfq_group_service_tree_add(cfqd, cfqq->cfqg);
> @@ -1810,28 +2073,43 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
> return cfqq_of_entity(cfq_rb_first(service_tree));
> }
>
> -static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
> +struct cfq_rb_root *choose_service_tree_forced(struct cfq_group *cfqg)
> {
> - struct cfq_group *cfqg;
> - struct cfq_entity *cfqe;
> int i, j;
> struct cfq_rb_root *st;
>
> - if (!cfqd->rq_queued)
> - return NULL;
> + for_each_cfqg_st(cfqg, i, j, st) {
> + if (st->count != 0)
> + return st;
> + }
>
> - cfqg = cfq_get_next_cfqg(cfqd);
> - if (!cfqg)
> + return NULL;
> +}
> +
> +static struct cfq_entity *
> +cfq_get_next_entity_forced(struct cfq_data *cfqd)
> +{
> + struct cfq_entity *cfqe;
> + struct cfq_rb_root *st = &cfqd->grp_service_tree;
> + struct cfq_group *cfqg;
> +
> + if (!cfqd->rq_queued)
> return NULL;
>
> - for_each_cfqg_st(cfqg, i, j, st) {
> + do {
> cfqe = cfq_rb_first(st);
> - if (cfqe != NULL)
> - return cfqq_of_entity(cfqe);
> - }
> + if (cfqe && !cfqe->is_group_entity)
> + return cfqe;
> + else if (cfqe && cfqe->is_group_entity)
> + cfqg = cfqg_of_entity(cfqe);
> +
> + st = choose_service_tree_forced(cfqg);
> + } while (st);
> +
> return NULL;
> }
>
> +
> /*
> * Get and set a new active qu
>
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at http://www.tux.org/lkml/
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/