Re: [PATCH 5/6 v4] cfq-iosched: CFQ group hierarchical scheduling and use_hierarchy interface

From: Gui Jianfeng
Date: Wed Feb 16 2011 - 20:22:18 EST


Justin TerAvest wrote:
> After a quick read,
>
> It's sad that we have to have so many use_hierarchy checks; it seems
> like we're asking for bugs, especially in the future when one codepath
> gets updated but not the other.
>
> CodingStyle says we should only have one declaration per line.
>
> I feel like there is an implicit assumption that groups and tasks
> should not be children of the same parent; that is, a group should
> contain only groups, or only tasks, but I don't see this enforced;
> there's just an assumption that BE:SYNC is "good enough" for that
> comparison. This smells like something that will be tweaked/tuned for
> fairness later. :( Why don't we just prevent this from happening?

Hi Justin,

Thanks for reviewing.

Previously, I posted a very first version that allowed a group to contain
only groups or only tasks. But I think it's more flexible to treat groups
and tasks at the same level, and I believe Vivek and Jens share that
opinion. We discussed this in the thread http://lkml.org/lkml/2010/8/30/30
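
To illustrate what scheduling groups and tasks "at the same level" means:
a child group's entity and a task queue's entity sit on the same service
tree and compete purely by vdisktime. A toy userspace model of that pick
(everything below is illustrative; none of these names come from the
patch):

    /*
     * Toy model, not kernel code: queues and groups share one tree
     * keyed by vdisktime, so a queue and a sibling group compete
     * directly and the scheduler serves the smallest key first.
     */
    #include <stdbool.h>
    #include <stdio.h>

    struct entity {
            unsigned long long vdisktime;
            bool is_group;
            const char *name;
    };

    int main(void)
    {
            /* pretend this array is the rb-tree, already sorted */
            struct entity tree[] = {
                    { 100, false, "task queue A" },
                    { 120, true,  "child group B" },
                    { 150, false, "task queue C" },
            };

            /* whether the leftmost entity is a queue or a group is
             * irrelevant; it simply has the smallest vdisktime */
            printf("serve \"%s\" first\n", tree[0].name);
            return 0;
    }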

>
> The clean_up label in chain_alloc() is strange; I don't think the goto
> is necessary at all. I found that method generally hard to understand.
> It's doing a lot.

I don't see how we could avoid clean_up here.
When we fail to allocate a cfq group at some level, we have to clean up
all the groups in the chain that we have already allocated.
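
To make the failure path concrete, here is the shape of that rollback,
condensed from cfqg_chain_alloc() in the patch (allocation loop and
error path only; the lookup and parent-linking steps are elided):

    struct cfq_group *cfqg, *tmp_cfqg, *leaf_cfqg = NULL;

    for (; cgroup != NULL; cgroup = cgroup->parent) {
            /* lookup elided: reuse an existing cfqg when one is found */
            cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC,
                                cfqd->queue->node);
            if (!cfqg)
                    goto clean_up;  /* chain is only partly built */
            if (!leaf_cfqg)
                    leaf_cfqg = cfqg;       /* remember the bottom end */
            /* init_cfqg() and cfqg_set_parent() elided */
    }
    return leaf_cfqg;

    clean_up:
            /* walk from the leaf back up the partial chain, releasing
             * every group that was already allocated */
            while (leaf_cfqg) {
                    tmp_cfqg = leaf_cfqg;
                    leaf_cfqg = cfqg_get_parent(leaf_cfqg);
                    uninit_cfqg(cfqd, tmp_cfqg);
            }
            return NULL;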

Thanks,
Gui

>
> It's possible that some of these can't be worked around.
>
>
> On Wed, Feb 9, 2011 at 11:47 PM, Gui Jianfeng
> <guijianfeng@xxxxxxxxxxxxxx> wrote:
>> CFQ group hierarchical scheduling and use_hierarchy interface.
>>
>> Signed-off-by: Gui Jianfeng <guijianfeng@xxxxxxxxxxxxxx>
>> ---
>> block/blk-cgroup.c | 61 +++++-
>> block/blk-cgroup.h | 3 +
>> block/cfq-iosched.c | 603 +++++++++++++++++++++++++++++++++++++--------------
>> 3 files changed, 500 insertions(+), 167 deletions(-)
>>
>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
>> index 455768a..c55fecd 100644
>> --- a/block/blk-cgroup.c
>> +++ b/block/blk-cgroup.c
>> @@ -25,7 +25,10 @@
>> static DEFINE_SPINLOCK(blkio_list_lock);
>> static LIST_HEAD(blkio_list);
>>
>> -struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
>> +struct blkio_cgroup blkio_root_cgroup = {
>> + .weight = 2*BLKIO_WEIGHT_DEFAULT,
>> + .use_hierarchy = 0
>> +};
>> EXPORT_SYMBOL_GPL(blkio_root_cgroup);
>>
>> static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
>> @@ -454,6 +457,7 @@ static void __blkiocg_del_blkio_group(struct blkio_group *blkg)
>> blkg->blkcg_id = 0;
>> }
>>
>> +
>> /*
>> * returns 0 if blkio_group was still on cgroup list. Otherwise returns 1
>> * indicating that blk_group was unhashed by the time we got to it.
>> @@ -765,6 +769,12 @@ unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
>> }
>> EXPORT_SYMBOL_GPL(blkcg_get_weight);
>>
>> +unsigned int blkcg_get_use_hierarchy(struct blkio_cgroup *blkcg)
>> +{
>> + return blkcg->use_hierarchy;
>> +}
>> +EXPORT_SYMBOL_GPL(blkcg_get_use_hierarchy);
>> +
>> uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg, dev_t dev)
>> {
>> struct blkio_policy_node *pn;
>> @@ -1202,6 +1212,8 @@ static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
>> switch(name) {
>> case BLKIO_PROP_weight:
>> return (u64)blkcg->weight;
>> + case BLKIO_PROP_use_hierarchy:
>> + return (u64)blkcg->use_hierarchy;
>> }
>> break;
>> default:
>> @@ -1210,6 +1222,36 @@ static u64 blkiocg_file_read_u64 (struct cgroup *cgrp, struct cftype *cft) {
>> return 0;
>> }
>>
>> +static int blkio_use_hierarchy_write(struct cgroup *cgrp, u64 val)
>> +{
>> + struct cgroup *parent = cgrp->parent;
>> + struct blkio_cgroup *blkcg, *parent_blkcg = NULL;
>> + int ret = 0;
>> +
>> + if (val != 0 && val != 1)
>> + return -EINVAL;
>> +
>> + blkcg = cgroup_to_blkio_cgroup(cgrp);
>> + if (parent)
>> + parent_blkcg = cgroup_to_blkio_cgroup(parent);
>> +
>> + cgroup_lock();
>> + /*
>> + * If parent's use_hierarchy is set, we can't make any modifications
>> + * in the child subtrees. If it is unset, then the change can occur,
>> + * provided the current cgroup has no children.
>> + */
>> + if (!parent_blkcg || !parent_blkcg->use_hierarchy) {
>> + if (list_empty(&cgrp->children))
>> + blkcg->use_hierarchy = val;
>> + else
>> + ret = -EBUSY;
>> + } else
>> + ret = -EINVAL;
>> + cgroup_unlock();
>> + return ret;
>> +}
>> +
>> static int
>> blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
>> {
>> @@ -1224,6 +1266,8 @@ blkiocg_file_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
>> switch(name) {
>> case BLKIO_PROP_weight:
>> return blkio_weight_write(blkcg, val);
>> + case BLKIO_PROP_use_hierarchy:
>> + return blkio_use_hierarchy_write(cgrp, val);
>> }
>> break;
>> default:
>> @@ -1301,6 +1345,13 @@ struct cftype blkio_files[] = {
>> .name = "reset_stats",
>> .write_u64 = blkiocg_reset_stats,
>> },
>> + {
>> + .name = "use_hierarchy",
>> + .private = BLKIOFILE_PRIVATE(BLKIO_POLICY_PROP,
>> + BLKIO_PROP_use_hierarchy),
>> + .read_u64 = blkiocg_file_read_u64,
>> + .write_u64 = blkiocg_file_write_u64,
>> + },
>> #ifdef CONFIG_BLK_DEV_THROTTLING
>> {
>> .name = "throttle.read_bps_device",
>> @@ -1444,7 +1495,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
>> static struct cgroup_subsys_state *
>> blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
>> {
>> - struct blkio_cgroup *blkcg;
>> + struct blkio_cgroup *blkcg, *parent_blkcg = NULL;
>> struct cgroup *parent = cgroup->parent;
>>
>> if (!parent) {
>> @@ -1452,6 +1503,7 @@ blkiocg_create(struct cgroup_subsys *subsys, struct cgroup *cgroup)
>> goto done;
>> }
>>
>> + parent_blkcg = cgroup_to_blkio_cgroup(parent);
>> blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
>> if (!blkcg)
>> return ERR_PTR(-ENOMEM);
>> @@ -1462,6 +1514,11 @@ done:
>> INIT_HLIST_HEAD(&blkcg->blkg_list);
>>
>> INIT_LIST_HEAD(&blkcg->policy_list);
>> + if (parent)
>> + blkcg->use_hierarchy = parent_blkcg->use_hierarchy;
>> + else
>> + blkcg->use_hierarchy = 0;
>> +
>> return &blkcg->css;
>> }
>>
>> diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
>> index ea4861b..5b4b351 100644
>> --- a/block/blk-cgroup.h
>> +++ b/block/blk-cgroup.h
>> @@ -90,6 +90,7 @@ enum blkcg_file_name_prop {
>> BLKIO_PROP_idle_time,
>> BLKIO_PROP_empty_time,
>> BLKIO_PROP_dequeue,
>> + BLKIO_PROP_use_hierarchy,
>> };
>>
>> /* cgroup files owned by throttle policy */
>> @@ -105,6 +106,7 @@ enum blkcg_file_name_throtl {
>> struct blkio_cgroup {
>> struct cgroup_subsys_state css;
>> unsigned int weight;
>> + bool use_hierarchy;
>> spinlock_t lock;
>> struct hlist_head blkg_list;
>> struct list_head policy_list; /* list of blkio_policy_node */
>> @@ -179,6 +181,7 @@ struct blkio_policy_node {
>>
>> extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
>> dev_t dev);
>> +extern unsigned int blkcg_get_use_hierarchy(struct blkio_cgroup *blkcg);
>> extern uint64_t blkcg_get_read_bps(struct blkio_cgroup *blkcg,
>> dev_t dev);
>> extern uint64_t blkcg_get_write_bps(struct blkio_cgroup *blkcg,
>> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
>> index aa3eda8..0e21d27 100644
>> --- a/block/cfq-iosched.c
>> +++ b/block/cfq-iosched.c
>> @@ -110,6 +110,9 @@ struct cfq_entity {
>> u64 vdisktime;
>> bool is_group_entity;
>> unsigned int weight;
>> + struct cfq_entity *parent;
>> + /* Reposition time */
>> + unsigned long reposition_time;
>> };
>>
>> /*
>> @@ -118,8 +121,6 @@ struct cfq_entity {
>> struct cfq_queue {
>> /* The schedule entity */
>> struct cfq_entity cfqe;
>> - /* Reposition time */
>> - unsigned long reposition_time;
>> /* reference count */
>> int ref;
>> /* various state flags, see below */
>> @@ -199,6 +200,9 @@ struct cfq_group {
>> /* number of cfqq currently on this group */
>> int nr_cfqq;
>>
>> + /* number of sub cfq groups */
>> + int nr_subgp;
>> +
>> /*
>> * Per group busy queus average. Useful for workload slice calc. We
>> * create the array for each prio class but at run time it is used
>> @@ -234,10 +238,11 @@ struct cfq_group {
>> */
>> struct cfq_data {
>> struct request_queue *queue;
>> - /* Root service tree for cfq_groups */
>> - struct cfq_rb_root grp_service_tree;
>> struct cfq_group root_group;
>>
>> + /* cfq groups are scheduled in a flat or hierarchical manner. */
>> + bool use_hierarchy;
>> +
>> /*
>> * The priority currently being served
>> */
>> @@ -246,6 +251,9 @@ struct cfq_data {
>> unsigned long workload_expires;
>> struct cfq_group *serving_group;
>>
>> + /* Service tree for cfq group flat scheduling mode. */
>> + struct cfq_rb_root grp_service_tree;
>> +
>> /*
>> * Each priority tree is sorted by next_request position. These
>> * trees are used when determining if two or more queues are
>> @@ -355,8 +363,6 @@ cfqg_of_entity(struct cfq_entity *cfqe)
>> }
>>
>>
>> -static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
>> -
>> static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
>> enum wl_prio_t prio,
>> enum wl_type_t type)
>> @@ -643,13 +649,50 @@ static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
>> return cfqg->busy_queues_avg[rt];
>> }
>>
>> +static inline unsigned int
>> +cfq_group_get_total_weight(struct cfq_group *cfqg)
>> +{
>> + int i, j;
>> + struct cfq_rb_root *st;
>> + unsigned int total_weight = 0;
>> +
>> + for_each_cfqg_st(cfqg, i, j, st) {
>> + total_weight += st->total_weight;
>> + }
>> +
>> + return total_weight;
>> +}
>> +
>> static inline unsigned
>> cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> {
>> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> struct cfq_entity *cfqe = &cfqg->cfqe;
>> + struct cfq_rb_root *st;
>> + int group_slice = cfq_target_latency;
>> + unsigned int grp_total_weight;
>> + struct cfq_group *p_cfqg;
>> +
>> + /*
>> + * Calculate group slice in a hierarchical way.
>> + * Note that the calculation crosses all service trees under a group.
>> + */
>> + do {
>> + if (cfqe->parent) {
>> + p_cfqg = cfqg_of_entity(cfqe->parent);
>> + grp_total_weight = cfq_group_get_total_weight(p_cfqg);
>> + group_slice = group_slice * cfqe->weight /
>> + grp_total_weight;
>> + } else {
>> + /* For top level groups */
>> + st = cfqe->service_tree;
>> + group_slice = group_slice * cfqe->weight /
>> + st->total_weight;
>> + }
>>
>> - return cfq_target_latency * cfqe->weight / st->total_weight;
>> + cfqe = cfqe->parent;
>> + } while (cfqe);
>> +
>> + return group_slice;
>> }
>>
>> static inline void
>> @@ -672,7 +715,8 @@ cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
>> /* scale low_slice according to IO priority
>> * and sync vs async */
>> unsigned low_slice =
>> - min(slice, base_low_slice * slice / sync_slice);
>> + min(slice, base_low_slice * slice /
>> + sync_slice);
>> /* the adapted slice value is scaled to fit all iqs
>> * into the target latency */
>> slice = max(slice * group_slice / expect_latency,
>> @@ -812,17 +856,6 @@ static struct cfq_entity *cfq_rb_first(struct cfq_rb_root *root)
>> return NULL;
>> }
>>
>> -static struct cfq_entity *cfq_rb_first_entity(struct cfq_rb_root *root)
>> -{
>> - if (!root->left)
>> - root->left = rb_first(&root->rb);
>> -
>> - if (root->left)
>> - return rb_entry_entity(root->left);
>> -
>> - return NULL;
>> -}
>> -
>> static void rb_erase_init(struct rb_node *n, struct rb_root *root)
>> {
>> rb_erase(n, root);
>> @@ -896,12 +929,15 @@ __cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>>
>> rb_link_node(&cfqe->rb_node, parent, node);
>> rb_insert_color(&cfqe->rb_node, &st->rb);
>> +
>> + update_min_vdisktime(st);
>> }
>>
>> static void
>> cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>> {
>> __cfq_entity_service_tree_add(st, cfqe);
>> + cfqe->reposition_time = jiffies;
>> st->count++;
>> st->total_weight += cfqe->weight;
>> }
>> @@ -909,34 +945,52 @@ cfq_entity_service_tree_add(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>> static void
>> cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> {
>> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> struct cfq_entity *cfqe = &cfqg->cfqe;
>> - struct cfq_entity *__cfqe;
>> struct rb_node *n;
>> + struct cfq_entity *entity;
>> + struct cfq_rb_root *st;
>> + struct cfq_group *__cfqg;
>>
>> cfqg->nr_cfqq++;
>> +
>> if (!RB_EMPTY_NODE(&cfqe->rb_node))
>> return;
>>
>> /*
>> - * Currently put the group at the end. Later implement something
>> - * so that groups get lesser vtime based on their weights, so that
>> - * if group does not loose all if it was not continously backlogged.
>> + * Enqueue this group and its ancestors onto their service trees.
>> */
>> - n = rb_last(&st->rb);
>> - if (n) {
>> - __cfqe = rb_entry_entity(n);
>> - cfqe->vdisktime = __cfqe->vdisktime + CFQ_IDLE_DELAY;
>> - } else
>> - cfqe->vdisktime = st->min_vdisktime;
>> + while (cfqe) {
>> + if (!RB_EMPTY_NODE(&cfqe->rb_node))
>> + return;
>>
>> - cfq_entity_service_tree_add(st, cfqe);
>> + /*
>> + * Currently put the group at the end. Later implement
>> + * something so that groups get lesser vtime based on
>> + * their weights, so that a group does not lose all of
>> + * its share if it was not continuously backlogged.
>> + */
>> + st = cfqe->service_tree;
>> + n = rb_last(&st->rb);
>> + if (n) {
>> + entity = rb_entry_entity(n);
>> + cfqe->vdisktime = entity->vdisktime +
>> + CFQ_IDLE_DELAY;
>> + } else
>> + cfqe->vdisktime = st->min_vdisktime;
>> +
>> + cfq_entity_service_tree_add(st, cfqe);
>> + cfqe = cfqe->parent;
>> + __cfqg = cfqg_of_entity(cfqe);
>> + if (__cfqg)
>> + __cfqg->nr_subgp++;
>> + }
>> }
>>
>> static void
>> __cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>> {
>> cfq_rb_erase(&cfqe->rb_node, st);
>> + update_min_vdisktime(st);
>> }
>>
>> static void
>> @@ -945,27 +999,43 @@ cfq_entity_service_tree_del(struct cfq_rb_root *st, struct cfq_entity *cfqe)
>> if (!RB_EMPTY_NODE(&cfqe->rb_node)) {
>> __cfq_entity_service_tree_del(st, cfqe);
>> st->total_weight -= cfqe->weight;
>> - cfqe->service_tree = NULL;
>> }
>> }
>>
>> static void
>> cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> {
>> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> struct cfq_entity *cfqe = &cfqg->cfqe;
>> + struct cfq_group *__cfqg, *p_cfqg;
>>
>> BUG_ON(cfqg->nr_cfqq < 1);
>> cfqg->nr_cfqq--;
>>
>> - /* If there are other cfq queues under this group, don't delete it */
>> - if (cfqg->nr_cfqq)
>> + /*
>> + * If there are other cfq queues under this group, or there are other
>> + * cfq groups under this group, don't delete it.
>> + */
>> + if (cfqg->nr_cfqq || cfqg->nr_subgp)
>> return;
>>
>> - cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
>> - cfq_entity_service_tree_del(st, cfqe);
>> - cfqg->saved_workload_slice = 0;
>> - cfq_blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
>> + /*
>> + * Dequeue this group and its ancestors from their service
>> + * trees.
>> + */
>> + while (cfqe) {
>> + __cfqg = cfqg_of_entity(cfqe);
>> + p_cfqg = cfqg_of_entity(cfqe->parent);
>> + cfq_entity_service_tree_del(cfqe->service_tree, cfqe);
>> + cfq_blkiocg_update_dequeue_stats(&__cfqg->blkg, 1);
>> + cfq_log_cfqg(cfqd, __cfqg, "del_from_rr group");
>> + __cfqg->saved_workload_slice = 0;
>> + cfqe = cfqe->parent;
>> + if (p_cfqg) {
>> + p_cfqg->nr_subgp--;
>> + if (p_cfqg->nr_cfqq || p_cfqg->nr_subgp)
>> + return;
>> + }
>> + }
>> }
>>
>> static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
>> @@ -997,7 +1067,6 @@ static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
>> static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
>> struct cfq_queue *cfqq)
>> {
>> - struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> unsigned int used_sl, charge;
>> int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
>> - cfqg->service_tree_idle.count;
>> @@ -1011,10 +1080,23 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
>> else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
>> charge = cfqq->allocated_slice;
>>
>> - /* Can't update vdisktime while group is on service tree */
>> - __cfq_entity_service_tree_del(st, cfqe);
>> - cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
>> - __cfq_entity_service_tree_add(st, cfqe);
>> + /*
>> + * Update the vdisktime on the whole chain.
>> + */
>> + while (cfqe) {
>> + struct cfq_rb_root *st = cfqe->service_tree;
>> +
>> + /*
>> + * Can't update vdisktime while group is on service
>> + * tree.
>> + */
>> + __cfq_entity_service_tree_del(st, cfqe);
>> + cfqe->vdisktime += cfq_scale_slice(charge, cfqe);
>> + __cfq_entity_service_tree_add(st, cfqe);
>> + st->count++;
>> + cfqe->reposition_time = jiffies;
>> + cfqe = cfqe->parent;
>> + }
>>
>> /* This group is being expired. Save the context */
>> if (time_after(cfqd->workload_expires, jiffies)) {
>> @@ -1026,7 +1108,8 @@ static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
>> cfqg->saved_workload_slice = 0;
>>
>> cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu",
>> - cfqe->vdisktime, st->min_vdisktime);
>> + cfqg->cfqe.vdisktime,
>> + cfqg->cfqe.service_tree->min_vdisktime);
>> cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u disp=%u charge=%u iops=%u"
>> " sect=%u", used_sl, cfqq->slice_dispatch, charge,
>> iops_mode(cfqd), cfqq->nr_sectors);
>> @@ -1048,35 +1131,27 @@ void cfq_update_blkio_group_weight(void *key, struct blkio_group *blkg,
>> cfqg_of_blkg(blkg)->cfqe.weight = weight;
>> }
>>
>> -static struct cfq_group *
>> -cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>> +static void init_cfqe(struct blkio_cgroup *blkcg,
>> + struct cfq_group *cfqg)
>> +{
>> + struct cfq_entity *cfqe = &cfqg->cfqe;
>> +
>> + cfqe->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
>> + RB_CLEAR_NODE(&cfqe->rb_node);
>> + cfqe->is_group_entity = true;
>> + cfqe->parent = NULL;
>> +}
>> +
>> +static void init_cfqg(struct cfq_data *cfqd, struct blkio_cgroup *blkcg,
>> + struct cfq_group *cfqg)
>> {
>> - struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>> - struct cfq_group *cfqg = NULL;
>> - void *key = cfqd;
>> int i, j;
>> struct cfq_rb_root *st;
>> - struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>> unsigned int major, minor;
>> -
>> - cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>> - if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>> - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> - cfqg->blkg.dev = MKDEV(major, minor);
>> - goto done;
>> - }
>> - if (cfqg || !create)
>> - goto done;
>> -
>> - cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
>> - if (!cfqg)
>> - goto done;
>> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>>
>> for_each_cfqg_st(cfqg, i, j, st)
>> *st = CFQ_RB_ROOT;
>> - RB_CLEAR_NODE(&cfqg->cfqe.rb_node);
>> -
>> - cfqg->cfqe.is_group_entity = true;
>>
>> /*
>> * Take the initial reference that will be released on destroy
>> @@ -1086,24 +1161,199 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>> */
>> cfqg->ref = 1;
>>
>> + /* Add group onto cgroup list */
>> + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> + cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
>> + MKDEV(major, minor));
>> + /* Initialize the group entity */
>> + init_cfqe(blkcg, cfqg);
>> + /* Add group on cfqd list */
>> + hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
>> +}
>> +
>> +static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg);
>> +
>> +static void uninit_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
>> +{
>> + if (!cfq_blkiocg_del_blkio_group(&cfqg->blkg))
>> + cfq_destroy_cfqg(cfqd, cfqg);
>> +}
>> +
>> +static void cfqg_set_parent(struct cfq_data *cfqd, struct cfq_group *cfqg,
>> + struct cfq_group *p_cfqg)
>> +{
>> + struct cfq_entity *cfqe, *p_cfqe;
>> +
>> + cfqe = &cfqg->cfqe;
>> +
>> /*
>> - * Add group onto cgroup list. It might happen that bdi->dev is
>> - * not initiliazed yet. Initialize this new group without major
>> - * and minor info and this info will be filled in once a new thread
>> - * comes for IO. See code above.
>> + * 1. If use_hierarchy is not set in the CGroup where cfqg's parent
>> + * resides, put this cfqg onto the global service tree.
>> + * 2. If cfqg is the root cfqg, put it onto the global service tree.
>> */
>> - if (bdi->dev) {
>> - sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> - cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
>> - MKDEV(major, minor));
>> - } else
>> - cfq_blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
>> - 0);
>> + if (!p_cfqg) {
>> + cfqe->service_tree = &cfqd->grp_service_tree;
>> + cfqe->parent = NULL;
>> + return;
>> + }
>>
>> - cfqg->cfqe.weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
>> + p_cfqe = &p_cfqg->cfqe;
>>
>> - /* Add group on cfqd list */
>> - hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
>> + cfqe->parent = p_cfqe;
>> +
>> + /*
>> + * Currently, just put cfq group entity on "BE:SYNC" workload
>> + * service tree.
>> + */
>> + cfqe->service_tree = service_tree_for(p_cfqg, BE_WORKLOAD,
>> + SYNC_WORKLOAD);
>> + /* child reference */
>> + p_cfqg->ref++;
>> +}
>> +
>> +static struct cfq_group *cfqg_get_parent(struct cfq_group * cfqg)
>> +{
>> + struct cfq_entity *cfqe, *p_cfqe;
>> +
>> + if (!cfqg)
>> + return NULL;
>> +
>> + cfqe = &cfqg->cfqe;
>> + p_cfqe = cfqe->parent;
>> + if (!p_cfqe)
>> + return NULL;
>> +
>> + return cfqg_of_entity(p_cfqe);
>> +}
>> +
>> +static struct cfq_group *
>> +cfqg_chain_alloc(struct cfq_data *cfqd, struct cgroup *cgroup)
>> +{
>> + struct blkio_cgroup *blkcg;
>> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>> + unsigned int major, minor;
>> + struct cfq_group *cfqg, *leaf_cfqg, *child_cfqg, *tmp_cfqg;
>> + void *key = cfqd;
>> +
>> + /*
>> + * If the CGroup's use_hierarchy is unset, we only need to allocate
>> + * one CFQ group, and this group will be put onto the
>> + * "grp_service_tree". We don't need to check whether the cfqg
>> + * exists; the caller has already checked it.
>> + */
>> + blkcg = cgroup_to_blkio_cgroup(cgroup);
>> + if (!blkcg_get_use_hierarchy(blkcg)) {
>> + cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC,
>> + cfqd->queue->node);
>> + if (!cfqg)
>> + return NULL;
>> +
>> + init_cfqg(cfqd, blkcg, cfqg);
>> + cfqg_set_parent(cfqd, cfqg, NULL);
>> + return cfqg;
>> + }
>> +
>> + /*
>> + * Allocate the CFQ group chain until we meet a group we've already
>> + * allocated, or reach a CGroup whose use_hierarchy is not set.
>> + */
>> + leaf_cfqg = NULL;
>> + child_cfqg = NULL;
>> + for (; cgroup != NULL; cgroup = cgroup->parent) {
>> + blkcg = cgroup_to_blkio_cgroup(cgroup);
>> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>> + if (cfqg) {
>> + if (!cfqg->blkg.dev && bdi->dev &&
>> + dev_name(bdi->dev)) {
>> + sscanf(dev_name(bdi->dev), "%u:%u",
>> + &major, &minor);
>> + cfqg->blkg.dev = MKDEV(major, minor);
>> + }
>> +
>> + /*
>> + * The parent's initialization hasn't finished yet;
>> + * finish it now.
>> + */
>> + if (child_cfqg) {
>> + if (blkcg_get_use_hierarchy(blkcg))
>> + cfqg_set_parent(cfqd, child_cfqg,
>> + cfqg);
>> + else
>> + cfqg_set_parent(cfqd, child_cfqg,
>> + NULL);
>> + }
>> +
>> + /* chain has already been built */
>> + break;
>> + }
>> +
>> + /*
>> + * We only allocate a cfqg if the corresponding cgroup's
>> + * use_hierarchy is set.
>> + */
>> + if (blkcg_get_use_hierarchy(blkcg)) {
>> + cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC,
>> + cfqd->queue->node);
>> + if (!cfqg)
>> + goto clean_up;
>> +
>> + if (!leaf_cfqg)
>> + leaf_cfqg = cfqg;
>> +
>> + init_cfqg(cfqd, blkcg, cfqg);
>> + } else {
>> + cfqg = NULL;
>> + }
>> +
>> + if (child_cfqg)
>> + cfqg_set_parent(cfqd, child_cfqg, cfqg);
>> +
>> + /*
>> + * This CGroup's use_hierarchy isn't set, which means the
>> + * CFQ group chain has been fully built.
>> + */
>> + if (!blkcg_get_use_hierarchy(blkcg))
>> + break;
>> +
>> + child_cfqg = cfqg;
>> + }
>> +
>> + return leaf_cfqg;
>> +
>> +clean_up:
>> + /* clean up the allocated cfq groups. */
>> + while (leaf_cfqg) {
>> + tmp_cfqg = leaf_cfqg;
>> + leaf_cfqg = cfqg_get_parent(leaf_cfqg);
>> + uninit_cfqg(cfqd, tmp_cfqg);
>> + }
>> +
>> + return NULL;
>> +}
>> +
>> +static struct cfq_group *
>> +cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>> +{
>> + struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
>> + struct cfq_group *cfqg = NULL;
>> + void *key = cfqd;
>> + struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
>> + unsigned int major, minor;
>> +
>> + cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
>> + if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
>> + sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> + cfqg->blkg.dev = MKDEV(major, minor);
>> + goto done;
>> + }
>> + if (cfqg || !create)
>> + goto done;
>> +
>> + /*
>> + * Allocate a CFQ group chain up to the root group, or until we meet
>> + * a CGroup with use_hierarchy disabled.
>> + */
>> + cfqg = cfqg_chain_alloc(cfqd, cgroup);
>>
>> done:
>> return cfqg;
>> @@ -1148,6 +1398,7 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
>> {
>> struct cfq_rb_root *st;
>> int i, j;
>> + struct cfq_group *p_cfqg;
>>
>> BUG_ON(cfqg->ref <= 0);
>> cfqg->ref--;
>> @@ -1155,6 +1406,22 @@ static void cfq_put_cfqg(struct cfq_group *cfqg)
>> return;
>> for_each_cfqg_st(cfqg, i, j, st)
>> BUG_ON(!RB_EMPTY_ROOT(&st->rb));
>> +
>> + do {
>> + p_cfqg = cfqg_get_parent(cfqg);
>> + kfree(cfqg);
>> + cfqg = NULL;
>> + /*
>> + * Drop the reference taken by the child. If nobody else references
>> + * the parent group, we need to delete the parent as well.
>> + */
>> + if (p_cfqg) {
>> + p_cfqg->ref--;
>> + if (p_cfqg->ref == 0)
>> + cfqg = p_cfqg;
>> + }
>> + } while (cfqg);
>> +
>> kfree(cfqg);
>> }
>>
>> @@ -1321,9 +1588,6 @@ static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
>> * ioprio.
>> */
>> pos_offset = cfq_get_boost(cfqd, cfqq);
>> - /* Debug purpose, should remove. */
>> - cfq_log_cfqq(cfqd, cfqq, "pos_offset: %llu",
>> - pos_offset);
>> cfqe->vdisktime = service_tree->min_vdisktime +
>> pos_offset;
>> } else
>> @@ -1365,9 +1629,8 @@ insert:
>> cfqe->service_tree = service_tree;
>>
>> /* Add cfqq onto service tree. */
>> +
>> cfq_entity_service_tree_add(service_tree, cfqe);
>> - update_min_vdisktime(service_tree);
>> - cfqq->reposition_time = jiffies;
>> if ((add_front || !new_cfqq) && !group_changed)
>> return;
>> cfq_group_service_tree_add(cfqd, cfqq->cfqg);
>> @@ -1810,28 +2073,43 @@ static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
>> return cfqq_of_entity(cfq_rb_first(service_tree));
>> }
>>
>> -static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
>> +struct cfq_rb_root *choose_service_tree_forced(struct cfq_group *cfqg)
>> {
>> - struct cfq_group *cfqg;
>> - struct cfq_entity *cfqe;
>> int i, j;
>> struct cfq_rb_root *st;
>>
>> - if (!cfqd->rq_queued)
>> - return NULL;
>> + for_each_cfqg_st(cfqg, i, j, st) {
>> + if (st->count != 0)
>> + return st;
>> + }
>>
>> - cfqg = cfq_get_next_cfqg(cfqd);
>> - if (!cfqg)
>> + return NULL;
>> +}
>> +
>> +static struct cfq_entity *
>> +cfq_get_next_entity_forced(struct cfq_data *cfqd)
>> +{
>> + struct cfq_entity *cfqe;
>> + struct cfq_rb_root *st = &cfqd->grp_service_tree;
>> + struct cfq_group *cfqg;
>> +
>> + if (!cfqd->rq_queued)
>> return NULL;
>>
>> - for_each_cfqg_st(cfqg, i, j, st) {
>> + do {
>> cfqe = cfq_rb_first(st);
>> - if (cfqe != NULL)
>> - return cfqq_of_entity(cfqe);
>> - }
>> + if (cfqe && !cfqe->is_group_entity)
>> + return cfqe;
>> + else if (cfqe && cfqe->is_group_entity)
>> + cfqg = cfqg_of_entity(cfqe);
>> +
>> + st = choose_service_tree_forced(cfqg);
>> + } while (st);
>> +
>> return NULL;
>> }
>>
>> +
>> /*
>> * Get and set a new active qu
>>
>>
>

--
Regards
Gui Jianfeng