Re: [PATCH 1/2 V3] io-controller: Add a new interface "weight_device" for IO-Controller
From: Nauman Rafique
Date: Mon Mar 08 2010 - 14:40:33 EST
On Fri, Mar 5, 2010 at 6:13 AM, Vivek Goyal <vgoyal@xxxxxxxxxx> wrote:
> On Fri, Mar 05, 2010 at 10:25:58AM +0800, Gui Jianfeng wrote:
>> Currently, IO Controller makes use of blkio.weight to assign weight for
>> all devices. Here a new user interface "blkio.weight_device" is introduced to
>> assign different weights for different devices. blkio.weight becomes the
>> default value for devices which are not configured by "blkio.weight_device"
>>
>> You can use the following format to assigned specific weight for a given
>> device:
>>
>> major:minor represents device number.
>>
>> And you can remove a specific weight as following:
>>
>> V1->V2 changes:
>> - use user interface "weight_device" instead of "policy" suggested by Vivek
>> - rename some struct suggested by Vivek
>> - rebase to 2.6-block "for-linus" branch
>> - remove an useless list_empty check pointed out by Li Zefan
>> - some trivial typo fix
>>
>> V2->V3 changes:
>> - Move policy_*_node() functions up to get rid of forward declarations
>> - rename related functions by adding prefix "blkio_"
>>
>
> Thanks for the changes Gui. Looks good to me.
>
> Acked-by: Vivek Goyal <vgoyal@xxxxxxxxxx>
>
> Vivek
>
>> Signed-off-by: Gui Jianfeng <guijianfeng@xxxxxxxxxxxxxx>
>> ---
>> block/blk-cgroup.c | 236 ++++++++++++++++++++++++++++++++++++++++++++++++++-
>> block/blk-cgroup.h | 10 ++
>> block/cfq-iosched.c | 2 +-
>> 3 files changed, 246 insertions(+), 2 deletions(-)
>>
>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
>> index c85d74c..8825e49 100644
>> --- a/block/blk-cgroup.c
>> +++ b/block/blk-cgroup.c
>> @@ -16,6 +16,7 @@
>> #include <linux/module.h>
>> #include <linux/err.h>
>> #include "blk-cgroup.h"
>> +#include <linux/genhd.h>
>>
>> static DEFINE_SPINLOCK(blkio_list_lock);
>> static LIST_HEAD(blkio_list);
>> @@ -23,6 +24,32 @@ static LIST_HEAD(blkio_list);
>> struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
>> EXPORT_SYMBOL_GPL(blkio_root_cgroup);
>>
>> +static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
>> + struct blkio_policy_node *pn)
>> +{
>> + list_add(&pn->node, &blkcg->policy_list);
>> +}
>> +
>> +/* Must be called with blkcg->lock held */
>> +static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
>> +{
>> + list_del(&pn->node);
>> +}
>> +
>> +/* Must be called with blkcg->lock held */
>> +static struct blkio_policy_node *
>> +blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev)
>> +{
>> + struct blkio_policy_node *pn;
>> +
>> + list_for_each_entry(pn, &blkcg->policy_list, node) {
>> + if (pn->dev == dev)
>> + return pn;
>> + }
>> +
>> + return NULL;
>> +}
>> +
>> struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
>> {
>> return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
>> @@ -128,6 +155,7 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
>> struct blkio_group *blkg;
>> struct hlist_node *n;
>> struct blkio_policy_type *blkiop;
>> + struct blkio_policy_node *pn;
>>
>> if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
>> return -EINVAL;
>> @@ -136,7 +164,13 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
>> spin_lock(&blkio_list_lock);
>> spin_lock_irq(&blkcg->lock);
>> blkcg->weight = (unsigned int)val;
>> +
>> hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
>> + pn = blkio_policy_search_node(blkcg, blkg->dev);
>> +
>> + if (pn)
>> + continue;
>> +
>> list_for_each_entry(blkiop, &blkio_list, list)
>> blkiop->ops.blkio_update_group_weight_fn(blkg,
>> blkcg->weight);
>> @@ -178,15 +212,208 @@ SHOW_FUNCTION_PER_GROUP(dequeue);
>>
>> #ifdef CONFIG_DEBUG_BLK_CGROUP
>> void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
>> - unsigned long dequeue)
>> + unsigned long dequeue)
>> {
>> blkg->dequeue += dequeue;
>> }
>> EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_dequeue_stats);
>> #endif
>>
>> +static int blkio_check_dev_num(dev_t dev)
>> +{
>> + int part = 0;
>> + struct gendisk *disk;
>> +
>> + disk = get_gendisk(dev, &part);
>> + if (!disk || part)
>> + return -ENODEV;
>> +
>> + return 0;
>> +}
>> +
>> +static int blkio_policy_parse_and_set(char *buf,
>> + struct blkio_policy_node *newpn)
>> +{
>> + char *s[4], *p, *major_s = NULL, *minor_s = NULL;
>> + int ret;
>> + unsigned long major, minor, temp;
>> + int i = 0;
>> + dev_t dev;
>> +
>> + memset(s, 0, sizeof(s));
>> +
>> + while ((p = strsep(&buf, " ")) != NULL) {
>> + if (!*p)
>> + continue;
>> +
>> + s[i++] = p;
>> +
>> + /* Prevent from inputing too many things */
>> + if (i == 3)
>> + break;
>> + }
>> +
>> + if (i != 2)
>> + return -EINVAL;
>> +
>> + p = strsep(&s[0], ":");
>> + if (p != NULL)
>> + major_s = p;
>> + else
>> + return -EINVAL;
>> +
>> + minor_s = s[0];
>> + if (!minor_s)
>> + return -EINVAL;
>> +
>> + ret = strict_strtoul(major_s, 10, &major);
>> + if (ret)
>> + return -EINVAL;
>> +
>> + ret = strict_strtoul(minor_s, 10, &minor);
>> + if (ret)
>> + return -EINVAL;
>> +
>> + dev = MKDEV(major, minor);
I am not quite sure if exposing a mojor,minor number is the best
interface that can be exposed to user space. How about actual disk
names like sda, sdb, .. etc? The only problem I see there is that it
seems tricky to get to these disk names from within the block layer.
"struct request_queue" has a pointer to backing_dev which has a device
from which we can get major,minor. But in order to get to disk name,
we would have to call get_gendisk which can hold a semaphore. Is this
the reason for us going with major,minor as a user interface to
specify a disk? I bet there are good reasons for us not keeping a
pointer to "struct gendisk" from "struct request_queue". If we could
keep that pointer, our user interface could be very easily modified to
be the disk name like sda, sdb, etc.
>> +
>> + ret = blkio_check_dev_num(dev);
>> + if (ret)
>> + return ret;
>> +
>> + newpn->dev = dev;
>> +
>> + if (s[1] == NULL)
>> + return -EINVAL;
>> +
>> + ret = strict_strtoul(s[1], 10, &temp);
>> + if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
>> + temp > BLKIO_WEIGHT_MAX)
>> + return -EINVAL;
>> +
>> + newpn->weight = temp;
>> +
>> + return 0;
>> +}
>> +
>> +unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
>> + dev_t dev)
>> +{
>> + struct blkio_policy_node *pn;
>> +
>> + pn = blkio_policy_search_node(blkcg, dev);
>> + if (pn)
>> + return pn->weight;
>> + else
>> + return blkcg->weight;
>> +}
>> +EXPORT_SYMBOL_GPL(blkcg_get_weight);
>> +
>> +static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
>> + const char *buffer)
>> +{
>> + int ret = 0;
>> + char *buf;
>> + struct blkio_policy_node *newpn, *pn;
>> + struct blkio_cgroup *blkcg;
>> + struct blkio_group *blkg;
>> + int keep_newpn = 0;
>> + struct hlist_node *n;
>> + struct blkio_policy_type *blkiop;
>> +
>> + buf = kstrdup(buffer, GFP_KERNEL);
>> + if (!buf)
>> + return -ENOMEM;
>> +
>> + newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
>> + if (!newpn) {
>> + ret = -ENOMEM;
>> + goto free_buf;
>> + }
>> +
>> + ret = blkio_policy_parse_and_set(buf, newpn);
>> + if (ret)
>> + goto free_newpn;
>> +
>> + blkcg = cgroup_to_blkio_cgroup(cgrp);
>> +
>> + spin_lock_irq(&blkcg->lock);
>> +
>> + pn = blkio_policy_search_node(blkcg, newpn->dev);
>> + if (!pn) {
>> + if (newpn->weight != 0) {
>> + blkio_policy_insert_node(blkcg, newpn);
>> + keep_newpn = 1;
>> + }
>> + spin_unlock_irq(&blkcg->lock);
>> + goto update_io_group;
>> + }
>> +
>> + if (newpn->weight == 0) {
>> + /* weight == 0 means deleteing a specific weight */
>> + blkio_policy_delete_node(pn);
>> + spin_unlock_irq(&blkcg->lock);
>> + goto update_io_group;
>> + }
>> + spin_unlock_irq(&blkcg->lock);
>> +
>> + pn->weight = newpn->weight;
>> +
>> +update_io_group:
>> + /* update weight for each cfqg */
>> + spin_lock(&blkio_list_lock);
>> + spin_lock_irq(&blkcg->lock);
>> +
>> + hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
>> + if (newpn->dev == blkg->dev) {
>> + list_for_each_entry(blkiop, &blkio_list, list)
>> + blkiop->ops.blkio_update_group_weight_fn(blkg,
>> + newpn->weight ?
>> + newpn->weight :
>> + blkcg->weight);
>> + }
>> + }
>> +
>> + spin_unlock_irq(&blkcg->lock);
>> + spin_unlock(&blkio_list_lock);
>> +
>> +free_newpn:
>> + if (!keep_newpn)
>> + kfree(newpn);
>> +free_buf:
>> + kfree(buf);
>> + return ret;
>> +}
>> +
>> +static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft,
>> + struct seq_file *m)
>> +{
>> + struct blkio_cgroup *blkcg;
>> + struct blkio_policy_node *pn;
>> +
>> + seq_printf(m, "dev\tweight\n");
>> +
>> + blkcg = cgroup_to_blkio_cgroup(cgrp);
>> + if (list_empty(&blkcg->policy_list))
>> + goto out;
>> +
>> + spin_lock_irq(&blkcg->lock);
>> + list_for_each_entry(pn, &blkcg->policy_list, node) {
>> + seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
>> + MINOR(pn->dev), pn->weight);
>> + }
>> + spin_unlock_irq(&blkcg->lock);
>> +out:
>> + return 0;
>> +}
>> +
>> struct cftype blkio_files[] = {
>> {
>> + .name = "weight_device",
>> + .read_seq_string = blkiocg_weight_device_read,
>> + .write_string = blkiocg_weight_device_write,
>> + .max_write_len = 256,
>> + },
>> + {
>> .name = "weight",
>> .read_u64 = blkiocg_weight_read,
>> .write_u64 = blkiocg_weight_write,
>> @@ -220,6 +447,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
>> struct blkio_group *blkg;
>> void *key;
>> struct blkio_policy_type *blkiop;
>> + struct blkio_policy_node *pn, *pntmp;
>>
>> rcu_read_lock();
>> remove_entry:
>> @@ -250,7 +478,12 @@ remove_entry:
>> blkiop->ops.blkio_unlink_group_fn(key, blkg);
>> spin_unlock(&blkio_list_lock);
>> goto remove_entry;
>> +
>> done:
>> + list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
>> + blkio_policy_delete_node(pn);
>> + kfree(pn);
>> + }
>> free_css_id(&blkio_subsys, &blkcg->css);
>> rcu_read_unlock();
>> kfree(blkcg);
>> @@ -280,6 +513,7 @@ done:
>> spin_lock_init(&blkcg->lock);
>> INIT_HLIST_HEAD(&blkcg->blkg_list);
>>
>> + INIT_LIST_HEAD(&blkcg->policy_list);
>> return &blkcg->css;
>> }
>>
>> diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
>> index 84bf745..c8144a2 100644
>> --- a/block/blk-cgroup.h
>> +++ b/block/blk-cgroup.h
>> @@ -22,6 +22,7 @@ struct blkio_cgroup {
>> unsigned int weight;
>> spinlock_t lock;
>> struct hlist_head blkg_list;
>> + struct list_head policy_list; /* list of blkio_policy_node */
>> };
>>
>> struct blkio_group {
>> @@ -43,6 +44,15 @@ struct blkio_group {
>> unsigned long sectors;
>> };
>>
>> +struct blkio_policy_node {
>> + struct list_head node;
>> + dev_t dev;
>> + unsigned int weight;
>> +};
>> +
>> +extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
>> + dev_t dev);
>> +
>> typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
>> typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg,
>> unsigned int weight);
>> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
>> index dee9d93..6945e9b 100644
>> --- a/block/cfq-iosched.c
>> +++ b/block/cfq-iosched.c
>> @@ -954,7 +954,6 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>> if (!cfqg)
>> goto done;
>>
>> - cfqg->weight = blkcg->weight;
>> for_each_cfqg_st(cfqg, i, j, st)
>> *st = CFQ_RB_ROOT;
>> RB_CLEAR_NODE(&cfqg->rb_node);
>> @@ -971,6 +970,7 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>> sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>> blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
>> MKDEV(major, minor));
>> + cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
>>
>> /* Add group on cfqd list */
>> hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
>> --
>> 1.5.4.rc3
>>
>>
>>
>>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/