Re: [PATCH 1/2 V3] io-controller: Add a new interface "weight_device" for IO-Controller

From: Nauman Rafique
Date: Mon Mar 08 2010 - 14:40:33 EST


On Fri, Mar 5, 2010 at 6:13 AM, Vivek Goyal <vgoyal@xxxxxxxxxx> wrote:
> On Fri, Mar 05, 2010 at 10:25:58AM +0800, Gui Jianfeng wrote:
>> Currently, IO Controller makes use of blkio.weight to assign weight for
>> all devices. Here a new user interface "blkio.weight_device" is introduced to
>> assign different weights for different devices. blkio.weight becomes the
>> default value for devices which are not configured by "blkio.weight_device"
>>
>> You can use the following format to assigned specific weight for a given
>> device:
>>
>> major:minor represents device number.
>>
>> And you can remove a specific weight as following:
>>
>> V1->V2 changes:
>> - use user interface "weight_device" instead of "policy" suggested by Vivek
>> - rename some struct suggested by Vivek
>> - rebase to 2.6-block "for-linus" branch
>> - remove an useless list_empty check pointed out by Li Zefan
>> - some trivial typo fix
>>
>> V2->V3 changes:
>> - Move policy_*_node() functions up to get rid of forward declarations
>> - rename related functions by adding prefix "blkio_"
>>
>
> Thanks for the changes Gui. Looks good to me.
>
> Acked-by: Vivek Goyal <vgoyal@xxxxxxxxxx>
>
> Vivek
>
>> Signed-off-by: Gui Jianfeng <guijianfeng@xxxxxxxxxxxxxx>
>> ---
>>  block/blk-cgroup.c  |  236 ++++++++++++++++++++++++++++++++++++++++++++++++++-
>>  block/blk-cgroup.h  |   10 ++
>>  block/cfq-iosched.c |    2 +-
>>  3 files changed, 246 insertions(+), 2 deletions(-)
>>
>> diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
>> index c85d74c..8825e49 100644
>> --- a/block/blk-cgroup.c
>> +++ b/block/blk-cgroup.c
>> @@ -16,6 +16,7 @@
>>  #include <linux/module.h>
>>  #include <linux/err.h>
>>  #include "blk-cgroup.h"
>> +#include <linux/genhd.h>
>>
>>  static DEFINE_SPINLOCK(blkio_list_lock);
>>  static LIST_HEAD(blkio_list);
>> @@ -23,6 +24,32 @@ static LIST_HEAD(blkio_list);
>>  struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT };
>>  EXPORT_SYMBOL_GPL(blkio_root_cgroup);
>>
>> +static inline void blkio_policy_insert_node(struct blkio_cgroup *blkcg,
>> +                                         struct blkio_policy_node *pn)
>> +{
>> +     list_add(&pn->node, &blkcg->policy_list);
>> +}
>> +
>> +/* Must be called with blkcg->lock held */
>> +static inline void blkio_policy_delete_node(struct blkio_policy_node *pn)
>> +{
>> +     list_del(&pn->node);
>> +}
>> +
>> +/* Must be called with blkcg->lock held */
>> +static struct blkio_policy_node *
>> +blkio_policy_search_node(const struct blkio_cgroup *blkcg, dev_t dev)
>> +{
>> +     struct blkio_policy_node *pn;
>> +
>> +     list_for_each_entry(pn, &blkcg->policy_list, node) {
>> +             if (pn->dev == dev)
>> +                     return pn;
>> +     }
>> +
>> +     return NULL;
>> +}
>> +
>>  struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup)
>>  {
>>       return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id),
>> @@ -128,6 +155,7 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
>>       struct blkio_group *blkg;
>>       struct hlist_node *n;
>>       struct blkio_policy_type *blkiop;
>> +     struct blkio_policy_node *pn;
>>
>>       if (val < BLKIO_WEIGHT_MIN || val > BLKIO_WEIGHT_MAX)
>>               return -EINVAL;
>> @@ -136,7 +164,13 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
>>       spin_lock(&blkio_list_lock);
>>       spin_lock_irq(&blkcg->lock);
>>       blkcg->weight = (unsigned int)val;
>> +
>>       hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
>> +             pn = blkio_policy_search_node(blkcg, blkg->dev);
>> +
>> +             if (pn)
>> +                     continue;
>> +
>>               list_for_each_entry(blkiop, &blkio_list, list)
>>                       blkiop->ops.blkio_update_group_weight_fn(blkg,
>>                                       blkcg->weight);
>> @@ -178,15 +212,208 @@ SHOW_FUNCTION_PER_GROUP(dequeue);
>>
>>  #ifdef CONFIG_DEBUG_BLK_CGROUP
>>  void blkiocg_update_blkio_group_dequeue_stats(struct blkio_group *blkg,
>> -                     unsigned long dequeue)
>> +                                           unsigned long dequeue)
>>  {
>>       blkg->dequeue += dequeue;
>>  }
>>  EXPORT_SYMBOL_GPL(blkiocg_update_blkio_group_dequeue_stats);
>>  #endif
>>
>> +static int blkio_check_dev_num(dev_t dev)
>> +{
>> +     int part = 0;
>> +     struct gendisk *disk;
>> +
>> +     disk = get_gendisk(dev, &part);
>> +     if (!disk || part)
>> +             return -ENODEV;
>> +
>> +     return 0;
>> +}
>> +
>> +static int blkio_policy_parse_and_set(char *buf,
>> +                                   struct blkio_policy_node *newpn)
>> +{
>> +     char *s[4], *p, *major_s = NULL, *minor_s = NULL;
>> +     int ret;
>> +     unsigned long major, minor, temp;
>> +     int i = 0;
>> +     dev_t dev;
>> +
>> +     memset(s, 0, sizeof(s));
>> +
>> +     while ((p = strsep(&buf, " ")) != NULL) {
>> +             if (!*p)
>> +                     continue;
>> +
>> +             s[i++] = p;
>> +
>> +             /* Prevent from inputing too many things */
>> +             if (i == 3)
>> +                     break;
>> +     }
>> +
>> +     if (i != 2)
>> +             return -EINVAL;
>> +
>> +     p = strsep(&s[0], ":");
>> +     if (p != NULL)
>> +             major_s = p;
>> +     else
>> +             return -EINVAL;
>> +
>> +     minor_s = s[0];
>> +     if (!minor_s)
>> +             return -EINVAL;
>> +
>> +     ret = strict_strtoul(major_s, 10, &major);
>> +     if (ret)
>> +             return -EINVAL;
>> +
>> +     ret = strict_strtoul(minor_s, 10, &minor);
>> +     if (ret)
>> +             return -EINVAL;
>> +
>> +     dev = MKDEV(major, minor);

I am not quite sure if exposing a mojor,minor number is the best
interface that can be exposed to user space. How about actual disk
names like sda, sdb, .. etc? The only problem I see there is that it
seems tricky to get to these disk names from within the block layer.
"struct request_queue" has a pointer to backing_dev which has a device
from which we can get major,minor. But in order to get to disk name,
we would have to call get_gendisk which can hold a semaphore. Is this
the reason for us going with major,minor as a user interface to
specify a disk? I bet there are good reasons for us not keeping a
pointer to "struct gendisk" from "struct request_queue". If we could
keep that pointer, our user interface could be very easily modified to
be the disk name like sda, sdb, etc.

>> +
>> +     ret = blkio_check_dev_num(dev);
>> +     if (ret)
>> +             return ret;
>> +
>> +     newpn->dev = dev;
>> +
>> +     if (s[1] == NULL)
>> +             return -EINVAL;
>> +
>> +     ret = strict_strtoul(s[1], 10, &temp);
>> +     if (ret || (temp < BLKIO_WEIGHT_MIN && temp > 0) ||
>> +         temp > BLKIO_WEIGHT_MAX)
>> +             return -EINVAL;
>> +
>> +     newpn->weight =  temp;
>> +
>> +     return 0;
>> +}
>> +
>> +unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
>> +                           dev_t dev)
>> +{
>> +     struct blkio_policy_node *pn;
>> +
>> +     pn = blkio_policy_search_node(blkcg, dev);
>> +     if (pn)
>> +             return pn->weight;
>> +     else
>> +             return blkcg->weight;
>> +}
>> +EXPORT_SYMBOL_GPL(blkcg_get_weight);
>> +
>> +static int blkiocg_weight_device_write(struct cgroup *cgrp, struct cftype *cft,
>> +                                    const char *buffer)
>> +{
>> +     int ret = 0;
>> +     char *buf;
>> +     struct blkio_policy_node *newpn, *pn;
>> +     struct blkio_cgroup *blkcg;
>> +     struct blkio_group *blkg;
>> +     int keep_newpn = 0;
>> +     struct hlist_node *n;
>> +     struct blkio_policy_type *blkiop;
>> +
>> +     buf = kstrdup(buffer, GFP_KERNEL);
>> +     if (!buf)
>> +             return -ENOMEM;
>> +
>> +     newpn = kzalloc(sizeof(*newpn), GFP_KERNEL);
>> +     if (!newpn) {
>> +             ret = -ENOMEM;
>> +             goto free_buf;
>> +     }
>> +
>> +     ret = blkio_policy_parse_and_set(buf, newpn);
>> +     if (ret)
>> +             goto free_newpn;
>> +
>> +     blkcg = cgroup_to_blkio_cgroup(cgrp);
>> +
>> +     spin_lock_irq(&blkcg->lock);
>> +
>> +     pn = blkio_policy_search_node(blkcg, newpn->dev);
>> +     if (!pn) {
>> +             if (newpn->weight != 0) {
>> +                     blkio_policy_insert_node(blkcg, newpn);
>> +                     keep_newpn = 1;
>> +             }
>> +             spin_unlock_irq(&blkcg->lock);
>> +             goto update_io_group;
>> +     }
>> +
>> +     if (newpn->weight == 0) {
>> +             /* weight == 0 means deleteing a specific weight */
>> +             blkio_policy_delete_node(pn);
>> +             spin_unlock_irq(&blkcg->lock);
>> +             goto update_io_group;
>> +     }
>> +     spin_unlock_irq(&blkcg->lock);
>> +
>> +     pn->weight = newpn->weight;
>> +
>> +update_io_group:
>> +     /* update weight for each cfqg */
>> +     spin_lock(&blkio_list_lock);
>> +     spin_lock_irq(&blkcg->lock);
>> +
>> +     hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
>> +             if (newpn->dev == blkg->dev) {
>> +                     list_for_each_entry(blkiop, &blkio_list, list)
>> +                             blkiop->ops.blkio_update_group_weight_fn(blkg,
>> +                                             newpn->weight ?
>> +                                             newpn->weight :
>> +                                             blkcg->weight);
>> +             }
>> +     }
>> +
>> +     spin_unlock_irq(&blkcg->lock);
>> +     spin_unlock(&blkio_list_lock);
>> +
>> +free_newpn:
>> +     if (!keep_newpn)
>> +             kfree(newpn);
>> +free_buf:
>> +     kfree(buf);
>> +     return ret;
>> +}
>> +
>> +static int blkiocg_weight_device_read(struct cgroup *cgrp, struct cftype *cft,
>> +                                   struct seq_file *m)
>> +{
>> +     struct blkio_cgroup *blkcg;
>> +     struct blkio_policy_node *pn;
>> +
>> +     seq_printf(m, "dev\tweight\n");
>> +
>> +     blkcg = cgroup_to_blkio_cgroup(cgrp);
>> +     if (list_empty(&blkcg->policy_list))
>> +             goto out;
>> +
>> +     spin_lock_irq(&blkcg->lock);
>> +     list_for_each_entry(pn, &blkcg->policy_list, node) {
>> +             seq_printf(m, "%u:%u\t%u\n", MAJOR(pn->dev),
>> +                        MINOR(pn->dev), pn->weight);
>> +     }
>> +     spin_unlock_irq(&blkcg->lock);
>> +out:
>> +     return 0;
>> +}
>> +
>>  struct cftype blkio_files[] = {
>>       {
>> +             .name = "weight_device",
>> +             .read_seq_string = blkiocg_weight_device_read,
>> +             .write_string = blkiocg_weight_device_write,
>> +             .max_write_len = 256,
>> +     },
>> +     {
>>               .name = "weight",
>>               .read_u64 = blkiocg_weight_read,
>>               .write_u64 = blkiocg_weight_write,
>> @@ -220,6 +447,7 @@ static void blkiocg_destroy(struct cgroup_subsys *subsys, struct cgroup *cgroup)
>>       struct blkio_group *blkg;
>>       void *key;
>>       struct blkio_policy_type *blkiop;
>> +     struct blkio_policy_node *pn, *pntmp;
>>
>>       rcu_read_lock();
>>  remove_entry:
>> @@ -250,7 +478,12 @@ remove_entry:
>>               blkiop->ops.blkio_unlink_group_fn(key, blkg);
>>       spin_unlock(&blkio_list_lock);
>>       goto remove_entry;
>> +
>>  done:
>> +     list_for_each_entry_safe(pn, pntmp, &blkcg->policy_list, node) {
>> +             blkio_policy_delete_node(pn);
>> +             kfree(pn);
>> +     }
>>       free_css_id(&blkio_subsys, &blkcg->css);
>>       rcu_read_unlock();
>>       kfree(blkcg);
>> @@ -280,6 +513,7 @@ done:
>>       spin_lock_init(&blkcg->lock);
>>       INIT_HLIST_HEAD(&blkcg->blkg_list);
>>
>> +     INIT_LIST_HEAD(&blkcg->policy_list);
>>       return &blkcg->css;
>>  }
>>
>> diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
>> index 84bf745..c8144a2 100644
>> --- a/block/blk-cgroup.h
>> +++ b/block/blk-cgroup.h
>> @@ -22,6 +22,7 @@ struct blkio_cgroup {
>>       unsigned int weight;
>>       spinlock_t lock;
>>       struct hlist_head blkg_list;
>> +     struct list_head policy_list; /* list of blkio_policy_node */
>>  };
>>
>>  struct blkio_group {
>> @@ -43,6 +44,15 @@ struct blkio_group {
>>       unsigned long sectors;
>>  };
>>
>> +struct blkio_policy_node {
>> +     struct list_head node;
>> +     dev_t dev;
>> +     unsigned int weight;
>> +};
>> +
>> +extern unsigned int blkcg_get_weight(struct blkio_cgroup *blkcg,
>> +                                  dev_t dev);
>> +
>>  typedef void (blkio_unlink_group_fn) (void *key, struct blkio_group *blkg);
>>  typedef void (blkio_update_group_weight_fn) (struct blkio_group *blkg,
>>                                               unsigned int weight);
>> diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
>> index dee9d93..6945e9b 100644
>> --- a/block/cfq-iosched.c
>> +++ b/block/cfq-iosched.c
>> @@ -954,7 +954,6 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>>       if (!cfqg)
>>               goto done;
>>
>> -     cfqg->weight = blkcg->weight;
>>       for_each_cfqg_st(cfqg, i, j, st)
>>               *st = CFQ_RB_ROOT;
>>       RB_CLEAR_NODE(&cfqg->rb_node);
>> @@ -971,6 +970,7 @@ cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
>>       sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
>>       blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
>>                                       MKDEV(major, minor));
>> +     cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);
>>
>>       /* Add group on cfqd list */
>>       hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
>> --
>> 1.5.4.rc3
>>
>>
>>
>>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/