Re: [PATCH 25/23] io-controller: fix queue vs group fairness

From: Fabio Checconi
Date: Tue Sep 08 2009 - 19:10:36 EST


Hi,

> From: Vivek Goyal <vgoyal@xxxxxxxxxx>
> Date: Tue, Sep 08, 2009 06:28:27PM -0400
>
>
> o I found an issue during test and that is if there is a mix of queue and group
...
> So we need to keep track of process io queue's vdisktime, even it after got
> deleted from io scheduler's service tree and use that same vdisktime if that
> queue gets backlogged again. But trusting a ioq's vdisktime is bad because
> it can lead to issues if a service tree min_vtime wrap around takes place
> between two requests of the queue. (Agreed that it can be not that easy to
> hit but it is possible).
>
> Hence, keep a cache of io queues serviced recently and when a queue gets
> backlogged, if it is found in cache, use that vdisktime otherwise assign
> a new vdisktime. This cache of io queues (idle tree), is basically the idea
> implemented by BFQ guys. I had gotten rid of idle trees in V9 and now I am
> bringing it back. (Now I understand it better. :-)).
>
> There is one good side affect of keeping the cache of recently service io
> queues. Now CFQ can differentiate between streaming readers and new processes
> doing IO. Now for a new queue (which is not in the cache), we can assign a
> lower vdisktime and for a streaming reader, we assign vdisktime based on disk
> time used. This way small file readers or the processes doing small amount
> of IO will have reduced latencies at the cost of little reduced throughput of
> streaming readers.
>

just a little note: this patch seems to introduce a special case for
vdisktime = 0, assigning it the meaning of "bad timestamp," but the virtual
time space wraps, so 0 is a perfectly legal value, which can be reached by
service. I have no idea if it can produce visible effects, but it doesn't
seem to be correct.


> Signed-off-by: Vivek Goyal <vgoyal@xxxxxxxxxx>
> ---
> block/cfq-iosched.c | 2
> block/elevator-fq.c | 252 ++++++++++++++++++++++++++++++++++++++++++++++++----
> block/elevator-fq.h | 9 +
> 3 files changed, 246 insertions(+), 17 deletions(-)
>
> Index: linux16/block/elevator-fq.c
> ===================================================================
> --- linux16.orig/block/elevator-fq.c 2009-09-08 15:44:21.000000000 -0400
> +++ linux16/block/elevator-fq.c 2009-09-08 15:47:45.000000000 -0400
> @@ -52,6 +52,8 @@ static struct kmem_cache *elv_ioq_pool;
> #define elv_log_entity(entity, fmt, args...)
> #endif
>
> +static void check_idle_tree_release(struct io_service_tree *st);
> +
> static inline struct io_queue *ioq_of(struct io_entity *entity)
> {
> if (entity->my_sd == NULL)
> @@ -109,6 +111,11 @@ elv_prio_to_slice(struct elv_fq_data *ef
> return elv_weight_slice(efqd, elv_ioq_sync(ioq), ioq->entity.weight);
> }
>
> +static inline int vdisktime_gt(u64 a, u64 b)
> +{
> + return (s64)(a - b) > 0;
> +}
> +
> static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
> {
> s64 delta = (s64)(vdisktime - min_vdisktime);
> @@ -145,6 +152,7 @@ static void update_min_vdisktime(struct
> }
>
> st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
> + check_idle_tree_release(st);
> }
>
> static inline struct io_entity *parent_entity(struct io_entity *entity)
> @@ -411,27 +419,46 @@ static void place_entity(struct io_servi
> struct rb_node *parent;
> struct io_entity *entry;
> int nr_active = st->nr_active - 1;
> + struct io_queue *ioq = ioq_of(entity);
> + int sync = 1;
> +
> + if (ioq)
> + sync = elv_ioq_sync(ioq);
> +
> + if (add_front || !nr_active) {
> + vdisktime = st->min_vdisktime;
> + goto done;
> + }
> +
> + if (sync && entity->vdisktime
> + && vdisktime_gt(entity->vdisktime, st->min_vdisktime)) {
> + /* vdisktime still in future. Use old vdisktime */
> + vdisktime = entity->vdisktime;
> + goto done;
> + }
>
> /*
> - * Currently put entity at the end of last entity. This probably will
> - * require adjustments as we move along
> + * Effectively a new queue. Assign sync queue a lower vdisktime so
> + * we can achieve better latencies for small file readers. For async
> + * queues, put them at the end of the existing queue.
> + * Group entities are always considered sync.
> */
> - if (io_entity_class_idle(entity)) {
> - vdisktime = elv_delta_fair(ELV_IDLE_DELAY, entity);
> - parent = rb_last(&st->active);
> - if (parent) {
> - entry = rb_entry(parent, struct io_entity, rb_node);
> - vdisktime += entry->vdisktime;
> - }
> - } else if (!add_front && nr_active) {
> - parent = rb_last(&st->active);
> - if (parent) {
> - entry = rb_entry(parent, struct io_entity, rb_node);
> - vdisktime = entry->vdisktime;
> - }
> - } else
> + if (sync) {
> vdisktime = st->min_vdisktime;
> + goto done;
> + }
>
> + /*
> + * Put entity at the end of the tree. Effectively async queues use
> + * this path.
> + */
> + parent = rb_last(&st->active);
> + if (parent) {
> + entry = rb_entry(parent, struct io_entity, rb_node);
> + vdisktime = entry->vdisktime;
> + } else
> + vdisktime = st->min_vdisktime;
> +done:
> entity->vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
> elv_log_entity(entity, "place_entity: vdisktime=%llu"
> " min_vdisktime=%llu", entity->vdisktime,
> @@ -447,6 +474,122 @@ static inline void io_entity_update_prio
> */
> init_io_entity_service_tree(entity, parent_entity(entity));
> entity->ioprio_changed = 0;
> +
> + /*
> + * Assign this entity a fresh vdisktime instead of using
> + * previous one as prio class will lead to service tree
> + * change and this vdisktime will not be valid on new
> + * service tree.
> + *
> + * TODO: Handle the case of only prio change.
> + */
> + entity->vdisktime = 0;
> + }
> +}
> +
> +static void
> +__dequeue_io_entity_idle(struct io_service_tree *st, struct io_entity *entity)
> +{
> + if (st->rb_leftmost_idle == &entity->rb_node) {
> + struct rb_node *next_node;
> +
> + next_node = rb_next(&entity->rb_node);
> + st->rb_leftmost_idle = next_node;
> + }
> +
> + rb_erase(&entity->rb_node, &st->idle);
> + RB_CLEAR_NODE(&entity->rb_node);
> +}
> +
> +static void dequeue_io_entity_idle(struct io_entity *entity)
> +{
> + struct io_queue *ioq = ioq_of(entity);
> +
> + __dequeue_io_entity_idle(entity->st, entity);
> + entity->on_idle_st = 0;
> + if (ioq)
> + elv_put_ioq(ioq);
> +}
> +
> +static void
> +__enqueue_io_entity_idle(struct io_service_tree *st, struct io_entity *entity)
> +{
> + struct rb_node **node = &st->idle.rb_node;
> + struct rb_node *parent = NULL;
> + struct io_entity *entry;
> + int leftmost = 1;
> +
> + while (*node != NULL) {
> + parent = *node;
> + entry = rb_entry(parent, struct io_entity, rb_node);
> +
> + if (vdisktime_gt(entry->vdisktime, entity->vdisktime))
> + node = &parent->rb_left;
> + else {
> + node = &parent->rb_right;
> + leftmost = 0;
> + }
> + }
> +
> + /*
> + * Maintain a cache of leftmost tree entries (it is frequently
> + * used)
> + */
> + if (leftmost)
> + st->rb_leftmost_idle = &entity->rb_node;
> +
> + rb_link_node(&entity->rb_node, parent, node);
> + rb_insert_color(&entity->rb_node, &st->idle);
> +}
> +
> +static void enqueue_io_entity_idle(struct io_entity *entity)
> +{
> + struct io_queue *ioq = ioq_of(entity);
> + struct io_group *parent_iog;
> +
> + /*
> + * Don't put an entity on idle tree if it has been marked for deletion.
> + * We are not expecting more io from this entity. No need to cache it
> + */
> +
> + if (entity->exiting)
> + return;
> +
> + /*
> + * If parent group is exiting, don't put on idle tree. May be task got
> + * moved to a different cgroup and original cgroup got deleted
> + */
> + parent_iog = iog_of(parent_entity(entity));
> + if (parent_iog->entity.exiting)
> + return;
> +
> + if (ioq)
> + elv_get_ioq(ioq);
> + __enqueue_io_entity_idle(entity->st, entity);
> + entity->on_idle_st = 1;
> +}
> +
> +static void check_idle_tree_release(struct io_service_tree *st)
> +{
> + struct io_entity *leftmost;
> +
> + if (!st->rb_leftmost_idle)
> + return;
> +
> + leftmost = rb_entry(st->rb_leftmost_idle, struct io_entity, rb_node);
> +
> + if (vdisktime_gt(st->min_vdisktime, leftmost->vdisktime))
> + dequeue_io_entity_idle(leftmost);
> +}
> +
> +static void flush_idle_tree(struct io_service_tree *st)
> +{
> + struct io_entity *entity;
> +
> + while(st->rb_leftmost_idle) {
> + entity = rb_entry(st->rb_leftmost_idle, struct io_entity,
> + rb_node);
> + dequeue_io_entity_idle(entity);
> }
> }
>
> @@ -483,6 +626,9 @@ static void dequeue_io_entity(struct io_
> st->nr_active--;
> sd->nr_active--;
> debug_update_stats_dequeue(entity);
> +
> + if (vdisktime_gt(entity->vdisktime, st->min_vdisktime))
> + enqueue_io_entity_idle(entity);
> }
>
> static void
> @@ -524,6 +670,16 @@ static void enqueue_io_entity(struct io_
> struct io_service_tree *st;
> struct io_sched_data *sd = io_entity_sched_data(entity);
>
> + if (entity->on_idle_st)
> + dequeue_io_entity_idle(entity);
> + else
> + /*
> + * This entity was not in idle tree cache. Zero out vdisktime
> + * so that we don't rely on old vdisktime instead assign a
> + * fresh one.
> + */
> + entity->vdisktime = 0;
> +
> io_entity_update_prio(entity);
> st = entity->st;
> st->nr_active++;
> @@ -574,6 +730,8 @@ static void requeue_io_entity(struct io_
> struct io_service_tree *st = entity->st;
> struct io_entity *next_entity;
>
> + entity->vdisktime = 0;
> +
> if (add_front) {
> next_entity = __lookup_next_io_entity(st);
>
> @@ -1937,11 +2095,18 @@ static void io_free_root_group(struct el
> {
> struct io_group *iog = e->efqd->root_group;
> struct io_cgroup *iocg = &io_root_cgroup;
> + struct io_service_tree *st;
> + int i;
>
> spin_lock_irq(&iocg->lock);
> hlist_del_rcu(&iog->group_node);
> spin_unlock_irq(&iocg->lock);
>
> + for (i = 0; i < IO_IOPRIO_CLASSES; i++) {
> + st = iog->sched_data.service_tree + i;
> + flush_idle_tree(st);
> + }
> +
> put_io_group_queues(e, iog);
> elv_put_iog(iog);
> }
> @@ -2039,9 +2204,29 @@ EXPORT_SYMBOL(elv_put_iog);
> */
> static void __io_destroy_group(struct elv_fq_data *efqd, struct io_group *iog)
> {
> + struct io_service_tree *st;
> + int i;
> + struct io_entity *entity = &iog->entity;
> +
> + /*
> + * Mark io group for deletion so that no new entry goes in
> + * idle tree. Any active queue which is removed from active
> + * tree will not be put in to idle tree.
> + */
> + entity->exiting = 1;
> +
> + /* We flush idle tree now, and don't put things in there any more. */
> + for (i = 0; i < IO_IOPRIO_CLASSES; i++) {
> + st = iog->sched_data.service_tree + i;
> + flush_idle_tree(st);
> + }
> +
> hlist_del(&iog->elv_data_node);
> put_io_group_queues(efqd->eq, iog);
>
> + if (entity->on_idle_st)
> + dequeue_io_entity_idle(entity);
> +
> /*
> * Put the reference taken at the time of creation so that when all
> * queues are gone, group can be destroyed.
> @@ -2374,7 +2559,13 @@ static struct io_group *io_alloc_root_gr
> static void io_free_root_group(struct elevator_queue *e)
> {
> struct io_group *iog = e->efqd->root_group;
> + struct io_service_tree *st;
> + int i;
>
> + for (i = 0; i < IO_IOPRIO_CLASSES; i++) {
> + st = iog->sched_data.service_tree + i;
> + flush_idle_tree(st);
> + }
> put_io_group_queues(e, iog);
> kfree(iog);
> }
> @@ -3257,6 +3448,35 @@ done:
> elv_schedule_dispatch(q);
> }
>
> +/*
> + * The process associted with ioq (in case of cfq), is going away. Mark it
> + * for deletion.
> + */
> +void elv_exit_ioq(struct io_queue *ioq)
> +{
> + struct io_entity *entity = &ioq->entity;
> +
> + /*
> + * Async ioq's belong to io group and are cleaned up once group is
> + * being deleted. Not need to do any cleanup here even if cfq has
> + * dropped the reference to the queue
> + */
> + if (!elv_ioq_sync(ioq))
> + return;
> +
> + /*
> + * This queue is still under service. Just mark it so that once all
> + * the IO from queue is done, it is not put back in idle tree.
> + */
> + if (entity->on_st) {
> + entity->exiting = 1;
> + return;
> + } else if(entity->on_idle_st) {
> + /* Remove ioq from idle tree */
> + dequeue_io_entity_idle(entity);
> + }
> +}
> +EXPORT_SYMBOL(elv_exit_ioq);
> static void elv_slab_kill(void)
> {
> /*
> Index: linux16/block/cfq-iosched.c
> ===================================================================
> --- linux16.orig/block/cfq-iosched.c 2009-09-08 15:43:36.000000000 -0400
> +++ linux16/block/cfq-iosched.c 2009-09-08 15:47:45.000000000 -0400
> @@ -1138,6 +1138,7 @@ static void cfq_exit_cfqq(struct cfq_dat
> elv_schedule_dispatch(cfqd->queue);
> }
>
> + elv_exit_ioq(cfqq->ioq);
> cfq_put_queue(cfqq);
> }
>
> @@ -1373,6 +1374,7 @@ static void changed_cgroup(struct io_con
> */
> if (iog != __iog) {
> cic_set_cfqq(cic, NULL, 1);
> + elv_exit_ioq(sync_cfqq->ioq);
> cfq_put_queue(sync_cfqq);
> }
> }
> Index: linux16/block/elevator-fq.h
> ===================================================================
> --- linux16.orig/block/elevator-fq.h 2009-09-08 15:43:36.000000000 -0400
> +++ linux16/block/elevator-fq.h 2009-09-08 15:47:45.000000000 -0400
> @@ -33,6 +33,10 @@ struct io_service_tree {
> u64 min_vdisktime;
> struct rb_node *rb_leftmost;
> unsigned int nr_active;
> +
> + /* A cache of io entities which were served and expired */
> + struct rb_root idle;
> + struct rb_node *rb_leftmost_idle;
> };
>
> struct io_sched_data {
> @@ -44,9 +48,12 @@ struct io_sched_data {
> struct io_entity {
> struct rb_node rb_node;
> int on_st;
> + int on_idle_st;
> u64 vdisktime;
> unsigned int weight;
> struct io_entity *parent;
> + /* This io entity (queue or group) has been marked for deletion */
> + unsigned int exiting;
>
> struct io_sched_data *my_sd;
> struct io_service_tree *st;
> @@ -572,7 +579,7 @@ extern struct io_queue *elv_alloc_ioq(st
> extern void elv_free_ioq(struct io_queue *ioq);
> extern struct io_group *ioq_to_io_group(struct io_queue *ioq);
> extern int elv_iog_should_idle(struct io_queue *ioq);
> -
> +extern void elv_exit_ioq(struct io_queue *ioq);
> #else /* CONFIG_ELV_FAIR_QUEUING */
> static inline struct elv_fq_data *
> elv_alloc_fq_data(struct request_queue *q, struct elevator_queue *e)
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/