[APPENDIX PATCH 3/5] blk_end_request: dynamic load balancing for request-based dm-multipath

From: Kiyoshi Ueda
Date: Fri Aug 31 2007 - 18:46:42 EST


This patch adds dynamic load balancing to request-based dm-multipath.

Two new path selectors are introduced: "load-balance" (dm-load-balance.c),
which tracks the number of in-flight requests per path and picks the least
loaded one, and "adaptive" (dm-adaptive.c), which estimates per-path service
time from the in-flight byte count and the recently measured throughput and
picks the path expected to complete the new request soonest. To support
them, the path-selector interface gains a start_io() hook, and select_path()
and end_io() now receive the I/O size (nr_bytes).

Request-based dm itself is still under development and not ready
for inclusion.
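
For reference, a multipath table line using one of the new selectors could
look like the hypothetical example below (the device numbers 8:16/8:32 and
the per-path repeat count of 100 are made-up values; the general multipath
table layout is unchanged, only the selector name and its per-path argument
differ):

    0 2097152 multipath 0 0 1 1 adaptive 0 2 1 8:16 100 8:32 100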

Signed-off-by: Kiyoshi Ueda <k-ueda@xxxxxxxxxxxxx>
Signed-off-by: Jun'ichi Nomura <j-nomura@xxxxxxxxxxxxx>
---
 drivers/md/Makefile           |    3
 drivers/md/dm-adaptive.c      |  369 ++++++++++++++++++++++++++++++++++++++++++
 drivers/md/dm-load-balance.c  |  342 ++++++++++++++++++++++++++++++++++++++
 drivers/md/dm-mpath.c         |   32 ++-
 drivers/md/dm-path-selector.h |    7
 drivers/md/dm-round-robin.c   |    2
 drivers/md/dm.c               |    4
 include/linux/device-mapper.h |    3
 8 files changed, 742 insertions(+), 20 deletions(-)
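
To illustrate the arithmetic that ad_compare_load() in dm-adaptive.c uses,
here is a small, userspace-only model (not part of the patch; the structure,
names and sample numbers are invented for the example, and the << 10 scaling
the kernel code applies before do_div() is left out for clarity). A path
with a longer queue can still win if its recently measured throughput is
higher, because what is compared is the estimated drain time,
(in-flight bytes + new I/O size) / throughput; queue length alone is only
used when no throughput data exists yet.

/* compare_load_demo.c - standalone model of dm-adaptive's path choice */
#include <stdio.h>

struct path_model {
        const char *name;
        unsigned long long in_flight;   /* bytes currently in flight on the path */
        unsigned long long perf;        /* recent throughput, bytes per msec */
};

/* Negative prefers p1, positive prefers p2, zero means no preference. */
static long long compare_load(const struct path_model *p1,
                              const struct path_model *p2,
                              unsigned long long new_io)
{
        long long t1, t2;

        /* No throughput data yet: fall back to the shorter queue. */
        if (!p1->perf || !p2->perf)
                return (long long)p1->in_flight - (long long)p2->in_flight;

        /* Estimated time to finish everything queued plus the new request. */
        t1 = (p1->in_flight + new_io) / p1->perf;
        t2 = (p2->in_flight + new_io) / p2->perf;
        if (t1 != t2)
                return t1 - t2;

        /* Estimates tie: prefer the historically faster path. */
        return (long long)p2->perf - (long long)p1->perf;
}

int main(void)
{
        /* sda: 4MB queued at ~200MB/s; sdb: 1MB queued at ~50MB/s. */
        struct path_model sda = { "sda", 4ULL << 20, 200ULL << 10 };
        struct path_model sdb = { "sdb", 1ULL << 20,  50ULL << 10 };
        unsigned long long new_io = 512ULL << 10;       /* 512KB request */

        printf("selected path: %s\n",
               compare_load(&sda, &sdb, new_io) < 0 ? sda.name : sdb.name);
        return 0;
}

Compiled with "gcc -Wall compare_load_demo.c", this prints
"selected path: sda": the busier but faster path is expected to complete
the new request in roughly 23ms versus roughly 30ms on the shorter but
slower queue.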

diff -rupN a2-rqdm-mpath/drivers/md/dm-adaptive.c a3-rqdm-mpath-dlb/drivers/md/dm-adaptive.c
--- a2-rqdm-mpath/drivers/md/dm-adaptive.c 1969-12-31 19:00:00.000000000 -0500
+++ a3-rqdm-mpath-dlb/drivers/md/dm-adaptive.c 2007-08-28 16:41:34.000000000 -0400
@@ -0,0 +1,369 @@
+/*
+ * Copyright (C) 2007 NEC Corporation. All Rights Reserved.
+ * dm-adaptive.c
+ *
+ * Module Author: Kiyoshi Ueda
+ *
+ * This file is released under the GPL.
+ *
+ * Adaptive path selector.
+ */
+
+#include "dm.h"
+#include "dm-path-selector.h"
+
+#define DM_MSG_PREFIX "multipath adaptive"
+#define AD_MIN_IO 100
+#define AD_VERSION "0.2.0"
+
+struct selector {
+// spinlock_t lock;
+ struct list_head valid_paths;
+ struct list_head failed_paths;
+};
+
+struct path_info {
+ struct list_head list;
+ struct dm_path *path;
+ unsigned int repeat_count;
+
+ atomic_t in_flight; /* Total size of in-flight I/Os */
+ size_t perf; /* Recent performance of the path */
+ sector_t last_sectors; /* Total sectors of the last disk_stat_read */
+ size_t last_io_ticks; /* io_ticks of the last disk_stat_read */
+
+ size_t rqsz[2]; /* Size of the last request. For Debug */
+};
+
+static void free_paths(struct list_head *paths)
+{
+ struct path_info *pi, *next;
+
+ list_for_each_entry_safe(pi, next, paths, list) {
+ list_del(&pi->list);
+ pi->path->pscontext = NULL;
+ kfree(pi);
+ }
+}
+
+static struct selector *alloc_selector(void)
+{
+ struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+ if (s) {
+ memset(s, 0, sizeof(*s));
+ INIT_LIST_HEAD(&s->valid_paths);
+ INIT_LIST_HEAD(&s->failed_paths);
+// s->lock = SPIN_LOCK_UNLOCKED;
+ }
+
+ return s;
+}
+
+static int ad_create(struct path_selector *ps, unsigned argc, char **argv)
+{
+ struct selector *s;
+
+ s = alloc_selector();
+ if (!s)
+ return -ENOMEM;
+
+ ps->context = s;
+ return 0;
+}
+
+static void ad_destroy(struct path_selector *ps)
+{
+ struct selector *s = (struct selector *) ps->context;
+
+ free_paths(&s->valid_paths);
+ free_paths(&s->failed_paths);
+ kfree(s);
+ ps->context = NULL;
+}
+
+static int ad_status(struct path_selector *ps, struct dm_path *path,
+ status_type_t type, char *result, unsigned int maxlen)
+{
+ struct path_info *pi;
+ int sz = 0;
+
+ if (!path)
+ DMEMIT("0 ");
+ else {
+ pi = (struct path_info *) path->pscontext;
+ if (!pi)
+ BUG();
+
+ switch (type) {
+ case STATUSTYPE_INFO:
+ DMEMIT("if:%08lu pf:%06lu rsR:%06lu rsW:%06lu ",
+ (unsigned long) atomic_read(&pi->in_flight),
+ (unsigned long) pi->perf,
+ (unsigned long) pi->rqsz[READ], (unsigned long) pi->rqsz[WRITE]);
+ break;
+ case STATUSTYPE_TABLE:
+ DMEMIT("%u ", pi->repeat_count);
+ break;
+ }
+ }
+
+ return sz;
+}
+
+/*
+ * Note: Assuming IRQs are enabled when this function gets called.
+ */
+static int ad_add_path(struct path_selector *ps, struct dm_path *path,
+ int argc, char **argv, char **error)
+{
+ struct selector *s = (struct selector *) ps->context;
+ struct path_info *pi;
+ unsigned int repeat_count = AD_MIN_IO;
+ struct gendisk *disk = path->dev->bdev->bd_disk;
+
+ if (argc > 1) {
+ *error = "adaptive ps: incorrect number of arguments";
+ return -EINVAL;
+ }
+
+ /* First path argument is number of I/Os before switching path. */
+ if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
+ *error = "adaptive ps: invalid repeat count";
+ return -EINVAL;
+ }
+
+ /* allocate the path */
+ pi = kmalloc(sizeof(*pi), GFP_KERNEL);
+ if (!pi) {
+ *error = "adaptive ps: Error allocating path context";
+ return -ENOMEM;
+ }
+
+ pi->path = path;
+ pi->repeat_count = repeat_count;
+
+ pi->perf = 0;
+ pi->last_sectors = disk_stat_read(disk, sectors[READ])
+ + disk_stat_read(disk, sectors[WRITE]);
+ pi->last_io_ticks = disk_stat_read(disk, io_ticks);
+ atomic_set(&pi->in_flight, 0);
+ pi->rqsz[READ] = pi->rqsz[WRITE] = 0;
+
+ path->pscontext = pi;
+
+// spin_lock_irq(&s->lock);
+ list_add_tail(&pi->list, &s->valid_paths);
+// spin_unlock_irq(&s->lock);
+
+ return 0;
+}
+
+/*
+ * Note: Called with IRQ disabled from mpath.c/fail_path().
+ */
+static void ad_fail_path(struct path_selector *ps, struct dm_path *p)
+{
+ struct selector *s = (struct selector *) ps->context;
+ struct path_info *pi = (struct path_info *) p->pscontext;
+
+// spin_lock(&s->lock);
+ list_move(&pi->list, &s->failed_paths);
+// spin_unlock(&s->lock);
+}
+
+/*
+ * Notes: Called with IRQ disabled from mpath.c/reinstate_path().
+ */
+static int ad_reinstate_path(struct path_selector *ps, struct dm_path *p)
+{
+ struct selector *s = (struct selector *) ps->context;
+ struct path_info *pi = (struct path_info *) p->pscontext;
+
+ if (!pi)
+ BUG();
+
+// spin_lock(&s->lock);
+ list_move_tail(&pi->list, &s->valid_paths);
+// spin_unlock(&s->lock);
+
+ return 0;
+}
+
+static void stats_update(struct path_info *pi)
+{
+ sector_t sectors;
+ size_t io_ticks, tmp;
+ struct gendisk *disk = pi->path->dev->bdev->bd_disk;
+
+ sectors = disk_stat_read(disk, sectors[READ])
+ + disk_stat_read(disk, sectors[WRITE]);
+ io_ticks = disk_stat_read(disk, io_ticks);
+
+ if ((sectors != pi->last_sectors) && (io_ticks != pi->last_io_ticks)) {
+ tmp = (sectors - pi->last_sectors) << 9;
+ do_div(tmp, jiffies_to_msecs((io_ticks - pi->last_io_ticks)));
+ pi->perf = tmp;
+
+ pi->last_sectors = sectors;
+ pi->last_io_ticks = io_ticks;
+ }
+}
+
+static int ad_compare_load(struct path_info *pi1, struct path_info *pi2,
+ size_t new_io)
+{
+ size_t if1, if2;
+// size_t st1, st2;
+
+ if1 = atomic_read(&pi1->in_flight);
+ if2 = atomic_read(&pi2->in_flight);
+
+ /*
+ * Case 1: No performance data available. Choose the less loaded path.
+ */
+ if (!pi1->perf || !pi2->perf)
+ return if1 - if2;
+
+ /*
+ * Case 2: Calculate service time. Choose faster path.
+ * if ((if1+new_io)/pi1->perf < (if2+new_io)/pi2->perf) pi1.
+ * if ((if1+new_io)/pi1->perf > (if2+new_io)/pi2->perf) pi2.
+ * To keep integer precision, the byte counts are scaled by 1024
+ * (<< 10) before do_div() by the measured throughput (perf, in
+ * bytes per msec).
+ */
+// st1 = (if2 + new_io) * pi1->perf;
+// st2 = (if1 + new_io) * pi2->perf;
+// st1 = (if2) * pi1->perf;
+// st2 = (if1) * pi2->perf;
+ if1 = (if1 + new_io) << 10;
+ if2 = (if2 + new_io) << 10;
+ do_div(if1, pi1->perf);
+ do_div(if2, pi2->perf);
+
+// if (st1 != st2)
+// return (st2 < st1) ? -1 : 1;
+ if (if1 != if2)
+ return if1 - if2;
+
+ /*
+ * Case 3: Service time is equal. Choose faster path.
+ */
+ return pi2->perf - pi1->perf;
+}
+
+static struct dm_path *ad_select_path(struct path_selector *ps,
+ unsigned int *repeat_count, size_t nr_bytes)
+{
+ struct selector *s = (struct selector *) ps->context;
+ struct path_info *pi, *best = NULL;
+// unsigned long flags;
+
+ if (!s)
+ BUG();
+ if (!repeat_count)
+ BUG();
+
+// spin_lock_irqsave(&s->lock, flags);
+ if (list_empty(&s->valid_paths)) {
+// spin_unlock_irqrestore(&s->lock, flags);
+ printk(KERN_INFO "adaptive ps: no valid paths.\n");
+ return NULL;
+ }
+
+ /* Change preferred (first in list) path to evenly balance. */
+ list_move_tail(s->valid_paths.next, &s->valid_paths);
+
+ /* Update performance information before best path selection */
+ list_for_each_entry(pi, &s->valid_paths, list) {
+ stats_update(pi);
+ }
+
+ list_for_each_entry(pi, &s->valid_paths, list) {
+ if (!best)
+ best = pi;
+ else {
+ if (ad_compare_load(pi, best, nr_bytes) < 0)
+ best = pi;
+ }
+ }
+// spin_unlock_irqrestore(&s->lock, flags);
+
+ if (best) {
+ *repeat_count = best->repeat_count;
+ return best->path;
+ }
+
+ return NULL;
+}
+
+static int ad_start_io(struct path_selector *ps, struct dm_path *p,
+ struct request *clone)
+{
+ struct path_info *pi = (struct path_info *) p->pscontext;
+ int rw = rq_data_dir(clone);
+
+ /* Update debug information */
+ pi->rqsz[rw] = clone->nr_sectors << 9;
+
+ atomic_add(clone->nr_sectors << 9, &pi->in_flight);
+
+ return 0;
+}
+
+static int ad_end_io(struct path_selector *ps, struct dm_path *p,
+ struct request *clone, int nr_bytes)
+{
+ struct path_info *pi = (struct path_info *) p->pscontext;
+
+ atomic_sub(nr_bytes, &pi->in_flight);
+
+ return 0;
+}
+
+static struct path_selector_type ad_ps = {
+ .name = "adaptive",
+ .module = THIS_MODULE,
+ .table_args = 1,
+ .info_args = 4,
+ .create = ad_create,
+ .destroy = ad_destroy,
+ .status = ad_status,
+ .add_path = ad_add_path,
+ .fail_path = ad_fail_path,
+ .reinstate_path = ad_reinstate_path,
+ .select_path = ad_select_path,
+ .start_io = ad_start_io,
+ .end_io = ad_end_io,
+};
+
+static int __init dm_ad_init(void)
+{
+ int r = dm_register_path_selector(&ad_ps);
+
+ if (r < 0)
+ DMERR("adaptive: register failed %d", r);
+
+ DMINFO("dm-adaptive version " AD_VERSION " loaded");
+
+ return r;
+}
+
+static void __exit dm_ad_exit(void)
+{
+ int r = dm_unregister_path_selector(&ad_ps);
+
+ if (r < 0)
+ DMERR("adaptive: unregister failed %d", r);
+}
+
+module_init(dm_ad_init);
+module_exit(dm_ad_exit);
+
+MODULE_DESCRIPTION(
+ "Copyright (C) 2007 NEC Corporation. All Rights Reserved.\n"
+ DM_NAME " Adaptive path selector (dm-adaptive.c version AD_VERSION)"
+);
+MODULE_AUTHOR("Kiyoshi Ueda <k-ueda@xxxxxxxxxxxxx>");
+MODULE_LICENSE("GPL");
diff -rupN a2-rqdm-mpath/drivers/md/dm.c a3-rqdm-mpath-dlb/drivers/md/dm.c
--- a2-rqdm-mpath/drivers/md/dm.c 2007-08-29 14:33:31.000000000 -0400
+++ a3-rqdm-mpath-dlb/drivers/md/dm.c 2007-08-29 14:33:34.000000000 -0400
@@ -829,7 +829,7 @@ static int clone_end_request(struct requ
error = !uptodate ? -EIO : uptodate;

if (endio_first) {
- r = endio_first(tio->ti, clone, error, &tio->info);
+ r = endio_first(tio->ti, clone, error, nr_bytes, &tio->info);
switch (r) {
case 0:
/* succeeded */
@@ -1357,7 +1357,7 @@ static void dm_request_fn(struct request

ti = dm_table_find_target(map, rq->sector);
congested = ti->type->congested;
- if (congested && congested(ti))
+ if (congested && congested(ti, rq->nr_sectors << 9))
break;

blkdev_dequeue_request(rq);
diff -rupN a2-rqdm-mpath/drivers/md/dm-load-balance.c a3-rqdm-mpath-dlb/drivers/md/dm-load-balance.c
--- a2-rqdm-mpath/drivers/md/dm-load-balance.c 1969-12-31 19:00:00.000000000 -0500
+++ a3-rqdm-mpath-dlb/drivers/md/dm-load-balance.c 2007-08-28 16:41:34.000000000 -0400
@@ -0,0 +1,342 @@
+/*
+ * (C) Copyright IBM Corp. 2004,2005 All Rights Reserved.
+ * dm-load-balance.c
+ *
+ * Module Author: Stefan Bader
+ *
+ * This file is released under the GPL.
+ *
+ * Load balancing path selector.
+ */
+#include <linux/ctype.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <asm/atomic.h>
+
+#include "dm.h"
+#include "dm-path-selector.h"
+
+#define DM_MSG_PREFIX "multipath load-balance"
+#define LB_MIN_IO 128
+#define LB_VERSION "0.1.0"
+
+struct selector {
+ spinlock_t lock;
+ struct list_head valid_paths;
+ struct list_head failed_paths;
+};
+
+struct path_info {
+ struct list_head list;
+ struct dm_path * path;
+ unsigned int repeat_count;
+ atomic_t load;
+};
+
+static struct selector *alloc_selector(void)
+{
+ struct selector * s;
+
+ if ((s = kmalloc(sizeof(*s), GFP_KERNEL)) != NULL) {
+ memset(s, 0, sizeof(*s));
+ INIT_LIST_HEAD(&s->valid_paths);
+ INIT_LIST_HEAD(&s->failed_paths);
+ spin_lock_init(&s->lock);
+ }
+
+ return s;
+}
+
+static inline void free_selector(struct selector *s)
+{
+ kfree(s);
+}
+
+static int lb_create(struct path_selector *ps, unsigned argc, char **argv)
+{
+ struct selector * s;
+
+ if ((s = alloc_selector()) == NULL)
+ return -ENOMEM;
+
+ ps->context = s;
+
+ return 0;
+}
+
+static void lb_free_paths(struct list_head *paths)
+{
+ struct path_info * cpi;
+ struct path_info * npi;
+
+ list_for_each_entry_safe(cpi, npi, paths, list) {
+ list_del(&cpi->list);
+ cpi->path->pscontext = NULL;
+ kfree(cpi);
+ }
+}
+
+static void lb_destroy(struct path_selector *ps)
+{
+ struct selector * s;
+
+ s = (struct selector *) ps->context;
+ lb_free_paths(&s->valid_paths);
+ lb_free_paths(&s->failed_paths);
+ free_selector(s);
+ ps->context = NULL;
+}
+
+/*
+ * Note: Assuming IRQs are enabled when this function gets called.
+ */
+static int
+lb_add_path(
+ struct path_selector * ps,
+ struct dm_path * path,
+ int argc,
+ char ** argv,
+ char ** error)
+{
+ struct selector * s;
+ struct path_info * pi;
+ unsigned int repeat_count;
+
+ s = (struct selector *) ps->context;
+
+ /* Parse the arguments */
+ if (argc > 1) {
+ *error = "dm-load-balance: incorrect number of arguments";
+ return -EINVAL;
+ }
+
+ /* First path argument is number of I/Os before switching path. */
+ repeat_count = LB_MIN_IO;
+ if (argc > 0) {
+ if (sscanf(argv[0], "%u", &repeat_count) != 1) {
+ *error = "load-balance ps: invalid repeat count";
+ return -EINVAL;
+ }
+ }
+
+ /* Allocate the path information structure */
+ if ((pi = kmalloc(sizeof(*pi), GFP_KERNEL)) == NULL) {
+ *error = "dm-load-balance: Error allocating path information";
+ return -ENOMEM;
+ }
+
+ pi->path = path;
+ pi->repeat_count = repeat_count;
+ atomic_set(&pi->load, 0);
+ path->pscontext = pi;
+
+ spin_lock_irq(&s->lock);
+ list_add_tail(&pi->list, &s->valid_paths);
+ spin_unlock_irq(&s->lock);
+
+ return 0;
+}
+
+/*
+ * Note: Called with IRQ disabled from mpath.c/fail_path().
+ */
+static void
+lb_fail_path(struct path_selector *ps, struct dm_path *p)
+{
+ struct path_info * pi;
+ struct selector * s;
+
+ pi = (struct path_info *) p->pscontext;
+ s = (struct selector *) ps->context;
+
+ spin_lock(&s->lock);
+ list_move(&pi->list, &s->failed_paths);
+ spin_unlock(&s->lock);
+}
+
+/*
+ * Notes: Called with IRQ disabled from mpath.c/reinstate_path().
+ */
+static int
+lb_reinstate_path(struct path_selector *ps, struct dm_path *p)
+{
+ struct path_info * pi;
+ struct selector * s;
+
+ pi = (struct path_info *) p->pscontext;
+ s = (struct selector *) ps->context;
+
+ if (!pi)
+ BUG();
+
+ spin_lock(&s->lock);
+ list_move_tail(&pi->list, &s->valid_paths);
+ spin_unlock(&s->lock);
+
+ return 0;
+}
+
+static inline int
+lb_compare_load(struct path_info *pi1, struct path_info *pi2)
+{
+ return atomic_read(&pi1->load) - atomic_read(&pi2->load);
+}
+
+static struct dm_path *
+lb_select_path(
+ struct path_selector * ps,
+ unsigned * repeat,
+ size_t nr_bytes)
+{
+ struct selector * s;
+ struct path_info * cpi;
+ struct path_info * spi;
+ unsigned long flags;
+
+ s = (struct selector *) ps->context;
+ if (!s)
+ BUG();
+ if (!repeat)
+ BUG();
+
+ spin_lock_irqsave(&s->lock, flags);
+ if (list_empty(&s->valid_paths)) {
+ spin_unlock_irqrestore(&s->lock, flags);
+ printk(KERN_ERR "dm-load-balance: no valid paths!\n");
+ return NULL;
+ }
+
+ /* Change preferred (first in list) path to evenly balance. */
+ list_move_tail(s->valid_paths.next, &s->valid_paths);
+
+ spi = NULL;
+ list_for_each_entry(cpi, &s->valid_paths, list) {
+ if (spi == NULL) {
+ spi = cpi;
+ } else {
+ if (lb_compare_load(cpi, spi) < 0) {
+ spi = cpi;
+ }
+ }
+ }
+ spin_unlock_irqrestore(&s->lock, flags);
+
+ if (spi)
+ *repeat = spi->repeat_count;
+
+ return spi ? spi->path : NULL;
+}
+
+static int
+lb_io_started(
+ struct path_selector * ps,
+ struct dm_path * p,
+ struct request * clone)
+{
+ struct path_info * pi;
+
+ pi = (struct path_info *) p->pscontext;
+ atomic_inc(&pi->load);
+
+ return 0;
+}
+
+static int
+lb_io_finished(
+ struct path_selector * ps,
+ struct dm_path * p,
+ struct request * clone,
+ int nr_bytes)
+{
+ struct path_info * pi;
+
+ pi = (struct path_info *) p->pscontext;
+ atomic_dec(&pi->load);
+
+ return 0;
+}
+
+static int
+lb_status(
+ struct path_selector * ps,
+ struct dm_path * p,
+ status_type_t type,
+ char * result,
+ unsigned int maxlen)
+{
+ struct path_info * pi;
+ int sz;
+
+ /* This is used by DMEMIT. */
+ sz = 0;
+
+ /* When called with (p == NULL) return selector status/args. */
+ if (!p) {
+ DMEMIT("0 ");
+ } else {
+ pi = (struct path_info *) p->pscontext;
+ if (!pi)
+ BUG();
+
+ switch (type) {
+ case STATUSTYPE_TABLE:
+ DMEMIT("%i ", pi->repeat_count);
+ break;
+ case STATUSTYPE_INFO:
+ DMEMIT("%i ", atomic_read(&pi->load));
+ break;
+ }
+ }
+
+ return sz;
+}
+
+static struct path_selector_type lb_ps = {
+ .name = "load-balance",
+ .module = THIS_MODULE,
+ .table_args = 1,
+ .info_args = 1,
+ .create = lb_create,
+ .destroy = lb_destroy,
+ .status = lb_status,
+ .add_path = lb_add_path,
+ .fail_path = lb_fail_path,
+ .reinstate_path = lb_reinstate_path,
+ .select_path = lb_select_path,
+ .start_io = lb_io_started,
+ .end_io = lb_io_finished,
+};
+
+int __init dm_lb_ps_init(void)
+{
+ int rc;
+
+ rc = dm_register_path_selector(&lb_ps);
+ if (rc < 0)
+ DMERR("load-balance: register failed %d", rc);
+
+ DMINFO("dm-load-balance version " LB_VERSION " loaded");
+
+ return rc;
+}
+
+void __exit dm_lb_ps_exit(void)
+{
+ int rc;
+
+ rc = dm_unregister_path_selector(&lb_ps);
+ if (rc < 0)
+ DMERR("load-balance: unregister failed %d", rc);
+}
+
+module_init(dm_lb_ps_init);
+module_exit(dm_lb_ps_exit);
+
+MODULE_AUTHOR("Stefan Bader <Stefan.Bader at de.ibm.com>");
+MODULE_DESCRIPTION(
+ "(C) Copyright IBM Corp. 2004,2005 All Rights Reserved.\n"
+ DM_NAME " load balancing path selector (dm-load-balance.c version "
+ LB_VERSION ")"
+);
+MODULE_LICENSE("GPL");
+
diff -rupN a2-rqdm-mpath/drivers/md/dm-mpath.c a3-rqdm-mpath-dlb/drivers/md/dm-mpath.c
--- a2-rqdm-mpath/drivers/md/dm-mpath.c 2007-08-29 14:07:39.000000000 -0400
+++ a3-rqdm-mpath-dlb/drivers/md/dm-mpath.c 2007-08-29 14:07:59.000000000 -0400
@@ -227,11 +227,12 @@ static void __switch_pg(struct multipath
}
}

-static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg)
+static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg,
+ size_t nr_bytes)
{
struct dm_path *path;

- path = pg->ps.type->select_path(&pg->ps, &m->repeat_count);
+ path = pg->ps.type->select_path(&pg->ps, &m->repeat_count, nr_bytes);
if (!path)
return -ENXIO;

@@ -243,7 +244,7 @@ static int __choose_path_in_pg(struct mu
return 0;
}

-static void __choose_pgpath(struct multipath *m)
+static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
{
struct priority_group *pg;
unsigned bypassed = 1;
@@ -255,12 +256,12 @@ static void __choose_pgpath(struct multi
if (m->next_pg) {
pg = m->next_pg;
m->next_pg = NULL;
- if (!__choose_path_in_pg(m, pg))
+ if (!__choose_path_in_pg(m, pg, nr_bytes))
return;
}

/* Don't change PG until it has no remaining paths */
- if (m->current_pg && !__choose_path_in_pg(m, m->current_pg))
+ if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes))
return;

/*
@@ -272,7 +273,7 @@ static void __choose_pgpath(struct multi
list_for_each_entry(pg, &m->priority_groups, list) {
if (pg->bypassed == bypassed)
continue;
- if (!__choose_path_in_pg(m, pg))
+ if (!__choose_path_in_pg(m, pg, nr_bytes))
return;
}
} while (bypassed--);
@@ -311,7 +312,7 @@ static int map_io(struct multipath *m, s
/* Do we need to select a new pgpath? */
if (!m->current_pgpath ||
(!m->queue_io && (m->repeat_count && --m->repeat_count == 0)))
- __choose_pgpath(m);
+ __choose_pgpath(m, clone->nr_sectors << 9);

pgpath = m->current_pgpath;

@@ -345,6 +346,10 @@ static int map_io(struct multipath *m, s

mpio->pgpath = pgpath;

+ if (r == 1 && m->current_pg->ps.type->start_io)
+ m->current_pg->ps.type->start_io(&m->current_pg->ps,
+ &pgpath->path, clone);
+
spin_unlock_irqrestore(&m->lock, flags);

return r;
@@ -421,7 +426,7 @@ static void process_queued_ios(struct wo
goto out;

if (!m->current_pgpath)
- __choose_pgpath(m);
+ __choose_pgpath(m, 1 << 19); /* Assume 512 KB */

pgpath = m->current_pgpath;

@@ -1086,7 +1091,8 @@ static int do_end_io(struct multipath *m
* clone->q's lock must be held
*/
static int multipath_end_io_first(struct dm_target *ti, struct request *clone,
- int error, union map_info *map_context)
+ int error, int nr_bytes,
+ union map_info *map_context)
{
struct multipath *m = ti->private;
struct dm_mpath_io *mpio = map_context->ptr;
@@ -1098,7 +1104,7 @@ static int multipath_end_io_first(struct
if (pgpath) {
ps = &pgpath->pg->ps;
if (ps->type->end_io)
- ps->type->end_io(ps, &pgpath->path);
+ ps->type->end_io(ps, &pgpath->path, clone, nr_bytes);
}

return r;
@@ -1327,7 +1333,7 @@ static int multipath_ioctl(struct dm_tar
spin_lock_irqsave(&m->lock, flags);

if (!m->current_pgpath)
- __choose_pgpath(m);
+ __choose_pgpath(m, 1 << 19); /* Assume 512KB */

if (m->current_pgpath) {
bdev = m->current_pgpath->path.dev->bdev;
@@ -1384,7 +1390,7 @@ static int __pg_congested(struct priorit
}
#endif

-static int multipath_congested(struct dm_target *ti)
+static int multipath_congested(struct dm_target *ti, size_t nr_bytes)
{
int r = 0;
struct multipath *m = (struct multipath *) ti->private;
@@ -1409,7 +1415,7 @@ static int multipath_congested(struct dm
* in map_io(). (This is a hack for pre-decrementing repeat_count
* in map_io(). Needs to be fixed this repeat_count bug.)
*/
- __choose_pgpath(m);
+ __choose_pgpath(m, nr_bytes);
if (m->current_pgpath) {
if (__pgpath_congested(m->current_pgpath)) {
r = 1;
diff -rupN a2-rqdm-mpath/drivers/md/dm-path-selector.h a3-rqdm-mpath-dlb/drivers/md/dm-path-selector.h
--- a2-rqdm-mpath/drivers/md/dm-path-selector.h 2007-08-13 00:25:24.000000000 -0400
+++ a3-rqdm-mpath-dlb/drivers/md/dm-path-selector.h 2007-08-28 16:41:34.000000000 -0400
@@ -56,7 +56,7 @@ struct path_selector_type {
* the path fails.
*/
struct dm_path *(*select_path) (struct path_selector *ps,
- unsigned *repeat_count);
+ unsigned *repeat_count, size_t nr_bytes);

/*
* Notify the selector that a path has failed.
@@ -75,7 +75,10 @@ struct path_selector_type {
int (*status) (struct path_selector *ps, struct dm_path *path,
status_type_t type, char *result, unsigned int maxlen);

- int (*end_io) (struct path_selector *ps, struct dm_path *path);
+ int (*start_io) (struct path_selector *ps, struct dm_path *path,
+ struct request *clone);
+ int (*end_io) (struct path_selector *ps, struct dm_path *path,
+ struct request *clone, int nr_bytes);
};

/* Register a path selector */
diff -rupN a2-rqdm-mpath/drivers/md/dm-round-robin.c a3-rqdm-mpath-dlb/drivers/md/dm-round-robin.c
--- a2-rqdm-mpath/drivers/md/dm-round-robin.c 2007-08-13 00:25:24.000000000 -0400
+++ a3-rqdm-mpath-dlb/drivers/md/dm-round-robin.c 2007-08-28 16:41:34.000000000 -0400
@@ -160,7 +160,7 @@ static int rr_reinstate_path(struct path
}

static struct dm_path *rr_select_path(struct path_selector *ps,
- unsigned *repeat_count)
+ unsigned *repeat_count, size_t nr_bytes)
{
struct selector *s = (struct selector *) ps->context;
struct path_info *pi = NULL;
diff -rupN a2-rqdm-mpath/drivers/md/Makefile a3-rqdm-mpath-dlb/drivers/md/Makefile
--- a2-rqdm-mpath/drivers/md/Makefile 2007-08-13 00:25:24.000000000 -0400
+++ a3-rqdm-mpath-dlb/drivers/md/Makefile 2007-08-28 16:41:34.000000000 -0400
@@ -33,7 +33,8 @@ obj-$(CONFIG_BLK_DEV_MD) += md-mod.o
obj-$(CONFIG_BLK_DEV_DM) += dm-mod.o
obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
obj-$(CONFIG_DM_DELAY) += dm-delay.o
-obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
+obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o \
+ dm-load-balance.o dm-adaptive.o
obj-$(CONFIG_DM_MULTIPATH_EMC) += dm-emc.o
obj-$(CONFIG_DM_MULTIPATH_RDAC) += dm-rdac.o
obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
diff -rupN a2-rqdm-mpath/include/linux/device-mapper.h a3-rqdm-mpath-dlb/include/linux/device-mapper.h
--- a2-rqdm-mpath/include/linux/device-mapper.h 2007-08-28 15:21:48.000000000 -0400
+++ a3-rqdm-mpath-dlb/include/linux/device-mapper.h 2007-08-28 16:41:34.000000000 -0400
@@ -64,6 +64,7 @@ typedef int (*dm_endio_fn) (struct dm_ta

typedef int (*dm_request_endio_first_fn) (struct dm_target *ti,
struct request *clone, int error,
+ int nr_bytes,
union map_info *map_context);

typedef int (*dm_request_endio_fn) (struct dm_target *ti,
@@ -88,7 +89,7 @@ typedef int (*dm_message_fn) (struct dm_
typedef int (*dm_ioctl_fn) (struct dm_target *ti, struct inode *inode,
struct file *filp, unsigned int cmd,
unsigned long arg);
-typedef int (*dm_congested_fn) (struct dm_target *ti);
+typedef int (*dm_congested_fn) (struct dm_target *ti, size_t nr_bytes);

void dm_error(const char *message);
