Re: Status of the IO scheduler fixes for 2.4

From: Chris Mason (mason@suse.com)
Date: Sun Jul 06 2003 - 13:51:38 EST


On Sun, 2003-07-06 at 03:58, Marc-Christian Petersen wrote:
> On Friday 04 July 2003 23:37, Chris Mason wrote:
>
> Hi Chris,
>
> > > If the IO fairness still doesn't
> > > get somewhat better for general use (we'll get isolated user reports and
> > > benchmarks for both patches), then I might consider the q->full
> > > patch (it has throughput drawbacks and I prefer avoiding a tunable
> > > there).
> Now there is io-stalls-10 in .22-pre3 (lowlat elevator + fixpausing). Could
> you please send the "q->full patch" on top of -pre3? :-)

Attached. This defaults to q->full off and keeps the elvtune changes,
so to turn on the q->full low-latency fixes you need to run:

elvtune -b 1 /dev/xxxx

Note that for lvm and md, you need to run elvtune on each underlying
device; running it on the lvm/md device itself doesn't do anything.
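
As an aside, with the patch applied "elvtune -b 1" amounts to a
BLKELVGET/BLKELVSET round trip that sets the low bit of
max_bomb_segments.  Below is a rough, untested userspace sketch of
that; the struct layout, ioctl numbers and MAX_BOMB_LATENCY_MASK are
copied from the patched include/linux/elevator.h, everything else
(program name, error handling, device path) is only illustrative.

/*
 * set_low_latency.c -- untested sketch of roughly what "elvtune -b 1"
 * does once this patch is applied: read the current elevator settings
 * with BLKELVGET, set the low bit of max_bomb_segments, write them
 * back with BLKELVSET.  Run as root against a real block device.
 */
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/ioctl.h>

typedef struct {
	int queue_ID;
	int read_latency;
	int write_latency;
	int max_bomb_segments;
} blkelv_ioctl_arg_t;

/* mirrors the patched include/linux/elevator.h */
#define BLKELVGET _IOR(0x12,106,sizeof(blkelv_ioctl_arg_t))
#define BLKELVSET _IOW(0x12,107,sizeof(blkelv_ioctl_arg_t))
#define MAX_BOMB_LATENCY_MASK 1	/* odd max_bomb_segments == low latency on */

int main(int argc, char **argv)
{
	blkelv_ioctl_arg_t arg;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s /dev/hdX\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* fetch current settings so read/write latency are preserved */
	if (ioctl(fd, BLKELVGET, &arg) < 0) {
		perror("BLKELVGET");
		return 1;
	}
	/* setting the low bit turns q->low_latency on in the patched kernel */
	arg.max_bomb_segments |= MAX_BOMB_LATENCY_MASK;
	if (ioctl(fd, BLKELVSET, &arg) < 0) {
		perror("BLKELVSET");
		return 1;
	}
	printf("queue %d: requested low latency mode\n", arg.queue_ID);
	close(fd);
	return 0;
}

To switch it back off you would clear the bit instead
(arg.max_bomb_segments &= ~MAX_BOMB_LATENCY_MASK) before the BLKELVSET,
and per the note above you would point this (or elvtune) at each
underlying disk rather than at the lvm/md device itself.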

-chris


--- 1.9/drivers/block/blkpg.c Sat Mar 30 06:58:05 2002
+++ edited/drivers/block/blkpg.c Sun Jul 6 14:34:31 2003
@@ -261,10 +261,10 @@
                         return blkpg_ioctl(dev, (struct blkpg_ioctl_arg *) arg);
                         
                 case BLKELVGET:
-                        return blkelvget_ioctl(&blk_get_queue(dev)->elevator,
+                        return blkelvget_ioctl(blk_get_queue(dev),
                                                (blkelv_ioctl_arg_t *) arg);
                 case BLKELVSET:
-                        return blkelvset_ioctl(&blk_get_queue(dev)->elevator,
+                        return blkelvset_ioctl(blk_get_queue(dev),
                                                (blkelv_ioctl_arg_t *) arg);
 
                 case BLKBSZGET:
--- 1.8/drivers/block/elevator.c Tue Oct 22 00:29:25 2002
+++ edited/drivers/block/elevator.c Sun Jul 6 14:34:31 2003
@@ -180,23 +180,28 @@
 
 void elevator_noop_merge_req(struct request *req, struct request *next) {}
 
-int blkelvget_ioctl(elevator_t * elevator, blkelv_ioctl_arg_t * arg)
+int blkelvget_ioctl(request_queue_t *q, blkelv_ioctl_arg_t * arg)
 {
+        elevator_t *elevator = &q->elevator;
         blkelv_ioctl_arg_t output;
 
         output.queue_ID = elevator->queue_ID;
         output.read_latency = elevator->read_latency;
         output.write_latency = elevator->write_latency;
-        output.max_bomb_segments = 0;
-
+        output.max_bomb_segments = q->max_queue_sectors;
+        if (q->low_latency)
+                output.max_bomb_segments |= MAX_BOMB_LATENCY_MASK;
+        else
+                output.max_bomb_segments &= ~MAX_BOMB_LATENCY_MASK;
         if (copy_to_user(arg, &output, sizeof(blkelv_ioctl_arg_t)))
                 return -EFAULT;
 
         return 0;
 }
 
-int blkelvset_ioctl(elevator_t * elevator, const blkelv_ioctl_arg_t * arg)
+int blkelvset_ioctl(request_queue_t *q, const blkelv_ioctl_arg_t * arg)
 {
+        elevator_t *elevator = &q->elevator;
         blkelv_ioctl_arg_t input;
 
         if (copy_from_user(&input, arg, sizeof(blkelv_ioctl_arg_t)))
@@ -206,9 +211,19 @@
                 return -EINVAL;
         if (input.write_latency < 0)
                 return -EINVAL;
+        if (input.max_bomb_segments < 0)
+                return -EINVAL;
 
         elevator->read_latency = input.read_latency;
         elevator->write_latency = input.write_latency;
+        q->low_latency = input.max_bomb_segments & MAX_BOMB_LATENCY_MASK ? 1:0;
+        printk("queue %d: low latency mode is now %s\n", elevator->queue_ID,
+               q->low_latency ? "on" : "off");
+        input.max_bomb_segments &= ~MAX_BOMB_LATENCY_MASK;
+        if (input.max_bomb_segments)
+                q->max_queue_sectors = input.max_bomb_segments;
+        printk("queue %d: max queue sectors is now %d\n", elevator->queue_ID,
+               q->max_queue_sectors);
         return 0;
 }
 
--- 1.46/drivers/block/ll_rw_blk.c Fri Jul 4 13:35:08 2003
+++ edited/drivers/block/ll_rw_blk.c Sun Jul 6 14:34:30 2003
@@ -379,8 +379,22 @@
 {
         if (q->plugged) {
                 q->plugged = 0;
-                if (!list_empty(&q->queue_head))
+                if (!list_empty(&q->queue_head)) {
+
+                        /* we don't want merges later on to come in
+                         * and significantly increase the amount of
+                         * work during an unplug; it can lead to high
+                         * latencies while some poor waiter tries to
+                         * run an ever-increasing chunk of io.
+                         * This does lower throughput some, though.
+                         */
+                        if (q->low_latency) {
+                                struct request *rq;
+                                rq = blkdev_entry_prev_request(&q->queue_head);
+                                rq->elevator_sequence = 0;
+                        }
                         q->request_fn(q);
+                }
         }
 }
 
@@ -527,7 +541,9 @@
         q->plug_tq.routine = &generic_unplug_device;
         q->plug_tq.data = q;
         q->plugged = 0;
+        q->full = 0;
         q->can_throttle = 0;
+        q->low_latency = 0;
 
         /*
          * These booleans describe the queue properties. We set the
@@ -546,7 +562,7 @@
  * Get a free request. io_request_lock must be held and interrupts
  * disabled on the way in. Returns NULL if there are no free requests.
  */
-static struct request *get_request(request_queue_t *q, int rw)
+static struct request *__get_request(request_queue_t *q, int rw)
 {
         struct request *rq = NULL;
         struct request_list *rl;
@@ -560,11 +576,34 @@
                 rq->cmd = rw;
                 rq->special = NULL;
                 rq->q = q;
-        }
+        } else if (q->low_latency)
+                q->full = 1;
         return rq;
 }
 
 /*
+ * get a free request, honoring the queue_full condition
+ */
+static inline struct request *get_request(request_queue_t *q, int rw)
+{
+        if (q->full)
+                return NULL;
+        return __get_request(q, rw);
+}
+
+/*
+ * helper func to do memory barriers and wakeups when we finally decide
+ * to clear the queue full condition
+ */
+static inline void clear_full_and_wake(request_queue_t *q)
+{
+        q->full = 0;
+        mb();
+        if (waitqueue_active(&q->wait_for_requests))
+                wake_up(&q->wait_for_requests);
+}
+
+/*
  * Here's the request allocation design, low latency version:
  *
  * 1: Blocking on request exhaustion is a key part of I/O throttling.
@@ -612,24 +651,30 @@
 {
         register struct request *rq;
         DECLARE_WAITQUEUE(wait, current);
+        int oversized;
 
         add_wait_queue_exclusive(&q->wait_for_requests, &wait);
 
         do {
                 set_current_state(TASK_UNINTERRUPTIBLE);
                 spin_lock_irq(&io_request_lock);
-                if (blk_oversized_queue(q)) {
-                        __generic_unplug_device(q);
+                oversized = blk_oversized_queue(q);
+                if (q->full || oversized) {
+                        if (oversized)
+                                __generic_unplug_device(q);
                         spin_unlock_irq(&io_request_lock);
                         schedule();
                         spin_lock_irq(&io_request_lock);
                 }
-                rq = get_request(q, rw);
+                rq = __get_request(q, rw);
                 spin_unlock_irq(&io_request_lock);
         } while (rq == NULL);
         remove_wait_queue(&q->wait_for_requests, &wait);
         current->state = TASK_RUNNING;
 
+        if (!waitqueue_active(&q->wait_for_requests))
+                clear_full_and_wake(q);
+
         return rq;
 }
 
@@ -876,6 +921,8 @@
                         smp_mb();
                         if (waitqueue_active(&q->wait_for_requests))
                                 wake_up(&q->wait_for_requests);
+                        else
+                                clear_full_and_wake(q);
                 }
         }
 }
--- 1.24/include/linux/blkdev.h Fri Jul 4 13:18:12 2003
+++ edited/include/linux/blkdev.h Sun Jul 6 14:34:38 2003
@@ -136,12 +136,25 @@
         int head_active:1;
 
         /*
+         * Boolean that indicates whether the queue's free requests have
+         * been exhausted and the queue is waiting to drop below the
+         * batch_requests threshold
+         */
+        int full:1;
+
+        /*
          * Boolean that indicates you will use blk_started_sectors
          * and blk_finished_sectors in addition to blk_started_io
          * and blk_finished_io. It enables the throttling code to
          * help keep the sectors in flight to a reasonable value
          */
         int can_throttle:1;
+
+        /*
+         * Boolean that indicates the queue should prefer low
+         * latency over throughput. This enables the q->full checks
+         */
+        int low_latency:1;
 
         unsigned long bounce_pfn;
 
--- 1.6/include/linux/elevator.h Fri Jul 4 13:12:45 2003
+++ edited/include/linux/elevator.h Sun Jul 6 14:34:31 2003
@@ -35,14 +35,19 @@
         int queue_ID;
         int read_latency;
         int write_latency;
+/*
+ * (max_bomb_segments & MAX_BOMB_LATENCY_MASK) == 1 indicates low latency
+ * mode. We're using any odd number to indicate low latency is on.
+ */
+#define MAX_BOMB_LATENCY_MASK 1
         int max_bomb_segments;
 } blkelv_ioctl_arg_t;
 
 #define BLKELVGET _IOR(0x12,106,sizeof(blkelv_ioctl_arg_t))
 #define BLKELVSET _IOW(0x12,107,sizeof(blkelv_ioctl_arg_t))
 
-extern int blkelvget_ioctl(elevator_t *, blkelv_ioctl_arg_t *);
-extern int blkelvset_ioctl(elevator_t *, const blkelv_ioctl_arg_t *);
+extern int blkelvget_ioctl(request_queue_t *, blkelv_ioctl_arg_t *);
+extern int blkelvset_ioctl(request_queue_t *, const blkelv_ioctl_arg_t *);
 
 extern void elevator_init(elevator_t *, elevator_t);
 
