Re: [PATCH] io stalls

From: Chris Mason (mason@suse.com)
Date: Thu Jun 12 2003 - 11:06:43 EST


On Wed, 2003-06-11 at 23:20, Nick Piggin wrote:

>
> I think the cpu utilization gain of waking a number of tasks
> at once would be outweighed by advantage of waking 1 task
> and not putting it to sleep again for a number of requests.
> You obviously are not claiming concurrency improvements, as
> your method would also increase contention on the io lock
> (or the queue lock in 2.5).

I've been trying variations on this for a few days; none have been
thrilling, but the end result is better dbench and iozone throughput
overall. For the 20-writer iozone test, rc7 got an average throughput
of 3MB/s, and yesterday's latency patch got 500k/s or so. Ouch.

This gets us up to 1.2MB/s. I'm keeping yesterday's
get_request_wait_wakeup(), which wakes up a waiter instead of unplugging.

The basic idea here is that after a process is woken up and grabs a
request, it becomes the batch owner. Batch owners get to ignore the
q->full flag for either 1/5 second or 32 requests, whichever comes
first. The timer part is an attempt at preventing memory-pressure
writers (who go one request at a time) from holding onto batch
ownership for too long. Latency stats after dbench 50:

device 08:01: num_req 120077, total jiffies waited 663231
        65538 forced to wait
        1 min wait, 175 max wait
        10 average wait
        65296 < 100, 242 < 200, 0 < 300, 0 < 400, 0 < 500
        0 waits longer than 500 jiffies

Good system-wide latency comes from fair waiting, but it also comes from
how fast we can run write_some_buffers(), since that is the unit of
throttling. Hopefully this patch decreases the time write_some_buffers()
takes compared with the past latency patches, or gives someone else
a better idea ;-)
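
In case the decision logic is easier to follow outside a diff, here's a
tiny userspace sketch of the batch ownership rules (a model only: the
struct, the simulated jiffies counter, and main() are made up for
illustration, and the attached patch is what actually runs):

/*
 * Userspace model of the batch-owner policy: the owner may ignore the
 * queue-full flag for up to BATCH_REQUESTS requests or BATCH_JIFFIES
 * jiffies, whichever runs out first.
 */
#include <stdio.h>

#define HZ             100
#define BATCH_JIFFIES  (HZ / 5)        /* ownership lasts at most 1/5 second */
#define BATCH_REQUESTS 32              /* ...or 32 requests, whichever comes first */

static unsigned long jiffies;          /* simulated clock */

struct batch_state {
        int owner;                     /* "task" that currently owns the batch */
        int remaining;                 /* requests left in this batch */
        unsigned long stamp;           /* jiffies when ownership was taken */
};

/* A freshly woken waiter takes ownership, unless someone else's batch is still fresh. */
static void set_batch_owner(struct batch_state *b, int task)
{
        if (b->owner == task)
                return;
        if (b->owner && jiffies - b->stamp < BATCH_JIFFIES)
                return;
        b->owner = task;
        b->stamp = jiffies;
        b->remaining = BATCH_REQUESTS;
}

/* Returns 1 if this task may ignore queue-full; drops ownership once the batch is spent. */
static int may_ignore_queue_full(struct batch_state *b, int task)
{
        if (b->owner != task)
                return 0;
        if (--b->remaining > 0 && jiffies - b->stamp < BATCH_JIFFIES)
                return 1;
        b->owner = 0;                  /* batch used up: next waiter gets a turn */
        return 1;                      /* the current request still goes through */
}

int main(void)
{
        struct batch_state b = { 0, 0, 0 };
        int task = 1, granted = 0;

        set_batch_owner(&b, task);
        while (may_ignore_queue_full(&b, task)) {
                granted++;
                jiffies++;             /* pretend one jiffy passes per request */
        }
        printf("task %d got %d requests before losing ownership\n", task, granted);
        return 0;
}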

Attached is an incremental over yesterday's io-stalls-5.diff.

-chris

diff -u edited/drivers/block/ll_rw_blk.c edited/drivers/block/ll_rw_blk.c
--- edited/drivers/block/ll_rw_blk.c Wed Jun 11 13:36:10 2003
+++ edited/drivers/block/ll_rw_blk.c Thu Jun 12 11:53:03 2003
@@ -437,6 +437,12 @@
         nr_requests = 128;
         if (megs < 32)
                 nr_requests /= 2;
+        q->batch_owner[0] = NULL;
+        q->batch_owner[1] = NULL;
+        q->batch_remaining[0] = 0;
+        q->batch_remaining[1] = 0;
+        q->batch_jiffies[0] = 0;
+        q->batch_jiffies[1] = 0;
         blk_grow_request_list(q, nr_requests);
 
         init_waitqueue_head(&q->wait_for_requests[0]);
@@ -558,6 +564,31 @@
         blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
 }
 
+#define BATCH_JIFFIES (HZ/5)
+static void check_batch_owner(request_queue_t *q, int rw)
+{
+        if (q->batch_owner[rw] != current)
+                return;
+        if (--q->batch_remaining[rw] > 0 &&
+            jiffies - q->batch_jiffies[rw] < BATCH_JIFFIES) {
+                return;
+        }
+        q->batch_owner[rw] = NULL;
+}
+
+static void set_batch_owner(request_queue_t *q, int rw)
+{
+        struct task_struct *tsk = current;
+        if (q->batch_owner[rw] == tsk)
+                return;
+        if (q->batch_owner[rw] &&
+            jiffies - q->batch_jiffies[rw] < BATCH_JIFFIES)
+                return;
+        q->batch_jiffies[rw] = jiffies;
+        q->batch_owner[rw] = current;
+        q->batch_remaining[rw] = q->batch_requests;
+}
+
 #define blkdev_free_rq(list) list_entry((list)->next, struct request, queue);
 /*
  * Get a free request. io_request_lock must be held and interrupts
@@ -587,9 +618,13 @@
  */
 static inline struct request *get_request(request_queue_t *q, int rw)
 {
-        if (queue_full(q, rw))
+        struct request *rq;
+        if (queue_full(q, rw) && q->batch_owner[rw] != current)
                 return NULL;
-        return __get_request(q, rw);
+        rq = __get_request(q, rw);
+        if (rq)
+                check_batch_owner(q, rw);
+        return rq;
 }
 
 /*
@@ -657,9 +692,9 @@
 
         add_wait_queue_exclusive(&q->wait_for_requests[rw], &wait);
 
+        spin_lock_irq(&io_request_lock);
         do {
                 set_current_state(TASK_UNINTERRUPTIBLE);
-                spin_lock_irq(&io_request_lock);
                 if (queue_full(q, rw) || q->rq[rw].count == 0) {
                         if (q->rq[rw].count == 0)
                                 __generic_unplug_device(q);
@@ -668,8 +703,9 @@
                         spin_lock_irq(&io_request_lock);
                 }
                 rq = __get_request(q, rw);
-                spin_unlock_irq(&io_request_lock);
         } while (rq == NULL);
+        set_batch_owner(q, rw);
+        spin_unlock_irq(&io_request_lock);
         remove_wait_queue(&q->wait_for_requests[rw], &wait);
         current->state = TASK_RUNNING;
 
@@ -1010,6 +1046,7 @@
         struct list_head *head, *insert_here;
         int latency;
         elevator_t *elevator = &q->elevator;
+        int need_unplug = 0;
 
         count = bh->b_size >> 9;
         sector = bh->b_rsector;
@@ -1145,8 +1182,8 @@
                                 spin_unlock_irq(&io_request_lock);
                                 freereq = __get_request_wait(q, rw);
                                 head = &q->queue_head;
+                                need_unplug = 1;
                                 spin_lock_irq(&io_request_lock);
-                                get_request_wait_wakeup(q, rw);
                                 goto again;
                         }
                 }
@@ -1174,6 +1211,8 @@
 out:
         if (freereq)
                 blkdev_release_request(freereq);
+        if (need_unplug)
+                get_request_wait_wakeup(q, rw);
         spin_unlock_irq(&io_request_lock);
         return 0;
 end_io:
diff -u edited/include/linux/blkdev.h edited/include/linux/blkdev.h
--- edited/include/linux/blkdev.h Wed Jun 11 09:56:55 2003
+++ edited/include/linux/blkdev.h Thu Jun 12 09:44:26 2003
@@ -92,6 +92,10 @@
          */
         int batch_requests;
 
+        struct task_struct *batch_owner[2];
+        int batch_remaining[2];
+        unsigned long batch_jiffies[2];
+
         /*
          * Together with queue_head for cacheline sharing
          */
