[PATCH] softirq block request completion

From: Jens Axboe
Date: Thu Oct 21 2004 - 03:53:58 EST


Hi,

I wanted to play with this after the I/O completion latency talks started
some time ago. I got an hour to do so this morning, and this is what I
came up with. It's only lightly tested, but it does boot and survives a
dbench run on IDE and SCSI.

This is just an RFC, not a call for inclusion (as if the above weren't
warning enough :-)
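
The mechanism in a nutshell - a simplified sketch of what the patch below
does, leaving out the per-queue hashing and the two-phase (lockless/locked)
completion, just to show the flow:

/* per-cpu list of finished requests, drained by the new BLOCK_SOFTIRQ */
static DEFINE_PER_CPU(struct list_head, done_list);

/* called from the driver's irq handler once the request is fully done */
void end_that_request_completely(struct request *req)
{
	unsigned long flags;

	local_irq_save(flags);
	list_add_tail(&req->donelist, &__get_cpu_var(done_list));
	raise_softirq_irqoff(BLOCK_SOFTIRQ);
	local_irq_restore(flags);
}

/* softirq handler: splice the per-cpu list and do the real end_io work */
static void blk_done_softirq(struct softirq_action *h)
{
	LIST_HEAD(local_list);
	struct request *rq;

	local_irq_disable();
	list_splice_init(&__get_cpu_var(done_list), &local_list);
	local_irq_enable();

	while (!list_empty(&local_list)) {
		rq = list_entry(local_list.next, struct request, donelist);
		list_del_init(&rq->donelist);
		end_that_request_chunk(rq, 1, rq->hard_nr_sectors << 9);
		/* end_that_request_last() then runs under ->queue_lock,
		 * see process_completion_queue() in the patch */
	}
}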

Signed-off-by: Jens Axboe <axboe@xxxxxxx>

===== drivers/block/ll_rw_blk.c 1.274 vs edited =====
--- 1.274/drivers/block/ll_rw_blk.c 2004-10-20 10:37:07 +02:00
+++ edited/drivers/block/ll_rw_blk.c 2004-10-21 10:21:57 +02:00
@@ -28,6 +28,8 @@
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/writeback.h>
+#include <linux/interrupt.h>
+#include <linux/hash.h>

/*
* for max sense size
@@ -60,13 +62,27 @@
/*
* Controlling structure to kblockd
*/
-static struct workqueue_struct *kblockd_workqueue;
+static struct workqueue_struct *kblockd_workqueue;

unsigned long blk_max_low_pfn, blk_max_pfn;

EXPORT_SYMBOL(blk_max_low_pfn);
EXPORT_SYMBOL(blk_max_pfn);

+/*
+ * per-cpu completion stuff. try to amortize the cost of getting the
+ * queue_lock by collecting queues in different lists. this should be related
+ * to NR_CPUS as well, no point in making BLK_NR_COMP_QUEUES bigger than the
+ * number of cpus in the system.
+ */
+#define BLK_NR_COMP_SHIFT 2
+#define BLK_NR_COMP_QUEUES (1 << BLK_NR_COMP_SHIFT)
+
+struct blk_completion_queue {
+ struct list_head done_list[BLK_NR_COMP_QUEUES];
+};
+static DEFINE_PER_CPU(struct blk_completion_queue, blk_cpu_done);
+
/* Amount of time in which a process may batch requests */
#define BLK_BATCH_TIME (HZ/50UL)

@@ -2968,6 +2984,113 @@
EXPORT_SYMBOL(end_that_request_chunk);

/*
+ * end io completely on this list of requests. operation is split into two
+ * parts - one that can run without the queue lock held, and one which
+ * requires it.
+ */
+#define list_entry_done(entry) list_entry((entry), struct request, donelist)
+static void process_completion_queue(struct list_head *list)
+{
+ LIST_HEAD(local_list);
+ request_queue_t *q;
+ static int max;
+ int i = 0;
+
+ while (!list_empty(list)) {
+ struct request *rq = list_entry_done(list->next);
+ unsigned int bytes_done;
+ int uptodate;
+
+ list_move(&rq->donelist, &local_list);
+ i++;
+
+ /*
+ * fs or pc request
+ */
+ if (blk_fs_request(rq))
+ bytes_done = rq->hard_nr_sectors << 9;
+ else
+ bytes_done = rq->data_len;
+
+ uptodate = 1;
+ if (rq->errors)
+ uptodate = rq->errors;
+
+ end_that_request_chunk(rq, uptodate, bytes_done);
+ add_disk_randomness(rq->rq_disk);
+ }
+
+ q = NULL;
+ list = &local_list;
+ while (!list_empty(list)) {
+ struct request *rq = list_entry_done(list->next);
+
+ list_del_init(&rq->donelist);
+
+ if (q != rq->q) {
+ if (q)
+ spin_unlock_irq(q->queue_lock);
+
+ q = rq->q;
+ spin_lock_irq(q->queue_lock);
+ }
+
+ if (blk_rq_tagged(rq))
+ blk_queue_end_tag(q, rq);
+
+ end_that_request_last(rq);
+ }
+
+ if (q)
+ spin_unlock_irq(q->queue_lock);
+}
+
+/*
+ * splice the completion data to a local structure and hand off to
+ * process_completion_queue() to complete the requests
+ */
+static void blk_done_softirq(struct softirq_action *h)
+{
+ struct blk_completion_queue local_bcq;
+ struct blk_completion_queue *bcq;
+ int i;
+
+ for (i = 0; i < BLK_NR_COMP_QUEUES; i++)
+ INIT_LIST_HEAD(&local_bcq.done_list[i]);
+
+ local_irq_disable();
+ bcq = &__get_cpu_var(blk_cpu_done);
+
+ for (i = 0; i < BLK_NR_COMP_QUEUES; i++)
+ list_splice_init(&bcq->done_list[i], &local_bcq.done_list[i]);
+
+ local_irq_enable();
+
+ for (i = 0; i < BLK_NR_COMP_QUEUES; i++)
+ process_completion_queue(&local_bcq.done_list[i]);
+}
+
+void end_that_request_completely(struct request *req)
+{
+ long index = hash_ptr(req->q, BLK_NR_COMP_SHIFT);
+ struct blk_completion_queue *bcq;
+ unsigned long flags;
+
+ INIT_LIST_HEAD(&req->donelist);
+ BUG_ON(!blk_pc_request(req) && !blk_fs_request(req));
+
+ local_irq_save(flags);
+
+ bcq = &__get_cpu_var(blk_cpu_done);
+ list_add_tail(&req->donelist, &bcq->done_list[index]);
+ raise_softirq_irqoff(BLOCK_SOFTIRQ);
+
+ local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL(end_that_request_completely);
+
+/*
* queue lock must be held
*/
void end_that_request_last(struct request *req)
@@ -3064,6 +3187,8 @@

int __init blk_dev_init(void)
{
+ int i, j;
+
kblockd_workqueue = create_workqueue("kblockd");
if (!kblockd_workqueue)
panic("Failed to create kblockd\n");
@@ -3076,6 +3201,15 @@

iocontext_cachep = kmem_cache_create("blkdev_ioc",
sizeof(struct io_context), 0, SLAB_PANIC, NULL, NULL);
+
+ for (i = 0; i < NR_CPUS; i++) {
+ struct blk_completion_queue *bcq = &per_cpu(blk_cpu_done, i);
+
+ for (j = 0; j < BLK_NR_COMP_QUEUES; j++)
+ INIT_LIST_HEAD(&bcq->done_list[j]);
+ }
+
+ open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL);

blk_max_low_pfn = max_low_pfn;
blk_max_pfn = max_pfn;
===== drivers/ide/ide-io.c 1.32 vs edited =====
--- 1.32/drivers/ide/ide-io.c 2004-10-20 18:52:21 +02:00
+++ edited/drivers/ide/ide-io.c 2004-10-21 09:02:12 +02:00
@@ -135,6 +135,12 @@
HWGROUP(drive)->hwif->ide_dma_on(drive);
}

+ if (rq_all_done(rq, nr_sectors << 9)) {
+ blkdev_dequeue_request(rq);
+ end_that_request_completely(rq);
+ goto next;
+ }
+
if (!end_that_request_first(rq, uptodate, nr_sectors)) {
add_disk_randomness(rq->rq_disk);

@@ -142,8 +148,9 @@
blk_queue_end_tag(drive->queue, rq);

blkdev_dequeue_request(rq);
- HWGROUP(drive)->rq = NULL;
end_that_request_last(rq);
+next:
+ HWGROUP(drive)->rq = NULL;
ret = 0;
}
return ret;
===== drivers/scsi/scsi_lib.c 1.137 vs edited =====
--- 1.137/drivers/scsi/scsi_lib.c 2004-10-19 08:06:58 +02:00
+++ edited/drivers/scsi/scsi_lib.c 2004-10-21 08:45:15 +02:00
@@ -522,6 +522,11 @@
struct request *req = cmd->request;
unsigned long flags;

+ if (rq_all_done(req, bytes)) {
+ end_that_request_completely(req);
+ goto next;
+ }
+
/*
* If there are blocks left over at the end, set up the command
* to queue the remainder of them.
@@ -560,6 +565,7 @@
* This will goose the queue request function at the end, so we don't
* need to worry about launching another command.
*/
+next:
scsi_next_command(cmd);
return NULL;
}
===== include/linux/blkdev.h 1.154 vs edited =====
--- 1.154/include/linux/blkdev.h 2004-10-19 11:40:18 +02:00
+++ edited/include/linux/blkdev.h 2004-10-21 10:06:44 +02:00
@@ -106,9 +106,8 @@
* try to put the fields that are referenced together in the same cacheline
*/
struct request {
- struct list_head queuelist; /* looking for ->queue? you must _not_
- * access it directly, use
- * blkdev_dequeue_request! */
+ struct list_head queuelist;
+ struct list_head donelist;
unsigned long flags; /* see REQ_ bits below */

/* Maintain bio traversal state for part by part I/O submission.
@@ -591,6 +590,16 @@
extern void end_that_request_last(struct request *);
extern int process_that_request_first(struct request *, unsigned int);
extern void end_request(struct request *req, int uptodate);
+extern void end_that_request_completely(struct request *);
+
+static inline int rq_all_done(struct request *rq, unsigned int nr_bytes)
+{
+ if ((blk_fs_request(rq) && nr_bytes >= (rq->hard_nr_sectors << 9)) ||
+ (blk_pc_request(rq) && nr_bytes >= rq->data_len))
+ return 1;
+
+ return 0;
+}

/*
* end_that_request_first/chunk() takes an uptodate argument. we account
===== include/linux/interrupt.h 1.31 vs edited =====
--- 1.31/include/linux/interrupt.h 2004-10-19 07:26:39 +02:00
+++ edited/include/linux/interrupt.h 2004-10-21 08:45:15 +02:00
@@ -89,6 +89,7 @@
NET_TX_SOFTIRQ,
NET_RX_SOFTIRQ,
SCSI_SOFTIRQ,
+ BLOCK_SOFTIRQ,
TASKLET_SOFTIRQ
};
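
For other drivers, the conversion would follow the same pattern as the
IDE/SCSI hunks above - roughly like this (illustrative sketch only,
my_end_request() is a made-up driver helper, not part of the patch):

static int my_end_request(struct request *rq, int uptodate, int nr_sectors)
{
	/*
	 * fast path: the whole request finished in one go, so dequeue it
	 * and let the softirq do the end_io work
	 */
	if (rq_all_done(rq, nr_sectors << 9)) {
		blkdev_dequeue_request(rq);
		end_that_request_completely(rq);
		return 0;
	}

	/* partial completion keeps using the old in-interrupt path */
	if (!end_that_request_first(rq, uptodate, nr_sectors)) {
		add_disk_randomness(rq->rq_disk);
		blkdev_dequeue_request(rq);
		end_that_request_last(rq);
		return 0;
	}

	return 1;
}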


--
Jens Axboe
