[PATCH v3 4/4] block: loop: support DIO & AIO
From: Ming Lei
Date: Wed May 06 2015 - 13:09:04 EST
There are three main advantages to using direct I/O and AIO on the
loop device's backing file:
1) double caching is avoided, which reduces memory usage a lot
2) unlike user-space direct I/O, there is no page-pinning cost,
because the pages are already held by the bio submitted to loop
3) context switches can be avoided while still getting good throughput
- with buffered reads, good random I/O throughput is often obtained
only if requests are submitted concurrently from many tasks; for
sequential I/O, however, most requests are served from the page cache,
so concurrent submission mostly adds unnecessary context switches
without improving throughput much. There has been discussion[1] of
using non-blocking buffered I/O to address this problem for
applications.
- with direct I/O and AIO, concurrent submission can be avoided,
and random read throughput is not hurt at the same time
My fio test results follow:
1. 16-job fio test inside an ext4 file system over the loop block device
1) How to run
- linux kernel: 4.1.0-rc2-next-20150506 with the patchset
- the loop device is backed by one image file on a HDD
- fio with the psync ioengine, 16 jobs, size 400M, ext4 over the loop device
- test result: IOPS from fio output (a representative command line
is sketched below)
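Each of the four cases was driven by an fio command along these lines
(an illustrative sketch only; /mnt/loop-ext4 stands in for the actual
ext4 mount point on top of the loop device, and only --rw changes
between randread/read/randwrite/write):

  fio --name=test --rw=randread --ioengine=psync --numjobs=16 \
      --size=400M --directory=/mnt/loop-ext4 --group_reporting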
2) Throughput result (IOPS):
-------------------------------------------------------------
test cases    |randread |read  |randwrite |write  |
-------------------------------------------------------------
base          |240      |8705  |3763      |20914  |
-------------------------------------------------------------
base+loop aio |242      |9258  |4577      |21451  |
-------------------------------------------------------------
3) Context switches
- with loop aio, context switches decreased by ~16% for randread
and by ~33% for read
4) Memory usage
- after these four tests with loop aio: ~10% of memory is used
- after these four tests without loop aio: more than 55% of memory
is used
2. Single-job fio test inside an ext4 file system over the loop block
device (for Maxim Patlasov)
1) How to run
- linux kernel: 4.1.0-rc2-next-20150506 with the patchset
- the loop device is backed by one image file on a HDD
- fio with the psync ioengine, 1 job, size 4000M, ext4 over the loop device
- test result: IOPS from fio output (same command sketch as above,
with the parameters noted below)
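The single-job runs use the same illustrative command, only with
--numjobs=1 and --size=4000M:

  fio --name=test --rw=randread --ioengine=psync --numjobs=1 \
      --size=4000M --directory=/mnt/loop-ext4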
2) Throughput result (IOPS):
-------------------------------------------------------------
test cases    |randread |read  |randwrite |write  |
-------------------------------------------------------------
base          |109      |21180 |4192      |22782  |
-------------------------------------------------------------
base+loop aio |114      |21018 |5404      |22670  |
-------------------------------------------------------------
3) Context switches
- with loop aio, context switches decreased by ~10% for randread
and by ~50% for read
4) Memory usage
- after these four tests with loop aio: ~10% of memory is used
- after these four tests without loop aio: more than 55% of memory
is used
Both the 'context switch' and 'memory usage' numbers above are taken
from sar.
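For reference, they can be reproduced with the standard sysstat tool,
for example:

  sar -w 1     # context switches per second (cswch/s)
  sar -r 1     # memory utilization (%memused)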
[1] https://lwn.net/Articles/612483/
[2] sar graph when running the 16-job fio test over loop without the patchset
http://kernel.ubuntu.com/~ming/block/loop-aio/v3/lo-nonaio.pdf
[3] sar graph when running the 16-job fio test over loop with the patchset
http://kernel.ubuntu.com/~ming/block/loop-aio/v3/lo-aio.pdf
[4] sar graph when running the single-job fio test over loop without the patchset
http://kernel.ubuntu.com/~ming/block/loop-aio/v3/lo-nonaio-1job.pdf
[5] sar graph when running the single-job fio test over loop with the patchset
http://kernel.ubuntu.com/~ming/block/loop-aio/v3/lo-aio-1job.pdf
Signed-off-by: Ming Lei <ming.lei@xxxxxxxxxxxxx>
---
drivers/block/loop.c | 85 ++++++++++++++++++++++++++++++++++++++++++++++++++--
drivers/block/loop.h | 4 +++
2 files changed, 86 insertions(+), 3 deletions(-)
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 94c9eec..3652581 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -388,6 +388,65 @@ static int lo_req_flush(struct loop_device *lo, struct request *rq)
return ret;
}
+static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
+{
+ struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
+ struct request *rq = cmd->rq;
+
+ if (ret > 0)
+ ret = 0;
+ else if (ret < 0)
+ ret = -EIO;
+
+ rq->errors = ret;
+ blk_mq_complete_request(rq);
+}
+
+static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
+ loff_t pos, bool rw)
+{
+ struct iov_iter iter;
+ struct bio_vec *bvec;
+ struct bio *bio = cmd->rq->bio;
+ struct file *file = lo->lo_backing_file;
+ int ret;
+
+ /* nomerge for loop request queue */
+ WARN_ON(cmd->rq->bio != cmd->rq->biotail);
+
+ bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+ iov_iter_bvec(&iter, ITER_BVEC | rw, bvec,
+ bio_segments(bio), blk_rq_bytes(cmd->rq));
+
+ cmd->iocb.ki_pos = pos;
+ cmd->iocb.ki_filp = file;
+ cmd->iocb.ki_complete = lo_rw_aio_complete;
+ cmd->iocb.ki_flags = IOCB_DONT_DIRTY_PAGE | IOCB_DIRECT;
+
+ if (rw == WRITE)
+ ret = file->f_op->write_iter(&cmd->iocb, &iter);
+ else
+ ret = file->f_op->read_iter(&cmd->iocb, &iter);
+
+ if (ret != -EIOCBQUEUED)
+ cmd->iocb.ki_complete(&cmd->iocb, ret, 0);
+ return 0;
+}
+
+
+static inline int lo_rw_simple(struct loop_device *lo,
+ struct request *rq, loff_t pos, bool rw)
+{
+ struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+
+ if (cmd->use_aio)
+ return lo_rw_aio(lo, cmd, pos, rw);
+
+ if (rw == WRITE)
+ return lo_write_simple(lo, rq, pos);
+ else
+ return lo_read_simple(lo, rq, pos);
+}
static int do_req_filebacked(struct loop_device *lo, struct request *rq)
{
@@ -404,13 +463,13 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq)
else if (lo->transfer)
ret = lo_write_transfer(lo, rq, pos);
else
- ret = lo_write_simple(lo, rq, pos);
+ ret = lo_rw_simple(lo, rq, pos, WRITE);
} else {
if (lo->transfer)
ret = lo_read_transfer(lo, rq, pos);
else
- ret = lo_read_simple(lo, rq, pos);
+ ret = lo_rw_simple(lo, rq, pos, READ);
}
return ret;
@@ -441,6 +500,12 @@ static void do_loop_switch(struct loop_device *lo, struct switch_request *p)
mapping->host->i_bdev->bd_block_size : PAGE_SIZE;
lo->old_gfp_mask = mapping_gfp_mask(mapping);
mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
+
+ lo->support_dio = mapping->a_ops && mapping->a_ops->direct_IO;
+ if (lo->support_dio)
+ lo->use_aio = true;
+ else
+ lo->use_aio = false;
}
/*
@@ -761,6 +826,13 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
blk_queue_flush(lo->lo_queue, REQ_FLUSH);
+ /* use aio if it is possible */
+ lo->support_dio = mapping->a_ops && mapping->a_ops->direct_IO;
+ if (lo->support_dio)
+ lo->use_aio = true;
+ else
+ lo->use_aio = false;
+
set_capacity(lo->lo_disk, size);
bd_set_size(bdev, size << 9);
loop_sysfs_init(lo);
@@ -1451,6 +1523,12 @@ static int loop_queue_rq(struct blk_mq_hw_ctx *hctx,
if (lo->lo_state != Lo_bound)
return -EIO;
+ if (lo->use_aio && !lo->transfer &&
+ !(cmd->rq->cmd_flags & (REQ_FLUSH | REQ_DISCARD)))
+ cmd->use_aio = true;
+ else
+ cmd->use_aio = false;
+
queue_kthread_work(&lo->worker, &cmd->work);
return BLK_MQ_RQ_QUEUE_OK;
@@ -1470,7 +1548,8 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
failed:
if (ret)
cmd->rq->errors = -EIO;
- blk_mq_complete_request(cmd->rq);
+ if (!cmd->use_aio || ret)
+ blk_mq_complete_request(cmd->rq);
}
static void loop_queue_work(struct kthread_work *work)
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 54c6aa5..0af40a0 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -58,6 +58,8 @@ struct loop_device {
struct mutex lo_ctl_mutex;
struct kthread_worker worker;
struct task_struct *worker_task;
+ bool support_dio;
+ bool use_aio;
struct request_queue *lo_queue;
struct blk_mq_tag_set tag_set;
@@ -68,6 +70,8 @@ struct loop_cmd {
struct kthread_work work;
struct request *rq;
struct list_head list;
+ bool use_aio; /* use AIO interface to handle I/O */
+ struct kiocb iocb;
};
/* Support for loadable transfer modules */
--
1.9.1