Re: disk IO directly from PCI memory to block device sectors

From: Jens Axboe
Date: Fri Sep 26 2008 - 07:34:52 EST


On Fri, Sep 26 2008, Jens Axboe wrote:
> Another alternative would be using splice - if the pci device exposed a
> char device node, you could support ->splice_read() there which would
> just fill the pages into the pipe buffer. Then change the block device
> fops ->splice_write() to go direct to the block device through a bio
> instead of using the page cache based generic_file_splice_write(). Such
> a change would actually make sense to do, if the block device has been
> opened with O_DIRECT. And it would get you about the same performance as
> doing it in-kernel, the only extra overhead would be two syscalls per
> 64k (well probably only one extra syscall, since you probably need an
> ioctl/syscall to initiate the in-kernel activity as well). So just about
> as free as you could get.

Something like this, totally untested but should get the point across.

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 57e2786..fd06032 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -24,6 +24,7 @@
#include <linux/uio.h>
#include <linux/namei.h>
#include <linux/log2.h>
+#include <linux/splice.h>
#include <asm/uaccess.h>
#include "internal.h"

@@ -1224,6 +1225,77 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
return blkdev_ioctl(file->f_mapping->host, file, cmd, arg);
}

+static void block_splice_end_io(struct bio *bio, int err)
+{
+ bio_put(bio);
+}
+
+static int pipe_to_disk(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
+ struct splice_desc *sd)
+{
+ struct block_device *bdev = I_BDEV(sd->u.file->f_mapping->host);
+ struct bio *bio;
+ int ret, bs;
+
+ bs = queue_hardsect_size(bdev_get_queue(bdev));
+ if (sd->pos & (bs - 1))
+ return -EINVAL;
+
+ ret = buf->ops->confirm(pipe, buf);
+ if (unlikely(ret))
+ return ret;
+
+ bio = bio_alloc(GFP_KERNEL, 1);
+ bio->bi_sector = sd->pos / bs;
+ bio->bi_bdev = bdev;
+ bio->bi_end_io = block_splice_end_io;
+
+ bio_add_page(bio, buf->page, buf->len, buf->offset);
+
+ submit_bio(WRITE, bio);
+ return buf->len;
+}
+
+/*
+ * Splice to file opened with O_DIRECT. Bypass caching completely and
+ * just go direct-to-bio
+ */
+static ssize_t __block_splice_write(struct pipe_inode_info *pipe,
+ struct file *out, loff_t *ppos, size_t len,
+ unsigned int flags)
+{
+ struct splice_desc sd = {
+ .total_len = len,
+ .flags = flags,
+ .pos = *ppos,
+ .u.file = out,
+ };
+ struct inode *inode = out->f_mapping->host;
+ ssize_t ret;
+
+ if (unlikely(*ppos & 511))
+ return -EINVAL;
+
+ inode_double_lock(inode, pipe->inode);
+ ret = __splice_from_pipe(pipe, &sd, pipe_to_disk);
+ inode_double_unlock(inode, pipe->inode);
+
+ if (ret > 0)
+ *ppos += ret;
+
+ return ret;
+}
+
+static ssize_t block_splice_write(struct pipe_inode_info *pipe,
+ struct file *out, loff_t *ppos, size_t len,
+ unsigned int flags)
+{
+ if (out->f_flags & O_DIRECT)
+ return __block_splice_write(pipe, out, ppos, len, flags);
+
+ return generic_file_splice_write(pipe, out, ppos, len, flags);
+}
+
static const struct address_space_operations def_blk_aops = {
.readpage = blkdev_readpage,
.writepage = blkdev_writepage,
@@ -1249,7 +1321,7 @@ const struct file_operations def_blk_fops = {
.compat_ioctl = compat_blkdev_ioctl,
#endif
.splice_read = generic_file_splice_read,
- .splice_write = generic_file_splice_write,
+ .splice_write = block_splice_write,
};

int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)

--
Jens Axboe

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/