[PATCH 3/3] Zach's core aio changes to support vectored AIO

From: Badari Pulavarty
Date: Tue Mar 07 2006 - 19:20:18 EST


This work is initially done by Zach Brown to add support for
vectored aio. These are the core changes for AIO to support
IOCB_CMD_PREADV/IOCB_CMD_PWRITEV.

This work is initailly done by Zach Brown to add support for
vectored aio. These are the core changes for AIO to support
IOCB_CMD_PREADV/IOCB_CMD_PWRITEV.

Signed-off-by: Badari Pulavarty <pbadari@xxxxxxxxxx>
Index: linux-2.6.16-rc5/fs/aio.c
===================================================================
--- linux-2.6.16-rc5.orig/fs/aio.c 2006-03-07 13:44:09.000000000 -0800
+++ linux-2.6.16-rc5/fs/aio.c 2006-03-07 13:59:17.000000000 -0800
@@ -416,6 +416,7 @@ static struct kiocb fastcall *__aio_get_
req->ki_retry = NULL;
req->ki_dtor = NULL;
req->private = NULL;
+ req->ki_iovec = NULL;
INIT_LIST_HEAD(&req->ki_run_list);

/* Check if the completion queue has enough free space to
@@ -461,6 +462,8 @@ static inline void really_put_req(struct

if (req->ki_dtor)
req->ki_dtor(req);
+ if (req->ki_iovec != &req->ki_inline_vec)
+ kfree(req->ki_iovec);
kmem_cache_free(kiocb_cachep, req);
ctx->reqs_active--;

@@ -1302,69 +1305,63 @@ asmlinkage long sys_io_destroy(aio_conte
return -EINVAL;
}

-/*
- * aio_p{read,write} are the default ki_retry methods for
- * IO_CMD_P{READ,WRITE}. They maintains kiocb retry state around potentially
- * multiple calls to f_op->aio_read(). They loop around partial progress
- * instead of returning -EIOCBRETRY because they don't have the means to call
- * kick_iocb().
- */
-static ssize_t aio_pread(struct kiocb *iocb)
+static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret)
{
- struct file *file = iocb->ki_filp;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- ssize_t ret = 0;
+ struct iovec *iov = &iocb->ki_iovec[iocb->ki_cur_seg];

- do {
- iocb->ki_inline_vec.iov_base = iocb->ki_buf;
- iocb->ki_inline_vec.iov_len = iocb->ki_left;
+ BUG_ON(ret <= 0);

- ret = file->f_op->aio_read(iocb, &iocb->ki_inline_vec,
- 1, iocb->ki_pos);
- /*
- * Can't just depend on iocb->ki_left to determine
- * whether we are done. This may have been a short read.
- */
- if (ret > 0) {
- iocb->ki_buf += ret;
- iocb->ki_left -= ret;
+ while (iocb->ki_cur_seg < iocb->ki_nr_segs && ret > 0) {
+ ssize_t this = min(iov->iov_len, (size_t)ret);
+ iov->iov_base += this;
+ iov->iov_len -= this;
+ iocb->ki_left -= this;
+ ret -= this;
+ if (iov->iov_len == 0) {
+ iocb->ki_cur_seg++;
+ iov++;
}
+ }

- /*
- * For pipes and sockets we return once we have some data; for
- * regular files we retry till we complete the entire read or
- * find that we can't read any more data (e.g short reads).
- */
- } while (ret > 0 && iocb->ki_left > 0 &&
- !S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode));
-
- /* This means we must have transferred all that we could */
- /* No need to retry anymore */
- if ((ret == 0) || (iocb->ki_left == 0))
- ret = iocb->ki_nbytes - iocb->ki_left;
-
- return ret;
+ /* the caller should not have done more io than what fit in
+ * the remaining iovecs */
+ BUG_ON(ret > 0 && iocb->ki_left == 0);
}

-/* see aio_pread() */
-static ssize_t aio_pwrite(struct kiocb *iocb)
+static ssize_t aio_rw_vect_retry(struct kiocb *iocb)
{
struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ struct inode *inode = mapping->host;
+ ssize_t (*rw_op)(struct kiocb *, const struct iovec *,
+ unsigned long, loff_t);
ssize_t ret = 0;
+ unsigned short opcode;
+
+ if ((iocb->ki_opcode == IOCB_CMD_PREADV) ||
+ (iocb->ki_opcode == IOCB_CMD_PREAD)) {
+ rw_op = file->f_op->aio_read;
+ opcode = IOCB_CMD_PREADV;
+ } else {
+ rw_op = file->f_op->aio_write;
+ opcode = IOCB_CMD_PWRITEV;
+ }

do {
- iocb->ki_inline_vec.iov_base = iocb->ki_buf;
- iocb->ki_inline_vec.iov_len = iocb->ki_left;
+ ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg],
+ iocb->ki_nr_segs - iocb->ki_cur_seg,
+ iocb->ki_pos);
+ if (ret > 0)
+ aio_advance_iovec(iocb, ret);

- ret = file->f_op->aio_write(iocb, &iocb->ki_inline_vec,
- 1, iocb->ki_pos);
- if (ret > 0) {
- iocb->ki_buf += ret;
- iocb->ki_left -= ret;
- }
- } while (ret > 0 && iocb->ki_left > 0);
+ /* retry all partial writes. retry partial reads as long as its a
+ * regular file. */
+ } while (ret > 0 && iocb->ki_left > 0 &&
+ (opcode == IOCB_CMD_PWRITEV ||
+ (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode))));

+ /* This means we must have transferred all that we could */
+ /* No need to retry anymore */
if ((ret == 0) || (iocb->ki_left == 0))
ret = iocb->ki_nbytes - iocb->ki_left;

@@ -1391,6 +1388,38 @@ static ssize_t aio_fsync(struct kiocb *i
return ret;
}

+static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb)
+{
+ ssize_t ret;
+
+ ret = rw_copy_check_uvector((struct iovec __user *)kiocb->ki_buf,
+ kiocb->ki_nbytes, 1,
+ &kiocb->ki_inline_vec, &kiocb->ki_iovec);
+ if (ret < 0)
+ goto out;
+
+ kiocb->ki_nr_segs = kiocb->ki_nbytes;
+ kiocb->ki_cur_seg = 0;
+ /* ki_nbytes/left now reflect bytes instead of segs */
+ kiocb->ki_nbytes = ret;
+ kiocb->ki_left = ret;
+
+ ret = 0;
+out:
+ return ret;
+}
+
+static ssize_t aio_setup_single_vector(struct kiocb *kiocb)
+{
+ kiocb->ki_iovec = &kiocb->ki_inline_vec;
+ kiocb->ki_iovec->iov_base = kiocb->ki_buf;
+ kiocb->ki_iovec->iov_len = kiocb->ki_left;
+ kiocb->ki_nr_segs = 1;
+ kiocb->ki_cur_seg = 0;
+ kiocb->ki_nbytes = kiocb->ki_left;
+ return 0;
+}
+
/*
* aio_setup_iocb:
* Performs the initial checks and aio retry method
@@ -1413,9 +1442,12 @@ static ssize_t aio_setup_iocb(struct kio
ret = security_file_permission(file, MAY_READ);
if (unlikely(ret))
break;
+ ret = aio_setup_single_vector(kiocb);
+ if (ret)
+ break;
ret = -EINVAL;
if (file->f_op->aio_read)
- kiocb->ki_retry = aio_pread;
+ kiocb->ki_retry = aio_rw_vect_retry;
break;
case IOCB_CMD_PWRITE:
ret = -EBADF;
@@ -1428,9 +1460,34 @@ static ssize_t aio_setup_iocb(struct kio
ret = security_file_permission(file, MAY_WRITE);
if (unlikely(ret))
break;
+ ret = aio_setup_single_vector(kiocb);
+ if (ret)
+ break;
+ ret = -EINVAL;
+ if (file->f_op->aio_write)
+ kiocb->ki_retry = aio_rw_vect_retry;
+ break;
+ case IOCB_CMD_PREADV:
+ ret = -EBADF;
+ if (unlikely(!(file->f_mode & FMODE_READ)))
+ break;
+ ret = aio_setup_vectored_rw(kiocb);
+ if (ret)
+ break;
+ ret = -EINVAL;
+ if (file->f_op->aio_read)
+ kiocb->ki_retry = aio_rw_vect_retry;
+ break;
+ case IOCB_CMD_PWRITEV:
+ ret = -EBADF;
+ if (unlikely(!(file->f_mode & FMODE_WRITE)))
+ break;
+ ret = aio_setup_vectored_rw(kiocb);
+ if (ret)
+ break;
ret = -EINVAL;
if (file->f_op->aio_write)
- kiocb->ki_retry = aio_pwrite;
+ kiocb->ki_retry = aio_rw_vect_retry;
break;
case IOCB_CMD_FDSYNC:
ret = -EINVAL;
Index: linux-2.6.16-rc5/include/linux/aio.h
===================================================================
--- linux-2.6.16-rc5.orig/include/linux/aio.h 2006-03-07 13:44:09.000000000 -0800
+++ linux-2.6.16-rc5/include/linux/aio.h 2006-03-07 13:59:17.000000000 -0800
@@ -7,6 +7,7 @@
#include <linux/uio.h>

#include <asm/atomic.h>
+#include <linux/uio.h>

#define AIO_MAXSEGS 4
#define AIO_KIOGRP_NR_ATOMIC 8
@@ -114,6 +115,9 @@ struct kiocb {
long ki_kicked; /* just for testing */
long ki_queued; /* just for testing */
struct iovec ki_inline_vec; /* inline vector */
+ struct iovec *ki_iovec;
+ unsigned long ki_nr_segs;
+ unsigned long ki_cur_seg;

struct list_head ki_list; /* the aio core uses this
* for cancellation */
Index: linux-2.6.16-rc5/include/linux/aio_abi.h
===================================================================
--- linux-2.6.16-rc5.orig/include/linux/aio_abi.h 2006-03-07 13:44:09.000000000 -0800
+++ linux-2.6.16-rc5/include/linux/aio_abi.h 2006-03-07 13:59:17.000000000 -0800
@@ -41,6 +41,8 @@ enum {
* IOCB_CMD_POLL = 5,
*/
IOCB_CMD_NOOP = 6,
+ IOCB_CMD_PREADV = 7,
+ IOCB_CMD_PWRITEV = 8,
};

/* read() from /dev/aio returns these structures. */
Index: linux-2.6.16-rc5/fs/read_write.c
===================================================================
--- linux-2.6.16-rc5.orig/fs/read_write.c 2006-03-07 13:59:14.000000000 -0800
+++ linux-2.6.16-rc5/fs/read_write.c 2006-03-07 13:59:17.000000000 -0800
@@ -513,72 +513,103 @@ ssize_t do_loop_readv_writev(struct file
/* A write operation does a read from user space and vice versa */
#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)

+ssize_t rw_copy_check_uvector(const struct iovec __user * uvector,
+ unsigned long nr_segs, unsigned long fast_segs,
+ struct iovec *fast_pointer,
+ struct iovec **ret_pointer)
+ {
+ unsigned long seg;
+ ssize_t ret;
+ struct iovec *iov = fast_pointer;
+
+ /*
+ * SuS says "The readv() function *may* fail if the iovcnt argument
+ * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
+ * traditionally returned zero for zero segments, so...
+ */
+ if (nr_segs == 0) {
+ ret = 0;
+ goto out;
+ }
+
+ /*
+ * First get the "struct iovec" from user memory and
+ * verify all the pointers
+ */
+ if ((nr_segs > UIO_MAXIOV) || (nr_segs <= 0)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (nr_segs > fast_segs) {
+ iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
+ if (iov == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+ if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ /*
+ * According to the Single Unix Specification we should return EINVAL
+ * if an element length is < 0 when cast to ssize_t or if the
+ * total length would overflow the ssize_t return value of the
+ * system call.
+ */
+ ret = 0;
+ for (seg = 0; seg < nr_segs; seg++) {
+ void __user *buf = iov[seg].iov_base;
+ ssize_t len = (ssize_t)iov[seg].iov_len;
+
+ /* see if we we're about to use an invalid len or if
+ * it's about to overflow ssize_t */
+ if (len < 0 || (ret + len < ret)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (unlikely(!access_ok(vrfy_dir(type), buf, len))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ ret += len;
+ }
+out:
+ *ret_pointer = iov;
+ return ret;
+}
+
+/* A write operation does a read from user space and vice versa */
+#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
+
static ssize_t do_readv_writev(int type, struct file *file,
const struct iovec __user * uvector,
unsigned long nr_segs, loff_t *pos)
{
- size_t tot_len;
+ ssize_t tot_len;
struct iovec iovstack[UIO_FASTIOV];
struct iovec *iov = iovstack;
ssize_t ret;
- int seg;
io_fn_t fn;
iov_fn_t fnv;

- /*
- * SuS says "The readv() function *may* fail if the iovcnt argument
- * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
- * traditionally returned zero for zero segments, so...
- */
- ret = 0;
- if (nr_segs == 0)
- goto out;
-
- /*
- * First get the "struct iovec" from user memory and
- * verify all the pointers
- */
- ret = -EINVAL;
- if ((nr_segs > UIO_MAXIOV) || (nr_segs <= 0))
- goto out;
- if (!file->f_op)
+ if (!file->f_op) {
+ ret = -EINVAL;
goto out;
- if (nr_segs > UIO_FASTIOV) {
- ret = -ENOMEM;
- iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
- if (!iov)
- goto out;
}
- ret = -EFAULT;
- if (copy_from_user(iov, uvector, nr_segs*sizeof(*uvector)))
+
+ ret = rw_copy_check_uvector(uvector, nr_segs, ARRAY_SIZE(iovstack),
+ iovstack, &iov);
+
+ if (ret < 0)
goto out;

- /*
- * Single unix specification:
- * We should -EINVAL if an element length is not >= 0 and fitting an
- * ssize_t. The total length is fitting an ssize_t
- *
- * Be careful here because iov_len is a size_t not an ssize_t
- */
- tot_len = 0;
- ret = -EINVAL;
- for (seg = 0; seg < nr_segs; seg++) {
- void __user *buf = iov[seg].iov_base;
- ssize_t len = (ssize_t)iov[seg].iov_len;
-
- if (len < 0) /* size_t not fitting an ssize_t .. */
- goto out;
- if (unlikely(!access_ok(vrfy_dir(type), buf, len)))
- goto Efault;
- tot_len += len;
- if ((ssize_t)tot_len < 0) /* maths overflow on the ssize_t */
- goto out;
- }
- if (tot_len == 0) {
- ret = 0;
+ if (ret == 0)
goto out;
- }

+ tot_len = ret;
ret = rw_verify_area(type, file, pos, tot_len);
if (ret < 0)
goto out;
@@ -610,9 +641,6 @@ out:
fsnotify_modify(file->f_dentry);
}
return ret;
-Efault:
- ret = -EFAULT;
- goto out;
}

ssize_t vfs_readv(struct file *file, const struct iovec __user *vec,
Index: linux-2.6.16-rc5/include/linux/fs.h
===================================================================
--- linux-2.6.16-rc5.orig/include/linux/fs.h 2006-03-07 13:59:14.000000000 -0800
+++ linux-2.6.16-rc5/include/linux/fs.h 2006-03-07 13:59:17.000000000 -0800
@@ -1050,6 +1050,11 @@ struct inode_operations {

struct seq_file;

+ssize_t rw_copy_check_uvector(const struct iovec __user * uvector,
+ unsigned long nr_segs, unsigned long fast_segs,
+ struct iovec *fast_pointer,
+ struct iovec **ret_pointer);
+
extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t vfs_readv(struct file *, const struct iovec __user *,