[PATCH 20/22] nfs: add support for read_iter, write_iter

From: Dave Kleikamp
Date: Mon Oct 22 2012 - 11:21:01 EST


This patch implements the read_iter and write_iter file operations, which
allow kernel code to initiate direct I/O. This allows the loop device to
read and write directly to the server, bypassing the page cache.

Signed-off-by: Dave Kleikamp <dave.kleikamp@xxxxxxxxxx>
Cc: Zach Brown <zab@xxxxxxxxx>
Cc: Trond Myklebust <Trond.Myklebust@xxxxxxxxxx>
Cc: linux-nfs@xxxxxxxxxxxxxxx
---
fs/nfs/direct.c | 169 +++++++++++++++++++++++++++++++++----------------
fs/nfs/file.c | 48 ++++++++++----
fs/nfs/internal.h | 2 +
fs/nfs/nfs4file.c | 2 +
include/linux/nfs_fs.h | 6 +-
5 files changed, 155 insertions(+), 72 deletions(-)

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 4532781..b1fda1c 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -90,6 +90,7 @@ struct nfs_direct_req {
int flags;
#define NFS_ODIRECT_DO_COMMIT (1) /* an unstable reply was received */
#define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */
+#define NFS_ODIRECT_MARK_DIRTY (4) /* mark read pages dirty */
struct nfs_writeverf verf; /* unstable write verifier */
};

@@ -131,15 +132,13 @@ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,

return -EINVAL;
#else
- const struct iovec *iov = iov_iter_iovec(iter);
-
VM_BUG_ON(iocb->ki_left != PAGE_SIZE);
VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE);

if (rw == READ || rw == KERNEL_READ)
- return nfs_file_direct_read(iocb, iov, iter->nr_segs, pos,
+ return nfs_file_direct_read(iocb, iter, pos,
rw == READ ? true : false);
- return nfs_file_direct_write(iocb, iov, iter->nr_segs, pos,
+ return nfs_file_direct_write(iocb, iter, pos,
rw == WRITE ? true : false);
#endif /* CONFIG_NFS_SWAP */
}
@@ -277,7 +276,8 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
hdr->good_bytes & ~PAGE_MASK,
PAGE_SIZE);
}
- if (!PageCompound(page)) {
+ if ((dreq->flags & NFS_ODIRECT_MARK_DIRTY) &&
+ !PageCompound(page)) {
if (test_bit(NFS_IOHDR_ERROR, &hdr->flags)) {
if (bytes < hdr->good_bytes)
set_page_dirty(page);
@@ -414,10 +414,9 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de
return result < 0 ? (ssize_t) result : -EFAULT;
}

-static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
- const struct iovec *iov,
- unsigned long nr_segs,
- loff_t pos, bool uio)
+static ssize_t nfs_direct_read_schedule(struct nfs_direct_req *dreq,
+ struct iov_iter *iter, loff_t pos,
+ bool uio)
{
struct nfs_pageio_descriptor desc;
ssize_t result = -EINVAL;
@@ -429,16 +428,47 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
get_dreq(dreq);
desc.pg_dreq = dreq;

- for (seg = 0; seg < nr_segs; seg++) {
- const struct iovec *vec = &iov[seg];
- result = nfs_direct_read_schedule_segment(&desc, vec, pos, uio);
- if (result < 0)
- break;
- requested_bytes += result;
- if ((size_t)result < vec->iov_len)
- break;
- pos += vec->iov_len;
- }
+ if (iov_iter_has_iovec(iter)) {
+ const struct iovec *iov = iov_iter_iovec(iter);
+ if (uio)
+ dreq->flags = NFS_ODIRECT_MARK_DIRTY;
+ for (seg = 0; seg < iter->nr_segs; seg++) {
+ const struct iovec *vec = &iov[seg];
+ result = nfs_direct_read_schedule_segment(&desc, vec,
+ pos, uio);
+ if (result < 0)
+ break;
+ requested_bytes += result;
+ if ((size_t)result < vec->iov_len)
+ break;
+ pos += vec->iov_len;
+ }
+ } else if (iov_iter_has_bvec(iter)) {
+ struct nfs_open_context *ctx = dreq->ctx;
+ struct inode *inode = ctx->dentry->d_inode;
+ struct bio_vec *bvec = iov_iter_bvec(iter);
+ for (seg = 0; seg < iter->nr_segs; seg++) {
+ struct nfs_page *req;
+ unsigned int req_len = bvec[seg].bv_len;
+ req = nfs_create_request(ctx, inode,
+ bvec[seg].bv_page,
+ bvec[seg].bv_offset, req_len);
+ if (IS_ERR(req)) {
+ result = PTR_ERR(req);
+ break;
+ }
+ req->wb_index = pos >> PAGE_SHIFT;
+ req->wb_offset = pos & ~PAGE_MASK;
+ if (!nfs_pageio_add_request(&desc, req)) {
+ result = desc.pg_error;
+ nfs_release_request(req);
+ break;
+ }
+ requested_bytes += req_len;
+ pos += req_len;
+ }
+ } else
+ BUG();

nfs_pageio_complete(&desc);

@@ -456,8 +486,8 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq,
return 0;
}

-static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos, bool uio)
+static ssize_t nfs_direct_read(struct kiocb *iocb, struct iov_iter *iter,
+ loff_t pos, bool uio)
{
ssize_t result = -ENOMEM;
struct inode *inode = iocb->ki_filp->f_mapping->host;
@@ -469,7 +499,7 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
goto out;

dreq->inode = inode;
- dreq->bytes_left = iov_length(iov, nr_segs);
+ dreq->bytes_left = iov_iter_count(iter);
dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
l_ctx = nfs_get_lock_context(dreq->ctx);
if (IS_ERR(l_ctx)) {
@@ -480,8 +510,8 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov,
if (!is_sync_kiocb(iocb))
dreq->iocb = iocb;

- NFS_I(inode)->read_io += iov_length(iov, nr_segs);
- result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio);
+ NFS_I(inode)->read_io += iov_iter_count(iter);
+ result = nfs_direct_read_schedule(dreq, iter, pos, uio);
if (!result)
result = nfs_direct_wait(dreq);
out_release:
@@ -815,10 +845,9 @@ static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
.completion = nfs_direct_write_completion,
};

-static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
- const struct iovec *iov,
- unsigned long nr_segs,
- loff_t pos, bool uio)
+static ssize_t nfs_direct_write_schedule(struct nfs_direct_req *dreq,
+ struct iov_iter *iter, loff_t pos,
+ bool uio)
{
struct nfs_pageio_descriptor desc;
struct inode *inode = dreq->inode;
@@ -832,17 +861,48 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
get_dreq(dreq);
atomic_inc(&inode->i_dio_count);

- NFS_I(dreq->inode)->write_io += iov_length(iov, nr_segs);
- for (seg = 0; seg < nr_segs; seg++) {
- const struct iovec *vec = &iov[seg];
- result = nfs_direct_write_schedule_segment(&desc, vec, pos, uio);
- if (result < 0)
- break;
- requested_bytes += result;
- if ((size_t)result < vec->iov_len)
- break;
- pos += vec->iov_len;
- }
+ NFS_I(dreq->inode)->write_io += iov_iter_count(iter);
+
+ if (iov_iter_has_iovec(iter)) {
+ const struct iovec *iov = iov_iter_iovec(iter);
+ for (seg = 0; seg < iter->nr_segs; seg++) {
+ const struct iovec *vec = &iov[seg];
+ result = nfs_direct_write_schedule_segment(&desc, vec,
+ pos, uio);
+ if (result < 0)
+ break;
+ requested_bytes += result;
+ if ((size_t)result < vec->iov_len)
+ break;
+ pos += vec->iov_len;
+ }
+ } else if (iov_iter_has_bvec(iter)) {
+ struct nfs_open_context *ctx = dreq->ctx;
+ struct bio_vec *bvec = iov_iter_bvec(iter);
+ for (seg = 0; seg < iter->nr_segs; seg++) {
+ struct nfs_page *req;
+ unsigned int req_len = bvec[seg].bv_len;
+
+ req = nfs_create_request(ctx, inode, bvec[seg].bv_page,
+ bvec[seg].bv_offset, req_len);
+ if (IS_ERR(req)) {
+ result = PTR_ERR(req);
+ break;
+ }
+ nfs_lock_request(req);
+ req->wb_index = pos >> PAGE_SHIFT;
+ req->wb_offset = pos & ~PAGE_MASK;
+ if (!nfs_pageio_add_request(&desc, req)) {
+ result = desc.pg_error;
+ nfs_unlock_and_release_request(req);
+ break;
+ }
+ requested_bytes += req_len;
+ pos += req_len;
+ }
+ } else
+ BUG();
+
nfs_pageio_complete(&desc);

/*
@@ -860,9 +920,8 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
return 0;
}

-static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos,
- size_t count, bool uio)
+static ssize_t nfs_direct_write(struct kiocb *iocb, struct iov_iter *iter,
+ loff_t pos, bool uio)
{
ssize_t result = -ENOMEM;
struct inode *inode = iocb->ki_filp->f_mapping->host;
@@ -874,7 +933,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
goto out;

dreq->inode = inode;
- dreq->bytes_left = count;
+ dreq->bytes_left = iov_iter_count(iter);
dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
l_ctx = nfs_get_lock_context(dreq->ctx);
if (IS_ERR(l_ctx)) {
@@ -885,7 +944,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov,
if (!is_sync_kiocb(iocb))
dreq->iocb = iocb;

- result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio);
+ result = nfs_direct_write_schedule(dreq, iter, pos, uio);
if (!result)
result = nfs_direct_wait(dreq);
out_release:
@@ -897,8 +956,7 @@ out:
/**
* nfs_file_direct_read - file direct read operation for NFS files
* @iocb: target I/O control block
- * @iov: vector of user buffers into which to read data
- * @nr_segs: size of iov vector
+ * @iter: vector of buffers into which to read data
* @pos: byte offset in file where reading starts
*
* We use this function for direct reads instead of calling
@@ -915,15 +973,15 @@ out:
* client must read the updated atime from the server back into its
* cache.
*/
-ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos, bool uio)
+ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
+ loff_t pos, bool uio)
{
ssize_t retval = -EINVAL;
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
size_t count;

- count = iov_length(iov, nr_segs);
+ count = iov_iter_count(iter);
nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);

dfprintk(FILE, "NFS: direct read(%s/%s, %zd@%Ld)\n",
@@ -941,7 +999,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,

task_io_account_read(count);

- retval = nfs_direct_read(iocb, iov, nr_segs, pos, uio);
+ retval = nfs_direct_read(iocb, iter, pos, uio);
if (retval > 0)
iocb->ki_pos = pos + retval;

@@ -952,8 +1010,7 @@ out:
/**
* nfs_file_direct_write - file direct write operation for NFS files
* @iocb: target I/O control block
- * @iov: vector of user buffers from which to write data
- * @nr_segs: size of iov vector
+ * @iter: vector of buffers from which to write data
* @pos: byte offset in file where writing starts
*
* We use this function for direct writes instead of calling
@@ -971,15 +1028,15 @@ out:
* Note that O_APPEND is not supported for NFS direct writes, as there
* is no atomic O_APPEND write facility in the NFS protocol.
*/
-ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos, bool uio)
+ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
+ loff_t pos, bool uio)
{
ssize_t retval = -EINVAL;
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
size_t count;

- count = iov_length(iov, nr_segs);
+ count = iov_iter_count(iter);
nfs_add_stats(mapping->host, NFSIOS_DIRECTWRITTENBYTES, count);

dfprintk(FILE, "NFS: direct write(%s/%s, %zd@%Ld)\n",
@@ -1004,7 +1061,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,

task_io_account_write(count);

- retval = nfs_direct_write(iocb, iov, nr_segs, pos, count, uio);
+ retval = nfs_direct_write(iocb, iter, pos, uio);
if (retval > 0) {
struct inode *inode = mapping->host;

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 582bb88..b4bf6ef 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -172,28 +172,39 @@ nfs_file_flush(struct file *file, fl_owner_t id)
EXPORT_SYMBOL_GPL(nfs_file_flush);

ssize_t
-nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+nfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
{
struct dentry * dentry = iocb->ki_filp->f_path.dentry;
struct inode * inode = dentry->d_inode;
ssize_t result;

if (iocb->ki_filp->f_flags & O_DIRECT)
- return nfs_file_direct_read(iocb, iov, nr_segs, pos, true);
+ return nfs_file_direct_read(iocb, iter, pos, true);

- dprintk("NFS: read(%s/%s, %lu@%lu)\n",
+ dprintk("NFS: read_iter(%s/%s, %lu@%lu)\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
- (unsigned long) iov_length(iov, nr_segs), (unsigned long) pos);
+ (unsigned long) iov_iter_count(iter), (unsigned long) pos);

result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping);
if (!result) {
- result = generic_file_aio_read(iocb, iov, nr_segs, pos);
+ result = generic_file_read_iter(iocb, iter, pos);
if (result > 0)
nfs_add_stats(inode, NFSIOS_NORMALREADBYTES, result);
}
return result;
}
+EXPORT_SYMBOL_GPL(nfs_file_read_iter);
+
+ssize_t
+nfs_file_read(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ struct iov_iter iter;
+
+ iov_iter_init(&iter, iov, nr_segs, iov_length(iov, nr_segs), 0);
+
+ return nfs_file_read_iter(iocb, &iter, pos);
+}
EXPORT_SYMBOL_GPL(nfs_file_read);

ssize_t
@@ -610,19 +621,19 @@ static int nfs_need_sync_write(struct file *filp, struct inode *inode)
return 0;
}

-ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+ssize_t nfs_file_write_iter(struct kiocb *iocb, struct iov_iter *iter,
+ loff_t pos)
{
struct dentry * dentry = iocb->ki_filp->f_path.dentry;
struct inode * inode = dentry->d_inode;
unsigned long written = 0;
ssize_t result;
- size_t count = iov_length(iov, nr_segs);
+ size_t count = iov_iter_count(iter);

if (iocb->ki_filp->f_flags & O_DIRECT)
- return nfs_file_direct_write(iocb, iov, nr_segs, pos, true);
+ return nfs_file_direct_write(iocb, iter, pos, true);

- dprintk("NFS: write(%s/%s, %lu@%Ld)\n",
+ dprintk("NFS: write_iter(%s/%s, %lu@%lld)\n",
dentry->d_parent->d_name.name, dentry->d_name.name,
(unsigned long) count, (long long) pos);

@@ -642,7 +653,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
if (!count)
goto out;

- result = generic_file_aio_write(iocb, iov, nr_segs, pos);
+ result = generic_file_write_iter(iocb, iter, pos);
if (result > 0)
written = result;

@@ -661,6 +672,17 @@ out_swapfile:
printk(KERN_INFO "NFS: attempt to write to active swap file!\n");
goto out;
}
+EXPORT_SYMBOL_GPL(nfs_file_write_iter);
+
+ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
+{
+ struct iov_iter iter;
+
+ iov_iter_init(&iter, iov, nr_segs, iov_length(iov, nr_segs), 0);
+
+ return nfs_file_write_iter(iocb, &iter, pos);
+}
EXPORT_SYMBOL_GPL(nfs_file_write);

ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
@@ -914,6 +936,8 @@ const struct file_operations nfs_file_operations = {
.write = do_sync_write,
.aio_read = nfs_file_read,
.aio_write = nfs_file_write,
+ .read_iter = nfs_file_read_iter,
+ .write_iter = nfs_file_write_iter,
.mmap = nfs_file_mmap,
.open = nfs_file_open,
.flush = nfs_file_flush,
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 59b133c..8db3b11 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -302,10 +302,12 @@ int nfs_file_fsync_commit(struct file *, loff_t, loff_t, int);
loff_t nfs_file_llseek(struct file *, loff_t, int);
int nfs_file_flush(struct file *, fl_owner_t);
ssize_t nfs_file_read(struct kiocb *, const struct iovec *, unsigned long, loff_t);
+ssize_t nfs_file_read_iter(struct kiocb *, struct iov_iter *, loff_t);
ssize_t nfs_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *,
size_t, unsigned int);
int nfs_file_mmap(struct file *, struct vm_area_struct *);
ssize_t nfs_file_write(struct kiocb *, const struct iovec *, unsigned long, loff_t);
+ssize_t nfs_file_write_iter(struct kiocb *, struct iov_iter *, loff_t);
int nfs_file_release(struct inode *, struct file *);
int nfs_lock(struct file *, int, struct file_lock *);
int nfs_flock(struct file *, int, struct file_lock *);
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index afddd66..195188e 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -123,6 +123,8 @@ const struct file_operations nfs4_file_operations = {
.write = do_sync_write,
.aio_read = nfs_file_read,
.aio_write = nfs_file_write,
+ .read_iter = nfs_file_read_iter,
+ .write_iter = nfs_file_write_iter,
.mmap = nfs_file_mmap,
.open = nfs4_file_open,
.flush = nfs_file_flush,
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 4913e3c..9f8e8a9 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -445,11 +445,9 @@ extern int nfs3_removexattr (struct dentry *, const char *name);
* linux/fs/nfs/direct.c
*/
extern ssize_t nfs_direct_IO(int, struct kiocb *, struct iov_iter *, loff_t);
-extern ssize_t nfs_file_direct_read(struct kiocb *iocb,
- const struct iovec *iov, unsigned long nr_segs,
+extern ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
loff_t pos, bool uio);
-extern ssize_t nfs_file_direct_write(struct kiocb *iocb,
- const struct iovec *iov, unsigned long nr_segs,
+extern ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
loff_t pos, bool uio);

/*
--
1.7.12.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/