[RFC PATCH 08/09] Implement direct file I/O interfaces

From: Long Li
Date: Thu May 17 2018 - 19:27:08 EST


From: Long Li <longli@xxxxxxxxxxxxx>

Implement the main filesystem interface for doing read and write. These functions
don't copy the user data into a kernel buffer for data transfer. Pages are directly
pinned and passed to the RDMA transport.

Signed-off-by: Long Li <longli@xxxxxxxxxxxxx>
---
fs/cifs/cifsfs.c | 19 ++++
fs/cifs/cifsfs.h | 3 +
fs/cifs/file.c | 322 ++++++++++++++++++++++++++++++++++++++++++++++++++++---
3 files changed, 329 insertions(+), 15 deletions(-)

diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index f715609..ba19fed 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1118,6 +1118,25 @@ const struct file_operations cifs_file_direct_ops = {
.fallocate = cifs_fallocate,
};

+/*
+ * File operations table whose read_iter/write_iter entries are the
+ * direct (no kernel-buffer copy) implementations cifs_direct_readv() and
+ * cifs_direct_writev().  All other entries are the handlers named below.
+ */
+const struct file_operations cifs_file_direct_rdma_ops = {
+	/* data path */
+	.read_iter = cifs_direct_readv,
+	.write_iter = cifs_direct_writev,
+	.splice_read = generic_file_splice_read,
+	.splice_write = iter_file_splice_write,
+	.mmap = cifs_file_mmap,
+	.copy_file_range = cifs_copy_file_range,
+	.clone_file_range = cifs_clone_file_range,
+	.fallocate = cifs_fallocate,
+
+	/* handle lifetime and positioning */
+	.open = cifs_open,
+	.release = cifs_close,
+	.llseek = cifs_llseek,
+	.flush = cifs_flush,
+	.fsync = cifs_fsync,
+
+	/* locking, leases and ioctl */
+	.lock = cifs_lock,
+	.setlease = cifs_setlease,
+	.unlocked_ioctl = cifs_ioctl,
+};
+
const struct file_operations cifs_file_nobrl_ops = {
.read_iter = cifs_loose_read_iter,
.write_iter = cifs_file_write_iter,
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 013ba2a..223cca8 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -94,6 +94,7 @@ extern const struct inode_operations cifs_dfs_referral_inode_operations;
/* Functions related to files and directories */
extern const struct file_operations cifs_file_ops;
extern const struct file_operations cifs_file_direct_ops; /* if directio mnt */
+extern const struct file_operations cifs_file_direct_rdma_ops; /* if directio mnt */
extern const struct file_operations cifs_file_strict_ops; /* if strictio mnt */
extern const struct file_operations cifs_file_nobrl_ops; /* no brlocks */
extern const struct file_operations cifs_file_direct_nobrl_ops;
@@ -102,8 +103,10 @@ extern int cifs_open(struct inode *inode, struct file *file);
extern int cifs_close(struct inode *inode, struct file *file);
extern int cifs_closedir(struct inode *inode, struct file *file);
extern ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to);
+extern ssize_t cifs_direct_readv(struct kiocb *iocb, struct iov_iter *to);
extern ssize_t cifs_strict_readv(struct kiocb *iocb, struct iov_iter *to);
extern ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from);
+extern ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from);
extern ssize_t cifs_strict_writev(struct kiocb *iocb, struct iov_iter *from);
extern int cifs_lock(struct file *, int, struct file_lock *);
extern int cifs_fsync(struct file *, loff_t, loff_t, int);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index e240c7c..0b394db 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2452,15 +2452,46 @@ cifs_uncached_writedata_release(struct kref *refcount)
int i;
struct cifs_writedata *wdata = container_of(refcount,
struct cifs_writedata, refcount);
+ struct page **pages = wdata->direct_pages ? wdata->direct_pages : wdata->pages;

kref_put(&wdata->ctx->refcount, cifs_aio_ctx_release);
for (i = 0; i < wdata->nr_pages; i++)
- put_page(wdata->pages[i]);
+ put_page(pages[i]);
cifs_writedata_release(refcount);
}

static void collect_uncached_write_data(struct cifs_aio_ctx *ctx);

+/*
+ * kref release callback for a direct write request: drops one page
+ * reference on each of the nr_pages entries in direct_pages, frees the
+ * direct_pages array itself, then hands off to cifs_writedata_release().
+ */
+static void cifs_direct_writedata_release(struct kref *refcount)
+{
+	struct cifs_writedata *wd =
+		container_of(refcount, struct cifs_writedata, refcount);
+	struct page **pinned = wd->direct_pages;
+	int idx = 0;
+
+	while (idx < wd->nr_pages)
+		put_page(pinned[idx++]);
+	kvfree(pinned);
+
+	cifs_writedata_release(refcount);
+}
+
+/*
+ * Work handler run when a direct write request finishes.
+ *
+ * Under the inode's i_lock it feeds the written range to
+ * cifs_update_eof() and grows i_size if the cached server EOF is now
+ * larger.  It then signals wdata->done for the waiter and drops one
+ * reference on the request.
+ */
+static void cifs_direct_writev_complete(struct work_struct *work)
+{
+	struct cifs_writedata *wd =
+		container_of(work, struct cifs_writedata, work);
+	struct inode *ino = d_inode(wd->cfile->dentry);
+	struct cifsInodeInfo *cinode = CIFS_I(ino);
+
+	spin_lock(&ino->i_lock);
+	cifs_update_eof(cinode, wd->offset, wd->bytes);
+	if (cinode->server_eof > ino->i_size)
+		i_size_write(ino, cinode->server_eof);
+	spin_unlock(&ino->i_lock);
+
+	/* wake the submitter before giving up this handler's reference */
+	complete(&wd->done);
+	kref_put(&wd->refcount, cifs_direct_writedata_release);
+}
+
static void
cifs_uncached_writev_complete(struct work_struct *work)
{
@@ -2703,6 +2734,125 @@ static void collect_uncached_write_data(struct cifs_aio_ctx *ctx)
complete(&ctx->done);
}

+/*
+ * cifs_direct_writev - write the caller's buffer without copying it into
+ * kernel pages.
+ *
+ * The pages backing @from are pinned with iov_iter_get_pages_alloc() and
+ * handed to server->ops->async_writev() one chunk at a time; each chunk is
+ * limited to the wsize granted by wait_mtu_credits().  Every chunk is
+ * waited for before the next one is issued, so the call is synchronous.
+ *
+ * Returns the number of bytes written and advances iocb->ki_pos by that
+ * amount.  If nothing was written it returns the (<= 0) result of
+ * generic_write_checks(), -ENOSYS when the dialect has no async_writev,
+ * or the error that stopped the first chunk.
+ */
+ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from)
+{
+	struct file *file = iocb->ki_filp;
+	ssize_t total_written = 0;
+	struct cifsFileInfo *cfile;
+	struct cifs_tcon *tcon;
+	struct cifs_sb_info *cifs_sb;
+	struct TCP_Server_Info *server;
+	pid_t pid;
+	unsigned long nr_pages;
+	loff_t offset;
+	size_t len;
+	int rc;
+	struct cifs_writedata *wdata;
+
+	rc = generic_write_checks(iocb, from);
+	if (rc <= 0)
+		return rc;
+
+	/*
+	 * Sample position and length only after the checks above: they may
+	 * move iocb->ki_pos (append mode) and shorten the iterator.
+	 */
+	offset = iocb->ki_pos;
+	len = iov_iter_count(from);
+
+	cifs_sb = CIFS_FILE_SB(file);
+	cfile = file->private_data;
+	tcon = tlink_tcon(cfile->tlink);
+	server = tcon->ses->server;
+
+	if (!server->ops->async_writev)
+		return -ENOSYS;
+
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
+		pid = cfile->pid;
+	else
+		pid = current->tgid;
+
+	do {
+		unsigned int wsize, credits;
+		struct page **pagevec;
+		size_t start;
+		ssize_t cur_len;
+		unsigned long i;
+
+		rc = server->ops->wait_mtu_credits(server, cifs_sb->wsize,
+						   &wsize, &credits);
+		if (rc)
+			break;
+
+		cur_len = iov_iter_get_pages_alloc(from, &pagevec, wsize, &start);
+		if (cur_len < 0) {
+			cifs_dbg(VFS, "direct_writev couldn't get user pages (rc=%zd) iter type %d iov_offset %zu count %zu\n", cur_len, from->type, from->iov_offset, from->count);
+			dump_stack();
+			/* report the failure and hand the credits back */
+			rc = cur_len;
+			add_credits_and_wake_if(server, credits, 0);
+			break;
+		}
+
+		nr_pages = (cur_len + start + PAGE_SIZE - 1) / PAGE_SIZE;
+
+		wdata = cifs_writedata_alloc(nr_pages, pagevec,
+					     cifs_direct_writev_complete);
+		if (!wdata) {
+			/*
+			 * The pinned pages have no owner yet, so release
+			 * them here.
+			 * NOTE(review): assumes cifs_writedata_alloc() does
+			 * not take over pagevec when it returns NULL - confirm.
+			 */
+			for (i = 0; i < nr_pages; i++)
+				put_page(pagevec[i]);
+			kvfree(pagevec);
+			rc = -ENOMEM;
+			add_credits_and_wake_if(server, credits, 0);
+			break;
+		}
+
+		wdata->nr_pages = nr_pages;
+		wdata->page_offset = start;
+		wdata->pagesz = PAGE_SIZE;
+		/* bytes used in the last page: whatever the earlier pages don't hold */
+		wdata->tailsz =
+			nr_pages > 1 ?
+			cur_len - (PAGE_SIZE - start) - (nr_pages - 2) * PAGE_SIZE :
+			cur_len;
+
+		wdata->sync_mode = WB_SYNC_ALL;
+		wdata->offset = (__u64)offset;
+		wdata->cfile = cifsFileInfo_get(cfile);
+		wdata->pid = pid;
+		wdata->bytes = cur_len;
+		wdata->credits = credits;
+
+		/*
+		 * Extra reference so wdata->result can be read after the
+		 * completion handler has dropped its own reference.
+		 */
+		kref_get(&wdata->refcount);
+
+		rc = 0;
+		if (wdata->cfile->invalidHandle)
+			rc = cifs_reopen_file(wdata->cfile, false);
+		if (!rc)
+			rc = server->ops->async_writev(wdata,
+					cifs_direct_writedata_release);
+		if (rc) {
+			add_credits_and_wake_if(server, wdata->credits, 0);
+			/*
+			 * The completion handler will never run for this
+			 * request, so drop both references, using the
+			 * release that also unpins the pages.
+			 * NOTE(review): assumes ->async_writev() leaves the
+			 * refcount balanced when it fails - confirm.
+			 */
+			kref_put(&wdata->refcount,
+				 cifs_direct_writedata_release);
+			kref_put(&wdata->refcount,
+				 cifs_direct_writedata_release);
+			if (rc == -EAGAIN)
+				continue;
+			break;
+		}
+
+		wait_for_completion(&wdata->done);
+
+		rc = wdata->result;
+		kref_put(&wdata->refcount, cifs_direct_writedata_release);
+		if (rc) {
+			if (rc == -EAGAIN)
+				continue;
+			break;
+		}
+
+		iov_iter_advance(from, cur_len);
+		total_written += cur_len;
+		offset += cur_len;
+		len -= cur_len;
+	} while (len);
+
+	if (unlikely(!total_written)) {
+		printk(KERN_ERR "%s: total_written=%zd rc=%d\n", __func__, total_written, rc);
+		return rc;
+	}
+
+	iocb->ki_pos += total_written;
+	return total_written;
+}
+
ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
@@ -2942,18 +3092,30 @@ cifs_read_allocate_pages(struct cifs_readdata *rdata, unsigned int nr_pages)
return rc;
}

+/*
+ * kref release callback for a direct read request: drops one page
+ * reference on each of the nr_pages entries in direct_pages, frees the
+ * direct_pages array, then hands off to cifs_readdata_release().
+ */
+static void cifs_direct_readdata_release(struct kref *refcount)
+{
+	struct cifs_readdata *rd =
+		container_of(refcount, struct cifs_readdata, refcount);
+	struct page **pinned = rd->direct_pages;
+	unsigned int idx = 0;
+
+	while (idx < rd->nr_pages)
+		put_page(pinned[idx++]);
+	kvfree(pinned);
+
+	cifs_readdata_release(refcount);
+}
+
static void
cifs_uncached_readdata_release(struct kref *refcount)
{
struct cifs_readdata *rdata = container_of(refcount,
struct cifs_readdata, refcount);
unsigned int i;
+ struct page **pages = rdata->direct_pages ? rdata->direct_pages : rdata->pages;

kref_put(&rdata->ctx->refcount, cifs_aio_ctx_release);
- for (i = 0; i < rdata->nr_pages; i++) {
- put_page(rdata->pages[i]);
- rdata->pages[i] = NULL;
- }
+ for (i = 0; i < rdata->nr_pages; i++)
+ put_page(pages[i]);
cifs_readdata_release(refcount);
}

@@ -3013,30 +3175,32 @@ uncached_fill_pages(struct TCP_Server_Info *server,
int result = 0;
unsigned int i;
unsigned int nr_pages = rdata->nr_pages;
+ unsigned int page_offset = rdata->page_offset;

rdata->got_bytes = 0;
rdata->tailsz = PAGE_SIZE;
for (i = 0; i < nr_pages; i++) {
- struct page *page = rdata->pages[i];
+ struct page *page = rdata->direct_pages ? rdata->direct_pages[i] : rdata->pages[i];
size_t n;
+ unsigned int segment_size = rdata->pagesz;
+
+ if (i == 0)
+ segment_size -= page_offset;
+ else
+ page_offset = 0;
+

if (len <= 0) {
/* no need to hold page hostage */
- rdata->pages[i] = NULL;
rdata->nr_pages--;
put_page(page);
continue;
}
n = len;
- if (len >= PAGE_SIZE) {
+ if (len >= segment_size)
/* enough data to fill the page */
- n = PAGE_SIZE;
- len -= n;
- } else {
- zero_user(page, len, PAGE_SIZE - len);
- rdata->tailsz = len;
- len = 0;
- }
+ n = segment_size;
+ len -= n;
if (iter)
result = copy_page_from_iter(page, 0, n, iter);
#ifdef CONFIG_CIFS_SMB_DIRECT
@@ -3243,6 +3407,134 @@ collect_uncached_read_data(struct cifs_aio_ctx *ctx)
complete(&ctx->done);
}

+/*
+ * Work handler run when a direct read request finishes.
+ *
+ * Marks dirty every page, counted in pagesz steps, that falls within the
+ * first page_offset + got_bytes bytes of direct_pages, then signals
+ * rdata->done and drops one reference on the request.
+ *
+ * NOTE(review): the original carried the open question "Set them dirty?".
+ * Whether set_page_dirty() or set_page_dirty_lock() is the right
+ * primitive for these pinned pages should be confirmed.
+ */
+static void cifs_direct_readv_complete(struct work_struct *work)
+{
+	struct cifs_readdata *rd =
+		container_of(work, struct cifs_readdata, work);
+	unsigned int covered;
+	int idx;
+
+	for (idx = 0, covered = 0;
+	     covered < rd->got_bytes + rd->page_offset;
+	     idx++, covered += rd->pagesz)
+		set_page_dirty(rd->direct_pages[idx]);
+
+	/* wake the submitter before giving up this handler's reference */
+	complete(&rd->done);
+	kref_put(&rd->refcount, cifs_direct_readdata_release);
+}
+
+/*
+ * cifs_direct_readv - read into the caller's buffer without an
+ * intermediate kernel copy.
+ *
+ * The pages backing @to are pinned with iov_iter_get_pages_alloc() and
+ * handed to server->ops->async_readv() one chunk at a time; each chunk is
+ * limited to the rsize granted by wait_mtu_credits().  Every chunk is
+ * waited for before the next one is issued, so the call is synchronous.
+ * A chunk that returns fewer bytes than requested ends the read.
+ *
+ * Returns the number of bytes read and advances iocb->ki_pos by that
+ * amount.  If nothing was read it returns 0 for an empty request,
+ * -ENOSYS when the dialect has no async_readv, or the error that stopped
+ * the first chunk.
+ */
+ssize_t cifs_direct_readv(struct kiocb *iocb, struct iov_iter *to)
+{
+	size_t len, cur_len, start;
+	unsigned int npages, rsize, credits, got, i;
+	struct file *file;
+	struct cifs_sb_info *cifs_sb;
+	struct cifsFileInfo *cfile;
+	struct cifs_tcon *tcon;
+	struct page **pagevec;
+	ssize_t rc, pinned, total_read = 0;
+	struct TCP_Server_Info *server;
+	loff_t offset = iocb->ki_pos;
+	pid_t pid;
+	struct cifs_readdata *rdata;
+
+	len = iov_iter_count(to);
+	if (!len)
+		return 0;
+
+	file = iocb->ki_filp;
+	cifs_sb = CIFS_FILE_SB(file);
+	cfile = file->private_data;
+	tcon = tlink_tcon(cfile->tlink);
+	server = tcon->ses->server;
+
+	if (!server->ops->async_readv)
+		return -ENOSYS;
+
+	if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_RWPIDFORWARD)
+		pid = cfile->pid;
+	else
+		pid = current->tgid;
+
+	if ((file->f_flags & O_ACCMODE) == O_WRONLY)
+		cifs_dbg(FYI, "attempting read on write only file instance\n");
+
+	do {
+		rc = server->ops->wait_mtu_credits(server, cifs_sb->rsize,
+						   &rsize, &credits);
+		if (rc)
+			break;
+
+		cur_len = min_t(const size_t, len, rsize);
+
+		pinned = iov_iter_get_pages_alloc(to, &pagevec, cur_len, &start);
+		if (pinned < 0) {
+			cifs_dbg(VFS, "couldn't get user pages (rc=%zd) iter type %d iov_offset %zu count %zu\n", pinned, to->type, to->iov_offset, to->count);
+			dump_stack();
+			/* report the failure and hand the credits back */
+			add_credits_and_wake_if(server, credits, 0);
+			rc = pinned;
+			break;
+		}
+		/* the iterator may have yielded less than was asked for */
+		cur_len = pinned;
+		npages = (cur_len + start + PAGE_SIZE - 1) / PAGE_SIZE;
+
+		rdata = cifs_readdata_alloc(0, pagevec, cifs_direct_readv_complete);
+		if (!rdata) {
+			/*
+			 * The pinned pages have no owner yet, so release
+			 * them here.
+			 * NOTE(review): assumes cifs_readdata_alloc() does
+			 * not take over pagevec when it returns NULL - confirm.
+			 */
+			for (i = 0; i < npages; i++)
+				put_page(pagevec[i]);
+			kvfree(pagevec);
+			add_credits_and_wake_if(server, credits, 0);
+			rc = -ENOMEM;
+			break;
+		}
+
+		rdata->nr_pages = npages;
+		rdata->page_offset = start;
+		rdata->pagesz = PAGE_SIZE;
+		/* bytes used in the last page: whatever the earlier pages don't hold */
+		rdata->tailsz = npages > 1 ?
+			cur_len - (PAGE_SIZE - start) - (npages - 2) * PAGE_SIZE :
+			cur_len;
+
+		/*
+		 * Hold a file reference for the lifetime of the request, as
+		 * cifs_direct_writev() does for its wdata.
+		 * NOTE(review): relies on cifs_readdata_release() dropping
+		 * rdata->cfile - confirm.
+		 */
+		rdata->cfile = cifsFileInfo_get(cfile);
+		rdata->offset = offset;
+		rdata->bytes = cur_len;
+		rdata->pid = pid;
+		rdata->read_into_pages = cifs_uncached_read_into_pages;
+		rdata->copy_into_pages = cifs_uncached_copy_into_pages;
+		rdata->credits = credits;
+
+		/*
+		 * Extra reference so rdata->result and rdata->got_bytes can
+		 * be read after the completion handler has dropped its own.
+		 */
+		kref_get(&rdata->refcount);
+
+		rc = 0;
+		if (rdata->cfile->invalidHandle)
+			rc = cifs_reopen_file(rdata->cfile, true);
+		if (!rc)
+			rc = server->ops->async_readv(rdata);
+		if (rc) {
+			add_credits_and_wake_if(server, rdata->credits, 0);
+			/*
+			 * The completion handler will never run for this
+			 * request, so drop both references.
+			 * NOTE(review): assumes ->async_readv() leaves the
+			 * refcount balanced when it fails - confirm.
+			 */
+			kref_put(&rdata->refcount,
+				 cifs_direct_readdata_release);
+			kref_put(&rdata->refcount,
+				 cifs_direct_readdata_release);
+			if (rc == -EAGAIN)
+				continue;
+			break;
+		}
+
+		wait_for_completion(&rdata->done);
+
+		rc = rdata->result;
+		got = rdata->got_bytes;
+		kref_put(&rdata->refcount, cifs_direct_readdata_release);
+		if (rc) {
+			if (rc == -EAGAIN)
+				continue;
+			break;
+		}
+
+		/* account for what actually arrived, not what was requested */
+		iov_iter_advance(to, got);
+		total_read += got;
+		offset += got;
+		len -= got;
+
+		/* a short chunk ends the read; stop instead of reading on */
+		if (got < cur_len)
+			break;
+	} while (len);
+
+	if (total_read) {
+		iocb->ki_pos += total_read;
+		return total_read;
+	}
+
+	/*
+	 * Nothing was read: surface the error rather than reporting EOF.
+	 * NOTE(review): -ENODATA is presumed to be how a read at or past
+	 * EOF is reported here, so it is returned as a zero-length read -
+	 * confirm against the existing uncached read path.
+	 */
+	if (rc == -ENODATA)
+		rc = 0;
+	return rc;
+}
+
ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to)
{
struct file *file = iocb->ki_filp;
--
2.7.4