patch: improve generic_file_buffered_write()

From: Bernd Schubert
Date: Wed Sep 05 2007 - 10:15:29 EST


Hi,

recently we discovered that writing to an NFS-exported Lustre filesystem is rather
slow (20-40 MB/s writing, but over 200 MB/s reading).

As I already explained on the nfs mailing list, this happens since there is an
offset on the very first page due to the nfs header.

http://sourceforge.net/mailarchive/forum.php?thread_name=200708312003.30446.bernd-schubert%40gmx.de&forum_name=nfs

While this especially affects Lustre, Olaf Kirch also noticed it on another
filesystem before and wrote a nfs patch for it. This patch has two
disadvantages - it requires to move all data within the pages, IMHO rather
cpu time consuming, furthermore, it presently causes data corruption when
more than one nfs thread is running.

After thinking it over and over again we (Goswin and I) believe it would be
best to improve generic_file_buffered_write().
If there is sufficient data now, as is usual for aio writes,
generic_file_buffered_write() will now fill each page as much as possible and
only then prepare/commit it. Before this change, generic_file_buffered_write()
committed chunks of pages even though there was still more data.

The attached patch still has two FIXMEs, both for likely()/unlikely()
conditions which IMHO don't reflect the likelihood for the new aio data
functions.

filemap.c | 144
+++++++++++++++++++++++++++++++++++++++++---------------------
1 file changed, 96 insertions(+), 48 deletions(-)

Signed-off-by: Bernd Schubert <bs@xxxxxxxxx>
Signed-off-by: Goswin von Brederlow <goswin@xxxxxxxxxxxxxxx>


Cheers,
Goswin and Bernd


Index: linux-2.6.20.3/mm/filemap.c
===================================================================
--- linux-2.6.20.3.orig/mm/filemap.c 2007-09-04 13:43:04.000000000 +0200
+++ linux-2.6.20.3/mm/filemap.c 2007-09-05 12:39:23.000000000 +0200
@@ -2057,6 +2057,19 @@
}
EXPORT_SYMBOL(generic_file_direct_write);

+/**
+ * This function will do 3 main tasks for each iov:
+ * - prepare a write
+ * - copy the data from iov into a new page
+ * - commit this page
+ * @iocb: file operations
+ * @iov: vector of data to write
+ * @nr_segs: number of iov segments
+ * @pos: position in the file
+ * @ppos: position in the file after this function
+ * @count: number of bytes to write
+ * @written: offset in iov->base (data to skip on write)
+ */
ssize_t
generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos, loff_t *ppos,
@@ -2074,6 +2087,11 @@
const struct iovec *cur_iov = iov; /* current iovec */
size_t iov_base = 0; /* offset in the current iovec */
char __user *buf;
+ unsigned long data_start = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
+ loff_t wpos = pos; /* the position in the file we will return */
+
+ /* position in file as index of pages */
+ unsigned long index = pos >> PAGE_CACHE_SHIFT;

pagevec_init(&lru_pvec, 0);

@@ -2087,9 +2105,15 @@
buf = cur_iov->iov_base + iov_base;
}

+ page = __grab_cache_page(mapping, index, &cached_page, &lru_pvec);
+ if (!page) {
+ status = -ENOMEM;
+ goto out;
+ }
+
do {
- unsigned long index;
unsigned long offset;
+ unsigned long data_end; /* end of data within the page */
size_t copied;

offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
@@ -2106,6 +2130,8 @@
*/
bytes = min(bytes, cur_iov->iov_len - iov_base);

+ data_end = offset + bytes;
+
/*
* Bring in the user page that we will copy from _first_.
* Otherwise there's a nasty deadlock on copying from the
@@ -2114,95 +2140,117 @@
*/
fault_in_pages_readable(buf, bytes);

- page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
- if (!page) {
- status = -ENOMEM;
- break;
- }
-
if (unlikely(bytes == 0)) {
status = 0;
copied = 0;
goto zero_length_segment;
}

- status = a_ops->prepare_write(file, page, offset, offset+bytes);
- if (unlikely(status)) {
- loff_t isize = i_size_read(inode);
-
- if (status != AOP_TRUNCATED_PAGE)
- unlock_page(page);
- page_cache_release(page);
- if (status == AOP_TRUNCATED_PAGE)
- continue;
+ if (data_end == PAGE_CACHE_SIZE || count == bytes) {
/*
- * prepare_write() may have instantiated a few blocks
- * outside i_size. Trim these off again.
* Only prepare a write if it's an entire page or if
+ * we don't have more data
*/
- if (pos + bytes > isize)
- vmtruncate(inode, isize);
- break;
+ status = a_ops->prepare_write(file, page, data_start, data_end);
+ if (unlikely(status)) {
+ loff_t isize = i_size_read(inode);
+
+ if (status == AOP_TRUNCATED_PAGE)
+ continue;
+ /*
+ * prepare_write() may have instantiated a few blocks
+ * outside i_size. Trim these off again.
+ */
+ if (pos + bytes > isize)
+ vmtruncate(inode, isize);
+ }
}
- if (likely(nr_segs == 1))
+ if (likely(nr_segs == 1)) /* FIXME: really likely with aio? */
copied = filemap_copy_from_user(page, offset,
buf, bytes);
else
copied = filemap_copy_from_user_iovec(page, offset,
cur_iov, iov_base, bytes);
- flush_dcache_page(page);
- status = a_ops->commit_write(file, page, offset, offset+bytes);
- if (status == AOP_TRUNCATED_PAGE) {
+
+ if (data_end == PAGE_CACHE_SIZE || count == bytes) {
+ /*
+ * Same as above, always try fill pages up to
+ * PAGE_CACHE_SIZE if possible (sufficient data)
+ */
+ flush_dcache_page(page);
+ status = a_ops->commit_write(file, page,
+ data_start, data_end);
+ if (status == AOP_TRUNCATED_PAGE) {
+ continue;
+ }
+ unlock_page(page);
+ mark_page_accessed(page);
page_cache_release(page);
- continue;
+ balance_dirty_pages_ratelimited(mapping);
+ cond_resched();
+ if (bytes < count) { /* still more iov data to write */
+ page = __grab_cache_page(mapping, index + 1,
+ &cached_page, &lru_pvec);
+ if (!page) {
+ status = -ENOMEM;
+ goto out;
+ }
+ } else {
+ page = NULL;
+ }
+ written += data_end - data_start;
+ wpos += data_end - data_start;
+ data_start = 0; /* correct would be data_end % PAGE_SIZE
+ * but if this would be != 0, we don't
+ * have more data
+ */
}
zero_length_segment:
if (likely(copied >= 0)) {
- if (!status)
- status = copied;
-
- if (status >= 0) {
- written += status;
- count -= status;
- pos += status;
- buf += status;
- if (unlikely(nr_segs > 1)) {
+ if (!status) {
+ count -= copied;
+ pos += copied;
+ buf += copied;
+ if (unlikely(nr_segs > 1)) { /* FIXME: really unlikely with aio? */
filemap_set_next_iovec(&cur_iov,
- &iov_base, status);
+ &iov_base, copied);
if (count)
buf = cur_iov->iov_base +
iov_base;
} else {
- iov_base += status;
+ iov_base += copied;
}
}
}
if (unlikely(copied != bytes))
- if (status >= 0)
+ if (!status)
status = -EFAULT;
- unlock_page(page);
- mark_page_accessed(page);
- page_cache_release(page);
- if (status < 0)
+ if (status)
break;
- balance_dirty_pages_ratelimited(mapping);
- cond_resched();
} while (count);
- *ppos = pos;
+
+out:
+ *ppos = wpos;

if (cached_page)
page_cache_release(cached_page);

+ if (page) {
+ unlock_page(page);
+ page_cache_release(page);
+ }
+
/*
* For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
*/
- if (likely(status >= 0)) {
+ if (likely(!status)) {
if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
if (!a_ops->writepage || !is_sync_kiocb(iocb))
status = generic_osync_inode(inode, mapping,
OSYNC_METADATA|OSYNC_DATA);
}
- }
-
+ }
+
/*
* If we get here for O_DIRECT writes then we must have fallen through
* to buffered writes (block instantiation inside i_size). So we sync


--
Bernd Schubert
Q-Leap Networks GmbH

Attachment: pgp00000.pgp
Description: PGP signature