Re: regression in page writeback

From: Wu Fengguang
Date: Wed Sep 30 2009 - 01:27:41 EST


Hi Ted,

On Mon, Sep 28, 2009 at 10:07:57PM +0800, Theodore Ts'o wrote:
> On Mon, Sep 28, 2009 at 09:08:04AM -0400, Christoph Hellwig wrote:
> > This should help a bit for XFS as it historically does multi-page
> > writeouts from ->writepages (and apparently btrfs that added some
> > write-around recently?) but not those brave filesystems only
> > implementing the multi-page writeout from writepages as designed.
>
> Here's the hack which I'm currently working on to work around the
> writeback code limiting writebacks to 1024 pages. I'm assuming this
> is going to be a short-term hack, assuming the writeback code gets more
> intelligent, but I thought I would throw this into the mix....

> ext4: Adjust ext4_da_writepages() to write out larger contiguous chunks
>
> Work around problems in the writeback code to force out writebacks in
> larger chunks than just 4mb, which is just too small. This also works
> around limitations in the ext4 block allocator, which can't allocate
> more than 2048 blocks at a time. So we need to defeat the round-robin
> characteristics of the writeback code and try to write out as many
> blocks in one inode before allowing the writeback code to move on to
> another inode. We add a new per-filesystem tunable,
> max_contig_writeback_mb, which caps this to a default of 128mb per
> inode.

It's good to increase MAX_WRITEBACK_PAGES; however, I'm afraid
max_contig_writeback_mb may become a burden in the future: either it
turns out to be unnecessary, or a per-bdi counterpart will have to be
introduced for all filesystems.

It would also be preferable to handle slow devices automatically with
the increased chunk size instead of adding another parameter: a device
writing at, say, 5MB/s would need roughly 25 seconds to drain a full
128MB chunk, far too long to hold I_SYNC against one inode.

I put together a quick patch to demonstrate the ideas collected in the
recent discussions. Could you check whether it serves your needs? Thanks.

> Signed-off-by: "Theodore Ts'o" <tytso@xxxxxxx>
> ---
> fs/ext4/ext4.h | 1 +
> fs/ext4/inode.c | 100 +++++++++++++++++++++++++++++++++++++-----
> fs/ext4/super.c | 3 +
> include/trace/events/ext4.h | 14 ++++--
> 4 files changed, 102 insertions(+), 16 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index e227eea..9f99427 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -942,6 +942,7 @@ struct ext4_sb_info {
> unsigned int s_mb_stats;
> unsigned int s_mb_order2_reqs;
> unsigned int s_mb_group_prealloc;
> + unsigned int s_max_contig_writeback_mb;
> /* where last allocation was done - for stream allocation */
> unsigned long s_mb_last_group;
> unsigned long s_mb_last_start;
> diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
> index 5fb72a9..9e0acb7 100644
> --- a/fs/ext4/inode.c
> +++ b/fs/ext4/inode.c
> @@ -1145,6 +1145,60 @@ static int check_block_validity(struct inode *inode, const char *msg,
> }
>
> /*
> + * Return the number of dirty pages in the given inode starting at
> + * page frame idx.
> + */
> +static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx)
> +{

ext4_num_dirty_pages() could be improved to take a "max_pages" parameter
to avoid unnecessary work (see the sketch below).

> + struct address_space *mapping = inode->i_mapping;
> + pgoff_t index;
> + struct pagevec pvec;
> + pgoff_t num = 0;
> + int i, nr_pages, done = 0;
> +
> + pagevec_init(&pvec, 0);
> +
> + while (!done) {
> + index = idx;
> + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
> + PAGECACHE_TAG_DIRTY,
> + (pgoff_t)PAGEVEC_SIZE);
> + if (nr_pages == 0)
> + break;
> + for (i = 0; i < nr_pages; i++) {
> + struct page *page = pvec.pages[i];
> + struct buffer_head *bh, *head;
> +
> + lock_page(page);
> + if (unlikely(page->mapping != mapping) ||
> + !PageDirty(page) ||
> + PageWriteback(page) ||
> + page->index != idx) {
> + done = 1;
> + unlock_page(page);
> + break;
> + }
> + head = page_buffers(page);
> + bh = head;
> + do {
> + if (!buffer_delay(bh) &&
> + !buffer_unwritten(bh)) {
> + done = 1;
> + break;
> + }
> + } while ((bh = bh->b_this_page) != head);
> + unlock_page(page);

I guess a rough estimate will suffice, hehe.
There is no guarantee anyway.
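
Something like the following rough, untested sketch, with the "max_pages"
parameter suggested above and with the page lock and buffer walk dropped,
since a racy estimate is all we need here:

static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
				    unsigned int max_pages)
{
	struct address_space *mapping = inode->i_mapping;
	pgoff_t index;
	struct pagevec pvec;
	pgoff_t num = 0;
	int i, nr_pages, done = 0;

	pagevec_init(&pvec, 0);

	while (!done && num < max_pages) {
		index = idx;
		nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
					      PAGECACHE_TAG_DIRTY,
					      (pgoff_t)PAGEVEC_SIZE);
		if (nr_pages == 0)
			break;
		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			/*
			 * Unlocked checks: racy, but good enough for a
			 * hint that merely sizes the writeback chunk.
			 */
			if (page->mapping != mapping ||
			    !PageDirty(page) ||
			    PageWriteback(page) ||
			    page->index != idx) {
				done = 1;
				break;
			}
			idx++;
			if (++num >= max_pages) {
				done = 1;
				break;
			}
		}
		pagevec_release(&pvec);
	}
	return num;
}

ext4_da_writepages() could then pass its desired write size (e.g.
s_max_contig_writeback_mb converted to pages) as max_pages, so the scan
never walks farther than it is prepared to write anyway.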

> + if (done)
> + break;
> + idx++;
> + num++;
> + }
> + pagevec_release(&pvec);
> + }
> + return num;
> +}
> +
> +/*
> * The ext4_get_blocks() function tries to look up the requested blocks,
> * and returns if the blocks are already mapped.
> *
> @@ -2744,7 +2798,8 @@ static int ext4_da_writepages(struct address_space *mapping,
> int pages_written = 0;
> long pages_skipped;
> int range_cyclic, cycled = 1, io_done = 0;
> - int needed_blocks, ret = 0, nr_to_writebump = 0;
> + int needed_blocks, ret = 0;
> + long desired_nr_to_write, nr_to_writebump = 0;
> loff_t range_start = wbc->range_start;
> struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
>
> @@ -2771,16 +2826,6 @@ static int ext4_da_writepages(struct address_space *mapping,
> if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
> return -EROFS;
>
> - /*
> - * Make sure nr_to_write is >= sbi->s_mb_stream_request
> - * This make sure small files blocks are allocated in
> - * single attempt. This ensure that small files
> - * get less fragmented.
> - */
> - if (wbc->nr_to_write < sbi->s_mb_stream_request) {
> - nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write;
> - wbc->nr_to_write = sbi->s_mb_stream_request;
> - }
> if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
> range_whole = 1;
>
> @@ -2795,6 +2840,36 @@ static int ext4_da_writepages(struct address_space *mapping,
> } else
> index = wbc->range_start >> PAGE_CACHE_SHIFT;
>
> + /*
> + * This works around two forms of stupidity. The first is in
> + * the writeback code, which caps the maximum number of pages
> + * written to be 1024 pages. This is wrong on multiple
> + * levels; different architectures have a different page size,
> + * which changes the maximum amount of data which gets

Good point.

> + * written. Secondly, 4 megabytes is way too small. XFS
> + * forces this value to be 16 megabytes by multiplying
> + * nr_to_write parameter by four, and then relies on its
> + * allocator to allocate larger extents to make them
> + * contiguous. Unfortunately this brings us to the second

Hopefully we can make these hacks unnecessary :)

> + * stupidity, which is that ext4's mballoc code only allocates
> + * at most 2048 blocks. So we force contiguous writes up to
> + * the number of dirty blocks in the inode, or
> + * sbi->max_contig_writeback_mb whichever is smaller.
> + */
> + if (!range_cyclic && range_whole)
> + desired_nr_to_write = wbc->nr_to_write * 8;
> + else
> + desired_nr_to_write = ext4_num_dirty_pages(inode, index);
> + if (desired_nr_to_write > (sbi->s_max_contig_writeback_mb <<
> + (20 - PAGE_CACHE_SHIFT)))
> + desired_nr_to_write = (sbi->s_max_contig_writeback_mb <<
> + (20 - PAGE_CACHE_SHIFT));
> +
> + if (wbc->nr_to_write < desired_nr_to_write) {
> + nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
> + wbc->nr_to_write = desired_nr_to_write;
> + }
> +
> mpd.wbc = wbc;
> mpd.inode = mapping->host;
>
> @@ -2914,7 +2989,8 @@ retry:
> out_writepages:
> if (!no_nrwrite_index_update)
> wbc->no_nrwrite_index_update = 0;
> - wbc->nr_to_write -= nr_to_writebump;
> + if (wbc->nr_to_write > nr_to_writebump)
> + wbc->nr_to_write -= nr_to_writebump;
> wbc->range_start = range_start;
> trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
> return ret;

Thanks,
Fengguang
---
writeback: bump up writeback chunk size to 128MB

Adjust the writeback call stack to support a larger writeback chunk size.

- make wbc.nr_to_write a per-file parameter
- init wbc.nr_to_write with MAX_WRITEBACK_PAGES=128MB
(proposed by Ted)
- add wbc.nr_segments to limit seeks inside a sparsely dirtied file
(proposed by Chris)
- add wbc.timeout, which will be used to control IO submission time,
either per-file or globally.

wbc.nr_segments is currently accounted purely by logical page index
distance: if two pages are more than 1MB apart, they are counted as
separate segments (with 4KB pages that distance is 256 page frames).

Filesystems could do this better with real extent knowledge.
One possible scheme is to record the previous page index in
wbc.writeback_index, let ->writepage check whether the current and
previous pages lie in the same extent, and decrease wbc.nr_segments
accordingly. Care should be taken to avoid double decrements in
writepage and write_cache_pages.
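
A rough sketch of the filesystem side, assuming a wbc.writeback_index
field that records the previous page as described above, and with a
made-up fs_same_extent() helper standing in for whatever extent lookup
the filesystem already has:

static int fs_writepage(struct page *page, struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	pgoff_t prev = wbc->writeback_index;

	/*
	 * Charge a new segment only when this page does not lie in the
	 * same extent as the previously written page.  (The first page
	 * of a pass needs special casing, omitted here.)
	 */
	if (page->index != prev + 1 &&
	    !fs_same_extent(inode, prev, page->index))
		wbc->nr_segments--;

	wbc->writeback_index = page->index;

	/* ... the usual block mapping and bio submission ... */
	return 0;
}

write_cache_pages() would then have to skip its own WB_SEGMENT_DIST test
for filesystems that do this, otherwise one seek could be charged twice.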

The wbc.timeout (when used per-file) is mainly a safeguard against slow
devices, which may take too long to sync 128MB of data.

The wbc.timeout (when used globally) could be useful if we decide to do
two sync scans, one over dirty pages and one over dirty metadata. XFS
could say: please come back to sync dirty metadata after 10 seconds.
That would need another b_io_metadata queue, but it's possible.

This work depends on the balance_dirty_pages() wait queue patch.

CC: Theodore Ts'o <tytso@xxxxxxx>
CC: Chris Mason <chris.mason@xxxxxxxxxx>
CC: Dave Chinner <david@xxxxxxxxxxxxx>
CC: Christoph Hellwig <hch@xxxxxxxxxxxxx>
CC: Jan Kara <jack@xxxxxxx>
CC: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
CC: Jens Axboe <jens.axboe@xxxxxxxxxx>
Signed-off-by: Wu Fengguang <fengguang.wu@xxxxxxxxx>
---
fs/fs-writeback.c | 60 ++++++++++++++++++++++--------------
fs/jbd2/commit.c | 1
include/linux/writeback.h | 15 +++++++--
mm/backing-dev.c | 2 -
mm/filemap.c | 1
mm/page-writeback.c | 13 +++++++
6 files changed, 67 insertions(+), 25 deletions(-)

--- linux.orig/fs/fs-writeback.c 2009-09-30 12:17:15.000000000 +0800
+++ linux/fs/fs-writeback.c 2009-09-30 12:17:17.000000000 +0800
@@ -31,6 +31,15 @@
#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info)

/*
+ * The maximum number of pages to writeout in a single bdi flush/kupdate
+ * operation. We do this so we don't hold I_SYNC against an inode for
+ * enormous amounts of time, which would block a userspace task which has
+ * been forced to throttle against that inode. Also, the code reevaluates
+ * the dirty each time it has written this many pages.
+ */
+#define MAX_WRITEBACK_PAGES (128 << (20 - PAGE_CACHE_SHIFT))
+
+/*
* We don't actually have pdflush, but this one is exported though /proc...
*/
int nr_pdflush_threads;
@@ -540,6 +549,14 @@ writeback_single_inode(struct inode *ino

spin_unlock(&inode_lock);

+ if (wbc->for_kupdate || wbc->for_background) {
+ wbc->nr_segments = 1; /* TODO: test blk_queue_nonrot() */
+ wbc->timeout = HZ;
+ } else {
+ wbc->nr_segments = LONG_MAX;
+ wbc->timeout = 0;
+ }
+
ret = do_writepages(mapping, wbc);

/* Don't write the inode if only I_DIRTY_PAGES was set */
@@ -564,7 +581,9 @@ writeback_single_inode(struct inode *ino
* sometimes bales out without doing anything.
*/
inode->i_state |= I_DIRTY_PAGES;
- if (wbc->nr_to_write <= 0) {
+ if (wbc->nr_to_write <= 0 ||
+ wbc->nr_segments <= 0 ||
+ wbc->timeout < 0) {
/*
* slice used up: queue for next turn
*/
@@ -659,12 +678,17 @@ pinned:
return 0;
}

-static void writeback_inodes_wb(struct bdi_writeback *wb,
+static long writeback_inodes_wb(struct bdi_writeback *wb,
struct writeback_control *wbc)
{
struct super_block *sb = wbc->sb, *pin_sb = NULL;
const int is_blkdev_sb = sb_is_blkdev_sb(sb);
const unsigned long start = jiffies; /* livelock avoidance */
+ unsigned long stop_time = 0;
+ long wrote = 0;
+
+ if (wbc->timeout)
+ stop_time = (start + wbc->timeout) | 1;

spin_lock(&inode_lock);

@@ -721,7 +745,9 @@ static void writeback_inodes_wb(struct b
BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
__iget(inode);
pages_skipped = wbc->pages_skipped;
+ wbc->nr_to_write = MAX_WRITEBACK_PAGES;
writeback_single_inode(inode, wbc);
+ wrote += MAX_WRITEBACK_PAGES - wbc->nr_to_write;
if (wbc->pages_skipped != pages_skipped) {
/*
* writeback is not making progress due to locked
@@ -739,12 +765,15 @@ static void writeback_inodes_wb(struct b
}
if (!list_empty(&wb->b_more_io))
wbc->more_io = 1;
+ if (stop_time && time_after(jiffies, stop_time))
+ break;
}

unpin_sb_for_writeback(&pin_sb);

spin_unlock(&inode_lock);
/* Leave any unwritten inodes on b_io */
+ return wrote;
}

void writeback_inodes_wbc(struct writeback_control *wbc)
@@ -754,15 +783,6 @@ void writeback_inodes_wbc(struct writeba
writeback_inodes_wb(&bdi->wb, wbc);
}

-/*
- * The maximum number of pages to writeout in a single bdi flush/kupdate
- * operation. We do this so we don't hold I_SYNC against an inode for
- * enormous amounts of time, which would block a userspace task which has
- * been forced to throttle against that inode. Also, the code reevaluates
- * the dirty each time it has written this many pages.
- */
-#define MAX_WRITEBACK_PAGES 1024
-
static inline bool over_bground_thresh(void)
{
unsigned long background_thresh, dirty_thresh;
@@ -797,10 +817,12 @@ static long wb_writeback(struct bdi_writ
.sync_mode = args->sync_mode,
.older_than_this = NULL,
.for_kupdate = args->for_kupdate,
+ .for_background = args->for_background,
.range_cyclic = args->range_cyclic,
};
unsigned long oldest_jif;
long wrote = 0;
+ long nr;
struct inode *inode;

if (wbc.for_kupdate) {
@@ -834,26 +856,20 @@ static long wb_writeback(struct bdi_writ

wbc.more_io = 0;
wbc.encountered_congestion = 0;
- wbc.nr_to_write = MAX_WRITEBACK_PAGES;
wbc.pages_skipped = 0;
- writeback_inodes_wb(wb, &wbc);
- args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
- wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
+ nr = writeback_inodes_wb(wb, &wbc);
+ args->nr_pages -= nr;
+ wrote += nr;

/*
- * If we consumed everything, see if we have more
- */
- if (wbc.nr_to_write <= 0)
- continue;
- /*
- * Didn't write everything and we don't have more IO, bail
+ * Bail if no more IO
*/
if (!wbc.more_io)
break;
/*
* Did we write something? Try for more
*/
- if (wbc.nr_to_write < MAX_WRITEBACK_PAGES)
+ if (nr)
continue;
/*
* Nothing written. Wait for some inode to
--- linux.orig/include/linux/writeback.h 2009-09-30 12:13:00.000000000 +0800
+++ linux/include/linux/writeback.h 2009-09-30 12:17:17.000000000 +0800
@@ -32,10 +32,15 @@ struct writeback_control {
struct super_block *sb; /* if !NULL, only write inodes from
this super_block */
enum writeback_sync_modes sync_mode;
+ int timeout;
unsigned long *older_than_this; /* If !NULL, only write back inodes
older than this */
- long nr_to_write; /* Write this many pages, and decrement
- this for each page written */
+ long nr_to_write; /* Max pages to write per file, and
+ decrement this for each page written
+ */
+ long nr_segments; /* Max page segments to write per file,
+ this is also a count down value
+ */
long pages_skipped; /* Pages which were not written */

/*
@@ -49,6 +54,7 @@ struct writeback_control {
unsigned nonblocking:1; /* Don't get stuck on request queues */
unsigned encountered_congestion:1; /* An output: a queue is full */
unsigned for_kupdate:1; /* A kupdate writeback */
+ unsigned for_background:1; /* A background writeback */
unsigned for_reclaim:1; /* Invoked from the page allocator */
unsigned range_cyclic:1; /* range_start is cyclic */
unsigned stop_on_wrap:1; /* stop when write index is to wrap */
@@ -65,6 +71,11 @@ struct writeback_control {
};

/*
+ * if two page ranges are more than 1MB apart, they are taken as two segments.
+ */
+#define WB_SEGMENT_DIST (1024 >> (PAGE_CACHE_SHIFT - 10))
+
+/*
* fs/fs-writeback.c
*/
struct bdi_writeback;
--- linux.orig/mm/filemap.c 2009-09-30 12:13:00.000000000 +0800
+++ linux/mm/filemap.c 2009-09-30 12:17:17.000000000 +0800
@@ -216,6 +216,7 @@ int __filemap_fdatawrite_range(struct ad
struct writeback_control wbc = {
.sync_mode = sync_mode,
.nr_to_write = LONG_MAX,
+ .nr_segments = LONG_MAX,
.range_start = start,
.range_end = end,
};
--- linux.orig/mm/page-writeback.c 2009-09-30 12:17:15.000000000 +0800
+++ linux/mm/page-writeback.c 2009-09-30 12:17:17.000000000 +0800
@@ -765,6 +765,7 @@ int write_cache_pages(struct address_spa
int cycled;
int range_whole = 0;
long nr_to_write = wbc->nr_to_write;
+ unsigned long start_time = jiffies;

pagevec_init(&pvec, 0);
if (wbc->range_cyclic) {
@@ -818,6 +819,12 @@ retry:
break;
}

+ if (done_index + WB_SEGMENT_DIST < page->index &&
+ --wbc->nr_segments <= 0) {
+ done = 1;
+ break;
+ }
+
done_index = page->index + 1;

lock_page(page);
@@ -899,6 +906,12 @@ continue_unlock:
}
pagevec_release(&pvec);
cond_resched();
+ if (wbc->timeout &&
+ time_after(jiffies, start_time + wbc->timeout)) {
+ wbc->timeout = -1;
+ done = 1;
+ break;
+ }
}
if (wbc->stop_on_wrap)
done_index = 0;
--- linux.orig/fs/jbd2/commit.c 2009-09-30 12:13:00.000000000 +0800
+++ linux/fs/jbd2/commit.c 2009-09-30 12:17:17.000000000 +0800
@@ -219,6 +219,7 @@ static int journal_submit_inode_data_buf
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.nr_to_write = mapping->nrpages * 2,
+ .nr_segments = LONG_MAX,
.range_start = 0,
.range_end = i_size_read(mapping->host),
};
--- linux.orig/mm/backing-dev.c 2009-09-30 12:17:26.000000000 +0800
+++ linux/mm/backing-dev.c 2009-09-30 12:17:38.000000000 +0800
@@ -336,7 +336,7 @@ static void bdi_flush_io(struct backing_
.sync_mode = WB_SYNC_NONE,
.older_than_this = NULL,
.range_cyclic = 1,
- .nr_to_write = 1024,
+ .timeout = HZ,
};

writeback_inodes_wbc(&wbc);