Re: [rfc] possible page manipulation simplifications?

From: Mel Gorman
Date: Mon Dec 04 2006 - 09:40:50 EST


On (02/12/06 13:15), Nick Piggin didst pronounce:
> Hi,
>
> While working in this area, I noticed a few things we do that may not
> have a positive payoff under the most common conditions. Untested yet,
> and probably needs a bit of instrumentation, but it saves about half a
> K of code, lots of branches, and makes things look nicer. Any thoughts?
>
> Quite a bit of code is used in maintaining these "cached pages" that are
> probably pretty unlikely to get used.
>

I think you might be leaking now though. More comments below.

> Also, buffered write path (and others) uses its own LRU pagevec when we should
> be just using the per-CPU LRU pagevec (which will cut down on both data and
> code size cacheline footprint).
>

Splitting the patch into two could be nice but it's grand for the
moment.

> Index: linux-2.6/mm/filemap.c
> ===================================================================
> --- linux-2.6.orig/mm/filemap.c
> +++ linux-2.6/mm/filemap.c
> @@ -686,26 +686,18 @@ EXPORT_SYMBOL(find_lock_page);
> struct page *find_or_create_page(struct address_space *mapping,
> unsigned long index, gfp_t gfp_mask)
> {
> - struct page *page, *cached_page = NULL;
> + struct page *page;
> int err;
> repeat:
> page = find_lock_page(mapping, index);
> if (!page) {
> - if (!cached_page) {
> - cached_page = alloc_page(gfp_mask);
> - if (!cached_page)
> - return NULL;
> - }
> - err = add_to_page_cache_lru(cached_page, mapping,
> - index, gfp_mask);
> - if (!err) {
> - page = cached_page;
> - cached_page = NULL;
> - } else if (err == -EEXIST)
> + page = alloc_page(gfp_mask);
> + if (!page)
> + return NULL;
> + err = add_to_page_cache_lru(page, mapping, index, gfp_mask);
> + if (err == -EEXIST)
> goto repeat;

Lets say the alloc_page() succeeds, but add_to_page_cache_lru() returns
-EEXIST, we'll jump back to the repeat label which calls page =
find_lock_page(). We've lost the page at that point, right? If
add_to_page_cache_lru() returns any error in fact, the page leaks.

You appear to do the right thing later when you call
page_cache_release() on error adding to the page cache. This is probably
safe to do here. Sure, for non-EEXIST errors, you'll allocate the page
twice but you probably don't care if this path is very rarely executed.

> }
> - if (cached_page)
> - page_cache_release(cached_page);
> return page;
> }
> EXPORT_SYMBOL(find_or_create_page);
> @@ -891,11 +883,9 @@ void do_generic_mapping_read(struct addr
> unsigned long next_index;
> unsigned long prev_index;
> loff_t isize;
> - struct page *cached_page;
> int error;
> struct file_ra_state ra = *_ra;
>
> - cached_page = NULL;
> index = *ppos >> PAGE_CACHE_SHIFT;
> next_index = index;
> prev_index = ra.prev_page;
> @@ -1059,14 +1049,12 @@ no_cached_page:
> * Ok, it wasn't cached, so we need to create a new
> * page..
> */
> - if (!cached_page) {
> - cached_page = page_cache_alloc_cold(mapping);
> - if (!cached_page) {
> - desc->error = -ENOMEM;
> - goto out;
> - }
> + page = page_cache_alloc_cold(mapping);
> + if (!page) {
> + desc->error = -ENOMEM;
> + goto out;
> }
> - error = add_to_page_cache_lru(cached_page, mapping,
> + error = add_to_page_cache_lru(page, mapping,
> index, GFP_KERNEL);
> if (error) {
> if (error == -EEXIST)

I think this might be suffering similar problems with leaking pages.

> @@ -1074,8 +1062,6 @@ no_cached_page:
> desc->error = error;
> goto out;
> }
> - page = cached_page;
> - cached_page = NULL;
> goto readpage;
> }
>
> @@ -1083,8 +1069,6 @@ out:
> *_ra = ra;
>
> *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
> - if (cached_page)
> - page_cache_release(cached_page);
> if (filp)
> file_accessed(filp);
> }
> @@ -1545,35 +1529,28 @@ static inline struct page *__read_cache_
> int (*filler)(void *,struct page*),
> void *data)
> {
> - struct page *page, *cached_page = NULL;
> + struct page *page;
> int err;
> repeat:
> page = find_get_page(mapping, index);
> if (!page) {
> - if (!cached_page) {
> - cached_page = page_cache_alloc_cold(mapping);
> - if (!cached_page)
> - return ERR_PTR(-ENOMEM);
> - }
> - err = add_to_page_cache_lru(cached_page, mapping,
> - index, GFP_KERNEL);
> + page = page_cache_alloc_cold(mapping);
> + if (!page)
> + return ERR_PTR(-ENOMEM);
> + err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
> if (err == -EEXIST)
> goto repeat;
> if (err < 0) {
> /* Presumably ENOMEM for radix tree node */
> - page_cache_release(cached_page);
> + page_cache_release(page);
> return ERR_PTR(err);
> }
> - page = cached_page;
> - cached_page = NULL;
> err = filler(data, page);
> if (err < 0) {
> page_cache_release(page);
> page = ERR_PTR(err);
> }
> }
> - if (cached_page)
> - page_cache_release(cached_page);

I didn't look carefully here but it looks like more of the same.

> return page;
> }
>
> @@ -1624,40 +1601,6 @@ retry:
> EXPORT_SYMBOL(read_cache_page);
>
> /*
> - * If the page was newly created, increment its refcount and add it to the
> - * caller's lru-buffering pagevec. This function is specifically for
> - * generic_file_write().
> - */
> -static inline struct page *
> -__grab_cache_page(struct address_space *mapping, unsigned long index,
> - struct page **cached_page, struct pagevec *lru_pvec)
> -{
> - int err;
> - struct page *page;
> -repeat:
> - page = find_lock_page(mapping, index);
> - if (!page) {
> - if (!*cached_page) {
> - *cached_page = page_cache_alloc(mapping);
> - if (!*cached_page)
> - return NULL;
> - }
> - err = add_to_page_cache(*cached_page, mapping,
> - index, GFP_KERNEL);
> - if (err == -EEXIST)
> - goto repeat;
> - if (err == 0) {
> - page = *cached_page;
> - page_cache_get(page);
> - if (!pagevec_add(lru_pvec, page))
> - __pagevec_lru_add(lru_pvec);
> - *cached_page = NULL;
> - }
> - }
> - return page;
> -}
> -
> -/*
> * The logic we want is
> *
> * if suid or (sgid and xgrp)
> @@ -1862,14 +1805,10 @@ generic_file_buffered_write(struct kiocb
> struct inode *inode = mapping->host;
> long status = 0;
> struct page *page;
> - struct page *cached_page = NULL;
> - struct pagevec lru_pvec;
> const struct iovec *cur_iov = iov; /* current iovec */
> size_t iov_offset = 0; /* offset in the current iovec */
> char __user *buf;
>
> - pagevec_init(&lru_pvec, 0);
> -
> /*
> * handle partial DIO write. Adjust cur_iov if needed.
> */
> @@ -1911,10 +1850,23 @@ retry_noprogress:
> break;
> #endif
>
> - page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
> - if (!page) {
> - status = -ENOMEM;
> - break;
> +
> +repeat:
> + page = find_lock_page(mapping, index);
> + if (unlikely(!page)) {
> + page = page_cache_alloc(mapping);
> + if (!page) {
> + status = -ENOMEM;
> + break;
> + }
> + status = add_to_page_cache_lru(page, mapping,
> + index, GFP_KERNEL);
> + if (status) {
> + page_cache_release(page);

We don't leak here but might allocate twice.

> + if (status == -EEXIST)
> + goto repeat;
> + break;
> + }
> }
>

Otherwise, this seems to be a reasonable cleanup.

> status = a_ops->prepare_write(file, page, offset, offset+bytes);
> @@ -2027,9 +1979,6 @@ retry_noprogress:
> } while (count);
> *ppos = pos;
>
> - if (cached_page)
> - page_cache_release(cached_page);
> -
> /*
> * For now, when the user asks for O_SYNC, we'll actually give O_DSYNC
> */
> @@ -2049,7 +1998,6 @@ retry_noprogress:
> if (unlikely(file->f_flags & O_DIRECT) && written)
> status = filemap_write_and_wait(mapping);
>
> - pagevec_lru_add(&lru_pvec);
> return written ? written : status;
> }
> EXPORT_SYMBOL(generic_file_buffered_write);
> Index: linux-2.6/fs/mpage.c
> ===================================================================
> --- linux-2.6.orig/fs/mpage.c
> +++ linux-2.6/fs/mpage.c
> @@ -389,31 +389,26 @@ mpage_readpages(struct address_space *ma
> struct bio *bio = NULL;
> unsigned page_idx;
> sector_t last_block_in_bio = 0;
> - struct pagevec lru_pvec;
> struct buffer_head map_bh;
> unsigned long first_logical_block = 0;
>
> clear_buffer_mapped(&map_bh);
> - pagevec_init(&lru_pvec, 0);
> for (page_idx = 0; page_idx < nr_pages; page_idx++) {
> struct page *page = list_entry(pages->prev, struct page, lru);
>
> prefetchw(&page->flags);
> list_del(&page->lru);
> - if (!add_to_page_cache(page, mapping,
> + if (!add_to_page_cache_lru(page, mapping,
> page->index, GFP_KERNEL)) {
> bio = do_mpage_readpage(bio, page,
> nr_pages - page_idx,
> &last_block_in_bio, &map_bh,
> &first_logical_block,
> get_block);
> - if (!pagevec_add(&lru_pvec, page))
> - __pagevec_lru_add(&lru_pvec);
> } else {
> page_cache_release(page);
> }
> }
> - pagevec_lru_add(&lru_pvec);
> BUG_ON(!list_empty(pages));
> if (bio)
> mpage_bio_submit(READ, bio);
> Index: linux-2.6/mm/readahead.c
> ===================================================================
> --- linux-2.6.orig/mm/readahead.c
> +++ linux-2.6/mm/readahead.c
> @@ -132,22 +132,18 @@ int read_cache_pages(struct address_spac
> int (*filler)(void *, struct page *), void *data)
> {
> struct page *page;
> - struct pagevec lru_pvec;
> int ret = 0;
>
> - pagevec_init(&lru_pvec, 0);
> -
> while (!list_empty(pages)) {
> page = list_to_page(pages);
> list_del(&page->lru);
> - if (add_to_page_cache(page, mapping, page->index, GFP_KERNEL)) {
> + if (add_to_page_cache_lru(page, mapping,
> + page->index, GFP_KERNEL)) {
> page_cache_release(page);
> continue;
> }
> ret = filler(data, page);
> - if (!pagevec_add(&lru_pvec, page))
> - __pagevec_lru_add(&lru_pvec);
> - if (ret) {
> + if (unlikely(ret)) {
> while (!list_empty(pages)) {
> struct page *victim;
>
> @@ -158,7 +154,6 @@ int read_cache_pages(struct address_spac
> break;
> }
> }
> - pagevec_lru_add(&lru_pvec);
> return ret;
> }
>
> @@ -168,7 +163,6 @@ static int read_pages(struct address_spa
> struct list_head *pages, unsigned nr_pages)
> {
> unsigned page_idx;
> - struct pagevec lru_pvec;
> int ret;
>
> if (mapping->a_ops->readpages) {
> @@ -178,19 +172,15 @@ static int read_pages(struct address_spa
> goto out;
> }
>
> - pagevec_init(&lru_pvec, 0);
> for (page_idx = 0; page_idx < nr_pages; page_idx++) {
> struct page *page = list_to_page(pages);
> list_del(&page->lru);
> - if (!add_to_page_cache(page, mapping,
> + if (!add_to_page_cache_lru(page, mapping,
> page->index, GFP_KERNEL)) {
> mapping->a_ops->readpage(filp, page);
> - if (!pagevec_add(&lru_pvec, page))
> - __pagevec_lru_add(&lru_pvec);
> } else
> page_cache_release(page);
> }
> - pagevec_lru_add(&lru_pvec);
> ret = 0;
> out:
> return ret;
>

--
Mel Gorman
Part-time Phd Student Linux Technology Center
University of Limerick IBM Dublin Software Lab
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/