Re: [PATCH] 2.3.14: filemap_nopage hangs when offset >= inode->i_size

Chuck Lever (cel@monkey.org)
Fri, 20 Aug 1999 16:47:04 -0400 (EDT)


On Fri, 20 Aug 1999, Steve Dodd wrote:
> Ok, you know all the stuff that got ripped out of filemap_nopage for 2.3.14?
> It seems we need it :- try_to_read_ahead won't operate beyond of the end of
> the file (obviously), but that means when a non-shared mapping does extend
> past the end, filemap_nopage just loops, calling try_to_read_ahead forever
> to get the page it wants.
>
> The patch below includes the last one, and more or less reverts the 2.3.14
> changes (without reintroducing the bug it was meant to fix, I hope). Would
> it actually make sense to look at the whole function again? AFAICS the comments
> about overlapping allocation and I/O are wrong - surely we have to start the
> I/O, *then* allocate the page, if we want to overlap them? Would that be worth
> doing?

this takes the changes in 2.3.14's filemap_nopage a little farther. this
is closer than last week's patch to what i've been running here. i think
it fixes your issues, but i don't have any real world apps here that chase
your corner cases:

+ accessing past the end of a mapped non-shared file

+ handling I/O errors on mapped files without deadlocking, spinning,
or triggering a BUG

steve (or anyone with test cases) can you try this out for me?

i also moved the new_page allocation, see if that change makes sense to
you. in most cases, the old page is already in the page cache, so there
can be no overlap.

--- linux-2.3.13-ref/mm/filemap.c Sun Aug 15 15:01:14 1999
+++ linux/mm/filemap.c Fri Aug 20 16:13:51 1999
@@ -41,6 +41,10 @@

spinlock_t pagecache_lock = SPIN_LOCK_UNLOCKED;

+#define CLUSTER_PAGES (1 << page_cluster)
+#define CLUSTER_SHIFT (PAGE_CACHE_SHIFT + page_cluster)
+#define CLUSTER_BYTES (1 << CLUSTER_SHIFT)
+#define CLUSTER_OFFSET(x) (((x) >> CLUSTER_SHIFT) << CLUSTER_SHIFT)

void __add_page_to_hash_queue(struct page * page, struct page **p)
{
@@ -500,39 +504,58 @@
}

/*
- * Try to read ahead in the file. "page_cache" is a potentially free page
- * that we could use for the cache (if it is 0 we can try to create one,
- * this is all overlapped with the IO on the previous page finishing anyway)
+ * This adds the requested page to the page cache if it isn't already there,
+ * and schedules an I/O to read in its contents from disk.
*/
-static unsigned long try_to_read_ahead(struct file * file,
- unsigned long offset, unsigned long page_cache)
+static inline void page_cache_read(struct file * file, unsigned long offset)
{
+ unsigned long new_page;
struct inode *inode = file->f_dentry->d_inode;
+ struct page ** hash = page_hash(inode, offset);
struct page * page;
- struct page ** hash;

- offset &= PAGE_CACHE_MASK;
- switch (page_cache) {
- case 0:
- page_cache = page_cache_alloc();
- if (!page_cache)
- break;
- default:
- if (offset >= inode->i_size)
- break;
- hash = page_hash(inode, offset);
- page = page_cache_entry(page_cache);
- if (!add_to_page_cache_unique(page, inode, offset, hash)) {
- /*
- * We do not have to check the return value here
- * because it's a readahead.
- */
- inode->i_op->readpage(file, page);
- page_cache = 0;
- page_cache_release(page);
- }
+ spin_lock(&pagecache_lock);
+ page = __find_page_nolock(inode, offset, *hash);
+ spin_unlock(&pagecache_lock);
+ if (page)
+ return;
+
+ new_page = page_cache_alloc();
+ if (!new_page)
+ return;
+ page = page_cache_entry(new_page);
+
+ if (!add_to_page_cache_unique(page, inode, offset, hash)) {
+ inode->i_op->readpage(file, page);
+ page_cache_release(page);
+ return;
+ }
+
+ /*
+ * We arrive here in the unlikely event that someone
+ * raced with us and added our page to the cache first.
+ */
+ page_cache_free(new_page);
+ return;
+}
+
+/*
+ * Read in an entire cluster at once. A cluster is usually a 64k-
+ * aligned block that includes the address requested in "offset."
+ */
+static void read_cluster_nonblocking(struct file * file,
+ unsigned long offset)
+{
+ off_t filesize = file->f_dentry->d_inode->i_size;
+ unsigned long pages = CLUSTER_PAGES;
+
+ offset = CLUSTER_OFFSET(offset);
+ while ((pages-- > 0) && (offset < filesize)) {
+ page_cache_read(file, offset);
+ offset += PAGE_CACHE_SIZE;
}
- return page_cache;
+
+ return;
}

/*
@@ -813,9 +836,9 @@
return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
}

-static inline unsigned long generic_file_readahead(int reada_ok,
+static inline void generic_file_readahead(int reada_ok,
struct file * filp, struct inode * inode,
- unsigned long ppos, struct page * page, unsigned long page_cache)
+ unsigned long ppos, struct page * page)
{
unsigned long max_ahead, ahead;
unsigned long raend;
@@ -879,8 +902,7 @@
ahead = 0;
while (ahead < max_ahead) {
ahead += PAGE_CACHE_SIZE;
- page_cache = try_to_read_ahead(filp, raend + ahead,
- page_cache);
+ page_cache_read(filp, raend + ahead);
}
/*
* If we tried to read ahead some pages,
@@ -912,7 +934,7 @@
#endif
}

- return page_cache;
+ return;
}

/*
@@ -945,13 +967,11 @@
{
struct dentry *dentry = filp->f_dentry;
struct inode *inode = dentry->d_inode;
- size_t pos, pgpos, page_cache;
+ size_t pos, pgpos, page_cache = 0;
int reada_ok;
int error;
int max_readahead = get_max_readahead(inode);

- page_cache = 0;
-
pos = *ppos;
pgpos = pos & PAGE_CACHE_MASK;
/*
@@ -1046,7 +1066,8 @@
* Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
*/
page_not_up_to_date:
- page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);
+ generic_file_readahead(reada_ok, filp, inode,
+ pos & PAGE_CACHE_MASK, page);

if (Page_Uptodate(page))
goto page_ok;
@@ -1067,7 +1088,8 @@
goto page_ok;

/* Again, try some read-ahead while waiting for the page to finish.. */
- page_cache = generic_file_readahead(reada_ok, filp, inode, pos & PAGE_CACHE_MASK, page, page_cache);
+ generic_file_readahead(reada_ok, filp, inode,
+ pos & PAGE_CACHE_MASK, page);
wait_on_page(page);
if (Page_Uptodate(page))
goto page_ok;
@@ -1269,31 +1291,34 @@
}

/*
- * Semantics for shared and private memory areas are different past the end
- * of the file. A shared mapping past the last page of the file is an error
- * and results in a SIGBUS, while a private mapping just maps in a zero page.
+ * filemap_nopage() is invoked via the vma operations vector for a
+ * mapped memory region to read in file data during a page fault.
*
* The goto's are kind of ugly, but this streamlines the normal case of having
* it in the page cache, and handles the special cases reasonably without
* having a lot of duplicated code.
*
- * WSH 06/04/97: fixed a memory leak and moved the allocation of new_page
- * ahead of the wait if we're sure to need it.
+ * XXX - at some point, this should return unique values to indicate to
+ * the caller whether this is EIO, OOM, or SIGBUS.
*/
static unsigned long filemap_nopage(struct vm_area_struct * area, unsigned long address, int no_share)
{
struct file * file = area->vm_file;
struct dentry * dentry = file->f_dentry;
struct inode * inode = dentry->d_inode;
- unsigned long offset, reada, i;
struct page * page, **hash;
unsigned long old_page, new_page;
- int error;
+ unsigned long offset = address - area->vm_start + area->vm_offset;

- new_page = 0;
- offset = (address & PAGE_MASK) - area->vm_start + area->vm_offset;
- if (offset >= inode->i_size && (area->vm_flags & VM_SHARED) && area->vm_mm == current->mm)
- goto no_page;
+ /*
+ * Semantics for shared and private memory areas are different
+ * past the end of the file. A shared mapping past the last page
+ * of the file is an error and results in a SIGBUS, while a
+ * private mapping just maps in a zero page.
+ */
+ if ((offset >= inode->i_size) &&
+ (area->vm_flags & VM_SHARED) && (area->vm_mm == current->mm))
+ return 0;

/*
* Do we have something in the page cache already?
@@ -1304,24 +1329,12 @@
if (!page)
goto no_cached_page;

-found_page:
/*
* Ok, found a page in the page cache, now we need to check
- * that it's up-to-date. First check whether we'll need an
- * extra page -- better to overlap the allocation with the I/O.
+ * that it's up-to-date.
*/
- if (no_share && !new_page) {
- new_page = page_cache_alloc();
- if (!new_page)
- goto failure;
- }
-
- if (!Page_Uptodate(page)) {
- lock_page(page);
- if (!Page_Uptodate(page))
- goto page_not_uptodate;
- UnlockPage(page);
- }
+ if (!Page_Uptodate(page))
+ goto page_not_uptodate;

success:
/*
@@ -1330,99 +1343,79 @@
*/
old_page = page_address(page);
if (!no_share) {
- /*
- * Ok, we can share the cached page directly.. Get rid
- * of any potential extra pages.
- */
- if (new_page)
- page_cache_free(new_page);
-
flush_page_to_ram(old_page);
return old_page;
}

- /*
- * No sharing ... copy to the new page.
- */
- copy_page(new_page, old_page);
- flush_page_to_ram(new_page);
+ new_page = page_cache_alloc();
+ if (new_page) {
+ copy_page(new_page, old_page);
+ flush_page_to_ram(new_page);
+ }
page_cache_release(page);
return new_page;

no_cached_page:
/*
- * Try to read in an entire cluster at once.
+ * If the requested offset is in our file, try to read a whole
+ * cluster of pages at once.
*/
- reada = offset;
- reada >>= PAGE_CACHE_SHIFT + page_cluster;
- reada <<= PAGE_CACHE_SHIFT + page_cluster;
-
- for (i = 1 << page_cluster; i > 0; --i, reada += PAGE_CACHE_SIZE)
- new_page = try_to_read_ahead(file, reada, new_page);
-
- if (!new_page)
- new_page = page_cache_alloc();
- if (!new_page)
- goto no_page;
-
+ if (offset < inode->i_size)
+ read_cluster_nonblocking(file, offset);
+ else
/*
- * During getting the above page we might have slept,
- * so we need to re-check the situation with the page
- * cache.. The page we just got may be useful if we
- * can't share, so don't get rid of it here.
+ * We're off the end of a privately mapped file, so we need
+ * to map a zero page.
*/
- page = __find_get_page(inode, offset, hash);
- if (page)
- goto found_page;
+ page_cache_read(file, offset);

/*
- * Now, create a new page-cache page from the page we got
+ * The page we want has now been added to the page cache.
+ * In the unlikely event that someone removed it in the
+ * meantime, we'll just come back here and read it again.
*/
- page = page_cache_entry(new_page);
- if (add_to_page_cache_unique(page, inode, offset, hash))
- goto retry_find;
-
- /*
- * Now it's ours and locked, we can do initial IO to it:
- */
- new_page = 0;
+ goto retry_find;

page_not_uptodate:
- error = inode->i_op->readpage(file, page);
+ lock_page(page);
+ if (Page_Uptodate(page)) {
+ UnlockPage(page);
+ goto success;
+ }

- if (!error) {
+ if (!inode->i_op->readpage(file, page)) {
wait_on_page(page);
- if (PageError(page))
- goto page_read_error;
- goto success;
+ if (Page_Uptodate(page))
+ goto success;
+ lock_page(page);
}

-page_read_error:
/*
* Umm, take care of errors if the page isn't up-to-date.
* Try to re-read it _once_. We do this synchronously,
* because there really aren't any performance issues here
* and we need to check for errors.
+ *
+ * On error return from readpage(), we expect that the page
+ * is locked, and that PG_Error set in the page. If one of
+ * these conditions aren't true, it will trigger a BUG.
*/
if (!PageLocked(page))
PAGE_BUG(page);
ClearPageError(page);
- error = inode->i_op->readpage(file, page);
- if (error)
- goto failure;
- wait_on_page(page);
- if (Page_Uptodate(page))
- goto success;
+
+ if (!inode->i_op->readpage(file, page)) {
+ wait_on_page(page);
+ if (Page_Uptodate(page))
+ goto success;
+ }
+ UnlockPage(page);

/*
* Things didn't work out. Return zero to tell the
- * mm layer so, possibly freeing the page cache page first.
+ * mm layer so.
*/
-failure:
page_cache_release(page);
- if (new_page)
- page_cache_free(new_page);
-no_page:
return 0;
}

- Chuck Lever

--
corporate:	<chuckl@netscape.com>
personal:	<chucklever@netscape.net> or <cel@monkey.org>

The Linux Scalability project: http://www.citi.umich.edu/projects/linux-scalability/

- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majordomo@vger.rutgers.edu Please read the FAQ at http://www.tux.org/lkml/