NFS directory handling without monotonicity assumptions: [PATCH] against 2.3.99-pre6

From: Trond Myklebust (trond.myklebust@fys.uio.no)
Date: Thu May 04 2000 - 09:31:43 EST


Hi,

  I've been worried for some time about the way we handle directory
caching in 2.3.x. There have been two attempts to reimplement this;
one by David S. Miller, the other by myself. Both have had the common
weakness that we have made assumptions beyond what is allowed by the
RFC in order to cope with file creation/deletion while some other
process is keeping a directory stream open.

  The problem is that when a file gets deleted, then we have to reread
the entire directory back in to cache. Any offsets in the page cache
will be invalidated, and any cookies that the server has sent us may
disappear from our cache.
  The best solution when we're stuck with this case is simply to send
the last known cookie to the server, and ask for the next entry.

      (Yes 6 months of thinking and we end up with:- "Why don't we
      just ask the guy who knows the answer?" Disgusting...)

We can then locate the corresponding entry in our page cache, and
continue happily from there. This avoids any trouble with assuming
cookies are monotonically increasing etc... and should allows us to
implement a Connectathon-compliant directory cache.

  The following patch implements this idea for
linux-2.3.99-pre6. Please could people test it out before I pass it on
to Linus? I've tested that it works against a Linux knfsd server (and
that it passes the Connectathon99 test against that particular
server).

I'd be particularly interested in hearing whether or not it works
against SGI and NetWare NFS servers since they gave hangs using the
old algorithms.

Cheers,
  Trond

diff -u --recursive --new-file linux-2.3.99-pre7-3/fs/nfs/dir.c linux-2.3.99-pre7-3-devel/fs/nfs/dir.c
--- linux-2.3.99-pre7-3/fs/nfs/dir.c Fri Apr 21 22:36:39 2000
+++ linux-2.3.99-pre7-3-devel/fs/nfs/dir.c Thu May 4 15:31:45 2000
@@ -71,6 +71,79 @@
 };
 
 typedef u32 * (*decode_dirent_t)(u32 *, struct nfs_entry *, int);
+typedef struct {
+ struct file *file;
+ struct page *page;
+ unsigned long page_index;
+ unsigned page_offset;
+ u64 cookie;
+ struct nfs_entry *entry;
+ decode_dirent_t decode;
+ int plus;
+ int error;
+} nfs_readdir_descriptor_t;
+typedef int (*nfs_dirent_find_t)(struct nfs_entry *, nfs_readdir_descriptor_t *);
+
+
+static int
+nfs_find_prev_cookie(struct nfs_entry *entry, nfs_readdir_descriptor_t *desc)
+{
+ return entry->prev_cookie == desc->cookie;
+}
+
+static int
+nfs_find_cookie(struct nfs_entry *entry, nfs_readdir_descriptor_t *desc)
+{
+ return entry->cookie == desc->cookie;
+}
+
+/* Now we cache directories properly, by stuffing the dirent
+ * data directly in the page cache.
+ *
+ * Inode invalidation due to refresh etc. takes care of
+ * _everything_, no sloppy entry flushing logic, no extraneous
+ * copying, network direct to page cache, the way it was meant
+ * to be.
+ *
+ * NOTE: Dirent information verification is done always by the
+ * page-in of the RPC reply, nowhere else, this simplies
+ * things substantially.
+ */
+static
+int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page)
+{
+ struct dentry *dir = desc->file->f_dentry;
+ struct inode *inode = dir->d_inode;
+ void *buffer = (void *)kmap(page);
+ int plus = NFS_USE_READDIRPLUS(inode);
+ int error;
+
+ dfprintk(VFS, "NFS: nfs_readdir_filler() reading cookie %Lu into page %lu.\n", (long long)desc->entry->cookie, page->index);
+
+ again:
+ error = NFS_PROTO(inode)->readdir(dir, desc->entry->cookie, buffer,
+ NFS_SERVER(inode)->dtsize, plus);
+ /* We requested READDIRPLUS, but the server doesn't grok it */
+ if (desc->plus && error == -ENOTSUPP) {
+ NFS_FLAGS(inode) &= ~NFS_INO_ADVISE_RDPLUS;
+ plus = 0;
+ goto again;
+ }
+ if (error < 0)
+ goto error;
+ SetPageUptodate(page);
+ kunmap(page);
+ truncate_inode_pages(page->mapping, (loff_t)(page->index + 1) << PAGE_CACHE_SHIFT);
+ UnlockPage(page);
+ return 0;
+ error:
+ SetPageError(page);
+ kunmap(page);
+ UnlockPage(page);
+ truncate_inode_pages(page->mapping, (loff_t)page->index << PAGE_CACHE_SHIFT);
+ desc->error = error;
+ return -EIO;
+}
 
 /*
  * Given a pointer to a buffer that has already been filled by a call
@@ -80,309 +153,195 @@
  * return the offset within the buffer of the next entry to be
  * read.
  */
-static inline
-long find_dirent(struct page *page, loff_t offset,
- struct nfs_entry *entry,
- decode_dirent_t decode, int plus, int use_cookie)
+static int find_dirent(nfs_readdir_descriptor_t *desc,
+ nfs_dirent_find_t find_actor, struct page *page)
 {
- u8 *p = (u8 *)kmap(page),
- *start = p;
- unsigned long base = page_offset(page),
- pg_offset = 0;
- int loop_count = 0;
+ struct nfs_entry *entry = desc->entry;
+ char *start = (char *)kmap(page),
+ *p = start;
+ int loop_count = 0,
+ status = 0;
 
- if (!p)
- return -EIO;
         for(;;) {
- p = (u8*)decode((__u32*)p, entry, plus);
- if (IS_ERR(p))
+ p = (char *)desc->decode((u32*)p, entry, desc->plus);
+ if (IS_ERR(p)) {
+ status = PTR_ERR(p);
                         break;
- pg_offset = p - start;
- entry->prev = entry->offset;
- entry->offset = base + pg_offset;
- if ((use_cookie ? entry->cookie : entry->offset) > offset)
+ }
+ desc->page_offset = p - start;
+ dfprintk(VFS, "NFS: found cookie %Lu\n", (long long)entry->cookie);
+ if (find_actor(entry, desc))
                         break;
                 if (loop_count++ > 200) {
                         loop_count = 0;
                         schedule();
                 }
         }
-
         kunmap(page);
- return (IS_ERR(p)) ? PTR_ERR(p) : (long)pg_offset;
+ dfprintk(VFS, "NFS: find_dirent() returns %d\n", status);
+ return status;
 }
 
 /*
  * Find the given page, and call find_dirent() in order to try to
  * return the next entry.
- *
- * Returns -EIO if the page is not available, or up to date.
  */
-static inline
-long find_dirent_page(struct inode *inode, loff_t offset,
- struct nfs_entry *entry)
+static int find_dirent_page(nfs_readdir_descriptor_t *desc,
+ nfs_dirent_find_t find_actor)
 {
- decode_dirent_t decode = NFS_PROTO(inode)->decode_dirent;
+ struct inode *inode = desc->file->f_dentry->d_inode;
         struct page *page;
- unsigned long index = entry->offset >> PAGE_CACHE_SHIFT;
- long status = -EIO;
- int plus = NFS_USE_READDIRPLUS(inode),
- use_cookie = NFS_MONOTONE_COOKIES(inode);
-
- dfprintk(VFS, "NFS: find_dirent_page() searching directory page %ld\n", entry->offset & PAGE_CACHE_MASK);
-
- if (entry->page)
- page_cache_release(entry->page);
-
- page = find_get_page(&inode->i_data, index);
-
- if (page && Page_Uptodate(page))
- status = find_dirent(page, offset, entry, decode, plus, use_cookie);
-
- /* NB: on successful return we will be holding the page */
- if (status < 0) {
- entry->page = NULL;
- if (page)
- page_cache_release(page);
- } else
- entry->page = page;
+ unsigned long index = desc->page_index;
+ int status;
+
+ dfprintk(VFS, "NFS: find_dirent_page() searching directory page %ld\n", desc->page_index);
+
+ if (desc->page) {
+ page_cache_release(desc->page);
+ desc->page = NULL;
+ }
 
- dfprintk(VFS, "NFS: find_dirent_page() returns %ld\n", status);
+ page = read_cache_page(&inode->i_data, index,
+ (filler_t *)nfs_readdir_filler, desc);
+ if (IS_ERR(page)) {
+ status = PTR_ERR(page);
+ goto out;
+ }
+ if (!Page_Uptodate(page))
+ goto read_error;
+
+ /* NOTE: Someone else may have changed the READDIRPLUS flag */
+ desc->plus = NFS_USE_READDIRPLUS(inode);
+ status = find_dirent(desc, find_actor, page);
+ if (status >= 0)
+ desc->page = page;
+ else
+ page_cache_release(page);
+ out:
+ dfprintk(VFS, "NFS: find_dirent_page() returns %d\n", status);
         return status;
+ read_error:
+ page_cache_release(page);
+ return -EIO;
 }
 
-
 /*
  * Recurse through the page cache pages, and return a
  * filled nfs_entry structure of the next directory entry if possible.
  *
- * The target for the search is position 'offset'.
- * The latter may either be an offset into the page cache, or (better)
- * a cookie depending on whether we're interested in strictly following
- * the RFC wrt. not assuming monotonicity of cookies or not.
- *
- * For most systems, the latter is more reliable since it naturally
- * copes with holes in the directory.
+ * The target for the search is 'desc->cookie'.
  */
-static inline
-long search_cached_dirent_pages(struct inode *inode, loff_t offset,
- struct nfs_entry *entry)
+static int search_cached_dirent_pages(nfs_readdir_descriptor_t *desc,
+ nfs_dirent_find_t find_actor)
 {
- long res = 0;
+ int res = 0;
         int loop_count = 0;
 
- dfprintk(VFS, "NFS: search_cached_dirent_pages() searching for cookie %Ld\n", (long long)offset);
+ dfprintk(VFS, "NFS: search_cached_dirent_pages() searching for cookie %Lu\n", (long long)desc->cookie);
         for (;;) {
- res = find_dirent_page(inode, offset, entry);
- if (res == -EAGAIN) {
- /* Align to beginning of next page */
- entry->offset &= PAGE_CACHE_MASK;
- entry->offset += PAGE_CACHE_SIZE;
- }
+ res = find_dirent_page(desc, find_actor);
                 if (res != -EAGAIN)
                         break;
+ /* Align to beginning of next page */
+ desc->page_offset = 0;
+ desc->page_index ++;
                 if (loop_count++ > 200) {
                         loop_count = 0;
                         schedule();
                 }
         }
- if (res < 0 && entry->page) {
- page_cache_release(entry->page);
- entry->page = NULL;
- }
- dfprintk(VFS, "NFS: search_cached_dirent_pages() returned %ld\n", res);
+ dfprintk(VFS, "NFS: search_cached_dirent_pages() returned %d\n", res);
         return res;
 }
 
 
-/* Now we cache directories properly, by stuffing the dirent
- * data directly in the page cache.
- *
- * Inode invalidation due to refresh etc. takes care of
- * _everything_, no sloppy entry flushing logic, no extraneous
- * copying, network direct to page cache, the way it was meant
- * to be.
- *
- * NOTE: Dirent information verification is done always by the
- * page-in of the RPC reply, nowhere else, this simplies
- * things substantially.
+/*
+ * If we cannot find a cookie in our cache, we ask the server to return
+ * whatever it thinks is the next entry. We then walk our cache looking
+ * for this particular cookie.
  */
-static inline
-long try_to_get_dirent_page(struct file *file, struct inode *inode,
- struct nfs_entry *entry)
+static int readdir_resync_cookie(nfs_readdir_descriptor_t *desc)
 {
- struct dentry *dir = file->f_dentry;
- struct page *page;
- __u32 *p;
- unsigned long index = entry->offset >> PAGE_CACHE_SHIFT;
- long res = 0;
- unsigned int dtsize = NFS_SERVER(inode)->dtsize;
- int plus = NFS_USE_READDIRPLUS(inode);
+ struct dentry *dir = desc->file->f_dentry;
+ struct inode *inode = dir->d_inode;
+ struct page *page = NULL;
+ u32 *p;
+ struct nfs_entry my_entry;
+ int status = -EIO;
 
- dfprintk(VFS, "NFS: try_to_get_dirent_page() reading directory page @ index %ld\n", index);
-
- page = grab_cache_page(&inode->i_data, index);
+ dfprintk(VFS, "NFS: readdir_resync_cookie() searching for cookie %Lu\n", (long long)desc->cookie);
 
+ page = page_cache_alloc();
         if (!page) {
- res = -ENOMEM;
+ status = -ENOMEM;
                 goto out;
         }
-
- if (Page_Uptodate(page)) {
- dfprintk(VFS, "NFS: try_to_get_dirent_page(): page already up to date.\n");
- goto unlock_out;
+ p = (u32 *)kmap(page);
+ status = NFS_PROTO(inode)->readdir(dir, desc->cookie, p,
+ NFS_SERVER(inode)->dtsize, 0);
+ if (status >= 0) {
+ p = desc->decode(p, &my_entry, 0);
+ if (IS_ERR(p))
+ status = PTR_ERR(p);
         }
-
- p = (__u32 *)kmap(page);
-
- if (dtsize > PAGE_CACHE_SIZE)
- dtsize = PAGE_CACHE_SIZE;
- res = NFS_PROTO(inode)->readdir(dir, entry->cookie, p, dtsize, plus);
-
         kunmap(page);
-
- if (res < 0)
- goto error;
- if (PageError(page))
- ClearPageError(page);
- SetPageUptodate(page);
-
- unlock_out:
- UnlockPage(page);
         page_cache_release(page);
- out:
- dfprintk(VFS, "NFS: try_to_get_dirent_page() returns %ld\n", res);
- return res;
- error:
- SetPageError(page);
- goto unlock_out;
-}
-
-/* Recover from a revalidation flush. The case here is that
- * the inode for the directory got invalidated somehow, and
- * all of our cached information is lost. In order to get
- * a correct cookie for the current readdir request from the
- * user, we must (re-)fetch all the older readdir page cache
- * entries.
- *
- * Returns < 0 if some error occurs.
- */
-static inline
-long refetch_to_readdir(struct file *file, struct inode *inode,
- loff_t off, struct nfs_entry *entry)
-{
- struct nfs_entry my_dirent,
- *dirent = &my_dirent;
- long res;
- int plus = NFS_USE_READDIRPLUS(inode),
- use_cookie = NFS_MONOTONE_COOKIES(inode),
- loop_count = 0;
-
- dfprintk(VFS, "NFS: refetch_to_readdir() searching for cookie %Ld\n", (long long)off);
- *dirent = *entry;
- entry->page = NULL;
-
- for (res = 0;res >= 0;) {
- if (loop_count++ > 200) {
- loop_count = 0;
- schedule();
- }
-
- /* Search for last cookie in page cache */
- res = search_cached_dirent_pages(inode, off, dirent);
-
- if (res >= 0) {
- /* Cookie was found */
- if ((use_cookie?dirent->cookie:dirent->offset) > off) {
- *entry = *dirent;
- dirent->page = NULL;
- break;
- }
- continue;
- }
-
- if (dirent->page)
- page_cache_release(dirent->page);
- dirent->page = NULL;
-
- if (res != -EIO) {
- *entry = *dirent;
- break;
- }
-
- /* Read in a new page */
- res = try_to_get_dirent_page(file, inode, dirent);
- if (res == -EBADCOOKIE) {
- memset(dirent, 0, sizeof(*dirent));
- nfs_zap_caches(inode);
- res = 0;
- }
- /* We requested READDIRPLUS, but the server doesn't grok it */
- if (plus && res == -ENOTSUPP) {
- NFS_FLAGS(inode) &= ~NFS_INO_ADVISE_RDPLUS;
- memset(dirent, 0, sizeof(*dirent));
- nfs_zap_caches(inode);
- plus = 0;
- res = 0;
- }
- }
- if (dirent->page)
- page_cache_release(dirent->page);
+ if (status < 0)
+ goto out;
 
- dfprintk(VFS, "NFS: refetch_to_readdir() returns %ld\n", res);
- return res;
+ /* NOTE: we use nfs_find_cookie() here */
+ desc->cookie = my_entry.cookie;
+ desc->page_index = 0;
+ desc->page_offset = 0;
+ memset(desc->entry, 0, sizeof(*desc->entry));
+ status = search_cached_dirent_pages(desc, nfs_find_cookie);
+ out:
+ dfprintk(VFS, "NFS: readdir_resync_cookie() returns %d\n", status);
+ return status;
 }
 
 /*
  * Once we've found the start of the dirent within a page: fill 'er up...
  */
-static
-int nfs_do_filldir(struct file *file, struct inode *inode,
- struct nfs_entry *entry, void *dirent, filldir_t filldir)
+static int nfs_do_filldir(nfs_readdir_descriptor_t *desc, void *dirent,
+ filldir_t filldir)
 {
- decode_dirent_t decode = NFS_PROTO(inode)->decode_dirent;
- struct page *page = entry->page;
- __u8 *p,
- *start;
- unsigned long base = page_offset(page),
- offset = entry->offset,
- pg_offset,
- fileid;
- int plus = NFS_USE_READDIRPLUS(inode),
- use_cookie = NFS_MONOTONE_COOKIES(inode),
- loop_count = 0,
+ struct file *file = desc->file;
+ struct nfs_entry *entry = desc->entry;
+ char *start = (char *)kmap(desc->page),
+ *p = start + desc->page_offset;
+ unsigned long fileid;
+ int loop_count = 0,
                         res = 0;
 
- dfprintk(VFS, "NFS: nfs_do_filldir() filling starting @ offset %ld\n", entry->offset);
- pg_offset = offset & ~PAGE_CACHE_MASK;
- start = (u8*)kmap(page);
- p = start + pg_offset;
+ dfprintk(VFS, "NFS: nfs_do_filldir() filling starting @ cookie %Lu\n", (long long)desc->cookie);
 
         for(;;) {
- /* Note: entry->prev contains the offset of the start of the
- * current dirent */
+ /* Note: entry->prev_cookie contains the cookie for
+ * retrieving the current dirent on the server */
                 fileid = nfs_fileid_to_ino_t(entry->ino);
- if (use_cookie)
- res = filldir(dirent, entry->name, entry->len, entry->prev_cookie, fileid);
- else
- res = filldir(dirent, entry->name, entry->len, entry->prev, fileid);
+ res = filldir(dirent, entry->name, entry->len,
+ entry->prev_cookie, fileid);
                 if (res < 0)
                         break;
- file->f_pos = (use_cookie) ? entry->cookie : entry->offset;
- p = (u8*)decode((__u32*)p, entry, plus);
- if (!p || IS_ERR(p))
+ file->f_pos = desc->cookie = entry->cookie;
+ p = (char *)desc->decode((u32 *)p, entry, desc->plus);
+ if (IS_ERR(p)) {
+ if (PTR_ERR(p) == -EAGAIN) {
+ desc->page_offset = 0;
+ desc->page_index ++;
+ }
                         break;
- pg_offset = p - start;
- entry->prev = entry->offset;
- entry->offset = base + pg_offset;
+ }
+ desc->page_offset = p - start;
                 if (loop_count++ > 200) {
                         loop_count = 0;
                         schedule();
                 }
         }
- kunmap(page);
+ kunmap(desc->page);
 
- dfprintk(VFS, "NFS: nfs_do_filldir() filling ended @ offset %ld; returning = %d\n", entry->offset, res);
+ dfprintk(VFS, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", (long long)desc->cookie, res);
         return res;
 }
 
@@ -393,10 +352,9 @@
 {
         struct dentry *dentry = filp->f_dentry;
         struct inode *inode = dentry->d_inode;
- struct page *page;
- struct nfs_entry my_entry,
- *entry = &my_entry;
- loff_t offset;
+ nfs_readdir_descriptor_t my_desc,
+ *desc = &my_desc;
+ struct nfs_entry my_entry;
         long res;
 
         res = nfs_revalidate(dentry);
@@ -409,36 +367,43 @@
          * read from the last dirent to revalidate f_pos
          * itself.
          */
- memset(entry, 0, sizeof(*entry));
-
- offset = filp->f_pos;
+ memset(desc, 0, sizeof(*desc));
+ memset(&my_entry, 0, sizeof(my_entry));
 
- while(!entry->eof) {
- res = search_cached_dirent_pages(inode, offset, entry);
+ desc->file = filp;
+ desc->cookie = filp->f_pos;
+ desc->entry = &my_entry;
+ desc->decode = NFS_PROTO(inode)->decode_dirent;
 
- if (res < 0) {
- if (entry->eof)
- break;
- res = refetch_to_readdir(filp, inode, offset, entry);
- if (res < 0)
+ while(!desc->entry->eof) {
+ res = search_cached_dirent_pages(desc, nfs_find_prev_cookie);
+ if (res == -EBADCOOKIE) {
+ /* This means either end of directory */
+ if (desc->entry->cookie == desc->cookie) {
+ res = 0;
                                 break;
+ }
+ /* Or that the server has 'lost' a cookie */
+ res = readdir_resync_cookie(desc);
                 }
+ if (res < 0)
+ break;
 
- page = entry->page;
- if (!page)
+ if (!desc->page)
                         printk(KERN_ERR "NFS: Missing page...\n");
- res = nfs_do_filldir(filp, inode, entry, dirent, filldir);
- page_cache_release(page);
- entry->page = NULL;
+ res = nfs_do_filldir(desc, dirent, filldir);
+ page_cache_release(desc->page);
+ desc->page = NULL;
                 if (res < 0) {
                         res = 0;
                         break;
                 }
- offset = filp->f_pos;
         }
- if (entry->page)
- page_cache_release(entry->page);
- if (res < 0 && res != -EBADCOOKIE)
+ if (desc->page)
+ page_cache_release(desc->page);
+ if (desc->error < 0)
+ return desc->error;
+ if (res < 0)
                 return res;
         return 0;
 }
diff -u --recursive --new-file linux-2.3.99-pre7-3/include/linux/nfs_xdr.h linux-2.3.99-pre7-3-devel/include/linux/nfs_xdr.h
--- linux-2.3.99-pre7-3/include/linux/nfs_xdr.h Sat Apr 1 18:04:27 2000
+++ linux-2.3.99-pre7-3-devel/include/linux/nfs_xdr.h Wed May 3 20:30:36 2000
@@ -106,7 +106,6 @@
  * Argument struct for decode_entry function
  */
 struct nfs_entry {
- struct page * page;
         __u64 ino;
         __u64 cookie,
                                 prev_cookie;
@@ -115,8 +114,6 @@
         int eof;
         struct nfs_fh fh;
         struct nfs_fattr fattr;
- unsigned long offset,
- prev;
 };
 
 /*

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.tux.org/lkml/



This archive was generated by hypermail 2b29 : Sun May 07 2000 - 21:00:14 EST