[PATCH 07/10] orangefs: support very large directories

From: Martin Brandenburg
Date: Fri Apr 07 2017 - 17:18:18 EST


This works by maintaining a linked list of pages which the directory
has been read into rather than one giant fixed-size buffer.

This replaces code which limits the total directory size to the total
amount that could be returned in one server request. Since filenames
are usually considerably shorter than the maximum, the old code could
usually handle several server requests before running out of space.

Signed-off-by: Martin Brandenburg <martin@xxxxxxxxxxxx>
---
fs/orangefs/dir.c | 246 ++++++++++++++++++++++++++++++++++++++----------------
1 file changed, 172 insertions(+), 74 deletions(-)

diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c
index c48220f..7afde1b 100644
--- a/fs/orangefs/dir.c
+++ b/fs/orangefs/dir.c
@@ -6,18 +6,15 @@
#include "orangefs-kernel.h"
#include "orangefs-bufmap.h"

-/*
- * There can be up to 512 directory entries. Each entry contains a four byte
- * string size, a string, and a 16 byte khandle. The string can be up to 256
- * bytes and is encoded with a trailing zero and four byte padding for the
- * khandle.
- */
-#define MAX_DIRECTORY ((4 + 257 + 3 + 16)*512)
+struct orangefs_dir_page {
+ struct orangefs_dir_page *next;
+ size_t len;
+};

struct orangefs_dir {
__u64 token;
- void *directory;
- size_t len;
+ struct orangefs_dir_page *page;
+ loff_t end;
int error;
};

@@ -32,20 +29,12 @@ struct orangefs_dir {
* be skipped to get to the directory data.
*/

-static int orangefs_dir_more(struct orangefs_inode_s *oi,
- struct orangefs_dir *od, struct dentry *dentry)
+static int do_readdir(struct orangefs_inode_s *oi, struct orangefs_dir *od,
+ struct dentry *dentry, struct orangefs_kernel_op_s *op)
{
- const size_t offset = sizeof(struct orangefs_readdir_response_s);
struct orangefs_readdir_response_s *resp;
- struct orangefs_kernel_op_s *op;
int bufi, r;

- op = op_alloc(ORANGEFS_VFS_OP_READDIR);
- if (!op) {
- od->error = -ENOMEM;
- return -ENOMEM;
- }
-
/*
* Despite the badly named field, readdir does not use shared memory.
* However, there are a limited number of readdir slots, which must be
@@ -61,7 +50,6 @@ static int orangefs_dir_more(struct orangefs_inode_s *oi,
again:
bufi = orangefs_readdir_index_get();
if (bufi < 0) {
- op_release(op);
od->error = bufi;
return bufi;
}
@@ -79,7 +67,6 @@ static int orangefs_dir_more(struct orangefs_inode_s *oi,
goto again;
} else if (r == -EIO) {
vfree(op->downcall.trailer_buf);
- op_release(op);
od->error = r;
return r;
}
@@ -87,75 +74,188 @@ static int orangefs_dir_more(struct orangefs_inode_s *oi,

if (r < 0) {
vfree(op->downcall.trailer_buf);
- op_release(op);
od->error = r;
return r;
} else if (op->downcall.status) {
vfree(op->downcall.trailer_buf);
- op_release(op);
od->error = op->downcall.status;
return op->downcall.status;
}

resp = (struct orangefs_readdir_response_s *)op->downcall.trailer_buf;
od->token = resp->token;
+ return 0;
+}

- if (od->len + op->downcall.trailer_size - offset <= MAX_DIRECTORY) {
- memcpy(od->directory + od->len,
- op->downcall.trailer_buf + offset,
- op->downcall.trailer_size - offset);
- od->len += op->downcall.trailer_size - offset;
- } else {
- /* This limit was chosen based on protocol limits. */
- gossip_err("orangefs_dir_more: userspace sent too much data\n");
- vfree(op->downcall.trailer_buf);
- op_release(op);
- od->error = -EIO;
- return -EIO;
+static int parse_readdir(struct orangefs_dir *od,
+ struct orangefs_kernel_op_s *op)
+{
+ const size_t offset = sizeof(struct orangefs_readdir_response_s);
+ size_t last, i = offset, count = 1;
+ struct orangefs_dir_page *page;
+
+ page = od->page;
+ while (page->next) {
+ page = page->next;
+ count++;
+ }
+
+ while (i < op->downcall.trailer_size) {
+ __u32 *len, padlen;
+ /* Calculate how much will fit into the current page. */
+ last = i;
+ while (i < op->downcall.trailer_size) {
+ len = (void *)op->downcall.trailer_buf + i;
+ padlen = (4 + *len + 1 + 16) +
+ (8 - (4 + *len + 1 + 16)%8)%8;
+ if (i + padlen > op->downcall.trailer_size) {
+ gossip_err(
+ "orangefs_dir_more: userspace corrupt");
+ return -EIO;
+ }
+ if (page->len + i + padlen - last >
+ PAGE_SIZE - sizeof *page)
+ break;
+ i += padlen;
+ }
+
+ memcpy((void *)page + sizeof *page + page->len,
+ op->downcall.trailer_buf + last, i - last);
+ page->len += i - last;
+ /*
+ * Must add 2. Once from the current page (count++ below has
+ * not been done here) and once to move past the final page.
+ * The iteration code moves to the beginning of the next page
+ * when it runs out, so the end pointer must point to the
+ * beginning of the page after the last page.
+ */
+ od->end = (count + 2) << PAGE_SHIFT;
+ BUG_ON(page->len > PAGE_SIZE - sizeof *page);
+
+ /* New page for next bit. */
+
+ /*
+ * Should this wait if there's more room in the page? Sounds
+ * hard and error-prone to attempt to measure and at best saves
+ * only a few kilobytes.
+ */
+
+ page->next = (void *)__get_free_page(GFP_KERNEL);
+ if (!page->next)
+ return -ENOMEM;
+ page = page->next;
+ count++;
+ page->next = NULL;
+ page->len = 0;
+
+ /* Must make progress. */
+ BUG_ON(last == i);
+ }
+
+ return 0;
+}
+
+static int orangefs_dir_more(struct orangefs_inode_s *oi,
+ struct orangefs_dir *od, struct dentry *dentry)
+{
+ struct orangefs_kernel_op_s *op;
+ int r;
+
+ op = op_alloc(ORANGEFS_VFS_OP_READDIR);
+ if (!op) {
+ od->error = -ENOMEM;
+ return -ENOMEM;
+ }
+ r = do_readdir(oi, od, dentry, op);
+ if (r) {
+ od->error = r;
+ goto out;
+ }
+ r = parse_readdir(od, op);
+ if (r) {
+ od->error = r;
+ goto out;
}

+ od->error = 0;
vfree(op->downcall.trailer_buf);
+out:
op_release(op);
- return 0;
+ return od->error;
}

-static int orangefs_dir_fill(struct orangefs_inode_s *oi,
- struct orangefs_dir *od, struct dentry *dentry, struct dir_context *ctx)
+static int fill_from_page(struct orangefs_dir_page *page,
+ struct dir_context *ctx)
{
struct orangefs_khandle *khandle;
__u32 *len, padlen;
loff_t i;
char *s;
- i = ctx->pos - 2;
- while (i < od->len) {
- if (od->len < i + sizeof *len)
- goto eio;
- len = od->directory + i;
+ i = ctx->pos & ~PAGE_MASK;
+
+ /* The file offset from userspace is too large. */
+ if (i > page->len)
+ return -EIO;
+
+ while (i < page->len) {
+ if (page->len < i + sizeof *len)
+ return -EIO;
+ len = (void *)page + sizeof *page + i;
/* len does not include trailing zero but padlen does. */
padlen = (*len + 1) + (4 - (*len + 1)%4)%4;
- if (od->len < i + sizeof *len + padlen + sizeof *khandle)
- goto eio;
- s = od->directory + i + sizeof *len;
+ if (page->len < i + sizeof *len + padlen + sizeof *khandle)
+ return -EIO;
+ s = (void *)page + sizeof *page + i + sizeof *len;
if (s[*len] != 0)
- goto eio;
- khandle = od->directory + i + sizeof *len + padlen;
-
+ return -EIO;
+ khandle = (void *)page + sizeof *page +
+ i + sizeof *len + padlen;
if (!dir_emit(ctx, s, *len, orangefs_khandle_to_ino(khandle),
DT_UNKNOWN))
return 0;
i += sizeof *len + padlen + sizeof *khandle;
i = i + (8 - i%8)%8;
- ctx->pos = i + 2;
+ BUG_ON(i > page->len);
+ ctx->pos = (ctx->pos & PAGE_MASK) | i;
+ }
+ return 1;
+}
+
+static int orangefs_dir_fill(struct orangefs_inode_s *oi,
+ struct orangefs_dir *od, struct dentry *dentry, struct dir_context *ctx)
+{
+ struct orangefs_dir_page *page;
+ size_t count;
+
+ count = ((ctx->pos & PAGE_MASK) >> PAGE_SHIFT) - 1;
+
+ page = od->page;
+ while (page->next && count) {
+ count--;
+ page = page->next;
+ }
+ /* This means the userspace file offset is invalid. */
+ if (count) {
+ od->error = -EIO;
+ return -EIO;
+ }
+
+ while (page && page->len) {
+ int r;
+ r = fill_from_page(page, ctx);
+ if (r < 0) {
+ od->error = r;
+ return r;
+ } else if (r == 0) {
+ /* Userspace buffer is full. */
+ break;
+ } else {
+ /* The page ran out of data. Move to the next page. */
+ ctx->pos = (ctx->pos & PAGE_MASK) + (1 << PAGE_SHIFT);
+ page = page->next;
+ }
}
- BUG_ON(i > od->len);
return 0;
-eio:
- /*
- * Here either data from userspace is corrupt or the application has
- * sought to an invalid location.
- */
- od->error = -EIO;
- return -EIO;
}

static int orangefs_dir_iterate(struct file *file, struct dir_context *ctx)
@@ -180,7 +280,7 @@ static int orangefs_dir_iterate(struct file *file, struct dir_context *ctx)
if (ctx->pos == 1) {
if (!dir_emit_dotdot(file, ctx))
return 0;
- ctx->pos++;
+ ctx->pos = 1 << PAGE_SHIFT;
}

r = 0;
@@ -189,17 +289,16 @@ static int orangefs_dir_iterate(struct file *file, struct dir_context *ctx)
* Must read more if the user has sought past what has been read so far.
* Stop a user who has sought past the end.
*/
- while (od->token != ORANGEFS_READDIR_END && ctx->pos - 2 > od->len) {
+ while (od->token != ORANGEFS_READDIR_END && ctx->pos > od->end) {
r = orangefs_dir_more(oi, od, dentry);
if (r)
return r;
}
- if (od->token == ORANGEFS_READDIR_END && ctx->pos - 2 > od->len) {
+ if (od->token == ORANGEFS_READDIR_END && ctx->pos > od->end)
return -EIO;
- }

/* Then try to fill if there's any left in the buffer. */
- if (ctx->pos - 2 < od->len) {
+ if (ctx->pos < od->end) {
r = orangefs_dir_fill(oi, od, dentry, ctx);
if (r)
return r;
@@ -224,16 +323,10 @@ static int orangefs_dir_open(struct inode *inode, struct file *file)
return -ENOMEM;
od = file->private_data;
od->token = ORANGEFS_READDIR_START;
- /*
- * XXX: It seems wasteful to allocate such a large buffer for
- * each request. Most will be much smaller.
- */
- od->directory = alloc_pages_exact(MAX_DIRECTORY, GFP_KERNEL);
- if (!od->directory) {
- kfree(file->private_data);
- return -ENOMEM;
- }
- od->len = 0;
+ od->page = (void *)__get_free_page(GFP_KERNEL);
+ od->page->next = NULL;
+ od->page->len = 0;
+ od->end = 1 << PAGE_SHIFT;
od->error = 0;
return 0;
}
@@ -241,8 +334,13 @@ static int orangefs_dir_open(struct inode *inode, struct file *file)
static int orangefs_dir_release(struct inode *inode, struct file *file)
{
struct orangefs_dir *od = file->private_data;
+ struct orangefs_dir_page *page = od->page;
orangefs_flush_inode(inode);
- free_pages_exact(od->directory, MAX_DIRECTORY);
+ do {
+ struct orangefs_dir_page *next = page->next;
+ free_page((unsigned long)page);
+ page = next;
+ } while (page);
kfree(od);
return 0;
}
--
2.1.4