[CFT][PATCH] kernfs: Correct kernfs directory seeks.

From: Eric W. Biederman
Date: Sun Jun 03 2018 - 14:51:44 EST



Over the years two bugs have crept into the code that handled the last
returned kernfs directory being deleted, and handled general seeking
in kernfs directories. The result is that reading the /sys/module
directory while moduled are loading and unloading will sometimes
skip over a module that was present for the entire duration of
the directory read.

The first bug is that kernfs_dir_next_pos was advancing the position
in the directory even when kernfs_dir_pos had found the starting
kernfs directory entry was not usable. In this case kernfs_dir_pos is
guaranteed to return the directory entry past the starting directory
entry, making it so that advancing the directory position is wrong.

The second bug is that kernfs_dir_pos when the saved kernfs_node
did not validate, was not always returning a directory after
the the target position, but was sometimes returning the directory
entry just before the target position.

To correct these issues and to make the code easily readable and
comprehensible several cleanups are made.

- A function kernfs_dir_next is factored out to make it straight-forward
to find the next node in a kernfs directory.

- A function kernfs_dir_skip_pos is factored out to skip over directory
entries that should not be shown to userspace. This function is called
from kernfs_fop_readdir directly removing the complication of calling
it from the other directory advancement functions.

- The kernfs_put of the saved directory entry is moved from kernfs_dir_pos
to kernfs_fop_readdir. Keeping it with the kernfs_get when it is saved
and ensuring the directory position advancment functions can reference
the saved node without complications.

- The function kernfs_dir_next_pos is modified to only advance the directory
position in the common case when kernfs_dir_pos validates and returns
the starting kernfs node. For all other cases kernfs_dir_pos is guaranteed
to return a directory position in advance of the starting directory position.

- kernfs_dir_pos is given a significant make over. It's essense remains the
same but some siginficant changes were made.

+ The offset is validated to be a valid hash value at the very beginning.
The validation is updated to handle the fact that neither 0 nor 1 are
valid hash values.

+ An extra test is added at the end of the rbtree lookup to ensure
the returned position is at or beyond the target position.

+ kernfs_name_compare is used during the rbtree lookup instead of just comparing
the hash. This allows the the passed namespace parameter to be used, and
if it is available from the saved entry the old saved name as well.

+ The validation of the saved kernfs node now checks for two cases.
If the saved node is returnable.
If the name of the saved node is usable during lookup.

In net this should result in a easier to understand, and more correct
version of directory traversal for kernfs directories.

Reported-by: "Hatayama, Daisuke" <d.hatayama@xxxxxxxxxxxxxx>
Fixes: a406f75840e1 ("sysfs: use rb-tree for inode number lookup")
Fixes: 1e5289c97bba ("sysfs: Cache the last sysfs_dirent to improve readdir scalability v2")
Signed-off-by: "Eric W. Biederman" <ebiederm@xxxxxxxxxxxx>
---

Can you test this and please verify it fixes your issue?

fs/kernfs/dir.c | 109 ++++++++++++++++++++++++++++++++++----------------------
1 file changed, 67 insertions(+), 42 deletions(-)

diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 89d1dc19340b..8148b5fec48d 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -1584,53 +1584,75 @@ static int kernfs_dir_fop_release(struct inode *inode, struct file *filp)
return 0;
}

+static struct kernfs_node *kernfs_dir_next(struct kernfs_node *pos)
+{
+ struct rb_node *node = rb_next(&pos->rb);
+ return node ? rb_to_kn(node) : NULL;
+}
+
static struct kernfs_node *kernfs_dir_pos(const void *ns,
- struct kernfs_node *parent, loff_t hash, struct kernfs_node *pos)
+ struct kernfs_node *parent, loff_t off, struct kernfs_node *saved)
{
- if (pos) {
- int valid = kernfs_active(pos) &&
- pos->parent == parent && hash == pos->hash;
- kernfs_put(pos);
- if (!valid)
- pos = NULL;
- }
- if (!pos && (hash > 1) && (hash < INT_MAX)) {
- struct rb_node *node = parent->dir.children.rb_node;
- while (node) {
- pos = rb_to_kn(node);
-
- if (hash < pos->hash)
- node = node->rb_left;
- else if (hash > pos->hash)
- node = node->rb_right;
- else
- break;
+ struct kernfs_node *pos;
+ struct rb_node *node;
+ unsigned int hash;
+ const char *name = "";
+
+ /* Is off a valid name hash? */
+ if ((off < 2) || (off >= INT_MAX))
+ return NULL;
+ hash = off;
+
+ /* Is the saved position usable? */
+ if (saved) {
+ /* Proper parent and hash? */
+ if ((parent != saved->parent) || (saved->hash != hash)) {
+ saved = NULL;
+ } else {
+ if (kernfs_active(saved))
+ return saved;
+ name = saved->name;
}
}
- /* Skip over entries which are dying/dead or in the wrong namespace */
- while (pos && (!kernfs_active(pos) || pos->ns != ns)) {
- struct rb_node *node = rb_next(&pos->rb);
- if (!node)
- pos = NULL;
+
+ /* Find the closest pos to the hash we are looking for */
+ pos = NULL;
+ node = parent->dir.children.rb_node;
+ while (node) {
+ int result;
+
+ pos = rb_to_kn(node);
+ result = kernfs_name_compare(hash, name, ns, pos);
+ if (result < 0)
+ node = node->rb_left;
+ else if (result > 0)
+ node = node->rb_right;
else
- pos = rb_to_kn(node);
+ break;
}
+
+ /* Ensure pos is at or beyond the target position */
+ if (pos && (kernfs_name_compare(hash, name, ns, pos) < 0))
+ pos = kernfs_dir_next(pos);
+
return pos;
}

static struct kernfs_node *kernfs_dir_next_pos(const void *ns,
- struct kernfs_node *parent, ino_t ino, struct kernfs_node *pos)
+ struct kernfs_node *parent, loff_t off, struct kernfs_node *start)
{
- pos = kernfs_dir_pos(ns, parent, ino, pos);
- if (pos) {
- do {
- struct rb_node *node = rb_next(&pos->rb);
- if (!node)
- pos = NULL;
- else
- pos = rb_to_kn(node);
- } while (pos && (!kernfs_active(pos) || pos->ns != ns));
- }
+ struct kernfs_node *pos = kernfs_dir_pos(ns, parent, off, start);
+ if (likely(pos == start))
+ pos = kernfs_dir_next(pos);
+ return pos;
+}
+
+static struct kernfs_node *kernfs_dir_skip_pos(const void *ns,
+ struct kernfs_node *pos)
+{
+ /* Skip entries we don't want to return to userspace */
+ while (pos && !(kernfs_active(pos) && (pos->ns == ns)))
+ pos = kernfs_dir_next(pos);
return pos;
}

@@ -1638,7 +1660,7 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
{
struct dentry *dentry = file->f_path.dentry;
struct kernfs_node *parent = kernfs_dentry_node(dentry);
- struct kernfs_node *pos = file->private_data;
+ struct kernfs_node *pos, *saved = file->private_data;
const void *ns = NULL;

if (!dir_emit_dots(file, ctx))
@@ -1648,23 +1670,26 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx)
if (kernfs_ns_enabled(parent))
ns = kernfs_info(dentry->d_sb)->ns;

- for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos);
- pos;
+ for (pos = kernfs_dir_pos(ns, parent, ctx->pos, saved);
+ (pos = kernfs_dir_skip_pos(ns, pos));
pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) {
const char *name = pos->name;
unsigned int type = dt_type(pos);
int len = strlen(name);
ino_t ino = pos->id.ino;

- ctx->pos = pos->hash;
- file->private_data = pos;
- kernfs_get(pos);
+ kernfs_put(saved);
+ saved = pos;
+ ctx->pos = saved->hash;
+ file->private_data = saved;
+ kernfs_get(saved);

mutex_unlock(&kernfs_mutex);
if (!dir_emit(ctx, name, len, ino, type))
return 0;
mutex_lock(&kernfs_mutex);
}
+ kernfs_put(saved);
mutex_unlock(&kernfs_mutex);
file->private_data = NULL;
ctx->pos = INT_MAX;
--
2.14.1