[PATCH] exfat: change i_pos based exfat hash to rb tree

From: Hyeongseok Kim
Date: Sun Mar 28 2021 - 20:33:44 EST


Hashtable for identifying inode by i_pos (cluster+dentry) have slow
search performance when there are lots of cached inode. The slowness
is from the limited count of slots showing that, in average,
(inode count / slots) times hlist traverse, and it becomes worse when
cached inode count increases by such as directory iteration.
To improve this, change from hash table to rb tree to minimize
searching/iterate time which enables O(logN) time complexity.

Test : "time ls -l"
+--------+------------+------------+-----------+
| | file count | fresh read | cached |
+--------+------------+------------+-----------+
| Before | 50,000 | 0m06.59s | 0m01.58s |
+--------+------------+------------+-----------+
| After | 50,000 | 0m05.20s | 0m00.98s |
+--------+------------+------------+-----------+
| Before | 300,000 | 1m28.97s | 0m31.69s |
+--------+------------+------------+-----------+
| After | 300,000 | 0m33.11s | 0m06.21s |
+--------+------------+------------+-----------+

Signed-off-by: Hyeongseok Kim <hyeongseok@xxxxxxxxx>
---
fs/exfat/exfat_fs.h | 12 +++---
fs/exfat/inode.c | 91 +++++++++++++++++++++++++++++++--------------
fs/exfat/namei.c | 10 ++---
fs/exfat/super.c | 16 ++++----
4 files changed, 82 insertions(+), 47 deletions(-)

diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index 1d6da61157c9..f8ad8cbf8499 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -243,8 +243,8 @@ struct exfat_sb_info {
struct nls_table *nls_io; /* Charset used for input and display */
struct ratelimit_state ratelimit;

- spinlock_t inode_hash_lock;
- struct hlist_head inode_hashtable[EXFAT_HASH_SIZE];
+ spinlock_t inode_tree_lock; /* lock for inode_tree structure */
+ struct rb_root inode_tree;

struct rcu_head rcu;
};
@@ -289,8 +289,8 @@ struct exfat_inode_info {
loff_t i_size_aligned;
/* on-disk position of directory entry or 0 */
loff_t i_pos;
- /* hash by i_location */
- struct hlist_node i_hash_fat;
+ /* tree by i_location */
+ struct rb_node rbnode;
/* protect bmap against truncate */
struct rw_semaphore truncate_lock;
struct inode vfs_inode;
@@ -476,8 +476,8 @@ extern const struct inode_operations exfat_file_inode_operations;
void exfat_sync_inode(struct inode *inode);
struct inode *exfat_build_inode(struct super_block *sb,
struct exfat_dir_entry *info, loff_t i_pos);
-void exfat_hash_inode(struct inode *inode, loff_t i_pos);
-void exfat_unhash_inode(struct inode *inode);
+void exfat_inode_tree_insert(struct inode *inode, loff_t i_pos);
+void exfat_inode_tree_erase(struct inode *inode);
struct inode *exfat_iget(struct super_block *sb, loff_t i_pos);
int exfat_write_inode(struct inode *inode, struct writeback_control *wbc);
void exfat_evict_inode(struct inode *inode);
diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c
index 1803ef3220fd..740a34f528ae 100644
--- a/fs/exfat/inode.c
+++ b/fs/exfat/inode.c
@@ -501,50 +501,87 @@ static const struct address_space_operations exfat_aops = {
.bmap = exfat_aop_bmap
};

-static inline unsigned long exfat_hash(loff_t i_pos)
+static struct exfat_inode_info *exfat_inode_tree_find(struct super_block *sb,
+ loff_t i_pos)
{
- return hash_32(i_pos, EXFAT_HASH_BITS);
+ struct exfat_sb_info *sbi = EXFAT_SB(sb);
+ struct rb_node *node = sbi->inode_tree.rb_node;
+ struct exfat_inode_info *info;
+
+ spin_lock(&sbi->inode_tree_lock);
+ while (node) {
+ info = rb_entry(node, struct exfat_inode_info, rbnode);
+ WARN_ON(info->vfs_inode.i_sb != sb);
+
+ if (i_pos == info->i_pos) {
+ spin_unlock(&sbi->inode_tree_lock);
+ return info;
+ }
+
+ if (i_pos < info->i_pos)
+ node = node->rb_left;
+ else /* i_pos > info->i_pos */
+ node = node->rb_right;
+ }
+ spin_unlock(&sbi->inode_tree_lock);
+ return NULL;
}

-void exfat_hash_inode(struct inode *inode, loff_t i_pos)
+void exfat_inode_tree_insert(struct inode *inode, loff_t i_pos)
{
struct exfat_sb_info *sbi = EXFAT_SB(inode->i_sb);
- struct hlist_head *head = sbi->inode_hashtable + exfat_hash(i_pos);
+ struct exfat_inode_info *info, *ei = EXFAT_I(inode);
+ struct rb_root *root = &sbi->inode_tree;
+ struct rb_node **rb_ptr = &root->rb_node;
+ struct rb_node *parent = NULL;
+
+ spin_lock(&sbi->inode_tree_lock);
+ ei->i_pos = i_pos;
+ while (*rb_ptr) {
+ parent = *rb_ptr;
+ info = rb_entry(*rb_ptr, struct exfat_inode_info, rbnode);
+ if (i_pos == info->i_pos) {
+ /* already exists */
+ rb_replace_node(*rb_ptr, &ei->rbnode, &sbi->inode_tree);
+ RB_CLEAR_NODE(*rb_ptr);
+ spin_unlock(&sbi->inode_tree_lock);
+ return;
+ }

- spin_lock(&sbi->inode_hash_lock);
- EXFAT_I(inode)->i_pos = i_pos;
- hlist_add_head(&EXFAT_I(inode)->i_hash_fat, head);
- spin_unlock(&sbi->inode_hash_lock);
+ if (i_pos < info->i_pos)
+ rb_ptr = &(*rb_ptr)->rb_left;
+ else /* (i_pos > info->i_pos) */
+ rb_ptr = &(*rb_ptr)->rb_right;
+ }
+
+ rb_link_node(&ei->rbnode, parent, rb_ptr);
+ rb_insert_color(&ei->rbnode, root);
+ spin_unlock(&sbi->inode_tree_lock);
}

-void exfat_unhash_inode(struct inode *inode)
+void exfat_inode_tree_erase(struct inode *inode)
{
struct exfat_sb_info *sbi = EXFAT_SB(inode->i_sb);
+ struct exfat_inode_info *ei = EXFAT_I(inode);
+ struct rb_root *root = &sbi->inode_tree;

- spin_lock(&sbi->inode_hash_lock);
- hlist_del_init(&EXFAT_I(inode)->i_hash_fat);
- EXFAT_I(inode)->i_pos = 0;
- spin_unlock(&sbi->inode_hash_lock);
+ spin_lock(&sbi->inode_tree_lock);
+ if (!RB_EMPTY_NODE(&ei->rbnode)) {
+ rb_erase(&ei->rbnode, root);
+ RB_CLEAR_NODE(&ei->rbnode);
+ }
+ spin_unlock(&sbi->inode_tree_lock);
}

struct inode *exfat_iget(struct super_block *sb, loff_t i_pos)
{
- struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_inode_info *info;
- struct hlist_head *head = sbi->inode_hashtable + exfat_hash(i_pos);
struct inode *inode = NULL;

- spin_lock(&sbi->inode_hash_lock);
- hlist_for_each_entry(info, head, i_hash_fat) {
- WARN_ON(info->vfs_inode.i_sb != sb);
-
- if (i_pos != info->i_pos)
- continue;
+ info = exfat_inode_tree_find(sb, i_pos);
+ if (info)
inode = igrab(&info->vfs_inode);
- if (inode)
- break;
- }
- spin_unlock(&sbi->inode_hash_lock);
+
return inode;
}

@@ -634,7 +671,7 @@ struct inode *exfat_build_inode(struct super_block *sb,
inode = ERR_PTR(err);
goto out;
}
- exfat_hash_inode(inode, i_pos);
+ exfat_inode_tree_insert(inode, i_pos);
insert_inode_hash(inode);
out:
return inode;
@@ -654,5 +691,5 @@ void exfat_evict_inode(struct inode *inode)
invalidate_inode_buffers(inode);
clear_inode(inode);
exfat_cache_inval_inode(inode);
- exfat_unhash_inode(inode);
+ exfat_inode_tree_erase(inode);
}
diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c
index 24b41103d1cc..10ed9b35fd86 100644
--- a/fs/exfat/namei.c
+++ b/fs/exfat/namei.c
@@ -827,7 +827,7 @@ static int exfat_unlink(struct inode *dir, struct dentry *dentry)
clear_nlink(inode);
inode->i_mtime = inode->i_atime = current_time(inode);
exfat_truncate_atime(&inode->i_atime);
- exfat_unhash_inode(inode);
+ exfat_inode_tree_erase(inode);
exfat_d_version_set(dentry, inode_query_iversion(dir));
unlock:
mutex_unlock(&EXFAT_SB(sb)->s_lock);
@@ -993,7 +993,7 @@ static int exfat_rmdir(struct inode *dir, struct dentry *dentry)
clear_nlink(inode);
inode->i_mtime = inode->i_atime = current_time(inode);
exfat_truncate_atime(&inode->i_atime);
- exfat_unhash_inode(inode);
+ exfat_inode_tree_erase(inode);
exfat_d_version_set(dentry, inode_query_iversion(dir));
unlock:
mutex_unlock(&EXFAT_SB(inode->i_sb)->s_lock);
@@ -1363,8 +1363,8 @@ static int exfat_rename(struct user_namespace *mnt_userns,

i_pos = ((loff_t)EXFAT_I(old_inode)->dir.dir << 32) |
(EXFAT_I(old_inode)->entry & 0xffffffff);
- exfat_unhash_inode(old_inode);
- exfat_hash_inode(old_inode, i_pos);
+ exfat_inode_tree_erase(old_inode);
+ exfat_inode_tree_insert(old_inode, i_pos);
if (IS_DIRSYNC(new_dir))
exfat_sync_inode(old_inode);
else
@@ -1384,7 +1384,7 @@ static int exfat_rename(struct user_namespace *mnt_userns,
mark_inode_dirty(old_dir);

if (new_inode) {
- exfat_unhash_inode(new_inode);
+ exfat_inode_tree_erase(new_inode);

/* skip drop_nlink if new_inode already has been dropped */
if (new_inode->i_nlink) {
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index d38d17a77e76..8f197432c57b 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -317,14 +317,12 @@ static int exfat_parse_param(struct fs_context *fc, struct fs_parameter *param)
return 0;
}

-static void exfat_hash_init(struct super_block *sb)
+static void exfat_inode_tree_init(struct super_block *sb)
{
struct exfat_sb_info *sbi = EXFAT_SB(sb);
- int i;

- spin_lock_init(&sbi->inode_hash_lock);
- for (i = 0; i < EXFAT_HASH_SIZE; i++)
- INIT_HLIST_HEAD(&sbi->inode_hashtable[i]);
+ spin_lock_init(&sbi->inode_tree_lock);
+ sbi->inode_tree = RB_ROOT;
}

static int exfat_read_root(struct inode *inode)
@@ -648,8 +646,8 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
goto check_nls_io;
}

- /* set up enough so that it can read an inode */
- exfat_hash_init(sb);
+ /* set up exfat inode tree */
+ exfat_inode_tree_init(sb);

if (!strcmp(sbi->options.iocharset, "utf8"))
opts->utf8 = 1;
@@ -683,7 +681,7 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc)
goto put_inode;
}

- exfat_hash_inode(root_inode, EXFAT_I(root_inode)->i_pos);
+ exfat_inode_tree_insert(root_inode, EXFAT_I(root_inode)->i_pos);
insert_inode_hash(root_inode);

sb->s_root = d_make_root(root_inode);
@@ -786,7 +784,7 @@ static void exfat_inode_init_once(void *foo)
ei->nr_caches = 0;
ei->cache_valid_id = EXFAT_CACHE_VALID + 1;
INIT_LIST_HEAD(&ei->cache_lru);
- INIT_HLIST_NODE(&ei->i_hash_fat);
+ RB_CLEAR_NODE(&ei->rbnode);
inode_init_once(&ei->vfs_inode);
}

--
2.27.0.83.g0313f36