[PATCH 2/2] fs/ext4/namei.c: reducing contention on s_orphan_lock mmutex

From: T Makphaibulchoke
Date: Wed Oct 02 2013 - 11:37:25 EST


Instead of using a single per super block mutex, s_orphan_lock, to serialize
all orphan list updates, a separate mutex and spinlock are used to
protect the on disk and in memory orphan lists respecvitely.

At the same time, a per inode mutex is used to serialize orphan list
updates of a single inode.

Signed-off-by: T. Makphaibulchoke <tmac@xxxxxx>
---
fs/ext4/namei.c | 139 ++++++++++++++++++++++++++++++++++++++++----------------
1 file changed, 100 insertions(+), 39 deletions(-)

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 35f55a0..d7d3d0f 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2554,12 +2554,24 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
struct super_block *sb = inode->i_sb;
struct ext4_iloc iloc;
int err = 0, rc;
+ struct ext4_inode_info *ext4_inode = EXT4_I(inode);
+ struct ext4_sb_info *ext4_sb = EXT4_SB(sb);
+ struct mutex *inode_orphan_mutex;
+ spinlock_t *orphan_lock;
+ struct mutex *ondisk_orphan_mutex;
+ struct list_head *prev;
+ unsigned int next_inum;
+ struct inode *next_inode;

- if (!EXT4_SB(sb)->s_journal)
+ if (ext4_sb->s_journal)
return 0;

- mutex_lock(&EXT4_SB(sb)->s_orphan_lock);
- if (!list_empty(&EXT4_I(inode)->i_orphan))
+ inode_orphan_mutex = &ext4_inode->i_orphan_mutex;
+ orphan_lock = &ext4_sb->s_orphan_lock;
+ ondisk_orphan_mutex = &ext4_sb->s_ondisk_orphan_lock;
+ mutex_lock(inode_orphan_mutex);
+ prev = ext4_inode->i_orphan.prev;
+ if (prev != &ext4_inode->i_orphan)
goto out_unlock;

/*
@@ -2579,17 +2591,28 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
err = ext4_reserve_inode_write(handle, inode, &iloc);
if (err)
goto out_unlock;
+ mutex_lock(ondisk_orphan_mutex);
/*
* Due to previous errors inode may be already a part of on-disk
* orphan list. If so skip on-disk list modification.
*/
if (NEXT_ORPHAN(inode) && NEXT_ORPHAN(inode) <=
- (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)))
+ (le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count))) {
+ mutex_unlock(ondisk_orphan_mutex);
goto mem_insert;
+ }

/* Insert this inode at the head of the on-disk orphan list... */
- NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
- EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
+ next_inum = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan);
+ NEXT_ORPHAN(inode) = next_inum;
+ next_inode = ilookup(sb, next_inum);
+ if (next_inode)
+ EXT4_I(next_inode)->i_prev_orphan = inode->i_ino;
+ ext4_sb->s_es->s_last_orphan = cpu_to_le32(inode->i_ino);
+ ext4_inode->i_prev_orphan = 0;
+ mutex_unlock(ondisk_orphan_mutex);
+ if (next_inode)
+ iput(next_inode);
err = ext4_handle_dirty_super(handle, sb);
rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
if (!err)
@@ -2604,14 +2627,17 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
* This is safe: on error we're going to ignore the orphan list
* anyway on the next recovery. */
mem_insert:
- if (!err)
- list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
+ if (!err) {
+ spin_lock(orphan_lock);
+ list_add(&ext4_inode->i_orphan, &ext4_sb->s_orphan);
+ spin_unlock(orphan_lock);
+ }

jbd_debug(4, "superblock will point to %lu\n", inode->i_ino);
jbd_debug(4, "orphan inode %lu will point to %d\n",
inode->i_ino, NEXT_ORPHAN(inode));
out_unlock:
- mutex_unlock(&EXT4_SB(sb)->s_orphan_lock);
+ mutex_unlock(inode_orphan_mutex);
ext4_std_error(inode->i_sb, err);
return err;
}
@@ -2624,71 +2650,106 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
{
struct list_head *prev;
struct ext4_inode_info *ei = EXT4_I(inode);
- struct ext4_sb_info *sbi;
+ struct inode *i_next = NULL;
+ struct inode *i_prev = NULL;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__u32 ino_next;
+ unsigned int ino_prev;
struct ext4_iloc iloc;
int err = 0;
+ struct mutex *inode_orphan_mutex;
+ spinlock_t *orphan_lock;
+ struct mutex *ondisk_orphan_mutex;

- if ((!EXT4_SB(inode->i_sb)->s_journal) &&
- !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS))
+ if ((!sbi->s_journal) &&
+ !(sbi->s_mount_state & EXT4_ORPHAN_FS))
return 0;

- mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
- if (list_empty(&ei->i_orphan))
- goto out;
-
- ino_next = NEXT_ORPHAN(inode);
+ inode_orphan_mutex = &ei->i_orphan_mutex;
+ orphan_lock = &sbi->s_orphan_lock;
+ ondisk_orphan_mutex = &sbi->s_ondisk_orphan_lock;
+ mutex_lock(inode_orphan_mutex);
prev = ei->i_orphan.prev;
- sbi = EXT4_SB(inode->i_sb);
+ if (prev == &ei->i_orphan) {
+ mutex_unlock(inode_orphan_mutex);
+ return err;
+ }

jbd_debug(4, "remove inode %lu from orphan list\n", inode->i_ino);

+ spin_lock(orphan_lock);
list_del_init(&ei->i_orphan);
+ spin_unlock(orphan_lock);

/* If we're on an error path, we may not have a valid
* transaction handle with which to update the orphan list on
* disk, but we still need to remove the inode from the linked
* list in memory. */
- if (!handle)
- goto out;
+ if (!handle) {
+ mutex_unlock(inode_orphan_mutex);
+ return err;
+ }

err = ext4_reserve_inode_write(handle, inode, &iloc);
- if (err)
+ if (err) {
+ mutex_unlock(inode_orphan_mutex);
goto out_err;
+ }

- if (prev == &sbi->s_orphan) {
+ mutex_lock(ondisk_orphan_mutex);
+ ino_next = NEXT_ORPHAN(inode);
+ if (ino_next > 0) {
+ i_next = ilookup(inode->i_sb, ino_next);
+ assert(i_next);
+ }
+ ino_prev = ei->i_prev_orphan;
+ if (!ino_prev) {
jbd_debug(4, "superblock will point to %u\n", ino_next);
BUFFER_TRACE(sbi->s_sbh, "get_write_access");
err = ext4_journal_get_write_access(handle, sbi->s_sbh);
- if (err)
- goto out_brelse;
- sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
- err = ext4_handle_dirty_super(handle, inode->i_sb);
+ if (!err) {
+ sbi->s_es->s_last_orphan = cpu_to_le32(ino_next);
+ if (i_next)
+ EXT4_I(i_next)->i_prev_orphan = 0;
+ }
+ mutex_unlock(ondisk_orphan_mutex);
+ if (!err)
+ err = ext4_handle_dirty_super(handle, inode->i_sb);
} else {
struct ext4_iloc iloc2;
- struct inode *i_prev =
- &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode;
-
- jbd_debug(4, "orphan inode %lu will point to %u\n",
- i_prev->i_ino, ino_next);
- err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
- if (err)
- goto out_brelse;
- NEXT_ORPHAN(i_prev) = ino_next;
- err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
+ i_prev = ilookup(inode->i_sb, ino_prev);
+
+ assert(i_prev);
+ if (i_prev) {
+ jbd_debug(4, "orphan inode %lu will point to %u\n",
+ i_prev->i_ino, ino_next);
+ err = ext4_reserve_inode_write(handle, i_prev, &iloc2);
+ } else
+ err = -ENOENT;
+ if (!err) {
+ NEXT_ORPHAN(i_prev) = ino_next;
+ if (i_next)
+ EXT4_I(i_next)->i_prev_orphan = ino_prev;
+ }
+ mutex_unlock(ondisk_orphan_mutex);
+ if (i_prev && !err)
+ err = ext4_mark_iloc_dirty(handle, i_prev, &iloc2);
}
if (err)
goto out_brelse;
NEXT_ORPHAN(inode) = 0;
+ mutex_unlock(inode_orphan_mutex);
err = ext4_mark_iloc_dirty(handle, inode, &iloc);
-
out_err:
+ if (i_next)
+ iput(i_next);
+ if (i_prev)
+ iput(i_prev);
ext4_std_error(inode->i_sb, err);
-out:
- mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
return err;

out_brelse:
+ mutex_unlock(inode_orphan_mutex);
brelse(iloc.bh);
goto out_err;
}
--
1.7.11.3

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/