[RFC][PATCH 06/10] fs: Rework i_count

From: Peter Zijlstra
Date: Fri Feb 24 2017 - 13:32:43 EST


The inode count, i_count, does not conform to normal reference
counting semantics; it is a usage count. That is, objects can and do
live without any references (i_count == 0).

This is because the reference held by the inode hash table is not
accounted for. As a result, find_inode_fast() can perform 0->1
transitions and, similarly, iput() can do 1->0->1 transitions.
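
For example, the lazytime path in the current iput() is exactly such a
1->0->1 bounce, and find_inode_fast() does a plain __iget() on hashed
inodes whose count already sits at 0. Condensed from the unpatched
fs/inode.c (roughly, with some lines elided):

void iput(struct inode *inode)
{
	/* ... */
retry:
	/* 1 -> 0: the last usage drops the count and takes i_lock */
	if (!atomic_dec_and_lock(&inode->i_count, &inode->i_lock))
		return;

	if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
		/* 0 -> 1: revive the inode to write out the timestamps */
		atomic_inc(&inode->i_count);
		inode->i_state &= ~I_DIRTY_TIME;
		spin_unlock(&inode->i_lock);
		trace_writeback_lazytime_iput(inode);
		mark_inode_dirty_sync(inode);
		goto retry;
	}
	/* ... */
}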

This patch changes i_count to also account for the inode hash table's
reference and reworks iput() to avoid spurious drops to 0.
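
With the hash reference accounted for, the intended convention for a
hashed inode becomes roughly the following (a sketch of the invariants,
not literal kernel code):

	/*
	 * i_count >= 2 : hashed, plus one or more users
	 * i_count == 1 : hashed, no users left (used to be 0)
	 * i_count == 0 : final teardown has started; must not be revived
	 */
	atomic_set(&inode->i_count, 2);	/* hash reference + creating user */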

It does mean that we now call super_operations::drop_inode() with a
non-zero i_count, but I could not find an implementation where this
mattered. Only once we really decide to fully drop the inode and
remove it from the hash do we do the final dec_and_test.

This basically boils down to open-coding atomic_dec_and_lock() in
iput(), so that part of iput()'s logic can run between the decrement
and the final drop to zero.
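
Concretely, condensed from the patched iput() below, the open-coded
sequence looks roughly like this (the common case never takes i_lock):

	/* fast path: other users remain, just drop ours */
	if (atomic_add_unless(&inode->i_count, -1, 2))
		return;

	spin_lock(&inode->i_lock);
	/* ... lazytime writeback, no resurrection needed anymore ... */
	atomic_dec(&inode->i_count);	/* 2 -> 1, only the hash ref left */
	/* ... writeback / drop_inode decisions ... */

	/* 1 -> 0 only if nobody re-acquired a reference meanwhile */
	if (atomic_cmpxchg(&inode->i_count, 1, 0) != 1) {
		spin_unlock(&inode->i_lock);
		return;
	}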

Also, this would allow an RCU-based find_inode*().
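
For instance, a lookup could then run under rcu_read_lock() and take a
reference with atomic_inc_not_zero(), since 0 now reliably means the
inode is being torn down. A hypothetical sketch, not part of this
series (find_inode_rcu() is a made-up name, and the freeing side would
also have to become RCU-delayed):

/* Hypothetical illustration only. */
static struct inode *find_inode_rcu(struct super_block *sb,
				    struct hlist_head *head,
				    unsigned long ino)
{
	struct inode *inode;

	rcu_read_lock();
	hlist_for_each_entry_rcu(inode, head, i_hash) {
		if (inode->i_ino != ino || inode->i_sb != sb)
			continue;
		/* a failed increment means teardown has already started */
		if (!atomic_inc_not_zero(&inode->i_count))
			inode = NULL;
		break;
	}
	rcu_read_unlock();
	return inode;
}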

Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
fs/btrfs/inode.c | 2 +-
fs/inode.c | 24 +++++++++++++++++++-----
include/linux/fs.h | 11 ++++++++++-
3 files changed, 30 insertions(+), 7 deletions(-)

--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3172,7 +3172,7 @@ void btrfs_add_delayed_iput(struct inode
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_inode *binode = BTRFS_I(inode);

- if (atomic_add_unless(&inode->i_count, -1, 1))
+ if (atomic_add_unless(&inode->i_count, -1, 2))
return;

spin_lock(&fs_info->delayed_iput_lock);
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -135,7 +135,7 @@ int inode_init_always(struct super_block
inode->i_sb = sb;
inode->i_blkbits = sb->s_blocksize_bits;
inode->i_flags = 0;
- atomic_set(&inode->i_count, 1);
+ atomic_set(&inode->i_count, 2); /* hashed and ref */
inode->i_op = &empty_iops;
inode->i_fop = &no_open_fops;
inode->__i_nlink = 1;
@@ -387,7 +387,7 @@ static void init_once(void *foo)
void __iget(struct inode *inode)
{
lockdep_assert_held(&inode->i_lock);
- atomic_inc(&inode->i_count);
+ WARN_ON(!atomic_inc_not_zero(&inode->i_count));
}

/*
@@ -395,7 +395,7 @@ void __iget(struct inode *inode)
*/
void ihold(struct inode *inode)
{
- WARN_ON(atomic_inc_return(&inode->i_count) < 2);
+ WARN_ON(atomic_inc_return(&inode->i_count) < 3);
}
EXPORT_SYMBOL(ihold);

@@ -814,6 +814,8 @@ static struct inode *find_inode_fast(str
{
struct inode *inode = NULL;

+ lockdep_assert_held(&inode_hash_lock);
+
repeat:
hlist_for_each_entry(inode, head, i_hash) {
if (inode->i_ino != ino)
@@ -1488,11 +1490,12 @@ void iput(struct inode *inode)

BUG_ON(inode->i_state & I_CLEAR);
retry:
- if (!atomic_dec_and_lock(&inode->i_count, &inode->i_lock))
+ if (atomic_add_unless(&inode->i_count, -1, 2))
return;

+ spin_lock(&inode->i_lock);
+
if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
- atomic_inc(&inode->i_count);
inode->i_state &= ~I_DIRTY_TIME;
spin_unlock(&inode->i_lock);
trace_writeback_lazytime_iput(inode);
@@ -1500,6 +1503,7 @@ void iput(struct inode *inode)
goto retry;
}

+ atomic_dec(&inode->i_count); /* 2 -> 1 */
WARN_ON(inode->i_state & I_NEW);

/*
@@ -1520,6 +1524,16 @@ void iput(struct inode *inode)
spin_unlock(&inode->i_lock);
return;
}
+
+ /*
+ * If, at this point, only the hashtable has a reference left
+ * continue to take the inode out, otherwise someone got a ref
+ * while we weren't looking.
+ */
+ if (atomic_cmpxchg(&inode->i_count, 1, 0) != 1) {
+ spin_unlock(&inode->i_lock);
+ return;
+ }

if (!drop) {
inode->i_state |= I_WILL_FREE;
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2713,7 +2713,16 @@ extern unsigned int get_next_ino(void);

static inline int i_count(struct inode *inode)
{
- return atomic_read(&inode->i_count);
+ int i_count = atomic_read(&inode->i_count);
+
+ /*
+ * In order to preserve the 'old' usage-count semantics, remove the
+ * reference that the hash-table has.
+ */
+ if (i_count)
+ i_count--;
+
+ return i_count;
}

extern void __iget(struct inode * inode);