Re: [PATCH v6 3/3] fs: allow lockless ->i_count bumps as long as it does not transition 0->1

From: Jan Kara

Date: Wed Apr 22 2026 - 06:28:28 EST


On Tue 21-04-26 20:25:38, Mateusz Guzik wrote:
> With this change only 0->1 and 1->0 transitions need the lock.
>
> I verified all places which look at the refcount either only care about
> it staying 0 (and have the lock enforce it) or don't hold the inode lock
> to begin with (making the above change irrelevant to their correcness or
> lack thereof).
>
> I also confirmed nfs and btrfs like to call into these a lot and now
> avoid the lock in the common case, shaving off some atomics.
>
> Signed-off-by: Mateusz Guzik <mjguzik@xxxxxxxxx>

Looks good. Feel free to add:

Reviewed-by: Jan Kara <jack@xxxxxxx>

Honza

> ---
> fs/dcache.c | 4 +++
> fs/inode.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++
> include/linux/fs.h | 4 +--
> 3 files changed, 70 insertions(+), 2 deletions(-)
>
> diff --git a/fs/dcache.c b/fs/dcache.c
> index df11bbba0342..13c81a6bb5e1 100644
> --- a/fs/dcache.c
> +++ b/fs/dcache.c
> @@ -2033,6 +2033,10 @@ void d_instantiate_new(struct dentry *entry, struct inode *inode)
> __d_instantiate(entry, inode);
> spin_unlock(&entry->d_lock);
> WARN_ON(!(inode_state_read(inode) & I_NEW));
> + /*
> + * Paired with igrab_from_hash()
> + */
> + smp_wmb();
> inode_state_clear(inode, I_NEW | I_CREATING);
> inode_wake_up_bit(inode, __I_NEW);
> spin_unlock(&inode->i_lock);
> diff --git a/fs/inode.c b/fs/inode.c
> index 17f0804b429c..39cb22e63d5b 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -1023,6 +1023,7 @@ long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
> }
>
> static void __wait_on_freeing_inode(struct inode *inode, bool hash_locked, bool rcu_locked);
> +static bool igrab_from_hash(struct inode *inode);
>
> /*
> * Called with the inode lock held.
> @@ -1047,6 +1048,11 @@ static struct inode *find_inode(struct super_block *sb,
> continue;
> if (!test(inode, data))
> continue;
> + if (igrab_from_hash(inode)) {
> + rcu_read_unlock();
> + *isnew = false;
> + return inode;
> + }
> spin_lock(&inode->i_lock);
> if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE)) {
> __wait_on_freeing_inode(inode, hash_locked, true);
> @@ -1089,6 +1095,11 @@ static struct inode *find_inode_fast(struct super_block *sb,
> continue;
> if (inode->i_sb != sb)
> continue;
> + if (igrab_from_hash(inode)) {
> + rcu_read_unlock();
> + *isnew = false;
> + return inode;
> + }
> spin_lock(&inode->i_lock);
> if (inode_state_read(inode) & (I_FREEING | I_WILL_FREE)) {
> __wait_on_freeing_inode(inode, hash_locked, true);
> @@ -1206,6 +1217,10 @@ void unlock_new_inode(struct inode *inode)
> lockdep_annotate_inode_mutex_key(inode);
> spin_lock(&inode->i_lock);
> WARN_ON(!(inode_state_read(inode) & I_NEW));
> + /*
> + * Paired with igrab_from_hash()
> + */
> + smp_wmb();
> inode_state_clear(inode, I_NEW | I_CREATING);
> inode_wake_up_bit(inode, __I_NEW);
> spin_unlock(&inode->i_lock);
> @@ -1217,6 +1232,10 @@ void discard_new_inode(struct inode *inode)
> lockdep_annotate_inode_mutex_key(inode);
> spin_lock(&inode->i_lock);
> WARN_ON(!(inode_state_read(inode) & I_NEW));
> + /*
> + * Paired with igrab_from_hash()
> + */
> + smp_wmb();
> inode_state_clear(inode, I_NEW);
> inode_wake_up_bit(inode, __I_NEW);
> spin_unlock(&inode->i_lock);
> @@ -1576,6 +1595,14 @@ EXPORT_SYMBOL(ihold);
>
> struct inode *igrab(struct inode *inode)
> {
> + /*
> + * Read commentary above igrab_from_hash() for an explanation why this works.
> + */
> + if (atomic_add_unless(&inode->i_count, 1, 0)) {
> + VFS_BUG_ON_INODE(inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE), inode);
> + return inode;
> + }
> +
> spin_lock(&inode->i_lock);
> if (!(inode_state_read(inode) & (I_FREEING | I_WILL_FREE))) {
> __iget(inode);
> @@ -1593,6 +1620,43 @@ struct inode *igrab(struct inode *inode)
> }
> EXPORT_SYMBOL(igrab);
>
> +/*
> + * igrab_from_hash - special inode refcount acquire primitive for the inode hash
> + *
> + * It provides lockless refcount acquire in the common case of no problematic
> + * flags being set and the count being > 0.
> + *
> + * There are 4 state flags to worry about and the routine makes sure to not bump the
> + * ref if any of them is present.
> + *
> + * I_NEW and I_CREATING can only legally get set *before* the inode becomes visible
> + * during lookup. Thus if the flags are not spotted, they are guaranteed to not be
> + * a factor. However, we need an acquire fence before returning the inode just
> + * in case we raced against clearing the state to make sure our consumer picks up
> + * any other changes made prior. atomic_add_unless provides a full fence, which
> + * takes care of it.
> + *
> + * I_FREEING and I_WILL_FREE can only legally get set if ->i_count == 0 and it is
> + * illegal to bump the ref if either is present. Consequently if atomic_add_unless
> + * managed to replace a non-0 value with a bigger one, we have a guarantee neither
> + * of these flags is set. Note this means explicitly checking of these flags below
> + * is not necessary, it is only done because it does not cost anything on top of the
> + * load which already needs to be done to handle the other flags.
> + */
> +static bool igrab_from_hash(struct inode *inode)
> +{
> + if (inode_state_read_once(inode) & (I_NEW | I_CREATING | I_FREEING | I_WILL_FREE))
> + return false;
> + /*
> + * Paired with routines clearing I_NEW
> + */
> + if (atomic_add_unless(&inode->i_count, 1, 0)) {
> + VFS_BUG_ON_INODE(inode_state_read_once(inode) & (I_FREEING | I_WILL_FREE), inode);
> + return true;
> + }
> + return false;
> +}
> +
> /**
> * ilookup5_nowait - search for an inode in the inode cache
> * @sb: super block of file system to search
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index a046ae84a227..bbe179b02234 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -2226,8 +2226,8 @@ static inline int icount_read_once(const struct inode *inode)
> }
>
> /*
> - * returns the refcount on the inode. The lock guarantees no new references
> - * are added, but references can be dropped as long as the result is > 0.
> + * returns the refcount on the inode. The lock guarantees no 0->1 or 1->0 transitions
> + * of the count are going to take place, otherwise it changes arbitrarily.
> */
> static inline int icount_read(const struct inode *inode)
> {
> --
> 2.48.1
>
--
Jan Kara <jack@xxxxxxxx>
SUSE Labs, CR