Re: [PATCH v3 2/3] fsnotify: Protect i_fsnotify_mask and child flags with inode rwsem

From: Amir Goldstein
Date: Fri Oct 28 2022 - 05:11:26 EST


On Fri, Oct 28, 2022 at 3:10 AM Stephen Brennan
<stephen.s.brennan@xxxxxxxxxx> wrote:
>
> When an inode is interested in events on its children, it must set
> DCACHE_FSNOTIFY_PARENT_WATCHED flag on all its children. Currently, when
> the fsnotify connector is removed and i_fsnotify_mask becomes zero, we
> lazily allow __fsnotify_parent() to do this the next time we see an
> event on a child.
>
> However, if the list of children is very long (e.g., in the millions),
> and lots of activity is occurring on the directory, then it's possible
> for many CPUs to end up blocked on the inode spinlock in
> __fsnotify_update_child_flags(). Each CPU will then redundantly iterate
> over the very long list of children. This situation can cause soft
> lockups.
>
> To avoid this, stop lazily updating child flags in __fsnotify_parent().
> Protect the child flag update with i_rwsem held exclusive, to ensure
> that we only iterate over the child list when it's absolutely necessary,
> and even then, only once.
>
> Signed-off-by: Stephen Brennan <stephen.s.brennan@xxxxxxxxxx>
> ---

Looking good

Reviewed-by: Amir Goldstein <amir73il@xxxxxxxxx>

Regardless, some nits on comment styles...

>
> Notes:
> v3 changes:
>
> * Moved fsnotify_update_children_dentry_flags() into fsnotify.c,
> declared it in the header. Made
> __fsnotify_update_children_dentry_flags() static since it has no
> external callers except fsnotify_update...().
> * Use bitwise xor operator in children_need_update()
> * Eliminated FSNOTIFY_OBJ_FLAG_* constants, reused CONN_FLAG_*
> * Updated documentation of fsnotify_update_inode_conn_flags() to
> reflect its behavior
> * Renamed "flags" to "update_flags" in all its uses, so that it's a
> clear pattern and matches renamed fsnotify_update_object().
>
> fs/notify/fsnotify.c | 45 ++++++++++-
> fs/notify/fsnotify.h | 13 +++-
> fs/notify/mark.c | 124 ++++++++++++++++++++-----------
> include/linux/fsnotify_backend.h | 1 +
> 4 files changed, 137 insertions(+), 46 deletions(-)
>
> diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
> index 7939aa911931..ccb8a3a6c522 100644
> --- a/fs/notify/fsnotify.c
> +++ b/fs/notify/fsnotify.c
> @@ -103,13 +103,15 @@ void fsnotify_sb_delete(struct super_block *sb)
> * parent cares. Thus when an event happens on a child it can quickly tell
> * if there is a need to find a parent and send the event to the parent.
> */
> -void __fsnotify_update_child_dentry_flags(struct inode *inode)
> +static bool __fsnotify_update_children_dentry_flags(struct inode *inode)
> {
> struct dentry *alias, *child;
> int watched;
>
> if (!S_ISDIR(inode->i_mode))
> - return;
> + return false;
> +
> + lockdep_assert_held_write(&inode->i_rwsem);
>
> /* determine if the children should tell inode about their events */
> watched = fsnotify_inode_watches_children(inode);
> @@ -136,6 +138,43 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode)
> }
> spin_unlock(&alias->d_lock);
> dput(alias);
> + return watched;
> +}
> +
> +void fsnotify_update_children_dentry_flags(struct fsnotify_mark_connector *conn,
> + struct inode *inode)
> +{
> + bool need_update;
> +
> + inode_lock(inode);
> + spin_lock(&conn->lock);
> + need_update = fsnotify_children_need_update(conn, inode);
> + spin_unlock(&conn->lock);
> + if (need_update) {
> + bool watched = __fsnotify_update_children_dentry_flags(inode);
> +
> + spin_lock(&conn->lock);
> + if (watched)
> + conn->flags |= FSNOTIFY_CONN_FLAG_WATCHES_CHILDREN;
> + else
> + conn->flags &= ~FSNOTIFY_CONN_FLAG_WATCHES_CHILDREN;
> + spin_unlock(&conn->lock);
> + }
> + inode_unlock(inode);
> +}
> +
> +
> +static void fsnotify_update_child_dentry_flags(struct inode *inode, struct dentry *dentry)
> +{
> + /*
> + * Flag would be cleared soon by
> + * __fsnotify_update_child_dentry_flags(), but as an
> + * optimization, clear it now.
> + */

Line breaks are weird.
Seems that this comment is out of context.
It is only true in the context of the specific call site.
Either remove it or move it.

> + spin_lock(&dentry->d_lock);
> + if (!fsnotify_inode_watches_children(inode))
> + dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED;
> + spin_unlock(&dentry->d_lock);
> }
>
> /* Are inode/sb/mount interested in parent and name info with this event? */
> @@ -206,7 +245,7 @@ int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data,
> p_inode = parent->d_inode;
> p_mask = fsnotify_inode_watches_children(p_inode);
> if (unlikely(parent_watched && !p_mask))
> - __fsnotify_update_child_dentry_flags(p_inode);
> + fsnotify_update_child_dentry_flags(p_inode, dentry);
>
> /*
> * Include parent/name in notification either if some notification
> diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h
> index fde74eb333cc..621e78a6f0fb 100644
> --- a/fs/notify/fsnotify.h
> +++ b/fs/notify/fsnotify.h
> @@ -70,11 +70,22 @@ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb)
> fsnotify_destroy_marks(&sb->s_fsnotify_marks);
> }
>
> +static inline bool fsnotify_children_need_update(struct fsnotify_mark_connector *conn,
> + struct inode *inode)
> +{
> + bool watched, flags_set;
> +
> + watched = fsnotify_inode_watches_children(inode);
> + flags_set = conn->flags & FSNOTIFY_CONN_FLAG_WATCHES_CHILDREN;
> + return watched ^ flags_set;
> +}
> +
> /*
> * update the dentry->d_flags of all of inode's children to indicate if inode cares
> * about events that happen to its children.
> */
> -extern void __fsnotify_update_child_dentry_flags(struct inode *inode);
> +extern void fsnotify_update_children_dentry_flags(struct fsnotify_mark_connector *conn,
> + struct inode *inode);
>
> extern struct kmem_cache *fsnotify_mark_connector_cachep;
>
> diff --git a/fs/notify/mark.c b/fs/notify/mark.c
> index c74ef947447d..8969128dacc1 100644
> --- a/fs/notify/mark.c
> +++ b/fs/notify/mark.c
> @@ -123,37 +123,52 @@ static void fsnotify_get_inode_ref(struct inode *inode)
> }
>
> /*
> - * Grab or drop inode reference for the connector if needed.
> + * Determine the connector flags that it is necessary to update
> *
> - * When it's time to drop the reference, we only clear the HAS_IREF flag and
> - * return the inode object. fsnotify_drop_object() will be resonsible for doing
> - * iput() outside of spinlocks. This happens when last mark that wanted iref is
> - * detached.
> + * If any action needs to be taken on the connector's inode outside of a spinlock,
> + * we return the inode and set *update_flags accordingly.
> + *
> + * If FSNOTIFY_CONN_FLAG_HAS_IREF is set in *update_flags, then the caller needs
> + * to drop the last inode reference using fsnotify_put_inode_ref().
> + *
> + * If FSNOTIFY_CONN_FLAG_WATCHES_CHILDREN is set in *update_flags, then the
> + * caller needs to update the children dentry flags so that their
> + * DCACHE_FSNOTIFY_PARENT_WATCHED flag matches the i_fsnotify_mask value, using
> + * fsnotify_update_children_dentry_flags().
> */
> -static struct inode *fsnotify_update_iref(struct fsnotify_mark_connector *conn,
> - bool want_iref)
> +static struct inode *fsnotify_update_inode_conn_flags(struct fsnotify_mark_connector *conn,
> + bool want_iref, int *update_flags)
> {
> bool has_iref = conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF;
> - struct inode *inode = NULL;
> + struct inode *inode = NULL, *ret = NULL;
>
> - if (conn->type != FSNOTIFY_OBJ_TYPE_INODE ||
> - want_iref == has_iref)
> + if (conn->type != FSNOTIFY_OBJ_TYPE_INODE)
> return NULL;
>
> - if (want_iref) {
> - /* Pin inode if any mark wants inode refcount held */
> - fsnotify_get_inode_ref(fsnotify_conn_inode(conn));
> - conn->flags |= FSNOTIFY_CONN_FLAG_HAS_IREF;
> - } else {
> - /* Unpin inode after detach of last mark that wanted iref */
> - inode = fsnotify_conn_inode(conn);
> - conn->flags &= ~FSNOTIFY_CONN_FLAG_HAS_IREF;
> + inode = fsnotify_conn_inode(conn);
> +
> + if (want_iref != has_iref) {
> + if (want_iref) {
> + /* Pin inode if any mark wants inode refcount held */
> + fsnotify_get_inode_ref(inode);
> + conn->flags |= FSNOTIFY_CONN_FLAG_HAS_IREF;
> + } else {
> + /* Unpin inode after detach of last mark that wanted iref */
> + conn->flags &= ~FSNOTIFY_CONN_FLAG_HAS_IREF;
> + ret = inode;
> + *update_flags |= FSNOTIFY_CONN_FLAG_HAS_IREF;
> + }
> + }
> + if (fsnotify_children_need_update(conn, inode)) {
> + ret = inode;
> + *update_flags |= FSNOTIFY_CONN_FLAG_WATCHES_CHILDREN;
> }
>
> - return inode;
> + return ret;
> }
>
> -static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
> +static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn,
> + int *update_flags)
> {
> u32 new_mask = 0;
> bool want_iref = false;
> @@ -173,7 +188,7 @@ static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
> }
> *fsnotify_conn_mask_p(conn) = new_mask;
>
> - return fsnotify_update_iref(conn, want_iref);
> + return fsnotify_update_inode_conn_flags(conn, want_iref, update_flags);
> }
>
> /*
> @@ -184,15 +199,19 @@ static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
> */
> void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
> {
> + struct inode *inode = NULL;
> + int flags = 0;
> +
> if (!conn)
> return;
>
> spin_lock(&conn->lock);
> - __fsnotify_recalc_mask(conn);
> + inode = __fsnotify_recalc_mask(conn, &flags);
> spin_unlock(&conn->lock);
> - if (conn->type == FSNOTIFY_OBJ_TYPE_INODE)
> - __fsnotify_update_child_dentry_flags(
> - fsnotify_conn_inode(conn));
> +
> + if (flags & FSNOTIFY_CONN_FLAG_WATCHES_CHILDREN)
> + fsnotify_update_children_dentry_flags(conn, inode);
> + WARN_ON_ONCE(flags & FSNOTIFY_CONN_FLAG_HAS_IREF);
> }
>
> /* Free all connectors queued for freeing once SRCU period ends */
> @@ -240,7 +259,8 @@ static void fsnotify_put_sb_connectors(struct fsnotify_mark_connector *conn)
>
> static void *fsnotify_detach_connector_from_object(
> struct fsnotify_mark_connector *conn,
> - unsigned int *type)
> + unsigned int *type,
> + unsigned int *update_flags)
> {
> struct inode *inode = NULL;
>
> @@ -252,8 +272,10 @@ static void *fsnotify_detach_connector_from_object(
> inode = fsnotify_conn_inode(conn);
> inode->i_fsnotify_mask = 0;
>
> - /* Unpin inode when detaching from connector */
> - if (!(conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF))
> + *update_flags = conn->flags &
> + (FSNOTIFY_CONN_FLAG_HAS_IREF |
> + FSNOTIFY_CONN_FLAG_WATCHES_CHILDREN);
> + if (!*update_flags)
> inode = NULL;
> } else if (conn->type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
> fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0;
> @@ -279,15 +301,37 @@ static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark)
> fsnotify_put_group(group);
> }
>
> -/* Drop object reference originally held by a connector */
> -static void fsnotify_drop_object(unsigned int type, void *objp)
> +/* Apply the update_flags for a connector after recalculating mask */
> +static void fsnotify_update_object(struct fsnotify_mark_connector *conn,
> + unsigned int type, void *objp,
> + int update_flags)
> {
> if (!objp)
> return;
> /* Currently only inode references are passed to be dropped */
> if (WARN_ON_ONCE(type != FSNOTIFY_OBJ_TYPE_INODE))
> return;
> - fsnotify_put_inode_ref(objp);
> +
> + if (update_flags & FSNOTIFY_CONN_FLAG_WATCHES_CHILDREN)
> + /*.

Stray .

> + * At this point, we've already detached the connector from the
> + * inode. It's entirely possible that another connector has been
> + * attached, and that connector would assume that the children's
> + * flags are all clear. There are two possibilities:
> + * (a) The connector has not yet attached a mark that watches its
> + * children. In this case, we will properly clear out the flags,
> + * and the connector's flags will be consistent with the
> + * children.
> + * (b) The connector attaches a mark that watches its children.
> + * It may have even already altered i_fsnotify_mask and/or
> + * altered the child dentry flags. In this case, our call here
> + * will read the correct value of i_fsnotify_mask and apply it
> + * to the children, which duplicates some work, but isn't
> + * harmful.
> + */

Multi line section after if should have {} even if there is only a
single line of
code (checkpatch should complain?).

But I personally dislike large and deeply indented comments,
so if it were me, I would change the wording a bit
"...When detaching a connector from an inode that watches children,
there are two possibilities:..."
and move the huge comment above the if ().

Thanks,
Amir.