[PATCH v4 5/5] fsnotify: require inode lock held during child flag update

From: Stephen Brennan
Date: Fri Nov 11 2022 - 17:07:28 EST


With the prior changes to fsnotify, it is now possible for
fsnotify_recalc_mask() to return before all children flags have been
set. Imagine that two CPUs attempt to add a mark to an inode which would
require watching the children of that directory:

CPU 1: CPU 2:

fsnotify_recalc_mask() {
spin_lock();
update_children = ...
__fsnotify_recalc_mask();
update_children = ...
spin_unlock();
// update_children is true!
fsnotify_conn_set_children_dentry_flags() {
// updating flags ...
cond_resched();

fsnotify_recalc_mask() {
spin_lock();
update_children = ...
__fsnotify_recalc_mask();
update_children = ...
spin_unlock();
// update_children is false
}
// returns to userspace, but
// not all children are marked
// continue updating flags
}
}

To prevent this situation, hold the directory inode lock. This ensures
that any concurrent update to the mask will block until the update is
complete, so that we can guarantee that child flags are set prior to
returning.

Since the directory inode lock is now held during iteration over
d_subdirs, we are guaranteed that __d_move() cannot remove the dentry we
hold, so we no longer need check whether we should retry iteration. We
also are guaranteed that no cursors are moving through the list, since
simple_readdir() holds the inode read lock. Simplify the iteration by
removing this logic.

Signed-off-by: Stephen Brennan <stephen.s.brennan@xxxxxxxxxx>
---
fs/notify/fsnotify.c | 25 +++++++++----------------
fs/notify/mark.c | 8 ++++++++
2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 0ba61211456c..b5778775b88d 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -102,6 +102,8 @@ void fsnotify_sb_delete(struct super_block *sb)
* on a child we run all of our children and set a dentry flag saying that the
* parent cares. Thus when an event happens on a child it can quickly tell
* if there is a need to find a parent and send the event to the parent.
+ *
+ * Context: inode locked exclusive
*/
void fsnotify_update_children_dentry_flags(struct inode *inode, bool watched)
{
@@ -124,22 +126,16 @@ void fsnotify_update_children_dentry_flags(struct inode *inode, bool watched)
* over d_subdirs which will allow us to sleep.
*/
spin_lock(&alias->d_lock);
-retry:
list_for_each_entry(child, &alias->d_subdirs, d_child) {
/*
- * We need to hold a reference while we sleep. But we cannot
- * sleep holding a reference to a cursor, or we risk skipping
- * through the list.
- *
- * When we wake, dput() could free the dentry, invalidating the
- * list pointers. We can't look at the list pointers until we
- * re-lock the parent, and we can't dput() once we have the
- * parent locked. So the solution is to hold onto our reference
- * and free it the *next* time we drop alias->d_lock: either at
- * the end of the function, or at the time of the next sleep.
+ * We need to hold a reference while we sleep. When we wake,
+ * dput() could free the dentry, invalidating the list pointers.
+ * We can't look at the list pointers until we re-lock the
+ * parent, and we can't dput() once we have the parent locked.
+ * So the solution is to hold onto our reference and free it the
+ * *next* time we drop alias->d_lock: either at the end of the
+ * function, or at the time of the next sleep.
*/
- if (child->d_flags & DCACHE_DENTRY_CURSOR)
- continue;
if (need_resched()) {
dget(child);
spin_unlock(&alias->d_lock);
@@ -147,9 +143,6 @@ void fsnotify_update_children_dentry_flags(struct inode *inode, bool watched)
last_ref = child;
cond_resched();
spin_lock(&alias->d_lock);
- /* Check for races with __d_move() */
- if (child->d_parent != alias)
- goto retry;
}

/*
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 6797a2952f87..f39cd88ad778 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -203,10 +203,15 @@ static void fsnotify_conn_set_children_dentry_flags(
void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
{
bool update_children;
+ struct inode *inode = NULL;

if (!conn)
return;

+ if (conn->type == FSNOTIFY_OBJ_TYPE_INODE) {
+ inode = fsnotify_conn_inode(conn);
+ inode_lock(inode);
+ }
spin_lock(&conn->lock);
update_children = !fsnotify_conn_watches_children(conn);
__fsnotify_recalc_mask(conn);
@@ -219,6 +224,9 @@ void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
*/
if (update_children)
fsnotify_conn_set_children_dentry_flags(conn);
+
+ if (inode)
+ inode_unlock(inode);
}

/* Free all connectors queued for freeing once SRCU period ends */
--
2.34.1