Re: [PATCH] fsnotify: fix inode reference leak in fsnotify_recalc_mask()
From: Amir Goldstein
Date: Mon Apr 20 2026 - 11:36:12 EST
On Mon, Apr 20, 2026 at 8:52 AM Xin Yin <yinxin.x@xxxxxxxxxxxxx> wrote:
>
> fsnotify_recalc_mask() fails to handle the return value of
> __fsnotify_recalc_mask(), which may return an inode pointer that needs
> to be released via fsnotify_drop_object() when the connector's HAS_IREF
> flag transitions from set to cleared.
>
> This manifests as a hung task with the following call trace:
>
> INFO: task umount:1234 blocked for more than 120 seconds.
> Call Trace:
> __schedule
> schedule
> fsnotify_sb_delete
> generic_shutdown_super
> kill_anon_super
> cleanup_mnt
> task_work_run
> do_exit
> do_group_exit
>
> The race window that triggers the iref leak:
>
> Thread A (adding mark) Thread B (removing mark)
> ────────────────────── ────────────────────────
> fsnotify_add_mark_locked():
> fsnotify_add_mark_list():
> spin_lock(conn->lock)
> add mark_B(evictable) to list
> spin_unlock(conn->lock)
> return
>
> /* ---- gap: no lock held ---- */
>
> fsnotify_detach_mark(mark_A):
> spin_lock(mark_A->lock)
> clear ATTACHED flag on mark_A
> spin_unlock(mark_A->lock)
>
> fsnotify_recalc_mask():
> spin_lock(conn->lock)
> __fsnotify_recalc_mask():
> /* mark_A skipped: ATTACHED cleared */
> /* only mark_B(evictable) remains */
> want_iref = false
> has_iref = true /* not yet cleared */
> -> HAS_IREF transitions true -> false
> -> returns inode pointer
> spin_unlock(conn->lock)
> /* BUG: return value discarded!
> * iput() and fsnotify_put_sb_watched_objects()
> * are never called */
>
Hi Xin!
Ouch! nasty one.
Very good analysis.
Do you also have a reproducer?
> Fix this by capturing the return value of __fsnotify_recalc_mask() and
> passing it to fsnotify_drop_object() after releasing the spinlock, which
> is the same pattern used in fsnotify_put_mark().
The fix looks correct, but I prefer for my mental model to keep
fsnotify_drop_object() in one place at the end of fsnotify_put_mark().
Please see the suggested (only sanity tested) patch.
Thanks,
Amir.
>
> Fixes: c3638b5b1374 ("fsnotify: allow adding an inode mark without pinning inode")
> Signed-off-by: Xin Yin <yinxin.x@xxxxxxxxxxxxx>
> ---
> fs/notify/mark.c | 8 +++++++-
> 1 file changed, 7 insertions(+), 1 deletion(-)
>
> diff --git a/fs/notify/mark.c b/fs/notify/mark.c
> index c2ed5b11b0fe6..cc93fcc2c5a9c 100644
> --- a/fs/notify/mark.c
> +++ b/fs/notify/mark.c
> @@ -283,6 +283,8 @@ static void fsnotify_conn_set_children_dentry_flags(
> fsnotify_set_children_dentry_flags(fsnotify_conn_inode(conn));
> }
>
> +static void fsnotify_drop_object(unsigned int type, void *objp);
> +
> /*
> * Calculate mask of events for a list of marks. The caller must make sure
> * connector and connector->obj cannot disappear under us. Callers achieve
> @@ -292,15 +294,19 @@ static void fsnotify_conn_set_children_dentry_flags(
> void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
> {
> bool update_children;
> + unsigned int type;
> + void *objp;
>
> if (!conn)
> return;
>
> spin_lock(&conn->lock);
> update_children = !fsnotify_conn_watches_children(conn);
> - __fsnotify_recalc_mask(conn);
> + objp = __fsnotify_recalc_mask(conn);
> + type = conn->type;
> update_children &= fsnotify_conn_watches_children(conn);
> spin_unlock(&conn->lock);
> + fsnotify_drop_object(type, objp);
> /*
> * Set children's PARENT_WATCHED flags only if parent started watching.
> * When parent stops watching, we clear false positive PARENT_WATCHED
> --
> 2.20.1
From 8cc3e7e2e75f5c93875bffce6bf8398d94398c33 Mon Sep 17 00:00:00 2001
From: Amir Goldstein <amir73il@xxxxxxxxx>
Date: Mon, 20 Apr 2026 14:58:00 +0200
Subject: [PATCH v2] fsnotify: fix inode reference leak in
fsnotify_recalc_mask()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
fsnotify_recalc_mask() fails to handle the return value of
__fsnotify_recalc_mask(), which may return an inode pointer that needs
to be released via fsnotify_drop_object() when the connector's HAS_IREF
flag transitions from set to cleared.
This manifests as a hung task with the following call trace:
INFO: task umount:1234 blocked for more than 120 seconds.
Call Trace:
__schedule
schedule
fsnotify_sb_delete
generic_shutdown_super
kill_anon_super
cleanup_mnt
task_work_run
do_exit
do_group_exit
The race window that triggers the iref leak:
Thread A (adding mark) Thread B (removing mark)
────────────────────── ────────────────────────
fsnotify_add_mark_locked():
fsnotify_add_mark_list():
spin_lock(conn->lock)
add mark_B(evictable) to list
spin_unlock(conn->lock)
return
/* ---- gap: no lock held ---- */
fsnotify_detach_mark(mark_A):
spin_lock(mark_A->lock)
clear ATTACHED flag on mark_A
spin_unlock(mark_A->lock)
fsnotify_put_mark(mark_A)
fsnotify_recalc_mask():
spin_lock(conn->lock)
__fsnotify_recalc_mask():
/* mark_A skipped: ATTACHED cleared */
/* only mark_B(evictable) remains */
want_iref = false
has_iref = true /* not yet cleared */
-> HAS_IREF transitions true -> false
-> returns inode pointer
spin_unlock(conn->lock)
/* BUG: return value discarded!
* iput() and fsnotify_put_sb_watched_objects()
* are never called */
Fix this by deferring the transition true -> false of HAS_IREF flag from
fsnotify_recalc_mask() (Thread A) to fsnotify_put_mark() (thread B).
Fixes: c3638b5b1374 ("fsnotify: allow adding an inode mark without pinning inode")
Signed-off-by: Xin Yin <yinxin.x@xxxxxxxxxxxxx>
Signed-off-by: Amir Goldstein <amir73il@xxxxxxxxx>
---
fs/notify/mark.c | 39 ++++++++++++++++++++++++++++++++++++---
1 file changed, 36 insertions(+), 3 deletions(-)
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 622f05977f86a..e256b420100dc 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -238,7 +238,12 @@ static struct inode *fsnotify_update_iref(struct fsnotify_mark_connector *conn,
return inode;
}
-static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
+/*
+ * Calculate mask of events for a list of marks.
+ *
+ * Return true if any of the attached marks want to hold an inode reference.
+ */
+static bool __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
{
u32 new_mask = 0;
bool want_iref = false;
@@ -262,6 +267,34 @@ static void *__fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
*/
WRITE_ONCE(*fsnotify_conn_mask_p(conn), new_mask);
+ return want_iref;
+}
+
+/*
+ * Calculate mask of events for a list of marks after attach/modify mark
+ * and get an inode reference for the connector if needed.
+ *
+ * A concurrent add of evictable mark and detach of non-evictable mark can
+ * lead to __fsnotify_recalc_mask() returning false want_iref, but in this
+ * case we defer clearing iref to fsnotify_recalc_mask_clear_iref() called
+ * from fsnotify_put_mark().
+ */
+static void fsnotify_recalc_mask_set_iref(struct fsnotify_mark_connector *conn)
+{
+ bool has_iref = conn->flags & FSNOTIFY_CONN_FLAG_HAS_IREF;
+ bool want_iref = __fsnotify_recalc_mask(conn) || has_iref;
+
+ (void) fsnotify_update_iref(conn, want_iref);
+}
+
+/*
+ * Calculate mask of events for a list of marks after detach mark
+ * and return the inode object if its reference is no longer needed.
+ */
+static void *fsnotify_recalc_mask_clear_iref(struct fsnotify_mark_connector *conn)
+{
+ bool want_iref = __fsnotify_recalc_mask(conn);
+
return fsnotify_update_iref(conn, want_iref);
}
@@ -298,7 +331,7 @@ void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
spin_lock(&conn->lock);
update_children = !fsnotify_conn_watches_children(conn);
- __fsnotify_recalc_mask(conn);
+ fsnotify_recalc_mask_set_iref(conn);
update_children &= fsnotify_conn_watches_children(conn);
spin_unlock(&conn->lock);
/*
@@ -419,7 +452,7 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
/* Update watched objects after detaching mark */
if (sb)
fsnotify_update_sb_watchers(sb, conn);
- objp = __fsnotify_recalc_mask(conn);
+ objp = fsnotify_recalc_mask_clear_iref(conn);
type = conn->type;
}
WRITE_ONCE(mark->connector, NULL);
--
2.53.0