[PATCH RFC v2 1/2] filemap: defer dropbehind invalidation from IRQ context
From: Tal Zussman
Date: Wed Feb 25 2026 - 17:42:01 EST
folio_end_dropbehind() is called from folio_end_writeback(), which can
run in IRQ context through buffer_head completion.
Previously, when folio_end_dropbehind() detected !in_task(), it skipped
the invalidation entirely. This meant that folios marked for dropbehind
via RWF_DONTCACHE would remain in the page cache after writeback
completed from IRQ context, defeating the purpose of the flag.
Fix this by deferring the dropbehind invalidation to a work item. When
folio_end_dropbehind() is called from IRQ context, the folio is added to
a global folio_batch and the work item is scheduled. The worker drains
the batch, locking each folio and calling filemap_end_dropbehind(), and
re-drains if new folios arrived while processing.
This unblocks enabling RWF_DONTCACHE for block devices and other
buffer_head-based I/O.
Signed-off-by: Tal Zussman <tz2294@xxxxxxxxxxxx>
---
mm/filemap.c | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 79 insertions(+), 5 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index ebd75684cb0a..6263f35c5d13 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1085,6 +1085,8 @@ static const struct ctl_table filemap_sysctl_table[] = {
}
};
+static void __init dropbehind_init(void); /* defined below with the dropbehind state */
+
void __init pagecache_init(void)
{
int i;
@@ -1092,6 +1094,7 @@ void __init pagecache_init(void)
for (i = 0; i < PAGE_WAIT_TABLE_SIZE; i++)
init_waitqueue_head(&folio_wait_table[i]);
+ dropbehind_init(); /* set up batch + work item before any writeback completes */
page_writeback_init();
register_sysctl_init("vm", filemap_sysctl_table);
}
@@ -1613,23 +1616,94 @@ static void filemap_end_dropbehind(struct folio *folio)
* If folio was marked as dropbehind, then pages should be dropped when writeback
* completes. Do that now. If we fail, it's likely because of a big folio -
* just reset dropbehind for that case and latter completions should invalidate.
+ *
+ * When called from IRQ context (e.g. buffer_head completion), we cannot lock
+ * the folio and invalidate. Defer to a workqueue so that callers like
+ * end_buffer_async_write() that complete in IRQ context still get their folios
+ * pruned.
*/
+static DEFINE_SPINLOCK(dropbehind_lock); /* protects dropbehind_fbatch */
+static struct folio_batch dropbehind_fbatch; /* folios queued from non-task completion */
+static struct work_struct dropbehind_work; /* drains dropbehind_fbatch in task context */
+/* Worker: drain dropbehind_fbatch and invalidate each deferred folio. */
+static void dropbehind_work_fn(struct work_struct *w)
+{
+ struct folio_batch fbatch;
+
+again:
+ spin_lock_irq(&dropbehind_lock);
+ fbatch = dropbehind_fbatch; /* struct copy: take over the batched references */
+ folio_batch_reinit(&dropbehind_fbatch);
+ spin_unlock_irq(&dropbehind_lock);
+
+ for (int i = 0; i < folio_batch_count(&fbatch); i++) {
+ struct folio *folio = fbatch.folios[i];
+
+ if (folio_trylock(folio)) {
+ filemap_end_dropbehind(folio);
+ folio_unlock(folio);
+ }
+ folio_put(folio); /* drop the reference taken in folio_end_dropbehind_irq() */
+ }
+
+ /* Re-check under the lock: completions may have batched more folios. */
+ spin_lock_irq(&dropbehind_lock);
+ if (folio_batch_count(&dropbehind_fbatch)) {
+ spin_unlock_irq(&dropbehind_lock);
+ goto again;
+ }
+ spin_unlock_irq(&dropbehind_lock);
+}
+/* Called once from pagecache_init() before the batch or work item is used. */
+static void __init dropbehind_init(void)
+{
+ folio_batch_init(&dropbehind_fbatch);
+ INIT_WORK(&dropbehind_work, dropbehind_work_fn);
+}
+/* Queue @folio for deferred invalidation; usable from IRQ context (irqsave). */
+static void folio_end_dropbehind_irq(struct folio *folio)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&dropbehind_lock, flags);
+
+ /* No space left in the folio_batch: skip the invalidation (best-effort). */
+ if (!folio_batch_space(&dropbehind_fbatch)) {
+ spin_unlock_irqrestore(&dropbehind_lock, flags);
+ return;
+ }
+
+ folio_get(folio); /* reference is dropped by dropbehind_work_fn() */
+ folio_batch_add(&dropbehind_fbatch, folio);
+ spin_unlock_irqrestore(&dropbehind_lock, flags);
+
+ schedule_work(&dropbehind_work);
+}
+
void folio_end_dropbehind(struct folio *folio)
{
if (!folio_test_dropbehind(folio))
return;
/*
- * Hitting !in_task() should not happen off RWF_DONTCACHE writeback,
- * but can happen if normal writeback just happens to find dirty folios
- * that were created as part of uncached writeback, and that writeback
- * would otherwise not need non-IRQ handling. Just skip the
- * invalidation in that case.
+ * Hitting !in_task() can happen for IO completed from IRQ contexts or
+ * if normal writeback just happens to find dirty folios that were
+ * created as part of uncached writeback, and that writeback would
+ * otherwise not need non-IRQ handling.
*/
if (in_task() && folio_trylock(folio)) {
filemap_end_dropbehind(folio);
folio_unlock(folio);
+ return; /* invalidated inline from task context */
}
+
+ /*
+ * In IRQ context the folio cannot be locked here; defer the
+ * invalidation to the dropbehind worker. Buffer_head writeback
+ * completes from bio IRQ context and takes this path.
+ */
+ if (!in_task())
+ folio_end_dropbehind_irq(folio);
}
EXPORT_SYMBOL_GPL(folio_end_dropbehind);
--
2.39.5