[PATCH v5 2/2] mm: kick writeback flusher for IOCB_DONTCACHE with targeted dirty tracking

From: Jeff Layton

Date: Mon May 04 2026 - 09:31:20 EST


The IOCB_DONTCACHE writeback path in generic_write_sync() calls
filemap_flush_range() on every write, submitting writeback inline in
the writer's context. Perf lock contention profiling shows the
performance problem is not lock contention but the writeback submission
work itself — walking the page tree and submitting I/O blocks the writer
for milliseconds, inflating p99.9 latency from 23ms (buffered) to 93ms
(dontcache).

Replace the inline filemap_flush_range() call with a flusher kick that
drains dirty pages in the background. This moves writeback submission
completely off the writer's hot path.

To avoid flushing unrelated buffered dirty data, add a dedicated
WB_start_dontcache bit and wb_check_start_dontcache() handler that uses
the per-wb WB_DONTCACHE_DIRTY counter to determine how many pages to
write back. The flusher writes back that many pages from the oldest dirty
inodes (not restricted to dontcache-specific inodes). This helps
preserve I/O batching while limiting the scope of expedited writeback.

Like WB_start_all, the WB_start_dontcache bit coalesces multiple
DONTCACHE writes into a single flusher wakeup without per-write
allocations.

Also add WB_REASON_DONTCACHE as a new writeback reason for tracing
visibility, and target the correct cgroup writeback domain via
unlocked_inode_to_wb_begin().

dontcache-bench results (same host, T6F_SKL_1920GBF, 251 GiB RAM,
xfs on NVMe, fio io_uring):

Buffered and direct I/O paths are unaffected by this patchset. All
improvements are confined to the dontcache path:

Single-stream throughput (MB/s):
                          Before     After    Change
  seq-write/dontcache        298       897     +201%
  rand-write/dontcache       131       236      +80%

Tail latency improvements (seq-write/dontcache):
p99: 135,266 us -> 23,986 us (-82%)
p99.9: 8,925,479 us -> 28,443 us (-99.7%)

Multi-writer (4 jobs, sequential write):
                                Before     After    Change
  dontcache aggregate (MB/s)     2,529     4,532      +79%
  dontcache p99 (us)             8,553     1,002      -88%
  dontcache p99.9 (us)         109,314     1,057      -99%

Dontcache multi-writer throughput now matches buffered (4,532 vs
4,616 MB/s).

32-file write (Axboe test):
                                Before     After    Change
  dontcache aggregate (MB/s)     1,548     3,499     +126%
  dontcache p99 (us)            10,170       602      -94%
  Peak dirty pages (MB)          1,837       213      -88%

Dontcache now reaches 81% of buffered throughput (was 35%).

Competing writers (dontcache vs buffered, separate files):
                          Before     After
  buffered writer             868       433 MB/s
  dontcache writer            415       433 MB/s
  Aggregate                 1,284       866 MB/s

Previously the buffered writer starved the dontcache writer 2:1.
With per-bdi_writeback tracking, both writers now receive equal
bandwidth. The aggregate matches the buffered-vs-buffered baseline
(863 MB/s), indicating fair sharing regardless of I/O mode.

The dontcache writer's p99.9 latency collapsed from 119 ms to
33 ms (-73%), eliminating the severe periodic stalls seen in the
baseline. Both writers now share identical latency profiles,
matching the buffered-vs-buffered pattern.

The per-bdi_writeback dirty tracking dramatically reduces peak dirty
pages in dontcache workloads, with the 32-file test dropping from
1.8 GB to 213 MB. Dontcache sequential write throughput triples and
multi-writer throughput reaches parity with buffered I/O, with tail
latencies collapsing by 1-2 orders of magnitude.

Assisted-by: Claude:claude-opus-4-6
Reviewed-by: Jan Kara <jack@xxxxxxx>
Reviewed-by: Jens Axboe <axboe@xxxxxxxxx>
Signed-off-by: Jeff Layton <jlayton@xxxxxxxxxx>
---
fs/fs-writeback.c | 65 ++++++++++++++++++++++++++++++++++++++++
include/linux/backing-dev-defs.h | 2 ++
include/linux/fs.h | 6 ++--
include/trace/events/writeback.h | 3 +-
4 files changed, 71 insertions(+), 5 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index a65694cbfe68..ebef485b2f8b 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1334,6 +1334,18 @@ static void wb_start_writeback(struct bdi_writeback *wb, enum wb_reason reason)
wb_wakeup(wb);
}

+static void wb_start_dontcache_writeback(struct bdi_writeback *wb)
+{
+ if (!wb_has_dirty_io(wb))
+ return;
+
+ if (test_bit(WB_start_dontcache, &wb->state) ||
+ test_and_set_bit(WB_start_dontcache, &wb->state))
+ return;
+
+ wb_wakeup(wb);
+}
+
/**
* wb_start_background_writeback - start background writeback
* @wb: bdi_writback to write from
@@ -2373,6 +2385,28 @@ static long wb_check_start_all(struct bdi_writeback *wb)
return nr_pages;
}

+static long wb_check_start_dontcache(struct bdi_writeback *wb)
+{
+ long nr_pages;
+
+ if (!test_bit(WB_start_dontcache, &wb->state))
+ return 0;
+
+ nr_pages = wb_stat(wb, WB_DONTCACHE_DIRTY);
+ if (nr_pages) {
+ struct wb_writeback_work work = {
+ .nr_pages = nr_pages,
+ .sync_mode = WB_SYNC_NONE,
+ .range_cyclic = 1,
+ .reason = WB_REASON_DONTCACHE,
+ };
+
+ nr_pages = wb_writeback(wb, &work);
+ }
+
+ clear_bit(WB_start_dontcache, &wb->state);
+ return nr_pages;
+}

/*
* Retrieve work items and do the writeback they describe
@@ -2394,6 +2428,11 @@ static long wb_do_writeback(struct bdi_writeback *wb)
*/
wrote += wb_check_start_all(wb);

+ /*
+ * Check for dontcache writeback request
+ */
+ wrote += wb_check_start_dontcache(wb);
+
/*
* Check for periodic writeback, kupdated() style
*/
@@ -2468,6 +2507,32 @@ void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi,
rcu_read_unlock();
}

+/**
+ * filemap_dontcache_kick_writeback - kick flusher for IOCB_DONTCACHE writes
+ * @mapping: address_space that was just written to
+ *
+ * Kick the writeback flusher thread to expedite writeback of dontcache dirty
+ * pages. Queue writeback for the inode's wb for as many pages as there are
+ * dontcache pages, but don't restrict writeback to dontcache pages only.
+ *
+ * This significantly improves performance over either writing all wb's pages
+ * or writing only dontcache pages. Although it doesn't guarantee quick
+ * writeback and reclaim of dontcache pages, it keeps the amount of dirty pages
+ * in check. Over longer term dontcache pages get written and reclaimed by
+ * background writeback even with this rough heuristic.
+ */
+void filemap_dontcache_kick_writeback(struct address_space *mapping)
+{
+ struct inode *inode = mapping->host;
+ struct bdi_writeback *wb;
+ struct wb_lock_cookie cookie = {};
+
+ wb = unlocked_inode_to_wb_begin(inode, &cookie);
+ wb_start_dontcache_writeback(wb);
+ unlocked_inode_to_wb_end(inode, &cookie);
+}
+EXPORT_SYMBOL_GPL(filemap_dontcache_kick_writeback);
+
/*
* Wakeup the flusher threads to start writeback of all currently dirty pages
*/
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index cb660dd37286..4f1084937315 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -26,6 +26,7 @@ enum wb_state {
WB_writeback_running, /* Writeback is in progress */
WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */
WB_start_all, /* nr_pages == 0 (all) work pending */
+ WB_start_dontcache, /* dontcache writeback pending */
};

enum wb_stat_item {
@@ -56,6 +57,7 @@ enum wb_reason {
*/
WB_REASON_FORKER_THREAD,
WB_REASON_FOREIGN_FLUSH,
+ WB_REASON_DONTCACHE,

WB_REASON_MAX,
};
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 11559c513dfb..df72b42a9e9b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2624,6 +2624,7 @@ extern int __must_check file_write_and_wait_range(struct file *file,
loff_t start, loff_t end);
int filemap_flush_range(struct address_space *mapping, loff_t start,
loff_t end);
+void filemap_dontcache_kick_writeback(struct address_space *mapping);

static inline int file_write_and_wait(struct file *file)
{
@@ -2657,10 +2658,7 @@ static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count)
if (ret)
return ret;
} else if (iocb->ki_flags & IOCB_DONTCACHE) {
- struct address_space *mapping = iocb->ki_filp->f_mapping;
-
- filemap_flush_range(mapping, iocb->ki_pos - count,
- iocb->ki_pos - 1);
+ filemap_dontcache_kick_writeback(iocb->ki_filp->f_mapping);
}

return count;
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index bdac0d685a98..13ee076ccd16 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -44,7 +44,8 @@
EM( WB_REASON_PERIODIC, "periodic") \
EM( WB_REASON_FS_FREE_SPACE, "fs_free_space") \
EM( WB_REASON_FORKER_THREAD, "forker_thread") \
- EMe(WB_REASON_FOREIGN_FLUSH, "foreign_flush")
+ EM( WB_REASON_FOREIGN_FLUSH, "foreign_flush") \
+ EMe(WB_REASON_DONTCACHE, "dontcache")

WB_WORK_REASON


--
2.54.0