[PATCH RFC v5 1/3] block: add BIO_COMPLETE_IN_TASK for task-context completion

From: Tal Zussman

Date: Wed Apr 08 2026 - 19:09:40 EST


Some bio completion handlers need to run in task context, but bio_endio()
can be called from IRQ context (e.g. buffer_head writeback). Add a
BIO_COMPLETE_IN_TASK flag that bio submitters can set to request
task-context completion of their bi_end_io callback.

When bio_endio() sees this flag and is running in non-task context, it
queues the bio to a per-cpu lockless list and schedules a delayed work
item to call bi_end_io() from task context. The delayed work uses a
1-jiffy delay to allow batches of completions to accumulate before
processing. A CPU hotplug dead callback drains any remaining bios from
the departing CPU's batch.

This will be used to enable RWF_DONTCACHE for block devices, and could
be used for other subsystems like fscrypt that need task-context bio
completion.

Suggested-by: Matthew Wilcox <willy@xxxxxxxxxxxxx>
Signed-off-by: Tal Zussman <tz2294@xxxxxxxxxxxx>
---
block/bio.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++-
include/linux/blk_types.h | 7 +++-
2 files changed, 88 insertions(+), 2 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 8203bb7455a9..21b403eb1c04 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -18,6 +18,7 @@
#include <linux/highmem.h>
#include <linux/blk-crypto.h>
#include <linux/xarray.h>
+#include <linux/llist.h>

#include <trace/events/block.h>
#include "blk.h"
@@ -1714,6 +1715,51 @@ void bio_check_pages_dirty(struct bio *bio)
}
EXPORT_SYMBOL_GPL(bio_check_pages_dirty);

+struct bio_complete_batch {
+ struct llist_head list;
+ struct delayed_work work;
+ int cpu;
+};
+
+static DEFINE_PER_CPU(struct bio_complete_batch, bio_complete_batch);
+static struct workqueue_struct *bio_complete_wq;
+
+static void bio_complete_work_fn(struct work_struct *w)
+{
+ struct delayed_work *dw = to_delayed_work(w);
+ struct bio_complete_batch *batch =
+ container_of(dw, struct bio_complete_batch, work);
+ struct llist_node *node;
+ struct bio *bio, *next;
+
+ do {
+ node = llist_del_all(&batch->list);
+ if (!node)
+ break;
+
+ node = llist_reverse_order(node);
+ llist_for_each_entry_safe(bio, next, node, bi_llist)
+ bio->bi_end_io(bio);
+
+ if (need_resched()) {
+ if (!llist_empty(&batch->list))
+ mod_delayed_work_on(batch->cpu,
+ bio_complete_wq,
+ &batch->work, 0);
+ break;
+ }
+ } while (1);
+}
+
+static void bio_queue_completion(struct bio *bio)
+{
+ struct bio_complete_batch *batch = this_cpu_ptr(&bio_complete_batch);
+
+ if (llist_add(&bio->bi_llist, &batch->list))
+ mod_delayed_work_on(batch->cpu, bio_complete_wq,
+ &batch->work, 1);
+}
+
static inline bool bio_remaining_done(struct bio *bio)
{
/*
@@ -1788,7 +1834,9 @@ void bio_endio(struct bio *bio)
}
#endif

- if (bio->bi_end_io)
+ if (!in_task() && bio_flagged(bio, BIO_COMPLETE_IN_TASK))
+ bio_queue_completion(bio);
+ else if (bio->bi_end_io)
bio->bi_end_io(bio);
}
EXPORT_SYMBOL(bio_endio);
@@ -1974,6 +2022,24 @@ int bioset_init(struct bio_set *bs,
}
EXPORT_SYMBOL(bioset_init);

+/*
+ * Drain a dead CPU's deferred bio completions.
+ */
+static int bio_complete_batch_cpu_dead(unsigned int cpu)
+{
+ struct bio_complete_batch *batch =
+ per_cpu_ptr(&bio_complete_batch, cpu);
+ struct llist_node *node;
+ struct bio *bio, *next;
+
+ node = llist_del_all(&batch->list);
+ node = llist_reverse_order(node);
+ llist_for_each_entry_safe(bio, next, node, bi_llist)
+ bio->bi_end_io(bio);
+
+ return 0;
+}
+
static int __init init_bio(void)
{
int i;
@@ -1988,6 +2054,21 @@ static int __init init_bio(void)
SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
}

+ for_each_possible_cpu(i) {
+ struct bio_complete_batch *batch =
+ per_cpu_ptr(&bio_complete_batch, i);
+
+ init_llist_head(&batch->list);
+ INIT_DELAYED_WORK(&batch->work, bio_complete_work_fn);
+ batch->cpu = i;
+ }
+
+ bio_complete_wq = alloc_workqueue("bio_complete", WQ_MEM_RECLAIM, 0);
+ if (!bio_complete_wq)
+ panic("bio: can't allocate bio_complete workqueue\n");
+
+ cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "block/bio:complete:dead",
+ NULL, bio_complete_batch_cpu_dead);
cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL,
bio_cpu_dead);

diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 8808ee76e73c..0b55159d110d 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -11,6 +11,7 @@
#include <linux/device.h>
#include <linux/ktime.h>
#include <linux/rw_hint.h>
+#include <linux/llist.h>

struct bio_set;
struct bio;
@@ -208,7 +209,10 @@ typedef unsigned int blk_qc_t;
* stacking drivers)
*/
struct bio {
- struct bio *bi_next; /* request queue link */
+ union {
+ struct bio *bi_next; /* request queue link */
+ struct llist_node bi_llist; /* deferred completion */
+ };
struct block_device *bi_bdev;
blk_opf_t bi_opf; /* bottom bits REQ_OP, top bits
* req_flags.
@@ -322,6 +326,7 @@ enum {
BIO_REMAPPED,
BIO_ZONE_WRITE_PLUGGING, /* bio handled through zone write plugging */
BIO_EMULATES_ZONE_APPEND, /* bio emulates a zone append operation */
+ BIO_COMPLETE_IN_TASK, /* complete bi_end_io() in task context */
BIO_FLAG_LAST
};


--
2.39.5