[RFC PATCH 2/2] io_uring/io-wq: try to batch multiple free works

From: Bui Quang Minh
Date: Thu Feb 20 2025 - 23:27:18 EST


Currently, when IORING_SETUP_DEFER_TASKRUN is not used, an io worker
has to add a task work each time it frees a work item, which creates
contention on tctx->task_list. With this commit, the io worker instead
queues freed requests on a local list and hands them over in a single
task work call once the number of entries on the local list reaches
IO_REQ_ALLOC_BATCH.
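 
For illustration only, here is a rough user-space sketch of the
accumulate-and-flush pattern (BATCH, struct node and flush_batch() are
made-up stand-ins for IO_REQ_ALLOC_BATCH, struct io_kiocb and the
batched task work hand-off; the kernel side pushes at the front of an
llist with __llist_add(), so the first request queued becomes the tail
of the chain):

#include <stdio.h>

#define BATCH 8	/* stands in for IO_REQ_ALLOC_BATCH */

/* stand-in for io_kiocb linked through io_task_work.node */
struct node {
	int id;
	struct node *next;
};

/* stand-in for the single batched task work hand-off */
static void flush_batch(struct node *head, struct node *tail)
{
	struct node *p;
	int n = 0;

	for (p = head; p; p = p->next)
		n++;
	printf("flushed %d nodes, head=%d tail=%d\n", n, head->id, tail->id);
}

int main(void)
{
	struct node nodes[20];
	struct node *head = NULL, *tail = NULL;
	int i, count = 0;

	for (i = 0; i < 20; i++) {
		nodes[i].id = i;
		/* push at the front, like __llist_add() */
		nodes[i].next = head;
		head = &nodes[i];
		if (!tail)
			tail = head;	/* first node pushed ends up as the tail */

		if (++count == BATCH) {
			flush_batch(head, tail);
			head = tail = NULL;
			count = 0;
		}
	}
	if (head)
		flush_batch(head, tail);	/* flush the remainder on exit */
	return 0;
}

The real code additionally flushes early whenever the next freed
request belongs to a different io_ring_ctx, because each batch is
handed to a single tctx->task_list.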

Signed-off-by: Bui Quang Minh <minhquangbui99@xxxxxxxxx>
---
io_uring/io-wq.c | 62 +++++++++++++++++++++++++++++++++++++++++++--
io_uring/io-wq.h | 4 ++-
io_uring/io_uring.c | 23 ++++++++++++++---
io_uring/io_uring.h | 6 ++++-
4 files changed, 87 insertions(+), 8 deletions(-)

diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c
index 5d0928f37471..096711707db9 100644
--- a/io_uring/io-wq.c
+++ b/io_uring/io-wq.c
@@ -544,6 +544,20 @@ static void io_assign_current_work(struct io_worker *worker,
raw_spin_unlock(&worker->lock);
}

+static void flush_req_free_list(struct llist_head *free_list,
+ struct llist_node *tail)
+{
+ struct io_kiocb *first_req, *last_req;
+
+ first_req = container_of(free_list->first, struct io_kiocb,
+ io_task_work.node);
+ last_req = container_of(tail, struct io_kiocb,
+ io_task_work.node);
+
+ io_req_normal_work_add(first_req, last_req);
+ init_llist_head(free_list);
+}
+
/*
* Called with acct->lock held, drops it before returning
*/
@@ -553,6 +567,10 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
{
struct io_wq *wq = worker->wq;
bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state);
+ LLIST_HEAD(free_list);
+ int free_req = 0;
+ struct llist_node *tail = NULL;
+ struct io_ring_ctx *last_added_ctx = NULL;

do {
struct io_wq_work *work;
@@ -592,6 +610,9 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
do {
struct io_wq_work *next_hashed, *linked;
unsigned int hash = io_get_work_hash(work);
+ struct io_kiocb *req = container_of(work,
+ struct io_kiocb, work);
+ bool did_free = false;

next_hashed = wq_next_work(work);

@@ -601,7 +622,41 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
wq->do_work(work);
io_assign_current_work(worker, NULL);

- linked = wq->free_work(work);
+ /*
+ * All requests on the free list must share the
+ * same io_ring_ctx.
+ */
+ if (last_added_ctx && last_added_ctx != req->ctx) {
+ flush_req_free_list(&free_list, tail);
+ tail = NULL;
+ last_added_ctx = NULL;
+ free_req = 0;
+ }
+
+ /*
+ * Try to batch free work when
+ * !IORING_SETUP_DEFER_TASKRUN to reduce contention
+ * on tctx->task_list.
+ */
+ if (req->ctx->flags & IORING_SETUP_DEFER_TASKRUN)
+ linked = wq->free_work(work, NULL, NULL);
+ else
+ linked = wq->free_work(work, &free_list, &did_free);
+
+ if (did_free) {
+ if (!tail)
+ tail = free_list.first;
+
+ last_added_ctx = req->ctx;
+ free_req++;
+ if (free_req == IO_REQ_ALLOC_BATCH) {
+ flush_req_free_list(&free_list, tail);
+ tail = NULL;
+ last_added_ctx = NULL;
+ free_req = 0;
+ }
+ }
+
work = next_hashed;
if (!work && linked && !io_wq_is_hashed(linked)) {
work = linked;
@@ -626,6 +681,9 @@ static void io_worker_handle_work(struct io_wq_acct *acct,
break;
raw_spin_lock(&acct->lock);
} while (1);
+
+ if (free_list.first)
+ flush_req_free_list(&free_list, tail);
}

static int io_wq_worker(void *data)
@@ -899,7 +957,7 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wq *wq)
do {
atomic_or(IO_WQ_WORK_CANCEL, &work->flags);
wq->do_work(work);
- work = wq->free_work(work);
+ work = wq->free_work(work, NULL, NULL);
} while (work);
}

diff --git a/io_uring/io-wq.h b/io_uring/io-wq.h
index b3b004a7b625..4f1674d3d68e 100644
--- a/io_uring/io-wq.h
+++ b/io_uring/io-wq.h
@@ -21,7 +21,9 @@ enum io_wq_cancel {
IO_WQ_CANCEL_NOTFOUND, /* work not found */
};

-typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *);
+typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *,
+ struct llist_head *,
+ bool *did_free);
typedef void (io_wq_work_fn)(struct io_wq_work *);

struct io_wq_hash {
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 0c111f7d7832..0343c9ec7271 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -120,7 +120,6 @@
#define IO_TCTX_REFS_CACHE_NR (1U << 10)

#define IO_COMPL_BATCH 32
-#define IO_REQ_ALLOC_BATCH 8
#define IO_LOCAL_TW_DEFAULT_MAX 20

struct io_defer_entry {
@@ -985,13 +984,18 @@ __cold bool __io_alloc_req_refill(struct io_ring_ctx *ctx)
return true;
}

-__cold void io_free_req(struct io_kiocb *req)
+static __cold void io_set_req_free(struct io_kiocb *req)
{
/* refs were already put, restore them for io_req_task_complete() */
req->flags &= ~REQ_F_REFCOUNT;
/* we only want to free it, don't post CQEs */
req->flags |= REQ_F_CQE_SKIP;
req->io_task_work.func = io_req_task_complete;
+}
+
+__cold void io_free_req(struct io_kiocb *req)
+{
+ io_set_req_free(req);
io_req_task_work_add(req);
}

@@ -1772,16 +1776,27 @@ int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts)
IO_URING_F_COMPLETE_DEFER);
}

-struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
+struct io_wq_work *io_wq_free_work(struct io_wq_work *work,
+ struct llist_head *free_list,
+ bool *did_free)
{
struct io_kiocb *req = container_of(work, struct io_kiocb, work);
struct io_kiocb *nxt = NULL;
+ bool free_req = false;

if (req_ref_put_and_test(req)) {
if (req->flags & IO_REQ_LINK_FLAGS)
nxt = io_req_find_next(req);
- io_free_req(req);
+ io_set_req_free(req);
+ if (free_list)
+ __llist_add(&req->io_task_work.node, free_list);
+ else
+ io_req_task_work_add(req);
+ free_req = true;
}
+ if (did_free)
+ *did_free = free_req;
+
return nxt ? &nxt->work : NULL;
}

diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index bdd6407c14d0..dc050bc44b65 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -54,6 +54,8 @@ struct io_wait_queue {
#endif
};

+#define IO_REQ_ALLOC_BATCH 8
+
static inline bool io_should_wake(struct io_wait_queue *iowq)
{
struct io_ring_ctx *ctx = iowq->ctx;
@@ -111,7 +113,9 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr);
int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin);
void __io_submit_flush_completions(struct io_ring_ctx *ctx);

-struct io_wq_work *io_wq_free_work(struct io_wq_work *work);
+struct io_wq_work *io_wq_free_work(struct io_wq_work *work,
+ struct llist_head *free_list,
+ bool *did_free);
void io_wq_submit_work(struct io_wq_work *work);

void io_free_req(struct io_kiocb *req);
--
2.43.0