[PATCH v3 2/2] io_uring: batch getting pcpu references

From: Pavel Begunkov
Date: Sat Dec 21 2019 - 15:13:36 EST


percpu_ref_tryget() has its own overhead. Instead of getting a reference
for each request, grab a bunch once per io_submit_sqes().

A basic benchmark submitting and waiting for 128 non-linked nops showed
a ~5% performance gain (7044 KIOPS vs 7423).

Signed-off-by: Pavel Begunkov <asml.silence@xxxxxxxxx>
---

A version using ret just becomes bulkier to me, and I'd love to hear
how to make it clearer. This version removes all error handling from
the hot path, though at the cost of a goto.
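
For context, the batching idea in one self-contained sketch (a
hypothetical helper in kernel-style C, not part of the patch;
try_submit_one() stands in for the per-request work and is not a real
function):

	#include <linux/percpu-refcount.h>
	#include <linux/errno.h>

	static int submit_batch(struct percpu_ref *refs, unsigned int nr)
	{
		unsigned int submitted;

		/* one tryget for the whole batch instead of nr trygets */
		if (!percpu_ref_tryget_many(refs, nr))
			return -EAGAIN;

		for (submitted = 0; submitted < nr; submitted++) {
			if (!try_submit_one())	/* hypothetical per-request step */
				break;
		}

		/* hand back the refs of requests that never got submitted */
		if (submitted != nr)
			percpu_ref_put_many(refs, nr - submitted);

		return submitted ? submitted : -EAGAIN;
	}

Each submitted request carries one of the batched refs and drops it on
completion, exactly as it would have with an individual
percpu_ref_tryget().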

fs/io_uring.c | 14 ++++++++++----
1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 513f1922ce6a..b89a8b975c69 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1045,9 +1045,6 @@ static struct io_kiocb *io_get_req(struct io_ring_ctx *ctx,
gfp_t gfp = GFP_KERNEL | __GFP_NOWARN;
struct io_kiocb *req;

- if (!percpu_ref_tryget(&ctx->refs))
- return NULL;
-
if (!state) {
req = kmem_cache_alloc(req_cachep, gfp);
if (unlikely(!req))
@@ -4400,6 +4397,9 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
return -EBUSY;
}

+ if (!percpu_ref_tryget_many(&ctx->refs, nr))
+ return -EAGAIN;
+
if (nr > IO_PLUG_THRESHOLD) {
io_submit_state_start(&state, nr);
statep = &state;
@@ -4408,16 +4408,22 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr,
for (i = 0; i < nr; i++) {
const struct io_uring_sqe *sqe;
struct io_kiocb *req;
+ unsigned int unused_refs;

req = io_get_req(ctx, statep);
if (unlikely(!req)) {
+ unused_refs = nr - submitted;
if (!submitted)
submitted = -EAGAIN;
+put_refs:
+ percpu_ref_put_many(&ctx->refs, unused_refs);
break;
}
if (!io_get_sqring(ctx, req, &sqe)) {
__io_free_req(req);
- break;
+ /* __io_free_req() puts a ref */
+ unused_refs = nr - submitted - 1;
+ goto put_refs;
}

/* will complete beyond this point, count as submitted */
--
2.24.0