Re: [PATCH] io_uring: Optimization of buffered random write

From: Jens Axboe
Date: Wed Apr 19 2023 - 10:12:53 EST


On 4/19/23 7:32?AM, Jens Axboe wrote:
> On 4/19/23 3:22?AM, luhongfei wrote:
>> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
>> index 4a865f0e85d0..64bb91beb4d6
>> --- a/io_uring/io_uring.c
>> +++ b/io_uring/io_uring.c
>> @@ -2075,8 +2075,23 @@ static inline void io_queue_sqe(struct io_kiocb *req)
>> __must_hold(&req->ctx->uring_lock)
>> {
>> int ret;
>> + bool is_write;
>>
>> - ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
>> + switch (req->opcode) {
>> + case IORING_OP_WRITEV:
>> + case IORING_OP_WRITE_FIXED:
>> + case IORING_OP_WRITE:
>> + is_write = true;
>> + break;
>> + default:
>> + is_write = false;
>> + break;
>> + }
>> +
>> + if (!is_write || (req->rw.kiocb.ki_flags & IOCB_DIRECT))
>> + ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
>> + else
>> + ret = io_issue_sqe(req, 0);
>>
>> /*
>> * We async punt it if the file wasn't marked NOWAIT, or if the file
>
> We really can't just do that, implicitly. What you are doing is making
> any of write synchronous. What are you writing to in terms of device or
> file? If file, what file system is being used? Curious if the target
> supports async buffered writes, guessing it does not which is why you
> see io-wq activity for all of them.
>
> That said, I did toss out a test patch a while back that explicitly sets
> up the ring such that we'll do blocking IO rather than do a non-blocking
> attempt and then punt it if that fails. And I do think there's a use
> case for that, in case you just want to use io_uring for batched
> syscalls and don't care about if you end up blocking for some IO.
>
> Let's do a primer on what happens for io_uring issue:
>
> 1) Non-blocking issue is attempted for IO. If successful, we're done for
> now.
>
> 2) Case 1 failed. Now we have two options
> a) We can poll the file. We arm poll, and we're done for now
> until that triggers.
> b) File cannot be polled, we punt to io-wq which then does a
> blocking attempt.
>
> For case 2b, this is the one where we could've just done a blocking
> attempt initially if the ring was setup with a flag explicitly saying
> that's what the application wants. Or io_uring_enter() had a flag passed
> in that explicitly said this is what the applications wants. I suspect
> we'll want both, to cover both SQPOLL and !SQPOLL.
>
> I'd recommend we still retain non-blocking issue for pollable files, as
> you could very quickly block forever otherwise. Imagine an empty pipe
> and a read issued to it in the blocking mode.
>
> A solution like that would cater to your case too, without potentially
> breaking a lot of things like your patch could. The key here is the
> explicit nature of it, we cannot just go and make odd assumptions about
> a particular opcode type (writes) and ring type (SQPOLL) and say "oh
> this one is fine for just ignoring blocking off the issue path".

Something like this, totally untested. You can either setup the ring
with IORING_SETUP_NO_OFFLOAD, or you can pass in IORING_ENTER_NO_OFFLOAD
to achieve the same thing but on a per-invocation of io_uring_enter(2)
basis.

I suspect this would be cleaner with an io_kiocb flag for this, so we
can make the retry paths correct as well and avoid passing 'no_offload'
too much around. I'll probably clean it up with that and actually try
and test it.

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 0716cb17e436..ea903a677ce9 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -173,6 +173,12 @@ enum {
*/
#define IORING_SETUP_DEFER_TASKRUN (1U << 13)

+/*
+ * Don't attempt non-blocking issue on file types that would otherwise
+ * punt to io-wq if they cannot be completed non-blocking.
+ */
+#define IORING_SETUP_NO_OFFLOAD (1U << 14)
+
enum io_uring_op {
IORING_OP_NOP,
IORING_OP_READV,
@@ -443,6 +449,7 @@ struct io_cqring_offsets {
#define IORING_ENTER_SQ_WAIT (1U << 2)
#define IORING_ENTER_EXT_ARG (1U << 3)
#define IORING_ENTER_REGISTERED_RING (1U << 4)
+#define IORING_ENTER_NO_OFFLOAD (1U << 5)

/*
* Passed in for io_uring_setup(2). Copied back with updated info on success
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 3bca7a79efda..431e41701991 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -147,7 +147,7 @@ static bool io_uring_try_cancel_requests(struct io_ring_ctx *ctx,

static void io_dismantle_req(struct io_kiocb *req);
static void io_clean_op(struct io_kiocb *req);
-static void io_queue_sqe(struct io_kiocb *req);
+static void io_queue_sqe(struct io_kiocb *req, bool no_offload);
static void io_move_task_work_from_local(struct io_ring_ctx *ctx);
static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
static __cold void io_fallback_tw(struct io_uring_task *tctx);
@@ -1471,7 +1471,7 @@ void io_req_task_submit(struct io_kiocb *req, struct io_tw_state *ts)
else if (req->flags & REQ_F_FORCE_ASYNC)
io_queue_iowq(req, ts);
else
- io_queue_sqe(req);
+ io_queue_sqe(req, false);
}

void io_req_task_queue_fail(struct io_kiocb *req, int ret)
@@ -1938,7 +1938,8 @@ static bool io_assign_file(struct io_kiocb *req, const struct io_issue_def *def,
return !!req->file;
}

-static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
+static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags,
+ bool no_offload)
{
const struct io_issue_def *def = &io_issue_defs[req->opcode];
const struct cred *creds = NULL;
@@ -1947,6 +1948,9 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags)
if (unlikely(!io_assign_file(req, def, issue_flags)))
return -EBADF;

+ if (no_offload && (!req->file || !file_can_poll(req->file)))
+ issue_flags &= ~IO_URING_F_NONBLOCK;
+
if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred()))
creds = override_creds(req->creds);

@@ -1980,7 +1984,7 @@ int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts)
{
io_tw_lock(req->ctx, ts);
return io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_MULTISHOT|
- IO_URING_F_COMPLETE_DEFER);
+ IO_URING_F_COMPLETE_DEFER, false);
}

struct io_wq_work *io_wq_free_work(struct io_wq_work *work)
@@ -2029,7 +2033,7 @@ void io_wq_submit_work(struct io_wq_work *work)
}

do {
- ret = io_issue_sqe(req, issue_flags);
+ ret = io_issue_sqe(req, issue_flags, false);
if (ret != -EAGAIN)
break;
/*
@@ -2120,12 +2124,13 @@ static void io_queue_async(struct io_kiocb *req, int ret)
io_queue_linked_timeout(linked_timeout);
}

-static inline void io_queue_sqe(struct io_kiocb *req)
+static inline void io_queue_sqe(struct io_kiocb *req, bool no_offload)
__must_hold(&req->ctx->uring_lock)
{
int ret;

- ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER);
+ ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER,
+ no_offload);

/*
* We async punt it if the file wasn't marked NOWAIT, or if the file
@@ -2337,7 +2342,7 @@ static __cold int io_submit_fail_init(const struct io_uring_sqe *sqe,
}

static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
- const struct io_uring_sqe *sqe)
+ const struct io_uring_sqe *sqe, bool no_offload)
__must_hold(&ctx->uring_lock)
{
struct io_submit_link *link = &ctx->submit_state.link;
@@ -2385,7 +2390,7 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
return 0;
}

- io_queue_sqe(req);
+ io_queue_sqe(req, no_offload);
return 0;
}

@@ -2466,7 +2471,7 @@ static bool io_get_sqe(struct io_ring_ctx *ctx, const struct io_uring_sqe **sqe)
return false;
}

-int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
+int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, bool no_offload)
__must_hold(&ctx->uring_lock)
{
unsigned int entries = io_sqring_entries(ctx);
@@ -2495,7 +2500,7 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
* Continue submitting even for sqe failure if the
* ring was setup with IORING_SETUP_SUBMIT_ALL
*/
- if (unlikely(io_submit_sqe(ctx, req, sqe)) &&
+ if (unlikely(io_submit_sqe(ctx, req, sqe, no_offload)) &&
!(ctx->flags & IORING_SETUP_SUBMIT_ALL)) {
left--;
break;
@@ -3524,7 +3529,8 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,

if (unlikely(flags & ~(IORING_ENTER_GETEVENTS | IORING_ENTER_SQ_WAKEUP |
IORING_ENTER_SQ_WAIT | IORING_ENTER_EXT_ARG |
- IORING_ENTER_REGISTERED_RING)))
+ IORING_ENTER_REGISTERED_RING |
+ IORING_ENTER_NO_OFFLOAD)))
return -EINVAL;

/*
@@ -3575,12 +3581,17 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit,

ret = to_submit;
} else if (to_submit) {
+ bool no_offload;
+
ret = io_uring_add_tctx_node(ctx);
if (unlikely(ret))
goto out;

+ no_offload = flags & IORING_ENTER_NO_OFFLOAD ||
+ ctx->flags & IORING_SETUP_NO_OFFLOAD;
+
mutex_lock(&ctx->uring_lock);
- ret = io_submit_sqes(ctx, to_submit);
+ ret = io_submit_sqes(ctx, to_submit, no_offload);
if (ret != to_submit) {
mutex_unlock(&ctx->uring_lock);
goto out;
@@ -3969,7 +3980,8 @@ static long io_uring_setup(u32 entries, struct io_uring_params __user *params)
IORING_SETUP_R_DISABLED | IORING_SETUP_SUBMIT_ALL |
IORING_SETUP_COOP_TASKRUN | IORING_SETUP_TASKRUN_FLAG |
IORING_SETUP_SQE128 | IORING_SETUP_CQE32 |
- IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN))
+ IORING_SETUP_SINGLE_ISSUER | IORING_SETUP_DEFER_TASKRUN |
+ IORING_SETUP_NO_OFFLOAD))
return -EINVAL;

return io_uring_create(entries, &p, params);
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h
index 25515d69d205..c5c0db7232c0 100644
--- a/io_uring/io_uring.h
+++ b/io_uring/io_uring.h
@@ -76,7 +76,7 @@ int io_uring_alloc_task_context(struct task_struct *task,
struct io_ring_ctx *ctx);

int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts);
-int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr);
+int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr, bool no_offload);
int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin);
void io_free_batch_list(struct io_ring_ctx *ctx, struct io_wq_work_node *node);
int io_req_prep_async(struct io_kiocb *req);
diff --git a/io_uring/sqpoll.c b/io_uring/sqpoll.c
index 9db4bc1f521a..9a9417bf9e3f 100644
--- a/io_uring/sqpoll.c
+++ b/io_uring/sqpoll.c
@@ -166,6 +166,7 @@ static inline bool io_sqd_events_pending(struct io_sq_data *sqd)

static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
{
+ bool no_offload = ctx->flags & IORING_SETUP_NO_OFFLOAD;
unsigned int to_submit;
int ret = 0;

@@ -190,7 +191,7 @@ static int __io_sq_thread(struct io_ring_ctx *ctx, bool cap_entries)
*/
if (to_submit && likely(!percpu_ref_is_dying(&ctx->refs)) &&
!(ctx->flags & IORING_SETUP_R_DISABLED))
- ret = io_submit_sqes(ctx, to_submit);
+ ret = io_submit_sqes(ctx, to_submit, no_offload);
mutex_unlock(&ctx->uring_lock);

if (to_submit && wq_has_sleeper(&ctx->sqo_sq_wait))

--
Jens Axboe