Re: [GIT PULL] io_uring fixes for 5.10-rc
From: Jens Axboe
Date: Fri Nov 20 2020 - 16:36:52 EST
On 11/20/20 1:02 PM, Linus Torvalds wrote:
> On Fri, Nov 20, 2020 at 10:45 AM Jens Axboe <axboe@xxxxxxxxx> wrote:
>> Jens Axboe (4):
>> proc: don't allow async path resolution of /proc/self components
>
> This one is ok.
>
>> io_uring: handle -EOPNOTSUPP on path resolution
>
> But this one smells. It talks about how it shouldn't block, but the
> fact is, it can easily block when the path is going through another
> filesystem (think ".." to get to root before even hitting /proc/self,
> but also think /proc/self/cwd/randompathgoeshere).
>
> The whole concept seems entirely broken anyway. Why would you retry
> the failure after doing it asynchronously? If it really doesn't block,
> then it shouldn't have been done async in the first place.
>
> IMNSHO, the openat logic is just wrong. And that "ignore_nonblock"
> thing is a disgusting hack that is everything that is wrong with
> io_uring. Stop doing these kinds of hacky things that will just cause
> problems down the line.
>
> I think the correct thing to do is to just start the open
> synchronously with an RCU lookup, and if that fails, go to the async
> one. And if the async one fails because it's /proc/self, then it just
> fails. None of this kind of "it should be ok" stuff.
>
> And that would likely be the faster model anyway - do it synchronously
> and immediately for the easy cases.
>
> And if it really is something like "/proc/self/cwd/randompathgoeshere"
> that actually will block, maybe io_uring just shouldn't support it?
>
> I've pulled this, but I really object to how io_uring keeps having
> subtle bugs, and then they get worked around with this kind of hackery
> which really smells like "this will be a subtle bug some time in the
> future".
I don't disagree with you on that. I've been a bit gun-shy on touching
the VFS side of things, but this one isn't too bad. I hacked up a patch
that allows io_uring to do LOOKUP_RCU and a quick test seems to indicate
it's fine. On top of that, we just propagate the error if we do fail and
get rid of that odd retry loop.
And yes, it should be much better performance as well, for any sort of
cached lookup. There's a reason why we made the close side more
efficient like that, too.
Lightly tested patch below, needs to be split into 2 parts of course.
But the VFS side is just adding a few functions to fs/internal.h and the
struct nameidata structure, no other changes needed.
diff --git a/fs/internal.h b/fs/internal.h
index 6fd14ea213c3..e100d5bca42d 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -131,11 +131,41 @@ struct open_flags {
};
extern struct file *do_filp_open(int dfd, struct filename *pathname,
const struct open_flags *op);
+extern struct file *path_openat(struct nameidata *nd,
+ const struct open_flags *op, unsigned flags);
extern struct file *do_file_open_root(struct dentry *, struct vfsmount *,
const char *, const struct open_flags *);
extern struct open_how build_open_how(int flags, umode_t mode);
extern int build_open_flags(const struct open_how *how, struct open_flags *op);
+#define EMBEDDED_LEVELS 2
+struct nameidata {
+ struct path path;
+ struct qstr last;
+ struct path root;
+ struct inode *inode; /* path.dentry.d_inode */
+ unsigned int flags;
+ unsigned seq, m_seq, r_seq;
+ int last_type;
+ unsigned depth;
+ int total_link_count;
+ struct saved {
+ struct path link;
+ struct delayed_call done;
+ const char *name;
+ unsigned seq;
+ } *stack, internal[EMBEDDED_LEVELS];
+ struct filename *name;
+ struct nameidata *saved;
+ unsigned root_seq;
+ int dfd;
+ kuid_t dir_uid;
+ umode_t dir_mode;
+} __randomize_layout;
+
+extern void set_nameidata(struct nameidata *p, int dfd, struct filename *name);
+extern void restore_nameidata(void);
+
long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
int chmod_common(const struct path *path, umode_t mode);
int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 43ba815e4107..896b7f92cfed 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -4069,9 +4069,6 @@ static int io_openat2(struct io_kiocb *req, bool force_nonblock)
struct file *file;
int ret;
- if (force_nonblock && !req->open.ignore_nonblock)
- return -EAGAIN;
-
ret = build_open_flags(&req->open.how, &op);
if (ret)
goto err;
@@ -4080,25 +4077,28 @@ static int io_openat2(struct io_kiocb *req, bool force_nonblock)
if (ret < 0)
goto err;
- file = do_filp_open(req->open.dfd, req->open.filename, &op);
- if (IS_ERR(file)) {
- put_unused_fd(ret);
- ret = PTR_ERR(file);
+ if (force_nonblock) {
+ struct nameidata nd;
+
+ set_nameidata(&nd, req->open.dfd, req->open.filename);
+ file = path_openat(&nd, &op, op.lookup_flags | LOOKUP_RCU);
+ restore_nameidata();
+
/*
- * A work-around to ensure that /proc/self works that way
- * that it should - if we get -EOPNOTSUPP back, then assume
- * that proc_self_get_link() failed us because we're in async
- * context. We should be safe to retry this from the task
- * itself with force_nonblock == false set, as it should not
- * block on lookup. Would be nice to know this upfront and
- * avoid the async dance, but doesn't seem feasible.
+ * Nonblocking attempt is RCU-only: if the lookup fails with
+ * -ECHILD, drop the fd and retry from async context.
*/
- if (ret == -EOPNOTSUPP && io_wq_current_is_worker()) {
- req->open.ignore_nonblock = true;
- refcount_inc(&req->refs);
- io_req_task_queue(req);
- return 0;
+ if (file == ERR_PTR(-ECHILD)) {
+ put_unused_fd(ret);
+ return -EAGAIN;
}
+ } else {
+ file = do_filp_open(req->open.dfd, req->open.filename, &op);
+ }
+
+ if (IS_ERR(file)) {
+ put_unused_fd(ret);
+ ret = PTR_ERR(file);
} else {
fsnotify_open(file);
fd_install(ret, file);
diff --git a/fs/namei.c b/fs/namei.c
index 03d0e11e4f36..288fdae18221 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -498,32 +498,7 @@ void path_put(const struct path *path)
}
EXPORT_SYMBOL(path_put);
-#define EMBEDDED_LEVELS 2
-struct nameidata {
- struct path path;
- struct qstr last;
- struct path root;
- struct inode *inode; /* path.dentry.d_inode */
- unsigned int flags;
- unsigned seq, m_seq, r_seq;
- int last_type;
- unsigned depth;
- int total_link_count;
- struct saved {
- struct path link;
- struct delayed_call done;
- const char *name;
- unsigned seq;
- } *stack, internal[EMBEDDED_LEVELS];
- struct filename *name;
- struct nameidata *saved;
- unsigned root_seq;
- int dfd;
- kuid_t dir_uid;
- umode_t dir_mode;
-} __randomize_layout;
-
-static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
+void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
{
struct nameidata *old = current->nameidata;
p->stack = p->internal;
@@ -534,7 +509,7 @@ static void set_nameidata(struct nameidata *p, int dfd, struct filename *name)
current->nameidata = p;
}
-static void restore_nameidata(void)
+void restore_nameidata(void)
{
struct nameidata *now = current->nameidata, *old = now->saved;
@@ -3346,8 +3321,8 @@ static int do_o_path(struct nameidata *nd, unsigned flags, struct file *file)
return error;
}
-static struct file *path_openat(struct nameidata *nd,
- const struct open_flags *op, unsigned flags)
+struct file *path_openat(struct nameidata *nd, const struct open_flags *op,
+ unsigned flags)
{
struct file *file;
int error;
--
Jens Axboe