namei.c LOOKUP_NONBLOCK (was "Re: [GIT PULL] io_uring fixes for 5.10-rc")

From: Jens Axboe
Date: Thu Dec 10 2020 - 12:33:43 EST


On 11/21/20 3:58 PM, Jens Axboe wrote:
> On 11/21/20 11:07 AM, Linus Torvalds wrote:
>> On Fri, Nov 20, 2020 at 7:00 PM Jens Axboe <axboe@xxxxxxxxx> wrote:
>>>
>>> Actually, I think we can do even better. How about just having
>>> do_filp_open() exit after LOOKUP_RCU fails, if LOOKUP_RCU was already
>>> set in the lookup flags? Then we don't need to change much else, and
>>> most of it falls out naturally.
>>
>> So I was thinking doing the RCU lookup unconditionally, and then doing
>> the non-RCU lookup if that fails afterwards.
>>
>> But your patch looks good to me.
>>
>> Except for the issue you noticed.
>
> After having taken a closer look, I think the saner approach is
> LOOKUP_NONBLOCK instead of using LOOKUP_RCU which is used more as
> a state than lookup flag. I'll try and hack something up that looks
> passable.
>
>>> Except it seems that should work, except LOOKUP_RCU does not guarantee
>>> that we're not going to do IO:
>>
>> Well, almost nothing guarantees lack of IO, since allocations etc can
>> still block, but..
>
> Sure, and we can't always avoid that - but blatant block on waiting
> for IO should be avoided.
>
>>> [ 20.463195] schedule+0x5f/0xd0
>>> [ 20.463444] io_schedule+0x45/0x70
>>> [ 20.463712] bit_wait_io+0x11/0x50
>>> [ 20.463981] __wait_on_bit+0x2c/0x90
>>> [ 20.464264] out_of_line_wait_on_bit+0x86/0x90
>>> [ 20.464611] ? var_wake_function+0x30/0x30
>>> [ 20.464932] __ext4_find_entry+0x2b5/0x410
>>> [ 20.465254] ? d_alloc_parallel+0x241/0x4e0
>>> [ 20.465581] ext4_lookup+0x51/0x1b0
>>> [ 20.465855] ? __d_lookup+0x77/0x120
>>> [ 20.466136] path_openat+0x4e8/0xe40
>>> [ 20.466417] do_filp_open+0x79/0x100
>>
>> Hmm. Is this perhaps an O_CREAT case? I think we only do the dcache
>> lookups under RCU, not the final path component creation.
>
> It's just a basic test that opens all files under a directory. So
> no O_CREAT, it's all existing files. I think this is just a case of not
> aborting early enough for LOOKUP_NONBLOCK, and we've obviously already
> dropped LOOKUP_RCU (and done rcu_read_unlock() again) at this point.
>
>> And there are probably lots of other situations where we finish with
>> LOOKUP_RCU (with unlazy_walk()), and then continue.
>>
>> Example: look at "may_lookup()" - if inode_permission() says "I can't
>> do this without blocking" the logic actually just tries to validate
>> the current state (that "unlazy_walk()" thing), and then continue
>> without RCU.
>>
>> It obviously hasn't been about lockless semantics, it's been about
>> really being lockless. So LOOKUP_RCU has been a "try to do this
>> locklessly" rather than "you cannot take any locks".
>>
>> I guess we would have to add a LOOKUP_NOBLOCK thing to actually then
>> say "if the RCU lookup fails, return -EAGAIN".
>>
>> That's probably not a huge undertaking, but yeah, I didn't think of
>> it. I think this is a "we need to have Al tell us if it's reasonable".
>
> Definitely. I did have a weak attempt at LOOKUP_NONBLOCK earlier, I'll
> try and resurrect it and see what that leads to. Outside of just pure
> lookup, the d_revalidate() was a bit interesting as it may block for
> certain cases, but those should be (hopefully) detectable upfront.

Here's a potentially better attempt - basically we allow LOOKUP_NONBLOCK
with LOOKUP_RCU, and if we end up dropping LOOKUP_RCU, then we generally
return -EAGAIN if LOOKUP_NONBLOCK is set as we can no longer guarantee
that we won't block.

Al, what do you think? I didn't include the io_uring part here, as
that's just dropping the existing hack and setting LOOKUP_NONBLOCK if
we're in task context. I can send it out as a separate patch, of course.

diff --git a/fs/namei.c b/fs/namei.c
index 03d0e11e4f36..303874f1b9f1 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -679,7 +679,7 @@ static bool legitimize_root(struct nameidata *nd)
* Nothing should touch nameidata between unlazy_walk() failure and
* terminate_walk().
*/
-static int unlazy_walk(struct nameidata *nd)
+static int __unlazy_walk(struct nameidata *nd)
{
struct dentry *parent = nd->path.dentry;

@@ -704,6 +704,18 @@ static int unlazy_walk(struct nameidata *nd)
return -ECHILD;
}

+static int unlazy_walk(struct nameidata *nd)
+{
+ int ret;
+
+ ret = __unlazy_walk(nd);
+ /* If caller is asking for NONBLOCK lookup, assume we can't satisfy it */
+ if (!ret && (nd->flags & LOOKUP_NONBLOCK))
+ ret = -EAGAIN;
+
+ return ret;
+}
+
/**
* unlazy_child - try to switch to ref-walk mode.
* @nd: nameidata pathwalk data
@@ -764,10 +776,13 @@ static int unlazy_child(struct nameidata *nd, struct dentry *dentry, unsigned se

static inline int d_revalidate(struct dentry *dentry, unsigned int flags)
{
- if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE))
+ if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) {
+ if ((flags & (LOOKUP_RCU | LOOKUP_NONBLOCK)) == LOOKUP_NONBLOCK)
+ return -EAGAIN;
return dentry->d_op->d_revalidate(dentry, flags);
- else
- return 1;
+ }
+
+ return 1;
}

/**
@@ -792,7 +807,7 @@ static int complete_walk(struct nameidata *nd)
*/
if (!(nd->flags & (LOOKUP_ROOT | LOOKUP_IS_SCOPED)))
nd->root.mnt = NULL;
- if (unlikely(unlazy_walk(nd)))
+ if (unlikely(__unlazy_walk(nd)))
return -ECHILD;
}

@@ -1466,8 +1481,9 @@ static struct dentry *lookup_fast(struct nameidata *nd,
unsigned seq;
dentry = __d_lookup_rcu(parent, &nd->last, &seq);
if (unlikely(!dentry)) {
- if (unlazy_walk(nd))
- return ERR_PTR(-ECHILD);
+ int ret = unlazy_walk(nd);
+ if (ret)
+ return ERR_PTR(ret);
return NULL;
}

@@ -1569,8 +1585,9 @@ static inline int may_lookup(struct nameidata *nd)
int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
if (err != -ECHILD)
return err;
- if (unlazy_walk(nd))
- return -ECHILD;
+ err = unlazy_walk(nd);
+ if (err)
+ return err;
}
return inode_permission(nd->inode, MAY_EXEC);
}
@@ -1591,9 +1608,11 @@ static int reserve_stack(struct nameidata *nd, struct path *link, unsigned seq)
// we need to grab link before we do unlazy. And we can't skip
// unlazy even if we fail to grab the link - cleanup needs it
bool grabbed_link = legitimize_path(nd, link, seq);
+ int ret;

- if (unlazy_walk(nd) != 0 || !grabbed_link)
- return -ECHILD;
+ ret = unlazy_walk(nd);
+ if (ret && !grabbed_link)
+ return ret;

if (nd_alloc_stack(nd))
return 0;
@@ -1634,8 +1653,9 @@ static const char *pick_link(struct nameidata *nd, struct path *link,
touch_atime(&last->link);
cond_resched();
} else if (atime_needs_update(&last->link, inode)) {
- if (unlikely(unlazy_walk(nd)))
- return ERR_PTR(-ECHILD);
+ error = unlazy_walk(nd);
+ if (unlikely(error))
+ return ERR_PTR(error);
touch_atime(&last->link);
}

@@ -1652,8 +1672,9 @@ static const char *pick_link(struct nameidata *nd, struct path *link,
if (nd->flags & LOOKUP_RCU) {
res = get(NULL, inode, &last->done);
if (res == ERR_PTR(-ECHILD)) {
- if (unlikely(unlazy_walk(nd)))
- return ERR_PTR(-ECHILD);
+ error = unlazy_walk(nd);
+ if (unlikely(error))
+ return ERR_PTR(error);
res = get(link->dentry, inode, &last->done);
}
} else {
@@ -2193,8 +2214,9 @@ static int link_path_walk(const char *name, struct nameidata *nd)
}
if (unlikely(!d_can_lookup(nd->path.dentry))) {
if (nd->flags & LOOKUP_RCU) {
- if (unlazy_walk(nd))
- return -ECHILD;
+ err = unlazy_walk(nd);
+ if (err)
+ return err;
}
return -ENOTDIR;
}
@@ -3394,10 +3416,14 @@ struct file *do_filp_open(int dfd, struct filename *pathname,

set_nameidata(&nd, dfd, pathname);
filp = path_openat(&nd, op, flags | LOOKUP_RCU);
+ /* If we fail RCU lookup, assume NONBLOCK cannot be honored */
+ if (flags & LOOKUP_NONBLOCK)
+ goto out;
if (unlikely(filp == ERR_PTR(-ECHILD)))
filp = path_openat(&nd, op, flags);
if (unlikely(filp == ERR_PTR(-ESTALE)))
filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
+out:
restore_nameidata();
return filp;
}
diff --git a/include/linux/namei.h b/include/linux/namei.h
index a4bb992623c4..c36c4e0805fc 100644
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -46,6 +46,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT};
#define LOOKUP_NO_XDEV 0x040000 /* No mountpoint crossing. */
#define LOOKUP_BENEATH 0x080000 /* No escaping from starting point. */
#define LOOKUP_IN_ROOT 0x100000 /* Treat dirfd as fs root. */
+#define LOOKUP_NONBLOCK 0x200000 /* don't block for lookup */
/* LOOKUP_* flags which do scope-related checks based on the dirfd. */
#define LOOKUP_IS_SCOPED (LOOKUP_BENEATH | LOOKUP_IN_ROOT)


--
Jens Axboe