commit 8c8d4bb18dbaacfe88a4ce758610b386028499ba
Author: Linus Torvalds
Date:   Thu May 31 10:26:52 2012 -0700

vfs: add simple direct-mapped dcache lookup front-end

I've pushed __d_lookup_rcu() just about as far as I could, and it still
had some problems. The problems were mainly due to:

 - the complexity of the slow-case handling causes register spills

 - the hash chain lookup loop causes not only register pressure, but
   also the extra magic "mask off lock bit from the hash chain head
   pointer" etc logic

 - the hash list needs to be dynamically sized (we want *big* caches,
   but you can't use the same size for big and small machines), which
   causes the initial hash lookup itself to be more complex.

This looks like a viable solution to all three problems, and it is
actually surprisingly simple: make a trivial fixed-size direct-mapped
L1 dentry cache. No chains, no locking, no nothing.

This gives measurable improvement on my microbenchmark, and gets good
hit-rates on both kernel compiles and even on something like
"updatedb", which I'd have expected to be one of the worst possible
cases. Apparently updatedb still ends up looking up the same files
(/etc/fstab etc) a lot. So those good hit-rates seem to often be due to
really stupid programming, but hey, I think we all agree that "stupid
programming" is likely the common case that we generally do need to
also optimize for ;)

For my kernel compile benchmark ("make -j" on a fully built tree), the
profile shows (this is a kernel-only profile, so user space overhead is
removed):

    8.19%  [k] link_path_walk
    7.74%  [k] __d_lookup_rcu
    5.66%  [k] selinux_inode_permission
    3.73%  [k] do_lookup
    2.86%  [k] path_lookupat
    2.72%  [k] avc_has_perm_noaudit
    2.71%  [k] inode_has_perm.isra.49.constprop.71
    2.68%  [k] avc_lookup
    2.51%  [k] generic_permission
     ...
    0.78%  [k] __d_lookup_rcu_slow
     ...

where "__d_lookup_rcu_slow()" is the exact same old __d_lookup_rcu(),
so it's not really "slow", but it's quite noticeably slower than the
new streamlined __d_lookup_rcu(). And as you can tell, that means that
we must have a 90%+ hit-rate in the new L1 dcache lookup, since we see
only 10% as much time in the slow routine as in the L1 front-end.

HOWEVER. The fast L1 lookup right now is subtly buggy: not the lookup
itself, but the code that adds entries to the L1 is racy with those
entries being removed. I added a comment on it, and I think it's quite
solvable (and it's all in the slow path), but I'll need to think it
through.

Cc: Al Viro
Cc: Nick Piggin
Cc: Miklos Szeredi
Signed-off-by: Linus Torvalds
---
 fs/dcache.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 79 insertions(+), 2 deletions(-)

diff --git a/fs/dcache.c b/fs/dcache.c
index 4435d8b32904..f549f6505e53 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -402,6 +402,45 @@ static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
 }
 
 /*
+ * This has a NULL parent and zero length, and will thus
+ * never match anything. But it means that the dcache_l1
+ * array never contains NULL, so you don't need to check.
+ */
+static struct dentry invalid_dentry;
+
+#define L1_DCACHE_BITS (13)
+#define L1_DCACHE_SIZE (1u << L1_DCACHE_BITS)
+static struct dentry *dcache_l1[L1_DCACHE_SIZE] = {
+	[0 ... L1_DCACHE_SIZE-1] = &invalid_dentry
+};
+
+static unsigned int dentry_l1_index(unsigned int hash, const struct dentry *parent)
+{
+	hash += (unsigned long) parent / L1_CACHE_BYTES;
+	hash = hash + (hash >> L1_DCACHE_BITS);
+	return hash & (L1_DCACHE_SIZE-1);
+}
+
+/* Must be called with d_lock held */
+static void d_remove_from_l1(const struct dentry *dentry)
+{
+	unsigned int n = dentry_l1_index(dentry->d_name.hash, dentry->d_parent);
+	dcache_l1[n] = &invalid_dentry;
+}
+
+static void d_add_to_l1(struct dentry *dentry, unsigned int hash, const struct dentry *parent)
+{
+	unsigned int n = dentry_l1_index(hash, parent);
+	dcache_l1[n] = dentry;
+}
+
+static struct dentry *d_lookup_l1(unsigned int hash, const struct dentry *parent)
+{
+	unsigned int n = dentry_l1_index(hash, parent);
+	return ACCESS_ONCE(dcache_l1[n]);
+}
+
+/*
  * Unhash a dentry without inserting an RCU walk barrier or checking that
  * dentry->d_lock is locked. The caller must take care of that, if
  * appropriate.
@@ -416,6 +455,7 @@ static void __d_shrink(struct dentry *dentry)
 			b = d_hash(dentry->d_parent, dentry->d_name.hash);
 
 		hlist_bl_lock(b);
+		d_remove_from_l1(dentry);
 		__hlist_bl_del(&dentry->d_hash);
 		dentry->d_hash.pprev = NULL;
 		hlist_bl_unlock(b);
@@ -1825,7 +1865,7 @@ static noinline enum slow_d_compare slow_dentry_cmp(
  * NOTE! The caller *has* to check the resulting dentry against the sequence
  * number we've returned before using any of the resulting dentry state!
  */
-struct dentry *__d_lookup_rcu(const struct dentry *parent,
+static noinline struct dentry *__d_lookup_rcu_slow(const struct dentry *parent,
 				const struct qstr *name,
 				unsigned *seqp, struct inode *inode)
 {
@@ -1896,12 +1936,49 @@ seqretry:
 
 		if (dentry->d_name.hash_len != hashlen)
 			continue;
-		if (!dentry_cmp(dentry, str, hashlen_len(hashlen)))
+		if (!dentry_cmp(dentry, str, hashlen_len(hashlen))) {
+			/* FIXME! RACY! What if it just got unhashed? */
+			d_add_to_l1(dentry, hashlen_hash(hashlen), parent);
 			return dentry;
+		}
 	}
 	return NULL;
 }
 
+/*
+ * Fast non-chained L1 hash lookup.
+ *
+ * NOTE! We don't need to worry about DCACHE_OP_COMPARE, because
+ * dentries with complex parents never get added to the L1 cache.
+ *
+ * We also don't need to worry about d_lookup_l1() returning NULL,
+ * because we fill the cache with otherwise valid dentries that do
+ * not match anything.
+ */
+struct dentry *__d_lookup_rcu(const struct dentry *parent,
+				const struct qstr *name,
+				unsigned *seqp, struct inode *inode)
+{
+	u64 hashlen = name->hash_len;
+	struct dentry *dentry = d_lookup_l1(hashlen_hash(hashlen), parent);
+	unsigned int seq;
+
+	do {
+		seq = raw_seqcount_begin(&dentry->d_seq);
+		if (unlikely(dentry->d_parent != parent))
+			break;
+		*seqp = seq;
+		if (unlikely(dentry->d_name.hash_len != hashlen))
+			break;
+		if (unlikely(dentry_cmp(dentry, name->name, hashlen_len(hashlen))))
+			break;
+		if (unlikely(d_unhashed(dentry)))
+			break;
+		return dentry;
+	} while (0);
+
+	return __d_lookup_rcu_slow(parent, name, seqp, inode);
+}
+
 /**
  * d_lookup - search for a dentry
  * @parent: parent dentry
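
For illustration only (not part of the patch): here is a minimal
standalone userspace sketch of the direct-mapped scheme. The names
("struct entry", "l1_index", "l1_cache") and the 64-byte line-size
constant are stand-ins invented for this sketch, not the kernel's;
only the indexing arithmetic mirrors dentry_l1_index() above.

#include <stdint.h>
#include <stdio.h>

#define L1_BITS		13
#define L1_SIZE		(1u << L1_BITS)
#define CACHELINE	64	/* stand-in for the kernel's L1_CACHE_BYTES */

struct entry {
	const void *parent;
	unsigned int hash;
};

static struct entry *l1_cache[L1_SIZE];

/* Fold the parent pointer into the name hash, then mask down to one slot */
static unsigned int l1_index(unsigned int hash, const void *parent)
{
	hash += (uintptr_t) parent / CACHELINE;
	hash += hash >> L1_BITS;
	return hash & (L1_SIZE - 1);
}

int main(void)
{
	struct entry e = { .parent = &e, .hash = 0xdeadbeef };
	unsigned int slot = l1_index(e.hash, e.parent);

	l1_cache[slot] = &e;	/* insert: one slot, no chain, no lock */

	/* a lookup recomputes the same slot; anything else there is a miss */
	printf("slot %u, hit %d\n", slot, l1_cache[slot] == &e);
	return 0;
}

The point of the direct mapping is that two (parent, hash) pairs which
fold to the same slot simply evict each other. That is harmless, because
the chained hash in __d_lookup_rcu_slow() remains the authoritative
lookup; the L1 array is purely an accelerator in front of it.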
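
The NOTE about checking the returned sequence number can be made
concrete with a hypothetical caller, again for illustration only: the
function name "lookup_checked" is invented, and the fragment assumes
the kernel's usual seqcount API (read_seqcount_retry() from
<linux/seqlock.h>) rather than any specific call site in this patch.

/*
 * Hypothetical caller sketch: the dentry returned under RCU may be
 * renamed or unhashed at any time, so the sequence number handed back
 * through *seqp must be re-validated before any dentry state is
 * trusted.
 */
static struct dentry *lookup_checked(struct dentry *parent,
				     const struct qstr *name,
				     struct inode *inode)
{
	unsigned seq;
	struct dentry *dentry;

	dentry = __d_lookup_rcu(parent, name, &seq, inode);
	if (dentry && read_seqcount_retry(&dentry->d_seq, seq))
		dentry = NULL;	/* raced: treat as a miss, fall back to ref-walk */
	return dentry;
}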