[PATCH 05/11] fs/dcache: Reclaim excessive negative dentries in directories

From: Waiman Long
Date: Wed Feb 26 2020 - 11:15:25 EST


When the "dentry-dir-max" sysctl parameter is set, it will enable the
checking of dentry count in the parent directory when a negative dentry
is being retained. If the count exceeds the given limit, it will schedule
a work function to scan the children of that parent directory to find
negative dentries to be reclaimed. Positive dentries will not be touched.

Signed-off-by: Waiman Long <longman@xxxxxxxxxx>
---
fs/dcache.c | 207 +++++++++++++++++++++++++++++++++++++++++
include/linux/dcache.h | 2 +
2 files changed, 209 insertions(+)

diff --git a/fs/dcache.c b/fs/dcache.c
index 8f3ac31a582b..01c6d7277244 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -133,6 +133,11 @@ static DEFINE_PER_CPU(long, nr_dentry_negative);
int dcache_dentry_dir_max_sysctl __read_mostly;
EXPORT_SYMBOL_GPL(dcache_dentry_dir_max_sysctl);

+static LLIST_HEAD(negative_reclaim_list);
+static void negative_reclaim_check(struct dentry *parent);
+static void negative_reclaim_workfn(struct work_struct *work);
+static DECLARE_WORK(negative_reclaim_work, negative_reclaim_workfn);
+
#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)

/*
@@ -869,7 +874,20 @@ void dput(struct dentry *dentry)
rcu_read_unlock();

if (likely(retain_dentry(dentry))) {
+ struct dentry *neg_parent = NULL;
+
+ if (d_is_negative(dentry)) {
+ neg_parent = dentry->d_parent;
+ rcu_read_lock();
+ }
spin_unlock(&dentry->d_lock);
+
+ /*
+ * Negative dentry reclaim check is only done when
+ * a negative dentry is being put into a LRU list.
+ */
+ if (neg_parent)
+ negative_reclaim_check(neg_parent);
return;
}

@@ -1261,6 +1279,195 @@ void shrink_dcache_sb(struct super_block *sb)
}
EXPORT_SYMBOL(shrink_dcache_sb);

+/*
+ * Return true if reclaiming negative dentry can happen.
+ */
+static inline bool can_reclaim_dentry(unsigned int flags)
+{
+ return !(flags & (DCACHE_SHRINK_LIST | DCACHE_GENOCIDE |
+ DCACHE_DENTRY_KILLED));
+}
+
+struct reclaim_dentry
+{
+ struct llist_node reclaim_node;
+ struct dentry *parent_dir;
+};
+
+/*
+ * Reclaim excess negative dentries in a directory
+ */
+static void reclaim_negative_dentry(struct dentry *parent,
+ struct list_head *dispose)
+{
+ struct dentry *child;
+ int limit = READ_ONCE(dcache_dentry_dir_max_sysctl);
+ int cnt;
+
+ lockdep_assert_held(&parent->d_lock);
+
+ cnt = parent->d_nchildren;
+
+ /*
+ * Compute # of negative dentries to be reclaimed
+ * An extra 1/8 of dcache_dentry_dir_max_sysctl is added.
+ */
+ if (cnt <= limit)
+ return;
+ cnt -= limit;
+ cnt += (limit >> 3);
+
+ /*
+ * The d_subdirs is treated like a kind of LRU where
+ * non-negative dentries are skipped. Negative dentries
+ * with DCACHE_REFERENCED bit set are also skipped but
+ * with DCACHE_REFERENCED cleared.
+ */
+ list_for_each_entry(child, &parent->d_subdirs, d_child) {
+ /*
+ * This check is racy and so it may not be accurate.
+ */
+ if (!d_is_negative(child))
+ continue;
+
+ if (!spin_trylock(&child->d_lock))
+ continue;
+
+ /*
+ * Only reclaim zero-refcnt negative dentries in the
+ * LRU, but not in shrink list.
+ */
+ if (can_reclaim_dentry(child->d_flags) &&
+ d_is_negative(child) && d_in_lru(child) &&
+ !child->d_lockref.count) {
+ if (child->d_flags & DCACHE_REFERENCED) {
+ child->d_flags &= ~DCACHE_REFERENCED;
+ } else {
+ cnt--;
+ d_lru_del(child);
+ d_shrink_add(child, dispose);
+ }
+ }
+ spin_unlock(&child->d_lock);
+ if (!cnt) {
+ child = list_next_entry(child, d_child);
+ break;
+ }
+ }
+
+ if (&child->d_child != &parent->d_subdirs) {
+ /*
+ * Split out the childs from the head up to just
+ * before the current entry into a separate list and
+ * then splice it to the end of the child list so
+ * that the unscanned entries will be in the front.
+ * This simulates LRU.
+ */
+ struct list_head list;
+
+ list_cut_before(&list, &parent->d_subdirs,
+ &child->d_child);
+ list_splice_tail(&list, &parent->d_subdirs);
+ }
+}
+
+/*
+ * Excess negative dentry reclaim work function.
+ */
+static void negative_reclaim_workfn(struct work_struct *work)
+{
+ struct llist_node *nodes, *next;
+ struct dentry *parent;
+ struct reclaim_dentry *dentry_node;
+
+ /*
+ * Collect excess negative dentries in dispose list & shrink them.
+ */
+ for (nodes = llist_del_all(&negative_reclaim_list);
+ nodes; nodes = next) {
+ LIST_HEAD(dispose);
+
+ next = llist_next(nodes);
+ dentry_node = container_of(nodes, struct reclaim_dentry,
+ reclaim_node);
+ parent = dentry_node->parent_dir;
+ spin_lock(&parent->d_lock);
+
+ if (d_is_dir(parent) &&
+ can_reclaim_dentry(parent->d_flags) &&
+ (parent->d_flags & DCACHE_RECLAIMING))
+ reclaim_negative_dentry(parent, &dispose);
+
+ if (!list_empty(&dispose)) {
+ spin_unlock(&parent->d_lock);
+ shrink_dentry_list(&dispose);
+ spin_lock(&parent->d_lock);
+ }
+
+ parent->d_flags &= ~DCACHE_RECLAIMING;
+ spin_unlock(&parent->d_lock);
+ dput(parent);
+ kfree(dentry_node);
+ cond_resched();
+ }
+}
+
+/*
+ * Check the parent to see if it has too many negative dentries and queue
+ * it up for the negative dentry reclaim work function to handle it.
+ */
+static void negative_reclaim_check(struct dentry *parent)
+ __releases(rcu)
+{
+ int limit = dcache_dentry_dir_max_sysctl;
+ struct reclaim_dentry *dentry_node;
+
+ if (!limit)
+ goto rcu_unlock_out;
+
+ /*
+ * These checks are racy before spin_lock().
+ */
+ if (!can_reclaim_dentry(parent->d_flags) ||
+ (parent->d_flags & DCACHE_RECLAIMING) ||
+ (READ_ONCE(parent->d_nchildren) <= limit))
+ goto rcu_unlock_out;
+
+ spin_lock(&parent->d_lock);
+ if (!can_reclaim_dentry(parent->d_flags) ||
+ (parent->d_flags & DCACHE_RECLAIMING) ||
+ (READ_ONCE(parent->d_nchildren) <= limit))
+ goto unlock_out;
+
+ if (!d_is_dir(parent))
+ goto unlock_out;
+
+ dentry_node = kzalloc(sizeof(*dentry_node), GFP_NOWAIT);
+ if (!dentry_node)
+ goto unlock_out;
+
+ rcu_read_unlock();
+ __dget_dlock(parent);
+ dentry_node->parent_dir = parent;
+ parent->d_flags |= DCACHE_RECLAIMING;
+ spin_unlock(&parent->d_lock);
+
+ /*
+ * Only call schedule_work() if negative_reclaim_list is previously
+ * empty. Otherwise, schedule_work() had been called but the workfn
+ * workfn hasn't retrieved the list yet.
+ */
+ if (llist_add(&dentry_node->reclaim_node, &negative_reclaim_list))
+ schedule_work(&negative_reclaim_work);
+ return;
+
+unlock_out:
+ spin_unlock(&parent->d_lock);
+rcu_unlock_out:
+ rcu_read_unlock();
+ return;
+}
+
/**
* enum d_walk_ret - action to talke during tree walk
* @D_WALK_CONTINUE: contrinue walk
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index e9e66eb50d1a..f14d72738903 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -13,6 +13,7 @@
#include <linux/lockref.h>
#include <linux/stringhash.h>
#include <linux/wait.h>
+#include <linux/llist.h>

struct path;
struct vfsmount;
@@ -214,6 +215,7 @@ struct dentry_operations {
#define DCACHE_FALLTHRU 0x01000000 /* Fall through to lower layer */
#define DCACHE_ENCRYPTED_NAME 0x02000000 /* Encrypted name (dir key was unavailable) */
#define DCACHE_OP_REAL 0x04000000
+#define DCACHE_RECLAIMING 0x08000000 /* Reclaiming negative dentries */

#define DCACHE_PAR_LOOKUP 0x10000000 /* being looked up (with parent locked shared) */
#define DCACHE_DENTRY_CURSOR 0x20000000
--
2.18.1