Re: [PATCH] proc/sysctl: prune stale dentries during unregistering

From: Konstantin Khlebnikov
Date: Sat Feb 18 2017 - 13:56:20 EST


This patch has locking problem. I've got lockdep splat under LTP.

[ 6633.115456] ======================================================
[ 6633.115502] [ INFO: possible circular locking dependency detected ]
[ 6633.115553] 4.9.10-debug+ #9 Tainted: G L
[ 6633.115584] -------------------------------------------------------
[ 6633.115627] ksm02/284980 is trying to acquire lock:
[ 6633.115659] (&sb->s_type->i_lock_key#4){+.+...}, at: [<ffffffff816bc1ce>] igrab+0x1e/0x80
[ 6633.115834] but task is already holding lock:
[ 6633.115882] (sysctl_lock){+.+...}, at: [<ffffffff817e379b>] unregister_sysctl_table+0x6b/0x110
[ 6633.116026] which lock already depends on the new lock.
[ 6633.116026]
[ 6633.116080]
[ 6633.116080] the existing dependency chain (in reverse order) is:
[ 6633.116117]
-> #2 (sysctl_lock){+.+...}:
-> #1 (&(&dentry->d_lockref.lock)->rlock){+.+...}:
-> #0 (&sb->s_type->i_lock_key#4){+.+...}:

d_lock nests inside i_lock
sysctl_lock nests inside d_lock in d_compare

This patch adds i_lock nesting inside sysctl_lock.

On 10.02.2017 10:35, Konstantin Khlebnikov wrote:
Currently unregistering sysctl table does not prune its dentries.
Stale dentries could slowdown sysctl operations significantly.

For example, command:

# for i in {1..100000} ; do unshare -n -- sysctl -a &> /dev/null ; done

creates a millions of stale denties around sysctls of loopback interface:

# sysctl fs.dentry-state
fs.dentry-state = 25812579 24724135 45 0 0 0

All of them have matching names thus lookup have to scan though whole
hash chain and call d_compare (proc_sys_compare) which checks them
under system-wide spinlock (sysctl_lock).

# time sysctl -a > /dev/null
real 1m12.806s
user 0m0.016s
sys 1m12.400s

Currently only memory reclaimer could remove this garbage.
But without significant memory pressure this never happens.

This patch collects sysctl inodes into list on sysctl table header and
prunes all their dentries once that table unregisters.

Signed-off-by: Konstantin Khlebnikov <khlebnikov@xxxxxxxxxxxxxx>
Suggested-by: Al Viro <viro@xxxxxxxxxxxxxxxxxx>
---
fs/proc/inode.c | 3 ++
fs/proc/internal.h | 7 ++++--
fs/proc/proc_sysctl.c | 59 +++++++++++++++++++++++++++++++++++-------------
include/linux/sysctl.h | 1 +
4 files changed, 51 insertions(+), 19 deletions(-)

diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 842a5ff5b85c..7ad9ed7958af 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -43,10 +43,11 @@ static void proc_evict_inode(struct inode *inode)
de = PDE(inode);
if (de)
pde_put(de);
+
head = PROC_I(inode)->sysctl;
if (head) {
RCU_INIT_POINTER(PROC_I(inode)->sysctl, NULL);
- sysctl_head_put(head);
+ proc_sys_evict_inode(inode, head);
}
}

diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 2de5194ba378..ed1d762160e6 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -65,6 +65,7 @@ struct proc_inode {
struct proc_dir_entry *pde;
struct ctl_table_header *sysctl;
struct ctl_table *sysctl_entry;
+ struct list_head sysctl_inodes;
const struct proc_ns_operations *ns_ops;
struct inode vfs_inode;
};
@@ -249,10 +250,12 @@ extern void proc_thread_self_init(void);
*/
#ifdef CONFIG_PROC_SYSCTL
extern int proc_sys_init(void);
-extern void sysctl_head_put(struct ctl_table_header *);
+extern void proc_sys_evict_inode(struct inode *inode,
+ struct ctl_table_header *head);
#else
static inline void proc_sys_init(void) { }
-static inline void sysctl_head_put(struct ctl_table_header *head) { }
+static inline void proc_sys_evict_inode(struct inode *inode,
+ struct ctl_table_header *head) { }
#endif

/*
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index d4e37acd4821..8efb1e10b025 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -190,6 +190,7 @@ static void init_header(struct ctl_table_header *head,
head->set = set;
head->parent = NULL;
head->node = node;
+ INIT_LIST_HEAD(&head->inodes);
if (node) {
struct ctl_table *entry;
for (entry = table; entry->procname; entry++, node++)
@@ -259,6 +260,29 @@ static void unuse_table(struct ctl_table_header *p)
complete(p->unregistering);
}

+/* called under sysctl_lock */
+static void proc_sys_prune_dcache(struct ctl_table_header *head)
+{
+ struct inode *inode, *prev = NULL;
+ struct proc_inode *ei;
+
+ list_for_each_entry(ei, &head->inodes, sysctl_inodes) {
+ inode = igrab(&ei->vfs_inode);
+ if (inode) {
+ spin_unlock(&sysctl_lock);
+ iput(prev);
+ prev = inode;
+ d_prune_aliases(inode);
+ spin_lock(&sysctl_lock);
+ }
+ }
+ if (prev) {
+ spin_unlock(&sysctl_lock);
+ iput(prev);
+ spin_lock(&sysctl_lock);
+ }
+}
+
/* called under sysctl_lock, will reacquire if has to wait */
static void start_unregistering(struct ctl_table_header *p)
{
@@ -278,27 +302,17 @@ static void start_unregistering(struct ctl_table_header *p)
p->unregistering = ERR_PTR(-EINVAL);
}
/*
+ * Prune dentries for unregistered sysctls: namespaced sysctls
+ * can have duplicate names and contaminate dcache very badly.
+ */
+ proc_sys_prune_dcache(p);
+ /*
* do not remove from the list until nobody holds it; walking the
* list in do_sysctl() relies on that.
*/
erase_header(p);
}

-static void sysctl_head_get(struct ctl_table_header *head)
-{
- spin_lock(&sysctl_lock);
- head->count++;
- spin_unlock(&sysctl_lock);
-}
-
-void sysctl_head_put(struct ctl_table_header *head)
-{
- spin_lock(&sysctl_lock);
- if (!--head->count)
- kfree_rcu(head, rcu);
- spin_unlock(&sysctl_lock);
-}
-
static struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
{
BUG_ON(!head);
@@ -440,11 +454,15 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,

inode->i_ino = get_next_ino();

- sysctl_head_get(head);
ei = PROC_I(inode);
ei->sysctl = head;
ei->sysctl_entry = table;

+ spin_lock(&sysctl_lock);
+ list_add(&ei->sysctl_inodes, &head->inodes);
+ head->count++;
+ spin_unlock(&sysctl_lock);
+
inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
inode->i_mode = table->mode;
if (!S_ISDIR(table->mode)) {
@@ -466,6 +484,15 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
return inode;
}

+void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head)
+{
+ spin_lock(&sysctl_lock);
+ list_del(&PROC_I(inode)->sysctl_inodes);
+ if (!--head->count)
+ kfree_rcu(head, rcu);
+ spin_unlock(&sysctl_lock);
+}
+
static struct ctl_table_header *grab_header(struct inode *inode)
{
struct ctl_table_header *head = PROC_I(inode)->sysctl;
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index adf4e51cf597..b7e82049fec7 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -143,6 +143,7 @@ struct ctl_table_header
struct ctl_table_set *set;
struct ctl_dir *parent;
struct ctl_node *node;
+ struct list_head inodes; /* head for proc_inode->sysctl_inodes */
};

struct ctl_dir {