[PATCH] net: Convert net_mutex into rw_semaphore and down read it on net->init/->exit
From: Kirill Tkhai
Date: Tue Nov 14 2017 - 08:54:00 EST
Curently mutex is used to protect pernet operations list. It makes
cleanup_net() to execute ->exit methods of the same operations set,
which was used on the time of ->init, even after net namespace is
unlinked from net_namespace_list.
But the problem is it's need to synchronize_rcu() after net is removed
from net_namespace_list():
Destroy net_ns:
cleanup_net()
mutex_lock(&net_mutex)
list_del_rcu(&net->list)
synchronize_rcu() <--- Sleep there for ages
list_for_each_entry_reverse(ops, &pernet_list, list)
ops_exit_list(ops, &net_exit_list)
list_for_each_entry_reverse(ops, &pernet_list, list)
ops_free_list(ops, &net_exit_list)
mutex_unlock(&net_mutex)
This primitive is not fast, especially on the systems with many processors
and/or when preemptible RCU is enabled in config. So, all the time, while
cleanup_net() is waiting for RCU grace period, creation of new net namespaces
is not possible, the tasks, who makes it, are sleeping on the same mutex:
Create net_ns:
copy_net_ns()
mutex_lock_killable(&net_mutex) <--- Sleep there for ages
The solution is to convert net_mutex to the rw_semaphore. Then,
pernet_operations::init/::exit methods, modifying the net-related data,
will require down_read() locking only, while down_write() will be used
for changing pernet_list.
This gives signify performance increase, like you may see below. There
is measured sequential net namespace creation in a cycle, in single
thread, without other tasks (single user mode):
1)int main(int argc, char *argv[])
{
unsigned nr;
if (argc < 2) {
fprintf(stderr, "Provide nr iterations arg\n");
return 1;
}
nr = atoi(argv[1]);
while (nr-- > 0) {
if (unshare(CLONE_NEWNET)) {
perror("Can't unshare");
return 1;
}
}
return 0;
}
Origin, 100000 unshare():
0.03user 23.14system 1:39.85elapsed 23%CPU
Patched, 100000 unshare():
0.03user 67.49system 1:08.34elapsed 98%CPU
2)for i in {1..10000}; do unshare -n bash -c exit; done
Origin:
real 1m24,190s
user 0m6,225s
sys 0m15,132s
Patched:
real 0m18,235s (4.6 times faster)
user 0m4,544s
sys 0m13,796s
This patch requires commit 76f8507f7a64 "locking/rwsem: Add down_read_killable()"
from Linus tree (not in net-next yet).
Signed-off-by: Kirill Tkhai <ktkhai@xxxxxxxxxxxxx>
---
include/linux/rtnetlink.h | 2 +-
include/net/net_namespace.h | 9 +++++++--
net/core/net_namespace.c | 40 ++++++++++++++++++++--------------------
net/core/rtnetlink.c | 4 ++--
4 files changed, 30 insertions(+), 25 deletions(-)
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 54bcd970bfd3..36cc009f4580 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -32,7 +32,7 @@ extern int rtnl_trylock(void);
extern int rtnl_is_locked(void);
extern wait_queue_head_t netdev_unregistering_wq;
-extern struct mutex net_mutex;
+extern struct rw_semaphore net_sem;
#ifdef CONFIG_PROVE_LOCKING
extern bool lockdep_rtnl_is_held(void);
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 10f99dafd5ac..aaed826ccbe7 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -60,7 +60,7 @@ struct net {
struct list_head list; /* list of network namespaces */
struct list_head cleanup_list; /* namespaces on death row */
- struct list_head exit_list; /* Use only net_mutex */
+ struct list_head exit_list; /* Use only net_sem */
struct user_namespace *user_ns; /* Owning user namespace */
struct ucounts *ucounts;
@@ -89,7 +89,12 @@ struct net {
/* core fib_rules */
struct list_head rules_ops;
- struct list_head fib_notifier_ops; /* protected by net_mutex */
+ /*
+ * RCU-protected list, modifiable by pernet-init and -exit methods.
+ * When net namespace is alive (net::count > 0), all the changes
+ * are made under rw_sem held on write.
+ */
+ struct list_head fib_notifier_ops;
struct net_device *loopback_dev; /* The loopback */
struct netns_core core;
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 6cfdc7c84c48..f502b11b507e 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -29,7 +29,7 @@
static LIST_HEAD(pernet_list);
static struct list_head *first_device = &pernet_list;
-DEFINE_MUTEX(net_mutex);
+DECLARE_RWSEM(net_sem);
LIST_HEAD(net_namespace_list);
EXPORT_SYMBOL_GPL(net_namespace_list);
@@ -65,11 +65,11 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data)
{
struct net_generic *ng, *old_ng;
- BUG_ON(!mutex_is_locked(&net_mutex));
+ BUG_ON(!rwsem_is_locked(&net_sem));
BUG_ON(id < MIN_PERNET_OPS_ID);
old_ng = rcu_dereference_protected(net->gen,
- lockdep_is_held(&net_mutex));
+ lockdep_is_held(&net_sem));
if (old_ng->s.len > id) {
old_ng->ptr[id] = data;
return 0;
@@ -278,7 +278,7 @@ struct net *get_net_ns_by_id(struct net *net, int id)
*/
static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
{
- /* Must be called with net_mutex held */
+ /* Must be called with net_sem held */
const struct pernet_operations *ops, *saved_ops;
int error = 0;
LIST_HEAD(net_exit_list);
@@ -406,7 +406,7 @@ struct net *copy_net_ns(unsigned long flags,
get_user_ns(user_ns);
- rv = mutex_lock_killable(&net_mutex);
+ rv = down_read_killable(&net_sem);
if (rv < 0) {
net_free(net);
dec_net_namespaces(ucounts);
@@ -421,7 +421,7 @@ struct net *copy_net_ns(unsigned long flags,
list_add_tail_rcu(&net->list, &net_namespace_list);
rtnl_unlock();
}
- mutex_unlock(&net_mutex);
+ up_read(&net_sem);
if (rv < 0) {
dec_net_namespaces(ucounts);
put_user_ns(user_ns);
@@ -446,7 +446,7 @@ static void cleanup_net(struct work_struct *work)
list_replace_init(&cleanup_list, &net_kill_list);
spin_unlock_irq(&cleanup_list_lock);
- mutex_lock(&net_mutex);
+ down_read(&net_sem);
/* Don't let anyone else find us. */
rtnl_lock();
@@ -486,7 +486,7 @@ static void cleanup_net(struct work_struct *work)
list_for_each_entry_reverse(ops, &pernet_list, list)
ops_free_list(ops, &net_exit_list);
- mutex_unlock(&net_mutex);
+ up_read(&net_sem);
/* Ensure there are no outstanding rcu callbacks using this
* network namespace.
@@ -513,8 +513,8 @@ static void cleanup_net(struct work_struct *work)
*/
void net_ns_barrier(void)
{
- mutex_lock(&net_mutex);
- mutex_unlock(&net_mutex);
+ down_write(&net_sem);
+ up_write(&net_sem);
}
EXPORT_SYMBOL(net_ns_barrier);
@@ -841,7 +841,7 @@ static int __init net_ns_init(void)
rcu_assign_pointer(init_net.gen, ng);
- mutex_lock(&net_mutex);
+ down_read(&net_sem);
if (setup_net(&init_net, &init_user_ns))
panic("Could not setup the initial network namespace");
@@ -851,7 +851,7 @@ static int __init net_ns_init(void)
list_add_tail_rcu(&init_net.list, &net_namespace_list);
rtnl_unlock();
- mutex_unlock(&net_mutex);
+ up_read(&net_sem);
register_pernet_subsys(&net_ns_ops);
@@ -991,9 +991,9 @@ static void unregister_pernet_operations(struct pernet_operations *ops)
int register_pernet_subsys(struct pernet_operations *ops)
{
int error;
- mutex_lock(&net_mutex);
+ down_write(&net_sem);
error = register_pernet_operations(first_device, ops);
- mutex_unlock(&net_mutex);
+ up_write(&net_sem);
return error;
}
EXPORT_SYMBOL_GPL(register_pernet_subsys);
@@ -1009,9 +1009,9 @@ EXPORT_SYMBOL_GPL(register_pernet_subsys);
*/
void unregister_pernet_subsys(struct pernet_operations *ops)
{
- mutex_lock(&net_mutex);
+ down_write(&net_sem);
unregister_pernet_operations(ops);
- mutex_unlock(&net_mutex);
+ up_write(&net_sem);
}
EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
@@ -1037,11 +1037,11 @@ EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
int register_pernet_device(struct pernet_operations *ops)
{
int error;
- mutex_lock(&net_mutex);
+ down_write(&net_sem);
error = register_pernet_operations(&pernet_list, ops);
if (!error && (first_device == &pernet_list))
first_device = &ops->list;
- mutex_unlock(&net_mutex);
+ up_write(&net_sem);
return error;
}
EXPORT_SYMBOL_GPL(register_pernet_device);
@@ -1057,11 +1057,11 @@ EXPORT_SYMBOL_GPL(register_pernet_device);
*/
void unregister_pernet_device(struct pernet_operations *ops)
{
- mutex_lock(&net_mutex);
+ down_write(&net_sem);
if (&ops->list == first_device)
first_device = first_device->next;
unregister_pernet_operations(ops);
- mutex_unlock(&net_mutex);
+ up_write(&net_sem);
}
EXPORT_SYMBOL_GPL(unregister_pernet_device);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 5ace48926b19..caa215fd170b 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -390,11 +390,11 @@ static void rtnl_lock_unregistering_all(void)
void rtnl_link_unregister(struct rtnl_link_ops *ops)
{
/* Close the race with cleanup_net() */
- mutex_lock(&net_mutex);
+ down_write(&net_sem);
rtnl_lock_unregistering_all();
__rtnl_link_unregister(ops);
rtnl_unlock();
- mutex_unlock(&net_mutex);
+ up_write(&net_sem);
}
EXPORT_SYMBOL_GPL(rtnl_link_unregister);