Re: [PATCH v2 03/31] net: Introduce net_sem for protection of pernet_list

From: Andrei Vagin
Date: Wed Jan 17 2018 - 15:05:05 EST


On Mon, Nov 20, 2017 at 09:32:34PM +0300, Kirill Tkhai wrote:
> Curently mutex is used to protect pernet operations list. It makes
> cleanup_net() to execute ->exit methods of the same operations set,
> which was used on the time of ->init, even after net namespace is
> unlinked from net_namespace_list.
>
> But the problem is it's need to synchronize_rcu() after net is removed
> from net_namespace_list():
>
> Destroy net_ns:
> cleanup_net()
> mutex_lock(&net_mutex)
> list_del_rcu(&net->list)
> synchronize_rcu() <--- Sleep there for ages
> list_for_each_entry_reverse(ops, &pernet_list, list)
> ops_exit_list(ops, &net_exit_list)
> list_for_each_entry_reverse(ops, &pernet_list, list)
> ops_free_list(ops, &net_exit_list)
> mutex_unlock(&net_mutex)
>
> This primitive is not fast, especially on the systems with many processors
> and/or when preemptible RCU is enabled in config. So, all the time, while
> cleanup_net() is waiting for RCU grace period, creation of new net namespaces
> is not possible, the tasks, who makes it, are sleeping on the same mutex:
>
> Create net_ns:
> copy_net_ns()
> mutex_lock_killable(&net_mutex) <--- Sleep there for ages
>
> I observed 20-30 seconds hangs of "unshare -n" on ordinary 8-cpu laptop
> with preemptible RCU enabled.
>
> The solution is to convert net_mutex to the rw_semaphore and add small locks
> to really small number of pernet_operations, what really need them. Then,
> pernet_operations::init/::exit methods, modifying the net-related data,
> will require down_read() locking only, while down_write() will be used
> for changing pernet_list.
>
> This gives signify performance increase, after all patch set is applied,
> like you may see here:
>
> %for i in {1..10000}; do unshare -n bash -c exit; done
>
> *before*
> real 1m40,377s
> user 0m9,672s
> sys 0m19,928s
>
> *after*
> real 0m17,007s
> user 0m5,311s
> sys 0m11,779
>
> (5.8 times faster)
>
> This patch starts replacing net_mutex to net_sem. It adds rw_semaphore,
> describes the variables it protects, and makes to use where appropriate.
> net_mutex is still present, and next patches will kick it out step-by-step.
>
> Signed-off-by: Kirill Tkhai <ktkhai@xxxxxxxxxxxxx>
> ---
> include/linux/rtnetlink.h | 1 +
> net/core/net_namespace.c | 39 ++++++++++++++++++++++++++-------------
> net/core/rtnetlink.c | 4 ++--
> 3 files changed, 29 insertions(+), 15 deletions(-)
>
> diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
> index 2032ce2eb20b..f640fc87fe1d 100644
> --- a/include/linux/rtnetlink.h
> +++ b/include/linux/rtnetlink.h
> @@ -35,6 +35,7 @@ extern int rtnl_is_locked(void);
>
> extern wait_queue_head_t netdev_unregistering_wq;
> extern struct mutex net_mutex;
> +extern struct rw_semaphore net_sem;
>
> #ifdef CONFIG_PROVE_LOCKING
> extern bool lockdep_rtnl_is_held(void);
> diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
> index 2e512965bf42..859dce31e37e 100644
> --- a/net/core/net_namespace.c
> +++ b/net/core/net_namespace.c
> @@ -41,6 +41,11 @@ struct net init_net = {

> static LIST_HEAD(pernet_list);
> static struct list_head *first_device = &pernet_list;
> DEFINE_MUTEX(net_mutex);

With all patches, we still have the net_mutex, I think we need to add a
comment, which explains why we need it. Are "sync" pernet operations
depricated after this series? Or is it ok to have them?


> EXPORT_SYMBOL(init_net);
>
> static bool init_net_initialized;
> +/*
> + * net_sem: protects: pernet_list, net_generic_ids,
> + * init_net_initialized and first_device pointer.
> + */
> +DECLARE_RWSEM(net_sem);
>
> #define MIN_PERNET_OPS_ID \
> ((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *))
> @@ -279,7 +284,7 @@ struct net *get_net_ns_by_id(struct net *net, int id)
> */
> static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
> {
> - /* Must be called with net_mutex held */
> + /* Must be called with net_sem held */
> const struct pernet_operations *ops, *saved_ops;
> int error = 0;
> LIST_HEAD(net_exit_list);
> @@ -411,12 +416,16 @@ struct net *copy_net_ns(unsigned long flags,
> net->ucounts = ucounts;
> get_user_ns(user_ns);
>
> - rv = mutex_lock_killable(&net_mutex);
> + rv = down_read_killable(&net_sem);
> if (rv < 0)
> goto put_userns;
> -
> + rv = mutex_lock_killable(&net_mutex);
> + if (rv < 0)
> + goto up_read;
> rv = setup_net(net, user_ns);
> mutex_unlock(&net_mutex);
> +up_read:
> + up_read(&net_sem);
> if (rv < 0) {
> put_userns:
> put_user_ns(user_ns);
> @@ -443,6 +452,7 @@ static void cleanup_net(struct work_struct *work)
> list_replace_init(&cleanup_list, &net_kill_list);
> spin_unlock_irq(&cleanup_list_lock);
>
> + down_read(&net_sem);
> mutex_lock(&net_mutex);
>
> /* Don't let anyone else find us. */
> @@ -484,6 +494,7 @@ static void cleanup_net(struct work_struct *work)
> ops_free_list(ops, &net_exit_list);
>
> mutex_unlock(&net_mutex);
> + up_read(&net_sem);
>
> /* Ensure there are no outstanding rcu callbacks using this
> * network namespace.
> @@ -510,8 +521,10 @@ static void cleanup_net(struct work_struct *work)
> */
> void net_ns_barrier(void)
> {
> + down_write(&net_sem);
> mutex_lock(&net_mutex);
> mutex_unlock(&net_mutex);
> + up_write(&net_sem);
> }
> EXPORT_SYMBOL(net_ns_barrier);
>
> @@ -838,12 +851,12 @@ static int __init net_ns_init(void)
>
> rcu_assign_pointer(init_net.gen, ng);
>
> - mutex_lock(&net_mutex);
> + down_write(&net_sem);
> if (setup_net(&init_net, &init_user_ns))
> panic("Could not setup the initial network namespace");
>
> init_net_initialized = true;
> - mutex_unlock(&net_mutex);
> + up_write(&net_sem);
>
> register_pernet_subsys(&net_ns_ops);
>
> @@ -983,9 +996,9 @@ static void unregister_pernet_operations(struct pernet_operations *ops)
> int register_pernet_subsys(struct pernet_operations *ops)
> {
> int error;
> - mutex_lock(&net_mutex);
> + down_write(&net_sem);
> error = register_pernet_operations(first_device, ops);
> - mutex_unlock(&net_mutex);
> + up_write(&net_sem);
> return error;
> }
> EXPORT_SYMBOL_GPL(register_pernet_subsys);
> @@ -1001,9 +1014,9 @@ EXPORT_SYMBOL_GPL(register_pernet_subsys);
> */
> void unregister_pernet_subsys(struct pernet_operations *ops)
> {
> - mutex_lock(&net_mutex);
> + down_write(&net_sem);
> unregister_pernet_operations(ops);
> - mutex_unlock(&net_mutex);
> + up_write(&net_sem);
> }
> EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
>
> @@ -1029,11 +1042,11 @@ EXPORT_SYMBOL_GPL(unregister_pernet_subsys);
> int register_pernet_device(struct pernet_operations *ops)
> {
> int error;
> - mutex_lock(&net_mutex);
> + down_write(&net_sem);
> error = register_pernet_operations(&pernet_list, ops);
> if (!error && (first_device == &pernet_list))
> first_device = &ops->list;
> - mutex_unlock(&net_mutex);
> + up_write(&net_sem);
> return error;
> }
> EXPORT_SYMBOL_GPL(register_pernet_device);
> @@ -1049,11 +1062,11 @@ EXPORT_SYMBOL_GPL(register_pernet_device);
> */
> void unregister_pernet_device(struct pernet_operations *ops)
> {
> - mutex_lock(&net_mutex);
> + down_write(&net_sem);
> if (&ops->list == first_device)
> first_device = first_device->next;
> unregister_pernet_operations(ops);
> - mutex_unlock(&net_mutex);
> + up_write(&net_sem);
> }
> EXPORT_SYMBOL_GPL(unregister_pernet_device);
>
> diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
> index dabba2a91fc8..cb06d43c4230 100644
> --- a/net/core/rtnetlink.c
> +++ b/net/core/rtnetlink.c
> @@ -390,11 +390,11 @@ static void rtnl_lock_unregistering_all(void)
> void rtnl_link_unregister(struct rtnl_link_ops *ops)
> {
> /* Close the race with cleanup_net() */
> - mutex_lock(&net_mutex);
> + down_write(&net_sem);
> rtnl_lock_unregistering_all();
> __rtnl_link_unregister(ops);
> rtnl_unlock();
> - mutex_unlock(&net_mutex);
> + up_write(&net_sem);
> }
> EXPORT_SYMBOL_GPL(rtnl_link_unregister);
>
>