[RFC] netlink: get rid of nl_table_lock

From: Stephen Hemminger
Date: Sat Jan 03 2015 - 14:02:37 EST


As a follow on to Thomas's patch I think this would complete the
transistion to RCU for netlink.
Compile tested only.



This patch gets rid of the reader/writer nl_table_lock and replaces it
with exclusively using RCU for reading, and a mutex for writing.

Signed-off-by: Stephen Hemminger <stephen@xxxxxxxxxxxxxxxxxx>


--- a/include/net/sock.h 2015-01-01 10:05:35.805253771 -0800
+++ b/include/net/sock.h 2015-01-03 10:45:29.661737618 -0800
@@ -666,6 +666,8 @@ static inline void sk_add_bind_node(stru
hlist_for_each_entry_safe(__sk, tmp, list, sk_node)
#define sk_for_each_bound(__sk, list) \
hlist_for_each_entry(__sk, list, sk_bind_node)
+#define sk_for_each_bound_rcu(__sk, list) \
+ hlist_for_each_entry_rcu(__sk, list, sk_bind_node)

/**
* sk_nulls_for_each_entry_offset - iterate over a list at a given struct offset
--- a/net/netlink/af_netlink.c 2015-01-03 10:04:37.738319202 -0800
+++ b/net/netlink/af_netlink.c 2015-01-03 10:53:29.568387253 -0800
@@ -100,15 +100,14 @@ static void netlink_skb_destructor(struc
* Lookup and traversal are protected with an RCU read-side lock. Insertion
* and removal are protected with nl_sk_hash_lock while using RCU list
* modification primitives and may run in parallel to RCU protected lookups.
- * Destruction of the Netlink socket may only occur *after* nl_table_lock has
+ * Destruction of the Netlink socket may only occur *after* nl_table_mutex has
* been acquired * either during or after the socket has been removed from
* the list and after an RCU grace period.
*/
-DEFINE_RWLOCK(nl_table_lock);
-EXPORT_SYMBOL_GPL(nl_table_lock);
-static atomic_t nl_table_users = ATOMIC_INIT(0);
+static DEFINE_MUTEX(nl_table_mutex);

-#define nl_deref_protected(X) rcu_dereference_protected(X, lockdep_is_held(&nl_table_lock));
+#define nl_deref_protected(X) \
+ rcu_dereference_protected(X, lockdep_is_held(&nl_table_mutex))

/* Protects netlink socket hash table mutations */
DEFINE_MUTEX(nl_sk_hash_lock);
@@ -118,7 +117,8 @@ EXPORT_SYMBOL_GPL(nl_sk_hash_lock);
static int lockdep_nl_sk_hash_is_held(void *parent)
{
if (debug_locks)
- return lockdep_is_held(&nl_sk_hash_lock) || lockdep_is_held(&nl_table_lock);
+ return lockdep_is_held(&nl_sk_hash_lock) ||
+ lockdep_is_held(&nl_table_mutex);
return 1;
}
#endif
@@ -925,59 +925,14 @@ static void netlink_sock_destruct(struct
WARN_ON(nlk_sk(sk)->groups);
}

-/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on
- * SMP. Look, when several writers sleep and reader wakes them up, all but one
- * immediately hit write lock and grab all the cpus. Exclusive sleep solves
- * this, _but_ remember, it adds useless work on UP machines.
- */
-
void netlink_table_grab(void)
- __acquires(nl_table_lock)
{
- might_sleep();
-
- write_lock_irq(&nl_table_lock);
-
- if (atomic_read(&nl_table_users)) {
- DECLARE_WAITQUEUE(wait, current);
-
- add_wait_queue_exclusive(&nl_table_wait, &wait);
- for (;;) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- if (atomic_read(&nl_table_users) == 0)
- break;
- write_unlock_irq(&nl_table_lock);
- schedule();
- write_lock_irq(&nl_table_lock);
- }
-
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&nl_table_wait, &wait);
- }
+ mutex_lock(&nl_table_mutex);
}

void netlink_table_ungrab(void)
- __releases(nl_table_lock)
-{
- write_unlock_irq(&nl_table_lock);
- wake_up(&nl_table_wait);
-}
-
-static inline void
-netlink_lock_table(void)
{
- /* read_lock() synchronizes us to netlink_table_grab */
-
- read_lock(&nl_table_lock);
- atomic_inc(&nl_table_users);
- read_unlock(&nl_table_lock);
-}
-
-static inline void
-netlink_unlock_table(void)
-{
- if (atomic_dec_and_test(&nl_table_users))
- wake_up(&nl_table_wait);
+ mutex_unlock(&nl_table_mutex);
}

struct netlink_compare_arg
@@ -1151,12 +1106,12 @@ static int netlink_create(struct net *ne
if (protocol < 0 || protocol >= MAX_LINKS)
return -EPROTONOSUPPORT;

- netlink_lock_table();
+ mutex_lock(&nl_table_mutex);
#ifdef CONFIG_MODULES
if (!nl_table[protocol].registered) {
- netlink_unlock_table();
+ mutex_unlock(&nl_table_mutex);
request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol);
- netlink_lock_table();
+ mutex_lock(&nl_table_mutex);
}
#endif
if (nl_table[protocol].registered &&
@@ -1167,7 +1122,7 @@ static int netlink_create(struct net *ne
cb_mutex = nl_table[protocol].cb_mutex;
bind = nl_table[protocol].bind;
unbind = nl_table[protocol].unbind;
- netlink_unlock_table();
+ mutex_unlock(&nl_table_mutex);

if (err < 0)
goto out;
@@ -1982,17 +1937,13 @@ int netlink_broadcast_filtered(struct so
info.tx_filter = filter;
info.tx_data = filter_data;

- /* While we sleep in clone, do not allow to change socket list */
-
- netlink_lock_table();
-
- sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list)
+ rcu_read_lock();
+ sk_for_each_bound_rcu(sk, &nl_table[ssk->sk_protocol].mc_list)
do_one_broadcast(sk, &info);
+ rcu_read_unlock();

consume_skb(skb);

- netlink_unlock_table();
-
if (info.delivery_failure) {
kfree_skb(info.skb2);
return -ENOBUFS;
@@ -2071,12 +2022,11 @@ int netlink_set_err(struct sock *ssk, u3
/* sk->sk_err wants a positive error value */
info.code = -code;

- read_lock(&nl_table_lock);
-
- sk_for_each_bound(sk, &nl_table[ssk->sk_protocol].mc_list)
+ rcu_read_lock();
+ sk_for_each_bound_rcu(sk, &nl_table[ssk->sk_protocol].mc_list)
ret += do_one_set_err(sk, &info);
+ rcu_read_unlock();

- read_unlock(&nl_table_lock);
return ret;
}
EXPORT_SYMBOL(netlink_set_err);
--- a/net/netlink/diag.c 2014-08-09 08:39:57.756179454 -0700
+++ b/net/netlink/diag.c 2015-01-03 10:57:31.113791535 -0800
@@ -136,7 +136,7 @@ static int __netlink_diag_dump(struct sk
}
}

- sk_for_each_bound(sk, &tbl->mc_list) {
+ sk_for_each_bound_rcu(sk, &tbl->mc_list) {
if (sk_hashed(sk))
continue;
if (!net_eq(sock_net(sk), net))
@@ -171,7 +171,7 @@ static int netlink_diag_dump(struct sk_b
req = nlmsg_data(cb->nlh);

mutex_lock(&nl_sk_hash_lock);
- read_lock(&nl_table_lock);
+ rcu_read_lock();

if (req->sdiag_protocol == NDIAG_PROTO_ALL) {
int i;
@@ -183,7 +183,7 @@ static int netlink_diag_dump(struct sk_b
}
} else {
if (req->sdiag_protocol >= MAX_LINKS) {
- read_unlock(&nl_table_lock);
+ rcu_read_unlock();
mutex_unlock(&nl_sk_hash_lock);
return -ENOENT;
}
@@ -191,7 +191,7 @@ static int netlink_diag_dump(struct sk_b
__netlink_diag_dump(skb, cb, req->sdiag_protocol, s_num);
}

- read_unlock(&nl_table_lock);
+ rcu_read_unlock();
mutex_unlock(&nl_sk_hash_lock);

return skb->len;

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/