Re: Longstanding networking / SMP issue? (duplextest)

From: David S. Miller (davem@redhat.com)
Date: Sun Feb 23 2003 - 04:12:17 EST


   From: Andi Kleen <ak@suse.de>
   Date: Fri, 21 Feb 2003 11:45:41 +0100
   
   at least ipip.c will send icmps from process context (ipip_tunnel_xmit)
   
   netfilter will probably do too.

We can easily make them all run in BH (protected) context :-)

This makes the locking much simpler. Add to this the per-cpu socket
idea I proposed and we arrive at the patch below (against 2.5.x,
easily ported to 2.4.x just remove the cpu_possible() stuff).

Comments?

--- net/ipv4/icmp.c.~1~ Sat Feb 22 23:41:53 2003
+++ net/ipv4/icmp.c Sun Feb 23 01:25:43 2003
@@ -223,57 +223,29 @@
 static struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];
 
 /*
- * The ICMP socket. This is the most convenient way to flow control
+ * The ICMP socket(s). This is the most convenient way to flow control
  * our ICMP output as well as maintain a clean interface throughout
  * all layers. All Socketless IP sends will soon be gone.
+ *
+ * On SMP we have one ICMP socket per-cpu.
  */
-struct socket *icmp_socket;
-
-/* ICMPv4 socket is only a bit non-reenterable (unlike ICMPv6,
- which is strongly non-reenterable). A bit later it will be made
- reenterable and the lock may be removed then.
- */
-
-static int icmp_xmit_holder = -1;
-
-static int icmp_xmit_lock_bh(void)
-{
- int rc;
- if (!spin_trylock(&icmp_socket->sk->lock.slock)) {
- rc = -EAGAIN;
- if (icmp_xmit_holder == smp_processor_id())
- goto out;
- spin_lock(&icmp_socket->sk->lock.slock);
- }
- rc = 0;
- icmp_xmit_holder = smp_processor_id();
-out:
- return rc;
-}
+static struct socket *__icmp_socket[NR_CPUS];
+#define icmp_socket __icmp_socket[smp_processor_id()]
 
-static __inline__ int icmp_xmit_lock(void)
+static __inline__ void icmp_xmit_lock(void)
 {
- int ret;
         local_bh_disable();
- ret = icmp_xmit_lock_bh();
- if (ret)
- local_bh_enable();
- return ret;
-}
 
-static void icmp_xmit_unlock_bh(void)
-{
- icmp_xmit_holder = -1;
- spin_unlock(&icmp_socket->sk->lock.slock);
+ if (!spin_trylock(&icmp_socket->sk->lock.slock))
+ BUG();
 }
 
-static __inline__ void icmp_xmit_unlock(void)
+static void icmp_xmit_unlock(void)
 {
- icmp_xmit_unlock_bh();
+ spin_unlock(&icmp_socket->sk->lock.slock);
         local_bh_enable();
 }
 
-
 /*
  * Send an ICMP frame.
  */
@@ -404,10 +376,11 @@
         struct rtable *rt = (struct rtable *)skb->dst;
         u32 daddr;
 
- if (ip_options_echo(&icmp_param->replyopts, skb) ||
- icmp_xmit_lock_bh())
+ if (ip_options_echo(&icmp_param->replyopts, skb))
                 goto out;
 
+ icmp_xmit_lock();
+
         icmp_param->data.icmph.checksum = 0;
         icmp_out_count(icmp_param->data.icmph.type);
 
@@ -434,7 +407,7 @@
                 icmp_push_reply(icmp_param, &ipc, rt);
         ip_rt_put(rt);
 out_unlock:
- icmp_xmit_unlock_bh();
+ icmp_xmit_unlock();
 out:;
 }
 
@@ -519,8 +492,7 @@
                 }
         }
 
- if (icmp_xmit_lock())
- goto out;
+ icmp_xmit_lock();
 
         /*
          * Construct source address and options.
@@ -1141,19 +1113,30 @@
 void __init icmp_init(struct net_proto_family *ops)
 {
         struct inet_opt *inet;
- int err = sock_create(PF_INET, SOCK_RAW, IPPROTO_ICMP, &icmp_socket);
+ int i;
 
- if (err < 0)
- panic("Failed to create the ICMP control socket.\n");
- icmp_socket->sk->allocation = GFP_ATOMIC;
- icmp_socket->sk->sndbuf = SK_WMEM_MAX * 2;
- inet = inet_sk(icmp_socket->sk);
- inet->ttl = MAXTTL;
- inet->pmtudisc = IP_PMTUDISC_DONT;
-
- /* Unhash it so that IP input processing does not even
- * see it, we do not wish this socket to see incoming
- * packets.
- */
- icmp_socket->sk->prot->unhash(icmp_socket->sk);
+ for (i = 0; i < NR_CPUS; i++) {
+ int err;
+
+ if (!cpu_possible(i))
+ continue;
+
+ err = sock_create(PF_INET, SOCK_RAW, IPPROTO_ICMP,
+ &__icmp_socket[i]);
+
+ if (err < 0)
+ panic("Failed to create the ICMP control socket.\n");
+
+ __icmp_socket[i]->sk->allocation = GFP_ATOMIC;
+ __icmp_socket[i]->sk->sndbuf = SK_WMEM_MAX * 2;
+ inet = inet_sk(__icmp_socket[i]->sk);
+ inet->ttl = MAXTTL;
+ inet->pmtudisc = IP_PMTUDISC_DONT;
+
+ /* Unhash it so that IP input processing does not even
+ * see it, we do not wish this socket to see incoming
+ * packets.
+ */
+ __icmp_socket[i]->sk->prot->unhash(__icmp_socket[i]->sk);
+ }
 }
--- net/ipv6/icmp.c.~1~ Sat Feb 22 23:48:49 2003
+++ net/ipv6/icmp.c Sun Feb 23 01:27:09 2003
@@ -67,10 +67,11 @@
 DEFINE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics);
 
 /*
- * ICMP socket for flow control.
+ * ICMP socket(s) for flow control.
  */
 
-struct socket *icmpv6_socket;
+static struct socket *__icmpv6_socket[NR_CPUS];
+#define icmpv6_socket __icmpv6_socket[smp_processor_id()]
 
 static int icmpv6_rcv(struct sk_buff *skb);
 
@@ -87,39 +88,16 @@
         __u32 csum;
 };
 
-
-static int icmpv6_xmit_holder = -1;
-
-static int icmpv6_xmit_lock_bh(void)
+static __inline__ void icmpv6_xmit_lock(void)
 {
- if (!spin_trylock(&icmpv6_socket->sk->lock.slock)) {
- if (icmpv6_xmit_holder == smp_processor_id())
- return -EAGAIN;
- spin_lock(&icmpv6_socket->sk->lock.slock);
- }
- icmpv6_xmit_holder = smp_processor_id();
- return 0;
-}
-
-static __inline__ int icmpv6_xmit_lock(void)
-{
- int ret;
         local_bh_disable();
- ret = icmpv6_xmit_lock_bh();
- if (ret)
- local_bh_enable();
- return ret;
-}
-
-static void icmpv6_xmit_unlock_bh(void)
-{
- icmpv6_xmit_holder = -1;
- spin_unlock(&icmpv6_socket->sk->lock.slock);
+ if (!spin_trylock(&icmpv6_socket->sk->lock.slock))
+ BUG();
 }
 
 static __inline__ void icmpv6_xmit_unlock(void)
 {
- icmpv6_xmit_unlock_bh();
+ spin_unlock(&icmpv6_socket->sk->lock.slock);
         local_bh_enable();
 }
 
@@ -341,8 +319,7 @@
         fl.uli_u.icmpt.type = type;
         fl.uli_u.icmpt.code = code;
 
- if (icmpv6_xmit_lock())
- return;
+ icmpv6_xmit_lock();
 
         if (!icmpv6_xrlim_allow(sk, type, &fl))
                 goto out;
@@ -415,15 +392,14 @@
         fl.uli_u.icmpt.type = ICMPV6_ECHO_REPLY;
         fl.uli_u.icmpt.code = 0;
 
- if (icmpv6_xmit_lock_bh())
- return;
+ icmpv6_xmit_lock();
 
         ip6_build_xmit(sk, icmpv6_getfrag, &msg, &fl, msg.len, NULL, -1,
                        MSG_DONTWAIT);
         ICMP6_INC_STATS_BH(Icmp6OutEchoReplies);
         ICMP6_INC_STATS_BH(Icmp6OutMsgs);
 
- icmpv6_xmit_unlock_bh();
+ icmpv6_xmit_unlock();
 }
 
 static void icmpv6_notify(struct sk_buff *skb, int type, int code, u32 info)
@@ -626,26 +602,47 @@
 int __init icmpv6_init(struct net_proto_family *ops)
 {
         struct sock *sk;
- int err;
+ int i;
+
+ for (i = 0; i < NR_CPUS; i++) {
+ int err;
+
+ if (!cpu_possible(i))
+ continue;
 
- err = sock_create(PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, &icmpv6_socket);
- if (err < 0) {
- printk(KERN_ERR
- "Failed to initialize the ICMP6 control socket (err %d).\n",
- err);
- icmpv6_socket = NULL; /* for safety */
- return err;
+ err = sock_create(PF_INET6, SOCK_RAW, IPPROTO_ICMPV6,
+ &__icmpv6_socket[i]);
+ if (err < 0) {
+ int j;
+
+ printk(KERN_ERR
+ "Failed to initialize the ICMP6 control socket "
+ "(err %d).\n",
+ err);
+ for (j = 0; j < i; j++) {
+ if (!cpu_possible(j))
+ continue;
+ sock_release(__icmpv6_socket[j]);
+ __icmpv6_socket[j] = NULL; /* for safety */
+ }
+ return err;
+ }
+
+ sk = __icmpv6_socket[i]->sk;
+ sk->allocation = GFP_ATOMIC;
+ sk->sndbuf = SK_WMEM_MAX*2;
+ sk->prot->unhash(sk);
         }
 
- sk = icmpv6_socket->sk;
- sk->allocation = GFP_ATOMIC;
- sk->sndbuf = SK_WMEM_MAX*2;
- sk->prot->unhash(sk);
 
         if (inet6_add_protocol(&icmpv6_protocol, IPPROTO_ICMPV6) < 0) {
                 printk(KERN_ERR "Failed to register ICMP6 protocol\n");
- sock_release(icmpv6_socket);
- icmpv6_socket = NULL;
+ for (i = 0; i < NR_CPUS; i++) {
+ if (!cpu_possible(i))
+ continue;
+ sock_release(__icmpv6_socket[i]);
+ __icmpv6_socket[i] = NULL;
+ }
                 return -EAGAIN;
         }
 
@@ -654,8 +651,14 @@
 
 void icmpv6_cleanup(void)
 {
- sock_release(icmpv6_socket);
- icmpv6_socket = NULL; /* For safety. */
+ int i;
+
+ for (i = 0; i < NR_CPUS; i++) {
+ if (!cpu_possible(i))
+ continue;
+ sock_release(__icmpv6_socket[i]);
+ __icmpv6_socket[i] = NULL; /* For safety. */
+ }
         inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6);
 }
 
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/



This archive was generated by hypermail 2b29 : Sun Feb 23 2003 - 22:00:37 EST