Re: [patch] af_unix fix for a panic a DoS and a memory leak [Re:

Andrea Arcangeli (andrea@e-mind.com)
Tue, 2 Mar 1999 21:58:46 +0100 (CET)


On Tue, 2 Mar 1999, Andrea Arcangeli wrote:

>I think the cleaner thing to do is to only limit the number of sk checking
>the backlog as I did, plus taking care of SOMAXCONN. My apps are all
>working fine with this new behavior.

I implemented the last security thing and rediffed everything against
clean 2.2.2. Checking SOMAXCONN wasn't enough for security because that
way I could been able to allocate as normal user something like
NR_TASKS*NR_OPEN*SOMAXCONN socks and that's a bit more than 2*max_files ;).

Index: net/unix//af_unix.c
===================================================================
RCS file: /var/cvs/linux/net/unix/af_unix.c,v
retrieving revision 1.1.1.2
retrieving revision 1.1.2.4
diff -u -r1.1.1.2 -r1.1.2.4
--- af_unix.c 1999/01/18 13:41:22 1.1.1.2
+++ linux/net/unix/af_unix.c 1999/03/02 20:45:24 1.1.2.4
@@ -33,6 +33,10 @@
* Lots of bug fixes.
* Alexey Kuznetosv : Repaired (I hope) bugs introduces
* by above two patches.
+ * Andrea Arcangeli : Check max backlog of the listen sock
+ * at connect time and security fix
+ * that limits the max number of socks
+ * to 2*max_files.
*
* Known differences from reference BSD that was tested:
*
@@ -102,6 +106,8 @@
int sysctl_unix_destroy_delay = 10*HZ;

unix_socket *unix_socket_table[UNIX_HASH_SIZE+1];
+int unix_nr_socks = 0;
+spinlock_t unix_nr_lock = SPIN_LOCK_UNLOCKED;

#define unix_sockets_unbound (unix_socket_table[UNIX_HASH_SIZE])

@@ -263,6 +269,10 @@
unix_socket *sk=(unix_socket *)data;
if(!unix_locked(sk) && atomic_read(&sk->wmem_alloc) == 0)
{
+ spin_lock_irq(&unix_nr_lock);
+ --unix_nr_socks;
+ spin_unlock_irq(&unix_nr_lock);
+
sk_free(sk);

/* socket destroyed, decrement count */
@@ -347,6 +357,10 @@

if(!unix_locked(sk) && atomic_read(&sk->wmem_alloc) == 0)
{
+ spin_lock_irq(&unix_nr_lock);
+ --unix_nr_socks;
+ spin_unlock_irq(&unix_nr_lock);
+
sk_free(sk);

/* socket destroyed, decrement count */
@@ -371,6 +385,8 @@
return -EOPNOTSUPP; /* Only stream sockets accept */
if (!sk->protinfo.af_unix.addr)
return -EINVAL; /* No listens on an unbound socket */
+ if ((unsigned) backlog > SOMAXCONN)
+ backlog = SOMAXCONN;
sk->max_ack_backlog=backlog;
sk->state=TCP_LISTEN;
sock->flags |= SO_ACCEPTCON;
@@ -388,10 +404,22 @@
{
struct sock *sk;

+ spin_lock_irq(&unix_nr_lock);
+ if (unix_nr_socks++ > 2*max_files)
+ {
+ spin_unlock_irq(&unix_nr_lock);
+ goto too_many;
+ }
+ spin_unlock_irq(&unix_nr_lock);
+
MOD_INC_USE_COUNT;
sk = sk_alloc(PF_UNIX, GFP_KERNEL, 1);
if (!sk) {
MOD_DEC_USE_COUNT;
+ too_many:
+ spin_lock_irq(&unix_nr_lock);
+ --unix_nr_socks;
+ spin_unlock_irq(&unix_nr_lock);
return NULL;
}

@@ -707,6 +735,10 @@
if (other == NULL || other->dead || other->state != TCP_LISTEN)
goto out;

+ /* Fail if the listen backlog is just full. -arca */
+ if (other->ack_backlog >= other->max_ack_backlog)
+ goto out;
+
err = -ENOMEM;
if (newsk == NULL || skb == NULL)
goto out;
@@ -817,9 +849,7 @@
tsk = skb->sk;
sk->ack_backlog--;
kfree_skb(skb);
- if (!tsk->dead)
- break;
- unix_release_sock(tsk);
+ break;
}


Index: net/unix//garbage.c
===================================================================
RCS file: /var/cvs/linux/net/unix/garbage.c,v
retrieving revision 1.1.1.1
retrieving revision 1.1.2.3
diff -u -r1.1.1.1 -r1.1.2.3
--- garbage.c 1999/01/18 01:27:46 1.1.1.1
+++ linux/net/unix/garbage.c 1999/03/01 13:11:33 1.1.2.3
@@ -17,11 +17,12 @@
*
* - explicit stack instead of recursion
* - tail recurse on first born instead of immediate push/pop
+ * - we gather the stuff that should not be killed into list
+ * and stack is just a path from root to the current pointer.
*
* Future optimizations:
*
* - don't just push entire root set; process in place
- * - use linked list for internal stack
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -51,6 +52,14 @@
* That might be the reason of random oopses on close_fp() in
* unrelated processes.
*
+ * AV 28 Feb 1999
+ * Kill the explicit allocation of stack. Now we keep the marked
+ * socks in the list with root in gc_current to one of the nodes.
+ * Stack is represented as path from gc_current to the last elem.
+ * Unmark now means "add to list". Push == "make it a son of
+ * gc_current". Pop == "move gc_current to parent". We keep only
+ * pointers to parents (->gc_next).
+ *
*/

#include <linux/kernel.h>
@@ -65,7 +74,6 @@
#include <linux/netdevice.h>
#include <linux/file.h>
#include <linux/proc_fs.h>
-#include <linux/vmalloc.h>

#include <net/sock.h>
#include <net/tcp.h>
@@ -74,9 +82,8 @@

/* Internal data structures and random procedures: */

-static unix_socket **stack; /* stack of objects to mark */
-static int in_stack = 0; /* first free entry in stack */
-static int max_stack; /* Top of stack */
+#define GC_HEAD ((unix_socket *) -1)
+static unix_socket *gc_current=GC_HEAD; /* stack of objects to mark */

extern inline unix_socket *unix_get_socket(struct file *filp)
{
@@ -122,32 +129,25 @@
/*
* Garbage Collector Support Functions
*/
-
-extern inline void push_stack(unix_socket *x)
-{
- if (in_stack == max_stack)
- panic("can't push onto full stack");
- stack[in_stack++] = x;
-}

extern inline unix_socket *pop_stack(void)
{
- if (in_stack == 0)
- panic("can't pop empty gc stack");
- return stack[--in_stack];
+ unix_socket *p=gc_current;
+ gc_current = p->protinfo.af_unix.gc_next;
+ return p;
}

extern inline int empty_stack(void)
{
- return in_stack == 0;
+ return gc_current == GC_HEAD;
}

extern inline void maybe_unmark_and_push(unix_socket *x)
{
- if (!(x->protinfo.af_unix.marksweep&MARKED))
+ if (x->protinfo.af_unix.gc_next)
return;
- x->protinfo.af_unix.marksweep&=~MARKED;
- push_stack(x);
+ x->protinfo.af_unix.gc_next = gc_current;
+ gc_current = x;
}


@@ -169,23 +169,9 @@
return;
in_unix_gc=1;

- if(stack==NULL || max_files>max_stack)
- {
- if(stack)
- vfree(stack);
- stack=(unix_socket **)vmalloc(max_files*sizeof(struct unix_socket *));
- if(stack==NULL)
- {
- printk(KERN_NOTICE "unix_gc: deferred due to low memory.\n");
- in_unix_gc=0;
- return;
- }
- max_stack=max_files;
- }
-
forall_unix_sockets(i, s)
{
- s->protinfo.af_unix.marksweep|=MARKED;
+ s->protinfo.af_unix.gc_next=NULL;
}
/*
* Everything is now marked
@@ -276,9 +262,9 @@

if (f)
{
- if ((f->protinfo.af_unix.marksweep&MARKED))
+ if (!f->protinfo.af_unix.gc_next)
{
- f->protinfo.af_unix.marksweep&=~MARKED;
+ f->protinfo.af_unix.gc_next=GC_HEAD;
x=f;
f=NULL;
goto tail;
@@ -290,7 +276,7 @@

forall_unix_sockets(i, s)
{
- if (s->protinfo.af_unix.marksweep&MARKED)
+ if (!s->protinfo.af_unix.gc_next)
{
struct sk_buff *nextsk;
skb=skb_peek(&s->receive_queue);
Index: include/net/sock.h
===================================================================
RCS file: /var/cvs/linux/include/net/sock.h,v
retrieving revision 1.1.1.3
retrieving revision 1.1.2.6
diff -u -r1.1.1.3 -r1.1.2.6
--- sock.h 1999/02/20 15:42:53 1.1.1.3
+++ linux/include/net/sock.h 1999/03/01 13:11:34 1.1.2.6
@@ -99,8 +99,7 @@
struct semaphore readsem;
struct sock * other;
struct sock ** list;
- int marksweep;
-#define MARKED 1
+ struct sock * gc_next;
int inflight;
};

I also checked that the high-hard limit of sock, the high limit of backlog
and the backlog work.

If my backlog check is breaking some app (which?) we can #if 0 it of
course, but I see it as a feature for decrease the load of the machine. I
cheked that for ex mysql uses listen(sk, 5). Now it really allow only 5
pending not-yet-accepted-connections. I think the reason Tomasz seen a
load decrease from 5-15 to 2-5 is just that now there are at max 5 pending
socks in the unix domain socket hash (and so the garbage collector is much
faster).

Andrea Arcangeli

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu
Please read the FAQ at http://www.tux.org/lkml/