Testers needed for select() support for more than 256fds.

Andi Kleen (andi@maja)
Sat, 04 Jan 1997 22:37:41 +0100


Hi,

I changed select() that it works with with NR_OPEN>256 without
overflowing the kernel patch. With this patch it should be possible
to use 1024 file descriptors per process or more (using the patches
already posted to the kernel list). The new select() works fine
on my machine but I can't stress test it. So I'm looking for testers
before I submit it to Linus.

The patch is against 2.1.19 (should work with 2.0.x too).

Please send success or failure reports to me.

-Andi

--- linux/fs/select.c-2.1.19 Wed Jan 1 19:01:32 1997
+++ linux/fs/select.c Sat Jan 4 22:17:56 1997
@@ -8,6 +8,10 @@
* COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS
* flag set in its personality we do *not* modify the given timeout
* parameter to reflect time remaining.
+ * 4 Jan 1997 Andi Kleen, <andi@mlm.extern.lrz-muenchen.de>
+ * Added support for more than 256 files per process.
+ * Removed all verify_area() calls.
+ *
*/

#include <linux/types.h>
@@ -24,6 +28,7 @@

#include <asm/uaccess.h>
#include <asm/system.h>
+#include <asm/page.h>

#define ROUND_UP(x,y) (((x)+(y)-1)/(y))

@@ -84,32 +89,23 @@
: \
(flag != SEL_EX))

-/*
- * Due to kernel stack usage, we use a _limited_ fd_set type here, and once
- * we really start supporting >256 file descriptors we'll probably have to
- * allocate the kernel fd_set copies dynamically.. (The kernel select routines
- * are careful to touch only the defined low bits of any fd_set pointer, this
- * is important for performance too).
- */
-typedef unsigned long limited_fd_set[NR_OPEN/(8*(sizeof(unsigned long)))];
-
typedef struct {
- limited_fd_set in, out, ex;
- limited_fd_set res_in, res_out, res_ex;
+ unsigned int *inp, *outp, *exp;
+ unsigned int *res_inp, *res_outp, *res_exp;
} fd_set_buffer;

-#define __IN(in) (in)
-#define __OUT(in) (in + sizeof(limited_fd_set)/sizeof(unsigned long))
-#define __EX(in) (in + 2*sizeof(limited_fd_set)/sizeof(unsigned long))
-#define __RES_IN(in) (in + 3*sizeof(limited_fd_set)/sizeof(unsigned long))
-#define __RES_OUT(in) (in + 4*sizeof(limited_fd_set)/sizeof(unsigned long))
-#define __RES_EX(in) (in + 5*sizeof(limited_fd_set)/sizeof(unsigned long))
+#define __IN(fds,n) ((fds)->inp+(n))
+#define __OUT(fds,n) ((fds)->outp+(n))
+#define __EX(fds,n) ((fds)->exp+(n))
+#define __RES_IN(fds,n) ((fds)->res_inp+(n))
+#define __RES_OUT(fds,n) ((fds)->res_outp+(n))
+#define __RES_EX(fds,n) ((fds)->res_exp+(n))

-#define BITS(in) (*__IN(in)|*__OUT(in)|*__EX(in))
+#define BITS(fds,n) (*__IN(fds,n)|*__OUT(fds,n)|*__EX(fds,n))

static int max_select_fd(unsigned long n, fd_set_buffer *fds)
{
- unsigned long *open_fds, *in;
+ unsigned long *open_fds;
unsigned long set;
int max;

@@ -117,10 +113,9 @@
set = ~(~0UL << (n & (__NFDBITS-1)));
n /= __NFDBITS;
open_fds = current->files->open_fds.fds_bits+n;
- in = fds->in+n;
max = 0;
if (set) {
- set &= BITS(in);
+ set &= BITS(fds,n);
if (set) {
if (!(set & ~*open_fds))
goto get_max;
@@ -128,10 +123,9 @@
}
}
while (n) {
- in--;
open_fds--;
n--;
- set = BITS(in);
+ set = BITS(fds,n);
if (!set)
continue;
if (set & ~*open_fds)
@@ -177,23 +171,25 @@
current->state = TASK_INTERRUPTIBLE;
for (i = 0 ; i < n ; i++,fd++) {
unsigned long bit = BIT(i);
+#if 0
unsigned long *in = MEM(i,fds->in);
+#endif
struct file * file = *fd;

if (!file)
continue;
- if (ISSET(bit,__IN(in)) && check(SEL_IN,wait,file)) {
- SET(bit, __RES_IN(in));
+ if (ISSET(bit,__IN(fds,i/__NFDBITS)) && check(SEL_IN,wait,file)) {
+ SET(bit, __RES_IN(fds,i/__NFDBITS));
retval++;
wait = NULL;
}
- if (ISSET(bit,__OUT(in)) && check(SEL_OUT,wait,file)) {
- SET(bit, __RES_OUT(in));
+ if (ISSET(bit,__OUT(fds,i/__NFDBITS)) && check(SEL_OUT,wait,file)) {
+ SET(bit, __RES_OUT(fds,i/__NFDBITS));
retval++;
wait = NULL;
}
- if (ISSET(bit,__EX(in)) && check(SEL_EX,wait,file)) {
- SET(bit, __RES_EX(in));
+ if (ISSET(bit,__EX(fds,i/__NFDBITS)) && check(SEL_EX,wait,file)) {
+ SET(bit, __RES_EX(fds,i/__NFDBITS));
retval++;
wait = NULL;
}
@@ -210,47 +206,45 @@
return retval;
}

-/*
- * We do a VERIFY_WRITE here even though we are only reading this time:
- * we'll write to it eventually..
- *
- * Use "int" accesses to let user-mode fd_set's be int-aligned.
- */
static int __get_fd_set(unsigned long nr, int * fs_pointer, int * fdset)
{
/* round up nr to nearest "int" */
- nr = (nr + 8*sizeof(int)-1) / (8*sizeof(int));
+ nr = (nr + 8*sizeof(int)-1) / 8;
if (fs_pointer) {
- int error = verify_area(VERIFY_WRITE,fs_pointer,nr*sizeof(int));
- if (!error) {
- while (nr) {
- get_user(*fdset, fs_pointer);
- nr--;
- fs_pointer++;
- fdset++;
- }
- }
- return error;
+ if (copy_from_user(fdset, fs_pointer, nr))
+ return -EFAULT;
+ return 0;
+ } else {
+ memset(fdset, 0, nr);
+ return 0;
}
- while (nr) {
- *fdset = 0;
- nr--;
- fdset++;
- }
- return 0;
}

+#if 0
+/* gcc seems to miscompile this. */
static void __set_fd_set(long nr, int * fs_pointer, int * fdset)
{
if (!fs_pointer)
return;
+ /* round up nr to nearest "int" */
+ nr = (nr + 8*sizeof(int)-1) / 8;
+ /* XXX use __copy_to_user */
+ copy_to_user(fs_pointer, fdset, nr);
+}
+#else
+static int __set_fd_set(long nr, int * fs_pointer, int * fdset)
+{
+ if (!fs_pointer)
+ return 0;
while (nr >= 0) {
- put_user(*fdset, fs_pointer);
+ if (put_user(*fdset, fs_pointer)) return 0;
nr -= 8 * sizeof(int);
fdset++;
fs_pointer++;
}
+ return 0;
}
+#endif

/* We can do long accesses here, kernel fdsets are always long-aligned */
static inline void __zero_fd_set(long nr, unsigned long * fdset)
@@ -262,6 +256,46 @@
}
}

+#if (NR_OPEN/8)*6 > PAGE_SIZE
+#error "NR_OPEN is too big."
+#endif
+
+/* XXX use a cached page like namei.c:getname() */
+static inline int alloc_fd_set_buffer(int n, fd_set_buffer *fdb, fd_set *exp)
+{
+ void *m;
+ int sz = (n+(sizeof(long)*8)-1)/8;
+
+ m = (void*)__get_free_page(GFP_KERNEL);
+ if (!m)
+ return -ENOMEM;
+ fdb->inp = m; m+=sz;
+ fdb->outp = m; m+=sz;
+ fdb->res_inp = m; m+=sz;
+ fdb->res_outp = m; m+=sz;
+ fdb->exp = m; m+=sz;
+ fdb->res_exp = m;
+ return 0;
+}
+
+static inline void free_fd_set_buffer(fd_set_buffer *fdb)
+{
+ free_page((unsigned long) fdb->inp);
+}
+
+/* We have to do this with a macro because of alloca(). */
+void *__builtin_alloca(size_t);
+#define alloc_small_fd_set(n,fdb) \
+ ({ int sz = (n + sizeof(int)*8-1)/8; \
+ fdb.inp = __builtin_alloca(sz); \
+ fdb.outp = __builtin_alloca(sz); \
+ fdb.exp = __builtin_alloca(sz); \
+ fdb.res_inp = __builtin_alloca(sz); \
+ fdb.res_outp = __builtin_alloca(sz);\
+ fdb.res_exp = __builtin_alloca(sz); \
+ })
+
+
/*
* Note a few subtleties: we use "long" for the dummy, not int, and we do a
* subtract by 1 on the nr of file descriptors. The former is better for
@@ -278,6 +312,9 @@
#define zero_fd_set(nr,fdp) \
__zero_fd_set((nr)-1, (unsigned long *) (fdp))

+/* Number of fds small enough to fit on the kernel stack. */
+#define SMALL_NR_OPEN 256
+
/*
* We can actually return ERESTARTSYS instead of EINTR, but I'd
* like to be certain this leads to no problems. So I return
@@ -294,53 +331,65 @@

error = -EINVAL;
if (n < 0)
- goto out;
+ goto finalout;
if (n > NR_OPEN)
n = NR_OPEN;
- if ((error = get_fd_set(n, inp, &fds.in)) ||
- (error = get_fd_set(n, outp, &fds.out)) ||
- (error = get_fd_set(n, exp, &fds.ex))) goto out;
+
+ if (n > SMALL_NR_OPEN) {
+ if (alloc_fd_set_buffer(n, &fds, exp))
+ return -ENOMEM;
+ } else {
+ alloc_small_fd_set(n,fds);
+ }
+
+ if ((error = get_fd_set(n, inp, fds.inp)) ||
+ (error = get_fd_set(n, outp, fds.outp)) ||
+ (error = get_fd_set(n, exp, fds.exp))) goto out;
timeout = ~0UL;
if (tvp) {
- error = verify_area(VERIFY_WRITE, tvp, sizeof(*tvp));
- if (error)
- goto out;
- get_user(timeout, &tvp->tv_usec);
+ unsigned long tmp;
+
+ if ((error = get_user(timeout, &tvp->tv_usec)) ||
+ (error = get_user(tmp, &tvp->tv_sec)))
+ goto out;
timeout = ROUND_UP(timeout,(1000000/HZ));
- {
- unsigned long tmp;
- get_user(tmp, &tvp->tv_sec);
- timeout += tmp * (unsigned long) HZ;
- }
+ timeout += tmp * (unsigned long) HZ;
if (timeout)
timeout += jiffies + 1;
}
- zero_fd_set(n, &fds.res_in);
- zero_fd_set(n, &fds.res_out);
- zero_fd_set(n, &fds.res_ex);
+ zero_fd_set(n, fds.res_inp);
+ zero_fd_set(n, fds.res_outp);
+ zero_fd_set(n, fds.res_exp);
current->timeout = timeout;
error = do_select(n, &fds);
timeout = current->timeout - jiffies - 1;
current->timeout = 0;
if ((long) timeout < 0)
timeout = 0;
+ if (error < 0)
+ goto out;
if (tvp && !(current->personality & STICKY_TIMEOUTS)) {
- put_user(timeout/HZ, &tvp->tv_sec);
- timeout %= HZ;
- timeout *= (1000000/HZ);
- put_user(timeout, &tvp->tv_usec);
+ if (put_user(timeout/HZ, &tvp->tv_sec) ||
+ put_user((timeout%HZ)*(1000000/HZ), &tvp->tv_usec)) {
+ error = -EFAULT;
+ goto out;
+ }
}
- if (error < 0)
- goto out;
if (!error) {
error = -ERESTARTNOHAND;
if (current->signal & ~current->blocked)
goto out;
error = 0;
}
- set_fd_set(n, inp, &fds.res_in);
- set_fd_set(n, outp, &fds.res_out);
- set_fd_set(n, exp, &fds.res_ex);
+
+ if (set_fd_set(n, inp, fds.res_inp) ||
+ set_fd_set(n, outp, fds.res_outp) ||
+ set_fd_set(n, exp, fds.res_exp)) {
+ error = -EFAULT;
+ }
out:
+ if (n > SMALL_NR_OPEN)
+ free_fd_set_buffer(&fds);
+finalout:
return error;
}

--
|andi@mlm.extern.lrz-muenchen.de     Nonsense is better than no sense at all.
|                                        -NoMeansNo,0-1=2