Another select() patch (support for >256fds per process)

Andi Kleen (andi@mlm.extern.lrz-muenchen.de)
Mon, 27 Jan 1997 16:48:05 +0100


Hi,

This patch removes the limit of 256 fds per process that was imposed by
the select() kernel call. With it, it should be possible to compile
the kernel with a big NR_OPEN (but see the table I added for good
values). The patch is against 2.1.23.

Please test this patch out so that I can send it to Linus.

-Andi

--- linux-clean/fs/select.c Sun Jan 26 20:09:42 1997
+++ linux-wrk-2.1.23/fs/select.c Sun Jan 26 20:18:34 1997
@@ -8,6 +8,10 @@
* COFF/ELF binary emulation. If the process has the STICKY_TIMEOUTS
* flag set in its personality we do *not* modify the given timeout
* parameter to reflect time remaining.
+ * 26 January 1997
+ * Removed all fd_set size limits from sys_select().
+ * Removed all verify_area() calls.
+ * - Andi Kleen <andi@mlm.extern.lrz-muenchen.de>
*/

#include <linux/types.h>
@@ -24,6 +28,7 @@
#include <linux/malloc.h>
#include <linux/smp.h>
#include <linux/smp_lock.h>
+#include <linux/malloc.h>

#include <asm/uaccess.h>
#include <asm/system.h>
@@ -32,6 +37,9 @@
#define ROUND_UP(x,y) (((x)+(y)-1)/(y))
#define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)

+#define CUR_NR_OPEN NR_OPEN
+#define SMALL_NR_OPEN 256
+
/*
* Ok, Peter made a complicated, but straightforward multiple_wait() function.
* I have rewritten this, taking some shortcuts: This code may not be easy to
@@ -61,30 +69,21 @@
}
}

-/*
- * Due to kernel stack usage, we use a _limited_ fd_set type here, and once
- * we really start supporting >256 file descriptors we'll probably have to
- * allocate the kernel fd_set copies dynamically.. (The kernel select routines
- * are careful to touch only the defined low bits of any fd_set pointer, this
- * is important for performance too).
- */
-typedef unsigned long limited_fd_set[NR_OPEN/(8*(sizeof(unsigned long)))];
+typedef unsigned long * fds_ptr;
+
+/* assumes sizeof(type) is a power of 2 */
+#define roundbit(n, type) (((n) + sizeof(type)*8 - 1) & ~(sizeof(type)*8-1))

-typedef struct {
- limited_fd_set in, out, ex;
- limited_fd_set res_in, res_out, res_ex;
-} fd_set_buffer;
-
-#define __IN(in) (in)
-#define __OUT(in) (in + sizeof(limited_fd_set)/sizeof(unsigned long))
-#define __EX(in) (in + 2*sizeof(limited_fd_set)/sizeof(unsigned long))
-#define __RES_IN(in) (in + 3*sizeof(limited_fd_set)/sizeof(unsigned long))
-#define __RES_OUT(in) (in + 4*sizeof(limited_fd_set)/sizeof(unsigned long))
-#define __RES_EX(in) (in + 5*sizeof(limited_fd_set)/sizeof(unsigned long))
+#define __IN(fds,sz) (fds)
+#define __OUT(fds,sz) ((unsigned long *) ((char *)(fds) + sz))
+#define __EX(fds,sz) ((unsigned long *) ((char *)(fds) + 2*sz))
+#define __RES_IN(fds,sz) ((unsigned long *) ((char *)(fds) + 3*sz))
+#define __RES_OUT(fds,sz) ((unsigned long *) ((char *)(fds) + 4*sz))
+#define __RES_EX(fds,sz) ((unsigned long *) ((char *)(fds) + 5*sz))

-#define BITS(in) (*__IN(in)|*__OUT(in)|*__EX(in))
+#define BITS(fds,sz) (*__IN(fds,sz)|*__OUT(fds,sz)|*__EX(fds,sz))

-static int max_select_fd(unsigned long n, fd_set_buffer *fds)
+static int max_select_fd(int sz,unsigned long n,fds_ptr fds)
{
unsigned long *open_fds, *in;
unsigned long set;
@@ -94,10 +93,10 @@
set = ~(~0UL << (n & (__NFDBITS-1)));
n /= __NFDBITS;
open_fds = current->files->open_fds.fds_bits+n;
- in = fds->in+n;
+ in = fds+n;
max = 0;
if (set) {
- set &= BITS(in);
+ set &= BITS(in, sz);
if (set) {
if (!(set & ~*open_fds))
goto get_max;
@@ -108,7 +107,7 @@
in--;
open_fds--;
n--;
- set = BITS(in);
+ set = BITS(in,sz);
if (!set)
continue;
if (set & ~*open_fds)
@@ -135,14 +134,14 @@
#define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR)
#define POLLEX_SET (POLLPRI)

-static int do_select(int n, fd_set_buffer *fds)
+static int do_select(int sz, int n, fds_ptr fds)
{
int retval;
poll_table wait_table, *wait;
struct poll_table_entry *entry;
int i;

- retval = max_select_fd(n, fds);
+ retval = max_select_fd(sz, n, fds);
if (retval < 0)
goto out;
n = retval;
@@ -159,9 +158,9 @@
current->state = TASK_INTERRUPTIBLE;
for (i = 0 ; i < n ; i++,fd++) {
unsigned long bit = BIT(i);
- unsigned long *in = MEM(i,fds->in);
+ unsigned long *in = MEM(i,fds);

- if (bit & BITS(in)) {
+ if (bit & BITS(in, sz)) {
struct file * file = *fd;
unsigned int mask = POLLNVAL;
if (file) {
@@ -169,18 +168,18 @@
if (file->f_op && file->f_op->poll)
mask = file->f_op->poll(file, wait);
}
- if ((mask & POLLIN_SET) && ISSET(bit, __IN(in))) {
- SET(bit, __RES_IN(in));
+ if ((mask & POLLIN_SET) && ISSET(bit, __IN(in,sz))) {
+ SET(bit, __RES_IN(in,sz));
retval++;
wait = NULL;
}
- if ((mask & POLLOUT_SET) && ISSET(bit, __OUT(in))) {
- SET(bit, __RES_OUT(in));
+ if ((mask & POLLOUT_SET) && ISSET(bit, __OUT(in,sz))) {
+ SET(bit, __RES_OUT(in,sz));
retval++;
wait = NULL;
}
- if ((mask & POLLEX_SET) && ISSET(bit, __EX(in))) {
- SET(bit, __RES_EX(in));
+ if ((mask & POLLEX_SET) && ISSET(bit, __EX(in,sz))) {
+ SET(bit, __RES_EX(in,sz));
retval++;
wait = NULL;
}
@@ -199,9 +198,6 @@
}

/*
- * We do a VERIFY_WRITE here even though we are only reading this time:
- * we'll write to it eventually..
- *
* Use "int" accesses to let user-mode fd_set's be int-aligned.
*/
static int __get_fd_set(unsigned long nr, int * fs_pointer, int * fdset)
@@ -209,16 +205,14 @@
/* round up nr to nearest "int" */
nr = (nr + 8*sizeof(int)-1) / (8*sizeof(int));
if (fs_pointer) {
- int error = verify_area(VERIFY_WRITE,fs_pointer,nr*sizeof(int));
- if (!error) {
- while (nr) {
- get_user(*fdset, fs_pointer);
- nr--;
- fs_pointer++;
- fdset++;
- }
+ while (nr) {
+ if (get_user(*fdset, fs_pointer))
+ return -EFAULT;
+ nr--;
+ fs_pointer++;
+ fdset++;
}
- return error;
+ return 0;
}
while (nr) {
*fdset = 0;
@@ -228,16 +222,20 @@
return 0;
}

-static void __set_fd_set(long nr, int * fs_pointer, int * fdset)
+static int __set_fd_set(long nr, int * fs_pointer, int * fdset)
{
+ int err = 0;
if (!fs_pointer)
- return;
+ return 0;
while (nr >= 0) {
- put_user(*fdset, fs_pointer);
+ err = __put_user(*fdset, fs_pointer);
+ if (err)
+ break;
nr -= 8 * sizeof(int);
fdset++;
fs_pointer++;
}
+ return err;
}

/* We can do long accesses here, kernel fdsets are always long-aligned */
@@ -277,63 +275,83 @@
asmlinkage int sys_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, struct timeval *tvp)
{
int error = -EINVAL;
- fd_set_buffer fds;
+ fds_ptr fds;
unsigned long timeout;
+ unsigned int sz;

lock_kernel();
if (n < 0)
- goto out;
- if (n > NR_OPEN)
- n = NR_OPEN;
- if ((error = get_fd_set(n, inp, &fds.in)) ||
- (error = get_fd_set(n, outp, &fds.out)) ||
- (error = get_fd_set(n, exp, &fds.ex))) goto out;
+ goto finalout;
+ if (n > CUR_NR_OPEN)
+ n = CUR_NR_OPEN; /* why not return -EINVAL here? */
+
+ sz = roundbit(n, unsigned long)/8;
+ if (n <= SMALL_NR_OPEN)
+ fds = __builtin_alloca(6*sz);
+ else {
+ fds = kmalloc(6*sz, GFP_KERNEL);
+ if (!fds)
+ goto finalout;
+ }
+
+ if ((error = get_fd_set(n, inp, __IN(fds,sz))) ||
+ (error = get_fd_set(n, outp, __OUT(fds,sz))) ||
+ (error = get_fd_set(n, exp, __EX(fds,sz)))) goto out;
+
timeout = ~0UL;
if (tvp) {
- error = verify_area(VERIFY_WRITE, tvp, sizeof(*tvp));
- if (error)
- goto out;
- get_user(timeout, &tvp->tv_usec);
+ unsigned long tmp;
+
+ if ((error = get_user(timeout, &tvp->tv_usec)) ||
+ (error = get_user(tmp, &tvp->tv_sec)))
+ goto out;
timeout = ROUND_UP(timeout,(1000000/HZ));
- {
- unsigned long tmp;
- get_user(tmp, &tvp->tv_sec);
- timeout += tmp * (unsigned long) HZ;
- }
+ timeout += tmp * (unsigned long) HZ;
+ /* XXX Note that timeout might still be too
+ small because of rounding errors to 1000000/HZ.
+ I'm not sure about the right fix though. */
if (timeout)
timeout += jiffies + 1;
}
- zero_fd_set(n, &fds.res_in);
- zero_fd_set(n, &fds.res_out);
- zero_fd_set(n, &fds.res_ex);
+ zero_fd_set(n, __RES_IN(fds,sz));
+ zero_fd_set(n, __RES_OUT(fds,sz));
+ zero_fd_set(n, __RES_EX(fds,sz));
current->timeout = timeout;
- error = do_select(n, &fds);
+ error = do_select(sz, n, fds);
timeout = current->timeout - jiffies - 1;
current->timeout = 0;
if ((long) timeout < 0)
timeout = 0;
if (tvp && !(current->personality & STICKY_TIMEOUTS)) {
- put_user(timeout/HZ, &tvp->tv_sec);
- timeout %= HZ;
- timeout *= (1000000/HZ);
- put_user(timeout, &tvp->tv_usec);
+ if (__put_user(timeout/HZ, &tvp->tv_sec) ||
+ __put_user((timeout%HZ)*(1000000/HZ), &tvp->tv_usec)) {
+ error = -EFAULT;
+ goto out;
+ }
}
- if (error < 0)
- goto out;
+ if (error < 0)
+ goto out;
if (!error) {
error = -ERESTARTNOHAND;
if (current->signal & ~current->blocked)
goto out;
error = 0;
}
- set_fd_set(n, inp, &fds.res_in);
- set_fd_set(n, outp, &fds.res_out);
- set_fd_set(n, exp, &fds.res_ex);
+
+ if (set_fd_set(n, inp, __RES_IN(fds,sz)) ||
+ set_fd_set(n, outp, __RES_OUT(fds,sz)) ||
+ set_fd_set(n, exp, __RES_EX(fds,sz))) {
+ error = -EFAULT;
+ }
out:
- unlock_kernel();
+ if (n > SMALL_NR_OPEN)
+ kfree(fds);
+finalout:
+ unlock_kernel();
return error;
}

+
static int do_poll(unsigned int nfds, struct pollfd *fds, poll_table *wait)
{
int count;
@@ -376,13 +394,13 @@

asmlinkage int sys_poll(struct pollfd * ufds, unsigned int nfds, int timeout)
{
- int i, count, fdcount, err = -EINVAL;
+ int i, count, fdcount, err = -EINVAL;
struct pollfd * fds, *fds1;
poll_table wait_table;
struct poll_table_entry *entry;

lock_kernel();
- if (nfds > NR_OPEN)
+ if (nfds > CUR_NR_OPEN)
goto out;

err = -ENOMEM;
--- linux-clean/include/linux/limits.h Sat Nov 9 18:32:22 1996
+++ linux-wrk-2.1.23/include/linux/limits.h Sun Jan 26 20:30:51 1997
@@ -1,12 +1,12 @@
#ifndef _LINUX_LIMITS_H
#define _LINUX_LIMITS_H

-#define NR_OPEN 256
+#define NR_OPEN 959

#define NGROUPS_MAX 32 /* supplemental group IDs are available */
#define ARG_MAX 131072 /* # bytes of args + environ for exec() */
#define CHILD_MAX 999 /* no limit :-) */
-#define OPEN_MAX 256 /* # open files a process may have */
+#define OPEN_MAX 959 /* # open files a process may have */
#define LINK_MAX 127 /* # links a file may have */
#define MAX_CANON 255 /* size of the canonical input queue */
#define MAX_INPUT 255 /* size of the type-ahead buffer */
--- linux-clean/include/linux/fs.h Sun Jan 26 20:09:47 1997
+++ linux-wrk-2.1.23/include/linux/fs.h Sun Jan 26 20:50:27 1997
@@ -25,9 +25,30 @@
* recompiled to take full advantage of the new limits..
*/

+/* Good values for NR_OPEN with the current kmalloc() (that always
+ rounds the size to the next power of 2). The header of struct
+ files_struct is 2*sizeof(fd_set)+sizeof(int).
+
+ allocated size 2K 4K 8K 16K 32K
+ -------------------------------------------------------
+ 32bit architecture
+ NR_OPEN 383 959 1919 3839 7935
+ fd_set size 128 128 256 512 512
+ -------------------------------------------------------
+ 64bit architecture
+ NR_OPEN 223 479 991 1983 3967
+ fd_set size 128 128 128 256 512
+
+ When we use slab_cache_alloc() to allocate struct files_struct
+ at fork time the optimal NR_OPEN value is different I think.
+
+ Note that when you change the fd_set size you have to change
+ posix_limits.h, the libc include files and recompile the applications.
+ */
+
/* Fixed constants first: */
#undef NR_OPEN
-#define NR_OPEN 256
+#define NR_OPEN 959

#define NR_SUPER 64
#define BLOCK_SIZE 1024

--
|andi@mlm.extern.lrz-muenchen.de     Nonsense is better than no sense at all.
|                                        -NoMeansNo,0-1=2