--- linux/fs/select.c Mon Jul 24 07:39:44 2000 +++ linux-work/fs/select.c Sat Aug 5 17:23:18 2000 @@ -12,6 +12,9 @@ * 24 January 2000 * Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation * of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian). + * + * July 2000 + * Add fast poll/select (Andi Kleen) */ #include @@ -24,6 +27,8 @@ #define ROUND_UP(x,y) (((x)+(y)-1)/(y)) #define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM) +#define __MAX_POLL_TABLE_ENTRIES ((PAGE_SIZE - sizeof (struct poll_table_page)) / sizeof (struct poll_table_entry)) + struct poll_table_entry { struct file * filp; wait_queue_t wait; @@ -32,42 +37,43 @@ struct poll_table_page { struct poll_table_page * next; - struct poll_table_entry * entry; + int nr, max; struct poll_table_entry entries[0]; }; -#define POLL_TABLE_FULL(table) \ - ((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table)) - /* - * Ok, Peter made a complicated, but straightforward multiple_wait() function. - * I have rewritten this, taking some shortcuts: This code may not be easy to - * follow, but it should be free of race-conditions, and it's practical. If you - * understand what I'm doing here, then you understand how the linux - * sleep/wakeup mechanism works. - * - * Two very simple procedures, poll_wait() and poll_freewait() make all the - * work. poll_wait() is an inline-function defined in , - * as all select/poll functions have to call it to add an entry to the - * poll table. + * Tune fast poll/select. Limit is the kernel stack. */ +#define FAST_SELECT_LIMIT 40 +#define FAST_POLL_LIMIT 40 + +#define FAST_SELECT_BYTES ((FAST_SELECT_LIMIT + 7) / 8) +#define FAST_SELECT_ULONG \ + ((FAST_SELECT_BYTES + sizeof(unsigned long) - 1) / sizeof(unsigned long)) + +static inline void do_freewait(struct poll_table_page *p) +{ + struct poll_table_entry * entry; + entry = p->entries + p->nr; + while (p->nr > 0) { + p->nr--; + entry--; + remove_wait_queue(entry->wait_address,&entry->wait); + fput(entry->filp); + } +} void poll_freewait(poll_table* pt) { + struct poll_table_page *old; struct poll_table_page * p = pt->table; while (p) { - struct poll_table_entry * entry; - struct poll_table_page *old; - - entry = p->entry; - do { - entry--; - remove_wait_queue(entry->wait_address,&entry->wait); - fput(entry->filp); - } while (entry > p->entries); old = p; + do_freewait(p); p = p->next; - free_page((unsigned long) old); + if (old->max == __MAX_POLL_TABLE_ENTRIES) { + free_page((unsigned long) old); + } } } @@ -75,7 +81,7 @@ { struct poll_table_page *table = p->table; - if (!table || POLL_TABLE_FULL(table)) { + if (!table || table->nr >= table->max) { struct poll_table_page *new_table; new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL); @@ -84,16 +90,17 @@ __set_current_state(TASK_RUNNING); return; } - new_table->entry = new_table->entries; - new_table->next = table; + new_table->nr = 0; + new_table->max = __MAX_POLL_TABLE_ENTRIES; + new_table->next = table; p->table = new_table; table = new_table; } /* Add a new entry */ { - struct poll_table_entry * entry = table->entry; - table->entry = entry+1; + struct poll_table_entry * entry = table->entries + table->nr; + table->nr++; get_file(filp); entry->filp = filp; entry->wait_address = wait_address; @@ -160,9 +167,8 @@ #define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR) #define POLLEX_SET (POLLPRI) -int do_select(int n, fd_set_bits *fds, long *timeout) +int __do_select(poll_table *wait, int n, fd_set_bits *fds, long *timeout) { - poll_table table, *wait; int retval, i, off; long __timeout = *timeout; @@ -173,22 +179,24 @@ if (retval < 0) return retval; n = retval; - - poll_initwait(&table); - wait = &table; - if (!__timeout) - wait = NULL; retval = 0; for (;;) { set_current_state(TASK_INTERRUPTIBLE); for (i = 0 ; i < n; i++) { - unsigned long bit = BIT(i); + unsigned long bit; unsigned long mask; struct file *file; - + off = i / __NFDBITS; - if (!(bit & BITS(fds, off))) + mask = BITS(fds, off); + if (!mask) { + i = (i + __NFDBITS - 1) & ~__NFDBITS; + continue; + } + bit = BIT(i); + if (!(bit & mask)) continue; + file = fget(i); mask = POLLNVAL; if (file) { @@ -213,19 +221,17 @@ wait = NULL; } } + if (wait && wait->error) { + retval = wait->error; + break; + } wait = NULL; if (retval || !__timeout || signal_pending(current)) break; - if(table.error) { - retval = table.error; - break; - } __timeout = schedule_timeout(__timeout); } current->state = TASK_RUNNING; - poll_freewait(&table); - /* * Up-to-date the caller timeout. */ @@ -233,15 +239,85 @@ return retval; } -static void *select_bits_alloc(int size) +int do_select(int n, fd_set_bits *fds, long *timeout) { - return kmalloc(6 * size, GFP_KERNEL); -} + int r; + poll_table *wp, table; + wp = NULL; + if (*timeout) { + wp = &table; + poll_initwait(wp); + } + r = __do_select(wp, n, fds, timeout); + if (wp) + poll_freewait(wp); + return r; +} + +/* + * Fast path for small selects that do not needed auxillary memory allocations. + * This could still be optimized by using a leaner do_select like 2.0 had. + */ +static int fast_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, + long timeout, struct timeval *tvp) +{ + fd_set_bits fds; + int ret; + unsigned long stackbits[6 * FAST_SELECT_ULONG]; + poll_table wait; + struct { + struct poll_table_page w; + struct poll_table_entry entries[FAST_SELECT_LIMIT]; + } waitp; + + fds.in = stackbits; + fds.out = stackbits + FAST_SELECT_ULONG; + fds.ex = stackbits + FAST_SELECT_ULONG * 2; + fds.res_in = stackbits + FAST_SELECT_ULONG * 3; + fds.res_out = stackbits + FAST_SELECT_ULONG * 4; + fds.res_ex = stackbits + FAST_SELECT_ULONG * 5; + if ((ret = get_fd_set(n, inp, fds.in)) || + (ret = get_fd_set(n, outp, fds.out)) || + (ret = get_fd_set(n, exp, fds.ex))) + return ret; + memset(stackbits + FAST_SELECT_ULONG*3, 0, + FAST_SELECT_ULONG*sizeof(unsigned long)*3); -static void select_bits_free(void *bits, int size) -{ - kfree(bits); -} + { + poll_table *wp = NULL; + if (timeout) { + wp = &wait; + __poll_initwait(wp, &waitp.w); + waitp.w.max = FAST_SELECT_LIMIT; + waitp.w.next = NULL; + waitp.w.nr = 0; + } + ret = __do_select(wp, n, &fds, &timeout); + if (wp) { + poll_freewait(wp); + } + } + + if (tvp && !(current->personality & STICKY_TIMEOUTS)) { + time_t sec = 0, usec = 0; + if (timeout) { + sec = timeout / HZ; + usec = timeout % HZ; + usec *= (1000000/HZ); + } + __put_user(sec, &tvp->tv_sec); + __put_user(usec, &tvp->tv_usec); + } + + if (ret >= 0) { + if (ret == 0 && signal_pending(current)) + return -ERESTARTNOHAND; + set_fd_set(n, inp, fds.res_in); + set_fd_set(n, outp, fds.res_out); + set_fd_set(n, exp, fds.res_ex); + } + return ret; +} /* * We can actually return ERESTARTSYS instead of EINTR, but I'd @@ -288,6 +364,9 @@ if (n > current->files->max_fdset) n = current->files->max_fdset; + if (n <= FAST_SELECT_LIMIT) + return fast_select(n, inp, outp, exp, timeout, tvp); + /* * We need 6 bitmaps (in/out/ex for both incoming and outgoing), * since we used fdset we need to allocate memory in units of @@ -295,7 +374,7 @@ */ ret = -ENOMEM; size = FDS_BYTES(n); - bits = select_bits_alloc(size); + bits = kmalloc(6 * size, GFP_KERNEL); if (!bits) goto out_nofds; fds.in = (unsigned long *) bits; @@ -309,11 +388,9 @@ (ret = get_fd_set(n, outp, fds.out)) || (ret = get_fd_set(n, exp, fds.ex))) goto out; - zero_fd_set(n, fds.res_in); - zero_fd_set(n, fds.res_out); - zero_fd_set(n, fds.res_ex); + memset(fds.res_in, 0, 3*size); - ret = do_select(n, &fds, &timeout); + ret = do_select(n, &fds, &timeout); if (tvp && !(current->personality & STICKY_TIMEOUTS)) { time_t sec = 0, usec = 0; @@ -340,17 +417,18 @@ set_fd_set(n, exp, fds.res_ex); out: - select_bits_free(bits, size); + kfree(bits); out_nofds: return ret; } #define POLLFD_PER_PAGE ((PAGE_SIZE) / sizeof(struct pollfd)) -static void do_pollfd(unsigned int num, struct pollfd * fdpage, - poll_table ** pwait, int *count) +static int do_pollfd(unsigned int num, struct pollfd * fdpage, + poll_table ** pwait) { int i; + int count = 0; for (i = 0; i < num; i++) { int fd; @@ -372,28 +450,75 @@ } if (mask) { *pwait = NULL; - (*count)++; + count++; } } fdp->revents = mask; } + return count; } +/* Not inlined to save stack space for slow poll */ +static int fast_poll(struct pollfd *ufds, unsigned int nfds, long timeout) +{ + struct pollfd kfds[FAST_POLL_LIMIT]; + poll_table wait, *wp; + struct { + struct poll_table_page w; + struct poll_table_entry entries[FAST_POLL_LIMIT]; + } waitp; + int count, i; + + wp = NULL; + waitp.w.nr = 0; + if (timeout) { + waitp.w.next = NULL; + waitp.w.max = FAST_POLL_LIMIT; + __poll_initwait(&wait, &waitp.w); + wp = &wait; + } + + if (copy_from_user(kfds, ufds, nfds * sizeof(struct pollfd))) { + return -EFAULT; + } + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + count = do_pollfd(nfds, kfds, &wp); + wp = NULL; + if (count || !timeout || signal_pending(current)) + break; + timeout = schedule_timeout(timeout); + } + current->state = TASK_RUNNING; + + if (waitp.w.nr > 0) { + poll_freewait(&wait); + } + + for (i = 0; i < nfds; i++) + __put_user(kfds[i].revents, &ufds[i].revents); + + if (!count && signal_pending(current)) + return -EINTR; + + return count; +} + static int do_poll(unsigned int nfds, unsigned int nchunks, unsigned int nleft, struct pollfd *fds[], poll_table *wait, long timeout) { int count = 0; - poll_table* pt = wait; for (;;) { unsigned int i; set_current_state(TASK_INTERRUPTIBLE); for (i=0; i < nchunks; i++) - do_pollfd(POLLFD_PER_PAGE, fds[i], &pt, &count); + count += do_pollfd(POLLFD_PER_PAGE, fds[i], &wait); if (nleft) - do_pollfd(nleft, fds[nchunks], &pt, &count); - pt = NULL; + count += do_pollfd(nleft, fds[nchunks], &wait); + wait = NULL; if (count || !timeout || signal_pending(current)) break; if(wait->error) { @@ -405,31 +530,20 @@ return count; } -asmlinkage long sys_poll(struct pollfd * ufds, unsigned int nfds, long timeout) +static inline int slow_poll(struct pollfd * ufds, unsigned int nfds, long timeout) { int i, j, fdcount, err; struct pollfd **fds; - poll_table table, *wait; + poll_table *wait, table; int nchunks, nleft; - /* Do a sanity check on nfds ... */ - if (nfds > current->files->max_fds) - return -EINVAL; - + wait = NULL; if (timeout) { - /* Careful about overflow in the intermediate values */ - if ((unsigned long) timeout < MAX_SCHEDULE_TIMEOUT / HZ) - timeout = (unsigned long)(timeout*HZ+999)/1000+1; - else /* Negative or overflow */ - timeout = MAX_SCHEDULE_TIMEOUT; + poll_initwait(&table); + wait = &table; } - poll_initwait(&table); - wait = &table; - if (!timeout) - wait = NULL; - - err = -ENOMEM; + err = -ENOMEM; fds = NULL; if (nfds != 0) { fds = (struct pollfd **)kmalloc( @@ -489,4 +603,24 @@ out: poll_freewait(&table); return err; +} + +asmlinkage long sys_poll(struct pollfd * ufds, unsigned int nfds, long timeout) +{ + /* Do a sanity check on nfds ... */ + if (nfds > current->files->max_fds) + return -EINVAL; + + if (timeout) { + /* Careful about overflow in the intermediate values */ + if ((unsigned long) timeout < MAX_SCHEDULE_TIMEOUT / HZ) + timeout = (unsigned long)(timeout*HZ+999)/1000+1; + else /* Negative or overflow */ + timeout = MAX_SCHEDULE_TIMEOUT; + } + + if (nfds <= FAST_POLL_LIMIT) + return fast_poll(ufds, nfds, timeout); + + return slow_poll(ufds, nfds, timeout); } --- linux/include/linux/poll.h Tue Aug 1 03:06:38 2000 +++ linux-work/include/linux/poll.h Sun Jul 30 23:46:10 2000 @@ -25,11 +25,17 @@ __pollwait(filp, wait_address, p); } -static inline void poll_initwait(poll_table* pt) +static inline void __poll_initwait(poll_table* pt, struct poll_table_page *tab) { pt->error = 0; - pt->table = NULL; + pt->table = tab; +} + +static inline void poll_initwait(poll_table* pt) +{ + __poll_initwait(pt, NULL); } + extern void poll_freewait(poll_table* pt);