Re: Thread implementations...

Linus Torvalds (torvalds@transmeta.com)
Fri, 26 Jun 1998 02:09:08 -0700 (PDT)


On Fri, 26 Jun 1998, Dean Gaudet wrote:
>
> This thread is, uh, fun ;) I started out liking sendfile, and now I'm
> thinking it may not be worth it!

Well, I decided I might as well implement it. Appended is a stupid example
program (and I mean _really_ stupid), and the diff against 2.1.107. It was
pretty much exactly as I expected it to be: a small amount of judicious
re-organization made the whole system call be less than 100 lines of code,
and most of that is just checking the inputs to the system call rather
than actual work.

CAUTION! The system call may well be completely broken. It seems to work
for me with the silly test-program, and I tried to make it do all the
right checks, but it's 2AM, and I did this in little more than an hour.
Caveat emptor. But I'd like to hear what people think about it..

> Another problem with sendfile(): the file FD's seek pointer is changed
> while it's used, which means multiple clients can't be serviced from the
> same FD. This kind of defeats the purpose of caching the open FD in a
> threaded server...

My current implementation does this, but that is purely coincidental. I
could equally well have done a "pread()+pwrite()" kind of thing, it just
requires more arguments to the system call. It's technically trivial to
implement (the kernel internally does everything with the pread/pwrite
interface anyway).

But as far as I'm concerned, the kernel is better at looking up filenames
than apache will ever be unless you guys start really doing well. Under
those kinds of circumstances it's fairly pointless to do fd caching.
That's especially true in SMP environments and threading, where I just
don't think that the apache guys seem to be able to scale.

Note! I don't think apache is going to be the best thing to use with this,
if only because apache tries to be too clever. This is really meant for
something that

- uses threads
- does no caching _at_all_, because it knows the kernel can cache
everything better than most user mode programs
- just has a simple main loop that looks something like

	for (;;) {
		sock = accept();
		if (sock >= 0)
			clone(sock, connection);
	}

	connection(socket)
	{
		fd = open(filename);
		fstat(fd, &st);
		sendfile(socket, fd, st.st_size);
	}

(Yes, the above is _very_ simplistic, please don't tell me that
web-serving is slightly more complex than this ;)

Anyway, the "sendfile()" system call "man-page" is:

sendfile(int outfd, int infd, size_t size);

is pretty much equivalent to

write(outfd, tempbuf, read(infd, tempbuf, size));

(with all the error handling details etc), except that "infd" has to be a
real file on a filesystem that supports the page cache.

Linus
-----
/*
* Very stupid example of using the sendfile()
* system call.
*/

#include <stdio.h>
#include <sys/types.h>
#include <unistd.h>
#include <errno.h>
#include <sys/fcntl.h>

/*
 * User-space stub for the new sendfile() system call.
 *
 * Traps into the kernel with "int $0x80", passing system call number
 * 187 in eax (the number the accompanying patch assigns to
 * __NR_sendfile) and the three arguments in ebx, ecx and edx.
 *
 * out:  descriptor to write to
 * in:   descriptor to read from
 * size: number of bytes requested
 *
 * Returns the raw value left in eax by the kernel, except that values
 * which (viewed as unsigned long) are greater than (unsigned long)-1000
 * are treated as a negated error code: errno is set to the positive
 * code and -1 is returned.
 */
ssize_t sendfile(int out, int in, size_t size)
{
ssize_t retval;

/*
 * "out" is handed to the asm in esi and copied into ebx by hand, with
 * ebx saved and restored around the trap.
 * NOTE(review): presumably this is done because ebx cannot be named
 * directly as an operand here (e.g. it is reserved when building
 * position-independent code) - confirm.
 */
asm volatile(
"pushl %%ebx\n\t"
"movl %%esi,%%ebx\n\t"
"int $0x80\n\t"
"popl %%ebx"
:"=a" (retval)
:"0" (187), /* system call number, shares eax with the result */
"S" (out), /* pseudo-ebx */
"c" (in),
"d" (size));
/* Fold a kernel error return into the usual errno / -1 convention. */
if ((unsigned long) retval > (unsigned long)-1000) {
errno = -retval;
retval = -1;
}
return retval;
}

/*
 * Copy up to 1024 bytes from the file named by argv[1] to the file
 * named by argv[2] with a single sendfile() call and print the result.
 *
 * Returns 1 if the two file names are missing or either file cannot be
 * opened.  Once sendfile() has been attempted the exit status is 0
 * whether or not it succeeded; a failure is reported on stderr.
 */
int main(int argc, char **argv)
{
	int in, out, error;
	int saved_errno;

	/* Both file names are required; argv[2] does not exist otherwise. */
	if (argc < 3) {
		fprintf(stderr, "usage: %s <input-file> <output-file>\n",
			argc > 0 ? argv[0] : "sendfile-test");
		return 1;
	}

	in = open(argv[1], O_RDONLY);
	if (in < 0) {
		perror("open input");
		return 1;
	}
	out = open(argv[2], O_WRONLY | O_CREAT, 0666);
	if (out < 0) {
		perror("open output");
		close(in);
		return 1;
	}

	error = sendfile(out, in, 1024);
	/* printf() may change errno, so remember it for perror() below. */
	saved_errno = errno;
	printf("sendfile returned %d\n", error);
	if (error < 0) {
		errno = saved_errno;
		perror("sendfile");
	}

	close(out);
	close(in);
	return 0;
}
-----
diff -u --recursive v2.1.107/linux/arch/i386/kernel/entry.S linux/arch/i386/kernel/entry.S
--- v2.1.107/linux/arch/i386/kernel/entry.S Tue Jun 9 00:55:09 1998
+++ linux/arch/i386/kernel/entry.S Fri Jun 26 01:04:21 1998
@@ -547,7 +547,8 @@
.long SYMBOL_NAME(sys_capget)
.long SYMBOL_NAME(sys_capset) /* 185 */
.long SYMBOL_NAME(sys_sigaltstack)
+ .long SYMBOL_NAME(sys_sendfile)

- .rept NR_syscalls-186
+ .rept NR_syscalls-187
.long SYMBOL_NAME(sys_ni_syscall)
.endr
diff -u --recursive v2.1.107/linux/include/asm-i386/unistd.h linux/include/asm-i386/unistd.h
--- v2.1.107/linux/include/asm-i386/unistd.h Tue Jun 9 00:55:10 1998
+++ linux/include/asm-i386/unistd.h Fri Jun 26 01:14:18 1998
@@ -192,6 +192,7 @@
#define __NR_capget 184
#define __NR_capset 185
#define __NR_sigaltstack 186
+#define __NR_sendfile 187

/* user-visible error numbers are in the range -1 - -122: see <asm-i386/errno.h> */

diff -u --recursive v2.1.107/linux/mm/filemap.c linux/mm/filemap.c
--- v2.1.107/linux/mm/filemap.c Fri Jun 26 01:31:42 1998
+++ linux/mm/filemap.c Fri Jun 26 01:09:30 1998
@@ -567,6 +567,23 @@
return page_cache;
}

+/*
+ * "descriptor" for what we're up to with a read.
+ * This allows us to use the same read code yet
+ * have multiple different users of the data that
+ * we read from a file.
+ *
+ * The simplest case just copies the data to user
+ * mode.
+ */
+typedef struct {
+ size_t written;
+ size_t count;
+ char * buf;
+ int error;
+} read_descriptor_t;
+
+typedef int (*read_actor_t)(read_descriptor_t *, const char *, unsigned long);

/*
* This is a generic file read routine, and uses the
@@ -576,23 +593,14 @@
* This is really ugly. But the goto's actually try to clarify some
* of the logic when it comes to error handling etc.
*/
-
-ssize_t generic_file_read(struct file * filp, char * buf,
- size_t count, loff_t *ppos)
+static void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
{
struct dentry *dentry = filp->f_dentry;
struct inode *inode = dentry->d_inode;
- ssize_t error, read;
size_t pos, pgpos, page_cache;
int reada_ok;
int max_readahead = get_max_readahead(inode);

- if (!access_ok(VERIFY_WRITE, buf, count))
- return -EFAULT;
- if (!count)
- return 0;
- error = 0;
- read = 0;
page_cache = 0;

pos = *ppos;
@@ -620,12 +628,12 @@
* Then, at least MIN_READAHEAD if read ahead is ok,
* and at most MAX_READAHEAD in all cases.
*/
- if (pos + count <= (PAGE_SIZE >> 1)) {
+ if (pos + desc->count <= (PAGE_SIZE >> 1)) {
filp->f_ramax = 0;
} else {
unsigned long needed;

- needed = ((pos + count) & PAGE_MASK) - pgpos;
+ needed = ((pos + desc->count) & PAGE_MASK) - pgpos;

if (filp->f_ramax < needed)
filp->f_ramax = needed;
@@ -678,20 +686,20 @@

offset = pos & ~PAGE_MASK;
nr = PAGE_SIZE - offset;
- if (nr > count)
- nr = count;
if (nr > inode->i_size - pos)
nr = inode->i_size - pos;
- nr -= copy_to_user(buf, (void *) (page_address(page) + offset), nr);
- release_page(page);
- error = -EFAULT;
- if (!nr)
- break;
- buf += nr;
+
+ /*
+ * The actor routine returns how many bytes were actually used..
+ * NOTE! This may not be the same as how much of a user buffer
+ * we filled up (we may be padding etc), so we can only update
+ * "pos" here (the actor routine has to update the user buffer
+ * pointers and the remaining count).
+ */
+ nr = actor(desc, (const char *) (page_address(page) + offset), nr);
pos += nr;
- read += nr;
- count -= nr;
- if (count)
+ release_page(page);
+ if (nr && desc->count)
continue;
break;
}
@@ -709,7 +717,7 @@
*/
if (page_cache)
continue;
- error = -ENOMEM;
+ desc->error = -ENOMEM;
break;
}

@@ -738,11 +746,14 @@
if (reada_ok && filp->f_ramax > MIN_READAHEAD)
filp->f_ramax = MIN_READAHEAD;

- error = inode->i_op->readpage(filp, page);
- if (!error)
- goto found_page;
- release_page(page);
- break;
+ {
+ int error = inode->i_op->readpage(filp, page);
+ if (!error)
+ goto found_page;
+ desc->error = error;
+ release_page(page);
+ break;
+ }

page_read_error:
/*
@@ -750,15 +761,18 @@
* Try to re-read it _once_. We do this synchronously,
* because this happens only if there were errors.
*/
- error = inode->i_op->readpage(filp, page);
- if (!error) {
- wait_on_page(page);
- if (PageUptodate(page) && !PageError(page))
- goto success;
- error = -EIO; /* Some unspecified error occurred.. */
+ {
+ int error = inode->i_op->readpage(filp, page);
+ if (!error) {
+ wait_on_page(page);
+ if (PageUptodate(page) && !PageError(page))
+ goto success;
+ error = -EIO; /* Some unspecified error occurred.. */
+ }
+ desc->error = error;
+ release_page(page);
+ break;
}
- release_page(page);
- break;
}

*ppos = pos;
@@ -766,9 +780,143 @@
if (page_cache)
free_page(page_cache);
UPDATE_ATIME(inode);
- if (!read)
- read = error;
- return read;
+}
+
+static int file_read_actor(read_descriptor_t * desc, const char *area, unsigned long size)
+{
+ unsigned long left;
+ unsigned long count = desc->count;
+
+ if (size > count)
+ size = count;
+ left = __copy_to_user(desc->buf, area, size);
+ if (left) {
+ size -= left;
+ desc->error = -EFAULT;
+ }
+ desc->count = count - size;
+ desc->written += size;
+ desc->buf += size;
+ return size;
+}
+
+/*
+ * This is the "read()" routine for all filesystems
+ * that can use the page cache directly.
+ */
+ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
+{
+ ssize_t retval;
+
+ retval = -EFAULT;
+ if (access_ok(VERIFY_WRITE, buf, count)) {
+ retval = 0;
+ if (count) {
+ read_descriptor_t desc;
+
+ desc.written = 0;
+ desc.count = count;
+ desc.buf = buf;
+ desc.error = 0;
+ do_generic_file_read(filp, ppos, &desc, file_read_actor);
+
+ retval = desc.written;
+ if (!retval)
+ retval = desc.error;
+ }
+ }
+ return retval;
+}
+
+static int file_send_actor(read_descriptor_t * desc, const char *area, unsigned long size)
+{
+ ssize_t written;
+ unsigned long count = desc->count;
+ struct file *file = (struct file *) desc->buf;
+ struct inode *inode = file->f_dentry->d_inode;
+
+ if (size > count)
+ size = count;
+ down(&inode->i_sem);
+ set_fs(KERNEL_DS);
+ written = file->f_op->write(file, area, size, &file->f_pos);
+ set_fs(USER_DS);
+ up(&inode->i_sem);
+ if (written < 0) {
+ desc->error = written;
+ written = 0;
+ }
+ desc->count = count - written;
+ desc->written += written;
+ return written;
+}
+
+asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, size_t count)
+{
+ ssize_t retval;
+ struct file * in_file, * out_file;
+ struct inode * in_inode, * out_inode;
+
+ /*
+ * Get input file, and verify that it is ok..
+ */
+ retval = -EBADF;
+ in_file = fget(in_fd);
+ if (!in_file)
+ goto out;
+ if (!(in_file->f_mode & FMODE_READ))
+ goto fput_in;
+ retval = -EINVAL;
+ in_inode = in_file->f_dentry->d_inode;
+ if (!in_inode)
+ goto fput_in;
+ if (!in_inode->i_op || !in_inode->i_op->readpage)
+ goto fput_in;
+ retval = locks_verify_area(FLOCK_VERIFY_READ, in_inode, in_file, in_file->f_pos, count);
+ if (retval)
+ goto fput_in;
+
+ /*
+ * Get output file, and verify that it is ok..
+ */
+ retval = -EBADF;
+ out_file = fget(out_fd);
+ if (!out_file)
+ goto fput_in;
+ if (!(out_file->f_mode & FMODE_WRITE))
+ goto fput_out;
+ retval = -EINVAL;
+ if (!out_file->f_op || !out_file->f_op->write)
+ goto fput_out;
+ out_inode = out_file->f_dentry->d_inode;
+ if (!out_inode)
+ goto fput_out;
+ retval = locks_verify_area(FLOCK_VERIFY_WRITE, out_inode, out_file, out_file->f_pos, count);
+ if (retval)
+ goto fput_out;
+
+ retval = 0;
+ if (count) {
+ read_descriptor_t desc;
+
+ desc.written = 0;
+ desc.count = count;
+ desc.buf = (char *) out_file;
+ desc.error = 0;
+ do_generic_file_read(in_file, &in_file->f_pos, &desc, file_send_actor);
+
+ retval = desc.written;
+ if (!retval)
+ retval = desc.error;
+ }
+
+
+fput_out:
+ fput(out_file);
+fput_in:
+ fput(in_file);
+out:
+ return retval;
}

/*

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.rutgers.edu