Re: F_DUPFD_CLOEXEC implementation

From: Denys Vlasenko
Date: Mon Oct 01 2007 - 06:07:51 EST


On Monday 01 October 2007 04:15, Davide Libenzi wrote:
> On Mon, 1 Oct 2007, Denys Vlasenko wrote:
>
> > My use case is: I want to do a nonblocking read on descriptor 0 (stdin).
> > It may be a pipe or a socket.
> >
> > There may be other processes which share this descriptor with me,
> > I simply cannot know that. And they, too, may want to do reads on it.
> >
> > I want to do nonblocking read in such a way that neither those other
> > processes will ever see fd switching to O_NONBLOCK and back, and
> > I also want to be safe from other processes doing the same.
> >
> > I don't see how this can be done using standard unix primitives.
>
> Indeed. You could simulate non-blocking using poll with zero timeout, but
> if another task may read/write on it, your following read/write may end up
> blocking even after a poll returned the required events.
> One way to solve this would be some sort of readx/writex where you pass an
> extra flags parameter

We have that already. They are called send and recv. ;)

> (this could be done with sys_indirect, assuming
> we'll ever get that mainline) where you specify the non-blocking
> requirement for-this-call, and not as global per-file flag. Then, of
> course, you'll have to modify all the "file->f_flags & O_NONBLOCK" tests
> (and there are many of them) to check for that flag too (that can be a
> per task_struct flag).

Attached patch detects send/recv(fd, buf, size, MSG_DONTWAIT) on
non-sockets and turns them into non-blocking write/read.
Since filp->f_flags appear to be read and modified without any locking,
I cannot modify it without potentially affecting other processes
accessing the same file through shared struct file.

Therefore I simply make a temporary copy of struct file, set
O_NONBLOCK in it and pass it to vfs_read/write.
Is this heresy? ;) I see only one spinlock in struct file:

#ifdef CONFIG_EPOLL
spinlock_t f_ep_lock;
#endif /* #ifdef CONFIG_EPOLL */

Do I need to take it?

Also attached is ndelaytest.c which can be used to test that
send(MSG_DONTWAIT) indeed is failing with EAGAIN if write would block
and that other processes never see O_NONBLOCK set.

Comments?
--
vda
#include <sys/types.h>
#include <sys/socket.h>
#include <errno.h>
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <time.h>
#include <signal.h>

#define SECONDS 10

#define STR "."
//#define STR "123456789 123456789 123456789 123456789 "

/* To see send() resulting in EAGAIN:
* strace -ff -o log ndelaytest | while sleep 11; do break; done
* log.$PID:
* send(1, "123456789 123456789 123456789 12"..., 40, MSG_DONTWAIT)
* = -1 EAGAIN (Resource temporarily unavailable)
*/

int main()
{
pid_t pid;
time_t t;
int fl;

puts("starting");
t = time(0);

pid = fork();
if (pid == 0) {
/* child */
while ((time(0) - t) < SECONDS-1) {
#if 0
/* Uncomment this part and simply run the executable
* to see race detection code in action */
#define OP "write"
fcntl(1, F_SETFL, fcntl(1, F_GETFL) | O_NONBLOCK);
fl = write(1, STR, sizeof(STR) - 1);
fcntl(1, F_SETFL, fcntl(1, F_GETFL) & ~O_NONBLOCK);
#else
/* This part tests whether send(MSG_DONTWAIT)
* is racy or not */
#define OP "send"
fl = send(1, STR, sizeof(STR) - 1, MSG_DONTWAIT);
#endif
if (fl < 0) {
perror(OP);
kill(getppid(), SIGKILL);
return 1;
}
}
return 0;
}

while ((time(0) - t) < SECONDS) {
fl = fcntl(1, F_GETFL);
if (fl & O_NONBLOCK) {
fprintf(stderr, "NONBLOCK:1\n");
kill(pid, SIGKILL);
fcntl(1, F_SETFL, fl & ~O_NONBLOCK);
return 1;
}
}
fprintf(stderr, "NONBLOCK:0\n");
return 0;
}
--- linux-2.6.22-rc6.src/fs/read_write.c Fri Jun 15 19:30:05 2007
+++ linux-2.6.22-rc6_ndelay/fs/read_write.c Sun Aug 19 10:43:24 2007
@@ -15,6 +15,7 @@
#include <linux/module.h>
#include <linux/syscalls.h>
#include <linux/pagemap.h>
+#include <linux/socket.h>
#include "read_write.h"

#include <asm/uaccess.h>
@@ -351,6 +352,36 @@
static inline void file_pos_write(struct file *file, loff_t pos)
{
file->f_pos = pos;
+}
+
+/* Helper for send/recv on non-sockets */
+ssize_t rw_with_flags(struct file *file, int fput_needed, void __user *buf, size_t count, unsigned flags)
+{
+ int err;
+ loff_t pos;
+ struct file *file_copy;
+
+ file_copy = file;
+ if (flags & MSG_DONTWAIT) {
+ /* We make copy even if O_NONBLOCK is already set. */
+ /* We don't want it to change under our feet. */
+ file_copy = kmalloc(sizeof(*file_copy), GFP_KERNEL);
+ memcpy(file_copy, file, sizeof(*file_copy));
+ file_copy->f_flags |= O_NONBLOCK;
+ }
+
+ pos = file_pos_read(file);
+ if (flags & MSG_OOB) /* MSG_OOB is reused to mean 'write' */
+ err = vfs_write(file_copy, buf, count, &pos);
+ else
+ err = vfs_read(file_copy, buf, count, &pos);
+ file_pos_write(file, pos);
+
+ if (flags & MSG_DONTWAIT) {
+ kfree(file_copy);
+ }
+ fput_light(file, fput_needed);
+ return err;
}

asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)
--- linux-2.6.22-rc6.src/include/linux/fs.h Wed Jun 27 21:24:18 2007
+++ linux-2.6.22-rc6_ndelay/include/linux/fs.h Sun Aug 19 10:32:20 2007
@@ -1154,6 +1154,9 @@
extern ssize_t vfs_writev(struct file *, const struct iovec __user *,
unsigned long, loff_t *);

+extern ssize_t rw_with_flags(struct file *, int, void __user *, size_t,
+ unsigned);
+
/*
* NOTE: write_inode, delete_inode, clear_inode, put_inode can be called
* without the big kernel lock held in all filesystems.
--- linux-2.6.22-rc6.src/net/socket.c Fri Jun 15 19:30:08 2007
+++ linux-2.6.22-rc6_ndelay/net/socket.c Sun Aug 19 11:34:07 2007
@@ -1585,8 +1585,17 @@
goto out;

sock = sock_from_file(sock_file, &err);
- if (!sock)
- goto out_put;
+ if (!sock) {
+ if (addr)
+ goto out_put;
+ if (flags & ~MSG_DONTWAIT)
+ goto out_put;
+ /* it's not a socket, but we support a special case:
+ * send(fd, buf, count, MSG_DONTWAIT)
+ * (MSG_OOB is reused to mean 'write') */
+ return rw_with_flags(sock_file, fput_needed, buff, len, flags | MSG_OOB);
+ }
+
iov.iov_base = buff;
iov.iov_len = len;
msg.msg_name = NULL;
@@ -1646,8 +1655,15 @@
goto out;

sock = sock_from_file(sock_file, &err);
- if (!sock)
- goto out_put;
+ if (!sock) {
+ if (addr)
+ goto out_put;
+ if (flags & ~MSG_DONTWAIT)
+ goto out_put;
+ /* it's not a socket, but we support a special case:
+ * recv(fd, ubuf, size, MSG_DONTWAIT) */
+ return rw_with_flags(sock_file, fput_needed, ubuf, size, flags);
+ }

msg.msg_control = NULL;
msg.msg_controllen = 0;