[1/1] eventfs: pseudo fs which allows to bind events to file descriptors.
From: Evgeniy Polyakov
Date: Wed Mar 07 2007 - 06:31:40 EST
Hello.
This pseudo filesystem allows a file descriptor to be bound to different
kinds of events, so that they can be polled using epoll().
This particular morning hack supports signals only.
If the idea is considered sound, I can cook up POSIX timers support.
A note on signal delivery:
if the special flag is set in signalfd(signo, flag), then signals are _not_
delivered through the pending mask update but only through the epoll queue.
(Copied from kevent.)
The userspace signal test code and the patch itself can be found at:
http://tservice.net.ru/~s0mbre/archive/eventfs/
signal.c is also attached for the interested reader.
Signed-off-by: Evgeniy Polyakov <johnpol@xxxxxxxxxxx>
diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index 2697e92..b14ee54 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -319,3 +319,4 @@ ENTRY(sys_call_table)
.long sys_move_pages
.long sys_getcpu
.long sys_epoll_pwait
+ .long sys_signalfd /* 320 */
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index eda7a0d..bc6336c 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -719,4 +719,5 @@ ia32_sys_call_table:
.quad compat_sys_move_pages
.quad sys_getcpu
.quad sys_epoll_pwait
+ .quad sys_signalfd
ia32_syscall_end:
diff --git a/fs/Kconfig b/fs/Kconfig
index 3c4886b..09803ad 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1032,6 +1032,15 @@ config CONFIGFS_FS
Both sysfs and configfs can and should exist together on the
same system. One is not a replacement for the other.
+config EVENTFS
+ bool "Enable eventpoll filesystem support" if EMBEDDED
+ depends on EPOLL
+ default y
+ help
+ Allows binding file descriptors to different kinds of objects,
+ such as signals and timers, and working with them using the epoll
+ family of system calls.
+
endmenu
menu "Miscellaneous filesystems"
diff --git a/fs/Makefile b/fs/Makefile
index 9edf411..185bcb1 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -22,6 +22,7 @@ endif
obj-$(CONFIG_INOTIFY) += inotify.o
obj-$(CONFIG_INOTIFY_USER) += inotify_user.o
obj-$(CONFIG_EPOLL) += eventpoll.o
+obj-$(CONFIG_EVENTFS) += eventfs.o
obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o
nfsd-$(CONFIG_NFSD) := nfsctl.o
diff --git a/fs/eventfs.c b/fs/eventfs.c
new file mode 100644
index 0000000..dae108c
--- /dev/null
+++ b/fs/eventfs.c
@@ -0,0 +1,221 @@
+/*
+ * 2007 Copyright (c) Evgeniy Polyakov <johnpol@xxxxxxxxxxx>
+ * All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/device.h>
+#include <linux/poll.h>
+#include <asm/io.h>
+
+/*
+ * Publish @file as the eventfs file bound to signal @sig for the current
+ * task, or clear the binding when @file is NULL.
+ *
+ * @sig is 1-based: slot sig - 1 of current->signal_file[] is written.
+ * No range check is done here, so callers must pass a validated signal
+ * number.  The store happens under current->sighand->siglock, the same
+ * lock sys_signalfd() holds while it inspects the slot.
+ */
+static inline void eventfs_set_signal_file(int sig, struct file *file)
+{
+	spin_lock_irq(&current->sighand->siglock);
+	current->signal_file[sig - 1] = file;
+	spin_unlock_irq(&current->sighand->siglock);
+}
+
+/*
+ * ->release() hook for signal-bound eventfs files.
+ *
+ * The signal number lives in the low 28 bits of file->private_data; the
+ * matching slot of current->signal_file[] is cleared.  @inode is unused.
+ * Always returns 0.
+ *
+ * NOTE(review): the slot that gets cleared belongs to whichever task runs
+ * this hook; presumably that is the task that created the binding --
+ * confirm.
+ */
+static int eventfs_signal_release(struct inode *inode, struct file *file)
+{
+	unsigned long priv = (unsigned long)file->private_data;
+
+	eventfs_set_signal_file((int)(priv & 0x0fffffff), NULL);
+	return 0;
+}
+
+/*
+ * ->poll() hook for signal-bound eventfs files.
+ *
+ * file->private_data packs the binding state:
+ *   bits 0-27   signal number
+ *   bit 30      set by send_signal() when the bound signal has been sent
+ *   bit 31      set by sys_signalfd() when normal delivery is suppressed
+ *
+ * Registers the caller on current->signal_wait and returns
+ * POLLIN | POLLRDNORM when bit 30 is set and the signal is not in
+ * current->blocked; otherwise returns 0.  Bit 30 is cleared at the moment
+ * readiness is reported, so one arrival is reported by one call only.
+ */
+static unsigned int eventfs_signal_poll(struct file *file, struct poll_table_struct *wait)
+{
+	int sig = (int)((unsigned long)(file->private_data) & 0x0fffffff);
+	unsigned int mask = 0;
+	unsigned long priv;
+	unsigned long flags;
+
+	poll_wait(file, &current->signal_wait, wait);
+
+	/* private_data is tested and updated under siglock. */
+	spin_lock_irqsave(&current->sighand->siglock, flags);
+	priv = (unsigned long)file->private_data;
+	if (!sigismember(&current->blocked, sig) && (priv & 0x40000000)) {
+		mask = POLLIN | POLLRDNORM;
+		file->private_data = (void *)(priv & ~0x40000000UL);
+	}
+	spin_unlock_irqrestore(&current->sighand->siglock, flags);
+
+	return mask;
+}
+
+/*
+ * File operations for signal-bound eventfs files.  Only ->release and
+ * ->poll are provided; sys_signalfd() passes this table to eventfs_init().
+ */
+struct file_operations eventfs_signal_fops = {
+ .release = eventfs_signal_release,
+ .poll = eventfs_signal_poll,
+ .owner = THIS_MODULE,
+};
+
+/* Internal eventfs mount, set once by eventfs_sys_init() via kern_mount(). */
+static struct vfsmount *eventfs_mnt __read_mostly;
+
+/*
+ * ->get_sb() hook for eventfs.
+ *
+ * Hands the work to get_sb_pseudo() with the name "eventfs", no
+ * super_operations and a fixed magic value.  @flags, @dev_name and @data
+ * are not used.  Returns whatever get_sb_pseudo() returns.
+ */
+static int eventfs_get_sb(struct file_system_type *fs_type, int flags,
+		const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	const unsigned long eventfs_magic = 0x193748dd;
+
+	return get_sb_pseudo(fs_type, "eventfs", NULL, eventfs_magic, mnt);
+}
+
+/*
+ * Filesystem type registered by eventfs_sys_init() and mounted internally
+ * with kern_mount().
+ */
+static struct file_system_type eventfs_fs_type = {
+ .name = "eventfs",
+ .get_sb = eventfs_get_sb,
+ .kill_sb = kill_anon_super,
+};
+
+/*
+ * ->d_delete() hook; unconditionally returns 1.
+ * NOTE(review): presumably this asks the dcache to drop the dentry as soon
+ * as it becomes unused -- confirm against the VFS documentation.
+ */
+static int eventfs_delete_dentry(struct dentry *dentry)
+{
+ return 1;
+}
+
+/* Dentry operations assigned to every dentry created by eventfs_init(). */
+static struct dentry_operations eventfs_dentry_operations = {
+ .d_delete = eventfs_delete_dentry,
+};
+
+/*
+ * Create an anonymous eventfs file and install it in a new descriptor of
+ * the current task.
+ *
+ * @filp: receives the new struct file on success
+ * @fops: file operations set on both the inode and the file
+ * @priv: value stored in file->private_data
+ *
+ * Returns the new descriptor number on success.  On failure returns a
+ * negative errno (-ENFILE when the file or inode cannot be allocated, the
+ * error from get_unused_fd(), or -ENOMEM when the dentry cannot be
+ * allocated), releases everything acquired so far and leaves *filp
+ * untouched.
+ */
+static int eventfs_init(struct file **filp, struct file_operations *fops, unsigned long priv)
+{
+	struct qstr this;
+	char name[32];
+	struct dentry *dentry;
+	struct inode *inode;
+	struct file *file;
+	int err = -ENFILE, fd;
+
+	file = get_empty_filp();
+	if (!file)
+		goto err_out_exit;
+
+	inode = new_inode(eventfs_mnt->mnt_sb);
+	if (!inode)
+		goto err_out_fput;
+
+	inode->i_fop = fops;
+
+	inode->i_state = I_DIRTY;
+	inode->i_mode = S_IRUSR | S_IWUSR;
+	inode->i_uid = current->fsuid;
+	inode->i_gid = current->fsgid;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+	err = get_unused_fd();
+	if (err < 0)
+		goto err_out_iput;
+	fd = err;
+
+	/* The dentry is named after the inode number, e.g. "[1234]". */
+	snprintf(name, sizeof(name), "[%lu]", inode->i_ino);
+	this.name = name;
+	this.len = strlen(name);
+	this.hash = inode->i_ino;
+	dentry = d_alloc(eventfs_mnt->mnt_sb->s_root, &this);
+	if (!dentry) {
+		/*
+		 * err still holds the (non-negative) descriptor number at
+		 * this point; returning it would look like success to the
+		 * caller even though the descriptor is released below.
+		 */
+		err = -ENOMEM;
+		goto err_out_put_fd;
+	}
+	dentry->d_op = &eventfs_dentry_operations;
+	d_add(dentry, inode);
+	file->f_vfsmnt = mntget(eventfs_mnt);
+	file->f_dentry = dentry;
+	file->f_mapping = inode->i_mapping;
+	file->f_pos = 0;
+	file->f_flags = O_RDONLY;
+	file->f_op = fops;
+	file->f_mode = FMODE_READ;
+	file->f_version = 0;
+	file->private_data = (void *)priv;
+
+	fd_install(fd, file);
+	*filp = file;
+
+	return fd;
+
+err_out_put_fd:
+	put_unused_fd(fd);
+err_out_iput:
+	iput(inode);
+err_out_fput:
+	put_filp(file);
+err_out_exit:
+	return err;
+}
+
+/*
+ * signalfd(sig, flags): bind signal @sig of the current task to a new
+ * eventfs file descriptor that can be polled.
+ *
+ * @sig:   signal number, 1-based; must satisfy valid_signal()
+ * @flags: any non-zero value sets bit 31 of the file's private_data, which
+ *         makes send_signal() report the signal through the file only and
+ *         skip normal queueing
+ *
+ * Returns the new descriptor on success, -EINVAL for an invalid signal or
+ * for a request to suppress SIGKILL/SIGSTOP, -EEXIST when the signal is
+ * already bound for this task, or the error returned by eventfs_init().
+ */
+asmlinkage long sys_signalfd(int sig, int flags)
+{
+	int fd, err = 0;
+	struct file *file;
+	unsigned long priv = sig;
+
+	if (!valid_signal(sig) || sig < 1)
+		return -EINVAL;
+
+	/*
+	 * With delivery suppressed, send_signal() returns before the signal
+	 * is queued at all.  Allowing that for SIGKILL or SIGSTOP would let
+	 * a task make itself unkillable or unstoppable.
+	 */
+	if (flags && (sig == SIGKILL || sig == SIGSTOP))
+		return -EINVAL;
+
+	/*
+	 * Reserve the slot under siglock so that two concurrent callers
+	 * cannot both bind the same signal.  The (void *)1 marker is only a
+	 * reservation, not a struct file: code reading signal_file[] must
+	 * not dereference it.
+	 */
+	spin_lock_irq(&current->sighand->siglock);
+	file = current->signal_file[sig - 1];
+	if (file)
+		err = -EEXIST;
+	else
+		current->signal_file[sig - 1] = (void *)1;
+	spin_unlock_irq(&current->sighand->siglock);
+
+	if (err)
+		return err;
+
+	file = NULL;
+	if (flags)
+		priv |= 0x80000000UL;
+	fd = eventfs_init(&file, &eventfs_signal_fops, priv);
+	if (fd < 0) {
+		/* Drop the reservation taken above. */
+		eventfs_set_signal_file(sig, NULL);
+		return fd;
+	}
+
+	/* Replace the reservation marker with the real file. */
+	eventfs_set_signal_file(sig, file);
+
+	return fd;
+}
+
+/*
+ * Eventfs subsystem initialization: register the filesystem type and
+ * create the internal mount that eventfs_init() allocates inodes and
+ * dentries from.
+ *
+ * Returns 0 on success.  If registration fails its error is returned; if
+ * the mount fails the filesystem type is unregistered again and the mount
+ * error is returned.
+ */
+static int __init eventfs_sys_init(void)
+{
+	int err = register_filesystem(&eventfs_fs_type);
+
+	if (err)
+		return err;
+
+	eventfs_mnt = kern_mount(&eventfs_fs_type);
+	if (IS_ERR(eventfs_mnt)) {
+		err = PTR_ERR(eventfs_mnt);
+		unregister_filesystem(&eventfs_fs_type);
+		return err;
+	}
+
+	printk(KERN_INFO "Eventfs subsystem has been successfully registered.\n");
+	return 0;
+}
+
+module_init(eventfs_sys_init);
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index 833fa17..c72a568 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -325,10 +325,11 @@
#define __NR_move_pages 317
#define __NR_getcpu 318
#define __NR_epoll_pwait 319
+#define __NR_signalfd 320
#ifdef __KERNEL__
-#define NR_syscalls 320
+#define NR_syscalls 321
#define __ARCH_WANT_IPC_PARSE_VERSION
#define __ARCH_WANT_OLD_READDIR
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index c5f596e..62a21f3 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -619,8 +619,10 @@ __SYSCALL(__NR_sync_file_range, sys_sync_file_range)
__SYSCALL(__NR_vmsplice, sys_vmsplice)
#define __NR_move_pages 279
__SYSCALL(__NR_move_pages, sys_move_pages)
+#define __NR_signalfd 280
+__SYSCALL(__NR_signalfd, sys_signalfd)
-#define __NR_syscall_max __NR_move_pages
+#define __NR_syscall_max __NR_signalfd
#ifndef __NO_STUBS
#define __ARCH_WANT_OLD_READDIR
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 49fe299..22c1412 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -935,6 +935,10 @@ struct task_struct {
/* signal handlers */
struct signal_struct *signal;
struct sighand_struct *sighand;
+#ifdef CONFIG_EVENTFS
+ /* eventfs file bound to each signal by sys_signalfd(); indexed by signo - 1 */
+ struct file *signal_file[_NSIG];
+ /* woken by send_signal() when a bound signal is sent; waited on by eventfs poll */
+ wait_queue_head_t signal_wait;
+#endif
sigset_t blocked, real_blocked;
sigset_t saved_sigmask; /* To be restored with TIF_RESTORE_SIGMASK */
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 1912c6c..b34f4e6 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -605,4 +605,6 @@ asmlinkage long sys_getcpu(unsigned __user *cpu, unsigned __user *node, struct g
int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
+asmlinkage long sys_signalfd(int sig, int flags);
+
#endif
diff --git a/kernel/fork.c b/kernel/fork.c
index d154cc7..1b318da 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1128,6 +1128,11 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (retval)
goto bad_fork_cleanup_namespaces;
+#ifdef CONFIG_EVENTFS
+	/*
+	 * Start the new task with no signal bindings.  memset() takes a
+	 * length in bytes: ARRAY_SIZE() is the element count and would
+	 * leave most of the pointer slots uncleared.
+	 */
+	memset(p->signal_file, 0, sizeof(p->signal_file));
+	init_waitqueue_head(&p->signal_wait);
+#endif
+
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
/*
* Clear TID on mm_release()?
diff --git a/kernel/signal.c b/kernel/signal.c
index 3670225..059977c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -739,6 +739,16 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
struct sigqueue * q = NULL;
int ret = 0;
+#ifdef CONFIG_EVENTFS
+	/*
+	 * Notify the eventfs file bound to this signal, if any.
+	 *
+	 * sys_signalfd() parks (void *)1 in the slot while it is still
+	 * building the file; that marker is not a struct file and must not
+	 * be dereferenced.
+	 */
+	if (t->signal_file[sig-1] && t->signal_file[sig-1] != (void *)1) {
+		struct file *file = t->signal_file[sig-1];
+
+		/* Bit 30: the signal has been sent; the poll hook tests and clears it. */
+		file->private_data = (void *)(((unsigned long)(file->private_data)) | 0x40000000);
+		wake_up(&t->signal_wait);
+		/* Bit 31: report through the file only, skip normal queueing. */
+		if (((unsigned long)(file->private_data)) & 0x80000000)
+			return 1;
+	}
+#endif
+
/*
* fast-pathed signals for kernel-internal things like SIGSTOP
* or SIGKILL.
@@ -817,6 +827,18 @@ specific_send_sig_info(int sig, struct siginfo *info, struct task_struct *t)
ret = send_signal(sig, info, t, &t->pending);
if (!ret && !sigismember(&t->blocked, sig))
signal_wake_up(t, sig == SIGKILL);
+#ifdef CONFIG_EVENTFS
+ /*
+ * Eventfs allows signals to be delivered through the epoll queue.
+ * A binding can be set up so that the signal is not delivered
+ * the usual way; in that case send_signal() returns 1 and the
+ * signal is reported only through the epoll queue.
+ * Report that to the caller as a successful delivery:
+ */
+ if (ret == 1)
+ ret = 0;
+
+#endif
out:
return ret;
}
@@ -1006,6 +1028,18 @@ __group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
* to avoid several races.
*/
ret = send_signal(sig, info, p, &p->signal->shared_pending);
+#ifdef CONFIG_EVENTFS
+ /*
+ * Eventfs allows signals to be delivered through the epoll queue.
+ * A binding can be set up so that the signal is not delivered
+ * the usual way; in that case send_signal() returns 1 and the
+ * signal is reported only through the epoll queue.
+ * Report that to the caller as a successful delivery:
+ */
+ if (ret == 1)
+ ret = 0;
+
+#endif
if (unlikely(ret))
return ret;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index d7306d0..c131d20 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -113,6 +113,8 @@ cond_syscall(sys_vm86);
cond_syscall(compat_sys_ipc);
cond_syscall(compat_sys_sysctl);
+cond_syscall(sys_signalfd);
+
/* arch-specific weak syscall entries */
cond_syscall(sys_pciconfig_read);
cond_syscall(sys_pciconfig_write);
--
Evgeniy Polyakov
#include <sys/epoll.h>
#include <signal.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
#include <linux/unistd.h>
#include <linux/types.h>
/*
 * Generate a two-argument wrapper function `fn` that invokes the raw
 * system call __NR_<fn> through syscall() and returns its result.
 */
#define _syscall2(ret_t, fn, t1, a1, t2, a2)	\
ret_t fn(t1 a1, t2 a2)				\
{						\
	return syscall(__NR_##fn, a1, a2);	\
}

/* int signalfd(int sig, int flags): wrapper for the syscall added by the patch. */
_syscall2(int, signalfd, int, sig, int, flags);
/*
 * Logging helpers.  ulog() prints a printf-style message to stderr; flip
 * the "#if 1" to 0 to compile all logging out.
 */
#if 1
#define ulog(f, a...) fprintf(stderr, f, ##a)
#else
#define ulog(f, a...)
#endif
/*
 * ulog_err() appends ": <strerror(errno)> [<errno>].\n" to the message.
 * The format argument must be a string literal, because it is pasted
 * together with the suffix.
 */
#define ulog_err(f, a...) ulog(f ": %s [%d].\n", ##a, strerror(errno), errno)
/*
 * Signal handler: writes "<function name>, signo: <signo>.\n" to standard
 * output.
 *
 * The message is assembled by hand and emitted with a single write(2),
 * because printf() is not async-signal-safe and must not be called from a
 * handler.  errno is saved and restored so the interrupted code never
 * sees a value changed by the handler.
 */
static void sig_handler(int signo)
{
	static const char sep[] = ", signo: ";
	char msg[64];	/* function name + separator + at most 11 number chars + ".\n" */
	char digits[12];
	unsigned int value = signo < 0 ? 0u - (unsigned int)signo : (unsigned int)signo;
	size_t len = 0, ndigits = 0;
	int saved_errno = errno;
	const char *p;

	for (p = __func__; *p != '\0'; p++)
		msg[len++] = *p;
	for (p = sep; *p != '\0'; p++)
		msg[len++] = *p;

	if (signo < 0)
		msg[len++] = '-';
	/* Digits come out least-significant first; reverse while copying. */
	do {
		digits[ndigits++] = (char)('0' + value % 10);
		value /= 10;
	} while (value != 0);
	while (ndigits > 0)
		msg[len++] = digits[--ndigits];

	msg[len++] = '.';
	msg[len++] = '\n';

	if (write(STDOUT_FILENO, msg, len) < 0) {
		/* Nothing useful can be done about a failed write in a handler. */
	}
	errno = saved_errno;
}
/*
 * Bind signal `sig` to a new descriptor with signalfd(sig, flags) and
 * register that descriptor for EPOLLIN on the epoll instance `ctl_fd`.
 *
 * Returns 0 on success; the descriptor then stays open and registered.
 * Returns the negative result of signalfd() if binding fails, or -1 if
 * epoll_ctl() fails, in which case the new descriptor is closed again.
 * Failures are logged with ulog_err().
 */
static int epoll_signal_init(int ctl_fd, int sig, int flags)
{
	int fd;
	struct epoll_event event = {0};

	fd = signalfd(sig, flags);
	if (fd < 0) {
		ulog_err("Failed to bind signal: %d, flags: %d", sig, flags);
		return fd;
	}

	event.events = EPOLLIN;
	event.data.fd = fd;
	if (epoll_ctl(ctl_fd, EPOLL_CTL_ADD, fd, &event) < 0) {
		/* Log first: close() may overwrite errno. */
		ulog_err("Failed to perform control ADD operation: fd: %d", fd);
		close(fd);
		return -1;
	}

	return 0;
}
/*
 * Test driver: binds SIGUSR1 (with the flag set) and SIGUSR2 (without it)
 * to epoll-able descriptors, sends both signals to itself and then prints
 * every descriptor that epoll_wait() reports as ready.
 *
 * Returns non-zero if setup fails or if epoll_wait() fails with anything
 * other than EINTR; otherwise it keeps polling until killed.
 */
int main(void)
{
	int err, ctl_fd, i;
	struct epoll_event event[256];

	ctl_fd = epoll_create(10);
	if (ctl_fd < 0) {
		ulog_err("Failed to create epoll control file descriptor");
		return -1;
	}

	signal(SIGUSR1, sig_handler);
	signal(SIGUSR2, sig_handler);

	err = epoll_signal_init(ctl_fd, SIGUSR1, 1);
	if (err)
		return err;
	err = epoll_signal_init(ctl_fd, SIGUSR2, 0);
	if (err)
		return err;

	ulog("%s: pid: %d\n", __func__, getpid());

	kill(getpid(), SIGUSR1);
	kill(getpid(), SIGUSR2);

	while (1) {
		err = epoll_wait(ctl_fd, event, sizeof(event) / sizeof(event[0]), 10000);
		if (err < 0) {
			int saved_errno = errno;

			ulog_err("Failed to wait for events");
			/* A handled signal interrupts the wait; just retry. */
			if (saved_errno == EINTR)
				continue;
			/* Any other error is persistent; retrying would spin forever. */
			close(ctl_fd);
			return -1;
		}

		/* err == 0 is a timeout: the loop body below is simply skipped. */
		for (i = 0; i < err; ++i)
			printf("Dequeued signal, fd: %d.\n", event[i].data.fd);
	}

	return 0;
}