[RFC/PATCH] inotify -- a dnotify replacement

From: John McCutchan
Date: Sun May 09 2004 - 20:36:59 EST


Hi,

I have been working on inotify, a dnotify replacement.

inotify is a char device that has two ioctls, INOTIFY_WATCH and
INOTIFY_IGNORE, which expect an inode number and an inode device number.
I know that on some file systems the inode number and inode device
number are not guaranteed to be unique. This driver is only meant for
file systems that have unique inode numbers.

The two biggest complaints people have about dnotify are

1) dnotify delivers events using signals.

2) dnotify needs a file to be kept open on the device, causing problems
during unmount.


As for 1) -- since inotify is a char device, the events are delivered
when the user calls read(). This integrates easily into an
application's select() loop.

I have a solution for 2) but I haven't implemented it yet, as I don't
have the expertise. What I want to do is distinguish between someone
holding a 'real' reference on an inode, and a reference that is used
only for watching. This way, in the unmount code we can make sure that no
one is holding a real reference to an inode on the device, then, while
invalidating the inodes, clear each inode's watch list and send UNMOUNT
events to the inotify device. Changes would have to be made so that
inodes are kept in the inode cache when a watching reference is held.

I would like some pointers on how to implement 2). I am not sure which
code path is taken on unmount, or where the best place is to change the
inode cache semantics.

I would also appreciate comments on the design in general and code
review (this is my first kernel project).

Attached is a patch implementing everything described in the email.

John

diff -ru clean/linux-2.6.5/fs/attr.c linux-2.6.5/fs/attr.c
--- clean/linux-2.6.5/fs/attr.c 2004-04-03 22:36:24.000000000 -0500
+++ linux-2.6.5/fs/attr.c 2004-05-09 21:02:24.000000000 -0400
@@ -11,6 +11,7 @@
#include <linux/string.h>
#include <linux/smp_lock.h>
#include <linux/dnotify.h>
+#include <linux/inotify.h>
#include <linux/fcntl.h>
#include <linux/quotaops.h>
#include <linux/security.h>
@@ -127,6 +128,7 @@
return dn_mask;
}

+
int notify_change(struct dentry * dentry, struct iattr * attr)
{
struct inode *inode = dentry->d_inode;
@@ -184,8 +186,10 @@
}
if (!error) {
unsigned long dn_mask = setattr_mask(ia_valid);
- if (dn_mask)
+ if (dn_mask) {
dnotify_parent(dentry, dn_mask);
+ inotify_dentry_queue_event (dentry, dn_mask);
+ }
}
return error;
}
diff -ru clean/linux-2.6.5/fs/inode.c linux-2.6.5/fs/inode.c
--- clean/linux-2.6.5/fs/inode.c 2004-04-03 22:38:23.000000000 -0500
+++ linux-2.6.5/fs/inode.c 2004-05-09 21:10:12.000000000 -0400
@@ -151,6 +151,10 @@
mapping->backing_dev_info = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
memset(&inode->u, 0, sizeof(inode->u));
inode->i_mapping = mapping;
+
+ INIT_LIST_HEAD(&inode->watchers);
+ atomic_set (&inode->watcher_count, 0);
+ inode->watchers_mask = 0;
}
return inode;
}
@@ -159,11 +163,15 @@
{
if (inode_has_buffers(inode))
BUG();
+ /* When an inode gets destroyed it shouldn't have any watchers left on it */
+ if (!list_empty (&inode->watchers))
+ BUG();
security_inode_free(inode);
if (inode->i_sb->s_op->destroy_inode)
inode->i_sb->s_op->destroy_inode(inode);
else
kmem_cache_free(inode_cachep, (inode));
+
}


diff -ru clean/linux-2.6.5/fs/Makefile linux-2.6.5/fs/Makefile
--- clean/linux-2.6.5/fs/Makefile 2004-04-03 22:37:23.000000000 -0500
+++ linux-2.6.5/fs/Makefile 2004-04-20 23:35:17.000000000 -0400
@@ -10,7 +10,7 @@
namei.o fcntl.o ioctl.o readdir.o select.o fifo.o locks.o \
dcache.o inode.o attr.o bad_inode.o file.o dnotify.o \
filesystems.o namespace.o seq_file.o xattr.o libfs.o \
- fs-writeback.o mpage.o direct-io.o aio.o
+ fs-writeback.o mpage.o direct-io.o aio.o inotify.o

obj-$(CONFIG_EPOLL) += eventpoll.o
obj-$(CONFIG_COMPAT) += compat.o
diff -ru clean/linux-2.6.5/fs/read_write.c linux-2.6.5/fs/read_write.c
--- clean/linux-2.6.5/fs/read_write.c 2004-04-03 22:37:36.000000000 -0500
+++ linux-2.6.5/fs/read_write.c 2004-05-09 21:02:14.000000000 -0400
@@ -11,6 +11,7 @@
#include <linux/uio.h>
#include <linux/smp_lock.h>
#include <linux/dnotify.h>
+#include <linux/inotify.h>
#include <linux/security.h>
#include <linux/module.h>

@@ -214,8 +215,10 @@
ret = file->f_op->read(file, buf, count, pos);
else
ret = do_sync_read(file, buf, count, pos);
- if (ret > 0)
+ if (ret > 0) {
dnotify_parent(file->f_dentry, DN_ACCESS);
+ inotify_dentry_queue_event (file->f_dentry, IN_ACCESS);
+ }
}
}

@@ -258,8 +261,10 @@
ret = file->f_op->write(file, buf, count, pos);
else
ret = do_sync_write(file, buf, count, pos);
- if (ret > 0)
+ if (ret > 0) {
dnotify_parent(file->f_dentry, DN_MODIFY);
+ inotify_dentry_queue_event (file->f_dentry, IN_MODIFY);
+ }
}
}

@@ -473,9 +478,12 @@
out:
if (iov != iovstack)
kfree(iov);
- if ((ret + (type == READ)) > 0)
+ if ((ret + (type == READ)) > 0) {
dnotify_parent(file->f_dentry,
(type == READ) ? DN_ACCESS : DN_MODIFY);
+ inotify_dentry_queue_event (file->f_dentry,
+ (type == READ) ? IN_ACCESS : IN_MODIFY);
+ }
return ret;
}

diff -ru clean/linux-2.6.5/include/linux/fs.h linux-2.6.5/include/linux/fs.h
--- clean/linux-2.6.5/include/linux/fs.h 2004-04-03 22:36:52.000000000 -0500
+++ linux-2.6.5/include/linux/fs.h 2004-05-09 21:06:42.000000000 -0400
@@ -414,6 +414,10 @@
unsigned long i_dnotify_mask; /* Directory notify events */
struct dnotify_struct *i_dnotify; /* for directory notifications */

+ struct list_head watchers;
+ unsigned long watchers_mask;
+ atomic_t watcher_count;
+
unsigned long i_state;
unsigned long dirtied_when; /* jiffies of first dirtying */

diff -ru clean/linux-2.6.5/include/linux/miscdevice.h linux-2.6.5/include/linux/miscdevice.h
--- clean/linux-2.6.5/include/linux/miscdevice.h 2004-04-03 22:37:23.000000000 -0500
+++ linux-2.6.5/include/linux/miscdevice.h 2004-05-08 19:34:04.000000000 -0400
@@ -36,6 +36,8 @@

#define TUN_MINOR 200

+#define INOTIFY_MINOR 99
+
struct device;

struct miscdevice
/*
* Inode based directory notification for Linux
*
* Copyright (C) 2004 John McCutchan
*/

#ifndef _LINUX_INOTIFY_H
#define _LINUX_INOTIFY_H

#include <linux/fs.h>
#include <linux/ioctl.h>

/* When reading from the device the format is:
* INODE_NUMBER [unsigned long]
* INODE_DEVICE_NUMBER [unsigned long]
* EVENT_MASK [unsigned long]
*/

/* Event bits. They are OR-ed together to form the mask in struct inotify_packet. */
#define IN_ACCESS 0x00000001 /* File was accessed */
#define IN_MODIFY 0x00000002 /* File was modified */
#define IN_CREATE 0x00000004 /* File was created */
#define IN_DELETE 0x00000008 /* File was deleted */
#define IN_RENAME 0x00000010 /* File was renamed */
#define IN_ATTRIB 0x00000020 /* File changed attributes */
#define IN_MOVE 0x00000040 /* File was moved */
#define IN_UNMOUNT 0x00000080 /* The device the file was on was unmounted */

/*
 * Argument block for the inotify ioctls: fill this in and pass its address
 * to the ioctl. It names one inode and the events of interest on it.
 */
struct inotify_packet {
unsigned long i_no; /* inode number, from stat */
unsigned long i_dev; /* inode device number, from stat */
unsigned long mask; /* combination of the IN_* event bits above */
};

/*
 * ioctl interface.
 *
 * Both commands take a pointer to a struct inotify_packet. The command
 * numbers must encode that type: struct inotify_watch is private to the
 * driver, is not declared in this header, and is not what callers pass.
 */

#define INOTIFY_IOCTL_MAGIC 'Q'
#define INOTIFY_IOCTL_MAXNR 2

#define INOTIFY_WATCH _IOW(INOTIFY_IOCTL_MAGIC, 1, struct inotify_packet)
#define INOTIFY_IGNORE _IOW(INOTIFY_IOCTL_MAGIC, 2, struct inotify_packet)

/*
 * Kernel API
 *
 * inotify_inode_queue_event:  queue an event with the given mask on every
 *                             watcher of the inode whose own mask matches.
 * inotify_dentry_queue_event: same, applied to the inode of the dentry's
 *                             parent.
 * inotify_inode_update_mask:  recompute inode->watchers_mask as the OR of the
 *                             masks of all watchers attached to the inode.
 */
void inotify_inode_queue_event (struct inode *inode, unsigned long mask);
void inotify_dentry_queue_event (struct dentry *dentry, unsigned long mask);
void inotify_inode_update_mask (struct inode *inode);

#endif

/*
* Inode based directory notifications for Linux.
*
* Copyright (C) 2004 John McCutchan
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the
* Free Software Foundation; either version 2, or (at your option) any
* later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/miscdevice.h>
#include <linux/device.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/stddef.h>
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/inotify.h>

/* TODO:
* get more events sent to the queue
* rename
* move
* unmount
* check that any inodes we are asked to watch are actually directories
* write user space test app
* change umount code
*/

/*
 * Limits.
 * MAX_WATCHER_COUNT caps the number of simultaneously open instances of the
 * device; inotify_open() compares watcher_count against it.
 * NOTE(review): MAX_WATCH_COUNT is not referenced anywhere in the visible
 * code, so the per-watcher limit appears to be unenforced -- confirm.
 */
#define MAX_WATCHER_COUNT 8 /* We only support 8 watchers */
#define MAX_WATCH_COUNT 128 /* A watcher can only be watching 128 inodes */

/* Number of open device instances: incremented in inotify_open(), decremented in inotify_release(). */
static int watcher_count = 0;

/*
 * One entry in a device instance's list of watched inodes
 * (the list headed by struct inotify_device's watch member).
 */
struct inotify_watch {
struct list_head list; /* links this entry into the device's watch list */
struct inode * inode; /* the inode this device instance is watching */
};
/* Map an embedded list_head back to its containing struct inotify_watch. */
#define list_to_inotify_watch(pos) list_entry((pos),struct inotify_watch,list)

/*
 * A list of these structures is attached to each inode that is being watched
 * (headed by inode->watchers).
 *  - each item in the list represents a unique watcher.
 *  - it tells us what events we are looking at and who is watching the inode.
 */
struct inotify_inode_watcher {
	struct list_head list;	/* links into inode->watchers */
	unsigned long mask;	/* IN_* events this watcher is interested in */
	struct inode * inode;	/* the inode being watched */
	void * private_data;	/* the driver instance (struct inotify_device) */
};
/*
 * Map an embedded list_head back to its containing watcher. The struct tag
 * must be spelled exactly as above, otherwise any use of the macro fails to
 * compile.
 */
#define list_to_inotify_inode_watcher(pos) list_entry((pos), struct inotify_inode_watcher, list)

/*
 * A list of these is attached to each instance of the driver.
 * When the driver's read() gets called, this list is walked and
 * all events that can fit in the buffer get delivered.
 */
struct inotify_event {
struct list_head list; /* links into the device instance's event queue */
unsigned long i_no; /* inode number the event refers to */
unsigned long i_dev; /* encoded device number of that inode */
unsigned long mask; /* IN_* bits describing what happened */
};
/* Map an embedded list_head back to its containing struct inotify_event. */
#define list_to_inotify_event(pos) list_entry((pos), struct inotify_event, list)



/*
 * Per-open state. For each inotify device instance we need to keep a list
 * of events queued on it as well as a list of inodes that it is watching.
 * Allocated in inotify_open() and freed in inotify_release().
 */
struct inotify_device {
struct list_head * events; /* separately allocated head of the struct inotify_event queue */
struct list_head * watch; /* separately allocated head of the struct inotify_watch list */
atomic_t watch_count; /* number of entries on the watch list */
char read_state; /* one of INOTIFY_READ_STATE_*; set to INOTIFY_READ_STATE_INODE at open */
wait_queue_head_t wait; /* wait queue handed to poll_wait() by inotify_poll() */
};

/*
 * Look up the watcher that device instance @dev has registered on @inode.
 *
 * Returns the matching entry from inode->watchers, or NULL when @dev is not
 * watching @inode.
 */
static struct inotify_inode_watcher *inotify_inode_find_watcher (struct inotify_device *dev, struct inode *inode)
{
	struct list_head *pos;

	list_for_each (pos, &inode->watchers) {
		struct inotify_inode_watcher *candidate;

		candidate = list_entry (pos, struct inotify_inode_watcher, list);

		/* A watcher belongs to the device stored in its private_data */
		if (candidate->private_data != dev)
			continue;

		return candidate;
	}

	return NULL;
}

static void inotify_inode_add_watcher (struct inotify_device *dev, struct inode *inode, unsigned long mask)
{
struct inotify_inode_watcher *watcher;

watcher = inotify_inode_find_watcher (dev, inode);

/* Check if we are already watching this inode */
if (NULL != watcher) {
watcher->mask = mask;
} else {
watcher = kmalloc (sizeof (struct inotify_inode_watcher), GFP_KERNEL);
watcher->private_data = dev;
watcher->inode = inode;
watcher->mask = mask;

atomic_inc (&inode->watcher_count);
list_add (&watcher->list, &inode->watchers);
}

inotify_inode_update_mask (inode);
}

/*
 * Remove the watcher that @dev has registered on @inode, if there is one.
 *
 * The watcher is unlinked from inode->watchers and freed, the inode's
 * watcher_count is decremented, and watchers_mask is recomputed so that it
 * no longer advertises events that only the removed watcher wanted.
 */
static void inotify_inode_rm_watcher (struct inotify_device *dev, struct inode *inode)
{
	struct inotify_inode_watcher *watcher;

	watcher = inotify_inode_find_watcher (dev, inode);

	if (NULL != watcher) {
		list_del(&watcher->list);
		kfree (watcher);

		atomic_dec (&inode->watcher_count);

		/* Keep the cached mask in step with the remaining watchers */
		inotify_inode_update_mask (inode);
	}
}

/*
 * Find the entry for @inode on @dev's watch list.
 *
 * Returns the entry, or NULL when @dev has no entry for @inode.
 */
static struct inotify_watch *inotify_dev_find_watcher (struct inotify_device *dev, struct inode *inode)
{
	struct list_head *pos;

	list_for_each (pos, dev->watch) {
		struct inotify_watch *entry;

		entry = list_entry (pos, struct inotify_watch, list);
		if (entry->inode != inode)
			continue;

		return entry;
	}

	return NULL;
}

static void inotify_dev_add_watcher (struct inotify_device *dev, struct inode *inode)
{
struct inotify_watch *watch;

watch = inotify_dev_find_watcher (dev, inode);

if (watch == NULL)
{
watch = kmalloc (sizeof (struct inotify_watch), GFP_KERNEL);
watch->inode = inode;

atomic_inc (&dev->watch_count);
list_add (&watch->list, dev->watch);
}
}

/*
 * Drop @inode from @dev's watch list.
 *
 * When an entry for @inode exists it is unlinked and freed, and the
 * device's watch_count is decremented. Otherwise nothing happens.
 */
static void inotify_dev_rm_watcher (struct inotify_device *dev, struct inode *inode)
{
	struct inotify_watch *entry = inotify_dev_find_watcher (dev, inode);

	if (entry == NULL)
		return;

	list_del (&entry->list);
	kfree (entry);

	atomic_dec (&dev->watch_count);
}

/*
 * Report whether @dev has at least one queued event.
 *
 * Returns 1 when the event list head exists and is non-empty, 0 otherwise.
 */
static char inotify_dev_has_events (struct inotify_device *dev)
{
	if (!dev->events)
		return 0;

	return !list_empty(dev->events);
}

static void inotify_dev_event_dequeue (struct inotify_device *dev)
{
struct inotify_event *event;

if (list_empty (dev->events)) {
return;
}

/* Get the first entry from the event queue */
event = list_to_inotify_event (dev->events->next);
list_del (&event->list);

kfree (event);
}

static void inotify_dev_event_queue (struct inotify_device *dev, unsigned long i_no, unsigned long i_dev, unsigned long mask)
{
struct inotify_event *event;

event = kmalloc (sizeof (struct inotify_event), GFP_KERNEL);
event->i_no = i_no;
event->i_dev = i_dev;
event->mask = mask;

list_add_tail (&event->list, dev->events);
}

/* Kernel API */

/*
 * Deliver an event on @inode to every interested watcher.
 *
 * @mask holds the IN_* bits of the event. The inode's combined
 * watchers_mask is tested first so that inodes nobody cares about are
 * skipped without walking the list. Each watcher whose own mask intersects
 * @mask gets the event queued on its device instance, carrying the inode
 * number and the encoded device number of the inode's superblock.
 */
void inotify_inode_queue_event (struct inode *inode, unsigned long mask)
{
	struct list_head *pos;

	/* Nobody watching this inode wants any of these events */
	if ((inode->watchers_mask & mask) == 0)
		return;

	list_for_each (pos, &inode->watchers) {
		struct inotify_inode_watcher *w;

		w = list_entry (pos, struct inotify_inode_watcher, list);
		if ((w->mask & mask) == 0)
			continue;

		inotify_dev_event_queue (w->private_data, inode->i_ino, new_encode_dev (inode->i_sb->s_dev), mask);
	}
}
EXPORT_SYMBOL_GPL(inotify_inode_queue_event);

/*
 * Queue an event with @mask on the inode of @dentry's parent.
 *
 * Based on dnotify_parent: d_lock is held only long enough to read
 * d_parent and pin it with dget(). The event is queued and the reference
 * dropped after the lock is released, because the queueing path allocates
 * memory and walks lists, and dput() must not be called while holding a
 * dentry spinlock.
 */
void inotify_dentry_queue_event (struct dentry *dentry, unsigned long mask)
{
	struct dentry *parent;

	spin_lock(&dentry->d_lock);
	parent = dentry->d_parent;
	dget (parent);
	spin_unlock(&dentry->d_lock);

	inotify_inode_queue_event (parent->d_inode, mask);
	dput (parent);
}
EXPORT_SYMBOL_GPL(inotify_dentry_queue_event);


/*
 * Recompute @inode's watchers_mask.
 *
 * The new value is the OR of the masks of every watcher currently linked
 * on inode->watchers, which is 0 when the list is empty.
 */
void inotify_inode_update_mask (struct inode *inode)
{
	unsigned long combined = 0;
	struct list_head *pos;

	list_for_each (pos, &inode->watchers) {
		combined |= list_entry (pos, struct inotify_inode_watcher, list)->mask;
	}

	inode->watchers_mask = combined;
}
EXPORT_SYMBOL_GPL(inotify_inode_update_mask);

/* The driver interface is implemented below */

/*
 * Values for inotify_device.read_state. inotify_open() still initialises
 * the field, so the names are kept; inotify_read() now delivers whole
 * records and no longer steps through the states.
 */
#define INOTIFY_READ_STATE_INODE 0
#define INOTIFY_READ_STATE_DEV 1
#define INOTIFY_READ_STATE_MASK 2

/*
 * Deliver queued events to user space.
 *
 * Each event is written as three consecutive unsigned longs: inode number,
 * inode device number, event mask. Only whole records are delivered, as
 * many as fit in @count bytes, and each delivered event is removed from
 * the queue.
 *
 * Returns the number of bytes written, 0 when no event is queued or @count
 * is 0, -EINVAL when @count is too small for a single record, and -EFAULT
 * when @buf is not writable and nothing has been delivered yet.
 *
 * The earlier field-by-field put_user() through a char pointer stored one
 * byte per field and returned a count of fields, not bytes.
 */
static ssize_t inotify_read(struct file *file, char *buf,
			    size_t count, loff_t *pos)
{
	struct inotify_device *dev;
	unsigned long record[3];
	size_t out;

	out = 0;

	dev = file->private_data;

	if (count == 0 || !inotify_dev_has_events (dev)) {
		return 0;
	}

	if (count < sizeof (record)) {
		return -EINVAL;
	}

	while (count - out >= sizeof (record) && inotify_dev_has_events (dev)) {
		struct inotify_event *event;

		event = list_to_inotify_event (dev->events->next);

		record[0] = event->i_no;
		record[1] = event->i_dev;
		record[2] = event->mask;

		if (copy_to_user (buf + out, record, sizeof (record))) {
			/* Report what was already delivered, if anything */
			if (out > 0)
				break;
			return -EFAULT;
		}

		/* The complete event has been delivered, so dequeue it */
		inotify_dev_event_dequeue (dev);

		out += sizeof (record);
	}

	return out;
}

/*
 * poll() handler.
 *
 * Registers the caller on the instance's wait queue and returns
 * POLLIN | POLLRDNORM when at least one event is queued, 0 otherwise.
 */
static unsigned int inotify_poll(struct file *file, poll_table *wait)
{
	struct inotify_device *dev = file->private_data;
	unsigned int ready = 0;

	poll_wait (file, &dev->wait, wait);

	if (inotify_dev_has_events (dev))
		ready = POLLIN | POLLRDNORM;

	return ready;
}

static int inotify_open(struct inode *inode, struct file *file) {
struct inotify_device *dev;

if (watcher_count == MAX_WATCHER_COUNT)
return -ENODEV;

watcher_count++;

dev = kmalloc (sizeof (struct inotify_device), GFP_KERNEL);
dev->events = kmalloc (sizeof (struct list_head), GFP_KERNEL);
INIT_LIST_HEAD(dev->events);
dev->watch = kmalloc (sizeof (struct list_head), GFP_KERNEL);
INIT_LIST_HEAD(dev->watch);
atomic_set(&dev->watch_count, 0);
dev->read_state = INOTIFY_READ_STATE_INODE;
init_waitqueue_head (&dev->wait);

file->private_data = dev;

return 0;
}

static void inotify_release_all_watchers (struct inotify_device *dev)
{
struct inotify_watch *watch;

list_for_each_entry (watch, dev->watch, list) {
/* Remove the watcher from the inode */
inotify_inode_rm_watcher (dev, watch->inode);
/* Remove the inode from the device */
inotify_dev_rm_watcher (dev, watch->inode);

}
}

/*
 * Throw away every event still queued on @dev, oldest first.
 */
static void inotify_release_all_events (struct inotify_device *dev)
{
	for (;;) {
		if (!inotify_dev_has_events (dev))
			break;

		inotify_dev_event_dequeue (dev);
	}
}


static int inotify_release(struct inode *inode, struct file *file)
{
if (file->private_data) {
struct inotify_device *dev;

dev = (struct inotify_device *)file->private_data;

inotify_release_all_watchers (dev);
kfree (dev->watch);

inotify_release_all_events (dev);
kfree (dev->events);

kfree (dev);
}

watcher_count--;
return 0;
}

/*
 * Translate the (device, inode number) pair in @packet into an inode.
 *
 * The superblock is found from the decoded device number and the inode is
 * looked up on it with ilookup(). Returns NULL when either the superblock
 * or the inode is not found.
 *
 * NOTE(review): the reference taken by ilookup() is dropped before the
 * pointer is returned, so the caller receives an inode it holds no
 * reference on.
 */
static struct inode *inotify_find_inode (struct inotify_packet *packet)
{
	struct super_block *sb;
	struct inode *found = NULL;

	sb = user_get_super (new_decode_dev(packet->i_dev));

	if (sb) {
		found = ilookup (sb, packet->i_no);

		drop_super(sb);

		/* ilookup takes a reference on the inode, release it */
		if (found)
			iput (found);
	}

	return found;
}

/*
 * INOTIFY_WATCH: start watching the inode named by @packet on behalf of
 * @dev, for the events in packet->mask.
 *
 * Returns 0 on success or -ENOENT when the inode cannot be found.
 */
static int inotify_watch(struct inotify_device *dev, struct inotify_packet *packet)
{
	struct inode *target = inotify_find_inode (packet);

	if (!target)
		return -ENOENT;

	inotify_inode_add_watcher (dev, target, packet->mask);
	inotify_dev_add_watcher (dev, target);

	return 0;
}

/*
 * INOTIFY_IGNORE: stop watching the inode named by @packet on behalf of
 * @dev.
 *
 * Returns 0 on success or -ENOENT when the inode cannot be found.
 */
static int inotify_ignore(struct inotify_device *dev, struct inotify_packet *packet)
{
	struct inode *target = inotify_find_inode (packet);

	if (!target)
		return -ENOENT;

	inotify_inode_rm_watcher (dev, target);
	inotify_dev_rm_watcher (dev, target);

	return 0;
}

static int inotify_ioctl(struct inode *ip, struct file *fp,
unsigned int cmd, unsigned long arg) {
int err;
struct inotify_device *dev;
struct inotify_packet *packet;

dev = fp->private_data;
err = 0;

if (_IOC_TYPE(cmd) != INOTIFY_IOCTL_MAGIC) return -EINVAL;
if (_IOC_NR(cmd) > INOTIFY_IOCTL_MAXNR) return -EINVAL;

if (_IOC_DIR(cmd) & _IOC_READ)
err = !access_ok (VERIFY_WRITE, (void *)arg, _IOC_SIZE(cmd));
if (_IOC_DIR(cmd) & _IOC_WRITE)
err = !access_ok (VERIFY_READ, (void *)arg, _IOC_SIZE(cmd));

if (err) return -EFAULT;

packet = kmalloc (sizeof (struct inotify_packet), GFP_KERNEL);

if (copy_from_user (packet, (void *)arg, sizeof (struct inotify_packet))) {
err = -EFAULT;
goto out;
}

err = -EINVAL;

switch (cmd) {
case INOTIFY_WATCH:
printk(KERN_ALERT "inotify WATCH: %ld %ld %ld\n", packet->i_no, packet->i_dev, packet->mask);
err = inotify_watch (dev, packet);
break;
case INOTIFY_IGNORE:
printk(KERN_ALERT "inotify IGNORE: %ld %ld\n", packet->i_no, packet->i_dev);
err = inotify_ignore (dev, packet);
break;
}
out:
kfree (packet);
return err;
}

/*
 * File operations of the inotify device: events are consumed with read(),
 * readiness is reported through poll(), and watches are added or removed
 * with ioctl(). Per-open state is created in open() and destroyed in
 * release().
 */
static struct file_operations inotify_fops = {
.owner = THIS_MODULE,
.read = inotify_read,
.poll = inotify_poll,
.open = inotify_open,
.release = inotify_release,
.ioctl = inotify_ioctl,
};

/*
 * Misc-device record for the driver: minor INOTIFY_MINOR, name "inotify".
 * Registered in inotify_init() and deregistered in inotify_exit().
 * The variable shares its name with the struct inotify_device tag; in C
 * the two live in different name spaces.
 */
struct miscdevice inotify_device = {
.minor = INOTIFY_MINOR,
.name = "inotify",
.fops = &inotify_fops,
};


/*
 * Module entry point: register the misc device.
 *
 * Returns 0 on success, after logging a startup message, or the error
 * returned by misc_register().
 */
static int __init inotify_init (void)
{
	int ret = misc_register (&inotify_device);

	if (ret == 0)
		printk(KERN_ALERT "inotify 0.1 startup\n");

	return ret;
}

static void inotify_exit (void)
{
misc_deregister (&inotify_device);
printk(KERN_ALERT "inotify 0.1 shutdown\n");
}


MODULE_AUTHOR("John McCutchan <ttb@xxxxxxxxxxxxxxxx>");
MODULE_DESCRIPTION("Inode event driver");
MODULE_LICENSE("GPL");

module_init (inotify_init);
module_exit (inotify_exit);