Re: [RFC PATCH -v4 07/14] fsnotify: add in inode fsnotify markings

From: Evgeniy Polyakov
Date: Fri Dec 12 2008 - 22:07:30 EST


On Fri, Dec 12, 2008 at 04:51:51PM -0500, Eric Paris (eparis@xxxxxxxxxx) wrote:
> +void fsnotify_put_mark(struct fsnotify_mark_entry *entry)
> +{
> + if (atomic_dec_and_test(&entry->refcnt)) {
> + spin_lock(&entry->lock);
> + /* entries can only be found by the kernel by searching the
> + * inode->i_fsnotify_entries or the group->mark_entries lists.
> + * if freeme is set that means this entry is off both lists.
> + * if refcnt is 0 that means we are the last thing still
> + * looking at this entry, so its time to free.
> + */
> + if (!atomic_read(&entry->refcnt) && entry->freeme) {

Why do you check refcnt again? Does it mean its life cycle does not end
when it hits zero and should be freed only under lock, which in turn
means it may be accessed under that lock and zero refcnt? Please
describe this locking in more details.

> + spin_unlock(&entry->lock);
> + fsnotify_destroy_mark(entry);
> + return;
> + }
> + spin_unlock(&entry->lock);
> + }
> +}
> +
> +void fsnotify_clear_marks_by_group(struct fsnotify_group *group)
> +{
> + struct fsnotify_mark_entry *lentry, *entry;
> + struct inode *inode;
> + LIST_HEAD(free_list);
> +
> + spin_lock(&group->mark_lock);
> + list_for_each_entry_safe(entry, lentry, &group->mark_entries, g_list) {
> + list_del_init(&entry->g_list);
> + list_add(&entry->free_g_list, &free_list);
> + }
> + spin_unlock(&group->mark_lock);
> +
> + list_for_each_entry_safe(entry, lentry, &free_list, free_g_list) {
> + fsnotify_get_mark(entry);
> + spin_lock(&entry->lock);
> + inode = entry->inode;
> + if (!inode) {

This inode does not seem to be grabbed previously, or I missed that
part? What prevents it from being freed?

> + entry->group = NULL;
> + spin_unlock(&entry->lock);
> + fsnotify_put_mark(entry);
> + continue;
> + }
> + spin_lock(&inode->i_lock);
> +
> + list_del_init(&entry->i_list);
> + entry->inode = NULL;
> + list_del_init(&entry->g_list);
> + entry->group = NULL;
> + entry->freeme = 1;
> +
> + spin_unlock(&inode->i_lock);
> + spin_unlock(&entry->lock);
> +
> + fsnotify_put_mark(entry);
> + }
> +}

> +void fsnotify_clear_marks_by_inode(struct inode *inode, unsigned int flags)
> +{
> + struct fsnotify_mark_entry *lentry, *entry;
> + LIST_HEAD(free_list);
> +
> + spin_lock(&inode->i_lock);
> + list_for_each_entry_safe(entry, lentry, &inode->i_fsnotify_mark_entries, i_list) {
> + list_del_init(&entry->i_list);
> + list_add(&entry->free_i_list, &free_list);
> + }
> + spin_unlock(&inode->i_lock);
> +
> + /*
> + * at this point destroy_by_* might race.
> + *
> + * we used list_del_init() so it can be list_del_init'd again, no harm.
> + * we were called from an inode function so we know that other user can
> + * try to grab entry->inode->i_lock without a problem.
> + */
> + list_for_each_entry_safe(entry, lentry, &free_list, free_i_list) {
> + fsnotify_get_mark(entry);
> + entry->group->ops->mark_clear_inode(entry, inode, flags);
> + fsnotify_put_mark(entry);
> + }

Since entry was not grabbed in the above locked list, what prevents it
to be freed before fsnotify_get_mark() executed? Could that get moved
under lock?

> +}
> +
> +/* caller must hold inode->i_lock */
> +struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *group, struct inode *inode)
> +{
> + struct fsnotify_mark_entry *entry;
> +
> + list_for_each_entry(entry, &inode->i_fsnotify_mark_entries, i_list) {
> + if (entry->group == group) {
> + fsnotify_get_mark(entry);
> + return entry;
> + }
> + }
> + return NULL;
> +}
> +/*

Missed newline before comment's start :)

> + * This is a low use function called when userspace is changing what is being
> + * watched. I don't mind doing the allocation since I'm assuming we will have
> + * more new events than we have adding to old events...
> + *
> + * add (we use |=) the mark to the in core inode mark, if you need to change
> + * rather than | some new bits you needs to fsnotify_destroy_mark_by_inode()
> + * then call this with all the right bits in the mask.
> + */
> +struct fsnotify_mark_entry *fsnotify_mark_add(struct fsnotify_group *group, struct inode *inode, __u64 mask)
> +{
> + /* we initialize entry to shut up the compiler in case we just to out... */
> + struct fsnotify_mark_entry *entry = NULL, *lentry;
> +
> + /* pre allocate an entry so we can hold the lock */
> + entry = fsnotify_alloc_mark();
> + if (!entry)
> + return NULL;
> +
> + /*
> + * LOCKING ORDER!!!!
> + * entry->lock
> + * group->mark_lock
> + * inode->i_lock
> + */
> + spin_lock(&group->mark_lock);
> + spin_lock(&inode->i_lock);
> + lentry = fsnotify_find_mark_entry(group, inode);
> + if (lentry) {
> + /* we didn't use the new entry, kill it */
> + fsnotify_destroy_mark(entry);
> + entry = lentry;
> + entry->mask |= mask;
> + goto out_unlock;
> + }
> +
> + spin_lock_init(&entry->lock);
> + atomic_set(&entry->refcnt, 1);
> + entry->group = group;
> + entry->mask = mask;
> + entry->inode = inode;

That's what I talked about previously, what if this inode will be
released after inode unlocked?

> + entry->freeme = 0;
> + entry->private = NULL;
> + entry->free_private = group->ops->free_mark_priv;
> +
> + list_add(&entry->i_list, &inode->i_fsnotify_mark_entries);
> + list_add(&entry->g_list, &group->mark_entries);
> +
> +out_unlock:
> + spin_unlock(&inode->i_lock);
> + spin_unlock(&group->mark_lock);
> + return entry;
> +}
> +
> +void fsnotify_recalc_inode_mask(struct inode *inode)
> +{
> + unsigned long new_mask = 0;
> + struct fsnotify_mark_entry *entry;
> +
> + spin_lock(&inode->i_lock);
> + list_for_each_entry(entry, &inode->i_fsnotify_mark_entries, i_list) {
> + new_mask |= entry->mask;
> + }
> + inode->i_fsnotify_mask = new_mask;
> + spin_unlock(&inode->i_lock);
> +}
> +
> +
> +__init int fsnotify_mark_init(void)
> +{
> + fsnotify_mark_kmem_cache = kmem_cache_create("fsnotify_mark_entry", sizeof(struct fsnotify_mark_entry), 0, SLAB_PANIC, NULL);
> +
> + return 0;
> +}
> +subsys_initcall(fsnotify_mark_init);
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 4a853ef..b5a7bce 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -665,6 +665,11 @@ struct inode {
>
> __u32 i_generation;
>
> +#ifdef CONFIG_FSNOTIFY
> + __u64 i_fsnotify_mask; /* all events this inode cares about */
> + struct list_head i_fsnotify_mark_entries; /* fsnotify mark entries */
> +#endif
> +
> #ifdef CONFIG_DNOTIFY
> unsigned long i_dnotify_mask; /* Directory notify events */
> struct dnotify_struct *i_dnotify; /* for directory notifications */
> diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
> index b084b98..c2ed916 100644
> --- a/include/linux/fsnotify.h
> +++ b/include/linux/fsnotify.h
> @@ -99,6 +99,14 @@ static inline void fsnotify_nameremove(struct dentry *dentry, int isdir)
> }
>
> /*
> + * fsnotify_inode_delete - and inode is being evicted from cache, clean up is needed
> + */
> +static inline void fsnotify_inode_delete(struct inode *inode)
> +{
> + __fsnotify_inode_delete(inode, FSNOTIFY_INODE_DESTROY);
> +}
> +
> +/*
> * fsnotify_inoderemove - an inode is going away
> */
> static inline void fsnotify_inoderemove(struct inode *inode)
> @@ -107,6 +115,7 @@ static inline void fsnotify_inoderemove(struct inode *inode)
> inotify_inode_is_dead(inode);
>
> fsnotify(inode, FS_DELETE_SELF, inode, FSNOTIFY_EVENT_INODE);
> + __fsnotify_inode_delete(inode, FSNOTIFY_LAST_DENTRY);
> }
>
> /*
> diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
> index 924902e..0482a14 100644
> --- a/include/linux/fsnotify_backend.h
> +++ b/include/linux/fsnotify_backend.h
> @@ -13,6 +13,7 @@
> #include <linux/fs.h>
> #include <linux/list.h>
> #include <linux/mutex.h>
> +#include <linux/spinlock.h>
> #include <linux/wait.h>
>
> #include <asm/atomic.h>
> @@ -68,13 +69,21 @@
> #define FSNOTIFY_EVENT_FILE 1
> #define FSNOTIFY_EVENT_INODE 2
>
> +/* these tell __fsnotify_inode_delete what kind of event this is */
> +#define FSNOTIFY_LAST_DENTRY 1
> +#define FSNOTIFY_INODE_DESTROY 2
> +
> struct fsnotify_group;
> struct fsnotify_event;
> +struct fsnotify_mark_entry;
>
> struct fsnotify_ops {
> int (*event_to_notif)(struct fsnotify_group *group, struct fsnotify_event *event);
> + void (*mark_clear_inode)(struct fsnotify_mark_entry *entry, struct inode *inode, unsigned int flags);
> + int (*should_send_event)(struct fsnotify_group *group, struct inode *inode, __u64 mask);
> void (*free_group_priv)(struct fsnotify_group *group);
> void (*free_event_priv)(struct fsnotify_group *group, struct fsnotify_event *event);
> + void (*free_mark_priv)(struct fsnotify_mark_entry *entry);
> };
>
> struct fsnotify_group {
> @@ -85,6 +94,10 @@ struct fsnotify_group {
>
> const struct fsnotify_ops *ops; /* how this group handles things */
>
> + /* stores all fastapth entries assoc with this group so they can be cleaned on unregister */
> + spinlock_t mark_lock; /* protect mark_entries list */
> + struct list_head mark_entries; /* all inode mark entries for this group */
> +
> unsigned int priority; /* order this group should receive msgs. low first */
>
> void *private; /* private data for implementers (dnotify, inotify, fanotify) */
> @@ -94,17 +107,26 @@ struct fsnotify_group {
>
> /* called from the vfs to signal fs events */
> extern void fsnotify(struct inode *to_tell, __u64 mask, void *data, int data_is);
> +extern void __fsnotify_inode_delete(struct inode *inode, int flag);
>
> /* called from fsnotify interfaces, such as fanotify or dnotify */
> extern void fsnotify_recalc_global_mask(void);
> +extern void fsnotify_recalc_group_mask(struct fsnotify_group *group);
> extern struct fsnotify_group *fsnotify_obtain_group(unsigned int priority, unsigned int group_num, __u64 mask, const struct fsnotify_ops *ops);
> extern void fsnotify_put_group(struct fsnotify_group *group);
> extern void fsnotify_get_group(struct fsnotify_group *group);
>
> +extern void fsnotify_recalc_inode_mask(struct inode *inode);
> +extern struct fsnotify_mark_entry *fsnotify_find_mark_entry(struct fsnotify_group *group, struct inode *inode);
> +extern struct fsnotify_mark_entry *fsnotify_mark_add(struct fsnotify_group *group, struct inode *inode, __u64 mask);
> #else
>
> static inline void fsnotify(struct inode *to_tell, __u64 mask, void *data, int data_is);
> {}
> +
> +static inline void __fsnotify_inode_delete(struct inode *inode, int flag)
> +{}
> +
> #endif /* CONFIG_FSNOTIFY */
>
> #endif /* __KERNEL __ */

--
Evgeniy Polyakov
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/