Re: [PATCH 8/9] inode: convert per-sb inode list to a list_lru

From: Jan Kara
Date: Mon Mar 16 2015 - 11:48:23 EST


On Mon 16-03-15 11:34:41, Josef Bacik wrote:
...
> >>diff --git a/fs/block_dev.c b/fs/block_dev.c
> >>index 2eb2436..d23ce6f 100644
> >>--- a/fs/block_dev.c
> >>+++ b/fs/block_dev.c
> >>@@ -1749,38 +1749,56 @@ int __invalidate_device(struct block_device *bdev, bool kill_dirty)
> >> }
> >> EXPORT_SYMBOL(__invalidate_device);
> >>
> >>-void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
> >>-{
> >>- struct inode *inode, *old_inode = NULL;
> >>+struct bdev_iter {
> >>+ void (*func)(struct block_device *, void *);
> >>+ void *arg;
> >>+ struct inode *toput_inode;
> >>+};
> >>
> >>- spin_lock(&blockdev_superblock->s_inode_list_lock);
> >>- list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
> >>- struct address_space *mapping = inode->i_mapping;
> >>+static enum lru_status
> >>+bdev_iter_cb(struct list_head *item, struct list_lru_one *lru,
> >>+ spinlock_t *lock, void *cb_arg)
> >>+{
> >>+ struct bdev_iter *iter = cb_arg;
> >>+ struct inode *inode = container_of(item, struct inode, i_sb_list);
> >>
> >>- spin_lock(&inode->i_lock);
> >>- if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
> >>- mapping->nrpages == 0) {
> >>- spin_unlock(&inode->i_lock);
> >>- continue;
> >>- }
> >>- __iget(inode);
> >>+ spin_lock(&inode->i_lock);
> >>+ if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
> >>+ inode->i_mapping->nrpages == 0) {
> >> spin_unlock(&inode->i_lock);
> >>- spin_unlock(&blockdev_superblock->s_inode_list_lock);
> >>- /*
> >>- * We hold a reference to 'inode' so it couldn't have been
> >>- * removed from s_inodes list while we dropped the
> >>- * s_inode_list_lock We cannot iput the inode now as we can
> >>- * be holding the last reference and we cannot iput it under
> >>- * s_inode_list_lock. So we keep the reference and iput it
> >>- * later.
> >>- */
> >>- iput(old_inode);
> >>- old_inode = inode;
> >>+ return LRU_SKIP;
> >>+ }
> >>+ __iget(inode);
> >>+ spin_unlock(&inode->i_lock);
> >>+ spin_unlock(lock);
> >>
> >>- func(I_BDEV(inode), arg);
> >>+ iput(iter->toput_inode);
> >>+ iter->toput_inode = inode;
> >>
> >>- spin_lock(&blockdev_superblock->s_inode_list_lock);
> >>- }
> >>- spin_unlock(&blockdev_superblock->s_inode_list_lock);
> >>- iput(old_inode);
> >>+ iter->func(I_BDEV(inode), iter->arg);
> >>+
> >>+ /*
> >>+ * Even though we have dropped the lock here, we can return LRU_SKIP as
> >>+ * we have a reference to the current inode and so it's next pointer is
> >>+ * guaranteed to be valid even though we dropped the list lock.
> >>+ */
> >>+ spin_lock(lock);
> >>+ return LRU_SKIP;
> >>+}
> > So I actually think doing the iteration like this is buggy with list_lru
> >code. When we pin inode by grabbing i_count, we are only sure the inode
> >won't go away under us. However there's nothing which protects from list
> >modifications. So this method is only safe to be combined with
> >list_for_each[_entry](). Specifically it does *not* work when combined with
> >list_for_each_safe() which __list_lru_walk_one() uses because the next
> >pointer that is cached may become invalid once we drop the lock. And I'd
> >really hate to try to make these two work together - that would seem really
> >fragile since subtle changes to how lru_list iteration work could break
> >inode iteration.
> >
> >This is true for all the iterations over the inodes that are playing with
> >i_count. It's not just about this particular case of blockdev iteration.
> >
> >>diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
> >>index a4e1a8f..a0cdc66 100644
> >>--- a/fs/notify/inode_mark.c
> >>+++ b/fs/notify/inode_mark.c
> >>@@ -161,87 +161,60 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark,
> >> return ret;
> >> }
> >>
> >>-/**
> >>- * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes.
> >>- * @sb: superblock being unmounted.
> >>- *
> >>- * Called during unmount with no locks held, so needs to be safe against
> >>- * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block.
> >>- */
> >>-void fsnotify_unmount_inodes(struct super_block *sb)
> >>-{
> >>- struct inode *inode, *next_i, *need_iput = NULL;
> >>-
> >>- spin_lock(&sb->s_inode_list_lock);
> >>- list_for_each_entry_safe(inode, next_i, &sb->s_inodes, i_sb_list) {
> >>- struct inode *need_iput_tmp;
> >>-
> >>- /*
> >>- * We cannot __iget() an inode in state I_FREEING,
> >>- * I_WILL_FREE, or I_NEW which is fine because by that point
> >>- * the inode cannot have any associated watches.
> >>- */
> >>- spin_lock(&inode->i_lock);
> >>- if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
> >>- spin_unlock(&inode->i_lock);
> >>- continue;
> >>- }
> >>+static enum lru_status
> >>+fsnotify_unmount_inode(struct list_head *item, struct list_lru_one *lru,
> >>+ spinlock_t *lock, void *cb_arg)
> >>+ {
> >>+ struct inode **toput_inode = cb_arg;
> >>+ struct inode *inode = container_of(item, struct inode, i_sb_list);
> >>+
> >>+ /* New or being freed inodes cannot have any associated watches. */
> >>+ spin_lock(&inode->i_lock);
> >>+ if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) {
> >>+ spin_unlock(&inode->i_lock);
> >>+ return LRU_SKIP;
> >>+ }
> >>
> >>- /*
> >>- * If i_count is zero, the inode cannot have any watches and
> >>- * doing an __iget/iput with MS_ACTIVE clear would actually
> >>- * evict all inodes with zero i_count from icache which is
> >>- * unnecessarily violent and may in fact be illegal to do.
> >>- */
> >>- if (!atomic_read(&inode->i_count)) {
> >>- spin_unlock(&inode->i_lock);
> >>- continue;
> >>- }
> >>-
> >>- need_iput_tmp = need_iput;
> >>- need_iput = NULL;
> >>-
> >>- /* In case fsnotify_inode_delete() drops a reference. */
> >>- if (inode != need_iput_tmp)
> >>- __iget(inode);
> >>- else
> >>- need_iput_tmp = NULL;
> >>+ /* If i_count is zero, the inode cannot have any watches */
> >>+ if (!atomic_read(&inode->i_count)) {
> >> spin_unlock(&inode->i_lock);
> >>+ return LRU_SKIP;
> >>+ }
> >>
> >>- /* In case the dropping of a reference would nuke next_i. */
> >>- while (&next_i->i_sb_list != &sb->s_inodes) {
> >>- spin_lock(&next_i->i_lock);
> >>- if (!(next_i->i_state & (I_FREEING | I_WILL_FREE)) &&
> >>- atomic_read(&next_i->i_count)) {
> >>- __iget(next_i);
> >>- need_iput = next_i;
> >>- spin_unlock(&next_i->i_lock);
> >>- break;
> >>- }
> >>- spin_unlock(&next_i->i_lock);
> >>- next_i = list_entry(next_i->i_sb_list.next,
> >>- struct inode, i_sb_list);
> >>- }
> >>+ __iget(inode);
> >>+ spin_unlock(&inode->i_lock);
> >>+ spin_unlock(lock);
> >>
> >>- /*
> >>- * We can safely drop s_inode_list_lock here because either
> >>- * we actually hold references on both inode and next_i or
> >>- * end of list. Also no new inodes will be added since the
> >>- * umount has begun.
> >>- */
> >>- spin_unlock(&sb->s_inode_list_lock);
> >>+ iput(*toput_inode);
> >>+ *toput_inode = inode;
> >>
> >>- if (need_iput_tmp)
> >>- iput(need_iput_tmp);
> >>+ /* for each watch, send FS_UNMOUNT and then remove it */
> >>+ fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
> >>+ fsnotify_inode_delete(inode);
> >>
> >>- /* for each watch, send FS_UNMOUNT and then remove it */
> >>- fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0);
> >>+ /*
> >>+ * Even though we have dropped the lock here, we can return LRU_SKIP as
> >>+ * we have a reference to the current inode and so it's next pointer is
> >>+ * guaranteed to be valid even though we dropped the list lock.
> >>+ */
> >>+ spin_lock(lock);
> >>+ return LRU_SKIP;
> >>+}
> > So in the original code of fsnotify_unmount_inodes() you can see the
> >hoops you have to jump through when having to iterate inode list with
> >list_for_each_entry_safe(). Deleting that code without deeper thought
> >wasn't a good idea ;).
> >
> >As a side note, it should be possible to convert this code to use just
> >list_for_each() and avoid all that crap but it will require me to delve into
> >fsnotify magic again to verify it's correct. I guess I'll leave that after
> >your patches are queued so that I don't make life harder to you.
> >
>
> Well these two cases are just bugs, we are supposed to return
> LRU_RETRY every time we drop the lru list lock so we loop back to
> the beginning. But I agree with your other points, I'll just drop
> this one and find/create a batched per-cpu list to use instead and
> do that separately so we can still get the meat of this patchset in.
I don't think you can just return LRU_RETRY. That way e.g. iterating
all bdevs to wait for IO would become an infinite loop. The trouble with
inode iteration is that we need to visit every inode and sleep on each one,
but we don't remove the inode from the list (at least not in all callers),
so forward progress isn't guaranteed.

Honza
--
Jan Kara <jack@xxxxxxx>
SUSE Labs, CR
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/