Re: [patch 0/9] mm: thrash detection-based file cache sizing v6

From: Johannes Weiner
Date: Wed Nov 27 2013 - 23:41:20 EST


Here are the changes to the series I have accumulated so far. Mainly:

o truncate_inode_pages_final() that sets mapping_set_exiting() and
uses ordered but unlocked nrshadows & nrpages reads to skip the tree
lock acquisition and IRQ disabling on empty page cache trees.

o revert all efforts to make the lru_lock IRQ-safe just to silence
lockdep. Also solves the problem of the list_lru_init() key API.

o in the shadow shrinker, drop the lru_lock after mapping->tree_lock
has been acquired. The latter pins the inode by preventing the
final truncate from removing shadow entries, so we can safely
release the lru lock once mapping->tree_lock is acquired and the
node is taken off the list.

o changed radix_tree_node member names and documented them better

o fixed typos

As we agreed to keep the shadow node lru management non-lazy for now,
we don't need to worry about the lifetime of radix tree nodes in the
shrinker beyond taking it off the lru list with the lru_lock and
mapping->tree_lock held. No complicated RCU scheme required.

---

diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index f089058..fc0de70 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -295,9 +295,9 @@ in the beginning of ->setattr unconditionally.
->clear_inode() and ->delete_inode() are gone; ->evict_inode() should
be used instead. It gets called whenever the inode is evicted, whether it has
remaining links or not. Caller does *not* evict the pagecache or inode-associated
-metadata buffers; getting rid of those is responsibility of method, as it had
-been for ->delete_inode(). Caller makes sure async writeback cannot be running
-for the inode while (or after) ->evict_inode() is called.
+metadata buffers; the method has to use truncate_inode_pages_final() to get rid
+of those. Caller makes sure async writeback cannot be running for the inode while
+(or after) ->evict_inode() is called.

->drop_inode() returns int now; it's called on final iput() with
inode->i_lock held and it returns true if filesystems wants the inode to be
diff --git a/drivers/staging/lustre/lustre/llite/llite_lib.c b/drivers/staging/lustre/lustre/llite/llite_lib.c
index b868c2b..79cbc9c 100644
--- a/drivers/staging/lustre/lustre/llite/llite_lib.c
+++ b/drivers/staging/lustre/lustre/llite/llite_lib.c
@@ -1817,7 +1817,7 @@ void ll_delete_inode(struct inode *inode)
cl_sync_file_range(inode, 0, OBD_OBJECT_EOF,
CL_FSYNC_DISCARD, 1);

- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);

/* Workaround for LU-118 */
if (inode->i_data.nrpages) {
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 94de6d1..e6716c2 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -444,7 +444,7 @@ void v9fs_evict_inode(struct inode *inode)
{
struct v9fs_inode *v9inode = V9FS_I(inode);

- truncate_inode_pages(inode->i_mapping, 0);
+ truncate_inode_pages_final(inode->i_mapping);
clear_inode(inode);
filemap_fdatawrite(inode->i_mapping);

diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 0e092d0..96df91e 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -259,7 +259,7 @@ affs_evict_inode(struct inode *inode)
{
unsigned long cache_page;
pr_debug("AFFS: evict_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink);
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);

if (!inode->i_nlink) {
inode->i_size = 0;
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 789bc25..2bbe60e 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -422,7 +422,7 @@ void afs_evict_inode(struct inode *inode)

ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode);

- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);

afs_give_up_callback(vnode);
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 8defc6b..29aa5cf 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -172,7 +172,7 @@ static void bfs_evict_inode(struct inode *inode)

dprintf("ino=%08lx\n", ino);

- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
invalidate_inode_buffers(inode);
clear_inode(inode);

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 391ffe5..c7a7def 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -419,7 +419,7 @@ static void bdev_evict_inode(struct inode *inode)
{
struct block_device *bdev = &BDEV_I(inode)->bdev;
struct list_head *p;
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
invalidate_inode_buffers(inode); /* is it needed here? */
clear_inode(inode);
spin_lock(&bdev_lock);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 51e3afa..d3e4983 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4471,7 +4471,7 @@ void btrfs_evict_inode(struct inode *inode)

trace_btrfs_inode_evict(inode);

- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
btrfs_is_free_space_inode(inode)))
goto no_delete;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 77fc5e1..d795c50 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -286,7 +286,7 @@ cifs_destroy_inode(struct inode *inode)
static void
cifs_evict_inode(struct inode *inode)
{
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
cifs_fscache_release_inode_cookie(inode);
}
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index 4dcc0d8..43a5b38 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -250,7 +250,7 @@ static void coda_put_super(struct super_block *sb)

static void coda_evict_inode(struct inode *inode)
{
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
coda_cache_clear_inode(inode);
}
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index e879cf8..afa1b81 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -132,7 +132,7 @@ static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf)
*/
static void ecryptfs_evict_inode(struct inode *inode)
{
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
iput(ecryptfs_inode_to_lower(inode));
}
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index a52a5d2..d9ff4d3 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -1479,7 +1479,7 @@ void exofs_evict_inode(struct inode *inode)
struct ore_io_state *ios;
int ret;

- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);

/* TODO: should do better here */
if (inode->i_nlink || is_bad_inode(inode))
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index c260de6..115fa58 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -78,7 +78,7 @@ void ext2_evict_inode(struct inode * inode)
dquot_drop(inode);
}

- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);

if (want_delete) {
sb_start_intwrite(inode->i_sb);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 2bd8548..153f4be 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -228,7 +228,7 @@ void ext3_evict_inode (struct inode *inode)
log_wait_commit(journal, commit_tid);
filemap_write_and_wait(&inode->i_data);
}
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);

ext3_discard_reservation(inode);
rsv = ei->i_block_alloc_info;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index e274e9c..3b75e70 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -214,7 +214,7 @@ void ext4_evict_inode(struct inode *inode)
jbd2_complete_transaction(journal, commit_tid);
filemap_write_and_wait(&inode->i_data);
}
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);

WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
goto no_delete;
@@ -225,7 +225,7 @@ void ext4_evict_inode(struct inode *inode)

if (ext4_should_order_data(inode))
ext4_begin_ordered_truncate(inode, 0);
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);

WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
if (is_bad_inode(inode))
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 9339cd2..0bd44f8 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -246,7 +246,7 @@ void f2fs_evict_inode(struct inode *inode)
int ilock;

trace_f2fs_evict_inode(inode);
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);

if (inode->i_ino == F2FS_NODE_INO(sbi) ||
inode->i_ino == F2FS_META_INO(sbi))
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 0062da2..fe802d8 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -490,7 +490,7 @@ EXPORT_SYMBOL_GPL(fat_build_inode);

static void fat_evict_inode(struct inode *inode)
{
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
if (!inode->i_nlink) {
inode->i_size = 0;
fat_truncate_blocks(inode, 0);
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index f47df72..363e3ae 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -354,7 +354,7 @@ static void vxfs_i_callback(struct rcu_head *head)
void
vxfs_evict_inode(struct inode *ip)
{
- truncate_inode_pages(&ip->i_data, 0);
+ truncate_inode_pages_final(&ip->i_data);
clear_inode(ip);
call_rcu(&ip->i_rcu, vxfs_i_callback);
}
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index a8ce6da..09d7fa0 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -123,7 +123,7 @@ static void fuse_destroy_inode(struct inode *inode)

static void fuse_evict_inode(struct inode *inode)
{
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
if (inode->i_sb->s_flags & MS_ACTIVE) {
struct fuse_conn *fc = get_fuse_conn(inode);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index e5639de..ac96a99 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1525,7 +1525,7 @@ out_unlock:
fs_warn(sdp, "gfs2_evict_inode: %d\n", error);
out:
/* Case 3 starts here */
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
gfs2_rs_delete(ip);
gfs2_ordered_del_inode(ip);
clear_inode(inode);
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index 380ab31..9e2fecd 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -547,7 +547,7 @@ out:

void hfs_evict_inode(struct inode *inode)
{
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
if (HFS_IS_RSRC(inode) && HFS_I(inode)->rsrc_inode) {
HFS_I(HFS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 4c4d142..b9436d9 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -161,7 +161,7 @@ static int hfsplus_write_inode(struct inode *inode,
static void hfsplus_evict_inode(struct inode *inode)
{
hfs_dbg(INODE, "hfsplus_evict_inode: %lu\n", inode->i_ino);
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
if (HFSPLUS_IS_RSRC(inode)) {
HFSPLUS_I(HFSPLUS_I(inode)->rsrc_inode)->rsrc_inode = NULL;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 2543728..0c9f640 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -239,7 +239,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb)

static void hostfs_evict_inode(struct inode *inode)
{
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
if (HOSTFS_I(inode)->fd != -1) {
close_file(&HOSTFS_I(inode)->fd);
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 9edeeb0..50a4273 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -304,7 +304,7 @@ void hpfs_write_if_changed(struct inode *inode)

void hpfs_evict_inode(struct inode *inode)
{
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
if (!inode->i_nlink) {
hpfs_lock(inode->i_sb);
diff --git a/fs/inode.c b/fs/inode.c
index 7858fb7..093864e 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -546,26 +546,10 @@ static void evict(struct inode *inode)
*/
inode_wait_for_writeback(inode);

- /*
- * Page reclaim can not do iput() and thus can race with the
- * inode teardown. Tell it when the address space is exiting,
- * so that it does not install eviction information after the
- * final truncate has begun.
- *
- * As truncation uses a lockless tree lookup, acquire the
- * spinlock to make sure any ongoing tree modification that
- * does not see AS_EXITING is completed before starting the
- * final truncate.
- */
- spin_lock_irq(&inode->i_data.tree_lock);
- mapping_set_exiting(&inode->i_data);
- spin_unlock_irq(&inode->i_data.tree_lock);
-
if (op->evict_inode) {
op->evict_inode(inode);
} else {
- if (inode->i_data.nrpages || inode->i_data.nrshadows)
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
}
if (S_ISBLK(inode->i_mode) && inode->i_bdev)
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index fe3c052..00ed6c6 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -241,7 +241,7 @@ void jffs2_evict_inode (struct inode *inode)

jffs2_dbg(1, "%s(): ino #%lu mode %o\n",
__func__, inode->i_ino, inode->i_mode);
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
jffs2_do_clear_inode(c, f);
}
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index f4aab71..6f8fe72 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -154,7 +154,7 @@ void jfs_evict_inode(struct inode *inode)
dquot_initialize(inode);

if (JFS_IP(inode)->fileset == FILESYSTEM_I) {
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);

if (test_cflag(COMMIT_Freewmap, inode))
jfs_free_zero_link(inode);
@@ -168,7 +168,7 @@ void jfs_evict_inode(struct inode *inode)
dquot_free_inode(inode);
}
} else {
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
}
clear_inode(inode);
dquot_drop(inode);
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 9a59cba..4814031 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -2180,7 +2180,7 @@ void logfs_evict_inode(struct inode *inode)
do_delete_inode(inode);
}
}
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);

/* Cheaper version of write_inode. All changes are concealed in
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 0332109..03aaeb1 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -26,7 +26,7 @@ static int minix_remount (struct super_block * sb, int * flags, char * data);

static void minix_evict_inode(struct inode *inode)
{
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
if (!inode->i_nlink) {
inode->i_size = 0;
minix_truncate(inode);
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 4659da6..e728061 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -296,7 +296,7 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
static void
ncp_evict_inode(struct inode *inode)
{
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);

if (S_ISDIR(inode->i_mode)) {
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index eda8879..fbc38a6 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -128,7 +128,7 @@ EXPORT_SYMBOL_GPL(nfs_clear_inode);

void nfs_evict_inode(struct inode *inode)
{
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
nfs_clear_inode(inode);
}
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index e26acdd..f2a5c44 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -98,7 +98,7 @@ static int nfs4_write_inode(struct inode *inode, struct writeback_control *wbc)
*/
static void nfs4_evict_inode(struct inode *inode)
{
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
pnfs_return_layout(inode);
pnfs_destroy_layout(NFS_I(inode));
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 42fcbe3..b9c5726 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -783,16 +783,14 @@ void nilfs_evict_inode(struct inode *inode)
int ret;

if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) {
- if (inode->i_data.nrpages || inode->i_data.nrshadows)
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
nilfs_clear_inode(inode);
return;
}
nilfs_transaction_begin(sb, &ti, 0); /* never fails */

- if (inode->i_data.nrpages || inode->i_data.nrshadows)
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);

/* TODO: some of the following operations may fail. */
nilfs_truncate_bmap(ii, 0);
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 2778b02..bd50adc1 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -2259,7 +2259,7 @@ void ntfs_evict_big_inode(struct inode *vi)
{
ntfs_inode *ni = NTFS_I(vi);

- truncate_inode_pages(&vi->i_data, 0);
+ truncate_inode_pages_final(&vi->i_data);
clear_inode(vi);

#ifdef NTFS_RW
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index f87f9bd..f1c46a7 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -951,7 +951,7 @@ static void ocfs2_cleanup_delete_inode(struct inode *inode,
(unsigned long long)OCFS2_I(inode)->ip_blkno, sync_data);
if (sync_data)
filemap_write_and_wait(inode->i_mapping);
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
}

static void ocfs2_delete_inode(struct inode *inode)
@@ -1167,7 +1167,7 @@ void ocfs2_evict_inode(struct inode *inode)
(OCFS2_I(inode)->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)) {
ocfs2_delete_inode(inode);
} else {
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
}
ocfs2_clear_inode(inode);
}
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index d8b0afd..ec58c76 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -183,7 +183,7 @@ int omfs_sync_inode(struct inode *inode)
*/
static void omfs_evict_inode(struct inode *inode)
{
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);

if (inode->i_nlink)
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 8eaa1ba..9ca0f08 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -35,7 +35,7 @@ static void proc_evict_inode(struct inode *inode)
const struct proc_ns_operations *ns_ops;
void *ns;

- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);

/* Stop tracking associated processes */
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index ad62bdbb..bc8b800 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -35,7 +35,7 @@ void reiserfs_evict_inode(struct inode *inode)
if (!inode->i_nlink && !is_bad_inode(inode))
dquot_initialize(inode);

- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
if (inode->i_nlink)
goto no_delete;

diff --git a/fs/super.c b/fs/super.c
index a958d52..0225c20 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -196,9 +196,9 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
INIT_HLIST_BL_HEAD(&s->s_anon);
INIT_LIST_HEAD(&s->s_inodes);

- if (list_lru_init(&s->s_dentry_lru, NULL))
+ if (list_lru_init(&s->s_dentry_lru))
goto err_out;
- if (list_lru_init(&s->s_inode_lru, NULL))
+ if (list_lru_init(&s->s_inode_lru))
goto err_out_dentry_lru;

INIT_LIST_HEAD(&s->s_mounts);
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index 963f910..bd0dd8d 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -309,7 +309,7 @@ void sysfs_evict_inode(struct inode *inode)
{
struct sysfs_dirent *sd = inode->i_private;

- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
sysfs_put(sd);
}
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index c327d4e..5625ca9 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -295,7 +295,7 @@ int sysv_sync_inode(struct inode *inode)

static void sysv_evict_inode(struct inode *inode)
{
- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
if (!inode->i_nlink) {
inode->i_size = 0;
sysv_truncate(inode);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 3e4aa72..b9ac1f3 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -351,7 +351,7 @@ static void ubifs_evict_inode(struct inode *inode)
dbg_gen("inode %lu, mode %#x", inode->i_ino, (int)inode->i_mode);
ubifs_assert(!atomic_read(&inode->i_count));

- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);

if (inode->i_nlink)
goto done;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 062b792..af6f4c3 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -146,8 +146,8 @@ void udf_evict_inode(struct inode *inode)
want_delete = 1;
udf_setsize(inode, 0);
udf_update_inode(inode, IS_SYNC(inode));
- } else
- truncate_inode_pages(&inode->i_data, 0);
+ }
+ truncate_inode_pages_final(&inode->i_data);
invalidate_inode_buffers(inode);
clear_inode(inode);
if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB &&
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index c8ca960..61e8a9b 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -885,7 +885,7 @@ void ufs_evict_inode(struct inode * inode)
if (!inode->i_nlink && !is_bad_inode(inode))
want_delete = 1;

- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
if (want_delete) {
loff_t old_i_size;
/*UFS_I(inode)->i_dtime = CURRENT_TIME;*/
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index c49cbce..2634700 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1670,7 +1670,7 @@ xfs_alloc_buftarg(
if (xfs_setsize_buftarg_early(btp, bdev))
goto error;

- if (list_lru_init(&btp->bt_lru, NULL))
+ if (list_lru_init(&btp->bt_lru))
goto error;

btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 57d6aa9..3e6c2e6 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -831,7 +831,7 @@ xfs_qm_init_quotainfo(

qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);

- if ((error = list_lru_init(&qinf->qi_lru, NULL))) {
+ if ((error = list_lru_init(&qinf->qi_lru))) {
kmem_free(qinf);
mp->m_quotainfo = NULL;
return error;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 15188cc..47ce25d 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1006,7 +1006,7 @@ xfs_fs_evict_inode(

trace_xfs_evict_inode(ip);

- truncate_inode_pages(&inode->i_data, 0);
+ truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
XFS_STATS_INC(vn_rele);
XFS_STATS_INC(vn_remove);
diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h
index b970a45..3ce5417 100644
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -32,7 +32,7 @@ struct list_lru {
};

void list_lru_destroy(struct list_lru *lru);
-int list_lru_init(struct list_lru *lru, struct lock_class_key *key);
+int list_lru_init(struct list_lru *lru);

/**
* list_lru_add: add an element to the lru list's tail
diff --git a/include/linux/mm.h b/include/linux/mm.h
index c09ef3a..5449e7a 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1588,6 +1588,7 @@ vm_unmapped_area(struct vm_unmapped_area_info *info)
extern void truncate_inode_pages(struct address_space *, loff_t);
extern void truncate_inode_pages_range(struct address_space *,
loff_t lstart, loff_t lend);
+extern void truncate_inode_pages_final(struct address_space *);

/* generic vm_area_ops exported for stackable file systems */
extern int filemap_fault(struct vm_area_struct *, struct vm_fault *);
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
index 29df11f..33170db 100644
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -88,15 +88,17 @@ struct radix_tree_node {
unsigned int path; /* Offset in parent & height from the bottom */
unsigned int count;
union {
- /* Used when ascending tree */
struct {
+ /* Used when ascending tree */
struct radix_tree_node *parent;
- void *private;
+ /* For tree user */
+ void *private_data;
};
/* Used when freeing node */
struct rcu_head rcu_head;
};
- struct list_head lru;
+ /* For tree user */
+ struct list_head private_list;
void __rcu *slots[RADIX_TREE_MAP_SIZE];
unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
};
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 1865cd2..0a08953 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -1425,7 +1425,7 @@ radix_tree_node_ctor(void *arg)
struct radix_tree_node *node = arg;

memset(node, 0, sizeof(*node));
- INIT_LIST_HEAD(&node->lru);
+ INIT_LIST_HEAD(&node->private_list);
}

static __init unsigned long __maxindex(unsigned int height)
diff --git a/mm/filemap.c b/mm/filemap.c
index 79a7546..b93e223 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -120,8 +120,17 @@ static void page_cache_tree_delete(struct address_space *mapping,

__radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);

- if (shadow)
+ if (shadow) {
mapping->nrshadows++;
+ /*
+ * Make sure the nrshadows update is committed before
+ * the nrpages update so that final truncate racing
+ * with reclaim does not see both counters 0 at the
+ * same time and miss a shadow entry.
+ */
+ smp_wmb();
+ }
+ mapping->nrpages--;

if (!node) {
/* Clear direct pointer tags in root node */
@@ -148,9 +157,10 @@ static void page_cache_tree_delete(struct address_space *mapping,
return;

/* Only shadow entries in there, keep track of this node */
- if (!(node->count & RADIX_TREE_COUNT_MASK) && list_empty(&node->lru)) {
- node->private = mapping;
- list_lru_add(&workingset_shadow_nodes, &node->lru);
+ if (!(node->count & RADIX_TREE_COUNT_MASK) &&
+ list_empty(&node->private_list)) {
+ node->private_data = mapping;
+ list_lru_add(&workingset_shadow_nodes, &node->private_list);
}
}

@@ -178,7 +188,7 @@ void __delete_from_page_cache(struct page *page, void *shadow)

page->mapping = NULL;
/* Leave page->index set: truncation lookup relies upon it */
- mapping->nrpages--;
+
__dec_zone_page_state(page, NR_FILE_PAGES);
if (PageSwapBacked(page))
__dec_zone_page_state(page, NR_SHMEM);
@@ -518,11 +528,13 @@ static int page_cache_tree_insert(struct address_space *mapping,
node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
}
radix_tree_replace_slot(slot, page);
+ mapping->nrpages++;
if (node) {
node->count++;
/* Installed page, can't be shadow-only anymore */
- if (!list_empty(&node->lru))
- list_lru_del(&workingset_shadow_nodes, &node->lru);
+ if (!list_empty(&node->private_list))
+ list_lru_del(&workingset_shadow_nodes,
+ &node->private_list);
}
return 0;
}
@@ -557,7 +569,6 @@ static int __add_to_page_cache_locked(struct page *page,
radix_tree_preload_end();
if (unlikely(error))
goto err_insert;
- mapping->nrpages++;
__inc_zone_page_state(page, NR_FILE_PAGES);
spin_unlock_irq(&mapping->tree_lock);
trace_mm_filemap_add_to_page_cache(page);
diff --git a/mm/list_lru.c b/mm/list_lru.c
index c357e8f..72f9dec 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -114,7 +114,7 @@ restart:
}
EXPORT_SYMBOL_GPL(list_lru_walk_node);

-int list_lru_init(struct list_lru *lru, struct lock_class_key *key)
+int list_lru_init(struct list_lru *lru)
{
int i;
size_t size = sizeof(*lru->node) * nr_node_ids;
@@ -126,8 +126,6 @@ int list_lru_init(struct list_lru *lru, struct lock_class_key *key)
nodes_clear(lru->active_nodes);
for (i = 0; i < nr_node_ids; i++) {
spin_lock_init(&lru->node[i].lock);
- if (key)
- lockdep_set_class(&lru->node[i].lock, key);
INIT_LIST_HEAD(&lru->node[i].list);
lru->node[i].nr_items = 0;
}
diff --git a/mm/truncate.c b/mm/truncate.c
index 9cf5f88..5c2615d 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -48,8 +48,9 @@ static void clear_exceptional_entry(struct address_space *mapping,
goto unlock;
node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
/* No more shadow entries, stop tracking the node */
- if (!(node->count >> RADIX_TREE_COUNT_SHIFT) && !list_empty(&node->lru))
- list_lru_del(&workingset_shadow_nodes, &node->lru);
+ if (!(node->count >> RADIX_TREE_COUNT_SHIFT) &&
+ !list_empty(&node->private_list))
+ list_lru_del(&workingset_shadow_nodes, &node->private_list);
__radix_tree_delete_node(&mapping->page_tree, node);
unlock:
spin_unlock_irq(&mapping->tree_lock);
@@ -407,6 +408,53 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
EXPORT_SYMBOL(truncate_inode_pages);

/**
+ * truncate_inode_pages_final - truncate *all* pages before inode dies
+ * @mapping: mapping to truncate
+ *
+ * Called under (and serialized by) inode->i_mutex.
+ *
+ * Filesystems have to use this in the .evict_inode path to inform the
+ * VM that this is the final truncate and the inode is going away.
+ */
+void truncate_inode_pages_final(struct address_space *mapping)
+{
+ unsigned long nrshadows;
+ unsigned long nrpages;
+
+ /*
+ * Page reclaim can not participate in regular inode lifetime
+ * management (can't call iput()) and thus can race with the
+ * inode teardown. Tell it when the address space is exiting,
+ * so that it does not install eviction information after the
+ * final truncate has begun.
+ */
+ mapping_set_exiting(mapping);
+
+ /*
+ * When reclaim installs eviction entries, it increases
+ * nrshadows first, then decreases nrpages. Make sure we see
+ * this in the right order or we might miss an entry.
+ */
+ nrpages = mapping->nrpages;
+ smp_rmb();
+ nrshadows = mapping->nrshadows;
+
+ if (nrpages || nrshadows) {
+ /*
+ * As truncation uses a lockless tree lookup, acquire
+ * the spinlock to make sure any ongoing tree
+ * modification that does not see AS_EXITING is
+ * completed before starting the final truncate.
+ */
+ spin_lock_irq(&mapping->tree_lock);
+ spin_unlock_irq(&mapping->tree_lock);
+
+ truncate_inode_pages(mapping, 0);
+ }
+}
+EXPORT_SYMBOL(truncate_inode_pages_final);
+
+/**
* invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
* @mapping: the address_space which holds the pages to invalidate
* @start: the offset 'from' which to invalidate
diff --git a/mm/workingset.c b/mm/workingset.c
index ba8f0dd..2c3b5ad 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -35,7 +35,7 @@
*
* Access frequency and refault distance
*
- * A workload is trashing when its pages are frequently used but they
+ * A workload is thrashing when its pages are frequently used but they
* are evicted from the inactive list every time before another access
* would have promoted them to the active list.
*
@@ -62,7 +62,7 @@
*
* Approximating inactive page access frequency - Observations:
*
- * 1. When a page is accesed for the first time, it is added to the
+ * 1. When a page is accessed for the first time, it is added to the
* head of the inactive list, slides every existing inactive page
* towards the tail by one slot, and pushes the current tail page
* out of memory.
@@ -259,9 +259,6 @@ void workingset_activation(struct page *page)
* slightly higher threshold than regular shrinkers so we don't
* discard the entries too eagerly - after all, during light memory
* pressure is exactly when we need them.
- *
- * The list_lru lock nests inside the IRQ-safe mapping->tree_lock, so
- * we have to disable IRQs for any list_lru operation as well.
*/

struct list_lru workingset_shadow_nodes;
@@ -269,47 +266,47 @@ struct list_lru workingset_shadow_nodes;
static unsigned long count_shadow_nodes(struct shrinker *shrinker,
struct shrink_control *sc)
{
- unsigned long count;
-
- local_irq_disable();
- count = list_lru_count_node(&workingset_shadow_nodes, sc->nid);
- local_irq_enable();
-
- return count;
+ return list_lru_count_node(&workingset_shadow_nodes, sc->nid);
}

-#define NOIRQ_BATCH 32
-
static enum lru_status shadow_lru_isolate(struct list_head *item,
spinlock_t *lru_lock,
void *arg)
{
struct address_space *mapping;
struct radix_tree_node *node;
- unsigned long *batch = arg;
unsigned int i;

- node = container_of(item, struct radix_tree_node, lru);
- mapping = node->private;
+ /*
+ * Page cache insertions and deletions synchroneously maintain
+ * the shadow node LRU under the mapping->tree_lock and the
+ * lru_lock. Because the page cache tree is emptied before
+ * the inode can be destroyed, holding the lru_lock pins any
+ * address_space that has radix tree nodes on the LRU.
+ *
+ * We can then safely transition to the mapping->tree_lock to
+ * pin only the address_space of the particular node we want
+ * to reclaim, take the node off-LRU, and drop the lru_lock.
+ */
+
+ node = container_of(item, struct radix_tree_node, private_list);
+ mapping = node->private_data;

- /* Don't disable IRQs for too long */
- if (--(*batch) == 0) {
- spin_unlock_irq(lru_lock);
- *batch = NOIRQ_BATCH;
- spin_lock_irq(lru_lock);
- return LRU_RETRY;
+ /* Coming from the list, invert the lock order */
+ if (!spin_trylock_irq(&mapping->tree_lock)) {
+ spin_unlock(lru_lock);
+ goto out_retry;
}

- /* Coming from the list, inverse the lock order */
- if (!spin_trylock(&mapping->tree_lock))
- return LRU_SKIP;
-
/*
* The nodes should only contain one or more shadow entries,
* no pages, so we expect to be able to remove them all and
* delete and free the empty node afterwards.
*/

+ list_del_init(&node->private_list);
+ spin_unlock(lru_lock);
+
BUG_ON(!node->count);
BUG_ON(node->count & RADIX_TREE_COUNT_MASK);

@@ -323,30 +320,24 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
mapping->nrshadows--;
}
}
- list_del_init(&node->lru);
BUG_ON(node->count);
if (!__radix_tree_delete_node(&mapping->page_tree, node))
BUG();

- spin_unlock(&mapping->tree_lock);
+ spin_unlock_irq(&mapping->tree_lock);

count_vm_event(WORKINGSET_NODES_RECLAIMED);
-
- return LRU_REMOVED;
+out_retry:
+ cond_resched();
+ spin_lock(lru_lock);
+ return LRU_RETRY;
}

static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
struct shrink_control *sc)
{
- unsigned long batch = NOIRQ_BATCH;
- unsigned long freed;
-
- local_irq_disable();
- freed = list_lru_walk_node(&workingset_shadow_nodes, sc->nid,
- shadow_lru_isolate, &batch, &sc->nr_to_scan);
- local_irq_enable();
-
- return freed;
+ return list_lru_walk_node(&workingset_shadow_nodes, sc->nid,
+ shadow_lru_isolate, NULL, &sc->nr_to_scan);
}

static struct shrinker workingset_shadow_shrinker = {
@@ -356,13 +347,11 @@ static struct shrinker workingset_shadow_shrinker = {
.flags = SHRINKER_NUMA_AWARE,
};

-static struct lock_class_key shadow_nodes_key;
-
static int __init workingset_init(void)
{
int ret;

- ret = list_lru_init(&workingset_shadow_nodes, &shadow_nodes_key);
+ ret = list_lru_init(&workingset_shadow_nodes);
if (ret)
goto err;
ret = register_shrinker(&workingset_shadow_shrinker);

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/