Re: [07/24] tmpfs: fix race between umount and swapoff

From: Hugh Dickins
Date: Sat May 21 2011 - 00:53:30 EST


On Thu, 19 May 2011, Greg KH wrote:
> 2.6.33-longterm review patch. If anyone has any objections, please let us know.

Witold has found that I screwed up the highmem case in this patch.
Please add the commit appended at the bottom - or else delay both
until the next 2.6.33-longterm if you prefer.

Thanks,
Hugh

>
> ------------------
> Content-Length: 6159
> Lines: 197
>
> From: Hugh Dickins <hughd@xxxxxxxxxx>
>
> commit 778dd893ae785c5fd505dac30b5fc40aae188bf1 upstream.
>
> The use of igrab() in swapoff's shmem_unuse_inode() is just as vulnerable
> to umount as that in shmem_writepage().
>
> Fix this instance by extending the protection of shmem_swaplist_mutex
> right across shmem_unuse_inode(): while it's on the list, the inode cannot
> be evicted (and the filesystem cannot be unmounted) without
> shmem_evict_inode() taking that mutex to remove it from the list.
>
> But since shmem_writepage() might take that mutex, we should avoid making
> memory allocations or memcg charges while holding it: prepare them at the
> outer level in shmem_unuse(). When mem_cgroup_cache_charge() was
> originally placed, we didn't know until that point that the page from swap
> was actually a shmem page; but nowadays it's noted in the swap_map, so
> we're safe to charge upfront. For the radix_tree, do as is done in
> shmem_getpage(): preload upfront, but don't pin to the cpu; so we make a
> habit of refreshing the node pool, but might dip into GFP_NOWAIT reserves
> on occasion if subsequently preempted.
>
> With the allocation and charge moved out from shmem_unuse_inode(),
> we can also hold index map and info->lock over from finding the entry.
>
> Signed-off-by: Hugh Dickins <hughd@xxxxxxxxxx>
> Cc: Konstantin Khlebnikov <khlebnikov@xxxxxxxxxx>
> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
> Signed-off-by: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
> Signed-off-by: Greg Kroah-Hartman <gregkh@xxxxxxx>
>
> ---
> mm/shmem.c | 88 +++++++++++++++++++++++++++++--------------------------------
> 1 file changed, 43 insertions(+), 45 deletions(-)
>
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -850,7 +850,7 @@ static inline int shmem_find_swp(swp_ent
>
> static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
> {
> - struct inode *inode;
> + struct address_space *mapping;
> unsigned long idx;
> unsigned long size;
> unsigned long limit;
> @@ -873,8 +873,10 @@ static int shmem_unuse_inode(struct shme
> if (size > SHMEM_NR_DIRECT)
> size = SHMEM_NR_DIRECT;
> offset = shmem_find_swp(entry, ptr, ptr+size);
> - if (offset >= 0)
> + if (offset >= 0) {
> + shmem_swp_balance_unmap();
> goto found;
> + }
> if (!info->i_indirect)
> goto lost2;
>
> @@ -912,11 +914,11 @@ static int shmem_unuse_inode(struct shme
> if (size > ENTRIES_PER_PAGE)
> size = ENTRIES_PER_PAGE;
> offset = shmem_find_swp(entry, ptr, ptr+size);
> - shmem_swp_unmap(ptr);
> if (offset >= 0) {
> shmem_dir_unmap(dir);
> goto found;
> }
> + shmem_swp_unmap(ptr);
> }
> }
> lost1:
> @@ -926,8 +928,7 @@ lost2:
> return 0;
> found:
> idx += offset;
> - inode = igrab(&info->vfs_inode);
> - spin_unlock(&info->lock);
> + ptr += offset;
>
> /*
> * Move _head_ to start search for next from here.
> @@ -938,37 +939,18 @@ found:
> */
> if (shmem_swaplist.next != &info->swaplist)
> list_move_tail(&shmem_swaplist, &info->swaplist);
> - mutex_unlock(&shmem_swaplist_mutex);
>
> - error = 1;
> - if (!inode)
> - goto out;
> /*
> - * Charge page using GFP_KERNEL while we can wait.
> - * Charged back to the user(not to caller) when swap account is used.
> - * add_to_page_cache() will be called with GFP_NOWAIT.
> + * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
> + * but also to hold up shmem_evict_inode(): so inode cannot be freed
> + * beneath us (pagelock doesn't help until the page is in pagecache).
> */
> - error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
> - if (error)
> - goto out;
> - error = radix_tree_preload(GFP_KERNEL);
> - if (error) {
> - mem_cgroup_uncharge_cache_page(page);
> - goto out;
> - }
> - error = 1;
> -
> - spin_lock(&info->lock);
> - ptr = shmem_swp_entry(info, idx, NULL);
> - if (ptr && ptr->val == entry.val) {
> - error = add_to_page_cache_locked(page, inode->i_mapping,
> - idx, GFP_NOWAIT);
> - /* does mem_cgroup_uncharge_cache_page on error */
> - } else /* we must compensate for our precharge above */
> - mem_cgroup_uncharge_cache_page(page);
> + mapping = info->vfs_inode.i_mapping;
> + error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT);
> + /* which does mem_cgroup_uncharge_cache_page on error */
>
> if (error == -EEXIST) {
> - struct page *filepage = find_get_page(inode->i_mapping, idx);
> + struct page *filepage = find_get_page(mapping, idx);
> error = 1;
> if (filepage) {
> /*
> @@ -988,14 +970,8 @@ found:
> swap_free(entry);
> error = 1; /* not an error, but entry was found */
> }
> - if (ptr)
> - shmem_swp_unmap(ptr);
> + shmem_swp_unmap(ptr);
> spin_unlock(&info->lock);
> - radix_tree_preload_end();
> -out:
> - unlock_page(page);
> - page_cache_release(page);
> - iput(inode); /* allows for NULL */
> return error;
> }
>
> @@ -1007,6 +983,26 @@ int shmem_unuse(swp_entry_t entry, struc
> struct list_head *p, *next;
> struct shmem_inode_info *info;
> int found = 0;
> + int error;
> +
> + /*
> + * Charge page using GFP_KERNEL while we can wait, before taking
> + * the shmem_swaplist_mutex which might hold up shmem_writepage().
> + * Charged back to the user (not to caller) when swap account is used.
> + * add_to_page_cache() will be called with GFP_NOWAIT.
> + */
> + error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
> + if (error)
> + goto out;
> + /*
> + * Try to preload while we can wait, to not make a habit of
> + * draining atomic reserves; but don't latch on to this cpu,
> + * it's okay if sometimes we get rescheduled after this.
> + */
> + error = radix_tree_preload(GFP_KERNEL);
> + if (error)
> + goto uncharge;
> + radix_tree_preload_end();
>
> mutex_lock(&shmem_swaplist_mutex);
> list_for_each_safe(p, next, &shmem_swaplist) {
> @@ -1014,17 +1010,19 @@ int shmem_unuse(swp_entry_t entry, struc
> found = shmem_unuse_inode(info, entry, page);
> cond_resched();
> if (found)
> - goto out;
> + break;
> }
> mutex_unlock(&shmem_swaplist_mutex);
> - /*
> - * Can some race bring us here? We've been holding page lock,
> - * so I think not; but would rather try again later than BUG()
> - */
> +
> +uncharge:
> + if (!found)
> + mem_cgroup_uncharge_cache_page(page);
> + if (found < 0)
> + error = found;
> +out:
> unlock_page(page);
> page_cache_release(page);
> -out:
> - return (found < 0) ? found : 0;
> + return error;
> }
>
> /*

commit e6c9366b2adb52cba64b359b3050200743c7568c
Author: Hugh Dickins <hughd@xxxxxxxxxx>
Date: Fri May 20 15:47:33 2011 -0700

tmpfs: fix highmem swapoff crash regression

Commit 778dd893ae78 ("tmpfs: fix race between umount and swapoff")
forgot the new rules for strict atomic kmap nesting, causing

WARNING: at arch/x86/mm/highmem_32.c:81

from __kunmap_atomic(), then

BUG: unable to handle kernel paging request at fffb9000

from shmem_swp_set() when shmem_unuse_inode() is handling swapoff with
highmem in use. My disgrace again.

See
https://bugzilla.kernel.org/show_bug.cgi?id=35352

Reported-by: Witold Baryluk <baryluk@xxxxxxxxxxxxxxxx>
Signed-off-by: Hugh Dickins <hughd@xxxxxxxxxx>
Cc: stable@xxxxxxxxxx
Signed-off-by: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>

diff --git a/mm/shmem.c b/mm/shmem.c
index dfc7069..ba4ad28 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -916,11 +916,12 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
if (size > ENTRIES_PER_PAGE)
size = ENTRIES_PER_PAGE;
offset = shmem_find_swp(entry, ptr, ptr+size);
+ shmem_swp_unmap(ptr);
if (offset >= 0) {
shmem_dir_unmap(dir);
+ ptr = shmem_swp_map(subdir);
goto found;
}
- shmem_swp_unmap(ptr);
}
}
lost1:
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/