[PATCH] mm/shmem: use invalidate_lock to fix hole-punch race
From: Gregory Price
Date: Thu Mar 26 2026 - 12:41:57 EST
Inflating a VM's balloon while vhost-user-net forks and execs a helper
triggers a "still mapped when deleted" BUG on the memfd backing guest RAM:
BUG: Bad page cache in process __balloon pfn:6520704
page dumped because: still mapped when deleted
...
shmem_undo_range+0x3fa/0x570
shmem_fallocate+0x366/0x4d0
vfs_fallocate+0x13c/0x310
This BUG also resulted in guests seeing stale mappings backed by a
zeroed page, causing guest kernel panics. I was unable to trace that
specific interaction, but it appears to be related to THP splitting.
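The triggering workload reduces to concurrent hole-punching, page faults,
and fork() on a single shared memfd mapping. As a rough userspace sketch
of that shape (illustrative only, not a reliable reproducer; all names
below are made up):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <pthread.h>
	#include <sys/mman.h>
	#include <sys/wait.h>
	#include <unistd.h>

	#define SZ	(64UL << 20)

	static char *map;	/* stands in for guest RAM */
	static int fd;

	static void *faulter(void *arg)	/* guest touching its RAM */
	{
		for (;;)
			for (size_t off = 0; off < SZ; off += 4096)
				map[off] = 1;
		return NULL;
	}

	static void *forker(void *arg)	/* vhost-user-net fork+exec */
	{
		for (;;) {
			pid_t pid = fork();
			if (pid == 0)
				_exit(0);	/* stands in for exec */
			if (pid > 0)
				waitpid(pid, NULL, 0);
		}
		return NULL;
	}

	int main(void)
	{
		pthread_t t1, t2;

		fd = memfd_create("guest-ram", 0);
		ftruncate(fd, SZ);
		map = mmap(NULL, SZ, PROT_READ | PROT_WRITE, MAP_SHARED,
			   fd, 0);
		pthread_create(&t1, NULL, faulter, NULL);
		pthread_create(&t2, NULL, forker, NULL);
		for (;;)	/* balloon inflation punching holes */
			fallocate(fd, FALLOC_FL_PUNCH_HOLE |
				  FALLOC_FL_KEEP_SIZE, 0, SZ);
	}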
Two races allow PTEs to be re-installed for a folio that fallocate
is about to remove from page cache:
Race 1 — fault-around (filemap_map_pages):
fallocate                 fault-around            fork
---------                 ------------            ----
set i_private
unmap_mapping_range()
  # zaps PTEs
                          filemap_map_pages()
                            # re-maps folio!
                                                  dup_mmap()
                                                    # child VMA
                                                    # in tree
shmem_undo_range()
  lock folio
  unmap_mapping_folio()
    # child VMA:
    # no PTE, skip
                                                  copy_page_range()
                                                    # copies PTE
    # parent VMA:
    # zaps PTE
  filemap_remove_folio()
    # mapcount=1, BUG!
filemap_map_pages() is called directly as .map_pages, bypassing
shmem_fault()'s i_private synchronization.
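For reference, the serialization that the fault path does have is the
i_private check at the top of shmem_fault(), abridged here from
mm/shmem.c (exact shape varies by kernel version):

	static vm_fault_t shmem_fault(struct vm_fault *vmf)
	{
		struct inode *inode = file_inode(vmf->vma->vm_file);
		...
		/*
		 * Trinity finds that probing a hole which tmpfs is
		 * punching can prevent the hole-punch from ever
		 * completing: noted in i_private.
		 */
		if (unlikely(inode->i_private)) {
			ret = shmem_falloc_wait(vmf, inode);
			if (ret)
				return ret;
		}
		...
	}

Race 1 bypasses this check entirely; Race 2 below shows the check
itself is also racy against fallocate setting i_private.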
Race 2 — shmem_fault TOCTOU:
fallocate                     shmem_fault
---------                     -----------
                              check i_private → NULL
set i_private
unmap_mapping_range()
  # zaps PTEs
                              shmem_get_folio_gfp()
                                # finds folio in cache
                              finish_fault()
                                # installs PTE
shmem_undo_range()
  truncate_inode_folio()
    # mapcount=1, BUG!
Fix both races by taking mapping->invalidate_lock: exclusive in
shmem_fallocate() across unmap+truncate, shared in shmem_fault() and
in fault-around.
This matches the existing pattern used by secretmem_fault(),
udf_page_mkwrite(), and zonefs_filemap_page_mkwrite(), all of
which take invalidate_lock shared under mmap_lock in their
fault handlers.
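The shape of that pattern, as in secretmem_fault() (abridged):

	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	...
	filemap_invalidate_lock_shared(mapping);
	/* page cache lookup / folio instantiation */
	...
	filemap_invalidate_unlock_shared(mapping);

with the invalidation side holding filemap_invalidate_lock() exclusive
across unmap + truncate, exactly as shmem_fallocate() does below.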
This also requires removing the rcu_read_lock() from
do_fault_around() so that .map_pages may use sleeping locks.
The outer rcu_read_lock() is redundant for all in-tree .map_pages
implementations: each is either filemap_map_pages() itself (which
takes rcu_read_lock() internally) or a thin wrapper around it.
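filemap_map_pages() takes RCU itself around the page cache walk,
abridged from mm/filemap.c (exact shape varies by kernel version):

	vm_fault_t filemap_map_pages(struct vm_fault *vmf,
			pgoff_t start_pgoff, pgoff_t end_pgoff)
	{
		...
		rcu_read_lock();
		folio = next_uptodate_folio(&xas, mapping, end_pgoff);
		...
		rcu_read_unlock();
		...
	}

so removing the caller-side rcu_read_lock() does not shrink any RCU
critical section the walk depends on.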
Fixes: d7c1755179b8 ("mm: implement ->map_pages for shmem/tmpfs")
Cc: stable@xxxxxxxxxxxxxxx
Signed-off-by: Gregory Price <gourry@xxxxxxxxxx>
---
mm/memory.c | 2 --
mm/shmem.c | 33 ++++++++++++++++++++++++++++++---
2 files changed, 30 insertions(+), 5 deletions(-)
diff --git a/mm/memory.c b/mm/memory.c
index e44469f9cf65..838583591fdf 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -5900,11 +5900,9 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf)
 			return VM_FAULT_OOM;
 	}
 
-	rcu_read_lock();
 	ret = vmf->vma->vm_ops->map_pages(vmf,
 			vmf->pgoff + from_pte - pte_off,
 			vmf->pgoff + to_pte - pte_off);
-	rcu_read_unlock();
 
 	return ret;
 }
diff --git a/mm/shmem.c b/mm/shmem.c
index 4ecefe02881d..5c654b86f3cf 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2731,7 +2731,8 @@ static vm_fault_t shmem_falloc_wait(struct vm_fault *vmf, struct inode *inode)
 static vm_fault_t shmem_fault(struct vm_fault *vmf)
 {
 	struct inode *inode = file_inode(vmf->vma->vm_file);
-	gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
+	struct address_space *mapping = inode->i_mapping;
+	gfp_t gfp = mapping_gfp_mask(mapping);
 	struct folio *folio = NULL;
 	vm_fault_t ret = 0;
 	int err;
@@ -2747,8 +2748,15 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
 	}
 
 	WARN_ON_ONCE(vmf->page != NULL);
+	/*
+	 * shmem_fallocate(PUNCH_HOLE) holds invalidate_lock exclusive across
+	 * unmap+truncate. Take it shared here so shmem_fault cannot obtain
+	 * a folio in the process of being punched.
+	 */
+	filemap_invalidate_lock_shared(mapping);
 	err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, SGP_CACHE,
 				  gfp, vmf, &ret);
+	filemap_invalidate_unlock_shared(mapping);
 	if (err)
 		return vmf_error(err);
 	if (folio) {
@@ -3683,11 +3691,13 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 		inode->i_private = &shmem_falloc;
 		spin_unlock(&inode->i_lock);
 
+		filemap_invalidate_lock(mapping);
 		if ((u64)unmap_end > (u64)unmap_start)
 			unmap_mapping_range(mapping, unmap_start,
 					    1 + unmap_end - unmap_start, 0);
 		shmem_truncate_range(inode, offset, offset + len - 1);
 		/* No need to unmap again: hole-punching leaves COWed pages */
+		filemap_invalidate_unlock(mapping);
 
 		spin_lock(&inode->i_lock);
 		inode->i_private = NULL;
@@ -5268,9 +5278,26 @@ static const struct super_operations shmem_ops = {
 #endif
 };
 
+/*
+ * shmem_fallocate(PUNCH_HOLE) holds invalidate_lock for write across
+ * unmap+truncate. Take it for read here so fault-around cannot re-map
+ * pages being punched.
+ */
+static vm_fault_t shmem_map_pages(struct vm_fault *vmf,
+				  pgoff_t start_pgoff, pgoff_t end_pgoff)
+{
+	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+	vm_fault_t ret;
+
+	filemap_invalidate_lock_shared(mapping);
+	ret = filemap_map_pages(vmf, start_pgoff, end_pgoff);
+	filemap_invalidate_unlock_shared(mapping);
+	return ret;
+}
+
 static const struct vm_operations_struct shmem_vm_ops = {
 	.fault = shmem_fault,
-	.map_pages = filemap_map_pages,
+	.map_pages = shmem_map_pages,
 #ifdef CONFIG_NUMA
 	.set_policy = shmem_set_policy,
 	.get_policy = shmem_get_policy,
@@ -5282,7 +5309,7 @@ static const struct vm_operations_struct shmem_vm_ops = {
 static const struct vm_operations_struct shmem_anon_vm_ops = {
 	.fault = shmem_fault,
-	.map_pages = filemap_map_pages,
+	.map_pages = shmem_map_pages,
 #ifdef CONFIG_NUMA
 	.set_policy = shmem_set_policy,
 	.get_policy = shmem_get_policy,
--
2.53.0