Orphan filesystems after mount namespace destruction and tmpfs "leak"
From: Kiryl Shutsemau
Date: Mon Feb 02 2026 - 12:59:52 EST
Hi,
In the Meta fleet, we saw a problem where destroying a container didn't
lead to freeing the shmem memory attributed to a tmpfs mounted inside
that container. It triggered an OOM when a new container attempted to
start.
Investigation has shown that this happened because a process outside of
the container kept a file from the tmpfs mapped. The mapped file is
small (4k), but it holds all the contents of the tmpfs (~47GiB) from
being freed.
When a tmpfs filesystem is mounted inside a mount namespace (e.g., a
container), and a process outside that namespace holds an open file
descriptor to a file on that tmpfs, the tmpfs superblock remains in
kernel memory indefinitely after:
1. All processes inside the mount namespace have exited.
2. The mount namespace has been destroyed.
3. The tmpfs is no longer visible in any mount namespace.
The superblock persists with mnt_ns = NULL in its mount structures,
keeping all tmpfs contents pinned in memory until the external file
descriptor is closed.
The problem is not specific to tmpfs, but for filesystems with backing
storage, the memory impact is not as severe since the page cache is
reclaimable.
The obvious solution to the problem is "Don't do that": the file should
be unmapped/closed upon container destruction.
But I wonder if the kernel can/should do better here? Currently, this
scenario is hard to diagnose. It looks like a leak of shmem pages.
Also, I wonder if the current behavior can lead to data loss on a
filesystem with backing storage:
- The mount namespace where my USB stick was mounted is gone.
- The USB stick is no longer mounted anywhere.
- I can pull the USB stick out.
- Oops, someone was writing there: corruption/data loss.
I am not sure what a possible solution would be here. I can only think
of blocking exit(2) for the last process in the namespace until all
filesystems are cleanly unmounted, but that is not very informative
either.
I have attached a Claude-generated reproducer and a drgn script that
lists orphan tmpfs filesystems.
--
Kiryl Shutsemau / Kirill A. Shutemov
Attachment:
tmpfs_leak_reproducer.sh
Description: Bourne shell script
"""
Monitor tmpfs superblocks to detect orphaned/leaked tmpfs filesystems.
Run with: sudo drgn -s /path/to/vmlinux monitor_tmpfs.py
This script lists all tmpfs superblocks and shows which ones have
NULL mount namespaces (orphaned) or are otherwise in unusual states.
"""
from drgn import container_of
from drgn.helpers.linux.list import hlist_for_each_entry, list_for_each_entry
def get_mount_info(sb):
    """Collect mount information for a superblock.

    Walks the superblock's ``s_mounts`` list and returns a list with one
    dict per ``struct mount``:
      - 'mnt_ns':     address of the owning mount namespace (0 = orphaned)
      - 'devname':    device name string, or "?" if unavailable
      - 'mountpoint': mountpoint dentry name, or "?" if unavailable
      - 'mnt_addr':   address of the struct mount itself

    Reads race with a live kernel, so individual mounts that fail to read
    are skipped rather than aborting the whole walk.
    """
    mounts = []
    try:
        for mnt in list_for_each_entry('struct mount', sb.s_mounts.address_of_(), 'mnt_instance'):
            try:
                mnt_ns = mnt.mnt_ns
                # NULL mnt_ns is the "orphaned" signature we are hunting.
                mnt_ns_addr = mnt_ns.value_() if mnt_ns else 0
                # Device name (e.g. "tmpfs"); pointer may be NULL.
                devname = mnt.mnt_devname.string_().decode('utf-8', errors='replace') if mnt.mnt_devname else "?"
                # Mountpoint dentry name; pointer may be NULL for detached mounts.
                mnt_mountpoint = mnt.mnt_mountpoint
                if mnt_mountpoint:
                    name = mnt_mountpoint.d_name.name.string_().decode('utf-8', errors='replace')
                else:
                    name = "?"
                mounts.append({
                    'mnt_ns': mnt_ns_addr,
                    'devname': devname,
                    'mountpoint': name,
                    'mnt_addr': mnt.value_()
                })
            except Exception:
                # This mount changed under us or is unreadable; skip it.
                # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
                continue
    except Exception:
        # The s_mounts list itself may be torn; report what we collected.
        pass
    return mounts
def main():
    """List all tmpfs superblocks and flag orphaned ones.

    A superblock is reported as orphaned when every one of its mounts has
    a NULL mount namespace (``mnt_ns == 0``) — i.e. the mount namespace it
    lived in is gone, but something (typically an external open fd or
    mapping) still pins the superblock and its contents in memory.

    Requires drgn's injected ``prog`` global; run under ``drgn``.
    """
    # All tmpfs superblocks hang off the shmem filesystem type.
    shmem_fs_type = prog['shmem_fs_type']

    print("=" * 80)
    print("tmpfs Superblock Monitor")
    print("=" * 80)
    print()

    # Collect every superblock registered with the shmem fs type.
    tmpfs_sbs = []
    for sb in hlist_for_each_entry('struct super_block', shmem_fs_type.fs_supers, 's_instances'):
        tmpfs_sbs.append(sb)

    print(f"Found {len(tmpfs_sbs)} tmpfs superblocks\n")

    orphaned = []
    normal = []
    for sb in tmpfs_sbs:
        sb_addr = sb.value_()
        s_dev = sb.s_dev.value_()
        s_active = sb.s_active.counter.value_()
        mounts = get_mount_info(sb)

        # Orphaned = at least one NULL-namespace mount and no live ones.
        has_orphaned = any(m['mnt_ns'] == 0 for m in mounts)
        has_normal = any(m['mnt_ns'] != 0 for m in mounts)

        info = {
            'sb': sb,
            'sb_addr': sb_addr,
            's_dev': s_dev,
            's_active': s_active,
            'mounts': mounts,
            'has_orphaned': has_orphaned
        }
        if has_orphaned and not has_normal:
            orphaned.append(info)
        else:
            normal.append(info)

    # Print orphaned superblocks first (these are the leaked ones).
    if orphaned:
        print("!!! ORPHANED tmpfs SUPERBLOCKS (potential leaks) !!!")
        print("-" * 80)
        for info in orphaned:
            sb = info['sb']
            print(f"Superblock: 0x{info['sb_addr']:016x}")
            print(f"  s_dev: {info['s_dev']}")
            print(f"  s_active: {info['s_active']}")
            # Best-effort listing of a few files pinned by the leak.
            try:
                root = sb.s_root
                if root:
                    print("  Root dentry contents:")
                    count = 0
                    for child in hlist_for_each_entry('struct dentry', root.d_children, 'd_sib'):
                        name = child.d_name.name.string_().decode('utf-8', errors='replace')
                        inode = child.d_inode
                        if inode:
                            size = inode.i_size.value_()
                            print(f"    - {name} ({size} bytes)")
                        count += 1
                        if count >= 10:
                            print("    ... (more files)")
                            break
            except Exception as e:
                print(f"  Error listing files: {e}")
            for m in info['mounts']:
                print(f"  Mount: {m['devname']} -> {m['mountpoint']}")
                print(f"    mnt_ns: 0x{m['mnt_ns']:016x} (NULL = orphaned)")
            print()
        print()

    # Print summary of normal superblocks.
    print("Normal tmpfs superblocks:")
    print("-" * 80)
    for info in normal:
        devnames = set(m['devname'] for m in info['mounts'])
        print(f"0x{info['sb_addr']:016x} s_dev={info['s_dev']:<4} "
              f"active={info['s_active']:<3} "
              f"mounts={len(info['mounts']):<3} "
              f"devnames={devnames}")
    print()

    print("=" * 80)
    print(f"Summary: {len(orphaned)} orphaned, {len(normal)} normal")
    if orphaned:
        print("WARNING: Orphaned tmpfs superblocks detected - these may be leaking memory!")
    print("=" * 80)


if __name__ == "__main__":
    main()