[PATCH RFC v3 05/26] fs: make userspace_init_fs a dynamically-initialized pointer

From: Christian Brauner

Date: Wed Mar 11 2026 - 17:58:58 EST


Change userspace_init_fs from a declared-but-unused extern struct to
a dynamically initialized pointer. Add init_userspace_fs() which is
called early in kernel_init() (PID 1) to record PID 1's fs_struct
as the canonical userspace filesystem state.

Wire up __override_init_fs() and __revert_init_fs() to actually swap
current->fs to/from userspace_init_fs. Previously these were no-ops
that stored current->fs back to itself.

Fix validate_fs_switch() to compare against userspace_init_fs instead
of &init_fs. When PID 1 unshares its filesystem state, reset the root
and pwd of userspace_init_fs to init_fs's root (nullfs) so that stale
filesystem state is not silently inherited by kworkers and
usermodehelpers.

At this stage PID 1's fs still points to rootfs (set by
init_mount_tree), so userspace_init_fs points to rootfs and
scoped_with_init_fs() is functionally equivalent to its previous no-op
behavior.

Signed-off-by: Christian Brauner <brauner@xxxxxxxxxx>
---
fs/fs_struct.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++-
include/linux/fs_struct.h | 15 ++++++++-------
include/linux/init_task.h | 1 +
init/main.c | 3 +++
4 files changed, 59 insertions(+), 8 deletions(-)

diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index c03a574ed65a..f44e43ce6d93 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -8,6 +8,7 @@
#include <linux/fs_struct.h>
#include <linux/init_task.h>
#include "internal.h"
+#include "mount.h"

/*
* Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
@@ -163,15 +164,34 @@ EXPORT_SYMBOL_GPL(unshare_fs_struct);
* fs_struct state. Breaking that contract sucks for both sides.
* So just don't bother with extra work for this. No sane init
* system should ever do this.
+ *
+ * On older kernels, if PID 1 unshared its filesystem state with us, the
+ * kernel simply kept using the stale fs_struct state, implicitly pinning
+ * anything that PID 1 had last used - even if PID 1 had since moved on to
+ * a completely different fs_struct state and might even have unmounted
+ * the old root.
+ *
+ * This has hilarious consequences: Think continuing to dump coredump
+ * state into an implicitly pinned directory somewhere. Calling random
+ * binaries in the old rootfs via usermodehelpers.
+ *
+ * Be aggressive about this: We simply reject operating on stale
+ * fs_struct state by reverting to nullfs. Every kworker that does
+ * lookups after this point will fail. Every usermodehelper call will
+ * fail. Tough luck but let's be kind and emit a warning to userspace.
*/
static inline void validate_fs_switch(struct fs_struct *old_fs)
{
+ might_sleep();
+
if (likely(current->pid != 1))
return;
/* @old_fs may be dangling but for comparison it's fine */
- if (old_fs != &init_fs)
+ if (old_fs != userspace_init_fs)
return;
pr_warn("VFS: Pid 1 stopped sharing filesystem state\n");
+ set_fs_root(userspace_init_fs, &init_fs.root);
+ set_fs_pwd(userspace_init_fs, &init_fs.root);
}

struct fs_struct *switch_fs_struct(struct fs_struct *new_fs)
@@ -201,3 +221,29 @@ struct fs_struct init_fs = {
.seq = __SEQLOCK_UNLOCKED(init_fs.seq),
.umask = 0022,
};
+
+struct fs_struct *userspace_init_fs __ro_after_init;
+EXPORT_SYMBOL_GPL(userspace_init_fs);
+
+void __init init_userspace_fs(void)
+{
+ struct mount *m;
+ struct path root;
+
+ /* Move PID 1 from nullfs into the initramfs. */
+ m = topmost_overmount(current->nsproxy->mnt_ns->root);
+ root.mnt = &m->mnt;
+ root.dentry = root.mnt->mnt_root;
+
+ VFS_WARN_ON_ONCE(current->pid != 1);
+
+ set_fs_root(current->fs, &root);
+ set_fs_pwd(current->fs, &root);
+
+ /* Hold a reference for the global pointer. */
+ read_seqlock_excl(&current->fs->seq);
+ current->fs->users++;
+ read_sequnlock_excl(&current->fs->seq);
+
+ userspace_init_fs = current->fs;
+}
diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h
index e11d0e57168f..97eef8d3863d 100644
--- a/include/linux/fs_struct.h
+++ b/include/linux/fs_struct.h
@@ -17,6 +17,7 @@ struct fs_struct {
} __randomize_layout;

extern struct kmem_cache *fs_cachep;
+extern struct fs_struct *userspace_init_fs;

extern void exit_fs(struct task_struct *);
extern void set_fs_root(struct fs_struct *, const struct path *);
@@ -57,17 +58,17 @@ static inline int current_umask(void)
*/
static inline struct fs_struct *__override_init_fs(void)
{
- struct fs_struct *fs;
+ struct fs_struct *old_fs;

- fs = current->fs;
- WRITE_ONCE(current->fs, fs);
- return fs;
+ old_fs = current->fs;
+ WRITE_ONCE(current->fs, userspace_init_fs);
+ return old_fs;
}

-static inline void __revert_init_fs(struct fs_struct *revert_fs)
+static inline void __revert_init_fs(struct fs_struct *old_fs)
{
- VFS_WARN_ON_ONCE(current->fs != revert_fs);
- WRITE_ONCE(current->fs, revert_fs);
+ VFS_WARN_ON_ONCE(current->fs != userspace_init_fs);
+ WRITE_ONCE(current->fs, old_fs);
}

DEFINE_CLASS(__override_init_fs,
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index a6cb241ea00c..61536be773f5 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -24,6 +24,7 @@

extern struct files_struct init_files;
extern struct fs_struct init_fs;
+extern struct fs_struct *userspace_init_fs;
extern struct nsproxy init_nsproxy;

#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
diff --git a/init/main.c b/init/main.c
index 1cb395dd94e4..5ccc642a5aa7 100644
--- a/init/main.c
+++ b/init/main.c
@@ -102,6 +102,7 @@
#include <linux/stackdepot.h>
#include <linux/randomize_kstack.h>
#include <linux/pidfs.h>
+#include <linux/fs_struct.h>
#include <linux/ptdump.h>
#include <linux/time_namespace.h>
#include <linux/unaligned.h>
@@ -1574,6 +1575,8 @@ static int __ref kernel_init(void *unused)
{
int ret;

+ init_userspace_fs();
+
/*
* Wait until kthreadd is all set-up.
*/

--
2.47.3