[PATCH RFC v2 15/23] fs: add real_fs to track task's actual fs_struct

From: Christian Brauner

Date: Thu Mar 05 2026 - 18:31:49 EST


Add a real_fs field to task_struct that, for now, always mirrors the fs
field. This lays the groundwork for distinguishing between a task's
permanent fs_struct and one that is temporarily overridden via
scoped_with_init_fs().

When a kthread temporarily overrides current->fs for path lookup, we
need to know the original fs_struct for operations like exit_fs() and
unshare_fs_struct() that must operate on the real, permanent fs.

For now real_fs is always equal to fs. It is maintained alongside fs in
all the relevant paths: exit_fs(), unshare_fs_struct(),
switch_fs_struct(), and copy_fs().

Also fix the argument passed to nullfs_userspace_init() in
switch_fs_struct(): pass the old fs_struct itself rather than the
conditional return value, which is NULL when other users still hold
a reference. This ensures that the PID 1 unshare detection actually
works.

Signed-off-by: Christian Brauner <brauner@xxxxxxxxxx>
---
fs/fs_struct.c | 10 +++++++---
include/linux/sched.h | 1 +
init/init_task.c | 1 +
kernel/fork.c | 4 +++-
4 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 3ff79fb894c1..b9b9a327f299 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -89,12 +89,13 @@ void free_fs_struct(struct fs_struct *fs)

void exit_fs(struct task_struct *tsk)
{
- struct fs_struct *fs = tsk->fs;
+ struct fs_struct *fs = tsk->real_fs;

if (fs) {
int kill;
task_lock(tsk);
read_seqlock_excl(&fs->seq);
+ tsk->real_fs = NULL;
tsk->fs = NULL;
kill = !--fs->users;
read_sequnlock_excl(&fs->seq);
@@ -126,7 +127,7 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)

int unshare_fs_struct(void)
{
- struct fs_struct *fs = current->fs;
+ struct fs_struct *fs = current->real_fs;
struct fs_struct *new_fs = copy_fs_struct(fs);
int kill;

@@ -135,8 +136,10 @@ int unshare_fs_struct(void)

task_lock(current);
read_seqlock_excl(&fs->seq);
+ VFS_WARN_ON_ONCE(fs != current->fs);
kill = !--fs->users;
current->fs = new_fs;
+ current->real_fs = new_fs;
read_sequnlock_excl(&fs->seq);
task_unlock(current);

@@ -177,13 +180,14 @@ struct fs_struct *switch_fs_struct(struct fs_struct *new_fs)

fs = current->fs;
read_seqlock_excl(&fs->seq);
+ VFS_WARN_ON_ONCE(current->fs != current->real_fs);
current->fs = new_fs;
+ current->real_fs = new_fs;
if (--fs->users)
new_fs = NULL;
else
new_fs = fs;
read_sequnlock_excl(&fs->seq);
-
nullfs_userspace_init(fs);
return new_fs;
}
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a7b4a980eb2f..5c7b9df92ebb 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1179,6 +1179,7 @@ struct task_struct {
unsigned long last_switch_time;
#endif
/* Filesystem information: */
+ struct fs_struct *real_fs;
struct fs_struct *fs;

/* Open file information: */
diff --git a/init/init_task.c b/init/init_task.c
index 5c838757fc10..7d0b4a5927eb 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -152,6 +152,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
RCU_POINTER_INITIALIZER(cred, &init_cred),
.comm = INIT_TASK_COMM,
.thread = INIT_THREAD,
+ .real_fs = &init_fs,
.fs = &init_fs,
.files = &init_files,
#ifdef CONFIG_IO_URING
diff --git a/kernel/fork.c b/kernel/fork.c
index 583078c69bbd..73f4ed82f656 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1593,6 +1593,8 @@ static int copy_mm(u64 clone_flags, struct task_struct *tsk)
static int copy_fs(u64 clone_flags, struct task_struct *tsk)
{
struct fs_struct *fs = current->fs;
+
+ VFS_WARN_ON_ONCE(current->fs != current->real_fs);
if (clone_flags & CLONE_FS) {
/* tsk->fs is already what we want */
read_seqlock_excl(&fs->seq);
@@ -1605,7 +1607,7 @@ static int copy_fs(u64 clone_flags, struct task_struct *tsk)
read_sequnlock_excl(&fs->seq);
return 0;
}
- tsk->fs = copy_fs_struct(fs);
+ tsk->real_fs = tsk->fs = copy_fs_struct(fs);
if (!tsk->fs)
return -ENOMEM;
return 0;

--
2.47.3