BUG: d_path() races with do_move_mount() on ->mnt_ns, leading to use-after-free

From: Jann Horn
Date: Tue Sep 13 2022 - 14:10:20 EST

As the subject says, there's a race between d_path() (specifically
__prepend_path()) looking at mnt->mnt_ns with is_anon_ns(), and
do_move_mount() switching out the ->mnt_ns and freeing the old one.
This can theoretically lead to a use-after-free read, but it doesn't
seem to be very interesting from a security perspective, since all it
gets you is a comparison of a value in freed memory with zero.

KASAN splat from a kernel that's been patched to widen the race window:

BUG: KASAN: use-after-free in prepend_path (fs/mount.h:146 fs/d_path.c:127 fs/d_path.c:177)
Read of size 8 at addr ffff88800add2748 by task SLOWME/685

CPU: 8 PID: 685 Comm: SLOWME Not tainted 6.0.0-rc5-00015-ge839a756012b-dirty #110
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.0-debian-1.16.0-4 04/01/2014
Call Trace:
dump_stack_lvl (lib/dump_stack.c:107 (discriminator 1))
print_report.cold (mm/kasan/report.c:318 mm/kasan/report.c:433)
kasan_report (mm/kasan/report.c:162 mm/kasan/report.c:497)
prepend_path (fs/mount.h:146 fs/d_path.c:127 fs/d_path.c:177)
__do_sys_getcwd (fs/d_path.c:438)
do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80)
entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120)

Allocated by task 685:
kasan_save_stack (mm/kasan/common.c:39)
__kasan_kmalloc (mm/kasan/common.c:45 mm/kasan/common.c:437 mm/kasan/common.c:516 mm/kasan/common.c:525)
alloc_mnt_ns (./include/linux/slab.h:600 ./include/linux/slab.h:733
__do_sys_fsmount (fs/namespace.c:3720)
do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80)
entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120)

Freed by task 686:
kasan_save_stack (mm/kasan/common.c:39)
kasan_set_track (mm/kasan/common.c:45)
kasan_set_free_info (mm/kasan/generic.c:372)
____kasan_slab_free (mm/kasan/common.c:369 mm/kasan/common.c:329)
kfree (mm/slub.c:1780 mm/slub.c:3534 mm/slub.c:4562)
do_move_mount (fs/namespace.c:2899)
__x64_sys_move_mount (fs/namespace.c:3812 fs/namespace.c:3765
do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80)
entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120)

The buggy address belongs to the object at ffff88800add2700
which belongs to the cache kmalloc-128 of size 128
The buggy address is located 72 bytes inside of
128-byte region [ffff88800add2700, ffff88800add2780)


Memory state around the buggy address:
ffff88800add2600: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
ffff88800add2680: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>ffff88800add2700: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
ffff88800add2780: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
ffff88800add2800: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc

To reproduce, apply this kernel patch to widen the race window:

diff --git a/fs/d_path.c b/fs/d_path.c
index e4e0ebad1f153..51fbed8deffe4 100644
--- a/fs/d_path.c
+++ b/fs/d_path.c
@@ -7,6 +7,7 @@
#include <linux/slab.h>
#include <linux/prefetch.h>
#include "mount.h"
+#include <linux/delay.h>

struct prepend_buffer {
char *buf;
@@ -117,6 +118,11 @@ static int __prepend_path(const struct dentry *dentry, const struct mount *mnt,
/* Global root */
mnt_ns = READ_ONCE(mnt->mnt_ns);
+ if (strcmp(current->comm, "SLOWME") == 0) {
+ pr_warn("%s: begin delay\n", __func__);
+ mdelay(1000);
+ pr_warn("%s: end delay\n", __func__);
+ }
/* open-coded is_mounted() to use local mnt_ns */
if (!IS_ERR_OR_NULL(mnt_ns) && !is_anon_ns(mnt_ns))
return 1; // absolute root

Then run this reproducer (build with "-pthread"):

#define _GNU_SOURCE
#include <pthread.h>
#include <unistd.h>
#include <err.h>
#include <fcntl.h>
#include <sys/syscall.h>
#include <sys/stat.h>
#include <sys/prctl.h>
#include <sys/mount.h>
#include <linux/mount.h>

/*
 * SYSCHK(expr) - evaluate a syscall-style expression once and abort on failure.
 *
 * Yields the value of expr. If that value equals -1 converted to the
 * expression's own type, err() reports the stringified expression and exits
 * with status 1. Calls that signal failure some other way (for example by
 * returning NULL) are not caught by this check.
 *
 * Uses a GNU statement expression; __typeof__ is spelled with underscores so
 * the macro also builds in strict ISO modes such as -std=c11.
 */
#define SYSCHK(x) ({                            \
  __typeof__(x) syschk_res_ = (x);              \
  if (syschk_res_ == (__typeof__(x))-1)         \
    err(1, "SYSCHK(" #x ")");                   \
  syschk_res_;                                  \
})

/*
 * Thin wrapper around the raw fsconfig syscall.
 *
 * Forwards all five arguments unchanged to syscall(__NR_fsconfig, ...) and,
 * through SYSCHK, terminates the program if the call returns -1.
 *
 * NOTE(review): newer C libraries may declare their own fsconfig() in
 * <sys/mount.h> with a different prototype - confirm there is no clash on the
 * target toolchain.
 */
void fsconfig(int fd, unsigned int cmd, char *key, void *value, int aux) {
  SYSCHK(syscall(__NR_fsconfig, fd, cmd, key, value, aux));
}

/*
 * Mount file descriptor shared between the two threads: main() stores the
 * result of the fsmount syscall here, and thread_fn() passes it to the
 * move_mount syscall. Holds -1 until main() assigns it.
 */
static int mnt_fd = -1;

/*
 * Thread body for the racing side of the reproducer.
 *
 * Attaches the mount referred to by mnt_fd at /dev/shm/test with the
 * move_mount syscall and then lazily detaches it again with
 * umount2(MNT_DETACH). Either syscall returning -1 terminates the program
 * via SYSCHK.
 *
 * dummy is unused; the function always returns NULL.
 */
static void *thread_fn(void *dummy) {
  (void)dummy;

  /*
   * Return value is not checked (presumably so that a directory left over
   * from an earlier run is tolerated).
   */
  mkdir("/dev/shm/test", 0700);

  /*
   * The source path is the empty string, so MOVE_MOUNT_F_EMPTY_PATH is
   * needed for the kernel to operate on mnt_fd itself.
   */
  SYSCHK(syscall(__NR_move_mount, mnt_fd, "", AT_FDCWD, "/dev/shm/test",
                 MOVE_MOUNT_F_EMPTY_PATH));
  SYSCHK(umount2("/dev/shm/test", MNT_DETACH));
  return NULL;
}

int main(void) {
int fs_fd = SYSCHK(syscall(__NR_fsopen, "tmpfs", 0));
fsconfig(fs_fd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
mnt_fd = SYSCHK(syscall(__NR_fsmount, fs_fd, 0, MOUNT_ATTR_NOSUID |

pthread_t thread;
if (pthread_create(&thread, NULL, thread_fn, NULL))
errx(1, "pthread_create");

char buf[0x10000];
SYSCHK(getcwd(buf, sizeof(buf)));
SYSCHK(prctl(PR_SET_NAME, "dummy"));
if (pthread_join(thread, NULL))
errx(1, "pthread_join");
return 0;