[RFC PATCH 0/3] querying mount attributes

From: Miklos Szeredi
Date: Wed Sep 13 2023 - 11:23:48 EST


Implement the mount querying syscalls agreed on at LSF/MM 2023. This is an
RFC with just x86_64 syscalls.

Except for notifications, this should allow a full replacement for
parsing /proc/self/mountinfo.

It is not a replacement for /proc/$OTHER_PID/mountinfo, since mount
namespace and root are taken from the current task. I guess namespace and
root could be switched before invoking these syscalls but that sounds a bit
complicated. Not sure if this is a problem.

Test utility attached at the end.
---

Miklos Szeredi (3):
add unique mount ID
add statmnt(2) syscall
add listmnt(2) syscall

arch/x86/entry/syscalls/syscall_64.tbl | 2 +
fs/internal.h | 5 +
fs/mount.h | 3 +-
fs/namespace.c | 365 +++++++++++++++++++++++++
fs/proc_namespace.c | 19 +-
fs/stat.c | 9 +-
fs/statfs.c | 1 +
include/linux/syscalls.h | 5 +
include/uapi/asm-generic/unistd.h | 8 +-
include/uapi/linux/mount.h | 36 +++
include/uapi/linux/stat.h | 1 +
11 files changed, 443 insertions(+), 11 deletions(-)

--
2.41.0

=== statmnt.c ===
#define _GNU_SOURCE
#include <unistd.h>
#include <stdio.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <err.h>

/*
 * Locator for one string inside the buffer filled by statmnt().
 * main() dereferences it as (buffer start + off).
 */
struct stmt_str {
/* Byte offset of the string, counted from the start of the result buffer */
__u32 off;
/*
 * Length of the string data in bytes; main() treats [off, off + len) as
 * the extent of sb_opts.  NOTE(review): whether len counts the trailing
 * NUL is not visible here -- confirm against the kernel side.
 */
__u32 len;
};

/*
 * Result record written by the statmnt() syscall at the start of the
 * caller's buffer.  The fixed-size fields below are followed (in the same
 * buffer) by string data that the stmt_str members point into.
 *
 * Only the field groups whose STMT_* bit is set in 'mask' on return are
 * meaningful; main() checks the bit before printing each group.
 *
 * NOTE(review): presumably a local copy of the definition the patch adds
 * to include/uapi/linux/mount.h -- keep the layout in sync with it.
 */
struct statmnt {
__u64 mask; /* What results were written [uncond] */
__u32 sb_dev_major; /* Device ID */
__u32 sb_dev_minor;
__u64 sb_magic; /* ..._SUPER_MAGIC */
__u32 sb_flags; /* MS_{RDONLY,SYNCHRONOUS,DIRSYNC,LAZYTIME} */
__u32 __spare1;
__u64 mnt_id; /* Unique ID of mount */
__u64 mnt_parent_id; /* Unique ID of parent (for root == mnt_id) */
__u32 mnt_id_old; /* Reused IDs used in proc/.../mountinfo */
__u32 mnt_parent_id_old;
__u64 mnt_attr; /* MOUNT_ATTR_... */
__u64 mnt_propagation; /* MS_{SHARED,SLAVE,PRIVATE,UNBINDABLE} */
__u64 mnt_peer_group; /* ID of shared peer group */
__u64 mnt_master; /* Mount receives propagation from this ID */
__u64 propagate_from; /* Propagation from in current namespace */
__u64 __spare[20];
struct stmt_str mnt_root; /* Root of mount relative to root of fs */
struct stmt_str mountpoint; /* Mountpoint relative to root of process */
struct stmt_str fs_type; /* Filesystem type[.subtype] */
struct stmt_str sb_opts; /* Super block string options (nul delimited) */
};

/*
 * Field-group selectors for statmnt().  The same bit is used in both
 * directions: set it in the request mask to ask for a group, and test it
 * in statmnt.mask on return to see whether the group was filled in.
 */
#define STMT_SB_BASIC		(1U << 0)	/* sb_* fields */
#define STMT_MNT_BASIC		(1U << 1)	/* mnt_* fields */
#define STMT_PROPAGATE_FROM	(1U << 2)	/* propagate_from */
#define STMT_MNT_ROOT		(1U << 3)	/* mnt_root string */
#define STMT_MOUNTPOINT		(1U << 4)	/* mountpoint string */
#define STMT_FS_TYPE		(1U << 5)	/* fs_type string */
#define STMT_SB_OPTS		(1U << 6)	/* sb_opts string list */

/*
 * Syscall numbers used to invoke statmnt()/listmnt() through syscall(2).
 * The cover letter states this RFC only wires up x86_64, so these values
 * are only meaningful on a kernel built with this patch set.
 */
#define __NR_statmnt 454
#define __NR_listmnt 455

/* statx() request/result bit; main() passes it to get the mount ID for a path */
#define STATX_MNT_ID_UNIQUE 0x00004000U /* Want/got extended stx_mount_id */

int main(int argc, char *argv[])
{
char buf[65536];
struct statmnt *st = (void *) buf;
char *end;
const char *arg = argv[1];
long res;
int list = 0;
unsigned long mnt_id;
unsigned int mask = STMT_SB_BASIC | STMT_MNT_BASIC | STMT_PROPAGATE_FROM | STMT_MNT_ROOT | STMT_MOUNTPOINT | STMT_FS_TYPE | STMT_SB_OPTS;

if (arg && strcmp(arg, "-l") == 0) {
list = 1;
arg = argv[2];
}
if (argc != list + 2)
errx(1, "usage: %s [-l] (mnt_id|path)", argv[0]);

mnt_id = strtol(arg, &end, 0);
if (!mnt_id || *end != '\0') {
struct statx sx;

res = statx(AT_FDCWD, arg, 0, STATX_MNT_ID_UNIQUE, &sx);
if (res == -1)
err(1, "%s", arg);

if (!(sx.stx_mask & (STATX_MNT_ID | STATX_MNT_ID_UNIQUE)))
errx(1, "Sorry, no mount ID");

mnt_id = sx.stx_mnt_id;
}


if (list) {
size_t size = 8192;
uint64_t list[size];
long i, num;

res = syscall(__NR_listmnt, mnt_id, list, size, 0);
if (res == -1)
err(1, "listmnt(%lu)", mnt_id);

num = res;
for (i = 0; i < num; i++) {
printf("0x%lx / ", list[i]);

res = syscall(__NR_statmnt, list[i], STMT_MNT_BASIC | STMT_MOUNTPOINT, &buf, sizeof(buf), 0);
if (res == -1) {
printf("???\t[%s]\n", strerror(errno));
} else {
printf("%u\t%s\n", st->mnt_id_old,
(st->mask & STMT_MOUNTPOINT) ? buf + st->mountpoint.off : "???");
}
}

return 0;
}

res = syscall(__NR_statmnt, mnt_id, mask, &buf, sizeof(buf), 0);
if (res == -1)
err(1, "statmnt(%lu)", mnt_id);

printf("mask: 0x%llx\n", st->mask);
if (st->mask & STMT_SB_BASIC) {
printf("sb_dev_major: %u\n", st->sb_dev_major);
printf("sb_dev_minor: %u\n", st->sb_dev_minor);
printf("sb_magic: 0x%llx\n", st->sb_magic);
printf("sb_flags: 0x%08x\n", st->sb_flags);
}
if (st->mask & STMT_MNT_BASIC) {
printf("mnt_id: 0x%llx\n", st->mnt_id);
printf("mnt_parent_id: 0x%llx\n", st->mnt_parent_id);
printf("mnt_id_old: %u\n", st->mnt_id_old);
printf("mnt_parent_id_old: %u\n", st->mnt_parent_id_old);
printf("mnt_attr: 0x%08llx\n", st->mnt_attr);
printf("mnt_propagation: %s%s%s%s\n",
st->mnt_propagation & MS_SHARED ? "shared," : "",
st->mnt_propagation & MS_SLAVE ? "slave," : "",
st->mnt_propagation & MS_UNBINDABLE ? "unbindable," : "",
st->mnt_propagation & MS_PRIVATE ? "private" : "");
printf("mnt_peer_group: %llu\n", st->mnt_peer_group);
printf("mnt_master: %llu\n", st->mnt_master);
}
if (st->mask & STMT_PROPAGATE_FROM) {
printf("propagate_from: %llu\n", st->propagate_from);
}
if (st->mask & STMT_MNT_ROOT) {
printf("mnt_root: %i/%u <%s>\n", st->mnt_root.off,
st->mnt_root.len, buf + st->mnt_root.off);
}
if (st->mask & STMT_MOUNTPOINT) {
printf("mountpoint: %i/%u <%s>\n", st->mountpoint.off,
st->mountpoint.len, buf + st->mountpoint.off);
}
if (st->mask & STMT_FS_TYPE) {
printf("fs_type: %i/%u <%s>\n", st->fs_type.off,
st->fs_type.len, buf + st->fs_type.off);
}

if (st->mask & STMT_SB_OPTS) {
char *p = buf + st->sb_opts.off;
char *end = p + st->sb_opts.len;

printf("sb_opts: %i/%u ", st->sb_opts.off, st->sb_opts.len);
for (; p < end; p += strlen(p) + 1)
printf("<%s>, ", p);
printf("\n");
}

return 0;
}