Re: file metadata via fs API (was: [GIT PULL] Filesystem Information)

From: Miklos Szeredi
Date: Tue Aug 11 2020 - 09:54:54 EST


On Wed, Aug 05, 2020 at 10:24:23AM +0200, Miklos Szeredi wrote:
> On Tue, Aug 4, 2020 at 4:36 PM Miklos Szeredi <miklos@xxxxxxxxxx> wrote:
>
> > I think we already lost that with the xattr API, that should have been
> > done in a way that fits this philosophy. But given that we have "/"
> > as the only special purpose char in filenames, and even repetitions
> > are allowed, it's hard to think of a good way to do that. Pity.
>
> One way this could be solved is to allow opting into an alternative
> path resolution mode.
>
> E.g.
> openat(AT_FDCWD, "foo/bar//mnt/info", O_RDONLY | O_ALT);

Proof of concept patch and test program below.

Opted for triple slash in the hope that just maybe we could add a global
/proc/sys/fs/resolve_alt knob to optionally turn on alternative (non-POSIX) path
resolution without breaking too many things. Will try that later...

Comments?

Thanks,
Miklos

cat_alt.c:
-------- >8 --------
#define _GNU_SOURCE
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <linux/unistd.h>
#include <linux/openat2.h>

/*
 * Local copy of the resolve flag that the accompanying kernel patch adds to
 * include/uapi/linux/openat2.h; the two values must stay in sync.
 */
#define RESOLVE_ALT 0x20 /* Alternative path walk mode where
multiple slashes have special meaning */

/*
 * cat_alt - copy a file opened via openat2(RESOLVE_ALT) to standard output.
 *
 * usage: cat_alt path [dirfd] [--nofollow]
 *
 *   path        pathname handed to openat2(); with RESOLVE_ALT a "///"
 *               sequence selects the alternative (metadata) lookup
 *   dirfd       optional directory file descriptor the path is relative to
 *               (defaults to AT_FDCWD); any base accepted by strtol()
 *   --nofollow  add O_NOFOLLOW to the open flags
 *
 * Exits with status 1 and a diagnostic on bad usage or on any open, read
 * or write failure; returns 0 once the whole file has been copied.
 */
int main(int argc, char *argv[])
{
	struct open_how how = {
		.flags = O_RDONLY,
		.resolve = RESOLVE_ALT,
	};
	char buf[65536], *end;
	const char *path;
	int dfd = AT_FDCWD;
	int fd, i;

	if (argc < 2 || argc > 4)
		errx(1, "usage: %s path [dirfd] [--nofollow]",
		     argc > 0 ? argv[0] : "cat_alt");

	/* Only touch argv[1] once we know it exists. */
	path = argv[1];

	for (i = 2; i < argc; i++) {
		if (strcmp(argv[i], "--nofollow") == 0) {
			how.flags |= O_NOFOLLOW;
		} else {
			long val;

			/*
			 * Parse as signed so that AT_FDCWD (negative) can still
			 * be passed explicitly, but refuse anything that would
			 * be silently truncated when stored in an int.
			 */
			errno = 0;
			val = strtol(argv[i], &end, 0);
			if (end == argv[i] || *end || errno ||
			    val < INT_MIN || val > INT_MAX)
				errx(1, "invalid dirfd: %s", argv[i]);
			dfd = (int)val;
		}
	}

	fd = (int)syscall(__NR_openat2, dfd, path, &how, sizeof(how));
	if (fd == -1)
		err(1, "failed to open %s", path);

	for (;;) {
		ssize_t res = read(fd, buf, sizeof(buf));
		ssize_t done = 0;

		if (res == -1) {
			if (errno == EINTR)
				continue;
			err(1, "failed to read file");
		}
		if (res == 0)
			break;

		/* write() may be short (pipes, signals): loop until all is out. */
		while (done < res) {
			ssize_t out = write(STDOUT_FILENO, buf + done,
					    (size_t)(res - done));

			if (out == -1) {
				if (errno == EINTR)
					continue;
				err(1, "failed to write to stdout");
			}
			done += out;
		}
	}
	close(fd);
	return 0;
}
-------- >8 --------

---
fs/Makefile | 2
fs/file_table.c | 70 ++++++++++++++--------
fs/fsmeta.c | 135 +++++++++++++++++++++++++++++++++++++++++++
fs/internal.h | 9 ++
fs/mount.h | 4 +
fs/namei.c | 77 +++++++++++++++++++++---
fs/namespace.c | 12 +++
fs/open.c | 2
fs/proc_namespace.c | 2
include/linux/fcntl.h | 2
include/linux/namei.h | 3
include/uapi/linux/magic.h | 1
include/uapi/linux/openat2.h | 2
13 files changed, 282 insertions(+), 39 deletions(-)

--- a/fs/Makefile
+++ b/fs/Makefile
@@ -13,7 +13,7 @@ obj-y := open.o read_write.o file_table.
seq_file.o xattr.o libfs.o fs-writeback.o \
pnode.o splice.o sync.o utimes.o d_path.o \
stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \
- fs_types.o fs_context.o fs_parser.o fsopen.o
+ fs_types.o fs_context.o fs_parser.o fsopen.o fsmeta.o \

ifeq ($(CONFIG_BLOCK),y)
obj-y += buffer.o block_dev.o direct-io.o mpage.o
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -178,22 +178,9 @@ struct file *alloc_empty_file_noaccount(
return f;
}

-/**
- * alloc_file - allocate and initialize a 'struct file'
- *
- * @path: the (dentry, vfsmount) pair for the new file
- * @flags: O_... flags with which the new file will be opened
- * @fop: the 'struct file_operations' for the new file
- */
-static struct file *alloc_file(const struct path *path, int flags,
- const struct file_operations *fop)
+static void init_file(struct file *file, const struct path *path, int flags,
+ const struct file_operations *fop)
{
- struct file *file;
-
- file = alloc_empty_file(flags, current_cred());
- if (IS_ERR(file))
- return file;
-
file->f_path = *path;
file->f_inode = path->dentry->d_inode;
file->f_mapping = path->dentry->d_inode->i_mapping;
@@ -209,31 +196,66 @@ static struct file *alloc_file(const str
file->f_op = fop;
if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
i_readcount_inc(path->dentry->d_inode);
+}
+
+/**
+ * alloc_file - allocate and initialize a 'struct file'
+ *
+ * @path: the (dentry, vfsmount) pair for the new file
+ * @flags: O_... flags with which the new file will be opened
+ * @fop: the 'struct file_operations' for the new file
+ */
+static struct file *alloc_file(const struct path *path, int flags,
+ const struct file_operations *fop)
+{
+ struct file *file;
+
+ file = alloc_empty_file(flags, current_cred());
+ if (IS_ERR(file))
+ return file;
+
+ init_file(file, path, flags, fop);
+
return file;
}

-struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
- const char *name, int flags,
- const struct file_operations *fops)
+int init_file_pseudo(struct file *file, struct inode *inode,
+ struct vfsmount *mnt, const char *name, int flags,
+ const struct file_operations *fops)
{
static const struct dentry_operations anon_ops = {
.d_dname = simple_dname
};
struct qstr this = QSTR_INIT(name, strlen(name));
struct path path;
- struct file *file;

path.dentry = d_alloc_pseudo(mnt->mnt_sb, &this);
if (!path.dentry)
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
if (!mnt->mnt_sb->s_d_op)
d_set_d_op(path.dentry, &anon_ops);
path.mnt = mntget(mnt);
d_instantiate(path.dentry, inode);
- file = alloc_file(&path, flags, fops);
- if (IS_ERR(file)) {
- ihold(inode);
- path_put(&path);
+ init_file(file, &path, flags, fops);
+
+ return 0;
+}
+
+struct file *alloc_file_pseudo(struct inode *inode, struct vfsmount *mnt,
+ const char *name, int flags,
+ const struct file_operations *fops)
+{
+ struct file *file;
+ int err;
+
+ file = alloc_empty_file(flags, current_cred());
+ if (IS_ERR(file))
+ return file;
+
+ err = init_file_pseudo(file, inode, mnt, name, flags, fops);
+ if (err) {
+ fput(file);
+ file = ERR_PTR(err);
}
return file;
}
--- /dev/null
+++ b/fs/fsmeta.c
@@ -0,0 +1,135 @@
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/magic.h>
+#include <linux/seq_file.h>
+#include <linux/fs_struct.h>
+#include <linux/pseudo_fs.h>
+
+#include "mount.h"
+#include "internal.h"
+
+static struct vfsmount *fsmeta_mnt;
+static struct inode *fsmeta_inode;
+
+
+static struct vfsmount *fsmeta_mnt_info_get_mnt(struct seq_file *seq)
+{ /* Map the list link stored in p->cursor.mnt_list.next back to its mount's vfsmount. */
+ struct proc_mounts *p = seq->private;
+
+ return &list_entry(p->cursor.mnt_list.next, struct mount, mnt_list)->mnt;
+}
+
+static void *fsmeta_mnt_info_start(struct seq_file *seq, loff_t *pos)
+{ /* Begin iteration: yields the stashed mount at *pos == 0, NULL (nothing more) otherwise. */
+ mnt_namespace_lock_read(); /* taken even when NULL is returned; fsmeta_mnt_info_stop() unlocks */
+ return *pos == 0 ? fsmeta_mnt_info_get_mnt(seq) : NULL;
+}
+
+static void *fsmeta_mnt_info_next(struct seq_file *seq, void *v, loff_t *pos)
+{ /* Advance past the single record: bump *pos and report end of sequence. */
+ ++*pos;
+ return NULL;
+}
+
+static void fsmeta_mnt_info_stop(struct seq_file *seq, void *v)
+{ /* End iteration: drop the read lock taken by fsmeta_mnt_info_start(). */
+ mnt_namespace_unlock_read();
+}
+
+static int fsmeta_mnt_info_show(struct seq_file *seq, void *v)
+{ /* Emit the record for v (the vfsmount handed out by the start hook) via show_mountinfo(). */
+ return show_mountinfo(seq, v);
+}
+
+static const struct seq_operations fsmeta_mnt_info_sops = { /* single-record iterator over one mount */
+ .start = fsmeta_mnt_info_start,
+ .next = fsmeta_mnt_info_next,
+ .stop = fsmeta_mnt_info_stop,
+ .show = fsmeta_mnt_info_show,
+};
+
+static int fsmeta_mnt_info_release(struct inode *inode, struct file *file)
+{ /* Release hook: drop the references taken in fsmeta_mnt_info_open() and free the seq_file. */
+ if (file->private_data) { /* NOTE(review): presumably NULL if seq_open_private() never succeeded - confirm */
+ struct seq_file *seq = file->private_data;
+ struct proc_mounts *p = seq->private;
+
+ mntput(fsmeta_mnt_info_get_mnt(seq)); /* pairs with mntget(path->mnt) in the open path */
+ path_put(&p->root); /* pairs with get_fs_root() in the open path */
+
+ return seq_release_private(inode, file);
+ }
+ return 0;
+}
+
+static const struct file_operations fsmeta_mnt_info_fops = { /* read-only, non-seekable seq_file; no .open or .write */
+ .release = fsmeta_mnt_info_release,
+ .read = seq_read,
+ .llseek = no_llseek,
+};
+
+static int fsmeta_mnt_info_open(struct file *file, const struct path *path,
+ const struct open_flags *op)
+{ /* Turn the pre-allocated file into a seq_file reporting on path->mnt; returns 0 or a negative errno. */
+ struct proc_mounts *p;
+ int err;
+
+ err = init_file_pseudo(file, fsmeta_inode, fsmeta_mnt, "[mnt.info]",
+ op->open_flag, &fsmeta_mnt_info_fops);
+ if (err)
+ return err;
+ /*
+ * This reference is now sunk in file->f_path.dentry->d_inode and will
+ * be released by fput()
+ */
+ ihold(fsmeta_inode);
+
+ err = seq_open_private(file, &fsmeta_mnt_info_sops, sizeof(*p)); /* private area is a struct proc_mounts */
+ if (err)
+ return err;
+
+ p = ((struct seq_file *)file->private_data)->private;
+ get_fs_root(current->fs, &p->root); /* current task's root; put in fsmeta_mnt_info_release() */
+ p->cursor.mnt_list.next = &real_mount(mntget(path->mnt))->mnt_list; /* pin the mount and stash its list link for the iterator */
+
+ return 0;
+}
+
+int fsmeta_open(const char *meta_name, const struct path *path,
+ struct file *file, const struct open_flags *op)
+{ /* Open metadata file meta_name for the object at path into file; -EINVAL on unsupported flags or name. */
+ if (op->open_flag & ~(O_LARGEFILE | O_CLOEXEC | O_NOFOLLOW)) /* any other open flag is rejected */
+ return -EINVAL;
+
+ if (strcmp(meta_name, "mnt/info") == 0) /* the only name implemented here */
+ return fsmeta_mnt_info_open(file, path, op);
+
+ pr_info("invalid fsmeta file <%s> on %pd4\n", meta_name, path->dentry); /* NOTE(review): meta_name comes from the looked-up path string */
+ return -EINVAL;
+}
+
+static int fsmeta_init_fs_context(struct fs_context *fc)
+{ /* Set up fc through init_pseudo() with FSMETA_MAGIC; a NULL result is reported as -ENOMEM. */
+ return init_pseudo(fc, FSMETA_MAGIC) ? 0 : -ENOMEM;
+}
+
+static struct file_system_type fsmeta_fs_type = { /* filesystem type that fsmeta_init() mounts with kern_mount() */
+ .name = "fsmeta",
+ .init_fs_context = fsmeta_init_fs_context,
+ .kill_sb = kill_anon_super,
+};
+
+static int __init fsmeta_init(void)
+{ /* Initcall: create the internal fsmeta mount and the anonymous inode used by fsmeta_mnt_info_open(). */
+ fsmeta_mnt = kern_mount(&fsmeta_fs_type);
+ if (IS_ERR(fsmeta_mnt)) /* failure here is treated as fatal */
+ panic("fsmeta_init() kernel mount failed (%ld)\n", PTR_ERR(fsmeta_mnt));
+
+ fsmeta_inode = alloc_anon_inode(fsmeta_mnt->mnt_sb);
+ if (IS_ERR(fsmeta_inode)) /* likewise fatal */
+ panic("fsmeta_init() inode allocation failed (%ld)\n", PTR_ERR(fsmeta_inode));
+
+ return 0;
+}
+fs_initcall(fsmeta_init);
+
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -99,6 +99,9 @@ extern void chroot_fs_refs(const struct
*/
extern struct file *alloc_empty_file(int, const struct cred *);
extern struct file *alloc_empty_file_noaccount(int, const struct cred *);
+extern int init_file_pseudo(struct file *file, struct inode *inode,
+ struct vfsmount *mnt, const char *name, int flags,
+ const struct file_operations *fops);

/*
* super.c
@@ -185,3 +188,9 @@ int sb_init_dio_done_wq(struct super_blo
*/
int do_statx(int dfd, const char __user *filename, unsigned flags,
unsigned int mask, struct statx __user *buffer);
+
+/*
+ * fs/fsmeta.c
+ */
+int fsmeta_open(const char *meta_name, const struct path *path,
+ struct file *file, const struct open_flags *op);
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -159,3 +159,7 @@ static inline bool is_anon_ns(struct mnt
}

extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor);
+
+void mnt_namespace_lock_read(void);
+void mnt_namespace_unlock_read(void);
+int show_mountinfo(struct seq_file *m, struct vfsmount *mnt);
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2094,6 +2094,30 @@ static inline u64 hash_name(const void *

#endif

+static int lookup_alt(const char *name, struct nameidata *nd)
+{ /* End the walk at a triple-slash marker: the text after it becomes the LAST_META name; 0 or -ECHILD. */
+ if ((nd->flags & LOOKUP_RCU) && unlazy_walk(nd) != 0) /* must be able to leave RCU mode, else -ECHILD */
+ return -ECHILD;
+
+ nd->last.name = name + 3; /* skip the three slashes matched by is_alt(); only .name is set here */
+ nd->last_type = LAST_META;
+
+ return 0;
+}
+
+static bool is_alt(const char *name, struct nameidata *nd, int depth)
+{ /* True if name (already at a '/') begins with exactly three slashes and LOOKUP_ALT applies at this depth. */
+ if (!(nd->flags & LOOKUP_ALT))
+ return false;
+
+ /* no alternative lookup inside symlinks */
+ if (depth)
+ return false;
+
+ /* name[0] has already been verified to be a slash */
+ return name[1] == '/' && name[2] == '/' && name[3] != '/'; /* a fourth slash disqualifies the marker */
+}
+
/*
* Name resolution.
* This is the basic name resolution function, turning a pathname into
@@ -2111,8 +2135,13 @@ static int link_path_walk(const char *na
nd->flags |= LOOKUP_PARENT;
if (IS_ERR(name))
return PTR_ERR(name);
- while (*name=='/')
- name++;
+ if (*name == '/') {
+ if (!is_alt(name, nd, depth)) {
+ do {
+ name++;
+ } while (*name == '/');
+ }
+ }
if (!*name)
return 0;

@@ -2122,6 +2151,9 @@ static int link_path_walk(const char *na
u64 hash_len;
int type;

+ if (*name == '/')
+ return lookup_alt(name, nd);
+
err = may_lookup(nd);
if (err)
return err;
@@ -2163,6 +2195,13 @@ static int link_path_walk(const char *na
* If it wasn't NUL, we know it was '/'. Skip that
* slash, and continue until no more slashes.
*/
+ if (is_alt(name, nd, depth)) {
+ link = walk_component(nd, WALK_TRAILING);
+ if (unlikely(link))
+ goto LINK;
+
+ return lookup_alt(name, nd);
+ }
do {
name++;
} while (unlikely(*name == '/'));
@@ -2183,6 +2222,7 @@ static int link_path_walk(const char *na
link = walk_component(nd, WALK_MORE);
}
if (unlikely(link)) {
+LINK:
if (IS_ERR(link))
return PTR_ERR(link);
/* a symlink to follow */
@@ -2239,11 +2279,11 @@ static const char *path_init(struct name
nd->path.dentry = NULL;

/* Absolute pathname -- fetch the root (LOOKUP_IN_ROOT uses nd->dfd). */
- if (*s == '/' && !(flags & LOOKUP_IN_ROOT)) {
+ if (*s == '/' && !is_alt(s, nd, 0) && !(flags & LOOKUP_IN_ROOT)) {
error = nd_jump_root(nd);
if (unlikely(error))
return ERR_PTR(error);
- return s;
+ return s + 1;
}

/* Relative pathname -- get the starting-point it is relative to. */
@@ -2272,7 +2312,8 @@ static const char *path_init(struct name

dentry = f.file->f_path.dentry;

- if (*s && unlikely(!d_can_lookup(dentry))) {
+ if (*s && unlikely(!d_can_lookup(dentry)) &&
+ !is_alt(s, nd, 0)) {
fdput(f);
return ERR_PTR(-ENOTDIR);
}
@@ -2303,6 +2344,9 @@ static const char *path_init(struct name

static inline const char *lookup_last(struct nameidata *nd)
{
+ if (nd->last_type == LAST_META)
+ return ERR_PTR(-EINVAL);
+
if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len])
nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;

@@ -2331,7 +2375,7 @@ static int path_lookupat(struct nameidat

while (!(err = link_path_walk(s, nd)) &&
(s = lookup_last(nd)) != NULL)
- ;
+ nd->flags &= ~LOOKUP_ALT;
if (!err)
err = complete_walk(nd);

@@ -2410,9 +2454,15 @@ static struct filename *filename_parenta
if (unlikely(retval == -ESTALE))
retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent);
if (likely(!retval)) {
- *last = nd.last;
- *type = nd.last_type;
- audit_inode(name, parent->dentry, AUDIT_INODE_PARENT);
+ if (nd.last_type == LAST_META) {
+ path_put(parent);
+ putname(name);
+ name = ERR_PTR(-EINVAL);
+ } else {
+ *last = nd.last;
+ *type = nd.last_type;
+ audit_inode(name, parent->dentry, AUDIT_INODE_PARENT);
+ }
} else {
putname(name);
name = ERR_PTR(retval);
@@ -3123,6 +3173,10 @@ static const char *open_last_lookups(str
nd->flags |= op->intent;

if (nd->last_type != LAST_NORM) {
+ if (nd->last_type == LAST_META) {
+ return ERR_PTR(fsmeta_open(nd->last.name, &nd->path,
+ file, op));
+ }
if (nd->depth)
put_link(nd);
return handle_dots(nd, nd->last_type);
@@ -3206,6 +3260,9 @@ static int do_open(struct nameidata *nd,
int acc_mode;
int error;

+ if (nd->last_type == LAST_META)
+ return 0;
+
if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) {
error = complete_walk(nd);
if (error)
@@ -3355,7 +3412,7 @@ static struct file *path_openat(struct n
const char *s = path_init(nd, flags);
while (!(error = link_path_walk(s, nd)) &&
(s = open_last_lookups(nd, file, op)) != NULL)
- ;
+ nd->flags &= ~LOOKUP_ALT;
if (!error)
error = do_open(nd, file, op);
terminate_walk(nd);
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -69,7 +69,7 @@ static DEFINE_IDA(mnt_group_ida);
static struct hlist_head *mount_hashtable __read_mostly;
static struct hlist_head *mountpoint_hashtable __read_mostly;
static struct kmem_cache *mnt_cache __read_mostly;
-static DECLARE_RWSEM(namespace_sem);
+DECLARE_RWSEM(namespace_sem);
static HLIST_HEAD(unmounted); /* protected by namespace_sem */
static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */

@@ -1435,6 +1435,16 @@ static inline void namespace_lock(void)
down_write(&namespace_sem);
}

+void mnt_namespace_lock_read(void)
+{ /* Take namespace_sem for reading; pair with mnt_namespace_unlock_read(). */
+ down_read(&namespace_sem);
+}
+
+void mnt_namespace_unlock_read(void)
+{ /* Release the read hold on namespace_sem taken by mnt_namespace_lock_read(). */
+ up_read(&namespace_sem);
+}
+
enum umount_tree_flags {
UMOUNT_SYNC = 1,
UMOUNT_PROPAGATE = 2,
--- a/fs/open.c
+++ b/fs/open.c
@@ -1098,6 +1098,8 @@ inline int build_open_flags(const struct
lookup_flags |= LOOKUP_BENEATH;
if (how->resolve & RESOLVE_IN_ROOT)
lookup_flags |= LOOKUP_IN_ROOT;
+ if (how->resolve & RESOLVE_ALT)
+ lookup_flags |= LOOKUP_ALT;

op->lookup_flags = lookup_flags;
return 0;
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -128,7 +128,7 @@ static int show_vfsmnt(struct seq_file *
return err;
}

-static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
+int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
{
struct proc_mounts *p = m->private;
struct mount *r = real_mount(mnt);
--- a/include/linux/fcntl.h
+++ b/include/linux/fcntl.h
@@ -19,7 +19,7 @@
/* List of all valid flags for the how->resolve argument: */
#define VALID_RESOLVE_FLAGS \
(RESOLVE_NO_XDEV | RESOLVE_NO_MAGICLINKS | RESOLVE_NO_SYMLINKS | \
- RESOLVE_BENEATH | RESOLVE_IN_ROOT)
+ RESOLVE_BENEATH | RESOLVE_IN_ROOT | RESOLVE_ALT)

/* List of all open_how "versions". */
#define OPEN_HOW_SIZE_VER0 24 /* sizeof first published struct */
--- a/include/linux/namei.h
+++ b/include/linux/namei.h
@@ -15,7 +15,7 @@ enum { MAX_NESTED_LINKS = 8 };
/*
* Type of the last component on LOOKUP_PARENT
*/
-enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT};
+enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_META};

/* pathwalk mode */
#define LOOKUP_FOLLOW 0x0001 /* follow links at the end */
@@ -27,6 +27,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LA

#define LOOKUP_REVAL 0x0020 /* tell ->d_revalidate() to trust no cache */
#define LOOKUP_RCU 0x0040 /* RCU pathwalk mode; semi-internal */
+#define LOOKUP_ALT 0x200000 /* Alternative path walk mode */

/* These tell filesystem methods that we are dealing with the final component... */
#define LOOKUP_OPEN 0x0100 /* ... in open */
--- a/include/uapi/linux/magic.h
+++ b/include/uapi/linux/magic.h
@@ -88,6 +88,7 @@
#define BPF_FS_MAGIC 0xcafe4a11
#define AAFS_MAGIC 0x5a3c69f0
#define ZONEFS_MAGIC 0x5a4f4653
+#define FSMETA_MAGIC 0x9f8ea387

/* Since UDF 2.01 is ISO 13346 based... */
#define UDF_SUPER_MAGIC 0x15013346
--- a/include/uapi/linux/openat2.h
+++ b/include/uapi/linux/openat2.h
@@ -35,5 +35,7 @@ struct open_how {
#define RESOLVE_IN_ROOT 0x10 /* Make all jumps to "/" and ".."
be scoped inside the dirfd
(similar to chroot(2)). */
+#define RESOLVE_ALT 0x20 /* Alternative path walk mode where
+ multiple slashes have special meaning */

#endif /* _UAPI_LINUX_OPENAT2_H */