[PATCH 1/1] proc: "mount -o lookup=..." support
From: Alexey Dobriyan
Date: Mon Nov 22 2021 - 12:41:06 EST
Docker implements MaskedPaths configuration option
https://github.com/estesp/docker/blob/9c15e82f19b0ad3c5fe8617a8ec2dddc6639f40a/oci/defaults.go#L97
to disable certain /proc files. It overmounts them with /dev/null.
Implement proper mount option which selectively disables lookup/readdir
in the top level /proc directory so that MaskedPaths doesn't need
to be updated as time goes on.
Syntax is
Filter everything
# mount -t proc -o lookup=/ proc /proc
# ls /proc
dr-xr-xr-x 8 root root 0 Nov 22 21:12 995
lrwxrwxrwx 1 root root 0 Nov 22 21:12 self -> 1163
lrwxrwxrwx 1 root root 0 Nov 22 21:12 thread-self -> 1163/task/1163
Allow /proc/cpuinfo and /proc/uptime
# mount -t proc proc -o lookup=cpuinfo/uptime /proc
# ls /proc
...
dr-xr-xr-x 8 root root 0 Nov 22 21:12 995
-r--r--r-- 1 root root 0 Nov 22 21:12 cpuinfo
lrwxrwxrwx 1 root root 0 Nov 22 21:12 self -> 1163
lrwxrwxrwx 1 root root 0 Nov 22 21:12 thread-self -> 1163/task/1163
-r--r--r-- 1 root root 0 Nov 22 21:12 uptime
Trailing slash is optional but saves 1 allocation.
Trailing slash is mandatory for "filter everything".
Remounting with lookup= is disabled so that files and dcache entries
don't stay active while filter list is changed. Users are supposed
to unmount and mount again with different lookup= set.
Remount rules may change in the future. (Eric W. Biederman)
Re: speed
This is the price of filtering. Given that lookup= is a whitelist, it is
not supposed to be very long. Second, it is one linear memory scan per
lookup; there are no linked lists. It may in fact be faster than an rbtree.
It consumes 1 allocation per superblock, which is the list of names itself.
Signed-off-by: Alexey Dobriyan <adobriyan@xxxxxxxxx>
---
v2
documentation!
descriptive comments!
disable remount
Documentation/filesystems/proc.rst | 8 ++
fs/proc/generic.c | 18 ++--
fs/proc/internal.h | 31 ++++++-
fs/proc/proc_net.c | 2 +-
fs/proc/root.c | 127 ++++++++++++++++++++++++++++-
include/linux/proc_fs.h | 2 +
6 files changed, 178 insertions(+), 10 deletions(-)
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 8d7f141c6fc7..9a328f0b4346 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -2186,6 +2186,7 @@ The following mount options are supported:
hidepid= Set /proc/<pid>/ access mode.
gid= Set the group authorized to learn processes information.
subset= Show only the specified subset of procfs.
+ lookup= Top-level /proc filter, independent of subset=
========= ========================================================
hidepid=off or hidepid=0 means classic mode - everybody may access all
@@ -2218,6 +2219,13 @@ information about processes information, just add identd to this group.
subset=pid hides all top level files and directories in the procfs that
are not related to tasks.
+The lookup= mount option makes only the listed files/directories available
+in the top-level /proc directory. Individual names are separated
+by slashes. An empty list is equivalent to subset=pid. lookup= filters before
+subset= if both options are supplied. lookup= affects neither the availability
+of /proc/${pid} directories nor the /proc/self and /proc/thread-self
+symlinks. More fine-grained filtering is not supported at the moment.
+
Chapter 5: Filesystem behavior
==============================
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 5b78739e60e4..4d04f8d89cdc 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -282,7 +282,7 @@ struct dentry *proc_lookup(struct inode *dir, struct dentry *dentry,
* for success..
*/
int proc_readdir_de(struct file *file, struct dir_context *ctx,
- struct proc_dir_entry *de)
+ struct proc_dir_entry *de, const struct proc_lookup_list *ll)
{
int i;
@@ -307,12 +307,15 @@ int proc_readdir_de(struct file *file, struct dir_context *ctx,
struct proc_dir_entry *next;
pde_get(de);
read_unlock(&proc_subdir_lock);
- if (!dir_emit(ctx, de->name, de->namelen,
- de->low_ino, de->mode >> 12)) {
- pde_put(de);
- return 0;
+
+ if (in_lookup_list(ll, de->name, de->namelen)) {
+ if (!dir_emit(ctx, de->name, de->namelen, de->low_ino, de->mode >> 12)) {
+ pde_put(de);
+ return 0;
+ }
+ ctx->pos++;
}
- ctx->pos++;
+
read_lock(&proc_subdir_lock);
next = pde_subdir_next(de);
pde_put(de);
@@ -330,7 +333,8 @@ int proc_readdir(struct file *file, struct dir_context *ctx)
if (fs_info->pidonly == PROC_PIDONLY_ON)
return 1;
- return proc_readdir_de(file, ctx, PDE(inode));
+ return proc_readdir_de(file, ctx, PDE(inode),
+ PDE(inode) == &proc_root ? fs_info->lookup_list : NULL);
}
/*
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 03415f3fb3a8..e74acb437c56 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -190,7 +190,7 @@ struct proc_dir_entry *proc_register(struct proc_dir_entry *dir,
extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int);
struct dentry *proc_lookup_de(struct inode *, struct dentry *, struct proc_dir_entry *);
extern int proc_readdir(struct file *, struct dir_context *);
-int proc_readdir_de(struct file *, struct dir_context *, struct proc_dir_entry *);
+int proc_readdir_de(struct file *, struct dir_context *, struct proc_dir_entry *, const struct proc_lookup_list *);
static inline void pde_get(struct proc_dir_entry *pde)
{
@@ -318,3 +318,32 @@ static inline void pde_force_lookup(struct proc_dir_entry *pde)
/* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */
pde->proc_dops = &proc_net_dentry_ops;
}
+
+/*
+ * Pascal strings stitched together, making the filtering memory access
+ * pattern linear.
+ *
+ * Each entry is one length byte followed by that many name bytes (no NUL
+ * terminator); an entry with a zero length byte ends the list.
+ *
+ * "mount -t proc -o lookup=/" results in
+ *
+ * (u8[]){
+ * 0
+ * }
+ *
+ * "mount -t proc -o lookup=cpuinfo/uptime/" results in
+ *
+ * (u8[]){
+ * 7, 'c', 'p', 'u', 'i', 'n', 'f', 'o',
+ * 6, 'u', 'p', 't', 'i', 'm', 'e',
+ * 0
+ * }
+ */
+struct proc_lookup_list {
+ /* Number of bytes in str[]; 0 marks the end of the list. */
+ u8 len;
+ /* Name bytes; the next entry starts right after them. */
+ char str[];
+};
+
+/*
+ * Step from @ll to the entry stored directly behind it, i.e. past the
+ * length byte and the ll->len name bytes of @ll.
+ *
+ * The caller is responsible for not stepping past the terminating entry.
+ */
+static inline struct proc_lookup_list *lookup_list_next(const struct proc_lookup_list *ll)
+{
+	const char *end_of_name = ll->str + ll->len;
+
+	return (struct proc_lookup_list *)end_of_name;
+}
+
+bool in_lookup_list(const struct proc_lookup_list *ll, const char *str, unsigned int len);
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 15c2e55d2ed2..7941df2d3d74 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -321,7 +321,7 @@ static int proc_tgid_net_readdir(struct file *file, struct dir_context *ctx)
ret = -EINVAL;
net = get_proc_task_net(file_inode(file));
if (net != NULL) {
- ret = proc_readdir_de(file, ctx, net->proc_net);
+ ret = proc_readdir_de(file, ctx, net->proc_net, NULL);
put_net(net);
}
return ret;
diff --git a/fs/proc/root.c b/fs/proc/root.c
index c7e3b1350ef8..8000558d7d2c 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -35,18 +35,22 @@ struct proc_fs_context {
enum proc_hidepid hidepid;
int gid;
enum proc_pidonly pidonly;
+ struct proc_lookup_list *lookup_list;
+ unsigned int lookup_list_len;
};
enum proc_param {
Opt_gid,
Opt_hidepid,
Opt_subset,
+ Opt_lookup,
};
static const struct fs_parameter_spec proc_fs_parameters[] = {
fsparam_u32("gid", Opt_gid),
fsparam_string("hidepid", Opt_hidepid),
fsparam_string("subset", Opt_subset),
+ fsparam_string("lookup", Opt_lookup),
{}
};
@@ -112,6 +116,65 @@ static int proc_parse_subset_param(struct fs_context *fc, char *value)
return 0;
}
+/*
+ * Parse the value of the "lookup=" mount option into ctx->lookup_list.
+ *
+ * @fc:   filesystem context whose fs_private is the proc_fs_context to fill
+ * @str0: slash-separated list of top-level /proc names, for example
+ *        "cpuinfo/uptime" or "cpuinfo/uptime/"; a trailing slash is optional
+ *
+ * The names are encoded as consecutive length-prefixed strings followed by
+ * a zero length byte (see struct proc_lookup_list).
+ *
+ * Empty components (a leading slash, "a//b") are skipped. A zero length
+ * byte in the middle of the list would be taken for the terminator and
+ * every name after it would be silently dropped. "lookup=/" still produces
+ * the empty list, i.e. "filter everything".
+ *
+ * If the option is given more than once the last value wins and the list
+ * built for the earlier value is freed.
+ *
+ * Returns 0 on success, -EINVAL if a name is longer than 255 bytes and
+ * -ENOMEM if an allocation fails. On failure ctx is left unmodified.
+ */
+static int proc_parse_lookup_param(struct fs_context *fc, char *str0)
+{
+	struct proc_fs_context *ctx = fc->fs_private;
+	struct proc_lookup_list *head;
+	struct proc_lookup_list *ll;
+	char *str;
+	const char *slash;
+	const char *src;
+	unsigned int len;
+	int rv;
+
+	/* Force trailing slash, simplify loops below. */
+	len = strlen(str0);
+	if (len > 0 && str0[len - 1] == '/') {
+		str = str0;
+	} else {
+		str = kmalloc(len + 2, GFP_KERNEL);
+		if (!str)
+			return -ENOMEM;
+		memcpy(str, str0, len);
+		str[len] = '/';
+		str[len + 1] = '\0';
+	}
+
+	/* Pass 1: validate the names and compute the encoded size. */
+	len = 1;	/* terminating zero length byte */
+	for (src = str; (slash = strchr(src, '/')); src = slash + 1) {
+		if (slash == src)
+			continue;	/* empty component, nothing to store */
+		if (slash - src >= 256) {
+			/* Entry length must fit into the u8 length prefix. */
+			rv = -EINVAL;
+			goto out_free_str;
+		}
+		len += 1 + (slash - src);
+	}
+
+	head = kmalloc(len, GFP_KERNEL);
+	if (!head) {
+		rv = -ENOMEM;
+		goto out_free_str;
+	}
+
+	/* Pass 2: emit the entries, mirroring the skipping done in pass 1. */
+	ll = head;
+	for (src = str; (slash = strchr(src, '/')); src = slash + 1) {
+		if (slash == src)
+			continue;
+		ll->len = slash - src;
+		memcpy(ll->str, src, ll->len);
+		ll = lookup_list_next(ll);
+	}
+	ll->len = 0;
+
+	/* Drop the list of an earlier lookup= option instead of leaking it. */
+	kfree(ctx->lookup_list);
+	ctx->lookup_list = head;
+	ctx->lookup_list_len = len;
+	rv = 0;
+
+out_free_str:
+	/* str is a private copy only when the trailing slash had to be added. */
+	if (str != str0)
+		kfree(str);
+	return rv;
+}
+
static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct proc_fs_context *ctx = fc->fs_private;
@@ -137,6 +200,11 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
return -EINVAL;
break;
+ case Opt_lookup:
+ if (proc_parse_lookup_param(fc, param->string) < 0)
+ return -EINVAL;
+ break;
+
default:
return -EINVAL;
}
@@ -157,6 +225,10 @@ static void proc_apply_options(struct proc_fs_info *fs_info,
fs_info->hide_pid = ctx->hidepid;
if (ctx->mask & (1 << Opt_subset))
fs_info->pidonly = ctx->pidonly;
+ if (ctx->mask & (1 << Opt_lookup)) {
+ fs_info->lookup_list = ctx->lookup_list;
+ ctx->lookup_list = NULL;
+ }
}
static int proc_fill_super(struct super_block *s, struct fs_context *fc)
@@ -218,6 +290,14 @@ static int proc_reconfigure(struct fs_context *fc)
struct super_block *sb = fc->root->d_sb;
struct proc_fs_info *fs_info = proc_sb_info(sb);
+ /*
+ * "Hide everything" lookup filter is not a problem as only
+ * /proc/${pid}, /proc/self and /proc/thread-self are accessible.
+ */
+ if (fs_info->lookup_list && fs_info->lookup_list->len > 0) {
+ return invalfc(fc, "'-o remount,lookup=' is unsupported, unmount and mount instead");
+ }
+
sync_filesystem(sb);
proc_apply_options(fs_info, fc, current_user_ns());
@@ -234,11 +314,34 @@ static void proc_fs_context_free(struct fs_context *fc)
struct proc_fs_context *ctx = fc->fs_private;
put_pid_ns(ctx->pid_ns);
+ kfree(ctx->lookup_list);
kfree(ctx);
}
+/*
+ * ->dup() hook: give @fc a private copy of the proc_fs_context of @src_fc.
+ *
+ * @fc:     the new filesystem context being set up
+ * @src_fc: the context that is being duplicated
+ *
+ * The copy has to be deep because proc_fs_context_free() kfree()s
+ * ->lookup_list and drops a pid namespace reference for every context it
+ * is called on.
+ *
+ * Returns 0 on success, -ENOMEM if an allocation fails; @fc->fs_private is
+ * only written on success.
+ */
+static int proc_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc)
+{
+	/*
+	 * The data to copy belongs to the source context, not to @fc.
+	 * NOTE(review): the fs core presumably hands over @fc without private
+	 * data of its own -- confirm against vfs_dup_fs_context().
+	 */
+	const struct proc_fs_context *src = src_fc->fs_private;
+	struct proc_fs_context *dst;
+
+	dst = kmemdup(src, sizeof(*dst), GFP_KERNEL);
+	if (!dst)
+		return -ENOMEM;
+
+	/*
+	 * The source has no list if lookup= was never given, or after
+	 * proc_apply_options() moved the list to the superblock (which resets
+	 * the pointer but not lookup_list_len). Only copy a list that exists;
+	 * otherwise dst->lookup_list stays NULL as copied above.
+	 */
+	if (src->lookup_list) {
+		dst->lookup_list = kmemdup(src->lookup_list, src->lookup_list_len, GFP_KERNEL);
+		if (!dst->lookup_list) {
+			kfree(dst);
+			return -ENOMEM;
+		}
+	}
+
+	/* The copy holds its own reference, dropped in proc_fs_context_free(). */
+	get_pid_ns(dst->pid_ns);
+
+	fc->fs_private = dst;
+	return 0;
+}
+
static const struct fs_context_operations proc_fs_context_ops = {
.free = proc_fs_context_free,
+ .dup = proc_fs_context_dup,
.parse_param = proc_parse_param,
.get_tree = proc_get_tree,
.reconfigure = proc_reconfigure,
@@ -274,6 +377,7 @@ static void proc_kill_sb(struct super_block *sb)
kill_anon_super(sb);
put_pid_ns(fs_info->pid_ns);
+ kfree(fs_info->lookup_list);
kfree(fs_info);
}
@@ -317,12 +421,33 @@ static int proc_root_getattr(struct user_namespace *mnt_userns,
return 0;
}
+/*
+ * Check whether the name @str of @len bytes passes the filter list @ll.
+ *
+ * A NULL @ll means that no filter is installed: every name passes.
+ * Otherwise the entries are walked up to the zero length terminator and the
+ * name passes only if one entry has the same length and the same bytes.
+ */
+bool in_lookup_list(const struct proc_lookup_list *ll, const char *str, unsigned int len)
+{
+	const struct proc_lookup_list *entry;
+
+	if (!ll)
+		return true;
+
+	entry = ll;
+	while (entry->len > 0) {
+		if (entry->len == len && strncmp(entry->str, str, len) == 0)
+			return true;
+		entry = lookup_list_next(entry);
+	}
+	return false;
+}
+
static struct dentry *proc_root_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags)
{
+ struct proc_fs_info *proc_sb = proc_sb_info(dir->i_sb);
+
if (!proc_pid_lookup(dentry, flags))
return NULL;
- return proc_lookup(dir, dentry, flags);
+ if (in_lookup_list(proc_sb->lookup_list, dentry->d_name.name, dentry->d_name.len)) {
+ return proc_lookup(dir, dentry, flags);
+ } else {
+ return NULL;
+ }
+
}
static int proc_root_readdir(struct file *file, struct dir_context *ctx)
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h
index 069c7fd95396..d2c067560bf9 100644
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -10,6 +10,7 @@
#include <linux/fs.h>
struct proc_dir_entry;
+struct proc_lookup_list;
struct seq_file;
struct seq_operations;
@@ -65,6 +66,7 @@ struct proc_fs_info {
kgid_t pid_gid;
enum proc_hidepid hide_pid;
enum proc_pidonly pidonly;
+ const struct proc_lookup_list *lookup_list;
};
static inline struct proc_fs_info *proc_sb_info(struct super_block *sb)
--
2.31.1