[RFC 1/2] ns: introduce binfmt_misc namespace

From: Laurent Vivier
Date: Sun Sep 30 2018 - 19:47:32 EST


Signed-off-by: Laurent Vivier <laurent@xxxxxxxxx>
---
fs/proc/namespaces.c | 3 +
include/linux/binfmt_namespace.h | 51 +++++++++++
include/linux/nsproxy.h | 2 +
include/linux/proc_ns.h | 2 +
include/linux/user_namespace.h | 1 +
include/uapi/linux/sched.h | 1 +
init/Kconfig | 8 ++
kernel/Makefile | 1 +
kernel/binfmt_namespace.c | 153 +++++++++++++++++++++++++++++++
kernel/fork.c | 3 +-
kernel/nsproxy.c | 18 +++-
11 files changed, 240 insertions(+), 3 deletions(-)
create mode 100644 include/linux/binfmt_namespace.h
create mode 100644 kernel/binfmt_namespace.c

diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index dd2b35f78b09..4d86549a788f 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -33,6 +33,9 @@ static const struct proc_ns_operations *ns_entries[] = {
#ifdef CONFIG_CGROUPS
&cgroupns_operations,
#endif
+#ifdef CONFIG_BINFMT_NS
+ &binfmtns_operations,
+#endif
};

static const char *proc_ns_get_link(struct dentry *dentry,
diff --git a/include/linux/binfmt_namespace.h b/include/linux/binfmt_namespace.h
new file mode 100644
index 000000000000..8688869ee254
--- /dev/null
+++ b/include/linux/binfmt_namespace.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_BINFMT_NAMESPACE_H
+#define _LINUX_BINFMT_NAMESPACE_H
+
+struct user_namespace;
+extern struct user_namespace init_user_ns;
+
+struct binfmt_namespace {
+ struct kref kref;
+ struct user_namespace *user_ns;
+ struct ucounts *ucounts;
+ struct ns_common ns;
+} __randomize_layout;
+extern struct binfmt_namespace init_binfmt_ns;
+
+#ifdef CONFIG_BINFMT_NS
+static inline void get_binfmt_ns(struct binfmt_namespace *ns)
+{
+ if (ns)
+ kref_get(&ns->kref);
+}
+
+extern struct binfmt_namespace *copy_binfmt_ns(unsigned long flags,
+ struct user_namespace *user_ns, struct binfmt_namespace *old_ns);
+extern void free_binfmt_ns(struct kref *kref);
+
+static inline void put_binfmt_ns(struct binfmt_namespace *ns)
+{
+ if (ns)
+ kref_put(&ns->kref, free_binfmt_ns);
+}
+
+#else
+static inline void get_binfmt_ns(struct binfmt_namespace *ns)
+{
+}
+
+static inline void put_binfmt_ns(struct binfmt_namespace *ns)
+{
+}
+
+static inline struct binfmt_namespace *copy_binfmt_ns(unsigned long flags,
+ struct user_namespace *user_ns, struct binfmt_namespace *old_ns)
+{
+ if (flags & CLONE_NEWBINFMT)
+ return ERR_PTR(-EINVAL);
+
+ return old_ns;
+}
+#endif
+#endif /* _LINUX_BINFMT_NAMESPACE_H */
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index 2ae1b1a4d84d..8d2294477095 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -10,6 +10,7 @@ struct uts_namespace;
struct ipc_namespace;
struct pid_namespace;
struct cgroup_namespace;
+struct binfmt_namespace;
struct fs_struct;

/*
@@ -36,6 +37,7 @@ struct nsproxy {
struct pid_namespace *pid_ns_for_children;
struct net *net_ns;
struct cgroup_namespace *cgroup_ns;
+ struct binfmt_namespace *binfmt_ns;
};
extern struct nsproxy init_nsproxy;

diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index d31cb6215905..6afa2dbc5204 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -32,6 +32,7 @@ extern const struct proc_ns_operations pidns_for_children_operations;
extern const struct proc_ns_operations userns_operations;
extern const struct proc_ns_operations mntns_operations;
extern const struct proc_ns_operations cgroupns_operations;
+extern const struct proc_ns_operations binfmtns_operations;

/*
* We always define these enumerators
@@ -43,6 +44,7 @@ enum {
PROC_USER_INIT_INO = 0xEFFFFFFDU,
PROC_PID_INIT_INO = 0xEFFFFFFCU,
PROC_CGROUP_INIT_INO = 0xEFFFFFFBU,
+ PROC_BINFMT_INIT_INO = 0xEFFFFFFAU,
};

#ifdef CONFIG_PROC_FS
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index d6b74b91096b..81365a22362c 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -45,6 +45,7 @@ enum ucount_type {
UCOUNT_NET_NAMESPACES,
UCOUNT_MNT_NAMESPACES,
UCOUNT_CGROUP_NAMESPACES,
+ UCOUNT_BINFMT_NAMESPACES,
#ifdef CONFIG_INOTIFY_USER
UCOUNT_INOTIFY_INSTANCES,
UCOUNT_INOTIFY_WATCHES,
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 22627f80063e..51fe40681e8e 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -10,6 +10,7 @@
#define CLONE_FS 0x00000200 /* set if fs info shared between processes */
#define CLONE_FILES 0x00000400 /* set if open files shared between processes */
#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */
+#define CLONE_NEWBINFMT 0x00001000 /* New binfmt_misc namespace */
#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */
#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */
#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */
diff --git a/init/Kconfig b/init/Kconfig
index 1e234e2f1cba..4874719a2799 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -965,6 +965,14 @@ config NET_NS
Allow user space to create what appear to be multiple instances
of the network stack.

+config BINFMT_NS
+ bool "binfmt_misc Namespace"
+ depends on BINFMT_MISC
+ default y
+ help
+ This allows to use several binfmt_misc configurations on
+ the same system.
+
endif # NAMESPACES

config CHECKPOINT_RESTORE
diff --git a/kernel/Makefile b/kernel/Makefile
index 7a63d567fdb5..313c80f5883f 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -72,6 +72,7 @@ obj-$(CONFIG_CGROUPS) += cgroup/
obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
obj-$(CONFIG_PID_NS) += pid_namespace.o
+obj-$(CONFIG_BINFMT_NS) += binfmt_namespace.o
obj-$(CONFIG_IKCONFIG) += configs.o
obj-$(CONFIG_SMP) += stop_machine.o
obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
diff --git a/kernel/binfmt_namespace.c b/kernel/binfmt_namespace.c
new file mode 100644
index 000000000000..63a80bcd70df
--- /dev/null
+++ b/kernel/binfmt_namespace.c
@@ -0,0 +1,153 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#include <linux/slab.h>
+#include <linux/user_namespace.h>
+#include <linux/cred.h>
+#include <linux/binfmt_namespace.h>
+#include <linux/proc_ns.h>
+#include <linux/sched/task.h>
+
+static struct ucounts *inc_binfmt_namespaces(struct user_namespace *ns)
+{
+ return inc_ucount(ns, current_euid(), UCOUNT_BINFMT_NAMESPACES);
+}
+
+static void dec_binfmt_namespaces(struct ucounts *ucounts)
+{
+ dec_ucount(ucounts, UCOUNT_BINFMT_NAMESPACES);
+}
+
+static struct binfmt_namespace *create_binfmt_ns(void)
+{
+ struct binfmt_namespace *binfmt_ns;
+
+ binfmt_ns = kmalloc(sizeof(struct binfmt_namespace), GFP_KERNEL);
+ if (binfmt_ns)
+ kref_init(&binfmt_ns->kref);
+ return binfmt_ns;
+}
+
+static struct binfmt_namespace *clone_binfmt_ns(struct user_namespace *user_ns,
+ struct binfmt_namespace *old_ns)
+{
+ struct binfmt_namespace *ns;
+ struct ucounts *ucounts;
+ int err;
+
+ err = -ENOSPC;
+ ucounts = inc_binfmt_namespaces(user_ns);
+ if (!ucounts)
+ goto fail;
+
+ err = -ENOMEM;
+ ns = create_binfmt_ns();
+ if (!ns)
+ goto fail_dec;
+
+ err = ns_alloc_inum(&ns->ns);
+ if (err)
+ goto fail_free;
+
+ ns->ucounts = ucounts;
+ ns->ns.ops = &binfmtns_operations;
+ ns->user_ns = get_user_ns(user_ns);
+ return ns;
+
+fail_free:
+ kfree(ns);
+fail_dec:
+ dec_binfmt_namespaces(ucounts);
+fail:
+ return ERR_PTR(err);
+}
+
+struct binfmt_namespace *copy_binfmt_ns(unsigned long flags,
+ struct user_namespace *user_ns, struct binfmt_namespace *old_ns)
+{
+ if (!(flags & CLONE_NEWBINFMT)) {
+ get_binfmt_ns(old_ns);
+ return old_ns;
+ }
+
+ return clone_binfmt_ns(user_ns, old_ns);
+}
+
+void free_binfmt_ns(struct kref *kref)
+{
+ struct binfmt_namespace *ns;
+
+ ns = container_of(kref, struct binfmt_namespace, kref);
+ dec_binfmt_namespaces(ns->ucounts);
+ put_user_ns(ns->user_ns);
+ ns_free_inum(&ns->ns);
+ kfree(ns);
+}
+
+static inline struct binfmt_namespace *to_binfmt_ns(struct ns_common *ns)
+{
+ return container_of(ns, struct binfmt_namespace, ns);
+}
+
+static struct ns_common *binfmtns_get(struct task_struct *task)
+{
+ struct binfmt_namespace *ns = NULL;
+ struct nsproxy *nsproxy;
+
+ task_lock(task);
+ nsproxy = task->nsproxy;
+ if (nsproxy) {
+ ns = nsproxy->binfmt_ns;
+ get_binfmt_ns(ns);
+ }
+ task_unlock(task);
+
+ return ns ? &ns->ns : NULL;
+}
+
+static void binfmtns_put(struct ns_common *ns)
+{
+ put_binfmt_ns(to_binfmt_ns(ns));
+}
+
+static int binfmtns_install(struct nsproxy *nsproxy, struct ns_common *new)
+{
+ struct binfmt_namespace *ns = to_binfmt_ns(new);
+
+ if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN) ||
+ !ns_capable(current_user_ns(), CAP_SYS_ADMIN))
+ return -EPERM;
+
+ get_binfmt_ns(ns);
+ put_binfmt_ns(nsproxy->binfmt_ns);
+ nsproxy->binfmt_ns = ns;
+ return 0;
+}
+
+static struct user_namespace *binfmtns_owner(struct ns_common *ns)
+{
+ return to_binfmt_ns(ns)->user_ns;
+}
+
+const struct proc_ns_operations binfmtns_operations = {
+ .name = "binfmt_misc",
+ .type = CLONE_NEWBINFMT,
+ .get = binfmtns_get,
+ .put = binfmtns_put,
+ .install = binfmtns_install,
+ .owner = binfmtns_owner,
+};
+
+struct binfmt_namespace init_binfmt_ns = {
+ .kref = KREF_INIT(2),
+ .user_ns = &init_user_ns,
+ .ns.inum = PROC_BINFMT_INIT_INO,
+#ifdef CONFIG_BINFMT_NS
+ .ns.ops = &binfmtns_operations,
+#endif
+};
+
+static int __init binfmt_ns_init(void)
+{
+ return 0;
+}
+subsys_initcall(binfmt_ns_init);
diff --git a/kernel/fork.c b/kernel/fork.c
index f0b58479534f..d89cf8b89e43 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2365,7 +2365,8 @@ static int check_unshare_flags(unsigned long unshare_flags)
if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET|
- CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP))
+ CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP|
+ CLONE_NEWBINFMT))
return -EINVAL;
/*
* Not implemented, but pretend it works if there is nothing
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f6c5d330059a..386028e6da39 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -22,6 +22,7 @@
#include <linux/pid_namespace.h>
#include <net/net_namespace.h>
#include <linux/ipc_namespace.h>
+#include <linux/binfmt_namespace.h>
#include <linux/proc_ns.h>
#include <linux/file.h>
#include <linux/syscalls.h>
@@ -44,6 +45,9 @@ struct nsproxy init_nsproxy = {
#ifdef CONFIG_CGROUPS
.cgroup_ns = &init_cgroup_ns,
#endif
+#if IS_ENABLED(BINFMT_MISC)
+ .binfmt_ns = &init_binfmt_ns,
+#endif
};

static inline struct nsproxy *create_nsproxy(void)
@@ -110,6 +114,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
goto out_net;
}

+ new_nsp->binfmt_ns = copy_binfmt_ns(flags, user_ns,
+ tsk->nsproxy->binfmt_ns);
+ if (IS_ERR(new_nsp->binfmt_ns)) {
+ err = PTR_ERR(new_nsp->binfmt_ns);
+ goto out_net;
+ }
+
return new_nsp;

out_net:
@@ -143,7 +154,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)

if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
CLONE_NEWPID | CLONE_NEWNET |
- CLONE_NEWCGROUP)))) {
+ CLONE_NEWCGROUP | CLONE_NEWBINFMT)))) {
get_nsproxy(old_ns);
return 0;
}
@@ -180,6 +191,8 @@ void free_nsproxy(struct nsproxy *ns)
put_ipc_ns(ns->ipc_ns);
if (ns->pid_ns_for_children)
put_pid_ns(ns->pid_ns_for_children);
+ if (ns->binfmt_ns)
+ put_binfmt_ns(ns->binfmt_ns);
put_cgroup_ns(ns->cgroup_ns);
put_net(ns->net_ns);
kmem_cache_free(nsproxy_cachep, ns);
@@ -196,7 +209,8 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
int err = 0;

if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
- CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP)))
+ CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP |
+ CLONE_NEWBINFMT)))
return 0;

user_ns = new_cred ? new_cred->user_ns : current_user_ns();
--
2.17.1