[RFC PATCH bpf-next 03/13] bpf: Implement bpf namespace

From: Yafang Shao
Date: Sun Mar 26 2023 - 05:22:33 EST


It is similar to the pid namespace. When we create a new bpf object in a
child BPF namespace, an ID is allocated for it in the current BPF namespace
and in each of its ancestor BPF namespaces. The hierarchy is as follows:

            init_bpf_ns                 : level = 0
             /       \
        child_a     child_b             : level = 1
                    /     \
            child_b_a     child_b_b     : level = 2

When we create a bpf object in child_b_b, IDs are allocated for this
object in child_b_b, child_b and init_bpf_ns.

We will allocate the id for bpf_map, bpf_prog and bpf_link in bpf
namespace.

Signed-off-by: Yafang Shao <laoar.shao@xxxxxxxxx>
---
fs/proc/namespaces.c | 4 +
include/linux/bpf_namespace.h | 46 +++++++++
include/linux/nsproxy.h | 4 +
include/linux/proc_ns.h | 1 +
include/linux/user_namespace.h | 1 +
kernel/bpf/Makefile | 1 +
kernel/bpf/bpf_namespace.c | 219 +++++++++++++++++++++++++++++++++++++++++
kernel/nsproxy.c | 19 +++-
kernel/ucount.c | 1 +
9 files changed, 294 insertions(+), 2 deletions(-)
create mode 100644 include/linux/bpf_namespace.h
create mode 100644 kernel/bpf/bpf_namespace.c

diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 8e159fc..1a36757 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -9,6 +9,7 @@
#include <linux/ipc_namespace.h>
#include <linux/pid_namespace.h>
#include <linux/user_namespace.h>
+#include <linux/bpf_namespace.h>
#include "internal.h"


@@ -37,6 +38,9 @@
&timens_operations,
&timens_for_children_operations,
#endif
+#ifdef CONFIG_BPF
+ &bpfns_operations,
+#endif
};

static const char *proc_ns_get_link(struct dentry *dentry,
diff --git a/include/linux/bpf_namespace.h b/include/linux/bpf_namespace.h
new file mode 100644
index 0000000..06aa51f
--- /dev/null
+++ b/include/linux/bpf_namespace.h
@@ -0,0 +1,46 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_BPF_ID_NS_H
+#define _LINUX_BPF_ID_NS_H
+#include <linux/types.h>
+#include <linux/idr.h>
+#include <linux/ns_common.h>
+#include <linux/user_namespace.h>
+
+/*
+ * One (number, namespace) pair.
+ * NOTE(review): presumably @nr is the ID an object has inside @ns; the code
+ * that fills these in is not part of this patch - confirm against it.
+ */
+struct ubpf_obj_id {
+ int nr;
+ struct bpf_namespace *ns;
+};
+
+/*
+ * Per-object ID record.
+ *
+ * numbers[] is declared with one slot, but create_bpf_cachep() sizes the
+ * per-level slab objects as sizeof(struct bpf_obj_id) plus "level" further
+ * struct ubpf_obj_id entries, i.e. room for level + 1 slots.
+ * NOTE(review): presumably one slot per namespace from init_bpf_ns down to
+ * the namespace the object was created in, with @level naming the deepest
+ * one - confirm against the allocator, which is not in this patch.
+ */
+struct bpf_obj_id {
+ refcount_t count;
+ unsigned int level;
+ struct rcu_head rcu;
+ struct ubpf_obj_id numbers[1];
+};
+
+/*
+ * Kinds of bpf object that get namespaced IDs (maps, programs, links).
+ * OBJ_ID_NUM is the number of kinds and sizes bpf_namespace::idr[].
+ */
+enum {
+ MAP_OBJ_ID = 0,
+ PROG_OBJ_ID,
+ LINK_OBJ_ID,
+ OBJ_ID_NUM,
+};
+
+/*
+ * struct bpf_namespace - one node of the bpf namespace hierarchy
+ * @idr:           one IDR per object kind (OBJ_ID_NUM of them)
+ * @rcu:           passed to call_rcu() so the namespace is freed after a
+ *                 grace period
+ * @level:         nesting depth; 0 for init_bpf_ns, parent's level + 1 for
+ *                 every namespace made by create_bpf_namespace()
+ * @ns:            common namespace part (inode number, ops, reference count)
+ * @user_ns:       owning user namespace; created namespaces hold a reference
+ * @obj_id_cachep: slab cache selected for this level (see create_bpf_cachep())
+ * @parent:        parent namespace, pinned through get_bpfns(); not set for
+ *                 init_bpf_ns
+ * @ucounts:       UCOUNT_BPF_NAMESPACES charge taken when the namespace was
+ *                 created and released when it is freed
+ */
+struct bpf_namespace {
+ struct idr idr[OBJ_ID_NUM];
+ struct rcu_head rcu;
+ int level;
+ struct ns_common ns;
+ struct user_namespace *user_ns;
+ struct kmem_cache *obj_id_cachep;
+ struct bpf_namespace *parent;
+ struct ucounts *ucounts;
+};
+
+/* Root (level 0) namespace and the proc_ns hooks, defined in kernel/bpf/bpf_namespace.c. */
+extern struct bpf_namespace init_bpf_ns;
+extern struct proc_ns_operations bpfns_operations;
+
+/*
+ * copy_bpfns() returns @old_ns with a reference taken unless CLONE_NEWBPF is
+ * set in @flags; in that case it creates a child of @old_ns owned by
+ * @user_ns and returns it, or an ERR_PTR() on failure.
+ * put_bpfns() drops a reference obtained that way.
+ */
+struct bpf_namespace *copy_bpfns(unsigned long flags,
+ struct user_namespace *user_ns,
+ struct bpf_namespace *old_ns);
+void put_bpfns(struct bpf_namespace *ns);
+#endif /* _LINUX_BPF_ID_NS_H */
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index fee881c..d24ab6b 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -10,6 +10,9 @@
struct ipc_namespace;
struct pid_namespace;
struct cgroup_namespace;
+#ifdef CONFIG_BPF
+struct bpf_namespace;
+#endif
struct fs_struct;

/*
@@ -38,6 +41,7 @@ struct nsproxy {
struct time_namespace *time_ns;
struct time_namespace *time_ns_for_children;
struct cgroup_namespace *cgroup_ns;
+ struct bpf_namespace *bpf_ns;
};
extern struct nsproxy init_nsproxy;

diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h
index 555c257..c10ce2c 100644
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -46,6 +46,7 @@ enum {
PROC_PID_INIT_INO = 0xEFFFFFFCU,
PROC_CGROUP_INIT_INO = 0xEFFFFFFBU,
PROC_TIME_INIT_INO = 0xEFFFFFFAU,
+ PROC_BPF_INIT_INO = 0xEFFFFFF9U,
};

#ifdef CONFIG_PROC_FS
diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 45f09be..93eb618 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -54,6 +54,7 @@ enum ucount_type {
UCOUNT_FANOTIFY_GROUPS,
UCOUNT_FANOTIFY_MARKS,
#endif
+ UCOUNT_BPF_NAMESPACES,
UCOUNT_COUNTS,
};

diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 0224261..828aef0 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -44,3 +44,4 @@ obj-$(CONFIG_BPF_PRELOAD) += preload/
obj-$(CONFIG_BPF_SYSCALL) += relo_core.o
$(obj)/relo_core.o: $(srctree)/tools/lib/bpf/relo_core.c FORCE
$(call if_changed_rule,cc_o_c)
+obj-$(CONFIG_BPF_SYSCALL) += bpf_namespace.o
diff --git a/kernel/bpf/bpf_namespace.c b/kernel/bpf/bpf_namespace.c
new file mode 100644
index 0000000..88a86cd
--- /dev/null
+++ b/kernel/bpf/bpf_namespace.c
@@ -0,0 +1,219 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/ns_common.h>
+#include <linux/syscalls.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/proc_ns.h>
+#include <linux/sched/task.h>
+#include <linux/sched/signal.h>
+#include <linux/idr.h>
+#include <linux/user_namespace.h>
+#include <linux/bpf_namespace.h>
+
+/* Deepest nesting level create_bpf_namespace() accepts. */
+#define MAX_BPF_NS_LEVEL 32
+/* Slab cache backing struct bpf_namespace. */
+static struct kmem_cache *bpfns_cachep;
+/* Object-ID caches for child levels; slot [level - 1] is created on demand. */
+static struct kmem_cache *obj_id_cache[MAX_BPF_NS_LEVEL];
+static struct ns_common *bpfns_get(struct task_struct *task);
+static void bpfns_put(struct ns_common *ns);
+static struct kmem_cache *create_bpf_cachep(unsigned int level);
+/* Serialises creation of obj_id_cache[] entries. */
+static DEFINE_MUTEX(obj_id_caches_mutex);
+
+/*
+ * proc_ns ->install hook. Entering an existing bpf namespace with setns() is
+ * not supported, so this logs a message and fails with -EOPNOTSUPP.
+ */
+static int bpfns_install(struct nsset *nsset, struct ns_common *ns)
+{
+	/* Terminate the message so it is emitted as a complete log line. */
+	pr_info("setns not supported for bpf namespace\n");
+	return -EOPNOTSUPP;
+}
+
+/*
+ * proc_ns hooks for the "bpf" namespace type (CLONE_NEWBPF).
+ * ->install always fails, see bpfns_install().
+ */
+struct proc_ns_operations bpfns_operations = {
+ .name = "bpf",
+ .type = CLONE_NEWBPF,
+ .get = bpfns_get,
+ .put = bpfns_put,
+ .install = bpfns_install,
+};
+
+/*
+ * Root of the hierarchy: level 0, owned by init_user_ns, with a fixed proc
+ * inode number. Its idr[] and obj_id_cachep are set up at boot by
+ * bpfns_idr_init(). get_bpfns()/put_bpfns() never touch its reference count.
+ */
+struct bpf_namespace init_bpf_ns = {
+ .level = 0,
+ .user_ns = &init_user_ns,
+ .ns.ops = &bpfns_operations,
+ .ns.inum = PROC_BPF_INIT_INO,
+};
+
+/*
+ * Take a reference on @ns and hand @ns back. init_bpf_ns is a static object
+ * that is not reference counted, so it is returned untouched.
+ */
+static struct bpf_namespace *get_bpfns(struct bpf_namespace *ns)
+{
+	if (ns == &init_bpf_ns)
+		return ns;
+
+	refcount_inc(&ns->ns.count);
+	return ns;
+}
+
+/*
+ * proc_ns ->get hook: return the ns_common of @task's bpf namespace with a
+ * reference taken, or NULL if @task no longer has an nsproxy.
+ */
+static struct ns_common *bpfns_get(struct task_struct *task)
+{
+	struct ns_common *ns = NULL;
+	struct nsproxy *nsproxy;
+
+	/*
+	 * Another task's ->nsproxy is not freed via RCU, so rcu_read_lock()
+	 * does not keep it alive; task_lock() is what stabilises it.
+	 */
+	task_lock(task);
+	nsproxy = task->nsproxy;
+	if (nsproxy)
+		ns = &get_bpfns(nsproxy->bpf_ns)->ns;
+	task_unlock(task);
+	return ns;
+}
+
+/*
+ * Charge one bpf namespace to the caller's effective uid in @ns.
+ * Hands back whatever inc_ucount() returns; the caller treats NULL as
+ * "charge refused".
+ */
+static struct ucounts *inc_bpf_namespaces(struct user_namespace *ns)
+{
+	kuid_t euid = current_euid();
+
+	return inc_ucount(ns, euid, UCOUNT_BPF_NAMESPACES);
+}
+
+/* Undo the UCOUNT_BPF_NAMESPACES charge taken by inc_bpf_namespaces(). */
+static void dec_bpf_namespaces(struct ucounts *ucounts)
+{
+ dec_ucount(ucounts, UCOUNT_BPF_NAMESPACES);
+}
+
+/*
+ * RCU callback queued by destroy_bpf_namespace(): release the ucount charge
+ * and the user namespace reference, then free the namespace itself.
+ */
+static void delayed_free_bpfns(struct rcu_head *p)
+{
+	struct bpf_namespace *bpfns;
+
+	bpfns = container_of(p, struct bpf_namespace, rcu);
+	dec_bpf_namespaces(bpfns->ucounts);
+	put_user_ns(bpfns->user_ns);
+	kmem_cache_free(bpfns_cachep, bpfns);
+}
+
+/*
+ * Tear down @ns: give back its proc inode number, destroy every per-kind
+ * IDR, and queue the final free through call_rcu().
+ */
+static void destroy_bpf_namespace(struct bpf_namespace *ns)
+{
+	int kind;
+
+	ns_free_inum(&ns->ns);
+
+	for (kind = 0; kind < OBJ_ID_NUM; kind++)
+		idr_destroy(&ns->idr[kind]);
+
+	call_rcu(&ns->rcu, delayed_free_bpfns);
+}
+
+/*
+ * Drop a reference on @ns. When the count reaches zero the namespace is
+ * destroyed and the reference it held on its parent is dropped in turn, so
+ * the walk continues upwards until a namespace survives or init_bpf_ns
+ * (never freed) is reached.
+ */
+void put_bpfns(struct bpf_namespace *ns)
+{
+	struct bpf_namespace *up;
+
+	for (; ns != &init_bpf_ns; ns = up) {
+		/* Read the parent before giving up our reference on @ns. */
+		up = ns->parent;
+		if (!refcount_dec_and_test(&ns->ns.count))
+			return;
+		destroy_bpf_namespace(ns);
+	}
+}
+
+/* proc_ns ->put hook: drop the reference handed out by bpfns_get(). */
+static void bpfns_put(struct ns_common *ns)
+{
+	put_bpfns(container_of(ns, struct bpf_namespace, ns));
+}
+
+/*
+ * Create a child of @parent_bpfns owned by @user_ns.
+ *
+ * Returns the new namespace with its reference count set to 1, or an
+ * ERR_PTR():
+ *   -EINVAL  in_userns(parent's user_ns, @user_ns) is false
+ *   -ENOSPC  the new level would exceed MAX_BPF_NS_LEVEL, or the
+ *            UCOUNT_BPF_NAMESPACES charge was refused
+ *   -ENOMEM  the namespace or its object-ID cache could not be allocated
+ *   or whatever ns_alloc_inum() reports.
+ */
+static struct bpf_namespace *
+create_bpf_namespace(struct user_namespace *user_ns,
+ struct bpf_namespace *parent_bpfns)
+{
+ struct bpf_namespace *ns;
+ unsigned int level = parent_bpfns->level + 1;
+ struct ucounts *ucounts;
+ int err;
+ int i;
+
+ err = -EINVAL;
+ if (!in_userns(parent_bpfns->user_ns, user_ns))
+ goto out;
+
+ err = -ENOSPC;
+ if (level > MAX_BPF_NS_LEVEL)
+ goto out;
+ /* err is still -ENOSPC if the ucount charge is refused. */
+ ucounts = inc_bpf_namespaces(user_ns);
+ if (!ucounts)
+ goto out;
+
+ err = -ENOMEM;
+ ns = kmem_cache_zalloc(bpfns_cachep, GFP_KERNEL);
+ if (!ns)
+ goto out_dec;
+
+ for (i = 0; i < OBJ_ID_NUM; i++)
+ idr_init(&ns->idr[i]);
+
+ /*
+ * The per-level cache is kept in the file-scope obj_id_cache[] array and
+ * is not released on the error paths below.
+ */
+ ns->obj_id_cachep = create_bpf_cachep(level);
+ if (!ns->obj_id_cachep)
+ goto out_free_idr;
+
+ err = ns_alloc_inum(&ns->ns);
+ if (err)
+ goto out_free_idr;
+ ns->ns.ops = &bpfns_operations;
+
+ refcount_set(&ns->ns.count, 1);
+ ns->level = level;
+ /* Pin the parent and the user namespace for the child's lifetime. */
+ ns->parent = get_bpfns(parent_bpfns);
+ ns->user_ns = get_user_ns(user_ns);
+ ns->ucounts = ucounts;
+ return ns;
+
+ /* Unwind in reverse order of acquisition. */
+out_free_idr:
+ for (i = 0; i < OBJ_ID_NUM; i++)
+ idr_destroy(&ns->idr[i]);
+ kmem_cache_free(bpfns_cachep, ns);
+out_dec:
+ dec_bpf_namespaces(ucounts);
+out:
+ return ERR_PTR(err);
+}
+
+/*
+ * Pick the bpf namespace for a new nsproxy. With CLONE_NEWBPF in @flags a
+ * child of @old_ns owned by @user_ns is created (ERR_PTR() on failure);
+ * otherwise @old_ns is shared and a reference is taken on it.
+ */
+struct bpf_namespace *copy_bpfns(unsigned long flags,
+				 struct user_namespace *user_ns,
+				 struct bpf_namespace *old_ns)
+{
+	if (flags & CLONE_NEWBPF)
+		return create_bpf_namespace(user_ns, old_ns);
+
+	return get_bpfns(old_ns);
+}
+
+/*
+ * Return the object-ID slab cache for namespaces at @level, creating it on
+ * first use. Caches are kept in obj_id_cache[@level - 1], so @level must be
+ * at least 1; the only caller passes parent level + 1.
+ *
+ * Returns NULL if the cache does not exist and could not be created.
+ */
+static struct kmem_cache *create_bpf_cachep(unsigned int level)
+{
+ /* Level 0 is init_bpf_ns.obj_id_cachep */
+ struct kmem_cache **pkc = &obj_id_cache[level - 1];
+ struct kmem_cache *kc;
+ char name[4 + 10 + 1];
+ unsigned int len;
+
+ /* Lockless check first: once a slot is set it is never cleared here. */
+ kc = READ_ONCE(*pkc);
+ if (kc)
+ return kc;
+
+ snprintf(name, sizeof(name), "bpf_%u", level + 1);
+ /* struct bpf_obj_id already has one numbers[] slot; add @level more. */
+ len = sizeof(struct bpf_obj_id) + level * sizeof(struct ubpf_obj_id);
+ mutex_lock(&obj_id_caches_mutex);
+ /* Name collision forces to do allocation under mutex. */
+ if (!*pkc)
+ *pkc = kmem_cache_create(name, len, 0,
+ SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
+ mutex_unlock(&obj_id_caches_mutex);
+ /* current can fail, but someone else can succeed. */
+ return READ_ONCE(*pkc);
+}
+
+/*
+ * Boot-time setup of init_bpf_ns: create its object-ID slab cache and
+ * initialise one IDR per object kind.
+ */
+static void __init bpfns_idr_init(void)
+{
+	int i;
+
+	/*
+	 * Level 0 needs a single numbers[] slot, which struct bpf_obj_id
+	 * already provides. The cache must be built from bpf_obj_id, not from
+	 * the pid namespace's struct pid.
+	 */
+	init_bpf_ns.obj_id_cachep =
+		KMEM_CACHE(bpf_obj_id, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
+	for (i = 0; i < OBJ_ID_NUM; i++)
+		idr_init(&init_bpf_ns.idr[i]);
+}
+
+/*
+ * Late initcall: create the slab cache for struct bpf_namespace, then set up
+ * init_bpf_ns through bpfns_idr_init(). Always returns 0; the cache is
+ * requested with SLAB_PANIC, so no error is checked for here.
+ */
+static __init int bpf_namespaces_init(void)
+{
+ bpfns_cachep = KMEM_CACHE(bpf_namespace, SLAB_PANIC | SLAB_ACCOUNT);
+ bpfns_idr_init();
+ return 0;
+}
+
+late_initcall(bpf_namespaces_init);
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index a487ff2..6a6fa70 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -19,6 +19,7 @@
#include <net/net_namespace.h>
#include <linux/ipc_namespace.h>
#include <linux/time_namespace.h>
+#include <linux/bpf_namespace.h>
#include <linux/fs_struct.h>
#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
@@ -26,6 +27,7 @@
#include <linux/syscalls.h>
#include <linux/cgroup.h>
#include <linux/perf_event.h>
+#include <linux/bpf_namespace.h>

static struct kmem_cache *nsproxy_cachep;

@@ -47,6 +49,9 @@ struct nsproxy init_nsproxy = {
.time_ns = &init_time_ns,
.time_ns_for_children = &init_time_ns,
#endif
+#ifdef CONFIG_BPF
+ .bpf_ns = &init_bpf_ns,
+#endif
};

static inline struct nsproxy *create_nsproxy(void)
@@ -121,8 +126,16 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
}
new_nsp->time_ns = get_time_ns(tsk->nsproxy->time_ns);

+ new_nsp->bpf_ns = copy_bpfns(flags, user_ns, tsk->nsproxy->bpf_ns);
+ if (IS_ERR(new_nsp->bpf_ns)) {
+ err = PTR_ERR(new_nsp->bpf_ns);
+ goto out_bpf;
+ }
return new_nsp;

+out_bpf:
+ put_time_ns(new_nsp->time_ns);
+ put_time_ns(new_nsp->time_ns_for_children);
out_time:
put_net(new_nsp->net_ns);
out_net:
@@ -156,7 +169,7 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)

if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
CLONE_NEWPID | CLONE_NEWNET |
- CLONE_NEWCGROUP | CLONE_NEWTIME)))) {
+ CLONE_NEWCGROUP | CLONE_NEWTIME | CLONE_NEWBPF)))) {
if ((flags & CLONE_VM) ||
likely(old_ns->time_ns_for_children == old_ns->time_ns)) {
get_nsproxy(old_ns);
@@ -203,6 +216,8 @@ void free_nsproxy(struct nsproxy *ns)
put_time_ns(ns->time_ns_for_children);
put_cgroup_ns(ns->cgroup_ns);
put_net(ns->net_ns);
+ if (ns->bpf_ns)
+ put_bpfns(ns->bpf_ns);
kmem_cache_free(nsproxy_cachep, ns);
}

@@ -218,7 +233,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,

if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP |
- CLONE_NEWTIME)))
+ CLONE_NEWTIME | CLONE_NEWBPF)))
return 0;

user_ns = new_cred ? new_cred->user_ns : current_user_ns();
diff --git a/kernel/ucount.c b/kernel/ucount.c
index ee8e57f..97e0ae3 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -87,6 +87,7 @@ static int set_permissions(struct ctl_table_header *head,
UCOUNT_ENTRY("max_fanotify_groups"),
UCOUNT_ENTRY("max_fanotify_marks"),
#endif
+ UCOUNT_ENTRY("max_bpf_namespaces"),
{ }
};
#endif /* CONFIG_SYSCTL */
--
1.8.3.1