[RFC PATCH bpf-next RESEND 01/16] bpf: Introduce BPF_PROG_TYPE_CRIB
From: Juntong Deng
Date: Thu Jul 11 2024 - 07:24:48 EST
This patch adds a new BPF program type CRIB (Checkpoint/Restore In eBPF)
for checkpointing/restoring processes through eBPF.
CRIB BPF programs are not attached to any hooks, run through
BPF_PROG_RUN, and are called by userspace programs as eBPF APIs
for dumping/restoring process information.
CRIB BPF programs dump/restore process information through CRIB
kfunc APIs.
Signed-off-by: Juntong Deng <juntong.deng@xxxxxxxxxxx>
---
include/linux/bpf_crib.h | 16 +++++
include/linux/bpf_types.h | 4 ++
include/uapi/linux/bpf.h | 1 +
kernel/bpf/Kconfig | 2 +
kernel/bpf/Makefile | 2 +
kernel/bpf/btf.c | 4 ++
kernel/bpf/crib/Kconfig | 14 ++++
kernel/bpf/crib/Makefile | 3 +
kernel/bpf/crib/bpf_checkpoint.c | 13 ++++
kernel/bpf/crib/bpf_crib.c | 109 +++++++++++++++++++++++++++++++
kernel/bpf/crib/bpf_restore.c | 13 ++++
kernel/bpf/helpers.c | 1 +
kernel/bpf/syscall.c | 1 +
tools/include/uapi/linux/bpf.h | 1 +
tools/lib/bpf/libbpf.c | 2 +
tools/lib/bpf/libbpf_probes.c | 1 +
16 files changed, 187 insertions(+)
create mode 100644 include/linux/bpf_crib.h
create mode 100644 kernel/bpf/crib/Kconfig
create mode 100644 kernel/bpf/crib/Makefile
create mode 100644 kernel/bpf/crib/bpf_checkpoint.c
create mode 100644 kernel/bpf/crib/bpf_crib.c
create mode 100644 kernel/bpf/crib/bpf_restore.c
diff --git a/include/linux/bpf_crib.h b/include/linux/bpf_crib.h
new file mode 100644
index 000000000000..f667b740fcc2
--- /dev/null
+++ b/include/linux/bpf_crib.h
@@ -0,0 +1,16 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Checkpoint/Restore In eBPF (CRIB)
+ *
+ * Author:
+ * Juntong Deng <juntong.deng@xxxxxxxxxxx>
+ */
+#ifndef _BPF_CRIB_H
+#define _BPF_CRIB_H
+
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/btf_ids.h>
+#include <linux/filter.h>
+
+#endif /* _BPF_CRIB_H */
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index 9f2a6b83b49e..a6feddfd17e2 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -83,6 +83,10 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_SYSCALL, bpf_syscall,
BPF_PROG_TYPE(BPF_PROG_TYPE_NETFILTER, netfilter,
struct bpf_nf_ctx, struct bpf_nf_ctx)
#endif
+#ifdef CONFIG_BPF_CRIB
+BPF_PROG_TYPE(BPF_PROG_TYPE_CRIB, bpf_crib,
+ void *, void *)
+#endif /* CONFIG_BPF_CRIB */
BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 35bcf52dbc65..cb67a9cad8c6 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1055,6 +1055,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_SK_LOOKUP,
BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
BPF_PROG_TYPE_NETFILTER,
+ BPF_PROG_TYPE_CRIB,
__MAX_BPF_PROG_TYPE
};
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index 17067dcb4386..a129677a03e3 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -101,4 +101,6 @@ config BPF_LSM
If you are unsure how to answer this question, answer N.
+source "kernel/bpf/crib/Kconfig"
+
endmenu # "BPF subsystem"
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 0291eef9ce92..8c350d159d3c 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -58,3 +58,5 @@ vpath %.c $(srctree)/kernel/bpf:$(srctree)/tools/lib/bpf
$(obj)/%.o: %.c FORCE
$(call if_changed_rule,cc_o_c)
+
+obj-$(CONFIG_BPF_CRIB) += crib/
diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
index 4ff11779699e..306349ee3d6a 100644
--- a/kernel/bpf/btf.c
+++ b/kernel/bpf/btf.c
@@ -219,6 +219,7 @@ enum btf_kfunc_hook {
BTF_KFUNC_HOOK_LWT,
BTF_KFUNC_HOOK_NETFILTER,
BTF_KFUNC_HOOK_KPROBE,
+ BTF_KFUNC_HOOK_CRIB,
BTF_KFUNC_HOOK_MAX,
};
@@ -6037,6 +6038,7 @@ static int btf_validate_prog_ctx_type(struct bpf_verifier_log *log, const struct
case BPF_PROG_TYPE_TRACEPOINT:
case BPF_PROG_TYPE_SYSCALL:
case BPF_PROG_TYPE_EXT:
+ case BPF_PROG_TYPE_CRIB:
return 0; /* anything goes */
default:
break;
@@ -8326,6 +8328,8 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
return BTF_KFUNC_HOOK_NETFILTER;
case BPF_PROG_TYPE_KPROBE:
return BTF_KFUNC_HOOK_KPROBE;
+ case BPF_PROG_TYPE_CRIB:
+ return BTF_KFUNC_HOOK_CRIB;
default:
return BTF_KFUNC_HOOK_MAX;
}
diff --git a/kernel/bpf/crib/Kconfig b/kernel/bpf/crib/Kconfig
new file mode 100644
index 000000000000..346304f65db6
--- /dev/null
+++ b/kernel/bpf/crib/Kconfig
@@ -0,0 +1,14 @@
+# SPDX-License-Identifier: GPL-2.0-only
+
+config BPF_CRIB
+ bool "Checkpoint/Restore In eBPF (CRIB)"
+ depends on BPF_SYSCALL
+ depends on BPF_JIT
+ depends on DEBUG_INFO_BTF
+ help
+ Enable CRIB (Checkpoint/Restore In eBPF), which allows
+ checkpointing/restoring of processes through BPF programs.
+
+ Compared to procfs and system call interfaces, CRIB achieves
+ higher performance and supports dumping/restoring more
+ comprehensive process status information.
diff --git a/kernel/bpf/crib/Makefile b/kernel/bpf/crib/Makefile
new file mode 100644
index 000000000000..abd43c76140b
--- /dev/null
+++ b/kernel/bpf/crib/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+
+obj-$(CONFIG_BPF_CRIB) += bpf_crib.o bpf_checkpoint.o bpf_restore.o
diff --git a/kernel/bpf/crib/bpf_checkpoint.c b/kernel/bpf/crib/bpf_checkpoint.c
new file mode 100644
index 000000000000..efaca6bcdfe4
--- /dev/null
+++ b/kernel/bpf/crib/bpf_checkpoint.c
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Checkpoint/Restore In eBPF (CRIB): Checkpoint
+ *
+ * Author:
+ * Juntong Deng <juntong.deng@xxxxxxxxxxx>
+ */
+
+#include <linux/bpf_crib.h>
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc_end_defs();
diff --git a/kernel/bpf/crib/bpf_crib.c b/kernel/bpf/crib/bpf_crib.c
new file mode 100644
index 000000000000..9ef2d61955bf
--- /dev/null
+++ b/kernel/bpf/crib/bpf_crib.c
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Checkpoint/Restore In eBPF (CRIB): Common
+ *
+ * Author:
+ * Juntong Deng <juntong.deng@xxxxxxxxxxx>
+ */
+
+#include <linux/bpf_crib.h>
+#include <linux/init.h>
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc_end_defs();
+
+BTF_KFUNCS_START(bpf_crib_kfuncs)
+
+BTF_KFUNCS_END(bpf_crib_kfuncs)
+
+/*
+ * BPF_PROG_RUN entry point for CRIB programs: run @prog once on a kernel copy
+ * of the optional user context and store its u32 result in uattr->test.retval.
+ */
+static int bpf_prog_run_crib(struct bpf_prog *prog,
+			     const union bpf_attr *kattr,
+			     union bpf_attr __user *uattr)
+{
+	void __user *ctx_in = u64_to_user_ptr(kattr->test.ctx_in);
+	__u32 ctx_size_in = kattr->test.ctx_size_in;
+	void *ctx = NULL;
+	u32 retval;
+	int err = 0;
+
+	/* no data_in/out, ctx_out, duration, repeat, flags or batch_size */
+	if (kattr->test.data_in || kattr->test.data_out ||
+	    kattr->test.ctx_out || kattr->test.duration ||
+	    kattr->test.repeat || kattr->test.flags ||
+	    kattr->test.batch_size)
+		return -EINVAL;
+
+	if (ctx_size_in < prog->aux->max_ctx_offset ||
+	    ctx_size_in > U16_MAX)
+		return -EINVAL;
+
+	if (ctx_size_in) {
+		ctx = memdup_user(ctx_in, ctx_size_in);
+		if (IS_ERR(ctx))
+			return PTR_ERR(ctx);
+	}
+
+	rcu_read_lock_trace();
+	retval = bpf_prog_run_pin_on_cpu(prog, ctx);
+	rcu_read_unlock_trace();
+
+	if (copy_to_user(&uattr->test.retval, &retval, sizeof(retval)))
+		err = -EFAULT;
+
+	kfree(ctx);	/* no-op when no context was copied (ctx is NULL) */
+	return err;
+}
+
+/*
+ * No CRIB-specific helpers exist: every ID goes to bpf_base_func_proto().
+ */
+static const struct bpf_func_proto *
+bpf_crib_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+	return bpf_base_func_proto(func_id, prog);
+}
+
+/*
+ * Context access check for CRIB programs.
+ *
+ * The context is read-only: all dumped data is returned to userspace via
+ * ringbuf, so writes are refused. Reads must start at a non-negative offset
+ * below U16_MAX that is a multiple of the access size.
+ */
+static bool bpf_crib_is_valid_access(int off, int size,
+				     enum bpf_access_type type,
+				     const struct bpf_prog *prog,
+				     struct bpf_insn_access_aux *info)
+{
+	if (type != BPF_READ || off < 0 || off >= U16_MAX)
+		return false;
+
+	/* Only accesses aligned to their own size are accepted. */
+	return off % size == 0;
+}
+
+const struct bpf_prog_ops bpf_crib_prog_ops = {
+ .test_run = bpf_prog_run_crib, /* the only op set: CRIB programs attach to no hook */
+};
+
+const struct bpf_verifier_ops bpf_crib_verifier_ops = {
+ .get_func_proto = bpf_crib_func_proto, /* defers to bpf_base_func_proto() */
+ .is_valid_access = bpf_crib_is_valid_access, /* accepts BPF_READ only */
+};
+
+static const struct btf_kfunc_id_set bpf_crib_kfunc_set = {
+ .owner = THIS_MODULE,
+ .set = &bpf_crib_kfuncs, /* BTF ID set declared in this file; no entries yet */
+};
+
+static int __init bpf_crib_init(void)
+{
+ return register_btf_kfunc_id_set(BPF_PROG_TYPE_CRIB, &bpf_crib_kfunc_set);
+}
+
+late_initcall(bpf_crib_init); /* registers bpf_crib_kfunc_set for BPF_PROG_TYPE_CRIB */
diff --git a/kernel/bpf/crib/bpf_restore.c b/kernel/bpf/crib/bpf_restore.c
new file mode 100644
index 000000000000..6bbb4b01e34b
--- /dev/null
+++ b/kernel/bpf/crib/bpf_restore.c
@@ -0,0 +1,13 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Checkpoint/Restore In eBPF (CRIB): Restore
+ *
+ * Author:
+ * Juntong Deng <juntong.deng@xxxxxxxxxxx>
+ */
+
+#include <linux/bpf_crib.h>
+
+__bpf_kfunc_start_defs();
+
+__bpf_kfunc_end_defs();
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 5241ba671c5a..bcd3ce9da00c 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -2986,6 +2986,7 @@ static int __init kfunc_init(void)
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &generic_kfunc_set);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, &generic_kfunc_set);
ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &generic_kfunc_set);
+ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CRIB, &generic_kfunc_set);
ret = ret ?: register_btf_id_dtor_kfuncs(generic_dtors,
ARRAY_SIZE(generic_dtors),
THIS_MODULE);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 0719192a3482..faf99e53d706 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -2633,6 +2633,7 @@ bpf_prog_load_check_attach(enum bpf_prog_type prog_type,
return -EINVAL;
case BPF_PROG_TYPE_SYSCALL:
case BPF_PROG_TYPE_EXT:
+ case BPF_PROG_TYPE_CRIB:
if (expected_attach_type)
return -EINVAL;
fallthrough;
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 35bcf52dbc65..cb67a9cad8c6 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1055,6 +1055,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_SK_LOOKUP,
BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
BPF_PROG_TYPE_NETFILTER,
+ BPF_PROG_TYPE_CRIB,
__MAX_BPF_PROG_TYPE
};
diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index 30f121754d83..4e1451901b7d 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -224,6 +224,7 @@ static const char * const prog_type_name[] = {
[BPF_PROG_TYPE_SK_LOOKUP] = "sk_lookup",
[BPF_PROG_TYPE_SYSCALL] = "syscall",
[BPF_PROG_TYPE_NETFILTER] = "netfilter",
+ [BPF_PROG_TYPE_CRIB] = "crib",
};
static int __base_pr(enum libbpf_print_level level, const char *format,
@@ -9449,6 +9450,7 @@ static const struct bpf_sec_def section_defs[] = {
SEC_DEF("struct_ops.s+", STRUCT_OPS, 0, SEC_SLEEPABLE),
SEC_DEF("sk_lookup", SK_LOOKUP, BPF_SK_LOOKUP, SEC_ATTACHABLE),
SEC_DEF("netfilter", NETFILTER, BPF_NETFILTER, SEC_NONE),
+ SEC_DEF("crib", CRIB, 0, SEC_NONE),
};
int libbpf_register_prog_handler(const char *sec,
diff --git a/tools/lib/bpf/libbpf_probes.c b/tools/lib/bpf/libbpf_probes.c
index 9dfbe7750f56..2e087280c5f0 100644
--- a/tools/lib/bpf/libbpf_probes.c
+++ b/tools/lib/bpf/libbpf_probes.c
@@ -180,6 +180,7 @@ static int probe_prog_load(enum bpf_prog_type prog_type,
case BPF_PROG_TYPE_SK_REUSEPORT:
case BPF_PROG_TYPE_FLOW_DISSECTOR:
case BPF_PROG_TYPE_CGROUP_SYSCTL:
+ case BPF_PROG_TYPE_CRIB:
break;
case BPF_PROG_TYPE_NETFILTER:
opts.expected_attach_type = BPF_NETFILTER;
--
2.39.2