Re: [RFC PATCH bpf-next 1/9] bpf: introduce CGROUP_SUBSYS_RSTAT program type
From: Yosry Ahmed
Date: Tue May 10 2022 - 14:08:48 EST
On Mon, May 9, 2022 at 5:18 PM Yosry Ahmed <yosryahmed@xxxxxxxxxx> wrote:
>
> This patch introduces a new bpf program type CGROUP_SUBSYS_RSTAT,
> with new corresponding link and attach types.
>
> The main purpose of these programs is to allow BPF programs to collect
> and maintain hierarchical cgroup stats easily and efficiently by making
> use of the rstat framework in the kernel.
>
> These programs attach to a cgroup subsystem. They typically contain logic
> to aggregate per-cpu and per-cgroup stats collected by other BPF programs.
>
> Currently, only rstat flusher programs can be attached to cgroup
> subsystems, but this can be extended later if a use-case arises.
>
> See the selftest in the final patch for a practical example.
>
> Signed-off-by: Yosry Ahmed <yosryahmed@xxxxxxxxxx>
> ---
> include/linux/bpf-cgroup-subsys.h | 30 ++++++
> include/linux/bpf_types.h | 2 +
> include/linux/cgroup-defs.h | 4 +
> include/uapi/linux/bpf.h | 12 +++
> kernel/bpf/Makefile | 1 +
> kernel/bpf/cgroup_subsys.c | 166 ++++++++++++++++++++++++++++++
> kernel/bpf/syscall.c | 6 ++
> kernel/cgroup/cgroup.c | 1 +
> tools/include/uapi/linux/bpf.h | 12 +++
> 9 files changed, 234 insertions(+)
> create mode 100644 include/linux/bpf-cgroup-subsys.h
> create mode 100644 kernel/bpf/cgroup_subsys.c
>
> diff --git a/include/linux/bpf-cgroup-subsys.h b/include/linux/bpf-cgroup-subsys.h
> new file mode 100644
> index 000000000000..4dcde06b5599
> --- /dev/null
> +++ b/include/linux/bpf-cgroup-subsys.h
> @@ -0,0 +1,30 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright 2022 Google LLC.
> + */
> +#ifndef _BPF_CGROUP_SUBSYS_H_
> +#define _BPF_CGROUP_SUBSYS_H_
> +
> +#include <linux/bpf.h>
> +
> +struct cgroup_subsys_bpf {
> + /* Head of the list of BPF rstat flushers attached to this subsystem */
> + struct list_head rstat_flushers;
> + spinlock_t flushers_lock;
> +};
> +
> +struct bpf_subsys_rstat_flusher {
> + struct bpf_prog *prog;
> + /* List of BPF rstat flushers, anchored at subsys->bpf */
> + struct list_head list;
> +};
> +
> +struct bpf_cgroup_subsys_link {
> + struct bpf_link link;
> + struct cgroup_subsys *ss;
> +};
> +
> +int cgroup_subsys_bpf_link_attach(const union bpf_attr *attr,
> + struct bpf_prog *prog);
> +
In the next version I will make sure everything here is also defined
when CONFIG_BPF_SYSCALL is not set, and I will move the structs that
do not need to be in this header into the .c file.
> +#endif // _BPF_CGROUP_SUBSYS_H_
> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> index 3e24ad0c4b3c..854ee958b0e4 100644
> --- a/include/linux/bpf_types.h
> +++ b/include/linux/bpf_types.h
> @@ -56,6 +56,8 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SYSCTL, cg_sysctl,
> struct bpf_sysctl, struct bpf_sysctl_kern)
> BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCKOPT, cg_sockopt,
> struct bpf_sockopt, struct bpf_sockopt_kern)
> +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT, cgroup_subsys_rstat,
> + struct bpf_rstat_ctx, struct bpf_rstat_ctx)
> #endif
> #ifdef CONFIG_BPF_LIRC_MODE2
> BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2,
> diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
> index 1bfcfb1af352..3bd6eed1fa13 100644
> --- a/include/linux/cgroup-defs.h
> +++ b/include/linux/cgroup-defs.h
> @@ -20,6 +20,7 @@
> #include <linux/u64_stats_sync.h>
> #include <linux/workqueue.h>
> #include <linux/bpf-cgroup-defs.h>
> +#include <linux/bpf-cgroup-subsys.h>
> #include <linux/psi_types.h>
>
> #ifdef CONFIG_CGROUPS
> @@ -706,6 +707,9 @@ struct cgroup_subsys {
> * specifies the mask of subsystems that this one depends on.
> */
> unsigned int depends_on;
> +
> + /* used to store bpf programs.*/
> + struct cgroup_subsys_bpf bpf;
> };
>
> extern struct percpu_rw_semaphore cgroup_threadgroup_rwsem;
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index d14b10b85e51..0f4855fa85db 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -952,6 +952,7 @@ enum bpf_prog_type {
> BPF_PROG_TYPE_LSM,
> BPF_PROG_TYPE_SK_LOOKUP,
> BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
> + BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT,
> };
>
> enum bpf_attach_type {
> @@ -998,6 +999,7 @@ enum bpf_attach_type {
> BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
> BPF_PERF_EVENT,
> BPF_TRACE_KPROBE_MULTI,
> + BPF_CGROUP_SUBSYS_RSTAT,
> __MAX_BPF_ATTACH_TYPE
> };
>
> @@ -1013,6 +1015,7 @@ enum bpf_link_type {
> BPF_LINK_TYPE_XDP = 6,
> BPF_LINK_TYPE_PERF_EVENT = 7,
> BPF_LINK_TYPE_KPROBE_MULTI = 8,
> + BPF_LINK_TYPE_CGROUP_SUBSYS = 9,
>
> MAX_BPF_LINK_TYPE,
> };
> @@ -1482,6 +1485,9 @@ union bpf_attr {
> */
> __u64 bpf_cookie;
> } perf_event;
> + struct {
> + __u64 name;
> + } cgroup_subsys;
> struct {
> __u32 flags;
> __u32 cnt;
> @@ -6324,6 +6330,12 @@ struct bpf_cgroup_dev_ctx {
> __u32 minor;
> };
>
> +struct bpf_rstat_ctx {
> + __u64 cgroup_id;
> + __u64 parent_cgroup_id; /* 0 if root */
> + __s32 cpu;
> +};
> +
> struct bpf_raw_tracepoint_args {
> __u64 args[0];
> };
> diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
> index c1a9be6a4b9f..6caf4a61e543 100644
> --- a/kernel/bpf/Makefile
> +++ b/kernel/bpf/Makefile
> @@ -25,6 +25,7 @@ ifeq ($(CONFIG_PERF_EVENTS),y)
> obj-$(CONFIG_BPF_SYSCALL) += stackmap.o
> endif
> obj-$(CONFIG_CGROUP_BPF) += cgroup.o
> +obj-$(CONFIG_CGROUP_BPF) += cgroup_subsys.o
In the next version I will replace this with:
ifeq ($(CONFIG_CGROUPS),y)
obj-$(CONFIG_BPF_SYSCALL) += cgroup_subsys.o
endif
This is because this program type doesn't attach to cgroups and does not
depend on CONFIG_CGROUP_BPF, only on CONFIG_CGROUPS and CONFIG_BPF_SYSCALL.
> ifeq ($(CONFIG_INET),y)
> obj-$(CONFIG_BPF_SYSCALL) += reuseport_array.o
> endif
> diff --git a/kernel/bpf/cgroup_subsys.c b/kernel/bpf/cgroup_subsys.c
> new file mode 100644
> index 000000000000..9673ce6aa84a
> --- /dev/null
> +++ b/kernel/bpf/cgroup_subsys.c
> @@ -0,0 +1,166 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Functions to manage eBPF programs attached to cgroup subsystems
> + *
> + * Copyright 2022 Google LLC.
> + */
> +
> +#include <linux/bpf-cgroup-subsys.h>
> +#include <linux/filter.h>
> +
> +#include "../cgroup/cgroup-internal.h"
> +
> +
> +static int cgroup_subsys_bpf_attach(struct cgroup_subsys *ss, struct bpf_prog *prog)
> +{
> + struct bpf_subsys_rstat_flusher *rstat_flusher;
> +
> + rstat_flusher = kmalloc(sizeof(*rstat_flusher), GFP_KERNEL);
> + if (!rstat_flusher)
> + return -ENOMEM;
> + rstat_flusher->prog = prog;
> +
> + spin_lock(&ss->bpf.flushers_lock);
> + list_add(&rstat_flusher->list, &ss->bpf.rstat_flushers);
> + spin_unlock(&ss->bpf.flushers_lock);
> +
> + return 0;
> +}
> +
> +static void cgroup_subsys_bpf_detach(struct cgroup_subsys *ss, struct bpf_prog *prog)
> +{
> + struct bpf_subsys_rstat_flusher *rstat_flusher = NULL;
> +
> + spin_lock(&ss->bpf.flushers_lock);
> + list_for_each_entry(rstat_flusher, &ss->bpf.rstat_flushers, list)
> + if (rstat_flusher->prog == prog)
> + break;
> +
> + if (rstat_flusher) {
> + list_del(&rstat_flusher->list);
> + bpf_prog_put(rstat_flusher->prog);
> + kfree(rstat_flusher);
> + }
> + spin_unlock(&ss->bpf.flushers_lock);
> +}
> +
> +static void bpf_cgroup_subsys_link_release(struct bpf_link *link)
> +{
> + struct bpf_cgroup_subsys_link *ss_link = container_of(link,
> + struct bpf_cgroup_subsys_link,
> + link);
> + if (ss_link->ss) {
> + cgroup_subsys_bpf_detach(ss_link->ss, ss_link->link.prog);
> + ss_link->ss = NULL;
> + }
> +}
> +
> +static int bpf_cgroup_subsys_link_detach(struct bpf_link *link)
> +{
> + bpf_cgroup_subsys_link_release(link);
> + return 0;
> +}
> +
> +static void bpf_cgroup_subsys_link_dealloc(struct bpf_link *link)
> +{
> + struct bpf_cgroup_subsys_link *ss_link = container_of(link,
> + struct bpf_cgroup_subsys_link,
> + link);
> + kfree(ss_link);
> +}
> +
> +static const struct bpf_link_ops bpf_cgroup_subsys_link_lops = {
> + .detach = bpf_cgroup_subsys_link_detach,
> + .release = bpf_cgroup_subsys_link_release,
> + .dealloc = bpf_cgroup_subsys_link_dealloc,
> +};
> +
> +int cgroup_subsys_bpf_link_attach(const union bpf_attr *attr,
> + struct bpf_prog *prog)
> +{
> + struct bpf_link_primer link_primer;
> + struct bpf_cgroup_subsys_link *link;
> + struct cgroup_subsys *ss, *attach_ss = NULL;
> + const char __user *ss_name_user;
> + char ss_name[MAX_CGROUP_TYPE_NAMELEN];
> + int ssid, err;
> +
> + if (attr->link_create.target_fd || attr->link_create.flags)
> + return -EINVAL;
> +
> + ss_name_user = u64_to_user_ptr(attr->link_create.cgroup_subsys.name);
> + if (strncpy_from_user(ss_name, ss_name_user, sizeof(ss_name) - 1) < 0)
> + return -EFAULT;
> +
> + for_each_subsys(ss, ssid)
> + if (!strcmp(ss_name, ss->name) ||
> + !strcmp(ss_name, ss->legacy_name))
> + attach_ss = ss;
> +
> + if (!attach_ss)
> + return -EINVAL;
> +
> + link = kzalloc(sizeof(*link), GFP_USER);
> + if (!link)
> + return -ENOMEM;
> +
> + bpf_link_init(&link->link, BPF_LINK_TYPE_CGROUP_SUBSYS,
> + &bpf_cgroup_subsys_link_lops,
> + prog);
> + link->ss = attach_ss;
> +
> + err = bpf_link_prime(&link->link, &link_primer);
> + if (err) {
> + kfree(link);
> + return err;
> + }
> +
> + err = cgroup_subsys_bpf_attach(attach_ss, prog);
> + if (err) {
> + bpf_link_cleanup(&link_primer);
> + return err;
> + }
> +
> + return bpf_link_settle(&link_primer);
> +}
> +
> +static const struct bpf_func_proto *
> +cgroup_subsys_rstat_func_proto(enum bpf_func_id func_id,
> + const struct bpf_prog *prog)
> +{
> + return bpf_base_func_proto(func_id);
> +}
> +
> +static bool cgroup_subsys_rstat_is_valid_access(int off, int size,
> + enum bpf_access_type type,
> + const struct bpf_prog *prog,
> + struct bpf_insn_access_aux *info)
> +{
> + if (type == BPF_WRITE)
> + return false;
> +
> + if (off < 0 || off + size > sizeof(struct bpf_rstat_ctx))
> + return false;
> + /* The verifier guarantees that size > 0 */
> + if (off % size != 0)
> + return false;
> +
> + switch (off) {
> + case offsetof(struct bpf_rstat_ctx, cgroup_id):
> + return size == sizeof(__u64);
> + case offsetof(struct bpf_rstat_ctx, parent_cgroup_id):
> + return size == sizeof(__u64);
> + case offsetof(struct bpf_rstat_ctx, cpu):
> + return size == sizeof(__s32);
> + default:
> + return false;
> + }
> +}
> +
> +const struct bpf_prog_ops cgroup_subsys_rstat_prog_ops = {
> +};
> +
> +const struct bpf_verifier_ops cgroup_subsys_rstat_verifier_ops = {
> + .get_func_proto = cgroup_subsys_rstat_func_proto,
> + .is_valid_access = cgroup_subsys_rstat_is_valid_access,
> +};
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index cdaa1152436a..48149c54d969 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -3,6 +3,7 @@
> */
> #include <linux/bpf.h>
> #include <linux/bpf-cgroup.h>
> +#include <linux/bpf-cgroup-subsys.h>
> #include <linux/bpf_trace.h>
> #include <linux/bpf_lirc.h>
> #include <linux/bpf_verifier.h>
> @@ -3194,6 +3195,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type)
> return BPF_PROG_TYPE_SK_LOOKUP;
> case BPF_XDP:
> return BPF_PROG_TYPE_XDP;
> + case BPF_CGROUP_SUBSYS_RSTAT:
> + return BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT;
> default:
> return BPF_PROG_TYPE_UNSPEC;
> }
> @@ -4341,6 +4344,9 @@ static int link_create(union bpf_attr *attr, bpfptr_t uattr)
> else
> ret = bpf_kprobe_multi_link_attach(attr, prog);
> break;
> + case BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT:
> + ret = cgroup_subsys_bpf_link_attach(attr, prog);
> + break;
> default:
> ret = -EINVAL;
> }
> diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
> index adb820e98f24..7b1448013009 100644
> --- a/kernel/cgroup/cgroup.c
> +++ b/kernel/cgroup/cgroup.c
> @@ -5745,6 +5745,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
>
> idr_init(&ss->css_idr);
> INIT_LIST_HEAD(&ss->cfts);
> + INIT_LIST_HEAD(&ss->bpf.rstat_flushers);
>
> /* Create the root cgroup state for this subsystem */
> ss->root = &cgrp_dfl_root;
> diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
> index d14b10b85e51..0f4855fa85db 100644
> --- a/tools/include/uapi/linux/bpf.h
> +++ b/tools/include/uapi/linux/bpf.h
> @@ -952,6 +952,7 @@ enum bpf_prog_type {
> BPF_PROG_TYPE_LSM,
> BPF_PROG_TYPE_SK_LOOKUP,
> BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */
> + BPF_PROG_TYPE_CGROUP_SUBSYS_RSTAT,
> };
>
> enum bpf_attach_type {
> @@ -998,6 +999,7 @@ enum bpf_attach_type {
> BPF_SK_REUSEPORT_SELECT_OR_MIGRATE,
> BPF_PERF_EVENT,
> BPF_TRACE_KPROBE_MULTI,
> + BPF_CGROUP_SUBSYS_RSTAT,
> __MAX_BPF_ATTACH_TYPE
> };
>
> @@ -1013,6 +1015,7 @@ enum bpf_link_type {
> BPF_LINK_TYPE_XDP = 6,
> BPF_LINK_TYPE_PERF_EVENT = 7,
> BPF_LINK_TYPE_KPROBE_MULTI = 8,
> + BPF_LINK_TYPE_CGROUP_SUBSYS = 9,
>
> MAX_BPF_LINK_TYPE,
> };
> @@ -1482,6 +1485,9 @@ union bpf_attr {
> */
> __u64 bpf_cookie;
> } perf_event;
> + struct {
> + __u64 name;
> + } cgroup_subsys;
> struct {
> __u32 flags;
> __u32 cnt;
> @@ -6324,6 +6330,12 @@ struct bpf_cgroup_dev_ctx {
> __u32 minor;
> };
>
> +struct bpf_rstat_ctx {
> + __u64 cgroup_id;
> + __u64 parent_cgroup_id; /* 0 if root */
> + __s32 cpu;
> +};
> +
> struct bpf_raw_tracepoint_args {
> __u64 args[0];
> };
> --
> 2.36.0.512.ge40c2bad7a-goog
>