Re: [PATCH v2 bpf-next 03/10] bpf: introduce per-cpu cgroup local storage

From: Roman Gushchin
Date: Wed Sep 26 2018 - 04:43:06 EST


On Tue, Sep 25, 2018 at 11:54:33AM -0700, Song Liu wrote:
>
>
> > On Sep 25, 2018, at 8:21 AM, Roman Gushchin <guro@xxxxxx> wrote:
> >
> > This commit introduces per-cpu cgroup local storage.
> >
> > Per-cpu cgroup local storage is very similar to simple cgroup storage
> > (let's call it shared), except all the data is per-cpu.
> >
> > The main goal of the per-cpu variant is to implement super-fast
> > counters (e.g. packet counters), which require neither lookups
> > nor atomic operations.
> >
> > From userspace's point of view, accessing a per-cpu cgroup storage
> > is similar to other per-cpu map types (e.g. per-cpu hashmaps and
> > arrays).
> >
> > Writing to a per-cpu cgroup storage is not atomic, but it is
> > performed by copying longs, so some minimal atomicity is provided,
> > exactly as with other per-cpu maps.
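
For context, from user space this behaves like the other per-cpu map
types: a lookup returns one 8-byte-aligned slot per possible CPU. A
rough sketch of reading such counters, assuming libbpf's
bpf_map_lookup_elem(); map_fd, cg_id, nr_cpus and MAX_CPUS are
placeholders:

	struct bpf_cgroup_storage_key key = {
		.cgroup_inode_id = cg_id,	/* assumed cgroup id */
		.attach_type = BPF_CGROUP_INET_EGRESS,
	};
	__u64 values[MAX_CPUS];	/* value_size == 8: one u64 per CPU */
	__u64 sum = 0;
	int cpu;

	if (bpf_map_lookup_elem(map_fd, &key, values))
		return -1;
	for (cpu = 0; cpu < nr_cpus; cpu++)
		sum += values[cpu];	/* aggregate the per-cpu counters */
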
> >
> > Signed-off-by: Roman Gushchin <guro@xxxxxx>
> > Cc: Daniel Borkmann <daniel@xxxxxxxxxxxxx>
> > Cc: Alexei Starovoitov <ast@xxxxxxxxxx>
> > ---
> > include/linux/bpf-cgroup.h |  20 ++++-
> > include/linux/bpf.h        |   1 +
> > include/linux/bpf_types.h  |   1 +
> > include/uapi/linux/bpf.h   |   1 +
> > kernel/bpf/helpers.c       |   8 +-
> > kernel/bpf/local_storage.c | 148 ++++++++++++++++++++++++++++++++-----
> > kernel/bpf/syscall.c       |  11 ++-
> > kernel/bpf/verifier.c      |  15 +++-
> > 8 files changed, 177 insertions(+), 28 deletions(-)
> >
> > diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> > index 7e0c9a1d48b7..9bd907657f9b 100644
> > --- a/include/linux/bpf-cgroup.h
> > +++ b/include/linux/bpf-cgroup.h
> > @@ -37,7 +37,10 @@ struct bpf_storage_buffer {
> > };
> >
> > struct bpf_cgroup_storage {
> > - struct bpf_storage_buffer *buf;
> > + union {
> > + struct bpf_storage_buffer *buf;
> > + char __percpu *percpu_buf;
>
> "char *" here looks weird. Did you mean to use "void *"?

Fair enough. It's probably a leftover from the previously used char[0].

>
> > + };
> > struct bpf_cgroup_storage_map *map;
> > struct bpf_cgroup_storage_key key;
> > struct list_head list;
> > @@ -109,6 +112,9 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
> > static inline enum bpf_cgroup_storage_type cgroup_storage_type(
> > struct bpf_map *map)
> > {
> > + if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
> > + return BPF_CGROUP_STORAGE_PERCPU;
> > +
> > return BPF_CGROUP_STORAGE_SHARED;
> > }
> >
> > @@ -131,6 +137,10 @@ void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage);
> > int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map);
> > void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map);
> >
> > +int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value);
> > +int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
> > + void *value, u64 flags);
> > +
> > /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
> > #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \
> > ({ \
> > @@ -285,6 +295,14 @@ static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
> > struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return 0; }
> > static inline void bpf_cgroup_storage_free(
> > struct bpf_cgroup_storage *storage) {}
> > +static inline int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key,
> > + void *value) {
> > + return 0;
> > +}
> > +static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
> > + void *key, void *value, u64 flags) {
> > + return 0;
> > +}
> >
> > #define cgroup_bpf_enabled (0)
> > #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index b457fbe7b70b..018299a595c8 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -274,6 +274,7 @@ struct bpf_prog_offload {
> >
> > enum bpf_cgroup_storage_type {
> > BPF_CGROUP_STORAGE_SHARED,
> > + BPF_CGROUP_STORAGE_PERCPU,
> > __BPF_CGROUP_STORAGE_MAX
> > };
> >
> > diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> > index c9bd6fb765b0..5432f4c9f50e 100644
> > --- a/include/linux/bpf_types.h
> > +++ b/include/linux/bpf_types.h
> > @@ -43,6 +43,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops)
> > #endif
> > #ifdef CONFIG_CGROUP_BPF
> > BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops)
> > +BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, cgroup_storage_map_ops)
> > #endif
> > BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops)
> > BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops)
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index aa5ccd2385ed..e2070d819e04 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -127,6 +127,7 @@ enum bpf_map_type {
> > BPF_MAP_TYPE_SOCKHASH,
> > BPF_MAP_TYPE_CGROUP_STORAGE,
> > BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
> > + BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
> > };
> >
> > enum bpf_prog_type {
> > diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> > index e42f8789b7ea..1f21ef1c4ad3 100644
> > --- a/kernel/bpf/helpers.c
> > +++ b/kernel/bpf/helpers.c
> > @@ -206,10 +206,16 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
> > */
> > enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
> > struct bpf_cgroup_storage *storage;
> > + void *ptr = NULL;
>
> Not necessary to initialize to NULL.

Fixed.

>
> >
> > storage = this_cpu_read(bpf_cgroup_storage[stype]);
> >
> > - return (unsigned long)&READ_ONCE(storage->buf)->data[0];
> > + if (stype == BPF_CGROUP_STORAGE_SHARED)
> > + ptr = &READ_ONCE(storage->buf)->data[0];
> > + else
> > + ptr = this_cpu_ptr(storage->percpu_buf);
> > +
> > + return (unsigned long)ptr;
> > }
> >
> > const struct bpf_func_proto bpf_get_local_storage_proto = {
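
On the program side the calling convention of bpf_get_local_storage()
is unchanged; a minimal sketch of a per-cpu packet counter, written in
the bpf_map_def/SEC() style of the selftests of this era (map and
section names made up):

	struct bpf_map_def SEC("maps") pkt_cnt = {
		.type = BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
		.key_size = sizeof(struct bpf_cgroup_storage_key),
		.value_size = sizeof(__u64),
	};

	SEC("cgroup_skb/egress")
	int count_packets(struct __sk_buff *skb)
	{
		__u64 *cnt = bpf_get_local_storage(&pkt_cnt, 0);

		(*cnt)++;	/* this CPU's slot: no lock, no atomic op */
		return 1;	/* allow the packet */
	}
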
> > diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
> > index 6742292fb39e..d991355b5b46 100644
> > --- a/kernel/bpf/local_storage.c
> > +++ b/kernel/bpf/local_storage.c
> > @@ -152,6 +152,71 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
> > return 0;
> > }
> >
> > +int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key,
> > + void *value)
> > +{
> > + struct bpf_cgroup_storage_map *map = map_to_storage(_map);
> > + struct bpf_cgroup_storage_key *key = _key;
> > + struct bpf_cgroup_storage *storage;
> > + int cpu, off = 0;
> > + u32 size;
> > +
> > + rcu_read_lock();
> > + storage = cgroup_storage_lookup(map, key, false);
> > + if (!storage) {
> > + rcu_read_unlock();
> > + return -ENOENT;
> > + }
> > +
> > + /* per_cpu areas are zero-filled and bpf programs can only
> > + * access 'value_size' of them, so copying rounded areas
> > + * will not leak any kernel data
> > + */
> > + size = round_up(_map->value_size, 8);
> > + for_each_possible_cpu(cpu) {
> > + bpf_long_memcpy(value + off,
> > + per_cpu_ptr(storage->percpu_buf, cpu), size);
> > + off += size;
> > + }
> > + rcu_read_unlock();
> > + return 0;
> > +}
> > +
> > +int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key,
> > + void *value, u64 map_flags)
> > +{
> > + struct bpf_cgroup_storage_map *map = map_to_storage(_map);
> > + struct bpf_cgroup_storage_key *key = _key;
> > + struct bpf_cgroup_storage *storage;
> > + int cpu, off = 0;
> > + u32 size;
> > +
> > + if (unlikely(map_flags & BPF_EXIST))
> > + return -EINVAL;
> > +
> > + rcu_read_lock();
> > + storage = cgroup_storage_lookup(map, key, false);
> > + if (!storage) {
> > + rcu_read_unlock();
> > + return -ENOENT;
> > + }
> > +
> > + /* the user space will provide round_up(value_size, 8) bytes that
> > + * will be copied into per-cpu area. bpf programs can only access
> > + * value_size of it. During lookup the same extra bytes will be
> > + * returned or zeros which were zero-filled by percpu_alloc,
> > + * so no kernel data leaks possible
> > + */
> > + size = round_up(_map->value_size, 8);
> > + for_each_possible_cpu(cpu) {
> > + bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu),
> > + value + off, size);
> > + off += size;
> > + }
> > + rcu_read_unlock();
> > + return 0;
> > +}
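
The user-space side of an update mirrors this layout; e.g. resetting
all counters could look like this (a sketch; map_fd and key as in the
lookup example above, and BPF_EXIST would be rejected by this code):

	__u64 zeros[MAX_CPUS] = {};	/* one zeroed slot per possible CPU */

	if (bpf_map_update_elem(map_fd, &key, zeros, BPF_ANY))
		return -1;
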
> > +
> > static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key,
> > void *_next_key)
> > {
> > @@ -292,55 +357,98 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
> > {
> > struct bpf_cgroup_storage *storage;
> > struct bpf_map *map;
> > + gfp_t flags;
> > + size_t size;
> > u32 pages;
> >
> > map = prog->aux->cgroup_storage[stype];
> > if (!map)
> > return NULL;
> >
> > - pages = round_up(sizeof(struct bpf_cgroup_storage) +
> > - sizeof(struct bpf_storage_buffer) +
> > - map->value_size, PAGE_SIZE) >> PAGE_SHIFT;
> > + if (stype == BPF_CGROUP_STORAGE_SHARED) {
> > + size = sizeof(struct bpf_storage_buffer) + map->value_size;
> > + pages = round_up(sizeof(struct bpf_cgroup_storage) + size,
> > + PAGE_SIZE) >> PAGE_SHIFT;
> > + } else {
> > + size = map->value_size;
> > + pages = round_up(round_up(size, 8) * num_possible_cpus(),
> > + PAGE_SIZE) >> PAGE_SHIFT;
> > + }
> > +
> > if (bpf_map_charge_memlock(map, pages))
> > return ERR_PTR(-EPERM);
> >
> > storage = kmalloc_node(sizeof(struct bpf_cgroup_storage),
> > __GFP_ZERO | GFP_USER, map->numa_node);
> > - if (!storage) {
> > - bpf_map_uncharge_memlock(map, pages);
> > - return ERR_PTR(-ENOMEM);
> > - }
> > + if (!storage)
> > + goto enomem;
> >
> > - storage->buf = kmalloc_node(sizeof(struct bpf_storage_buffer) +
> > - map->value_size, __GFP_ZERO | GFP_USER,
> > - map->numa_node);
> > - if (!storage->buf) {
> > - bpf_map_uncharge_memlock(map, pages);
> > - kfree(storage);
> > - return ERR_PTR(-ENOMEM);
> > + flags = __GFP_ZERO | GFP_USER;
> > +
> > + if (stype == BPF_CGROUP_STORAGE_SHARED) {
> > + storage->buf = kmalloc_node(size, flags, map->numa_node);
> > + if (!storage->buf)
> > + goto enomem;
> > + } else {
> > + storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags);
> > + if (!storage->percpu_buf)
> > + goto enomem;
> > }
> >
> > storage->map = (struct bpf_cgroup_storage_map *)map;
> >
> > return storage;
> > +
> > +enomem:
> > + bpf_map_uncharge_memlock(map, pages);
> > + kfree(storage);
> > + return ERR_PTR(-ENOMEM);
> > +}
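
To make the charging math concrete: with value_size == 4 and 64
possible CPUs, the per-cpu branch computes round_up(4, 8) * 64 = 512
bytes, which rounds up to a one-page memlock charge.
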
> > +
> > +static void free_cgroup_storage_rcu(struct rcu_head *rcu)
>
> Maybe rename as free_shared_cgroup_storage_rcu()?

Yeah, that might be clearer.

Thank you for the review!
Will send v3 with these changes and your acks soon.

Thanks!