Re: [PATCH v3 bpf-next 03/10] bpf: introduce per-cpu cgroup local storage
From: Roman Gushchin
Date: Fri Sep 28 2018 - 06:04:07 EST
On Fri, Sep 28, 2018 at 10:45:31AM +0200, Alexei Starovoitov wrote:
> On Wed, Sep 26, 2018 at 12:33:19PM +0100, Roman Gushchin wrote:
> > This commit introduced per-cpu cgroup local storage.
> >
> > Per-cpu cgroup local storage is very similar to simple cgroup storage
> > (let's call it shared), except all the data is per-cpu.
> >
> > The main goal of per-cpu variant is to implement super fast
> > counters (e.g. packet counters), which don't require neither
> > lookups, neither atomic operations.
> >
> > From userspace's point of view, accessing a per-cpu cgroup storage
> > is similar to other per-cpu map types (e.g. per-cpu hashmaps and
> > arrays).
> >
> > Writing to a per-cpu cgroup storage is not atomic, but is performed
> > by copying longs, so some minimal atomicity is here, exactly
> > as with other per-cpu maps.
> >
> > Signed-off-by: Roman Gushchin <guro@xxxxxx>
> > Cc: Daniel Borkmann <daniel@xxxxxxxxxxxxx>
> > Cc: Alexei Starovoitov <ast@xxxxxxxxxx>
> > ---
> > include/linux/bpf-cgroup.h | 20 ++++-
> > include/linux/bpf.h | 1 +
> > include/linux/bpf_types.h | 1 +
> > include/uapi/linux/bpf.h | 1 +
> > kernel/bpf/helpers.c | 8 +-
> > kernel/bpf/local_storage.c | 148 ++++++++++++++++++++++++++++++++-----
> > kernel/bpf/syscall.c | 11 ++-
> > kernel/bpf/verifier.c | 15 +++-
> > 8 files changed, 177 insertions(+), 28 deletions(-)
> >
> > diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> > index 7e0c9a1d48b7..588dd5f0bd85 100644
> > --- a/include/linux/bpf-cgroup.h
> > +++ b/include/linux/bpf-cgroup.h
> > @@ -37,7 +37,10 @@ struct bpf_storage_buffer {
> > };
> >
> > struct bpf_cgroup_storage {
> > - struct bpf_storage_buffer *buf;
> > + union {
> > + struct bpf_storage_buffer *buf;
> > + void __percpu *percpu_buf;
> > + };
> > struct bpf_cgroup_storage_map *map;
> > struct bpf_cgroup_storage_key key;
> > struct list_head list;
> > @@ -109,6 +112,9 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
> > static inline enum bpf_cgroup_storage_type cgroup_storage_type(
> > struct bpf_map *map)
> > {
> > + if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
> > + return BPF_CGROUP_STORAGE_PERCPU;
> > +
> > return BPF_CGROUP_STORAGE_SHARED;
> > }
> >
> > @@ -131,6 +137,10 @@ void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage);
> > int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map);
> > void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map);
> >
> > +int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value);
> > +int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
> > + void *value, u64 flags);
> > +
> > /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
> > #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \
> > ({ \
> > @@ -285,6 +295,14 @@ static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
> > struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return 0; }
> > static inline void bpf_cgroup_storage_free(
> > struct bpf_cgroup_storage *storage) {}
> > +static inline int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key,
> > + void *value) {
> > + return 0;
> > +}
> > +static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
> > + void *key, void *value, u64 flags) {
> > + return 0;
> > +}
> >
> > #define cgroup_bpf_enabled (0)
> > #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
> > diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> > index b457fbe7b70b..018299a595c8 100644
> > --- a/include/linux/bpf.h
> > +++ b/include/linux/bpf.h
> > @@ -274,6 +274,7 @@ struct bpf_prog_offload {
> >
> > enum bpf_cgroup_storage_type {
> > BPF_CGROUP_STORAGE_SHARED,
> > + BPF_CGROUP_STORAGE_PERCPU,
> > __BPF_CGROUP_STORAGE_MAX
> > };
> >
> > diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> > index c9bd6fb765b0..5432f4c9f50e 100644
> > --- a/include/linux/bpf_types.h
> > +++ b/include/linux/bpf_types.h
> > @@ -43,6 +43,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops)
> > #endif
> > #ifdef CONFIG_CGROUP_BPF
> > BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops)
> > +BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, cgroup_storage_map_ops)
> > #endif
> > BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops)
> > BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops)
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index aa5ccd2385ed..e2070d819e04 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -127,6 +127,7 @@ enum bpf_map_type {
> > BPF_MAP_TYPE_SOCKHASH,
> > BPF_MAP_TYPE_CGROUP_STORAGE,
> > BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
> > + BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
> > };
> >
> > enum bpf_prog_type {
> > diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> > index e42f8789b7ea..6502115e8f55 100644
> > --- a/kernel/bpf/helpers.c
> > +++ b/kernel/bpf/helpers.c
> > @@ -206,10 +206,16 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
> > */
> > enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
> > struct bpf_cgroup_storage *storage;
> > + void *ptr;
> >
> > storage = this_cpu_read(bpf_cgroup_storage[stype]);
> >
> > - return (unsigned long)&READ_ONCE(storage->buf)->data[0];
> > + if (stype == BPF_CGROUP_STORAGE_SHARED)
> > + ptr = &READ_ONCE(storage->buf)->data[0];
> > + else
> > + ptr = this_cpu_ptr(storage->percpu_buf);
> > +
> > + return (unsigned long)ptr;
> > }
> >
> > const struct bpf_func_proto bpf_get_local_storage_proto = {
> > diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
> > index 6742292fb39e..c739f6dcc3c2 100644
> > --- a/kernel/bpf/local_storage.c
> > +++ b/kernel/bpf/local_storage.c
> > @@ -152,6 +152,71 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
> > return 0;
> > }
> >
> > +int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key,
> > + void *value)
> > +{
> > + struct bpf_cgroup_storage_map *map = map_to_storage(_map);
> > + struct bpf_cgroup_storage_key *key = _key;
> > + struct bpf_cgroup_storage *storage;
> > + int cpu, off = 0;
> > + u32 size;
> > +
> > + rcu_read_lock();
> > + storage = cgroup_storage_lookup(map, key, false);
> > + if (!storage) {
> > + rcu_read_unlock();
> > + return -ENOENT;
> > + }
> > +
> > + /* per_cpu areas are zero-filled and bpf programs can only
> > + * access 'value_size' of them, so copying rounded areas
> > + * will not leak any kernel data
> > + */
> > + size = round_up(_map->value_size, 8);
> > + for_each_possible_cpu(cpu) {
> > + bpf_long_memcpy(value + off,
> > + per_cpu_ptr(storage->percpu_buf, cpu), size);
> > + off += size;
> > + }
> > + rcu_read_unlock();
> > + return 0;
> > +}
> > +
> > +int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key,
> > + void *value, u64 map_flags)
> > +{
> > + struct bpf_cgroup_storage_map *map = map_to_storage(_map);
> > + struct bpf_cgroup_storage_key *key = _key;
> > + struct bpf_cgroup_storage *storage;
> > + int cpu, off = 0;
> > + u32 size;
> > +
> > + if (unlikely(map_flags & BPF_EXIST))
> > + return -EINVAL;
>
> that should have been BPF_NOEXIST ?
Yeah, or maybe even better s/&/!= ?
It's probably better to require BPF_EXIST flag to update a cgroup storage?
Agree? If so, let me fix this for both shared and per-cpu versions in
a follow-up patch.
>
> > +
> > + rcu_read_lock();
> > + storage = cgroup_storage_lookup(map, key, false);
> > + if (!storage) {
> > + rcu_read_unlock();
> > + return -ENOENT;
> > + }
> > +
> > + /* the user space will provide round_up(value_size, 8) bytes that
> > + * will be copied into per-cpu area. bpf programs can only access
> > + * value_size of it. During lookup the same extra bytes will be
> > + * returned or zeros which were zero-filled by percpu_alloc,
> > + * so no kernel data leaks possible
> > + */
> > + size = round_up(_map->value_size, 8);
> > + for_each_possible_cpu(cpu) {
> > + bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu),
> > + value + off, size);
> > + off += size;
> > + }
> > + rcu_read_unlock();
>
> storage_update and storage_copy look essentially the same
> with the only difference that src/dst swap.
> Would it be possible to combine them ?
> Not sure whether #define template would look better.
I'll try to refactor this and next one in a follow-up patch.
Thanks!