Re: [PATCH v2 bpf-next 03/10] bpf: introduce per-cpu cgroup local storage
From: Song Liu
Date: Tue Sep 25 2018 - 14:55:03 EST
> On Sep 25, 2018, at 8:21 AM, Roman Gushchin <guro@xxxxxx> wrote:
>
> This commit introduced per-cpu cgroup local storage.
>
> Per-cpu cgroup local storage is very similar to simple cgroup storage
> (let's call it shared), except all the data is per-cpu.
>
> The main goal of per-cpu variant is to implement super fast
> counters (e.g. packet counters), which don't require neither
> lookups, neither atomic operations.
>
> From userspace's point of view, accessing a per-cpu cgroup storage
> is similar to other per-cpu map types (e.g. per-cpu hashmaps and
> arrays).
>
> Writing to a per-cpu cgroup storage is not atomic, but is performed
> by copying longs, so some minimal atomicity is here, exactly
> as with other per-cpu maps.
>
> Signed-off-by: Roman Gushchin <guro@xxxxxx>
> Cc: Daniel Borkmann <daniel@xxxxxxxxxxxxx>
> Cc: Alexei Starovoitov <ast@xxxxxxxxxx>
> ---
> include/linux/bpf-cgroup.h | 20 ++++-
> include/linux/bpf.h | 1 +
> include/linux/bpf_types.h | 1 +
> include/uapi/linux/bpf.h | 1 +
> kernel/bpf/helpers.c | 8 +-
> kernel/bpf/local_storage.c | 148 ++++++++++++++++++++++++++++++++-----
> kernel/bpf/syscall.c | 11 ++-
> kernel/bpf/verifier.c | 15 +++-
> 8 files changed, 177 insertions(+), 28 deletions(-)
>
> diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
> index 7e0c9a1d48b7..9bd907657f9b 100644
> --- a/include/linux/bpf-cgroup.h
> +++ b/include/linux/bpf-cgroup.h
> @@ -37,7 +37,10 @@ struct bpf_storage_buffer {
> };
>
> struct bpf_cgroup_storage {
> - struct bpf_storage_buffer *buf;
> + union {
> + struct bpf_storage_buffer *buf;
> + char __percpu *percpu_buf;
"char *" here looks weird. Did you mean to use "void *"?
> + };
> struct bpf_cgroup_storage_map *map;
> struct bpf_cgroup_storage_key key;
> struct list_head list;
> @@ -109,6 +112,9 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
> static inline enum bpf_cgroup_storage_type cgroup_storage_type(
> struct bpf_map *map)
> {
> + if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
> + return BPF_CGROUP_STORAGE_PERCPU;
> +
> return BPF_CGROUP_STORAGE_SHARED;
> }
>
> @@ -131,6 +137,10 @@ void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage);
> int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map);
> void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map);
>
> +int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value);
> +int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
> + void *value, u64 flags);
> +
> /* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
> #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \
> ({ \
> @@ -285,6 +295,14 @@ static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
> struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return 0; }
> static inline void bpf_cgroup_storage_free(
> struct bpf_cgroup_storage *storage) {}
> +static inline int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key,
> + void *value) {
> + return 0;
> +}
> +static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
> + void *key, void *value, u64 flags) {
> + return 0;
> +}
>
> #define cgroup_bpf_enabled (0)
> #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index b457fbe7b70b..018299a595c8 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -274,6 +274,7 @@ struct bpf_prog_offload {
>
> enum bpf_cgroup_storage_type {
> BPF_CGROUP_STORAGE_SHARED,
> + BPF_CGROUP_STORAGE_PERCPU,
> __BPF_CGROUP_STORAGE_MAX
> };
>
> diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
> index c9bd6fb765b0..5432f4c9f50e 100644
> --- a/include/linux/bpf_types.h
> +++ b/include/linux/bpf_types.h
> @@ -43,6 +43,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops)
> #endif
> #ifdef CONFIG_CGROUP_BPF
> BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops)
> +BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, cgroup_storage_map_ops)
> #endif
> BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops)
> BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops)
> diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> index aa5ccd2385ed..e2070d819e04 100644
> --- a/include/uapi/linux/bpf.h
> +++ b/include/uapi/linux/bpf.h
> @@ -127,6 +127,7 @@ enum bpf_map_type {
> BPF_MAP_TYPE_SOCKHASH,
> BPF_MAP_TYPE_CGROUP_STORAGE,
> BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
> + BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
> };
>
> enum bpf_prog_type {
> diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
> index e42f8789b7ea..1f21ef1c4ad3 100644
> --- a/kernel/bpf/helpers.c
> +++ b/kernel/bpf/helpers.c
> @@ -206,10 +206,16 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
> */
> enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
> struct bpf_cgroup_storage *storage;
> + void *ptr = NULL;
Not necessary to initialize to NULL.
>
> storage = this_cpu_read(bpf_cgroup_storage[stype]);
>
> - return (unsigned long)&READ_ONCE(storage->buf)->data[0];
> + if (stype == BPF_CGROUP_STORAGE_SHARED)
> + ptr = &READ_ONCE(storage->buf)->data[0];
> + else
> + ptr = this_cpu_ptr(storage->percpu_buf);
> +
> + return (unsigned long)ptr;
> }
>
> const struct bpf_func_proto bpf_get_local_storage_proto = {
> diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
> index 6742292fb39e..d991355b5b46 100644
> --- a/kernel/bpf/local_storage.c
> +++ b/kernel/bpf/local_storage.c
> @@ -152,6 +152,71 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
> return 0;
> }
>
> +int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key,
> + void *value)
> +{
> + struct bpf_cgroup_storage_map *map = map_to_storage(_map);
> + struct bpf_cgroup_storage_key *key = _key;
> + struct bpf_cgroup_storage *storage;
> + int cpu, off = 0;
> + u32 size;
> +
> + rcu_read_lock();
> + storage = cgroup_storage_lookup(map, key, false);
> + if (!storage) {
> + rcu_read_unlock();
> + return -ENOENT;
> + }
> +
> + /* per_cpu areas are zero-filled and bpf programs can only
> + * access 'value_size' of them, so copying rounded areas
> + * will not leak any kernel data
> + */
> + size = round_up(_map->value_size, 8);
> + for_each_possible_cpu(cpu) {
> + bpf_long_memcpy(value + off,
> + per_cpu_ptr(storage->percpu_buf, cpu), size);
> + off += size;
> + }
> + rcu_read_unlock();
> + return 0;
> +}
> +
> +int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key,
> + void *value, u64 map_flags)
> +{
> + struct bpf_cgroup_storage_map *map = map_to_storage(_map);
> + struct bpf_cgroup_storage_key *key = _key;
> + struct bpf_cgroup_storage *storage;
> + int cpu, off = 0;
> + u32 size;
> +
> + if (unlikely(map_flags & BPF_EXIST))
> + return -EINVAL;
> +
> + rcu_read_lock();
> + storage = cgroup_storage_lookup(map, key, false);
> + if (!storage) {
> + rcu_read_unlock();
> + return -ENOENT;
> + }
> +
> + /* the user space will provide round_up(value_size, 8) bytes that
> + * will be copied into per-cpu area. bpf programs can only access
> + * value_size of it. During lookup the same extra bytes will be
> + * returned or zeros which were zero-filled by percpu_alloc,
> + * so no kernel data leaks possible
> + */
> + size = round_up(_map->value_size, 8);
> + for_each_possible_cpu(cpu) {
> + bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu),
> + value + off, size);
> + off += size;
> + }
> + rcu_read_unlock();
> + return 0;
> +}
> +
> static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key,
> void *_next_key)
> {
> @@ -292,55 +357,98 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
> {
> struct bpf_cgroup_storage *storage;
> struct bpf_map *map;
> + gfp_t flags;
> + size_t size;
> u32 pages;
>
> map = prog->aux->cgroup_storage[stype];
> if (!map)
> return NULL;
>
> - pages = round_up(sizeof(struct bpf_cgroup_storage) +
> - sizeof(struct bpf_storage_buffer) +
> - map->value_size, PAGE_SIZE) >> PAGE_SHIFT;
> + if (stype == BPF_CGROUP_STORAGE_SHARED) {
> + size = sizeof(struct bpf_storage_buffer) + map->value_size;
> + pages = round_up(sizeof(struct bpf_cgroup_storage) + size,
> + PAGE_SIZE) >> PAGE_SHIFT;
> + } else {
> + size = map->value_size;
> + pages = round_up(round_up(size, 8) * num_possible_cpus(),
> + PAGE_SIZE) >> PAGE_SHIFT;
> + }
> +
> if (bpf_map_charge_memlock(map, pages))
> return ERR_PTR(-EPERM);
>
> storage = kmalloc_node(sizeof(struct bpf_cgroup_storage),
> __GFP_ZERO | GFP_USER, map->numa_node);
> - if (!storage) {
> - bpf_map_uncharge_memlock(map, pages);
> - return ERR_PTR(-ENOMEM);
> - }
> + if (!storage)
> + goto enomem;
>
> - storage->buf = kmalloc_node(sizeof(struct bpf_storage_buffer) +
> - map->value_size, __GFP_ZERO | GFP_USER,
> - map->numa_node);
> - if (!storage->buf) {
> - bpf_map_uncharge_memlock(map, pages);
> - kfree(storage);
> - return ERR_PTR(-ENOMEM);
> + flags = __GFP_ZERO | GFP_USER;
> +
> + if (stype == BPF_CGROUP_STORAGE_SHARED) {
> + storage->buf = kmalloc_node(size, flags, map->numa_node);
> + if (!storage->buf)
> + goto enomem;
> + } else {
> + storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags);
> + if (!storage->percpu_buf)
> + goto enomem;
> }
>
> storage->map = (struct bpf_cgroup_storage_map *)map;
>
> return storage;
> +
> +enomem:
> + bpf_map_uncharge_memlock(map, pages);
> + kfree(storage);
> + return ERR_PTR(-ENOMEM);
> +}
> +
> +static void free_cgroup_storage_rcu(struct rcu_head *rcu)
Maybe rename as free_shared_cgroup_storage_rcu()?
> +{
> + struct bpf_cgroup_storage *storage =
> + container_of(rcu, struct bpf_cgroup_storage, rcu);
> +
> + kfree(storage->buf);
> + kfree(storage);
> +}
> +
> +static void free_percpu_cgroup_storage_rcu(struct rcu_head *rcu)
> +{
> + struct bpf_cgroup_storage *storage =
> + container_of(rcu, struct bpf_cgroup_storage, rcu);
> +
> + free_percpu(storage->percpu_buf);
> + kfree(storage);
> }
>
> void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage)
> {
> - u32 pages;
> + enum bpf_cgroup_storage_type stype;
> struct bpf_map *map;
> + u32 pages;
>
> if (!storage)
> return;
>
> map = &storage->map->map;
> - pages = round_up(sizeof(struct bpf_cgroup_storage) +
> - sizeof(struct bpf_storage_buffer) +
> - map->value_size, PAGE_SIZE) >> PAGE_SHIFT;
> + stype = cgroup_storage_type(map);
> + if (stype == BPF_CGROUP_STORAGE_SHARED)
> + pages = round_up(sizeof(struct bpf_cgroup_storage) +
> + sizeof(struct bpf_storage_buffer) +
> + map->value_size, PAGE_SIZE) >> PAGE_SHIFT;
> + else
> + pages = round_up(round_up(map->value_size, 8) *
> + num_possible_cpus(),
> + PAGE_SIZE) >> PAGE_SHIFT;
> +
> bpf_map_uncharge_memlock(map, pages);
>
> - kfree_rcu(storage->buf, rcu);
> - kfree_rcu(storage, rcu);
> + if (stype == BPF_CGROUP_STORAGE_SHARED)
> + call_rcu(&storage->rcu, free_cgroup_storage_rcu);
> + else
> + call_rcu(&storage->rcu, free_percpu_cgroup_storage_rcu);
> }
>
> void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
> diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> index 8c91d2b41b1e..5742df21598c 100644
> --- a/kernel/bpf/syscall.c
> +++ b/kernel/bpf/syscall.c
> @@ -686,7 +686,8 @@ static int map_lookup_elem(union bpf_attr *attr)
>
> if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
> map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
> - map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
> + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
> + map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
> value_size = round_up(map->value_size, 8) * num_possible_cpus();
> else if (IS_FD_MAP(map))
> value_size = sizeof(u32);
> @@ -705,6 +706,8 @@ static int map_lookup_elem(union bpf_attr *attr)
> err = bpf_percpu_hash_copy(map, key, value);
> } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
> err = bpf_percpu_array_copy(map, key, value);
> + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
> + err = bpf_percpu_cgroup_storage_copy(map, key, value);
> } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
> err = bpf_stackmap_copy(map, key, value);
> } else if (IS_FD_ARRAY(map)) {
> @@ -774,7 +777,8 @@ static int map_update_elem(union bpf_attr *attr)
>
> if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
> map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
> - map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
> + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
> + map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
> value_size = round_up(map->value_size, 8) * num_possible_cpus();
> else
> value_size = map->value_size;
> @@ -809,6 +813,9 @@ static int map_update_elem(union bpf_attr *attr)
> err = bpf_percpu_hash_update(map, key, value, attr->flags);
> } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
> err = bpf_percpu_array_update(map, key, value, attr->flags);
> + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
> + err = bpf_percpu_cgroup_storage_update(map, key, value,
> + attr->flags);
> } else if (IS_FD_ARRAY(map)) {
> rcu_read_lock();
> err = bpf_fd_array_map_update_elem(map, f.file, key, value,
> diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
> index e75f36de91d6..d94073deb68a 100644
> --- a/kernel/bpf/verifier.c
> +++ b/kernel/bpf/verifier.c
> @@ -2074,6 +2074,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
> goto error;
> break;
> case BPF_MAP_TYPE_CGROUP_STORAGE:
> + case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
> if (func_id != BPF_FUNC_get_local_storage)
> goto error;
> break;
> @@ -2164,7 +2165,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
> goto error;
> break;
> case BPF_FUNC_get_local_storage:
> - if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE)
> + if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
> + map->map_type != BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
> goto error;
> break;
> case BPF_FUNC_sk_select_reuseport:
> @@ -5049,6 +5051,12 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
> return 0;
> }
>
> +static bool bpf_map_is_cgroup_storage(struct bpf_map *map)
> +{
> + return (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
> + map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE);
> +}
> +
> /* look for pseudo eBPF instructions that access map FDs and
> * replace them with actual map pointers
> */
> @@ -5139,10 +5147,9 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
> }
> env->used_maps[env->used_map_cnt++] = map;
>
> - if (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE &&
> + if (bpf_map_is_cgroup_storage(map) &&
> bpf_cgroup_storage_assign(env->prog, map)) {
> - verbose(env,
> - "only one cgroup storage is allowed\n");
> + verbose(env, "only one cgroup storage of each type is allowed\n");
> fdput(f);
> return -EBUSY;
> }
> --
> 2.17.1
>