[PATCH v3 bpf-next 03/10] bpf: introduce per-cpu cgroup local storage
From: Roman Gushchin
Date: Wed Sep 26 2018 - 07:35:27 EST
This commit introduces per-cpu cgroup local storage.

Per-cpu cgroup local storage is very similar to simple cgroup storage
(let's call it shared), except that all the data is per-cpu.
The main goal of the per-cpu variant is to implement super-fast
counters (e.g. packet counters), which require neither lookups nor
atomic operations.
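For illustration only, here is a minimal sketch (not part of this
patch) of how a bpf program could use such a map as a packet counter;
the map, section and program names are made up, and the map definition
uses the plain bpf_map_def style:

	#include <linux/bpf.h>
	#include "bpf_helpers.h"

	/* Illustrative map: one 8-byte counter per cgroup per cpu. */
	struct bpf_map_def SEC("maps") percpu_cnt = {
		.type = BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
		.key_size = sizeof(struct bpf_cgroup_storage_key),
		.value_size = sizeof(__u64),
	};

	SEC("cgroup/skb")
	int count_packets(struct __sk_buff *skb)
	{
		__u64 *cnt;

		/* bpf_get_local_storage() returns a pointer into this
		 * cpu's copy of the data, so a plain non-atomic
		 * increment is enough.
		 */
		cnt = bpf_get_local_storage(&percpu_cnt, 0);
		(*cnt)++;

		return 1;
	}
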
From userspace's point of view, accessing a per-cpu cgroup storage
is similar to other per-cpu map types (e.g. per-cpu hashmaps and
arrays).
Writing to a per-cpu cgroup storage is not atomic, but is performed
by copying longs, so some minimal atomicity is provided, exactly as
with other per-cpu maps.
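As an illustration (again, not part of this patch), a userspace sketch
that dumps all per-cpu values of one entry and sums them; the helper
name is made up, the include paths are the usual libbpf ones, and
obtaining the map fd, the cgroup key and the number of possible cpus
is left out:

	#include <linux/bpf.h>
	#include <bpf/bpf.h>

	/* value_size is 8 here, so no extra rounding is needed; in
	 * general the buffer must hold
	 * round_up(value_size, 8) * num_possible_cpus() bytes.
	 */
	static __u64 sum_percpu_counter(int map_fd,
					struct bpf_cgroup_storage_key *key,
					int nr_cpus)
	{
		__u64 values[nr_cpus];
		__u64 sum = 0;
		int cpu;

		/* One lookup returns the values for all possible cpus. */
		if (bpf_map_lookup_elem(map_fd, key, values))
			return 0;

		for (cpu = 0; cpu < nr_cpus; cpu++)
			sum += values[cpu];

		return sum;
	}
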
Signed-off-by: Roman Gushchin <guro@xxxxxx>
Cc: Daniel Borkmann <daniel@xxxxxxxxxxxxx>
Cc: Alexei Starovoitov <ast@xxxxxxxxxx>
---
include/linux/bpf-cgroup.h |  20 ++++-
include/linux/bpf.h        |   1 +
include/linux/bpf_types.h  |   1 +
include/uapi/linux/bpf.h   |   1 +
kernel/bpf/helpers.c       |   8 +-
kernel/bpf/local_storage.c | 148 ++++++++++++++++++++++++++++++++-----
kernel/bpf/syscall.c       |  11 ++-
kernel/bpf/verifier.c      |  15 +++-
8 files changed, 177 insertions(+), 28 deletions(-)
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
index 7e0c9a1d48b7..588dd5f0bd85 100644
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -37,7 +37,10 @@ struct bpf_storage_buffer {
};
struct bpf_cgroup_storage {
- struct bpf_storage_buffer *buf;
+ union {
+ struct bpf_storage_buffer *buf;
+ void __percpu *percpu_buf;
+ };
struct bpf_cgroup_storage_map *map;
struct bpf_cgroup_storage_key key;
struct list_head list;
@@ -109,6 +112,9 @@ int __cgroup_bpf_check_dev_permission(short dev_type, u32 major, u32 minor,
static inline enum bpf_cgroup_storage_type cgroup_storage_type(
struct bpf_map *map)
{
+ if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
+ return BPF_CGROUP_STORAGE_PERCPU;
+
return BPF_CGROUP_STORAGE_SHARED;
}
@@ -131,6 +137,10 @@ void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage);
int bpf_cgroup_storage_assign(struct bpf_prog *prog, struct bpf_map *map);
void bpf_cgroup_storage_release(struct bpf_prog *prog, struct bpf_map *map);
+int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value);
+int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key,
+ void *value, u64 flags);
+
/* Wrappers for __cgroup_bpf_run_filter_skb() guarded by cgroup_bpf_enabled. */
#define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk, skb) \
({ \
@@ -285,6 +295,14 @@ static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(
struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return 0; }
static inline void bpf_cgroup_storage_free(
struct bpf_cgroup_storage *storage) {}
+static inline int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key,
+ void *value) {
+ return 0;
+}
+static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
+ void *key, void *value, u64 flags) {
+ return 0;
+}
#define cgroup_bpf_enabled (0)
#define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index b457fbe7b70b..018299a595c8 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -274,6 +274,7 @@ struct bpf_prog_offload {
enum bpf_cgroup_storage_type {
BPF_CGROUP_STORAGE_SHARED,
+ BPF_CGROUP_STORAGE_PERCPU,
__BPF_CGROUP_STORAGE_MAX
};
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index c9bd6fb765b0..5432f4c9f50e 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -43,6 +43,7 @@ BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops)
#endif
#ifdef CONFIG_CGROUP_BPF
BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_STORAGE, cgroup_storage_map_ops)
+BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE, cgroup_storage_map_ops)
#endif
BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops)
BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index aa5ccd2385ed..e2070d819e04 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -127,6 +127,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_SOCKHASH,
BPF_MAP_TYPE_CGROUP_STORAGE,
BPF_MAP_TYPE_REUSEPORT_SOCKARRAY,
+ BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE,
};
enum bpf_prog_type {
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index e42f8789b7ea..6502115e8f55 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -206,10 +206,16 @@ BPF_CALL_2(bpf_get_local_storage, struct bpf_map *, map, u64, flags)
*/
enum bpf_cgroup_storage_type stype = cgroup_storage_type(map);
struct bpf_cgroup_storage *storage;
+ void *ptr;
storage = this_cpu_read(bpf_cgroup_storage[stype]);
- return (unsigned long)&READ_ONCE(storage->buf)->data[0];
+ if (stype == BPF_CGROUP_STORAGE_SHARED)
+ ptr = &READ_ONCE(storage->buf)->data[0];
+ else
+ ptr = this_cpu_ptr(storage->percpu_buf);
+
+ return (unsigned long)ptr;
}
const struct bpf_func_proto bpf_get_local_storage_proto = {
diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c
index 6742292fb39e..c739f6dcc3c2 100644
--- a/kernel/bpf/local_storage.c
+++ b/kernel/bpf/local_storage.c
@@ -152,6 +152,71 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key,
return 0;
}
+int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key,
+ void *value)
+{
+ struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+ struct bpf_cgroup_storage_key *key = _key;
+ struct bpf_cgroup_storage *storage;
+ int cpu, off = 0;
+ u32 size;
+
+ rcu_read_lock();
+ storage = cgroup_storage_lookup(map, key, false);
+ if (!storage) {
+ rcu_read_unlock();
+ return -ENOENT;
+ }
+
+ /* per_cpu areas are zero-filled and bpf programs can only
+ * access 'value_size' of them, so copying rounded areas
+ * will not leak any kernel data
+ */
+ size = round_up(_map->value_size, 8);
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(value + off,
+ per_cpu_ptr(storage->percpu_buf, cpu), size);
+ off += size;
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key,
+ void *value, u64 map_flags)
+{
+ struct bpf_cgroup_storage_map *map = map_to_storage(_map);
+ struct bpf_cgroup_storage_key *key = _key;
+ struct bpf_cgroup_storage *storage;
+ int cpu, off = 0;
+ u32 size;
+
+ if (unlikely(map_flags & BPF_EXIST))
+ return -EINVAL;
+
+ rcu_read_lock();
+ storage = cgroup_storage_lookup(map, key, false);
+ if (!storage) {
+ rcu_read_unlock();
+ return -ENOENT;
+ }
+
+ /* the user space will provide round_up(value_size, 8) bytes that
+ * will be copied into per-cpu area. bpf programs can only access
+ * value_size of it. During lookup the same extra bytes will be
+ * returned or zeros which were zero-filled by percpu_alloc,
+ * so no kernel data leaks possible
+ */
+ size = round_up(_map->value_size, 8);
+ for_each_possible_cpu(cpu) {
+ bpf_long_memcpy(per_cpu_ptr(storage->percpu_buf, cpu),
+ value + off, size);
+ off += size;
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key,
void *_next_key)
{
@@ -292,55 +357,98 @@ struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog,
{
struct bpf_cgroup_storage *storage;
struct bpf_map *map;
+ gfp_t flags;
+ size_t size;
u32 pages;
map = prog->aux->cgroup_storage[stype];
if (!map)
return NULL;
- pages = round_up(sizeof(struct bpf_cgroup_storage) +
- sizeof(struct bpf_storage_buffer) +
- map->value_size, PAGE_SIZE) >> PAGE_SHIFT;
+ if (stype == BPF_CGROUP_STORAGE_SHARED) {
+ size = sizeof(struct bpf_storage_buffer) + map->value_size;
+ pages = round_up(sizeof(struct bpf_cgroup_storage) + size,
+ PAGE_SIZE) >> PAGE_SHIFT;
+ } else {
+ size = map->value_size;
+ pages = round_up(round_up(size, 8) * num_possible_cpus(),
+ PAGE_SIZE) >> PAGE_SHIFT;
+ }
+
if (bpf_map_charge_memlock(map, pages))
return ERR_PTR(-EPERM);
storage = kmalloc_node(sizeof(struct bpf_cgroup_storage),
__GFP_ZERO | GFP_USER, map->numa_node);
- if (!storage) {
- bpf_map_uncharge_memlock(map, pages);
- return ERR_PTR(-ENOMEM);
- }
+ if (!storage)
+ goto enomem;
- storage->buf = kmalloc_node(sizeof(struct bpf_storage_buffer) +
- map->value_size, __GFP_ZERO | GFP_USER,
- map->numa_node);
- if (!storage->buf) {
- bpf_map_uncharge_memlock(map, pages);
- kfree(storage);
- return ERR_PTR(-ENOMEM);
+ flags = __GFP_ZERO | GFP_USER;
+
+ if (stype == BPF_CGROUP_STORAGE_SHARED) {
+ storage->buf = kmalloc_node(size, flags, map->numa_node);
+ if (!storage->buf)
+ goto enomem;
+ } else {
+ storage->percpu_buf = __alloc_percpu_gfp(size, 8, flags);
+ if (!storage->percpu_buf)
+ goto enomem;
}
storage->map = (struct bpf_cgroup_storage_map *)map;
return storage;
+
+enomem:
+ bpf_map_uncharge_memlock(map, pages);
+ kfree(storage);
+ return ERR_PTR(-ENOMEM);
+}
+
+static void free_shared_cgroup_storage_rcu(struct rcu_head *rcu)
+{
+ struct bpf_cgroup_storage *storage =
+ container_of(rcu, struct bpf_cgroup_storage, rcu);
+
+ kfree(storage->buf);
+ kfree(storage);
+}
+
+static void free_percpu_cgroup_storage_rcu(struct rcu_head *rcu)
+{
+ struct bpf_cgroup_storage *storage =
+ container_of(rcu, struct bpf_cgroup_storage, rcu);
+
+ free_percpu(storage->percpu_buf);
+ kfree(storage);
}
void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage)
{
- u32 pages;
+ enum bpf_cgroup_storage_type stype;
struct bpf_map *map;
+ u32 pages;
if (!storage)
return;
map = &storage->map->map;
- pages = round_up(sizeof(struct bpf_cgroup_storage) +
- sizeof(struct bpf_storage_buffer) +
- map->value_size, PAGE_SIZE) >> PAGE_SHIFT;
+ stype = cgroup_storage_type(map);
+ if (stype == BPF_CGROUP_STORAGE_SHARED)
+ pages = round_up(sizeof(struct bpf_cgroup_storage) +
+ sizeof(struct bpf_storage_buffer) +
+ map->value_size, PAGE_SIZE) >> PAGE_SHIFT;
+ else
+ pages = round_up(round_up(map->value_size, 8) *
+ num_possible_cpus(),
+ PAGE_SIZE) >> PAGE_SHIFT;
+
bpf_map_uncharge_memlock(map, pages);
- kfree_rcu(storage->buf, rcu);
- kfree_rcu(storage, rcu);
+ if (stype == BPF_CGROUP_STORAGE_SHARED)
+ call_rcu(&storage->rcu, free_shared_cgroup_storage_rcu);
+ else
+ call_rcu(&storage->rcu, free_percpu_cgroup_storage_rcu);
}
void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 8c91d2b41b1e..5742df21598c 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -686,7 +686,8 @@ static int map_lookup_elem(union bpf_attr *attr)
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+ map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
value_size = round_up(map->value_size, 8) * num_possible_cpus();
else if (IS_FD_MAP(map))
value_size = sizeof(u32);
@@ -705,6 +706,8 @@ static int map_lookup_elem(union bpf_attr *attr)
err = bpf_percpu_hash_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
err = bpf_percpu_array_copy(map, key, value);
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
+ err = bpf_percpu_cgroup_storage_copy(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) {
err = bpf_stackmap_copy(map, key, value);
} else if (IS_FD_ARRAY(map)) {
@@ -774,7 +777,8 @@ static int map_update_elem(union bpf_attr *attr)
if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH ||
- map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
+ map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
value_size = round_up(map->value_size, 8) * num_possible_cpus();
else
value_size = map->value_size;
@@ -809,6 +813,9 @@ static int map_update_elem(union bpf_attr *attr)
err = bpf_percpu_hash_update(map, key, value, attr->flags);
} else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) {
err = bpf_percpu_array_update(map, key, value, attr->flags);
+ } else if (map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE) {
+ err = bpf_percpu_cgroup_storage_update(map, key, value,
+ attr->flags);
} else if (IS_FD_ARRAY(map)) {
rcu_read_lock();
err = bpf_fd_array_map_update_elem(map, f.file, key, value,
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index e90899df585d..a8cc83a970d1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2074,6 +2074,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
goto error;
break;
case BPF_MAP_TYPE_CGROUP_STORAGE:
+ case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE:
if (func_id != BPF_FUNC_get_local_storage)
goto error;
break;
@@ -2164,7 +2165,8 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
goto error;
break;
case BPF_FUNC_get_local_storage:
- if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE)
+ if (map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
+ map->map_type != BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE)
goto error;
break;
case BPF_FUNC_sk_select_reuseport:
@@ -5049,6 +5051,12 @@ static int check_map_prog_compatibility(struct bpf_verifier_env *env,
return 0;
}
+static bool bpf_map_is_cgroup_storage(struct bpf_map *map)
+{
+ return (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE ||
+ map->map_type == BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE);
+}
+
/* look for pseudo eBPF instructions that access map FDs and
* replace them with actual map pointers
*/
@@ -5139,10 +5147,9 @@ static int replace_map_fd_with_map_ptr(struct bpf_verifier_env *env)
}
env->used_maps[env->used_map_cnt++] = map;
- if (map->map_type == BPF_MAP_TYPE_CGROUP_STORAGE &&
+ if (bpf_map_is_cgroup_storage(map) &&
bpf_cgroup_storage_assign(env->prog, map)) {
- verbose(env,
- "only one cgroup storage is allowed\n");
+ verbose(env, "only one cgroup storage of each type is allowed\n");
fdput(f);
return -EBUSY;
}
--
2.17.1