[PATCH net-next 2/4] bpf: bpf_htab: Add BPF_MAP_TYPE_PERCPU_HASH

From: Martin KaFai Lau
Date: Thu Jan 07 2016 - 17:37:12 EST


This patch adds the BPF_MAP_TYPE_PERCPU_HASH map type and its
htab_map_ops implementation.

Signed-off-by: Martin KaFai Lau <kafai@xxxxxx>
---
include/uapi/linux/bpf.h | 1 +
kernel/bpf/hashtab.c | 201 ++++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 201 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 8bed7f1..e4f8060 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -81,6 +81,7 @@ enum bpf_map_type {
BPF_MAP_TYPE_ARRAY,
BPF_MAP_TYPE_PROG_ARRAY,
BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+ BPF_MAP_TYPE_PERCPU_HASH,
};

enum bpf_prog_type {
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index d55df8c..63f2945 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -278,7 +278,7 @@ find_first_elem:
}

static struct htab_elem_common *htab_elem_common_alloc(struct bpf_htab *htab,
- void *key)
+ void *key)
{
struct htab_elem_common *l;

@@ -451,9 +451,208 @@ static struct bpf_map_type_list htab_type __read_mostly = {
.type = BPF_MAP_TYPE_HASH,
};

+/* A percpu hash elem is a struct htab_percpu_elem header followed by
+ * the key.  The value lives in a separately allocated per-cpu area
+ * pointed to by 'value'.
+ */
+struct htab_percpu_elem {
+ struct htab_elem_common common;
+ void __percpu *value;
+ char key[0] __aligned(8);
+};
+
+/* Convert an embedded htab_elem_common back to its htab_percpu_elem. */
+static struct htab_percpu_elem *htab_percpu_elem(struct htab_elem_common *l)
+{
+ return container_of(l, struct htab_percpu_elem, common);
+}
+
+/* Free an element together with its per-cpu value area. */
+static void htab_percpu_elem_free(struct htab_percpu_elem *l)
+{
+ free_percpu(l->value);
+ kfree(l);
+}
+
+/* RCU callback: free the element once all readers are done with it. */
+static void htab_percpu_elem_rcu_free(struct rcu_head *head)
+{
+ struct htab_elem_common *l = container_of(head,
+ struct htab_elem_common,
+ rcu);
+
+ htab_percpu_elem_free(htab_percpu_elem(l));
+}
+
+/* Unlink and free every element.
+ *
+ * NOTE(review): frees immediately, without an RCU grace period and
+ * without taking bucket locks -- this assumes it only runs from the
+ * map_free path after all users are gone; confirm against the flush
+ * callback contract of __htab_map_alloc().
+ */
+static void htab_percpu_map_flush(struct bpf_htab *htab)
+{
+ int i;
+
+ for (i = 0; i < htab->n_buckets; i++) {
+ struct hlist_head *head = select_bucket(htab, i);
+ struct hlist_node *n;
+ struct htab_elem_common *l;
+
+ hlist_for_each_entry_safe(l, n, head, hash_node) {
+ hlist_del_rcu(&l->hash_node);
+ atomic_dec(&htab->count);
+ htab_percpu_elem_free(htab_percpu_elem(l));
+ }
+ }
+}
+
+/* Called from syscall */
+static struct bpf_map *htab_percpu_map_alloc(union bpf_attr *attr)
+{
+ u32 elem_size = sizeof(struct htab_percpu_elem) +
+ round_up(attr->key_size, 8);
+ /* NOTE(review): num_possible_cpus() * value_size can overflow u32
+ * for large value_size -- verify __htab_map_alloc() bounds
+ * value_size before trusting this for memory accounting.
+ */
+ u32 elem_value_size = elem_size +
+ num_possible_cpus() * attr->value_size;
+
+ return __htab_map_alloc(attr, elem_size, elem_value_size,
+ offsetof(struct htab_percpu_elem, key),
+ htab_percpu_map_flush);
+}
+
+/* Called from syscall or from eBPF program */
+static int htab_percpu_map_delete_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct htab_elem_common *l;
+ struct hlist_head *head;
+ unsigned long flags;
+ u32 hash, key_size;
+ struct bucket *b;
+ int ret = -ENOENT;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size);
+ b = __select_bucket(htab, hash);
+ head = &b->head;
+
+ /* per-bucket lock serializes writers; may be called in_irq() */
+ raw_spin_lock_irqsave(&b->lock, flags);
+
+ l = lookup_elem_raw(htab, head, hash, key);
+
+ if (l) {
+ hlist_del_rcu(&l->hash_node);
+ atomic_dec(&htab->count);
+ /* defer the actual free until readers are done */
+ call_rcu(&l->rcu, htab_percpu_elem_rcu_free);
+ ret = 0;
+ }
+
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+ return ret;
+}
+
+/* Called from syscall or eBPF program.
+ *
+ * Returns a pointer to this cpu's copy of the value, or NULL if the
+ * key is not present.
+ */
+static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct htab_elem_common *l;
+
+ l = __htab_map_lookup_elem(htab, key);
+ if (l)
+ /* this_cpu_ptr() is the idiomatic (and cheaper) form of
+ * per_cpu_ptr(ptr, smp_processor_id())
+ */
+ return this_cpu_ptr(htab_percpu_elem(l)->value);
+
+ return NULL;
+}
+
+/* Called from syscall or from eBPF program.
+ *
+ * Copies 'value' into this cpu's copy of the element's per-cpu value
+ * area.  NOTE(review): updating an existing elem only writes the
+ * current cpu's copy -- confirm this is the intended percpu semantic.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
+ void *value, u64 map_flags)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct htab_percpu_elem *l_new, *l_old;
+ struct hlist_head *head;
+ struct bucket *b;
+ unsigned long flags;
+ int ret;
+
+ if (map_flags > BPF_EXIST)
+ /* unknown flags */
+ return -EINVAL;
+
+ WARN_ON_ONCE(!rcu_read_lock_held());
+
+ /* allocate new element outside of lock */
+ l_new = htab_percpu_elem(htab_elem_common_alloc(htab, key));
+ if (!l_new)
+ return -ENOMEM;
+
+ l_new->value = __alloc_percpu_gfp(htab->map.value_size, 8,
+ GFP_ATOMIC | __GFP_NOWARN);
+ if (!l_new->value) {
+ htab_percpu_elem_free(l_new);
+ return -ENOMEM;
+ }
+
+ memcpy(raw_cpu_ptr(l_new->value), value, map->value_size);
+
+ b = __select_bucket(htab, l_new->common.hash);
+ head = &b->head;
+
+ /* bpf_map_update_elem() can be called in_irq() */
+ raw_spin_lock_irqsave(&b->lock, flags);
+
+ l_old = htab_percpu_elem(lookup_elem_raw(htab, head, l_new->common.hash,
+ key));
+
+ if (!l_old && unlikely(atomic_read(&htab->count) >= map->max_entries)) {
+ /* if elem with this 'key' doesn't exist and we've reached
+ * max_entries limit, fail insertion of new elem
+ */
+ ret = -E2BIG;
+ goto err;
+ }
+
+ if (l_old && map_flags == BPF_NOEXIST) {
+ /* elem already exists */
+ ret = -EEXIST;
+ goto err;
+ }
+
+ if (!l_old && map_flags == BPF_EXIST) {
+ /* elem doesn't exist, cannot update it */
+ ret = -ENOENT;
+ goto err;
+ }
+
+ if (l_old) {
+ /* update in place; the pre-allocated l_new is unused */
+ memcpy(this_cpu_ptr(l_old->value), value, map->value_size);
+ } else {
+ hlist_add_head_rcu(&l_new->common.hash_node, head);
+ atomic_inc(&htab->count);
+ l_new = NULL; /* ownership moved to the hash table */
+ }
+
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+
+ /* fix: l_new (and its percpu area) was leaked when an existing
+ * elem was updated in place
+ */
+ if (l_new)
+ htab_percpu_elem_free(l_new);
+
+ return 0;
+err:
+ raw_spin_unlock_irqrestore(&b->lock, flags);
+ htab_percpu_elem_free(l_new);
+ return ret;
+}
+
+/* Map ops for BPF_MAP_TYPE_PERCPU_HASH; map_free and map_get_next_key
+ * are shared with the regular hash map.
+ */
+static const struct bpf_map_ops htab_percpu_ops = {
+ .map_alloc = htab_percpu_map_alloc,
+ .map_free = htab_map_free,
+ .map_get_next_key = htab_map_get_next_key,
+ .map_lookup_elem = htab_percpu_map_lookup_elem,
+ .map_update_elem = htab_percpu_map_update_elem,
+ .map_delete_elem = htab_percpu_map_delete_elem,
+};
+
+static struct bpf_map_type_list htab_percpu_type __read_mostly = {
+ .ops = &htab_percpu_ops,
+ .type = BPF_MAP_TYPE_PERCPU_HASH,
+};
+
static int __init register_htab_map(void)
{
bpf_register_map_type(&htab_type);
+ /* register the percpu variant alongside the regular hash map */
+ bpf_register_map_type(&htab_percpu_type);
return 0;
}
late_initcall(register_htab_map);
--
2.5.1