Re: [PATCH 3/3] bpf: hash: use per-bucket spinlock

From: Daniel Borkmann
Date: Mon Dec 28 2015 - 04:13:27 EST


On 12/26/2015 10:31 AM, Ming Lei wrote:
From: Ming Lei <ming.lei@xxxxxxxxxxxxx>

Both htab_map_update_elem() and htab_map_delete_elem() can be
called from eBPF program, and they may be in kernel hot path,
so it isn't efficient to use a per-hashtable lock in this two
helpers.

The per-hashtable spinlock is used just for protecting bucket's
hlist, and per-bucket lock should be enough. This patch converts
the per-hashtable lock into per-bucket spinlock, so that contention
can be decreased a lot.

Signed-off-by: Ming Lei <ming.lei@xxxxxxxxxxxxx>
---
kernel/bpf/hashtab.c | 35 +++++++++++++++++++++++++----------
1 file changed, 25 insertions(+), 10 deletions(-)

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index d857fcb..66bad7a 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -14,9 +14,14 @@
#include <linux/filter.h>
#include <linux/vmalloc.h>

+struct bucket {
+ struct hlist_head head;
+ raw_spinlock_t lock;
+};
+
struct bpf_htab {
struct bpf_map map;
- struct hlist_head *buckets;
+ struct bucket *buckets;
raw_spinlock_t lock;

Shouldn't the lock member be removed from here?

atomic_t count; /* number of elements in this hashtable */
u32 n_buckets; /* number of hash buckets */
@@ -88,24 +93,25 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
/* make sure page count doesn't overflow */
goto free_htab;

- htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) +
+ htab->map.pages = round_up(htab->n_buckets * sizeof(struct bucket) +
htab->elem_size * htab->map.max_entries,
PAGE_SIZE) >> PAGE_SHIFT;

err = -ENOMEM;
- htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head),
+ htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket),
GFP_USER | __GFP_NOWARN);

if (!htab->buckets) {
- htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head));
+ htab->buckets = vmalloc(htab->n_buckets * sizeof(struct bucket));
if (!htab->buckets)
goto free_htab;
}

- for (i = 0; i < htab->n_buckets; i++)
- INIT_HLIST_HEAD(&htab->buckets[i]);
+ for (i = 0; i < htab->n_buckets; i++) {
+ INIT_HLIST_HEAD(&htab->buckets[i].head);
+ raw_spin_lock_init(&htab->buckets[i].lock);
+ }

- raw_spin_lock_init(&htab->lock);
atomic_set(&htab->count, 0);

return &htab->map;
@@ -120,11 +126,16 @@ static inline u32 htab_map_hash(const void *key, u32 key_len)
return jhash(key, key_len, 0);
}

-static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
{
return &htab->buckets[hash & (htab->n_buckets - 1)];
}

+static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
+{
+ return &__select_bucket(htab, hash)->head;
+}
+
static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
void *key, u32 key_size)
{
@@ -227,6 +238,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct htab_elem *l_new, *l_old;
struct hlist_head *head;
+ struct bucket *b;
unsigned long flags;
u32 key_size;
int ret;
@@ -248,7 +260,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
memcpy(l_new->key + round_up(key_size, 8), value, map->value_size);

l_new->hash = htab_map_hash(l_new->key, key_size);
- head = select_bucket(htab, l_new->hash);
+ b = __select_bucket(htab, l_new->hash);
+ head = &b->head;

/* bpf_map_update_elem() can be called in_irq() */
raw_spin_lock_irqsave(&htab->lock, flags);

I am a bit confused on this one, though.

The raw spin lock is still per hashtable (htab->lock), and has not been converted in
this patch to the per bucket one (as opposed to what the commit message says), so
this patch seems to go into the right direction, but is a bit incomplete? Shouldn't
the above f.e. take b->lock, etc?

@@ -299,6 +312,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
struct hlist_head *head;
+ struct bucket *b;
struct htab_elem *l;
unsigned long flags;
u32 hash, key_size;
@@ -309,7 +323,8 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key)
key_size = map->key_size;

hash = htab_map_hash(key, key_size);
- head = select_bucket(htab, hash);
+ b = __select_bucket(htab, hash);
+ head = &b->head;

raw_spin_lock_irqsave(&htab->lock, flags);


Same here?

Thanks,
Daniel
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/