Re: [PATCH v3 bpf-next 06/11] bpf: add batch ops to all htab bpf map

From: Yonghong Song
Date: Fri Dec 13 2019 - 13:17:07 EST




On 12/11/19 2:33 PM, Brian Vazquez wrote:
> From: Yonghong Song <yhs@xxxxxx>
>
> htab can't use generic batch support due some problematic behaviours
> inherent to the data structre, i.e. while iterating the bpf map a
> concurrent program might delete the next entry that batch was about to
> use, in that case there's no easy solution to retrieve the next entry,
> the issue has been discussed multiple times (see [1] and [2]).
>
> The only way hmap can be traversed without the problem previously
> exposed is by making sure that the map is traversing entire buckets.
> This commit implements those strict requirements for hmap, the
> implementation follows the same interaction that generic support with
> some exceptions:
>
> - If keys/values buffer are not big enough to traverse a bucket,
> ENOSPC will be returned.
> - out_batch contains the value of the next bucket in the iteration, not
> the next key, but this is transparent for the user since the user
> should never use out_batch for other than bpf batch syscalls.
>
> Note that only lookup and lookup_and_delete batch ops require the hmap
> specific implementation, update/delete batch ops can be the generic
> ones.
>
> [1] https://lore.kernel.org/bpf/20190724165803.87470-1-brianvv@xxxxxxxxxx/
> [2] https://lore.kernel.org/bpf/20190906225434.3635421-1-yhs@xxxxxx/
>
> Signed-off-by: Yonghong Song <yhs@xxxxxx>
> Signed-off-by: Brian Vazquez <brianvv@xxxxxxxxxx>
> ---
> kernel/bpf/hashtab.c | 242 +++++++++++++++++++++++++++++++++++++++++++
> 1 file changed, 242 insertions(+)
>
> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> index 22066a62c8c97..fac107bdaf9ec 100644
> --- a/kernel/bpf/hashtab.c
> +++ b/kernel/bpf/hashtab.c
> @@ -17,6 +17,17 @@
> (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE | \
> BPF_F_ACCESS_MASK | BPF_F_ZERO_SEED)
>
> +#define BATCH_OPS(_name) \
> + .map_lookup_batch = \
> + _name##_map_lookup_batch, \
> + .map_lookup_and_delete_batch = \
> + _name##_map_lookup_and_delete_batch, \
> + .map_update_batch = \
> + generic_map_update_batch, \
> + .map_delete_batch = \
> + generic_map_delete_batch
> +
> +
> struct bucket {
> struct hlist_nulls_head head;
> raw_spinlock_t lock;
> @@ -1232,6 +1243,233 @@ static void htab_map_seq_show_elem(struct bpf_map *map, void *key,
> rcu_read_unlock();
> }
>
> +static int
> +__htab_map_lookup_and_delete_batch(struct bpf_map *map,
> + const union bpf_attr *attr,
> + union bpf_attr __user *uattr,
> + bool do_delete, bool is_lru_map,
> + bool is_percpu)
> +{
> + struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
> + u32 bucket_cnt, total, key_size, value_size, roundup_key_size;
> + void *keys = NULL, *values = NULL, *value, *dst_key, *dst_val;
> + void __user *uvalues = u64_to_user_ptr(attr->batch.values);
> + void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
> + void *ubatch = u64_to_user_ptr(attr->batch.in_batch);
> + u64 elem_map_flags, map_flags;
> + struct hlist_nulls_head *head;
> + u32 batch, max_count, size;
> + struct hlist_nulls_node *n;
> + unsigned long flags;
> + struct htab_elem *l;
> + struct bucket *b;
> + int ret = 0;
> +
> + max_count = attr->batch.count;
> + if (!max_count)
> + return 0;

In all previous implementations, we did the sanity checks on the flags
first and then checked max_count. To be consistent, this max_count check
should be moved after the map_flags check.

> +
> + elem_map_flags = attr->batch.elem_flags;
> + if ((elem_map_flags & ~BPF_F_LOCK) ||
> + ((elem_map_flags & BPF_F_LOCK) && !map_value_has_spin_lock(map)))
> + return -EINVAL;
> +
> + map_flags = attr->batch.flags;
> + if (map_flags)
> + return -EINVAL;
> +
> + batch = 0;
> + if (ubatch && copy_from_user(&batch, ubatch, sizeof(batch)))
> + return -EFAULT;
> +
> + if (batch >= htab->n_buckets)
> + return -ENOENT;
> +
> + /* We cannot do copy_from_user or copy_to_user inside
> + * the rcu_read_lock. Allocate enough space here.
> + */
> + key_size = htab->map.key_size;
> + roundup_key_size = round_up(htab->map.key_size, 8);
> + value_size = htab->map.value_size;
> + size = round_up(value_size, 8);
> + if (is_percpu)
> + value_size = size * num_possible_cpus();
> + keys = kvmalloc(key_size, GFP_USER | __GFP_NOWARN);
> + values = kvmalloc(value_size, GFP_USER | __GFP_NOWARN);
> + if (!keys || !values) {
> + ret = -ENOMEM;
> + goto out;
> + }
> +
> + dst_key = keys;
> + dst_val = values;
> + total = 0;
> +
> + preempt_disable();
> + this_cpu_inc(bpf_prog_active);
> + rcu_read_lock();
> +
> +again:
> + b = &htab->buckets[batch];
> + head = &b->head;
> + raw_spin_lock_irqsave(&b->lock, flags);
> +
> + bucket_cnt = 0;
> + hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
> + bucket_cnt++;
> +
> + if (bucket_cnt > (max_count - total)) {
> + if (total == 0)
> + ret = -ENOSPC;
> + goto after_loop;
> + }
> +
> + hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
> + memcpy(dst_key, l->key, key_size);
> +
> + if (is_percpu) {
> + int off = 0, cpu;
> + void __percpu *pptr;
> +
> + pptr = htab_elem_get_ptr(l, map->key_size);
> + for_each_possible_cpu(cpu) {
> + bpf_long_memcpy(dst_val + off,
> + per_cpu_ptr(pptr, cpu), size);
> + off += size;
> + }
> + } else {
> + value = l->key + roundup_key_size;
> + if (elem_map_flags & BPF_F_LOCK)
> + copy_map_value_locked(map, dst_val, value,
> + true);
> + else
> + copy_map_value(map, dst_val, value);
> + check_and_init_map_lock(map, dst_val);
> + }
> + if (do_delete) {
> + hlist_nulls_del_rcu(&l->hash_node);
> + if (is_lru_map)
> + bpf_lru_push_free(&htab->lru, &l->lru_node);
> + else
> + free_htab_elem(htab, l);
> + }
> + if (copy_to_user(ukeys + total * key_size, keys, key_size) ||
> + copy_to_user(uvalues + total * value_size, values,
> + value_size)) {
> + ret = -EFAULT;
> + goto after_loop;
> + }
> + total++;
> + }
> +
> + batch++;
> + if (batch >= htab->n_buckets) {
> + ret = -ENOENT;
> + goto after_loop;
> + }
> +
> + raw_spin_unlock_irqrestore(&b->lock, flags);
> + goto again;
> +
> +after_loop:
> + raw_spin_unlock_irqrestore(&b->lock, flags);
> +
> + rcu_read_unlock();
> + this_cpu_dec(bpf_prog_active);
> + preempt_enable();
> +

When reaching here, only the following values are possible
for 'ret':
0 <=== everything is okay, still have some buckets left
-ENOENT <=== everything is okay, the user buffer is not completely
filled because we reached the end of the hash table
-EFAULT <=== fault while copying data to user space
-ENOSPC <=== not enough buffer space to copy the 'batch'.

> + if (ret && ret != -ENOENT)
> + goto out;

Maybe we should do:
0 <=== continue to copy data back to user
-ENOSPC <=== continue to copy data back to user;
the user needs to check for the -ENOSPC error code
-EFAULT <=== we can do the copy below or go to 'out',
since -EFAULT will be returned anyway
-ENOENT <=== we reached the end, so we actually have
no error. We should still copy data
back to user. The user can check for -ENOENT
to detect the end of the traversal, similar to
get_next_key().

Does this make sense?

> +
> + /* copy data back to user */
> + ubatch = u64_to_user_ptr(attr->batch.out_batch);
> + if (copy_to_user(ubatch, &batch, sizeof(batch)) ||
> + put_user(total, &uattr->batch.count))
> + ret = -EFAULT;
> +
> +out:
> + kvfree(keys);
> + kvfree(values);
> + return ret;
> +}
> +
[...]