Re: [PATCH bpf-next 2/6] bpf: add BPF_MAP_DUMP command to dump more than one entry per call

From: Brian Vazquez
Date: Wed Jul 24 2019 - 18:44:57 EST


On Wed, Jul 24, 2019 at 2:40 PM Song Liu <liu.song.a23@xxxxxxxxx> wrote:
>
> On Wed, Jul 24, 2019 at 10:10 AM Brian Vazquez <brianvv@xxxxxxxxxx> wrote:
> >
> > This introduces a new command to retrieve multiple number of entries
> > from a bpf map, wrapping the existing bpf methods:
> > map_get_next_key and map_lookup_elem
> >
> > To start dumping the map from the beginning you must specify NULL as
> > the prev_key.
> >
> > The new API returns 0 when it successfully copied all the elements
> > requested or it copied less because there weren't more elements to
> > retrieved (i.e err == -ENOENT). In last scenario err will be masked to 0.
> >
> > On a successful call buf and buf_len will contain correct data and in
> > case prev_key was provided (not for the first walk, since prev_key is
> > NULL) it will contain the last_key copied into the prev_key which will
> > simplify next call.
> >
> > Only when it can't find a single element it will return -ENOENT meaning
> > that the map has been entirely walked. When an error is return buf,
> > buf_len and prev_key shouldn't be read nor used.
> >
> > Because maps can be called from userspace and kernel code, this function
> > can have a scenario where the next_key was found but by the time we
> > try to retrieve the value the element is not there, in this case the
> > function continues and tries to get a new next_key value, skipping the
> > deleted key. If at some point the function find itself trap in a loop,
> > it will return -EINTR.
> >
> > The function will try to fit as much as possible in the buf provided and
> > will return -EINVAL if buf_len is smaller than elem_size.
> >
> > QUEUE and STACK maps are not supported.
> >
> > Note that map_dump doesn't guarantee that reading the entire table is
> > consistent since this function is always racing with kernel and user code
> > but the same behaviour is found when the entire table is walked using
> > the current interfaces: map_get_next_key + map_lookup_elem.
> > It is also important to note that with a locked map, the lock is grabbed
> > for 1 entry at the time, meaning that the returned buf might or might not
> > be consistent.
> >
> > Suggested-by: Stanislav Fomichev <sdf@xxxxxxxxxx>
> > Signed-off-by: Brian Vazquez <brianvv@xxxxxxxxxx>
> > ---
> > include/uapi/linux/bpf.h | 9 +++
> > kernel/bpf/syscall.c | 117 +++++++++++++++++++++++++++++++++++++++
> > 2 files changed, 126 insertions(+)
> >
> > diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
> > index fa1c753dcdbc7..66dab5385170d 100644
> > --- a/include/uapi/linux/bpf.h
> > +++ b/include/uapi/linux/bpf.h
> > @@ -106,6 +106,7 @@ enum bpf_cmd {
> > BPF_TASK_FD_QUERY,
> > BPF_MAP_LOOKUP_AND_DELETE_ELEM,
> > BPF_MAP_FREEZE,
> > + BPF_MAP_DUMP,
> > };
> >
> > enum bpf_map_type {
> > @@ -388,6 +389,14 @@ union bpf_attr {
> > __u64 flags;
> > };
> >
> > + struct { /* struct used by BPF_MAP_DUMP command */
> > + __aligned_u64 prev_key;
> > + __aligned_u64 buf;
> > + __aligned_u64 buf_len; /* input/output: len of buf */
> > + __u64 flags;
>
> Please add explanation of flags here.

got it!

> Also, we need to update the
> comments of BPF_F_LOCK for BPF_MAP_DUMP.

What do you mean? I didn't get this part.

>
> > + __u32 map_fd;
> > + } dump;
> > +
> > struct { /* anonymous struct used by BPF_PROG_LOAD command */
> > __u32 prog_type; /* one of enum bpf_prog_type */
> > __u32 insn_cnt;
> > diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
> > index 86cdc2f7bb56e..0c35505aa219f 100644
> > --- a/kernel/bpf/syscall.c
> > +++ b/kernel/bpf/syscall.c
> > @@ -1097,6 +1097,120 @@ static int map_get_next_key(union bpf_attr *attr)
> > return err;
> > }
> >
> > +/* last field in 'union bpf_attr' used by this command */
> > +#define BPF_MAP_DUMP_LAST_FIELD dump.map_fd
> > +
> > +static int map_dump(union bpf_attr *attr)
> > +{
> > + void __user *ukey = u64_to_user_ptr(attr->dump.prev_key);
> > + void __user *ubuf = u64_to_user_ptr(attr->dump.buf);
> > + u32 __user *ubuf_len = u64_to_user_ptr(attr->dump.buf_len);
> > + int ufd = attr->dump.map_fd;
> > + struct bpf_map *map;
> > + void *buf, *prev_key, *key, *value;
> > + u32 value_size, elem_size, buf_len, cp_len;
> > + struct fd f;
> > + int err;
> > + bool first_key = false;
> > +
> > + if (CHECK_ATTR(BPF_MAP_DUMP))
> > + return -EINVAL;
> > +
> > + if (attr->dump.flags & ~BPF_F_LOCK)
> > + return -EINVAL;
> > +
> > + f = fdget(ufd);
> > + map = __bpf_map_get(f);
> > + if (IS_ERR(map))
> > + return PTR_ERR(map);
> > + if (!(map_get_sys_perms(map, f) & FMODE_CAN_READ)) {
> > + err = -EPERM;
> > + goto err_put;
> > + }
> > +
> > + if ((attr->dump.flags & BPF_F_LOCK) &&
> > + !map_value_has_spin_lock(map)) {
> > + err = -EINVAL;
> > + goto err_put;
> > + }
>
> We can share these lines with map_lookup_elem(). Maybe
> add another helper function?

Which are the lines you are referring to? the dump.flags? It makes
sense so that way when a new flag is added you only need to modify
them in one spot.

> > +
> > + if (map->map_type == BPF_MAP_TYPE_QUEUE ||
> > + map->map_type == BPF_MAP_TYPE_STACK) {
> > + err = -ENOTSUPP;
> > + goto err_put;
> > + }
> > +
> > + value_size = bpf_map_value_size(map);
> > +
> > + err = get_user(buf_len, ubuf_len);
> > + if (err)
> > + goto err_put;
> > +
> > + elem_size = map->key_size + value_size;
> > + if (buf_len < elem_size) {
> > + err = -EINVAL;
> > + goto err_put;
> > + }
> > +
> > + if (ukey) {
> > + prev_key = __bpf_copy_key(ukey, map->key_size);
> > + if (IS_ERR(prev_key)) {
> > + err = PTR_ERR(prev_key);
> > + goto err_put;
> > + }
> > + } else {
> > + prev_key = NULL;
> > + first_key = true;
> > + }
> > +
> > + err = -ENOMEM;
> > + buf = kmalloc(elem_size, GFP_USER | __GFP_NOWARN);
> > + if (!buf)
> > + goto err_put;
> > +
> > + key = buf;
> > + value = key + map->key_size;
> > + for (cp_len = 0; cp_len + elem_size <= buf_len;) {
> > + if (signal_pending(current)) {
> > + err = -EINTR;
> > + break;
> > + }
> > +
> > + rcu_read_lock();
> > + err = map->ops->map_get_next_key(map, prev_key, key);
>
> If prev_key is deleted before map_get_next_key(), we get the first key
> again. This is pretty weird.

Yes, I know. But note that the current scenario happens even for the
old interface (imagine you are walking a map from userspace and you
tried get_next_key the prev_key was removed, you will start again from
the beginning without noticing it).
I tried to sent a patch in the past but I was missing some context:
before NULL was used to get the very first_key the interface relied in
a random (non existent) key to retrieve the first_key in the map, and
I was told what we still have to support that scenario.

>
> > + rcu_read_unlock();
> > +
> > + if (err)
> > + break;
> > +
> > + err = bpf_map_copy_value(map, key, value, attr->dump.flags);
> > +
> > + if (err == -ENOENT)
> > + continue;
> > + if (err)
> > + goto free_buf;
> > +
> > + if (copy_to_user(ubuf + cp_len, buf, elem_size)) {
> > + err = -EFAULT;
> > + goto free_buf;
> > + }
> > +
> > + prev_key = key;
> > + cp_len += elem_size;
> > + }
> > +
> > + if (err == -ENOENT && cp_len)
> > + err = 0;
> > + if (!err && (copy_to_user(ubuf_len, &cp_len, sizeof(cp_len)) ||
> > + (!first_key && copy_to_user(ukey, key, map->key_size))))
> > + err = -EFAULT;
> > +free_buf:
> > + kfree(buf);
> > +err_put:
> > + fdput(f);
> > + return err;
> > +}
> > +
> > #define BPF_MAP_LOOKUP_AND_DELETE_ELEM_LAST_FIELD value
> >
> > static int map_lookup_and_delete_elem(union bpf_attr *attr)
> > @@ -2910,6 +3024,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
> > case BPF_MAP_LOOKUP_AND_DELETE_ELEM:
> > err = map_lookup_and_delete_elem(&attr);
> > break;
> > + case BPF_MAP_DUMP:
> > + err = map_dump(&attr);
> > + break;
> > default:
> > err = -EINVAL;
> > break;
> > --
> > 2.22.0.657.g960e92d24f-goog
> >

Thanks for reviewing it!!