[RFC bpf-next 2/3] bpf: implement compare-and-delete (BPF_F_COMPARE) for BPF_MAP_TYPE_HASH
From: Gyutae Bae
Date: Mon Jun 22 2026 - 03:17:16 EST
From: Gyutae Bae <gyutae.bae@xxxxxxxxxxxxx>
map_delete_elem() now honours BPF_F_COMPARE: it copies the expected
bytes, validates the compare window against value_size, and dispatches to a
new ->map_delete_elem_cmp(). The hash-map implementation, under the bucket
lock, memcmps the [off, off+size) window of the live value and unlinks only
on a match (mismatch -EBUSY, absent -ENOENT), making the compare and the
delete atomic w.r.t. every bucket-lock holder. Only BPF_MAP_TYPE_HASH
wires up the op; every other map type lacks it and returns -EOPNOTSUPP.
Atomicity boundary: the compare is read under the bucket lock, so it is
atomic against every other bucket-lock holder (concurrent deletes,
element-replacing updates). It is NOT mutual exclusion against a BPF
program that writes the value in place via the pointer from
bpf_map_lookup_elem(), which takes no bucket lock; against such a writer
the compare may observe a torn value within the locked section. For a
meaningful result the compared region should be a synchronization variable
-- written with a single aligned store such as a monotonic revision, or
replaced via an update -- not a multi-field blob mutated in place.
It compares the raw stored value, so maps whose value carries BTF-managed
fields (bpf_spin_lock, bpf_timer, kptr, ...) are also rejected with
-EOPNOTSUPP: those bytes are sanitised on lookup, so a raw compare would
never match the caller's snapshot and could expose kernel-internal bytes.
BPF_MAP_DELETE_ELEM_LAST_FIELD now reaches compare_size, so the compare*
fields pass CHECK_ATTR even without BPF_F_COMPARE. Reject that combination
with -EINVAL: a dropped flag must not silently turn a compare-and-delete
into an unconditional delete.
Co-developed-by: Minsu Jeon <minsu.jeon@xxxxxxxxxxxxx>
Signed-off-by: Minsu Jeon <minsu.jeon@xxxxxxxxxxxxx>
Co-developed-by: Siwan Kim <siwan.kim@xxxxxxxxxxxxx>
Signed-off-by: Siwan Kim <siwan.kim@xxxxxxxxxxxxx>
Co-developed-by: Jonghyeon Kim <jong-hyeon.kim@xxxxxxxxxxxxx>
Signed-off-by: Jonghyeon Kim <jong-hyeon.kim@xxxxxxxxxxxxx>
Signed-off-by: Gyutae Bae <gyutae.bae@xxxxxxxxxxxxx>
---
include/linux/bpf.h | 2 ++
kernel/bpf/hashtab.c | 39 ++++++++++++++++++++++++++++++++
kernel/bpf/syscall.c | 54 +++++++++++++++++++++++++++++++++++++++++---
3 files changed, 92 insertions(+), 3 deletions(-)
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 7719f6528445..2e858272f7a7 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -110,6 +110,8 @@ struct bpf_map_ops {
void *(*map_lookup_elem)(struct bpf_map *map, void *key);
long (*map_update_elem)(struct bpf_map *map, void *key, void *value, u64 flags);
long (*map_delete_elem)(struct bpf_map *map, void *key);
+ long (*map_delete_elem_cmp)(struct bpf_map *map, void *key,
+ const void *compare, u32 off, u32 size);
long (*map_push_elem)(struct bpf_map *map, void *value, u64 flags);
long (*map_pop_elem)(struct bpf_map *map, void *value);
long (*map_peek_elem)(struct bpf_map *map, void *value);
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 9f394e1aa2e8..8fc3ab938b9e 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -1534,6 +1534,44 @@ static long htab_map_delete_elem(struct bpf_map *map, void *key)
return ret;
}
+static long htab_map_delete_elem_cmp(struct bpf_map *map, void *key,
+ const void *compare, u32 off, u32 size)
+{
+ struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
+ struct hlist_nulls_head *head;
+ struct bucket *b;
+ struct htab_elem *l;
+ unsigned long flags;
+ u32 hash, key_size;
+ int ret;
+
+ WARN_ON_ONCE(!bpf_rcu_lock_held());
+
+ key_size = map->key_size;
+
+ hash = htab_map_hash(key, key_size, htab->hashrnd);
+ b = __select_bucket(htab, hash);
+ head = &b->head;
+
+ ret = htab_lock_bucket(b, &flags);
+ if (ret)
+ return ret;
+
+ l = lookup_elem_raw(head, hash, key, key_size);
+ if (!l)
+ ret = -ENOENT;
+ else if (memcmp(htab_elem_value(l, key_size) + off, compare, size) != 0)
+ ret = -EBUSY;
+ else
+ hlist_nulls_del_rcu(&l->hash_node);
+
+ htab_unlock_bucket(b, flags);
+
+ if (!ret)
+ free_htab_elem(htab, l); /* ret==0 implies we unlinked l */
+ return ret;
+}
+
static long htab_lru_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
@@ -2366,6 +2404,7 @@ const struct bpf_map_ops htab_map_ops = {
.map_lookup_and_delete_elem = htab_map_lookup_and_delete_elem,
.map_update_elem = htab_map_update_elem,
.map_delete_elem = htab_map_delete_elem,
+ .map_delete_elem_cmp = htab_map_delete_elem_cmp,
.map_gen_lookup = htab_map_gen_lookup,
.map_seq_show_elem = htab_map_seq_show_elem,
.map_set_for_each_callback_args = map_set_for_each_callback_args,
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index b44106c8ea75..42fc432f82e1 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -1892,18 +1892,30 @@ static int map_update_elem(union bpf_attr *attr, bpfptr_t uattr)
return err;
}
-#define BPF_MAP_DELETE_ELEM_LAST_FIELD key
+#define BPF_MAP_DELETE_ELEM_LAST_FIELD compare_size
static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)
{
bpfptr_t ukey = make_bpfptr(attr->key, uattr.is_kernel);
struct bpf_map *map;
- void *key;
+ void *key, *compare = NULL;
+ u32 off = 0, size = 0;
int err;
if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
return -EINVAL;
+ if (attr->flags & ~BPF_F_COMPARE)
+ return -EINVAL;
+
+ /* The compare* fields are meaningful only with BPF_F_COMPARE. Reject them
+ * when the flag is absent so a dropped BPF_F_COMPARE cannot silently turn a
+ * compare-and-delete into an unconditional delete.
+ */
+ if (!(attr->flags & BPF_F_COMPARE) &&
+ (attr->compare || attr->compare_offset || attr->compare_size))
+ return -EINVAL;
+
CLASS(fd, f)(attr->map_fd);
map = __bpf_map_get(f);
if (IS_ERR(map))
@@ -1920,6 +1932,38 @@ static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)
goto err_put;
}
+ if (attr->flags & BPF_F_COMPARE) {
+ bpfptr_t ucmp = make_bpfptr(attr->compare, uattr.is_kernel);
+
+ off = attr->compare_offset;
+ size = attr->compare_size ?: map->value_size;
+ /* off + size must fit in value_size, overflow-safe */
+ if (size > map->value_size || off > map->value_size - size) {
+ err = -EINVAL;
+ goto out;
+ }
+ if (!map->ops->map_delete_elem_cmp) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+ /* Compare-and-delete reads the raw stored value. Maps whose value carries
+ * BTF-managed fields (bpf_spin_lock, bpf_timer, kptr, ...) have
+ * those bytes sanitised on lookup, so a raw compare would never
+ * match the caller's snapshot and could expose kernel-internal
+ * bytes. Reject such maps for now.
+ */
+ if (!IS_ERR_OR_NULL(map->record)) {
+ err = -EOPNOTSUPP;
+ goto out;
+ }
+ compare = kvmemdup_bpfptr(ucmp, size);
+ if (IS_ERR(compare)) {
+ err = PTR_ERR(compare);
+ compare = NULL;
+ goto out;
+ }
+ }
+
if (bpf_map_is_offloaded(map)) {
err = bpf_map_offload_delete_elem(map, key);
goto out;
@@ -1932,12 +1976,16 @@ static int map_delete_elem(union bpf_attr *attr, bpfptr_t uattr)
bpf_disable_instrumentation();
rcu_read_lock();
- err = map->ops->map_delete_elem(map, key);
+ if (compare)
+ err = map->ops->map_delete_elem_cmp(map, key, compare, off, size);
+ else
+ err = map->ops->map_delete_elem(map, key);
rcu_read_unlock();
bpf_enable_instrumentation();
if (!err)
maybe_wait_bpf_programs(map);
out:
+ kvfree(compare);
kvfree(key);
err_put:
bpf_map_write_active_dec(map);
--
2.39.5 (Apple Git-154)