[PATCH RFC bpf-next v2 01/11] Sleepable timers

From: Benjamin Tissoires
Date: Tue Feb 13 2024 - 12:40:01 EST


---
include/linux/bpf_verifier.h | 2 +
include/uapi/linux/bpf.h | 13 ++++++
kernel/bpf/helpers.c | 91 +++++++++++++++++++++++++++++++++++++++---
kernel/bpf/verifier.c | 20 ++++++++--
tools/include/uapi/linux/bpf.h | 13 ++++++
5 files changed, 130 insertions(+), 9 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index 84365e6dd85d..789ef5fec547 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -426,6 +426,7 @@ struct bpf_verifier_state {
* while they are still in use.
*/
bool used_as_loop_entry;
+ bool in_sleepable;

/* first and last insn idx of this verifier state */
u32 first_insn_idx;
@@ -626,6 +627,7 @@ struct bpf_subprog_info {
bool is_async_cb: 1;
bool is_exception_cb: 1;
bool args_cached: 1;
+ bool is_sleepable: 1;

u8 arg_cnt;
struct bpf_subprog_arg_info args[MAX_BPF_FUNC_REG_ARGS];
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index d96708380e52..ef1f2be4cfef 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -5742,6 +5742,18 @@ union bpf_attr {
* 0 on success.
*
* **-ENOENT** if the bpf_local_storage cannot be found.
+ *
+ * long bpf_timer_set_sleepable_cb(struct bpf_timer *timer, void *callback_fn)
+ * Description
+ * Configure the timer to call *callback_fn* static function in a
+ * sleepable context.
+ * Return
+ * 0 on success.
+ * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier.
+ * **-EPERM** if *timer* is in a map that doesn't have any user references.
+ * The user space should either hold a file descriptor to a map with timers
+ * or pin such map in bpffs. When map is unpinned or file descriptor is
+ * closed all timers in the map will be cancelled and freed.
*/
#define ___BPF_FUNC_MAPPER(FN, ctx...) \
FN(unspec, 0, ##ctx) \
@@ -5956,6 +5968,7 @@ union bpf_attr {
FN(user_ringbuf_drain, 209, ##ctx) \
FN(cgrp_storage_get, 210, ##ctx) \
FN(cgrp_storage_delete, 211, ##ctx) \
+ FN(timer_set_sleepable_cb, 212, ##ctx) \
/* */

/* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 4db1c658254c..e3b83d27b1b6 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -1097,9 +1097,11 @@ const struct bpf_func_proto bpf_snprintf_proto = {
*/
struct bpf_hrtimer {
struct hrtimer timer;
+ struct work_struct work;
struct bpf_map *map;
struct bpf_prog *prog;
void __rcu *callback_fn;
+ void __rcu *sleepable_cb_fn;
void *value;
};

@@ -1113,18 +1115,64 @@ struct bpf_timer_kern {
struct bpf_spin_lock lock;
} __attribute__((aligned(8)));

+static void bpf_timer_work_cb(struct work_struct *work)
+{
+ struct bpf_hrtimer *t = container_of(work, struct bpf_hrtimer, work);
+ struct bpf_map *map = t->map;
+ void *value = t->value;
+ bpf_callback_t callback_fn;
+ void *key;
+ u32 idx;
+
+ BTF_TYPE_EMIT(struct bpf_timer);
+
+ rcu_read_lock();
+ callback_fn = rcu_dereference(t->sleepable_cb_fn);
+ rcu_read_unlock();
+ if (!callback_fn)
+ return;
+
+ // /* bpf_timer_work_cb() runs in hrtimer_run_softirq. It doesn't migrate and
+ // * cannot be preempted by another bpf_timer_cb() on the same cpu.
+ // * Remember the timer this callback is servicing to prevent
+ // * deadlock if callback_fn() calls bpf_timer_cancel() or
+ // * bpf_map_delete_elem() on the same timer.
+ // */
+ // this_cpu_write(hrtimer_running, t);
+ if (map->map_type == BPF_MAP_TYPE_ARRAY) {
+ struct bpf_array *array = container_of(map, struct bpf_array, map);
+
+ /* compute the key */
+ idx = ((char *)value - array->value) / array->elem_size;
+ key = &idx;
+ } else { /* hash or lru */
+ key = value - round_up(map->key_size, 8);
+ }
+
+ callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0);
+ /* The verifier checked that return value is zero. */
+
+ // this_cpu_write(hrtimer_running, NULL);
+}
+
static DEFINE_PER_CPU(struct bpf_hrtimer *, hrtimer_running);

static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
{
struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer);
+ bpf_callback_t callback_fn, sleepable_cb_fn;
struct bpf_map *map = t->map;
void *value = t->value;
- bpf_callback_t callback_fn;
void *key;
u32 idx;

BTF_TYPE_EMIT(struct bpf_timer);
+ sleepable_cb_fn = rcu_dereference_check(t->sleepable_cb_fn, rcu_read_lock_bh_held());
+ if (sleepable_cb_fn) {
+ schedule_work(&t->work);
+ goto out;
+ }
+
callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held());
if (!callback_fn)
goto out;
@@ -1190,7 +1238,9 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map
t->map = map;
t->prog = NULL;
rcu_assign_pointer(t->callback_fn, NULL);
+ rcu_assign_pointer(t->sleepable_cb_fn, NULL);
hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
+ INIT_WORK(&t->work, bpf_timer_work_cb);
t->timer.function = bpf_timer_cb;
WRITE_ONCE(timer->timer, t);
/* Guarantee the order between timer->timer and map->usercnt. So
@@ -1221,8 +1271,8 @@ static const struct bpf_func_proto bpf_timer_init_proto = {
.arg3_type = ARG_ANYTHING,
};

-BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callback_fn,
- struct bpf_prog_aux *, aux)
+static int __bpf_timer_set_callback(struct bpf_timer_kern *timer, void *callback_fn,
+ struct bpf_prog_aux *aux, bool is_sleepable)
{
struct bpf_prog *prev, *prog = aux->prog;
struct bpf_hrtimer *t;
@@ -1260,12 +1310,24 @@ BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callb
bpf_prog_put(prev);
t->prog = prog;
}
- rcu_assign_pointer(t->callback_fn, callback_fn);
+ if (is_sleepable) {
+ rcu_assign_pointer(t->sleepable_cb_fn, callback_fn);
+ rcu_assign_pointer(t->callback_fn, NULL);
+ } else {
+ rcu_assign_pointer(t->callback_fn, callback_fn);
+ rcu_assign_pointer(t->sleepable_cb_fn, NULL);
+ }
out:
__bpf_spin_unlock_irqrestore(&timer->lock);
return ret;
}

+BPF_CALL_3(bpf_timer_set_callback, struct bpf_timer_kern *, timer, void *, callback_fn,
+ struct bpf_prog_aux *, aux)
+{
+ return __bpf_timer_set_callback(timer, callback_fn, aux, false);
+}
+
static const struct bpf_func_proto bpf_timer_set_callback_proto = {
.func = bpf_timer_set_callback,
.gpl_only = true,
@@ -1274,6 +1336,20 @@ static const struct bpf_func_proto bpf_timer_set_callback_proto = {
.arg2_type = ARG_PTR_TO_FUNC,
};

+BPF_CALL_3(bpf_timer_set_sleepable_cb, struct bpf_timer_kern *, timer, void *, callback_fn,
+ struct bpf_prog_aux *, aux)
+{
+ return __bpf_timer_set_callback(timer, callback_fn, aux, true);
+}
+
+static const struct bpf_func_proto bpf_timer_set_sleepable_cb_proto = {
+ .func = bpf_timer_set_sleepable_cb,
+ .gpl_only = true,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_TIMER,
+ .arg2_type = ARG_PTR_TO_FUNC,
+};
+
BPF_CALL_3(bpf_timer_start, struct bpf_timer_kern *, timer, u64, nsecs, u64, flags)
{
struct bpf_hrtimer *t;
@@ -1353,6 +1429,7 @@ BPF_CALL_1(bpf_timer_cancel, struct bpf_timer_kern *, timer)
* if it was running.
*/
ret = ret ?: hrtimer_cancel(&t->timer);
+ ret = ret ?: cancel_work_sync(&t->work);
return ret;
}

@@ -1405,8 +1482,10 @@ void bpf_timer_cancel_and_free(void *val)
* effectively cancelled because bpf_timer_cb() will return
* HRTIMER_NORESTART.
*/
- if (this_cpu_read(hrtimer_running) != t)
+ if (this_cpu_read(hrtimer_running) != t) {
hrtimer_cancel(&t->timer);
+ }
+ cancel_work_sync(&t->work);
kfree(t);
}

@@ -1749,6 +1828,8 @@ bpf_base_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_timer_init_proto;
case BPF_FUNC_timer_set_callback:
return &bpf_timer_set_callback_proto;
+ case BPF_FUNC_timer_set_sleepable_cb:
+ return &bpf_timer_set_sleepable_cb_proto;
case BPF_FUNC_timer_start:
return &bpf_timer_start_proto;
case BPF_FUNC_timer_cancel:
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 64fa188d00ad..400c625efe22 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -513,7 +513,8 @@ static bool is_sync_callback_calling_function(enum bpf_func_id func_id)

static bool is_async_callback_calling_function(enum bpf_func_id func_id)
{
- return func_id == BPF_FUNC_timer_set_callback;
+ return func_id == BPF_FUNC_timer_set_callback ||
+ func_id == BPF_FUNC_timer_set_sleepable_cb;
}

static bool is_callback_calling_function(enum bpf_func_id func_id)
@@ -1414,6 +1415,7 @@ static int copy_verifier_state(struct bpf_verifier_state *dst_state,
}
dst_state->speculative = src->speculative;
dst_state->active_rcu_lock = src->active_rcu_lock;
+ dst_state->in_sleepable = src->in_sleepable;
dst_state->curframe = src->curframe;
dst_state->active_lock.ptr = src->active_lock.ptr;
dst_state->active_lock.id = src->active_lock.id;
@@ -9434,11 +9436,13 @@ static int push_callback_call(struct bpf_verifier_env *env, struct bpf_insn *ins

if (insn->code == (BPF_JMP | BPF_CALL) &&
insn->src_reg == 0 &&
- insn->imm == BPF_FUNC_timer_set_callback) {
+ (insn->imm == BPF_FUNC_timer_set_callback ||
+ insn->imm == BPF_FUNC_timer_set_sleepable_cb)) {
struct bpf_verifier_state *async_cb;

/* there is no real recursion here. timer callbacks are async */
env->subprog_info[subprog].is_async_cb = true;
+ env->subprog_info[subprog].is_sleepable = insn->imm == BPF_FUNC_timer_set_sleepable_cb;
async_cb = push_async_cb(env, env->subprog_info[subprog].start,
insn_idx, subprog);
if (!async_cb)
@@ -10280,6 +10284,8 @@ static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn
set_map_elem_callback_state);
break;
case BPF_FUNC_timer_set_callback:
+ fallthrough;
+ case BPF_FUNC_timer_set_sleepable_cb:
err = push_callback_call(env, insn, insn_idx, meta.subprogno,
set_timer_callback_state);
break;
@@ -15586,7 +15592,9 @@ static int visit_insn(int t, struct bpf_verifier_env *env)
return DONE_EXPLORING;

case BPF_CALL:
- if (insn->src_reg == 0 && insn->imm == BPF_FUNC_timer_set_callback)
+ if (insn->src_reg == 0 &&
+ (insn->imm == BPF_FUNC_timer_set_callback ||
+ insn->imm == BPF_FUNC_timer_set_sleepable_cb))
/* Mark this call insn as a prune point to trigger
* is_state_visited() check before call itself is
* processed by __check_func_call(). Otherwise new
@@ -16762,6 +16770,9 @@ static bool states_equal(struct bpf_verifier_env *env,
if (old->active_rcu_lock != cur->active_rcu_lock)
return false;

+ if (old->in_sleepable != cur->in_sleepable)
+ return false;
+
/* for states to be equal callsites have to be the same
* and all frame states need to be equivalent
*/
@@ -19639,7 +19650,8 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
continue;
}

- if (insn->imm == BPF_FUNC_timer_set_callback) {
+ if (insn->imm == BPF_FUNC_timer_set_callback ||
+ insn->imm == BPF_FUNC_timer_set_sleepable_cb) {
/* The verifier will process callback_fn as many times as necessary
* with different maps and the register states prepared by
* set_timer_callback_state will be accurate.
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index d96708380e52..ef1f2be4cfef 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -5742,6 +5742,18 @@ union bpf_attr {
* 0 on success.
*
* **-ENOENT** if the bpf_local_storage cannot be found.
+ *
+ * long bpf_timer_set_sleepable_cb(struct bpf_timer *timer, void *callback_fn)
+ * Description
+ * Configure the timer to call *callback_fn* static function in a
+ * sleepable context.
+ * Return
+ * 0 on success.
+ * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier.
+ * **-EPERM** if *timer* is in a map that doesn't have any user references.
+ * The user space should either hold a file descriptor to a map with timers
+ * or pin such map in bpffs. When map is unpinned or file descriptor is
+ * closed all timers in the map will be cancelled and freed.
*/
#define ___BPF_FUNC_MAPPER(FN, ctx...) \
FN(unspec, 0, ##ctx) \
@@ -5956,6 +5968,7 @@ union bpf_attr {
FN(user_ringbuf_drain, 209, ##ctx) \
FN(cgrp_storage_get, 210, ##ctx) \
FN(cgrp_storage_delete, 211, ##ctx) \
+ FN(timer_set_sleepable_cb, 212, ##ctx) \
/* */

/* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't

--
2.43.0

---