Re: [PATCH v2 2/2] tools/sched_ext: scx_sdt: Fix BPF verifier rejection on older LLVMs

From: Emil Tsalapatis

Date: Mon Mar 09 2026 - 12:40:20 EST


On Sun Mar 8, 2026 at 10:28 PM EDT, Zhao Mengmeng wrote:
> From: Zhao Mengmeng <zhaomengmeng@xxxxxxxxxx>
>
> Under Clang 17/18, running the scx_sdt scheduler fails with:
>
> libbpf: prog 'sdt_init_task': BPF program load failed: -EACCES
> libbpf: prog 'sdt_init_task': -- BEGIN PROG LOAD LOG --
> ...
> ; desc = desc_find_empty(alloc->root, &idx); @ scx_sdt.bpf.c:479
> 43: (79) r8 = *(u64 *)(r6 +32) ; frame1: R6=map_value(map=scx_sdt.bss,ks=4,vs=200,off=120) R8=scalar()
> ; for (level = zero; level < SDT_TASK_LEVELS && can_loop; level++) { @ scx_sdt.bpf.c:407
> 44: (e5) may_goto pc+51
> ; idx |= pos; @ scx_sdt.bpf.c:418
> 96: (bf) r7 = r2 ; frame1: R2=0 R7=0
> 97: (bf) r1 = r10 ; frame1: R1=fp0 R10=fp0
> ; @ scx_sdt.bpf.c:0
> 98: (07) r1 += -56 ; frame1: R1=fp-56
> ; bpf_for(u, 0, SDT_TASK_LEVELS) { @ scx_sdt.bpf.c:447
> 99: (b4) w2 = 0 ; frame1: R2=0
> 100: (b4) w3 = 3 ; frame1: R3=3
> 101: (85) call bpf_iter_num_new#82234 ; frame1: R0=scalar() fp-56=iter_num(ref_id=2,state=active,depth=0)
> 102: (18) r9 = 0x1ffffffffffffff8 ; frame1: R9=0x1ffffffffffffff8
> 104: (bf) r1 = r10 ; frame1: R1=fp0 R10=fp0
> ; @ scx_sdt.bpf.c:0
> 105: (07) r1 += -56 ; frame1: R1=fp-56
> ; bpf_for(u, 0, SDT_TASK_LEVELS) { @ scx_sdt.bpf.c:447
> 106: (85) call bpf_iter_num_next#82235 ; frame1: R0=0 fp-56=iter_num(ref_id=2,state=drained,depth=0)
> 107: (15) if r0 == 0x0 goto pc+29 ; frame1: R0=0
> ; if (tmp->nr_free > 0) @ scx_sdt.bpf.c:456
> 137: (bf) r1 = r10 ; frame1: R1=fp0 R10=fp0
> ; bpf_for(u, 0, SDT_TASK_LEVELS) { @ scx_sdt.bpf.c:447
> 138: (07) r1 += -56 ; frame1: R1=fp-56
> 139: (85) call bpf_iter_num_destroy#82232 ; frame1:
> 140: (b7) r9 = 0 ; frame1: R9=0
> ; if (unlikely(desc == NULL)) { @ scx_sdt.bpf.c:480
> 141: (15) if r8 == 0x0 goto pc+15 ; frame1: R8=scalar(umin=1)
> ; chunk = desc->chunk; @ scx_sdt.bpf.c:485
> 142: (79) r4 = *(u64 *)(r8 +72)
> R8 invalid mem access 'scalar'
>
> The reason is that these older compilers lack native support for
> __BPF_FEATURE_ADDR_SPACE_CAST, so the __arena macro is defined as empty.
>
> Fix it by adding cast_kern when dereferencing variables with the __arena tag.
>

I am not sure if we want to support older Clang versions at this point.
This issue is fixed for Clang 19, and adding the macros back in makes it
confusing for those who use the code as a starting point. And while
it would be nice to support older Clang versions, we already don't
handle Clang 15/16, which don't have arena support. So it's not
unreasonable if we say Clang 17/18 are also incompatible with this
example.

On the other hand, maybe the extra compatibility is worth re-adding
cast_kern/cast_user to the code. I am slightly in favor of keeping it
as-is to avoid churn, but can easily see why we'd go the other way.

@htejun WDYT?

> Signed-off-by: Zhao Mengmeng <zhaomengmeng@xxxxxxxxxx>
> ---
> tools/sched_ext/scx_sdt.bpf.c | 20 +++++++++++++++++++-
> 1 file changed, 19 insertions(+), 1 deletion(-)
>
> diff --git a/tools/sched_ext/scx_sdt.bpf.c b/tools/sched_ext/scx_sdt.bpf.c
> index 31b09958e8d5..caacc55bd7a5 100644
> --- a/tools/sched_ext/scx_sdt.bpf.c
> +++ b/tools/sched_ext/scx_sdt.bpf.c
> @@ -148,6 +148,7 @@ static sdt_desc_t *scx_alloc_chunk(void)
>
> out = desc;
>
> + cast_kern(desc);
> desc->nr_free = SDT_TASK_ENTS_PER_CHUNK;
> desc->chunk = chunk;
>
> @@ -244,6 +245,7 @@ int mark_nodes_avail(sdt_desc_t *lv_desc[SDT_TASK_LEVELS], __u64 lv_pos[SDT_TASK
> /* Only propagate upwards if we are the parent's only free chunk. */
> desc = lv_desc[level];
>
> + cast_kern(desc);
> ret = set_idx_state(desc, lv_pos[level], false);
> if (unlikely(ret != 0))
> return ret;
> @@ -298,20 +300,26 @@ int scx_alloc_free_idx(struct scx_allocator *alloc, __u64 idx)
> if (level == SDT_TASK_LEVELS - 1)
> break;
>
> + cast_kern(desc);
> chunk = desc->chunk;
>
> + cast_kern(chunk);
> desc_children = (sdt_desc_t * __arena *)chunk->descs;
> + cast_kern(desc_children);
> desc = desc_children[pos];
>
> if (unlikely(!desc))
> return -EINVAL;
> }
>
> + cast_kern(desc);
> chunk = desc->chunk;
>
> pos = idx & mask;
> + cast_kern(chunk);
> data = chunk->data[pos];
> if (likely(data)) {
> + cast_kern(data);
> *data = (struct sdt_data) {
> .tid.genn = data->tid.genn + 1,
> };
> @@ -378,6 +386,7 @@ __u64 chunk_find_empty(sdt_desc_t __arg_arena *desc)
> __u64 freeslots;
> __u64 i;
>
> + cast_kern(desc);
> for (i = 0; i < SDT_TASK_CHUNK_BITMAP_U64S; i++) {
> freeslots = ~desc->allocated[i];
> if (freeslots == (__u64)0)
> @@ -426,9 +435,12 @@ static sdt_desc_t * desc_find_empty(sdt_desc_t *desc, __u64 *idxp)
> break;
>
> /* Allocate an internal node if necessary. */
> + cast_kern(desc);
> chunk = desc->chunk;
> + cast_kern(chunk);
> desc_children = (sdt_desc_t * __arena *)chunk->descs;
>
> + cast_kern(desc_children);
> desc = desc_children[pos];
> if (!desc) {
> desc = scx_alloc_chunk();
> @@ -448,6 +460,7 @@ static sdt_desc_t * desc_find_empty(sdt_desc_t *desc, __u64 *idxp)
> level = SDT_TASK_LEVELS - 1 - u;
> tmp = lv_desc[level];
>
> + cast_kern(tmp);
> ret = set_idx_state(tmp, lv_pos[level], true);
> if (ret != 0)
> break;
> @@ -482,10 +495,12 @@ void __arena *scx_alloc(struct scx_allocator *alloc)
> return NULL;
> }
>
> + cast_kern(desc);
> chunk = desc->chunk;
>
> /* Populate the leaf node if necessary. */
> pos = idx & (SDT_TASK_ENTS_PER_CHUNK - 1);
> + cast_kern(chunk);
> data = chunk->data[pos];
> if (!data) {
> data = scx_alloc_from_pool(&alloc->pool);
> @@ -503,10 +518,12 @@ void __arena *scx_alloc(struct scx_allocator *alloc)
> alloc_stats.alloc_ops += 1;
> alloc_stats.active_allocs += 1;
>
> + cast_kern(data);
> data->tid.idx = idx;
>
> bpf_spin_unlock(&alloc_lock);
>
> + cast_user(data);
> return data;
> }
>
> @@ -544,9 +561,10 @@ void __arena *scx_task_alloc(struct task_struct *p)
> if (unlikely(!data))
> return NULL;
>
> + mval->data = data;
> + cast_kern(data);
> mval->tid = data->tid;
> mval->tptr = (__u64) p;
> - mval->data = data;
>
> return (void __arena *)data->payload;
> }