Re: [PATCH v7 2/6] sched_ext: Implement scx_bpf_now()

From: Peter Zijlstra
Date: Wed Jan 08 2025 - 03:50:44 EST



> +__bpf_kfunc u64 scx_bpf_now(void)
> +{
> + struct rq *rq;
> + u64 clock;
> +
> + preempt_disable();
> +
> + rq = this_rq();
> + if (READ_ONCE(rq->scx.flags) & SCX_RQ_CLK_VALID) {
> + /*
> + * If the rq clock is valid, use the cached rq clock.
> + *
> + * Note that scx_bpf_now() is re-entrant between a process
> + * context and an interrupt context (e.g., timer interrupt).
> + * However, we don't need to consider the race between them
> + * because such race is not observable from a caller.
> + */
> + clock = READ_ONCE(rq->scx.clock);
> + } else {
> + /*
> + * Otherwise, return a fresh rq clock.
> + *
> + * The rq clock is updated outside of the rq lock.
> + * In this case, keep the updated rq clock invalid so the next
> + * kfunc call outside the rq lock gets a fresh rq clock.
> + */
> + clock = sched_clock_cpu(cpu_of(rq));
> + }
> +
> + preempt_enable();
> +
> + return clock;
> +}

> +static inline void scx_rq_clock_update(struct rq *rq, u64 clock)
> +{
> + if (!scx_enabled())
> + return;
> + WRITE_ONCE(rq->scx.clock, clock);
> + WRITE_ONCE(rq->scx.flags, rq->scx.flags | SCX_RQ_CLK_VALID);
> +}


AFAICT it is possible to be used like:


        CPU0                                    CPU1

        lock(rq1->lock);
        ...
        scx_rq_clock_update(...);               scx_bpf_now();
        ...
        unlock(rq1->lock);


Which then enables the following ordering problem:

        CPU0                                    CPU1

        WRITE_ONCE(rq->scx.clock, clock);       if (rq->scx.flags & VALID)
        WRITE_ONCE(rq->scx.flags, VALID);               return rq->scx.clock;

Where it then becomes possible to observe VALID before clock is written.

That is, I rather think you need:

> +static inline void scx_rq_clock_update(struct rq *rq, u64 clock)
> +{
> + if (!scx_enabled())
> + return;
> + WRITE_ONCE(rq->scx.clock, clock);
> + smp_store_release(&rq->scx.flags, rq->scx.flags | SCX_RQ_CLK_VALID);
> +}

and:

if (smp_load_acquire(&rq->scx.flags) & SCX_RQ_CLK_VALID) {
> + /*
> + * If the rq clock is valid, use the cached rq clock.
> + *
> + * Note that scx_bpf_now() is re-entrant between a process
> + * context and an interrupt context (e.g., timer interrupt).
> + * However, we don't need to consider the race between them
> + * because such race is not observable from a caller.
> + */
> + clock = READ_ONCE(rq->scx.clock);

Such that if you observe VALID, you must then also observe the clock.