Re: [PATCH] sched: Re-evaluate scheduling when migrating queued tasks out of throttled cgroups

From: Zicheng Qu

Date: Fri Jan 30 2026 - 04:04:23 EST

On 1/30/2026 4:34 PM, Zicheng Qu wrote:

4) For kernel <= 5.10: Later, cgroup A is unthrottled. However, the task
P has already been migrated out of cgroup A, so unthrottle_cfs_rq()
may observe load_weight == 0 and return early without calling
resched_curr(). For kernel >= 6.6: The unthrottling path triggers
`resched_curr()` in almost all cases, even when no runnable tasks remain
in the unthrottled cgroup, preventing the idle stall described above.
However, if cgroup A is removed before it gets unthrottled, the
unthrottling path for cgroup A is never executed. As a result,
`resched_curr()` is never called.

Hi Aaron,

Apologies for the confusion in my earlier description — the original
failure model was identified and analyzed on kernels based on LTS 5.10.

Later I realized that on v6.6 and mainline, the issue becomes much harder
to reproduce due to an additional condition (cfs_rq->on_list) introduced
in unthrottle_cfs_rq(), which effectively masks the original reproduction
path.

As a result, I adjusted the reproducer accordingly. With the updated
reproducer, the issue can still be triggered on mainline by explicitly
bypassing the unthrottling reschedule path, as described in the commit
message.

The reproducer can be run directly via:

./make.sh

My local /proc/cmdline is:

systemd.unified_cgroup_hierarchy=0 nohz_full=2-15 rcu_nocbs=2-15

With this setup, the issue is reproducible on current mainline.

make.sh
```sh
#!/bin/bash
#
# make.sh
#
# Build the heartbeat workload, then launch the reproducer script.
#

# Abort on the first failure so that a broken build never runs the test
# against a missing or stale ./heartbeat binary.
set -e

gcc -O2 heartbeat.c -o heartbeat

chmod +x ./run_test.sh
./run_test.sh
```

heartbeat.c
```c
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <unistd.h>
#include <time.h>

/* Return the current CLOCK_MONOTONIC reading expressed in nanoseconds. */
static inline long long now_ns(void)
{
	struct timespec t;
	long long ns;

	clock_gettime(CLOCK_MONOTONIC, &t);

	/* Whole seconds scaled to nanoseconds, plus the sub-second part. */
	ns = t.tv_sec * 1000000000LL;
	ns += t.tv_nsec;
	return ns;
}

/*
 * Heartbeat workload: pin the process to CPU 12, then busy-loop forever,
 * printing one "[HB]" line whenever more than one second of monotonic time
 * has passed since the previous line.
 *
 * Returns 1 if the CPU affinity cannot be set; otherwise never returns.
 */
int main(void)
{
	cpu_set_t set;
	CPU_ZERO(&set);
	CPU_SET(12, &set); // CPU 12 is nohz_full

	/*
	 * The reproducer only makes sense on the pinned CPU, so a failed
	 * pin is reported and treated as fatal instead of being ignored.
	 */
	if (sched_setaffinity(0, sizeof(set), &set) != 0) {
		perror("sched_setaffinity");
		return 1;
	}

	long long last = now_ns();
	unsigned long long iter = 0;

	while (1) {
		iter++;
		long long now = now_ns();
		if (now - last > 1000 * 1000 * 1000) { // 1000ms
			printf("[HB] sec=%lld pid=%d cpu=%d iter=%llu\n", now / 1000000000LL, getpid(), sched_getcpu(), iter);
			/* Flush so each line is written out as soon as it is printed. */
			fflush(stdout);
			last = now_ns();
		}
	}
}
```

```sh
#!/bin/bash
#
# run_test.sh
#
# Reproducer for a scheduling stall on nohz_full CPUs when migrating
# queued tasks out of throttled cgroups.
#
# Test outline:
# 1. Start a CPU-bound workload (heartbeat) that prints a heartbeat (HB)
# once per second.
# 2. Migrate the task into a heavily throttled child cgroup.
# 3. Migrate the task back to the root cgroup (potential trigger point).
# 4. Immediately remove (destroy) the throttled cgroup before it gets
# unthrottled.
# 5. Observe whether the heartbeat continues to advance.
# - If HB advances: no stall, continue to next round.
# - If HB stops advancing: scheduling stall detected, freeze the setup
# for debugging.
#

set -e

########################
# Basic configuration
########################

# cgroup v1 "cpu" controller hierarchy and the child group that gets throttled.
ROOT_CG=/sys/fs/cgroup/cpu
THROTTLED_CG=$ROOT_CG/child_cgroup

mkdir -p "$ROOT_CG"

# Fail early with a clear message: the test writes PIDs to "$ROOT_CG/tasks",
# so without a writable tasks file every later step would fail obscurely.
if [[ ! -w "$ROOT_CG/tasks" ]]; then
  echo "[!] $ROOT_CG/tasks is missing or not writable;" \
    "run as root with the cgroup v1 cpu controller mounted at $ROOT_CG" >&2
  exit 1
fi

# Log file whose last line is inspected for the latest heartbeat.
HB_LOG='heartbeat.log'

# CFS bandwidth limit for the throttled cgroup, in microseconds:
# 1000us (1ms) of runtime for every 1000000us (1s) period.
CFS_PERIOD_US=$((1000 * 1000))
CFS_QUOTA_US=1000

# Polling interval, and how long (both in seconds) the heartbeat may fail
# to advance before the workload is considered "stuck".
CHECK_INTERVAL='0.2'
STUCK_TIMEOUT=10

########################
# Cleanup logic
########################

# PID of the heartbeat workload; empty until the workload has been started.
PID=

#######################################
# Stop the heartbeat workload if it is still running.
# Globals:   PID (read)
# Outputs:   progress messages on stdout
# Returns:   0
#######################################
cleanup() {
  echo
  echo "[!] cleanup: stopping workload"

  if [[ -n "$PID" ]] && kill -0 "$PID" 2>/dev/null; then
    echo "[!] killing pid $PID"
    # The process may exit between the liveness check and the signal;
    # that race must not abort the handler under `set -e`.
    kill -TERM "$PID" 2>/dev/null || true
    wait "$PID" 2>/dev/null || true
  fi

  echo "[!] cleanup done"
}

# cleanup runs exactly once, from the EXIT trap. A signal trap that merely
# returns lets bash resume the script, so INT/TERM exit explicitly (with the
# conventional 128+signal status), which in turn fires the EXIT trap.
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

########################
# Start workload
########################

echo "[+] starting heartbeat workload"

# Send the output through tee via process substitution instead of a pipeline:
# the background job is then ./heartbeat itself, so $! is the workload's PID
# and no guessing relative to tee's PID is needed.
./heartbeat > >(tee "$HB_LOG") &
PID=$!

echo "[+] workload pid = $PID"
echo

########################
# Helper functions
########################

# Extract the last printed heartbeat second from the log
#######################################
# Print the value of the first "sec=" field found on the last line of the
# heartbeat log. Prints nothing when the log cannot be read or the last
# line carries no such field.
# Globals:   HB_LOG (read)
# Outputs:   the heartbeat second on stdout
# Returns:   0
#######################################
last_hb_sec() {
  local tail_line field value
  local -a fields

  # A missing log is expected early on; treat it as an empty line.
  tail_line=$(tail -n 1 "$HB_LOG" 2>/dev/null) || true
  read -r -a fields <<<"$tail_line"

  for field in "${fields[@]}"; do
    if [[ "$field" == sec=* ]]; then
      # Keep only the text between the first and (if any) second '='.
      value=${field#sec=}
      printf '%s\n' "${value%%=*}"
      break
    fi
  done

  return 0
}

#######################################
# Show which of the two cgroups currently lists the workload PID.
# Globals:   ROOT_CG, THROTTLED_CG, PID (read)
# Outputs:   one header line per cgroup, followed by the PID when that
#            cgroup's tasks file contains it
# Returns:   0 (a missing PID or tasks file is not an error)
#######################################
verify_cgroup_location() {
  echo " root cgroup:"
  # -F -x: compare the PID against whole lines so that, for example,
  # PID 123 is not reported because PID 1234 is listed.
  grep -Fx -- "$PID" "$ROOT_CG/tasks" || true
  echo " throttled cgroup:"
  grep -Fx -- "$PID" "$THROTTLED_CG/tasks" || true
}

########################
# Main test loop
########################

# Round counter; used only for the progress banner.
round=0

# Each iteration is one reproduction attempt. The outer loop has no exit of
# its own: the script ends on a command failure (set -e), on a signal, or by
# parking itself in the stall branch below.
while true; do
# Recreate the throttled cgroup for the next iteration
mkdir -p "$THROTTLED_CG"
echo $CFS_QUOTA_US > "$THROTTLED_CG/cpu.cfs_quota_us"
echo $CFS_PERIOD_US > "$THROTTLED_CG/cpu.cfs_period_us"

round=$((round + 1))
echo "========== ROUND $round =========="

echo "[1] move task into throttled cgroup"
echo "$PID" > "$THROTTLED_CG/tasks"

echo "[1.1] verify cgroup placement"
verify_cgroup_location

# Give the task some time to consume its quota and become throttled
sleep 0.2

echo "[2] migrate task back to root cgroup (potential trigger)"
echo "$PID" > "$ROOT_CG/tasks"

echo "[2.1] verify cgroup placement"
verify_cgroup_location

#
# IMPORTANT:
# For kernels >= 6.6, unthrottling normally triggers resched_curr().
# Removing the throttled cgroup before it gets unthrottled bypasses
# the unthrottle path and is required to reproduce the stall.
#
echo "[2.2] remove throttled cgroup before unthrottling"
rmdir "$THROTTLED_CG"

# Observe heartbeat after migration back to root
# (an empty result, e.g. no heartbeat logged yet, is treated as 0)
base_hb=$(last_hb_sec)
[[ -z "$base_hb" ]] && base_hb=0

echo "[3] observing heartbeat (base_hb_sec=$base_hb)"

# Wall-clock start of the observation window, in whole seconds.
start_ts=$(date +%s)

# Poll the log until the heartbeat second moves past the baseline or the
# observation window exceeds STUCK_TIMEOUT.
while true; do
cur_hb=$(last_hb_sec)
[[ -z "$cur_hb" ]] && cur_hb=0

# Progress: the workload got CPU time after the migration; next round.
if (( cur_hb > base_hb )); then
echo "[OK] heartbeat advanced: $base_hb -> $cur_hb"
break
fi

# No progress yet: check whether the window has expired.
now_ts=$(date +%s)
if (( now_ts - start_ts >= STUCK_TIMEOUT )); then
echo
echo "[!!!] SCHEDULING STALL DETECTED AFTER MIGRATION !!!"
echo "[!!!] base_hb_sec=$base_hb cur_hb_sec=$cur_hb"
echo "[!!!] freezing setup for debugging for 20s"
echo

# Give some time to attach debuggers / tracing
sleep 20

echo "[!!!] workload still stuck, entering infinite sleep, and will continue to run now"

taskset -c 12 sleep 1 # run a second task on CPU 12; more than one task breaks the nohz_full state

# Park here indefinitely; no further rounds are run after a stall.
while true; do
sleep 3600
done
fi

# Wait one polling interval before reading the log again.
sleep "$CHECK_INTERVAL"
done

echo "[4] wait before next round"
sleep 1
done
```

Best regards,
Zicheng