Re: [PATCH] sched/deadline: Reject debugfs dl_server writes for offline CPUs
From: abaci-kreproducer
Date: Tue May 26 2026 - 08:10:56 EST
From: abaci-kreproducer <abaci@xxxxxxxxxxxxxxxxx>
This is an AI-generated validation of this patch. The AI successfully
reproduced the issue and confirmed that the fix is valid. The results were
verified by D. Wythe <alibuda@xxxxxxxxxxxxxxxxx>.
Tested-by: abaci-kreproducer <abaci@xxxxxxxxxxxxxxxxx>
---
We reproduced this issue on the unpatched kernel:
Writing dl_server debugfs parameters (runtime/period) on an offline CPU
triggers WARN_ON_ONCE in dl_server_start(). dmesg shows: "WARNING:
kernel/sched/deadline.c:1804 at dl_server_start+0xa6/0x120".
sched_server_write_common() did not check cpu_online() before calling
dl_server_stop(), dl_server_apply_params(), and dl_server_start(). When
the target CPU is offline, dl_server_start() hits
WARN_ON_ONCE(!cpu_online(cpu_of(rq))). Additionally, if the CPU has been
removed from the root-domain span, dl_bw_cpus() returns 0, causing a
divide-by-zero in __dl_sub()/__dl_add().
The reproducer offlines a non-boot CPU via sysfs, then writes to
/sys/kernel/debug/sched/fair_server/cpuN/runtime and period. This
exercises the unguarded path in sched_server_write_common().
On the patched kernel, writes to an offline CPU's dl_server debugfs
files return -EBUSY. No WARNING or Oops appears in dmesg.
---
Key configuration
* kconfig:
CONFIG_SMP=y
CONFIG_SCHED_DEBUG=y
* kernel_cmdline: -
* rpm package: -
--
run.sh
#!/bin/bash
# Driver for the dl_server offline-CPU reproducer: builds the binary under
# bin/, checks prerequisites, runs it, and stores matching kernel log lines
# in results/dmesg-errors.txt (both directories are relative to this script).
# The script exits with the reproducer's exit status; status 2 is also used
# for a failed build or a failed prerequisite check.
# Trace every command as it is executed.
set -x
echo "=== Kernel Bug Reproducer ==="
echo "Bug: sched/deadline - Divide-by-zero and WARN_ON_ONCE when writing dl_server"
echo " debugfs params on offline CPUs"
echo "Commit: 7b63735b609891e9dd441361df3ed42a404a5cc6"
echo "Fix: Reject debugfs dl_server writes for offline CPUs with -EBUSY"
echo
# Absolute path of the directory containing this script, so the script can be
# invoked from any working directory.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# Build the reproducer
echo "Building reproducer..."
make -C "$SCRIPT_DIR/bin" clean
make -C "$SCRIPT_DIR/bin"
# $? is the status of the build above; the result of "make clean" is not checked.
if [ $? -ne 0 ]; then
echo "ERROR: Failed to build reproducer"
exit 2
fi
# Check prerequisites
echo "Checking prerequisites..."
# Check for debugfs sched path
if [ ! -d "/sys/kernel/debug/sched/fair_server" ]; then
echo "ERROR: /sys/kernel/debug/sched/fair_server not found."
echo " Kernel needs CONFIG_SCHED_DEBUG=y and debugfs mounted."
exit 2
fi
echo " debugfs fair_server path: OK"
# Check for at least 2 CPUs
NUM_CPUS=$(nproc)
if [ "$NUM_CPUS" -lt 2 ]; then
echo "ERROR: Need at least 2 CPUs (have $NUM_CPUS)"
exit 2
fi
echo " CPU count: $NUM_CPUS (>= 2): OK"
# Clear dmesg before running
# (dmesg -c prints and then clears the ring buffer; the printed text is
# discarded and a failure to clear is ignored.)
echo "Clearing dmesg..."
dmesg -c > /dev/null || true
# Run the reproducer
echo ""
echo "Running reproducer..."
echo "========================="
"$SCRIPT_DIR/bin/reproducer"
# Remember the status now; every later command overwrites $?.
REPRODUCER_EXIT=$?
echo "========================="
echo ""
# Capture dmesg errors after run
# The filter is case-insensitive (-i), so "BUG" also matches lines that merely
# contain "debug"; the file is informational and does not affect the exit code.
echo "Capturing kernel messages..."
mkdir -p "$SCRIPT_DIR/results" || true
dmesg 2>&1 | grep -iE "divide error|Oops|panic|BUG|WARNING|dl_server" > "$SCRIPT_DIR/results/dmesg-errors.txt" || true
# Print dmesg errors if any
# (-s is true only when the capture file exists and is non-empty.)
if [ -s "$SCRIPT_DIR/results/dmesg-errors.txt" ]; then
echo ""
echo "=== Kernel messages detected ==="
cat "$SCRIPT_DIR/results/dmesg-errors.txt"
echo ""
fi
# Report the reproducer's exit status and what each value means
echo "Reproducer exit code: $REPRODUCER_EXIT"
echo ""
echo "Reproducer completed."
echo "Exit code: $REPRODUCER_EXIT"
echo " 0 = bug not triggered (possibly fixed)"
echo " 1 = bug triggered"
echo " 2 = prerequisite failure"
# Propagate the reproducer's verdict to the caller.
exit $REPRODUCER_EXIT
--
reproducer.c
/*
* reproducer.c - Reproducer for sched/deadline dl_server offline CPU bug
*
* Bug: Writing runtime or period via per-CPU dl_server debugfs files
* (/sys/kernel/debug/sched/fair_server/cpuN/{runtime,period})
* on an offline CPU triggers:
*
* 1) Divide-by-zero in dl_server_apply_params():
* __dl_sub() and __dl_add() divide by cpus, which can be 0
* when the CPU has been removed from any active root-domain span.
*
* 2) WARN_ON_ONCE in dl_server_start():
* Catches enqueueing the server on an offline rq.
*
* Reproducer strategy:
* - Find an available CPU (not CPU0, which usually can't be offlined)
* - Offline the CPU
* - Write to the dl_server debugfs runtime/period files for that CPU
* - Check for kernel warnings, Oops, or divide errors in dmesg
*
* Fix: commit 7b63735b609891e9dd441361df3ed42a404a5cc6 adds
* `if (!cpu_online(cpu_of(rq))) return -EBUSY;` check.
*/
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <dirent.h>
#include <sched.h>
#include <time.h>
#define DEBUGFS_SCHED_PATH "/sys/kernel/debug/sched/fair_server"
#define CPU_OFFLINE_PATH "/sys/devices/system/cpu/cpu%d/online"
#define MAX_CPUS 256
/*
 * Report whether the fair_server debugfs directory is present
 * (requires CONFIG_SCHED_DEBUG).
 *
 * Returns 1 when stat() on DEBUGFS_SCHED_PATH succeeds, 0 otherwise.
 */
static int check_debugfs_available(void)
{
    struct stat info;

    if (stat(DEBUGFS_SCHED_PATH, &info) != 0)
        return 0;

    return 1;
}
/*
 * Query how many CPUs are currently online.
 *
 * Returns the count reported by sysconf(_SC_NPROCESSORS_ONLN), or -1
 * (after printing the reason with perror) when the query fails.
 */
static int get_num_cpus(void)
{
    int online_count = (int)sysconf(_SC_NPROCESSORS_ONLN);

    if (online_count >= 0)
        return online_count;

    perror("sysconf(_SC_NPROCESSORS_ONLN)");
    return -1;
}
/*
 * Tell whether sysfs has a directory for the given CPU number.
 *
 * Returns 1 when /sys/devices/system/cpu/cpu<cpu> can be stat()ed,
 * 0 otherwise.
 */
static int cpu_dir_exists(int cpu)
{
    struct stat info;
    char sysfs_dir[256];

    snprintf(sysfs_dir, sizeof(sysfs_dir), "/sys/devices/system/cpu/cpu%d", cpu);
    if (stat(sysfs_dir, &info) != 0)
        return 0;

    return 1;
}
/*
 * Read the online status of a CPU from its sysfs "online" file.
 *
 * Returns:
 *   1  the CPU is online, or it has no "online" file at all (ENOENT);
 *      a CPU without that file (e.g. CPU0 / the boot CPU) is treated
 *      as always online
 *   0  the CPU is offline
 *  -1  the file could not be opened or read, or did not start with a
 *      number (the reason is printed to stderr)
 */
static int cpu_is_online(int cpu)
{
    char path[256];
    char buf[16];
    char *end;
    long state;
    int fd;
    ssize_t n;

    snprintf(path, sizeof(path), CPU_OFFLINE_PATH, cpu);
    fd = open(path, O_RDONLY);
    if (fd < 0) {
        /* Only a missing file means "cannot be offlined, so online". */
        if (errno == ENOENT)
            return 1;
        perror("open cpu online status");
        return -1;
    }

    n = read(fd, buf, sizeof(buf) - 1);
    if (n < 0)
        perror("read cpu online status"); /* report before close() can change errno */
    close(fd);
    if (n < 0)
        return -1;

    buf[n] = '\0';
    state = strtol(buf, &end, 10);
    if (end == buf) {
        /* Empty or non-numeric content: do not guess a state. */
        fprintf(stderr, "Unexpected content in %s: '%s'\n", path, buf);
        return -1;
    }

    return state != 0;
}
/*
 * Offline a CPU by writing "0" to its sysfs online file.
 *
 * Returns 0 on success, or -1 when the file cannot be opened or the write
 * fails; the reason is printed to stderr.
 */
static int offline_cpu(int cpu)
{
    char path[256];
    int fd;
    int saved_errno;
    ssize_t n;

    snprintf(path, sizeof(path), CPU_OFFLINE_PATH, cpu);
    fd = open(path, O_WRONLY);
    if (fd < 0) {
        fprintf(stderr, "Cannot open %s: %s\n", path, strerror(errno));
        return -1;
    }

    n = write(fd, "0\n", 2);
    saved_errno = errno; /* close() below may overwrite errno */
    close(fd);
    if (n < 0) {
        fprintf(stderr, "Failed to offline CPU %d: %s\n", cpu, strerror(saved_errno));
        return -1;
    }

    printf("CPU %d offlined successfully\n", cpu);
    return 0;
}
/*
 * Online a CPU by writing "1" to its sysfs online file.
 *
 * Returns 0 on success, or -1 when the file cannot be opened or the write
 * fails; the reason is printed to stderr.
 */
static int online_cpu(int cpu)
{
    char path[256];
    int fd;
    int saved_errno;
    ssize_t n;

    snprintf(path, sizeof(path), CPU_OFFLINE_PATH, cpu);
    fd = open(path, O_WRONLY);
    if (fd < 0) {
        fprintf(stderr, "Cannot open %s: %s\n", path, strerror(errno));
        return -1;
    }

    n = write(fd, "1\n", 2);
    saved_errno = errno; /* close() below may overwrite errno */
    close(fd);
    if (n < 0) {
        fprintf(stderr, "Failed to online CPU %d: %s\n", cpu, strerror(saved_errno));
        return -1;
    }

    printf("CPU %d onlined successfully\n", cpu);
    return 0;
}
/*
 * Write a value to the dl_server debugfs runtime file for a given CPU.
 *
 * cpu:   CPU number used to build the .../cpu<cpu>/runtime path
 * value: NUL-terminated text written as-is
 *
 * Returns 0 when write() succeeds, -1 when the file cannot be opened or the
 * write fails. On failure the reason is printed to stderr and errno is left
 * set to the error of the failing open()/write() so the caller can tell,
 * for example, EBUSY apart from other errors.
 */
static int write_dl_runtime(int cpu, const char *value)
{
    char path[512];
    int fd;
    int saved_errno;
    ssize_t n;

    snprintf(path, sizeof(path), DEBUGFS_SCHED_PATH "/cpu%d/runtime", cpu);
    fd = open(path, O_WRONLY);
    if (fd < 0) {
        saved_errno = errno;
        fprintf(stderr, "Cannot open %s: %s\n", path, strerror(saved_errno));
        errno = saved_errno;
        return -1;
    }

    n = write(fd, value, strlen(value));
    saved_errno = errno; /* close() below may overwrite errno */
    close(fd);
    if (n < 0) {
        fprintf(stderr, "Write to %s failed: %s\n", path, strerror(saved_errno));
        errno = saved_errno;
        return -1;
    }

    printf("Wrote '%s' to %s (success, wrote %zd bytes)\n", value, path, n);
    return 0;
}
/*
 * Write a value to the dl_server debugfs period file for a given CPU.
 *
 * cpu:   CPU number used to build the .../cpu<cpu>/period path
 * value: NUL-terminated text written as-is
 *
 * Returns 0 when write() succeeds, -1 when the file cannot be opened or the
 * write fails. On failure the reason is printed to stderr and errno is left
 * set to the error of the failing open()/write() so the caller can tell,
 * for example, EBUSY apart from other errors.
 */
static int write_dl_period(int cpu, const char *value)
{
    char path[512];
    int fd;
    int saved_errno;
    ssize_t n;

    snprintf(path, sizeof(path), DEBUGFS_SCHED_PATH "/cpu%d/period", cpu);
    fd = open(path, O_WRONLY);
    if (fd < 0) {
        saved_errno = errno;
        fprintf(stderr, "Cannot open %s: %s\n", path, strerror(saved_errno));
        errno = saved_errno;
        return -1;
    }

    n = write(fd, value, strlen(value));
    saved_errno = errno; /* close() below may overwrite errno */
    close(fd);
    if (n < 0) {
        fprintf(stderr, "Write to %s failed: %s\n", path, strerror(saved_errno));
        errno = saved_errno;
        return -1;
    }

    printf("Wrote '%s' to %s (success, wrote %zd bytes)\n", value, path, n);
    return 0;
}
/*
 * Empty the kernel log buffer by running "dmesg -c" through the shell.
 * All command output is discarded and its exit status is ignored.
 */
static void clear_dmesg(void)
{
    static const char clear_cmd[] = "dmesg -c > /dev/null 2>&1";
    int status;

    status = system(clear_cmd);
    (void)status; /* best effort: nothing to do if clearing fails */
}
/*
 * Check dmesg for kernel error indicators.
 *
 * Runs dmesg through a case-insensitive grep for "divide error", "Oops",
 * "panic", a standalone "BUG", or a "WARNING" line that mentions "deadline".
 * "BUG" must not be surrounded by letters, so lines that merely contain
 * words such as "debug" or "debugfs" are not counted as bug symptoms.
 *
 * Returns 1 if bug symptoms were found, 0 otherwise (including when the
 * command could not be run).
 */
static int check_dmesg_for_bug(void)
{
    int ret;

    ret = system("dmesg 2>&1 | grep -qiE "
                 "'divide error|Oops|panic"
                 "|(^|[^[:alpha:]])BUG([^[:alpha:]]|$)"
                 "|WARNING.*deadline'");

    /* grep -q exits with status 0 only when at least one line matched. */
    return ret == 0;
}
/*
 * Print relevant dmesg output for debugging.
 *
 * The matching lines are written to stdout by the child process. Pending
 * stdio output is flushed first so that, when stdout is a pipe or a file,
 * the kernel messages appear after the text this program printed before
 * the call instead of ahead of it.
 */
static void print_dmesg_errors(void)
{
    int ret;

    fflush(stdout);
    ret = system("dmesg 2>&1 | grep -iE 'divide error|Oops|panic|BUG|WARNING|dl_server|EBUSY'");
    (void)ret; /* informational only: "no match" is not an error */
}
/*
 * Reproducer entry point.
 *
 * Picks a non-zero CPU, takes it offline if it is not offline already,
 * writes the fair_server runtime and period for that CPU through debugfs,
 * and scans dmesg for kernel errors after each write. A CPU that this run
 * took offline is brought back online before exiting; a CPU that was
 * already offline when it was picked is left offline.
 *
 * Exit status: 0 = no kernel error detected, 1 = bug triggered,
 *              2 = prerequisite failure.
 */
int main(void)
{
    int num_cpus;
    int target_cpu = -1;
    int offlined_by_us = 0; /* set only when this run took the CPU offline */
    int i;
    int ret;
    int bug_triggered = 0;

    printf("=== sched/deadline dl_server offline CPU bug reproducer ===\n\n");

    /* Step 1: Check prerequisites */
    printf("[Step 1] Checking prerequisites...\n");
    if (!check_debugfs_available()) {
        fprintf(stderr, "ERROR: %s not found. Kernel needs CONFIG_SCHED_DEBUG=y.\n",
                DEBUGFS_SCHED_PATH);
        return 2;
    }
    printf(" debugfs fair_server path exists: OK\n");

    num_cpus = get_num_cpus();
    if (num_cpus < 2) {
        fprintf(stderr, "ERROR: Need at least 2 CPUs to reproduce (have %d)\n", num_cpus);
        return 2;
    }
    printf(" Number of CPUs: %d (need >= 2)\n", num_cpus);

    /* Step 2: Find a target CPU that can be offlined (CPU 0 is never tried) */
    printf("\n[Step 2] Finding offlinable CPU...\n");
    for (i = num_cpus - 1; i > 0; i--) {
        int online;

        if (!cpu_dir_exists(i)) {
            continue;
        }

        online = cpu_is_online(i);
        if (online < 0) {
            printf(" CPU %d: error checking status, skipping\n", i);
            continue;
        }

        if (online == 0) {
            /* Already offline, use this one (and do not online it later) */
            target_cpu = i;
            printf(" CPU %d: already offline, using it\n", target_cpu);
            break;
        }

        /* Try to offline this CPU */
        if (offline_cpu(i) == 0) {
            target_cpu = i;
            offlined_by_us = 1;
            break;
        }
        printf(" CPU %d: cannot be offlined, trying next...\n", i);
    }

    if (target_cpu < 0) {
        fprintf(stderr, "ERROR: Could not find any CPU that can be offlined\n");
        return 2;
    }

    /* Re-check the chosen CPU: if it does not read back as offline, make one
     * more attempt so the writes below really target an offline CPU. */
    if (cpu_is_online(target_cpu)) {
        /* Shouldn't happen, but just in case */
        if (offline_cpu(target_cpu) != 0) {
            fprintf(stderr, "ERROR: Could not offline CPU %d\n", target_cpu);
            return 2;
        }
        offlined_by_us = 1;
    }
    printf(" Target CPU for reproducer: %d\n", target_cpu);

    /* Step 3: Clear dmesg before test */
    printf("\n[Step 3] Clearing dmesg...\n");
    clear_dmesg();

    /* Step 4: Trigger the bug - write to dl_server debugfs on offline CPU */
    printf("\n[Step 4] Triggering bug by writing dl_server params on offline CPU %d...\n\n",
           target_cpu);

    /*
     * Try writing runtime first. This should trigger:
     * - Before fix: divide-by-zero in dl_server_apply_params() -> Oops
     * - Before fix: WARN_ON_ONCE in dl_server_start()
     * - After fix: -EBUSY error returned
     */
    printf("--- Writing runtime to offline CPU %d ---\n", target_cpu);
    ret = write_dl_runtime(target_cpu, "1000000000"); /* 1 second in ns */
    if (ret == 0) {
        /* Write succeeded - this means the kernel accepted the write,
         * but the bug may have been triggered internally. Check dmesg. */
        printf(" Write returned success, checking dmesg for kernel messages...\n");
        sleep(1); /* Give kernel time to log */
        if (check_dmesg_for_bug()) {
            printf(" BUG DETECTED: Kernel error found in dmesg!\n");
            bug_triggered = 1;
        } else {
            printf(" No kernel errors detected in dmesg (bug may be fixed)\n");
        }
    } else {
        /* Write failed - the kernel may still have logged an error */
        printf(" Write failed. Checking dmesg for any kernel messages...\n");
        sleep(1);
        if (check_dmesg_for_bug()) {
            printf(" BUG DETECTED: Kernel error found in dmesg despite write failure!\n");
            bug_triggered = 1;
        }
    }

    /* Also try writing period */
    printf("\n--- Writing period to offline CPU %d ---\n", target_cpu);
    ret = write_dl_period(target_cpu, "2000000000"); /* 2 seconds in ns */
    if (ret == 0) {
        printf(" Write returned success, checking dmesg...\n");
        sleep(1);
        if (check_dmesg_for_bug() && !bug_triggered) {
            printf(" BUG DETECTED: Kernel error found in dmesg!\n");
            bug_triggered = 1;
        }
    } else {
        printf(" Write failed. Checking dmesg...\n");
        sleep(1);
        if (check_dmesg_for_bug() && !bug_triggered) {
            printf(" BUG DETECTED: Kernel error found in dmesg despite write failure!\n");
            bug_triggered = 1;
        }
    }

    /* Step 5: Print dmesg for debugging */
    printf("\n[Step 5] Relevant kernel messages:\n");
    print_dmesg_errors();

    /* Step 6: Cleanup - restore the CPU to the state it was found in */
    if (offlined_by_us) {
        printf("\n[Step 6] Cleanup - onlining CPU %d...\n", target_cpu);
        if (cpu_is_online(target_cpu) <= 0) {
            online_cpu(target_cpu);
        }
    } else {
        printf("\n[Step 6] Cleanup - CPU %d was already offline before the test, leaving it offline\n",
               target_cpu);
    }

    /* Summary */
    printf("\n=== Summary ===\n");
    if (bug_triggered) {
        printf("BUG TRIGGERED: The kernel bug was successfully reproduced.\n");
        printf("Expected behavior: divide-by-zero Oops or WARN_ON_ONCE in dl_server_start()\n");
        return 1; /* Return non-zero to indicate bug triggered */
    }

    printf("BUG NOT TRIGGERED: No kernel errors detected.\n");
    printf("Possible reasons:\n");
    printf(" - The fix patch is applied (write returns -EBUSY)\n");
    printf(" - The CPU could not be fully offlined from root-domain span\n");
    printf(" - Race condition: CPU came back online before write\n");
    return 0;
}