[PATCH V6 3/6] sched/fair: Fair server interface

From: Daniel Bristot de Oliveira
Date: Fri Apr 05 2024 - 13:29:07 EST


Add an interface for fair server setup on debugfs.

Each CPU has three files under /debug/sched/fair_server/cpu{ID}:

- runtime: set runtime in ns
- period: set period in ns
- defer: on/off for the defer mechanism

This then leaves /proc/sys/kernel/sched_rt_{period,runtime}_us to set
bounds on admission control.
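
For example, assuming debugfs is mounted at /sys/kernel/debug (the
/debug path above is shorthand), a 25 ms runtime every 100 ms on CPU 0
can be set up with (illustrative values):

  # cd /sys/kernel/debug/sched/fair_server/cpu0
  # echo 100000000 > period
  # echo 25000000 > runtime
  # echo 1 > defer

runtime must not exceed period, and period is bounded to the
[100 us, ~4 s] range enforced by the interface.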

The interface also adds the server to the dl bandwidth accounting.
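
As a sketch of the accounting: the server's bandwidth is the
runtime/period ratio scaled by BW_UNIT (1 << BW_SHIFT, i.e. 1 << 20),
e.g.:

  runtime = 50000000 ns, period = 100000000 ns
  to_ratio(period, runtime) = (runtime << 20) / period = 524288 /* BW_UNIT / 2 */

dl_server_apply_params() rejects new parameters with -EBUSY when this
bandwidth fails the __dl_overflow() check against the root domain
capacity.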

Signed-off-by: Daniel Bristot de Oliveira <bristot@xxxxxxxxxx>
---
 kernel/sched/deadline.c | 111 ++++++++++++++++++----
 kernel/sched/debug.c    | 206 ++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h    |   3 +
 kernel/sched/topology.c |   8 ++
 4 files changed, 311 insertions(+), 17 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 6ea9c05711ce..dd38370aa276 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -321,19 +321,12 @@ void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
__sub_running_bw(dl_se->dl_bw, dl_rq);
}

-static void dl_change_utilization(struct task_struct *p, u64 new_bw)
+static void dl_rq_change_utilization(struct rq *rq, struct sched_dl_entity *dl_se, u64 new_bw)
{
- struct rq *rq;
-
- WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV);
-
- if (task_on_rq_queued(p))
- return;
+ if (dl_se->dl_non_contending) {
+ sub_running_bw(dl_se, &rq->dl);
+ dl_se->dl_non_contending = 0;

- rq = task_rq(p);
- if (p->dl.dl_non_contending) {
- sub_running_bw(&p->dl, &rq->dl);
- p->dl.dl_non_contending = 0;
/*
* If the timer handler is currently running and the
* timer cannot be canceled, inactive_task_timer()
@@ -341,13 +334,25 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw)
* will not touch the rq's active utilization,
* so we are still safe.
*/
- if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
- put_task_struct(p);
+ if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) {
+ if (!dl_server(dl_se))
+ put_task_struct(dl_task_of(dl_se));
+ }
}
- __sub_rq_bw(p->dl.dl_bw, &rq->dl);
+ __sub_rq_bw(dl_se->dl_bw, &rq->dl);
__add_rq_bw(new_bw, &rq->dl);
}

+static void dl_change_utilization(struct task_struct *p, u64 new_bw)
+{
+ WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV);
+
+ if (task_on_rq_queued(p))
+ return;
+
+ dl_rq_change_utilization(task_rq(p), &p->dl, new_bw);
+}
+
static void __dl_clear_params(struct sched_dl_entity *dl_se);

/*
@@ -1191,6 +1196,11 @@ static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_
u64 fw;

rq_lock(rq, &rf);
+
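+ /* Bail out if the server was disabled (zero runtime) while the timer was armed. */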
+ if (!dl_se->dl_runtime)
+ goto unlock;
+
if (dl_se->dl_throttled) {
sched_clock_tick();
update_rq_clock(rq);
@@ -1617,11 +1627,17 @@ void dl_server_start(struct sched_dl_entity *dl_se)
{
struct rq *rq = dl_se->rq;

+ /*
+ * XXX: dl_server_apply_params() does not work correctly during the
+ * init phase for the fair server because things are not yet set up.
+ * We need to improve this before making it generic.
+ */
if (!dl_server(dl_se)) {
/* Disabled */
- dl_se->dl_runtime = 0;
- dl_se->dl_deadline = 1000 * NSEC_PER_MSEC;
- dl_se->dl_period = 1000 * NSEC_PER_MSEC;
+ u64 runtime = 0;
+ u64 period = 1000 * NSEC_PER_MSEC;
+
+ dl_server_apply_params(dl_se, runtime, period, 1);

dl_se->dl_server = 1;
dl_se->dl_defer = 1;
@@ -1656,6 +1672,67 @@ void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
dl_se->server_pick = pick;
}

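+/*
+ * __dl_server_attach_root - Attach the server's bandwidth to the rq's
+ * root domain dl_bw accounting. Called from rq_attach_root(), because
+ * the server is not a task and dl_add_task_root_domain() does not
+ * account for it.
+ */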
+void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq)
+{
+ u64 new_bw = dl_se->dl_bw;
+ struct dl_bw *dl_b;
+ int cpu = cpu_of(rq);
+
+ dl_b = dl_bw_of(cpu_of(rq));
+ raw_spin_lock(&dl_b->lock);
+
+ __dl_add(dl_b, new_bw, dl_bw_cpus(cpu));
+
+ raw_spin_unlock(&dl_b->lock);
+}
+
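+/*
+ * dl_server_apply_params - Apply new parameters to a dl server, passing
+ * them through deadline bandwidth admission control.
+ *
+ * Returns -EBUSY if the new bandwidth does not fit in the root domain,
+ * 0 on success. Called with the rq lock held, so the server cannot be
+ * (re)started while its parameters are changing.
+ */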
+int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 period, bool init)
+{
+ u64 old_bw = init ? 0 : to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+ u64 new_bw = to_ratio(period, runtime);
+ struct rq *rq = dl_se->rq;
+ int cpu = cpu_of(rq);
+ struct dl_bw *dl_b;
+ unsigned long cap;
+ int retval = 0;
+ int cpus;
+
+ dl_b = dl_bw_of(cpu);
+ raw_spin_lock(&dl_b->lock);
+ cpus = dl_bw_cpus(cpu);
+ cap = dl_bw_capacity(cpu);
+
+ if (__dl_overflow(dl_b, cap, old_bw, new_bw)) {
+ retval = -EBUSY;
+ goto out;
+ }
+
+ if (init) {
+ __add_rq_bw(new_bw, &rq->dl);
+ __dl_add(dl_b, new_bw, cpus);
+ } else {
+ __dl_sub(dl_b, dl_se->dl_bw, cpus);
+ __dl_add(dl_b, new_bw, cpus);
+
+ dl_rq_change_utilization(rq, dl_se, new_bw);
+ }
+
+ dl_se->dl_runtime = runtime;
+ dl_se->dl_deadline = period;
+ dl_se->dl_period = period;
+
+ dl_se->runtime = 0;
+ dl_se->deadline = 0;
+
+ dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+ dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
+
+out:
+ raw_spin_unlock(&dl_b->lock);
+
+ return retval;
+}
+
/*
* Update the current task's runtime statistics (provided it is still
* a -deadline task and has not been removed from the dl_rq).
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 8d5d98a5834d..5da3297270cd 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -333,8 +333,212 @@ static const struct file_operations sched_debug_fops = {
.release = seq_release,
};

+enum dl_param {
+ DL_RUNTIME = 0,
+ DL_PERIOD,
+ DL_DEFER
+};
+
+static unsigned long fair_server_period_max = (1 << 22) * NSEC_PER_USEC; /* ~4 seconds */
+static unsigned long fair_server_period_min = (100) * NSEC_PER_USEC; /* 100 us */
+
+static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos, enum dl_param param)
+{
+ long cpu = (long) ((struct seq_file *) filp->private_data)->private;
+ u64 runtime, period, defer;
+ struct rq *rq = cpu_rq(cpu);
+ size_t err;
+ int retval;
+ u64 value;
+
+ err = kstrtoull_from_user(ubuf, cnt, 10, &value);
+ if (err)
+ return err;
+
+ scoped_guard (rq_lock_irqsave, rq) {
+
+ runtime = rq->fair_server.dl_runtime;
+ period = rq->fair_server.dl_period;
+ defer = rq->fair_server.dl_defer;
+
+ switch (param) {
+ case DL_RUNTIME:
+ if (runtime == value)
+ goto out;
+ runtime = value;
+ break;
+ case DL_PERIOD:
+ if (value == period)
+ goto out;
+ period = value;
+ break;
+ case DL_DEFER:
+ if (defer == value)
+ goto out;
+ defer = value;
+ break;
+ }
+
+ if (runtime > period ||
+ period > fair_server_period_max ||
+ period < fair_server_period_min ||
+ defer > 1) {
+ cnt = -EINVAL;
+ goto out;
+ }
+
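+ /*
+ * Stop the server while its parameters change; it is restarted
+ * below if there are runnable fair tasks on this rq.
+ */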
+ if (rq->cfs.h_nr_running) {
+ update_rq_clock(rq);
+ dl_server_stop(&rq->fair_server);
+ }
+
+ /*
+ * The defer does not change utilization, so just
+ * setting it is enough.
+ */
+ if (rq->fair_server.dl_defer != defer) {
+ rq->fair_server.dl_defer = defer;
+ } else {
+ retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0);
+ if (retval)
+ cnt = retval;
+ }
+
+ if (!runtime)
+ printk_deferred("Fair server disabled on CPU %d, system may crash due to starvation.\n",
+ cpu_of(rq));
+
+ if (rq->cfs.h_nr_running)
+ dl_server_start(&rq->fair_server);
+ }
+
+out:
+ *ppos += cnt;
+ return cnt;
+}
+
+static size_t sched_fair_server_show(struct seq_file *m, void *v, enum dl_param param)
+{
+ unsigned long cpu = (unsigned long) m->private;
+ struct rq *rq = cpu_rq(cpu);
+ u64 value;
+
+ switch (param) {
+ case DL_RUNTIME:
+ value = rq->fair_server.dl_runtime;
+ break;
+ case DL_PERIOD:
+ value = rq->fair_server.dl_period;
+ break;
+ case DL_DEFER:
+ value = rq->fair_server.dl_defer;
+ }
+
+ seq_printf(m, "%llu\n", value);
+ return 0;
+}
+
+static ssize_t
+sched_fair_server_runtime_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_RUNTIME);
+}
+
+static int sched_fair_server_runtime_show(struct seq_file *m, void *v)
+{
+ return sched_fair_server_show(m, v, DL_RUNTIME);
+}
+
+static int sched_fair_server_runtime_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_fair_server_runtime_show, inode->i_private);
+}
+
+static const struct file_operations fair_server_runtime_fops = {
+ .open = sched_fair_server_runtime_open,
+ .write = sched_fair_server_runtime_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static ssize_t
+sched_fair_server_period_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_PERIOD);
+}
+
+static int sched_fair_server_period_show(struct seq_file *m, void *v)
+{
+ return sched_fair_server_show(m, v, DL_PERIOD);
+}
+
+static int sched_fair_server_period_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_fair_server_period_show, inode->i_private);
+}
+
+static const struct file_operations fair_server_period_fops = {
+ .open = sched_fair_server_period_open,
+ .write = sched_fair_server_period_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static ssize_t
+sched_fair_server_defer_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_DEFER);
+}
+
+static int sched_fair_server_defer_show(struct seq_file *m, void *v)
+{
+ return sched_fair_server_show(m, v, DL_DEFER);
+}
+
+static int sched_fair_server_defer_open(struct inode *inode, struct file *filp)
+{
+ return single_open(filp, sched_fair_server_defer_show, inode->i_private);
+}
+
+static const struct file_operations fair_server_defer_fops = {
+ .open = sched_fair_server_defer_open,
+ .write = sched_fair_server_defer_write,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
static struct dentry *debugfs_sched;

+static void debugfs_fair_server_init(void)
+{
+ struct dentry *d_fair;
+ unsigned long cpu;
+
+ d_fair = debugfs_create_dir("fair_server", debugfs_sched);
+ if (!d_fair)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct dentry *d_cpu;
+ char buf[32];
+
+ snprintf(buf, sizeof(buf), "cpu%lu", cpu);
+ d_cpu = debugfs_create_dir(buf, d_fair);
+
+ debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &fair_server_runtime_fops);
+ debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &fair_server_period_fops);
+ debugfs_create_file("defer", 0644, d_cpu, (void *) cpu, &fair_server_defer_fops);
+ }
+}
+
static __init int sched_init_debug(void)
{
struct dentry __maybe_unused *numa;
@@ -374,6 +578,8 @@ static __init int sched_init_debug(void)

debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops);

+ debugfs_fair_server_init();
+
return 0;
}
late_initcall(sched_init_debug);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e70e17be83c3..a80a236da57c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -343,6 +343,9 @@ extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq,
extern void dl_server_update_idle_time(struct rq *rq,
struct task_struct *p);
extern void fair_server_init(struct rq *rq);
+extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq);
+extern int dl_server_apply_params(struct sched_dl_entity *dl_se,
+ u64 runtime, u64 period, bool init);

#ifdef CONFIG_CGROUP_SCHED

diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 99ea5986038c..ecb089c4967f 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -517,6 +517,14 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd)
if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
set_rq_online(rq);

+ /*
+ * Because the rq is not a task, dl_add_task_root_domain() did not
+ * move the fair server's bandwidth to the rd when it was attached.
+ * Add it now if the server has already been started.
+ */
+ if (rq->fair_server.dl_server)
+ __dl_server_attach_root(&rq->fair_server, rq);
+
rq_unlock_irqrestore(rq, &rf);

if (old_rd)
--
2.44.0