[PATCH 12/14] sched: make dl_bw a sub-quota of rt_bw
From: Juri Lelli
Date: Thu Nov 07 2013 - 08:44:59 EST
Change real-time bandwidth management so as to make dl_bw a sub-quota
of rt_bw. This patch leaves rt_bw at its default value and sets
dl_bw at 40% of rt_bw. It also removes the sched_dl_period_us control
knob, using sched_rt_period_us as the common period for both rt_bw and
dl_bw.
Checks are made when the user tries to change the dl_bw sub-quota, so
that it does not fall below what is currently used. Since dl_bw now
depends upon rt_bw, similar checks are performed when the user modifies
rt_bw, and dl_bw is changed accordingly. Setting the rt_bw sysctl variable
to -1 (actually disabling rt throttling) disables dl_bw checks as well.
Signed-off-by: Juri Lelli <juri.lelli@xxxxxxxxx>
---
include/linux/sched/deadline.h | 2 +
include/linux/sched/sysctl.h | 2 -
kernel/sched/core.c | 322 ++++++++++++++++++++--------------------
kernel/sched/deadline.c | 13 +-
kernel/sched/sched.h | 24 ++-
kernel/sysctl.c | 7 -
6 files changed, 173 insertions(+), 197 deletions(-)
diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index 9d303b8..2e44877 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -21,4 +21,6 @@ static inline int dl_task(struct task_struct *p)
return dl_prio(p->prio);
}
+extern inline struct dl_bw *dl_bw_of(int i);
+
#endif /* _SCHED_DEADLINE_H */
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 33fbf32..444a257 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -86,10 +86,8 @@ extern int sysctl_sched_rt_runtime;
/*
* control SCHED_DEADLINE reservations:
*
- * /proc/sys/kernel/sched_dl_period_us
* /proc/sys/kernel/sched_dl_runtime_us
*/
-extern unsigned int sysctl_sched_dl_period;
extern int sysctl_sched_dl_runtime;
#ifdef CONFIG_CFS_BANDWIDTH
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d8a8622..08500e0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -297,13 +297,12 @@ __read_mostly int scheduler_running;
int sysctl_sched_rt_runtime = 950000;
/*
- * Maximum bandwidth available for all -deadline tasks and groups
- * (if group scheduling is configured) on each CPU.
+ * Sub-quota of rt bandwidth available for all -deadline tasks
+ * on each CPU.
*
- * default: 5%
+ * default: 40%
*/
-unsigned int sysctl_sched_dl_period = 1000000;
-int sysctl_sched_dl_runtime = 50000;
+int sysctl_sched_dl_runtime = 400000;
@@ -1770,6 +1769,28 @@ unsigned long to_ratio(u64 period, u64 runtime)
return div64_u64(runtime << 20, period);
}
+#ifdef CONFIG_SMP
+inline struct dl_bw *dl_bw_of(int i)
+{
+ return &cpu_rq(i)->rd->dl_bw;
+}
+
+static inline int __dl_span_weight(struct rq *rq)
+{
+ return cpumask_weight(rq->rd->span);
+}
+#else
+inline struct dl_bw *dl_bw_of(int i)
+{
+ return &cpu_rq(i)->dl.dl_bw;
+}
+
+static inline int __dl_span_weight(struct rq *rq)
+{
+ return 1;
+}
+#endif
+
static inline
void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
{
@@ -1800,19 +1821,11 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
static int dl_overflow(struct task_struct *p, int policy,
const struct sched_param2 *param2)
{
-#ifdef CONFIG_SMP
- struct dl_bw *dl_b = &task_rq(p)->rd->dl_bw;
-#else
- struct dl_bw *dl_b = &task_rq(p)->dl.dl_bw;
-#endif
+ struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
u64 period = param2->sched_period;
u64 runtime = param2->sched_runtime;
u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
-#ifdef CONFIG_SMP
- int cpus = cpumask_weight(task_rq(p)->rd->span);
-#else
- int cpus = 1;
-#endif
+ int cpus = __dl_span_weight(task_rq(p));
int err = -1;
if (new_bw == p->dl.dl_bw)
@@ -4685,8 +4698,8 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
static inline
bool set_task_cpu_dl(struct task_struct *p, unsigned int cpu)
{
- struct dl_bw *dl_b = &task_rq(p)->rd->dl_bw;
- struct dl_bw *cpu_b = &cpu_rq(cpu)->rd->dl_bw;
+ struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+ struct dl_bw *cpu_b = dl_bw_of(cpu);
int ret = 1;
u64 bw;
@@ -6815,6 +6828,8 @@ LIST_HEAD(task_groups);
DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
+static u64 actual_dl_runtime(void);
+
void __init sched_init(void)
{
int i, j;
@@ -6856,15 +6871,14 @@ void __init sched_init(void)
#endif /* CONFIG_CPUMASK_OFFSTACK */
}
+ init_rt_bandwidth(&def_rt_bandwidth,
+ global_rt_period(), global_rt_runtime());
+ init_dl_bandwidth(&def_dl_bandwidth, actual_dl_runtime());
+
#ifdef CONFIG_SMP
init_defrootdomain();
#endif
- init_rt_bandwidth(&def_rt_bandwidth,
- global_rt_period(), global_rt_runtime());
- init_dl_bandwidth(&def_dl_bandwidth,
- global_dl_period(), global_dl_runtime());
-
#ifdef CONFIG_RT_GROUP_SCHED
init_rt_bandwidth(&root_task_group.rt_bandwidth,
global_rt_period(), global_rt_runtime());
@@ -7263,6 +7277,86 @@ void sched_move_task(struct task_struct *tsk)
}
#endif /* CONFIG_CGROUP_SCHED */
+static u64 actual_dl_runtime(void)
+{
+ u64 dl_runtime = global_dl_runtime();
+ u64 rt_runtime = global_rt_runtime();
+ u64 period = global_rt_period();
+
+ /*
+ * We want to calculate the sub-quota of rt_bw actually available
+ * for -dl tasks. It is a percentage of percentage. By default 95%
+ * of system bandwidth is allocated to -rt tasks; among this, a 40%
+ * quota is reserved for -dl tasks. To have the actual quota a simple
+ * multiplication is needed: .95 * .40 = .38 (38% of system bandwidth
+ * for deadline tasks).
+ * What follows is basically the same, but using unsigned integers.
+ *
+ * dl_runtime rt_runtime
+ * actual_runtime = ---------- * ---------- * period
+ * period period
+ */
+ if (dl_runtime == RUNTIME_INF)
+ return RUNTIME_INF;
+
+ return div64_u64 (dl_runtime * rt_runtime, period);
+}
+
+static int check_dl_bw(void)
+{
+ int i;
+ u64 period = global_rt_period();
+ u64 dl_actual_runtime = def_dl_bandwidth.dl_runtime;
+ u64 new_bw = to_ratio(period, dl_actual_runtime);
+
+ /*
+ * Here we want to check the bandwidth not being set to some
+ * value smaller than the currently allocated bandwidth in
+ * any of the root_domains.
+ *
+ * FIXME: Cycling on all the CPUs is overdoing, but simpler than
+ * cycling on root_domains... Discussion on different/better
+ * solutions is welcome!
+ */
+ for_each_possible_cpu(i) {
+ struct dl_bw *dl_b = dl_bw_of(i);
+
+ raw_spin_lock(&dl_b->lock);
+ if (new_bw < dl_b->total_bw) {
+ raw_spin_unlock(&dl_b->lock);
+ return -EBUSY;
+ }
+ raw_spin_unlock(&dl_b->lock);
+ }
+
+ return 0;
+}
+
+static void update_dl_bw(void)
+{
+ u64 new_bw;
+ int i;
+
+ def_dl_bandwidth.dl_runtime = actual_dl_runtime();
+ if (def_dl_bandwidth.dl_runtime == RUNTIME_INF ||
+ global_rt_runtime() == RUNTIME_INF)
+ new_bw = ULLONG_MAX;
+ else {
+ new_bw = to_ratio(global_rt_period(),
+ def_dl_bandwidth.dl_runtime);
+ }
+ /*
+ * FIXME: As above...
+ */
+ for_each_possible_cpu(i) {
+ struct dl_bw *dl_b = dl_bw_of(i);
+
+ raw_spin_lock(&dl_b->lock);
+ dl_b->bw = new_bw;
+ raw_spin_unlock(&dl_b->lock);
+ }
+}
+
#ifdef CONFIG_RT_GROUP_SCHED
/*
* Ensure that the real time constraints are schedulable.
@@ -7436,48 +7530,10 @@ static long sched_group_rt_period(struct task_group *tg)
do_div(rt_period_us, NSEC_PER_USEC);
return rt_period_us;
}
-#endif /* CONFIG_RT_GROUP_SCHED */
-
-/*
- * Coupling of -rt and -deadline bandwidth.
- *
- * Here we check if the new -rt bandwidth value is consistent
- * with the system settings for the bandwidth available
- * to -deadline tasks.
- *
- * IOW, we want to enforce that
- *
- * rt_bandwidth + dl_bandwidth <= 100%
- *
- * is always true.
- */
-static bool __sched_rt_dl_global_constraints(u64 rt_bw)
-{
- unsigned long flags;
- u64 dl_bw;
- bool ret;
-
- raw_spin_lock_irqsave(&def_dl_bandwidth.dl_runtime_lock, flags);
- if (global_rt_runtime() == RUNTIME_INF ||
- global_dl_runtime() == RUNTIME_INF) {
- ret = true;
- goto unlock;
- }
-
- dl_bw = to_ratio(def_dl_bandwidth.dl_period,
- def_dl_bandwidth.dl_runtime);
-
- ret = rt_bw + dl_bw <= to_ratio(RUNTIME_INF, RUNTIME_INF);
-unlock:
- raw_spin_unlock_irqrestore(&def_dl_bandwidth.dl_runtime_lock, flags);
-
- return ret;
-}
-#ifdef CONFIG_RT_GROUP_SCHED
static int sched_rt_global_constraints(void)
{
- u64 runtime, period, bw;
+ u64 runtime, period;
int ret = 0;
if (sysctl_sched_rt_period <= 0)
@@ -7492,9 +7548,13 @@ static int sched_rt_global_constraints(void)
if (runtime > period && runtime != RUNTIME_INF)
return -EINVAL;
- bw = to_ratio(period, runtime);
- if (!__sched_rt_dl_global_constraints(bw))
- return -EINVAL;
+ /*
+ * Check if changing rt_bw could have negative effects
+ * on dl_bw
+ */
+ ret = check_dl_bw();
+ if (ret)
+ return ret;
mutex_lock(&rt_constraints_mutex);
read_lock(&tasklist_lock);
@@ -7518,18 +7578,27 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
static int sched_rt_global_constraints(void)
{
unsigned long flags;
- int i, ret = 0;
- u64 bw;
+ int i, ret;
if (sysctl_sched_rt_period <= 0)
return -EINVAL;
+ /*
+ * There's always some RT tasks in the root group
+ * -- migration, kstopmachine etc..
+ */
+ if (sysctl_sched_rt_runtime == 0)
+ return -EBUSY;
+
+ /*
+ * Check if changing rt_bw could have negative effects
+ * on dl_bw
+ */
+ ret = check_dl_bw();
+ if (ret)
+ return ret;
+
raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
- bw = to_ratio(global_rt_period(), global_rt_runtime());
- if (!__sched_rt_dl_global_constraints(bw)) {
- ret = -EINVAL;
- goto unlock;
- }
for_each_possible_cpu(i) {
struct rt_rq *rt_rq = &cpu_rq(i)->rt;
@@ -7538,48 +7607,12 @@ static int sched_rt_global_constraints(void)
rt_rq->rt_runtime = global_rt_runtime();
raw_spin_unlock(&rt_rq->rt_runtime_lock);
}
-unlock:
raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
- return ret;
+ return 0;
}
#endif /* CONFIG_RT_GROUP_SCHED */
-/*
- * Coupling of -dl and -rt bandwidth.
- *
- * Here we check, while setting the system wide bandwidth available
- * for -dl tasks and groups, if the new values are consistent with
- * the system settings for the bandwidth available to -rt entities.
- *
- * IOW, we want to enforce that
- *
- * rt_bandwidth + dl_bandwidth <= 100%
- *
- * is always true.
- */
-static bool __sched_dl_rt_global_constraints(u64 dl_bw)
-{
- u64 rt_bw;
- bool ret;
-
- raw_spin_lock(&def_rt_bandwidth.rt_runtime_lock);
- if (global_dl_runtime() == RUNTIME_INF ||
- global_rt_runtime() == RUNTIME_INF) {
- ret = true;
- goto unlock;
- }
-
- rt_bw = to_ratio(ktime_to_ns(def_rt_bandwidth.rt_period),
- def_rt_bandwidth.rt_runtime);
-
- ret = rt_bw + dl_bw <= to_ratio(RUNTIME_INF, RUNTIME_INF);
-unlock:
- raw_spin_unlock(&def_rt_bandwidth.rt_runtime_lock);
-
- return ret;
-}
-
static bool __sched_dl_global_constraints(u64 runtime, u64 period)
{
if (!period || (runtime != RUNTIME_INF && runtime > period))
@@ -7590,40 +7623,17 @@ static bool __sched_dl_global_constraints(u64 runtime, u64 period)
static int sched_dl_global_constraints(void)
{
- u64 runtime = global_dl_runtime();
- u64 period = global_dl_period();
- u64 new_bw = to_ratio(period, runtime);
- int ret, i;
+ u64 period = global_rt_period();
+ u64 dl_actual_runtime = def_dl_bandwidth.dl_runtime;
+ int ret;
- ret = __sched_dl_global_constraints(runtime, period);
+ ret = __sched_dl_global_constraints(dl_actual_runtime, period);
if (ret)
return ret;
- if (!__sched_dl_rt_global_constraints(new_bw))
- return -EINVAL;
-
- /*
- * Here we want to check the bandwidth not being set to some
- * value smaller than the currently allocated bandwidth in
- * any of the root_domains.
- *
- * FIXME: Cycling on all the CPUs is overdoing, but simpler than
- * cycling on root_domains... Discussion on different/better
- * solutions is welcome!
- */
- for_each_possible_cpu(i) {
-#ifdef CONFIG_SMP
- struct dl_bw *dl_b = &cpu_rq(i)->rd->dl_bw;
-#else
- struct dl_bw *dl_b = &cpu_rq(i)->dl.dl_bw;
-#endif
- raw_spin_lock(&dl_b->lock);
- if (new_bw < dl_b->total_bw) {
- raw_spin_unlock(&dl_b->lock);
- return -EBUSY;
- }
- raw_spin_unlock(&dl_b->lock);
- }
+ ret = check_dl_bw();
+ if (ret)
+ return ret;
return 0;
}
@@ -7654,6 +7664,7 @@ int sched_rt_handler(struct ctl_table *table, int write,
int ret;
int old_period, old_runtime;
static DEFINE_MUTEX(mutex);
+ unsigned long flags;
mutex_lock(&mutex);
old_period = sysctl_sched_rt_period;
@@ -7663,6 +7674,8 @@ int sched_rt_handler(struct ctl_table *table, int write,
if (!ret && write) {
ret = sched_rt_global_constraints();
+ raw_spin_lock_irqsave(&def_dl_bandwidth.dl_runtime_lock,
+ flags);
if (ret) {
sysctl_sched_rt_period = old_period;
sysctl_sched_rt_runtime = old_runtime;
@@ -7670,7 +7683,11 @@ int sched_rt_handler(struct ctl_table *table, int write,
def_rt_bandwidth.rt_runtime = global_rt_runtime();
def_rt_bandwidth.rt_period =
ns_to_ktime(global_rt_period());
+
+ update_dl_bw();
}
+ raw_spin_unlock_irqrestore(&def_dl_bandwidth.dl_runtime_lock,
+ flags);
}
mutex_unlock(&mutex);
@@ -7682,12 +7699,11 @@ int sched_dl_handler(struct ctl_table *table, int write,
loff_t *ppos)
{
int ret;
- int old_period, old_runtime;
+ int old_runtime;
static DEFINE_MUTEX(mutex);
unsigned long flags;
mutex_lock(&mutex);
- old_period = sysctl_sched_dl_period;
old_runtime = sysctl_sched_dl_runtime;
ret = proc_dointvec(table, write, buffer, lenp, ppos);
@@ -7698,33 +7714,9 @@ int sched_dl_handler(struct ctl_table *table, int write,
ret = sched_dl_global_constraints();
if (ret) {
- sysctl_sched_dl_period = old_period;
sysctl_sched_dl_runtime = old_runtime;
} else {
- u64 new_bw;
- int i;
-
- def_dl_bandwidth.dl_period = global_dl_period();
- def_dl_bandwidth.dl_runtime = global_dl_runtime();
- if (global_dl_runtime() == RUNTIME_INF)
- new_bw = -1;
- else
- new_bw = to_ratio(global_dl_period(),
- global_dl_runtime());
- /*
- * FIXME: As above...
- */
- for_each_possible_cpu(i) {
-#ifdef CONFIG_SMP
- struct dl_bw *dl_b = &cpu_rq(i)->rd->dl_bw;
-#else
- struct dl_bw *dl_b = &cpu_rq(i)->dl.dl_bw;
-#endif
-
- raw_spin_lock(&dl_b->lock);
- dl_b->bw = new_bw;
- raw_spin_unlock(&dl_b->lock);
- }
+ update_dl_bw();
}
raw_spin_unlock_irqrestore(&def_dl_bandwidth.dl_runtime_lock,
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index fe39d7e..be84cb3 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -48,10 +48,9 @@ static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
return dl_rq->rb_leftmost == &dl_se->rb_node;
}
-void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
+void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 runtime)
{
raw_spin_lock_init(&dl_b->dl_runtime_lock);
- dl_b->dl_period = period;
dl_b->dl_runtime = runtime;
}
@@ -62,9 +61,9 @@ void init_dl_bw(struct dl_bw *dl_b)
raw_spin_lock_init(&dl_b->lock);
raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock);
if (global_dl_runtime() == RUNTIME_INF)
- dl_b->bw = -1;
+ dl_b->bw = ULLONG_MAX;
else
- dl_b->bw = to_ratio(global_dl_period(), global_dl_runtime());
+ dl_b->bw = to_ratio(global_rt_period(), global_dl_runtime());
raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock);
dl_b->total_bw = 0;
}
@@ -1061,11 +1060,7 @@ static void task_fork_dl(struct task_struct *p)
static void task_dead_dl(struct task_struct *p)
{
struct hrtimer *timer = &p->dl.dl_timer;
-#ifdef CONFIG_SMP
- struct dl_bw *dl_b = &task_rq(p)->rd->dl_bw;
-#else
- struct dl_bw *dl_b = &task_rq(p)->dl.dl_bw;
-#endif
+ struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
/*
* Since we are TASK_DEAD we won't slip out of the domain!
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 08931a6..bf4acf8 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -142,20 +142,20 @@ struct rt_bandwidth {
struct hrtimer rt_period_timer;
};
/*
- * To keep the bandwidth of -deadline tasks and groups under control
- * we need some place where:
- * - store the maximum -deadline bandwidth of the system (the group);
+ * To keep the bandwidth of -deadline tasks under control we need some
+ * place where:
+ * - store the maximum -deadline bandwidth of the system;
* - cache the fraction of that bandwidth that is currently allocated.
*
* This is all done in the data structure below. It is similar to the
* one used for RT-throttling (rt_bandwidth), with the main difference
* that, since here we are only interested in admission control, we
- * do not decrease any runtime while the group "executes", neither we
+ * do not decrease any runtime while the task "executes", neither we
* need a timer to replenish it.
*
* With respect to SMP, the bandwidth is given on a per-CPU basis,
* meaning that:
- * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
+ * - dl_bw (< 100%) is the bandwidth of the system on each CPU;
* - dl_total_bw array contains, in the i-eth element, the currently
* allocated bandwidth on the i-eth CPU.
* Moreover, groups consume bandwidth on each CPU, while tasks only
@@ -168,7 +168,6 @@ struct rt_bandwidth {
struct dl_bandwidth {
raw_spinlock_t dl_runtime_lock;
u64 dl_runtime;
- u64 dl_period;
};
static inline int dl_bandwidth_enabled(void)
@@ -178,10 +177,12 @@ static inline int dl_bandwidth_enabled(void)
struct dl_bw {
raw_spinlock_t lock;
- u64 bw, total_bw;
+ /* default value */
+ u64 bw;
+ /* allocated */
+ u64 total_bw;
};
-static inline u64 global_dl_period(void);
static inline u64 global_dl_runtime(void);
extern struct mutex sched_domains_mutex;
@@ -924,11 +925,6 @@ static inline u64 global_rt_runtime(void)
return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
}
-static inline u64 global_dl_period(void)
-{
- return (u64)sysctl_sched_dl_period * NSEC_PER_USEC;
-}
-
static inline u64 global_dl_runtime(void)
{
if (sysctl_sched_dl_runtime < 0)
@@ -1192,7 +1188,7 @@ extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
extern struct dl_bandwidth def_dl_bandwidth;
-extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
+extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 runtime);
extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
unsigned long to_ratio(u64 period, u64 runtime);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 8492cbb..39167fc 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -415,13 +415,6 @@ static struct ctl_table kern_table[] = {
.proc_handler = sched_rr_handler,
},
{
- .procname = "sched_dl_period_us",
- .data = &sysctl_sched_dl_period,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = sched_dl_handler,
- },
- {
.procname = "sched_dl_runtime_us",
.data = &sysctl_sched_dl_runtime,
.maxlen = sizeof(int),
--
1.7.9.5
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/