[RFC PATCH 2/3] sched/deadline: Hierarchical scheduling with DL on top of RT

From: Alessio Balsini
Date: Fri Mar 31 2017 - 14:45:17 EST


The runtime of RT tasks controlled by CGroups is enforced by the
SCHED_DEADLINE scheduling class, based on the runtime and period (the
deadline is set equal to the period) parameters.

A sched_dl_entity may also represent a group of RT tasks, providing an rt_rq.

Signed-off-by: Andrea Parri <parri.andrea@xxxxxxxxx>
Signed-off-by: Luca Abeni <luca.abeni@xxxxxxxxxxxxxxx>
Cc: Tommaso Cucinotta <tommaso.cucinotta@xxxxxxxx>
Cc: Juri Lelli <juri.lelli@xxxxxxx>
Cc: Daniel Bristot de Oliveira <bristot@xxxxxxxxxx>
Cc: Steven Rostedt <rostedt@xxxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Signed-off-by: Alessio Balsini <a.balsini@xxxxxxxx>
---
include/linux/sched.h | 13 +-
kernel/sched/autogroup.c | 4 +-
kernel/sched/core.c | 86 ++++++++--
kernel/sched/deadline.c | 174 ++++++++++++++++----
kernel/sched/rt.c | 407 ++++++++++++++++++++++++++++++-----------------
kernel/sched/sched.h | 142 +++++++++++++++--
6 files changed, 611 insertions(+), 215 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index d67eee8..fdd62f5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -402,7 +402,7 @@ struct sched_rt_entity {

struct sched_rt_entity *back;
#ifdef CONFIG_RT_GROUP_SCHED
- struct sched_rt_entity *parent;
+ struct sched_dl_entity *parent;
/* rq on which this entity is (to be) queued: */
struct rt_rq *rt_rq;
/* rq "owned" by this entity/group: */
@@ -455,6 +455,17 @@ struct sched_dl_entity {
* own bandwidth to be enforced, thus we need one timer per task.
*/
struct hrtimer dl_timer;
+
+/*
+ * An instance of a sched_dl_entity may represent a group of tasks, therefore
+ * it requires:
+ * - dl_rq: the rq on which this entity is queued;
+ * - rt_rq: the rq owned by this entity;
+ */
+#ifdef CONFIG_RT_GROUP_SCHED
+ struct dl_rq *dl_rq;
+ struct rt_rq *my_q;
+#endif
};

union rcu_special {
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index da39489..e14acb4 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -30,7 +30,7 @@ static inline void autogroup_destroy(struct kref *kref)

#ifdef CONFIG_RT_GROUP_SCHED
/* We've redirected RT tasks to the root task group... */
- ag->tg->rt_se = NULL;
+ ag->tg->dl_se = NULL;
ag->tg->rt_rq = NULL;
#endif
sched_offline_group(ag->tg);
@@ -88,7 +88,7 @@ static inline struct autogroup *autogroup_create(void)
* the policy change to proceed.
*/
free_rt_sched_group(tg);
- tg->rt_se = root_task_group.rt_se;
+ tg->dl_se = root_task_group.dl_se;
tg->rt_rq = root_task_group.rt_rq;
#endif
tg->autogroup = ag;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3d4cce4..b139719 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -902,6 +902,9 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
const struct sched_class *class;

+ if (is_dl_group(rt_rq_of_se(&p->rt)) && task_has_rt_policy(p))
+ resched_curr(rq);
+
if (p->sched_class == rq->curr->sched_class) {
rq->curr->sched_class->check_preempt_curr(rq, p, flags);
} else {
@@ -2165,6 +2168,9 @@ void __dl_clear_params(struct task_struct *p)

dl_se->dl_throttled = 0;
dl_se->dl_yielded = 0;
+#ifdef CONFIG_RT_GROUP_SCHED
+ dl_se->my_q = NULL;
+#endif
}

/*
@@ -4261,7 +4267,8 @@ static int __sched_setscheduler(struct task_struct *p,
* Do not allow realtime tasks into groups that have no runtime
* assigned.
*/
- if (rt_policy(policy) &&
+ if (dl_bandwidth_enabled() && rt_policy(policy) &&
+ task_group(p)->dl_bandwidth.dl_runtime == 0 &&
!task_group_is_autogroup(task_group(p))) {
task_rq_unlock(rq, p, &rf);
return -EPERM;
@@ -5987,7 +5994,7 @@ void __init sched_init(void)

#endif /* CONFIG_FAIR_GROUP_SCHED */
#ifdef CONFIG_RT_GROUP_SCHED
- root_task_group.rt_se = (struct sched_rt_entity **)ptr;
+ root_task_group.dl_se = (struct sched_dl_entity **)ptr;
ptr += nr_cpu_ids * sizeof(void **);

root_task_group.rt_rq = (struct rt_rq **)ptr;
@@ -6010,6 +6017,10 @@ void __init sched_init(void)
init_defrootdomain();
#endif

+#ifdef CONFIG_RT_GROUP_SCHED
+ init_dl_bandwidth(&root_task_group.dl_bandwidth,
+ global_rt_period(), global_rt_runtime());
+#endif /* CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_CGROUP_SCHED
task_group_cache = KMEM_CACHE(task_group, 0);
@@ -6460,9 +6471,10 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
struct task_group *child;
unsigned long total, sum = 0;
u64 period, runtime;
+ unsigned long flags;

- period = 0;
- runtime = 0;
+ period = tg->dl_bandwidth.dl_period;
+ runtime = tg->dl_bandwidth.dl_runtime;

if (tg == d->tg) {
period = d->rt_period;
@@ -6478,7 +6490,7 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
/*
* Ensure we don't starve existing RT tasks.
*/
- if (!runtime && tg_has_rt_tasks(tg))
+ if (dl_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
return -EBUSY;

total = to_ratio(period, runtime);
@@ -6489,12 +6501,27 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
if (total > to_ratio(global_rt_period(), global_rt_runtime()))
return -EINVAL;

+ if (tg == &root_task_group) {
+ int cpus = num_online_cpus();
+ struct dl_bw *dl_b = dl_bw_of(smp_processor_id());
+
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+
+ if (dl_b->bw != -1 &&
+ dl_b->bw * cpus < dl_b->total_bw + total * cpus) {
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+ return -EBUSY;
+ }
+
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+ }
+
/*
* The sum of our children's runtime should not exceed our own.
*/
list_for_each_entry_rcu(child, &tg->children, siblings) {
- period = 0;
- runtime = 0;
+ period = child->dl_bandwidth.dl_period;
+ runtime = child->dl_bandwidth.dl_runtime;

if (child == d->tg) {
period = d->rt_period;
@@ -6549,6 +6576,33 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
if (err)
goto unlock;

+ raw_spin_lock_irq(&tg->dl_bandwidth.dl_runtime_lock);
+ tg->dl_bandwidth.dl_period = rt_period;
+ tg->dl_bandwidth.dl_runtime = rt_runtime;
+
+ if (tg == &root_task_group)
+ goto unlock_bandwidth;
+
+ for_each_possible_cpu(i) {
+ struct sched_dl_entity *dl_se = tg->dl_se[i];
+ struct rq *rq = container_of(dl_se->dl_rq, struct rq, dl);
+
+ raw_spin_lock_irq(&rq->lock);
+ dl_se->dl_runtime = rt_runtime;
+ dl_se->dl_period = rt_period;
+ dl_se->dl_deadline = dl_se->dl_period;
+ dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+
+ if (!((s64)(rt_period - rt_runtime) >= 0) ||
+ !(rt_runtime >= (2 << (DL_SCALE - 1)))) {
+ raw_spin_unlock_irq(&rq->lock);
+ continue;
+ }
+
+ raw_spin_unlock_irq(&rq->lock);
+ }
+unlock_bandwidth:
+ raw_spin_unlock_irq(&tg->dl_bandwidth.dl_runtime_lock);
unlock:
read_unlock(&tasklist_lock);
mutex_unlock(&rt_constraints_mutex);
@@ -6560,7 +6614,7 @@ static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
{
u64 rt_runtime, rt_period;

- rt_period = 0;
+ rt_period = tg->dl_bandwidth.dl_period;
rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
if (rt_runtime_us < 0)
rt_runtime = RUNTIME_INF;
@@ -6572,10 +6626,10 @@ static long sched_group_rt_runtime(struct task_group *tg)
{
u64 rt_runtime_us;

- if (0 == RUNTIME_INF)
+ if (tg->dl_bandwidth.dl_runtime == RUNTIME_INF)
return -1;

- rt_runtime_us = 0;
+ rt_runtime_us = tg->dl_bandwidth.dl_runtime;
do_div(rt_runtime_us, NSEC_PER_USEC);
return rt_runtime_us;
}
@@ -6585,7 +6639,7 @@ static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
u64 rt_runtime, rt_period;

rt_period = rt_period_us * NSEC_PER_USEC;
- rt_runtime = 0;
+ rt_runtime = tg->dl_bandwidth.dl_runtime;

return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
}
@@ -6594,7 +6648,7 @@ static long sched_group_rt_period(struct task_group *tg)
{
u64 rt_period_us;

- rt_period_us = 0;
+ rt_period_us = tg->dl_bandwidth.dl_period;
do_div(rt_period_us, NSEC_PER_USEC);
return rt_period_us;
}
@@ -6617,7 +6671,7 @@ static int sched_rt_global_constraints(void)
static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
{
/* Don't accept realtime tasks when there is no way for them to run */
- if (rt_task(tsk))
+ if (rt_task(tsk) && tg->dl_bandwidth.dl_runtime == 0)
return 0;

return 1;
@@ -6785,6 +6839,12 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
return &root_task_group.css;
}

+ /* Do not allow cpu_cgroup hierarchies with depth greater than 2. */
+#ifdef CONFIG_RT_GROUP_SCHED
+ if (parent != &root_task_group)
+ return ERR_PTR(-EINVAL);
+#endif
+
tg = sched_create_group(parent);
if (IS_ERR(tg))
return ERR_PTR(-ENOMEM);
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 1af6219..9a1988b 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -20,8 +20,17 @@

struct dl_bandwidth def_dl_bandwidth;

+#ifdef CONFIG_RT_GROUP_SCHED
+#define dl_entity_is_task(dl_se) (!(dl_se)->my_q)
+#define rt_rq_of_dl_entity(dl_se) ((dl_se)->my_q)
+#else
+#define dl_entity_is_task(dl_se) (1)
+#define rt_rq_of_dl_entity(dl_se) (NULL)
+#endif
+
static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
{
+ BUG_ON(!dl_entity_is_task(dl_se));
return container_of(dl_se, struct task_struct, dl);
}

@@ -30,6 +39,14 @@ static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq)
return container_of(dl_rq, struct rq, dl);
}

+#ifdef CONFIG_RT_GROUP_SCHED
+static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
+{
+ return dl_se->dl_rq;
+}
+
+#else /* CONFIG_RT_GROUP_SCHED */
+
static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
{
struct task_struct *p = dl_task_of(dl_se);
@@ -37,6 +54,7 @@ static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)

return &rq->dl;
}
+#endif

static inline int on_dl_rq(struct sched_dl_entity *dl_se)
{
@@ -119,7 +137,11 @@ static inline void dl_clear_overload(struct rq *rq)

static void update_dl_migration(struct dl_rq *dl_rq)
{
+#ifdef CONFIG_RT_GROUP_SCHED
+ if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_total > 1) {
+#else
if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) {
+#endif
if (!dl_rq->overloaded) {
dl_set_overload(rq_of_dl_rq(dl_rq));
dl_rq->overloaded = 1;
@@ -520,11 +542,11 @@ static inline u64 dl_next_period(struct sched_dl_entity *dl_se)
* actually started or not (i.e., the replenishment instant is in
* the future or in the past).
*/
-static int start_dl_timer(struct task_struct *p)
+int start_dl_timer(struct sched_dl_entity *dl_se)
{
- struct sched_dl_entity *dl_se = &p->dl;
struct hrtimer *timer = &dl_se->dl_timer;
- struct rq *rq = task_rq(p);
+ struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
+ struct rq *rq = rq_of_dl_rq(dl_rq);
ktime_t now, act;
s64 delta;

@@ -558,7 +580,11 @@ static int start_dl_timer(struct task_struct *p)
* and observe our state.
*/
if (!hrtimer_is_queued(timer)) {
- get_task_struct(p);
+ if (dl_entity_is_task(dl_se)) {
+ struct task_struct *p = dl_task_of(dl_se);
+
+ get_task_struct(p);
+ }
hrtimer_start(timer, act, HRTIMER_MODE_ABS);
}

@@ -583,10 +609,43 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
struct sched_dl_entity *dl_se = container_of(timer,
struct sched_dl_entity,
dl_timer);
- struct task_struct *p = dl_task_of(dl_se);
+ struct task_struct *p;
struct rq_flags rf;
struct rq *rq;

+#ifdef CONFIG_RT_GROUP_SCHED
+ /* Replenish dl group and check for preemption. */
+ if (!dl_entity_is_task(dl_se)) {
+ struct rt_rq *rt_rq = rt_rq_of_dl_entity(dl_se);
+
+ rq = rq_of_dl_rq(dl_rq_of_se(dl_se));
+
+ raw_spin_lock(&rq->lock);
+
+
+ sched_clock_tick();
+ update_rq_clock(rq);
+
+ dl_se->dl_throttled = 0;
+ if (rt_rq->rt_nr_running) {
+ enqueue_dl_entity(dl_se, dl_se, ENQUEUE_REPLENISH);
+
+ resched_curr(rq);
+#ifdef CONFIG_SMP
+ if (has_pushable_dl_tasks(rq))
+ push_dl_task(rq);
+#endif
+ } else {
+ replenish_dl_entity(dl_se, dl_se);
+ }
+
+ raw_spin_unlock(&rq->lock);
+
+ return HRTIMER_NORESTART;
+ }
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+ p = dl_task_of(dl_se);
rq = task_rq_lock(p, &rf);

/*
@@ -720,13 +779,12 @@ static inline void dl_check_constrained_dl(struct sched_dl_entity *dl_se)

if (dl_time_before(dl_se->deadline, rq_clock(rq)) &&
dl_time_before(rq_clock(rq), dl_next_period(dl_se))) {
- if (unlikely(dl_se->dl_boosted || !start_dl_timer(p)))
+ if (unlikely(dl_se->dl_boosted || !start_dl_timer(&p->dl)))
return;
dl_se->dl_throttled = 1;
}
}

-static
int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
{
return (dl_se->runtime <= 0);
@@ -780,7 +838,7 @@ static void update_curr_dl(struct rq *rq)
if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) {
dl_se->dl_throttled = 1;
__dequeue_task_dl(rq, curr, 0);
- if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr)))
+ if (unlikely(dl_se->dl_boosted || !start_dl_timer(&curr->dl)))
enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);

if (!is_leftmost(curr, &rq->dl))
@@ -833,29 +891,39 @@ static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
static inline
void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
- int prio = dl_task_of(dl_se)->prio;
u64 deadline = dl_se->deadline;

- WARN_ON(!dl_prio(prio));
- dl_rq->dl_nr_running++;
- add_nr_running(rq_of_dl_rq(dl_rq), 1);
+ if (dl_entity_is_task(dl_se)) {
+ dl_rq->dl_nr_running++;
+ add_nr_running(rq_of_dl_rq(dl_rq), 1);
+ inc_dl_migration(dl_se, dl_rq);
+ } else {
+ struct rt_rq *rt_rq = rt_rq_of_dl_entity(dl_se);
+
+ add_nr_running(rq_of_dl_rq(dl_rq), rt_rq->rt_nr_running);
+ }

inc_dl_deadline(dl_rq, deadline);
- inc_dl_migration(dl_se, dl_rq);
}

static inline
void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
{
- int prio = dl_task_of(dl_se)->prio;
+#ifdef CONFIG_RT_GROUP_SCHED
+ WARN_ON(!dl_rq->dl_nr_total);
+#endif

- WARN_ON(!dl_prio(prio));
- WARN_ON(!dl_rq->dl_nr_running);
- dl_rq->dl_nr_running--;
- sub_nr_running(rq_of_dl_rq(dl_rq), 1);
+ if (dl_entity_is_task(dl_se)) {
+ dl_rq->dl_nr_running--;
+ sub_nr_running(rq_of_dl_rq(dl_rq), 1);
+ dec_dl_migration(dl_se, dl_rq);
+ } else {
+ struct rt_rq *rt_rq = rt_rq_of_dl_entity(dl_se);
+
+ sub_nr_running(rq_of_dl_rq(dl_rq), rt_rq->rt_nr_running);
+ }

dec_dl_deadline(dl_rq, dl_se->deadline);
- dec_dl_migration(dl_se, dl_rq);
}

static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
@@ -884,7 +952,9 @@ static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)

rb_link_node(&dl_se->rb_node, parent, link);
rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root);
-
+#ifdef CONFIG_RT_GROUP_SCHED
+ dl_rq->dl_nr_total++;
+#endif
inc_dl_tasks(dl_se, dl_rq);
}

@@ -906,9 +976,12 @@ static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
RB_CLEAR_NODE(&dl_se->rb_node);

dec_dl_tasks(dl_se, dl_rq);
+#ifdef CONFIG_RT_GROUP_SCHED
+ dl_rq->dl_nr_total--;
+#endif
}

-static void
+void
enqueue_dl_entity(struct sched_dl_entity *dl_se,
struct sched_dl_entity *pi_se, int flags)
{
@@ -927,7 +1000,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
__enqueue_dl_entity(dl_se);
}

-static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
+void dequeue_dl_entity(struct sched_dl_entity *dl_se)
{
__dequeue_dl_entity(dl_se);
}
@@ -1120,12 +1193,12 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
}

#ifdef CONFIG_SCHED_HRTICK
-static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
+void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se)
{
- hrtick_start(rq, p->dl.runtime);
+ hrtick_start(rq, dl_se->runtime);
}
#else /* !CONFIG_SCHED_HRTICK */
-static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
+void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se)
{
}
#endif
@@ -1176,14 +1249,35 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
if (prev->sched_class == &dl_sched_class)
update_curr_dl(rq);

+#ifdef CONFIG_RT_GROUP_SCHED
+ if (unlikely(!dl_rq->dl_nr_total))
+ return NULL;
+#else
if (unlikely(!dl_rq->dl_nr_running))
return NULL;
-
- put_prev_task(rq, prev);
+#endif

dl_se = pick_next_dl_entity(rq, dl_rq);
BUG_ON(!dl_se);

+ put_prev_task(rq, prev);
+
+ if (!dl_entity_is_task(dl_se)) {
+ struct rt_rq *rt_rq = rt_rq_of_dl_entity(dl_se);
+ struct sched_rt_entity *rt_se;
+
+ rt_se = pick_next_rt_entity(rq, rt_rq);
+ p = container_of(rt_se, struct task_struct, rt);
+ p->se.exec_start = rq_clock_task(rq);
+
+ dequeue_pushable_task(rt_rq_of_se(&p->rt), p);
+
+ if (hrtick_enabled(rq))
+ start_hrtick_dl(rq, dl_se);
+
+ return p;
+ }
+
p = dl_task_of(dl_se);
p->se.exec_start = rq_clock_task(rq);

@@ -1191,7 +1285,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
dequeue_pushable_dl_task(rq, p);

if (hrtick_enabled(rq))
- start_hrtick_dl(rq, p);
+ start_hrtick_dl(rq, &p->dl);

queue_push_tasks(rq);

@@ -1217,7 +1311,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
*/
if (hrtick_enabled(rq) && queued && p->dl.runtime > 0 &&
is_leftmost(p, &rq->dl))
- start_hrtick_dl(rq, p);
+ start_hrtick_dl(rq, &p->dl);
}

static void task_fork_dl(struct task_struct *p)
@@ -1632,14 +1726,21 @@ static void pull_dl_task(struct rq *this_rq)
*/
static void task_woken_dl(struct rq *rq, struct task_struct *p)
{
- if (!task_running(rq, p) &&
- !test_tsk_need_resched(rq->curr) &&
- p->nr_cpus_allowed > 1 &&
- dl_task(rq->curr) &&
+ if (task_running(rq, p) ||
+ test_tsk_need_resched(rq->curr) ||
+ p->nr_cpus_allowed <= 1)
+ return;
+
+ if (dl_task(rq->curr) &&
(rq->curr->nr_cpus_allowed < 2 ||
!dl_entity_preempt(&p->dl, &rq->curr->dl))) {
push_dl_tasks(rq);
}
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ if (rt_task(rq->curr) && is_dl_group(rq->curr->rt.rt_rq))
+ push_dl_tasks(rq);
+#endif
}

static void set_cpus_allowed_dl(struct task_struct *p,
@@ -1715,7 +1816,7 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
* SCHED_DEADLINE until the deadline passes, the timer will reset the
* task.
*/
- if (!start_dl_timer(p))
+ if (!start_dl_timer(&p->dl))
__dl_clear_params(p);

/*
@@ -1723,10 +1824,15 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
* this is the right place to try to pull some other one
* from an overloaded cpu, if any.
*/
+#ifdef CONFIG_RT_GROUP_SCHED
+ if (!rq->dl.dl_nr_total)
+ queue_pull_task(rq);
+#else
if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
return;

queue_pull_task(rq);
+#endif
}

/*
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index e72ccb8..f38bd4b 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -46,49 +46,37 @@ void init_rt_rq(struct rt_rq *rt_rq)

#ifdef CONFIG_RT_GROUP_SCHED

-static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
-{
-#ifdef CONFIG_SCHED_DEBUG
- WARN_ON_ONCE(!rt_entity_is_task(rt_se));
-#endif
- return container_of(rt_se, struct task_struct, rt);
-}
-
static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
return rt_rq->rq;
}

-static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
-{
- return rt_se->rt_rq;
-}
-
-static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
-{
- struct rt_rq *rt_rq = rt_se->rt_rq;
-
- return rt_rq->rq;
-}
-
void free_rt_sched_group(struct task_group *tg)
{
+ unsigned long flags;
int i;

for_each_possible_cpu(i) {
if (tg->rt_rq)
kfree(tg->rt_rq[i]);
- if (tg->rt_se)
- kfree(tg->rt_se[i]);
+ if (tg->dl_se) {
+ raw_spin_lock_irqsave(&cpu_rq(i)->lock, flags);
+ if (!tg->dl_se[i]->dl_throttled)
+ dequeue_dl_entity(tg->dl_se[i]);
+ raw_spin_unlock_irqrestore(&cpu_rq(i)->lock, flags);
+
+ hrtimer_cancel(&tg->dl_se[i]->dl_timer);
+ kfree(tg->dl_se[i]);
+ }
}

kfree(tg->rt_rq);
- kfree(tg->rt_se);
+ kfree(tg->dl_se);
}

void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
- struct sched_rt_entity *rt_se, int cpu,
- struct sched_rt_entity *parent)
+ struct sched_dl_entity *dl_se, int cpu,
+ struct sched_dl_entity *parent)
{
struct rq *rq = cpu_rq(cpu);

@@ -97,47 +85,56 @@ void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
rt_rq->tg = tg;

tg->rt_rq[cpu] = rt_rq;
- tg->rt_se[cpu] = rt_se;
+ tg->dl_se[cpu] = dl_se;

- if (!rt_se)
+ if (!dl_se)
return;

- if (!parent)
- rt_se->rt_rq = &rq->rt;
- else
- rt_se->rt_rq = parent->my_q;
-
- rt_se->my_q = rt_rq;
- rt_se->parent = parent;
- INIT_LIST_HEAD(&rt_se->run_list);
+ dl_se->dl_rq = &rq->dl;
+ dl_se->my_q = rt_rq;
+ RB_CLEAR_NODE(&dl_se->rb_node);
}

int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
struct rt_rq *rt_rq;
- struct sched_rt_entity *rt_se;
+ struct sched_dl_entity *dl_se;
int i;

- tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
+ tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL);
if (!tg->rt_rq)
goto err;
- tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
- if (!tg->rt_se)
+ tg->dl_se = kcalloc(nr_cpu_ids, sizeof(dl_se), GFP_KERNEL);
+ if (!tg->dl_se)
goto err;

+ init_dl_bandwidth(&tg->dl_bandwidth,
+ def_dl_bandwidth.dl_period, 0);
+
for_each_possible_cpu(i) {
rt_rq = kzalloc_node(sizeof(struct rt_rq),
GFP_KERNEL, cpu_to_node(i));
if (!rt_rq)
goto err;

- rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
+ dl_se = kzalloc_node(sizeof(struct sched_dl_entity),
GFP_KERNEL, cpu_to_node(i));
- if (!rt_se)
+ if (!dl_se)
goto err_free_rq;

init_rt_rq(rt_rq);
- init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
+ rt_rq->rq = cpu_rq(i);
+
+ init_dl_task_timer(dl_se);
+
+ dl_se->dl_runtime = tg->dl_bandwidth.dl_runtime;
+ dl_se->dl_period = tg->dl_bandwidth.dl_period;
+ dl_se->dl_deadline = dl_se->dl_period;
+ dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+
+ dl_se->dl_throttled = 0;
+
+ init_tg_rt_entry(tg, rt_rq, dl_se, i, parent->dl_se[i]);
}

return 1;
@@ -150,30 +147,11 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)

#else /* CONFIG_RT_GROUP_SCHED */

-static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
-{
- return container_of(rt_se, struct task_struct, rt);
-}
-
static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
return container_of(rt_rq, struct rq, rt);
}

-static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
-{
- struct task_struct *p = rt_task_of(rt_se);
-
- return task_rq(p);
-}
-
-static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
-{
- struct rq *rq = rq_of_rt_se(rt_se);
-
- return &rq->rt;
-}
-
void free_rt_sched_group(struct task_group *tg) { }

int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
@@ -228,7 +206,7 @@ static inline void rt_clear_overload(struct rq *rq)

static void update_rt_migration(struct rt_rq *rt_rq)
{
- if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
+ if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_running > 1) {
if (!rt_rq->overloaded) {
rt_set_overload(rq_of_rt_rq(rt_rq));
rt_rq->overloaded = 1;
@@ -243,8 +221,6 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
struct task_struct *p;

- return;
-
p = rt_task_of(rt_se);

if (p->nr_cpus_allowed > 1)
@@ -258,18 +234,16 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
struct task_struct *p;

p = rt_task_of(rt_se);
- rt_rq = &rq_of_rt_rq(rt_rq)->rt;

- rt_rq->rt_nr_total--;
if (p->nr_cpus_allowed > 1)
rt_rq->rt_nr_migratory--;

update_rt_migration(rt_rq);
}

-static inline int has_pushable_tasks(struct rq *rq)
+static inline int has_pushable_tasks(struct rt_rq *rt_rq)
{
- return !plist_head_empty(&rq->rt.pushable_tasks);
+ return !plist_head_empty(&rt_rq->pushable_tasks);
}

static DEFINE_PER_CPU(struct callback_head, rt_push_head);
@@ -280,7 +254,7 @@ static void pull_rt_task(struct rq *);

static inline void queue_push_tasks(struct rq *rq)
{
- if (!has_pushable_tasks(rq))
+ if (!has_pushable_tasks(&rq->rt))
return;

queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
@@ -291,37 +265,35 @@ static inline void queue_pull_task(struct rq *rq)
queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
}

-static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
+static void enqueue_pushable_task(struct rt_rq *rt_rq, struct task_struct *p)
{
- plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
+ plist_del(&p->pushable_tasks, &rt_rq->pushable_tasks);
plist_node_init(&p->pushable_tasks, p->prio);
- plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
+ plist_add(&p->pushable_tasks, &rt_rq->pushable_tasks);

/* Update the highest prio pushable task */
- if (p->prio < rq->rt.highest_prio.next)
- rq->rt.highest_prio.next = p->prio;
+ if (p->prio < rt_rq->highest_prio.next)
+ rt_rq->highest_prio.next = p->prio;
}

-static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
+#ifdef CONFIG_RT_GROUP_SCHED
+void dequeue_pushable_task(struct rt_rq *rt_rq, struct task_struct *p)
{
- plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
+ plist_del(&p->pushable_tasks, &rt_rq->pushable_tasks);

/* Update the new highest prio pushable task */
- if (has_pushable_tasks(rq)) {
- p = plist_first_entry(&rq->rt.pushable_tasks,
+ if (has_pushable_tasks(rt_rq)) {
+ p = plist_first_entry(&rt_rq->pushable_tasks,
struct task_struct, pushable_tasks);
- rq->rt.highest_prio.next = p->prio;
+ rt_rq->highest_prio.next = p->prio;
} else
- rq->rt.highest_prio.next = MAX_RT_PRIO;
+ rt_rq->highest_prio.next = MAX_RT_PRIO;
}
-
+#endif
#else

-static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
-{
-}
-
-static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
+static inline
+void enqueue_pushable_task(struct rt_rq *rt_rq, struct task_struct *p)
{
}

@@ -347,33 +319,17 @@ static inline void pull_rt_task(struct rq *this_rq)
static inline void queue_push_tasks(struct rq *rq)
{
}
+
+static inline void queue_pull_task(struct rq *rq)
+{
+}
#endif /* CONFIG_SMP */

static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
- return rt_se->on_rq;
+ return !list_empty(&rt_se->run_list);
}

-#ifdef CONFIG_RT_GROUP_SCHED
-
-#define for_each_sched_rt_entity(rt_se) \
- for (; rt_se; rt_se = rt_se->parent)
-
-static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
-static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
-
-#else /* !CONFIG_RT_GROUP_SCHED */
-
-typedef struct rt_rq *rt_rq_iter_t;
-
-#define for_each_rt_rq(rt_rq, iter, rq) \
- for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
-
-#define for_each_sched_rt_entity(rt_se) \
- for (; rt_se; rt_se = NULL)
-
-#endif /* CONFIG_RT_GROUP_SCHED */
-
static inline int rt_se_prio(struct sched_rt_entity *rt_se)
{
return rt_task_of(rt_se)->prio;
@@ -386,6 +342,7 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
static void update_curr_rt(struct rq *rq)
{
struct task_struct *curr = rq->curr;
+ struct rt_rq *rt_rq = rt_rq_of_se(&curr->rt);
u64 delta_exec;

if (curr->sched_class != &rt_sched_class)
@@ -408,6 +365,34 @@ static void update_curr_rt(struct rq *rq)
cpuacct_charge(curr, delta_exec);

sched_rt_avg_update(rq, delta_exec);
+
+ if (!dl_bandwidth_enabled())
+ return;
+
+ if (is_dl_group(rt_rq)) {
+ struct sched_dl_entity *dl_se = dl_group_of(rt_rq);
+
+ if (dl_se->dl_throttled) {
+ resched_curr(rq);
+ return;
+ }
+
+ BUG_ON(rt_rq->rt_nr_running > rq->nr_running);
+ dl_se->runtime -= delta_exec;
+
+ /* A group exhausts the budget. */
+ if (dl_runtime_exceeded(dl_se)) {
+ dequeue_dl_entity(dl_se);
+
+ if (likely(start_dl_timer(dl_se)))
+ dl_se->dl_throttled = 1;
+ else
+ enqueue_dl_entity(dl_se, dl_se,
+ ENQUEUE_REPLENISH);
+
+ resched_curr(rq);
+ }
+ }
}

#if defined CONFIG_SMP
@@ -417,7 +402,7 @@ inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
struct rq *rq = rq_of_rt_rq(rt_rq);

- if (&rq->rt != rt_rq)
+ if (is_dl_group(rt_rq))
return;

if (rq->online && prio < prev_prio)
@@ -429,7 +414,7 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
struct rq *rq = rq_of_rt_rq(rt_rq);

- if (&rq->rt != rt_rq)
+ if (is_dl_group(rt_rq))
return;

if (rq->online && rt_rq->highest_prio.curr != prev_prio)
@@ -445,12 +430,15 @@ void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}

#endif /* CONFIG_SMP */

-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+#if defined(CONFIG_SMP)
static void
inc_rt_prio(struct rt_rq *rt_rq, int prio)
{
int prev_prio = rt_rq->highest_prio.curr;

+ if (is_dl_group(rt_rq))
+ return;
+
if (prio < prev_prio)
rt_rq->highest_prio.curr = prio;

@@ -462,6 +450,9 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio)
{
int prev_prio = rt_rq->highest_prio.curr;

+ if (is_dl_group(rt_rq))
+ return;
+
if (rt_rq->rt_nr_running) {

WARN_ON(prio < prev_prio);
@@ -488,7 +479,7 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio)
static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}

-#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
+#endif /* CONFIG_SMP && !CONFIG_RT_GROUP_SCHED */

static inline
unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
@@ -516,6 +507,16 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);

inc_rt_prio(rt_rq, prio);
+
+ if (is_dl_group(rt_rq)) {
+ struct sched_dl_entity *dl_se = dl_group_of(rt_rq);
+
+ if (!dl_se->dl_throttled)
+ add_nr_running(rq_of_rt_rq(rt_rq), 1);
+ } else {
+ add_nr_running(rq_of_rt_rq(rt_rq), 1);
+ }
+
inc_rt_migration(rt_se, rt_rq);
}

@@ -523,11 +524,18 @@ static inline
void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
WARN_ON(!rt_prio(rt_se_prio(rt_se)));
- WARN_ON(!rt_rq->rt_nr_running);
rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);

dec_rt_prio(rt_rq, rt_se_prio(rt_se));
+ if (is_dl_group(rt_rq)) {
+ struct sched_dl_entity *dl_se = dl_group_of(rt_rq);
+
+ if (!dl_se->dl_throttled)
+ sub_nr_running(rq_of_rt_rq(rt_rq), 1);
+ } else {
+ sub_nr_running(rq_of_rt_rq(rt_rq), 1);
+ }
dec_rt_migration(rt_se, rt_rq);
}

@@ -596,24 +604,49 @@ static void
enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
struct sched_rt_entity *rt_se = &p->rt;
+ struct rt_rq *rt_rq = rt_rq_of_se(rt_se);

if (flags & ENQUEUE_WAKEUP)
rt_se->timeout = 0;

- enqueue_rt_entity(rt_se, flags);
+ /* Task arriving in an idle group of tasks. */
+ if (is_dl_group(rt_rq) && (rt_rq->rt_nr_running == 0)) {
+ struct sched_dl_entity *dl_se = dl_group_of(rt_rq);
+
+ if (!dl_se->dl_throttled) {
+ enqueue_dl_entity(dl_se, dl_se, flags);
+ resched_curr(rq);
+ }
+ }
+
+ enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);

if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
- enqueue_pushable_task(rq, p);
+ enqueue_pushable_task(rt_rq, p);
}

static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
struct sched_rt_entity *rt_se = &p->rt;
+ struct rt_rq *rt_rq = rt_rq_of_se(rt_se);

update_curr_rt(rq);
dequeue_rt_entity(rt_se, flags);

- dequeue_pushable_task(rq, p);
+ dequeue_pushable_task(rt_rq_of_se(rt_se), p);
+
+ /* Last task of the task group. */
+ if (is_dl_group(rt_rq) && !rt_rq->rt_nr_running) {
+ struct sched_dl_entity *dl_se = dl_group_of(rt_rq);
+
+#ifndef CONFIG_RT_GROUP_SCHED
+ queue_pull_task(rq);
+#endif
+ if (!rt_rq->rt_nr_running) {
+ dequeue_dl_entity(dl_se);
+ resched_curr(rq);
+ }
+ }
}

/*
@@ -639,10 +672,8 @@ static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
struct sched_rt_entity *rt_se = &p->rt;
struct rt_rq *rt_rq;

- for_each_sched_rt_entity(rt_se) {
- rt_rq = rt_rq_of_se(rt_se);
- requeue_rt_entity(rt_rq, rt_se, head);
- }
+ rt_rq = rt_rq_of_se(rt_se);
+ requeue_rt_entity(rt_rq, rt_se, head);
}

static void yield_task_rt(struct rq *rq)
@@ -743,6 +774,30 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
*/
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
{
+ if (is_dl_group(rt_rq_of_se(&p->rt)) &&
+ is_dl_group(rt_rq_of_se(&rq->curr->rt))) {
+ struct sched_dl_entity *dl_se, *curr_dl_se;
+
+ dl_se = dl_group_of(rt_rq_of_se(&p->rt));
+ curr_dl_se = dl_group_of(rt_rq_of_se(&rq->curr->rt));
+
+ if (dl_entity_preempt(dl_se, curr_dl_se)) {
+ resched_curr(rq);
+ return;
+ } else if (!dl_entity_preempt(curr_dl_se, dl_se)) {
+ if (p->prio < rq->curr->prio) {
+ resched_curr(rq);
+ return;
+ }
+ }
+ return;
+ } else if (is_dl_group(rt_rq_of_se(&p->rt))) {
+ resched_curr(rq);
+ return;
+ } else if (is_dl_group(rt_rq_of_se(&rq->curr->rt))) {
+ return;
+ }
+
if (p->prio < rq->curr->prio) {
resched_curr(rq);
return;
@@ -766,7 +821,7 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
#endif
}

-static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
+struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
struct rt_rq *rt_rq)
{
struct rt_prio_array *array = &rt_rq->active;
@@ -831,7 +886,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
if (prev->sched_class == &rt_sched_class)
update_curr_rt(rq);

- if (!rt_rq->rt_queued)
+ if (!rt_rq->rt_nr_running)
return NULL;

put_prev_task(rq, prev);
@@ -839,7 +894,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
p = _pick_next_task_rt(rq);

/* The running task is never eligible for pushing */
- dequeue_pushable_task(rq, p);
+ dequeue_pushable_task(rt_rq, p);

queue_push_tasks(rq);

@@ -848,6 +903,8 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)

static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
{
+ struct rt_rq *rt_rq = rt_rq_of_se(&p->rt);
+
update_curr_rt(rq);

/*
@@ -855,7 +912,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
* if it is still active
*/
if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
- enqueue_pushable_task(rq, p);
+ enqueue_pushable_task(rt_rq, p);
+
}

#ifdef CONFIG_SMP
@@ -863,9 +921,9 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
/* Only try algorithms three times */
#define RT_MAX_TRIES 3

-static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
+static int pick_rt_task(struct rt_rq *rt_rq, struct task_struct *p, int cpu)
{
- if (!task_running(rq, p) &&
+ if (!task_running(rq_of_rt_rq(rt_rq), p) &&
cpumask_test_cpu(cpu, &p->cpus_allowed))
return 1;
return 0;
@@ -875,16 +933,17 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
* Return the highest pushable rq's task, which is suitable to be executed
* on the cpu, NULL otherwise
*/
-static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
+static
+struct task_struct *pick_highest_pushable_task(struct rt_rq *rt_rq, int cpu)
{
- struct plist_head *head = &rq->rt.pushable_tasks;
+ struct plist_head *head = &rt_rq->pushable_tasks;
struct task_struct *p;

- if (!has_pushable_tasks(rq))
+ if (!has_pushable_tasks(rt_rq))
return NULL;

plist_for_each_entry(p, head, pushable_tasks) {
- if (pick_rt_task(rq, p, cpu))
+ if (pick_rt_task(rt_rq, p, cpu))
return p;
}

@@ -1024,14 +1083,15 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
return lowest_rq;
}

-static struct task_struct *pick_next_pushable_task(struct rq *rq)
+static struct task_struct *pick_next_pushable_task(struct rt_rq *rt_rq)
{
+ struct rq *rq = rq_of_rt_rq(rt_rq);
struct task_struct *p;

- if (!has_pushable_tasks(rq))
+ if (!has_pushable_tasks(rt_rq))
return NULL;

- p = plist_first_entry(&rq->rt.pushable_tasks,
+ p = plist_first_entry(&rt_rq->pushable_tasks,
struct task_struct, pushable_tasks);

BUG_ON(rq->cpu != task_cpu(p));
@@ -1058,7 +1118,7 @@ static int push_rt_task(struct rq *rq)
if (!rq->rt.overloaded)
return 0;

- next_task = pick_next_pushable_task(rq);
+ next_task = pick_next_pushable_task(&rq->rt);
if (!next_task)
return 0;

@@ -1093,7 +1153,7 @@ static int push_rt_task(struct rq *rq)
* run-queue and is also still the next task eligible for
* pushing.
*/
- task = pick_next_pushable_task(rq);
+ task = pick_next_pushable_task(&rq->rt);
if (task_cpu(next_task) == rq->cpu && task == next_task) {
/*
* The task hasn't migrated, and is still the next
@@ -1331,7 +1391,7 @@ static void try_to_push_tasks(void *arg)
src_rq = rq_of_rt_rq(rt_rq);

again:
- if (has_pushable_tasks(rq)) {
+ if (has_pushable_tasks(&rq->rt)) {
raw_spin_lock(&rq->lock);
push_rt_task(rq);
raw_spin_unlock(&rq->lock);
@@ -1382,6 +1442,7 @@ static void pull_rt_task(struct rq *this_rq)
int this_cpu = this_rq->cpu, cpu;
bool resched = false;
struct task_struct *p;
+ struct rt_rq *src_rt_rq;
struct rq *src_rq;

if (likely(!rt_overloaded(this_rq)))
@@ -1405,6 +1466,7 @@ static void pull_rt_task(struct rq *this_rq)
continue;

src_rq = cpu_rq(cpu);
+ src_rt_rq = &src_rq->rt;

/*
* Don't bother taking the src_rq->lock if the next highest
@@ -1413,7 +1475,7 @@ static void pull_rt_task(struct rq *this_rq)
* logically higher, the src_rq will push this task away.
* And if its going logically lower, we do not care
*/
- if (src_rq->rt.highest_prio.next >=
+ if (src_rt_rq->highest_prio.next >=
this_rq->rt.highest_prio.curr)
continue;

@@ -1428,7 +1490,7 @@ static void pull_rt_task(struct rq *this_rq)
* We can pull only a task, which is pushable
* on its rq, and no others.
*/
- p = pick_highest_pushable_task(src_rq, this_cpu);
+ p = pick_highest_pushable_task(src_rt_rq, this_cpu);

/*
* Do we have an RT task that preempts
@@ -1469,19 +1531,44 @@ static void pull_rt_task(struct rq *this_rq)
resched_curr(this_rq);
}

+#ifdef CONFIG_RT_GROUP_SCHED
+int group_push_rt_task(struct rt_rq *rt_rq)
+{
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+
+ if (is_dl_group(rt_rq))
+ return 0;
+
+ return push_rt_task(rq);
+}
+
+void group_push_rt_tasks(struct rt_rq *rt_rq)
+{
+ while (group_push_rt_task(rt_rq))
+ ;
+}
+#else
+void group_push_rt_tasks(struct rt_rq *rt_rq)
+{
+ push_rt_tasks(rq_of_rt_rq(rt_rq));
+}
+#endif
+
/*
* If we are not running and we are not going to reschedule soon, we should
* try to push tasks away now
*/
static void task_woken_rt(struct rq *rq, struct task_struct *p)
{
+ struct rt_rq *rt_rq = rt_rq_of_se(&p->rt);
+
if (!task_running(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
p->nr_cpus_allowed > 1 &&
(dl_task(rq->curr) || rt_task(rq->curr)) &&
(rq->curr->nr_cpus_allowed < 2 ||
rq->curr->prio <= p->prio))
- push_rt_tasks(rq);
+ group_push_rt_tasks(rt_rq);
}

/* Assumes rq->lock is held */
@@ -1508,6 +1595,8 @@ static void rq_offline_rt(struct rq *rq)
*/
static void switched_from_rt(struct rq *rq, struct task_struct *p)
{
+ struct rt_rq *rt_rq = rt_rq_of_se(&p->rt);
+
/*
* If there are other RT tasks then we will reschedule
* and the scheduling of the other RT tasks will handle
@@ -1515,10 +1604,12 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
* we may need to handle the pulling of RT tasks
* now.
*/
- if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
+ if (!task_on_rq_queued(p) || rt_rq->rt_nr_running)
return;

+#ifndef CONFIG_RT_GROUP_SCHED
queue_pull_task(rq);
+#endif
}

void __init init_sched_rt_class(void)
@@ -1548,8 +1639,16 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
*/
if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
+#ifndef CONFIG_RT_GROUP_SCHED
if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
queue_push_tasks(rq);
+#else
+		/*
+		 * Overloaded group rqs are drained by the DL server push
+		 * machinery; the non-overloaded case is already covered by
+		 * the unconditional prio check / resched_curr() below.
+		 */
+#endif
#endif /* CONFIG_SMP */
if (p->prio < rq->curr->prio)
resched_curr(rq);
@@ -1563,6 +1662,10 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
static void
prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
{
+#ifdef CONFIG_SMP
+ struct rt_rq *rt_rq = rt_rq_of_se(&p->rt);
+#endif
+
if (!task_on_rq_queued(p))
return;

@@ -1573,13 +1676,14 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
* may need to pull tasks to this runqueue.
*/
-	if (oldprio < p->prio)
-		queue_pull_task(rq);
-
+#ifndef CONFIG_RT_GROUP_SCHED
+	if (oldprio < p->prio)
+		queue_pull_task(rq);
+#endif
/*
* If there's a higher priority task waiting to run
* then reschedule.
*/
- if (p->prio > rq->rt.highest_prio.curr)
+ if (p->prio > rt_rq->highest_prio.curr)
resched_curr(rq);
#else
/* For UP simply resched on drop of prio */
@@ -1628,6 +1732,14 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
struct sched_rt_entity *rt_se = &p->rt;

update_curr_rt(rq);
+#ifdef CONFIG_RT_GROUP_SCHED
+ if (is_dl_group(&rq->rt)) {
+ struct sched_dl_entity *dl_se = dl_group_of(&rq->rt);
+
+ if (hrtick_enabled(rq) && queued && dl_se->runtime > 0)
+ start_hrtick_dl(rq, dl_se);
+ }
+#endif

watchdog(rq, p);

@@ -1647,23 +1759,22 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
* Requeue to the end of queue if we (and all of our ancestors) are not
* the only element on the queue
*/
- for_each_sched_rt_entity(rt_se) {
- if (rt_se->run_list.prev != rt_se->run_list.next) {
- requeue_task_rt(rq, p, 0);
- resched_curr(rq);
- return;
- }
+ if (rt_se->run_list.prev != rt_se->run_list.next) {
+ requeue_task_rt(rq, p, 0);
+ set_tsk_need_resched(p);
+ return;
}
}

static void set_curr_task_rt(struct rq *rq)
{
struct task_struct *p = rq->curr;
+ struct rt_rq *rt_rq = rt_rq_of_se(&p->rt);

p->se.exec_start = rq_clock_task(rq);

/* The running task is never eligible for pushing */
- dequeue_pushable_task(rq, p);
+ dequeue_pushable_task(rt_rq, p);
}

static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index a4c4a18..528b41c 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -112,6 +112,8 @@ static inline void cpu_load_update_active(struct rq *this_rq) { }
*/
#define DL_SCALE (10)

+unsigned long to_ratio(u64 period, u64 runtime);
+
/*
* These are the 'tuning knobs' of the scheduler:
*/
@@ -228,12 +230,6 @@ void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
dl_b->total_bw += tsk_bw;
}

-static inline
-bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
-{
- return dl_b->bw != -1 &&
- dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
-}

extern void init_dl_bw(struct dl_bw *dl_b);

@@ -286,9 +282,14 @@ struct task_group {
#endif

#ifdef CONFIG_RT_GROUP_SCHED
- struct sched_rt_entity **rt_se;
+ /*
+ * The scheduling entities for the task group are managed as a single
+ * sched_dl_entity, each of them sharing the same dl_bandwidth.
+ */
+ struct sched_dl_entity **dl_se;
struct rt_rq **rt_rq;

+ struct dl_bandwidth dl_bandwidth;
#endif

struct rcu_head rcu;
@@ -354,8 +355,8 @@ extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
extern void free_rt_sched_group(struct task_group *tg);
extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
- struct sched_rt_entity *rt_se, int cpu,
- struct sched_rt_entity *parent);
+ struct sched_dl_entity *rt_se, int cpu,
+ struct sched_dl_entity *parent);

extern struct task_group *sched_create_group(struct task_group *parent);
extern void sched_online_group(struct task_group *tg,
@@ -383,6 +384,21 @@ struct cfs_bandwidth { };

#endif /* CONFIG_CGROUP_SCHED */

+static inline
+bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
+{
+ u64 dl_groups_root = 0;
+
+#ifdef CONFIG_RT_GROUP_SCHED
+ dl_groups_root = to_ratio(root_task_group.dl_bandwidth.dl_period,
+ root_task_group.dl_bandwidth.dl_runtime);
+#endif
+ return dl_b->bw != -1 &&
+ dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw
+ + dl_groups_root * cpus;
+}
+
+
/* CFS-related fields in a runqueue */
struct cfs_rq {
struct load_weight load;
@@ -484,7 +500,6 @@ struct rt_rq {
#endif
#ifdef CONFIG_SMP
unsigned long rt_nr_migratory;
- unsigned long rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;
#ifdef HAVE_RT_PUSH_IPI
@@ -494,14 +509,11 @@ struct rt_rq {
raw_spinlock_t push_lock;
#endif
#endif /* CONFIG_SMP */
- int rt_queued;

#ifdef CONFIG_RT_GROUP_SCHED
- unsigned long rt_nr_boosted;
-
- struct rq *rq;
struct task_group *tg;
#endif
+ struct rq *rq;
};

/* Deadline class' related fields in a runqueue */
@@ -512,6 +524,12 @@ struct dl_rq {

unsigned long dl_nr_running;

+#ifdef CONFIG_RT_GROUP_SCHED
+ unsigned long dl_nr_total;
+ struct rt_rq *rq_to_push_from;
+ struct rt_rq *rq_to_pull_to;
+#endif
+
#ifdef CONFIG_SMP
/*
* Deadline values of the currently executing and the
@@ -1106,7 +1124,8 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)

#ifdef CONFIG_RT_GROUP_SCHED
p->rt.rt_rq = tg->rt_rq[cpu];
- p->rt.parent = tg->rt_se[cpu];
+ p->rt.parent = tg->dl_se[cpu];
+ p->dl.dl_rq = &cpu_rq(cpu)->dl;
#endif
}

@@ -1461,8 +1480,6 @@ extern struct dl_bandwidth def_dl_bandwidth;
extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
extern void init_dl_task_timer(struct sched_dl_entity *dl_se);

-unsigned long to_ratio(u64 period, u64 runtime);
-
extern void init_entity_runnable_average(struct sched_entity *se);
extern void post_init_entity_util_avg(struct sched_entity *se);

@@ -1513,6 +1530,7 @@ static inline void add_nr_running(struct rq *rq, unsigned count)

static inline void sub_nr_running(struct rq *rq, unsigned count)
{
+ BUG_ON(rq->nr_running < count);
rq->nr_running -= count;
/* Check if we still need preemption */
sched_update_tick_dependency(rq);
@@ -1864,6 +1882,71 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)

#endif

+#ifdef CONFIG_RT_GROUP_SCHED
+static inline int is_dl_group(struct rt_rq *rt_rq)
+{
+ return rt_rq->tg != &root_task_group;
+}
+
+/*
+ * Return the scheduling entity of this group of tasks.
+ */
+static inline struct sched_dl_entity *dl_group_of(struct rt_rq *rt_rq)
+{
+ BUG_ON(!is_dl_group(rt_rq));
+
+ return rt_rq->tg->dl_se[cpu_of(rt_rq->rq)];
+}
+
+static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
+{
+ return container_of(rt_se, struct task_struct, rt);
+}
+
+static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *rt_rq = rt_se->rt_rq;
+
+ return rt_rq->rq;
+}
+
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+ return rt_se->rt_rq;
+}
+#else
+static inline int is_dl_group(struct rt_rq *rt_rq)
+{
+ return 0;
+}
+
+static inline struct sched_dl_entity *dl_group_of(struct rt_rq *rt_rq)
+{
+ return NULL;
+}
+
+static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
+{
+ return container_of(rt_se, struct task_struct, rt);
+}
+
+static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
+{
+ struct task_struct *p = rt_task_of(rt_se);
+
+ return task_rq(p);
+}
+
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+ struct rq *rq = rq_of_rt_se(rt_se);
+
+ return &rq->rt;
+}
+#endif
+
+
+
extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);

@@ -1986,3 +2069,28 @@ static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {}
static inline void arch_task_migrate(struct task_struct *p) { }
#endif

+int group_pull_rt_task(struct rt_rq *rt_rq);
+int group_push_rt_task(struct rt_rq *rt_rq);
+
+struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, struct rt_rq *rt_rq);
+#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_SMP)
+void dequeue_pushable_task(struct rt_rq *rt_rq, struct task_struct *p);
+#else
+static inline
+void dequeue_pushable_task(struct rt_rq *rt_rq, struct task_struct *p)
+{
+}
+#endif
+
+/* is_dl_group() is provided as a static inline earlier in this header. */
+
+void dequeue_dl_entity(struct sched_dl_entity *dl_se);
+
+void init_dl_timer(struct sched_dl_entity *dl_se);
+
+void enqueue_dl_entity(struct sched_dl_entity *dl_se,
+ struct sched_dl_entity *pi_se, int flags);
+int dl_runtime_exceeded(struct sched_dl_entity *dl_se);
+int start_dl_timer(struct sched_dl_entity *dl_se);
+void start_hrtick_dl(struct rq *rq, struct sched_dl_entity *dl_se);
+
--
2.7.4