[RFC PATCH 1/3] sched/rt: Remove CGroup functionality from the RT scheduling class

From: Alessio Balsini
Date: Fri Mar 31 2017 - 14:45:05 EST
Clean up the current RT CGroup throttling code and prevent RT tasks from
being managed by a CGroup.

The new mechanism for managing RT task throttling with CGroups will be
introduced in the upcoming patches.
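
For clarity, the expected user-visible effect is sketched below (an
illustration only, not part of the patch): with CONFIG_RT_GROUP_SCHED
enabled and the caller sitting in a non-autogroup CPU cgroup,
__sched_setscheduler() is now expected to return -EPERM for any RT policy
request, and sched_rt_can_attach() refuses to move an already-RT task into
a group. The cgroup path and the priority value in the snippet are
examples, not values mandated by the patch.

/*
 * rt_cgroup_eperm.c -- illustrative sketch only, not part of the patch.
 *
 * Build:  gcc -Wall -o rt_cgroup_eperm rt_cgroup_eperm.c
 * Run as root (so an EPERM cannot come from a missing CAP_SYS_NICE) after
 * moving the shell into a CPU cgroup, e.g. (path is just an example):
 *   echo $$ > /sys/fs/cgroup/cpu/example/tasks
 */
#include <errno.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* Priority value is arbitrary; any valid RT priority will do. */
	struct sched_param sp = { .sched_priority = 10 };

	if (sched_setscheduler(0, SCHED_FIFO, &sp) == -1)
		printf("SCHED_FIFO rejected: %s (EPERM expected with this patch)\n",
		       strerror(errno));
	else
		printf("SCHED_FIFO accepted\n");

	return 0;
}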

Signed-off-by: Andrea Parri <parri.andrea@xxxxxxxxx>
Signed-off-by: Luca Abeni <luca.abeni@xxxxxxxxxxxxxxx>
Cc: Tommaso Cucinotta <tommaso.cucinotta@xxxxxxxx>
Cc: Juri Lelli <juri.lelli@xxxxxxx>
Cc: Daniel Bristot de Oliveira <bristot@xxxxxxxxxx>
Cc: Steven Rostedt <rostedt@xxxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Signed-off-by: Alessio Balsini <a.balsini@xxxxxxxx>
---
kernel/sched/core.c | 63 +---
kernel/sched/deadline.c | 27 --
kernel/sched/debug.c | 3 -
kernel/sched/rt.c | 752 +-----------------------------------------------
kernel/sched/sched.h | 24 --
5 files changed, 20 insertions(+), 849 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2633b88..3d4cce4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4261,8 +4261,7 @@ static int __sched_setscheduler(struct task_struct *p,
* Do not allow realtime tasks into groups that have no runtime
* assigned.
*/
- if (rt_bandwidth_enabled() && rt_policy(policy) &&
- task_group(p)->rt_bandwidth.rt_runtime == 0 &&
+ if (rt_policy(policy) &&
!task_group_is_autogroup(task_group(p))) {
task_rq_unlock(rq, p, &rf);
return -EPERM;
@@ -6005,17 +6004,12 @@ void __init sched_init(void)
}
#endif /* CONFIG_CPUMASK_OFFSTACK */

- init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime());

#ifdef CONFIG_SMP
init_defrootdomain();
#endif

-#ifdef CONFIG_RT_GROUP_SCHED
- init_rt_bandwidth(&root_task_group.rt_bandwidth,
- global_rt_period(), global_rt_runtime());
-#endif /* CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_CGROUP_SCHED
task_group_cache = KMEM_CACHE(task_group, 0);
@@ -6064,7 +6058,6 @@ void __init sched_init(void)
init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */

- rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
@@ -6468,8 +6461,8 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
unsigned long total, sum = 0;
u64 period, runtime;

- period = ktime_to_ns(tg->rt_bandwidth.rt_period);
- runtime = tg->rt_bandwidth.rt_runtime;
+ period = 0;
+ runtime = 0;

if (tg == d->tg) {
period = d->rt_period;
@@ -6485,7 +6478,7 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
/*
* Ensure we don't starve existing RT tasks.
*/
- if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
+ if (!runtime && tg_has_rt_tasks(tg))
return -EBUSY;

total = to_ratio(period, runtime);
@@ -6500,8 +6493,8 @@ static int tg_rt_schedulable(struct task_group *tg, void *data)
* The sum of our children's runtime should not exceed our own.
*/
list_for_each_entry_rcu(child, &tg->children, siblings) {
- period = ktime_to_ns(child->rt_bandwidth.rt_period);
- runtime = child->rt_bandwidth.rt_runtime;
+ period = 0;
+ runtime = 0;

if (child == d->tg) {
period = d->rt_period;
@@ -6556,18 +6549,6 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
if (err)
goto unlock;

- raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
- tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
- tg->rt_bandwidth.rt_runtime = rt_runtime;
-
- for_each_possible_cpu(i) {
- struct rt_rq *rt_rq = tg->rt_rq[i];
-
- raw_spin_lock(&rt_rq->rt_runtime_lock);
- rt_rq->rt_runtime = rt_runtime;
- raw_spin_unlock(&rt_rq->rt_runtime_lock);
- }
- raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
unlock:
read_unlock(&tasklist_lock);
mutex_unlock(&rt_constraints_mutex);
@@ -6579,7 +6560,7 @@ static int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
{
u64 rt_runtime, rt_period;

- rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
+ rt_period = 0;
rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
if (rt_runtime_us < 0)
rt_runtime = RUNTIME_INF;
@@ -6591,10 +6572,10 @@ static long sched_group_rt_runtime(struct task_group *tg)
{
u64 rt_runtime_us;

- if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
+ if (0 == RUNTIME_INF)
return -1;

- rt_runtime_us = tg->rt_bandwidth.rt_runtime;
+ rt_runtime_us = 0;
do_div(rt_runtime_us, NSEC_PER_USEC);
return rt_runtime_us;
}
@@ -6604,7 +6585,7 @@ static int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
u64 rt_runtime, rt_period;

rt_period = rt_period_us * NSEC_PER_USEC;
- rt_runtime = tg->rt_bandwidth.rt_runtime;
+ rt_runtime = 0;

return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
}
@@ -6613,7 +6594,7 @@ static long sched_group_rt_period(struct task_group *tg)
{
u64 rt_period_us;

- rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
+ rt_period_us = 0;
do_div(rt_period_us, NSEC_PER_USEC);
return rt_period_us;
}
@@ -6636,7 +6617,7 @@ static int sched_rt_global_constraints(void)
static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
{
/* Don't accept realtime tasks when there is no way for them to run */
- if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
+ if (rt_task(tsk))
return 0;

return 1;
@@ -6645,19 +6626,6 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
#else /* !CONFIG_RT_GROUP_SCHED */
static int sched_rt_global_constraints(void)
{
- unsigned long flags;
- int i;
-
- raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
- for_each_possible_cpu(i) {
- struct rt_rq *rt_rq = &cpu_rq(i)->rt;
-
- raw_spin_lock(&rt_rq->rt_runtime_lock);
- rt_rq->rt_runtime = global_rt_runtime();
- raw_spin_unlock(&rt_rq->rt_runtime_lock);
- }
- raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
-
return 0;
}
#endif /* CONFIG_RT_GROUP_SCHED */
@@ -6738,12 +6706,6 @@ static int sched_rt_global_validate(void)
return 0;
}

-static void sched_rt_do_global(void)
-{
- def_rt_bandwidth.rt_runtime = global_rt_runtime();
- def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
-}
-
int sched_rt_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
@@ -6771,7 +6733,6 @@ int sched_rt_handler(struct ctl_table *table, int write,
if (ret)
goto undo;

- sched_rt_do_global();
sched_dl_do_global();
}
if (0) {
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index a2ce590..1af6219 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -732,8 +732,6 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
return (dl_se->runtime <= 0);
}

-extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
-
/*
* Update the current task's runtime statistics (provided it is still
* a -deadline task and has not been removed from the dl_rq).
@@ -788,31 +786,6 @@ static void update_curr_dl(struct rq *rq)
if (!is_leftmost(curr, &rq->dl))
resched_curr(rq);
}
-
- /*
- * Because -- for now -- we share the rt bandwidth, we need to
- * account our runtime there too, otherwise actual rt tasks
- * would be able to exceed the shared quota.
- *
- * Account to the root rt group for now.
- *
- * The solution we're working towards is having the RT groups scheduled
- * using deadline servers -- however there's a few nasties to figure
- * out before that can happen.
- */
- if (rt_bandwidth_enabled()) {
- struct rt_rq *rt_rq = &rq->rt;
-
- raw_spin_lock(&rt_rq->rt_runtime_lock);
- /*
- * We'll let actual RT tasks worry about the overflow here, we
- * have our own CBS to keep us inline; only account when RT
- * bandwidth is relevant.
- */
- if (sched_rt_bandwidth_account(rt_rq))
- rt_rq->rt_time += delta_exec;
- raw_spin_unlock(&rt_rq->rt_runtime_lock);
- }
}

#ifdef CONFIG_SMP
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 38f0193..a81e6a1 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -556,9 +556,6 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))

P(rt_nr_running);
- P(rt_throttled);
- PN(rt_time);
- PN(rt_runtime);

#undef PN
#undef P
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 979b734..e72ccb8 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -11,68 +11,6 @@
int sched_rr_timeslice = RR_TIMESLICE;
int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;

-static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
-
-struct rt_bandwidth def_rt_bandwidth;
-
-static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
-{
- struct rt_bandwidth *rt_b =
- container_of(timer, struct rt_bandwidth, rt_period_timer);
- int idle = 0;
- int overrun;
-
- raw_spin_lock(&rt_b->rt_runtime_lock);
- for (;;) {
- overrun = hrtimer_forward_now(timer, rt_b->rt_period);
- if (!overrun)
- break;
-
- raw_spin_unlock(&rt_b->rt_runtime_lock);
- idle = do_sched_rt_period_timer(rt_b, overrun);
- raw_spin_lock(&rt_b->rt_runtime_lock);
- }
- if (idle)
- rt_b->rt_period_active = 0;
- raw_spin_unlock(&rt_b->rt_runtime_lock);
-
- return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
-}
-
-void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
-{
- rt_b->rt_period = ns_to_ktime(period);
- rt_b->rt_runtime = runtime;
-
- raw_spin_lock_init(&rt_b->rt_runtime_lock);
-
- hrtimer_init(&rt_b->rt_period_timer,
- CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- rt_b->rt_period_timer.function = sched_rt_period_timer;
-}
-
-static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
-{
- if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
- return;
-
- raw_spin_lock(&rt_b->rt_runtime_lock);
- if (!rt_b->rt_period_active) {
- rt_b->rt_period_active = 1;
- /*
- * SCHED_DEADLINE updates the bandwidth, as a run away
- * RT task with a DL task could hog a CPU. But DL does
- * not reset the period. If a deadline task was running
- * without an RT task running, it can cause RT tasks to
- * throttle when they start up. Kick the timer right away
- * to update the period.
- */
- hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
- hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
- }
- raw_spin_unlock(&rt_b->rt_runtime_lock);
-}
-
#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI)
static void push_irq_work_func(struct irq_work *work);
#endif
@@ -104,22 +42,9 @@ void init_rt_rq(struct rt_rq *rt_rq)
init_irq_work(&rt_rq->push_work, push_irq_work_func);
#endif
#endif /* CONFIG_SMP */
- /* We start is dequeued state, because no RT tasks are queued */
- rt_rq->rt_queued = 0;
-
- rt_rq->rt_time = 0;
- rt_rq->rt_throttled = 0;
- rt_rq->rt_runtime = 0;
- raw_spin_lock_init(&rt_rq->rt_runtime_lock);
}

#ifdef CONFIG_RT_GROUP_SCHED
-static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
-{
- hrtimer_cancel(&rt_b->rt_period_timer);
-}
-
-#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)

static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
@@ -150,9 +75,6 @@ void free_rt_sched_group(struct task_group *tg)
{
int i;

- if (tg->rt_se)
- destroy_rt_bandwidth(&tg->rt_bandwidth);
-
for_each_possible_cpu(i) {
if (tg->rt_rq)
kfree(tg->rt_rq[i]);
@@ -171,7 +93,6 @@ void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
struct rq *rq = cpu_rq(cpu);

rt_rq->highest_prio.curr = MAX_RT_PRIO;
- rt_rq->rt_nr_boosted = 0;
rt_rq->rq = rq;
rt_rq->tg = tg;

@@ -204,9 +125,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
if (!tg->rt_se)
goto err;

- init_rt_bandwidth(&tg->rt_bandwidth,
- ktime_to_ns(def_rt_bandwidth.rt_period), 0);
-
for_each_possible_cpu(i) {
rt_rq = kzalloc_node(sizeof(struct rt_rq),
GFP_KERNEL, cpu_to_node(i));
@@ -219,7 +137,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
goto err_free_rq;

init_rt_rq(rt_rq);
- rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
}

@@ -233,8 +150,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)

#else /* CONFIG_RT_GROUP_SCHED */

-#define rt_entity_is_task(rt_se) (1)
-
static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
return container_of(rt_se, struct task_struct, rt);
@@ -328,13 +243,10 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
struct task_struct *p;

- if (!rt_entity_is_task(rt_se))
- return;
+ return;

p = rt_task_of(rt_se);
- rt_rq = &rq_of_rt_rq(rt_rq)->rt;

- rt_rq->rt_nr_total++;
if (p->nr_cpus_allowed > 1)
rt_rq->rt_nr_migratory++;

@@ -345,9 +257,6 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
struct task_struct *p;

- if (!rt_entity_is_task(rt_se))
- return;
-
p = rt_task_of(rt_se);
rt_rq = &rq_of_rt_rq(rt_rq)->rt;

@@ -440,9 +349,6 @@ static inline void queue_push_tasks(struct rq *rq)
}
#endif /* CONFIG_SMP */

-static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
-static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
-
static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
return rt_se->on_rq;
@@ -450,136 +356,14 @@ static inline int on_rt_rq(struct sched_rt_entity *rt_se)

#ifdef CONFIG_RT_GROUP_SCHED

-static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
-{
- if (!rt_rq->tg)
- return RUNTIME_INF;
-
- return rt_rq->rt_runtime;
-}
-
-static inline u64 sched_rt_period(struct rt_rq *rt_rq)
-{
- return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
-}
-
-typedef struct task_group *rt_rq_iter_t;
-
-static inline struct task_group *next_task_group(struct task_group *tg)
-{
- do {
- tg = list_entry_rcu(tg->list.next,
- typeof(struct task_group), list);
- } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
-
- if (&tg->list == &task_groups)
- tg = NULL;
-
- return tg;
-}
-
-#define for_each_rt_rq(rt_rq, iter, rq) \
- for (iter = container_of(&task_groups, typeof(*iter), list); \
- (iter = next_task_group(iter)) && \
- (rt_rq = iter->rt_rq[cpu_of(rq)]);)
-
#define for_each_sched_rt_entity(rt_se) \
for (; rt_se; rt_se = rt_se->parent)

-static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
-{
- return rt_se->my_q;
-}
-
static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);

-static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
-{
- struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
- struct rq *rq = rq_of_rt_rq(rt_rq);
- struct sched_rt_entity *rt_se;
-
- int cpu = cpu_of(rq);
-
- rt_se = rt_rq->tg->rt_se[cpu];
-
- if (rt_rq->rt_nr_running) {
- if (!rt_se)
- enqueue_top_rt_rq(rt_rq);
- else if (!on_rt_rq(rt_se))
- enqueue_rt_entity(rt_se, 0);
-
- if (rt_rq->highest_prio.curr < curr->prio)
- resched_curr(rq);
- }
-}
-
-static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
-{
- struct sched_rt_entity *rt_se;
- int cpu = cpu_of(rq_of_rt_rq(rt_rq));
-
- rt_se = rt_rq->tg->rt_se[cpu];
-
- if (!rt_se)
- dequeue_top_rt_rq(rt_rq);
- else if (on_rt_rq(rt_se))
- dequeue_rt_entity(rt_se, 0);
-}
-
-static inline int rt_rq_throttled(struct rt_rq *rt_rq)
-{
- return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
-}
-
-static int rt_se_boosted(struct sched_rt_entity *rt_se)
-{
- struct rt_rq *rt_rq = group_rt_rq(rt_se);
- struct task_struct *p;
-
- if (rt_rq)
- return !!rt_rq->rt_nr_boosted;
-
- p = rt_task_of(rt_se);
- return p->prio != p->normal_prio;
-}
-
-#ifdef CONFIG_SMP
-static inline const struct cpumask *sched_rt_period_mask(void)
-{
- return this_rq()->rd->span;
-}
-#else
-static inline const struct cpumask *sched_rt_period_mask(void)
-{
- return cpu_online_mask;
-}
-#endif
-
-static inline
-struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
-{
- return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
-}
-
-static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
-{
- return &rt_rq->tg->rt_bandwidth;
-}
-
#else /* !CONFIG_RT_GROUP_SCHED */

-static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
-{
- return rt_rq->rt_runtime;
-}
-
-static inline u64 sched_rt_period(struct rt_rq *rt_rq)
-{
- return ktime_to_ns(def_rt_bandwidth.rt_period);
-}
-
typedef struct rt_rq *rt_rq_iter_t;

#define for_each_rt_rq(rt_rq, iter, rq) \
@@ -588,359 +372,13 @@ typedef struct rt_rq *rt_rq_iter_t;
#define for_each_sched_rt_entity(rt_se) \
for (; rt_se; rt_se = NULL)

-static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
-{
- return NULL;
-}
-
-static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
-{
- struct rq *rq = rq_of_rt_rq(rt_rq);
-
- if (!rt_rq->rt_nr_running)
- return;
-
- enqueue_top_rt_rq(rt_rq);
- resched_curr(rq);
-}
-
-static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
-{
- dequeue_top_rt_rq(rt_rq);
-}
-
-static inline int rt_rq_throttled(struct rt_rq *rt_rq)
-{
- return rt_rq->rt_throttled;
-}
-
-static inline const struct cpumask *sched_rt_period_mask(void)
-{
- return cpu_online_mask;
-}
-
-static inline
-struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
-{
- return &cpu_rq(cpu)->rt;
-}
-
-static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
-{
- return &def_rt_bandwidth;
-}
-
#endif /* CONFIG_RT_GROUP_SCHED */

-bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
-{
- struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
-
- return (hrtimer_active(&rt_b->rt_period_timer) ||
- rt_rq->rt_time < rt_b->rt_runtime);
-}
-
-#ifdef CONFIG_SMP
-/*
- * We ran out of runtime, see if we can borrow some from our neighbours.
- */
-static void do_balance_runtime(struct rt_rq *rt_rq)
-{
- struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
- struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
- int i, weight;
- u64 rt_period;
-
- weight = cpumask_weight(rd->span);
-
- raw_spin_lock(&rt_b->rt_runtime_lock);
- rt_period = ktime_to_ns(rt_b->rt_period);
- for_each_cpu(i, rd->span) {
- struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
- s64 diff;
-
- if (iter == rt_rq)
- continue;
-
- raw_spin_lock(&iter->rt_runtime_lock);
- /*
- * Either all rqs have inf runtime and there's nothing to steal
- * or __disable_runtime() below sets a specific rq to inf to
- * indicate its been disabled and disalow stealing.
- */
- if (iter->rt_runtime == RUNTIME_INF)
- goto next;
-
- /*
- * From runqueues with spare time, take 1/n part of their
- * spare time, but no more than our period.
- */
- diff = iter->rt_runtime - iter->rt_time;
- if (diff > 0) {
- diff = div_u64((u64)diff, weight);
- if (rt_rq->rt_runtime + diff > rt_period)
- diff = rt_period - rt_rq->rt_runtime;
- iter->rt_runtime -= diff;
- rt_rq->rt_runtime += diff;
- if (rt_rq->rt_runtime == rt_period) {
- raw_spin_unlock(&iter->rt_runtime_lock);
- break;
- }
- }
-next:
- raw_spin_unlock(&iter->rt_runtime_lock);
- }
- raw_spin_unlock(&rt_b->rt_runtime_lock);
-}
-
-/*
- * Ensure this RQ takes back all the runtime it lend to its neighbours.
- */
-static void __disable_runtime(struct rq *rq)
-{
- struct root_domain *rd = rq->rd;
- rt_rq_iter_t iter;
- struct rt_rq *rt_rq;
-
- if (unlikely(!scheduler_running))
- return;
-
- for_each_rt_rq(rt_rq, iter, rq) {
- struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
- s64 want;
- int i;
-
- raw_spin_lock(&rt_b->rt_runtime_lock);
- raw_spin_lock(&rt_rq->rt_runtime_lock);
- /*
- * Either we're all inf and nobody needs to borrow, or we're
- * already disabled and thus have nothing to do, or we have
- * exactly the right amount of runtime to take out.
- */
- if (rt_rq->rt_runtime == RUNTIME_INF ||
- rt_rq->rt_runtime == rt_b->rt_runtime)
- goto balanced;
- raw_spin_unlock(&rt_rq->rt_runtime_lock);
-
- /*
- * Calculate the difference between what we started out with
- * and what we current have, that's the amount of runtime
- * we lend and now have to reclaim.
- */
- want = rt_b->rt_runtime - rt_rq->rt_runtime;
-
- /*
- * Greedy reclaim, take back as much as we can.
- */
- for_each_cpu(i, rd->span) {
- struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
- s64 diff;
-
- /*
- * Can't reclaim from ourselves or disabled runqueues.
- */
- if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
- continue;
-
- raw_spin_lock(&iter->rt_runtime_lock);
- if (want > 0) {
- diff = min_t(s64, iter->rt_runtime, want);
- iter->rt_runtime -= diff;
- want -= diff;
- } else {
- iter->rt_runtime -= want;
- want -= want;
- }
- raw_spin_unlock(&iter->rt_runtime_lock);
-
- if (!want)
- break;
- }
-
- raw_spin_lock(&rt_rq->rt_runtime_lock);
- /*
- * We cannot be left wanting - that would mean some runtime
- * leaked out of the system.
- */
- BUG_ON(want);
-balanced:
- /*
- * Disable all the borrow logic by pretending we have inf
- * runtime - in which case borrowing doesn't make sense.
- */
- rt_rq->rt_runtime = RUNTIME_INF;
- rt_rq->rt_throttled = 0;
- raw_spin_unlock(&rt_rq->rt_runtime_lock);
- raw_spin_unlock(&rt_b->rt_runtime_lock);
-
- /* Make rt_rq available for pick_next_task() */
- sched_rt_rq_enqueue(rt_rq);
- }
-}
-
-static void __enable_runtime(struct rq *rq)
-{
- rt_rq_iter_t iter;
- struct rt_rq *rt_rq;
-
- if (unlikely(!scheduler_running))
- return;
-
- /*
- * Reset each runqueue's bandwidth settings
- */
- for_each_rt_rq(rt_rq, iter, rq) {
- struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
-
- raw_spin_lock(&rt_b->rt_runtime_lock);
- raw_spin_lock(&rt_rq->rt_runtime_lock);
- rt_rq->rt_runtime = rt_b->rt_runtime;
- rt_rq->rt_time = 0;
- rt_rq->rt_throttled = 0;
- raw_spin_unlock(&rt_rq->rt_runtime_lock);
- raw_spin_unlock(&rt_b->rt_runtime_lock);
- }
-}
-
-static void balance_runtime(struct rt_rq *rt_rq)
-{
- if (!sched_feat(RT_RUNTIME_SHARE))
- return;
-
- if (rt_rq->rt_time > rt_rq->rt_runtime) {
- raw_spin_unlock(&rt_rq->rt_runtime_lock);
- do_balance_runtime(rt_rq);
- raw_spin_lock(&rt_rq->rt_runtime_lock);
- }
-}
-#else /* !CONFIG_SMP */
-static inline void balance_runtime(struct rt_rq *rt_rq) {}
-#endif /* CONFIG_SMP */
-
-static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
-{
- int i, idle = 1, throttled = 0;
- const struct cpumask *span;
-
- span = sched_rt_period_mask();
-#ifdef CONFIG_RT_GROUP_SCHED
- /*
- * FIXME: isolated CPUs should really leave the root task group,
- * whether they are isolcpus or were isolated via cpusets, lest
- * the timer run on a CPU which does not service all runqueues,
- * potentially leaving other CPUs indefinitely throttled. If
- * isolation is really required, the user will turn the throttle
- * off to kill the perturbations it causes anyway. Meanwhile,
- * this maintains functionality for boot and/or troubleshooting.
- */
- if (rt_b == &root_task_group.rt_bandwidth)
- span = cpu_online_mask;
-#endif
- for_each_cpu(i, span) {
- int enqueue = 0;
- struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
- struct rq *rq = rq_of_rt_rq(rt_rq);
-
- raw_spin_lock(&rq->lock);
- if (rt_rq->rt_time) {
- u64 runtime;
-
- raw_spin_lock(&rt_rq->rt_runtime_lock);
- if (rt_rq->rt_throttled)
- balance_runtime(rt_rq);
- runtime = rt_rq->rt_runtime;
- rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
- if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
- rt_rq->rt_throttled = 0;
- enqueue = 1;
-
- /*
- * When we're idle and a woken (rt) task is
- * throttled check_preempt_curr() will set
- * skip_update and the time between the wakeup
- * and this unthrottle will get accounted as
- * 'runtime'.
- */
- if (rt_rq->rt_nr_running && rq->curr == rq->idle)
- rq_clock_skip_update(rq, false);
- }
- if (rt_rq->rt_time || rt_rq->rt_nr_running)
- idle = 0;
- raw_spin_unlock(&rt_rq->rt_runtime_lock);
- } else if (rt_rq->rt_nr_running) {
- idle = 0;
- if (!rt_rq_throttled(rt_rq))
- enqueue = 1;
- }
- if (rt_rq->rt_throttled)
- throttled = 1;
-
- if (enqueue)
- sched_rt_rq_enqueue(rt_rq);
- raw_spin_unlock(&rq->lock);
- }
-
- if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
- return 1;
-
- return idle;
-}
-
static inline int rt_se_prio(struct sched_rt_entity *rt_se)
{
-#ifdef CONFIG_RT_GROUP_SCHED
- struct rt_rq *rt_rq = group_rt_rq(rt_se);
-
- if (rt_rq)
- return rt_rq->highest_prio.curr;
-#endif
-
return rt_task_of(rt_se)->prio;
}

-static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
-{
- u64 runtime = sched_rt_runtime(rt_rq);
-
- if (rt_rq->rt_throttled)
- return rt_rq_throttled(rt_rq);
-
- if (runtime >= sched_rt_period(rt_rq))
- return 0;
-
- balance_runtime(rt_rq);
- runtime = sched_rt_runtime(rt_rq);
- if (runtime == RUNTIME_INF)
- return 0;
-
- if (rt_rq->rt_time > runtime) {
- struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
-
- /*
- * Don't actually throttle groups that have no runtime assigned
- * but accrue some time due to boosting.
- */
- if (likely(rt_b->rt_runtime)) {
- rt_rq->rt_throttled = 1;
- printk_deferred_once("sched: RT throttling activated\n");
- } else {
- /*
- * In case we did anyway, make it go away,
- * replenishment is a joke, since it will replenish us
- * with exactly 0 ns.
- */
- rt_rq->rt_time = 0;
- }
-
- if (rt_rq_throttled(rt_rq)) {
- sched_rt_rq_dequeue(rt_rq);
- return 1;
- }
- }
-
- return 0;
-}
-
/*
* Update the current task's runtime statistics. Skip current tasks that
* are not in our scheduling class.
@@ -948,7 +386,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
static void update_curr_rt(struct rq *rq)
{
struct task_struct *curr = rq->curr;
- struct sched_rt_entity *rt_se = &curr->rt;
u64 delta_exec;

if (curr->sched_class != &rt_sched_class)
@@ -971,53 +408,6 @@ static void update_curr_rt(struct rq *rq)
cpuacct_charge(curr, delta_exec);

sched_rt_avg_update(rq, delta_exec);
-
- if (!rt_bandwidth_enabled())
- return;
-
- for_each_sched_rt_entity(rt_se) {
- struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
-
- if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
- raw_spin_lock(&rt_rq->rt_runtime_lock);
- rt_rq->rt_time += delta_exec;
- if (sched_rt_runtime_exceeded(rt_rq))
- resched_curr(rq);
- raw_spin_unlock(&rt_rq->rt_runtime_lock);
- }
- }
-}
-
-static void
-dequeue_top_rt_rq(struct rt_rq *rt_rq)
-{
- struct rq *rq = rq_of_rt_rq(rt_rq);
-
- BUG_ON(&rq->rt != rt_rq);
-
- if (!rt_rq->rt_queued)
- return;
-
- BUG_ON(!rq->nr_running);
-
- sub_nr_running(rq, rt_rq->rt_nr_running);
- rt_rq->rt_queued = 0;
-}
-
-static void
-enqueue_top_rt_rq(struct rt_rq *rt_rq)
-{
- struct rq *rq = rq_of_rt_rq(rt_rq);
-
- BUG_ON(&rq->rt != rt_rq);
-
- if (rt_rq->rt_queued)
- return;
- if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
- return;
-
- add_nr_running(rq, rt_rq->rt_nr_running);
- rt_rq->rt_queued = 1;
}

#if defined CONFIG_SMP
@@ -1027,13 +417,9 @@ inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
struct rq *rq = rq_of_rt_rq(rt_rq);

-#ifdef CONFIG_RT_GROUP_SCHED
- /*
- * Change rq's cpupri only if rt_rq is the top queue.
- */
if (&rq->rt != rt_rq)
return;
-#endif
+
if (rq->online && prio < prev_prio)
cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
}
@@ -1043,13 +429,9 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
struct rq *rq = rq_of_rt_rq(rt_rq);

-#ifdef CONFIG_RT_GROUP_SCHED
- /*
- * Change rq's cpupri only if rt_rq is the top queue.
- */
if (&rq->rt != rt_rq)
return;
-#endif
+
if (rq->online && rt_rq->highest_prio.curr != prev_prio)
cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
}
@@ -1108,60 +490,17 @@ static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}

#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */

-#ifdef CONFIG_RT_GROUP_SCHED
-
-static void
-inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
- if (rt_se_boosted(rt_se))
- rt_rq->rt_nr_boosted++;
-
- if (rt_rq->tg)
- start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
-}
-
-static void
-dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
- if (rt_se_boosted(rt_se))
- rt_rq->rt_nr_boosted--;
-
- WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
-}
-
-#else /* CONFIG_RT_GROUP_SCHED */
-
-static void
-inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
- start_rt_bandwidth(&def_rt_bandwidth);
-}
-
-static inline
-void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
-
-#endif /* CONFIG_RT_GROUP_SCHED */
-
static inline
unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
{
- struct rt_rq *group_rq = group_rt_rq(rt_se);
-
- if (group_rq)
- return group_rq->rt_nr_running;
- else
- return 1;
+ return 1;
}

static inline
unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
{
- struct rt_rq *group_rq = group_rt_rq(rt_se);
struct task_struct *tsk;

- if (group_rq)
- return group_rq->rr_nr_running;
-
tsk = rt_task_of(rt_se);

return (tsk->policy == SCHED_RR) ? 1 : 0;
@@ -1178,7 +517,6 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)

inc_rt_prio(rt_rq, prio);
inc_rt_migration(rt_se, rt_rq);
- inc_rt_group(rt_se, rt_rq);
}

static inline
@@ -1191,7 +529,6 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)

dec_rt_prio(rt_rq, rt_se_prio(rt_se));
dec_rt_migration(rt_se, rt_rq);
- dec_rt_group(rt_se, rt_rq);
}

/*
@@ -1217,25 +554,12 @@ static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_arr
rt_se->on_list = 0;
}

-static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
struct rt_prio_array *array = &rt_rq->active;
- struct rt_rq *group_rq = group_rt_rq(rt_se);
struct list_head *queue = array->queue + rt_se_prio(rt_se);

- /*
- * Don't enqueue the group if its throttled, or when empty.
- * The latter is a consequence of the former when a child group
- * get throttled and the current group doesn't have any other
- * active members.
- */
- if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
- if (rt_se->on_list)
- __delist_rt_entity(rt_se, array);
- return;
- }
-
if (move_entity(flags)) {
WARN_ON_ONCE(rt_se->on_list);
if (flags & ENQUEUE_HEAD)
@@ -1251,7 +575,7 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag
inc_rt_tasks(rt_se, rt_rq);
}

-static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
struct rt_prio_array *array = &rt_rq->active;
@@ -1266,52 +590,6 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag
}

/*
- * Because the prio of an upper entry depends on the lower
- * entries, we must remove entries top - down.
- */
-static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
-{
- struct sched_rt_entity *back = NULL;
-
- for_each_sched_rt_entity(rt_se) {
- rt_se->back = back;
- back = rt_se;
- }
-
- dequeue_top_rt_rq(rt_rq_of_se(back));
-
- for (rt_se = back; rt_se; rt_se = rt_se->back) {
- if (on_rt_rq(rt_se))
- __dequeue_rt_entity(rt_se, flags);
- }
-}
-
-static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
-{
- struct rq *rq = rq_of_rt_se(rt_se);
-
- dequeue_rt_stack(rt_se, flags);
- for_each_sched_rt_entity(rt_se)
- __enqueue_rt_entity(rt_se, flags);
- enqueue_top_rt_rq(&rq->rt);
-}
-
-static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
-{
- struct rq *rq = rq_of_rt_se(rt_se);
-
- dequeue_rt_stack(rt_se, flags);
-
- for_each_sched_rt_entity(rt_se) {
- struct rt_rq *rt_rq = group_rt_rq(rt_se);
-
- if (rt_rq && rt_rq->rt_nr_running)
- __enqueue_rt_entity(rt_se, flags);
- }
- enqueue_top_rt_rq(&rq->rt);
-}
-
-/*
* Adding/removing a task to/from a priority array:
*/
static void
@@ -1511,11 +789,8 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
struct task_struct *p;
struct rt_rq *rt_rq = &rq->rt;

- do {
- rt_se = pick_next_rt_entity(rq, rt_rq);
- BUG_ON(!rt_se);
- rt_rq = group_rt_rq(rt_se);
- } while (rt_rq);
+ rt_se = pick_next_rt_entity(rq, rt_rq);
+ BUG_ON(!rt_se);

p = rt_task_of(rt_se);
p->se.exec_start = rq_clock_task(rq);
@@ -2215,8 +1490,6 @@ static void rq_online_rt(struct rq *rq)
if (rq->rt.overloaded)
rt_set_overload(rq);

- __enable_runtime(rq);
-
cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
}

@@ -2226,8 +1499,6 @@ static void rq_offline_rt(struct rq *rq)
if (rq->rt.overloaded)
rt_clear_overload(rq);

- __disable_runtime(rq);
-
cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
}

@@ -2443,12 +1714,5 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);

void print_rt_stats(struct seq_file *m, int cpu)
{
- rt_rq_iter_t iter;
- struct rt_rq *rt_rq;
-
- rcu_read_lock();
- for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
- print_rt_rq(m, cpu, rt_rq);
- rcu_read_unlock();
}
#endif /* CONFIG_SCHED_DEBUG */
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 57caf36..a4c4a18 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -172,15 +172,6 @@ struct rt_prio_array {
struct list_head queue[MAX_RT_PRIO];
};

-struct rt_bandwidth {
- /* nests inside the rq lock: */
- raw_spinlock_t rt_runtime_lock;
- ktime_t rt_period;
- u64 rt_runtime;
- struct hrtimer rt_period_timer;
- unsigned int rt_period_active;
-};
-
void __dl_clear_params(struct task_struct *p);

/*
@@ -298,7 +289,6 @@ struct task_group {
struct sched_rt_entity **rt_se;
struct rt_rq **rt_rq;

- struct rt_bandwidth rt_bandwidth;
#endif

struct rcu_head rcu;
@@ -474,11 +464,6 @@ struct cfs_rq {
#endif /* CONFIG_FAIR_GROUP_SCHED */
};

-static inline int rt_bandwidth_enabled(void)
-{
- return sysctl_sched_rt_runtime >= 0;
-}
-
/* RT IPI pull logic requires IRQ_WORK */
#ifdef CONFIG_IRQ_WORK
# define HAVE_RT_PUSH_IPI
@@ -511,12 +496,6 @@ struct rt_rq {
#endif /* CONFIG_SMP */
int rt_queued;

- int rt_throttled;
- u64 rt_time;
- u64 rt_runtime;
- /* Nests inside the rq lock: */
- raw_spinlock_t rt_runtime_lock;
-
#ifdef CONFIG_RT_GROUP_SCHED
unsigned long rt_nr_boosted;

@@ -1478,9 +1457,6 @@ extern void init_sched_fair_class(void);
extern void resched_curr(struct rq *rq);
extern void resched_cpu(int cpu);

-extern struct rt_bandwidth def_rt_bandwidth;
-extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
-
extern struct dl_bandwidth def_dl_bandwidth;
extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
--
2.7.4