Re: [PATCH 1/2] sched/deadline: add per rq tracking of admitted bandwidth

From: Peter Zijlstra
Date: Thu Mar 24 2016 - 05:21:12 EST


On Thu, Feb 25, 2016 at 11:20:34AM +0100, Peter Zijlstra wrote:
> On Thu, Feb 25, 2016 at 10:07:06AM +0000, Juri Lelli wrote:
> > Argh, this makes a lot of sense to me. I've actually pondered a tree/list
> > solution, but then decided to try the cumulative approach because it
> > looked nicer. But it contains holes, I'm afraid. As Luca already said,
> > GRUB shouldn't have these problems though.
> >
> > I'll try and see what introducing a list of blocked/throttled deadline
> > tasks means, considering also the interaction with cpusets and such.
> > Maybe it's simpler than it seems.
> >
> > I'm not sure this will come anytime soon, unfortunately. I'm almost 100%
> > on the sched-freq/schedutil discussion these days.
>
> Just skip sleep and write them when it's dark outside :-)
>
> > Anyway, do you also think that what we want to solve the root domain
> > issue is something based on rq_online/offline and per-rq information?
> > Everything else that I tried or thought of was broken/more horrible. :-/
>
> I was still trying to get my head around this, the above was my
> suggestion to the per-rq state, but I've not thought hard on alternative
> approaches to the root_domain issue.

So the below is the inactive list; it seems to not insta-explode when I
run a few simple dl proglets.

I don't particularly like it because it makes wakeups (esp. cross-cpu
ones) more expensive for the benefit of hotplug/cpusets which is
something that 'never' happens.

So what I'm going to try and do is forget all about this here patch and
see what I can do with a full task-list iteration on rebuild. But I
figured that since I wrote it and it might work, I might as well post
it.

---
include/linux/sched.h | 5 ++
kernel/sched/core.c | 6 ++-
kernel/sched/deadline.c | 118 ++++++++++++++++++++++++++++++++++++++++++++++--
kernel/sched/fair.c | 2 +-
kernel/sched/sched.h | 7 ++-
5 files changed, 132 insertions(+), 6 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index c617ea12c6b7..d9848eac35f2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1351,6 +1351,11 @@ struct sched_dl_entity {
* own bandwidth to be enforced, thus we need one timer per task.
*/
struct hrtimer dl_timer;
+
+#ifdef CONFIG_SMP
+ struct list_head dl_inactive_entry;
+ int dl_inactive_cpu;
+#endif
};

union rcu_special {
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0b21e7a724e1..7f3fab6349a4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1162,7 +1162,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)

if (task_cpu(p) != new_cpu) {
if (p->sched_class->migrate_task_rq)
- p->sched_class->migrate_task_rq(p);
+ p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
perf_event_task_migrate(p);
}
@@ -2077,6 +2077,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
RB_CLEAR_NODE(&p->dl.rb_node);
init_dl_task_timer(&p->dl);
__dl_clear_params(p);
+#ifdef CONFIG_SMP
+ INIT_LIST_HEAD(&p->dl.dl_inactive_entry);
+#endif

INIT_LIST_HEAD(&p->rt.run_list);
p->rt.timeout = 0;
@@ -5397,6 +5400,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
migrate_tasks(rq);
BUG_ON(rq->nr_running != 1); /* the migration thread */
raw_spin_unlock_irqrestore(&rq->lock, flags);
+ migrate_inactive_dl(rq);
break;

case CPU_DEAD:
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index c7a036facbe1..f999b8bb6fea 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -80,6 +80,9 @@ void init_dl_rq(struct dl_rq *dl_rq)
dl_rq->dl_nr_migratory = 0;
dl_rq->overloaded = 0;
dl_rq->pushable_dl_tasks_root = RB_ROOT;
+
+ raw_spin_lock_init(&dl_rq->dl_inactive_lock);
+ INIT_LIST_HEAD(&dl_rq->dl_inactive_list);
#else
init_dl_bw(&dl_rq->dl_bw);
#endif
@@ -289,6 +292,62 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
return later_rq;
}

+static void enqueue_inactive(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ raw_spin_lock(&dl_rq->dl_inactive_lock);
+ WRITE_ONCE(dl_se->dl_inactive_cpu, rq_of_dl_rq(dl_rq)->cpu);
+ list_add(&dl_se->dl_inactive_entry, &dl_rq->dl_inactive_list);
+ raw_spin_unlock(&dl_rq->dl_inactive_lock);
+}
+
+static void dequeue_inactive(struct sched_dl_entity *dl_se)
+{
+ int tmp, cpu = READ_ONCE(dl_se->dl_inactive_cpu);
+ struct rq *rq;
+
+again:
+ if (cpu == -1)
+ return;
+ rq = cpu_rq(cpu);
+
+ raw_spin_lock(&rq->dl.dl_inactive_lock);
+ tmp = READ_ONCE(dl_se->dl_inactive_cpu);
+ if (cpu != tmp) {
+ cpu = tmp;
+ raw_spin_unlock(&rq->dl.dl_inactive_lock);
+ goto again;
+ }
+ list_del_init(&dl_se->dl_inactive_entry);
+ WRITE_ONCE(dl_se->dl_inactive_cpu, -1);
+ raw_spin_unlock(&rq->dl.dl_inactive_lock);
+}
+
+static void migrate_inactive(struct sched_dl_entity *dl_se, int new_cpu)
+{
+ int tmp, cpu = READ_ONCE(dl_se->dl_inactive_cpu);
+ struct rq *src_rq, *dst_rq;
+
+ dst_rq = cpu_rq(new_cpu);
+again:
+ if (cpu == -1)
+ return;
+ src_rq = cpu_rq(cpu);
+
+ double_raw_lock(&src_rq->dl.dl_inactive_lock,
+ &dst_rq->dl.dl_inactive_lock);
+ tmp = READ_ONCE(dl_se->dl_inactive_cpu);
+ if (cpu != tmp) {
+ cpu = tmp;
+ raw_spin_unlock(&dst_rq->dl.dl_inactive_lock);
+ raw_spin_unlock(&src_rq->dl.dl_inactive_lock);
+ goto again;
+ }
+ list_move(&dl_se->dl_inactive_entry, &dst_rq->dl.dl_inactive_list);
+ WRITE_ONCE(dl_se->dl_inactive_cpu, new_cpu);
+ raw_spin_unlock(&dst_rq->dl.dl_inactive_lock);
+ raw_spin_unlock(&src_rq->dl.dl_inactive_lock);
+}
+
#else

static inline
@@ -327,6 +386,11 @@ static inline void queue_push_tasks(struct rq *rq)
static inline void queue_pull_task(struct rq *rq)
{
}
+
+static inline void enqueue_inactive(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) { }
+static inline void dequeue_inactive(struct sched_dl_entity *dl_se) { }
+static inline void migrate_inactive(struct sched_dl_entity *dl_se, int new_cpu) { }
+
#endif /* CONFIG_SMP */

static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
@@ -960,6 +1024,9 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
if (p->dl.dl_throttled && !(flags & ENQUEUE_REPLENISH))
return;

+ if (!(flags & ENQUEUE_RESTORE))
+ dequeue_inactive(&p->dl);
+
enqueue_dl_entity(&p->dl, pi_se, flags);

if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
@@ -970,6 +1037,8 @@ static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
{
dequeue_dl_entity(&p->dl);
dequeue_pushable_dl_task(rq, p);
+ if (!(flags & DEQUEUE_SAVE))
+ enqueue_inactive(&p->dl, &rq->dl);
}

static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
@@ -1074,6 +1143,34 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
resched_curr(rq);
}

+static void migrate_task_rq_dl(struct task_struct *p, int new_cpu)
+{
+ struct sched_dl_entity *dl_se = &p->dl;
+
+ if (list_empty(&dl_se->dl_inactive_entry))
+ return;
+
+ migrate_inactive(dl_se, new_cpu);
+}
+
+void migrate_inactive_dl(struct rq *src_rq)
+{
+ int cpu = cpumask_any_and(src_rq->rd->online, cpu_active_mask);
+ struct rq *dst_rq = cpu_rq(cpu);
+ struct sched_dl_entity *dl_se, *tmp;
+
+ double_raw_lock(&src_rq->dl.dl_inactive_lock,
+ &dst_rq->dl.dl_inactive_lock);
+
+ list_for_each_entry_safe(dl_se, tmp, &src_rq->dl.dl_inactive_list, dl_inactive_entry) {
+ WRITE_ONCE(dl_se->dl_inactive_cpu, cpu);
+ list_move(&dl_se->dl_inactive_entry, &dst_rq->dl.dl_inactive_list);
+ }
+
+ raw_spin_unlock(&dst_rq->dl.dl_inactive_lock);
+ raw_spin_unlock(&src_rq->dl.dl_inactive_lock);
+}
+
#endif /* CONFIG_SMP */

/*
@@ -1211,13 +1308,19 @@ static void task_dead_dl(struct task_struct *p)
{
struct dl_bw *dl_b = dl_bw_of(task_cpu(p));

+ local_irq_disable();
+
/*
* Since we are TASK_DEAD we won't slip out of the domain!
*/
- raw_spin_lock_irq(&dl_b->lock);
+ raw_spin_lock(&dl_b->lock);
/* XXX we should retain the bw until 0-lag */
dl_b->total_bw -= p->dl.dl_bw;
- raw_spin_unlock_irq(&dl_b->lock);
+ raw_spin_unlock(&dl_b->lock);
+
+ dequeue_inactive(&p->dl);
+
+ local_irq_enable();
}

static void set_curr_task_dl(struct rq *rq)
@@ -1702,7 +1805,12 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
* this is the right place to try to pull some other one
* from an overloaded cpu, if any.
*/
- if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
+ if (!task_on_rq_queued(p)) {
+ dequeue_inactive(&p->dl);
+ return;
+ }
+
+ if (rq->dl.dl_nr_running)
return;

queue_pull_task(rq);
@@ -1728,6 +1836,9 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
resched_curr(rq);
#endif
}
+
+ if (!task_on_rq_queued(p))
+ enqueue_inactive(&p->dl, &rq->dl);
}

/*
@@ -1779,6 +1890,7 @@ const struct sched_class dl_sched_class = {

#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_dl,
+ .migrate_task_rq = migrate_task_rq_dl,
.set_cpus_allowed = set_cpus_allowed_dl,
.rq_online = rq_online_dl,
.rq_offline = rq_offline_dl,
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 303d6392b389..04e856a85c0f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5231,7 +5231,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
* cfs_rq_of(p) references at time of call are still valid and identify the
* previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
*/
-static void migrate_task_rq_fair(struct task_struct *p)
+static void migrate_task_rq_fair(struct task_struct *p, int new_cpu)
{
/*
* We are supposed to update the task to "current" time, then its up to date
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e6d4a3fa3660..0de1e2894d22 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -517,6 +517,9 @@ struct dl_rq {
*/
struct rb_root pushable_dl_tasks_root;
struct rb_node *pushable_dl_tasks_leftmost;
+
+ raw_spinlock_t dl_inactive_lock;
+ struct list_head dl_inactive_list;
#else
struct dl_bw dl_bw;
#endif
@@ -776,6 +779,8 @@ extern int migrate_swap(struct task_struct *, struct task_struct *);

#ifdef CONFIG_SMP

+extern void migrate_inactive_dl(struct rq *src_rq);
+
static inline void
queue_balance_callback(struct rq *rq,
struct callback_head *head,
@@ -1205,7 +1210,7 @@ struct sched_class {

#ifdef CONFIG_SMP
int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
- void (*migrate_task_rq)(struct task_struct *p);
+ void (*migrate_task_rq)(struct task_struct *p, int new_cpu);

void (*task_waking) (struct task_struct *task);
void (*task_woken) (struct rq *this_rq, struct task_struct *task);