[PATCH 2/5] sched/BT: implement the BT scheduling class

From: xiaoggchen
Date: Fri Jun 21 2019 - 03:59:13 EST


From: chen xiaoguang <xiaoggchen@xxxxxxxxxxx>

Signed-off-by: Newton Gao <newtongao@xxxxxxxxxxx>
Signed-off-by: Shook Liu <shookliu@xxxxxxxxxxx>
Signed-off-by: Zhiguang Peng <zgpeng@xxxxxxxxxxx>
Signed-off-by: Xiaoguang Chen <xiaoggchen@xxxxxxxxxxx>
---
kernel/sched/bt.c | 1012 ++++++++++++++++++++++++++++++++++++++++++++++++++
kernel/sched/fair.c | 4 +-
kernel/sched/sched.h | 5 +-
3 files changed, 1018 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/bt.c b/kernel/sched/bt.c
index 56566e6..87eb04f 100644
--- a/kernel/sched/bt.c
+++ b/kernel/sched/bt.c
@@ -5,6 +5,8 @@

#include "sched.h"

+#include <trace/events/sched.h>
+
void init_bt_rq(struct bt_rq *bt_rq)
{
bt_rq->tasks_timeline = RB_ROOT;
@@ -14,4 +16,1014 @@ void init_bt_rq(struct bt_rq *bt_rq)
#endif
}

+static inline struct bt_rq *task_bt_rq(struct task_struct *p)
+{
+ return &task_rq(p)->bt;
+}
+
+static inline struct task_struct *bt_task_of(struct sched_bt_entity *bt_se)
+{
+ return container_of(bt_se, struct task_struct, bt);
+}
+
+static inline struct bt_rq *bt_rq_of(struct sched_bt_entity *bt_se)
+{
+ struct task_struct *p = bt_task_of(bt_se);
+ struct rq *rq = task_rq(p);
+
+ return &rq->bt;
+}
+
+static inline struct rq *rq_of_bt_rq(struct bt_rq *bt_rq)
+{
+ return container_of(bt_rq, struct rq, bt);
+}
+
+static u64 __calc_delta(u64 delta_exec, unsigned long weight,
+ struct load_weight *lw);
+
+/*
+ * delta /= w
+ */
+static inline unsigned long
+calc_delta_bt(unsigned long delta, struct sched_bt_entity *bt_se)
+{
+ if (unlikely(bt_se->load.weight != NICE_0_LOAD))
+ delta = __calc_delta(delta, NICE_0_LOAD, &bt_se->load);
+
+ return delta;
+}
+
+static inline void update_load_add(struct load_weight *lw, unsigned long inc)
+{
+ lw->weight += inc;
+ lw->inv_weight = 0;
+}
+
+/*
+ * The idea is to set a period in which each task runs once.
+ *
+ * When there are too many tasks (sched_nr_latency) we have to stretch
+ * this period because otherwise the slices get too small.
+ *
+ * p = (nr <= nl) ? l : l*nr/nl
+ */
+static u64 __sched_period(unsigned long nr_running)
+{
+if (unlikely(nr_running > sched_nr_latency))
+ return nr_running * sysctl_sched_min_granularity;
+else
+ return sysctl_sched_latency;
+}
+
+/*
+ * We calculate the wall-time slice from the period by taking a part
+ * proportional to the weight.
+ *
+ * s = p*P[w/rw]
+ */
+static u64 sched_bt_slice(struct bt_rq *bt_rq, struct sched_bt_entity *se)
+{
+ u64 slice = __sched_period(bt_rq->bt_nr_running + !se->on_rq);
+ struct load_weight *load;
+ struct load_weight lw;
+
+ bt_rq = bt_rq_of(se);
+ load = &bt_rq->load;
+
+ if (unlikely(!se->on_rq)) {
+ lw = bt_rq->load;
+ update_load_add(&lw, se->load.weight);
+ load = &lw;
+ }
+ slice = __calc_delta(slice, se->load.weight, load);
+ return slice;
+}
+
+/*
+ * We calculate the vruntime slice of a to-be-inserted task.
+ *
+ * vs = s/w
+ */
+static u64 sched_bt_vslice(struct bt_rq *bt_rq, struct sched_bt_entity *se)
+{
+ return calc_delta_bt(sched_bt_slice(bt_rq, se), se);
+}
+
+static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
+{
+ s64 delta = (s64)(vruntime - max_vruntime);
+
+ if (delta > 0)
+ max_vruntime = vruntime;
+
+ return max_vruntime;
+}
+
+static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
+{
+ s64 delta = (s64)(vruntime - min_vruntime);
+
+ if (delta < 0)
+ min_vruntime = vruntime;
+
+ return min_vruntime;
+}
+
+static inline int bt_entity_before(struct sched_bt_entity *a,
+ struct sched_bt_entity *b)
+{
+ return (s64)(a->vruntime - b->vruntime) < 0;
+}
+
+static void
+place_bt_entity(struct bt_rq *bt_rq, struct sched_bt_entity *se, int initial)
+{
+ u64 vruntime = bt_rq->min_vruntime;
+
+ /*
+ * The 'current' period is already promised to the current tasks,
+ * however the extra weight of the new task will slow them down a
+ * little, place the new task so that it fits in the slot that
+ * stays open at the end.
+ */
+ if (initial && sched_feat(START_DEBIT))
+ vruntime += sched_bt_vslice(bt_rq, se);
+
+ /* sleeps up to a single latency don't count. */
+ if (!initial) {
+ unsigned long thresh = sysctl_sched_latency;
+
+ /*
+ * Halve their sleep time's effect, to allow
+ * for a gentler effect of sleepers:
+ */
+ if (sched_feat(GENTLE_FAIR_SLEEPERS))
+ thresh >>= 1;
+
+ vruntime -= thresh;
+ }
+
+ /* ensure we never gain time by being placed backwards. */
+ se->vruntime = max_vruntime(se->vruntime, vruntime);
+}
+
+static void update_bt_min_vruntime(struct bt_rq *bt_rq)
+{
+ u64 vruntime = bt_rq->min_vruntime;
+
+ if (bt_rq->curr)
+ vruntime = bt_rq->curr->vruntime;
+
+ if (bt_rq->rb_leftmost) {
+ struct sched_bt_entity *bt_se = rb_entry(bt_rq->rb_leftmost,
+ struct sched_bt_entity,
+ run_node);
+
+ if (!bt_rq->curr)
+ vruntime = bt_se->vruntime;
+ else
+ vruntime = min_vruntime(vruntime, bt_se->vruntime);
+ }
+
+ /* ensure we never gain time by being placed backwards. */
+ bt_rq->min_vruntime = max_vruntime(bt_rq->min_vruntime, vruntime);
+#ifndef CONFIG_64BIT
+ smp_wmb();
+ bt_rq->min_vruntime_copy = bt_rq->min_vruntime;
+#endif
+}
+
+/*
+ * Update the current task's runtime statistics. Skip current tasks that
+ * are not in our scheduling class.
+ */
+static inline void
+__update_curr_bt(struct bt_rq *bt_rq, struct sched_bt_entity *curr,
+ unsigned long delta_exec)
+{
+ unsigned long delta_exec_weighted;
+
+ schedstat_set(curr->statistics->exec_max,
+ max((u64)delta_exec, curr->statistics->exec_max));
+
+ curr->sum_exec_runtime += delta_exec;
+ schedstat_add(bt_rq->exec_clock, delta_exec);
+ delta_exec_weighted = calc_delta_bt(delta_exec, curr);
+
+ curr->vruntime += delta_exec_weighted;
+ update_bt_min_vruntime(bt_rq);
+}
+
+static void
+account_bt_entity_enqueue(struct bt_rq *bt_rq, struct sched_bt_entity *se)
+{
+ update_load_add(&bt_rq->load, se->load.weight);
+ bt_rq->bt_nr_running++;
+}
+
+static inline void
+update_stats_wait_start_bt(struct bt_rq *bt_rq, struct sched_bt_entity *bt_se)
+{
+ schedstat_set(bt_se->statistics->wait_start, rq_of_bt_rq(bt_rq)->clock);
+}
+
+static void update_stats_enqueue_bt(struct bt_rq *bt_rq,
+ struct sched_bt_entity *se)
+{
+ /*
+ * Are we enqueueing a waiting task? (for current tasks
+ * a dequeue/enqueue event is a NOP)
+ */
+ if (se != bt_rq->curr)
+ update_stats_wait_start_bt(bt_rq, se);
+}
+
+static void update_curr(struct bt_rq *bt_rq)
+{
+ struct sched_bt_entity *curr = bt_rq->curr;
+ u64 now = rq_of_bt_rq(bt_rq)->clock_task;
+ unsigned long delta_exec;
+ struct task_struct *tsk;
+
+ if (unlikely(!curr))
+ return;
+
+ /*
+ * Get the amount of time the current task was running
+ * since the last time we changed load (this cannot
+ * overflow on 32 bits):
+ */
+ delta_exec = (unsigned long)(now - curr->exec_start);
+ if (!delta_exec)
+ return;
+
+ __update_curr_bt(bt_rq, curr, delta_exec);
+ curr->exec_start = now;
+
+ tsk = bt_task_of(curr);
+ trace_sched_stat_runtime(tsk, delta_exec, curr->vruntime);
+ cpuacct_charge(tsk, delta_exec);
+}
+
+static void __enqueue_bt_entity(struct bt_rq *bt_rq,
+ struct sched_bt_entity *bt_se)
+{
+ struct rb_node **link = &bt_rq->tasks_timeline.rb_node;
+ struct rb_node *parent = NULL;
+ struct sched_bt_entity *entry;
+ int leftmost = 1;
+
+ /*
+ * Find the right place in the rbtree:
+ */
+ while (*link) {
+ parent = *link;
+ entry = rb_entry(parent, struct sched_bt_entity, run_node);
+ /*
+ * We dont care about collisions. Nodes with
+ * the same key stay together.
+ */
+ if (bt_entity_before(bt_se, entry)) {
+ link = &parent->rb_left;
+ } else {
+ link = &parent->rb_right;
+ leftmost = 0;
+ }
+ }
+
+ /*
+ * Maintain a cache of leftmost tree entries (it is frequently
+ * used):
+ */
+ if (leftmost)
+ bt_rq->rb_leftmost = &bt_se->run_node;
+
+ rb_link_node(&bt_se->run_node, parent, link);
+ rb_insert_color(&bt_se->run_node, &bt_rq->tasks_timeline);
+}
+
+
+static void
+enqueue_bt_entity(struct bt_rq *bt_rq, struct sched_bt_entity *bt_se, int flags)
+{
+ bool renorm = !(flags && ENQUEUE_WAKEUP) || (flags & ENQUEUE_MIGRATED);
+
+ /*
+ * Update the normalized vruntime before updating min_vruntime
+ * through callig update_curr().
+ */
+ if (renorm && (bt_rq->curr == bt_se))
+ bt_se->vruntime += bt_rq->min_vruntime;
+
+ /*
+ * Update run-time statistics of the 'current'.
+ */
+ update_curr(bt_rq);
+ account_bt_entity_enqueue(bt_rq, bt_se);
+
+ if (flags & ENQUEUE_WAKEUP)
+ place_bt_entity(bt_rq, bt_se, 0);
+
+ update_stats_enqueue_bt(bt_rq, bt_se);
+ if (bt_se != bt_rq->curr)
+ __enqueue_bt_entity(bt_rq, bt_se);
+ bt_se->on_rq = 1;
+}
+
+/*
+ * The enqueue_task method is called before nr_running is
+ * increased. Here we update the fair scheduling stats and
+ * then put the task into the rbtree:
+ */
+static void
+enqueue_task_bt(struct rq *rq, struct task_struct *p, int flags)
+{
+ struct bt_rq *bt_rq;
+ struct sched_bt_entity *se = &p->bt;
+
+ if (se->on_rq)
+ return;
+
+ bt_rq = bt_rq_of(se);
+ flags = ENQUEUE_WAKEUP;
+ enqueue_bt_entity(bt_rq, se, flags);
+
+ add_nr_running(rq, 1);
+}
+
+static void
+update_stats_wait_end_bt(struct bt_rq *bt_rq, struct sched_bt_entity *se)
+{
+ schedstat_set(se->statistics->wait_max, max(se->statistics->wait_max,
+ rq_of_bt_rq(bt_rq)->clock - se->statistics->wait_start));
+ schedstat_set(se->statistics->wait_count, se->statistics->wait_count
+ + 1);
+ schedstat_set(se->statistics->wait_sum, se->statistics->wait_sum +
+ rq_of_bt_rq(bt_rq)->clock - se->statistics->wait_start);
+#ifdef CONFIG_SCHEDSTATS
+ trace_sched_stat_wait(bt_task_of(se),
+ rq_of_bt_rq(bt_rq)->clock - se->statistics->wait_start);
+#endif
+ schedstat_set(se->statistics->wait_start, 0);
+}
+
+static inline void
+update_stats_dequeue_bt(struct bt_rq *bt_rq, struct sched_bt_entity *se)
+{
+ /*
+ * Mark the end of the wait period if dequeueing a
+ * waiting task:
+ */
+ if (se != bt_rq->curr)
+ update_stats_wait_end_bt(bt_rq, se);
+}
+
+static void __clear_buddies_next_bt(struct sched_bt_entity *se)
+{
+ struct bt_rq *bt_rq = bt_rq_of(se);
+
+ bt_rq->next = NULL;
+}
+
+static void __clear_buddies_skip_bt(struct sched_bt_entity *se)
+{
+ struct bt_rq *bt_rq = bt_rq_of(se);
+
+ bt_rq->skip = NULL;
+}
+
+static void set_skip_buddy_bt(struct sched_bt_entity *se)
+{
+ bt_rq_of(se)->skip = se;
+}
+
+static void clear_buddies_bt(struct bt_rq *bt_rq, struct sched_bt_entity *se)
+{
+ if (bt_rq->next == se)
+ __clear_buddies_next_bt(se);
+ if (bt_rq->skip == se)
+ __clear_buddies_skip_bt(se);
+}
+
+static void __dequeue_bt_entity(struct bt_rq *bt_rq,
+ struct sched_bt_entity *bt_se)
+{
+ if (bt_rq->rb_leftmost == &bt_se->run_node) {
+ struct rb_node *next_node;
+
+ next_node = rb_next(&bt_se->run_node);
+ bt_rq->rb_leftmost = next_node;
+ }
+ rb_erase(&bt_se->run_node, &bt_rq->tasks_timeline);
+}
+
+static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
+{
+ lw->weight -= dec;
+ lw->inv_weight = 0;
+}
+
+static void
+account_bt_entity_dequeue(struct bt_rq *bt_rq, struct sched_bt_entity *se)
+{
+ update_load_sub(&bt_rq->load, se->load.weight);
+ bt_rq->bt_nr_running--;
+}
+
+static void
+dequeue_bt_entity(struct bt_rq *bt_rq, struct sched_bt_entity *se, int flags)
+{
+ struct task_struct *tsk = bt_task_of(se);
+ /*
+ * Update run-time statistics of the 'current'.
+ */
+ update_curr(bt_rq);
+
+ update_stats_dequeue_bt(bt_rq, se);
+ if (flags & DEQUEUE_SLEEP) {
+#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_LATENCYTOP)
+ if (tsk->state & TASK_INTERRUPTIBLE)
+ se->statistics->sleep_start = rq_of_bt_rq(bt_rq)->clock;
+ if (tsk->state & TASK_UNINTERRUPTIBLE)
+ se->statistics->block_start = rq_of_bt_rq(bt_rq)->clock;
+ }
+#endif
+ clear_buddies_bt(bt_rq, se);
+
+ if (se != bt_rq->curr)
+ __dequeue_bt_entity(bt_rq, se);
+ se->on_rq = 0;
+ account_bt_entity_dequeue(bt_rq, se);
+
+ /*
+ * Normalize the entity after updating the min_vruntime because the
+ * update can refer to the ->curr item and we need to reflect this
+ * movement in our normalized position.
+ */
+ if (!(flags & DEQUEUE_SLEEP))
+ se->vruntime -= bt_rq->min_vruntime;
+
+ update_bt_min_vruntime(bt_rq);
+}
+
+/*
+ * The dequeue_task method is called before nr_running is
+ * decreased. We remove the task from the rbtree and
+ * update the fair scheduling stats:
+ */
+static void dequeue_task_bt(struct rq *rq, struct task_struct *p, int flags)
+{
+ struct bt_rq *bt_rq;
+ struct sched_bt_entity *se = &p->bt;
+
+ bt_rq = bt_rq_of(se);
+ flags = DEQUEUE_SLEEP;
+ dequeue_bt_entity(bt_rq, se, flags);
+
+ bt_rq->bt_nr_running--;
+
+ sub_nr_running(rq, 1);
+}
+
+static void update_curr_bt(struct rq *rq)
+{
+ update_curr(&rq->bt);
+}
+
+/*
+ * sched_yield() is very simple
+ *
+ * The magic of dealing with the ->skip buddy is in pick_next_entity.
+ */
+static void yield_task_bt(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+ struct bt_rq *bt_rq = task_bt_rq(curr);
+ struct sched_bt_entity *se = &curr->bt;
+
+ /*
+ * Are we the only task in the tree?
+ */
+ if (unlikely(rq->nr_running == 1))
+ return;
+
+ clear_buddies_bt(bt_rq, se);
+ update_rq_clock(rq);
+ /*
+ * Update run-time statistics of the 'current'.
+ */
+ update_curr(bt_rq);
+ rq_clock_skip_update(rq);
+ set_skip_buddy_bt(se);
+}
+
+struct sched_bt_entity *__pick_first_bt_entity(struct bt_rq *bt_rq)
+{
+ struct rb_node *left = bt_rq->rb_leftmost;
+
+ if (!left)
+ return NULL;
+
+ return rb_entry(left, struct sched_bt_entity, run_node);
+}
+
+
+#define WMULT_CONST (~0U)
+#define WMULT_SHIFT 32
+
+static void __update_inv_weight(struct load_weight *lw)
+{
+ unsigned long w;
+
+ if (likely(lw->inv_weight))
+ return;
+
+ w = scale_load_down(lw->weight);
+
+ if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
+ lw->inv_weight = 1;
+ else if (unlikely(!w))
+ lw->inv_weight = WMULT_CONST;
+ else
+ lw->inv_weight = WMULT_CONST / w;
+}
+
+/*
+ * delta_exec * weight / lw.weight
+ * OR
+ * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
+ *
+ * Either weight := NICE_0_LOAD and lw \e sched_prio_to_wmult[], in which case
+ * we're guaranteed shift stays positive because inv_weight is guaranteed to
+ * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
+ *
+ * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
+ * weight/lw.weight <= 1, and therefore our shift will also be positive.
+ */
+static u64 __calc_delta(u64 delta_exec, unsigned long weight,
+ struct load_weight *lw)
+{
+ u64 fact = scale_load_down(weight);
+ int shift = WMULT_SHIFT;
+
+ __update_inv_weight(lw);
+
+ if (unlikely(fact >> 32)) {
+ while (fact >> 32) {
+ fact >>= 1;
+ shift--;
+ }
+ }
+
+ /* hint to use a 32x32->64 mul */
+ fact = (u64)(u32)fact * lw->inv_weight;
+
+ while (fact >> 32) {
+ fact >>= 1;
+ shift--;
+ }
+
+ return mul_u64_u32_shr(delta_exec, fact, shift);
+}
+
+static unsigned long
+wakeup_gran_bt(struct sched_bt_entity *curr, struct sched_bt_entity *se)
+{
+ unsigned long gran = sysctl_sched_wakeup_granularity;
+
+ /*
+ * Since its curr running now, convert the gran from real-time
+ * to virtual-time in his units.
+ *
+ * By using 'se' instead of 'curr' we penalize light tasks, so
+ * they get preempted easier. That is, if 'se' < 'curr' then
+ * the resulting gran will be larger, therefore penalizing the
+ * lighter, if otoh 'se' > 'curr' then the resulting gran will
+ * be smaller, again penalizing the lighter task.
+ *
+ * This is especially important for buddies when the leftmost
+ * task is higher priority than the buddy.
+ */
+ return calc_delta_bt(gran, se);
+}
+
+
+static int
+wakeup_preempt_bt_entity(struct sched_bt_entity *curr,
+ struct sched_bt_entity *se)
+{
+ s64 gran, vdiff = curr->vruntime - se->vruntime;
+
+ if (vdiff <= 0)
+ return -1;
+
+ gran = wakeup_gran_bt(curr, se);
+ if (vdiff > gran)
+ return 1;
+
+ return 0;
+}
+
+/*
+ * Preempt the current task with a newly woken task if needed:
+ */
+static void
+check_preempt_tick_bt(struct bt_rq *bt_rq, struct sched_bt_entity *curr)
+{
+ unsigned long ideal_runtime, delta_exec;
+ struct sched_bt_entity *se;
+ s64 delta;
+
+ ideal_runtime = sched_bt_slice(bt_rq, curr);
+ delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
+ if (delta_exec > ideal_runtime) {
+ resched_curr(rq_of_bt_rq(bt_rq));
+ /*
+ * The current task ran long enough, ensure it doesn't get
+ * re-elected due to buddy favours.
+ */
+ clear_buddies_bt(bt_rq, curr);
+ return;
+ }
+
+ /*
+ * Ensure that a task that missed wakeup preemption by a
+ * narrow margin doesn't have to wait for a full slice.
+ * This also mitigates buddy induced latencies under load.
+ */
+ if (delta_exec < sysctl_sched_min_granularity)
+ return;
+
+ se = __pick_first_bt_entity(bt_rq);
+ delta = curr->vruntime - se->vruntime;
+
+ if (delta < 0)
+ return;
+
+ if (delta > ideal_runtime)
+ resched_curr(rq_of_bt_rq(bt_rq));
+}
+
+static void set_next_buddy_bt(struct sched_bt_entity *se)
+{
+ if (unlikely(task_has_idle_policy(bt_task_of(se))))
+ return;
+
+ bt_rq_of(se)->next = se;
+}
+
+/*
+ * Preempt the current task with a newly woken task if needed:
+ */
+static void check_preempt_wakeup_bt(struct rq *rq, struct task_struct *p,
+ int wake_flags)
+{
+ struct task_struct *curr = rq->curr;
+ struct sched_bt_entity *se = &curr->bt, *pse = &p->bt;
+ struct bt_rq *bt_rq = task_bt_rq(curr);
+ int scale = bt_rq->bt_nr_running >= sched_nr_latency;
+ int next_buddy_marked = 0;
+
+ if (unlikely(se == pse))
+ return;
+
+ if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
+ set_next_buddy_bt(pse);
+ next_buddy_marked = 1;
+ }
+
+ /*
+ * We can come here with TIF_NEED_RESCHED already set from new task
+ * wake up path.
+ *
+ * Note: this also catches the edge-case of curr being in a throttled
+ * group (e.g. via set_curr_task), since update_curr() (in the
+ * enqueue of curr) will have resulted in resched being set. This
+ * prevents us from potentially nominating it as a false LAST_BUDDY
+ * below.
+ */
+ if (test_tsk_need_resched(curr))
+ return;
+
+ /* BT tasks are by definition preempted by non-bt tasks. */
+ if (likely(p->policy < SCHED_BT))
+ goto preempt;
+
+ if (!sched_feat(WAKEUP_PREEMPTION))
+ return;
+
+ update_curr(bt_rq_of(se));
+ BUG_ON(!pse);
+ if (wakeup_preempt_bt_entity(se, pse) == 1) {
+ /*
+ * Bias pick_next to pick the sched entity that is
+ * triggering this preemption.
+ */
+ if (!next_buddy_marked)
+ set_next_buddy_bt(pse);
+ goto preempt;
+ }
+
+ return;
+
+preempt:
+ resched_curr(rq);
+ /*
+ * Only set the backward buddy when the current task is still
+ * on the rq. This can happen when a wakeup gets interleaved
+ * with schedule on the ->pre_schedule() or idle_balance()
+ * point, either of which can * drop the rq lock.
+ *
+ * Also, during early boot the idle thread is in the fair class,
+ * for obvious reasons its a bad idea to schedule back to it.
+ */
+ if (unlikely(!se->on_rq || curr == rq->idle))
+ return;
+}
+
+static struct sched_bt_entity *__pick_next_bt_entity(
+ struct sched_bt_entity *bt_se)
+{
+ struct rb_node *next = rb_next(&bt_se->run_node);
+
+ if (!next)
+ return NULL;
+
+ return rb_entry(next, struct sched_bt_entity, run_node);
+}
+
+/*
+ * We are picking a new current task - update its stats:
+ */
+static inline void
+update_stats_curr_start_bt(struct bt_rq *bt_rq, struct sched_bt_entity *se)
+{
+ /*
+ * We are starting a new run period:
+ */
+ se->exec_start = rq_of_bt_rq(bt_rq)->clock_task;
+}
+
+static void
+set_next_bt_entity(struct bt_rq *bt_rq, struct sched_bt_entity *se)
+{
+ /* 'current' is not kept within the tree. */
+ if (se->on_rq) {
+ /*
+ * Any task has to be enqueued before it get to execute on
+ * a CPU. So account for the time it spent waiting on the
+ * runqueue.
+ */
+ update_stats_wait_end_bt(bt_rq, se);
+ __dequeue_bt_entity(bt_rq, se);
+ }
+
+ update_stats_curr_start_bt(bt_rq, se);
+ bt_rq->curr = se;
+#ifdef CONFIG_SCHEDSTATS
+ /*
+ * Track our maximum slice length, if the CPU's load is at
+ * least twice that of our own weight (i.e. dont track it
+ * when there are only lesser-weight tasks around):
+ */
+ if (bt_rq->load.weight >= 2*se->load.weight) {
+ se->statistics->slice_max = max(se->statistics->slice_max,
+ se->sum_exec_runtime - se->prev_sum_exec_runtime);
+ }
+#endif
+ se->prev_sum_exec_runtime = se->sum_exec_runtime;
+}
+
+/*
+ * Pick the next process, keeping these things in mind, in this order:
+ * 1) keep things fair between processes/task groups
+ * 2) pick the "next" process, since someone really wants that to run
+ * 3) do not run the "skip" process, if something else is available
+ */
+static struct sched_bt_entity *pick_next_bt_entity(struct bt_rq *bt_rq)
+{
+ struct sched_bt_entity *se = __pick_first_bt_entity(bt_rq);
+ struct sched_bt_entity *left = se;
+
+ /*
+ * Avoid running the skip buddy, if running something else can
+ * be done without getting too unfair.
+ */
+ if (bt_rq->skip == se) {
+ struct sched_bt_entity *second = __pick_next_bt_entity(se);
+
+ if (second && wakeup_preempt_bt_entity(second, left) < 1)
+ se = second;
+ }
+
+ /*
+ * Someone really wants this to run. If it's not unfair, run it.
+ */
+ if (bt_rq->next && wakeup_preempt_bt_entity(bt_rq->next, left) < 1)
+ se = bt_rq->next;
+
+ clear_buddies_bt(bt_rq, se);
+
+ return se;
+}
+
+
+static struct task_struct *
+pick_next_task_bt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
+{
+ struct task_struct *p;
+ struct bt_rq *bt_rq;
+ struct sched_bt_entity *se;
+
+ bt_rq = &rq->bt;
+
+ if (!bt_rq->bt_nr_running)
+ return NULL;
+
+ put_prev_task(rq, prev);
+
+ se = pick_next_bt_entity(bt_rq);
+ set_next_bt_entity(bt_rq, se);
+
+ p = bt_task_of(se);
+
+ return p;
+}
+
+static void put_prev_bt_entity(struct bt_rq *bt_rq,
+ struct sched_bt_entity *prev)
+{
+ /*
+ * If still on the runqueue then deactivate_task()
+ * was not called and update_curr() has to be done:
+ */
+ if (prev->on_rq)
+ update_curr(bt_rq);
+
+ if (prev->on_rq) {
+ update_stats_wait_start_bt(bt_rq, prev);
+ /* Put 'current' back into the tree. */
+ __enqueue_bt_entity(bt_rq, prev);
+ }
+ bt_rq->curr = NULL;
+}
+
+/*
+ * Account for a descheduled task:
+ */
+static void put_prev_task_bt(struct rq *rq, struct task_struct *prev)
+{
+ struct sched_bt_entity *se = &prev->bt;
+ struct bt_rq *bt_rq;
+
+ bt_rq = bt_rq_of(se);
+ put_prev_bt_entity(bt_rq, se);
+}
+
+#ifdef CONFIG_SMP
+static int
+select_task_rq_bt(struct task_struct *p, int prev_cpu, int sd_flag,
+ int wake_flags)
+{
+ struct sched_domain *tmp;
+ int cpu = smp_processor_id();
+ int new_cpu = cpu;
+
+ if (p->nr_cpus_allowed == 1)
+ return prev_cpu;
+
+ rcu_read_lock();
+ for_each_domain(cpu, tmp) {
+ if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
+ new_cpu = cpu;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return new_cpu;
+}
+#endif
+/* Account for a task changing its policy or group.
+ *
+ * This routine is mostly called to set bt_rq->curr field when a task
+ * migrates between groups/classes.
+ */
+static void set_curr_task_bt(struct rq *rq)
+{
+ struct sched_bt_entity *se = &rq->curr->bt;
+ struct bt_rq *bt_rq = bt_rq_of(se);
+
+ set_next_bt_entity(bt_rq, se);
+}
+
+static void
+bt_entity_tick(struct bt_rq *bt_rq, struct sched_bt_entity *curr, int queued)
+{
+ /*
+ * Update run-time statistics of the 'current'.
+ */
+ update_curr(bt_rq);
+
+#ifdef CONFIG_SCHED_HRTICK
+ /*
+ * queued ticks are scheduled to match the slice, so don't bother
+ * validating it and just reschedule.
+ */
+ if (queued) {
+ resched_curr(rq_of_bt_rq(bt_rq));
+ return;
+ }
+#endif
+
+ if (bt_rq->bt_nr_running > 1)
+ check_preempt_tick_bt(bt_rq, curr);
+}
+
+/*
+ * scheduler tick hitting a task of our scheduling class:
+ */
+static void task_tick_bt(struct rq *rq, struct task_struct *curr, int queued)
+{
+ struct bt_rq *bt_rq;
+ struct sched_bt_entity *se = &curr->bt;
+
+ bt_rq = bt_rq_of(se);
+ bt_entity_tick(bt_rq, se, queued);
+}
+/*
+ * Priority of the task has changed. Check to see if we preempt
+ * the current task.
+ */
+static void
+prio_changed_bt(struct rq *rq, struct task_struct *p, int oldprio)
+{
+ if (!p->bt.on_rq)
+ return;
+
+ /*
+ * Reschedule if we are currently running on this runqueue and
+ * our priority decreased, or if we are not currently running on
+ * this runqueue and our priority is higher than the current's
+ */
+ if (rq->curr == p) {
+ if (p->prio > oldprio)
+ resched_curr(rq);
+ } else
+ check_preempt_curr(rq, p, 0);
+}
+
+/*
+ * We switched to the sched_fair class.
+ */
+static void switched_to_bt(struct rq *rq, struct task_struct *p)
+{
+ if (!p->bt.on_rq)
+ return;
+ /*
+ * We were most likely switched from sched_rt, so
+ * kick off the schedule if running, otherwise just see
+ * if we can still preempt the current task.
+ */
+ if (rq->curr == p)
+ resched_curr(rq);
+ else
+ check_preempt_curr(rq, p, 0);
+}
+
+static unsigned int get_rr_interval_bt(struct rq *rq, struct task_struct *task)
+{
+ struct sched_bt_entity *se = &task->bt;
+ unsigned int rr_interval = 0;
+
+ /*
+ * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
+ * idle runqueue:
+ */
+ if (rq->bt.load.weight)
+ rr_interval = NS_TO_JIFFIES(sched_bt_slice(bt_rq_of(se), se));
+
+ return rr_interval;
+}
+
+const struct sched_class bt_sched_class = {
+ .next = &idle_sched_class,
+ .enqueue_task = enqueue_task_bt,
+ .dequeue_task = dequeue_task_bt,
+ .yield_task = yield_task_bt,
+ .check_preempt_curr = check_preempt_wakeup_bt,
+ .pick_next_task = pick_next_task_bt,
+ .put_prev_task = put_prev_task_bt,
+#ifdef CONFIG_SMP
+ .select_task_rq = select_task_rq_bt,
+ .set_cpus_allowed = set_cpus_allowed_common,
+#endif
+ .set_curr_task = set_curr_task_bt,
+ .task_tick = task_tick_bt,
+ .prio_changed = prio_changed_bt,
+ .switched_to = switched_to_bt,
+ .get_rr_interval = get_rr_interval_bt,
+ .update_curr = update_curr_bt,
+};

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f35930f..5299ce8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -64,7 +64,7 @@
/*
* This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
*/
-static unsigned int sched_nr_latency = 8;
+unsigned int sched_nr_latency = 8;

/*
* After fork, child runs first. If set to 0 (default) then
@@ -10653,7 +10653,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
* All the scheduling class methods:
*/
const struct sched_class fair_sched_class = {
- .next = &idle_sched_class,
+ .next = &bt_sched_class,
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 320c80d..68007be 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -593,7 +593,7 @@ struct bt_rq {
struct rb_root tasks_timeline;
struct rb_node *rb_leftmost;

- struct sched_bt_entity *curr, *next;
+ struct sched_bt_entity *curr, *next, *skip;
};

static inline int rt_bandwidth_enabled(void)
@@ -1754,6 +1754,7 @@ static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
extern const struct sched_class rt_sched_class;
extern const struct sched_class fair_sched_class;
extern const struct sched_class idle_sched_class;
+extern const struct sched_class bt_sched_class;


#ifdef CONFIG_SMP
@@ -2362,4 +2363,6 @@ static inline bool sched_energy_enabled(void)
#define perf_domain_span(pd) NULL
static inline bool sched_energy_enabled(void) { return false; }

+extern unsigned int sched_nr_latency;
+
#endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
--
1.8.3.1