Re: [RFC PATCH 3/3] sched: introduce synchronized idle injection
From: Jacob Pan
Date: Thu Nov 05 2015 - 18:47:39 EST
On Thu, 5 Nov 2015 14:59:52 +0100
Peter Zijlstra <peterz@xxxxxxxxxxxxx> wrote:
> On Tue, Nov 03, 2015 at 02:31:20PM +0100, Peter Zijlstra wrote:
> > > @@ -5136,6 +5148,16 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev)
> > > struct task_struct *p;
> > > int new_tasks;
> > >
> > > +#ifdef CONFIG_CFS_IDLE_INJECT
> > > + if (cfs_rq->force_throttled &&
> > > + !idle_cpu(cpu_of(rq)) &&
> > > + !unlikely(local_softirq_pending())) {
> > > + /* forced idle, pick no task */
> > > + trace_sched_cfs_idle_inject(cpu_of(rq), 1);
> > > + update_curr(cfs_rq);
> > > + return NULL;
> > > + }
> > > +#endif
> > > again:
> > > #ifdef CONFIG_FAIR_GROUP_SCHED
> > > if (!cfs_rq->nr_running)
> >
> > So this is horrible...
>
> So this isn't ideal either (I rather liked the previous approach of a
> random task assuming idle, but tglx hated that). This should at least
> not touch extra cachelines in the hot paths, although it does add a
> few extra instructions :/
>
> Very limited testing didn't show anything horrible.
>
I did some testing with the code below; it shows random messages like these:
[ 150.442597] NOHZ: local_softirq_pending 02
[ 153.032673] NOHZ: local_softirq_pending 202
[ 153.203785] NOHZ: local_softirq_pending 202
[ 153.206486] NOHZ: local_softirq_pending 282
I recall that this was why I checked for local_softirq_pending in the
initial patch; I am still trying to find out how we can avoid that. These
also cause non-stop sched ticks in the inner idle loop.
> Your throttle would:
>
> raw_spin_lock_irqsave(&rq->lock, flags);
> rq->cfs.forced_idle = true;
> resched = rq->cfs.runnable;
> rq->cfs.runnable = false;
> raw_spin_unlock_irqrestore(&rq->lock, flags);
> if (resched)
> resched_cpu(cpu_of(rq));
>
> And your unthrottle:
>
> raw_spin_lock_irqsave(&rq->lock, flags);
> rq->cfs.forced_idle = false;
> resched = rq->cfs.runnable = !!rq->cfs.nr_running;
> raw_spin_unlock_irqrestore(&rq->lock, flags);
> if (resched)
> resched_cpu(cpu_of(rq));
>
> ---
> kernel/sched/fair.c | 13 +++++++++----
> kernel/sched/sched.h | 1 +
> 2 files changed, 10 insertions(+), 4 deletions(-)
>
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 824aa9f..1f0c809 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -2341,7 +2341,8 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
> list_add(&se->group_node, &rq->cfs_tasks);
> }
> #endif
> - cfs_rq->nr_running++;
> + if (!cfs_rq->nr_running++ && !cfs_rq->forced_idle)
> + cfs_rq->runnable = true;
> }
>
> static void
> @@ -2354,7 +2355,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
> account_numa_dequeue(rq_of(cfs_rq), task_of(se));
> list_del_init(&se->group_node);
> }
> - cfs_rq->nr_running--;
> + if (!--cfs_rq->nr_running && !cfs_rq->forced_idle)
> + cfs_rq->runnable = false;
> }
>
> #ifdef CONFIG_FAIR_GROUP_SCHED
> @@ -5204,7 +5206,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev)
> again:
> #ifdef CONFIG_FAIR_GROUP_SCHED
> - if (!cfs_rq->nr_running)
> + if (!cfs_rq->runnable)
> goto idle;
>
> if (prev->sched_class != &fair_sched_class)
> @@ -5283,7 +5285,7 @@ simple:
> cfs_rq = &rq->cfs;
> #endif
>
> - if (!cfs_rq->nr_running)
> + if (!cfs_rq->runnable)
> goto idle;
>
> put_prev_task(rq, prev);
> @@ -5302,6 +5304,9 @@ simple:
> return p;
>
> idle:
> + if (cfs_rq->forced_idle)
> + return NULL;
> +
> /*
> * This is OK, because current is on_cpu, which avoids it being picked
> * for load-balance and preemption/IRQs are still disabled avoiding
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index efd3bfc..33d355d 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -347,6 +347,7 @@ struct cfs_bandwidth { };
> struct cfs_rq {
> struct load_weight load;
> unsigned int nr_running, h_nr_running;
> + unsigned int runnable, forced_idle;
>
> u64 exec_clock;
> u64 min_vruntime;
[Jacob Pan]
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/