Re: [PATCH 2/3] RT: Cache cpus_allowed weight for optimizing migration

From: Steven Rostedt
Date: Thu Oct 25 2007 - 11:49:23 EST



--
On Thu, 25 Oct 2007, Gregory Haskins wrote:

> Some RT tasks (particularly kthreads) are bound to one specific CPU.
> It is fairly common for one or more bound tasks to get queued up at the
> same time. Consider, for instance, softirq_timer and softirq_sched. A
> timer goes off in an ISR which schedules softirq_thread to run at RT50.
> Then during the handling of the timer, the system determines that it's
> time to smp-rebalance the system so it schedules softirq_sched to run
> from within the softirq_timer kthread context. So we are in a situation
> where we have two RT50 tasks queued, and the system will go into
> rt-overload condition to request other CPUs for help.
>
> The problem is that these tasks cannot ever be pulled away since they
> are already running on their one and only valid RQ. However, the other
> CPUs cannot determine that the tasks are unpullable without going
> through expensive checks/locking. Therefore the helping CPUS
> experience unecessary overhead/latencies regardless as they
> ineffectively try to process the overload condition.
>
> This patch tries to optimize the situation by utilizing the hamming
> weight of the task->cpus_allowed mask. A weight of 1 indicates that
> the task cannot be migrated, which may be utilized by the overload
> handling code to eliminate uncessary rebalance attempts. We also
> introduce a per-rq variable to count the number of migratable tasks
> that are currently running. We only go into overload if we have more
> than one rt task, AND at least one of them is migratable.
>
> Calculating the weight is probably relatively expensive, so it is only
> done when the cpus_allowed mask is updated (which should be relatively
> infrequent, especially compared to scheduling frequency) and cached in
> the task_struct.
>
>
> Signed-off-by: Gregory Haskins <ghaskins@xxxxxxxxxx>
> ---
>
> include/linux/sched.h | 2 +
> kernel/fork.c | 1
> kernel/sched.c | 9 +++-
> kernel/sched_rt.c | 116 +++++++++++++++++++++++++++++++++++++++++--------
> 4 files changed, 107 insertions(+), 21 deletions(-)
>
> diff --git a/include/linux/sched.h b/include/linux/sched.h
> index 7a3829f..829de6f 100644
> --- a/include/linux/sched.h
> +++ b/include/linux/sched.h
> @@ -1048,6 +1048,7 @@ struct sched_class {
> void (*set_curr_task) (struct rq *rq);
> void (*task_tick) (struct rq *rq, struct task_struct *p);
> void (*task_new) (struct rq *rq, struct task_struct *p);
> + void (*set_cpus_allowed)(struct task_struct *p, cpumask_t newmask);
> };
>
> struct load_weight {
> @@ -1144,6 +1145,7 @@ struct task_struct {
>
> unsigned int policy;
> cpumask_t cpus_allowed;
> + int nr_cpus_allowed;
> unsigned int time_slice;
>
> #ifdef CONFIG_PREEMPT_RCU
> diff --git a/kernel/fork.c b/kernel/fork.c
> index 5f11f23..f808e18 100644
> --- a/kernel/fork.c
> +++ b/kernel/fork.c
> @@ -1257,6 +1257,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
> */
> preempt_disable();
> p->cpus_allowed = current->cpus_allowed;
> + p->nr_cpus_allowed = current->nr_cpus_allowed;
> if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
> !cpu_online(task_cpu(p))))
> set_task_cpu(p, smp_processor_id());
> diff --git a/kernel/sched.c b/kernel/sched.c
> index 30fa531..6c90093 100644
> --- a/kernel/sched.c
> +++ b/kernel/sched.c
> @@ -262,6 +262,7 @@ struct rt_rq {
> int rt_load_balance_idx;
> struct list_head *rt_load_balance_head, *rt_load_balance_curr;
> unsigned long rt_nr_running;
> + unsigned long rt_nr_migratory;
> unsigned long rt_nr_uninterruptible;
> /* highest queued rt task prio */
> int highest_prio;
> @@ -5371,7 +5372,13 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
> goto out;
> }
>
> - p->cpus_allowed = new_mask;
> + if (p->sched_class->set_cpus_allowed)

Not sure we need this optimization (not doing the nr_cpus_allowed
calculation). Since, due to priority boosting, we will need to calculate
then. Calculating it here is better.

> + p->sched_class->set_cpus_allowed(p,
new_mask); > + else {
> + p->cpus_allowed = new_mask;
> + p->nr_cpus_allowed = cpus_weight(new_mask);
> + }
> +
> /* Can the task run on the task's current CPU? If so, we're done */
> if (cpu_isset(task_cpu(p), new_mask))
> goto out;
> diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
> index b59dc20..ad35c89 100644
> --- a/kernel/sched_rt.c
> +++ b/kernel/sched_rt.c
> @@ -50,6 +50,24 @@ static inline void update_curr_rt(struct rq *rq)
> curr->se.sum_exec_runtime += delta_exec;
> curr->se.exec_start = rq->clock;
> }
> +#ifdef CONFIG_SMP
> +static void inc_rt_migration(struct task_struct *p, struct rq *rq)
> +{
> + rq->rt.rt_nr_migratory++;
> +
> + if (rq->rt.rt_nr_running > 1)
> + rt_set_overload(p, rq->cpu);
> +}
> +
> +static void dec_rt_migration(struct task_struct *p, struct rq *rq)
> +{
> + WARN_ON(!rq->rt.rt_nr_migratory);
> + rq->rt.rt_nr_migratory--;
> +
> + if ((rq->rt.rt_nr_running < 2) || !rq->rt.rt_nr_migratory)
> + rt_clear_overload(p, rq->cpu);
> +}
> +#endif
>
> static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
> {
> @@ -58,8 +76,8 @@ static inline void inc_rt_tasks(struct task_struct *p, struct rq *rq)
> #ifdef CONFIG_SMP
> if (p->prio < rq->rt.highest_prio)
> rq->rt.highest_prio = p->prio;
> - if (rq->rt.rt_nr_running > 1)
> - rt_set_overload(p, rq->cpu);
> + if (p->nr_cpus_allowed > 1)
> + inc_rt_migration(p, rq);
> #endif /* CONFIG_SMP */
> }
>
> @@ -81,8 +99,8 @@ static inline void dec_rt_tasks(struct task_struct *p, struct rq *rq)
> } /* otherwise leave rq->highest prio alone */
> } else
> rq->rt.highest_prio = MAX_RT_PRIO;
> - if (rq->rt.rt_nr_running < 2)
> - rt_clear_overload(p, rq->cpu);
> + if (p->nr_cpus_allowed > 1)
> + dec_rt_migration(p, rq);
> #endif /* CONFIG_SMP */
> }
>
> @@ -286,6 +304,27 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq,
> return next;
> }
>
> +static int lock_migration_target(struct task_struct *p, struct rq *target_rq)
> +{
> + struct rq *this_rq = task_rq(p);
> +
> + if (double_lock_balance(this_rq, target_rq)) {
> + /*
> + * We had to unlock the run queue. In
> + * the mean time, task could have
> + * migrated already or had its affinity changed.
> + */
> + if (unlikely(task_running(this_rq, p) ||
> + task_rq(p) != this_rq ||
> + !cpu_isset(target_rq->cpu, p->cpus_allowed))) {
> + spin_unlock(&target_rq->lock);
> + return 0;
> + }
> + }
> +
> + return 1;
> +}
> +
> /* Will lock the rq it finds */
> static struct rq *find_lock_lowest_rq(struct task_struct *task,
> struct rq *this_rq)
> @@ -295,6 +334,28 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task,
> int cpu;
> int tries;
>
> + /*
> + * We can optimize if the hamming weight of the cpus_allowed mask
> + * is 1 because the task has nowhere to go but one CPU. So don't
> + * waste the time trying to find the lowest RQ in this case.
> + */

This code should be in the pick_next_highest_rt and not here.

> + if (task->nr_cpus_allowed == 1) {
> + /* If the task is already on the RQ, we are done */
> + if (cpu_isset(this_rq->cpu, task->cpus_allowed))
> + return NULL;
> +
> + cpu = first_cpu(task->cpus_allowed);
> +
> + lowest_rq = cpu_rq(cpu);
> + BUG_ON(this_rq == lowest_rq);
> +
> + /* Otherwise, we can simply grab the new RQ */
> + if (lock_migration_target(task, lowest_rq))
> + return lowest_rq;
> + else
> + return NULL;
> + }
> +
> cpus_and(cpu_mask, cpu_online_map, task->cpus_allowed);
>
> for (tries = 0; tries < RT_MAX_TRIES; tries++) {
> @@ -324,22 +385,8 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task,
> break;
>
> /* if the prio of this runqueue changed, try again */
> - if (double_lock_balance(this_rq, lowest_rq)) {
> - /*
> - * We had to unlock the run queue. In
> - * the mean time, task could have
> - * migrated already or had its affinity changed.
> - * Also make sure that it wasn't scheduled on its rq.
> - */
> - if (unlikely(task_rq(task) != this_rq ||
> - !cpu_isset(lowest_rq->cpu, task->cpus_allowed) ||
> - task_running(this_rq, task) ||
> - !task->se.on_rq)) {
> - spin_unlock(&lowest_rq->lock);
> - lowest_rq = NULL;
> - break;
> - }
> - }
> + if (!lock_migration_target(task, lowest_rq))
> + return NULL;

I don't like this encapsulating of the doubl_lock_balance. There's a
reason I kept it out like this. Mainly because this is the source of most
(if not all) of the race condititions I fought. So this is a very
sensitive area to touch. In fact, I see a few races already introduced by
this patch. One is that you took out the "!task->se.or_rq" test. Which
means that a task could have ran and deactivated itself
(TASK_UNINTERRUPTIBLE) and you just added it to a run queue. (I hit that
race ;-)

So keep that code out in the open where it belongs. Its very sensitive.


>
> /* If this rq is still suitable use it. */
> if (lowest_rq->rt.highest_prio > task->prio)
> @@ -658,6 +705,31 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p)
> }
> }
>
> +#ifdef CONFIG_SMP
> +static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t new_mask)
> +{
> + int weight = cpus_weight(new_mask);
> +
> + BUG_ON(!rt_task(p));
> +
> + /*
> + * Update the migration status of the RQ if we have an RT task
> + * which is running AND changing its weight value.
> + */
> + if (p->se.on_rq && (weight != p->nr_cpus_allowed)) {
> + struct rq *rq = task_rq(p);
> +
> + if ((p->nr_cpus_allowed <= 1) && (weight > 1))
> + inc_rt_migration(p, rq);
> + else if((p->nr_cpus_allowed > 1) && (weight <= 1))
> + dec_rt_migration(p, rq);
> + }
> +
> + p->cpus_allowed = new_mask;
> + p->nr_cpus_allowed = weight;
> +}
> +#endif
> +
> static struct sched_class rt_sched_class __read_mostly = {
> .enqueue_task = enqueue_task_rt,
> .dequeue_task = dequeue_task_rt,
> @@ -671,4 +743,8 @@ static struct sched_class rt_sched_class __read_mostly = {
> .load_balance = load_balance_rt,
>
> .task_tick = task_tick_rt,
> +
> +#ifdef CONFIG_SMP
> + .set_cpus_allowed = set_cpus_allowed_rt,
> +#endif

Hmm, I must have missed where you update the mask at time of boosting.
Anyway, we shouldn't. The set_cpus_allowed should be done for all tasks.
Now you can have a handler to call for when a task changes class and
changes weight.

-- Steve

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/