Re: [PATCH v4 6/6] sched/fair: Remove double_lock_balance() from load_balance()

From: Kirill Tkhai
Date: Tue Aug 12 2014 - 06:27:17 EST


Ð ÐÑ, 12/08/2014 Ð 11:36 +0200, Peter Zijlstra ÐÐÑÐÑ:
>
> Changed quite a bit around, _should_ be more or less the same end result
> I suppose. Only compile tested so far.

No objections. I've tested it in a virtual machine; simple "make -jxx"
test passed.

>
> ---
> Subject: sched/fair: Remove double_lock_balance() from load_balance()
> From: Kirill Tkhai <ktkhai@xxxxxxxxxxxxx>
> Date: Wed, 6 Aug 2014 12:07:04 +0400
>
> Avoid double_rq_lock() and use ONRQ_MIGRATING for load_balance(). The
> advantage is (obviously) not holding two 'rq->lock's at the same time
> and thereby increasing parallelism.
>
> Further note that if there was no task to migrate we will not have
> acquired the second rq->lock at all.
>
> The important point to note is that because we acquire dst->lock
> immediately after releasing src->lock the potential wait time of
> task_rq_lock() callers on ONRQ_MIGRATING is not longer than it would
> have been in the double rq lock scenario.
>
> Signed-off-by: Kirill Tkhai <ktkhai@xxxxxxxxxxxxx>
> Signed-off-by: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
> Link: http://lkml.kernel.org/r/1407312424.8424.48.camel@tkhai
> ---
> kernel/sched/fair.c | 151 ++++++++++++++++++++++++++++++++++------------------
> 1 file changed, 99 insertions(+), 52 deletions(-)
>
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -4706,7 +4706,7 @@ static void check_preempt_wakeup(struct
> return;
>
> /*
> - * This is possible from callers such as move_task(), in which we
> + * This is possible from callers such as attach_tasks(), in which we
> * unconditionally check_prempt_curr() after an enqueue (which may have
> * lead to a throttle). This both saves work and prevents false
> * next-buddy nomination below.
> @@ -5114,21 +5114,10 @@ struct lb_env {
> unsigned int loop_max;
>
> enum fbq_type fbq_type;
> + struct list_head tasks;
> };
>
> /*
> - * move_task - move a task from one runqueue to another runqueue.
> - * Both runqueues must be locked.
> - */
> -static void move_task(struct task_struct *p, struct lb_env *env)
> -{
> - deactivate_task(env->src_rq, p, 0);
> - set_task_cpu(p, env->dst_cpu);
> - activate_task(env->dst_rq, p, 0);
> - check_preempt_curr(env->dst_rq, p, 0);
> -}
> -
> -/*
> * Is this task likely cache-hot:
> */
> static int task_hot(struct task_struct *p, struct lb_env *env)
> @@ -5343,6 +5332,18 @@ int can_migrate_task(struct task_struct
> }
>
> /*
> + * detach_task() -- detach the task for the migration specified in env
> + */
> +static void detach_task(struct task_struct *p, struct lb_env *env)
> +{
> + lockdep_assert_held(&env->src_rq->lock);
> +
> + deactivate_task(env->src_rq, p, 0);
> + p->on_rq = ONRQ_MIGRATING;
> + set_task_cpu(p, env->dst_cpu);
> +}
> +
> +/*
> * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
> * part of active balancing operations within "domain".
> *
> @@ -5358,15 +5359,13 @@ static struct task_struct *detach_one_ta
> if (!can_migrate_task(p, env))
> continue;
>
> - deactivate_task(env->src_rq, p, 0);
> - p->on_rq = ONRQ_MIGRATING;
> - set_task_cpu(p, env->dst_cpu);
> + detach_task(p, env);
>
> /*
> * Right now, this is only the second place where
> - * lb_gained[env->idle] is updated (other is move_tasks)
> + * lb_gained[env->idle] is updated (other is detach_tasks)
> * so we can safely collect stats here rather than
> - * inside move_tasks().
> + * inside detach_tasks().
> */
> schedstat_inc(env->sd, lb_gained[env->idle]);
> return p;
> @@ -5374,35 +5373,22 @@ static struct task_struct *detach_one_ta
> return NULL;
> }
>
> -/*
> - * attach_one_task() -- attaches the task returned from detach_one_task() to
> - * its new rq.
> - */
> -static void attach_one_task(struct rq *rq, struct task_struct *p)
> -{
> - raw_spin_lock(&rq->lock);
> - BUG_ON(task_rq(p) != rq);
> - p->on_rq = ONRQ_QUEUED;
> - activate_task(rq, p, 0);
> - check_preempt_curr(rq, p, 0);
> - raw_spin_unlock(&rq->lock);
> -}
> -
> static const unsigned int sched_nr_migrate_break = 32;
>
> /*
> - * move_tasks tries to move up to imbalance weighted load from busiest to
> - * this_rq, as part of a balancing operation within domain "sd".
> - * Returns 1 if successful and 0 otherwise.
> + * detach_tasks() -- tries to detach up to imbalance weighted load from
> + * busiest_rq, as part of a balancing operation within domain "sd".
> *
> - * Called with both runqueues locked.
> + * Returns number of detached tasks if successful and 0 otherwise.
> */
> -static int move_tasks(struct lb_env *env)
> +static int detach_tasks(struct lb_env *env)
> {
> struct list_head *tasks = &env->src_rq->cfs_tasks;
> struct task_struct *p;
> unsigned long load;
> - int pulled = 0;
> + int detached = 0;
> +
> + lockdep_assert_held(&env->src_rq->lock);
>
> if (env->imbalance <= 0)
> return 0;
> @@ -5433,14 +5419,16 @@ static int move_tasks(struct lb_env *env
> if ((load / 2) > env->imbalance)
> goto next;
>
> - move_task(p, env);
> - pulled++;
> + detach_task(p, env);
> + list_add(&p->se.group_node, &env->tasks);
> +
> + detached++;
> env->imbalance -= load;
>
> #ifdef CONFIG_PREEMPT
> /*
> * NEWIDLE balancing is a source of latency, so preemptible
> - * kernels will stop after the first task is pulled to minimize
> + * kernels will stop after the first task is detached to minimize
> * the critical section.
> */
> if (env->idle == CPU_NEWLY_IDLE)
> @@ -5460,13 +5448,58 @@ static int move_tasks(struct lb_env *env
> }
>
> /*
> - * Right now, this is one of only two places move_task() is called,
> - * so we can safely collect move_task() stats here rather than
> - * inside move_task().
> + * Right now, this is one of only two places we collect this stat
> + * so we can safely collect detach_one_task() stats here rather
> + * than inside detach_one_task().
> */
> - schedstat_add(env->sd, lb_gained[env->idle], pulled);
> + schedstat_add(env->sd, lb_gained[env->idle], detached);
>
> - return pulled;
> + return detached;
> +}
> +
> +/*
> + * attach_task() -- attach the task detached by detach_task() to its new rq.
> + */
> +static void attach_task(struct rq *rq, struct task_struct *p)
> +{
> + lockdep_assert_held(&rq->lock);
> +
> + BUG_ON(task_rq(p) != rq);
> + p->on_rq = ONRQ_QUEUED;
> + activate_task(rq, p, 0);
> + check_preempt_curr(rq, p, 0);
> +}
> +
> +/*
> + * attach_one_task() -- attaches the task returned from detach_one_task() to
> + * its new rq.
> + */
> +static void attach_one_task(struct rq *rq, struct task_struct *p)
> +{
> + raw_spin_lock(&rq->lock);
> + attach_task(rq, p);
> + raw_spin_unlock(&rq->lock);
> +}
> +
> +/*
> + * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
> + * new rq.
> + */
> +static void attach_tasks(struct lb_env *env)
> +{
> + struct list_head *tasks = &env->tasks;
> + struct task_struct *p;
> +
> + raw_spin_lock(&env->dst_rq->lock);
> +
> + while (!list_empty(tasks)) {
> + p = list_first_entry(tasks, struct task_struct, se.group_node);
> + list_del_init(&p->se.group_node);
> +
> + attach_task(env->dst_rq, p);
> + }
> +
> + raw_spin_unlock(&env->dst_rq->lock);
> }
>
> #ifdef CONFIG_FAIR_GROUP_SCHED
> @@ -6600,6 +6633,7 @@ static int load_balance(int this_cpu, st
> .loop_break = sched_nr_migrate_break,
> .cpus = cpus,
> .fbq_type = all,
> + .tasks = LIST_HEAD_INIT(env.tasks),
> };
>
> /*
> @@ -6649,16 +6683,29 @@ static int load_balance(int this_cpu, st
> env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
>
> more_balance:
> - local_irq_save(flags);
> - double_rq_lock(env.dst_rq, busiest);
> + raw_spin_lock_irqsave(&busiest->lock, flags);
>
> /*
> * cur_ld_moved - load moved in current iteration
> * ld_moved - cumulative load moved across iterations
> */
> - cur_ld_moved = move_tasks(&env);
> - ld_moved += cur_ld_moved;
> - double_rq_unlock(env.dst_rq, busiest);
> + cur_ld_moved = detach_tasks(&env);
> +
> + /*
> + * We've detached some tasks from busiest_rq. Every
> + * task is masked "ONRQ_MIGRATED", so we can safely
> + * unlock busiest->lock, and we are able to be sure
> + * that nobody can manipulate the tasks in parallel.
> + * See task_rq_lock() family for the details.
> + */
> +
> + raw_spin_unlock(&busiest->lock);
> +
> + if (cur_ld_moved) {
> + attach_tasks(&env);
> + ld_moved += cur_ld_moved;
> + }
> +
> local_irq_restore(flags);
>
> /*
> @@ -6794,7 +6841,7 @@ static int load_balance(int this_cpu, st
> * If we've begun active balancing, start to back off. This
> * case may not be covered by the all_pinned logic if there
> * is only 1 task on the busy runqueue (because we don't call
> - * move_tasks).
> + * detach_tasks).
> */
> if (sd->balance_interval < sd->max_interval)
> sd->balance_interval *= 2;


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/