[PATCH v5 5/5] sched/fair: Remove double_lock_balance() from load_balance()

From: Kirill Tkhai
Date: Wed Aug 20 2014 - 05:48:43 EST

Next message: Javier Martinez Canillas: "Re: [PATCH 5/6] ARM: dts: Add vmmc and vqmmc supplies for Peach Pit and Pi boards"
Previous message: Thierry Reding: "Re: [PATCH 3/3] ARM: at91/tclib: mask interruptions at shutdown and probe"
Next in thread: tip-bot for Kirill Tkhai: "[tip:sched/core] sched/fair: Remove double_lock_balance() from load_balance()"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

Avoid double_rq_lock() and use TASK_ON_RQ_MIGRATING for load_balance().
The advantage is (obviously) not holding two 'rq->lock's at the same time
and thereby increasing parallelism.

Further note that if there was no task to migrate we will not have
acquired the second rq->lock at all.

The important point to note is that because we acquire dst->lock
immediately after releasing src->lock the potential wait time of
task_rq_lock() callers on TASK_ON_RQ_MIGRATING is not longer than
it would have been in the double rq lock scenario.

Signed-off-by: Kirill Tkhai <ktkhai@xxxxxxxxxxxxx>
---
kernel/sched/fair.c | 151 +++++++++++++++++++++++++++++++++------------------
1 file changed, 99 insertions(+), 52 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7e5cf05..d3427a8 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4709,7 +4709,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
return;

/*
- * This is possible from callers such as move_task(), in which we
+ * This is possible from callers such as attach_tasks(), in which we
* unconditionally check_prempt_curr() after an enqueue (which may have
* lead to a throttle). This both saves work and prevents false
* next-buddy nomination below.
@@ -5117,21 +5117,10 @@ struct lb_env {
unsigned int loop_max;

enum fbq_type fbq_type;
+ struct list_head tasks;
};

/*
- * move_task - move a task from one runqueue to another runqueue.
- * Both runqueues must be locked.
- */
-static void move_task(struct task_struct *p, struct lb_env *env)
-{
- deactivate_task(env->src_rq, p, 0);
- set_task_cpu(p, env->dst_cpu);
- activate_task(env->dst_rq, p, 0);
- check_preempt_curr(env->dst_rq, p, 0);
-}
-
-/*
* Is this task likely cache-hot:
*/
static int task_hot(struct task_struct *p, struct lb_env *env)
@@ -5346,6 +5335,18 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
}

/*
+ * detach_task() -- detach the task for the migration specified in env
+ */
+static void detach_task(struct task_struct *p, struct lb_env *env)
+{
+ lockdep_assert_held(&env->src_rq->lock);
+
+ deactivate_task(env->src_rq, p, 0);
+ p->on_rq = TASK_ON_RQ_MIGRATING;
+ set_task_cpu(p, env->dst_cpu);
+}
+
+/*
* detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
* part of active balancing operations within "domain".
*
@@ -5361,15 +5362,13 @@ static struct task_struct *detach_one_task(struct lb_env *env)
if (!can_migrate_task(p, env))
continue;

- deactivate_task(env->src_rq, p, 0);
- p->on_rq = TASK_ON_RQ_MIGRATING;
- set_task_cpu(p, env->dst_cpu);
+ detach_task(p, env);

/*
* Right now, this is only the second place where
- * lb_gained[env->idle] is updated (other is move_tasks)
+ * lb_gained[env->idle] is updated (other is detach_tasks)
* so we can safely collect stats here rather than
- * inside move_tasks().
+ * inside detach_tasks().
*/
schedstat_inc(env->sd, lb_gained[env->idle]);
return p;
@@ -5377,35 +5376,22 @@ static struct task_struct *detach_one_task(struct lb_env *env)
return NULL;
}

-/*
- * attach_one_task() -- attaches the task returned from detach_one_task() to
- * its new rq.
- */
-static void attach_one_task(struct rq *rq, struct task_struct *p)
-{
- raw_spin_lock(&rq->lock);
- BUG_ON(task_rq(p) != rq);
- p->on_rq = TASK_ON_RQ_QUEUED;
- activate_task(rq, p, 0);
- check_preempt_curr(rq, p, 0);
- raw_spin_unlock(&rq->lock);
-}
-
static const unsigned int sched_nr_migrate_break = 32;

/*
- * move_tasks tries to move up to imbalance weighted load from busiest to
- * this_rq, as part of a balancing operation within domain "sd".
- * Returns 1 if successful and 0 otherwise.
+ * detach_tasks() -- tries to detach up to imbalance weighted load from
+ * busiest_rq, as part of a balancing operation within domain "sd".
*
- * Called with both runqueues locked.
+ * Returns number of detached tasks if successful and 0 otherwise.
*/
-static int move_tasks(struct lb_env *env)
+static int detach_tasks(struct lb_env *env)
{
struct list_head *tasks = &env->src_rq->cfs_tasks;
struct task_struct *p;
unsigned long load;
- int pulled = 0;
+ int detached = 0;
+
+ lockdep_assert_held(&env->src_rq->lock);

if (env->imbalance <= 0)
return 0;
@@ -5436,14 +5422,16 @@ static int move_tasks(struct lb_env *env)
if ((load / 2) > env->imbalance)
goto next;

- move_task(p, env);
- pulled++;
+ detach_task(p, env);
+ list_add(&p->se.group_node, &env->tasks);
+
+ detached++;
env->imbalance -= load;

#ifdef CONFIG_PREEMPT
/*
* NEWIDLE balancing is a source of latency, so preemptible
- * kernels will stop after the first task is pulled to minimize
+ * kernels will stop after the first task is detached to minimize
* the critical section.
*/
if (env->idle == CPU_NEWLY_IDLE)
@@ -5463,13 +5451,58 @@ static int move_tasks(struct lb_env *env)
}

/*
- * Right now, this is one of only two places move_task() is called,
- * so we can safely collect move_task() stats here rather than
- * inside move_task().
+ * Right now, this is one of only two places we collect this stat
+ * so we can safely collect detach_one_task() stats here rather
+ * than inside detach_one_task().
*/
- schedstat_add(env->sd, lb_gained[env->idle], pulled);
+ schedstat_add(env->sd, lb_gained[env->idle], detached);

- return pulled;
+ return detached;
+}
+
+/*
+ * attach_task() -- attach the task detached by detach_task() to its new rq.
+ */
+static void attach_task(struct rq *rq, struct task_struct *p)
+{
+ lockdep_assert_held(&rq->lock);
+
+ BUG_ON(task_rq(p) != rq);
+ p->on_rq = TASK_ON_RQ_QUEUED;
+ activate_task(rq, p, 0);
+ check_preempt_curr(rq, p, 0);
+}
+
+/*
+ * attach_one_task() -- attaches the task returned from detach_one_task() to
+ * its new rq.
+ */
+static void attach_one_task(struct rq *rq, struct task_struct *p)
+{
+ raw_spin_lock(&rq->lock);
+ attach_task(rq, p);
+ raw_spin_unlock(&rq->lock);
+}
+
+/*
+ * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
+ * new rq.
+ */
+static void attach_tasks(struct lb_env *env)
+{
+ struct list_head *tasks = &env->tasks;
+ struct task_struct *p;
+
+ raw_spin_lock(&env->dst_rq->lock);
+
+ while (!list_empty(tasks)) {
+ p = list_first_entry(tasks, struct task_struct, se.group_node);
+ list_del_init(&p->se.group_node);
+
+ attach_task(env->dst_rq, p);
+ }
+
+ raw_spin_unlock(&env->dst_rq->lock);
}

#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -6603,6 +6636,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
.loop_break = sched_nr_migrate_break,
.cpus = cpus,
.fbq_type = all,
+ .tasks = LIST_HEAD_INIT(env.tasks),
};

/*
@@ -6652,16 +6686,29 @@ static int load_balance(int this_cpu, struct rq *this_rq,
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);

more_balance:
- local_irq_save(flags);
- double_rq_lock(env.dst_rq, busiest);
+ raw_spin_lock_irqsave(&busiest->lock, flags);

/*
* cur_ld_moved - load moved in current iteration
* ld_moved - cumulative load moved across iterations
*/
- cur_ld_moved = move_tasks(&env);
- ld_moved += cur_ld_moved;
- double_rq_unlock(env.dst_rq, busiest);
+ cur_ld_moved = detach_tasks(&env);
+
+ /*
+ * We've detached some tasks from busiest_rq. Every
+ * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
+ * unlock busiest->lock, and we are able to be sure
+ * that nobody can manipulate the tasks in parallel.
+ * See task_rq_lock() family for the details.
+ */
+
+ raw_spin_unlock(&busiest->lock);
+
+ if (cur_ld_moved) {
+ attach_tasks(&env);
+ ld_moved += cur_ld_moved;
+ }
+
local_irq_restore(flags);

/*
@@ -6797,7 +6844,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
* If we've begun active balancing, start to back off. This
* case may not be covered by the all_pinned logic if there
* is only 1 task on the busy runqueue (because we don't call
- * move_tasks).
+ * detach_tasks).
*/
if (sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2;

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Next message: Javier Martinez Canillas: "Re: [PATCH 5/6] ARM: dts: Add vmmc and vqmmc supplies for Peach Pit and Pi boards"
Previous message: Thierry Reding: "Re: [PATCH 3/3] ARM: at91/tclib: mask interruptions at shutdown and probe"
Next in thread: tip-bot for Kirill Tkhai: "[tip:sched/core] sched/fair: Remove double_lock_balance() from load_balance()"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]