Re: [PATCH v4 6/6] sched/fair: Remove double_lock_balance() from load_balance()

From: Peter Zijlstra
Date: Tue Aug 12 2014 - 05:36:53 EST

Next message: Miklos Szeredi: "Re: Race in ovl_copy_up()?"
Previous message: Pavel Machek: "Re: [PATCH] Hibernate: Do not assume the first e820 area to be RAM"
Next in thread: Kirill Tkhai: "Re: [PATCH v4 6/6] sched/fair: Remove double_lock_balance() from load_balance()"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]

Changed quite a bit around, _should_ be more or less the same end result
I suppose. Only compile tested so far.

---
Subject: sched/fair: Remove double_lock_balance() from load_balance()
From: Kirill Tkhai <ktkhai@xxxxxxxxxxxxx>
Date: Wed, 6 Aug 2014 12:07:04 +0400

Avoid double_rq_lock() and use ONRQ_MIGRATING for load_balance(). The
advantage is (obviously) not holding two 'rq->lock's at the same time
and thereby increasing parallelism.

Further note that if there was no task to migrate we will not have
acquired the second rq->lock at all.

The important point to note is that because we acquire dst->lock
immediately after releasing src->lock the potential wait time of
task_rq_lock() callers on ONRQ_MIGRATING is not longer than it would
have been in the double rq lock scenario.

Signed-off-by: Kirill Tkhai <ktkhai@xxxxxxxxxxxxx>
Signed-off-by: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Link: http://lkml.kernel.org/r/1407312424.8424.48.camel@tkhai
---
kernel/sched/fair.c | 151 ++++++++++++++++++++++++++++++++++------------------
1 file changed, 99 insertions(+), 52 deletions(-)

--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4706,7 +4706,7 @@ static void check_preempt_wakeup(struct
return;

/*
- * This is possible from callers such as move_task(), in which we
+ * This is possible from callers such as attach_tasks(), in which we
* unconditionally check_prempt_curr() after an enqueue (which may have
* lead to a throttle). This both saves work and prevents false
* next-buddy nomination below.
@@ -5114,21 +5114,10 @@ struct lb_env {
unsigned int loop_max;

enum fbq_type fbq_type;
+ struct list_head tasks;
};

/*
- * move_task - move a task from one runqueue to another runqueue.
- * Both runqueues must be locked.
- */
-static void move_task(struct task_struct *p, struct lb_env *env)
-{
- deactivate_task(env->src_rq, p, 0);
- set_task_cpu(p, env->dst_cpu);
- activate_task(env->dst_rq, p, 0);
- check_preempt_curr(env->dst_rq, p, 0);
-}
-
-/*
* Is this task likely cache-hot:
*/
static int task_hot(struct task_struct *p, struct lb_env *env)
@@ -5343,6 +5332,18 @@ int can_migrate_task(struct task_struct
}

/*
+ * detach_task() -- detach the task for the migration specified in env
+ */
+static void detach_task(struct task_struct *p, struct lb_env *env)
+{
+ lockdep_assert_held(&env->src_rq->lock);
+
+ deactivate_task(env->src_rq, p, 0);
+ p->on_rq = ONRQ_MIGRATING;
+ set_task_cpu(p, env->dst_cpu);
+}
+
+/*
* detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
* part of active balancing operations within "domain".
*
@@ -5358,15 +5359,13 @@ static struct task_struct *detach_one_ta
if (!can_migrate_task(p, env))
continue;

- deactivate_task(env->src_rq, p, 0);
- p->on_rq = ONRQ_MIGRATING;
- set_task_cpu(p, env->dst_cpu);
+ detach_task(p, env);

/*
* Right now, this is only the second place where
- * lb_gained[env->idle] is updated (other is move_tasks)
+ * lb_gained[env->idle] is updated (other is detach_tasks)
* so we can safely collect stats here rather than
- * inside move_tasks().
+ * inside detach_tasks().
*/
schedstat_inc(env->sd, lb_gained[env->idle]);
return p;
@@ -5374,35 +5373,22 @@ static struct task_struct *detach_one_ta
return NULL;
}

-/*
- * attach_one_task() -- attaches the task returned from detach_one_task() to
- * its new rq.
- */
-static void attach_one_task(struct rq *rq, struct task_struct *p)
-{
- raw_spin_lock(&rq->lock);
- BUG_ON(task_rq(p) != rq);
- p->on_rq = ONRQ_QUEUED;
- activate_task(rq, p, 0);
- check_preempt_curr(rq, p, 0);
- raw_spin_unlock(&rq->lock);
-}
-
static const unsigned int sched_nr_migrate_break = 32;

/*
- * move_tasks tries to move up to imbalance weighted load from busiest to
- * this_rq, as part of a balancing operation within domain "sd".
- * Returns 1 if successful and 0 otherwise.
+ * detach_tasks() -- tries to detach up to imbalance weighted load from
+ * busiest_rq, as part of a balancing operation within domain "sd".
*
- * Called with both runqueues locked.
+ * Returns number of detached tasks if successful and 0 otherwise.
*/
-static int move_tasks(struct lb_env *env)
+static int detach_tasks(struct lb_env *env)
{
struct list_head *tasks = &env->src_rq->cfs_tasks;
struct task_struct *p;
unsigned long load;
- int pulled = 0;
+ int detached = 0;
+
+ lockdep_assert_held(&env->src_rq->lock);

if (env->imbalance <= 0)
return 0;
@@ -5433,14 +5419,16 @@ static int move_tasks(struct lb_env *env
if ((load / 2) > env->imbalance)
goto next;

- move_task(p, env);
- pulled++;
+ detach_task(p, env);
+ list_add(&p->se.group_node, &env->tasks);
+
+ detached++;
env->imbalance -= load;

#ifdef CONFIG_PREEMPT
/*
* NEWIDLE balancing is a source of latency, so preemptible
- * kernels will stop after the first task is pulled to minimize
+ * kernels will stop after the first task is detached to minimize
* the critical section.
*/
if (env->idle == CPU_NEWLY_IDLE)
@@ -5460,13 +5448,58 @@ static int move_tasks(struct lb_env *env
}

/*
- * Right now, this is one of only two places move_task() is called,
- * so we can safely collect move_task() stats here rather than
- * inside move_task().
+ * Right now, this is one of only two places we collect this stat
+ * so we can safely collect detach_one_task() stats here rather
+ * than inside detach_one_task().
*/
- schedstat_add(env->sd, lb_gained[env->idle], pulled);
+ schedstat_add(env->sd, lb_gained[env->idle], detached);

- return pulled;
+ return detached;
+}
+
+/*
+ * attach_task() -- attach the task detached by detach_task() to its new rq.
+ */
+static void attach_task(struct rq *rq, struct task_struct *p)
+{
+ lockdep_assert_held(&rq->lock);
+
+ BUG_ON(task_rq(p) != rq);
+ p->on_rq = ONRQ_QUEUED;
+ activate_task(rq, p, 0);
+ check_preempt_curr(rq, p, 0);
+}
+
+/*
+ * attach_one_task() -- attaches the task returned from detach_one_task() to
+ * its new rq.
+ */
+static void attach_one_task(struct rq *rq, struct task_struct *p)
+{
+ raw_spin_lock(&rq->lock);
+ attach_task(rq, p);
+ raw_spin_unlock(&rq->lock);
+}
+
+/*
+ * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
+ * new rq.
+ */
+static void attach_tasks(struct lb_env *env)
+{
+ struct list_head *tasks = &env->tasks;
+ struct task_struct *p;
+
+ raw_spin_lock(&env->dst_rq->lock);
+
+ while (!list_empty(tasks)) {
+ p = list_first_entry(tasks, struct task_struct, se.group_node);
+ list_del_init(&p->se.group_node);
+
+ attach_task(env->dst_rq, p);
+ }
+
+ raw_spin_unlock(&env->dst_rq->lock);
}

#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -6600,6 +6633,7 @@ static int load_balance(int this_cpu, st
.loop_break = sched_nr_migrate_break,
.cpus = cpus,
.fbq_type = all,
+ .tasks = LIST_HEAD_INIT(env.tasks),
};

/*
@@ -6649,16 +6683,29 @@ static int load_balance(int this_cpu, st
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);

more_balance:
- local_irq_save(flags);
- double_rq_lock(env.dst_rq, busiest);
+ raw_spin_lock_irqsave(&busiest->lock, flags);

/*
* cur_ld_moved - load moved in current iteration
* ld_moved - cumulative load moved across iterations
*/
- cur_ld_moved = move_tasks(&env);
- ld_moved += cur_ld_moved;
- double_rq_unlock(env.dst_rq, busiest);
+ cur_ld_moved = detach_tasks(&env);
+
+ /*
+ * We've detached some tasks from busiest_rq. Every
+ * task is masked "ONRQ_MIGRATED", so we can safely
+ * unlock busiest->lock, and we are able to be sure
+ * that nobody can manipulate the tasks in parallel.
+ * See task_rq_lock() family for the details.
+ */
+
+ raw_spin_unlock(&busiest->lock);
+
+ if (cur_ld_moved) {
+ attach_tasks(&env);
+ ld_moved += cur_ld_moved;
+ }
+
local_irq_restore(flags);

/*
@@ -6794,7 +6841,7 @@ static int load_balance(int this_cpu, st
* If we've begun active balancing, start to back off. This
* case may not be covered by the all_pinned logic if there
* is only 1 task on the busy runqueue (because we don't call
- * move_tasks).
+ * detach_tasks).
*/
if (sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2;

Attachment: pgpmrCoRbWzRz.pgp
Description: PGP signature

Next message: Miklos Szeredi: "Re: Race in ovl_copy_up()?"
Previous message: Pavel Machek: "Re: [PATCH] Hibernate: Do not assume the first e820 area to be RAM"
Next in thread: Kirill Tkhai: "Re: [PATCH v4 6/6] sched/fair: Remove double_lock_balance() from load_balance()"
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]