[PATCH] sched: check pinned tasks before nohz balance, linux-4.2-rc5

From: Uladzislau Rezki
Date: Wed Aug 05 2015 - 11:14:40 EST


While investigating the "load balancer" logic, I ran into interesting
behavior related to affinity and NO_HZ idle balancing. In short, if
there are two tasks running on a certain rq and one of them is
pinned/affined, the scheduler will trigger a load balance on a
power-collapsed core via softirq.

The corner case is that the balance may then fail because of affinity;
as a result, the core that has the two tasks interrupts whichever idle
core it finds on every one of its timer ticks. That wastes power, which
matters especially in the mobile area, which is our case.
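
To make the scenario concrete, here is a minimal user-space sketch
(not part of the patch; the names and the CPU number are only
illustrative) that creates such a rq: two always-runnable threads
pinned to the same CPU. No load balance can ever move either of them,
yet nohz_kick_needed() keeps seeing nr_running >= 2 and kicks an idle
CPU on every tick:

/* gcc -O2 -pthread repro.c -o repro  (illustrative only) */
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>

static void *busy_loop(void *arg)
{
	(void)arg;

	/* Stay runnable forever so nr_running stays at 2. */
	for (;;)
		;

	return NULL;
}

int main(void)
{
	pthread_t t1, t2;
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(0, &set);

	/* Both workers are affined to CPU0 only (nr_cpus_allowed == 1). */
	pthread_create(&t1, NULL, busy_loop, NULL);
	pthread_setaffinity_np(t1, sizeof(set), &set);

	pthread_create(&t2, NULL, busy_loop, NULL);
	pthread_setaffinity_np(t2, sizeof(set), &set);

	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	return 0;
}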

Do you have any concerns with the provided patch?

--
Uladzislau Rezki
From ed5c294addcb472be0d5c3619c5a7e0e9d34c3c5 Mon Sep 17 00:00:00 2001
From: Uladzislau Rezki <urezki@xxxxxxxxx>
Date: Wed, 5 Aug 2015 16:20:50 +0200
Subject: [PATCH] sched: check pinned tasks before balance

The problem is that there are pinned tasks in the system which
cannot be migrated to other CPUs while performing load balancing,
for example 'kworker', 'ksoftirqd' and pinned user space
applications.

In order not to disturb idle CPUs in such cases, check whether the
CPU can actually be balanced before deciding to kick an idle CPU
and do a nohz balance.

This is done for power saving.

Signed-off-by: Uladzislau Rezki <urezki@xxxxxxxxx>
---
 kernel/sched/fair.c  | 49 +++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h |  1 +
 2 files changed, 50 insertions(+)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d113c3b..9268be6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4265,6 +4265,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if (!se) {
 		update_rq_runnable_avg(rq, rq->nr_running);
 		add_nr_running(rq, 1);
+
+		if (unlikely(p->nr_cpus_allowed == 1))
+			rq->nr_pinned_tasks++;
 	}
 	hrtick_update(rq);
 }
@@ -4326,6 +4329,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if (!se) {
 		sub_nr_running(rq, 1);
 		update_rq_runnable_avg(rq, 1);
+
+		if (unlikely(p->nr_cpus_allowed == 1))
+			rq->nr_pinned_tasks--;
 	}
 	hrtick_update(rq);
 }
@@ -7875,6 +7881,27 @@ end:
 	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
 }
 
+static inline bool can_balance_pinned_tasks(struct rq *rq)
+{
+	if (unlikely(rq->nr_pinned_tasks > 0)) {
+		int delta = rq->nr_running - rq->nr_pinned_tasks;
+
+		/*
+		 * Check if it is possible to "unload" this CPU in case
+		 * of having pinned/affine tasks. Do not disturb an idle
+		 * core if one of the below conditions is true:
+		 *
+		 * - there is one pinned task and it is not "current"
+		 * - all tasks are pinned to this CPU
+		 */
+		if (delta < 2)
+			if (current->nr_cpus_allowed > 1 || !delta)
+				return false;
+	}
+
+	return true;
+}
+
 /*
  * Current heuristic for kicking the idle load balancer in the presence
  * of an idle cpu in the system.
@@ -7914,6 +7941,9 @@ static inline bool nohz_kick_needed(struct rq *rq)
 	if (time_before(now, nohz.next_balance))
 		return false;
 
+	if (!can_balance_pinned_tasks(rq))
+		return false;
+
 	if (rq->nr_running >= 2)
 		return true;
 
@@ -8248,6 +8278,24 @@ static void task_move_group_fair(struct task_struct *p, int queued)
 	}
 }
 
+static void set_cpus_allowed_fair(struct task_struct *p, const struct cpumask *new_mask)
+{
+	int nr_cpus_allowed;
+	struct rq *rq;
+
+	if (p->on_rq) {
+		nr_cpus_allowed = cpumask_weight(new_mask);
+		rq = task_rq(p);
+
+		if (nr_cpus_allowed == 1 && p->nr_cpus_allowed > 1)
+			rq->nr_pinned_tasks++;
+		else if (nr_cpus_allowed > 1 && p->nr_cpus_allowed == 1)
+			rq->nr_pinned_tasks--;
+	}
+
+	/* 'new_mask' is applied in core.c */
+}
+
 void free_fair_sched_group(struct task_group *tg)
 {
 	int i;
@@ -8461,6 +8509,7 @@ const struct sched_class fair_sched_class = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	.task_move_group = task_move_group_fair,
 #endif
+	.set_cpus_allowed = set_cpus_allowed_fair,
 };
 
 #ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 84d4879..0bbf0b2 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -569,6 +569,7 @@ struct rq {
 	 * remote CPUs use both these fields when doing load calculation.
 	 */
 	unsigned int nr_running;
+	unsigned int nr_pinned_tasks;
 #ifdef CONFIG_NUMA_BALANCING
 	unsigned int nr_numa_running;
 	unsigned int nr_preferred_running;
--
1.8.2.2
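
P.S. To double-check my reading of the new check, here is a small
user-space mock (illustrative only, not kernel code) of the
can_balance_pinned_tasks() decision for a few example rq states;
a result of 0 means the nohz kick would be suppressed:

/* gcc -O2 mock.c -o mock  (illustrative only) */
#include <stdbool.h>
#include <stdio.h>

/*
 * Same decision as can_balance_pinned_tasks(); curr_is_pinned stands
 * for "current->nr_cpus_allowed == 1" on that rq.
 */
static bool can_balance(unsigned int nr_running, unsigned int nr_pinned,
			bool curr_is_pinned)
{
	if (nr_pinned > 0) {
		int delta = nr_running - nr_pinned;

		if (delta < 2)
			if (!curr_is_pinned || !delta)
				return false;
	}

	return true;
}

int main(void)
{
	/* Both tasks pinned: nothing can ever be pulled -> no kick. */
	printf("%d\n", can_balance(2, 2, true));	/* 0 */

	/*
	 * The pinned task is current and the queued task is migratable,
	 * so an idle CPU can still pull something -> keep the kick.
	 */
	printf("%d\n", can_balance(2, 1, true));	/* 1 */

	/*
	 * The only migratable task is the one currently running; a
	 * regular pull cannot take it -> skip the kick.
	 */
	printf("%d\n", can_balance(2, 1, false));	/* 0 */

	return 0;
}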