[RFCv3 PATCH 31/48] sched: Extend sched_group_energy to test load-balancing decisions

From: Morten Rasmussen
Date: Wed Feb 04 2015 - 13:36:54 EST


Extend sched_group_energy() to support energy prediction with usage
(tasks) added to or removed from a specific cpu, or migrated between a
pair of cpus. This is useful for load-balancing decision making.

cc: Ingo Molnar <mingo@xxxxxxxxxx>
cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>

Signed-off-by: Morten Rasmussen <morten.rasmussen@xxxxxxx>
---
kernel/sched/fair.c | 90 +++++++++++++++++++++++++++++++++++++++--------------
1 file changed, 66 insertions(+), 24 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d12aa63..07c84af 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4592,23 +4592,44 @@ static unsigned long capacity_curr_of(int cpu)
* Without capping the usage, a group could be seen as overloaded (CPU0 usage
* at 121% + CPU1 usage at 80%) whereas CPU1 has 20% of available capacity/
*/
-static int get_cpu_usage(int cpu)
+static int __get_cpu_usage(int cpu, int delta)
{
+ int sum;
unsigned long usage = cpu_rq(cpu)->cfs.utilization_load_avg;
unsigned long blocked = cpu_rq(cpu)->cfs.utilization_blocked_avg;
unsigned long capacity_curr = capacity_curr_of(cpu);

- if (usage + blocked >= capacity_curr)
+ sum = usage + blocked + delta;
+
+ if (sum < 0)
+ return 0;
+
+ if (sum >= capacity_curr)
return capacity_curr;

- return usage + blocked;
+ return sum;
}

+static int get_cpu_usage(int cpu)
+{
+ return __get_cpu_usage(cpu, 0);
+}
+
+
static inline bool energy_aware(void)
{
return sched_feat(ENERGY_AWARE);
}

+struct energy_env {
+ struct sched_group *sg_top;
+ struct sched_group *sg_cap;
+ int usage_delta;
+ int src_cpu;
+ int dst_cpu;
+ int energy;
+};
+
/*
* cpu_norm_usage() returns the cpu usage relative to it's current capacity,
* i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE] which is useful for
@@ -4623,20 +4644,38 @@ static inline bool energy_aware(void)
*
* norm_usage = running_time/time ~ usage/capacity_curr
*/
-static inline unsigned long cpu_norm_usage(int cpu)
+static inline unsigned long __cpu_norm_usage(int cpu, int delta)
{
unsigned long capacity_curr = capacity_curr_of(cpu);

- return (get_cpu_usage(cpu) << SCHED_CAPACITY_SHIFT)/capacity_curr;
+ return (__get_cpu_usage(cpu, delta) << SCHED_CAPACITY_SHIFT)
+ /capacity_curr;
}

-static unsigned group_max_usage(struct sched_group *sg)
+static inline unsigned long cpu_norm_usage(int cpu)
{
- int i;
+ return __cpu_norm_usage(cpu, 0);
+}
+
+static inline int calc_usage_delta(struct energy_env *eenv, int cpu)
+{
+ if (cpu == eenv->src_cpu)
+ return -eenv->usage_delta;
+ if (cpu == eenv->dst_cpu)
+ return eenv->usage_delta;
+ return 0;
+}
+
+static unsigned group_max_usage(struct energy_env *eenv,
+ struct sched_group *sg)
+{
+ int i, delta;
int max_usage = 0;

- for_each_cpu(i, sched_group_cpus(sg))
- max_usage = max(max_usage, get_cpu_usage(i));
+ for_each_cpu(i, sched_group_cpus(sg)) {
+ delta = calc_usage_delta(eenv, i);
+ max_usage = max(max_usage, __get_cpu_usage(i, delta));
+ }

return max_usage;
}
@@ -4650,24 +4689,27 @@ static unsigned group_max_usage(struct sched_group *sg)
* latter is used as the estimate as it leads to a more pessimistic energy
* estimate (more busy).
*/
-static unsigned group_norm_usage(struct sched_group *sg)
+static unsigned group_norm_usage(struct energy_env *eenv,
+ struct sched_group *sg)
{
- int i;
+ int i, delta;
unsigned long usage_sum = 0;

- for_each_cpu(i, sched_group_cpus(sg))
- usage_sum += cpu_norm_usage(i);
+ for_each_cpu(i, sched_group_cpus(sg)) {
+ delta = calc_usage_delta(eenv, i);
+ usage_sum += __cpu_norm_usage(i, delta);
+ }

if (usage_sum > SCHED_CAPACITY_SCALE)
return SCHED_CAPACITY_SCALE;
return usage_sum;
}

-static int find_new_capacity(struct sched_group *sg,
+static int find_new_capacity(struct energy_env *eenv,
struct sched_group_energy *sge)
{
int idx;
- unsigned long util = group_max_usage(sg);
+ unsigned long util = group_max_usage(eenv, eenv->sg_cap);

for (idx = 0; idx < sge->nr_cap_states; idx++) {
if (sge->cap_states[idx].cap >= util)
@@ -4686,16 +4728,16 @@ static int find_new_capacity(struct sched_group *sg,
* gather the same usage statistics multiple times. This can probably be done in
* a faster but more complex way.
*/
-static unsigned int sched_group_energy(struct sched_group *sg_top)
+static unsigned int sched_group_energy(struct energy_env *eenv)
{
struct sched_domain *sd;
int cpu, total_energy = 0;
struct cpumask visit_cpus;
struct sched_group *sg;

- WARN_ON(!sg_top->sge);
+ WARN_ON(!eenv->sg_top->sge);

- cpumask_copy(&visit_cpus, sched_group_cpus(sg_top));
+ cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top));

while (!cpumask_empty(&visit_cpus)) {
struct sched_group *sg_shared_cap = NULL;
@@ -4718,18 +4760,17 @@ static unsigned int sched_group_energy(struct sched_group *sg_top)
break;

do {
- struct sched_group *sg_cap_util;
unsigned group_util;
int sg_busy_energy, sg_idle_energy;
int cap_idx;

if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
- sg_cap_util = sg_shared_cap;
+ eenv->sg_cap = sg_shared_cap;
else
- sg_cap_util = sg;
+ eenv->sg_cap = sg;

- cap_idx = find_new_capacity(sg_cap_util, sg->sge);
- group_util = group_norm_usage(sg);
+ cap_idx = find_new_capacity(eenv, sg->sge);
+ group_util = group_norm_usage(eenv, sg);
sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power)
>> SCHED_CAPACITY_SHIFT;
sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) * sg->sge->idle_states[0].power)
@@ -4740,7 +4781,7 @@ static unsigned int sched_group_energy(struct sched_group *sg_top)
if (!sd->child)
cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg));

- if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(sg_top)))
+ if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top)))
goto next_cpu;

} while (sg = sg->next, sg != sd->groups);
@@ -4749,6 +4790,7 @@ static unsigned int sched_group_energy(struct sched_group *sg_top)
continue;
}

+ eenv->energy = total_energy;
return total_energy;
}

--
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/