[RFC][PATCH 6/9] sched: power: add power_domain data structure

From: Morten Rasmussen
Date: Tue Jul 09 2013 - 11:56:46 EST


Initial proposal for the power topology representation used by the
power scheduler. For now there is just one global hierarchy; a more
scalable layout will be needed later. More topology information will
be added as the power scheduler design evolves and implements
power-topology-aware frequency/P-state and idle-state selection.
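
As an illustration (the layout below is what init_power_hierarchy()
builds, shown here for a hypothetical 4-cpu system), the default
hierarchy is one top-level domain with a leaf domain per cpu:

              power_hierarchy (span: cpus 0-3)
             /         |         |         \
         pd[0] --> pd[1] --> pd[2] --> pd[3] --.
           ^                                   |
           '-----------------------------------'

->parent points one level up, and ->next links the domains on the
same level into a circular list.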

Signed-off-by: Morten Rasmussen <morten.rasmussen@xxxxxxx>
CC: Ingo Molnar <mingo@xxxxxxxxxx>
CC: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
CC: Catalin Marinas <catalin.marinas@xxxxxxx>
---
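Notes (not for the commit message):

The hierarchy is only traversed upwards for now. Below is a minimal
usage sketch of the for_each_pd() iterator introduced by this patch;
top_level_load() is a made-up helper for illustration only, not part
of the patch:

/* Illustration only: the load seen at the root domain above @cpu */
static int top_level_load(int cpu)
{
	struct power_domain *pd, *top = NULL;

	for_each_pd(cpu, pd)	/* walks leaf -> root via ->parent */
		top = pd;

	return top ? top->load : 0;
}
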
kernel/sched/power.c | 133 +++++++++++++++++++++++++++++++++++++++++---------
1 file changed, 110 insertions(+), 23 deletions(-)
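
A worked example of the packing heuristic in calculate_cpu_capacities()
(numbers are made up; assume SCHED_POWER_SCALE = 1024, CPU_FULL = 90
and sched_power = 1024 on entry): with cpu0 at load 100, t_cap =
1024 - 100 = 924, which is above the 10% threshold of 102, so cpu0
keeps its full capacity and spare_cap becomes 924. For cpu1 at load
50, t_cap = 974 and spare_cap + t_cap = 1898 > 1024, so cpu1's
sched_power is set to 1, steering fair.c away from it.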

diff --git a/kernel/sched/power.c b/kernel/sched/power.c
index ddf249f..1ff8e4a 100644
--- a/kernel/sched/power.c
+++ b/kernel/sched/power.c
@@ -21,18 +21,54 @@
#define INTERVAL 5 /* ms */
#define CPU_FULL 90 /* Busy %-age - TODO: Make tunable */

-struct cpu_stats_struct {
+struct power_domain {
+ /* Domain hierarchy pointers */
+ struct power_domain *parent;
+ struct power_domain *next;
+ struct power_domain *child;
+ /* Domain info */
+ struct cpumask span;
+ /* current max power supported by platform */
+ unsigned long arch_power;
+ /* cpu power exposed to the scheduler (fair.c) */
+ unsigned long sched_power;
+ /* load ratio (load tracking) */
int load;
int nr_tasks;
};

-static unsigned long power_of(int cpu)
+static struct power_domain power_hierarchy;
+
+DEFINE_PER_CPU(struct power_domain *, cpu_pds);
+
+#define cpu_pd(cpu) (per_cpu(cpu_pds, (cpu)))
+
+#define for_each_pd(cpu, __pd) \
+ for (__pd = cpu_pd(cpu); __pd; __pd = __pd->parent)
+
+/*
+ * update_hierarchy updates the power domain hierarchy with new information
+ * for a specific cpu
+ */
+static void update_hierarchy(int cpu)
{
- return cpu_rq(cpu)->cpu_power;
+ int i;
+ int domain_load;
+ int domain_arch_power;
+ struct power_domain *pd;
+
+ for_each_pd(cpu, pd) {
+ domain_load = 0;
+ domain_arch_power = 0;
+ for_each_cpu_mask(i, pd->span) {
+ domain_load += cpu_pd(i)->load;
+ domain_arch_power += cpu_pd(i)->arch_power;
+ }
+ pd->load = domain_load;
+ pd->arch_power = domain_arch_power;
+ }
}

-DEFINE_PER_CPU(struct cpu_stats_struct, cpu_stats);
-
/*
* update_cpu_load fetches runqueue statistics from the scheduler and
* should only be called with appropriate locks held.
@@ -47,18 +83,19 @@ static void update_cpu_load(void)
u32 sum = rq->avg.runnable_avg_sum;
u32 period = rq->avg.runnable_avg_period;

- load = (sum * power_of(i)) / (period+1);
- per_cpu(cpu_stats, i).load = load;
- per_cpu(cpu_stats, i).nr_tasks = rq->nr_running;
+ load = (sum * power_sched_cpu_power(rq->sd, i)) / (period + 1);
+ cpu_pd(i)->load = load;
+ cpu_pd(i)->nr_tasks = rq->nr_running;

/* Take power scheduler kthread into account */
if (smp_processor_id() == i)
- per_cpu(cpu_stats, i).nr_tasks--;
+ cpu_pd(i)->nr_tasks--;
+
+ update_hierarchy(i);
}
}

extern unsigned long arch_scale_freq_power(struct sched_domain *sd, int cpu);
-DEFINE_PER_CPU(unsigned long, arch_cpu_power);

static void get_arch_cpu_power(void)
{
@@ -66,16 +103,14 @@ static void get_arch_cpu_power(void)

if (sched_feat(ARCH_POWER)) {
for_each_online_cpu(i)
- per_cpu(arch_cpu_power, i) =
+ cpu_pd(i)->arch_power =
arch_scale_freq_power(cpu_rq(i)->sd, i);
} else {
for_each_online_cpu(i)
- per_cpu(arch_cpu_power, i) = SCHED_POWER_SCALE;
+ cpu_pd(i)->arch_power = SCHED_POWER_SCALE;
}
}

-DEFINE_PER_CPU(unsigned long, cpu_power);
-
/*
* power_sched_cpu_power is called from fair.c to get the power scheduler
* cpu capacities. We can't use arch_scale_freq_power() as this may already
@@ -83,7 +118,10 @@ DEFINE_PER_CPU(unsigned long, cpu_power);
*/
unsigned long power_sched_cpu_power(struct sched_domain *sd, int cpu)
{
- return per_cpu(cpu_power, cpu);
+ if (cpu_pd(cpu))
+ return cpu_pd(cpu)->sched_power;
+ else
+ return SCHED_POWER_SCALE;
}

/*
@@ -95,7 +133,7 @@ unsigned long power_sched_cpu_power(struct sched_domain *sd, int cpu)
static void calculate_cpu_capacities(void)
{
int i, spare_cap = 0;
- struct cpu_stats_struct *stats;
+ struct power_domain *stats;

/*
* spare_cap keeps track of the total available capacity across
@@ -104,22 +142,22 @@ static void calculate_cpu_capacities(void)

for_each_online_cpu(i) {
int t_cap = 0;
- int arch_power = per_cpu(arch_cpu_power, i);
+ int sched_power = cpu_pd(i)->sched_power;

- stats = &per_cpu(cpu_stats, i);
- t_cap = arch_power - stats->load;
+ stats = cpu_pd(i);
+ t_cap = sched_power - stats->load;

- if (t_cap < (arch_power * (100-CPU_FULL)) / 100) {
+ if (t_cap < (sched_power * (100-CPU_FULL)) / 100) {
/* Potential for spreading load */
if (stats->nr_tasks > 1)
t_cap = -(stats->load / stats->nr_tasks);
}

/* Do we have enough capacity already? */
- if (spare_cap + t_cap > arch_power) {
- per_cpu(cpu_power, i) = 1;
+ if (spare_cap + t_cap > sched_power) {
+ cpu_pd(i)->sched_power = 1;
} else {
- per_cpu(cpu_power, i) = arch_power;
+ cpu_pd(i)->sched_power = cpu_pd(i)->arch_power;
spare_cap += t_cap;
}
}
@@ -136,6 +174,53 @@ static void __power_schedule(void)
rcu_read_unlock();
}

+static void init_power_domain(struct power_domain *pd)
+{
+ pd->parent = NULL;
+ pd->next = pd;
+ pd->child = NULL;
+ pd->load = 0;
+ pd->arch_power = 0;
+ pd->sched_power = 0;
+ cpumask_copy(&pd->span, cpu_possible_mask);
+}
+
+/*
+ * init_power_hierarchy sets up the default power domain hierarchy with
+ * one top-level domain spanning all cpus and a child domain per cpu.
+ * next points to the next power domain at the current level and forms a
+ * circular list.
+ */
+static void init_power_hierarchy(void)
+{
+ int cpu, next_cpu;
+ struct power_domain *pd;
+
+ init_power_domain(&power_hierarchy);
+ cpumask_copy(&power_hierarchy.span, cpu_possible_mask);
+
+ pd = kcalloc(nr_cpu_ids, sizeof(struct power_domain), GFP_KERNEL);
+
+ cpu = cpumask_next(-1, &power_hierarchy.span);
+
+ while (cpu < nr_cpu_ids) {
+ cpu_pd(cpu) = &pd[cpu];
+ cpu_pd(cpu)->parent = &power_hierarchy;
+ cpu_pd(cpu)->child = NULL;
+ cpumask_copy(&(cpu_pd(cpu)->span), get_cpu_mask(cpu));
+ cpu_pd(cpu)->arch_power = 1;
+ cpu_pd(cpu)->sched_power = 1;
+
+ next_cpu = cpumask_next(cpu, &power_hierarchy.span);
+ if (next_cpu < nr_cpu_ids)
+ cpu_pd(cpu)->next = &pd[next_cpu];
+ else
+ cpu_pd(cpu)->next =
+ &pd[cpumask_first(&power_hierarchy.span)];
+ cpu = next_cpu;
+ }
+}
+
struct delayed_work dwork;

/* Periodic power schedule target cpu */
@@ -153,6 +238,8 @@ void power_schedule_wq(struct work_struct *work)

static int __init sched_power_init(void)
{
+ init_power_hierarchy();
+
INIT_DELAYED_WORK(&dwork, power_schedule_wq);
mod_delayed_work_on(schedule_cpu(), system_wq, &dwork,
msecs_to_jiffies(INTERVAL));
--
1.7.9.5