[RFC PATCH 3/3] sched: add power aware scheduling in fork/exec/wake

From: Alex Shi
Date: Tue Nov 06 2012 - 08:12:06 EST


This patch adds power aware scheduling in fork/exec/wake. It tries to
select a CPU from the busiest group that still has spare capacity. The
trade-off is the extra power aware statistics collection needed to find
such a group, but since the collection only happens when the power
scheduling policy is eligible, there is not much performance impact.

hackbench testing shows no clear performance drop, even with the
powersaving policy.
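
For illustration only, here is a minimal standalone sketch of the group
selection rule implemented by get_sd_power_stats() below. The types,
names and main() harness are simplified assumptions for the example,
not kernel code: under the powersaving policy a group's threshold is
its CPU count, under performance it is its capacity, and the chosen
group_leader is the busiest group whose running tasks still fit under
that threshold.

/*
 * Simplified, self-contained sketch of the group_leader selection.
 * Illustrative only; the real code walks struct sched_group/sched_domain.
 */
#include <limits.h>
#include <stdio.h>

enum sched_policy { POLICY_PERFORMANCE, POLICY_POWERSAVING };

struct group_stats {
	unsigned long weight;      /* nr of CPUs in the group */
	unsigned long capacity;    /* group power scaled to nr of "full" CPUs */
	unsigned long nr_running;  /* tasks currently running in the group */
};

/* Return index of the busiest group that still has room, or -1. */
static int pick_group_leader(const struct group_stats *g, int nr_groups,
			     enum sched_policy policy)
{
	long min_delta = LONG_MAX;
	int leader = -1;
	int i;

	for (i = 0; i < nr_groups; i++) {
		/*
		 * powersaving packs tasks up to the group's CPU count,
		 * performance only up to its capacity.
		 */
		long threshold = (policy == POLICY_POWERSAVING) ?
				 (long)g[i].weight : (long)g[i].capacity;
		long delta = threshold - (long)g[i].nr_running;

		/* smallest positive delta == busiest group with spare room */
		if (delta > 0 && delta < min_delta) {
			min_delta = delta;
			leader = i;
		}
	}
	return leader;
}

int main(void)
{
	struct group_stats groups[] = {
		{ .weight = 4, .capacity = 4, .nr_running = 3 },
		{ .weight = 4, .capacity = 4, .nr_running = 1 },
	};

	/* Picks group 0: busier, but still has one CPU's worth of room. */
	printf("leader = %d\n",
	       pick_group_leader(groups, 2, POLICY_POWERSAVING));
	return 0;
}

The performance policy uses group_capacity as the threshold instead of
the CPU count, which can be smaller than the group weight when the
group's power has been scaled down, so such a group stops attracting
new tasks earlier.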

Signed-off-by: Alex Shi <alex.shi@xxxxxxxxx>
---
kernel/sched/fair.c | 233 +++++++++++++++++++++++++++++++++++-----------------
1 file changed, 159 insertions(+), 74 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index acc8b41..902ef5a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3370,12 +3370,149 @@ static int numa_select_node_cpu(struct task_struct *p, int node)
#endif /* CONFIG_SCHED_NUMA */

/*
- * sched_balance_self: balance the current task (running on cpu) in domains
+ * sd_lb_stats - Structure to store the statistics of a sched_domain
+ * during load balancing.
+ */
+struct sd_lb_stats {
+ struct sched_group *busiest; /* Busiest group in this sd */
+ struct sched_group *this; /* Local group in this sd */
+ unsigned long total_load; /* Total load of all groups in sd */
+ unsigned long total_pwr; /* Total power of all groups in sd */
+ unsigned long avg_load; /* Average load across all groups in sd */
+
+ /** Statistics of this group */
+ unsigned long this_load;
+ unsigned long this_load_per_task;
+ unsigned long this_nr_running;
+ unsigned long this_has_capacity;
+ unsigned int this_idle_cpus;
+
+ /* Statistics of the busiest group */
+ unsigned int busiest_idle_cpus;
+ unsigned long max_load;
+ unsigned long busiest_load_per_task;
+ unsigned long busiest_nr_running;
+ unsigned long busiest_group_capacity;
+ unsigned long busiest_has_capacity;
+ unsigned int busiest_group_weight;
+
+ int group_imb; /* Is there imbalance in this sd */
+
+ /* Variables of power aware scheduling */
+ unsigned long sd_capacity; /* capacity of this domain */
+ unsigned long sd_nr_running; /* Nr running of this domain */
+ struct sched_group *group_min; /* Least loaded group in sd */
+ struct sched_group *group_leader; /* Group which relieves group_min */
+ unsigned long min_load_per_task; /* load_per_task in group_min */
+ unsigned long leader_nr_running; /* Nr running of group_leader */
+ unsigned long min_nr_running; /* Nr running of group_min */
+#ifdef CONFIG_SCHED_NUMA
+ struct sched_group *numa_group; /* group which has offnode_tasks */
+ unsigned long numa_group_weight;
+ unsigned long numa_group_running;
+
+ unsigned long this_offnode_running;
+ unsigned long this_onnode_running;
+#endif
+};
+
+/*
+ * sg_lb_stats - stats of a sched_group required for load_balancing
+ * and task rq selection
+ */
+struct sg_lb_stats {
+ unsigned long avg_load; /*Avg load across the CPUs of the group */
+ unsigned long group_load; /* Total load over the CPUs of the group */
+ unsigned long sum_nr_running; /* Nr tasks running in the group */
+ unsigned long sum_weighted_load; /* Weighted load of group's tasks */
+ unsigned long group_capacity;
+ unsigned long idle_cpus;
+ unsigned long group_weight;
+ int group_imb; /* Is there an imbalance in the group ? */
+ int group_has_capacity; /* Is there extra capacity in the group? */
+#ifdef CONFIG_SCHED_NUMA
+ unsigned long numa_offnode_weight;
+ unsigned long numa_offnode_running;
+ unsigned long numa_onnode_running;
+#endif
+};
+
+static inline int
+fix_small_capacity(struct sched_domain *sd, struct sched_group *group);
+
+static void get_sg_power_stats(struct sched_group *group,
+ struct sched_domain *sd, struct sg_lb_stats *sgs)
+{
+ int i;
+
+
+ for_each_cpu(i, sched_group_cpus(group)) {
+ struct rq *rq = cpu_rq(i);
+
+ sgs->sum_nr_running += rq->nr_running;
+ }
+
+ sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
+ SCHED_POWER_SCALE);
+ if (!sgs->group_capacity)
+ sgs->group_capacity = fix_small_capacity(sd, group);
+ sgs->group_weight = group->group_weight;
+}
+
+static void get_sd_power_stats(struct sched_domain *sd,
+ struct sd_lb_stats *sds)
+{
+ struct sched_group *group;
+ struct sg_lb_stats sgs;
+ long sd_min_delta = LONG_MAX;
+
+ group = sd->groups;
+ do {
+ long g_delta;
+ unsigned long threshold;
+
+ memset(&sgs, 0, sizeof(sgs));
+ get_sg_power_stats(group, sd, &sgs);
+
+ if (sched_policy == SCHED_POLICY_POWERSAVING)
+ threshold = sgs.group_weight;
+ else
+ threshold = sgs.group_capacity;
+ g_delta = threshold - sgs.sum_nr_running;
+
+ if (g_delta > 0 && g_delta < sd_min_delta) {
+ sd_min_delta = g_delta;
+ sds->group_leader = group;
+ }
+
+ sds->sd_nr_running += sgs.sum_nr_running;
+ sds->total_pwr += group->sgp->power;
+ } while (group = group->next, group != sd->groups);
+
+ sds->sd_capacity = DIV_ROUND_CLOSEST(sds->total_pwr,
+ SCHED_POWER_SCALE);
+}
+
+static inline int get_sd_sched_policy(struct sched_domain *sd,
+ struct sd_lb_stats *sds)
+{
+ int policy = SCHED_POLICY_PERFORMANCE;
+
+ if (sched_policy != SCHED_POLICY_PERFORMANCE) {
+ memset(sds, 0, sizeof(*sds));
+ get_sd_power_stats(sd, sds);
+
+ if (sd->span_weight > sds->sd_nr_running)
+ policy = SCHED_POLICY_POWERSAVING;
+ }
+ return policy;
+}
+
+/*
+ * select_task_rq_fair: balance the current task (running on cpu) in domains
* that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
* SD_BALANCE_EXEC.
*
- * Balance, ie. select the least loaded group.
- *
* Returns the target CPU number, or the same CPU if no balancing is needed.
*
* preempt must be disabled.
@@ -3384,12 +3521,14 @@ static int
select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
{
struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
+ struct sd_lb_stats sds;
int cpu = smp_processor_id();
int prev_cpu = task_cpu(p);
int new_cpu = cpu;
int want_affine = 0;
int sync = wake_flags & WF_SYNC;
int node = tsk_home_node(p);
+ int policy = sched_policy;

if (p->nr_cpus_allowed == 1)
return prev_cpu;
@@ -3412,6 +3551,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)

new_cpu = cpu = node_cpu;
sd = per_cpu(sd_node, cpu);
+ policy = get_sd_sched_policy(sd, &sds);
goto pick_idlest;
}

@@ -3445,8 +3585,12 @@ find_sd:
break;
}

- if (tmp->flags & sd_flag)
+ if (tmp->flags & sd_flag) {
sd = tmp;
+ policy = get_sd_sched_policy(sd, &sds);
+ if (policy != SCHED_POLICY_PERFORMANCE)
+ break;
+ }
}

if (affine_sd) {
@@ -3460,7 +3604,7 @@ find_sd:
pick_idlest:
while (sd) {
int load_idx = sd->forkexec_idx;
- struct sched_group *group;
+ struct sched_group *group = NULL;
int weight;

if (!(sd->flags & sd_flag)) {
@@ -3471,7 +3615,12 @@ pick_idlest:
if (sd_flag & SD_BALANCE_WAKE)
load_idx = sd->wake_idx;

- group = find_idlest_group(sd, p, cpu, load_idx);
+ if (policy != SCHED_POLICY_PERFORMANCE)
+ group = sds.group_leader;
+
+ if (!group)
+ group = find_idlest_group(sd, p, cpu, load_idx);
+
if (!group) {
sd = sd->child;
continue;
@@ -3491,8 +3640,11 @@ pick_idlest:
for_each_domain(cpu, tmp) {
if (weight <= tmp->span_weight)
break;
- if (tmp->flags & sd_flag)
+ if (tmp->flags & sd_flag) {
sd = tmp;
+ if (policy != SCHED_POLICY_PERFORMANCE)
+ policy = get_sd_sched_policy(sd, &sds);
+ }
}
/* while loop will break here if sd == NULL */
}
@@ -4330,73 +4482,6 @@ static unsigned long task_h_load(struct task_struct *p)
#endif

/********** Helpers for find_busiest_group ************************/
-/*
- * sd_lb_stats - Structure to store the statistics of a sched_domain
- * during load balancing.
- */
-struct sd_lb_stats {
- struct sched_group *busiest; /* Busiest group in this sd */
- struct sched_group *this; /* Local group in this sd */
- unsigned long total_load; /* Total load of all groups in sd */
- unsigned long total_pwr; /* Total power of all groups in sd */
- unsigned long avg_load; /* Average load across all groups in sd */
-
- /** Statistics of this group */
- unsigned long this_load;
- unsigned long this_load_per_task;
- unsigned long this_nr_running;
- unsigned long this_has_capacity;
- unsigned int this_idle_cpus;
-
- /* Statistics of the busiest group */
- unsigned int busiest_idle_cpus;
- unsigned long max_load;
- unsigned long busiest_load_per_task;
- unsigned long busiest_nr_running;
- unsigned long busiest_group_capacity;
- unsigned long busiest_has_capacity;
- unsigned int busiest_group_weight;
-
- int group_imb; /* Is there imbalance in this sd */
-
- /* Varibles of power awaring scheduling */
- unsigned long sd_capacity; /* capacity of this domain */
- unsigned long sd_nr_running; /* Nr running of this domain */
- struct sched_group *group_min; /* Least loaded group in sd */
- struct sched_group *group_leader; /* Group which relieves group_min */
- unsigned long min_load_per_task; /* load_per_task in group_min */
- unsigned long leader_nr_running; /* Nr running of group_leader */
- unsigned long min_nr_running; /* Nr running of group_min */
-
-#ifdef CONFIG_SCHED_NUMA
- struct sched_group *numa_group; /* group which has offnode_tasks */
- unsigned long numa_group_weight;
- unsigned long numa_group_running;
-
- unsigned long this_offnode_running;
- unsigned long this_onnode_running;
-#endif
-};
-
-/*
- * sg_lb_stats - stats of a sched_group required for load_balancing
- */
-struct sg_lb_stats {
- unsigned long avg_load; /*Avg load across the CPUs of the group */
- unsigned long group_load; /* Total load over the CPUs of the group */
- unsigned long sum_nr_running; /* Nr tasks running in the group */
- unsigned long sum_weighted_load; /* Weighted load of group's tasks */
- unsigned long group_capacity;
- unsigned long idle_cpus;
- unsigned long group_weight;
- int group_imb; /* Is there an imbalance in the group ? */
- int group_has_capacity; /* Is there extra capacity in the group? */
-#ifdef CONFIG_SCHED_NUMA
- unsigned long numa_offnode_weight;
- unsigned long numa_offnode_running;
- unsigned long numa_onnode_running;
-#endif
-};

/**
* init_sd_lb_power_stats - Initialize power savings statistics for
--
1.7.12
