[RFC PATCH 4/9 v4] Define SD_WORKLOAD_CONSOLIDATION and attach to sched_domain

From: Yuyang Du
Date: Wed Jun 25 2014 - 04:46:23 EST


Workload Consolidation is driven entirely by CPU topology and policy. To support
this, we define SD_WORKLOAD_CONSOLIDATION and add some fields to struct sched_domain:

1) total_groups is the total number of groups in this domain
2) group_number is this CPU's group sequence number
3) consolidating_coeff is the coefficient for consolidating CPUs; it can be changed
via sysctl to make consolidation more or less aggressive
4) first_group is a pointer to this domain's first group, ordered by CPU number

This patchset enables SD_WORKLOAD_CONSOLIDATION in the MC domain by default. However,
we still need a better way to determine on which architectures this flag should be
enabled. Thanks to PeterZ and Dietmar for pointing this out and helping me
finally understand it.

Signed-off-by: Yuyang Du <yuyang.du@xxxxxxxxx>
---
include/linux/sched.h | 8 +++++++-
kernel/sched/core.c | 46 ++++++++++++++++++++++++++++++++++++++++++----
kernel/sched/sched.h | 13 ++++++++++---
3 files changed, 59 insertions(+), 8 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1b1997d..a339467 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -870,6 +870,7 @@ enum cpu_idle_type {
#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */
#define SD_NUMA 0x4000 /* cross-node balancing */
+#define SD_WORKLOAD_CONSOLIDATION 0x8000 /* consolidate CPU workload */

#ifdef CONFIG_SCHED_SMT
static inline const int cpu_smt_flags(void)
@@ -881,7 +882,7 @@ static inline const int cpu_smt_flags(void)
#ifdef CONFIG_SCHED_MC
static inline const int cpu_core_flags(void)
{
- return SD_SHARE_PKG_RESOURCES;
+ return SD_SHARE_PKG_RESOURCES | SD_WORKLOAD_CONSOLIDATION;
}
#endif

@@ -973,6 +974,11 @@ struct sched_domain {
struct rcu_head rcu; /* used during destruction */
};

+ unsigned int total_groups; /* total group number */
+ unsigned int group_number; /* this CPU's group sequence */
+ unsigned int consolidating_coeff; /* consolidating coefficient */
+ struct sched_group *first_group; /* ordered by CPU number */
+
unsigned int span_weight;
/*
* Span of all CPUs in this domain.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3bdf01b..da3cd74 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4941,7 +4941,7 @@ set_table_entry(struct ctl_table *entry,
static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
- struct ctl_table *table = sd_alloc_ctl_entry(14);
+ struct ctl_table *table = sd_alloc_ctl_entry(15);

if (table == NULL)
return NULL;
@@ -4974,7 +4974,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[12], "name", sd->name,
CORENAME_MAX_SIZE, 0444, proc_dostring, false);
- /* &table[13] is terminator */
+ set_table_entry(&table[13], "consolidating_coeff", &sd->consolidating_coeff,
+ sizeof(int), 0644, proc_dointvec, false);
+ /* &table[14] is terminator */

return table;
}
@@ -5586,7 +5588,7 @@ static void update_top_cache_domain(int cpu)
int id = cpu;
int size = 1;

- sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
+ sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES, 1);
if (sd) {
id = cpumask_first(sched_domain_span(sd));
size = cpumask_weight(sched_domain_span(sd));
@@ -5601,10 +5603,41 @@ static void update_top_cache_domain(int cpu)
sd = lowest_flag_domain(cpu, SD_NUMA);
rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);

- sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
+ sd = highest_flag_domain(cpu, SD_ASYM_PACKING, 1);
rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
}

+
+DEFINE_PER_CPU(struct sched_domain *, sd_wc);
+/* Recompute group stats for sd and its parents, then publish cpu's sd_wc. */
+static void update_wc_domain(struct sched_domain *sd, int cpu)
+{
+ while (sd) { /* visit sd, then each ->parent in turn */
+ int i = 0, j = 0, first, min = INT_MAX;
+ struct sched_group *group;
+
+ group = sd->groups;
+ first = group_first_cpu(group); /* reference value taken from sd->groups */
+ do {
+ int k = group_first_cpu(group);
+ i += 1; /* counts every group in the list */
+ if (k < first)
+ j += 1; /* counts groups whose value is below 'first' */
+ if (k < min) { /* remember the group with the smallest value */
+ sd->first_group = group;
+ min = k;
+ }
+ } while (group = group->next, group != sd->groups); /* stop on wrap-around */
+
+ sd->total_groups = i;
+ sd->group_number = j;
+ sd = sd->parent;
+ }
+ /* all == 0: a domain lacking the flag is skipped, not a stopping point */
+ sd = highest_flag_domain(cpu, SD_WORKLOAD_CONSOLIDATION, 0);
+ rcu_assign_pointer(per_cpu(sd_wc, cpu), sd);
+}
+
/*
* Attach the domain 'sd' to 'cpu' as its base domain. Callers must
* hold the hotplug lock.
@@ -5653,6 +5686,8 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
destroy_sched_domains(tmp, cpu);

update_top_cache_domain(cpu);
+
+ update_wc_domain(sd, cpu);
}

/* cpus with isolated domains */
@@ -6069,6 +6104,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
#ifdef CONFIG_SCHED_DEBUG
.name = tl->name,
#endif
+ .consolidating_coeff = 0,
};

/*
@@ -6098,6 +6134,8 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
}

#endif
+ } else if (sd->flags & SD_WORKLOAD_CONSOLIDATION) {
+ sd->consolidating_coeff = 160;
} else {
sd->flags |= SD_PREFER_SIBLING;
sd->cache_nice_tries = 1;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index eb47ce2..a2a7230 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -695,16 +695,22 @@ extern void sched_ttwu_pending(void);
* be returned.
* @flag: The flag to check for the highest sched_domain
* for the given cpu.
+ * @all: If set, all sched_domains from the highest down must contain the flag
*
* Returns the highest sched_domain of a cpu which contains the given flag.
*/
-static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
+static inline struct
+sched_domain *highest_flag_domain(int cpu, int flag, int all)
{
struct sched_domain *sd, *hsd = NULL;

for_each_domain(cpu, sd) {
- if (!(sd->flags & flag))
- break;
+ if (!(sd->flags & flag)) {
+ if (all)
+ break;
+ else
+ continue;
+ }
hsd = sd;
}

@@ -729,6 +735,7 @@ DECLARE_PER_CPU(int, sd_llc_id);
DECLARE_PER_CPU(struct sched_domain *, sd_numa);
DECLARE_PER_CPU(struct sched_domain *, sd_busy);
DECLARE_PER_CPU(struct sched_domain *, sd_asym);
+DECLARE_PER_CPU(struct sched_domain *, sd_wc);

struct sched_group_capacity {
atomic_t ref;
--
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/