[RFC 07/13] sched: Reduce stack size requirements in kernel/sched.c

From: Mike Travis
Date: Sat Sep 06 2008 - 19:53:10 EST


* Make the following changes to kernel/sched.c functions (an
illustrative sketch of the conversion pattern follows this list):

- use node_to_cpumask_ptr in place of node_to_cpumask
- use get_cpumask_var for temporary cpumask_t variables
- use alloc_cpumask_ptr where available
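
For reference, the conversion pattern looks roughly like the sketch
below. This is illustrative only and not part of the patch; it assumes
the cpumask_ptr helpers introduced earlier in this series
(DEFINE_PER_CPUMASK, get_cpumask_var/put_cpumask_var, cpumask_ptr),
and the example_* names are made up for the example:

	/* Before: each on-stack cpumask_t costs NR_CPUS/8 bytes. */
	static int example_old(struct task_struct *p, int cpu)
	{
		cpumask_t mask;		/* 512 bytes at NR_CPUS=4096 */

		mask = node_to_cpumask(cpu_to_node(cpu));
		cpus_and(mask, mask, p->cpus_allowed);
		return any_online_cpu(mask);
	}

	/* After: borrow a preallocated per-cpu scratch mask instead. */
	static DEFINE_PER_CPUMASK(example_temp_mask);

	static int example_new(struct task_struct *p, int cpu)
	{
		int ret;
		cpumask_ptr mask;
		node_to_cpumask_ptr(pnodemask, cpu_to_node(cpu));

		get_cpumask_var(mask, example_temp_mask);
		cpus_and(*mask, *pnodemask, p->cpus_allowed);
		ret = any_online_cpu(*mask);
		put_cpumask_var(mask, example_temp_mask);
		return ret;
	}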

* Remove the special SCHED_CPUMASK_ALLOC code and use CPUMASK_ALLOC
from linux/cpumask.h instead (the approximate shape of those helpers
is sketched below).
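
Reviewer note, not part of the patch: CPUMASK_ALLOC(), CPUMASK_PTR()
and CPUMASK_FREE() already live in linux/cpumask.h.  From memory they
expand roughly as sketched here, so "allmasks" becomes a single
kmalloc'ed struct and each CPUMASK_PTR() declares a cpumask_t pointer
into one of its fields; check the header for the exact definitions:

	#define CPUMASK_ALLOC(m)	struct m *m = kmalloc(sizeof(*m), GFP_KERNEL)
	#define CPUMASK_PTR(v, m)	cpumask_t *v = &(m->v)
	#define CPUMASK_FREE(m)		kfree(m)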

* The resultant stack savings are:

====== Stack (-l 100)

1 - initial
2 - stack-hogs-kernel_sched_c
'.' is less than the limit(100)

   .1.    .2.  ..final..
  2216  -1536    680   -69%  __build_sched_domains
  1592  -1592      .  -100%  move_task_off_dead_cpu
  1096  -1096      .  -100%  sched_balance_self
  1032  -1032      .  -100%  sched_setaffinity
   616   -616      .  -100%  rebalance_domains
   552   -552      .  -100%  free_sched_groups
   512   -512      .  -100%  cpu_to_allnodes_group
  7616  -6936    680   -91%  Totals
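
(For scale: assuming these numbers were collected with NR_CPUS=4096,
a single cpumask_t is 4096 bits / 8 = 512 bytes, so each on-stack
cpumask_t or node_to_cpumask() temporary eliminated above accounts
for roughly half a kilobyte of the reduction.)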


Applies to linux-2.6.tip/master.

Signed-off-by: Mike Travis <travis@xxxxxxx>
---
kernel/sched.c | 151 ++++++++++++++++++++++++++++++---------------------------
1 file changed, 81 insertions(+), 70 deletions(-)

--- linux-2.6.tip.orig/kernel/sched.c
+++ linux-2.6.tip/kernel/sched.c
@@ -70,6 +70,7 @@
#include <linux/bootmem.h>
#include <linux/debugfs.h>
#include <linux/ctype.h>
+#include <linux/cpumask_ptr.h>
#include <linux/ftrace.h>
#include <trace/sched.h>

@@ -117,6 +118,12 @@
*/
#define RUNTIME_INF ((u64)~0ULL)

+/*
+ * temp cpumask variables
+ */
+static DEFINE_PER_CPUMASK(temp_cpumask_1);
+static DEFINE_PER_CPUMASK(temp_cpumask_2);
+
#ifdef CONFIG_SMP
/*
* Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
@@ -2141,7 +2148,11 @@ static int sched_balance_self(int cpu, i
{
struct task_struct *t = current;
struct sched_domain *tmp, *sd = NULL;
+ cpumask_ptr span;
+ cpumask_ptr tmpmask;

+ get_cpumask_var(span, temp_cpumask_1);
+ get_cpumask_var(tmpmask, temp_cpumask_2);
for_each_domain(cpu, tmp) {
/*
* If power savings logic is enabled for a domain, stop there.
@@ -2156,7 +2167,6 @@ static int sched_balance_self(int cpu, i
update_shares(sd);

while (sd) {
- cpumask_t span, tmpmask;
struct sched_group *group;
int new_cpu, weight;

@@ -2165,14 +2175,14 @@ static int sched_balance_self(int cpu, i
continue;
}

- span = sd->span;
+ *span = sd->span;
group = find_idlest_group(sd, t, cpu);
if (!group) {
sd = sd->child;
continue;
}

- new_cpu = find_idlest_cpu(group, t, cpu, &tmpmask);
+ new_cpu = find_idlest_cpu(group, t, cpu, tmpmask);
if (new_cpu == -1 || new_cpu == cpu) {
/* Now try balancing at a lower domain level of cpu */
sd = sd->child;
@@ -2182,7 +2192,7 @@ static int sched_balance_self(int cpu, i
/* Now try balancing at a lower domain level of new_cpu */
cpu = new_cpu;
sd = NULL;
- weight = cpus_weight(span);
+ weight = cpus_weight(*span);
for_each_domain(cpu, tmp) {
if (weight <= cpus_weight(tmp->span))
break;
@@ -2192,6 +2202,9 @@ static int sched_balance_self(int cpu, i
/* while loop will break here if sd == NULL */
}

+ put_cpumask_var(span, temp_cpumask_1);
+ put_cpumask_var(tmpmask, temp_cpumask_2);
+
return cpu;
}

@@ -3865,8 +3878,9 @@ static void rebalance_domains(int cpu, e
unsigned long next_balance = jiffies + 60*HZ;
int update_next_balance = 0;
int need_serialize;
- cpumask_t tmp;
+ cpumask_ptr tmp;

+ get_cpumask_var(tmp, temp_cpumask_1);
for_each_domain(cpu, sd) {
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
@@ -3890,7 +3904,7 @@ static void rebalance_domains(int cpu, e
}

if (time_after_eq(jiffies, sd->last_balance + interval)) {
- if (load_balance(cpu, rq, sd, idle, &balance, &tmp)) {
+ if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
/*
* We've pulled tasks over so either we're no
* longer idle, or one of our SMT siblings is
@@ -3924,6 +3938,8 @@ out:
*/
if (likely(update_next_balance))
rq->next_balance = next_balance;
+
+ put_cpumask_var(tmp, temp_cpumask_1);
}

/*
@@ -5384,11 +5400,14 @@ out_unlock:

long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
{
- cpumask_t cpus_allowed;
- cpumask_t new_mask = *in_mask;
+ cpumask_ptr cpus_allowed;
+ cpumask_ptr new_mask;
struct task_struct *p;
int retval;

+ get_cpumask_var(cpus_allowed, temp_cpumask_1);
+ get_cpumask_var(new_mask, temp_cpumask_2);
+ *new_mask = *in_mask;
get_online_cpus();
read_lock(&tasklist_lock);

@@ -5416,24 +5435,26 @@ long sched_setaffinity(pid_t pid, const
if (retval)
goto out_unlock;

- cpuset_cpus_allowed(p, &cpus_allowed);
- cpus_and(new_mask, new_mask, cpus_allowed);
+ cpuset_cpus_allowed(p, cpus_allowed);
+ cpus_and(*new_mask, *new_mask, *cpus_allowed);
again:
- retval = set_cpus_allowed_ptr(p, &new_mask);
+ retval = set_cpus_allowed_ptr(p, new_mask);

if (!retval) {
- cpuset_cpus_allowed(p, &cpus_allowed);
- if (!cpus_subset(new_mask, cpus_allowed)) {
+ cpuset_cpus_allowed(p, cpus_allowed);
+ if (!cpus_subset(*new_mask, *cpus_allowed)) {
/*
* We must have raced with a concurrent cpuset
* update. Just reset the cpus_allowed to the
* cpuset's cpus_allowed
*/
- new_mask = cpus_allowed;
+ *new_mask = *cpus_allowed;
goto again;
}
}
out_unlock:
+ put_cpumask_var(cpus_allowed, temp_cpumask_1);
+ put_cpumask_var(new_mask, temp_cpumask_2);
put_task_struct(p);
put_online_cpus();
return retval;
@@ -6107,15 +6128,19 @@ static int __migrate_task_irq(struct tas
static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
{
unsigned long flags;
- cpumask_t mask;
+ cpumask_ptr mask;
+ cpumask_ptr cpus_allowed;
struct rq *rq;
int dest_cpu;

+ get_cpumask_var(mask, temp_cpumask_1);
+ get_cpumask_var(cpus_allowed, temp_cpumask_2);
do {
/* On same node? */
- mask = node_to_cpumask(cpu_to_node(dead_cpu));
- cpus_and(mask, mask, p->cpus_allowed);
- dest_cpu = any_online_cpu(mask);
+ node_to_cpumask_ptr(pnodemask, cpu_to_node(dead_cpu));
+ *mask = *pnodemask;
+ cpus_and(*mask, *mask, p->cpus_allowed);
+ dest_cpu = any_online_cpu(*mask);

/* On any allowed CPU? */
if (dest_cpu >= nr_cpu_ids)
@@ -6123,9 +6148,8 @@ static void move_task_off_dead_cpu(int d

/* No more Mr. Nice Guy. */
if (dest_cpu >= nr_cpu_ids) {
- cpumask_t cpus_allowed;
+ cpuset_cpus_allowed_locked(p, cpus_allowed);

- cpuset_cpus_allowed_locked(p, &cpus_allowed);
/*
* Try to stay on the same cpuset, where the
* current cpuset may be a subset of all cpus.
@@ -6134,7 +6158,7 @@ static void move_task_off_dead_cpu(int d
* called within calls to cpuset_lock/cpuset_unlock.
*/
rq = task_rq_lock(p, &flags);
- p->cpus_allowed = cpus_allowed;
+ p->cpus_allowed = *cpus_allowed;
dest_cpu = any_online_cpu(p->cpus_allowed);
task_rq_unlock(rq, &flags);

@@ -6150,6 +6174,9 @@ static void move_task_off_dead_cpu(int d
}
}
} while (!__migrate_task_irq(p, dead_cpu, dest_cpu));
+
+ put_cpumask_var(mask, temp_cpumask_1);
+ put_cpumask_var(cpus_allowed, temp_cpumask_2);
}

/*
@@ -6710,7 +6737,7 @@ static int sched_domain_debug_one(struct

static void sched_domain_debug(struct sched_domain *sd, int cpu)
{
- cpumask_t *groupmask;
+ cpumask_ptr groupmask;
int level = 0;

if (!sd) {
@@ -6720,7 +6747,7 @@ static void sched_domain_debug(struct sc

printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);

- groupmask = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
+ alloc_cpumask_ptr(&groupmask);
if (!groupmask) {
printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
return;
@@ -6734,7 +6761,7 @@ static void sched_domain_debug(struct sc
if (!sd)
break;
}
- kfree(groupmask);
+ free_cpumask_ptr(groupmask);
}
#else /* !CONFIG_SCHED_DEBUG */
# define sched_domain_debug(sd, cpu) do { } while (0)
@@ -7120,9 +7147,9 @@ static int cpu_to_allnodes_group(int cpu
struct sched_group **sg, cpumask_t *nodemask)
{
int group;
+ node_to_cpumask_ptr(pnodemask, cpu_to_node(cpu));

- *nodemask = node_to_cpumask(cpu_to_node(cpu));
- cpus_and(*nodemask, *nodemask, *cpu_map);
+ cpus_and(*nodemask, *pnodemask, *cpu_map);
group = first_cpu(*nodemask);

if (sg)
@@ -7172,9 +7199,9 @@ static void free_sched_groups(const cpum

for (i = 0; i < nr_node_ids; i++) {
struct sched_group *oldsg, *sg = sched_group_nodes[i];
+ node_to_cpumask_ptr(pnodemask, i);

- *nodemask = node_to_cpumask(i);
- cpus_and(*nodemask, *nodemask, *cpu_map);
+ cpus_and(*nodemask, *pnodemask, *cpu_map);
if (cpus_empty(*nodemask))
continue;

@@ -7297,19 +7324,6 @@ struct allmasks {
#endif
};

-#if NR_CPUS > 128
-#define SCHED_CPUMASK_ALLOC 1
-#define SCHED_CPUMASK_FREE(v) kfree(v)
-#define SCHED_CPUMASK_DECLARE(v) struct allmasks *v
-#else
-#define SCHED_CPUMASK_ALLOC 0
-#define SCHED_CPUMASK_FREE(v)
-#define SCHED_CPUMASK_DECLARE(v) struct allmasks _v, *v = &_v
-#endif
-
-#define SCHED_CPUMASK_VAR(v, a) cpumask_t *v = (cpumask_t *) \
- ((unsigned long)(a) + offsetof(struct allmasks, v))
-
static int default_relax_domain_level = -1;

static int __init setup_relax_domain_level(char *str)
@@ -7354,8 +7368,9 @@ static int __build_sched_domains(const c
{
int i;
struct root_domain *rd;
- SCHED_CPUMASK_DECLARE(allmasks);
- cpumask_t *tmpmask;
+ CPUMASK_ALLOC(allmasks);
+ CPUMASK_PTR(tmpmask, allmasks);
+
#ifdef CONFIG_NUMA
struct sched_group **sched_group_nodes = NULL;
int sd_allnodes = 0;
@@ -7367,6 +7382,7 @@ static int __build_sched_domains(const c
GFP_KERNEL);
if (!sched_group_nodes) {
printk(KERN_WARNING "Can not alloc sched group node list\n");
+ CPUMASK_FREE(allmasks);
return -ENOMEM;
}
#endif
@@ -7377,13 +7393,11 @@ static int __build_sched_domains(const c
#ifdef CONFIG_NUMA
kfree(sched_group_nodes);
#endif
+ CPUMASK_FREE(allmasks);
return -ENOMEM;
}

-#if SCHED_CPUMASK_ALLOC
- /* get space for all scratch cpumask variables */
- allmasks = kmalloc(sizeof(*allmasks), GFP_KERNEL);
- if (!allmasks) {
+ if (allmasks == NULL) {
printk(KERN_WARNING "Cannot alloc cpumask array\n");
kfree(rd);
#ifdef CONFIG_NUMA
@@ -7391,9 +7405,6 @@ static int __build_sched_domains(const c
#endif
return -ENOMEM;
}
-#endif
- tmpmask = (cpumask_t *)allmasks;
-

#ifdef CONFIG_NUMA
sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
@@ -7404,10 +7415,10 @@ static int __build_sched_domains(const c
*/
for_each_cpu_mask_nr(i, *cpu_map) {
struct sched_domain *sd = NULL, *p;
- SCHED_CPUMASK_VAR(nodemask, allmasks);
+ CPUMASK_PTR(nodemask, allmasks);
+ node_to_cpumask_ptr(pnodemask, cpu_to_node(i));

- *nodemask = node_to_cpumask(cpu_to_node(i));
- cpus_and(*nodemask, *nodemask, *cpu_map);
+ cpus_and(*nodemask, *pnodemask, *cpu_map);

#ifdef CONFIG_NUMA
if (cpus_weight(*cpu_map) >
@@ -7470,8 +7481,8 @@ static int __build_sched_domains(const c
#ifdef CONFIG_SCHED_SMT
/* Set up CPU (sibling) groups */
for_each_cpu_mask_nr(i, *cpu_map) {
- SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
- SCHED_CPUMASK_VAR(send_covered, allmasks);
+ CPUMASK_PTR(this_sibling_map, allmasks);
+ CPUMASK_PTR(send_covered, allmasks);

*this_sibling_map = per_cpu(cpu_sibling_map, i);
cpus_and(*this_sibling_map, *this_sibling_map, *cpu_map);
@@ -7487,8 +7498,8 @@ static int __build_sched_domains(const c
#ifdef CONFIG_SCHED_MC
/* Set up multi-core groups */
for_each_cpu_mask_nr(i, *cpu_map) {
- SCHED_CPUMASK_VAR(this_core_map, allmasks);
- SCHED_CPUMASK_VAR(send_covered, allmasks);
+ CPUMASK_PTR(this_core_map, allmasks);
+ CPUMASK_PTR(send_covered, allmasks);

*this_core_map = cpu_coregroup_map(i);
cpus_and(*this_core_map, *this_core_map, *cpu_map);
@@ -7503,11 +7514,11 @@ static int __build_sched_domains(const c

/* Set up physical groups */
for (i = 0; i < nr_node_ids; i++) {
- SCHED_CPUMASK_VAR(nodemask, allmasks);
- SCHED_CPUMASK_VAR(send_covered, allmasks);
+ CPUMASK_PTR(nodemask, allmasks);
+ CPUMASK_PTR(send_covered, allmasks);
+ node_to_cpumask_ptr(pnodemask, i);

- *nodemask = node_to_cpumask(i);
- cpus_and(*nodemask, *nodemask, *cpu_map);
+ cpus_and(*nodemask, *pnodemask, *cpu_map);
if (cpus_empty(*nodemask))
continue;

@@ -7519,7 +7530,7 @@ static int __build_sched_domains(const c
#ifdef CONFIG_NUMA
/* Set up node groups */
if (sd_allnodes) {
- SCHED_CPUMASK_VAR(send_covered, allmasks);
+ CPUMASK_PTR(send_covered, allmasks);

init_sched_build_groups(cpu_map, cpu_map,
&cpu_to_allnodes_group,
@@ -7529,15 +7540,15 @@ static int __build_sched_domains(const c
for (i = 0; i < nr_node_ids; i++) {
/* Set up node groups */
struct sched_group *sg, *prev;
- SCHED_CPUMASK_VAR(nodemask, allmasks);
- SCHED_CPUMASK_VAR(domainspan, allmasks);
- SCHED_CPUMASK_VAR(covered, allmasks);
+ CPUMASK_PTR(nodemask, allmasks);
+ CPUMASK_PTR(domainspan, allmasks);
+ CPUMASK_PTR(covered, allmasks);
+ node_to_cpumask_ptr(pnodemask, i);
int j;

- *nodemask = node_to_cpumask(i);
cpus_clear(*covered);

- cpus_and(*nodemask, *nodemask, *cpu_map);
+ cpus_and(*nodemask, *pnodemask, *cpu_map);
if (cpus_empty(*nodemask)) {
sched_group_nodes[i] = NULL;
continue;
@@ -7566,7 +7577,7 @@ static int __build_sched_domains(const c
prev = sg;

for (j = 0; j < nr_node_ids; j++) {
- SCHED_CPUMASK_VAR(notcovered, allmasks);
+ CPUMASK_PTR(notcovered, allmasks);
int n = (i + j) % nr_node_ids;
node_to_cpumask_ptr(pnodemask, n);

@@ -7645,13 +7656,13 @@ static int __build_sched_domains(const c
cpu_attach_domain(sd, rd, i);
}

- SCHED_CPUMASK_FREE((void *)allmasks);
+ CPUMASK_FREE(allmasks);
return 0;

#ifdef CONFIG_NUMA
error:
free_sched_groups(cpu_map, tmpmask);
- SCHED_CPUMASK_FREE((void *)allmasks);
+ CPUMASK_FREE(allmasks);
return -ENOMEM;
#endif
}

--