[RFC 1/2] procfs: /proc/sched_stat fails on very very large machines.

From: Nathan Zimmer
Date: Tue Nov 06 2012 - 16:02:14 EST


On systems with 4096 cores doing a cat /proc/sched_stat fails.
We are trying to push all the data into a single kmalloc buffer.
The issue is on these very large machines all the data will not fit in 4mb.

A better solution is to not us the single_open mechanism but to provide
our own seq_operations.

The output should be identical to previous version and thus not need the
version number.

Signed-off-by: Nathan Zimmer <nzimmer@xxxxxxx>
CC: Ingo Molnar <mingo@xxxxxxxxxx>
CC: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
CC: linux-kernel@xxxxxxxxxxxxxxx

---
kernel/sched/stats.c | 139 +++++++++++++++++++++++++++++---------------------
1 files changed, 81 insertions(+), 58 deletions(-)

diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 903ffa9..a4326a8 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -17,90 +17,113 @@ static int show_schedstat(struct seq_file *seq, void *v)
int cpu;
int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
char *mask_str = kmalloc(mask_len, GFP_KERNEL);
+ cpu = *(loff_t *)v;

if (mask_str == NULL)
return -ENOMEM;

- seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
- seq_printf(seq, "timestamp %lu\n", jiffies);
- for_each_online_cpu(cpu) {
- struct rq *rq = cpu_rq(cpu);
+ if (!cpu) {
+ seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
+ seq_printf(seq, "timestamp %lu\n", jiffies);
+ }
+
+ struct rq *rq = cpu_rq(cpu);
#ifdef CONFIG_SMP
- struct sched_domain *sd;
- int dcount = 0;
+ struct sched_domain *sd;
+ int dcount = 0;
#endif

- /* runqueue-specific stats */
- seq_printf(seq,
- "cpu%d %u 0 %u %u %u %u %llu %llu %lu",
- cpu, rq->yld_count,
- rq->sched_count, rq->sched_goidle,
- rq->ttwu_count, rq->ttwu_local,
- rq->rq_cpu_time,
- rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
+ /* runqueue-specific stats */
+ seq_printf(seq,
+ "cpu%d %u 0 %u %u %u %u %llu %llu %lu",
+ cpu, rq->yld_count,
+ rq->sched_count, rq->sched_goidle,
+ rq->ttwu_count, rq->ttwu_local,
+ rq->rq_cpu_time,
+ rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);

- seq_printf(seq, "\n");
+ seq_printf(seq, "\n");

#ifdef CONFIG_SMP
- /* domain-specific stats */
- rcu_read_lock();
- for_each_domain(cpu, sd) {
- enum cpu_idle_type itype;
-
- cpumask_scnprintf(mask_str, mask_len,
- sched_domain_span(sd));
- seq_printf(seq, "domain%d %s", dcount++, mask_str);
- for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
- itype++) {
- seq_printf(seq, " %u %u %u %u %u %u %u %u",
- sd->lb_count[itype],
- sd->lb_balanced[itype],
- sd->lb_failed[itype],
- sd->lb_imbalance[itype],
- sd->lb_gained[itype],
- sd->lb_hot_gained[itype],
- sd->lb_nobusyq[itype],
- sd->lb_nobusyg[itype]);
- }
- seq_printf(seq,
- " %u %u %u %u %u %u %u %u %u %u %u %u\n",
- sd->alb_count, sd->alb_failed, sd->alb_pushed,
- sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
- sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
- sd->ttwu_wake_remote, sd->ttwu_move_affine,
- sd->ttwu_move_balance);
+ /* domain-specific stats */
+ rcu_read_lock();
+ for_each_domain(cpu, sd) {
+ enum cpu_idle_type itype;
+
+ cpumask_scnprintf(mask_str, mask_len,
+ sched_domain_span(sd));
+ seq_printf(seq, "domain%d %s", dcount++, mask_str);
+ for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
+ itype++) {
+ seq_printf(seq, " %u %u %u %u %u %u %u %u",
+ sd->lb_count[itype],
+ sd->lb_balanced[itype],
+ sd->lb_failed[itype],
+ sd->lb_imbalance[itype],
+ sd->lb_gained[itype],
+ sd->lb_hot_gained[itype],
+ sd->lb_nobusyq[itype],
+ sd->lb_nobusyg[itype]);
}
- rcu_read_unlock();
-#endif
+ seq_printf(seq,
+ " %u %u %u %u %u %u %u %u %u %u %u %u\n",
+ sd->alb_count, sd->alb_failed, sd->alb_pushed,
+ sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
+ sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
+ sd->ttwu_wake_remote, sd->ttwu_move_affine,
+ sd->ttwu_move_balance);
}
+ rcu_read_unlock();
+#endif
kfree(mask_str);
return 0;
}

+static void *schedstat_start(struct seq_file *file, loff_t *offset)
+{
+ if (cpu_online(*offset))
+ return offset;
+ return NULL;
+}
+
+static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset)
+{
+ *offset = cpumask_next(*offset, cpu_online_mask);
+ if (cpu_online(*offset))
+ return offset;
+ return NULL;
+}
+
+static void schedstat_stop(struct seq_file *file, void *data)
+{
+}
+
+static const struct seq_operations schedstat_sops = {
+ .start = schedstat_start,
+ .next = schedstat_next,
+ .stop = schedstat_stop,
+ .show = show_schedstat,
+};
+
static int schedstat_open(struct inode *inode, struct file *file)
{
- unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
- char *buf = kmalloc(size, GFP_KERNEL);
- struct seq_file *m;
- int res;
+ int res = 0;
+
+ res = seq_open(file, &schedstat_sops);

- if (!buf)
- return -ENOMEM;
- res = single_open(file, show_schedstat, NULL);
- if (!res) {
- m = file->private_data;
- m->buf = buf;
- m->size = size;
- } else
- kfree(buf);
return res;
}

+static int schedstat_release(struct inode *inode, struct file *file)
+{
+ return 0;
+};
+
static const struct file_operations proc_schedstat_operations = {
.open = schedstat_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = single_release,
+ .release = schedstat_release,
};

static int __init proc_schedstat_init(void)
--
1.6.0.2

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/