[RFC PATCH] sched: Add cpu based entries to debugfs

From: David Ahern
Date: Sun Mar 29 2015 - 22:14:26 EST


Currently sched_debug can be added to the kernel command line to dump
domain information during boot. This method is not practical with a
large number of CPUs.

This patch adds per-cpu entries to debugfs under a sched directory.
Reading the per-cpu file shows the domain information in a human-readable
format:

$ cat /sys/kernel/debug/sched/cpu0
domain 0 / SMT:
flags: 0x2af: load-balance new-idle exec fork affine cpu-capacity share-pkg-resources
span: 0-7
groups:
0 (cpu_capacity = 147)
1 (cpu_capacity = 147)
2 (cpu_capacity = 147)
3 (cpu_capacity = 147)
4 (cpu_capacity = 147)
5 (cpu_capacity = 147)
6 (cpu_capacity = 147)
7 (cpu_capacity = 147)

domain 2 / DIE:
flags: 0x102f: load-balance new-idle exec fork affine prefer-sibling
span: 0-127
groups:
0-7 (cpu_capacity = 1176)
8-15 (cpu_capacity = 1176)
16-23 (cpu_capacity = 1176)
24-31 (cpu_capacity = 1176)
32-39 (cpu_capacity = 1176)
40-47 (cpu_capacity = 1176)
48-55 (cpu_capacity = 1176)
56-63 (cpu_capacity = 1176)
64-71 (cpu_capacity = 1176)
72-79 (cpu_capacity = 1176)
80-87 (cpu_capacity = 1176)
88-95 (cpu_capacity = 1176)
96-103 (cpu_capacity = 1176)
104-111 (cpu_capacity = 1176)
112-119 (cpu_capacity = 1176)
120-127 (cpu_capacity = 1176)

domain 3 / NUMA:
flags: 0x642f: load-balance new-idle exec fork affine serialize overlap numa
span: 0-1023
groups:
0-127 (cpu_capacity = 18816)
128-255 (cpu_capacity = 18816)
256-383 (cpu_capacity = 18816)
384-511 (cpu_capacity = 18816)
512-639 (cpu_capacity = 18816)
640-767 (cpu_capacity = 18816)
768-895 (cpu_capacity = 18816)
896-1023 (cpu_capacity = 18816)

Before spending too much time formalizing this I wanted to see if you guys
would entertain the idea of making this info available via debugfs. It does
move the existing sched_features file to sched/features -- not sure how
acceptable it is to move files in debugfs.

TO-DO: handle hotplug

Signed-off-by: David Ahern <david.ahern@xxxxxxxxxx>
---
kernel/sched/core.c | 167 +++++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 164 insertions(+), 3 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 62671f53202a..b4d8d0c8260e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -268,12 +268,173 @@ static const struct file_operations sched_feat_fops = {
.release = single_release,
};

+/*
+ * Printable names for the bits of sd->flags: entry i names bit (1 << i).
+ * Bits that have no name are kept as empty strings so the table can be
+ * indexed directly by bit number.
+ */
+static const char * const sd_flag_names[] = {
+	[0]  = "load-balance",
+	[1]  = "new-idle",
+	[2]  = "exec",
+	[3]  = "fork",
+	[4]  = "wake",
+	[5]  = "affine",
+	[6]  = "",
+	[7]  = "cpu-capacity",
+	[8]  = "power-domain",
+	[9]  = "share-pkg-resources",
+	[10] = "serialize",
+	[11] = "asym-packing",
+	[12] = "prefer-sibling",
+	[13] = "overlap",
+	[14] = "numa",
+	[15] = "",
+};
+/*
+ * Print one scheduling domain of @cpu into @m: level and name, the flag
+ * word followed by the names of the set bits, the span, and every group
+ * on the domain's circular group list.  Inconsistencies are reported
+ * inline as "ERROR:" lines; the group walk stops at the first broken
+ * group.
+ */
+static void sched_cpu_domain_show(struct seq_file *m, struct sched_domain *sd,
+				  int cpu)
+{
+	/* NOTE(review): on-stack cpumask; may be large for big NR_CPUS. */
+	struct cpumask groupmask;
+	struct sched_group *group = sd->groups;
+	int i;
+
+	cpumask_clear(&groupmask);
+
+	seq_printf(m, "domain %d / %s:\n", sd->level, sd->name);
+	seq_printf(m, " flags: 0x%x: ", sd->flags);
+
+	for (i = 0; i < ARRAY_SIZE(sd_flag_names); ++i) {
+		if (sd->flags & (1 << i))
+			seq_printf(m, " %s", sd_flag_names[i]);
+	}
+	seq_puts(m, "\n");
+
+	if (!(sd->flags & SD_LOAD_BALANCE) && sd->parent)
+		seq_puts(m, " ERROR: !SD_LOAD_BALANCE domain has parent\n");
+
+	seq_printf(m, " span: %*pbl\n",
+		   cpumask_pr_args(sched_domain_span(sd)));
+
+	if (!cpumask_test_cpu(cpu, sched_domain_span(sd)))
+		seq_printf(m, " ERROR: domain->span does not contain CPU%d\n", cpu);
+
+	/*
+	 * A NULL group list is reported by the loop below; do not
+	 * dereference it here first.
+	 */
+	if (group && !cpumask_test_cpu(cpu, sched_group_cpus(group)))
+		seq_printf(m, " ERROR: domain->groups does not contain CPU%d\n", cpu);
+
+	seq_puts(m, " groups:\n");
+	do {
+		if (!group) {
+			seq_puts(m, " ERROR: group is NULL\n");
+			break;
+		}
+
+		/*
+		 * Even though we initialize ->capacity to something semi-sane,
+		 * we leave capacity_orig unset. This allows us to detect if
+		 * domain iteration is still funny without causing /0 traps.
+		 */
+		if (!group->sgc->capacity_orig) {
+			seq_puts(m, " ERROR: domain->cpu_capacity not set\n");
+			break;
+		}
+
+		if (!cpumask_weight(sched_group_cpus(group))) {
+			seq_puts(m, " ERROR: empty group\n");
+			break;
+		}
+
+		/* Groups may share CPUs only when the domain is marked SD_OVERLAP. */
+		if (!(sd->flags & SD_OVERLAP) &&
+		    cpumask_intersects(&groupmask, sched_group_cpus(group))) {
+			seq_puts(m, " ERROR: repeated CPUs\n");
+			break;
+		}
+
+		/* Accumulate the CPUs seen so far for the span checks below. */
+		cpumask_or(&groupmask, &groupmask, sched_group_cpus(group));
+
+		seq_printf(m, " %*pbl",
+			   cpumask_pr_args(sched_group_cpus(group)));
+
+		if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
+			seq_printf(m, " (cpu_capacity = %d)",
+				   group->sgc->capacity);
+		}
+		seq_puts(m, "\n");
+
+		group = group->next;
+	} while (group != sd->groups);
+
+	if (!cpumask_equal(sched_domain_span(sd), &groupmask))
+		seq_puts(m, " ERROR: groups don't span domain->span\n");
+
+	if (sd->parent &&
+	    !cpumask_subset(&groupmask, sched_domain_span(sd->parent))) {
+		seq_puts(m, " ERROR: parent span is not a superset of domain->span\n");
+	}
+}
+
+/*
+ * seq_file show callback for the per-cpu debugfs files.  The CPU number
+ * arrives in m->private as a pointer-sized integer; every scheduling
+ * domain of that CPU is printed, each followed by a blank line.
+ * Always returns 0.
+ */
+static int sched_cpu_show(struct seq_file *m, void *unused)
+{
+	struct sched_domain *sd;
+	int cpu = (int) ((long) m->private);
+
+	/* Valid CPU numbers are 0 .. CONFIG_NR_CPUS - 1. */
+	if (cpu < 0 || cpu >= CONFIG_NR_CPUS) {
+		seq_printf(m, "invalid CPU, %d\n", cpu);
+		return 0;
+	}
+
+	/*
+	 * The domain tree can be rebuilt concurrently; hold the RCU read
+	 * lock for the duration of the walk.
+	 */
+	rcu_read_lock();
+	for_each_domain(cpu, sd) {
+		sched_cpu_domain_show(m, sd, cpu);
+		seq_puts(m, "\n");
+	}
+	rcu_read_unlock();
+
+	return 0;
+}
+
+/*
+ * open() handler for the per-cpu files: attach sched_cpu_show as the
+ * single-shot seq_file callback and hand it inode->i_private.
+ */
+static int sched_cpu_open(struct inode *inode, struct file *filp)
+{
+	void *cpu_cookie = inode->i_private;
+
+	return single_open(filp, sched_cpu_show, cpu_cookie);
+}
+/* File operations for the per-cpu entries: seq_file with a single show. */
+static const struct file_operations sched_cpu_fops = {
+	.open		= sched_cpu_open,
+	.release	= single_release,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+};
+static struct dentry *d_sched_debug; /* the "sched" debugfs directory */
+static struct dentry *d_sched_cpu[CONFIG_NR_CPUS]; /* per-cpu file dentries, indexed by CPU number */
+
+/*
+ * Create the read-only "cpu<N>" file for @cpu under the sched debugfs
+ * directory and remember its dentry in d_sched_cpu[].  The CPU number is
+ * passed to the file as its private data.  A creation failure is only
+ * logged; the function always returns 0.
+ */
+static int sched_debugfs_add_cpu(int cpu)
+{
+	char name[32];
+	struct dentry *entry;
+
+	snprintf(name, sizeof(name), "cpu%d", cpu);
+
+	entry = debugfs_create_file(name, 0444, d_sched_debug,
+				    (void *) (long) cpu, &sched_cpu_fops);
+	d_sched_cpu[cpu] = entry;
+	if (!entry)
+		pr_warn("Failed to create debugfs entry for cpu %d\n", cpu);
+
+	return 0;
+}
+
static __init int sched_init_debug(void)
{
- debugfs_create_file("sched_features", 0644, NULL, NULL,
+ int cpu;
+ int rc = 0;
+
+ d_sched_debug = debugfs_create_dir("sched", NULL); /* parent directory for every entry created below */
+ if (!d_sched_debug) {
+ pr_warn("Could not create debugfs 'sched' entry\n");
+ return 0; /* failure is only logged; nothing else is created */
+ }
+
+ debugfs_create_file("features", 0644, d_sched_debug, NULL, /* replaces the top-level "sched_features" file */
&sched_feat_fops);

- return 0;
+ for_each_online_cpu(cpu) { /* only CPUs online at this point get a file */
+ rc = sched_debugfs_add_cpu(cpu);
+ if (rc)
+ goto out;
+ }
+
+out:
+ return rc;
}
late_initcall(sched_init_debug);
#endif /* CONFIG_SCHED_DEBUG */
@@ -6689,7 +6850,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,

if (!cpumask_subset(sched_domain_span(child),
sched_domain_span(sd))) {
- pr_err("BUG: arch topology borken\n");
+ pr_err("BUG: arch topology broken\n");
#ifdef CONFIG_SCHED_DEBUG
pr_err(" the %s domain not a subset of the %s domain\n",
child->name, sd->name);
--
2.3.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/