Re: sched: Move SCHED_DEBUG sysctl to debugfs

From: Peter Zijlstra
Date: Wed Apr 28 2021 - 05:28:17 EST


On Wed, Apr 28, 2021 at 10:54:37AM +0200, Christian Borntraeger wrote:
>
>
> On 28.04.21 10:46, Peter Zijlstra wrote:
> > On Tue, Apr 27, 2021 at 04:59:25PM +0200, Christian Borntraeger wrote:
> > > Peter,
> > >
> > > I just realized that we moved the sysctl tunables away to debugfs in next.
> > > We have seen several cases where it was beneficial to set
> > > sched_migration_cost_ns to a lower value. For example with KVM I can
> > > easily get 50% more transactions with 50000 instead of 500000.
> > > Until now it was possible to use tuned or /etc/sysctl.conf to set
> > > these things permanently.
> > >
> > > Given that some people do not want to have debugfs mounted all the time
> > > I would consider this a regression. The sysctl tunable was always
> > > available.
> > >
> > > I am ok with the "informational" things being in debugfs, but not
> > > the tunables. So how do we proceed here?
> >
> > It's all SCHED_DEBUG; IOW you're relying on DEBUG infrastructure for
> > production performance, and that's your fail.
>
> No it's not. sched_migration_cost_ns was NEVER protected by CONFIG_SCHED_DEBUG.
> It was available on all kernels with CONFIG_SMP.

The relevant section from origin/master:kernel/sysctl.c:

#ifdef CONFIG_SCHED_DEBUG
	{
		.procname = "sched_min_granularity_ns",
		.data = &sysctl_sched_min_granularity,
		.maxlen = sizeof(unsigned int),
		.mode = 0644,
		.proc_handler = sched_proc_update_handler,
		.extra1 = &min_sched_granularity_ns,
		.extra2 = &max_sched_granularity_ns,
	},
	{
		.procname = "sched_latency_ns",
		.data = &sysctl_sched_latency,
		.maxlen = sizeof(unsigned int),
		.mode = 0644,
		.proc_handler = sched_proc_update_handler,
		.extra1 = &min_sched_granularity_ns,
		.extra2 = &max_sched_granularity_ns,
	},
	{
		.procname = "sched_wakeup_granularity_ns",
		.data = &sysctl_sched_wakeup_granularity,
		.maxlen = sizeof(unsigned int),
		.mode = 0644,
		.proc_handler = sched_proc_update_handler,
		.extra1 = &min_wakeup_granularity_ns,
		.extra2 = &max_wakeup_granularity_ns,
	},
#ifdef CONFIG_SMP
	{
		.procname = "sched_tunable_scaling",
		.data = &sysctl_sched_tunable_scaling,
		.maxlen = sizeof(enum sched_tunable_scaling),
		.mode = 0644,
		.proc_handler = sched_proc_update_handler,
		.extra1 = &min_sched_tunable_scaling,
		.extra2 = &max_sched_tunable_scaling,
	},
	{
		.procname = "sched_migration_cost_ns",
		.data = &sysctl_sched_migration_cost,
		.maxlen = sizeof(unsigned int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "sched_nr_migrate",
		.data = &sysctl_sched_nr_migrate,
		.maxlen = sizeof(unsigned int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
#ifdef CONFIG_SCHEDSTATS
	{
		.procname = "sched_schedstats",
		.data = NULL,
		.maxlen = sizeof(unsigned int),
		.mode = 0644,
		.proc_handler = sysctl_schedstats,
		.extra1 = SYSCTL_ZERO,
		.extra2 = SYSCTL_ONE,
	},
#endif /* CONFIG_SCHEDSTATS */
#endif /* CONFIG_SMP */
#ifdef CONFIG_NUMA_BALANCING
	{
		.procname = "numa_balancing_scan_delay_ms",
		.data = &sysctl_numa_balancing_scan_delay,
		.maxlen = sizeof(unsigned int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "numa_balancing_scan_period_min_ms",
		.data = &sysctl_numa_balancing_scan_period_min,
		.maxlen = sizeof(unsigned int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "numa_balancing_scan_period_max_ms",
		.data = &sysctl_numa_balancing_scan_period_max,
		.maxlen = sizeof(unsigned int),
		.mode = 0644,
		.proc_handler = proc_dointvec,
	},
	{
		.procname = "numa_balancing_scan_size_mb",
		.data = &sysctl_numa_balancing_scan_size,
		.maxlen = sizeof(unsigned int),
		.mode = 0644,
		.proc_handler = proc_dointvec_minmax,
		.extra1 = SYSCTL_ONE,
	},
	{
		.procname = "numa_balancing",
		.data = NULL, /* filled in by handler */
		.maxlen = sizeof(unsigned int),
		.mode = 0644,
		.proc_handler = sysctl_numa_balancing,
		.extra1 = SYSCTL_ZERO,
		.extra2 = SYSCTL_ONE,
	},
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_SCHED_DEBUG */

How is migration_cost not under SCHED_DEBUG? The bigger problem is that
world+dog has SCHED_DEBUG=y in their .config.
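For completeness, a minimal sketch of what the move means for userspace tooling:
try the old sysctl file first, then fall back to the debugfs location. It assumes
debugfs is mounted at /sys/kernel/debug and that the new file ends up as
/sys/kernel/debug/sched/migration_cost_ns; both the path and the helper below are
illustrative, not taken from the patch itself.

/*
 * Illustrative only: set sched_migration_cost_ns from userspace, trying
 * the old sysctl file first and falling back to the new debugfs location.
 * Assumes debugfs is mounted at /sys/kernel/debug and that the new file
 * is named migration_cost_ns; adjust if the patch names it differently.
 */
#include <stdio.h>
#include <stdlib.h>

static int write_ns(const char *path, long val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%ld\n", val);
	return fclose(f);
}

int main(int argc, char **argv)
{
	/* e.g. 50000 instead of the default 500000, as in the KVM report */
	long val = argc > 1 ? atol(argv[1]) : 50000;

	if (!write_ns("/proc/sys/kernel/sched_migration_cost_ns", val))
		return 0;	/* old sysctl interface still present */

	if (!write_ns("/sys/kernel/debug/sched/migration_cost_ns", val))
		return 0;	/* new debugfs interface */

	perror("sched_migration_cost_ns");
	return 1;
}

With tuned or /etc/sysctl.conf the old interface amounted to a single
"kernel.sched_migration_cost_ns = 50000" line; the debugfs variant needs
something like the above (or an equivalent boot-time script) instead.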