[PATCH 6/7] sched/rt: make it configurable

From: Nicolas Pitre
Date: Mon May 29 2017 - 17:19:26 EST


On most small systems where user space is tightly controlled, the realtime
scheduling class can often be dispensed with to reduce the kernel footprint.
Let's make it configurable.

Signed-off-by: Nicolas Pitre <nico@xxxxxxxxxx>
---
include/linux/init_task.h      | 15 +++++++++++----
include/linux/sched.h          |  2 ++
include/linux/sched/rt.h       |  4 ++--
init/Kconfig                   | 14 ++++++++++++--
kernel/sched/Makefile          |  4 ++--
kernel/sched/core.c            | 42 +++++++++++++++++++++++++++++++++++++++---
kernel/sched/debug.c           |  2 ++
kernel/sched/sched.h           |  7 +++++--
kernel/sched/stop_task.c       |  4 +++-
kernel/sysctl.c                |  4 +++-
kernel/time/posix-cpu-timers.c |  6 +++++-
11 files changed, 86 insertions(+), 18 deletions(-)

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index e049526bc1..6befc0aa61 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -225,6 +225,16 @@ extern struct cred init_cred;
#define INIT_TASK_SECURITY
#endif

+#ifdef CONFIG_SCHED_RT
+#define INIT_TASK_RT(tsk) \
+ .rt = { \
+ .run_list = LIST_HEAD_INIT(tsk.rt.run_list), \
+ .time_slice = RR_TIMESLICE, \
+ },
+#else
+#define INIT_TASK_RT(tsk)
+#endif
+
/*
* INIT_TASK is used to set up the first task table, touch at
* your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -250,10 +260,7 @@ extern struct cred init_cred;
.se = { \
.group_node = LIST_HEAD_INIT(tsk.se.group_node), \
}, \
- .rt = { \
- .run_list = LIST_HEAD_INIT(tsk.rt.run_list), \
- .time_slice = RR_TIMESLICE, \
- }, \
+ INIT_TASK_RT(tsk) \
.tasks = LIST_HEAD_INIT(tsk.tasks), \
INIT_PUSHABLE_TASKS(tsk) \
INIT_CGROUP_SCHED(tsk) \
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ba0c203669..71a43480ed 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -518,7 +518,9 @@ struct task_struct {

const struct sched_class *sched_class;
struct sched_entity se;
+#ifdef CONFIG_SCHED_RT
struct sched_rt_entity rt;
+#endif
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;
#endif
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index f93329aba3..f2d636582d 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -7,7 +7,7 @@ struct task_struct;

static inline int rt_prio(int prio)
{
- if (unlikely(prio < MAX_RT_PRIO))
+ if (IS_ENABLED(CONFIG_SCHED_RT) && unlikely(prio < MAX_RT_PRIO))
return 1;
return 0;
}
@@ -17,7 +17,7 @@ static inline int rt_task(struct task_struct *p)
return rt_prio(p->prio);
}

-#ifdef CONFIG_RT_MUTEXES
+#if defined(CONFIG_RT_MUTEXES) && defined(CONFIG_SCHED_RT)
/*
* Must hold either p->pi_lock or task_rq(p)->lock.
*/
diff --git a/init/Kconfig b/init/Kconfig
index f73e3f0940..3bcd49f576 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -687,7 +687,7 @@ config TREE_RCU_TRACE

config RCU_BOOST
bool "Enable RCU priority boosting"
- depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
+ depends on SCHED_RT && RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT
default n
help
This option boosts the priority of preempted RCU readers that
@@ -1090,7 +1090,7 @@ config CFS_BANDWIDTH

config RT_GROUP_SCHED
bool "Group scheduling for SCHED_RR/FIFO"
- depends on CGROUP_SCHED
+ depends on CGROUP_SCHED && SCHED_RT
default n
help
This feature lets you explicitly allocate real CPU bandwidth
@@ -1303,8 +1303,17 @@ config SCHED_AUTOGROUP
desktop applications. Task group autogeneration is currently based
upon task session.

+config SCHED_RT
+ bool "Real Time Task Scheduling" if EXPERT
+ default y
+ help
+ This adds the sched_rt scheduling class to the kernel, providing
+ support for the SCHED_FIFO and SCHED_RR policies. You might want
+ to disable this to reduce the kernel size. If unsure, say Y.
+
config SCHED_DL
bool "Deadline Task Scheduling" if EXPERT
+ depends on SCHED_RT
default y
help
This adds the sched_dl scheduling class to the kernel providing
@@ -1632,6 +1641,7 @@ config BASE_FULL
config FUTEX
bool "Enable futex support" if EXPERT
default y
+ depends on SCHED_RT
select RT_MUTEXES
help
Disabling this option will cause the kernel to be built without
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 3bd6a7c1cc..bccbef85e5 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -16,8 +16,8 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
endif

obj-y += core.o loadavg.o clock.o cputime.o
-obj-y += wait.o swait.o completion.o idle.o
-obj-y += idle_task.o fair.o rt.o
+obj-y += wait.o swait.o completion.o idle.o idle_task.o fair.o
+obj-$(CONFIG_SCHED_RT) += rt.o
obj-$(CONFIG_SCHED_DL) += deadline.o $(if $(CONFIG_SMP),cpudeadline.o)
obj-$(CONFIG_SMP) += cpupri.o topology.o stop_task.o
obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a7b004e440..3dd6fce750 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -640,6 +640,7 @@ bool sched_can_stop_tick(struct rq *rq)
return false;
#endif

+#ifdef CONFIG_SCHED_RT
/*
* If there are more than one RR tasks, we need the tick to effect the
* actual RR behaviour.
@@ -658,6 +659,7 @@ bool sched_can_stop_tick(struct rq *rq)
fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
if (fifo_nr_running)
return true;
+#endif

/*
* If there are no DL,RR/FIFO tasks, there must only be CFS tasks left;
@@ -1586,7 +1588,7 @@ void sched_set_stop_task(int cpu, struct task_struct *stop)
* Reset it back to a normal scheduling class so that
* it can die in pieces.
*/
- old_stop->sched_class = &rt_sched_class;
+ old_stop->sched_class = stop_sched_class.next;
}
}

@@ -2182,11 +2184,13 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
__dl_clear_params(p);
#endif

+#ifdef CONFIG_SCHED_RT
INIT_LIST_HEAD(&p->rt.run_list);
p->rt.timeout = 0;
p->rt.time_slice = sched_rr_timeslice;
p->rt.on_rq = 0;
p->rt.on_list = 0;
+#endif

#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -3716,13 +3720,18 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
p->sched_class = &dl_sched_class;
} else
#endif
+#ifdef CONFIG_SCHED_RT
if (rt_prio(prio)) {
if (oldprio < prio)
queue_flag |= ENQUEUE_HEAD;
p->sched_class = &rt_sched_class;
- } else {
+ } else
+#endif
+ {
+#ifdef CONFIG_SCHED_RT
if (rt_prio(oldprio))
p->rt.timeout = 0;
+#endif
p->sched_class = &fair_sched_class;
}

@@ -3997,6 +4006,23 @@ static int __sched_setscheduler(struct task_struct *p,

/* May grab non-irq protected spin_locks: */
BUG_ON(in_interrupt());
+
+ /*
+ * When the RT scheduling class is disabled, let's make sure kernel threads
+ * wanting RT still get the lowest nice value to give them the highest
+ * available priority rather than simply returning an error. Obviously we
+ * can't test rt_policy() here as it is always false in that case.
+ */
+ if (!IS_ENABLED(CONFIG_SCHED_RT) && !user &&
+ (policy == SCHED_FIFO || policy == SCHED_RR)) {
+ static const struct sched_attr k_attr = {
+ .sched_policy = SCHED_NORMAL,
+ .sched_nice = MIN_NICE,
+ };
+ attr = &k_attr;
+ policy = SCHED_NORMAL;
+ }
+
recheck:
/* Double check policy once rq lock held: */
if (policy < 0) {
@@ -5726,7 +5752,9 @@ void __init sched_init_smp(void)
sched_init_granularity();
free_cpumask_var(non_isolated_cpus);

+#ifdef CONFIG_SCHED_RT
init_sched_rt_class();
+#endif
#ifdef CONFIG_SCHED_DL
init_sched_dl_class();
#endif
@@ -5832,7 +5860,9 @@ void __init sched_init(void)
}
#endif /* CONFIG_CPUMASK_OFFSTACK */

+#ifdef CONFIG_SCHED_RT
init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
+#endif
#ifdef CONFIG_SCHED_DL
init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime());
#endif
@@ -5864,7 +5894,10 @@ void __init sched_init(void)
rq->calc_load_active = 0;
rq->calc_load_update = jiffies + LOAD_FREQ;
init_cfs_rq(&rq->cfs);
+#ifdef CONFIG_SCHED_RT
init_rt_rq(&rq->rt);
+ rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
+#endif
#ifdef CONFIG_SCHED_DL
init_dl_rq(&rq->dl);
#endif
@@ -5895,7 +5928,6 @@ void __init sched_init(void)
init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
#endif /* CONFIG_FAIR_GROUP_SCHED */

- rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
#ifdef CONFIG_RT_GROUP_SCHED
init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
#endif
@@ -6132,7 +6164,9 @@ static DEFINE_SPINLOCK(task_group_lock);
static void sched_free_group(struct task_group *tg)
{
free_fair_sched_group(tg);
+#ifdef CONFIG_SCHED_RT
free_rt_sched_group(tg);
+#endif
autogroup_free(tg);
kmem_cache_free(task_group_cache, tg);
}
@@ -6149,8 +6183,10 @@ struct task_group *sched_create_group(struct task_group *parent)
if (!alloc_fair_sched_group(tg, parent))
goto err;

+#ifdef CONFIG_SCHED_RT
if (!alloc_rt_sched_group(tg, parent))
goto err;
+#endif

return tg;

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 84f80a81ab..c550723ce9 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -645,7 +645,9 @@ do { \

spin_lock_irqsave(&sched_debug_lock, flags);
print_cfs_stats(m, cpu);
+#ifdef CONFIG_SCHED_RT
print_rt_stats(m, cpu);
+#endif
#ifdef CONFIG_SCHED_DL
print_dl_stats(m, cpu);
#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 41dc10b707..38439eefd3 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -132,7 +132,8 @@ static inline int fair_policy(int policy)

static inline int rt_policy(int policy)
{
- return policy == SCHED_FIFO || policy == SCHED_RR;
+ return IS_ENABLED(CONFIG_SCHED_RT) &&
+ (policy == SCHED_FIFO || policy == SCHED_RR);
}

static inline int dl_policy(int policy)
@@ -1447,8 +1448,10 @@ static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
#define sched_class_highest (&stop_sched_class)
#elif defined(CONFIG_SCHED_DL)
#define sched_class_highest (&dl_sched_class)
-#else
+#elif defined(CONFIG_SCHED_RT)
#define sched_class_highest (&rt_sched_class)
+#else
+#define sched_class_highest (&fair_sched_class)
#endif

#define for_each_class(class) \
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 5632dc3e63..7cad8c1540 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -112,8 +112,10 @@ static void update_curr_stop(struct rq *rq)
const struct sched_class stop_sched_class = {
#ifdef CONFIG_SCHED_DL
.next = &dl_sched_class,
-#else
+#elif defined(CONFIG_SCHED_RT)
.next = &rt_sched_class,
+#else
+ .next = &fair_sched_class,
#endif

.enqueue_task = enqueue_task_stop,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4dfba1a76c..1c670f4053 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -401,6 +401,7 @@ static struct ctl_table kern_table[] = {
},
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_SCHED_DEBUG */
+#ifdef CONFIG_SCHED_RT
{
.procname = "sched_rt_period_us",
.data = &sysctl_sched_rt_period,
@@ -422,6 +423,7 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = sched_rr_handler,
},
+#endif
#ifdef CONFIG_SCHED_AUTOGROUP
{
.procname = "sched_autogroup_enabled",
@@ -1071,7 +1073,7 @@ static struct ctl_table kern_table[] = {
.extra1 = &neg_one,
},
#endif
-#ifdef CONFIG_RT_MUTEXES
+#if defined(CONFIG_RT_MUTEXES) && defined(CONFIG_SCHED_RT)
{
.procname = "max_lock_depth",
.data = &max_lock_depth,
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index d2a1e6dd02..010efb0e91 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -790,10 +790,12 @@ static void check_thread_timers(struct task_struct *tsk,
struct list_head *firing)
{
struct list_head *timers = tsk->cpu_timers;
- struct signal_struct *const sig = tsk->signal;
struct task_cputime *tsk_expires = &tsk->cputime_expires;
u64 expires;
+#ifdef CONFIG_SCHED_RT
+ struct signal_struct *const sig = tsk->signal;
unsigned long soft;
+#endif

/*
* If cputime_expires is zero, then there are no active
@@ -811,6 +813,7 @@ static void check_thread_timers(struct task_struct *tsk,
tsk_expires->sched_exp = check_timers_list(++timers, firing,
tsk->se.sum_exec_runtime);

+#ifdef CONFIG_SCHED_RT
/*
* Check for the special case thread timers.
*/
@@ -847,6 +850,7 @@ static void check_thread_timers(struct task_struct *tsk,
__group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
}
}
+#endif
if (task_cputime_zero(tsk_expires))
tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER);
}
--
2.9.4