[RFC PATCH v7 20/23] sched/coresched: config option for kernel protection
From: Julien Desfossez
Date:  Fri Aug 28 2020 - 15:54:16 EST
From: Vineeth Pillai <viremana@xxxxxxxxxxxxxxxxxxx>
There are use cases where kernel protection is not needed. One example
is using core scheduling for non-security purposes, such as dynamically
isolating a core for a particular process. Another is testing or
benchmarking the overhead of kernel protection.
Add a compile time and boot time option to disable the feature.
CONFIG_SCHED_CORE_KERNEL_PROTECTION enables the feature at compile time
and defaults to y when CONFIG_SCHED_CORE=y. The
sched_core_kernel_protection= boot time option controls it at runtime;
a value of 0 disables the feature.
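
As an illustration (a hypothetical command line; the image path and root
device below are placeholders), booting with the parameter set to 0 turns
the protection off while core scheduling itself stays enabled:

  linux /boot/vmlinuz root=/dev/sda1 ro sched_core_kernel_protection=0
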
Signed-off-by: Vineeth Pillai <viremana@xxxxxxxxxxxxxxxxxxx>
---
 .../admin-guide/kernel-parameters.txt         |  9 +++++
 include/linux/sched.h                         |  2 +-
 kernel/Kconfig.preempt                        | 13 +++++++
 kernel/sched/core.c                           | 39 ++++++++++++++++++-
 kernel/sched/sched.h                          |  2 +
 5 files changed, 63 insertions(+), 2 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index a1068742a6df..01e442388e4a 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4611,6 +4611,15 @@
 
 	sbni=		[NET] Granch SBNI12 leased line adapter
 
+	sched_core_kernel_protection=
+			[SCHED_CORE, SCHED_CORE_KERNEL_PROTECTION] Pause SMT
+			siblings of a core running in user mode if at least
+			one sibling of the core is running in kernel mode.
+			This guarantees that kernel data is not leaked to
+			tasks which are not trusted by the kernel.
+			This feature is valid only when core scheduling is
+			enabled (CONFIG_SCHED_CORE).
+
 	sched_debug	[KNL] Enables verbose scheduler debug messages.
 
 	schedstats=	[KNL,X86] Enable or disable scheduled statistics.
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1e04ffe689cb..4d9ae6b4dcc9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2055,7 +2055,7 @@ int sched_trace_rq_nr_running(struct rq *rq);
 
 const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
 
-#ifdef CONFIG_SCHED_CORE
+#ifdef CONFIG_SCHED_CORE_KERNEL_PROTECTION
 void sched_core_unsafe_enter(void);
 void sched_core_unsafe_exit(void);
 void sched_core_unsafe_exit_wait(unsigned long ti_check);
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 4488fbf4d3a8..52f86739f910 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -86,3 +86,16 @@ config SCHED_CORE
 	default y
 	depends on SCHED_SMT
 
+config SCHED_CORE_KERNEL_PROTECTION
+	bool "Core scheduling kernel protection"
+	default y
+	depends on SCHED_CORE
+	help
+	  This option enables pausing all SMT siblings of a core running in
+	  user mode when at least one sibling of the core is in kernel mode.
+	  This enforces security such that kernel information is not leaked
+	  to untrusted tasks running on the siblings. This option is valid
+	  only if Core Scheduling (CONFIG_SCHED_CORE) is enabled.
+
+	  If in doubt, select 'Y' when CONFIG_SCHED_CORE=y.
+
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0dc9172be04d..34238fd67f31 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -75,6 +75,24 @@ __read_mostly int scheduler_running;
 
 #ifdef CONFIG_SCHED_CORE
 
+#ifdef CONFIG_SCHED_CORE_KERNEL_PROTECTION
+
+DEFINE_STATIC_KEY_TRUE(sched_core_kernel_protection);
+static int __init set_sched_core_kernel_protection(char *str)
+{
+	unsigned long val = 0;
+
+	if (!str)
+		return 0;
+
+	if (!kstrtoul(str, 0, &val) && !val)
+		static_branch_disable(&sched_core_kernel_protection);
+
+	return 1;
+}
+__setup("sched_core_kernel_protection=", set_sched_core_kernel_protection);
+#endif
+
 DEFINE_STATIC_KEY_FALSE(__sched_core_enabled);
 
 /* kernel prio, less is more */
@@ -4600,6 +4618,8 @@ static inline bool cookie_match(struct task_struct *a, struct task_struct *b)
 	return a->core_cookie == b->core_cookie;
 }
 
+#ifdef CONFIG_SCHED_CORE_KERNEL_PROTECTION
+
 /*
  * Handler to attempt to enter kernel. It does nothing because the exit to
  * usermode or guest mode will do the actual work (of waiting if needed).
@@ -4609,6 +4629,11 @@ static void sched_core_irq_work(struct irq_work *work)
 	return;
 }
 
+static inline void init_sched_core_irq_work(struct rq *rq)
+{
+	init_irq_work(&rq->core_irq_work, sched_core_irq_work);
+}
+
 /*
  * sched_core_wait_till_safe - Pause the caller's hyperthread until the core
  * exits the core-wide unsafe state. Obviously the CPU calling this function
@@ -4684,6 +4709,9 @@ void sched_core_unsafe_enter(void)
 	struct rq *rq;
 	int i, cpu;
 
+	if (!static_branch_likely(&sched_core_kernel_protection))
+		return;
+
 	/* Ensure that on return to user/guest, we check whether to wait. */
 	if (current->core_cookie)
 		set_tsk_thread_flag(current, TIF_UNSAFE_RET);
@@ -4769,6 +4797,9 @@ void sched_core_unsafe_exit(void)
 	struct rq *rq;
 	int cpu;
 
+	if (!static_branch_likely(&sched_core_kernel_protection))
+		return;
+
 	local_irq_save(flags);
 	cpu = smp_processor_id();
 	rq = cpu_rq(cpu);
@@ -4807,9 +4838,15 @@ void sched_core_unsafe_exit(void)
 
 void sched_core_unsafe_exit_wait(unsigned long ti_check)
 {
+	if (!static_branch_likely(&sched_core_kernel_protection))
+		return;
+
 	sched_core_unsafe_exit();
 	sched_core_wait_till_safe(ti_check);
 }
+#else
+static inline void init_sched_core_irq_work(struct rq *rq) {}
+#endif /* CONFIG_SCHED_CORE_KERNEL_PROTECTION */
 
 // XXX fairness/fwd progress conditions
 /*
@@ -7795,7 +7832,7 @@ int sched_cpu_starting(unsigned int cpu)
 			rq = cpu_rq(i);
 			if (rq->core && rq->core == rq)
 				core_rq = rq;
-			init_irq_work(&rq->core_irq_work, sched_core_irq_work);
+			init_sched_core_irq_work(rq);
 		}
 
 		if (!core_rq)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index dbd8416ddaba..676818bdb9df 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1058,8 +1058,10 @@ struct rq {
 	unsigned int		core_sched_seq;
 	struct rb_root		core_tree;
 	unsigned char		core_forceidle;
+#ifdef CONFIG_SCHED_CORE_KERNEL_PROTECTION
 	struct irq_work		core_irq_work; /* To force HT into kernel */
 	unsigned int		core_this_unsafe_nest;
+#endif
 
 	/* shared state */
 	unsigned int		core_task_seq;
-- 
2.17.1