[PATCH v2 04/35] preempt: introduce CONFIG_PREEMPT_AUTO

From: Ankur Arora
Date: Mon May 27 2024 - 20:36:39 EST


PREEMPT_AUTO adds a new scheduling model which, like PREEMPT_DYNAMIC,
allows dynamic switching between the none/voluntary/full preemption
models. However, unlike PREEMPT_DYNAMIC, it doesn't use explicit
preemption points for the voluntary models.

It works by depending on CONFIG_PREEMPTION (and thus PREEMPT_COUNT),
allowing the scheduler to always know when it is safe to preempt
for all three preemption models.

It also uses an additional need-resched bit (TIF_NEED_RESCHED_LAZY)
which, together with TIF_NEED_RESCHED, allows the scheduler to express
two kinds of rescheduling intent: schedule at the earliest opportunity
(the usual TIF_NEED_RESCHED semantics), or express a need for
rescheduling while allowing the task on the runqueue to run to
timeslice completion (TIF_NEED_RESCHED_LAZY).

The scheduler chooses the specific need-resched flag based on
the preemption model:

                TIF_NEED_RESCHED        TIF_NEED_RESCHED_LAZY

  none          never                   always [*]
  voluntary     higher sched class      other tasks [*]
  full          always                  never

[*] when preempting idle, or for kernel tasks that are 'urgent' in
some way (e.g. resched_cpu() used as an RCU hammer), we use
TIF_NEED_RESCHED.

The other part is when preemption happens -- or, when are the
need-resched flags checked:

                      exit-to-user    ret-to-kernel    preempt_count()
  NEED_RESCHED_LAZY        Y               N                N
  NEED_RESCHED             Y               Y                Y

Exposed under CONFIG_EXPERT for now.

Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Jonathan Corbet <corbet@xxxxxxx>
Originally-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Link: https://lore.kernel.org/lkml/87jzshhexi.ffs@tglx/
Signed-off-by: Ankur Arora <ankur.a.arora@xxxxxxxxxx>
---
.../admin-guide/kernel-parameters.txt | 1 +
include/linux/thread_info.h | 12 ++++++
init/Makefile | 1 +
kernel/Kconfig.preempt | 37 +++++++++++++++++--
4 files changed, 48 insertions(+), 3 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 2d693300ab57..16a91090d167 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4719,6 +4719,7 @@

preempt= [KNL]
Select preemption mode if you have CONFIG_PREEMPT_DYNAMIC
+ or CONFIG_PREEMPT_AUTO.
none - Limited to cond_resched() calls
voluntary - Limited to cond_resched() and might_sleep() calls
full - Any section that isn't explicitly preempt disabled
diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h
index 9ea0b28068f4..06e13e7acbe2 100644
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -59,6 +59,18 @@ enum syscall_work_bit {

#include <asm/thread_info.h>

+/*
+ * Fall back to the default need-resched flag when an architecture does not
+ * define TIF_NEED_RESCHED_LAZY.
+ *
+ * Note: with !PREEMPT_AUTO, code should not be setting TIF_NEED_RESCHED_LAZY
+ * anywhere. Define this here because we will explicitly test for this bit.
+ */
+#ifndef TIF_NEED_RESCHED_LAZY
+#define TIF_NEED_RESCHED_LAZY TIF_NEED_RESCHED
+#define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED
+#endif
+
#ifdef __KERNEL__

#ifndef arch_set_restart_data
diff --git a/init/Makefile b/init/Makefile
index cbac576c57d6..da1dba3116dc 100644
--- a/init/Makefile
+++ b/init/Makefile
@@ -27,6 +27,7 @@ smp-flag-$(CONFIG_SMP) := SMP
preempt-flag-$(CONFIG_PREEMPT_BUILD) := PREEMPT
preempt-flag-$(CONFIG_PREEMPT_DYNAMIC) := PREEMPT_DYNAMIC
preempt-flag-$(CONFIG_PREEMPT_RT) := PREEMPT_RT
+preempt-flag-$(CONFIG_PREEMPT_AUTO) := PREEMPT_AUTO

build-version = $(or $(KBUILD_BUILD_VERSION), $(build-version-auto))
build-timestamp = $(or $(KBUILD_BUILD_TIMESTAMP), $(build-timestamp-auto))
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index c2f1fd95a821..fe83040ad755 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -11,13 +11,17 @@ config PREEMPT_BUILD
select PREEMPTION
select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK

+config HAVE_PREEMPT_AUTO
+ bool
+
choice
prompt "Preemption Model"
default PREEMPT_NONE

config PREEMPT_NONE
bool "No Forced Preemption (Server)"
- select PREEMPT_NONE_BUILD if !PREEMPT_DYNAMIC
+ select PREEMPT_NONE_BUILD if (!PREEMPT_DYNAMIC && !PREEMPT_AUTO)
+
help
This is the traditional Linux preemption model, geared towards
throughput. It will still provide good latencies most of the
@@ -32,7 +36,7 @@ config PREEMPT_NONE
config PREEMPT_VOLUNTARY
bool "Voluntary Kernel Preemption (Desktop)"
depends on !ARCH_NO_PREEMPT
- select PREEMPT_VOLUNTARY_BUILD if !PREEMPT_DYNAMIC
+ select PREEMPT_VOLUNTARY_BUILD if (!PREEMPT_DYNAMIC && !PREEMPT_AUTO)
help
This option reduces the latency of the kernel by adding more
"explicit preemption points" to the kernel code. These new
@@ -95,7 +99,7 @@ config PREEMPTION

config PREEMPT_DYNAMIC
bool "Preemption behaviour defined on boot"
- depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT
+ depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT && !PREEMPT_AUTO
select JUMP_LABEL if HAVE_PREEMPT_DYNAMIC_KEY
select PREEMPT_BUILD
default y if HAVE_PREEMPT_DYNAMIC_CALL
@@ -115,6 +119,33 @@ config PREEMPT_DYNAMIC
Interesting if you want the same pre-built kernel should be used for
both Server and Desktop workloads.

+config PREEMPT_AUTO
+ bool "Scheduler controlled preemption model"
+ depends on EXPERT && HAVE_PREEMPT_AUTO && !ARCH_NO_PREEMPT
+ select PREEMPT_BUILD
+ help
+ This option allows the preemption model to be selected on the
+ kernel command line, overriding the default preemption model
+ chosen at compile time.
+
+ However, note that the compile time choice of preemption model
+ might impact other kernel options like the specific RCU model.
+
+ This feature makes the latency of the kernel configurable by
+ allowing the scheduler to choose when to preempt based on
+ the preemption policy in effect. It does this without needing
+ voluntary preemption points.
+
+ With PREEMPT_NONE: the scheduler allows a task (executing in
+ user or kernel context) to run to completion, at least until
+ its current tick expires.
+
+ With PREEMPT_VOLUNTARY: similar to PREEMPT_NONE, but the scheduler
+ will also preempt for higher priority class of processes but not
+ lower.
+
+ With PREEMPT: the scheduler preempts at the earliest opportunity.
+
config SCHED_CORE
bool "Core Scheduling for SMT"
depends on SCHED_SMT
--
2.31.1