Re: [PATCH v2 7/9] sched: define TIF_ALLOW_RESCHED

From: Thomas Gleixner
Date: Wed Sep 20 2023 - 19:58:39 EST


On Tue, Sep 19 2023 at 14:30, Thomas Gleixner wrote:
> On Mon, Sep 18 2023 at 18:57, Linus Torvalds wrote:
>> Anyway, I'm definitely not opposed. We'd get rid of a config option
>> that is presumably not very widely used, and we'd simplify a lot of
>> issues, and get rid of all these badly defined "cond_preempt()"
>> things.
>
> Hmm. Didn't I promise a year ago that I won't do further large scale
> cleanups and simplifications beyond printk.
>
> Maybe I get away this time with just suggesting it. :)

Maybe not. As I'm inveterately curious, I sat down and figured out what
that might look like.

To some extent I really curse my curiosity, as the maze of macros,
config options and convoluted mess behind all these preemption
mechanisms is beyond disgusting.

Find below a PoC which implements that scheme. It's not even close to
correct, but it builds, boots and survives lightweight testing.

I did not even try to look into time-slice enforcement, but I really want
to share this for illustration and for others to experiment with.

This keeps all the existing mechanisms in place and introduces a new
config knob in the preemption model Kconfig switch: PREEMPT_AUTO

If selected, it builds a CONFIG_PREEMPT kernel which disables the
cond_resched() machinery and switches the fair scheduler class to use
the TIF_NEED_RESCHED_LAZY bit by default, i.e. it should be pretty close
to the preempt NONE model, except that cond_resched() is a NOOP and I did
not validate the time-slice enforcement. The latter should be a
no-brainer to figure out and fix if required.
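
As a side note, PREEMPT_AUTO needs no extra code to neuter cond_resched():
once PREEMPTION is enabled and PREEMPT_DYNAMIC is out of the picture, the
existing machinery already reduces it to the might-sleep debug check.
Roughly (simplified sketch, not the literal upstream code):

/* include/linux/sched.h -- simplified sketch of the existing mechanics */
#if !defined(CONFIG_PREEMPTION) || defined(CONFIG_PREEMPT_DYNAMIC)
extern int __cond_resched(void);
static inline int _cond_resched(void) { return __cond_resched(); }
#else
/* Fully preemptible (or PREEMPT_AUTO) build: nothing left to do here */
static inline int _cond_resched(void) { return 0; }
#endif

#define cond_resched() ({				\
	__might_resched(__FILE__, __LINE__, 0);		\
	_cond_resched();				\
})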

To switch this at run time to the FULL preemption model, which always
uses TIF_NEED_RESCHED, you need to enable CONFIG_SCHED_DEBUG; then
you can enable "FULL" via:

echo FORCE_NEED_RESCHED >/sys/kernel/debug/sched/features

and switch back to some sort of "NONE" via:

echo NO_FORCE_NEED_RESCHED >/sys/kernel/debug/sched/features
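
Behind that debugfs knob is nothing more than the regular scheduler
feature machinery: each SCHED_FEAT() entry in kernel/sched/features.h
becomes a bit which sched_feat() tests (or a static branch when jump
labels are available), and resched_curr_lazy() below simply consults it.
Simplified sketch of the existing plumbing in kernel/sched/sched.h:

#define SCHED_FEAT(name, enabled)	__SCHED_FEAT_##name,
enum {
#include "features.h"
	__SCHED_FEAT_NR,
};
#undef SCHED_FEAT

/* Without CONFIG_JUMP_LABEL this is a plain bit test: */
#define sched_feat(x)	(sysctl_sched_features & (1UL << __SCHED_FEAT_##x))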

It seems to work as expected for a simple hackbench -l 10000 run:

                  NO_FORCE_NEED_RESCHED    FORCE_NEED_RESCHED
schedule() [1]           3646163                 2701641
preemption                 12554                  927856
total                    3658717                 3629497

[1] is voluntary schedule() _and_ schedule() from return to user space. I
did not get around to accounting for them separately yet, but for a quick
check this clearly shows that it "works" as advertised.
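
To split those counts up, a quick hack would be a few ad hoc per-CPU
counters: preemptions can be keyed off the sched_mode argument of
__schedule(), and the exit-to-user share off its call site. The counter
names below are made up and purely illustrative, not part of the PoC:

/* Purely illustrative instrumentation, not part of the patch below */
DEFINE_PER_CPU(unsigned long, resched_cnt_preempt);
DEFINE_PER_CPU(unsigned long, resched_cnt_other);
DEFINE_PER_CPU(unsigned long, resched_cnt_user_exit);

/* in kernel/sched/core.c:__schedule() */
	if (sched_mode == SM_PREEMPT)
		__this_cpu_inc(resched_cnt_preempt);
	else
		__this_cpu_inc(resched_cnt_other);

/* in kernel/entry/common.c:exit_to_user_mode_loop(), right before schedule() */
	__this_cpu_inc(resched_cnt_user_exit);

Voluntary schedule() is then resched_cnt_other minus resched_cnt_user_exit.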

Of course this needs way more analysis than this quick PoC+check, but
you get the idea.

Contrary to other hot-off-the-press hacks, I'm pretty sure it won't
destroy your hard disk, but I don't recommend that you deploy it on your
alarm clock as it might make you miss the bus.

If this concept holds, which I'm pretty convinced of by now, then this
is an opportunity to trade ~3000 lines of unholy hacks for about 100-200
lines of understandable code :)

Thanks,

tglx
---
arch/x86/Kconfig                   |    1
arch/x86/include/asm/thread_info.h |    2 +
drivers/acpi/processor_idle.c      |    2 -
include/linux/entry-common.h       |    2 -
include/linux/entry-kvm.h          |    2 -
include/linux/sched.h              |   18 +++++++++++-----
include/linux/sched/idle.h         |    8 +++----
include/linux/thread_info.h        |   19 +++++++++++++++++
kernel/Kconfig.preempt             |   12 +++++++++-
kernel/entry/common.c              |    2 -
kernel/sched/core.c                |   41 ++++++++++++++++++++++++-------------
kernel/sched/fair.c                |   10 ++++-----
kernel/sched/features.h            |    2 +
kernel/sched/idle.c                |    3 --
kernel/sched/sched.h               |    1
kernel/trace/trace.c               |    2 -
16 files changed, 91 insertions(+), 36 deletions(-)

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -898,14 +898,14 @@ static inline void hrtick_rq_init(struct

#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
/*
- * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
+ * Atomically set TIF_NEED_RESCHED[_LAZY] and test for TIF_POLLING_NRFLAG,
* this avoids any races wrt polling state changes and thereby avoids
* spurious IPIs.
*/
-static inline bool set_nr_and_not_polling(struct task_struct *p)
+static inline bool set_nr_and_not_polling(struct task_struct *p, int nr_bit)
{
struct thread_info *ti = task_thread_info(p);
- return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
+ return !(fetch_or(&ti->flags, 1 << nr_bit) & _TIF_POLLING_NRFLAG);
}

/*
@@ -931,9 +931,9 @@ static bool set_nr_if_polling(struct tas
}

#else
-static inline bool set_nr_and_not_polling(struct task_struct *p)
+static inline bool set_nr_and_not_polling(struct task_struct *p, int nr_bit)
{
- set_tsk_need_resched(p);
+ set_tsk_thread_flag(p, nr_bit);
return true;
}

@@ -1038,28 +1038,42 @@ void wake_up_q(struct wake_q_head *head)
* might also involve a cross-CPU call to trigger the scheduler on
* the target CPU.
*/
-void resched_curr(struct rq *rq)
+static void __resched_curr(struct rq *rq, int nr_bit)
{
struct task_struct *curr = rq->curr;
int cpu;

lockdep_assert_rq_held(rq);

- if (test_tsk_need_resched(curr))
+ if (test_tsk_need_resched_type(curr, nr_bit))
return;

cpu = cpu_of(rq);

if (cpu == smp_processor_id()) {
- set_tsk_need_resched(curr);
- set_preempt_need_resched();
+ set_tsk_thread_flag(curr, nr_bit);
+ if (nr_bit == TIF_NEED_RESCHED)
+ set_preempt_need_resched();
return;
}

- if (set_nr_and_not_polling(curr))
- smp_send_reschedule(cpu);
- else
+ if (set_nr_and_not_polling(curr, nr_bit)) {
+ if (nr_bit == TIF_NEED_RESCHED)
+ smp_send_reschedule(cpu);
+ } else {
trace_sched_wake_idle_without_ipi(cpu);
+ }
+}
+
+void resched_curr(struct rq *rq)
+{
+ __resched_curr(rq, TIF_NEED_RESCHED);
+}
+
+void resched_curr_lazy(struct rq *rq)
+{
+ __resched_curr(rq, sched_feat(FORCE_NEED_RESCHED) ?
+ TIF_NEED_RESCHED : TIF_NEED_RESCHED_LAZY);
}

void resched_cpu(int cpu)
@@ -1132,7 +1146,7 @@ static void wake_up_idle_cpu(int cpu)
if (cpu == smp_processor_id())
return;

- if (set_nr_and_not_polling(rq->idle))
+ if (set_nr_and_not_polling(rq->idle, TIF_NEED_RESCHED))
smp_send_reschedule(cpu);
else
trace_sched_wake_idle_without_ipi(cpu);
@@ -8872,7 +8886,6 @@ static void __init preempt_dynamic_init(
WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \
return preempt_dynamic_mode == preempt_dynamic_##mode; \
} \
- EXPORT_SYMBOL_GPL(preempt_model_##mode)

PREEMPT_MODEL_ACCESSOR(none);
PREEMPT_MODEL_ACCESSOR(voluntary);
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -59,6 +59,11 @@ enum syscall_work_bit {

#include <asm/thread_info.h>

+#ifndef CONFIG_PREEMPT_AUTO
+# define TIF_NEED_RESCHED_LAZY TIF_NEED_RESCHED
+# define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED
+#endif
+
#ifdef __KERNEL__

#ifndef arch_set_restart_data
@@ -185,6 +190,13 @@ static __always_inline bool tif_need_res
(unsigned long *)(&current_thread_info()->flags));
}

+static __always_inline bool tif_need_resched_lazy(void)
+{
+ return IS_ENABLED(CONFIG_PREEMPT_AUTO) &&
+ arch_test_bit(TIF_NEED_RESCHED_LAZY,
+ (unsigned long *)(&current_thread_info()->flags));
+}
+
#else

static __always_inline bool tif_need_resched(void)
@@ -193,6 +205,13 @@ static __always_inline bool tif_need_res
(unsigned long *)(&current_thread_info()->flags));
}

+static __always_inline bool tif_need_resched_lazy(void)
+{
+ return IS_ENABLED(CONFIG_PREEMPT_AUTO) &&
+ test_bit(TIF_NEED_RESCHED_LAZY,
+ (unsigned long *)(&current_thread_info()->flags));
+}
+
#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */

#ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -11,6 +11,9 @@ config PREEMPT_BUILD
select PREEMPTION
select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK

+config HAVE_PREEMPT_AUTO
+ bool
+
choice
prompt "Preemption Model"
default PREEMPT_NONE
@@ -67,6 +70,13 @@ config PREEMPT
embedded system with latency requirements in the milliseconds
range.

+config PREEMPT_AUTO
+ bool "Automagic preemption mode with runtime tweaking support"
+ depends on HAVE_PREEMPT_AUTO
+ select PREEMPT_BUILD
+ help
+ Add some sensible blurb here
+
config PREEMPT_RT
bool "Fully Preemptible Kernel (Real-Time)"
depends on EXPERT && ARCH_SUPPORTS_RT
@@ -95,7 +105,7 @@ config PREEMPTION

config PREEMPT_DYNAMIC
bool "Preemption behaviour defined on boot"
- depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT
+ depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT && !PREEMPT_AUTO
select JUMP_LABEL if HAVE_PREEMPT_DYNAMIC_KEY
select PREEMPT_BUILD
default y if HAVE_PREEMPT_DYNAMIC_CALL
--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -60,7 +60,7 @@
#define EXIT_TO_USER_MODE_WORK \
(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
_TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \
- ARCH_EXIT_TO_USER_MODE_WORK)
+ _TIF_NEED_RESCHED_LAZY | ARCH_EXIT_TO_USER_MODE_WORK)

/**
* arch_enter_from_user_mode - Architecture specific sanity check for user mode regs
--- a/include/linux/entry-kvm.h
+++ b/include/linux/entry-kvm.h
@@ -18,7 +18,7 @@

#define XFER_TO_GUEST_MODE_WORK \
(_TIF_NEED_RESCHED | _TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL | \
- _TIF_NOTIFY_RESUME | ARCH_XFER_TO_GUEST_MODE_WORK)
+ _TIF_NOTIFY_RESUME | _TIF_NEED_RESCHED_LAZY | ARCH_XFER_TO_GUEST_MODE_WORK)

struct kvm_vcpu;

--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -155,7 +155,7 @@ static unsigned long exit_to_user_mode_l

local_irq_enable_exit_to_user(ti_work);

- if (ti_work & _TIF_NEED_RESCHED)
+ if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
schedule();

if (ti_work & _TIF_UPROBE)
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -89,3 +89,5 @@ SCHED_FEAT(UTIL_EST_FASTUP, true)
SCHED_FEAT(LATENCY_WARN, false)

SCHED_FEAT(HZ_BW, true)
+
+SCHED_FEAT(FORCE_NEED_RESCHED, false)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2435,6 +2435,7 @@ extern void init_sched_fair_class(void);
extern void reweight_task(struct task_struct *p, int prio);

extern void resched_curr(struct rq *rq);
+extern void resched_curr_lazy(struct rq *rq);
extern void resched_cpu(int cpu);

extern struct rt_bandwidth def_rt_bandwidth;
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2046,17 +2046,17 @@ static inline void update_tsk_thread_fla
update_ti_thread_flag(task_thread_info(tsk), flag, value);
}

-static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag)
+static inline bool test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag)
{
return test_and_set_ti_thread_flag(task_thread_info(tsk), flag);
}

-static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag)
+static inline bool test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag)
{
return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag);
}

-static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
+static inline bool test_tsk_thread_flag(struct task_struct *tsk, int flag)
{
return test_ti_thread_flag(task_thread_info(tsk), flag);
}
@@ -2069,13 +2069,21 @@ static inline void set_tsk_need_resched(
static inline void clear_tsk_need_resched(struct task_struct *tsk)
{
clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
+ if (IS_ENABLED(CONFIG_PREEMPT_AUTO))
+ clear_tsk_thread_flag(tsk, TIF_NEED_RESCHED_LAZY);
}

-static inline int test_tsk_need_resched(struct task_struct *tsk)
+static inline bool test_tsk_need_resched(struct task_struct *tsk)
{
return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
}

+static inline bool test_tsk_need_resched_type(struct task_struct *tsk,
+ int nr_bit)
+{
+ return unlikely(test_tsk_thread_flag(tsk, nr_bit));
+}
+
/*
* cond_resched() and cond_resched_lock(): latency reduction via
* explicit rescheduling in places that are safe. The return
@@ -2252,7 +2260,7 @@ static inline int rwlock_needbreak(rwloc

static __always_inline bool need_resched(void)
{
- return unlikely(tif_need_resched());
+ return unlikely(tif_need_resched_lazy() || tif_need_resched());
}

/*
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -985,7 +985,7 @@ static void update_deadline(struct cfs_r
* The task has consumed its request, reschedule.
*/
if (cfs_rq->nr_running > 1) {
- resched_curr(rq_of(cfs_rq));
+ resched_curr_lazy(rq_of(cfs_rq));
clear_buddies(cfs_rq, se);
}
}
@@ -5267,7 +5267,7 @@ entity_tick(struct cfs_rq *cfs_rq, struc
* validating it and just reschedule.
*/
if (queued) {
- resched_curr(rq_of(cfs_rq));
+ resched_curr_lazy(rq_of(cfs_rq));
return;
}
/*
@@ -5413,7 +5413,7 @@ static void __account_cfs_rq_runtime(str
* hierarchy can be throttled
*/
if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
- resched_curr(rq_of(cfs_rq));
+ resched_curr_lazy(rq_of(cfs_rq));
}

static __always_inline
@@ -5673,7 +5673,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cf

/* Determine whether we need to wake up potentially idle CPU: */
if (rq->curr == rq->idle && rq->cfs.nr_running)
- resched_curr(rq);
+ resched_curr_lazy(rq);
}

#ifdef CONFIG_SMP
@@ -8073,7 +8073,7 @@ static void check_preempt_wakeup(struct
return;

preempt:
- resched_curr(rq);
+ resched_curr_lazy(rq);
}

#ifdef CONFIG_SMP
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -108,7 +108,7 @@ static const struct dmi_system_id proces
*/
static void __cpuidle acpi_safe_halt(void)
{
- if (!tif_need_resched()) {
+ if (!need_resched()) {
raw_safe_halt();
raw_local_irq_disable();
}
--- a/include/linux/sched/idle.h
+++ b/include/linux/sched/idle.h
@@ -63,7 +63,7 @@ static __always_inline bool __must_check
*/
smp_mb__after_atomic();

- return unlikely(tif_need_resched());
+ return unlikely(need_resched());
}

static __always_inline bool __must_check current_clr_polling_and_test(void)
@@ -76,7 +76,7 @@ static __always_inline bool __must_check
*/
smp_mb__after_atomic();

- return unlikely(tif_need_resched());
+ return unlikely(need_resched());
}

#else
@@ -85,11 +85,11 @@ static inline void __current_clr_polling

static inline bool __must_check current_set_polling_and_test(void)
{
- return unlikely(tif_need_resched());
+ return unlikely(need_resched());
}
static inline bool __must_check current_clr_polling_and_test(void)
{
- return unlikely(tif_need_resched());
+ return unlikely(need_resched());
}
#endif

--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -57,8 +57,7 @@ static noinline int __cpuidle cpu_idle_p
ct_cpuidle_enter();

raw_local_irq_enable();
- while (!tif_need_resched() &&
- (cpu_idle_force_poll || tick_check_broadcast_expired()))
+ while (!need_resched() && (cpu_idle_force_poll || tick_check_broadcast_expired()))
cpu_relax();
raw_local_irq_disable();

--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2720,7 +2720,7 @@ unsigned int tracing_gen_ctx_irq_test(un
if (softirq_count() >> (SOFTIRQ_SHIFT + 1))
trace_flags |= TRACE_FLAG_BH_OFF;

- if (tif_need_resched())
+ if (need_resched())
trace_flags |= TRACE_FLAG_NEED_RESCHED;
if (test_preempt_need_resched())
trace_flags |= TRACE_FLAG_PREEMPT_RESCHED;
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -271,6 +271,7 @@ config X86
select HAVE_STATIC_CALL
select HAVE_STATIC_CALL_INLINE if HAVE_OBJTOOL
select HAVE_PREEMPT_DYNAMIC_CALL
+ select HAVE_PREEMPT_AUTO
select HAVE_RSEQ
select HAVE_RUST if X86_64
select HAVE_SYSCALL_TRACEPOINTS
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -83,6 +83,7 @@ struct thread_info {
#define TIF_NEED_RESCHED 3 /* rescheduling necessary */
#define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/
#define TIF_SSBD 5 /* Speculative store bypass disable */
+#define TIF_NEED_RESCHED_LAZY 6 /* Lazy rescheduling */
#define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */
#define TIF_SPEC_L1D_FLUSH 10 /* Flush L1D on mm switches (processes) */
#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
@@ -106,6 +107,7 @@ struct thread_info {
#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP)
#define _TIF_SSBD (1 << TIF_SSBD)
+#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY)
#define _TIF_SPEC_IB (1 << TIF_SPEC_IB)
#define _TIF_SPEC_L1D_FLUSH (1 << TIF_SPEC_L1D_FLUSH)
#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)