[PATCH 1/5] sched: Add TIF_NEED_RESCHED_LAZY infrastructure

From: Peter Zijlstra
Date: Mon Oct 07 2024 - 03:56:30 EST


Add the basic infrastructure to split the TIF_NEED_RESCHED bit in two.
Either bit will cause a resched on return-to-user, but only
TIF_NEED_RESCHED will drive IRQ preemption.

No behavioural change intended.

Suggested-by: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Signed-off-by: Peter Zijlstra (Intel) <peterz@xxxxxxxxxxxxx>
---
include/linux/entry-common.h | 3 ++-
include/linux/entry-kvm.h | 5 +++--
include/linux/sched.h | 3 ++-
include/linux/thread_info.h | 21 +++++++++++++++++----
kernel/entry/common.c | 2 +-
kernel/entry/kvm.c | 4 ++--
kernel/sched/core.c | 34 +++++++++++++++++++++-------------
7 files changed, 48 insertions(+), 24 deletions(-)

--- a/include/linux/entry-common.h
+++ b/include/linux/entry-common.h
@@ -64,7 +64,8 @@

#define EXIT_TO_USER_MODE_WORK \
(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \
- _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \
+ _TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \
+ _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \
ARCH_EXIT_TO_USER_MODE_WORK)

/**
--- a/include/linux/entry-kvm.h
+++ b/include/linux/entry-kvm.h
@@ -17,8 +17,9 @@
#endif

#define XFER_TO_GUEST_MODE_WORK \
- (_TIF_NEED_RESCHED | _TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL | \
- _TIF_NOTIFY_RESUME | ARCH_XFER_TO_GUEST_MODE_WORK)
+ (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | _TIF_SIGPENDING | \
+ _TIF_NOTIFY_SIGNAL | _TIF_NOTIFY_RESUME | \
+ ARCH_XFER_TO_GUEST_MODE_WORK)

struct kvm_vcpu;

--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2002,7 +2002,8 @@ static inline void set_tsk_need_resched(

static inline void clear_tsk_need_resched(struct task_struct *tsk)
{
- clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
+ atomic_long_andnot(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY,
+ (atomic_long_t *)&task_thread_info(tsk)->flags);
}

static inline int test_tsk_need_resched(struct task_struct *tsk)
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -59,6 +59,14 @@ enum syscall_work_bit {

#include <asm/thread_info.h>

+#ifndef TIF_NEED_RESCHED_LAZY
+#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY
+#error Inconsistent PREEMPT_LAZY
+#endif
+#define TIF_NEED_RESCHED_LAZY TIF_NEED_RESCHED
+#define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED
+#endif
+
#ifdef __KERNEL__

#ifndef arch_set_restart_data
@@ -179,22 +187,27 @@ static __always_inline unsigned long rea

#ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H

-static __always_inline bool tif_need_resched(void)
+static __always_inline bool tif_test_bit(int bit)
{
- return arch_test_bit(TIF_NEED_RESCHED,
+ return arch_test_bit(bit,
(unsigned long *)(&current_thread_info()->flags));
}

#else

-static __always_inline bool tif_need_resched(void)
+static __always_inline bool tif_test_bit(int bit)
{
- return test_bit(TIF_NEED_RESCHED,
+ return test_bit(bit,
(unsigned long *)(&current_thread_info()->flags));
}

#endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */

+static __always_inline bool tif_need_resched(void)
+{
+ return tif_test_bit(TIF_NEED_RESCHED);
+}
+
#ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES
static inline int arch_within_stack_frames(const void * const stack,
const void * const stackend,
--- a/kernel/entry/common.c
+++ b/kernel/entry/common.c
@@ -98,7 +98,7 @@ __always_inline unsigned long exit_to_us

local_irq_enable_exit_to_user(ti_work);

- if (ti_work & _TIF_NEED_RESCHED)
+ if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
schedule();

if (ti_work & _TIF_UPROBE)
--- a/kernel/entry/kvm.c
+++ b/kernel/entry/kvm.c
@@ -13,7 +13,7 @@ static int xfer_to_guest_mode_work(struc
return -EINTR;
}

- if (ti_work & _TIF_NEED_RESCHED)
+ if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY))
schedule();

if (ti_work & _TIF_NOTIFY_RESUME)
@@ -24,7 +24,7 @@ static int xfer_to_guest_mode_work(struc
return ret;

ti_work = read_thread_flags();
- } while (ti_work & XFER_TO_GUEST_MODE_WORK || need_resched());
+ } while (ti_work & XFER_TO_GUEST_MODE_WORK);
return 0;
}

--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -936,10 +936,9 @@ static inline void hrtick_rq_init(struct
* this avoids any races wrt polling state changes and thereby avoids
* spurious IPIs.
*/
-static inline bool set_nr_and_not_polling(struct task_struct *p)
+static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif)
{
- struct thread_info *ti = task_thread_info(p);
- return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
+ return !(fetch_or(&ti->flags, 1 << tif) & _TIF_POLLING_NRFLAG);
}

/*
@@ -964,9 +963,9 @@ static bool set_nr_if_polling(struct tas
}

#else
-static inline bool set_nr_and_not_polling(struct task_struct *p)
+static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif)
{
- set_tsk_need_resched(p);
+ atomic_long_or(1 << tif, (atomic_long_t *)&ti->flags);
return true;
}

@@ -1071,28 +1070,37 @@ void wake_up_q(struct wake_q_head *head)
* might also involve a cross-CPU call to trigger the scheduler on
* the target CPU.
*/
-void resched_curr(struct rq *rq)
+static void __resched_curr(struct rq *rq, int tif)
{
struct task_struct *curr = rq->curr;
+ struct thread_info *cti = task_thread_info(curr);
int cpu;

lockdep_assert_rq_held(rq);

- if (test_tsk_need_resched(curr))
+ if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED))
return;

cpu = cpu_of(rq);

if (cpu == smp_processor_id()) {
- set_tsk_need_resched(curr);
- set_preempt_need_resched();
+ set_ti_thread_flag(cti, tif);
+ if (tif == TIF_NEED_RESCHED)
+ set_preempt_need_resched();
return;
}

- if (set_nr_and_not_polling(curr))
- smp_send_reschedule(cpu);
- else
+ if (set_nr_and_not_polling(cti, tif)) {
+ if (tif == TIF_NEED_RESCHED)
+ smp_send_reschedule(cpu);
+ } else {
trace_sched_wake_idle_without_ipi(cpu);
+ }
+}
+
+void resched_curr(struct rq *rq)
+{
+ __resched_curr(rq, TIF_NEED_RESCHED);
}

void resched_cpu(int cpu)
@@ -1187,7 +1195,7 @@ static void wake_up_idle_cpu(int cpu)
* and testing of the above solutions didn't appear to report
* much benefits.
*/
- if (set_nr_and_not_polling(rq->idle))
+ if (set_nr_and_not_polling(task_thread_info(rq->idle), TIF_NEED_RESCHED))
smp_send_reschedule(cpu);
else
trace_sched_wake_idle_without_ipi(cpu);