[RFC/RFT PATCH 1/2] sched/core: Check and schedule ksoftirq

From: Srinivas Pandruvada
Date: Thu Dec 15 2022 - 13:43:36 EST


From: Frederic Weisbecker <frederic@xxxxxxxxxx>

It is possible that ksoftirqd is woken up and racing with forced
idle injection via play_idle_precise() called from a FIFO thread.

Here it is possible that need_resched() is false but ksoftirqd
is pending. This will result in printing of warning:

[147777.095484] NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #08!!!

Also the processing can be delayed upto the duration of forced idle.

For example in the following traces (with added traces for debug):

<idle>-0 [004] 207.087742: sched_wakeup: ksoftirqd/4
<idle>-0 [004] 207.087743: sched_switch: swapper/4:0 [120] R ==> idle_inject/4:37 [49]

idle_inject/4-37 [004] 207.087744: play_idle_enter
idle_inject/4-37 [004] 207.087746: bputs: can_stop_idle_tick.isra.16: soft irq pending

(Printed warning at this point
NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #08!!!

...
...
idle_inject/4-37 [004] 207.112127: play_idle_exit: state=24000000 cpu_id=4

idle_inject/4-37 [004] 207.112134: sched_switch: idle_inject/4:37 [49] S ==> ksoftirqd/4:39 [120]
ksoftirqd/4-39 [004] 207.112142: softirq_entry: vec=3 [action=NET_RX]
ksoftirqd/4-39 [004] 207.112213: irq_handler_entry: irq=142 name=enp0s

Delayed by : 207.112142 - 207.087742 = 0.024400
(250HZ configuration) and the idle duration for play_idle_precise is 24ms.
So, the softirq entered after the expiry of forced idle time.

The solution here is to give chance to run softirq. Currently do_idle()
checks for need_resched() and break. But here in addition to need_resched()
also added a check for task_is_running(__this_cpu_read(ksoftirqd)).
So if the ksoftirq is pending break the do_idle() loop and give 1 jiffie
to process via schedule_timeout(1). If the required idle time is
not expired, start hrtimer again and call do_idle() again.

Signed-off-by: Frederic Weisbecker <frederic@xxxxxxxxxx>
Signed-off-by: Srinivas Pandruvada <srinivas.pandruvada@xxxxxxxxxxxxxxx>
---
kernel/sched/idle.c | 55 ++++++++++++++++++++++++++++++---------------
1 file changed, 37 insertions(+), 18 deletions(-)

diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index f26ab2675f7d..77d6168288cf 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -49,6 +49,11 @@ static int __init cpu_idle_nopoll_setup(char *__unused)
__setup("hlt", cpu_idle_nopoll_setup);
#endif

+static bool __cpuidle idle_loop_should_break(void)
+{
+ return need_resched() || task_is_running(__this_cpu_read(ksoftirqd));
+}
+
static noinline int __cpuidle cpu_idle_poll(void)
{
trace_cpu_idle(0, smp_processor_id());
@@ -56,7 +61,7 @@ static noinline int __cpuidle cpu_idle_poll(void)
ct_idle_enter();
local_irq_enable();

- while (!tif_need_resched() &&
+ while (!idle_loop_should_break() &&
(cpu_idle_force_poll || tick_check_broadcast_expired()))
cpu_relax();

@@ -174,7 +179,7 @@ static void cpuidle_idle_call(void)
* Check if the idle task must be rescheduled. If it is the
* case, exit the function after re-enabling the local irq.
*/
- if (need_resched()) {
+ if (idle_loop_should_break()) {
local_irq_enable();
return;
}
@@ -276,7 +281,7 @@ static void do_idle(void)
__current_set_polling();
tick_nohz_idle_enter();

- while (!need_resched()) {
+ while (!idle_loop_should_break()) {
rmb();

local_irq_disable();
@@ -358,6 +363,7 @@ static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer)
void play_idle_precise(u64 duration_ns, u64 latency_ns)
{
struct idle_timer it;
+ ktime_t remaining;

/*
* Only FIFO tasks can disable the tick since they don't need the forced
@@ -370,25 +376,38 @@ void play_idle_precise(u64 duration_ns, u64 latency_ns)
WARN_ON_ONCE(!duration_ns);
WARN_ON_ONCE(current->mm);

- rcu_sleep_check();
- preempt_disable();
- current->flags |= PF_IDLE;
- cpuidle_use_deepest_state(latency_ns);
+ remaining = ns_to_ktime(duration_ns);

- it.done = 0;
- hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
- it.timer.function = idle_inject_timer_fn;
- hrtimer_start(&it.timer, ns_to_ktime(duration_ns),
- HRTIMER_MODE_REL_PINNED_HARD);
+ do {
+ rcu_sleep_check();
+ preempt_disable();
+ current->flags |= PF_IDLE;
+ cpuidle_use_deepest_state(latency_ns);

- while (!READ_ONCE(it.done))
- do_idle();
+ it.done = 0;
+ hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
+ it.timer.function = idle_inject_timer_fn;
+ hrtimer_start(&it.timer, remaining, HRTIMER_MODE_REL_PINNED_HARD);
+
+ while (!READ_ONCE(it.done) && !task_is_running(__this_cpu_read(ksoftirqd)))
+ do_idle();

- cpuidle_use_deepest_state(0);
- current->flags &= ~PF_IDLE;
+ remaining = hrtimer_get_remaining(&it.timer);

- preempt_fold_need_resched();
- preempt_enable();
+ hrtimer_cancel(&it.timer);
+
+ cpuidle_use_deepest_state(0);
+ current->flags &= ~PF_IDLE;
+
+ preempt_fold_need_resched();
+ preempt_enable();
+
+ /* Give ksoftirqd 1 jiffy to get a chance to start its job */
+ if (!READ_ONCE(it.done) && task_is_running(__this_cpu_read(ksoftirqd))) {
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(1);
+ }
+ } while (!READ_ONCE(it.done));
}
EXPORT_SYMBOL_GPL(play_idle_precise);

--
2.38.1