Re: [RFC PATCH 3/3] softirq: Avoid waking up ksoftirqd from flush_smp_call_function_queue()

From: K Prateek Nayak
Date: Tue Jul 23 2024 - 00:50:52 EST


Hello Peter,

On 7/10/2024 11:50 PM, K Prateek Nayak wrote:
Hello Peter,

Thank you for the feedback.

On 7/10/2024 8:35 PM, Peter Zijlstra wrote:
On Wed, Jul 10, 2024 at 09:02:10AM +0000, K Prateek Nayak wrote:

[..snip..]

On first reading I wonder why you've not re-used and hooked into the
PREEMPT_RT variant of should_wake_ksoftirqd(). That already has a per
CPU variable to do exactly this.

With this RFC, I intended to check whether everyone was on board with the
idea and the use case. One caveat with re-using the existing
"softirq_ctrl.cnt" hook that PREEMPT_RT uses is that we'll need to
expose the functions that increment and decrement it so they can be used
from kernel/smp.c. I'll make those changes in v2 and we can see if there
are sufficient WARN_ON()s to catch any incorrect usage in the
!PREEMPT_RT case.


Reusing the PREEMPT_RT bits, I was able to come up with the approach
below. The following are a few nuances of this approach:

- Although I don't believe "set_do_softirq_pending()" can nest, since it
is used only from flush_smp_call_function_queue(), which is called with
IRQs disabled, I'm still using an inc/dec pair because I'm not sure
whether it can nest inside a local_bh_{disable,enable}() section, or
whether those can be called from an SMP call function (see the small
sketch after this list).

- I used the same modified version of ipistorm to test the changes on
top of v6.10-rc6-rt11 with LOCKDEP enabled and did not see any splats
during the benchmark run. If there are better tests that stress this
part on RT kernels, I'm all ears.

- I've abandoned any micro-optimizations since I see different numbers
on different machines; I'm sticking with the simplest approach, which
works and is an improvement over the baseline.
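
On the first point, here is a minimal sketch (not part of the diff) of
why I kept an inc/dec pair rather than a plain flag, in case the
indicator ever does nest:

        set_do_softirq_pending();  /* cnt 0 -> 1: raise_softirq() skips
                                    * the ksoftirqd wakeup */
        ...
        set_do_softirq_pending();  /* hypothetical nested use: 1 -> 2 */
        ...
        clr_do_softirq_pending();  /* 2 -> 1: wakeups still suppressed */
        ...
        clr_do_softirq_pending();  /* 1 -> 0: wakeups allowed again */

A plain boolean would be cleared too early by the inner
clr_do_softirq_pending().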

I'll leave the diff below:

diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h
index 21ac44428bb0..83f70626ff1e 100644
--- a/kernel/sched/smp.h
+++ b/kernel/sched/smp.h
@@ -9,7 +9,16 @@ extern void sched_ttwu_pending(void *arg);
extern bool call_function_single_prep_ipi(int cpu);
#ifdef CONFIG_SMP
+/*
+ * Used to indicate a pending call to do_softirq() from
+ * flush_smp_call_function_queue()
+ */
+extern void set_do_softirq_pending(void);
+extern void clr_do_softirq_pending(void);
+
extern void flush_smp_call_function_queue(void);
#else
+static inline void set_do_softirq_pending(void) { }
+static inline void clr_do_softirq_pending(void) { }
static inline void flush_smp_call_function_queue(void) { }
#endif
diff --git a/kernel/smp.c b/kernel/smp.c
index f085ebcdf9e7..c191bad912f6 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -583,7 +583,9 @@ void flush_smp_call_function_queue(void)
local_irq_save(flags);
/* Get the already pending soft interrupts for RT enabled kernels */
was_pending = local_softirq_pending();
+ set_do_softirq_pending();
__flush_smp_call_function_queue(true);
+ clr_do_softirq_pending();
if (local_softirq_pending())
do_softirq_post_smp_call_flush(was_pending);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 00e32e279fa9..8308687fc7b9 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -88,22 +88,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled);
EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context);
#endif
-/*
- * SOFTIRQ_OFFSET usage:
- *
- * On !RT kernels 'count' is the preempt counter, on RT kernels this applies
- * to a per CPU counter and to task::softirqs_disabled_cnt.
- *
- * - count is changed by SOFTIRQ_OFFSET on entering or leaving softirq
- * processing.
- *
- * - count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
- * on local_bh_disable or local_bh_enable.
- *
- * This lets us distinguish between whether we are currently processing
- * softirq and whether we just have bh disabled.
- */
-#ifdef CONFIG_PREEMPT_RT
+#define DO_SOFTIRQ_PENDING_MASK GENMASK(SOFTIRQ_SHIFT - 1, 0)
/*
* RT accounts for BH disabled sections in task::softirqs_disabled_cnt and
@@ -116,16 +101,56 @@ EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context);
*
* The per CPU counter prevents pointless wakeups of ksoftirqd in case that
* the task which is in a softirq disabled section is preempted or blocks.
+ *
+ * The bottom bits of softirq_ctrl::cnt are used to indicate an impending call
+ * to do_softirq() to prevent pointless wakeups of ksoftirqd, since the CPU
+ * promises to handle softirqs soon.
*/
struct softirq_ctrl {
+#ifdef CONFIG_PREEMPT_RT
local_lock_t lock;
+#endif
int cnt;
};
static DEFINE_PER_CPU(struct softirq_ctrl, softirq_ctrl) = {
+#ifdef CONFIG_PREEMPT_RT
.lock = INIT_LOCAL_LOCK(softirq_ctrl.lock),
+#endif
};
+inline void set_do_softirq_pending(void)
+{
+ __this_cpu_inc(softirq_ctrl.cnt);
+}
+
+inline void clr_do_softirq_pending(void)
+{
+ __this_cpu_dec(softirq_ctrl.cnt);
+}
+
+static inline bool should_wake_ksoftirqd(void)
+{
+ return !this_cpu_read(softirq_ctrl.cnt);
+}
+
+/*
+ * SOFTIRQ_OFFSET usage:
+ *
+ * On !RT kernels 'count' is the preempt counter, on RT kernels this applies
+ * to a per CPU counter and to task::softirqs_disabled_cnt.
+ *
+ * - count is changed by SOFTIRQ_OFFSET on entering or leaving softirq
+ * processing.
+ *
+ * - count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
+ * on local_bh_disable or local_bh_enable.
+ *
+ * This lets us distinguish between whether we are currently processing
+ * softirq and whether we just have bh disabled.
+ */
+#ifdef CONFIG_PREEMPT_RT
+
/**
* local_bh_blocked() - Check for idle whether BH processing is blocked
*
@@ -138,7 +163,7 @@ static DEFINE_PER_CPU(struct softirq_ctrl, softirq_ctrl) = {
*/
bool local_bh_blocked(void)
{
- return __this_cpu_read(softirq_ctrl.cnt) != 0;
+ return (__this_cpu_read(softirq_ctrl.cnt) & SOFTIRQ_MASK) != 0;
}
void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
@@ -155,7 +180,8 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
/* Required to meet the RCU bottomhalf requirements. */
rcu_read_lock();
} else {
- DEBUG_LOCKS_WARN_ON(this_cpu_read(softirq_ctrl.cnt));
+ DEBUG_LOCKS_WARN_ON(this_cpu_read(softirq_ctrl.cnt) &
+ SOFTIRQ_MASK);
}
}
@@ -163,7 +189,7 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
* Track the per CPU softirq disabled state. On RT this is per CPU
* state to allow preemption of bottom half disabled sections.
*/
- newcnt = __this_cpu_add_return(softirq_ctrl.cnt, cnt);
+ newcnt = __this_cpu_add_return(softirq_ctrl.cnt, cnt) & SOFTIRQ_MASK;
/*
* Reflect the result in the task state to prevent recursion on the
* local lock and to make softirq_count() & al work.
@@ -184,7 +210,7 @@ static void __local_bh_enable(unsigned int cnt, bool unlock)
int newcnt;
DEBUG_LOCKS_WARN_ON(current->softirq_disable_cnt !=
- this_cpu_read(softirq_ctrl.cnt));
+ (this_cpu_read(softirq_ctrl.cnt) & SOFTIRQ_MASK));
if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && softirq_count() == cnt) {
raw_local_irq_save(flags);
@@ -192,7 +218,7 @@ static void __local_bh_enable(unsigned int cnt, bool unlock)
raw_local_irq_restore(flags);
}
- newcnt = __this_cpu_sub_return(softirq_ctrl.cnt, cnt);
+ newcnt = __this_cpu_sub_return(softirq_ctrl.cnt, cnt) & SOFTIRQ_MASK;
current->softirq_disable_cnt = newcnt;
if (!newcnt && unlock) {
@@ -212,7 +238,7 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt)
lockdep_assert_irqs_enabled();
local_irq_save(flags);
- curcnt = __this_cpu_read(softirq_ctrl.cnt);
+ curcnt = __this_cpu_read(softirq_ctrl.cnt) & SOFTIRQ_MASK;
/*
* If this is not reenabling soft interrupts, no point in trying to
@@ -253,7 +279,7 @@ void softirq_preempt(void)
if (WARN_ON_ONCE(!preemptible()))
return;
- if (WARN_ON_ONCE(__this_cpu_read(softirq_ctrl.cnt) != SOFTIRQ_OFFSET))
+ if (WARN_ON_ONCE((__this_cpu_read(softirq_ctrl.cnt) & SOFTIRQ_MASK) != SOFTIRQ_OFFSET))
return;
__local_bh_enable(SOFTIRQ_OFFSET, true);
@@ -282,11 +308,6 @@ static inline void ksoftirqd_run_end(void)
static inline void softirq_handle_begin(void) { }
static inline void softirq_handle_end(void) { }
-static inline bool should_wake_ksoftirqd(void)
-{
- return !this_cpu_read(softirq_ctrl.cnt);
-}
-
static inline void invoke_softirq(void)
{
if (should_wake_ksoftirqd())
@@ -424,11 +445,6 @@ static inline void ksoftirqd_run_end(void)
local_irq_enable();
}
-static inline bool should_wake_ksoftirqd(void)
-{
- return true;
-}
-
static inline void invoke_softirq(void)
{
if (!force_irqthreads() || !__this_cpu_read(ksoftirqd)) {
--

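For reference, here is my mental model of how softirq_ctrl.cnt ends up
laid out after this change (a sketch, not authoritative; the constants
are the generic ones from include/linux/preempt.h):

        /*
         * softirq_ctrl.cnt:
         *
         *  bits [SOFTIRQ_SHIFT - 1 .. 0] (DO_SOFTIRQ_PENDING_MASK):
         *      incremented/decremented by {set,clr}_do_softirq_pending()
         *      around __flush_smp_call_function_queue().
         *
         *  bits covered by SOFTIRQ_MASK:
         *      the RT bottom half disable count, changed in
         *      SOFTIRQ_OFFSET / SOFTIRQ_DISABLE_OFFSET steps by
         *      __local_bh_{disable,enable}_ip().
         */

should_wake_ksoftirqd() now looks at the whole word, so either a BH
disabled section (on RT) or a pending flush suppresses the ksoftirqd
wakeup from raise_softirq_irqoff(). The "& SOFTIRQ_MASK" masking, in
turn, keeps local_bh_blocked(), the lockdep assertions and
current->softirq_disable_cnt looking only at the BH disable count; for
example, on RT, if a flushed callback were ever to call
local_bh_disable() (I'm not sure any do today), it would reach the
non-preemptible branch of __local_bh_disable_ip() with the pending bit
set, and without the masking the DEBUG_LOCKS_WARN_ON() there would fire.
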
Some of the (softirq_ctrl.cnt & SOFTIRQ_MASK) masking elsewhere might be
unnecessary, but I'm not familiar enough with all these bits to make a
sound call. Any and all comments are appreciated :)

--
Thanks and Regards,
Prateek