[RFC PATCH 4/5] softirq: Replace ksoftirqd with workqueues entirely

From: Frederic Weisbecker
Date: Mon Jan 15 2018 - 23:41:21 EST


Ksoftirqd only remains to implement threaded IRQs. Convert it to
existing per-vector workqueues to avoid code duplication.

Suggested-by: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Suggested-by: Paolo Abeni <pabeni@xxxxxxxxxx>
Signed-off-by: Frederic Weisbecker <frederic@xxxxxxxxxx>
Cc: Dmitry Safonov <dima@xxxxxxxxxx>
Cc: Eric Dumazet <edumazet@xxxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: David Miller <davem@xxxxxxxxxxxxx>
Cc: Hannes Frederic Sowa <hannes@xxxxxxxxxxxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Levin Alexander <alexander.levin@xxxxxxxxxxx>
Cc: Paolo Abeni <pabeni@xxxxxxxxxx>
Cc: Paul E. McKenney <paulmck@xxxxxxxxxxxxxxxxxx>
Cc: Radu Rendec <rrendec@xxxxxxxxxx>
Cc: Rik van Riel <riel@xxxxxxxxxx>
Cc: Stanislaw Gruszka <sgruszka@xxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Wanpeng Li <wanpeng.li@xxxxxxxxxxx>
Cc: Mauro Carvalho Chehab <mchehab@xxxxxxxxxxxxxxxx>
---
Documentation/RCU/stallwarn.txt | 4 +-
include/linux/interrupt.h | 7 ----
kernel/sched/cputime.c | 13 +++---
kernel/sched/sched.h | 4 +-
kernel/softirq.c | 87 +++++++++--------------------------------
net/ipv4/tcp_output.c | 4 +-
6 files changed, 31 insertions(+), 88 deletions(-)

diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt
index a08f928..ea3a8de 100644
--- a/Documentation/RCU/stallwarn.txt
+++ b/Documentation/RCU/stallwarn.txt
@@ -17,8 +17,8 @@ o A CPU looping in an RCU read-side critical section.
o A CPU looping with interrupts disabled.

o A CPU looping with preemption disabled. This condition can
- result in RCU-sched stalls and, if ksoftirqd is in use, RCU-bh
- stalls.
+ result in RCU-sched stalls and, if softirq workqueue is in use,
+ RCU-bh stalls.

o A CPU looping with bottom halves disabled. This condition can
result in RCU-sched and RCU-bh stalls.
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 92d044d..680f620 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -507,13 +507,6 @@ extern void __raise_softirq_irqoff(unsigned int nr);
extern void raise_softirq_irqoff(unsigned int nr);
extern void raise_softirq(unsigned int nr);

-DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
-
-static inline struct task_struct *this_cpu_ksoftirqd(void)
-{
- return this_cpu_read(ksoftirqd);
-}
-
extern int softirq_serving_workqueue(void);

/* Tasklets --- multithreaded analogue of BHs.
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 30f70e5..c5b8dbd 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -64,15 +64,14 @@ void irqtime_account_irq(struct task_struct *curr)
irqtime->irq_start_time += delta;

/*
- * We do not account for softirq time from ksoftirqd here.
- * We want to continue accounting softirq time to ksoftirqd thread
+ * We do not account for softirq time from workqueue here.
+ * We want to continue accounting softirq time to workqueue thread
* in that case, so as not to confuse scheduler with a special task
* that do not consume any time, but still wants to run.
*/
if (hardirq_count())
irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
- else if (in_serving_softirq() && curr != this_cpu_ksoftirqd() &&
- !softirq_serving_workqueue())
+ else if (in_serving_softirq() && !softirq_serving_workqueue())
irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);
@@ -376,11 +375,11 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,

cputime -= other;

- if (this_cpu_ksoftirqd() == p || softirq_serving_workqueue()) {
+ if (softirq_serving_workqueue()) {
/*
- * ksoftirqd time do not get accounted in cpu_softirq_time.
+ * Softirq wq time do not get accounted in cpu_softirq_time.
* So, we have to handle it separately here.
- * Also, p->stime needs to be updated for ksoftirqd.
+ * Also, p->stime needs to be updated for workqueue.
*/
account_system_index_time(p, cputime, CPUTIME_SOFTIRQ);
} else if (user_tick) {
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b19552a2..5d481f1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2061,8 +2061,8 @@ struct irqtime {
DECLARE_PER_CPU(struct irqtime, cpu_irqtime);

/*
- * Returns the irqtime minus the softirq time computed by ksoftirqd.
- * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime
+ * Returns the irqtime minus the softirq time computed by workqueue.
+ * Otherwise workqueue's sum_exec_runtime is substracted its own runtime
* and never move forward.
*/
static inline u64 irq_time_read(int cpu)
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 441e654..b2a5384 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -55,8 +55,6 @@ EXPORT_SYMBOL(irq_stat);

static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;

-DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
-
const char * const softirq_to_name[NR_SOFTIRQS] = {
"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL",
"TASKLET", "SCHED", "HRTIMER", "RCU"
@@ -78,32 +76,6 @@ struct softirq {
static DEFINE_PER_CPU(struct softirq, softirq_cpu);

/*
- * we cannot loop indefinitely here to avoid userspace starvation,
- * but we also don't want to introduce a worst case 1/HZ latency
- * to the pending events, so lets the scheduler to balance
- * the softirq load for us.
- */
-static void wakeup_softirqd(void)
-{
- /* Interrupts are disabled: no need to stop preemption */
- struct task_struct *tsk = __this_cpu_read(ksoftirqd);
-
- if (tsk && tsk->state != TASK_RUNNING)
- wake_up_process(tsk);
-}
-
-/*
- * If ksoftirqd is scheduled, we do not want to process pending softirqs
- * right now. Let ksoftirqd handle this at its own rate, to get fairness.
- */
-static bool ksoftirqd_running(void)
-{
- struct task_struct *tsk = __this_cpu_read(ksoftirqd);
-
- return tsk && (tsk->state == TASK_RUNNING);
-}
-
-/*
* preempt_count and SOFTIRQ_OFFSET usage:
* - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
* softirq processing.
@@ -408,7 +380,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)

asmlinkage __visible void do_softirq(void)
{
- __u32 pending;
+ __u32 pending, pending_work;
unsigned long flags;

if (in_interrupt())
@@ -417,8 +389,9 @@ asmlinkage __visible void do_softirq(void)
local_irq_save(flags);

pending = local_softirq_pending();
+ pending_work = __this_cpu_read(softirq_cpu.pending_work_mask);

- if (pending && !ksoftirqd_running())
+ if (pending & ~pending_work)
do_softirq_own_stack();

local_irq_restore(flags);
@@ -432,7 +405,7 @@ void irq_enter(void)
rcu_irq_enter();
if (is_idle_task(current) && !in_interrupt()) {
/*
- * Prevent raise_softirq from needlessly waking up ksoftirqd
+ * Prevent raise_softirq from needlessly waking up workqueue
* here, as softirq will be serviced on return from interrupt.
*/
local_bh_disable();
@@ -445,7 +418,15 @@ void irq_enter(void)

static inline void invoke_softirq(void)
{
- if (ksoftirqd_running())
+ unsigned int pending_work, pending = local_softirq_pending();
+
+ if (!pending)
+ return;
+
+ pending_work = __this_cpu_read(softirq_cpu.pending_work_mask);
+ pending &= ~pending_work;
+
+ if (!pending)
return;

if (!force_irqthreads) {
@@ -465,7 +446,7 @@ static inline void invoke_softirq(void)
do_softirq_own_stack();
#endif
} else {
- wakeup_softirqd();
+ do_softirq_workqueue(pending);
}
}

@@ -494,7 +475,7 @@ void irq_exit(void)
#endif
account_irq_exit_time(current);
preempt_count_sub(HARDIRQ_OFFSET);
- if (!in_interrupt() && local_softirq_pending())
+ if (!in_interrupt())
invoke_softirq();

tick_irq_exit();
@@ -515,11 +496,11 @@ inline void raise_softirq_irqoff(unsigned int nr)
* actually run the softirq once we return from
* the irq or softirq.
*
- * Otherwise we wake up ksoftirqd to make sure we
+ * Otherwise we wake up workqueue to make sure we
* schedule the softirq soon.
*/
if (!in_interrupt())
- wakeup_softirqd();
+ do_softirq_workqueue(BIT(nr));
}

void raise_softirq(unsigned int nr)
@@ -758,27 +739,6 @@ void __init softirq_init(void)
open_softirq(HI_SOFTIRQ, tasklet_hi_action);
}

-static int ksoftirqd_should_run(unsigned int cpu)
-{
- return local_softirq_pending();
-}
-
-static void run_ksoftirqd(unsigned int cpu)
-{
- local_irq_disable();
- if (local_softirq_pending()) {
- /*
- * We can safely run softirq on inline stack, as we are not deep
- * in the task stack here.
- */
- __do_softirq();
- local_irq_enable();
- cond_resched_rcu_qs();
- return;
- }
- local_irq_enable();
-}
-
#ifdef CONFIG_HOTPLUG_CPU
/*
* tasklet_kill_immediate is called to remove a tasklet which can already be
@@ -841,22 +801,13 @@ static int takeover_tasklets(unsigned int cpu)
#define takeover_tasklets NULL
#endif /* CONFIG_HOTPLUG_CPU */

-static struct smp_hotplug_thread softirq_threads = {
- .store = &ksoftirqd,
- .thread_should_run = ksoftirqd_should_run,
- .thread_fn = run_ksoftirqd,
- .thread_comm = "ksoftirqd/%u",
-};
-
-static __init int spawn_ksoftirqd(void)
+static __init int tasklet_set_takeover(void)
{
cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
takeover_tasklets);
- BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
-
return 0;
}
-early_initcall(spawn_ksoftirqd);
+early_initcall(tasklet_set_takeover);

/*
* [ These __weak aliases are kept in a separate compilation unit, so that
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index b4e4160..3b4811e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -912,7 +912,7 @@ void tcp_wfree(struct sk_buff *skb)
*/
WARN_ON(refcount_sub_and_test(skb->truesize - 1, &sk->sk_wmem_alloc));

- /* If this softirq is serviced by ksoftirqd, we are likely under stress.
+ /* If this softirq is serviced by workqueue, we are likely under stress.
* Wait until our queues (qdisc + devices) are drained.
* This gives :
* - less callbacks to tcp_write_xmit(), reducing stress (batches)
@@ -920,7 +920,7 @@ void tcp_wfree(struct sk_buff *skb)
* to migrate this flow (skb->ooo_okay will be eventually set)
*/
if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) &&
- (this_cpu_ksoftirqd() == current || softirq_serving_workqueue()))
+ softirq_serving_workqueue())
goto out;

for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
--
2.7.4