[RFC 4/6] softirq: Run per-group per-cpu ksoftirqd thread

From: Dmitry Safonov
Date: Thu Jan 18 2018 - 11:13:47 EST


Running one ksoftirqd thread per-cpu allows deferring softirq processing
during a storm. But having only a single ksoftirqd thread for that makes
it worse for the other kinds of softirqs: as we check ksoftirqd_running()
and defer all softirqs until the ksoftirqd time-slice, latencies are
introduced. While that is acceptable for the softirqs under storm, the
other softirqs are needlessly impeded.
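
Roughly, this is the existing gate on irq exit that causes the problem
(simplified sketch, not part of this patch):

        static inline void invoke_softirq(void)
        {
                /*
                 * A single RUNNING ksoftirqd defers every pending
                 * softirq to its time-slice, no matter which softirq
                 * caused the deferral.
                 */
                if (ksoftirqd_running())
                        return;

                /* ... otherwise pending softirqs are served inline ... */
        }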

Create a ksoftirqd thread for each softirq group; it serves the deferred
softirqs of that group. Softirqs of the other groups will still be served
as they come.

Without the kernel parameter this works as before: a default softirq
group is created which includes all softirqs, and only one ksoftirqd
thread runs per-cpu.
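
With per-group threads the wakeup path only kicks the per-cpu threads of
the groups that actually have pending softirqs. Condensed from the diff
below (the check for the not-yet-allocated ksoftirqd array is dropped
for brevity):

        static void wakeup_softirqd(u32 softirq_mask)
        {
                unsigned i;

                for (i = 0; i < nr_softirq_groups; i++) {
                        struct task_struct *tsk;

                        if (!(softirq_mask & group_to_softirqs[i]))
                                continue;

                        /* Interrupts disabled: no need to stop preemption */
                        tsk = *this_cpu_ptr(ksoftirqd[i]);
                        if (tsk && tsk->state != TASK_RUNNING)
                                wake_up_process(tsk);
                }
        }

The resulting threads are named per-group and per-cpu, e.g.
ksoftirqd-g0/0, ksoftirqd-g0/1, ksoftirqd-g1/0, ... (see the thread_comm
format string below).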

Signed-off-by: Dmitry Safonov <dima@xxxxxxxxxx>
---
include/linux/interrupt.h |  16 +++-
kernel/sched/cputime.c    |  27 ++++---
kernel/softirq.c          | 187 ++++++++++++++++++++++++++++++++++++----------
net/ipv4/tcp_output.c     |   2 +-
4 files changed, 177 insertions(+), 55 deletions(-)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 2ea09896bd6e..17e1a04445fa 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -508,11 +508,21 @@ extern void __raise_softirq_irqoff(unsigned int nr);
extern void raise_softirq_irqoff(unsigned int nr);
extern void raise_softirq(unsigned int nr);

-DECLARE_PER_CPU(struct task_struct *, ksoftirqd);
+extern struct task_struct *__percpu **ksoftirqd;
+extern unsigned nr_softirq_groups;

-static inline struct task_struct *this_cpu_ksoftirqd(void)
+extern bool servicing_softirq(unsigned nr);
+static inline bool current_is_ksoftirqd(void)
{
- return this_cpu_read(ksoftirqd);
+ unsigned i;
+
+ if (!ksoftirqd)
+ return false;
+
+ for (i = 0; i < nr_softirq_groups; i++)
+ if (*this_cpu_ptr(ksoftirqd[i]) == current)
+ return true;
+ return false;
}

/* Tasklets --- multithreaded analogue of BHs.
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index bac6ac9a4ec7..faacba00a153 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -46,6 +46,18 @@ static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
u64_stats_update_end(&irqtime->sync);
}

+static void irqtime_account_softirq(struct irqtime *irqtime, s64 delta)
+{
+ /*
+ * We do not account for softirq time from ksoftirqd here.
+ * We want to continue accounting softirq time to the ksoftirqd thread
+ * in that case, so as not to confuse the scheduler with a special task
+ * that does not consume any time, but still wants to run.
+ */
+ if (!current_is_ksoftirqd())
+ irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
+}
+
/*
* Called before incrementing preempt_count on {soft,}irq_enter
* and before decrementing preempt_count on {soft,}irq_exit.
@@ -63,16 +75,11 @@ void irqtime_account_irq(struct task_struct *curr)
delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
irqtime->irq_start_time += delta;

- /*
- * We do not account for softirq time from ksoftirqd here.
- * We want to continue accounting softirq time to ksoftirqd thread
- * in that case, so as not to confuse scheduler with a special task
- * that do not consume any time, but still wants to run.
- */
- if (hardirq_count())
+ if (hardirq_count()) {
irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
- else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
- irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
+ } else if (in_serving_softirq()) {
+ irqtime_account_softirq(irqtime, delta);
+ }
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);

@@ -375,7 +382,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,

cputime -= other;

- if (this_cpu_ksoftirqd() == p) {
+ if (current_is_ksoftirqd()) {
/*
* ksoftirqd time do not get accounted in cpu_softirq_time.
* So, we have to handle it separately here.
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 7de5791c08f9..fdde3788afba 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -55,28 +55,56 @@ EXPORT_SYMBOL(irq_stat);

static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
static unsigned group_to_softirqs[sizeof(softirq_vec[0].group_mask)] __cacheline_aligned_in_smp;
-static unsigned __initdata nr_softirq_groups = 0;
-
-DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
+struct task_struct *__percpu **ksoftirqd;
+unsigned nr_softirq_groups;

const char * const softirq_to_name[NR_SOFTIRQS] = {
"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "IRQ_POLL",
"TASKLET", "SCHED", "HRTIMER", "RCU"
};

+bool servicing_softirq(unsigned nr)
+{
+ u32 group_mask = softirq_vec[nr].group_mask;
+ unsigned i, group = 0;
+
+ if (!ksoftirqd)
+ return false;
+
+ while ((i = ffs(group_mask))) {
+ group += i;
+ if (*this_cpu_ptr(ksoftirqd[group - 1]) == current)
+ return true;
+ group_mask >>= i;
+ }
+
+ return false;
+}
+
/*
* we cannot loop indefinitely here to avoid userspace starvation,
* but we also don't want to introduce a worst case 1/HZ latency
* to the pending events, so lets the scheduler to balance
* the softirq load for us.
*/
-static void wakeup_softirqd(void)
+static void wakeup_softirqd(u32 softirq_mask)
{
- /* Interrupts are disabled: no need to stop preemption */
- struct task_struct *tsk = __this_cpu_read(ksoftirqd);
+ unsigned i;
+
+ if (!ksoftirqd)
+ return;

- if (tsk && tsk->state != TASK_RUNNING)
- wake_up_process(tsk);
+ for (i = 0; i < nr_softirq_groups; i++) {
+ if (softirq_mask & group_to_softirqs[i]) {
+ struct task_struct *tsk;
+
+ /* Interrupts disabled: no need to stop preemption */
+ tsk = *this_cpu_ptr(ksoftirqd[i]);
+ if (tsk && tsk->state != TASK_RUNNING)
+ wake_up_process(tsk);
+ }
+
+ }
}

/*
@@ -85,9 +113,27 @@ static void wakeup_softirqd(void)
*/
static bool ksoftirqd_running(void)
{
- struct task_struct *tsk = __this_cpu_read(ksoftirqd);
+ /* We rely on there being pending softirqs (returns true if none) */
+ __u32 pending = local_softirq_pending();
+ unsigned i;

- return tsk && (tsk->state == TASK_RUNNING);
+ if (!ksoftirqd)
+ return false;
+
+ for (i = 0; i < nr_softirq_groups && pending; i++) {
+ /* Interrupts are disabled: no need to stop preemption */
+ struct task_struct *tsk = *this_cpu_ptr(ksoftirqd[i]);
+
+ if (!(pending & group_to_softirqs[i]))
+ continue;
+
+ if (!tsk || tsk->state != TASK_RUNNING)
+ continue;
+
+ pending &= ~group_to_softirqs[i];
+ }
+
+ return !pending;
}

/*
@@ -306,7 +352,8 @@ asmlinkage __visible void __softirq_entry __do_softirq(__u32 mask)
--max_restart)
goto restart;

- wakeup_softirqd();
+ /* XXX: not fair ATM, next patches will fix that */
+ wakeup_softirqd(pending);
}

lockdep_softirq_end(in_hardirq);
@@ -375,7 +422,7 @@ static inline void invoke_softirq(void)
do_softirq_own_stack(pending);
#endif
} else {
- wakeup_softirqd();
+ wakeup_softirqd(local_softirq_pending());
}
}

@@ -429,7 +476,7 @@ inline void raise_softirq_irqoff(unsigned int nr)
* schedule the softirq soon.
*/
if (!in_interrupt())
- wakeup_softirqd();
+ wakeup_softirqd(local_softirq_pending());
}

void raise_softirq(unsigned int nr)
@@ -685,27 +732,6 @@ void __init softirq_init(void)
open_softirq(HI_SOFTIRQ, tasklet_hi_action);
}

-static int ksoftirqd_should_run(unsigned int cpu)
-{
- return local_softirq_pending();
-}
-
-static void run_ksoftirqd(unsigned int cpu)
-{
- local_irq_disable();
- if (local_softirq_pending()) {
- /*
- * We can safely run softirq on inline stack, as we are not deep
- * in the task stack here.
- */
- __do_softirq(~0);
- local_irq_enable();
- cond_resched_rcu_qs();
- return;
- }
- local_irq_enable();
-}
-
#ifdef CONFIG_HOTPLUG_CPU
/*
* tasklet_kill_immediate is called to remove a tasklet which can already be
@@ -768,18 +794,97 @@ static int takeover_tasklets(unsigned int cpu)
#define takeover_tasklets NULL
#endif /* CONFIG_HOTPLUG_CPU */

-static struct smp_hotplug_thread softirq_threads = {
- .store = &ksoftirqd,
- .thread_should_run = ksoftirqd_should_run,
- .thread_fn = run_ksoftirqd,
- .thread_comm = "ksoftirqd/%u",
-};
+static int ksoftirqd_should_run(unsigned int cpu)
+{
+ __u32 pending = local_softirq_pending();
+ unsigned group;
+
+ if (!ksoftirqd)
+ return 0;
+
+ for (group = 0; group < nr_softirq_groups; group++)
+ if (*this_cpu_ptr(ksoftirqd[group]) == current)
+ break;
+
+ if (WARN_ON_ONCE(group == nr_softirq_groups))
+ return 0;
+
+ return pending & group_to_softirqs[group];
+}
+
+static void run_ksoftirqd(unsigned int cpu)
+{
+ unsigned group;
+
+ for (group = 0; group < nr_softirq_groups; group++)
+ if (*this_cpu_ptr(ksoftirqd[group]) == current)
+ break;
+
+ local_irq_disable();
+ if (local_softirq_pending()) {
+ /*
+ * We can safely run softirq on inline stack, as we are not deep
+ * in the task stack here.
+ */
+ __do_softirq(group_to_softirqs[group]);
+ local_irq_enable();
+ cond_resched_rcu_qs();
+ return;
+ }
+ local_irq_enable();
+}
+
+static __init
+int register_ksoftirqd_group(unsigned nr, struct task_struct *__percpu **tsk)
+{
+ struct smp_hotplug_thread *thread;
+ char *thread_comm;
+
+ thread = kzalloc(sizeof(struct smp_hotplug_thread), GFP_KERNEL);
+ if (WARN_ON_ONCE(!thread))
+ return 1;
+
+ thread_comm = kzalloc(TASK_COMM_LEN, GFP_KERNEL);
+ if (WARN_ON_ONCE(!thread_comm))
+ return 1;
+
+ *tsk = alloc_percpu(struct task_struct*);
+ if (WARN_ON(!*tsk))
+ return 1;
+
+ snprintf(thread_comm, TASK_COMM_LEN, "ksoftirqd-g%u/%%u", nr);
+
+ thread->thread_comm = thread_comm;
+ thread->store = *tsk;
+ thread->thread_should_run = ksoftirqd_should_run;
+ thread->thread_fn = run_ksoftirqd;
+
+ if (WARN_ON_ONCE(smpboot_register_percpu_thread(thread)))
+ return 1;
+
+ return 0;
+}

static __init int spawn_ksoftirqd(void)
{
+ size_t k_groups_sz = sizeof(struct task_struct *__percpu *);
+ struct task_struct *__percpu **tmp;
+ unsigned group;
+
cpuhp_setup_state_nocalls(CPUHP_SOFTIRQ_DEAD, "softirq:dead", NULL,
takeover_tasklets);
- BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
+
+ tmp = kmalloc_array(nr_softirq_groups, k_groups_sz, GFP_KERNEL);
+ if (WARN_ON(!tmp))
+ return 1;
+
+ for (group = 0; group < nr_softirq_groups; group++) {
+ if (register_ksoftirqd_group(group, &tmp[group]))
+ return 1;
+ }
+
+ smp_wmb();
+ ksoftirqd = tmp;

return 0;
}
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a4d214c7b506..bb403be3987e 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -919,7 +919,7 @@ void tcp_wfree(struct sk_buff *skb)
* - chance for incoming ACK (processed by another cpu maybe)
* to migrate this flow (skb->ooo_okay will be eventually set)
*/
- if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
+ if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) && servicing_softirq(NET_TX_SOFTIRQ))
goto out;

for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
--
2.13.6