[RFC PATCH 2/5] softirq: Per vector deferment to workqueue

From: Frederic Weisbecker
Date: Mon Jan 15 2018 - 23:41:09 EST


Some softirq vectors can be more CPU hungry than others. Networking in
particular may sometimes have to deal with packet storms that need more
CPU time than IRQ tail processing can offer without inducing scheduler
latencies. In that case the current code defers further processing to
ksoftirqd, which behaves more nicely toward the scheduler. But deferring
everything this way can be bad for the other softirq vectors, which
usually need quick processing.

To solve this, defer to threads only the vectors that have exceeded
their call limit during IRQ tail processing, and keep servicing the
others inline from real softirq context. This is achieved using
workqueues with per-CPU, per-vector work items.

Note that ksoftirqd is not removed yet, as it is still needed for the
threaded IRQs mode.

Suggested-by: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Signed-off-by: Frederic Weisbecker <frederic@xxxxxxxxxx>
Cc: Dmitry Safonov <dima@xxxxxxxxxx>
Cc: Eric Dumazet <edumazet@xxxxxxxxxx>
Cc: Linus Torvalds <torvalds@xxxxxxxxxxxxxxxxxxxx>
Cc: Peter Zijlstra <peterz@xxxxxxxxxxxxx>
Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
Cc: David Miller <davem@xxxxxxxxxxxxx>
Cc: Hannes Frederic Sowa <hannes@xxxxxxxxxxxxxxxxxxx>
Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: Levin Alexander <alexander.levin@xxxxxxxxxxx>
Cc: Paolo Abeni <pabeni@xxxxxxxxxx>
Cc: Paul E. McKenney <paulmck@xxxxxxxxxxxxxxxxxx>
Cc: Radu Rendec <rrendec@xxxxxxxxxx>
Cc: Rik van Riel <riel@xxxxxxxxxx>
Cc: Stanislaw Gruszka <sgruszka@xxxxxxxxxx>
Cc: Thomas Gleixner <tglx@xxxxxxxxxxxxx>
Cc: Wanpeng Li <wanpeng.li@xxxxxxxxxxx>
Cc: Mauro Carvalho Chehab <mchehab@xxxxxxxxxxxxxxxx>
---
include/linux/interrupt.h | 2 +
kernel/sched/cputime.c | 5 +-
kernel/softirq.c | 121 +++++++++++++++++++++++++++++++++++++++++-----
net/ipv4/tcp_output.c | 3 +-
4 files changed, 117 insertions(+), 14 deletions(-)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 69c2382..92d044d 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -514,6 +514,8 @@ static inline struct task_struct *this_cpu_ksoftirqd(void)
return this_cpu_read(ksoftirqd);
}

+extern int softirq_serving_workqueue(void);
+
/* Tasklets --- multithreaded analogue of BHs.

Main feature differing them of generic softirqs: tasklet
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index bac6ac9..30f70e5 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -71,7 +71,8 @@ void irqtime_account_irq(struct task_struct *curr)
*/
if (hardirq_count())
irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
- else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+ else if (in_serving_softirq() && curr != this_cpu_ksoftirqd() &&
+ !softirq_serving_workqueue())
irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);
@@ -375,7 +376,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,

cputime -= other;

- if (this_cpu_ksoftirqd() == p) {
+ if (this_cpu_ksoftirqd() == p || softirq_serving_workqueue()) {
/*
* ksoftirqd time do not get accounted in cpu_softirq_time.
* So, we have to handle it separately here.
diff --git a/kernel/softirq.c b/kernel/softirq.c
index e0f4b29..255da68 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -63,14 +63,20 @@ const char * const softirq_to_name[NR_SOFTIRQS] = {
};

struct vector {
+ int nr;
unsigned int jiffy_calls;
unsigned long jiffy_snap;
+ struct work_struct work;
};

-static DEFINE_PER_CPU(struct vector, vector_cpu[NR_SOFTIRQS]) = {
- [0 ... NR_SOFTIRQS-1] = { 0, INITIAL_JIFFIES }
+struct softirq {
+ unsigned int pending_work_mask;
+ int work_running;
+ struct vector vector[NR_SOFTIRQS];
};

+static DEFINE_PER_CPU(struct softirq, softirq_cpu);
+
/*
* we cannot loop indefinitely here to avoid userspace starvation,
* but we also don't want to introduce a worst case 1/HZ latency
@@ -242,8 +248,77 @@ static inline bool lockdep_softirq_start(void) { return false; }
static inline void lockdep_softirq_end(bool in_hardirq) { }
#endif

+int softirq_serving_workqueue(void)
+{
+ return __this_cpu_read(softirq_cpu.work_running);
+}
+
+static void vector_work_func(struct work_struct *work)
+{
+ struct vector *vector = container_of(work, struct vector, work);
+ struct softirq *softirq = this_cpu_ptr(&softirq_cpu);
+ int vec_nr = vector->nr;
+ int vec_bit = BIT(vec_nr);
+ u32 pending;
+
+ local_irq_disable();
+ pending = local_softirq_pending();
+ account_irq_enter_time(current);
+ __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
+ lockdep_softirq_enter();
+ set_softirq_pending(pending & ~vec_bit);
+ local_irq_enable();
+
+ if (pending & vec_bit) {
+ struct softirq_action *sa = &softirq_vec[vec_nr];
+
+ kstat_incr_softirqs_this_cpu(vec_nr);
+ softirq->work_running = 1;
+ trace_softirq_entry(vec_nr);
+ sa->action(sa);
+ trace_softirq_exit(vec_nr);
+ softirq->work_running = 0;
+ }
+
+ local_irq_disable();
+
+ pending = local_softirq_pending();
+ if (pending & vec_bit)
+ schedule_work_on(smp_processor_id(), &vector->work);
+ else
+ softirq->pending_work_mask &= ~vec_bit;
+
+ lockdep_softirq_exit();
+ account_irq_exit_time(current);
+ __local_bh_enable(SOFTIRQ_OFFSET);
+ local_irq_enable();
+}
+
+static void do_softirq_workqueue(u32 pending)
+{
+ struct softirq *softirq = this_cpu_ptr(&softirq_cpu);
+ struct softirq_action *h = softirq_vec;
+ int softirq_bit;
+
+ pending &= ~softirq->pending_work_mask;
+
+ while ((softirq_bit = ffs(pending))) {
+ struct vector *vector;
+ unsigned int vec_nr;
+
+ h += softirq_bit - 1;
+ vec_nr = h - softirq_vec;
+ softirq->pending_work_mask |= BIT(vec_nr);
+ vector = &softirq->vector[vec_nr];
+ schedule_work_on(smp_processor_id(), &vector->work);
+ h++;
+ pending >>= softirq_bit;
+ }
+}
+
asmlinkage __visible void __softirq_entry __do_softirq(void)
{
+ struct softirq *softirq = this_cpu_ptr(&softirq_cpu);
unsigned long old_flags = current->flags;
struct softirq_action *h;
bool in_hardirq;
@@ -257,15 +332,18 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
*/
current->flags &= ~PF_MEMALLOC;

- pending = local_softirq_pending();
+ /* Ignore vectors pending on workqueues, they have been punished */
+ pending = local_softirq_pending() & ~softirq->pending_work_mask;
account_irq_enter_time(current);

__local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET);
in_hardirq = lockdep_softirq_start();
-
restart:
- /* Reset the pending bitmask before enabling irqs */
- set_softirq_pending(0);
+ /*
+ * Reset the pending bitmask before enabling irqs but keep
+ * those pending on workqueues so they get properly handled there.
+ */
+ set_softirq_pending(softirq->pending_work_mask);

local_irq_enable();

@@ -287,7 +365,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
h->action(h);
trace_softirq_exit(vec_nr);

- vector = this_cpu_ptr(&vector_cpu[vec_nr]);
+ vector = &softirq->vector[vec_nr];
if (time_before(vector->jiffy_snap, jiffies)) {
vector->jiffy_calls = 0;
vector->jiffy_snap = jiffies;
@@ -309,12 +387,18 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
rcu_bh_qs();
local_irq_disable();

- pending = local_softirq_pending();
+ pending = local_softirq_pending() & ~softirq->pending_work_mask;
if (pending) {
- if (overrun || need_resched())
+ if (need_resched()) {
wakeup_softirqd();
- else
- goto restart;
+ } else {
+ /* Vectors that overreached the limits are threaded */
+ if (overrun & pending)
+ do_softirq_workqueue(overrun & pending);
+ pending &= ~overrun;
+ if (pending)
+ goto restart;
+ }
}

lockdep_softirq_end(in_hardirq);
@@ -651,10 +735,25 @@ void __init softirq_init(void)
int cpu;

for_each_possible_cpu(cpu) {
+ struct softirq *softirq;
+ int i;
+
per_cpu(tasklet_vec, cpu).tail =
&per_cpu(tasklet_vec, cpu).head;
per_cpu(tasklet_hi_vec, cpu).tail =
&per_cpu(tasklet_hi_vec, cpu).head;
+
+ softirq = &per_cpu(softirq_cpu, cpu);
+
+ for (i = 0; i < NR_SOFTIRQS; i++) {
+ struct vector *vector;
+
+ vector = &softirq->vector[i];
+ vector->nr = i;
+ vector->jiffy_calls = 0;
+ vector->jiffy_snap = INITIAL_JIFFIES;
+ INIT_WORK(&vector->work, vector_work_func);
+ }
}

open_softirq(TASKLET_SOFTIRQ, tasklet_action);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index a4d214c..b4e4160 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -919,7 +919,8 @@ void tcp_wfree(struct sk_buff *skb)
* - chance for incoming ACK (processed by another cpu maybe)
* to migrate this flow (skb->ooo_okay will be eventually set)
*/
- if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) && this_cpu_ksoftirqd() == current)
+ if (refcount_read(&sk->sk_wmem_alloc) >= SKB_TRUESIZE(1) &&
+ (this_cpu_ksoftirqd() == current || softirq_serving_workqueue()))
goto out;

for (oval = READ_ONCE(sk->sk_tsq_flags);; oval = nval) {
--
2.7.4