[RFC 6/6] softirq/sched: Account si cpu time to ksoftirqd(s)

From: Dmitry Safonov
Date: Thu Jan 18 2018 - 11:13:14 EST


Warning: not merge-ready in any sense

Under CONFIG_FAIR_SOFTIRQ_SCHEDULE, each sched tick accounts the CPU
time spent processing softirqs to the ksoftirqd of the softirq's group,
updating ksoftirqd->se.sum_exec_runtime and recalculating
ksoftirqd->se.vruntime accordingly.

Use CFS's vruntime to decide whether a softirq should be serviced
inline or deferred to ksoftirqd. This can be tuned via the nice value
of the ksoftirqd threads.

Signed-off-by: Dmitry Safonov <dima@xxxxxxxxxx>
---
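Notes (illustration only, not part of the commit): the serve-vs-defer
decision below reduces to a signed vruntime comparison, and the nice
value enters through calc_delta_fair()'s load-weight scaling. A minimal
userspace sketch of both, with made-up weights standing in for real
runqueue state:

	/* sketch.c - illustration only, not kernel code */
	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Serve inline unless ksoftirqd is already ahead in vruntime. */
	static bool serve_inline(uint64_t curr_vruntime, uint64_t ksd_vruntime)
	{
		/* Signed difference, as CFS does, so wrap-around is safe. */
		return (int64_t)(curr_vruntime - ksd_vruntime) >= 0;
	}

	/* Weight-scaled vruntime charge, mimicking calc_delta_fair(). */
	static uint64_t vruntime_charge(uint64_t delta_ns,
					uint64_t nice0_weight,
					uint64_t task_weight)
	{
		return delta_ns * nice0_weight / task_weight;
	}

	int main(void)
	{
		/* current has more vruntime than ksoftirqd: serve inline. */
		printf("%d\n", serve_inline(2000, 1000));	/* prints 1 */
		/* A heavier (niced-down) thread is charged less vruntime
		 * per nanosecond, so it keeps winning the comparison. */
		printf("%llu\n", (unsigned long long)
		       vruntime_charge(1000000, 1024, 2048));	/* 500000 */
		return 0;
	}

Raising a ksoftirqd's nice value shrinks its load weight, so each
nanosecond of softirq time inflates its vruntime faster and its
softirqs get deferred sooner; lowering the nice value does the
opposite.
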
include/linux/interrupt.h | 1 +
kernel/sched/fair.c | 38 ++++++++++++++++++++++++++++++++++++++
kernel/sched/sched.h | 19 +++++++++++++++++++
kernel/softirq.c | 45 +++++++++++++++++++++++++++++++++++++--------
4 files changed, 95 insertions(+), 8 deletions(-)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 17e1a04445fa..a0b5c24c088a 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -512,6 +512,7 @@ extern struct task_struct *__percpu **ksoftirqd;
extern unsigned nr_softirq_groups;

extern bool servicing_softirq(unsigned nr);
+extern unsigned group_softirqs(unsigned nr);
static inline bool current_is_ksoftirqd(void)
{
unsigned i;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2fe3aa853e4d..d0105739551f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -813,6 +813,42 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
}
#endif /* CONFIG_SMP */

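+/*
+ * Charge the softirq time observed since the last update to the
+ * ksoftirqd thread of each softirq group, so that CFS accounts it
+ * in the thread's sum_exec_runtime and vruntime.
+ */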
+static void update_ksoftirqd(struct cfs_rq *cfs_rq)
+{
+#ifdef CONFIG_FAIR_SOFTIRQ_SCHEDULE
+ int rq_cpu = cpu_of(rq_of(cfs_rq));
+ u64 si_times[NR_SOFTIRQS], delta[NR_SOFTIRQS];
+ unsigned i;
+
+ if (unlikely(!ksoftirqd))
+ return;
+
+ softirq_time_read(rq_cpu, si_times);
+
+ for (i = 0; i < NR_SOFTIRQS; i++) {
+ delta[i] = si_times[i] - cfs_rq->prev_si_time[i];
+ cfs_rq->prev_si_time[i] = si_times[i];
+ if (unlikely((s64)delta[i] < 0))
+ delta[i] = 0;
+ }
+
+ for (i = 0; i < nr_softirq_groups; i++) {
+ unsigned j, softirq = 0, group_mask = group_softirqs(i);
+ struct task_struct *tsk = *this_cpu_ptr(ksoftirqd[i]);
+ u64 sum_delta = 0;
+
+ if (!tsk)
+ continue;
+
+ while ((j = ffs(group_mask))) {
+ /* ffs() is 1-based: track the absolute bit position */
+ softirq += j;
+ group_mask >>= j;
+ sum_delta += delta[softirq - 1];
+ }
+
+ tsk->se.sum_exec_runtime += sum_delta;
+ tsk->se.vruntime += calc_delta_fair(sum_delta, &tsk->se);
+ }
+#endif
+}
+
/*
* Update the current task's runtime statistics.
*/
@@ -822,6 +858,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
u64 now = rq_clock_task(rq_of(cfs_rq));
u64 delta_exec;

+ update_ksoftirqd(cfs_rq);
+
if (unlikely(!curr))
return;

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 14e154c86dc5..e95d8d4f9146 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -487,6 +487,10 @@ struct cfs_rq {
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */

+#ifdef CONFIG_FAIR_SOFTIRQ_SCHEDULE
+ u64 prev_si_time[NR_SOFTIRQS];
+#endif
+
#ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
u64 runtime_expires;
@@ -2081,6 +2085,21 @@ static inline u64 irq_time_read(int cpu)
}
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */

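+/* Snapshot the per-softirq cumulative times of @cpu. */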
+static inline void softirq_time_read(int cpu, u64 si_times[NR_SOFTIRQS])
+{
+#ifdef CONFIG_FAIR_SOFTIRQ_SCHEDULE
+ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
+ unsigned int seq, i;
+
+ for (i = 0; i < NR_SOFTIRQS; i++) {
+ do {
+ seq = __u64_stats_fetch_begin(&irqtime->sync);
+ si_times[i] = irqtime->total_si[i];
+ } while (__u64_stats_fetch_retry(&irqtime->sync, seq));
+ }
+#endif
+}
+
#ifdef CONFIG_CPU_FREQ
DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);

diff --git a/kernel/softirq.c b/kernel/softirq.c
index 516e31d3d5b4..a123bafa11c2 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -82,6 +82,11 @@ bool servicing_softirq(unsigned nr)
return false;
}

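+/* Bitmask of the softirqs serviced by ksoftirqd group @nr. */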
+unsigned group_softirqs(unsigned nr)
+{
+ return group_to_softirqs[nr];
+}
+
/*
* we cannot loop indefinitely here to avoid userspace starvation,
* but we also don't want to introduce a worst case 1/HZ latency
@@ -112,15 +117,10 @@ static void wakeup_softirqd(u32 softirq_mask)
* If ksoftirqd is scheduled, we do not want to process pending softirqs
* right now. Let ksoftirqd handle this at its own rate, to get fairness.
*/
-static bool ksoftirqd_running(void)
+static bool ksoftirqd_running(__u32 pending)
{
- /* We rely that there are pending softirqs */
- __u32 pending = local_softirq_pending();
unsigned i;

- if (!ksoftirqd)
- return false;
-
for (i = 0; i < nr_softirq_groups && pending; i++) {
/* Interrupts are disabled: no need to stop preemption */
struct task_struct *tsk = *this_cpu_ptr(ksoftirqd[i]);
@@ -137,6 +137,33 @@ static bool ksoftirqd_running(void)
return !pending;
}

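+/*
+ * Return the subset of @pending that may be serviced inline. Softirqs
+ * of a group whose ksoftirqd has accumulated more vruntime than the
+ * current task are deferred: the thread is woken up instead.
+ */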
+static __u32 softirqs_to_serve(__u32 pending)
+{
+ unsigned i;
+ __u32 unserve = pending;
+
+ if (!ksoftirqd || !current || is_idle_task(current))
+ return pending;
+
+ if (!IS_ENABLED(CONFIG_FAIR_SOFTIRQ_SCHEDULE))
+ return ksoftirqd_running(pending) ? 0 : pending;
+
+ for (i = 0; i < nr_softirq_groups && unserve; i++) {
+ /* Interrupts are disabled: no need to stop preemption */
+ struct task_struct *tsk = *this_cpu_ptr(ksoftirqd[i]);
+
+ if (tsk && (s64)(current->se.vruntime - tsk->se.vruntime) < 0) {
+ if (tsk->state != TASK_RUNNING)
+ wake_up_process(tsk);
+ continue;
+ }
+
+ unserve &= ~group_to_softirqs[i];
+ }
+
+ return pending & ~unserve;
+}
+
/*
* preempt_count and SOFTIRQ_OFFSET usage:
* - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
@@ -385,7 +412,8 @@ asmlinkage __visible void do_softirq(void)

local_irq_save(flags);

- if (!ksoftirqd_running())
+ pending = softirqs_to_serve(pending);
+ if (pending)
do_softirq_own_stack(pending);

local_irq_restore(flags);
@@ -414,7 +442,8 @@ static inline void invoke_softirq(void)
{
__u32 pending = local_softirq_pending();

- if (!pending || !ksoftirqd_running())
+ pending = softirqs_to_serve(pending);
+ if (!pending)
return;

if (!force_irqthreads) {
--
2.13.6