Subject: [PATCH] sched/fair: prioritize normal tasks over sched_idle tasks with vruntime offset

From: chenying
Date: Fri Mar 11 2022 - 02:59:29 EST


We add a per-entity vruntime offset that is taken into account whenever
entities are compared, so that an idle sched_entity, when enqueued, always
ends up to the right of the non-idle entities in the runqueue. This allows
non-idle tasks to be selected and run before the idle ones.
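
For illustration only (not part of this patch), the effective ordering key
becomes vruntime + vruntime_offset, so with the 30-day offset defined below
an idle entity loses every comparison against a non-idle one unless their
plain vruntimes already differ by more than 30 days:

	/* Illustrative numbers, assuming the VRUNTIME_OFFSET feature is on. */
	u64 idle_offset = 2592000000000000ULL;	/* 30 days in ns */

	/* idle se:     vruntime = 100ms, vruntime_offset = idle_offset */
	/* non-idle se: vruntime = 200ms, vruntime_offset = 0           */
	s64 key = (s64)(100000000ULL - 200000000ULL + idle_offset);

	/*
	 * entity_before(idle, non_idle) evaluates (key < 0), which is
	 * false here, so the idle entity is inserted to the right of the
	 * non-idle one and gets picked later.
	 */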

A typical use case is SCHED_IDLE for background tasks and a non-idle
policy for foreground tasks. The foreground tasks are latency sensitive
and should not be disturbed by the background ones. Idle tasks can
already be preempted by non-idle tasks on wakeup, but the scheduler does
not distinguish between idle and non-idle entities when picking the next
entity to run. This may cause background tasks to disturb the foreground.
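
For reference (again not part of this patch), a background task can switch
itself to SCHED_IDLE from userspace with sched_setscheduler(), which is
what "chrt -i 0" does in the test below; a minimal sketch:

	#define _GNU_SOURCE		/* for SCHED_IDLE in <sched.h> */
	#include <sched.h>
	#include <stdio.h>

	/* Put the calling task into SCHED_IDLE (priority must be 0). */
	static int make_idle(void)
	{
		struct sched_param sp = { .sched_priority = 0 };

		if (sched_setscheduler(0, SCHED_IDLE, &sp)) {
			perror("sched_setscheduler");
			return -1;
		}
		return 0;
	}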

Test results are shown below:

~$ ./loop.sh &
[1] 764
~$ chrt -i 0 ./loop.sh &
[2] 765
~$ taskset -p 04 764
~$ taskset -p 04 765

~$ top -p 764 -p 765
top - 13:10:01 up 1 min,  2 users,  load average: 1.30, 0.38, 0.13
Tasks:   2 total,   2 running,   0 sleeping,   0 stopped,   0 zombie
%Cpu(s): 12.5 us,  0.0 sy,  0.0 ni, 87.4 id,  0.0 wa,  0.0 hi, 0.0 si,  0.0 st
KiB Mem : 16393492 total, 16142256 free,   111028 used,   140208 buff/cache
KiB Swap:   385836 total,   385836 free,        0 used. 16037992 avail Mem

  PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM TIME+ COMMAND
  764 chenyin+  20   0   12888   1144   1004 R 100.0  0.0 1:05.12 loop.sh
  765 chenyin+  20   0   12888   1224   1080 R   0.0  0.0 0:16.21 loop.sh

The non-idle process (764) runs at 100% CPU without being disturbed by
the idle process (765).

~$ cat /sys/fs/cgroup/cpu/background/cgroup.procs
765
~$ cat /sys/fs/cgroup/cpu/foreground/cgroup.procs
764
~$ top -p 764 -p 765
top - 13:17:19 up 9 min,  2 users,  load average: 2.00, 1.64, 0.86
Tasks:   2 total,   2 running,   0 sleeping,   0 stopped,   0 zombie
%Cpu(s): 12.5 us,  0.0 sy,  0.0 ni, 87.5 id,  0.0 wa,  0.0 hi, 0.0 si,  0.0 st
KiB Mem : 16393492 total, 16139576 free,   112732 used,   141184 buff/cache
KiB Swap:   385836 total,   385836 free,        0 used. 16036236 avail Mem

  PID USER      PR  NI    VIRT    RES    SHR S  %CPU %MEM TIME+ COMMAND
  764 chenyin+  20   0   12888   1144   1004 R 100.0  0.0 8:23.51 loop.sh
  765 chenyin+  20   0   12888   1224   1080 R   0.0  0.0 0:16.21 loop.sh

The non-idle group runs at 100% CPU without being disturbed by the
idle group.

Co-developed-by: chengming zhou <zhouchengming@xxxxxxxxxxxxx>
Signed-off-by: chenying <chenying.kernel@xxxxxxxxxxxxx>
---
 include/linux/sched.h   |  1 +
 kernel/sched/core.c     |  6 +++++-
 kernel/sched/debug.c    |  2 ++
 kernel/sched/fair.c     | 26 ++++++++++++++++++++++----
 kernel/sched/features.h |  2 ++
 kernel/sched/sched.h    |  1 +
 6 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 75ba8aa60248..20412f353cad 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -545,6 +545,7 @@ struct sched_entity {
     u64                exec_start;
     u64                sum_exec_runtime;
     u64                vruntime;
+    u64                vruntime_offset;
     u64                prev_sum_exec_runtime;

     u64                nr_migrations;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9745613d531c..beb9d6f54c52 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4239,6 +4239,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
     p->se.prev_sum_exec_runtime    = 0;
     p->se.nr_migrations        = 0;
     p->se.vruntime            = 0;
+    p->se.vruntime_offset        = 0;
     INIT_LIST_HEAD(&p->se.group_node);

 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -7211,8 +7212,11 @@ static void __setscheduler_params(struct task_struct *p,

     if (dl_policy(policy))
         __setparam_dl(p, attr);
-    else if (fair_policy(policy))
+    else if (fair_policy(policy)) {
         p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+        p->se.vruntime_offset = 0;
+    } else if (idle_policy(policy))
+        p->se.vruntime_offset = sched_idle_vruntime_offset;

     /*
      * __sched_setscheduler() ensures attr->sched_priority == 0 when
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index aa29211de1bf..701496626830 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -460,6 +460,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group

     PN(se->exec_start);
     PN(se->vruntime);
+    PN(se->vruntime_offset);
     PN(se->sum_exec_runtime);

     if (schedstat_enabled()) {
@@ -969,6 +970,7 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns,

     PN(se.exec_start);
     PN(se.vruntime);
+    PN(se.vruntime_offset);
     PN(se.sum_exec_runtime);

     nr_switches = p->nvcsw + p->nivcsw;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5146163bfabb..6a2cba63b4a9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -92,6 +92,8 @@ static unsigned int normalized_sysctl_sched_wakeup_granularity    = 1000000UL;

 const_debug unsigned int sysctl_sched_migration_cost    = 500000UL;

+unsigned long long sched_idle_vruntime_offset    = 2592000000000000; /* 30 days */
+
 int sched_thermal_decay_shift;
 static int __init setup_sched_thermal_decay_shift(char *str)
 {
@@ -535,10 +537,19 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
     return min_vruntime;
 }

+static inline s64 vtime_diff(struct sched_entity *a,
+                struct sched_entity *b)
+{
+    if (sched_feat(VRUNTIME_OFFSET))
+        return (s64)(a->vruntime_offset - b->vruntime_offset);
+    else
+        return 0;
+}
+
 static inline bool entity_before(struct sched_entity *a,
                 struct sched_entity *b)
 {
-    return (s64)(a->vruntime - b->vruntime) < 0;
+    return (s64)(a->vruntime - b->vruntime + vtime_diff(a, b)) < 0;
 }

 #define __node_2_se(node) \
@@ -4445,7 +4456,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
         return;

     se = __pick_first_entity(cfs_rq);
-    delta = curr->vruntime - se->vruntime;
+    delta = curr->vruntime - se->vruntime + vtime_diff(curr, se);

     if (delta < 0)
         return;
@@ -7036,7 +7047,7 @@ static unsigned long wakeup_gran(struct sched_entity *se)
 static int
 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
 {
-    s64 gran, vdiff = curr->vruntime - se->vruntime;
+    s64 gran, vdiff = curr->vruntime - se->vruntime + vtime_diff(curr, se);

     if (vdiff <= 0)
         return -1;
@@ -11131,7 +11142,7 @@ bool cfs_prio_less(struct task_struct *a, struct task_struct *b, bool in_fi)
      * min_vruntime_fi, which would have been updated in prior calls
      * to se_fi_update().
      */
-    delta = (s64)(sea->vruntime - seb->vruntime) +
+    delta = (s64)(sea->vruntime - seb->vruntime + vtime_diff(sea, seb)) +
         (s64)(cfs_rqb->min_vruntime_fi - cfs_rqa->min_vruntime_fi);

     return delta > 0;
@@ -11190,6 +11201,9 @@ static void task_fork_fair(struct task_struct *p)
     }
     place_entity(cfs_rq, se, 1);

+    if (task_has_idle_policy(p))
+        se->vruntime_offset = sched_idle_vruntime_offset;
+
     if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
         /*
          * Upon rescheduling, sched_class::put_prev_task() will place
@@ -11655,6 +11669,10 @@ int sched_group_set_idle(struct task_group *tg, long idle)
         rq_lock_irqsave(rq, &rf);

         grp_cfs_rq->idle = idle;
+        if (idle)
+            se->vruntime_offset = sched_idle_vruntime_offset;
+        else
+            se->vruntime_offset = 0;
         if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
             goto next_cpu;

diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 1cf435bbcd9c..f59f507e6dba 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -100,3 +100,5 @@ SCHED_FEAT(LATENCY_WARN, false)

 SCHED_FEAT(ALT_PERIOD, true)
 SCHED_FEAT(BASE_SLICE, true)
+
+SCHED_FEAT(VRUNTIME_OFFSET, true)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index de53be905739..1bc0c0756fd4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -95,6 +95,7 @@ extern __read_mostly int scheduler_running;

 extern unsigned long calc_load_update;
 extern atomic_long_t calc_load_tasks;
+extern unsigned long long sched_idle_vruntime_offset;

 extern void calc_global_load_tick(struct rq *this_rq);
 extern long calc_load_fold_active(struct rq *this_rq, long adjust);
--
2.11.0