Thanks for your reply!
On 2022/8/10 12:02, Abel Wu wrote:
Hi Zhang Song, please consider the situation that CPU A has several normal tasks and hundreds of idle tasks
On 8/10/22 9:56 AM, zhangsong wrote:
For co-location with NORMAL and IDLE tasks, when CFS trigger load balance,
it is reasonable to prefer migrating NORMAL(Latency Sensitive) tasks from
the busy src CPU to dst CPU, and migrating IDLE tasks last.
Considering the large weight difference between normal and idle tasks,
does the re-ordering really change things? It would be helpful if you
can offer more detailed info.
while CPU B is idle, and CPU B needs to pull some tasks from CPU A, but the cfs_tasks in CPU A
are not in order of priority, and the max number of pulling tasks depends on env->loop_max,
which value is sysctl_sched_nr_migrate, i.e. 32.
We now cannot guarantee that CPU B can pull a certain number of normal tasks instead of idle tasks
from the waiting queue of CPU A, so it is necessary to divide cfs_tasks into two different lists
and ensure that tasks in the non-idle list can be migrated first.
The tg_is_idle is used to check whether the task group of se is idle.
...
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3034,6 +3034,21 @@ static inline void update_scan_period(struct task_struct *p, int new_cpu)
#endif /* CONFIG_NUMA_BALANCING */
+#ifdef CONFIG_SMP
+static void
+adjust_rq_cfs_tasks(void (*list_op)(struct list_head *, struct list_head *),
+ struct rq *rq,
+ struct sched_entity *se)
+{
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ if (task_has_idle_policy(task_of(se)) || tg_is_idle(cfs_rq->tg))
The tg_is_idle() doesn't have hierarchical judgement on parent task
groups, while rq->cfs{,_idle}_tasks is rq wide. Say A->B where tgA
is idle and tgB isn't, a task from B will be added to the non-idle
list, is this what you want?
I think it is unlikely that the parent group is idle but the child group is non-idle.
If it happens, the current load balance policy yet not be affected.
Actually, we do not change the policy behavior of idle task group.
+ (*list_op)(&se->group_node, &rq->cfs_idle_tasks);
+ else
+ (*list_op)(&se->group_node, &rq->cfs_tasks);
+}
+#endif
+
static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -3043,7 +3058,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
struct rq *rq = rq_of(cfs_rq);
account_numa_enqueue(rq, task_of(se));
- list_add(&se->group_node, &rq->cfs_tasks);
+ adjust_rq_cfs_tasks(list_add, rq, se);
}
#endif
cfs_rq->nr_running++;
@@ -7465,7 +7480,7 @@ done: __maybe_unused;
* the list, so our cfs_tasks list becomes MRU
* one.
*/
- list_move(&p->se.group_node, &rq->cfs_tasks);
+ adjust_rq_cfs_tasks(list_move, rq, &p->se);
#endif
if (hrtick_enabled_fair(rq))
@@ -7788,6 +7803,9 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
if (unlikely(task_has_idle_policy(p)))
return 0;
+ if (tg_is_idle(cfs_rq_of(&p->se)->tg))
+ return 0;
+
Same as above. But I am not sure this is the right way to do it. We
still want to maintain policy behavior inside an idle task group.
We only want to ensure migration priority when load balance
pulls/pushes tasks between CPUs.
OK, let me think more about how to define it an appropriate variable.
/* SMT siblings share cache */
if (env->sd->flags & SD_SHARE_CPUCAPACITY)
return 0;
@@ -7800,6 +7818,11 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
&p->se == cfs_rq_of(&p->se)->last))
return 1;
+ /* Preempt sched idle cpu do not consider migration cost */
+ if (cpus_share_cache(env->src_cpu, env->dst_cpu) &&
+ sched_idle_cpu(env->dst_cpu))
+ return 0;
+
if (sysctl_sched_migration_cost == -1)
return 1;
@@ -7990,11 +8013,14 @@ static void detach_task(struct task_struct *p, struct lb_env *env)
static struct task_struct *detach_one_task(struct lb_env *env)
{
struct task_struct *p;
+ struct list_head *tasks = &env->src_rq->cfs_tasks;
+ int loop = 0;
Maybe a boolean variable is enough (and more readable)?
Thanks,
Abel
lockdep_assert_rq_held(env->src_rq);
+again:
list_for_each_entry_reverse(p,
- &env->src_rq->cfs_tasks, se.group_node) {
+ tasks, se.group_node) {
if (!can_migrate_task(p, env))
continue;
@@ -8009,6 +8035,10 @@ static struct task_struct *detach_one_task(struct lb_env *env)
schedstat_inc(env->sd->lb_gained[env->idle]);
return p;
}
+ if (++loop == 1) {
+ tasks = &env->src_rq->cfs_idle_tasks;
+ goto again;
+ }
return NULL;
}
@@ -8026,6 +8056,7 @@ static int detach_tasks(struct lb_env *env)
unsigned long util, load;
struct task_struct *p;
int detached = 0;
+ int loop = 0;
lockdep_assert_rq_held(env->src_rq);
@@ -8041,6 +8072,7 @@ static int detach_tasks(struct lb_env *env)
if (env->imbalance <= 0)
return 0;
+again:
while (!list_empty(tasks)) {
/*
* We don't want to steal all, otherwise we may be treated likewise,
@@ -8142,6 +8174,11 @@ static int detach_tasks(struct lb_env *env)
list_move(&p->se.group_node, tasks);
}
+ if (env->imbalance > 0 && ++loop == 1) {
+ tasks = &env->src_rq->cfs_idle_tasks;
+ goto again;
+ }
+
/*
* Right now, this is one of only two places we collect this stat
* so we can safely collect detach_one_task() stats here rather
@@ -11643,7 +11680,7 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first)
* Move the next running task to the front of the list, so our
* cfs_tasks list becomes MRU one.
*/
- list_move(&se->group_node, &rq->cfs_tasks);
+ adjust_rq_cfs_tasks(list_move, rq, se);
}
#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e26688d387ae..accb4eea9769 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1068,6 +1068,7 @@ struct rq {
int online;
struct list_head cfs_tasks;
+ struct list_head cfs_idle_tasks;
struct sched_avg avg_rt;
struct sched_avg avg_dl;