Hi Abel,
+static inline bool cfs_rq_busy(struct rq *rq)
+{
+ return rq->cfs.h_nr_running - rq->cfs.idle_h_nr_running == 1;
+}
+
+static inline bool need_pull_cfs_task(struct rq *rq)
+{
+ return rq->cfs.h_nr_running == rq->cfs.idle_h_nr_running;
+}
Note that this will also return true if there are 0 tasks, which I
don't think is the semantics you intend for its use in
rebalance_domains() below.
/*
* Use locality-friendly rq->overloaded to cache the status of the rq
* to minimize the heavy cost on LLC shared data.
@@ -7837,6 +7867,22 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
if (kthread_is_per_cpu(p))
return 0;
+ if (unlikely(task_h_idle(p))) {
+ /*
+ * Disregard hierarchically idle tasks during sched-idle
+ * load balancing.
+ */
+ if (env->idle == CPU_SCHED_IDLE)
+ return 0;
+ } else if (!static_branch_unlikely(&sched_asym_cpucapacity)) {
+ /*
+ * It's not gonna help if stacking non-idle tasks on one
+ * cpu while leaving some idle.
+ */
+ if (cfs_rq_busy(env->src_rq) && !need_pull_cfs_task(env->dst_rq))
+ return 0;
These checks don't involve the task at all, so this kind of check
should be pushed into the more general load balance function. But, I'm
not totally clear on the motivation here. If we have cpu A with 1
non-idle task and 100 idle tasks, and cpu B with 1 non-idle task, we
should definitely try to load balance some of the idle tasks from A to
B. idle tasks _do_ get time to run (although little), and this can add
up and cause antagonism to the non-idle task if there are a lot of
idle threads.
/*
+ * The sched-idle balancing tries to make full use of cpu capacity
+ * for non-idle tasks by pulling them for the unoccupied cpus from
+ * the overloaded ones.
+ *
+ * Return 1 if pulled successfully, 0 otherwise.
+ */
+static int sched_idle_balance(struct rq *dst_rq)
+{
+ struct sched_domain *sd;
+ struct task_struct *p = NULL;
+ int dst_cpu = cpu_of(dst_rq), cpu;
+
+ sd = rcu_dereference(per_cpu(sd_llc, dst_cpu));
+ if (unlikely(!sd))
+ return 0;
+
+ if (!atomic_read(&sd->shared->nr_overloaded))
+ return 0;
+
+ for_each_cpu_wrap(cpu, sdo_mask(sd->shared), dst_cpu + 1) {
+ struct rq *rq = cpu_rq(cpu);
+ struct rq_flags rf;
+ struct lb_env env;
+
+ if (cpu == dst_cpu || !cfs_rq_overloaded(rq) ||
+ READ_ONCE(rq->sched_idle_balance))
+ continue;
+
+ WRITE_ONCE(rq->sched_idle_balance, 1);
+ rq_lock_irqsave(rq, &rf);
+
+ env = (struct lb_env) {
+ .sd = sd,
+ .dst_cpu = dst_cpu,
+ .dst_rq = dst_rq,
+ .src_cpu = cpu,
+ .src_rq = rq,
+ .idle = CPU_SCHED_IDLE, /* non-idle only */
+ .flags = LBF_DST_PINNED, /* pin dst_cpu */
+ };
+
+ update_rq_clock(rq);
+ p = detach_one_task(&env);
+ if (p)
+ update_overload_status(rq);
+
+ rq_unlock(rq, &rf);
+ WRITE_ONCE(rq->sched_idle_balance, 0);
+
+ if (p) {
+ attach_one_task(dst_rq, p);
+ local_irq_restore(rf.flags);
+ return 1;
+ }
+
+ local_irq_restore(rf.flags);
+ }
+
+ return 0;
+}
I think this could probably be integrated with the load balancing
function. Your goal is ignore idle tasks for the purpose of pulling
from a remote rq. And I think the above isn't exactly what you want
anyway; detach_tasks/detach_one_task are just going to iterate the
task list in order. You want to actually look for the non-idle tasks
explicitly.
@@ -10996,9 +11119,9 @@ static int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
if (sd->flags & SD_BALANCE_NEWIDLE) {
- pulled_task = load_balance(this_cpu, this_rq,
- sd, CPU_NEWLY_IDLE,
- &continue_balancing);
+ pulled_task |= load_balance(this_cpu, this_rq,
+ sd, CPU_NEWLY_IDLE,
+ &continue_balancing);
Why |= ?