[..snip..]
@@ -6197,24 +6201,44 @@ static inline int __select_idle_cpu(int cpu, struct task_struct *p)
DEFINE_STATIC_KEY_FALSE(sched_smt_present);
EXPORT_SYMBOL_GPL(sched_smt_present);
-static inline void set_idle_cores(int cpu, int val)
+static inline void sd_set_state(int cpu, enum sd_state state)
Nit: We are setting the state of only the LLC domain and not any other
domain via this function. So should we name them
set_llc_state()/get_llc_state() for better readability? (Rough sketch
below this hunk.)
{
struct sched_domain_shared *sds;
sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
if (sds)
- WRITE_ONCE(sds->has_idle_cores, val);
+ WRITE_ONCE(sds->state, state);
}
-static inline bool test_idle_cores(int cpu)
+static inline enum sd_state sd_get_state(int cpu)
{
struct sched_domain_shared *sds;
sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
if (sds)
- return READ_ONCE(sds->has_idle_cores);
+ return READ_ONCE(sds->state);
- return false;
+ return sd_has_icpus;
+}
+
+static inline void set_idle_cores(int cpu, int idle)
I agree with Josh. We can use core_idle instead of idle here.
+{
+ sd_set_state(cpu, idle ? sd_has_icores : sd_has_icpus);
+}
+
+static inline bool test_idle_cores(int cpu)
+{
+ return sd_get_state(cpu) == sd_has_icores;
+}
+
+static inline void set_idle_cpus(int cpu, int idle)
+{
+ sd_set_state(cpu, idle ? sd_has_icpus : sd_is_busy);
+}
+
+static inline bool test_idle_cpus(int cpu)
+{
+ return sd_get_state(cpu) != sd_is_busy;
}
/*
[...]
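For completeness, here is a rough sketch of the rename I have in mind
(same bodies as in the hunk above, only the names change):

static inline void set_llc_state(int cpu, enum sd_state state)
{
	struct sched_domain_shared *sds;

	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
	if (sds)
		WRITE_ONCE(sds->state, state);
}

static inline enum sd_state get_llc_state(int cpu)
{
	struct sched_domain_shared *sds;

	sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
	if (sds)
		return READ_ONCE(sds->state);

	return sd_has_icpus;
}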
@@ -8661,6 +8702,12 @@ sched_asym(struct lb_env *env, struct sd_lb_stats *sds, struct sg_lb_stats *sgs
return sched_asym_prefer(env->dst_cpu, group->asym_prefer_cpu);
}
+static inline void sd_classify(struct sd_lb_stats *sds, struct rq *rq)
+{
+ if (sds->sd_state != sd_has_icpus && unoccupied_rq(rq))
Nit: sds->sd_state can either be sd_has_icpus or sd_is_busy here. So for
better readability, we can just use the positive check:

	if (sds->sd_state == sd_is_busy && unoccupied_rq(rq))
		sds->sd_state = sd_has_icpus;
+ sds->sd_state = sd_has_icpus;
+}
+
/**
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
@@ -8675,11 +8722,12 @@ static inline void update_sg_lb_stats(struct lb_env *env,
struct sg_lb_stats *sgs,
int *sg_status)
{
- int i, nr_running, local_group;
+ int i, nr_running, local_group, update_core;
memset(sgs, 0, sizeof(*sgs));
local_group = group == sds->local;
+ update_core = env->sd->flags & SD_SHARE_CPUCAPACITY;
for_each_cpu_and(i, sched_group_span(group), env->cpus) {
struct rq *rq = cpu_rq(i);
@@ -8692,6 +8740,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
nr_running = rq->nr_running;
sgs->sum_nr_running += nr_running;
+ if (update_core)
+ sd_classify(sds, rq);
+
if (nr_running > 1)
*sg_status |= SG_OVERLOAD;
@@ -9220,6 +9271,12 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
return idlest;
}
+static void sd_update_state(struct lb_env *env, struct sd_lb_stats *sds)
+{
+ if (sds->sd_state == sd_has_icpus && !test_idle_cpus(env->dst_cpu))
+ set_idle_cpus(env->dst_cpu, true);
We could enter this if condition when env->dst_cpu is the only idle
CPU in the SMT domain (which is likely to be the case every time we do
a NEW_IDLE balance). By the end of this load-balancing round,
env->dst_cpu may pull a task from some other CPU and thus no longer be
idle, but the LLC state would still be sd_has_icpus.

That would mean some CPU on this LLC does a full scan during wakeup
only to find no idle CPU and reset the state to sd_is_busy. Have you
seen instances where this false-positive pattern results in wasteful
scans and thereby causes a performance degradation? Ideally it should
not be worse than what we currently have.
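If it does show up in practice, one (untested) idea to narrow the
window would be to not let env->dst_cpu's own rq promote the state
during a NEW_IDLE balance, at the cost of sometimes hiding a dst_cpu
that stays idle. sd_classify() would need the env passed in for that,
roughly:

static inline void sd_classify(struct lb_env *env, struct sd_lb_stats *sds,
			       struct rq *rq)
{
	/*
	 * Skip the newidle-balancing CPU itself: it may pull a task
	 * before this round ends, which would immediately invalidate
	 * the sd_has_icpus hint it is about to publish.
	 */
	if (env->idle == CPU_NEWLY_IDLE && cpu_of(rq) == env->dst_cpu)
		return;

	if (sds->sd_state == sd_is_busy && unoccupied_rq(rq))
		sds->sd_state = sd_has_icpus;
}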
Apart from this, patch looks good to me.
It would be worthwhile to explore whether the LLC state can be used
early in select_task_rq_fair() to decide between doing a wake-affine
and letting the task stick to its previous LLC, depending on which of
the previous LLC and the waker's LLC has an idle CPU.
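Something very rough along these lines; cpus_share_cache() is the
existing helper and test_idle_cpus() is from this patch, while the
function name and its placement are only meant to illustrate the idea:

/*
 * Prefer the previous LLC over wake-affine when the previous LLC is
 * known to have idle CPUs while the waker's LLC is not.
 */
static bool wake_prefer_prev_llc(int waker_cpu, int prev_cpu)
{
	if (cpus_share_cache(waker_cpu, prev_cpu))
		return false;

	return test_idle_cpus(prev_cpu) && !test_idle_cpus(waker_cpu);
}

select_task_rq_fair() could then consult this before evaluating
want_affine in the wakeup path.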