Re: [PATCH 1/2] sched/fair: Use task_groups instead of leaf_cfs_rq_list to walk all cfs_rqs

From: Peter Zijlstra
Date: Mon May 01 2017 - 12:59:55 EST


On Tue, Apr 25, 2017 at 05:40:39PM -0700, Tejun Heo wrote:
> @@ -9355,24 +9366,18 @@ void online_fair_sched_group(struct task
> void unregister_fair_sched_group(struct task_group *tg)
> {
> unsigned long flags;
> - struct rq *rq;
> int cpu;
>
> for_each_possible_cpu(cpu) {
> + struct rq *rq = cpu_rq(cpu);
> + struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
> +
> if (tg->se[cpu])
> remove_entity_load_avg(tg->se[cpu]);
>
> - /*
> - * Only empty task groups can be destroyed; so we can speculatively
> - * check on_list without danger of it being re-added.
> - */
> - if (!tg->cfs_rq[cpu]->on_list)
> - continue;
> -
> - rq = cpu_rq(cpu);
> -
> raw_spin_lock_irqsave(&rq->lock, flags);
> - list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
> + list_del_leaf_cfs_rq(cfs_rq);
> + cfs_rq->online = 0;
> raw_spin_unlock_irqrestore(&rq->lock, flags);
> }
> }

It would be nice to be able to retain that on_list check and avoid
taking all those rq->locks.

Since, per the previous mail, those online/offline loops are protected by
rq->lock, all we need is to ensure an rcu_sched grace period (GP) passes
between here and sched_free_group().

Which I think we can do differently, see below. Then we don't need the
->online thing and can keep using the ->on_list check.


> @@ -9523,11 +9528,11 @@ const struct sched_class fair_sched_clas
> #ifdef CONFIG_SCHED_DEBUG
> void print_cfs_stats(struct seq_file *m, int cpu)
> {
> - struct cfs_rq *cfs_rq;
> + struct task_group *tg;
>
> rcu_read_lock();
> - for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
> - print_cfs_rq(m, cpu, cfs_rq);
> + list_for_each_entry_rcu(tg, &task_groups, list)
> + print_cfs_rq(m, cpu, tg->cfs_rq[cpu]);
> rcu_read_unlock();
> }

If you have a gazillion empty groups, do we really want to print them?



diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h
index 74d9c3a1feee..8bad2c371b18 100644
--- a/include/linux/rcutiny.h
+++ b/include/linux/rcutiny.h
@@ -58,6 +58,11 @@ static inline void cond_synchronize_sched(unsigned long oldstate)
might_sleep();
}

+static inline bool should_synchronize_sched(unsigned long oldstate)
+{
+ return true;
+}
+
extern void rcu_barrier_bh(void);
extern void rcu_barrier_sched(void);

diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 0bacb6b2af69..9918b632db27 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -78,6 +78,7 @@ unsigned long get_state_synchronize_rcu(void);
void cond_synchronize_rcu(unsigned long oldstate);
unsigned long get_state_synchronize_sched(void);
void cond_synchronize_sched(unsigned long oldstate);
+bool should_synchronize_sched(unsigned long oldstate);

extern unsigned long rcutorture_testseq;
extern unsigned long rcutorture_vernum;
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 91fff49d5869..59eb0d115217 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3423,6 +3423,18 @@ void cond_synchronize_sched(unsigned long oldstate)
}
EXPORT_SYMBOL_GPL(cond_synchronize_sched);

+bool should_synchronize_sched(unsigned long oldstate)
+{
+ unsigned long newstate;
+
+ /*
+ * Ensure that this load happens before any RCU-destructive
+ * actions the caller might carry out after we return.
+ */
+ newstate = smp_load_acquire(&rcu_sched_state.completed);
+ return ULONG_CMP_GE(oldstate, newstate);
+}
+
/*
* Check to see if there is any immediate RCU-related work to be done
* by the current CPU, for the specified type of RCU, returning 1 if so.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ecad8e2d2a29..d7630b34d197 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6362,7 +6362,7 @@ void ia64_set_curr_task(int cpu, struct task_struct *p)
/* task_group_lock serializes the addition/removal of task groups */
static DEFINE_SPINLOCK(task_group_lock);

-static void sched_free_group(struct task_group *tg)
+static void __sched_free_group(struct task_group *tg)
{
free_fair_sched_group(tg);
free_rt_sched_group(tg);
@@ -6370,6 +6370,22 @@ static void sched_free_group(struct task_group *tg)
kmem_cache_free(task_group_cache, tg);
}

+static void sched_free_group_rcu_sched(struct rcu_head *rhp)
+{
+ /* Now it should be safe to free those cfs_rqs: */
+ __sched_free_group(container_of(rhp, struct task_group, rcu));
+}
+
+static void sched_free_group(struct task_group *tg)
+{
+ if (should_synchronize_sched(tg->rcu_stamp)) {
+ call_rcu_sched(&tg->rcu, sched_free_group_rcu_sched);
+ return;
+ }
+
+ __sched_free_group(tg);
+}
+
/* allocate runqueue etc for a new task group */
struct task_group *sched_create_group(struct task_group *parent)
{
@@ -6434,6 +6450,8 @@ void sched_offline_group(struct task_group *tg)
list_del_rcu(&tg->list);
list_del_rcu(&tg->siblings);
spin_unlock_irqrestore(&task_group_lock, flags);
+
+ tg->rcu_stamp = get_state_synchronize_sched();
}

static void sched_change_group(struct task_struct *tsk, int type)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b2e5e9e64c86..1dbab563e798 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -302,6 +302,7 @@ struct task_group {
#endif

struct rcu_head rcu;
+ long rcu_stamp;
struct list_head list;

struct task_group *parent;