[RFC PATCH] sched/fair: Make tg->load_avg per node

From: Aaron Lu
Date: Mon Mar 27 2023 - 01:40:28 EST


When using sysbench to benchmark Postgres in a single docker instance
with sysbench's nr_threads set to nr_cpu, it is observed there are times
update_cfs_group() and update_load_avg() shows noticeable overhead on
cpus of one node of a 2sockets/112core/224cpu Intel Sapphire Rapids:

10.01% 9.86% [kernel.vmlinux] [k] update_cfs_group
7.84% 7.43% [kernel.vmlinux] [k] update_load_avg

While cpus of the other node normally sees a lower cycle percent:

4.46% 4.36% [kernel.vmlinux] [k] update_cfs_group
4.02% 3.40% [kernel.vmlinux] [k] update_load_avg

Annotate shows the cycles are mostly spent on accessing tg->load_avg
with update_load_avg() being the write side and update_cfs_group() being
the read side.

The reason why only cpus of one node has bigger overhead is: task_group
is allocated on demand from a slab and whichever cpu happens to do the
allocation, the allocated tg will be located on that node and accessing
to tg->load_avg will have a lower cost for cpus on the same node and
a higer cost for cpus of the remote node.

Tim Chen told me that PeterZ once mentioned a way to solve a similar
problem by making a counter per node so do the same for tg->load_avg.
After this change, the worst number I saw during a 5 minutes run from
both nodes are:

2.77% 2.11% [kernel.vmlinux] [k] update_load_avg
2.72% 2.59% [kernel.vmlinux] [k] update_cfs_group

Another observation of this workload is: it has a lot of wakeup time
task migrations and that is the reason why update_load_avg() and
update_cfs_group() shows noticeable cost. Running this workload in N
instances setup where N >= 2 with sysbench's nr_threads set to 1/N nr_cpu,
task migrations on wake up time are greatly reduced and the overhead from
the two above mentioned functions also dropped a lot. It's not clear to
me why running in multiple instances can reduce task migrations on
wakeup path yet.

Reported-by: Nitin Tekchandani <nitin.tekchandani@xxxxxxxxx>
Signed-off-by: Aaron Lu <aaron.lu@xxxxxxxxx>
---
kernel/sched/core.c | 24 +++++++++++++++++-------
kernel/sched/debug.c | 2 +-
kernel/sched/fair.c | 5 +++--
kernel/sched/sched.h | 32 ++++++++++++++++++++++++--------
4 files changed, 45 insertions(+), 18 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2a4918a1faa9..531d465038d8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -9759,9 +9759,6 @@ int in_sched_functions(unsigned long addr)
*/
struct task_group root_task_group;
LIST_HEAD(task_groups);
-
-/* Cacheline aligned slab cache for task_group */
-static struct kmem_cache *task_group_cache __read_mostly;
#endif

void __init sched_init(void)
@@ -9820,8 +9817,6 @@ void __init sched_init(void)
#endif /* CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_CGROUP_SCHED
- task_group_cache = KMEM_CACHE(task_group, 0);
-
list_add(&root_task_group.list, &task_groups);
INIT_LIST_HEAD(&root_task_group.children);
INIT_LIST_HEAD(&root_task_group.siblings);
@@ -10219,7 +10214,6 @@ static void sched_free_group(struct task_group *tg)
free_fair_sched_group(tg);
free_rt_sched_group(tg);
autogroup_free(tg);
- kmem_cache_free(task_group_cache, tg);
}

static void sched_free_group_rcu(struct rcu_head *rcu)
@@ -10241,11 +10235,27 @@ static void sched_unregister_group(struct task_group *tg)
/* allocate runqueue etc for a new task group */
struct task_group *sched_create_group(struct task_group *parent)
{
+ size_t size = sizeof(struct task_group);
+ int __maybe_unused i, nodes;
struct task_group *tg;

- tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+ nodes = num_possible_nodes();
+ size += nodes * sizeof(void *);
+ tg = kzalloc(size, GFP_KERNEL);
+ if (!tg)
+ return ERR_PTR(-ENOMEM);
+
+ for_each_node(i) {
+ tg->node_info[i] = kzalloc_node(sizeof(struct tg_node_info), GFP_KERNEL, i);
+ if (!tg->node_info[i])
+ return ERR_PTR(-ENOMEM);
+ }
+#else
+ tg = kzalloc(size, GFP_KERNEL);
if (!tg)
return ERR_PTR(-ENOMEM);
+#endif

if (!alloc_fair_sched_group(tg, parent))
goto err;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 1637b65ba07a..2f20728aa093 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -645,7 +645,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %lu\n", "tg_load_avg_contrib",
cfs_rq->tg_load_avg_contrib);
SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
- atomic_long_read(&cfs_rq->tg->load_avg));
+ tg_load_avg(cfs_rq->tg));
#endif
#endif
#ifdef CONFIG_CFS_BANDWIDTH
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0f8736991427..68ac015fab6a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3439,7 +3439,7 @@ static long calc_group_shares(struct cfs_rq *cfs_rq)

load = max(scale_load_down(cfs_rq->load.weight), cfs_rq->avg.load_avg);

- tg_weight = atomic_long_read(&tg->load_avg);
+ tg_weight = tg_load_avg(tg);

/* Ensure tg_weight >= load */
tg_weight -= cfs_rq->tg_load_avg_contrib;
@@ -3608,6 +3608,7 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
{
long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
+ int node = cpu_to_node(cfs_rq->rq->cpu);

/*
* No need to update load_avg for root_task_group as it is not used.
@@ -3616,7 +3617,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq)
return;

if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
- atomic_long_add(delta, &cfs_rq->tg->load_avg);
+ atomic_long_add(delta, &cfs_rq->tg->node_info[node]->load_avg);
cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
}
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 771f8ddb7053..11a1aed4e8f0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -365,6 +365,14 @@ struct cfs_bandwidth {
#endif
};

+struct tg_node_info {
+ /*
+ * load_avg can be heavily contended at clock tick and task
+ * enqueue/dequeue time, so put it in its own cacheline.
+ */
+ atomic_long_t load_avg ____cacheline_aligned;
+};
+
/* Task group related information */
struct task_group {
struct cgroup_subsys_state css;
@@ -379,14 +387,6 @@ struct task_group {
/* A positive value indicates that this is a SCHED_IDLE group. */
int idle;

-#ifdef CONFIG_SMP
- /*
- * load_avg can be heavily contended at clock tick time, so put
- * it in its own cacheline separated from the fields above which
- * will also be accessed at each tick.
- */
- atomic_long_t load_avg ____cacheline_aligned;
-#endif
#endif

#ifdef CONFIG_RT_GROUP_SCHED
@@ -418,8 +418,24 @@ struct task_group {
struct uclamp_se uclamp[UCLAMP_CNT];
#endif

+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+ struct tg_node_info *node_info[];
+#endif
};

+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+static inline long tg_load_avg(struct task_group *tg)
+{
+ long load_avg = 0;
+ int i;
+
+ for_each_node(i)
+ load_avg += atomic_long_read(&tg->node_info[i]->load_avg);
+
+ return load_avg;
+}
+#endif
+
#ifdef CONFIG_FAIR_GROUP_SCHED
#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD


base-commit: c9c3395d5e3dcc6daee66c6908354d47bf98cb0c
--
2.39.2