Re: [PATCH v2 2/3] sched/fair: Move hot load_avg into its own cacheline

From: bsegall
Date: Wed Dec 02 2015 - 15:02:41 EST


Waiman Long <Waiman.Long@xxxxxxx> writes:

> If a system with large number of sockets was driven to full
> utilization, it was found that the clock tick handling occupied a
> rather significant proportion of CPU time when fair group scheduling
> and autogroup were enabled.
>
> Running a java benchmark on a 16-socket IvyBridge-EX system, the perf
> profile looked like:
>
> 10.52% 0.00% java [kernel.vmlinux] [k] smp_apic_timer_interrupt
> 9.66% 0.05% java [kernel.vmlinux] [k] hrtimer_interrupt
> 8.65% 0.03% java [kernel.vmlinux] [k] tick_sched_timer
> 8.56% 0.00% java [kernel.vmlinux] [k] update_process_times
> 8.07% 0.03% java [kernel.vmlinux] [k] scheduler_tick
> 6.91% 1.78% java [kernel.vmlinux] [k] task_tick_fair
> 5.24% 5.04% java [kernel.vmlinux] [k] update_cfs_shares
>
> In particular, the high CPU time consumed by update_cfs_shares()
> was mostly due to contention on the cacheline that contained the
> task_group's load_avg statistical counter. This cacheline may also
> contains variables like shares, cfs_rq & se which are accessed rather
> frequently during clock tick processing.
>
> This patch moves the load_avg variable into another cacheline
> separated from the other frequently accessed variables. It also
> creates a cacheline aligned kmemcache for task_group to make sure
> that all the allocated task_group's are cacheline aligned.
>
> By doing so, the perf profile became:
>
> 9.44% 0.00% java [kernel.vmlinux] [k] smp_apic_timer_interrupt
> 8.74% 0.01% java [kernel.vmlinux] [k] hrtimer_interrupt
> 7.83% 0.03% java [kernel.vmlinux] [k] tick_sched_timer
> 7.74% 0.00% java [kernel.vmlinux] [k] update_process_times
> 7.27% 0.03% java [kernel.vmlinux] [k] scheduler_tick
> 5.94% 1.74% java [kernel.vmlinux] [k] task_tick_fair
> 4.15% 3.92% java [kernel.vmlinux] [k] update_cfs_shares
>
> The %cpu time is still pretty high, but it is better than before. The
> benchmark results before and after the patch was as follows:
>
> Before patch - Max-jOPs: 907533 Critical-jOps: 134877
> After patch - Max-jOPs: 916011 Critical-jOps: 142366
>
> Signed-off-by: Waiman Long <Waiman.Long@xxxxxxx>
> ---
> kernel/sched/core.c | 36 ++++++++++++++++++++++++++++++++++--
> kernel/sched/sched.h | 7 ++++++-
> 2 files changed, 40 insertions(+), 3 deletions(-)
>
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 4d568ac..e39204f 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -7331,6 +7331,11 @@ int in_sched_functions(unsigned long addr)
> */
> struct task_group root_task_group;
> LIST_HEAD(task_groups);
> +
> +#ifdef CONFIG_FAIR_GROUP_SCHED
> +/* Cacheline aligned slab cache for task_group */
> +static struct kmem_cache *task_group_cache __read_mostly;
> +#endif
> #endif
>
> DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
> @@ -7356,6 +7361,7 @@ void __init sched_init(void)
> root_task_group.cfs_rq = (struct cfs_rq **)ptr;
> ptr += nr_cpu_ids * sizeof(void **);
>
> + task_group_cache = KMEM_CACHE(task_group, SLAB_HWCACHE_ALIGN);

The KMEM_CACHE macro suggests instead adding
____cacheline_aligned_in_smp to the struct definition instead.

> #endif /* CONFIG_FAIR_GROUP_SCHED */
> #ifdef CONFIG_RT_GROUP_SCHED
> root_task_group.rt_se = (struct sched_rt_entity **)ptr;
> @@ -7668,12 +7674,38 @@ void set_curr_task(int cpu, struct task_struct *p)
> /* task_group_lock serializes the addition/removal of task groups */
> static DEFINE_SPINLOCK(task_group_lock);
>
> +/*
> + * Make sure that the task_group structure is cacheline aligned when
> + * fair group scheduling is enabled.
> + */
> +#ifdef CONFIG_FAIR_GROUP_SCHED
> +static inline struct task_group *alloc_task_group(void)
> +{
> + return kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
> +}
> +
> +static inline void free_task_group(struct task_group *tg)
> +{
> + kmem_cache_free(task_group_cache, tg);
> +}
> +#else /* CONFIG_FAIR_GROUP_SCHED */
> +static inline struct task_group *alloc_task_group(void)
> +{
> + return kzalloc(sizeof(struct task_group), GFP_KERNEL);
> +}
> +
> +static inline void free_task_group(struct task_group *tg)
> +{
> + kfree(tg);
> +}
> +#endif /* CONFIG_FAIR_GROUP_SCHED */
> +
> static void free_sched_group(struct task_group *tg)
> {
> free_fair_sched_group(tg);
> free_rt_sched_group(tg);
> autogroup_free(tg);
> - kfree(tg);
> + free_task_group(tg);
> }
>
> /* allocate runqueue etc for a new task group */
> @@ -7681,7 +7713,7 @@ struct task_group *sched_create_group(struct task_group *parent)
> {
> struct task_group *tg;
>
> - tg = kzalloc(sizeof(*tg), GFP_KERNEL);
> + tg = alloc_task_group();
> if (!tg)
> return ERR_PTR(-ENOMEM);
>
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index efd3bfc..e679895 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -248,7 +248,12 @@ struct task_group {
> unsigned long shares;
>
> #ifdef CONFIG_SMP
> - atomic_long_t load_avg;
> + /*
> + * load_avg can be heavily contended at clock tick time, so put
> + * it in its own cacheline separated from the fields above which
> + * will also be accessed at each tick.
> + */
> + atomic_long_t load_avg ____cacheline_aligned;
> #endif
> #endif

I suppose the question is if it would be better to just move this to
wind up on a separate cacheline without the extra empty space, though it
would likely be more fragile and unclear.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/