Re: [ANNOUNCEMENT] The Barbershop Load Distribution algorithm for Linux kernel scheduler.

From: Hillf Danton
Date: Mon Feb 13 2012 - 09:05:46 EST


Hello Rakib,

Just a few nitpicks.

On Mon, Feb 13, 2012 at 2:52 AM, Rakib Mullick <rakib.mullick@xxxxxxxxx> wrote:
[...]
> --- /dev/null
> +++ b/kernel/sched/bld.h
> @@ -0,0 +1,112 @@
> +#ifdef CONFIG_BLD
> +
> +static DEFINE_RWLOCK(disp_list_lock);

What is the advantage of an rwlock compared with a spinlock?

> +static LIST_HEAD(rq_head);
> +
> +static inline int list_is_first(const struct list_head *list,

Where is this helper used?

> +                               const struct list_head *head)
> +{
> +       return list == head->next;
> +}
> +
> +static inline int select_cpu_for_wakeup(struct task_struct *p, int
> sd_flags, int wake_flags)

It looks like @sd_flags is not used. Why are the arch specifics negligible?
Also, it looks like the message was corrupted (line-wrapped) by the mail agent?

> +{
> +       int cpu = smp_processor_id(), prev_cpu = task_cpu(p), i;

int this_cpu = smp_processor_id();
int prev_cpu = task_cpu(p);
int cpu;

> +       /*bool sync = wake_flags & WF_SYNC; */
> +       unsigned long load, min_load = ULONG_MAX;
> +       struct cpumask *mask;
> +
> +       if (wake_flags & WF_SYNC) {
> +               if (cpu == prev_cpu)
> +                       return cpu;
> +               mask = sched_group_cpus(cpu_rq(prev_cpu)->sd->groups);
> +       } else
> +               mask = sched_domain_span(cpu_rq(prev_cpu)->sd);
> +
> +       for_each_cpu(i, mask) {
> +               load = cpu_rq(i)->load.weight;
> +               if (load < min_load) {
> +                       min_load = load;
> +                       cpu = i;
> +               }
> +       }
> +       return cpu;
> +}
> +
> +static int bld_select_task_rq(struct task_struct *p, int sd_flags,
> int wake_flags)

Message corrupted?

> +{
> +       struct rq *tmp;
> +       unsigned long flag;
> +       unsigned int cpu = smp_processor_id();
> +
> +       if (&p->cpus_allowed) {
> +               struct cpumask *taskmask;
> +               unsigned long min_load = ULONG_MAX, load, i;
> +               taskmask = tsk_cpus_allowed(p);
> +               for_each_cpu(i, taskmask) {
> +                       load = cpu_rq(i)->load.weight;
> +                       if (load < min_load) {
> +                               min_load = load;
> +                               cpu = i;
> +                       }
> +               }
> +       } else  if (sd_flags & SD_BALANCE_WAKE) {
> +               cpu = select_cpu_for_wakeup(p, sd_flags, wake_flags);
> +               return cpu;
> +       } else {
> +               read_lock_irqsave(&disp_list_lock, flag);
> +               list_for_each_entry(tmp, &rq_head, disp_load_balance) {
> +                       cpu = cpu_of(tmp);
> +                       if (cpu_online(cpu))
> +                               break;
> +               }
> +               read_unlock_irqrestore(&disp_list_lock, flag);
> +       }
> +       return cpu;
> +}
> +
> +static void bld_track_load_activate(struct rq *rq)
> +{
> +       unsigned long  flag;
> +       rq->this_cpu_load = rq->load.weight;

Well, ->this_cpu_load looks unnecessary?

> +
> +       if (rq->pos != 2) {     /* if rq isn't the last one */
> +               struct rq *last;
> +               write_lock_irqsave(&disp_list_lock, flag);

if (rq->pos != 2)
goto out;

> +               last = list_entry(rq_head.prev, struct rq, disp_load_balance);

Could disp_list_lock serialize updating this_cpu_load?

> +               if (rq->this_cpu_load > last->this_cpu_load) {
> +                       list_del(&rq->disp_load_balance);
> +                       list_add_tail(&rq->disp_load_balance, &rq_head);
> +                       rq->pos = 2; last->pos = 1;
> +               }

out:

> +               write_unlock_irqrestore(&disp_list_lock, flag);
> +       }
> +}
> +
> +static void bld_track_load_deactivate(struct rq *rq)
> +{
> +       unsigned long flag;
> +
> +       rq->this_cpu_load = rq->load.weight;
> +
> +       if (rq->pos != 0) { /* If rq isn't first one */
> +               struct rq *first;
> +               first = list_entry(rq_head.prev, struct rq, disp_load_balance);
> +               write_lock_irqsave(&disp_list_lock, flag);
> +               if (rq->this_cpu_load <= first->this_cpu_load) {
> +                       list_del(&rq->disp_load_balance);
> +                       list_add_tail(&rq->disp_load_balance, &rq_head);
> +                       rq->pos = 0; first->pos = 1;
> +               }
> +               write_unlock_irqrestore(&disp_list_lock, flag);
> +       }
> +}
> +#else
> +static inline void bld_track_load_activate(struct rq *rq)
> +{
> +}
> +
> +static inline void bld_track_load_deactivate(struct rq *rq)
> +{
> +}
> +#endif /* CONFIG_BLD */
> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
> index 5255c9d..cff20e1 100644
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -24,6 +24,8 @@
> Â* Â2007-07-01 ÂGroup scheduling enhancements by Srivatsa Vaddagiri
> Â* Â2007-11-29 ÂRT balancing improvements by Steven Rostedt, Gregory Haskins,
> Â* Â Â Â Â Â Â ÂThomas Gleixner, Mike Kravetz
> + * Â2012-Feb  The Barbershop Load Distribution (BLD) algorithm, an alternate
> + * Â Â Â Â Â Â load distribution algorithm by Rakib Mullick.
> Â*/
>
> Â#include <linux/mm.h>
> @@ -81,6 +83,7 @@
>
> Â#include "sched.h"
> Â#include "../workqueue_sched.h"
> +#include "bld.h"
>
> Â#define CREATE_TRACE_POINTS
> Â#include <trace/events/sched.h>
> @@ -578,6 +581,7 @@ unlock:
> Â*/
> Âvoid wake_up_idle_cpu(int cpu)
> Â{
> +#ifndef CONFIG_BLD
> Â Â Â Âstruct rq *rq = cpu_rq(cpu);
>
> Â Â Â Âif (cpu == smp_processor_id())
> @@ -604,6 +608,7 @@ void wake_up_idle_cpu(int cpu)
> Â Â Â Âsmp_mb();
> Â Â Â Âif (!tsk_is_polling(rq->idle))
> Â Â Â Â Â Â Â Âsmp_send_reschedule(cpu);
> +#endif
> Â}
>
> Âstatic inline bool got_nohz_idle_kick(void)
> @@ -730,6 +735,7 @@ void activate_task(struct rq *rq, struct
> task_struct *p, int flags)
>                 rq->nr_uninterruptible--;
>
>         enqueue_task(rq, p, flags);
> +       bld_track_load_activate(rq);

Would it look better if sorting the rq were folded into enqueue_task()?

> Â}
>
> Âvoid deactivate_task(struct rq *rq, struct task_struct *p, int flags)
> @@ -738,6 +744,7 @@ void deactivate_task(struct rq *rq, struct
> task_struct *p, int flags)
> Â Â Â Â Â Â Â Ârq->nr_uninterruptible++;
>
> Â Â Â Âdequeue_task(rq, p, flags);
> + Â Â Â bld_track_load_deactivate(rq);
> Â}
>
> Â#ifdef CONFIG_IRQ_TIME_ACCOUNTING
> @@ -1297,7 +1304,12 @@ static int select_fallback_rq(int cpu, struct
> task_struct *p)
>  static inline
>  int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
>  {
> -       int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
> +       int cpu;
> +#ifdef CONFIG_BLD
> +       cpu = bld_select_task_rq(p, sd_flags, wake_flags);

What if @p is RT?

> +#else
> + Â Â Â cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
> +#endif
>
> Â Â Â Â/*
> Â Â Â Â * In order not to call set_task_cpu() on a blocking task we need
> @@ -1453,7 +1465,11 @@ static void sched_ttwu_pending(void)
>
> Âvoid scheduler_ipi(void)
> Â{
> +#ifndef CONFIG_BLD
> Â Â Â Âif (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
> +#else
> + Â Â Â if (llist_empty(&this_rq()->wake_list))
> +#endif
> Â Â Â Â Â Â Â Âreturn;
>
> Â Â Â Â/*
> @@ -1475,10 +1491,12 @@ void scheduler_ipi(void)
> Â Â Â Â/*
> Â Â Â Â * Check if someone kicked us for doing the nohz idle load balance.
> Â Â Â Â */
> +#ifndef CONFIG_BLD
> Â Â Â Âif (unlikely(got_nohz_idle_kick() && !need_resched())) {
> Â Â Â Â Â Â Â Âthis_rq()->idle_balance = 1;
> Â Â Â Â Â Â Â Âraise_softirq_irqoff(SCHED_SOFTIRQ);
> Â Â Â Â}
> +#endif
> Â Â Â Âirq_exit();
> Â}
>
> @@ -1518,12 +1536,14 @@ static void ttwu_queue(struct task_struct *p, int cpu)
> Â Â Â Âstruct rq *rq = cpu_rq(cpu);
>
> Â#if defined(CONFIG_SMP)
> +#ifndef CONFIG_BLD
> Â Â Â Âif (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) {
> Â Â Â Â Â Â Â Âsched_clock_cpu(cpu); /* sync clocks x-cpu */
> Â Â Â Â Â Â Â Âttwu_queue_remote(p, cpu);
> Â Â Â Â Â Â Â Âreturn;
> Â Â Â Â}
> Â#endif
> +#endif
>
> Â Â Â Âraw_spin_lock(&rq->lock);
> Â Â Â Âttwu_do_activate(rq, p, 0);
> @@ -2269,6 +2289,7 @@ calc_load_n(unsigned long load, unsigned long exp,
> Â*/
> Âstatic void calc_global_nohz(unsigned long ticks)
> Â{
> +#ifndef CONFIG_BLD
> Â Â Â Âlong delta, active, n;
>
> Â Â Â Âif (time_before(jiffies, calc_load_update))
> @@ -2310,6 +2331,7 @@ static void calc_global_nohz(unsigned long ticks)
> Â Â Â Â * age us 4 cycles, and the test in calc_global_load() will
> Â Â Â Â * pick up the final one.
> Â Â Â Â */
> +#endif
> Â}
> Â#else
> Âvoid calc_load_account_idle(struct rq *this_rq)
> @@ -3003,8 +3025,10 @@ void scheduler_tick(void)
>
> Â#ifdef CONFIG_SMP
> Â Â Â Ârq->idle_balance = idle_cpu(cpu);
> +#ifndef CONFIG_BLD
> Â Â Â Âtrigger_load_balance(rq, cpu);
> Â#endif
> +#endif
> Â}
>
> Ânotrace unsigned long get_parent_ip(unsigned long addr)
> @@ -3194,8 +3218,10 @@ need_resched:
>
> Â Â Â Âpre_schedule(rq, prev);
>
> +#ifndef CONFIG_BLD
> Â Â Â Âif (unlikely(!rq->nr_running))
> Â Â Â Â Â Â Â Âidle_balance(cpu, rq);
> +#endif
>
> Â Â Â Âput_prev_task(rq, prev);
> Â Â Â Ânext = pick_next_task(rq);
> @@ -6938,6 +6964,11 @@ void __init sched_init(void)
> Â#endif
> Â Â Â Â Â Â Â Âinit_rq_hrtick(rq);
> Â Â Â Â Â Â Â Âatomic_set(&rq->nr_iowait, 0);
> +#ifdef CONFIG_BLD
> + Â Â Â Â Â Â Â INIT_LIST_HEAD(&rq->disp_load_balance);
> + Â Â Â Â Â Â Â list_add_tail(&rq->disp_load_balance, &rq_head);
> + Â Â Â Â Â Â Â rq->pos = 0;
> +#endif
> Â Â Â Â}
>
> Â Â Â Âset_load_weight(&init_task);
> diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
> index 7c6414f..f2624ce 100644
> --- a/kernel/sched/fair.c
> +++ b/kernel/sched/fair.c
> @@ -5609,7 +5609,9 @@ void print_cfs_stats(struct seq_file *m, int cpu)
> Â__init void init_sched_fair_class(void)
> Â{
> Â#ifdef CONFIG_SMP
> +#ifndef CONFIG_BLD
> Â Â Â Âopen_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
> +#endif /* BLD */
>
> Â#ifdef CONFIG_NO_HZ
> Â Â Â Âzalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
> index 98c0c26..bd7e4c6 100644
> --- a/kernel/sched/sched.h
> +++ b/kernel/sched/sched.h
> @@ -474,6 +474,17 @@ struct rq {
> Â#ifdef CONFIG_SMP
> Â Â Â Âstruct llist_head wake_list;
> Â#endif
> +#ifdef CONFIG_BLD
> + Â Â Â unsigned long this_cpu_load;
> + Â Â Â struct list_head disp_load_balance;
> + Â Â Â /* It indicates whether, rq is first or last
> + Â Â Â Â* or in the middle based on load from rq_head.
> + Â Â Â Â* 0 - First rq
> + Â Â Â Â* 1 - rq stays middle
> + Â Â Â Â* 2 - last rq
> + Â Â Â Â*/
> + Â Â Â char pos;
> +#endif
> Â};
>
> Âstatic inline int cpu_of(struct rq *rq)
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at Âhttp://vger.kernel.org/majordomo-info.html
> Please read the FAQ at Âhttp://www.tux.org/lkml/
>
>
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/