[RFC][PATCH] sched: fair-group: load_balance_monitor vs wakeups

From: Peter Zijlstra
Date: Tue Feb 12 2008 - 08:00:30 EST



The biggest issue with load_balance_monitor at the moment is that it never goes
to sleep and generates a huge number of wakeups, even on idle systems.

I tried doing a simple interruptible sleep when idle, but wake_up_process()
can't be called while holding the rq lock, so that didn't quite work out.
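
For reference, the problem in code form (a hand-waved sketch of that rejected
kthread variant, not part of the patch below; lb_monitor_task is the old
global kthread and the hook placement here is hypothetical):

	static void
	account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
	{
		/* ... existing accounting, runs with this_rq()->lock held ... */

		/*
		 * Waking the monitor from here means wake_up_process() ->
		 * task_rq_lock() on the monitor's rq, which can be the very
		 * rq->lock we are already holding.
		 */
		wake_up_process(lb_monitor_task);
	}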

The next option was turning it into a timer. This does work, however it
now runs from hardirq context, and I worry that it might be too heavy,
especially on larger boxen.

However, if we split the global lb_monitor into a per root-domain monitor,
I think it might be doable... thoughts?
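
To make that a bit more concrete, something along these lines (just a sketch;
the rd->lb_monitor member and the init/wake hooks are hypothetical and not
part of the patch below):

	struct root_domain {
		atomic_t refcount;
		cpumask_t span;
		cpumask_t online;
		/* ... */
	#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
		struct lb_monitor lb_monitor;	/* instead of the global one */
	#endif
	};

	static void init_rootdomain(struct root_domain *rd)
	{
		/* ... existing init ... */
		lb_monitor_init(&rd->lb_monitor);
	}

	/*
	 * The enqueue hook in sched_fair.c would then poke only the local
	 * root-domain's timer:
	 *
	 *	lb_monitor_wake(&rq->rd->lb_monitor);
	 *
	 * so fully idle root-domains never get their timer armed, and the
	 * hardirq work gets split up along root-domain boundaries.
	 */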

Signed-off-by: Peter Zijlstra <a.p.zijlstra@xxxxxxxxx>
---
kernel/sched.c | 268 +++++++++++++++++++++++++++++++---------------------
kernel/sched_fair.c | 3
2 files changed, 163 insertions(+), 108 deletions(-)

Index: linux-2.6/kernel/sched.c
===================================================================
--- linux-2.6.orig/kernel/sched.c
+++ linux-2.6/kernel/sched.c
@@ -337,11 +337,112 @@ static DEFINE_SPINLOCK(task_group_lock);
/* doms_cur_mutex serializes access to doms_cur[] array */
static DEFINE_MUTEX(doms_cur_mutex);

+/*
+ * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
+ */
+#ifdef CONFIG_SCHED_DEBUG
+# define const_debug __read_mostly
+#else
+# define const_debug static const
+#endif
+
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_SMP
/* kernel thread that runs rebalance_shares() periodically */
-static struct task_struct *lb_monitor_task;
-static int load_balance_monitor(void *unused);
+
+struct lb_monitor {
+ struct hrtimer timer;
+ ktime_t timeout;
+ spinlock_t lock;
+};
+
+static struct lb_monitor lb_monitor;
+
+/*
+ * How frequently should we rebalance_shares() across cpus?
+ *
+ * The more frequently we rebalance shares, the more accurate is the fairness
+ * of cpu bandwidth distribution between task groups. However higher frequency
+ * also implies increased scheduling overhead.
+ *
+ * sysctl_sched_min_bal_int_shares represents the minimum interval between
+ * consecutive calls to rebalance_shares() in the same sched domain.
+ *
+ * sysctl_sched_max_bal_int_shares represents the maximum interval between
+ * consecutive calls to rebalance_shares() in the same sched domain.
+ *
+ * These settings allow for the appropriate trade-off between accuracy of
+ * fairness and the associated overhead.
+ *
+ */
+
+/* default: 8ms, units: milliseconds */
+const_debug unsigned int sysctl_sched_min_bal_int_shares = 8;
+
+/* default: 128ms, units: milliseconds */
+const_debug unsigned int sysctl_sched_max_bal_int_shares = 128;
+
+enum {
+ shares_idle, /* nothing to balance */
+ shares_balanced, /* it was in balance */
+ shares_unbalanced, /* it needed balancing */
+};
+
+static int load_balance_shares(struct lb_monitor *lb_monitor);
+
+static enum hrtimer_restart lb_monitor_timer(struct hrtimer *timer)
+{
+ int state = shares_idle;
+ struct lb_monitor *lb_monitor =
+ container_of(timer, struct lb_monitor, timer);
+
+ for (;;) {
+ ktime_t now = hrtimer_cb_get_time(timer);
+ int overrun = hrtimer_forward(timer, now, lb_monitor->timeout);
+
+ if (!overrun)
+ break;
+
+ state = load_balance_shares(lb_monitor);
+ }
+
+ return (state == shares_idle) ? HRTIMER_NORESTART : HRTIMER_RESTART;
+}
+
+static void lb_monitor_wake(struct lb_monitor *lb_monitor)
+{
+ ktime_t now;
+
+ if (hrtimer_active(&lb_monitor->timer))
+ return;
+
+ if (nr_cpu_ids == 1)
+ return;
+
+ spin_lock(&lb_monitor->lock);
+ for (;;) {
+ if (hrtimer_active(&lb_monitor->timer))
+ break;
+
+ now = hrtimer_cb_get_time(&lb_monitor->timer);
+ hrtimer_forward(&lb_monitor->timer, now, lb_monitor->timeout);
+ hrtimer_start(&lb_monitor->timer, lb_monitor->timer.expires,
+ HRTIMER_MODE_ABS);
+ }
+ spin_unlock(&lb_monitor->lock);
+}
+
+static void lb_monitor_init(struct lb_monitor *lb_monitor)
+{
+ hrtimer_init(&lb_monitor->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ lb_monitor->timer.function = lb_monitor_timer;
+ lb_monitor->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
+
+ lb_monitor->timeout = ns_to_ktime((u64)sysctl_sched_min_bal_int_shares *
+ NSEC_PER_MSEC);
+
+ spin_lock_init(&lb_monitor->lock);
+}
#endif

static void set_se_shares(struct sched_entity *se, unsigned long shares);
@@ -699,15 +800,6 @@ static void update_rq_clock(struct rq *r
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)

/*
- * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
- */
-#ifdef CONFIG_SCHED_DEBUG
-# define const_debug __read_mostly
-#else
-# define const_debug static const
-#endif
-
-/*
* Debugging: various feature bits
*/
enum {
@@ -7157,21 +7249,6 @@ void __init sched_init_smp(void)
if (set_cpus_allowed(current, non_isolated_cpus) < 0)
BUG();
sched_init_granularity();
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
- if (nr_cpu_ids == 1)
- return;
-
- lb_monitor_task = kthread_create(load_balance_monitor, NULL,
- "group_balance");
- if (!IS_ERR(lb_monitor_task)) {
- lb_monitor_task->flags |= PF_NOFREEZE;
- wake_up_process(lb_monitor_task);
- } else {
- printk(KERN_ERR "Could not create load balance monitor thread"
- "(error = %ld) \n", PTR_ERR(lb_monitor_task));
- }
-#endif
}
#else
void __init sched_init_smp(void)
@@ -7278,8 +7355,11 @@ void __init sched_init(void)

#ifdef CONFIG_SMP
init_defrootdomain();
-#endif

+#ifdef CONFIG_FAIR_GROUP_SCHED
+ lb_monitor_init(&lb_monitor);
+#endif
+#endif
init_rt_bandwidth(&def_rt_bandwidth,
global_rt_period(), global_rt_runtime());

@@ -7513,7 +7593,7 @@ static int rebalance_shares(struct sched
struct cfs_rq *cfs_rq;
struct rq *rq = cpu_rq(this_cpu);
cpumask_t sdspan = sd->span;
- int balanced = 1;
+ int state = shares_idle;

/* Walk thr' all the task groups that we have */
for_each_leaf_cfs_rq(rq, cfs_rq) {
@@ -7529,6 +7609,8 @@ static int rebalance_shares(struct sched
if (!total_load)
continue;

+ state = max(state, shares_balanced);
+
/*
* tg->shares represents the number of cpu shares the task group
* is eligible to hold on a single cpu. On N cpus, it is
@@ -7550,108 +7632,78 @@ static int rebalance_shares(struct sched
if (local_shares == tg->se[i]->load.weight)
continue;

- spin_lock_irq(&cpu_rq(i)->lock);
+ spin_lock(&cpu_rq(i)->lock);
set_se_shares(tg->se[i], local_shares);
- spin_unlock_irq(&cpu_rq(i)->lock);
- balanced = 0;
+ spin_unlock(&cpu_rq(i)->lock);
+ state = shares_unbalanced;
}
}

- return balanced;
+ return state;
}

-/*
- * How frequently should we rebalance_shares() across cpus?
- *
- * The more frequently we rebalance shares, the more accurate is the fairness
- * of cpu bandwidth distribution between task groups. However higher frequency
- * also implies increased scheduling overhead.
- *
- * sysctl_sched_min_bal_int_shares represents the minimum interval between
- * consecutive calls to rebalance_shares() in the same sched domain.
- *
- * sysctl_sched_max_bal_int_shares represents the maximum interval between
- * consecutive calls to rebalance_shares() in the same sched domain.
- *
- * These settings allows for the appropriate trade-off between accuracy of
- * fairness and the associated overhead.
- *
- */
-
-/* default: 8ms, units: milliseconds */
-const_debug unsigned int sysctl_sched_min_bal_int_shares = 8;
-
-/* default: 128ms, units: milliseconds */
-const_debug unsigned int sysctl_sched_max_bal_int_shares = 128;
-
-/* kernel thread that runs rebalance_shares() periodically */
-static int load_balance_monitor(void *unused)
+static int load_balance_shares(struct lb_monitor *lb_monitor)
{
- unsigned int timeout = sysctl_sched_min_bal_int_shares;
- struct sched_param schedparm;
- int ret;
+ int i, cpu, state = shares_idle;
+ u64 max_timeout = (u64)sysctl_sched_max_bal_int_shares * NSEC_PER_MSEC;
+ u64 min_timeout = (u64)sysctl_sched_min_bal_int_shares * NSEC_PER_MSEC;
+ u64 timeout;

+ /* Prevent cpus going down or coming up */
+ /* get_online_cpus(); */
+ /* lockout changes to doms_cur[] array */
+ /* lock_doms_cur(); */
/*
- * We don't want this thread's execution to be limited by the shares
- * assigned to default group (init_task_group). Hence make it run
- * as a SCHED_RR RT task at the lowest priority.
- */
- schedparm.sched_priority = 1;
- ret = sched_setscheduler(current, SCHED_RR, &schedparm);
- if (ret)
- printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance"
- " monitor thread (error = %d) \n", ret);
-
- while (!kthread_should_stop()) {
- int i, cpu, balanced = 1;
+ * Enter a rcu read-side critical section to safely walk rq->sd
+ * chain on various cpus and to walk task group list
+ * (rq->leaf_cfs_rq_list) in rebalance_shares().
+ */
+ rcu_read_lock();

- /* Prevent cpus going down or coming up */
- get_online_cpus();
- /* lockout changes to doms_cur[] array */
- lock_doms_cur();
- /*
- * Enter a rcu read-side critical section to safely walk rq->sd
- * chain on various cpus and to walk task group list
- * (rq->leaf_cfs_rq_list) in rebalance_shares().
- */
- rcu_read_lock();
+ for (i = 0; i < ndoms_cur; i++) {
+ cpumask_t cpumap = doms_cur[i];
+ struct sched_domain *sd = NULL, *sd_prev = NULL;

- for (i = 0; i < ndoms_cur; i++) {
- cpumask_t cpumap = doms_cur[i];
- struct sched_domain *sd = NULL, *sd_prev = NULL;
-
- cpu = first_cpu(cpumap);
-
- /* Find the highest domain at which to balance shares */
- for_each_domain(cpu, sd) {
- if (!(sd->flags & SD_LOAD_BALANCE))
- continue;
- sd_prev = sd;
- }
+ cpu = first_cpu(cpumap);

- sd = sd_prev;
- /* sd == NULL? No load balance reqd in this domain */
- if (!sd)
+ /* Find the highest domain at which to balance shares */
+ for_each_domain(cpu, sd) {
+ if (!(sd->flags & SD_LOAD_BALANCE))
continue;
-
- balanced &= rebalance_shares(sd, cpu);
+ sd_prev = sd;
}

- rcu_read_unlock();
+ sd = sd_prev;
+ /* sd == NULL? No load balance reqd in this domain */
+ if (!sd)
+ continue;

- unlock_doms_cur();
- put_online_cpus();
+ state = max(state, rebalance_shares(sd, cpu));
+ }

- if (!balanced)
- timeout = sysctl_sched_min_bal_int_shares;
- else if (timeout < sysctl_sched_max_bal_int_shares)
+ rcu_read_unlock();
+
+ /* unlock_doms_cur(); */
+ /* put_online_cpus(); */
+
+ timeout = ktime_to_ns(lb_monitor->timeout);
+ switch (state) {
+ case shares_balanced:
+ if (timeout < max_timeout)
timeout *= 2;
+ break;

- msleep_interruptible(timeout);
+ default:
+ timeout = min_timeout;
+ break;
}
+ lb_monitor->timeout = ns_to_ktime(timeout);
+
+ printk(KERN_EMERG "load_balance_shares: %p %d\n", lb_monitor, state);

- return 0;
+ return state;
}
+
#endif /* CONFIG_SMP */

#ifdef CONFIG_FAIR_GROUP_SCHED
Index: linux-2.6/kernel/sched_fair.c
===================================================================
--- linux-2.6.orig/kernel/sched_fair.c
+++ linux-2.6/kernel/sched_fair.c
@@ -548,6 +548,9 @@ account_entity_enqueue(struct cfs_rq *cf
cfs_rq->nr_running++;
se->on_rq = 1;
list_add(&se->group_node, &cfs_rq->tasks);
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+ lb_monitor_wake(&lb_monitor);
+#endif
}

static void

