[PATCH -cfs] Fix cpu_load calculation error
From: Srivatsa Vaddagiri
Date: Mon Jun 18 2007 - 12:52:55 EST
Ingo/Dmitry,
I am resending the patch to fix the cpu_load calculation error in
cfs v17, this time CCing lkml.
On a related load-balance note, there is still a lingering thought in my mind,
though I am not sure whether it is something we need to worry about at the moment.
Load balancing between CPUs is based on the gross weight imbalance between
the two, i.e. the weight imbalance of all classes put together on both CPUs.
When move_tasks() is invoked to move this 'imbalance' amount of weight, can it
end up moving that entire 'imbalance' weight from a single class only, leaving
the other classes unbalanced?
As an example, consider CPU0 and CPU1's initial load to be:
(before balance)
CPU0_rq->cpu_load = 20480 (5*2048 rt load + 10*1024 normal load)
CPU1_rq->cpu_load = 10240 (5*2048 rt load + 0 normal load)
This leads to a load imbalance of 10240. When CPU1 attempts to balance by
pulling some of this 'imbalance' weight (say wt_diff/2 = 5120), could it
potentially pull all of that weight from the real-time class alone?
(after balance)
CPU0_rq->cpu_load = 14336 (2*2048 rt load + 10*1024 normal load)
CPU1_rq->cpu_load = 16384 (8*2048 rt load + 0 normal load)
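To make the concern concrete, here is a rough userspace sketch (my own
illustration, not kernel code, and not how move_tasks() actually iterates)
that reproduces the numbers above, using the per-task weights from the
example (2048 for an RT task, 1024 for a nice-0 task):

#include <stdio.h>

#define RT_WEIGHT	2048
#define NICE0_WEIGHT	1024

int main(void)
{
	unsigned long cpu0 = 5 * RT_WEIGHT + 10 * NICE0_WEIGHT;	/* 20480 */
	unsigned long cpu1 = 5 * RT_WEIGHT;				/* 10240 */
	unsigned long target = (cpu0 - cpu1) / 2;			/*  5120 */
	unsigned long moved = 0;
	int rt_pulled = 0;

	/* Nothing in the gross-weight target stops us pulling RT tasks only. */
	while (moved < target) {
		moved += RT_WEIGHT;
		rt_pulled++;
	}

	printf("pulled %d rt tasks (weight %lu), 0 normal tasks\n",
	       rt_pulled, moved);
	printf("after: cpu0 = %lu, cpu1 = %lu\n", cpu0 - moved, cpu1 + moved);
	return 0;
}

Run as-is it pulls three RT tasks (6144 weight) and no normal tasks, ending up
with exactly the 14336/16384 split shown above.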
Basically, do we also need to take class imbalance into account in
sched_rt.c:load_balance()?
I have CCed some folks from the real-time side who may hit us if they think
this needs to be fixed (from the real-time perspective at least!).
The v17 cpu_load calculation did not take into account that a class's
delta_exec/delta_fair may be stale because the class did not get an
opportunity to run, which can lead to an incorrect load calculation for that
class. This patch removes load tracking from the individual classes and
instead introduces global per-cpu load tracking, common to all classes, in
kernel/sched.c itself.
Thanks to Dmitry for suggesting this idea.
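For reference, the smoothed per-cpu load that the new update_cpu_load()
derives from the delta_exec/delta_fair pair boils down to roughly the
following fixed-point math. This is a userspace sketch of the formula only,
not the kernel function itself; TICK_NSEC here assumes HZ=1000, and
SCHED_LOAD_SCALE is taken as 1024:

#include <stdio.h>
#include <stdint.h>

#define SCHED_LOAD_SCALE	1024ULL
#define TICK_NSEC		1000000ULL	/* assumes HZ=1000 */

/*
 * load ~= SCHED_LOAD_SCALE * (delta_exec / delta_fair)
 *			     * (delta_exec / sample_interval)
 *
 * delta_exec/delta_fair approximates the average weight that was runnable,
 * delta_exec/sample_interval the fraction of the interval the cpu was busy.
 */
static uint64_t smoothed_load(uint64_t delta_exec, uint64_t delta_fair,
			      uint64_t sample_interval)
{
	/* +1 as in the patch, to avoid a zero divisor */
	delta_exec += 1;
	delta_fair += 1;

	if (sample_interval < TICK_NSEC)
		sample_interval = TICK_NSEC;
	if (delta_exec > sample_interval)
		delta_exec = sample_interval;

	return SCHED_LOAD_SCALE * delta_exec / delta_fair
				* delta_exec / sample_interval;
}

int main(void)
{
	/* cpu 100% busy for 3 ticks, fair clock advancing at 1/3 the rate: */
	printf("%llu\n", (unsigned long long)
	       smoothed_load(3 * TICK_NSEC, TICK_NSEC, 3 * TICK_NSEC));
	return 0;
}

That prints 3071, i.e. roughly 3*SCHED_LOAD_SCALE, which is what you would
expect for a cpu that was fully busy with an average runnable weight of three
nice-0 tasks.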
Signed-off-by: Srivatsa Vaddagiri <vatsa@xxxxxxxxxxxxxxxxxx>
---
include/linux/sched.h | 2
kernel/sched.c | 177 +++++++++++++++++++++++++++++++++-----------------
kernel/sched_debug.c | 6 -
kernel/sched_fair.c | 49 +------------
kernel/sched_rt.c | 7 -
5 files changed, 126 insertions(+), 115 deletions(-)
Index: current/kernel/sched.c
===================================================================
--- current.orig/kernel/sched.c 2007-06-15 22:50:33.000000000 +0530
+++ current/kernel/sched.c 2007-06-15 23:31:22.000000000 +0530
@@ -116,14 +116,18 @@
struct list_head queue[MAX_RT_PRIO];
};
+struct load_stat {
+ unsigned long raw_weighted_load;
+ u64 delta_fair, delta_exec, exec_start, last_update_load;
+};
+
/* CFS-related fields in a runqueue */
struct cfs_rq {
unsigned long raw_weighted_load;
unsigned long nr_running;
- u64 fair_clock, delta_fair_clock;
- u64 exec_clock, delta_exec_clock;
- u64 last_update_load;
+ u64 fair_clock;
+ u64 exec_clock;
s64 wait_runtime;
unsigned long wait_runtime_overruns, wait_runtime_underruns;
@@ -134,9 +138,6 @@
/* Real-Time classes' related field in a runqueue: */
struct rt_rq {
- unsigned long raw_weighted_load;
- unsigned long nr_running;
-
struct prio_array active;
int rt_load_balance_idx;
struct list_head *rt_load_balance_head, *rt_load_balance_curr;
@@ -159,6 +160,7 @@
long nr_running;
#define CPU_LOAD_IDX_MAX 5
unsigned long cpu_load[CPU_LOAD_IDX_MAX];
+ struct load_stat ls; /* capture load from *all* tasks on this cpu */
unsigned char idle_at_tick;
#ifdef CONFIG_NO_HZ
unsigned char in_nohz_recently;
@@ -499,6 +501,9 @@
task_rq_unlock(rq, &flags);
}
+#define NICE_0_LOAD SCHED_LOAD_SCALE
+#define NICE_0_SHIFT SCHED_LOAD_SHIFT
+
/*
* resched_task - mark a task 'to be rescheduled now'.
*
@@ -552,6 +557,50 @@
}
#endif
+static inline void set_exec_start(struct rq *rq)
+{
+ rq->ls.exec_start = __rq_clock(rq);
+}
+
+/* Update delta_exec, delta_fair fields for rq.
+ *
+ * delta_fair clock advances at a rate inversely proportional to
+ * total load (rq->ls.raw_weighted_load) on the runqueue, while
+ * delta_exec advances at the same rate as wall-clock (provided
+ * cpu is not idle).
+ *
+ * delta_exec / delta_fair is a measure of the (smoothed) load on this
+ * runqueue over any given interval. This (smoothed) load is used
+ * during load balance.
+ *
+ * This function is called /before/ updating rq->ls.raw_weighted_load
+ * and when switching tasks.
+ *
+ * TODO: Remove this for !CONFIG_SMP?
+ */
+static void update_curr_load(struct rq *rq)
+{
+ u64 now = rq_clock(rq);
+ u64 delta_exec, delta_fair;
+ struct load_stat *ls = &rq->ls;
+ unsigned long load = ls->raw_weighted_load;
+
+ if (rq->curr == rq->idle || !load)
+ return;
+
+ delta_exec = now - ls->exec_start;
+ if (unlikely((s64)delta_exec < 0))
+ delta_exec = 0;
+
+ delta_fair = delta_exec * NICE_0_LOAD;
+ delta_fair += load >> 1; /* rounding */
+ do_div(delta_fair, load);
+
+ ls->delta_fair += delta_fair;
+ ls->delta_exec += delta_exec;
+ ls->exec_start = now;
+}
+
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
* of tasks with abnormal "nice" values across CPUs the contribution that
@@ -574,9 +623,6 @@
#define RTPRIO_TO_LOAD_WEIGHT(rp) \
(PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
-#define NICE_0_LOAD SCHED_LOAD_SCALE
-#define NICE_0_SHIFT SCHED_LOAD_SHIFT
-
/*
* Nice levels are multiplicative, with a gentle 10% change for every
* nice level changed. I.e. when a CPU-bound task goes from nice 0 to
@@ -595,55 +641,28 @@
/* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15,
};
-static inline unsigned long
-total_raw_weighted_load(struct rq *rq)
-{
- return rq->rt.raw_weighted_load + rq->cfs.raw_weighted_load;
-}
-
static inline void
inc_raw_weighted_load(struct rq *rq, const struct task_struct *p)
{
- /*
- * Can be smth like (if we'd benefit from even high abstraction --
- * say we are going to support more sched_classes):
- *
- * p->sched_class->inc_raw_weighted_load(p);
- */
-
- if (has_rt_policy(p))
- rq->rt.raw_weighted_load += p->se.load_weight;
- else
- rq->cfs.raw_weighted_load += p->se.load_weight;
+ update_curr_load(rq);
+ rq->ls.raw_weighted_load += p->se.load_weight;
}
static inline void
dec_raw_weighted_load(struct rq *rq, const struct task_struct *p)
{
- if (has_rt_policy(p))
- rq->rt.raw_weighted_load -= p->se.load_weight;
- else
- rq->cfs.raw_weighted_load -= p->se.load_weight;
+ update_curr_load(rq);
+ rq->ls.raw_weighted_load -= p->se.load_weight;
}
static inline void inc_nr_running(struct task_struct *p, struct rq *rq)
{
- if (has_rt_policy(p))
- rq->rt.nr_running++;
- else
- rq->cfs.nr_running++;
-
rq->nr_running++;
inc_raw_weighted_load(rq, p);
}
static inline void dec_nr_running(struct task_struct *p, struct rq *rq)
{
- if (has_rt_policy(p))
- rq->rt.nr_running--;
- else
- rq->cfs.nr_running--;
-
rq->nr_running--;
dec_raw_weighted_load(rq, p);
}
@@ -779,7 +798,7 @@
/* Used instead of source_load when we know the type == 0 */
unsigned long weighted_cpuload(const int cpu)
{
- return total_raw_weighted_load(cpu_rq(cpu));
+ return cpu_rq(cpu)->ls.raw_weighted_load;
}
#ifdef CONFIG_SMP
@@ -913,7 +932,7 @@
static inline unsigned long source_load(int cpu, int type)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long total = total_raw_weighted_load(rq);
+ unsigned long total = weighted_cpuload(cpu);
if (type == 0)
return total;
@@ -928,7 +947,7 @@
static inline unsigned long target_load(int cpu, int type)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long total = total_raw_weighted_load(rq);
+ unsigned long total = weighted_cpuload(cpu);
if (type == 0)
return total;
@@ -942,7 +961,7 @@
static inline unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- unsigned long total = total_raw_weighted_load(rq);
+ unsigned long total = weighted_cpuload(cpu);
unsigned long n = rq->nr_running;
return n ? total / n : SCHED_LOAD_SCALE;
@@ -1623,18 +1642,49 @@
return running + uninterruptible;
}
-static void update_load(struct rq *this_rq)
+/* Update rq->cpu_load[] statistics. This function is usually called every
+ * scheduler tick (TICK_NSEC).
+ */
+static void update_cpu_load(struct rq *this_rq)
{
- struct sched_class *class = sched_class_highest;
- unsigned long this_load = 0;
+ unsigned long total_load = this_rq->ls.raw_weighted_load;
int i, scale;
+ unsigned long this_load = total_load;
+ struct load_stat *ls = &this_rq->ls;
+ u64 fair_delta64, exec_delta64, idle_delta64,
+ sample_interval64, tmp64;
+ u64 now = __rq_clock(this_rq);
this_rq->nr_load_updates++;
+ if (sysctl_sched_features & 64)
+ goto do_avg;
- do {
- this_load += class->update_load(this_rq);
- class = class->next;
- } while (class);
+ /* Update delta_fair/delta_exec fields first */
+ update_curr_load(this_rq);
+
+ fair_delta64 = ls->delta_fair + 1;
+ ls->delta_fair = 0;
+
+ exec_delta64 = ls->delta_exec + 1;
+ ls->delta_exec = 0;
+
+ sample_interval64 = now - ls->last_update_load;
+ ls->last_update_load = now;
+
+ if ((s64)sample_interval64 < (s64)TICK_NSEC)
+ sample_interval64 = TICK_NSEC;
+
+ if (exec_delta64 > sample_interval64)
+ exec_delta64 = sample_interval64;
+
+ idle_delta64 = sample_interval64 - exec_delta64;
+
+ tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
+ tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
+
+ this_load = (unsigned long)tmp64;
+
+do_avg:
/* Update our load: */
for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
@@ -2012,7 +2062,7 @@
avg_load += load;
sum_nr_running += rq->nr_running;
- sum_weighted_load += total_raw_weighted_load(rq);
+ sum_weighted_load += weighted_cpuload(i);
}
/*
@@ -2240,18 +2290,19 @@
int i;
for_each_cpu_mask(i, group->cpumask) {
+ unsigned long wl;
if (!cpu_isset(i, *cpus))
continue;
rq = cpu_rq(i);
+ wl = weighted_cpuload(i);
- if (rq->nr_running == 1 &&
- total_raw_weighted_load(rq) > imbalance)
+ if (rq->nr_running == 1 && wl > imbalance)
continue;
- if (total_raw_weighted_load(rq) > max_load) {
- max_load = total_raw_weighted_load(rq);
+ if (wl > max_load) {
+ max_load = wl;
busiest = rq;
}
}
@@ -2831,14 +2882,17 @@
if (time_after_eq(jiffies, rq->next_balance))
raise_softirq(SCHED_SOFTIRQ);
}
-#else
+
+#else /* CONFIG_SMP */
+
/*
* on UP we do not need to balance between CPUs:
*/
static inline void idle_balance(int cpu, struct rq *rq)
{
}
-#endif
+
+#endif /* CONFIG_SMP */
DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -2966,7 +3020,7 @@
p->sched_class->task_tick(rq, p);
spin_unlock(&rq->lock);
- update_load(rq);
+ update_cpu_load(rq);
#ifdef CONFIG_SMP
rq->idle_at_tick = idle_at_tick;
trigger_load_balance(cpu);
@@ -3051,6 +3105,7 @@
u64 now = __rq_clock(rq);
struct task_struct *p;
+ update_curr_load(rq);
prev->sched_class->put_prev_task(rq, prev, now);
do {
@@ -3101,6 +3156,7 @@
if (unlikely(!rq->nr_running)) {
idle_balance(cpu, rq);
if (!rq->nr_running) {
+ update_curr_load(rq);
prev->sched_class->put_prev_task(rq, prev,
__rq_clock(rq));
next = rq->idle;
@@ -3110,6 +3166,7 @@
}
next = pick_next_task(rq, prev);
+ set_exec_start(rq);
next->se.nr_switches++;
switch_tasks:
@@ -6132,7 +6189,7 @@
rq->nr_running = 0;
rq->cfs.tasks_timeline = RB_ROOT;
rq->clock = rq->cfs.fair_clock = 1;
- rq->cfs.last_update_load = sched_clock();
+ rq->ls.last_update_load = sched_clock();
for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
rq->cpu_load[j] = 0;
Index: current/include/linux/sched.h
===================================================================
--- current.orig/include/linux/sched.h 2007-06-15 22:50:33.000000000 +0530
+++ current/include/linux/sched.h 2007-06-15 22:50:49.000000000 +0530
@@ -869,8 +869,6 @@
struct task_struct * (*load_balance_next) (struct rq *rq);
void (*task_tick) (struct rq *rq, struct task_struct *p);
void (*task_new) (struct rq *rq, struct task_struct *p);
-
- unsigned long (*update_load) (struct rq *rq);
};
/* CFS stats for a schedulable entity (task, task-group etc) */
Index: current/kernel/sched_fair.c
===================================================================
--- current.orig/kernel/sched_fair.c 2007-06-15 22:50:33.000000000 +0530
+++ current/kernel/sched_fair.c 2007-06-15 22:50:49.000000000 +0530
@@ -119,6 +119,9 @@
rb_link_node(&se->run_node, parent, link);
rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
+ cfs_rq->raw_weighted_load += se->load_weight;
+ cfs_rq->nr_running++;
+ se->on_rq = 1;
}
static inline void
@@ -127,6 +130,9 @@
if (cfs_rq->rb_leftmost == &se->run_node)
cfs_rq->rb_leftmost = NULL;
rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
+ cfs_rq->raw_weighted_load -= se->load_weight;
+ cfs_rq->nr_running--;
+ se->on_rq = 0;
}
static inline struct rb_node * first_fair(struct cfs_rq *cfs_rq)
@@ -249,10 +255,6 @@
delta_fair += load >> 1; /* rounding */
do_div(delta_fair, load);
- /* Load-balancing accounting. */
- cfs_rq->delta_fair_clock += delta_fair;
- cfs_rq->delta_exec_clock += delta_exec;
-
/*
* Task already marked for preemption, do not burden
* it with the cost of not having left the CPU yet:
@@ -829,44 +831,6 @@
inc_nr_running(p, rq);
}
-static unsigned long update_load_cfs_rq(struct cfs_rq *cfs_rq)
-{
- u64 fair_delta64, exec_delta64, idle_delta64,
- sample_interval64, now, tmp64;
-
- if (sysctl_sched_features & 64)
- return cfs_rq->raw_weighted_load;
-
- fair_delta64 = cfs_rq->delta_fair_clock + 1;
- cfs_rq->delta_fair_clock = 0;
-
- exec_delta64 = cfs_rq->delta_exec_clock + 1;
- cfs_rq->delta_exec_clock = 0;
-
- now = rq_clock(rq_of(cfs_rq));
- sample_interval64 = now - cfs_rq->last_update_load;
- cfs_rq->last_update_load = now;
-
- if ((s64)sample_interval64 < (s64)TICK_NSEC)
- sample_interval64 = TICK_NSEC;
-
- if (exec_delta64 > sample_interval64)
- exec_delta64 = sample_interval64;
-
- idle_delta64 = sample_interval64 - exec_delta64;
-
- tmp64 = div64_64(SCHED_LOAD_SCALE * exec_delta64, fair_delta64);
- tmp64 = div64_64(tmp64 * exec_delta64, sample_interval64);
-
- return (unsigned long)tmp64;
-}
-
-
-static unsigned long update_load_fair(struct rq *rq)
-{
- return update_load_cfs_rq(&rq->cfs);
-}
-
/*
* All the scheduling class methods:
*/
@@ -882,7 +846,6 @@
.load_balance_start = load_balance_start_fair,
.load_balance_next = load_balance_next_fair,
- .update_load = update_load_fair,
.task_tick = task_tick_fair,
.task_new = task_new_fair,
};
Index: current/kernel/sched_rt.c
===================================================================
--- current.orig/kernel/sched_rt.c 2007-06-15 22:50:33.000000000 +0530
+++ current/kernel/sched_rt.c 2007-06-15 22:50:49.000000000 +0530
@@ -194,12 +194,6 @@
activate_task(rq, p, 1);
}
-static unsigned long update_load_rt(struct rq *rq)
-{
- /* Temporal approach. */
- return rq->rt.raw_weighted_load;
-}
-
static struct sched_class rt_sched_class __read_mostly = {
.enqueue_task = enqueue_task_rt,
.dequeue_task = dequeue_task_rt,
@@ -212,7 +206,6 @@
.load_balance_start = load_balance_start_rt,
.load_balance_next = load_balance_next_rt,
- .update_load = update_load_rt,
.task_tick = task_tick_rt,
.task_new = task_new_rt,
Index: current/kernel/sched_debug.c
===================================================================
--- current.orig/kernel/sched_debug.c 2007-06-15 22:54:26.000000000 +0530
+++ current/kernel/sched_debug.c 2007-06-15 23:03:04.000000000 +0530
@@ -112,7 +112,9 @@
SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x))
P(nr_running);
- SEQ_printf(m, " .%-30s: %lu\n", "raw_weighted_load", total_raw_weighted_load(rq));
+ SEQ_printf(m, " .%-30s: %lu\n", "raw_weighted_load", rq->ls.raw_weighted_load);
+ P(ls.delta_fair);
+ P(ls.delta_exec);
P(nr_switches);
P(nr_load_updates);
P(nr_uninterruptible);
@@ -126,9 +128,7 @@
P(clock_unstable_events);
P(clock_max_delta);
P(cfs.fair_clock);
- P(cfs.delta_fair_clock);
P(cfs.exec_clock);
- P(cfs.delta_exec_clock);
P(cfs.wait_runtime);
P(cfs.wait_runtime_overruns);
P(cfs.wait_runtime_underruns);
--
Regards,
vatsa