Re: High CPU load when machine is idle (related to PROBLEM: Unusually high load average when idle in 2.6.35, 2.6.35.1 and later)
From: Peter Zijlstra
Date: Mon Oct 18 2010 - 08:32:44 EST
On Fri, 2010-10-15 at 13:08 +0200, Peter Zijlstra wrote:
> On Thu, 2010-10-14 at 16:58 +0200, Damien Wyart wrote:
>
> > - the commit 74f5187ac873042f502227701ed1727e7c5fbfa9 isolated by Tim
> > seems to be the culprit;
>
> Right, so I think I figured out what's happening.
>
> We're folding successive idles of the same cpu into the total idle
> number, which inflates the reported load average.
>
> +/*
> + * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
> + *
> + * When making the ILB scale, we should try to pull this in as well.
> + */
> +static atomic_long_t calc_load_tasks_idle;
> +
> +static void calc_load_account_idle(struct rq *this_rq)
> +{
> + long delta;
> +
> + delta = calc_load_fold_active(this_rq);
> + if (delta)
> + atomic_long_add(delta, &calc_load_tasks_idle);
> +}
> +
> +static long calc_load_fold_idle(void)
> +{
> + long delta = 0;
> +
> + /*
> + * Its got a race, we don't care...
> + */
> + if (atomic_long_read(&calc_load_tasks_idle))
> + delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
> +
> + return delta;
> +}
>
>
> If you look at that and imagine CPU1 going idle with 1 task blocked,
> then waking up due to the task unblocking, then going idle with that
> same task blocked again, etc., all before we fold_idle() on an active
> cpu, then we can count that one task many times over.
>
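To make the over-counting concrete, here is a deliberately simplified
stand-alone model of the two functions above (hypothetical user-space C:
a plain long stands in for the atomic, and the constant 1 stands in for
the delta calc_load_fold_active() would return for the blocked task):

#include <stdio.h>

static long calc_load_tasks_idle;       /* stand-in for the kernel atomic */

/* models calc_load_account_idle(): fold this cpu's delta into the total */
static void account_idle(long delta)
{
        if (delta)
                calc_load_tasks_idle += delta;
}

/* models calc_load_fold_idle(): drain the total at the LOAD_FREQ update */
static long fold_idle(void)
{
        long delta = calc_load_tasks_idle;

        calc_load_tasks_idle = 0;
        return delta;
}

int main(void)
{
        int i;

        /*
         * CPU1 enters idle three times with the same single blocked
         * task, waking in between, before any active cpu folds.
         */
        for (i = 0; i < 3; i++)
                account_idle(1);

        /* the next LOAD_FREQ fold sees 3 tasks where only 1 exists */
        printf("folded idle delta: %ld\n", fold_idle());
        return 0;
}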
OK, I came up with the below: remember each cpu's last idle contribution
together with a global fold sequence number, and have the cpu subtract
that contribution back out when it leaves idle before a fold has happened
(see the sketch after the patch). But it's not quite working: the load
continues to decrease even though I've got a make -j64 running...

Thomas, Chase, any clue?
---
kernel/sched.c | 31 +++++++++++++++++++++++++------
kernel/sched_idletask.c | 1 +
2 files changed, 26 insertions(+), 6 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 3312c64..a56446b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -521,6 +521,10 @@ struct rq {
/* calc_load related fields */
unsigned long calc_load_update;
long calc_load_active;
+#ifdef CONFIG_NO_HZ
+ long calc_load_idle;
+ int calc_load_seq;
+#endif
#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
@@ -1817,6 +1821,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
#endif
static void calc_load_account_idle(struct rq *this_rq);
+static void calc_load_account_nonidle(struct rq *this_rq);
static void update_sysctl(void);
static int get_update_sysctl_factor(void);
static void update_cpu_load(struct rq *this_rq);
@@ -2978,14 +2983,25 @@ static long calc_load_fold_active(struct rq *this_rq)
* When making the ILB scale, we should try to pull this in as well.
*/
static atomic_long_t calc_load_tasks_idle;
+static atomic_t calc_load_seq;
static void calc_load_account_idle(struct rq *this_rq)
{
- long delta;
+ long idle;
- delta = calc_load_fold_active(this_rq);
- if (delta)
- atomic_long_add(delta, &calc_load_tasks_idle);
+ idle = calc_load_fold_active(this_rq);
+ this_rq->calc_load_idle = idle;
+
+ if (idle) {
+ this_rq->calc_load_seq = atomic_read(&calc_load_seq);
+ atomic_long_add(idle, &calc_load_tasks_idle);
+ }
+}
+
+static void calc_load_account_nonidle(struct rq *this_rq)
+{
+ if (atomic_read(&calc_load_seq) == this_rq->calc_load_seq)
+ atomic_long_sub(this_rq->calc_load_idle, &calc_load_tasks_idle);
}
static long calc_load_fold_idle(void)
@@ -2993,10 +3009,13 @@ static long calc_load_fold_idle(void)
long delta = 0;
/*
- * Its got a race, we don't care...
+ * It's got races, we don't care... it's only statistics after all.
*/
- if (atomic_long_read(&calc_load_tasks_idle))
+ if (atomic_long_read(&calc_load_tasks_idle)) {
delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
+ if (delta)
+ atomic_inc(&calc_load_seq);
+ }
return delta;
}
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 9fa0f402..a7fa1aa 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -42,6 +42,7 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
+ calc_load_account_nonidle(rq);
}
static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
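For reference, the mechanism the patch is aiming for, as the same kind of
stand-alone model (hypothetical user-space C; plain variables stand in
for the atomics, and the calls mirror where the kernel hooks sit:
account_idle() at idle entry, account_nonidle() from put_prev_task_idle()
when the cpu leaves idle, fold_idle() at the LOAD_FREQ update):

#include <stdio.h>

static long tasks_idle;                 /* models calc_load_tasks_idle */
static int fold_seq;                    /* models calc_load_seq */

struct rq_model {                       /* the new per-rq fields */
        long calc_load_idle;
        int calc_load_seq;
};

static void account_idle(struct rq_model *rq, long delta)
{
        rq->calc_load_idle = delta;
        if (delta) {
                rq->calc_load_seq = fold_seq;
                tasks_idle += delta;
        }
}

static void account_nonidle(struct rq_model *rq)
{
        /* no fold happened since we went idle: take our delta back out */
        if (fold_seq == rq->calc_load_seq)
                tasks_idle -= rq->calc_load_idle;
}

static long fold_idle(void)
{
        long delta = tasks_idle;

        tasks_idle = 0;
        if (delta)
                fold_seq++;
        return delta;
}

int main(void)
{
        struct rq_model cpu1 = { 0, 0 };
        int i;

        /* the same scenario: three idle/wake cycles with one task */
        for (i = 0; i < 3; i++) {
                account_idle(&cpu1, 1);
                account_nonidle(&cpu1);
        }
        account_idle(&cpu1, 1);         /* still idle when the fold runs */

        printf("folded idle delta: %ld\n", fold_idle());        /* 1, not 4 */
        return 0;
}

In this model the repeated idle entries cancel out and only the
contribution still pending at the fold is counted once.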