[RESEND PATCH 2/2] sched/fair: Optimize __update_sched_avg()

From: Yuyang Du
Date: Mon Feb 13 2017 - 00:39:29 EST


__update_load_avg() has the following steps:

1. add the remainder of the last incomplete period
2. decay old sum
3. accumulate new sum in full periods since last_update_time
4. accumulate the current incomplete period
5. update averages

However, there is no need to separately compute steps 1, 3, and 4.

Illustration:

             c1          c3           c4
             ^           ^            ^
             |           |            |
           |<->|<----------------->|<--->|
   ... |---x---|------| ... |------|-----x (now)

c1, c3, and c4 are the accumulated (and meanwhile decayed) contributions
of the time spans covered by steps 1, 3, and 4 respectively.
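
Roughly, in the notation already used in the PELT comments (y is the
per-period decay factor, p the number of period boundaries crossed, and
delta = now - last_update_time), the three pieces work out to:

	c1 = decay_load(1024 - period_contrib, p);
	c3 = \Sum 1024*y^n {for n in [1, p - 1]}
	c4 = (delta + period_contrib) % 1024;

Only c1 needs an explicit decay here: c4 lies in the current period, and
c3 comes pre-decayed from the precomputed sums.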

With them, the accumulated contribution to load_sum, for example, is:

	contrib = c1 * weight * freq_scaled;
	contrib += c3 * weight * freq_scaled;
	contrib += c4 * weight * freq_scaled;

Obviously, the above can be optimized to the equivalent:

	contrib = c1 + c3 + c4;
	contrib *= weight * freq_scaled;

By doing so, we save instructions, and the code is obviously much cleaner
and simpler. One potential issue is that c1 must now be decayed separately,
instead of being decayed together with the old sum in step 2. However, the
cost should be about the same, comparing one extra decay_load() with a full
round of contrib accumulation.
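
For illustration only, a rough user-space model of the factored
accumulation follows; all names in it (decay(), full_periods_sum(),
accumulate(), PERIOD_LEN, HALFLIFE) are made up for the sketch, and
double precision stands in for the fixed-point decay_load() and the
precomputed \Sum 1024*y^n tables:

	#include <math.h>
	#include <stdint.h>

	#define PERIOD_LEN	1024	/* us per period */
	#define HALFLIFE	32	/* periods; y^HALFLIFE = 0.5 */

	/* decay val by p periods: val * y^p */
	static double decay(double val, uint64_t p)
	{
		return val * pow(0.5, (double)p / HALFLIFE);
	}

	/* \Sum PERIOD_LEN*y^n for n in [1, n_periods] */
	static double full_periods_sum(uint64_t n_periods)
	{
		double sum = 0.0;
		uint64_t n;

		for (n = 1; n <= n_periods; n++)
			sum += decay(PERIOD_LEN, n);
		return sum;
	}

	/* the factored accumulation; assumes at least one period was crossed */
	static double accumulate(uint32_t period_contrib, uint64_t p, uint32_t c4,
				 double weight, double freq_scaled)
	{
		double contrib;

		/* c1: remainder of the old incomplete period, decayed on its own */
		contrib = decay(PERIOD_LEN - period_contrib, p);
		/* c3: the full periods in between */
		contrib += full_periods_sum(p - 1);
		/* c4: the current incomplete period, not decayed */
		contrib += c4;

		/* weight and frequency scaling applied once, not three times */
		return contrib * weight * freq_scaled;
	}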

Code size comparison (with allyesconfig):

Before: kernel/sched/built-in.o 1213304
After : kernel/sched/built-in.o 1212536 (-768B)

Signed-off-by: Yuyang Du <yuyang.du@xxxxxxxxx>
---
kernel/sched/fair.c | 174 ++++++++++++++++++++++++++--------------------------
1 file changed, 88 insertions(+), 86 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3e88b35..be01d89 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2802,24 +2802,83 @@ static __always_inline u64 decay_load(u64 val, u64 n)
* We can compute this reasonably efficiently by combining:
* y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
*/
-static u32 __compute_runnable_contrib(u64 n)
+static u32 __accumulate_sum(u64 periods, u32 period_contrib, u32 remainder)
{
- u32 contrib = 0;
+ u32 contrib;
+
+ if (!periods)
+ return remainder - period_contrib;

- if (likely(n <= LOAD_AVG_PERIOD))
- return runnable_avg_yN_sum[n];
- else if (unlikely(n >= LOAD_AVG_MAX_N))
+ if (unlikely(periods >= LOAD_AVG_MAX_N))
return LOAD_AVG_MAX;

- /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
- contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
- n %= LOAD_AVG_PERIOD;
- contrib = decay_load(contrib, n);
- return contrib + runnable_avg_yN_sum[n];
+ remainder += decay_load((u64)(1024 - period_contrib), periods);
+
+ periods -= 1;
+ if (likely(periods <= LOAD_AVG_PERIOD))
+ contrib = runnable_avg_yN_sum[periods];
+ else {
+ contrib = __accumulated_sum_N32[periods/LOAD_AVG_PERIOD];
+ periods %= LOAD_AVG_PERIOD;
+ contrib = decay_load(contrib, periods);
+ contrib += runnable_avg_yN_sum[periods];
+ }
+
+ return contrib + remainder;
}

#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)

+static __always_inline u32 accumulate_sum(u64 delta, struct sched_avg *sa,
+ struct cfs_rq *cfs_rq, int cpu, unsigned long weight, int running)
+{
+ u32 contrib;
+ u64 periods;
+ unsigned long scale_freq, scale_cpu;
+
+ scale_freq = arch_scale_freq_capacity(NULL, cpu);
+ scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
+ delta += sa->period_contrib;
+ periods = delta >> 10; /* A period is 1024us (~1ms) */
+
+ /*
+ * Accumulating *_sum has two steps.
+ *
+ * Step 1: decay old *_sum if we crossed period boundaries.
+ */
+ if (periods) {
+ sa->load_sum = decay_load(sa->load_sum, periods);
+ if (cfs_rq) {
+ cfs_rq->runnable_load_sum =
+ decay_load(cfs_rq->runnable_load_sum, periods);
+ }
+ sa->util_sum = decay_load((u64)(sa->util_sum), periods);
+ }
+
+ /*
+ * Step 2: accumulate new *_sum since last_update_time. This at most has
+ * three parts (at least one part): (1) remainder of incomplete last
+ * period, (2) full periods since (1), and (3) incomplete current period.
+ *
+ * Fortunately, we can (and should) do all these three at once.
+ */
+ delta %= 1024;
+ contrib = __accumulate_sum(periods, sa->period_contrib, delta);
+ sa->period_contrib = delta;
+
+ contrib = cap_scale(contrib, scale_freq);
+ if (weight) {
+ sa->load_sum += weight * contrib;
+ if (cfs_rq)
+ cfs_rq->runnable_load_sum += weight * contrib;
+ }
+ if (running)
+ sa->util_sum += contrib * scale_cpu;
+
+ return periods;
+}
+
/*
* We can represent the historical contribution to runnable average as the
* coefficients of a geometric series. To do this we sub-divide our runnable
@@ -2852,10 +2911,7 @@ static __always_inline int
__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
unsigned long weight, int running, struct cfs_rq *cfs_rq)
{
- u64 delta, scaled_delta, periods;
- u32 contrib;
- unsigned int delta_w, scaled_delta_w, decayed = 0;
- unsigned long scale_freq, scale_cpu;
+ u64 delta;

delta = now - sa->last_update_time;
/*
@@ -2876,81 +2932,27 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
return 0;
sa->last_update_time = now;

- scale_freq = arch_scale_freq_capacity(NULL, cpu);
- scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
-
- /* delta_w is the amount already accumulated against our next period */
- delta_w = sa->period_contrib;
- if (delta + delta_w >= 1024) {
- decayed = 1;
-
- /* how much left for next period will start over, we don't know yet */
- sa->period_contrib = 0;
-
- /*
- * Now that we know we're crossing a period boundary, figure
- * out how much from delta we need to complete the current
- * period and accrue it.
- */
- delta_w = 1024 - delta_w;
- scaled_delta_w = cap_scale(delta_w, scale_freq);
- if (weight) {
- sa->load_sum += weight * scaled_delta_w;
- if (cfs_rq) {
- cfs_rq->runnable_load_sum +=
- weight * scaled_delta_w;
- }
- }
- if (running)
- sa->util_sum += scaled_delta_w * scale_cpu;
-
- delta -= delta_w;
-
- /* Figure out how many additional periods this update spans */
- periods = delta / 1024;
- delta %= 1024;
-
- sa->load_sum = decay_load(sa->load_sum, periods + 1);
- if (cfs_rq) {
- cfs_rq->runnable_load_sum =
- decay_load(cfs_rq->runnable_load_sum, periods + 1);
- }
- sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
-
- /* Efficiently calculate \sum (1..n_period) 1024*y^i */
- contrib = __compute_runnable_contrib(periods);
- contrib = cap_scale(contrib, scale_freq);
- if (weight) {
- sa->load_sum += weight * contrib;
- if (cfs_rq)
- cfs_rq->runnable_load_sum += weight * contrib;
- }
- if (running)
- sa->util_sum += contrib * scale_cpu;
- }
-
- /* Remainder of delta accrued against u_0` */
- scaled_delta = cap_scale(delta, scale_freq);
- if (weight) {
- sa->load_sum += weight * scaled_delta;
- if (cfs_rq)
- cfs_rq->runnable_load_sum += weight * scaled_delta;
- }
- if (running)
- sa->util_sum += scaled_delta * scale_cpu;
-
- sa->period_contrib += delta;
+ /*
+ * Now we know we crossed measurement unit boundaries. The *_avg
+ * accrues by two steps:
+ *
+ * Step 1: accumulate *_sum since last_update_time. If we haven't
+ * crossed period boundaries, finish.
+ */
+ if (!accumulate_sum(delta, sa, cfs_rq, cpu, weight, running))
+ return 0;

- if (decayed) {
- sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
- if (cfs_rq) {
- cfs_rq->runnable_load_avg =
- div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
- }
- sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
+ /*
+ * Step 2: update *_avg.
+ */
+ sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+ if (cfs_rq) {
+ cfs_rq->runnable_load_avg =
+ div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
}
+ sa->util_avg = sa->util_sum / LOAD_AVG_MAX;

- return decayed;
+ return 1;
}

/*
--
2.1.4