[PATCH v3 05/12] sched/fair: Optimize __update_sched_avg()
From: Yuyang Du
Date: Tue May 03 2016 - 23:46:51 EST
__update_sched_avg() has these steps:
1. add the remainder of the last incomplete period
2. decay old sum
3. accumulate new sum in full periods since last_update_time
4. add the current incomplete period
5. update averages
Previously, we separately computed steps 1, 3, and 4, leading to
each one of them ugly in codes and costly in overhead.
For example:
c1 c3 c4
^ ^ ^
| | |
|<->|<----------------->|<--->|
... |---x---|------| ... |------|-----x (now)
c1, c3, and c4 are the accumulated (meanwhile decayed) contributions
in timing aspect of steps 1, 3, and 4 respectively.
Then the accumulated contribution to load_sum, for example, is:
contrib = c1 * weight * freq_scaled;
contrib += c3 * weight * freq_scaled;
contrib += c4 * weight * freq_scaled;
Obviously, we can optimize the above as:
contrib = c1 + c3 + c4;
contrib *= weight * freq_scaled;
After we combine them together, we will have much cleaner codes
and less CPU cycles.
Signed-off-by: Yuyang Du <yuyang.du@xxxxxxxxx>
---
kernel/sched/fair.c | 189 ++++++++++++++++++++++++++-------------------------
1 file changed, 95 insertions(+), 94 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a060ef2..495e5f0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -668,7 +668,7 @@ static unsigned long task_h_load(struct task_struct *p);
*/
#define SCHED_AVG_HALFLIFE 32 /* number of periods as a half-life */
#define SCHED_AVG_MAX 47742 /* maximum possible sched avg */
-#define SCHED_AVG_MAX_N 345 /* number of full periods to produce SCHED_AVG_MAX */
+#define SCHED_AVG_MAX_N 347 /* number of full periods to produce SCHED_AVG_MAX */
/* Give new sched_entity start runnable values to heavy its load in infant time */
void init_entity_runnable_average(struct sched_entity *se)
@@ -2606,7 +2606,7 @@ static const u32 __accumulated_sum_N[] = {
/*
* Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to
- * lower integers.
+ * lower integers. Since n < SCHED_AVG_MAX_N, n/SCHED_AVG_HALFLIFE < 11
*/
static const u32 __accumulated_sum_N32[] = {
0, 23371, 35056, 40899, 43820, 45281,
@@ -2616,8 +2616,11 @@ static const u32 __accumulated_sum_N32[] = {
/*
* val * y^n, where y^m ~= 0.5
*
- * n is the number of periods past; a period is ~1ms
+ * n is the number of periods past. A period is ~1ms, so a 32bit
+ * integer can hold approximately a maximum of 49 (=2^32/1000/3600/24) days.
+ *
* m is called half-life in exponential decay; here it is SCHED_AVG_HALFLIFE=32.
+ *
*/
static __always_inline u64 __decay_sum(u64 val, u32 n)
{
@@ -2649,20 +2652,30 @@ static __always_inline u64 __decay_sum(u64 val, u32 n)
* We can compute this efficiently by combining:
* y^32 = 1/2 with precomputed \Sum 1024*y^n (where n < 32)
*/
-static __always_inline u32 __accumulate_sum(u32 n)
+static __always_inline u32
+__accumulate_sum(u32 periods, u32 period_contrib, u32 remainder)
{
- u32 contrib = 0;
+ u32 contrib;
+
+ if (!periods)
+ return remainder - period_contrib;
- if (likely(n <= SCHED_AVG_HALFLIFE))
- return __accumulated_sum_N[n];
- else if (unlikely(n >= SCHED_AVG_MAX_N))
+ if (unlikely(periods >= SCHED_AVG_MAX_N))
return SCHED_AVG_MAX;
- /* Since n < SCHED_AVG_MAX_N, n/SCHED_AVG_HALFLIFE < 11 */
- contrib = __accumulated_sum_N32[n/SCHED_AVG_HALFLIFE];
- n %= SCHED_AVG_HALFLIFE;
- contrib = __decay_sum(contrib, n);
- return contrib + __accumulated_sum_N[n];
+ remainder += __decay_sum((u64)(1024 - period_contrib), periods);
+
+ periods -= 1;
+ if (likely(periods <= SCHED_AVG_HALFLIFE))
+ contrib = __accumulated_sum_N[periods];
+ else {
+ contrib = __accumulated_sum_N32[periods/SCHED_AVG_HALFLIFE];
+ periods %= SCHED_AVG_HALFLIFE;
+ contrib = __decay_sum(contrib, periods);
+ contrib += __accumulated_sum_N[periods];
+ }
+
+ return contrib + remainder;
}
#if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
@@ -2671,6 +2684,55 @@ static __always_inline u32 __accumulate_sum(u32 n)
#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+static __always_inline u32 accumulate_sum(u64 delta, struct sched_avg *sa,
+ struct cfs_rq *cfs_rq, int cpu, unsigned long weight, int running)
+{
+ u32 contrib, periods;
+ unsigned long scale_freq, scale_cpu;
+
+ scale_freq = arch_scale_freq_capacity(NULL, cpu);
+ scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
+ delta += sa->period_contrib;
+ periods = delta >> 10; /* A period is 1024us (~1ms) */
+
+ /*
+ * Accumulating *_sum has two steps.
+ *
+ * Step 1: decay old *_sum if we crossed period boundaries.
+ */
+ if (periods) {
+ sa->load_sum = __decay_sum(sa->load_sum, periods);
+ if (cfs_rq) {
+ cfs_rq->runnable_load_sum =
+ __decay_sum(cfs_rq->runnable_load_sum, periods);
+ }
+ sa->util_sum = __decay_sum((u64)(sa->util_sum), periods);
+ }
+
+ /*
+ * Step 2: accumulate new *_sum since last_update_time. This at most has
+ * three parts (at least one part): (1) remainder of incomplete last
+ * period, (2) full periods since (1), and (3) incomplete current period.
+ *
+ * Fortunately, we can (and should) do all these three at once.
+ */
+ delta %= 1024;
+ contrib = __accumulate_sum(periods, sa->period_contrib, delta);
+ sa->period_contrib = delta;
+
+ contrib = cap_scale(contrib, scale_freq);
+ if (weight) {
+ sa->load_sum += weight * contrib;
+ if (cfs_rq)
+ cfs_rq->runnable_load_sum += weight * contrib;
+ }
+ if (running)
+ sa->util_sum += contrib * scale_cpu;
+
+ return periods;
+}
+
/*
* We can represent the historical contribution to sched average as the
* coefficients of a geometric series. To do this we divide the history
@@ -2701,12 +2763,9 @@ static __always_inline u32 __accumulate_sum(u32 n)
*/
static __always_inline int
__update_sched_avg(u64 now, int cpu, struct sched_avg *sa,
- unsigned long weight, int running, struct cfs_rq *cfs_rq)
+ unsigned long weight, int running, struct cfs_rq *cfs_rq)
{
- u64 delta, scaled_delta;
- u32 contrib, periods;
- unsigned int delta_w, scaled_delta_w, decayed = 0;
- unsigned long scale_freq, scale_cpu;
+ u64 delta;
delta = now - sa->last_update_time;
/*
@@ -2727,85 +2786,27 @@ __update_sched_avg(u64 now, int cpu, struct sched_avg *sa,
return 0;
sa->last_update_time = now;
- scale_freq = arch_scale_freq_capacity(NULL, cpu);
- scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
-
- /* delta_w is the amount already accumulated against our next period */
- delta_w = sa->period_contrib;
- if (delta + delta_w >= 1024) {
- decayed = 1;
-
- /* how much left for next period will start over, we don't know yet */
- sa->period_contrib = 0;
-
- /*
- * Now that we know we're crossing a period boundary, figure
- * out how much from delta we need to complete the current
- * period and accrue it.
- */
- delta_w = 1024 - delta_w;
- scaled_delta_w = cap_scale(delta_w, scale_freq);
- if (weight) {
- sa->load_sum += weight * scaled_delta_w;
- if (cfs_rq) {
- cfs_rq->runnable_load_sum +=
- weight * scaled_delta_w;
- }
- }
- if (running)
- sa->util_sum += scaled_delta_w * scale_cpu;
-
- delta -= delta_w;
-
- /*
- * Figure out how many additional periods this update spans.
- * A period is 1024*1024ns or ~1ms, so a 32bit integer can hold
- * approximately a maximum of 49 (=2^32/1000/3600/24) days.
- */
- periods = delta / 1024;
- delta %= 1024;
-
- sa->load_sum = __decay_sum(sa->load_sum, periods + 1);
- if (cfs_rq) {
- cfs_rq->runnable_load_sum =
- __decay_sum(cfs_rq->runnable_load_sum, periods + 1);
- }
- sa->util_sum = __decay_sum((u64)(sa->util_sum), periods + 1);
-
- /* Efficiently calculate \sum (1..n_period) 1024*y^i */
- contrib = __accumulate_sum(periods);
- contrib = cap_scale(contrib, scale_freq);
- if (weight) {
- sa->load_sum += weight * contrib;
- if (cfs_rq)
- cfs_rq->runnable_load_sum += weight * contrib;
- }
- if (running)
- sa->util_sum += contrib * scale_cpu;
- }
-
- /* Remainder of delta accrued against u_0` */
- scaled_delta = cap_scale(delta, scale_freq);
- if (weight) {
- sa->load_sum += weight * scaled_delta;
- if (cfs_rq)
- cfs_rq->runnable_load_sum += weight * scaled_delta;
- }
- if (running)
- sa->util_sum += scaled_delta * scale_cpu;
-
- sa->period_contrib += delta;
+ /*
+ * Now we know we crossed measurement unit boundaries. The *_avg
+ * accrues by two steps:
+ *
+ * Step 1: accumulate *_sum since last_update_time. If we haven't
+ * crossed period boundaries, finish.
+ */
+ if (!accumulate_sum(delta, sa, cfs_rq, cpu, weight, running))
+ return 0;
- if (decayed) {
- sa->load_avg = div_u64(sa->load_sum, SCHED_AVG_MAX);
- if (cfs_rq) {
- cfs_rq->runnable_load_avg =
- div_u64(cfs_rq->runnable_load_sum, SCHED_AVG_MAX);
- }
- sa->util_avg = sa->util_sum / SCHED_AVG_MAX;
+ /*
+ * Step 2: update *_avg.
+ */
+ sa->load_avg = div_u64(sa->load_sum, SCHED_AVG_MAX);
+ if (cfs_rq) {
+ cfs_rq->runnable_load_avg =
+ div_u64(cfs_rq->runnable_load_sum, SCHED_AVG_MAX);
}
+ sa->util_avg = sa->util_sum / SCHED_AVG_MAX;
- return decayed;
+ return 1;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
--
1.7.9.5