author     Yuyang Du <yuyang.du@intel.com>          2017-02-12 16:44:23 -0500
committer  Ingo Molnar <mingo@kernel.org>           2017-03-30 03:43:41 -0400
commit     a481db34b9beb7a9647c23f2320dd38a2b1d681f
tree       da9a81b164c13cd3d8577c41b658b768ec326230
parent     0ccb977f4c80b921a8bf6a2c4b8ea0c1fed6553c
sched/fair: Optimize ___update_sched_avg()
The main PELT function ___update_load_avg(), which implements the
accumulation and progression of the geometric average series, is
implemented along the following lines for the scenario where the time
delta spans all 3 possible sections (see figure below):
1. add the remainder of the last incomplete period
2. decay old sum
3. accumulate new sum in full periods since last_update_time
4. accumulate the current incomplete period
5. update averages
Or:

              d1          d2           d3
              ^           ^            ^
              |           |            |
            |<->|<----------------->|<--->|
    ... |---x---|------| ... |------|-----x (now)

  load_sum' = (load_sum + weight * scale * d1) * y^(p+1) +      (1,2)

                                        p
              weight * scale * 1024 * \Sum y^n +                (3)
                                       n=1

              weight * scale * d3 * y^0                         (4)

  load_avg' = load_sum' / LOAD_AVG_MAX                          (5)
Where:

  d1 - is the delta part completing the remainder of the last
       incomplete period,
  d2 - is the delta part spanning complete periods, and
  d3 - is the delta part starting the current incomplete period.
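For illustration, the update above can be written out as a small
self-contained floating-point model. This is only a sketch, not the
kernel code: the pelt_model() helper is a made-up name, y is computed
from y^32 = 0.5, and the normalization uses the closed form
1024 / (1 - y) instead of the kernel's integer LOAD_AVG_MAX:

  #include <math.h>
  #include <stdio.h>

  /*
   * Floating-point model of equations (1)-(5); illustrative only, the
   * kernel uses the fixed-point decay_load()/runnable_avg_yN_sum[]
   * tables instead.
   */
  static double pelt_model(double load_sum, double weight, double scale,
                           double d1, unsigned int p, double d3)
  {
          const double y = pow(0.5, 1.0 / 32.0); /* y^32 == 1/2 */
          double sum = 0.0;
          unsigned int n;

          /* (1,2): complete the old period, decay everything p+1 periods */
          load_sum = (load_sum + weight * scale * d1) * pow(y, p + 1);

          /* (3): each of the p full periods contributes a decayed 1024 */
          for (n = 1; n <= p; n++)
                  sum += pow(y, n);
          load_sum += weight * scale * 1024.0 * sum;

          /* (4): the still-incomplete current period, undecayed (y^0 == 1) */
          load_sum += weight * scale * d3;

          return load_sum;
  }

  int main(void)
  {
          const double y = pow(0.5, 1.0 / 32.0);
          const double load_avg_max = 1024.0 / (1.0 - y); /* ~= LOAD_AVG_MAX */
          double s = pelt_model(20000.0, 1.0, 1.0, 300.0, 4, 200.0);

          /* (5): the average is the sum normalized by the series maximum */
          printf("load_sum' = %.1f load_avg' = %.3f\n", s, s / load_avg_max);
          return 0;
  }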
We can simplify the code in two steps; the first step is to separate
the first term into new and old parts like:
  (load_sum + weight * scale * d1) * y^(p+1) = load_sum * y^(p+1) +
                                               weight * scale * d1 * y^(p+1)
Once we've done that, it's easy to see that all new terms carry the
common factors:

  weight * scale
If we factor those out, we arrive at the form:

  load_sum' = load_sum * y^(p+1) +

              weight * scale * (d1 * y^(p+1) +

                                               p
                                1024 * \Sum y^n +
                                              n=1

                                d3 * y^0)
Which results in a simpler, smaller and faster implementation.
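As a quick sanity check on the algebra, the sketch below (again plain C
with floating point and illustrative values, not kernel code) evaluates
both the expanded form of equations (1)-(4) and the factored form above
and asserts that they agree:

  #include <assert.h>
  #include <math.h>
  #include <stdio.h>

  /* 1024 * \Sum_{n=1}^{p} y^n, the full-period contribution. */
  static double series(double y, unsigned int p)
  {
          double s = 0.0;
          unsigned int n;

          for (n = 1; n <= p; n++)
                  s += 1024.0 * pow(y, n);
          return s;
  }

  int main(void)
  {
          const double y = pow(0.5, 1.0 / 32.0); /* y^32 == 1/2 */
          double load_sum = 20000.0, weight = 2.0, scale = 0.8;
          double d1 = 300.0, d3 = 200.0;
          unsigned int p = 4;

          /* Expanded form: every new term carries weight * scale. */
          double expanded = (load_sum + weight * scale * d1) * pow(y, p + 1) +
                            weight * scale * series(y, p) +
                            weight * scale * d3;

          /* Factored form: decay the old sum alone, then scale the new
             contribution (d1 * y^(p+1) + 1024 * \Sum y^n + d3) once. */
          double contrib = d1 * pow(y, p + 1) + series(y, p) + d3;
          double factored = load_sum * pow(y, p + 1) + weight * scale * contrib;

          assert(fabs(expanded - factored) < 1e-6);
          printf("load_sum' = %.3f (both forms agree)\n", factored);
          return 0;
  }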
Signed-off-by: Yuyang Du <yuyang.du@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bsegall@google.com
Cc: dietmar.eggemann@arm.com
Cc: matt@codeblueprint.co.uk
Cc: morten.rasmussen@arm.com
Cc: pjt@google.com
Cc: umgwanakikbuti@gmail.com
Cc: vincent.guittot@linaro.org
Link: http://lkml.kernel.org/r/1486935863-25251-3-git-send-email-yuyang.du@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r--  kernel/sched/fair.c | 212
1 file changed, 118 insertions(+), 94 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2ac00cfbf29f..76f67b3e34d6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2767,7 +2767,7 @@ static const u32 __accumulated_sum_N32[] = {
  * Approximate:
  *   val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
  */
-static __always_inline u64 decay_load(u64 val, u64 n)
+static u64 decay_load(u64 val, u64 n)
 {
 	unsigned int local_n;
 
@@ -2795,32 +2795,113 @@ static __always_inline u64 decay_load(u64 val, u64 n)
 	return val;
 }
 
-/*
- * For updates fully spanning n periods, the contribution to runnable
- * average will be: \Sum 1024*y^n
- *
- * We can compute this reasonably efficiently by combining:
- *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
- */
-static u32 __compute_runnable_contrib(u64 n)
+static u32 __accumulate_sum(u64 periods, u32 period_contrib, u32 remainder)
 {
-	u32 contrib = 0;
+	u32 c1, c2, c3 = remainder; /* y^0 == 1 */
 
-	if (likely(n <= LOAD_AVG_PERIOD))
-		return runnable_avg_yN_sum[n];
-	else if (unlikely(n >= LOAD_AVG_MAX_N))
+	if (!periods)
+		return remainder - period_contrib;
+
+	if (unlikely(periods >= LOAD_AVG_MAX_N))
 		return LOAD_AVG_MAX;
 
-	/* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
-	contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
-	n %= LOAD_AVG_PERIOD;
-	contrib = decay_load(contrib, n);
-	return contrib + runnable_avg_yN_sum[n];
+	/*
+	 * c1 = d1 y^(p+1)
+	 */
+	c1 = decay_load((u64)(1024 - period_contrib), periods);
+
+	periods -= 1;
+	/*
+	 * For updates fully spanning n periods, the contribution to runnable
+	 * average will be:
+	 *
+	 *   c2 = 1024 \Sum y^n
+	 *
+	 * We can compute this reasonably efficiently by combining:
+	 *
+	 *   y^PERIOD = 1/2 with precomputed 1024 \Sum y^n {for: n < PERIOD}
+	 */
+	if (likely(periods <= LOAD_AVG_PERIOD)) {
+		c2 = runnable_avg_yN_sum[periods];
+	} else {
+		c2 = __accumulated_sum_N32[periods/LOAD_AVG_PERIOD];
+		periods %= LOAD_AVG_PERIOD;
+		c2 = decay_load(c2, periods);
+		c2 += runnable_avg_yN_sum[periods];
+	}
+
+	return c1 + c2 + c3;
 }
 
 #define cap_scale(v, s)	((v)*(s) >> SCHED_CAPACITY_SHIFT)
 
 /*
+ * Accumulate the three separate parts of the sum; d1 the remainder
+ * of the last (incomplete) period, d2 the span of full periods and d3
+ * the remainder of the (incomplete) current period.
+ *
+ *           d1          d2           d3
+ *           ^           ^            ^
+ *           |           |            |
+ *         |<->|<----------------->|<--->|
+ * ... |---x---|------| ... |------|-----x (now)
+ *
+ *                                p
+ * u' = (u + d1) y^(p+1) + 1024 \Sum y^n + d3 y^0
+ *                               n=1
+ *
+ *    = u y^(p+1) +				(Step 1)
+ *
+ *                          p
+ *      d1 y^(p+1) + 1024 \Sum y^n + d3 y^0	(Step 2)
+ *                         n=1
+ */
+static __always_inline u32
+accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
+	       unsigned long weight, int running, struct cfs_rq *cfs_rq)
+{
+	unsigned long scale_freq, scale_cpu;
+	u64 periods;
+	u32 contrib;
+
+	scale_freq = arch_scale_freq_capacity(NULL, cpu);
+	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
+	delta += sa->period_contrib;
+	periods = delta / 1024; /* A period is 1024us (~1ms) */
+
+	/*
+	 * Step 1: decay old *_sum if we crossed period boundaries.
+	 */
+	if (periods) {
+		sa->load_sum = decay_load(sa->load_sum, periods);
+		if (cfs_rq) {
+			cfs_rq->runnable_load_sum =
+				decay_load(cfs_rq->runnable_load_sum, periods);
+		}
+		sa->util_sum = decay_load((u64)(sa->util_sum), periods);
+	}
+
+	/*
+	 * Step 2
+	 */
+	delta %= 1024;
+	contrib = __accumulate_sum(periods, sa->period_contrib, delta);
+	sa->period_contrib = delta;
+
+	contrib = cap_scale(contrib, scale_freq);
+	if (weight) {
+		sa->load_sum += weight * contrib;
+		if (cfs_rq)
+			cfs_rq->runnable_load_sum += weight * contrib;
+	}
+	if (running)
+		sa->util_sum += contrib * scale_cpu;
+
+	return periods;
+}
+
+/*
  * We can represent the historical contribution to runnable average as the
  * coefficients of a geometric series. To do this we sub-divide our runnable
  * history into segments of approximately 1ms (1024us); label the segment that
@@ -2852,10 +2933,7 @@ static __always_inline int
 ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 		   unsigned long weight, int running, struct cfs_rq *cfs_rq)
 {
-	u64 delta, scaled_delta, periods;
-	u32 contrib;
-	unsigned int delta_w, scaled_delta_w, decayed = 0;
-	unsigned long scale_freq, scale_cpu;
+	u64 delta;
 
 	delta = now - sa->last_update_time;
 	/*
@@ -2876,81 +2954,27 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 		return 0;
 	sa->last_update_time = now;
 
-	scale_freq = arch_scale_freq_capacity(NULL, cpu);
-	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
-
-	/* delta_w is the amount already accumulated against our next period */
-	delta_w = sa->period_contrib;
-	if (delta + delta_w >= 1024) {
-		decayed = 1;
-
-		/* how much left for next period will start over, we don't know yet */
-		sa->period_contrib = 0;
-
-		/*
-		 * Now that we know we're crossing a period boundary, figure
-		 * out how much from delta we need to complete the current
-		 * period and accrue it.
-		 */
-		delta_w = 1024 - delta_w;
-		scaled_delta_w = cap_scale(delta_w, scale_freq);
-		if (weight) {
-			sa->load_sum += weight * scaled_delta_w;
-			if (cfs_rq) {
-				cfs_rq->runnable_load_sum +=
-						weight * scaled_delta_w;
-			}
-		}
-		if (running)
-			sa->util_sum += scaled_delta_w * scale_cpu;
-
-		delta -= delta_w;
-
-		/* Figure out how many additional periods this update spans */
-		periods = delta / 1024;
-		delta %= 1024;
-
-		sa->load_sum = decay_load(sa->load_sum, periods + 1);
-		if (cfs_rq) {
-			cfs_rq->runnable_load_sum =
-				decay_load(cfs_rq->runnable_load_sum, periods + 1);
-		}
-		sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
-
-		/* Efficiently calculate \sum (1..n_period) 1024*y^i */
-		contrib = __compute_runnable_contrib(periods);
-		contrib = cap_scale(contrib, scale_freq);
-		if (weight) {
-			sa->load_sum += weight * contrib;
-			if (cfs_rq)
-				cfs_rq->runnable_load_sum += weight * contrib;
-		}
-		if (running)
-			sa->util_sum += contrib * scale_cpu;
-	}
-
-	/* Remainder of delta accrued against u_0` */
-	scaled_delta = cap_scale(delta, scale_freq);
-	if (weight) {
-		sa->load_sum += weight * scaled_delta;
-		if (cfs_rq)
-			cfs_rq->runnable_load_sum += weight * scaled_delta;
-	}
-	if (running)
-		sa->util_sum += scaled_delta * scale_cpu;
-
-	sa->period_contrib += delta;
+	/*
+	 * Now we know we crossed measurement unit boundaries. The *_avg
+	 * accrues by two steps:
+	 *
+	 * Step 1: accumulate *_sum since last_update_time. If we haven't
+	 * crossed period boundaries, finish.
+	 */
+	if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
+		return 0;
 
-	if (decayed) {
-		sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
-		if (cfs_rq) {
-			cfs_rq->runnable_load_avg =
-				div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
-		}
-		sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
+	/*
+	 * Step 2: update *_avg.
+	 */
+	sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+	if (cfs_rq) {
+		cfs_rq->runnable_load_avg =
+			div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
 	}
+	sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
 
-	return decayed;
+	return 1;
 }
 
 static int