author		Yuyang Du <yuyang.du@intel.com>	2017-02-12 16:44:23 -0500
committer	Ingo Molnar <mingo@kernel.org>	2017-03-30 03:43:41 -0400
commit		a481db34b9beb7a9647c23f2320dd38a2b1d681f (patch)
tree		da9a81b164c13cd3d8577c41b658b768ec326230
parent		0ccb977f4c80b921a8bf6a2c4b8ea0c1fed6553c (diff)
sched/fair: Optimize ___update_sched_avg()
The main PELT function ___update_load_avg(), which implements the
accumulation and progression of the geometric average series, is
implemented along the following lines for the scenario where the time
delta spans all 3 possible sections (see figure below):

  1. add the remainder of the last incomplete period
  2. decay old sum
  3. accumulate new sum in full periods since last_update_time
  4. accumulate the current incomplete period
  5. update averages

Or:

            d1          d2           d3
            ^           ^            ^
            |           |            |
          |<->|<----------------->|<--->|
  ... |---x---|------| ... |------|-----x (now)

  load_sum' = (load_sum + weight * scale * d1) * y^(p+1) +        (1,2)

                                        p
              weight * scale * 1024 * \Sum y^n +                   (3)
                                       n=1

              weight * scale * d3 * y^0                            (4)

  load_avg' = load_sum' / LOAD_AVG_MAX                             (5)

Where:

  d1 - is the delta part completing the remainder of the last
       incomplete period,
  d2 - is the delta part spanning complete periods, and
  d3 - is the delta part starting the current incomplete period.

We can simplify the code in two steps; the first step is to separate
the first term into new and old parts like:

  (load_sum + weight * scale * d1) * y^(p+1) = load_sum * y^(p+1) +
                                               weight * scale * d1 * y^(p+1)

Once we've done that, it's easy to see that all new terms carry the
common factors:

  weight * scale

If we factor those out, we arrive at the form:

  load_sum' = load_sum * y^(p+1) +

              weight * scale * (d1 * y^(p+1) +

                                        p
                                1024 * \Sum y^n +
                                       n=1

                                d3 * y^0)

Which results in a simpler, smaller and faster implementation.

Signed-off-by: Yuyang Du <yuyang.du@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bsegall@google.com
Cc: dietmar.eggemann@arm.com
Cc: matt@codeblueprint.co.uk
Cc: morten.rasmussen@arm.com
Cc: pjt@google.com
Cc: umgwanakikbuti@gmail.com
Cc: vincent.guittot@linaro.org
Link: http://lkml.kernel.org/r/1486935863-25251-3-git-send-email-yuyang.du@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
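The factoring above is pure algebra, so the old and new forms must produce the
same sum. Below is a minimal standalone sketch (not kernel code) that checks
this numerically. It uses floating point and a literal y with y^32 = 0.5 purely
for illustration; the kernel instead uses the fixed-point helpers decay_load(),
runnable_avg_yN_sum[] and __accumulated_sum_N32[]. All variable names and the
sample values (d1, d3, p, weight, scale) below are made up for the example.

    /*
     * Sketch: compare the original expression (1,2)+(3)+(4) with the
     * factored form from the changelog for one arbitrary set of inputs.
     * Build with: cc -O2 demo.c -lm
     */
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
            const double y = pow(0.5, 1.0 / 32.0);  /* y^32 == 0.5 */
            const double load_sum = 20000.0;        /* previous running sum */
            const double weight = 1024.0, scale = 1.0;
            const double d1 = 300.0, d3 = 200.0;    /* partial periods, in us */
            const int p = 4;                        /* full periods in between */

            double series = 0.0;                    /* 1024 * \Sum_{n=1..p} y^n */
            for (int n = 1; n <= p; n++)
                    series += 1024.0 * pow(y, n);

            /* Original form: terms (1,2), (3) and (4) from the changelog. */
            double old_form = (load_sum + weight * scale * d1) * pow(y, p + 1) +
                              weight * scale * series +
                              weight * scale * d3;

            /* Factored form: decay the old sum, add weight*scale*(...) once. */
            double new_form = load_sum * pow(y, p + 1) +
                              weight * scale * (d1 * pow(y, p + 1) + series + d3);

            printf("old form: %f\nnew form: %f\n", old_form, new_form);
            return 0;
    }

Both printed values agree (up to floating-point rounding), which is what lets
the kernel code decay the old sum and accumulate the new contribution as two
independent steps.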
-rw-r--r--	kernel/sched/fair.c	212
1 file changed, 118 insertions(+), 94 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2ac00cfbf29f..76f67b3e34d6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2767,7 +2767,7 @@ static const u32 __accumulated_sum_N32[] = {
  * Approximate:
  *   val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
  */
-static __always_inline u64 decay_load(u64 val, u64 n)
+static u64 decay_load(u64 val, u64 n)
 {
         unsigned int local_n;
 
@@ -2795,32 +2795,113 @@ static __always_inline u64 decay_load(u64 val, u64 n)
         return val;
 }
 
-/*
- * For updates fully spanning n periods, the contribution to runnable
- * average will be: \Sum 1024*y^n
- *
- * We can compute this reasonably efficiently by combining:
- *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n < PERIOD}
- */
-static u32 __compute_runnable_contrib(u64 n)
+static u32 __accumulate_sum(u64 periods, u32 period_contrib, u32 remainder)
 {
-        u32 contrib = 0;
+        u32 c1, c2, c3 = remainder; /* y^0 == 1 */
 
-        if (likely(n <= LOAD_AVG_PERIOD))
-                return runnable_avg_yN_sum[n];
-        else if (unlikely(n >= LOAD_AVG_MAX_N))
+        if (!periods)
+                return remainder - period_contrib;
+
+        if (unlikely(periods >= LOAD_AVG_MAX_N))
                 return LOAD_AVG_MAX;
 
-        /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
-        contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
-        n %= LOAD_AVG_PERIOD;
-        contrib = decay_load(contrib, n);
-        return contrib + runnable_avg_yN_sum[n];
+        /*
+         * c1 = d1 y^(p+1)
+         */
+        c1 = decay_load((u64)(1024 - period_contrib), periods);
+
+        periods -= 1;
+        /*
+         * For updates fully spanning n periods, the contribution to runnable
+         * average will be:
+         *
+         *   c2 = 1024 \Sum y^n
+         *
+         * We can compute this reasonably efficiently by combining:
+         *
+         *   y^PERIOD = 1/2 with precomputed 1024 \Sum y^n {for: n < PERIOD}
+         */
+        if (likely(periods <= LOAD_AVG_PERIOD)) {
+                c2 = runnable_avg_yN_sum[periods];
+        } else {
+                c2 = __accumulated_sum_N32[periods/LOAD_AVG_PERIOD];
+                periods %= LOAD_AVG_PERIOD;
+                c2 = decay_load(c2, periods);
+                c2 += runnable_avg_yN_sum[periods];
+        }
+
+        return c1 + c2 + c3;
 }
 
 #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
 
 /*
+ * Accumulate the three separate parts of the sum; d1 the remainder
+ * of the last (incomplete) period, d2 the span of full periods and d3
+ * the remainder of the (incomplete) current period.
+ *
+ *           d1          d2           d3
+ *           ^           ^            ^
+ *           |           |            |
+ *         |<->|<----------------->|<--->|
+ * ... |---x---|------| ... |------|-----x (now)
+ *
+ *                           p
+ * u' = (u + d1) y^(p+1) + 1024 \Sum y^n + d3 y^0
+ *                          n=1
+ *
+ *    = u y^(p+1) +                                     (Step 1)
+ *
+ *                          p
+ *      d1 y^(p+1) + 1024 \Sum y^n + d3 y^0             (Step 2)
+ *                         n=1
+ */
+static __always_inline u32
+accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
+               unsigned long weight, int running, struct cfs_rq *cfs_rq)
+{
+        unsigned long scale_freq, scale_cpu;
+        u64 periods;
+        u32 contrib;
+
+        scale_freq = arch_scale_freq_capacity(NULL, cpu);
+        scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
+        delta += sa->period_contrib;
+        periods = delta / 1024; /* A period is 1024us (~1ms) */
+
+        /*
+         * Step 1: decay old *_sum if we crossed period boundaries.
+         */
+        if (periods) {
+                sa->load_sum = decay_load(sa->load_sum, periods);
+                if (cfs_rq) {
+                        cfs_rq->runnable_load_sum =
+                                decay_load(cfs_rq->runnable_load_sum, periods);
+                }
+                sa->util_sum = decay_load((u64)(sa->util_sum), periods);
+        }
+
+        /*
+         * Step 2
+         */
+        delta %= 1024;
+        contrib = __accumulate_sum(periods, sa->period_contrib, delta);
+        sa->period_contrib = delta;
+
+        contrib = cap_scale(contrib, scale_freq);
+        if (weight) {
+                sa->load_sum += weight * contrib;
+                if (cfs_rq)
+                        cfs_rq->runnable_load_sum += weight * contrib;
+        }
+        if (running)
+                sa->util_sum += contrib * scale_cpu;
+
+        return periods;
+}
+
+/*
  * We can represent the historical contribution to runnable average as the
  * coefficients of a geometric series. To do this we sub-divide our runnable
  * history into segments of approximately 1ms (1024us); label the segment that
@@ -2852,10 +2933,7 @@ static __always_inline int
 ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
                    unsigned long weight, int running, struct cfs_rq *cfs_rq)
 {
-        u64 delta, scaled_delta, periods;
-        u32 contrib;
-        unsigned int delta_w, scaled_delta_w, decayed = 0;
-        unsigned long scale_freq, scale_cpu;
+        u64 delta;
 
         delta = now - sa->last_update_time;
         /*
@@ -2876,81 +2954,27 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
                 return 0;
         sa->last_update_time = now;
 
-        scale_freq = arch_scale_freq_capacity(NULL, cpu);
-        scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
-
-        /* delta_w is the amount already accumulated against our next period */
-        delta_w = sa->period_contrib;
-        if (delta + delta_w >= 1024) {
-                decayed = 1;
-
-                /* how much left for next period will start over, we don't know yet */
-                sa->period_contrib = 0;
-
-                /*
-                 * Now that we know we're crossing a period boundary, figure
-                 * out how much from delta we need to complete the current
-                 * period and accrue it.
-                 */
-                delta_w = 1024 - delta_w;
-                scaled_delta_w = cap_scale(delta_w, scale_freq);
-                if (weight) {
-                        sa->load_sum += weight * scaled_delta_w;
-                        if (cfs_rq) {
-                                cfs_rq->runnable_load_sum +=
-                                                weight * scaled_delta_w;
-                        }
-                }
-                if (running)
-                        sa->util_sum += scaled_delta_w * scale_cpu;
-
-                delta -= delta_w;
-
-                /* Figure out how many additional periods this update spans */
-                periods = delta / 1024;
-                delta %= 1024;
-
-                sa->load_sum = decay_load(sa->load_sum, periods + 1);
-                if (cfs_rq) {
-                        cfs_rq->runnable_load_sum =
-                                decay_load(cfs_rq->runnable_load_sum, periods + 1);
-                }
-                sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
-
-                /* Efficiently calculate \sum (1..n_period) 1024*y^i */
-                contrib = __compute_runnable_contrib(periods);
-                contrib = cap_scale(contrib, scale_freq);
-                if (weight) {
-                        sa->load_sum += weight * contrib;
-                        if (cfs_rq)
-                                cfs_rq->runnable_load_sum += weight * contrib;
-                }
-                if (running)
-                        sa->util_sum += contrib * scale_cpu;
-        }
-
-        /* Remainder of delta accrued against u_0` */
-        scaled_delta = cap_scale(delta, scale_freq);
-        if (weight) {
-                sa->load_sum += weight * scaled_delta;
-                if (cfs_rq)
-                        cfs_rq->runnable_load_sum += weight * scaled_delta;
-        }
-        if (running)
-                sa->util_sum += scaled_delta * scale_cpu;
-
-        sa->period_contrib += delta;
+        /*
+         * Now we know we crossed measurement unit boundaries. The *_avg
+         * accrues by two steps:
+         *
+         * Step 1: accumulate *_sum since last_update_time. If we haven't
+         * crossed period boundaries, finish.
+         */
+        if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
+                return 0;
 
-        if (decayed) {
-                sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
-                if (cfs_rq) {
-                        cfs_rq->runnable_load_avg =
-                                div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
-                }
-                sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
-        }
+        /*
+         * Step 2: update *_avg.
+         */
+        sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+        if (cfs_rq) {
+                cfs_rq->runnable_load_avg =
+                        div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
+        }
+        sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
 
-        return decayed;
+        return 1;
 }
 
 static int