author     Yuyang Du <yuyang.du@intel.com>          2017-02-12 16:44:23 -0500
committer  Ingo Molnar <mingo@kernel.org>           2017-03-30 03:43:41 -0400
commit     a481db34b9beb7a9647c23f2320dd38a2b1d681f
tree       da9a81b164c13cd3d8577c41b658b768ec326230
parent     0ccb977f4c80b921a8bf6a2c4b8ea0c1fed6553c
sched/fair: Optimize ___update_sched_avg()
The main PELT function ___update_load_avg(), which implements the
accumulation and progression of the geometric average series, is
implemented along the following lines for the scenario where the time
delta spans all 3 possible sections (see figure below):
1. add the remainder of the last incomplete period
2. decay old sum
3. accumulate new sum in full periods since last_update_time
4. accumulate the current incomplete period
5. update averages
Or:

              d1          d2           d3
              ^           ^            ^
              |           |            |
            |<->|<----------------->|<--->|
    ... |---x---|------| ... |------|-----x (now)

  load_sum' = (load_sum + weight * scale * d1) * y^(p+1) +      (1,2)

                                        p
              weight * scale * 1024 * \Sum y^n +                (3)
                                       n=1

              weight * scale * d3 * y^0                         (4)

  load_avg' = load_sum' / LOAD_AVG_MAX                          (5)
Where:

  d1 - is the delta part completing the remainder of the last
       incomplete period,
  d2 - is the delta part spanning complete periods, and
  d3 - is the delta part starting the current incomplete period.
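For illustration, the update above can be written out as a small
self-contained floating-point model. This is only a sketch, not the
kernel code: the pelt_model() helper is a made-up name, y is computed
from y^32 = 0.5, and the normalization uses the closed form
1024 / (1 - y) instead of the kernel's integer LOAD_AVG_MAX:

  #include <math.h>
  #include <stdio.h>

  /*
   * Floating-point model of equations (1)-(5); illustrative only, the
   * kernel uses the fixed-point decay_load()/runnable_avg_yN_sum[]
   * tables instead.
   */
  static double pelt_model(double load_sum, double weight, double scale,
                           double d1, unsigned int p, double d3)
  {
          const double y = pow(0.5, 1.0 / 32.0); /* y^32 == 1/2 */
          double sum = 0.0;
          unsigned int n;

          /* (1,2): complete the old period, decay everything p+1 periods */
          load_sum = (load_sum + weight * scale * d1) * pow(y, p + 1);

          /* (3): each of the p full periods contributes a decayed 1024 */
          for (n = 1; n <= p; n++)
                  sum += pow(y, n);
          load_sum += weight * scale * 1024.0 * sum;

          /* (4): the still-incomplete current period, undecayed (y^0 == 1) */
          load_sum += weight * scale * d3;

          return load_sum;
  }

  int main(void)
  {
          const double y = pow(0.5, 1.0 / 32.0);
          const double load_avg_max = 1024.0 / (1.0 - y); /* ~= LOAD_AVG_MAX */
          double s = pelt_model(20000.0, 1.0, 1.0, 300.0, 4, 200.0);

          /* (5): the average is the sum normalized by the series maximum */
          printf("load_sum' = %.1f load_avg' = %.3f\n", s, s / load_avg_max);
          return 0;
  }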
We can simplify the code in two steps; the first step is to separate
the first term into new and old parts like:
  (load_sum + weight * scale * d1) * y^(p+1) = load_sum * y^(p+1) +
                                               weight * scale * d1 * y^(p+1)
Once we've done that, it's easy to see that all new terms carry the
common factors:

  weight * scale
If we factor those out, we arrive at the form:

  load_sum' = load_sum * y^(p+1) +

              weight * scale * (d1 * y^(p+1) +

                                               p
                                1024 * \Sum y^n +
                                              n=1

                                d3 * y^0)
Which results in a simpler, smaller and faster implementation.
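As a quick sanity check on the algebra, the sketch below (again plain C
with floating point and illustrative values, not kernel code) evaluates
both the expanded form of equations (1)-(4) and the factored form above
and asserts that they agree:

  #include <assert.h>
  #include <math.h>
  #include <stdio.h>

  /* 1024 * \Sum_{n=1}^{p} y^n, the full-period contribution. */
  static double series(double y, unsigned int p)
  {
          double s = 0.0;
          unsigned int n;

          for (n = 1; n <= p; n++)
                  s += 1024.0 * pow(y, n);
          return s;
  }

  int main(void)
  {
          const double y = pow(0.5, 1.0 / 32.0); /* y^32 == 1/2 */
          double load_sum = 20000.0, weight = 2.0, scale = 0.8;
          double d1 = 300.0, d3 = 200.0;
          unsigned int p = 4;

          /* Expanded form: every new term carries weight * scale. */
          double expanded = (load_sum + weight * scale * d1) * pow(y, p + 1) +
                            weight * scale * series(y, p) +
                            weight * scale * d3;

          /* Factored form: decay the old sum alone, then scale the new
             contribution (d1 * y^(p+1) + 1024 * \Sum y^n + d3) once. */
          double contrib = d1 * pow(y, p + 1) + series(y, p) + d3;
          double factored = load_sum * pow(y, p + 1) + weight * scale * contrib;

          assert(fabs(expanded - factored) < 1e-6);
          printf("load_sum' = %.3f (both forms agree)\n", factored);
          return 0;
  }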
Signed-off-by: Yuyang Du <yuyang.du@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bsegall@google.com
Cc: dietmar.eggemann@arm.com
Cc: matt@codeblueprint.co.uk
Cc: morten.rasmussen@arm.com
Cc: pjt@google.com
Cc: umgwanakikbuti@gmail.com
Cc: vincent.guittot@linaro.org
Link: http://lkml.kernel.org/r/1486935863-25251-3-git-send-email-yuyang.du@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r--  kernel/sched/fair.c | 212
1 file changed, 118 insertions(+), 94 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2ac00cfbf29f..76f67b3e34d6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2767,7 +2767,7 @@ static const u32 __accumulated_sum_N32[] = {
  * Approximate:
  *   val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
  */
-static __always_inline u64 decay_load(u64 val, u64 n)
+static u64 decay_load(u64 val, u64 n)
 {
 	unsigned int local_n;
 
@@ -2795,32 +2795,113 @@ static __always_inline u64 decay_load(u64 val, u64 n)
 	return val;
 }
 
-/*
- * For updates fully spanning n periods, the contribution to runnable
- * average will be: \Sum 1024*y^n
- *
- * We can compute this reasonably efficiently by combining:
- *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
- */
-static u32 __compute_runnable_contrib(u64 n)
+static u32 __accumulate_sum(u64 periods, u32 period_contrib, u32 remainder)
 {
-	u32 contrib = 0;
+	u32 c1, c2, c3 = remainder; /* y^0 == 1 */
 
-	if (likely(n <= LOAD_AVG_PERIOD))
-		return runnable_avg_yN_sum[n];
-	else if (unlikely(n >= LOAD_AVG_MAX_N))
+	if (!periods)
+		return remainder - period_contrib;
+
+	if (unlikely(periods >= LOAD_AVG_MAX_N))
 		return LOAD_AVG_MAX;
 
-	/* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
-	contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
-	n %= LOAD_AVG_PERIOD;
-	contrib = decay_load(contrib, n);
-	return contrib + runnable_avg_yN_sum[n];
+	/*
+	 * c1 = d1 y^(p+1)
+	 */
+	c1 = decay_load((u64)(1024 - period_contrib), periods);
+
+	periods -= 1;
+	/*
+	 * For updates fully spanning n periods, the contribution to runnable
+	 * average will be:
+	 *
+	 *   c2 = 1024 \Sum y^n
+	 *
+	 * We can compute this reasonably efficiently by combining:
+	 *
+	 *   y^PERIOD = 1/2 with precomputed 1024 \Sum y^n {for: n < PERIOD}
+	 */
+	if (likely(periods <= LOAD_AVG_PERIOD)) {
+		c2 = runnable_avg_yN_sum[periods];
+	} else {
+		c2 = __accumulated_sum_N32[periods/LOAD_AVG_PERIOD];
+		periods %= LOAD_AVG_PERIOD;
+		c2 = decay_load(c2, periods);
+		c2 += runnable_avg_yN_sum[periods];
+	}
+
+	return c1 + c2 + c3;
 }
 
 #define cap_scale(v, s)	((v)*(s) >> SCHED_CAPACITY_SHIFT)
 
 /*
+ * Accumulate the three separate parts of the sum; d1 the remainder
+ * of the last (incomplete) period, d2 the span of full periods and d3
+ * the remainder of the (incomplete) current period.
+ *
+ *           d1          d2           d3
+ *           ^           ^            ^
+ *           |           |            |
+ *         |<->|<----------------->|<--->|
+ * ... |---x---|------| ... |------|-----x (now)
+ *
+ *                                p
+ * u' = (u + d1) y^(p+1) + 1024 \Sum y^n + d3 y^0
+ *                               n=1
+ *
+ *    = u y^(p+1) +				(Step 1)
+ *
+ *                          p
+ *      d1 y^(p+1) + 1024 \Sum y^n + d3 y^0	(Step 2)
+ *                         n=1
+ */
+static __always_inline u32
+accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
+	       unsigned long weight, int running, struct cfs_rq *cfs_rq)
+{
+	unsigned long scale_freq, scale_cpu;
+	u64 periods;
+	u32 contrib;
+
+	scale_freq = arch_scale_freq_capacity(NULL, cpu);
+	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
+	delta += sa->period_contrib;
+	periods = delta / 1024; /* A period is 1024us (~1ms) */
+
+	/*
+	 * Step 1: decay old *_sum if we crossed period boundaries.
+	 */
+	if (periods) {
+		sa->load_sum = decay_load(sa->load_sum, periods);
+		if (cfs_rq) {
+			cfs_rq->runnable_load_sum =
+				decay_load(cfs_rq->runnable_load_sum, periods);
+		}
+		sa->util_sum = decay_load((u64)(sa->util_sum), periods);
+	}
+
+	/*
+	 * Step 2
+	 */
+	delta %= 1024;
+	contrib = __accumulate_sum(periods, sa->period_contrib, delta);
+	sa->period_contrib = delta;
+
+	contrib = cap_scale(contrib, scale_freq);
+	if (weight) {
+		sa->load_sum += weight * contrib;
+		if (cfs_rq)
+			cfs_rq->runnable_load_sum += weight * contrib;
+	}
+	if (running)
+		sa->util_sum += contrib * scale_cpu;
+
+	return periods;
+}
+
+/*
  * We can represent the historical contribution to runnable average as the
  * coefficients of a geometric series. To do this we sub-divide our runnable
  * history into segments of approximately 1ms (1024us); label the segment that
@@ -2852,10 +2933,7 @@ static __always_inline int
 ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 		   unsigned long weight, int running, struct cfs_rq *cfs_rq)
 {
-	u64 delta, scaled_delta, periods;
-	u32 contrib;
-	unsigned int delta_w, scaled_delta_w, decayed = 0;
-	unsigned long scale_freq, scale_cpu;
+	u64 delta;
 
 	delta = now - sa->last_update_time;
 	/*
@@ -2876,81 +2954,27 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 		return 0;
 	sa->last_update_time = now;
 
-	scale_freq = arch_scale_freq_capacity(NULL, cpu);
-	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
-
-	/* delta_w is the amount already accumulated against our next period */
-	delta_w = sa->period_contrib;
-	if (delta + delta_w >= 1024) {
-		decayed = 1;
-
-		/* how much left for next period will start over, we don't know yet */
-		sa->period_contrib = 0;
-
-		/*
-		 * Now that we know we're crossing a period boundary, figure
-		 * out how much from delta we need to complete the current
-		 * period and accrue it.
-		 */
-		delta_w = 1024 - delta_w;
-		scaled_delta_w = cap_scale(delta_w, scale_freq);
-		if (weight) {
-			sa->load_sum += weight * scaled_delta_w;
-			if (cfs_rq) {
-				cfs_rq->runnable_load_sum +=
-						weight * scaled_delta_w;
-			}
-		}
-		if (running)
-			sa->util_sum += scaled_delta_w * scale_cpu;
-
-		delta -= delta_w;
-
-		/* Figure out how many additional periods this update spans */
-		periods = delta / 1024;
-		delta %= 1024;
-
-		sa->load_sum = decay_load(sa->load_sum, periods + 1);
-		if (cfs_rq) {
-			cfs_rq->runnable_load_sum =
-				decay_load(cfs_rq->runnable_load_sum, periods + 1);
-		}
-		sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
-
-		/* Efficiently calculate \sum (1..n_period) 1024*y^i */
-		contrib = __compute_runnable_contrib(periods);
-		contrib = cap_scale(contrib, scale_freq);
-		if (weight) {
-			sa->load_sum += weight * contrib;
-			if (cfs_rq)
-				cfs_rq->runnable_load_sum += weight * contrib;
-		}
-		if (running)
-			sa->util_sum += contrib * scale_cpu;
-	}
-
-	/* Remainder of delta accrued against u_0` */
-	scaled_delta = cap_scale(delta, scale_freq);
-	if (weight) {
-		sa->load_sum += weight * scaled_delta;
-		if (cfs_rq)
-			cfs_rq->runnable_load_sum += weight * scaled_delta;
-	}
-	if (running)
-		sa->util_sum += scaled_delta * scale_cpu;
-
-	sa->period_contrib += delta;
+	/*
+	 * Now we know we crossed measurement unit boundaries. The *_avg
+	 * accrues by two steps:
+	 *
+	 * Step 1: accumulate *_sum since last_update_time. If we haven't
+	 * crossed period boundaries, finish.
+	 */
+	if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq))
+		return 0;
 
-	if (decayed) {
-		sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
-		if (cfs_rq) {
-			cfs_rq->runnable_load_avg =
-				div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
-		}
-		sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
+	/*
+	 * Step 2: update *_avg.
+	 */
+	sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+	if (cfs_rq) {
+		cfs_rq->runnable_load_avg =
+			div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
 	}
+	sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
 
-	return decayed;
+	return 1;
 }
 
 static int