author		Peter Zijlstra <a.p.zijlstra@chello.nl>	2010-11-30 13:48:45 -0500
committer	Ingo Molnar <mingo@elte.hu>	2010-12-08 14:15:04 -0500
commit		0f004f5a696a9434b7214d0d3cbd0525ee77d428 (patch)
tree		274b3bb92469789284d864314d46e902c70e8384 /kernel/sched.c
parent		6313e3c21743cc88bb5bd8aa72948ee1e83937b6 (diff)
sched: Cure more NO_HZ load average woes
There's a long-running regression that proved difficult to fix and which is hitting certain people and is rather annoying in its effects.

Damien reported that after 74f5187ac8 (sched: Cure load average vs NO_HZ woes) his load average is unnaturally high; he also noted that even with that patch reverted the load average numbers are not correct.

The problem is that the previous patch only solved half the NO_HZ problem: it addressed going into NO_HZ mode, but not coming out of it. This patch implements that missing half.

When coming out of NO_HZ mode there are two important things to take care of:

 - Folding the pending idle delta into the global active count.
 - Correctly aging the averages for the idle duration.

With this patch the NO_HZ interaction should be complete, and behaviour between CONFIG_NO_HZ=[yn] should be equivalent.

Furthermore, this patch slightly changes the load average computation by adding a rounding term to the fixed-point multiplication.

Reported-by: Damien Wyart <damien.wyart@free.fr>
Reported-by: Tim McGrath <tmhikaru@gmail.com>
Tested-by: Damien Wyart <damien.wyart@free.fr>
Tested-by: Orion Poplawski <orion@cora.nwra.com>
Tested-by: Kyle McMartin <kyle@mcmartin.ca>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: stable@kernel.org
Cc: Chase Douglas <chase.douglas@canonical.com>
LKML-Reference: <1291129145.32004.874.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
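The fixed-point arithmetic the message refers to can be exercised outside the kernel. The program below is a minimal, illustrative userspace sketch and is not part of the commit: calc_load(), fixed_power_int() and calc_load_n() are copied from the hunks that follow, the FSHIFT/FIXED_1/EXP_1 constants mirror include/linux/sched.h, and main() is an assumed driver that compares aging the 1-minute average over n missed LOAD_FREQ periods with a single calc_load_n() call against applying calc_load() once per period; the two agree up to fixed-point rounding.

/* Illustrative userspace sketch only -- not part of this commit. */
#include <stdio.h>

#define FSHIFT	11			/* nr of bits of precision */
#define FIXED_1	(1 << FSHIFT)		/* 1.0 as fixed-point */
#define EXP_1	1884			/* 1/exp(5sec/1min) as fixed-point */

/* One load-average step, including the rounding term this patch adds. */
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	load += 1UL << (FSHIFT - 1);	/* round to nearest, not down */
	return load >> FSHIFT;
}

/* x^n in fixed-point, O(log n) multiplications (binary exponentiation). */
static unsigned long
fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
	unsigned long result = 1UL << frac_bits;

	if (n) for (;;) {
		if (n & 1) {
			result *= x;
			result += 1UL << (frac_bits - 1);
			result >>= frac_bits;
		}
		n >>= 1;
		if (!n)
			break;
		x *= x;
		x += 1UL << (frac_bits - 1);
		x >>= frac_bits;
	}

	return result;
}

/* Age the average over n missed periods at once: a_n = a_0*e^n + a*(1 - e^n). */
static unsigned long
calc_load_n(unsigned long load, unsigned long exp,
	    unsigned long active, unsigned int n)
{
	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
}

int main(void)
{
	unsigned long stepwise = 2 * FIXED_1;	/* start at a load of 2.00 ... */
	unsigned long oneshot;
	unsigned int i, n = 12;			/* ... then idle for 12 periods */

	for (i = 0; i < n; i++)
		stepwise = calc_load(stepwise, EXP_1, 0);
	oneshot = calc_load_n(2 * FIXED_1, EXP_1, 0, n);

	printf("stepwise: %lu.%03lu  one-shot: %lu.%03lu\n",
	       stepwise >> FSHIFT,
	       ((stepwise & (FIXED_1 - 1)) * 1000) >> FSHIFT,
	       oneshot >> FSHIFT,
	       ((oneshot & (FIXED_1 - 1)) * 1000) >> FSHIFT);
	return 0;
}

Computing e^n via fixed_power_int() is what keeps the catch-up after a long NO_HZ sleep O(log n), rather than O(n), in the number of missed LOAD_FREQ periods.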
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	150
1 file changed, 139 insertions, 11 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index dc91a4d09ac3..6b7c26a1a097 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3119,6 +3119,15 @@ static long calc_load_fold_active(struct rq *this_rq)
 	return delta;
 }
 
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+	load *= exp;
+	load += active * (FIXED_1 - exp);
+	load += 1UL << (FSHIFT - 1);
+	return load >> FSHIFT;
+}
+
 #ifdef CONFIG_NO_HZ
 /*
  * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
@@ -3148,6 +3157,128 @@ static long calc_load_fold_idle(void)
 
 	return delta;
 }
+
+/**
+ * fixed_power_int - compute: x^n, in O(log n) time
+ *
+ * @x:         base of the power
+ * @frac_bits: fractional bits of @x
+ * @n:         power to raise @x to.
+ *
+ * By exploiting the relation between the definition of the natural power
+ * function: x^n := x*x*...*x (x multiplied by itself for n times), and
+ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
+ * (where: n_i \elem {0, 1}, the binary vector representing n),
+ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
+ * of course trivially computable in O(log_2 n), the length of our binary
+ * vector.
+ */
+static unsigned long
+fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
+{
+	unsigned long result = 1UL << frac_bits;
+
+	if (n) for (;;) {
+		if (n & 1) {
+			result *= x;
+			result += 1UL << (frac_bits - 1);
+			result >>= frac_bits;
+		}
+		n >>= 1;
+		if (!n)
+			break;
+		x *= x;
+		x += 1UL << (frac_bits - 1);
+		x >>= frac_bits;
+	}
+
+	return result;
+}
+
+/*
+ * a1 = a0 * e + a * (1 - e)
+ *
+ * a2 = a1 * e + a * (1 - e)
+ *    = (a0 * e + a * (1 - e)) * e + a * (1 - e)
+ *    = a0 * e^2 + a * (1 - e) * (1 + e)
+ *
+ * a3 = a2 * e + a * (1 - e)
+ *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
+ *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
+ *
+ *  ...
+ *
+ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
+ *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
+ *    = a0 * e^n + a * (1 - e^n)
+ *
+ * [1] application of the geometric series:
+ *
+ *              n         1 - x^(n+1)
+ *     S_n := \Sum x^i = -------------
+ *             i=0          1 - x
+ */
+static unsigned long
+calc_load_n(unsigned long load, unsigned long exp,
+	    unsigned long active, unsigned int n)
+{
+
+	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
+}
+
+/*
+ * NO_HZ can leave us missing all per-cpu ticks calling
+ * calc_load_account_active(), but since an idle CPU folds its delta into
+ * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
+ * in the pending idle delta if our idle period crossed a load cycle boundary.
+ *
+ * Once we've updated the global active value, we need to apply the exponential
+ * weights adjusted to the number of cycles missed.
+ */
+static void calc_global_nohz(unsigned long ticks)
+{
+	long delta, active, n;
+
+	if (time_before(jiffies, calc_load_update))
+		return;
+
+	/*
+	 * If we crossed a calc_load_update boundary, make sure to fold
+	 * any pending idle changes, the respective CPUs might have
+	 * missed the tick driven calc_load_account_active() update
+	 * due to NO_HZ.
+	 */
+	delta = calc_load_fold_idle();
+	if (delta)
+		atomic_long_add(delta, &calc_load_tasks);
+
+	/*
+	 * If we were idle for multiple load cycles, apply them.
+	 */
+	if (ticks >= LOAD_FREQ) {
+		n = ticks / LOAD_FREQ;
+
+		active = atomic_long_read(&calc_load_tasks);
+		active = active > 0 ? active * FIXED_1 : 0;
+
+		avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+		avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+		avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+
+		calc_load_update += n * LOAD_FREQ;
+	}
+
+	/*
+	 * Its possible the remainder of the above division also crosses
+	 * a LOAD_FREQ period, the regular check in calc_global_load()
+	 * which comes after this will take care of that.
+	 *
+	 * Consider us being 11 ticks before a cycle completion, and us
+	 * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
+	 * age us 4 cycles, and the test in calc_global_load() will
+	 * pick up the final one.
+	 */
+}
 #else
 static void calc_load_account_idle(struct rq *this_rq)
 {
@@ -3157,6 +3288,10 @@ static inline long calc_load_fold_idle(void)
 {
 	return 0;
 }
+
+static void calc_global_nohz(unsigned long ticks)
+{
+}
 #endif
 
 /**
@@ -3174,24 +3309,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
 	loads[2] = (avenrun[2] + offset) << shift;
 }
 
-static unsigned long
-calc_load(unsigned long load, unsigned long exp, unsigned long active)
-{
-	load *= exp;
-	load += active * (FIXED_1 - exp);
-	return load >> FSHIFT;
-}
-
 /*
  * calc_load - update the avenrun load estimates 10 ticks after the
  * CPUs have updated calc_load_tasks.
  */
-void calc_global_load(void)
+void calc_global_load(unsigned long ticks)
 {
-	unsigned long upd = calc_load_update + 10;
 	long active;
 
-	if (time_before(jiffies, upd))
+	calc_global_nohz(ticks);
+
+	if (time_before(jiffies, calc_load_update + 10))
 		return;
 
 	active = atomic_long_read(&calc_load_tasks);