author		Peter Zijlstra <a.p.zijlstra@chello.nl>	2010-11-30 13:48:45 -0500
committer	Ingo Molnar <mingo@elte.hu>	2010-12-08 14:15:04 -0500
commit		0f004f5a696a9434b7214d0d3cbd0525ee77d428 (patch)
tree		274b3bb92469789284d864314d46e902c70e8384 /kernel/sched.c
parent		6313e3c21743cc88bb5bd8aa72948ee1e83937b6 (diff)
sched: Cure more NO_HZ load average woes
There's a long-running regression that proved difficult to fix and which is hitting certain people and is rather annoying in its effects.

Damien reported that after 74f5187ac8 (sched: Cure load average vs NO_HZ woes) his load average is unnaturally high; he also noted that even with that patch reverted the load average numbers are not correct.

The problem is that the previous patch only solved half the NO_HZ problem: it addressed going into NO_HZ mode, but not coming out of it. This patch implements that missing half.

When coming out of NO_HZ mode there are two important things to take care of:

 - Folding the pending idle delta into the global active count.
 - Correctly aging the averages for the idle duration.

With this patch the NO_HZ interaction should be complete, and behaviour between CONFIG_NO_HZ=[yn] should be equivalent.

Furthermore, this patch slightly changes the load average computation by adding a rounding term to the fixed-point multiplication.

Reported-by: Damien Wyart <damien.wyart@free.fr>
Reported-by: Tim McGrath <tmhikaru@gmail.com>
Tested-by: Damien Wyart <damien.wyart@free.fr>
Tested-by: Orion Poplawski <orion@cora.nwra.com>
Tested-by: Kyle McMartin <kyle@mcmartin.ca>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: stable@kernel.org
Cc: Chase Douglas <chase.douglas@canonical.com>
LKML-Reference: <1291129145.32004.874.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
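The fixed-point arithmetic the message refers to can be exercised outside the kernel. The program below is a minimal, illustrative userspace sketch and is not part of the commit: calc_load(), fixed_power_int() and calc_load_n() are copied from the hunks that follow, the FSHIFT/FIXED_1/EXP_1 constants mirror include/linux/sched.h, and main() is an assumed driver that compares aging the 1-minute average over n missed LOAD_FREQ periods with a single calc_load_n() call against applying calc_load() once per period; the two agree up to fixed-point rounding.

/* Illustrative userspace sketch only -- not part of this commit. */
#include <stdio.h>

#define FSHIFT	11			/* nr of bits of precision */
#define FIXED_1	(1 << FSHIFT)		/* 1.0 as fixed-point */
#define EXP_1	1884			/* 1/exp(5sec/1min) as fixed-point */

/* One load-average step, including the rounding term this patch adds. */
static unsigned long
calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	load += 1UL << (FSHIFT - 1);	/* round to nearest, not down */
	return load >> FSHIFT;
}

/* x^n in fixed-point, O(log n) multiplications (binary exponentiation). */
static unsigned long
fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
	unsigned long result = 1UL << frac_bits;

	if (n) for (;;) {
		if (n & 1) {
			result *= x;
			result += 1UL << (frac_bits - 1);
			result >>= frac_bits;
		}
		n >>= 1;
		if (!n)
			break;
		x *= x;
		x += 1UL << (frac_bits - 1);
		x >>= frac_bits;
	}

	return result;
}

/* Age the average over n missed periods at once: a_n = a_0*e^n + a*(1 - e^n). */
static unsigned long
calc_load_n(unsigned long load, unsigned long exp,
	    unsigned long active, unsigned int n)
{
	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
}

int main(void)
{
	unsigned long stepwise = 2 * FIXED_1;	/* start at a load of 2.00 ... */
	unsigned long oneshot;
	unsigned int i, n = 12;			/* ... then idle for 12 periods */

	for (i = 0; i < n; i++)
		stepwise = calc_load(stepwise, EXP_1, 0);
	oneshot = calc_load_n(2 * FIXED_1, EXP_1, 0, n);

	printf("stepwise: %lu.%03lu  one-shot: %lu.%03lu\n",
	       stepwise >> FSHIFT,
	       ((stepwise & (FIXED_1 - 1)) * 1000) >> FSHIFT,
	       oneshot >> FSHIFT,
	       ((oneshot & (FIXED_1 - 1)) * 1000) >> FSHIFT);
	return 0;
}

Computing e^n via fixed_power_int() is what keeps the catch-up after a long NO_HZ sleep O(log n), rather than O(n), in the number of missed LOAD_FREQ periods.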
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	150
1 file changed, 139 insertions, 11 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index dc91a4d09ac3..6b7c26a1a097 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3119,6 +3119,15 @@ static long calc_load_fold_active(struct rq *this_rq)
 	return delta;
 }
 
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+	load *= exp;
+	load += active * (FIXED_1 - exp);
+	load += 1UL << (FSHIFT - 1);
+	return load >> FSHIFT;
+}
+
 #ifdef CONFIG_NO_HZ
 /*
  * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
@@ -3148,6 +3157,128 @@ static long calc_load_fold_idle(void)
 
 	return delta;
 }
+
+/**
+ * fixed_power_int - compute: x^n, in O(log n) time
+ *
+ * @x:         base of the power
+ * @frac_bits: fractional bits of @x
+ * @n:         power to raise @x to.
+ *
+ * By exploiting the relation between the definition of the natural power
+ * function: x^n := x*x*...*x (x multiplied by itself for n times), and
+ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
+ * (where: n_i \elem {0, 1}, the binary vector representing n),
+ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
+ * of course trivially computable in O(log_2 n), the length of our binary
+ * vector.
+ */
+static unsigned long
+fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
+{
+	unsigned long result = 1UL << frac_bits;
+
+	if (n) for (;;) {
+		if (n & 1) {
+			result *= x;
+			result += 1UL << (frac_bits - 1);
+			result >>= frac_bits;
+		}
+		n >>= 1;
+		if (!n)
+			break;
+		x *= x;
+		x += 1UL << (frac_bits - 1);
+		x >>= frac_bits;
+	}
+
+	return result;
+}
+
+/*
+ * a1 = a0 * e + a * (1 - e)
+ *
+ * a2 = a1 * e + a * (1 - e)
+ *    = (a0 * e + a * (1 - e)) * e + a * (1 - e)
+ *    = a0 * e^2 + a * (1 - e) * (1 + e)
+ *
+ * a3 = a2 * e + a * (1 - e)
+ *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
+ *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
+ *
+ *  ...
+ *
+ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
+ *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
+ *    = a0 * e^n + a * (1 - e^n)
+ *
+ * [1] application of the geometric series:
+ *
+ *              n         1 - x^(n+1)
+ *     S_n := \Sum x^i = -------------
+ *             i=0          1 - x
+ */
+static unsigned long
+calc_load_n(unsigned long load, unsigned long exp,
+	    unsigned long active, unsigned int n)
+{
+
+	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
+}
+
+/*
+ * NO_HZ can leave us missing all per-cpu ticks calling
+ * calc_load_account_active(), but since an idle CPU folds its delta into
+ * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
+ * in the pending idle delta if our idle period crossed a load cycle boundary.
+ *
+ * Once we've updated the global active value, we need to apply the exponential
+ * weights adjusted to the number of cycles missed.
+ */
+static void calc_global_nohz(unsigned long ticks)
+{
+	long delta, active, n;
+
+	if (time_before(jiffies, calc_load_update))
+		return;
+
+	/*
+	 * If we crossed a calc_load_update boundary, make sure to fold
+	 * any pending idle changes, the respective CPUs might have
+	 * missed the tick driven calc_load_account_active() update
+	 * due to NO_HZ.
+	 */
+	delta = calc_load_fold_idle();
+	if (delta)
+		atomic_long_add(delta, &calc_load_tasks);
+
+	/*
+	 * If we were idle for multiple load cycles, apply them.
+	 */
+	if (ticks >= LOAD_FREQ) {
+		n = ticks / LOAD_FREQ;
+
+		active = atomic_long_read(&calc_load_tasks);
+		active = active > 0 ? active * FIXED_1 : 0;
+
+		avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+		avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+		avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+
+		calc_load_update += n * LOAD_FREQ;
+	}
+
+	/*
+	 * Its possible the remainder of the above division also crosses
+	 * a LOAD_FREQ period, the regular check in calc_global_load()
+	 * which comes after this will take care of that.
+	 *
+	 * Consider us being 11 ticks before a cycle completion, and us
+	 * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
+	 * age us 4 cycles, and the test in calc_global_load() will
+	 * pick up the final one.
+	 */
+}
 #else
 static void calc_load_account_idle(struct rq *this_rq)
 {
@@ -3157,6 +3288,10 @@ static inline long calc_load_fold_idle(void)
 {
 	return 0;
 }
+
+static void calc_global_nohz(unsigned long ticks)
+{
+}
 #endif
 
 /**
@@ -3174,24 +3309,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
 	loads[2] = (avenrun[2] + offset) << shift;
 }
 
-static unsigned long
-calc_load(unsigned long load, unsigned long exp, unsigned long active)
-{
-	load *= exp;
-	load += active * (FIXED_1 - exp);
-	return load >> FSHIFT;
-}
-
 /*
  * calc_load - update the avenrun load estimates 10 ticks after the
  * CPUs have updated calc_load_tasks.
  */
-void calc_global_load(void)
+void calc_global_load(unsigned long ticks)
 {
-	unsigned long upd = calc_load_update + 10;
 	long active;
 
-	if (time_before(jiffies, upd))
+	calc_global_nohz(ticks);
+
+	if (time_before(jiffies, calc_load_update + 10))
 		return;
 
 	active = atomic_long_read(&calc_load_tasks);