Diffstat (limited to 'kernel/sched/core.c')
 -rw-r--r--  kernel/sched/core.c | 276
 1 file changed, 203 insertions, 73 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d5594a4268d4..468bdd44c1ba 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2081,7 +2081,6 @@ context_switch(struct rq *rq, struct task_struct *prev,
 #endif
 
 	/* Here we just switch the register state and the stack. */
-	rcu_switch_from(prev);
 	switch_to(prev, next, prev);
 
 	barrier();
@@ -2161,11 +2160,73 @@ unsigned long this_cpu_load(void)
 }
 
 
+/*
+ * Global load-average calculations
+ *
+ * We take a distributed and async approach to calculating the global load-avg
+ * in order to minimize overhead.
+ *
+ * The global load average is an exponentially decaying average of nr_running +
+ * nr_uninterruptible.
+ *
+ * Once every LOAD_FREQ:
+ *
+ *   nr_active = 0;
+ *   for_each_possible_cpu(cpu)
+ *	nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
+ *
+ *   avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
+ *
+ * Due to a number of reasons the above turns into the mess below:
+ *
+ *  - for_each_possible_cpu() is prohibitively expensive on machines with a
+ *    serious number of cpus, therefore we need to take a distributed approach
+ *    to calculating nr_active.
+ *
+ *        \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
+ *                      = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
+ *
+ *    So assuming nr_active := 0 when we start out -- true per definition -- we
+ *    can simply take per-cpu deltas and fold those into a global accumulate
+ *    to obtain the same result. See calc_load_fold_active().
+ *
+ *    Furthermore, in order to avoid synchronizing all per-cpu delta folding
+ *    across the machine, we assume 10 ticks is sufficient time for every
+ *    cpu to have completed this task.
+ *
+ *    This places an upper-bound on the IRQ-off latency of the machine. Then
+ *    again, being late doesn't lose the delta, just wrecks the sample.
+ *
+ *  - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
+ *    this would add another cross-cpu cacheline miss and atomic operation
+ *    to the wakeup path. Instead we increment on whatever cpu the task ran
+ *    when it went into uninterruptible state and decrement on whatever cpu
+ *    did the wakeup. This means that only the sum of nr_uninterruptible over
+ *    all cpus yields the correct result.
+ *
+ * This covers the NO_HZ=n code; for extra headaches, see the comment below.
+ */
+
 /* Variables and functions for calc_load */
 static atomic_long_t calc_load_tasks;
 static unsigned long calc_load_update;
 unsigned long avenrun[3];
-EXPORT_SYMBOL(avenrun);
+EXPORT_SYMBOL(avenrun); /* should be removed */
+
+/**
+ * get_avenrun - get the load average array
+ * @loads:	pointer to dest load array
+ * @offset:	offset to add
+ * @shift:	shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+	loads[0] = (avenrun[0] + offset) << shift;
+	loads[1] = (avenrun[1] + offset) << shift;
+	loads[2] = (avenrun[2] + offset) << shift;
+}
 
 static long calc_load_fold_active(struct rq *this_rq)
 {
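
The comment block above describes avenrun[] as a fixed-point (FSHIFT = 11) exponentially decaying average, now exported to readers through get_avenrun(). As a minimal userspace-style sketch, not part of this patch, of how those fixed-point values become the familiar /proc/loadavg numbers; the LOAD_INT/LOAD_FRAC convention and the FIXED_1/200 rounding offset are assumed to match include/linux/sched.h and fs/proc/loadavg.c:

/*
 * Sketch: turning FSHIFT fixed-point avenrun[] values into "1.50" style
 * numbers.  Standalone userspace illustration; constants mirror the usual
 * include/linux/sched.h definitions.
 */
#include <stdio.h>

#define FSHIFT		11		/* nr of bits of precision */
#define FIXED_1		(1 << FSHIFT)	/* 1.0 as fixed-point */

#define LOAD_INT(x)	((x) >> FSHIFT)
#define LOAD_FRAC(x)	LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

int main(void)
{
	/* pretend get_avenrun() handed us these, offset by FIXED_1/200 for rounding */
	unsigned long avnrun[3] = {
		(1 * FIXED_1 + FIXED_1 / 2) + FIXED_1 / 200,	/* ~1.50 */
		(0 * FIXED_1 + FIXED_1 / 4) + FIXED_1 / 200,	/* ~0.25 */
		(0 * FIXED_1 + 3 * FIXED_1 / 4) + FIXED_1 / 200,	/* ~0.75 */
	};

	printf("%lu.%02lu %lu.%02lu %lu.%02lu\n",
	       LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
	       LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
	       LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]));
	return 0;
}

The offset parameter of get_avenrun() exists precisely for that rounding term, and the shift parameter lets callers rescale without losing the fractional bits.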
@@ -2182,6 +2243,9 @@ static long calc_load_fold_active(struct rq *this_rq)
 	return delta;
 }
 
+/*
+ * a1 = a0 * e + a * (1 - e)
+ */
 static unsigned long
 calc_load(unsigned long load, unsigned long exp, unsigned long active)
 {
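
The one-line comment added above calc_load() is the whole story: a1 = a0 * e + a * (1 - e), evaluated in FSHIFT fixed point. A standalone sketch of that step follows; EXP_1/EXP_5/EXP_15 are the usual decay constants from include/linux/sched.h, and decay_step() is an illustrative stand-in for calc_load(), not the kernel function itself:

/*
 * One fixed-point decay step:
 *   new = old * e + active * (1 - e),  everything scaled by 2^FSHIFT.
 * Standalone illustration.
 */
#include <stdio.h>

#define FSHIFT	11
#define FIXED_1	(1UL << FSHIFT)
#define EXP_1	1884		/* 1/exp(5sec/1min)  as fixed-point */
#define EXP_5	2014		/* 1/exp(5sec/5min)  */
#define EXP_15	2037		/* 1/exp(5sec/15min) */

static unsigned long decay_step(unsigned long load, unsigned long exp,
				unsigned long active)
{
	load *= exp;				/* a0 * e      */
	load += active * (FIXED_1 - exp);	/* a * (1 - e) */
	return load >> FSHIFT;			/* back to 2^11 scale */
}

int main(void)
{
	unsigned long avg = 0;			/* start from an idle machine */
	unsigned long active = 2 * FIXED_1;	/* 2 runnable tasks, fixed-point */

	/* six LOAD_FREQ (5s) periods of constant load 2.00 */
	for (int i = 0; i < 6; i++) {
		avg = decay_step(avg, EXP_1, active);
		printf("after %2ds: %lu.%02lu\n", (i + 1) * 5,
		       avg >> FSHIFT, ((avg & (FIXED_1 - 1)) * 100) >> FSHIFT);
	}
	return 0;
}

Running this shows the 1-minute average creeping up toward 2.00 at the familiar sluggish pace; the 5 and 15 minute constants simply decay more slowly.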
@@ -2193,30 +2257,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
 
 #ifdef CONFIG_NO_HZ
 /*
- * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
+ * Handle NO_HZ for the global load-average.
+ *
+ * Since the above described distributed algorithm to compute the global
+ * load-average relies on per-cpu sampling from the tick, it is affected by
+ * NO_HZ.
+ *
+ * The basic idea is to fold the nr_active delta into a global idle-delta upon
+ * entering NO_HZ state such that we can include this as an 'extra' cpu delta
+ * when we read the global state.
+ *
+ * Obviously reality has to ruin such a delightfully simple scheme:
+ *
+ *  - When we go NO_HZ idle during the window, we can negate our sample
+ *    contribution, causing under-accounting.
+ *
+ *    We avoid this by keeping two idle-delta counters and flipping them
+ *    when the window starts, thus separating old and new NO_HZ load.
+ *
+ *    The only trick is the slight shift in index flip for read vs write.
+ *
+ *        0s            5s            10s           15s
+ *          +10           +10           +10           +10
+ *        |-|-----------|-|-----------|-|-----------|-|
+ *    r:0 0 1           1 0           0 1           1 0
+ *    w:0 1 1           0 0           1 1           0 0
+ *
+ *    This ensures we'll fold the old idle contribution in this window while
+ *    accumulating the new one.
+ *
+ *  - When we wake up from NO_HZ idle during the window, we push up our
+ *    contribution, since we effectively move our sample point to a known
+ *    busy state.
+ *
+ *    This is solved by pushing the window forward, and thus skipping the
+ *    sample, for this cpu (effectively using the idle-delta for this cpu which
+ *    was in effect at the time the window opened). This also solves the issue
+ *    of having to deal with a cpu having been in NOHZ idle for multiple
+ *    LOAD_FREQ intervals.
  *
  * When making the ILB scale, we should try to pull this in as well.
  */
-static atomic_long_t calc_load_tasks_idle;
+static atomic_long_t calc_load_idle[2];
+static int calc_load_idx;
 
-void calc_load_account_idle(struct rq *this_rq)
+static inline int calc_load_write_idx(void)
 {
+	int idx = calc_load_idx;
+
+	/*
+	 * See calc_global_nohz(); if we observe the new index, we also
+	 * need to observe the new update time.
+	 */
+	smp_rmb();
+
+	/*
+	 * If the folding window started, make sure we start writing in the
+	 * next idle-delta.
+	 */
+	if (!time_before(jiffies, calc_load_update))
+		idx++;
+
+	return idx & 1;
+}
+
+static inline int calc_load_read_idx(void)
+{
+	return calc_load_idx & 1;
+}
+
+void calc_load_enter_idle(void)
+{
+	struct rq *this_rq = this_rq();
 	long delta;
 
+	/*
+	 * We're going into NOHZ mode; if there's any pending delta, fold it
+	 * into the pending idle delta.
+	 */
 	delta = calc_load_fold_active(this_rq);
-	if (delta)
-		atomic_long_add(delta, &calc_load_tasks_idle);
+	if (delta) {
+		int idx = calc_load_write_idx();
+		atomic_long_add(delta, &calc_load_idle[idx]);
+	}
 }
 
-static long calc_load_fold_idle(void)
+void calc_load_exit_idle(void)
 {
-	long delta = 0;
+	struct rq *this_rq = this_rq();
+
+	/*
+	 * If we're still before the sample window, we're done.
+	 */
+	if (time_before(jiffies, this_rq->calc_load_update))
+		return;
 
 	/*
-	 * Its got a race, we don't care...
+	 * We woke inside or after the sample window, which means we're
+	 * already accounted through the nohz accounting, so skip the entire
+	 * deal and sync up for the next window.
 	 */
-	if (atomic_long_read(&calc_load_tasks_idle))
-		delta = atomic_long_xchg(&calc_load_tasks_idle, 0);
+	this_rq->calc_load_update = calc_load_update;
+	if (time_before(jiffies, this_rq->calc_load_update + 10))
+		this_rq->calc_load_update += LOAD_FREQ;
+}
+
+static long calc_load_fold_idle(void)
+{
+	int idx = calc_load_read_idx();
+	long delta = 0;
+
+	if (atomic_long_read(&calc_load_idle[idx]))
+		delta = atomic_long_xchg(&calc_load_idle[idx], 0);
 
 	return delta;
 }
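
The write/read index split described in the comment above is easiest to see in isolation. Below is a toy userspace C11 model of the calc_load_idle[2] double buffer; enter_idle(), fold_idle() and flip_window() are illustrative names, not kernel APIs, and the memory-ordering details (smp_rmb/smp_wmb) are deliberately omitted. It shows how a delta written after the window has opened only becomes visible one fold later:

/*
 * Toy model: writers fold their delta into the "current" slot (shifted by
 * one once the window has opened), the reader drains the other slot exactly
 * once per window and then flips the index.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_long idle_delta[2];
static atomic_int  idx;		/* flipped once per sample window */

/* writer side: a cpu going idle folds its pending delta */
static void enter_idle(long delta, bool window_open)
{
	int i = atomic_load(&idx);

	if (window_open)	/* window started: write into the *next* slot */
		i++;

	atomic_fetch_add(&idle_delta[i & 1], delta);
}

/* reader side: the cpu doing the global update drains the *old* slot */
static long fold_idle(void)
{
	int i = atomic_load(&idx) & 1;

	return atomic_exchange(&idle_delta[i], 0);
}

/* called once per window, after folding: accumulate in the other slot next */
static void flip_window(void)
{
	atomic_fetch_add(&idx, 1);
}

int main(void)
{
	enter_idle(+3, false);			/* cpu went idle before the window */
	printf("folded %ld\n", fold_idle());	/* reader picks up the 3 */
	flip_window();

	enter_idle(+2, true);			/* idle again, new window already open */
	printf("folded %ld\n", fold_idle());	/* old slot is empty ... */
	flip_window();
	printf("folded %ld\n", fold_idle());	/* ... the 2 shows up next window */
	return 0;
}

In the real code the "window_open" test is the time_before(jiffies, calc_load_update) check in calc_load_write_idx(), and the flip happens in calc_global_nohz() after the fold.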
@@ -2302,66 +2454,39 @@ static void calc_global_nohz(void)
 {
 	long delta, active, n;
 
-	/*
-	 * If we crossed a calc_load_update boundary, make sure to fold
-	 * any pending idle changes, the respective CPUs might have
-	 * missed the tick driven calc_load_account_active() update
-	 * due to NO_HZ.
-	 */
-	delta = calc_load_fold_idle();
-	if (delta)
-		atomic_long_add(delta, &calc_load_tasks);
-
-	/*
-	 * It could be the one fold was all it took, we done!
-	 */
-	if (time_before(jiffies, calc_load_update + 10))
-		return;
-
-	/*
-	 * Catch-up, fold however many we are behind still
-	 */
-	delta = jiffies - calc_load_update - 10;
-	n = 1 + (delta / LOAD_FREQ);
-
-	active = atomic_long_read(&calc_load_tasks);
-	active = active > 0 ? active * FIXED_1 : 0;
-
-	avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
-	avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
-	avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
-
-	calc_load_update += n * LOAD_FREQ;
-}
-#else
-void calc_load_account_idle(struct rq *this_rq)
-{
-}
+	if (!time_before(jiffies, calc_load_update + 10)) {
+		/*
+		 * Catch-up, fold however many we are behind still
+		 */
+		delta = jiffies - calc_load_update - 10;
+		n = 1 + (delta / LOAD_FREQ);
+
+		active = atomic_long_read(&calc_load_tasks);
+		active = active > 0 ? active * FIXED_1 : 0;
+
+		avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+		avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+		avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+
+		calc_load_update += n * LOAD_FREQ;
+	}
+
+	/*
+	 * Flip the idle index...
+	 *
+	 * Make sure we first write the new time then flip the index, so that
+	 * calc_load_write_idx() will see the new time when it reads the new
+	 * index; this avoids a double flip messing things up.
+	 */
+	smp_wmb();
+	calc_load_idx++;
+}
+#else /* !CONFIG_NO_HZ */
 
-static inline long calc_load_fold_idle(void)
-{
-	return 0;
-}
+static inline long calc_load_fold_idle(void) { return 0; }
+static inline void calc_global_nohz(void) { }
 
-static void calc_global_nohz(void)
-{
-}
-#endif
-
-/**
- * get_avenrun - get the load average array
- * @loads:	pointer to dest load array
- * @offset:	offset to add
- * @shift:	shift count to shift the result left
- *
- * These values are estimates at best, so no need for locking.
- */
-void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
-{
-	loads[0] = (avenrun[0] + offset) << shift;
-	loads[1] = (avenrun[1] + offset) << shift;
-	loads[2] = (avenrun[2] + offset) << shift;
-}
+#endif /* CONFIG_NO_HZ */
 
 /*
  * calc_load - update the avenrun load estimates 10 ticks after the
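
The catch-up path above ages the averages by n missed periods in one go via calc_load_n(), which boils down to computing exp^n in fixed point. A simplified standalone sketch of the same idea, exponentiation by squaring on 2^11-scaled values, follows; fixed_pow() and decay_n() are illustrative names, and the kernel's own helper is more general:

/*
 * Decaying by exp n times is the same as one decay with exp^n, so compute
 * exp^n in fixed point by square-and-multiply, then do a single blend.
 * Simplified standalone illustration.
 */
#define FSHIFT	11
#define FIXED_1	(1UL << FSHIFT)

unsigned long fixed_pow(unsigned long x, unsigned int n)
{
	unsigned long result = FIXED_1;	/* 1.0 */

	while (n) {
		if (n & 1) {
			result *= x;
			result += 1UL << (FSHIFT - 1);	/* round */
			result >>= FSHIFT;
		}
		x *= x;
		x += 1UL << (FSHIFT - 1);
		x >>= FSHIFT;
		n >>= 1;
	}
	return result;
}

/* age 'load' toward 'active' across n missed LOAD_FREQ periods at once */
unsigned long decay_n(unsigned long load, unsigned long exp,
		      unsigned long active, unsigned int n)
{
	unsigned long e_n = fixed_pow(exp, n);

	return (load * e_n + active * (FIXED_1 - e_n)) >> FSHIFT;
}

With n = 1 this reduces to the single calc_load() step shown earlier, which is why the catch-up can be folded into the regular update path.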
@@ -2369,11 +2494,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
  */
 void calc_global_load(unsigned long ticks)
 {
-	long active;
+	long active, delta;
 
 	if (time_before(jiffies, calc_load_update + 10))
 		return;
 
+	/*
+	 * Fold the 'old' idle-delta to include all NO_HZ cpus.
+	 */
+	delta = calc_load_fold_idle();
+	if (delta)
+		atomic_long_add(delta, &calc_load_tasks);
+
 	active = atomic_long_read(&calc_load_tasks);
 	active = active > 0 ? active * FIXED_1 : 0;
 
@@ -2384,12 +2516,7 @@ void calc_global_load(unsigned long ticks)
 	calc_load_update += LOAD_FREQ;
 
 	/*
-	 * Account one period with whatever state we found before
-	 * folding in the nohz state and ageing the entire idle period.
-	 *
-	 * This avoids loosing a sample when we go idle between
-	 * calc_load_account_active() (10 ticks ago) and now and thus
-	 * under-accounting.
+	 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
 	 */
 	calc_global_nohz();
 }
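
The 10-tick grace period mentioned throughout is the only synchronization between the per-cpu folds and the global avenrun[] update: each cpu folds at calc_load_update, the global update runs 10 ticks later. A rough sketch of that window arithmetic, assuming LOAD_FREQ = 5*HZ+1 as in include/linux/sched.h; jiffies_before(), tick() and global_update() are illustrative, not kernel functions:

/*
 * Window timing sketch: per-cpu folds at the window start, global avenrun
 * update 10 ticks later so every cpu has had a chance to fold.
 */
#define HZ		1000		/* illustrative; HZ is config dependent */
#define LOAD_FREQ	(5 * HZ + 1)	/* 5 sec intervals */

int jiffies_before(unsigned long a, unsigned long b)
{
	return (long)(a - b) < 0;	/* wrap-safe, like time_before() */
}

/* per-cpu tick path: fold this cpu's delta once per window */
void tick(unsigned long now, unsigned long *rq_update)
{
	if (jiffies_before(now, *rq_update))
		return;

	/* ... fold this cpu's nr_active delta into calc_load_tasks ... */
	*rq_update += LOAD_FREQ;
}

/* global path: update avenrun[] 10 ticks later, when all cpus have folded */
void global_update(unsigned long now, unsigned long *calc_update)
{
	if (jiffies_before(now, *calc_update + 10))
		return;

	/* ... read calc_load_tasks and age avenrun[] ... */
	*calc_update += LOAD_FREQ;
}

A cpu that is more than 10 ticks late does not lose its delta, it only misses this sample; the delta is simply picked up by the next window, exactly as the comment at the top of this patch says.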
@@ -2406,7 +2533,6 @@ static void calc_load_account_active(struct rq *this_rq)
 		return;
 
 	delta = calc_load_fold_active(this_rq);
-	delta += calc_load_fold_idle();
 	if (delta)
 		atomic_long_add(delta, &calc_load_tasks);
 
@@ -2414,6 +2540,10 @@ static void calc_load_account_active(struct rq *this_rq)
 }
 
 /*
+ * End of global load-average stuff
+ */
+
+/*
  * The exact cpuload at various idx values, calculated at every tick would be
  * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
  *