Diffstat (limited to 'kernel/sched/loadavg.c')
-rw-r--r-- | kernel/sched/loadavg.c | 394
1 file changed, 394 insertions, 0 deletions
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
new file mode 100644
index 000000000000..ef7159012cf3
--- /dev/null
+++ b/kernel/sched/loadavg.c
@@ -0,0 +1,394 @@
/*
 * kernel/sched/loadavg.c
 *
 * This file contains the magic bits required to compute the global loadavg
 * figure. It's a silly number but people think it's important. We go through
 * great pains to make it work on big machines and tickless kernels.
 */

#include <linux/export.h>

#include "sched.h"

/*
 * Global load-average calculations
 *
 * We take a distributed and async approach to calculating the global load-avg
 * in order to minimize overhead.
 *
 * The global load average is an exponentially decaying average of nr_running +
 * nr_uninterruptible.
 *
 * Once every LOAD_FREQ:
 *
 *   nr_active = 0;
 *   for_each_possible_cpu(cpu)
 *           nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
 *
 *   avenrun[n] = avenrun[n] * exp_n + nr_active * (1 - exp_n)
 *
 * Due to a number of reasons the above turns into the mess below:
 *
 *  - for_each_possible_cpu() is prohibitively expensive on machines with a
 *    serious number of cpus, therefore we need to take a distributed approach
 *    to calculating nr_active.
 *
 *        \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
 *                      = \Sum_i { \Sum_{j=1} x_i(t_j) - x_i(t_{j-1}) }
 *
 *    So assuming nr_active := 0 when we start out -- true by definition -- we
 *    can simply take per-cpu deltas and fold those into a global accumulate
 *    to obtain the same result. See calc_load_fold_active().
 *
 *    Furthermore, in order to avoid synchronizing all per-cpu delta folding
 *    across the machine, we assume 10 ticks is sufficient time for every
 *    cpu to have completed this task.
 *
 *    This places an upper-bound on the IRQ-off latency of the machine. Then
 *    again, being late doesn't lose the delta, it just wrecks the sample.
 *
 *  - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
 *    this would add another cross-cpu cacheline miss and atomic operation
 *    to the wakeup path. Instead we increment on whatever cpu the task ran
 *    when it went into uninterruptible state and decrement on whatever cpu
 *    did the wakeup. This means that only the sum of nr_uninterruptible over
 *    all cpus yields the correct result.
 *
 * This covers the NO_HZ=n code; for extra headaches, see the comment below.
 */
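
/*
 * For illustration, a minimal sketch of the folding scheme with two CPUs
 * (the names follow the code below; the numbers are made up):
 *
 *   CPU0: calc_load_active was 2, now sees nr_running + nr_uninterruptible == 4
 *         -> folds a delta of +2 into calc_load_tasks
 *   CPU1: calc_load_active was 3, now sees nr_running + nr_uninterruptible == 1
 *         -> folds a delta of -2 into calc_load_tasks
 *
 * After both folds calc_load_tasks has moved by 0, exactly as if we had
 * summed nr_running + nr_uninterruptible over all CPUs in one go.
 */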
59 | |||
60 | /* Variables and functions for calc_load */ | ||
61 | atomic_long_t calc_load_tasks; | ||
62 | unsigned long calc_load_update; | ||
63 | unsigned long avenrun[3]; | ||
64 | EXPORT_SYMBOL(avenrun); /* should be removed */ | ||
65 | |||
66 | /** | ||
67 | * get_avenrun - get the load average array | ||
68 | * @loads: pointer to dest load array | ||
69 | * @offset: offset to add | ||
70 | * @shift: shift count to shift the result left | ||
71 | * | ||
72 | * These values are estimates at best, so no need for locking. | ||
73 | */ | ||
74 | void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | ||
75 | { | ||
76 | loads[0] = (avenrun[0] + offset) << shift; | ||
77 | loads[1] = (avenrun[1] + offset) << shift; | ||
78 | loads[2] = (avenrun[2] + offset) << shift; | ||
79 | } | ||
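
/*
 * Usage sketch (assumes the usual FIXED_1/LOAD_INT()/LOAD_FRAC() helpers
 * from the loadavg header; this mirrors roughly what a reader such as
 * /proc/loadavg does):
 *
 *   unsigned long avnrun[3];
 *
 *   get_avenrun(avnrun, FIXED_1/200, 0);
 *   seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu\n",
 *              LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
 *              LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
 *              LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]));
 *
 * The FIXED_1/200 offset rounds to two decimal places and
 * LOAD_INT()/LOAD_FRAC() split the fixed-point value for display.
 */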
80 | |||
81 | long calc_load_fold_active(struct rq *this_rq) | ||
82 | { | ||
83 | long nr_active, delta = 0; | ||
84 | |||
85 | nr_active = this_rq->nr_running; | ||
86 | nr_active += (long)this_rq->nr_uninterruptible; | ||
87 | |||
88 | if (nr_active != this_rq->calc_load_active) { | ||
89 | delta = nr_active - this_rq->calc_load_active; | ||
90 | this_rq->calc_load_active = nr_active; | ||
91 | } | ||
92 | |||
93 | return delta; | ||
94 | } | ||
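
/*
 * For example (made-up numbers): if this rq last folded with
 * calc_load_active == 3 and now has 2 runnable plus 3 uninterruptible
 * tasks, nr_active == 5 and the function returns +2, which the caller
 * adds to calc_load_tasks (or to the idle delta when going NO_HZ).
 */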
95 | |||
96 | /* | ||
97 | * a1 = a0 * e + a * (1 - e) | ||
98 | */ | ||
99 | static unsigned long | ||
100 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | ||
101 | { | ||
102 | load *= exp; | ||
103 | load += active * (FIXED_1 - exp); | ||
104 | load += 1UL << (FSHIFT - 1); | ||
105 | return load >> FSHIFT; | ||
106 | } | ||
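
/*
 * Worked example, assuming the usual FSHIFT == 11 (FIXED_1 == 2048) and
 * EXP_1 == 1884 from the loadavg header: one task becomes runnable on an
 * otherwise idle system, so active == 1 * FIXED_1 == 2048 and the 1-minute
 * average starts at 0:
 *
 *   calc_load(0, 1884, 2048)
 *           = (0 * 1884 + 2048 * (2048 - 1884) + 1024) >> 11
 *           = 164
 *
 * i.e. roughly 0.08 -- the familiar first step of the 1-minute loadavg.
 */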
107 | |||
108 | #ifdef CONFIG_NO_HZ_COMMON | ||
109 | /* | ||
110 | * Handle NO_HZ for the global load-average. | ||
111 | * | ||
112 | * Since the above described distributed algorithm to compute the global | ||
113 | * load-average relies on per-cpu sampling from the tick, it is affected by | ||
114 | * NO_HZ. | ||
115 | * | ||
116 | * The basic idea is to fold the nr_active delta into a global idle-delta upon | ||
117 | * entering NO_HZ state such that we can include this as an 'extra' cpu delta | ||
118 | * when we read the global state. | ||
119 | * | ||
120 | * Obviously reality has to ruin such a delightfully simple scheme: | ||
121 | * | ||
122 | * - When we go NO_HZ idle during the window, we can negate our sample | ||
123 | * contribution, causing under-accounting. | ||
124 | * | ||
125 | * We avoid this by keeping two idle-delta counters and flipping them | ||
126 | * when the window starts, thus separating old and new NO_HZ load. | ||
127 | * | ||
128 | * The only trick is the slight shift in index flip for read vs write. | ||
129 | * | ||
130 | * 0s 5s 10s 15s | ||
131 | * +10 +10 +10 +10 | ||
132 | * |-|-----------|-|-----------|-|-----------|-| | ||
133 | * r:0 0 1 1 0 0 1 1 0 | ||
134 | * w:0 1 1 0 0 1 1 0 0 | ||
135 | * | ||
136 | * This ensures we'll fold the old idle contribution in this window while | ||
137 | * accumlating the new one. | ||
138 | * | ||
139 | * - When we wake up from NO_HZ idle during the window, we push up our | ||
140 | * contribution, since we effectively move our sample point to a known | ||
141 | * busy state. | ||
142 | * | ||
143 | * This is solved by pushing the window forward, and thus skipping the | ||
144 | * sample, for this cpu (effectively using the idle-delta for this cpu which | ||
145 | * was in effect at the time the window opened). This also solves the issue | ||
146 | * of having to deal with a cpu having been in NOHZ idle for multiple | ||
147 | * LOAD_FREQ intervals. | ||
148 | * | ||
149 | * When making the ILB scale, we should try to pull this in as well. | ||
150 | */ | ||
151 | static atomic_long_t calc_load_idle[2]; | ||
152 | static int calc_load_idx; | ||
153 | |||
154 | static inline int calc_load_write_idx(void) | ||
155 | { | ||
156 | int idx = calc_load_idx; | ||
157 | |||
158 | /* | ||
159 | * See calc_global_nohz(), if we observe the new index, we also | ||
160 | * need to observe the new update time. | ||
161 | */ | ||
162 | smp_rmb(); | ||
163 | |||
164 | /* | ||
165 | * If the folding window started, make sure we start writing in the | ||
166 | * next idle-delta. | ||
167 | */ | ||
168 | if (!time_before(jiffies, calc_load_update)) | ||
169 | idx++; | ||
170 | |||
171 | return idx & 1; | ||
172 | } | ||
173 | |||
174 | static inline int calc_load_read_idx(void) | ||
175 | { | ||
176 | return calc_load_idx & 1; | ||
177 | } | ||
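
/*
 * A concrete run-through of the index flip (sketch; assumes calc_load_idx
 * starts at 0): while jiffies is still before calc_load_update, idle CPUs
 * fold into calc_load_idle[0]. Once the window opens, calc_load_write_idx()
 * returns 1, so new idle deltas land in calc_load_idle[1], while
 * calc_load_read_idx() still returns 0, so calc_global_load() folds and
 * clears calc_load_idle[0] for the window that just closed. Only then does
 * calc_global_nohz() flip calc_load_idx to 1.
 */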
178 | |||
179 | void calc_load_enter_idle(void) | ||
180 | { | ||
181 | struct rq *this_rq = this_rq(); | ||
182 | long delta; | ||
183 | |||
184 | /* | ||
185 | * We're going into NOHZ mode, if there's any pending delta, fold it | ||
186 | * into the pending idle delta. | ||
187 | */ | ||
188 | delta = calc_load_fold_active(this_rq); | ||
189 | if (delta) { | ||
190 | int idx = calc_load_write_idx(); | ||
191 | |||
192 | atomic_long_add(delta, &calc_load_idle[idx]); | ||
193 | } | ||
194 | } | ||
195 | |||
196 | void calc_load_exit_idle(void) | ||
197 | { | ||
198 | struct rq *this_rq = this_rq(); | ||
199 | |||
200 | /* | ||
201 | * If we're still before the sample window, we're done. | ||
202 | */ | ||
203 | if (time_before(jiffies, this_rq->calc_load_update)) | ||
204 | return; | ||
205 | |||
206 | /* | ||
207 | * We woke inside or after the sample window, this means we're already | ||
208 | * accounted through the nohz accounting, so skip the entire deal and | ||
209 | * sync up for the next window. | ||
210 | */ | ||
211 | this_rq->calc_load_update = calc_load_update; | ||
212 | if (time_before(jiffies, this_rq->calc_load_update + 10)) | ||
213 | this_rq->calc_load_update += LOAD_FREQ; | ||
214 | } | ||
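
/*
 * For example (sketch): a CPU that wakes up three ticks into the sample
 * window already has its contribution for this window sitting in
 * calc_load_idle[] (folded by calc_load_enter_idle() when it went idle),
 * so it syncs its calc_load_update and then skips ahead one LOAD_FREQ
 * rather than folding a fresh, busier sample from its next tick.
 */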
215 | |||
216 | static long calc_load_fold_idle(void) | ||
217 | { | ||
218 | int idx = calc_load_read_idx(); | ||
219 | long delta = 0; | ||
220 | |||
221 | if (atomic_long_read(&calc_load_idle[idx])) | ||
222 | delta = atomic_long_xchg(&calc_load_idle[idx], 0); | ||
223 | |||
224 | return delta; | ||
225 | } | ||
226 | |||
227 | /** | ||
228 | * fixed_power_int - compute: x^n, in O(log n) time | ||
229 | * | ||
230 | * @x: base of the power | ||
231 | * @frac_bits: fractional bits of @x | ||
232 | * @n: power to raise @x to. | ||
233 | * | ||
234 | * By exploiting the relation between the definition of the natural power | ||
235 | * function: x^n := x*x*...*x (x multiplied by itself for n times), and | ||
236 | * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, | ||
237 | * (where: n_i \elem {0, 1}, the binary vector representing n), | ||
238 | * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is | ||
239 | * of course trivially computable in O(log_2 n), the length of our binary | ||
240 | * vector. | ||
241 | */ | ||
242 | static unsigned long | ||
243 | fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) | ||
244 | { | ||
245 | unsigned long result = 1UL << frac_bits; | ||
246 | |||
247 | if (n) { | ||
248 | for (;;) { | ||
249 | if (n & 1) { | ||
250 | result *= x; | ||
251 | result += 1UL << (frac_bits - 1); | ||
252 | result >>= frac_bits; | ||
253 | } | ||
254 | n >>= 1; | ||
255 | if (!n) | ||
256 | break; | ||
257 | x *= x; | ||
258 | x += 1UL << (frac_bits - 1); | ||
259 | x >>= frac_bits; | ||
260 | } | ||
261 | } | ||
262 | |||
263 | return result; | ||
264 | } | ||
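
/*
 * Worked example (using the usual EXP_1 == 1884, FSHIFT == 11): missing
 * four LOAD_FREQ periods requires EXP_1^4, which the loop above computes
 * with two squarings and one multiply:
 *
 *   fixed_power_int(1884, 11, 4) == 1466
 *
 * i.e. about 0.716 in fixed point, matching (1884/2048)^4 ~= 0.72.
 */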
265 | |||
266 | /* | ||
267 | * a1 = a0 * e + a * (1 - e) | ||
268 | * | ||
269 | * a2 = a1 * e + a * (1 - e) | ||
270 | * = (a0 * e + a * (1 - e)) * e + a * (1 - e) | ||
271 | * = a0 * e^2 + a * (1 - e) * (1 + e) | ||
272 | * | ||
273 | * a3 = a2 * e + a * (1 - e) | ||
274 | * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) | ||
275 | * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) | ||
276 | * | ||
277 | * ... | ||
278 | * | ||
279 | * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] | ||
280 | * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) | ||
281 | * = a0 * e^n + a * (1 - e^n) | ||
282 | * | ||
283 | * [1] application of the geometric series: | ||
284 | * | ||
285 | * n 1 - x^(n+1) | ||
286 | * S_n := \Sum x^i = ------------- | ||
287 | * i=0 1 - x | ||
288 | */ | ||
289 | static unsigned long | ||
290 | calc_load_n(unsigned long load, unsigned long exp, | ||
291 | unsigned long active, unsigned int n) | ||
292 | { | ||
293 | return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); | ||
294 | } | ||
295 | |||
296 | /* | ||
297 | * NO_HZ can leave us missing all per-cpu ticks calling | ||
298 | * calc_load_account_active(), but since an idle CPU folds its delta into | ||
299 | * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold | ||
300 | * in the pending idle delta if our idle period crossed a load cycle boundary. | ||
301 | * | ||
302 | * Once we've updated the global active value, we need to apply the exponential | ||
303 | * weights adjusted to the number of cycles missed. | ||
304 | */ | ||
305 | static void calc_global_nohz(void) | ||
306 | { | ||
307 | long delta, active, n; | ||
308 | |||
309 | if (!time_before(jiffies, calc_load_update + 10)) { | ||
310 | /* | ||
311 | * Catch-up, fold however many we are behind still | ||
312 | */ | ||
313 | delta = jiffies - calc_load_update - 10; | ||
314 | n = 1 + (delta / LOAD_FREQ); | ||
315 | |||
316 | active = atomic_long_read(&calc_load_tasks); | ||
317 | active = active > 0 ? active * FIXED_1 : 0; | ||
318 | |||
319 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); | ||
320 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); | ||
321 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); | ||
322 | |||
323 | calc_load_update += n * LOAD_FREQ; | ||
324 | } | ||
325 | |||
326 | /* | ||
327 | * Flip the idle index... | ||
328 | * | ||
329 | * Make sure we first write the new time then flip the index, so that | ||
330 | * calc_load_write_idx() will see the new time when it reads the new | ||
331 | * index, this avoids a double flip messing things up. | ||
332 | */ | ||
333 | smp_wmb(); | ||
334 | calc_load_idx++; | ||
335 | } | ||
#else /* !CONFIG_NO_HZ_COMMON */

static inline long calc_load_fold_idle(void) { return 0; }
static inline void calc_global_nohz(void) { }

#endif /* CONFIG_NO_HZ_COMMON */

/*
 * calc_global_load - update the avenrun load estimates 10 ticks after the
 * CPUs have updated calc_load_tasks.
 *
 * Called from the global timer code.
 */
void calc_global_load(unsigned long ticks)
{
        long active, delta;

        if (time_before(jiffies, calc_load_update + 10))
                return;

        /*
         * Fold the 'old' idle-delta to include all NO_HZ cpus.
         */
        delta = calc_load_fold_idle();
        if (delta)
                atomic_long_add(delta, &calc_load_tasks);

        active = atomic_long_read(&calc_load_tasks);
        active = active > 0 ? active * FIXED_1 : 0;

        avenrun[0] = calc_load(avenrun[0], EXP_1, active);
        avenrun[1] = calc_load(avenrun[1], EXP_5, active);
        avenrun[2] = calc_load(avenrun[2], EXP_15, active);

        calc_load_update += LOAD_FREQ;

        /*
         * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
         */
        calc_global_nohz();
}
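
/*
 * Putting the pieces together, one LOAD_FREQ cycle looks roughly like this:
 * at calc_load_update each busy CPU folds its own delta from its tick via
 * calc_global_load_tick(), while CPUs that are NO_HZ idle have already
 * folded theirs into calc_load_idle[]; ten ticks later calc_global_load()
 * adds the idle contribution, recomputes avenrun[] and opens the next
 * window.
 */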
377 | |||
378 | /* | ||
379 | * Called from scheduler_tick() to periodically update this CPU's | ||
380 | * active count. | ||
381 | */ | ||
382 | void calc_global_load_tick(struct rq *this_rq) | ||
383 | { | ||
384 | long delta; | ||
385 | |||
386 | if (time_before(jiffies, this_rq->calc_load_update)) | ||
387 | return; | ||
388 | |||
389 | delta = calc_load_fold_active(this_rq); | ||
390 | if (delta) | ||
391 | atomic_long_add(delta, &calc_load_tasks); | ||
392 | |||
393 | this_rq->calc_load_update += LOAD_FREQ; | ||
394 | } | ||
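
/*
 * End-to-end example (sketch; assumes FIXED_1 == 2048 and EXP_1 == 1884):
 * a single CPU runs one always-runnable task starting from avenrun[0] == 0.
 * Successive LOAD_FREQ samples give
 *
 *   0 -> 164 -> 315 -> ...
 *
 * converging on FIXED_1, which /proc/loadavg would display as
 * 0.00, 0.08, 0.15, ... tending towards 1.00.
 */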