commit 45ceebf77653975815d82fcf7cec0a164215ae11
tree   07171b93e073a58f80747885614451e0a5705728
parent 534c97b0950b1967bca1c753aeaed32f5db40264
author    Paul Gortmaker <paul.gortmaker@windriver.com>    2013-04-19 15:10:49 -0400
committer Ingo Molnar <mingo@kernel.org>                   2013-05-07 07:14:50 -0400
sched: Factor out load calculation code from sched/core.c --> sched/proc.c
This large chunk of load calculation code can be easily divorced
from the main core.c scheduler file, with only a couple
prototypes and externs added to a kernel/sched header.
Some recent commits expanded the code and the documentation of
it, making it large enough to warrant separation. For example,
see:
556061b, "sched/nohz: Fix rq->cpu_load[] calculations"
5aaa0b7, "sched/nohz: Fix rq->cpu_load calculations some more"
5167e8d, "sched/nohz: Rewrite and fix load-avg computation -- again"
More importantly, it helps reduce the size of the main
sched/core.c by yet another significant amount (~600 lines).
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Link: http://lkml.kernel.org/r/1366398650-31599-2-git-send-email-paul.gortmaker@windriver.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
 kernel/sched/Makefile |   2 +-
 kernel/sched/core.c   | 569 --------------------------------
 kernel/sched/proc.c   | 578 ++++++++++++++++++++++++++++++++
 kernel/sched/sched.h  |   8 ++++
 4 files changed, 587 insertions(+), 570 deletions(-)
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index deaf90e4a1de..54adcf35f495 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,7 +11,7 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
-obj-y += core.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
+obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
 obj-$(CONFIG_SMP) += cpupri.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 58453b8272fd..bfa7e77e0b50 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2056,575 +2056,6 @@ unsigned long nr_iowait_cpu(int cpu)
 	return atomic_read(&this->nr_iowait);
 }
 
2059 | unsigned long this_cpu_load(void) | ||
2060 | { | ||
2061 | struct rq *this = this_rq(); | ||
2062 | return this->cpu_load[0]; | ||
2063 | } | ||
2064 | |||
2065 | |||
2066 | /* | ||
2067 | * Global load-average calculations | ||
2068 | * | ||
2069 | * We take a distributed and async approach to calculating the global load-avg | ||
2070 | * in order to minimize overhead. | ||
2071 | * | ||
2072 | * The global load average is an exponentially decaying average of nr_running + | ||
2073 | * nr_uninterruptible. | ||
2074 | * | ||
2075 | * Once every LOAD_FREQ: | ||
2076 | * | ||
2077 | * nr_active = 0; | ||
2078 | * for_each_possible_cpu(cpu) | ||
2079 | * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; | ||
2080 | * | ||
2081 | * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) | ||
2082 | * | ||
2083 | * Due to a number of reasons the above turns in the mess below: | ||
2084 | * | ||
2085 | * - for_each_possible_cpu() is prohibitively expensive on machines with | ||
2086 | * serious number of cpus, therefore we need to take a distributed approach | ||
2087 | * to calculating nr_active. | ||
2088 | * | ||
2089 | * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 | ||
2090 | * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } | ||
2091 | * | ||
2092 | * So assuming nr_active := 0 when we start out -- true per definition, we | ||
2093 | * can simply take per-cpu deltas and fold those into a global accumulate | ||
2094 | * to obtain the same result. See calc_load_fold_active(). | ||
2095 | * | ||
2096 | * Furthermore, in order to avoid synchronizing all per-cpu delta folding | ||
2097 | * across the machine, we assume 10 ticks is sufficient time for every | ||
2098 | * cpu to have completed this task. | ||
2099 | * | ||
2100 | * This places an upper-bound on the IRQ-off latency of the machine. Then | ||
2101 | * again, being late doesn't loose the delta, just wrecks the sample. | ||
2102 | * | ||
2103 | * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because | ||
2104 | * this would add another cross-cpu cacheline miss and atomic operation | ||
2105 | * to the wakeup path. Instead we increment on whatever cpu the task ran | ||
2106 | * when it went into uninterruptible state and decrement on whatever cpu | ||
2107 | * did the wakeup. This means that only the sum of nr_uninterruptible over | ||
2108 | * all cpus yields the correct result. | ||
2109 | * | ||
2110 | * This covers the NO_HZ=n code, for extra head-aches, see the comment below. | ||
2111 | */ | ||
2112 | |||
2113 | /* Variables and functions for calc_load */ | ||
2114 | static atomic_long_t calc_load_tasks; | ||
2115 | static unsigned long calc_load_update; | ||
2116 | unsigned long avenrun[3]; | ||
2117 | EXPORT_SYMBOL(avenrun); /* should be removed */ | ||
2118 | |||
2119 | /** | ||
2120 | * get_avenrun - get the load average array | ||
2121 | * @loads: pointer to dest load array | ||
2122 | * @offset: offset to add | ||
2123 | * @shift: shift count to shift the result left | ||
2124 | * | ||
2125 | * These values are estimates at best, so no need for locking. | ||
2126 | */ | ||
2127 | void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | ||
2128 | { | ||
2129 | loads[0] = (avenrun[0] + offset) << shift; | ||
2130 | loads[1] = (avenrun[1] + offset) << shift; | ||
2131 | loads[2] = (avenrun[2] + offset) << shift; | ||
2132 | } | ||
2133 | |||
2134 | static long calc_load_fold_active(struct rq *this_rq) | ||
2135 | { | ||
2136 | long nr_active, delta = 0; | ||
2137 | |||
2138 | nr_active = this_rq->nr_running; | ||
2139 | nr_active += (long) this_rq->nr_uninterruptible; | ||
2140 | |||
2141 | if (nr_active != this_rq->calc_load_active) { | ||
2142 | delta = nr_active - this_rq->calc_load_active; | ||
2143 | this_rq->calc_load_active = nr_active; | ||
2144 | } | ||
2145 | |||
2146 | return delta; | ||
2147 | } | ||
2148 | |||
2149 | /* | ||
2150 | * a1 = a0 * e + a * (1 - e) | ||
2151 | */ | ||
2152 | static unsigned long | ||
2153 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | ||
2154 | { | ||
2155 | load *= exp; | ||
2156 | load += active * (FIXED_1 - exp); | ||
2157 | load += 1UL << (FSHIFT - 1); | ||
2158 | return load >> FSHIFT; | ||
2159 | } | ||
2160 | |||
2161 | #ifdef CONFIG_NO_HZ_COMMON | ||
2162 | /* | ||
2163 | * Handle NO_HZ for the global load-average. | ||
2164 | * | ||
2165 | * Since the above described distributed algorithm to compute the global | ||
2166 | * load-average relies on per-cpu sampling from the tick, it is affected by | ||
2167 | * NO_HZ. | ||
2168 | * | ||
2169 | * The basic idea is to fold the nr_active delta into a global idle-delta upon | ||
2170 | * entering NO_HZ state such that we can include this as an 'extra' cpu delta | ||
2171 | * when we read the global state. | ||
2172 | * | ||
2173 | * Obviously reality has to ruin such a delightfully simple scheme: | ||
2174 | * | ||
2175 | * - When we go NO_HZ idle during the window, we can negate our sample | ||
2176 | * contribution, causing under-accounting. | ||
2177 | * | ||
2178 | * We avoid this by keeping two idle-delta counters and flipping them | ||
2179 | * when the window starts, thus separating old and new NO_HZ load. | ||
2180 | * | ||
2181 | * The only trick is the slight shift in index flip for read vs write. | ||
2182 | * | ||
2183 | * 0s 5s 10s 15s | ||
2184 | * +10 +10 +10 +10 | ||
2185 | * |-|-----------|-|-----------|-|-----------|-| | ||
2186 | * r:0 0 1 1 0 0 1 1 0 | ||
2187 | * w:0 1 1 0 0 1 1 0 0 | ||
2188 | * | ||
2189 | * This ensures we'll fold the old idle contribution in this window while | ||
2190 | * accumlating the new one. | ||
2191 | * | ||
2192 | * - When we wake up from NO_HZ idle during the window, we push up our | ||
2193 | * contribution, since we effectively move our sample point to a known | ||
2194 | * busy state. | ||
2195 | * | ||
2196 | * This is solved by pushing the window forward, and thus skipping the | ||
2197 | * sample, for this cpu (effectively using the idle-delta for this cpu which | ||
2198 | * was in effect at the time the window opened). This also solves the issue | ||
2199 | * of having to deal with a cpu having been in NOHZ idle for multiple | ||
2200 | * LOAD_FREQ intervals. | ||
2201 | * | ||
2202 | * When making the ILB scale, we should try to pull this in as well. | ||
2203 | */ | ||
2204 | static atomic_long_t calc_load_idle[2]; | ||
2205 | static int calc_load_idx; | ||
2206 | |||
2207 | static inline int calc_load_write_idx(void) | ||
2208 | { | ||
2209 | int idx = calc_load_idx; | ||
2210 | |||
2211 | /* | ||
2212 | * See calc_global_nohz(), if we observe the new index, we also | ||
2213 | * need to observe the new update time. | ||
2214 | */ | ||
2215 | smp_rmb(); | ||
2216 | |||
2217 | /* | ||
2218 | * If the folding window started, make sure we start writing in the | ||
2219 | * next idle-delta. | ||
2220 | */ | ||
2221 | if (!time_before(jiffies, calc_load_update)) | ||
2222 | idx++; | ||
2223 | |||
2224 | return idx & 1; | ||
2225 | } | ||
2226 | |||
2227 | static inline int calc_load_read_idx(void) | ||
2228 | { | ||
2229 | return calc_load_idx & 1; | ||
2230 | } | ||
2231 | |||
2232 | void calc_load_enter_idle(void) | ||
2233 | { | ||
2234 | struct rq *this_rq = this_rq(); | ||
2235 | long delta; | ||
2236 | |||
2237 | /* | ||
2238 | * We're going into NOHZ mode, if there's any pending delta, fold it | ||
2239 | * into the pending idle delta. | ||
2240 | */ | ||
2241 | delta = calc_load_fold_active(this_rq); | ||
2242 | if (delta) { | ||
2243 | int idx = calc_load_write_idx(); | ||
2244 | atomic_long_add(delta, &calc_load_idle[idx]); | ||
2245 | } | ||
2246 | } | ||
2247 | |||
2248 | void calc_load_exit_idle(void) | ||
2249 | { | ||
2250 | struct rq *this_rq = this_rq(); | ||
2251 | |||
2252 | /* | ||
2253 | * If we're still before the sample window, we're done. | ||
2254 | */ | ||
2255 | if (time_before(jiffies, this_rq->calc_load_update)) | ||
2256 | return; | ||
2257 | |||
2258 | /* | ||
2259 | * We woke inside or after the sample window, this means we're already | ||
2260 | * accounted through the nohz accounting, so skip the entire deal and | ||
2261 | * sync up for the next window. | ||
2262 | */ | ||
2263 | this_rq->calc_load_update = calc_load_update; | ||
2264 | if (time_before(jiffies, this_rq->calc_load_update + 10)) | ||
2265 | this_rq->calc_load_update += LOAD_FREQ; | ||
2266 | } | ||
2267 | |||
2268 | static long calc_load_fold_idle(void) | ||
2269 | { | ||
2270 | int idx = calc_load_read_idx(); | ||
2271 | long delta = 0; | ||
2272 | |||
2273 | if (atomic_long_read(&calc_load_idle[idx])) | ||
2274 | delta = atomic_long_xchg(&calc_load_idle[idx], 0); | ||
2275 | |||
2276 | return delta; | ||
2277 | } | ||
2278 | |||
2279 | /** | ||
2280 | * fixed_power_int - compute: x^n, in O(log n) time | ||
2281 | * | ||
2282 | * @x: base of the power | ||
2283 | * @frac_bits: fractional bits of @x | ||
2284 | * @n: power to raise @x to. | ||
2285 | * | ||
2286 | * By exploiting the relation between the definition of the natural power | ||
2287 | * function: x^n := x*x*...*x (x multiplied by itself for n times), and | ||
2288 | * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, | ||
2289 | * (where: n_i \elem {0, 1}, the binary vector representing n), | ||
2290 | * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is | ||
2291 | * of course trivially computable in O(log_2 n), the length of our binary | ||
2292 | * vector. | ||
2293 | */ | ||
2294 | static unsigned long | ||
2295 | fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) | ||
2296 | { | ||
2297 | unsigned long result = 1UL << frac_bits; | ||
2298 | |||
2299 | if (n) for (;;) { | ||
2300 | if (n & 1) { | ||
2301 | result *= x; | ||
2302 | result += 1UL << (frac_bits - 1); | ||
2303 | result >>= frac_bits; | ||
2304 | } | ||
2305 | n >>= 1; | ||
2306 | if (!n) | ||
2307 | break; | ||
2308 | x *= x; | ||
2309 | x += 1UL << (frac_bits - 1); | ||
2310 | x >>= frac_bits; | ||
2311 | } | ||
2312 | |||
2313 | return result; | ||
2314 | } | ||
2315 | |||
2316 | /* | ||
2317 | * a1 = a0 * e + a * (1 - e) | ||
2318 | * | ||
2319 | * a2 = a1 * e + a * (1 - e) | ||
2320 | * = (a0 * e + a * (1 - e)) * e + a * (1 - e) | ||
2321 | * = a0 * e^2 + a * (1 - e) * (1 + e) | ||
2322 | * | ||
2323 | * a3 = a2 * e + a * (1 - e) | ||
2324 | * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) | ||
2325 | * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) | ||
2326 | * | ||
2327 | * ... | ||
2328 | * | ||
2329 | * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] | ||
2330 | * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) | ||
2331 | * = a0 * e^n + a * (1 - e^n) | ||
2332 | * | ||
2333 | * [1] application of the geometric series: | ||
2334 | * | ||
2335 | * n 1 - x^(n+1) | ||
2336 | * S_n := \Sum x^i = ------------- | ||
2337 | * i=0 1 - x | ||
2338 | */ | ||
2339 | static unsigned long | ||
2340 | calc_load_n(unsigned long load, unsigned long exp, | ||
2341 | unsigned long active, unsigned int n) | ||
2342 | { | ||
2343 | |||
2344 | return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); | ||
2345 | } | ||
2346 | |||
2347 | /* | ||
2348 | * NO_HZ can leave us missing all per-cpu ticks calling | ||
2349 | * calc_load_account_active(), but since an idle CPU folds its delta into | ||
2350 | * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold | ||
2351 | * in the pending idle delta if our idle period crossed a load cycle boundary. | ||
2352 | * | ||
2353 | * Once we've updated the global active value, we need to apply the exponential | ||
2354 | * weights adjusted to the number of cycles missed. | ||
2355 | */ | ||
2356 | static void calc_global_nohz(void) | ||
2357 | { | ||
2358 | long delta, active, n; | ||
2359 | |||
2360 | if (!time_before(jiffies, calc_load_update + 10)) { | ||
2361 | /* | ||
2362 | * Catch-up, fold however many we are behind still | ||
2363 | */ | ||
2364 | delta = jiffies - calc_load_update - 10; | ||
2365 | n = 1 + (delta / LOAD_FREQ); | ||
2366 | |||
2367 | active = atomic_long_read(&calc_load_tasks); | ||
2368 | active = active > 0 ? active * FIXED_1 : 0; | ||
2369 | |||
2370 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); | ||
2371 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); | ||
2372 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); | ||
2373 | |||
2374 | calc_load_update += n * LOAD_FREQ; | ||
2375 | } | ||
2376 | |||
2377 | /* | ||
2378 | * Flip the idle index... | ||
2379 | * | ||
2380 | * Make sure we first write the new time then flip the index, so that | ||
2381 | * calc_load_write_idx() will see the new time when it reads the new | ||
2382 | * index, this avoids a double flip messing things up. | ||
2383 | */ | ||
2384 | smp_wmb(); | ||
2385 | calc_load_idx++; | ||
2386 | } | ||
2387 | #else /* !CONFIG_NO_HZ_COMMON */ | ||
2388 | |||
2389 | static inline long calc_load_fold_idle(void) { return 0; } | ||
2390 | static inline void calc_global_nohz(void) { } | ||
2391 | |||
2392 | #endif /* CONFIG_NO_HZ_COMMON */ | ||
2393 | |||
2394 | /* | ||
2395 | * calc_load - update the avenrun load estimates 10 ticks after the | ||
2396 | * CPUs have updated calc_load_tasks. | ||
2397 | */ | ||
2398 | void calc_global_load(unsigned long ticks) | ||
2399 | { | ||
2400 | long active, delta; | ||
2401 | |||
2402 | if (time_before(jiffies, calc_load_update + 10)) | ||
2403 | return; | ||
2404 | |||
2405 | /* | ||
2406 | * Fold the 'old' idle-delta to include all NO_HZ cpus. | ||
2407 | */ | ||
2408 | delta = calc_load_fold_idle(); | ||
2409 | if (delta) | ||
2410 | atomic_long_add(delta, &calc_load_tasks); | ||
2411 | |||
2412 | active = atomic_long_read(&calc_load_tasks); | ||
2413 | active = active > 0 ? active * FIXED_1 : 0; | ||
2414 | |||
2415 | avenrun[0] = calc_load(avenrun[0], EXP_1, active); | ||
2416 | avenrun[1] = calc_load(avenrun[1], EXP_5, active); | ||
2417 | avenrun[2] = calc_load(avenrun[2], EXP_15, active); | ||
2418 | |||
2419 | calc_load_update += LOAD_FREQ; | ||
2420 | |||
2421 | /* | ||
2422 | * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. | ||
2423 | */ | ||
2424 | calc_global_nohz(); | ||
2425 | } | ||
2426 | |||
2427 | /* | ||
2428 | * Called from update_cpu_load() to periodically update this CPU's | ||
2429 | * active count. | ||
2430 | */ | ||
2431 | static void calc_load_account_active(struct rq *this_rq) | ||
2432 | { | ||
2433 | long delta; | ||
2434 | |||
2435 | if (time_before(jiffies, this_rq->calc_load_update)) | ||
2436 | return; | ||
2437 | |||
2438 | delta = calc_load_fold_active(this_rq); | ||
2439 | if (delta) | ||
2440 | atomic_long_add(delta, &calc_load_tasks); | ||
2441 | |||
2442 | this_rq->calc_load_update += LOAD_FREQ; | ||
2443 | } | ||
2444 | |||
2445 | /* | ||
2446 | * End of global load-average stuff | ||
2447 | */ | ||
2448 | |||
2449 | /* | ||
2450 | * The exact cpuload at various idx values, calculated at every tick would be | ||
2451 | * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load | ||
2452 | * | ||
2453 | * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called | ||
2454 | * on nth tick when cpu may be busy, then we have: | ||
2455 | * load = ((2^idx - 1) / 2^idx)^(n-1) * load | ||
2456 | * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load | ||
2457 | * | ||
2458 | * decay_load_missed() below does efficient calculation of | ||
2459 | * load = ((2^idx - 1) / 2^idx)^(n-1) * load | ||
2460 | * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load | ||
2461 | * | ||
2462 | * The calculation is approximated on a 128 point scale. | ||
2463 | * degrade_zero_ticks is the number of ticks after which load at any | ||
2464 | * particular idx is approximated to be zero. | ||
2465 | * degrade_factor is a precomputed table, a row for each load idx. | ||
2466 | * Each column corresponds to degradation factor for a power of two ticks, | ||
2467 | * based on 128 point scale. | ||
2468 | * Example: | ||
2469 | * row 2, col 3 (=12) says that the degradation at load idx 2 after | ||
2470 | * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). | ||
2471 | * | ||
2472 | * With this power of 2 load factors, we can degrade the load n times | ||
2473 | * by looking at 1 bits in n and doing as many mult/shift instead of | ||
2474 | * n mult/shifts needed by the exact degradation. | ||
2475 | */ | ||
2476 | #define DEGRADE_SHIFT 7 | ||
2477 | static const unsigned char | ||
2478 | degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; | ||
2479 | static const unsigned char | ||
2480 | degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { | ||
2481 | {0, 0, 0, 0, 0, 0, 0, 0}, | ||
2482 | {64, 32, 8, 0, 0, 0, 0, 0}, | ||
2483 | {96, 72, 40, 12, 1, 0, 0}, | ||
2484 | {112, 98, 75, 43, 15, 1, 0}, | ||
2485 | {120, 112, 98, 76, 45, 16, 2} }; | ||
2486 | |||
2487 | /* | ||
2488 | * Update cpu_load for any missed ticks, due to tickless idle. The backlog | ||
2489 | * would be when CPU is idle and so we just decay the old load without | ||
2490 | * adding any new load. | ||
2491 | */ | ||
2492 | static unsigned long | ||
2493 | decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) | ||
2494 | { | ||
2495 | int j = 0; | ||
2496 | |||
2497 | if (!missed_updates) | ||
2498 | return load; | ||
2499 | |||
2500 | if (missed_updates >= degrade_zero_ticks[idx]) | ||
2501 | return 0; | ||
2502 | |||
2503 | if (idx == 1) | ||
2504 | return load >> missed_updates; | ||
2505 | |||
2506 | while (missed_updates) { | ||
2507 | if (missed_updates % 2) | ||
2508 | load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; | ||
2509 | |||
2510 | missed_updates >>= 1; | ||
2511 | j++; | ||
2512 | } | ||
2513 | return load; | ||
2514 | } | ||
2515 | |||
2516 | /* | ||
2517 | * Update rq->cpu_load[] statistics. This function is usually called every | ||
2518 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called | ||
2519 | * every tick. We fix it up based on jiffies. | ||
2520 | */ | ||
2521 | static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, | ||
2522 | unsigned long pending_updates) | ||
2523 | { | ||
2524 | int i, scale; | ||
2525 | |||
2526 | this_rq->nr_load_updates++; | ||
2527 | |||
2528 | /* Update our load: */ | ||
2529 | this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ | ||
2530 | for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | ||
2531 | unsigned long old_load, new_load; | ||
2532 | |||
2533 | /* scale is effectively 1 << i now, and >> i divides by scale */ | ||
2534 | |||
2535 | old_load = this_rq->cpu_load[i]; | ||
2536 | old_load = decay_load_missed(old_load, pending_updates - 1, i); | ||
2537 | new_load = this_load; | ||
2538 | /* | ||
2539 | * Round up the averaging division if load is increasing. This | ||
2540 | * prevents us from getting stuck on 9 if the load is 10, for | ||
2541 | * example. | ||
2542 | */ | ||
2543 | if (new_load > old_load) | ||
2544 | new_load += scale - 1; | ||
2545 | |||
2546 | this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; | ||
2547 | } | ||
2548 | |||
2549 | sched_avg_update(this_rq); | ||
2550 | } | ||
2551 | |||
2552 | #ifdef CONFIG_NO_HZ_COMMON | ||
2553 | /* | ||
2554 | * There is no sane way to deal with nohz on smp when using jiffies because the | ||
2555 | * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading | ||
2556 | * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. | ||
2557 | * | ||
2558 | * Therefore we cannot use the delta approach from the regular tick since that | ||
2559 | * would seriously skew the load calculation. However we'll make do for those | ||
2560 | * updates happening while idle (nohz_idle_balance) or coming out of idle | ||
2561 | * (tick_nohz_idle_exit). | ||
2562 | * | ||
2563 | * This means we might still be one tick off for nohz periods. | ||
2564 | */ | ||
2565 | |||
2566 | /* | ||
2567 | * Called from nohz_idle_balance() to update the load ratings before doing the | ||
2568 | * idle balance. | ||
2569 | */ | ||
2570 | void update_idle_cpu_load(struct rq *this_rq) | ||
2571 | { | ||
2572 | unsigned long curr_jiffies = ACCESS_ONCE(jiffies); | ||
2573 | unsigned long load = this_rq->load.weight; | ||
2574 | unsigned long pending_updates; | ||
2575 | |||
2576 | /* | ||
2577 | * bail if there's load or we're actually up-to-date. | ||
2578 | */ | ||
2579 | if (load || curr_jiffies == this_rq->last_load_update_tick) | ||
2580 | return; | ||
2581 | |||
2582 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
2583 | this_rq->last_load_update_tick = curr_jiffies; | ||
2584 | |||
2585 | __update_cpu_load(this_rq, load, pending_updates); | ||
2586 | } | ||
2587 | |||
2588 | /* | ||
2589 | * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. | ||
2590 | */ | ||
2591 | void update_cpu_load_nohz(void) | ||
2592 | { | ||
2593 | struct rq *this_rq = this_rq(); | ||
2594 | unsigned long curr_jiffies = ACCESS_ONCE(jiffies); | ||
2595 | unsigned long pending_updates; | ||
2596 | |||
2597 | if (curr_jiffies == this_rq->last_load_update_tick) | ||
2598 | return; | ||
2599 | |||
2600 | raw_spin_lock(&this_rq->lock); | ||
2601 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
2602 | if (pending_updates) { | ||
2603 | this_rq->last_load_update_tick = curr_jiffies; | ||
2604 | /* | ||
2605 | * We were idle, this means load 0, the current load might be | ||
2606 | * !0 due to remote wakeups and the sort. | ||
2607 | */ | ||
2608 | __update_cpu_load(this_rq, 0, pending_updates); | ||
2609 | } | ||
2610 | raw_spin_unlock(&this_rq->lock); | ||
2611 | } | ||
2612 | #endif /* CONFIG_NO_HZ_COMMON */ | ||
2613 | |||
2614 | /* | ||
2615 | * Called from scheduler_tick() | ||
2616 | */ | ||
2617 | static void update_cpu_load_active(struct rq *this_rq) | ||
2618 | { | ||
2619 | /* | ||
2620 | * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). | ||
2621 | */ | ||
2622 | this_rq->last_load_update_tick = jiffies; | ||
2623 | __update_cpu_load(this_rq, this_rq->load.weight, 1); | ||
2624 | |||
2625 | calc_load_account_active(this_rq); | ||
2626 | } | ||
2627 | |||
 #ifdef CONFIG_SMP
 
 /*
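
The calc_load() routine being moved is plain fixed-point exponential smoothing, a1 = a0 * e + a * (1 - e), with FIXED_1 playing the role of 1.0. A minimal userspace sketch of the same arithmetic follows; the FSHIFT/FIXED_1/EXP_1 constants are the standard values from the kernel headers of this era, while the 3-task workload and the loop are invented purely for illustration and are not part of the patch.

/*
 * Standalone sketch of the fixed-point averaging done by calc_load().
 * Constants mirror the kernel's FSHIFT/EXP_* values; this is an
 * illustration of the arithmetic, not kernel code.
 */
#include <stdio.h>

#define FSHIFT   11                  /* bits of fixed-point precision */
#define FIXED_1  (1UL << FSHIFT)     /* 1.0 in fixed-point */
#define EXP_1    1884                /* 1/exp(5s/1min) in fixed-point */

/* a1 = a0 * e + a * (1 - e), rounded to nearest */
static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	load += 1UL << (FSHIFT - 1);
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long avenrun0 = 0;
	unsigned long active = 3 * FIXED_1;	/* pretend 3 tasks are runnable */
	int i;

	/* ten 5-second LOAD_FREQ periods with a constant load of 3 */
	for (i = 0; i < 10; i++) {
		avenrun0 = calc_load(avenrun0, EXP_1, active);
		printf("sample %2d: loadavg-1min = %lu.%02lu\n", i,
		       avenrun0 >> FSHIFT,
		       ((avenrun0 & (FIXED_1 - 1)) * 100) >> FSHIFT);
	}
	return 0;
}

Because 1 - EXP_1/FIXED_1 is about 0.08, each sample closes roughly 8% of the remaining gap, so after ten samples the 1-minute figure has climbed to about 1.7 of the eventual 3.00.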
diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c
new file mode 100644
index 000000000000..bb3a6a0b8623
--- /dev/null
+++ b/kernel/sched/proc.c
@@ -0,0 +1,578 @@
1 | /* | ||
2 | * kernel/sched/proc.c | ||
3 | * | ||
4 | * Kernel load calculations, forked from sched/core.c | ||
5 | */ | ||
6 | |||
7 | #include <linux/export.h> | ||
8 | |||
9 | #include "sched.h" | ||
10 | |||
11 | unsigned long this_cpu_load(void) | ||
12 | { | ||
13 | struct rq *this = this_rq(); | ||
14 | return this->cpu_load[0]; | ||
15 | } | ||
16 | |||
17 | |||
18 | /* | ||
19 | * Global load-average calculations | ||
20 | * | ||
21 | * We take a distributed and async approach to calculating the global load-avg | ||
22 | * in order to minimize overhead. | ||
23 | * | ||
24 | * The global load average is an exponentially decaying average of nr_running + | ||
25 | * nr_uninterruptible. | ||
26 | * | ||
27 | * Once every LOAD_FREQ: | ||
28 | * | ||
29 | * nr_active = 0; | ||
30 | * for_each_possible_cpu(cpu) | ||
31 | * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; | ||
32 | * | ||
33 | * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) | ||
34 | * | ||
35 | * Due to a number of reasons the above turns in the mess below: | ||
36 | * | ||
37 | * - for_each_possible_cpu() is prohibitively expensive on machines with | ||
38 | * serious number of cpus, therefore we need to take a distributed approach | ||
39 | * to calculating nr_active. | ||
40 | * | ||
41 | * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 | ||
42 | * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } | ||
43 | * | ||
44 | * So assuming nr_active := 0 when we start out -- true per definition, we | ||
45 | * can simply take per-cpu deltas and fold those into a global accumulate | ||
46 | * to obtain the same result. See calc_load_fold_active(). | ||
47 | * | ||
48 | * Furthermore, in order to avoid synchronizing all per-cpu delta folding | ||
49 | * across the machine, we assume 10 ticks is sufficient time for every | ||
50 | * cpu to have completed this task. | ||
51 | * | ||
52 | * This places an upper-bound on the IRQ-off latency of the machine. Then | ||
53 | * again, being late doesn't loose the delta, just wrecks the sample. | ||
54 | * | ||
55 | * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because | ||
56 | * this would add another cross-cpu cacheline miss and atomic operation | ||
57 | * to the wakeup path. Instead we increment on whatever cpu the task ran | ||
58 | * when it went into uninterruptible state and decrement on whatever cpu | ||
59 | * did the wakeup. This means that only the sum of nr_uninterruptible over | ||
60 | * all cpus yields the correct result. | ||
61 | * | ||
62 | * This covers the NO_HZ=n code, for extra head-aches, see the comment below. | ||
63 | */ | ||
64 | |||
65 | /* Variables and functions for calc_load */ | ||
66 | atomic_long_t calc_load_tasks; | ||
67 | unsigned long calc_load_update; | ||
68 | unsigned long avenrun[3]; | ||
69 | EXPORT_SYMBOL(avenrun); /* should be removed */ | ||
70 | |||
71 | /** | ||
72 | * get_avenrun - get the load average array | ||
73 | * @loads: pointer to dest load array | ||
74 | * @offset: offset to add | ||
75 | * @shift: shift count to shift the result left | ||
76 | * | ||
77 | * These values are estimates at best, so no need for locking. | ||
78 | */ | ||
79 | void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | ||
80 | { | ||
81 | loads[0] = (avenrun[0] + offset) << shift; | ||
82 | loads[1] = (avenrun[1] + offset) << shift; | ||
83 | loads[2] = (avenrun[2] + offset) << shift; | ||
84 | } | ||
85 | |||
86 | long calc_load_fold_active(struct rq *this_rq) | ||
87 | { | ||
88 | long nr_active, delta = 0; | ||
89 | |||
90 | nr_active = this_rq->nr_running; | ||
91 | nr_active += (long) this_rq->nr_uninterruptible; | ||
92 | |||
93 | if (nr_active != this_rq->calc_load_active) { | ||
94 | delta = nr_active - this_rq->calc_load_active; | ||
95 | this_rq->calc_load_active = nr_active; | ||
96 | } | ||
97 | |||
98 | return delta; | ||
99 | } | ||
100 | |||
101 | /* | ||
102 | * a1 = a0 * e + a * (1 - e) | ||
103 | */ | ||
104 | static unsigned long | ||
105 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | ||
106 | { | ||
107 | load *= exp; | ||
108 | load += active * (FIXED_1 - exp); | ||
109 | load += 1UL << (FSHIFT - 1); | ||
110 | return load >> FSHIFT; | ||
111 | } | ||
112 | |||
113 | #ifdef CONFIG_NO_HZ_COMMON | ||
114 | /* | ||
115 | * Handle NO_HZ for the global load-average. | ||
116 | * | ||
117 | * Since the above described distributed algorithm to compute the global | ||
118 | * load-average relies on per-cpu sampling from the tick, it is affected by | ||
119 | * NO_HZ. | ||
120 | * | ||
121 | * The basic idea is to fold the nr_active delta into a global idle-delta upon | ||
122 | * entering NO_HZ state such that we can include this as an 'extra' cpu delta | ||
123 | * when we read the global state. | ||
124 | * | ||
125 | * Obviously reality has to ruin such a delightfully simple scheme: | ||
126 | * | ||
127 | * - When we go NO_HZ idle during the window, we can negate our sample | ||
128 | * contribution, causing under-accounting. | ||
129 | * | ||
130 | * We avoid this by keeping two idle-delta counters and flipping them | ||
131 | * when the window starts, thus separating old and new NO_HZ load. | ||
132 | * | ||
133 | * The only trick is the slight shift in index flip for read vs write. | ||
134 | * | ||
135 | * 0s 5s 10s 15s | ||
136 | * +10 +10 +10 +10 | ||
137 | * |-|-----------|-|-----------|-|-----------|-| | ||
138 | * r:0 0 1 1 0 0 1 1 0 | ||
139 | * w:0 1 1 0 0 1 1 0 0 | ||
140 | * | ||
141 | * This ensures we'll fold the old idle contribution in this window while | ||
142 | * accumlating the new one. | ||
143 | * | ||
144 | * - When we wake up from NO_HZ idle during the window, we push up our | ||
145 | * contribution, since we effectively move our sample point to a known | ||
146 | * busy state. | ||
147 | * | ||
148 | * This is solved by pushing the window forward, and thus skipping the | ||
149 | * sample, for this cpu (effectively using the idle-delta for this cpu which | ||
150 | * was in effect at the time the window opened). This also solves the issue | ||
151 | * of having to deal with a cpu having been in NOHZ idle for multiple | ||
152 | * LOAD_FREQ intervals. | ||
153 | * | ||
154 | * When making the ILB scale, we should try to pull this in as well. | ||
155 | */ | ||
156 | static atomic_long_t calc_load_idle[2]; | ||
157 | static int calc_load_idx; | ||
158 | |||
159 | static inline int calc_load_write_idx(void) | ||
160 | { | ||
161 | int idx = calc_load_idx; | ||
162 | |||
163 | /* | ||
164 | * See calc_global_nohz(), if we observe the new index, we also | ||
165 | * need to observe the new update time. | ||
166 | */ | ||
167 | smp_rmb(); | ||
168 | |||
169 | /* | ||
170 | * If the folding window started, make sure we start writing in the | ||
171 | * next idle-delta. | ||
172 | */ | ||
173 | if (!time_before(jiffies, calc_load_update)) | ||
174 | idx++; | ||
175 | |||
176 | return idx & 1; | ||
177 | } | ||
178 | |||
179 | static inline int calc_load_read_idx(void) | ||
180 | { | ||
181 | return calc_load_idx & 1; | ||
182 | } | ||
183 | |||
184 | void calc_load_enter_idle(void) | ||
185 | { | ||
186 | struct rq *this_rq = this_rq(); | ||
187 | long delta; | ||
188 | |||
189 | /* | ||
190 | * We're going into NOHZ mode, if there's any pending delta, fold it | ||
191 | * into the pending idle delta. | ||
192 | */ | ||
193 | delta = calc_load_fold_active(this_rq); | ||
194 | if (delta) { | ||
195 | int idx = calc_load_write_idx(); | ||
196 | atomic_long_add(delta, &calc_load_idle[idx]); | ||
197 | } | ||
198 | } | ||
199 | |||
200 | void calc_load_exit_idle(void) | ||
201 | { | ||
202 | struct rq *this_rq = this_rq(); | ||
203 | |||
204 | /* | ||
205 | * If we're still before the sample window, we're done. | ||
206 | */ | ||
207 | if (time_before(jiffies, this_rq->calc_load_update)) | ||
208 | return; | ||
209 | |||
210 | /* | ||
211 | * We woke inside or after the sample window, this means we're already | ||
212 | * accounted through the nohz accounting, so skip the entire deal and | ||
213 | * sync up for the next window. | ||
214 | */ | ||
215 | this_rq->calc_load_update = calc_load_update; | ||
216 | if (time_before(jiffies, this_rq->calc_load_update + 10)) | ||
217 | this_rq->calc_load_update += LOAD_FREQ; | ||
218 | } | ||
219 | |||
220 | static long calc_load_fold_idle(void) | ||
221 | { | ||
222 | int idx = calc_load_read_idx(); | ||
223 | long delta = 0; | ||
224 | |||
225 | if (atomic_long_read(&calc_load_idle[idx])) | ||
226 | delta = atomic_long_xchg(&calc_load_idle[idx], 0); | ||
227 | |||
228 | return delta; | ||
229 | } | ||
230 | |||
231 | /** | ||
232 | * fixed_power_int - compute: x^n, in O(log n) time | ||
233 | * | ||
234 | * @x: base of the power | ||
235 | * @frac_bits: fractional bits of @x | ||
236 | * @n: power to raise @x to. | ||
237 | * | ||
238 | * By exploiting the relation between the definition of the natural power | ||
239 | * function: x^n := x*x*...*x (x multiplied by itself for n times), and | ||
240 | * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, | ||
241 | * (where: n_i \elem {0, 1}, the binary vector representing n), | ||
242 | * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is | ||
243 | * of course trivially computable in O(log_2 n), the length of our binary | ||
244 | * vector. | ||
245 | */ | ||
246 | static unsigned long | ||
247 | fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) | ||
248 | { | ||
249 | unsigned long result = 1UL << frac_bits; | ||
250 | |||
251 | if (n) for (;;) { | ||
252 | if (n & 1) { | ||
253 | result *= x; | ||
254 | result += 1UL << (frac_bits - 1); | ||
255 | result >>= frac_bits; | ||
256 | } | ||
257 | n >>= 1; | ||
258 | if (!n) | ||
259 | break; | ||
260 | x *= x; | ||
261 | x += 1UL << (frac_bits - 1); | ||
262 | x >>= frac_bits; | ||
263 | } | ||
264 | |||
265 | return result; | ||
266 | } | ||
267 | |||
268 | /* | ||
269 | * a1 = a0 * e + a * (1 - e) | ||
270 | * | ||
271 | * a2 = a1 * e + a * (1 - e) | ||
272 | * = (a0 * e + a * (1 - e)) * e + a * (1 - e) | ||
273 | * = a0 * e^2 + a * (1 - e) * (1 + e) | ||
274 | * | ||
275 | * a3 = a2 * e + a * (1 - e) | ||
276 | * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) | ||
277 | * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) | ||
278 | * | ||
279 | * ... | ||
280 | * | ||
281 | * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] | ||
282 | * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) | ||
283 | * = a0 * e^n + a * (1 - e^n) | ||
284 | * | ||
285 | * [1] application of the geometric series: | ||
286 | * | ||
287 | * n 1 - x^(n+1) | ||
288 | * S_n := \Sum x^i = ------------- | ||
289 | * i=0 1 - x | ||
290 | */ | ||
291 | static unsigned long | ||
292 | calc_load_n(unsigned long load, unsigned long exp, | ||
293 | unsigned long active, unsigned int n) | ||
294 | { | ||
295 | |||
296 | return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); | ||
297 | } | ||
298 | |||
299 | /* | ||
300 | * NO_HZ can leave us missing all per-cpu ticks calling | ||
301 | * calc_load_account_active(), but since an idle CPU folds its delta into | ||
302 | * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold | ||
303 | * in the pending idle delta if our idle period crossed a load cycle boundary. | ||
304 | * | ||
305 | * Once we've updated the global active value, we need to apply the exponential | ||
306 | * weights adjusted to the number of cycles missed. | ||
307 | */ | ||
308 | static void calc_global_nohz(void) | ||
309 | { | ||
310 | long delta, active, n; | ||
311 | |||
312 | if (!time_before(jiffies, calc_load_update + 10)) { | ||
313 | /* | ||
314 | * Catch-up, fold however many we are behind still | ||
315 | */ | ||
316 | delta = jiffies - calc_load_update - 10; | ||
317 | n = 1 + (delta / LOAD_FREQ); | ||
318 | |||
319 | active = atomic_long_read(&calc_load_tasks); | ||
320 | active = active > 0 ? active * FIXED_1 : 0; | ||
321 | |||
322 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); | ||
323 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); | ||
324 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); | ||
325 | |||
326 | calc_load_update += n * LOAD_FREQ; | ||
327 | } | ||
328 | |||
329 | /* | ||
330 | * Flip the idle index... | ||
331 | * | ||
332 | * Make sure we first write the new time then flip the index, so that | ||
333 | * calc_load_write_idx() will see the new time when it reads the new | ||
334 | * index, this avoids a double flip messing things up. | ||
335 | */ | ||
336 | smp_wmb(); | ||
337 | calc_load_idx++; | ||
338 | } | ||
339 | #else /* !CONFIG_NO_HZ_COMMON */ | ||
340 | |||
341 | static inline long calc_load_fold_idle(void) { return 0; } | ||
342 | static inline void calc_global_nohz(void) { } | ||
343 | |||
344 | #endif /* CONFIG_NO_HZ_COMMON */ | ||
345 | |||
346 | /* | ||
347 | * calc_load - update the avenrun load estimates 10 ticks after the | ||
348 | * CPUs have updated calc_load_tasks. | ||
349 | */ | ||
350 | void calc_global_load(unsigned long ticks) | ||
351 | { | ||
352 | long active, delta; | ||
353 | |||
354 | if (time_before(jiffies, calc_load_update + 10)) | ||
355 | return; | ||
356 | |||
357 | /* | ||
358 | * Fold the 'old' idle-delta to include all NO_HZ cpus. | ||
359 | */ | ||
360 | delta = calc_load_fold_idle(); | ||
361 | if (delta) | ||
362 | atomic_long_add(delta, &calc_load_tasks); | ||
363 | |||
364 | active = atomic_long_read(&calc_load_tasks); | ||
365 | active = active > 0 ? active * FIXED_1 : 0; | ||
366 | |||
367 | avenrun[0] = calc_load(avenrun[0], EXP_1, active); | ||
368 | avenrun[1] = calc_load(avenrun[1], EXP_5, active); | ||
369 | avenrun[2] = calc_load(avenrun[2], EXP_15, active); | ||
370 | |||
371 | calc_load_update += LOAD_FREQ; | ||
372 | |||
373 | /* | ||
374 | * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. | ||
375 | */ | ||
376 | calc_global_nohz(); | ||
377 | } | ||
378 | |||
379 | /* | ||
380 | * Called from update_cpu_load() to periodically update this CPU's | ||
381 | * active count. | ||
382 | */ | ||
383 | static void calc_load_account_active(struct rq *this_rq) | ||
384 | { | ||
385 | long delta; | ||
386 | |||
387 | if (time_before(jiffies, this_rq->calc_load_update)) | ||
388 | return; | ||
389 | |||
390 | delta = calc_load_fold_active(this_rq); | ||
391 | if (delta) | ||
392 | atomic_long_add(delta, &calc_load_tasks); | ||
393 | |||
394 | this_rq->calc_load_update += LOAD_FREQ; | ||
395 | } | ||
396 | |||
397 | /* | ||
398 | * End of global load-average stuff | ||
399 | */ | ||
400 | |||
401 | /* | ||
402 | * The exact cpuload at various idx values, calculated at every tick would be | ||
403 | * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load | ||
404 | * | ||
405 | * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called | ||
406 | * on nth tick when cpu may be busy, then we have: | ||
407 | * load = ((2^idx - 1) / 2^idx)^(n-1) * load | ||
408 | * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load | ||
409 | * | ||
410 | * decay_load_missed() below does efficient calculation of | ||
411 | * load = ((2^idx - 1) / 2^idx)^(n-1) * load | ||
412 | * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load | ||
413 | * | ||
414 | * The calculation is approximated on a 128 point scale. | ||
415 | * degrade_zero_ticks is the number of ticks after which load at any | ||
416 | * particular idx is approximated to be zero. | ||
417 | * degrade_factor is a precomputed table, a row for each load idx. | ||
418 | * Each column corresponds to degradation factor for a power of two ticks, | ||
419 | * based on 128 point scale. | ||
420 | * Example: | ||
421 | * row 2, col 3 (=12) says that the degradation at load idx 2 after | ||
422 | * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). | ||
423 | * | ||
424 | * With this power of 2 load factors, we can degrade the load n times | ||
425 | * by looking at 1 bits in n and doing as many mult/shift instead of | ||
426 | * n mult/shifts needed by the exact degradation. | ||
427 | */ | ||
428 | #define DEGRADE_SHIFT 7 | ||
429 | static const unsigned char | ||
430 | degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; | ||
431 | static const unsigned char | ||
432 | degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { | ||
433 | {0, 0, 0, 0, 0, 0, 0, 0}, | ||
434 | {64, 32, 8, 0, 0, 0, 0, 0}, | ||
435 | {96, 72, 40, 12, 1, 0, 0}, | ||
436 | {112, 98, 75, 43, 15, 1, 0}, | ||
437 | {120, 112, 98, 76, 45, 16, 2} }; | ||
438 | |||
439 | /* | ||
440 | * Update cpu_load for any missed ticks, due to tickless idle. The backlog | ||
441 | * would be when CPU is idle and so we just decay the old load without | ||
442 | * adding any new load. | ||
443 | */ | ||
444 | static unsigned long | ||
445 | decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) | ||
446 | { | ||
447 | int j = 0; | ||
448 | |||
449 | if (!missed_updates) | ||
450 | return load; | ||
451 | |||
452 | if (missed_updates >= degrade_zero_ticks[idx]) | ||
453 | return 0; | ||
454 | |||
455 | if (idx == 1) | ||
456 | return load >> missed_updates; | ||
457 | |||
458 | while (missed_updates) { | ||
459 | if (missed_updates % 2) | ||
460 | load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT; | ||
461 | |||
462 | missed_updates >>= 1; | ||
463 | j++; | ||
464 | } | ||
465 | return load; | ||
466 | } | ||
467 | |||
468 | /* | ||
469 | * Update rq->cpu_load[] statistics. This function is usually called every | ||
470 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called | ||
471 | * every tick. We fix it up based on jiffies. | ||
472 | */ | ||
473 | static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, | ||
474 | unsigned long pending_updates) | ||
475 | { | ||
476 | int i, scale; | ||
477 | |||
478 | this_rq->nr_load_updates++; | ||
479 | |||
480 | /* Update our load: */ | ||
481 | this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ | ||
482 | for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | ||
483 | unsigned long old_load, new_load; | ||
484 | |||
485 | /* scale is effectively 1 << i now, and >> i divides by scale */ | ||
486 | |||
487 | old_load = this_rq->cpu_load[i]; | ||
488 | old_load = decay_load_missed(old_load, pending_updates - 1, i); | ||
489 | new_load = this_load; | ||
490 | /* | ||
491 | * Round up the averaging division if load is increasing. This | ||
492 | * prevents us from getting stuck on 9 if the load is 10, for | ||
493 | * example. | ||
494 | */ | ||
495 | if (new_load > old_load) | ||
496 | new_load += scale - 1; | ||
497 | |||
498 | this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; | ||
499 | } | ||
500 | |||
501 | sched_avg_update(this_rq); | ||
502 | } | ||
503 | |||
504 | #ifdef CONFIG_NO_HZ_COMMON | ||
505 | /* | ||
506 | * There is no sane way to deal with nohz on smp when using jiffies because the | ||
507 | * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading | ||
508 | * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. | ||
509 | * | ||
510 | * Therefore we cannot use the delta approach from the regular tick since that | ||
511 | * would seriously skew the load calculation. However we'll make do for those | ||
512 | * updates happening while idle (nohz_idle_balance) or coming out of idle | ||
513 | * (tick_nohz_idle_exit). | ||
514 | * | ||
515 | * This means we might still be one tick off for nohz periods. | ||
516 | */ | ||
517 | |||
518 | /* | ||
519 | * Called from nohz_idle_balance() to update the load ratings before doing the | ||
520 | * idle balance. | ||
521 | */ | ||
522 | void update_idle_cpu_load(struct rq *this_rq) | ||
523 | { | ||
524 | unsigned long curr_jiffies = ACCESS_ONCE(jiffies); | ||
525 | unsigned long load = this_rq->load.weight; | ||
526 | unsigned long pending_updates; | ||
527 | |||
528 | /* | ||
529 | * bail if there's load or we're actually up-to-date. | ||
530 | */ | ||
531 | if (load || curr_jiffies == this_rq->last_load_update_tick) | ||
532 | return; | ||
533 | |||
534 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
535 | this_rq->last_load_update_tick = curr_jiffies; | ||
536 | |||
537 | __update_cpu_load(this_rq, load, pending_updates); | ||
538 | } | ||
539 | |||
540 | /* | ||
541 | * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. | ||
542 | */ | ||
543 | void update_cpu_load_nohz(void) | ||
544 | { | ||
545 | struct rq *this_rq = this_rq(); | ||
546 | unsigned long curr_jiffies = ACCESS_ONCE(jiffies); | ||
547 | unsigned long pending_updates; | ||
548 | |||
549 | if (curr_jiffies == this_rq->last_load_update_tick) | ||
550 | return; | ||
551 | |||
552 | raw_spin_lock(&this_rq->lock); | ||
553 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
554 | if (pending_updates) { | ||
555 | this_rq->last_load_update_tick = curr_jiffies; | ||
556 | /* | ||
557 | * We were idle, this means load 0, the current load might be | ||
558 | * !0 due to remote wakeups and the sort. | ||
559 | */ | ||
560 | __update_cpu_load(this_rq, 0, pending_updates); | ||
561 | } | ||
562 | raw_spin_unlock(&this_rq->lock); | ||
563 | } | ||
564 | #endif /* CONFIG_NO_HZ */ | ||
565 | |||
566 | /* | ||
567 | * Called from scheduler_tick() | ||
568 | */ | ||
569 | void update_cpu_load_active(struct rq *this_rq) | ||
570 | { | ||
571 | /* | ||
572 | * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). | ||
573 | */ | ||
574 | this_rq->last_load_update_tick = jiffies; | ||
575 | __update_cpu_load(this_rq, this_rq->load.weight, 1); | ||
576 | |||
577 | calc_load_account_active(this_rq); | ||
578 | } | ||
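
The geometric-series comment above calc_load_n() claims that folding n missed LOAD_FREQ windows at once, via fixed_power_int(), is equivalent to applying calc_load() n times. That identity is easy to sanity-check in userspace; the sketch below copies the two helpers in the same shape as the patch, the sample values are invented, and the two results should agree up to fixed-point rounding.

/*
 * Check: calc_load_n(load, e, a, n) vs. n iterations of calc_load().
 * Constants and helper shapes are copied from the patch for illustration.
 */
#include <stdio.h>

#define FSHIFT   11
#define FIXED_1  (1UL << FSHIFT)
#define EXP_1    1884

static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	load += 1UL << (FSHIFT - 1);
	return load >> FSHIFT;
}

/* x^n in fixed-point, O(log n) squarings -- same shape as the kernel code */
static unsigned long fixed_power_int(unsigned long x, unsigned int frac_bits,
				     unsigned int n)
{
	unsigned long result = 1UL << frac_bits;

	if (n) for (;;) {
		if (n & 1) {
			result *= x;
			result += 1UL << (frac_bits - 1);
			result >>= frac_bits;
		}
		n >>= 1;
		if (!n)
			break;
		x *= x;
		x += 1UL << (frac_bits - 1);
		x >>= frac_bits;
	}
	return result;
}

static unsigned long calc_load_n(unsigned long load, unsigned long exp,
				 unsigned long active, unsigned int n)
{
	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
}

int main(void)
{
	unsigned long iter = 5 * FIXED_1, active = 2 * FIXED_1;
	unsigned int n = 7, i;

	for (i = 0; i < n; i++)
		iter = calc_load(iter, EXP_1, active);

	printf("iterated: %lu  bulk: %lu\n",
	       iter, calc_load_n(5 * FIXED_1, EXP_1, active, n));
	return 0;
}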
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ce39224d6155..a38ee0a0650e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -10,8 +10,16 @@
 #include "cpupri.h"
 #include "cpuacct.h"
 
+struct rq;
+
 extern __read_mostly int scheduler_running;
 
+extern unsigned long calc_load_update;
+extern atomic_long_t calc_load_tasks;
+
+extern long calc_load_fold_active(struct rq *this_rq);
+extern void update_cpu_load_active(struct rq *this_rq);
+
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
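
The degrade_factor[] table consulted by decay_load_missed() encodes ((2^idx - 1) / 2^idx)^(2^col) on the 128-point scale, as the "row 2, col 3" example in the comment notes: (3/4)^8 * 128 is about 12.8, stored as 12. A small check program follows, with the table literal copied from the patch and the implicit trailing zeroes written out; it is an illustration only (link with -lm).

/*
 * Recompute degrade_factor[idx][col] from the formula in the comment
 * and print it next to the patch's table entry for comparison.
 */
#include <math.h>
#include <stdio.h>

#define DEGRADE_SHIFT	 7
#define CPU_LOAD_IDX_MAX 5

static const unsigned char
degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
	{   0,   0,  0,  0,  0,  0, 0, 0 },
	{  64,  32,  8,  0,  0,  0, 0, 0 },
	{  96,  72, 40, 12,  1,  0, 0, 0 },
	{ 112,  98, 75, 43, 15,  1, 0, 0 },
	{ 120, 112, 98, 76, 45, 16, 2, 0 },
};

int main(void)
{
	int idx, col;

	for (idx = 1; idx < CPU_LOAD_IDX_MAX; idx++) {
		for (col = 0; col <= DEGRADE_SHIFT; col++) {
			double per_tick = (double)((1 << idx) - 1) / (1 << idx);
			double exact = pow(per_tick, 1 << col) * 128.0;

			printf("idx=%d ticks=%3d table=%3d exact=%6.2f\n",
			       idx, 1 << col, degrade_factor[idx][col], exact);
		}
	}
	return 0;
}

The rounding trick in __update_cpu_load() can be checked the same way by hand: with scale = 2, old load 9 and new load 10, (9 + 10) >> 1 would stay stuck at 9, whereas adding scale - 1 first gives (9 + 11) >> 1 = 10, which is the "stuck on 9 if the load is 10" case the comment describes.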