| author | Jens Axboe <axboe@kernel.dk> | 2012-07-30 03:03:10 -0400 |
|---|---|---|
| committer | Jens Axboe <axboe@kernel.dk> | 2012-07-30 03:03:10 -0400 |
| commit | 72ea1f74fcdf874cca6d2c0962379523bbd99e2c | |
| tree | 4c67be6c73356086ff44ef1b8b1c9479702689ca /kernel/sched | |
| parent | b1af9be5ef77898c05667bb9dbf3b180d91d3292 | |
| parent | a73ff3231df59a4b92ccd0dd4e73897c5822489b | |
Merge branch 'for-jens' of git://git.drbd.org/linux-drbd into for-3.6/drivers
Diffstat (limited to 'kernel/sched')
| -rw-r--r-- | kernel/sched/core.c | 525 |
| -rw-r--r-- | kernel/sched/fair.c | 71 |
| -rw-r--r-- | kernel/sched/idle_task.c | 1 |
| -rw-r--r-- | kernel/sched/rt.c | 53 |
| -rw-r--r-- | kernel/sched/sched.h | 4 |
5 files changed, 479 insertions, 175 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 39eb6011bc38..468bdd44c1ba 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
| @@ -142,9 +142,8 @@ const_debug unsigned int sysctl_sched_features = | |||
| 142 | #define SCHED_FEAT(name, enabled) \ | 142 | #define SCHED_FEAT(name, enabled) \ |
| 143 | #name , | 143 | #name , |
| 144 | 144 | ||
| 145 | static __read_mostly char *sched_feat_names[] = { | 145 | static const char * const sched_feat_names[] = { |
| 146 | #include "features.h" | 146 | #include "features.h" |
| 147 | NULL | ||
| 148 | }; | 147 | }; |
| 149 | 148 | ||
| 150 | #undef SCHED_FEAT | 149 | #undef SCHED_FEAT |
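Editor's note on the hunk above: the name table becomes `const char * const` and loses its NULL sentinel, since the array length is now known at compile time. A minimal userspace sketch of the stringification trick used to build such a table; the feature list macro below is a stand-in for illustration (the kernel instead re-#includes "features.h" between the `#define SCHED_FEAT` and `#undef SCHED_FEAT`).

```c
#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

/* Hypothetical feature list standing in for kernel/sched/features.h. */
#define SCHED_FEATURE_LIST(F)		\
	F(GENTLE_FAIR_SLEEPERS, 1)	\
	F(START_DEBIT, 1)		\
	F(HRTICK, 0)

/* Stringify each name, as the "#name ," expansion in the hunk does. */
#define SCHED_FEAT(name, enabled) #name,
static const char * const sched_feat_names[] = {
	SCHED_FEATURE_LIST(SCHED_FEAT)
};
#undef SCHED_FEAT

int main(void)
{
	/* No NULL terminator needed: the length is known at compile time. */
	for (unsigned int i = 0; i < ARRAY_SIZE(sched_feat_names); i++)
		printf("%u: %s\n", i, sched_feat_names[i]);
	return 0;
}
```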
| @@ -2082,7 +2081,6 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
| 2082 | #endif | 2081 | #endif |
| 2083 | 2082 | ||
| 2084 | /* Here we just switch the register state and the stack. */ | 2083 | /* Here we just switch the register state and the stack. */ |
| 2085 | rcu_switch_from(prev); | ||
| 2086 | switch_to(prev, next, prev); | 2084 | switch_to(prev, next, prev); |
| 2087 | 2085 | ||
| 2088 | barrier(); | 2086 | barrier(); |
| @@ -2162,11 +2160,73 @@ unsigned long this_cpu_load(void) | |||
| 2162 | } | 2160 | } |
| 2163 | 2161 | ||
| 2164 | 2162 | ||
| 2163 | /* | ||
| 2164 | * Global load-average calculations | ||
| 2165 | * | ||
| 2166 | * We take a distributed and async approach to calculating the global load-avg | ||
| 2167 | * in order to minimize overhead. | ||
| 2168 | * | ||
| 2169 | * The global load average is an exponentially decaying average of nr_running + | ||
| 2170 | * nr_uninterruptible. | ||
| 2171 | * | ||
| 2172 | * Once every LOAD_FREQ: | ||
| 2173 | * | ||
| 2174 | * nr_active = 0; | ||
| 2175 | * for_each_possible_cpu(cpu) | ||
| 2176 | * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; | ||
| 2177 | * | ||
| 2178 | * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) | ||
| 2179 | * | ||
| 2180 | * Due to a number of reasons the above turns in the mess below: | ||
| 2181 | * | ||
| 2182 | * - for_each_possible_cpu() is prohibitively expensive on machines with | ||
| 2183 | * serious number of cpus, therefore we need to take a distributed approach | ||
| 2184 | * to calculating nr_active. | ||
| 2185 | * | ||
| 2186 | * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 | ||
| 2187 | * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } | ||
| 2188 | * | ||
| 2189 | * So assuming nr_active := 0 when we start out -- true per definition, we | ||
| 2190 | * can simply take per-cpu deltas and fold those into a global accumulate | ||
| 2191 | * to obtain the same result. See calc_load_fold_active(). | ||
| 2192 | * | ||
| 2193 | * Furthermore, in order to avoid synchronizing all per-cpu delta folding | ||
| 2194 | * across the machine, we assume 10 ticks is sufficient time for every | ||
| 2195 | * cpu to have completed this task. | ||
| 2196 | * | ||
| 2197 | * This places an upper-bound on the IRQ-off latency of the machine. Then | ||
| 2198 | * again, being late doesn't lose the delta, just wrecks the sample. | ||
| 2199 | * | ||
| 2200 | * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because | ||
| 2201 | * this would add another cross-cpu cacheline miss and atomic operation | ||
| 2202 | * to the wakeup path. Instead we increment on whatever cpu the task ran | ||
| 2203 | * when it went into uninterruptible state and decrement on whatever cpu | ||
| 2204 | * did the wakeup. This means that only the sum of nr_uninterruptible over | ||
| 2205 | * all cpus yields the correct result. | ||
| 2206 | * | ||
| 2207 | * This covers the NO_HZ=n code, for extra head-aches, see the comment below. | ||
| 2208 | */ | ||
| 2209 | |||
| 2165 | /* Variables and functions for calc_load */ | 2210 | /* Variables and functions for calc_load */ |
| 2166 | static atomic_long_t calc_load_tasks; | 2211 | static atomic_long_t calc_load_tasks; |
| 2167 | static unsigned long calc_load_update; | 2212 | static unsigned long calc_load_update; |
| 2168 | unsigned long avenrun[3]; | 2213 | unsigned long avenrun[3]; |
| 2169 | EXPORT_SYMBOL(avenrun); | 2214 | EXPORT_SYMBOL(avenrun); /* should be removed */ |
| 2215 | |||
| 2216 | /** | ||
| 2217 | * get_avenrun - get the load average array | ||
| 2218 | * @loads: pointer to dest load array | ||
| 2219 | * @offset: offset to add | ||
| 2220 | * @shift: shift count to shift the result left | ||
| 2221 | * | ||
| 2222 | * These values are estimates at best, so no need for locking. | ||
| 2223 | */ | ||
| 2224 | void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | ||
| 2225 | { | ||
| 2226 | loads[0] = (avenrun[0] + offset) << shift; | ||
| 2227 | loads[1] = (avenrun[1] + offset) << shift; | ||
| 2228 | loads[2] = (avenrun[2] + offset) << shift; | ||
| 2229 | } | ||
| 2170 | 2230 | ||
| 2171 | static long calc_load_fold_active(struct rq *this_rq) | 2231 | static long calc_load_fold_active(struct rq *this_rq) |
| 2172 | { | 2232 | { |
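The comment block added above describes folding per-cpu deltas of nr_running + nr_uninterruptible into one global counter instead of scanning every possible CPU. A small userspace model of that delta folding (the full body of calc_load_fold_active() is not part of this hunk, so treat the struct fields and the folding step below as an illustrative reconstruction, not kernel code):

```c
#include <stdatomic.h>
#include <stdio.h>

/* Userspace stand-in for the runqueue fields the folding logic needs. */
struct rq {
	long nr_running;
	long nr_uninterruptible;
	long calc_load_active;		/* value last folded into the global sum */
};

static atomic_long calc_load_tasks;	/* globally accumulated nr_active */

/* Fold only this CPU's change since the last fold, per the comment above. */
static long calc_load_fold_active(struct rq *this_rq)
{
	long nr_active, delta = 0;

	nr_active = this_rq->nr_running + this_rq->nr_uninterruptible;
	if (nr_active != this_rq->calc_load_active) {
		delta = nr_active - this_rq->calc_load_active;
		this_rq->calc_load_active = nr_active;
	}
	return delta;
}

int main(void)
{
	struct rq cpu[2] = { { 3, 1, 0 }, { 2, 0, 0 } };

	/* Each CPU folds its own delta; the global sum equals a full scan. */
	for (int i = 0; i < 2; i++)
		atomic_fetch_add(&calc_load_tasks, calc_load_fold_active(&cpu[i]));

	printf("global nr_active = %ld\n", atomic_load(&calc_load_tasks));
	return 0;
}
```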
| @@ -2183,6 +2243,9 @@ static long calc_load_fold_active(struct rq *this_rq) | |||
| 2183 | return delta; | 2243 | return delta; |
| 2184 | } | 2244 | } |
| 2185 | 2245 | ||
| 2246 | /* | ||
| 2247 | * a1 = a0 * e + a * (1 - e) | ||
| 2248 | */ | ||
| 2186 | static unsigned long | 2249 | static unsigned long |
| 2187 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | 2250 | calc_load(unsigned long load, unsigned long exp, unsigned long active) |
| 2188 | { | 2251 | { |
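The one-line comment `a1 = a0 * e + a * (1 - e)` is the whole story of calc_load(): a fixed-point exponentially weighted average. A runnable sketch of that formula using the usual loadavg constants (FSHIFT = 11, FIXED_1 = 2048, EXP_1 ≈ 1884 for the 1-minute average at 5-second samples); the constants are quoted from the common definitions, treat them as assumptions here rather than part of this hunk:

```c
#include <stdio.h>

#define FSHIFT  11			/* bits of fixed-point precision */
#define FIXED_1 (1UL << FSHIFT)	/* 1.0 in fixed point */
#define EXP_1   1884			/* 1/exp(5s/1min) in fixed point */

/* a1 = a0 * e + a * (1 - e), everything in FIXED_1 units */
static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long avenrun0 = 0;
	unsigned long active = 3 * FIXED_1;	/* pretend 3 tasks are active */

	/* A minute of 5-second samples pulls the 1-minute average toward 3.00. */
	for (int i = 0; i < 12; i++)
		avenrun0 = calc_load(avenrun0, EXP_1, active);

	printf("1-min loadavg ~= %lu.%02lu\n",
	       avenrun0 >> FSHIFT, ((avenrun0 & (FIXED_1 - 1)) * 100) >> FSHIFT);
	return 0;
}
```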
| @@ -2194,30 +2257,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) | |||
| 2194 | 2257 | ||
| 2195 | #ifdef CONFIG_NO_HZ | 2258 | #ifdef CONFIG_NO_HZ |
| 2196 | /* | 2259 | /* |
| 2197 | * For NO_HZ we delay the active fold to the next LOAD_FREQ update. | 2260 | * Handle NO_HZ for the global load-average. |
| 2261 | * | ||
| 2262 | * Since the above described distributed algorithm to compute the global | ||
| 2263 | * load-average relies on per-cpu sampling from the tick, it is affected by | ||
| 2264 | * NO_HZ. | ||
| 2265 | * | ||
| 2266 | * The basic idea is to fold the nr_active delta into a global idle-delta upon | ||
| 2267 | * entering NO_HZ state such that we can include this as an 'extra' cpu delta | ||
| 2268 | * when we read the global state. | ||
| 2269 | * | ||
| 2270 | * Obviously reality has to ruin such a delightfully simple scheme: | ||
| 2271 | * | ||
| 2272 | * - When we go NO_HZ idle during the window, we can negate our sample | ||
| 2273 | * contribution, causing under-accounting. | ||
| 2274 | * | ||
| 2275 | * We avoid this by keeping two idle-delta counters and flipping them | ||
| 2276 | * when the window starts, thus separating old and new NO_HZ load. | ||
| 2277 | * | ||
| 2278 | * The only trick is the slight shift in index flip for read vs write. | ||
| 2279 | * | ||
| 2280 | * 0s 5s 10s 15s | ||
| 2281 | * +10 +10 +10 +10 | ||
| 2282 | * |-|-----------|-|-----------|-|-----------|-| | ||
| 2283 | * r:0 0 1 1 0 0 1 1 0 | ||
| 2284 | * w:0 1 1 0 0 1 1 0 0 | ||
| 2285 | * | ||
| 2286 | * This ensures we'll fold the old idle contribution in this window while | ||
| 2287 | * accumulating the new one. | ||
| 2288 | * | ||
| 2289 | * - When we wake up from NO_HZ idle during the window, we push up our | ||
| 2290 | * contribution, since we effectively move our sample point to a known | ||
| 2291 | * busy state. | ||
| 2292 | * | ||
| 2293 | * This is solved by pushing the window forward, and thus skipping the | ||
| 2294 | * sample, for this cpu (effectively using the idle-delta for this cpu which | ||
| 2295 | * was in effect at the time the window opened). This also solves the issue | ||
| 2296 | * of having to deal with a cpu having been in NOHZ idle for multiple | ||
| 2297 | * LOAD_FREQ intervals. | ||
| 2198 | * | 2298 | * |
| 2199 | * When making the ILB scale, we should try to pull this in as well. | 2299 | * When making the ILB scale, we should try to pull this in as well. |
| 2200 | */ | 2300 | */ |
| 2201 | static atomic_long_t calc_load_tasks_idle; | 2301 | static atomic_long_t calc_load_idle[2]; |
| 2302 | static int calc_load_idx; | ||
| 2303 | |||
| 2304 | static inline int calc_load_write_idx(void) | ||
| 2305 | { | ||
| 2306 | int idx = calc_load_idx; | ||
| 2307 | |||
| 2308 | /* | ||
| 2309 | * See calc_global_nohz(), if we observe the new index, we also | ||
| 2310 | * need to observe the new update time. | ||
| 2311 | */ | ||
| 2312 | smp_rmb(); | ||
| 2313 | |||
| 2314 | /* | ||
| 2315 | * If the folding window started, make sure we start writing in the | ||
| 2316 | * next idle-delta. | ||
| 2317 | */ | ||
| 2318 | if (!time_before(jiffies, calc_load_update)) | ||
| 2319 | idx++; | ||
| 2202 | 2320 | ||
| 2203 | void calc_load_account_idle(struct rq *this_rq) | 2321 | return idx & 1; |
| 2322 | } | ||
| 2323 | |||
| 2324 | static inline int calc_load_read_idx(void) | ||
| 2204 | { | 2325 | { |
| 2326 | return calc_load_idx & 1; | ||
| 2327 | } | ||
| 2328 | |||
| 2329 | void calc_load_enter_idle(void) | ||
| 2330 | { | ||
| 2331 | struct rq *this_rq = this_rq(); | ||
| 2205 | long delta; | 2332 | long delta; |
| 2206 | 2333 | ||
| 2334 | /* | ||
| 2335 | * We're going into NOHZ mode, if there's any pending delta, fold it | ||
| 2336 | * into the pending idle delta. | ||
| 2337 | */ | ||
| 2207 | delta = calc_load_fold_active(this_rq); | 2338 | delta = calc_load_fold_active(this_rq); |
| 2208 | if (delta) | 2339 | if (delta) { |
| 2209 | atomic_long_add(delta, &calc_load_tasks_idle); | 2340 | int idx = calc_load_write_idx(); |
| 2341 | atomic_long_add(delta, &calc_load_idle[idx]); | ||
| 2342 | } | ||
| 2210 | } | 2343 | } |
| 2211 | 2344 | ||
| 2212 | static long calc_load_fold_idle(void) | 2345 | void calc_load_exit_idle(void) |
| 2213 | { | 2346 | { |
| 2214 | long delta = 0; | 2347 | struct rq *this_rq = this_rq(); |
| 2215 | 2348 | ||
| 2216 | /* | 2349 | /* |
| 2217 | * Its got a race, we don't care... | 2350 | * If we're still before the sample window, we're done. |
| 2218 | */ | 2351 | */ |
| 2219 | if (atomic_long_read(&calc_load_tasks_idle)) | 2352 | if (time_before(jiffies, this_rq->calc_load_update)) |
| 2220 | delta = atomic_long_xchg(&calc_load_tasks_idle, 0); | 2353 | return; |
| 2354 | |||
| 2355 | /* | ||
| 2356 | * We woke inside or after the sample window, this means we're already | ||
| 2357 | * accounted through the nohz accounting, so skip the entire deal and | ||
| 2358 | * sync up for the next window. | ||
| 2359 | */ | ||
| 2360 | this_rq->calc_load_update = calc_load_update; | ||
| 2361 | if (time_before(jiffies, this_rq->calc_load_update + 10)) | ||
| 2362 | this_rq->calc_load_update += LOAD_FREQ; | ||
| 2363 | } | ||
| 2364 | |||
| 2365 | static long calc_load_fold_idle(void) | ||
| 2366 | { | ||
| 2367 | int idx = calc_load_read_idx(); | ||
| 2368 | long delta = 0; | ||
| 2369 | |||
| 2370 | if (atomic_long_read(&calc_load_idle[idx])) | ||
| 2371 | delta = atomic_long_xchg(&calc_load_idle[idx], 0); | ||
| 2221 | 2372 | ||
| 2222 | return delta; | 2373 | return delta; |
| 2223 | } | 2374 | } |
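The comment block in this hunk describes keeping two idle-delta slots and flipping an index at each window, so deltas folded after the window opens land in the "new" slot while the reader drains the "old" one. A compact userspace model of that read/write slot selection, with a plain integer standing in for jiffies and the memory barriers omitted; it illustrates only the indexing, not the SMP ordering that smp_rmb()/smp_wmb() provide in the real code:

```c
#include <stdatomic.h>
#include <stdio.h>

#define LOAD_FREQ 5000			/* pretend jiffies per sample window */

static atomic_long calc_load_idle[2];
static int calc_load_idx;
static unsigned long calc_load_update;	/* start of the current window */
static unsigned long jiffies;

static int calc_load_write_idx(void)
{
	int idx = calc_load_idx;

	/* If the folding window already started, write into the next slot. */
	if (jiffies >= calc_load_update)
		idx++;
	return idx & 1;
}

static int calc_load_read_idx(void)
{
	return calc_load_idx & 1;
}

static void enter_idle(long delta)
{
	atomic_fetch_add(&calc_load_idle[calc_load_write_idx()], delta);
}

static long fold_idle(void)
{
	return atomic_exchange(&calc_load_idle[calc_load_read_idx()], 0);
}

int main(void)
{
	calc_load_update = LOAD_FREQ;

	jiffies = 100;  enter_idle(2);	/* before the window: old slot   */
	jiffies = 5003; enter_idle(5);	/* window open: goes to new slot */

	printf("folded this window: %ld\n", fold_idle());	/* 2 */
	calc_load_idx++;					/* window flip */
	printf("folded next window: %ld\n", fold_idle());	/* 5 */
	return 0;
}
```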
| @@ -2303,66 +2454,39 @@ static void calc_global_nohz(void) | |||
| 2303 | { | 2454 | { |
| 2304 | long delta, active, n; | 2455 | long delta, active, n; |
| 2305 | 2456 | ||
| 2306 | /* | 2457 | if (!time_before(jiffies, calc_load_update + 10)) { |
| 2307 | * If we crossed a calc_load_update boundary, make sure to fold | 2458 | /* |
| 2308 | * any pending idle changes, the respective CPUs might have | 2459 | * Catch-up, fold however many we are behind still |
| 2309 | * missed the tick driven calc_load_account_active() update | 2460 | */ |
| 2310 | * due to NO_HZ. | 2461 | delta = jiffies - calc_load_update - 10; |
| 2311 | */ | 2462 | n = 1 + (delta / LOAD_FREQ); |
| 2312 | delta = calc_load_fold_idle(); | ||
| 2313 | if (delta) | ||
| 2314 | atomic_long_add(delta, &calc_load_tasks); | ||
| 2315 | |||
| 2316 | /* | ||
| 2317 | * It could be the one fold was all it took, we done! | ||
| 2318 | */ | ||
| 2319 | if (time_before(jiffies, calc_load_update + 10)) | ||
| 2320 | return; | ||
| 2321 | |||
| 2322 | /* | ||
| 2323 | * Catch-up, fold however many we are behind still | ||
| 2324 | */ | ||
| 2325 | delta = jiffies - calc_load_update - 10; | ||
| 2326 | n = 1 + (delta / LOAD_FREQ); | ||
| 2327 | 2463 | ||
| 2328 | active = atomic_long_read(&calc_load_tasks); | 2464 | active = atomic_long_read(&calc_load_tasks); |
| 2329 | active = active > 0 ? active * FIXED_1 : 0; | 2465 | active = active > 0 ? active * FIXED_1 : 0; |
| 2330 | 2466 | ||
| 2331 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); | 2467 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); |
| 2332 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); | 2468 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); |
| 2333 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); | 2469 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); |
| 2334 | 2470 | ||
| 2335 | calc_load_update += n * LOAD_FREQ; | 2471 | calc_load_update += n * LOAD_FREQ; |
| 2336 | } | 2472 | } |
| 2337 | #else | ||
| 2338 | void calc_load_account_idle(struct rq *this_rq) | ||
| 2339 | { | ||
| 2340 | } | ||
| 2341 | 2473 | ||
| 2342 | static inline long calc_load_fold_idle(void) | 2474 | /* |
| 2343 | { | 2475 | * Flip the idle index... |
| 2344 | return 0; | 2476 | * |
| 2477 | * Make sure we first write the new time then flip the index, so that | ||
| 2478 | * calc_load_write_idx() will see the new time when it reads the new | ||
| 2479 | * index, this avoids a double flip messing things up. | ||
| 2480 | */ | ||
| 2481 | smp_wmb(); | ||
| 2482 | calc_load_idx++; | ||
| 2345 | } | 2483 | } |
| 2484 | #else /* !CONFIG_NO_HZ */ | ||
| 2346 | 2485 | ||
| 2347 | static void calc_global_nohz(void) | 2486 | static inline long calc_load_fold_idle(void) { return 0; } |
| 2348 | { | 2487 | static inline void calc_global_nohz(void) { } |
| 2349 | } | ||
| 2350 | #endif | ||
| 2351 | 2488 | ||
| 2352 | /** | 2489 | #endif /* CONFIG_NO_HZ */ |
| 2353 | * get_avenrun - get the load average array | ||
| 2354 | * @loads: pointer to dest load array | ||
| 2355 | * @offset: offset to add | ||
| 2356 | * @shift: shift count to shift the result left | ||
| 2357 | * | ||
| 2358 | * These values are estimates at best, so no need for locking. | ||
| 2359 | */ | ||
| 2360 | void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | ||
| 2361 | { | ||
| 2362 | loads[0] = (avenrun[0] + offset) << shift; | ||
| 2363 | loads[1] = (avenrun[1] + offset) << shift; | ||
| 2364 | loads[2] = (avenrun[2] + offset) << shift; | ||
| 2365 | } | ||
| 2366 | 2490 | ||
| 2367 | /* | 2491 | /* |
| 2368 | * calc_load - update the avenrun load estimates 10 ticks after the | 2492 | * calc_load - update the avenrun load estimates 10 ticks after the |
| @@ -2370,11 +2494,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | |||
| 2370 | */ | 2494 | */ |
| 2371 | void calc_global_load(unsigned long ticks) | 2495 | void calc_global_load(unsigned long ticks) |
| 2372 | { | 2496 | { |
| 2373 | long active; | 2497 | long active, delta; |
| 2374 | 2498 | ||
| 2375 | if (time_before(jiffies, calc_load_update + 10)) | 2499 | if (time_before(jiffies, calc_load_update + 10)) |
| 2376 | return; | 2500 | return; |
| 2377 | 2501 | ||
| 2502 | /* | ||
| 2503 | * Fold the 'old' idle-delta to include all NO_HZ cpus. | ||
| 2504 | */ | ||
| 2505 | delta = calc_load_fold_idle(); | ||
| 2506 | if (delta) | ||
| 2507 | atomic_long_add(delta, &calc_load_tasks); | ||
| 2508 | |||
| 2378 | active = atomic_long_read(&calc_load_tasks); | 2509 | active = atomic_long_read(&calc_load_tasks); |
| 2379 | active = active > 0 ? active * FIXED_1 : 0; | 2510 | active = active > 0 ? active * FIXED_1 : 0; |
| 2380 | 2511 | ||
| @@ -2385,12 +2516,7 @@ void calc_global_load(unsigned long ticks) | |||
| 2385 | calc_load_update += LOAD_FREQ; | 2516 | calc_load_update += LOAD_FREQ; |
| 2386 | 2517 | ||
| 2387 | /* | 2518 | /* |
| 2388 | * Account one period with whatever state we found before | 2519 | * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. |
| 2389 | * folding in the nohz state and ageing the entire idle period. | ||
| 2390 | * | ||
| 2391 | * This avoids loosing a sample when we go idle between | ||
| 2392 | * calc_load_account_active() (10 ticks ago) and now and thus | ||
| 2393 | * under-accounting. | ||
| 2394 | */ | 2520 | */ |
| 2395 | calc_global_nohz(); | 2521 | calc_global_nohz(); |
| 2396 | } | 2522 | } |
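calc_global_nohz() above catches up on n missed LOAD_FREQ periods in one call to calc_load_n(). calc_load_n() itself is not in this diff; the sketch below reconstructs the usual approach, decaying the old average by exp^n via fixed-point exponentiation-by-squaring (in the spirit of the kernel's fixed_power_int()) and then blending in the active count. Treat the details as an assumption rather than the exact kernel code:

```c
#include <stdio.h>

#define FSHIFT  11
#define FIXED_1 (1UL << FSHIFT)
#define EXP_1   1884

/* x^n in fixed point, by squaring with rounding at each step. */
static unsigned long fixed_power_int(unsigned long x, unsigned int frac_bits,
				     unsigned int n)
{
	unsigned long result = 1UL << frac_bits;

	while (n) {
		if (n & 1) {
			result *= x;
			result += 1UL << (frac_bits - 1);
			result >>= frac_bits;
		}
		n >>= 1;
		x *= x;
		x += 1UL << (frac_bits - 1);
		x >>= frac_bits;
	}
	return result;
}

static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	return load >> FSHIFT;
}

/* Apply n sample periods at once: decay by exp^n, then blend in 'active'. */
static unsigned long calc_load_n(unsigned long load, unsigned long exp,
				 unsigned long active, unsigned int n)
{
	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
}

int main(void)
{
	unsigned long avg = 2 * FIXED_1;	/* load was 2.00 before idling */

	/* Idle (active == 0) for 6 missed windows, folded in one call. */
	avg = calc_load_n(avg, EXP_1, 0, 6);
	printf("after catch-up: %lu.%02lu\n",
	       avg >> FSHIFT, ((avg & (FIXED_1 - 1)) * 100) >> FSHIFT);
	return 0;
}
```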
| @@ -2407,7 +2533,6 @@ static void calc_load_account_active(struct rq *this_rq) | |||
| 2407 | return; | 2533 | return; |
| 2408 | 2534 | ||
| 2409 | delta = calc_load_fold_active(this_rq); | 2535 | delta = calc_load_fold_active(this_rq); |
| 2410 | delta += calc_load_fold_idle(); | ||
| 2411 | if (delta) | 2536 | if (delta) |
| 2412 | atomic_long_add(delta, &calc_load_tasks); | 2537 | atomic_long_add(delta, &calc_load_tasks); |
| 2413 | 2538 | ||
| @@ -2415,6 +2540,10 @@ static void calc_load_account_active(struct rq *this_rq) | |||
| 2415 | } | 2540 | } |
| 2416 | 2541 | ||
| 2417 | /* | 2542 | /* |
| 2543 | * End of global load-average stuff | ||
| 2544 | */ | ||
| 2545 | |||
| 2546 | /* | ||
| 2418 | * The exact cpuload at various idx values, calculated at every tick would be | 2547 | * The exact cpuload at various idx values, calculated at every tick would be |
| 2419 | * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load | 2548 | * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load |
| 2420 | * | 2549 | * |
| @@ -2517,25 +2646,32 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, | |||
| 2517 | sched_avg_update(this_rq); | 2646 | sched_avg_update(this_rq); |
| 2518 | } | 2647 | } |
| 2519 | 2648 | ||
| 2649 | #ifdef CONFIG_NO_HZ | ||
| 2650 | /* | ||
| 2651 | * There is no sane way to deal with nohz on smp when using jiffies because the | ||
| 2652 | * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading | ||
| 2653 | * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. | ||
| 2654 | * | ||
| 2655 | * Therefore we cannot use the delta approach from the regular tick since that | ||
| 2656 | * would seriously skew the load calculation. However we'll make do for those | ||
| 2657 | * updates happening while idle (nohz_idle_balance) or coming out of idle | ||
| 2658 | * (tick_nohz_idle_exit). | ||
| 2659 | * | ||
| 2660 | * This means we might still be one tick off for nohz periods. | ||
| 2661 | */ | ||
| 2662 | |||
| 2520 | /* | 2663 | /* |
| 2521 | * Called from nohz_idle_balance() to update the load ratings before doing the | 2664 | * Called from nohz_idle_balance() to update the load ratings before doing the |
| 2522 | * idle balance. | 2665 | * idle balance. |
| 2523 | */ | 2666 | */ |
| 2524 | void update_idle_cpu_load(struct rq *this_rq) | 2667 | void update_idle_cpu_load(struct rq *this_rq) |
| 2525 | { | 2668 | { |
| 2526 | unsigned long curr_jiffies = jiffies; | 2669 | unsigned long curr_jiffies = ACCESS_ONCE(jiffies); |
| 2527 | unsigned long load = this_rq->load.weight; | 2670 | unsigned long load = this_rq->load.weight; |
| 2528 | unsigned long pending_updates; | 2671 | unsigned long pending_updates; |
| 2529 | 2672 | ||
| 2530 | /* | 2673 | /* |
| 2531 | * Bloody broken means of dealing with nohz, but better than nothing.. | 2674 | * bail if there's load or we're actually up-to-date. |
| 2532 | * jiffies is updated by one cpu, another cpu can drift wrt the jiffy | ||
| 2533 | * update and see 0 difference the one time and 2 the next, even though | ||
| 2534 | * we ticked at roughtly the same rate. | ||
| 2535 | * | ||
| 2536 | * Hence we only use this from nohz_idle_balance() and skip this | ||
| 2537 | * nonsense when called from the scheduler_tick() since that's | ||
| 2538 | * guaranteed a stable rate. | ||
| 2539 | */ | 2675 | */ |
| 2540 | if (load || curr_jiffies == this_rq->last_load_update_tick) | 2676 | if (load || curr_jiffies == this_rq->last_load_update_tick) |
| 2541 | return; | 2677 | return; |
| @@ -2547,12 +2683,38 @@ void update_idle_cpu_load(struct rq *this_rq) | |||
| 2547 | } | 2683 | } |
| 2548 | 2684 | ||
| 2549 | /* | 2685 | /* |
| 2686 | * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. | ||
| 2687 | */ | ||
| 2688 | void update_cpu_load_nohz(void) | ||
| 2689 | { | ||
| 2690 | struct rq *this_rq = this_rq(); | ||
| 2691 | unsigned long curr_jiffies = ACCESS_ONCE(jiffies); | ||
| 2692 | unsigned long pending_updates; | ||
| 2693 | |||
| 2694 | if (curr_jiffies == this_rq->last_load_update_tick) | ||
| 2695 | return; | ||
| 2696 | |||
| 2697 | raw_spin_lock(&this_rq->lock); | ||
| 2698 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
| 2699 | if (pending_updates) { | ||
| 2700 | this_rq->last_load_update_tick = curr_jiffies; | ||
| 2701 | /* | ||
| 2702 | * We were idle, this means load 0, the current load might be | ||
| 2703 | * !0 due to remote wakeups and the sort. | ||
| 2704 | */ | ||
| 2705 | __update_cpu_load(this_rq, 0, pending_updates); | ||
| 2706 | } | ||
| 2707 | raw_spin_unlock(&this_rq->lock); | ||
| 2708 | } | ||
| 2709 | #endif /* CONFIG_NO_HZ */ | ||
| 2710 | |||
| 2711 | /* | ||
| 2550 | * Called from scheduler_tick() | 2712 | * Called from scheduler_tick() |
| 2551 | */ | 2713 | */ |
| 2552 | static void update_cpu_load_active(struct rq *this_rq) | 2714 | static void update_cpu_load_active(struct rq *this_rq) |
| 2553 | { | 2715 | { |
| 2554 | /* | 2716 | /* |
| 2555 | * See the mess in update_idle_cpu_load(). | 2717 | * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). |
| 2556 | */ | 2718 | */ |
| 2557 | this_rq->last_load_update_tick = jiffies; | 2719 | this_rq->last_load_update_tick = jiffies; |
| 2558 | __update_cpu_load(this_rq, this_rq->load.weight, 1); | 2720 | __update_cpu_load(this_rq, this_rq->load.weight, 1); |
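update_cpu_load_nohz() above passes a load of 0 together with the number of ticks missed while idle. Using the per-index formula quoted earlier in this file, load = (2^idx - 1)/2^idx * old + 1/2^idx * cur, the sketch below models what replaying `pending_updates` idle ticks does to a cpu_load[] array; __update_cpu_load()'s real implementation is not part of this hunk, so this is a model of the effect, not the kernel code:

```c
#include <stdio.h>

#define CPU_LOAD_IDX_MAX 5

/* One tick of the per-index exponential average from the quoted formula. */
static unsigned long tick_load(unsigned long old, unsigned long cur, int idx)
{
	if (idx == 0)
		return cur;
	return (old * ((1UL << idx) - 1) + cur) >> idx;
}

int main(void)
{
	unsigned long cpu_load[CPU_LOAD_IDX_MAX] = { 1024, 1024, 1024, 1024, 1024 };
	unsigned long pending_updates = 3;	/* ticks missed while nohz-idle */

	/* While idle the current load is 0; replay each missed tick. */
	for (unsigned long t = 0; t < pending_updates; t++)
		for (int i = 0; i < CPU_LOAD_IDX_MAX; i++)
			cpu_load[i] = tick_load(cpu_load[i], 0, i);

	for (int i = 0; i < CPU_LOAD_IDX_MAX; i++)
		printf("cpu_load[%d] = %lu\n", i, cpu_load[i]);
	return 0;
}
```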
| @@ -4982,7 +5144,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | |||
| 4982 | p->sched_class->set_cpus_allowed(p, new_mask); | 5144 | p->sched_class->set_cpus_allowed(p, new_mask); |
| 4983 | 5145 | ||
| 4984 | cpumask_copy(&p->cpus_allowed, new_mask); | 5146 | cpumask_copy(&p->cpus_allowed, new_mask); |
| 4985 | p->rt.nr_cpus_allowed = cpumask_weight(new_mask); | 5147 | p->nr_cpus_allowed = cpumask_weight(new_mask); |
| 4986 | } | 5148 | } |
| 4987 | 5149 | ||
| 4988 | /* | 5150 | /* |
| @@ -5524,15 +5686,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ | |||
| 5524 | 5686 | ||
| 5525 | #ifdef CONFIG_SCHED_DEBUG | 5687 | #ifdef CONFIG_SCHED_DEBUG |
| 5526 | 5688 | ||
| 5527 | static __read_mostly int sched_domain_debug_enabled; | 5689 | static __read_mostly int sched_debug_enabled; |
| 5528 | 5690 | ||
| 5529 | static int __init sched_domain_debug_setup(char *str) | 5691 | static int __init sched_debug_setup(char *str) |
| 5530 | { | 5692 | { |
| 5531 | sched_domain_debug_enabled = 1; | 5693 | sched_debug_enabled = 1; |
| 5532 | 5694 | ||
| 5533 | return 0; | 5695 | return 0; |
| 5534 | } | 5696 | } |
| 5535 | early_param("sched_debug", sched_domain_debug_setup); | 5697 | early_param("sched_debug", sched_debug_setup); |
| 5698 | |||
| 5699 | static inline bool sched_debug(void) | ||
| 5700 | { | ||
| 5701 | return sched_debug_enabled; | ||
| 5702 | } | ||
| 5536 | 5703 | ||
| 5537 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | 5704 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, |
| 5538 | struct cpumask *groupmask) | 5705 | struct cpumask *groupmask) |
| @@ -5572,7 +5739,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
| 5572 | break; | 5739 | break; |
| 5573 | } | 5740 | } |
| 5574 | 5741 | ||
| 5575 | if (!group->sgp->power) { | 5742 | /* |
| 5743 | * Even though we initialize ->power to something semi-sane, | ||
| 5744 | * we leave power_orig unset. This allows us to detect if | ||
| 5745 | * domain iteration is still funny without causing /0 traps. | ||
| 5746 | */ | ||
| 5747 | if (!group->sgp->power_orig) { | ||
| 5576 | printk(KERN_CONT "\n"); | 5748 | printk(KERN_CONT "\n"); |
| 5577 | printk(KERN_ERR "ERROR: domain->cpu_power not " | 5749 | printk(KERN_ERR "ERROR: domain->cpu_power not " |
| 5578 | "set\n"); | 5750 | "set\n"); |
| @@ -5620,7 +5792,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
| 5620 | { | 5792 | { |
| 5621 | int level = 0; | 5793 | int level = 0; |
| 5622 | 5794 | ||
| 5623 | if (!sched_domain_debug_enabled) | 5795 | if (!sched_debug_enabled) |
| 5624 | return; | 5796 | return; |
| 5625 | 5797 | ||
| 5626 | if (!sd) { | 5798 | if (!sd) { |
| @@ -5641,6 +5813,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
| 5641 | } | 5813 | } |
| 5642 | #else /* !CONFIG_SCHED_DEBUG */ | 5814 | #else /* !CONFIG_SCHED_DEBUG */ |
| 5643 | # define sched_domain_debug(sd, cpu) do { } while (0) | 5815 | # define sched_domain_debug(sd, cpu) do { } while (0) |
| 5816 | static inline bool sched_debug(void) | ||
| 5817 | { | ||
| 5818 | return false; | ||
| 5819 | } | ||
| 5644 | #endif /* CONFIG_SCHED_DEBUG */ | 5820 | #endif /* CONFIG_SCHED_DEBUG */ |
| 5645 | 5821 | ||
| 5646 | static int sd_degenerate(struct sched_domain *sd) | 5822 | static int sd_degenerate(struct sched_domain *sd) |
| @@ -5962,6 +6138,44 @@ struct sched_domain_topology_level { | |||
| 5962 | struct sd_data data; | 6138 | struct sd_data data; |
| 5963 | }; | 6139 | }; |
| 5964 | 6140 | ||
| 6141 | /* | ||
| 6142 | * Build an iteration mask that can exclude certain CPUs from the upwards | ||
| 6143 | * domain traversal. | ||
| 6144 | * | ||
| 6145 | * Asymmetric node setups can result in situations where the domain tree is of | ||
| 6146 | * unequal depth, make sure to skip domains that already cover the entire | ||
| 6147 | * range. | ||
| 6148 | * | ||
| 6149 | * In that case build_sched_domains() will have terminated the iteration early | ||
| 6150 | * and our sibling sd spans will be empty. Domains should always include the | ||
| 6151 | * cpu they're built on, so check that. | ||
| 6152 | * | ||
| 6153 | */ | ||
| 6154 | static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) | ||
| 6155 | { | ||
| 6156 | const struct cpumask *span = sched_domain_span(sd); | ||
| 6157 | struct sd_data *sdd = sd->private; | ||
| 6158 | struct sched_domain *sibling; | ||
| 6159 | int i; | ||
| 6160 | |||
| 6161 | for_each_cpu(i, span) { | ||
| 6162 | sibling = *per_cpu_ptr(sdd->sd, i); | ||
| 6163 | if (!cpumask_test_cpu(i, sched_domain_span(sibling))) | ||
| 6164 | continue; | ||
| 6165 | |||
| 6166 | cpumask_set_cpu(i, sched_group_mask(sg)); | ||
| 6167 | } | ||
| 6168 | } | ||
| 6169 | |||
| 6170 | /* | ||
| 6171 | * Return the canonical balance cpu for this group, this is the first cpu | ||
| 6172 | * of this group that's also in the iteration mask. | ||
| 6173 | */ | ||
| 6174 | int group_balance_cpu(struct sched_group *sg) | ||
| 6175 | { | ||
| 6176 | return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); | ||
| 6177 | } | ||
| 6178 | |||
| 5965 | static int | 6179 | static int |
| 5966 | build_overlap_sched_groups(struct sched_domain *sd, int cpu) | 6180 | build_overlap_sched_groups(struct sched_domain *sd, int cpu) |
| 5967 | { | 6181 | { |
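group_balance_cpu() above simply picks the first CPU that is in both the group's span and its iteration mask. A tiny bitmask model of that "first set bit of the AND" selection, using plain unsigned longs and a GCC/Clang builtin instead of struct cpumask:

```c
#include <stdio.h>

/* First set bit in (cpus & mask), or -1 if the intersection is empty. */
static int group_balance_cpu(unsigned long group_cpus, unsigned long group_mask)
{
	unsigned long both = group_cpus & group_mask;

	return both ? __builtin_ctzl(both) : -1;
}

int main(void)
{
	unsigned long cpus = 0xf0;	/* group spans CPUs 4-7          */
	unsigned long mask = 0xcc;	/* iteration mask allows 2,3,6,7 */

	printf("balance cpu = %d\n", group_balance_cpu(cpus, mask));	/* 6 */
	return 0;
}
```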
| @@ -5980,6 +6194,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
| 5980 | if (cpumask_test_cpu(i, covered)) | 6194 | if (cpumask_test_cpu(i, covered)) |
| 5981 | continue; | 6195 | continue; |
| 5982 | 6196 | ||
| 6197 | child = *per_cpu_ptr(sdd->sd, i); | ||
| 6198 | |||
| 6199 | /* See the comment near build_group_mask(). */ | ||
| 6200 | if (!cpumask_test_cpu(i, sched_domain_span(child))) | ||
| 6201 | continue; | ||
| 6202 | |||
| 5983 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | 6203 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), |
| 5984 | GFP_KERNEL, cpu_to_node(cpu)); | 6204 | GFP_KERNEL, cpu_to_node(cpu)); |
| 5985 | 6205 | ||
| @@ -5987,8 +6207,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
| 5987 | goto fail; | 6207 | goto fail; |
| 5988 | 6208 | ||
| 5989 | sg_span = sched_group_cpus(sg); | 6209 | sg_span = sched_group_cpus(sg); |
| 5990 | |||
| 5991 | child = *per_cpu_ptr(sdd->sd, i); | ||
| 5992 | if (child->child) { | 6210 | if (child->child) { |
| 5993 | child = child->child; | 6211 | child = child->child; |
| 5994 | cpumask_copy(sg_span, sched_domain_span(child)); | 6212 | cpumask_copy(sg_span, sched_domain_span(child)); |
| @@ -5997,10 +6215,24 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
| 5997 | 6215 | ||
| 5998 | cpumask_or(covered, covered, sg_span); | 6216 | cpumask_or(covered, covered, sg_span); |
| 5999 | 6217 | ||
| 6000 | sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); | 6218 | sg->sgp = *per_cpu_ptr(sdd->sgp, i); |
| 6001 | atomic_inc(&sg->sgp->ref); | 6219 | if (atomic_inc_return(&sg->sgp->ref) == 1) |
| 6220 | build_group_mask(sd, sg); | ||
| 6002 | 6221 | ||
| 6003 | if (cpumask_test_cpu(cpu, sg_span)) | 6222 | /* |
| 6223 | * Initialize sgp->power such that even if we mess up the | ||
| 6224 | * domains and no possible iteration will get us here, we won't | ||
| 6225 | * die on a /0 trap. | ||
| 6226 | */ | ||
| 6227 | sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); | ||
| 6228 | |||
| 6229 | /* | ||
| 6230 | * Make sure the first group of this domain contains the | ||
| 6231 | * canonical balance cpu. Otherwise the sched_domain iteration | ||
| 6232 | * breaks. See update_sg_lb_stats(). | ||
| 6233 | */ | ||
| 6234 | if ((!groups && cpumask_test_cpu(cpu, sg_span)) || | ||
| 6235 | group_balance_cpu(sg) == cpu) | ||
| 6004 | groups = sg; | 6236 | groups = sg; |
| 6005 | 6237 | ||
| 6006 | if (!first) | 6238 | if (!first) |
| @@ -6074,6 +6306,7 @@ build_sched_groups(struct sched_domain *sd, int cpu) | |||
| 6074 | 6306 | ||
| 6075 | cpumask_clear(sched_group_cpus(sg)); | 6307 | cpumask_clear(sched_group_cpus(sg)); |
| 6076 | sg->sgp->power = 0; | 6308 | sg->sgp->power = 0; |
| 6309 | cpumask_setall(sched_group_mask(sg)); | ||
| 6077 | 6310 | ||
| 6078 | for_each_cpu(j, span) { | 6311 | for_each_cpu(j, span) { |
| 6079 | if (get_group(j, sdd, NULL) != group) | 6312 | if (get_group(j, sdd, NULL) != group) |
| @@ -6115,7 +6348,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
| 6115 | sg = sg->next; | 6348 | sg = sg->next; |
| 6116 | } while (sg != sd->groups); | 6349 | } while (sg != sd->groups); |
| 6117 | 6350 | ||
| 6118 | if (cpu != group_first_cpu(sg)) | 6351 | if (cpu != group_balance_cpu(sg)) |
| 6119 | return; | 6352 | return; |
| 6120 | 6353 | ||
| 6121 | update_group_power(sd, cpu); | 6354 | update_group_power(sd, cpu); |
| @@ -6165,11 +6398,8 @@ int sched_domain_level_max; | |||
| 6165 | 6398 | ||
| 6166 | static int __init setup_relax_domain_level(char *str) | 6399 | static int __init setup_relax_domain_level(char *str) |
| 6167 | { | 6400 | { |
| 6168 | unsigned long val; | 6401 | if (kstrtoint(str, 0, &default_relax_domain_level)) |
| 6169 | 6402 | pr_warn("Unable to set relax_domain_level\n"); | |
| 6170 | val = simple_strtoul(str, NULL, 0); | ||
| 6171 | if (val < sched_domain_level_max) | ||
| 6172 | default_relax_domain_level = val; | ||
| 6173 | 6403 | ||
| 6174 | return 1; | 6404 | return 1; |
| 6175 | } | 6405 | } |
| @@ -6279,14 +6509,13 @@ static struct sched_domain_topology_level *sched_domain_topology = default_topol | |||
| 6279 | #ifdef CONFIG_NUMA | 6509 | #ifdef CONFIG_NUMA |
| 6280 | 6510 | ||
| 6281 | static int sched_domains_numa_levels; | 6511 | static int sched_domains_numa_levels; |
| 6282 | static int sched_domains_numa_scale; | ||
| 6283 | static int *sched_domains_numa_distance; | 6512 | static int *sched_domains_numa_distance; |
| 6284 | static struct cpumask ***sched_domains_numa_masks; | 6513 | static struct cpumask ***sched_domains_numa_masks; |
| 6285 | static int sched_domains_curr_level; | 6514 | static int sched_domains_curr_level; |
| 6286 | 6515 | ||
| 6287 | static inline int sd_local_flags(int level) | 6516 | static inline int sd_local_flags(int level) |
| 6288 | { | 6517 | { |
| 6289 | if (sched_domains_numa_distance[level] > REMOTE_DISTANCE) | 6518 | if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) |
| 6290 | return 0; | 6519 | return 0; |
| 6291 | 6520 | ||
| 6292 | return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; | 6521 | return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; |
| @@ -6344,6 +6573,42 @@ static const struct cpumask *sd_numa_mask(int cpu) | |||
| 6344 | return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; | 6573 | return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; |
| 6345 | } | 6574 | } |
| 6346 | 6575 | ||
| 6576 | static void sched_numa_warn(const char *str) | ||
| 6577 | { | ||
| 6578 | static int done = false; | ||
| 6579 | int i,j; | ||
| 6580 | |||
| 6581 | if (done) | ||
| 6582 | return; | ||
| 6583 | |||
| 6584 | done = true; | ||
| 6585 | |||
| 6586 | printk(KERN_WARNING "ERROR: %s\n\n", str); | ||
| 6587 | |||
| 6588 | for (i = 0; i < nr_node_ids; i++) { | ||
| 6589 | printk(KERN_WARNING " "); | ||
| 6590 | for (j = 0; j < nr_node_ids; j++) | ||
| 6591 | printk(KERN_CONT "%02d ", node_distance(i,j)); | ||
| 6592 | printk(KERN_CONT "\n"); | ||
| 6593 | } | ||
| 6594 | printk(KERN_WARNING "\n"); | ||
| 6595 | } | ||
| 6596 | |||
| 6597 | static bool find_numa_distance(int distance) | ||
| 6598 | { | ||
| 6599 | int i; | ||
| 6600 | |||
| 6601 | if (distance == node_distance(0, 0)) | ||
| 6602 | return true; | ||
| 6603 | |||
| 6604 | for (i = 0; i < sched_domains_numa_levels; i++) { | ||
| 6605 | if (sched_domains_numa_distance[i] == distance) | ||
| 6606 | return true; | ||
| 6607 | } | ||
| 6608 | |||
| 6609 | return false; | ||
| 6610 | } | ||
| 6611 | |||
| 6347 | static void sched_init_numa(void) | 6612 | static void sched_init_numa(void) |
| 6348 | { | 6613 | { |
| 6349 | int next_distance, curr_distance = node_distance(0, 0); | 6614 | int next_distance, curr_distance = node_distance(0, 0); |
| @@ -6351,7 +6616,6 @@ static void sched_init_numa(void) | |||
| 6351 | int level = 0; | 6616 | int level = 0; |
| 6352 | int i, j, k; | 6617 | int i, j, k; |
| 6353 | 6618 | ||
| 6354 | sched_domains_numa_scale = curr_distance; | ||
| 6355 | sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); | 6619 | sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); |
| 6356 | if (!sched_domains_numa_distance) | 6620 | if (!sched_domains_numa_distance) |
| 6357 | return; | 6621 | return; |
| @@ -6362,23 +6626,41 @@ static void sched_init_numa(void) | |||
| 6362 | * | 6626 | * |
| 6363 | * Assumes node_distance(0,j) includes all distances in | 6627 | * Assumes node_distance(0,j) includes all distances in |
| 6364 | * node_distance(i,j) in order to avoid cubic time. | 6628 | * node_distance(i,j) in order to avoid cubic time. |
| 6365 | * | ||
| 6366 | * XXX: could be optimized to O(n log n) by using sort() | ||
| 6367 | */ | 6629 | */ |
| 6368 | next_distance = curr_distance; | 6630 | next_distance = curr_distance; |
| 6369 | for (i = 0; i < nr_node_ids; i++) { | 6631 | for (i = 0; i < nr_node_ids; i++) { |
| 6370 | for (j = 0; j < nr_node_ids; j++) { | 6632 | for (j = 0; j < nr_node_ids; j++) { |
| 6371 | int distance = node_distance(0, j); | 6633 | for (k = 0; k < nr_node_ids; k++) { |
| 6372 | if (distance > curr_distance && | 6634 | int distance = node_distance(i, k); |
| 6373 | (distance < next_distance || | 6635 | |
| 6374 | next_distance == curr_distance)) | 6636 | if (distance > curr_distance && |
| 6375 | next_distance = distance; | 6637 | (distance < next_distance || |
| 6638 | next_distance == curr_distance)) | ||
| 6639 | next_distance = distance; | ||
| 6640 | |||
| 6641 | /* | ||
| 6642 | * While not a strong assumption it would be nice to know | ||
| 6643 | * about cases where if node A is connected to B, B is not | ||
| 6644 | * equally connected to A. | ||
| 6645 | */ | ||
| 6646 | if (sched_debug() && node_distance(k, i) != distance) | ||
| 6647 | sched_numa_warn("Node-distance not symmetric"); | ||
| 6648 | |||
| 6649 | if (sched_debug() && i && !find_numa_distance(distance)) | ||
| 6650 | sched_numa_warn("Node-0 not representative"); | ||
| 6651 | } | ||
| 6652 | if (next_distance != curr_distance) { | ||
| 6653 | sched_domains_numa_distance[level++] = next_distance; | ||
| 6654 | sched_domains_numa_levels = level; | ||
| 6655 | curr_distance = next_distance; | ||
| 6656 | } else break; | ||
| 6376 | } | 6657 | } |
| 6377 | if (next_distance != curr_distance) { | 6658 | |
| 6378 | sched_domains_numa_distance[level++] = next_distance; | 6659 | /* |
| 6379 | sched_domains_numa_levels = level; | 6660 | * In case of sched_debug() we verify the above assumption. |
| 6380 | curr_distance = next_distance; | 6661 | */ |
| 6381 | } else break; | 6662 | if (!sched_debug()) |
| 6663 | break; | ||
| 6382 | } | 6664 | } |
| 6383 | /* | 6665 | /* |
| 6384 | * 'level' contains the number of unique distances, excluding the | 6666 | * 'level' contains the number of unique distances, excluding the |
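The loop above extracts the set of unique node distances by repeatedly picking the next-larger distance, now scanning node_distance(i, k) over all pairs rather than only row 0. A small userspace model of that level extraction over a hypothetical 4-node distance table; the table values are invented purely for illustration:

```c
#include <stdio.h>

#define NR_NODES 4

/* Hypothetical symmetric SLIT-style node distance table. */
static const int dist[NR_NODES][NR_NODES] = {
	{ 10, 20, 20, 30 },
	{ 20, 10, 30, 20 },
	{ 20, 30, 10, 20 },
	{ 30, 20, 20, 10 },
};

int main(void)
{
	int levels[NR_NODES * NR_NODES];
	int curr = dist[0][0], level = 0;

	/* Repeatedly pick the smallest distance strictly above 'curr'. */
	for (;;) {
		int next = curr;

		for (int i = 0; i < NR_NODES; i++)
			for (int k = 0; k < NR_NODES; k++) {
				int d = dist[i][k];

				if (d > curr && (d < next || next == curr))
					next = d;
			}

		if (next == curr)
			break;
		levels[level++] = next;
		curr = next;
	}

	printf("%d NUMA levels beyond local:", level);
	for (int i = 0; i < level; i++)
		printf(" %d", levels[i]);
	printf("\n");
	return 0;
}
```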
| @@ -6403,7 +6685,7 @@ static void sched_init_numa(void) | |||
| 6403 | return; | 6685 | return; |
| 6404 | 6686 | ||
| 6405 | for (j = 0; j < nr_node_ids; j++) { | 6687 | for (j = 0; j < nr_node_ids; j++) { |
| 6406 | struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j); | 6688 | struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); |
| 6407 | if (!mask) | 6689 | if (!mask) |
| 6408 | return; | 6690 | return; |
| 6409 | 6691 | ||
| @@ -6490,7 +6772,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) | |||
| 6490 | 6772 | ||
| 6491 | *per_cpu_ptr(sdd->sg, j) = sg; | 6773 | *per_cpu_ptr(sdd->sg, j) = sg; |
| 6492 | 6774 | ||
| 6493 | sgp = kzalloc_node(sizeof(struct sched_group_power), | 6775 | sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), |
| 6494 | GFP_KERNEL, cpu_to_node(j)); | 6776 | GFP_KERNEL, cpu_to_node(j)); |
| 6495 | if (!sgp) | 6777 | if (!sgp) |
| 6496 | return -ENOMEM; | 6778 | return -ENOMEM; |
| @@ -6543,7 +6825,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | |||
| 6543 | if (!sd) | 6825 | if (!sd) |
| 6544 | return child; | 6826 | return child; |
| 6545 | 6827 | ||
| 6546 | set_domain_attribute(sd, attr); | ||
| 6547 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); | 6828 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); |
| 6548 | if (child) { | 6829 | if (child) { |
| 6549 | sd->level = child->level + 1; | 6830 | sd->level = child->level + 1; |
| @@ -6551,6 +6832,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | |||
| 6551 | child->parent = sd; | 6832 | child->parent = sd; |
| 6552 | } | 6833 | } |
| 6553 | sd->child = child; | 6834 | sd->child = child; |
| 6835 | set_domain_attribute(sd, attr); | ||
| 6554 | 6836 | ||
| 6555 | return sd; | 6837 | return sd; |
| 6556 | } | 6838 | } |
| @@ -6691,7 +6973,6 @@ static int init_sched_domains(const struct cpumask *cpu_map) | |||
| 6691 | if (!doms_cur) | 6973 | if (!doms_cur) |
| 6692 | doms_cur = &fallback_doms; | 6974 | doms_cur = &fallback_doms; |
| 6693 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); | 6975 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); |
| 6694 | dattr_cur = NULL; | ||
| 6695 | err = build_sched_domains(doms_cur[0], NULL); | 6976 | err = build_sched_domains(doms_cur[0], NULL); |
| 6696 | register_sched_domain_sysctl(); | 6977 | register_sched_domain_sysctl(); |
| 6697 | 6978 | ||
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 940e6d17cf96..c099cc6eebe3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
| @@ -2703,7 +2703,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) | |||
| 2703 | int want_sd = 1; | 2703 | int want_sd = 1; |
| 2704 | int sync = wake_flags & WF_SYNC; | 2704 | int sync = wake_flags & WF_SYNC; |
| 2705 | 2705 | ||
| 2706 | if (p->rt.nr_cpus_allowed == 1) | 2706 | if (p->nr_cpus_allowed == 1) |
| 2707 | return prev_cpu; | 2707 | return prev_cpu; |
| 2708 | 2708 | ||
| 2709 | if (sd_flag & SD_BALANCE_WAKE) { | 2709 | if (sd_flag & SD_BALANCE_WAKE) { |
| @@ -3503,15 +3503,22 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) | |||
| 3503 | unsigned long scale_rt_power(int cpu) | 3503 | unsigned long scale_rt_power(int cpu) |
| 3504 | { | 3504 | { |
| 3505 | struct rq *rq = cpu_rq(cpu); | 3505 | struct rq *rq = cpu_rq(cpu); |
| 3506 | u64 total, available; | 3506 | u64 total, available, age_stamp, avg; |
| 3507 | 3507 | ||
| 3508 | total = sched_avg_period() + (rq->clock - rq->age_stamp); | 3508 | /* |
| 3509 | * Since we're reading these variables without serialization make sure | ||
| 3510 | * we read them once before doing sanity checks on them. | ||
| 3511 | */ | ||
| 3512 | age_stamp = ACCESS_ONCE(rq->age_stamp); | ||
| 3513 | avg = ACCESS_ONCE(rq->rt_avg); | ||
| 3514 | |||
| 3515 | total = sched_avg_period() + (rq->clock - age_stamp); | ||
| 3509 | 3516 | ||
| 3510 | if (unlikely(total < rq->rt_avg)) { | 3517 | if (unlikely(total < avg)) { |
| 3511 | /* Ensures that power won't end up being negative */ | 3518 | /* Ensures that power won't end up being negative */ |
| 3512 | available = 0; | 3519 | available = 0; |
| 3513 | } else { | 3520 | } else { |
| 3514 | available = total - rq->rt_avg; | 3521 | available = total - avg; |
| 3515 | } | 3522 | } |
| 3516 | 3523 | ||
| 3517 | if (unlikely((s64)total < SCHED_POWER_SCALE)) | 3524 | if (unlikely((s64)total < SCHED_POWER_SCALE)) |
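The hunk above makes scale_rt_power() snapshot rq->age_stamp and rq->rt_avg once (ACCESS_ONCE) before sanity-checking them, since they are read without serialization. The result of the function is essentially "the fraction of the averaging period not consumed by RT", expressed in SCHED_POWER_SCALE units. A sketch of that arithmetic with made-up nanosecond figures; it models the math, not the kernel function itself:

```c
#include <stdint.h>
#include <stdio.h>

#define SCHED_POWER_SHIFT 10
#define SCHED_POWER_SCALE (1UL << SCHED_POWER_SHIFT)

/* Power left for fair tasks: roughly available * 1024 / total. */
static unsigned long scale_rt_power(uint64_t total, uint64_t rt_avg)
{
	uint64_t available = (rt_avg < total) ? total - rt_avg : 0;

	if (total < SCHED_POWER_SCALE)		/* avoid tiny divisors */
		total = SCHED_POWER_SCALE;

	total >>= SCHED_POWER_SHIFT;
	return (unsigned long)(available / total);
}

int main(void)
{
	/* 500ms averaging period, RT classes consumed 125ms of it. */
	uint64_t period_ns = 500ULL * 1000 * 1000;
	uint64_t rt_ns     = 125ULL * 1000 * 1000;

	printf("scaled power = %lu / %lu\n",
	       scale_rt_power(period_ns, rt_ns), SCHED_POWER_SCALE);
	return 0;
}
```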
| @@ -3574,13 +3581,28 @@ void update_group_power(struct sched_domain *sd, int cpu) | |||
| 3574 | 3581 | ||
| 3575 | power = 0; | 3582 | power = 0; |
| 3576 | 3583 | ||
| 3577 | group = child->groups; | 3584 | if (child->flags & SD_OVERLAP) { |
| 3578 | do { | 3585 | /* |
| 3579 | power += group->sgp->power; | 3586 | * SD_OVERLAP domains cannot assume that child groups |
| 3580 | group = group->next; | 3587 | * span the current group. |
| 3581 | } while (group != child->groups); | 3588 | */ |
| 3582 | 3589 | ||
| 3583 | sdg->sgp->power = power; | 3590 | for_each_cpu(cpu, sched_group_cpus(sdg)) |
| 3591 | power += power_of(cpu); | ||
| 3592 | } else { | ||
| 3593 | /* | ||
| 3594 | * !SD_OVERLAP domains can assume that child groups | ||
| 3595 | * span the current group. | ||
| 3596 | */ | ||
| 3597 | |||
| 3598 | group = child->groups; | ||
| 3599 | do { | ||
| 3600 | power += group->sgp->power; | ||
| 3601 | group = group->next; | ||
| 3602 | } while (group != child->groups); | ||
| 3603 | } | ||
| 3604 | |||
| 3605 | sdg->sgp->power_orig = sdg->sgp->power = power; | ||
| 3584 | } | 3606 | } |
| 3585 | 3607 | ||
| 3586 | /* | 3608 | /* |
| @@ -3610,7 +3632,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group) | |||
| 3610 | 3632 | ||
| 3611 | /** | 3633 | /** |
| 3612 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. | 3634 | * update_sg_lb_stats - Update sched_group's statistics for load balancing. |
| 3613 | * @sd: The sched_domain whose statistics are to be updated. | 3635 | * @env: The load balancing environment. |
| 3614 | * @group: sched_group whose statistics are to be updated. | 3636 | * @group: sched_group whose statistics are to be updated. |
| 3615 | * @load_idx: Load index of sched_domain of this_cpu for load calc. | 3637 | * @load_idx: Load index of sched_domain of this_cpu for load calc. |
| 3616 | * @local_group: Does group contain this_cpu. | 3638 | * @local_group: Does group contain this_cpu. |
| @@ -3630,7 +3652,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 3630 | int i; | 3652 | int i; |
| 3631 | 3653 | ||
| 3632 | if (local_group) | 3654 | if (local_group) |
| 3633 | balance_cpu = group_first_cpu(group); | 3655 | balance_cpu = group_balance_cpu(group); |
| 3634 | 3656 | ||
| 3635 | /* Tally up the load of all CPUs in the group */ | 3657 | /* Tally up the load of all CPUs in the group */ |
| 3636 | max_cpu_load = 0; | 3658 | max_cpu_load = 0; |
| @@ -3645,7 +3667,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 3645 | 3667 | ||
| 3646 | /* Bias balancing toward cpus of our domain */ | 3668 | /* Bias balancing toward cpus of our domain */ |
| 3647 | if (local_group) { | 3669 | if (local_group) { |
| 3648 | if (idle_cpu(i) && !first_idle_cpu) { | 3670 | if (idle_cpu(i) && !first_idle_cpu && |
| 3671 | cpumask_test_cpu(i, sched_group_mask(group))) { | ||
| 3649 | first_idle_cpu = 1; | 3672 | first_idle_cpu = 1; |
| 3650 | balance_cpu = i; | 3673 | balance_cpu = i; |
| 3651 | } | 3674 | } |
| @@ -3719,11 +3742,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
| 3719 | 3742 | ||
| 3720 | /** | 3743 | /** |
| 3721 | * update_sd_pick_busiest - return 1 on busiest group | 3744 | * update_sd_pick_busiest - return 1 on busiest group |
| 3722 | * @sd: sched_domain whose statistics are to be checked | 3745 | * @env: The load balancing environment. |
| 3723 | * @sds: sched_domain statistics | 3746 | * @sds: sched_domain statistics |
| 3724 | * @sg: sched_group candidate to be checked for being the busiest | 3747 | * @sg: sched_group candidate to be checked for being the busiest |
| 3725 | * @sgs: sched_group statistics | 3748 | * @sgs: sched_group statistics |
| 3726 | * @this_cpu: the current cpu | ||
| 3727 | * | 3749 | * |
| 3728 | * Determine if @sg is a busier group than the previously selected | 3750 | * Determine if @sg is a busier group than the previously selected |
| 3729 | * busiest group. | 3751 | * busiest group. |
| @@ -3761,9 +3783,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, | |||
| 3761 | 3783 | ||
| 3762 | /** | 3784 | /** |
| 3763 | * update_sd_lb_stats - Update sched_domain's statistics for load balancing. | 3785 | * update_sd_lb_stats - Update sched_domain's statistics for load balancing. |
| 3764 | * @sd: sched_domain whose statistics are to be updated. | 3786 | * @env: The load balancing environment. |
| 3765 | * @this_cpu: Cpu for which load balance is currently performed. | ||
| 3766 | * @idle: Idle status of this_cpu | ||
| 3767 | * @cpus: Set of cpus considered for load balancing. | 3787 | * @cpus: Set of cpus considered for load balancing. |
| 3768 | * @balance: Should we balance. | 3788 | * @balance: Should we balance. |
| 3769 | * @sds: variable to hold the statistics for this sched_domain. | 3789 | * @sds: variable to hold the statistics for this sched_domain. |
| @@ -3852,10 +3872,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, | |||
| 3852 | * Returns 1 when packing is required and a task should be moved to | 3872 | * Returns 1 when packing is required and a task should be moved to |
| 3853 | * this CPU. The amount of the imbalance is returned in *imbalance. | 3873 | * this CPU. The amount of the imbalance is returned in *imbalance. |
| 3854 | * | 3874 | * |
| 3855 | * @sd: The sched_domain whose packing is to be checked. | 3875 | * @env: The load balancing environment. |
| 3856 | * @sds: Statistics of the sched_domain which is to be packed | 3876 | * @sds: Statistics of the sched_domain which is to be packed |
| 3857 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. | ||
| 3858 | * @imbalance: returns amount of imbalanced due to packing. | ||
| 3859 | */ | 3877 | */ |
| 3860 | static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) | 3878 | static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) |
| 3861 | { | 3879 | { |
| @@ -3881,9 +3899,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) | |||
| 3881 | * fix_small_imbalance - Calculate the minor imbalance that exists | 3899 | * fix_small_imbalance - Calculate the minor imbalance that exists |
| 3882 | * amongst the groups of a sched_domain, during | 3900 | * amongst the groups of a sched_domain, during |
| 3883 | * load balancing. | 3901 | * load balancing. |
| 3902 | * @env: The load balancing environment. | ||
| 3884 | * @sds: Statistics of the sched_domain whose imbalance is to be calculated. | 3903 | * @sds: Statistics of the sched_domain whose imbalance is to be calculated. |
| 3885 | * @this_cpu: The cpu at whose sched_domain we're performing load-balance. | ||
| 3886 | * @imbalance: Variable to store the imbalance. | ||
| 3887 | */ | 3904 | */ |
| 3888 | static inline | 3905 | static inline |
| 3889 | void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) | 3906 | void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) |
| @@ -4026,11 +4043,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
| 4026 | * Also calculates the amount of weighted load which should be moved | 4043 | * Also calculates the amount of weighted load which should be moved |
| 4027 | * to restore balance. | 4044 | * to restore balance. |
| 4028 | * | 4045 | * |
| 4029 | * @sd: The sched_domain whose busiest group is to be returned. | 4046 | * @env: The load balancing environment. |
| 4030 | * @this_cpu: The cpu for which load balancing is currently being performed. | ||
| 4031 | * @imbalance: Variable which stores amount of weighted load which should | ||
| 4032 | * be moved to restore balance/put a group to idle. | ||
| 4033 | * @idle: The idle status of this_cpu. | ||
| 4034 | * @cpus: The set of CPUs under consideration for load-balancing. | 4047 | * @cpus: The set of CPUs under consideration for load-balancing. |
| 4035 | * @balance: Pointer to a variable indicating if this_cpu | 4048 | * @balance: Pointer to a variable indicating if this_cpu |
| 4036 | * is the appropriate cpu to perform load balancing at this_level. | 4049 | * is the appropriate cpu to perform load balancing at this_level. |
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index b44d604b35d1..b6baf370cae9 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
| @@ -25,7 +25,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl | |||
| 25 | static struct task_struct *pick_next_task_idle(struct rq *rq) | 25 | static struct task_struct *pick_next_task_idle(struct rq *rq) |
| 26 | { | 26 | { |
| 27 | schedstat_inc(rq, sched_goidle); | 27 | schedstat_inc(rq, sched_goidle); |
| 28 | calc_load_account_idle(rq); | ||
| 29 | return rq->idle; | 28 | return rq->idle; |
| 30 | } | 29 | } |
| 31 | 30 | ||
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index c5565c3c515f..573e1ca01102 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
| @@ -274,13 +274,16 @@ static void update_rt_migration(struct rt_rq *rt_rq) | |||
| 274 | 274 | ||
| 275 | static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | 275 | static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) |
| 276 | { | 276 | { |
| 277 | struct task_struct *p; | ||
| 278 | |||
| 277 | if (!rt_entity_is_task(rt_se)) | 279 | if (!rt_entity_is_task(rt_se)) |
| 278 | return; | 280 | return; |
| 279 | 281 | ||
| 282 | p = rt_task_of(rt_se); | ||
| 280 | rt_rq = &rq_of_rt_rq(rt_rq)->rt; | 283 | rt_rq = &rq_of_rt_rq(rt_rq)->rt; |
| 281 | 284 | ||
| 282 | rt_rq->rt_nr_total++; | 285 | rt_rq->rt_nr_total++; |
| 283 | if (rt_se->nr_cpus_allowed > 1) | 286 | if (p->nr_cpus_allowed > 1) |
| 284 | rt_rq->rt_nr_migratory++; | 287 | rt_rq->rt_nr_migratory++; |
| 285 | 288 | ||
| 286 | update_rt_migration(rt_rq); | 289 | update_rt_migration(rt_rq); |
| @@ -288,13 +291,16 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | |||
| 288 | 291 | ||
| 289 | static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | 292 | static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) |
| 290 | { | 293 | { |
| 294 | struct task_struct *p; | ||
| 295 | |||
| 291 | if (!rt_entity_is_task(rt_se)) | 296 | if (!rt_entity_is_task(rt_se)) |
| 292 | return; | 297 | return; |
| 293 | 298 | ||
| 299 | p = rt_task_of(rt_se); | ||
| 294 | rt_rq = &rq_of_rt_rq(rt_rq)->rt; | 300 | rt_rq = &rq_of_rt_rq(rt_rq)->rt; |
| 295 | 301 | ||
| 296 | rt_rq->rt_nr_total--; | 302 | rt_rq->rt_nr_total--; |
| 297 | if (rt_se->nr_cpus_allowed > 1) | 303 | if (p->nr_cpus_allowed > 1) |
| 298 | rt_rq->rt_nr_migratory--; | 304 | rt_rq->rt_nr_migratory--; |
| 299 | 305 | ||
| 300 | update_rt_migration(rt_rq); | 306 | update_rt_migration(rt_rq); |
| @@ -1161,7 +1167,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) | |||
| 1161 | 1167 | ||
| 1162 | enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); | 1168 | enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); |
| 1163 | 1169 | ||
| 1164 | if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) | 1170 | if (!task_current(rq, p) && p->nr_cpus_allowed > 1) |
| 1165 | enqueue_pushable_task(rq, p); | 1171 | enqueue_pushable_task(rq, p); |
| 1166 | 1172 | ||
| 1167 | inc_nr_running(rq); | 1173 | inc_nr_running(rq); |
| @@ -1225,7 +1231,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | |||
| 1225 | 1231 | ||
| 1226 | cpu = task_cpu(p); | 1232 | cpu = task_cpu(p); |
| 1227 | 1233 | ||
| 1228 | if (p->rt.nr_cpus_allowed == 1) | 1234 | if (p->nr_cpus_allowed == 1) |
| 1229 | goto out; | 1235 | goto out; |
| 1230 | 1236 | ||
| 1231 | /* For anything but wake ups, just return the task_cpu */ | 1237 | /* For anything but wake ups, just return the task_cpu */ |
| @@ -1260,9 +1266,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) | |||
| 1260 | * will have to sort it out. | 1266 | * will have to sort it out. |
| 1261 | */ | 1267 | */ |
| 1262 | if (curr && unlikely(rt_task(curr)) && | 1268 | if (curr && unlikely(rt_task(curr)) && |
| 1263 | (curr->rt.nr_cpus_allowed < 2 || | 1269 | (curr->nr_cpus_allowed < 2 || |
| 1264 | curr->prio <= p->prio) && | 1270 | curr->prio <= p->prio) && |
| 1265 | (p->rt.nr_cpus_allowed > 1)) { | 1271 | (p->nr_cpus_allowed > 1)) { |
| 1266 | int target = find_lowest_rq(p); | 1272 | int target = find_lowest_rq(p); |
| 1267 | 1273 | ||
| 1268 | if (target != -1) | 1274 | if (target != -1) |
| @@ -1276,10 +1282,10 @@ out: | |||
| 1276 | 1282 | ||
| 1277 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) | 1283 | static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) |
| 1278 | { | 1284 | { |
| 1279 | if (rq->curr->rt.nr_cpus_allowed == 1) | 1285 | if (rq->curr->nr_cpus_allowed == 1) |
| 1280 | return; | 1286 | return; |
| 1281 | 1287 | ||
| 1282 | if (p->rt.nr_cpus_allowed != 1 | 1288 | if (p->nr_cpus_allowed != 1 |
| 1283 | && cpupri_find(&rq->rd->cpupri, p, NULL)) | 1289 | && cpupri_find(&rq->rd->cpupri, p, NULL)) |
| 1284 | return; | 1290 | return; |
| 1285 | 1291 | ||
| @@ -1395,7 +1401,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | |||
| 1395 | * The previous task needs to be made eligible for pushing | 1401 | * The previous task needs to be made eligible for pushing |
| 1396 | * if it is still active | 1402 | * if it is still active |
| 1397 | */ | 1403 | */ |
| 1398 | if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) | 1404 | if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) |
| 1399 | enqueue_pushable_task(rq, p); | 1405 | enqueue_pushable_task(rq, p); |
| 1400 | } | 1406 | } |
| 1401 | 1407 | ||
| @@ -1408,7 +1414,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) | |||
| 1408 | { | 1414 | { |
| 1409 | if (!task_running(rq, p) && | 1415 | if (!task_running(rq, p) && |
| 1410 | (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && | 1416 | (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && |
| 1411 | (p->rt.nr_cpus_allowed > 1)) | 1417 | (p->nr_cpus_allowed > 1)) |
| 1412 | return 1; | 1418 | return 1; |
| 1413 | return 0; | 1419 | return 0; |
| 1414 | } | 1420 | } |
| @@ -1464,7 +1470,7 @@ static int find_lowest_rq(struct task_struct *task) | |||
| 1464 | if (unlikely(!lowest_mask)) | 1470 | if (unlikely(!lowest_mask)) |
| 1465 | return -1; | 1471 | return -1; |
| 1466 | 1472 | ||
| 1467 | if (task->rt.nr_cpus_allowed == 1) | 1473 | if (task->nr_cpus_allowed == 1) |
| 1468 | return -1; /* No other targets possible */ | 1474 | return -1; /* No other targets possible */ |
| 1469 | 1475 | ||
| 1470 | if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) | 1476 | if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) |
| @@ -1556,7 +1562,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) | |||
| 1556 | task_running(rq, task) || | 1562 | task_running(rq, task) || |
| 1557 | !task->on_rq)) { | 1563 | !task->on_rq)) { |
| 1558 | 1564 | ||
| 1559 | raw_spin_unlock(&lowest_rq->lock); | 1565 | double_unlock_balance(rq, lowest_rq); |
| 1560 | lowest_rq = NULL; | 1566 | lowest_rq = NULL; |
| 1561 | break; | 1567 | break; |
| 1562 | } | 1568 | } |
| @@ -1586,7 +1592,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) | |||
| 1586 | 1592 | ||
| 1587 | BUG_ON(rq->cpu != task_cpu(p)); | 1593 | BUG_ON(rq->cpu != task_cpu(p)); |
| 1588 | BUG_ON(task_current(rq, p)); | 1594 | BUG_ON(task_current(rq, p)); |
| 1589 | BUG_ON(p->rt.nr_cpus_allowed <= 1); | 1595 | BUG_ON(p->nr_cpus_allowed <= 1); |
| 1590 | 1596 | ||
| 1591 | BUG_ON(!p->on_rq); | 1597 | BUG_ON(!p->on_rq); |
| 1592 | BUG_ON(!rt_task(p)); | 1598 | BUG_ON(!rt_task(p)); |
| @@ -1793,9 +1799,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) | |||
| 1793 | if (!task_running(rq, p) && | 1799 | if (!task_running(rq, p) && |
| 1794 | !test_tsk_need_resched(rq->curr) && | 1800 | !test_tsk_need_resched(rq->curr) && |
| 1795 | has_pushable_tasks(rq) && | 1801 | has_pushable_tasks(rq) && |
| 1796 | p->rt.nr_cpus_allowed > 1 && | 1802 | p->nr_cpus_allowed > 1 && |
| 1797 | rt_task(rq->curr) && | 1803 | rt_task(rq->curr) && |
| 1798 | (rq->curr->rt.nr_cpus_allowed < 2 || | 1804 | (rq->curr->nr_cpus_allowed < 2 || |
| 1799 | rq->curr->prio <= p->prio)) | 1805 | rq->curr->prio <= p->prio)) |
| 1800 | push_rt_tasks(rq); | 1806 | push_rt_tasks(rq); |
| 1801 | } | 1807 | } |
| @@ -1817,7 +1823,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, | |||
| 1817 | * Only update if the process changes its state from whether it | 1823 | * Only update if the process changes its state from whether it |
| 1818 | * can migrate or not. | 1824 | * can migrate or not. |
| 1819 | */ | 1825 | */ |
| 1820 | if ((p->rt.nr_cpus_allowed > 1) == (weight > 1)) | 1826 | if ((p->nr_cpus_allowed > 1) == (weight > 1)) |
| 1821 | return; | 1827 | return; |
| 1822 | 1828 | ||
| 1823 | rq = task_rq(p); | 1829 | rq = task_rq(p); |
| @@ -1979,6 +1985,8 @@ static void watchdog(struct rq *rq, struct task_struct *p) | |||
| 1979 | 1985 | ||
| 1980 | static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | 1986 | static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) |
| 1981 | { | 1987 | { |
| 1988 | struct sched_rt_entity *rt_se = &p->rt; | ||
| 1989 | |||
| 1982 | update_curr_rt(rq); | 1990 | update_curr_rt(rq); |
| 1983 | 1991 | ||
| 1984 | watchdog(rq, p); | 1992 | watchdog(rq, p); |
| @@ -1996,12 +2004,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | |||
| 1996 | p->rt.time_slice = RR_TIMESLICE; | 2004 | p->rt.time_slice = RR_TIMESLICE; |
| 1997 | 2005 | ||
| 1998 | /* | 2006 | /* |
| 1999 | * Requeue to the end of queue if we are not the only element | 2007 | * Requeue to the end of queue if we (and all of our ancestors) are the |
| 2000 | * on the queue: | 2008 | * only element on the queue |
| 2001 | */ | 2009 | */ |
| 2002 | if (p->rt.run_list.prev != p->rt.run_list.next) { | 2010 | for_each_sched_rt_entity(rt_se) { |
| 2003 | requeue_task_rt(rq, p, 0); | 2011 | if (rt_se->run_list.prev != rt_se->run_list.next) { |
| 2004 | set_tsk_need_resched(p); | 2012 | requeue_task_rt(rq, p, 0); |
| 2013 | set_tsk_need_resched(p); | ||
| 2014 | return; | ||
| 2015 | } | ||
| 2005 | } | 2016 | } |
| 2006 | } | 2017 | } |
| 2007 | 2018 | ||
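The task_tick_rt() fix above walks every sched_rt_entity from the task up through its group hierarchy and only round-robin requeues when some level actually has more than one entity queued. A userspace model of that check, with a tiny circular list standing in for the rt_rq priority queue and a simple parent pointer standing in for the group hierarchy (both are simplifications of the kernel data structures):

```c
#include <stdbool.h>
#include <stdio.h>

struct list_head { struct list_head *prev, *next; };

struct rt_entity {
	struct list_head run_list;	/* node on its rt_rq's priority queue */
	struct rt_entity *parent;	/* group entity one level up, or NULL */
};

static void list_init(struct list_head *h) { h->prev = h->next = h; }

static void list_add_tail(struct list_head *n, struct list_head *h)
{
	n->prev = h->prev;
	n->next = h;
	h->prev->next = n;
	h->prev = n;
}

/* Same test as the kernel's run_list.prev != run_list.next, inverted. */
static bool alone_on_queue(const struct rt_entity *se)
{
	return se->run_list.prev == se->run_list.next;
}

/* Requeue an RR task only if some level of the hierarchy has company. */
static bool should_requeue(struct rt_entity *se)
{
	for (; se; se = se->parent)
		if (!alone_on_queue(se))
			return true;
	return false;
}

int main(void)
{
	struct list_head q_top, q_child;
	struct rt_entity group = { .parent = NULL };
	struct rt_entity task  = { .parent = &group };

	list_init(&q_top);
	list_init(&q_child);

	/* Task alone in its group's queue, group alone at the top level. */
	list_add_tail(&task.run_list, &q_child);
	list_add_tail(&group.run_list, &q_top);
	printf("requeue? %d\n", should_requeue(&task));	/* 0: nobody to yield to */

	/* Add a sibling group at the top level. */
	struct rt_entity other = { .parent = NULL };
	list_add_tail(&other.run_list, &q_top);
	printf("requeue? %d\n", should_requeue(&task));	/* 1: ancestor has company */
	return 0;
}
```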
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ba9dccfd24ce..55844f24435a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
| @@ -526,6 +526,8 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) | |||
| 526 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); | 526 | DECLARE_PER_CPU(struct sched_domain *, sd_llc); |
| 527 | DECLARE_PER_CPU(int, sd_llc_id); | 527 | DECLARE_PER_CPU(int, sd_llc_id); |
| 528 | 528 | ||
| 529 | extern int group_balance_cpu(struct sched_group *sg); | ||
| 530 | |||
| 529 | #endif /* CONFIG_SMP */ | 531 | #endif /* CONFIG_SMP */ |
| 530 | 532 | ||
| 531 | #include "stats.h" | 533 | #include "stats.h" |
| @@ -940,8 +942,6 @@ static inline u64 sched_avg_period(void) | |||
| 940 | return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; | 942 | return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; |
| 941 | } | 943 | } |
| 942 | 944 | ||
| 943 | void calc_load_account_idle(struct rq *this_rq); | ||
| 944 | |||
| 945 | #ifdef CONFIG_SCHED_HRTICK | 945 | #ifdef CONFIG_SCHED_HRTICK |
| 946 | 946 | ||
| 947 | /* | 947 | /* |
