Diffstat (limited to 'kernel/sched/core.c')
 -rw-r--r--  kernel/sched/core.c | 919
 1 file changed, 605 insertions, 314 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0533a688ce22..468bdd44c1ba 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -83,6 +83,7 @@ | |||
83 | 83 | ||
84 | #include "sched.h" | 84 | #include "sched.h" |
85 | #include "../workqueue_sched.h" | 85 | #include "../workqueue_sched.h" |
86 | #include "../smpboot.h" | ||
86 | 87 | ||
87 | #define CREATE_TRACE_POINTS | 88 | #define CREATE_TRACE_POINTS |
88 | #include <trace/events/sched.h> | 89 | #include <trace/events/sched.h> |
@@ -141,9 +142,8 @@ const_debug unsigned int sysctl_sched_features = | |||
141 | #define SCHED_FEAT(name, enabled) \ | 142 | #define SCHED_FEAT(name, enabled) \ |
142 | #name , | 143 | #name , |
143 | 144 | ||
144 | static __read_mostly char *sched_feat_names[] = { | 145 | static const char * const sched_feat_names[] = { |
145 | #include "features.h" | 146 | #include "features.h" |
146 | NULL | ||
147 | }; | 147 | }; |
148 | 148 | ||
149 | #undef SCHED_FEAT | 149 | #undef SCHED_FEAT |
@@ -692,8 +692,6 @@ int tg_nop(struct task_group *tg, void *data) | |||
692 | } | 692 | } |
693 | #endif | 693 | #endif |
694 | 694 | ||
695 | void update_cpu_load(struct rq *this_rq); | ||
696 | |||
697 | static void set_load_weight(struct task_struct *p) | 695 | static void set_load_weight(struct task_struct *p) |
698 | { | 696 | { |
699 | int prio = p->static_prio - MAX_RT_PRIO; | 697 | int prio = p->static_prio - MAX_RT_PRIO; |
@@ -2162,11 +2160,73 @@ unsigned long this_cpu_load(void) | |||
2162 | } | 2160 | } |
2163 | 2161 | ||
2164 | 2162 | ||
2163 | /* | ||
2164 | * Global load-average calculations | ||
2165 | * | ||
2166 | * We take a distributed and async approach to calculating the global load-avg | ||
2167 | * in order to minimize overhead. | ||
2168 | * | ||
2169 | * The global load average is an exponentially decaying average of nr_running + | ||
2170 | * nr_uninterruptible. | ||
2171 | * | ||
2172 | * Once every LOAD_FREQ: | ||
2173 | * | ||
2174 | * nr_active = 0; | ||
2175 | * for_each_possible_cpu(cpu) | ||
2176 | * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible; | ||
2177 | * | ||
2178 | * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n) | ||
2179 | * | ||
2180 | * Due to a number of reasons the above turns into the mess below: | ||
2181 | * | ||
2182 | * - for_each_possible_cpu() is prohibitively expensive on machines with | ||
2183 | * serious number of cpus, therefore we need to take a distributed approach | ||
2184 | * to calculating nr_active. | ||
2185 | * | ||
2186 | * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 | ||
2187 | * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } | ||
2188 | * | ||
2189 | * So assuming nr_active := 0 when we start out -- true per definition, we | ||
2190 | * can simply take per-cpu deltas and fold those into a global accumulate | ||
2191 | * to obtain the same result. See calc_load_fold_active(). | ||
2192 | * | ||
2193 | * Furthermore, in order to avoid synchronizing all per-cpu delta folding | ||
2194 | * across the machine, we assume 10 ticks is sufficient time for every | ||
2195 | * cpu to have completed this task. | ||
2196 | * | ||
2197 | * This places an upper-bound on the IRQ-off latency of the machine. Then | ||
2198 | * again, being late doesn't lose the delta, just wrecks the sample. | ||
2199 | * | ||
2200 | * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because | ||
2201 | * this would add another cross-cpu cacheline miss and atomic operation | ||
2202 | * to the wakeup path. Instead we increment on whatever cpu the task ran | ||
2203 | * when it went into uninterruptible state and decrement on whatever cpu | ||
2204 | * did the wakeup. This means that only the sum of nr_uninterruptible over | ||
2205 | * all cpus yields the correct result. | ||
2206 | * | ||
2207 | * This covers the NO_HZ=n code, for extra head-aches, see the comment below. | ||
2208 | */ | ||
2209 | |||
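For reference, the per-cpu folding step described above, calc_load_fold_active(), has its body largely elided by the hunk boundaries below; it amounts to a simple delta against the contribution each runqueue last reported. A simplified sketch (field names as in this file, not a verbatim copy of the kernel body):

	static long calc_load_fold_active(struct rq *this_rq)
	{
		long nr_active, delta = 0;

		nr_active = this_rq->nr_running;
		nr_active += (long)this_rq->nr_uninterruptible;

		if (nr_active != this_rq->calc_load_active) {
			delta = nr_active - this_rq->calc_load_active;
			this_rq->calc_load_active = nr_active;
		}

		return delta;
	}

Each CPU thus reports only what changed since it last folded, and the global calc_load_tasks counter accumulates those deltas.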
2165 | /* Variables and functions for calc_load */ | 2210 | /* Variables and functions for calc_load */ |
2166 | static atomic_long_t calc_load_tasks; | 2211 | static atomic_long_t calc_load_tasks; |
2167 | static unsigned long calc_load_update; | 2212 | static unsigned long calc_load_update; |
2168 | unsigned long avenrun[3]; | 2213 | unsigned long avenrun[3]; |
2169 | EXPORT_SYMBOL(avenrun); | 2214 | EXPORT_SYMBOL(avenrun); /* should be removed */ |
2215 | |||
2216 | /** | ||
2217 | * get_avenrun - get the load average array | ||
2218 | * @loads: pointer to dest load array | ||
2219 | * @offset: offset to add | ||
2220 | * @shift: shift count to shift the result left | ||
2221 | * | ||
2222 | * These values are estimates at best, so no need for locking. | ||
2223 | */ | ||
2224 | void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | ||
2225 | { | ||
2226 | loads[0] = (avenrun[0] + offset) << shift; | ||
2227 | loads[1] = (avenrun[1] + offset) << shift; | ||
2228 | loads[2] = (avenrun[2] + offset) << shift; | ||
2229 | } | ||
2170 | 2230 | ||
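For context, the main consumer of get_avenrun() is /proc/loadavg: fs/proc/loadavg.c reads the three averages with a small rounding offset and formats them with the LOAD_INT()/LOAD_FRAC() fixed-point macros, roughly (abridged):

	unsigned long avnrun[3];

	get_avenrun(avnrun, FIXED_1/200, 0);	/* ~ +0.005 for rounding, no shift */
	seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu",
		   LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
		   LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
		   LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]));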
2171 | static long calc_load_fold_active(struct rq *this_rq) | 2231 | static long calc_load_fold_active(struct rq *this_rq) |
2172 | { | 2232 | { |
@@ -2183,6 +2243,9 @@ static long calc_load_fold_active(struct rq *this_rq) | |||
2183 | return delta; | 2243 | return delta; |
2184 | } | 2244 | } |
2185 | 2245 | ||
2246 | /* | ||
2247 | * a1 = a0 * e + a * (1 - e) | ||
2248 | */ | ||
2186 | static unsigned long | 2249 | static unsigned long |
2187 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | 2250 | calc_load(unsigned long load, unsigned long exp, unsigned long active) |
2188 | { | 2251 | { |
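As a worked example of that recurrence, here is a stand-alone user-space sketch using the fixed-point constants from <linux/sched.h> (FSHIFT = 11, FIXED_1 = 2048, EXP_1 = 1884); the helper below mirrors calc_load() but is not quoted from the kernel:

	#include <stdio.h>

	#define FSHIFT	11
	#define FIXED_1	(1UL << FSHIFT)
	#define EXP_1	1884	/* FIXED_1 * exp(-5s/60s): per-sample decay of the 1-min average */

	static unsigned long calc_load(unsigned long load, unsigned long exp,
				       unsigned long active)
	{
		load *= exp;
		load += active * (FIXED_1 - exp);
		return load >> FSHIFT;
	}

	int main(void)
	{
		unsigned long load = FIXED_1;		/* 1-min average currently 1.00 */
		unsigned long active = 2 * FIXED_1;	/* two tasks active this sample */

		load = calc_load(load, EXP_1, active);
		printf("%lu.%02lu\n", load >> FSHIFT,
		       ((load & (FIXED_1 - 1)) * 100) >> FSHIFT);
		return 0;
	}

With a previous 1-minute average of 1.00 and two tasks active, one 5-second sample moves the average to 1.08, i.e. roughly 8% of the way towards 2.00 per LOAD_FREQ interval.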
@@ -2194,30 +2257,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) | |||
2194 | 2257 | ||
2195 | #ifdef CONFIG_NO_HZ | 2258 | #ifdef CONFIG_NO_HZ |
2196 | /* | 2259 | /* |
2197 | * For NO_HZ we delay the active fold to the next LOAD_FREQ update. | 2260 | * Handle NO_HZ for the global load-average. |
2261 | * | ||
2262 | * Since the above described distributed algorithm to compute the global | ||
2263 | * load-average relies on per-cpu sampling from the tick, it is affected by | ||
2264 | * NO_HZ. | ||
2265 | * | ||
2266 | * The basic idea is to fold the nr_active delta into a global idle-delta upon | ||
2267 | * entering NO_HZ state such that we can include this as an 'extra' cpu delta | ||
2268 | * when we read the global state. | ||
2269 | * | ||
2270 | * Obviously reality has to ruin such a delightfully simple scheme: | ||
2271 | * | ||
2272 | * - When we go NO_HZ idle during the window, we can negate our sample | ||
2273 | * contribution, causing under-accounting. | ||
2274 | * | ||
2275 | * We avoid this by keeping two idle-delta counters and flipping them | ||
2276 | * when the window starts, thus separating old and new NO_HZ load. | ||
2277 | * | ||
2278 | * The only trick is the slight shift in index flip for read vs write. | ||
2279 | * | ||
2280 | * 0s 5s 10s 15s | ||
2281 | * +10 +10 +10 +10 | ||
2282 | * |-|-----------|-|-----------|-|-----------|-| | ||
2283 | * r:0 0 1 1 0 0 1 1 0 | ||
2284 | * w:0 1 1 0 0 1 1 0 0 | ||
2285 | * | ||
2286 | * This ensures we'll fold the old idle contribution in this window while | ||
2287 | * accumulating the new one. | ||
2288 | * | ||
2289 | * - When we wake up from NO_HZ idle during the window, we push up our | ||
2290 | * contribution, since we effectively move our sample point to a known | ||
2291 | * busy state. | ||
2292 | * | ||
2293 | * This is solved by pushing the window forward, and thus skipping the | ||
2294 | * sample, for this cpu (effectively using the idle-delta for this cpu which | ||
2295 | * was in effect at the time the window opened). This also solves the issue | ||
2296 | * of having to deal with a cpu having been in NOHZ idle for multiple | ||
2297 | * LOAD_FREQ intervals. | ||
2198 | * | 2298 | * |
2199 | * When making the ILB scale, we should try to pull this in as well. | 2299 | * When making the ILB scale, we should try to pull this in as well. |
2200 | */ | 2300 | */ |
2201 | static atomic_long_t calc_load_tasks_idle; | 2301 | static atomic_long_t calc_load_idle[2]; |
2302 | static int calc_load_idx; | ||
2202 | 2303 | ||
2203 | void calc_load_account_idle(struct rq *this_rq) | 2304 | static inline int calc_load_write_idx(void) |
2204 | { | 2305 | { |
2306 | int idx = calc_load_idx; | ||
2307 | |||
2308 | /* | ||
2309 | * See calc_global_nohz(), if we observe the new index, we also | ||
2310 | * need to observe the new update time. | ||
2311 | */ | ||
2312 | smp_rmb(); | ||
2313 | |||
2314 | /* | ||
2315 | * If the folding window started, make sure we start writing in the | ||
2316 | * next idle-delta. | ||
2317 | */ | ||
2318 | if (!time_before(jiffies, calc_load_update)) | ||
2319 | idx++; | ||
2320 | |||
2321 | return idx & 1; | ||
2322 | } | ||
2323 | |||
2324 | static inline int calc_load_read_idx(void) | ||
2325 | { | ||
2326 | return calc_load_idx & 1; | ||
2327 | } | ||
2328 | |||
2329 | void calc_load_enter_idle(void) | ||
2330 | { | ||
2331 | struct rq *this_rq = this_rq(); | ||
2205 | long delta; | 2332 | long delta; |
2206 | 2333 | ||
2334 | /* | ||
2335 | * We're going into NOHZ mode, if there's any pending delta, fold it | ||
2336 | * into the pending idle delta. | ||
2337 | */ | ||
2207 | delta = calc_load_fold_active(this_rq); | 2338 | delta = calc_load_fold_active(this_rq); |
2208 | if (delta) | 2339 | if (delta) { |
2209 | atomic_long_add(delta, &calc_load_tasks_idle); | 2340 | int idx = calc_load_write_idx(); |
2341 | atomic_long_add(delta, &calc_load_idle[idx]); | ||
2342 | } | ||
2210 | } | 2343 | } |
2211 | 2344 | ||
2212 | static long calc_load_fold_idle(void) | 2345 | void calc_load_exit_idle(void) |
2213 | { | 2346 | { |
2214 | long delta = 0; | 2347 | struct rq *this_rq = this_rq(); |
2215 | 2348 | ||
2216 | /* | 2349 | /* |
2217 | * Its got a race, we don't care... | 2350 | * If we're still before the sample window, we're done. |
2218 | */ | 2351 | */ |
2219 | if (atomic_long_read(&calc_load_tasks_idle)) | 2352 | if (time_before(jiffies, this_rq->calc_load_update)) |
2220 | delta = atomic_long_xchg(&calc_load_tasks_idle, 0); | 2353 | return; |
2354 | |||
2355 | /* | ||
2356 | * We woke inside or after the sample window, this means we're already | ||
2357 | * accounted through the nohz accounting, so skip the entire deal and | ||
2358 | * sync up for the next window. | ||
2359 | */ | ||
2360 | this_rq->calc_load_update = calc_load_update; | ||
2361 | if (time_before(jiffies, this_rq->calc_load_update + 10)) | ||
2362 | this_rq->calc_load_update += LOAD_FREQ; | ||
2363 | } | ||
2364 | |||
2365 | static long calc_load_fold_idle(void) | ||
2366 | { | ||
2367 | int idx = calc_load_read_idx(); | ||
2368 | long delta = 0; | ||
2369 | |||
2370 | if (atomic_long_read(&calc_load_idle[idx])) | ||
2371 | delta = atomic_long_xchg(&calc_load_idle[idx], 0); | ||
2221 | 2372 | ||
2222 | return delta; | 2373 | return delta; |
2223 | } | 2374 | } |
@@ -2303,66 +2454,39 @@ static void calc_global_nohz(void) | |||
2303 | { | 2454 | { |
2304 | long delta, active, n; | 2455 | long delta, active, n; |
2305 | 2456 | ||
2306 | /* | 2457 | if (!time_before(jiffies, calc_load_update + 10)) { |
2307 | * If we crossed a calc_load_update boundary, make sure to fold | 2458 | /* |
2308 | * any pending idle changes, the respective CPUs might have | 2459 | * Catch-up, fold however many we are behind still |
2309 | * missed the tick driven calc_load_account_active() update | 2460 | */ |
2310 | * due to NO_HZ. | 2461 | delta = jiffies - calc_load_update - 10; |
2311 | */ | 2462 | n = 1 + (delta / LOAD_FREQ); |
2312 | delta = calc_load_fold_idle(); | ||
2313 | if (delta) | ||
2314 | atomic_long_add(delta, &calc_load_tasks); | ||
2315 | 2463 | ||
2316 | /* | 2464 | active = atomic_long_read(&calc_load_tasks); |
2317 | * It could be the one fold was all it took, we done! | 2465 | active = active > 0 ? active * FIXED_1 : 0; |
2318 | */ | ||
2319 | if (time_before(jiffies, calc_load_update + 10)) | ||
2320 | return; | ||
2321 | 2466 | ||
2322 | /* | 2467 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); |
2323 | * Catch-up, fold however many we are behind still | 2468 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); |
2324 | */ | 2469 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); |
2325 | delta = jiffies - calc_load_update - 10; | ||
2326 | n = 1 + (delta / LOAD_FREQ); | ||
2327 | 2470 | ||
2328 | active = atomic_long_read(&calc_load_tasks); | 2471 | calc_load_update += n * LOAD_FREQ; |
2329 | active = active > 0 ? active * FIXED_1 : 0; | 2472 | } |
2330 | |||
2331 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); | ||
2332 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); | ||
2333 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); | ||
2334 | |||
2335 | calc_load_update += n * LOAD_FREQ; | ||
2336 | } | ||
2337 | #else | ||
2338 | void calc_load_account_idle(struct rq *this_rq) | ||
2339 | { | ||
2340 | } | ||
2341 | 2473 | ||
2342 | static inline long calc_load_fold_idle(void) | 2474 | /* |
2343 | { | 2475 | * Flip the idle index... |
2344 | return 0; | 2476 | * |
2477 | * Make sure we first write the new time then flip the index, so that | ||
2478 | * calc_load_write_idx() will see the new time when it reads the new | ||
2479 | * index, this avoids a double flip messing things up. | ||
2480 | */ | ||
2481 | smp_wmb(); | ||
2482 | calc_load_idx++; | ||
2345 | } | 2483 | } |
2484 | #else /* !CONFIG_NO_HZ */ | ||
2346 | 2485 | ||
2347 | static void calc_global_nohz(void) | 2486 | static inline long calc_load_fold_idle(void) { return 0; } |
2348 | { | 2487 | static inline void calc_global_nohz(void) { } |
2349 | } | ||
2350 | #endif | ||
2351 | 2488 | ||
2352 | /** | 2489 | #endif /* CONFIG_NO_HZ */ |
2353 | * get_avenrun - get the load average array | ||
2354 | * @loads: pointer to dest load array | ||
2355 | * @offset: offset to add | ||
2356 | * @shift: shift count to shift the result left | ||
2357 | * | ||
2358 | * These values are estimates at best, so no need for locking. | ||
2359 | */ | ||
2360 | void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | ||
2361 | { | ||
2362 | loads[0] = (avenrun[0] + offset) << shift; | ||
2363 | loads[1] = (avenrun[1] + offset) << shift; | ||
2364 | loads[2] = (avenrun[2] + offset) << shift; | ||
2365 | } | ||
2366 | 2490 | ||
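The catch-up path in calc_global_nohz() above relies on calc_load_n(), which lies outside this hunk; it folds n missed LOAD_FREQ periods in one step. Iterating the a1 = a0 * e + a * (1 - e) recurrence with a constant 'a' telescopes into a closed form, and e^n is obtained by fixed-point exponentiation by squaring:

	a_1 = a_0 * e + a * (1 - e)
	a_2 = a_1 * e + a * (1 - e)
	    = a_0 * e^2 + a * (1 - e) * (1 + e)
	    = a_0 * e^2 + a * (1 - e^2)
	...
	a_n = a_0 * e^n + a * (1 - e^n)

so calc_load_n(load, exp, active, n) is equivalent to calc_load(load, exp^n, active).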
2367 | /* | 2491 | /* |
2368 | * calc_load - update the avenrun load estimates 10 ticks after the | 2492 | * calc_load - update the avenrun load estimates 10 ticks after the |
@@ -2370,11 +2494,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | |||
2370 | */ | 2494 | */ |
2371 | void calc_global_load(unsigned long ticks) | 2495 | void calc_global_load(unsigned long ticks) |
2372 | { | 2496 | { |
2373 | long active; | 2497 | long active, delta; |
2374 | 2498 | ||
2375 | if (time_before(jiffies, calc_load_update + 10)) | 2499 | if (time_before(jiffies, calc_load_update + 10)) |
2376 | return; | 2500 | return; |
2377 | 2501 | ||
2502 | /* | ||
2503 | * Fold the 'old' idle-delta to include all NO_HZ cpus. | ||
2504 | */ | ||
2505 | delta = calc_load_fold_idle(); | ||
2506 | if (delta) | ||
2507 | atomic_long_add(delta, &calc_load_tasks); | ||
2508 | |||
2378 | active = atomic_long_read(&calc_load_tasks); | 2509 | active = atomic_long_read(&calc_load_tasks); |
2379 | active = active > 0 ? active * FIXED_1 : 0; | 2510 | active = active > 0 ? active * FIXED_1 : 0; |
2380 | 2511 | ||
@@ -2385,12 +2516,7 @@ void calc_global_load(unsigned long ticks) | |||
2385 | calc_load_update += LOAD_FREQ; | 2516 | calc_load_update += LOAD_FREQ; |
2386 | 2517 | ||
2387 | /* | 2518 | /* |
2388 | * Account one period with whatever state we found before | 2519 | * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk. |
2389 | * folding in the nohz state and ageing the entire idle period. | ||
2390 | * | ||
2391 | * This avoids loosing a sample when we go idle between | ||
2392 | * calc_load_account_active() (10 ticks ago) and now and thus | ||
2393 | * under-accounting. | ||
2394 | */ | 2520 | */ |
2395 | calc_global_nohz(); | 2521 | calc_global_nohz(); |
2396 | } | 2522 | } |
@@ -2407,7 +2533,6 @@ static void calc_load_account_active(struct rq *this_rq) | |||
2407 | return; | 2533 | return; |
2408 | 2534 | ||
2409 | delta = calc_load_fold_active(this_rq); | 2535 | delta = calc_load_fold_active(this_rq); |
2410 | delta += calc_load_fold_idle(); | ||
2411 | if (delta) | 2536 | if (delta) |
2412 | atomic_long_add(delta, &calc_load_tasks); | 2537 | atomic_long_add(delta, &calc_load_tasks); |
2413 | 2538 | ||
@@ -2415,6 +2540,10 @@ static void calc_load_account_active(struct rq *this_rq) | |||
2415 | } | 2540 | } |
2416 | 2541 | ||
2417 | /* | 2542 | /* |
2543 | * End of global load-average stuff | ||
2544 | */ | ||
2545 | |||
2546 | /* | ||
2418 | * The exact cpuload at various idx values, calculated at every tick would be | 2547 | * The exact cpuload at various idx values, calculated at every tick would be |
2419 | * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load | 2548 | * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load |
2420 | * | 2549 | * |
@@ -2486,22 +2615,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) | |||
2486 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called | 2615 | * scheduler tick (TICK_NSEC). With tickless idle this will not be called |
2487 | * every tick. We fix it up based on jiffies. | 2616 | * every tick. We fix it up based on jiffies. |
2488 | */ | 2617 | */ |
2489 | void update_cpu_load(struct rq *this_rq) | 2618 | static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, |
2619 | unsigned long pending_updates) | ||
2490 | { | 2620 | { |
2491 | unsigned long this_load = this_rq->load.weight; | ||
2492 | unsigned long curr_jiffies = jiffies; | ||
2493 | unsigned long pending_updates; | ||
2494 | int i, scale; | 2621 | int i, scale; |
2495 | 2622 | ||
2496 | this_rq->nr_load_updates++; | 2623 | this_rq->nr_load_updates++; |
2497 | 2624 | ||
2498 | /* Avoid repeated calls on same jiffy, when moving in and out of idle */ | ||
2499 | if (curr_jiffies == this_rq->last_load_update_tick) | ||
2500 | return; | ||
2501 | |||
2502 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
2503 | this_rq->last_load_update_tick = curr_jiffies; | ||
2504 | |||
2505 | /* Update our load: */ | 2625 | /* Update our load: */ |
2506 | this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ | 2626 | this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ |
2507 | for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { | 2627 | for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { |
@@ -2526,9 +2646,78 @@ void update_cpu_load(struct rq *this_rq) | |||
2526 | sched_avg_update(this_rq); | 2646 | sched_avg_update(this_rq); |
2527 | } | 2647 | } |
2528 | 2648 | ||
2649 | #ifdef CONFIG_NO_HZ | ||
2650 | /* | ||
2651 | * There is no sane way to deal with nohz on smp when using jiffies because the | ||
2652 | * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading | ||
2653 | * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. | ||
2654 | * | ||
2655 | * Therefore we cannot use the delta approach from the regular tick since that | ||
2656 | * would seriously skew the load calculation. However we'll make do for those | ||
2657 | * updates happening while idle (nohz_idle_balance) or coming out of idle | ||
2658 | * (tick_nohz_idle_exit). | ||
2659 | * | ||
2660 | * This means we might still be one tick off for nohz periods. | ||
2661 | */ | ||
2662 | |||
2663 | /* | ||
2664 | * Called from nohz_idle_balance() to update the load ratings before doing the | ||
2665 | * idle balance. | ||
2666 | */ | ||
2667 | void update_idle_cpu_load(struct rq *this_rq) | ||
2668 | { | ||
2669 | unsigned long curr_jiffies = ACCESS_ONCE(jiffies); | ||
2670 | unsigned long load = this_rq->load.weight; | ||
2671 | unsigned long pending_updates; | ||
2672 | |||
2673 | /* | ||
2674 | * bail if there's load or we're actually up-to-date. | ||
2675 | */ | ||
2676 | if (load || curr_jiffies == this_rq->last_load_update_tick) | ||
2677 | return; | ||
2678 | |||
2679 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
2680 | this_rq->last_load_update_tick = curr_jiffies; | ||
2681 | |||
2682 | __update_cpu_load(this_rq, load, pending_updates); | ||
2683 | } | ||
2684 | |||
2685 | /* | ||
2686 | * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. | ||
2687 | */ | ||
2688 | void update_cpu_load_nohz(void) | ||
2689 | { | ||
2690 | struct rq *this_rq = this_rq(); | ||
2691 | unsigned long curr_jiffies = ACCESS_ONCE(jiffies); | ||
2692 | unsigned long pending_updates; | ||
2693 | |||
2694 | if (curr_jiffies == this_rq->last_load_update_tick) | ||
2695 | return; | ||
2696 | |||
2697 | raw_spin_lock(&this_rq->lock); | ||
2698 | pending_updates = curr_jiffies - this_rq->last_load_update_tick; | ||
2699 | if (pending_updates) { | ||
2700 | this_rq->last_load_update_tick = curr_jiffies; | ||
2701 | /* | ||
2702 | * We were idle, this means load 0, the current load might be | ||
2703 | * !0 due to remote wakeups and the sort. | ||
2704 | */ | ||
2705 | __update_cpu_load(this_rq, 0, pending_updates); | ||
2706 | } | ||
2707 | raw_spin_unlock(&this_rq->lock); | ||
2708 | } | ||
2709 | #endif /* CONFIG_NO_HZ */ | ||
2710 | |||
2711 | /* | ||
2712 | * Called from scheduler_tick() | ||
2713 | */ | ||
2529 | static void update_cpu_load_active(struct rq *this_rq) | 2714 | static void update_cpu_load_active(struct rq *this_rq) |
2530 | { | 2715 | { |
2531 | update_cpu_load(this_rq); | 2716 | /* |
2717 | * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). | ||
2718 | */ | ||
2719 | this_rq->last_load_update_tick = jiffies; | ||
2720 | __update_cpu_load(this_rq, this_rq->load.weight, 1); | ||
2532 | 2721 | ||
2533 | calc_load_account_active(this_rq); | 2722 | calc_load_account_active(this_rq); |
2534 | } | 2723 | } |
@@ -3113,6 +3302,7 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
3113 | if (irqs_disabled()) | 3302 | if (irqs_disabled()) |
3114 | print_irqtrace_events(prev); | 3303 | print_irqtrace_events(prev); |
3115 | dump_stack(); | 3304 | dump_stack(); |
3305 | add_taint(TAINT_WARN); | ||
3116 | } | 3306 | } |
3117 | 3307 | ||
3118 | /* | 3308 | /* |
@@ -4042,11 +4232,8 @@ static bool check_same_owner(struct task_struct *p) | |||
4042 | 4232 | ||
4043 | rcu_read_lock(); | 4233 | rcu_read_lock(); |
4044 | pcred = __task_cred(p); | 4234 | pcred = __task_cred(p); |
4045 | if (cred->user->user_ns == pcred->user->user_ns) | 4235 | match = (uid_eq(cred->euid, pcred->euid) || |
4046 | match = (cred->euid == pcred->euid || | 4236 | uid_eq(cred->euid, pcred->uid)); |
4047 | cred->euid == pcred->uid); | ||
4048 | else | ||
4049 | match = false; | ||
4050 | rcu_read_unlock(); | 4237 | rcu_read_unlock(); |
4051 | return match; | 4238 | return match; |
4052 | } | 4239 | } |
@@ -4957,7 +5144,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) | |||
4957 | p->sched_class->set_cpus_allowed(p, new_mask); | 5144 | p->sched_class->set_cpus_allowed(p, new_mask); |
4958 | 5145 | ||
4959 | cpumask_copy(&p->cpus_allowed, new_mask); | 5146 | cpumask_copy(&p->cpus_allowed, new_mask); |
4960 | p->rt.nr_cpus_allowed = cpumask_weight(new_mask); | 5147 | p->nr_cpus_allowed = cpumask_weight(new_mask); |
4961 | } | 5148 | } |
4962 | 5149 | ||
4963 | /* | 5150 | /* |
@@ -5499,15 +5686,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */ | |||
5499 | 5686 | ||
5500 | #ifdef CONFIG_SCHED_DEBUG | 5687 | #ifdef CONFIG_SCHED_DEBUG |
5501 | 5688 | ||
5502 | static __read_mostly int sched_domain_debug_enabled; | 5689 | static __read_mostly int sched_debug_enabled; |
5503 | 5690 | ||
5504 | static int __init sched_domain_debug_setup(char *str) | 5691 | static int __init sched_debug_setup(char *str) |
5505 | { | 5692 | { |
5506 | sched_domain_debug_enabled = 1; | 5693 | sched_debug_enabled = 1; |
5507 | 5694 | ||
5508 | return 0; | 5695 | return 0; |
5509 | } | 5696 | } |
5510 | early_param("sched_debug", sched_domain_debug_setup); | 5697 | early_param("sched_debug", sched_debug_setup); |
5698 | |||
5699 | static inline bool sched_debug(void) | ||
5700 | { | ||
5701 | return sched_debug_enabled; | ||
5702 | } | ||
5511 | 5703 | ||
5512 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | 5704 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, |
5513 | struct cpumask *groupmask) | 5705 | struct cpumask *groupmask) |
@@ -5547,7 +5739,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
5547 | break; | 5739 | break; |
5548 | } | 5740 | } |
5549 | 5741 | ||
5550 | if (!group->sgp->power) { | 5742 | /* |
5743 | * Even though we initialize ->power to something semi-sane, | ||
5744 | * we leave power_orig unset. This allows us to detect if | ||
5745 | * domain iteration is still funny without causing /0 traps. | ||
5746 | */ | ||
5747 | if (!group->sgp->power_orig) { | ||
5551 | printk(KERN_CONT "\n"); | 5748 | printk(KERN_CONT "\n"); |
5552 | printk(KERN_ERR "ERROR: domain->cpu_power not " | 5749 | printk(KERN_ERR "ERROR: domain->cpu_power not " |
5553 | "set\n"); | 5750 | "set\n"); |
@@ -5560,7 +5757,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
5560 | break; | 5757 | break; |
5561 | } | 5758 | } |
5562 | 5759 | ||
5563 | if (cpumask_intersects(groupmask, sched_group_cpus(group))) { | 5760 | if (!(sd->flags & SD_OVERLAP) && |
5761 | cpumask_intersects(groupmask, sched_group_cpus(group))) { | ||
5564 | printk(KERN_CONT "\n"); | 5762 | printk(KERN_CONT "\n"); |
5565 | printk(KERN_ERR "ERROR: repeated CPUs\n"); | 5763 | printk(KERN_ERR "ERROR: repeated CPUs\n"); |
5566 | break; | 5764 | break; |
@@ -5594,7 +5792,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
5594 | { | 5792 | { |
5595 | int level = 0; | 5793 | int level = 0; |
5596 | 5794 | ||
5597 | if (!sched_domain_debug_enabled) | 5795 | if (!sched_debug_enabled) |
5598 | return; | 5796 | return; |
5599 | 5797 | ||
5600 | if (!sd) { | 5798 | if (!sd) { |
@@ -5615,6 +5813,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) | |||
5615 | } | 5813 | } |
5616 | #else /* !CONFIG_SCHED_DEBUG */ | 5814 | #else /* !CONFIG_SCHED_DEBUG */ |
5617 | # define sched_domain_debug(sd, cpu) do { } while (0) | 5815 | # define sched_domain_debug(sd, cpu) do { } while (0) |
5816 | static inline bool sched_debug(void) | ||
5817 | { | ||
5818 | return false; | ||
5819 | } | ||
5618 | #endif /* CONFIG_SCHED_DEBUG */ | 5820 | #endif /* CONFIG_SCHED_DEBUG */ |
5619 | 5821 | ||
5620 | static int sd_degenerate(struct sched_domain *sd) | 5822 | static int sd_degenerate(struct sched_domain *sd) |
@@ -5898,99 +6100,11 @@ static int __init isolated_cpu_setup(char *str) | |||
5898 | 6100 | ||
5899 | __setup("isolcpus=", isolated_cpu_setup); | 6101 | __setup("isolcpus=", isolated_cpu_setup); |
5900 | 6102 | ||
5901 | #ifdef CONFIG_NUMA | ||
5902 | |||
5903 | /** | ||
5904 | * find_next_best_node - find the next node to include in a sched_domain | ||
5905 | * @node: node whose sched_domain we're building | ||
5906 | * @used_nodes: nodes already in the sched_domain | ||
5907 | * | ||
5908 | * Find the next node to include in a given scheduling domain. Simply | ||
5909 | * finds the closest node not already in the @used_nodes map. | ||
5910 | * | ||
5911 | * Should use nodemask_t. | ||
5912 | */ | ||
5913 | static int find_next_best_node(int node, nodemask_t *used_nodes) | ||
5914 | { | ||
5915 | int i, n, val, min_val, best_node = -1; | ||
5916 | |||
5917 | min_val = INT_MAX; | ||
5918 | |||
5919 | for (i = 0; i < nr_node_ids; i++) { | ||
5920 | /* Start at @node */ | ||
5921 | n = (node + i) % nr_node_ids; | ||
5922 | |||
5923 | if (!nr_cpus_node(n)) | ||
5924 | continue; | ||
5925 | |||
5926 | /* Skip already used nodes */ | ||
5927 | if (node_isset(n, *used_nodes)) | ||
5928 | continue; | ||
5929 | |||
5930 | /* Simple min distance search */ | ||
5931 | val = node_distance(node, n); | ||
5932 | |||
5933 | if (val < min_val) { | ||
5934 | min_val = val; | ||
5935 | best_node = n; | ||
5936 | } | ||
5937 | } | ||
5938 | |||
5939 | if (best_node != -1) | ||
5940 | node_set(best_node, *used_nodes); | ||
5941 | return best_node; | ||
5942 | } | ||
5943 | |||
5944 | /** | ||
5945 | * sched_domain_node_span - get a cpumask for a node's sched_domain | ||
5946 | * @node: node whose cpumask we're constructing | ||
5947 | * @span: resulting cpumask | ||
5948 | * | ||
5949 | * Given a node, construct a good cpumask for its sched_domain to span. It | ||
5950 | * should be one that prevents unnecessary balancing, but also spreads tasks | ||
5951 | * out optimally. | ||
5952 | */ | ||
5953 | static void sched_domain_node_span(int node, struct cpumask *span) | ||
5954 | { | ||
5955 | nodemask_t used_nodes; | ||
5956 | int i; | ||
5957 | |||
5958 | cpumask_clear(span); | ||
5959 | nodes_clear(used_nodes); | ||
5960 | |||
5961 | cpumask_or(span, span, cpumask_of_node(node)); | ||
5962 | node_set(node, used_nodes); | ||
5963 | |||
5964 | for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { | ||
5965 | int next_node = find_next_best_node(node, &used_nodes); | ||
5966 | if (next_node < 0) | ||
5967 | break; | ||
5968 | cpumask_or(span, span, cpumask_of_node(next_node)); | ||
5969 | } | ||
5970 | } | ||
5971 | |||
5972 | static const struct cpumask *cpu_node_mask(int cpu) | ||
5973 | { | ||
5974 | lockdep_assert_held(&sched_domains_mutex); | ||
5975 | |||
5976 | sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask); | ||
5977 | |||
5978 | return sched_domains_tmpmask; | ||
5979 | } | ||
5980 | |||
5981 | static const struct cpumask *cpu_allnodes_mask(int cpu) | ||
5982 | { | ||
5983 | return cpu_possible_mask; | ||
5984 | } | ||
5985 | #endif /* CONFIG_NUMA */ | ||
5986 | |||
5987 | static const struct cpumask *cpu_cpu_mask(int cpu) | 6103 | static const struct cpumask *cpu_cpu_mask(int cpu) |
5988 | { | 6104 | { |
5989 | return cpumask_of_node(cpu_to_node(cpu)); | 6105 | return cpumask_of_node(cpu_to_node(cpu)); |
5990 | } | 6106 | } |
5991 | 6107 | ||
5992 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | ||
5993 | |||
5994 | struct sd_data { | 6108 | struct sd_data { |
5995 | struct sched_domain **__percpu sd; | 6109 | struct sched_domain **__percpu sd; |
5996 | struct sched_group **__percpu sg; | 6110 | struct sched_group **__percpu sg; |
@@ -6020,9 +6134,48 @@ struct sched_domain_topology_level { | |||
6020 | sched_domain_init_f init; | 6134 | sched_domain_init_f init; |
6021 | sched_domain_mask_f mask; | 6135 | sched_domain_mask_f mask; |
6022 | int flags; | 6136 | int flags; |
6137 | int numa_level; | ||
6023 | struct sd_data data; | 6138 | struct sd_data data; |
6024 | }; | 6139 | }; |
6025 | 6140 | ||
6141 | /* | ||
6142 | * Build an iteration mask that can exclude certain CPUs from the upwards | ||
6143 | * domain traversal. | ||
6144 | * | ||
6145 | * Asymmetric node setups can result in situations where the domain tree is of | ||
6146 | * unequal depth, make sure to skip domains that already cover the entire | ||
6147 | * range. | ||
6148 | * | ||
6149 | * In that case build_sched_domains() will have terminated the iteration early | ||
6150 | * and our sibling sd spans will be empty. Domains should always include the | ||
6151 | * cpu they're built on, so check that. | ||
6152 | * | ||
6153 | */ | ||
6154 | static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) | ||
6155 | { | ||
6156 | const struct cpumask *span = sched_domain_span(sd); | ||
6157 | struct sd_data *sdd = sd->private; | ||
6158 | struct sched_domain *sibling; | ||
6159 | int i; | ||
6160 | |||
6161 | for_each_cpu(i, span) { | ||
6162 | sibling = *per_cpu_ptr(sdd->sd, i); | ||
6163 | if (!cpumask_test_cpu(i, sched_domain_span(sibling))) | ||
6164 | continue; | ||
6165 | |||
6166 | cpumask_set_cpu(i, sched_group_mask(sg)); | ||
6167 | } | ||
6168 | } | ||
6169 | |||
6170 | /* | ||
6171 | * Return the canonical balance cpu for this group, this is the first cpu | ||
6172 | * of this group that's also in the iteration mask. | ||
6173 | */ | ||
6174 | int group_balance_cpu(struct sched_group *sg) | ||
6175 | { | ||
6176 | return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); | ||
6177 | } | ||
6178 | |||
6026 | static int | 6179 | static int |
6027 | build_overlap_sched_groups(struct sched_domain *sd, int cpu) | 6180 | build_overlap_sched_groups(struct sched_domain *sd, int cpu) |
6028 | { | 6181 | { |
@@ -6041,6 +6194,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
6041 | if (cpumask_test_cpu(i, covered)) | 6194 | if (cpumask_test_cpu(i, covered)) |
6042 | continue; | 6195 | continue; |
6043 | 6196 | ||
6197 | child = *per_cpu_ptr(sdd->sd, i); | ||
6198 | |||
6199 | /* See the comment near build_group_mask(). */ | ||
6200 | if (!cpumask_test_cpu(i, sched_domain_span(child))) | ||
6201 | continue; | ||
6202 | |||
6044 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), | 6203 | sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), |
6045 | GFP_KERNEL, cpu_to_node(cpu)); | 6204 | GFP_KERNEL, cpu_to_node(cpu)); |
6046 | 6205 | ||
@@ -6048,8 +6207,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
6048 | goto fail; | 6207 | goto fail; |
6049 | 6208 | ||
6050 | sg_span = sched_group_cpus(sg); | 6209 | sg_span = sched_group_cpus(sg); |
6051 | |||
6052 | child = *per_cpu_ptr(sdd->sd, i); | ||
6053 | if (child->child) { | 6210 | if (child->child) { |
6054 | child = child->child; | 6211 | child = child->child; |
6055 | cpumask_copy(sg_span, sched_domain_span(child)); | 6212 | cpumask_copy(sg_span, sched_domain_span(child)); |
@@ -6058,10 +6215,24 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) | |||
6058 | 6215 | ||
6059 | cpumask_or(covered, covered, sg_span); | 6216 | cpumask_or(covered, covered, sg_span); |
6060 | 6217 | ||
6061 | sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); | 6218 | sg->sgp = *per_cpu_ptr(sdd->sgp, i); |
6062 | atomic_inc(&sg->sgp->ref); | 6219 | if (atomic_inc_return(&sg->sgp->ref) == 1) |
6220 | build_group_mask(sd, sg); | ||
6063 | 6221 | ||
6064 | if (cpumask_test_cpu(cpu, sg_span)) | 6222 | /* |
6223 | * Initialize sgp->power such that even if we mess up the | ||
6224 | * domains and no possible iteration will get us here, we won't | ||
6225 | * die on a /0 trap. | ||
6226 | */ | ||
6227 | sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span); | ||
6228 | |||
6229 | /* | ||
6230 | * Make sure the first group of this domain contains the | ||
6231 | * canonical balance cpu. Otherwise the sched_domain iteration | ||
6232 | * breaks. See update_sg_lb_stats(). | ||
6233 | */ | ||
6234 | if ((!groups && cpumask_test_cpu(cpu, sg_span)) || | ||
6235 | group_balance_cpu(sg) == cpu) | ||
6065 | groups = sg; | 6236 | groups = sg; |
6066 | 6237 | ||
6067 | if (!first) | 6238 | if (!first) |
@@ -6135,6 +6306,7 @@ build_sched_groups(struct sched_domain *sd, int cpu) | |||
6135 | 6306 | ||
6136 | cpumask_clear(sched_group_cpus(sg)); | 6307 | cpumask_clear(sched_group_cpus(sg)); |
6137 | sg->sgp->power = 0; | 6308 | sg->sgp->power = 0; |
6309 | cpumask_setall(sched_group_mask(sg)); | ||
6138 | 6310 | ||
6139 | for_each_cpu(j, span) { | 6311 | for_each_cpu(j, span) { |
6140 | if (get_group(j, sdd, NULL) != group) | 6312 | if (get_group(j, sdd, NULL) != group) |
@@ -6176,7 +6348,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
6176 | sg = sg->next; | 6348 | sg = sg->next; |
6177 | } while (sg != sd->groups); | 6349 | } while (sg != sd->groups); |
6178 | 6350 | ||
6179 | if (cpu != group_first_cpu(sg)) | 6351 | if (cpu != group_balance_cpu(sg)) |
6180 | return; | 6352 | return; |
6181 | 6353 | ||
6182 | update_group_power(sd, cpu); | 6354 | update_group_power(sd, cpu); |
@@ -6211,10 +6383,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \ | |||
6211 | } | 6383 | } |
6212 | 6384 | ||
6213 | SD_INIT_FUNC(CPU) | 6385 | SD_INIT_FUNC(CPU) |
6214 | #ifdef CONFIG_NUMA | ||
6215 | SD_INIT_FUNC(ALLNODES) | ||
6216 | SD_INIT_FUNC(NODE) | ||
6217 | #endif | ||
6218 | #ifdef CONFIG_SCHED_SMT | 6386 | #ifdef CONFIG_SCHED_SMT |
6219 | SD_INIT_FUNC(SIBLING) | 6387 | SD_INIT_FUNC(SIBLING) |
6220 | #endif | 6388 | #endif |
@@ -6230,11 +6398,8 @@ int sched_domain_level_max; | |||
6230 | 6398 | ||
6231 | static int __init setup_relax_domain_level(char *str) | 6399 | static int __init setup_relax_domain_level(char *str) |
6232 | { | 6400 | { |
6233 | unsigned long val; | 6401 | if (kstrtoint(str, 0, &default_relax_domain_level)) |
6234 | 6402 | pr_warn("Unable to set relax_domain_level\n"); | |
6235 | val = simple_strtoul(str, NULL, 0); | ||
6236 | if (val < sched_domain_level_max) | ||
6237 | default_relax_domain_level = val; | ||
6238 | 6403 | ||
6239 | return 1; | 6404 | return 1; |
6240 | } | 6405 | } |
@@ -6336,15 +6501,236 @@ static struct sched_domain_topology_level default_topology[] = { | |||
6336 | { sd_init_BOOK, cpu_book_mask, }, | 6501 | { sd_init_BOOK, cpu_book_mask, }, |
6337 | #endif | 6502 | #endif |
6338 | { sd_init_CPU, cpu_cpu_mask, }, | 6503 | { sd_init_CPU, cpu_cpu_mask, }, |
6339 | #ifdef CONFIG_NUMA | ||
6340 | { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, }, | ||
6341 | { sd_init_ALLNODES, cpu_allnodes_mask, }, | ||
6342 | #endif | ||
6343 | { NULL, }, | 6504 | { NULL, }, |
6344 | }; | 6505 | }; |
6345 | 6506 | ||
6346 | static struct sched_domain_topology_level *sched_domain_topology = default_topology; | 6507 | static struct sched_domain_topology_level *sched_domain_topology = default_topology; |
6347 | 6508 | ||
6509 | #ifdef CONFIG_NUMA | ||
6510 | |||
6511 | static int sched_domains_numa_levels; | ||
6512 | static int *sched_domains_numa_distance; | ||
6513 | static struct cpumask ***sched_domains_numa_masks; | ||
6514 | static int sched_domains_curr_level; | ||
6515 | |||
6516 | static inline int sd_local_flags(int level) | ||
6517 | { | ||
6518 | if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE) | ||
6519 | return 0; | ||
6520 | |||
6521 | return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; | ||
6522 | } | ||
6523 | |||
6524 | static struct sched_domain * | ||
6525 | sd_numa_init(struct sched_domain_topology_level *tl, int cpu) | ||
6526 | { | ||
6527 | struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); | ||
6528 | int level = tl->numa_level; | ||
6529 | int sd_weight = cpumask_weight( | ||
6530 | sched_domains_numa_masks[level][cpu_to_node(cpu)]); | ||
6531 | |||
6532 | *sd = (struct sched_domain){ | ||
6533 | .min_interval = sd_weight, | ||
6534 | .max_interval = 2*sd_weight, | ||
6535 | .busy_factor = 32, | ||
6536 | .imbalance_pct = 125, | ||
6537 | .cache_nice_tries = 2, | ||
6538 | .busy_idx = 3, | ||
6539 | .idle_idx = 2, | ||
6540 | .newidle_idx = 0, | ||
6541 | .wake_idx = 0, | ||
6542 | .forkexec_idx = 0, | ||
6543 | |||
6544 | .flags = 1*SD_LOAD_BALANCE | ||
6545 | | 1*SD_BALANCE_NEWIDLE | ||
6546 | | 0*SD_BALANCE_EXEC | ||
6547 | | 0*SD_BALANCE_FORK | ||
6548 | | 0*SD_BALANCE_WAKE | ||
6549 | | 0*SD_WAKE_AFFINE | ||
6550 | | 0*SD_PREFER_LOCAL | ||
6551 | | 0*SD_SHARE_CPUPOWER | ||
6552 | | 0*SD_SHARE_PKG_RESOURCES | ||
6553 | | 1*SD_SERIALIZE | ||
6554 | | 0*SD_PREFER_SIBLING | ||
6555 | | sd_local_flags(level) | ||
6556 | , | ||
6557 | .last_balance = jiffies, | ||
6558 | .balance_interval = sd_weight, | ||
6559 | }; | ||
6560 | SD_INIT_NAME(sd, NUMA); | ||
6561 | sd->private = &tl->data; | ||
6562 | |||
6563 | /* | ||
6564 | * Ugly hack to pass state to sd_numa_mask()... | ||
6565 | */ | ||
6566 | sched_domains_curr_level = tl->numa_level; | ||
6567 | |||
6568 | return sd; | ||
6569 | } | ||
6570 | |||
6571 | static const struct cpumask *sd_numa_mask(int cpu) | ||
6572 | { | ||
6573 | return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; | ||
6574 | } | ||
6575 | |||
6576 | static void sched_numa_warn(const char *str) | ||
6577 | { | ||
6578 | static int done = false; | ||
6579 | int i,j; | ||
6580 | |||
6581 | if (done) | ||
6582 | return; | ||
6583 | |||
6584 | done = true; | ||
6585 | |||
6586 | printk(KERN_WARNING "ERROR: %s\n\n", str); | ||
6587 | |||
6588 | for (i = 0; i < nr_node_ids; i++) { | ||
6589 | printk(KERN_WARNING " "); | ||
6590 | for (j = 0; j < nr_node_ids; j++) | ||
6591 | printk(KERN_CONT "%02d ", node_distance(i,j)); | ||
6592 | printk(KERN_CONT "\n"); | ||
6593 | } | ||
6594 | printk(KERN_WARNING "\n"); | ||
6595 | } | ||
6596 | |||
6597 | static bool find_numa_distance(int distance) | ||
6598 | { | ||
6599 | int i; | ||
6600 | |||
6601 | if (distance == node_distance(0, 0)) | ||
6602 | return true; | ||
6603 | |||
6604 | for (i = 0; i < sched_domains_numa_levels; i++) { | ||
6605 | if (sched_domains_numa_distance[i] == distance) | ||
6606 | return true; | ||
6607 | } | ||
6608 | |||
6609 | return false; | ||
6610 | } | ||
6611 | |||
6612 | static void sched_init_numa(void) | ||
6613 | { | ||
6614 | int next_distance, curr_distance = node_distance(0, 0); | ||
6615 | struct sched_domain_topology_level *tl; | ||
6616 | int level = 0; | ||
6617 | int i, j, k; | ||
6618 | |||
6619 | sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); | ||
6620 | if (!sched_domains_numa_distance) | ||
6621 | return; | ||
6622 | |||
6623 | /* | ||
6624 | * O(nr_nodes^2) deduplicating selection sort -- in order to find the | ||
6625 | * unique distances in the node_distance() table. | ||
6626 | * | ||
6627 | * Assumes node_distance(0,j) includes all distances in | ||
6628 | * node_distance(i,j) in order to avoid cubic time. | ||
6629 | */ | ||
6630 | next_distance = curr_distance; | ||
6631 | for (i = 0; i < nr_node_ids; i++) { | ||
6632 | for (j = 0; j < nr_node_ids; j++) { | ||
6633 | for (k = 0; k < nr_node_ids; k++) { | ||
6634 | int distance = node_distance(i, k); | ||
6635 | |||
6636 | if (distance > curr_distance && | ||
6637 | (distance < next_distance || | ||
6638 | next_distance == curr_distance)) | ||
6639 | next_distance = distance; | ||
6640 | |||
6641 | /* | ||
6642 | * While not a strong assumption it would be nice to know | ||
6643 | * about cases where if node A is connected to B, B is not | ||
6644 | * equally connected to A. | ||
6645 | */ | ||
6646 | if (sched_debug() && node_distance(k, i) != distance) | ||
6647 | sched_numa_warn("Node-distance not symmetric"); | ||
6648 | |||
6649 | if (sched_debug() && i && !find_numa_distance(distance)) | ||
6650 | sched_numa_warn("Node-0 not representative"); | ||
6651 | } | ||
6652 | if (next_distance != curr_distance) { | ||
6653 | sched_domains_numa_distance[level++] = next_distance; | ||
6654 | sched_domains_numa_levels = level; | ||
6655 | curr_distance = next_distance; | ||
6656 | } else break; | ||
6657 | } | ||
6658 | |||
6659 | /* | ||
6660 | * In case of sched_debug() we verify the above assumption. | ||
6661 | */ | ||
6662 | if (!sched_debug()) | ||
6663 | break; | ||
6664 | } | ||
6665 | /* | ||
6666 | * 'level' contains the number of unique distances, excluding the | ||
6667 | * identity distance node_distance(i,i). | ||
6668 | * | ||
6669 | * The sched_domains_numa_distance[] array includes the actual distance | ||
6670 | * numbers. | ||
6671 | */ | ||
6672 | |||
6673 | sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL); | ||
6674 | if (!sched_domains_numa_masks) | ||
6675 | return; | ||
6676 | |||
6677 | /* | ||
6678 | * Now for each level, construct a mask per node which contains all | ||
6679 | * cpus of nodes that are that many hops away from us. | ||
6680 | */ | ||
6681 | for (i = 0; i < level; i++) { | ||
6682 | sched_domains_numa_masks[i] = | ||
6683 | kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL); | ||
6684 | if (!sched_domains_numa_masks[i]) | ||
6685 | return; | ||
6686 | |||
6687 | for (j = 0; j < nr_node_ids; j++) { | ||
6688 | struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL); | ||
6689 | if (!mask) | ||
6690 | return; | ||
6691 | |||
6692 | sched_domains_numa_masks[i][j] = mask; | ||
6693 | |||
6694 | for (k = 0; k < nr_node_ids; k++) { | ||
6695 | if (node_distance(j, k) > sched_domains_numa_distance[i]) | ||
6696 | continue; | ||
6697 | |||
6698 | cpumask_or(mask, mask, cpumask_of_node(k)); | ||
6699 | } | ||
6700 | } | ||
6701 | } | ||
6702 | |||
6703 | tl = kzalloc((ARRAY_SIZE(default_topology) + level) * | ||
6704 | sizeof(struct sched_domain_topology_level), GFP_KERNEL); | ||
6705 | if (!tl) | ||
6706 | return; | ||
6707 | |||
6708 | /* | ||
6709 | * Copy the default topology bits.. | ||
6710 | */ | ||
6711 | for (i = 0; default_topology[i].init; i++) | ||
6712 | tl[i] = default_topology[i]; | ||
6713 | |||
6714 | /* | ||
6715 | * .. and append 'j' levels of NUMA goodness. | ||
6716 | */ | ||
6717 | for (j = 0; j < level; i++, j++) { | ||
6718 | tl[i] = (struct sched_domain_topology_level){ | ||
6719 | .init = sd_numa_init, | ||
6720 | .mask = sd_numa_mask, | ||
6721 | .flags = SDTL_OVERLAP, | ||
6722 | .numa_level = j, | ||
6723 | }; | ||
6724 | } | ||
6725 | |||
6726 | sched_domain_topology = tl; | ||
6727 | } | ||
6728 | #else | ||
6729 | static inline void sched_init_numa(void) | ||
6730 | { | ||
6731 | } | ||
6732 | #endif /* CONFIG_NUMA */ | ||
6733 | |||
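To illustrate the distance-deduplication pass in sched_init_numa() above, here is a small user-space sketch run against an invented 4-node distance table (the table and program are illustrative only; just the scanning idea mirrors the patch):

	#include <stdio.h>
	#include <limits.h>

	#define NR_NODES 4

	/* Hypothetical node_distance() table: 10 = local, 20 = one hop, 30 = two hops */
	static const int dist[NR_NODES][NR_NODES] = {
		{ 10, 20, 20, 30 },
		{ 20, 10, 30, 20 },
		{ 20, 30, 10, 20 },
		{ 30, 20, 20, 10 },
	};

	int main(void)
	{
		int curr = dist[0][0];		/* identity distance, excluded */
		int levels[NR_NODES * NR_NODES];
		int nlevels = 0;

		for (;;) {
			int next = INT_MAX;

			/* smallest distance strictly greater than the last level found */
			for (int i = 0; i < NR_NODES; i++)
				for (int k = 0; k < NR_NODES; k++)
					if (dist[i][k] > curr && dist[i][k] < next)
						next = dist[i][k];

			if (next == INT_MAX)
				break;

			levels[nlevels++] = next;
			curr = next;
		}

		for (int i = 0; i < nlevels; i++)
			printf("NUMA level %d: distance %d\n", i, levels[i]);

		return 0;
	}

This prints two NUMA levels (distances 20 and 30) on top of the local distance 10; the kernel then builds, per level and per node, a cpumask containing every node within that distance and appends one SDTL_OVERLAP topology level per entry.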
6348 | static int __sdt_alloc(const struct cpumask *cpu_map) | 6734 | static int __sdt_alloc(const struct cpumask *cpu_map) |
6349 | { | 6735 | { |
6350 | struct sched_domain_topology_level *tl; | 6736 | struct sched_domain_topology_level *tl; |
@@ -6382,9 +6768,11 @@ static int __sdt_alloc(const struct cpumask *cpu_map) | |||
6382 | if (!sg) | 6768 | if (!sg) |
6383 | return -ENOMEM; | 6769 | return -ENOMEM; |
6384 | 6770 | ||
6771 | sg->next = sg; | ||
6772 | |||
6385 | *per_cpu_ptr(sdd->sg, j) = sg; | 6773 | *per_cpu_ptr(sdd->sg, j) = sg; |
6386 | 6774 | ||
6387 | sgp = kzalloc_node(sizeof(struct sched_group_power), | 6775 | sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), |
6388 | GFP_KERNEL, cpu_to_node(j)); | 6776 | GFP_KERNEL, cpu_to_node(j)); |
6389 | if (!sgp) | 6777 | if (!sgp) |
6390 | return -ENOMEM; | 6778 | return -ENOMEM; |
@@ -6437,7 +6825,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | |||
6437 | if (!sd) | 6825 | if (!sd) |
6438 | return child; | 6826 | return child; |
6439 | 6827 | ||
6440 | set_domain_attribute(sd, attr); | ||
6441 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); | 6828 | cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); |
6442 | if (child) { | 6829 | if (child) { |
6443 | sd->level = child->level + 1; | 6830 | sd->level = child->level + 1; |
@@ -6445,6 +6832,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, | |||
6445 | child->parent = sd; | 6832 | child->parent = sd; |
6446 | } | 6833 | } |
6447 | sd->child = child; | 6834 | sd->child = child; |
6835 | set_domain_attribute(sd, attr); | ||
6448 | 6836 | ||
6449 | return sd; | 6837 | return sd; |
6450 | } | 6838 | } |
@@ -6585,7 +6973,6 @@ static int init_sched_domains(const struct cpumask *cpu_map) | |||
6585 | if (!doms_cur) | 6973 | if (!doms_cur) |
6586 | doms_cur = &fallback_doms; | 6974 | doms_cur = &fallback_doms; |
6587 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); | 6975 | cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); |
6588 | dattr_cur = NULL; | ||
6589 | err = build_sched_domains(doms_cur[0], NULL); | 6976 | err = build_sched_domains(doms_cur[0], NULL); |
6590 | register_sched_domain_sysctl(); | 6977 | register_sched_domain_sysctl(); |
6591 | 6978 | ||
@@ -6710,97 +7097,6 @@ match2: | |||
6710 | mutex_unlock(&sched_domains_mutex); | 7097 | mutex_unlock(&sched_domains_mutex); |
6711 | } | 7098 | } |
6712 | 7099 | ||
6713 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
6714 | static void reinit_sched_domains(void) | ||
6715 | { | ||
6716 | get_online_cpus(); | ||
6717 | |||
6718 | /* Destroy domains first to force the rebuild */ | ||
6719 | partition_sched_domains(0, NULL, NULL); | ||
6720 | |||
6721 | rebuild_sched_domains(); | ||
6722 | put_online_cpus(); | ||
6723 | } | ||
6724 | |||
6725 | static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | ||
6726 | { | ||
6727 | unsigned int level = 0; | ||
6728 | |||
6729 | if (sscanf(buf, "%u", &level) != 1) | ||
6730 | return -EINVAL; | ||
6731 | |||
6732 | /* | ||
6733 | * level is always be positive so don't check for | ||
6734 | * level < POWERSAVINGS_BALANCE_NONE which is 0 | ||
6735 | * What happens on 0 or 1 byte write, | ||
6736 | * need to check for count as well? | ||
6737 | */ | ||
6738 | |||
6739 | if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS) | ||
6740 | return -EINVAL; | ||
6741 | |||
6742 | if (smt) | ||
6743 | sched_smt_power_savings = level; | ||
6744 | else | ||
6745 | sched_mc_power_savings = level; | ||
6746 | |||
6747 | reinit_sched_domains(); | ||
6748 | |||
6749 | return count; | ||
6750 | } | ||
6751 | |||
6752 | #ifdef CONFIG_SCHED_MC | ||
6753 | static ssize_t sched_mc_power_savings_show(struct device *dev, | ||
6754 | struct device_attribute *attr, | ||
6755 | char *buf) | ||
6756 | { | ||
6757 | return sprintf(buf, "%u\n", sched_mc_power_savings); | ||
6758 | } | ||
6759 | static ssize_t sched_mc_power_savings_store(struct device *dev, | ||
6760 | struct device_attribute *attr, | ||
6761 | const char *buf, size_t count) | ||
6762 | { | ||
6763 | return sched_power_savings_store(buf, count, 0); | ||
6764 | } | ||
6765 | static DEVICE_ATTR(sched_mc_power_savings, 0644, | ||
6766 | sched_mc_power_savings_show, | ||
6767 | sched_mc_power_savings_store); | ||
6768 | #endif | ||
6769 | |||
6770 | #ifdef CONFIG_SCHED_SMT | ||
6771 | static ssize_t sched_smt_power_savings_show(struct device *dev, | ||
6772 | struct device_attribute *attr, | ||
6773 | char *buf) | ||
6774 | { | ||
6775 | return sprintf(buf, "%u\n", sched_smt_power_savings); | ||
6776 | } | ||
6777 | static ssize_t sched_smt_power_savings_store(struct device *dev, | ||
6778 | struct device_attribute *attr, | ||
6779 | const char *buf, size_t count) | ||
6780 | { | ||
6781 | return sched_power_savings_store(buf, count, 1); | ||
6782 | } | ||
6783 | static DEVICE_ATTR(sched_smt_power_savings, 0644, | ||
6784 | sched_smt_power_savings_show, | ||
6785 | sched_smt_power_savings_store); | ||
6786 | #endif | ||
6787 | |||
6788 | int __init sched_create_sysfs_power_savings_entries(struct device *dev) | ||
6789 | { | ||
6790 | int err = 0; | ||
6791 | |||
6792 | #ifdef CONFIG_SCHED_SMT | ||
6793 | if (smt_capable()) | ||
6794 | err = device_create_file(dev, &dev_attr_sched_smt_power_savings); | ||
6795 | #endif | ||
6796 | #ifdef CONFIG_SCHED_MC | ||
6797 | if (!err && mc_capable()) | ||
6798 | err = device_create_file(dev, &dev_attr_sched_mc_power_savings); | ||
6799 | #endif | ||
6800 | return err; | ||
6801 | } | ||
6802 | #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ | ||
6803 | |||
6804 | /* | 7100 | /* |
6805 | * Update cpusets according to cpu_active mask. If cpusets are | 7101 | * Update cpusets according to cpu_active mask. If cpusets are |
6806 | * disabled, cpuset_update_active_cpus() becomes a simple wrapper | 7102 | * disabled, cpuset_update_active_cpus() becomes a simple wrapper |
@@ -6838,6 +7134,8 @@ void __init sched_init_smp(void) | |||
6838 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); | 7134 | alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); |
6839 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); | 7135 | alloc_cpumask_var(&fallback_doms, GFP_KERNEL); |
6840 | 7136 | ||
7137 | sched_init_numa(); | ||
7138 | |||
6841 | get_online_cpus(); | 7139 | get_online_cpus(); |
6842 | mutex_lock(&sched_domains_mutex); | 7140 | mutex_lock(&sched_domains_mutex); |
6843 | init_sched_domains(cpu_active_mask); | 7141 | init_sched_domains(cpu_active_mask); |
@@ -7059,6 +7357,7 @@ void __init sched_init(void) | |||
7059 | /* May be allocated at isolcpus cmdline parse time */ | 7357 | /* May be allocated at isolcpus cmdline parse time */ |
7060 | if (cpu_isolated_map == NULL) | 7358 | if (cpu_isolated_map == NULL) |
7061 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | 7359 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); |
7360 | idle_thread_set_boot_cpu(); | ||
7062 | #endif | 7361 | #endif |
7063 | init_sched_fair_class(); | 7362 | init_sched_fair_class(); |
7064 | 7363 | ||
@@ -7980,13 +8279,9 @@ static struct cftype cpu_files[] = { | |||
7980 | .write_u64 = cpu_rt_period_write_uint, | 8279 | .write_u64 = cpu_rt_period_write_uint, |
7981 | }, | 8280 | }, |
7982 | #endif | 8281 | #endif |
8282 | { } /* terminate */ | ||
7983 | }; | 8283 | }; |
7984 | 8284 | ||
7985 | static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) | ||
7986 | { | ||
7987 | return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); | ||
7988 | } | ||
7989 | |||
7990 | struct cgroup_subsys cpu_cgroup_subsys = { | 8285 | struct cgroup_subsys cpu_cgroup_subsys = { |
7991 | .name = "cpu", | 8286 | .name = "cpu", |
7992 | .create = cpu_cgroup_create, | 8287 | .create = cpu_cgroup_create, |
@@ -7994,8 +8289,8 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
7994 | .can_attach = cpu_cgroup_can_attach, | 8289 | .can_attach = cpu_cgroup_can_attach, |
7995 | .attach = cpu_cgroup_attach, | 8290 | .attach = cpu_cgroup_attach, |
7996 | .exit = cpu_cgroup_exit, | 8291 | .exit = cpu_cgroup_exit, |
7997 | .populate = cpu_cgroup_populate, | ||
7998 | .subsys_id = cpu_cgroup_subsys_id, | 8292 | .subsys_id = cpu_cgroup_subsys_id, |
8293 | .base_cftypes = cpu_files, | ||
7999 | .early_init = 1, | 8294 | .early_init = 1, |
8000 | }; | 8295 | }; |
8001 | 8296 | ||
@@ -8180,13 +8475,9 @@ static struct cftype files[] = { | |||
8180 | .name = "stat", | 8475 | .name = "stat", |
8181 | .read_map = cpuacct_stats_show, | 8476 | .read_map = cpuacct_stats_show, |
8182 | }, | 8477 | }, |
8478 | { } /* terminate */ | ||
8183 | }; | 8479 | }; |
8184 | 8480 | ||
8185 | static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp) | ||
8186 | { | ||
8187 | return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files)); | ||
8188 | } | ||
8189 | |||
8190 | /* | 8481 | /* |
8191 | * charge this task's execution time to its accounting group. | 8482 | * charge this task's execution time to its accounting group. |
8192 | * | 8483 | * |
@@ -8218,7 +8509,7 @@ struct cgroup_subsys cpuacct_subsys = { | |||
8218 | .name = "cpuacct", | 8509 | .name = "cpuacct", |
8219 | .create = cpuacct_create, | 8510 | .create = cpuacct_create, |
8220 | .destroy = cpuacct_destroy, | 8511 | .destroy = cpuacct_destroy, |
8221 | .populate = cpuacct_populate, | ||
8222 | .subsys_id = cpuacct_subsys_id, | 8512 | .subsys_id = cpuacct_subsys_id, |
8513 | .base_cftypes = files, | ||
8223 | }; | 8514 | }; |
8224 | #endif /* CONFIG_CGROUP_CPUACCT */ | 8515 | #endif /* CONFIG_CGROUP_CPUACCT */ |