Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--  kernel/sched/core.c | 919
1 file changed, 605 insertions(+), 314 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0533a688ce22..468bdd44c1ba 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -83,6 +83,7 @@
83 83
84#include "sched.h" 84#include "sched.h"
85#include "../workqueue_sched.h" 85#include "../workqueue_sched.h"
86#include "../smpboot.h"
86 87
87#define CREATE_TRACE_POINTS 88#define CREATE_TRACE_POINTS
88#include <trace/events/sched.h> 89#include <trace/events/sched.h>
@@ -141,9 +142,8 @@ const_debug unsigned int sysctl_sched_features =
141#define SCHED_FEAT(name, enabled) \ 142#define SCHED_FEAT(name, enabled) \
142 #name , 143 #name ,
143 144
144static __read_mostly char *sched_feat_names[] = { 145static const char * const sched_feat_names[] = {
145#include "features.h" 146#include "features.h"
146 NULL
147}; 147};
148 148
149#undef SCHED_FEAT 149#undef SCHED_FEAT
@@ -692,8 +692,6 @@ int tg_nop(struct task_group *tg, void *data)
692} 692}
693#endif 693#endif
694 694
695void update_cpu_load(struct rq *this_rq);
696
697static void set_load_weight(struct task_struct *p) 695static void set_load_weight(struct task_struct *p)
698{ 696{
699 int prio = p->static_prio - MAX_RT_PRIO; 697 int prio = p->static_prio - MAX_RT_PRIO;
@@ -2162,11 +2160,73 @@ unsigned long this_cpu_load(void)
2162} 2160}
2163 2161
2164 2162
2163/*
2164 * Global load-average calculations
2165 *
2166 * We take a distributed and async approach to calculating the global load-avg
2167 * in order to minimize overhead.
2168 *
2169 * The global load average is an exponentially decaying average of nr_running +
2170 * nr_uninterruptible.
2171 *
2172 * Once every LOAD_FREQ:
2173 *
2174 * nr_active = 0;
2175 * for_each_possible_cpu(cpu)
2176 * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
2177 *
2178 * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
2179 *
2180 * Due to a number of reasons the above turns into the mess below:
2181 *
2182 * - for_each_possible_cpu() is prohibitively expensive on machines with
2183 * a serious number of cpus, therefore we need to take a distributed approach
2184 * to calculating nr_active.
2185 *
2186 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
2187 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
2188 *
2189 * So assuming nr_active := 0 when we start out -- true per definition, we
2190 * can simply take per-cpu deltas and fold those into a global accumulator
2191 * to obtain the same result. See calc_load_fold_active().
2192 *
2193 * Furthermore, in order to avoid synchronizing all per-cpu delta folding
2194 * across the machine, we assume 10 ticks is sufficient time for every
2195 * cpu to have completed this task.
2196 *
2197 * This places an upper-bound on the IRQ-off latency of the machine. Then
2198 * again, being late doesn't lose the delta, just wrecks the sample.
2199 *
2200 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
2201 * this would add another cross-cpu cacheline miss and atomic operation
2202 * to the wakeup path. Instead we increment on whatever cpu the task ran
2203 * when it went into uninterruptible state and decrement on whatever cpu
2204 * did the wakeup. This means that only the sum of nr_uninterruptible over
2205 * all cpus yields the correct result.
2206 *
2207 * This covers the NO_HZ=n code; for extra headaches, see the comment below.
2208 */
2209
2165/* Variables and functions for calc_load */ 2210/* Variables and functions for calc_load */
2166static atomic_long_t calc_load_tasks; 2211static atomic_long_t calc_load_tasks;
2167static unsigned long calc_load_update; 2212static unsigned long calc_load_update;
2168unsigned long avenrun[3]; 2213unsigned long avenrun[3];
2169EXPORT_SYMBOL(avenrun); 2214EXPORT_SYMBOL(avenrun); /* should be removed */
2215
2216/**
2217 * get_avenrun - get the load average array
2218 * @loads: pointer to dest load array
2219 * @offset: offset to add
2220 * @shift: shift count to shift the result left
2221 *
2222 * These values are estimates at best, so no need for locking.
2223 */
2224void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2225{
2226 loads[0] = (avenrun[0] + offset) << shift;
2227 loads[1] = (avenrun[1] + offset) << shift;
2228 loads[2] = (avenrun[2] + offset) << shift;
2229}
2170 2230
2171static long calc_load_fold_active(struct rq *this_rq) 2231static long calc_load_fold_active(struct rq *this_rq)
2172{ 2232{
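
[Editor's illustration] To make the fixed-point arithmetic described in the comment above concrete, here is a small userspace sketch of the avenrun update and the /proc/loadavg-style scaling. It is an illustration only, not the kernel's exact code; the FSHIFT/EXP_* constants are assumed to match the usual definitions in include/linux/sched.h.

#include <stdio.h>

#define FSHIFT   11                      /* bits of fixed-point precision */
#define FIXED_1  (1UL << FSHIFT)         /* 1.0 in fixed-point */
#define EXP_1    1884                    /* 1/exp(5s/1min) in fixed-point */
#define EXP_5    2014                    /* 1/exp(5s/5min) */
#define EXP_15   2037                    /* 1/exp(5s/15min) */

/* a1 = a0 * e + a * (1 - e), everything scaled by FIXED_1 */
static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long avenrun[3] = { 0, 0, 0 };
	unsigned long nr_active = 3;		/* pretend 3 tasks stay runnable */

	for (int window = 0; window < 24; window++) {	/* 24 * 5s = 2 minutes */
		unsigned long active = nr_active * FIXED_1;

		avenrun[0] = calc_load(avenrun[0], EXP_1, active);
		avenrun[1] = calc_load(avenrun[1], EXP_5, active);
		avenrun[2] = calc_load(avenrun[2], EXP_15, active);
	}

	/* roughly the scaling /proc/loadavg applies on top of get_avenrun()
	 * (the real code also adds a small rounding offset) */
	for (int i = 0; i < 3; i++)
		printf("%lu.%02lu ", avenrun[i] >> FSHIFT,
		       ((avenrun[i] & (FIXED_1 - 1)) * 100) >> FSHIFT);
	printf("\n");
	return 0;
}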
@@ -2183,6 +2243,9 @@ static long calc_load_fold_active(struct rq *this_rq)
2183 return delta; 2243 return delta;
2184} 2244}
2185 2245
2246/*
2247 * a1 = a0 * e + a * (1 - e)
2248 */
2186static unsigned long 2249static unsigned long
2187calc_load(unsigned long load, unsigned long exp, unsigned long active) 2250calc_load(unsigned long load, unsigned long exp, unsigned long active)
2188{ 2251{
@@ -2194,30 +2257,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
2194 2257
2195#ifdef CONFIG_NO_HZ 2258#ifdef CONFIG_NO_HZ
2196/* 2259/*
2197 * For NO_HZ we delay the active fold to the next LOAD_FREQ update. 2260 * Handle NO_HZ for the global load-average.
2261 *
2262 * Since the above described distributed algorithm to compute the global
2263 * load-average relies on per-cpu sampling from the tick, it is affected by
2264 * NO_HZ.
2265 *
2266 * The basic idea is to fold the nr_active delta into a global idle-delta upon
2267 * entering NO_HZ state such that we can include this as an 'extra' cpu delta
2268 * when we read the global state.
2269 *
2270 * Obviously reality has to ruin such a delightfully simple scheme:
2271 *
2272 * - When we go NO_HZ idle during the window, we can negate our sample
2273 * contribution, causing under-accounting.
2274 *
2275 * We avoid this by keeping two idle-delta counters and flipping them
2276 * when the window starts, thus separating old and new NO_HZ load.
2277 *
2278 * The only trick is the slight shift in index flip for read vs write.
2279 *
2280 * 0s 5s 10s 15s
2281 * +10 +10 +10 +10
2282 * |-|-----------|-|-----------|-|-----------|-|
2283 * r:0 0 1 1 0 0 1 1 0
2284 * w:0 1 1 0 0 1 1 0 0
2285 *
2286 * This ensures we'll fold the old idle contribution in this window while
2287 * accumulating the new one.
2288 *
2289 * - When we wake up from NO_HZ idle during the window, we push up our
2290 * contribution, since we effectively move our sample point to a known
2291 * busy state.
2292 *
2293 * This is solved by pushing the window forward, and thus skipping the
2294 * sample, for this cpu (effectively using the idle-delta for this cpu which
2295 * was in effect at the time the window opened). This also solves the issue
2296 * of having to deal with a cpu having been in NOHZ idle for multiple
2297 * LOAD_FREQ intervals.
2198 * 2298 *
2199 * When making the ILB scale, we should try to pull this in as well. 2299 * When making the ILB scale, we should try to pull this in as well.
2200 */ 2300 */
2201static atomic_long_t calc_load_tasks_idle; 2301static atomic_long_t calc_load_idle[2];
2302static int calc_load_idx;
2202 2303
2203void calc_load_account_idle(struct rq *this_rq) 2304static inline int calc_load_write_idx(void)
2204{ 2305{
2306 int idx = calc_load_idx;
2307
2308 /*
2309 * See calc_global_nohz(), if we observe the new index, we also
2310 * need to observe the new update time.
2311 */
2312 smp_rmb();
2313
2314 /*
2315 * If the folding window started, make sure we start writing in the
2316 * next idle-delta.
2317 */
2318 if (!time_before(jiffies, calc_load_update))
2319 idx++;
2320
2321 return idx & 1;
2322}
2323
2324static inline int calc_load_read_idx(void)
2325{
2326 return calc_load_idx & 1;
2327}
2328
2329void calc_load_enter_idle(void)
2330{
2331 struct rq *this_rq = this_rq();
2205 long delta; 2332 long delta;
2206 2333
2334 /*
2335 * We're going into NOHZ mode, if there's any pending delta, fold it
2336 * into the pending idle delta.
2337 */
2207 delta = calc_load_fold_active(this_rq); 2338 delta = calc_load_fold_active(this_rq);
2208 if (delta) 2339 if (delta) {
2209 atomic_long_add(delta, &calc_load_tasks_idle); 2340 int idx = calc_load_write_idx();
2341 atomic_long_add(delta, &calc_load_idle[idx]);
2342 }
2210} 2343}
2211 2344
2212static long calc_load_fold_idle(void) 2345void calc_load_exit_idle(void)
2213{ 2346{
2214 long delta = 0; 2347 struct rq *this_rq = this_rq();
2215 2348
2216 /* 2349 /*
2217 * Its got a race, we don't care... 2350 * If we're still before the sample window, we're done.
2218 */ 2351 */
2219 if (atomic_long_read(&calc_load_tasks_idle)) 2352 if (time_before(jiffies, this_rq->calc_load_update))
2220 delta = atomic_long_xchg(&calc_load_tasks_idle, 0); 2353 return;
2354
2355 /*
2356 * We woke inside or after the sample window, this means we're already
2357 * accounted through the nohz accounting, so skip the entire deal and
2358 * sync up for the next window.
2359 */
2360 this_rq->calc_load_update = calc_load_update;
2361 if (time_before(jiffies, this_rq->calc_load_update + 10))
2362 this_rq->calc_load_update += LOAD_FREQ;
2363}
2364
2365static long calc_load_fold_idle(void)
2366{
2367 int idx = calc_load_read_idx();
2368 long delta = 0;
2369
2370 if (atomic_long_read(&calc_load_idle[idx]))
2371 delta = atomic_long_xchg(&calc_load_idle[idx], 0);
2221 2372
2222 return delta; 2373 return delta;
2223} 2374}
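
[Editor's illustration] The read/write index dance above is easier to see in isolation. Below is a hedged, single-threaded sketch of the two-slot idle-delta scheme: the real code uses atomics, jiffies/time_before() for the window test, and pairs the smp_wmb() in calc_global_nohz() with the smp_rmb() in calc_load_write_idx(); none of that is modeled here.

#include <stdio.h>

static long calc_load_idle[2];	/* two idle-delta buckets */
static int calc_load_idx;	/* flipped once per LOAD_FREQ window */
static int window_open;		/* stands in for !time_before(jiffies, calc_load_update) */

static int calc_load_write_idx(void)
{
	int idx = calc_load_idx;

	/* once the fold window has started, write into the *next* slot */
	if (window_open)
		idx++;
	return idx & 1;
}

static long calc_load_fold_idle(void)
{
	int idx = calc_load_idx & 1;	/* read side: current slot */
	long delta = calc_load_idle[idx];

	calc_load_idle[idx] = 0;
	return delta;
}

int main(void)
{
	calc_load_idle[calc_load_write_idx()] += 2;	/* two cpus go idle early */
	window_open = 1;				/* LOAD_FREQ window opens */
	calc_load_idle[calc_load_write_idx()] += 1;	/* a late idler lands in the new slot */

	printf("folded this window: %ld\n", calc_load_fold_idle());	/* 2 */
	calc_load_idx++;				/* calc_global_nohz() flips the index */
	window_open = 0;
	printf("folded next window: %ld\n", calc_load_fold_idle());	/* 1 */
	return 0;
}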
@@ -2303,66 +2454,39 @@ static void calc_global_nohz(void)
2303{ 2454{
2304 long delta, active, n; 2455 long delta, active, n;
2305 2456
2306 /* 2457 if (!time_before(jiffies, calc_load_update + 10)) {
2307 * If we crossed a calc_load_update boundary, make sure to fold 2458 /*
2308 * any pending idle changes, the respective CPUs might have 2459 * Catch-up, fold however many we are behind still
2309 * missed the tick driven calc_load_account_active() update 2460 */
2310 * due to NO_HZ. 2461 delta = jiffies - calc_load_update - 10;
2311 */ 2462 n = 1 + (delta / LOAD_FREQ);
2312 delta = calc_load_fold_idle();
2313 if (delta)
2314 atomic_long_add(delta, &calc_load_tasks);
2315 2463
2316 /* 2464 active = atomic_long_read(&calc_load_tasks);
2317 * It could be the one fold was all it took, we done! 2465 active = active > 0 ? active * FIXED_1 : 0;
2318 */
2319 if (time_before(jiffies, calc_load_update + 10))
2320 return;
2321 2466
2322 /* 2467 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
2323 * Catch-up, fold however many we are behind still 2468 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
2324 */ 2469 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
2325 delta = jiffies - calc_load_update - 10;
2326 n = 1 + (delta / LOAD_FREQ);
2327 2470
2328 active = atomic_long_read(&calc_load_tasks); 2471 calc_load_update += n * LOAD_FREQ;
2329 active = active > 0 ? active * FIXED_1 : 0; 2472 }
2330
2331 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
2332 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
2333 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
2334
2335 calc_load_update += n * LOAD_FREQ;
2336}
2337#else
2338void calc_load_account_idle(struct rq *this_rq)
2339{
2340}
2341 2473
2342static inline long calc_load_fold_idle(void) 2474 /*
2343{ 2475 * Flip the idle index...
2344 return 0; 2476 *
2477 * Make sure we first write the new time then flip the index, so that
2478 * calc_load_write_idx() will see the new time when it reads the new
2479 * index, this avoids a double flip messing things up.
2480 */
2481 smp_wmb();
2482 calc_load_idx++;
2345} 2483}
2484#else /* !CONFIG_NO_HZ */
2346 2485
2347static void calc_global_nohz(void) 2486static inline long calc_load_fold_idle(void) { return 0; }
2348{ 2487static inline void calc_global_nohz(void) { }
2349}
2350#endif
2351 2488
2352/** 2489#endif /* CONFIG_NO_HZ */
2353 * get_avenrun - get the load average array
2354 * @loads: pointer to dest load array
2355 * @offset: offset to add
2356 * @shift: shift count to shift the result left
2357 *
2358 * These values are estimates at best, so no need for locking.
2359 */
2360void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2361{
2362 loads[0] = (avenrun[0] + offset) << shift;
2363 loads[1] = (avenrun[1] + offset) << shift;
2364 loads[2] = (avenrun[2] + offset) << shift;
2365}
2366 2490
2367/* 2491/*
2368 * calc_load - update the avenrun load estimates 10 ticks after the 2492 * calc_load - update the avenrun load estimates 10 ticks after the
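
[Editor's illustration] The catch-up path above folds n missed LOAD_FREQ windows at once via calc_load_n(). A self-contained sketch of what that computation amounts to, using fixed-point exponentiation by squaring; the rounding details here are my own and differ from the in-tree fixed_power_int().

#include <stdio.h>

#define FSHIFT  11
#define FIXED_1 (1UL << FSHIFT)
#define EXP_1   1884			/* 1/exp(5s/1min), fixed-point */

static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	return load >> FSHIFT;
}

/* fixed-point exp^n by repeated squaring, rounding each step */
static unsigned long fixed_pow(unsigned long x, unsigned int n)
{
	unsigned long r = FIXED_1;

	while (n) {
		if (n & 1)
			r = (r * x + (FIXED_1 >> 1)) >> FSHIFT;
		x = (x * x + (FIXED_1 >> 1)) >> FSHIFT;
		n >>= 1;
	}
	return r;
}

/* a_n = a_0 * e^n + a * (1 - e^n): n idle windows folded in one go */
static unsigned long calc_load_n(unsigned long load, unsigned long exp,
				 unsigned long active, unsigned int n)
{
	return calc_load(load, fixed_pow(exp, n), active);
}

int main(void)
{
	unsigned long start = 2 * FIXED_1, step = start;

	for (int i = 0; i < 4; i++)		/* four windows, nothing runnable */
		step = calc_load(step, EXP_1, 0);

	printf("stepwise: %lu  bulk: %lu\n", step, calc_load_n(start, EXP_1, 0, 4));
	return 0;
}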
@@ -2370,11 +2494,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2370 */ 2494 */
2371void calc_global_load(unsigned long ticks) 2495void calc_global_load(unsigned long ticks)
2372{ 2496{
2373 long active; 2497 long active, delta;
2374 2498
2375 if (time_before(jiffies, calc_load_update + 10)) 2499 if (time_before(jiffies, calc_load_update + 10))
2376 return; 2500 return;
2377 2501
2502 /*
2503 * Fold the 'old' idle-delta to include all NO_HZ cpus.
2504 */
2505 delta = calc_load_fold_idle();
2506 if (delta)
2507 atomic_long_add(delta, &calc_load_tasks);
2508
2378 active = atomic_long_read(&calc_load_tasks); 2509 active = atomic_long_read(&calc_load_tasks);
2379 active = active > 0 ? active * FIXED_1 : 0; 2510 active = active > 0 ? active * FIXED_1 : 0;
2380 2511
@@ -2385,12 +2516,7 @@ void calc_global_load(unsigned long ticks)
2385 calc_load_update += LOAD_FREQ; 2516 calc_load_update += LOAD_FREQ;
2386 2517
2387 /* 2518 /*
2388 * Account one period with whatever state we found before 2519 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
2389 * folding in the nohz state and ageing the entire idle period.
2390 *
2391 * This avoids loosing a sample when we go idle between
2392 * calc_load_account_active() (10 ticks ago) and now and thus
2393 * under-accounting.
2394 */ 2520 */
2395 calc_global_nohz(); 2521 calc_global_nohz();
2396} 2522}
@@ -2407,7 +2533,6 @@ static void calc_load_account_active(struct rq *this_rq)
2407 return; 2533 return;
2408 2534
2409 delta = calc_load_fold_active(this_rq); 2535 delta = calc_load_fold_active(this_rq);
2410 delta += calc_load_fold_idle();
2411 if (delta) 2536 if (delta)
2412 atomic_long_add(delta, &calc_load_tasks); 2537 atomic_long_add(delta, &calc_load_tasks);
2413 2538
@@ -2415,6 +2540,10 @@ static void calc_load_account_active(struct rq *this_rq)
2415} 2540}
2416 2541
2417/* 2542/*
2543 * End of global load-average stuff
2544 */
2545
2546/*
2418 * The exact cpuload at various idx values, calculated at every tick would be 2547 * The exact cpuload at various idx values, calculated at every tick would be
2419 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load 2548 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
2420 * 2549 *
@@ -2486,22 +2615,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
2486 * scheduler tick (TICK_NSEC). With tickless idle this will not be called 2615 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
2487 * every tick. We fix it up based on jiffies. 2616 * every tick. We fix it up based on jiffies.
2488 */ 2617 */
2489void update_cpu_load(struct rq *this_rq) 2618static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
2619 unsigned long pending_updates)
2490{ 2620{
2491 unsigned long this_load = this_rq->load.weight;
2492 unsigned long curr_jiffies = jiffies;
2493 unsigned long pending_updates;
2494 int i, scale; 2621 int i, scale;
2495 2622
2496 this_rq->nr_load_updates++; 2623 this_rq->nr_load_updates++;
2497 2624
2498 /* Avoid repeated calls on same jiffy, when moving in and out of idle */
2499 if (curr_jiffies == this_rq->last_load_update_tick)
2500 return;
2501
2502 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2503 this_rq->last_load_update_tick = curr_jiffies;
2504
2505 /* Update our load: */ 2625 /* Update our load: */
2506 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ 2626 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
2507 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 2627 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
@@ -2526,9 +2646,78 @@ void update_cpu_load(struct rq *this_rq)
2526 sched_avg_update(this_rq); 2646 sched_avg_update(this_rq);
2527} 2647}
2528 2648
2649#ifdef CONFIG_NO_HZ
2650/*
2651 * There is no sane way to deal with nohz on smp when using jiffies because the
2652 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
2653 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
2654 *
2655 * Therefore we cannot use the delta approach from the regular tick since that
2656 * would seriously skew the load calculation. However we'll make do for those
2657 * updates happening while idle (nohz_idle_balance) or coming out of idle
2658 * (tick_nohz_idle_exit).
2659 *
2660 * This means we might still be one tick off for nohz periods.
2661 */
2662
2663/*
2664 * Called from nohz_idle_balance() to update the load ratings before doing the
2665 * idle balance.
2666 */
2667void update_idle_cpu_load(struct rq *this_rq)
2668{
2669 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2670 unsigned long load = this_rq->load.weight;
2671 unsigned long pending_updates;
2672
2673 /*
2674 * bail if there's load or we're actually up-to-date.
2675 */
2676 if (load || curr_jiffies == this_rq->last_load_update_tick)
2677 return;
2678
2679 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2680 this_rq->last_load_update_tick = curr_jiffies;
2681
2682 __update_cpu_load(this_rq, load, pending_updates);
2683}
2684
2685/*
2686 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
2687 */
2688void update_cpu_load_nohz(void)
2689{
2690 struct rq *this_rq = this_rq();
2691 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2692 unsigned long pending_updates;
2693
2694 if (curr_jiffies == this_rq->last_load_update_tick)
2695 return;
2696
2697 raw_spin_lock(&this_rq->lock);
2698 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2699 if (pending_updates) {
2700 this_rq->last_load_update_tick = curr_jiffies;
2701 /*
2702 * We were idle, this means load 0, the current load might be
2703 * !0 due to remote wakeups and the sort.
2704 */
2705 __update_cpu_load(this_rq, 0, pending_updates);
2706 }
2707 raw_spin_unlock(&this_rq->lock);
2708}
2709#endif /* CONFIG_NO_HZ */
2710
2711/*
2712 * Called from scheduler_tick()
2713 */
2529static void update_cpu_load_active(struct rq *this_rq) 2714static void update_cpu_load_active(struct rq *this_rq)
2530{ 2715{
2531 update_cpu_load(this_rq); 2716 /*
2717 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
2718 */
2719 this_rq->last_load_update_tick = jiffies;
2720 __update_cpu_load(this_rq, this_rq->load.weight, 1);
2532 2721
2533 calc_load_account_active(this_rq); 2722 calc_load_account_active(this_rq);
2534} 2723}
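
[Editor's illustration] On the per-cpu cpu_load[] side, the formula quoted earlier (load = (2^idx - 1)/2^idx * load + cur/2^idx) has to be replayed for every tick missed while tickless. The following simplified userspace sketch shows the idea; the kernel's decay_load_missed() uses precomputed degrade tables and rounds the new load up, both of which this sketch omits.

#include <stdio.h>

#define CPU_LOAD_IDX_MAX 5

/* decay as if cur_load had been 0 for 'missed' ticks: *= ((2^idx-1)/2^idx)^missed */
static unsigned long decay_missed(unsigned long load, unsigned long missed, int idx)
{
	while (missed--)
		load -= load >> idx;
	return load;
}

static void update_cpu_load(unsigned long cpu_load[], unsigned long cur,
			    unsigned long pending_updates)
{
	cpu_load[0] = cur;			/* idx 0 tracks the instantaneous load */

	for (int i = 1; i < CPU_LOAD_IDX_MAX; i++) {
		unsigned long old = decay_missed(cpu_load[i], pending_updates - 1, i);

		/* one regular update for the current tick */
		cpu_load[i] = (old * ((1UL << i) - 1) + cur) >> i;
	}
}

int main(void)
{
	unsigned long cpu_load[CPU_LOAD_IDX_MAX] = { 1024, 1024, 1024, 1024, 1024 };

	update_cpu_load(cpu_load, 0, 50);	/* woke after ~50 tickless jiffies */

	for (int i = 0; i < CPU_LOAD_IDX_MAX; i++)
		printf("cpu_load[%d] = %lu\n", i, cpu_load[i]);
	return 0;
}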
@@ -3113,6 +3302,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
3113 if (irqs_disabled()) 3302 if (irqs_disabled())
3114 print_irqtrace_events(prev); 3303 print_irqtrace_events(prev);
3115 dump_stack(); 3304 dump_stack();
3305 add_taint(TAINT_WARN);
3116} 3306}
3117 3307
3118/* 3308/*
@@ -4042,11 +4232,8 @@ static bool check_same_owner(struct task_struct *p)
4042 4232
4043 rcu_read_lock(); 4233 rcu_read_lock();
4044 pcred = __task_cred(p); 4234 pcred = __task_cred(p);
4045 if (cred->user->user_ns == pcred->user->user_ns) 4235 match = (uid_eq(cred->euid, pcred->euid) ||
4046 match = (cred->euid == pcred->euid || 4236 uid_eq(cred->euid, pcred->uid));
4047 cred->euid == pcred->uid);
4048 else
4049 match = false;
4050 rcu_read_unlock(); 4237 rcu_read_unlock();
4051 return match; 4238 return match;
4052} 4239}
@@ -4957,7 +5144,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4957 p->sched_class->set_cpus_allowed(p, new_mask); 5144 p->sched_class->set_cpus_allowed(p, new_mask);
4958 5145
4959 cpumask_copy(&p->cpus_allowed, new_mask); 5146 cpumask_copy(&p->cpus_allowed, new_mask);
4960 p->rt.nr_cpus_allowed = cpumask_weight(new_mask); 5147 p->nr_cpus_allowed = cpumask_weight(new_mask);
4961} 5148}
4962 5149
4963/* 5150/*
@@ -5499,15 +5686,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
5499 5686
5500#ifdef CONFIG_SCHED_DEBUG 5687#ifdef CONFIG_SCHED_DEBUG
5501 5688
5502static __read_mostly int sched_domain_debug_enabled; 5689static __read_mostly int sched_debug_enabled;
5503 5690
5504static int __init sched_domain_debug_setup(char *str) 5691static int __init sched_debug_setup(char *str)
5505{ 5692{
5506 sched_domain_debug_enabled = 1; 5693 sched_debug_enabled = 1;
5507 5694
5508 return 0; 5695 return 0;
5509} 5696}
5510early_param("sched_debug", sched_domain_debug_setup); 5697early_param("sched_debug", sched_debug_setup);
5698
5699static inline bool sched_debug(void)
5700{
5701 return sched_debug_enabled;
5702}
5511 5703
5512static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 5704static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5513 struct cpumask *groupmask) 5705 struct cpumask *groupmask)
@@ -5547,7 +5739,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5547 break; 5739 break;
5548 } 5740 }
5549 5741
5550 if (!group->sgp->power) { 5742 /*
5743 * Even though we initialize ->power to something semi-sane,
5744 * we leave power_orig unset. This allows us to detect if
5745 * domain iteration is still funny without causing /0 traps.
5746 */
5747 if (!group->sgp->power_orig) {
5551 printk(KERN_CONT "\n"); 5748 printk(KERN_CONT "\n");
5552 printk(KERN_ERR "ERROR: domain->cpu_power not " 5749 printk(KERN_ERR "ERROR: domain->cpu_power not "
5553 "set\n"); 5750 "set\n");
@@ -5560,7 +5757,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5560 break; 5757 break;
5561 } 5758 }
5562 5759
5563 if (cpumask_intersects(groupmask, sched_group_cpus(group))) { 5760 if (!(sd->flags & SD_OVERLAP) &&
5761 cpumask_intersects(groupmask, sched_group_cpus(group))) {
5564 printk(KERN_CONT "\n"); 5762 printk(KERN_CONT "\n");
5565 printk(KERN_ERR "ERROR: repeated CPUs\n"); 5763 printk(KERN_ERR "ERROR: repeated CPUs\n");
5566 break; 5764 break;
@@ -5594,7 +5792,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5594{ 5792{
5595 int level = 0; 5793 int level = 0;
5596 5794
5597 if (!sched_domain_debug_enabled) 5795 if (!sched_debug_enabled)
5598 return; 5796 return;
5599 5797
5600 if (!sd) { 5798 if (!sd) {
@@ -5615,6 +5813,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5615} 5813}
5616#else /* !CONFIG_SCHED_DEBUG */ 5814#else /* !CONFIG_SCHED_DEBUG */
5617# define sched_domain_debug(sd, cpu) do { } while (0) 5815# define sched_domain_debug(sd, cpu) do { } while (0)
5816static inline bool sched_debug(void)
5817{
5818 return false;
5819}
5618#endif /* CONFIG_SCHED_DEBUG */ 5820#endif /* CONFIG_SCHED_DEBUG */
5619 5821
5620static int sd_degenerate(struct sched_domain *sd) 5822static int sd_degenerate(struct sched_domain *sd)
@@ -5898,99 +6100,11 @@ static int __init isolated_cpu_setup(char *str)
5898 6100
5899__setup("isolcpus=", isolated_cpu_setup); 6101__setup("isolcpus=", isolated_cpu_setup);
5900 6102
5901#ifdef CONFIG_NUMA
5902
5903/**
5904 * find_next_best_node - find the next node to include in a sched_domain
5905 * @node: node whose sched_domain we're building
5906 * @used_nodes: nodes already in the sched_domain
5907 *
5908 * Find the next node to include in a given scheduling domain. Simply
5909 * finds the closest node not already in the @used_nodes map.
5910 *
5911 * Should use nodemask_t.
5912 */
5913static int find_next_best_node(int node, nodemask_t *used_nodes)
5914{
5915 int i, n, val, min_val, best_node = -1;
5916
5917 min_val = INT_MAX;
5918
5919 for (i = 0; i < nr_node_ids; i++) {
5920 /* Start at @node */
5921 n = (node + i) % nr_node_ids;
5922
5923 if (!nr_cpus_node(n))
5924 continue;
5925
5926 /* Skip already used nodes */
5927 if (node_isset(n, *used_nodes))
5928 continue;
5929
5930 /* Simple min distance search */
5931 val = node_distance(node, n);
5932
5933 if (val < min_val) {
5934 min_val = val;
5935 best_node = n;
5936 }
5937 }
5938
5939 if (best_node != -1)
5940 node_set(best_node, *used_nodes);
5941 return best_node;
5942}
5943
5944/**
5945 * sched_domain_node_span - get a cpumask for a node's sched_domain
5946 * @node: node whose cpumask we're constructing
5947 * @span: resulting cpumask
5948 *
5949 * Given a node, construct a good cpumask for its sched_domain to span. It
5950 * should be one that prevents unnecessary balancing, but also spreads tasks
5951 * out optimally.
5952 */
5953static void sched_domain_node_span(int node, struct cpumask *span)
5954{
5955 nodemask_t used_nodes;
5956 int i;
5957
5958 cpumask_clear(span);
5959 nodes_clear(used_nodes);
5960
5961 cpumask_or(span, span, cpumask_of_node(node));
5962 node_set(node, used_nodes);
5963
5964 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5965 int next_node = find_next_best_node(node, &used_nodes);
5966 if (next_node < 0)
5967 break;
5968 cpumask_or(span, span, cpumask_of_node(next_node));
5969 }
5970}
5971
5972static const struct cpumask *cpu_node_mask(int cpu)
5973{
5974 lockdep_assert_held(&sched_domains_mutex);
5975
5976 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
5977
5978 return sched_domains_tmpmask;
5979}
5980
5981static const struct cpumask *cpu_allnodes_mask(int cpu)
5982{
5983 return cpu_possible_mask;
5984}
5985#endif /* CONFIG_NUMA */
5986
5987static const struct cpumask *cpu_cpu_mask(int cpu) 6103static const struct cpumask *cpu_cpu_mask(int cpu)
5988{ 6104{
5989 return cpumask_of_node(cpu_to_node(cpu)); 6105 return cpumask_of_node(cpu_to_node(cpu));
5990} 6106}
5991 6107
5992int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
5993
5994struct sd_data { 6108struct sd_data {
5995 struct sched_domain **__percpu sd; 6109 struct sched_domain **__percpu sd;
5996 struct sched_group **__percpu sg; 6110 struct sched_group **__percpu sg;
@@ -6020,9 +6134,48 @@ struct sched_domain_topology_level {
6020 sched_domain_init_f init; 6134 sched_domain_init_f init;
6021 sched_domain_mask_f mask; 6135 sched_domain_mask_f mask;
6022 int flags; 6136 int flags;
6137 int numa_level;
6023 struct sd_data data; 6138 struct sd_data data;
6024}; 6139};
6025 6140
6141/*
6142 * Build an iteration mask that can exclude certain CPUs from the upwards
6143 * domain traversal.
6144 *
6145 * Asymmetric node setups can result in situations where the domain tree is of
6146 * unequal depth, make sure to skip domains that already cover the entire
6147 * range.
6148 *
6149 * In that case build_sched_domains() will have terminated the iteration early
6150 * and our sibling sd spans will be empty. Domains should always include the
6151 * cpu they're built on, so check that.
6152 *
6153 */
6154static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
6155{
6156 const struct cpumask *span = sched_domain_span(sd);
6157 struct sd_data *sdd = sd->private;
6158 struct sched_domain *sibling;
6159 int i;
6160
6161 for_each_cpu(i, span) {
6162 sibling = *per_cpu_ptr(sdd->sd, i);
6163 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
6164 continue;
6165
6166 cpumask_set_cpu(i, sched_group_mask(sg));
6167 }
6168}
6169
6170/*
6171 * Return the canonical balance cpu for this group, this is the first cpu
6172 * of this group that's also in the iteration mask.
6173 */
6174int group_balance_cpu(struct sched_group *sg)
6175{
6176 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
6177}
6178
6026static int 6179static int
6027build_overlap_sched_groups(struct sched_domain *sd, int cpu) 6180build_overlap_sched_groups(struct sched_domain *sd, int cpu)
6028{ 6181{
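
[Editor's illustration] group_balance_cpu() above is simply "the first cpu that is in both the group span and the group's iteration mask". A tiny bitmask sketch of that operation (the kernel uses cpumask_first_and() on real cpumasks; plain unsigned long bitmaps stand in here):

#include <stdio.h>

/* first bit set in both masks, or -1 if the intersection is empty */
static int first_and(unsigned long group_cpus, unsigned long group_mask)
{
	unsigned long both = group_cpus & group_mask;

	return both ? __builtin_ctzl(both) : -1;
}

int main(void)
{
	unsigned long sg_cpus = 0xf0;	/* group spans cpus 4-7 */
	unsigned long sg_mask = 0x30;	/* only cpus 4-5 may balance this group */

	printf("balance cpu: %d\n", first_and(sg_cpus, sg_mask));	/* 4 */
	return 0;
}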
@@ -6041,6 +6194,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
6041 if (cpumask_test_cpu(i, covered)) 6194 if (cpumask_test_cpu(i, covered))
6042 continue; 6195 continue;
6043 6196
6197 child = *per_cpu_ptr(sdd->sd, i);
6198
6199 /* See the comment near build_group_mask(). */
6200 if (!cpumask_test_cpu(i, sched_domain_span(child)))
6201 continue;
6202
6044 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 6203 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6045 GFP_KERNEL, cpu_to_node(cpu)); 6204 GFP_KERNEL, cpu_to_node(cpu));
6046 6205
@@ -6048,8 +6207,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
6048 goto fail; 6207 goto fail;
6049 6208
6050 sg_span = sched_group_cpus(sg); 6209 sg_span = sched_group_cpus(sg);
6051
6052 child = *per_cpu_ptr(sdd->sd, i);
6053 if (child->child) { 6210 if (child->child) {
6054 child = child->child; 6211 child = child->child;
6055 cpumask_copy(sg_span, sched_domain_span(child)); 6212 cpumask_copy(sg_span, sched_domain_span(child));
@@ -6058,10 +6215,24 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
6058 6215
6059 cpumask_or(covered, covered, sg_span); 6216 cpumask_or(covered, covered, sg_span);
6060 6217
6061 sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); 6218 sg->sgp = *per_cpu_ptr(sdd->sgp, i);
6062 atomic_inc(&sg->sgp->ref); 6219 if (atomic_inc_return(&sg->sgp->ref) == 1)
6220 build_group_mask(sd, sg);
6063 6221
6064 if (cpumask_test_cpu(cpu, sg_span)) 6222 /*
6223 * Initialize sgp->power such that even if we mess up the
6224 * domains and no possible iteration will get us here, we won't
6225 * die on a /0 trap.
6226 */
6227 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
6228
6229 /*
6230 * Make sure the first group of this domain contains the
6231 * canonical balance cpu. Otherwise the sched_domain iteration
6232 * breaks. See update_sg_lb_stats().
6233 */
6234 if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
6235 group_balance_cpu(sg) == cpu)
6065 groups = sg; 6236 groups = sg;
6066 6237
6067 if (!first) 6238 if (!first)
@@ -6135,6 +6306,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
6135 6306
6136 cpumask_clear(sched_group_cpus(sg)); 6307 cpumask_clear(sched_group_cpus(sg));
6137 sg->sgp->power = 0; 6308 sg->sgp->power = 0;
6309 cpumask_setall(sched_group_mask(sg));
6138 6310
6139 for_each_cpu(j, span) { 6311 for_each_cpu(j, span) {
6140 if (get_group(j, sdd, NULL) != group) 6312 if (get_group(j, sdd, NULL) != group)
@@ -6176,7 +6348,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6176 sg = sg->next; 6348 sg = sg->next;
6177 } while (sg != sd->groups); 6349 } while (sg != sd->groups);
6178 6350
6179 if (cpu != group_first_cpu(sg)) 6351 if (cpu != group_balance_cpu(sg))
6180 return; 6352 return;
6181 6353
6182 update_group_power(sd, cpu); 6354 update_group_power(sd, cpu);
@@ -6211,10 +6383,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
6211} 6383}
6212 6384
6213SD_INIT_FUNC(CPU) 6385SD_INIT_FUNC(CPU)
6214#ifdef CONFIG_NUMA
6215 SD_INIT_FUNC(ALLNODES)
6216 SD_INIT_FUNC(NODE)
6217#endif
6218#ifdef CONFIG_SCHED_SMT 6386#ifdef CONFIG_SCHED_SMT
6219 SD_INIT_FUNC(SIBLING) 6387 SD_INIT_FUNC(SIBLING)
6220#endif 6388#endif
@@ -6230,11 +6398,8 @@ int sched_domain_level_max;
6230 6398
6231static int __init setup_relax_domain_level(char *str) 6399static int __init setup_relax_domain_level(char *str)
6232{ 6400{
6233 unsigned long val; 6401 if (kstrtoint(str, 0, &default_relax_domain_level))
6234 6402 pr_warn("Unable to set relax_domain_level\n");
6235 val = simple_strtoul(str, NULL, 0);
6236 if (val < sched_domain_level_max)
6237 default_relax_domain_level = val;
6238 6403
6239 return 1; 6404 return 1;
6240} 6405}
@@ -6336,15 +6501,236 @@ static struct sched_domain_topology_level default_topology[] = {
6336 { sd_init_BOOK, cpu_book_mask, }, 6501 { sd_init_BOOK, cpu_book_mask, },
6337#endif 6502#endif
6338 { sd_init_CPU, cpu_cpu_mask, }, 6503 { sd_init_CPU, cpu_cpu_mask, },
6339#ifdef CONFIG_NUMA
6340 { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
6341 { sd_init_ALLNODES, cpu_allnodes_mask, },
6342#endif
6343 { NULL, }, 6504 { NULL, },
6344}; 6505};
6345 6506
6346static struct sched_domain_topology_level *sched_domain_topology = default_topology; 6507static struct sched_domain_topology_level *sched_domain_topology = default_topology;
6347 6508
6509#ifdef CONFIG_NUMA
6510
6511static int sched_domains_numa_levels;
6512static int *sched_domains_numa_distance;
6513static struct cpumask ***sched_domains_numa_masks;
6514static int sched_domains_curr_level;
6515
6516static inline int sd_local_flags(int level)
6517{
6518 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
6519 return 0;
6520
6521 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
6522}
6523
6524static struct sched_domain *
6525sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
6526{
6527 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
6528 int level = tl->numa_level;
6529 int sd_weight = cpumask_weight(
6530 sched_domains_numa_masks[level][cpu_to_node(cpu)]);
6531
6532 *sd = (struct sched_domain){
6533 .min_interval = sd_weight,
6534 .max_interval = 2*sd_weight,
6535 .busy_factor = 32,
6536 .imbalance_pct = 125,
6537 .cache_nice_tries = 2,
6538 .busy_idx = 3,
6539 .idle_idx = 2,
6540 .newidle_idx = 0,
6541 .wake_idx = 0,
6542 .forkexec_idx = 0,
6543
6544 .flags = 1*SD_LOAD_BALANCE
6545 | 1*SD_BALANCE_NEWIDLE
6546 | 0*SD_BALANCE_EXEC
6547 | 0*SD_BALANCE_FORK
6548 | 0*SD_BALANCE_WAKE
6549 | 0*SD_WAKE_AFFINE
6550 | 0*SD_PREFER_LOCAL
6551 | 0*SD_SHARE_CPUPOWER
6552 | 0*SD_SHARE_PKG_RESOURCES
6553 | 1*SD_SERIALIZE
6554 | 0*SD_PREFER_SIBLING
6555 | sd_local_flags(level)
6556 ,
6557 .last_balance = jiffies,
6558 .balance_interval = sd_weight,
6559 };
6560 SD_INIT_NAME(sd, NUMA);
6561 sd->private = &tl->data;
6562
6563 /*
6564 * Ugly hack to pass state to sd_numa_mask()...
6565 */
6566 sched_domains_curr_level = tl->numa_level;
6567
6568 return sd;
6569}
6570
6571static const struct cpumask *sd_numa_mask(int cpu)
6572{
6573 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
6574}
6575
6576static void sched_numa_warn(const char *str)
6577{
6578 static int done = false;
6579 int i,j;
6580
6581 if (done)
6582 return;
6583
6584 done = true;
6585
6586 printk(KERN_WARNING "ERROR: %s\n\n", str);
6587
6588 for (i = 0; i < nr_node_ids; i++) {
6589 printk(KERN_WARNING " ");
6590 for (j = 0; j < nr_node_ids; j++)
6591 printk(KERN_CONT "%02d ", node_distance(i,j));
6592 printk(KERN_CONT "\n");
6593 }
6594 printk(KERN_WARNING "\n");
6595}
6596
6597static bool find_numa_distance(int distance)
6598{
6599 int i;
6600
6601 if (distance == node_distance(0, 0))
6602 return true;
6603
6604 for (i = 0; i < sched_domains_numa_levels; i++) {
6605 if (sched_domains_numa_distance[i] == distance)
6606 return true;
6607 }
6608
6609 return false;
6610}
6611
6612static void sched_init_numa(void)
6613{
6614 int next_distance, curr_distance = node_distance(0, 0);
6615 struct sched_domain_topology_level *tl;
6616 int level = 0;
6617 int i, j, k;
6618
6619 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
6620 if (!sched_domains_numa_distance)
6621 return;
6622
6623 /*
6624 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
6625 * unique distances in the node_distance() table.
6626 *
6627 * Assumes node_distance(0,j) includes all distances in
6628 * node_distance(i,j) in order to avoid cubic time.
6629 */
6630 next_distance = curr_distance;
6631 for (i = 0; i < nr_node_ids; i++) {
6632 for (j = 0; j < nr_node_ids; j++) {
6633 for (k = 0; k < nr_node_ids; k++) {
6634 int distance = node_distance(i, k);
6635
6636 if (distance > curr_distance &&
6637 (distance < next_distance ||
6638 next_distance == curr_distance))
6639 next_distance = distance;
6640
6641 /*
6642 * While not a strong assumption, it would be nice to know
6643 * about cases where node A is connected to B but B is not
6644 * equally connected to A.
6645 */
6646 if (sched_debug() && node_distance(k, i) != distance)
6647 sched_numa_warn("Node-distance not symmetric");
6648
6649 if (sched_debug() && i && !find_numa_distance(distance))
6650 sched_numa_warn("Node-0 not representative");
6651 }
6652 if (next_distance != curr_distance) {
6653 sched_domains_numa_distance[level++] = next_distance;
6654 sched_domains_numa_levels = level;
6655 curr_distance = next_distance;
6656 } else break;
6657 }
6658
6659 /*
6660 * In case of sched_debug() we verify the above assumption.
6661 */
6662 if (!sched_debug())
6663 break;
6664 }
6665 /*
6666 * 'level' contains the number of unique distances, excluding the
6667 * identity distance node_distance(i,i).
6668 *
6669 * The sched_domains_numa_distance[] array includes the actual distance
6670 * numbers.
6671 */
6672
6673 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
6674 if (!sched_domains_numa_masks)
6675 return;
6676
6677 /*
6678 * Now for each level, construct a mask per node which contains all
6679 * cpus of nodes that are that many hops away from us.
6680 */
6681 for (i = 0; i < level; i++) {
6682 sched_domains_numa_masks[i] =
6683 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
6684 if (!sched_domains_numa_masks[i])
6685 return;
6686
6687 for (j = 0; j < nr_node_ids; j++) {
6688 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
6689 if (!mask)
6690 return;
6691
6692 sched_domains_numa_masks[i][j] = mask;
6693
6694 for (k = 0; k < nr_node_ids; k++) {
6695 if (node_distance(j, k) > sched_domains_numa_distance[i])
6696 continue;
6697
6698 cpumask_or(mask, mask, cpumask_of_node(k));
6699 }
6700 }
6701 }
6702
6703 tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
6704 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
6705 if (!tl)
6706 return;
6707
6708 /*
6709 * Copy the default topology bits..
6710 */
6711 for (i = 0; default_topology[i].init; i++)
6712 tl[i] = default_topology[i];
6713
6714 /*
6715 * .. and append 'j' levels of NUMA goodness.
6716 */
6717 for (j = 0; j < level; i++, j++) {
6718 tl[i] = (struct sched_domain_topology_level){
6719 .init = sd_numa_init,
6720 .mask = sd_numa_mask,
6721 .flags = SDTL_OVERLAP,
6722 .numa_level = j,
6723 };
6724 }
6725
6726 sched_domain_topology = tl;
6727}
6728#else
6729static inline void sched_init_numa(void)
6730{
6731}
6732#endif /* CONFIG_NUMA */
6733
6348static int __sdt_alloc(const struct cpumask *cpu_map) 6734static int __sdt_alloc(const struct cpumask *cpu_map)
6349{ 6735{
6350 struct sched_domain_topology_level *tl; 6736 struct sched_domain_topology_level *tl;
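
[Editor's illustration] The sched_init_numa() hunk above does two things: extract the distinct node_distance() values ("levels"), then build, per level and node, the mask of nodes within that distance. A toy, userspace-only sketch of the same idea with a hard-coded 4-node distance table (the kernel of course works on nodemasks/cpumasks and nr_node_ids):

#include <stdio.h>

#define NR_NODES 4

static const int dist[NR_NODES][NR_NODES] = {
	{ 10, 20, 20, 30 },
	{ 20, 10, 30, 20 },
	{ 20, 30, 10, 20 },
	{ 30, 20, 20, 10 },
};

int main(void)
{
	int levels[NR_NODES * NR_NODES], nr_levels = 0;
	int curr = dist[0][0];

	/* deduplicating selection: repeatedly pick the next larger distance */
	for (;;) {
		int next = curr;

		for (int i = 0; i < NR_NODES; i++)
			for (int j = 0; j < NR_NODES; j++)
				if (dist[i][j] > curr &&
				    (dist[i][j] < next || next == curr))
					next = dist[i][j];
		if (next == curr)
			break;
		levels[nr_levels++] = next;
		curr = next;
	}

	/* per level and node: which nodes fall within that distance */
	for (int l = 0; l < nr_levels; l++) {
		printf("level %d (distance <= %d):\n", l, levels[l]);
		for (int j = 0; j < NR_NODES; j++) {
			printf("  node %d:", j);
			for (int k = 0; k < NR_NODES; k++)
				if (dist[j][k] <= levels[l])
					printf(" %d", k);
			printf("\n");
		}
	}
	return 0;
}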
@@ -6382,9 +6768,11 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
6382 if (!sg) 6768 if (!sg)
6383 return -ENOMEM; 6769 return -ENOMEM;
6384 6770
6771 sg->next = sg;
6772
6385 *per_cpu_ptr(sdd->sg, j) = sg; 6773 *per_cpu_ptr(sdd->sg, j) = sg;
6386 6774
6387 sgp = kzalloc_node(sizeof(struct sched_group_power), 6775 sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
6388 GFP_KERNEL, cpu_to_node(j)); 6776 GFP_KERNEL, cpu_to_node(j));
6389 if (!sgp) 6777 if (!sgp)
6390 return -ENOMEM; 6778 return -ENOMEM;
@@ -6437,7 +6825,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6437 if (!sd) 6825 if (!sd)
6438 return child; 6826 return child;
6439 6827
6440 set_domain_attribute(sd, attr);
6441 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); 6828 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6442 if (child) { 6829 if (child) {
6443 sd->level = child->level + 1; 6830 sd->level = child->level + 1;
@@ -6445,6 +6832,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6445 child->parent = sd; 6832 child->parent = sd;
6446 } 6833 }
6447 sd->child = child; 6834 sd->child = child;
6835 set_domain_attribute(sd, attr);
6448 6836
6449 return sd; 6837 return sd;
6450} 6838}
@@ -6585,7 +6973,6 @@ static int init_sched_domains(const struct cpumask *cpu_map)
6585 if (!doms_cur) 6973 if (!doms_cur)
6586 doms_cur = &fallback_doms; 6974 doms_cur = &fallback_doms;
6587 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 6975 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
6588 dattr_cur = NULL;
6589 err = build_sched_domains(doms_cur[0], NULL); 6976 err = build_sched_domains(doms_cur[0], NULL);
6590 register_sched_domain_sysctl(); 6977 register_sched_domain_sysctl();
6591 6978
@@ -6710,97 +7097,6 @@ match2:
6710 mutex_unlock(&sched_domains_mutex); 7097 mutex_unlock(&sched_domains_mutex);
6711} 7098}
6712 7099
6713#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6714static void reinit_sched_domains(void)
6715{
6716 get_online_cpus();
6717
6718 /* Destroy domains first to force the rebuild */
6719 partition_sched_domains(0, NULL, NULL);
6720
6721 rebuild_sched_domains();
6722 put_online_cpus();
6723}
6724
6725static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6726{
6727 unsigned int level = 0;
6728
6729 if (sscanf(buf, "%u", &level) != 1)
6730 return -EINVAL;
6731
6732 /*
6733 * level is always be positive so don't check for
6734 * level < POWERSAVINGS_BALANCE_NONE which is 0
6735 * What happens on 0 or 1 byte write,
6736 * need to check for count as well?
6737 */
6738
6739 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
6740 return -EINVAL;
6741
6742 if (smt)
6743 sched_smt_power_savings = level;
6744 else
6745 sched_mc_power_savings = level;
6746
6747 reinit_sched_domains();
6748
6749 return count;
6750}
6751
6752#ifdef CONFIG_SCHED_MC
6753static ssize_t sched_mc_power_savings_show(struct device *dev,
6754 struct device_attribute *attr,
6755 char *buf)
6756{
6757 return sprintf(buf, "%u\n", sched_mc_power_savings);
6758}
6759static ssize_t sched_mc_power_savings_store(struct device *dev,
6760 struct device_attribute *attr,
6761 const char *buf, size_t count)
6762{
6763 return sched_power_savings_store(buf, count, 0);
6764}
6765static DEVICE_ATTR(sched_mc_power_savings, 0644,
6766 sched_mc_power_savings_show,
6767 sched_mc_power_savings_store);
6768#endif
6769
6770#ifdef CONFIG_SCHED_SMT
6771static ssize_t sched_smt_power_savings_show(struct device *dev,
6772 struct device_attribute *attr,
6773 char *buf)
6774{
6775 return sprintf(buf, "%u\n", sched_smt_power_savings);
6776}
6777static ssize_t sched_smt_power_savings_store(struct device *dev,
6778 struct device_attribute *attr,
6779 const char *buf, size_t count)
6780{
6781 return sched_power_savings_store(buf, count, 1);
6782}
6783static DEVICE_ATTR(sched_smt_power_savings, 0644,
6784 sched_smt_power_savings_show,
6785 sched_smt_power_savings_store);
6786#endif
6787
6788int __init sched_create_sysfs_power_savings_entries(struct device *dev)
6789{
6790 int err = 0;
6791
6792#ifdef CONFIG_SCHED_SMT
6793 if (smt_capable())
6794 err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
6795#endif
6796#ifdef CONFIG_SCHED_MC
6797 if (!err && mc_capable())
6798 err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
6799#endif
6800 return err;
6801}
6802#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
6803
6804/* 7100/*
6805 * Update cpusets according to cpu_active mask. If cpusets are 7101 * Update cpusets according to cpu_active mask. If cpusets are
6806 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 7102 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
@@ -6838,6 +7134,8 @@ void __init sched_init_smp(void)
6838 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 7134 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
6839 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 7135 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
6840 7136
7137 sched_init_numa();
7138
6841 get_online_cpus(); 7139 get_online_cpus();
6842 mutex_lock(&sched_domains_mutex); 7140 mutex_lock(&sched_domains_mutex);
6843 init_sched_domains(cpu_active_mask); 7141 init_sched_domains(cpu_active_mask);
@@ -7059,6 +7357,7 @@ void __init sched_init(void)
7059 /* May be allocated at isolcpus cmdline parse time */ 7357 /* May be allocated at isolcpus cmdline parse time */
7060 if (cpu_isolated_map == NULL) 7358 if (cpu_isolated_map == NULL)
7061 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 7359 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
7360 idle_thread_set_boot_cpu();
7062#endif 7361#endif
7063 init_sched_fair_class(); 7362 init_sched_fair_class();
7064 7363
@@ -7980,13 +8279,9 @@ static struct cftype cpu_files[] = {
7980 .write_u64 = cpu_rt_period_write_uint, 8279 .write_u64 = cpu_rt_period_write_uint,
7981 }, 8280 },
7982#endif 8281#endif
8282 { } /* terminate */
7983}; 8283};
7984 8284
7985static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
7986{
7987 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
7988}
7989
7990struct cgroup_subsys cpu_cgroup_subsys = { 8285struct cgroup_subsys cpu_cgroup_subsys = {
7991 .name = "cpu", 8286 .name = "cpu",
7992 .create = cpu_cgroup_create, 8287 .create = cpu_cgroup_create,
@@ -7994,8 +8289,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
7994 .can_attach = cpu_cgroup_can_attach, 8289 .can_attach = cpu_cgroup_can_attach,
7995 .attach = cpu_cgroup_attach, 8290 .attach = cpu_cgroup_attach,
7996 .exit = cpu_cgroup_exit, 8291 .exit = cpu_cgroup_exit,
7997 .populate = cpu_cgroup_populate,
7998 .subsys_id = cpu_cgroup_subsys_id, 8292 .subsys_id = cpu_cgroup_subsys_id,
8293 .base_cftypes = cpu_files,
7999 .early_init = 1, 8294 .early_init = 1,
8000}; 8295};
8001 8296
@@ -8180,13 +8475,9 @@ static struct cftype files[] = {
8180 .name = "stat", 8475 .name = "stat",
8181 .read_map = cpuacct_stats_show, 8476 .read_map = cpuacct_stats_show,
8182 }, 8477 },
8478 { } /* terminate */
8183}; 8479};
8184 8480
8185static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
8186{
8187 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
8188}
8189
8190/* 8481/*
8191 * charge this task's execution time to its accounting group. 8482 * charge this task's execution time to its accounting group.
8192 * 8483 *
@@ -8218,7 +8509,7 @@ struct cgroup_subsys cpuacct_subsys = {
8218 .name = "cpuacct", 8509 .name = "cpuacct",
8219 .create = cpuacct_create, 8510 .create = cpuacct_create,
8220 .destroy = cpuacct_destroy, 8511 .destroy = cpuacct_destroy,
8221 .populate = cpuacct_populate,
8222 .subsys_id = cpuacct_subsys_id, 8512 .subsys_id = cpuacct_subsys_id,
8513 .base_cftypes = files,
8223}; 8514};
8224#endif /* CONFIG_CGROUP_CPUACCT */ 8515#endif /* CONFIG_CGROUP_CPUACCT */