path: root/kernel/sched/core.c
Diffstat (limited to 'kernel/sched/core.c')
-rw-r--r--  kernel/sched/core.c  525
1 file changed, 403 insertions, 122 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 39eb6011bc38..468bdd44c1ba 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -142,9 +142,8 @@ const_debug unsigned int sysctl_sched_features =
142#define SCHED_FEAT(name, enabled) \ 142#define SCHED_FEAT(name, enabled) \
143 #name , 143 #name ,
144 144
145static __read_mostly char *sched_feat_names[] = { 145static const char * const sched_feat_names[] = {
146#include "features.h" 146#include "features.h"
147 NULL
148}; 147};
149 148
150#undef SCHED_FEAT 149#undef SCHED_FEAT
@@ -2082,7 +2081,6 @@ context_switch(struct rq *rq, struct task_struct *prev,
2082#endif 2081#endif
2083 2082
2084 /* Here we just switch the register state and the stack. */ 2083 /* Here we just switch the register state and the stack. */
2085 rcu_switch_from(prev);
2086 switch_to(prev, next, prev); 2084 switch_to(prev, next, prev);
2087 2085
2088 barrier(); 2086 barrier();
@@ -2162,11 +2160,73 @@ unsigned long this_cpu_load(void)
2162} 2160}
2163 2161
2164 2162
2163/*
2164 * Global load-average calculations
2165 *
2166 * We take a distributed and async approach to calculating the global load-avg
2167 * in order to minimize overhead.
2168 *
2169 * The global load average is an exponentially decaying average of nr_running +
2170 * nr_uninterruptible.
2171 *
2172 * Once every LOAD_FREQ:
2173 *
2174 * nr_active = 0;
2175 * for_each_possible_cpu(cpu)
2176 * nr_active += cpu_rq(cpu)->nr_running + cpu_rq(cpu)->nr_uninterruptible;
2177 *
2178 * avenrun[n] = avenrun[n] * exp_n + nr_active * (1 - exp_n)
2179 *
2180 * Due to a number of reasons the above turns into the mess below:
2181 *
2182 * - for_each_possible_cpu() is prohibitively expensive on machines with a
2183 * serious number of cpus; therefore we need to take a distributed approach
2184 * to calculating nr_active.
2185 *
2186 * \Sum_i x_i(t) = \Sum_i (x_i(t) - x_i(t_0)) | x_i(t_0) := 0
2187 * = \Sum_i \Sum_{j=1} (x_i(t_j) - x_i(t_j-1))
2188 *
2189 * So assuming nr_active := 0 when we start out -- true by definition -- we
2190 * can simply take per-cpu deltas and fold those into a global accumulator
2191 * to obtain the same result. See calc_load_fold_active().
2192 *
2193 * Furthermore, in order to avoid synchronizing all per-cpu delta folding
2194 * across the machine, we assume 10 ticks is sufficient time for every
2195 * cpu to have completed this task.
2196 *
2197 * This places an upper bound on the IRQ-off latency of the machine. Then
2198 * again, being late doesn't lose the delta, it just wrecks the sample.
2199 *
2200 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
2201 * this would add another cross-cpu cacheline miss and atomic operation
2202 * to the wakeup path. Instead we increment on whatever cpu the task ran
2203 * when it went into uninterruptible state and decrement on whatever cpu
2204 * did the wakeup. This means that only the sum of nr_uninterruptible over
2205 * all cpus yields the correct result.
2206 *
2207 * This covers the NO_HZ=n code; for extra headaches, see the comment below.
2208 */
2209
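
To make the distributed folding above concrete, here is a minimal standalone sketch of the idea (plain userspace C, illustrative only and not part of this patch; the toy_* names are made up): each cpu publishes only the change in its own nr_running + nr_uninterruptible since the last fold, and the sum of those deltas equals the instantaneous global nr_active.

#include <stdio.h>

#define NR_CPUS	4

struct toy_rq {
	long nr_running;
	long nr_uninterruptible;
	long calc_load_active;	/* what this cpu folded last time */
};

static struct toy_rq toy_rq[NR_CPUS];
static long calc_load_tasks;	/* global accumulator */

/* mirrors the idea of calc_load_fold_active(): publish only the delta */
static long toy_fold_active(struct toy_rq *rq)
{
	long nr_active = rq->nr_running + rq->nr_uninterruptible;
	long delta = nr_active - rq->calc_load_active;

	rq->calc_load_active = nr_active;
	return delta;
}

int main(void)
{
	int cpu;

	toy_rq[0].nr_running = 2;
	toy_rq[1].nr_uninterruptible = 1;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		calc_load_tasks += toy_fold_active(&toy_rq[cpu]);

	/* sum of deltas == \Sum_i (nr_running_i + nr_uninterruptible_i) == 3 */
	printf("nr_active = %ld\n", calc_load_tasks);
	return 0;
}
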
2165/* Variables and functions for calc_load */ 2210/* Variables and functions for calc_load */
2166static atomic_long_t calc_load_tasks; 2211static atomic_long_t calc_load_tasks;
2167static unsigned long calc_load_update; 2212static unsigned long calc_load_update;
2168unsigned long avenrun[3]; 2213unsigned long avenrun[3];
2169EXPORT_SYMBOL(avenrun); 2214EXPORT_SYMBOL(avenrun); /* should be removed */
2215
2216/**
2217 * get_avenrun - get the load average array
2218 * @loads: pointer to dest load array
2219 * @offset: offset to add
2220 * @shift: shift count to shift the result left
2221 *
2222 * These values are estimates at best, so no need for locking.
2223 */
2224void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2225{
2226 loads[0] = (avenrun[0] + offset) << shift;
2227 loads[1] = (avenrun[1] + offset) << shift;
2228 loads[2] = (avenrun[2] + offset) << shift;
2229}
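
For orientation, a consumer of get_avenrun() converts the fixed-point values back into the familiar "x.yy" form. The sketch below is modelled on what /proc/loadavg does; LOAD_INT(), LOAD_FRAC() and the FIXED_1/200 rounding offset are assumed to come from <linux/sched.h> and are not introduced by this patch.

/* sketch of a get_avenrun() consumer; not part of this patch */
#define LOAD_INT(x)  ((x) >> FSHIFT)
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

static void print_loadavg(void)
{
	unsigned long avnrun[3];

	/* the FIXED_1/200 offset rounds to the nearest 1/100th */
	get_avenrun(avnrun, FIXED_1 / 200, 0);

	printk(KERN_INFO "%lu.%02lu %lu.%02lu %lu.%02lu\n",
	       LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
	       LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
	       LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]));
}
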
2170 2230
2171static long calc_load_fold_active(struct rq *this_rq) 2231static long calc_load_fold_active(struct rq *this_rq)
2172{ 2232{
@@ -2183,6 +2243,9 @@ static long calc_load_fold_active(struct rq *this_rq)
2183 return delta; 2243 return delta;
2184} 2244}
2185 2245
2246/*
2247 * a1 = a0 * e + a * (1 - e)
2248 */
2186static unsigned long 2249static unsigned long
2187calc_load(unsigned long load, unsigned long exp, unsigned long active) 2250calc_load(unsigned long load, unsigned long exp, unsigned long active)
2188{ 2251{
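
calc_load() itself is untouched by this patch; as a reference, the a1 = a0 * e + a * (1 - e) update can be reproduced in a standalone program, assuming the usual fixed-point constants from <linux/sched.h> (FSHIFT = 11, FIXED_1 = 2048, EXP_1 = 1884).

#include <stdio.h>

#define FSHIFT	11			/* nr of bits of precision */
#define FIXED_1	(1 << FSHIFT)		/* 1.0 in fixed point */
#define EXP_1	1884			/* 1/exp(5sec/1min) in fixed point */

static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	return load >> FSHIFT;
}

int main(void)
{
	unsigned long avenrun0 = 0;
	int i;

	/* twelve 5-second samples with two tasks active, i.e. one minute */
	for (i = 0; i < 12; i++)
		avenrun0 = calc_load(avenrun0, EXP_1, 2 * FIXED_1);

	/* roughly 2 * (1 - 1/e) ~= 1.26 */
	printf("%lu.%02lu\n", avenrun0 >> FSHIFT,
	       ((avenrun0 & (FIXED_1 - 1)) * 100) >> FSHIFT);
	return 0;
}
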
@@ -2194,30 +2257,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
2194 2257
2195#ifdef CONFIG_NO_HZ 2258#ifdef CONFIG_NO_HZ
2196/* 2259/*
2197 * For NO_HZ we delay the active fold to the next LOAD_FREQ update. 2260 * Handle NO_HZ for the global load-average.
2261 *
2262 * Since the above described distributed algorithm to compute the global
2263 * load-average relies on per-cpu sampling from the tick, it is affected by
2264 * NO_HZ.
2265 *
2266 * The basic idea is to fold the nr_active delta into a global idle-delta upon
2267 * entering NO_HZ state such that we can include this as an 'extra' cpu delta
2268 * when we read the global state.
2269 *
2270 * Obviously reality has to ruin such a delightfully simple scheme:
2271 *
2272 * - When we go NO_HZ idle during the window, we can negate our sample
2273 * contribution, causing under-accounting.
2274 *
2275 * We avoid this by keeping two idle-delta counters and flipping them
2276 * when the window starts, thus separating old and new NO_HZ load.
2277 *
2278 * The only trick is the slight shift in index flip for read vs write.
2279 *
2280 * 0s 5s 10s 15s
2281 * +10 +10 +10 +10
2282 * |-|-----------|-|-----------|-|-----------|-|
2283 * r:0 0 1 1 0 0 1 1 0
2284 * w:0 1 1 0 0 1 1 0 0
2285 *
2286 * This ensures we'll fold the old idle contribution in this window while
2287 * accumulating the new one.
2288 *
2289 * - When we wake up from NO_HZ idle during the window, we push up our
2290 * contribution, since we effectively move our sample point to a known
2291 * busy state.
2292 *
2293 * This is solved by pushing the window forward, and thus skipping the
2294 * sample, for this cpu (effectively using the idle-delta for this cpu which
2295 * was in effect at the time the window opened). This also solves the issue
2296 * of having to deal with a cpu having been in NOHZ idle for multiple
2297 * LOAD_FREQ intervals.
2198 * 2298 *
2199 * When making the ILB scale, we should try to pull this in as well. 2299 * When making the ILB scale, we should try to pull this in as well.
2200 */ 2300 */
2201static atomic_long_t calc_load_tasks_idle; 2301static atomic_long_t calc_load_idle[2];
2302static int calc_load_idx;
2303
2304static inline int calc_load_write_idx(void)
2305{
2306 int idx = calc_load_idx;
2307
2308 /*
2309 * See calc_global_nohz(): if we observe the new index, we also
2310 * need to observe the new update time.
2311 */
2312 smp_rmb();
2313
2314 /*
2315 * If the folding window started, make sure we start writing in the
2316 * next idle-delta.
2317 */
2318 if (!time_before(jiffies, calc_load_update))
2319 idx++;
2202 2320
2203void calc_load_account_idle(struct rq *this_rq) 2321 return idx & 1;
2322}
2323
2324static inline int calc_load_read_idx(void)
2204{ 2325{
2326 return calc_load_idx & 1;
2327}
2328
2329void calc_load_enter_idle(void)
2330{
2331 struct rq *this_rq = this_rq();
2205 long delta; 2332 long delta;
2206 2333
2334 /*
2335 * We're going into NOHZ mode; if there's any pending delta, fold it
2336 * into the pending idle delta.
2337 */
2207 delta = calc_load_fold_active(this_rq); 2338 delta = calc_load_fold_active(this_rq);
2208 if (delta) 2339 if (delta) {
2209 atomic_long_add(delta, &calc_load_tasks_idle); 2340 int idx = calc_load_write_idx();
2341 atomic_long_add(delta, &calc_load_idle[idx]);
2342 }
2210} 2343}
2211 2344
2212static long calc_load_fold_idle(void) 2345void calc_load_exit_idle(void)
2213{ 2346{
2214 long delta = 0; 2347 struct rq *this_rq = this_rq();
2215 2348
2216 /* 2349 /*
2217 * Its got a race, we don't care... 2350 * If we're still before the sample window, we're done.
2218 */ 2351 */
2219 if (atomic_long_read(&calc_load_tasks_idle)) 2352 if (time_before(jiffies, this_rq->calc_load_update))
2220 delta = atomic_long_xchg(&calc_load_tasks_idle, 0); 2353 return;
2354
2355 /*
2356 * We woke inside or after the sample window; this means we're already
2357 * accounted through the nohz accounting, so skip the entire deal and
2358 * sync up for the next window.
2359 */
2360 this_rq->calc_load_update = calc_load_update;
2361 if (time_before(jiffies, this_rq->calc_load_update + 10))
2362 this_rq->calc_load_update += LOAD_FREQ;
2363}
2364
2365static long calc_load_fold_idle(void)
2366{
2367 int idx = calc_load_read_idx();
2368 long delta = 0;
2369
2370 if (atomic_long_read(&calc_load_idle[idx]))
2371 delta = atomic_long_xchg(&calc_load_idle[idx], 0);
2221 2372
2222 return delta; 2373 return delta;
2223} 2374}
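
A toy model of the two-slot scheme may help (plain userspace C, no atomics or memory barriers, so only a sketch of the logic rather than kernel code): a cpu that goes idle after the folding window has opened must add its delta to the slot the reader is not about to drain, and calc_global_nohz() flips the index once the old slot has been consumed.

#include <stdio.h>

static long idle_delta[2];
static int load_idx;
static int window_open;	/* stands in for !time_before(jiffies, calc_load_update) */

static void enter_idle(long delta)
{
	int idx = (load_idx + window_open) & 1;	/* calc_load_write_idx() */

	idle_delta[idx] += delta;
}

static long fold_idle(void)
{
	int idx = load_idx & 1;			/* calc_load_read_idx() */
	long delta = idle_delta[idx];

	idle_delta[idx] = 0;
	return delta;
}

int main(void)
{
	enter_idle(-2);		/* cpu went idle before the window opened */
	window_open = 1;
	enter_idle(-1);		/* must not be lost, nor folded this round */

	printf("folded now: %ld, left for next window: %ld\n",
	       fold_idle(), idle_delta[(load_idx + 1) & 1]);

	load_idx++;		/* calc_global_nohz() flips the index */
	window_open = 0;
	return 0;
}
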
@@ -2303,66 +2454,39 @@ static void calc_global_nohz(void)
2303{ 2454{
2304 long delta, active, n; 2455 long delta, active, n;
2305 2456
2306 /* 2457 if (!time_before(jiffies, calc_load_update + 10)) {
2307 * If we crossed a calc_load_update boundary, make sure to fold 2458 /*
2308 * any pending idle changes, the respective CPUs might have 2459 * Catch-up, fold however many we are behind still
2309 * missed the tick driven calc_load_account_active() update 2460 */
2310 * due to NO_HZ. 2461 delta = jiffies - calc_load_update - 10;
2311 */ 2462 n = 1 + (delta / LOAD_FREQ);
2312 delta = calc_load_fold_idle();
2313 if (delta)
2314 atomic_long_add(delta, &calc_load_tasks);
2315
2316 /*
2317 * It could be the one fold was all it took, we done!
2318 */
2319 if (time_before(jiffies, calc_load_update + 10))
2320 return;
2321
2322 /*
2323 * Catch-up, fold however many we are behind still
2324 */
2325 delta = jiffies - calc_load_update - 10;
2326 n = 1 + (delta / LOAD_FREQ);
2327 2463
2328 active = atomic_long_read(&calc_load_tasks); 2464 active = atomic_long_read(&calc_load_tasks);
2329 active = active > 0 ? active * FIXED_1 : 0; 2465 active = active > 0 ? active * FIXED_1 : 0;
2330 2466
2331 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); 2467 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
2332 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); 2468 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
2333 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); 2469 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
2334 2470
2335 calc_load_update += n * LOAD_FREQ; 2471 calc_load_update += n * LOAD_FREQ;
2336} 2472 }
2337#else
2338void calc_load_account_idle(struct rq *this_rq)
2339{
2340}
2341 2473
2342static inline long calc_load_fold_idle(void) 2474 /*
2343{ 2475 * Flip the idle index...
2344 return 0; 2476 *
2477 * Make sure we first write the new time then flip the index, so that
2478 * calc_load_write_idx() will see the new time when it reads the new
2479 * index, this avoids a double flip messing things up.
2480 */
2481 smp_wmb();
2482 calc_load_idx++;
2345} 2483}
2484#else /* !CONFIG_NO_HZ */
2346 2485
2347static void calc_global_nohz(void) 2486static inline long calc_load_fold_idle(void) { return 0; }
2348{ 2487static inline void calc_global_nohz(void) { }
2349}
2350#endif
2351 2488
2352/** 2489#endif /* CONFIG_NO_HZ */
2353 * get_avenrun - get the load average array
2354 * @loads: pointer to dest load array
2355 * @offset: offset to add
2356 * @shift: shift count to shift the result left
2357 *
2358 * These values are estimates at best, so no need for locking.
2359 */
2360void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2361{
2362 loads[0] = (avenrun[0] + offset) << shift;
2363 loads[1] = (avenrun[1] + offset) << shift;
2364 loads[2] = (avenrun[2] + offset) << shift;
2365}
2366 2490
2367/* 2491/*
2368 * calc_load - update the avenrun load estimates 10 ticks after the 2492 * calc_load - update the avenrun load estimates 10 ticks after the
@@ -2370,11 +2494,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2370 */ 2494 */
2371void calc_global_load(unsigned long ticks) 2495void calc_global_load(unsigned long ticks)
2372{ 2496{
2373 long active; 2497 long active, delta;
2374 2498
2375 if (time_before(jiffies, calc_load_update + 10)) 2499 if (time_before(jiffies, calc_load_update + 10))
2376 return; 2500 return;
2377 2501
2502 /*
2503 * Fold the 'old' idle-delta to include all NO_HZ cpus.
2504 */
2505 delta = calc_load_fold_idle();
2506 if (delta)
2507 atomic_long_add(delta, &calc_load_tasks);
2508
2378 active = atomic_long_read(&calc_load_tasks); 2509 active = atomic_long_read(&calc_load_tasks);
2379 active = active > 0 ? active * FIXED_1 : 0; 2510 active = active > 0 ? active * FIXED_1 : 0;
2380 2511
@@ -2385,12 +2516,7 @@ void calc_global_load(unsigned long ticks)
2385 calc_load_update += LOAD_FREQ; 2516 calc_load_update += LOAD_FREQ;
2386 2517
2387 /* 2518 /*
2388 * Account one period with whatever state we found before 2519 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
2389 * folding in the nohz state and ageing the entire idle period.
2390 *
2391 * This avoids loosing a sample when we go idle between
2392 * calc_load_account_active() (10 ticks ago) and now and thus
2393 * under-accounting.
2394 */ 2520 */
2395 calc_global_nohz(); 2521 calc_global_nohz();
2396} 2522}
@@ -2407,7 +2533,6 @@ static void calc_load_account_active(struct rq *this_rq)
2407 return; 2533 return;
2408 2534
2409 delta = calc_load_fold_active(this_rq); 2535 delta = calc_load_fold_active(this_rq);
2410 delta += calc_load_fold_idle();
2411 if (delta) 2536 if (delta)
2412 atomic_long_add(delta, &calc_load_tasks); 2537 atomic_long_add(delta, &calc_load_tasks);
2413 2538
@@ -2415,6 +2540,10 @@ static void calc_load_account_active(struct rq *this_rq)
2415} 2540}
2416 2541
2417/* 2542/*
2543 * End of global load-average stuff
2544 */
2545
2546/*
2418 * The exact cpuload at various idx values, calculated at every tick would be 2547 * The exact cpuload at various idx values, calculated at every tick would be
2419 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load 2548 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
2420 * 2549 *
@@ -2517,25 +2646,32 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
2517 sched_avg_update(this_rq); 2646 sched_avg_update(this_rq);
2518} 2647}
2519 2648
2649#ifdef CONFIG_NO_HZ
2650/*
2651 * There is no sane way to deal with nohz on smp when using jiffies because the
2652 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
2653 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
2654 *
2655 * Therefore we cannot use the delta approach from the regular tick since that
2656 * would seriously skew the load calculation. However we'll make do for those
2657 * updates happening while idle (nohz_idle_balance) or coming out of idle
2658 * (tick_nohz_idle_exit).
2659 *
2660 * This means we might still be one tick off for nohz periods.
2661 */
2662
2520/* 2663/*
2521 * Called from nohz_idle_balance() to update the load ratings before doing the 2664 * Called from nohz_idle_balance() to update the load ratings before doing the
2522 * idle balance. 2665 * idle balance.
2523 */ 2666 */
2524void update_idle_cpu_load(struct rq *this_rq) 2667void update_idle_cpu_load(struct rq *this_rq)
2525{ 2668{
2526 unsigned long curr_jiffies = jiffies; 2669 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2527 unsigned long load = this_rq->load.weight; 2670 unsigned long load = this_rq->load.weight;
2528 unsigned long pending_updates; 2671 unsigned long pending_updates;
2529 2672
2530 /* 2673 /*
2531 * Bloody broken means of dealing with nohz, but better than nothing.. 2674 * bail if there's load or we're actually up-to-date.
2532 * jiffies is updated by one cpu, another cpu can drift wrt the jiffy
2533 * update and see 0 difference the one time and 2 the next, even though
2534 * we ticked at roughtly the same rate.
2535 *
2536 * Hence we only use this from nohz_idle_balance() and skip this
2537 * nonsense when called from the scheduler_tick() since that's
2538 * guaranteed a stable rate.
2539 */ 2675 */
2540 if (load || curr_jiffies == this_rq->last_load_update_tick) 2676 if (load || curr_jiffies == this_rq->last_load_update_tick)
2541 return; 2677 return;
@@ -2547,12 +2683,38 @@ void update_idle_cpu_load(struct rq *this_rq)
2547} 2683}
2548 2684
2549/* 2685/*
2686 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
2687 */
2688void update_cpu_load_nohz(void)
2689{
2690 struct rq *this_rq = this_rq();
2691 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2692 unsigned long pending_updates;
2693
2694 if (curr_jiffies == this_rq->last_load_update_tick)
2695 return;
2696
2697 raw_spin_lock(&this_rq->lock);
2698 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2699 if (pending_updates) {
2700 this_rq->last_load_update_tick = curr_jiffies;
2701 /*
2702 * We were idle, which means a load of 0; the current load might be
2703 * !0 due to remote wakeups and the like.
2704 */
2705 __update_cpu_load(this_rq, 0, pending_updates);
2706 }
2707 raw_spin_unlock(&this_rq->lock);
2708}
2709#endif /* CONFIG_NO_HZ */
2710
2711/*
2550 * Called from scheduler_tick() 2712 * Called from scheduler_tick()
2551 */ 2713 */
2552static void update_cpu_load_active(struct rq *this_rq) 2714static void update_cpu_load_active(struct rq *this_rq)
2553{ 2715{
2554 /* 2716 /*
2555 * See the mess in update_idle_cpu_load(). 2717 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
2556 */ 2718 */
2557 this_rq->last_load_update_tick = jiffies; 2719 this_rq->last_load_update_tick = jiffies;
2558 __update_cpu_load(this_rq, this_rq->load.weight, 1); 2720 __update_cpu_load(this_rq, this_rq->load.weight, 1);
@@ -4982,7 +5144,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4982 p->sched_class->set_cpus_allowed(p, new_mask); 5144 p->sched_class->set_cpus_allowed(p, new_mask);
4983 5145
4984 cpumask_copy(&p->cpus_allowed, new_mask); 5146 cpumask_copy(&p->cpus_allowed, new_mask);
4985 p->rt.nr_cpus_allowed = cpumask_weight(new_mask); 5147 p->nr_cpus_allowed = cpumask_weight(new_mask);
4986} 5148}
4987 5149
4988/* 5150/*
@@ -5524,15 +5686,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
5524 5686
5525#ifdef CONFIG_SCHED_DEBUG 5687#ifdef CONFIG_SCHED_DEBUG
5526 5688
5527static __read_mostly int sched_domain_debug_enabled; 5689static __read_mostly int sched_debug_enabled;
5528 5690
5529static int __init sched_domain_debug_setup(char *str) 5691static int __init sched_debug_setup(char *str)
5530{ 5692{
5531 sched_domain_debug_enabled = 1; 5693 sched_debug_enabled = 1;
5532 5694
5533 return 0; 5695 return 0;
5534} 5696}
5535early_param("sched_debug", sched_domain_debug_setup); 5697early_param("sched_debug", sched_debug_setup);
5698
5699static inline bool sched_debug(void)
5700{
5701 return sched_debug_enabled;
5702}
5536 5703
5537static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 5704static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5538 struct cpumask *groupmask) 5705 struct cpumask *groupmask)
@@ -5572,7 +5739,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5572 break; 5739 break;
5573 } 5740 }
5574 5741
5575 if (!group->sgp->power) { 5742 /*
5743 * Even though we initialize ->power to something semi-sane,
5744 * we leave power_orig unset. This allows us to detect if
5745 * domain iteration is still funny without causing /0 traps.
5746 */
5747 if (!group->sgp->power_orig) {
5576 printk(KERN_CONT "\n"); 5748 printk(KERN_CONT "\n");
5577 printk(KERN_ERR "ERROR: domain->cpu_power not " 5749 printk(KERN_ERR "ERROR: domain->cpu_power not "
5578 "set\n"); 5750 "set\n");
@@ -5620,7 +5792,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5620{ 5792{
5621 int level = 0; 5793 int level = 0;
5622 5794
5623 if (!sched_domain_debug_enabled) 5795 if (!sched_debug_enabled)
5624 return; 5796 return;
5625 5797
5626 if (!sd) { 5798 if (!sd) {
@@ -5641,6 +5813,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5641} 5813}
5642#else /* !CONFIG_SCHED_DEBUG */ 5814#else /* !CONFIG_SCHED_DEBUG */
5643# define sched_domain_debug(sd, cpu) do { } while (0) 5815# define sched_domain_debug(sd, cpu) do { } while (0)
5816static inline bool sched_debug(void)
5817{
5818 return false;
5819}
5644#endif /* CONFIG_SCHED_DEBUG */ 5820#endif /* CONFIG_SCHED_DEBUG */
5645 5821
5646static int sd_degenerate(struct sched_domain *sd) 5822static int sd_degenerate(struct sched_domain *sd)
@@ -5962,6 +6138,44 @@ struct sched_domain_topology_level {
5962 struct sd_data data; 6138 struct sd_data data;
5963}; 6139};
5964 6140
6141/*
6142 * Build an iteration mask that can exclude certain CPUs from the upwards
6143 * domain traversal.
6144 *
6145 * Asymmetric node setups can result in situations where the domain tree is of
6146 * unequal depth, make sure to skip domains that already cover the entire
6147 * range.
6148 *
6149 * In that case build_sched_domains() will have terminated the iteration early
6150 * and our sibling sd spans will be empty. Domains should always include the
6151 * cpu they're built on, so check that.
6152 *
6153 */
6154static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
6155{
6156 const struct cpumask *span = sched_domain_span(sd);
6157 struct sd_data *sdd = sd->private;
6158 struct sched_domain *sibling;
6159 int i;
6160
6161 for_each_cpu(i, span) {
6162 sibling = *per_cpu_ptr(sdd->sd, i);
6163 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
6164 continue;
6165
6166 cpumask_set_cpu(i, sched_group_mask(sg));
6167 }
6168}
6169
6170/*
6171 * Return the canonical balance cpu for this group; this is the first cpu
6172 * of this group that's also in the iteration mask.
6173 */
6174int group_balance_cpu(struct sched_group *sg)
6175{
6176 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
6177}
6178
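
As a small illustration of what group_balance_cpu() computes, using plain bitmasks instead of struct cpumask (a sketch only, not kernel code): the balance cpu is the first cpu that is both a member of the group and allowed by the group's iteration mask.

#include <stdio.h>

static int first_and(unsigned long cpus, unsigned long mask)
{
	unsigned long both = cpus & mask;

	/* cpumask_first_and() analogue: index of lowest set bit, or -1 */
	return both ? __builtin_ctzl(both) : -1;
}

int main(void)
{
	unsigned long group_cpus = 0x0f;	/* cpus 0-3 are in the group */
	unsigned long group_mask = 0x0c;	/* only cpus 2,3 may balance it */

	printf("balance cpu: %d\n", first_and(group_cpus, group_mask)); /* 2 */
	return 0;
}
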
5965static int 6179static int
5966build_overlap_sched_groups(struct sched_domain *sd, int cpu) 6180build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5967{ 6181{
@@ -5980,6 +6194,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5980 if (cpumask_test_cpu(i, covered)) 6194 if (cpumask_test_cpu(i, covered))
5981 continue; 6195 continue;
5982 6196
6197 child = *per_cpu_ptr(sdd->sd, i);
6198
6199 /* See the comment near build_group_mask(). */
6200 if (!cpumask_test_cpu(i, sched_domain_span(child)))
6201 continue;
6202
5983 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 6203 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
5984 GFP_KERNEL, cpu_to_node(cpu)); 6204 GFP_KERNEL, cpu_to_node(cpu));
5985 6205
@@ -5987,8 +6207,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5987 goto fail; 6207 goto fail;
5988 6208
5989 sg_span = sched_group_cpus(sg); 6209 sg_span = sched_group_cpus(sg);
5990
5991 child = *per_cpu_ptr(sdd->sd, i);
5992 if (child->child) { 6210 if (child->child) {
5993 child = child->child; 6211 child = child->child;
5994 cpumask_copy(sg_span, sched_domain_span(child)); 6212 cpumask_copy(sg_span, sched_domain_span(child));
@@ -5997,10 +6215,24 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5997 6215
5998 cpumask_or(covered, covered, sg_span); 6216 cpumask_or(covered, covered, sg_span);
5999 6217
6000 sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); 6218 sg->sgp = *per_cpu_ptr(sdd->sgp, i);
6001 atomic_inc(&sg->sgp->ref); 6219 if (atomic_inc_return(&sg->sgp->ref) == 1)
6220 build_group_mask(sd, sg);
6002 6221
6003 if (cpumask_test_cpu(cpu, sg_span)) 6222 /*
6223 * Initialize sgp->power such that even if we mess up the
6224 * domains and no possible iteration will get us here, we won't
6225 * die on a /0 trap.
6226 */
6227 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
6228
6229 /*
6230 * Make sure the first group of this domain contains the
6231 * canonical balance cpu. Otherwise the sched_domain iteration
6232 * breaks. See update_sg_lb_stats().
6233 */
6234 if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
6235 group_balance_cpu(sg) == cpu)
6004 groups = sg; 6236 groups = sg;
6005 6237
6006 if (!first) 6238 if (!first)
@@ -6074,6 +6306,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
6074 6306
6075 cpumask_clear(sched_group_cpus(sg)); 6307 cpumask_clear(sched_group_cpus(sg));
6076 sg->sgp->power = 0; 6308 sg->sgp->power = 0;
6309 cpumask_setall(sched_group_mask(sg));
6077 6310
6078 for_each_cpu(j, span) { 6311 for_each_cpu(j, span) {
6079 if (get_group(j, sdd, NULL) != group) 6312 if (get_group(j, sdd, NULL) != group)
@@ -6115,7 +6348,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6115 sg = sg->next; 6348 sg = sg->next;
6116 } while (sg != sd->groups); 6349 } while (sg != sd->groups);
6117 6350
6118 if (cpu != group_first_cpu(sg)) 6351 if (cpu != group_balance_cpu(sg))
6119 return; 6352 return;
6120 6353
6121 update_group_power(sd, cpu); 6354 update_group_power(sd, cpu);
@@ -6165,11 +6398,8 @@ int sched_domain_level_max;
6165 6398
6166static int __init setup_relax_domain_level(char *str) 6399static int __init setup_relax_domain_level(char *str)
6167{ 6400{
6168 unsigned long val; 6401 if (kstrtoint(str, 0, &default_relax_domain_level))
6169 6402 pr_warn("Unable to set relax_domain_level\n");
6170 val = simple_strtoul(str, NULL, 0);
6171 if (val < sched_domain_level_max)
6172 default_relax_domain_level = val;
6173 6403
6174 return 1; 6404 return 1;
6175} 6405}
@@ -6279,14 +6509,13 @@ static struct sched_domain_topology_level *sched_domain_topology = default_topol
6279#ifdef CONFIG_NUMA 6509#ifdef CONFIG_NUMA
6280 6510
6281static int sched_domains_numa_levels; 6511static int sched_domains_numa_levels;
6282static int sched_domains_numa_scale;
6283static int *sched_domains_numa_distance; 6512static int *sched_domains_numa_distance;
6284static struct cpumask ***sched_domains_numa_masks; 6513static struct cpumask ***sched_domains_numa_masks;
6285static int sched_domains_curr_level; 6514static int sched_domains_curr_level;
6286 6515
6287static inline int sd_local_flags(int level) 6516static inline int sd_local_flags(int level)
6288{ 6517{
6289 if (sched_domains_numa_distance[level] > REMOTE_DISTANCE) 6518 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
6290 return 0; 6519 return 0;
6291 6520
6292 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; 6521 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
@@ -6344,6 +6573,42 @@ static const struct cpumask *sd_numa_mask(int cpu)
6344 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; 6573 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
6345} 6574}
6346 6575
6576static void sched_numa_warn(const char *str)
6577{
6578 static int done = false;
6579 int i,j;
6580
6581 if (done)
6582 return;
6583
6584 done = true;
6585
6586 printk(KERN_WARNING "ERROR: %s\n\n", str);
6587
6588 for (i = 0; i < nr_node_ids; i++) {
6589 printk(KERN_WARNING " ");
6590 for (j = 0; j < nr_node_ids; j++)
6591 printk(KERN_CONT "%02d ", node_distance(i,j));
6592 printk(KERN_CONT "\n");
6593 }
6594 printk(KERN_WARNING "\n");
6595}
6596
6597static bool find_numa_distance(int distance)
6598{
6599 int i;
6600
6601 if (distance == node_distance(0, 0))
6602 return true;
6603
6604 for (i = 0; i < sched_domains_numa_levels; i++) {
6605 if (sched_domains_numa_distance[i] == distance)
6606 return true;
6607 }
6608
6609 return false;
6610}
6611
6347static void sched_init_numa(void) 6612static void sched_init_numa(void)
6348{ 6613{
6349 int next_distance, curr_distance = node_distance(0, 0); 6614 int next_distance, curr_distance = node_distance(0, 0);
@@ -6351,7 +6616,6 @@ static void sched_init_numa(void)
6351 int level = 0; 6616 int level = 0;
6352 int i, j, k; 6617 int i, j, k;
6353 6618
6354 sched_domains_numa_scale = curr_distance;
6355 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); 6619 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
6356 if (!sched_domains_numa_distance) 6620 if (!sched_domains_numa_distance)
6357 return; 6621 return;
@@ -6362,23 +6626,41 @@ static void sched_init_numa(void)
6362 * 6626 *
6363 * Assumes node_distance(0,j) includes all distances in 6627 * Assumes node_distance(0,j) includes all distances in
6364 * node_distance(i,j) in order to avoid cubic time. 6628 * node_distance(i,j) in order to avoid cubic time.
6365 *
6366 * XXX: could be optimized to O(n log n) by using sort()
6367 */ 6629 */
6368 next_distance = curr_distance; 6630 next_distance = curr_distance;
6369 for (i = 0; i < nr_node_ids; i++) { 6631 for (i = 0; i < nr_node_ids; i++) {
6370 for (j = 0; j < nr_node_ids; j++) { 6632 for (j = 0; j < nr_node_ids; j++) {
6371 int distance = node_distance(0, j); 6633 for (k = 0; k < nr_node_ids; k++) {
6372 if (distance > curr_distance && 6634 int distance = node_distance(i, k);
6373 (distance < next_distance || 6635
6374 next_distance == curr_distance)) 6636 if (distance > curr_distance &&
6375 next_distance = distance; 6637 (distance < next_distance ||
6638 next_distance == curr_distance))
6639 next_distance = distance;
6640
6641 /*
6642 * While not a strong assumption, it would be nice to know
6643 * about cases where node A is connected to B but B is not
6644 * equally connected to A.
6645 */
6646 if (sched_debug() && node_distance(k, i) != distance)
6647 sched_numa_warn("Node-distance not symmetric");
6648
6649 if (sched_debug() && i && !find_numa_distance(distance))
6650 sched_numa_warn("Node-0 not representative");
6651 }
6652 if (next_distance != curr_distance) {
6653 sched_domains_numa_distance[level++] = next_distance;
6654 sched_domains_numa_levels = level;
6655 curr_distance = next_distance;
6656 } else break;
6376 } 6657 }
6377 if (next_distance != curr_distance) { 6658
6378 sched_domains_numa_distance[level++] = next_distance; 6659 /*
6379 sched_domains_numa_levels = level; 6660 * In case of sched_debug() we verify the above assumption.
6380 curr_distance = next_distance; 6661 */
6381 } else break; 6662 if (!sched_debug())
6663 break;
6382 } 6664 }
6383 /* 6665 /*
6384 * 'level' contains the number of unique distances, excluding the 6666 * 'level' contains the number of unique distances, excluding the
@@ -6403,7 +6685,7 @@ static void sched_init_numa(void)
6403 return; 6685 return;
6404 6686
6405 for (j = 0; j < nr_node_ids; j++) { 6687 for (j = 0; j < nr_node_ids; j++) {
6406 struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j); 6688 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
6407 if (!mask) 6689 if (!mask)
6408 return; 6690 return;
6409 6691
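
The net effect of the triple loop above is to collect the unique node distances in increasing order, starting just above the local distance. A compact userspace sketch of that result, with a mocked distance table standing in for node_distance()/the SLIT:

#include <stdio.h>

#define NR_NODES 4

/* mocked SLIT-style distance table */
static const int dist[NR_NODES][NR_NODES] = {
	{ 10, 20, 20, 30 },
	{ 20, 10, 30, 20 },
	{ 20, 30, 10, 20 },
	{ 30, 20, 20, 10 },
};

int main(void)
{
	int levels[NR_NODES * NR_NODES];
	int curr = dist[0][0], next;
	int nr_levels = 0, i, k;

	for (;;) {
		next = curr;
		for (i = 0; i < NR_NODES; i++) {
			for (k = 0; k < NR_NODES; k++) {
				int d = dist[i][k];

				/* smallest distance strictly above curr */
				if (d > curr && (d < next || next == curr))
					next = d;
			}
		}
		if (next == curr)
			break;
		levels[nr_levels++] = next;
		curr = next;
	}

	printf("%d distance levels beyond local:", nr_levels);
	for (i = 0; i < nr_levels; i++)
		printf(" %d", levels[i]);
	printf("\n");	/* "2 distance levels beyond local: 20 30" */
	return 0;
}
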
@@ -6490,7 +6772,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
6490 6772
6491 *per_cpu_ptr(sdd->sg, j) = sg; 6773 *per_cpu_ptr(sdd->sg, j) = sg;
6492 6774
6493 sgp = kzalloc_node(sizeof(struct sched_group_power), 6775 sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
6494 GFP_KERNEL, cpu_to_node(j)); 6776 GFP_KERNEL, cpu_to_node(j));
6495 if (!sgp) 6777 if (!sgp)
6496 return -ENOMEM; 6778 return -ENOMEM;
@@ -6543,7 +6825,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6543 if (!sd) 6825 if (!sd)
6544 return child; 6826 return child;
6545 6827
6546 set_domain_attribute(sd, attr);
6547 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); 6828 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6548 if (child) { 6829 if (child) {
6549 sd->level = child->level + 1; 6830 sd->level = child->level + 1;
@@ -6551,6 +6832,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6551 child->parent = sd; 6832 child->parent = sd;
6552 } 6833 }
6553 sd->child = child; 6834 sd->child = child;
6835 set_domain_attribute(sd, attr);
6554 6836
6555 return sd; 6837 return sd;
6556} 6838}
@@ -6691,7 +6973,6 @@ static int init_sched_domains(const struct cpumask *cpu_map)
6691 if (!doms_cur) 6973 if (!doms_cur)
6692 doms_cur = &fallback_doms; 6974 doms_cur = &fallback_doms;
6693 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 6975 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
6694 dattr_cur = NULL;
6695 err = build_sched_domains(doms_cur[0], NULL); 6976 err = build_sched_domains(doms_cur[0], NULL);
6696 register_sched_domain_sysctl(); 6977 register_sched_domain_sysctl();
6697 6978