author:    Jens Axboe <axboe@kernel.dk>  2012-07-30 03:03:10 -0400
committer: Jens Axboe <axboe@kernel.dk>  2012-07-30 03:03:10 -0400
commit:    72ea1f74fcdf874cca6d2c0962379523bbd99e2c (patch)
tree:      4c67be6c73356086ff44ef1b8b1c9479702689ca /kernel/sched
parent:    b1af9be5ef77898c05667bb9dbf3b180d91d3292 (diff)
parent:    a73ff3231df59a4b92ccd0dd4e73897c5822489b (diff)

Merge branch 'for-jens' of git://git.drbd.org/linux-drbd into for-3.6/drivers
Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/core.c       | 525
-rw-r--r--  kernel/sched/fair.c       |  71
-rw-r--r--  kernel/sched/idle_task.c  |   1
-rw-r--r--  kernel/sched/rt.c         |  53
-rw-r--r--  kernel/sched/sched.h      |   4
5 files changed, 479 insertions, 175 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 39eb6011bc38..468bdd44c1ba 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -142,9 +142,8 @@ const_debug unsigned int sysctl_sched_features =
142#define SCHED_FEAT(name, enabled) \ 142#define SCHED_FEAT(name, enabled) \
143 #name , 143 #name ,
144 144
145static __read_mostly char *sched_feat_names[] = { 145static const char * const sched_feat_names[] = {
146#include "features.h" 146#include "features.h"
147 NULL
148}; 147};
149 148
150#undef SCHED_FEAT 149#undef SCHED_FEAT
@@ -2082,7 +2081,6 @@ context_switch(struct rq *rq, struct task_struct *prev,
2082#endif 2081#endif
2083 2082
2084 /* Here we just switch the register state and the stack. */ 2083 /* Here we just switch the register state and the stack. */
2085 rcu_switch_from(prev);
2086 switch_to(prev, next, prev); 2084 switch_to(prev, next, prev);
2087 2085
2088 barrier(); 2086 barrier();
@@ -2162,11 +2160,73 @@ unsigned long this_cpu_load(void)
2162} 2160}
2163 2161
2164 2162
2163/*
2164 * Global load-average calculations
2165 *
2166 * We take a distributed and async approach to calculating the global load-avg
2167 * in order to minimize overhead.
2168 *
2169 * The global load average is an exponentially decaying average of nr_running +
2170 * nr_uninterruptible.
2171 *
2172 * Once every LOAD_FREQ:
2173 *
2174 * nr_active = 0;
2175 * for_each_possible_cpu(cpu)
2176 * nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
2177 *
2178 * avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
2179 *
2180 * Due to a number of reasons the above turns in the mess below:
2181 *
2182 * - for_each_possible_cpu() is prohibitively expensive on machines with
2183 * serious number of cpus, therefore we need to take a distributed approach
2184 * to calculating nr_active.
2185 *
2186 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
2187 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
2188 *
2189 * So assuming nr_active := 0 when we start out -- true per definition, we
2190 * can simply take per-cpu deltas and fold those into a global accumulate
2191 * to obtain the same result. See calc_load_fold_active().
2192 *
2193 * Furthermore, in order to avoid synchronizing all per-cpu delta folding
2194 * across the machine, we assume 10 ticks is sufficient time for every
2195 * cpu to have completed this task.
2196 *
2197 * This places an upper-bound on the IRQ-off latency of the machine. Then
2198 * again, being late doesn't loose the delta, just wrecks the sample.
2199 *
2200 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because
2201 * this would add another cross-cpu cacheline miss and atomic operation
2202 * to the wakeup path. Instead we increment on whatever cpu the task ran
2203 * when it went into uninterruptible state and decrement on whatever cpu
2204 * did the wakeup. This means that only the sum of nr_uninterruptible over
2205 * all cpus yields the correct result.
2206 *
2207 * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
2208 */
2209
2165/* Variables and functions for calc_load */ 2210/* Variables and functions for calc_load */
2166static atomic_long_t calc_load_tasks; 2211static atomic_long_t calc_load_tasks;
2167static unsigned long calc_load_update; 2212static unsigned long calc_load_update;
2168unsigned long avenrun[3]; 2213unsigned long avenrun[3];
2169EXPORT_SYMBOL(avenrun); 2214EXPORT_SYMBOL(avenrun); /* should be removed */
2215
2216/**
2217 * get_avenrun - get the load average array
2218 * @loads: pointer to dest load array
2219 * @offset: offset to add
2220 * @shift: shift count to shift the result left
2221 *
2222 * These values are estimates at best, so no need for locking.
2223 */
2224void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2225{
2226 loads[0] = (avenrun[0] + offset) << shift;
2227 loads[1] = (avenrun[1] + offset) << shift;
2228 loads[2] = (avenrun[2] + offset) << shift;
2229}
2170 2230
2171static long calc_load_fold_active(struct rq *this_rq) 2231static long calc_load_fold_active(struct rq *this_rq)
2172{ 2232{
@@ -2183,6 +2243,9 @@ static long calc_load_fold_active(struct rq *this_rq)
2183 return delta; 2243 return delta;
2184} 2244}
2185 2245
2246/*
2247 * a1 = a0 * e + a * (1 - e)
2248 */
2186static unsigned long 2249static unsigned long
2187calc_load(unsigned long load, unsigned long exp, unsigned long active) 2250calc_load(unsigned long load, unsigned long exp, unsigned long active)
2188{ 2251{
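The decay step documented above ("a1 = a0 * e + a * (1 - e)") is easier to see outside the kernel. Below is a minimal stand-alone sketch of the fixed-point math; FSHIFT/FIXED_1/EXP_1 match the constants in include/linux/sched.h of this era, and the demo values in main() are purely illustrative.

/* Stand-alone sketch of one avenrun update, mirroring calc_load(). */
#include <stdio.h>

#define FSHIFT   11                 /* bits of fixed-point precision */
#define FIXED_1  (1UL << FSHIFT)    /* 1.0 in fixed-point */
#define EXP_1    1884               /* 1/exp(5sec/1min) in fixed-point */

/* a1 = a0 * e + a * (1 - e), everything scaled by FIXED_1 */
static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        return load >> FSHIFT;
}

int main(void)
{
        unsigned long avenrun0 = 0;
        unsigned long nr_active = 3;    /* nr_running + nr_uninterruptible */

        /* one LOAD_FREQ (5 s) sample with three active tasks */
        avenrun0 = calc_load(avenrun0, EXP_1, nr_active * FIXED_1);

        /* /proc/loadavg-style formatting: integer part plus two decimals */
        printf("load1: %lu.%02lu\n", avenrun0 >> FSHIFT,
               ((avenrun0 & (FIXED_1 - 1)) * 100) >> FSHIFT);
        return 0;
}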
@@ -2194,30 +2257,118 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
2194 2257
2195#ifdef CONFIG_NO_HZ 2258#ifdef CONFIG_NO_HZ
2196/* 2259/*
2197 * For NO_HZ we delay the active fold to the next LOAD_FREQ update. 2260 * Handle NO_HZ for the global load-average.
2261 *
2262 * Since the above described distributed algorithm to compute the global
2263 * load-average relies on per-cpu sampling from the tick, it is affected by
2264 * NO_HZ.
2265 *
2266 * The basic idea is to fold the nr_active delta into a global idle-delta upon
2267 * entering NO_HZ state such that we can include this as an 'extra' cpu delta
2268 * when we read the global state.
2269 *
2270 * Obviously reality has to ruin such a delightfully simple scheme:
2271 *
2272 * - When we go NO_HZ idle during the window, we can negate our sample
2273 * contribution, causing under-accounting.
2274 *
2275 * We avoid this by keeping two idle-delta counters and flipping them
2276 * when the window starts, thus separating old and new NO_HZ load.
2277 *
2278 * The only trick is the slight shift in index flip for read vs write.
2279 *
2280 * 0s 5s 10s 15s
2281 * +10 +10 +10 +10
2282 * |-|-----------|-|-----------|-|-----------|-|
2283 * r:0 0 1 1 0 0 1 1 0
2284 * w:0 1 1 0 0 1 1 0 0
2285 *
2286 * This ensures we'll fold the old idle contribution in this window while
2287 * accumlating the new one.
2288 *
2289 * - When we wake up from NO_HZ idle during the window, we push up our
2290 * contribution, since we effectively move our sample point to a known
2291 * busy state.
2292 *
2293 * This is solved by pushing the window forward, and thus skipping the
2294 * sample, for this cpu (effectively using the idle-delta for this cpu which
2295 * was in effect at the time the window opened). This also solves the issue
2296 * of having to deal with a cpu having been in NOHZ idle for multiple
2297 * LOAD_FREQ intervals.
2198 * 2298 *
2199 * When making the ILB scale, we should try to pull this in as well. 2299 * When making the ILB scale, we should try to pull this in as well.
2200 */ 2300 */
2201static atomic_long_t calc_load_tasks_idle; 2301static atomic_long_t calc_load_idle[2];
2302static int calc_load_idx;
2303
2304static inline int calc_load_write_idx(void)
2305{
2306 int idx = calc_load_idx;
2307
2308 /*
2309 * See calc_global_nohz(), if we observe the new index, we also
2310 * need to observe the new update time.
2311 */
2312 smp_rmb();
2313
2314 /*
2315 * If the folding window started, make sure we start writing in the
2316 * next idle-delta.
2317 */
2318 if (!time_before(jiffies, calc_load_update))
2319 idx++;
2202 2320
2203void calc_load_account_idle(struct rq *this_rq) 2321 return idx & 1;
2322}
2323
2324static inline int calc_load_read_idx(void)
2204{ 2325{
2326 return calc_load_idx & 1;
2327}
2328
2329void calc_load_enter_idle(void)
2330{
2331 struct rq *this_rq = this_rq();
2205 long delta; 2332 long delta;
2206 2333
2334 /*
2335 * We're going into NOHZ mode, if there's any pending delta, fold it
2336 * into the pending idle delta.
2337 */
2207 delta = calc_load_fold_active(this_rq); 2338 delta = calc_load_fold_active(this_rq);
2208 if (delta) 2339 if (delta) {
2209 atomic_long_add(delta, &calc_load_tasks_idle); 2340 int idx = calc_load_write_idx();
2341 atomic_long_add(delta, &calc_load_idle[idx]);
2342 }
2210} 2343}
2211 2344
2212static long calc_load_fold_idle(void) 2345void calc_load_exit_idle(void)
2213{ 2346{
2214 long delta = 0; 2347 struct rq *this_rq = this_rq();
2215 2348
2216 /* 2349 /*
2217 * Its got a race, we don't care... 2350 * If we're still before the sample window, we're done.
2218 */ 2351 */
2219 if (atomic_long_read(&calc_load_tasks_idle)) 2352 if (time_before(jiffies, this_rq->calc_load_update))
2220 delta = atomic_long_xchg(&calc_load_tasks_idle, 0); 2353 return;
2354
2355 /*
2356 * We woke inside or after the sample window, this means we're already
2357 * accounted through the nohz accounting, so skip the entire deal and
2358 * sync up for the next window.
2359 */
2360 this_rq->calc_load_update = calc_load_update;
2361 if (time_before(jiffies, this_rq->calc_load_update + 10))
2362 this_rq->calc_load_update += LOAD_FREQ;
2363}
2364
2365static long calc_load_fold_idle(void)
2366{
2367 int idx = calc_load_read_idx();
2368 long delta = 0;
2369
2370 if (atomic_long_read(&calc_load_idle[idx]))
2371 delta = atomic_long_xchg(&calc_load_idle[idx], 0);
2221 2372
2222 return delta; 2373 return delta;
2223} 2374}
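The write-index/read-index dance for the two idle-delta buckets is the subtle part of the hunk above. The user-space toy below (names borrowed from the kernel, bodies simplified and only intended to model the scheme described in the comment, not the real locking/barriers) shows how the "old" bucket is drained in the current window while the "new" one keeps accumulating across the flip.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_long calc_load_idle[2];
static int calc_load_idx;       /* flipped by the reader only */
static bool window_open;        /* stands in for !time_before(jiffies, calc_load_update) */

static int calc_load_write_idx(void)
{
        int idx = calc_load_idx;

        /* once the fold window has opened, new deltas go into the next slot */
        if (window_open)
                idx++;
        return idx & 1;
}

/* a cpu entering NO_HZ idle folds its pending delta away */
static void enter_idle(long delta)
{
        atomic_fetch_add(&calc_load_idle[calc_load_write_idx()], delta);
}

/* the global update drains the slot selected by the read index */
static long fold_idle(void)
{
        return atomic_exchange(&calc_load_idle[calc_load_idx & 1], 0);
}

int main(void)
{
        enter_idle(2);                  /* before the window opens */
        window_open = true;
        enter_idle(1);                  /* lands in the *other* slot */

        printf("this window: %ld\n", fold_idle());      /* 2 */
        calc_load_idx++;                /* reader flips after folding */
        window_open = false;
        printf("next window: %ld\n", fold_idle());      /* 1 */
        return 0;
}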
@@ -2303,66 +2454,39 @@ static void calc_global_nohz(void)
2303{ 2454{
2304 long delta, active, n; 2455 long delta, active, n;
2305 2456
2306 /* 2457 if (!time_before(jiffies, calc_load_update + 10)) {
2307 * If we crossed a calc_load_update boundary, make sure to fold 2458 /*
2308 * any pending idle changes, the respective CPUs might have 2459 * Catch-up, fold however many we are behind still
2309 * missed the tick driven calc_load_account_active() update 2460 */
2310 * due to NO_HZ. 2461 delta = jiffies - calc_load_update - 10;
2311 */ 2462 n = 1 + (delta / LOAD_FREQ);
2312 delta = calc_load_fold_idle();
2313 if (delta)
2314 atomic_long_add(delta, &calc_load_tasks);
2315
2316 /*
2317 * It could be the one fold was all it took, we done!
2318 */
2319 if (time_before(jiffies, calc_load_update + 10))
2320 return;
2321
2322 /*
2323 * Catch-up, fold however many we are behind still
2324 */
2325 delta = jiffies - calc_load_update - 10;
2326 n = 1 + (delta / LOAD_FREQ);
2327 2463
2328 active = atomic_long_read(&calc_load_tasks); 2464 active = atomic_long_read(&calc_load_tasks);
2329 active = active > 0 ? active * FIXED_1 : 0; 2465 active = active > 0 ? active * FIXED_1 : 0;
2330 2466
2331 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); 2467 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
2332 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); 2468 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
2333 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); 2469 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
2334 2470
2335 calc_load_update += n * LOAD_FREQ; 2471 calc_load_update += n * LOAD_FREQ;
2336} 2472 }
2337#else
2338void calc_load_account_idle(struct rq *this_rq)
2339{
2340}
2341 2473
2342static inline long calc_load_fold_idle(void) 2474 /*
2343{ 2475 * Flip the idle index...
2344 return 0; 2476 *
2477 * Make sure we first write the new time then flip the index, so that
2478 * calc_load_write_idx() will see the new time when it reads the new
2479 * index, this avoids a double flip messing things up.
2480 */
2481 smp_wmb();
2482 calc_load_idx++;
2345} 2483}
2484#else /* !CONFIG_NO_HZ */
2346 2485
2347static void calc_global_nohz(void) 2486static inline long calc_load_fold_idle(void) { return 0; }
2348{ 2487static inline void calc_global_nohz(void) { }
2349}
2350#endif
2351 2488
2352/** 2489#endif /* CONFIG_NO_HZ */
2353 * get_avenrun - get the load average array
2354 * @loads: pointer to dest load array
2355 * @offset: offset to add
2356 * @shift: shift count to shift the result left
2357 *
2358 * These values are estimates at best, so no need for locking.
2359 */
2360void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2361{
2362 loads[0] = (avenrun[0] + offset) << shift;
2363 loads[1] = (avenrun[1] + offset) << shift;
2364 loads[2] = (avenrun[2] + offset) << shift;
2365}
2366 2490
2367/* 2491/*
2368 * calc_load - update the avenrun load estimates 10 ticks after the 2492 * calc_load - update the avenrun load estimates 10 ticks after the
@@ -2370,11 +2494,18 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
2370 */ 2494 */
2371void calc_global_load(unsigned long ticks) 2495void calc_global_load(unsigned long ticks)
2372{ 2496{
2373 long active; 2497 long active, delta;
2374 2498
2375 if (time_before(jiffies, calc_load_update + 10)) 2499 if (time_before(jiffies, calc_load_update + 10))
2376 return; 2500 return;
2377 2501
2502 /*
2503 * Fold the 'old' idle-delta to include all NO_HZ cpus.
2504 */
2505 delta = calc_load_fold_idle();
2506 if (delta)
2507 atomic_long_add(delta, &calc_load_tasks);
2508
2378 active = atomic_long_read(&calc_load_tasks); 2509 active = atomic_long_read(&calc_load_tasks);
2379 active = active > 0 ? active * FIXED_1 : 0; 2510 active = active > 0 ? active * FIXED_1 : 0;
2380 2511
@@ -2385,12 +2516,7 @@ void calc_global_load(unsigned long ticks)
2385 calc_load_update += LOAD_FREQ; 2516 calc_load_update += LOAD_FREQ;
2386 2517
2387 /* 2518 /*
2388 * Account one period with whatever state we found before 2519 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
2389 * folding in the nohz state and ageing the entire idle period.
2390 *
2391 * This avoids loosing a sample when we go idle between
2392 * calc_load_account_active() (10 ticks ago) and now and thus
2393 * under-accounting.
2394 */ 2520 */
2395 calc_global_nohz(); 2521 calc_global_nohz();
2396} 2522}
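calc_global_nohz() above leans on calc_load_n(), which is not part of this hunk. The idea is that applying a1 = a0*e + a*(1-e) n times collapses into the closed form an = a0*e^n + a*(1 - e^n), with e^n computed by fixed-point exponentiation by squaring. A compressed re-derivation follows; the helper names track the kernel's, but the bodies are only a sketch.

#define FSHIFT  11
#define FIXED_1 (1UL << FSHIFT)

/* x^n for fixed-point x, by squaring; x^0 == 1.0 */
static unsigned long fixed_power(unsigned long x, unsigned int n)
{
        unsigned long result = FIXED_1;

        for (; n; n >>= 1) {
                if (n & 1)
                        result = (result * x) >> FSHIFT;
                x = (x * x) >> FSHIFT;
        }
        return result;
}

static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        return load >> FSHIFT;
}

/* catch up over n missed LOAD_FREQ windows in one step */
static unsigned long calc_load_n(unsigned long load, unsigned long exp,
                                 unsigned long active, unsigned int n)
{
        return calc_load(load, fixed_power(exp, n), active);
}

Up to fixed-point rounding this equals running calc_load() once per missed window, which is why a machine that idled through several windows can be folded in bulk.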
@@ -2407,7 +2533,6 @@ static void calc_load_account_active(struct rq *this_rq)
2407 return; 2533 return;
2408 2534
2409 delta = calc_load_fold_active(this_rq); 2535 delta = calc_load_fold_active(this_rq);
2410 delta += calc_load_fold_idle();
2411 if (delta) 2536 if (delta)
2412 atomic_long_add(delta, &calc_load_tasks); 2537 atomic_long_add(delta, &calc_load_tasks);
2413 2538
@@ -2415,6 +2540,10 @@ static void calc_load_account_active(struct rq *this_rq)
2415} 2540}
2416 2541
2417/* 2542/*
2543 * End of global load-average stuff
2544 */
2545
2546/*
2418 * The exact cpuload at various idx values, calculated at every tick would be 2547 * The exact cpuload at various idx values, calculated at every tick would be
2419 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load 2548 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
2420 * 2549 *
@@ -2517,25 +2646,32 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
2517 sched_avg_update(this_rq); 2646 sched_avg_update(this_rq);
2518} 2647}
2519 2648
2649#ifdef CONFIG_NO_HZ
2650/*
2651 * There is no sane way to deal with nohz on smp when using jiffies because the
2652 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
2653 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
2654 *
2655 * Therefore we cannot use the delta approach from the regular tick since that
2656 * would seriously skew the load calculation. However we'll make do for those
2657 * updates happening while idle (nohz_idle_balance) or coming out of idle
2658 * (tick_nohz_idle_exit).
2659 *
2660 * This means we might still be one tick off for nohz periods.
2661 */
2662
2520/* 2663/*
2521 * Called from nohz_idle_balance() to update the load ratings before doing the 2664 * Called from nohz_idle_balance() to update the load ratings before doing the
2522 * idle balance. 2665 * idle balance.
2523 */ 2666 */
2524void update_idle_cpu_load(struct rq *this_rq) 2667void update_idle_cpu_load(struct rq *this_rq)
2525{ 2668{
2526 unsigned long curr_jiffies = jiffies; 2669 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2527 unsigned long load = this_rq->load.weight; 2670 unsigned long load = this_rq->load.weight;
2528 unsigned long pending_updates; 2671 unsigned long pending_updates;
2529 2672
2530 /* 2673 /*
2531 * Bloody broken means of dealing with nohz, but better than nothing.. 2674 * bail if there's load or we're actually up-to-date.
2532 * jiffies is updated by one cpu, another cpu can drift wrt the jiffy
2533 * update and see 0 difference the one time and 2 the next, even though
2534 * we ticked at roughtly the same rate.
2535 *
2536 * Hence we only use this from nohz_idle_balance() and skip this
2537 * nonsense when called from the scheduler_tick() since that's
2538 * guaranteed a stable rate.
2539 */ 2675 */
2540 if (load || curr_jiffies == this_rq->last_load_update_tick) 2676 if (load || curr_jiffies == this_rq->last_load_update_tick)
2541 return; 2677 return;
@@ -2547,12 +2683,38 @@ void update_idle_cpu_load(struct rq *this_rq)
2547} 2683}
2548 2684
2549/* 2685/*
2686 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
2687 */
2688void update_cpu_load_nohz(void)
2689{
2690 struct rq *this_rq = this_rq();
2691 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2692 unsigned long pending_updates;
2693
2694 if (curr_jiffies == this_rq->last_load_update_tick)
2695 return;
2696
2697 raw_spin_lock(&this_rq->lock);
2698 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2699 if (pending_updates) {
2700 this_rq->last_load_update_tick = curr_jiffies;
2701 /*
2702 * We were idle, this means load 0, the current load might be
2703 * !0 due to remote wakeups and the sort.
2704 */
2705 __update_cpu_load(this_rq, 0, pending_updates);
2706 }
2707 raw_spin_unlock(&this_rq->lock);
2708}
2709#endif /* CONFIG_NO_HZ */
2710
2711/*
2550 * Called from scheduler_tick() 2712 * Called from scheduler_tick()
2551 */ 2713 */
2552static void update_cpu_load_active(struct rq *this_rq) 2714static void update_cpu_load_active(struct rq *this_rq)
2553{ 2715{
2554 /* 2716 /*
2555 * See the mess in update_idle_cpu_load(). 2717 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
2556 */ 2718 */
2557 this_rq->last_load_update_tick = jiffies; 2719 this_rq->last_load_update_tick = jiffies;
2558 __update_cpu_load(this_rq, this_rq->load.weight, 1); 2720 __update_cpu_load(this_rq, this_rq->load.weight, 1);
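The per-cpu cpu_load[] rings face the same catch-up problem in miniature: every missed nohz tick is a tick that saw zero load. Using the decay rule quoted earlier in this file, load = (2^idx - 1)/2^idx * load + 1/2^idx * cur_load, the effect of the pending_updates argument can be sketched with a naive loop; this helper is illustrative only, the real __update_cpu_load() is not shown in this hunk.

#define CPU_LOAD_IDX_MAX 5

static void sketch_update_cpu_load(unsigned long cpu_load[CPU_LOAD_IDX_MAX],
                                   unsigned long this_load,
                                   unsigned long pending_updates)
{
        int idx;

        cpu_load[0] = this_load;        /* idx 0 tracks the instantaneous load */

        for (idx = 1; idx < CPU_LOAD_IDX_MAX; idx++) {
                unsigned long old_load = cpu_load[idx];
                unsigned long tick;

                /* the missed ticks saw zero load ... */
                for (tick = 1; tick < pending_updates; tick++)
                        old_load = (old_load * ((1UL << idx) - 1)) >> idx;

                /* ... and the final tick sees the current load */
                cpu_load[idx] = (old_load * ((1UL << idx) - 1) + this_load) >> idx;
        }
}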
@@ -4982,7 +5144,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4982 p->sched_class->set_cpus_allowed(p, new_mask); 5144 p->sched_class->set_cpus_allowed(p, new_mask);
4983 5145
4984 cpumask_copy(&p->cpus_allowed, new_mask); 5146 cpumask_copy(&p->cpus_allowed, new_mask);
4985 p->rt.nr_cpus_allowed = cpumask_weight(new_mask); 5147 p->nr_cpus_allowed = cpumask_weight(new_mask);
4986} 5148}
4987 5149
4988/* 5150/*
@@ -5524,15 +5686,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
5524 5686
5525#ifdef CONFIG_SCHED_DEBUG 5687#ifdef CONFIG_SCHED_DEBUG
5526 5688
5527static __read_mostly int sched_domain_debug_enabled; 5689static __read_mostly int sched_debug_enabled;
5528 5690
5529static int __init sched_domain_debug_setup(char *str) 5691static int __init sched_debug_setup(char *str)
5530{ 5692{
5531 sched_domain_debug_enabled = 1; 5693 sched_debug_enabled = 1;
5532 5694
5533 return 0; 5695 return 0;
5534} 5696}
5535early_param("sched_debug", sched_domain_debug_setup); 5697early_param("sched_debug", sched_debug_setup);
5698
5699static inline bool sched_debug(void)
5700{
5701 return sched_debug_enabled;
5702}
5536 5703
5537static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 5704static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5538 struct cpumask *groupmask) 5705 struct cpumask *groupmask)
@@ -5572,7 +5739,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5572 break; 5739 break;
5573 } 5740 }
5574 5741
5575 if (!group->sgp->power) { 5742 /*
5743 * Even though we initialize ->power to something semi-sane,
5744 * we leave power_orig unset. This allows us to detect if
5745 * domain iteration is still funny without causing /0 traps.
5746 */
5747 if (!group->sgp->power_orig) {
5576 printk(KERN_CONT "\n"); 5748 printk(KERN_CONT "\n");
5577 printk(KERN_ERR "ERROR: domain->cpu_power not " 5749 printk(KERN_ERR "ERROR: domain->cpu_power not "
5578 "set\n"); 5750 "set\n");
@@ -5620,7 +5792,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5620{ 5792{
5621 int level = 0; 5793 int level = 0;
5622 5794
5623 if (!sched_domain_debug_enabled) 5795 if (!sched_debug_enabled)
5624 return; 5796 return;
5625 5797
5626 if (!sd) { 5798 if (!sd) {
@@ -5641,6 +5813,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5641} 5813}
5642#else /* !CONFIG_SCHED_DEBUG */ 5814#else /* !CONFIG_SCHED_DEBUG */
5643# define sched_domain_debug(sd, cpu) do { } while (0) 5815# define sched_domain_debug(sd, cpu) do { } while (0)
5816static inline bool sched_debug(void)
5817{
5818 return false;
5819}
5644#endif /* CONFIG_SCHED_DEBUG */ 5820#endif /* CONFIG_SCHED_DEBUG */
5645 5821
5646static int sd_degenerate(struct sched_domain *sd) 5822static int sd_degenerate(struct sched_domain *sd)
@@ -5962,6 +6138,44 @@ struct sched_domain_topology_level {
5962 struct sd_data data; 6138 struct sd_data data;
5963}; 6139};
5964 6140
6141/*
6142 * Build an iteration mask that can exclude certain CPUs from the upwards
6143 * domain traversal.
6144 *
6145 * Asymmetric node setups can result in situations where the domain tree is of
6146 * unequal depth, make sure to skip domains that already cover the entire
6147 * range.
6148 *
6149 * In that case build_sched_domains() will have terminated the iteration early
6150 * and our sibling sd spans will be empty. Domains should always include the
6151 * cpu they're built on, so check that.
6152 *
6153 */
6154static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
6155{
6156 const struct cpumask *span = sched_domain_span(sd);
6157 struct sd_data *sdd = sd->private;
6158 struct sched_domain *sibling;
6159 int i;
6160
6161 for_each_cpu(i, span) {
6162 sibling = *per_cpu_ptr(sdd->sd, i);
6163 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
6164 continue;
6165
6166 cpumask_set_cpu(i, sched_group_mask(sg));
6167 }
6168}
6169
6170/*
6171 * Return the canonical balance cpu for this group, this is the first cpu
6172 * of this group that's also in the iteration mask.
6173 */
6174int group_balance_cpu(struct sched_group *sg)
6175{
6176 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
6177}
6178
5965static int 6179static int
5966build_overlap_sched_groups(struct sched_domain *sd, int cpu) 6180build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5967{ 6181{
@@ -5980,6 +6194,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5980 if (cpumask_test_cpu(i, covered)) 6194 if (cpumask_test_cpu(i, covered))
5981 continue; 6195 continue;
5982 6196
6197 child = *per_cpu_ptr(sdd->sd, i);
6198
6199 /* See the comment near build_group_mask(). */
6200 if (!cpumask_test_cpu(i, sched_domain_span(child)))
6201 continue;
6202
5983 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 6203 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
5984 GFP_KERNEL, cpu_to_node(cpu)); 6204 GFP_KERNEL, cpu_to_node(cpu));
5985 6205
@@ -5987,8 +6207,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5987 goto fail; 6207 goto fail;
5988 6208
5989 sg_span = sched_group_cpus(sg); 6209 sg_span = sched_group_cpus(sg);
5990
5991 child = *per_cpu_ptr(sdd->sd, i);
5992 if (child->child) { 6210 if (child->child) {
5993 child = child->child; 6211 child = child->child;
5994 cpumask_copy(sg_span, sched_domain_span(child)); 6212 cpumask_copy(sg_span, sched_domain_span(child));
@@ -5997,10 +6215,24 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5997 6215
5998 cpumask_or(covered, covered, sg_span); 6216 cpumask_or(covered, covered, sg_span);
5999 6217
6000 sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); 6218 sg->sgp = *per_cpu_ptr(sdd->sgp, i);
6001 atomic_inc(&sg->sgp->ref); 6219 if (atomic_inc_return(&sg->sgp->ref) == 1)
6220 build_group_mask(sd, sg);
6002 6221
6003 if (cpumask_test_cpu(cpu, sg_span)) 6222 /*
6223 * Initialize sgp->power such that even if we mess up the
6224 * domains and no possible iteration will get us here, we won't
6225 * die on a /0 trap.
6226 */
6227 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
6228
6229 /*
6230 * Make sure the first group of this domain contains the
6231 * canonical balance cpu. Otherwise the sched_domain iteration
6232 * breaks. See update_sg_lb_stats().
6233 */
6234 if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
6235 group_balance_cpu(sg) == cpu)
6004 groups = sg; 6236 groups = sg;
6005 6237
6006 if (!first) 6238 if (!first)
@@ -6074,6 +6306,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
6074 6306
6075 cpumask_clear(sched_group_cpus(sg)); 6307 cpumask_clear(sched_group_cpus(sg));
6076 sg->sgp->power = 0; 6308 sg->sgp->power = 0;
6309 cpumask_setall(sched_group_mask(sg));
6077 6310
6078 for_each_cpu(j, span) { 6311 for_each_cpu(j, span) {
6079 if (get_group(j, sdd, NULL) != group) 6312 if (get_group(j, sdd, NULL) != group)
@@ -6115,7 +6348,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6115 sg = sg->next; 6348 sg = sg->next;
6116 } while (sg != sd->groups); 6349 } while (sg != sd->groups);
6117 6350
6118 if (cpu != group_first_cpu(sg)) 6351 if (cpu != group_balance_cpu(sg))
6119 return; 6352 return;
6120 6353
6121 update_group_power(sd, cpu); 6354 update_group_power(sd, cpu);
@@ -6165,11 +6398,8 @@ int sched_domain_level_max;
6165 6398
6166static int __init setup_relax_domain_level(char *str) 6399static int __init setup_relax_domain_level(char *str)
6167{ 6400{
6168 unsigned long val; 6401 if (kstrtoint(str, 0, &default_relax_domain_level))
6169 6402 pr_warn("Unable to set relax_domain_level\n");
6170 val = simple_strtoul(str, NULL, 0);
6171 if (val < sched_domain_level_max)
6172 default_relax_domain_level = val;
6173 6403
6174 return 1; 6404 return 1;
6175} 6405}
@@ -6279,14 +6509,13 @@ static struct sched_domain_topology_level *sched_domain_topology = default_topol
6279#ifdef CONFIG_NUMA 6509#ifdef CONFIG_NUMA
6280 6510
6281static int sched_domains_numa_levels; 6511static int sched_domains_numa_levels;
6282static int sched_domains_numa_scale;
6283static int *sched_domains_numa_distance; 6512static int *sched_domains_numa_distance;
6284static struct cpumask ***sched_domains_numa_masks; 6513static struct cpumask ***sched_domains_numa_masks;
6285static int sched_domains_curr_level; 6514static int sched_domains_curr_level;
6286 6515
6287static inline int sd_local_flags(int level) 6516static inline int sd_local_flags(int level)
6288{ 6517{
6289 if (sched_domains_numa_distance[level] > REMOTE_DISTANCE) 6518 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
6290 return 0; 6519 return 0;
6291 6520
6292 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE; 6521 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
@@ -6344,6 +6573,42 @@ static const struct cpumask *sd_numa_mask(int cpu)
6344 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; 6573 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
6345} 6574}
6346 6575
6576static void sched_numa_warn(const char *str)
6577{
6578 static int done = false;
6579 int i,j;
6580
6581 if (done)
6582 return;
6583
6584 done = true;
6585
6586 printk(KERN_WARNING "ERROR: %s\n\n", str);
6587
6588 for (i = 0; i < nr_node_ids; i++) {
6589 printk(KERN_WARNING " ");
6590 for (j = 0; j < nr_node_ids; j++)
6591 printk(KERN_CONT "%02d ", node_distance(i,j));
6592 printk(KERN_CONT "\n");
6593 }
6594 printk(KERN_WARNING "\n");
6595}
6596
6597static bool find_numa_distance(int distance)
6598{
6599 int i;
6600
6601 if (distance == node_distance(0, 0))
6602 return true;
6603
6604 for (i = 0; i < sched_domains_numa_levels; i++) {
6605 if (sched_domains_numa_distance[i] == distance)
6606 return true;
6607 }
6608
6609 return false;
6610}
6611
6347static void sched_init_numa(void) 6612static void sched_init_numa(void)
6348{ 6613{
6349 int next_distance, curr_distance = node_distance(0, 0); 6614 int next_distance, curr_distance = node_distance(0, 0);
@@ -6351,7 +6616,6 @@ static void sched_init_numa(void)
6351 int level = 0; 6616 int level = 0;
6352 int i, j, k; 6617 int i, j, k;
6353 6618
6354 sched_domains_numa_scale = curr_distance;
6355 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL); 6619 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
6356 if (!sched_domains_numa_distance) 6620 if (!sched_domains_numa_distance)
6357 return; 6621 return;
@@ -6362,23 +6626,41 @@ static void sched_init_numa(void)
6362 * 6626 *
6363 * Assumes node_distance(0,j) includes all distances in 6627 * Assumes node_distance(0,j) includes all distances in
6364 * node_distance(i,j) in order to avoid cubic time. 6628 * node_distance(i,j) in order to avoid cubic time.
6365 *
6366 * XXX: could be optimized to O(n log n) by using sort()
6367 */ 6629 */
6368 next_distance = curr_distance; 6630 next_distance = curr_distance;
6369 for (i = 0; i < nr_node_ids; i++) { 6631 for (i = 0; i < nr_node_ids; i++) {
6370 for (j = 0; j < nr_node_ids; j++) { 6632 for (j = 0; j < nr_node_ids; j++) {
6371 int distance = node_distance(0, j); 6633 for (k = 0; k < nr_node_ids; k++) {
6372 if (distance > curr_distance && 6634 int distance = node_distance(i, k);
6373 (distance < next_distance || 6635
6374 next_distance == curr_distance)) 6636 if (distance > curr_distance &&
6375 next_distance = distance; 6637 (distance < next_distance ||
6638 next_distance == curr_distance))
6639 next_distance = distance;
6640
6641 /*
6642 * While not a strong assumption it would be nice to know
6643 * about cases where if node A is connected to B, B is not
6644 * equally connected to A.
6645 */
6646 if (sched_debug() && node_distance(k, i) != distance)
6647 sched_numa_warn("Node-distance not symmetric");
6648
6649 if (sched_debug() && i && !find_numa_distance(distance))
6650 sched_numa_warn("Node-0 not representative");
6651 }
6652 if (next_distance != curr_distance) {
6653 sched_domains_numa_distance[level++] = next_distance;
6654 sched_domains_numa_levels = level;
6655 curr_distance = next_distance;
6656 } else break;
6376 } 6657 }
6377 if (next_distance != curr_distance) { 6658
6378 sched_domains_numa_distance[level++] = next_distance; 6659 /*
6379 sched_domains_numa_levels = level; 6660 * In case of sched_debug() we verify the above assumption.
6380 curr_distance = next_distance; 6661 */
6381 } else break; 6662 if (!sched_debug())
6663 break;
6382 } 6664 }
6383 /* 6665 /*
6384 * 'level' contains the number of unique distances, excluding the 6666 * 'level' contains the number of unique distances, excluding the
@@ -6403,7 +6685,7 @@ static void sched_init_numa(void)
6403 return; 6685 return;
6404 6686
6405 for (j = 0; j < nr_node_ids; j++) { 6687 for (j = 0; j < nr_node_ids; j++) {
6406 struct cpumask *mask = kzalloc_node(cpumask_size(), GFP_KERNEL, j); 6688 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
6407 if (!mask) 6689 if (!mask)
6408 return; 6690 return;
6409 6691
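The nested loops in sched_init_numa() above boil down to: collect the distinct node_distance() values in increasing order, each one becoming a NUMA scheduling level. Stripped of the debug checks, and with a hypothetical 4-node distance table standing in for the firmware-provided values:

#include <stdio.h>

#define NR_NODES 4

/* hypothetical symmetric distance table; the kernel asks node_distance() */
static const int dist[NR_NODES][NR_NODES] = {
        { 10, 20, 20, 30 },
        { 20, 10, 30, 20 },
        { 20, 30, 10, 20 },
        { 30, 20, 20, 10 },
};

int main(void)
{
        int levels[NR_NODES * NR_NODES];
        int nr_levels = 0;
        int curr = dist[0][0];          /* the local distance */

        for (;;) {
                int next = curr;
                int i, j;

                /* smallest distance strictly greater than 'curr' */
                for (i = 0; i < NR_NODES; i++) {
                        for (j = 0; j < NR_NODES; j++) {
                                if (dist[i][j] > curr &&
                                    (dist[i][j] < next || next == curr))
                                        next = dist[i][j];
                        }
                }
                if (next == curr)       /* nothing larger left */
                        break;

                levels[nr_levels++] = next;
                curr = next;
        }

        /* prints "20 30": two NUMA levels beyond the local distance */
        for (int i = 0; i < nr_levels; i++)
                printf("%d ", levels[i]);
        printf("\n");
        return 0;
}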
@@ -6490,7 +6772,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
6490 6772
6491 *per_cpu_ptr(sdd->sg, j) = sg; 6773 *per_cpu_ptr(sdd->sg, j) = sg;
6492 6774
6493 sgp = kzalloc_node(sizeof(struct sched_group_power), 6775 sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
6494 GFP_KERNEL, cpu_to_node(j)); 6776 GFP_KERNEL, cpu_to_node(j));
6495 if (!sgp) 6777 if (!sgp)
6496 return -ENOMEM; 6778 return -ENOMEM;
@@ -6543,7 +6825,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6543 if (!sd) 6825 if (!sd)
6544 return child; 6826 return child;
6545 6827
6546 set_domain_attribute(sd, attr);
6547 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); 6828 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6548 if (child) { 6829 if (child) {
6549 sd->level = child->level + 1; 6830 sd->level = child->level + 1;
@@ -6551,6 +6832,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6551 child->parent = sd; 6832 child->parent = sd;
6552 } 6833 }
6553 sd->child = child; 6834 sd->child = child;
6835 set_domain_attribute(sd, attr);
6554 6836
6555 return sd; 6837 return sd;
6556} 6838}
@@ -6691,7 +6973,6 @@ static int init_sched_domains(const struct cpumask *cpu_map)
6691 if (!doms_cur) 6973 if (!doms_cur)
6692 doms_cur = &fallback_doms; 6974 doms_cur = &fallback_doms;
6693 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 6975 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
6694 dattr_cur = NULL;
6695 err = build_sched_domains(doms_cur[0], NULL); 6976 err = build_sched_domains(doms_cur[0], NULL);
6696 register_sched_domain_sysctl(); 6977 register_sched_domain_sysctl();
6697 6978
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 940e6d17cf96..c099cc6eebe3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2703,7 +2703,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2703 int want_sd = 1; 2703 int want_sd = 1;
2704 int sync = wake_flags & WF_SYNC; 2704 int sync = wake_flags & WF_SYNC;
2705 2705
2706 if (p->rt.nr_cpus_allowed == 1) 2706 if (p->nr_cpus_allowed == 1)
2707 return prev_cpu; 2707 return prev_cpu;
2708 2708
2709 if (sd_flag & SD_BALANCE_WAKE) { 2709 if (sd_flag & SD_BALANCE_WAKE) {
@@ -3503,15 +3503,22 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3503unsigned long scale_rt_power(int cpu) 3503unsigned long scale_rt_power(int cpu)
3504{ 3504{
3505 struct rq *rq = cpu_rq(cpu); 3505 struct rq *rq = cpu_rq(cpu);
3506 u64 total, available; 3506 u64 total, available, age_stamp, avg;
3507 3507
3508 total = sched_avg_period() + (rq->clock - rq->age_stamp); 3508 /*
3509 * Since we're reading these variables without serialization make sure
3510 * we read them once before doing sanity checks on them.
3511 */
3512 age_stamp = ACCESS_ONCE(rq->age_stamp);
3513 avg = ACCESS_ONCE(rq->rt_avg);
3514
3515 total = sched_avg_period() + (rq->clock - age_stamp);
3509 3516
3510 if (unlikely(total < rq->rt_avg)) { 3517 if (unlikely(total < avg)) {
3511 /* Ensures that power won't end up being negative */ 3518 /* Ensures that power won't end up being negative */
3512 available = 0; 3519 available = 0;
3513 } else { 3520 } else {
3514 available = total - rq->rt_avg; 3521 available = total - avg;
3515 } 3522 }
3516 3523
3517 if (unlikely((s64)total < SCHED_POWER_SCALE)) 3524 if (unlikely((s64)total < SCHED_POWER_SCALE))
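The ACCESS_ONCE() snapshots added to scale_rt_power() exist so that the range check and the subtraction see the same value; re-reading rq->rt_avg in between would let the check pass and the subtraction underflow anyway. In miniature (macro sketched with volatile purely for illustration):

#define ACCESS_ONCE(x) (*(volatile __typeof__(x) *)&(x))

static unsigned long long rt_avg;       /* concurrently updated elsewhere */

static unsigned long long rt_free(unsigned long long total)
{
        unsigned long long avg = ACCESS_ONCE(rt_avg);   /* one snapshot ... */

        if (total < avg)                /* ... used for the check ... */
                return 0;

        return total - avg;             /* ... and for the arithmetic */
}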
@@ -3574,13 +3581,28 @@ void update_group_power(struct sched_domain *sd, int cpu)
3574 3581
3575 power = 0; 3582 power = 0;
3576 3583
3577 group = child->groups; 3584 if (child->flags & SD_OVERLAP) {
3578 do { 3585 /*
3579 power += group->sgp->power; 3586 * SD_OVERLAP domains cannot assume that child groups
3580 group = group->next; 3587 * span the current group.
3581 } while (group != child->groups); 3588 */
3582 3589
3583 sdg->sgp->power = power; 3590 for_each_cpu(cpu, sched_group_cpus(sdg))
3591 power += power_of(cpu);
3592 } else {
3593 /*
3594 * !SD_OVERLAP domains can assume that child groups
3595 * span the current group.
3596 */
3597
3598 group = child->groups;
3599 do {
3600 power += group->sgp->power;
3601 group = group->next;
3602 } while (group != child->groups);
3603 }
3604
3605 sdg->sgp->power_orig = sdg->sgp->power = power;
3584} 3606}
3585 3607
3586/* 3608/*
@@ -3610,7 +3632,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
3610 3632
3611/** 3633/**
3612 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3634 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3613 * @sd: The sched_domain whose statistics are to be updated. 3635 * @env: The load balancing environment.
3614 * @group: sched_group whose statistics are to be updated. 3636 * @group: sched_group whose statistics are to be updated.
3615 * @load_idx: Load index of sched_domain of this_cpu for load calc. 3637 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3616 * @local_group: Does group contain this_cpu. 3638 * @local_group: Does group contain this_cpu.
@@ -3630,7 +3652,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
3630 int i; 3652 int i;
3631 3653
3632 if (local_group) 3654 if (local_group)
3633 balance_cpu = group_first_cpu(group); 3655 balance_cpu = group_balance_cpu(group);
3634 3656
3635 /* Tally up the load of all CPUs in the group */ 3657 /* Tally up the load of all CPUs in the group */
3636 max_cpu_load = 0; 3658 max_cpu_load = 0;
@@ -3645,7 +3667,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
3645 3667
3646 /* Bias balancing toward cpus of our domain */ 3668 /* Bias balancing toward cpus of our domain */
3647 if (local_group) { 3669 if (local_group) {
3648 if (idle_cpu(i) && !first_idle_cpu) { 3670 if (idle_cpu(i) && !first_idle_cpu &&
3671 cpumask_test_cpu(i, sched_group_mask(group))) {
3649 first_idle_cpu = 1; 3672 first_idle_cpu = 1;
3650 balance_cpu = i; 3673 balance_cpu = i;
3651 } 3674 }
@@ -3719,11 +3742,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
3719 3742
3720/** 3743/**
3721 * update_sd_pick_busiest - return 1 on busiest group 3744 * update_sd_pick_busiest - return 1 on busiest group
3722 * @sd: sched_domain whose statistics are to be checked 3745 * @env: The load balancing environment.
3723 * @sds: sched_domain statistics 3746 * @sds: sched_domain statistics
3724 * @sg: sched_group candidate to be checked for being the busiest 3747 * @sg: sched_group candidate to be checked for being the busiest
3725 * @sgs: sched_group statistics 3748 * @sgs: sched_group statistics
3726 * @this_cpu: the current cpu
3727 * 3749 *
3728 * Determine if @sg is a busier group than the previously selected 3750 * Determine if @sg is a busier group than the previously selected
3729 * busiest group. 3751 * busiest group.
@@ -3761,9 +3783,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
3761 3783
3762/** 3784/**
3763 * update_sd_lb_stats - Update sched_domain's statistics for load balancing. 3785 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
3764 * @sd: sched_domain whose statistics are to be updated. 3786 * @env: The load balancing environment.
3765 * @this_cpu: Cpu for which load balance is currently performed.
3766 * @idle: Idle status of this_cpu
3767 * @cpus: Set of cpus considered for load balancing. 3787 * @cpus: Set of cpus considered for load balancing.
3768 * @balance: Should we balance. 3788 * @balance: Should we balance.
3769 * @sds: variable to hold the statistics for this sched_domain. 3789 * @sds: variable to hold the statistics for this sched_domain.
@@ -3852,10 +3872,8 @@ static inline void update_sd_lb_stats(struct lb_env *env,
3852 * Returns 1 when packing is required and a task should be moved to 3872 * Returns 1 when packing is required and a task should be moved to
3853 * this CPU. The amount of the imbalance is returned in *imbalance. 3873 * this CPU. The amount of the imbalance is returned in *imbalance.
3854 * 3874 *
3855 * @sd: The sched_domain whose packing is to be checked. 3875 * @env: The load balancing environment.
3856 * @sds: Statistics of the sched_domain which is to be packed 3876 * @sds: Statistics of the sched_domain which is to be packed
3857 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3858 * @imbalance: returns amount of imbalanced due to packing.
3859 */ 3877 */
3860static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds) 3878static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
3861{ 3879{
@@ -3881,9 +3899,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
3881 * fix_small_imbalance - Calculate the minor imbalance that exists 3899 * fix_small_imbalance - Calculate the minor imbalance that exists
3882 * amongst the groups of a sched_domain, during 3900 * amongst the groups of a sched_domain, during
3883 * load balancing. 3901 * load balancing.
3902 * @env: The load balancing environment.
3884 * @sds: Statistics of the sched_domain whose imbalance is to be calculated. 3903 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3885 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3886 * @imbalance: Variable to store the imbalance.
3887 */ 3904 */
3888static inline 3905static inline
3889void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds) 3906void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
@@ -4026,11 +4043,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
4026 * Also calculates the amount of weighted load which should be moved 4043 * Also calculates the amount of weighted load which should be moved
4027 * to restore balance. 4044 * to restore balance.
4028 * 4045 *
4029 * @sd: The sched_domain whose busiest group is to be returned. 4046 * @env: The load balancing environment.
4030 * @this_cpu: The cpu for which load balancing is currently being performed.
4031 * @imbalance: Variable which stores amount of weighted load which should
4032 * be moved to restore balance/put a group to idle.
4033 * @idle: The idle status of this_cpu.
4034 * @cpus: The set of CPUs under consideration for load-balancing. 4047 * @cpus: The set of CPUs under consideration for load-balancing.
4035 * @balance: Pointer to a variable indicating if this_cpu 4048 * @balance: Pointer to a variable indicating if this_cpu
4036 * is the appropriate cpu to perform load balancing at this_level. 4049 * is the appropriate cpu to perform load balancing at this_level.
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index b44d604b35d1..b6baf370cae9 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -25,7 +25,6 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
25static struct task_struct *pick_next_task_idle(struct rq *rq) 25static struct task_struct *pick_next_task_idle(struct rq *rq)
26{ 26{
27 schedstat_inc(rq, sched_goidle); 27 schedstat_inc(rq, sched_goidle);
28 calc_load_account_idle(rq);
29 return rq->idle; 28 return rq->idle;
30} 29}
31 30
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index c5565c3c515f..573e1ca01102 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -274,13 +274,16 @@ static void update_rt_migration(struct rt_rq *rt_rq)
274 274
275static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 275static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
276{ 276{
277 struct task_struct *p;
278
277 if (!rt_entity_is_task(rt_se)) 279 if (!rt_entity_is_task(rt_se))
278 return; 280 return;
279 281
282 p = rt_task_of(rt_se);
280 rt_rq = &rq_of_rt_rq(rt_rq)->rt; 283 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
281 284
282 rt_rq->rt_nr_total++; 285 rt_rq->rt_nr_total++;
283 if (rt_se->nr_cpus_allowed > 1) 286 if (p->nr_cpus_allowed > 1)
284 rt_rq->rt_nr_migratory++; 287 rt_rq->rt_nr_migratory++;
285 288
286 update_rt_migration(rt_rq); 289 update_rt_migration(rt_rq);
@@ -288,13 +291,16 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
288 291
289static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 292static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
290{ 293{
294 struct task_struct *p;
295
291 if (!rt_entity_is_task(rt_se)) 296 if (!rt_entity_is_task(rt_se))
292 return; 297 return;
293 298
299 p = rt_task_of(rt_se);
294 rt_rq = &rq_of_rt_rq(rt_rq)->rt; 300 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
295 301
296 rt_rq->rt_nr_total--; 302 rt_rq->rt_nr_total--;
297 if (rt_se->nr_cpus_allowed > 1) 303 if (p->nr_cpus_allowed > 1)
298 rt_rq->rt_nr_migratory--; 304 rt_rq->rt_nr_migratory--;
299 305
300 update_rt_migration(rt_rq); 306 update_rt_migration(rt_rq);
@@ -1161,7 +1167,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1161 1167
1162 enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); 1168 enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
1163 1169
1164 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 1170 if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1165 enqueue_pushable_task(rq, p); 1171 enqueue_pushable_task(rq, p);
1166 1172
1167 inc_nr_running(rq); 1173 inc_nr_running(rq);
@@ -1225,7 +1231,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1225 1231
1226 cpu = task_cpu(p); 1232 cpu = task_cpu(p);
1227 1233
1228 if (p->rt.nr_cpus_allowed == 1) 1234 if (p->nr_cpus_allowed == 1)
1229 goto out; 1235 goto out;
1230 1236
1231 /* For anything but wake ups, just return the task_cpu */ 1237 /* For anything but wake ups, just return the task_cpu */
@@ -1260,9 +1266,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1260 * will have to sort it out. 1266 * will have to sort it out.
1261 */ 1267 */
1262 if (curr && unlikely(rt_task(curr)) && 1268 if (curr && unlikely(rt_task(curr)) &&
1263 (curr->rt.nr_cpus_allowed < 2 || 1269 (curr->nr_cpus_allowed < 2 ||
1264 curr->prio <= p->prio) && 1270 curr->prio <= p->prio) &&
1265 (p->rt.nr_cpus_allowed > 1)) { 1271 (p->nr_cpus_allowed > 1)) {
1266 int target = find_lowest_rq(p); 1272 int target = find_lowest_rq(p);
1267 1273
1268 if (target != -1) 1274 if (target != -1)
@@ -1276,10 +1282,10 @@ out:
1276 1282
1277static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 1283static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1278{ 1284{
1279 if (rq->curr->rt.nr_cpus_allowed == 1) 1285 if (rq->curr->nr_cpus_allowed == 1)
1280 return; 1286 return;
1281 1287
1282 if (p->rt.nr_cpus_allowed != 1 1288 if (p->nr_cpus_allowed != 1
1283 && cpupri_find(&rq->rd->cpupri, p, NULL)) 1289 && cpupri_find(&rq->rd->cpupri, p, NULL))
1284 return; 1290 return;
1285 1291
@@ -1395,7 +1401,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1395 * The previous task needs to be made eligible for pushing 1401 * The previous task needs to be made eligible for pushing
1396 * if it is still active 1402 * if it is still active
1397 */ 1403 */
1398 if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) 1404 if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
1399 enqueue_pushable_task(rq, p); 1405 enqueue_pushable_task(rq, p);
1400} 1406}
1401 1407
@@ -1408,7 +1414,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1408{ 1414{
1409 if (!task_running(rq, p) && 1415 if (!task_running(rq, p) &&
1410 (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && 1416 (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
1411 (p->rt.nr_cpus_allowed > 1)) 1417 (p->nr_cpus_allowed > 1))
1412 return 1; 1418 return 1;
1413 return 0; 1419 return 0;
1414} 1420}
@@ -1464,7 +1470,7 @@ static int find_lowest_rq(struct task_struct *task)
1464 if (unlikely(!lowest_mask)) 1470 if (unlikely(!lowest_mask))
1465 return -1; 1471 return -1;
1466 1472
1467 if (task->rt.nr_cpus_allowed == 1) 1473 if (task->nr_cpus_allowed == 1)
1468 return -1; /* No other targets possible */ 1474 return -1; /* No other targets possible */
1469 1475
1470 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) 1476 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
@@ -1556,7 +1562,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1556 task_running(rq, task) || 1562 task_running(rq, task) ||
1557 !task->on_rq)) { 1563 !task->on_rq)) {
1558 1564
1559 raw_spin_unlock(&lowest_rq->lock); 1565 double_unlock_balance(rq, lowest_rq);
1560 lowest_rq = NULL; 1566 lowest_rq = NULL;
1561 break; 1567 break;
1562 } 1568 }
@@ -1586,7 +1592,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
1586 1592
1587 BUG_ON(rq->cpu != task_cpu(p)); 1593 BUG_ON(rq->cpu != task_cpu(p));
1588 BUG_ON(task_current(rq, p)); 1594 BUG_ON(task_current(rq, p));
1589 BUG_ON(p->rt.nr_cpus_allowed <= 1); 1595 BUG_ON(p->nr_cpus_allowed <= 1);
1590 1596
1591 BUG_ON(!p->on_rq); 1597 BUG_ON(!p->on_rq);
1592 BUG_ON(!rt_task(p)); 1598 BUG_ON(!rt_task(p));
@@ -1793,9 +1799,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1793 if (!task_running(rq, p) && 1799 if (!task_running(rq, p) &&
1794 !test_tsk_need_resched(rq->curr) && 1800 !test_tsk_need_resched(rq->curr) &&
1795 has_pushable_tasks(rq) && 1801 has_pushable_tasks(rq) &&
1796 p->rt.nr_cpus_allowed > 1 && 1802 p->nr_cpus_allowed > 1 &&
1797 rt_task(rq->curr) && 1803 rt_task(rq->curr) &&
1798 (rq->curr->rt.nr_cpus_allowed < 2 || 1804 (rq->curr->nr_cpus_allowed < 2 ||
1799 rq->curr->prio <= p->prio)) 1805 rq->curr->prio <= p->prio))
1800 push_rt_tasks(rq); 1806 push_rt_tasks(rq);
1801} 1807}
@@ -1817,7 +1823,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1817 * Only update if the process changes its state from whether it 1823 * Only update if the process changes its state from whether it
1818 * can migrate or not. 1824 * can migrate or not.
1819 */ 1825 */
1820 if ((p->rt.nr_cpus_allowed > 1) == (weight > 1)) 1826 if ((p->nr_cpus_allowed > 1) == (weight > 1))
1821 return; 1827 return;
1822 1828
1823 rq = task_rq(p); 1829 rq = task_rq(p);
@@ -1979,6 +1985,8 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1979 1985
1980static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) 1986static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1981{ 1987{
1988 struct sched_rt_entity *rt_se = &p->rt;
1989
1982 update_curr_rt(rq); 1990 update_curr_rt(rq);
1983 1991
1984 watchdog(rq, p); 1992 watchdog(rq, p);
@@ -1996,12 +2004,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1996 p->rt.time_slice = RR_TIMESLICE; 2004 p->rt.time_slice = RR_TIMESLICE;
1997 2005
1998 /* 2006 /*
1999 * Requeue to the end of queue if we are not the only element 2007 * Requeue to the end of queue if we (and all of our ancestors) are the
2000 * on the queue: 2008 * only element on the queue
2001 */ 2009 */
2002 if (p->rt.run_list.prev != p->rt.run_list.next) { 2010 for_each_sched_rt_entity(rt_se) {
2003 requeue_task_rt(rq, p, 0); 2011 if (rt_se->run_list.prev != rt_se->run_list.next) {
2004 set_tsk_need_resched(p); 2012 requeue_task_rt(rq, p, 0);
2013 set_tsk_need_resched(p);
2014 return;
2015 }
2005 } 2016 }
2006} 2017}
2007 2018
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ba9dccfd24ce..55844f24435a 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -526,6 +526,8 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
526DECLARE_PER_CPU(struct sched_domain *, sd_llc); 526DECLARE_PER_CPU(struct sched_domain *, sd_llc);
527DECLARE_PER_CPU(int, sd_llc_id); 527DECLARE_PER_CPU(int, sd_llc_id);
528 528
529extern int group_balance_cpu(struct sched_group *sg);
530
529#endif /* CONFIG_SMP */ 531#endif /* CONFIG_SMP */
530 532
531#include "stats.h" 533#include "stats.h"
@@ -940,8 +942,6 @@ static inline u64 sched_avg_period(void)
940 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2; 942 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
941} 943}
942 944
943void calc_load_account_idle(struct rq *this_rq);
944
945#ifdef CONFIG_SCHED_HRTICK 945#ifdef CONFIG_SCHED_HRTICK
946 946
947/* 947/*