author    Ingo Molnar <mingo@elte.hu>    2009-04-08 11:25:42 -0400
committer Ingo Molnar <mingo@elte.hu>    2009-04-08 11:26:00 -0400
commit    5af8c4e0fac9838428bd718040b664043a05f37c (patch)
tree      75a01d98ed244db45fe3c734c4a81c1a3d92ac37 /kernel/sched.c
parent    46e0bb9c12f4bab539736f1714cbf16600f681ec (diff)
parent    577c9c456f0e1371cbade38eaf91ae8e8a308555 (diff)
Merge commit 'v2.6.30-rc1' into sched/urgent
Merge reason: update to latest upstream to queue up fix

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--    kernel/sched.c    1141
1 file changed, 788 insertions(+), 353 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 6234d10c6a79..5724508c3b66 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -223,7 +223,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
223{ 223{
224 ktime_t now; 224 ktime_t now;
225 225
226 if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) 226 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
227 return; 227 return;
228 228
229 if (hrtimer_active(&rt_b->rt_period_timer)) 229 if (hrtimer_active(&rt_b->rt_period_timer))
@@ -231,13 +231,20 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
231 231
232 spin_lock(&rt_b->rt_runtime_lock); 232 spin_lock(&rt_b->rt_runtime_lock);
233 for (;;) { 233 for (;;) {
234 unsigned long delta;
235 ktime_t soft, hard;
236
234 if (hrtimer_active(&rt_b->rt_period_timer)) 237 if (hrtimer_active(&rt_b->rt_period_timer))
235 break; 238 break;
236 239
237 now = hrtimer_cb_get_time(&rt_b->rt_period_timer); 240 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
238 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); 241 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
239 hrtimer_start_expires(&rt_b->rt_period_timer, 242
240 HRTIMER_MODE_ABS); 243 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
244 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
245 delta = ktime_to_ns(ktime_sub(hard, soft));
246 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
247 HRTIMER_MODE_ABS, 0);
241 } 248 }
242 spin_unlock(&rt_b->rt_runtime_lock); 249 spin_unlock(&rt_b->rt_runtime_lock);
243} 250}
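
The hunk above replaces hrtimer_start_expires() with __hrtimer_start_range_ns(), passing the distance between the soft and hard expiry as slack so the timer may fire anywhere in that window and be coalesced with nearby timers. A minimal userspace sketch of the arithmetic, using plain nanosecond integers instead of ktime_t (names here are illustrative, not kernel API):

#include <stdint.h>
#include <stdio.h>

struct range_timer {
    int64_t soft_ns;    /* earliest acceptable expiry */
    int64_t hard_ns;    /* latest acceptable expiry */
};

/* mirrors delta = ktime_to_ns(ktime_sub(hard, soft)) in the patch */
static int64_t timer_slack(const struct range_timer *t)
{
    return t->hard_ns - t->soft_ns;
}

int main(void)
{
    struct range_timer t = { .soft_ns = 1000000, .hard_ns = 1500000 };

    printf("arm at %lld ns, slack %lld ns\n",
           (long long)t.soft_ns, (long long)timer_slack(&t));
    return 0;
}
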
@@ -331,6 +338,13 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
331 */ 338 */
332static DEFINE_SPINLOCK(task_group_lock); 339static DEFINE_SPINLOCK(task_group_lock);
333 340
341#ifdef CONFIG_SMP
342static int root_task_group_empty(void)
343{
344 return list_empty(&root_task_group.children);
345}
346#endif
347
334#ifdef CONFIG_FAIR_GROUP_SCHED 348#ifdef CONFIG_FAIR_GROUP_SCHED
335#ifdef CONFIG_USER_SCHED 349#ifdef CONFIG_USER_SCHED
336# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 350# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
@@ -391,6 +405,13 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
391 405
392#else 406#else
393 407
408#ifdef CONFIG_SMP
409static int root_task_group_empty(void)
410{
411 return 1;
412}
413#endif
414
394static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 415static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
395static inline struct task_group *task_group(struct task_struct *p) 416static inline struct task_group *task_group(struct task_struct *p)
396{ 417{
@@ -467,11 +488,17 @@ struct rt_rq {
467 struct rt_prio_array active; 488 struct rt_prio_array active;
468 unsigned long rt_nr_running; 489 unsigned long rt_nr_running;
469#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 490#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
470 int highest_prio; /* highest queued rt task prio */ 491 struct {
492 int curr; /* highest queued rt task prio */
493#ifdef CONFIG_SMP
494 int next; /* next highest */
495#endif
496 } highest_prio;
471#endif 497#endif
472#ifdef CONFIG_SMP 498#ifdef CONFIG_SMP
473 unsigned long rt_nr_migratory; 499 unsigned long rt_nr_migratory;
474 int overloaded; 500 int overloaded;
501 struct plist_head pushable_tasks;
475#endif 502#endif
476 int rt_throttled; 503 int rt_throttled;
477 u64 rt_time; 504 u64 rt_time;
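
struct rt_rq now tracks both the highest and the next-highest queued RT priority and gains a plist of pushable tasks for the push/pull logic. A small self-contained sketch (not kernel code) of keeping such a two-level priority cache up to date on enqueue; lower numbers mean higher priority, as in the RT class, and INT_MAX stands in for "nothing queued":

#include <limits.h>
#include <stdio.h>

struct rt_prio_cache {
    int curr;   /* highest queued priority */
    int next;   /* next highest, so push logic sees its second choice */
};

static void prio_cache_init(struct rt_prio_cache *c)
{
    c->curr = c->next = INT_MAX;
}

static void prio_cache_enqueue(struct rt_prio_cache *c, int prio)
{
    if (prio < c->curr) {
        c->next = c->curr;
        c->curr = prio;
    } else if (prio < c->next) {
        c->next = prio;
    }
}

int main(void)
{
    struct rt_prio_cache c;

    prio_cache_init(&c);
    prio_cache_enqueue(&c, 40);
    prio_cache_enqueue(&c, 10);
    prio_cache_enqueue(&c, 25);
    printf("curr=%d next=%d\n", c.curr, c.next);    /* curr=10 next=25 */
    return 0;
}
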
@@ -549,7 +576,6 @@ struct rq {
549 unsigned long nr_running; 576 unsigned long nr_running;
550 #define CPU_LOAD_IDX_MAX 5 577 #define CPU_LOAD_IDX_MAX 5
551 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 578 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
552 unsigned char idle_at_tick;
553#ifdef CONFIG_NO_HZ 579#ifdef CONFIG_NO_HZ
554 unsigned long last_tick_seen; 580 unsigned long last_tick_seen;
555 unsigned char in_nohz_recently; 581 unsigned char in_nohz_recently;
@@ -590,6 +616,7 @@ struct rq {
590 struct root_domain *rd; 616 struct root_domain *rd;
591 struct sched_domain *sd; 617 struct sched_domain *sd;
592 618
619 unsigned char idle_at_tick;
593 /* For active balancing */ 620 /* For active balancing */
594 int active_balance; 621 int active_balance;
595 int push_cpu; 622 int push_cpu;
@@ -618,9 +645,6 @@ struct rq {
618 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ 645 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
619 646
620 /* sys_sched_yield() stats */ 647 /* sys_sched_yield() stats */
621 unsigned int yld_exp_empty;
622 unsigned int yld_act_empty;
623 unsigned int yld_both_empty;
624 unsigned int yld_count; 648 unsigned int yld_count;
625 649
626 /* schedule() stats */ 650 /* schedule() stats */
@@ -1093,7 +1117,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
1093 if (rq == this_rq()) { 1117 if (rq == this_rq()) {
1094 hrtimer_restart(timer); 1118 hrtimer_restart(timer);
1095 } else if (!rq->hrtick_csd_pending) { 1119 } else if (!rq->hrtick_csd_pending) {
1096 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd); 1120 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
1097 rq->hrtick_csd_pending = 1; 1121 rq->hrtick_csd_pending = 1;
1098 } 1122 }
1099} 1123}
@@ -1129,7 +1153,8 @@ static __init void init_hrtick(void)
1129 */ 1153 */
1130static void hrtick_start(struct rq *rq, u64 delay) 1154static void hrtick_start(struct rq *rq, u64 delay)
1131{ 1155{
1132 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); 1156 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1157 HRTIMER_MODE_REL, 0);
1133} 1158}
1134 1159
1135static inline void init_hrtick(void) 1160static inline void init_hrtick(void)
@@ -1183,10 +1208,10 @@ static void resched_task(struct task_struct *p)
1183 1208
1184 assert_spin_locked(&task_rq(p)->lock); 1209 assert_spin_locked(&task_rq(p)->lock);
1185 1210
1186 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) 1211 if (test_tsk_need_resched(p))
1187 return; 1212 return;
1188 1213
1189 set_tsk_thread_flag(p, TIF_NEED_RESCHED); 1214 set_tsk_need_resched(p);
1190 1215
1191 cpu = task_cpu(p); 1216 cpu = task_cpu(p);
1192 if (cpu == smp_processor_id()) 1217 if (cpu == smp_processor_id())
@@ -1242,7 +1267,7 @@ void wake_up_idle_cpu(int cpu)
1242 * lockless. The worst case is that the other CPU runs the 1267 * lockless. The worst case is that the other CPU runs the
1243 * idle task through an additional NOOP schedule() 1268 * idle task through an additional NOOP schedule()
1244 */ 1269 */
1245 set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED); 1270 set_tsk_need_resched(rq->idle);
1246 1271
1247 /* NEED_RESCHED must be visible before we test polling */ 1272 /* NEED_RESCHED must be visible before we test polling */
1248 smp_mb(); 1273 smp_mb();
@@ -1622,21 +1647,42 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1622 1647
1623#endif 1648#endif
1624 1649
1650#ifdef CONFIG_PREEMPT
1651
1625/* 1652/*
1626 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 1653 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1654 * way at the expense of forcing extra atomic operations in all
1655 * invocations. This assures that the double_lock is acquired using the
1656 * same underlying policy as the spinlock_t on this architecture, which
1657 * reduces latency compared to the unfair variant below. However, it
1658 * also adds more overhead and therefore may reduce throughput.
1627 */ 1659 */
1628static int double_lock_balance(struct rq *this_rq, struct rq *busiest) 1660static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1661 __releases(this_rq->lock)
1662 __acquires(busiest->lock)
1663 __acquires(this_rq->lock)
1664{
1665 spin_unlock(&this_rq->lock);
1666 double_rq_lock(this_rq, busiest);
1667
1668 return 1;
1669}
1670
1671#else
1672/*
1673 * Unfair double_lock_balance: Optimizes throughput at the expense of
1674 * latency by eliminating extra atomic operations when the locks are
1675 * already in proper order on entry. This favors lower cpu-ids and will
1676 * grant the double lock to lower cpus over higher ids under contention,
1677 * regardless of entry order into the function.
1678 */
1679static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1629 __releases(this_rq->lock) 1680 __releases(this_rq->lock)
1630 __acquires(busiest->lock) 1681 __acquires(busiest->lock)
1631 __acquires(this_rq->lock) 1682 __acquires(this_rq->lock)
1632{ 1683{
1633 int ret = 0; 1684 int ret = 0;
1634 1685
1635 if (unlikely(!irqs_disabled())) {
1636 /* printk() doesn't work good under rq->lock */
1637 spin_unlock(&this_rq->lock);
1638 BUG_ON(1);
1639 }
1640 if (unlikely(!spin_trylock(&busiest->lock))) { 1686 if (unlikely(!spin_trylock(&busiest->lock))) {
1641 if (busiest < this_rq) { 1687 if (busiest < this_rq) {
1642 spin_unlock(&this_rq->lock); 1688 spin_unlock(&this_rq->lock);
@@ -1649,6 +1695,22 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1649 return ret; 1695 return ret;
1650} 1696}
1651 1697
1698#endif /* CONFIG_PREEMPT */
1699
1700/*
1701 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1702 */
1703static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1704{
1705 if (unlikely(!irqs_disabled())) {
1706 /* printk() doesn't work good under rq->lock */
1707 spin_unlock(&this_rq->lock);
1708 BUG_ON(1);
1709 }
1710
1711 return _double_lock_balance(this_rq, busiest);
1712}
1713
1652static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) 1714static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1653 __releases(busiest->lock) 1715 __releases(busiest->lock)
1654{ 1716{
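
Under CONFIG_PREEMPT the patch drops this_rq->lock and takes both locks through double_rq_lock() for fairness; otherwise it keeps the trylock-plus-address-ordering scheme shown above. A userspace analogue of the unfair variant, sketched with POSIX mutexes (illustrative only; a non-zero return tells the caller that this_lock was released and its state must be revalidated):

#include <pthread.h>
#include <stdint.h>

static int double_lock(pthread_mutex_t *this_lock, pthread_mutex_t *busiest)
{
    int ret = 0;

    if (pthread_mutex_trylock(busiest) != 0) {
        if ((uintptr_t)busiest < (uintptr_t)this_lock) {
            /* wrong order: back off and retake both in a fixed order */
            pthread_mutex_unlock(this_lock);
            pthread_mutex_lock(busiest);
            pthread_mutex_lock(this_lock);
            ret = 1;
        } else {
            pthread_mutex_lock(busiest);
        }
    }
    return ret;
}

int main(void)
{
    pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
    pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;

    pthread_mutex_lock(&a);
    double_lock(&a, &b);    /* both locks held on return */
    pthread_mutex_unlock(&b);
    pthread_mutex_unlock(&a);
    return 0;
}
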
@@ -1717,6 +1779,9 @@ static void update_avg(u64 *avg, u64 sample)
1717 1779
1718static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1780static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1719{ 1781{
1782 if (wakeup)
1783 p->se.start_runtime = p->se.sum_exec_runtime;
1784
1720 sched_info_queued(p); 1785 sched_info_queued(p);
1721 p->sched_class->enqueue_task(rq, p, wakeup); 1786 p->sched_class->enqueue_task(rq, p, wakeup);
1722 p->se.on_rq = 1; 1787 p->se.on_rq = 1;
@@ -1724,10 +1789,15 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1724 1789
1725static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) 1790static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1726{ 1791{
1727 if (sleep && p->se.last_wakeup) { 1792 if (sleep) {
1728 update_avg(&p->se.avg_overlap, 1793 if (p->se.last_wakeup) {
1729 p->se.sum_exec_runtime - p->se.last_wakeup); 1794 update_avg(&p->se.avg_overlap,
1730 p->se.last_wakeup = 0; 1795 p->se.sum_exec_runtime - p->se.last_wakeup);
1796 p->se.last_wakeup = 0;
1797 } else {
1798 update_avg(&p->se.avg_wakeup,
1799 sysctl_sched_wakeup_granularity);
1800 }
1731 } 1801 }
1732 1802
1733 sched_info_dequeued(p); 1803 sched_info_dequeued(p);
@@ -2029,7 +2099,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2029 * it must be off the runqueue _entirely_, and not 2099 * it must be off the runqueue _entirely_, and not
2030 * preempted! 2100 * preempted!
2031 * 2101 *
2032 * So if it wa still runnable (but just not actively 2102 * So if it was still runnable (but just not actively
2033 * running right now), it's preempted, and we should 2103 * running right now), it's preempted, and we should
2034 * yield - it could be a while. 2104 * yield - it could be a while.
2035 */ 2105 */
@@ -2278,18 +2348,8 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2278 if (!sched_feat(SYNC_WAKEUPS)) 2348 if (!sched_feat(SYNC_WAKEUPS))
2279 sync = 0; 2349 sync = 0;
2280 2350
2281 if (!sync) {
2282 if (current->se.avg_overlap < sysctl_sched_migration_cost &&
2283 p->se.avg_overlap < sysctl_sched_migration_cost)
2284 sync = 1;
2285 } else {
2286 if (current->se.avg_overlap >= sysctl_sched_migration_cost ||
2287 p->se.avg_overlap >= sysctl_sched_migration_cost)
2288 sync = 0;
2289 }
2290
2291#ifdef CONFIG_SMP 2351#ifdef CONFIG_SMP
2292 if (sched_feat(LB_WAKEUP_UPDATE)) { 2352 if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
2293 struct sched_domain *sd; 2353 struct sched_domain *sd;
2294 2354
2295 this_cpu = raw_smp_processor_id(); 2355 this_cpu = raw_smp_processor_id();
@@ -2367,6 +2427,22 @@ out_activate:
2367 activate_task(rq, p, 1); 2427 activate_task(rq, p, 1);
2368 success = 1; 2428 success = 1;
2369 2429
2430 /*
2431 * Only attribute actual wakeups done by this task.
2432 */
2433 if (!in_interrupt()) {
2434 struct sched_entity *se = &current->se;
2435 u64 sample = se->sum_exec_runtime;
2436
2437 if (se->last_wakeup)
2438 sample -= se->last_wakeup;
2439 else
2440 sample -= se->start_runtime;
2441 update_avg(&se->avg_wakeup, sample);
2442
2443 se->last_wakeup = se->sum_exec_runtime;
2444 }
2445
2370out_running: 2446out_running:
2371 trace_sched_wakeup(rq, p, success); 2447 trace_sched_wakeup(rq, p, success);
2372 check_preempt_curr(rq, p, sync); 2448 check_preempt_curr(rq, p, sync);
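
The wakeup path above now measures how long the waker ran since its previous wakeup (or since it was enqueued) and folds that sample into avg_wakeup; avg_overlap is maintained with the same running-average helper. A sketch of that update; the 1/8 weighting mirrors the scheduler's update_avg(), stated here as an assumption for illustration:

#include <stdint.h>
#include <stdio.h>

static void update_avg(uint64_t *avg, uint64_t sample)
{
    int64_t diff = (int64_t)(sample - *avg);

    *avg += diff >> 3;              /* new = old + (sample - old) / 8 */
}

int main(void)
{
    uint64_t avg_wakeup = 0;
    uint64_t samples[] = { 800, 1200, 1000, 900 };  /* invented runtimes, in ns */

    for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
        update_avg(&avg_wakeup, samples[i]);
    printf("avg_wakeup = %llu ns\n", (unsigned long long)avg_wakeup);
    return 0;
}
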
@@ -2377,8 +2453,6 @@ out_running:
2377 p->sched_class->task_wake_up(rq, p); 2453 p->sched_class->task_wake_up(rq, p);
2378#endif 2454#endif
2379out: 2455out:
2380 current->se.last_wakeup = current->se.sum_exec_runtime;
2381
2382 task_rq_unlock(rq, &flags); 2456 task_rq_unlock(rq, &flags);
2383 2457
2384 return success; 2458 return success;
@@ -2408,6 +2482,8 @@ static void __sched_fork(struct task_struct *p)
2408 p->se.prev_sum_exec_runtime = 0; 2482 p->se.prev_sum_exec_runtime = 0;
2409 p->se.last_wakeup = 0; 2483 p->se.last_wakeup = 0;
2410 p->se.avg_overlap = 0; 2484 p->se.avg_overlap = 0;
2485 p->se.start_runtime = 0;
2486 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2411 2487
2412#ifdef CONFIG_SCHEDSTATS 2488#ifdef CONFIG_SCHEDSTATS
2413 p->se.wait_start = 0; 2489 p->se.wait_start = 0;
@@ -2470,6 +2546,8 @@ void sched_fork(struct task_struct *p, int clone_flags)
2470 /* Want to start with kernel preemption disabled. */ 2546 /* Want to start with kernel preemption disabled. */
2471 task_thread_info(p)->preempt_count = 1; 2547 task_thread_info(p)->preempt_count = 1;
2472#endif 2548#endif
2549 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2550
2473 put_cpu(); 2551 put_cpu();
2474} 2552}
2475 2553
@@ -2513,7 +2591,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2513#ifdef CONFIG_PREEMPT_NOTIFIERS 2591#ifdef CONFIG_PREEMPT_NOTIFIERS
2514 2592
2515/** 2593/**
2516 * preempt_notifier_register - tell me when current is being being preempted & rescheduled 2594 * preempt_notifier_register - tell me when current is being preempted & rescheduled
2517 * @notifier: notifier struct to register 2595 * @notifier: notifier struct to register
2518 */ 2596 */
2519void preempt_notifier_register(struct preempt_notifier *notifier) 2597void preempt_notifier_register(struct preempt_notifier *notifier)
@@ -2610,6 +2688,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2610{ 2688{
2611 struct mm_struct *mm = rq->prev_mm; 2689 struct mm_struct *mm = rq->prev_mm;
2612 long prev_state; 2690 long prev_state;
2691#ifdef CONFIG_SMP
2692 int post_schedule = 0;
2693
2694 if (current->sched_class->needs_post_schedule)
2695 post_schedule = current->sched_class->needs_post_schedule(rq);
2696#endif
2613 2697
2614 rq->prev_mm = NULL; 2698 rq->prev_mm = NULL;
2615 2699
@@ -2628,7 +2712,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2628 finish_arch_switch(prev); 2712 finish_arch_switch(prev);
2629 finish_lock_switch(rq, prev); 2713 finish_lock_switch(rq, prev);
2630#ifdef CONFIG_SMP 2714#ifdef CONFIG_SMP
2631 if (current->sched_class->post_schedule) 2715 if (post_schedule)
2632 current->sched_class->post_schedule(rq); 2716 current->sched_class->post_schedule(rq);
2633#endif 2717#endif
2634 2718
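
finish_task_switch() now asks the scheduling class up front, through an optional needs_post_schedule() hook, whether any post-switch work will be required, and only then calls post_schedule() once the switch has completed. A self-contained sketch of that optional-hook pattern; the hook names follow the patch, while the RT behaviour shown is just an invented example:

#include <stdbool.h>
#include <stdio.h>

struct sched_class_ops {
    bool (*needs_post_schedule)(void);  /* optional: may be NULL */
    void (*post_schedule)(void);
};

static bool rt_needs_post_schedule(void) { return true; }
static void rt_post_schedule(void) { puts("push overloaded RT tasks"); }

static void finish_switch(const struct sched_class_ops *ops)
{
    bool post = false;

    if (ops->needs_post_schedule)
        post = ops->needs_post_schedule();  /* sample before state changes */

    /* ... locks dropped, switch finished ... */

    if (post)
        ops->post_schedule();
}

int main(void)
{
    struct sched_class_ops rt = { rt_needs_post_schedule, rt_post_schedule };

    finish_switch(&rt);
    return 0;
}
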
@@ -2935,6 +3019,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2935 struct sched_domain *sd, enum cpu_idle_type idle, 3019 struct sched_domain *sd, enum cpu_idle_type idle,
2936 int *all_pinned) 3020 int *all_pinned)
2937{ 3021{
3022 int tsk_cache_hot = 0;
2938 /* 3023 /*
2939 * We do not migrate tasks that are: 3024 * We do not migrate tasks that are:
2940 * 1) running (obviously), or 3025 * 1) running (obviously), or
@@ -2958,10 +3043,11 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2958 * 2) too many balance attempts have failed. 3043 * 2) too many balance attempts have failed.
2959 */ 3044 */
2960 3045
2961 if (!task_hot(p, rq->clock, sd) || 3046 tsk_cache_hot = task_hot(p, rq->clock, sd);
2962 sd->nr_balance_failed > sd->cache_nice_tries) { 3047 if (!tsk_cache_hot ||
3048 sd->nr_balance_failed > sd->cache_nice_tries) {
2963#ifdef CONFIG_SCHEDSTATS 3049#ifdef CONFIG_SCHEDSTATS
2964 if (task_hot(p, rq->clock, sd)) { 3050 if (tsk_cache_hot) {
2965 schedstat_inc(sd, lb_hot_gained[idle]); 3051 schedstat_inc(sd, lb_hot_gained[idle]);
2966 schedstat_inc(p, se.nr_forced_migrations); 3052 schedstat_inc(p, se.nr_forced_migrations);
2967 } 3053 }
@@ -2969,7 +3055,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2969 return 1; 3055 return 1;
2970 } 3056 }
2971 3057
2972 if (task_hot(p, rq->clock, sd)) { 3058 if (tsk_cache_hot) {
2973 schedstat_inc(p, se.nr_failed_migrations_hot); 3059 schedstat_inc(p, se.nr_failed_migrations_hot);
2974 return 0; 3060 return 0;
2975 } 3061 }
@@ -3009,6 +3095,16 @@ next:
3009 pulled++; 3095 pulled++;
3010 rem_load_move -= p->se.load.weight; 3096 rem_load_move -= p->se.load.weight;
3011 3097
3098#ifdef CONFIG_PREEMPT
3099 /*
3100 * NEWIDLE balancing is a source of latency, so preemptible kernels
3101 * will stop after the first task is pulled to minimize the critical
3102 * section.
3103 */
3104 if (idle == CPU_NEWLY_IDLE)
3105 goto out;
3106#endif
3107
3012 /* 3108 /*
3013 * We only want to steal up to the prescribed amount of weighted load. 3109 * We only want to steal up to the prescribed amount of weighted load.
3014 */ 3110 */
@@ -3055,9 +3151,15 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3055 sd, idle, all_pinned, &this_best_prio); 3151 sd, idle, all_pinned, &this_best_prio);
3056 class = class->next; 3152 class = class->next;
3057 3153
3154#ifdef CONFIG_PREEMPT
3155 /*
3156 * NEWIDLE balancing is a source of latency, so preemptible
3157 * kernels will stop after the first task is pulled to minimize
3158 * the critical section.
3159 */
3058 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) 3160 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3059 break; 3161 break;
3060 3162#endif
3061 } while (class && max_load_move > total_load_moved); 3163 } while (class && max_load_move > total_load_moved);
3062 3164
3063 return total_load_moved > 0; 3165 return total_load_moved > 0;
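
Both pull loops above now stop after the first migrated task when the kernel is preemptible and the balance was triggered by a newly idle CPU, trading balance completeness for a shorter rq->lock critical section. A compile-time sketch of the same pattern; CONFIG_PREEMPT stands in for the kernel config symbol and the task accounting is reduced to a counter:

#include <stdio.h>

static int pull_tasks(int available, int wanted, int newly_idle)
{
    int pulled = 0;

    while (pulled < wanted && pulled < available) {
        pulled++;                   /* "move one task" */
#ifdef CONFIG_PREEMPT
        if (newly_idle)             /* CPU_NEWLY_IDLE analogue */
            break;                  /* keep the critical section short */
#endif
    }
    return pulled;
}

int main(void)
{
    printf("pulled %d task(s)\n", pull_tasks(8, 4, 1));
    return 0;
}
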
@@ -3107,246 +3209,480 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3107 3209
3108 return 0; 3210 return 0;
3109} 3211}
3110 3212/********** Helpers for find_busiest_group ************************/
3111/* 3213/*
3112 * find_busiest_group finds and returns the busiest CPU group within the 3214 * sd_lb_stats - Structure to store the statistics of a sched_domain
3113 * domain. It calculates and returns the amount of weighted load which 3215 * during load balancing.
3114 * should be moved to restore balance via the imbalance parameter.
3115 */ 3216 */
3116static struct sched_group * 3217struct sd_lb_stats {
3117find_busiest_group(struct sched_domain *sd, int this_cpu, 3218 struct sched_group *busiest; /* Busiest group in this sd */
3118 unsigned long *imbalance, enum cpu_idle_type idle, 3219 struct sched_group *this; /* Local group in this sd */
3119 int *sd_idle, const struct cpumask *cpus, int *balance) 3220 unsigned long total_load; /* Total load of all groups in sd */
3120{ 3221 unsigned long total_pwr; /* Total power of all groups in sd */
3121 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 3222 unsigned long avg_load; /* Average load across all groups in sd */
3122 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 3223
3123 unsigned long max_pull; 3224 /** Statistics of this group */
3124 unsigned long busiest_load_per_task, busiest_nr_running; 3225 unsigned long this_load;
3125 unsigned long this_load_per_task, this_nr_running; 3226 unsigned long this_load_per_task;
3126 int load_idx, group_imb = 0; 3227 unsigned long this_nr_running;
3228
3229 /* Statistics of the busiest group */
3230 unsigned long max_load;
3231 unsigned long busiest_load_per_task;
3232 unsigned long busiest_nr_running;
3233
3234 int group_imb; /* Is there imbalance in this sd */
3127#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 3235#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3128 int power_savings_balance = 1; 3236 int power_savings_balance; /* Is powersave balance needed for this sd */
3129 unsigned long leader_nr_running = 0, min_load_per_task = 0; 3237 struct sched_group *group_min; /* Least loaded group in sd */
3130 unsigned long min_nr_running = ULONG_MAX; 3238 struct sched_group *group_leader; /* Group which relieves group_min */
3131 struct sched_group *group_min = NULL, *group_leader = NULL; 3239 unsigned long min_load_per_task; /* load_per_task in group_min */
3240 unsigned long leader_nr_running; /* Nr running of group_leader */
3241 unsigned long min_nr_running; /* Nr running of group_min */
3132#endif 3242#endif
3243};
3133 3244
3134 max_load = this_load = total_load = total_pwr = 0; 3245/*
3135 busiest_load_per_task = busiest_nr_running = 0; 3246 * sg_lb_stats - stats of a sched_group required for load_balancing
3136 this_load_per_task = this_nr_running = 0; 3247 */
3248struct sg_lb_stats {
3249 unsigned long avg_load; /*Avg load across the CPUs of the group */
3250 unsigned long group_load; /* Total load over the CPUs of the group */
3251 unsigned long sum_nr_running; /* Nr tasks running in the group */
3252 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
3253 unsigned long group_capacity;
3254 int group_imb; /* Is there an imbalance in the group ? */
3255};
3137 3256
3138 if (idle == CPU_NOT_IDLE) 3257/**
3258 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3259 * @group: The group whose first cpu is to be returned.
3260 */
3261static inline unsigned int group_first_cpu(struct sched_group *group)
3262{
3263 return cpumask_first(sched_group_cpus(group));
3264}
3265
3266/**
3267 * get_sd_load_idx - Obtain the load index for a given sched domain.
3268 * @sd: The sched_domain whose load_idx is to be obtained.
3269 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
3270 */
3271static inline int get_sd_load_idx(struct sched_domain *sd,
3272 enum cpu_idle_type idle)
3273{
3274 int load_idx;
3275
3276 switch (idle) {
3277 case CPU_NOT_IDLE:
3139 load_idx = sd->busy_idx; 3278 load_idx = sd->busy_idx;
3140 else if (idle == CPU_NEWLY_IDLE) 3279 break;
3280
3281 case CPU_NEWLY_IDLE:
3141 load_idx = sd->newidle_idx; 3282 load_idx = sd->newidle_idx;
3142 else 3283 break;
3284 default:
3143 load_idx = sd->idle_idx; 3285 load_idx = sd->idle_idx;
3286 break;
3287 }
3144 3288
3145 do { 3289 return load_idx;
3146 unsigned long load, group_capacity, max_cpu_load, min_cpu_load; 3290}
3147 int local_group;
3148 int i;
3149 int __group_imb = 0;
3150 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3151 unsigned long sum_nr_running, sum_weighted_load;
3152 unsigned long sum_avg_load_per_task;
3153 unsigned long avg_load_per_task;
3154 3291
3155 local_group = cpumask_test_cpu(this_cpu,
3156 sched_group_cpus(group));
3157 3292
3158 if (local_group) 3293#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3159 balance_cpu = cpumask_first(sched_group_cpus(group)); 3294/**
3295 * init_sd_power_savings_stats - Initialize power savings statistics for
3296 * the given sched_domain, during load balancing.
3297 *
3298 * @sd: Sched domain whose power-savings statistics are to be initialized.
3299 * @sds: Variable containing the statistics for sd.
3300 * @idle: Idle status of the CPU at which we're performing load-balancing.
3301 */
3302static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3303 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3304{
3305 /*
3306 * Busy processors will not participate in power savings
3307 * balance.
3308 */
3309 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3310 sds->power_savings_balance = 0;
3311 else {
3312 sds->power_savings_balance = 1;
3313 sds->min_nr_running = ULONG_MAX;
3314 sds->leader_nr_running = 0;
3315 }
3316}
3160 3317
3161 /* Tally up the load of all CPUs in the group */ 3318/**
3162 sum_weighted_load = sum_nr_running = avg_load = 0; 3319 * update_sd_power_savings_stats - Update the power saving stats for a
3163 sum_avg_load_per_task = avg_load_per_task = 0; 3320 * sched_domain while performing load balancing.
3321 *
3322 * @group: sched_group belonging to the sched_domain under consideration.
3323 * @sds: Variable containing the statistics of the sched_domain
3324 * @local_group: Does group contain the CPU for which we're performing
3325 * load balancing ?
3326 * @sgs: Variable containing the statistics of the group.
3327 */
3328static inline void update_sd_power_savings_stats(struct sched_group *group,
3329 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3330{
3164 3331
3165 max_cpu_load = 0; 3332 if (!sds->power_savings_balance)
3166 min_cpu_load = ~0UL; 3333 return;
3167 3334
3168 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 3335 /*
3169 struct rq *rq = cpu_rq(i); 3336 * If the local group is idle or completely loaded
3337 * no need to do power savings balance at this domain
3338 */
3339 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3340 !sds->this_nr_running))
3341 sds->power_savings_balance = 0;
3170 3342
3171 if (*sd_idle && rq->nr_running) 3343 /*
3172 *sd_idle = 0; 3344 * If a group is already running at full capacity or idle,
3345 * don't include that group in power savings calculations
3346 */
3347 if (!sds->power_savings_balance ||
3348 sgs->sum_nr_running >= sgs->group_capacity ||
3349 !sgs->sum_nr_running)
3350 return;
3173 3351
3174 /* Bias balancing toward cpus of our domain */ 3352 /*
3175 if (local_group) { 3353 * Calculate the group which has the least non-idle load.
3176 if (idle_cpu(i) && !first_idle_cpu) { 3354 * This is the group from where we need to pick up the load
3177 first_idle_cpu = 1; 3355 * for saving power
3178 balance_cpu = i; 3356 */
3179 } 3357 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3358 (sgs->sum_nr_running == sds->min_nr_running &&
3359 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3360 sds->group_min = group;
3361 sds->min_nr_running = sgs->sum_nr_running;
3362 sds->min_load_per_task = sgs->sum_weighted_load /
3363 sgs->sum_nr_running;
3364 }
3180 3365
3181 load = target_load(i, load_idx); 3366 /*
3182 } else { 3367 * Calculate the group which is almost near its
3183 load = source_load(i, load_idx); 3368 * capacity but still has some space to pick up some load
3184 if (load > max_cpu_load) 3369 * from other group and save more power
3185 max_cpu_load = load; 3370 */
3186 if (min_cpu_load > load) 3371 if (sgs->sum_nr_running > sgs->group_capacity - 1)
3187 min_cpu_load = load; 3372 return;
3188 }
3189 3373
3190 avg_load += load; 3374 if (sgs->sum_nr_running > sds->leader_nr_running ||
3191 sum_nr_running += rq->nr_running; 3375 (sgs->sum_nr_running == sds->leader_nr_running &&
3192 sum_weighted_load += weighted_cpuload(i); 3376 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3377 sds->group_leader = group;
3378 sds->leader_nr_running = sgs->sum_nr_running;
3379 }
3380}
3193 3381
3194 sum_avg_load_per_task += cpu_avg_load_per_task(i); 3382/**
3195 } 3383 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3384 * @sds: Variable containing the statistics of the sched_domain
3385 * under consideration.
3386 * @this_cpu: Cpu at which we're currently performing load-balancing.
3387 * @imbalance: Variable to store the imbalance.
3388 *
3389 * Description:
3390 * Check if we have potential to perform some power-savings balance.
3391 * If yes, set the busiest group to be the least loaded group in the
3392 * sched_domain, so that it's CPUs can be put to idle.
3393 *
3394 * Returns 1 if there is potential to perform power-savings balance.
3395 * Else returns 0.
3396 */
3397static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3398 int this_cpu, unsigned long *imbalance)
3399{
3400 if (!sds->power_savings_balance)
3401 return 0;
3196 3402
3197 /* 3403 if (sds->this != sds->group_leader ||
3198 * First idle cpu or the first cpu(busiest) in this sched group 3404 sds->group_leader == sds->group_min)
3199 * is eligible for doing load balancing at this and above 3405 return 0;
3200 * domains. In the newly idle case, we will allow all the cpu's
3201 * to do the newly idle load balance.
3202 */
3203 if (idle != CPU_NEWLY_IDLE && local_group &&
3204 balance_cpu != this_cpu && balance) {
3205 *balance = 0;
3206 goto ret;
3207 }
3208 3406
3209 total_load += avg_load; 3407 *imbalance = sds->min_load_per_task;
3210 total_pwr += group->__cpu_power; 3408 sds->busiest = sds->group_min;
3211 3409
3212 /* Adjust by relative CPU power of the group */ 3410 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
3213 avg_load = sg_div_cpu_power(group, 3411 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
3214 avg_load * SCHED_LOAD_SCALE); 3412 group_first_cpu(sds->group_leader);
3413 }
3414
3415 return 1;
3215 3416
3417}
3418#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3419static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3420 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3421{
3422 return;
3423}
3216 3424
3217 /* 3425static inline void update_sd_power_savings_stats(struct sched_group *group,
3218 * Consider the group unbalanced when the imbalance is larger 3426 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3219 * than the average weight of two tasks. 3427{
3220 * 3428 return;
3221 * APZ: with cgroup the avg task weight can vary wildly and 3429}
3222 * might not be a suitable number - should we keep a 3430
3223 * normalized nr_running number somewhere that negates 3431static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3224 * the hierarchy? 3432 int this_cpu, unsigned long *imbalance)
3225 */ 3433{
3226 avg_load_per_task = sg_div_cpu_power(group, 3434 return 0;
3227 sum_avg_load_per_task * SCHED_LOAD_SCALE); 3435}
3436#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3437
3438
3439/**
3440 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3441 * @group: sched_group whose statistics are to be updated.
3442 * @this_cpu: Cpu for which load balance is currently performed.
3443 * @idle: Idle status of this_cpu
3444 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3445 * @sd_idle: Idle status of the sched_domain containing group.
3446 * @local_group: Does group contain this_cpu.
3447 * @cpus: Set of cpus considered for load balancing.
3448 * @balance: Should we balance.
3449 * @sgs: variable to hold the statistics for this group.
3450 */
3451static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3452 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3453 int local_group, const struct cpumask *cpus,
3454 int *balance, struct sg_lb_stats *sgs)
3455{
3456 unsigned long load, max_cpu_load, min_cpu_load;
3457 int i;
3458 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3459 unsigned long sum_avg_load_per_task;
3460 unsigned long avg_load_per_task;
3461
3462 if (local_group)
3463 balance_cpu = group_first_cpu(group);
3464
3465 /* Tally up the load of all CPUs in the group */
3466 sum_avg_load_per_task = avg_load_per_task = 0;
3467 max_cpu_load = 0;
3468 min_cpu_load = ~0UL;
3228 3469
3229 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) 3470 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3230 __group_imb = 1; 3471 struct rq *rq = cpu_rq(i);
3231 3472
3232 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; 3473 if (*sd_idle && rq->nr_running)
3474 *sd_idle = 0;
3233 3475
3476 /* Bias balancing toward cpus of our domain */
3234 if (local_group) { 3477 if (local_group) {
3235 this_load = avg_load; 3478 if (idle_cpu(i) && !first_idle_cpu) {
3236 this = group; 3479 first_idle_cpu = 1;
3237 this_nr_running = sum_nr_running; 3480 balance_cpu = i;
3238 this_load_per_task = sum_weighted_load; 3481 }
3239 } else if (avg_load > max_load && 3482
3240 (sum_nr_running > group_capacity || __group_imb)) { 3483 load = target_load(i, load_idx);
3241 max_load = avg_load; 3484 } else {
3242 busiest = group; 3485 load = source_load(i, load_idx);
3243 busiest_nr_running = sum_nr_running; 3486 if (load > max_cpu_load)
3244 busiest_load_per_task = sum_weighted_load; 3487 max_cpu_load = load;
3245 group_imb = __group_imb; 3488 if (min_cpu_load > load)
3489 min_cpu_load = load;
3246 } 3490 }
3247 3491
3248#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 3492 sgs->group_load += load;
3249 /* 3493 sgs->sum_nr_running += rq->nr_running;
3250 * Busy processors will not participate in power savings 3494 sgs->sum_weighted_load += weighted_cpuload(i);
3251 * balance.
3252 */
3253 if (idle == CPU_NOT_IDLE ||
3254 !(sd->flags & SD_POWERSAVINGS_BALANCE))
3255 goto group_next;
3256 3495
3257 /* 3496 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3258 * If the local group is idle or completely loaded 3497 }
3259 * no need to do power savings balance at this domain
3260 */
3261 if (local_group && (this_nr_running >= group_capacity ||
3262 !this_nr_running))
3263 power_savings_balance = 0;
3264 3498
3265 /* 3499 /*
3266 * If a group is already running at full capacity or idle, 3500 * First idle cpu or the first cpu(busiest) in this sched group
3267 * don't include that group in power savings calculations 3501 * is eligible for doing load balancing at this and above
3268 */ 3502 * domains. In the newly idle case, we will allow all the cpu's
3269 if (!power_savings_balance || sum_nr_running >= group_capacity 3503 * to do the newly idle load balance.
3270 || !sum_nr_running) 3504 */
3271 goto group_next; 3505 if (idle != CPU_NEWLY_IDLE && local_group &&
3506 balance_cpu != this_cpu && balance) {
3507 *balance = 0;
3508 return;
3509 }
3272 3510
3273 /* 3511 /* Adjust by relative CPU power of the group */
3274 * Calculate the group which has the least non-idle load. 3512 sgs->avg_load = sg_div_cpu_power(group,
3275 * This is the group from where we need to pick up the load 3513 sgs->group_load * SCHED_LOAD_SCALE);
3276 * for saving power
3277 */
3278 if ((sum_nr_running < min_nr_running) ||
3279 (sum_nr_running == min_nr_running &&
3280 cpumask_first(sched_group_cpus(group)) >
3281 cpumask_first(sched_group_cpus(group_min)))) {
3282 group_min = group;
3283 min_nr_running = sum_nr_running;
3284 min_load_per_task = sum_weighted_load /
3285 sum_nr_running;
3286 }
3287 3514
3288 /* 3515
3289 * Calculate the group which is almost near its 3516 /*
3290 * capacity but still has some space to pick up some load 3517 * Consider the group unbalanced when the imbalance is larger
3291 * from other group and save more power 3518 * than the average weight of two tasks.
3292 */ 3519 *
3293 if (sum_nr_running <= group_capacity - 1) { 3520 * APZ: with cgroup the avg task weight can vary wildly and
3294 if (sum_nr_running > leader_nr_running || 3521 * might not be a suitable number - should we keep a
3295 (sum_nr_running == leader_nr_running && 3522 * normalized nr_running number somewhere that negates
3296 cpumask_first(sched_group_cpus(group)) < 3523 * the hierarchy?
3297 cpumask_first(sched_group_cpus(group_leader)))) { 3524 */
3298 group_leader = group; 3525 avg_load_per_task = sg_div_cpu_power(group,
3299 leader_nr_running = sum_nr_running; 3526 sum_avg_load_per_task * SCHED_LOAD_SCALE);
3300 } 3527
3528 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3529 sgs->group_imb = 1;
3530
3531 sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
3532
3533}
3534
3535/**
3536 * update_sd_lb_stats - Update sched_group's statistics for load balancing.
3537 * @sd: sched_domain whose statistics are to be updated.
3538 * @this_cpu: Cpu for which load balance is currently performed.
3539 * @idle: Idle status of this_cpu
3540 * @sd_idle: Idle status of the sched_domain containing group.
3541 * @cpus: Set of cpus considered for load balancing.
3542 * @balance: Should we balance.
3543 * @sds: variable to hold the statistics for this sched_domain.
3544 */
3545static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3546 enum cpu_idle_type idle, int *sd_idle,
3547 const struct cpumask *cpus, int *balance,
3548 struct sd_lb_stats *sds)
3549{
3550 struct sched_group *group = sd->groups;
3551 struct sg_lb_stats sgs;
3552 int load_idx;
3553
3554 init_sd_power_savings_stats(sd, sds, idle);
3555 load_idx = get_sd_load_idx(sd, idle);
3556
3557 do {
3558 int local_group;
3559
3560 local_group = cpumask_test_cpu(this_cpu,
3561 sched_group_cpus(group));
3562 memset(&sgs, 0, sizeof(sgs));
3563 update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
3564 local_group, cpus, balance, &sgs);
3565
3566 if (local_group && balance && !(*balance))
3567 return;
3568
3569 sds->total_load += sgs.group_load;
3570 sds->total_pwr += group->__cpu_power;
3571
3572 if (local_group) {
3573 sds->this_load = sgs.avg_load;
3574 sds->this = group;
3575 sds->this_nr_running = sgs.sum_nr_running;
3576 sds->this_load_per_task = sgs.sum_weighted_load;
3577 } else if (sgs.avg_load > sds->max_load &&
3578 (sgs.sum_nr_running > sgs.group_capacity ||
3579 sgs.group_imb)) {
3580 sds->max_load = sgs.avg_load;
3581 sds->busiest = group;
3582 sds->busiest_nr_running = sgs.sum_nr_running;
3583 sds->busiest_load_per_task = sgs.sum_weighted_load;
3584 sds->group_imb = sgs.group_imb;
3301 } 3585 }
3302group_next: 3586
3303#endif 3587 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3304 group = group->next; 3588 group = group->next;
3305 } while (group != sd->groups); 3589 } while (group != sd->groups);
3306 3590
3307 if (!busiest || this_load >= max_load || busiest_nr_running == 0) 3591}
3308 goto out_balanced;
3309
3310 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
3311 3592
3312 if (this_load >= avg_load || 3593/**
3313 100*max_load <= sd->imbalance_pct*this_load) 3594 * fix_small_imbalance - Calculate the minor imbalance that exists
3314 goto out_balanced; 3595 * amongst the groups of a sched_domain, during
3596 * load balancing.
3597 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3598 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3599 * @imbalance: Variable to store the imbalance.
3600 */
3601static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3602 int this_cpu, unsigned long *imbalance)
3603{
3604 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3605 unsigned int imbn = 2;
3606
3607 if (sds->this_nr_running) {
3608 sds->this_load_per_task /= sds->this_nr_running;
3609 if (sds->busiest_load_per_task >
3610 sds->this_load_per_task)
3611 imbn = 1;
3612 } else
3613 sds->this_load_per_task =
3614 cpu_avg_load_per_task(this_cpu);
3315 3615
3316 busiest_load_per_task /= busiest_nr_running; 3616 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
3317 if (group_imb) 3617 sds->busiest_load_per_task * imbn) {
3318 busiest_load_per_task = min(busiest_load_per_task, avg_load); 3618 *imbalance = sds->busiest_load_per_task;
3619 return;
3620 }
3319 3621
3320 /* 3622 /*
3321 * We're trying to get all the cpus to the average_load, so we don't 3623 * OK, we don't have enough imbalance to justify moving tasks,
3322 * want to push ourselves above the average load, nor do we wish to 3624 * however we may be able to increase total CPU power used by
3323 * reduce the max loaded cpu below the average load, as either of these 3625 * moving them.
3324 * actions would just result in more rebalancing later, and ping-pong
3325 * tasks around. Thus we look for the minimum possible imbalance.
3326 * Negative imbalances (*we* are more loaded than anyone else) will
3327 * be counted as no imbalance for these purposes -- we can't fix that
3328 * by pulling tasks to us. Be careful of negative numbers as they'll
3329 * appear as very large values with unsigned longs.
3330 */ 3626 */
3331 if (max_load <= busiest_load_per_task)
3332 goto out_balanced;
3333 3627
3628 pwr_now += sds->busiest->__cpu_power *
3629 min(sds->busiest_load_per_task, sds->max_load);
3630 pwr_now += sds->this->__cpu_power *
3631 min(sds->this_load_per_task, sds->this_load);
3632 pwr_now /= SCHED_LOAD_SCALE;
3633
3634 /* Amount of load we'd subtract */
3635 tmp = sg_div_cpu_power(sds->busiest,
3636 sds->busiest_load_per_task * SCHED_LOAD_SCALE);
3637 if (sds->max_load > tmp)
3638 pwr_move += sds->busiest->__cpu_power *
3639 min(sds->busiest_load_per_task, sds->max_load - tmp);
3640
3641 /* Amount of load we'd add */
3642 if (sds->max_load * sds->busiest->__cpu_power <
3643 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3644 tmp = sg_div_cpu_power(sds->this,
3645 sds->max_load * sds->busiest->__cpu_power);
3646 else
3647 tmp = sg_div_cpu_power(sds->this,
3648 sds->busiest_load_per_task * SCHED_LOAD_SCALE);
3649 pwr_move += sds->this->__cpu_power *
3650 min(sds->this_load_per_task, sds->this_load + tmp);
3651 pwr_move /= SCHED_LOAD_SCALE;
3652
3653 /* Move if we gain throughput */
3654 if (pwr_move > pwr_now)
3655 *imbalance = sds->busiest_load_per_task;
3656}
3657
3658/**
3659 * calculate_imbalance - Calculate the amount of imbalance present within the
3660 * groups of a given sched_domain during load balance.
3661 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
3662 * @this_cpu: Cpu for which currently load balance is being performed.
3663 * @imbalance: The variable to store the imbalance.
3664 */
3665static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3666 unsigned long *imbalance)
3667{
3668 unsigned long max_pull;
3334 /* 3669 /*
3335 * In the presence of smp nice balancing, certain scenarios can have 3670 * In the presence of smp nice balancing, certain scenarios can have
3336 * max load less than avg load(as we skip the groups at or below 3671 * max load less than avg load(as we skip the groups at or below
3337 * its cpu_power, while calculating max_load..) 3672 * its cpu_power, while calculating max_load..)
3338 */ 3673 */
3339 if (max_load < avg_load) { 3674 if (sds->max_load < sds->avg_load) {
3340 *imbalance = 0; 3675 *imbalance = 0;
3341 goto small_imbalance; 3676 return fix_small_imbalance(sds, this_cpu, imbalance);
3342 } 3677 }
3343 3678
3344 /* Don't want to pull so many tasks that a group would go idle */ 3679 /* Don't want to pull so many tasks that a group would go idle */
3345 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); 3680 max_pull = min(sds->max_load - sds->avg_load,
3681 sds->max_load - sds->busiest_load_per_task);
3346 3682
3347 /* How much load to actually move to equalise the imbalance */ 3683 /* How much load to actually move to equalise the imbalance */
3348 *imbalance = min(max_pull * busiest->__cpu_power, 3684 *imbalance = min(max_pull * sds->busiest->__cpu_power,
3349 (avg_load - this_load) * this->__cpu_power) 3685 (sds->avg_load - sds->this_load) * sds->this->__cpu_power)
3350 / SCHED_LOAD_SCALE; 3686 / SCHED_LOAD_SCALE;
3351 3687
3352 /* 3688 /*
@@ -3355,78 +3691,110 @@ group_next:
3355 * a think about bumping its value to force at least one task to be 3691 * a think about bumping its value to force at least one task to be
3356 * moved 3692 * moved
3357 */ 3693 */
3358 if (*imbalance < busiest_load_per_task) { 3694 if (*imbalance < sds->busiest_load_per_task)
3359 unsigned long tmp, pwr_now, pwr_move; 3695 return fix_small_imbalance(sds, this_cpu, imbalance);
3360 unsigned int imbn;
3361
3362small_imbalance:
3363 pwr_move = pwr_now = 0;
3364 imbn = 2;
3365 if (this_nr_running) {
3366 this_load_per_task /= this_nr_running;
3367 if (busiest_load_per_task > this_load_per_task)
3368 imbn = 1;
3369 } else
3370 this_load_per_task = cpu_avg_load_per_task(this_cpu);
3371 3696
3372 if (max_load - this_load + busiest_load_per_task >= 3697}
3373 busiest_load_per_task * imbn) { 3698/******* find_busiest_group() helpers end here *********************/
3374 *imbalance = busiest_load_per_task;
3375 return busiest;
3376 }
3377 3699
3378 /* 3700/**
3379 * OK, we don't have enough imbalance to justify moving tasks, 3701 * find_busiest_group - Returns the busiest group within the sched_domain
3380 * however we may be able to increase total CPU power used by 3702 * if there is an imbalance. If there isn't an imbalance, and
3381 * moving them. 3703 * the user has opted for power-savings, it returns a group whose
3382 */ 3704 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
3705 * such a group exists.
3706 *
3707 * Also calculates the amount of weighted load which should be moved
3708 * to restore balance.
3709 *
3710 * @sd: The sched_domain whose busiest group is to be returned.
3711 * @this_cpu: The cpu for which load balancing is currently being performed.
3712 * @imbalance: Variable which stores amount of weighted load which should
3713 * be moved to restore balance/put a group to idle.
3714 * @idle: The idle status of this_cpu.
3715 * @sd_idle: The idleness of sd
3716 * @cpus: The set of CPUs under consideration for load-balancing.
3717 * @balance: Pointer to a variable indicating if this_cpu
3718 * is the appropriate cpu to perform load balancing at this_level.
3719 *
3720 * Returns: - the busiest group if imbalance exists.
3721 * - If no imbalance and user has opted for power-savings balance,
3722 * return the least loaded group whose CPUs can be
3723 * put to idle by rebalancing its tasks onto our group.
3724 */
3725static struct sched_group *
3726find_busiest_group(struct sched_domain *sd, int this_cpu,
3727 unsigned long *imbalance, enum cpu_idle_type idle,
3728 int *sd_idle, const struct cpumask *cpus, int *balance)
3729{
3730 struct sd_lb_stats sds;
3383 3731
3384 pwr_now += busiest->__cpu_power * 3732 memset(&sds, 0, sizeof(sds));
3385 min(busiest_load_per_task, max_load);
3386 pwr_now += this->__cpu_power *
3387 min(this_load_per_task, this_load);
3388 pwr_now /= SCHED_LOAD_SCALE;
3389
3390 /* Amount of load we'd subtract */
3391 tmp = sg_div_cpu_power(busiest,
3392 busiest_load_per_task * SCHED_LOAD_SCALE);
3393 if (max_load > tmp)
3394 pwr_move += busiest->__cpu_power *
3395 min(busiest_load_per_task, max_load - tmp);
3396
3397 /* Amount of load we'd add */
3398 if (max_load * busiest->__cpu_power <
3399 busiest_load_per_task * SCHED_LOAD_SCALE)
3400 tmp = sg_div_cpu_power(this,
3401 max_load * busiest->__cpu_power);
3402 else
3403 tmp = sg_div_cpu_power(this,
3404 busiest_load_per_task * SCHED_LOAD_SCALE);
3405 pwr_move += this->__cpu_power *
3406 min(this_load_per_task, this_load + tmp);
3407 pwr_move /= SCHED_LOAD_SCALE;
3408 3733
3409 /* Move if we gain throughput */ 3734 /*
3410 if (pwr_move > pwr_now) 3735 * Compute the various statistics relavent for load balancing at
3411 *imbalance = busiest_load_per_task; 3736 * this level.
3412 } 3737 */
3738 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
3739 balance, &sds);
3740
3741 /* Cases where imbalance does not exist from POV of this_cpu */
3742 /* 1) this_cpu is not the appropriate cpu to perform load balancing
3743 * at this level.
3744 * 2) There is no busy sibling group to pull from.
3745 * 3) This group is the busiest group.
3746 * 4) This group is more busy than the avg busieness at this
3747 * sched_domain.
3748 * 5) The imbalance is within the specified limit.
3749 * 6) Any rebalance would lead to ping-pong
3750 */
3751 if (balance && !(*balance))
3752 goto ret;
3413 3753
3414 return busiest; 3754 if (!sds.busiest || sds.busiest_nr_running == 0)
3755 goto out_balanced;
3415 3756
3416out_balanced: 3757 if (sds.this_load >= sds.max_load)
3417#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 3758 goto out_balanced;
3418 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3419 goto ret;
3420 3759
3421 if (this == group_leader && group_leader != group_min) { 3760 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
3422 *imbalance = min_load_per_task; 3761
3423 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { 3762 if (sds.this_load >= sds.avg_load)
3424 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = 3763 goto out_balanced;
3425 cpumask_first(sched_group_cpus(group_leader)); 3764
3426 } 3765 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
3427 return group_min; 3766 goto out_balanced;
3428 } 3767
3429#endif 3768 sds.busiest_load_per_task /= sds.busiest_nr_running;
3769 if (sds.group_imb)
3770 sds.busiest_load_per_task =
3771 min(sds.busiest_load_per_task, sds.avg_load);
3772
3773 /*
3774 * We're trying to get all the cpus to the average_load, so we don't
3775 * want to push ourselves above the average load, nor do we wish to
3776 * reduce the max loaded cpu below the average load, as either of these
3777 * actions would just result in more rebalancing later, and ping-pong
3778 * tasks around. Thus we look for the minimum possible imbalance.
3779 * Negative imbalances (*we* are more loaded than anyone else) will
3780 * be counted as no imbalance for these purposes -- we can't fix that
3781 * by pulling tasks to us. Be careful of negative numbers as they'll
3782 * appear as very large values with unsigned longs.
3783 */
3784 if (sds.max_load <= sds.busiest_load_per_task)
3785 goto out_balanced;
3786
3787 /* Looks like there is an imbalance. Compute it */
3788 calculate_imbalance(&sds, this_cpu, imbalance);
3789 return sds.busiest;
3790
3791out_balanced:
3792 /*
3793 * There is no obvious imbalance. But check if we can do some balancing
3794 * to save power.
3795 */
3796 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
3797 return sds.busiest;
3430ret: 3798ret:
3431 *imbalance = 0; 3799 *imbalance = 0;
3432 return NULL; 3800 return NULL;
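
The find_busiest_group() rewrite above splits one very long function into per-group statistics gathering (update_sg_lb_stats), domain-level aggregation (update_sd_lb_stats) and separate imbalance helpers, all driven by the new sd_lb_stats/sg_lb_stats structures. A much-reduced sketch of that shape, with invented load figures and no CPU-power scaling:

#include <stdio.h>

struct sg_stats {                   /* loosely follows sg_lb_stats */
    unsigned long group_load;
    unsigned long avg_load;
};

struct sd_stats {                   /* loosely follows sd_lb_stats */
    unsigned long total_load;
    unsigned long max_load;
    int busiest;                    /* index of the busiest group */
};

static void update_sg_stats(struct sg_stats *sgs,
                            const unsigned long *cpu_load, int ncpus)
{
    for (int i = 0; i < ncpus; i++)
        sgs->group_load += cpu_load[i];
    sgs->avg_load = sgs->group_load / ncpus;
}

int main(void)
{
    unsigned long groups[2][2] = { { 100, 300 }, { 900, 700 } };
    struct sd_stats sds = { 0 };

    for (int g = 0; g < 2; g++) {
        struct sg_stats sgs = { 0 };

        update_sg_stats(&sgs, groups[g], 2);
        sds.total_load += sgs.group_load;
        if (sgs.avg_load > sds.max_load) {
            sds.max_load = sgs.avg_load;
            sds.busiest = g;
        }
    }
    printf("busiest group %d (avg %lu), domain total %lu\n",
           sds.busiest, sds.max_load, sds.total_load);
    return 0;
}
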
@@ -3470,19 +3838,23 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3470 */ 3838 */
3471#define MAX_PINNED_INTERVAL 512 3839#define MAX_PINNED_INTERVAL 512
3472 3840
3841/* Working cpumask for load_balance and load_balance_newidle. */
3842static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
3843
3473/* 3844/*
3474 * Check this_cpu to ensure it is balanced within domain. Attempt to move 3845 * Check this_cpu to ensure it is balanced within domain. Attempt to move
3475 * tasks if there is an imbalance. 3846 * tasks if there is an imbalance.
3476 */ 3847 */
3477static int load_balance(int this_cpu, struct rq *this_rq, 3848static int load_balance(int this_cpu, struct rq *this_rq,
3478 struct sched_domain *sd, enum cpu_idle_type idle, 3849 struct sched_domain *sd, enum cpu_idle_type idle,
3479 int *balance, struct cpumask *cpus) 3850 int *balance)
3480{ 3851{
3481 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 3852 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
3482 struct sched_group *group; 3853 struct sched_group *group;
3483 unsigned long imbalance; 3854 unsigned long imbalance;
3484 struct rq *busiest; 3855 struct rq *busiest;
3485 unsigned long flags; 3856 unsigned long flags;
3857 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
3486 3858
3487 cpumask_setall(cpus); 3859 cpumask_setall(cpus);
3488 3860
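
load_balance() and load_balance_newidle() now borrow a per-CPU scratch cpumask (load_balance_tmpmask) instead of allocating one with GFP_ATOMIC on every invocation, which also removes a failure path from the balancing code. A userspace analogue sketched with a per-thread static buffer; sizes and names are illustrative:

#include <stdio.h>
#include <string.h>

#define NR_CPUS    64
#define MASK_WORDS (NR_CPUS / (8 * (int)sizeof(unsigned long)))

/* one scratch mask per thread, set up once instead of allocated per call */
static _Thread_local unsigned long balance_tmpmask[MASK_WORDS];

static unsigned long *get_balance_mask(void)
{
    /* cpumask_setall() analogue: consider every CPU to begin with */
    memset(balance_tmpmask, 0xff, sizeof(balance_tmpmask));
    return balance_tmpmask;
}

int main(void)
{
    unsigned long *cpus = get_balance_mask();

    printf("first word of the scratch mask: %#lx\n", cpus[0]);
    return 0;
}
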
@@ -3637,8 +4009,7 @@ out:
3637 * this_rq is locked. 4009 * this_rq is locked.
3638 */ 4010 */
3639static int 4011static int
3640load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, 4012load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
3641 struct cpumask *cpus)
3642{ 4013{
3643 struct sched_group *group; 4014 struct sched_group *group;
3644 struct rq *busiest = NULL; 4015 struct rq *busiest = NULL;
@@ -3646,6 +4017,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3646 int ld_moved = 0; 4017 int ld_moved = 0;
3647 int sd_idle = 0; 4018 int sd_idle = 0;
3648 int all_pinned = 0; 4019 int all_pinned = 0;
4020 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
3649 4021
3650 cpumask_setall(cpus); 4022 cpumask_setall(cpus);
3651 4023
@@ -3786,10 +4158,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3786 struct sched_domain *sd; 4158 struct sched_domain *sd;
3787 int pulled_task = 0; 4159 int pulled_task = 0;
3788 unsigned long next_balance = jiffies + HZ; 4160 unsigned long next_balance = jiffies + HZ;
3789 cpumask_var_t tmpmask;
3790
3791 if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
3792 return;
3793 4161
3794 for_each_domain(this_cpu, sd) { 4162 for_each_domain(this_cpu, sd) {
3795 unsigned long interval; 4163 unsigned long interval;
@@ -3800,7 +4168,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3800 if (sd->flags & SD_BALANCE_NEWIDLE) 4168 if (sd->flags & SD_BALANCE_NEWIDLE)
3801 /* If we've pulled tasks over stop searching: */ 4169 /* If we've pulled tasks over stop searching: */
3802 pulled_task = load_balance_newidle(this_cpu, this_rq, 4170 pulled_task = load_balance_newidle(this_cpu, this_rq,
3803 sd, tmpmask); 4171 sd);
3804 4172
3805 interval = msecs_to_jiffies(sd->balance_interval); 4173 interval = msecs_to_jiffies(sd->balance_interval);
3806 if (time_after(next_balance, sd->last_balance + interval)) 4174 if (time_after(next_balance, sd->last_balance + interval))
@@ -3815,7 +4183,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3815 */ 4183 */
3816 this_rq->next_balance = next_balance; 4184 this_rq->next_balance = next_balance;
3817 } 4185 }
3818 free_cpumask_var(tmpmask);
3819} 4186}
3820 4187
3821/* 4188/*
@@ -3902,19 +4269,24 @@ int select_nohz_load_balancer(int stop_tick)
3902 int cpu = smp_processor_id(); 4269 int cpu = smp_processor_id();
3903 4270
3904 if (stop_tick) { 4271 if (stop_tick) {
3905 cpumask_set_cpu(cpu, nohz.cpu_mask);
3906 cpu_rq(cpu)->in_nohz_recently = 1; 4272 cpu_rq(cpu)->in_nohz_recently = 1;
3907 4273
3908 /* 4274 if (!cpu_active(cpu)) {
3909 * If we are going offline and still the leader, give up! 4275 if (atomic_read(&nohz.load_balancer) != cpu)
3910 */ 4276 return 0;
3911 if (!cpu_active(cpu) && 4277
3912 atomic_read(&nohz.load_balancer) == cpu) { 4278 /*
4279 * If we are going offline and still the leader,
4280 * give up!
4281 */
3913 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 4282 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3914 BUG(); 4283 BUG();
4284
3915 return 0; 4285 return 0;
3916 } 4286 }
3917 4287
4288 cpumask_set_cpu(cpu, nohz.cpu_mask);
4289
3918 /* time for ilb owner also to sleep */ 4290 /* time for ilb owner also to sleep */
3919 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { 4291 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3920 if (atomic_read(&nohz.load_balancer) == cpu) 4292 if (atomic_read(&nohz.load_balancer) == cpu)
@@ -3960,11 +4332,6 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3960 unsigned long next_balance = jiffies + 60*HZ; 4332 unsigned long next_balance = jiffies + 60*HZ;
3961 int update_next_balance = 0; 4333 int update_next_balance = 0;
3962 int need_serialize; 4334 int need_serialize;
3963 cpumask_var_t tmp;
3964
3965 /* Fails alloc? Rebalancing probably not a priority right now. */
3966 if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
3967 return;
3968 4335
3969 for_each_domain(cpu, sd) { 4336 for_each_domain(cpu, sd) {
3970 if (!(sd->flags & SD_LOAD_BALANCE)) 4337 if (!(sd->flags & SD_LOAD_BALANCE))
@@ -3989,7 +4356,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3989 } 4356 }
3990 4357
3991 if (time_after_eq(jiffies, sd->last_balance + interval)) { 4358 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3992 if (load_balance(cpu, rq, sd, idle, &balance, tmp)) { 4359 if (load_balance(cpu, rq, sd, idle, &balance)) {
3993 /* 4360 /*
3994 * We've pulled tasks over so either we're no 4361 * We've pulled tasks over so either we're no
3995 * longer idle, or one of our SMT siblings is 4362 * longer idle, or one of our SMT siblings is
@@ -4023,8 +4390,6 @@ out:
4023 */ 4390 */
4024 if (likely(update_next_balance)) 4391 if (likely(update_next_balance))
4025 rq->next_balance = next_balance; 4392 rq->next_balance = next_balance;
4026
4027 free_cpumask_var(tmp);
4028} 4393}
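The two hunks above drop the GFP_ATOMIC cpumask allocation from the balancing hot path: idle_balance() and rebalance_domains() now borrow a preallocated per-CPU scratch mask (load_balance_tmpmask, visible at the top of this section and carved out in sched_init() further down) instead of calling alloc_cpumask_var() on every pass. A minimal userspace sketch of the same idea, with a plain static array standing in for per-CPU data and made-up names (count_balance_candidates, load_balance_scratch) that are not from the kernel:

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS    8
#define MASK_WORDS 1
typedef struct { uint64_t bits[MASK_WORDS]; } cpumask_t;

/* One scratch mask per "CPU", set up once, instead of an allocation
 * on every balancing pass (the role load_balance_tmpmask plays above). */
static cpumask_t load_balance_scratch[NR_CPUS];

/* Hot path: intersect two masks into this CPU's scratch mask, count bits. */
static int count_balance_candidates(int this_cpu, const cpumask_t *allowed,
				    const cpumask_t *online)
{
	cpumask_t *scratch = &load_balance_scratch[this_cpu];
	int bit, n = 0;

	for (int w = 0; w < MASK_WORDS; w++)
		scratch->bits[w] = allowed->bits[w] & online->bits[w];

	for (bit = 0; bit < 64 * MASK_WORDS; bit++)
		if (scratch->bits[bit / 64] & (1ULL << (bit % 64)))
			n++;
	return n;
}

int main(void)
{
	cpumask_t allowed = { { 0x0f } };	/* CPUs 0-3 */
	cpumask_t online  = { { 0x0a } };	/* CPUs 1 and 3 */

	printf("candidates seen by cpu0: %d\n",
	       count_balance_candidates(0, &allowed, &online));
	return 0;
}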
4029 4394
4030/* 4395/*
@@ -4074,6 +4439,11 @@ static void run_rebalance_domains(struct softirq_action *h)
4074#endif 4439#endif
4075} 4440}
4076 4441
4442static inline int on_null_domain(int cpu)
4443{
4444 return !rcu_dereference(cpu_rq(cpu)->sd);
4445}
4446
4077/* 4447/*
4078 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 4448 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4079 * 4449 *
@@ -4131,7 +4501,9 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4131 cpumask_test_cpu(cpu, nohz.cpu_mask)) 4501 cpumask_test_cpu(cpu, nohz.cpu_mask))
4132 return; 4502 return;
4133#endif 4503#endif
4134 if (time_after_eq(jiffies, rq->next_balance)) 4504 /* Don't need to rebalance while attached to NULL domain */
4505 if (time_after_eq(jiffies, rq->next_balance) &&
4506 likely(!on_null_domain(cpu)))
4135 raise_softirq(SCHED_SOFTIRQ); 4507 raise_softirq(SCHED_SOFTIRQ);
4136} 4508}
4137 4509
@@ -4474,10 +4846,7 @@ void scheduler_tick(void)
4474#endif 4846#endif
4475} 4847}
4476 4848
4477#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 4849unsigned long get_parent_ip(unsigned long addr)
4478 defined(CONFIG_PREEMPT_TRACER))
4479
4480static inline unsigned long get_parent_ip(unsigned long addr)
4481{ 4850{
4482 if (in_lock_functions(addr)) { 4851 if (in_lock_functions(addr)) {
4483 addr = CALLER_ADDR2; 4852 addr = CALLER_ADDR2;
@@ -4487,6 +4856,9 @@ static inline unsigned long get_parent_ip(unsigned long addr)
4487 return addr; 4856 return addr;
4488} 4857}
4489 4858
4859#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4860 defined(CONFIG_PREEMPT_TRACER))
4861
4490void __kprobes add_preempt_count(int val) 4862void __kprobes add_preempt_count(int val)
4491{ 4863{
4492#ifdef CONFIG_DEBUG_PREEMPT 4864#ifdef CONFIG_DEBUG_PREEMPT
@@ -4578,11 +4950,33 @@ static inline void schedule_debug(struct task_struct *prev)
4578#endif 4950#endif
4579} 4951}
4580 4952
4953static void put_prev_task(struct rq *rq, struct task_struct *prev)
4954{
4955 if (prev->state == TASK_RUNNING) {
4956 u64 runtime = prev->se.sum_exec_runtime;
4957
4958 runtime -= prev->se.prev_sum_exec_runtime;
4959 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
4960
4961 /*
4962 * In order to avoid avg_overlap growing stale when we are
4963 * indeed overlapping and hence not getting put to sleep, grow
4964 * the avg_overlap on preemption.
4965 *
4966 * We use the average preemption runtime because that
4967 * correlates to the amount of cache footprint a task can
4968 * build up.
4969 */
4970 update_avg(&prev->se.avg_overlap, runtime);
4971 }
4972 prev->sched_class->put_prev_task(rq, prev);
4973}
4974
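The new put_prev_task() wrapper above folds the time a task ran before being preempted into se.avg_overlap, clamping each sample to 2*sysctl_sched_migration_cost so a single long run cannot blow the average up. A standalone sketch of that clamped running-average update; the body of update_avg() here is an assumption (a 1/8-of-the-difference step, written as a division to stay portable), not copied from the file, and the numbers are purely illustrative:

#include <stdint.h>
#include <stdio.h>

/* Assumed shape of the averaging helper: move 1/8 of the way to the sample. */
static void update_avg(uint64_t *avg, uint64_t sample)
{
	int64_t diff = (int64_t)sample - (int64_t)*avg;
	*avg += diff / 8;
}

int main(void)
{
	/* Illustrative values, in nanoseconds. */
	const uint64_t migration_cost = 500000;	/* stand-in for sysctl_sched_migration_cost */
	uint64_t avg_overlap = 0;
	uint64_t runtimes[] = { 100000, 250000, 5000000, 80000 };

	for (unsigned i = 0; i < sizeof(runtimes) / sizeof(runtimes[0]); i++) {
		uint64_t runtime = runtimes[i];

		/* Clamp exactly as in the hunk: at most 2*migration_cost per sample. */
		if (runtime > 2 * migration_cost)
			runtime = 2 * migration_cost;

		update_avg(&avg_overlap, runtime);
		printf("sample %llu -> avg_overlap %llu\n",
		       (unsigned long long)runtimes[i],
		       (unsigned long long)avg_overlap);
	}
	return 0;
}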
4581/* 4975/*
4582 * Pick up the highest-prio task: 4976 * Pick up the highest-prio task:
4583 */ 4977 */
4584static inline struct task_struct * 4978static inline struct task_struct *
4585pick_next_task(struct rq *rq, struct task_struct *prev) 4979pick_next_task(struct rq *rq)
4586{ 4980{
4587 const struct sched_class *class; 4981 const struct sched_class *class;
4588 struct task_struct *p; 4982 struct task_struct *p;
@@ -4654,8 +5048,8 @@ need_resched_nonpreemptible:
4654 if (unlikely(!rq->nr_running)) 5048 if (unlikely(!rq->nr_running))
4655 idle_balance(cpu, rq); 5049 idle_balance(cpu, rq);
4656 5050
4657 prev->sched_class->put_prev_task(rq, prev); 5051 put_prev_task(rq, prev);
4658 next = pick_next_task(rq, prev); 5052 next = pick_next_task(rq);
4659 5053
4660 if (likely(prev != next)) { 5054 if (likely(prev != next)) {
4661 sched_info_switch(prev, next); 5055 sched_info_switch(prev, next);
@@ -4777,7 +5171,7 @@ asmlinkage void __sched preempt_schedule(void)
4777 * between schedule and now. 5171 * between schedule and now.
4778 */ 5172 */
4779 barrier(); 5173 barrier();
4780 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); 5174 } while (need_resched());
4781} 5175}
4782EXPORT_SYMBOL(preempt_schedule); 5176EXPORT_SYMBOL(preempt_schedule);
4783 5177
@@ -4806,7 +5200,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
4806 * between schedule and now. 5200 * between schedule and now.
4807 */ 5201 */
4808 barrier(); 5202 barrier();
4809 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); 5203 } while (need_resched());
4810} 5204}
4811 5205
4812#endif /* CONFIG_PREEMPT */ 5206#endif /* CONFIG_PREEMPT */
@@ -4867,11 +5261,17 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
4867 __wake_up_common(q, mode, 1, 0, NULL); 5261 __wake_up_common(q, mode, 1, 0, NULL);
4868} 5262}
4869 5263
5264void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
5265{
5266 __wake_up_common(q, mode, 1, 0, key);
5267}
5268
4870/** 5269/**
4871 * __wake_up_sync - wake up threads blocked on a waitqueue. 5270 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
4872 * @q: the waitqueue 5271 * @q: the waitqueue
4873 * @mode: which threads 5272 * @mode: which threads
4874 * @nr_exclusive: how many wake-one or wake-many threads to wake up 5273 * @nr_exclusive: how many wake-one or wake-many threads to wake up
5274 * @key: opaque value to be passed to wakeup targets
4875 * 5275 *
4876 * The sync wakeup differs that the waker knows that it will schedule 5276 * The sync wakeup differs that the waker knows that it will schedule
4877 * away soon, so while the target thread will be woken up, it will not 5277 * away soon, so while the target thread will be woken up, it will not
@@ -4880,8 +5280,8 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
4880 * 5280 *
4881 * On UP it can prevent extra preemption. 5281 * On UP it can prevent extra preemption.
4882 */ 5282 */
4883void 5283void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
4884__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 5284 int nr_exclusive, void *key)
4885{ 5285{
4886 unsigned long flags; 5286 unsigned long flags;
4887 int sync = 1; 5287 int sync = 1;
@@ -4893,9 +5293,18 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4893 sync = 0; 5293 sync = 0;
4894 5294
4895 spin_lock_irqsave(&q->lock, flags); 5295 spin_lock_irqsave(&q->lock, flags);
4896 __wake_up_common(q, mode, nr_exclusive, sync, NULL); 5296 __wake_up_common(q, mode, nr_exclusive, sync, key);
4897 spin_unlock_irqrestore(&q->lock, flags); 5297 spin_unlock_irqrestore(&q->lock, flags);
4898} 5298}
5299EXPORT_SYMBOL_GPL(__wake_up_sync_key);
5300
5301/*
5302 * __wake_up_sync - see __wake_up_sync_key()
5303 */
5304void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
5305{
5306 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
5307}
4899EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 5308EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
4900 5309
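The waitqueue hunks above extend the wakeup path with a key argument by adding __wake_up_locked_key() and __wake_up_sync_key(), and keep the old __wake_up_sync() as a thin wrapper that passes NULL so existing callers are untouched. The same "add a *_key variant, keep the old name as a wrapper" shape in plain C, with made-up names (wake_all, wake_all_key) used only for illustration:

#include <stdio.h>

/* New, more general entry point: callers may pass an opaque key. */
static void wake_all_key(const char *queue, void *key)
{
	printf("waking waiters on %s (key=%p)\n", queue, key);
}

/* Old entry point kept as a thin wrapper so existing callers don't change. */
static void wake_all(const char *queue)
{
	wake_all_key(queue, NULL);
}

int main(void)
{
	int cookie = 42;

	wake_all("legacy-caller-queue");		/* old API, key defaults to NULL */
	wake_all_key("new-caller-queue", &cookie);	/* new API with a key */
	return 0;
}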
4901/** 5310/**
@@ -5280,7 +5689,7 @@ SYSCALL_DEFINE1(nice, int, increment)
5280 if (increment > 40) 5689 if (increment > 40)
5281 increment = 40; 5690 increment = 40;
5282 5691
5283 nice = PRIO_TO_NICE(current->static_prio) + increment; 5692 nice = TASK_NICE(current) + increment;
5284 if (nice < -20) 5693 if (nice < -20)
5285 nice = -20; 5694 nice = -20;
5286 if (nice > 19) 5695 if (nice > 19)
@@ -6079,12 +6488,7 @@ void sched_show_task(struct task_struct *p)
6079 printk(KERN_CONT " %016lx ", thread_saved_pc(p)); 6488 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
6080#endif 6489#endif
6081#ifdef CONFIG_DEBUG_STACK_USAGE 6490#ifdef CONFIG_DEBUG_STACK_USAGE
6082 { 6491 free = stack_not_used(p);
6083 unsigned long *n = end_of_stack(p);
6084 while (!*n)
6085 n++;
6086 free = (unsigned long)n - (unsigned long)end_of_stack(p);
6087 }
6088#endif 6492#endif
6089 printk(KERN_CONT "%5lu %5d %6d\n", free, 6493 printk(KERN_CONT "%5lu %5d %6d\n", free,
6090 task_pid_nr(p), task_pid_nr(p->real_parent)); 6494 task_pid_nr(p), task_pid_nr(p->real_parent));
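The block removed from sched_show_task() above is the open-coded "walk up from the end of the stack until the first non-zero word" estimate of unused stack, now hidden behind stack_not_used(). A userspace analogue of that scan over a zero-initialized buffer; this is not the kernel helper, just the same idea with a bound added for safety:

#include <stddef.h>
#include <stdio.h>
#include <string.h>

/* Count the zeroed bytes at the low end of a "stack" buffer, mirroring the
 * removed scan (which walked from end_of_stack() until a non-zero word). */
static size_t unused_stack_bytes(const unsigned long *stack_end, size_t words)
{
	const unsigned long *n = stack_end;

	while ((size_t)(n - stack_end) < words && !*n)
		n++;
	return (size_t)((const char *)n - (const char *)stack_end);
}

int main(void)
{
	unsigned long stack[32] = { 0 };

	/* Pretend the upper half of the stack has been written to. */
	memset(&stack[16], 0xab, 16 * sizeof(stack[0]));

	printf("unused bytes at stack end: %zu\n",
	       unused_stack_bytes(stack, sizeof(stack) / sizeof(stack[0])));
	return 0;
}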
@@ -6558,7 +6962,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
6558 if (!rq->nr_running) 6962 if (!rq->nr_running)
6559 break; 6963 break;
6560 update_rq_clock(rq); 6964 update_rq_clock(rq);
6561 next = pick_next_task(rq, rq->curr); 6965 next = pick_next_task(rq);
6562 if (!next) 6966 if (!next)
6563 break; 6967 break;
6564 next->sched_class->put_prev_task(rq, next); 6968 next->sched_class->put_prev_task(rq, next);
@@ -7080,20 +7484,26 @@ static void free_rootdomain(struct root_domain *rd)
7080 7484
7081static void rq_attach_root(struct rq *rq, struct root_domain *rd) 7485static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7082{ 7486{
7487 struct root_domain *old_rd = NULL;
7083 unsigned long flags; 7488 unsigned long flags;
7084 7489
7085 spin_lock_irqsave(&rq->lock, flags); 7490 spin_lock_irqsave(&rq->lock, flags);
7086 7491
7087 if (rq->rd) { 7492 if (rq->rd) {
7088 struct root_domain *old_rd = rq->rd; 7493 old_rd = rq->rd;
7089 7494
7090 if (cpumask_test_cpu(rq->cpu, old_rd->online)) 7495 if (cpumask_test_cpu(rq->cpu, old_rd->online))
7091 set_rq_offline(rq); 7496 set_rq_offline(rq);
7092 7497
7093 cpumask_clear_cpu(rq->cpu, old_rd->span); 7498 cpumask_clear_cpu(rq->cpu, old_rd->span);
7094 7499
7095 if (atomic_dec_and_test(&old_rd->refcount)) 7500 /*
7096 free_rootdomain(old_rd); 7501 * If we don't want to free the old_rd yet then
7502 * set old_rd to NULL to skip the freeing later
7503 * in this function:
7504 */
7505 if (!atomic_dec_and_test(&old_rd->refcount))
7506 old_rd = NULL;
7097 } 7507 }
7098 7508
7099 atomic_inc(&rd->refcount); 7509 atomic_inc(&rd->refcount);
@@ -7104,6 +7514,9 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7104 set_rq_online(rq); 7514 set_rq_online(rq);
7105 7515
7106 spin_unlock_irqrestore(&rq->lock, flags); 7516 spin_unlock_irqrestore(&rq->lock, flags);
7517
7518 if (old_rd)
7519 free_rootdomain(old_rd);
7107} 7520}
7108 7521
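As the added comment explains, rq_attach_root() now remembers the old root domain and calls free_rootdomain() only after dropping rq->lock, instead of freeing while the runqueue lock is still held. A sketch of that "drop the last reference under the lock, free after unlocking" pattern in portable C, with a pthread mutex standing in for the spinlock and a plain int for the atomic refcount (types and names here are illustrative, not the kernel's):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct root_domain {
	int refcount;		/* protected by the owning lock in this sketch */
	/* ... spans, masks ... */
};

struct rq {
	pthread_mutex_t lock;
	struct root_domain *rd;
};

static void attach_root(struct rq *rq, struct root_domain *new_rd)
{
	struct root_domain *old_rd = NULL;

	pthread_mutex_lock(&rq->lock);

	if (rq->rd) {
		old_rd = rq->rd;
		/* Drop our reference; keep old_rd only if it hit zero. */
		if (--old_rd->refcount != 0)
			old_rd = NULL;
	}

	new_rd->refcount++;
	rq->rd = new_rd;

	pthread_mutex_unlock(&rq->lock);

	/* Free outside the lock, matching the reordering in the hunk above. */
	if (old_rd) {
		printf("freeing old root domain %p\n", (void *)old_rd);
		free(old_rd);
	}
}

int main(void)
{
	struct rq rq = { .lock = PTHREAD_MUTEX_INITIALIZER, .rd = NULL };
	struct root_domain *rd1 = calloc(1, sizeof(*rd1));
	struct root_domain *rd2 = calloc(1, sizeof(*rd2));

	attach_root(&rq, rd1);	/* nothing to free yet */
	attach_root(&rq, rd2);	/* rd1's last reference drops, freed after unlock */

	free(rd2);		/* demo cleanup only */
	return 0;
}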
7109static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) 7522static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
@@ -7381,7 +7794,7 @@ cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
7381{ 7794{
7382 int group; 7795 int group;
7383 7796
7384 cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); 7797 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
7385 group = cpumask_first(mask); 7798 group = cpumask_first(mask);
7386 if (sg) 7799 if (sg)
7387 *sg = &per_cpu(sched_group_core, group).sg; 7800 *sg = &per_cpu(sched_group_core, group).sg;
@@ -7410,7 +7823,7 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
7410 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); 7823 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
7411 group = cpumask_first(mask); 7824 group = cpumask_first(mask);
7412#elif defined(CONFIG_SCHED_SMT) 7825#elif defined(CONFIG_SCHED_SMT)
7413 cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); 7826 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
7414 group = cpumask_first(mask); 7827 group = cpumask_first(mask);
7415#else 7828#else
7416 group = cpu; 7829 group = cpu;
@@ -7753,7 +8166,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
7753 SD_INIT(sd, SIBLING); 8166 SD_INIT(sd, SIBLING);
7754 set_domain_attribute(sd, attr); 8167 set_domain_attribute(sd, attr);
7755 cpumask_and(sched_domain_span(sd), 8168 cpumask_and(sched_domain_span(sd),
7756 &per_cpu(cpu_sibling_map, i), cpu_map); 8169 topology_thread_cpumask(i), cpu_map);
7757 sd->parent = p; 8170 sd->parent = p;
7758 p->child = sd; 8171 p->child = sd;
7759 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); 8172 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7764,7 +8177,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
7764 /* Set up CPU (sibling) groups */ 8177 /* Set up CPU (sibling) groups */
7765 for_each_cpu(i, cpu_map) { 8178 for_each_cpu(i, cpu_map) {
7766 cpumask_and(this_sibling_map, 8179 cpumask_and(this_sibling_map,
7767 &per_cpu(cpu_sibling_map, i), cpu_map); 8180 topology_thread_cpumask(i), cpu_map);
7768 if (i != cpumask_first(this_sibling_map)) 8181 if (i != cpumask_first(this_sibling_map))
7769 continue; 8182 continue;
7770 8183
@@ -8345,11 +8758,15 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8345 __set_bit(MAX_RT_PRIO, array->bitmap); 8758 __set_bit(MAX_RT_PRIO, array->bitmap);
8346 8759
8347#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 8760#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
8348 rt_rq->highest_prio = MAX_RT_PRIO; 8761 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8762#ifdef CONFIG_SMP
8763 rt_rq->highest_prio.next = MAX_RT_PRIO;
8764#endif
8349#endif 8765#endif
8350#ifdef CONFIG_SMP 8766#ifdef CONFIG_SMP
8351 rt_rq->rt_nr_migratory = 0; 8767 rt_rq->rt_nr_migratory = 0;
8352 rt_rq->overloaded = 0; 8768 rt_rq->overloaded = 0;
8769 plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
8353#endif 8770#endif
8354 8771
8355 rt_rq->rt_time = 0; 8772 rt_rq->rt_time = 0;
@@ -8436,6 +8853,9 @@ void __init sched_init(void)
8436#ifdef CONFIG_USER_SCHED 8853#ifdef CONFIG_USER_SCHED
8437 alloc_size *= 2; 8854 alloc_size *= 2;
8438#endif 8855#endif
8856#ifdef CONFIG_CPUMASK_OFFSTACK
8857 alloc_size += num_possible_cpus() * cpumask_size();
8858#endif
8439 /* 8859 /*
8440 * As sched_init() is called before page_alloc is setup, 8860 * As sched_init() is called before page_alloc is setup,
8441 * we use alloc_bootmem(). 8861 * we use alloc_bootmem().
@@ -8473,6 +8893,12 @@ void __init sched_init(void)
8473 ptr += nr_cpu_ids * sizeof(void **); 8893 ptr += nr_cpu_ids * sizeof(void **);
8474#endif /* CONFIG_USER_SCHED */ 8894#endif /* CONFIG_USER_SCHED */
8475#endif /* CONFIG_RT_GROUP_SCHED */ 8895#endif /* CONFIG_RT_GROUP_SCHED */
8896#ifdef CONFIG_CPUMASK_OFFSTACK
8897 for_each_possible_cpu(i) {
8898 per_cpu(load_balance_tmpmask, i) = (void *)ptr;
8899 ptr += cpumask_size();
8900 }
8901#endif /* CONFIG_CPUMASK_OFFSTACK */
8476 } 8902 }
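With CONFIG_CPUMASK_OFFSTACK, sched_init() now grows the single early allocation by num_possible_cpus() * cpumask_size() and hands each CPU its load_balance_tmpmask slice out of that one block. A minimal sketch of carving one allocation into fixed-size per-CPU slices; calloc stands in for the boot-time allocator and the sizes are illustrative:

#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS      8
#define CPUMASK_SIZE 16	/* stand-in for cpumask_size() */

static void *per_cpu_tmpmask[NR_CPUS];

int main(void)
{
	size_t alloc_size = 0;
	char *ptr;
	int cpu;

	/* Grow one allocation by a slice per possible CPU... */
	alloc_size += NR_CPUS * CPUMASK_SIZE;

	ptr = calloc(1, alloc_size);
	if (!ptr)
		return 1;

	/* ...then carve it up, advancing the cursor one slice at a time. */
	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		per_cpu_tmpmask[cpu] = ptr;
		ptr += CPUMASK_SIZE;
	}

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu %d scratch mask at %p\n", cpu, per_cpu_tmpmask[cpu]);

	free(per_cpu_tmpmask[0]);	/* slice 0 is the base of the block */
	return 0;
}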
8477 8903
8478#ifdef CONFIG_SMP 8904#ifdef CONFIG_SMP
@@ -9351,6 +9777,16 @@ static int sched_rt_global_constraints(void)
9351 9777
9352 return ret; 9778 return ret;
9353} 9779}
9780
9781int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
9782{
9783 /* Don't accept realtime tasks when there is no way for them to run */
9784 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
9785 return 0;
9786
9787 return 1;
9788}
9789
9354#else /* !CONFIG_RT_GROUP_SCHED */ 9790#else /* !CONFIG_RT_GROUP_SCHED */
9355static int sched_rt_global_constraints(void) 9791static int sched_rt_global_constraints(void)
9356{ 9792{
@@ -9444,8 +9880,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
9444 struct task_struct *tsk) 9880 struct task_struct *tsk)
9445{ 9881{
9446#ifdef CONFIG_RT_GROUP_SCHED 9882#ifdef CONFIG_RT_GROUP_SCHED
9447 /* Don't accept realtime tasks when there is no way for them to run */ 9883 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
9448 if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0)
9449 return -EINVAL; 9884 return -EINVAL;
9450#else 9885#else
9451 /* We don't support RT-tasks being in separate groups */ 9886 /* We don't support RT-tasks being in separate groups */
@@ -9624,7 +10059,7 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9624 10059
9625static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) 10060static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
9626{ 10061{
9627 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); 10062 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9628 u64 data; 10063 u64 data;
9629 10064
9630#ifndef CONFIG_64BIT 10065#ifndef CONFIG_64BIT
@@ -9643,7 +10078,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
9643 10078
9644static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) 10079static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
9645{ 10080{
9646 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); 10081 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9647 10082
9648#ifndef CONFIG_64BIT 10083#ifndef CONFIG_64BIT
9649 /* 10084 /*
@@ -9754,7 +10189,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9754 struct cpuacct *ca; 10189 struct cpuacct *ca;
9755 int cpu; 10190 int cpu;
9756 10191
9757 if (!cpuacct_subsys.active) 10192 if (unlikely(!cpuacct_subsys.active))
9758 return; 10193 return;
9759 10194
9760 cpu = task_cpu(tsk); 10195 cpu = task_cpu(tsk);
@@ -9764,7 +10199,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9764 ca = task_ca(tsk); 10199 ca = task_ca(tsk);
9765 10200
9766 for (; ca; ca = ca->parent) { 10201 for (; ca; ca = ca->parent) {
9767 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); 10202 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9768 *cpuusage += cputime; 10203 *cpuusage += cputime;
9769 } 10204 }
9770 10205
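The cpuacct hunks above switch to per_cpu_ptr() and mark the inactive-subsystem check unlikely(), but the shape worth noting is the charging loop itself: the used CPU time is added to this group's per-CPU counter and then to every ancestor's by walking ca->parent. A userspace sketch of that "charge up the hierarchy" walk; the structure and names below are illustrative, not the kernel's cpuacct types:

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS 4

struct acct_group {
	const char *name;
	struct acct_group *parent;
	uint64_t cpuusage[NR_CPUS];	/* one counter per CPU */
};

/* Add cputime to this group and every ancestor, on the given CPU. */
static void charge(struct acct_group *ca, int cpu, uint64_t cputime)
{
	for (; ca; ca = ca->parent)
		ca->cpuusage[cpu] += cputime;
}

int main(void)
{
	struct acct_group root  = { .name = "root",  .parent = NULL };
	struct acct_group child = { .name = "child", .parent = &root };

	charge(&child, 1, 1000);	/* 1000 units of CPU time on CPU 1 */
	charge(&child, 1, 500);

	printf("%s cpu1: %llu\n", child.name, (unsigned long long)child.cpuusage[1]);
	printf("%s  cpu1: %llu\n", root.name, (unsigned long long)root.cpuusage[1]);
	return 0;
}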