Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c | 1212
1 file changed, 856 insertions(+), 356 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 8ee437a5ec1d..6cc1fd5d5072 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -223,7 +223,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
223{ 223{
224 ktime_t now; 224 ktime_t now;
225 225
226 if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) 226 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
227 return; 227 return;
228 228
229 if (hrtimer_active(&rt_b->rt_period_timer)) 229 if (hrtimer_active(&rt_b->rt_period_timer))
@@ -231,13 +231,20 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
231 231
232 spin_lock(&rt_b->rt_runtime_lock); 232 spin_lock(&rt_b->rt_runtime_lock);
233 for (;;) { 233 for (;;) {
234 unsigned long delta;
235 ktime_t soft, hard;
236
234 if (hrtimer_active(&rt_b->rt_period_timer)) 237 if (hrtimer_active(&rt_b->rt_period_timer))
235 break; 238 break;
236 239
237 now = hrtimer_cb_get_time(&rt_b->rt_period_timer); 240 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
238 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); 241 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
239 hrtimer_start_expires(&rt_b->rt_period_timer, 242
240 HRTIMER_MODE_ABS); 243 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
244 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
245 delta = ktime_to_ns(ktime_sub(hard, soft));
246 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
247 HRTIMER_MODE_ABS, 0);
241 } 248 }
242 spin_unlock(&rt_b->rt_runtime_lock); 249 spin_unlock(&rt_b->rt_runtime_lock);
243} 250}
@@ -331,6 +338,13 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
331 */ 338 */
332static DEFINE_SPINLOCK(task_group_lock); 339static DEFINE_SPINLOCK(task_group_lock);
333 340
341#ifdef CONFIG_SMP
342static int root_task_group_empty(void)
343{
344 return list_empty(&root_task_group.children);
345}
346#endif
347
334#ifdef CONFIG_FAIR_GROUP_SCHED 348#ifdef CONFIG_FAIR_GROUP_SCHED
335#ifdef CONFIG_USER_SCHED 349#ifdef CONFIG_USER_SCHED
336# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 350# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
@@ -391,6 +405,13 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
391 405
392#else 406#else
393 407
408#ifdef CONFIG_SMP
409static int root_task_group_empty(void)
410{
411 return 1;
412}
413#endif
414
394static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 415static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
395static inline struct task_group *task_group(struct task_struct *p) 416static inline struct task_group *task_group(struct task_struct *p)
396{ 417{
@@ -467,11 +488,17 @@ struct rt_rq {
467 struct rt_prio_array active; 488 struct rt_prio_array active;
468 unsigned long rt_nr_running; 489 unsigned long rt_nr_running;
469#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 490#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
470 int highest_prio; /* highest queued rt task prio */ 491 struct {
492 int curr; /* highest queued rt task prio */
493#ifdef CONFIG_SMP
494 int next; /* next highest */
495#endif
496 } highest_prio;
471#endif 497#endif
472#ifdef CONFIG_SMP 498#ifdef CONFIG_SMP
473 unsigned long rt_nr_migratory; 499 unsigned long rt_nr_migratory;
474 int overloaded; 500 int overloaded;
501 struct plist_head pushable_tasks;
475#endif 502#endif
476 int rt_throttled; 503 int rt_throttled;
477 u64 rt_time; 504 u64 rt_time;
@@ -549,7 +576,6 @@ struct rq {
549 unsigned long nr_running; 576 unsigned long nr_running;
550 #define CPU_LOAD_IDX_MAX 5 577 #define CPU_LOAD_IDX_MAX 5
551 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 578 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
552 unsigned char idle_at_tick;
553#ifdef CONFIG_NO_HZ 579#ifdef CONFIG_NO_HZ
554 unsigned long last_tick_seen; 580 unsigned long last_tick_seen;
555 unsigned char in_nohz_recently; 581 unsigned char in_nohz_recently;
@@ -590,6 +616,7 @@ struct rq {
590 struct root_domain *rd; 616 struct root_domain *rd;
591 struct sched_domain *sd; 617 struct sched_domain *sd;
592 618
619 unsigned char idle_at_tick;
593 /* For active balancing */ 620 /* For active balancing */
594 int active_balance; 621 int active_balance;
595 int push_cpu; 622 int push_cpu;
@@ -618,9 +645,6 @@ struct rq {
618 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ 645 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
619 646
620 /* sys_sched_yield() stats */ 647 /* sys_sched_yield() stats */
621 unsigned int yld_exp_empty;
622 unsigned int yld_act_empty;
623 unsigned int yld_both_empty;
624 unsigned int yld_count; 648 unsigned int yld_count;
625 649
626 /* schedule() stats */ 650 /* schedule() stats */
@@ -1093,7 +1117,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
1093 if (rq == this_rq()) { 1117 if (rq == this_rq()) {
1094 hrtimer_restart(timer); 1118 hrtimer_restart(timer);
1095 } else if (!rq->hrtick_csd_pending) { 1119 } else if (!rq->hrtick_csd_pending) {
1096 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd); 1120 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
1097 rq->hrtick_csd_pending = 1; 1121 rq->hrtick_csd_pending = 1;
1098 } 1122 }
1099} 1123}
@@ -1129,7 +1153,8 @@ static __init void init_hrtick(void)
1129 */ 1153 */
1130static void hrtick_start(struct rq *rq, u64 delay) 1154static void hrtick_start(struct rq *rq, u64 delay)
1131{ 1155{
1132 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL); 1156 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1157 HRTIMER_MODE_REL, 0);
1133} 1158}
1134 1159
1135static inline void init_hrtick(void) 1160static inline void init_hrtick(void)
@@ -1183,10 +1208,10 @@ static void resched_task(struct task_struct *p)
1183 1208
1184 assert_spin_locked(&task_rq(p)->lock); 1209 assert_spin_locked(&task_rq(p)->lock);
1185 1210
1186 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) 1211 if (test_tsk_need_resched(p))
1187 return; 1212 return;
1188 1213
1189 set_tsk_thread_flag(p, TIF_NEED_RESCHED); 1214 set_tsk_need_resched(p);
1190 1215
1191 cpu = task_cpu(p); 1216 cpu = task_cpu(p);
1192 if (cpu == smp_processor_id()) 1217 if (cpu == smp_processor_id())
@@ -1242,7 +1267,7 @@ void wake_up_idle_cpu(int cpu)
1242 * lockless. The worst case is that the other CPU runs the 1267 * lockless. The worst case is that the other CPU runs the
1243 * idle task through an additional NOOP schedule() 1268 * idle task through an additional NOOP schedule()
1244 */ 1269 */
1245 set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED); 1270 set_tsk_need_resched(rq->idle);
1246 1271
1247 /* NEED_RESCHED must be visible before we test polling */ 1272 /* NEED_RESCHED must be visible before we test polling */
1248 smp_mb(); 1273 smp_mb();
@@ -1610,21 +1635,42 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1610 1635
1611#endif 1636#endif
1612 1637
1638#ifdef CONFIG_PREEMPT
1639
1613/* 1640/*
1614 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 1641 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1642 * way at the expense of forcing extra atomic operations in all
1643 * invocations. This assures that the double_lock is acquired using the
1644 * same underlying policy as the spinlock_t on this architecture, which
1645 * reduces latency compared to the unfair variant below. However, it
1646 * also adds more overhead and therefore may reduce throughput.
1615 */ 1647 */
1616static int double_lock_balance(struct rq *this_rq, struct rq *busiest) 1648static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1649 __releases(this_rq->lock)
1650 __acquires(busiest->lock)
1651 __acquires(this_rq->lock)
1652{
1653 spin_unlock(&this_rq->lock);
1654 double_rq_lock(this_rq, busiest);
1655
1656 return 1;
1657}
1658
1659#else
1660/*
1661 * Unfair double_lock_balance: Optimizes throughput at the expense of
1662 * latency by eliminating extra atomic operations when the locks are
1663 * already in proper order on entry. This favors lower cpu-ids and will
1664 * grant the double lock to lower cpus over higher ids under contention,
1665 * regardless of entry order into the function.
1666 */
1667static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1617 __releases(this_rq->lock) 1668 __releases(this_rq->lock)
1618 __acquires(busiest->lock) 1669 __acquires(busiest->lock)
1619 __acquires(this_rq->lock) 1670 __acquires(this_rq->lock)
1620{ 1671{
1621 int ret = 0; 1672 int ret = 0;
1622 1673
1623 if (unlikely(!irqs_disabled())) {
1624 /* printk() doesn't work good under rq->lock */
1625 spin_unlock(&this_rq->lock);
1626 BUG_ON(1);
1627 }
1628 if (unlikely(!spin_trylock(&busiest->lock))) { 1674 if (unlikely(!spin_trylock(&busiest->lock))) {
1629 if (busiest < this_rq) { 1675 if (busiest < this_rq) {
1630 spin_unlock(&this_rq->lock); 1676 spin_unlock(&this_rq->lock);
@@ -1637,6 +1683,22 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1637 return ret; 1683 return ret;
1638} 1684}
1639 1685
1686#endif /* CONFIG_PREEMPT */
1687
1688/*
1689 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1690 */
1691static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1692{
1693 if (unlikely(!irqs_disabled())) {
1694 /* printk() doesn't work good under rq->lock */
1695 spin_unlock(&this_rq->lock);
1696 BUG_ON(1);
1697 }
1698
1699 return _double_lock_balance(this_rq, busiest);
1700}
1701
1640static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) 1702static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1641 __releases(busiest->lock) 1703 __releases(busiest->lock)
1642{ 1704{
@@ -1705,6 +1767,9 @@ static void update_avg(u64 *avg, u64 sample)
1705 1767
1706static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1768static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1707{ 1769{
1770 if (wakeup)
1771 p->se.start_runtime = p->se.sum_exec_runtime;
1772
1708 sched_info_queued(p); 1773 sched_info_queued(p);
1709 p->sched_class->enqueue_task(rq, p, wakeup); 1774 p->sched_class->enqueue_task(rq, p, wakeup);
1710 p->se.on_rq = 1; 1775 p->se.on_rq = 1;
@@ -1712,10 +1777,15 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1712 1777
1713static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) 1778static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1714{ 1779{
1715 if (sleep && p->se.last_wakeup) { 1780 if (sleep) {
1716 update_avg(&p->se.avg_overlap, 1781 if (p->se.last_wakeup) {
1717 p->se.sum_exec_runtime - p->se.last_wakeup); 1782 update_avg(&p->se.avg_overlap,
1718 p->se.last_wakeup = 0; 1783 p->se.sum_exec_runtime - p->se.last_wakeup);
1784 p->se.last_wakeup = 0;
1785 } else {
1786 update_avg(&p->se.avg_wakeup,
1787 sysctl_sched_wakeup_granularity);
1788 }
1719 } 1789 }
1720 1790
1721 sched_info_dequeued(p); 1791 sched_info_dequeued(p);
@@ -2017,7 +2087,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2017 * it must be off the runqueue _entirely_, and not 2087 * it must be off the runqueue _entirely_, and not
2018 * preempted! 2088 * preempted!
2019 * 2089 *
2020 * So if it wa still runnable (but just not actively 2090 * So if it was still runnable (but just not actively
2021 * running right now), it's preempted, and we should 2091 * running right now), it's preempted, and we should
2022 * yield - it could be a while. 2092 * yield - it could be a while.
2023 */ 2093 */
@@ -2266,18 +2336,8 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2266 if (!sched_feat(SYNC_WAKEUPS)) 2336 if (!sched_feat(SYNC_WAKEUPS))
2267 sync = 0; 2337 sync = 0;
2268 2338
2269 if (!sync) {
2270 if (current->se.avg_overlap < sysctl_sched_migration_cost &&
2271 p->se.avg_overlap < sysctl_sched_migration_cost)
2272 sync = 1;
2273 } else {
2274 if (current->se.avg_overlap >= sysctl_sched_migration_cost ||
2275 p->se.avg_overlap >= sysctl_sched_migration_cost)
2276 sync = 0;
2277 }
2278
2279#ifdef CONFIG_SMP 2339#ifdef CONFIG_SMP
2280 if (sched_feat(LB_WAKEUP_UPDATE)) { 2340 if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
2281 struct sched_domain *sd; 2341 struct sched_domain *sd;
2282 2342
2283 this_cpu = raw_smp_processor_id(); 2343 this_cpu = raw_smp_processor_id();
@@ -2355,6 +2415,22 @@ out_activate:
2355 activate_task(rq, p, 1); 2415 activate_task(rq, p, 1);
2356 success = 1; 2416 success = 1;
2357 2417
2418 /*
2419 * Only attribute actual wakeups done by this task.
2420 */
2421 if (!in_interrupt()) {
2422 struct sched_entity *se = &current->se;
2423 u64 sample = se->sum_exec_runtime;
2424
2425 if (se->last_wakeup)
2426 sample -= se->last_wakeup;
2427 else
2428 sample -= se->start_runtime;
2429 update_avg(&se->avg_wakeup, sample);
2430
2431 se->last_wakeup = se->sum_exec_runtime;
2432 }
2433
2358out_running: 2434out_running:
2359 trace_sched_wakeup(rq, p, success); 2435 trace_sched_wakeup(rq, p, success);
2360 check_preempt_curr(rq, p, sync); 2436 check_preempt_curr(rq, p, sync);
@@ -2365,8 +2441,6 @@ out_running:
2365 p->sched_class->task_wake_up(rq, p); 2441 p->sched_class->task_wake_up(rq, p);
2366#endif 2442#endif
2367out: 2443out:
2368 current->se.last_wakeup = current->se.sum_exec_runtime;
2369
2370 task_rq_unlock(rq, &flags); 2444 task_rq_unlock(rq, &flags);
2371 2445
2372 return success; 2446 return success;
@@ -2396,6 +2470,8 @@ static void __sched_fork(struct task_struct *p)
2396 p->se.prev_sum_exec_runtime = 0; 2470 p->se.prev_sum_exec_runtime = 0;
2397 p->se.last_wakeup = 0; 2471 p->se.last_wakeup = 0;
2398 p->se.avg_overlap = 0; 2472 p->se.avg_overlap = 0;
2473 p->se.start_runtime = 0;
2474 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2399 2475
2400#ifdef CONFIG_SCHEDSTATS 2476#ifdef CONFIG_SCHEDSTATS
2401 p->se.wait_start = 0; 2477 p->se.wait_start = 0;
@@ -2458,6 +2534,8 @@ void sched_fork(struct task_struct *p, int clone_flags)
2458 /* Want to start with kernel preemption disabled. */ 2534 /* Want to start with kernel preemption disabled. */
2459 task_thread_info(p)->preempt_count = 1; 2535 task_thread_info(p)->preempt_count = 1;
2460#endif 2536#endif
2537 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2538
2461 put_cpu(); 2539 put_cpu();
2462} 2540}
2463 2541
@@ -2501,7 +2579,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2501#ifdef CONFIG_PREEMPT_NOTIFIERS 2579#ifdef CONFIG_PREEMPT_NOTIFIERS
2502 2580
2503/** 2581/**
2504 * preempt_notifier_register - tell me when current is being being preempted & rescheduled 2582 * preempt_notifier_register - tell me when current is being preempted & rescheduled
2505 * @notifier: notifier struct to register 2583 * @notifier: notifier struct to register
2506 */ 2584 */
2507void preempt_notifier_register(struct preempt_notifier *notifier) 2585void preempt_notifier_register(struct preempt_notifier *notifier)
@@ -2598,6 +2676,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2598{ 2676{
2599 struct mm_struct *mm = rq->prev_mm; 2677 struct mm_struct *mm = rq->prev_mm;
2600 long prev_state; 2678 long prev_state;
2679#ifdef CONFIG_SMP
2680 int post_schedule = 0;
2681
2682 if (current->sched_class->needs_post_schedule)
2683 post_schedule = current->sched_class->needs_post_schedule(rq);
2684#endif
2601 2685
2602 rq->prev_mm = NULL; 2686 rq->prev_mm = NULL;
2603 2687
@@ -2616,7 +2700,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2616 finish_arch_switch(prev); 2700 finish_arch_switch(prev);
2617 finish_lock_switch(rq, prev); 2701 finish_lock_switch(rq, prev);
2618#ifdef CONFIG_SMP 2702#ifdef CONFIG_SMP
2619 if (current->sched_class->post_schedule) 2703 if (post_schedule)
2620 current->sched_class->post_schedule(rq); 2704 current->sched_class->post_schedule(rq);
2621#endif 2705#endif
2622 2706
@@ -2923,6 +3007,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2923 struct sched_domain *sd, enum cpu_idle_type idle, 3007 struct sched_domain *sd, enum cpu_idle_type idle,
2924 int *all_pinned) 3008 int *all_pinned)
2925{ 3009{
3010 int tsk_cache_hot = 0;
2926 /* 3011 /*
2927 * We do not migrate tasks that are: 3012 * We do not migrate tasks that are:
2928 * 1) running (obviously), or 3013 * 1) running (obviously), or
@@ -2946,10 +3031,11 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2946 * 2) too many balance attempts have failed. 3031 * 2) too many balance attempts have failed.
2947 */ 3032 */
2948 3033
2949 if (!task_hot(p, rq->clock, sd) || 3034 tsk_cache_hot = task_hot(p, rq->clock, sd);
2950 sd->nr_balance_failed > sd->cache_nice_tries) { 3035 if (!tsk_cache_hot ||
3036 sd->nr_balance_failed > sd->cache_nice_tries) {
2951#ifdef CONFIG_SCHEDSTATS 3037#ifdef CONFIG_SCHEDSTATS
2952 if (task_hot(p, rq->clock, sd)) { 3038 if (tsk_cache_hot) {
2953 schedstat_inc(sd, lb_hot_gained[idle]); 3039 schedstat_inc(sd, lb_hot_gained[idle]);
2954 schedstat_inc(p, se.nr_forced_migrations); 3040 schedstat_inc(p, se.nr_forced_migrations);
2955 } 3041 }
@@ -2957,7 +3043,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2957 return 1; 3043 return 1;
2958 } 3044 }
2959 3045
2960 if (task_hot(p, rq->clock, sd)) { 3046 if (tsk_cache_hot) {
2961 schedstat_inc(p, se.nr_failed_migrations_hot); 3047 schedstat_inc(p, se.nr_failed_migrations_hot);
2962 return 0; 3048 return 0;
2963 } 3049 }
@@ -2997,6 +3083,16 @@ next:
2997 pulled++; 3083 pulled++;
2998 rem_load_move -= p->se.load.weight; 3084 rem_load_move -= p->se.load.weight;
2999 3085
3086#ifdef CONFIG_PREEMPT
3087 /*
3088 * NEWIDLE balancing is a source of latency, so preemptible kernels
3089 * will stop after the first task is pulled to minimize the critical
3090 * section.
3091 */
3092 if (idle == CPU_NEWLY_IDLE)
3093 goto out;
3094#endif
3095
3000 /* 3096 /*
3001 * We only want to steal up to the prescribed amount of weighted load. 3097 * We only want to steal up to the prescribed amount of weighted load.
3002 */ 3098 */
@@ -3043,9 +3139,15 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3043 sd, idle, all_pinned, &this_best_prio); 3139 sd, idle, all_pinned, &this_best_prio);
3044 class = class->next; 3140 class = class->next;
3045 3141
3142#ifdef CONFIG_PREEMPT
3143 /*
3144 * NEWIDLE balancing is a source of latency, so preemptible
3145 * kernels will stop after the first task is pulled to minimize
3146 * the critical section.
3147 */
3046 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) 3148 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3047 break; 3149 break;
3048 3150#endif
3049 } while (class && max_load_move > total_load_moved); 3151 } while (class && max_load_move > total_load_moved);
3050 3152
3051 return total_load_moved > 0; 3153 return total_load_moved > 0;
@@ -3095,246 +3197,480 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3095 3197
3096 return 0; 3198 return 0;
3097} 3199}
3098 3200/********** Helpers for find_busiest_group ************************/
3099/* 3201/*
3100 * find_busiest_group finds and returns the busiest CPU group within the 3202 * sd_lb_stats - Structure to store the statistics of a sched_domain
3101 * domain. It calculates and returns the amount of weighted load which 3203 * during load balancing.
3102 * should be moved to restore balance via the imbalance parameter.
3103 */ 3204 */
3104static struct sched_group * 3205struct sd_lb_stats {
3105find_busiest_group(struct sched_domain *sd, int this_cpu, 3206 struct sched_group *busiest; /* Busiest group in this sd */
3106 unsigned long *imbalance, enum cpu_idle_type idle, 3207 struct sched_group *this; /* Local group in this sd */
3107 int *sd_idle, const struct cpumask *cpus, int *balance) 3208 unsigned long total_load; /* Total load of all groups in sd */
3108{ 3209 unsigned long total_pwr; /* Total power of all groups in sd */
3109 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 3210 unsigned long avg_load; /* Average load across all groups in sd */
3110 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 3211
3111 unsigned long max_pull; 3212 /** Statistics of this group */
3112 unsigned long busiest_load_per_task, busiest_nr_running; 3213 unsigned long this_load;
3113 unsigned long this_load_per_task, this_nr_running; 3214 unsigned long this_load_per_task;
3114 int load_idx, group_imb = 0; 3215 unsigned long this_nr_running;
3216
3217 /* Statistics of the busiest group */
3218 unsigned long max_load;
3219 unsigned long busiest_load_per_task;
3220 unsigned long busiest_nr_running;
3221
3222 int group_imb; /* Is there imbalance in this sd */
3115#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 3223#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3116 int power_savings_balance = 1; 3224 int power_savings_balance; /* Is powersave balance needed for this sd */
3117 unsigned long leader_nr_running = 0, min_load_per_task = 0; 3225 struct sched_group *group_min; /* Least loaded group in sd */
3118 unsigned long min_nr_running = ULONG_MAX; 3226 struct sched_group *group_leader; /* Group which relieves group_min */
3119 struct sched_group *group_min = NULL, *group_leader = NULL; 3227 unsigned long min_load_per_task; /* load_per_task in group_min */
3228 unsigned long leader_nr_running; /* Nr running of group_leader */
3229 unsigned long min_nr_running; /* Nr running of group_min */
3120#endif 3230#endif
3231};
3121 3232
3122 max_load = this_load = total_load = total_pwr = 0; 3233/*
3123 busiest_load_per_task = busiest_nr_running = 0; 3234 * sg_lb_stats - stats of a sched_group required for load_balancing
3124 this_load_per_task = this_nr_running = 0; 3235 */
3236struct sg_lb_stats {
3237 unsigned long avg_load; /*Avg load across the CPUs of the group */
3238 unsigned long group_load; /* Total load over the CPUs of the group */
3239 unsigned long sum_nr_running; /* Nr tasks running in the group */
3240 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
3241 unsigned long group_capacity;
3242 int group_imb; /* Is there an imbalance in the group ? */
3243};
3125 3244
3126 if (idle == CPU_NOT_IDLE) 3245/**
3246 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3247 * @group: The group whose first cpu is to be returned.
3248 */
3249static inline unsigned int group_first_cpu(struct sched_group *group)
3250{
3251 return cpumask_first(sched_group_cpus(group));
3252}
3253
3254/**
3255 * get_sd_load_idx - Obtain the load index for a given sched domain.
3256 * @sd: The sched_domain whose load_idx is to be obtained.
3257 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
3258 */
3259static inline int get_sd_load_idx(struct sched_domain *sd,
3260 enum cpu_idle_type idle)
3261{
3262 int load_idx;
3263
3264 switch (idle) {
3265 case CPU_NOT_IDLE:
3127 load_idx = sd->busy_idx; 3266 load_idx = sd->busy_idx;
3128 else if (idle == CPU_NEWLY_IDLE) 3267 break;
3268
3269 case CPU_NEWLY_IDLE:
3129 load_idx = sd->newidle_idx; 3270 load_idx = sd->newidle_idx;
3130 else 3271 break;
3272 default:
3131 load_idx = sd->idle_idx; 3273 load_idx = sd->idle_idx;
3274 break;
3275 }
3132 3276
3133 do { 3277 return load_idx;
3134 unsigned long load, group_capacity, max_cpu_load, min_cpu_load; 3278}
3135 int local_group;
3136 int i;
3137 int __group_imb = 0;
3138 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3139 unsigned long sum_nr_running, sum_weighted_load;
3140 unsigned long sum_avg_load_per_task;
3141 unsigned long avg_load_per_task;
3142 3279
3143 local_group = cpumask_test_cpu(this_cpu,
3144 sched_group_cpus(group));
3145 3280
3146 if (local_group) 3281#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3147 balance_cpu = cpumask_first(sched_group_cpus(group)); 3282/**
3283 * init_sd_power_savings_stats - Initialize power savings statistics for
3284 * the given sched_domain, during load balancing.
3285 *
3286 * @sd: Sched domain whose power-savings statistics are to be initialized.
3287 * @sds: Variable containing the statistics for sd.
3288 * @idle: Idle status of the CPU at which we're performing load-balancing.
3289 */
3290static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3291 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3292{
3293 /*
3294 * Busy processors will not participate in power savings
3295 * balance.
3296 */
3297 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3298 sds->power_savings_balance = 0;
3299 else {
3300 sds->power_savings_balance = 1;
3301 sds->min_nr_running = ULONG_MAX;
3302 sds->leader_nr_running = 0;
3303 }
3304}
3148 3305
3149 /* Tally up the load of all CPUs in the group */ 3306/**
3150 sum_weighted_load = sum_nr_running = avg_load = 0; 3307 * update_sd_power_savings_stats - Update the power saving stats for a
3151 sum_avg_load_per_task = avg_load_per_task = 0; 3308 * sched_domain while performing load balancing.
3309 *
3310 * @group: sched_group belonging to the sched_domain under consideration.
3311 * @sds: Variable containing the statistics of the sched_domain
3312 * @local_group: Does group contain the CPU for which we're performing
3313 * load balancing ?
3314 * @sgs: Variable containing the statistics of the group.
3315 */
3316static inline void update_sd_power_savings_stats(struct sched_group *group,
3317 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3318{
3152 3319
3153 max_cpu_load = 0; 3320 if (!sds->power_savings_balance)
3154 min_cpu_load = ~0UL; 3321 return;
3155 3322
3156 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 3323 /*
3157 struct rq *rq = cpu_rq(i); 3324 * If the local group is idle or completely loaded
3325 * no need to do power savings balance at this domain
3326 */
3327 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3328 !sds->this_nr_running))
3329 sds->power_savings_balance = 0;
3158 3330
3159 if (*sd_idle && rq->nr_running) 3331 /*
3160 *sd_idle = 0; 3332 * If a group is already running at full capacity or idle,
3333 * don't include that group in power savings calculations
3334 */
3335 if (!sds->power_savings_balance ||
3336 sgs->sum_nr_running >= sgs->group_capacity ||
3337 !sgs->sum_nr_running)
3338 return;
3161 3339
3162 /* Bias balancing toward cpus of our domain */ 3340 /*
3163 if (local_group) { 3341 * Calculate the group which has the least non-idle load.
3164 if (idle_cpu(i) && !first_idle_cpu) { 3342 * This is the group from where we need to pick up the load
3165 first_idle_cpu = 1; 3343 * for saving power
3166 balance_cpu = i; 3344 */
3167 } 3345 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3346 (sgs->sum_nr_running == sds->min_nr_running &&
3347 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3348 sds->group_min = group;
3349 sds->min_nr_running = sgs->sum_nr_running;
3350 sds->min_load_per_task = sgs->sum_weighted_load /
3351 sgs->sum_nr_running;
3352 }
3168 3353
3169 load = target_load(i, load_idx); 3354 /*
3170 } else { 3355 * Calculate the group which is almost near its
3171 load = source_load(i, load_idx); 3356 * capacity but still has some space to pick up some load
3172 if (load > max_cpu_load) 3357 * from other group and save more power
3173 max_cpu_load = load; 3358 */
3174 if (min_cpu_load > load) 3359 if (sgs->sum_nr_running > sgs->group_capacity - 1)
3175 min_cpu_load = load; 3360 return;
3176 }
3177 3361
3178 avg_load += load; 3362 if (sgs->sum_nr_running > sds->leader_nr_running ||
3179 sum_nr_running += rq->nr_running; 3363 (sgs->sum_nr_running == sds->leader_nr_running &&
3180 sum_weighted_load += weighted_cpuload(i); 3364 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3365 sds->group_leader = group;
3366 sds->leader_nr_running = sgs->sum_nr_running;
3367 }
3368}
3181 3369
3182 sum_avg_load_per_task += cpu_avg_load_per_task(i); 3370/**
3183 } 3371 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3372 * @sds: Variable containing the statistics of the sched_domain
3373 * under consideration.
3374 * @this_cpu: Cpu at which we're currently performing load-balancing.
3375 * @imbalance: Variable to store the imbalance.
3376 *
3377 * Description:
3378 * Check if we have potential to perform some power-savings balance.
3379 * If yes, set the busiest group to be the least loaded group in the
3380 * sched_domain, so that it's CPUs can be put to idle.
3381 *
3382 * Returns 1 if there is potential to perform power-savings balance.
3383 * Else returns 0.
3384 */
3385static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3386 int this_cpu, unsigned long *imbalance)
3387{
3388 if (!sds->power_savings_balance)
3389 return 0;
3184 3390
3185 /* 3391 if (sds->this != sds->group_leader ||
3186 * First idle cpu or the first cpu(busiest) in this sched group 3392 sds->group_leader == sds->group_min)
3187 * is eligible for doing load balancing at this and above 3393 return 0;
3188 * domains. In the newly idle case, we will allow all the cpu's
3189 * to do the newly idle load balance.
3190 */
3191 if (idle != CPU_NEWLY_IDLE && local_group &&
3192 balance_cpu != this_cpu && balance) {
3193 *balance = 0;
3194 goto ret;
3195 }
3196 3394
3197 total_load += avg_load; 3395 *imbalance = sds->min_load_per_task;
3198 total_pwr += group->__cpu_power; 3396 sds->busiest = sds->group_min;
3199 3397
3200 /* Adjust by relative CPU power of the group */ 3398 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
3201 avg_load = sg_div_cpu_power(group, 3399 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
3202 avg_load * SCHED_LOAD_SCALE); 3400 group_first_cpu(sds->group_leader);
3401 }
3402
3403 return 1;
3203 3404
3405}
3406#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3407static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3408 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3409{
3410 return;
3411}
3204 3412
3205 /* 3413static inline void update_sd_power_savings_stats(struct sched_group *group,
3206 * Consider the group unbalanced when the imbalance is larger 3414 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3207 * than the average weight of two tasks. 3415{
3208 * 3416 return;
3209 * APZ: with cgroup the avg task weight can vary wildly and 3417}
3210 * might not be a suitable number - should we keep a 3418
3211 * normalized nr_running number somewhere that negates 3419static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3212 * the hierarchy? 3420 int this_cpu, unsigned long *imbalance)
3213 */ 3421{
3214 avg_load_per_task = sg_div_cpu_power(group, 3422 return 0;
3215 sum_avg_load_per_task * SCHED_LOAD_SCALE); 3423}
3424#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3425
3426
3427/**
3428 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3429 * @group: sched_group whose statistics are to be updated.
3430 * @this_cpu: Cpu for which load balance is currently performed.
3431 * @idle: Idle status of this_cpu
3432 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3433 * @sd_idle: Idle status of the sched_domain containing group.
3434 * @local_group: Does group contain this_cpu.
3435 * @cpus: Set of cpus considered for load balancing.
3436 * @balance: Should we balance.
3437 * @sgs: variable to hold the statistics for this group.
3438 */
3439static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3440 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3441 int local_group, const struct cpumask *cpus,
3442 int *balance, struct sg_lb_stats *sgs)
3443{
3444 unsigned long load, max_cpu_load, min_cpu_load;
3445 int i;
3446 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3447 unsigned long sum_avg_load_per_task;
3448 unsigned long avg_load_per_task;
3449
3450 if (local_group)
3451 balance_cpu = group_first_cpu(group);
3216 3452
3217 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) 3453 /* Tally up the load of all CPUs in the group */
3218 __group_imb = 1; 3454 sum_avg_load_per_task = avg_load_per_task = 0;
3455 max_cpu_load = 0;
3456 min_cpu_load = ~0UL;
3219 3457
3220 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; 3458 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3459 struct rq *rq = cpu_rq(i);
3221 3460
3461 if (*sd_idle && rq->nr_running)
3462 *sd_idle = 0;
3463
3464 /* Bias balancing toward cpus of our domain */
3222 if (local_group) { 3465 if (local_group) {
3223 this_load = avg_load; 3466 if (idle_cpu(i) && !first_idle_cpu) {
3224 this = group; 3467 first_idle_cpu = 1;
3225 this_nr_running = sum_nr_running; 3468 balance_cpu = i;
3226 this_load_per_task = sum_weighted_load; 3469 }
3227 } else if (avg_load > max_load && 3470
3228 (sum_nr_running > group_capacity || __group_imb)) { 3471 load = target_load(i, load_idx);
3229 max_load = avg_load; 3472 } else {
3230 busiest = group; 3473 load = source_load(i, load_idx);
3231 busiest_nr_running = sum_nr_running; 3474 if (load > max_cpu_load)
3232 busiest_load_per_task = sum_weighted_load; 3475 max_cpu_load = load;
3233 group_imb = __group_imb; 3476 if (min_cpu_load > load)
3477 min_cpu_load = load;
3234 } 3478 }
3235 3479
3236#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 3480 sgs->group_load += load;
3237 /* 3481 sgs->sum_nr_running += rq->nr_running;
3238 * Busy processors will not participate in power savings 3482 sgs->sum_weighted_load += weighted_cpuload(i);
3239 * balance.
3240 */
3241 if (idle == CPU_NOT_IDLE ||
3242 !(sd->flags & SD_POWERSAVINGS_BALANCE))
3243 goto group_next;
3244 3483
3245 /* 3484 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3246 * If the local group is idle or completely loaded 3485 }
3247 * no need to do power savings balance at this domain
3248 */
3249 if (local_group && (this_nr_running >= group_capacity ||
3250 !this_nr_running))
3251 power_savings_balance = 0;
3252 3486
3253 /* 3487 /*
3254 * If a group is already running at full capacity or idle, 3488 * First idle cpu or the first cpu(busiest) in this sched group
3255 * don't include that group in power savings calculations 3489 * is eligible for doing load balancing at this and above
3256 */ 3490 * domains. In the newly idle case, we will allow all the cpu's
3257 if (!power_savings_balance || sum_nr_running >= group_capacity 3491 * to do the newly idle load balance.
3258 || !sum_nr_running) 3492 */
3259 goto group_next; 3493 if (idle != CPU_NEWLY_IDLE && local_group &&
3494 balance_cpu != this_cpu && balance) {
3495 *balance = 0;
3496 return;
3497 }
3260 3498
3261 /* 3499 /* Adjust by relative CPU power of the group */
3262 * Calculate the group which has the least non-idle load. 3500 sgs->avg_load = sg_div_cpu_power(group,
3263 * This is the group from where we need to pick up the load 3501 sgs->group_load * SCHED_LOAD_SCALE);
3264 * for saving power
3265 */
3266 if ((sum_nr_running < min_nr_running) ||
3267 (sum_nr_running == min_nr_running &&
3268 cpumask_first(sched_group_cpus(group)) >
3269 cpumask_first(sched_group_cpus(group_min)))) {
3270 group_min = group;
3271 min_nr_running = sum_nr_running;
3272 min_load_per_task = sum_weighted_load /
3273 sum_nr_running;
3274 }
3275 3502
3276 /* 3503
3277 * Calculate the group which is almost near its 3504 /*
3278 * capacity but still has some space to pick up some load 3505 * Consider the group unbalanced when the imbalance is larger
3279 * from other group and save more power 3506 * than the average weight of two tasks.
3280 */ 3507 *
3281 if (sum_nr_running <= group_capacity - 1) { 3508 * APZ: with cgroup the avg task weight can vary wildly and
3282 if (sum_nr_running > leader_nr_running || 3509 * might not be a suitable number - should we keep a
3283 (sum_nr_running == leader_nr_running && 3510 * normalized nr_running number somewhere that negates
3284 cpumask_first(sched_group_cpus(group)) < 3511 * the hierarchy?
3285 cpumask_first(sched_group_cpus(group_leader)))) { 3512 */
3286 group_leader = group; 3513 avg_load_per_task = sg_div_cpu_power(group,
3287 leader_nr_running = sum_nr_running; 3514 sum_avg_load_per_task * SCHED_LOAD_SCALE);
3288 } 3515
3516 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3517 sgs->group_imb = 1;
3518
3519 sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
3520
3521}
3522
3523/**
3524 * update_sd_lb_stats - Update sched_group's statistics for load balancing.
3525 * @sd: sched_domain whose statistics are to be updated.
3526 * @this_cpu: Cpu for which load balance is currently performed.
3527 * @idle: Idle status of this_cpu
3528 * @sd_idle: Idle status of the sched_domain containing group.
3529 * @cpus: Set of cpus considered for load balancing.
3530 * @balance: Should we balance.
3531 * @sds: variable to hold the statistics for this sched_domain.
3532 */
3533static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3534 enum cpu_idle_type idle, int *sd_idle,
3535 const struct cpumask *cpus, int *balance,
3536 struct sd_lb_stats *sds)
3537{
3538 struct sched_group *group = sd->groups;
3539 struct sg_lb_stats sgs;
3540 int load_idx;
3541
3542 init_sd_power_savings_stats(sd, sds, idle);
3543 load_idx = get_sd_load_idx(sd, idle);
3544
3545 do {
3546 int local_group;
3547
3548 local_group = cpumask_test_cpu(this_cpu,
3549 sched_group_cpus(group));
3550 memset(&sgs, 0, sizeof(sgs));
3551 update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
3552 local_group, cpus, balance, &sgs);
3553
3554 if (local_group && balance && !(*balance))
3555 return;
3556
3557 sds->total_load += sgs.group_load;
3558 sds->total_pwr += group->__cpu_power;
3559
3560 if (local_group) {
3561 sds->this_load = sgs.avg_load;
3562 sds->this = group;
3563 sds->this_nr_running = sgs.sum_nr_running;
3564 sds->this_load_per_task = sgs.sum_weighted_load;
3565 } else if (sgs.avg_load > sds->max_load &&
3566 (sgs.sum_nr_running > sgs.group_capacity ||
3567 sgs.group_imb)) {
3568 sds->max_load = sgs.avg_load;
3569 sds->busiest = group;
3570 sds->busiest_nr_running = sgs.sum_nr_running;
3571 sds->busiest_load_per_task = sgs.sum_weighted_load;
3572 sds->group_imb = sgs.group_imb;
3289 } 3573 }
3290group_next: 3574
3291#endif 3575 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3292 group = group->next; 3576 group = group->next;
3293 } while (group != sd->groups); 3577 } while (group != sd->groups);
3294 3578
3295 if (!busiest || this_load >= max_load || busiest_nr_running == 0) 3579}
3296 goto out_balanced;
3297
3298 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
3299 3580
3300 if (this_load >= avg_load || 3581/**
3301 100*max_load <= sd->imbalance_pct*this_load) 3582 * fix_small_imbalance - Calculate the minor imbalance that exists
3302 goto out_balanced; 3583 * amongst the groups of a sched_domain, during
3584 * load balancing.
3585 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3586 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3587 * @imbalance: Variable to store the imbalance.
3588 */
3589static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3590 int this_cpu, unsigned long *imbalance)
3591{
3592 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3593 unsigned int imbn = 2;
3594
3595 if (sds->this_nr_running) {
3596 sds->this_load_per_task /= sds->this_nr_running;
3597 if (sds->busiest_load_per_task >
3598 sds->this_load_per_task)
3599 imbn = 1;
3600 } else
3601 sds->this_load_per_task =
3602 cpu_avg_load_per_task(this_cpu);
3303 3603
3304 busiest_load_per_task /= busiest_nr_running; 3604 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
3305 if (group_imb) 3605 sds->busiest_load_per_task * imbn) {
3306 busiest_load_per_task = min(busiest_load_per_task, avg_load); 3606 *imbalance = sds->busiest_load_per_task;
3607 return;
3608 }
3307 3609
3308 /* 3610 /*
3309 * We're trying to get all the cpus to the average_load, so we don't 3611 * OK, we don't have enough imbalance to justify moving tasks,
3310 * want to push ourselves above the average load, nor do we wish to 3612 * however we may be able to increase total CPU power used by
3311 * reduce the max loaded cpu below the average load, as either of these 3613 * moving them.
3312 * actions would just result in more rebalancing later, and ping-pong
3313 * tasks around. Thus we look for the minimum possible imbalance.
3314 * Negative imbalances (*we* are more loaded than anyone else) will
3315 * be counted as no imbalance for these purposes -- we can't fix that
3316 * by pulling tasks to us. Be careful of negative numbers as they'll
3317 * appear as very large values with unsigned longs.
3318 */ 3614 */
3319 if (max_load <= busiest_load_per_task)
3320 goto out_balanced;
3321 3615
3616 pwr_now += sds->busiest->__cpu_power *
3617 min(sds->busiest_load_per_task, sds->max_load);
3618 pwr_now += sds->this->__cpu_power *
3619 min(sds->this_load_per_task, sds->this_load);
3620 pwr_now /= SCHED_LOAD_SCALE;
3621
3622 /* Amount of load we'd subtract */
3623 tmp = sg_div_cpu_power(sds->busiest,
3624 sds->busiest_load_per_task * SCHED_LOAD_SCALE);
3625 if (sds->max_load > tmp)
3626 pwr_move += sds->busiest->__cpu_power *
3627 min(sds->busiest_load_per_task, sds->max_load - tmp);
3628
3629 /* Amount of load we'd add */
3630 if (sds->max_load * sds->busiest->__cpu_power <
3631 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3632 tmp = sg_div_cpu_power(sds->this,
3633 sds->max_load * sds->busiest->__cpu_power);
3634 else
3635 tmp = sg_div_cpu_power(sds->this,
3636 sds->busiest_load_per_task * SCHED_LOAD_SCALE);
3637 pwr_move += sds->this->__cpu_power *
3638 min(sds->this_load_per_task, sds->this_load + tmp);
3639 pwr_move /= SCHED_LOAD_SCALE;
3640
3641 /* Move if we gain throughput */
3642 if (pwr_move > pwr_now)
3643 *imbalance = sds->busiest_load_per_task;
3644}
3645
3646/**
3647 * calculate_imbalance - Calculate the amount of imbalance present within the
3648 * groups of a given sched_domain during load balance.
3649 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
3650 * @this_cpu: Cpu for which currently load balance is being performed.
3651 * @imbalance: The variable to store the imbalance.
3652 */
3653static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3654 unsigned long *imbalance)
3655{
3656 unsigned long max_pull;
3322 /* 3657 /*
3323 * In the presence of smp nice balancing, certain scenarios can have 3658 * In the presence of smp nice balancing, certain scenarios can have
3324 * max load less than avg load(as we skip the groups at or below 3659 * max load less than avg load(as we skip the groups at or below
3325 * its cpu_power, while calculating max_load..) 3660 * its cpu_power, while calculating max_load..)
3326 */ 3661 */
3327 if (max_load < avg_load) { 3662 if (sds->max_load < sds->avg_load) {
3328 *imbalance = 0; 3663 *imbalance = 0;
3329 goto small_imbalance; 3664 return fix_small_imbalance(sds, this_cpu, imbalance);
3330 } 3665 }
3331 3666
3332 /* Don't want to pull so many tasks that a group would go idle */ 3667 /* Don't want to pull so many tasks that a group would go idle */
3333 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); 3668 max_pull = min(sds->max_load - sds->avg_load,
3669 sds->max_load - sds->busiest_load_per_task);
3334 3670
3335 /* How much load to actually move to equalise the imbalance */ 3671 /* How much load to actually move to equalise the imbalance */
3336 *imbalance = min(max_pull * busiest->__cpu_power, 3672 *imbalance = min(max_pull * sds->busiest->__cpu_power,
3337 (avg_load - this_load) * this->__cpu_power) 3673 (sds->avg_load - sds->this_load) * sds->this->__cpu_power)
3338 / SCHED_LOAD_SCALE; 3674 / SCHED_LOAD_SCALE;
3339 3675
3340 /* 3676 /*
@@ -3343,78 +3679,110 @@ group_next:
3343 * a think about bumping its value to force at least one task to be 3679 * a think about bumping its value to force at least one task to be
3344 * moved 3680 * moved
3345 */ 3681 */
3346 if (*imbalance < busiest_load_per_task) { 3682 if (*imbalance < sds->busiest_load_per_task)
3347 unsigned long tmp, pwr_now, pwr_move; 3683 return fix_small_imbalance(sds, this_cpu, imbalance);
3348 unsigned int imbn;
3349
3350small_imbalance:
3351 pwr_move = pwr_now = 0;
3352 imbn = 2;
3353 if (this_nr_running) {
3354 this_load_per_task /= this_nr_running;
3355 if (busiest_load_per_task > this_load_per_task)
3356 imbn = 1;
3357 } else
3358 this_load_per_task = cpu_avg_load_per_task(this_cpu);
3359 3684
3360 if (max_load - this_load + busiest_load_per_task >= 3685}
3361 busiest_load_per_task * imbn) { 3686/******* find_busiest_group() helpers end here *********************/
3362 *imbalance = busiest_load_per_task;
3363 return busiest;
3364 }
3365 3687
3366 /* 3688/**
3367 * OK, we don't have enough imbalance to justify moving tasks, 3689 * find_busiest_group - Returns the busiest group within the sched_domain
3368 * however we may be able to increase total CPU power used by 3690 * if there is an imbalance. If there isn't an imbalance, and
3369 * moving them. 3691 * the user has opted for power-savings, it returns a group whose
3370 */ 3692 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
3693 * such a group exists.
3694 *
3695 * Also calculates the amount of weighted load which should be moved
3696 * to restore balance.
3697 *
3698 * @sd: The sched_domain whose busiest group is to be returned.
3699 * @this_cpu: The cpu for which load balancing is currently being performed.
3700 * @imbalance: Variable which stores amount of weighted load which should
3701 * be moved to restore balance/put a group to idle.
3702 * @idle: The idle status of this_cpu.
3703 * @sd_idle: The idleness of sd
3704 * @cpus: The set of CPUs under consideration for load-balancing.
3705 * @balance: Pointer to a variable indicating if this_cpu
3706 * is the appropriate cpu to perform load balancing at this_level.
3707 *
3708 * Returns: - the busiest group if imbalance exists.
3709 * - If no imbalance and user has opted for power-savings balance,
3710 * return the least loaded group whose CPUs can be
3711 * put to idle by rebalancing its tasks onto our group.
3712 */
3713static struct sched_group *
3714find_busiest_group(struct sched_domain *sd, int this_cpu,
3715 unsigned long *imbalance, enum cpu_idle_type idle,
3716 int *sd_idle, const struct cpumask *cpus, int *balance)
3717{
3718 struct sd_lb_stats sds;
3371 3719
3372 pwr_now += busiest->__cpu_power * 3720 memset(&sds, 0, sizeof(sds));
3373 min(busiest_load_per_task, max_load);
3374 pwr_now += this->__cpu_power *
3375 min(this_load_per_task, this_load);
3376 pwr_now /= SCHED_LOAD_SCALE;
3377
3378 /* Amount of load we'd subtract */
3379 tmp = sg_div_cpu_power(busiest,
3380 busiest_load_per_task * SCHED_LOAD_SCALE);
3381 if (max_load > tmp)
3382 pwr_move += busiest->__cpu_power *
3383 min(busiest_load_per_task, max_load - tmp);
3384
3385 /* Amount of load we'd add */
3386 if (max_load * busiest->__cpu_power <
3387 busiest_load_per_task * SCHED_LOAD_SCALE)
3388 tmp = sg_div_cpu_power(this,
3389 max_load * busiest->__cpu_power);
3390 else
3391 tmp = sg_div_cpu_power(this,
3392 busiest_load_per_task * SCHED_LOAD_SCALE);
3393 pwr_move += this->__cpu_power *
3394 min(this_load_per_task, this_load + tmp);
3395 pwr_move /= SCHED_LOAD_SCALE;
3396 3721
3397 /* Move if we gain throughput */ 3722 /*
3398 if (pwr_move > pwr_now) 3723 * Compute the various statistics relavent for load balancing at
3399 *imbalance = busiest_load_per_task; 3724 * this level.
3400 } 3725 */
3726 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
3727 balance, &sds);
3728
3729 /* Cases where imbalance does not exist from POV of this_cpu */
3730 /* 1) this_cpu is not the appropriate cpu to perform load balancing
3731 * at this level.
3732 * 2) There is no busy sibling group to pull from.
3733 * 3) This group is the busiest group.
3734 * 4) This group is more busy than the avg busieness at this
3735 * sched_domain.
3736 * 5) The imbalance is within the specified limit.
3737 * 6) Any rebalance would lead to ping-pong
3738 */
3739 if (balance && !(*balance))
3740 goto ret;
3401 3741
3402 return busiest; 3742 if (!sds.busiest || sds.busiest_nr_running == 0)
3743 goto out_balanced;
3403 3744
3404out_balanced: 3745 if (sds.this_load >= sds.max_load)
3405#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 3746 goto out_balanced;
3406 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3407 goto ret;
3408 3747
3409 if (this == group_leader && group_leader != group_min) { 3748 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
3410 *imbalance = min_load_per_task; 3749
3411 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { 3750 if (sds.this_load >= sds.avg_load)
3412 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = 3751 goto out_balanced;
3413 cpumask_first(sched_group_cpus(group_leader)); 3752
3414 } 3753 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
3415 return group_min; 3754 goto out_balanced;
3416 } 3755
3417#endif 3756 sds.busiest_load_per_task /= sds.busiest_nr_running;
3757 if (sds.group_imb)
3758 sds.busiest_load_per_task =
3759 min(sds.busiest_load_per_task, sds.avg_load);
3760
3761 /*
3762 * We're trying to get all the cpus to the average_load, so we don't
3763 * want to push ourselves above the average load, nor do we wish to
3764 * reduce the max loaded cpu below the average load, as either of these
3765 * actions would just result in more rebalancing later, and ping-pong
3766 * tasks around. Thus we look for the minimum possible imbalance.
3767 * Negative imbalances (*we* are more loaded than anyone else) will
3768 * be counted as no imbalance for these purposes -- we can't fix that
3769 * by pulling tasks to us. Be careful of negative numbers as they'll
3770 * appear as very large values with unsigned longs.
3771 */
3772 if (sds.max_load <= sds.busiest_load_per_task)
3773 goto out_balanced;
3774
3775 /* Looks like there is an imbalance. Compute it */
3776 calculate_imbalance(&sds, this_cpu, imbalance);
3777 return sds.busiest;
3778
3779out_balanced:
3780 /*
3781 * There is no obvious imbalance. But check if we can do some balancing
3782 * to save power.
3783 */
3784 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
3785 return sds.busiest;
3418ret: 3786ret:
3419 *imbalance = 0; 3787 *imbalance = 0;
3420 return NULL; 3788 return NULL;
@@ -3458,19 +3826,23 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3458 */ 3826 */
3459#define MAX_PINNED_INTERVAL 512 3827#define MAX_PINNED_INTERVAL 512
3460 3828
3829/* Working cpumask for load_balance and load_balance_newidle. */
3830static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
3831
3461/* 3832/*
3462 * Check this_cpu to ensure it is balanced within domain. Attempt to move 3833 * Check this_cpu to ensure it is balanced within domain. Attempt to move
3463 * tasks if there is an imbalance. 3834 * tasks if there is an imbalance.
3464 */ 3835 */
3465static int load_balance(int this_cpu, struct rq *this_rq, 3836static int load_balance(int this_cpu, struct rq *this_rq,
3466 struct sched_domain *sd, enum cpu_idle_type idle, 3837 struct sched_domain *sd, enum cpu_idle_type idle,
3467 int *balance, struct cpumask *cpus) 3838 int *balance)
3468{ 3839{
3469 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 3840 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
3470 struct sched_group *group; 3841 struct sched_group *group;
3471 unsigned long imbalance; 3842 unsigned long imbalance;
3472 struct rq *busiest; 3843 struct rq *busiest;
3473 unsigned long flags; 3844 unsigned long flags;
3845 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
3474 3846
3475 cpumask_setall(cpus); 3847 cpumask_setall(cpus);
3476 3848
@@ -3625,8 +3997,7 @@ out:
3625 * this_rq is locked. 3997 * this_rq is locked.
3626 */ 3998 */
3627static int 3999static int
3628load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, 4000load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
3629 struct cpumask *cpus)
3630{ 4001{
3631 struct sched_group *group; 4002 struct sched_group *group;
3632 struct rq *busiest = NULL; 4003 struct rq *busiest = NULL;
@@ -3634,6 +4005,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3634 int ld_moved = 0; 4005 int ld_moved = 0;
3635 int sd_idle = 0; 4006 int sd_idle = 0;
3636 int all_pinned = 0; 4007 int all_pinned = 0;
4008 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
3637 4009
3638 cpumask_setall(cpus); 4010 cpumask_setall(cpus);
3639 4011
@@ -3774,10 +4146,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3774 struct sched_domain *sd; 4146 struct sched_domain *sd;
3775 int pulled_task = 0; 4147 int pulled_task = 0;
3776 unsigned long next_balance = jiffies + HZ; 4148 unsigned long next_balance = jiffies + HZ;
3777 cpumask_var_t tmpmask;
3778
3779 if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
3780 return;
3781 4149
3782 for_each_domain(this_cpu, sd) { 4150 for_each_domain(this_cpu, sd) {
3783 unsigned long interval; 4151 unsigned long interval;
@@ -3788,7 +4156,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3788 if (sd->flags & SD_BALANCE_NEWIDLE) 4156 if (sd->flags & SD_BALANCE_NEWIDLE)
3789 /* If we've pulled tasks over stop searching: */ 4157 /* If we've pulled tasks over stop searching: */
3790 pulled_task = load_balance_newidle(this_cpu, this_rq, 4158 pulled_task = load_balance_newidle(this_cpu, this_rq,
3791 sd, tmpmask); 4159 sd);
3792 4160
3793 interval = msecs_to_jiffies(sd->balance_interval); 4161 interval = msecs_to_jiffies(sd->balance_interval);
3794 if (time_after(next_balance, sd->last_balance + interval)) 4162 if (time_after(next_balance, sd->last_balance + interval))
@@ -3803,7 +4171,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3803 */ 4171 */
3804 this_rq->next_balance = next_balance; 4172 this_rq->next_balance = next_balance;
3805 } 4173 }
3806 free_cpumask_var(tmpmask);
3807} 4174}
3808 4175
3809/* 4176/*
@@ -3890,19 +4257,24 @@ int select_nohz_load_balancer(int stop_tick)
3890 int cpu = smp_processor_id(); 4257 int cpu = smp_processor_id();
3891 4258
3892 if (stop_tick) { 4259 if (stop_tick) {
3893 cpumask_set_cpu(cpu, nohz.cpu_mask);
3894 cpu_rq(cpu)->in_nohz_recently = 1; 4260 cpu_rq(cpu)->in_nohz_recently = 1;
3895 4261
3896 /* 4262 if (!cpu_active(cpu)) {
3897 * If we are going offline and still the leader, give up! 4263 if (atomic_read(&nohz.load_balancer) != cpu)
3898 */ 4264 return 0;
3899 if (!cpu_active(cpu) && 4265
3900 atomic_read(&nohz.load_balancer) == cpu) { 4266 /*
4267 * If we are going offline and still the leader,
4268 * give up!
4269 */
3901 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 4270 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3902 BUG(); 4271 BUG();
4272
3903 return 0; 4273 return 0;
3904 } 4274 }
3905 4275
4276 cpumask_set_cpu(cpu, nohz.cpu_mask);
4277
3906 /* time for ilb owner also to sleep */ 4278 /* time for ilb owner also to sleep */
3907 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { 4279 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3908 if (atomic_read(&nohz.load_balancer) == cpu) 4280 if (atomic_read(&nohz.load_balancer) == cpu)
@@ -3948,11 +4320,6 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3948 unsigned long next_balance = jiffies + 60*HZ; 4320 unsigned long next_balance = jiffies + 60*HZ;
3949 int update_next_balance = 0; 4321 int update_next_balance = 0;
3950 int need_serialize; 4322 int need_serialize;
3951 cpumask_var_t tmp;
3952
3953 /* Fails alloc? Rebalancing probably not a priority right now. */
3954 if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
3955 return;
3956 4323
3957 for_each_domain(cpu, sd) { 4324 for_each_domain(cpu, sd) {
3958 if (!(sd->flags & SD_LOAD_BALANCE)) 4325 if (!(sd->flags & SD_LOAD_BALANCE))
@@ -3977,7 +4344,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3977 } 4344 }
3978 4345
3979 if (time_after_eq(jiffies, sd->last_balance + interval)) { 4346 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3980 if (load_balance(cpu, rq, sd, idle, &balance, tmp)) { 4347 if (load_balance(cpu, rq, sd, idle, &balance)) {
3981 /* 4348 /*
3982 * We've pulled tasks over so either we're no 4349 * We've pulled tasks over so either we're no
3983 * longer idle, or one of our SMT siblings is 4350 * longer idle, or one of our SMT siblings is
@@ -4011,8 +4378,6 @@ out:
4011 */ 4378 */
4012 if (likely(update_next_balance)) 4379 if (likely(update_next_balance))
4013 rq->next_balance = next_balance; 4380 rq->next_balance = next_balance;
4014
4015 free_cpumask_var(tmp);
4016} 4381}
4017 4382
4018/* 4383/*
@@ -4062,6 +4427,11 @@ static void run_rebalance_domains(struct softirq_action *h)
4062#endif 4427#endif
4063} 4428}
4064 4429
4430static inline int on_null_domain(int cpu)
4431{
4432 return !rcu_dereference(cpu_rq(cpu)->sd);
4433}
4434
4065/* 4435/*
4066 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 4436 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4067 * 4437 *
@@ -4119,7 +4489,9 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4119 cpumask_test_cpu(cpu, nohz.cpu_mask)) 4489 cpumask_test_cpu(cpu, nohz.cpu_mask))
4120 return; 4490 return;
4121#endif 4491#endif
4122 if (time_after_eq(jiffies, rq->next_balance)) 4492 /* Don't need to rebalance while attached to NULL domain */
4493 if (time_after_eq(jiffies, rq->next_balance) &&
4494 likely(!on_null_domain(cpu)))
4123 raise_softirq(SCHED_SOFTIRQ); 4495 raise_softirq(SCHED_SOFTIRQ);
4124} 4496}
4125 4497
@@ -4409,10 +4781,7 @@ void scheduler_tick(void)
4409#endif 4781#endif
4410} 4782}
4411 4783
4412#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 4784unsigned long get_parent_ip(unsigned long addr)
4413 defined(CONFIG_PREEMPT_TRACER))
4414
4415static inline unsigned long get_parent_ip(unsigned long addr)
4416{ 4785{
4417 if (in_lock_functions(addr)) { 4786 if (in_lock_functions(addr)) {
4418 addr = CALLER_ADDR2; 4787 addr = CALLER_ADDR2;
@@ -4422,6 +4791,9 @@ static inline unsigned long get_parent_ip(unsigned long addr)
4422 return addr; 4791 return addr;
4423} 4792}
4424 4793
4794#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4795 defined(CONFIG_PREEMPT_TRACER))
4796
4425void __kprobes add_preempt_count(int val) 4797void __kprobes add_preempt_count(int val)
4426{ 4798{
4427#ifdef CONFIG_DEBUG_PREEMPT 4799#ifdef CONFIG_DEBUG_PREEMPT
@@ -4513,11 +4885,33 @@ static inline void schedule_debug(struct task_struct *prev)
4513#endif 4885#endif
4514} 4886}
4515 4887
4888static void put_prev_task(struct rq *rq, struct task_struct *prev)
4889{
4890 if (prev->state == TASK_RUNNING) {
4891 u64 runtime = prev->se.sum_exec_runtime;
4892
4893 runtime -= prev->se.prev_sum_exec_runtime;
4894 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
4895
4896 /*
4897 * In order to avoid avg_overlap growing stale when we are
4898 * indeed overlapping and hence not getting put to sleep, grow
4899 * the avg_overlap on preemption.
4900 *
4901 * We use the average preemption runtime because that
4902 * correlates to the amount of cache footprint a task can
4903 * build up.
4904 */
4905 update_avg(&prev->se.avg_overlap, runtime);
4906 }
4907 prev->sched_class->put_prev_task(rq, prev);
4908}
4909
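/*
 * Editor's sketch (not part of the patch above): put_prev_task() folds the
 * clamped preemption runtime into se.avg_overlap via update_avg(), whose
 * body is not shown in this hunk. The standalone program below assumes the
 * usual sched.c form of update_avg() -- an exponential moving average with
 * a 1/8 weight -- and a made-up migration cost, purely to illustrate how
 * the min_t() clamp keeps one very long run from blowing up the average.
 */
#include <stdio.h>
#include <stdint.h>

#define MIGRATION_COST_NS 500000ULL   /* assumed stand-in for sysctl_sched_migration_cost */

static void update_avg(uint64_t *avg, uint64_t sample)
{
	int64_t diff = (int64_t)sample - (int64_t)*avg;

	*avg += diff >> 3;            /* assumed 1/8 weighting */
}

int main(void)
{
	uint64_t avg_overlap = 0;
	uint64_t samples[] = { 100000, 2500000, 800000, 9000000 };

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		uint64_t runtime = samples[i];

		if (runtime > 2 * MIGRATION_COST_NS)  /* same clamp as put_prev_task() */
			runtime = 2 * MIGRATION_COST_NS;
		update_avg(&avg_overlap, runtime);
		printf("runtime %8llu ns -> avg_overlap %7llu ns\n",
		       (unsigned long long)samples[i],
		       (unsigned long long)avg_overlap);
	}
	return 0;
}
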
4516/* 4910/*
4517 * Pick up the highest-prio task: 4911 * Pick up the highest-prio task:
4518 */ 4912 */
4519static inline struct task_struct * 4913static inline struct task_struct *
4520pick_next_task(struct rq *rq, struct task_struct *prev) 4914pick_next_task(struct rq *rq)
4521{ 4915{
4522 const struct sched_class *class; 4916 const struct sched_class *class;
4523 struct task_struct *p; 4917 struct task_struct *p;
@@ -4548,15 +4942,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
4548/* 4942/*
4549 * schedule() is the main scheduler function. 4943 * schedule() is the main scheduler function.
4550 */ 4944 */
4551asmlinkage void __sched schedule(void) 4945asmlinkage void __sched __schedule(void)
4552{ 4946{
4553 struct task_struct *prev, *next; 4947 struct task_struct *prev, *next;
4554 unsigned long *switch_count; 4948 unsigned long *switch_count;
4555 struct rq *rq; 4949 struct rq *rq;
4556 int cpu; 4950 int cpu;
4557 4951
4558need_resched:
4559 preempt_disable();
4560 cpu = smp_processor_id(); 4952 cpu = smp_processor_id();
4561 rq = cpu_rq(cpu); 4953 rq = cpu_rq(cpu);
4562 rcu_qsctr_inc(cpu); 4954 rcu_qsctr_inc(cpu);
@@ -4591,8 +4983,8 @@ need_resched_nonpreemptible:
4591 if (unlikely(!rq->nr_running)) 4983 if (unlikely(!rq->nr_running))
4592 idle_balance(cpu, rq); 4984 idle_balance(cpu, rq);
4593 4985
4594 prev->sched_class->put_prev_task(rq, prev); 4986 put_prev_task(rq, prev);
4595 next = pick_next_task(rq, prev); 4987 next = pick_next_task(rq);
4596 4988
4597 if (likely(prev != next)) { 4989 if (likely(prev != next)) {
4598 sched_info_switch(prev, next); 4990 sched_info_switch(prev, next);
@@ -4613,13 +5005,80 @@ need_resched_nonpreemptible:
4613 5005
4614 if (unlikely(reacquire_kernel_lock(current) < 0)) 5006 if (unlikely(reacquire_kernel_lock(current) < 0))
4615 goto need_resched_nonpreemptible; 5007 goto need_resched_nonpreemptible;
5008}
4616 5009
5010asmlinkage void __sched schedule(void)
5011{
5012need_resched:
5013 preempt_disable();
5014 __schedule();
4617 preempt_enable_no_resched(); 5015 preempt_enable_no_resched();
4618 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 5016 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
4619 goto need_resched; 5017 goto need_resched;
4620} 5018}
4621EXPORT_SYMBOL(schedule); 5019EXPORT_SYMBOL(schedule);
4622 5020
5021#ifdef CONFIG_SMP
5022/*
5023 * Look out! "owner" is an entirely speculative pointer
5024 * access and not reliable.
5025 */
5026int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
5027{
5028 unsigned int cpu;
5029 struct rq *rq;
5030
5031 if (!sched_feat(OWNER_SPIN))
5032 return 0;
5033
5034#ifdef CONFIG_DEBUG_PAGEALLOC
5035 /*
5036 * Need to access the cpu field knowing that
5037 * DEBUG_PAGEALLOC could have unmapped it if
5038 * the mutex owner just released it and exited.
5039 */
5040 if (probe_kernel_address(&owner->cpu, cpu))
5041 goto out;
5042#else
5043 cpu = owner->cpu;
5044#endif
5045
5046 /*
5047 * Even if the access succeeded (likely case),
5048 * the cpu field may no longer be valid.
5049 */
5050 if (cpu >= nr_cpumask_bits)
5051 goto out;
5052
5053 /*
5054 * We need to validate that we can do a
5055 * get_cpu() and that we have the percpu area.
5056 */
5057 if (!cpu_online(cpu))
5058 goto out;
5059
5060 rq = cpu_rq(cpu);
5061
5062 for (;;) {
5063 /*
5064 * Owner changed, break to re-assess state.
5065 */
5066 if (lock->owner != owner)
5067 break;
5068
5069 /*
5070 * Is that owner really running on that cpu?
5071 */
5072 if (task_thread_info(rq->curr) != owner || need_resched())
5073 return 0;
5074
5075 cpu_relax();
5076 }
5077out:
5078 return 1;
5079}
5080#endif
5081
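/*
 * Editor's sketch (not part of the patch above): a user-space toy with the
 * same decision structure as mutex_spin_on_owner() -- keep spinning only
 * while the lock is still held by the same owner and that owner looks like
 * it is running. The toy_lock/toy_owner types and the on_cpu flag are
 * invented for the demo; the kernel code consults rq->curr and
 * need_resched() instead.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_owner { atomic_bool on_cpu; };
struct toy_lock  { _Atomic(struct toy_owner *) owner; };

/* Returns true if another acquire attempt is worthwhile,
 * false if the caller should give up and block. */
static bool spin_on_owner(struct toy_lock *lock, struct toy_owner *owner)
{
	while (atomic_load(&lock->owner) == owner) {
		if (!atomic_load(&owner->on_cpu))
			return false;   /* owner got preempted: stop spinning */
	}
	return true;                    /* owner released the lock: retry */
}

int main(void)
{
	static struct toy_lock lock;
	static struct toy_owner holder;

	atomic_store(&holder.on_cpu, false);
	atomic_store(&lock.owner, &holder);
	printf("owner held but off cpu -> spin? %d\n", spin_on_owner(&lock, &holder));

	atomic_store(&lock.owner, (struct toy_owner *)NULL);
	printf("owner released         -> spin? %d\n", spin_on_owner(&lock, &holder));
	return 0;
}
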
4623#ifdef CONFIG_PREEMPT 5082#ifdef CONFIG_PREEMPT
4624/* 5083/*
4625 * this is the entry point to schedule() from in-kernel preemption 5084 * this is the entry point to schedule() from in-kernel preemption
@@ -4647,7 +5106,7 @@ asmlinkage void __sched preempt_schedule(void)
4647 * between schedule and now. 5106 * between schedule and now.
4648 */ 5107 */
4649 barrier(); 5108 barrier();
4650 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); 5109 } while (need_resched());
4651} 5110}
4652EXPORT_SYMBOL(preempt_schedule); 5111EXPORT_SYMBOL(preempt_schedule);
4653 5112
@@ -4676,7 +5135,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
4676 * between schedule and now. 5135 * between schedule and now.
4677 */ 5136 */
4678 barrier(); 5137 barrier();
4679 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); 5138 } while (need_resched());
4680} 5139}
4681 5140
4682#endif /* CONFIG_PREEMPT */ 5141#endif /* CONFIG_PREEMPT */
@@ -4737,11 +5196,17 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
4737 __wake_up_common(q, mode, 1, 0, NULL); 5196 __wake_up_common(q, mode, 1, 0, NULL);
4738} 5197}
4739 5198
5199void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
5200{
5201 __wake_up_common(q, mode, 1, 0, key);
5202}
5203
4740/** 5204/**
4741 * __wake_up_sync - wake up threads blocked on a waitqueue. 5205 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
4742 * @q: the waitqueue 5206 * @q: the waitqueue
4743 * @mode: which threads 5207 * @mode: which threads
4744 * @nr_exclusive: how many wake-one or wake-many threads to wake up 5208 * @nr_exclusive: how many wake-one or wake-many threads to wake up
5209 * @key: opaque value to be passed to wakeup targets
4745 * 5210 *
4746 * The sync wakeup differs in that the waker knows that it will schedule 5211
4747 * away soon, so while the target thread will be woken up, it will not 5212 * away soon, so while the target thread will be woken up, it will not
@@ -4750,8 +5215,8 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
4750 * 5215 *
4751 * On UP it can prevent extra preemption. 5216 * On UP it can prevent extra preemption.
4752 */ 5217 */
4753void 5218void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
4754__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 5219 int nr_exclusive, void *key)
4755{ 5220{
4756 unsigned long flags; 5221 unsigned long flags;
4757 int sync = 1; 5222 int sync = 1;
@@ -4763,9 +5228,18 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4763 sync = 0; 5228 sync = 0;
4764 5229
4765 spin_lock_irqsave(&q->lock, flags); 5230 spin_lock_irqsave(&q->lock, flags);
4766 __wake_up_common(q, mode, nr_exclusive, sync, NULL); 5231 __wake_up_common(q, mode, nr_exclusive, sync, key);
4767 spin_unlock_irqrestore(&q->lock, flags); 5232 spin_unlock_irqrestore(&q->lock, flags);
4768} 5233}
5234EXPORT_SYMBOL_GPL(__wake_up_sync_key);
5235
5236/*
5237 * __wake_up_sync - see __wake_up_sync_key()
5238 */
5239void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
5240{
5241 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
5242}
4769EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 5243EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
4770 5244
4771/** 5245/**
@@ -5150,7 +5624,7 @@ SYSCALL_DEFINE1(nice, int, increment)
5150 if (increment > 40) 5624 if (increment > 40)
5151 increment = 40; 5625 increment = 40;
5152 5626
5153 nice = PRIO_TO_NICE(current->static_prio) + increment; 5627 nice = TASK_NICE(current) + increment;
5154 if (nice < -20) 5628 if (nice < -20)
5155 nice = -20; 5629 nice = -20;
5156 if (nice > 19) 5630 if (nice > 19)
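/*
 * Editor's sketch (not part of the patch above): the nice arithmetic from
 * this hunk, pulled out into a standalone helper so the two clamps are
 * easy to check. TASK_NICE(current) is replaced by a plain current_nice
 * parameter, and the clamp of the increment to -40 is assumed from the
 * part of sys_nice() that the hunk does not show.
 */
#include <stdio.h>

static int apply_nice_increment(int current_nice, int increment)
{
	if (increment < -40)
		increment = -40;
	if (increment > 40)
		increment = 40;

	int nice = current_nice + increment;

	if (nice < -20)
		nice = -20;
	if (nice > 19)
		nice = 19;
	return nice;
}

int main(void)
{
	printf("%d\n", apply_nice_increment(0, 100));    /* clamped to 19 */
	printf("%d\n", apply_nice_increment(10, -100));  /* clamped to -20 */
	printf("%d\n", apply_nice_increment(-5, 3));     /* plain -2 */
	return 0;
}
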
@@ -5949,12 +6423,7 @@ void sched_show_task(struct task_struct *p)
5949 printk(KERN_CONT " %016lx ", thread_saved_pc(p)); 6423 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
5950#endif 6424#endif
5951#ifdef CONFIG_DEBUG_STACK_USAGE 6425#ifdef CONFIG_DEBUG_STACK_USAGE
5952 { 6426 free = stack_not_used(p);
5953 unsigned long *n = end_of_stack(p);
5954 while (!*n)
5955 n++;
5956 free = (unsigned long)n - (unsigned long)end_of_stack(p);
5957 }
5958#endif 6427#endif
5959 printk(KERN_CONT "%5lu %5d %6d\n", free, 6428 printk(KERN_CONT "%5lu %5d %6d\n", free,
5960 task_pid_nr(p), task_pid_nr(p->real_parent)); 6429 task_pid_nr(p), task_pid_nr(p->real_parent));
@@ -6428,7 +6897,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
6428 if (!rq->nr_running) 6897 if (!rq->nr_running)
6429 break; 6898 break;
6430 update_rq_clock(rq); 6899 update_rq_clock(rq);
6431 next = pick_next_task(rq, rq->curr); 6900 next = pick_next_task(rq);
6432 if (!next) 6901 if (!next)
6433 break; 6902 break;
6434 next->sched_class->put_prev_task(rq, next); 6903 next->sched_class->put_prev_task(rq, next);
@@ -6949,20 +7418,26 @@ static void free_rootdomain(struct root_domain *rd)
6949 7418
6950static void rq_attach_root(struct rq *rq, struct root_domain *rd) 7419static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6951{ 7420{
7421 struct root_domain *old_rd = NULL;
6952 unsigned long flags; 7422 unsigned long flags;
6953 7423
6954 spin_lock_irqsave(&rq->lock, flags); 7424 spin_lock_irqsave(&rq->lock, flags);
6955 7425
6956 if (rq->rd) { 7426 if (rq->rd) {
6957 struct root_domain *old_rd = rq->rd; 7427 old_rd = rq->rd;
6958 7428
6959 if (cpumask_test_cpu(rq->cpu, old_rd->online)) 7429 if (cpumask_test_cpu(rq->cpu, old_rd->online))
6960 set_rq_offline(rq); 7430 set_rq_offline(rq);
6961 7431
6962 cpumask_clear_cpu(rq->cpu, old_rd->span); 7432 cpumask_clear_cpu(rq->cpu, old_rd->span);
6963 7433
6964 if (atomic_dec_and_test(&old_rd->refcount)) 7434 /*
6965 free_rootdomain(old_rd); 7435 * If we don't want to free the old_rd yet, then
7436 * set old_rd to NULL to skip the freeing later
7437 * in this function:
7438 */
7439 if (!atomic_dec_and_test(&old_rd->refcount))
7440 old_rd = NULL;
6966 } 7441 }
6967 7442
6968 atomic_inc(&rd->refcount); 7443 atomic_inc(&rd->refcount);
@@ -6973,6 +7448,9 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6973 set_rq_online(rq); 7448 set_rq_online(rq);
6974 7449
6975 spin_unlock_irqrestore(&rq->lock, flags); 7450 spin_unlock_irqrestore(&rq->lock, flags);
7451
7452 if (old_rd)
7453 free_rootdomain(old_rd);
6976} 7454}
6977 7455
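/*
 * Editor's sketch (not part of the patch above): the same "remember the old
 * object under the lock, free it only after dropping the lock" shape that
 * rq_attach_root() now uses, shown with a pthread mutex and malloc/free in
 * place of rq->lock and free_rootdomain(). All names are invented for the
 * demo.
 */
#include <pthread.h>
#include <stdlib.h>

struct domain { int refcount; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct domain *current_domain;

static void attach_domain(struct domain *new_domain)
{
	struct domain *old = NULL;

	pthread_mutex_lock(&lock);
	if (current_domain && --current_domain->refcount == 0)
		old = current_domain;          /* remember it, don't free it here */
	current_domain = new_domain;
	new_domain->refcount++;
	pthread_mutex_unlock(&lock);

	if (old)
		free(old);                     /* freed only after the unlock */
}

int main(void)
{
	attach_domain(calloc(1, sizeof(struct domain)));
	attach_domain(calloc(1, sizeof(struct domain)));
	free(current_domain);
	return 0;
}
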
6978static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) 7456static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
@@ -7250,7 +7728,7 @@ cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
7250{ 7728{
7251 int group; 7729 int group;
7252 7730
7253 cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); 7731 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
7254 group = cpumask_first(mask); 7732 group = cpumask_first(mask);
7255 if (sg) 7733 if (sg)
7256 *sg = &per_cpu(sched_group_core, group).sg; 7734 *sg = &per_cpu(sched_group_core, group).sg;
@@ -7279,7 +7757,7 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
7279 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); 7757 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
7280 group = cpumask_first(mask); 7758 group = cpumask_first(mask);
7281#elif defined(CONFIG_SCHED_SMT) 7759#elif defined(CONFIG_SCHED_SMT)
7282 cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); 7760 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
7283 group = cpumask_first(mask); 7761 group = cpumask_first(mask);
7284#else 7762#else
7285 group = cpu; 7763 group = cpu;
@@ -7622,7 +8100,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
7622 SD_INIT(sd, SIBLING); 8100 SD_INIT(sd, SIBLING);
7623 set_domain_attribute(sd, attr); 8101 set_domain_attribute(sd, attr);
7624 cpumask_and(sched_domain_span(sd), 8102 cpumask_and(sched_domain_span(sd),
7625 &per_cpu(cpu_sibling_map, i), cpu_map); 8103 topology_thread_cpumask(i), cpu_map);
7626 sd->parent = p; 8104 sd->parent = p;
7627 p->child = sd; 8105 p->child = sd;
7628 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); 8106 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7633,7 +8111,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
7633 /* Set up CPU (sibling) groups */ 8111 /* Set up CPU (sibling) groups */
7634 for_each_cpu(i, cpu_map) { 8112 for_each_cpu(i, cpu_map) {
7635 cpumask_and(this_sibling_map, 8113 cpumask_and(this_sibling_map,
7636 &per_cpu(cpu_sibling_map, i), cpu_map); 8114 topology_thread_cpumask(i), cpu_map);
7637 if (i != cpumask_first(this_sibling_map)) 8115 if (i != cpumask_first(this_sibling_map))
7638 continue; 8116 continue;
7639 8117
@@ -8214,11 +8692,15 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8214 __set_bit(MAX_RT_PRIO, array->bitmap); 8692 __set_bit(MAX_RT_PRIO, array->bitmap);
8215 8693
8216#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 8694#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
8217 rt_rq->highest_prio = MAX_RT_PRIO; 8695 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8696#ifdef CONFIG_SMP
8697 rt_rq->highest_prio.next = MAX_RT_PRIO;
8698#endif
8218#endif 8699#endif
8219#ifdef CONFIG_SMP 8700#ifdef CONFIG_SMP
8220 rt_rq->rt_nr_migratory = 0; 8701 rt_rq->rt_nr_migratory = 0;
8221 rt_rq->overloaded = 0; 8702 rt_rq->overloaded = 0;
8703 plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
8222#endif 8704#endif
8223 8705
8224 rt_rq->rt_time = 0; 8706 rt_rq->rt_time = 0;
@@ -8305,6 +8787,9 @@ void __init sched_init(void)
8305#ifdef CONFIG_USER_SCHED 8787#ifdef CONFIG_USER_SCHED
8306 alloc_size *= 2; 8788 alloc_size *= 2;
8307#endif 8789#endif
8790#ifdef CONFIG_CPUMASK_OFFSTACK
8791 alloc_size += num_possible_cpus() * cpumask_size();
8792#endif
8308 /* 8793 /*
8309 * As sched_init() is called before page_alloc is setup, 8794 * As sched_init() is called before page_alloc is setup,
8310 * we use alloc_bootmem(). 8795 * we use alloc_bootmem().
@@ -8342,6 +8827,12 @@ void __init sched_init(void)
8342 ptr += nr_cpu_ids * sizeof(void **); 8827 ptr += nr_cpu_ids * sizeof(void **);
8343#endif /* CONFIG_USER_SCHED */ 8828#endif /* CONFIG_USER_SCHED */
8344#endif /* CONFIG_RT_GROUP_SCHED */ 8829#endif /* CONFIG_RT_GROUP_SCHED */
8830#ifdef CONFIG_CPUMASK_OFFSTACK
8831 for_each_possible_cpu(i) {
8832 per_cpu(load_balance_tmpmask, i) = (void *)ptr;
8833 ptr += cpumask_size();
8834 }
8835#endif /* CONFIG_CPUMASK_OFFSTACK */
8345 } 8836 }
8346 8837
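/*
 * Editor's sketch (not part of the patch above): how one contiguous block is
 * carved into a per-CPU cpumask slice, using malloc and made-up sizes in
 * place of alloc_bootmem() and cpumask_size(). tmpmask here stands in for
 * the per-CPU load_balance_tmpmask pointers the loop above initializes.
 */
#include <stdio.h>
#include <stdlib.h>

#define NR_POSSIBLE_CPUS 4    /* stand-in for num_possible_cpus() */
#define CPUMASK_BYTES    16   /* stand-in for cpumask_size() */

int main(void)
{
	char *ptr = malloc(NR_POSSIBLE_CPUS * CPUMASK_BYTES); /* one block */
	void *tmpmask[NR_POSSIBLE_CPUS];

	if (!ptr)
		return 1;

	for (int cpu = 0; cpu < NR_POSSIBLE_CPUS; cpu++) {
		tmpmask[cpu] = ptr;        /* hand this CPU its slice ... */
		ptr += CPUMASK_BYTES;      /* ... and advance to the next one */
	}

	for (int cpu = 0; cpu < NR_POSSIBLE_CPUS; cpu++)
		printf("cpu%d tmpmask at %p\n", cpu, tmpmask[cpu]);

	free(tmpmask[0]);                  /* the block starts at cpu 0's slice */
	return 0;
}
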
8347#ifdef CONFIG_SMP 8838#ifdef CONFIG_SMP
@@ -9220,6 +9711,16 @@ static int sched_rt_global_constraints(void)
9220 9711
9221 return ret; 9712 return ret;
9222} 9713}
9714
9715int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
9716{
9717 /* Don't accept realtime tasks when there is no way for them to run */
9718 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
9719 return 0;
9720
9721 return 1;
9722}
9723
9223#else /* !CONFIG_RT_GROUP_SCHED */ 9724#else /* !CONFIG_RT_GROUP_SCHED */
9224static int sched_rt_global_constraints(void) 9725static int sched_rt_global_constraints(void)
9225{ 9726{
@@ -9313,8 +9814,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
9313 struct task_struct *tsk) 9814 struct task_struct *tsk)
9314{ 9815{
9315#ifdef CONFIG_RT_GROUP_SCHED 9816#ifdef CONFIG_RT_GROUP_SCHED
9316 /* Don't accept realtime tasks when there is no way for them to run */ 9817 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
9317 if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0)
9318 return -EINVAL; 9818 return -EINVAL;
9319#else 9819#else
9320 /* We don't support RT-tasks being in separate groups */ 9820 /* We don't support RT-tasks being in separate groups */
@@ -9477,7 +9977,7 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9477 9977
9478static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) 9978static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
9479{ 9979{
9480 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); 9980 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9481 u64 data; 9981 u64 data;
9482 9982
9483#ifndef CONFIG_64BIT 9983#ifndef CONFIG_64BIT
@@ -9496,7 +9996,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
9496 9996
9497static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) 9997static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
9498{ 9998{
9499 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); 9999 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9500 10000
9501#ifndef CONFIG_64BIT 10001#ifndef CONFIG_64BIT
9502 /* 10002 /*
@@ -9585,14 +10085,14 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9585 struct cpuacct *ca; 10085 struct cpuacct *ca;
9586 int cpu; 10086 int cpu;
9587 10087
9588 if (!cpuacct_subsys.active) 10088 if (unlikely(!cpuacct_subsys.active))
9589 return; 10089 return;
9590 10090
9591 cpu = task_cpu(tsk); 10091 cpu = task_cpu(tsk);
9592 ca = task_ca(tsk); 10092 ca = task_ca(tsk);
9593 10093
9594 for (; ca; ca = ca->parent) { 10094 for (; ca; ca = ca->parent) {
9595 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); 10095 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9596 *cpuusage += cputime; 10096 *cpuusage += cputime;
9597 } 10097 }
9598} 10098}