path: root/kernel/sched.c
author    Ingo Molnar <mingo@elte.hu>  2009-04-06 03:02:57 -0400
committer Ingo Molnar <mingo@elte.hu>  2009-04-06 03:02:57 -0400
commit    f541ae326fa120fa5c57433e4d9a133df212ce41 (patch)
tree      bdbd94ec72cfc601118051cb35e8617d55510177 /kernel/sched.c
parent    e255357764f92afcafafbd4879b222b8c752065a (diff)
parent    0221c81b1b8eb0cbb6b30a0ced52ead32d2b4e4c (diff)
Merge branch 'linus' into perfcounters/core-v2
Merge reason: we have gathered quite a few conflicts, need to merge upstream

Conflicts:
	arch/powerpc/kernel/Makefile
	arch/x86/ia32/ia32entry.S
	arch/x86/include/asm/hardirq.h
	arch/x86/include/asm/unistd_32.h
	arch/x86/include/asm/unistd_64.h
	arch/x86/kernel/cpu/common.c
	arch/x86/kernel/irq.c
	arch/x86/kernel/syscall_table_32.S
	arch/x86/mm/iomap_32.c
	include/linux/sched.h
	kernel/Makefile

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--   kernel/sched.c   1134
1 file changed, 809 insertions, 325 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 78f4424b7c43..39e708602169 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -331,6 +331,13 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
331 */ 331 */
332static DEFINE_SPINLOCK(task_group_lock); 332static DEFINE_SPINLOCK(task_group_lock);
333 333
334#ifdef CONFIG_SMP
335static int root_task_group_empty(void)
336{
337 return list_empty(&root_task_group.children);
338}
339#endif
340
334#ifdef CONFIG_FAIR_GROUP_SCHED 341#ifdef CONFIG_FAIR_GROUP_SCHED
335#ifdef CONFIG_USER_SCHED 342#ifdef CONFIG_USER_SCHED
336# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 343# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
@@ -391,6 +398,13 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
391 398
392#else 399#else
393 400
401#ifdef CONFIG_SMP
402static int root_task_group_empty(void)
403{
404 return 1;
405}
406#endif
407
394static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 408static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
395static inline struct task_group *task_group(struct task_struct *p) 409static inline struct task_group *task_group(struct task_struct *p)
396{ 410{
@@ -467,11 +481,17 @@ struct rt_rq {
467 struct rt_prio_array active; 481 struct rt_prio_array active;
468 unsigned long rt_nr_running; 482 unsigned long rt_nr_running;
469#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 483#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
470 int highest_prio; /* highest queued rt task prio */ 484 struct {
485 int curr; /* highest queued rt task prio */
486#ifdef CONFIG_SMP
487 int next; /* next highest */
488#endif
489 } highest_prio;
471#endif 490#endif
472#ifdef CONFIG_SMP 491#ifdef CONFIG_SMP
473 unsigned long rt_nr_migratory; 492 unsigned long rt_nr_migratory;
474 int overloaded; 493 int overloaded;
494 struct plist_head pushable_tasks;
475#endif 495#endif
476 int rt_throttled; 496 int rt_throttled;
477 u64 rt_time; 497 u64 rt_time;
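
The new highest_prio.curr/next pair and the pushable_tasks plist above come from the RT push/pull rework pulled in by this merge: migratable RT tasks are kept in a priority-ordered list so the push path can take its best candidate straight from the head. As a rough illustration of that ordering only (a toy analogue, not the kernel's plist API), a priority-ordered insert might look like this:

#include <stdio.h>

/*
 * Toy analogue of a priority-ordered pushable-task list. Lower prio value
 * means higher priority, mirroring kernel RT priorities. This is not the
 * kernel's plist implementation, just a sketch of the ordering idea.
 */
struct pnode {
        int prio;
        struct pnode *next;
};

/* Insert so the list stays sorted by ascending prio value. */
static void pnode_add(struct pnode **head, struct pnode *node)
{
        while (*head && (*head)->prio <= node->prio)
                head = &(*head)->next;
        node->next = *head;
        *head = node;
}

int main(void)
{
        struct pnode a = { .prio = 40 }, b = { .prio = 10 }, c = { .prio = 25 };
        struct pnode *head = NULL;

        pnode_add(&head, &a);
        pnode_add(&head, &b);
        pnode_add(&head, &c);

        /* The head is always the highest-priority pushable candidate. */
        printf("next pushable prio: %d\n", head->prio);        /* prints 10 */
        return 0;
}
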
@@ -549,7 +569,6 @@ struct rq {
549 unsigned long nr_running; 569 unsigned long nr_running;
550 #define CPU_LOAD_IDX_MAX 5 570 #define CPU_LOAD_IDX_MAX 5
551 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 571 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
552 unsigned char idle_at_tick;
553#ifdef CONFIG_NO_HZ 572#ifdef CONFIG_NO_HZ
554 unsigned long last_tick_seen; 573 unsigned long last_tick_seen;
555 unsigned char in_nohz_recently; 574 unsigned char in_nohz_recently;
@@ -591,6 +610,7 @@ struct rq {
591 struct root_domain *rd; 610 struct root_domain *rd;
592 struct sched_domain *sd; 611 struct sched_domain *sd;
593 612
613 unsigned char idle_at_tick;
594 /* For active balancing */ 614 /* For active balancing */
595 int active_balance; 615 int active_balance;
596 int push_cpu; 616 int push_cpu;
@@ -619,9 +639,6 @@ struct rq {
619 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ 639 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
620 640
621 /* sys_sched_yield() stats */ 641 /* sys_sched_yield() stats */
622 unsigned int yld_exp_empty;
623 unsigned int yld_act_empty;
624 unsigned int yld_both_empty;
625 unsigned int yld_count; 642 unsigned int yld_count;
626 643
627 /* schedule() stats */ 644 /* schedule() stats */
@@ -1114,7 +1131,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
1114 if (rq == this_rq()) { 1131 if (rq == this_rq()) {
1115 hrtimer_restart(timer); 1132 hrtimer_restart(timer);
1116 } else if (!rq->hrtick_csd_pending) { 1133 } else if (!rq->hrtick_csd_pending) {
1117 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd); 1134 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
1118 rq->hrtick_csd_pending = 1; 1135 rq->hrtick_csd_pending = 1;
1119 } 1136 }
1120} 1137}
@@ -1204,10 +1221,10 @@ static void resched_task(struct task_struct *p)
1204 1221
1205 assert_spin_locked(&task_rq(p)->lock); 1222 assert_spin_locked(&task_rq(p)->lock);
1206 1223
1207 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) 1224 if (test_tsk_need_resched(p))
1208 return; 1225 return;
1209 1226
1210 set_tsk_thread_flag(p, TIF_NEED_RESCHED); 1227 set_tsk_need_resched(p);
1211 1228
1212 cpu = task_cpu(p); 1229 cpu = task_cpu(p);
1213 if (cpu == smp_processor_id()) 1230 if (cpu == smp_processor_id())
@@ -1263,7 +1280,7 @@ void wake_up_idle_cpu(int cpu)
1263 * lockless. The worst case is that the other CPU runs the 1280 * lockless. The worst case is that the other CPU runs the
1264 * idle task through an additional NOOP schedule() 1281 * idle task through an additional NOOP schedule()
1265 */ 1282 */
1266 set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED); 1283 set_tsk_need_resched(rq->idle);
1267 1284
1268 /* NEED_RESCHED must be visible before we test polling */ 1285 /* NEED_RESCHED must be visible before we test polling */
1269 smp_mb(); 1286 smp_mb();
@@ -1631,21 +1648,42 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1631 1648
1632#endif 1649#endif
1633 1650
1651#ifdef CONFIG_PREEMPT
1652
1634/* 1653/*
1635 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 1654 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1655 * way at the expense of forcing extra atomic operations in all
1656 * invocations. This assures that the double_lock is acquired using the
1657 * same underlying policy as the spinlock_t on this architecture, which
1658 * reduces latency compared to the unfair variant below. However, it
1659 * also adds more overhead and therefore may reduce throughput.
1636 */ 1660 */
1637static int double_lock_balance(struct rq *this_rq, struct rq *busiest) 1661static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1662 __releases(this_rq->lock)
1663 __acquires(busiest->lock)
1664 __acquires(this_rq->lock)
1665{
1666 spin_unlock(&this_rq->lock);
1667 double_rq_lock(this_rq, busiest);
1668
1669 return 1;
1670}
1671
1672#else
1673/*
1674 * Unfair double_lock_balance: Optimizes throughput at the expense of
1675 * latency by eliminating extra atomic operations when the locks are
1676 * already in proper order on entry. This favors lower cpu-ids and will
1677 * grant the double lock to lower cpus over higher ids under contention,
1678 * regardless of entry order into the function.
1679 */
1680static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1638 __releases(this_rq->lock) 1681 __releases(this_rq->lock)
1639 __acquires(busiest->lock) 1682 __acquires(busiest->lock)
1640 __acquires(this_rq->lock) 1683 __acquires(this_rq->lock)
1641{ 1684{
1642 int ret = 0; 1685 int ret = 0;
1643 1686
1644 if (unlikely(!irqs_disabled())) {
1645 /* printk() doesn't work good under rq->lock */
1646 spin_unlock(&this_rq->lock);
1647 BUG_ON(1);
1648 }
1649 if (unlikely(!spin_trylock(&busiest->lock))) { 1687 if (unlikely(!spin_trylock(&busiest->lock))) {
1650 if (busiest < this_rq) { 1688 if (busiest < this_rq) {
1651 spin_unlock(&this_rq->lock); 1689 spin_unlock(&this_rq->lock);
@@ -1658,6 +1696,22 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1658 return ret; 1696 return ret;
1659} 1697}
1660 1698
1699#endif /* CONFIG_PREEMPT */
1700
1701/*
1702 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1703 */
1704static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1705{
1706 if (unlikely(!irqs_disabled())) {
1707 /* printk() doesn't work good under rq->lock */
1708 spin_unlock(&this_rq->lock);
1709 BUG_ON(1);
1710 }
1711
1712 return _double_lock_balance(this_rq, busiest);
1713}
1714
1661static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) 1715static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1662 __releases(busiest->lock) 1716 __releases(busiest->lock)
1663{ 1717{
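
Both variants above end up respecting a single ordering on the two runqueue locks (double_rq_lock, as far as I recall, also takes the lower-addressed rq first); the unfair path merely skips the unlock/relock when the trylock succeeds or the locks are already in order. A hedged userspace sketch of the same deadlock-avoidance pattern, with pthread mutexes and pointer order standing in for the rq pointers:

#include <pthread.h>
#include <stdio.h>

/*
 * Acquire a second mutex while holding another, without deadlocking, by
 * falling back to a fixed (address-based) ordering when the opportunistic
 * trylock fails. Returns 1 if the held lock had to be dropped and retaken,
 * which is the same thing double_lock_balance() reports to its caller.
 */
static int double_lock(pthread_mutex_t *held, pthread_mutex_t *want)
{
        int dropped = 0;

        if (pthread_mutex_trylock(want) != 0) {
                if (want < held) {
                        /* lower address must be taken first: back off */
                        pthread_mutex_unlock(held);
                        pthread_mutex_lock(want);
                        pthread_mutex_lock(held);
                        dropped = 1;
                } else {
                        pthread_mutex_lock(want);
                }
        }
        return dropped;
}

int main(void)
{
        pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;

        pthread_mutex_lock(&a);
        printf("dropped and retook the held lock: %d\n", double_lock(&a, &b));
        pthread_mutex_unlock(&b);
        pthread_mutex_unlock(&a);
        return 0;
}
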
@@ -1726,6 +1780,9 @@ static void update_avg(u64 *avg, u64 sample)
1726 1780
1727static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1781static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1728{ 1782{
1783 if (wakeup)
1784 p->se.start_runtime = p->se.sum_exec_runtime;
1785
1729 sched_info_queued(p); 1786 sched_info_queued(p);
1730 p->sched_class->enqueue_task(rq, p, wakeup); 1787 p->sched_class->enqueue_task(rq, p, wakeup);
1731 p->se.on_rq = 1; 1788 p->se.on_rq = 1;
@@ -1733,10 +1790,15 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1733 1790
1734static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) 1791static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1735{ 1792{
1736 if (sleep && p->se.last_wakeup) { 1793 if (sleep) {
1737 update_avg(&p->se.avg_overlap, 1794 if (p->se.last_wakeup) {
1738 p->se.sum_exec_runtime - p->se.last_wakeup); 1795 update_avg(&p->se.avg_overlap,
1739 p->se.last_wakeup = 0; 1796 p->se.sum_exec_runtime - p->se.last_wakeup);
1797 p->se.last_wakeup = 0;
1798 } else {
1799 update_avg(&p->se.avg_wakeup,
1800 sysctl_sched_wakeup_granularity);
1801 }
1740 } 1802 }
1741 1803
1742 sched_info_dequeued(p); 1804 sched_info_dequeued(p);
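
avg_overlap and avg_wakeup above are both maintained through update_avg(), whose signature shows up in the hunk header a little further up; to the best of my recollection it is a simple 1/8-weighted exponential average. A standalone sketch of that estimator, with the weighting stated as an assumption:

#include <stdio.h>
#include <stdint.h>

/*
 * Sketch of an update_avg()-style estimator. The 1/8 weighting
 * (avg += (sample - avg) / 8, done with an arithmetic shift) is an
 * assumption about the helper referenced in the hunk header above.
 */
static void update_avg(uint64_t *avg, uint64_t sample)
{
        int64_t diff = (int64_t)(sample - *avg);

        *avg += diff >> 3;
}

int main(void)
{
        uint64_t avg_overlap = 0;
        uint64_t samples[] = { 800, 800, 100, 100, 100 };   /* ns run before sleeping */

        for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
                update_avg(&avg_overlap, samples[i]);
                printf("sample %3llu -> avg_overlap %3llu\n",
                       (unsigned long long)samples[i],
                       (unsigned long long)avg_overlap);
        }
        return 0;
}

The new else-branch in dequeue_task() uses the same helper to pull avg_wakeup toward sysctl_sched_wakeup_granularity when a task goes to sleep without having recorded a wakeup of its own.
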
@@ -2041,7 +2103,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2041 * it must be off the runqueue _entirely_, and not 2103 * it must be off the runqueue _entirely_, and not
2042 * preempted! 2104 * preempted!
2043 * 2105 *
2044 * So if it wa still runnable (but just not actively 2106 * So if it was still runnable (but just not actively
2045 * running right now), it's preempted, and we should 2107 * running right now), it's preempted, and we should
2046 * yield - it could be a while. 2108 * yield - it could be a while.
2047 */ 2109 */
@@ -2312,7 +2374,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2312 sync = 0; 2374 sync = 0;
2313 2375
2314#ifdef CONFIG_SMP 2376#ifdef CONFIG_SMP
2315 if (sched_feat(LB_WAKEUP_UPDATE)) { 2377 if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
2316 struct sched_domain *sd; 2378 struct sched_domain *sd;
2317 2379
2318 this_cpu = raw_smp_processor_id(); 2380 this_cpu = raw_smp_processor_id();
@@ -2390,6 +2452,22 @@ out_activate:
2390 activate_task(rq, p, 1); 2452 activate_task(rq, p, 1);
2391 success = 1; 2453 success = 1;
2392 2454
2455 /*
2456 * Only attribute actual wakeups done by this task.
2457 */
2458 if (!in_interrupt()) {
2459 struct sched_entity *se = &current->se;
2460 u64 sample = se->sum_exec_runtime;
2461
2462 if (se->last_wakeup)
2463 sample -= se->last_wakeup;
2464 else
2465 sample -= se->start_runtime;
2466 update_avg(&se->avg_wakeup, sample);
2467
2468 se->last_wakeup = se->sum_exec_runtime;
2469 }
2470
2393out_running: 2471out_running:
2394 trace_sched_wakeup(rq, p, success); 2472 trace_sched_wakeup(rq, p, success);
2395 check_preempt_curr(rq, p, sync); 2473 check_preempt_curr(rq, p, sync);
@@ -2400,8 +2478,6 @@ out_running:
2400 p->sched_class->task_wake_up(rq, p); 2478 p->sched_class->task_wake_up(rq, p);
2401#endif 2479#endif
2402out: 2480out:
2403 current->se.last_wakeup = current->se.sum_exec_runtime;
2404
2405 task_rq_unlock(rq, &flags); 2481 task_rq_unlock(rq, &flags);
2406 2482
2407 return success; 2483 return success;
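
The block added above attributes the wakeup to the waking task: it samples how much of the waker's own execution time has passed since its previous wakeup (or since it was enqueued, via the new start_runtime field) and folds that into avg_wakeup. A self-contained sketch of just that bookkeeping; the struct and the account_wakeup() helper are hypothetical stand-ins, only the field names mirror the patch:

#include <stdio.h>
#include <stdint.h>

/* Hypothetical stand-in for the sched_entity fields used above. */
struct waker {
        uint64_t sum_exec_runtime;      /* total CPU time this task has consumed */
        uint64_t start_runtime;         /* runtime snapshot taken at enqueue */
        uint64_t last_wakeup;           /* runtime at its previous wakeup, 0 = none yet */
        uint64_t avg_wakeup;            /* running average of the sampled interval */
};

static void update_avg(uint64_t *avg, uint64_t sample)
{
        *avg += (int64_t)(sample - *avg) >> 3;  /* assumed 1/8 weighting */
}

/* What try_to_wake_up() now does when the waker is not in interrupt context. */
static void account_wakeup(struct waker *w)
{
        uint64_t sample = w->sum_exec_runtime;

        if (w->last_wakeup)
                sample -= w->last_wakeup;
        else
                sample -= w->start_runtime;

        update_avg(&w->avg_wakeup, sample);
        w->last_wakeup = w->sum_exec_runtime;
}

int main(void)
{
        struct waker w = { .start_runtime = 1000 };

        w.sum_exec_runtime = 1500;      /* ran 500 ns since enqueue, then wakes someone */
        account_wakeup(&w);
        w.sum_exec_runtime = 2700;      /* another 1200 ns, then wakes someone again */
        account_wakeup(&w);

        printf("avg_wakeup: %llu ns\n", (unsigned long long)w.avg_wakeup);
        return 0;
}
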
@@ -2432,6 +2508,8 @@ static void __sched_fork(struct task_struct *p)
2432 p->se.nr_migrations = 0; 2508 p->se.nr_migrations = 0;
2433 p->se.last_wakeup = 0; 2509 p->se.last_wakeup = 0;
2434 p->se.avg_overlap = 0; 2510 p->se.avg_overlap = 0;
2511 p->se.start_runtime = 0;
2512 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2435 2513
2436#ifdef CONFIG_SCHEDSTATS 2514#ifdef CONFIG_SCHEDSTATS
2437 p->se.wait_start = 0; 2515 p->se.wait_start = 0;
@@ -2494,6 +2572,8 @@ void sched_fork(struct task_struct *p, int clone_flags)
2494 /* Want to start with kernel preemption disabled. */ 2572 /* Want to start with kernel preemption disabled. */
2495 task_thread_info(p)->preempt_count = 1; 2573 task_thread_info(p)->preempt_count = 1;
2496#endif 2574#endif
2575 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2576
2497 put_cpu(); 2577 put_cpu();
2498} 2578}
2499 2579
@@ -2537,7 +2617,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2537#ifdef CONFIG_PREEMPT_NOTIFIERS 2617#ifdef CONFIG_PREEMPT_NOTIFIERS
2538 2618
2539/** 2619/**
2540 * preempt_notifier_register - tell me when current is being being preempted & rescheduled 2620 * preempt_notifier_register - tell me when current is being preempted & rescheduled
2541 * @notifier: notifier struct to register 2621 * @notifier: notifier struct to register
2542 */ 2622 */
2543void preempt_notifier_register(struct preempt_notifier *notifier) 2623void preempt_notifier_register(struct preempt_notifier *notifier)
@@ -2634,6 +2714,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2634{ 2714{
2635 struct mm_struct *mm = rq->prev_mm; 2715 struct mm_struct *mm = rq->prev_mm;
2636 long prev_state; 2716 long prev_state;
2717#ifdef CONFIG_SMP
2718 int post_schedule = 0;
2719
2720 if (current->sched_class->needs_post_schedule)
2721 post_schedule = current->sched_class->needs_post_schedule(rq);
2722#endif
2637 2723
2638 rq->prev_mm = NULL; 2724 rq->prev_mm = NULL;
2639 2725
@@ -2653,7 +2739,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2653 perf_counter_task_sched_in(current, cpu_of(rq)); 2739 perf_counter_task_sched_in(current, cpu_of(rq));
2654 finish_lock_switch(rq, prev); 2740 finish_lock_switch(rq, prev);
2655#ifdef CONFIG_SMP 2741#ifdef CONFIG_SMP
2656 if (current->sched_class->post_schedule) 2742 if (post_schedule)
2657 current->sched_class->post_schedule(rq); 2743 current->sched_class->post_schedule(rq);
2658#endif 2744#endif
2659 2745
@@ -2975,6 +3061,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2975 struct sched_domain *sd, enum cpu_idle_type idle, 3061 struct sched_domain *sd, enum cpu_idle_type idle,
2976 int *all_pinned) 3062 int *all_pinned)
2977{ 3063{
3064 int tsk_cache_hot = 0;
2978 /* 3065 /*
2979 * We do not migrate tasks that are: 3066 * We do not migrate tasks that are:
2980 * 1) running (obviously), or 3067 * 1) running (obviously), or
@@ -2998,10 +3085,11 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2998 * 2) too many balance attempts have failed. 3085 * 2) too many balance attempts have failed.
2999 */ 3086 */
3000 3087
3001 if (!task_hot(p, rq->clock, sd) || 3088 tsk_cache_hot = task_hot(p, rq->clock, sd);
3002 sd->nr_balance_failed > sd->cache_nice_tries) { 3089 if (!tsk_cache_hot ||
3090 sd->nr_balance_failed > sd->cache_nice_tries) {
3003#ifdef CONFIG_SCHEDSTATS 3091#ifdef CONFIG_SCHEDSTATS
3004 if (task_hot(p, rq->clock, sd)) { 3092 if (tsk_cache_hot) {
3005 schedstat_inc(sd, lb_hot_gained[idle]); 3093 schedstat_inc(sd, lb_hot_gained[idle]);
3006 schedstat_inc(p, se.nr_forced_migrations); 3094 schedstat_inc(p, se.nr_forced_migrations);
3007 } 3095 }
@@ -3009,7 +3097,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3009 return 1; 3097 return 1;
3010 } 3098 }
3011 3099
3012 if (task_hot(p, rq->clock, sd)) { 3100 if (tsk_cache_hot) {
3013 schedstat_inc(p, se.nr_failed_migrations_hot); 3101 schedstat_inc(p, se.nr_failed_migrations_hot);
3014 return 0; 3102 return 0;
3015 } 3103 }
@@ -3049,6 +3137,16 @@ next:
3049 pulled++; 3137 pulled++;
3050 rem_load_move -= p->se.load.weight; 3138 rem_load_move -= p->se.load.weight;
3051 3139
3140#ifdef CONFIG_PREEMPT
3141 /*
3142 * NEWIDLE balancing is a source of latency, so preemptible kernels
3143 * will stop after the first task is pulled to minimize the critical
3144 * section.
3145 */
3146 if (idle == CPU_NEWLY_IDLE)
3147 goto out;
3148#endif
3149
3052 /* 3150 /*
3053 * We only want to steal up to the prescribed amount of weighted load. 3151 * We only want to steal up to the prescribed amount of weighted load.
3054 */ 3152 */
@@ -3095,9 +3193,15 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3095 sd, idle, all_pinned, &this_best_prio); 3193 sd, idle, all_pinned, &this_best_prio);
3096 class = class->next; 3194 class = class->next;
3097 3195
3196#ifdef CONFIG_PREEMPT
3197 /*
3198 * NEWIDLE balancing is a source of latency, so preemptible
3199 * kernels will stop after the first task is pulled to minimize
3200 * the critical section.
3201 */
3098 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) 3202 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3099 break; 3203 break;
3100 3204#endif
3101 } while (class && max_load_move > total_load_moved); 3205 } while (class && max_load_move > total_load_moved);
3102 3206
3103 return total_load_moved > 0; 3207 return total_load_moved > 0;
@@ -3147,246 +3251,480 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3147 3251
3148 return 0; 3252 return 0;
3149} 3253}
3150 3254/********** Helpers for find_busiest_group ************************/
3151/* 3255/*
3152 * find_busiest_group finds and returns the busiest CPU group within the 3256 * sd_lb_stats - Structure to store the statistics of a sched_domain
3153 * domain. It calculates and returns the amount of weighted load which 3257 * during load balancing.
3154 * should be moved to restore balance via the imbalance parameter.
3155 */ 3258 */
3156static struct sched_group * 3259struct sd_lb_stats {
3157find_busiest_group(struct sched_domain *sd, int this_cpu, 3260 struct sched_group *busiest; /* Busiest group in this sd */
3158 unsigned long *imbalance, enum cpu_idle_type idle, 3261 struct sched_group *this; /* Local group in this sd */
3159 int *sd_idle, const struct cpumask *cpus, int *balance) 3262 unsigned long total_load; /* Total load of all groups in sd */
3160{ 3263 unsigned long total_pwr; /* Total power of all groups in sd */
3161 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 3264 unsigned long avg_load; /* Average load across all groups in sd */
3162 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 3265
3163 unsigned long max_pull; 3266 /** Statistics of this group */
3164 unsigned long busiest_load_per_task, busiest_nr_running; 3267 unsigned long this_load;
3165 unsigned long this_load_per_task, this_nr_running; 3268 unsigned long this_load_per_task;
3166 int load_idx, group_imb = 0; 3269 unsigned long this_nr_running;
3270
3271 /* Statistics of the busiest group */
3272 unsigned long max_load;
3273 unsigned long busiest_load_per_task;
3274 unsigned long busiest_nr_running;
3275
3276 int group_imb; /* Is there imbalance in this sd */
3167#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 3277#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3168 int power_savings_balance = 1; 3278 int power_savings_balance; /* Is powersave balance needed for this sd */
3169 unsigned long leader_nr_running = 0, min_load_per_task = 0; 3279 struct sched_group *group_min; /* Least loaded group in sd */
3170 unsigned long min_nr_running = ULONG_MAX; 3280 struct sched_group *group_leader; /* Group which relieves group_min */
3171 struct sched_group *group_min = NULL, *group_leader = NULL; 3281 unsigned long min_load_per_task; /* load_per_task in group_min */
3282 unsigned long leader_nr_running; /* Nr running of group_leader */
3283 unsigned long min_nr_running; /* Nr running of group_min */
3172#endif 3284#endif
3285};
3173 3286
3174 max_load = this_load = total_load = total_pwr = 0; 3287/*
3175 busiest_load_per_task = busiest_nr_running = 0; 3288 * sg_lb_stats - stats of a sched_group required for load_balancing
3176 this_load_per_task = this_nr_running = 0; 3289 */
3290struct sg_lb_stats {
3291 unsigned long avg_load; /*Avg load across the CPUs of the group */
3292 unsigned long group_load; /* Total load over the CPUs of the group */
3293 unsigned long sum_nr_running; /* Nr tasks running in the group */
3294 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
3295 unsigned long group_capacity;
3296 int group_imb; /* Is there an imbalance in the group ? */
3297};
3298
3299/**
3300 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3301 * @group: The group whose first cpu is to be returned.
3302 */
3303static inline unsigned int group_first_cpu(struct sched_group *group)
3304{
3305 return cpumask_first(sched_group_cpus(group));
3306}
3307
3308/**
3309 * get_sd_load_idx - Obtain the load index for a given sched domain.
3310 * @sd: The sched_domain whose load_idx is to be obtained.
 3311 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
3312 */
3313static inline int get_sd_load_idx(struct sched_domain *sd,
3314 enum cpu_idle_type idle)
3315{
3316 int load_idx;
3177 3317
3178 if (idle == CPU_NOT_IDLE) 3318 switch (idle) {
3319 case CPU_NOT_IDLE:
3179 load_idx = sd->busy_idx; 3320 load_idx = sd->busy_idx;
3180 else if (idle == CPU_NEWLY_IDLE) 3321 break;
3322
3323 case CPU_NEWLY_IDLE:
3181 load_idx = sd->newidle_idx; 3324 load_idx = sd->newidle_idx;
3182 else 3325 break;
3326 default:
3183 load_idx = sd->idle_idx; 3327 load_idx = sd->idle_idx;
3328 break;
3329 }
3184 3330
3185 do { 3331 return load_idx;
3186 unsigned long load, group_capacity, max_cpu_load, min_cpu_load; 3332}
3187 int local_group;
3188 int i;
3189 int __group_imb = 0;
3190 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3191 unsigned long sum_nr_running, sum_weighted_load;
3192 unsigned long sum_avg_load_per_task;
3193 unsigned long avg_load_per_task;
3194 3333
3195 local_group = cpumask_test_cpu(this_cpu,
3196 sched_group_cpus(group));
3197 3334
3198 if (local_group) 3335#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3199 balance_cpu = cpumask_first(sched_group_cpus(group)); 3336/**
3337 * init_sd_power_savings_stats - Initialize power savings statistics for
3338 * the given sched_domain, during load balancing.
3339 *
3340 * @sd: Sched domain whose power-savings statistics are to be initialized.
3341 * @sds: Variable containing the statistics for sd.
3342 * @idle: Idle status of the CPU at which we're performing load-balancing.
3343 */
3344static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3345 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3346{
3347 /*
3348 * Busy processors will not participate in power savings
3349 * balance.
3350 */
3351 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3352 sds->power_savings_balance = 0;
3353 else {
3354 sds->power_savings_balance = 1;
3355 sds->min_nr_running = ULONG_MAX;
3356 sds->leader_nr_running = 0;
3357 }
3358}
3200 3359
3201 /* Tally up the load of all CPUs in the group */ 3360/**
3202 sum_weighted_load = sum_nr_running = avg_load = 0; 3361 * update_sd_power_savings_stats - Update the power saving stats for a
3203 sum_avg_load_per_task = avg_load_per_task = 0; 3362 * sched_domain while performing load balancing.
3363 *
3364 * @group: sched_group belonging to the sched_domain under consideration.
3365 * @sds: Variable containing the statistics of the sched_domain
3366 * @local_group: Does group contain the CPU for which we're performing
3367 * load balancing ?
3368 * @sgs: Variable containing the statistics of the group.
3369 */
3370static inline void update_sd_power_savings_stats(struct sched_group *group,
3371 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3372{
3204 3373
3205 max_cpu_load = 0; 3374 if (!sds->power_savings_balance)
3206 min_cpu_load = ~0UL; 3375 return;
3207 3376
3208 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 3377 /*
3209 struct rq *rq = cpu_rq(i); 3378 * If the local group is idle or completely loaded
3379 * no need to do power savings balance at this domain
3380 */
3381 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3382 !sds->this_nr_running))
3383 sds->power_savings_balance = 0;
3210 3384
3211 if (*sd_idle && rq->nr_running) 3385 /*
3212 *sd_idle = 0; 3386 * If a group is already running at full capacity or idle,
3387 * don't include that group in power savings calculations
3388 */
3389 if (!sds->power_savings_balance ||
3390 sgs->sum_nr_running >= sgs->group_capacity ||
3391 !sgs->sum_nr_running)
3392 return;
3213 3393
3214 /* Bias balancing toward cpus of our domain */ 3394 /*
3215 if (local_group) { 3395 * Calculate the group which has the least non-idle load.
3216 if (idle_cpu(i) && !first_idle_cpu) { 3396 * This is the group from where we need to pick up the load
3217 first_idle_cpu = 1; 3397 * for saving power
3218 balance_cpu = i; 3398 */
3219 } 3399 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3400 (sgs->sum_nr_running == sds->min_nr_running &&
3401 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3402 sds->group_min = group;
3403 sds->min_nr_running = sgs->sum_nr_running;
3404 sds->min_load_per_task = sgs->sum_weighted_load /
3405 sgs->sum_nr_running;
3406 }
3220 3407
3221 load = target_load(i, load_idx); 3408 /*
3222 } else { 3409 * Calculate the group which is almost near its
3223 load = source_load(i, load_idx); 3410 * capacity but still has some space to pick up some load
3224 if (load > max_cpu_load) 3411 * from other group and save more power
3225 max_cpu_load = load; 3412 */
3226 if (min_cpu_load > load) 3413 if (sgs->sum_nr_running > sgs->group_capacity - 1)
3227 min_cpu_load = load; 3414 return;
3228 }
3229 3415
3230 avg_load += load; 3416 if (sgs->sum_nr_running > sds->leader_nr_running ||
3231 sum_nr_running += rq->nr_running; 3417 (sgs->sum_nr_running == sds->leader_nr_running &&
3232 sum_weighted_load += weighted_cpuload(i); 3418 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3419 sds->group_leader = group;
3420 sds->leader_nr_running = sgs->sum_nr_running;
3421 }
3422}
3233 3423
3234 sum_avg_load_per_task += cpu_avg_load_per_task(i); 3424/**
3235 } 3425 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3426 * @sds: Variable containing the statistics of the sched_domain
3427 * under consideration.
3428 * @this_cpu: Cpu at which we're currently performing load-balancing.
3429 * @imbalance: Variable to store the imbalance.
3430 *
3431 * Description:
3432 * Check if we have potential to perform some power-savings balance.
3433 * If yes, set the busiest group to be the least loaded group in the
 3434 * sched_domain, so that its CPUs can be put to idle.
3435 *
3436 * Returns 1 if there is potential to perform power-savings balance.
3437 * Else returns 0.
3438 */
3439static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3440 int this_cpu, unsigned long *imbalance)
3441{
3442 if (!sds->power_savings_balance)
3443 return 0;
3236 3444
3237 /* 3445 if (sds->this != sds->group_leader ||
3238 * First idle cpu or the first cpu(busiest) in this sched group 3446 sds->group_leader == sds->group_min)
3239 * is eligible for doing load balancing at this and above 3447 return 0;
3240 * domains. In the newly idle case, we will allow all the cpu's
3241 * to do the newly idle load balance.
3242 */
3243 if (idle != CPU_NEWLY_IDLE && local_group &&
3244 balance_cpu != this_cpu && balance) {
3245 *balance = 0;
3246 goto ret;
3247 }
3248 3448
3249 total_load += avg_load; 3449 *imbalance = sds->min_load_per_task;
3250 total_pwr += group->__cpu_power; 3450 sds->busiest = sds->group_min;
3251 3451
3252 /* Adjust by relative CPU power of the group */ 3452 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
3253 avg_load = sg_div_cpu_power(group, 3453 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
3254 avg_load * SCHED_LOAD_SCALE); 3454 group_first_cpu(sds->group_leader);
3455 }
3255 3456
3457 return 1;
3256 3458
3257 /* 3459}
3258 * Consider the group unbalanced when the imbalance is larger 3460#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3259 * than the average weight of two tasks. 3461static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3260 * 3462 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3261 * APZ: with cgroup the avg task weight can vary wildly and 3463{
3262 * might not be a suitable number - should we keep a 3464 return;
3263 * normalized nr_running number somewhere that negates 3465}
3264 * the hierarchy? 3466
3265 */ 3467static inline void update_sd_power_savings_stats(struct sched_group *group,
3266 avg_load_per_task = sg_div_cpu_power(group, 3468 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3267 sum_avg_load_per_task * SCHED_LOAD_SCALE); 3469{
3470 return;
3471}
3472
3473static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3474 int this_cpu, unsigned long *imbalance)
3475{
3476 return 0;
3477}
3478#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3479
3480
3481/**
3482 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3483 * @group: sched_group whose statistics are to be updated.
3484 * @this_cpu: Cpu for which load balance is currently performed.
3485 * @idle: Idle status of this_cpu
3486 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3487 * @sd_idle: Idle status of the sched_domain containing group.
3488 * @local_group: Does group contain this_cpu.
3489 * @cpus: Set of cpus considered for load balancing.
3490 * @balance: Should we balance.
3491 * @sgs: variable to hold the statistics for this group.
3492 */
3493static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3494 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3495 int local_group, const struct cpumask *cpus,
3496 int *balance, struct sg_lb_stats *sgs)
3497{
3498 unsigned long load, max_cpu_load, min_cpu_load;
3499 int i;
3500 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3501 unsigned long sum_avg_load_per_task;
3502 unsigned long avg_load_per_task;
3503
3504 if (local_group)
3505 balance_cpu = group_first_cpu(group);
3268 3506
3269 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) 3507 /* Tally up the load of all CPUs in the group */
3270 __group_imb = 1; 3508 sum_avg_load_per_task = avg_load_per_task = 0;
3509 max_cpu_load = 0;
3510 min_cpu_load = ~0UL;
3271 3511
3272 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; 3512 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3513 struct rq *rq = cpu_rq(i);
3273 3514
3515 if (*sd_idle && rq->nr_running)
3516 *sd_idle = 0;
3517
3518 /* Bias balancing toward cpus of our domain */
3274 if (local_group) { 3519 if (local_group) {
3275 this_load = avg_load; 3520 if (idle_cpu(i) && !first_idle_cpu) {
3276 this = group; 3521 first_idle_cpu = 1;
3277 this_nr_running = sum_nr_running; 3522 balance_cpu = i;
3278 this_load_per_task = sum_weighted_load; 3523 }
3279 } else if (avg_load > max_load && 3524
3280 (sum_nr_running > group_capacity || __group_imb)) { 3525 load = target_load(i, load_idx);
3281 max_load = avg_load; 3526 } else {
3282 busiest = group; 3527 load = source_load(i, load_idx);
3283 busiest_nr_running = sum_nr_running; 3528 if (load > max_cpu_load)
3284 busiest_load_per_task = sum_weighted_load; 3529 max_cpu_load = load;
3285 group_imb = __group_imb; 3530 if (min_cpu_load > load)
3531 min_cpu_load = load;
3286 } 3532 }
3287 3533
3288#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 3534 sgs->group_load += load;
3289 /* 3535 sgs->sum_nr_running += rq->nr_running;
3290 * Busy processors will not participate in power savings 3536 sgs->sum_weighted_load += weighted_cpuload(i);
3291 * balance.
3292 */
3293 if (idle == CPU_NOT_IDLE ||
3294 !(sd->flags & SD_POWERSAVINGS_BALANCE))
3295 goto group_next;
3296 3537
3297 /* 3538 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3298 * If the local group is idle or completely loaded 3539 }
3299 * no need to do power savings balance at this domain
3300 */
3301 if (local_group && (this_nr_running >= group_capacity ||
3302 !this_nr_running))
3303 power_savings_balance = 0;
3304 3540
3305 /* 3541 /*
3306 * If a group is already running at full capacity or idle, 3542 * First idle cpu or the first cpu(busiest) in this sched group
3307 * don't include that group in power savings calculations 3543 * is eligible for doing load balancing at this and above
3308 */ 3544 * domains. In the newly idle case, we will allow all the cpu's
3309 if (!power_savings_balance || sum_nr_running >= group_capacity 3545 * to do the newly idle load balance.
3310 || !sum_nr_running) 3546 */
3311 goto group_next; 3547 if (idle != CPU_NEWLY_IDLE && local_group &&
3548 balance_cpu != this_cpu && balance) {
3549 *balance = 0;
3550 return;
3551 }
3312 3552
3313 /* 3553 /* Adjust by relative CPU power of the group */
3314 * Calculate the group which has the least non-idle load. 3554 sgs->avg_load = sg_div_cpu_power(group,
3315 * This is the group from where we need to pick up the load 3555 sgs->group_load * SCHED_LOAD_SCALE);
3316 * for saving power
3317 */
3318 if ((sum_nr_running < min_nr_running) ||
3319 (sum_nr_running == min_nr_running &&
3320 cpumask_first(sched_group_cpus(group)) >
3321 cpumask_first(sched_group_cpus(group_min)))) {
3322 group_min = group;
3323 min_nr_running = sum_nr_running;
3324 min_load_per_task = sum_weighted_load /
3325 sum_nr_running;
3326 }
3327 3556
3328 /* 3557
3329 * Calculate the group which is almost near its 3558 /*
3330 * capacity but still has some space to pick up some load 3559 * Consider the group unbalanced when the imbalance is larger
3331 * from other group and save more power 3560 * than the average weight of two tasks.
3332 */ 3561 *
3333 if (sum_nr_running <= group_capacity - 1) { 3562 * APZ: with cgroup the avg task weight can vary wildly and
3334 if (sum_nr_running > leader_nr_running || 3563 * might not be a suitable number - should we keep a
3335 (sum_nr_running == leader_nr_running && 3564 * normalized nr_running number somewhere that negates
3336 cpumask_first(sched_group_cpus(group)) < 3565 * the hierarchy?
3337 cpumask_first(sched_group_cpus(group_leader)))) { 3566 */
3338 group_leader = group; 3567 avg_load_per_task = sg_div_cpu_power(group,
3339 leader_nr_running = sum_nr_running; 3568 sum_avg_load_per_task * SCHED_LOAD_SCALE);
3340 } 3569
3570 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3571 sgs->group_imb = 1;
3572
3573 sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
3574
3575}
3576
3577/**
3578 * update_sd_lb_stats - Update sched_group's statistics for load balancing.
3579 * @sd: sched_domain whose statistics are to be updated.
3580 * @this_cpu: Cpu for which load balance is currently performed.
3581 * @idle: Idle status of this_cpu
3582 * @sd_idle: Idle status of the sched_domain containing group.
3583 * @cpus: Set of cpus considered for load balancing.
3584 * @balance: Should we balance.
3585 * @sds: variable to hold the statistics for this sched_domain.
3586 */
3587static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3588 enum cpu_idle_type idle, int *sd_idle,
3589 const struct cpumask *cpus, int *balance,
3590 struct sd_lb_stats *sds)
3591{
3592 struct sched_group *group = sd->groups;
3593 struct sg_lb_stats sgs;
3594 int load_idx;
3595
3596 init_sd_power_savings_stats(sd, sds, idle);
3597 load_idx = get_sd_load_idx(sd, idle);
3598
3599 do {
3600 int local_group;
3601
3602 local_group = cpumask_test_cpu(this_cpu,
3603 sched_group_cpus(group));
3604 memset(&sgs, 0, sizeof(sgs));
3605 update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
3606 local_group, cpus, balance, &sgs);
3607
3608 if (local_group && balance && !(*balance))
3609 return;
3610
3611 sds->total_load += sgs.group_load;
3612 sds->total_pwr += group->__cpu_power;
3613
3614 if (local_group) {
3615 sds->this_load = sgs.avg_load;
3616 sds->this = group;
3617 sds->this_nr_running = sgs.sum_nr_running;
3618 sds->this_load_per_task = sgs.sum_weighted_load;
3619 } else if (sgs.avg_load > sds->max_load &&
3620 (sgs.sum_nr_running > sgs.group_capacity ||
3621 sgs.group_imb)) {
3622 sds->max_load = sgs.avg_load;
3623 sds->busiest = group;
3624 sds->busiest_nr_running = sgs.sum_nr_running;
3625 sds->busiest_load_per_task = sgs.sum_weighted_load;
3626 sds->group_imb = sgs.group_imb;
3341 } 3627 }
3342group_next: 3628
3343#endif 3629 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3344 group = group->next; 3630 group = group->next;
3345 } while (group != sd->groups); 3631 } while (group != sd->groups);
3346 3632
3347 if (!busiest || this_load >= max_load || busiest_nr_running == 0) 3633}
3348 goto out_balanced;
3349
3350 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
3351 3634
3352 if (this_load >= avg_load || 3635/**
3353 100*max_load <= sd->imbalance_pct*this_load) 3636 * fix_small_imbalance - Calculate the minor imbalance that exists
3354 goto out_balanced; 3637 * amongst the groups of a sched_domain, during
3638 * load balancing.
3639 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3640 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3641 * @imbalance: Variable to store the imbalance.
3642 */
3643static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3644 int this_cpu, unsigned long *imbalance)
3645{
3646 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3647 unsigned int imbn = 2;
3648
3649 if (sds->this_nr_running) {
3650 sds->this_load_per_task /= sds->this_nr_running;
3651 if (sds->busiest_load_per_task >
3652 sds->this_load_per_task)
3653 imbn = 1;
3654 } else
3655 sds->this_load_per_task =
3656 cpu_avg_load_per_task(this_cpu);
3355 3657
3356 busiest_load_per_task /= busiest_nr_running; 3658 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
3357 if (group_imb) 3659 sds->busiest_load_per_task * imbn) {
3358 busiest_load_per_task = min(busiest_load_per_task, avg_load); 3660 *imbalance = sds->busiest_load_per_task;
3661 return;
3662 }
3359 3663
3360 /* 3664 /*
3361 * We're trying to get all the cpus to the average_load, so we don't 3665 * OK, we don't have enough imbalance to justify moving tasks,
3362 * want to push ourselves above the average load, nor do we wish to 3666 * however we may be able to increase total CPU power used by
3363 * reduce the max loaded cpu below the average load, as either of these 3667 * moving them.
3364 * actions would just result in more rebalancing later, and ping-pong
3365 * tasks around. Thus we look for the minimum possible imbalance.
3366 * Negative imbalances (*we* are more loaded than anyone else) will
3367 * be counted as no imbalance for these purposes -- we can't fix that
3368 * by pulling tasks to us. Be careful of negative numbers as they'll
3369 * appear as very large values with unsigned longs.
3370 */ 3668 */
3371 if (max_load <= busiest_load_per_task)
3372 goto out_balanced;
3373 3669
3670 pwr_now += sds->busiest->__cpu_power *
3671 min(sds->busiest_load_per_task, sds->max_load);
3672 pwr_now += sds->this->__cpu_power *
3673 min(sds->this_load_per_task, sds->this_load);
3674 pwr_now /= SCHED_LOAD_SCALE;
3675
3676 /* Amount of load we'd subtract */
3677 tmp = sg_div_cpu_power(sds->busiest,
3678 sds->busiest_load_per_task * SCHED_LOAD_SCALE);
3679 if (sds->max_load > tmp)
3680 pwr_move += sds->busiest->__cpu_power *
3681 min(sds->busiest_load_per_task, sds->max_load - tmp);
3682
3683 /* Amount of load we'd add */
3684 if (sds->max_load * sds->busiest->__cpu_power <
3685 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3686 tmp = sg_div_cpu_power(sds->this,
3687 sds->max_load * sds->busiest->__cpu_power);
3688 else
3689 tmp = sg_div_cpu_power(sds->this,
3690 sds->busiest_load_per_task * SCHED_LOAD_SCALE);
3691 pwr_move += sds->this->__cpu_power *
3692 min(sds->this_load_per_task, sds->this_load + tmp);
3693 pwr_move /= SCHED_LOAD_SCALE;
3694
3695 /* Move if we gain throughput */
3696 if (pwr_move > pwr_now)
3697 *imbalance = sds->busiest_load_per_task;
3698}
3699
3700/**
3701 * calculate_imbalance - Calculate the amount of imbalance present within the
3702 * groups of a given sched_domain during load balance.
3703 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
3704 * @this_cpu: Cpu for which currently load balance is being performed.
3705 * @imbalance: The variable to store the imbalance.
3706 */
3707static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3708 unsigned long *imbalance)
3709{
3710 unsigned long max_pull;
3374 /* 3711 /*
3375 * In the presence of smp nice balancing, certain scenarios can have 3712 * In the presence of smp nice balancing, certain scenarios can have
3376 * max load less than avg load(as we skip the groups at or below 3713 * max load less than avg load(as we skip the groups at or below
3377 * its cpu_power, while calculating max_load..) 3714 * its cpu_power, while calculating max_load..)
3378 */ 3715 */
3379 if (max_load < avg_load) { 3716 if (sds->max_load < sds->avg_load) {
3380 *imbalance = 0; 3717 *imbalance = 0;
3381 goto small_imbalance; 3718 return fix_small_imbalance(sds, this_cpu, imbalance);
3382 } 3719 }
3383 3720
3384 /* Don't want to pull so many tasks that a group would go idle */ 3721 /* Don't want to pull so many tasks that a group would go idle */
3385 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); 3722 max_pull = min(sds->max_load - sds->avg_load,
3723 sds->max_load - sds->busiest_load_per_task);
3386 3724
3387 /* How much load to actually move to equalise the imbalance */ 3725 /* How much load to actually move to equalise the imbalance */
3388 *imbalance = min(max_pull * busiest->__cpu_power, 3726 *imbalance = min(max_pull * sds->busiest->__cpu_power,
3389 (avg_load - this_load) * this->__cpu_power) 3727 (sds->avg_load - sds->this_load) * sds->this->__cpu_power)
3390 / SCHED_LOAD_SCALE; 3728 / SCHED_LOAD_SCALE;
3391 3729
3392 /* 3730 /*
@@ -3395,78 +3733,110 @@ group_next:
3395 * a think about bumping its value to force at least one task to be 3733 * a think about bumping its value to force at least one task to be
3396 * moved 3734 * moved
3397 */ 3735 */
3398 if (*imbalance < busiest_load_per_task) { 3736 if (*imbalance < sds->busiest_load_per_task)
3399 unsigned long tmp, pwr_now, pwr_move; 3737 return fix_small_imbalance(sds, this_cpu, imbalance);
3400 unsigned int imbn;
3401
3402small_imbalance:
3403 pwr_move = pwr_now = 0;
3404 imbn = 2;
3405 if (this_nr_running) {
3406 this_load_per_task /= this_nr_running;
3407 if (busiest_load_per_task > this_load_per_task)
3408 imbn = 1;
3409 } else
3410 this_load_per_task = cpu_avg_load_per_task(this_cpu);
3411 3738
3412 if (max_load - this_load + busiest_load_per_task >= 3739}
3413 busiest_load_per_task * imbn) { 3740/******* find_busiest_group() helpers end here *********************/
3414 *imbalance = busiest_load_per_task;
3415 return busiest;
3416 }
3417 3741
3418 /* 3742/**
3419 * OK, we don't have enough imbalance to justify moving tasks, 3743 * find_busiest_group - Returns the busiest group within the sched_domain
3420 * however we may be able to increase total CPU power used by 3744 * if there is an imbalance. If there isn't an imbalance, and
3421 * moving them. 3745 * the user has opted for power-savings, it returns a group whose
3422 */ 3746 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
3747 * such a group exists.
3748 *
3749 * Also calculates the amount of weighted load which should be moved
3750 * to restore balance.
3751 *
3752 * @sd: The sched_domain whose busiest group is to be returned.
3753 * @this_cpu: The cpu for which load balancing is currently being performed.
3754 * @imbalance: Variable which stores amount of weighted load which should
3755 * be moved to restore balance/put a group to idle.
3756 * @idle: The idle status of this_cpu.
3757 * @sd_idle: The idleness of sd
3758 * @cpus: The set of CPUs under consideration for load-balancing.
3759 * @balance: Pointer to a variable indicating if this_cpu
3760 * is the appropriate cpu to perform load balancing at this_level.
3761 *
3762 * Returns: - the busiest group if imbalance exists.
3763 * - If no imbalance and user has opted for power-savings balance,
3764 * return the least loaded group whose CPUs can be
3765 * put to idle by rebalancing its tasks onto our group.
3766 */
3767static struct sched_group *
3768find_busiest_group(struct sched_domain *sd, int this_cpu,
3769 unsigned long *imbalance, enum cpu_idle_type idle,
3770 int *sd_idle, const struct cpumask *cpus, int *balance)
3771{
3772 struct sd_lb_stats sds;
3423 3773
3424 pwr_now += busiest->__cpu_power * 3774 memset(&sds, 0, sizeof(sds));
3425 min(busiest_load_per_task, max_load);
3426 pwr_now += this->__cpu_power *
3427 min(this_load_per_task, this_load);
3428 pwr_now /= SCHED_LOAD_SCALE;
3429
3430 /* Amount of load we'd subtract */
3431 tmp = sg_div_cpu_power(busiest,
3432 busiest_load_per_task * SCHED_LOAD_SCALE);
3433 if (max_load > tmp)
3434 pwr_move += busiest->__cpu_power *
3435 min(busiest_load_per_task, max_load - tmp);
3436
3437 /* Amount of load we'd add */
3438 if (max_load * busiest->__cpu_power <
3439 busiest_load_per_task * SCHED_LOAD_SCALE)
3440 tmp = sg_div_cpu_power(this,
3441 max_load * busiest->__cpu_power);
3442 else
3443 tmp = sg_div_cpu_power(this,
3444 busiest_load_per_task * SCHED_LOAD_SCALE);
3445 pwr_move += this->__cpu_power *
3446 min(this_load_per_task, this_load + tmp);
3447 pwr_move /= SCHED_LOAD_SCALE;
3448 3775
3449 /* Move if we gain throughput */ 3776 /*
 3450 if (pwr_move > pwr_now) 3777 * Compute the various statistics relevant for load balancing at
3451 *imbalance = busiest_load_per_task; 3778 * this level.
3452 } 3779 */
3780 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
3781 balance, &sds);
3782
3783 /* Cases where imbalance does not exist from POV of this_cpu */
3784 /* 1) this_cpu is not the appropriate cpu to perform load balancing
3785 * at this level.
3786 * 2) There is no busy sibling group to pull from.
3787 * 3) This group is the busiest group.
 3788 * 4) This group is more busy than the avg busyness at this
3789 * sched_domain.
3790 * 5) The imbalance is within the specified limit.
3791 * 6) Any rebalance would lead to ping-pong
3792 */
3793 if (balance && !(*balance))
3794 goto ret;
3453 3795
3454 return busiest; 3796 if (!sds.busiest || sds.busiest_nr_running == 0)
3797 goto out_balanced;
3455 3798
3456out_balanced: 3799 if (sds.this_load >= sds.max_load)
3457#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 3800 goto out_balanced;
3458 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3459 goto ret;
3460 3801
3461 if (this == group_leader && group_leader != group_min) { 3802 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
3462 *imbalance = min_load_per_task; 3803
3463 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { 3804 if (sds.this_load >= sds.avg_load)
3464 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = 3805 goto out_balanced;
3465 cpumask_first(sched_group_cpus(group_leader)); 3806
3466 } 3807 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
3467 return group_min; 3808 goto out_balanced;
3468 } 3809
3469#endif 3810 sds.busiest_load_per_task /= sds.busiest_nr_running;
3811 if (sds.group_imb)
3812 sds.busiest_load_per_task =
3813 min(sds.busiest_load_per_task, sds.avg_load);
3814
3815 /*
3816 * We're trying to get all the cpus to the average_load, so we don't
3817 * want to push ourselves above the average load, nor do we wish to
3818 * reduce the max loaded cpu below the average load, as either of these
3819 * actions would just result in more rebalancing later, and ping-pong
3820 * tasks around. Thus we look for the minimum possible imbalance.
3821 * Negative imbalances (*we* are more loaded than anyone else) will
3822 * be counted as no imbalance for these purposes -- we can't fix that
3823 * by pulling tasks to us. Be careful of negative numbers as they'll
3824 * appear as very large values with unsigned longs.
3825 */
3826 if (sds.max_load <= sds.busiest_load_per_task)
3827 goto out_balanced;
3828
3829 /* Looks like there is an imbalance. Compute it */
3830 calculate_imbalance(&sds, this_cpu, imbalance);
3831 return sds.busiest;
3832
3833out_balanced:
3834 /*
3835 * There is no obvious imbalance. But check if we can do some balancing
3836 * to save power.
3837 */
3838 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
3839 return sds.busiest;
3470ret: 3840ret:
3471 *imbalance = 0; 3841 *imbalance = 0;
3472 return NULL; 3842 return NULL;
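
The refactor above keeps the imbalance arithmetic itself intact; it only moves it into calculate_imbalance() and fix_small_imbalance() operating on an sd_lb_stats. A worked numeric example can make the two min() clamps easier to read. The numbers below (SCHED_LOAD_SCALE of 1024, two groups of equal power) are picked purely for illustration:

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL         /* illustrative value */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        /* Two groups of equal power; all loads are already power-normalized. */
        unsigned long this_power = 1024, busiest_power = 1024;
        unsigned long this_load = 512;                  /* local group */
        unsigned long max_load = 2048;                  /* busiest group */
        unsigned long avg_load = 1280;                  /* (512 + 2048) * 1024 / 2048 */
        unsigned long busiest_load_per_task = 1024;     /* roughly one nice-0 task */
        unsigned long max_pull, imbalance;

        /*
         * Don't pull the busiest group below the domain average, and don't
         * pull so much that it would be left without a task's worth of load.
         */
        max_pull = min_ul(max_load - avg_load, max_load - busiest_load_per_task);

        /* ... and don't push the local group above the average either. */
        imbalance = min_ul(max_pull * busiest_power,
                           (avg_load - this_load) * this_power) / SCHED_LOAD_SCALE;

        printf("max_pull = %lu, imbalance = %lu\n", max_pull, imbalance);
        return 0;
}

With these inputs max_pull and the resulting imbalance both come out to 768, which is still below busiest_load_per_task, so calculate_imbalance() would hand the decision over to fix_small_imbalance() to judge whether moving a single task is worth it.
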
@@ -3510,19 +3880,23 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3510 */ 3880 */
3511#define MAX_PINNED_INTERVAL 512 3881#define MAX_PINNED_INTERVAL 512
3512 3882
3883/* Working cpumask for load_balance and load_balance_newidle. */
3884static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
3885
3513/* 3886/*
3514 * Check this_cpu to ensure it is balanced within domain. Attempt to move 3887 * Check this_cpu to ensure it is balanced within domain. Attempt to move
3515 * tasks if there is an imbalance. 3888 * tasks if there is an imbalance.
3516 */ 3889 */
3517static int load_balance(int this_cpu, struct rq *this_rq, 3890static int load_balance(int this_cpu, struct rq *this_rq,
3518 struct sched_domain *sd, enum cpu_idle_type idle, 3891 struct sched_domain *sd, enum cpu_idle_type idle,
3519 int *balance, struct cpumask *cpus) 3892 int *balance)
3520{ 3893{
3521 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; 3894 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
3522 struct sched_group *group; 3895 struct sched_group *group;
3523 unsigned long imbalance; 3896 unsigned long imbalance;
3524 struct rq *busiest; 3897 struct rq *busiest;
3525 unsigned long flags; 3898 unsigned long flags;
3899 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
3526 3900
3527 cpumask_setall(cpus); 3901 cpumask_setall(cpus);
3528 3902
@@ -3677,8 +4051,7 @@ out:
3677 * this_rq is locked. 4051 * this_rq is locked.
3678 */ 4052 */
3679static int 4053static int
3680load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd, 4054load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
3681 struct cpumask *cpus)
3682{ 4055{
3683 struct sched_group *group; 4056 struct sched_group *group;
3684 struct rq *busiest = NULL; 4057 struct rq *busiest = NULL;
@@ -3686,6 +4059,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3686 int ld_moved = 0; 4059 int ld_moved = 0;
3687 int sd_idle = 0; 4060 int sd_idle = 0;
3688 int all_pinned = 0; 4061 int all_pinned = 0;
4062 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
3689 4063
3690 cpumask_setall(cpus); 4064 cpumask_setall(cpus);
3691 4065
@@ -3826,10 +4200,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3826 struct sched_domain *sd; 4200 struct sched_domain *sd;
3827 int pulled_task = 0; 4201 int pulled_task = 0;
3828 unsigned long next_balance = jiffies + HZ; 4202 unsigned long next_balance = jiffies + HZ;
3829 cpumask_var_t tmpmask;
3830
3831 if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
3832 return;
3833 4203
3834 for_each_domain(this_cpu, sd) { 4204 for_each_domain(this_cpu, sd) {
3835 unsigned long interval; 4205 unsigned long interval;
@@ -3840,7 +4210,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3840 if (sd->flags & SD_BALANCE_NEWIDLE) 4210 if (sd->flags & SD_BALANCE_NEWIDLE)
3841 /* If we've pulled tasks over stop searching: */ 4211 /* If we've pulled tasks over stop searching: */
3842 pulled_task = load_balance_newidle(this_cpu, this_rq, 4212 pulled_task = load_balance_newidle(this_cpu, this_rq,
3843 sd, tmpmask); 4213 sd);
3844 4214
3845 interval = msecs_to_jiffies(sd->balance_interval); 4215 interval = msecs_to_jiffies(sd->balance_interval);
3846 if (time_after(next_balance, sd->last_balance + interval)) 4216 if (time_after(next_balance, sd->last_balance + interval))
@@ -3855,7 +4225,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
3855 */ 4225 */
3856 this_rq->next_balance = next_balance; 4226 this_rq->next_balance = next_balance;
3857 } 4227 }
3858 free_cpumask_var(tmpmask);
3859} 4228}
3860 4229
3861/* 4230/*
@@ -4005,11 +4374,6 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4005 unsigned long next_balance = jiffies + 60*HZ; 4374 unsigned long next_balance = jiffies + 60*HZ;
4006 int update_next_balance = 0; 4375 int update_next_balance = 0;
4007 int need_serialize; 4376 int need_serialize;
4008 cpumask_var_t tmp;
4009
4010 /* Fails alloc? Rebalancing probably not a priority right now. */
4011 if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
4012 return;
4013 4377
4014 for_each_domain(cpu, sd) { 4378 for_each_domain(cpu, sd) {
4015 if (!(sd->flags & SD_LOAD_BALANCE)) 4379 if (!(sd->flags & SD_LOAD_BALANCE))
@@ -4034,7 +4398,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4034 } 4398 }
4035 4399
4036 if (time_after_eq(jiffies, sd->last_balance + interval)) { 4400 if (time_after_eq(jiffies, sd->last_balance + interval)) {
4037 if (load_balance(cpu, rq, sd, idle, &balance, tmp)) { 4401 if (load_balance(cpu, rq, sd, idle, &balance)) {
4038 /* 4402 /*
4039 * We've pulled tasks over so either we're no 4403 * We've pulled tasks over so either we're no
4040 * longer idle, or one of our SMT siblings is 4404 * longer idle, or one of our SMT siblings is
@@ -4068,8 +4432,6 @@ out:
4068 */ 4432 */
4069 if (likely(update_next_balance)) 4433 if (likely(update_next_balance))
4070 rq->next_balance = next_balance; 4434 rq->next_balance = next_balance;
4071
4072 free_cpumask_var(tmp);
4073} 4435}
4074 4436
4075/* 4437/*
@@ -4119,6 +4481,11 @@ static void run_rebalance_domains(struct softirq_action *h)
4119#endif 4481#endif
4120} 4482}
4121 4483
4484static inline int on_null_domain(int cpu)
4485{
4486 return !rcu_dereference(cpu_rq(cpu)->sd);
4487}
4488
4122/* 4489/*
4123 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 4490 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4124 * 4491 *
@@ -4176,7 +4543,9 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4176 cpumask_test_cpu(cpu, nohz.cpu_mask)) 4543 cpumask_test_cpu(cpu, nohz.cpu_mask))
4177 return; 4544 return;
4178#endif 4545#endif
4179 if (time_after_eq(jiffies, rq->next_balance)) 4546 /* Don't need to rebalance while attached to NULL domain */
4547 if (time_after_eq(jiffies, rq->next_balance) &&
4548 likely(!on_null_domain(cpu)))
4180 raise_softirq(SCHED_SOFTIRQ); 4549 raise_softirq(SCHED_SOFTIRQ);
4181} 4550}
4182 4551
@@ -4490,10 +4859,7 @@ void scheduler_tick(void)
4490#endif 4859#endif
4491} 4860}
4492 4861
4493#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 4862unsigned long get_parent_ip(unsigned long addr)
4494 defined(CONFIG_PREEMPT_TRACER))
4495
4496static inline unsigned long get_parent_ip(unsigned long addr)
4497{ 4863{
4498 if (in_lock_functions(addr)) { 4864 if (in_lock_functions(addr)) {
4499 addr = CALLER_ADDR2; 4865 addr = CALLER_ADDR2;
@@ -4503,6 +4869,9 @@ static inline unsigned long get_parent_ip(unsigned long addr)
4503 return addr; 4869 return addr;
4504} 4870}
4505 4871
4872#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4873 defined(CONFIG_PREEMPT_TRACER))
4874
4506void __kprobes add_preempt_count(int val) 4875void __kprobes add_preempt_count(int val)
4507{ 4876{
4508#ifdef CONFIG_DEBUG_PREEMPT 4877#ifdef CONFIG_DEBUG_PREEMPT
@@ -4594,11 +4963,33 @@ static inline void schedule_debug(struct task_struct *prev)
4594#endif 4963#endif
4595} 4964}
4596 4965
4966static void put_prev_task(struct rq *rq, struct task_struct *prev)
4967{
4968 if (prev->state == TASK_RUNNING) {
4969 u64 runtime = prev->se.sum_exec_runtime;
4970
4971 runtime -= prev->se.prev_sum_exec_runtime;
4972 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
4973
4974 /*
4975 * In order to avoid avg_overlap growing stale when we are
4976 * indeed overlapping and hence not getting put to sleep, grow
4977 * the avg_overlap on preemption.
4978 *
4979 * We use the average preemption runtime because that
4980 * correlates to the amount of cache footprint a task can
4981 * build up.
4982 */
4983 update_avg(&prev->se.avg_overlap, runtime);
4984 }
4985 prev->sched_class->put_prev_task(rq, prev);
4986}
4987
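update_avg(), used above to fold the preemption runtime into avg_overlap, is a plain exponentially weighted moving average defined elsewhere in this file; roughly:

    static void update_avg(u64 *avg, u64 sample)
    {
            s64 diff = sample - *avg;   /* signed distance from the running average */

            *avg += diff >> 3;          /* move 1/8 of the way toward the new sample */
    }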
4597/* 4988/*
4598 * Pick up the highest-prio task: 4989 * Pick up the highest-prio task:
4599 */ 4990 */
4600static inline struct task_struct * 4991static inline struct task_struct *
4601pick_next_task(struct rq *rq, struct task_struct *prev) 4992pick_next_task(struct rq *rq)
4602{ 4993{
4603 const struct sched_class *class; 4994 const struct sched_class *class;
4604 struct task_struct *p; 4995 struct task_struct *p;
@@ -4629,15 +5020,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
4629/* 5020/*
4630 * schedule() is the main scheduler function. 5021 * schedule() is the main scheduler function.
4631 */ 5022 */
4632asmlinkage void __sched schedule(void) 5023asmlinkage void __sched __schedule(void)
4633{ 5024{
4634 struct task_struct *prev, *next; 5025 struct task_struct *prev, *next;
4635 unsigned long *switch_count; 5026 unsigned long *switch_count;
4636 struct rq *rq; 5027 struct rq *rq;
4637 int cpu; 5028 int cpu;
4638 5029
4639need_resched:
4640 preempt_disable();
4641 cpu = smp_processor_id(); 5030 cpu = smp_processor_id();
4642 rq = cpu_rq(cpu); 5031 rq = cpu_rq(cpu);
4643 rcu_qsctr_inc(cpu); 5032 rcu_qsctr_inc(cpu);
@@ -4672,8 +5061,8 @@ need_resched_nonpreemptible:
4672 if (unlikely(!rq->nr_running)) 5061 if (unlikely(!rq->nr_running))
4673 idle_balance(cpu, rq); 5062 idle_balance(cpu, rq);
4674 5063
4675 prev->sched_class->put_prev_task(rq, prev); 5064 put_prev_task(rq, prev);
4676 next = pick_next_task(rq, prev); 5065 next = pick_next_task(rq);
4677 5066
4678 if (likely(prev != next)) { 5067 if (likely(prev != next)) {
4679 sched_info_switch(prev, next); 5068 sched_info_switch(prev, next);
@@ -4695,13 +5084,80 @@ need_resched_nonpreemptible:
4695 5084
4696 if (unlikely(reacquire_kernel_lock(current) < 0)) 5085 if (unlikely(reacquire_kernel_lock(current) < 0))
4697 goto need_resched_nonpreemptible; 5086 goto need_resched_nonpreemptible;
5087}
4698 5088
5089asmlinkage void __sched schedule(void)
5090{
5091need_resched:
5092 preempt_disable();
5093 __schedule();
4699 preempt_enable_no_resched(); 5094 preempt_enable_no_resched();
4700 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 5095 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
4701 goto need_resched; 5096 goto need_resched;
4702} 5097}
4703EXPORT_SYMBOL(schedule); 5098EXPORT_SYMBOL(schedule);
4704 5099
5100#ifdef CONFIG_SMP
5101/*
5102 * Look out! "owner" is an entirely speculative pointer
5103 * access and not reliable.
5104 */
5105int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
5106{
5107 unsigned int cpu;
5108 struct rq *rq;
5109
5110 if (!sched_feat(OWNER_SPIN))
5111 return 0;
5112
5113#ifdef CONFIG_DEBUG_PAGEALLOC
5114 /*
5115 * Need to access the cpu field knowing that
5116 * DEBUG_PAGEALLOC could have unmapped it if
5117 * the mutex owner just released it and exited.
5118 */
5119 if (probe_kernel_address(&owner->cpu, cpu))
5120 goto out;
5121#else
5122 cpu = owner->cpu;
5123#endif
5124
5125 /*
5126 * Even if the access succeeded (likely case),
5127 * the cpu field may no longer be valid.
5128 */
5129 if (cpu >= nr_cpumask_bits)
5130 goto out;
5131
5132 /*
5133 * We need to validate that we can do a
5134 * get_cpu() and that we have the percpu area.
5135 */
5136 if (!cpu_online(cpu))
5137 goto out;
5138
5139 rq = cpu_rq(cpu);
5140
5141 for (;;) {
5142 /*
5143 * Owner changed, break to re-assess state.
5144 */
5145 if (lock->owner != owner)
5146 break;
5147
5148 /*
5149 * Is that owner really running on that cpu?
5150 */
5151 if (task_thread_info(rq->curr) != owner || need_resched())
5152 return 0;
5153
5154 cpu_relax();
5155 }
5156out:
5157 return 1;
5158}
5159#endif
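mutex_spin_on_owner() returns 1 while spinning is still worthwhile (the owner changed, or the speculative access could not be trusted) and 0 once the lock owner is off the CPU or the spinner itself needs to reschedule. A simplified sketch of how a mutex slowpath can use it for adaptive spinning, loosely modeled on kernel/mutex.c of the same release (lockdep and waiter bookkeeping omitted):

    for (;;) {
            struct thread_info *owner;

            /* Speculatively read the owner and spin while it stays on a CPU. */
            owner = ACCESS_ONCE(lock->owner);
            if (owner && !mutex_spin_on_owner(lock, owner))
                    break;          /* owner preempted or need_resched(): go sleep */

            /* Owner appears gone: try to take the lock without sleeping. */
            if (atomic_cmpxchg(&lock->count, 1, 0) == 1)
                    return 0;       /* acquired without ever blocking */

            cpu_relax();            /* be gentle to an SMT sibling while busy-waiting */
    }
    /* fall through to the ordinary sleeping slowpath */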
5160
4705#ifdef CONFIG_PREEMPT 5161#ifdef CONFIG_PREEMPT
4706/* 5162/*
4707 * this is the entry point to schedule() from in-kernel preemption 5163 * this is the entry point to schedule() from in-kernel preemption
@@ -4729,7 +5185,7 @@ asmlinkage void __sched preempt_schedule(void)
4729 * between schedule and now. 5185 * between schedule and now.
4730 */ 5186 */
4731 barrier(); 5187 barrier();
4732 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); 5188 } while (need_resched());
4733} 5189}
4734EXPORT_SYMBOL(preempt_schedule); 5190EXPORT_SYMBOL(preempt_schedule);
4735 5191
@@ -4758,7 +5214,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
4758 * between schedule and now. 5214 * between schedule and now.
4759 */ 5215 */
4760 barrier(); 5216 barrier();
4761 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); 5217 } while (need_resched());
4762} 5218}
4763 5219
4764#endif /* CONFIG_PREEMPT */ 5220#endif /* CONFIG_PREEMPT */
@@ -4819,11 +5275,17 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
4819 __wake_up_common(q, mode, 1, 0, NULL); 5275 __wake_up_common(q, mode, 1, 0, NULL);
4820} 5276}
4821 5277
5278void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
5279{
5280 __wake_up_common(q, mode, 1, 0, key);
5281}
5282
4822/** 5283/**
4823 * __wake_up_sync - wake up threads blocked on a waitqueue. 5284 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
4824 * @q: the waitqueue 5285 * @q: the waitqueue
4825 * @mode: which threads 5286 * @mode: which threads
4826 * @nr_exclusive: how many wake-one or wake-many threads to wake up 5287 * @nr_exclusive: how many wake-one or wake-many threads to wake up
5288 * @key: opaque value to be passed to wakeup targets
4827 * 5289 *
4828 * The sync wakeup differs in that the waker knows that it will schedule 5290 * The sync wakeup differs in that the waker knows that it will schedule

4829 * away soon, so while the target thread will be woken up, it will not 5291 * away soon, so while the target thread will be woken up, it will not
@@ -4832,8 +5294,8 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
4832 * 5294 *
4833 * On UP it can prevent extra preemption. 5295 * On UP it can prevent extra preemption.
4834 */ 5296 */
4835void 5297void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
4836__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) 5298 int nr_exclusive, void *key)
4837{ 5299{
4838 unsigned long flags; 5300 unsigned long flags;
4839 int sync = 1; 5301 int sync = 1;
@@ -4845,9 +5307,18 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4845 sync = 0; 5307 sync = 0;
4846 5308
4847 spin_lock_irqsave(&q->lock, flags); 5309 spin_lock_irqsave(&q->lock, flags);
4848 __wake_up_common(q, mode, nr_exclusive, sync, NULL); 5310 __wake_up_common(q, mode, nr_exclusive, sync, key);
4849 spin_unlock_irqrestore(&q->lock, flags); 5311 spin_unlock_irqrestore(&q->lock, flags);
4850} 5312}
5313EXPORT_SYMBOL_GPL(__wake_up_sync_key);
5314
5315/*
5316 * __wake_up_sync - see __wake_up_sync_key()
5317 */
5318void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
5319{
5320 __wake_up_sync_key(q, mode, nr_exclusive, NULL);
5321}
4851EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 5322EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
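The new *_key variants thread an opaque pointer through to each wait entry's wake function, so a waiter can ignore wakeups for events it is not interested in. A hypothetical illustration (struct my_waiter, my_wake_function, my_waitqueue and the event encoding are made up for this example; at the time the real consumer was the poll/epoll code passing the ready-event mask as the key):

    #include <linux/wait.h>

    struct my_waiter {
            wait_queue_t wait;          /* set up with init_waitqueue_func_entry() */
            unsigned long wanted;       /* event bits this waiter cares about */
    };

    static int my_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
    {
            struct my_waiter *w = container_of(wait, struct my_waiter, wait);
            unsigned long events = (unsigned long)key;

            if (key && !(events & w->wanted))
                    return 0;           /* not our event: stay asleep, stay queued */
            return autoremove_wake_function(wait, mode, sync, key);
    }

    /* Waker side: hand the event mask to every wake function on the queue. */
    __wake_up_sync_key(&my_waitqueue, TASK_INTERRUPTIBLE, 1,
                       (void *)(unsigned long)events);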
4852 5323
4853/** 5324/**
@@ -5232,7 +5703,7 @@ SYSCALL_DEFINE1(nice, int, increment)
5232 if (increment > 40) 5703 if (increment > 40)
5233 increment = 40; 5704 increment = 40;
5234 5705
5235 nice = PRIO_TO_NICE(current->static_prio) + increment; 5706 nice = TASK_NICE(current) + increment;
5236 if (nice < -20) 5707 if (nice < -20)
5237 nice = -20; 5708 nice = -20;
5238 if (nice > 19) 5709 if (nice > 19)
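The switch from open-coding PRIO_TO_NICE(current->static_prio) to TASK_NICE(current) is purely cosmetic; the helpers are defined earlier in the scheduler code, roughly as follows (quoted for reference, not part of this hunk):

    #define NICE_TO_PRIO(nice)      (MAX_RT_PRIO + (nice) + 20)
    #define PRIO_TO_NICE(prio)      ((prio) - MAX_RT_PRIO - 20)
    #define TASK_NICE(p)            PRIO_TO_NICE((p)->static_prio)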
@@ -6505,7 +6976,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
6505 if (!rq->nr_running) 6976 if (!rq->nr_running)
6506 break; 6977 break;
6507 update_rq_clock(rq); 6978 update_rq_clock(rq);
6508 next = pick_next_task(rq, rq->curr); 6979 next = pick_next_task(rq);
6509 if (!next) 6980 if (!next)
6510 break; 6981 break;
6511 next->sched_class->put_prev_task(rq, next); 6982 next->sched_class->put_prev_task(rq, next);
@@ -7336,7 +7807,7 @@ cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
7336{ 7807{
7337 int group; 7808 int group;
7338 7809
7339 cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); 7810 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
7340 group = cpumask_first(mask); 7811 group = cpumask_first(mask);
7341 if (sg) 7812 if (sg)
7342 *sg = &per_cpu(sched_group_core, group).sg; 7813 *sg = &per_cpu(sched_group_core, group).sg;
@@ -7365,7 +7836,7 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
7365 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); 7836 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
7366 group = cpumask_first(mask); 7837 group = cpumask_first(mask);
7367#elif defined(CONFIG_SCHED_SMT) 7838#elif defined(CONFIG_SCHED_SMT)
7368 cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map); 7839 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
7369 group = cpumask_first(mask); 7840 group = cpumask_first(mask);
7370#else 7841#else
7371 group = cpu; 7842 group = cpu;
@@ -7708,7 +8179,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
7708 SD_INIT(sd, SIBLING); 8179 SD_INIT(sd, SIBLING);
7709 set_domain_attribute(sd, attr); 8180 set_domain_attribute(sd, attr);
7710 cpumask_and(sched_domain_span(sd), 8181 cpumask_and(sched_domain_span(sd),
7711 &per_cpu(cpu_sibling_map, i), cpu_map); 8182 topology_thread_cpumask(i), cpu_map);
7712 sd->parent = p; 8183 sd->parent = p;
7713 p->child = sd; 8184 p->child = sd;
7714 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask); 8185 cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7719,7 +8190,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
7719 /* Set up CPU (sibling) groups */ 8190 /* Set up CPU (sibling) groups */
7720 for_each_cpu(i, cpu_map) { 8191 for_each_cpu(i, cpu_map) {
7721 cpumask_and(this_sibling_map, 8192 cpumask_and(this_sibling_map,
7722 &per_cpu(cpu_sibling_map, i), cpu_map); 8193 topology_thread_cpumask(i), cpu_map);
7723 if (i != cpumask_first(this_sibling_map)) 8194 if (i != cpumask_first(this_sibling_map))
7724 continue; 8195 continue;
7725 8196
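topology_thread_cpumask() is the architecture-provided accessor for a CPU's SMT sibling mask, which keeps the generic scheduler code free of the x86-specific per-cpu variable it used to reference directly. On x86 of this era it is believed to expand to the same map (approximate, for illustration only):

    /* arch/x86/include/asm/topology.h (approximate) */
    #define topology_thread_cpumask(cpu)    (&per_cpu(cpu_sibling_map, cpu))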
@@ -8300,11 +8771,15 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8300 __set_bit(MAX_RT_PRIO, array->bitmap); 8771 __set_bit(MAX_RT_PRIO, array->bitmap);
8301 8772
8302#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 8773#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
8303 rt_rq->highest_prio = MAX_RT_PRIO; 8774 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8775#ifdef CONFIG_SMP
8776 rt_rq->highest_prio.next = MAX_RT_PRIO;
8777#endif
8304#endif 8778#endif
8305#ifdef CONFIG_SMP 8779#ifdef CONFIG_SMP
8306 rt_rq->rt_nr_migratory = 0; 8780 rt_rq->rt_nr_migratory = 0;
8307 rt_rq->overloaded = 0; 8781 rt_rq->overloaded = 0;
8782 plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
8308#endif 8783#endif
8309 8784
8310 rt_rq->rt_time = 0; 8785 rt_rq->rt_time = 0;
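The pushable_tasks plist initialized here is a priority-sorted list that the RT push/pull logic (in sched_rt.c, not part of this file) walks when looking for overloaded real-time tasks to migrate to another CPU. A sketch of how a task would be (re)queued on it, assuming the task-side plist node is also named pushable_tasks:

    static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
    {
            /* plist keeps entries sorted by prio, so remove and re-insert. */
            plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
            plist_node_init(&p->pushable_tasks, p->prio);
            plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
    }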
@@ -8391,6 +8866,9 @@ void __init sched_init(void)
8391#ifdef CONFIG_USER_SCHED 8866#ifdef CONFIG_USER_SCHED
8392 alloc_size *= 2; 8867 alloc_size *= 2;
8393#endif 8868#endif
8869#ifdef CONFIG_CPUMASK_OFFSTACK
8870 alloc_size += num_possible_cpus() * cpumask_size();
8871#endif
8394 /* 8872 /*
8395 * As sched_init() is called before page_alloc is setup, 8873 * As sched_init() is called before page_alloc is setup,
8396 * we use alloc_bootmem(). 8874 * we use alloc_bootmem().
@@ -8428,6 +8906,12 @@ void __init sched_init(void)
8428 ptr += nr_cpu_ids * sizeof(void **); 8906 ptr += nr_cpu_ids * sizeof(void **);
8429#endif /* CONFIG_USER_SCHED */ 8907#endif /* CONFIG_USER_SCHED */
8430#endif /* CONFIG_RT_GROUP_SCHED */ 8908#endif /* CONFIG_RT_GROUP_SCHED */
8909#ifdef CONFIG_CPUMASK_OFFSTACK
8910 for_each_possible_cpu(i) {
8911 per_cpu(load_balance_tmpmask, i) = (void *)ptr;
8912 ptr += cpumask_size();
8913 }
8914#endif /* CONFIG_CPUMASK_OFFSTACK */
8431 } 8915 }
8432 8916
8433#ifdef CONFIG_SMP 8917#ifdef CONFIG_SMP
@@ -9572,7 +10056,7 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9572 10056
9573static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) 10057static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
9574{ 10058{
9575 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); 10059 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9576 u64 data; 10060 u64 data;
9577 10061
9578#ifndef CONFIG_64BIT 10062#ifndef CONFIG_64BIT
@@ -9591,7 +10075,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
9591 10075
9592static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) 10076static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
9593{ 10077{
9594 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); 10078 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9595 10079
9596#ifndef CONFIG_64BIT 10080#ifndef CONFIG_64BIT
9597 /* 10081 /*
@@ -9680,14 +10164,14 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9680 struct cpuacct *ca; 10164 struct cpuacct *ca;
9681 int cpu; 10165 int cpu;
9682 10166
9683 if (!cpuacct_subsys.active) 10167 if (unlikely(!cpuacct_subsys.active))
9684 return; 10168 return;
9685 10169
9686 cpu = task_cpu(tsk); 10170 cpu = task_cpu(tsk);
9687 ca = task_ca(tsk); 10171 ca = task_ca(tsk);
9688 10172
9689 for (; ca; ca = ca->parent) { 10173 for (; ca; ca = ca->parent) {
9690 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); 10174 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9691 *cpuusage += cputime; 10175 *cpuusage += cputime;
9692 } 10176 }
9693} 10177}
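percpu_ptr() was the older spelling; per_cpu_ptr() is the accessor that survived the dynamic per-cpu allocator rework, taking a pointer obtained from alloc_percpu() plus a CPU number. A minimal usage sketch with made-up names, independent of this file:

    #include <linux/percpu.h>

    struct my_counter {
            u64 value;
    };

    static struct my_counter *counters;     /* one instance per possible CPU */

    static int __init my_counters_init(void)
    {
            int cpu;

            counters = alloc_percpu(struct my_counter);
            if (!counters)
                    return -ENOMEM;

            for_each_possible_cpu(cpu)
                    per_cpu_ptr(counters, cpu)->value = 0;
            return 0;
    }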