Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  1051
1 file changed, 731 insertions(+), 320 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 186c6fd08acf..196d48babbef 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -223,7 +223,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
223{ 223{
224 ktime_t now; 224 ktime_t now;
225 225
226 if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF) 226 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
227 return; 227 return;
228 228
229 if (hrtimer_active(&rt_b->rt_period_timer)) 229 if (hrtimer_active(&rt_b->rt_period_timer))
@@ -331,6 +331,13 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
331 */ 331 */
332static DEFINE_SPINLOCK(task_group_lock); 332static DEFINE_SPINLOCK(task_group_lock);
333 333
334#ifdef CONFIG_SMP
335static int root_task_group_empty(void)
336{
337 return list_empty(&root_task_group.children);
338}
339#endif
340
334#ifdef CONFIG_FAIR_GROUP_SCHED 341#ifdef CONFIG_FAIR_GROUP_SCHED
335#ifdef CONFIG_USER_SCHED 342#ifdef CONFIG_USER_SCHED
336# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 343# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
@@ -391,6 +398,13 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
391 398
392#else 399#else
393 400
401#ifdef CONFIG_SMP
402static int root_task_group_empty(void)
403{
404 return 1;
405}
406#endif
407
394static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 408static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
395static inline struct task_group *task_group(struct task_struct *p) 409static inline struct task_group *task_group(struct task_struct *p)
396{ 410{
@@ -467,11 +481,17 @@ struct rt_rq {
467 struct rt_prio_array active; 481 struct rt_prio_array active;
468 unsigned long rt_nr_running; 482 unsigned long rt_nr_running;
469#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 483#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
470 int highest_prio; /* highest queued rt task prio */ 484 struct {
485 int curr; /* highest queued rt task prio */
486#ifdef CONFIG_SMP
487 int next; /* next highest */
488#endif
489 } highest_prio;
471#endif 490#endif
472#ifdef CONFIG_SMP 491#ifdef CONFIG_SMP
473 unsigned long rt_nr_migratory; 492 unsigned long rt_nr_migratory;
474 int overloaded; 493 int overloaded;
494 struct plist_head pushable_tasks;
475#endif 495#endif
476 int rt_throttled; 496 int rt_throttled;
477 u64 rt_time; 497 u64 rt_time;
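Note: the hunk above splits highest_prio into a curr/next pair and adds a priority-sorted pushable_tasks list, so the RT push logic can pick the best push candidate without rescanning the runqueue. A minimal standalone sketch of keeping such a sorted list and reading off the two best priorities follows; the names and the singly linked list are illustrative only, not the kernel's plist API (lower numeric value = higher priority).

#include <stdio.h>
#include <stdlib.h>

/* Illustrative stand-in for a priority-sorted list: kept ordered by
 * ascending prio so the head is always the best candidate. */
struct node {
    int prio;
    struct node *next;
};

static struct node *head;

static void push_sorted(int prio)
{
    struct node *n = malloc(sizeof(*n));
    struct node **pp = &head;

    n->prio = prio;
    while (*pp && (*pp)->prio <= prio)   /* walk past better/equal prio */
        pp = &(*pp)->next;
    n->next = *pp;
    *pp = n;
}

int main(void)
{
    push_sorted(30);
    push_sorted(10);
    push_sorted(20);

    /* The two best candidates are the first two nodes, mirroring the
     * roles of highest_prio.curr and highest_prio.next. */
    printf("curr=%d next=%d\n", head->prio, head->next->prio);
    return 0;
}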
@@ -549,7 +569,6 @@ struct rq {
549 unsigned long nr_running; 569 unsigned long nr_running;
550 #define CPU_LOAD_IDX_MAX 5 570 #define CPU_LOAD_IDX_MAX 5
551 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 571 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
552 unsigned char idle_at_tick;
553#ifdef CONFIG_NO_HZ 572#ifdef CONFIG_NO_HZ
554 unsigned long last_tick_seen; 573 unsigned long last_tick_seen;
555 unsigned char in_nohz_recently; 574 unsigned char in_nohz_recently;
@@ -590,6 +609,7 @@ struct rq {
590 struct root_domain *rd; 609 struct root_domain *rd;
591 struct sched_domain *sd; 610 struct sched_domain *sd;
592 611
612 unsigned char idle_at_tick;
593 /* For active balancing */ 613 /* For active balancing */
594 int active_balance; 614 int active_balance;
595 int push_cpu; 615 int push_cpu;
@@ -618,9 +638,6 @@ struct rq {
618 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ 638 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
619 639
620 /* sys_sched_yield() stats */ 640 /* sys_sched_yield() stats */
621 unsigned int yld_exp_empty;
622 unsigned int yld_act_empty;
623 unsigned int yld_both_empty;
624 unsigned int yld_count; 641 unsigned int yld_count;
625 642
626 /* schedule() stats */ 643 /* schedule() stats */
@@ -1183,10 +1200,10 @@ static void resched_task(struct task_struct *p)
1183 1200
1184 assert_spin_locked(&task_rq(p)->lock); 1201 assert_spin_locked(&task_rq(p)->lock);
1185 1202
1186 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) 1203 if (test_tsk_need_resched(p))
1187 return; 1204 return;
1188 1205
1189 set_tsk_thread_flag(p, TIF_NEED_RESCHED); 1206 set_tsk_need_resched(p);
1190 1207
1191 cpu = task_cpu(p); 1208 cpu = task_cpu(p);
1192 if (cpu == smp_processor_id()) 1209 if (cpu == smp_processor_id())
@@ -1242,7 +1259,7 @@ void wake_up_idle_cpu(int cpu)
1242 * lockless. The worst case is that the other CPU runs the 1259 * lockless. The worst case is that the other CPU runs the
1243 * idle task through an additional NOOP schedule() 1260 * idle task through an additional NOOP schedule()
1244 */ 1261 */
1245 set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED); 1262 set_tsk_need_resched(rq->idle);
1246 1263
1247 /* NEED_RESCHED must be visible before we test polling */ 1264 /* NEED_RESCHED must be visible before we test polling */
1248 smp_mb(); 1265 smp_mb();
@@ -1610,21 +1627,42 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1610 1627
1611#endif 1628#endif
1612 1629
1630#ifdef CONFIG_PREEMPT
1631
1613/* 1632/*
1614 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 1633 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1634 * way at the expense of forcing extra atomic operations in all
1635 * invocations. This assures that the double_lock is acquired using the
1636 * same underlying policy as the spinlock_t on this architecture, which
1637 * reduces latency compared to the unfair variant below. However, it
1638 * also adds more overhead and therefore may reduce throughput.
1615 */ 1639 */
1616static int double_lock_balance(struct rq *this_rq, struct rq *busiest) 1640static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1641 __releases(this_rq->lock)
1642 __acquires(busiest->lock)
1643 __acquires(this_rq->lock)
1644{
1645 spin_unlock(&this_rq->lock);
1646 double_rq_lock(this_rq, busiest);
1647
1648 return 1;
1649}
1650
1651#else
1652/*
1653 * Unfair double_lock_balance: Optimizes throughput at the expense of
1654 * latency by eliminating extra atomic operations when the locks are
1655 * already in proper order on entry. This favors lower cpu-ids and will
1656 * grant the double lock to lower cpus over higher ids under contention,
1657 * regardless of entry order into the function.
1658 */
1659static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1617 __releases(this_rq->lock) 1660 __releases(this_rq->lock)
1618 __acquires(busiest->lock) 1661 __acquires(busiest->lock)
1619 __acquires(this_rq->lock) 1662 __acquires(this_rq->lock)
1620{ 1663{
1621 int ret = 0; 1664 int ret = 0;
1622 1665
1623 if (unlikely(!irqs_disabled())) {
1624 /* printk() doesn't work good under rq->lock */
1625 spin_unlock(&this_rq->lock);
1626 BUG_ON(1);
1627 }
1628 if (unlikely(!spin_trylock(&busiest->lock))) { 1666 if (unlikely(!spin_trylock(&busiest->lock))) {
1629 if (busiest < this_rq) { 1667 if (busiest < this_rq) {
1630 spin_unlock(&this_rq->lock); 1668 spin_unlock(&this_rq->lock);
@@ -1637,6 +1675,22 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1637 return ret; 1675 return ret;
1638} 1676}
1639 1677
1678#endif /* CONFIG_PREEMPT */
1679
1680/*
1681 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1682 */
1683static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1684{
1685 if (unlikely(!irqs_disabled())) {
1686 /* printk() doesn't work good under rq->lock */
1687 spin_unlock(&this_rq->lock);
1688 BUG_ON(1);
1689 }
1690
1691 return _double_lock_balance(this_rq, busiest);
1692}
1693
1640static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest) 1694static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1641 __releases(busiest->lock) 1695 __releases(busiest->lock)
1642{ 1696{
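Note: the non-PREEMPT variant above only drops this_rq->lock when the trylock fails and the other runqueue sorts first by address; taking both locks in one fixed (address) order is what rules out an AB/BA deadlock, at the cost of favoring lower-addressed runqueues. A standalone pthread sketch of that fallback, with illustrative names:

#include <pthread.h>
#include <stdio.h>

/* Acquire 'other' while already holding 'mine', using address order as
 * the tie-breaker. Returns 1 if 'mine' was dropped along the way, so the
 * caller knows to revalidate any state read under it. */
static int double_lock(pthread_mutex_t *mine, pthread_mutex_t *other)
{
    int dropped = 0;

    if (pthread_mutex_trylock(other) != 0) {
        if (other < mine) {             /* wrong order: back off */
            pthread_mutex_unlock(mine);
            pthread_mutex_lock(other);
            pthread_mutex_lock(mine);
            dropped = 1;
        } else {
            pthread_mutex_lock(other);  /* already in order: just wait */
        }
    }
    return dropped;
}

int main(void)
{
    pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
    pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;

    pthread_mutex_lock(&a);
    printf("dropped=%d\n", double_lock(&a, &b));
    pthread_mutex_unlock(&b);
    pthread_mutex_unlock(&a);
    return 0;
}

The "fair" CONFIG_PREEMPT variant always returns 1 for the same reason: it unconditionally drops this_rq->lock before retaking both.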
@@ -1705,6 +1759,9 @@ static void update_avg(u64 *avg, u64 sample)
1705 1759
1706static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1760static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1707{ 1761{
1762 if (wakeup)
1763 p->se.start_runtime = p->se.sum_exec_runtime;
1764
1708 sched_info_queued(p); 1765 sched_info_queued(p);
1709 p->sched_class->enqueue_task(rq, p, wakeup); 1766 p->sched_class->enqueue_task(rq, p, wakeup);
1710 p->se.on_rq = 1; 1767 p->se.on_rq = 1;
@@ -1712,10 +1769,15 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1712 1769
1713static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) 1770static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1714{ 1771{
1715 if (sleep && p->se.last_wakeup) { 1772 if (sleep) {
1716 update_avg(&p->se.avg_overlap, 1773 if (p->se.last_wakeup) {
1717 p->se.sum_exec_runtime - p->se.last_wakeup); 1774 update_avg(&p->se.avg_overlap,
1718 p->se.last_wakeup = 0; 1775 p->se.sum_exec_runtime - p->se.last_wakeup);
1776 p->se.last_wakeup = 0;
1777 } else {
1778 update_avg(&p->se.avg_wakeup,
1779 sysctl_sched_wakeup_granularity);
1780 }
1719 } 1781 }
1720 1782
1721 sched_info_dequeued(p); 1783 sched_info_dequeued(p);
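Note: avg_overlap and avg_wakeup are maintained with update_avg() (hunk context above), which in this kernel is, as far as I recall, a simple 1/8-weight exponential moving average; when a task goes to sleep without having woken anyone since it was enqueued, dequeue_task() instead pulls avg_wakeup toward sysctl_sched_wakeup_granularity. A standalone sketch of that averaging (the kernel version uses an arithmetic shift rather than the portable division shown here):

#include <stdio.h>
#include <stdint.h>

/* 1/8-weight exponential moving average, in the spirit of update_avg(). */
static void update_avg(uint64_t *avg, uint64_t sample)
{
    int64_t diff = (int64_t)(sample - *avg);

    *avg += diff / 8;
}

int main(void)
{
    uint64_t avg_wakeup = 1000000;        /* pretend starting value: 1ms in ns */
    uint64_t samples[] = { 200000, 250000, 300000 };

    for (unsigned i = 0; i < 3; i++) {
        update_avg(&avg_wakeup, samples[i]);
        printf("after %llu: avg=%llu\n",
               (unsigned long long)samples[i],
               (unsigned long long)avg_wakeup);
    }
    return 0;
}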
@@ -2017,7 +2079,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2017 * it must be off the runqueue _entirely_, and not 2079 * it must be off the runqueue _entirely_, and not
2018 * preempted! 2080 * preempted!
2019 * 2081 *
2020 * So if it wa still runnable (but just not actively 2082 * So if it was still runnable (but just not actively
2021 * running right now), it's preempted, and we should 2083 * running right now), it's preempted, and we should
2022 * yield - it could be a while. 2084 * yield - it could be a while.
2023 */ 2085 */
@@ -2266,18 +2328,8 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2266 if (!sched_feat(SYNC_WAKEUPS)) 2328 if (!sched_feat(SYNC_WAKEUPS))
2267 sync = 0; 2329 sync = 0;
2268 2330
2269 if (!sync) {
2270 if (current->se.avg_overlap < sysctl_sched_migration_cost &&
2271 p->se.avg_overlap < sysctl_sched_migration_cost)
2272 sync = 1;
2273 } else {
2274 if (current->se.avg_overlap >= sysctl_sched_migration_cost ||
2275 p->se.avg_overlap >= sysctl_sched_migration_cost)
2276 sync = 0;
2277 }
2278
2279#ifdef CONFIG_SMP 2331#ifdef CONFIG_SMP
2280 if (sched_feat(LB_WAKEUP_UPDATE)) { 2332 if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
2281 struct sched_domain *sd; 2333 struct sched_domain *sd;
2282 2334
2283 this_cpu = raw_smp_processor_id(); 2335 this_cpu = raw_smp_processor_id();
@@ -2355,6 +2407,22 @@ out_activate:
2355 activate_task(rq, p, 1); 2407 activate_task(rq, p, 1);
2356 success = 1; 2408 success = 1;
2357 2409
2410 /*
2411 * Only attribute actual wakeups done by this task.
2412 */
2413 if (!in_interrupt()) {
2414 struct sched_entity *se = &current->se;
2415 u64 sample = se->sum_exec_runtime;
2416
2417 if (se->last_wakeup)
2418 sample -= se->last_wakeup;
2419 else
2420 sample -= se->start_runtime;
2421 update_avg(&se->avg_wakeup, sample);
2422
2423 se->last_wakeup = se->sum_exec_runtime;
2424 }
2425
2358out_running: 2426out_running:
2359 trace_sched_wakeup(rq, p, success); 2427 trace_sched_wakeup(rq, p, success);
2360 check_preempt_curr(rq, p, sync); 2428 check_preempt_curr(rq, p, sync);
@@ -2365,8 +2433,6 @@ out_running:
2365 p->sched_class->task_wake_up(rq, p); 2433 p->sched_class->task_wake_up(rq, p);
2366#endif 2434#endif
2367out: 2435out:
2368 current->se.last_wakeup = current->se.sum_exec_runtime;
2369
2370 task_rq_unlock(rq, &flags); 2436 task_rq_unlock(rq, &flags);
2371 2437
2372 return success; 2438 return success;
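Note: the new block in try_to_wake_up() charges the waker, not the wakee: the sample is the waker's CPU time since its previous wakeup (or since it last started running, via start_runtime set in enqueue_task), so a task that wakes others frequently ends up with a small avg_wakeup. A compact sketch of that sample computation, with illustrative field names:

#include <stdio.h>
#include <stdint.h>

struct ent {
    uint64_t sum_exec_runtime;
    uint64_t last_wakeup;      /* 0 until the first wakeup is issued */
    uint64_t start_runtime;
};

/* Runtime accumulated by the waker since its previous wakeup, falling
 * back to runtime since it started running. */
static uint64_t wakeup_sample(struct ent *se)
{
    uint64_t sample = se->sum_exec_runtime;

    sample -= se->last_wakeup ? se->last_wakeup : se->start_runtime;
    se->last_wakeup = se->sum_exec_runtime;
    return sample;
}

int main(void)
{
    struct ent waker = { .sum_exec_runtime = 100000 };

    printf("first sample:  %llu\n", (unsigned long long)wakeup_sample(&waker));
    waker.sum_exec_runtime = 150000;
    printf("second sample: %llu\n", (unsigned long long)wakeup_sample(&waker));
    return 0;
}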
@@ -2396,6 +2462,8 @@ static void __sched_fork(struct task_struct *p)
2396 p->se.prev_sum_exec_runtime = 0; 2462 p->se.prev_sum_exec_runtime = 0;
2397 p->se.last_wakeup = 0; 2463 p->se.last_wakeup = 0;
2398 p->se.avg_overlap = 0; 2464 p->se.avg_overlap = 0;
2465 p->se.start_runtime = 0;
2466 p->se.avg_wakeup = sysctl_sched_wakeup_granularity;
2399 2467
2400#ifdef CONFIG_SCHEDSTATS 2468#ifdef CONFIG_SCHEDSTATS
2401 p->se.wait_start = 0; 2469 p->se.wait_start = 0;
@@ -2458,6 +2526,8 @@ void sched_fork(struct task_struct *p, int clone_flags)
2458 /* Want to start with kernel preemption disabled. */ 2526 /* Want to start with kernel preemption disabled. */
2459 task_thread_info(p)->preempt_count = 1; 2527 task_thread_info(p)->preempt_count = 1;
2460#endif 2528#endif
2529 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2530
2461 put_cpu(); 2531 put_cpu();
2462} 2532}
2463 2533
@@ -2501,7 +2571,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2501#ifdef CONFIG_PREEMPT_NOTIFIERS 2571#ifdef CONFIG_PREEMPT_NOTIFIERS
2502 2572
2503/** 2573/**
2504 * preempt_notifier_register - tell me when current is being being preempted & rescheduled 2574 * preempt_notifier_register - tell me when current is being preempted & rescheduled
2505 * @notifier: notifier struct to register 2575 * @notifier: notifier struct to register
2506 */ 2576 */
2507void preempt_notifier_register(struct preempt_notifier *notifier) 2577void preempt_notifier_register(struct preempt_notifier *notifier)
@@ -2598,6 +2668,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2598{ 2668{
2599 struct mm_struct *mm = rq->prev_mm; 2669 struct mm_struct *mm = rq->prev_mm;
2600 long prev_state; 2670 long prev_state;
2671#ifdef CONFIG_SMP
2672 int post_schedule = 0;
2673
2674 if (current->sched_class->needs_post_schedule)
2675 post_schedule = current->sched_class->needs_post_schedule(rq);
2676#endif
2601 2677
2602 rq->prev_mm = NULL; 2678 rq->prev_mm = NULL;
2603 2679
@@ -2616,7 +2692,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2616 finish_arch_switch(prev); 2692 finish_arch_switch(prev);
2617 finish_lock_switch(rq, prev); 2693 finish_lock_switch(rq, prev);
2618#ifdef CONFIG_SMP 2694#ifdef CONFIG_SMP
2619 if (current->sched_class->post_schedule) 2695 if (post_schedule)
2620 current->sched_class->post_schedule(rq); 2696 current->sched_class->post_schedule(rq);
2621#endif 2697#endif
2622 2698
@@ -2923,6 +2999,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2923 struct sched_domain *sd, enum cpu_idle_type idle, 2999 struct sched_domain *sd, enum cpu_idle_type idle,
2924 int *all_pinned) 3000 int *all_pinned)
2925{ 3001{
3002 int tsk_cache_hot = 0;
2926 /* 3003 /*
2927 * We do not migrate tasks that are: 3004 * We do not migrate tasks that are:
2928 * 1) running (obviously), or 3005 * 1) running (obviously), or
@@ -2946,10 +3023,11 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2946 * 2) too many balance attempts have failed. 3023 * 2) too many balance attempts have failed.
2947 */ 3024 */
2948 3025
2949 if (!task_hot(p, rq->clock, sd) || 3026 tsk_cache_hot = task_hot(p, rq->clock, sd);
2950 sd->nr_balance_failed > sd->cache_nice_tries) { 3027 if (!tsk_cache_hot ||
3028 sd->nr_balance_failed > sd->cache_nice_tries) {
2951#ifdef CONFIG_SCHEDSTATS 3029#ifdef CONFIG_SCHEDSTATS
2952 if (task_hot(p, rq->clock, sd)) { 3030 if (tsk_cache_hot) {
2953 schedstat_inc(sd, lb_hot_gained[idle]); 3031 schedstat_inc(sd, lb_hot_gained[idle]);
2954 schedstat_inc(p, se.nr_forced_migrations); 3032 schedstat_inc(p, se.nr_forced_migrations);
2955 } 3033 }
@@ -2957,7 +3035,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2957 return 1; 3035 return 1;
2958 } 3036 }
2959 3037
2960 if (task_hot(p, rq->clock, sd)) { 3038 if (tsk_cache_hot) {
2961 schedstat_inc(p, se.nr_failed_migrations_hot); 3039 schedstat_inc(p, se.nr_failed_migrations_hot);
2962 return 0; 3040 return 0;
2963 } 3041 }
@@ -2997,6 +3075,16 @@ next:
2997 pulled++; 3075 pulled++;
2998 rem_load_move -= p->se.load.weight; 3076 rem_load_move -= p->se.load.weight;
2999 3077
3078#ifdef CONFIG_PREEMPT
3079 /*
3080 * NEWIDLE balancing is a source of latency, so preemptible kernels
3081 * will stop after the first task is pulled to minimize the critical
3082 * section.
3083 */
3084 if (idle == CPU_NEWLY_IDLE)
3085 goto out;
3086#endif
3087
3000 /* 3088 /*
3001 * We only want to steal up to the prescribed amount of weighted load. 3089 * We only want to steal up to the prescribed amount of weighted load.
3002 */ 3090 */
@@ -3043,9 +3131,15 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3043 sd, idle, all_pinned, &this_best_prio); 3131 sd, idle, all_pinned, &this_best_prio);
3044 class = class->next; 3132 class = class->next;
3045 3133
3134#ifdef CONFIG_PREEMPT
3135 /*
3136 * NEWIDLE balancing is a source of latency, so preemptible
3137 * kernels will stop after the first task is pulled to minimize
3138 * the critical section.
3139 */
3046 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) 3140 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3047 break; 3141 break;
3048 3142#endif
3049 } while (class && max_load_move > total_load_moved); 3143 } while (class && max_load_move > total_load_moved);
3050 3144
3051 return total_load_moved > 0; 3145 return total_load_moved > 0;
@@ -3095,246 +3189,480 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3095 3189
3096 return 0; 3190 return 0;
3097} 3191}
3098 3192/********** Helpers for find_busiest_group ************************/
3099/* 3193/*
3100 * find_busiest_group finds and returns the busiest CPU group within the 3194 * sd_lb_stats - Structure to store the statistics of a sched_domain
3101 * domain. It calculates and returns the amount of weighted load which 3195 * during load balancing.
3102 * should be moved to restore balance via the imbalance parameter.
3103 */ 3196 */
3104static struct sched_group * 3197struct sd_lb_stats {
3105find_busiest_group(struct sched_domain *sd, int this_cpu, 3198 struct sched_group *busiest; /* Busiest group in this sd */
3106 unsigned long *imbalance, enum cpu_idle_type idle, 3199 struct sched_group *this; /* Local group in this sd */
3107 int *sd_idle, const struct cpumask *cpus, int *balance) 3200 unsigned long total_load; /* Total load of all groups in sd */
3108{ 3201 unsigned long total_pwr; /* Total power of all groups in sd */
3109 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 3202 unsigned long avg_load; /* Average load across all groups in sd */
3110 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 3203
3111 unsigned long max_pull; 3204 /** Statistics of this group */
3112 unsigned long busiest_load_per_task, busiest_nr_running; 3205 unsigned long this_load;
3113 unsigned long this_load_per_task, this_nr_running; 3206 unsigned long this_load_per_task;
3114 int load_idx, group_imb = 0; 3207 unsigned long this_nr_running;
3208
3209 /* Statistics of the busiest group */
3210 unsigned long max_load;
3211 unsigned long busiest_load_per_task;
3212 unsigned long busiest_nr_running;
3213
3214 int group_imb; /* Is there imbalance in this sd */
3115#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 3215#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3116 int power_savings_balance = 1; 3216 int power_savings_balance; /* Is powersave balance needed for this sd */
3117 unsigned long leader_nr_running = 0, min_load_per_task = 0; 3217 struct sched_group *group_min; /* Least loaded group in sd */
3118 unsigned long min_nr_running = ULONG_MAX; 3218 struct sched_group *group_leader; /* Group which relieves group_min */
3119 struct sched_group *group_min = NULL, *group_leader = NULL; 3219 unsigned long min_load_per_task; /* load_per_task in group_min */
3220 unsigned long leader_nr_running; /* Nr running of group_leader */
3221 unsigned long min_nr_running; /* Nr running of group_min */
3120#endif 3222#endif
3223};
3224
3225/*
3226 * sg_lb_stats - stats of a sched_group required for load_balancing
3227 */
3228struct sg_lb_stats {
3229 unsigned long avg_load; /*Avg load across the CPUs of the group */
3230 unsigned long group_load; /* Total load over the CPUs of the group */
3231 unsigned long sum_nr_running; /* Nr tasks running in the group */
3232 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
3233 unsigned long group_capacity;
3234 int group_imb; /* Is there an imbalance in the group ? */
3235};
3121 3236
3122 max_load = this_load = total_load = total_pwr = 0; 3237/**
3123 busiest_load_per_task = busiest_nr_running = 0; 3238 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3124 this_load_per_task = this_nr_running = 0; 3239 * @group: The group whose first cpu is to be returned.
3240 */
3241static inline unsigned int group_first_cpu(struct sched_group *group)
3242{
3243 return cpumask_first(sched_group_cpus(group));
3244}
3125 3245
3126 if (idle == CPU_NOT_IDLE) 3246/**
3247 * get_sd_load_idx - Obtain the load index for a given sched domain.
3248 * @sd: The sched_domain whose load_idx is to be obtained.
3249 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
3250 */
3251static inline int get_sd_load_idx(struct sched_domain *sd,
3252 enum cpu_idle_type idle)
3253{
3254 int load_idx;
3255
3256 switch (idle) {
3257 case CPU_NOT_IDLE:
3127 load_idx = sd->busy_idx; 3258 load_idx = sd->busy_idx;
3128 else if (idle == CPU_NEWLY_IDLE) 3259 break;
3260
3261 case CPU_NEWLY_IDLE:
3129 load_idx = sd->newidle_idx; 3262 load_idx = sd->newidle_idx;
3130 else 3263 break;
3264 default:
3131 load_idx = sd->idle_idx; 3265 load_idx = sd->idle_idx;
3266 break;
3267 }
3132 3268
3133 do { 3269 return load_idx;
3134 unsigned long load, group_capacity, max_cpu_load, min_cpu_load; 3270}
3135 int local_group;
3136 int i;
3137 int __group_imb = 0;
3138 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3139 unsigned long sum_nr_running, sum_weighted_load;
3140 unsigned long sum_avg_load_per_task;
3141 unsigned long avg_load_per_task;
3142 3271
3143 local_group = cpumask_test_cpu(this_cpu,
3144 sched_group_cpus(group));
3145 3272
3146 if (local_group) 3273#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3147 balance_cpu = cpumask_first(sched_group_cpus(group)); 3274/**
3275 * init_sd_power_savings_stats - Initialize power savings statistics for
3276 * the given sched_domain, during load balancing.
3277 *
3278 * @sd: Sched domain whose power-savings statistics are to be initialized.
3279 * @sds: Variable containing the statistics for sd.
3280 * @idle: Idle status of the CPU at which we're performing load-balancing.
3281 */
3282static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3283 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3284{
3285 /*
3286 * Busy processors will not participate in power savings
3287 * balance.
3288 */
3289 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3290 sds->power_savings_balance = 0;
3291 else {
3292 sds->power_savings_balance = 1;
3293 sds->min_nr_running = ULONG_MAX;
3294 sds->leader_nr_running = 0;
3295 }
3296}
3297
3298/**
3299 * update_sd_power_savings_stats - Update the power saving stats for a
3300 * sched_domain while performing load balancing.
3301 *
3302 * @group: sched_group belonging to the sched_domain under consideration.
3303 * @sds: Variable containing the statistics of the sched_domain
3304 * @local_group: Does group contain the CPU for which we're performing
3305 * load balancing ?
3306 * @sgs: Variable containing the statistics of the group.
3307 */
3308static inline void update_sd_power_savings_stats(struct sched_group *group,
3309 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3310{
3148 3311
3149 /* Tally up the load of all CPUs in the group */ 3312 if (!sds->power_savings_balance)
3150 sum_weighted_load = sum_nr_running = avg_load = 0; 3313 return;
3151 sum_avg_load_per_task = avg_load_per_task = 0;
3152 3314
3153 max_cpu_load = 0; 3315 /*
3154 min_cpu_load = ~0UL; 3316 * If the local group is idle or completely loaded
3317 * no need to do power savings balance at this domain
3318 */
3319 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3320 !sds->this_nr_running))
3321 sds->power_savings_balance = 0;
3155 3322
3156 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 3323 /*
3157 struct rq *rq = cpu_rq(i); 3324 * If a group is already running at full capacity or idle,
3325 * don't include that group in power savings calculations
3326 */
3327 if (!sds->power_savings_balance ||
3328 sgs->sum_nr_running >= sgs->group_capacity ||
3329 !sgs->sum_nr_running)
3330 return;
3158 3331
3159 if (*sd_idle && rq->nr_running) 3332 /*
3160 *sd_idle = 0; 3333 * Calculate the group which has the least non-idle load.
3334 * This is the group from where we need to pick up the load
3335 * for saving power
3336 */
3337 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3338 (sgs->sum_nr_running == sds->min_nr_running &&
3339 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3340 sds->group_min = group;
3341 sds->min_nr_running = sgs->sum_nr_running;
3342 sds->min_load_per_task = sgs->sum_weighted_load /
3343 sgs->sum_nr_running;
3344 }
3161 3345
3162 /* Bias balancing toward cpus of our domain */ 3346 /*
3163 if (local_group) { 3347 * Calculate the group which is almost near its
3164 if (idle_cpu(i) && !first_idle_cpu) { 3348 * capacity but still has some space to pick up some load
3165 first_idle_cpu = 1; 3349 * from other group and save more power
3166 balance_cpu = i; 3350 */
3167 } 3351 if (sgs->sum_nr_running > sgs->group_capacity - 1)
3352 return;
3168 3353
3169 load = target_load(i, load_idx); 3354 if (sgs->sum_nr_running > sds->leader_nr_running ||
3170 } else { 3355 (sgs->sum_nr_running == sds->leader_nr_running &&
3171 load = source_load(i, load_idx); 3356 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3172 if (load > max_cpu_load) 3357 sds->group_leader = group;
3173 max_cpu_load = load; 3358 sds->leader_nr_running = sgs->sum_nr_running;
3174 if (min_cpu_load > load) 3359 }
3175 min_cpu_load = load; 3360}
3176 }
3177 3361
3178 avg_load += load; 3362/**
3179 sum_nr_running += rq->nr_running; 3363 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3180 sum_weighted_load += weighted_cpuload(i); 3364 * @sds: Variable containing the statistics of the sched_domain
3365 * under consideration.
3366 * @this_cpu: Cpu at which we're currently performing load-balancing.
3367 * @imbalance: Variable to store the imbalance.
3368 *
3369 * Description:
3370 * Check if we have potential to perform some power-savings balance.
3371 * If yes, set the busiest group to be the least loaded group in the
3372 * sched_domain, so that it's CPUs can be put to idle.
3373 *
3374 * Returns 1 if there is potential to perform power-savings balance.
3375 * Else returns 0.
3376 */
3377static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3378 int this_cpu, unsigned long *imbalance)
3379{
3380 if (!sds->power_savings_balance)
3381 return 0;
3181 3382
3182 sum_avg_load_per_task += cpu_avg_load_per_task(i); 3383 if (sds->this != sds->group_leader ||
3183 } 3384 sds->group_leader == sds->group_min)
3385 return 0;
3184 3386
3185 /* 3387 *imbalance = sds->min_load_per_task;
3186 * First idle cpu or the first cpu(busiest) in this sched group 3388 sds->busiest = sds->group_min;
3187 * is eligible for doing load balancing at this and above
3188 * domains. In the newly idle case, we will allow all the cpu's
3189 * to do the newly idle load balance.
3190 */
3191 if (idle != CPU_NEWLY_IDLE && local_group &&
3192 balance_cpu != this_cpu && balance) {
3193 *balance = 0;
3194 goto ret;
3195 }
3196 3389
3197 total_load += avg_load; 3390 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
3198 total_pwr += group->__cpu_power; 3391 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
3392 group_first_cpu(sds->group_leader);
3393 }
3199 3394
3200 /* Adjust by relative CPU power of the group */ 3395 return 1;
3201 avg_load = sg_div_cpu_power(group,
3202 avg_load * SCHED_LOAD_SCALE);
3203 3396
3397}
3398#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3399static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3400 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3401{
3402 return;
3403}
3204 3404
3205 /* 3405static inline void update_sd_power_savings_stats(struct sched_group *group,
3206 * Consider the group unbalanced when the imbalance is larger 3406 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3207 * than the average weight of two tasks. 3407{
3208 * 3408 return;
3209 * APZ: with cgroup the avg task weight can vary wildly and 3409}
3210 * might not be a suitable number - should we keep a
3211 * normalized nr_running number somewhere that negates
3212 * the hierarchy?
3213 */
3214 avg_load_per_task = sg_div_cpu_power(group,
3215 sum_avg_load_per_task * SCHED_LOAD_SCALE);
3216 3410
3217 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task) 3411static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3218 __group_imb = 1; 3412 int this_cpu, unsigned long *imbalance)
3413{
3414 return 0;
3415}
3416#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3219 3417
3220 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
3221 3418
3419/**
3420 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3421 * @group: sched_group whose statistics are to be updated.
3422 * @this_cpu: Cpu for which load balance is currently performed.
3423 * @idle: Idle status of this_cpu
3424 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3425 * @sd_idle: Idle status of the sched_domain containing group.
3426 * @local_group: Does group contain this_cpu.
3427 * @cpus: Set of cpus considered for load balancing.
3428 * @balance: Should we balance.
3429 * @sgs: variable to hold the statistics for this group.
3430 */
3431static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
3432 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3433 int local_group, const struct cpumask *cpus,
3434 int *balance, struct sg_lb_stats *sgs)
3435{
3436 unsigned long load, max_cpu_load, min_cpu_load;
3437 int i;
3438 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3439 unsigned long sum_avg_load_per_task;
3440 unsigned long avg_load_per_task;
3441
3442 if (local_group)
3443 balance_cpu = group_first_cpu(group);
3444
3445 /* Tally up the load of all CPUs in the group */
3446 sum_avg_load_per_task = avg_load_per_task = 0;
3447 max_cpu_load = 0;
3448 min_cpu_load = ~0UL;
3449
3450 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3451 struct rq *rq = cpu_rq(i);
3452
3453 if (*sd_idle && rq->nr_running)
3454 *sd_idle = 0;
3455
3456 /* Bias balancing toward cpus of our domain */
3222 if (local_group) { 3457 if (local_group) {
3223 this_load = avg_load; 3458 if (idle_cpu(i) && !first_idle_cpu) {
3224 this = group; 3459 first_idle_cpu = 1;
3225 this_nr_running = sum_nr_running; 3460 balance_cpu = i;
3226 this_load_per_task = sum_weighted_load; 3461 }
3227 } else if (avg_load > max_load && 3462
3228 (sum_nr_running > group_capacity || __group_imb)) { 3463 load = target_load(i, load_idx);
3229 max_load = avg_load; 3464 } else {
3230 busiest = group; 3465 load = source_load(i, load_idx);
3231 busiest_nr_running = sum_nr_running; 3466 if (load > max_cpu_load)
3232 busiest_load_per_task = sum_weighted_load; 3467 max_cpu_load = load;
3233 group_imb = __group_imb; 3468 if (min_cpu_load > load)
3469 min_cpu_load = load;
3234 } 3470 }
3235 3471
3236#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 3472 sgs->group_load += load;
3237 /* 3473 sgs->sum_nr_running += rq->nr_running;
3238 * Busy processors will not participate in power savings 3474 sgs->sum_weighted_load += weighted_cpuload(i);
3239 * balance.
3240 */
3241 if (idle == CPU_NOT_IDLE ||
3242 !(sd->flags & SD_POWERSAVINGS_BALANCE))
3243 goto group_next;
3244 3475
3245 /* 3476 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3246 * If the local group is idle or completely loaded 3477 }
3247 * no need to do power savings balance at this domain
3248 */
3249 if (local_group && (this_nr_running >= group_capacity ||
3250 !this_nr_running))
3251 power_savings_balance = 0;
3252 3478
3253 /* 3479 /*
3254 * If a group is already running at full capacity or idle, 3480 * First idle cpu or the first cpu(busiest) in this sched group
3255 * don't include that group in power savings calculations 3481 * is eligible for doing load balancing at this and above
3256 */ 3482 * domains. In the newly idle case, we will allow all the cpu's
3257 if (!power_savings_balance || sum_nr_running >= group_capacity 3483 * to do the newly idle load balance.
3258 || !sum_nr_running) 3484 */
3259 goto group_next; 3485 if (idle != CPU_NEWLY_IDLE && local_group &&
3486 balance_cpu != this_cpu && balance) {
3487 *balance = 0;
3488 return;
3489 }
3260 3490
3261 /* 3491 /* Adjust by relative CPU power of the group */
3262 * Calculate the group which has the least non-idle load. 3492 sgs->avg_load = sg_div_cpu_power(group,
3263 * This is the group from where we need to pick up the load 3493 sgs->group_load * SCHED_LOAD_SCALE);
3264 * for saving power
3265 */
3266 if ((sum_nr_running < min_nr_running) ||
3267 (sum_nr_running == min_nr_running &&
3268 cpumask_first(sched_group_cpus(group)) >
3269 cpumask_first(sched_group_cpus(group_min)))) {
3270 group_min = group;
3271 min_nr_running = sum_nr_running;
3272 min_load_per_task = sum_weighted_load /
3273 sum_nr_running;
3274 }
3275 3494
3276 /* 3495
3277 * Calculate the group which is almost near its 3496 /*
3278 * capacity but still has some space to pick up some load 3497 * Consider the group unbalanced when the imbalance is larger
3279 * from other group and save more power 3498 * than the average weight of two tasks.
3280 */ 3499 *
3281 if (sum_nr_running <= group_capacity - 1) { 3500 * APZ: with cgroup the avg task weight can vary wildly and
3282 if (sum_nr_running > leader_nr_running || 3501 * might not be a suitable number - should we keep a
3283 (sum_nr_running == leader_nr_running && 3502 * normalized nr_running number somewhere that negates
3284 cpumask_first(sched_group_cpus(group)) < 3503 * the hierarchy?
3285 cpumask_first(sched_group_cpus(group_leader)))) { 3504 */
3286 group_leader = group; 3505 avg_load_per_task = sg_div_cpu_power(group,
3287 leader_nr_running = sum_nr_running; 3506 sum_avg_load_per_task * SCHED_LOAD_SCALE);
3288 } 3507
3508 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3509 sgs->group_imb = 1;
3510
3511 sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
3512
3513}
3514
3515/**
3516 * update_sd_lb_stats - Update sched_group's statistics for load balancing.
3517 * @sd: sched_domain whose statistics are to be updated.
3518 * @this_cpu: Cpu for which load balance is currently performed.
3519 * @idle: Idle status of this_cpu
3520 * @sd_idle: Idle status of the sched_domain containing group.
3521 * @cpus: Set of cpus considered for load balancing.
3522 * @balance: Should we balance.
3523 * @sds: variable to hold the statistics for this sched_domain.
3524 */
3525static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3526 enum cpu_idle_type idle, int *sd_idle,
3527 const struct cpumask *cpus, int *balance,
3528 struct sd_lb_stats *sds)
3529{
3530 struct sched_group *group = sd->groups;
3531 struct sg_lb_stats sgs;
3532 int load_idx;
3533
3534 init_sd_power_savings_stats(sd, sds, idle);
3535 load_idx = get_sd_load_idx(sd, idle);
3536
3537 do {
3538 int local_group;
3539
3540 local_group = cpumask_test_cpu(this_cpu,
3541 sched_group_cpus(group));
3542 memset(&sgs, 0, sizeof(sgs));
3543 update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
3544 local_group, cpus, balance, &sgs);
3545
3546 if (local_group && balance && !(*balance))
3547 return;
3548
3549 sds->total_load += sgs.group_load;
3550 sds->total_pwr += group->__cpu_power;
3551
3552 if (local_group) {
3553 sds->this_load = sgs.avg_load;
3554 sds->this = group;
3555 sds->this_nr_running = sgs.sum_nr_running;
3556 sds->this_load_per_task = sgs.sum_weighted_load;
3557 } else if (sgs.avg_load > sds->max_load &&
3558 (sgs.sum_nr_running > sgs.group_capacity ||
3559 sgs.group_imb)) {
3560 sds->max_load = sgs.avg_load;
3561 sds->busiest = group;
3562 sds->busiest_nr_running = sgs.sum_nr_running;
3563 sds->busiest_load_per_task = sgs.sum_weighted_load;
3564 sds->group_imb = sgs.group_imb;
3289 } 3565 }
3290group_next: 3566
3291#endif 3567 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3292 group = group->next; 3568 group = group->next;
3293 } while (group != sd->groups); 3569 } while (group != sd->groups);
3294 3570
3295 if (!busiest || this_load >= max_load || busiest_nr_running == 0) 3571}
3296 goto out_balanced;
3297
3298 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
3299 3572
3300 if (this_load >= avg_load || 3573/**
3301 100*max_load <= sd->imbalance_pct*this_load) 3574 * fix_small_imbalance - Calculate the minor imbalance that exists
3302 goto out_balanced; 3575 * amongst the groups of a sched_domain, during
3576 * load balancing.
3577 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3578 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3579 * @imbalance: Variable to store the imbalance.
3580 */
3581static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3582 int this_cpu, unsigned long *imbalance)
3583{
3584 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3585 unsigned int imbn = 2;
3586
3587 if (sds->this_nr_running) {
3588 sds->this_load_per_task /= sds->this_nr_running;
3589 if (sds->busiest_load_per_task >
3590 sds->this_load_per_task)
3591 imbn = 1;
3592 } else
3593 sds->this_load_per_task =
3594 cpu_avg_load_per_task(this_cpu);
3303 3595
3304 busiest_load_per_task /= busiest_nr_running; 3596 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
3305 if (group_imb) 3597 sds->busiest_load_per_task * imbn) {
3306 busiest_load_per_task = min(busiest_load_per_task, avg_load); 3598 *imbalance = sds->busiest_load_per_task;
3599 return;
3600 }
3307 3601
3308 /* 3602 /*
3309 * We're trying to get all the cpus to the average_load, so we don't 3603 * OK, we don't have enough imbalance to justify moving tasks,
3310 * want to push ourselves above the average load, nor do we wish to 3604 * however we may be able to increase total CPU power used by
3311 * reduce the max loaded cpu below the average load, as either of these 3605 * moving them.
3312 * actions would just result in more rebalancing later, and ping-pong
3313 * tasks around. Thus we look for the minimum possible imbalance.
3314 * Negative imbalances (*we* are more loaded than anyone else) will
3315 * be counted as no imbalance for these purposes -- we can't fix that
3316 * by pulling tasks to us. Be careful of negative numbers as they'll
3317 * appear as very large values with unsigned longs.
3318 */ 3606 */
3319 if (max_load <= busiest_load_per_task)
3320 goto out_balanced;
3321 3607
3608 pwr_now += sds->busiest->__cpu_power *
3609 min(sds->busiest_load_per_task, sds->max_load);
3610 pwr_now += sds->this->__cpu_power *
3611 min(sds->this_load_per_task, sds->this_load);
3612 pwr_now /= SCHED_LOAD_SCALE;
3613
3614 /* Amount of load we'd subtract */
3615 tmp = sg_div_cpu_power(sds->busiest,
3616 sds->busiest_load_per_task * SCHED_LOAD_SCALE);
3617 if (sds->max_load > tmp)
3618 pwr_move += sds->busiest->__cpu_power *
3619 min(sds->busiest_load_per_task, sds->max_load - tmp);
3620
3621 /* Amount of load we'd add */
3622 if (sds->max_load * sds->busiest->__cpu_power <
3623 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3624 tmp = sg_div_cpu_power(sds->this,
3625 sds->max_load * sds->busiest->__cpu_power);
3626 else
3627 tmp = sg_div_cpu_power(sds->this,
3628 sds->busiest_load_per_task * SCHED_LOAD_SCALE);
3629 pwr_move += sds->this->__cpu_power *
3630 min(sds->this_load_per_task, sds->this_load + tmp);
3631 pwr_move /= SCHED_LOAD_SCALE;
3632
3633 /* Move if we gain throughput */
3634 if (pwr_move > pwr_now)
3635 *imbalance = sds->busiest_load_per_task;
3636}
3637
3638/**
3639 * calculate_imbalance - Calculate the amount of imbalance present within the
3640 * groups of a given sched_domain during load balance.
3641 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
3642 * @this_cpu: Cpu for which currently load balance is being performed.
3643 * @imbalance: The variable to store the imbalance.
3644 */
3645static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3646 unsigned long *imbalance)
3647{
3648 unsigned long max_pull;
3322 /* 3649 /*
3323 * In the presence of smp nice balancing, certain scenarios can have 3650 * In the presence of smp nice balancing, certain scenarios can have
3324 * max load less than avg load(as we skip the groups at or below 3651 * max load less than avg load(as we skip the groups at or below
3325 * its cpu_power, while calculating max_load..) 3652 * its cpu_power, while calculating max_load..)
3326 */ 3653 */
3327 if (max_load < avg_load) { 3654 if (sds->max_load < sds->avg_load) {
3328 *imbalance = 0; 3655 *imbalance = 0;
3329 goto small_imbalance; 3656 return fix_small_imbalance(sds, this_cpu, imbalance);
3330 } 3657 }
3331 3658
3332 /* Don't want to pull so many tasks that a group would go idle */ 3659 /* Don't want to pull so many tasks that a group would go idle */
3333 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); 3660 max_pull = min(sds->max_load - sds->avg_load,
3661 sds->max_load - sds->busiest_load_per_task);
3334 3662
3335 /* How much load to actually move to equalise the imbalance */ 3663 /* How much load to actually move to equalise the imbalance */
3336 *imbalance = min(max_pull * busiest->__cpu_power, 3664 *imbalance = min(max_pull * sds->busiest->__cpu_power,
3337 (avg_load - this_load) * this->__cpu_power) 3665 (sds->avg_load - sds->this_load) * sds->this->__cpu_power)
3338 / SCHED_LOAD_SCALE; 3666 / SCHED_LOAD_SCALE;
3339 3667
3340 /* 3668 /*
@@ -3343,78 +3671,110 @@ group_next:
3343 * a think about bumping its value to force at least one task to be 3671 * a think about bumping its value to force at least one task to be
3344 * moved 3672 * moved
3345 */ 3673 */
3346 if (*imbalance < busiest_load_per_task) { 3674 if (*imbalance < sds->busiest_load_per_task)
3347 unsigned long tmp, pwr_now, pwr_move; 3675 return fix_small_imbalance(sds, this_cpu, imbalance);
3348 unsigned int imbn;
3349
3350small_imbalance:
3351 pwr_move = pwr_now = 0;
3352 imbn = 2;
3353 if (this_nr_running) {
3354 this_load_per_task /= this_nr_running;
3355 if (busiest_load_per_task > this_load_per_task)
3356 imbn = 1;
3357 } else
3358 this_load_per_task = cpu_avg_load_per_task(this_cpu);
3359 3676
3360 if (max_load - this_load + busiest_load_per_task >= 3677}
3361 busiest_load_per_task * imbn) { 3678/******* find_busiest_group() helpers end here *********************/
3362 *imbalance = busiest_load_per_task;
3363 return busiest;
3364 }
3365 3679
3366 /* 3680/**
3367 * OK, we don't have enough imbalance to justify moving tasks, 3681 * find_busiest_group - Returns the busiest group within the sched_domain
3368 * however we may be able to increase total CPU power used by 3682 * if there is an imbalance. If there isn't an imbalance, and
3369 * moving them. 3683 * the user has opted for power-savings, it returns a group whose
3370 */ 3684 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
3685 * such a group exists.
3686 *
3687 * Also calculates the amount of weighted load which should be moved
3688 * to restore balance.
3689 *
3690 * @sd: The sched_domain whose busiest group is to be returned.
3691 * @this_cpu: The cpu for which load balancing is currently being performed.
3692 * @imbalance: Variable which stores amount of weighted load which should
3693 * be moved to restore balance/put a group to idle.
3694 * @idle: The idle status of this_cpu.
3695 * @sd_idle: The idleness of sd
3696 * @cpus: The set of CPUs under consideration for load-balancing.
3697 * @balance: Pointer to a variable indicating if this_cpu
3698 * is the appropriate cpu to perform load balancing at this_level.
3699 *
3700 * Returns: - the busiest group if imbalance exists.
3701 * - If no imbalance and user has opted for power-savings balance,
3702 * return the least loaded group whose CPUs can be
3703 * put to idle by rebalancing its tasks onto our group.
3704 */
3705static struct sched_group *
3706find_busiest_group(struct sched_domain *sd, int this_cpu,
3707 unsigned long *imbalance, enum cpu_idle_type idle,
3708 int *sd_idle, const struct cpumask *cpus, int *balance)
3709{
3710 struct sd_lb_stats sds;
3371 3711
3372 pwr_now += busiest->__cpu_power * 3712 memset(&sds, 0, sizeof(sds));
3373 min(busiest_load_per_task, max_load);
3374 pwr_now += this->__cpu_power *
3375 min(this_load_per_task, this_load);
3376 pwr_now /= SCHED_LOAD_SCALE;
3377
3378 /* Amount of load we'd subtract */
3379 tmp = sg_div_cpu_power(busiest,
3380 busiest_load_per_task * SCHED_LOAD_SCALE);
3381 if (max_load > tmp)
3382 pwr_move += busiest->__cpu_power *
3383 min(busiest_load_per_task, max_load - tmp);
3384
3385 /* Amount of load we'd add */
3386 if (max_load * busiest->__cpu_power <
3387 busiest_load_per_task * SCHED_LOAD_SCALE)
3388 tmp = sg_div_cpu_power(this,
3389 max_load * busiest->__cpu_power);
3390 else
3391 tmp = sg_div_cpu_power(this,
3392 busiest_load_per_task * SCHED_LOAD_SCALE);
3393 pwr_move += this->__cpu_power *
3394 min(this_load_per_task, this_load + tmp);
3395 pwr_move /= SCHED_LOAD_SCALE;
3396 3713
3397 /* Move if we gain throughput */ 3714 /*
3398 if (pwr_move > pwr_now) 3715 * Compute the various statistics relavent for load balancing at
3399 *imbalance = busiest_load_per_task; 3716 * this level.
3400 } 3717 */
3718 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
3719 balance, &sds);
3720
3721 /* Cases where imbalance does not exist from POV of this_cpu */
3722 /* 1) this_cpu is not the appropriate cpu to perform load balancing
3723 * at this level.
3724 * 2) There is no busy sibling group to pull from.
3725 * 3) This group is the busiest group.
3726 * 4) This group is more busy than the avg busieness at this
3727 * sched_domain.
3728 * 5) The imbalance is within the specified limit.
3729 * 6) Any rebalance would lead to ping-pong
3730 */
3731 if (balance && !(*balance))
3732 goto ret;
3401 3733
3402 return busiest; 3734 if (!sds.busiest || sds.busiest_nr_running == 0)
3735 goto out_balanced;
3403 3736
3404out_balanced: 3737 if (sds.this_load >= sds.max_load)
3405#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 3738 goto out_balanced;
3406 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3407 goto ret;
3408 3739
3409 if (this == group_leader && group_leader != group_min) { 3740 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
3410 *imbalance = min_load_per_task; 3741
3411 if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { 3742 if (sds.this_load >= sds.avg_load)
3412 cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = 3743 goto out_balanced;
3413 cpumask_first(sched_group_cpus(group_leader)); 3744
3414 } 3745 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
3415 return group_min; 3746 goto out_balanced;
3416 } 3747
3417#endif 3748 sds.busiest_load_per_task /= sds.busiest_nr_running;
3749 if (sds.group_imb)
3750 sds.busiest_load_per_task =
3751 min(sds.busiest_load_per_task, sds.avg_load);
3752
3753 /*
3754 * We're trying to get all the cpus to the average_load, so we don't
3755 * want to push ourselves above the average load, nor do we wish to
3756 * reduce the max loaded cpu below the average load, as either of these
3757 * actions would just result in more rebalancing later, and ping-pong
3758 * tasks around. Thus we look for the minimum possible imbalance.
3759 * Negative imbalances (*we* are more loaded than anyone else) will
3760 * be counted as no imbalance for these purposes -- we can't fix that
3761 * by pulling tasks to us. Be careful of negative numbers as they'll
3762 * appear as very large values with unsigned longs.
3763 */
3764 if (sds.max_load <= sds.busiest_load_per_task)
3765 goto out_balanced;
3766
3767 /* Looks like there is an imbalance. Compute it */
3768 calculate_imbalance(&sds, this_cpu, imbalance);
3769 return sds.busiest;
3770
3771out_balanced:
3772 /*
3773 * There is no obvious imbalance. But check if we can do some balancing
3774 * to save power.
3775 */
3776 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
3777 return sds.busiest;
3418ret: 3778ret:
3419 *imbalance = 0; 3779 *imbalance = 0;
3420 return NULL; 3780 return NULL;
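Note: after this refactor find_busiest_group() is a thin driver: per-group statistics are gathered into sg_lb_stats, folded into sd_lb_stats by update_sd_lb_stats(), the "no imbalance" early-outs run, and the actual amount to move is left to calculate_imbalance()/fix_small_imbalance(). A heavily simplified, standalone sketch of that driver shape follows; the types, thresholds and the equal-power assumption are illustrative, not the kernel's structures.

#include <stdio.h>

/* Illustrative analogue of sg_lb_stats / sd_lb_stats. */
struct group_stats { unsigned long avg_load, nr_running, capacity; };
struct domain_stats {
    unsigned long total_load, total_pwr, this_load, max_load;
    int busiest;                      /* index of busiest group, -1 = none */
};

static void update_domain_stats(struct domain_stats *sds,
                                const struct group_stats *g, int ngroups,
                                int this_group)
{
    for (int i = 0; i < ngroups; i++) {
        sds->total_load += g[i].avg_load;
        sds->total_pwr  += 1;                       /* pretend equal power */

        if (i == this_group) {
            sds->this_load = g[i].avg_load;
        } else if (g[i].avg_load > sds->max_load &&
                   g[i].nr_running > g[i].capacity) {
            sds->max_load = g[i].avg_load;
            sds->busiest = i;
        }
    }
}

int main(void)
{
    struct group_stats groups[] = {
        { .avg_load = 40, .nr_running = 1, .capacity = 2 },  /* local */
        { .avg_load = 90, .nr_running = 4, .capacity = 2 },
        { .avg_load = 60, .nr_running = 2, .capacity = 2 },
    };
    struct domain_stats sds = { .busiest = -1 };

    update_domain_stats(&sds, groups, 3, 0);

    /* Early-outs in the spirit of find_busiest_group(). */
    if (sds.busiest < 0 || sds.this_load >= sds.max_load)
        printf("balanced\n");
    else
        printf("pull from group %d (imbalance ~%lu)\n", sds.busiest,
               (sds.max_load - sds.total_load / sds.total_pwr) / 2);
    return 0;
}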
@@ -3890,19 +4250,24 @@ int select_nohz_load_balancer(int stop_tick)
3890 int cpu = smp_processor_id(); 4250 int cpu = smp_processor_id();
3891 4251
3892 if (stop_tick) { 4252 if (stop_tick) {
3893 cpumask_set_cpu(cpu, nohz.cpu_mask);
3894 cpu_rq(cpu)->in_nohz_recently = 1; 4253 cpu_rq(cpu)->in_nohz_recently = 1;
3895 4254
3896 /* 4255 if (!cpu_active(cpu)) {
3897 * If we are going offline and still the leader, give up! 4256 if (atomic_read(&nohz.load_balancer) != cpu)
3898 */ 4257 return 0;
3899 if (!cpu_active(cpu) && 4258
3900 atomic_read(&nohz.load_balancer) == cpu) { 4259 /*
4260 * If we are going offline and still the leader,
4261 * give up!
4262 */
3901 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 4263 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3902 BUG(); 4264 BUG();
4265
3903 return 0; 4266 return 0;
3904 } 4267 }
3905 4268
4269 cpumask_set_cpu(cpu, nohz.cpu_mask);
4270
3906 /* time for ilb owner also to sleep */ 4271 /* time for ilb owner also to sleep */
3907 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) { 4272 if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3908 if (atomic_read(&nohz.load_balancer) == cpu) 4273 if (atomic_read(&nohz.load_balancer) == cpu)
@@ -4062,6 +4427,11 @@ static void run_rebalance_domains(struct softirq_action *h)
4062#endif 4427#endif
4063} 4428}
4064 4429
4430static inline int on_null_domain(int cpu)
4431{
4432 return !rcu_dereference(cpu_rq(cpu)->sd);
4433}
4434
4065/* 4435/*
4066 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 4436 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4067 * 4437 *
@@ -4119,7 +4489,9 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
4119 cpumask_test_cpu(cpu, nohz.cpu_mask)) 4489 cpumask_test_cpu(cpu, nohz.cpu_mask))
4120 return; 4490 return;
4121#endif 4491#endif
4122 if (time_after_eq(jiffies, rq->next_balance)) 4492 /* Don't need to rebalance while attached to NULL domain */
4493 if (time_after_eq(jiffies, rq->next_balance) &&
4494 likely(!on_null_domain(cpu)))
4123 raise_softirq(SCHED_SOFTIRQ); 4495 raise_softirq(SCHED_SOFTIRQ);
4124} 4496}
4125 4497
@@ -4513,11 +4885,33 @@ static inline void schedule_debug(struct task_struct *prev)
4513#endif 4885#endif
4514} 4886}
4515 4887
4888static void put_prev_task(struct rq *rq, struct task_struct *prev)
4889{
4890 if (prev->state == TASK_RUNNING) {
4891 u64 runtime = prev->se.sum_exec_runtime;
4892
4893 runtime -= prev->se.prev_sum_exec_runtime;
4894 runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
4895
4896 /*
4897 * In order to avoid avg_overlap growing stale when we are
4898 * indeed overlapping and hence not getting put to sleep, grow
4899 * the avg_overlap on preemption.
4900 *
4901 * We use the average preemption runtime because that
4902 * correlates to the amount of cache footprint a task can
4903 * build up.
4904 */
4905 update_avg(&prev->se.avg_overlap, runtime);
4906 }
4907 prev->sched_class->put_prev_task(rq, prev);
4908}
4909
4516/* 4910/*
4517 * Pick up the highest-prio task: 4911 * Pick up the highest-prio task:
4518 */ 4912 */
4519static inline struct task_struct * 4913static inline struct task_struct *
4520pick_next_task(struct rq *rq, struct task_struct *prev) 4914pick_next_task(struct rq *rq)
4521{ 4915{
4522 const struct sched_class *class; 4916 const struct sched_class *class;
4523 struct task_struct *p; 4917 struct task_struct *p;
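Note: the new put_prev_task() wrapper grows avg_overlap for tasks that are preempted while still TASK_RUNNING, using the runtime since they were last scheduled, clamped to 2*sysctl_sched_migration_cost so one long slice cannot blow the average up. A worked sketch of the clamp plus the 1/8 average, with illustrative values (the constant below merely stands in for the sysctl):

#include <stdio.h>
#include <stdint.h>

#define MIGRATION_COST 500000ULL          /* stand-in for the sysctl, in ns */

static uint64_t min_u64(uint64_t a, uint64_t b) { return a < b ? a : b; }

int main(void)
{
    uint64_t avg_overlap = 100000;        /* 100us so far */
    uint64_t slice = 5000000;             /* preempted after a 5ms run */

    /* Clamp the sample the way put_prev_task() does, then average. */
    uint64_t sample = min_u64(slice, 2 * MIGRATION_COST);
    avg_overlap += (int64_t)(sample - avg_overlap) / 8;

    printf("sample=%llu avg_overlap=%llu\n",
           (unsigned long long)sample, (unsigned long long)avg_overlap);
    return 0;
}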
@@ -4589,8 +4983,8 @@ need_resched_nonpreemptible:
4589 if (unlikely(!rq->nr_running)) 4983 if (unlikely(!rq->nr_running))
4590 idle_balance(cpu, rq); 4984 idle_balance(cpu, rq);
4591 4985
4592 prev->sched_class->put_prev_task(rq, prev); 4986 put_prev_task(rq, prev);
4593 next = pick_next_task(rq, prev); 4987 next = pick_next_task(rq);
4594 4988
4595 if (likely(prev != next)) { 4989 if (likely(prev != next)) {
4596 sched_info_switch(prev, next); 4990 sched_info_switch(prev, next);
@@ -4712,7 +5106,7 @@ asmlinkage void __sched preempt_schedule(void)
4712 * between schedule and now. 5106 * between schedule and now.
4713 */ 5107 */
4714 barrier(); 5108 barrier();
4715 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); 5109 } while (need_resched());
4716} 5110}
4717EXPORT_SYMBOL(preempt_schedule); 5111EXPORT_SYMBOL(preempt_schedule);
4718 5112
@@ -4741,7 +5135,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
4741 * between schedule and now. 5135 * between schedule and now.
4742 */ 5136 */
4743 barrier(); 5137 barrier();
4744 } while (unlikely(test_thread_flag(TIF_NEED_RESCHED))); 5138 } while (need_resched());
4745} 5139}
4746 5140
4747#endif /* CONFIG_PREEMPT */ 5141#endif /* CONFIG_PREEMPT */
@@ -5215,7 +5609,7 @@ SYSCALL_DEFINE1(nice, int, increment)
5215 if (increment > 40) 5609 if (increment > 40)
5216 increment = 40; 5610 increment = 40;
5217 5611
5218 nice = PRIO_TO_NICE(current->static_prio) + increment; 5612 nice = TASK_NICE(current) + increment;
5219 if (nice < -20) 5613 if (nice < -20)
5220 nice = -20; 5614 nice = -20;
5221 if (nice > 19) 5615 if (nice > 19)
@@ -6014,12 +6408,7 @@ void sched_show_task(struct task_struct *p)
6014 printk(KERN_CONT " %016lx ", thread_saved_pc(p)); 6408 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
6015#endif 6409#endif
6016#ifdef CONFIG_DEBUG_STACK_USAGE 6410#ifdef CONFIG_DEBUG_STACK_USAGE
6017 { 6411 free = stack_not_used(p);
6018 unsigned long *n = end_of_stack(p);
6019 while (!*n)
6020 n++;
6021 free = (unsigned long)n - (unsigned long)end_of_stack(p);
6022 }
6023#endif 6412#endif
6024 printk(KERN_CONT "%5lu %5d %6d\n", free, 6413 printk(KERN_CONT "%5lu %5d %6d\n", free,
6025 task_pid_nr(p), task_pid_nr(p->real_parent)); 6414 task_pid_nr(p), task_pid_nr(p->real_parent));
@@ -6493,7 +6882,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
6493 if (!rq->nr_running) 6882 if (!rq->nr_running)
6494 break; 6883 break;
6495 update_rq_clock(rq); 6884 update_rq_clock(rq);
6496 next = pick_next_task(rq, rq->curr); 6885 next = pick_next_task(rq);
6497 if (!next) 6886 if (!next)
6498 break; 6887 break;
6499 next->sched_class->put_prev_task(rq, next); 6888 next->sched_class->put_prev_task(rq, next);
@@ -7014,20 +7403,26 @@ static void free_rootdomain(struct root_domain *rd)
7014 7403
7015static void rq_attach_root(struct rq *rq, struct root_domain *rd) 7404static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7016{ 7405{
7406 struct root_domain *old_rd = NULL;
7017 unsigned long flags; 7407 unsigned long flags;
7018 7408
7019 spin_lock_irqsave(&rq->lock, flags); 7409 spin_lock_irqsave(&rq->lock, flags);
7020 7410
7021 if (rq->rd) { 7411 if (rq->rd) {
7022 struct root_domain *old_rd = rq->rd; 7412 old_rd = rq->rd;
7023 7413
7024 if (cpumask_test_cpu(rq->cpu, old_rd->online)) 7414 if (cpumask_test_cpu(rq->cpu, old_rd->online))
7025 set_rq_offline(rq); 7415 set_rq_offline(rq);
7026 7416
7027 cpumask_clear_cpu(rq->cpu, old_rd->span); 7417 cpumask_clear_cpu(rq->cpu, old_rd->span);
7028 7418
7029 if (atomic_dec_and_test(&old_rd->refcount)) 7419 /*
7030 free_rootdomain(old_rd); 7420 * If we dont want to free the old_rt yet then
7421 * set old_rd to NULL to skip the freeing later
7422 * in this function:
7423 */
7424 if (!atomic_dec_and_test(&old_rd->refcount))
7425 old_rd = NULL;
7031 } 7426 }
7032 7427
7033 atomic_inc(&rd->refcount); 7428 atomic_inc(&rd->refcount);
@@ -7038,6 +7433,9 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
7038 set_rq_online(rq); 7433 set_rq_online(rq);
7039 7434
7040 spin_unlock_irqrestore(&rq->lock, flags); 7435 spin_unlock_irqrestore(&rq->lock, flags);
7436
7437 if (old_rd)
7438 free_rootdomain(old_rd);
7041} 7439}
7042 7440
7043static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem) 7441static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
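Note: rq_attach_root() now only remembers the old root domain while rq->lock is held and calls free_rootdomain() after the lock is dropped, so the freeing work never runs under a spinlock with interrupts off. The same defer-the-free pattern in a standalone pthread sketch, with illustrative names:

#include <pthread.h>
#include <stdlib.h>
#include <stdio.h>

struct root { int refcount; };

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct root *current_root;

static void attach_root(struct root *new_root)
{
    struct root *old = NULL;

    pthread_mutex_lock(&lock);
    if (current_root && --current_root->refcount == 0)
        old = current_root;              /* remember it, don't free yet */
    new_root->refcount++;
    current_root = new_root;
    pthread_mutex_unlock(&lock);

    free(old);                           /* heavy work outside the lock */
}

int main(void)
{
    struct root *a = calloc(1, sizeof(*a));
    struct root *b = calloc(1, sizeof(*b));

    attach_root(a);
    attach_root(b);                      /* a is freed after the unlock */
    printf("attached, refcount=%d\n", current_root->refcount);
    free(b);
    return 0;
}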
@@ -8279,11 +8677,15 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8279 __set_bit(MAX_RT_PRIO, array->bitmap); 8677 __set_bit(MAX_RT_PRIO, array->bitmap);
8280 8678
8281#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 8679#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
8282 rt_rq->highest_prio = MAX_RT_PRIO; 8680 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8681#ifdef CONFIG_SMP
8682 rt_rq->highest_prio.next = MAX_RT_PRIO;
8683#endif
8283#endif 8684#endif
8284#ifdef CONFIG_SMP 8685#ifdef CONFIG_SMP
8285 rt_rq->rt_nr_migratory = 0; 8686 rt_rq->rt_nr_migratory = 0;
8286 rt_rq->overloaded = 0; 8687 rt_rq->overloaded = 0;
8688 plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
8287#endif 8689#endif
8288 8690
8289 rt_rq->rt_time = 0; 8691 rt_rq->rt_time = 0;
@@ -9285,6 +9687,16 @@ static int sched_rt_global_constraints(void)
9285 9687
9286 return ret; 9688 return ret;
9287} 9689}
9690
9691int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
9692{
9693 /* Don't accept realtime tasks when there is no way for them to run */
9694 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
9695 return 0;
9696
9697 return 1;
9698}
9699
9288#else /* !CONFIG_RT_GROUP_SCHED */ 9700#else /* !CONFIG_RT_GROUP_SCHED */
9289static int sched_rt_global_constraints(void) 9701static int sched_rt_global_constraints(void)
9290{ 9702{
@@ -9378,8 +9790,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
9378 struct task_struct *tsk) 9790 struct task_struct *tsk)
9379{ 9791{
9380#ifdef CONFIG_RT_GROUP_SCHED 9792#ifdef CONFIG_RT_GROUP_SCHED
9381 /* Don't accept realtime tasks when there is no way for them to run */ 9793 if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
9382 if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0)
9383 return -EINVAL; 9794 return -EINVAL;
9384#else 9795#else
9385 /* We don't support RT-tasks being in separate groups */ 9796 /* We don't support RT-tasks being in separate groups */
@@ -9542,7 +9953,7 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9542 9953
9543static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) 9954static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
9544{ 9955{
9545 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); 9956 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9546 u64 data; 9957 u64 data;
9547 9958
9548#ifndef CONFIG_64BIT 9959#ifndef CONFIG_64BIT
@@ -9561,7 +9972,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
9561 9972
9562static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) 9973static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
9563{ 9974{
9564 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); 9975 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9565 9976
9566#ifndef CONFIG_64BIT 9977#ifndef CONFIG_64BIT
9567 /* 9978 /*
@@ -9650,14 +10061,14 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9650 struct cpuacct *ca; 10061 struct cpuacct *ca;
9651 int cpu; 10062 int cpu;
9652 10063
9653 if (!cpuacct_subsys.active) 10064 if (unlikely(!cpuacct_subsys.active))
9654 return; 10065 return;
9655 10066
9656 cpu = task_cpu(tsk); 10067 cpu = task_cpu(tsk);
9657 ca = task_ca(tsk); 10068 ca = task_ca(tsk);
9658 10069
9659 for (; ca; ca = ca->parent) { 10070 for (; ca; ca = ca->parent) {
9660 u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu); 10071 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9661 *cpuusage += cputime; 10072 *cpuusage += cputime;
9662 } 10073 }
9663} 10074}