Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  2119
1 files changed, 112 insertions, 2007 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 404e2017c0cf..af5fa239804d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -233,7 +233,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
233 */ 233 */
234static DEFINE_MUTEX(sched_domains_mutex); 234static DEFINE_MUTEX(sched_domains_mutex);
235 235
236#ifdef CONFIG_GROUP_SCHED 236#ifdef CONFIG_CGROUP_SCHED
237 237
238#include <linux/cgroup.h> 238#include <linux/cgroup.h>
239 239
@@ -243,13 +243,7 @@ static LIST_HEAD(task_groups);
243 243
244/* task group related information */ 244/* task group related information */
245struct task_group { 245struct task_group {
246#ifdef CONFIG_CGROUP_SCHED
247 struct cgroup_subsys_state css; 246 struct cgroup_subsys_state css;
248#endif
249
250#ifdef CONFIG_USER_SCHED
251 uid_t uid;
252#endif
253 247
254#ifdef CONFIG_FAIR_GROUP_SCHED 248#ifdef CONFIG_FAIR_GROUP_SCHED
255 /* schedulable entities of this group on each cpu */ 249 /* schedulable entities of this group on each cpu */
@@ -274,35 +268,7 @@ struct task_group {
274 struct list_head children; 268 struct list_head children;
275}; 269};
276 270
277#ifdef CONFIG_USER_SCHED
278
279/* Helper function to pass uid information to create_sched_user() */
280void set_tg_uid(struct user_struct *user)
281{
282 user->tg->uid = user->uid;
283}
284
285/*
286 * Root task group.
287 * Every UID task group (including init_task_group aka UID-0) will
288 * be a child to this group.
289 */
290struct task_group root_task_group;
291
292#ifdef CONFIG_FAIR_GROUP_SCHED
293/* Default task group's sched entity on each cpu */
294static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
295/* Default task group's cfs_rq on each cpu */
296static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
297#endif /* CONFIG_FAIR_GROUP_SCHED */
298
299#ifdef CONFIG_RT_GROUP_SCHED
300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var);
302#endif /* CONFIG_RT_GROUP_SCHED */
303#else /* !CONFIG_USER_SCHED */
304#define root_task_group init_task_group 271#define root_task_group init_task_group
305#endif /* CONFIG_USER_SCHED */
306 272
307/* task_group_lock serializes add/remove of task groups and also changes to 273/* task_group_lock serializes add/remove of task groups and also changes to
308 * a task group's cpu shares. 274 * a task group's cpu shares.
@@ -318,11 +284,7 @@ static int root_task_group_empty(void)
318} 284}
319#endif 285#endif
320 286
321#ifdef CONFIG_USER_SCHED
322# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
323#else /* !CONFIG_USER_SCHED */
324# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 287# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
325#endif /* CONFIG_USER_SCHED */
326 288
327/* 289/*
328 * A weight of 0 or 1 can cause arithmetics problems. 290 * A weight of 0 or 1 can cause arithmetics problems.
@@ -348,11 +310,7 @@ static inline struct task_group *task_group(struct task_struct *p)
348{ 310{
349 struct task_group *tg; 311 struct task_group *tg;
350 312
351#ifdef CONFIG_USER_SCHED 313#ifdef CONFIG_CGROUP_SCHED
352 rcu_read_lock();
353 tg = __task_cred(p)->user->tg;
354 rcu_read_unlock();
355#elif defined(CONFIG_CGROUP_SCHED)
356 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), 314 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
357 struct task_group, css); 315 struct task_group, css);
358#else 316#else
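
With CONFIG_USER_SCHED gone, task_group() resolves a task's group only through the cgroup subsystem state embedded in struct task_group. A minimal user-space sketch of that container_of() idiom, using illustrative stand-in types rather than the kernel's:

#include <stddef.h>
#include <stdio.h>

struct css   { int id; };                            /* stand-in for cgroup_subsys_state */
struct group { const char *name; struct css css; };  /* stand-in for task_group */

/* Same trick as the kernel's container_of(): recover the enclosing
 * structure from a pointer to one of its members. */
#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

int main(void)
{
        struct group g = { .name = "cpu", .css = { .id = 7 } };
        struct css *cssp = &g.css;        /* what the cgroup core hands out */
        struct group *gp = container_of(cssp, struct group, css);

        printf("%s %d\n", gp->name, gp->css.id);   /* -> cpu 7 */
        return 0;
}
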
@@ -383,7 +341,7 @@ static inline struct task_group *task_group(struct task_struct *p)
383 return NULL; 341 return NULL;
384} 342}
385 343
386#endif /* CONFIG_GROUP_SCHED */ 344#endif /* CONFIG_CGROUP_SCHED */
387 345
388/* CFS-related fields in a runqueue */ 346/* CFS-related fields in a runqueue */
389struct cfs_rq { 347struct cfs_rq {
@@ -478,7 +436,6 @@ struct rt_rq {
478 struct rq *rq; 436 struct rq *rq;
479 struct list_head leaf_rt_rq_list; 437 struct list_head leaf_rt_rq_list;
480 struct task_group *tg; 438 struct task_group *tg;
481 struct sched_rt_entity *rt_se;
482#endif 439#endif
483}; 440};
484 441
@@ -1409,32 +1366,6 @@ static const u32 prio_to_wmult[40] = {
1409 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 1366 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1410}; 1367};
1411 1368
1412static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1413
1414/*
1415 * runqueue iterator, to support SMP load-balancing between different
1416 * scheduling classes, without having to expose their internal data
1417 * structures to the load-balancing proper:
1418 */
1419struct rq_iterator {
1420 void *arg;
1421 struct task_struct *(*start)(void *);
1422 struct task_struct *(*next)(void *);
1423};
1424
1425#ifdef CONFIG_SMP
1426static unsigned long
1427balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1428 unsigned long max_load_move, struct sched_domain *sd,
1429 enum cpu_idle_type idle, int *all_pinned,
1430 int *this_best_prio, struct rq_iterator *iterator);
1431
1432static int
1433iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1434 struct sched_domain *sd, enum cpu_idle_type idle,
1435 struct rq_iterator *iterator);
1436#endif
1437
1438/* Time spent by the tasks of the cpu accounting group executing in ... */ 1369/* Time spent by the tasks of the cpu accounting group executing in ... */
1439enum cpuacct_stat_index { 1370enum cpuacct_stat_index {
1440 CPUACCT_STAT_USER, /* ... user mode */ 1371 CPUACCT_STAT_USER, /* ... user mode */
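
The rq_iterator removed in this hunk let the generic balancing code walk a scheduling class's tasks through start/next callbacks without seeing the class's internal lists. A self-contained sketch of that callback-iterator shape over a plain array (illustrative types, not the kernel's):

#include <stdio.h>

/* Illustrative stand-in for the removed struct rq_iterator. */
struct iter {
        void *arg;
        int *(*start)(void *);
        int *(*next)(void *);
};

struct array_cursor { int *a; int n; int pos; };

static int *array_start(void *arg)
{
        struct array_cursor *c = arg;

        c->pos = 0;
        return c->n ? &c->a[0] : NULL;
}

static int *array_next(void *arg)
{
        struct array_cursor *c = arg;

        return ++c->pos < c->n ? &c->a[c->pos] : NULL;
}

int main(void)
{
        int vals[] = { 3, 1, 4 };
        struct array_cursor cur = { vals, 3, 0 };
        struct iter it = { &cur, array_start, array_next };

        /* Generic consumer: knows nothing about the container. */
        for (int *p = it.start(it.arg); p; p = it.next(it.arg))
                printf("%d\n", *p);
        return 0;
}
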
@@ -1720,16 +1651,6 @@ static void update_shares(struct sched_domain *sd)
1720 } 1651 }
1721} 1652}
1722 1653
1723static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1724{
1725 if (root_task_group_empty())
1726 return;
1727
1728 raw_spin_unlock(&rq->lock);
1729 update_shares(sd);
1730 raw_spin_lock(&rq->lock);
1731}
1732
1733static void update_h_load(long cpu) 1654static void update_h_load(long cpu)
1734{ 1655{
1735 if (root_task_group_empty()) 1656 if (root_task_group_empty())
@@ -1744,10 +1665,6 @@ static inline void update_shares(struct sched_domain *sd)
1744{ 1665{
1745} 1666}
1746 1667
1747static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1748{
1749}
1750
1751#endif 1668#endif
1752 1669
1753#ifdef CONFIG_PREEMPT 1670#ifdef CONFIG_PREEMPT
@@ -1824,6 +1741,51 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1824 raw_spin_unlock(&busiest->lock); 1741 raw_spin_unlock(&busiest->lock);
1825 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1742 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1826} 1743}
1744
1745/*
1746 * double_rq_lock - safely lock two runqueues
1747 *
1748 * Note this does not disable interrupts like task_rq_lock,
1749 * you need to do so manually before calling.
1750 */
1751static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1752 __acquires(rq1->lock)
1753 __acquires(rq2->lock)
1754{
1755 BUG_ON(!irqs_disabled());
1756 if (rq1 == rq2) {
1757 raw_spin_lock(&rq1->lock);
1758 __acquire(rq2->lock); /* Fake it out ;) */
1759 } else {
1760 if (rq1 < rq2) {
1761 raw_spin_lock(&rq1->lock);
1762 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1763 } else {
1764 raw_spin_lock(&rq2->lock);
1765 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1766 }
1767 }
1768 update_rq_clock(rq1);
1769 update_rq_clock(rq2);
1770}
1771
1772/*
1773 * double_rq_unlock - safely unlock two runqueues
1774 *
1775 * Note this does not restore interrupts like task_rq_unlock,
1776 * you need to do so manually after calling.
1777 */
1778static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1779 __releases(rq1->lock)
1780 __releases(rq2->lock)
1781{
1782 raw_spin_unlock(&rq1->lock);
1783 if (rq1 != rq2)
1784 raw_spin_unlock(&rq2->lock);
1785 else
1786 __release(rq2->lock);
1787}
1788
1827#endif 1789#endif
1828 1790
1829#ifdef CONFIG_FAIR_GROUP_SCHED 1791#ifdef CONFIG_FAIR_GROUP_SCHED
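
The double_rq_lock()/double_rq_unlock() pair moved here avoids ABBA deadlock by always acquiring the lower-addressed runqueue lock first, so two CPUs locking the same pair in opposite argument order still agree on acquisition order. The same rule sketched with pthread mutexes (illustrative user-space helpers, not kernel code; build with -lpthread):

#include <pthread.h>

/* Lock two mutexes in a globally consistent (address) order,
 * mirroring double_rq_lock()'s rq1 < rq2 rule. */
static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
        if (a == b) {
                pthread_mutex_lock(a);
                return;
        }
        if (a < b) {
                pthread_mutex_lock(a);
                pthread_mutex_lock(b);
        } else {
                pthread_mutex_lock(b);
                pthread_mutex_lock(a);
        }
}

static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
        pthread_mutex_unlock(a);
        if (a != b)
                pthread_mutex_unlock(b);
}

int main(void)
{
        pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
        pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

        lock_pair(&m2, &m1);   /* acquisition order is the same either way */
        unlock_pair(&m2, &m1);
        return 0;
}
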
@@ -1853,18 +1815,14 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1853#endif 1815#endif
1854} 1816}
1855 1817
1856#include "sched_stats.h" 1818static const struct sched_class rt_sched_class;
1857#include "sched_idletask.c"
1858#include "sched_fair.c"
1859#include "sched_rt.c"
1860#ifdef CONFIG_SCHED_DEBUG
1861# include "sched_debug.c"
1862#endif
1863 1819
1864#define sched_class_highest (&rt_sched_class) 1820#define sched_class_highest (&rt_sched_class)
1865#define for_each_class(class) \ 1821#define for_each_class(class) \
1866 for (class = sched_class_highest; class; class = class->next) 1822 for (class = sched_class_highest; class; class = class->next)
1867 1823
1824#include "sched_stats.h"
1825
1868static void inc_nr_running(struct rq *rq) 1826static void inc_nr_running(struct rq *rq)
1869{ 1827{
1870 rq->nr_running++; 1828 rq->nr_running++;
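
The rt_sched_class forward declaration together with sched_class_highest/for_each_class walks the scheduling classes as a priority-ordered, NULL-terminated singly linked list. A toy version of that walk, with made-up class names:

#include <stdio.h>

struct sched_class_like {
        const char *name;
        const struct sched_class_like *next;   /* lower priority, or NULL */
};

/* Made-up classes in priority order: rt -> fair -> idle. */
static const struct sched_class_like idle_cls = { "idle", NULL };
static const struct sched_class_like fair_cls = { "fair", &idle_cls };
static const struct sched_class_like rt_cls   = { "rt",   &fair_cls };

#define class_highest (&rt_cls)
#define for_each_class_like(c) \
        for ((c) = class_highest; (c); (c) = (c)->next)

int main(void)
{
        const struct sched_class_like *c;

        for_each_class_like(c)          /* visits rt, fair, idle in order */
                printf("%s\n", c->name);
        return 0;
}
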
@@ -1902,13 +1860,14 @@ static void update_avg(u64 *avg, u64 sample)
1902 *avg += diff >> 3; 1860 *avg += diff >> 3;
1903} 1861}
1904 1862
1905static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1863static void
1864enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1906{ 1865{
1907 if (wakeup) 1866 if (wakeup)
1908 p->se.start_runtime = p->se.sum_exec_runtime; 1867 p->se.start_runtime = p->se.sum_exec_runtime;
1909 1868
1910 sched_info_queued(p); 1869 sched_info_queued(p);
1911 p->sched_class->enqueue_task(rq, p, wakeup); 1870 p->sched_class->enqueue_task(rq, p, wakeup, head);
1912 p->se.on_rq = 1; 1871 p->se.on_rq = 1;
1913} 1872}
1914 1873
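
enqueue_task() now takes a bool head so callers can queue a task at the head of its runqueue list rather than the tail. A toy queue showing what that flag changes (illustrative list code, not the kernel's):

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct node  { int task; struct node *next; };
struct queue { struct node *head, *tail; };

/* Insert at the head when 'head' is true, otherwise at the tail,
 * mirroring the new enqueue_task(..., bool head) flag. */
static void enqueue(struct queue *q, int task, bool head)
{
        struct node *n = malloc(sizeof(*n));

        n->task = task;
        n->next = NULL;
        if (!q->head) {
                q->head = q->tail = n;
        } else if (head) {
                n->next = q->head;
                q->head = n;
        } else {
                q->tail->next = n;
                q->tail = n;
        }
}

int main(void)
{
        struct queue q = { NULL, NULL };

        enqueue(&q, 1, false);
        enqueue(&q, 2, false);
        enqueue(&q, 3, true);              /* jumps the queue */
        for (struct node *n = q.head; n; n = n->next)
                printf("%d\n", n->task);   /* 3 1 2 */
        return 0;
}
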
@@ -1931,6 +1890,37 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1931} 1890}
1932 1891
1933/* 1892/*
1893 * activate_task - move a task to the runqueue.
1894 */
1895static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1896{
1897 if (task_contributes_to_load(p))
1898 rq->nr_uninterruptible--;
1899
1900 enqueue_task(rq, p, wakeup, false);
1901 inc_nr_running(rq);
1902}
1903
1904/*
1905 * deactivate_task - remove a task from the runqueue.
1906 */
1907static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1908{
1909 if (task_contributes_to_load(p))
1910 rq->nr_uninterruptible++;
1911
1912 dequeue_task(rq, p, sleep);
1913 dec_nr_running(rq);
1914}
1915
1916#include "sched_idletask.c"
1917#include "sched_fair.c"
1918#include "sched_rt.c"
1919#ifdef CONFIG_SCHED_DEBUG
1920# include "sched_debug.c"
1921#endif
1922
1923/*
1934 * __normal_prio - return the priority that is based on the static prio 1924 * __normal_prio - return the priority that is based on the static prio
1935 */ 1925 */
1936static inline int __normal_prio(struct task_struct *p) 1926static inline int __normal_prio(struct task_struct *p)
@@ -1976,30 +1966,6 @@ static int effective_prio(struct task_struct *p)
1976 return p->prio; 1966 return p->prio;
1977} 1967}
1978 1968
1979/*
1980 * activate_task - move a task to the runqueue.
1981 */
1982static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1983{
1984 if (task_contributes_to_load(p))
1985 rq->nr_uninterruptible--;
1986
1987 enqueue_task(rq, p, wakeup);
1988 inc_nr_running(rq);
1989}
1990
1991/*
1992 * deactivate_task - remove a task from the runqueue.
1993 */
1994static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1995{
1996 if (task_contributes_to_load(p))
1997 rq->nr_uninterruptible++;
1998
1999 dequeue_task(rq, p, sleep);
2000 dec_nr_running(rq);
2001}
2002
2003/** 1969/**
2004 * task_curr - is this task currently executing on a CPU? 1970 * task_curr - is this task currently executing on a CPU?
2005 * @p: the task in question. 1971 * @p: the task in question.
@@ -3137,50 +3103,6 @@ static void update_cpu_load(struct rq *this_rq)
3137#ifdef CONFIG_SMP 3103#ifdef CONFIG_SMP
3138 3104
3139/* 3105/*
3140 * double_rq_lock - safely lock two runqueues
3141 *
3142 * Note this does not disable interrupts like task_rq_lock,
3143 * you need to do so manually before calling.
3144 */
3145static void double_rq_lock(struct rq *rq1, struct rq *rq2)
3146 __acquires(rq1->lock)
3147 __acquires(rq2->lock)
3148{
3149 BUG_ON(!irqs_disabled());
3150 if (rq1 == rq2) {
3151 raw_spin_lock(&rq1->lock);
3152 __acquire(rq2->lock); /* Fake it out ;) */
3153 } else {
3154 if (rq1 < rq2) {
3155 raw_spin_lock(&rq1->lock);
3156 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
3157 } else {
3158 raw_spin_lock(&rq2->lock);
3159 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
3160 }
3161 }
3162 update_rq_clock(rq1);
3163 update_rq_clock(rq2);
3164}
3165
3166/*
3167 * double_rq_unlock - safely unlock two runqueues
3168 *
3169 * Note this does not restore interrupts like task_rq_unlock,
3170 * you need to do so manually after calling.
3171 */
3172static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
3173 __releases(rq1->lock)
3174 __releases(rq2->lock)
3175{
3176 raw_spin_unlock(&rq1->lock);
3177 if (rq1 != rq2)
3178 raw_spin_unlock(&rq2->lock);
3179 else
3180 __release(rq2->lock);
3181}
3182
3183/*
3184 * sched_exec - execve() is a valuable balancing opportunity, because at 3106 * sched_exec - execve() is a valuable balancing opportunity, because at
3185 * this point the task has the smallest effective memory and cache footprint. 3107 * this point the task has the smallest effective memory and cache footprint.
3186 */ 3108 */
@@ -3228,1782 +3150,6 @@ again:
3228 task_rq_unlock(rq, &flags); 3150 task_rq_unlock(rq, &flags);
3229} 3151}
3230 3152
3231/*
3232 * pull_task - move a task from a remote runqueue to the local runqueue.
3233 * Both runqueues must be locked.
3234 */
3235static void pull_task(struct rq *src_rq, struct task_struct *p,
3236 struct rq *this_rq, int this_cpu)
3237{
3238 deactivate_task(src_rq, p, 0);
3239 set_task_cpu(p, this_cpu);
3240 activate_task(this_rq, p, 0);
3241 check_preempt_curr(this_rq, p, 0);
3242}
3243
3244/*
3245 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3246 */
3247static
3248int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3249 struct sched_domain *sd, enum cpu_idle_type idle,
3250 int *all_pinned)
3251{
3252 int tsk_cache_hot = 0;
3253 /*
3254 * We do not migrate tasks that are:
3255 * 1) running (obviously), or
3256 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3257 * 3) are cache-hot on their current CPU.
3258 */
3259 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
3260 schedstat_inc(p, se.nr_failed_migrations_affine);
3261 return 0;
3262 }
3263 *all_pinned = 0;
3264
3265 if (task_running(rq, p)) {
3266 schedstat_inc(p, se.nr_failed_migrations_running);
3267 return 0;
3268 }
3269
3270 /*
3271 * Aggressive migration if:
3272 * 1) task is cache cold, or
3273 * 2) too many balance attempts have failed.
3274 */
3275
3276 tsk_cache_hot = task_hot(p, rq->clock, sd);
3277 if (!tsk_cache_hot ||
3278 sd->nr_balance_failed > sd->cache_nice_tries) {
3279#ifdef CONFIG_SCHEDSTATS
3280 if (tsk_cache_hot) {
3281 schedstat_inc(sd, lb_hot_gained[idle]);
3282 schedstat_inc(p, se.nr_forced_migrations);
3283 }
3284#endif
3285 return 1;
3286 }
3287
3288 if (tsk_cache_hot) {
3289 schedstat_inc(p, se.nr_failed_migrations_hot);
3290 return 0;
3291 }
3292 return 1;
3293}
3294
3295static unsigned long
3296balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3297 unsigned long max_load_move, struct sched_domain *sd,
3298 enum cpu_idle_type idle, int *all_pinned,
3299 int *this_best_prio, struct rq_iterator *iterator)
3300{
3301 int loops = 0, pulled = 0, pinned = 0;
3302 struct task_struct *p;
3303 long rem_load_move = max_load_move;
3304
3305 if (max_load_move == 0)
3306 goto out;
3307
3308 pinned = 1;
3309
3310 /*
3311 * Start the load-balancing iterator:
3312 */
3313 p = iterator->start(iterator->arg);
3314next:
3315 if (!p || loops++ > sysctl_sched_nr_migrate)
3316 goto out;
3317
3318 if ((p->se.load.weight >> 1) > rem_load_move ||
3319 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3320 p = iterator->next(iterator->arg);
3321 goto next;
3322 }
3323
3324 pull_task(busiest, p, this_rq, this_cpu);
3325 pulled++;
3326 rem_load_move -= p->se.load.weight;
3327
3328#ifdef CONFIG_PREEMPT
3329 /*
3330 * NEWIDLE balancing is a source of latency, so preemptible kernels
3331 * will stop after the first task is pulled to minimize the critical
3332 * section.
3333 */
3334 if (idle == CPU_NEWLY_IDLE)
3335 goto out;
3336#endif
3337
3338 /*
3339 * We only want to steal up to the prescribed amount of weighted load.
3340 */
3341 if (rem_load_move > 0) {
3342 if (p->prio < *this_best_prio)
3343 *this_best_prio = p->prio;
3344 p = iterator->next(iterator->arg);
3345 goto next;
3346 }
3347out:
3348 /*
3349 * Right now, this is one of only two places pull_task() is called,
3350 * so we can safely collect pull_task() stats here rather than
3351 * inside pull_task().
3352 */
3353 schedstat_add(sd, lb_gained[idle], pulled);
3354
3355 if (all_pinned)
3356 *all_pinned = pinned;
3357
3358 return max_load_move - rem_load_move;
3359}
3360
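
balance_tasks() above pulls tasks until it has scanned sysctl_sched_nr_migrate candidates or spent the weighted-load budget, skipping any task whose weight is more than twice what is left. The same budgeted loop over an array of weights (made-up numbers, illustrative only):

#include <stdio.h>

/* Pull items until the weight budget is spent or too many candidates
 * have been scanned; returns the weight actually moved. */
static long pull_up_to(const long *weight, int n, long budget, int max_scan)
{
        long moved = 0;

        for (int i = 0, scanned = 0;
             i < n && scanned++ < max_scan && budget > 0; i++) {
                if (weight[i] / 2 > budget)   /* over twice the remainder: skip */
                        continue;
                moved += weight[i];
                budget -= weight[i];
        }
        return moved;
}

int main(void)
{
        long w[] = { 1024, 512, 2048, 256 };

        /* Pulls 1024 and 512 (1536 total), then the budget is exhausted. */
        printf("%ld\n", pull_up_to(w, 4, 1500, 32));
        return 0;
}
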
3361/*
3362 * move_tasks tries to move up to max_load_move weighted load from busiest to
3363 * this_rq, as part of a balancing operation within domain "sd".
3364 * Returns 1 if successful and 0 otherwise.
3365 *
3366 * Called with both runqueues locked.
3367 */
3368static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3369 unsigned long max_load_move,
3370 struct sched_domain *sd, enum cpu_idle_type idle,
3371 int *all_pinned)
3372{
3373 const struct sched_class *class = sched_class_highest;
3374 unsigned long total_load_moved = 0;
3375 int this_best_prio = this_rq->curr->prio;
3376
3377 do {
3378 total_load_moved +=
3379 class->load_balance(this_rq, this_cpu, busiest,
3380 max_load_move - total_load_moved,
3381 sd, idle, all_pinned, &this_best_prio);
3382 class = class->next;
3383
3384#ifdef CONFIG_PREEMPT
3385 /*
3386 * NEWIDLE balancing is a source of latency, so preemptible
3387 * kernels will stop after the first task is pulled to minimize
3388 * the critical section.
3389 */
3390 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3391 break;
3392#endif
3393 } while (class && max_load_move > total_load_moved);
3394
3395 return total_load_moved > 0;
3396}
3397
3398static int
3399iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3400 struct sched_domain *sd, enum cpu_idle_type idle,
3401 struct rq_iterator *iterator)
3402{
3403 struct task_struct *p = iterator->start(iterator->arg);
3404 int pinned = 0;
3405
3406 while (p) {
3407 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3408 pull_task(busiest, p, this_rq, this_cpu);
3409 /*
3410 * Right now, this is only the second place pull_task()
3411 * is called, so we can safely collect pull_task()
3412 * stats here rather than inside pull_task().
3413 */
3414 schedstat_inc(sd, lb_gained[idle]);
3415
3416 return 1;
3417 }
3418 p = iterator->next(iterator->arg);
3419 }
3420
3421 return 0;
3422}
3423
3424/*
3425 * move_one_task tries to move exactly one task from busiest to this_rq, as
3426 * part of active balancing operations within "domain".
3427 * Returns 1 if successful and 0 otherwise.
3428 *
3429 * Called with both runqueues locked.
3430 */
3431static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3432 struct sched_domain *sd, enum cpu_idle_type idle)
3433{
3434 const struct sched_class *class;
3435
3436 for_each_class(class) {
3437 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3438 return 1;
3439 }
3440
3441 return 0;
3442}
3443/********** Helpers for find_busiest_group ************************/
3444/*
3445 * sd_lb_stats - Structure to store the statistics of a sched_domain
3446 * during load balancing.
3447 */
3448struct sd_lb_stats {
3449 struct sched_group *busiest; /* Busiest group in this sd */
3450 struct sched_group *this; /* Local group in this sd */
3451 unsigned long total_load; /* Total load of all groups in sd */
3452 unsigned long total_pwr; /* Total power of all groups in sd */
3453 unsigned long avg_load; /* Average load across all groups in sd */
3454
3455 /** Statistics of this group */
3456 unsigned long this_load;
3457 unsigned long this_load_per_task;
3458 unsigned long this_nr_running;
3459
3460 /* Statistics of the busiest group */
3461 unsigned long max_load;
3462 unsigned long busiest_load_per_task;
3463 unsigned long busiest_nr_running;
3464
3465 int group_imb; /* Is there imbalance in this sd */
3466#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3467 int power_savings_balance; /* Is powersave balance needed for this sd */
3468 struct sched_group *group_min; /* Least loaded group in sd */
3469 struct sched_group *group_leader; /* Group which relieves group_min */
3470 unsigned long min_load_per_task; /* load_per_task in group_min */
3471 unsigned long leader_nr_running; /* Nr running of group_leader */
3472 unsigned long min_nr_running; /* Nr running of group_min */
3473#endif
3474};
3475
3476/*
3477 * sg_lb_stats - stats of a sched_group required for load_balancing
3478 */
3479struct sg_lb_stats {
3480 unsigned long avg_load; /*Avg load across the CPUs of the group */
3481 unsigned long group_load; /* Total load over the CPUs of the group */
3482 unsigned long sum_nr_running; /* Nr tasks running in the group */
3483 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
3484 unsigned long group_capacity;
3485 int group_imb; /* Is there an imbalance in the group ? */
3486};
3487
3488/**
3489 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3490 * @group: The group whose first cpu is to be returned.
3491 */
3492static inline unsigned int group_first_cpu(struct sched_group *group)
3493{
3494 return cpumask_first(sched_group_cpus(group));
3495}
3496
3497/**
3498 * get_sd_load_idx - Obtain the load index for a given sched domain.
3499 * @sd: The sched_domain whose load_idx is to be obtained.
3500 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
3501 */
3502static inline int get_sd_load_idx(struct sched_domain *sd,
3503 enum cpu_idle_type idle)
3504{
3505 int load_idx;
3506
3507 switch (idle) {
3508 case CPU_NOT_IDLE:
3509 load_idx = sd->busy_idx;
3510 break;
3511
3512 case CPU_NEWLY_IDLE:
3513 load_idx = sd->newidle_idx;
3514 break;
3515 default:
3516 load_idx = sd->idle_idx;
3517 break;
3518 }
3519
3520 return load_idx;
3521}
3522
3523
3524#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3525/**
3526 * init_sd_power_savings_stats - Initialize power savings statistics for
3527 * the given sched_domain, during load balancing.
3528 *
3529 * @sd: Sched domain whose power-savings statistics are to be initialized.
3530 * @sds: Variable containing the statistics for sd.
3531 * @idle: Idle status of the CPU at which we're performing load-balancing.
3532 */
3533static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3534 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3535{
3536 /*
3537 * Busy processors will not participate in power savings
3538 * balance.
3539 */
3540 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3541 sds->power_savings_balance = 0;
3542 else {
3543 sds->power_savings_balance = 1;
3544 sds->min_nr_running = ULONG_MAX;
3545 sds->leader_nr_running = 0;
3546 }
3547}
3548
3549/**
3550 * update_sd_power_savings_stats - Update the power saving stats for a
3551 * sched_domain while performing load balancing.
3552 *
3553 * @group: sched_group belonging to the sched_domain under consideration.
3554 * @sds: Variable containing the statistics of the sched_domain
3555 * @local_group: Does group contain the CPU for which we're performing
3556 * load balancing ?
3557 * @sgs: Variable containing the statistics of the group.
3558 */
3559static inline void update_sd_power_savings_stats(struct sched_group *group,
3560 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3561{
3562
3563 if (!sds->power_savings_balance)
3564 return;
3565
3566 /*
3567 * If the local group is idle or completely loaded
3568 * no need to do power savings balance at this domain
3569 */
3570 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3571 !sds->this_nr_running))
3572 sds->power_savings_balance = 0;
3573
3574 /*
3575 * If a group is already running at full capacity or idle,
3576 * don't include that group in power savings calculations
3577 */
3578 if (!sds->power_savings_balance ||
3579 sgs->sum_nr_running >= sgs->group_capacity ||
3580 !sgs->sum_nr_running)
3581 return;
3582
3583 /*
3584 * Calculate the group which has the least non-idle load.
3585 * This is the group from where we need to pick up the load
3586 * for saving power
3587 */
3588 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3589 (sgs->sum_nr_running == sds->min_nr_running &&
3590 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3591 sds->group_min = group;
3592 sds->min_nr_running = sgs->sum_nr_running;
3593 sds->min_load_per_task = sgs->sum_weighted_load /
3594 sgs->sum_nr_running;
3595 }
3596
3597 /*
3598 * Calculate the group which is almost near its
3599 * capacity but still has some space to pick up some load
3600 * from other group and save more power
3601 */
3602 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3603 return;
3604
3605 if (sgs->sum_nr_running > sds->leader_nr_running ||
3606 (sgs->sum_nr_running == sds->leader_nr_running &&
3607 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3608 sds->group_leader = group;
3609 sds->leader_nr_running = sgs->sum_nr_running;
3610 }
3611}
3612
3613/**
3614 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3615 * @sds: Variable containing the statistics of the sched_domain
3616 * under consideration.
3617 * @this_cpu: Cpu at which we're currently performing load-balancing.
3618 * @imbalance: Variable to store the imbalance.
3619 *
3620 * Description:
3621 * Check if we have potential to perform some power-savings balance.
3622 * If yes, set the busiest group to be the least loaded group in the
3623 * sched_domain, so that it's CPUs can be put to idle.
3624 *
3625 * Returns 1 if there is potential to perform power-savings balance.
3626 * Else returns 0.
3627 */
3628static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3629 int this_cpu, unsigned long *imbalance)
3630{
3631 if (!sds->power_savings_balance)
3632 return 0;
3633
3634 if (sds->this != sds->group_leader ||
3635 sds->group_leader == sds->group_min)
3636 return 0;
3637
3638 *imbalance = sds->min_load_per_task;
3639 sds->busiest = sds->group_min;
3640
3641 return 1;
3642
3643}
3644#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3645static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3646 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3647{
3648 return;
3649}
3650
3651static inline void update_sd_power_savings_stats(struct sched_group *group,
3652 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3653{
3654 return;
3655}
3656
3657static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3658 int this_cpu, unsigned long *imbalance)
3659{
3660 return 0;
3661}
3662#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3663
3664
3665unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3666{
3667 return SCHED_LOAD_SCALE;
3668}
3669
3670unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3671{
3672 return default_scale_freq_power(sd, cpu);
3673}
3674
3675unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3676{
3677 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3678 unsigned long smt_gain = sd->smt_gain;
3679
3680 smt_gain /= weight;
3681
3682 return smt_gain;
3683}
3684
3685unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3686{
3687 return default_scale_smt_power(sd, cpu);
3688}
3689
3690unsigned long scale_rt_power(int cpu)
3691{
3692 struct rq *rq = cpu_rq(cpu);
3693 u64 total, available;
3694
3695 sched_avg_update(rq);
3696
3697 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3698 available = total - rq->rt_avg;
3699
3700 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3701 total = SCHED_LOAD_SCALE;
3702
3703 total >>= SCHED_LOAD_SHIFT;
3704
3705 return div_u64(available, total);
3706}
3707
3708static void update_cpu_power(struct sched_domain *sd, int cpu)
3709{
3710 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3711 unsigned long power = SCHED_LOAD_SCALE;
3712 struct sched_group *sdg = sd->groups;
3713
3714 if (sched_feat(ARCH_POWER))
3715 power *= arch_scale_freq_power(sd, cpu);
3716 else
3717 power *= default_scale_freq_power(sd, cpu);
3718
3719 power >>= SCHED_LOAD_SHIFT;
3720
3721 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3722 if (sched_feat(ARCH_POWER))
3723 power *= arch_scale_smt_power(sd, cpu);
3724 else
3725 power *= default_scale_smt_power(sd, cpu);
3726
3727 power >>= SCHED_LOAD_SHIFT;
3728 }
3729
3730 power *= scale_rt_power(cpu);
3731 power >>= SCHED_LOAD_SHIFT;
3732
3733 if (!power)
3734 power = 1;
3735
3736 sdg->cpu_power = power;
3737}
3738
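
update_cpu_power() above folds the frequency, SMT and RT-time factors together in SCHED_LOAD_SCALE fixed point: each factor is roughly "fraction of a nominal CPU" times 1024, and every multiplication is followed by a shift back down by SCHED_LOAD_SHIFT. A worked sketch with made-up factor values:

#include <stdio.h>

#define LOAD_SHIFT 10
#define LOAD_SCALE (1UL << LOAD_SHIFT)   /* 1024 == "1.0" */

int main(void)
{
        /* Illustrative numbers only. */
        unsigned long power = LOAD_SCALE;
        unsigned long freq_factor = 1024;  /* running at full frequency: 1.0  */
        unsigned long smt_factor  = 589;   /* smt_gain split over 2 siblings  */
        unsigned long rt_factor   = 768;   /* RT tasks ate 25% of the period  */

        power = (power * freq_factor) >> LOAD_SHIFT;   /* 1024 */
        power = (power * smt_factor)  >> LOAD_SHIFT;   /* 589  */
        power = (power * rt_factor)   >> LOAD_SHIFT;   /* 441  */

        printf("%lu\n", power);   /* ~0.43 of one nominal CPU */
        return 0;
}
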
3739static void update_group_power(struct sched_domain *sd, int cpu)
3740{
3741 struct sched_domain *child = sd->child;
3742 struct sched_group *group, *sdg = sd->groups;
3743 unsigned long power;
3744
3745 if (!child) {
3746 update_cpu_power(sd, cpu);
3747 return;
3748 }
3749
3750 power = 0;
3751
3752 group = child->groups;
3753 do {
3754 power += group->cpu_power;
3755 group = group->next;
3756 } while (group != child->groups);
3757
3758 sdg->cpu_power = power;
3759}
3760
3761/**
3762 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3763 * @sd: The sched_domain whose statistics are to be updated.
3764 * @group: sched_group whose statistics are to be updated.
3765 * @this_cpu: Cpu for which load balance is currently performed.
3766 * @idle: Idle status of this_cpu
3767 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3768 * @sd_idle: Idle status of the sched_domain containing group.
3769 * @local_group: Does group contain this_cpu.
3770 * @cpus: Set of cpus considered for load balancing.
3771 * @balance: Should we balance.
3772 * @sgs: variable to hold the statistics for this group.
3773 */
3774static inline void update_sg_lb_stats(struct sched_domain *sd,
3775 struct sched_group *group, int this_cpu,
3776 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3777 int local_group, const struct cpumask *cpus,
3778 int *balance, struct sg_lb_stats *sgs)
3779{
3780 unsigned long load, max_cpu_load, min_cpu_load;
3781 int i;
3782 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3783 unsigned long sum_avg_load_per_task;
3784 unsigned long avg_load_per_task;
3785
3786 if (local_group) {
3787 balance_cpu = group_first_cpu(group);
3788 if (balance_cpu == this_cpu)
3789 update_group_power(sd, this_cpu);
3790 }
3791
3792 /* Tally up the load of all CPUs in the group */
3793 sum_avg_load_per_task = avg_load_per_task = 0;
3794 max_cpu_load = 0;
3795 min_cpu_load = ~0UL;
3796
3797 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3798 struct rq *rq = cpu_rq(i);
3799
3800 if (*sd_idle && rq->nr_running)
3801 *sd_idle = 0;
3802
3803 /* Bias balancing toward cpus of our domain */
3804 if (local_group) {
3805 if (idle_cpu(i) && !first_idle_cpu) {
3806 first_idle_cpu = 1;
3807 balance_cpu = i;
3808 }
3809
3810 load = target_load(i, load_idx);
3811 } else {
3812 load = source_load(i, load_idx);
3813 if (load > max_cpu_load)
3814 max_cpu_load = load;
3815 if (min_cpu_load > load)
3816 min_cpu_load = load;
3817 }
3818
3819 sgs->group_load += load;
3820 sgs->sum_nr_running += rq->nr_running;
3821 sgs->sum_weighted_load += weighted_cpuload(i);
3822
3823 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3824 }
3825
3826 /*
3827 * First idle cpu or the first cpu(busiest) in this sched group
3828 * is eligible for doing load balancing at this and above
3829 * domains. In the newly idle case, we will allow all the cpu's
3830 * to do the newly idle load balance.
3831 */
3832 if (idle != CPU_NEWLY_IDLE && local_group &&
3833 balance_cpu != this_cpu && balance) {
3834 *balance = 0;
3835 return;
3836 }
3837
3838 /* Adjust by relative CPU power of the group */
3839 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
3840
3841
3842 /*
3843 * Consider the group unbalanced when the imbalance is larger
3844 * than the average weight of two tasks.
3845 *
3846 * APZ: with cgroup the avg task weight can vary wildly and
3847 * might not be a suitable number - should we keep a
3848 * normalized nr_running number somewhere that negates
3849 * the hierarchy?
3850 */
3851 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
3852 group->cpu_power;
3853
3854 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3855 sgs->group_imb = 1;
3856
3857 sgs->group_capacity =
3858 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
3859}
3860
3861/**
3862 * update_sd_lb_stats - Update sched_group's statistics for load balancing.
3863 * @sd: sched_domain whose statistics are to be updated.
3864 * @this_cpu: Cpu for which load balance is currently performed.
3865 * @idle: Idle status of this_cpu
3866 * @sd_idle: Idle status of the sched_domain containing group.
3867 * @cpus: Set of cpus considered for load balancing.
3868 * @balance: Should we balance.
3869 * @sds: variable to hold the statistics for this sched_domain.
3870 */
3871static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3872 enum cpu_idle_type idle, int *sd_idle,
3873 const struct cpumask *cpus, int *balance,
3874 struct sd_lb_stats *sds)
3875{
3876 struct sched_domain *child = sd->child;
3877 struct sched_group *group = sd->groups;
3878 struct sg_lb_stats sgs;
3879 int load_idx, prefer_sibling = 0;
3880
3881 if (child && child->flags & SD_PREFER_SIBLING)
3882 prefer_sibling = 1;
3883
3884 init_sd_power_savings_stats(sd, sds, idle);
3885 load_idx = get_sd_load_idx(sd, idle);
3886
3887 do {
3888 int local_group;
3889
3890 local_group = cpumask_test_cpu(this_cpu,
3891 sched_group_cpus(group));
3892 memset(&sgs, 0, sizeof(sgs));
3893 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
3894 local_group, cpus, balance, &sgs);
3895
3896 if (local_group && balance && !(*balance))
3897 return;
3898
3899 sds->total_load += sgs.group_load;
3900 sds->total_pwr += group->cpu_power;
3901
3902 /*
3903 * In case the child domain prefers tasks go to siblings
3904 * first, lower the group capacity to one so that we'll try
3905 * and move all the excess tasks away.
3906 */
3907 if (prefer_sibling)
3908 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3909
3910 if (local_group) {
3911 sds->this_load = sgs.avg_load;
3912 sds->this = group;
3913 sds->this_nr_running = sgs.sum_nr_running;
3914 sds->this_load_per_task = sgs.sum_weighted_load;
3915 } else if (sgs.avg_load > sds->max_load &&
3916 (sgs.sum_nr_running > sgs.group_capacity ||
3917 sgs.group_imb)) {
3918 sds->max_load = sgs.avg_load;
3919 sds->busiest = group;
3920 sds->busiest_nr_running = sgs.sum_nr_running;
3921 sds->busiest_load_per_task = sgs.sum_weighted_load;
3922 sds->group_imb = sgs.group_imb;
3923 }
3924
3925 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3926 group = group->next;
3927 } while (group != sd->groups);
3928}
3929
3930/**
3931 * fix_small_imbalance - Calculate the minor imbalance that exists
3932 * amongst the groups of a sched_domain, during
3933 * load balancing.
3934 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3935 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3936 * @imbalance: Variable to store the imbalance.
3937 */
3938static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3939 int this_cpu, unsigned long *imbalance)
3940{
3941 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3942 unsigned int imbn = 2;
3943
3944 if (sds->this_nr_running) {
3945 sds->this_load_per_task /= sds->this_nr_running;
3946 if (sds->busiest_load_per_task >
3947 sds->this_load_per_task)
3948 imbn = 1;
3949 } else
3950 sds->this_load_per_task =
3951 cpu_avg_load_per_task(this_cpu);
3952
3953 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
3954 sds->busiest_load_per_task * imbn) {
3955 *imbalance = sds->busiest_load_per_task;
3956 return;
3957 }
3958
3959 /*
3960 * OK, we don't have enough imbalance to justify moving tasks,
3961 * however we may be able to increase total CPU power used by
3962 * moving them.
3963 */
3964
3965 pwr_now += sds->busiest->cpu_power *
3966 min(sds->busiest_load_per_task, sds->max_load);
3967 pwr_now += sds->this->cpu_power *
3968 min(sds->this_load_per_task, sds->this_load);
3969 pwr_now /= SCHED_LOAD_SCALE;
3970
3971 /* Amount of load we'd subtract */
3972 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3973 sds->busiest->cpu_power;
3974 if (sds->max_load > tmp)
3975 pwr_move += sds->busiest->cpu_power *
3976 min(sds->busiest_load_per_task, sds->max_load - tmp);
3977
3978 /* Amount of load we'd add */
3979 if (sds->max_load * sds->busiest->cpu_power <
3980 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3981 tmp = (sds->max_load * sds->busiest->cpu_power) /
3982 sds->this->cpu_power;
3983 else
3984 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3985 sds->this->cpu_power;
3986 pwr_move += sds->this->cpu_power *
3987 min(sds->this_load_per_task, sds->this_load + tmp);
3988 pwr_move /= SCHED_LOAD_SCALE;
3989
3990 /* Move if we gain throughput */
3991 if (pwr_move > pwr_now)
3992 *imbalance = sds->busiest_load_per_task;
3993}
3994
3995/**
3996 * calculate_imbalance - Calculate the amount of imbalance present within the
3997 * groups of a given sched_domain during load balance.
3998 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
3999 * @this_cpu: Cpu for which currently load balance is being performed.
4000 * @imbalance: The variable to store the imbalance.
4001 */
4002static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4003 unsigned long *imbalance)
4004{
4005 unsigned long max_pull;
4006 /*
4007 * In the presence of smp nice balancing, certain scenarios can have
4008 * max load less than avg load(as we skip the groups at or below
4009 * its cpu_power, while calculating max_load..)
4010 */
4011 if (sds->max_load < sds->avg_load) {
4012 *imbalance = 0;
4013 return fix_small_imbalance(sds, this_cpu, imbalance);
4014 }
4015
4016 /* Don't want to pull so many tasks that a group would go idle */
4017 max_pull = min(sds->max_load - sds->avg_load,
4018 sds->max_load - sds->busiest_load_per_task);
4019
4020 /* How much load to actually move to equalise the imbalance */
4021 *imbalance = min(max_pull * sds->busiest->cpu_power,
4022 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
4023 / SCHED_LOAD_SCALE;
4024
4025 /*
4026 * if *imbalance is less than the average load per runnable task
4027 * there is no gaurantee that any tasks will be moved so we'll have
4028 * a think about bumping its value to force at least one task to be
4029 * moved
4030 */
4031 if (*imbalance < sds->busiest_load_per_task)
4032 return fix_small_imbalance(sds, this_cpu, imbalance);
4033
4034}
4035/******* find_busiest_group() helpers end here *********************/
4036
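
calculate_imbalance() above moves the smaller of what the busiest group can shed without dropping below the domain average and what the local group can absorb before exceeding it, each weighted by the group's cpu_power. A worked example with made-up loads (all values in weighted-load units, illustrative only):

#include <stdio.h>

#define LOAD_SCALE 1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
        return a < b ? a : b;
}

int main(void)
{
        /* Hypothetical group statistics. */
        unsigned long max_load = 3072, avg_load = 2048, this_load = 1024;
        unsigned long busiest_load_per_task = 512;
        unsigned long busiest_power = 1024, this_power = 1024;

        /* Don't pull so much that the busiest group would go idle. */
        unsigned long max_pull = min_ul(max_load - avg_load,
                                        max_load - busiest_load_per_task);  /* 1024 */

        unsigned long imbalance = min_ul(max_pull * busiest_power,
                                         (avg_load - this_load) * this_power)
                                  / LOAD_SCALE;

        printf("%lu\n", imbalance);   /* 1024: one nice-0 task worth of load */
        return 0;
}
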
4037/**
4038 * find_busiest_group - Returns the busiest group within the sched_domain
4039 * if there is an imbalance. If there isn't an imbalance, and
4040 * the user has opted for power-savings, it returns a group whose
4041 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
4042 * such a group exists.
4043 *
4044 * Also calculates the amount of weighted load which should be moved
4045 * to restore balance.
4046 *
4047 * @sd: The sched_domain whose busiest group is to be returned.
4048 * @this_cpu: The cpu for which load balancing is currently being performed.
4049 * @imbalance: Variable which stores amount of weighted load which should
4050 * be moved to restore balance/put a group to idle.
4051 * @idle: The idle status of this_cpu.
4052 * @sd_idle: The idleness of sd
4053 * @cpus: The set of CPUs under consideration for load-balancing.
4054 * @balance: Pointer to a variable indicating if this_cpu
4055 * is the appropriate cpu to perform load balancing at this_level.
4056 *
4057 * Returns: - the busiest group if imbalance exists.
4058 * - If no imbalance and user has opted for power-savings balance,
4059 * return the least loaded group whose CPUs can be
4060 * put to idle by rebalancing its tasks onto our group.
4061 */
4062static struct sched_group *
4063find_busiest_group(struct sched_domain *sd, int this_cpu,
4064 unsigned long *imbalance, enum cpu_idle_type idle,
4065 int *sd_idle, const struct cpumask *cpus, int *balance)
4066{
4067 struct sd_lb_stats sds;
4068
4069 memset(&sds, 0, sizeof(sds));
4070
4071 /*
4072 * Compute the various statistics relavent for load balancing at
4073 * this level.
4074 */
4075 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
4076 balance, &sds);
4077
4078 /* Cases where imbalance does not exist from POV of this_cpu */
4079 /* 1) this_cpu is not the appropriate cpu to perform load balancing
4080 * at this level.
4081 * 2) There is no busy sibling group to pull from.
4082 * 3) This group is the busiest group.
4083 * 4) This group is more busy than the avg busieness at this
4084 * sched_domain.
4085 * 5) The imbalance is within the specified limit.
4086 * 6) Any rebalance would lead to ping-pong
4087 */
4088 if (balance && !(*balance))
4089 goto ret;
4090
4091 if (!sds.busiest || sds.busiest_nr_running == 0)
4092 goto out_balanced;
4093
4094 if (sds.this_load >= sds.max_load)
4095 goto out_balanced;
4096
4097 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
4098
4099 if (sds.this_load >= sds.avg_load)
4100 goto out_balanced;
4101
4102 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
4103 goto out_balanced;
4104
4105 sds.busiest_load_per_task /= sds.busiest_nr_running;
4106 if (sds.group_imb)
4107 sds.busiest_load_per_task =
4108 min(sds.busiest_load_per_task, sds.avg_load);
4109
4110 /*
4111 * We're trying to get all the cpus to the average_load, so we don't
4112 * want to push ourselves above the average load, nor do we wish to
4113 * reduce the max loaded cpu below the average load, as either of these
4114 * actions would just result in more rebalancing later, and ping-pong
4115 * tasks around. Thus we look for the minimum possible imbalance.
4116 * Negative imbalances (*we* are more loaded than anyone else) will
4117 * be counted as no imbalance for these purposes -- we can't fix that
4118 * by pulling tasks to us. Be careful of negative numbers as they'll
4119 * appear as very large values with unsigned longs.
4120 */
4121 if (sds.max_load <= sds.busiest_load_per_task)
4122 goto out_balanced;
4123
4124 /* Looks like there is an imbalance. Compute it */
4125 calculate_imbalance(&sds, this_cpu, imbalance);
4126 return sds.busiest;
4127
4128out_balanced:
4129 /*
4130 * There is no obvious imbalance. But check if we can do some balancing
4131 * to save power.
4132 */
4133 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
4134 return sds.busiest;
4135ret:
4136 *imbalance = 0;
4137 return NULL;
4138}
4139
4140/*
4141 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4142 */
4143static struct rq *
4144find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
4145 unsigned long imbalance, const struct cpumask *cpus)
4146{
4147 struct rq *busiest = NULL, *rq;
4148 unsigned long max_load = 0;
4149 int i;
4150
4151 for_each_cpu(i, sched_group_cpus(group)) {
4152 unsigned long power = power_of(i);
4153 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
4154 unsigned long wl;
4155
4156 if (!cpumask_test_cpu(i, cpus))
4157 continue;
4158
4159 rq = cpu_rq(i);
4160 wl = weighted_cpuload(i);
4161
4162 /*
4163 * When comparing with imbalance, use weighted_cpuload()
4164 * which is not scaled with the cpu power.
4165 */
4166 if (capacity && rq->nr_running == 1 && wl > imbalance)
4167 continue;
4168
4169 /*
4170 * For the load comparisons with the other cpu's, consider
4171 * the weighted_cpuload() scaled with the cpu power, so that
4172 * the load can be moved away from the cpu that is potentially
4173 * running at a lower capacity.
4174 */
4175 wl = (wl * SCHED_LOAD_SCALE) / power;
4176
4177 if (wl > max_load) {
4178 max_load = wl;
4179 busiest = rq;
4180 }
4181 }
4182
4183 return busiest;
4184}
4185
4186/*
4187 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
4188 * so long as it is large enough.
4189 */
4190#define MAX_PINNED_INTERVAL 512
4191
4192/* Working cpumask for load_balance and load_balance_newidle. */
4193static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4194
4195/*
4196 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4197 * tasks if there is an imbalance.
4198 */
4199static int load_balance(int this_cpu, struct rq *this_rq,
4200 struct sched_domain *sd, enum cpu_idle_type idle,
4201 int *balance)
4202{
4203 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
4204 struct sched_group *group;
4205 unsigned long imbalance;
4206 struct rq *busiest;
4207 unsigned long flags;
4208 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4209
4210 cpumask_copy(cpus, cpu_active_mask);
4211
4212 /*
4213 * When power savings policy is enabled for the parent domain, idle
4214 * sibling can pick up load irrespective of busy siblings. In this case,
4215 * let the state of idle sibling percolate up as CPU_IDLE, instead of
4216 * portraying it as CPU_NOT_IDLE.
4217 */
4218 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
4219 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4220 sd_idle = 1;
4221
4222 schedstat_inc(sd, lb_count[idle]);
4223
4224redo:
4225 update_shares(sd);
4226 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
4227 cpus, balance);
4228
4229 if (*balance == 0)
4230 goto out_balanced;
4231
4232 if (!group) {
4233 schedstat_inc(sd, lb_nobusyg[idle]);
4234 goto out_balanced;
4235 }
4236
4237 busiest = find_busiest_queue(group, idle, imbalance, cpus);
4238 if (!busiest) {
4239 schedstat_inc(sd, lb_nobusyq[idle]);
4240 goto out_balanced;
4241 }
4242
4243 BUG_ON(busiest == this_rq);
4244
4245 schedstat_add(sd, lb_imbalance[idle], imbalance);
4246
4247 ld_moved = 0;
4248 if (busiest->nr_running > 1) {
4249 /*
4250 * Attempt to move tasks. If find_busiest_group has found
4251 * an imbalance but busiest->nr_running <= 1, the group is
4252 * still unbalanced. ld_moved simply stays zero, so it is
4253 * correctly treated as an imbalance.
4254 */
4255 local_irq_save(flags);
4256 double_rq_lock(this_rq, busiest);
4257 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4258 imbalance, sd, idle, &all_pinned);
4259 double_rq_unlock(this_rq, busiest);
4260 local_irq_restore(flags);
4261
4262 /*
4263 * some other cpu did the load balance for us.
4264 */
4265 if (ld_moved && this_cpu != smp_processor_id())
4266 resched_cpu(this_cpu);
4267
4268 /* All tasks on this runqueue were pinned by CPU affinity */
4269 if (unlikely(all_pinned)) {
4270 cpumask_clear_cpu(cpu_of(busiest), cpus);
4271 if (!cpumask_empty(cpus))
4272 goto redo;
4273 goto out_balanced;
4274 }
4275 }
4276
4277 if (!ld_moved) {
4278 schedstat_inc(sd, lb_failed[idle]);
4279 sd->nr_balance_failed++;
4280
4281 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
4282
4283 raw_spin_lock_irqsave(&busiest->lock, flags);
4284
4285 /* don't kick the migration_thread, if the curr
4286 * task on busiest cpu can't be moved to this_cpu
4287 */
4288 if (!cpumask_test_cpu(this_cpu,
4289 &busiest->curr->cpus_allowed)) {
4290 raw_spin_unlock_irqrestore(&busiest->lock,
4291 flags);
4292 all_pinned = 1;
4293 goto out_one_pinned;
4294 }
4295
4296 if (!busiest->active_balance) {
4297 busiest->active_balance = 1;
4298 busiest->push_cpu = this_cpu;
4299 active_balance = 1;
4300 }
4301 raw_spin_unlock_irqrestore(&busiest->lock, flags);
4302 if (active_balance)
4303 wake_up_process(busiest->migration_thread);
4304
4305 /*
4306 * We've kicked active balancing, reset the failure
4307 * counter.
4308 */
4309 sd->nr_balance_failed = sd->cache_nice_tries+1;
4310 }
4311 } else
4312 sd->nr_balance_failed = 0;
4313
4314 if (likely(!active_balance)) {
4315 /* We were unbalanced, so reset the balancing interval */
4316 sd->balance_interval = sd->min_interval;
4317 } else {
4318 /*
4319 * If we've begun active balancing, start to back off. This
4320 * case may not be covered by the all_pinned logic if there
4321 * is only 1 task on the busy runqueue (because we don't call
4322 * move_tasks).
4323 */
4324 if (sd->balance_interval < sd->max_interval)
4325 sd->balance_interval *= 2;
4326 }
4327
4328 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4329 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4330 ld_moved = -1;
4331
4332 goto out;
4333
4334out_balanced:
4335 schedstat_inc(sd, lb_balanced[idle]);
4336
4337 sd->nr_balance_failed = 0;
4338
4339out_one_pinned:
4340 /* tune up the balancing interval */
4341 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
4342 (sd->balance_interval < sd->max_interval))
4343 sd->balance_interval *= 2;
4344
4345 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4346 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4347 ld_moved = -1;
4348 else
4349 ld_moved = 0;
4350out:
4351 if (ld_moved)
4352 update_shares(sd);
4353 return ld_moved;
4354}
4355
4356/*
4357 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4358 * tasks if there is an imbalance.
4359 *
4360 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
4361 * this_rq is locked.
4362 */
4363static int
4364load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4365{
4366 struct sched_group *group;
4367 struct rq *busiest = NULL;
4368 unsigned long imbalance;
4369 int ld_moved = 0;
4370 int sd_idle = 0;
4371 int all_pinned = 0;
4372 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4373
4374 cpumask_copy(cpus, cpu_active_mask);
4375
4376 /*
4377 * When power savings policy is enabled for the parent domain, idle
4378 * sibling can pick up load irrespective of busy siblings. In this case,
4379 * let the state of idle sibling percolate up as IDLE, instead of
4380 * portraying it as CPU_NOT_IDLE.
4381 */
4382 if (sd->flags & SD_SHARE_CPUPOWER &&
4383 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4384 sd_idle = 1;
4385
4386 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
4387redo:
4388 update_shares_locked(this_rq, sd);
4389 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
4390 &sd_idle, cpus, NULL);
4391 if (!group) {
4392 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
4393 goto out_balanced;
4394 }
4395
4396 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
4397 if (!busiest) {
4398 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
4399 goto out_balanced;
4400 }
4401
4402 BUG_ON(busiest == this_rq);
4403
4404 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
4405
4406 ld_moved = 0;
4407 if (busiest->nr_running > 1) {
4408 /* Attempt to move tasks */
4409 double_lock_balance(this_rq, busiest);
4410 /* this_rq->clock is already updated */
4411 update_rq_clock(busiest);
4412 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4413 imbalance, sd, CPU_NEWLY_IDLE,
4414 &all_pinned);
4415 double_unlock_balance(this_rq, busiest);
4416
4417 if (unlikely(all_pinned)) {
4418 cpumask_clear_cpu(cpu_of(busiest), cpus);
4419 if (!cpumask_empty(cpus))
4420 goto redo;
4421 }
4422 }
4423
4424 if (!ld_moved) {
4425 int active_balance = 0;
4426
4427 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
4428 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4429 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4430 return -1;
4431
4432 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4433 return -1;
4434
4435 if (sd->nr_balance_failed++ < 2)
4436 return -1;
4437
4438 /*
4439 * The only task running in a non-idle cpu can be moved to this
4440 * cpu in an attempt to completely freeup the other CPU
4441 * package. The same method used to move task in load_balance()
4442 * have been extended for load_balance_newidle() to speedup
4443 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
4444 *
4445 * The package power saving logic comes from
4446 * find_busiest_group(). If there are no imbalance, then
4447 * f_b_g() will return NULL. However when sched_mc={1,2} then
4448 * f_b_g() will select a group from which a running task may be
4449 * pulled to this cpu in order to make the other package idle.
4450 * If there is no opportunity to make a package idle and if
4451 * there are no imbalance, then f_b_g() will return NULL and no
4452 * action will be taken in load_balance_newidle().
4453 *
4454 * Under normal task pull operation due to imbalance, there
4455 * will be more than one task in the source run queue and
4456 * move_tasks() will succeed. ld_moved will be true and this
4457 * active balance code will not be triggered.
4458 */
4459
4460 /* Lock busiest in correct order while this_rq is held */
4461 double_lock_balance(this_rq, busiest);
4462
4463 /*
4464 * don't kick the migration_thread, if the curr
4465 * task on busiest cpu can't be moved to this_cpu
4466 */
4467 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
4468 double_unlock_balance(this_rq, busiest);
4469 all_pinned = 1;
4470 return ld_moved;
4471 }
4472
4473 if (!busiest->active_balance) {
4474 busiest->active_balance = 1;
4475 busiest->push_cpu = this_cpu;
4476 active_balance = 1;
4477 }
4478
4479 double_unlock_balance(this_rq, busiest);
4480 /*
4481 * Should not call ttwu while holding a rq->lock
4482 */
4483 raw_spin_unlock(&this_rq->lock);
4484 if (active_balance)
4485 wake_up_process(busiest->migration_thread);
4486 raw_spin_lock(&this_rq->lock);
4487
4488 } else
4489 sd->nr_balance_failed = 0;
4490
4491 update_shares_locked(this_rq, sd);
4492 return ld_moved;
4493
4494out_balanced:
4495 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
4496 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4497 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4498 return -1;
4499 sd->nr_balance_failed = 0;
4500
4501 return 0;
4502}
4503
4504/*
4505 * idle_balance is called by schedule() if this_cpu is about to become
4506 * idle. Attempts to pull tasks from other CPUs.
4507 */
4508static void idle_balance(int this_cpu, struct rq *this_rq)
4509{
4510 struct sched_domain *sd;
4511 int pulled_task = 0;
4512 unsigned long next_balance = jiffies + HZ;
4513
4514 this_rq->idle_stamp = this_rq->clock;
4515
4516 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4517 return;
4518
4519 for_each_domain(this_cpu, sd) {
4520 unsigned long interval;
4521
4522 if (!(sd->flags & SD_LOAD_BALANCE))
4523 continue;
4524
4525 if (sd->flags & SD_BALANCE_NEWIDLE)
4526 /* If we've pulled tasks over stop searching: */
4527 pulled_task = load_balance_newidle(this_cpu, this_rq,
4528 sd);
4529
4530 interval = msecs_to_jiffies(sd->balance_interval);
4531 if (time_after(next_balance, sd->last_balance + interval))
4532 next_balance = sd->last_balance + interval;
4533 if (pulled_task) {
4534 this_rq->idle_stamp = 0;
4535 break;
4536 }
4537 }
4538 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4539 /*
4540 * We are going idle. next_balance may be set based on
4541 * a busy processor. So reset next_balance.
4542 */
4543 this_rq->next_balance = next_balance;
4544 }
4545}
4546
4547/*
4548 * active_load_balance is run by migration threads. It pushes running tasks
4549 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
4550 * running on each physical CPU where possible, and avoids physical /
4551 * logical imbalances.
4552 *
4553 * Called with busiest_rq locked.
4554 */
4555static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4556{
4557 int target_cpu = busiest_rq->push_cpu;
4558 struct sched_domain *sd;
4559 struct rq *target_rq;
4560
4561 /* Is there any task to move? */
4562 if (busiest_rq->nr_running <= 1)
4563 return;
4564
4565 target_rq = cpu_rq(target_cpu);
4566
4567 /*
4568 * This condition is "impossible", if it occurs
4569 * we need to fix it. Originally reported by
4570 * Bjorn Helgaas on a 128-cpu setup.
4571 */
4572 BUG_ON(busiest_rq == target_rq);
4573
4574 /* move a task from busiest_rq to target_rq */
4575 double_lock_balance(busiest_rq, target_rq);
4576 update_rq_clock(busiest_rq);
4577 update_rq_clock(target_rq);
4578
4579 /* Search for an sd spanning us and the target CPU. */
4580 for_each_domain(target_cpu, sd) {
4581 if ((sd->flags & SD_LOAD_BALANCE) &&
4582 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
4583 break;
4584 }
4585
4586 if (likely(sd)) {
4587 schedstat_inc(sd, alb_count);
4588
4589 if (move_one_task(target_rq, target_cpu, busiest_rq,
4590 sd, CPU_IDLE))
4591 schedstat_inc(sd, alb_pushed);
4592 else
4593 schedstat_inc(sd, alb_failed);
4594 }
4595 double_unlock_balance(busiest_rq, target_rq);
4596}
4597
4598#ifdef CONFIG_NO_HZ
4599static struct {
4600 atomic_t load_balancer;
4601 cpumask_var_t cpu_mask;
4602 cpumask_var_t ilb_grp_nohz_mask;
4603} nohz ____cacheline_aligned = {
4604 .load_balancer = ATOMIC_INIT(-1),
4605};
4606
4607int get_nohz_load_balancer(void)
4608{
4609 return atomic_read(&nohz.load_balancer);
4610}
4611
4612#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4613/**
4614 * lowest_flag_domain - Return lowest sched_domain containing flag.
4615 * @cpu: The cpu whose lowest level of sched domain is to
4616 * be returned.
4617 * @flag: The flag to check for the lowest sched_domain
4618 * for the given cpu.
4619 *
4620 * Returns the lowest sched_domain of a cpu which contains the given flag.
4621 */
4622static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4623{
4624 struct sched_domain *sd;
4625
4626 for_each_domain(cpu, sd)
4627 if (sd && (sd->flags & flag))
4628 break;
4629
4630 return sd;
4631}
4632
4633/**
4634 * for_each_flag_domain - Iterates over sched_domains containing the flag.
4635 * @cpu: The cpu whose domains we're iterating over.
4636 * @sd: variable holding the value of the power_savings_sd
4637 * for cpu.
4638 * @flag: The flag to filter the sched_domains to be iterated.
4639 *
4640 * Iterates over all the scheduler domains of a given cpu that have the 'flag'
4641 * set, starting from the lowest sched_domain to the highest.
4642 */
4643#define for_each_flag_domain(cpu, sd, flag) \
4644 for (sd = lowest_flag_domain(cpu, flag); \
4645 (sd && (sd->flags & flag)); sd = sd->parent)
4646
4647/**
4648 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4649 * @ilb_group: group to be checked for semi-idleness
4650 *
4651 * Returns: 1 if the group is semi-idle. 0 otherwise.
4652 *
4653 * We define a sched_group to be semi-idle if it has at least one idle CPU
4654 * and at least one non-idle CPU. This helper function checks whether the
4655 * given sched_group is semi-idle.
4656 */
4657static inline int is_semi_idle_group(struct sched_group *ilb_group)
4658{
4659 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
4660 sched_group_cpus(ilb_group));
4661
4662 /*
4663	 * A sched_group is semi-idle when it has at least one busy cpu
4664	 * and at least one idle cpu.
4665 */
4666 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
4667 return 0;
4668
4669 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
4670 return 0;
4671
4672 return 1;
4673}
4674/**
4675 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4676 * @cpu: The cpu which is nominating a new idle_load_balancer.
4677 *
4678 * Returns:	The id of the idle load balancer if one exists;
4679 *		otherwise, a value >= nr_cpu_ids.
4680 *
4681 * This algorithm picks an idle load balancer that belongs to a
4682 * semi-idle powersavings sched_domain. The idea is to avoid waking a
4683 * completely idle package/core just for the purpose of idle load balancing
4684 * when there are other idle cpus which are better suited for that job.
4685 */
4686static int find_new_ilb(int cpu)
4687{
4688 struct sched_domain *sd;
4689 struct sched_group *ilb_group;
4690
4691 /*
4692	 * Select the idle load balancer from semi-idle packages only
4693	 * when power-aware load balancing is enabled.
4694 */
4695 if (!(sched_smt_power_savings || sched_mc_power_savings))
4696 goto out_done;
4697
4698 /*
4699 * Optimize for the case when we have no idle CPUs or only one
4700 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4701 */
4702 if (cpumask_weight(nohz.cpu_mask) < 2)
4703 goto out_done;
4704
4705 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4706 ilb_group = sd->groups;
4707
4708 do {
4709 if (is_semi_idle_group(ilb_group))
4710 return cpumask_first(nohz.ilb_grp_nohz_mask);
4711
4712 ilb_group = ilb_group->next;
4713
4714 } while (ilb_group != sd->groups);
4715 }
4716
4717out_done:
4718 return cpumask_first(nohz.cpu_mask);
4719}
4720#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4721static inline int find_new_ilb(int call_cpu)
4722{
4723 return cpumask_first(nohz.cpu_mask);
4724}
4725#endif
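A userspace model of the selection policy described above: scan the groups of a power-savings domain and hand the ilb role to the first tick-stopped cpu in a semi-idle group (one with both idle and busy cpus), falling back to the first tick-stopped cpu overall. cpumasks are reduced to plain bitmasks and the group layout is invented for illustration; this sketches the idea, not the kernel implementation:

#include <stdint.h>
#include <stdio.h>

static int first_cpu(uint64_t mask)
{
	return mask ? __builtin_ctzll(mask) : 64;	/* 64 plays nr_cpu_ids */
}

static int find_new_ilb_model(uint64_t nohz_cpus,
			      const uint64_t *groups, int nr_groups)
{
	int i;

	for (i = 0; i < nr_groups; i++) {
		uint64_t idle = nohz_cpus & groups[i];

		/* semi-idle: some, but not all, cpus of the group are idle */
		if (idle && idle != groups[i])
			return first_cpu(idle);
	}
	return first_cpu(nohz_cpus);	/* fall back to any tick-stopped cpu */
}

int main(void)
{
	/* two packages: cpus 0-3 and 4-7; cpus 3-7 have stopped their ticks */
	uint64_t groups[] = { 0x0f, 0xf0 };

	/* picks cpu 3, leaving the completely idle package 4-7 asleep */
	printf("%d\n", find_new_ilb_model(0xf8, groups, 2));
	return 0;
}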
4726
4727/*
4728 * This routine tries to nominate an ilb (idle load balancing)
4729 * owner among the cpus whose ticks are stopped. The ilb owner does idle
4730 * load balancing on behalf of all those cpus. If every cpu in the system
4731 * goes into this tickless mode, then there is no ilb owner (as there is
4732 * no need for one) and all cpus simply sleep until the next wakeup event
4733 * arrives.
4734 *
4735 * The ilb owner's tick is not stopped, and that tick is used
4736 * for idle load balancing. The ilb owner remains part of
4737 * nohz.cpu_mask.
4738 *
4739 * While stopping its tick, a cpu becomes the ilb owner if there
4740 * is no other owner, and it stays the owner until it becomes busy
4741 * or until all cpus in the system stop their ticks, at which point
4742 * no ilb owner is needed.
4743 *
4744 * When the ilb owner becomes busy, it nominates another owner during its
4745 * next busy scheduler_tick().
4746 */
4747int select_nohz_load_balancer(int stop_tick)
4748{
4749 int cpu = smp_processor_id();
4750
4751 if (stop_tick) {
4752 cpu_rq(cpu)->in_nohz_recently = 1;
4753
4754 if (!cpu_active(cpu)) {
4755 if (atomic_read(&nohz.load_balancer) != cpu)
4756 return 0;
4757
4758 /*
4759 * If we are going offline and still the leader,
4760 * give up!
4761 */
4762 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4763 BUG();
4764
4765 return 0;
4766 }
4767
4768 cpumask_set_cpu(cpu, nohz.cpu_mask);
4769
4770 /* time for ilb owner also to sleep */
4771 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
4772 if (atomic_read(&nohz.load_balancer) == cpu)
4773 atomic_set(&nohz.load_balancer, -1);
4774 return 0;
4775 }
4776
4777 if (atomic_read(&nohz.load_balancer) == -1) {
4778 /* make me the ilb owner */
4779 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4780 return 1;
4781 } else if (atomic_read(&nohz.load_balancer) == cpu) {
4782 int new_ilb;
4783
4784 if (!(sched_smt_power_savings ||
4785 sched_mc_power_savings))
4786 return 1;
4787 /*
4788 * Check to see if there is a more power-efficient
4789 * ilb.
4790 */
4791 new_ilb = find_new_ilb(cpu);
4792 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4793 atomic_set(&nohz.load_balancer, -1);
4794 resched_cpu(new_ilb);
4795 return 0;
4796 }
4797 return 1;
4798 }
4799 } else {
4800 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
4801 return 0;
4802
4803 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4804
4805 if (atomic_read(&nohz.load_balancer) == cpu)
4806 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4807 BUG();
4808 }
4809 return 0;
4810}
4811#endif
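select_nohz_load_balancer() above leans on atomic_cmpxchg() so that exactly one tick-stopped cpu holds the ilb role at a time and only the current holder can give it up. A minimal userspace model of that claim/release protocol, using C11 atomics in place of the kernel's atomic_t (the cpu ids and the call sequence in main() are illustrative only):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int load_balancer = ATOMIC_VAR_INIT(-1);	/* -1: no owner */

static int try_claim_ilb(int cpu)
{
	int expected = -1;

	/* become the owner only if the slot was free */
	return atomic_compare_exchange_strong(&load_balancer, &expected, cpu);
}

static void release_ilb(int cpu)
{
	int expected = cpu;

	/* only the current owner may clear the slot */
	atomic_compare_exchange_strong(&load_balancer, &expected, -1);
}

int main(void)
{
	printf("cpu1 claims: %d\n", try_claim_ilb(1));	/* 1: now owner */
	printf("cpu2 claims: %d\n", try_claim_ilb(2));	/* 0: already held */
	release_ilb(1);
	printf("cpu2 claims: %d\n", try_claim_ilb(2));	/* 1: now owner */
	return 0;
}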
4812
4813static DEFINE_SPINLOCK(balancing);
4814
4815/*
4816 * It checks each scheduling domain to see if it is due to be balanced,
4817 * and initiates a balancing operation if so.
4818 *
4819 * Balancing parameters are set up in arch_init_sched_domains.
4820 */
4821static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4822{
4823 int balance = 1;
4824 struct rq *rq = cpu_rq(cpu);
4825 unsigned long interval;
4826 struct sched_domain *sd;
4827 /* Earliest time when we have to do rebalance again */
4828 unsigned long next_balance = jiffies + 60*HZ;
4829 int update_next_balance = 0;
4830 int need_serialize;
4831
4832 for_each_domain(cpu, sd) {
4833 if (!(sd->flags & SD_LOAD_BALANCE))
4834 continue;
4835
4836 interval = sd->balance_interval;
4837 if (idle != CPU_IDLE)
4838 interval *= sd->busy_factor;
4839
4840 /* scale ms to jiffies */
4841 interval = msecs_to_jiffies(interval);
4842 if (unlikely(!interval))
4843 interval = 1;
4844 if (interval > HZ*NR_CPUS/10)
4845 interval = HZ*NR_CPUS/10;
4846
4847 need_serialize = sd->flags & SD_SERIALIZE;
4848
4849 if (need_serialize) {
4850 if (!spin_trylock(&balancing))
4851 goto out;
4852 }
4853
4854 if (time_after_eq(jiffies, sd->last_balance + interval)) {
4855 if (load_balance(cpu, rq, sd, idle, &balance)) {
4856 /*
4857 * We've pulled tasks over so either we're no
4858 * longer idle, or one of our SMT siblings is
4859 * not idle.
4860 */
4861 idle = CPU_NOT_IDLE;
4862 }
4863 sd->last_balance = jiffies;
4864 }
4865 if (need_serialize)
4866 spin_unlock(&balancing);
4867out:
4868 if (time_after(next_balance, sd->last_balance + interval)) {
4869 next_balance = sd->last_balance + interval;
4870 update_next_balance = 1;
4871 }
4872
4873 /*
4874		 * Stop load balancing at this level. Another
4875		 * CPU in our sched group is doing load balancing more
4876		 * actively.
4877 */
4878 if (!balance)
4879 break;
4880 }
4881
4882 /*
4883	 * next_balance is updated only when there is a need; when the
4884	 * cpu is attached to the null domain, for example, it will not be
4885	 * updated.
4886 */
4887 if (likely(update_next_balance))
4888 rq->next_balance = next_balance;
4889}
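rebalance_domains() above derives each domain's effective balancing interval: the per-domain balance_interval (in milliseconds) is multiplied by busy_factor when the cpu is not idle, converted to jiffies, and clamped to the range [1, HZ*NR_CPUS/10]. A standalone sketch of that arithmetic; HZ, NR_CPUS, the busy_factor and the rounding inside msecs_to_jiffies are illustrative stand-ins rather than the kernel's exact values:

#include <stdio.h>

#define HZ	250
#define NR_CPUS	8

static unsigned long msecs_to_jiffies(unsigned long ms)
{
	return (ms * HZ + 999) / 1000;		/* crude round-up model */
}

static unsigned long balance_interval_jiffies(unsigned long interval_ms,
					      unsigned int busy_factor,
					      int cpu_is_idle)
{
	unsigned long interval = interval_ms;

	if (!cpu_is_idle)
		interval *= busy_factor;	/* back off while busy */

	interval = msecs_to_jiffies(interval);
	if (!interval)
		interval = 1;			/* never a zero interval */
	if (interval > HZ * NR_CPUS / 10)
		interval = HZ * NR_CPUS / 10;	/* global upper bound */

	return interval;
}

int main(void)
{
	/* 64ms interval, busy_factor 32, busy cpu: 2048ms -> clamped to 200 jiffies */
	printf("%lu\n", balance_interval_jiffies(64, 32, 0));
	return 0;
}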
4890
4891/*
4892 * run_rebalance_domains is triggered when needed from the scheduler tick.
4893 * In CONFIG_NO_HZ case, the idle load balance owner will do the
4894 * rebalancing for all the cpus for whom scheduler ticks are stopped.
4895 */
4896static void run_rebalance_domains(struct softirq_action *h)
4897{
4898 int this_cpu = smp_processor_id();
4899 struct rq *this_rq = cpu_rq(this_cpu);
4900 enum cpu_idle_type idle = this_rq->idle_at_tick ?
4901 CPU_IDLE : CPU_NOT_IDLE;
4902
4903 rebalance_domains(this_cpu, idle);
4904
4905#ifdef CONFIG_NO_HZ
4906 /*
4907 * If this cpu is the owner for idle load balancing, then do the
4908 * balancing on behalf of the other idle cpus whose ticks are
4909 * stopped.
4910 */
4911 if (this_rq->idle_at_tick &&
4912 atomic_read(&nohz.load_balancer) == this_cpu) {
4913 struct rq *rq;
4914 int balance_cpu;
4915
4916 for_each_cpu(balance_cpu, nohz.cpu_mask) {
4917 if (balance_cpu == this_cpu)
4918 continue;
4919
4920 /*
4921			 * work being done for other cpus. The next load
4922 * work being done for other cpus. Next load
4923 * balancing owner will pick it up.
4924 */
4925 if (need_resched())
4926 break;
4927
4928 rebalance_domains(balance_cpu, CPU_IDLE);
4929
4930 rq = cpu_rq(balance_cpu);
4931 if (time_after(this_rq->next_balance, rq->next_balance))
4932 this_rq->next_balance = rq->next_balance;
4933 }
4934 }
4935#endif
4936}
4937
4938static inline int on_null_domain(int cpu)
4939{
4940 return !rcu_dereference(cpu_rq(cpu)->sd);
4941}
4942
4943/*
4944 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4945 *
4946 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
4947 * idle load balancing owner or decide to stop the periodic load balancing,
4948 * if the whole system is idle.
4949 */
4950static inline void trigger_load_balance(struct rq *rq, int cpu)
4951{
4952#ifdef CONFIG_NO_HZ
4953 /*
4954 * If we were in the nohz mode recently and busy at the current
4955	 * scheduler tick, then check if we need to nominate a new idle
4956 * load balancer.
4957 */
4958 if (rq->in_nohz_recently && !rq->idle_at_tick) {
4959 rq->in_nohz_recently = 0;
4960
4961 if (atomic_read(&nohz.load_balancer) == cpu) {
4962 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4963 atomic_set(&nohz.load_balancer, -1);
4964 }
4965
4966 if (atomic_read(&nohz.load_balancer) == -1) {
4967 int ilb = find_new_ilb(cpu);
4968
4969 if (ilb < nr_cpu_ids)
4970 resched_cpu(ilb);
4971 }
4972 }
4973
4974 /*
4975 * If this cpu is idle and doing idle load balancing for all the
4976 * cpus with ticks stopped, is it time for that to stop?
4977 */
4978 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4979 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4980 resched_cpu(cpu);
4981 return;
4982 }
4983
4984 /*
4985 * If this cpu is idle and the idle load balancing is done by
4986	 * someone else, then there is no need to raise the SCHED_SOFTIRQ
4987 */
4988 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4989 cpumask_test_cpu(cpu, nohz.cpu_mask))
4990 return;
4991#endif
4992 /* Don't need to rebalance while attached to NULL domain */
4993 if (time_after_eq(jiffies, rq->next_balance) &&
4994 likely(!on_null_domain(cpu)))
4995 raise_softirq(SCHED_SOFTIRQ);
4996}
4997
4998#else /* CONFIG_SMP */
4999
5000/*
5001 * on UP we do not need to balance between CPUs:
5002 */
5003static inline void idle_balance(int cpu, struct rq *rq)
5004{
5005}
5006
5007#endif 3153#endif
5008 3154
5009DEFINE_PER_CPU(struct kernel_stat, kstat); 3155DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -6128,7 +4274,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6128 if (running) 4274 if (running)
6129 p->sched_class->set_curr_task(rq); 4275 p->sched_class->set_curr_task(rq);
6130 if (on_rq) { 4276 if (on_rq) {
6131 enqueue_task(rq, p, 0); 4277 enqueue_task(rq, p, 0, oldprio < prio);
6132 4278
6133 check_class_changed(rq, p, prev_class, oldprio, running); 4279 check_class_changed(rq, p, prev_class, oldprio, running);
6134 } 4280 }
@@ -6172,7 +4318,7 @@ void set_user_nice(struct task_struct *p, long nice)
6172 delta = p->prio - old_prio; 4318 delta = p->prio - old_prio;
6173 4319
6174 if (on_rq) { 4320 if (on_rq) {
6175 enqueue_task(rq, p, 0); 4321 enqueue_task(rq, p, 0, false);
6176 /* 4322 /*
6177 * If the task increased its priority or is running and 4323 * If the task increased its priority or is running and
6178 * lowered its priority, then reschedule its CPU: 4324 * lowered its priority, then reschedule its CPU:
@@ -9482,7 +7628,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
9482 tg->rt_rq[cpu] = rt_rq; 7628 tg->rt_rq[cpu] = rt_rq;
9483 init_rt_rq(rt_rq, rq); 7629 init_rt_rq(rt_rq, rq);
9484 rt_rq->tg = tg; 7630 rt_rq->tg = tg;
9485 rt_rq->rt_se = rt_se;
9486 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7631 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
9487 if (add) 7632 if (add)
9488 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); 7633 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
@@ -9513,9 +7658,6 @@ void __init sched_init(void)
9513#ifdef CONFIG_RT_GROUP_SCHED 7658#ifdef CONFIG_RT_GROUP_SCHED
9514 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7659 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
9515#endif 7660#endif
9516#ifdef CONFIG_USER_SCHED
9517 alloc_size *= 2;
9518#endif
9519#ifdef CONFIG_CPUMASK_OFFSTACK 7661#ifdef CONFIG_CPUMASK_OFFSTACK
9520 alloc_size += num_possible_cpus() * cpumask_size(); 7662 alloc_size += num_possible_cpus() * cpumask_size();
9521#endif 7663#endif
@@ -9529,13 +7671,6 @@ void __init sched_init(void)
9529 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 7671 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
9530 ptr += nr_cpu_ids * sizeof(void **); 7672 ptr += nr_cpu_ids * sizeof(void **);
9531 7673
9532#ifdef CONFIG_USER_SCHED
9533 root_task_group.se = (struct sched_entity **)ptr;
9534 ptr += nr_cpu_ids * sizeof(void **);
9535
9536 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
9537 ptr += nr_cpu_ids * sizeof(void **);
9538#endif /* CONFIG_USER_SCHED */
9539#endif /* CONFIG_FAIR_GROUP_SCHED */ 7674#endif /* CONFIG_FAIR_GROUP_SCHED */
9540#ifdef CONFIG_RT_GROUP_SCHED 7675#ifdef CONFIG_RT_GROUP_SCHED
9541 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7676 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -9544,13 +7679,6 @@ void __init sched_init(void)
9544 init_task_group.rt_rq = (struct rt_rq **)ptr; 7679 init_task_group.rt_rq = (struct rt_rq **)ptr;
9545 ptr += nr_cpu_ids * sizeof(void **); 7680 ptr += nr_cpu_ids * sizeof(void **);
9546 7681
9547#ifdef CONFIG_USER_SCHED
9548 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
9549 ptr += nr_cpu_ids * sizeof(void **);
9550
9551 root_task_group.rt_rq = (struct rt_rq **)ptr;
9552 ptr += nr_cpu_ids * sizeof(void **);
9553#endif /* CONFIG_USER_SCHED */
9554#endif /* CONFIG_RT_GROUP_SCHED */ 7682#endif /* CONFIG_RT_GROUP_SCHED */
9555#ifdef CONFIG_CPUMASK_OFFSTACK 7683#ifdef CONFIG_CPUMASK_OFFSTACK
9556 for_each_possible_cpu(i) { 7684 for_each_possible_cpu(i) {
@@ -9570,22 +7698,13 @@ void __init sched_init(void)
9570#ifdef CONFIG_RT_GROUP_SCHED 7698#ifdef CONFIG_RT_GROUP_SCHED
9571 init_rt_bandwidth(&init_task_group.rt_bandwidth, 7699 init_rt_bandwidth(&init_task_group.rt_bandwidth,
9572 global_rt_period(), global_rt_runtime()); 7700 global_rt_period(), global_rt_runtime());
9573#ifdef CONFIG_USER_SCHED
9574 init_rt_bandwidth(&root_task_group.rt_bandwidth,
9575 global_rt_period(), RUNTIME_INF);
9576#endif /* CONFIG_USER_SCHED */
9577#endif /* CONFIG_RT_GROUP_SCHED */ 7701#endif /* CONFIG_RT_GROUP_SCHED */
9578 7702
9579#ifdef CONFIG_GROUP_SCHED 7703#ifdef CONFIG_CGROUP_SCHED
9580 list_add(&init_task_group.list, &task_groups); 7704 list_add(&init_task_group.list, &task_groups);
9581 INIT_LIST_HEAD(&init_task_group.children); 7705 INIT_LIST_HEAD(&init_task_group.children);
9582 7706
9583#ifdef CONFIG_USER_SCHED 7707#endif /* CONFIG_CGROUP_SCHED */
9584 INIT_LIST_HEAD(&root_task_group.children);
9585 init_task_group.parent = &root_task_group;
9586 list_add(&init_task_group.siblings, &root_task_group.children);
9587#endif /* CONFIG_USER_SCHED */
9588#endif /* CONFIG_GROUP_SCHED */
9589 7708
9590#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP 7709#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
9591 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), 7710 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
@@ -9625,25 +7744,6 @@ void __init sched_init(void)
9625 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7744 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
9626 */ 7745 */
9627 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7746 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
9628#elif defined CONFIG_USER_SCHED
9629 root_task_group.shares = NICE_0_LOAD;
9630 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
9631 /*
9632 * In case of task-groups formed thr' the user id of tasks,
9633 * init_task_group represents tasks belonging to root user.
9634 * Hence it forms a sibling of all subsequent groups formed.
9635 * In this case, init_task_group gets only a fraction of overall
9636 * system cpu resource, based on the weight assigned to root
9637 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
9638 * by letting tasks of init_task_group sit in a separate cfs_rq
9639 * (init_tg_cfs_rq) and having one entity represent this group of
9640 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
9641 */
9642 init_tg_cfs_entry(&init_task_group,
9643 &per_cpu(init_tg_cfs_rq, i),
9644 &per_cpu(init_sched_entity, i), i, 1,
9645 root_task_group.se[i]);
9646
9647#endif 7747#endif
9648#endif /* CONFIG_FAIR_GROUP_SCHED */ 7748#endif /* CONFIG_FAIR_GROUP_SCHED */
9649 7749
@@ -9652,12 +7752,6 @@ void __init sched_init(void)
9652 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7752 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
9653#ifdef CONFIG_CGROUP_SCHED 7753#ifdef CONFIG_CGROUP_SCHED
9654 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); 7754 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
9655#elif defined CONFIG_USER_SCHED
9656 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
9657 init_tg_rt_entry(&init_task_group,
9658 &per_cpu(init_rt_rq_var, i),
9659 &per_cpu(init_sched_rt_entity, i), i, 1,
9660 root_task_group.rt_se[i]);
9661#endif 7755#endif
9662#endif 7756#endif
9663 7757
@@ -9742,7 +7836,7 @@ static inline int preempt_count_equals(int preempt_offset)
9742 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 7836 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9743} 7837}
9744 7838
9745void __might_sleep(char *file, int line, int preempt_offset) 7839void __might_sleep(const char *file, int line, int preempt_offset)
9746{ 7840{
9747#ifdef in_atomic 7841#ifdef in_atomic
9748 static unsigned long prev_jiffy; /* ratelimiting */ 7842 static unsigned long prev_jiffy; /* ratelimiting */
@@ -10053,7 +8147,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
10053} 8147}
10054#endif /* CONFIG_RT_GROUP_SCHED */ 8148#endif /* CONFIG_RT_GROUP_SCHED */
10055 8149
10056#ifdef CONFIG_GROUP_SCHED 8150#ifdef CONFIG_CGROUP_SCHED
10057static void free_sched_group(struct task_group *tg) 8151static void free_sched_group(struct task_group *tg)
10058{ 8152{
10059 free_fair_sched_group(tg); 8153 free_fair_sched_group(tg);
@@ -10158,11 +8252,11 @@ void sched_move_task(struct task_struct *tsk)
10158 if (unlikely(running)) 8252 if (unlikely(running))
10159 tsk->sched_class->set_curr_task(rq); 8253 tsk->sched_class->set_curr_task(rq);
10160 if (on_rq) 8254 if (on_rq)
10161 enqueue_task(rq, tsk, 0); 8255 enqueue_task(rq, tsk, 0, false);
10162 8256
10163 task_rq_unlock(rq, &flags); 8257 task_rq_unlock(rq, &flags);
10164} 8258}
10165#endif /* CONFIG_GROUP_SCHED */ 8259#endif /* CONFIG_CGROUP_SCHED */
10166 8260
10167#ifdef CONFIG_FAIR_GROUP_SCHED 8261#ifdef CONFIG_FAIR_GROUP_SCHED
10168static void __set_se_shares(struct sched_entity *se, unsigned long shares) 8262static void __set_se_shares(struct sched_entity *se, unsigned long shares)
@@ -10304,13 +8398,6 @@ static int tg_schedulable(struct task_group *tg, void *data)
10304 runtime = d->rt_runtime; 8398 runtime = d->rt_runtime;
10305 } 8399 }
10306 8400
10307#ifdef CONFIG_USER_SCHED
10308 if (tg == &root_task_group) {
10309 period = global_rt_period();
10310 runtime = global_rt_runtime();
10311 }
10312#endif
10313
10314 /* 8401 /*
10315 * Cannot have more runtime than the period. 8402 * Cannot have more runtime than the period.
10316 */ 8403 */
@@ -10930,12 +9017,30 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
10930} 9017}
10931 9018
10932/* 9019/*
 9020 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled, one jiffy can be very large
 9021 * in cputime_t units. As a result, cpuacct_update_stats calls
 9022 * percpu_counter_add with values large enough to always overflow the
 9023 * per-cpu batch limit, causing bad SMP scalability.
 9024 *
 9025 * To fix this, we scale percpu_counter_batch by cputime_one_jiffy so we
 9026 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
 9027 * and enabled. We cap it at INT_MAX, which is the largest allowed batch value.
9028 */
9029#ifdef CONFIG_SMP
9030#define CPUACCT_BATCH \
9031 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9032#else
9033#define CPUACCT_BATCH 0
9034#endif
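A worked example of the batch scaling above, as a standalone function mirroring the CPUACCT_BATCH expression; the cputime_one_jiffy value used in main() only illustrates the CONFIG_VIRT_CPU_ACCOUNTING case, where a single jiffy can be on the order of a million cputime units:

#include <limits.h>
#include <stdio.h>

/* mirrors min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX) */
static long cpuacct_batch(long percpu_counter_batch, long cputime_one_jiffy)
{
	long batch = percpu_counter_batch * cputime_one_jiffy;

	return batch < INT_MAX ? batch : INT_MAX;
}

int main(void)
{
	/* a default batch of 32 and ~1,000,000 cputime units per jiffy */
	printf("%ld\n", cpuacct_batch(32, 1000000));	/* prints 32000000 */
	return 0;
}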
9035
9036/*
10933 * Charge the system/user time to the task's accounting group. 9037 * Charge the system/user time to the task's accounting group.
10934 */ 9038 */
10935static void cpuacct_update_stats(struct task_struct *tsk, 9039static void cpuacct_update_stats(struct task_struct *tsk,
10936 enum cpuacct_stat_index idx, cputime_t val) 9040 enum cpuacct_stat_index idx, cputime_t val)
10937{ 9041{
10938 struct cpuacct *ca; 9042 struct cpuacct *ca;
9043 int batch = CPUACCT_BATCH;
10939 9044
10940 if (unlikely(!cpuacct_subsys.active)) 9045 if (unlikely(!cpuacct_subsys.active))
10941 return; 9046 return;
@@ -10944,7 +9049,7 @@ static void cpuacct_update_stats(struct task_struct *tsk,
10944 ca = task_ca(tsk); 9049 ca = task_ca(tsk);
10945 9050
10946 do { 9051 do {
10947 percpu_counter_add(&ca->cpustat[idx], val); 9052 __percpu_counter_add(&ca->cpustat[idx], val, batch);
10948 ca = ca->parent; 9053 ca = ca->parent;
10949 } while (ca); 9054 } while (ca);
10950 rcu_read_unlock(); 9055 rcu_read_unlock();