path: root/kernel/sched.c
author	Linus Torvalds <torvalds@linux-foundation.org>	2010-02-28 13:31:01 -0500
committer	Linus Torvalds <torvalds@linux-foundation.org>	2010-02-28 13:31:01 -0500
commit	f66ffdedbf0fc059a92219bb08c1dbcac88f074b (patch)
tree	9db4ad51764455123130e82fb7acf4f0a0be58ce /kernel/sched.c
parent	2531216f236cb2a1f39ffa12a4a9339541e52191 (diff)
parent	dd5feea14a7de4edbd9f36db1a2db785de91b88d (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (25 commits)
  sched: Fix SCHED_MC regression caused by change in sched cpu_power
  sched: Don't use possibly stale sched_class
  kthread, sched: Remove reference to kthread_create_on_cpu
  sched: cpuacct: Use bigger percpu counter batch values for stats counters
  percpu_counter: Make __percpu_counter_add an inline function on UP
  sched: Remove member rt_se from struct rt_rq
  sched: Change usage of rt_rq->rt_se to rt_rq->tg->rt_se[cpu]
  sched: Remove unused update_shares_locked()
  sched: Use for_each_bit
  sched: Queue a deboosted task to the head of the RT prio queue
  sched: Implement head queueing for sched_rt
  sched: Extend enqueue_task to allow head queueing
  sched: Remove USER_SCHED
  sched: Fix the place where group powers are updated
  sched: Assume *balance is valid
  sched: Remove load_balance_newidle()
  sched: Unify load_balance{,_newidle}()
  sched: Add a lock break for PREEMPT=y
  sched: Remove from fwd decls
  sched: Remove rq_iterator from move_one_task
  ...

Fix up trivial conflicts in kernel/sched.c
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	2125
1 files changed, 116 insertions(+), 2009 deletions(-)
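
A note on the head-queueing change pulled in above ("sched: Extend enqueue_task to allow head queueing", "sched: Queue a deboosted task to the head of the RT prio queue"): the hunks below change the enqueue path to enqueue_task(rq, p, wakeup, head), where the new head flag lets a task be placed at the front of its runqueue list instead of the back, so that, for example, a deboosted RT task keeps running next rather than falling behind its peers. The standalone C program below is only an illustrative model of that idea, not kernel code; every name in it is invented, and the real scheduler does the equivalent with list_add() vs list_add_tail() on a struct list_head.

	#include <stdbool.h>
	#include <stdio.h>

	struct node {
		int id;
		struct node *prev, *next;
	};

	struct run_list {
		struct node head;		/* sentinel: head.next is the front */
	};

	static void run_list_init(struct run_list *rl)
	{
		rl->head.prev = rl->head.next = &rl->head;
	}

	/* Models enqueue_task(rq, p, wakeup, head): 'head' selects front vs back. */
	static void enqueue(struct run_list *rl, struct node *n, bool head)
	{
		struct node *pos = head ? &rl->head : rl->head.prev;

		/* insert n right after the sentinel (head) or after the last node (tail) */
		n->prev = pos;
		n->next = pos->next;
		pos->next->prev = n;
		pos->next = n;
	}

	static struct node *dequeue_first(struct run_list *rl)
	{
		struct node *n = rl->head.next;

		if (n == &rl->head)
			return NULL;		/* list is empty */
		n->prev->next = n->next;
		n->next->prev = n->prev;
		return n;
	}

	int main(void)
	{
		struct run_list rl;
		struct node a = { .id = 1 }, b = { .id = 2 }, c = { .id = 3 };
		struct node *n;

		run_list_init(&rl);
		enqueue(&rl, &a, false);	/* normal wakeup: tail */
		enqueue(&rl, &b, false);	/* normal wakeup: tail */
		enqueue(&rl, &c, true);		/* deboosted task requeued at the head */

		while ((n = dequeue_first(&rl)))
			printf("run %d\n", n->id);	/* prints 3, then 1, then 2 */

		return 0;
	}

Built with any C compiler, the sketch prints the head-queued element first (3 1 2), which is the behavioural difference the new flag introduces in the enqueue path shown in the diff.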
diff --git a/kernel/sched.c b/kernel/sched.c
index caf54e1eef6e..6a212c97f523 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -233,7 +233,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
233 */ 233 */
234static DEFINE_MUTEX(sched_domains_mutex); 234static DEFINE_MUTEX(sched_domains_mutex);
235 235
236#ifdef CONFIG_GROUP_SCHED 236#ifdef CONFIG_CGROUP_SCHED
237 237
238#include <linux/cgroup.h> 238#include <linux/cgroup.h>
239 239
@@ -243,13 +243,7 @@ static LIST_HEAD(task_groups);
243 243
244/* task group related information */ 244/* task group related information */
245struct task_group { 245struct task_group {
246#ifdef CONFIG_CGROUP_SCHED
247 struct cgroup_subsys_state css; 246 struct cgroup_subsys_state css;
248#endif
249
250#ifdef CONFIG_USER_SCHED
251 uid_t uid;
252#endif
253 247
254#ifdef CONFIG_FAIR_GROUP_SCHED 248#ifdef CONFIG_FAIR_GROUP_SCHED
255 /* schedulable entities of this group on each cpu */ 249 /* schedulable entities of this group on each cpu */
@@ -274,35 +268,7 @@ struct task_group {
274 struct list_head children; 268 struct list_head children;
275}; 269};
276 270
277#ifdef CONFIG_USER_SCHED
278
279/* Helper function to pass uid information to create_sched_user() */
280void set_tg_uid(struct user_struct *user)
281{
282 user->tg->uid = user->uid;
283}
284
285/*
286 * Root task group.
287 * Every UID task group (including init_task_group aka UID-0) will
288 * be a child to this group.
289 */
290struct task_group root_task_group;
291
292#ifdef CONFIG_FAIR_GROUP_SCHED
293/* Default task group's sched entity on each cpu */
294static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
295/* Default task group's cfs_rq on each cpu */
296static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
297#endif /* CONFIG_FAIR_GROUP_SCHED */
298
299#ifdef CONFIG_RT_GROUP_SCHED
300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var);
302#endif /* CONFIG_RT_GROUP_SCHED */
303#else /* !CONFIG_USER_SCHED */
304#define root_task_group init_task_group 271#define root_task_group init_task_group
305#endif /* CONFIG_USER_SCHED */
306 272
307/* task_group_lock serializes add/remove of task groups and also changes to 273/* task_group_lock serializes add/remove of task groups and also changes to
308 * a task group's cpu shares. 274 * a task group's cpu shares.
@@ -318,11 +284,7 @@ static int root_task_group_empty(void)
318} 284}
319#endif 285#endif
320 286
321#ifdef CONFIG_USER_SCHED
322# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
323#else /* !CONFIG_USER_SCHED */
324# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 287# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
325#endif /* CONFIG_USER_SCHED */
326 288
327/* 289/*
328 * A weight of 0 or 1 can cause arithmetics problems. 290 * A weight of 0 or 1 can cause arithmetics problems.
@@ -348,11 +310,7 @@ static inline struct task_group *task_group(struct task_struct *p)
348{ 310{
349 struct task_group *tg; 311 struct task_group *tg;
350 312
351#ifdef CONFIG_USER_SCHED 313#ifdef CONFIG_CGROUP_SCHED
352 rcu_read_lock();
353 tg = __task_cred(p)->user->tg;
354 rcu_read_unlock();
355#elif defined(CONFIG_CGROUP_SCHED)
356 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), 314 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
357 struct task_group, css); 315 struct task_group, css);
358#else 316#else
@@ -383,7 +341,7 @@ static inline struct task_group *task_group(struct task_struct *p)
383 return NULL; 341 return NULL;
384} 342}
385 343
386#endif /* CONFIG_GROUP_SCHED */ 344#endif /* CONFIG_CGROUP_SCHED */
387 345
388/* CFS-related fields in a runqueue */ 346/* CFS-related fields in a runqueue */
389struct cfs_rq { 347struct cfs_rq {
@@ -478,7 +436,6 @@ struct rt_rq {
478 struct rq *rq; 436 struct rq *rq;
479 struct list_head leaf_rt_rq_list; 437 struct list_head leaf_rt_rq_list;
480 struct task_group *tg; 438 struct task_group *tg;
481 struct sched_rt_entity *rt_se;
482#endif 439#endif
483}; 440};
484 441
@@ -1414,32 +1371,6 @@ static const u32 prio_to_wmult[40] = {
1414 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 1371 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1415}; 1372};
1416 1373
1417static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1418
1419/*
1420 * runqueue iterator, to support SMP load-balancing between different
1421 * scheduling classes, without having to expose their internal data
1422 * structures to the load-balancing proper:
1423 */
1424struct rq_iterator {
1425 void *arg;
1426 struct task_struct *(*start)(void *);
1427 struct task_struct *(*next)(void *);
1428};
1429
1430#ifdef CONFIG_SMP
1431static unsigned long
1432balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1433 unsigned long max_load_move, struct sched_domain *sd,
1434 enum cpu_idle_type idle, int *all_pinned,
1435 int *this_best_prio, struct rq_iterator *iterator);
1436
1437static int
1438iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1439 struct sched_domain *sd, enum cpu_idle_type idle,
1440 struct rq_iterator *iterator);
1441#endif
1442
1443/* Time spent by the tasks of the cpu accounting group executing in ... */ 1374/* Time spent by the tasks of the cpu accounting group executing in ... */
1444enum cpuacct_stat_index { 1375enum cpuacct_stat_index {
1445 CPUACCT_STAT_USER, /* ... user mode */ 1376 CPUACCT_STAT_USER, /* ... user mode */
@@ -1725,16 +1656,6 @@ static void update_shares(struct sched_domain *sd)
1725 } 1656 }
1726} 1657}
1727 1658
1728static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1729{
1730 if (root_task_group_empty())
1731 return;
1732
1733 raw_spin_unlock(&rq->lock);
1734 update_shares(sd);
1735 raw_spin_lock(&rq->lock);
1736}
1737
1738static void update_h_load(long cpu) 1659static void update_h_load(long cpu)
1739{ 1660{
1740 if (root_task_group_empty()) 1661 if (root_task_group_empty())
@@ -1749,10 +1670,6 @@ static inline void update_shares(struct sched_domain *sd)
1749{ 1670{
1750} 1671}
1751 1672
1752static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1753{
1754}
1755
1756#endif 1673#endif
1757 1674
1758#ifdef CONFIG_PREEMPT 1675#ifdef CONFIG_PREEMPT
@@ -1829,6 +1746,51 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1829 raw_spin_unlock(&busiest->lock); 1746 raw_spin_unlock(&busiest->lock);
1830 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1747 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1831} 1748}
1749
1750/*
1751 * double_rq_lock - safely lock two runqueues
1752 *
1753 * Note this does not disable interrupts like task_rq_lock,
1754 * you need to do so manually before calling.
1755 */
1756static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1757 __acquires(rq1->lock)
1758 __acquires(rq2->lock)
1759{
1760 BUG_ON(!irqs_disabled());
1761 if (rq1 == rq2) {
1762 raw_spin_lock(&rq1->lock);
1763 __acquire(rq2->lock); /* Fake it out ;) */
1764 } else {
1765 if (rq1 < rq2) {
1766 raw_spin_lock(&rq1->lock);
1767 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1768 } else {
1769 raw_spin_lock(&rq2->lock);
1770 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1771 }
1772 }
1773 update_rq_clock(rq1);
1774 update_rq_clock(rq2);
1775}
1776
1777/*
1778 * double_rq_unlock - safely unlock two runqueues
1779 *
1780 * Note this does not restore interrupts like task_rq_unlock,
1781 * you need to do so manually after calling.
1782 */
1783static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1784 __releases(rq1->lock)
1785 __releases(rq2->lock)
1786{
1787 raw_spin_unlock(&rq1->lock);
1788 if (rq1 != rq2)
1789 raw_spin_unlock(&rq2->lock);
1790 else
1791 __release(rq2->lock);
1792}
1793
1832#endif 1794#endif
1833 1795
1834#ifdef CONFIG_FAIR_GROUP_SCHED 1796#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1858,18 +1820,14 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1858#endif 1820#endif
1859} 1821}
1860 1822
1861#include "sched_stats.h" 1823static const struct sched_class rt_sched_class;
1862#include "sched_idletask.c"
1863#include "sched_fair.c"
1864#include "sched_rt.c"
1865#ifdef CONFIG_SCHED_DEBUG
1866# include "sched_debug.c"
1867#endif
1868 1824
1869#define sched_class_highest (&rt_sched_class) 1825#define sched_class_highest (&rt_sched_class)
1870#define for_each_class(class) \ 1826#define for_each_class(class) \
1871 for (class = sched_class_highest; class; class = class->next) 1827 for (class = sched_class_highest; class; class = class->next)
1872 1828
1829#include "sched_stats.h"
1830
1873static void inc_nr_running(struct rq *rq) 1831static void inc_nr_running(struct rq *rq)
1874{ 1832{
1875 rq->nr_running++; 1833 rq->nr_running++;
@@ -1907,13 +1865,14 @@ static void update_avg(u64 *avg, u64 sample)
1907 *avg += diff >> 3; 1865 *avg += diff >> 3;
1908} 1866}
1909 1867
1910static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1868static void
1869enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1911{ 1870{
1912 if (wakeup) 1871 if (wakeup)
1913 p->se.start_runtime = p->se.sum_exec_runtime; 1872 p->se.start_runtime = p->se.sum_exec_runtime;
1914 1873
1915 sched_info_queued(p); 1874 sched_info_queued(p);
1916 p->sched_class->enqueue_task(rq, p, wakeup); 1875 p->sched_class->enqueue_task(rq, p, wakeup, head);
1917 p->se.on_rq = 1; 1876 p->se.on_rq = 1;
1918} 1877}
1919 1878
@@ -1936,6 +1895,37 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1936} 1895}
1937 1896
1938/* 1897/*
1898 * activate_task - move a task to the runqueue.
1899 */
1900static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1901{
1902 if (task_contributes_to_load(p))
1903 rq->nr_uninterruptible--;
1904
1905 enqueue_task(rq, p, wakeup, false);
1906 inc_nr_running(rq);
1907}
1908
1909/*
1910 * deactivate_task - remove a task from the runqueue.
1911 */
1912static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1913{
1914 if (task_contributes_to_load(p))
1915 rq->nr_uninterruptible++;
1916
1917 dequeue_task(rq, p, sleep);
1918 dec_nr_running(rq);
1919}
1920
1921#include "sched_idletask.c"
1922#include "sched_fair.c"
1923#include "sched_rt.c"
1924#ifdef CONFIG_SCHED_DEBUG
1925# include "sched_debug.c"
1926#endif
1927
1928/*
1939 * __normal_prio - return the priority that is based on the static prio 1929 * __normal_prio - return the priority that is based on the static prio
1940 */ 1930 */
1941static inline int __normal_prio(struct task_struct *p) 1931static inline int __normal_prio(struct task_struct *p)
@@ -1981,30 +1971,6 @@ static int effective_prio(struct task_struct *p)
1981 return p->prio; 1971 return p->prio;
1982} 1972}
1983 1973
1984/*
1985 * activate_task - move a task to the runqueue.
1986 */
1987static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1988{
1989 if (task_contributes_to_load(p))
1990 rq->nr_uninterruptible--;
1991
1992 enqueue_task(rq, p, wakeup);
1993 inc_nr_running(rq);
1994}
1995
1996/*
1997 * deactivate_task - remove a task from the runqueue.
1998 */
1999static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
2000{
2001 if (task_contributes_to_load(p))
2002 rq->nr_uninterruptible++;
2003
2004 dequeue_task(rq, p, sleep);
2005 dec_nr_running(rq);
2006}
2007
2008/** 1974/**
2009 * task_curr - is this task currently executing on a CPU? 1975 * task_curr - is this task currently executing on a CPU?
2010 * @p: the task in question. 1976 * @p: the task in question.
@@ -3148,50 +3114,6 @@ static void update_cpu_load(struct rq *this_rq)
3148#ifdef CONFIG_SMP 3114#ifdef CONFIG_SMP
3149 3115
3150/* 3116/*
3151 * double_rq_lock - safely lock two runqueues
3152 *
3153 * Note this does not disable interrupts like task_rq_lock,
3154 * you need to do so manually before calling.
3155 */
3156static void double_rq_lock(struct rq *rq1, struct rq *rq2)
3157 __acquires(rq1->lock)
3158 __acquires(rq2->lock)
3159{
3160 BUG_ON(!irqs_disabled());
3161 if (rq1 == rq2) {
3162 raw_spin_lock(&rq1->lock);
3163 __acquire(rq2->lock); /* Fake it out ;) */
3164 } else {
3165 if (rq1 < rq2) {
3166 raw_spin_lock(&rq1->lock);
3167 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
3168 } else {
3169 raw_spin_lock(&rq2->lock);
3170 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
3171 }
3172 }
3173 update_rq_clock(rq1);
3174 update_rq_clock(rq2);
3175}
3176
3177/*
3178 * double_rq_unlock - safely unlock two runqueues
3179 *
3180 * Note this does not restore interrupts like task_rq_unlock,
3181 * you need to do so manually after calling.
3182 */
3183static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
3184 __releases(rq1->lock)
3185 __releases(rq2->lock)
3186{
3187 raw_spin_unlock(&rq1->lock);
3188 if (rq1 != rq2)
3189 raw_spin_unlock(&rq2->lock);
3190 else
3191 __release(rq2->lock);
3192}
3193
3194/*
3195 * sched_exec - execve() is a valuable balancing opportunity, because at 3117 * sched_exec - execve() is a valuable balancing opportunity, because at
3196 * this point the task has the smallest effective memory and cache footprint. 3118 * this point the task has the smallest effective memory and cache footprint.
3197 */ 3119 */
@@ -3239,1782 +3161,6 @@ again:
3239 task_rq_unlock(rq, &flags); 3161 task_rq_unlock(rq, &flags);
3240} 3162}
3241 3163
3242/*
3243 * pull_task - move a task from a remote runqueue to the local runqueue.
3244 * Both runqueues must be locked.
3245 */
3246static void pull_task(struct rq *src_rq, struct task_struct *p,
3247 struct rq *this_rq, int this_cpu)
3248{
3249 deactivate_task(src_rq, p, 0);
3250 set_task_cpu(p, this_cpu);
3251 activate_task(this_rq, p, 0);
3252 check_preempt_curr(this_rq, p, 0);
3253}
3254
3255/*
3256 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3257 */
3258static
3259int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3260 struct sched_domain *sd, enum cpu_idle_type idle,
3261 int *all_pinned)
3262{
3263 int tsk_cache_hot = 0;
3264 /*
3265 * We do not migrate tasks that are:
3266 * 1) running (obviously), or
3267 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3268 * 3) are cache-hot on their current CPU.
3269 */
3270 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
3271 schedstat_inc(p, se.nr_failed_migrations_affine);
3272 return 0;
3273 }
3274 *all_pinned = 0;
3275
3276 if (task_running(rq, p)) {
3277 schedstat_inc(p, se.nr_failed_migrations_running);
3278 return 0;
3279 }
3280
3281 /*
3282 * Aggressive migration if:
3283 * 1) task is cache cold, or
3284 * 2) too many balance attempts have failed.
3285 */
3286
3287 tsk_cache_hot = task_hot(p, rq->clock, sd);
3288 if (!tsk_cache_hot ||
3289 sd->nr_balance_failed > sd->cache_nice_tries) {
3290#ifdef CONFIG_SCHEDSTATS
3291 if (tsk_cache_hot) {
3292 schedstat_inc(sd, lb_hot_gained[idle]);
3293 schedstat_inc(p, se.nr_forced_migrations);
3294 }
3295#endif
3296 return 1;
3297 }
3298
3299 if (tsk_cache_hot) {
3300 schedstat_inc(p, se.nr_failed_migrations_hot);
3301 return 0;
3302 }
3303 return 1;
3304}
3305
3306static unsigned long
3307balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3308 unsigned long max_load_move, struct sched_domain *sd,
3309 enum cpu_idle_type idle, int *all_pinned,
3310 int *this_best_prio, struct rq_iterator *iterator)
3311{
3312 int loops = 0, pulled = 0, pinned = 0;
3313 struct task_struct *p;
3314 long rem_load_move = max_load_move;
3315
3316 if (max_load_move == 0)
3317 goto out;
3318
3319 pinned = 1;
3320
3321 /*
3322 * Start the load-balancing iterator:
3323 */
3324 p = iterator->start(iterator->arg);
3325next:
3326 if (!p || loops++ > sysctl_sched_nr_migrate)
3327 goto out;
3328
3329 if ((p->se.load.weight >> 1) > rem_load_move ||
3330 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3331 p = iterator->next(iterator->arg);
3332 goto next;
3333 }
3334
3335 pull_task(busiest, p, this_rq, this_cpu);
3336 pulled++;
3337 rem_load_move -= p->se.load.weight;
3338
3339#ifdef CONFIG_PREEMPT
3340 /*
3341 * NEWIDLE balancing is a source of latency, so preemptible kernels
3342 * will stop after the first task is pulled to minimize the critical
3343 * section.
3344 */
3345 if (idle == CPU_NEWLY_IDLE)
3346 goto out;
3347#endif
3348
3349 /*
3350 * We only want to steal up to the prescribed amount of weighted load.
3351 */
3352 if (rem_load_move > 0) {
3353 if (p->prio < *this_best_prio)
3354 *this_best_prio = p->prio;
3355 p = iterator->next(iterator->arg);
3356 goto next;
3357 }
3358out:
3359 /*
3360 * Right now, this is one of only two places pull_task() is called,
3361 * so we can safely collect pull_task() stats here rather than
3362 * inside pull_task().
3363 */
3364 schedstat_add(sd, lb_gained[idle], pulled);
3365
3366 if (all_pinned)
3367 *all_pinned = pinned;
3368
3369 return max_load_move - rem_load_move;
3370}
3371
3372/*
3373 * move_tasks tries to move up to max_load_move weighted load from busiest to
3374 * this_rq, as part of a balancing operation within domain "sd".
3375 * Returns 1 if successful and 0 otherwise.
3376 *
3377 * Called with both runqueues locked.
3378 */
3379static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3380 unsigned long max_load_move,
3381 struct sched_domain *sd, enum cpu_idle_type idle,
3382 int *all_pinned)
3383{
3384 const struct sched_class *class = sched_class_highest;
3385 unsigned long total_load_moved = 0;
3386 int this_best_prio = this_rq->curr->prio;
3387
3388 do {
3389 total_load_moved +=
3390 class->load_balance(this_rq, this_cpu, busiest,
3391 max_load_move - total_load_moved,
3392 sd, idle, all_pinned, &this_best_prio);
3393 class = class->next;
3394
3395#ifdef CONFIG_PREEMPT
3396 /*
3397 * NEWIDLE balancing is a source of latency, so preemptible
3398 * kernels will stop after the first task is pulled to minimize
3399 * the critical section.
3400 */
3401 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3402 break;
3403#endif
3404 } while (class && max_load_move > total_load_moved);
3405
3406 return total_load_moved > 0;
3407}
3408
3409static int
3410iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3411 struct sched_domain *sd, enum cpu_idle_type idle,
3412 struct rq_iterator *iterator)
3413{
3414 struct task_struct *p = iterator->start(iterator->arg);
3415 int pinned = 0;
3416
3417 while (p) {
3418 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3419 pull_task(busiest, p, this_rq, this_cpu);
3420 /*
3421 * Right now, this is only the second place pull_task()
3422 * is called, so we can safely collect pull_task()
3423 * stats here rather than inside pull_task().
3424 */
3425 schedstat_inc(sd, lb_gained[idle]);
3426
3427 return 1;
3428 }
3429 p = iterator->next(iterator->arg);
3430 }
3431
3432 return 0;
3433}
3434
3435/*
3436 * move_one_task tries to move exactly one task from busiest to this_rq, as
3437 * part of active balancing operations within "domain".
3438 * Returns 1 if successful and 0 otherwise.
3439 *
3440 * Called with both runqueues locked.
3441 */
3442static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3443 struct sched_domain *sd, enum cpu_idle_type idle)
3444{
3445 const struct sched_class *class;
3446
3447 for_each_class(class) {
3448 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3449 return 1;
3450 }
3451
3452 return 0;
3453}
3454/********** Helpers for find_busiest_group ************************/
3455/*
3456 * sd_lb_stats - Structure to store the statistics of a sched_domain
3457 * during load balancing.
3458 */
3459struct sd_lb_stats {
3460 struct sched_group *busiest; /* Busiest group in this sd */
3461 struct sched_group *this; /* Local group in this sd */
3462 unsigned long total_load; /* Total load of all groups in sd */
3463 unsigned long total_pwr; /* Total power of all groups in sd */
3464 unsigned long avg_load; /* Average load across all groups in sd */
3465
3466 /** Statistics of this group */
3467 unsigned long this_load;
3468 unsigned long this_load_per_task;
3469 unsigned long this_nr_running;
3470
3471 /* Statistics of the busiest group */
3472 unsigned long max_load;
3473 unsigned long busiest_load_per_task;
3474 unsigned long busiest_nr_running;
3475
3476 int group_imb; /* Is there imbalance in this sd */
3477#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3478 int power_savings_balance; /* Is powersave balance needed for this sd */
3479 struct sched_group *group_min; /* Least loaded group in sd */
3480 struct sched_group *group_leader; /* Group which relieves group_min */
3481 unsigned long min_load_per_task; /* load_per_task in group_min */
3482 unsigned long leader_nr_running; /* Nr running of group_leader */
3483 unsigned long min_nr_running; /* Nr running of group_min */
3484#endif
3485};
3486
3487/*
3488 * sg_lb_stats - stats of a sched_group required for load_balancing
3489 */
3490struct sg_lb_stats {
3491 unsigned long avg_load; /*Avg load across the CPUs of the group */
3492 unsigned long group_load; /* Total load over the CPUs of the group */
3493 unsigned long sum_nr_running; /* Nr tasks running in the group */
3494 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
3495 unsigned long group_capacity;
3496 int group_imb; /* Is there an imbalance in the group ? */
3497};
3498
3499/**
3500 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3501 * @group: The group whose first cpu is to be returned.
3502 */
3503static inline unsigned int group_first_cpu(struct sched_group *group)
3504{
3505 return cpumask_first(sched_group_cpus(group));
3506}
3507
3508/**
3509 * get_sd_load_idx - Obtain the load index for a given sched domain.
3510 * @sd: The sched_domain whose load_idx is to be obtained.
3511 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
3512 */
3513static inline int get_sd_load_idx(struct sched_domain *sd,
3514 enum cpu_idle_type idle)
3515{
3516 int load_idx;
3517
3518 switch (idle) {
3519 case CPU_NOT_IDLE:
3520 load_idx = sd->busy_idx;
3521 break;
3522
3523 case CPU_NEWLY_IDLE:
3524 load_idx = sd->newidle_idx;
3525 break;
3526 default:
3527 load_idx = sd->idle_idx;
3528 break;
3529 }
3530
3531 return load_idx;
3532}
3533
3534
3535#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3536/**
3537 * init_sd_power_savings_stats - Initialize power savings statistics for
3538 * the given sched_domain, during load balancing.
3539 *
3540 * @sd: Sched domain whose power-savings statistics are to be initialized.
3541 * @sds: Variable containing the statistics for sd.
3542 * @idle: Idle status of the CPU at which we're performing load-balancing.
3543 */
3544static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3545 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3546{
3547 /*
3548 * Busy processors will not participate in power savings
3549 * balance.
3550 */
3551 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3552 sds->power_savings_balance = 0;
3553 else {
3554 sds->power_savings_balance = 1;
3555 sds->min_nr_running = ULONG_MAX;
3556 sds->leader_nr_running = 0;
3557 }
3558}
3559
3560/**
3561 * update_sd_power_savings_stats - Update the power saving stats for a
3562 * sched_domain while performing load balancing.
3563 *
3564 * @group: sched_group belonging to the sched_domain under consideration.
3565 * @sds: Variable containing the statistics of the sched_domain
3566 * @local_group: Does group contain the CPU for which we're performing
3567 * load balancing ?
3568 * @sgs: Variable containing the statistics of the group.
3569 */
3570static inline void update_sd_power_savings_stats(struct sched_group *group,
3571 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3572{
3573
3574 if (!sds->power_savings_balance)
3575 return;
3576
3577 /*
3578 * If the local group is idle or completely loaded
3579 * no need to do power savings balance at this domain
3580 */
3581 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3582 !sds->this_nr_running))
3583 sds->power_savings_balance = 0;
3584
3585 /*
3586 * If a group is already running at full capacity or idle,
3587 * don't include that group in power savings calculations
3588 */
3589 if (!sds->power_savings_balance ||
3590 sgs->sum_nr_running >= sgs->group_capacity ||
3591 !sgs->sum_nr_running)
3592 return;
3593
3594 /*
3595 * Calculate the group which has the least non-idle load.
3596 * This is the group from where we need to pick up the load
3597 * for saving power
3598 */
3599 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3600 (sgs->sum_nr_running == sds->min_nr_running &&
3601 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3602 sds->group_min = group;
3603 sds->min_nr_running = sgs->sum_nr_running;
3604 sds->min_load_per_task = sgs->sum_weighted_load /
3605 sgs->sum_nr_running;
3606 }
3607
3608 /*
3609 * Calculate the group which is almost near its
3610 * capacity but still has some space to pick up some load
3611 * from other group and save more power
3612 */
3613 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3614 return;
3615
3616 if (sgs->sum_nr_running > sds->leader_nr_running ||
3617 (sgs->sum_nr_running == sds->leader_nr_running &&
3618 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3619 sds->group_leader = group;
3620 sds->leader_nr_running = sgs->sum_nr_running;
3621 }
3622}
3623
3624/**
3625 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3626 * @sds: Variable containing the statistics of the sched_domain
3627 * under consideration.
3628 * @this_cpu: Cpu at which we're currently performing load-balancing.
3629 * @imbalance: Variable to store the imbalance.
3630 *
3631 * Description:
3632 * Check if we have potential to perform some power-savings balance.
3633 * If yes, set the busiest group to be the least loaded group in the
3634 * sched_domain, so that it's CPUs can be put to idle.
3635 *
3636 * Returns 1 if there is potential to perform power-savings balance.
3637 * Else returns 0.
3638 */
3639static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3640 int this_cpu, unsigned long *imbalance)
3641{
3642 if (!sds->power_savings_balance)
3643 return 0;
3644
3645 if (sds->this != sds->group_leader ||
3646 sds->group_leader == sds->group_min)
3647 return 0;
3648
3649 *imbalance = sds->min_load_per_task;
3650 sds->busiest = sds->group_min;
3651
3652 return 1;
3653
3654}
3655#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3656static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3657 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3658{
3659 return;
3660}
3661
3662static inline void update_sd_power_savings_stats(struct sched_group *group,
3663 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3664{
3665 return;
3666}
3667
3668static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3669 int this_cpu, unsigned long *imbalance)
3670{
3671 return 0;
3672}
3673#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3674
3675
3676unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3677{
3678 return SCHED_LOAD_SCALE;
3679}
3680
3681unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3682{
3683 return default_scale_freq_power(sd, cpu);
3684}
3685
3686unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3687{
3688 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3689 unsigned long smt_gain = sd->smt_gain;
3690
3691 smt_gain /= weight;
3692
3693 return smt_gain;
3694}
3695
3696unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3697{
3698 return default_scale_smt_power(sd, cpu);
3699}
3700
3701unsigned long scale_rt_power(int cpu)
3702{
3703 struct rq *rq = cpu_rq(cpu);
3704 u64 total, available;
3705
3706 sched_avg_update(rq);
3707
3708 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3709 available = total - rq->rt_avg;
3710
3711 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3712 total = SCHED_LOAD_SCALE;
3713
3714 total >>= SCHED_LOAD_SHIFT;
3715
3716 return div_u64(available, total);
3717}
3718
3719static void update_cpu_power(struct sched_domain *sd, int cpu)
3720{
3721 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3722 unsigned long power = SCHED_LOAD_SCALE;
3723 struct sched_group *sdg = sd->groups;
3724
3725 if (sched_feat(ARCH_POWER))
3726 power *= arch_scale_freq_power(sd, cpu);
3727 else
3728 power *= default_scale_freq_power(sd, cpu);
3729
3730 power >>= SCHED_LOAD_SHIFT;
3731
3732 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3733 if (sched_feat(ARCH_POWER))
3734 power *= arch_scale_smt_power(sd, cpu);
3735 else
3736 power *= default_scale_smt_power(sd, cpu);
3737
3738 power >>= SCHED_LOAD_SHIFT;
3739 }
3740
3741 power *= scale_rt_power(cpu);
3742 power >>= SCHED_LOAD_SHIFT;
3743
3744 if (!power)
3745 power = 1;
3746
3747 sdg->cpu_power = power;
3748}
3749
3750static void update_group_power(struct sched_domain *sd, int cpu)
3751{
3752 struct sched_domain *child = sd->child;
3753 struct sched_group *group, *sdg = sd->groups;
3754 unsigned long power;
3755
3756 if (!child) {
3757 update_cpu_power(sd, cpu);
3758 return;
3759 }
3760
3761 power = 0;
3762
3763 group = child->groups;
3764 do {
3765 power += group->cpu_power;
3766 group = group->next;
3767 } while (group != child->groups);
3768
3769 sdg->cpu_power = power;
3770}
3771
3772/**
3773 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3774 * @sd: The sched_domain whose statistics are to be updated.
3775 * @group: sched_group whose statistics are to be updated.
3776 * @this_cpu: Cpu for which load balance is currently performed.
3777 * @idle: Idle status of this_cpu
3778 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3779 * @sd_idle: Idle status of the sched_domain containing group.
3780 * @local_group: Does group contain this_cpu.
3781 * @cpus: Set of cpus considered for load balancing.
3782 * @balance: Should we balance.
3783 * @sgs: variable to hold the statistics for this group.
3784 */
3785static inline void update_sg_lb_stats(struct sched_domain *sd,
3786 struct sched_group *group, int this_cpu,
3787 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3788 int local_group, const struct cpumask *cpus,
3789 int *balance, struct sg_lb_stats *sgs)
3790{
3791 unsigned long load, max_cpu_load, min_cpu_load;
3792 int i;
3793 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3794 unsigned long sum_avg_load_per_task;
3795 unsigned long avg_load_per_task;
3796
3797 if (local_group) {
3798 balance_cpu = group_first_cpu(group);
3799 if (balance_cpu == this_cpu)
3800 update_group_power(sd, this_cpu);
3801 }
3802
3803 /* Tally up the load of all CPUs in the group */
3804 sum_avg_load_per_task = avg_load_per_task = 0;
3805 max_cpu_load = 0;
3806 min_cpu_load = ~0UL;
3807
3808 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3809 struct rq *rq = cpu_rq(i);
3810
3811 if (*sd_idle && rq->nr_running)
3812 *sd_idle = 0;
3813
3814 /* Bias balancing toward cpus of our domain */
3815 if (local_group) {
3816 if (idle_cpu(i) && !first_idle_cpu) {
3817 first_idle_cpu = 1;
3818 balance_cpu = i;
3819 }
3820
3821 load = target_load(i, load_idx);
3822 } else {
3823 load = source_load(i, load_idx);
3824 if (load > max_cpu_load)
3825 max_cpu_load = load;
3826 if (min_cpu_load > load)
3827 min_cpu_load = load;
3828 }
3829
3830 sgs->group_load += load;
3831 sgs->sum_nr_running += rq->nr_running;
3832 sgs->sum_weighted_load += weighted_cpuload(i);
3833
3834 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3835 }
3836
3837 /*
3838 * First idle cpu or the first cpu(busiest) in this sched group
3839 * is eligible for doing load balancing at this and above
3840 * domains. In the newly idle case, we will allow all the cpu's
3841 * to do the newly idle load balance.
3842 */
3843 if (idle != CPU_NEWLY_IDLE && local_group &&
3844 balance_cpu != this_cpu && balance) {
3845 *balance = 0;
3846 return;
3847 }
3848
3849 /* Adjust by relative CPU power of the group */
3850 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
3851
3852
3853 /*
3854 * Consider the group unbalanced when the imbalance is larger
3855 * than the average weight of two tasks.
3856 *
3857 * APZ: with cgroup the avg task weight can vary wildly and
3858 * might not be a suitable number - should we keep a
3859 * normalized nr_running number somewhere that negates
3860 * the hierarchy?
3861 */
3862 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
3863 group->cpu_power;
3864
3865 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3866 sgs->group_imb = 1;
3867
3868 sgs->group_capacity =
3869 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
3870}
3871
3872/**
3873 * update_sd_lb_stats - Update sched_group's statistics for load balancing.
3874 * @sd: sched_domain whose statistics are to be updated.
3875 * @this_cpu: Cpu for which load balance is currently performed.
3876 * @idle: Idle status of this_cpu
3877 * @sd_idle: Idle status of the sched_domain containing group.
3878 * @cpus: Set of cpus considered for load balancing.
3879 * @balance: Should we balance.
3880 * @sds: variable to hold the statistics for this sched_domain.
3881 */
3882static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3883 enum cpu_idle_type idle, int *sd_idle,
3884 const struct cpumask *cpus, int *balance,
3885 struct sd_lb_stats *sds)
3886{
3887 struct sched_domain *child = sd->child;
3888 struct sched_group *group = sd->groups;
3889 struct sg_lb_stats sgs;
3890 int load_idx, prefer_sibling = 0;
3891
3892 if (child && child->flags & SD_PREFER_SIBLING)
3893 prefer_sibling = 1;
3894
3895 init_sd_power_savings_stats(sd, sds, idle);
3896 load_idx = get_sd_load_idx(sd, idle);
3897
3898 do {
3899 int local_group;
3900
3901 local_group = cpumask_test_cpu(this_cpu,
3902 sched_group_cpus(group));
3903 memset(&sgs, 0, sizeof(sgs));
3904 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
3905 local_group, cpus, balance, &sgs);
3906
3907 if (local_group && balance && !(*balance))
3908 return;
3909
3910 sds->total_load += sgs.group_load;
3911 sds->total_pwr += group->cpu_power;
3912
3913 /*
3914 * In case the child domain prefers tasks go to siblings
3915 * first, lower the group capacity to one so that we'll try
3916 * and move all the excess tasks away.
3917 */
3918 if (prefer_sibling)
3919 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3920
3921 if (local_group) {
3922 sds->this_load = sgs.avg_load;
3923 sds->this = group;
3924 sds->this_nr_running = sgs.sum_nr_running;
3925 sds->this_load_per_task = sgs.sum_weighted_load;
3926 } else if (sgs.avg_load > sds->max_load &&
3927 (sgs.sum_nr_running > sgs.group_capacity ||
3928 sgs.group_imb)) {
3929 sds->max_load = sgs.avg_load;
3930 sds->busiest = group;
3931 sds->busiest_nr_running = sgs.sum_nr_running;
3932 sds->busiest_load_per_task = sgs.sum_weighted_load;
3933 sds->group_imb = sgs.group_imb;
3934 }
3935
3936 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3937 group = group->next;
3938 } while (group != sd->groups);
3939}
3940
3941/**
3942 * fix_small_imbalance - Calculate the minor imbalance that exists
3943 * amongst the groups of a sched_domain, during
3944 * load balancing.
3945 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3946 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3947 * @imbalance: Variable to store the imbalance.
3948 */
3949static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3950 int this_cpu, unsigned long *imbalance)
3951{
3952 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3953 unsigned int imbn = 2;
3954
3955 if (sds->this_nr_running) {
3956 sds->this_load_per_task /= sds->this_nr_running;
3957 if (sds->busiest_load_per_task >
3958 sds->this_load_per_task)
3959 imbn = 1;
3960 } else
3961 sds->this_load_per_task =
3962 cpu_avg_load_per_task(this_cpu);
3963
3964 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
3965 sds->busiest_load_per_task * imbn) {
3966 *imbalance = sds->busiest_load_per_task;
3967 return;
3968 }
3969
3970 /*
3971 * OK, we don't have enough imbalance to justify moving tasks,
3972 * however we may be able to increase total CPU power used by
3973 * moving them.
3974 */
3975
3976 pwr_now += sds->busiest->cpu_power *
3977 min(sds->busiest_load_per_task, sds->max_load);
3978 pwr_now += sds->this->cpu_power *
3979 min(sds->this_load_per_task, sds->this_load);
3980 pwr_now /= SCHED_LOAD_SCALE;
3981
3982 /* Amount of load we'd subtract */
3983 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3984 sds->busiest->cpu_power;
3985 if (sds->max_load > tmp)
3986 pwr_move += sds->busiest->cpu_power *
3987 min(sds->busiest_load_per_task, sds->max_load - tmp);
3988
3989 /* Amount of load we'd add */
3990 if (sds->max_load * sds->busiest->cpu_power <
3991 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3992 tmp = (sds->max_load * sds->busiest->cpu_power) /
3993 sds->this->cpu_power;
3994 else
3995 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3996 sds->this->cpu_power;
3997 pwr_move += sds->this->cpu_power *
3998 min(sds->this_load_per_task, sds->this_load + tmp);
3999 pwr_move /= SCHED_LOAD_SCALE;
4000
4001 /* Move if we gain throughput */
4002 if (pwr_move > pwr_now)
4003 *imbalance = sds->busiest_load_per_task;
4004}
4005
4006/**
4007 * calculate_imbalance - Calculate the amount of imbalance present within the
4008 * groups of a given sched_domain during load balance.
4009 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
4010 * @this_cpu: Cpu for which currently load balance is being performed.
4011 * @imbalance: The variable to store the imbalance.
4012 */
4013static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4014 unsigned long *imbalance)
4015{
4016 unsigned long max_pull;
4017 /*
4018 * In the presence of smp nice balancing, certain scenarios can have
4019 * max load less than avg load(as we skip the groups at or below
4020 * its cpu_power, while calculating max_load..)
4021 */
4022 if (sds->max_load < sds->avg_load) {
4023 *imbalance = 0;
4024 return fix_small_imbalance(sds, this_cpu, imbalance);
4025 }
4026
4027 /* Don't want to pull so many tasks that a group would go idle */
4028 max_pull = min(sds->max_load - sds->avg_load,
4029 sds->max_load - sds->busiest_load_per_task);
4030
4031 /* How much load to actually move to equalise the imbalance */
4032 *imbalance = min(max_pull * sds->busiest->cpu_power,
4033 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
4034 / SCHED_LOAD_SCALE;
4035
4036 /*
4037 * if *imbalance is less than the average load per runnable task
4038 * there is no gaurantee that any tasks will be moved so we'll have
4039 * a think about bumping its value to force at least one task to be
4040 * moved
4041 */
4042 if (*imbalance < sds->busiest_load_per_task)
4043 return fix_small_imbalance(sds, this_cpu, imbalance);
4044
4045}
4046/******* find_busiest_group() helpers end here *********************/
4047
4048/**
4049 * find_busiest_group - Returns the busiest group within the sched_domain
4050 * if there is an imbalance. If there isn't an imbalance, and
4051 * the user has opted for power-savings, it returns a group whose
4052 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
4053 * such a group exists.
4054 *
4055 * Also calculates the amount of weighted load which should be moved
4056 * to restore balance.
4057 *
4058 * @sd: The sched_domain whose busiest group is to be returned.
4059 * @this_cpu: The cpu for which load balancing is currently being performed.
4060 * @imbalance: Variable which stores amount of weighted load which should
4061 * be moved to restore balance/put a group to idle.
4062 * @idle: The idle status of this_cpu.
4063 * @sd_idle: The idleness of sd
4064 * @cpus: The set of CPUs under consideration for load-balancing.
4065 * @balance: Pointer to a variable indicating if this_cpu
4066 * is the appropriate cpu to perform load balancing at this_level.
4067 *
4068 * Returns: - the busiest group if imbalance exists.
4069 * - If no imbalance and user has opted for power-savings balance,
4070 * return the least loaded group whose CPUs can be
4071 * put to idle by rebalancing its tasks onto our group.
4072 */
4073static struct sched_group *
4074find_busiest_group(struct sched_domain *sd, int this_cpu,
4075 unsigned long *imbalance, enum cpu_idle_type idle,
4076 int *sd_idle, const struct cpumask *cpus, int *balance)
4077{
4078 struct sd_lb_stats sds;
4079
4080 memset(&sds, 0, sizeof(sds));
4081
4082 /*
4083 * Compute the various statistics relavent for load balancing at
4084 * this level.
4085 */
4086 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
4087 balance, &sds);
4088
4089 /* Cases where imbalance does not exist from POV of this_cpu */
4090 /* 1) this_cpu is not the appropriate cpu to perform load balancing
4091 * at this level.
4092 * 2) There is no busy sibling group to pull from.
4093 * 3) This group is the busiest group.
4094 * 4) This group is more busy than the avg busieness at this
4095 * sched_domain.
4096 * 5) The imbalance is within the specified limit.
4097 * 6) Any rebalance would lead to ping-pong
4098 */
4099 if (balance && !(*balance))
4100 goto ret;
4101
4102 if (!sds.busiest || sds.busiest_nr_running == 0)
4103 goto out_balanced;
4104
4105 if (sds.this_load >= sds.max_load)
4106 goto out_balanced;
4107
4108 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
4109
4110 if (sds.this_load >= sds.avg_load)
4111 goto out_balanced;
4112
4113 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
4114 goto out_balanced;
4115
4116 sds.busiest_load_per_task /= sds.busiest_nr_running;
4117 if (sds.group_imb)
4118 sds.busiest_load_per_task =
4119 min(sds.busiest_load_per_task, sds.avg_load);
4120
4121 /*
4122 * We're trying to get all the cpus to the average_load, so we don't
4123 * want to push ourselves above the average load, nor do we wish to
4124 * reduce the max loaded cpu below the average load, as either of these
4125 * actions would just result in more rebalancing later, and ping-pong
4126 * tasks around. Thus we look for the minimum possible imbalance.
4127 * Negative imbalances (*we* are more loaded than anyone else) will
4128 * be counted as no imbalance for these purposes -- we can't fix that
4129 * by pulling tasks to us. Be careful of negative numbers as they'll
4130 * appear as very large values with unsigned longs.
4131 */
4132 if (sds.max_load <= sds.busiest_load_per_task)
4133 goto out_balanced;
4134
4135 /* Looks like there is an imbalance. Compute it */
4136 calculate_imbalance(&sds, this_cpu, imbalance);
4137 return sds.busiest;
4138
4139out_balanced:
4140 /*
4141 * There is no obvious imbalance. But check if we can do some balancing
4142 * to save power.
4143 */
4144 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
4145 return sds.busiest;
4146ret:
4147 *imbalance = 0;
4148 return NULL;
4149}
4150
4151/*
4152 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4153 */
4154static struct rq *
4155find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
4156 unsigned long imbalance, const struct cpumask *cpus)
4157{
4158 struct rq *busiest = NULL, *rq;
4159 unsigned long max_load = 0;
4160 int i;
4161
4162 for_each_cpu(i, sched_group_cpus(group)) {
4163 unsigned long power = power_of(i);
4164 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
4165 unsigned long wl;
4166
4167 if (!cpumask_test_cpu(i, cpus))
4168 continue;
4169
4170 rq = cpu_rq(i);
4171 wl = weighted_cpuload(i);
4172
4173 /*
4174 * When comparing with imbalance, use weighted_cpuload()
4175 * which is not scaled with the cpu power.
4176 */
4177 if (capacity && rq->nr_running == 1 && wl > imbalance)
4178 continue;
4179
4180 /*
4181 * For the load comparisons with the other cpu's, consider
4182 * the weighted_cpuload() scaled with the cpu power, so that
4183 * the load can be moved away from the cpu that is potentially
4184 * running at a lower capacity.
4185 */
4186 wl = (wl * SCHED_LOAD_SCALE) / power;
4187
4188 if (wl > max_load) {
4189 max_load = wl;
4190 busiest = rq;
4191 }
4192 }
4193
4194 return busiest;
4195}
4196
4197/*
4198 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
4199 * so long as it is large enough.
4200 */
4201#define MAX_PINNED_INTERVAL 512
4202
4203/* Working cpumask for load_balance and load_balance_newidle. */
4204static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4205
4206/*
4207 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4208 * tasks if there is an imbalance.
4209 */
4210static int load_balance(int this_cpu, struct rq *this_rq,
4211 struct sched_domain *sd, enum cpu_idle_type idle,
4212 int *balance)
4213{
4214 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
4215 struct sched_group *group;
4216 unsigned long imbalance;
4217 struct rq *busiest;
4218 unsigned long flags;
4219 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4220
4221 cpumask_copy(cpus, cpu_active_mask);
4222
4223 /*
4224 * When power savings policy is enabled for the parent domain, idle
4225 * sibling can pick up load irrespective of busy siblings. In this case,
4226 * let the state of idle sibling percolate up as CPU_IDLE, instead of
4227 * portraying it as CPU_NOT_IDLE.
4228 */
4229 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
4230 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4231 sd_idle = 1;
4232
4233 schedstat_inc(sd, lb_count[idle]);
4234
4235redo:
4236 update_shares(sd);
4237 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
4238 cpus, balance);
4239
4240 if (*balance == 0)
4241 goto out_balanced;
4242
4243 if (!group) {
4244 schedstat_inc(sd, lb_nobusyg[idle]);
4245 goto out_balanced;
4246 }
4247
4248 busiest = find_busiest_queue(group, idle, imbalance, cpus);
4249 if (!busiest) {
4250 schedstat_inc(sd, lb_nobusyq[idle]);
4251 goto out_balanced;
4252 }
4253
4254 BUG_ON(busiest == this_rq);
4255
4256 schedstat_add(sd, lb_imbalance[idle], imbalance);
4257
4258 ld_moved = 0;
4259 if (busiest->nr_running > 1) {
4260 /*
4261 * Attempt to move tasks. If find_busiest_group has found
4262 * an imbalance but busiest->nr_running <= 1, the group is
4263 * still unbalanced. ld_moved simply stays zero, so it is
4264 * correctly treated as an imbalance.
4265 */
4266 local_irq_save(flags);
4267 double_rq_lock(this_rq, busiest);
4268 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4269 imbalance, sd, idle, &all_pinned);
4270 double_rq_unlock(this_rq, busiest);
4271 local_irq_restore(flags);
4272
4273 /*
4274 * some other cpu did the load balance for us.
4275 */
4276 if (ld_moved && this_cpu != smp_processor_id())
4277 resched_cpu(this_cpu);
4278
4279 /* All tasks on this runqueue were pinned by CPU affinity */
4280 if (unlikely(all_pinned)) {
4281 cpumask_clear_cpu(cpu_of(busiest), cpus);
4282 if (!cpumask_empty(cpus))
4283 goto redo;
4284 goto out_balanced;
4285 }
4286 }
4287
4288 if (!ld_moved) {
4289 schedstat_inc(sd, lb_failed[idle]);
4290 sd->nr_balance_failed++;
4291
4292 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
4293
4294 raw_spin_lock_irqsave(&busiest->lock, flags);
4295
4296 /* don't kick the migration_thread, if the curr
4297 * task on busiest cpu can't be moved to this_cpu
4298 */
4299 if (!cpumask_test_cpu(this_cpu,
4300 &busiest->curr->cpus_allowed)) {
4301 raw_spin_unlock_irqrestore(&busiest->lock,
4302 flags);
4303 all_pinned = 1;
4304 goto out_one_pinned;
4305 }
4306
4307 if (!busiest->active_balance) {
4308 busiest->active_balance = 1;
4309 busiest->push_cpu = this_cpu;
4310 active_balance = 1;
4311 }
4312 raw_spin_unlock_irqrestore(&busiest->lock, flags);
4313 if (active_balance)
4314 wake_up_process(busiest->migration_thread);
4315
4316 /*
4317 * We've kicked active balancing, reset the failure
4318 * counter.
4319 */
4320 sd->nr_balance_failed = sd->cache_nice_tries+1;
4321 }
4322 } else
4323 sd->nr_balance_failed = 0;
4324
4325 if (likely(!active_balance)) {
4326 /* We were unbalanced, so reset the balancing interval */
4327 sd->balance_interval = sd->min_interval;
4328 } else {
4329 /*
4330 * If we've begun active balancing, start to back off. This
4331 * case may not be covered by the all_pinned logic if there
4332 * is only 1 task on the busy runqueue (because we don't call
4333 * move_tasks).
4334 */
4335 if (sd->balance_interval < sd->max_interval)
4336 sd->balance_interval *= 2;
4337 }
4338
4339 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4340 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4341 ld_moved = -1;
4342
4343 goto out;
4344
4345out_balanced:
4346 schedstat_inc(sd, lb_balanced[idle]);
4347
4348 sd->nr_balance_failed = 0;
4349
4350out_one_pinned:
4351 /* tune up the balancing interval */
4352 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
4353 (sd->balance_interval < sd->max_interval))
4354 sd->balance_interval *= 2;
4355
4356 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4357 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4358 ld_moved = -1;
4359 else
4360 ld_moved = 0;
4361out:
4362 if (ld_moved)
4363 update_shares(sd);
4364 return ld_moved;
4365}
4366
4367/*
4368 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4369 * tasks if there is an imbalance.
4370 *
4371 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
4372 * this_rq is locked.
4373 */
4374static int
4375load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4376{
4377 struct sched_group *group;
4378 struct rq *busiest = NULL;
4379 unsigned long imbalance;
4380 int ld_moved = 0;
4381 int sd_idle = 0;
4382 int all_pinned = 0;
4383 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4384
4385 cpumask_copy(cpus, cpu_active_mask);
4386
4387 /*
4388 * When power savings policy is enabled for the parent domain, idle
4389 * sibling can pick up load irrespective of busy siblings. In this case,
4390 * let the state of idle sibling percolate up as IDLE, instead of
4391 * portraying it as CPU_NOT_IDLE.
4392 */
4393 if (sd->flags & SD_SHARE_CPUPOWER &&
4394 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4395 sd_idle = 1;
4396
4397 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
4398redo:
4399 update_shares_locked(this_rq, sd);
4400 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
4401 &sd_idle, cpus, NULL);
4402 if (!group) {
4403 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
4404 goto out_balanced;
4405 }
4406
4407 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
4408 if (!busiest) {
4409 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
4410 goto out_balanced;
4411 }
4412
4413 BUG_ON(busiest == this_rq);
4414
4415 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
4416
4417 ld_moved = 0;
4418 if (busiest->nr_running > 1) {
4419 /* Attempt to move tasks */
4420 double_lock_balance(this_rq, busiest);
4421 /* this_rq->clock is already updated */
4422 update_rq_clock(busiest);
4423 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4424 imbalance, sd, CPU_NEWLY_IDLE,
4425 &all_pinned);
4426 double_unlock_balance(this_rq, busiest);
4427
4428 if (unlikely(all_pinned)) {
4429 cpumask_clear_cpu(cpu_of(busiest), cpus);
4430 if (!cpumask_empty(cpus))
4431 goto redo;
4432 }
4433 }
4434
4435 if (!ld_moved) {
4436 int active_balance = 0;
4437
4438 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
4439 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4440 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4441 return -1;
4442
4443 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4444 return -1;
4445
4446 if (sd->nr_balance_failed++ < 2)
4447 return -1;
4448
4449 /*
4450 * The only task running in a non-idle cpu can be moved to this
4451 * cpu in an attempt to completely freeup the other CPU
4452 * package. The same method used to move task in load_balance()
4453 * have been extended for load_balance_newidle() to speedup
4454 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
4455 *
4456 * The package power saving logic comes from
4457 * find_busiest_group(). If there are no imbalance, then
4458 * f_b_g() will return NULL. However when sched_mc={1,2} then
4459 * f_b_g() will select a group from which a running task may be
4460 * pulled to this cpu in order to make the other package idle.
4461 * If there is no opportunity to make a package idle and if
4462 * there are no imbalance, then f_b_g() will return NULL and no
4463 * action will be taken in load_balance_newidle().
4464 *
4465 * Under normal task pull operation due to imbalance, there
4466 * will be more than one task in the source run queue and
4467 * move_tasks() will succeed. ld_moved will be true and this
4468 * active balance code will not be triggered.
4469 */
4470
4471 /* Lock busiest in correct order while this_rq is held */
4472 double_lock_balance(this_rq, busiest);
4473
4474 /*
4475 * don't kick the migration_thread, if the curr
4476 * task on busiest cpu can't be moved to this_cpu
4477 */
4478 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
4479 double_unlock_balance(this_rq, busiest);
4480 all_pinned = 1;
4481 return ld_moved;
4482 }
4483
4484 if (!busiest->active_balance) {
4485 busiest->active_balance = 1;
4486 busiest->push_cpu = this_cpu;
4487 active_balance = 1;
4488 }
4489
4490 double_unlock_balance(this_rq, busiest);
4491 /*
4492 * Should not call ttwu while holding a rq->lock
4493 */
4494 raw_spin_unlock(&this_rq->lock);
4495 if (active_balance)
4496 wake_up_process(busiest->migration_thread);
4497 raw_spin_lock(&this_rq->lock);
4498
4499 } else
4500 sd->nr_balance_failed = 0;
4501
4502 update_shares_locked(this_rq, sd);
4503 return ld_moved;
4504
4505out_balanced:
4506 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
4507 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4508 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4509 return -1;
4510 sd->nr_balance_failed = 0;
4511
4512 return 0;
4513}
4514
4515/*
4516 * idle_balance is called by schedule() if this_cpu is about to become
4517 * idle. Attempts to pull tasks from other CPUs.
4518 */
4519static void idle_balance(int this_cpu, struct rq *this_rq)
4520{
4521 struct sched_domain *sd;
4522 int pulled_task = 0;
4523 unsigned long next_balance = jiffies + HZ;
4524
4525 this_rq->idle_stamp = this_rq->clock;
4526
4527 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4528 return;
4529
4530 for_each_domain(this_cpu, sd) {
4531 unsigned long interval;
4532
4533 if (!(sd->flags & SD_LOAD_BALANCE))
4534 continue;
4535
4536 if (sd->flags & SD_BALANCE_NEWIDLE)
4537 /* If we've pulled tasks over stop searching: */
4538 pulled_task = load_balance_newidle(this_cpu, this_rq,
4539 sd);
4540
4541 interval = msecs_to_jiffies(sd->balance_interval);
4542 if (time_after(next_balance, sd->last_balance + interval))
4543 next_balance = sd->last_balance + interval;
4544 if (pulled_task) {
4545 this_rq->idle_stamp = 0;
4546 break;
4547 }
4548 }
4549 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4550 /*
4551		 * We are going idle. next_balance may have been set based on
4552		 * a busy processor, so reset it.
4553 */
4554 this_rq->next_balance = next_balance;
4555 }
4556}
4557
4558/*
4559 * active_load_balance is run by migration threads. It pushes running tasks
4560 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
4561 * running on each physical CPU where possible, and avoids physical /
4562 * logical imbalances.
4563 *
4564 * Called with busiest_rq locked.
4565 */
4566static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4567{
4568 int target_cpu = busiest_rq->push_cpu;
4569 struct sched_domain *sd;
4570 struct rq *target_rq;
4571
4572 /* Is there any task to move? */
4573 if (busiest_rq->nr_running <= 1)
4574 return;
4575
4576 target_rq = cpu_rq(target_cpu);
4577
4578 /*
4579	 * This condition is "impossible"; if it occurs,
4580	 * we need to fix it. Originally reported by
4581 * Bjorn Helgaas on a 128-cpu setup.
4582 */
4583 BUG_ON(busiest_rq == target_rq);
4584
4585 /* move a task from busiest_rq to target_rq */
4586 double_lock_balance(busiest_rq, target_rq);
4587 update_rq_clock(busiest_rq);
4588 update_rq_clock(target_rq);
4589
4590 /* Search for an sd spanning us and the target CPU. */
4591 for_each_domain(target_cpu, sd) {
4592 if ((sd->flags & SD_LOAD_BALANCE) &&
4593 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
4594 break;
4595 }
4596
4597 if (likely(sd)) {
4598 schedstat_inc(sd, alb_count);
4599
4600 if (move_one_task(target_rq, target_cpu, busiest_rq,
4601 sd, CPU_IDLE))
4602 schedstat_inc(sd, alb_pushed);
4603 else
4604 schedstat_inc(sd, alb_failed);
4605 }
4606 double_unlock_balance(busiest_rq, target_rq);
4607}
4608
4609#ifdef CONFIG_NO_HZ
4610static struct {
4611 atomic_t load_balancer;
4612 cpumask_var_t cpu_mask;
4613 cpumask_var_t ilb_grp_nohz_mask;
4614} nohz ____cacheline_aligned = {
4615 .load_balancer = ATOMIC_INIT(-1),
4616};
4617
4618int get_nohz_load_balancer(void)
4619{
4620 return atomic_read(&nohz.load_balancer);
4621}
4622
4623#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4624/**
4625 * lowest_flag_domain - Return lowest sched_domain containing flag.
4626 * @cpu: The cpu whose lowest level of sched domain is to
4627 * be returned.
4628 * @flag: The flag to check for the lowest sched_domain
4629 * for the given cpu.
4630 *
4631 * Returns the lowest sched_domain of a cpu which contains the given flag.
4632 */
4633static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4634{
4635 struct sched_domain *sd;
4636
4637 for_each_domain(cpu, sd)
4638 if (sd && (sd->flags & flag))
4639 break;
4640
4641 return sd;
4642}
4643
4644/**
4645 * for_each_flag_domain - Iterates over sched_domains containing the flag.
4646 * @cpu: The cpu whose domains we're iterating over.
4647 * @sd: variable holding the value of the power_savings_sd
4648 * for cpu.
4649 * @flag: The flag to filter the sched_domains to be iterated.
4650 *
4651 * Iterates over all the sched domains for a given cpu that have the 'flag'
4652 * set, starting from the lowest sched_domain to the highest.
4653 */
4654#define for_each_flag_domain(cpu, sd, flag) \
4655 for (sd = lowest_flag_domain(cpu, flag); \
4656 (sd && (sd->flags & flag)); sd = sd->parent)
4657
4658/**
4659 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4660 * @ilb_group: group to be checked for semi-idleness
4661 *
4662 * Returns: 1 if the group is semi-idle. 0 otherwise.
4663 *
4664 * We define a sched_group to be semi-idle if it has at least one idle CPU
4665 * and at least one non-idle CPU. This helper function checks if the given
4666 * sched_group is semi-idle or not.
4667 */
4668static inline int is_semi_idle_group(struct sched_group *ilb_group)
4669{
4670 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
4671 sched_group_cpus(ilb_group));
4672
4673 /*
4674	 * A sched_group is semi-idle when it has at least one busy cpu
4675	 * and at least one idle cpu.
4676 */
4677 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
4678 return 0;
4679
4680 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
4681 return 0;
4682
4683 return 1;
4684}
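The semi-idle test is just two mask comparisons. Below is a self-contained userspace toy version, assuming fixed 64-bit CPU masks in place of cpumask_var_t; the names and the sample values are illustrative only.

#include <stdint.h>
#include <stdio.h>

/*
 * Toy model of is_semi_idle_group(): 'group' is the set of CPUs in the
 * sched_group, 'nohz_idle' is the set of tickless (idle) CPUs. The group
 * is semi-idle iff it has at least one idle CPU and at least one busy one.
 */
static int toy_is_semi_idle_group(uint64_t group, uint64_t nohz_idle)
{
	uint64_t idle_in_group = group & nohz_idle;	/* cpumask_and() */

	if (idle_in_group == 0)			/* all CPUs busy */
		return 0;
	if (idle_in_group == group)		/* all CPUs idle */
		return 0;
	return 1;
}

int main(void)
{
	/* CPUs 0-3 form the group; CPUs 1 and 2 have stopped their ticks. */
	uint64_t group = 0xf, nohz_idle = 0x6;

	printf("semi-idle: %d\n", toy_is_semi_idle_group(group, nohz_idle)); /* 1 */
	printf("all idle:  %d\n", toy_is_semi_idle_group(group, 0xf));       /* 0 */
	printf("all busy:  %d\n", toy_is_semi_idle_group(group, 0x0));       /* 0 */
	return 0;
}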
4685/**
4686 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4687 * @cpu: The cpu which is nominating a new idle_load_balancer.
4688 *
4689 * Returns: The id of the idle load balancer if one exists;
4690 * else, a value >= nr_cpu_ids.
4691 *
4692 * This algorithm picks the idle load balancer such that it belongs to a
4693 * semi-idle powersavings sched_domain. The idea is to avoid waking up a
4694 * completely idle package/core just for the purpose of idle load balancing
4695 * when there are other idle CPUs which are better suited for that job.
4696 */
4697static int find_new_ilb(int cpu)
4698{
4699 struct sched_domain *sd;
4700 struct sched_group *ilb_group;
4701
4702 /*
4703	 * Select the idle load balancer from semi-idle packages only
4704	 * when power-aware load balancing is enabled.
4705 */
4706 if (!(sched_smt_power_savings || sched_mc_power_savings))
4707 goto out_done;
4708
4709 /*
4710 * Optimize for the case when we have no idle CPUs or only one
4711 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4712 */
4713 if (cpumask_weight(nohz.cpu_mask) < 2)
4714 goto out_done;
4715
4716 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4717 ilb_group = sd->groups;
4718
4719 do {
4720 if (is_semi_idle_group(ilb_group))
4721 return cpumask_first(nohz.ilb_grp_nohz_mask);
4722
4723 ilb_group = ilb_group->next;
4724
4725 } while (ilb_group != sd->groups);
4726 }
4727
4728out_done:
4729 return cpumask_first(nohz.cpu_mask);
4730}
4731#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4732static inline int find_new_ilb(int call_cpu)
4733{
4734 return cpumask_first(nohz.cpu_mask);
4735}
4736#endif
4737
4738/*
4739 * This routine tries to nominate the ilb (idle load balancing)
4740 * owner among the cpus whose ticks are stopped. The ilb owner does the idle
4741 * load balancing on behalf of all those cpus. If all the cpus in the system
4742 * go into this tickless mode, then there will be no ilb owner (as there is
4743 * no need for one) and all the cpus will sleep till the next wakeup event
4744 * arrives...
4745 *
4746 * For the ilb owner, the tick is not stopped, and this tick will be used
4747 * for idle load balancing. The ilb owner will still be part of
4748 * nohz.cpu_mask.
4749 *
4750 * While stopping the tick, this cpu becomes the ilb owner if there
4751 * is no other owner, and it remains the owner until that cpu becomes busy
4752 * or until all cpus in the system stop their ticks, at which point
4753 * there is no need for an ilb owner.
4754 *
4755 * When the ilb owner becomes busy, it nominates another owner during the
4756 * next busy scheduler_tick().
4757 */
4758int select_nohz_load_balancer(int stop_tick)
4759{
4760 int cpu = smp_processor_id();
4761
4762 if (stop_tick) {
4763 cpu_rq(cpu)->in_nohz_recently = 1;
4764
4765 if (!cpu_active(cpu)) {
4766 if (atomic_read(&nohz.load_balancer) != cpu)
4767 return 0;
4768
4769 /*
4770 * If we are going offline and still the leader,
4771 * give up!
4772 */
4773 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4774 BUG();
4775
4776 return 0;
4777 }
4778
4779 cpumask_set_cpu(cpu, nohz.cpu_mask);
4780
4781 /* time for ilb owner also to sleep */
4782 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
4783 if (atomic_read(&nohz.load_balancer) == cpu)
4784 atomic_set(&nohz.load_balancer, -1);
4785 return 0;
4786 }
4787
4788 if (atomic_read(&nohz.load_balancer) == -1) {
4789 /* make me the ilb owner */
4790 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4791 return 1;
4792 } else if (atomic_read(&nohz.load_balancer) == cpu) {
4793 int new_ilb;
4794
4795 if (!(sched_smt_power_savings ||
4796 sched_mc_power_savings))
4797 return 1;
4798 /*
4799 * Check to see if there is a more power-efficient
4800 * ilb.
4801 */
4802 new_ilb = find_new_ilb(cpu);
4803 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4804 atomic_set(&nohz.load_balancer, -1);
4805 resched_cpu(new_ilb);
4806 return 0;
4807 }
4808 return 1;
4809 }
4810 } else {
4811 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
4812 return 0;
4813
4814 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4815
4816 if (atomic_read(&nohz.load_balancer) == cpu)
4817 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4818 BUG();
4819 }
4820 return 0;
4821}
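The return value convention here is easy to misread: 1 means this cpu is now (or remains) the ilb owner and must keep its tick running, 0 means the tick may be stopped. The sketch below shows the expected call pattern; the wrapper functions are hypothetical, and the real call sites live in the NO_HZ tick code, not in this file.

/*
 * Hypothetical callers, for illustration only.
 */
static void example_enter_nohz_idle(void)
{
	/* Stopping the tick: volunteer as idle-load-balance owner. */
	if (select_nohz_load_balancer(1)) {
		/*
		 * We are now the ilb owner: keep the tick running so that
		 * run_rebalance_domains() can balance on behalf of the
		 * other tickless cpus.
		 */
		return;
	}
	/* Not the owner: safe to stop the tick completely. */
}

static void example_exit_nohz_idle(void)
{
	/* Becoming busy again: leave nohz.cpu_mask / give up ilb ownership. */
	select_nohz_load_balancer(0);
}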
4822#endif
4823
4824static DEFINE_SPINLOCK(balancing);
4825
4826/*
4827 * It checks each scheduling domain to see if it is due to be balanced,
4828 * and initiates a balancing operation if so.
4829 *
4830 * Balancing parameters are set up in arch_init_sched_domains.
4831 */
4832static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4833{
4834 int balance = 1;
4835 struct rq *rq = cpu_rq(cpu);
4836 unsigned long interval;
4837 struct sched_domain *sd;
4838 /* Earliest time when we have to do rebalance again */
4839 unsigned long next_balance = jiffies + 60*HZ;
4840 int update_next_balance = 0;
4841 int need_serialize;
4842
4843 for_each_domain(cpu, sd) {
4844 if (!(sd->flags & SD_LOAD_BALANCE))
4845 continue;
4846
4847 interval = sd->balance_interval;
4848 if (idle != CPU_IDLE)
4849 interval *= sd->busy_factor;
4850
4851 /* scale ms to jiffies */
4852 interval = msecs_to_jiffies(interval);
4853 if (unlikely(!interval))
4854 interval = 1;
4855 if (interval > HZ*NR_CPUS/10)
4856 interval = HZ*NR_CPUS/10;
4857
4858 need_serialize = sd->flags & SD_SERIALIZE;
4859
4860 if (need_serialize) {
4861 if (!spin_trylock(&balancing))
4862 goto out;
4863 }
4864
4865 if (time_after_eq(jiffies, sd->last_balance + interval)) {
4866 if (load_balance(cpu, rq, sd, idle, &balance)) {
4867 /*
4868 * We've pulled tasks over so either we're no
4869 * longer idle, or one of our SMT siblings is
4870 * not idle.
4871 */
4872 idle = CPU_NOT_IDLE;
4873 }
4874 sd->last_balance = jiffies;
4875 }
4876 if (need_serialize)
4877 spin_unlock(&balancing);
4878out:
4879 if (time_after(next_balance, sd->last_balance + interval)) {
4880 next_balance = sd->last_balance + interval;
4881 update_next_balance = 1;
4882 }
4883
4884 /*
4885 * Stop the load balance at this level. There is another
4886 * CPU in our sched group which is doing load balancing more
4887 * actively.
4888 */
4889 if (!balance)
4890 break;
4891 }
4892
4893 /*
4894 * next_balance will be updated only when there is a need.
4895	 * For example, when the cpu is attached to the null domain, it will
4896	 * not be updated.
4897 */
4898 if (likely(update_next_balance))
4899 rq->next_balance = next_balance;
4900}
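As a worked example of the interval scaling above, the following self-contained userspace sketch mirrors the clamping steps; HZ, NR_CPUS and the per-domain tunables are made-up illustrative values, and msecs_to_jiffies() is simplified for HZ=1000.

#include <stdio.h>

#define HZ	1000	/* assumed tick rate for this example */
#define NR_CPUS	64	/* assumed kernel config value        */

/* Simplified msecs_to_jiffies() for HZ == 1000: 1 ms == 1 jiffy. */
static unsigned long msecs_to_jiffies_example(unsigned long ms)
{
	return ms;
}

int main(void)
{
	unsigned long balance_interval = 64;	/* ms, per-domain tunable */
	unsigned int busy_factor = 32;		/* per-domain tunable     */
	int cpu_is_idle = 0;			/* i.e. CPU_NOT_IDLE      */
	unsigned long interval = balance_interval;

	if (!cpu_is_idle)			/* busy CPUs balance less often */
		interval *= busy_factor;	/* 64 ms -> 2048 ms             */

	interval = msecs_to_jiffies_example(interval);
	if (!interval)				/* never drop below one jiffy */
		interval = 1;
	if (interval > HZ * NR_CPUS / 10)	/* global upper clamp */
		interval = HZ * NR_CPUS / 10;

	printf("effective interval: %lu jiffies\n", interval);	/* 2048 */
	return 0;
}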
4901
4902/*
4903 * run_rebalance_domains is triggered when needed from the scheduler tick.
4904 * In CONFIG_NO_HZ case, the idle load balance owner will do the
4905 * rebalancing for all the cpus for whom scheduler ticks are stopped.
4906 */
4907static void run_rebalance_domains(struct softirq_action *h)
4908{
4909 int this_cpu = smp_processor_id();
4910 struct rq *this_rq = cpu_rq(this_cpu);
4911 enum cpu_idle_type idle = this_rq->idle_at_tick ?
4912 CPU_IDLE : CPU_NOT_IDLE;
4913
4914 rebalance_domains(this_cpu, idle);
4915
4916#ifdef CONFIG_NO_HZ
4917 /*
4918 * If this cpu is the owner for idle load balancing, then do the
4919 * balancing on behalf of the other idle cpus whose ticks are
4920 * stopped.
4921 */
4922 if (this_rq->idle_at_tick &&
4923 atomic_read(&nohz.load_balancer) == this_cpu) {
4924 struct rq *rq;
4925 int balance_cpu;
4926
4927 for_each_cpu(balance_cpu, nohz.cpu_mask) {
4928 if (balance_cpu == this_cpu)
4929 continue;
4930
4931 /*
4932 * If this cpu gets work to do, stop the load balancing
4933			 * work being done for other cpus. The next load
4934 * balancing owner will pick it up.
4935 */
4936 if (need_resched())
4937 break;
4938
4939 rebalance_domains(balance_cpu, CPU_IDLE);
4940
4941 rq = cpu_rq(balance_cpu);
4942 if (time_after(this_rq->next_balance, rq->next_balance))
4943 this_rq->next_balance = rq->next_balance;
4944 }
4945 }
4946#endif
4947}
4948
4949static inline int on_null_domain(int cpu)
4950{
4951 return !rcu_dereference_sched(cpu_rq(cpu)->sd);
4952}
4953
4954/*
4955 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4956 *
4957 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
4958 * idle load balancing owner or decide to stop the periodic load balancing,
4959 * if the whole system is idle.
4960 */
4961static inline void trigger_load_balance(struct rq *rq, int cpu)
4962{
4963#ifdef CONFIG_NO_HZ
4964 /*
4965 * If we were in the nohz mode recently and busy at the current
4966	 * scheduler tick, then check if we need to nominate a new idle
4967 * load balancer.
4968 */
4969 if (rq->in_nohz_recently && !rq->idle_at_tick) {
4970 rq->in_nohz_recently = 0;
4971
4972 if (atomic_read(&nohz.load_balancer) == cpu) {
4973 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4974 atomic_set(&nohz.load_balancer, -1);
4975 }
4976
4977 if (atomic_read(&nohz.load_balancer) == -1) {
4978 int ilb = find_new_ilb(cpu);
4979
4980 if (ilb < nr_cpu_ids)
4981 resched_cpu(ilb);
4982 }
4983 }
4984
4985 /*
4986 * If this cpu is idle and doing idle load balancing for all the
4987 * cpus with ticks stopped, is it time for that to stop?
4988 */
4989 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4990 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4991 resched_cpu(cpu);
4992 return;
4993 }
4994
4995 /*
4996 * If this cpu is idle and the idle load balancing is done by
4997	 * someone else, then there is no need to raise the SCHED_SOFTIRQ.
4998 */
4999 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
5000 cpumask_test_cpu(cpu, nohz.cpu_mask))
5001 return;
5002#endif
5003 /* Don't need to rebalance while attached to NULL domain */
5004 if (time_after_eq(jiffies, rq->next_balance) &&
5005 likely(!on_null_domain(cpu)))
5006 raise_softirq(SCHED_SOFTIRQ);
5007}
5008
5009#else /* CONFIG_SMP */
5010
5011/*
5012 * on UP we do not need to balance between CPUs:
5013 */
5014static inline void idle_balance(int cpu, struct rq *rq)
5015{
5016}
5017
5018#endif 3164#endif
5019 3165
5020DEFINE_PER_CPU(struct kernel_stat, kstat); 3166DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -6114,7 +4260,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6114 unsigned long flags; 4260 unsigned long flags;
6115 int oldprio, on_rq, running; 4261 int oldprio, on_rq, running;
6116 struct rq *rq; 4262 struct rq *rq;
6117 const struct sched_class *prev_class = p->sched_class; 4263 const struct sched_class *prev_class;
6118 4264
6119 BUG_ON(prio < 0 || prio > MAX_PRIO); 4265 BUG_ON(prio < 0 || prio > MAX_PRIO);
6120 4266
@@ -6122,6 +4268,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6122 update_rq_clock(rq); 4268 update_rq_clock(rq);
6123 4269
6124 oldprio = p->prio; 4270 oldprio = p->prio;
4271 prev_class = p->sched_class;
6125 on_rq = p->se.on_rq; 4272 on_rq = p->se.on_rq;
6126 running = task_current(rq, p); 4273 running = task_current(rq, p);
6127 if (on_rq) 4274 if (on_rq)
@@ -6139,7 +4286,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6139 if (running) 4286 if (running)
6140 p->sched_class->set_curr_task(rq); 4287 p->sched_class->set_curr_task(rq);
6141 if (on_rq) { 4288 if (on_rq) {
6142 enqueue_task(rq, p, 0); 4289 enqueue_task(rq, p, 0, oldprio < prio);
6143 4290
6144 check_class_changed(rq, p, prev_class, oldprio, running); 4291 check_class_changed(rq, p, prev_class, oldprio, running);
6145 } 4292 }
@@ -6183,7 +4330,7 @@ void set_user_nice(struct task_struct *p, long nice)
6183 delta = p->prio - old_prio; 4330 delta = p->prio - old_prio;
6184 4331
6185 if (on_rq) { 4332 if (on_rq) {
6186 enqueue_task(rq, p, 0); 4333 enqueue_task(rq, p, 0, false);
6187 /* 4334 /*
6188 * If the task increased its priority or is running and 4335 * If the task increased its priority or is running and
6189 * lowered its priority, then reschedule its CPU: 4336 * lowered its priority, then reschedule its CPU:
@@ -6341,7 +4488,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
6341{ 4488{
6342 int retval, oldprio, oldpolicy = -1, on_rq, running; 4489 int retval, oldprio, oldpolicy = -1, on_rq, running;
6343 unsigned long flags; 4490 unsigned long flags;
6344 const struct sched_class *prev_class = p->sched_class; 4491 const struct sched_class *prev_class;
6345 struct rq *rq; 4492 struct rq *rq;
6346 int reset_on_fork; 4493 int reset_on_fork;
6347 4494
@@ -6455,6 +4602,7 @@ recheck:
6455 p->sched_reset_on_fork = reset_on_fork; 4602 p->sched_reset_on_fork = reset_on_fork;
6456 4603
6457 oldprio = p->prio; 4604 oldprio = p->prio;
4605 prev_class = p->sched_class;
6458 __setscheduler(rq, p, policy, param->sched_priority); 4606 __setscheduler(rq, p, policy, param->sched_priority);
6459 4607
6460 if (running) 4608 if (running)
@@ -9493,7 +7641,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
9493 tg->rt_rq[cpu] = rt_rq; 7641 tg->rt_rq[cpu] = rt_rq;
9494 init_rt_rq(rt_rq, rq); 7642 init_rt_rq(rt_rq, rq);
9495 rt_rq->tg = tg; 7643 rt_rq->tg = tg;
9496 rt_rq->rt_se = rt_se;
9497 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7644 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
9498 if (add) 7645 if (add)
9499 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); 7646 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
@@ -9524,9 +7671,6 @@ void __init sched_init(void)
9524#ifdef CONFIG_RT_GROUP_SCHED 7671#ifdef CONFIG_RT_GROUP_SCHED
9525 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7672 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
9526#endif 7673#endif
9527#ifdef CONFIG_USER_SCHED
9528 alloc_size *= 2;
9529#endif
9530#ifdef CONFIG_CPUMASK_OFFSTACK 7674#ifdef CONFIG_CPUMASK_OFFSTACK
9531 alloc_size += num_possible_cpus() * cpumask_size(); 7675 alloc_size += num_possible_cpus() * cpumask_size();
9532#endif 7676#endif
@@ -9540,13 +7684,6 @@ void __init sched_init(void)
9540 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 7684 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
9541 ptr += nr_cpu_ids * sizeof(void **); 7685 ptr += nr_cpu_ids * sizeof(void **);
9542 7686
9543#ifdef CONFIG_USER_SCHED
9544 root_task_group.se = (struct sched_entity **)ptr;
9545 ptr += nr_cpu_ids * sizeof(void **);
9546
9547 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
9548 ptr += nr_cpu_ids * sizeof(void **);
9549#endif /* CONFIG_USER_SCHED */
9550#endif /* CONFIG_FAIR_GROUP_SCHED */ 7687#endif /* CONFIG_FAIR_GROUP_SCHED */
9551#ifdef CONFIG_RT_GROUP_SCHED 7688#ifdef CONFIG_RT_GROUP_SCHED
9552 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7689 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -9555,13 +7692,6 @@ void __init sched_init(void)
9555 init_task_group.rt_rq = (struct rt_rq **)ptr; 7692 init_task_group.rt_rq = (struct rt_rq **)ptr;
9556 ptr += nr_cpu_ids * sizeof(void **); 7693 ptr += nr_cpu_ids * sizeof(void **);
9557 7694
9558#ifdef CONFIG_USER_SCHED
9559 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
9560 ptr += nr_cpu_ids * sizeof(void **);
9561
9562 root_task_group.rt_rq = (struct rt_rq **)ptr;
9563 ptr += nr_cpu_ids * sizeof(void **);
9564#endif /* CONFIG_USER_SCHED */
9565#endif /* CONFIG_RT_GROUP_SCHED */ 7695#endif /* CONFIG_RT_GROUP_SCHED */
9566#ifdef CONFIG_CPUMASK_OFFSTACK 7696#ifdef CONFIG_CPUMASK_OFFSTACK
9567 for_each_possible_cpu(i) { 7697 for_each_possible_cpu(i) {
@@ -9581,22 +7711,13 @@ void __init sched_init(void)
9581#ifdef CONFIG_RT_GROUP_SCHED 7711#ifdef CONFIG_RT_GROUP_SCHED
9582 init_rt_bandwidth(&init_task_group.rt_bandwidth, 7712 init_rt_bandwidth(&init_task_group.rt_bandwidth,
9583 global_rt_period(), global_rt_runtime()); 7713 global_rt_period(), global_rt_runtime());
9584#ifdef CONFIG_USER_SCHED
9585 init_rt_bandwidth(&root_task_group.rt_bandwidth,
9586 global_rt_period(), RUNTIME_INF);
9587#endif /* CONFIG_USER_SCHED */
9588#endif /* CONFIG_RT_GROUP_SCHED */ 7714#endif /* CONFIG_RT_GROUP_SCHED */
9589 7715
9590#ifdef CONFIG_GROUP_SCHED 7716#ifdef CONFIG_CGROUP_SCHED
9591 list_add(&init_task_group.list, &task_groups); 7717 list_add(&init_task_group.list, &task_groups);
9592 INIT_LIST_HEAD(&init_task_group.children); 7718 INIT_LIST_HEAD(&init_task_group.children);
9593 7719
9594#ifdef CONFIG_USER_SCHED 7720#endif /* CONFIG_CGROUP_SCHED */
9595 INIT_LIST_HEAD(&root_task_group.children);
9596 init_task_group.parent = &root_task_group;
9597 list_add(&init_task_group.siblings, &root_task_group.children);
9598#endif /* CONFIG_USER_SCHED */
9599#endif /* CONFIG_GROUP_SCHED */
9600 7721
9601#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP 7722#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
9602 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), 7723 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
@@ -9636,25 +7757,6 @@ void __init sched_init(void)
9636 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7757 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
9637 */ 7758 */
9638 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7759 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
9639#elif defined CONFIG_USER_SCHED
9640 root_task_group.shares = NICE_0_LOAD;
9641 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
9642 /*
9643 * In case of task-groups formed thr' the user id of tasks,
9644 * init_task_group represents tasks belonging to root user.
9645 * Hence it forms a sibling of all subsequent groups formed.
9646 * In this case, init_task_group gets only a fraction of overall
9647 * system cpu resource, based on the weight assigned to root
9648 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
9649 * by letting tasks of init_task_group sit in a separate cfs_rq
9650 * (init_tg_cfs_rq) and having one entity represent this group of
9651 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
9652 */
9653 init_tg_cfs_entry(&init_task_group,
9654 &per_cpu(init_tg_cfs_rq, i),
9655 &per_cpu(init_sched_entity, i), i, 1,
9656 root_task_group.se[i]);
9657
9658#endif 7760#endif
9659#endif /* CONFIG_FAIR_GROUP_SCHED */ 7761#endif /* CONFIG_FAIR_GROUP_SCHED */
9660 7762
@@ -9663,12 +7765,6 @@ void __init sched_init(void)
9663 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7765 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
9664#ifdef CONFIG_CGROUP_SCHED 7766#ifdef CONFIG_CGROUP_SCHED
9665 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); 7767 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
9666#elif defined CONFIG_USER_SCHED
9667 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
9668 init_tg_rt_entry(&init_task_group,
9669 &per_cpu(init_rt_rq_var, i),
9670 &per_cpu(init_sched_rt_entity, i), i, 1,
9671 root_task_group.rt_se[i]);
9672#endif 7768#endif
9673#endif 7769#endif
9674 7770
@@ -9753,7 +7849,7 @@ static inline int preempt_count_equals(int preempt_offset)
9753 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 7849 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9754} 7850}
9755 7851
9756void __might_sleep(char *file, int line, int preempt_offset) 7852void __might_sleep(const char *file, int line, int preempt_offset)
9757{ 7853{
9758#ifdef in_atomic 7854#ifdef in_atomic
9759 static unsigned long prev_jiffy; /* ratelimiting */ 7855 static unsigned long prev_jiffy; /* ratelimiting */
@@ -10064,7 +8160,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
10064} 8160}
10065#endif /* CONFIG_RT_GROUP_SCHED */ 8161#endif /* CONFIG_RT_GROUP_SCHED */
10066 8162
10067#ifdef CONFIG_GROUP_SCHED 8163#ifdef CONFIG_CGROUP_SCHED
10068static void free_sched_group(struct task_group *tg) 8164static void free_sched_group(struct task_group *tg)
10069{ 8165{
10070 free_fair_sched_group(tg); 8166 free_fair_sched_group(tg);
@@ -10169,11 +8265,11 @@ void sched_move_task(struct task_struct *tsk)
10169 if (unlikely(running)) 8265 if (unlikely(running))
10170 tsk->sched_class->set_curr_task(rq); 8266 tsk->sched_class->set_curr_task(rq);
10171 if (on_rq) 8267 if (on_rq)
10172 enqueue_task(rq, tsk, 0); 8268 enqueue_task(rq, tsk, 0, false);
10173 8269
10174 task_rq_unlock(rq, &flags); 8270 task_rq_unlock(rq, &flags);
10175} 8271}
10176#endif /* CONFIG_GROUP_SCHED */ 8272#endif /* CONFIG_CGROUP_SCHED */
10177 8273
10178#ifdef CONFIG_FAIR_GROUP_SCHED 8274#ifdef CONFIG_FAIR_GROUP_SCHED
10179static void __set_se_shares(struct sched_entity *se, unsigned long shares) 8275static void __set_se_shares(struct sched_entity *se, unsigned long shares)
@@ -10315,13 +8411,6 @@ static int tg_schedulable(struct task_group *tg, void *data)
10315 runtime = d->rt_runtime; 8411 runtime = d->rt_runtime;
10316 } 8412 }
10317 8413
10318#ifdef CONFIG_USER_SCHED
10319 if (tg == &root_task_group) {
10320 period = global_rt_period();
10321 runtime = global_rt_runtime();
10322 }
10323#endif
10324
10325 /* 8414 /*
10326 * Cannot have more runtime than the period. 8415 * Cannot have more runtime than the period.
10327 */ 8416 */
@@ -10941,12 +9030,30 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
10941} 9030}
10942 9031
10943/* 9032/*
 9033 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled, one jiffy can be very large
 9034 * in cputime_t units. As a result, cpuacct_update_stats calls
 9035 * percpu_counter_add with values large enough to always overflow the
 9036 * per-cpu batch limit, causing bad SMP scalability.
 9037 *
 9038 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
 9039 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
 9040 * and enabled. We cap it at INT_MAX, which is the largest allowed batch value.
9041 */
9042#ifdef CONFIG_SMP
9043#define CPUACCT_BATCH \
9044 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9045#else
9046#define CPUACCT_BATCH 0
9047#endif
9048
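As a worked example of the CPUACCT_BATCH scaling above, here is a self-contained userspace sketch; all numbers are illustrative assumptions, not values read from a real system.

#include <limits.h>
#include <stdio.h>

int main(void)
{
	long percpu_counter_batch = 32;		/* assumed per-cpu batch */

	/* CONFIG_VIRT_CPU_ACCOUNTING=n: one jiffy == one cputime unit. */
	long cputime_one_jiffy = 1;
	long batch = percpu_counter_batch * cputime_one_jiffy;
	if (batch > INT_MAX)
		batch = INT_MAX;
	printf("batch without virt accounting: %ld\n", batch);	/* 32 */

	/*
	 * CONFIG_VIRT_CPU_ACCOUNTING=y: cputime is tracked in much finer
	 * units, so one jiffy is a large cputime_t value (illustrative).
	 */
	cputime_one_jiffy = 4096;
	batch = percpu_counter_batch * cputime_one_jiffy;
	if (batch > INT_MAX)
		batch = INT_MAX;
	printf("batch with virt accounting:    %ld\n", batch);	/* 131072 */

	return 0;
}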
9049/*
10944 * Charge the system/user time to the task's accounting group. 9050 * Charge the system/user time to the task's accounting group.
10945 */ 9051 */
10946static void cpuacct_update_stats(struct task_struct *tsk, 9052static void cpuacct_update_stats(struct task_struct *tsk,
10947 enum cpuacct_stat_index idx, cputime_t val) 9053 enum cpuacct_stat_index idx, cputime_t val)
10948{ 9054{
10949 struct cpuacct *ca; 9055 struct cpuacct *ca;
9056 int batch = CPUACCT_BATCH;
10950 9057
10951 if (unlikely(!cpuacct_subsys.active)) 9058 if (unlikely(!cpuacct_subsys.active))
10952 return; 9059 return;
@@ -10955,7 +9062,7 @@ static void cpuacct_update_stats(struct task_struct *tsk,
10955 ca = task_ca(tsk); 9062 ca = task_ca(tsk);
10956 9063
10957 do { 9064 do {
10958 percpu_counter_add(&ca->cpustat[idx], val); 9065 __percpu_counter_add(&ca->cpustat[idx], val, batch);
10959 ca = ca->parent; 9066 ca = ca->parent;
10960 } while (ca); 9067 } while (ca);
10961 rcu_read_unlock(); 9068 rcu_read_unlock();