Diffstat (limited to 'kernel')
-rw-r--r--  kernel/ksysfs.c          |    8
-rw-r--r--  kernel/kthread.c         |    2
-rw-r--r--  kernel/sched.c           | 2125
-rw-r--r--  kernel/sched_cpupri.c    |    4
-rw-r--r--  kernel/sched_fair.c      | 1699
-rw-r--r--  kernel/sched_idletask.c  |   23
-rw-r--r--  kernel/sched_rt.c        |   54
-rw-r--r--  kernel/sys.c             |    5
-rw-r--r--  kernel/user.c            |  305
9 files changed, 1785 insertions(+), 2440 deletions(-)
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 3feaf5a74514..6b1ccc3f0205 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -197,16 +197,8 @@ static int __init ksysfs_init(void)
 			goto group_exit;
 	}
 
-	/* create the /sys/kernel/uids/ directory */
-	error = uids_sysfs_init();
-	if (error)
-		goto notes_exit;
-
 	return 0;
 
-notes_exit:
-	if (notes_size > 0)
-		sysfs_remove_bin_file(kernel_kobj, &notes_attr);
 group_exit:
 	sysfs_remove_group(kernel_kobj, &kernel_attr_group);
 kset_exit:
diff --git a/kernel/kthread.c b/kernel/kthread.c
index fbb6222fe7e0..82ed0ea15194 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -101,7 +101,7 @@ static void create_kthread(struct kthread_create_info *create)
  *
  * Description: This helper function creates and names a kernel
  * thread. The thread will be stopped: use wake_up_process() to start
- * it. See also kthread_run(), kthread_create_on_cpu().
+ * it. See also kthread_run().
  *
  * When woken, the thread will run @threadfn() with @data as its
  * argument. @threadfn() can either call do_exit() directly if it is a
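
The shortened comment now points only at kthread_run(). For readers who want the create-then-wake pattern spelled out, here is a minimal, illustrative sketch of pairing kthread_create() with wake_up_process(); the demo_* names are placeholders, not part of this patch:

```c
/* Illustrative sketch only: create a kthread stopped, then start it. */
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static int demo_thread_fn(void *data)		/* placeholder thread function */
{
	while (!kthread_should_stop())
		schedule_timeout_interruptible(HZ);	/* pretend to do work */
	return 0;
}

static struct task_struct *demo_start_thread(void *payload)
{
	struct task_struct *tsk;

	tsk = kthread_create(demo_thread_fn, payload, "demo_thread");
	if (IS_ERR(tsk))
		return tsk;		/* creation failed */
	wake_up_process(tsk);		/* the thread starts stopped; run it */
	return tsk;
}
```

kthread_run() is essentially this create-then-wake sequence wrapped in one macro.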
diff --git a/kernel/sched.c b/kernel/sched.c
index caf54e1eef6e..6a212c97f523 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -233,7 +233,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
  */
 static DEFINE_MUTEX(sched_domains_mutex);
 
-#ifdef CONFIG_GROUP_SCHED
+#ifdef CONFIG_CGROUP_SCHED
 
 #include <linux/cgroup.h>
 
@@ -243,13 +243,7 @@ static LIST_HEAD(task_groups);
 
 /* task group related information */
 struct task_group {
-#ifdef CONFIG_CGROUP_SCHED
 	struct cgroup_subsys_state css;
-#endif
-
-#ifdef CONFIG_USER_SCHED
-	uid_t uid;
-#endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	/* schedulable entities of this group on each cpu */
@@ -274,35 +268,7 @@ struct task_group {
 	struct list_head children;
 };
 
-#ifdef CONFIG_USER_SCHED
-
-/* Helper function to pass uid information to create_sched_user() */
-void set_tg_uid(struct user_struct *user)
-{
-	user->tg->uid = user->uid;
-}
-
-/*
- * Root task group.
- * Every UID task group (including init_task_group aka UID-0) will
- * be a child to this group.
- */
-struct task_group root_task_group;
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-/* Default task group's sched entity on each cpu */
-static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
-/* Default task group's cfs_rq on each cpu */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
-#endif /* CONFIG_FAIR_GROUP_SCHED */
-
-#ifdef CONFIG_RT_GROUP_SCHED
-static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var);
-#endif /* CONFIG_RT_GROUP_SCHED */
-#else /* !CONFIG_USER_SCHED */
 #define root_task_group init_task_group
-#endif /* CONFIG_USER_SCHED */
 
 /* task_group_lock serializes add/remove of task groups and also changes to
  * a task group's cpu shares.
@@ -318,11 +284,7 @@ static int root_task_group_empty(void)
 }
 #endif
 
-#ifdef CONFIG_USER_SCHED
-# define INIT_TASK_GROUP_LOAD	(2*NICE_0_LOAD)
-#else /* !CONFIG_USER_SCHED */
 # define INIT_TASK_GROUP_LOAD	NICE_0_LOAD
-#endif /* CONFIG_USER_SCHED */
 
 /*
  * A weight of 0 or 1 can cause arithmetics problems.
@@ -348,11 +310,7 @@ static inline struct task_group *task_group(struct task_struct *p)
 {
 	struct task_group *tg;
 
-#ifdef CONFIG_USER_SCHED
-	rcu_read_lock();
-	tg = __task_cred(p)->user->tg;
-	rcu_read_unlock();
-#elif defined(CONFIG_CGROUP_SCHED)
+#ifdef CONFIG_CGROUP_SCHED
 	tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
 			struct task_group, css);
 #else
@@ -383,7 +341,7 @@ static inline struct task_group *task_group(struct task_struct *p)
 	return NULL;
 }
 
-#endif /* CONFIG_GROUP_SCHED */
+#endif /* CONFIG_CGROUP_SCHED */
 
 /* CFS-related fields in a runqueue */
 struct cfs_rq {
@@ -478,7 +436,6 @@ struct rt_rq {
 	struct rq *rq;
 	struct list_head leaf_rt_rq_list;
 	struct task_group *tg;
-	struct sched_rt_entity *rt_se;
 #endif
 };
 
@@ -1414,32 +1371,6 @@ static const u32 prio_to_wmult[40] = {
  /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
 };
 
-static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
-
-/*
- * runqueue iterator, to support SMP load-balancing between different
- * scheduling classes, without having to expose their internal data
- * structures to the load-balancing proper:
- */
-struct rq_iterator {
-	void *arg;
-	struct task_struct *(*start)(void *);
-	struct task_struct *(*next)(void *);
-};
-
-#ifdef CONFIG_SMP
-static unsigned long
-balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
-	      unsigned long max_load_move, struct sched_domain *sd,
-	      enum cpu_idle_type idle, int *all_pinned,
-	      int *this_best_prio, struct rq_iterator *iterator);
-
-static int
-iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
-		   struct sched_domain *sd, enum cpu_idle_type idle,
-		   struct rq_iterator *iterator);
-#endif
-
 /* Time spent by the tasks of the cpu accounting group executing in ... */
 enum cpuacct_stat_index {
 	CPUACCT_STAT_USER,	/* ... user mode */
@@ -1725,16 +1656,6 @@ static void update_shares(struct sched_domain *sd)
 	}
 }
 
-static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
-{
-	if (root_task_group_empty())
-		return;
-
-	raw_spin_unlock(&rq->lock);
-	update_shares(sd);
-	raw_spin_lock(&rq->lock);
-}
-
 static void update_h_load(long cpu)
 {
 	if (root_task_group_empty())
@@ -1749,10 +1670,6 @@ static inline void update_shares(struct sched_domain *sd)
 {
 }
 
-static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
-{
-}
-
 #endif
 
 #ifdef CONFIG_PREEMPT
@@ -1829,6 +1746,51 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
 	raw_spin_unlock(&busiest->lock);
 	lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
 }
+
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+static void double_rq_lock(struct rq *rq1, struct rq *rq2)
+	__acquires(rq1->lock)
+	__acquires(rq2->lock)
+{
+	BUG_ON(!irqs_disabled());
+	if (rq1 == rq2) {
+		raw_spin_lock(&rq1->lock);
+		__acquire(rq2->lock);	/* Fake it out ;) */
+	} else {
+		if (rq1 < rq2) {
+			raw_spin_lock(&rq1->lock);
+			raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
+		} else {
+			raw_spin_lock(&rq2->lock);
+			raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
+		}
+	}
+	update_rq_clock(rq1);
+	update_rq_clock(rq2);
+}
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+	__releases(rq1->lock)
+	__releases(rq2->lock)
+{
+	raw_spin_unlock(&rq1->lock);
+	if (rq1 != rq2)
+		raw_spin_unlock(&rq2->lock);
+	else
+		__release(rq2->lock);
+}
+
 #endif
 
@@ -1858,18 +1820,14 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 #endif
 }
 
-#include "sched_stats.h"
-#include "sched_idletask.c"
-#include "sched_fair.c"
-#include "sched_rt.c"
-#ifdef CONFIG_SCHED_DEBUG
-# include "sched_debug.c"
-#endif
+static const struct sched_class rt_sched_class;
 
 #define sched_class_highest	(&rt_sched_class)
 #define for_each_class(class) \
 	for (class = sched_class_highest; class; class = class->next)
 
+#include "sched_stats.h"
+
 static void inc_nr_running(struct rq *rq)
 {
 	rq->nr_running++;
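The forward declaration of rt_sched_class above lets sched_class_highest and for_each_class() be defined before the class implementations are #included later in the file; the classes form a NULL-terminated singly linked list walked from highest to lowest priority. A reduced sketch of that structure (the demo_* names are illustrative, not kernel symbols):

```c
/* Sketch of a priority-ordered, NULL-terminated class chain. */
struct demo_class {
	const struct demo_class *next;	/* next lower-priority class */
	const char *name;
};

static const struct demo_class demo_idle = { .next = NULL,       .name = "idle" };
static const struct demo_class demo_fair = { .next = &demo_idle, .name = "fair" };
static const struct demo_class demo_rt   = { .next = &demo_fair, .name = "rt" };

#define demo_class_highest	(&demo_rt)
#define for_each_demo_class(class) \
	for (class = demo_class_highest; class; class = class->next)
```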
@@ -1907,13 +1865,14 @@ static void update_avg(u64 *avg, u64 sample)
 	*avg += diff >> 3;
 }
 
-static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
+static void
+enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
 {
 	if (wakeup)
 		p->se.start_runtime = p->se.sum_exec_runtime;
 
 	sched_info_queued(p);
-	p->sched_class->enqueue_task(rq, p, wakeup);
+	p->sched_class->enqueue_task(rq, p, wakeup, head);
 	p->se.on_rq = 1;
 }
 
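enqueue_task() now carries an extra `head` flag down to the scheduling class. The class-side handling is not shown in this file, but the usual meaning of such a flag is simply head versus tail insertion in the class's run list; a hedged sketch using made-up demo_* types:

```c
/* Illustrative only: how an enqueue path might honour a "head" flag. */
#include <linux/list.h>
#include <linux/types.h>

struct demo_rq {			/* hypothetical runqueue */
	struct list_head queue;
};

struct demo_entity {			/* hypothetical schedulable entity */
	struct list_head run_list;
};

static void demo_enqueue(struct demo_rq *rq, struct demo_entity *se, bool head)
{
	if (head)
		list_add(&se->run_list, &rq->queue);		/* front of queue */
	else
		list_add_tail(&se->run_list, &rq->queue);	/* back of queue */
}
```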
@@ -1936,6 +1895,37 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
 }
 
 /*
+ * activate_task - move a task to the runqueue.
+ */
+static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
+{
+	if (task_contributes_to_load(p))
+		rq->nr_uninterruptible--;
+
+	enqueue_task(rq, p, wakeup, false);
+	inc_nr_running(rq);
+}
+
+/*
+ * deactivate_task - remove a task from the runqueue.
+ */
+static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
+{
+	if (task_contributes_to_load(p))
+		rq->nr_uninterruptible++;
+
+	dequeue_task(rq, p, sleep);
+	dec_nr_running(rq);
+}
+
+#include "sched_idletask.c"
+#include "sched_fair.c"
+#include "sched_rt.c"
+#ifdef CONFIG_SCHED_DEBUG
+# include "sched_debug.c"
+#endif
+
+/*
  * __normal_prio - return the priority that is based on the static prio
  */
 static inline int __normal_prio(struct task_struct *p)
@@ -1981,30 +1971,6 @@ static int effective_prio(struct task_struct *p)
 	return p->prio;
 }
 
-/*
- * activate_task - move a task to the runqueue.
- */
-static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
-{
-	if (task_contributes_to_load(p))
-		rq->nr_uninterruptible--;
-
-	enqueue_task(rq, p, wakeup);
-	inc_nr_running(rq);
-}
-
-/*
- * deactivate_task - remove a task from the runqueue.
- */
-static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
-{
-	if (task_contributes_to_load(p))
-		rq->nr_uninterruptible++;
-
-	dequeue_task(rq, p, sleep);
-	dec_nr_running(rq);
-}
-
 /**
  * task_curr - is this task currently executing on a CPU?
  * @p: the task in question.
@@ -3148,50 +3114,6 @@ static void update_cpu_load(struct rq *this_rq)
 #ifdef CONFIG_SMP
 
 /*
- * double_rq_lock - safely lock two runqueues
- *
- * Note this does not disable interrupts like task_rq_lock,
- * you need to do so manually before calling.
- */
-static void double_rq_lock(struct rq *rq1, struct rq *rq2)
-	__acquires(rq1->lock)
-	__acquires(rq2->lock)
-{
-	BUG_ON(!irqs_disabled());
-	if (rq1 == rq2) {
-		raw_spin_lock(&rq1->lock);
-		__acquire(rq2->lock);	/* Fake it out ;) */
-	} else {
-		if (rq1 < rq2) {
-			raw_spin_lock(&rq1->lock);
-			raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
-		} else {
-			raw_spin_lock(&rq2->lock);
-			raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
-		}
-	}
-	update_rq_clock(rq1);
-	update_rq_clock(rq2);
-}
-
-/*
- * double_rq_unlock - safely unlock two runqueues
- *
- * Note this does not restore interrupts like task_rq_unlock,
- * you need to do so manually after calling.
- */
-static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
-	__releases(rq1->lock)
-	__releases(rq2->lock)
-{
-	raw_spin_unlock(&rq1->lock);
-	if (rq1 != rq2)
-		raw_spin_unlock(&rq2->lock);
-	else
-		__release(rq2->lock);
-}
-
-/*
  * sched_exec - execve() is a valuable balancing opportunity, because at
  * this point the task has the smallest effective memory and cache footprint.
  */
@@ -3239,1782 +3161,6 @@ again:
3239 task_rq_unlock(rq, &flags); 3161 task_rq_unlock(rq, &flags);
3240} 3162}
3241 3163
3242/*
3243 * pull_task - move a task from a remote runqueue to the local runqueue.
3244 * Both runqueues must be locked.
3245 */
3246static void pull_task(struct rq *src_rq, struct task_struct *p,
3247 struct rq *this_rq, int this_cpu)
3248{
3249 deactivate_task(src_rq, p, 0);
3250 set_task_cpu(p, this_cpu);
3251 activate_task(this_rq, p, 0);
3252 check_preempt_curr(this_rq, p, 0);
3253}
3254
3255/*
3256 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3257 */
3258static
3259int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3260 struct sched_domain *sd, enum cpu_idle_type idle,
3261 int *all_pinned)
3262{
3263 int tsk_cache_hot = 0;
3264 /*
3265 * We do not migrate tasks that are:
3266 * 1) running (obviously), or
3267 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3268 * 3) are cache-hot on their current CPU.
3269 */
3270 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
3271 schedstat_inc(p, se.nr_failed_migrations_affine);
3272 return 0;
3273 }
3274 *all_pinned = 0;
3275
3276 if (task_running(rq, p)) {
3277 schedstat_inc(p, se.nr_failed_migrations_running);
3278 return 0;
3279 }
3280
3281 /*
3282 * Aggressive migration if:
3283 * 1) task is cache cold, or
3284 * 2) too many balance attempts have failed.
3285 */
3286
3287 tsk_cache_hot = task_hot(p, rq->clock, sd);
3288 if (!tsk_cache_hot ||
3289 sd->nr_balance_failed > sd->cache_nice_tries) {
3290#ifdef CONFIG_SCHEDSTATS
3291 if (tsk_cache_hot) {
3292 schedstat_inc(sd, lb_hot_gained[idle]);
3293 schedstat_inc(p, se.nr_forced_migrations);
3294 }
3295#endif
3296 return 1;
3297 }
3298
3299 if (tsk_cache_hot) {
3300 schedstat_inc(p, se.nr_failed_migrations_hot);
3301 return 0;
3302 }
3303 return 1;
3304}
3305
3306static unsigned long
3307balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3308 unsigned long max_load_move, struct sched_domain *sd,
3309 enum cpu_idle_type idle, int *all_pinned,
3310 int *this_best_prio, struct rq_iterator *iterator)
3311{
3312 int loops = 0, pulled = 0, pinned = 0;
3313 struct task_struct *p;
3314 long rem_load_move = max_load_move;
3315
3316 if (max_load_move == 0)
3317 goto out;
3318
3319 pinned = 1;
3320
3321 /*
3322 * Start the load-balancing iterator:
3323 */
3324 p = iterator->start(iterator->arg);
3325next:
3326 if (!p || loops++ > sysctl_sched_nr_migrate)
3327 goto out;
3328
3329 if ((p->se.load.weight >> 1) > rem_load_move ||
3330 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3331 p = iterator->next(iterator->arg);
3332 goto next;
3333 }
3334
3335 pull_task(busiest, p, this_rq, this_cpu);
3336 pulled++;
3337 rem_load_move -= p->se.load.weight;
3338
3339#ifdef CONFIG_PREEMPT
3340 /*
3341 * NEWIDLE balancing is a source of latency, so preemptible kernels
3342 * will stop after the first task is pulled to minimize the critical
3343 * section.
3344 */
3345 if (idle == CPU_NEWLY_IDLE)
3346 goto out;
3347#endif
3348
3349 /*
3350 * We only want to steal up to the prescribed amount of weighted load.
3351 */
3352 if (rem_load_move > 0) {
3353 if (p->prio < *this_best_prio)
3354 *this_best_prio = p->prio;
3355 p = iterator->next(iterator->arg);
3356 goto next;
3357 }
3358out:
3359 /*
3360 * Right now, this is one of only two places pull_task() is called,
3361 * so we can safely collect pull_task() stats here rather than
3362 * inside pull_task().
3363 */
3364 schedstat_add(sd, lb_gained[idle], pulled);
3365
3366 if (all_pinned)
3367 *all_pinned = pinned;
3368
3369 return max_load_move - rem_load_move;
3370}
3371
3372/*
3373 * move_tasks tries to move up to max_load_move weighted load from busiest to
3374 * this_rq, as part of a balancing operation within domain "sd".
3375 * Returns 1 if successful and 0 otherwise.
3376 *
3377 * Called with both runqueues locked.
3378 */
3379static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3380 unsigned long max_load_move,
3381 struct sched_domain *sd, enum cpu_idle_type idle,
3382 int *all_pinned)
3383{
3384 const struct sched_class *class = sched_class_highest;
3385 unsigned long total_load_moved = 0;
3386 int this_best_prio = this_rq->curr->prio;
3387
3388 do {
3389 total_load_moved +=
3390 class->load_balance(this_rq, this_cpu, busiest,
3391 max_load_move - total_load_moved,
3392 sd, idle, all_pinned, &this_best_prio);
3393 class = class->next;
3394
3395#ifdef CONFIG_PREEMPT
3396 /*
3397 * NEWIDLE balancing is a source of latency, so preemptible
3398 * kernels will stop after the first task is pulled to minimize
3399 * the critical section.
3400 */
3401 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3402 break;
3403#endif
3404 } while (class && max_load_move > total_load_moved);
3405
3406 return total_load_moved > 0;
3407}
3408
3409static int
3410iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3411 struct sched_domain *sd, enum cpu_idle_type idle,
3412 struct rq_iterator *iterator)
3413{
3414 struct task_struct *p = iterator->start(iterator->arg);
3415 int pinned = 0;
3416
3417 while (p) {
3418 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3419 pull_task(busiest, p, this_rq, this_cpu);
3420 /*
3421 * Right now, this is only the second place pull_task()
3422 * is called, so we can safely collect pull_task()
3423 * stats here rather than inside pull_task().
3424 */
3425 schedstat_inc(sd, lb_gained[idle]);
3426
3427 return 1;
3428 }
3429 p = iterator->next(iterator->arg);
3430 }
3431
3432 return 0;
3433}
3434
3435/*
3436 * move_one_task tries to move exactly one task from busiest to this_rq, as
3437 * part of active balancing operations within "domain".
3438 * Returns 1 if successful and 0 otherwise.
3439 *
3440 * Called with both runqueues locked.
3441 */
3442static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3443 struct sched_domain *sd, enum cpu_idle_type idle)
3444{
3445 const struct sched_class *class;
3446
3447 for_each_class(class) {
3448 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3449 return 1;
3450 }
3451
3452 return 0;
3453}
3454/********** Helpers for find_busiest_group ************************/
3455/*
3456 * sd_lb_stats - Structure to store the statistics of a sched_domain
3457 * during load balancing.
3458 */
3459struct sd_lb_stats {
3460 struct sched_group *busiest; /* Busiest group in this sd */
3461 struct sched_group *this; /* Local group in this sd */
3462 unsigned long total_load; /* Total load of all groups in sd */
3463 unsigned long total_pwr; /* Total power of all groups in sd */
3464 unsigned long avg_load; /* Average load across all groups in sd */
3465
3466 /** Statistics of this group */
3467 unsigned long this_load;
3468 unsigned long this_load_per_task;
3469 unsigned long this_nr_running;
3470
3471 /* Statistics of the busiest group */
3472 unsigned long max_load;
3473 unsigned long busiest_load_per_task;
3474 unsigned long busiest_nr_running;
3475
3476 int group_imb; /* Is there imbalance in this sd */
3477#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3478 int power_savings_balance; /* Is powersave balance needed for this sd */
3479 struct sched_group *group_min; /* Least loaded group in sd */
3480 struct sched_group *group_leader; /* Group which relieves group_min */
3481 unsigned long min_load_per_task; /* load_per_task in group_min */
3482 unsigned long leader_nr_running; /* Nr running of group_leader */
3483 unsigned long min_nr_running; /* Nr running of group_min */
3484#endif
3485};
3486
3487/*
3488 * sg_lb_stats - stats of a sched_group required for load_balancing
3489 */
3490struct sg_lb_stats {
3491 unsigned long avg_load; /*Avg load across the CPUs of the group */
3492 unsigned long group_load; /* Total load over the CPUs of the group */
3493 unsigned long sum_nr_running; /* Nr tasks running in the group */
3494 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
3495 unsigned long group_capacity;
3496 int group_imb; /* Is there an imbalance in the group ? */
3497};
3498
3499/**
3500 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3501 * @group: The group whose first cpu is to be returned.
3502 */
3503static inline unsigned int group_first_cpu(struct sched_group *group)
3504{
3505 return cpumask_first(sched_group_cpus(group));
3506}
3507
3508/**
3509 * get_sd_load_idx - Obtain the load index for a given sched domain.
3510 * @sd: The sched_domain whose load_idx is to be obtained.
3511 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
3512 */
3513static inline int get_sd_load_idx(struct sched_domain *sd,
3514 enum cpu_idle_type idle)
3515{
3516 int load_idx;
3517
3518 switch (idle) {
3519 case CPU_NOT_IDLE:
3520 load_idx = sd->busy_idx;
3521 break;
3522
3523 case CPU_NEWLY_IDLE:
3524 load_idx = sd->newidle_idx;
3525 break;
3526 default:
3527 load_idx = sd->idle_idx;
3528 break;
3529 }
3530
3531 return load_idx;
3532}
3533
3534
3535#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3536/**
3537 * init_sd_power_savings_stats - Initialize power savings statistics for
3538 * the given sched_domain, during load balancing.
3539 *
3540 * @sd: Sched domain whose power-savings statistics are to be initialized.
3541 * @sds: Variable containing the statistics for sd.
3542 * @idle: Idle status of the CPU at which we're performing load-balancing.
3543 */
3544static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3545 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3546{
3547 /*
3548 * Busy processors will not participate in power savings
3549 * balance.
3550 */
3551 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3552 sds->power_savings_balance = 0;
3553 else {
3554 sds->power_savings_balance = 1;
3555 sds->min_nr_running = ULONG_MAX;
3556 sds->leader_nr_running = 0;
3557 }
3558}
3559
3560/**
3561 * update_sd_power_savings_stats - Update the power saving stats for a
3562 * sched_domain while performing load balancing.
3563 *
3564 * @group: sched_group belonging to the sched_domain under consideration.
3565 * @sds: Variable containing the statistics of the sched_domain
3566 * @local_group: Does group contain the CPU for which we're performing
3567 * load balancing ?
3568 * @sgs: Variable containing the statistics of the group.
3569 */
3570static inline void update_sd_power_savings_stats(struct sched_group *group,
3571 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3572{
3573
3574 if (!sds->power_savings_balance)
3575 return;
3576
3577 /*
3578 * If the local group is idle or completely loaded
3579 * no need to do power savings balance at this domain
3580 */
3581 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3582 !sds->this_nr_running))
3583 sds->power_savings_balance = 0;
3584
3585 /*
3586 * If a group is already running at full capacity or idle,
3587 * don't include that group in power savings calculations
3588 */
3589 if (!sds->power_savings_balance ||
3590 sgs->sum_nr_running >= sgs->group_capacity ||
3591 !sgs->sum_nr_running)
3592 return;
3593
3594 /*
3595 * Calculate the group which has the least non-idle load.
3596 * This is the group from where we need to pick up the load
3597 * for saving power
3598 */
3599 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3600 (sgs->sum_nr_running == sds->min_nr_running &&
3601 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3602 sds->group_min = group;
3603 sds->min_nr_running = sgs->sum_nr_running;
3604 sds->min_load_per_task = sgs->sum_weighted_load /
3605 sgs->sum_nr_running;
3606 }
3607
3608 /*
3609 * Calculate the group which is almost near its
3610 * capacity but still has some space to pick up some load
3611 * from other group and save more power
3612 */
3613 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3614 return;
3615
3616 if (sgs->sum_nr_running > sds->leader_nr_running ||
3617 (sgs->sum_nr_running == sds->leader_nr_running &&
3618 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3619 sds->group_leader = group;
3620 sds->leader_nr_running = sgs->sum_nr_running;
3621 }
3622}
3623
3624/**
3625 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3626 * @sds: Variable containing the statistics of the sched_domain
3627 * under consideration.
3628 * @this_cpu: Cpu at which we're currently performing load-balancing.
3629 * @imbalance: Variable to store the imbalance.
3630 *
3631 * Description:
3632 * Check if we have potential to perform some power-savings balance.
3633 * If yes, set the busiest group to be the least loaded group in the
3634 * sched_domain, so that it's CPUs can be put to idle.
3635 *
3636 * Returns 1 if there is potential to perform power-savings balance.
3637 * Else returns 0.
3638 */
3639static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3640 int this_cpu, unsigned long *imbalance)
3641{
3642 if (!sds->power_savings_balance)
3643 return 0;
3644
3645 if (sds->this != sds->group_leader ||
3646 sds->group_leader == sds->group_min)
3647 return 0;
3648
3649 *imbalance = sds->min_load_per_task;
3650 sds->busiest = sds->group_min;
3651
3652 return 1;
3653
3654}
3655#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3656static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3657 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3658{
3659 return;
3660}
3661
3662static inline void update_sd_power_savings_stats(struct sched_group *group,
3663 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3664{
3665 return;
3666}
3667
3668static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3669 int this_cpu, unsigned long *imbalance)
3670{
3671 return 0;
3672}
3673#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3674
3675
3676unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3677{
3678 return SCHED_LOAD_SCALE;
3679}
3680
3681unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3682{
3683 return default_scale_freq_power(sd, cpu);
3684}
3685
3686unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3687{
3688 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3689 unsigned long smt_gain = sd->smt_gain;
3690
3691 smt_gain /= weight;
3692
3693 return smt_gain;
3694}
3695
3696unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3697{
3698 return default_scale_smt_power(sd, cpu);
3699}
3700
3701unsigned long scale_rt_power(int cpu)
3702{
3703 struct rq *rq = cpu_rq(cpu);
3704 u64 total, available;
3705
3706 sched_avg_update(rq);
3707
3708 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3709 available = total - rq->rt_avg;
3710
3711 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3712 total = SCHED_LOAD_SCALE;
3713
3714 total >>= SCHED_LOAD_SHIFT;
3715
3716 return div_u64(available, total);
3717}
3718
3719static void update_cpu_power(struct sched_domain *sd, int cpu)
3720{
3721 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3722 unsigned long power = SCHED_LOAD_SCALE;
3723 struct sched_group *sdg = sd->groups;
3724
3725 if (sched_feat(ARCH_POWER))
3726 power *= arch_scale_freq_power(sd, cpu);
3727 else
3728 power *= default_scale_freq_power(sd, cpu);
3729
3730 power >>= SCHED_LOAD_SHIFT;
3731
3732 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3733 if (sched_feat(ARCH_POWER))
3734 power *= arch_scale_smt_power(sd, cpu);
3735 else
3736 power *= default_scale_smt_power(sd, cpu);
3737
3738 power >>= SCHED_LOAD_SHIFT;
3739 }
3740
3741 power *= scale_rt_power(cpu);
3742 power >>= SCHED_LOAD_SHIFT;
3743
3744 if (!power)
3745 power = 1;
3746
3747 sdg->cpu_power = power;
3748}
3749
3750static void update_group_power(struct sched_domain *sd, int cpu)
3751{
3752 struct sched_domain *child = sd->child;
3753 struct sched_group *group, *sdg = sd->groups;
3754 unsigned long power;
3755
3756 if (!child) {
3757 update_cpu_power(sd, cpu);
3758 return;
3759 }
3760
3761 power = 0;
3762
3763 group = child->groups;
3764 do {
3765 power += group->cpu_power;
3766 group = group->next;
3767 } while (group != child->groups);
3768
3769 sdg->cpu_power = power;
3770}
3771
3772/**
3773 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3774 * @sd: The sched_domain whose statistics are to be updated.
3775 * @group: sched_group whose statistics are to be updated.
3776 * @this_cpu: Cpu for which load balance is currently performed.
3777 * @idle: Idle status of this_cpu
3778 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3779 * @sd_idle: Idle status of the sched_domain containing group.
3780 * @local_group: Does group contain this_cpu.
3781 * @cpus: Set of cpus considered for load balancing.
3782 * @balance: Should we balance.
3783 * @sgs: variable to hold the statistics for this group.
3784 */
3785static inline void update_sg_lb_stats(struct sched_domain *sd,
3786 struct sched_group *group, int this_cpu,
3787 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3788 int local_group, const struct cpumask *cpus,
3789 int *balance, struct sg_lb_stats *sgs)
3790{
3791 unsigned long load, max_cpu_load, min_cpu_load;
3792 int i;
3793 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3794 unsigned long sum_avg_load_per_task;
3795 unsigned long avg_load_per_task;
3796
3797 if (local_group) {
3798 balance_cpu = group_first_cpu(group);
3799 if (balance_cpu == this_cpu)
3800 update_group_power(sd, this_cpu);
3801 }
3802
3803 /* Tally up the load of all CPUs in the group */
3804 sum_avg_load_per_task = avg_load_per_task = 0;
3805 max_cpu_load = 0;
3806 min_cpu_load = ~0UL;
3807
3808 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3809 struct rq *rq = cpu_rq(i);
3810
3811 if (*sd_idle && rq->nr_running)
3812 *sd_idle = 0;
3813
3814 /* Bias balancing toward cpus of our domain */
3815 if (local_group) {
3816 if (idle_cpu(i) && !first_idle_cpu) {
3817 first_idle_cpu = 1;
3818 balance_cpu = i;
3819 }
3820
3821 load = target_load(i, load_idx);
3822 } else {
3823 load = source_load(i, load_idx);
3824 if (load > max_cpu_load)
3825 max_cpu_load = load;
3826 if (min_cpu_load > load)
3827 min_cpu_load = load;
3828 }
3829
3830 sgs->group_load += load;
3831 sgs->sum_nr_running += rq->nr_running;
3832 sgs->sum_weighted_load += weighted_cpuload(i);
3833
3834 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3835 }
3836
3837 /*
3838 * First idle cpu or the first cpu(busiest) in this sched group
3839 * is eligible for doing load balancing at this and above
3840 * domains. In the newly idle case, we will allow all the cpu's
3841 * to do the newly idle load balance.
3842 */
3843 if (idle != CPU_NEWLY_IDLE && local_group &&
3844 balance_cpu != this_cpu && balance) {
3845 *balance = 0;
3846 return;
3847 }
3848
3849 /* Adjust by relative CPU power of the group */
3850 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
3851
3852
3853 /*
3854 * Consider the group unbalanced when the imbalance is larger
3855 * than the average weight of two tasks.
3856 *
3857 * APZ: with cgroup the avg task weight can vary wildly and
3858 * might not be a suitable number - should we keep a
3859 * normalized nr_running number somewhere that negates
3860 * the hierarchy?
3861 */
3862 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
3863 group->cpu_power;
3864
3865 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3866 sgs->group_imb = 1;
3867
3868 sgs->group_capacity =
3869 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
3870}
3871
3872/**
3873 * update_sd_lb_stats - Update sched_group's statistics for load balancing.
3874 * @sd: sched_domain whose statistics are to be updated.
3875 * @this_cpu: Cpu for which load balance is currently performed.
3876 * @idle: Idle status of this_cpu
3877 * @sd_idle: Idle status of the sched_domain containing group.
3878 * @cpus: Set of cpus considered for load balancing.
3879 * @balance: Should we balance.
3880 * @sds: variable to hold the statistics for this sched_domain.
3881 */
3882static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3883 enum cpu_idle_type idle, int *sd_idle,
3884 const struct cpumask *cpus, int *balance,
3885 struct sd_lb_stats *sds)
3886{
3887 struct sched_domain *child = sd->child;
3888 struct sched_group *group = sd->groups;
3889 struct sg_lb_stats sgs;
3890 int load_idx, prefer_sibling = 0;
3891
3892 if (child && child->flags & SD_PREFER_SIBLING)
3893 prefer_sibling = 1;
3894
3895 init_sd_power_savings_stats(sd, sds, idle);
3896 load_idx = get_sd_load_idx(sd, idle);
3897
3898 do {
3899 int local_group;
3900
3901 local_group = cpumask_test_cpu(this_cpu,
3902 sched_group_cpus(group));
3903 memset(&sgs, 0, sizeof(sgs));
3904 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
3905 local_group, cpus, balance, &sgs);
3906
3907 if (local_group && balance && !(*balance))
3908 return;
3909
3910 sds->total_load += sgs.group_load;
3911 sds->total_pwr += group->cpu_power;
3912
3913 /*
3914 * In case the child domain prefers tasks go to siblings
3915 * first, lower the group capacity to one so that we'll try
3916 * and move all the excess tasks away.
3917 */
3918 if (prefer_sibling)
3919 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3920
3921 if (local_group) {
3922 sds->this_load = sgs.avg_load;
3923 sds->this = group;
3924 sds->this_nr_running = sgs.sum_nr_running;
3925 sds->this_load_per_task = sgs.sum_weighted_load;
3926 } else if (sgs.avg_load > sds->max_load &&
3927 (sgs.sum_nr_running > sgs.group_capacity ||
3928 sgs.group_imb)) {
3929 sds->max_load = sgs.avg_load;
3930 sds->busiest = group;
3931 sds->busiest_nr_running = sgs.sum_nr_running;
3932 sds->busiest_load_per_task = sgs.sum_weighted_load;
3933 sds->group_imb = sgs.group_imb;
3934 }
3935
3936 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3937 group = group->next;
3938 } while (group != sd->groups);
3939}
3940
3941/**
3942 * fix_small_imbalance - Calculate the minor imbalance that exists
3943 * amongst the groups of a sched_domain, during
3944 * load balancing.
3945 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3946 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3947 * @imbalance: Variable to store the imbalance.
3948 */
3949static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3950 int this_cpu, unsigned long *imbalance)
3951{
3952 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3953 unsigned int imbn = 2;
3954
3955 if (sds->this_nr_running) {
3956 sds->this_load_per_task /= sds->this_nr_running;
3957 if (sds->busiest_load_per_task >
3958 sds->this_load_per_task)
3959 imbn = 1;
3960 } else
3961 sds->this_load_per_task =
3962 cpu_avg_load_per_task(this_cpu);
3963
3964 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
3965 sds->busiest_load_per_task * imbn) {
3966 *imbalance = sds->busiest_load_per_task;
3967 return;
3968 }
3969
3970 /*
3971 * OK, we don't have enough imbalance to justify moving tasks,
3972 * however we may be able to increase total CPU power used by
3973 * moving them.
3974 */
3975
3976 pwr_now += sds->busiest->cpu_power *
3977 min(sds->busiest_load_per_task, sds->max_load);
3978 pwr_now += sds->this->cpu_power *
3979 min(sds->this_load_per_task, sds->this_load);
3980 pwr_now /= SCHED_LOAD_SCALE;
3981
3982 /* Amount of load we'd subtract */
3983 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3984 sds->busiest->cpu_power;
3985 if (sds->max_load > tmp)
3986 pwr_move += sds->busiest->cpu_power *
3987 min(sds->busiest_load_per_task, sds->max_load - tmp);
3988
3989 /* Amount of load we'd add */
3990 if (sds->max_load * sds->busiest->cpu_power <
3991 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3992 tmp = (sds->max_load * sds->busiest->cpu_power) /
3993 sds->this->cpu_power;
3994 else
3995 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3996 sds->this->cpu_power;
3997 pwr_move += sds->this->cpu_power *
3998 min(sds->this_load_per_task, sds->this_load + tmp);
3999 pwr_move /= SCHED_LOAD_SCALE;
4000
4001 /* Move if we gain throughput */
4002 if (pwr_move > pwr_now)
4003 *imbalance = sds->busiest_load_per_task;
4004}
4005
4006/**
4007 * calculate_imbalance - Calculate the amount of imbalance present within the
4008 * groups of a given sched_domain during load balance.
4009 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
4010 * @this_cpu: Cpu for which currently load balance is being performed.
4011 * @imbalance: The variable to store the imbalance.
4012 */
4013static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4014 unsigned long *imbalance)
4015{
4016 unsigned long max_pull;
4017 /*
4018 * In the presence of smp nice balancing, certain scenarios can have
4019 * max load less than avg load(as we skip the groups at or below
4020 * its cpu_power, while calculating max_load..)
4021 */
4022 if (sds->max_load < sds->avg_load) {
4023 *imbalance = 0;
4024 return fix_small_imbalance(sds, this_cpu, imbalance);
4025 }
4026
4027 /* Don't want to pull so many tasks that a group would go idle */
4028 max_pull = min(sds->max_load - sds->avg_load,
4029 sds->max_load - sds->busiest_load_per_task);
4030
4031 /* How much load to actually move to equalise the imbalance */
4032 *imbalance = min(max_pull * sds->busiest->cpu_power,
4033 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
4034 / SCHED_LOAD_SCALE;
4035
4036 /*
4037 * if *imbalance is less than the average load per runnable task
4038 * there is no gaurantee that any tasks will be moved so we'll have
4039 * a think about bumping its value to force at least one task to be
4040 * moved
4041 */
4042 if (*imbalance < sds->busiest_load_per_task)
4043 return fix_small_imbalance(sds, this_cpu, imbalance);
4044
4045}
4046/******* find_busiest_group() helpers end here *********************/
4047
4048/**
4049 * find_busiest_group - Returns the busiest group within the sched_domain
4050 * if there is an imbalance. If there isn't an imbalance, and
4051 * the user has opted for power-savings, it returns a group whose
4052 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
4053 * such a group exists.
4054 *
4055 * Also calculates the amount of weighted load which should be moved
4056 * to restore balance.
4057 *
4058 * @sd: The sched_domain whose busiest group is to be returned.
4059 * @this_cpu: The cpu for which load balancing is currently being performed.
4060 * @imbalance: Variable which stores amount of weighted load which should
4061 * be moved to restore balance/put a group to idle.
4062 * @idle: The idle status of this_cpu.
4063 * @sd_idle: The idleness of sd
4064 * @cpus: The set of CPUs under consideration for load-balancing.
4065 * @balance: Pointer to a variable indicating if this_cpu
4066 * is the appropriate cpu to perform load balancing at this_level.
4067 *
4068 * Returns: - the busiest group if imbalance exists.
4069 * - If no imbalance and user has opted for power-savings balance,
4070 * return the least loaded group whose CPUs can be
4071 * put to idle by rebalancing its tasks onto our group.
4072 */
4073static struct sched_group *
4074find_busiest_group(struct sched_domain *sd, int this_cpu,
4075 unsigned long *imbalance, enum cpu_idle_type idle,
4076 int *sd_idle, const struct cpumask *cpus, int *balance)
4077{
4078 struct sd_lb_stats sds;
4079
4080 memset(&sds, 0, sizeof(sds));
4081
4082 /*
4083 * Compute the various statistics relavent for load balancing at
4084 * this level.
4085 */
4086 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
4087 balance, &sds);
4088
4089 /* Cases where imbalance does not exist from POV of this_cpu */
4090 /* 1) this_cpu is not the appropriate cpu to perform load balancing
4091 * at this level.
4092 * 2) There is no busy sibling group to pull from.
4093 * 3) This group is the busiest group.
4094 * 4) This group is more busy than the avg busieness at this
4095 * sched_domain.
4096 * 5) The imbalance is within the specified limit.
4097 * 6) Any rebalance would lead to ping-pong
4098 */
4099 if (balance && !(*balance))
4100 goto ret;
4101
4102 if (!sds.busiest || sds.busiest_nr_running == 0)
4103 goto out_balanced;
4104
4105 if (sds.this_load >= sds.max_load)
4106 goto out_balanced;
4107
4108 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
4109
4110 if (sds.this_load >= sds.avg_load)
4111 goto out_balanced;
4112
4113 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
4114 goto out_balanced;
4115
4116 sds.busiest_load_per_task /= sds.busiest_nr_running;
4117 if (sds.group_imb)
4118 sds.busiest_load_per_task =
4119 min(sds.busiest_load_per_task, sds.avg_load);
4120
4121 /*
4122 * We're trying to get all the cpus to the average_load, so we don't
4123 * want to push ourselves above the average load, nor do we wish to
4124 * reduce the max loaded cpu below the average load, as either of these
4125 * actions would just result in more rebalancing later, and ping-pong
4126 * tasks around. Thus we look for the minimum possible imbalance.
4127 * Negative imbalances (*we* are more loaded than anyone else) will
4128 * be counted as no imbalance for these purposes -- we can't fix that
4129 * by pulling tasks to us. Be careful of negative numbers as they'll
4130 * appear as very large values with unsigned longs.
4131 */
4132 if (sds.max_load <= sds.busiest_load_per_task)
4133 goto out_balanced;
4134
4135 /* Looks like there is an imbalance. Compute it */
4136 calculate_imbalance(&sds, this_cpu, imbalance);
4137 return sds.busiest;
4138
4139out_balanced:
4140 /*
4141 * There is no obvious imbalance. But check if we can do some balancing
4142 * to save power.
4143 */
4144 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
4145 return sds.busiest;
4146ret:
4147 *imbalance = 0;
4148 return NULL;
4149}
4150
4151/*
4152 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4153 */
4154static struct rq *
4155find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
4156 unsigned long imbalance, const struct cpumask *cpus)
4157{
4158 struct rq *busiest = NULL, *rq;
4159 unsigned long max_load = 0;
4160 int i;
4161
4162 for_each_cpu(i, sched_group_cpus(group)) {
4163 unsigned long power = power_of(i);
4164 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
4165 unsigned long wl;
4166
4167 if (!cpumask_test_cpu(i, cpus))
4168 continue;
4169
4170 rq = cpu_rq(i);
4171 wl = weighted_cpuload(i);
4172
4173 /*
4174 * When comparing with imbalance, use weighted_cpuload()
4175 * which is not scaled with the cpu power.
4176 */
4177 if (capacity && rq->nr_running == 1 && wl > imbalance)
4178 continue;
4179
4180 /*
4181 * For the load comparisons with the other cpu's, consider
4182 * the weighted_cpuload() scaled with the cpu power, so that
4183 * the load can be moved away from the cpu that is potentially
4184 * running at a lower capacity.
4185 */
4186 wl = (wl * SCHED_LOAD_SCALE) / power;
4187
4188 if (wl > max_load) {
4189 max_load = wl;
4190 busiest = rq;
4191 }
4192 }
4193
4194 return busiest;
4195}
4196
4197/*
4198 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
4199 * so long as it is large enough.
4200 */
4201#define MAX_PINNED_INTERVAL 512
4202
4203/* Working cpumask for load_balance and load_balance_newidle. */
4204static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4205
4206/*
4207 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4208 * tasks if there is an imbalance.
4209 */
4210static int load_balance(int this_cpu, struct rq *this_rq,
4211 struct sched_domain *sd, enum cpu_idle_type idle,
4212 int *balance)
4213{
4214 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
4215 struct sched_group *group;
4216 unsigned long imbalance;
4217 struct rq *busiest;
4218 unsigned long flags;
4219 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4220
4221 cpumask_copy(cpus, cpu_active_mask);
4222
4223 /*
4224 * When power savings policy is enabled for the parent domain, idle
4225 * sibling can pick up load irrespective of busy siblings. In this case,
4226 * let the state of idle sibling percolate up as CPU_IDLE, instead of
4227 * portraying it as CPU_NOT_IDLE.
4228 */
4229 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
4230 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4231 sd_idle = 1;
4232
4233 schedstat_inc(sd, lb_count[idle]);
4234
4235redo:
4236 update_shares(sd);
4237 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
4238 cpus, balance);
4239
4240 if (*balance == 0)
4241 goto out_balanced;
4242
4243 if (!group) {
4244 schedstat_inc(sd, lb_nobusyg[idle]);
4245 goto out_balanced;
4246 }
4247
4248 busiest = find_busiest_queue(group, idle, imbalance, cpus);
4249 if (!busiest) {
4250 schedstat_inc(sd, lb_nobusyq[idle]);
4251 goto out_balanced;
4252 }
4253
4254 BUG_ON(busiest == this_rq);
4255
4256 schedstat_add(sd, lb_imbalance[idle], imbalance);
4257
4258 ld_moved = 0;
4259 if (busiest->nr_running > 1) {
4260 /*
4261 * Attempt to move tasks. If find_busiest_group has found
4262 * an imbalance but busiest->nr_running <= 1, the group is
4263 * still unbalanced. ld_moved simply stays zero, so it is
4264 * correctly treated as an imbalance.
4265 */
4266 local_irq_save(flags);
4267 double_rq_lock(this_rq, busiest);
4268 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4269 imbalance, sd, idle, &all_pinned);
4270 double_rq_unlock(this_rq, busiest);
4271 local_irq_restore(flags);
4272
4273 /*
4274 * some other cpu did the load balance for us.
4275 */
4276 if (ld_moved && this_cpu != smp_processor_id())
4277 resched_cpu(this_cpu);
4278
4279 /* All tasks on this runqueue were pinned by CPU affinity */
4280 if (unlikely(all_pinned)) {
4281 cpumask_clear_cpu(cpu_of(busiest), cpus);
4282 if (!cpumask_empty(cpus))
4283 goto redo;
4284 goto out_balanced;
4285 }
4286 }
4287
4288 if (!ld_moved) {
4289 schedstat_inc(sd, lb_failed[idle]);
4290 sd->nr_balance_failed++;
4291
4292 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
4293
4294 raw_spin_lock_irqsave(&busiest->lock, flags);
4295
4296 /* don't kick the migration_thread, if the curr
4297 * task on busiest cpu can't be moved to this_cpu
4298 */
4299 if (!cpumask_test_cpu(this_cpu,
4300 &busiest->curr->cpus_allowed)) {
4301 raw_spin_unlock_irqrestore(&busiest->lock,
4302 flags);
4303 all_pinned = 1;
4304 goto out_one_pinned;
4305 }
4306
4307 if (!busiest->active_balance) {
4308 busiest->active_balance = 1;
4309 busiest->push_cpu = this_cpu;
4310 active_balance = 1;
4311 }
4312 raw_spin_unlock_irqrestore(&busiest->lock, flags);
4313 if (active_balance)
4314 wake_up_process(busiest->migration_thread);
4315
4316 /*
4317 * We've kicked active balancing, reset the failure
4318 * counter.
4319 */
4320 sd->nr_balance_failed = sd->cache_nice_tries+1;
4321 }
4322 } else
4323 sd->nr_balance_failed = 0;
4324
4325 if (likely(!active_balance)) {
4326 /* We were unbalanced, so reset the balancing interval */
4327 sd->balance_interval = sd->min_interval;
4328 } else {
4329 /*
4330 * If we've begun active balancing, start to back off. This
4331 * case may not be covered by the all_pinned logic if there
4332 * is only 1 task on the busy runqueue (because we don't call
4333 * move_tasks).
4334 */
4335 if (sd->balance_interval < sd->max_interval)
4336 sd->balance_interval *= 2;
4337 }
4338
4339 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4340 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4341 ld_moved = -1;
4342
4343 goto out;
4344
4345out_balanced:
4346 schedstat_inc(sd, lb_balanced[idle]);
4347
4348 sd->nr_balance_failed = 0;
4349
4350out_one_pinned:
4351 /* tune up the balancing interval */
4352 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
4353 (sd->balance_interval < sd->max_interval))
4354 sd->balance_interval *= 2;
4355
4356 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4357 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4358 ld_moved = -1;
4359 else
4360 ld_moved = 0;
4361out:
4362 if (ld_moved)
4363 update_shares(sd);
4364 return ld_moved;
4365}
4366
4367/*
4368 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4369 * tasks if there is an imbalance.
4370 *
4371 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
4372 * this_rq is locked.
4373 */
4374static int
4375load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4376{
4377 struct sched_group *group;
4378 struct rq *busiest = NULL;
4379 unsigned long imbalance;
4380 int ld_moved = 0;
4381 int sd_idle = 0;
4382 int all_pinned = 0;
4383 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4384
4385 cpumask_copy(cpus, cpu_active_mask);
4386
4387 /*
4388 * When power savings policy is enabled for the parent domain, idle
4389 * sibling can pick up load irrespective of busy siblings. In this case,
4390 * let the state of idle sibling percolate up as IDLE, instead of
4391 * portraying it as CPU_NOT_IDLE.
4392 */
4393 if (sd->flags & SD_SHARE_CPUPOWER &&
4394 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4395 sd_idle = 1;
4396
4397 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
4398redo:
4399 update_shares_locked(this_rq, sd);
4400 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
4401 &sd_idle, cpus, NULL);
4402 if (!group) {
4403 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
4404 goto out_balanced;
4405 }
4406
4407 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
4408 if (!busiest) {
4409 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
4410 goto out_balanced;
4411 }
4412
4413 BUG_ON(busiest == this_rq);
4414
4415 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
4416
4417 ld_moved = 0;
4418 if (busiest->nr_running > 1) {
4419 /* Attempt to move tasks */
4420 double_lock_balance(this_rq, busiest);
4421 /* this_rq->clock is already updated */
4422 update_rq_clock(busiest);
4423 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4424 imbalance, sd, CPU_NEWLY_IDLE,
4425 &all_pinned);
4426 double_unlock_balance(this_rq, busiest);
4427
4428 if (unlikely(all_pinned)) {
4429 cpumask_clear_cpu(cpu_of(busiest), cpus);
4430 if (!cpumask_empty(cpus))
4431 goto redo;
4432 }
4433 }
4434
4435 if (!ld_moved) {
4436 int active_balance = 0;
4437
4438 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
4439 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4440 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4441 return -1;
4442
4443 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4444 return -1;
4445
4446 if (sd->nr_balance_failed++ < 2)
4447 return -1;
4448
4449 /*
4450 * The only task running in a non-idle cpu can be moved to this
4451 * cpu in an attempt to completely freeup the other CPU
4452 * package. The same method used to move task in load_balance()
4453 * have been extended for load_balance_newidle() to speedup
4454 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
4455 *
4456 * The package power saving logic comes from
4457 * find_busiest_group(). If there are no imbalance, then
4458 * f_b_g() will return NULL. However when sched_mc={1,2} then
4459 * f_b_g() will select a group from which a running task may be
4460 * pulled to this cpu in order to make the other package idle.
4461 * If there is no opportunity to make a package idle and if
4462 * there are no imbalance, then f_b_g() will return NULL and no
4463 * action will be taken in load_balance_newidle().
4464 *
4465 * Under normal task pull operation due to imbalance, there
4466 * will be more than one task in the source run queue and
4467 * move_tasks() will succeed. ld_moved will be true and this
4468 * active balance code will not be triggered.
4469 */
4470
4471 /* Lock busiest in correct order while this_rq is held */
4472 double_lock_balance(this_rq, busiest);
4473
4474 /*
4475 * don't kick the migration_thread, if the curr
4476 * task on busiest cpu can't be moved to this_cpu
4477 */
4478 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
4479 double_unlock_balance(this_rq, busiest);
4480 all_pinned = 1;
4481 return ld_moved;
4482 }
4483
4484 if (!busiest->active_balance) {
4485 busiest->active_balance = 1;
4486 busiest->push_cpu = this_cpu;
4487 active_balance = 1;
4488 }
4489
4490 double_unlock_balance(this_rq, busiest);
4491 /*
4492 * Should not call ttwu while holding a rq->lock
4493 */
4494 raw_spin_unlock(&this_rq->lock);
4495 if (active_balance)
4496 wake_up_process(busiest->migration_thread);
4497 raw_spin_lock(&this_rq->lock);
4498
4499 } else
4500 sd->nr_balance_failed = 0;
4501
4502 update_shares_locked(this_rq, sd);
4503 return ld_moved;
4504
4505out_balanced:
4506 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
4507 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4508 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4509 return -1;
4510 sd->nr_balance_failed = 0;
4511
4512 return 0;
4513}
4514
4515/*
4516 * idle_balance is called by schedule() if this_cpu is about to become
4517 * idle. Attempts to pull tasks from other CPUs.
4518 */
4519static void idle_balance(int this_cpu, struct rq *this_rq)
4520{
4521 struct sched_domain *sd;
4522 int pulled_task = 0;
4523 unsigned long next_balance = jiffies + HZ;
4524
4525 this_rq->idle_stamp = this_rq->clock;
4526
4527 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4528 return;
4529
4530 for_each_domain(this_cpu, sd) {
4531 unsigned long interval;
4532
4533 if (!(sd->flags & SD_LOAD_BALANCE))
4534 continue;
4535
4536 if (sd->flags & SD_BALANCE_NEWIDLE)
4537			/* If we've pulled tasks over, stop searching: */
4538 pulled_task = load_balance_newidle(this_cpu, this_rq,
4539 sd);
4540
4541 interval = msecs_to_jiffies(sd->balance_interval);
4542 if (time_after(next_balance, sd->last_balance + interval))
4543 next_balance = sd->last_balance + interval;
4544 if (pulled_task) {
4545 this_rq->idle_stamp = 0;
4546 break;
4547 }
4548 }
4549 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4550 /*
4551 * We are going idle. next_balance may be set based on
4552 * a busy processor. So reset next_balance.
4553 */
4554 this_rq->next_balance = next_balance;
4555 }
4556}
4557
4558/*
4559 * active_load_balance is run by migration threads. It pushes running tasks
4560 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
4561 * running on each physical CPU where possible, and avoids physical /
4562 * logical imbalances.
4563 *
4564 * Called with busiest_rq locked.
4565 */
4566static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4567{
4568 int target_cpu = busiest_rq->push_cpu;
4569 struct sched_domain *sd;
4570 struct rq *target_rq;
4571
4572 /* Is there any task to move? */
4573 if (busiest_rq->nr_running <= 1)
4574 return;
4575
4576 target_rq = cpu_rq(target_cpu);
4577
4578 /*
4579	 * This condition is "impossible"; if it occurs,
4580 * we need to fix it. Originally reported by
4581 * Bjorn Helgaas on a 128-cpu setup.
4582 */
4583 BUG_ON(busiest_rq == target_rq);
4584
4585 /* move a task from busiest_rq to target_rq */
4586 double_lock_balance(busiest_rq, target_rq);
4587 update_rq_clock(busiest_rq);
4588 update_rq_clock(target_rq);
4589
4590 /* Search for an sd spanning us and the target CPU. */
4591 for_each_domain(target_cpu, sd) {
4592 if ((sd->flags & SD_LOAD_BALANCE) &&
4593 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
4594 break;
4595 }
4596
4597 if (likely(sd)) {
4598 schedstat_inc(sd, alb_count);
4599
4600 if (move_one_task(target_rq, target_cpu, busiest_rq,
4601 sd, CPU_IDLE))
4602 schedstat_inc(sd, alb_pushed);
4603 else
4604 schedstat_inc(sd, alb_failed);
4605 }
4606 double_unlock_balance(busiest_rq, target_rq);
4607}
4608
4609#ifdef CONFIG_NO_HZ
4610static struct {
4611 atomic_t load_balancer;
4612 cpumask_var_t cpu_mask;
4613 cpumask_var_t ilb_grp_nohz_mask;
4614} nohz ____cacheline_aligned = {
4615 .load_balancer = ATOMIC_INIT(-1),
4616};
4617
4618int get_nohz_load_balancer(void)
4619{
4620 return atomic_read(&nohz.load_balancer);
4621}
4622
4623#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4624/**
4625 * lowest_flag_domain - Return lowest sched_domain containing flag.
4626 * @cpu: The cpu whose lowest level of sched domain is to
4627 * be returned.
4628 * @flag: The flag to check for the lowest sched_domain
4629 * for the given cpu.
4630 *
4631 * Returns the lowest sched_domain of a cpu which contains the given flag.
4632 */
4633static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4634{
4635 struct sched_domain *sd;
4636
4637 for_each_domain(cpu, sd)
4638 if (sd && (sd->flags & flag))
4639 break;
4640
4641 return sd;
4642}
4643
4644/**
4645 * for_each_flag_domain - Iterates over sched_domains containing the flag.
4646 * @cpu: The cpu whose domains we're iterating over.
4647 * @sd: variable holding the value of the power_savings_sd
4648 * for cpu.
4649 * @flag: The flag to filter the sched_domains to be iterated.
4650 *
4651 * Iterates over all the scheduler domains for a given cpu that have the 'flag'
4652 * set, starting from the lowest sched_domain to the highest.
4653 */
4654#define for_each_flag_domain(cpu, sd, flag) \
4655 for (sd = lowest_flag_domain(cpu, flag); \
4656 (sd && (sd->flags & flag)); sd = sd->parent)
4657
4658/**
4659 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4660 * @ilb_group: group to be checked for semi-idleness
4661 *
4662 * Returns: 1 if the group is semi-idle. 0 otherwise.
4663 *
4664 * We define a sched_group to be semi-idle if it has at least one idle CPU
4665 * and at least one non-idle CPU. This helper function checks if the given
4666 * sched_group is semi-idle or not.
4667 */
4668static inline int is_semi_idle_group(struct sched_group *ilb_group)
4669{
4670 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
4671 sched_group_cpus(ilb_group));
4672
4673 /*
4674	 * A sched_group is semi-idle when it has at least one busy cpu
4675	 * and at least one idle cpu.
4676 */
4677 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
4678 return 0;
4679
4680 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
4681 return 0;
4682
4683 return 1;
4684}
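
is_semi_idle_group() boils down to two cpumask tests: the group must intersect the set of tick-stopped (idle) cpus, but must not be entirely contained in it. Below is a small sketch of the same test using plain bitmasks instead of the kernel's cpumask API; the bit-per-cpu representation and the function name are assumptions made for illustration.

#include <stdio.h>

/*
 * Semi-idle check with plain bitmasks: one bit per cpu, 'nohz' holds
 * the tick-stopped (idle) cpus, 'group' the cpus of the sched_group.
 */
static int is_semi_idle(unsigned long group, unsigned long nohz)
{
	unsigned long idle_in_group = group & nohz;

	if (idle_in_group == 0)		/* no idle cpu in the group */
		return 0;
	if (idle_in_group == group)	/* every cpu in the group is idle */
		return 0;
	return 1;			/* at least one idle and one busy cpu */
}

int main(void)
{
	/* cpus 0-3 form the group; cpus 1 and 2 have stopped their tick */
	printf("%d\n", is_semi_idle(0x0f, 0x06));	/* 1: semi-idle */
	printf("%d\n", is_semi_idle(0x0f, 0x0f));	/* 0: fully idle */
	printf("%d\n", is_semi_idle(0x0f, 0x30));	/* 0: fully busy */
	return 0;
}
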
4685/**
4686 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4687 * @cpu: The cpu which is nominating a new idle_load_balancer.
4688 *
4689 * Returns: The id of the idle load balancer if it exists;
4690 * otherwise, a value >= nr_cpu_ids.
4691 *
4692 * This algorithm picks the idle load balancer such that it belongs to a
4693 * semi-idle powersavings sched_domain. The idea is to avoid waking up
4694 * completely idle packages/cores just for the purpose of idle load balancing
4695 * when there are other idle cpus which are better suited for that job.
4696 */
4697static int find_new_ilb(int cpu)
4698{
4699 struct sched_domain *sd;
4700 struct sched_group *ilb_group;
4701
4702 /*
4703	 * Select the idle load balancer from semi-idle packages only
4704	 * when power-aware load balancing is enabled.
4705 */
4706 if (!(sched_smt_power_savings || sched_mc_power_savings))
4707 goto out_done;
4708
4709 /*
4710 * Optimize for the case when we have no idle CPUs or only one
4711 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4712 */
4713 if (cpumask_weight(nohz.cpu_mask) < 2)
4714 goto out_done;
4715
4716 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4717 ilb_group = sd->groups;
4718
4719 do {
4720 if (is_semi_idle_group(ilb_group))
4721 return cpumask_first(nohz.ilb_grp_nohz_mask);
4722
4723 ilb_group = ilb_group->next;
4724
4725 } while (ilb_group != sd->groups);
4726 }
4727
4728out_done:
4729 return cpumask_first(nohz.cpu_mask);
4730}
4731#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4732static inline int find_new_ilb(int call_cpu)
4733{
4734 return cpumask_first(nohz.cpu_mask);
4735}
4736#endif
4737
4738/*
4739 * This routine tries to nominate the ilb (idle load balancing)
4740 * owner among the cpus whose ticks are stopped. The ilb owner does the idle
4741 * load balancing on behalf of all those cpus. If all the cpus in the system
4742 * go into this tickless mode, then there will be no ilb owner (as there is
4743 * no need for one) and all the cpus will sleep until the next wakeup event
4744 * arrives...
4745 *
4746 * For the ilb owner, the tick is not stopped, and this tick is used
4747 * for idle load balancing. The ilb owner remains part of
4748 * nohz.cpu_mask.
4749 *
4750 * While stopping the tick, this cpu becomes the ilb owner if there
4751 * is no other owner. It remains the owner until it becomes busy
4752 * or until all cpus in the system stop their ticks, at which point
4753 * there is no need for an ilb owner.
4754 *
4755 * When the ilb owner becomes busy, it nominates another owner during the
4756 * next busy scheduler_tick().
4757 */
4758int select_nohz_load_balancer(int stop_tick)
4759{
4760 int cpu = smp_processor_id();
4761
4762 if (stop_tick) {
4763 cpu_rq(cpu)->in_nohz_recently = 1;
4764
4765 if (!cpu_active(cpu)) {
4766 if (atomic_read(&nohz.load_balancer) != cpu)
4767 return 0;
4768
4769 /*
4770 * If we are going offline and still the leader,
4771 * give up!
4772 */
4773 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4774 BUG();
4775
4776 return 0;
4777 }
4778
4779 cpumask_set_cpu(cpu, nohz.cpu_mask);
4780
4781 /* time for ilb owner also to sleep */
4782 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
4783 if (atomic_read(&nohz.load_balancer) == cpu)
4784 atomic_set(&nohz.load_balancer, -1);
4785 return 0;
4786 }
4787
4788 if (atomic_read(&nohz.load_balancer) == -1) {
4789 /* make me the ilb owner */
4790 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4791 return 1;
4792 } else if (atomic_read(&nohz.load_balancer) == cpu) {
4793 int new_ilb;
4794
4795 if (!(sched_smt_power_savings ||
4796 sched_mc_power_savings))
4797 return 1;
4798 /*
4799 * Check to see if there is a more power-efficient
4800 * ilb.
4801 */
4802 new_ilb = find_new_ilb(cpu);
4803 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4804 atomic_set(&nohz.load_balancer, -1);
4805 resched_cpu(new_ilb);
4806 return 0;
4807 }
4808 return 1;
4809 }
4810 } else {
4811 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
4812 return 0;
4813
4814 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4815
4816 if (atomic_read(&nohz.load_balancer) == cpu)
4817 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4818 BUG();
4819 }
4820 return 0;
4821}
4822#endif
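
The nomination in select_nohz_load_balancer() hinges on a single atomic compare-and-swap of nohz.load_balancer: a cpu becomes the owner only if the slot still reads -1, and gives the role up with the symmetric cmpxchg back to -1. A standalone sketch of that election using C11 atomics follows; the variable and helper names are illustrative, not the kernel's.

#include <stdatomic.h>
#include <stdio.h>

static atomic_int load_balancer = -1;	/* -1: no ilb owner */

/* Try to become the idle-load-balance owner; returns 1 on success. */
static int try_become_ilb(int cpu)
{
	int expected = -1;

	return atomic_compare_exchange_strong(&load_balancer, &expected, cpu);
}

/* Give up ownership; only the current owner may clear the slot. */
static int resign_ilb(int cpu)
{
	int expected = cpu;

	return atomic_compare_exchange_strong(&load_balancer, &expected, -1);
}

int main(void)
{
	printf("cpu0 elected: %d\n", try_become_ilb(0));	/* 1 */
	printf("cpu1 elected: %d\n", try_become_ilb(1));	/* 0: already owned */
	printf("cpu0 resigns: %d\n", resign_ilb(0));		/* 1 */
	return 0;
}
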
4823
4824static DEFINE_SPINLOCK(balancing);
4825
4826/*
4827 * It checks each scheduling domain to see if it is due to be balanced,
4828 * and initiates a balancing operation if so.
4829 *
4830 * Balancing parameters are set up in arch_init_sched_domains.
4831 */
4832static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4833{
4834 int balance = 1;
4835 struct rq *rq = cpu_rq(cpu);
4836 unsigned long interval;
4837 struct sched_domain *sd;
4838 /* Earliest time when we have to do rebalance again */
4839 unsigned long next_balance = jiffies + 60*HZ;
4840 int update_next_balance = 0;
4841 int need_serialize;
4842
4843 for_each_domain(cpu, sd) {
4844 if (!(sd->flags & SD_LOAD_BALANCE))
4845 continue;
4846
4847 interval = sd->balance_interval;
4848 if (idle != CPU_IDLE)
4849 interval *= sd->busy_factor;
4850
4851 /* scale ms to jiffies */
4852 interval = msecs_to_jiffies(interval);
4853 if (unlikely(!interval))
4854 interval = 1;
4855 if (interval > HZ*NR_CPUS/10)
4856 interval = HZ*NR_CPUS/10;
4857
4858 need_serialize = sd->flags & SD_SERIALIZE;
4859
4860 if (need_serialize) {
4861 if (!spin_trylock(&balancing))
4862 goto out;
4863 }
4864
4865 if (time_after_eq(jiffies, sd->last_balance + interval)) {
4866 if (load_balance(cpu, rq, sd, idle, &balance)) {
4867 /*
4868 * We've pulled tasks over so either we're no
4869 * longer idle, or one of our SMT siblings is
4870 * not idle.
4871 */
4872 idle = CPU_NOT_IDLE;
4873 }
4874 sd->last_balance = jiffies;
4875 }
4876 if (need_serialize)
4877 spin_unlock(&balancing);
4878out:
4879 if (time_after(next_balance, sd->last_balance + interval)) {
4880 next_balance = sd->last_balance + interval;
4881 update_next_balance = 1;
4882 }
4883
4884 /*
4885 * Stop the load balance at this level. There is another
4886 * CPU in our sched group which is doing load balancing more
4887 * actively.
4888 */
4889 if (!balance)
4890 break;
4891 }
4892
4893 /*
4894 * next_balance will be updated only when there is a need.
4895 * When the cpu is attached to a null domain, for example, it will not be
4896 * updated.
4897 */
4898 if (likely(update_next_balance))
4899 rq->next_balance = next_balance;
4900}
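
rebalance_domains() stretches each domain's balance_interval by busy_factor when the cpu is not idle, converts it to jiffies, and clamps the result to the range [1, HZ*NR_CPUS/10]. A small worked sketch of that computation under assumed values of HZ and NR_CPUS; the helper below is a stand-in, not the kernel's msecs_to_jiffies().

#include <stdio.h>

/* Assumed config values, for illustration only. */
#define HZ	1000
#define NR_CPUS	64

/* Stand-in conversion; collapses to a no-op when HZ == 1000. */
static unsigned long msecs_to_jiffies(unsigned long msecs)
{
	return msecs * HZ / 1000;
}

/* Effective rebalance interval, in jiffies, for one sched_domain. */
static unsigned long effective_interval(unsigned long balance_interval_ms,
					unsigned int busy_factor, int cpu_idle)
{
	unsigned long interval = balance_interval_ms;

	if (!cpu_idle)
		interval *= busy_factor;	/* balance less often when busy */

	interval = msecs_to_jiffies(interval);
	if (!interval)
		interval = 1;			/* never drop to zero */
	if (interval > HZ * NR_CPUS / 10)
		interval = HZ * NR_CPUS / 10;	/* global upper clamp */

	return interval;
}

int main(void)
{
	/* assumed 64ms base interval and busy_factor of 32 */
	printf("%lu\n", effective_interval(64, 32, 0));	/* 2048 jiffies when busy */
	printf("%lu\n", effective_interval(64, 32, 1));	/* 64 jiffies when idle */
	return 0;
}
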
4901
4902/*
4903 * run_rebalance_domains is triggered when needed from the scheduler tick.
4904 * In CONFIG_NO_HZ case, the idle load balance owner will do the
4905 * rebalancing for all the cpus for whom scheduler ticks are stopped.
4906 */
4907static void run_rebalance_domains(struct softirq_action *h)
4908{
4909 int this_cpu = smp_processor_id();
4910 struct rq *this_rq = cpu_rq(this_cpu);
4911 enum cpu_idle_type idle = this_rq->idle_at_tick ?
4912 CPU_IDLE : CPU_NOT_IDLE;
4913
4914 rebalance_domains(this_cpu, idle);
4915
4916#ifdef CONFIG_NO_HZ
4917 /*
4918 * If this cpu is the owner for idle load balancing, then do the
4919 * balancing on behalf of the other idle cpus whose ticks are
4920 * stopped.
4921 */
4922 if (this_rq->idle_at_tick &&
4923 atomic_read(&nohz.load_balancer) == this_cpu) {
4924 struct rq *rq;
4925 int balance_cpu;
4926
4927 for_each_cpu(balance_cpu, nohz.cpu_mask) {
4928 if (balance_cpu == this_cpu)
4929 continue;
4930
4931 /*
4932 * If this cpu gets work to do, stop the load balancing
4933 * work being done for other cpus. Next load
4934 * balancing owner will pick it up.
4935 */
4936 if (need_resched())
4937 break;
4938
4939 rebalance_domains(balance_cpu, CPU_IDLE);
4940
4941 rq = cpu_rq(balance_cpu);
4942 if (time_after(this_rq->next_balance, rq->next_balance))
4943 this_rq->next_balance = rq->next_balance;
4944 }
4945 }
4946#endif
4947}
4948
4949static inline int on_null_domain(int cpu)
4950{
4951 return !rcu_dereference_sched(cpu_rq(cpu)->sd);
4952}
4953
4954/*
4955 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4956 *
4957 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
4958 * idle load balancing owner or decide to stop the periodic load balancing,
4959 * if the whole system is idle.
4960 */
4961static inline void trigger_load_balance(struct rq *rq, int cpu)
4962{
4963#ifdef CONFIG_NO_HZ
4964 /*
4965 * If we were in the nohz mode recently and busy at the current
4966	 * scheduler tick, then check if we need to nominate a new idle
4967 * load balancer.
4968 */
4969 if (rq->in_nohz_recently && !rq->idle_at_tick) {
4970 rq->in_nohz_recently = 0;
4971
4972 if (atomic_read(&nohz.load_balancer) == cpu) {
4973 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4974 atomic_set(&nohz.load_balancer, -1);
4975 }
4976
4977 if (atomic_read(&nohz.load_balancer) == -1) {
4978 int ilb = find_new_ilb(cpu);
4979
4980 if (ilb < nr_cpu_ids)
4981 resched_cpu(ilb);
4982 }
4983 }
4984
4985 /*
4986 * If this cpu is idle and doing idle load balancing for all the
4987 * cpus with ticks stopped, is it time for that to stop?
4988 */
4989 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4990 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4991 resched_cpu(cpu);
4992 return;
4993 }
4994
4995 /*
4996 * If this cpu is idle and the idle load balancing is done by
4997	 * someone else, then there is no need to raise the SCHED_SOFTIRQ.
4998 */
4999 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
5000 cpumask_test_cpu(cpu, nohz.cpu_mask))
5001 return;
5002#endif
5003 /* Don't need to rebalance while attached to NULL domain */
5004 if (time_after_eq(jiffies, rq->next_balance) &&
5005 likely(!on_null_domain(cpu)))
5006 raise_softirq(SCHED_SOFTIRQ);
5007}
5008
5009#else /* CONFIG_SMP */
5010
5011/*
5012 * on UP we do not need to balance between CPUs:
5013 */
5014static inline void idle_balance(int cpu, struct rq *rq)
5015{
5016}
5017
5018#endif 3164#endif
5019 3165
5020DEFINE_PER_CPU(struct kernel_stat, kstat); 3166DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -6114,7 +4260,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6114 unsigned long flags; 4260 unsigned long flags;
6115 int oldprio, on_rq, running; 4261 int oldprio, on_rq, running;
6116 struct rq *rq; 4262 struct rq *rq;
6117 const struct sched_class *prev_class = p->sched_class; 4263 const struct sched_class *prev_class;
6118 4264
6119 BUG_ON(prio < 0 || prio > MAX_PRIO); 4265 BUG_ON(prio < 0 || prio > MAX_PRIO);
6120 4266
@@ -6122,6 +4268,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6122 update_rq_clock(rq); 4268 update_rq_clock(rq);
6123 4269
6124 oldprio = p->prio; 4270 oldprio = p->prio;
4271 prev_class = p->sched_class;
6125 on_rq = p->se.on_rq; 4272 on_rq = p->se.on_rq;
6126 running = task_current(rq, p); 4273 running = task_current(rq, p);
6127 if (on_rq) 4274 if (on_rq)
@@ -6139,7 +4286,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6139 if (running) 4286 if (running)
6140 p->sched_class->set_curr_task(rq); 4287 p->sched_class->set_curr_task(rq);
6141 if (on_rq) { 4288 if (on_rq) {
6142 enqueue_task(rq, p, 0); 4289 enqueue_task(rq, p, 0, oldprio < prio);
6143 4290
6144 check_class_changed(rq, p, prev_class, oldprio, running); 4291 check_class_changed(rq, p, prev_class, oldprio, running);
6145 } 4292 }
@@ -6183,7 +4330,7 @@ void set_user_nice(struct task_struct *p, long nice)
6183 delta = p->prio - old_prio; 4330 delta = p->prio - old_prio;
6184 4331
6185 if (on_rq) { 4332 if (on_rq) {
6186 enqueue_task(rq, p, 0); 4333 enqueue_task(rq, p, 0, false);
6187 /* 4334 /*
6188 * If the task increased its priority or is running and 4335 * If the task increased its priority or is running and
6189 * lowered its priority, then reschedule its CPU: 4336 * lowered its priority, then reschedule its CPU:
@@ -6341,7 +4488,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
6341{ 4488{
6342 int retval, oldprio, oldpolicy = -1, on_rq, running; 4489 int retval, oldprio, oldpolicy = -1, on_rq, running;
6343 unsigned long flags; 4490 unsigned long flags;
6344 const struct sched_class *prev_class = p->sched_class; 4491 const struct sched_class *prev_class;
6345 struct rq *rq; 4492 struct rq *rq;
6346 int reset_on_fork; 4493 int reset_on_fork;
6347 4494
@@ -6455,6 +4602,7 @@ recheck:
6455 p->sched_reset_on_fork = reset_on_fork; 4602 p->sched_reset_on_fork = reset_on_fork;
6456 4603
6457 oldprio = p->prio; 4604 oldprio = p->prio;
4605 prev_class = p->sched_class;
6458 __setscheduler(rq, p, policy, param->sched_priority); 4606 __setscheduler(rq, p, policy, param->sched_priority);
6459 4607
6460 if (running) 4608 if (running)
@@ -9493,7 +7641,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
9493 tg->rt_rq[cpu] = rt_rq; 7641 tg->rt_rq[cpu] = rt_rq;
9494 init_rt_rq(rt_rq, rq); 7642 init_rt_rq(rt_rq, rq);
9495 rt_rq->tg = tg; 7643 rt_rq->tg = tg;
9496 rt_rq->rt_se = rt_se;
9497 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7644 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
9498 if (add) 7645 if (add)
9499 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); 7646 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
@@ -9524,9 +7671,6 @@ void __init sched_init(void)
9524#ifdef CONFIG_RT_GROUP_SCHED 7671#ifdef CONFIG_RT_GROUP_SCHED
9525 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7672 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
9526#endif 7673#endif
9527#ifdef CONFIG_USER_SCHED
9528 alloc_size *= 2;
9529#endif
9530#ifdef CONFIG_CPUMASK_OFFSTACK 7674#ifdef CONFIG_CPUMASK_OFFSTACK
9531 alloc_size += num_possible_cpus() * cpumask_size(); 7675 alloc_size += num_possible_cpus() * cpumask_size();
9532#endif 7676#endif
@@ -9540,13 +7684,6 @@ void __init sched_init(void)
9540 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 7684 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
9541 ptr += nr_cpu_ids * sizeof(void **); 7685 ptr += nr_cpu_ids * sizeof(void **);
9542 7686
9543#ifdef CONFIG_USER_SCHED
9544 root_task_group.se = (struct sched_entity **)ptr;
9545 ptr += nr_cpu_ids * sizeof(void **);
9546
9547 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
9548 ptr += nr_cpu_ids * sizeof(void **);
9549#endif /* CONFIG_USER_SCHED */
9550#endif /* CONFIG_FAIR_GROUP_SCHED */ 7687#endif /* CONFIG_FAIR_GROUP_SCHED */
9551#ifdef CONFIG_RT_GROUP_SCHED 7688#ifdef CONFIG_RT_GROUP_SCHED
9552 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7689 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -9555,13 +7692,6 @@ void __init sched_init(void)
9555 init_task_group.rt_rq = (struct rt_rq **)ptr; 7692 init_task_group.rt_rq = (struct rt_rq **)ptr;
9556 ptr += nr_cpu_ids * sizeof(void **); 7693 ptr += nr_cpu_ids * sizeof(void **);
9557 7694
9558#ifdef CONFIG_USER_SCHED
9559 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
9560 ptr += nr_cpu_ids * sizeof(void **);
9561
9562 root_task_group.rt_rq = (struct rt_rq **)ptr;
9563 ptr += nr_cpu_ids * sizeof(void **);
9564#endif /* CONFIG_USER_SCHED */
9565#endif /* CONFIG_RT_GROUP_SCHED */ 7695#endif /* CONFIG_RT_GROUP_SCHED */
9566#ifdef CONFIG_CPUMASK_OFFSTACK 7696#ifdef CONFIG_CPUMASK_OFFSTACK
9567 for_each_possible_cpu(i) { 7697 for_each_possible_cpu(i) {
@@ -9581,22 +7711,13 @@ void __init sched_init(void)
9581#ifdef CONFIG_RT_GROUP_SCHED 7711#ifdef CONFIG_RT_GROUP_SCHED
9582 init_rt_bandwidth(&init_task_group.rt_bandwidth, 7712 init_rt_bandwidth(&init_task_group.rt_bandwidth,
9583 global_rt_period(), global_rt_runtime()); 7713 global_rt_period(), global_rt_runtime());
9584#ifdef CONFIG_USER_SCHED
9585 init_rt_bandwidth(&root_task_group.rt_bandwidth,
9586 global_rt_period(), RUNTIME_INF);
9587#endif /* CONFIG_USER_SCHED */
9588#endif /* CONFIG_RT_GROUP_SCHED */ 7714#endif /* CONFIG_RT_GROUP_SCHED */
9589 7715
9590#ifdef CONFIG_GROUP_SCHED 7716#ifdef CONFIG_CGROUP_SCHED
9591 list_add(&init_task_group.list, &task_groups); 7717 list_add(&init_task_group.list, &task_groups);
9592 INIT_LIST_HEAD(&init_task_group.children); 7718 INIT_LIST_HEAD(&init_task_group.children);
9593 7719
9594#ifdef CONFIG_USER_SCHED 7720#endif /* CONFIG_CGROUP_SCHED */
9595 INIT_LIST_HEAD(&root_task_group.children);
9596 init_task_group.parent = &root_task_group;
9597 list_add(&init_task_group.siblings, &root_task_group.children);
9598#endif /* CONFIG_USER_SCHED */
9599#endif /* CONFIG_GROUP_SCHED */
9600 7721
9601#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP 7722#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
9602 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), 7723 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
@@ -9636,25 +7757,6 @@ void __init sched_init(void)
9636 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7757 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
9637 */ 7758 */
9638 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7759 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
9639#elif defined CONFIG_USER_SCHED
9640 root_task_group.shares = NICE_0_LOAD;
9641 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
9642 /*
9643 * In case of task-groups formed thr' the user id of tasks,
9644 * init_task_group represents tasks belonging to root user.
9645 * Hence it forms a sibling of all subsequent groups formed.
9646 * In this case, init_task_group gets only a fraction of overall
9647 * system cpu resource, based on the weight assigned to root
9648 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
9649 * by letting tasks of init_task_group sit in a separate cfs_rq
9650 * (init_tg_cfs_rq) and having one entity represent this group of
9651 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
9652 */
9653 init_tg_cfs_entry(&init_task_group,
9654 &per_cpu(init_tg_cfs_rq, i),
9655 &per_cpu(init_sched_entity, i), i, 1,
9656 root_task_group.se[i]);
9657
9658#endif 7760#endif
9659#endif /* CONFIG_FAIR_GROUP_SCHED */ 7761#endif /* CONFIG_FAIR_GROUP_SCHED */
9660 7762
@@ -9663,12 +7765,6 @@ void __init sched_init(void)
9663 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7765 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
9664#ifdef CONFIG_CGROUP_SCHED 7766#ifdef CONFIG_CGROUP_SCHED
9665 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); 7767 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
9666#elif defined CONFIG_USER_SCHED
9667 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
9668 init_tg_rt_entry(&init_task_group,
9669 &per_cpu(init_rt_rq_var, i),
9670 &per_cpu(init_sched_rt_entity, i), i, 1,
9671 root_task_group.rt_se[i]);
9672#endif 7768#endif
9673#endif 7769#endif
9674 7770
@@ -9753,7 +7849,7 @@ static inline int preempt_count_equals(int preempt_offset)
9753 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 7849 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9754} 7850}
9755 7851
9756void __might_sleep(char *file, int line, int preempt_offset) 7852void __might_sleep(const char *file, int line, int preempt_offset)
9757{ 7853{
9758#ifdef in_atomic 7854#ifdef in_atomic
9759 static unsigned long prev_jiffy; /* ratelimiting */ 7855 static unsigned long prev_jiffy; /* ratelimiting */
@@ -10064,7 +8160,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
10064} 8160}
10065#endif /* CONFIG_RT_GROUP_SCHED */ 8161#endif /* CONFIG_RT_GROUP_SCHED */
10066 8162
10067#ifdef CONFIG_GROUP_SCHED 8163#ifdef CONFIG_CGROUP_SCHED
10068static void free_sched_group(struct task_group *tg) 8164static void free_sched_group(struct task_group *tg)
10069{ 8165{
10070 free_fair_sched_group(tg); 8166 free_fair_sched_group(tg);
@@ -10169,11 +8265,11 @@ void sched_move_task(struct task_struct *tsk)
10169 if (unlikely(running)) 8265 if (unlikely(running))
10170 tsk->sched_class->set_curr_task(rq); 8266 tsk->sched_class->set_curr_task(rq);
10171 if (on_rq) 8267 if (on_rq)
10172 enqueue_task(rq, tsk, 0); 8268 enqueue_task(rq, tsk, 0, false);
10173 8269
10174 task_rq_unlock(rq, &flags); 8270 task_rq_unlock(rq, &flags);
10175} 8271}
10176#endif /* CONFIG_GROUP_SCHED */ 8272#endif /* CONFIG_CGROUP_SCHED */
10177 8273
10178#ifdef CONFIG_FAIR_GROUP_SCHED 8274#ifdef CONFIG_FAIR_GROUP_SCHED
10179static void __set_se_shares(struct sched_entity *se, unsigned long shares) 8275static void __set_se_shares(struct sched_entity *se, unsigned long shares)
@@ -10315,13 +8411,6 @@ static int tg_schedulable(struct task_group *tg, void *data)
10315 runtime = d->rt_runtime; 8411 runtime = d->rt_runtime;
10316 } 8412 }
10317 8413
10318#ifdef CONFIG_USER_SCHED
10319 if (tg == &root_task_group) {
10320 period = global_rt_period();
10321 runtime = global_rt_runtime();
10322 }
10323#endif
10324
10325 /* 8414 /*
10326 * Cannot have more runtime than the period. 8415 * Cannot have more runtime than the period.
10327 */ 8416 */
@@ -10941,12 +9030,30 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
10941} 9030}
10942 9031
10943/* 9032/*
 9033 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled, one jiffy can be very large
 9034 * in cputime_t units. As a result, cpuacct_update_stats calls
 9035 * percpu_counter_add with values large enough to always overflow the
 9036 * per-cpu batch limit, causing bad SMP scalability.
 9037 *
 9038 * To fix this we scale percpu_counter_batch by cputime_one_jiffy, so we
 9039 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
 9040 * and enabled. We cap it at INT_MAX, which is the largest allowed batch value.
9041 */
9042#ifdef CONFIG_SMP
9043#define CPUACCT_BATCH \
9044 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9045#else
9046#define CPUACCT_BATCH 0
9047#endif
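
The effect of the scaled batch is easiest to see with numbers. The sketch below works through the min_t() expression with assumed values; a percpu_counter_batch of 32 and a cputime_one_jiffy of 1000000 cputime units are illustrative and not taken from any particular configuration.

#include <limits.h>
#include <stdio.h>

int main(void)
{
	/* Assumed values, for illustration only. */
	long percpu_counter_batch = 32;		/* unscaled per-cpu batch */
	long cputime_one_jiffy = 1000000;	/* cputime units per jiffy */

	/* min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX) */
	long scaled = percpu_counter_batch * cputime_one_jiffy;
	long batch = scaled < INT_MAX ? scaled : INT_MAX;

	/*
	 * Without scaling, a single jiffy of accounted time (1000000 units)
	 * already exceeds the batch of 32 and forces a global update on
	 * every call; with the scaled batch, roughly 32 jiffies' worth of
	 * time accumulates per cpu before the global counter is touched.
	 */
	printf("scaled batch = %ld\n", batch);	/* 32000000 */
	return 0;
}
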
9048
9049/*
10944 * Charge the system/user time to the task's accounting group. 9050 * Charge the system/user time to the task's accounting group.
10945 */ 9051 */
10946static void cpuacct_update_stats(struct task_struct *tsk, 9052static void cpuacct_update_stats(struct task_struct *tsk,
10947 enum cpuacct_stat_index idx, cputime_t val) 9053 enum cpuacct_stat_index idx, cputime_t val)
10948{ 9054{
10949 struct cpuacct *ca; 9055 struct cpuacct *ca;
9056 int batch = CPUACCT_BATCH;
10950 9057
10951 if (unlikely(!cpuacct_subsys.active)) 9058 if (unlikely(!cpuacct_subsys.active))
10952 return; 9059 return;
@@ -10955,7 +9062,7 @@ static void cpuacct_update_stats(struct task_struct *tsk,
10955 ca = task_ca(tsk); 9062 ca = task_ca(tsk);
10956 9063
10957 do { 9064 do {
10958 percpu_counter_add(&ca->cpustat[idx], val); 9065 __percpu_counter_add(&ca->cpustat[idx], val, batch);
10959 ca = ca->parent; 9066 ca = ca->parent;
10960 } while (ca); 9067 } while (ca);
10961 rcu_read_unlock(); 9068 rcu_read_unlock();
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 597b33099dfa..eeb3506c4834 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -47,9 +47,7 @@ static int convert_prio(int prio)
47} 47}
48 48
49#define for_each_cpupri_active(array, idx) \ 49#define for_each_cpupri_active(array, idx) \
50 for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \ 50 for_each_bit(idx, array, CPUPRI_NR_PRIORITIES)
51 idx < CPUPRI_NR_PRIORITIES; \
52 idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1))
53 51
54/** 52/**
55 * cpupri_find - find the best (lowest-pri) CPU in the system 53 * cpupri_find - find the best (lowest-pri) CPU in the system
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 8fe7ee81c552..3e1fd96c6cf9 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1053,7 +1053,8 @@ static inline void hrtick_update(struct rq *rq)
1053 * increased. Here we update the fair scheduling stats and 1053 * increased. Here we update the fair scheduling stats and
1054 * then put the task into the rbtree: 1054 * then put the task into the rbtree:
1055 */ 1055 */
1056static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) 1056static void
1057enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1057{ 1058{
1058 struct cfs_rq *cfs_rq; 1059 struct cfs_rq *cfs_rq;
1059 struct sched_entity *se = &p->se; 1060 struct sched_entity *se = &p->se;
@@ -1815,57 +1816,164 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
1815 */ 1816 */
1816 1817
1817/* 1818/*
1818 * Load-balancing iterator. Note: while the runqueue stays locked 1819 * pull_task - move a task from a remote runqueue to the local runqueue.
1819 * during the whole iteration, the current task might be 1820 * Both runqueues must be locked.
1820 * dequeued so the iterator has to be dequeue-safe. Here we
1821 * achieve that by always pre-iterating before returning
1822 * the current task:
1823 */ 1821 */
1824static struct task_struct * 1822static void pull_task(struct rq *src_rq, struct task_struct *p,
1825__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) 1823 struct rq *this_rq, int this_cpu)
1826{ 1824{
1827 struct task_struct *p = NULL; 1825 deactivate_task(src_rq, p, 0);
1828 struct sched_entity *se; 1826 set_task_cpu(p, this_cpu);
1827 activate_task(this_rq, p, 0);
1828 check_preempt_curr(this_rq, p, 0);
1829}
1829 1830
1830 if (next == &cfs_rq->tasks) 1831/*
1831 return NULL; 1832 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1833 */
1834static
1835int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1836 struct sched_domain *sd, enum cpu_idle_type idle,
1837 int *all_pinned)
1838{
1839 int tsk_cache_hot = 0;
1840 /*
1841 * We do not migrate tasks that are:
1842 * 1) running (obviously), or
1843 * 2) cannot be migrated to this CPU due to cpus_allowed, or
1844 * 3) are cache-hot on their current CPU.
1845 */
1846 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
1847 schedstat_inc(p, se.nr_failed_migrations_affine);
1848 return 0;
1849 }
1850 *all_pinned = 0;
1832 1851
1833 se = list_entry(next, struct sched_entity, group_node); 1852 if (task_running(rq, p)) {
1834 p = task_of(se); 1853 schedstat_inc(p, se.nr_failed_migrations_running);
1835 cfs_rq->balance_iterator = next->next; 1854 return 0;
1855 }
1836 1856
1837 return p; 1857 /*
1838} 1858 * Aggressive migration if:
1859 * 1) task is cache cold, or
1860 * 2) too many balance attempts have failed.
1861 */
1839 1862
1840static struct task_struct *load_balance_start_fair(void *arg) 1863 tsk_cache_hot = task_hot(p, rq->clock, sd);
1841{ 1864 if (!tsk_cache_hot ||
1842 struct cfs_rq *cfs_rq = arg; 1865 sd->nr_balance_failed > sd->cache_nice_tries) {
1866#ifdef CONFIG_SCHEDSTATS
1867 if (tsk_cache_hot) {
1868 schedstat_inc(sd, lb_hot_gained[idle]);
1869 schedstat_inc(p, se.nr_forced_migrations);
1870 }
1871#endif
1872 return 1;
1873 }
1843 1874
1844 return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next); 1875 if (tsk_cache_hot) {
1876 schedstat_inc(p, se.nr_failed_migrations_hot);
1877 return 0;
1878 }
1879 return 1;
1845} 1880}
1846 1881
1847static struct task_struct *load_balance_next_fair(void *arg) 1882/*
1883 * move_one_task tries to move exactly one task from busiest to this_rq, as
1884 * part of active balancing operations within "domain".
1885 * Returns 1 if successful and 0 otherwise.
1886 *
1887 * Called with both runqueues locked.
1888 */
1889static int
1890move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1891 struct sched_domain *sd, enum cpu_idle_type idle)
1848{ 1892{
1849 struct cfs_rq *cfs_rq = arg; 1893 struct task_struct *p, *n;
1894 struct cfs_rq *cfs_rq;
1895 int pinned = 0;
1896
1897 for_each_leaf_cfs_rq(busiest, cfs_rq) {
1898 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
1899
1900 if (!can_migrate_task(p, busiest, this_cpu,
1901 sd, idle, &pinned))
1902 continue;
1850 1903
1851 return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); 1904 pull_task(busiest, p, this_rq, this_cpu);
1905 /*
1906 * Right now, this is only the second place pull_task()
1907 * is called, so we can safely collect pull_task()
1908 * stats here rather than inside pull_task().
1909 */
1910 schedstat_inc(sd, lb_gained[idle]);
1911 return 1;
1912 }
1913 }
1914
1915 return 0;
1852} 1916}
1853 1917
1854static unsigned long 1918static unsigned long
1855__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1919balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1856 unsigned long max_load_move, struct sched_domain *sd, 1920 unsigned long max_load_move, struct sched_domain *sd,
1857 enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, 1921 enum cpu_idle_type idle, int *all_pinned,
1858 struct cfs_rq *cfs_rq) 1922 int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
1859{ 1923{
1860 struct rq_iterator cfs_rq_iterator; 1924 int loops = 0, pulled = 0, pinned = 0;
1925 long rem_load_move = max_load_move;
1926 struct task_struct *p, *n;
1861 1927
1862 cfs_rq_iterator.start = load_balance_start_fair; 1928 if (max_load_move == 0)
1863 cfs_rq_iterator.next = load_balance_next_fair; 1929 goto out;
1864 cfs_rq_iterator.arg = cfs_rq;
1865 1930
1866 return balance_tasks(this_rq, this_cpu, busiest, 1931 pinned = 1;
1867 max_load_move, sd, idle, all_pinned, 1932
1868 this_best_prio, &cfs_rq_iterator); 1933 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
1934 if (loops++ > sysctl_sched_nr_migrate)
1935 break;
1936
1937 if ((p->se.load.weight >> 1) > rem_load_move ||
1938 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
1939 continue;
1940
1941 pull_task(busiest, p, this_rq, this_cpu);
1942 pulled++;
1943 rem_load_move -= p->se.load.weight;
1944
1945#ifdef CONFIG_PREEMPT
1946 /*
1947 * NEWIDLE balancing is a source of latency, so preemptible
1948 * kernels will stop after the first task is pulled to minimize
1949 * the critical section.
1950 */
1951 if (idle == CPU_NEWLY_IDLE)
1952 break;
1953#endif
1954
1955 /*
1956 * We only want to steal up to the prescribed amount of
1957 * weighted load.
1958 */
1959 if (rem_load_move <= 0)
1960 break;
1961
1962 if (p->prio < *this_best_prio)
1963 *this_best_prio = p->prio;
1964 }
1965out:
1966 /*
1967 * Right now, this is one of only two places pull_task() is called,
1968 * so we can safely collect pull_task() stats here rather than
1969 * inside pull_task().
1970 */
1971 schedstat_add(sd, lb_gained[idle], pulled);
1972
1973 if (all_pinned)
1974 *all_pinned = pinned;
1975
1976 return max_load_move - rem_load_move;
1869} 1977}
1870 1978
1871#ifdef CONFIG_FAIR_GROUP_SCHED 1979#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1897,9 +2005,9 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1897 rem_load = (u64)rem_load_move * busiest_weight; 2005 rem_load = (u64)rem_load_move * busiest_weight;
1898 rem_load = div_u64(rem_load, busiest_h_load + 1); 2006 rem_load = div_u64(rem_load, busiest_h_load + 1);
1899 2007
1900 moved_load = __load_balance_fair(this_rq, this_cpu, busiest, 2008 moved_load = balance_tasks(this_rq, this_cpu, busiest,
1901 rem_load, sd, idle, all_pinned, this_best_prio, 2009 rem_load, sd, idle, all_pinned, this_best_prio,
1902 tg->cfs_rq[busiest_cpu]); 2010 busiest_cfs_rq);
1903 2011
1904 if (!moved_load) 2012 if (!moved_load)
1905 continue; 2013 continue;
@@ -1922,35 +2030,1509 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1922 struct sched_domain *sd, enum cpu_idle_type idle, 2030 struct sched_domain *sd, enum cpu_idle_type idle,
1923 int *all_pinned, int *this_best_prio) 2031 int *all_pinned, int *this_best_prio)
1924{ 2032{
1925 return __load_balance_fair(this_rq, this_cpu, busiest, 2033 return balance_tasks(this_rq, this_cpu, busiest,
1926 max_load_move, sd, idle, all_pinned, 2034 max_load_move, sd, idle, all_pinned,
1927 this_best_prio, &busiest->cfs); 2035 this_best_prio, &busiest->cfs);
1928} 2036}
1929#endif 2037#endif
1930 2038
1931static int 2039/*
1932move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 2040 * move_tasks tries to move up to max_load_move weighted load from busiest to
1933 struct sched_domain *sd, enum cpu_idle_type idle) 2041 * this_rq, as part of a balancing operation within domain "sd".
2042 * Returns 1 if successful and 0 otherwise.
2043 *
2044 * Called with both runqueues locked.
2045 */
2046static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2047 unsigned long max_load_move,
2048 struct sched_domain *sd, enum cpu_idle_type idle,
2049 int *all_pinned)
1934{ 2050{
1935 struct cfs_rq *busy_cfs_rq; 2051 unsigned long total_load_moved = 0, load_moved;
1936 struct rq_iterator cfs_rq_iterator; 2052 int this_best_prio = this_rq->curr->prio;
1937 2053
1938 cfs_rq_iterator.start = load_balance_start_fair; 2054 do {
1939 cfs_rq_iterator.next = load_balance_next_fair; 2055 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
2056 max_load_move - total_load_moved,
2057 sd, idle, all_pinned, &this_best_prio);
1940 2058
1941 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { 2059 total_load_moved += load_moved;
2060
2061#ifdef CONFIG_PREEMPT
1942 /* 2062 /*
1943 * pass busy_cfs_rq argument into 2063 * NEWIDLE balancing is a source of latency, so preemptible
1944 * load_balance_[start|next]_fair iterators 2064 * kernels will stop after the first task is pulled to minimize
2065 * the critical section.
1945 */ 2066 */
1946 cfs_rq_iterator.arg = busy_cfs_rq; 2067 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
1947 if (iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, 2068 break;
1948 &cfs_rq_iterator)) 2069
1949 return 1; 2070 if (raw_spin_is_contended(&this_rq->lock) ||
2071 raw_spin_is_contended(&busiest->lock))
2072 break;
2073#endif
2074 } while (load_moved && max_load_move > total_load_moved);
2075
2076 return total_load_moved > 0;
2077}
2078
2079/********** Helpers for find_busiest_group ************************/
2080/*
2081 * sd_lb_stats - Structure to store the statistics of a sched_domain
2082 * during load balancing.
2083 */
2084struct sd_lb_stats {
2085 struct sched_group *busiest; /* Busiest group in this sd */
2086 struct sched_group *this; /* Local group in this sd */
2087 unsigned long total_load; /* Total load of all groups in sd */
2088 unsigned long total_pwr; /* Total power of all groups in sd */
2089 unsigned long avg_load; /* Average load across all groups in sd */
2090
2091 /** Statistics of this group */
2092 unsigned long this_load;
2093 unsigned long this_load_per_task;
2094 unsigned long this_nr_running;
2095
2096 /* Statistics of the busiest group */
2097 unsigned long max_load;
2098 unsigned long busiest_load_per_task;
2099 unsigned long busiest_nr_running;
2100 unsigned long busiest_group_capacity;
2101
2102 int group_imb; /* Is there imbalance in this sd */
2103#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2104 int power_savings_balance; /* Is powersave balance needed for this sd */
2105 struct sched_group *group_min; /* Least loaded group in sd */
2106 struct sched_group *group_leader; /* Group which relieves group_min */
2107 unsigned long min_load_per_task; /* load_per_task in group_min */
2108 unsigned long leader_nr_running; /* Nr running of group_leader */
2109 unsigned long min_nr_running; /* Nr running of group_min */
2110#endif
2111};
2112
2113/*
2114 * sg_lb_stats - stats of a sched_group required for load_balancing
2115 */
2116struct sg_lb_stats {
 2117	unsigned long avg_load; /* Avg load across the CPUs of the group */
2118 unsigned long group_load; /* Total load over the CPUs of the group */
2119 unsigned long sum_nr_running; /* Nr tasks running in the group */
2120 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
2121 unsigned long group_capacity;
2122 int group_imb; /* Is there an imbalance in the group ? */
2123};
2124
2125/**
2126 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
2127 * @group: The group whose first cpu is to be returned.
2128 */
2129static inline unsigned int group_first_cpu(struct sched_group *group)
2130{
2131 return cpumask_first(sched_group_cpus(group));
2132}
2133
2134/**
2135 * get_sd_load_idx - Obtain the load index for a given sched domain.
2136 * @sd: The sched_domain whose load_idx is to be obtained.
 2137 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
2138 */
2139static inline int get_sd_load_idx(struct sched_domain *sd,
2140 enum cpu_idle_type idle)
2141{
2142 int load_idx;
2143
2144 switch (idle) {
2145 case CPU_NOT_IDLE:
2146 load_idx = sd->busy_idx;
2147 break;
2148
2149 case CPU_NEWLY_IDLE:
2150 load_idx = sd->newidle_idx;
2151 break;
2152 default:
2153 load_idx = sd->idle_idx;
2154 break;
1950 } 2155 }
1951 2156
2157 return load_idx;
2158}
2159
2160
2161#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2162/**
2163 * init_sd_power_savings_stats - Initialize power savings statistics for
2164 * the given sched_domain, during load balancing.
2165 *
2166 * @sd: Sched domain whose power-savings statistics are to be initialized.
2167 * @sds: Variable containing the statistics for sd.
2168 * @idle: Idle status of the CPU at which we're performing load-balancing.
2169 */
2170static inline void init_sd_power_savings_stats(struct sched_domain *sd,
2171 struct sd_lb_stats *sds, enum cpu_idle_type idle)
2172{
2173 /*
2174 * Busy processors will not participate in power savings
2175 * balance.
2176 */
2177 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2178 sds->power_savings_balance = 0;
2179 else {
2180 sds->power_savings_balance = 1;
2181 sds->min_nr_running = ULONG_MAX;
2182 sds->leader_nr_running = 0;
2183 }
2184}
2185
2186/**
2187 * update_sd_power_savings_stats - Update the power saving stats for a
2188 * sched_domain while performing load balancing.
2189 *
2190 * @group: sched_group belonging to the sched_domain under consideration.
2191 * @sds: Variable containing the statistics of the sched_domain
2192 * @local_group: Does group contain the CPU for which we're performing
2193 * load balancing ?
2194 * @sgs: Variable containing the statistics of the group.
2195 */
2196static inline void update_sd_power_savings_stats(struct sched_group *group,
2197 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
2198{
2199
2200 if (!sds->power_savings_balance)
2201 return;
2202
2203 /*
 2204	 * If the local group is idle or completely loaded,
 2205	 * there is no need to do power savings balance at this domain.
2206 */
2207 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
2208 !sds->this_nr_running))
2209 sds->power_savings_balance = 0;
2210
2211 /*
2212 * If a group is already running at full capacity or idle,
2213 * don't include that group in power savings calculations
2214 */
2215 if (!sds->power_savings_balance ||
2216 sgs->sum_nr_running >= sgs->group_capacity ||
2217 !sgs->sum_nr_running)
2218 return;
2219
2220 /*
2221 * Calculate the group which has the least non-idle load.
 2222	 * This is the group from which we need to pick up load
 2223	 * in order to save power.
2224 */
2225 if ((sgs->sum_nr_running < sds->min_nr_running) ||
2226 (sgs->sum_nr_running == sds->min_nr_running &&
2227 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
2228 sds->group_min = group;
2229 sds->min_nr_running = sgs->sum_nr_running;
2230 sds->min_load_per_task = sgs->sum_weighted_load /
2231 sgs->sum_nr_running;
2232 }
2233
2234 /*
 2235	 * Calculate the group which is nearly at its
 2236	 * capacity but still has some room to pick up some load
 2237	 * from another group and save more power.
2238 */
2239 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
2240 return;
2241
2242 if (sgs->sum_nr_running > sds->leader_nr_running ||
2243 (sgs->sum_nr_running == sds->leader_nr_running &&
2244 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
2245 sds->group_leader = group;
2246 sds->leader_nr_running = sgs->sum_nr_running;
2247 }
2248}
2249
2250/**
2251 * check_power_save_busiest_group - see if there is potential for some power-savings balance
2252 * @sds: Variable containing the statistics of the sched_domain
2253 * under consideration.
2254 * @this_cpu: Cpu at which we're currently performing load-balancing.
2255 * @imbalance: Variable to store the imbalance.
2256 *
2257 * Description:
2258 * Check if we have potential to perform some power-savings balance.
2259 * If yes, set the busiest group to be the least loaded group in the
 2260 * sched_domain, so that its CPUs can be put to idle.
2261 *
2262 * Returns 1 if there is potential to perform power-savings balance.
2263 * Else returns 0.
2264 */
2265static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2266 int this_cpu, unsigned long *imbalance)
2267{
2268 if (!sds->power_savings_balance)
2269 return 0;
2270
2271 if (sds->this != sds->group_leader ||
2272 sds->group_leader == sds->group_min)
2273 return 0;
2274
2275 *imbalance = sds->min_load_per_task;
2276 sds->busiest = sds->group_min;
2277
2278 return 1;
2279
2280}
2281#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
2282static inline void init_sd_power_savings_stats(struct sched_domain *sd,
2283 struct sd_lb_stats *sds, enum cpu_idle_type idle)
2284{
2285 return;
2286}
2287
2288static inline void update_sd_power_savings_stats(struct sched_group *group,
2289 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
2290{
2291 return;
2292}
2293
2294static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
2295 int this_cpu, unsigned long *imbalance)
2296{
1952 return 0; 2297 return 0;
1953} 2298}
2299#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
2300
2301
2302unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
2303{
2304 return SCHED_LOAD_SCALE;
2305}
2306
2307unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
2308{
2309 return default_scale_freq_power(sd, cpu);
2310}
2311
2312unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
2313{
2314 unsigned long weight = cpumask_weight(sched_domain_span(sd));
2315 unsigned long smt_gain = sd->smt_gain;
2316
2317 smt_gain /= weight;
2318
2319 return smt_gain;
2320}
2321
2322unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
2323{
2324 return default_scale_smt_power(sd, cpu);
2325}
2326
2327unsigned long scale_rt_power(int cpu)
2328{
2329 struct rq *rq = cpu_rq(cpu);
2330 u64 total, available;
2331
2332 sched_avg_update(rq);
2333
2334 total = sched_avg_period() + (rq->clock - rq->age_stamp);
2335 available = total - rq->rt_avg;
2336
2337 if (unlikely((s64)total < SCHED_LOAD_SCALE))
2338 total = SCHED_LOAD_SCALE;
2339
2340 total >>= SCHED_LOAD_SHIFT;
2341
2342 return div_u64(available, total);
2343}
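
scale_rt_power() returns the fraction of recent time left over after RT activity, expressed against SCHED_LOAD_SCALE: shifting total right by SCHED_LOAD_SHIFT before the division means a cpu with no RT load yields roughly 1024. A worked numeric sketch, with the sample window and RT time below chosen purely for illustration.

#include <stdint.h>
#include <stdio.h>

#define SCHED_LOAD_SHIFT	10
#define SCHED_LOAD_SCALE	(1UL << SCHED_LOAD_SHIFT)	/* 1024 */

int main(void)
{
	/* Assumed sample, in nanoseconds: 1ms window, 250us of RT time. */
	uint64_t total = 1000000;	/* sched_avg_period + clock delta */
	uint64_t rt_avg = 250000;	/* decayed RT runtime */
	uint64_t available = total - rt_avg;

	if ((int64_t)total < (int64_t)SCHED_LOAD_SCALE)
		total = SCHED_LOAD_SCALE;

	total >>= SCHED_LOAD_SHIFT;

	/* ~768 out of 1024: about 75% of the cpu is left for CFS tasks. */
	printf("rt scale = %llu\n", (unsigned long long)(available / total));
	return 0;
}
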
2344
2345static void update_cpu_power(struct sched_domain *sd, int cpu)
2346{
2347 unsigned long weight = cpumask_weight(sched_domain_span(sd));
2348 unsigned long power = SCHED_LOAD_SCALE;
2349 struct sched_group *sdg = sd->groups;
2350
2351 if (sched_feat(ARCH_POWER))
2352 power *= arch_scale_freq_power(sd, cpu);
2353 else
2354 power *= default_scale_freq_power(sd, cpu);
2355
2356 power >>= SCHED_LOAD_SHIFT;
2357
2358 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
2359 if (sched_feat(ARCH_POWER))
2360 power *= arch_scale_smt_power(sd, cpu);
2361 else
2362 power *= default_scale_smt_power(sd, cpu);
2363
2364 power >>= SCHED_LOAD_SHIFT;
2365 }
2366
2367 power *= scale_rt_power(cpu);
2368 power >>= SCHED_LOAD_SHIFT;
2369
2370 if (!power)
2371 power = 1;
2372
2373 sdg->cpu_power = power;
2374}
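
update_cpu_power() builds cpu_power as a chain of fixed-point multiplications: start at SCHED_LOAD_SCALE, scale by the frequency factor, by the SMT gain per sibling, and by the RT-free fraction, shifting right by SCHED_LOAD_SHIFT after each step. The sketch below walks through one such chain; the particular scale factors (full frequency, an smt_gain of 1178 split across two siblings, an RT scale of 768) are assumptions for illustration only.

#include <stdio.h>

#define SCHED_LOAD_SHIFT	10
#define SCHED_LOAD_SCALE	(1UL << SCHED_LOAD_SHIFT)	/* 1024 */

int main(void)
{
	/* Assumed inputs: full frequency, 2-way SMT, 75% free of RT work. */
	unsigned long freq_scale = 1024;	/* frequency scaling factor */
	unsigned long smt_scale = 1178 / 2;	/* smt gain / sibling count */
	unsigned long rt_scale = 768;		/* fraction left after RT */

	unsigned long power = SCHED_LOAD_SCALE;

	power = (power * freq_scale) >> SCHED_LOAD_SHIFT;	/* 1024 */
	power = (power * smt_scale) >> SCHED_LOAD_SHIFT;	/* 589 */
	power = (power * rt_scale) >> SCHED_LOAD_SHIFT;		/* 441 */

	if (!power)
		power = 1;	/* never let a cpu advertise zero power */

	printf("cpu_power = %lu\n", power);
	return 0;
}
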
2375
2376static void update_group_power(struct sched_domain *sd, int cpu)
2377{
2378 struct sched_domain *child = sd->child;
2379 struct sched_group *group, *sdg = sd->groups;
2380 unsigned long power;
2381
2382 if (!child) {
2383 update_cpu_power(sd, cpu);
2384 return;
2385 }
2386
2387 power = 0;
2388
2389 group = child->groups;
2390 do {
2391 power += group->cpu_power;
2392 group = group->next;
2393 } while (group != child->groups);
2394
2395 sdg->cpu_power = power;
2396}
2397
2398/**
2399 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
2400 * @sd: The sched_domain whose statistics are to be updated.
2401 * @group: sched_group whose statistics are to be updated.
2402 * @this_cpu: Cpu for which load balance is currently performed.
2403 * @idle: Idle status of this_cpu
2404 * @load_idx: Load index of sched_domain of this_cpu for load calc.
2405 * @sd_idle: Idle status of the sched_domain containing group.
2406 * @local_group: Does group contain this_cpu.
2407 * @cpus: Set of cpus considered for load balancing.
2408 * @balance: Should we balance.
2409 * @sgs: variable to hold the statistics for this group.
2410 */
2411static inline void update_sg_lb_stats(struct sched_domain *sd,
2412 struct sched_group *group, int this_cpu,
2413 enum cpu_idle_type idle, int load_idx, int *sd_idle,
2414 int local_group, const struct cpumask *cpus,
2415 int *balance, struct sg_lb_stats *sgs)
2416{
2417 unsigned long load, max_cpu_load, min_cpu_load;
2418 int i;
2419 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2420 unsigned long avg_load_per_task = 0;
2421
2422 if (local_group)
2423 balance_cpu = group_first_cpu(group);
2424
2425 /* Tally up the load of all CPUs in the group */
2426 max_cpu_load = 0;
2427 min_cpu_load = ~0UL;
2428
2429 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
2430 struct rq *rq = cpu_rq(i);
2431
2432 if (*sd_idle && rq->nr_running)
2433 *sd_idle = 0;
2434
2435 /* Bias balancing toward cpus of our domain */
2436 if (local_group) {
2437 if (idle_cpu(i) && !first_idle_cpu) {
2438 first_idle_cpu = 1;
2439 balance_cpu = i;
2440 }
2441
2442 load = target_load(i, load_idx);
2443 } else {
2444 load = source_load(i, load_idx);
2445 if (load > max_cpu_load)
2446 max_cpu_load = load;
2447 if (min_cpu_load > load)
2448 min_cpu_load = load;
2449 }
2450
2451 sgs->group_load += load;
2452 sgs->sum_nr_running += rq->nr_running;
2453 sgs->sum_weighted_load += weighted_cpuload(i);
2454
2455 }
2456
2457 /*
 2458	 * The first idle cpu or the first cpu (busiest) in this sched group
 2459	 * is eligible for doing load balancing at this and higher
 2460	 * domains. In the newly idle case, we will allow all the cpus
 2461	 * to do the newly idle load balance.
2462 */
2463 if (idle != CPU_NEWLY_IDLE && local_group &&
2464 balance_cpu != this_cpu) {
2465 *balance = 0;
2466 return;
2467 }
2468
2469 update_group_power(sd, this_cpu);
2470
2471 /* Adjust by relative CPU power of the group */
2472 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
2473
2474 /*
2475 * Consider the group unbalanced when the imbalance is larger
2476 * than the average weight of two tasks.
2477 *
2478 * APZ: with cgroup the avg task weight can vary wildly and
2479 * might not be a suitable number - should we keep a
2480 * normalized nr_running number somewhere that negates
2481 * the hierarchy?
2482 */
2483 if (sgs->sum_nr_running)
2484 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
2485
2486 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
2487 sgs->group_imb = 1;
2488
2489 sgs->group_capacity =
2490 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
2491}
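
Two derived quantities at the end of update_sg_lb_stats() drive the later decisions: avg_load normalises the group's raw load by its cpu_power, and group_capacity rounds cpu_power to a whole number of SCHED_LOAD_SCALE units; group_imb is set when the spread between the most and least loaded cpu exceeds twice the average task weight. A small numeric sketch under assumed group statistics.

#include <stdio.h>

#define SCHED_LOAD_SCALE	1024UL

/* Rounded division, as DIV_ROUND_CLOSEST does for group_capacity. */
static unsigned long div_round_closest(unsigned long x, unsigned long d)
{
	return (x + d / 2) / d;
}

int main(void)
{
	/* Assumed group stats: two SMT siblings, three runnable tasks. */
	unsigned long group_load = 3072;	/* summed cpu load */
	unsigned long cpu_power = 1178;		/* shared-core pair */
	unsigned long max_cpu_load = 2048, min_cpu_load = 1024;
	unsigned long sum_nr_running = 3, sum_weighted_load = 3072;

	unsigned long avg_load = group_load * SCHED_LOAD_SCALE / cpu_power;
	unsigned long capacity = div_round_closest(cpu_power, SCHED_LOAD_SCALE);
	unsigned long per_task = sum_weighted_load / sum_nr_running;
	int group_imb = (max_cpu_load - min_cpu_load) > 2 * per_task;

	printf("avg_load=%lu capacity=%lu imb=%d\n", avg_load, capacity, group_imb);
	/* avg_load=2670 capacity=1 imb=0 */
	return 0;
}
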
2492
2493/**
 2494 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
2495 * @sd: sched_domain whose statistics are to be updated.
2496 * @this_cpu: Cpu for which load balance is currently performed.
2497 * @idle: Idle status of this_cpu
2498 * @sd_idle: Idle status of the sched_domain containing group.
2499 * @cpus: Set of cpus considered for load balancing.
2500 * @balance: Should we balance.
2501 * @sds: variable to hold the statistics for this sched_domain.
2502 */
2503static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
2504 enum cpu_idle_type idle, int *sd_idle,
2505 const struct cpumask *cpus, int *balance,
2506 struct sd_lb_stats *sds)
2507{
2508 struct sched_domain *child = sd->child;
2509 struct sched_group *group = sd->groups;
2510 struct sg_lb_stats sgs;
2511 int load_idx, prefer_sibling = 0;
2512
2513 if (child && child->flags & SD_PREFER_SIBLING)
2514 prefer_sibling = 1;
2515
2516 init_sd_power_savings_stats(sd, sds, idle);
2517 load_idx = get_sd_load_idx(sd, idle);
2518
2519 do {
2520 int local_group;
2521
2522 local_group = cpumask_test_cpu(this_cpu,
2523 sched_group_cpus(group));
2524 memset(&sgs, 0, sizeof(sgs));
2525 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
2526 local_group, cpus, balance, &sgs);
2527
2528 if (local_group && !(*balance))
2529 return;
2530
2531 sds->total_load += sgs.group_load;
2532 sds->total_pwr += group->cpu_power;
2533
2534 /*
 2535		 * In case the child domain prefers tasks to go to siblings
 2536		 * first, lower the group capacity to one so that we'll try
 2537		 * to move all the excess tasks away.
2538 */
2539 if (prefer_sibling)
2540 sgs.group_capacity = min(sgs.group_capacity, 1UL);
2541
2542 if (local_group) {
2543 sds->this_load = sgs.avg_load;
2544 sds->this = group;
2545 sds->this_nr_running = sgs.sum_nr_running;
2546 sds->this_load_per_task = sgs.sum_weighted_load;
2547 } else if (sgs.avg_load > sds->max_load &&
2548 (sgs.sum_nr_running > sgs.group_capacity ||
2549 sgs.group_imb)) {
2550 sds->max_load = sgs.avg_load;
2551 sds->busiest = group;
2552 sds->busiest_nr_running = sgs.sum_nr_running;
2553 sds->busiest_group_capacity = sgs.group_capacity;
2554 sds->busiest_load_per_task = sgs.sum_weighted_load;
2555 sds->group_imb = sgs.group_imb;
2556 }
2557
2558 update_sd_power_savings_stats(group, sds, local_group, &sgs);
2559 group = group->next;
2560 } while (group != sd->groups);
2561}
2562
2563/**
2564 * fix_small_imbalance - Calculate the minor imbalance that exists
2565 * amongst the groups of a sched_domain, during
2566 * load balancing.
2567 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
2568 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
2569 * @imbalance: Variable to store the imbalance.
2570 */
2571static inline void fix_small_imbalance(struct sd_lb_stats *sds,
2572 int this_cpu, unsigned long *imbalance)
2573{
2574 unsigned long tmp, pwr_now = 0, pwr_move = 0;
2575 unsigned int imbn = 2;
2576 unsigned long scaled_busy_load_per_task;
2577
2578 if (sds->this_nr_running) {
2579 sds->this_load_per_task /= sds->this_nr_running;
2580 if (sds->busiest_load_per_task >
2581 sds->this_load_per_task)
2582 imbn = 1;
2583 } else
2584 sds->this_load_per_task =
2585 cpu_avg_load_per_task(this_cpu);
2586
2587 scaled_busy_load_per_task = sds->busiest_load_per_task
2588 * SCHED_LOAD_SCALE;
2589 scaled_busy_load_per_task /= sds->busiest->cpu_power;
2590
2591 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
2592 (scaled_busy_load_per_task * imbn)) {
2593 *imbalance = sds->busiest_load_per_task;
2594 return;
2595 }
2596
2597 /*
 2598	 * OK, we don't have enough imbalance to justify moving tasks;
 2599	 * however, we may be able to increase total CPU power used by
2600 * moving them.
2601 */
2602
2603 pwr_now += sds->busiest->cpu_power *
2604 min(sds->busiest_load_per_task, sds->max_load);
2605 pwr_now += sds->this->cpu_power *
2606 min(sds->this_load_per_task, sds->this_load);
2607 pwr_now /= SCHED_LOAD_SCALE;
2608
2609 /* Amount of load we'd subtract */
2610 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
2611 sds->busiest->cpu_power;
2612 if (sds->max_load > tmp)
2613 pwr_move += sds->busiest->cpu_power *
2614 min(sds->busiest_load_per_task, sds->max_load - tmp);
2615
2616 /* Amount of load we'd add */
2617 if (sds->max_load * sds->busiest->cpu_power <
2618 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
2619 tmp = (sds->max_load * sds->busiest->cpu_power) /
2620 sds->this->cpu_power;
2621 else
2622 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
2623 sds->this->cpu_power;
2624 pwr_move += sds->this->cpu_power *
2625 min(sds->this_load_per_task, sds->this_load + tmp);
2626 pwr_move /= SCHED_LOAD_SCALE;
2627
2628 /* Move if we gain throughput */
2629 if (pwr_move > pwr_now)
2630 *imbalance = sds->busiest_load_per_task;
2631}
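
fix_small_imbalance() compares the throughput ("CPU power used") before and after hypothetically moving one busiest-group task, and only reports an imbalance of one task's load if the move would be a net win. The sketch below reproduces the core before/after comparison for one assumed set of statistics; it omits the early-return and guard conditions of the real function.

#include <stdio.h>

#define SCHED_LOAD_SCALE	1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* Assumed per-group stats, in SCHED_LOAD_SCALE units. */
	unsigned long busiest_power = 1024, this_power = 1024;
	unsigned long busiest_load_per_task = 1024, this_load_per_task = 1024;
	unsigned long max_load = 2048, this_load = 0;

	/* Throughput before moving anything. */
	unsigned long pwr_now =
		(busiest_power * min_ul(busiest_load_per_task, max_load) +
		 this_power * min_ul(this_load_per_task, this_load)) /
		SCHED_LOAD_SCALE;

	/* Load removed from the busiest group by moving one task. */
	unsigned long tmp = busiest_load_per_task * SCHED_LOAD_SCALE /
			    busiest_power;
	unsigned long pwr_move =
		busiest_power * min_ul(busiest_load_per_task, max_load - tmp);

	/* Load added to the local group by that same task. */
	pwr_move += this_power *
		    min_ul(this_load_per_task, this_load + tmp);
	pwr_move /= SCHED_LOAD_SCALE;

	/* Move one task's worth of load only if throughput improves. */
	unsigned long imbalance = (pwr_move > pwr_now) ?
				  busiest_load_per_task : 0;

	printf("pwr_now=%lu pwr_move=%lu imbalance=%lu\n",
	       pwr_now, pwr_move, imbalance);
	/* pwr_now=1024 pwr_move=2048 imbalance=1024 */
	return 0;
}
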
2632
2633/**
2634 * calculate_imbalance - Calculate the amount of imbalance present within the
2635 * groups of a given sched_domain during load balance.
2636 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
2637 * @this_cpu: Cpu for which currently load balance is being performed.
2638 * @imbalance: The variable to store the imbalance.
2639 */
2640static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
2641 unsigned long *imbalance)
2642{
2643 unsigned long max_pull, load_above_capacity = ~0UL;
2644
2645 sds->busiest_load_per_task /= sds->busiest_nr_running;
2646 if (sds->group_imb) {
2647 sds->busiest_load_per_task =
2648 min(sds->busiest_load_per_task, sds->avg_load);
2649 }
2650
2651 /*
2652 * In the presence of smp nice balancing, certain scenarios can have
2653	 * max load less than avg load (as we skip the groups at or below
2654	 * their cpu_power, while calculating max_load)
2655 */
2656 if (sds->max_load < sds->avg_load) {
2657 *imbalance = 0;
2658 return fix_small_imbalance(sds, this_cpu, imbalance);
2659 }
2660
2661 if (!sds->group_imb) {
2662 /*
2663 * Don't want to pull so many tasks that a group would go idle.
2664 */
2665 load_above_capacity = (sds->busiest_nr_running -
2666 sds->busiest_group_capacity);
2667
2668 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE);
2669
2670 load_above_capacity /= sds->busiest->cpu_power;
2671 }
2672
2673 /*
2674 * We're trying to get all the cpus to the average_load, so we don't
2675 * want to push ourselves above the average load, nor do we wish to
2676 * reduce the max loaded cpu below the average load. At the same time,
2677 * we also don't want to reduce the group load below the group capacity
2678 * (so that we can implement power-savings policies etc). Thus we look
2679 * for the minimum possible imbalance.
2680 * Be careful of negative numbers as they'll appear as very large values
2681 * with unsigned longs.
2682 */
2683 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
2684
2685 /* How much load to actually move to equalise the imbalance */
2686 *imbalance = min(max_pull * sds->busiest->cpu_power,
2687 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
2688 / SCHED_LOAD_SCALE;
2689
2690 /*
2691 * if *imbalance is less than the average load per runnable task
2692	 * there is no guarantee that any tasks will be moved, so we'll have
2693 * a think about bumping its value to force at least one task to be
2694 * moved
2695 */
2696 if (*imbalance < sds->busiest_load_per_task)
2697 return fix_small_imbalance(sds, this_cpu, imbalance);
2698
2699}
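
Stripped of the struct plumbing, the formula above takes the smaller of two caps (how far the busiest group sits above the domain average, and how much load it carries beyond its capacity), refuses to lift this group above the average, and converts the result back from scaled to weighted load. A standalone sketch of that arithmetic, again with an assumed SCHED_LOAD_SCALE of 1024 and invented parameter names:

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

/*
 * Illustrative core of calculate_imbalance(): how much weighted load
 * should this_cpu try to pull from the busiest group?
 */
static unsigned long domain_imbalance(unsigned long max_load,
				      unsigned long avg_load,
				      unsigned long this_load,
				      unsigned long busiest_power,
				      unsigned long this_power,
				      unsigned long busiest_nr_running,
				      unsigned long busiest_capacity)
{
	unsigned long max_pull, load_above_capacity = ~0UL;

	if (busiest_nr_running > busiest_capacity) {
		/* Don't pull so many tasks that the busiest group goes idle. */
		load_above_capacity = busiest_nr_running - busiest_capacity;
		load_above_capacity *= SCHED_LOAD_SCALE * SCHED_LOAD_SCALE;
		load_above_capacity /= busiest_power;
	}

	/* Don't drag the busiest group below the domain average... */
	max_pull = min_ul(max_load - avg_load, load_above_capacity);

	/* ...and don't lift this group above it; convert to weighted load. */
	return min_ul(max_pull * busiest_power,
		      (avg_load - this_load) * this_power) / SCHED_LOAD_SCALE;
}

int main(void)
{
	/* Busiest group 50% above a domain average of 1024, this group 25% below. */
	printf("imbalance = %lu\n",
	       domain_imbalance(1536, 1024, 768, 1024, 1024, 3, 1));
	return 0;
}

With these sample inputs the sketch settles on pulling 256 units of weighted load, i.e. just enough to bring this group up to the domain average.
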
2700/******* find_busiest_group() helpers end here *********************/
2701
2702/**
2703 * find_busiest_group - Returns the busiest group within the sched_domain
2704 * if there is an imbalance. If there isn't an imbalance, and
2705 * the user has opted for power-savings, it returns a group whose
2706 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
2707 * such a group exists.
2708 *
2709 * Also calculates the amount of weighted load which should be moved
2710 * to restore balance.
2711 *
2712 * @sd: The sched_domain whose busiest group is to be returned.
2713 * @this_cpu: The cpu for which load balancing is currently being performed.
2714 * @imbalance: Variable which stores amount of weighted load which should
2715 * be moved to restore balance/put a group to idle.
2716 * @idle: The idle status of this_cpu.
2717 * @sd_idle: The idleness of sd
2718 * @cpus: The set of CPUs under consideration for load-balancing.
2719 * @balance: Pointer to a variable indicating if this_cpu
2720 * is the appropriate cpu to perform load balancing at this_level.
2721 *
2722 * Returns: - the busiest group if imbalance exists.
2723 * - If no imbalance and user has opted for power-savings balance,
2724 * return the least loaded group whose CPUs can be
2725 * put to idle by rebalancing its tasks onto our group.
2726 */
2727static struct sched_group *
2728find_busiest_group(struct sched_domain *sd, int this_cpu,
2729 unsigned long *imbalance, enum cpu_idle_type idle,
2730 int *sd_idle, const struct cpumask *cpus, int *balance)
2731{
2732 struct sd_lb_stats sds;
2733
2734 memset(&sds, 0, sizeof(sds));
2735
2736 /*
2737	 * Compute the various statistics relevant for load balancing at
2738 * this level.
2739 */
2740 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
2741 balance, &sds);
2742
2743 /* Cases where imbalance does not exist from POV of this_cpu */
2744 /* 1) this_cpu is not the appropriate cpu to perform load balancing
2745 * at this level.
2746 * 2) There is no busy sibling group to pull from.
2747 * 3) This group is the busiest group.
2748	 * 4) This group is busier than the average busyness at this
2749 * sched_domain.
2750 * 5) The imbalance is within the specified limit.
2751 */
2752 if (!(*balance))
2753 goto ret;
2754
2755 if (!sds.busiest || sds.busiest_nr_running == 0)
2756 goto out_balanced;
2757
2758 if (sds.this_load >= sds.max_load)
2759 goto out_balanced;
2760
2761 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
2762
2763 if (sds.this_load >= sds.avg_load)
2764 goto out_balanced;
2765
2766 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
2767 goto out_balanced;
2768
2769 /* Looks like there is an imbalance. Compute it */
2770 calculate_imbalance(&sds, this_cpu, imbalance);
2771 return sds.busiest;
2772
2773out_balanced:
2774 /*
2775 * There is no obvious imbalance. But check if we can do some balancing
2776 * to save power.
2777 */
2778 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
2779 return sds.busiest;
2780ret:
2781 *imbalance = 0;
2782 return NULL;
2783}
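
The chain of early exits above reduces to a few comparisons between this group's load, the busiest group's load, and the domain-wide average. The sketch below restates that decision as a pure function, with an assumed imbalance_pct of 125 (25% headroom) and invented names; the balance-pointer and power-savings paths are left out.

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

/*
 * Illustrative restatement of the out_balanced checks in
 * find_busiest_group(): should this_cpu try to pull load at all?
 */
static int should_pull(unsigned long this_load, unsigned long max_load,
		       unsigned long total_load, unsigned long total_pwr,
		       unsigned int imbalance_pct)
{
	unsigned long avg_load;

	if (!max_load)			/* no busy sibling group to pull from */
		return 0;
	if (this_load >= max_load)	/* we are the busiest group ourselves */
		return 0;

	avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
	if (this_load >= avg_load)	/* already at or above the average */
		return 0;

	/* The busiest group must exceed us by at least imbalance_pct percent. */
	return 100 * max_load > imbalance_pct * this_load;
}

int main(void)
{
	/* Two groups of cpu_power 1024 each, carrying loads 1536 and 512. */
	printf("pull? %d\n", should_pull(512, 1536, 2048, 2048, 125));
	return 0;
}

With one group at 1536 and the other at 512 on equal cpu_power, the busiest group exceeds the local one by far more than 25%, so the sketch reports that a pull is warranted.
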
2784
2785/*
2786 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2787 */
2788static struct rq *
2789find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
2790 unsigned long imbalance, const struct cpumask *cpus)
2791{
2792 struct rq *busiest = NULL, *rq;
2793 unsigned long max_load = 0;
2794 int i;
2795
2796 for_each_cpu(i, sched_group_cpus(group)) {
2797 unsigned long power = power_of(i);
2798 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
2799 unsigned long wl;
2800
2801 if (!cpumask_test_cpu(i, cpus))
2802 continue;
2803
2804 rq = cpu_rq(i);
2805 wl = weighted_cpuload(i);
2806
2807 /*
2808 * When comparing with imbalance, use weighted_cpuload()
2809 * which is not scaled with the cpu power.
2810 */
2811 if (capacity && rq->nr_running == 1 && wl > imbalance)
2812 continue;
2813
2814 /*
2815	 * For the load comparisons with the other cpus, consider
2816 * the weighted_cpuload() scaled with the cpu power, so that
2817 * the load can be moved away from the cpu that is potentially
2818 * running at a lower capacity.
2819 */
2820 wl = (wl * SCHED_LOAD_SCALE) / power;
2821
2822 if (wl > max_load) {
2823 max_load = wl;
2824 busiest = rq;
2825 }
2826 }
2827
2828 return busiest;
2829}
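
find_busiest_queue() uses raw weighted load when comparing against the requested imbalance, but power-scaled load when ranking cpus against each other, so a weaker cpu carrying less absolute load can still be the right one to unload. A self-contained sketch of that selection over an array of per-cpu samples; struct cpu_sample and its fields are invented stand-ins, not the kernel's rq:

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

struct cpu_sample {			/* illustrative, not the kernel's rq */
	unsigned long weighted_load;	/* raw weighted_cpuload()           */
	unsigned long power;		/* cpu_power                        */
	unsigned int nr_running;
};

static int pick_busiest(const struct cpu_sample *cpu, int n,
			unsigned long imbalance)
{
	unsigned long max_load = 0;
	int i, busiest = -1;

	for (i = 0; i < n; i++) {
		/* Rounded-to-closest number of "full" cpus worth of power. */
		unsigned long capacity = (cpu[i].power + SCHED_LOAD_SCALE / 2) /
					 SCHED_LOAD_SCALE;
		unsigned long wl = cpu[i].weighted_load;

		/* A lone task heavier than the imbalance is not worth moving. */
		if (capacity && cpu[i].nr_running == 1 && wl > imbalance)
			continue;

		/* Scale by power so a weaker cpu looks fuller at equal load. */
		wl = (wl * SCHED_LOAD_SCALE) / cpu[i].power;
		if (wl > max_load) {
			max_load = wl;
			busiest = i;
		}
	}
	return busiest;
}

int main(void)
{
	struct cpu_sample cpus[] = {
		{ 2048, 1024, 2 },	/* cpu0: more raw load, full power */
		{ 1536,  512, 3 },	/* cpu1: less raw load, half power */
	};

	printf("busiest cpu = %d\n", pick_busiest(cpus, 2, 1024));
	return 0;
}

In the example, cpu1 carries less raw load than cpu0 but has half the cpu_power, so after scaling it ranks as the busier of the two.
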
2830
2831/*
2832 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
2833 * it does not matter so long as it is large enough.
2834 */
2835#define MAX_PINNED_INTERVAL 512
2836
2837/* Working cpumask for load_balance and load_balance_newidle. */
2838static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
2839
2840static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle)
2841{
2842 if (idle == CPU_NEWLY_IDLE) {
2843 /*
2844 * The only task running in a non-idle cpu can be moved to this
2845	 * cpu in an attempt to completely free up the other CPU
2846 * package.
2847 *
2848 * The package power saving logic comes from
2849	 * find_busiest_group(). If there is no imbalance, then
2850 * f_b_g() will return NULL. However when sched_mc={1,2} then
2851 * f_b_g() will select a group from which a running task may be
2852 * pulled to this cpu in order to make the other package idle.
2853 * If there is no opportunity to make a package idle and if
2854	 * there is no imbalance, then f_b_g() will return NULL and no
2855 * action will be taken in load_balance_newidle().
2856 *
2857 * Under normal task pull operation due to imbalance, there
2858 * will be more than one task in the source run queue and
2859 * move_tasks() will succeed. ld_moved will be true and this
2860 * active balance code will not be triggered.
2861 */
2862 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2863 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2864 return 0;
2865
2866 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
2867 return 0;
2868 }
2869
2870 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
2871}
2872
2873/*
2874 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2875 * tasks if there is an imbalance.
2876 */
2877static int load_balance(int this_cpu, struct rq *this_rq,
2878 struct sched_domain *sd, enum cpu_idle_type idle,
2879 int *balance)
2880{
2881 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2882 struct sched_group *group;
2883 unsigned long imbalance;
2884 struct rq *busiest;
2885 unsigned long flags;
2886 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
2887
2888 cpumask_copy(cpus, cpu_active_mask);
2889
2890 /*
2891	 * When the power savings policy is enabled for the parent domain, an idle
2892	 * sibling can pick up load irrespective of busy siblings. In this case,
2893	 * let the state of the idle sibling percolate up as CPU_IDLE, instead of
2894	 * portraying it as CPU_NOT_IDLE.
2895 */
2896 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2897 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
2898 sd_idle = 1;
2899
2900 schedstat_inc(sd, lb_count[idle]);
2901
2902redo:
2903 update_shares(sd);
2904 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2905 cpus, balance);
2906
2907 if (*balance == 0)
2908 goto out_balanced;
2909
2910 if (!group) {
2911 schedstat_inc(sd, lb_nobusyg[idle]);
2912 goto out_balanced;
2913 }
2914
2915 busiest = find_busiest_queue(group, idle, imbalance, cpus);
2916 if (!busiest) {
2917 schedstat_inc(sd, lb_nobusyq[idle]);
2918 goto out_balanced;
2919 }
2920
2921 BUG_ON(busiest == this_rq);
2922
2923 schedstat_add(sd, lb_imbalance[idle], imbalance);
2924
2925 ld_moved = 0;
2926 if (busiest->nr_running > 1) {
2927 /*
2928 * Attempt to move tasks. If find_busiest_group has found
2929 * an imbalance but busiest->nr_running <= 1, the group is
2930 * still unbalanced. ld_moved simply stays zero, so it is
2931 * correctly treated as an imbalance.
2932 */
2933 local_irq_save(flags);
2934 double_rq_lock(this_rq, busiest);
2935 ld_moved = move_tasks(this_rq, this_cpu, busiest,
2936 imbalance, sd, idle, &all_pinned);
2937 double_rq_unlock(this_rq, busiest);
2938 local_irq_restore(flags);
2939
2940 /*
2941 * some other cpu did the load balance for us.
2942 */
2943 if (ld_moved && this_cpu != smp_processor_id())
2944 resched_cpu(this_cpu);
2945
2946 /* All tasks on this runqueue were pinned by CPU affinity */
2947 if (unlikely(all_pinned)) {
2948 cpumask_clear_cpu(cpu_of(busiest), cpus);
2949 if (!cpumask_empty(cpus))
2950 goto redo;
2951 goto out_balanced;
2952 }
2953 }
2954
2955 if (!ld_moved) {
2956 schedstat_inc(sd, lb_failed[idle]);
2957 sd->nr_balance_failed++;
2958
2959 if (need_active_balance(sd, sd_idle, idle)) {
2960 raw_spin_lock_irqsave(&busiest->lock, flags);
2961
2962			/* don't kick the migration_thread if the curr
2963 * task on busiest cpu can't be moved to this_cpu
2964 */
2965 if (!cpumask_test_cpu(this_cpu,
2966 &busiest->curr->cpus_allowed)) {
2967 raw_spin_unlock_irqrestore(&busiest->lock,
2968 flags);
2969 all_pinned = 1;
2970 goto out_one_pinned;
2971 }
2972
2973 if (!busiest->active_balance) {
2974 busiest->active_balance = 1;
2975 busiest->push_cpu = this_cpu;
2976 active_balance = 1;
2977 }
2978 raw_spin_unlock_irqrestore(&busiest->lock, flags);
2979 if (active_balance)
2980 wake_up_process(busiest->migration_thread);
2981
2982 /*
2983 * We've kicked active balancing, reset the failure
2984 * counter.
2985 */
2986 sd->nr_balance_failed = sd->cache_nice_tries+1;
2987 }
2988 } else
2989 sd->nr_balance_failed = 0;
2990
2991 if (likely(!active_balance)) {
2992 /* We were unbalanced, so reset the balancing interval */
2993 sd->balance_interval = sd->min_interval;
2994 } else {
2995 /*
2996 * If we've begun active balancing, start to back off. This
2997 * case may not be covered by the all_pinned logic if there
2998 * is only 1 task on the busy runqueue (because we don't call
2999 * move_tasks).
3000 */
3001 if (sd->balance_interval < sd->max_interval)
3002 sd->balance_interval *= 2;
3003 }
3004
3005 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3006 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3007 ld_moved = -1;
3008
3009 goto out;
3010
3011out_balanced:
3012 schedstat_inc(sd, lb_balanced[idle]);
3013
3014 sd->nr_balance_failed = 0;
3015
3016out_one_pinned:
3017 /* tune up the balancing interval */
3018 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
3019 (sd->balance_interval < sd->max_interval))
3020 sd->balance_interval *= 2;
3021
3022 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3023 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3024 ld_moved = -1;
3025 else
3026 ld_moved = 0;
3027out:
3028 if (ld_moved)
3029 update_shares(sd);
3030 return ld_moved;
3031}
3032
3033/*
3034 * idle_balance is called by schedule() if this_cpu is about to become
3035 * idle. Attempts to pull tasks from other CPUs.
3036 */
3037static void idle_balance(int this_cpu, struct rq *this_rq)
3038{
3039 struct sched_domain *sd;
3040 int pulled_task = 0;
3041 unsigned long next_balance = jiffies + HZ;
3042
3043 this_rq->idle_stamp = this_rq->clock;
3044
3045 if (this_rq->avg_idle < sysctl_sched_migration_cost)
3046 return;
3047
3048 /*
3049 * Drop the rq->lock, but keep IRQ/preempt disabled.
3050 */
3051 raw_spin_unlock(&this_rq->lock);
3052
3053 for_each_domain(this_cpu, sd) {
3054 unsigned long interval;
3055 int balance = 1;
3056
3057 if (!(sd->flags & SD_LOAD_BALANCE))
3058 continue;
3059
3060 if (sd->flags & SD_BALANCE_NEWIDLE) {
3061 /* If we've pulled tasks over stop searching: */
3062 pulled_task = load_balance(this_cpu, this_rq,
3063 sd, CPU_NEWLY_IDLE, &balance);
3064 }
3065
3066 interval = msecs_to_jiffies(sd->balance_interval);
3067 if (time_after(next_balance, sd->last_balance + interval))
3068 next_balance = sd->last_balance + interval;
3069 if (pulled_task) {
3070 this_rq->idle_stamp = 0;
3071 break;
3072 }
3073 }
3074
3075 raw_spin_lock(&this_rq->lock);
3076
3077 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
3078 /*
3079 * We are going idle. next_balance may be set based on
3080 * a busy processor. So reset next_balance.
3081 */
3082 this_rq->next_balance = next_balance;
3083 }
3084}
3085
3086/*
3087 * active_load_balance is run by migration threads. It pushes running tasks
3088 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
3089 * running on each physical CPU where possible, and avoids physical /
3090 * logical imbalances.
3091 *
3092 * Called with busiest_rq locked.
3093 */
3094static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3095{
3096 int target_cpu = busiest_rq->push_cpu;
3097 struct sched_domain *sd;
3098 struct rq *target_rq;
3099
3100 /* Is there any task to move? */
3101 if (busiest_rq->nr_running <= 1)
3102 return;
3103
3104 target_rq = cpu_rq(target_cpu);
3105
3106 /*
3107	 * This condition is "impossible"; if it occurs,
3108	 * we need to fix it. Originally reported by
3109 * Bjorn Helgaas on a 128-cpu setup.
3110 */
3111 BUG_ON(busiest_rq == target_rq);
3112
3113 /* move a task from busiest_rq to target_rq */
3114 double_lock_balance(busiest_rq, target_rq);
3115 update_rq_clock(busiest_rq);
3116 update_rq_clock(target_rq);
3117
3118 /* Search for an sd spanning us and the target CPU. */
3119 for_each_domain(target_cpu, sd) {
3120 if ((sd->flags & SD_LOAD_BALANCE) &&
3121 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
3122 break;
3123 }
3124
3125 if (likely(sd)) {
3126 schedstat_inc(sd, alb_count);
3127
3128 if (move_one_task(target_rq, target_cpu, busiest_rq,
3129 sd, CPU_IDLE))
3130 schedstat_inc(sd, alb_pushed);
3131 else
3132 schedstat_inc(sd, alb_failed);
3133 }
3134 double_unlock_balance(busiest_rq, target_rq);
3135}
3136
3137#ifdef CONFIG_NO_HZ
3138static struct {
3139 atomic_t load_balancer;
3140 cpumask_var_t cpu_mask;
3141 cpumask_var_t ilb_grp_nohz_mask;
3142} nohz ____cacheline_aligned = {
3143 .load_balancer = ATOMIC_INIT(-1),
3144};
3145
3146int get_nohz_load_balancer(void)
3147{
3148 return atomic_read(&nohz.load_balancer);
3149}
3150
3151#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3152/**
3153 * lowest_flag_domain - Return lowest sched_domain containing flag.
3154 * @cpu: The cpu whose lowest level of sched domain is to
3155 * be returned.
3156 * @flag: The flag to check for the lowest sched_domain
3157 * for the given cpu.
3158 *
3159 * Returns the lowest sched_domain of a cpu which contains the given flag.
3160 */
3161static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3162{
3163 struct sched_domain *sd;
3164
3165 for_each_domain(cpu, sd)
3166 if (sd && (sd->flags & flag))
3167 break;
3168
3169 return sd;
3170}
3171
3172/**
3173 * for_each_flag_domain - Iterates over sched_domains containing the flag.
3174 * @cpu: The cpu whose domains we're iterating over.
3175 * @sd: variable holding the value of the power_savings_sd
3176 * for cpu.
3177 * @flag: The flag to filter the sched_domains to be iterated.
3178 *
3179 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
3180 * set, starting from the lowest sched_domain to the highest.
3181 */
3182#define for_each_flag_domain(cpu, sd, flag) \
3183 for (sd = lowest_flag_domain(cpu, flag); \
3184 (sd && (sd->flags & flag)); sd = sd->parent)
3185
3186/**
3187 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
3188 * @ilb_group: group to be checked for semi-idleness
3189 *
3190 * Returns: 1 if the group is semi-idle. 0 otherwise.
3191 *
3192 * We define a sched_group to be semi-idle if it has at least one idle CPU
3193 * and at least one non-idle CPU. This helper function checks if the given
3194 * sched_group is semi-idle or not.
3195 */
3196static inline int is_semi_idle_group(struct sched_group *ilb_group)
3197{
3198 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
3199 sched_group_cpus(ilb_group));
3200
3201 /*
3202	 * A sched_group is semi-idle when it has at least one busy cpu
3203	 * and at least one idle cpu.
3204 */
3205 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
3206 return 0;
3207
3208 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
3209 return 0;
3210
3211 return 1;
3212}
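
The check above is pure mask arithmetic: intersect the group's cpus with the set of tick-stopped (nohz) cpus and require the result to be neither empty nor the whole group. Here is the same test with plain bitmasks in a userspace sketch, with an unsigned long standing in for a cpumask:

#include <stdio.h>

/*
 * Illustrative bitmask version of is_semi_idle_group(): group_mask holds
 * the cpus of the sched_group, nohz_mask the cpus whose ticks are stopped.
 */
static int is_semi_idle(unsigned long group_mask, unsigned long nohz_mask)
{
	unsigned long idle_in_group = group_mask & nohz_mask;

	if (!idle_in_group)			/* no idle cpu in the group */
		return 0;
	if (idle_in_group == group_mask)	/* every cpu in it is idle  */
		return 0;
	return 1;				/* at least one of each     */
}

int main(void)
{
	printf("%d %d %d\n",
	       is_semi_idle(0x3, 0x0),		/* all busy    -> 0 */
	       is_semi_idle(0x3, 0x3),		/* all idle    -> 0 */
	       is_semi_idle(0x3, 0x2));		/* one of each -> 1 */
	return 0;
}
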
3213/**
3214 * find_new_ilb - Finds the optimum idle load balancer for nomination.
3215 * @cpu: The cpu which is nominating a new idle_load_balancer.
3216 *
3217 * Returns: The id of the idle load balancer if it exists;
3218 * else, returns >= nr_cpu_ids.
3219 *
3220 * This algorithm picks the idle load balancer such that it belongs to a
3221 * semi-idle powersavings sched_domain. The idea is to try and avoid
3222 * completely idle packages/cores just for the purpose of idle load balancing
3223 * when there are other idle cpus which are better suited for that job.
3224 */
3225static int find_new_ilb(int cpu)
3226{
3227 struct sched_domain *sd;
3228 struct sched_group *ilb_group;
3229
3230 /*
3231 * Have idle load balancer selection from semi-idle packages only
3232 * when power-aware load balancing is enabled
3233 */
3234 if (!(sched_smt_power_savings || sched_mc_power_savings))
3235 goto out_done;
3236
3237 /*
3238 * Optimize for the case when we have no idle CPUs or only one
3239	 * idle CPU. Don't walk the sched_domain hierarchy in such cases.
3240 */
3241 if (cpumask_weight(nohz.cpu_mask) < 2)
3242 goto out_done;
3243
3244 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
3245 ilb_group = sd->groups;
3246
3247 do {
3248 if (is_semi_idle_group(ilb_group))
3249 return cpumask_first(nohz.ilb_grp_nohz_mask);
3250
3251 ilb_group = ilb_group->next;
3252
3253 } while (ilb_group != sd->groups);
3254 }
3255
3256out_done:
3257 return cpumask_first(nohz.cpu_mask);
3258}
3259#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
3260static inline int find_new_ilb(int call_cpu)
3261{
3262 return cpumask_first(nohz.cpu_mask);
3263}
3264#endif
3265
3266/*
3267 * This routine will try to nominate the ilb (idle load balancing)
3268 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
3269 * load balancing on behalf of all those cpus. If all the cpus in the system
3270 * go into this tickless mode, then there will be no ilb owner (as there is
3271 * no need for one) and all the cpus will sleep till the next wakeup event
3272 * arrives...
3273 *
3274 * For the ilb owner, the tick is not stopped, and this tick will be used
3275 * for idle load balancing. The ilb owner will still be part of
3276 * nohz.cpu_mask.
3277 *
3278 * While stopping the tick, this cpu will become the ilb owner if there
3279 * is no other owner, and will remain the owner till this cpu becomes busy
3280 * or all cpus in the system stop their ticks, at which point
3281 * there is no need for an ilb owner.
3282 *
3283 * When the ilb owner becomes busy, it nominates another owner during the
3284 * next busy scheduler_tick().
3285 */
3286int select_nohz_load_balancer(int stop_tick)
3287{
3288 int cpu = smp_processor_id();
3289
3290 if (stop_tick) {
3291 cpu_rq(cpu)->in_nohz_recently = 1;
3292
3293 if (!cpu_active(cpu)) {
3294 if (atomic_read(&nohz.load_balancer) != cpu)
3295 return 0;
3296
3297 /*
3298 * If we are going offline and still the leader,
3299 * give up!
3300 */
3301 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3302 BUG();
3303
3304 return 0;
3305 }
3306
3307 cpumask_set_cpu(cpu, nohz.cpu_mask);
3308
3309 /* time for ilb owner also to sleep */
3310 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
3311 if (atomic_read(&nohz.load_balancer) == cpu)
3312 atomic_set(&nohz.load_balancer, -1);
3313 return 0;
3314 }
3315
3316 if (atomic_read(&nohz.load_balancer) == -1) {
3317 /* make me the ilb owner */
3318 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
3319 return 1;
3320 } else if (atomic_read(&nohz.load_balancer) == cpu) {
3321 int new_ilb;
3322
3323 if (!(sched_smt_power_savings ||
3324 sched_mc_power_savings))
3325 return 1;
3326 /*
3327 * Check to see if there is a more power-efficient
3328 * ilb.
3329 */
3330 new_ilb = find_new_ilb(cpu);
3331 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
3332 atomic_set(&nohz.load_balancer, -1);
3333 resched_cpu(new_ilb);
3334 return 0;
3335 }
3336 return 1;
3337 }
3338 } else {
3339 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
3340 return 0;
3341
3342 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3343
3344 if (atomic_read(&nohz.load_balancer) == cpu)
3345 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3346 BUG();
3347 }
3348 return 0;
3349}
3350#endif
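
Underneath the policy above, ownership of the idle-load-balancer role is just an atomic compare-and-swap on a single "owner cpu" word, with -1 meaning no owner. A minimal userspace sketch of that claim/release protocol using C11 atomics; the function names are invented and all the nohz bookkeeping is omitted:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int ilb_owner = ATOMIC_VAR_INIT(-1);	/* -1: no owner */

/* A cpu stopping its tick tries to become the ilb owner. */
static int claim_ilb(int cpu)
{
	int expected = -1;

	return atomic_compare_exchange_strong(&ilb_owner, &expected, cpu);
}

/* The owner gives the role up when it becomes busy or goes fully idle. */
static void release_ilb(int cpu)
{
	int expected = cpu;

	/* Only the current owner may clear the slot. */
	atomic_compare_exchange_strong(&ilb_owner, &expected, -1);
}

int main(void)
{
	printf("cpu1 claims: %d\n", claim_ilb(1));	/* succeeds         */
	printf("cpu2 claims: %d\n", claim_ilb(2));	/* fails, cpu1 owns */
	release_ilb(1);
	printf("cpu2 claims: %d\n", claim_ilb(2));	/* succeeds now     */
	return 0;
}

claim_ilb() succeeds only when the slot is free, and release_ilb() only clears the slot if the caller still owns it, which is the same property the cmpxchg/BUG() pairing above relies on.
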
3351
3352static DEFINE_SPINLOCK(balancing);
3353
3354/*
3355 * It checks each scheduling domain to see if it is due to be balanced,
3356 * and initiates a balancing operation if so.
3357 *
3358 * Balancing parameters are set up in arch_init_sched_domains.
3359 */
3360static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3361{
3362 int balance = 1;
3363 struct rq *rq = cpu_rq(cpu);
3364 unsigned long interval;
3365 struct sched_domain *sd;
3366 /* Earliest time when we have to do rebalance again */
3367 unsigned long next_balance = jiffies + 60*HZ;
3368 int update_next_balance = 0;
3369 int need_serialize;
3370
3371 for_each_domain(cpu, sd) {
3372 if (!(sd->flags & SD_LOAD_BALANCE))
3373 continue;
3374
3375 interval = sd->balance_interval;
3376 if (idle != CPU_IDLE)
3377 interval *= sd->busy_factor;
3378
3379 /* scale ms to jiffies */
3380 interval = msecs_to_jiffies(interval);
3381 if (unlikely(!interval))
3382 interval = 1;
3383 if (interval > HZ*NR_CPUS/10)
3384 interval = HZ*NR_CPUS/10;
3385
3386 need_serialize = sd->flags & SD_SERIALIZE;
3387
3388 if (need_serialize) {
3389 if (!spin_trylock(&balancing))
3390 goto out;
3391 }
3392
3393 if (time_after_eq(jiffies, sd->last_balance + interval)) {
3394 if (load_balance(cpu, rq, sd, idle, &balance)) {
3395 /*
3396 * We've pulled tasks over so either we're no
3397 * longer idle, or one of our SMT siblings is
3398 * not idle.
3399 */
3400 idle = CPU_NOT_IDLE;
3401 }
3402 sd->last_balance = jiffies;
3403 }
3404 if (need_serialize)
3405 spin_unlock(&balancing);
3406out:
3407 if (time_after(next_balance, sd->last_balance + interval)) {
3408 next_balance = sd->last_balance + interval;
3409 update_next_balance = 1;
3410 }
3411
3412 /*
3413 * Stop the load balance at this level. There is another
3414 * CPU in our sched group which is doing load balancing more
3415 * actively.
3416 */
3417 if (!balance)
3418 break;
3419 }
3420
3421 /*
3422 * next_balance will be updated only when there is a need.
3423	 * When the cpu is attached to the null domain, for example, it will
3424	 * not be updated.
3425 */
3426 if (likely(update_next_balance))
3427 rq->next_balance = next_balance;
3428}
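
The per-domain interval handling above is worth separating out: the base balance_interval (in ms) is stretched by busy_factor when the cpu is not idle, converted to jiffies, and clamped. A small sketch of that computation, assuming HZ is 1000 so that one jiffy equals one millisecond, and an assumed NR_CPUS of 8 for the HZ*NR_CPUS/10 clamp:

#include <stdio.h>

#define HZ	1000			/* assumed tick rate for this sketch */
#define NR_CPUS	8			/* assumed cpu count for the clamp   */

static unsigned long msecs_to_jiffies(unsigned long ms)
{
	return ms * HZ / 1000;		/* exact because HZ == 1000 here */
}

/*
 * Illustrative version of the interval computation in rebalance_domains():
 * how many jiffies until this sched_domain is due for balancing again?
 */
static unsigned long balance_interval_jiffies(unsigned long interval_ms,
					      unsigned int busy_factor,
					      int cpu_is_idle)
{
	unsigned long interval = interval_ms;

	if (!cpu_is_idle)
		interval *= busy_factor;	/* balance less often when busy */

	interval = msecs_to_jiffies(interval);
	if (!interval)
		interval = 1;
	if (interval > HZ * NR_CPUS / 10)
		interval = HZ * NR_CPUS / 10;

	return interval;
}

int main(void)
{
	printf("idle: %lu jiffies, busy: %lu jiffies\n",
	       balance_interval_jiffies(64, 32, 1),
	       balance_interval_jiffies(64, 32, 0));
	return 0;
}

With these numbers an idle cpu re-checks a 64ms domain every 64 jiffies, while a busy cpu would stretch that to 2048ms and then hit the clamp at 800 jiffies.
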
3429
3430/*
3431 * run_rebalance_domains is triggered when needed from the scheduler tick.
3432 * In CONFIG_NO_HZ case, the idle load balance owner will do the
3433 * rebalancing for all the cpus for whom scheduler ticks are stopped.
3434 */
3435static void run_rebalance_domains(struct softirq_action *h)
3436{
3437 int this_cpu = smp_processor_id();
3438 struct rq *this_rq = cpu_rq(this_cpu);
3439 enum cpu_idle_type idle = this_rq->idle_at_tick ?
3440 CPU_IDLE : CPU_NOT_IDLE;
3441
3442 rebalance_domains(this_cpu, idle);
3443
3444#ifdef CONFIG_NO_HZ
3445 /*
3446 * If this cpu is the owner for idle load balancing, then do the
3447 * balancing on behalf of the other idle cpus whose ticks are
3448 * stopped.
3449 */
3450 if (this_rq->idle_at_tick &&
3451 atomic_read(&nohz.load_balancer) == this_cpu) {
3452 struct rq *rq;
3453 int balance_cpu;
3454
3455 for_each_cpu(balance_cpu, nohz.cpu_mask) {
3456 if (balance_cpu == this_cpu)
3457 continue;
3458
3459 /*
3460 * If this cpu gets work to do, stop the load balancing
3461 * work being done for other cpus. Next load
3462 * balancing owner will pick it up.
3463 */
3464 if (need_resched())
3465 break;
3466
3467 rebalance_domains(balance_cpu, CPU_IDLE);
3468
3469 rq = cpu_rq(balance_cpu);
3470 if (time_after(this_rq->next_balance, rq->next_balance))
3471 this_rq->next_balance = rq->next_balance;
3472 }
3473 }
3474#endif
3475}
3476
3477static inline int on_null_domain(int cpu)
3478{
3479 return !rcu_dereference(cpu_rq(cpu)->sd);
3480}
3481
3482/*
3483 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
3484 *
3485 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
3486 * idle load balancing owner or decide to stop the periodic load balancing,
3487 * if the whole system is idle.
3488 */
3489static inline void trigger_load_balance(struct rq *rq, int cpu)
3490{
3491#ifdef CONFIG_NO_HZ
3492 /*
3493 * If we were in the nohz mode recently and busy at the current
3494 * scheduler tick, then check if we need to nominate new idle
3495 * load balancer.
3496 */
3497 if (rq->in_nohz_recently && !rq->idle_at_tick) {
3498 rq->in_nohz_recently = 0;
3499
3500 if (atomic_read(&nohz.load_balancer) == cpu) {
3501 cpumask_clear_cpu(cpu, nohz.cpu_mask);
3502 atomic_set(&nohz.load_balancer, -1);
3503 }
3504
3505 if (atomic_read(&nohz.load_balancer) == -1) {
3506 int ilb = find_new_ilb(cpu);
3507
3508 if (ilb < nr_cpu_ids)
3509 resched_cpu(ilb);
3510 }
3511 }
3512
3513 /*
3514 * If this cpu is idle and doing idle load balancing for all the
3515 * cpus with ticks stopped, is it time for that to stop?
3516 */
3517 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
3518 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
3519 resched_cpu(cpu);
3520 return;
3521 }
3522
3523 /*
3524 * If this cpu is idle and the idle load balancing is done by
3525	 * someone else, then there is no need to raise the SCHED_SOFTIRQ.
3526 */
3527 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
3528 cpumask_test_cpu(cpu, nohz.cpu_mask))
3529 return;
3530#endif
3531 /* Don't need to rebalance while attached to NULL domain */
3532 if (time_after_eq(jiffies, rq->next_balance) &&
3533 likely(!on_null_domain(cpu)))
3534 raise_softirq(SCHED_SOFTIRQ);
3535}
1954 3536
1955static void rq_online_fair(struct rq *rq) 3537static void rq_online_fair(struct rq *rq)
1956{ 3538{
@@ -1962,6 +3544,15 @@ static void rq_offline_fair(struct rq *rq)
1962 update_sysctl(); 3544 update_sysctl();
1963} 3545}
1964 3546
3547#else /* CONFIG_SMP */
3548
3549/*
3550 * on UP we do not need to balance between CPUs:
3551 */
3552static inline void idle_balance(int cpu, struct rq *rq)
3553{
3554}
3555
1965#endif /* CONFIG_SMP */ 3556#endif /* CONFIG_SMP */
1966 3557
1967/* 3558/*
@@ -2076,7 +3667,7 @@ static void moved_group_fair(struct task_struct *p, int on_rq)
2076} 3667}
2077#endif 3668#endif
2078 3669
2079unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) 3670static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
2080{ 3671{
2081 struct sched_entity *se = &task->se; 3672 struct sched_entity *se = &task->se;
2082 unsigned int rr_interval = 0; 3673 unsigned int rr_interval = 0;
@@ -2108,8 +3699,6 @@ static const struct sched_class fair_sched_class = {
2108#ifdef CONFIG_SMP 3699#ifdef CONFIG_SMP
2109 .select_task_rq = select_task_rq_fair, 3700 .select_task_rq = select_task_rq_fair,
2110 3701
2111 .load_balance = load_balance_fair,
2112 .move_one_task = move_one_task_fair,
2113 .rq_online = rq_online_fair, 3702 .rq_online = rq_online_fair,
2114 .rq_offline = rq_offline_fair, 3703 .rq_offline = rq_offline_fair,
2115 3704
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 5f93b570d383..a8a6d8a50947 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -44,24 +44,6 @@ static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
44{ 44{
45} 45}
46 46
47#ifdef CONFIG_SMP
48static unsigned long
49load_balance_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
50 unsigned long max_load_move,
51 struct sched_domain *sd, enum cpu_idle_type idle,
52 int *all_pinned, int *this_best_prio)
53{
54 return 0;
55}
56
57static int
58move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
59 struct sched_domain *sd, enum cpu_idle_type idle)
60{
61 return 0;
62}
63#endif
64
65static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) 47static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
66{ 48{
67} 49}
@@ -97,7 +79,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
97 check_preempt_curr(rq, p, 0); 79 check_preempt_curr(rq, p, 0);
98} 80}
99 81
100unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) 82static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
101{ 83{
102 return 0; 84 return 0;
103} 85}
@@ -119,9 +101,6 @@ static const struct sched_class idle_sched_class = {
119 101
120#ifdef CONFIG_SMP 102#ifdef CONFIG_SMP
121 .select_task_rq = select_task_rq_idle, 103 .select_task_rq = select_task_rq_idle,
122
123 .load_balance = load_balance_idle,
124 .move_one_task = move_one_task_idle,
125#endif 104#endif
126 105
127 .set_curr_task = set_curr_task_idle, 106 .set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index f48328ac216f..bf3e38fdbe6d 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -194,17 +194,20 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
194 return rt_se->my_q; 194 return rt_se->my_q;
195} 195}
196 196
197static void enqueue_rt_entity(struct sched_rt_entity *rt_se); 197static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
198static void dequeue_rt_entity(struct sched_rt_entity *rt_se); 198static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
199 199
200static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 200static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
201{ 201{
202 int this_cpu = smp_processor_id();
202 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 203 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
203 struct sched_rt_entity *rt_se = rt_rq->rt_se; 204 struct sched_rt_entity *rt_se;
205
206 rt_se = rt_rq->tg->rt_se[this_cpu];
204 207
205 if (rt_rq->rt_nr_running) { 208 if (rt_rq->rt_nr_running) {
206 if (rt_se && !on_rt_rq(rt_se)) 209 if (rt_se && !on_rt_rq(rt_se))
207 enqueue_rt_entity(rt_se); 210 enqueue_rt_entity(rt_se, false);
208 if (rt_rq->highest_prio.curr < curr->prio) 211 if (rt_rq->highest_prio.curr < curr->prio)
209 resched_task(curr); 212 resched_task(curr);
210 } 213 }
@@ -212,7 +215,10 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
212 215
213static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 216static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
214{ 217{
215 struct sched_rt_entity *rt_se = rt_rq->rt_se; 218 int this_cpu = smp_processor_id();
219 struct sched_rt_entity *rt_se;
220
221 rt_se = rt_rq->tg->rt_se[this_cpu];
216 222
217 if (rt_se && on_rt_rq(rt_se)) 223 if (rt_se && on_rt_rq(rt_se))
218 dequeue_rt_entity(rt_se); 224 dequeue_rt_entity(rt_se);
@@ -803,7 +809,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
803 dec_rt_group(rt_se, rt_rq); 809 dec_rt_group(rt_se, rt_rq);
804} 810}
805 811
806static void __enqueue_rt_entity(struct sched_rt_entity *rt_se) 812static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
807{ 813{
808 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 814 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
809 struct rt_prio_array *array = &rt_rq->active; 815 struct rt_prio_array *array = &rt_rq->active;
@@ -819,7 +825,10 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
819 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 825 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
820 return; 826 return;
821 827
822 list_add_tail(&rt_se->run_list, queue); 828 if (head)
829 list_add(&rt_se->run_list, queue);
830 else
831 list_add_tail(&rt_se->run_list, queue);
823 __set_bit(rt_se_prio(rt_se), array->bitmap); 832 __set_bit(rt_se_prio(rt_se), array->bitmap);
824 833
825 inc_rt_tasks(rt_se, rt_rq); 834 inc_rt_tasks(rt_se, rt_rq);
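
The new head flag above only changes which end of the per-priority queue the entity lands on: head-queued entities run next, tail-queued ones wait their turn. A self-contained sketch of that head-vs-tail behaviour with a minimal circular list; struct node is a stand-in for the kernel's struct list_head:

#include <stdio.h>

struct node {				/* stand-in for struct list_head */
	struct node *prev, *next;
	int id;
};

static void list_init(struct node *head)
{
	head->prev = head->next = head;
}

static void list_add(struct node *n, struct node *head)		/* to the head */
{
	n->next = head->next;
	n->prev = head;
	head->next->prev = n;
	head->next = n;
}

static void list_add_tail(struct node *n, struct node *head)	/* to the tail */
{
	n->prev = head->prev;
	n->next = head;
	head->prev->next = n;
	head->prev = n;
}

int main(void)
{
	struct node queue, a = { .id = 1 }, b = { .id = 2 }, c = { .id = 3 };
	struct node *p;

	list_init(&queue);
	list_add_tail(&a, &queue);	/* normal enqueue              */
	list_add_tail(&b, &queue);	/* normal enqueue              */
	list_add(&c, &queue);		/* head == true: goes in front */

	for (p = queue.next; p != &queue; p = p->next)
		printf("%d ", p->id);	/* prints: 3 1 2 */
	printf("\n");
	return 0;
}

Traversed from the head, the list reads 3 1 2: the head-queued entity is picked first, which is the ordering the new head argument is there to provide.
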
@@ -856,11 +865,11 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
856 } 865 }
857} 866}
858 867
859static void enqueue_rt_entity(struct sched_rt_entity *rt_se) 868static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
860{ 869{
861 dequeue_rt_stack(rt_se); 870 dequeue_rt_stack(rt_se);
862 for_each_sched_rt_entity(rt_se) 871 for_each_sched_rt_entity(rt_se)
863 __enqueue_rt_entity(rt_se); 872 __enqueue_rt_entity(rt_se, head);
864} 873}
865 874
866static void dequeue_rt_entity(struct sched_rt_entity *rt_se) 875static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
@@ -871,21 +880,22 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
871 struct rt_rq *rt_rq = group_rt_rq(rt_se); 880 struct rt_rq *rt_rq = group_rt_rq(rt_se);
872 881
873 if (rt_rq && rt_rq->rt_nr_running) 882 if (rt_rq && rt_rq->rt_nr_running)
874 __enqueue_rt_entity(rt_se); 883 __enqueue_rt_entity(rt_se, false);
875 } 884 }
876} 885}
877 886
878/* 887/*
879 * Adding/removing a task to/from a priority array: 888 * Adding/removing a task to/from a priority array:
880 */ 889 */
881static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) 890static void
891enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup, bool head)
882{ 892{
883 struct sched_rt_entity *rt_se = &p->rt; 893 struct sched_rt_entity *rt_se = &p->rt;
884 894
885 if (wakeup) 895 if (wakeup)
886 rt_se->timeout = 0; 896 rt_se->timeout = 0;
887 897
888 enqueue_rt_entity(rt_se); 898 enqueue_rt_entity(rt_se, head);
889 899
890 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 900 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
891 enqueue_pushable_task(rq, p); 901 enqueue_pushable_task(rq, p);
@@ -1481,24 +1491,6 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1481 push_rt_tasks(rq); 1491 push_rt_tasks(rq);
1482} 1492}
1483 1493
1484static unsigned long
1485load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1486 unsigned long max_load_move,
1487 struct sched_domain *sd, enum cpu_idle_type idle,
1488 int *all_pinned, int *this_best_prio)
1489{
1490 /* don't touch RT tasks */
1491 return 0;
1492}
1493
1494static int
1495move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
1496 struct sched_domain *sd, enum cpu_idle_type idle)
1497{
1498 /* don't touch RT tasks */
1499 return 0;
1500}
1501
1502static void set_cpus_allowed_rt(struct task_struct *p, 1494static void set_cpus_allowed_rt(struct task_struct *p,
1503 const struct cpumask *new_mask) 1495 const struct cpumask *new_mask)
1504{ 1496{
@@ -1721,7 +1713,7 @@ static void set_curr_task_rt(struct rq *rq)
1721 dequeue_pushable_task(rq, p); 1713 dequeue_pushable_task(rq, p);
1722} 1714}
1723 1715
1724unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) 1716static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1725{ 1717{
1726 /* 1718 /*
1727 * Time slice is 0 for SCHED_FIFO tasks 1719 * Time slice is 0 for SCHED_FIFO tasks
@@ -1746,8 +1738,6 @@ static const struct sched_class rt_sched_class = {
1746#ifdef CONFIG_SMP 1738#ifdef CONFIG_SMP
1747 .select_task_rq = select_task_rq_rt, 1739 .select_task_rq = select_task_rq_rt,
1748 1740
1749 .load_balance = load_balance_rt,
1750 .move_one_task = move_one_task_rt,
1751 .set_cpus_allowed = set_cpus_allowed_rt, 1741 .set_cpus_allowed = set_cpus_allowed_rt,
1752 .rq_online = rq_online_rt, 1742 .rq_online = rq_online_rt,
1753 .rq_offline = rq_offline_rt, 1743 .rq_offline = rq_offline_rt,
diff --git a/kernel/sys.c b/kernel/sys.c
index 18bde979f346..877fe4f8e05e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -571,11 +571,6 @@ static int set_user(struct cred *new)
571 if (!new_user) 571 if (!new_user)
572 return -EAGAIN; 572 return -EAGAIN;
573 573
574 if (!task_can_switch_user(new_user, current)) {
575 free_uid(new_user);
576 return -EINVAL;
577 }
578
579 if (atomic_read(&new_user->processes) >= 574 if (atomic_read(&new_user->processes) >=
580 current->signal->rlim[RLIMIT_NPROC].rlim_cur && 575 current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
581 new_user != INIT_USER) { 576 new_user != INIT_USER) {
diff --git a/kernel/user.c b/kernel/user.c
index 46d0165ca70c..766467b3bcb7 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -56,9 +56,6 @@ struct user_struct root_user = {
56 .sigpending = ATOMIC_INIT(0), 56 .sigpending = ATOMIC_INIT(0),
57 .locked_shm = 0, 57 .locked_shm = 0,
58 .user_ns = &init_user_ns, 58 .user_ns = &init_user_ns,
59#ifdef CONFIG_USER_SCHED
60 .tg = &init_task_group,
61#endif
62}; 59};
63 60
64/* 61/*
@@ -75,268 +72,6 @@ static void uid_hash_remove(struct user_struct *up)
75 put_user_ns(up->user_ns); 72 put_user_ns(up->user_ns);
76} 73}
77 74
78#ifdef CONFIG_USER_SCHED
79
80static void sched_destroy_user(struct user_struct *up)
81{
82 sched_destroy_group(up->tg);
83}
84
85static int sched_create_user(struct user_struct *up)
86{
87 int rc = 0;
88
89 up->tg = sched_create_group(&root_task_group);
90 if (IS_ERR(up->tg))
91 rc = -ENOMEM;
92
93 set_tg_uid(up);
94
95 return rc;
96}
97
98#else /* CONFIG_USER_SCHED */
99
100static void sched_destroy_user(struct user_struct *up) { }
101static int sched_create_user(struct user_struct *up) { return 0; }
102
103#endif /* CONFIG_USER_SCHED */
104
105#if defined(CONFIG_USER_SCHED) && defined(CONFIG_SYSFS)
106
107static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
108{
109 struct user_struct *user;
110 struct hlist_node *h;
111
112 hlist_for_each_entry(user, h, hashent, uidhash_node) {
113 if (user->uid == uid) {
114 /* possibly resurrect an "almost deleted" object */
115 if (atomic_inc_return(&user->__count) == 1)
116 cancel_delayed_work(&user->work);
117 return user;
118 }
119 }
120
121 return NULL;
122}
123
124static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
125static DEFINE_MUTEX(uids_mutex);
126
127static inline void uids_mutex_lock(void)
128{
129 mutex_lock(&uids_mutex);
130}
131
132static inline void uids_mutex_unlock(void)
133{
134 mutex_unlock(&uids_mutex);
135}
136
137/* uid directory attributes */
138#ifdef CONFIG_FAIR_GROUP_SCHED
139static ssize_t cpu_shares_show(struct kobject *kobj,
140 struct kobj_attribute *attr,
141 char *buf)
142{
143 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
144
145 return sprintf(buf, "%lu\n", sched_group_shares(up->tg));
146}
147
148static ssize_t cpu_shares_store(struct kobject *kobj,
149 struct kobj_attribute *attr,
150 const char *buf, size_t size)
151{
152 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
153 unsigned long shares;
154 int rc;
155
156 sscanf(buf, "%lu", &shares);
157
158 rc = sched_group_set_shares(up->tg, shares);
159
160 return (rc ? rc : size);
161}
162
163static struct kobj_attribute cpu_share_attr =
164 __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store);
165#endif
166
167#ifdef CONFIG_RT_GROUP_SCHED
168static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
169 struct kobj_attribute *attr,
170 char *buf)
171{
172 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
173
174 return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
175}
176
177static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
178 struct kobj_attribute *attr,
179 const char *buf, size_t size)
180{
181 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
182 unsigned long rt_runtime;
183 int rc;
184
185 sscanf(buf, "%ld", &rt_runtime);
186
187 rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
188
189 return (rc ? rc : size);
190}
191
192static struct kobj_attribute cpu_rt_runtime_attr =
193 __ATTR(cpu_rt_runtime, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store);
194
195static ssize_t cpu_rt_period_show(struct kobject *kobj,
196 struct kobj_attribute *attr,
197 char *buf)
198{
199 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
200
201 return sprintf(buf, "%lu\n", sched_group_rt_period(up->tg));
202}
203
204static ssize_t cpu_rt_period_store(struct kobject *kobj,
205 struct kobj_attribute *attr,
206 const char *buf, size_t size)
207{
208 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
209 unsigned long rt_period;
210 int rc;
211
212 sscanf(buf, "%lu", &rt_period);
213
214 rc = sched_group_set_rt_period(up->tg, rt_period);
215
216 return (rc ? rc : size);
217}
218
219static struct kobj_attribute cpu_rt_period_attr =
220 __ATTR(cpu_rt_period, 0644, cpu_rt_period_show, cpu_rt_period_store);
221#endif
222
223/* default attributes per uid directory */
224static struct attribute *uids_attributes[] = {
225#ifdef CONFIG_FAIR_GROUP_SCHED
226 &cpu_share_attr.attr,
227#endif
228#ifdef CONFIG_RT_GROUP_SCHED
229 &cpu_rt_runtime_attr.attr,
230 &cpu_rt_period_attr.attr,
231#endif
232 NULL
233};
234
235/* the lifetime of user_struct is not managed by the core (now) */
236static void uids_release(struct kobject *kobj)
237{
238 return;
239}
240
241static struct kobj_type uids_ktype = {
242 .sysfs_ops = &kobj_sysfs_ops,
243 .default_attrs = uids_attributes,
244 .release = uids_release,
245};
246
247/*
248 * Create /sys/kernel/uids/<uid>/cpu_share file for this user
249 * We do not create this file for users in a user namespace (until
250 * sysfs tagging is implemented).
251 *
252 * See Documentation/scheduler/sched-design-CFS.txt for ramifications.
253 */
254static int uids_user_create(struct user_struct *up)
255{
256 struct kobject *kobj = &up->kobj;
257 int error;
258
259 memset(kobj, 0, sizeof(struct kobject));
260 if (up->user_ns != &init_user_ns)
261 return 0;
262 kobj->kset = uids_kset;
263 error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid);
264 if (error) {
265 kobject_put(kobj);
266 goto done;
267 }
268
269 kobject_uevent(kobj, KOBJ_ADD);
270done:
271 return error;
272}
273
274/* create these entries in sysfs:
275 * "/sys/kernel/uids" directory
276 * "/sys/kernel/uids/0" directory (for root user)
277 * "/sys/kernel/uids/0/cpu_share" file (for root user)
278 */
279int __init uids_sysfs_init(void)
280{
281 uids_kset = kset_create_and_add("uids", NULL, kernel_kobj);
282 if (!uids_kset)
283 return -ENOMEM;
284
285 return uids_user_create(&root_user);
286}
287
288/* delayed work function to remove sysfs directory for a user and free up
289 * corresponding structures.
290 */
291static void cleanup_user_struct(struct work_struct *w)
292{
293 struct user_struct *up = container_of(w, struct user_struct, work.work);
294 unsigned long flags;
295 int remove_user = 0;
296
297 /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
298 * atomic.
299 */
300 uids_mutex_lock();
301
302 spin_lock_irqsave(&uidhash_lock, flags);
303 if (atomic_read(&up->__count) == 0) {
304 uid_hash_remove(up);
305 remove_user = 1;
306 }
307 spin_unlock_irqrestore(&uidhash_lock, flags);
308
309 if (!remove_user)
310 goto done;
311
312 if (up->user_ns == &init_user_ns) {
313 kobject_uevent(&up->kobj, KOBJ_REMOVE);
314 kobject_del(&up->kobj);
315 kobject_put(&up->kobj);
316 }
317
318 sched_destroy_user(up);
319 key_put(up->uid_keyring);
320 key_put(up->session_keyring);
321 kmem_cache_free(uid_cachep, up);
322
323done:
324 uids_mutex_unlock();
325}
326
327/* IRQs are disabled and uidhash_lock is held upon function entry.
328 * IRQ state (as stored in flags) is restored and uidhash_lock released
329 * upon function exit.
330 */
331static void free_user(struct user_struct *up, unsigned long flags)
332{
333 INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
334 schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
335 spin_unlock_irqrestore(&uidhash_lock, flags);
336}
337
338#else /* CONFIG_USER_SCHED && CONFIG_SYSFS */
339
340static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent) 75static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
341{ 76{
342 struct user_struct *user; 77 struct user_struct *user;
@@ -352,11 +87,6 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
352 return NULL; 87 return NULL;
353} 88}
354 89
355int uids_sysfs_init(void) { return 0; }
356static inline int uids_user_create(struct user_struct *up) { return 0; }
357static inline void uids_mutex_lock(void) { }
358static inline void uids_mutex_unlock(void) { }
359
360/* IRQs are disabled and uidhash_lock is held upon function entry. 90/* IRQs are disabled and uidhash_lock is held upon function entry.
361 * IRQ state (as stored in flags) is restored and uidhash_lock released 91 * IRQ state (as stored in flags) is restored and uidhash_lock released
362 * upon function exit. 92 * upon function exit.
@@ -365,32 +95,11 @@ static void free_user(struct user_struct *up, unsigned long flags)
365{ 95{
366 uid_hash_remove(up); 96 uid_hash_remove(up);
367 spin_unlock_irqrestore(&uidhash_lock, flags); 97 spin_unlock_irqrestore(&uidhash_lock, flags);
368 sched_destroy_user(up);
369 key_put(up->uid_keyring); 98 key_put(up->uid_keyring);
370 key_put(up->session_keyring); 99 key_put(up->session_keyring);
371 kmem_cache_free(uid_cachep, up); 100 kmem_cache_free(uid_cachep, up);
372} 101}
373 102
374#endif
375
376#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED)
377/*
378 * We need to check if a setuid can take place. This function should be called
379 * before successfully completing the setuid.
380 */
381int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
382{
383
384 return sched_rt_can_attach(up->tg, tsk);
385
386}
387#else
388int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
389{
390 return 1;
391}
392#endif
393
394/* 103/*
395 * Locate the user_struct for the passed UID. If found, take a ref on it. The 104 * Locate the user_struct for the passed UID. If found, take a ref on it. The
396 * caller must undo that ref with free_uid(). 105 * caller must undo that ref with free_uid().
@@ -431,8 +140,6 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
431 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert() 140 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
432 * atomic. 141 * atomic.
433 */ 142 */
434 uids_mutex_lock();
435
436 spin_lock_irq(&uidhash_lock); 143 spin_lock_irq(&uidhash_lock);
437 up = uid_hash_find(uid, hashent); 144 up = uid_hash_find(uid, hashent);
438 spin_unlock_irq(&uidhash_lock); 145 spin_unlock_irq(&uidhash_lock);
@@ -445,14 +152,8 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
445 new->uid = uid; 152 new->uid = uid;
446 atomic_set(&new->__count, 1); 153 atomic_set(&new->__count, 1);
447 154
448 if (sched_create_user(new) < 0)
449 goto out_free_user;
450
451 new->user_ns = get_user_ns(ns); 155 new->user_ns = get_user_ns(ns);
452 156
453 if (uids_user_create(new))
454 goto out_destoy_sched;
455
456 /* 157 /*
457 * Before adding this, check whether we raced 158 * Before adding this, check whether we raced
458 * on adding the same user already.. 159 * on adding the same user already..
@@ -475,17 +176,11 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
475 spin_unlock_irq(&uidhash_lock); 176 spin_unlock_irq(&uidhash_lock);
476 } 177 }
477 178
478 uids_mutex_unlock();
479
480 return up; 179 return up;
481 180
482out_destoy_sched:
483 sched_destroy_user(new);
484 put_user_ns(new->user_ns); 181 put_user_ns(new->user_ns);
485out_free_user:
486 kmem_cache_free(uid_cachep, new); 182 kmem_cache_free(uid_cachep, new);
487out_unlock: 183out_unlock:
488 uids_mutex_unlock();
489 return NULL; 184 return NULL;
490} 185}
491 186