author     Ingo Molnar <mingo@elte.hu>   2010-03-09 11:11:53 -0500
committer  Ingo Molnar <mingo@elte.hu>   2010-03-09 11:11:53 -0500
commit     548b84166917d6f5e2296123b85ad24aecd3801d (patch)
tree       0ab0300e23a02df0fe3c0579627e4998bb122c00 /kernel/sched.c
parent     cfb581bcd4f8c158c6f2b48bf5e232bb9e6855c0 (diff)
parent     57d54889cd00db2752994b389ba714138652e60c (diff)
Merge commit 'v2.6.34-rc1' into perf/urgent
Conflicts:
	tools/perf/util/probe-event.c

Merge reason: Pick up -rc1 and resolve the conflict as well.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--   kernel/sched.c   2206
1 file changed, 176 insertions, 2030 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 3e71ebb101c2..150b6988de49 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -233,7 +233,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
233 */ 233 */
234static DEFINE_MUTEX(sched_domains_mutex); 234static DEFINE_MUTEX(sched_domains_mutex);
235 235
236#ifdef CONFIG_GROUP_SCHED 236#ifdef CONFIG_CGROUP_SCHED
237 237
238#include <linux/cgroup.h> 238#include <linux/cgroup.h>
239 239
@@ -243,13 +243,7 @@ static LIST_HEAD(task_groups);
243 243
244/* task group related information */ 244/* task group related information */
245struct task_group { 245struct task_group {
246#ifdef CONFIG_CGROUP_SCHED
247 struct cgroup_subsys_state css; 246 struct cgroup_subsys_state css;
248#endif
249
250#ifdef CONFIG_USER_SCHED
251 uid_t uid;
252#endif
253 247
254#ifdef CONFIG_FAIR_GROUP_SCHED 248#ifdef CONFIG_FAIR_GROUP_SCHED
255 /* schedulable entities of this group on each cpu */ 249 /* schedulable entities of this group on each cpu */
@@ -274,35 +268,7 @@ struct task_group {
274 struct list_head children; 268 struct list_head children;
275}; 269};
276 270
277#ifdef CONFIG_USER_SCHED
278
279/* Helper function to pass uid information to create_sched_user() */
280void set_tg_uid(struct user_struct *user)
281{
282 user->tg->uid = user->uid;
283}
284
285/*
286 * Root task group.
287 * Every UID task group (including init_task_group aka UID-0) will
288 * be a child to this group.
289 */
290struct task_group root_task_group;
291
292#ifdef CONFIG_FAIR_GROUP_SCHED
293/* Default task group's sched entity on each cpu */
294static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
295/* Default task group's cfs_rq on each cpu */
296static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
297#endif /* CONFIG_FAIR_GROUP_SCHED */
298
299#ifdef CONFIG_RT_GROUP_SCHED
300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var);
302#endif /* CONFIG_RT_GROUP_SCHED */
303#else /* !CONFIG_USER_SCHED */
304#define root_task_group init_task_group 271#define root_task_group init_task_group
305#endif /* CONFIG_USER_SCHED */
306 272
307/* task_group_lock serializes add/remove of task groups and also changes to 273/* task_group_lock serializes add/remove of task groups and also changes to
308 * a task group's cpu shares. 274 * a task group's cpu shares.
@@ -318,11 +284,7 @@ static int root_task_group_empty(void)
318} 284}
319#endif 285#endif
320 286
321#ifdef CONFIG_USER_SCHED
322# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
323#else /* !CONFIG_USER_SCHED */
324# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 287# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
325#endif /* CONFIG_USER_SCHED */
326 288
327/* 289/*
328 * A weight of 0 or 1 can cause arithmetics problems. 290 * A weight of 0 or 1 can cause arithmetics problems.
@@ -348,11 +310,7 @@ static inline struct task_group *task_group(struct task_struct *p)
348{ 310{
349 struct task_group *tg; 311 struct task_group *tg;
350 312
351#ifdef CONFIG_USER_SCHED 313#ifdef CONFIG_CGROUP_SCHED
352 rcu_read_lock();
353 tg = __task_cred(p)->user->tg;
354 rcu_read_unlock();
355#elif defined(CONFIG_CGROUP_SCHED)
356 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), 314 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
357 struct task_group, css); 315 struct task_group, css);
358#else 316#else
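
With CONFIG_USER_SCHED gone, task_group() resolves a task's group purely through the cgroup subsystem state: task_subsys_state() hands back the embedded cgroup_subsys_state, and container_of() walks back to the enclosing task_group. A minimal user-space sketch of that container_of() step follows; the struct layout, names and values are illustrative, not the kernel's.

    /*
     * User-space sketch of the container_of() step used by task_group().
     * The struct layout and values here are illustrative, not the kernel's.
     */
    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct css { int id; };                 /* stand-in for cgroup_subsys_state */

    struct task_group_demo {
        long shares;
        struct css css;                     /* embedded member, like tg->css */
    };

    int main(void)
    {
        struct task_group_demo tg = { .shares = 1024, .css = { .id = 7 } };

        /* the cgroup core would hand back only the embedded css pointer */
        struct css *p = &tg.css;

        /* recover the enclosing group, exactly what task_group() does */
        struct task_group_demo *back =
            container_of(p, struct task_group_demo, css);

        printf("shares = %ld\n", back->shares);   /* prints: shares = 1024 */
        return 0;
    }
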
@@ -383,7 +341,7 @@ static inline struct task_group *task_group(struct task_struct *p)
383 return NULL; 341 return NULL;
384} 342}
385 343
386#endif /* CONFIG_GROUP_SCHED */ 344#endif /* CONFIG_CGROUP_SCHED */
387 345
388/* CFS-related fields in a runqueue */ 346/* CFS-related fields in a runqueue */
389struct cfs_rq { 347struct cfs_rq {
@@ -478,7 +436,6 @@ struct rt_rq {
478 struct rq *rq; 436 struct rq *rq;
479 struct list_head leaf_rt_rq_list; 437 struct list_head leaf_rt_rq_list;
480 struct task_group *tg; 438 struct task_group *tg;
481 struct sched_rt_entity *rt_se;
482#endif 439#endif
483}; 440};
484 441
@@ -645,6 +602,11 @@ static inline int cpu_of(struct rq *rq)
645#endif 602#endif
646} 603}
647 604
605#define rcu_dereference_check_sched_domain(p) \
606 rcu_dereference_check((p), \
607 rcu_read_lock_sched_held() || \
608 lockdep_is_held(&sched_domains_mutex))
609
648/* 610/*
649 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 611 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
650 * See detach_destroy_domains: synchronize_sched for details. 612 * See detach_destroy_domains: synchronize_sched for details.
@@ -653,7 +615,7 @@ static inline int cpu_of(struct rq *rq)
653 * preempt-disabled sections. 615 * preempt-disabled sections.
654 */ 616 */
655#define for_each_domain(cpu, __sd) \ 617#define for_each_domain(cpu, __sd) \
656 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) 618 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
657 619
658#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 620#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
659#define this_rq() (&__get_cpu_var(runqueues)) 621#define this_rq() (&__get_cpu_var(runqueues))
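
The new rcu_dereference_check_sched_domain() wrapper lets lockdep-aware RCU verify that rq->sd is only dereferenced either inside an RCU-sched read-side critical section or with sched_domains_mutex held. A kernel-context sketch of the two legal usage patterns follows; walk_domains_example() is hypothetical and not part of this patch, and it is a fragment meant to sit inside sched.c, using only symbols defined there or in the RCU/mutex headers.

    /*
     * Kernel-context sketch, not part of this patch: the two usage patterns
     * that satisfy rcu_dereference_check_sched_domain().
     */
    static void walk_domains_example(int cpu)
    {
        struct sched_domain *sd;

        /* 1) reader: RCU-sched read side => rcu_read_lock_sched_held() */
        rcu_read_lock_sched();
        for_each_domain(cpu, sd)
            (void)sd;               /* read-only inspection would go here */
        rcu_read_unlock_sched();

        /* 2) updater: satisfies lockdep_is_held(&sched_domains_mutex) */
        mutex_lock(&sched_domains_mutex);
        for_each_domain(cpu, sd)
            (void)sd;               /* safe to inspect or rebuild here */
        mutex_unlock(&sched_domains_mutex);
    }
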
@@ -941,16 +903,33 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
941#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 903#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
942 904
943/* 905/*
906 * Check whether the task is waking, we use this to synchronize against
907 * ttwu() so that task_cpu() reports a stable number.
908 *
909 * We need to make an exception for PF_STARTING tasks because the fork
910 * path might require task_rq_lock() to work, eg. it can call
911 * set_cpus_allowed_ptr() from the cpuset clone_ns code.
912 */
913static inline int task_is_waking(struct task_struct *p)
914{
915 return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING));
916}
917
918/*
944 * __task_rq_lock - lock the runqueue a given task resides on. 919 * __task_rq_lock - lock the runqueue a given task resides on.
945 * Must be called interrupts disabled. 920 * Must be called interrupts disabled.
946 */ 921 */
947static inline struct rq *__task_rq_lock(struct task_struct *p) 922static inline struct rq *__task_rq_lock(struct task_struct *p)
948 __acquires(rq->lock) 923 __acquires(rq->lock)
949{ 924{
925 struct rq *rq;
926
950 for (;;) { 927 for (;;) {
951 struct rq *rq = task_rq(p); 928 while (task_is_waking(p))
929 cpu_relax();
930 rq = task_rq(p);
952 raw_spin_lock(&rq->lock); 931 raw_spin_lock(&rq->lock);
953 if (likely(rq == task_rq(p))) 932 if (likely(rq == task_rq(p) && !task_is_waking(p)))
954 return rq; 933 return rq;
955 raw_spin_unlock(&rq->lock); 934 raw_spin_unlock(&rq->lock);
956 } 935 }
@@ -967,10 +946,12 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
967 struct rq *rq; 946 struct rq *rq;
968 947
969 for (;;) { 948 for (;;) {
949 while (task_is_waking(p))
950 cpu_relax();
970 local_irq_save(*flags); 951 local_irq_save(*flags);
971 rq = task_rq(p); 952 rq = task_rq(p);
972 raw_spin_lock(&rq->lock); 953 raw_spin_lock(&rq->lock);
973 if (likely(rq == task_rq(p))) 954 if (likely(rq == task_rq(p) && !task_is_waking(p)))
974 return rq; 955 return rq;
975 raw_spin_unlock_irqrestore(&rq->lock, *flags); 956 raw_spin_unlock_irqrestore(&rq->lock, *flags);
976 } 957 }
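
Both rq-locking helpers now spin while the task is in TASK_WAKING (unless PF_STARTING) and re-check after taking the lock, so ttwu() can migrate a waking task without holding any rq->lock. The following user-space analogue of that wait / lock / revalidate loop is a sketch only: the struct and function names are hypothetical, pthread mutexes stand in for rq->lock, and sched_yield() stands in for cpu_relax().

    /*
     * User-space analogue, not from this patch: the wait / lock / revalidate
     * loop of __task_rq_lock().
     */
    #include <pthread.h>
    #include <sched.h>
    #include <stdatomic.h>

    struct rq_demo { pthread_mutex_t lock; };

    struct task_demo {
        _Atomic(struct rq_demo *) rq;   /* current runqueue, may change under us */
        atomic_bool waking;             /* analogue of p->state == TASK_WAKING   */
    };

    struct rq_demo *task_rq_lock_demo(struct task_demo *t)
    {
        for (;;) {
            /* don't even try while the task is mid-wakeup */
            while (atomic_load(&t->waking))
                sched_yield();

            struct rq_demo *rq = atomic_load(&t->rq);
            pthread_mutex_lock(&rq->lock);

            /* revalidate: still this task's rq, and still not waking? */
            if (rq == atomic_load(&t->rq) && !atomic_load(&t->waking))
                return rq;              /* caller releases rq->lock */

            pthread_mutex_unlock(&rq->lock);
        }
    }

If either check fails after the lock is taken, the lock is dropped and the whole sequence is retried, which is exactly what the kernel loop above does.
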
@@ -1390,32 +1371,6 @@ static const u32 prio_to_wmult[40] = {
1390 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 1371 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1391}; 1372};
1392 1373
1393static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1394
1395/*
1396 * runqueue iterator, to support SMP load-balancing between different
1397 * scheduling classes, without having to expose their internal data
1398 * structures to the load-balancing proper:
1399 */
1400struct rq_iterator {
1401 void *arg;
1402 struct task_struct *(*start)(void *);
1403 struct task_struct *(*next)(void *);
1404};
1405
1406#ifdef CONFIG_SMP
1407static unsigned long
1408balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1409 unsigned long max_load_move, struct sched_domain *sd,
1410 enum cpu_idle_type idle, int *all_pinned,
1411 int *this_best_prio, struct rq_iterator *iterator);
1412
1413static int
1414iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1415 struct sched_domain *sd, enum cpu_idle_type idle,
1416 struct rq_iterator *iterator);
1417#endif
1418
1419/* Time spent by the tasks of the cpu accounting group executing in ... */ 1374/* Time spent by the tasks of the cpu accounting group executing in ... */
1420enum cpuacct_stat_index { 1375enum cpuacct_stat_index {
1421 CPUACCT_STAT_USER, /* ... user mode */ 1376 CPUACCT_STAT_USER, /* ... user mode */
@@ -1531,7 +1486,7 @@ static unsigned long target_load(int cpu, int type)
1531 1486
1532static struct sched_group *group_of(int cpu) 1487static struct sched_group *group_of(int cpu)
1533{ 1488{
1534 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); 1489 struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd);
1535 1490
1536 if (!sd) 1491 if (!sd)
1537 return NULL; 1492 return NULL;
@@ -1566,7 +1521,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1566 1521
1567#ifdef CONFIG_FAIR_GROUP_SCHED 1522#ifdef CONFIG_FAIR_GROUP_SCHED
1568 1523
1569static __read_mostly unsigned long *update_shares_data; 1524static __read_mostly unsigned long __percpu *update_shares_data;
1570 1525
1571static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1526static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1572 1527
@@ -1701,16 +1656,6 @@ static void update_shares(struct sched_domain *sd)
1701 } 1656 }
1702} 1657}
1703 1658
1704static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1705{
1706 if (root_task_group_empty())
1707 return;
1708
1709 raw_spin_unlock(&rq->lock);
1710 update_shares(sd);
1711 raw_spin_lock(&rq->lock);
1712}
1713
1714static void update_h_load(long cpu) 1659static void update_h_load(long cpu)
1715{ 1660{
1716 if (root_task_group_empty()) 1661 if (root_task_group_empty())
@@ -1725,10 +1670,6 @@ static inline void update_shares(struct sched_domain *sd)
1725{ 1670{
1726} 1671}
1727 1672
1728static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1729{
1730}
1731
1732#endif 1673#endif
1733 1674
1734#ifdef CONFIG_PREEMPT 1675#ifdef CONFIG_PREEMPT
@@ -1805,6 +1746,51 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1805 raw_spin_unlock(&busiest->lock); 1746 raw_spin_unlock(&busiest->lock);
1806 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1747 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1807} 1748}
1749
1750/*
1751 * double_rq_lock - safely lock two runqueues
1752 *
1753 * Note this does not disable interrupts like task_rq_lock,
1754 * you need to do so manually before calling.
1755 */
1756static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1757 __acquires(rq1->lock)
1758 __acquires(rq2->lock)
1759{
1760 BUG_ON(!irqs_disabled());
1761 if (rq1 == rq2) {
1762 raw_spin_lock(&rq1->lock);
1763 __acquire(rq2->lock); /* Fake it out ;) */
1764 } else {
1765 if (rq1 < rq2) {
1766 raw_spin_lock(&rq1->lock);
1767 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1768 } else {
1769 raw_spin_lock(&rq2->lock);
1770 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1771 }
1772 }
1773 update_rq_clock(rq1);
1774 update_rq_clock(rq2);
1775}
1776
1777/*
1778 * double_rq_unlock - safely unlock two runqueues
1779 *
1780 * Note this does not restore interrupts like task_rq_unlock,
1781 * you need to do so manually after calling.
1782 */
1783static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1784 __releases(rq1->lock)
1785 __releases(rq2->lock)
1786{
1787 raw_spin_unlock(&rq1->lock);
1788 if (rq1 != rq2)
1789 raw_spin_unlock(&rq2->lock);
1790 else
1791 __release(rq2->lock);
1792}
1793
1808#endif 1794#endif
1809 1795
1810#ifdef CONFIG_FAIR_GROUP_SCHED 1796#ifdef CONFIG_FAIR_GROUP_SCHED
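
double_rq_lock()/double_rq_unlock() are added here and their old copies in the SMP load-balancing section further down are deleted, keeping the helpers defined before the sched_*.c class files are included. They avoid ABBA deadlock by always taking the lower-addressed runqueue lock first. A small user-space sketch of the same ordering rule, with pthread mutexes standing in for rq->lock and illustrative names:

    /*
     * User-space sketch, not from this patch: the address-ordering rule that
     * double_rq_lock() relies on to avoid ABBA deadlock.
     */
    #include <pthread.h>
    #include <stdint.h>

    struct rq_demo { pthread_mutex_t lock; };

    void double_lock_demo(struct rq_demo *a, struct rq_demo *b)
    {
        if (a == b) {
            pthread_mutex_lock(&a->lock);          /* one lock covers both */
        } else if ((uintptr_t)a < (uintptr_t)b) {  /* lower address first  */
            pthread_mutex_lock(&a->lock);
            pthread_mutex_lock(&b->lock);
        } else {
            pthread_mutex_lock(&b->lock);
            pthread_mutex_lock(&a->lock);
        }
    }

    void double_unlock_demo(struct rq_demo *a, struct rq_demo *b)
    {
        pthread_mutex_unlock(&a->lock);
        if (a != b)
            pthread_mutex_unlock(&b->lock);
    }

Because every caller takes the two locks in the same global order, two CPUs locking each other's runqueues concurrently can never deadlock.
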
@@ -1834,18 +1820,14 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1834#endif 1820#endif
1835} 1821}
1836 1822
1837#include "sched_stats.h" 1823static const struct sched_class rt_sched_class;
1838#include "sched_idletask.c"
1839#include "sched_fair.c"
1840#include "sched_rt.c"
1841#ifdef CONFIG_SCHED_DEBUG
1842# include "sched_debug.c"
1843#endif
1844 1824
1845#define sched_class_highest (&rt_sched_class) 1825#define sched_class_highest (&rt_sched_class)
1846#define for_each_class(class) \ 1826#define for_each_class(class) \
1847 for (class = sched_class_highest; class; class = class->next) 1827 for (class = sched_class_highest; class; class = class->next)
1848 1828
1829#include "sched_stats.h"
1830
1849static void inc_nr_running(struct rq *rq) 1831static void inc_nr_running(struct rq *rq)
1850{ 1832{
1851 rq->nr_running++; 1833 rq->nr_running++;
@@ -1883,13 +1865,14 @@ static void update_avg(u64 *avg, u64 sample)
1883 *avg += diff >> 3; 1865 *avg += diff >> 3;
1884} 1866}
1885 1867
1886static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1868static void
1869enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1887{ 1870{
1888 if (wakeup) 1871 if (wakeup)
1889 p->se.start_runtime = p->se.sum_exec_runtime; 1872 p->se.start_runtime = p->se.sum_exec_runtime;
1890 1873
1891 sched_info_queued(p); 1874 sched_info_queued(p);
1892 p->sched_class->enqueue_task(rq, p, wakeup); 1875 p->sched_class->enqueue_task(rq, p, wakeup, head);
1893 p->se.on_rq = 1; 1876 p->se.on_rq = 1;
1894} 1877}
1895 1878
@@ -1912,6 +1895,37 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1912} 1895}
1913 1896
1914/* 1897/*
1898 * activate_task - move a task to the runqueue.
1899 */
1900static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1901{
1902 if (task_contributes_to_load(p))
1903 rq->nr_uninterruptible--;
1904
1905 enqueue_task(rq, p, wakeup, false);
1906 inc_nr_running(rq);
1907}
1908
1909/*
1910 * deactivate_task - remove a task from the runqueue.
1911 */
1912static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1913{
1914 if (task_contributes_to_load(p))
1915 rq->nr_uninterruptible++;
1916
1917 dequeue_task(rq, p, sleep);
1918 dec_nr_running(rq);
1919}
1920
1921#include "sched_idletask.c"
1922#include "sched_fair.c"
1923#include "sched_rt.c"
1924#ifdef CONFIG_SCHED_DEBUG
1925# include "sched_debug.c"
1926#endif
1927
1928/*
1915 * __normal_prio - return the priority that is based on the static prio 1929 * __normal_prio - return the priority that is based on the static prio
1916 */ 1930 */
1917static inline int __normal_prio(struct task_struct *p) 1931static inline int __normal_prio(struct task_struct *p)
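
In the hunk above, enqueue_task() gains a head flag, and activate_task()/deactivate_task() move above the sched_*.c includes so the class implementations can call them. The head flag selects front-of-queue versus back-of-queue insertion, in the spirit of list_add() versus list_add_tail(). A user-space sketch of that distinction on a circular doubly linked list (illustrative only, not the kernel's list.h):

    /*
     * User-space sketch, not from this patch: what the new `head' argument
     * selects, front vs. back insertion on a circular doubly linked list.
     */
    #include <stdbool.h>

    struct node { struct node *prev, *next; };

    void queue_init(struct node *q) { q->prev = q->next = q; }

    void enqueue_demo(struct node *q, struct node *n, bool head)
    {
        struct node *prev = head ? q : q->prev;    /* insert point */
        struct node *next = prev->next;

        n->prev = prev;
        n->next = next;
        prev->next = n;
        next->prev = n;
    }

    int main(void)
    {
        struct node q, a, b;

        queue_init(&q);
        enqueue_demo(&q, &a, false);   /* tail, like enqueue_task(..., head=false) */
        enqueue_demo(&q, &b, true);    /* head, e.g. a requeued task               */

        return (q.next == &b && q.prev == &a) ? 0 : 1;
    }
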
@@ -1957,30 +1971,6 @@ static int effective_prio(struct task_struct *p)
1957 return p->prio; 1971 return p->prio;
1958} 1972}
1959 1973
1960/*
1961 * activate_task - move a task to the runqueue.
1962 */
1963static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1964{
1965 if (task_contributes_to_load(p))
1966 rq->nr_uninterruptible--;
1967
1968 enqueue_task(rq, p, wakeup);
1969 inc_nr_running(rq);
1970}
1971
1972/*
1973 * deactivate_task - remove a task from the runqueue.
1974 */
1975static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1976{
1977 if (task_contributes_to_load(p))
1978 rq->nr_uninterruptible++;
1979
1980 dequeue_task(rq, p, sleep);
1981 dec_nr_running(rq);
1982}
1983
1984/** 1974/**
1985 * task_curr - is this task currently executing on a CPU? 1975 * task_curr - is this task currently executing on a CPU?
1986 * @p: the task in question. 1976 * @p: the task in question.
@@ -2408,14 +2398,27 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2408 __task_rq_unlock(rq); 2398 __task_rq_unlock(rq);
2409 2399
2410 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2400 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2411 if (cpu != orig_cpu) 2401 if (cpu != orig_cpu) {
2402 /*
2403 * Since we migrate the task without holding any rq->lock,
2404 * we need to be careful with task_rq_lock(), since that
2405 * might end up locking an invalid rq.
2406 */
2412 set_task_cpu(p, cpu); 2407 set_task_cpu(p, cpu);
2408 }
2413 2409
2414 rq = __task_rq_lock(p); 2410 rq = cpu_rq(cpu);
2411 raw_spin_lock(&rq->lock);
2415 update_rq_clock(rq); 2412 update_rq_clock(rq);
2416 2413
2414 /*
2415 * We migrated the task without holding either rq->lock, however
2416 * since the task is not on the task list itself, nobody else
2417 * will try and migrate the task, hence the rq should match the
2418 * cpu we just moved it to.
2419 */
2420 WARN_ON(task_cpu(p) != cpu);
2417 WARN_ON(p->state != TASK_WAKING); 2421 WARN_ON(p->state != TASK_WAKING);
2418 cpu = task_cpu(p);
2419 2422
2420#ifdef CONFIG_SCHEDSTATS 2423#ifdef CONFIG_SCHEDSTATS
2421 schedstat_inc(rq, ttwu_count); 2424 schedstat_inc(rq, ttwu_count);
@@ -2663,7 +2666,13 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2663 set_task_cpu(p, cpu); 2666 set_task_cpu(p, cpu);
2664#endif 2667#endif
2665 2668
2666 rq = task_rq_lock(p, &flags); 2669 /*
2670 * Since the task is not on the rq and we still have TASK_WAKING set
2671 * nobody else will migrate this task.
2672 */
2673 rq = cpu_rq(cpu);
2674 raw_spin_lock_irqsave(&rq->lock, flags);
2675
2667 BUG_ON(p->state != TASK_WAKING); 2676 BUG_ON(p->state != TASK_WAKING);
2668 p->state = TASK_RUNNING; 2677 p->state = TASK_RUNNING;
2669 update_rq_clock(rq); 2678 update_rq_clock(rq);
@@ -3105,50 +3114,6 @@ static void update_cpu_load(struct rq *this_rq)
3105#ifdef CONFIG_SMP 3114#ifdef CONFIG_SMP
3106 3115
3107/* 3116/*
3108 * double_rq_lock - safely lock two runqueues
3109 *
3110 * Note this does not disable interrupts like task_rq_lock,
3111 * you need to do so manually before calling.
3112 */
3113static void double_rq_lock(struct rq *rq1, struct rq *rq2)
3114 __acquires(rq1->lock)
3115 __acquires(rq2->lock)
3116{
3117 BUG_ON(!irqs_disabled());
3118 if (rq1 == rq2) {
3119 raw_spin_lock(&rq1->lock);
3120 __acquire(rq2->lock); /* Fake it out ;) */
3121 } else {
3122 if (rq1 < rq2) {
3123 raw_spin_lock(&rq1->lock);
3124 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
3125 } else {
3126 raw_spin_lock(&rq2->lock);
3127 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
3128 }
3129 }
3130 update_rq_clock(rq1);
3131 update_rq_clock(rq2);
3132}
3133
3134/*
3135 * double_rq_unlock - safely unlock two runqueues
3136 *
3137 * Note this does not restore interrupts like task_rq_unlock,
3138 * you need to do so manually after calling.
3139 */
3140static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
3141 __releases(rq1->lock)
3142 __releases(rq2->lock)
3143{
3144 raw_spin_unlock(&rq1->lock);
3145 if (rq1 != rq2)
3146 raw_spin_unlock(&rq2->lock);
3147 else
3148 __release(rq2->lock);
3149}
3150
3151/*
3152 * sched_exec - execve() is a valuable balancing opportunity, because at 3117 * sched_exec - execve() is a valuable balancing opportunity, because at
3153 * this point the task has the smallest effective memory and cache footprint. 3118 * this point the task has the smallest effective memory and cache footprint.
3154 */ 3119 */
@@ -3196,1771 +3161,6 @@ again:
3196 task_rq_unlock(rq, &flags); 3161 task_rq_unlock(rq, &flags);
3197} 3162}
3198 3163
3199/*
3200 * pull_task - move a task from a remote runqueue to the local runqueue.
3201 * Both runqueues must be locked.
3202 */
3203static void pull_task(struct rq *src_rq, struct task_struct *p,
3204 struct rq *this_rq, int this_cpu)
3205{
3206 deactivate_task(src_rq, p, 0);
3207 set_task_cpu(p, this_cpu);
3208 activate_task(this_rq, p, 0);
3209 check_preempt_curr(this_rq, p, 0);
3210}
3211
3212/*
3213 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3214 */
3215static
3216int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3217 struct sched_domain *sd, enum cpu_idle_type idle,
3218 int *all_pinned)
3219{
3220 int tsk_cache_hot = 0;
3221 /*
3222 * We do not migrate tasks that are:
3223 * 1) running (obviously), or
3224 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3225 * 3) are cache-hot on their current CPU.
3226 */
3227 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
3228 schedstat_inc(p, se.nr_failed_migrations_affine);
3229 return 0;
3230 }
3231 *all_pinned = 0;
3232
3233 if (task_running(rq, p)) {
3234 schedstat_inc(p, se.nr_failed_migrations_running);
3235 return 0;
3236 }
3237
3238 /*
3239 * Aggressive migration if:
3240 * 1) task is cache cold, or
3241 * 2) too many balance attempts have failed.
3242 */
3243
3244 tsk_cache_hot = task_hot(p, rq->clock, sd);
3245 if (!tsk_cache_hot ||
3246 sd->nr_balance_failed > sd->cache_nice_tries) {
3247#ifdef CONFIG_SCHEDSTATS
3248 if (tsk_cache_hot) {
3249 schedstat_inc(sd, lb_hot_gained[idle]);
3250 schedstat_inc(p, se.nr_forced_migrations);
3251 }
3252#endif
3253 return 1;
3254 }
3255
3256 if (tsk_cache_hot) {
3257 schedstat_inc(p, se.nr_failed_migrations_hot);
3258 return 0;
3259 }
3260 return 1;
3261}
3262
3263static unsigned long
3264balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3265 unsigned long max_load_move, struct sched_domain *sd,
3266 enum cpu_idle_type idle, int *all_pinned,
3267 int *this_best_prio, struct rq_iterator *iterator)
3268{
3269 int loops = 0, pulled = 0, pinned = 0;
3270 struct task_struct *p;
3271 long rem_load_move = max_load_move;
3272
3273 if (max_load_move == 0)
3274 goto out;
3275
3276 pinned = 1;
3277
3278 /*
3279 * Start the load-balancing iterator:
3280 */
3281 p = iterator->start(iterator->arg);
3282next:
3283 if (!p || loops++ > sysctl_sched_nr_migrate)
3284 goto out;
3285
3286 if ((p->se.load.weight >> 1) > rem_load_move ||
3287 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3288 p = iterator->next(iterator->arg);
3289 goto next;
3290 }
3291
3292 pull_task(busiest, p, this_rq, this_cpu);
3293 pulled++;
3294 rem_load_move -= p->se.load.weight;
3295
3296#ifdef CONFIG_PREEMPT
3297 /*
3298 * NEWIDLE balancing is a source of latency, so preemptible kernels
3299 * will stop after the first task is pulled to minimize the critical
3300 * section.
3301 */
3302 if (idle == CPU_NEWLY_IDLE)
3303 goto out;
3304#endif
3305
3306 /*
3307 * We only want to steal up to the prescribed amount of weighted load.
3308 */
3309 if (rem_load_move > 0) {
3310 if (p->prio < *this_best_prio)
3311 *this_best_prio = p->prio;
3312 p = iterator->next(iterator->arg);
3313 goto next;
3314 }
3315out:
3316 /*
3317 * Right now, this is one of only two places pull_task() is called,
3318 * so we can safely collect pull_task() stats here rather than
3319 * inside pull_task().
3320 */
3321 schedstat_add(sd, lb_gained[idle], pulled);
3322
3323 if (all_pinned)
3324 *all_pinned = pinned;
3325
3326 return max_load_move - rem_load_move;
3327}
3328
3329/*
3330 * move_tasks tries to move up to max_load_move weighted load from busiest to
3331 * this_rq, as part of a balancing operation within domain "sd".
3332 * Returns 1 if successful and 0 otherwise.
3333 *
3334 * Called with both runqueues locked.
3335 */
3336static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3337 unsigned long max_load_move,
3338 struct sched_domain *sd, enum cpu_idle_type idle,
3339 int *all_pinned)
3340{
3341 const struct sched_class *class = sched_class_highest;
3342 unsigned long total_load_moved = 0;
3343 int this_best_prio = this_rq->curr->prio;
3344
3345 do {
3346 total_load_moved +=
3347 class->load_balance(this_rq, this_cpu, busiest,
3348 max_load_move - total_load_moved,
3349 sd, idle, all_pinned, &this_best_prio);
3350 class = class->next;
3351
3352#ifdef CONFIG_PREEMPT
3353 /*
3354 * NEWIDLE balancing is a source of latency, so preemptible
3355 * kernels will stop after the first task is pulled to minimize
3356 * the critical section.
3357 */
3358 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3359 break;
3360#endif
3361 } while (class && max_load_move > total_load_moved);
3362
3363 return total_load_moved > 0;
3364}
3365
3366static int
3367iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3368 struct sched_domain *sd, enum cpu_idle_type idle,
3369 struct rq_iterator *iterator)
3370{
3371 struct task_struct *p = iterator->start(iterator->arg);
3372 int pinned = 0;
3373
3374 while (p) {
3375 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3376 pull_task(busiest, p, this_rq, this_cpu);
3377 /*
3378 * Right now, this is only the second place pull_task()
3379 * is called, so we can safely collect pull_task()
3380 * stats here rather than inside pull_task().
3381 */
3382 schedstat_inc(sd, lb_gained[idle]);
3383
3384 return 1;
3385 }
3386 p = iterator->next(iterator->arg);
3387 }
3388
3389 return 0;
3390}
3391
3392/*
3393 * move_one_task tries to move exactly one task from busiest to this_rq, as
3394 * part of active balancing operations within "domain".
3395 * Returns 1 if successful and 0 otherwise.
3396 *
3397 * Called with both runqueues locked.
3398 */
3399static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3400 struct sched_domain *sd, enum cpu_idle_type idle)
3401{
3402 const struct sched_class *class;
3403
3404 for_each_class(class) {
3405 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3406 return 1;
3407 }
3408
3409 return 0;
3410}
3411/********** Helpers for find_busiest_group ************************/
3412/*
3413 * sd_lb_stats - Structure to store the statistics of a sched_domain
3414 * during load balancing.
3415 */
3416struct sd_lb_stats {
3417 struct sched_group *busiest; /* Busiest group in this sd */
3418 struct sched_group *this; /* Local group in this sd */
3419 unsigned long total_load; /* Total load of all groups in sd */
3420 unsigned long total_pwr; /* Total power of all groups in sd */
3421 unsigned long avg_load; /* Average load across all groups in sd */
3422
3423 /** Statistics of this group */
3424 unsigned long this_load;
3425 unsigned long this_load_per_task;
3426 unsigned long this_nr_running;
3427
3428 /* Statistics of the busiest group */
3429 unsigned long max_load;
3430 unsigned long busiest_load_per_task;
3431 unsigned long busiest_nr_running;
3432
3433 int group_imb; /* Is there imbalance in this sd */
3434#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3435 int power_savings_balance; /* Is powersave balance needed for this sd */
3436 struct sched_group *group_min; /* Least loaded group in sd */
3437 struct sched_group *group_leader; /* Group which relieves group_min */
3438 unsigned long min_load_per_task; /* load_per_task in group_min */
3439 unsigned long leader_nr_running; /* Nr running of group_leader */
3440 unsigned long min_nr_running; /* Nr running of group_min */
3441#endif
3442};
3443
3444/*
3445 * sg_lb_stats - stats of a sched_group required for load_balancing
3446 */
3447struct sg_lb_stats {
3448 unsigned long avg_load; /*Avg load across the CPUs of the group */
3449 unsigned long group_load; /* Total load over the CPUs of the group */
3450 unsigned long sum_nr_running; /* Nr tasks running in the group */
3451 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
3452 unsigned long group_capacity;
3453 int group_imb; /* Is there an imbalance in the group ? */
3454};
3455
3456/**
3457 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3458 * @group: The group whose first cpu is to be returned.
3459 */
3460static inline unsigned int group_first_cpu(struct sched_group *group)
3461{
3462 return cpumask_first(sched_group_cpus(group));
3463}
3464
3465/**
3466 * get_sd_load_idx - Obtain the load index for a given sched domain.
3467 * @sd: The sched_domain whose load_idx is to be obtained.
3468 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
3469 */
3470static inline int get_sd_load_idx(struct sched_domain *sd,
3471 enum cpu_idle_type idle)
3472{
3473 int load_idx;
3474
3475 switch (idle) {
3476 case CPU_NOT_IDLE:
3477 load_idx = sd->busy_idx;
3478 break;
3479
3480 case CPU_NEWLY_IDLE:
3481 load_idx = sd->newidle_idx;
3482 break;
3483 default:
3484 load_idx = sd->idle_idx;
3485 break;
3486 }
3487
3488 return load_idx;
3489}
3490
3491
3492#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3493/**
3494 * init_sd_power_savings_stats - Initialize power savings statistics for
3495 * the given sched_domain, during load balancing.
3496 *
3497 * @sd: Sched domain whose power-savings statistics are to be initialized.
3498 * @sds: Variable containing the statistics for sd.
3499 * @idle: Idle status of the CPU at which we're performing load-balancing.
3500 */
3501static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3502 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3503{
3504 /*
3505 * Busy processors will not participate in power savings
3506 * balance.
3507 */
3508 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3509 sds->power_savings_balance = 0;
3510 else {
3511 sds->power_savings_balance = 1;
3512 sds->min_nr_running = ULONG_MAX;
3513 sds->leader_nr_running = 0;
3514 }
3515}
3516
3517/**
3518 * update_sd_power_savings_stats - Update the power saving stats for a
3519 * sched_domain while performing load balancing.
3520 *
3521 * @group: sched_group belonging to the sched_domain under consideration.
3522 * @sds: Variable containing the statistics of the sched_domain
3523 * @local_group: Does group contain the CPU for which we're performing
3524 * load balancing ?
3525 * @sgs: Variable containing the statistics of the group.
3526 */
3527static inline void update_sd_power_savings_stats(struct sched_group *group,
3528 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3529{
3530
3531 if (!sds->power_savings_balance)
3532 return;
3533
3534 /*
3535 * If the local group is idle or completely loaded
3536 * no need to do power savings balance at this domain
3537 */
3538 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3539 !sds->this_nr_running))
3540 sds->power_savings_balance = 0;
3541
3542 /*
3543 * If a group is already running at full capacity or idle,
3544 * don't include that group in power savings calculations
3545 */
3546 if (!sds->power_savings_balance ||
3547 sgs->sum_nr_running >= sgs->group_capacity ||
3548 !sgs->sum_nr_running)
3549 return;
3550
3551 /*
3552 * Calculate the group which has the least non-idle load.
3553 * This is the group from where we need to pick up the load
3554 * for saving power
3555 */
3556 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3557 (sgs->sum_nr_running == sds->min_nr_running &&
3558 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3559 sds->group_min = group;
3560 sds->min_nr_running = sgs->sum_nr_running;
3561 sds->min_load_per_task = sgs->sum_weighted_load /
3562 sgs->sum_nr_running;
3563 }
3564
3565 /*
3566 * Calculate the group which is almost near its
3567 * capacity but still has some space to pick up some load
3568 * from other group and save more power
3569 */
3570 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3571 return;
3572
3573 if (sgs->sum_nr_running > sds->leader_nr_running ||
3574 (sgs->sum_nr_running == sds->leader_nr_running &&
3575 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3576 sds->group_leader = group;
3577 sds->leader_nr_running = sgs->sum_nr_running;
3578 }
3579}
3580
3581/**
3582 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3583 * @sds: Variable containing the statistics of the sched_domain
3584 * under consideration.
3585 * @this_cpu: Cpu at which we're currently performing load-balancing.
3586 * @imbalance: Variable to store the imbalance.
3587 *
3588 * Description:
3589 * Check if we have potential to perform some power-savings balance.
3590 * If yes, set the busiest group to be the least loaded group in the
3591 * sched_domain, so that it's CPUs can be put to idle.
3592 *
3593 * Returns 1 if there is potential to perform power-savings balance.
3594 * Else returns 0.
3595 */
3596static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3597 int this_cpu, unsigned long *imbalance)
3598{
3599 if (!sds->power_savings_balance)
3600 return 0;
3601
3602 if (sds->this != sds->group_leader ||
3603 sds->group_leader == sds->group_min)
3604 return 0;
3605
3606 *imbalance = sds->min_load_per_task;
3607 sds->busiest = sds->group_min;
3608
3609 return 1;
3610
3611}
3612#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3613static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3614 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3615{
3616 return;
3617}
3618
3619static inline void update_sd_power_savings_stats(struct sched_group *group,
3620 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3621{
3622 return;
3623}
3624
3625static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3626 int this_cpu, unsigned long *imbalance)
3627{
3628 return 0;
3629}
3630#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3631
3632
3633unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3634{
3635 return SCHED_LOAD_SCALE;
3636}
3637
3638unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3639{
3640 return default_scale_freq_power(sd, cpu);
3641}
3642
3643unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3644{
3645 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3646 unsigned long smt_gain = sd->smt_gain;
3647
3648 smt_gain /= weight;
3649
3650 return smt_gain;
3651}
3652
3653unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3654{
3655 return default_scale_smt_power(sd, cpu);
3656}
3657
3658unsigned long scale_rt_power(int cpu)
3659{
3660 struct rq *rq = cpu_rq(cpu);
3661 u64 total, available;
3662
3663 sched_avg_update(rq);
3664
3665 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3666 available = total - rq->rt_avg;
3667
3668 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3669 total = SCHED_LOAD_SCALE;
3670
3671 total >>= SCHED_LOAD_SHIFT;
3672
3673 return div_u64(available, total);
3674}
3675
3676static void update_cpu_power(struct sched_domain *sd, int cpu)
3677{
3678 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3679 unsigned long power = SCHED_LOAD_SCALE;
3680 struct sched_group *sdg = sd->groups;
3681
3682 if (sched_feat(ARCH_POWER))
3683 power *= arch_scale_freq_power(sd, cpu);
3684 else
3685 power *= default_scale_freq_power(sd, cpu);
3686
3687 power >>= SCHED_LOAD_SHIFT;
3688
3689 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3690 if (sched_feat(ARCH_POWER))
3691 power *= arch_scale_smt_power(sd, cpu);
3692 else
3693 power *= default_scale_smt_power(sd, cpu);
3694
3695 power >>= SCHED_LOAD_SHIFT;
3696 }
3697
3698 power *= scale_rt_power(cpu);
3699 power >>= SCHED_LOAD_SHIFT;
3700
3701 if (!power)
3702 power = 1;
3703
3704 sdg->cpu_power = power;
3705}
3706
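
For reference while reading the removed balancer (this logic moves to sched_fair.c in this release rather than disappearing), update_cpu_power() composes several fixed-point factors, each scaled by SCHED_LOAD_SCALE = 1024. A user-space sketch with hypothetical inputs follows: an SMT pair with an assumed smt_gain of 1178 and roughly 10% of the averaging period spent on RT work; the figures are illustrative only.

    /*
     * User-space sketch with hypothetical inputs, not from this patch:
     * the fixed-point composition done by update_cpu_power().
     */
    #include <stdio.h>

    #define SCHED_LOAD_SHIFT 10
    #define SCHED_LOAD_SCALE (1UL << SCHED_LOAD_SHIFT)

    int main(void)
    {
        unsigned long power = SCHED_LOAD_SCALE;

        /* frequency scaling: the default hook returns SCHED_LOAD_SCALE */
        power = (power * 1024) >> SCHED_LOAD_SHIFT;        /* 1024 */

        /* SMT: assume smt_gain = 1178 shared by 2 hardware threads */
        power = (power * (1178 / 2)) >> SCHED_LOAD_SHIFT;  /* 589  */

        /* RT pressure: assume scale_rt_power() reports ~90% time left,
         * i.e. roughly 0.9 * 1024 = 921 */
        power = (power * 921) >> SCHED_LOAD_SHIFT;         /* 529  */

        if (!power)                                        /* never report 0 */
            power = 1;

        printf("cpu_power = %lu of %lu\n", power, SCHED_LOAD_SCALE);
        return 0;
    }
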
3707static void update_group_power(struct sched_domain *sd, int cpu)
3708{
3709 struct sched_domain *child = sd->child;
3710 struct sched_group *group, *sdg = sd->groups;
3711 unsigned long power;
3712
3713 if (!child) {
3714 update_cpu_power(sd, cpu);
3715 return;
3716 }
3717
3718 power = 0;
3719
3720 group = child->groups;
3721 do {
3722 power += group->cpu_power;
3723 group = group->next;
3724 } while (group != child->groups);
3725
3726 sdg->cpu_power = power;
3727}
3728
3729/**
3730 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3731 * @sd: The sched_domain whose statistics are to be updated.
3732 * @group: sched_group whose statistics are to be updated.
3733 * @this_cpu: Cpu for which load balance is currently performed.
3734 * @idle: Idle status of this_cpu
3735 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3736 * @sd_idle: Idle status of the sched_domain containing group.
3737 * @local_group: Does group contain this_cpu.
3738 * @cpus: Set of cpus considered for load balancing.
3739 * @balance: Should we balance.
3740 * @sgs: variable to hold the statistics for this group.
3741 */
3742static inline void update_sg_lb_stats(struct sched_domain *sd,
3743 struct sched_group *group, int this_cpu,
3744 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3745 int local_group, const struct cpumask *cpus,
3746 int *balance, struct sg_lb_stats *sgs)
3747{
3748 unsigned long load, max_cpu_load, min_cpu_load;
3749 int i;
3750 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3751 unsigned long sum_avg_load_per_task;
3752 unsigned long avg_load_per_task;
3753
3754 if (local_group) {
3755 balance_cpu = group_first_cpu(group);
3756 if (balance_cpu == this_cpu)
3757 update_group_power(sd, this_cpu);
3758 }
3759
3760 /* Tally up the load of all CPUs in the group */
3761 sum_avg_load_per_task = avg_load_per_task = 0;
3762 max_cpu_load = 0;
3763 min_cpu_load = ~0UL;
3764
3765 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3766 struct rq *rq = cpu_rq(i);
3767
3768 if (*sd_idle && rq->nr_running)
3769 *sd_idle = 0;
3770
3771 /* Bias balancing toward cpus of our domain */
3772 if (local_group) {
3773 if (idle_cpu(i) && !first_idle_cpu) {
3774 first_idle_cpu = 1;
3775 balance_cpu = i;
3776 }
3777
3778 load = target_load(i, load_idx);
3779 } else {
3780 load = source_load(i, load_idx);
3781 if (load > max_cpu_load)
3782 max_cpu_load = load;
3783 if (min_cpu_load > load)
3784 min_cpu_load = load;
3785 }
3786
3787 sgs->group_load += load;
3788 sgs->sum_nr_running += rq->nr_running;
3789 sgs->sum_weighted_load += weighted_cpuload(i);
3790
3791 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3792 }
3793
3794 /*
3795 * First idle cpu or the first cpu(busiest) in this sched group
3796 * is eligible for doing load balancing at this and above
3797 * domains. In the newly idle case, we will allow all the cpu's
3798 * to do the newly idle load balance.
3799 */
3800 if (idle != CPU_NEWLY_IDLE && local_group &&
3801 balance_cpu != this_cpu && balance) {
3802 *balance = 0;
3803 return;
3804 }
3805
3806 /* Adjust by relative CPU power of the group */
3807 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
3808
3809
3810 /*
3811 * Consider the group unbalanced when the imbalance is larger
3812 * than the average weight of two tasks.
3813 *
3814 * APZ: with cgroup the avg task weight can vary wildly and
3815 * might not be a suitable number - should we keep a
3816 * normalized nr_running number somewhere that negates
3817 * the hierarchy?
3818 */
3819 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
3820 group->cpu_power;
3821
3822 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3823 sgs->group_imb = 1;
3824
3825 sgs->group_capacity =
3826 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
3827}
3828
3829/**
3830 * update_sd_lb_stats - Update sched_group's statistics for load balancing.
3831 * @sd: sched_domain whose statistics are to be updated.
3832 * @this_cpu: Cpu for which load balance is currently performed.
3833 * @idle: Idle status of this_cpu
3834 * @sd_idle: Idle status of the sched_domain containing group.
3835 * @cpus: Set of cpus considered for load balancing.
3836 * @balance: Should we balance.
3837 * @sds: variable to hold the statistics for this sched_domain.
3838 */
3839static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3840 enum cpu_idle_type idle, int *sd_idle,
3841 const struct cpumask *cpus, int *balance,
3842 struct sd_lb_stats *sds)
3843{
3844 struct sched_domain *child = sd->child;
3845 struct sched_group *group = sd->groups;
3846 struct sg_lb_stats sgs;
3847 int load_idx, prefer_sibling = 0;
3848
3849 if (child && child->flags & SD_PREFER_SIBLING)
3850 prefer_sibling = 1;
3851
3852 init_sd_power_savings_stats(sd, sds, idle);
3853 load_idx = get_sd_load_idx(sd, idle);
3854
3855 do {
3856 int local_group;
3857
3858 local_group = cpumask_test_cpu(this_cpu,
3859 sched_group_cpus(group));
3860 memset(&sgs, 0, sizeof(sgs));
3861 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
3862 local_group, cpus, balance, &sgs);
3863
3864 if (local_group && balance && !(*balance))
3865 return;
3866
3867 sds->total_load += sgs.group_load;
3868 sds->total_pwr += group->cpu_power;
3869
3870 /*
3871 * In case the child domain prefers tasks go to siblings
3872 * first, lower the group capacity to one so that we'll try
3873 * and move all the excess tasks away.
3874 */
3875 if (prefer_sibling)
3876 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3877
3878 if (local_group) {
3879 sds->this_load = sgs.avg_load;
3880 sds->this = group;
3881 sds->this_nr_running = sgs.sum_nr_running;
3882 sds->this_load_per_task = sgs.sum_weighted_load;
3883 } else if (sgs.avg_load > sds->max_load &&
3884 (sgs.sum_nr_running > sgs.group_capacity ||
3885 sgs.group_imb)) {
3886 sds->max_load = sgs.avg_load;
3887 sds->busiest = group;
3888 sds->busiest_nr_running = sgs.sum_nr_running;
3889 sds->busiest_load_per_task = sgs.sum_weighted_load;
3890 sds->group_imb = sgs.group_imb;
3891 }
3892
3893 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3894 group = group->next;
3895 } while (group != sd->groups);
3896}
3897
3898/**
3899 * fix_small_imbalance - Calculate the minor imbalance that exists
3900 * amongst the groups of a sched_domain, during
3901 * load balancing.
3902 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3903 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3904 * @imbalance: Variable to store the imbalance.
3905 */
3906static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3907 int this_cpu, unsigned long *imbalance)
3908{
3909 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3910 unsigned int imbn = 2;
3911
3912 if (sds->this_nr_running) {
3913 sds->this_load_per_task /= sds->this_nr_running;
3914 if (sds->busiest_load_per_task >
3915 sds->this_load_per_task)
3916 imbn = 1;
3917 } else
3918 sds->this_load_per_task =
3919 cpu_avg_load_per_task(this_cpu);
3920
3921 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
3922 sds->busiest_load_per_task * imbn) {
3923 *imbalance = sds->busiest_load_per_task;
3924 return;
3925 }
3926
3927 /*
3928 * OK, we don't have enough imbalance to justify moving tasks,
3929 * however we may be able to increase total CPU power used by
3930 * moving them.
3931 */
3932
3933 pwr_now += sds->busiest->cpu_power *
3934 min(sds->busiest_load_per_task, sds->max_load);
3935 pwr_now += sds->this->cpu_power *
3936 min(sds->this_load_per_task, sds->this_load);
3937 pwr_now /= SCHED_LOAD_SCALE;
3938
3939 /* Amount of load we'd subtract */
3940 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3941 sds->busiest->cpu_power;
3942 if (sds->max_load > tmp)
3943 pwr_move += sds->busiest->cpu_power *
3944 min(sds->busiest_load_per_task, sds->max_load - tmp);
3945
3946 /* Amount of load we'd add */
3947 if (sds->max_load * sds->busiest->cpu_power <
3948 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3949 tmp = (sds->max_load * sds->busiest->cpu_power) /
3950 sds->this->cpu_power;
3951 else
3952 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3953 sds->this->cpu_power;
3954 pwr_move += sds->this->cpu_power *
3955 min(sds->this_load_per_task, sds->this_load + tmp);
3956 pwr_move /= SCHED_LOAD_SCALE;
3957
3958 /* Move if we gain throughput */
3959 if (pwr_move > pwr_now)
3960 *imbalance = sds->busiest_load_per_task;
3961}
3962
3963/**
3964 * calculate_imbalance - Calculate the amount of imbalance present within the
3965 * groups of a given sched_domain during load balance.
3966 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
3967 * @this_cpu: Cpu for which currently load balance is being performed.
3968 * @imbalance: The variable to store the imbalance.
3969 */
3970static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3971 unsigned long *imbalance)
3972{
3973 unsigned long max_pull;
3974 /*
3975 * In the presence of smp nice balancing, certain scenarios can have
3976 * max load less than avg load(as we skip the groups at or below
3977 * its cpu_power, while calculating max_load..)
3978 */
3979 if (sds->max_load < sds->avg_load) {
3980 *imbalance = 0;
3981 return fix_small_imbalance(sds, this_cpu, imbalance);
3982 }
3983
3984 /* Don't want to pull so many tasks that a group would go idle */
3985 max_pull = min(sds->max_load - sds->avg_load,
3986 sds->max_load - sds->busiest_load_per_task);
3987
3988 /* How much load to actually move to equalise the imbalance */
3989 *imbalance = min(max_pull * sds->busiest->cpu_power,
3990 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
3991 / SCHED_LOAD_SCALE;
3992
3993 /*
3994 * if *imbalance is less than the average load per runnable task
3995 * there is no gaurantee that any tasks will be moved so we'll have
3996 * a think about bumping its value to force at least one task to be
3997 * moved
3998 */
3999 if (*imbalance < sds->busiest_load_per_task)
4000 return fix_small_imbalance(sds, this_cpu, imbalance);
4001
4002}
4003/******* find_busiest_group() helpers end here *********************/
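
calculate_imbalance() above caps the pull at whatever keeps the busiest group at or above the domain average and above one task's worth of load, then moves the smaller of the two group-relative gaps, converted back to load units. A user-space sketch with hypothetical load figures (all values in SCHED_LOAD_SCALE units):

    /*
     * User-space sketch with hypothetical load figures, not from this patch:
     * the core arithmetic of calculate_imbalance().
     */
    #include <stdio.h>

    #define SCHED_LOAD_SCALE 1024UL

    static unsigned long min_ul(unsigned long a, unsigned long b)
    {
        return a < b ? a : b;
    }

    int main(void)
    {
        unsigned long max_load = 2048;     /* busiest group's average load */
        unsigned long avg_load = 1536;     /* whole-domain average load    */
        unsigned long this_load = 1024;    /* local group's average load   */
        unsigned long busiest_load_per_task = 512;
        unsigned long busiest_power = 1024, this_power = 1024;

        /* don't pull the busiest group below the domain average, nor below
         * one task's worth of load */
        unsigned long max_pull = min_ul(max_load - avg_load,
                                        max_load - busiest_load_per_task);

        /* move the smaller of "what busiest can give" and "what we can take" */
        unsigned long imbalance = min_ul(max_pull * busiest_power,
                                         (avg_load - this_load) * this_power)
                                  / SCHED_LOAD_SCALE;

        printf("max_pull = %lu, imbalance = %lu\n", max_pull, imbalance);
        /* prints: max_pull = 512, imbalance = 512; since that is not below
         * busiest_load_per_task, fix_small_imbalance() would not be invoked */
        return 0;
    }
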
4004
4005/**
4006 * find_busiest_group - Returns the busiest group within the sched_domain
4007 * if there is an imbalance. If there isn't an imbalance, and
4008 * the user has opted for power-savings, it returns a group whose
4009 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
4010 * such a group exists.
4011 *
4012 * Also calculates the amount of weighted load which should be moved
4013 * to restore balance.
4014 *
4015 * @sd: The sched_domain whose busiest group is to be returned.
4016 * @this_cpu: The cpu for which load balancing is currently being performed.
4017 * @imbalance: Variable which stores amount of weighted load which should
4018 * be moved to restore balance/put a group to idle.
4019 * @idle: The idle status of this_cpu.
4020 * @sd_idle: The idleness of sd
4021 * @cpus: The set of CPUs under consideration for load-balancing.
4022 * @balance: Pointer to a variable indicating if this_cpu
4023 * is the appropriate cpu to perform load balancing at this_level.
4024 *
4025 * Returns: - the busiest group if imbalance exists.
4026 * - If no imbalance and user has opted for power-savings balance,
4027 * return the least loaded group whose CPUs can be
4028 * put to idle by rebalancing its tasks onto our group.
4029 */
4030static struct sched_group *
4031find_busiest_group(struct sched_domain *sd, int this_cpu,
4032 unsigned long *imbalance, enum cpu_idle_type idle,
4033 int *sd_idle, const struct cpumask *cpus, int *balance)
4034{
4035 struct sd_lb_stats sds;
4036
4037 memset(&sds, 0, sizeof(sds));
4038
4039 /*
4040 * Compute the various statistics relavent for load balancing at
4041 * this level.
4042 */
4043 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
4044 balance, &sds);
4045
4046 /* Cases where imbalance does not exist from POV of this_cpu */
4047 /* 1) this_cpu is not the appropriate cpu to perform load balancing
4048 * at this level.
4049 * 2) There is no busy sibling group to pull from.
4050 * 3) This group is the busiest group.
4051 * 4) This group is more busy than the avg busieness at this
4052 * sched_domain.
4053 * 5) The imbalance is within the specified limit.
4054 * 6) Any rebalance would lead to ping-pong
4055 */
4056 if (balance && !(*balance))
4057 goto ret;
4058
4059 if (!sds.busiest || sds.busiest_nr_running == 0)
4060 goto out_balanced;
4061
4062 if (sds.this_load >= sds.max_load)
4063 goto out_balanced;
4064
4065 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
4066
4067 if (sds.this_load >= sds.avg_load)
4068 goto out_balanced;
4069
4070 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
4071 goto out_balanced;
4072
4073 sds.busiest_load_per_task /= sds.busiest_nr_running;
4074 if (sds.group_imb)
4075 sds.busiest_load_per_task =
4076 min(sds.busiest_load_per_task, sds.avg_load);
4077
4078 /*
4079 * We're trying to get all the cpus to the average_load, so we don't
4080 * want to push ourselves above the average load, nor do we wish to
4081 * reduce the max loaded cpu below the average load, as either of these
4082 * actions would just result in more rebalancing later, and ping-pong
4083 * tasks around. Thus we look for the minimum possible imbalance.
4084 * Negative imbalances (*we* are more loaded than anyone else) will
4085 * be counted as no imbalance for these purposes -- we can't fix that
4086 * by pulling tasks to us. Be careful of negative numbers as they'll
4087 * appear as very large values with unsigned longs.
4088 */
4089 if (sds.max_load <= sds.busiest_load_per_task)
4090 goto out_balanced;
4091
4092 /* Looks like there is an imbalance. Compute it */
4093 calculate_imbalance(&sds, this_cpu, imbalance);
4094 return sds.busiest;
4095
4096out_balanced:
4097 /*
4098 * There is no obvious imbalance. But check if we can do some balancing
4099 * to save power.
4100 */
4101 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
4102 return sds.busiest;
4103ret:
4104 *imbalance = 0;
4105 return NULL;
4106}
4107
4108/*
4109 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4110 */
4111static struct rq *
4112find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
4113 unsigned long imbalance, const struct cpumask *cpus)
4114{
4115 struct rq *busiest = NULL, *rq;
4116 unsigned long max_load = 0;
4117 int i;
4118
4119 for_each_cpu(i, sched_group_cpus(group)) {
4120 unsigned long power = power_of(i);
4121 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
4122 unsigned long wl;
4123
4124 if (!cpumask_test_cpu(i, cpus))
4125 continue;
4126
4127 rq = cpu_rq(i);
4128 wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
4129 wl /= power;
4130
4131 if (capacity && rq->nr_running == 1 && wl > imbalance)
4132 continue;
4133
4134 if (wl > max_load) {
4135 max_load = wl;
4136 busiest = rq;
4137 }
4138 }
4139
4140 return busiest;
4141}
4142
4143/*
4144 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
4145 * so long as it is large enough.
4146 */
4147#define MAX_PINNED_INTERVAL 512
4148
4149/* Working cpumask for load_balance and load_balance_newidle. */
4150static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4151
4152/*
4153 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4154 * tasks if there is an imbalance.
4155 */
4156static int load_balance(int this_cpu, struct rq *this_rq,
4157 struct sched_domain *sd, enum cpu_idle_type idle,
4158 int *balance)
4159{
4160 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
4161 struct sched_group *group;
4162 unsigned long imbalance;
4163 struct rq *busiest;
4164 unsigned long flags;
4165 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4166
4167 cpumask_copy(cpus, cpu_active_mask);
4168
4169 /*
4170 * When power savings policy is enabled for the parent domain, idle
4171 * sibling can pick up load irrespective of busy siblings. In this case,
4172 * let the state of idle sibling percolate up as CPU_IDLE, instead of
4173 * portraying it as CPU_NOT_IDLE.
4174 */
4175 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
4176 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4177 sd_idle = 1;
4178
4179 schedstat_inc(sd, lb_count[idle]);
4180
4181redo:
4182 update_shares(sd);
4183 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
4184 cpus, balance);
4185
4186 if (*balance == 0)
4187 goto out_balanced;
4188
4189 if (!group) {
4190 schedstat_inc(sd, lb_nobusyg[idle]);
4191 goto out_balanced;
4192 }
4193
4194 busiest = find_busiest_queue(group, idle, imbalance, cpus);
4195 if (!busiest) {
4196 schedstat_inc(sd, lb_nobusyq[idle]);
4197 goto out_balanced;
4198 }
4199
4200 BUG_ON(busiest == this_rq);
4201
4202 schedstat_add(sd, lb_imbalance[idle], imbalance);
4203
4204 ld_moved = 0;
4205 if (busiest->nr_running > 1) {
4206 /*
4207 * Attempt to move tasks. If find_busiest_group has found
4208 * an imbalance but busiest->nr_running <= 1, the group is
4209 * still unbalanced. ld_moved simply stays zero, so it is
4210 * correctly treated as an imbalance.
4211 */
4212 local_irq_save(flags);
4213 double_rq_lock(this_rq, busiest);
4214 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4215 imbalance, sd, idle, &all_pinned);
4216 double_rq_unlock(this_rq, busiest);
4217 local_irq_restore(flags);
4218
4219 /*
4220 * some other cpu did the load balance for us.
4221 */
4222 if (ld_moved && this_cpu != smp_processor_id())
4223 resched_cpu(this_cpu);
4224
4225 /* All tasks on this runqueue were pinned by CPU affinity */
4226 if (unlikely(all_pinned)) {
4227 cpumask_clear_cpu(cpu_of(busiest), cpus);
4228 if (!cpumask_empty(cpus))
4229 goto redo;
4230 goto out_balanced;
4231 }
4232 }
4233
4234 if (!ld_moved) {
4235 schedstat_inc(sd, lb_failed[idle]);
4236 sd->nr_balance_failed++;
4237
4238 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
4239
4240 raw_spin_lock_irqsave(&busiest->lock, flags);
4241
4242 /* don't kick the migration_thread, if the curr
4243 * task on busiest cpu can't be moved to this_cpu
4244 */
4245 if (!cpumask_test_cpu(this_cpu,
4246 &busiest->curr->cpus_allowed)) {
4247 raw_spin_unlock_irqrestore(&busiest->lock,
4248 flags);
4249 all_pinned = 1;
4250 goto out_one_pinned;
4251 }
4252
4253 if (!busiest->active_balance) {
4254 busiest->active_balance = 1;
4255 busiest->push_cpu = this_cpu;
4256 active_balance = 1;
4257 }
4258 raw_spin_unlock_irqrestore(&busiest->lock, flags);
4259 if (active_balance)
4260 wake_up_process(busiest->migration_thread);
4261
4262 /*
4263 * We've kicked active balancing, reset the failure
4264 * counter.
4265 */
4266 sd->nr_balance_failed = sd->cache_nice_tries+1;
4267 }
4268 } else
4269 sd->nr_balance_failed = 0;
4270
4271 if (likely(!active_balance)) {
4272 /* We were unbalanced, so reset the balancing interval */
4273 sd->balance_interval = sd->min_interval;
4274 } else {
4275 /*
4276 * If we've begun active balancing, start to back off. This
4277 * case may not be covered by the all_pinned logic if there
4278 * is only 1 task on the busy runqueue (because we don't call
4279 * move_tasks).
4280 */
4281 if (sd->balance_interval < sd->max_interval)
4282 sd->balance_interval *= 2;
4283 }
4284
4285 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4286 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4287 ld_moved = -1;
4288
4289 goto out;
4290
4291out_balanced:
4292 schedstat_inc(sd, lb_balanced[idle]);
4293
4294 sd->nr_balance_failed = 0;
4295
4296out_one_pinned:
4297 /* tune up the balancing interval */
4298 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
4299 (sd->balance_interval < sd->max_interval))
4300 sd->balance_interval *= 2;
4301
4302 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4303 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4304 ld_moved = -1;
4305 else
4306 ld_moved = 0;
4307out:
4308 if (ld_moved)
4309 update_shares(sd);
4310 return ld_moved;
4311}
4312
4313/*
4314 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4315 * tasks if there is an imbalance.
4316 *
4317 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
4318 * this_rq is locked.
4319 */
4320static int
4321load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4322{
4323 struct sched_group *group;
4324 struct rq *busiest = NULL;
4325 unsigned long imbalance;
4326 int ld_moved = 0;
4327 int sd_idle = 0;
4328 int all_pinned = 0;
4329 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4330
4331 cpumask_copy(cpus, cpu_active_mask);
4332
4333 /*
4334 * When the power savings policy is enabled for the parent domain, an idle
4335 * sibling can pick up load irrespective of the busy siblings. In this case,
4336 * let the state of the idle sibling percolate up as IDLE, instead of
4337 * portraying it as CPU_NOT_IDLE.
4338 */
4339 if (sd->flags & SD_SHARE_CPUPOWER &&
4340 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4341 sd_idle = 1;
4342
4343 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
4344redo:
4345 update_shares_locked(this_rq, sd);
4346 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
4347 &sd_idle, cpus, NULL);
4348 if (!group) {
4349 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
4350 goto out_balanced;
4351 }
4352
4353 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
4354 if (!busiest) {
4355 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
4356 goto out_balanced;
4357 }
4358
4359 BUG_ON(busiest == this_rq);
4360
4361 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
4362
4363 ld_moved = 0;
4364 if (busiest->nr_running > 1) {
4365 /* Attempt to move tasks */
4366 double_lock_balance(this_rq, busiest);
4367 /* this_rq->clock is already updated */
4368 update_rq_clock(busiest);
4369 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4370 imbalance, sd, CPU_NEWLY_IDLE,
4371 &all_pinned);
4372 double_unlock_balance(this_rq, busiest);
4373
4374 if (unlikely(all_pinned)) {
4375 cpumask_clear_cpu(cpu_of(busiest), cpus);
4376 if (!cpumask_empty(cpus))
4377 goto redo;
4378 }
4379 }
4380
4381 if (!ld_moved) {
4382 int active_balance = 0;
4383
4384 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
4385 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4386 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4387 return -1;
4388
4389 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4390 return -1;
4391
4392 if (sd->nr_balance_failed++ < 2)
4393 return -1;
4394
4395 /*
4396 * The only task running on a non-idle cpu can be moved to this
4397 * cpu in an attempt to completely free up the other CPU
4398 * package. The same method used to move tasks in load_balance()
4399 * has been extended for load_balance_newidle() to speed up
4400 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2).
4401 *
4402 * The package power saving logic comes from
4403 * find_busiest_group(). If there is no imbalance, then
4404 * f_b_g() will return NULL. However, when sched_mc={1,2},
4405 * f_b_g() will select a group from which a running task may be
4406 * pulled to this cpu in order to make the other package idle.
4407 * If there is no opportunity to make a package idle and
4408 * there is no imbalance, then f_b_g() will return NULL and no
4409 * action will be taken in load_balance_newidle().
4410 *
4411 * Under normal task pull operation due to imbalance, there
4412 * will be more than one task in the source run queue and
4413 * move_tasks() will succeed. ld_moved will be true and this
4414 * active balance code will not be triggered.
4415 */
4416
4417 /* Lock busiest in correct order while this_rq is held */
4418 double_lock_balance(this_rq, busiest);
4419
4420 /*
4421 * Don't kick the migration_thread if the curr
4422 * task on busiest cpu can't be moved to this_cpu
4423 */
4424 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
4425 double_unlock_balance(this_rq, busiest);
4426 all_pinned = 1;
4427 return ld_moved;
4428 }
4429
4430 if (!busiest->active_balance) {
4431 busiest->active_balance = 1;
4432 busiest->push_cpu = this_cpu;
4433 active_balance = 1;
4434 }
4435
4436 double_unlock_balance(this_rq, busiest);
4437 /*
4438 * Should not call ttwu while holding a rq->lock
4439 */
4440 raw_spin_unlock(&this_rq->lock);
4441 if (active_balance)
4442 wake_up_process(busiest->migration_thread);
4443 raw_spin_lock(&this_rq->lock);
4444
4445 } else
4446 sd->nr_balance_failed = 0;
4447
4448 update_shares_locked(this_rq, sd);
4449 return ld_moved;
4450
4451out_balanced:
4452 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
4453 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4454 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4455 return -1;
4456 sd->nr_balance_failed = 0;
4457
4458 return 0;
4459}
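
One detail worth noting in load_balance_newidle() is that this_rq->lock is dropped around wake_up_process(), because waking a task may need to take other runqueue locks. The sketch below shows the same discipline (release your own lock before calling something that acquires other locks, then re-take it) with pthreads; it is single-threaded on purpose, and lock_a, lock_b and wake_helper are invented names, not kernel APIs.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;

/* Plays the role of the wakeup: it acquires another lock internally,
 * so the caller must not hold lock_a across the call (mirroring
 * "should not call ttwu while holding a rq->lock"). */
static void wake_helper(void)
{
        pthread_mutex_lock(&lock_b);
        puts("helper woken");
        pthread_mutex_unlock(&lock_b);
}

static void balance_step(void)
{
        int need_wakeup;

        pthread_mutex_lock(&lock_a);
        need_wakeup = 1;                /* decided under lock_a */
        pthread_mutex_unlock(&lock_a);  /* drop it, as the code above drops this_rq->lock */

        if (need_wakeup)
                wake_helper();          /* safe: no lock held that wake_helper also wants */

        pthread_mutex_lock(&lock_a);    /* re-take to continue, like the raw_spin_lock() */
        pthread_mutex_unlock(&lock_a);
}

int main(void)
{
        balance_step();
        return 0;
}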
4460
4461/*
4462 * idle_balance is called by schedule() if this_cpu is about to become
4463 * idle. Attempts to pull tasks from other CPUs.
4464 */
4465static void idle_balance(int this_cpu, struct rq *this_rq)
4466{
4467 struct sched_domain *sd;
4468 int pulled_task = 0;
4469 unsigned long next_balance = jiffies + HZ;
4470
4471 this_rq->idle_stamp = this_rq->clock;
4472
4473 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4474 return;
4475
4476 for_each_domain(this_cpu, sd) {
4477 unsigned long interval;
4478
4479 if (!(sd->flags & SD_LOAD_BALANCE))
4480 continue;
4481
4482 if (sd->flags & SD_BALANCE_NEWIDLE)
4483 /* If we've pulled tasks over, stop searching: */
4484 pulled_task = load_balance_newidle(this_cpu, this_rq,
4485 sd);
4486
4487 interval = msecs_to_jiffies(sd->balance_interval);
4488 if (time_after(next_balance, sd->last_balance + interval))
4489 next_balance = sd->last_balance + interval;
4490 if (pulled_task) {
4491 this_rq->idle_stamp = 0;
4492 break;
4493 }
4494 }
4495 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4496 /*
4497 * We are going idle. next_balance may be set based on
4498 * a busy processor. So reset next_balance.
4499 */
4500 this_rq->next_balance = next_balance;
4501 }
4502}
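
idle_balance() bails out early when this_rq->avg_idle is below sysctl_sched_migration_cost: if the CPU typically stays idle for less time than a migration costs, pulling tasks on entry to idle is a net loss. A rough userspace model of that gate follows; the averaging formula and the 0.5 ms threshold are illustrative assumptions, not the kernel's actual arithmetic.

#include <stdio.h>

/* Hypothetical stand-in for the rq->avg_idle bookkeeping: a decaying
 * average of how long this CPU stays idle, checked against a migration
 * cost threshold before any new-idle balancing is attempted. */
struct cpu_idle_stats {
        unsigned long long avg_idle_ns;
};

#define MIGRATION_COST_NS 500000ULL     /* assumed threshold, about 0.5 ms */

/* Fold one measured idle period into the running average.  A simple
 * half/half blend is used here as a rough model only. */
static void account_idle_period(struct cpu_idle_stats *s,
                                unsigned long long idle_ns)
{
        s->avg_idle_ns = (s->avg_idle_ns + idle_ns) / 2;
}

/* The gate itself: balancing on entry to idle only pays off when the
 * CPU usually stays idle longer than a migration costs. */
static int worth_newidle_balance(const struct cpu_idle_stats *s)
{
        return s->avg_idle_ns >= MIGRATION_COST_NS;
}

int main(void)
{
        struct cpu_idle_stats s = { 0 };

        account_idle_period(&s, 100000);        /* short naps... */
        account_idle_period(&s, 120000);
        printf("balance? %d\n", worth_newidle_balance(&s));
        account_idle_period(&s, 5000000);       /* ...then a long idle stretch */
        printf("balance? %d\n", worth_newidle_balance(&s));
        return 0;
}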
4503
4504/*
4505 * active_load_balance is run by migration threads. It pushes running tasks
4506 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
4507 * running on each physical CPU where possible, and avoids physical /
4508 * logical imbalances.
4509 *
4510 * Called with busiest_rq locked.
4511 */
4512static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4513{
4514 int target_cpu = busiest_rq->push_cpu;
4515 struct sched_domain *sd;
4516 struct rq *target_rq;
4517
4518 /* Is there any task to move? */
4519 if (busiest_rq->nr_running <= 1)
4520 return;
4521
4522 target_rq = cpu_rq(target_cpu);
4523
4524 /*
4525 * This condition is "impossible"; if it occurs
4526 * we need to fix it. Originally reported by
4527 * Bjorn Helgaas on a 128-cpu setup.
4528 */
4529 BUG_ON(busiest_rq == target_rq);
4530
4531 /* move a task from busiest_rq to target_rq */
4532 double_lock_balance(busiest_rq, target_rq);
4533 update_rq_clock(busiest_rq);
4534 update_rq_clock(target_rq);
4535
4536 /* Search for an sd spanning us and the target CPU. */
4537 for_each_domain(target_cpu, sd) {
4538 if ((sd->flags & SD_LOAD_BALANCE) &&
4539 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
4540 break;
4541 }
4542
4543 if (likely(sd)) {
4544 schedstat_inc(sd, alb_count);
4545
4546 if (move_one_task(target_rq, target_cpu, busiest_rq,
4547 sd, CPU_IDLE))
4548 schedstat_inc(sd, alb_pushed);
4549 else
4550 schedstat_inc(sd, alb_failed);
4551 }
4552 double_unlock_balance(busiest_rq, target_rq);
4553}
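
active_load_balance() walks target_cpu's sched_domain hierarchy from the lowest level upward until it finds a level whose span also covers busiest_cpu. Below is a stand-alone model of that search, with domains as a parent-linked list and spans as 64-bit masks; the SD_LOAD_BALANCE flag check is omitted and every name is invented for illustration.

#include <stdint.h>
#include <stdio.h>

/* Invented model: domains as a parent-linked list, spans as bitmasks. */
struct domain {
        uint64_t span;                  /* CPUs covered at this level */
        struct domain *parent;          /* next, wider level; NULL at the top */
};

/* Walk upward from the lowest level and return the first domain whose
 * span contains the given CPU. */
static struct domain *span_containing(struct domain *sd, int cpu)
{
        for (; sd; sd = sd->parent)
                if (sd->span & (1ULL << cpu))
                        return sd;
        return NULL;
}

int main(void)
{
        struct domain node = { 0xff, NULL };            /* CPUs 0-7 */
        struct domain core = { 0x03, &node };           /* CPUs 0-1 */
        struct domain *sd = span_containing(&core, 5);

        printf("found span 0x%llx\n",
               sd ? (unsigned long long)sd->span : 0ULL);
        return 0;
}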
4554
4555#ifdef CONFIG_NO_HZ
4556static struct {
4557 atomic_t load_balancer;
4558 cpumask_var_t cpu_mask;
4559 cpumask_var_t ilb_grp_nohz_mask;
4560} nohz ____cacheline_aligned = {
4561 .load_balancer = ATOMIC_INIT(-1),
4562};
4563
4564int get_nohz_load_balancer(void)
4565{
4566 return atomic_read(&nohz.load_balancer);
4567}
4568
4569#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4570/**
4571 * lowest_flag_domain - Return lowest sched_domain containing flag.
4572 * @cpu: The cpu whose lowest level of sched domain is to
4573 * be returned.
4574 * @flag: The flag to check for the lowest sched_domain
4575 * for the given cpu.
4576 *
4577 * Returns the lowest sched_domain of a cpu which contains the given flag.
4578 */
4579static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4580{
4581 struct sched_domain *sd;
4582
4583 for_each_domain(cpu, sd)
4584 if (sd && (sd->flags & flag))
4585 break;
4586
4587 return sd;
4588}
4589
4590/**
4591 * for_each_flag_domain - Iterates over sched_domains containing the flag.
4592 * @cpu: The cpu whose domains we're iterating over.
4593 * @sd: variable holding the value of the power_savings_sd
4594 * for cpu.
4595 * @flag: The flag to filter the sched_domains to be iterated.
4596 *
4597 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
4598 * set, starting from the lowest sched_domain to the highest.
4599 */
4600#define for_each_flag_domain(cpu, sd, flag) \
4601 for (sd = lowest_flag_domain(cpu, flag); \
4602 (sd && (sd->flags & flag)); sd = sd->parent)
4603
4604/**
4605 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4606 * @ilb_group: group to be checked for semi-idleness
4607 *
4608 * Returns: 1 if the group is semi-idle. 0 otherwise.
4609 *
4610 * We define a sched_group to be semi-idle if it has at least one idle CPU
4611 * and at least one non-idle CPU. This helper function checks if the given
4612 * sched_group is semi-idle or not.
4613 */
4614static inline int is_semi_idle_group(struct sched_group *ilb_group)
4615{
4616 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
4617 sched_group_cpus(ilb_group));
4618
4619 /*
4620 * A sched_group is semi-idle when it has at least one busy cpu
4621 * and at least one idle cpu.
4622 */
4623 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
4624 return 0;
4625
4626 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
4627 return 0;
4628
4629 return 1;
4630}
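
The cpumask test in is_semi_idle_group() reduces to two checks on the intersection of the group's CPUs with the nohz-idle set: the group is rejected if the intersection is empty (no idle CPU) or equal to the whole group (no busy CPU). The same logic with cpumasks shrunk to a single 64-bit word, as a self-contained sketch:

#include <stdint.h>
#include <stdio.h>

/* Userspace model of the test above, with cpumasks reduced to one
 * 64-bit word: a group is semi-idle iff the set of nohz-idle CPUs
 * restricted to the group is neither empty nor the whole group. */
static int is_semi_idle(uint64_t group_cpus, uint64_t nohz_idle_cpus)
{
        uint64_t idle_in_group = group_cpus & nohz_idle_cpus;

        if (idle_in_group == 0)                 /* nobody idle: fully busy */
                return 0;
        if (idle_in_group == group_cpus)        /* everybody idle: fully idle */
                return 0;
        return 1;                               /* some idle, some busy */
}

int main(void)
{
        uint64_t group = 0x0f;                          /* CPUs 0-3 */

        printf("%d\n", is_semi_idle(group, 0x00));      /* 0: all busy */
        printf("%d\n", is_semi_idle(group, 0x0f));      /* 0: all idle */
        printf("%d\n", is_semi_idle(group, 0x06));      /* 1: semi-idle */
        return 0;
}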
4631/**
4632 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4633 * @cpu: The cpu which is nominating a new idle_load_balancer.
4634 *
4635 * Returns: The id of the idle load balancer if it exists;
4636 * else, returns >= nr_cpu_ids.
4637 *
4638 * This algorithm picks the idle load balancer such that it belongs to a
4639 * semi-idle powersavings sched_domain. The idea is to try to avoid
4640 * completely idle packages/cores just for the purpose of idle load balancing
4641 * when there are other idle CPUs which are better suited for that job.
4642 */
4643static int find_new_ilb(int cpu)
4644{
4645 struct sched_domain *sd;
4646 struct sched_group *ilb_group;
4647
4648 /*
4649 * Pick the idle load balancer from semi-idle packages only
4650 * when power-aware load balancing is enabled.
4651 */
4652 if (!(sched_smt_power_savings || sched_mc_power_savings))
4653 goto out_done;
4654
4655 /*
4656 * Optimize for the case when we have no idle CPUs or only one
4657 * idle CPU. Don't walk the sched_domain hierarchy in such cases.
4658 */
4659 if (cpumask_weight(nohz.cpu_mask) < 2)
4660 goto out_done;
4661
4662 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4663 ilb_group = sd->groups;
4664
4665 do {
4666 if (is_semi_idle_group(ilb_group))
4667 return cpumask_first(nohz.ilb_grp_nohz_mask);
4668
4669 ilb_group = ilb_group->next;
4670
4671 } while (ilb_group != sd->groups);
4672 }
4673
4674out_done:
4675 return cpumask_first(nohz.cpu_mask);
4676}
4677#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4678static inline int find_new_ilb(int call_cpu)
4679{
4680 return cpumask_first(nohz.cpu_mask);
4681}
4682#endif
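
find_new_ilb() relies on a sched_domain's groups forming a circular singly linked list, so its do/while loop visits every group exactly once before arriving back at sd->groups. A minimal model of that traversal, using an invented struct group:

#include <stddef.h>
#include <stdio.h>

/* Invented model of the group ring hanging off a sched_domain. */
struct group {
        int id;
        int semi_idle;
        struct group *next;
};

/* Visit every group exactly once, stopping when back at the start,
 * and return the first semi-idle one (or NULL if none). */
static struct group *first_semi_idle(struct group *start)
{
        struct group *g = start;

        do {
                if (g->semi_idle)
                        return g;
                g = g->next;
        } while (g != start);

        return NULL;
}

int main(void)
{
        struct group a = { 0, 0, NULL }, b = { 1, 1, NULL }, c = { 2, 0, NULL };
        struct group *g;

        a.next = &b;
        b.next = &c;
        c.next = &a;            /* close the ring */

        g = first_semi_idle(&a);
        printf("picked group %d\n", g ? g->id : -1);
        return 0;
}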
4683
4684/*
4685 * This routine tries to nominate the ilb (idle load balancing)
4686 * owner among the cpus whose ticks are stopped. The ilb owner does the idle
4687 * load balancing on behalf of all those cpus. If all the cpus in the system
4688 * go into this tickless mode, then there will be no ilb owner (as there is
4689 * no need for one) and all the cpus will sleep till the next wakeup event
4690 * arrives...
4691 *
4692 * For the ilb owner, the tick is not stopped, and this tick is used
4693 * for idle load balancing. The ilb owner remains part of
4694 * nohz.cpu_mask.
4695 *
4696 * While stopping the tick, this cpu becomes the ilb owner if there
4697 * is no other owner, and remains the owner until that cpu becomes busy
4698 * or until all cpus in the system stop their ticks, at which point
4699 * there is no need for an ilb owner.
4700 *
4701 * When the ilb owner becomes busy, it nominates another owner during the
4702 * next busy scheduler_tick().
4703 */
4704int select_nohz_load_balancer(int stop_tick)
4705{
4706 int cpu = smp_processor_id();
4707
4708 if (stop_tick) {
4709 cpu_rq(cpu)->in_nohz_recently = 1;
4710
4711 if (!cpu_active(cpu)) {
4712 if (atomic_read(&nohz.load_balancer) != cpu)
4713 return 0;
4714
4715 /*
4716 * If we are going offline and still the leader,
4717 * give up!
4718 */
4719 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4720 BUG();
4721
4722 return 0;
4723 }
4724
4725 cpumask_set_cpu(cpu, nohz.cpu_mask);
4726
4727 /* time for ilb owner also to sleep */
4728 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
4729 if (atomic_read(&nohz.load_balancer) == cpu)
4730 atomic_set(&nohz.load_balancer, -1);
4731 return 0;
4732 }
4733
4734 if (atomic_read(&nohz.load_balancer) == -1) {
4735 /* make me the ilb owner */
4736 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4737 return 1;
4738 } else if (atomic_read(&nohz.load_balancer) == cpu) {
4739 int new_ilb;
4740
4741 if (!(sched_smt_power_savings ||
4742 sched_mc_power_savings))
4743 return 1;
4744 /*
4745 * Check to see if there is a more power-efficient
4746 * ilb.
4747 */
4748 new_ilb = find_new_ilb(cpu);
4749 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4750 atomic_set(&nohz.load_balancer, -1);
4751 resched_cpu(new_ilb);
4752 return 0;
4753 }
4754 return 1;
4755 }
4756 } else {
4757 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
4758 return 0;
4759
4760 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4761
4762 if (atomic_read(&nohz.load_balancer) == cpu)
4763 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4764 BUG();
4765 }
4766 return 0;
4767}
4768#endif
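
Underneath select_nohz_load_balancer(), the ilb owner is just an atomic CPU id, with -1 meaning no owner, claimed and released via compare-and-exchange. The same claim/release protocol in portable C11 atomics; this is a sketch of the idea, not the kernel's atomic_t API:

#include <stdatomic.h>
#include <stdio.h>

/* The owner slot: a CPU id, or -1 when nobody owns idle load balancing. */
static atomic_int load_balancer = ATOMIC_VAR_INIT(-1);

/* Try to become the owner; returns 1 if this cpu now owns ilb duty. */
static int claim_ilb(int cpu)
{
        int expected = -1;

        return atomic_compare_exchange_strong(&load_balancer, &expected, cpu);
}

/* Give up ownership; only succeeds if this cpu is the current owner. */
static void release_ilb(int cpu)
{
        int expected = cpu;

        atomic_compare_exchange_strong(&load_balancer, &expected, -1);
}

int main(void)
{
        printf("cpu0 claims: %d\n", claim_ilb(0));      /* 1: slot was free */
        printf("cpu1 claims: %d\n", claim_ilb(1));      /* 0: cpu0 owns it */
        release_ilb(0);
        printf("cpu1 claims: %d\n", claim_ilb(1));      /* 1: free again */
        return 0;
}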
4769
4770static DEFINE_SPINLOCK(balancing);
4771
4772/*
4773 * It checks each scheduling domain to see if it is due to be balanced,
4774 * and initiates a balancing operation if so.
4775 *
4776 * Balancing parameters are set up in arch_init_sched_domains.
4777 */
4778static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4779{
4780 int balance = 1;
4781 struct rq *rq = cpu_rq(cpu);
4782 unsigned long interval;
4783 struct sched_domain *sd;
4784 /* Earliest time when we have to do rebalance again */
4785 unsigned long next_balance = jiffies + 60*HZ;
4786 int update_next_balance = 0;
4787 int need_serialize;
4788
4789 for_each_domain(cpu, sd) {
4790 if (!(sd->flags & SD_LOAD_BALANCE))
4791 continue;
4792
4793 interval = sd->balance_interval;
4794 if (idle != CPU_IDLE)
4795 interval *= sd->busy_factor;
4796
4797 /* scale ms to jiffies */
4798 interval = msecs_to_jiffies(interval);
4799 if (unlikely(!interval))
4800 interval = 1;
4801 if (interval > HZ*NR_CPUS/10)
4802 interval = HZ*NR_CPUS/10;
4803
4804 need_serialize = sd->flags & SD_SERIALIZE;
4805
4806 if (need_serialize) {
4807 if (!spin_trylock(&balancing))
4808 goto out;
4809 }
4810
4811 if (time_after_eq(jiffies, sd->last_balance + interval)) {
4812 if (load_balance(cpu, rq, sd, idle, &balance)) {
4813 /*
4814 * We've pulled tasks over so either we're no
4815 * longer idle, or one of our SMT siblings is
4816 * not idle.
4817 */
4818 idle = CPU_NOT_IDLE;
4819 }
4820 sd->last_balance = jiffies;
4821 }
4822 if (need_serialize)
4823 spin_unlock(&balancing);
4824out:
4825 if (time_after(next_balance, sd->last_balance + interval)) {
4826 next_balance = sd->last_balance + interval;
4827 update_next_balance = 1;
4828 }
4829
4830 /*
4831 * Stop the load balance at this level. There is another
4832 * CPU in our sched group which is doing load balancing more
4833 * actively.
4834 */
4835 if (!balance)
4836 break;
4837 }
4838
4839 /*
4840 * next_balance will be updated only when there is a need.
4841 * When the cpu is attached to a null domain, for example, it will not be
4842 * updated.
4843 */
4844 if (likely(update_next_balance))
4845 rq->next_balance = next_balance;
4846}
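
The interval arithmetic at the top of the rebalance_domains() loop (stretch by busy_factor when not idle, convert to jiffies, clamp to the range [1, HZ*NR_CPUS/10]) looks like this in isolation. HZ, NR_CPUS and the ms-to-tick conversion below are assumptions of the sketch, not the running kernel's values:

#include <stdio.h>

#define HZ      1000            /* assumed tick rate for this sketch */
#define NR_CPUS 64              /* assumed CPU count */

/* Mirror of the interval shaping above: busy CPUs stretch the interval
 * by busy_factor, then it is converted to ticks and clamped to the
 * range [1, HZ*NR_CPUS/10].  msecs_to_jiffies() is modelled as a plain
 * ms-to-tick conversion. */
static unsigned long effective_interval(unsigned long interval_ms,
                                        unsigned int busy_factor, int idle)
{
        unsigned long ticks;

        if (!idle)
                interval_ms *= busy_factor;

        ticks = interval_ms * HZ / 1000;
        if (ticks == 0)
                ticks = 1;
        if (ticks > HZ * NR_CPUS / 10)
                ticks = HZ * NR_CPUS / 10;
        return ticks;
}

int main(void)
{
        printf("idle: %lu ticks\n", effective_interval(64, 32, 1));
        printf("busy: %lu ticks\n", effective_interval(64, 32, 0));
        return 0;
}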
4847
4848/*
4849 * run_rebalance_domains is triggered when needed from the scheduler tick.
4850 * In CONFIG_NO_HZ case, the idle load balance owner will do the
4851 * rebalancing for all the cpus for whom scheduler ticks are stopped.
4852 */
4853static void run_rebalance_domains(struct softirq_action *h)
4854{
4855 int this_cpu = smp_processor_id();
4856 struct rq *this_rq = cpu_rq(this_cpu);
4857 enum cpu_idle_type idle = this_rq->idle_at_tick ?
4858 CPU_IDLE : CPU_NOT_IDLE;
4859
4860 rebalance_domains(this_cpu, idle);
4861
4862#ifdef CONFIG_NO_HZ
4863 /*
4864 * If this cpu is the owner for idle load balancing, then do the
4865 * balancing on behalf of the other idle cpus whose ticks are
4866 * stopped.
4867 */
4868 if (this_rq->idle_at_tick &&
4869 atomic_read(&nohz.load_balancer) == this_cpu) {
4870 struct rq *rq;
4871 int balance_cpu;
4872
4873 for_each_cpu(balance_cpu, nohz.cpu_mask) {
4874 if (balance_cpu == this_cpu)
4875 continue;
4876
4877 /*
4878 * If this cpu gets work to do, stop the load balancing
4879 * work being done for other cpus. The next load
4880 * balancing owner will pick it up.
4881 */
4882 if (need_resched())
4883 break;
4884
4885 rebalance_domains(balance_cpu, CPU_IDLE);
4886
4887 rq = cpu_rq(balance_cpu);
4888 if (time_after(this_rq->next_balance, rq->next_balance))
4889 this_rq->next_balance = rq->next_balance;
4890 }
4891 }
4892#endif
4893}
4894
4895static inline int on_null_domain(int cpu)
4896{
4897 return !rcu_dereference(cpu_rq(cpu)->sd);
4898}
4899
4900/*
4901 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4902 *
4903 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
4904 * idle load balancing owner or decide to stop the periodic load balancing,
4905 * if the whole system is idle.
4906 */
4907static inline void trigger_load_balance(struct rq *rq, int cpu)
4908{
4909#ifdef CONFIG_NO_HZ
4910 /*
4911 * If we were in the nohz mode recently and busy at the current
4912 * scheduler tick, then check if we need to nominate a new idle
4913 * load balancer.
4914 */
4915 if (rq->in_nohz_recently && !rq->idle_at_tick) {
4916 rq->in_nohz_recently = 0;
4917
4918 if (atomic_read(&nohz.load_balancer) == cpu) {
4919 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4920 atomic_set(&nohz.load_balancer, -1);
4921 }
4922
4923 if (atomic_read(&nohz.load_balancer) == -1) {
4924 int ilb = find_new_ilb(cpu);
4925
4926 if (ilb < nr_cpu_ids)
4927 resched_cpu(ilb);
4928 }
4929 }
4930
4931 /*
4932 * If this cpu is idle and doing idle load balancing for all the
4933 * cpus with ticks stopped, is it time for that to stop?
4934 */
4935 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4936 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4937 resched_cpu(cpu);
4938 return;
4939 }
4940
4941 /*
4942 * If this cpu is idle and the idle load balancing is done by
4943 * someone else, then there is no need to raise the SCHED_SOFTIRQ.
4944 */
4945 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4946 cpumask_test_cpu(cpu, nohz.cpu_mask))
4947 return;
4948#endif
4949 /* Don't need to rebalance while attached to NULL domain */
4950 if (time_after_eq(jiffies, rq->next_balance) &&
4951 likely(!on_null_domain(cpu)))
4952 raise_softirq(SCHED_SOFTIRQ);
4953}
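
trigger_load_balance() keys the softirq off time_after_eq(jiffies, rq->next_balance). The time_after family stays correct across jiffies wraparound because it compares via a signed difference rather than a plain greater-or-equal. A stand-alone equivalent (sketch_time_after_eq is a local name, not the kernel macro):

#include <stdio.h>

/* Wraparound-safe "a >= b" for free-running counters: compare via a
 * signed difference instead of the raw unsigned values. */
#define sketch_time_after_eq(a, b)      ((long)((a) - (b)) >= 0)

int main(void)
{
        unsigned long next_balance = (unsigned long)-5; /* just before the counter wraps */
        unsigned long now = 3;                          /* just after it wrapped */

        printf("naive >=  : %d\n", now >= next_balance);                        /* 0: looks "early" */
        printf("wrap-safe : %d\n", sketch_time_after_eq(now, next_balance));    /* 1: 8 ticks late */
        return 0;
}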
4954
4955#else /* CONFIG_SMP */
4956
4957/*
4958 * on UP we do not need to balance between CPUs:
4959 */
4960static inline void idle_balance(int cpu, struct rq *rq)
4961{
4962}
4963
4964#endif 3164#endif
4965 3165
4966DEFINE_PER_CPU(struct kernel_stat, kstat); 3166DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -6060,7 +4260,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6060 unsigned long flags; 4260 unsigned long flags;
6061 int oldprio, on_rq, running; 4261 int oldprio, on_rq, running;
6062 struct rq *rq; 4262 struct rq *rq;
6063 const struct sched_class *prev_class = p->sched_class; 4263 const struct sched_class *prev_class;
6064 4264
6065 BUG_ON(prio < 0 || prio > MAX_PRIO); 4265 BUG_ON(prio < 0 || prio > MAX_PRIO);
6066 4266
@@ -6068,6 +4268,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6068 update_rq_clock(rq); 4268 update_rq_clock(rq);
6069 4269
6070 oldprio = p->prio; 4270 oldprio = p->prio;
4271 prev_class = p->sched_class;
6071 on_rq = p->se.on_rq; 4272 on_rq = p->se.on_rq;
6072 running = task_current(rq, p); 4273 running = task_current(rq, p);
6073 if (on_rq) 4274 if (on_rq)
@@ -6085,7 +4286,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6085 if (running) 4286 if (running)
6086 p->sched_class->set_curr_task(rq); 4287 p->sched_class->set_curr_task(rq);
6087 if (on_rq) { 4288 if (on_rq) {
6088 enqueue_task(rq, p, 0); 4289 enqueue_task(rq, p, 0, oldprio < prio);
6089 4290
6090 check_class_changed(rq, p, prev_class, oldprio, running); 4291 check_class_changed(rq, p, prev_class, oldprio, running);
6091 } 4292 }
@@ -6129,7 +4330,7 @@ void set_user_nice(struct task_struct *p, long nice)
6129 delta = p->prio - old_prio; 4330 delta = p->prio - old_prio;
6130 4331
6131 if (on_rq) { 4332 if (on_rq) {
6132 enqueue_task(rq, p, 0); 4333 enqueue_task(rq, p, 0, false);
6133 /* 4334 /*
6134 * If the task increased its priority or is running and 4335 * If the task increased its priority or is running and
6135 * lowered its priority, then reschedule its CPU: 4336 * lowered its priority, then reschedule its CPU:
@@ -6152,7 +4353,7 @@ int can_nice(const struct task_struct *p, const int nice)
6152 /* convert nice value [19,-20] to rlimit style value [1,40] */ 4353 /* convert nice value [19,-20] to rlimit style value [1,40] */
6153 int nice_rlim = 20 - nice; 4354 int nice_rlim = 20 - nice;
6154 4355
6155 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || 4356 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
6156 capable(CAP_SYS_NICE)); 4357 capable(CAP_SYS_NICE));
6157} 4358}
6158 4359
@@ -6287,7 +4488,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
6287{ 4488{
6288 int retval, oldprio, oldpolicy = -1, on_rq, running; 4489 int retval, oldprio, oldpolicy = -1, on_rq, running;
6289 unsigned long flags; 4490 unsigned long flags;
6290 const struct sched_class *prev_class = p->sched_class; 4491 const struct sched_class *prev_class;
6291 struct rq *rq; 4492 struct rq *rq;
6292 int reset_on_fork; 4493 int reset_on_fork;
6293 4494
@@ -6329,7 +4530,7 @@ recheck:
6329 4530
6330 if (!lock_task_sighand(p, &flags)) 4531 if (!lock_task_sighand(p, &flags))
6331 return -ESRCH; 4532 return -ESRCH;
6332 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; 4533 rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
6333 unlock_task_sighand(p, &flags); 4534 unlock_task_sighand(p, &flags);
6334 4535
6335 /* can't set/change the rt policy */ 4536 /* can't set/change the rt policy */
@@ -6401,6 +4602,7 @@ recheck:
6401 p->sched_reset_on_fork = reset_on_fork; 4602 p->sched_reset_on_fork = reset_on_fork;
6402 4603
6403 oldprio = p->prio; 4604 oldprio = p->prio;
4605 prev_class = p->sched_class;
6404 __setscheduler(rq, p, policy, param->sched_priority); 4606 __setscheduler(rq, p, policy, param->sched_priority);
6405 4607
6406 if (running) 4608 if (running)
@@ -7151,27 +5353,8 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7151 struct rq *rq; 5353 struct rq *rq;
7152 int ret = 0; 5354 int ret = 0;
7153 5355
7154 /*
7155 * Since we rely on wake-ups to migrate sleeping tasks, don't change
7156 * the ->cpus_allowed mask from under waking tasks, which would be
7157 * possible when we change rq->lock in ttwu(), so synchronize against
7158 * TASK_WAKING to avoid that.
7159 *
7160 * Make an exception for freshly cloned tasks, since cpuset namespaces
7161 * might move the task about, we have to validate the target in
7162 * wake_up_new_task() anyway since the cpu might have gone away.
7163 */
7164again:
7165 while (p->state == TASK_WAKING && !(p->flags & PF_STARTING))
7166 cpu_relax();
7167
7168 rq = task_rq_lock(p, &flags); 5356 rq = task_rq_lock(p, &flags);
7169 5357
7170 if (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) {
7171 task_rq_unlock(rq, &flags);
7172 goto again;
7173 }
7174
7175 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5358 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
7176 ret = -EINVAL; 5359 ret = -EINVAL;
7177 goto out; 5360 goto out;
@@ -9223,11 +7406,13 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
9223 7406
9224#ifdef CONFIG_SCHED_MC 7407#ifdef CONFIG_SCHED_MC
9225static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, 7408static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
7409 struct sysdev_class_attribute *attr,
9226 char *page) 7410 char *page)
9227{ 7411{
9228 return sprintf(page, "%u\n", sched_mc_power_savings); 7412 return sprintf(page, "%u\n", sched_mc_power_savings);
9229} 7413}
9230static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, 7414static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
7415 struct sysdev_class_attribute *attr,
9231 const char *buf, size_t count) 7416 const char *buf, size_t count)
9232{ 7417{
9233 return sched_power_savings_store(buf, count, 0); 7418 return sched_power_savings_store(buf, count, 0);
@@ -9239,11 +7424,13 @@ static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
9239 7424
9240#ifdef CONFIG_SCHED_SMT 7425#ifdef CONFIG_SCHED_SMT
9241static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, 7426static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
7427 struct sysdev_class_attribute *attr,
9242 char *page) 7428 char *page)
9243{ 7429{
9244 return sprintf(page, "%u\n", sched_smt_power_savings); 7430 return sprintf(page, "%u\n", sched_smt_power_savings);
9245} 7431}
9246static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, 7432static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
7433 struct sysdev_class_attribute *attr,
9247 const char *buf, size_t count) 7434 const char *buf, size_t count)
9248{ 7435{
9249 return sched_power_savings_store(buf, count, 1); 7436 return sched_power_savings_store(buf, count, 1);
@@ -9458,7 +7645,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
9458 tg->rt_rq[cpu] = rt_rq; 7645 tg->rt_rq[cpu] = rt_rq;
9459 init_rt_rq(rt_rq, rq); 7646 init_rt_rq(rt_rq, rq);
9460 rt_rq->tg = tg; 7647 rt_rq->tg = tg;
9461 rt_rq->rt_se = rt_se;
9462 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7648 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
9463 if (add) 7649 if (add)
9464 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); 7650 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
@@ -9489,9 +7675,6 @@ void __init sched_init(void)
9489#ifdef CONFIG_RT_GROUP_SCHED 7675#ifdef CONFIG_RT_GROUP_SCHED
9490 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7676 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
9491#endif 7677#endif
9492#ifdef CONFIG_USER_SCHED
9493 alloc_size *= 2;
9494#endif
9495#ifdef CONFIG_CPUMASK_OFFSTACK 7678#ifdef CONFIG_CPUMASK_OFFSTACK
9496 alloc_size += num_possible_cpus() * cpumask_size(); 7679 alloc_size += num_possible_cpus() * cpumask_size();
9497#endif 7680#endif
@@ -9505,13 +7688,6 @@ void __init sched_init(void)
9505 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 7688 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
9506 ptr += nr_cpu_ids * sizeof(void **); 7689 ptr += nr_cpu_ids * sizeof(void **);
9507 7690
9508#ifdef CONFIG_USER_SCHED
9509 root_task_group.se = (struct sched_entity **)ptr;
9510 ptr += nr_cpu_ids * sizeof(void **);
9511
9512 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
9513 ptr += nr_cpu_ids * sizeof(void **);
9514#endif /* CONFIG_USER_SCHED */
9515#endif /* CONFIG_FAIR_GROUP_SCHED */ 7691#endif /* CONFIG_FAIR_GROUP_SCHED */
9516#ifdef CONFIG_RT_GROUP_SCHED 7692#ifdef CONFIG_RT_GROUP_SCHED
9517 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7693 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -9520,13 +7696,6 @@ void __init sched_init(void)
9520 init_task_group.rt_rq = (struct rt_rq **)ptr; 7696 init_task_group.rt_rq = (struct rt_rq **)ptr;
9521 ptr += nr_cpu_ids * sizeof(void **); 7697 ptr += nr_cpu_ids * sizeof(void **);
9522 7698
9523#ifdef CONFIG_USER_SCHED
9524 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
9525 ptr += nr_cpu_ids * sizeof(void **);
9526
9527 root_task_group.rt_rq = (struct rt_rq **)ptr;
9528 ptr += nr_cpu_ids * sizeof(void **);
9529#endif /* CONFIG_USER_SCHED */
9530#endif /* CONFIG_RT_GROUP_SCHED */ 7699#endif /* CONFIG_RT_GROUP_SCHED */
9531#ifdef CONFIG_CPUMASK_OFFSTACK 7700#ifdef CONFIG_CPUMASK_OFFSTACK
9532 for_each_possible_cpu(i) { 7701 for_each_possible_cpu(i) {
@@ -9546,22 +7715,13 @@ void __init sched_init(void)
9546#ifdef CONFIG_RT_GROUP_SCHED 7715#ifdef CONFIG_RT_GROUP_SCHED
9547 init_rt_bandwidth(&init_task_group.rt_bandwidth, 7716 init_rt_bandwidth(&init_task_group.rt_bandwidth,
9548 global_rt_period(), global_rt_runtime()); 7717 global_rt_period(), global_rt_runtime());
9549#ifdef CONFIG_USER_SCHED
9550 init_rt_bandwidth(&root_task_group.rt_bandwidth,
9551 global_rt_period(), RUNTIME_INF);
9552#endif /* CONFIG_USER_SCHED */
9553#endif /* CONFIG_RT_GROUP_SCHED */ 7718#endif /* CONFIG_RT_GROUP_SCHED */
9554 7719
9555#ifdef CONFIG_GROUP_SCHED 7720#ifdef CONFIG_CGROUP_SCHED
9556 list_add(&init_task_group.list, &task_groups); 7721 list_add(&init_task_group.list, &task_groups);
9557 INIT_LIST_HEAD(&init_task_group.children); 7722 INIT_LIST_HEAD(&init_task_group.children);
9558 7723
9559#ifdef CONFIG_USER_SCHED 7724#endif /* CONFIG_CGROUP_SCHED */
9560 INIT_LIST_HEAD(&root_task_group.children);
9561 init_task_group.parent = &root_task_group;
9562 list_add(&init_task_group.siblings, &root_task_group.children);
9563#endif /* CONFIG_USER_SCHED */
9564#endif /* CONFIG_GROUP_SCHED */
9565 7725
9566#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP 7726#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
9567 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), 7727 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
@@ -9601,25 +7761,6 @@ void __init sched_init(void)
9601 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7761 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
9602 */ 7762 */
9603 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7763 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
9604#elif defined CONFIG_USER_SCHED
9605 root_task_group.shares = NICE_0_LOAD;
9606 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
9607 /*
9608 * In case of task-groups formed thr' the user id of tasks,
9609 * init_task_group represents tasks belonging to root user.
9610 * Hence it forms a sibling of all subsequent groups formed.
9611 * In this case, init_task_group gets only a fraction of overall
9612 * system cpu resource, based on the weight assigned to root
9613 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
9614 * by letting tasks of init_task_group sit in a separate cfs_rq
9615 * (init_tg_cfs_rq) and having one entity represent this group of
9616 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
9617 */
9618 init_tg_cfs_entry(&init_task_group,
9619 &per_cpu(init_tg_cfs_rq, i),
9620 &per_cpu(init_sched_entity, i), i, 1,
9621 root_task_group.se[i]);
9622
9623#endif 7764#endif
9624#endif /* CONFIG_FAIR_GROUP_SCHED */ 7765#endif /* CONFIG_FAIR_GROUP_SCHED */
9625 7766
@@ -9628,12 +7769,6 @@ void __init sched_init(void)
9628 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7769 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
9629#ifdef CONFIG_CGROUP_SCHED 7770#ifdef CONFIG_CGROUP_SCHED
9630 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); 7771 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
9631#elif defined CONFIG_USER_SCHED
9632 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
9633 init_tg_rt_entry(&init_task_group,
9634 &per_cpu(init_rt_rq_var, i),
9635 &per_cpu(init_sched_rt_entity, i), i, 1,
9636 root_task_group.rt_se[i]);
9637#endif 7772#endif
9638#endif 7773#endif
9639 7774
@@ -9718,7 +7853,7 @@ static inline int preempt_count_equals(int preempt_offset)
9718 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 7853 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9719} 7854}
9720 7855
9721void __might_sleep(char *file, int line, int preempt_offset) 7856void __might_sleep(const char *file, int line, int preempt_offset)
9722{ 7857{
9723#ifdef in_atomic 7858#ifdef in_atomic
9724 static unsigned long prev_jiffy; /* ratelimiting */ 7859 static unsigned long prev_jiffy; /* ratelimiting */
@@ -10029,7 +8164,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
10029} 8164}
10030#endif /* CONFIG_RT_GROUP_SCHED */ 8165#endif /* CONFIG_RT_GROUP_SCHED */
10031 8166
10032#ifdef CONFIG_GROUP_SCHED 8167#ifdef CONFIG_CGROUP_SCHED
10033static void free_sched_group(struct task_group *tg) 8168static void free_sched_group(struct task_group *tg)
10034{ 8169{
10035 free_fair_sched_group(tg); 8170 free_fair_sched_group(tg);
@@ -10134,11 +8269,11 @@ void sched_move_task(struct task_struct *tsk)
10134 if (unlikely(running)) 8269 if (unlikely(running))
10135 tsk->sched_class->set_curr_task(rq); 8270 tsk->sched_class->set_curr_task(rq);
10136 if (on_rq) 8271 if (on_rq)
10137 enqueue_task(rq, tsk, 0); 8272 enqueue_task(rq, tsk, 0, false);
10138 8273
10139 task_rq_unlock(rq, &flags); 8274 task_rq_unlock(rq, &flags);
10140} 8275}
10141#endif /* CONFIG_GROUP_SCHED */ 8276#endif /* CONFIG_CGROUP_SCHED */
10142 8277
10143#ifdef CONFIG_FAIR_GROUP_SCHED 8278#ifdef CONFIG_FAIR_GROUP_SCHED
10144static void __set_se_shares(struct sched_entity *se, unsigned long shares) 8279static void __set_se_shares(struct sched_entity *se, unsigned long shares)
@@ -10280,13 +8415,6 @@ static int tg_schedulable(struct task_group *tg, void *data)
10280 runtime = d->rt_runtime; 8415 runtime = d->rt_runtime;
10281 } 8416 }
10282 8417
10283#ifdef CONFIG_USER_SCHED
10284 if (tg == &root_task_group) {
10285 period = global_rt_period();
10286 runtime = global_rt_runtime();
10287 }
10288#endif
10289
10290 /* 8418 /*
10291 * Cannot have more runtime than the period. 8419 * Cannot have more runtime than the period.
10292 */ 8420 */
@@ -10689,7 +8817,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
10689struct cpuacct { 8817struct cpuacct {
10690 struct cgroup_subsys_state css; 8818 struct cgroup_subsys_state css;
10691 /* cpuusage holds pointer to a u64-type object on every cpu */ 8819 /* cpuusage holds pointer to a u64-type object on every cpu */
10692 u64 *cpuusage; 8820 u64 __percpu *cpuusage;
10693 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; 8821 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
10694 struct cpuacct *parent; 8822 struct cpuacct *parent;
10695}; 8823};
@@ -10906,12 +9034,30 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
10906} 9034}
10907 9035
10908/* 9036/*
9037 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled, one jiffy can be very large
9038 * in cputime_t units. As a result, cpuacct_update_stats calls
9039 * percpu_counter_add with values large enough to always overflow the
9040 * per-cpu batch limit, causing bad SMP scalability.
9041 *
9042 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
9043 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
9044 * and enabled. We cap it at INT_MAX, which is the largest allowed batch value.
9045 */
9046#ifdef CONFIG_SMP
9047#define CPUACCT_BATCH \
9048 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9049#else
9050#define CPUACCT_BATCH 0
9051#endif
9052
9053/*
10909 * Charge the system/user time to the task's accounting group. 9054 * Charge the system/user time to the task's accounting group.
10910 */ 9055 */
10911static void cpuacct_update_stats(struct task_struct *tsk, 9056static void cpuacct_update_stats(struct task_struct *tsk,
10912 enum cpuacct_stat_index idx, cputime_t val) 9057 enum cpuacct_stat_index idx, cputime_t val)
10913{ 9058{
10914 struct cpuacct *ca; 9059 struct cpuacct *ca;
9060 int batch = CPUACCT_BATCH;
10915 9061
10916 if (unlikely(!cpuacct_subsys.active)) 9062 if (unlikely(!cpuacct_subsys.active))
10917 return; 9063 return;
@@ -10920,7 +9066,7 @@ static void cpuacct_update_stats(struct task_struct *tsk,
10920 ca = task_ca(tsk); 9066 ca = task_ca(tsk);
10921 9067
10922 do { 9068 do {
10923 percpu_counter_add(&ca->cpustat[idx], val); 9069 __percpu_counter_add(&ca->cpustat[idx], val, batch);
10924 ca = ca->parent; 9070 ca = ca->parent;
10925 } while (ca); 9071 } while (ca);
10926 rcu_read_unlock(); 9072 rcu_read_unlock();
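
The CPUACCT_BATCH change hands __percpu_counter_add() a batch scaled by cputime_one_jiffy, so a single jiffy of CONFIG_VIRT_CPU_ACCOUNTING time no longer exceeds the per-cpu batch on every charge. A toy, single-threaded model of a batched counter and of that batch computation; a real percpu_counter keeps a per-CPU delta and takes a lock only on flush, and every value here is invented:

#include <limits.h>
#include <stdio.h>

/* Toy batched counter: cheap local part, expensive shared part that is
 * only touched once the local part reaches the batch threshold. */
struct batched_counter {
        long long global;       /* shared, "contended" total */
        long long local;        /* per-cpu style delta */
        long long batch;        /* flush threshold */
};

static void counter_add(struct batched_counter *c, long long amount)
{
        c->local += amount;
        if (c->local >= c->batch || c->local <= -c->batch) {
                c->global += c->local;  /* the slow path */
                c->local = 0;
        }
}

int main(void)
{
        /* Scale the batch by the size of one sample, capped at INT_MAX,
         * mirroring min(percpu_counter_batch * cputime_one_jiffy, INT_MAX). */
        long long percpu_counter_batch = 32, cputime_one_jiffy = 1000000;
        long long batch = percpu_counter_batch * cputime_one_jiffy;
        struct batched_counter c;
        int i;

        if (batch > INT_MAX)
                batch = INT_MAX;
        c.global = 0;
        c.local = 0;
        c.batch = batch;

        for (i = 0; i < 40; i++)
                counter_add(&c, cputime_one_jiffy);     /* one jiffy per charge */
        printf("global=%lld local=%lld\n", c.global, c.local);
        return 0;
}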