aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/sched.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--kernel/sched.c2206
1 files changed, 177 insertions, 2029 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 978edfd35a96..abb36b16b93b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -233,7 +233,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
233 */ 233 */
234static DEFINE_MUTEX(sched_domains_mutex); 234static DEFINE_MUTEX(sched_domains_mutex);
235 235
236#ifdef CONFIG_GROUP_SCHED 236#ifdef CONFIG_CGROUP_SCHED
237 237
238#include <linux/cgroup.h> 238#include <linux/cgroup.h>
239 239
@@ -243,13 +243,7 @@ static LIST_HEAD(task_groups);
243 243
244/* task group related information */ 244/* task group related information */
245struct task_group { 245struct task_group {
246#ifdef CONFIG_CGROUP_SCHED
247 struct cgroup_subsys_state css; 246 struct cgroup_subsys_state css;
248#endif
249
250#ifdef CONFIG_USER_SCHED
251 uid_t uid;
252#endif
253 247
254#ifdef CONFIG_FAIR_GROUP_SCHED 248#ifdef CONFIG_FAIR_GROUP_SCHED
255 /* schedulable entities of this group on each cpu */ 249 /* schedulable entities of this group on each cpu */
@@ -274,35 +268,7 @@ struct task_group {
274 struct list_head children; 268 struct list_head children;
275}; 269};
276 270
277#ifdef CONFIG_USER_SCHED
278
279/* Helper function to pass uid information to create_sched_user() */
280void set_tg_uid(struct user_struct *user)
281{
282 user->tg->uid = user->uid;
283}
284
285/*
286 * Root task group.
287 * Every UID task group (including init_task_group aka UID-0) will
288 * be a child to this group.
289 */
290struct task_group root_task_group;
291
292#ifdef CONFIG_FAIR_GROUP_SCHED
293/* Default task group's sched entity on each cpu */
294static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
295/* Default task group's cfs_rq on each cpu */
296static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
297#endif /* CONFIG_FAIR_GROUP_SCHED */
298
299#ifdef CONFIG_RT_GROUP_SCHED
300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var);
302#endif /* CONFIG_RT_GROUP_SCHED */
303#else /* !CONFIG_USER_SCHED */
304#define root_task_group init_task_group 271#define root_task_group init_task_group
305#endif /* CONFIG_USER_SCHED */
306 272
307/* task_group_lock serializes add/remove of task groups and also changes to 273/* task_group_lock serializes add/remove of task groups and also changes to
308 * a task group's cpu shares. 274 * a task group's cpu shares.
@@ -318,11 +284,7 @@ static int root_task_group_empty(void)
318} 284}
319#endif 285#endif
320 286
321#ifdef CONFIG_USER_SCHED
322# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
323#else /* !CONFIG_USER_SCHED */
324# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 287# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
325#endif /* CONFIG_USER_SCHED */
326 288
327/* 289/*
328 * A weight of 0 or 1 can cause arithmetics problems. 290 * A weight of 0 or 1 can cause arithmetics problems.
@@ -348,11 +310,7 @@ static inline struct task_group *task_group(struct task_struct *p)
348{ 310{
349 struct task_group *tg; 311 struct task_group *tg;
350 312
351#ifdef CONFIG_USER_SCHED 313#ifdef CONFIG_CGROUP_SCHED
352 rcu_read_lock();
353 tg = __task_cred(p)->user->tg;
354 rcu_read_unlock();
355#elif defined(CONFIG_CGROUP_SCHED)
356 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), 314 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
357 struct task_group, css); 315 struct task_group, css);
358#else 316#else
@@ -383,7 +341,7 @@ static inline struct task_group *task_group(struct task_struct *p)
383 return NULL; 341 return NULL;
384} 342}
385 343
386#endif /* CONFIG_GROUP_SCHED */ 344#endif /* CONFIG_CGROUP_SCHED */
387 345
388/* CFS-related fields in a runqueue */ 346/* CFS-related fields in a runqueue */
389struct cfs_rq { 347struct cfs_rq {
@@ -478,7 +436,6 @@ struct rt_rq {
478 struct rq *rq; 436 struct rq *rq;
479 struct list_head leaf_rt_rq_list; 437 struct list_head leaf_rt_rq_list;
480 struct task_group *tg; 438 struct task_group *tg;
481 struct sched_rt_entity *rt_se;
482#endif 439#endif
483}; 440};
484 441
@@ -645,6 +602,11 @@ static inline int cpu_of(struct rq *rq)
645#endif 602#endif
646} 603}
647 604
605#define rcu_dereference_check_sched_domain(p) \
606 rcu_dereference_check((p), \
607 rcu_read_lock_sched_held() || \
608 lockdep_is_held(&sched_domains_mutex))
609
648/* 610/*
649 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 611 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
650 * See detach_destroy_domains: synchronize_sched for details. 612 * See detach_destroy_domains: synchronize_sched for details.
@@ -653,7 +615,7 @@ static inline int cpu_of(struct rq *rq)
653 * preempt-disabled sections. 615 * preempt-disabled sections.
654 */ 616 */
655#define for_each_domain(cpu, __sd) \ 617#define for_each_domain(cpu, __sd) \
656 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) 618 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
657 619
658#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 620#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
659#define this_rq() (&__get_cpu_var(runqueues)) 621#define this_rq() (&__get_cpu_var(runqueues))
@@ -941,16 +903,33 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
941#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 903#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
942 904
943/* 905/*
906 * Check whether the task is waking, we use this to synchronize against
907 * ttwu() so that task_cpu() reports a stable number.
908 *
909 * We need to make an exception for PF_STARTING tasks because the fork
910 * path might require task_rq_lock() to work, eg. it can call
911 * set_cpus_allowed_ptr() from the cpuset clone_ns code.
912 */
913static inline int task_is_waking(struct task_struct *p)
914{
915 return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING));
916}
917
918/*
944 * __task_rq_lock - lock the runqueue a given task resides on. 919 * __task_rq_lock - lock the runqueue a given task resides on.
945 * Must be called interrupts disabled. 920 * Must be called interrupts disabled.
946 */ 921 */
947static inline struct rq *__task_rq_lock(struct task_struct *p) 922static inline struct rq *__task_rq_lock(struct task_struct *p)
948 __acquires(rq->lock) 923 __acquires(rq->lock)
949{ 924{
925 struct rq *rq;
926
950 for (;;) { 927 for (;;) {
951 struct rq *rq = task_rq(p); 928 while (task_is_waking(p))
929 cpu_relax();
930 rq = task_rq(p);
952 raw_spin_lock(&rq->lock); 931 raw_spin_lock(&rq->lock);
953 if (likely(rq == task_rq(p))) 932 if (likely(rq == task_rq(p) && !task_is_waking(p)))
954 return rq; 933 return rq;
955 raw_spin_unlock(&rq->lock); 934 raw_spin_unlock(&rq->lock);
956 } 935 }
@@ -967,10 +946,12 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
967 struct rq *rq; 946 struct rq *rq;
968 947
969 for (;;) { 948 for (;;) {
949 while (task_is_waking(p))
950 cpu_relax();
970 local_irq_save(*flags); 951 local_irq_save(*flags);
971 rq = task_rq(p); 952 rq = task_rq(p);
972 raw_spin_lock(&rq->lock); 953 raw_spin_lock(&rq->lock);
973 if (likely(rq == task_rq(p))) 954 if (likely(rq == task_rq(p) && !task_is_waking(p)))
974 return rq; 955 return rq;
975 raw_spin_unlock_irqrestore(&rq->lock, *flags); 956 raw_spin_unlock_irqrestore(&rq->lock, *flags);
976 } 957 }
@@ -1390,32 +1371,6 @@ static const u32 prio_to_wmult[40] = {
1390 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 1371 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1391}; 1372};
1392 1373
1393static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1394
1395/*
1396 * runqueue iterator, to support SMP load-balancing between different
1397 * scheduling classes, without having to expose their internal data
1398 * structures to the load-balancing proper:
1399 */
1400struct rq_iterator {
1401 void *arg;
1402 struct task_struct *(*start)(void *);
1403 struct task_struct *(*next)(void *);
1404};
1405
1406#ifdef CONFIG_SMP
1407static unsigned long
1408balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1409 unsigned long max_load_move, struct sched_domain *sd,
1410 enum cpu_idle_type idle, int *all_pinned,
1411 int *this_best_prio, struct rq_iterator *iterator);
1412
1413static int
1414iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1415 struct sched_domain *sd, enum cpu_idle_type idle,
1416 struct rq_iterator *iterator);
1417#endif
1418
1419/* Time spent by the tasks of the cpu accounting group executing in ... */ 1374/* Time spent by the tasks of the cpu accounting group executing in ... */
1420enum cpuacct_stat_index { 1375enum cpuacct_stat_index {
1421 CPUACCT_STAT_USER, /* ... user mode */ 1376 CPUACCT_STAT_USER, /* ... user mode */
@@ -1531,7 +1486,7 @@ static unsigned long target_load(int cpu, int type)
1531 1486
1532static struct sched_group *group_of(int cpu) 1487static struct sched_group *group_of(int cpu)
1533{ 1488{
1534 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); 1489 struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd);
1535 1490
1536 if (!sd) 1491 if (!sd)
1537 return NULL; 1492 return NULL;
@@ -1701,16 +1656,6 @@ static void update_shares(struct sched_domain *sd)
1701 } 1656 }
1702} 1657}
1703 1658
1704static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1705{
1706 if (root_task_group_empty())
1707 return;
1708
1709 raw_spin_unlock(&rq->lock);
1710 update_shares(sd);
1711 raw_spin_lock(&rq->lock);
1712}
1713
1714static void update_h_load(long cpu) 1659static void update_h_load(long cpu)
1715{ 1660{
1716 if (root_task_group_empty()) 1661 if (root_task_group_empty())
@@ -1725,10 +1670,6 @@ static inline void update_shares(struct sched_domain *sd)
1725{ 1670{
1726} 1671}
1727 1672
1728static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1729{
1730}
1731
1732#endif 1673#endif
1733 1674
1734#ifdef CONFIG_PREEMPT 1675#ifdef CONFIG_PREEMPT
@@ -1805,6 +1746,51 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1805 raw_spin_unlock(&busiest->lock); 1746 raw_spin_unlock(&busiest->lock);
1806 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1747 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1807} 1748}
1749
1750/*
1751 * double_rq_lock - safely lock two runqueues
1752 *
1753 * Note this does not disable interrupts like task_rq_lock,
1754 * you need to do so manually before calling.
1755 */
1756static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1757 __acquires(rq1->lock)
1758 __acquires(rq2->lock)
1759{
1760 BUG_ON(!irqs_disabled());
1761 if (rq1 == rq2) {
1762 raw_spin_lock(&rq1->lock);
1763 __acquire(rq2->lock); /* Fake it out ;) */
1764 } else {
1765 if (rq1 < rq2) {
1766 raw_spin_lock(&rq1->lock);
1767 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1768 } else {
1769 raw_spin_lock(&rq2->lock);
1770 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1771 }
1772 }
1773 update_rq_clock(rq1);
1774 update_rq_clock(rq2);
1775}
1776
1777/*
1778 * double_rq_unlock - safely unlock two runqueues
1779 *
1780 * Note this does not restore interrupts like task_rq_unlock,
1781 * you need to do so manually after calling.
1782 */
1783static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1784 __releases(rq1->lock)
1785 __releases(rq2->lock)
1786{
1787 raw_spin_unlock(&rq1->lock);
1788 if (rq1 != rq2)
1789 raw_spin_unlock(&rq2->lock);
1790 else
1791 __release(rq2->lock);
1792}
1793
1808#endif 1794#endif
1809 1795
1810#ifdef CONFIG_FAIR_GROUP_SCHED 1796#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1834,18 +1820,14 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1834#endif 1820#endif
1835} 1821}
1836 1822
1837#include "sched_stats.h" 1823static const struct sched_class rt_sched_class;
1838#include "sched_idletask.c"
1839#include "sched_fair.c"
1840#include "sched_rt.c"
1841#ifdef CONFIG_SCHED_DEBUG
1842# include "sched_debug.c"
1843#endif
1844 1824
1845#define sched_class_highest (&rt_sched_class) 1825#define sched_class_highest (&rt_sched_class)
1846#define for_each_class(class) \ 1826#define for_each_class(class) \
1847 for (class = sched_class_highest; class; class = class->next) 1827 for (class = sched_class_highest; class; class = class->next)
1848 1828
1829#include "sched_stats.h"
1830
1849static void inc_nr_running(struct rq *rq) 1831static void inc_nr_running(struct rq *rq)
1850{ 1832{
1851 rq->nr_running++; 1833 rq->nr_running++;
@@ -1883,13 +1865,14 @@ static void update_avg(u64 *avg, u64 sample)
1883 *avg += diff >> 3; 1865 *avg += diff >> 3;
1884} 1866}
1885 1867
1886static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1868static void
1869enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1887{ 1870{
1888 if (wakeup) 1871 if (wakeup)
1889 p->se.start_runtime = p->se.sum_exec_runtime; 1872 p->se.start_runtime = p->se.sum_exec_runtime;
1890 1873
1891 sched_info_queued(p); 1874 sched_info_queued(p);
1892 p->sched_class->enqueue_task(rq, p, wakeup); 1875 p->sched_class->enqueue_task(rq, p, wakeup, head);
1893 p->se.on_rq = 1; 1876 p->se.on_rq = 1;
1894} 1877}
1895 1878
@@ -1912,6 +1895,37 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1912} 1895}
1913 1896
1914/* 1897/*
1898 * activate_task - move a task to the runqueue.
1899 */
1900static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1901{
1902 if (task_contributes_to_load(p))
1903 rq->nr_uninterruptible--;
1904
1905 enqueue_task(rq, p, wakeup, false);
1906 inc_nr_running(rq);
1907}
1908
1909/*
1910 * deactivate_task - remove a task from the runqueue.
1911 */
1912static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1913{
1914 if (task_contributes_to_load(p))
1915 rq->nr_uninterruptible++;
1916
1917 dequeue_task(rq, p, sleep);
1918 dec_nr_running(rq);
1919}
1920
1921#include "sched_idletask.c"
1922#include "sched_fair.c"
1923#include "sched_rt.c"
1924#ifdef CONFIG_SCHED_DEBUG
1925# include "sched_debug.c"
1926#endif
1927
1928/*
1915 * __normal_prio - return the priority that is based on the static prio 1929 * __normal_prio - return the priority that is based on the static prio
1916 */ 1930 */
1917static inline int __normal_prio(struct task_struct *p) 1931static inline int __normal_prio(struct task_struct *p)
@@ -1957,30 +1971,6 @@ static int effective_prio(struct task_struct *p)
1957 return p->prio; 1971 return p->prio;
1958} 1972}
1959 1973
1960/*
1961 * activate_task - move a task to the runqueue.
1962 */
1963static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1964{
1965 if (task_contributes_to_load(p))
1966 rq->nr_uninterruptible--;
1967
1968 enqueue_task(rq, p, wakeup);
1969 inc_nr_running(rq);
1970}
1971
1972/*
1973 * deactivate_task - remove a task from the runqueue.
1974 */
1975static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1976{
1977 if (task_contributes_to_load(p))
1978 rq->nr_uninterruptible++;
1979
1980 dequeue_task(rq, p, sleep);
1981 dec_nr_running(rq);
1982}
1983
1984/** 1974/**
1985 * task_curr - is this task currently executing on a CPU? 1975 * task_curr - is this task currently executing on a CPU?
1986 * @p: the task in question. 1976 * @p: the task in question.
@@ -2408,14 +2398,27 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2408 __task_rq_unlock(rq); 2398 __task_rq_unlock(rq);
2409 2399
2410 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2400 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2411 if (cpu != orig_cpu) 2401 if (cpu != orig_cpu) {
2402 /*
2403 * Since we migrate the task without holding any rq->lock,
2404 * we need to be careful with task_rq_lock(), since that
2405 * might end up locking an invalid rq.
2406 */
2412 set_task_cpu(p, cpu); 2407 set_task_cpu(p, cpu);
2408 }
2413 2409
2414 rq = __task_rq_lock(p); 2410 rq = cpu_rq(cpu);
2411 raw_spin_lock(&rq->lock);
2415 update_rq_clock(rq); 2412 update_rq_clock(rq);
2416 2413
2414 /*
2415 * We migrated the task without holding either rq->lock, however
2416 * since the task is not on the task list itself, nobody else
2417 * will try and migrate the task, hence the rq should match the
2418 * cpu we just moved it to.
2419 */
2420 WARN_ON(task_cpu(p) != cpu);
2417 WARN_ON(p->state != TASK_WAKING); 2421 WARN_ON(p->state != TASK_WAKING);
2418 cpu = task_cpu(p);
2419 2422
2420#ifdef CONFIG_SCHEDSTATS 2423#ifdef CONFIG_SCHEDSTATS
2421 schedstat_inc(rq, ttwu_count); 2424 schedstat_inc(rq, ttwu_count);
@@ -2663,7 +2666,13 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2663 set_task_cpu(p, cpu); 2666 set_task_cpu(p, cpu);
2664#endif 2667#endif
2665 2668
2666 rq = task_rq_lock(p, &flags); 2669 /*
2670 * Since the task is not on the rq and we still have TASK_WAKING set
2671 * nobody else will migrate this task.
2672 */
2673 rq = cpu_rq(cpu);
2674 raw_spin_lock_irqsave(&rq->lock, flags);
2675
2667 BUG_ON(p->state != TASK_WAKING); 2676 BUG_ON(p->state != TASK_WAKING);
2668 p->state = TASK_RUNNING; 2677 p->state = TASK_RUNNING;
2669 update_rq_clock(rq); 2678 update_rq_clock(rq);
@@ -2794,7 +2803,13 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2794 */ 2803 */
2795 prev_state = prev->state; 2804 prev_state = prev->state;
2796 finish_arch_switch(prev); 2805 finish_arch_switch(prev);
2797 perf_event_task_sched_in(current, cpu_of(rq)); 2806#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2807 local_irq_disable();
2808#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2809 perf_event_task_sched_in(current);
2810#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2811 local_irq_enable();
2812#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2798 finish_lock_switch(rq, prev); 2813 finish_lock_switch(rq, prev);
2799 2814
2800 fire_sched_in_preempt_notifiers(current); 2815 fire_sched_in_preempt_notifiers(current);
@@ -3099,50 +3114,6 @@ static void update_cpu_load(struct rq *this_rq)
3099#ifdef CONFIG_SMP 3114#ifdef CONFIG_SMP
3100 3115
3101/* 3116/*
3102 * double_rq_lock - safely lock two runqueues
3103 *
3104 * Note this does not disable interrupts like task_rq_lock,
3105 * you need to do so manually before calling.
3106 */
3107static void double_rq_lock(struct rq *rq1, struct rq *rq2)
3108 __acquires(rq1->lock)
3109 __acquires(rq2->lock)
3110{
3111 BUG_ON(!irqs_disabled());
3112 if (rq1 == rq2) {
3113 raw_spin_lock(&rq1->lock);
3114 __acquire(rq2->lock); /* Fake it out ;) */
3115 } else {
3116 if (rq1 < rq2) {
3117 raw_spin_lock(&rq1->lock);
3118 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
3119 } else {
3120 raw_spin_lock(&rq2->lock);
3121 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
3122 }
3123 }
3124 update_rq_clock(rq1);
3125 update_rq_clock(rq2);
3126}
3127
3128/*
3129 * double_rq_unlock - safely unlock two runqueues
3130 *
3131 * Note this does not restore interrupts like task_rq_unlock,
3132 * you need to do so manually after calling.
3133 */
3134static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
3135 __releases(rq1->lock)
3136 __releases(rq2->lock)
3137{
3138 raw_spin_unlock(&rq1->lock);
3139 if (rq1 != rq2)
3140 raw_spin_unlock(&rq2->lock);
3141 else
3142 __release(rq2->lock);
3143}
3144
3145/*
3146 * sched_exec - execve() is a valuable balancing opportunity, because at 3117 * sched_exec - execve() is a valuable balancing opportunity, because at
3147 * this point the task has the smallest effective memory and cache footprint. 3118 * this point the task has the smallest effective memory and cache footprint.
3148 */ 3119 */
@@ -3190,1771 +3161,6 @@ again:
3190 task_rq_unlock(rq, &flags); 3161 task_rq_unlock(rq, &flags);
3191} 3162}
3192 3163
3193/*
3194 * pull_task - move a task from a remote runqueue to the local runqueue.
3195 * Both runqueues must be locked.
3196 */
3197static void pull_task(struct rq *src_rq, struct task_struct *p,
3198 struct rq *this_rq, int this_cpu)
3199{
3200 deactivate_task(src_rq, p, 0);
3201 set_task_cpu(p, this_cpu);
3202 activate_task(this_rq, p, 0);
3203 check_preempt_curr(this_rq, p, 0);
3204}
3205
3206/*
3207 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3208 */
3209static
3210int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3211 struct sched_domain *sd, enum cpu_idle_type idle,
3212 int *all_pinned)
3213{
3214 int tsk_cache_hot = 0;
3215 /*
3216 * We do not migrate tasks that are:
3217 * 1) running (obviously), or
3218 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3219 * 3) are cache-hot on their current CPU.
3220 */
3221 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
3222 schedstat_inc(p, se.nr_failed_migrations_affine);
3223 return 0;
3224 }
3225 *all_pinned = 0;
3226
3227 if (task_running(rq, p)) {
3228 schedstat_inc(p, se.nr_failed_migrations_running);
3229 return 0;
3230 }
3231
3232 /*
3233 * Aggressive migration if:
3234 * 1) task is cache cold, or
3235 * 2) too many balance attempts have failed.
3236 */
3237
3238 tsk_cache_hot = task_hot(p, rq->clock, sd);
3239 if (!tsk_cache_hot ||
3240 sd->nr_balance_failed > sd->cache_nice_tries) {
3241#ifdef CONFIG_SCHEDSTATS
3242 if (tsk_cache_hot) {
3243 schedstat_inc(sd, lb_hot_gained[idle]);
3244 schedstat_inc(p, se.nr_forced_migrations);
3245 }
3246#endif
3247 return 1;
3248 }
3249
3250 if (tsk_cache_hot) {
3251 schedstat_inc(p, se.nr_failed_migrations_hot);
3252 return 0;
3253 }
3254 return 1;
3255}
3256
3257static unsigned long
3258balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3259 unsigned long max_load_move, struct sched_domain *sd,
3260 enum cpu_idle_type idle, int *all_pinned,
3261 int *this_best_prio, struct rq_iterator *iterator)
3262{
3263 int loops = 0, pulled = 0, pinned = 0;
3264 struct task_struct *p;
3265 long rem_load_move = max_load_move;
3266
3267 if (max_load_move == 0)
3268 goto out;
3269
3270 pinned = 1;
3271
3272 /*
3273 * Start the load-balancing iterator:
3274 */
3275 p = iterator->start(iterator->arg);
3276next:
3277 if (!p || loops++ > sysctl_sched_nr_migrate)
3278 goto out;
3279
3280 if ((p->se.load.weight >> 1) > rem_load_move ||
3281 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3282 p = iterator->next(iterator->arg);
3283 goto next;
3284 }
3285
3286 pull_task(busiest, p, this_rq, this_cpu);
3287 pulled++;
3288 rem_load_move -= p->se.load.weight;
3289
3290#ifdef CONFIG_PREEMPT
3291 /*
3292 * NEWIDLE balancing is a source of latency, so preemptible kernels
3293 * will stop after the first task is pulled to minimize the critical
3294 * section.
3295 */
3296 if (idle == CPU_NEWLY_IDLE)
3297 goto out;
3298#endif
3299
3300 /*
3301 * We only want to steal up to the prescribed amount of weighted load.
3302 */
3303 if (rem_load_move > 0) {
3304 if (p->prio < *this_best_prio)
3305 *this_best_prio = p->prio;
3306 p = iterator->next(iterator->arg);
3307 goto next;
3308 }
3309out:
3310 /*
3311 * Right now, this is one of only two places pull_task() is called,
3312 * so we can safely collect pull_task() stats here rather than
3313 * inside pull_task().
3314 */
3315 schedstat_add(sd, lb_gained[idle], pulled);
3316
3317 if (all_pinned)
3318 *all_pinned = pinned;
3319
3320 return max_load_move - rem_load_move;
3321}
3322
3323/*
3324 * move_tasks tries to move up to max_load_move weighted load from busiest to
3325 * this_rq, as part of a balancing operation within domain "sd".
3326 * Returns 1 if successful and 0 otherwise.
3327 *
3328 * Called with both runqueues locked.
3329 */
3330static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3331 unsigned long max_load_move,
3332 struct sched_domain *sd, enum cpu_idle_type idle,
3333 int *all_pinned)
3334{
3335 const struct sched_class *class = sched_class_highest;
3336 unsigned long total_load_moved = 0;
3337 int this_best_prio = this_rq->curr->prio;
3338
3339 do {
3340 total_load_moved +=
3341 class->load_balance(this_rq, this_cpu, busiest,
3342 max_load_move - total_load_moved,
3343 sd, idle, all_pinned, &this_best_prio);
3344 class = class->next;
3345
3346#ifdef CONFIG_PREEMPT
3347 /*
3348 * NEWIDLE balancing is a source of latency, so preemptible
3349 * kernels will stop after the first task is pulled to minimize
3350 * the critical section.
3351 */
3352 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3353 break;
3354#endif
3355 } while (class && max_load_move > total_load_moved);
3356
3357 return total_load_moved > 0;
3358}
3359
3360static int
3361iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3362 struct sched_domain *sd, enum cpu_idle_type idle,
3363 struct rq_iterator *iterator)
3364{
3365 struct task_struct *p = iterator->start(iterator->arg);
3366 int pinned = 0;
3367
3368 while (p) {
3369 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3370 pull_task(busiest, p, this_rq, this_cpu);
3371 /*
3372 * Right now, this is only the second place pull_task()
3373 * is called, so we can safely collect pull_task()
3374 * stats here rather than inside pull_task().
3375 */
3376 schedstat_inc(sd, lb_gained[idle]);
3377
3378 return 1;
3379 }
3380 p = iterator->next(iterator->arg);
3381 }
3382
3383 return 0;
3384}
3385
3386/*
3387 * move_one_task tries to move exactly one task from busiest to this_rq, as
3388 * part of active balancing operations within "domain".
3389 * Returns 1 if successful and 0 otherwise.
3390 *
3391 * Called with both runqueues locked.
3392 */
3393static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3394 struct sched_domain *sd, enum cpu_idle_type idle)
3395{
3396 const struct sched_class *class;
3397
3398 for_each_class(class) {
3399 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3400 return 1;
3401 }
3402
3403 return 0;
3404}
3405/********** Helpers for find_busiest_group ************************/
3406/*
3407 * sd_lb_stats - Structure to store the statistics of a sched_domain
3408 * during load balancing.
3409 */
3410struct sd_lb_stats {
3411 struct sched_group *busiest; /* Busiest group in this sd */
3412 struct sched_group *this; /* Local group in this sd */
3413 unsigned long total_load; /* Total load of all groups in sd */
3414 unsigned long total_pwr; /* Total power of all groups in sd */
3415 unsigned long avg_load; /* Average load across all groups in sd */
3416
3417 /** Statistics of this group */
3418 unsigned long this_load;
3419 unsigned long this_load_per_task;
3420 unsigned long this_nr_running;
3421
3422 /* Statistics of the busiest group */
3423 unsigned long max_load;
3424 unsigned long busiest_load_per_task;
3425 unsigned long busiest_nr_running;
3426
3427 int group_imb; /* Is there imbalance in this sd */
3428#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3429 int power_savings_balance; /* Is powersave balance needed for this sd */
3430 struct sched_group *group_min; /* Least loaded group in sd */
3431 struct sched_group *group_leader; /* Group which relieves group_min */
3432 unsigned long min_load_per_task; /* load_per_task in group_min */
3433 unsigned long leader_nr_running; /* Nr running of group_leader */
3434 unsigned long min_nr_running; /* Nr running of group_min */
3435#endif
3436};
3437
3438/*
3439 * sg_lb_stats - stats of a sched_group required for load_balancing
3440 */
3441struct sg_lb_stats {
3442 unsigned long avg_load; /*Avg load across the CPUs of the group */
3443 unsigned long group_load; /* Total load over the CPUs of the group */
3444 unsigned long sum_nr_running; /* Nr tasks running in the group */
3445 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
3446 unsigned long group_capacity;
3447 int group_imb; /* Is there an imbalance in the group ? */
3448};
3449
3450/**
3451 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3452 * @group: The group whose first cpu is to be returned.
3453 */
3454static inline unsigned int group_first_cpu(struct sched_group *group)
3455{
3456 return cpumask_first(sched_group_cpus(group));
3457}
3458
3459/**
3460 * get_sd_load_idx - Obtain the load index for a given sched domain.
3461 * @sd: The sched_domain whose load_idx is to be obtained.
3462 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
3463 */
3464static inline int get_sd_load_idx(struct sched_domain *sd,
3465 enum cpu_idle_type idle)
3466{
3467 int load_idx;
3468
3469 switch (idle) {
3470 case CPU_NOT_IDLE:
3471 load_idx = sd->busy_idx;
3472 break;
3473
3474 case CPU_NEWLY_IDLE:
3475 load_idx = sd->newidle_idx;
3476 break;
3477 default:
3478 load_idx = sd->idle_idx;
3479 break;
3480 }
3481
3482 return load_idx;
3483}
3484
3485
3486#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3487/**
3488 * init_sd_power_savings_stats - Initialize power savings statistics for
3489 * the given sched_domain, during load balancing.
3490 *
3491 * @sd: Sched domain whose power-savings statistics are to be initialized.
3492 * @sds: Variable containing the statistics for sd.
3493 * @idle: Idle status of the CPU at which we're performing load-balancing.
3494 */
3495static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3496 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3497{
3498 /*
3499 * Busy processors will not participate in power savings
3500 * balance.
3501 */
3502 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3503 sds->power_savings_balance = 0;
3504 else {
3505 sds->power_savings_balance = 1;
3506 sds->min_nr_running = ULONG_MAX;
3507 sds->leader_nr_running = 0;
3508 }
3509}
3510
3511/**
3512 * update_sd_power_savings_stats - Update the power saving stats for a
3513 * sched_domain while performing load balancing.
3514 *
3515 * @group: sched_group belonging to the sched_domain under consideration.
3516 * @sds: Variable containing the statistics of the sched_domain
3517 * @local_group: Does group contain the CPU for which we're performing
3518 * load balancing ?
3519 * @sgs: Variable containing the statistics of the group.
3520 */
static inline void update_sd_power_savings_stats(struct sched_group *group,
	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
{

	if (!sds->power_savings_balance)
		return;

	/*
	 * If the local group is idle or completely loaded
	 * no need to do power savings balance at this domain
	 */
	if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
				!sds->this_nr_running))
		sds->power_savings_balance = 0;

	/*
	 * If a group is already running at full capacity or idle,
	 * don't include that group in power savings calculations
	 */
	if (!sds->power_savings_balance ||
		sgs->sum_nr_running >= sgs->group_capacity ||
		!sgs->sum_nr_running)
		return;

	/*
	 * Calculate the group which has the least non-idle load.
	 * This is the group from where we need to pick up the load
	 * for saving power.  Ties are broken by first-cpu id so the
	 * choice is stable across repeated balancing passes.
	 */
	if ((sgs->sum_nr_running < sds->min_nr_running) ||
	    (sgs->sum_nr_running == sds->min_nr_running &&
	     group_first_cpu(group) > group_first_cpu(sds->group_min))) {
		sds->group_min = group;
		sds->min_nr_running = sgs->sum_nr_running;
		/* sum_nr_running is non-zero here (checked above). */
		sds->min_load_per_task = sgs->sum_weighted_load /
					   sgs->sum_nr_running;
	}

	/*
	 * Calculate the group which is almost near its
	 * capacity but still has some space to pick up some load
	 * from other group and save more power
	 */
	if (sgs->sum_nr_running + 1 > sgs->group_capacity)
		return;

	/* Ties for leader are broken toward the lower first-cpu id. */
	if (sgs->sum_nr_running > sds->leader_nr_running ||
	    (sgs->sum_nr_running == sds->leader_nr_running &&
	     group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
		sds->group_leader = group;
		sds->leader_nr_running = sgs->sum_nr_running;
	}
}
3574
3575/**
3576 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3577 * @sds: Variable containing the statistics of the sched_domain
3578 * under consideration.
3579 * @this_cpu: Cpu at which we're currently performing load-balancing.
3580 * @imbalance: Variable to store the imbalance.
3581 *
3582 * Description:
3583 * Check if we have potential to perform some power-savings balance.
3584 * If yes, set the busiest group to be the least loaded group in the
3585 * sched_domain, so that it's CPUs can be put to idle.
3586 *
3587 * Returns 1 if there is potential to perform power-savings balance.
3588 * Else returns 0.
3589 */
3590static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3591 int this_cpu, unsigned long *imbalance)
3592{
3593 if (!sds->power_savings_balance)
3594 return 0;
3595
3596 if (sds->this != sds->group_leader ||
3597 sds->group_leader == sds->group_min)
3598 return 0;
3599
3600 *imbalance = sds->min_load_per_task;
3601 sds->busiest = sds->group_min;
3602
3603 return 1;
3604
3605}
#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
/*
 * Power-savings balancing is configured out: provide no-op stubs so the
 * generic load-balancing paths below need no #ifdefs at their call sites.
 */
static inline void init_sd_power_savings_stats(struct sched_domain *sd,
	struct sd_lb_stats *sds, enum cpu_idle_type idle)
{
	return;
}

static inline void update_sd_power_savings_stats(struct sched_group *group,
	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
{
	return;
}

/* Always reports "no power-savings balancing opportunity". */
static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
	int this_cpu, unsigned long *imbalance)
{
	return 0;
}
#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3625
3626
3627unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3628{
3629 return SCHED_LOAD_SCALE;
3630}
3631
3632unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3633{
3634 return default_scale_freq_power(sd, cpu);
3635}
3636
3637unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3638{
3639 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3640 unsigned long smt_gain = sd->smt_gain;
3641
3642 smt_gain /= weight;
3643
3644 return smt_gain;
3645}
3646
3647unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3648{
3649 return default_scale_smt_power(sd, cpu);
3650}
3651
3652unsigned long scale_rt_power(int cpu)
3653{
3654 struct rq *rq = cpu_rq(cpu);
3655 u64 total, available;
3656
3657 sched_avg_update(rq);
3658
3659 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3660 available = total - rq->rt_avg;
3661
3662 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3663 total = SCHED_LOAD_SCALE;
3664
3665 total >>= SCHED_LOAD_SHIFT;
3666
3667 return div_u64(available, total);
3668}
3669
3670static void update_cpu_power(struct sched_domain *sd, int cpu)
3671{
3672 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3673 unsigned long power = SCHED_LOAD_SCALE;
3674 struct sched_group *sdg = sd->groups;
3675
3676 if (sched_feat(ARCH_POWER))
3677 power *= arch_scale_freq_power(sd, cpu);
3678 else
3679 power *= default_scale_freq_power(sd, cpu);
3680
3681 power >>= SCHED_LOAD_SHIFT;
3682
3683 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3684 if (sched_feat(ARCH_POWER))
3685 power *= arch_scale_smt_power(sd, cpu);
3686 else
3687 power *= default_scale_smt_power(sd, cpu);
3688
3689 power >>= SCHED_LOAD_SHIFT;
3690 }
3691
3692 power *= scale_rt_power(cpu);
3693 power >>= SCHED_LOAD_SHIFT;
3694
3695 if (!power)
3696 power = 1;
3697
3698 sdg->cpu_power = power;
3699}
3700
3701static void update_group_power(struct sched_domain *sd, int cpu)
3702{
3703 struct sched_domain *child = sd->child;
3704 struct sched_group *group, *sdg = sd->groups;
3705 unsigned long power;
3706
3707 if (!child) {
3708 update_cpu_power(sd, cpu);
3709 return;
3710 }
3711
3712 power = 0;
3713
3714 group = child->groups;
3715 do {
3716 power += group->cpu_power;
3717 group = group->next;
3718 } while (group != child->groups);
3719
3720 sdg->cpu_power = power;
3721}
3722
3723/**
3724 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3725 * @sd: The sched_domain whose statistics are to be updated.
3726 * @group: sched_group whose statistics are to be updated.
3727 * @this_cpu: Cpu for which load balance is currently performed.
3728 * @idle: Idle status of this_cpu
3729 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3730 * @sd_idle: Idle status of the sched_domain containing group.
3731 * @local_group: Does group contain this_cpu.
3732 * @cpus: Set of cpus considered for load balancing.
3733 * @balance: Should we balance.
3734 * @sgs: variable to hold the statistics for this group.
3735 */
static inline void update_sg_lb_stats(struct sched_domain *sd,
			struct sched_group *group, int this_cpu,
			enum cpu_idle_type idle, int load_idx, int *sd_idle,
			int local_group, const struct cpumask *cpus,
			int *balance, struct sg_lb_stats *sgs)
{
	unsigned long load, max_cpu_load, min_cpu_load;
	int i;
	unsigned int balance_cpu = -1, first_idle_cpu = 0;
	unsigned long sum_avg_load_per_task;
	unsigned long avg_load_per_task;

	/*
	 * The designated cpu of the local group (its first cpu, or the
	 * first idle one found below) does the balancing; that cpu also
	 * refreshes the group's cpu_power.
	 */
	if (local_group) {
		balance_cpu = group_first_cpu(group);
		if (balance_cpu == this_cpu)
			update_group_power(sd, this_cpu);
	}

	/* Tally up the load of all CPUs in the group */
	sum_avg_load_per_task = avg_load_per_task = 0;
	max_cpu_load = 0;
	min_cpu_load = ~0UL;

	for_each_cpu_and(i, sched_group_cpus(group), cpus) {
		struct rq *rq = cpu_rq(i);

		/* Any runnable task means this domain is not idle. */
		if (*sd_idle && rq->nr_running)
			*sd_idle = 0;

		/* Bias balancing toward cpus of our domain */
		if (local_group) {
			if (idle_cpu(i) && !first_idle_cpu) {
				first_idle_cpu = 1;
				balance_cpu = i;
			}

			load = target_load(i, load_idx);
		} else {
			load = source_load(i, load_idx);
			if (load > max_cpu_load)
				max_cpu_load = load;
			if (min_cpu_load > load)
				min_cpu_load = load;
		}

		sgs->group_load += load;
		sgs->sum_nr_running += rq->nr_running;
		sgs->sum_weighted_load += weighted_cpuload(i);

		sum_avg_load_per_task += cpu_avg_load_per_task(i);
	}

	/*
	 * First idle cpu or the first cpu(busiest) in this sched group
	 * is eligible for doing load balancing at this and above
	 * domains. In the newly idle case, we will allow all the cpu's
	 * to do the newly idle load balance.
	 */
	if (idle != CPU_NEWLY_IDLE && local_group &&
	    balance_cpu != this_cpu && balance) {
		*balance = 0;
		return;
	}

	/* Adjust by relative CPU power of the group */
	sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;


	/*
	 * Consider the group unbalanced when the imbalance is larger
	 * than the average weight of two tasks.
	 *
	 * APZ: with cgroup the avg task weight can vary wildly and
	 * might not be a suitable number - should we keep a
	 * normalized nr_running number somewhere that negates
	 * the hierarchy?
	 */
	avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
		group->cpu_power;

	if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
		sgs->group_imb = 1;

	/* Capacity in whole tasks: cpu_power rounded to load-scale units. */
	sgs->group_capacity =
		DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
}
3822
3823/**
3824 * update_sd_lb_stats - Update sched_group's statistics for load balancing.
3825 * @sd: sched_domain whose statistics are to be updated.
3826 * @this_cpu: Cpu for which load balance is currently performed.
3827 * @idle: Idle status of this_cpu
3828 * @sd_idle: Idle status of the sched_domain containing group.
3829 * @cpus: Set of cpus considered for load balancing.
3830 * @balance: Should we balance.
3831 * @sds: variable to hold the statistics for this sched_domain.
3832 */
static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
					enum cpu_idle_type idle, int *sd_idle,
					const struct cpumask *cpus, int *balance,
					struct sd_lb_stats *sds)
{
	struct sched_domain *child = sd->child;
	struct sched_group *group = sd->groups;
	struct sg_lb_stats sgs;
	int load_idx, prefer_sibling = 0;

	if (child && child->flags & SD_PREFER_SIBLING)
		prefer_sibling = 1;

	init_sd_power_savings_stats(sd, sds, idle);
	load_idx = get_sd_load_idx(sd, idle);

	/* Groups form a circular list; visit each exactly once. */
	do {
		int local_group;

		local_group = cpumask_test_cpu(this_cpu,
					       sched_group_cpus(group));
		memset(&sgs, 0, sizeof(sgs));
		update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
				local_group, cpus, balance, &sgs);

		/* this_cpu turned out not to be the one to balance here. */
		if (local_group && balance && !(*balance))
			return;

		sds->total_load += sgs.group_load;
		sds->total_pwr += group->cpu_power;

		/*
		 * In case the child domain prefers tasks go to siblings
		 * first, lower the group capacity to one so that we'll try
		 * and move all the excess tasks away.
		 */
		if (prefer_sibling)
			sgs.group_capacity = min(sgs.group_capacity, 1UL);

		if (local_group) {
			sds->this_load = sgs.avg_load;
			sds->this = group;
			sds->this_nr_running = sgs.sum_nr_running;
			sds->this_load_per_task = sgs.sum_weighted_load;
		} else if (sgs.avg_load > sds->max_load &&
			   (sgs.sum_nr_running > sgs.group_capacity ||
				sgs.group_imb)) {
			/* Most loaded over-capacity (or imbalanced) group. */
			sds->max_load = sgs.avg_load;
			sds->busiest = group;
			sds->busiest_nr_running = sgs.sum_nr_running;
			sds->busiest_load_per_task = sgs.sum_weighted_load;
			sds->group_imb = sgs.group_imb;
		}

		update_sd_power_savings_stats(group, sds, local_group, &sgs);
		group = group->next;
	} while (group != sd->groups);
}
3891
3892/**
3893 * fix_small_imbalance - Calculate the minor imbalance that exists
3894 * amongst the groups of a sched_domain, during
3895 * load balancing.
3896 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3897 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3898 * @imbalance: Variable to store the imbalance.
3899 */
static inline void fix_small_imbalance(struct sd_lb_stats *sds,
				int this_cpu, unsigned long *imbalance)
{
	unsigned long tmp, pwr_now = 0, pwr_move = 0;
	unsigned int imbn = 2;

	if (sds->this_nr_running) {
		sds->this_load_per_task /= sds->this_nr_running;
		if (sds->busiest_load_per_task >
				sds->this_load_per_task)
			imbn = 1;
	} else
		sds->this_load_per_task =
			cpu_avg_load_per_task(this_cpu);

	/* Moving one whole task would fix the imbalance: do that. */
	if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
			sds->busiest_load_per_task * imbn) {
		*imbalance = sds->busiest_load_per_task;
		return;
	}

	/*
	 * OK, we don't have enough imbalance to justify moving tasks,
	 * however we may be able to increase total CPU power used by
	 * moving them.
	 */

	/* Effective compute power extracted with the current placement. */
	pwr_now += sds->busiest->cpu_power *
			min(sds->busiest_load_per_task, sds->max_load);
	pwr_now += sds->this->cpu_power *
			min(sds->this_load_per_task, sds->this_load);
	pwr_now /= SCHED_LOAD_SCALE;

	/* Amount of load we'd subtract */
	tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
		sds->busiest->cpu_power;
	if (sds->max_load > tmp)
		pwr_move += sds->busiest->cpu_power *
			min(sds->busiest_load_per_task, sds->max_load - tmp);

	/* Amount of load we'd add */
	if (sds->max_load * sds->busiest->cpu_power <
		sds->busiest_load_per_task * SCHED_LOAD_SCALE)
		tmp = (sds->max_load * sds->busiest->cpu_power) /
			sds->this->cpu_power;
	else
		tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
			sds->this->cpu_power;
	pwr_move += sds->this->cpu_power *
			min(sds->this_load_per_task, sds->this_load + tmp);
	pwr_move /= SCHED_LOAD_SCALE;

	/* Move if we gain throughput */
	if (pwr_move > pwr_now)
		*imbalance = sds->busiest_load_per_task;
}
3956
3957/**
3958 * calculate_imbalance - Calculate the amount of imbalance present within the
3959 * groups of a given sched_domain during load balance.
3960 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
3961 * @this_cpu: Cpu for which currently load balance is being performed.
3962 * @imbalance: The variable to store the imbalance.
3963 */
static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
		unsigned long *imbalance)
{
	unsigned long max_pull;
	/*
	 * In the presence of smp nice balancing, certain scenarios can have
	 * max load less than avg load(as we skip the groups at or below
	 * its cpu_power, while calculating max_load..)
	 */
	if (sds->max_load < sds->avg_load) {
		*imbalance = 0;
		return fix_small_imbalance(sds, this_cpu, imbalance);
	}

	/* Don't want to pull so many tasks that a group would go idle */
	max_pull = min(sds->max_load - sds->avg_load,
			sds->max_load - sds->busiest_load_per_task);

	/* How much load to actually move to equalise the imbalance */
	*imbalance = min(max_pull * sds->busiest->cpu_power,
		(sds->avg_load - sds->this_load) * sds->this->cpu_power)
			/ SCHED_LOAD_SCALE;

	/*
	 * if *imbalance is less than the average load per runnable task
	 * there is no guarantee that any tasks will be moved so we'll have
	 * a think about bumping its value to force at least one task to be
	 * moved
	 */
	if (*imbalance < sds->busiest_load_per_task)
		return fix_small_imbalance(sds, this_cpu, imbalance);

}
3997/******* find_busiest_group() helpers end here *********************/
3998
3999/**
4000 * find_busiest_group - Returns the busiest group within the sched_domain
4001 * if there is an imbalance. If there isn't an imbalance, and
4002 * the user has opted for power-savings, it returns a group whose
4003 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
4004 * such a group exists.
4005 *
4006 * Also calculates the amount of weighted load which should be moved
4007 * to restore balance.
4008 *
4009 * @sd: The sched_domain whose busiest group is to be returned.
4010 * @this_cpu: The cpu for which load balancing is currently being performed.
4011 * @imbalance: Variable which stores amount of weighted load which should
4012 * be moved to restore balance/put a group to idle.
4013 * @idle: The idle status of this_cpu.
4014 * @sd_idle: The idleness of sd
4015 * @cpus: The set of CPUs under consideration for load-balancing.
4016 * @balance: Pointer to a variable indicating if this_cpu
4017 * is the appropriate cpu to perform load balancing at this_level.
4018 *
4019 * Returns: - the busiest group if imbalance exists.
4020 * - If no imbalance and user has opted for power-savings balance,
4021 * return the least loaded group whose CPUs can be
4022 * put to idle by rebalancing its tasks onto our group.
4023 */
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
		   unsigned long *imbalance, enum cpu_idle_type idle,
		   int *sd_idle, const struct cpumask *cpus, int *balance)
{
	struct sd_lb_stats sds;

	memset(&sds, 0, sizeof(sds));

	/*
	 * Compute the various statistics relevant for load balancing at
	 * this level.
	 */
	update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
					balance, &sds);

	/* Cases where imbalance does not exist from POV of this_cpu */
	/* 1) this_cpu is not the appropriate cpu to perform load balancing
	 *    at this level.
	 * 2) There is no busy sibling group to pull from.
	 * 3) This group is the busiest group.
	 * 4) This group is more busy than the avg busyness at this
	 *    sched_domain.
	 * 5) The imbalance is within the specified limit.
	 * 6) Any rebalance would lead to ping-pong
	 */
	if (balance && !(*balance))
		goto ret;

	if (!sds.busiest || sds.busiest_nr_running == 0)
		goto out_balanced;

	if (sds.this_load >= sds.max_load)
		goto out_balanced;

	/* Domain-wide average load, in SCHED_LOAD_SCALE units. */
	sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;

	if (sds.this_load >= sds.avg_load)
		goto out_balanced;

	if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
		goto out_balanced;

	sds.busiest_load_per_task /= sds.busiest_nr_running;
	if (sds.group_imb)
		sds.busiest_load_per_task =
			min(sds.busiest_load_per_task, sds.avg_load);

	/*
	 * We're trying to get all the cpus to the average_load, so we don't
	 * want to push ourselves above the average load, nor do we wish to
	 * reduce the max loaded cpu below the average load, as either of these
	 * actions would just result in more rebalancing later, and ping-pong
	 * tasks around. Thus we look for the minimum possible imbalance.
	 * Negative imbalances (*we* are more loaded than anyone else) will
	 * be counted as no imbalance for these purposes -- we can't fix that
	 * by pulling tasks to us. Be careful of negative numbers as they'll
	 * appear as very large values with unsigned longs.
	 */
	if (sds.max_load <= sds.busiest_load_per_task)
		goto out_balanced;

	/* Looks like there is an imbalance. Compute it */
	calculate_imbalance(&sds, this_cpu, imbalance);
	return sds.busiest;

out_balanced:
	/*
	 * There is no obvious imbalance. But check if we can do some balancing
	 * to save power.
	 */
	if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
		return sds.busiest;
ret:
	*imbalance = 0;
	return NULL;
}
4101
4102/*
4103 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4104 */
4105static struct rq *
4106find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
4107 unsigned long imbalance, const struct cpumask *cpus)
4108{
4109 struct rq *busiest = NULL, *rq;
4110 unsigned long max_load = 0;
4111 int i;
4112
4113 for_each_cpu(i, sched_group_cpus(group)) {
4114 unsigned long power = power_of(i);
4115 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
4116 unsigned long wl;
4117
4118 if (!cpumask_test_cpu(i, cpus))
4119 continue;
4120
4121 rq = cpu_rq(i);
4122 wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
4123 wl /= power;
4124
4125 if (capacity && rq->nr_running == 1 && wl > imbalance)
4126 continue;
4127
4128 if (wl > max_load) {
4129 max_load = wl;
4130 busiest = rq;
4131 }
4132 }
4133
4134 return busiest;
4135}
4136
4137/*
4138 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
4139 * so long as it is large enough.
4140 */
4141#define MAX_PINNED_INTERVAL 512
4142
4143/* Working cpumask for load_balance and load_balance_newidle. */
4144static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4145
4146/*
4147 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4148 * tasks if there is an imbalance.
4149 */
static int load_balance(int this_cpu, struct rq *this_rq,
			struct sched_domain *sd, enum cpu_idle_type idle,
			int *balance)
{
	int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
	struct sched_group *group;
	unsigned long imbalance;
	struct rq *busiest;
	unsigned long flags;
	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);

	cpumask_copy(cpus, cpu_active_mask);

	/*
	 * When power savings policy is enabled for the parent domain, idle
	 * sibling can pick up load irrespective of busy siblings. In this case,
	 * let the state of idle sibling percolate up as CPU_IDLE, instead of
	 * portraying it as CPU_NOT_IDLE.
	 */
	if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
		sd_idle = 1;

	schedstat_inc(sd, lb_count[idle]);

redo:
	update_shares(sd);
	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
				   cpus, balance);

	/* Some other cpu of the local group is responsible for this level. */
	if (*balance == 0)
		goto out_balanced;

	if (!group) {
		schedstat_inc(sd, lb_nobusyg[idle]);
		goto out_balanced;
	}

	busiest = find_busiest_queue(group, idle, imbalance, cpus);
	if (!busiest) {
		schedstat_inc(sd, lb_nobusyq[idle]);
		goto out_balanced;
	}

	BUG_ON(busiest == this_rq);

	schedstat_add(sd, lb_imbalance[idle], imbalance);

	ld_moved = 0;
	if (busiest->nr_running > 1) {
		/*
		 * Attempt to move tasks. If find_busiest_group has found
		 * an imbalance but busiest->nr_running <= 1, the group is
		 * still unbalanced. ld_moved simply stays zero, so it is
		 * correctly treated as an imbalance.
		 */
		local_irq_save(flags);
		double_rq_lock(this_rq, busiest);
		ld_moved = move_tasks(this_rq, this_cpu, busiest,
				      imbalance, sd, idle, &all_pinned);
		double_rq_unlock(this_rq, busiest);
		local_irq_restore(flags);

		/*
		 * some other cpu did the load balance for us.
		 */
		if (ld_moved && this_cpu != smp_processor_id())
			resched_cpu(this_cpu);

		/* All tasks on this runqueue were pinned by CPU affinity */
		if (unlikely(all_pinned)) {
			/* Exclude the stuck cpu and retry with the rest. */
			cpumask_clear_cpu(cpu_of(busiest), cpus);
			if (!cpumask_empty(cpus))
				goto redo;
			goto out_balanced;
		}
	}

	if (!ld_moved) {
		schedstat_inc(sd, lb_failed[idle]);
		sd->nr_balance_failed++;

		/* Enough failures: escalate to active (push) balancing. */
		if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {

			raw_spin_lock_irqsave(&busiest->lock, flags);

			/* don't kick the migration_thread, if the curr
			 * task on busiest cpu can't be moved to this_cpu
			 */
			if (!cpumask_test_cpu(this_cpu,
					      &busiest->curr->cpus_allowed)) {
				raw_spin_unlock_irqrestore(&busiest->lock,
							    flags);
				all_pinned = 1;
				goto out_one_pinned;
			}

			if (!busiest->active_balance) {
				busiest->active_balance = 1;
				busiest->push_cpu = this_cpu;
				active_balance = 1;
			}
			raw_spin_unlock_irqrestore(&busiest->lock, flags);
			if (active_balance)
				wake_up_process(busiest->migration_thread);

			/*
			 * We've kicked active balancing, reset the failure
			 * counter.
			 */
			sd->nr_balance_failed = sd->cache_nice_tries+1;
		}
	} else
		sd->nr_balance_failed = 0;

	if (likely(!active_balance)) {
		/* We were unbalanced, so reset the balancing interval */
		sd->balance_interval = sd->min_interval;
	} else {
		/*
		 * If we've begun active balancing, start to back off. This
		 * case may not be covered by the all_pinned logic if there
		 * is only 1 task on the busy runqueue (because we don't call
		 * move_tasks).
		 */
		if (sd->balance_interval < sd->max_interval)
			sd->balance_interval *= 2;
	}

	/* -1: report this cpu as idle upward for power-savings policy. */
	if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
		ld_moved = -1;

	goto out;

out_balanced:
	schedstat_inc(sd, lb_balanced[idle]);

	sd->nr_balance_failed = 0;

out_one_pinned:
	/* tune up the balancing interval */
	if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
			(sd->balance_interval < sd->max_interval))
		sd->balance_interval *= 2;

	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
		ld_moved = -1;
	else
		ld_moved = 0;
out:
	if (ld_moved)
		update_shares(sd);
	return ld_moved;
}
4306
4307/*
4308 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4309 * tasks if there is an imbalance.
4310 *
4311 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
4312 * this_rq is locked.
4313 */
static int
load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
{
	struct sched_group *group;
	struct rq *busiest = NULL;
	unsigned long imbalance;
	int ld_moved = 0;
	int sd_idle = 0;
	int all_pinned = 0;
	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);

	cpumask_copy(cpus, cpu_active_mask);

	/*
	 * When power savings policy is enabled for the parent domain, idle
	 * sibling can pick up load irrespective of busy siblings. In this case,
	 * let the state of idle sibling percolate up as IDLE, instead of
	 * portraying it as CPU_NOT_IDLE.
	 */
	if (sd->flags & SD_SHARE_CPUPOWER &&
	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
		sd_idle = 1;

	schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
redo:
	update_shares_locked(this_rq, sd);
	group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
				   &sd_idle, cpus, NULL);
	if (!group) {
		schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
		goto out_balanced;
	}

	busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
	if (!busiest) {
		schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
		goto out_balanced;
	}

	BUG_ON(busiest == this_rq);

	schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);

	ld_moved = 0;
	if (busiest->nr_running > 1) {
		/* Attempt to move tasks */
		double_lock_balance(this_rq, busiest);
		/* this_rq->clock is already updated */
		update_rq_clock(busiest);
		ld_moved = move_tasks(this_rq, this_cpu, busiest,
					imbalance, sd, CPU_NEWLY_IDLE,
					&all_pinned);
		double_unlock_balance(this_rq, busiest);

		/* Everything was pinned: exclude this cpu and retry. */
		if (unlikely(all_pinned)) {
			cpumask_clear_cpu(cpu_of(busiest), cpus);
			if (!cpumask_empty(cpus))
				goto redo;
		}
	}

	if (!ld_moved) {
		int active_balance = 0;

		schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
		if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
		    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
			return -1;

		/* Active balancing below only at sched_mc >= WAKEUP level. */
		if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
			return -1;

		if (sd->nr_balance_failed++ < 2)
			return -1;

		/*
		 * The only task running in a non-idle cpu can be moved to this
		 * cpu in an attempt to completely freeup the other CPU
		 * package. The same method used to move task in load_balance()
		 * have been extended for load_balance_newidle() to speedup
		 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2)
		 *
		 * The package power saving logic comes from
		 * find_busiest_group(). If there are no imbalance, then
		 * f_b_g() will return NULL. However when sched_mc={1,2} then
		 * f_b_g() will select a group from which a running task may be
		 * pulled to this cpu in order to make the other package idle.
		 * If there is no opportunity to make a package idle and if
		 * there are no imbalance, then f_b_g() will return NULL and no
		 * action will be taken in load_balance_newidle().
		 *
		 * Under normal task pull operation due to imbalance, there
		 * will be more than one task in the source run queue and
		 * move_tasks() will succeed. ld_moved will be true and this
		 * active balance code will not be triggered.
		 */

		/* Lock busiest in correct order while this_rq is held */
		double_lock_balance(this_rq, busiest);

		/*
		 * don't kick the migration_thread, if the curr
		 * task on busiest cpu can't be moved to this_cpu
		 */
		if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
			double_unlock_balance(this_rq, busiest);
			all_pinned = 1;
			return ld_moved;
		}

		if (!busiest->active_balance) {
			busiest->active_balance = 1;
			busiest->push_cpu = this_cpu;
			active_balance = 1;
		}

		double_unlock_balance(this_rq, busiest);
		/*
		 * Should not call ttwu while holding a rq->lock
		 */
		raw_spin_unlock(&this_rq->lock);
		if (active_balance)
			wake_up_process(busiest->migration_thread);
		raw_spin_lock(&this_rq->lock);

	} else
		sd->nr_balance_failed = 0;

	update_shares_locked(this_rq, sd);
	return ld_moved;

out_balanced:
	schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
		return -1;
	sd->nr_balance_failed = 0;

	return 0;
}
4454
4455/*
4456 * idle_balance is called by schedule() if this_cpu is about to become
4457 * idle. Attempts to pull tasks from other CPUs.
4458 */
static void idle_balance(int this_cpu, struct rq *this_rq)
{
	struct sched_domain *sd;
	int pulled_task = 0;
	unsigned long next_balance = jiffies + HZ;

	/* Stamp when we started going idle (cleared if we pull a task). */
	this_rq->idle_stamp = this_rq->clock;

	/* Too-short idle periods: balancing would cost more than it gains. */
	if (this_rq->avg_idle < sysctl_sched_migration_cost)
		return;

	for_each_domain(this_cpu, sd) {
		unsigned long interval;

		if (!(sd->flags & SD_LOAD_BALANCE))
			continue;

		if (sd->flags & SD_BALANCE_NEWIDLE)
			/* If we've pulled tasks over stop searching: */
			pulled_task = load_balance_newidle(this_cpu, this_rq,
							   sd);

		interval = msecs_to_jiffies(sd->balance_interval);
		if (time_after(next_balance, sd->last_balance + interval))
			next_balance = sd->last_balance + interval;
		if (pulled_task) {
			this_rq->idle_stamp = 0;
			break;
		}
	}
	if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
		/*
		 * We are going idle. next_balance may be set based on
		 * a busy processor. So reset next_balance.
		 */
		this_rq->next_balance = next_balance;
	}
}
4497
4498/*
4499 * active_load_balance is run by migration threads. It pushes running tasks
4500 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
4501 * running on each physical CPU where possible, and avoids physical /
4502 * logical imbalances.
4503 *
4504 * Called with busiest_rq locked.
4505 */
static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
{
	int target_cpu = busiest_rq->push_cpu;
	struct sched_domain *sd;
	struct rq *target_rq;

	/* Is there any task to move? */
	if (busiest_rq->nr_running <= 1)
		return;

	target_rq = cpu_rq(target_cpu);

	/*
	 * This condition is "impossible", if it occurs
	 * we need to fix it. Originally reported by
	 * Bjorn Helgaas on a 128-cpu setup.
	 */
	BUG_ON(busiest_rq == target_rq);

	/* move a task from busiest_rq to target_rq */
	double_lock_balance(busiest_rq, target_rq);
	update_rq_clock(busiest_rq);
	update_rq_clock(target_rq);

	/* Search for an sd spanning us and the target CPU. */
	for_each_domain(target_cpu, sd) {
		if ((sd->flags & SD_LOAD_BALANCE) &&
		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
				break;
	}

	if (likely(sd)) {
		schedstat_inc(sd, alb_count);

		/* Push one running task over to the (idler) target cpu. */
		if (move_one_task(target_rq, target_cpu, busiest_rq,
				  sd, CPU_IDLE))
			schedstat_inc(sd, alb_pushed);
		else
			schedstat_inc(sd, alb_failed);
	}
	double_unlock_balance(busiest_rq, target_rq);
}
4548
#ifdef CONFIG_NO_HZ
/*
 * nohz idle load balancing state: load_balancer holds the cpu nominated
 * to balance on behalf of tickless-idle cpus (-1 when nobody is), cpu_mask
 * tracks those cpus, and ilb_grp_nohz_mask is a scratch mask used when
 * picking a new balancer (see is_semi_idle_group() below).
 */
static struct {
	atomic_t load_balancer;
	cpumask_var_t cpu_mask;
	cpumask_var_t ilb_grp_nohz_mask;
} nohz ____cacheline_aligned = {
	.load_balancer = ATOMIC_INIT(-1),
};

/* Return the cpu currently acting as nohz idle load balancer, or -1. */
int get_nohz_load_balancer(void)
{
	return atomic_read(&nohz.load_balancer);
}
4562
4563#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4564/**
4565 * lowest_flag_domain - Return lowest sched_domain containing flag.
4566 * @cpu: The cpu whose lowest level of sched domain is to
4567 * be returned.
4568 * @flag: The flag to check for the lowest sched_domain
4569 * for the given cpu.
4570 *
4571 * Returns the lowest sched_domain of a cpu which contains the given flag.
4572 */
4573static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4574{
4575 struct sched_domain *sd;
4576
4577 for_each_domain(cpu, sd)
4578 if (sd && (sd->flags & flag))
4579 break;
4580
4581 return sd;
4582}
4583
/**
 * for_each_flag_domain - Iterates over sched_domains containing the flag.
 * @cpu: The cpu whose domains we're iterating over.
 * @sd: variable holding the sched_domain cursor for @cpu.
 * @flag: The flag to filter the sched_domains to be iterated.
 *
 * Iterates over all the scheduler domains for a given cpu that have the
 * 'flag' set, starting from the lowest sched_domain to the highest.
 * The walk stops at the first domain (going upwards via ->parent) that
 * does not carry @flag.
 */
#define for_each_flag_domain(cpu, sd, flag) \
	for (sd = lowest_flag_domain(cpu, flag); \
		(sd && (sd->flags & flag)); sd = sd->parent)
4597
4598/**
4599 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4600 * @ilb_group: group to be checked for semi-idleness
4601 *
4602 * Returns: 1 if the group is semi-idle. 0 otherwise.
4603 *
4604 * We define a sched_group to be semi idle if it has atleast one idle-CPU
4605 * and atleast one non-idle CPU. This helper function checks if the given
4606 * sched_group is semi-idle or not.
4607 */
4608static inline int is_semi_idle_group(struct sched_group *ilb_group)
4609{
4610 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
4611 sched_group_cpus(ilb_group));
4612
4613 /*
4614 * A sched_group is semi-idle when it has atleast one busy cpu
4615 * and atleast one idle cpu.
4616 */
4617 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
4618 return 0;
4619
4620 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
4621 return 0;
4622
4623 return 1;
4624}
4625/**
4626 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4627 * @cpu: The cpu which is nominating a new idle_load_balancer.
4628 *
4629 * Returns: Returns the id of the idle load balancer if it exists,
4630 * Else, returns >= nr_cpu_ids.
4631 *
4632 * This algorithm picks the idle load balancer such that it belongs to a
4633 * semi-idle powersavings sched_domain. The idea is to try and avoid
4634 * completely idle packages/cores just for the purpose of idle load balancing
4635 * when there are other idle cpu's which are better suited for that job.
4636 */
4637static int find_new_ilb(int cpu)
4638{
4639 struct sched_domain *sd;
4640 struct sched_group *ilb_group;
4641
4642 /*
4643 * Have idle load balancer selection from semi-idle packages only
4644 * when power-aware load balancing is enabled
4645 */
4646 if (!(sched_smt_power_savings || sched_mc_power_savings))
4647 goto out_done;
4648
4649 /*
4650 * Optimize for the case when we have no idle CPUs or only one
4651 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4652 */
4653 if (cpumask_weight(nohz.cpu_mask) < 2)
4654 goto out_done;
4655
4656 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4657 ilb_group = sd->groups;
4658
4659 do {
4660 if (is_semi_idle_group(ilb_group))
4661 return cpumask_first(nohz.ilb_grp_nohz_mask);
4662
4663 ilb_group = ilb_group->next;
4664
4665 } while (ilb_group != sd->groups);
4666 }
4667
4668out_done:
4669 return cpumask_first(nohz.cpu_mask);
4670}
4671#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
/*
 * Without SCHED_MC/SCHED_SMT there are no power-savings domains to
 * prefer: any idle (tick-stopped) cpu will do as the load balancer.
 */
static inline int find_new_ilb(int call_cpu)
{
	return cpumask_first(nohz.cpu_mask);
}
4676#endif
4677
/*
 * This routine will try to nominate the ilb (idle load balancing)
 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
 * load balancing on behalf of all those cpus. If all the cpus in the system
 * go into this tickless mode, then there will be no ilb owner (as there is
 * no need for one) and all the cpus will sleep till the next wakeup event
 * arrives...
 *
 * For the ilb owner, tick is not stopped. And this tick will be used
 * for idle load balancing. ilb owner will still be part of
 * nohz.cpu_mask..
 *
 * While stopping the tick, this cpu will become the ilb owner if there
 * is no other owner. And will be the owner till that cpu becomes busy
 * or if all cpus in the system stop their ticks at which point
 * there is no need for ilb owner.
 *
 * When the ilb owner becomes busy, it nominates another owner, during the
 * next busy scheduler_tick()
 *
 * Returns 1 when this cpu is (or stays) the ilb owner and must therefore
 * keep its tick running; 0 otherwise.
 */
int select_nohz_load_balancer(int stop_tick)
{
	int cpu = smp_processor_id();

	if (stop_tick) {
		cpu_rq(cpu)->in_nohz_recently = 1;

		if (!cpu_active(cpu)) {
			if (atomic_read(&nohz.load_balancer) != cpu)
				return 0;

			/*
			 * If we are going offline and still the leader,
			 * give up!
			 */
			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
				BUG();

			return 0;
		}

		cpumask_set_cpu(cpu, nohz.cpu_mask);

		/* time for ilb owner also to sleep */
		if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
			if (atomic_read(&nohz.load_balancer) == cpu)
				atomic_set(&nohz.load_balancer, -1);
			return 0;
		}

		if (atomic_read(&nohz.load_balancer) == -1) {
			/*
			 * Make me the ilb owner. The cmpxchg closes the
			 * race with other cpus trying to claim ownership
			 * concurrently: only the winner returns 1.
			 */
			if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
				return 1;
		} else if (atomic_read(&nohz.load_balancer) == cpu) {
			int new_ilb;

			/* Keep ownership unless power savings is enabled. */
			if (!(sched_smt_power_savings ||
						sched_mc_power_savings))
				return 1;
			/*
			 * Check to see if there is a more power-efficient
			 * ilb.
			 */
			new_ilb = find_new_ilb(cpu);
			if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
				/* Hand over: wake the new ilb candidate. */
				atomic_set(&nohz.load_balancer, -1);
				resched_cpu(new_ilb);
				return 0;
			}
			return 1;
		}
	} else {
		/* Tick restarting: leave the nohz set, drop ownership. */
		if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
			return 0;

		cpumask_clear_cpu(cpu, nohz.cpu_mask);

		if (atomic_read(&nohz.load_balancer) == cpu)
			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
				BUG();
	}
	return 0;
}
4762#endif
4763
/*
 * Serializes load balancing of sched domains that have SD_SERIALIZE set,
 * so that only one cpu at a time balances such a domain (taken with
 * trylock in rebalance_domains(); contenders simply skip that round).
 */
static DEFINE_SPINLOCK(balancing);
4765
/*
 * It checks each scheduling domain to see if it is due to be balanced,
 * and initiates a balancing operation if so.
 *
 * Balancing parameters are set up in arch_init_sched_domains.
 */
static void rebalance_domains(int cpu, enum cpu_idle_type idle)
{
	int balance = 1;
	struct rq *rq = cpu_rq(cpu);
	unsigned long interval;
	struct sched_domain *sd;
	/* Earliest time when we have to do rebalance again */
	unsigned long next_balance = jiffies + 60*HZ;
	int update_next_balance = 0;
	int need_serialize;

	for_each_domain(cpu, sd) {
		if (!(sd->flags & SD_LOAD_BALANCE))
			continue;

		/* Busy cpus balance less frequently than idle ones. */
		interval = sd->balance_interval;
		if (idle != CPU_IDLE)
			interval *= sd->busy_factor;

		/* scale ms to jiffies */
		interval = msecs_to_jiffies(interval);
		/* Clamp the interval to [1, HZ*NR_CPUS/10] jiffies. */
		if (unlikely(!interval))
			interval = 1;
		if (interval > HZ*NR_CPUS/10)
			interval = HZ*NR_CPUS/10;

		need_serialize = sd->flags & SD_SERIALIZE;

		if (need_serialize) {
			/*
			 * Only one cpu may balance a SD_SERIALIZE domain
			 * at a time; if contended, skip it this round.
			 */
			if (!spin_trylock(&balancing))
				goto out;
		}

		if (time_after_eq(jiffies, sd->last_balance + interval)) {
			if (load_balance(cpu, rq, sd, idle, &balance)) {
				/*
				 * We've pulled tasks over so either we're no
				 * longer idle, or one of our SMT siblings is
				 * not idle.
				 */
				idle = CPU_NOT_IDLE;
			}
			sd->last_balance = jiffies;
		}
		if (need_serialize)
			spin_unlock(&balancing);
out:
		/* Track the earliest deadline over all domains. */
		if (time_after(next_balance, sd->last_balance + interval)) {
			next_balance = sd->last_balance + interval;
			update_next_balance = 1;
		}

		/*
		 * Stop the load balance at this level. There is another
		 * CPU in our sched group which is doing load balancing more
		 * actively.
		 */
		if (!balance)
			break;
	}

	/*
	 * next_balance will be updated only when there is a need.
	 * When the cpu is attached to null domain for ex, it will not be
	 * updated.
	 */
	if (likely(update_next_balance))
		rq->next_balance = next_balance;
}
4841
/*
 * run_rebalance_domains is triggered when needed from the scheduler tick.
 * In CONFIG_NO_HZ case, the idle load balance owner will do the
 * rebalancing for all the cpus for whom scheduler ticks are stopped.
 *
 * Runs in softirq context (handler for SCHED_SOFTIRQ).
 */
static void run_rebalance_domains(struct softirq_action *h)
{
	int this_cpu = smp_processor_id();
	struct rq *this_rq = cpu_rq(this_cpu);
	enum cpu_idle_type idle = this_rq->idle_at_tick ?
						CPU_IDLE : CPU_NOT_IDLE;

	/* Balance this cpu's own domains first. */
	rebalance_domains(this_cpu, idle);

#ifdef CONFIG_NO_HZ
	/*
	 * If this cpu is the owner for idle load balancing, then do the
	 * balancing on behalf of the other idle cpus whose ticks are
	 * stopped.
	 */
	if (this_rq->idle_at_tick &&
	    atomic_read(&nohz.load_balancer) == this_cpu) {
		struct rq *rq;
		int balance_cpu;

		for_each_cpu(balance_cpu, nohz.cpu_mask) {
			if (balance_cpu == this_cpu)
				continue;

			/*
			 * If this cpu gets work to do, stop the load balancing
			 * work being done for other cpus. Next load
			 * balancing owner will pick it up.
			 */
			if (need_resched())
				break;

			rebalance_domains(balance_cpu, CPU_IDLE);

			/* Propagate the earliest next_balance deadline. */
			rq = cpu_rq(balance_cpu);
			if (time_after(this_rq->next_balance, rq->next_balance))
				this_rq->next_balance = rq->next_balance;
		}
	}
#endif
}
4888
4889static inline int on_null_domain(int cpu)
4890{
4891 return !rcu_dereference(cpu_rq(cpu)->sd);
4892}
4893
/*
 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
 *
 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
 * idle load balancing owner or decide to stop the periodic load balancing,
 * if the whole system is idle.
 *
 * Called from the scheduler tick with @rq == cpu_rq(@cpu).
 */
static inline void trigger_load_balance(struct rq *rq, int cpu)
{
#ifdef CONFIG_NO_HZ
	/*
	 * If we were in the nohz mode recently and busy at the current
	 * scheduler tick, then check if we need to nominate new idle
	 * load balancer.
	 */
	if (rq->in_nohz_recently && !rq->idle_at_tick) {
		rq->in_nohz_recently = 0;

		/* We just got busy: resign ilb ownership if we held it. */
		if (atomic_read(&nohz.load_balancer) == cpu) {
			cpumask_clear_cpu(cpu, nohz.cpu_mask);
			atomic_set(&nohz.load_balancer, -1);
		}

		/* Nobody owns it: nominate a (power-efficient) new ilb. */
		if (atomic_read(&nohz.load_balancer) == -1) {
			int ilb = find_new_ilb(cpu);

			if (ilb < nr_cpu_ids)
				resched_cpu(ilb);
		}
	}

	/*
	 * If this cpu is idle and doing idle load balancing for all the
	 * cpus with ticks stopped, is it time for that to stop?
	 *
	 * NOTE(review): this compares against num_online_cpus() while
	 * select_nohz_load_balancer() uses num_active_cpus() for its
	 * "everyone is idle" check -- confirm the difference is
	 * intentional (it matters during cpu hotplug).
	 */
	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
	    cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
		resched_cpu(cpu);
		return;
	}

	/*
	 * If this cpu is idle and the idle load balancing is done by
	 * someone else, then no need raise the SCHED_SOFTIRQ
	 */
	if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
	    cpumask_test_cpu(cpu, nohz.cpu_mask))
		return;
#endif
	/* Don't need to rebalance while attached to NULL domain */
	if (time_after_eq(jiffies, rq->next_balance) &&
	    likely(!on_null_domain(cpu)))
		raise_softirq(SCHED_SOFTIRQ);
}
4948
4949#else /* CONFIG_SMP */
4950
/*
 * on UP we do not need to balance between CPUs:
 * idle_balance() is therefore a no-op stub.
 */
static inline void idle_balance(int cpu, struct rq *rq)
{
}
4957
4958#endif 3164#endif
4959 3165
4960DEFINE_PER_CPU(struct kernel_stat, kstat); 3166DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -5309,7 +3515,7 @@ void scheduler_tick(void)
5309 curr->sched_class->task_tick(rq, curr, 0); 3515 curr->sched_class->task_tick(rq, curr, 0);
5310 raw_spin_unlock(&rq->lock); 3516 raw_spin_unlock(&rq->lock);
5311 3517
5312 perf_event_task_tick(curr, cpu); 3518 perf_event_task_tick(curr);
5313 3519
5314#ifdef CONFIG_SMP 3520#ifdef CONFIG_SMP
5315 rq->idle_at_tick = idle_cpu(cpu); 3521 rq->idle_at_tick = idle_cpu(cpu);
@@ -5523,7 +3729,7 @@ need_resched_nonpreemptible:
5523 3729
5524 if (likely(prev != next)) { 3730 if (likely(prev != next)) {
5525 sched_info_switch(prev, next); 3731 sched_info_switch(prev, next);
5526 perf_event_task_sched_out(prev, next, cpu); 3732 perf_event_task_sched_out(prev, next);
5527 3733
5528 rq->nr_switches++; 3734 rq->nr_switches++;
5529 rq->curr = next; 3735 rq->curr = next;
@@ -6054,7 +4260,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6054 unsigned long flags; 4260 unsigned long flags;
6055 int oldprio, on_rq, running; 4261 int oldprio, on_rq, running;
6056 struct rq *rq; 4262 struct rq *rq;
6057 const struct sched_class *prev_class = p->sched_class; 4263 const struct sched_class *prev_class;
6058 4264
6059 BUG_ON(prio < 0 || prio > MAX_PRIO); 4265 BUG_ON(prio < 0 || prio > MAX_PRIO);
6060 4266
@@ -6062,6 +4268,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6062 update_rq_clock(rq); 4268 update_rq_clock(rq);
6063 4269
6064 oldprio = p->prio; 4270 oldprio = p->prio;
4271 prev_class = p->sched_class;
6065 on_rq = p->se.on_rq; 4272 on_rq = p->se.on_rq;
6066 running = task_current(rq, p); 4273 running = task_current(rq, p);
6067 if (on_rq) 4274 if (on_rq)
@@ -6079,7 +4286,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6079 if (running) 4286 if (running)
6080 p->sched_class->set_curr_task(rq); 4287 p->sched_class->set_curr_task(rq);
6081 if (on_rq) { 4288 if (on_rq) {
6082 enqueue_task(rq, p, 0); 4289 enqueue_task(rq, p, 0, oldprio < prio);
6083 4290
6084 check_class_changed(rq, p, prev_class, oldprio, running); 4291 check_class_changed(rq, p, prev_class, oldprio, running);
6085 } 4292 }
@@ -6123,7 +4330,7 @@ void set_user_nice(struct task_struct *p, long nice)
6123 delta = p->prio - old_prio; 4330 delta = p->prio - old_prio;
6124 4331
6125 if (on_rq) { 4332 if (on_rq) {
6126 enqueue_task(rq, p, 0); 4333 enqueue_task(rq, p, 0, false);
6127 /* 4334 /*
6128 * If the task increased its priority or is running and 4335 * If the task increased its priority or is running and
6129 * lowered its priority, then reschedule its CPU: 4336 * lowered its priority, then reschedule its CPU:
@@ -6281,7 +4488,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
6281{ 4488{
6282 int retval, oldprio, oldpolicy = -1, on_rq, running; 4489 int retval, oldprio, oldpolicy = -1, on_rq, running;
6283 unsigned long flags; 4490 unsigned long flags;
6284 const struct sched_class *prev_class = p->sched_class; 4491 const struct sched_class *prev_class;
6285 struct rq *rq; 4492 struct rq *rq;
6286 int reset_on_fork; 4493 int reset_on_fork;
6287 4494
@@ -6395,6 +4602,7 @@ recheck:
6395 p->sched_reset_on_fork = reset_on_fork; 4602 p->sched_reset_on_fork = reset_on_fork;
6396 4603
6397 oldprio = p->prio; 4604 oldprio = p->prio;
4605 prev_class = p->sched_class;
6398 __setscheduler(rq, p, policy, param->sched_priority); 4606 __setscheduler(rq, p, policy, param->sched_priority);
6399 4607
6400 if (running) 4608 if (running)
@@ -7145,27 +5353,8 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7145 struct rq *rq; 5353 struct rq *rq;
7146 int ret = 0; 5354 int ret = 0;
7147 5355
7148 /*
7149 * Since we rely on wake-ups to migrate sleeping tasks, don't change
7150 * the ->cpus_allowed mask from under waking tasks, which would be
7151 * possible when we change rq->lock in ttwu(), so synchronize against
7152 * TASK_WAKING to avoid that.
7153 *
7154 * Make an exception for freshly cloned tasks, since cpuset namespaces
7155 * might move the task about, we have to validate the target in
7156 * wake_up_new_task() anyway since the cpu might have gone away.
7157 */
7158again:
7159 while (p->state == TASK_WAKING && !(p->flags & PF_STARTING))
7160 cpu_relax();
7161
7162 rq = task_rq_lock(p, &flags); 5356 rq = task_rq_lock(p, &flags);
7163 5357
7164 if (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) {
7165 task_rq_unlock(rq, &flags);
7166 goto again;
7167 }
7168
7169 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5358 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
7170 ret = -EINVAL; 5359 ret = -EINVAL;
7171 goto out; 5360 goto out;
@@ -9452,7 +7641,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
9452 tg->rt_rq[cpu] = rt_rq; 7641 tg->rt_rq[cpu] = rt_rq;
9453 init_rt_rq(rt_rq, rq); 7642 init_rt_rq(rt_rq, rq);
9454 rt_rq->tg = tg; 7643 rt_rq->tg = tg;
9455 rt_rq->rt_se = rt_se;
9456 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7644 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
9457 if (add) 7645 if (add)
9458 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); 7646 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
@@ -9483,9 +7671,6 @@ void __init sched_init(void)
9483#ifdef CONFIG_RT_GROUP_SCHED 7671#ifdef CONFIG_RT_GROUP_SCHED
9484 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7672 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
9485#endif 7673#endif
9486#ifdef CONFIG_USER_SCHED
9487 alloc_size *= 2;
9488#endif
9489#ifdef CONFIG_CPUMASK_OFFSTACK 7674#ifdef CONFIG_CPUMASK_OFFSTACK
9490 alloc_size += num_possible_cpus() * cpumask_size(); 7675 alloc_size += num_possible_cpus() * cpumask_size();
9491#endif 7676#endif
@@ -9499,13 +7684,6 @@ void __init sched_init(void)
9499 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 7684 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
9500 ptr += nr_cpu_ids * sizeof(void **); 7685 ptr += nr_cpu_ids * sizeof(void **);
9501 7686
9502#ifdef CONFIG_USER_SCHED
9503 root_task_group.se = (struct sched_entity **)ptr;
9504 ptr += nr_cpu_ids * sizeof(void **);
9505
9506 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
9507 ptr += nr_cpu_ids * sizeof(void **);
9508#endif /* CONFIG_USER_SCHED */
9509#endif /* CONFIG_FAIR_GROUP_SCHED */ 7687#endif /* CONFIG_FAIR_GROUP_SCHED */
9510#ifdef CONFIG_RT_GROUP_SCHED 7688#ifdef CONFIG_RT_GROUP_SCHED
9511 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7689 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -9514,13 +7692,6 @@ void __init sched_init(void)
9514 init_task_group.rt_rq = (struct rt_rq **)ptr; 7692 init_task_group.rt_rq = (struct rt_rq **)ptr;
9515 ptr += nr_cpu_ids * sizeof(void **); 7693 ptr += nr_cpu_ids * sizeof(void **);
9516 7694
9517#ifdef CONFIG_USER_SCHED
9518 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
9519 ptr += nr_cpu_ids * sizeof(void **);
9520
9521 root_task_group.rt_rq = (struct rt_rq **)ptr;
9522 ptr += nr_cpu_ids * sizeof(void **);
9523#endif /* CONFIG_USER_SCHED */
9524#endif /* CONFIG_RT_GROUP_SCHED */ 7695#endif /* CONFIG_RT_GROUP_SCHED */
9525#ifdef CONFIG_CPUMASK_OFFSTACK 7696#ifdef CONFIG_CPUMASK_OFFSTACK
9526 for_each_possible_cpu(i) { 7697 for_each_possible_cpu(i) {
@@ -9540,22 +7711,13 @@ void __init sched_init(void)
9540#ifdef CONFIG_RT_GROUP_SCHED 7711#ifdef CONFIG_RT_GROUP_SCHED
9541 init_rt_bandwidth(&init_task_group.rt_bandwidth, 7712 init_rt_bandwidth(&init_task_group.rt_bandwidth,
9542 global_rt_period(), global_rt_runtime()); 7713 global_rt_period(), global_rt_runtime());
9543#ifdef CONFIG_USER_SCHED
9544 init_rt_bandwidth(&root_task_group.rt_bandwidth,
9545 global_rt_period(), RUNTIME_INF);
9546#endif /* CONFIG_USER_SCHED */
9547#endif /* CONFIG_RT_GROUP_SCHED */ 7714#endif /* CONFIG_RT_GROUP_SCHED */
9548 7715
9549#ifdef CONFIG_GROUP_SCHED 7716#ifdef CONFIG_CGROUP_SCHED
9550 list_add(&init_task_group.list, &task_groups); 7717 list_add(&init_task_group.list, &task_groups);
9551 INIT_LIST_HEAD(&init_task_group.children); 7718 INIT_LIST_HEAD(&init_task_group.children);
9552 7719
9553#ifdef CONFIG_USER_SCHED 7720#endif /* CONFIG_CGROUP_SCHED */
9554 INIT_LIST_HEAD(&root_task_group.children);
9555 init_task_group.parent = &root_task_group;
9556 list_add(&init_task_group.siblings, &root_task_group.children);
9557#endif /* CONFIG_USER_SCHED */
9558#endif /* CONFIG_GROUP_SCHED */
9559 7721
9560#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP 7722#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
9561 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), 7723 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
@@ -9595,25 +7757,6 @@ void __init sched_init(void)
9595 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7757 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
9596 */ 7758 */
9597 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7759 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
9598#elif defined CONFIG_USER_SCHED
9599 root_task_group.shares = NICE_0_LOAD;
9600 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
9601 /*
9602 * In case of task-groups formed thr' the user id of tasks,
9603 * init_task_group represents tasks belonging to root user.
9604 * Hence it forms a sibling of all subsequent groups formed.
9605 * In this case, init_task_group gets only a fraction of overall
9606 * system cpu resource, based on the weight assigned to root
9607 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
9608 * by letting tasks of init_task_group sit in a separate cfs_rq
9609 * (init_tg_cfs_rq) and having one entity represent this group of
9610 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
9611 */
9612 init_tg_cfs_entry(&init_task_group,
9613 &per_cpu(init_tg_cfs_rq, i),
9614 &per_cpu(init_sched_entity, i), i, 1,
9615 root_task_group.se[i]);
9616
9617#endif 7760#endif
9618#endif /* CONFIG_FAIR_GROUP_SCHED */ 7761#endif /* CONFIG_FAIR_GROUP_SCHED */
9619 7762
@@ -9622,12 +7765,6 @@ void __init sched_init(void)
9622 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7765 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
9623#ifdef CONFIG_CGROUP_SCHED 7766#ifdef CONFIG_CGROUP_SCHED
9624 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); 7767 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
9625#elif defined CONFIG_USER_SCHED
9626 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
9627 init_tg_rt_entry(&init_task_group,
9628 &per_cpu(init_rt_rq_var, i),
9629 &per_cpu(init_sched_rt_entity, i), i, 1,
9630 root_task_group.rt_se[i]);
9631#endif 7768#endif
9632#endif 7769#endif
9633 7770
@@ -9712,7 +7849,7 @@ static inline int preempt_count_equals(int preempt_offset)
9712 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 7849 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9713} 7850}
9714 7851
9715void __might_sleep(char *file, int line, int preempt_offset) 7852void __might_sleep(const char *file, int line, int preempt_offset)
9716{ 7853{
9717#ifdef in_atomic 7854#ifdef in_atomic
9718 static unsigned long prev_jiffy; /* ratelimiting */ 7855 static unsigned long prev_jiffy; /* ratelimiting */
@@ -10023,7 +8160,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
10023} 8160}
10024#endif /* CONFIG_RT_GROUP_SCHED */ 8161#endif /* CONFIG_RT_GROUP_SCHED */
10025 8162
10026#ifdef CONFIG_GROUP_SCHED 8163#ifdef CONFIG_CGROUP_SCHED
10027static void free_sched_group(struct task_group *tg) 8164static void free_sched_group(struct task_group *tg)
10028{ 8165{
10029 free_fair_sched_group(tg); 8166 free_fair_sched_group(tg);
@@ -10128,11 +8265,11 @@ void sched_move_task(struct task_struct *tsk)
10128 if (unlikely(running)) 8265 if (unlikely(running))
10129 tsk->sched_class->set_curr_task(rq); 8266 tsk->sched_class->set_curr_task(rq);
10130 if (on_rq) 8267 if (on_rq)
10131 enqueue_task(rq, tsk, 0); 8268 enqueue_task(rq, tsk, 0, false);
10132 8269
10133 task_rq_unlock(rq, &flags); 8270 task_rq_unlock(rq, &flags);
10134} 8271}
10135#endif /* CONFIG_GROUP_SCHED */ 8272#endif /* CONFIG_CGROUP_SCHED */
10136 8273
10137#ifdef CONFIG_FAIR_GROUP_SCHED 8274#ifdef CONFIG_FAIR_GROUP_SCHED
10138static void __set_se_shares(struct sched_entity *se, unsigned long shares) 8275static void __set_se_shares(struct sched_entity *se, unsigned long shares)
@@ -10274,13 +8411,6 @@ static int tg_schedulable(struct task_group *tg, void *data)
10274 runtime = d->rt_runtime; 8411 runtime = d->rt_runtime;
10275 } 8412 }
10276 8413
10277#ifdef CONFIG_USER_SCHED
10278 if (tg == &root_task_group) {
10279 period = global_rt_period();
10280 runtime = global_rt_runtime();
10281 }
10282#endif
10283
10284 /* 8414 /*
10285 * Cannot have more runtime than the period. 8415 * Cannot have more runtime than the period.
10286 */ 8416 */
@@ -10900,12 +9030,30 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
10900} 9030}
10901 9031
10902/* 9032/*
9033 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
9034 * in cputime_t units. As a result, cpuacct_update_stats calls
9035 * percpu_counter_add with values large enough to always overflow the
9036 * per cpu batch limit causing bad SMP scalability.
9037 *
9038 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
9039 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
9040 * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
9041 */
9042#ifdef CONFIG_SMP
9043#define CPUACCT_BATCH \
9044 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9045#else
9046#define CPUACCT_BATCH 0
9047#endif
9048
9049/*
10903 * Charge the system/user time to the task's accounting group. 9050 * Charge the system/user time to the task's accounting group.
10904 */ 9051 */
10905static void cpuacct_update_stats(struct task_struct *tsk, 9052static void cpuacct_update_stats(struct task_struct *tsk,
10906 enum cpuacct_stat_index idx, cputime_t val) 9053 enum cpuacct_stat_index idx, cputime_t val)
10907{ 9054{
10908 struct cpuacct *ca; 9055 struct cpuacct *ca;
9056 int batch = CPUACCT_BATCH;
10909 9057
10910 if (unlikely(!cpuacct_subsys.active)) 9058 if (unlikely(!cpuacct_subsys.active))
10911 return; 9059 return;
@@ -10914,7 +9062,7 @@ static void cpuacct_update_stats(struct task_struct *tsk,
10914 ca = task_ca(tsk); 9062 ca = task_ca(tsk);
10915 9063
10916 do { 9064 do {
10917 percpu_counter_add(&ca->cpustat[idx], val); 9065 __percpu_counter_add(&ca->cpustat[idx], val, batch);
10918 ca = ca->parent; 9066 ca = ca->parent;
10919 } while (ca); 9067 } while (ca);
10920 rcu_read_unlock(); 9068 rcu_read_unlock();