path: root/kernel/sched.c
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c | 2509
1 file changed, 348 insertions(+), 2161 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 18cceeecce35..b47ceeec1a91 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -233,7 +233,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
233 */ 233 */
234static DEFINE_MUTEX(sched_domains_mutex); 234static DEFINE_MUTEX(sched_domains_mutex);
235 235
236#ifdef CONFIG_GROUP_SCHED 236#ifdef CONFIG_CGROUP_SCHED
237 237
238#include <linux/cgroup.h> 238#include <linux/cgroup.h>
239 239
@@ -243,13 +243,7 @@ static LIST_HEAD(task_groups);
243 243
244/* task group related information */ 244/* task group related information */
245struct task_group { 245struct task_group {
246#ifdef CONFIG_CGROUP_SCHED
247 struct cgroup_subsys_state css; 246 struct cgroup_subsys_state css;
248#endif
249
250#ifdef CONFIG_USER_SCHED
251 uid_t uid;
252#endif
253 247
254#ifdef CONFIG_FAIR_GROUP_SCHED 248#ifdef CONFIG_FAIR_GROUP_SCHED
255 /* schedulable entities of this group on each cpu */ 249 /* schedulable entities of this group on each cpu */
@@ -274,35 +268,7 @@ struct task_group {
274 struct list_head children; 268 struct list_head children;
275}; 269};
276 270
277#ifdef CONFIG_USER_SCHED
278
279/* Helper function to pass uid information to create_sched_user() */
280void set_tg_uid(struct user_struct *user)
281{
282 user->tg->uid = user->uid;
283}
284
285/*
286 * Root task group.
287 * Every UID task group (including init_task_group aka UID-0) will
288 * be a child to this group.
289 */
290struct task_group root_task_group;
291
292#ifdef CONFIG_FAIR_GROUP_SCHED
293/* Default task group's sched entity on each cpu */
294static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
295/* Default task group's cfs_rq on each cpu */
296static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);
297#endif /* CONFIG_FAIR_GROUP_SCHED */
298
299#ifdef CONFIG_RT_GROUP_SCHED
300static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
301static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq_var);
302#endif /* CONFIG_RT_GROUP_SCHED */
303#else /* !CONFIG_USER_SCHED */
304#define root_task_group init_task_group 271#define root_task_group init_task_group
305#endif /* CONFIG_USER_SCHED */
306 272
307/* task_group_lock serializes add/remove of task groups and also changes to 273/* task_group_lock serializes add/remove of task groups and also changes to
308 * a task group's cpu shares. 274 * a task group's cpu shares.
@@ -318,11 +284,7 @@ static int root_task_group_empty(void)
318} 284}
319#endif 285#endif
320 286
321#ifdef CONFIG_USER_SCHED
322# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
323#else /* !CONFIG_USER_SCHED */
324# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 287# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
325#endif /* CONFIG_USER_SCHED */
326 288
327/* 289/*
328 * A weight of 0 or 1 can cause arithmetics problems. 290 * A weight of 0 or 1 can cause arithmetics problems.
@@ -348,11 +310,7 @@ static inline struct task_group *task_group(struct task_struct *p)
348{ 310{
349 struct task_group *tg; 311 struct task_group *tg;
350 312
351#ifdef CONFIG_USER_SCHED 313#ifdef CONFIG_CGROUP_SCHED
352 rcu_read_lock();
353 tg = __task_cred(p)->user->tg;
354 rcu_read_unlock();
355#elif defined(CONFIG_CGROUP_SCHED)
356 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), 314 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
357 struct task_group, css); 315 struct task_group, css);
358#else 316#else
@@ -383,7 +341,7 @@ static inline struct task_group *task_group(struct task_struct *p)
383 return NULL; 341 return NULL;
384} 342}
385 343
386#endif /* CONFIG_GROUP_SCHED */ 344#endif /* CONFIG_CGROUP_SCHED */
387 345
388/* CFS-related fields in a runqueue */ 346/* CFS-related fields in a runqueue */
389struct cfs_rq { 347struct cfs_rq {
@@ -478,7 +436,6 @@ struct rt_rq {
478 struct rq *rq; 436 struct rq *rq;
479 struct list_head leaf_rt_rq_list; 437 struct list_head leaf_rt_rq_list;
480 struct task_group *tg; 438 struct task_group *tg;
481 struct sched_rt_entity *rt_se;
482#endif 439#endif
483}; 440};
484 441
@@ -645,6 +602,11 @@ static inline int cpu_of(struct rq *rq)
645#endif 602#endif
646} 603}
647 604
605#define rcu_dereference_check_sched_domain(p) \
606 rcu_dereference_check((p), \
607 rcu_read_lock_sched_held() || \
608 lockdep_is_held(&sched_domains_mutex))
609
648/* 610/*
649 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 611 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
650 * See detach_destroy_domains: synchronize_sched for details. 612 * See detach_destroy_domains: synchronize_sched for details.
@@ -653,7 +615,7 @@ static inline int cpu_of(struct rq *rq)
653 * preempt-disabled sections. 615 * preempt-disabled sections.
654 */ 616 */
655#define for_each_domain(cpu, __sd) \ 617#define for_each_domain(cpu, __sd) \
656 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) 618 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
657 619
658#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 620#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
659#define this_rq() (&__get_cpu_var(runqueues)) 621#define this_rq() (&__get_cpu_var(runqueues))
@@ -941,16 +903,33 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
941#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 903#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
942 904
943/* 905/*
906 * Check whether the task is waking, we use this to synchronize against
907 * ttwu() so that task_cpu() reports a stable number.
908 *
909 * We need to make an exception for PF_STARTING tasks because the fork
910 * path might require task_rq_lock() to work, eg. it can call
911 * set_cpus_allowed_ptr() from the cpuset clone_ns code.
912 */
913static inline int task_is_waking(struct task_struct *p)
914{
915 return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING));
916}
917
918/*
944 * __task_rq_lock - lock the runqueue a given task resides on. 919 * __task_rq_lock - lock the runqueue a given task resides on.
945 * Must be called interrupts disabled. 920 * Must be called interrupts disabled.
946 */ 921 */
947static inline struct rq *__task_rq_lock(struct task_struct *p) 922static inline struct rq *__task_rq_lock(struct task_struct *p)
948 __acquires(rq->lock) 923 __acquires(rq->lock)
949{ 924{
925 struct rq *rq;
926
950 for (;;) { 927 for (;;) {
951 struct rq *rq = task_rq(p); 928 while (task_is_waking(p))
929 cpu_relax();
930 rq = task_rq(p);
952 raw_spin_lock(&rq->lock); 931 raw_spin_lock(&rq->lock);
953 if (likely(rq == task_rq(p))) 932 if (likely(rq == task_rq(p) && !task_is_waking(p)))
954 return rq; 933 return rq;
955 raw_spin_unlock(&rq->lock); 934 raw_spin_unlock(&rq->lock);
956 } 935 }
@@ -967,10 +946,12 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
967 struct rq *rq; 946 struct rq *rq;
968 947
969 for (;;) { 948 for (;;) {
949 while (task_is_waking(p))
950 cpu_relax();
970 local_irq_save(*flags); 951 local_irq_save(*flags);
971 rq = task_rq(p); 952 rq = task_rq(p);
972 raw_spin_lock(&rq->lock); 953 raw_spin_lock(&rq->lock);
973 if (likely(rq == task_rq(p))) 954 if (likely(rq == task_rq(p) && !task_is_waking(p)))
974 return rq; 955 return rq;
975 raw_spin_unlock_irqrestore(&rq->lock, *flags); 956 raw_spin_unlock_irqrestore(&rq->lock, *flags);
976 } 957 }
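
The locking rule introduced above -- wait for TASK_WAKING to clear, lock the runqueue the task currently points at, then re-check both conditions before trusting the lock -- is a general "lock, then revalidate" pattern. As an illustrative aside (not part of the patch), here is a minimal userspace sketch of that pattern, with made-up bucket/item types standing in for rq/task_struct:

#include <pthread.h>

/*
 * Hypothetical analogue of __task_rq_lock(): another thread may move an
 * item between buckets, so lock the bucket the item currently claims and
 * then re-check that it still lives there; otherwise drop the lock and
 * retry until the association is stable.
 */
struct bucket { pthread_mutex_t lock; };
struct item   { struct bucket *home; };	/* only rewritten under home->lock */

static struct bucket *lock_item_bucket(struct item *it)
{
	struct bucket *b;

	for (;;) {
		b = __atomic_load_n(&it->home, __ATOMIC_ACQUIRE);
		pthread_mutex_lock(&b->lock);
		if (b == __atomic_load_n(&it->home, __ATOMIC_ACQUIRE))
			return b;	/* stable: caller now holds b->lock */
		pthread_mutex_unlock(&b->lock);	/* it moved underneath us, retry */
	}
}
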
@@ -1390,32 +1371,6 @@ static const u32 prio_to_wmult[40] = {
1390 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 1371 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1391}; 1372};
1392 1373
1393static void activate_task(struct rq *rq, struct task_struct *p, int wakeup);
1394
1395/*
1396 * runqueue iterator, to support SMP load-balancing between different
1397 * scheduling classes, without having to expose their internal data
1398 * structures to the load-balancing proper:
1399 */
1400struct rq_iterator {
1401 void *arg;
1402 struct task_struct *(*start)(void *);
1403 struct task_struct *(*next)(void *);
1404};
1405
1406#ifdef CONFIG_SMP
1407static unsigned long
1408balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1409 unsigned long max_load_move, struct sched_domain *sd,
1410 enum cpu_idle_type idle, int *all_pinned,
1411 int *this_best_prio, struct rq_iterator *iterator);
1412
1413static int
1414iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
1415 struct sched_domain *sd, enum cpu_idle_type idle,
1416 struct rq_iterator *iterator);
1417#endif
1418
1419/* Time spent by the tasks of the cpu accounting group executing in ... */ 1374/* Time spent by the tasks of the cpu accounting group executing in ... */
1420enum cpuacct_stat_index { 1375enum cpuacct_stat_index {
1421 CPUACCT_STAT_USER, /* ... user mode */ 1376 CPUACCT_STAT_USER, /* ... user mode */
@@ -1531,7 +1486,7 @@ static unsigned long target_load(int cpu, int type)
1531 1486
1532static struct sched_group *group_of(int cpu) 1487static struct sched_group *group_of(int cpu)
1533{ 1488{
1534 struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); 1489 struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd);
1535 1490
1536 if (!sd) 1491 if (!sd)
1537 return NULL; 1492 return NULL;
@@ -1566,7 +1521,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1566 1521
1567#ifdef CONFIG_FAIR_GROUP_SCHED 1522#ifdef CONFIG_FAIR_GROUP_SCHED
1568 1523
1569static __read_mostly unsigned long *update_shares_data; 1524static __read_mostly unsigned long __percpu *update_shares_data;
1570 1525
1571static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1526static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1572 1527
@@ -1701,16 +1656,6 @@ static void update_shares(struct sched_domain *sd)
1701 } 1656 }
1702} 1657}
1703 1658
1704static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1705{
1706 if (root_task_group_empty())
1707 return;
1708
1709 raw_spin_unlock(&rq->lock);
1710 update_shares(sd);
1711 raw_spin_lock(&rq->lock);
1712}
1713
1714static void update_h_load(long cpu) 1659static void update_h_load(long cpu)
1715{ 1660{
1716 if (root_task_group_empty()) 1661 if (root_task_group_empty())
@@ -1725,10 +1670,6 @@ static inline void update_shares(struct sched_domain *sd)
1725{ 1670{
1726} 1671}
1727 1672
1728static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1729{
1730}
1731
1732#endif 1673#endif
1733 1674
1734#ifdef CONFIG_PREEMPT 1675#ifdef CONFIG_PREEMPT
@@ -1805,6 +1746,51 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1805 raw_spin_unlock(&busiest->lock); 1746 raw_spin_unlock(&busiest->lock);
1806 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1747 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1807} 1748}
1749
1750/*
1751 * double_rq_lock - safely lock two runqueues
1752 *
1753 * Note this does not disable interrupts like task_rq_lock,
1754 * you need to do so manually before calling.
1755 */
1756static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1757 __acquires(rq1->lock)
1758 __acquires(rq2->lock)
1759{
1760 BUG_ON(!irqs_disabled());
1761 if (rq1 == rq2) {
1762 raw_spin_lock(&rq1->lock);
1763 __acquire(rq2->lock); /* Fake it out ;) */
1764 } else {
1765 if (rq1 < rq2) {
1766 raw_spin_lock(&rq1->lock);
1767 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1768 } else {
1769 raw_spin_lock(&rq2->lock);
1770 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1771 }
1772 }
1773 update_rq_clock(rq1);
1774 update_rq_clock(rq2);
1775}
1776
1777/*
1778 * double_rq_unlock - safely unlock two runqueues
1779 *
1780 * Note this does not restore interrupts like task_rq_unlock,
1781 * you need to do so manually after calling.
1782 */
1783static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1784 __releases(rq1->lock)
1785 __releases(rq2->lock)
1786{
1787 raw_spin_unlock(&rq1->lock);
1788 if (rq1 != rq2)
1789 raw_spin_unlock(&rq2->lock);
1790 else
1791 __release(rq2->lock);
1792}
1793
1808#endif 1794#endif
1809 1795
1810#ifdef CONFIG_FAIR_GROUP_SCHED 1796#ifdef CONFIG_FAIR_GROUP_SCHED
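
The double_rq_lock() helper moved above always takes the lower-addressed runqueue lock first, so two CPUs locking the same pair of runqueues from opposite ends cannot deadlock. As an illustrative aside (not part of the patch), the same ordering rule in a self-contained sketch using plain pthread mutexes; lock_pair()/unlock_pair() are hypothetical helpers, not kernel APIs:

#include <pthread.h>

/*
 * Illustrative only: acquire two locks in a globally consistent order
 * (by address), mirroring the rq1 < rq2 comparison in double_rq_lock().
 */
static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {
		pthread_mutex_lock(a);		/* same lock: take it once */
		return;
	}
	if (a < b) {
		pthread_mutex_lock(a);		/* lower address first ... */
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);		/* ... regardless of call order */
		pthread_mutex_lock(a);
	}
}

static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
	pthread_mutex_unlock(a);
	if (a != b)
		pthread_mutex_unlock(b);
}
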
@@ -1834,18 +1820,14 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1834#endif 1820#endif
1835} 1821}
1836 1822
1837#include "sched_stats.h" 1823static const struct sched_class rt_sched_class;
1838#include "sched_idletask.c"
1839#include "sched_fair.c"
1840#include "sched_rt.c"
1841#ifdef CONFIG_SCHED_DEBUG
1842# include "sched_debug.c"
1843#endif
1844 1824
1845#define sched_class_highest (&rt_sched_class) 1825#define sched_class_highest (&rt_sched_class)
1846#define for_each_class(class) \ 1826#define for_each_class(class) \
1847 for (class = sched_class_highest; class; class = class->next) 1827 for (class = sched_class_highest; class; class = class->next)
1848 1828
1829#include "sched_stats.h"
1830
1849static void inc_nr_running(struct rq *rq) 1831static void inc_nr_running(struct rq *rq)
1850{ 1832{
1851 rq->nr_running++; 1833 rq->nr_running++;
@@ -1883,13 +1865,14 @@ static void update_avg(u64 *avg, u64 sample)
1883 *avg += diff >> 3; 1865 *avg += diff >> 3;
1884} 1866}
1885 1867
1886static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1868static void
1869enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, bool head)
1887{ 1870{
1888 if (wakeup) 1871 if (wakeup)
1889 p->se.start_runtime = p->se.sum_exec_runtime; 1872 p->se.start_runtime = p->se.sum_exec_runtime;
1890 1873
1891 sched_info_queued(p); 1874 sched_info_queued(p);
1892 p->sched_class->enqueue_task(rq, p, wakeup); 1875 p->sched_class->enqueue_task(rq, p, wakeup, head);
1893 p->se.on_rq = 1; 1876 p->se.on_rq = 1;
1894} 1877}
1895 1878
@@ -1912,6 +1895,37 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1912} 1895}
1913 1896
1914/* 1897/*
1898 * activate_task - move a task to the runqueue.
1899 */
1900static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1901{
1902 if (task_contributes_to_load(p))
1903 rq->nr_uninterruptible--;
1904
1905 enqueue_task(rq, p, wakeup, false);
1906 inc_nr_running(rq);
1907}
1908
1909/*
1910 * deactivate_task - remove a task from the runqueue.
1911 */
1912static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1913{
1914 if (task_contributes_to_load(p))
1915 rq->nr_uninterruptible++;
1916
1917 dequeue_task(rq, p, sleep);
1918 dec_nr_running(rq);
1919}
1920
1921#include "sched_idletask.c"
1922#include "sched_fair.c"
1923#include "sched_rt.c"
1924#ifdef CONFIG_SCHED_DEBUG
1925# include "sched_debug.c"
1926#endif
1927
1928/*
1915 * __normal_prio - return the priority that is based on the static prio 1929 * __normal_prio - return the priority that is based on the static prio
1916 */ 1930 */
1917static inline int __normal_prio(struct task_struct *p) 1931static inline int __normal_prio(struct task_struct *p)
@@ -1957,30 +1971,6 @@ static int effective_prio(struct task_struct *p)
1957 return p->prio; 1971 return p->prio;
1958} 1972}
1959 1973
1960/*
1961 * activate_task - move a task to the runqueue.
1962 */
1963static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1964{
1965 if (task_contributes_to_load(p))
1966 rq->nr_uninterruptible--;
1967
1968 enqueue_task(rq, p, wakeup);
1969 inc_nr_running(rq);
1970}
1971
1972/*
1973 * deactivate_task - remove a task from the runqueue.
1974 */
1975static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1976{
1977 if (task_contributes_to_load(p))
1978 rq->nr_uninterruptible++;
1979
1980 dequeue_task(rq, p, sleep);
1981 dec_nr_running(rq);
1982}
1983
1984/** 1974/**
1985 * task_curr - is this task currently executing on a CPU? 1975 * task_curr - is this task currently executing on a CPU?
1986 * @p: the task in question. 1976 * @p: the task in question.
@@ -2002,39 +1992,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2002 p->sched_class->prio_changed(rq, p, oldprio, running); 1992 p->sched_class->prio_changed(rq, p, oldprio, running);
2003} 1993}
2004 1994
2005/**
2006 * kthread_bind - bind a just-created kthread to a cpu.
2007 * @p: thread created by kthread_create().
2008 * @cpu: cpu (might not be online, must be possible) for @k to run on.
2009 *
2010 * Description: This function is equivalent to set_cpus_allowed(),
2011 * except that @cpu doesn't need to be online, and the thread must be
2012 * stopped (i.e., just returned from kthread_create()).
2013 *
2014 * Function lives here instead of kthread.c because it messes with
2015 * scheduler internals which require locking.
2016 */
2017void kthread_bind(struct task_struct *p, unsigned int cpu)
2018{
2019 struct rq *rq = cpu_rq(cpu);
2020 unsigned long flags;
2021
2022 /* Must have done schedule() in kthread() before we set_task_cpu */
2023 if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
2024 WARN_ON(1);
2025 return;
2026 }
2027
2028 raw_spin_lock_irqsave(&rq->lock, flags);
2029 update_rq_clock(rq);
2030 set_task_cpu(p, cpu);
2031 p->cpus_allowed = cpumask_of_cpu(cpu);
2032 p->rt.nr_cpus_allowed = 1;
2033 p->flags |= PF_THREAD_BOUND;
2034 raw_spin_unlock_irqrestore(&rq->lock, flags);
2035}
2036EXPORT_SYMBOL(kthread_bind);
2037
2038#ifdef CONFIG_SMP 1995#ifdef CONFIG_SMP
2039/* 1996/*
2040 * Is this task likely cache-hot: 1997 * Is this task likely cache-hot:
@@ -2044,6 +2001,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2044{ 2001{
2045 s64 delta; 2002 s64 delta;
2046 2003
2004 if (p->sched_class != &fair_sched_class)
2005 return 0;
2006
2047 /* 2007 /*
2048 * Buddy candidates are cache hot: 2008 * Buddy candidates are cache hot:
2049 */ 2009 */
@@ -2052,9 +2012,6 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2052 &p->se == cfs_rq_of(&p->se)->last)) 2012 &p->se == cfs_rq_of(&p->se)->last))
2053 return 1; 2013 return 1;
2054 2014
2055 if (p->sched_class != &fair_sched_class)
2056 return 0;
2057
2058 if (sysctl_sched_migration_cost == -1) 2015 if (sysctl_sched_migration_cost == -1)
2059 return 1; 2016 return 1;
2060 if (sysctl_sched_migration_cost == 0) 2017 if (sysctl_sched_migration_cost == 0)
@@ -2065,22 +2022,23 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2065 return delta < (s64)sysctl_sched_migration_cost; 2022 return delta < (s64)sysctl_sched_migration_cost;
2066} 2023}
2067 2024
2068
2069void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 2025void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2070{ 2026{
2071 int old_cpu = task_cpu(p); 2027#ifdef CONFIG_SCHED_DEBUG
2072 struct cfs_rq *old_cfsrq = task_cfs_rq(p), 2028 /*
2073 *new_cfsrq = cpu_cfs_rq(old_cfsrq, new_cpu); 2029 * We should never call set_task_cpu() on a blocked task,
2030 * ttwu() will sort out the placement.
2031 */
2032 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2033 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
2034#endif
2074 2035
2075 trace_sched_migrate_task(p, new_cpu); 2036 trace_sched_migrate_task(p, new_cpu);
2076 2037
2077 if (old_cpu != new_cpu) { 2038 if (task_cpu(p) != new_cpu) {
2078 p->se.nr_migrations++; 2039 p->se.nr_migrations++;
2079 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 2040 perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
2080 1, 1, NULL, 0);
2081 } 2041 }
2082 p->se.vruntime -= old_cfsrq->min_vruntime -
2083 new_cfsrq->min_vruntime;
2084 2042
2085 __set_task_cpu(p, new_cpu); 2043 __set_task_cpu(p, new_cpu);
2086} 2044}
@@ -2105,13 +2063,10 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
2105 2063
2106 /* 2064 /*
2107 * If the task is not on a runqueue (and not running), then 2065 * If the task is not on a runqueue (and not running), then
2108 * it is sufficient to simply update the task's cpu field. 2066 * the next wake-up will properly place the task.
2109 */ 2067 */
2110 if (!p->se.on_rq && !task_running(rq, p)) { 2068 if (!p->se.on_rq && !task_running(rq, p))
2111 update_rq_clock(rq);
2112 set_task_cpu(p, dest_cpu);
2113 return 0; 2069 return 0;
2114 }
2115 2070
2116 init_completion(&req->done); 2071 init_completion(&req->done);
2117 req->task = p; 2072 req->task = p;
@@ -2317,10 +2272,71 @@ void task_oncpu_function_call(struct task_struct *p,
2317} 2272}
2318 2273
2319#ifdef CONFIG_SMP 2274#ifdef CONFIG_SMP
2275static int select_fallback_rq(int cpu, struct task_struct *p)
2276{
2277 int dest_cpu;
2278 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
2279
2280 /* Look for allowed, online CPU in same node. */
2281 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
2282 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
2283 return dest_cpu;
2284
2285 /* Any allowed, online CPU? */
2286 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
2287 if (dest_cpu < nr_cpu_ids)
2288 return dest_cpu;
2289
2290 /* No more Mr. Nice Guy. */
2291 if (dest_cpu >= nr_cpu_ids) {
2292 rcu_read_lock();
2293 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
2294 rcu_read_unlock();
2295 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
2296
2297 /*
2298 * Don't tell them about moving exiting tasks or
2299 * kernel threads (both mm NULL), since they never
2300 * leave kernel.
2301 */
2302 if (p->mm && printk_ratelimit()) {
2303 printk(KERN_INFO "process %d (%s) no "
2304 "longer affine to cpu%d\n",
2305 task_pid_nr(p), p->comm, cpu);
2306 }
2307 }
2308
2309 return dest_cpu;
2310}
2311
2312/*
2313 * Gets called from 3 sites (exec, fork, wakeup), since it is called without
2314 * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done
2315 * by:
2316 *
2317 * exec: is unstable, retry loop
2318 * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
2319 */
2320static inline 2320static inline
2321int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 2321int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2322{ 2322{
2323 return p->sched_class->select_task_rq(p, sd_flags, wake_flags); 2323 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
2324
2325 /*
2326 * In order not to call set_task_cpu() on a blocking task we need
2327 * to rely on ttwu() to place the task on a valid ->cpus_allowed
2328 * cpu.
2329 *
2330 * Since this is common to all placement strategies, this lives here.
2331 *
2332 * [ this allows ->select_task() to simply return task_cpu(p) and
2333 * not worry about this generic constraint ]
2334 */
2335 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
2336 !cpu_online(cpu)))
2337 cpu = select_fallback_rq(task_cpu(p), p);
2338
2339 return cpu;
2324} 2340}
2325#endif 2341#endif
2326 2342
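
select_fallback_rq() above tries progressively weaker constraints: an allowed, online CPU in the task's node; then any allowed, online CPU; and only then widens the affinity via cpuset and warns. As an illustrative aside (not part of the patch), the same preference order modelled with plain 64-bit CPU masks and a hypothetical pick_fallback_cpu() helper:

#include <stdint.h>
#include <stdio.h>

/*
 * Toy model of the select_fallback_rq() preference order; not the kernel
 * code.  Step 3 simply falls back to the whole online mask, standing in
 * for the kernel's cpuset-widening "no more Mr. Nice Guy" step.
 */
static int pick_fallback_cpu(uint64_t allowed, uint64_t node, uint64_t online)
{
	uint64_t mask;

	mask = allowed & node & online;		/* 1: allowed + online, same node */
	if (!mask)
		mask = allowed & online;	/* 2: any allowed, online cpu */
	if (!mask)
		mask = online;			/* 3: give up on affinity */

	return mask ? __builtin_ctzll(mask) : -1;	/* lowest set bit = cpu id */
}

int main(void)
{
	/* allowed = {2,3}, node = {0,1}, online = {0,1,3}: picks cpu 3 */
	printf("fallback cpu: %d\n",
	       pick_fallback_cpu(0xcULL, 0x3ULL, 0xbULL));
	return 0;
}
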
@@ -2375,17 +2391,34 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2375 if (task_contributes_to_load(p)) 2391 if (task_contributes_to_load(p))
2376 rq->nr_uninterruptible--; 2392 rq->nr_uninterruptible--;
2377 p->state = TASK_WAKING; 2393 p->state = TASK_WAKING;
2394
2395 if (p->sched_class->task_waking)
2396 p->sched_class->task_waking(rq, p);
2397
2378 __task_rq_unlock(rq); 2398 __task_rq_unlock(rq);
2379 2399
2380 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 2400 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
2381 if (cpu != orig_cpu) 2401 if (cpu != orig_cpu) {
2402 /*
2403 * Since we migrate the task without holding any rq->lock,
2404 * we need to be careful with task_rq_lock(), since that
2405 * might end up locking an invalid rq.
2406 */
2382 set_task_cpu(p, cpu); 2407 set_task_cpu(p, cpu);
2408 }
2383 2409
2384 rq = __task_rq_lock(p); 2410 rq = cpu_rq(cpu);
2411 raw_spin_lock(&rq->lock);
2385 update_rq_clock(rq); 2412 update_rq_clock(rq);
2386 2413
2414 /*
2415 * We migrated the task without holding either rq->lock, however
2416 * since the task is not on the task list itself, nobody else
2417 * will try and migrate the task, hence the rq should match the
2418 * cpu we just moved it to.
2419 */
2420 WARN_ON(task_cpu(p) != cpu);
2387 WARN_ON(p->state != TASK_WAKING); 2421 WARN_ON(p->state != TASK_WAKING);
2388 cpu = task_cpu(p);
2389 2422
2390#ifdef CONFIG_SCHEDSTATS 2423#ifdef CONFIG_SCHEDSTATS
2391 schedstat_inc(rq, ttwu_count); 2424 schedstat_inc(rq, ttwu_count);
@@ -2438,8 +2471,8 @@ out_running:
2438 2471
2439 p->state = TASK_RUNNING; 2472 p->state = TASK_RUNNING;
2440#ifdef CONFIG_SMP 2473#ifdef CONFIG_SMP
2441 if (p->sched_class->task_wake_up) 2474 if (p->sched_class->task_woken)
2442 p->sched_class->task_wake_up(rq, p); 2475 p->sched_class->task_woken(rq, p);
2443 2476
2444 if (unlikely(rq->idle_stamp)) { 2477 if (unlikely(rq->idle_stamp)) {
2445 u64 delta = rq->clock - rq->idle_stamp; 2478 u64 delta = rq->clock - rq->idle_stamp;
@@ -2538,14 +2571,6 @@ static void __sched_fork(struct task_struct *p)
2538#ifdef CONFIG_PREEMPT_NOTIFIERS 2571#ifdef CONFIG_PREEMPT_NOTIFIERS
2539 INIT_HLIST_HEAD(&p->preempt_notifiers); 2572 INIT_HLIST_HEAD(&p->preempt_notifiers);
2540#endif 2573#endif
2541
2542 /*
2543 * We mark the process as running here, but have not actually
2544 * inserted it onto the runqueue yet. This guarantees that
2545 * nobody will actually run it, and a signal or other external
2546 * event cannot wake it up and insert it on the runqueue either.
2547 */
2548 p->state = TASK_RUNNING;
2549} 2574}
2550 2575
2551/* 2576/*
@@ -2556,6 +2581,12 @@ void sched_fork(struct task_struct *p, int clone_flags)
2556 int cpu = get_cpu(); 2581 int cpu = get_cpu();
2557 2582
2558 __sched_fork(p); 2583 __sched_fork(p);
2584 /*
2585 * We mark the process as waking here. This guarantees that
2586 * nobody will actually run it, and a signal or other external
2587 * event cannot wake it up and insert it on the runqueue either.
2588 */
2589 p->state = TASK_WAKING;
2559 2590
2560 /* 2591 /*
2561 * Revert to default priority/policy on fork if requested. 2592 * Revert to default priority/policy on fork if requested.
@@ -2590,9 +2621,6 @@ void sched_fork(struct task_struct *p, int clone_flags)
2590 if (p->sched_class->task_fork) 2621 if (p->sched_class->task_fork)
2591 p->sched_class->task_fork(p); 2622 p->sched_class->task_fork(p);
2592 2623
2593#ifdef CONFIG_SMP
2594 cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
2595#endif
2596 set_task_cpu(p, cpu); 2624 set_task_cpu(p, cpu);
2597 2625
2598#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 2626#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
@@ -2622,18 +2650,41 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2622{ 2650{
2623 unsigned long flags; 2651 unsigned long flags;
2624 struct rq *rq; 2652 struct rq *rq;
2653 int cpu = get_cpu();
2625 2654
2626 rq = task_rq_lock(p, &flags); 2655#ifdef CONFIG_SMP
2627 BUG_ON(p->state != TASK_RUNNING); 2656 /*
2657 * Fork balancing, do it here and not earlier because:
2658 * - cpus_allowed can change in the fork path
2659 * - any previously selected cpu might disappear through hotplug
2660 *
2661 * We still have TASK_WAKING but PF_STARTING is gone now, meaning
2662 * ->cpus_allowed is stable, we have preemption disabled, meaning
2663 * cpu_online_mask is stable.
2664 */
2665 cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
2666 set_task_cpu(p, cpu);
2667#endif
2668
2669 /*
2670 * Since the task is not on the rq and we still have TASK_WAKING set
2671 * nobody else will migrate this task.
2672 */
2673 rq = cpu_rq(cpu);
2674 raw_spin_lock_irqsave(&rq->lock, flags);
2675
2676 BUG_ON(p->state != TASK_WAKING);
2677 p->state = TASK_RUNNING;
2628 update_rq_clock(rq); 2678 update_rq_clock(rq);
2629 activate_task(rq, p, 0); 2679 activate_task(rq, p, 0);
2630 trace_sched_wakeup_new(rq, p, 1); 2680 trace_sched_wakeup_new(rq, p, 1);
2631 check_preempt_curr(rq, p, WF_FORK); 2681 check_preempt_curr(rq, p, WF_FORK);
2632#ifdef CONFIG_SMP 2682#ifdef CONFIG_SMP
2633 if (p->sched_class->task_wake_up) 2683 if (p->sched_class->task_woken)
2634 p->sched_class->task_wake_up(rq, p); 2684 p->sched_class->task_woken(rq, p);
2635#endif 2685#endif
2636 task_rq_unlock(rq, &flags); 2686 task_rq_unlock(rq, &flags);
2687 put_cpu();
2637} 2688}
2638 2689
2639#ifdef CONFIG_PREEMPT_NOTIFIERS 2690#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -2752,7 +2803,13 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2752 */ 2803 */
2753 prev_state = prev->state; 2804 prev_state = prev->state;
2754 finish_arch_switch(prev); 2805 finish_arch_switch(prev);
2755 perf_event_task_sched_in(current, cpu_of(rq)); 2806#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2807 local_irq_disable();
2808#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2809 perf_event_task_sched_in(current);
2810#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
2811 local_irq_enable();
2812#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
2756 finish_lock_switch(rq, prev); 2813 finish_lock_switch(rq, prev);
2757 2814
2758 fire_sched_in_preempt_notifiers(current); 2815 fire_sched_in_preempt_notifiers(current);
@@ -3057,65 +3114,36 @@ static void update_cpu_load(struct rq *this_rq)
3057#ifdef CONFIG_SMP 3114#ifdef CONFIG_SMP
3058 3115
3059/* 3116/*
3060 * double_rq_lock - safely lock two runqueues 3117 * sched_exec - execve() is a valuable balancing opportunity, because at
3061 * 3118 * this point the task has the smallest effective memory and cache footprint.
3062 * Note this does not disable interrupts like task_rq_lock,
3063 * you need to do so manually before calling.
3064 */
3065static void double_rq_lock(struct rq *rq1, struct rq *rq2)
3066 __acquires(rq1->lock)
3067 __acquires(rq2->lock)
3068{
3069 BUG_ON(!irqs_disabled());
3070 if (rq1 == rq2) {
3071 raw_spin_lock(&rq1->lock);
3072 __acquire(rq2->lock); /* Fake it out ;) */
3073 } else {
3074 if (rq1 < rq2) {
3075 raw_spin_lock(&rq1->lock);
3076 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
3077 } else {
3078 raw_spin_lock(&rq2->lock);
3079 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
3080 }
3081 }
3082 update_rq_clock(rq1);
3083 update_rq_clock(rq2);
3084}
3085
3086/*
3087 * double_rq_unlock - safely unlock two runqueues
3088 *
3089 * Note this does not restore interrupts like task_rq_unlock,
3090 * you need to do so manually after calling.
3091 */
3092static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
3093 __releases(rq1->lock)
3094 __releases(rq2->lock)
3095{
3096 raw_spin_unlock(&rq1->lock);
3097 if (rq1 != rq2)
3098 raw_spin_unlock(&rq2->lock);
3099 else
3100 __release(rq2->lock);
3101}
3102
3103/*
3104 * If dest_cpu is allowed for this process, migrate the task to it.
3105 * This is accomplished by forcing the cpu_allowed mask to only
3106 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
3107 * the cpu_allowed mask is restored.
3108 */ 3119 */
3109static void sched_migrate_task(struct task_struct *p, int dest_cpu) 3120void sched_exec(void)
3110{ 3121{
3122 struct task_struct *p = current;
3111 struct migration_req req; 3123 struct migration_req req;
3124 int dest_cpu, this_cpu;
3112 unsigned long flags; 3125 unsigned long flags;
3113 struct rq *rq; 3126 struct rq *rq;
3114 3127
3128again:
3129 this_cpu = get_cpu();
3130 dest_cpu = select_task_rq(p, SD_BALANCE_EXEC, 0);
3131 if (dest_cpu == this_cpu) {
3132 put_cpu();
3133 return;
3134 }
3135
3115 rq = task_rq_lock(p, &flags); 3136 rq = task_rq_lock(p, &flags);
3137 put_cpu();
3138
3139 /*
3140 * select_task_rq() can race against ->cpus_allowed
3141 */
3116 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed) 3142 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)
3117 || unlikely(!cpu_active(dest_cpu))) 3143 || unlikely(!cpu_active(dest_cpu))) {
3118 goto out; 3144 task_rq_unlock(rq, &flags);
3145 goto again;
3146 }
3119 3147
3120 /* force the process onto the specified CPU */ 3148 /* force the process onto the specified CPU */
3121 if (migrate_task(p, dest_cpu, &req)) { 3149 if (migrate_task(p, dest_cpu, &req)) {
@@ -3130,1788 +3158,9 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
3130 3158
3131 return; 3159 return;
3132 } 3160 }
3133out:
3134 task_rq_unlock(rq, &flags); 3161 task_rq_unlock(rq, &flags);
3135} 3162}
3136 3163
3137/*
3138 * sched_exec - execve() is a valuable balancing opportunity, because at
3139 * this point the task has the smallest effective memory and cache footprint.
3140 */
3141void sched_exec(void)
3142{
3143 int new_cpu, this_cpu = get_cpu();
3144 new_cpu = select_task_rq(current, SD_BALANCE_EXEC, 0);
3145 put_cpu();
3146 if (new_cpu != this_cpu)
3147 sched_migrate_task(current, new_cpu);
3148}
3149
3150/*
3151 * pull_task - move a task from a remote runqueue to the local runqueue.
3152 * Both runqueues must be locked.
3153 */
3154static void pull_task(struct rq *src_rq, struct task_struct *p,
3155 struct rq *this_rq, int this_cpu)
3156{
3157 deactivate_task(src_rq, p, 0);
3158 set_task_cpu(p, this_cpu);
3159 activate_task(this_rq, p, 0);
3160 check_preempt_curr(this_rq, p, 0);
3161}
3162
3163/*
3164 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3165 */
3166static
3167int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
3168 struct sched_domain *sd, enum cpu_idle_type idle,
3169 int *all_pinned)
3170{
3171 int tsk_cache_hot = 0;
3172 /*
3173 * We do not migrate tasks that are:
3174 * 1) running (obviously), or
3175 * 2) cannot be migrated to this CPU due to cpus_allowed, or
3176 * 3) are cache-hot on their current CPU.
3177 */
3178 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) {
3179 schedstat_inc(p, se.nr_failed_migrations_affine);
3180 return 0;
3181 }
3182 *all_pinned = 0;
3183
3184 if (task_running(rq, p)) {
3185 schedstat_inc(p, se.nr_failed_migrations_running);
3186 return 0;
3187 }
3188
3189 /*
3190 * Aggressive migration if:
3191 * 1) task is cache cold, or
3192 * 2) too many balance attempts have failed.
3193 */
3194
3195 tsk_cache_hot = task_hot(p, rq->clock, sd);
3196 if (!tsk_cache_hot ||
3197 sd->nr_balance_failed > sd->cache_nice_tries) {
3198#ifdef CONFIG_SCHEDSTATS
3199 if (tsk_cache_hot) {
3200 schedstat_inc(sd, lb_hot_gained[idle]);
3201 schedstat_inc(p, se.nr_forced_migrations);
3202 }
3203#endif
3204 return 1;
3205 }
3206
3207 if (tsk_cache_hot) {
3208 schedstat_inc(p, se.nr_failed_migrations_hot);
3209 return 0;
3210 }
3211 return 1;
3212}
3213
3214static unsigned long
3215balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3216 unsigned long max_load_move, struct sched_domain *sd,
3217 enum cpu_idle_type idle, int *all_pinned,
3218 int *this_best_prio, struct rq_iterator *iterator)
3219{
3220 int loops = 0, pulled = 0, pinned = 0;
3221 struct task_struct *p;
3222 long rem_load_move = max_load_move;
3223
3224 if (max_load_move == 0)
3225 goto out;
3226
3227 pinned = 1;
3228
3229 /*
3230 * Start the load-balancing iterator:
3231 */
3232 p = iterator->start(iterator->arg);
3233next:
3234 if (!p || loops++ > sysctl_sched_nr_migrate)
3235 goto out;
3236
3237 if ((p->se.load.weight >> 1) > rem_load_move ||
3238 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3239 p = iterator->next(iterator->arg);
3240 goto next;
3241 }
3242
3243 pull_task(busiest, p, this_rq, this_cpu);
3244 pulled++;
3245 rem_load_move -= p->se.load.weight;
3246
3247#ifdef CONFIG_PREEMPT
3248 /*
3249 * NEWIDLE balancing is a source of latency, so preemptible kernels
3250 * will stop after the first task is pulled to minimize the critical
3251 * section.
3252 */
3253 if (idle == CPU_NEWLY_IDLE)
3254 goto out;
3255#endif
3256
3257 /*
3258 * We only want to steal up to the prescribed amount of weighted load.
3259 */
3260 if (rem_load_move > 0) {
3261 if (p->prio < *this_best_prio)
3262 *this_best_prio = p->prio;
3263 p = iterator->next(iterator->arg);
3264 goto next;
3265 }
3266out:
3267 /*
3268 * Right now, this is one of only two places pull_task() is called,
3269 * so we can safely collect pull_task() stats here rather than
3270 * inside pull_task().
3271 */
3272 schedstat_add(sd, lb_gained[idle], pulled);
3273
3274 if (all_pinned)
3275 *all_pinned = pinned;
3276
3277 return max_load_move - rem_load_move;
3278}
3279
3280/*
3281 * move_tasks tries to move up to max_load_move weighted load from busiest to
3282 * this_rq, as part of a balancing operation within domain "sd".
3283 * Returns 1 if successful and 0 otherwise.
3284 *
3285 * Called with both runqueues locked.
3286 */
3287static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3288 unsigned long max_load_move,
3289 struct sched_domain *sd, enum cpu_idle_type idle,
3290 int *all_pinned)
3291{
3292 const struct sched_class *class = sched_class_highest;
3293 unsigned long total_load_moved = 0;
3294 int this_best_prio = this_rq->curr->prio;
3295
3296 do {
3297 total_load_moved +=
3298 class->load_balance(this_rq, this_cpu, busiest,
3299 max_load_move - total_load_moved,
3300 sd, idle, all_pinned, &this_best_prio);
3301 class = class->next;
3302
3303#ifdef CONFIG_PREEMPT
3304 /*
3305 * NEWIDLE balancing is a source of latency, so preemptible
3306 * kernels will stop after the first task is pulled to minimize
3307 * the critical section.
3308 */
3309 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3310 break;
3311#endif
3312 } while (class && max_load_move > total_load_moved);
3313
3314 return total_load_moved > 0;
3315}
3316
3317static int
3318iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3319 struct sched_domain *sd, enum cpu_idle_type idle,
3320 struct rq_iterator *iterator)
3321{
3322 struct task_struct *p = iterator->start(iterator->arg);
3323 int pinned = 0;
3324
3325 while (p) {
3326 if (can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
3327 pull_task(busiest, p, this_rq, this_cpu);
3328 /*
3329 * Right now, this is only the second place pull_task()
3330 * is called, so we can safely collect pull_task()
3331 * stats here rather than inside pull_task().
3332 */
3333 schedstat_inc(sd, lb_gained[idle]);
3334
3335 return 1;
3336 }
3337 p = iterator->next(iterator->arg);
3338 }
3339
3340 return 0;
3341}
3342
3343/*
3344 * move_one_task tries to move exactly one task from busiest to this_rq, as
3345 * part of active balancing operations within "domain".
3346 * Returns 1 if successful and 0 otherwise.
3347 *
3348 * Called with both runqueues locked.
3349 */
3350static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
3351 struct sched_domain *sd, enum cpu_idle_type idle)
3352{
3353 const struct sched_class *class;
3354
3355 for_each_class(class) {
3356 if (class->move_one_task(this_rq, this_cpu, busiest, sd, idle))
3357 return 1;
3358 }
3359
3360 return 0;
3361}
3362/********** Helpers for find_busiest_group ************************/
3363/*
3364 * sd_lb_stats - Structure to store the statistics of a sched_domain
3365 * during load balancing.
3366 */
3367struct sd_lb_stats {
3368 struct sched_group *busiest; /* Busiest group in this sd */
3369 struct sched_group *this; /* Local group in this sd */
3370 unsigned long total_load; /* Total load of all groups in sd */
3371 unsigned long total_pwr; /* Total power of all groups in sd */
3372 unsigned long avg_load; /* Average load across all groups in sd */
3373
3374 /** Statistics of this group */
3375 unsigned long this_load;
3376 unsigned long this_load_per_task;
3377 unsigned long this_nr_running;
3378
3379 /* Statistics of the busiest group */
3380 unsigned long max_load;
3381 unsigned long busiest_load_per_task;
3382 unsigned long busiest_nr_running;
3383
3384 int group_imb; /* Is there imbalance in this sd */
3385#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3386 int power_savings_balance; /* Is powersave balance needed for this sd */
3387 struct sched_group *group_min; /* Least loaded group in sd */
3388 struct sched_group *group_leader; /* Group which relieves group_min */
3389 unsigned long min_load_per_task; /* load_per_task in group_min */
3390 unsigned long leader_nr_running; /* Nr running of group_leader */
3391 unsigned long min_nr_running; /* Nr running of group_min */
3392#endif
3393};
3394
3395/*
3396 * sg_lb_stats - stats of a sched_group required for load_balancing
3397 */
3398struct sg_lb_stats {
3399 unsigned long avg_load; /*Avg load across the CPUs of the group */
3400 unsigned long group_load; /* Total load over the CPUs of the group */
3401 unsigned long sum_nr_running; /* Nr tasks running in the group */
3402 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
3403 unsigned long group_capacity;
3404 int group_imb; /* Is there an imbalance in the group ? */
3405};
3406
3407/**
3408 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3409 * @group: The group whose first cpu is to be returned.
3410 */
3411static inline unsigned int group_first_cpu(struct sched_group *group)
3412{
3413 return cpumask_first(sched_group_cpus(group));
3414}
3415
3416/**
3417 * get_sd_load_idx - Obtain the load index for a given sched domain.
3418 * @sd: The sched_domain whose load_idx is to be obtained.
3419 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
3420 */
3421static inline int get_sd_load_idx(struct sched_domain *sd,
3422 enum cpu_idle_type idle)
3423{
3424 int load_idx;
3425
3426 switch (idle) {
3427 case CPU_NOT_IDLE:
3428 load_idx = sd->busy_idx;
3429 break;
3430
3431 case CPU_NEWLY_IDLE:
3432 load_idx = sd->newidle_idx;
3433 break;
3434 default:
3435 load_idx = sd->idle_idx;
3436 break;
3437 }
3438
3439 return load_idx;
3440}
3441
3442
3443#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3444/**
3445 * init_sd_power_savings_stats - Initialize power savings statistics for
3446 * the given sched_domain, during load balancing.
3447 *
3448 * @sd: Sched domain whose power-savings statistics are to be initialized.
3449 * @sds: Variable containing the statistics for sd.
3450 * @idle: Idle status of the CPU at which we're performing load-balancing.
3451 */
3452static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3453 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3454{
3455 /*
3456 * Busy processors will not participate in power savings
3457 * balance.
3458 */
3459 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3460 sds->power_savings_balance = 0;
3461 else {
3462 sds->power_savings_balance = 1;
3463 sds->min_nr_running = ULONG_MAX;
3464 sds->leader_nr_running = 0;
3465 }
3466}
3467
3468/**
3469 * update_sd_power_savings_stats - Update the power saving stats for a
3470 * sched_domain while performing load balancing.
3471 *
3472 * @group: sched_group belonging to the sched_domain under consideration.
3473 * @sds: Variable containing the statistics of the sched_domain
3474 * @local_group: Does group contain the CPU for which we're performing
3475 * load balancing ?
3476 * @sgs: Variable containing the statistics of the group.
3477 */
3478static inline void update_sd_power_savings_stats(struct sched_group *group,
3479 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3480{
3481
3482 if (!sds->power_savings_balance)
3483 return;
3484
3485 /*
3486 * If the local group is idle or completely loaded
3487 * no need to do power savings balance at this domain
3488 */
3489 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3490 !sds->this_nr_running))
3491 sds->power_savings_balance = 0;
3492
3493 /*
3494 * If a group is already running at full capacity or idle,
3495 * don't include that group in power savings calculations
3496 */
3497 if (!sds->power_savings_balance ||
3498 sgs->sum_nr_running >= sgs->group_capacity ||
3499 !sgs->sum_nr_running)
3500 return;
3501
3502 /*
3503 * Calculate the group which has the least non-idle load.
3504 * This is the group from where we need to pick up the load
3505 * for saving power
3506 */
3507 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3508 (sgs->sum_nr_running == sds->min_nr_running &&
3509 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3510 sds->group_min = group;
3511 sds->min_nr_running = sgs->sum_nr_running;
3512 sds->min_load_per_task = sgs->sum_weighted_load /
3513 sgs->sum_nr_running;
3514 }
3515
3516 /*
3517 * Calculate the group which is almost near its
3518 * capacity but still has some space to pick up some load
3519 * from other group and save more power
3520 */
3521 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3522 return;
3523
3524 if (sgs->sum_nr_running > sds->leader_nr_running ||
3525 (sgs->sum_nr_running == sds->leader_nr_running &&
3526 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3527 sds->group_leader = group;
3528 sds->leader_nr_running = sgs->sum_nr_running;
3529 }
3530}
3531
3532/**
3533 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3534 * @sds: Variable containing the statistics of the sched_domain
3535 * under consideration.
3536 * @this_cpu: Cpu at which we're currently performing load-balancing.
3537 * @imbalance: Variable to store the imbalance.
3538 *
3539 * Description:
3540 * Check if we have potential to perform some power-savings balance.
3541 * If yes, set the busiest group to be the least loaded group in the
3542 * sched_domain, so that it's CPUs can be put to idle.
3543 *
3544 * Returns 1 if there is potential to perform power-savings balance.
3545 * Else returns 0.
3546 */
3547static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3548 int this_cpu, unsigned long *imbalance)
3549{
3550 if (!sds->power_savings_balance)
3551 return 0;
3552
3553 if (sds->this != sds->group_leader ||
3554 sds->group_leader == sds->group_min)
3555 return 0;
3556
3557 *imbalance = sds->min_load_per_task;
3558 sds->busiest = sds->group_min;
3559
3560 return 1;
3561
3562}
3563#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3564static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3565 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3566{
3567 return;
3568}
3569
3570static inline void update_sd_power_savings_stats(struct sched_group *group,
3571 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3572{
3573 return;
3574}
3575
3576static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3577 int this_cpu, unsigned long *imbalance)
3578{
3579 return 0;
3580}
3581#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3582
3583
3584unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3585{
3586 return SCHED_LOAD_SCALE;
3587}
3588
3589unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
3590{
3591 return default_scale_freq_power(sd, cpu);
3592}
3593
3594unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
3595{
3596 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3597 unsigned long smt_gain = sd->smt_gain;
3598
3599 smt_gain /= weight;
3600
3601 return smt_gain;
3602}
3603
3604unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3605{
3606 return default_scale_smt_power(sd, cpu);
3607}
3608
3609unsigned long scale_rt_power(int cpu)
3610{
3611 struct rq *rq = cpu_rq(cpu);
3612 u64 total, available;
3613
3614 sched_avg_update(rq);
3615
3616 total = sched_avg_period() + (rq->clock - rq->age_stamp);
3617 available = total - rq->rt_avg;
3618
3619 if (unlikely((s64)total < SCHED_LOAD_SCALE))
3620 total = SCHED_LOAD_SCALE;
3621
3622 total >>= SCHED_LOAD_SHIFT;
3623
3624 return div_u64(available, total);
3625}
3626
3627static void update_cpu_power(struct sched_domain *sd, int cpu)
3628{
3629 unsigned long weight = cpumask_weight(sched_domain_span(sd));
3630 unsigned long power = SCHED_LOAD_SCALE;
3631 struct sched_group *sdg = sd->groups;
3632
3633 if (sched_feat(ARCH_POWER))
3634 power *= arch_scale_freq_power(sd, cpu);
3635 else
3636 power *= default_scale_freq_power(sd, cpu);
3637
3638 power >>= SCHED_LOAD_SHIFT;
3639
3640 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
3641 if (sched_feat(ARCH_POWER))
3642 power *= arch_scale_smt_power(sd, cpu);
3643 else
3644 power *= default_scale_smt_power(sd, cpu);
3645
3646 power >>= SCHED_LOAD_SHIFT;
3647 }
3648
3649 power *= scale_rt_power(cpu);
3650 power >>= SCHED_LOAD_SHIFT;
3651
3652 if (!power)
3653 power = 1;
3654
3655 sdg->cpu_power = power;
3656}
3657
3658static void update_group_power(struct sched_domain *sd, int cpu)
3659{
3660 struct sched_domain *child = sd->child;
3661 struct sched_group *group, *sdg = sd->groups;
3662 unsigned long power;
3663
3664 if (!child) {
3665 update_cpu_power(sd, cpu);
3666 return;
3667 }
3668
3669 power = 0;
3670
3671 group = child->groups;
3672 do {
3673 power += group->cpu_power;
3674 group = group->next;
3675 } while (group != child->groups);
3676
3677 sdg->cpu_power = power;
3678}
3679
3680/**
3681 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3682 * @sd: The sched_domain whose statistics are to be updated.
3683 * @group: sched_group whose statistics are to be updated.
3684 * @this_cpu: Cpu for which load balance is currently performed.
3685 * @idle: Idle status of this_cpu
3686 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3687 * @sd_idle: Idle status of the sched_domain containing group.
3688 * @local_group: Does group contain this_cpu.
3689 * @cpus: Set of cpus considered for load balancing.
3690 * @balance: Should we balance.
3691 * @sgs: variable to hold the statistics for this group.
3692 */
3693static inline void update_sg_lb_stats(struct sched_domain *sd,
3694 struct sched_group *group, int this_cpu,
3695 enum cpu_idle_type idle, int load_idx, int *sd_idle,
3696 int local_group, const struct cpumask *cpus,
3697 int *balance, struct sg_lb_stats *sgs)
3698{
3699 unsigned long load, max_cpu_load, min_cpu_load;
3700 int i;
3701 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3702 unsigned long sum_avg_load_per_task;
3703 unsigned long avg_load_per_task;
3704
3705 if (local_group) {
3706 balance_cpu = group_first_cpu(group);
3707 if (balance_cpu == this_cpu)
3708 update_group_power(sd, this_cpu);
3709 }
3710
3711 /* Tally up the load of all CPUs in the group */
3712 sum_avg_load_per_task = avg_load_per_task = 0;
3713 max_cpu_load = 0;
3714 min_cpu_load = ~0UL;
3715
3716 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3717 struct rq *rq = cpu_rq(i);
3718
3719 if (*sd_idle && rq->nr_running)
3720 *sd_idle = 0;
3721
3722 /* Bias balancing toward cpus of our domain */
3723 if (local_group) {
3724 if (idle_cpu(i) && !first_idle_cpu) {
3725 first_idle_cpu = 1;
3726 balance_cpu = i;
3727 }
3728
3729 load = target_load(i, load_idx);
3730 } else {
3731 load = source_load(i, load_idx);
3732 if (load > max_cpu_load)
3733 max_cpu_load = load;
3734 if (min_cpu_load > load)
3735 min_cpu_load = load;
3736 }
3737
3738 sgs->group_load += load;
3739 sgs->sum_nr_running += rq->nr_running;
3740 sgs->sum_weighted_load += weighted_cpuload(i);
3741
3742 sum_avg_load_per_task += cpu_avg_load_per_task(i);
3743 }
3744
3745 /*
3746 * First idle cpu or the first cpu(busiest) in this sched group
3747 * is eligible for doing load balancing at this and above
3748 * domains. In the newly idle case, we will allow all the cpu's
3749 * to do the newly idle load balance.
3750 */
3751 if (idle != CPU_NEWLY_IDLE && local_group &&
3752 balance_cpu != this_cpu && balance) {
3753 *balance = 0;
3754 return;
3755 }
3756
3757 /* Adjust by relative CPU power of the group */
3758 sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
3759
3760
3761 /*
3762 * Consider the group unbalanced when the imbalance is larger
3763 * than the average weight of two tasks.
3764 *
3765 * APZ: with cgroup the avg task weight can vary wildly and
3766 * might not be a suitable number - should we keep a
3767 * normalized nr_running number somewhere that negates
3768 * the hierarchy?
3769 */
3770 avg_load_per_task = (sum_avg_load_per_task * SCHED_LOAD_SCALE) /
3771 group->cpu_power;
3772
3773 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3774 sgs->group_imb = 1;
3775
3776 sgs->group_capacity =
3777 DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
3778}
3779
3780/**
3781 * update_sd_lb_stats - Update sched_group's statistics for load balancing.
3782 * @sd: sched_domain whose statistics are to be updated.
3783 * @this_cpu: Cpu for which load balance is currently performed.
3784 * @idle: Idle status of this_cpu
3785 * @sd_idle: Idle status of the sched_domain containing group.
3786 * @cpus: Set of cpus considered for load balancing.
3787 * @balance: Should we balance.
3788 * @sds: variable to hold the statistics for this sched_domain.
3789 */
3790static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3791 enum cpu_idle_type idle, int *sd_idle,
3792 const struct cpumask *cpus, int *balance,
3793 struct sd_lb_stats *sds)
3794{
3795 struct sched_domain *child = sd->child;
3796 struct sched_group *group = sd->groups;
3797 struct sg_lb_stats sgs;
3798 int load_idx, prefer_sibling = 0;
3799
3800 if (child && child->flags & SD_PREFER_SIBLING)
3801 prefer_sibling = 1;
3802
3803 init_sd_power_savings_stats(sd, sds, idle);
3804 load_idx = get_sd_load_idx(sd, idle);
3805
3806 do {
3807 int local_group;
3808
3809 local_group = cpumask_test_cpu(this_cpu,
3810 sched_group_cpus(group));
3811 memset(&sgs, 0, sizeof(sgs));
3812 update_sg_lb_stats(sd, group, this_cpu, idle, load_idx, sd_idle,
3813 local_group, cpus, balance, &sgs);
3814
3815 if (local_group && balance && !(*balance))
3816 return;
3817
3818 sds->total_load += sgs.group_load;
3819 sds->total_pwr += group->cpu_power;
3820
3821 /*
3822 * In case the child domain prefers tasks go to siblings
3823 * first, lower the group capacity to one so that we'll try
3824 * and move all the excess tasks away.
3825 */
3826 if (prefer_sibling)
3827 sgs.group_capacity = min(sgs.group_capacity, 1UL);
3828
3829 if (local_group) {
3830 sds->this_load = sgs.avg_load;
3831 sds->this = group;
3832 sds->this_nr_running = sgs.sum_nr_running;
3833 sds->this_load_per_task = sgs.sum_weighted_load;
3834 } else if (sgs.avg_load > sds->max_load &&
3835 (sgs.sum_nr_running > sgs.group_capacity ||
3836 sgs.group_imb)) {
3837 sds->max_load = sgs.avg_load;
3838 sds->busiest = group;
3839 sds->busiest_nr_running = sgs.sum_nr_running;
3840 sds->busiest_load_per_task = sgs.sum_weighted_load;
3841 sds->group_imb = sgs.group_imb;
3842 }
3843
3844 update_sd_power_savings_stats(group, sds, local_group, &sgs);
3845 group = group->next;
3846 } while (group != sd->groups);
3847}
3848
3849/**
3850 * fix_small_imbalance - Calculate the minor imbalance that exists
3851 * amongst the groups of a sched_domain, during
3852 * load balancing.
3853 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
3854 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
3855 * @imbalance: Variable to store the imbalance.
3856 */
3857static inline void fix_small_imbalance(struct sd_lb_stats *sds,
3858 int this_cpu, unsigned long *imbalance)
3859{
3860 unsigned long tmp, pwr_now = 0, pwr_move = 0;
3861 unsigned int imbn = 2;
3862
3863 if (sds->this_nr_running) {
3864 sds->this_load_per_task /= sds->this_nr_running;
3865 if (sds->busiest_load_per_task >
3866 sds->this_load_per_task)
3867 imbn = 1;
3868 } else
3869 sds->this_load_per_task =
3870 cpu_avg_load_per_task(this_cpu);
3871
3872 if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
3873 sds->busiest_load_per_task * imbn) {
3874 *imbalance = sds->busiest_load_per_task;
3875 return;
3876 }
3877
3878 /*
3879 * OK, we don't have enough imbalance to justify moving tasks,
3880 * however we may be able to increase total CPU power used by
3881 * moving them.
3882 */
3883
3884 pwr_now += sds->busiest->cpu_power *
3885 min(sds->busiest_load_per_task, sds->max_load);
3886 pwr_now += sds->this->cpu_power *
3887 min(sds->this_load_per_task, sds->this_load);
3888 pwr_now /= SCHED_LOAD_SCALE;
3889
3890 /* Amount of load we'd subtract */
3891 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3892 sds->busiest->cpu_power;
3893 if (sds->max_load > tmp)
3894 pwr_move += sds->busiest->cpu_power *
3895 min(sds->busiest_load_per_task, sds->max_load - tmp);
3896
3897 /* Amount of load we'd add */
3898 if (sds->max_load * sds->busiest->cpu_power <
3899 sds->busiest_load_per_task * SCHED_LOAD_SCALE)
3900 tmp = (sds->max_load * sds->busiest->cpu_power) /
3901 sds->this->cpu_power;
3902 else
3903 tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
3904 sds->this->cpu_power;
3905 pwr_move += sds->this->cpu_power *
3906 min(sds->this_load_per_task, sds->this_load + tmp);
3907 pwr_move /= SCHED_LOAD_SCALE;
3908
3909 /* Move if we gain throughput */
3910 if (pwr_move > pwr_now)
3911 *imbalance = sds->busiest_load_per_task;
3912}
3913
3914/**
3915 * calculate_imbalance - Calculate the amount of imbalance present within the
3916 * groups of a given sched_domain during load balance.
3917 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
3918 * @this_cpu: Cpu for which currently load balance is being performed.
3919 * @imbalance: The variable to store the imbalance.
3920 */
3921static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
3922 unsigned long *imbalance)
3923{
3924 unsigned long max_pull;
3925 /*
3926 * In the presence of smp nice balancing, certain scenarios can have
3927	 * max load less than avg load (as we skip the groups at or below
3928	 * their cpu_power while calculating max_load).
3929 */
3930 if (sds->max_load < sds->avg_load) {
3931 *imbalance = 0;
3932 return fix_small_imbalance(sds, this_cpu, imbalance);
3933 }
3934
3935 /* Don't want to pull so many tasks that a group would go idle */
3936 max_pull = min(sds->max_load - sds->avg_load,
3937 sds->max_load - sds->busiest_load_per_task);
3938
3939 /* How much load to actually move to equalise the imbalance */
3940 *imbalance = min(max_pull * sds->busiest->cpu_power,
3941 (sds->avg_load - sds->this_load) * sds->this->cpu_power)
3942 / SCHED_LOAD_SCALE;
3943
3944 /*
3945 * if *imbalance is less than the average load per runnable task
3946	 * there is no guarantee that any tasks will be moved so we'll have
3947 * a think about bumping its value to force at least one task to be
3948 * moved
3949 */
3950 if (*imbalance < sds->busiest_load_per_task)
3951 return fix_small_imbalance(sds, this_cpu, imbalance);
3952
3953}
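A compact sketch of the clamp performed by calculate_imbalance(): pull no more than brings the busiest group down to the average, and no more than this group can absorb. The helper below is illustrative only, with made-up names and values:

#include <stdio.h>

#define LOAD_SCALE 1024UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

static unsigned long imbalance(unsigned long max_load, unsigned long avg_load,
			       unsigned long this_load, unsigned long busiest_task_load,
			       unsigned long busiest_power, unsigned long this_power)
{
	/* don't pull so much that the busiest group would drop below one task */
	unsigned long max_pull = min_ul(max_load - avg_load,
					max_load - busiest_task_load);

	/* move the smaller of "what busiest can give" and "what we can take" */
	return min_ul(max_pull * busiest_power,
		      (avg_load - this_load) * this_power) / LOAD_SCALE;
}

int main(void)
{
	/* busiest at 2048, average 1536, we sit at 1024, tasks weigh ~512 */
	printf("imbalance = %lu\n",
	       imbalance(2048, 1536, 1024, 512, 1024, 1024));
	return 0;
}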
3954/******* find_busiest_group() helpers end here *********************/
3955
3956/**
3957 * find_busiest_group - Returns the busiest group within the sched_domain
3958 * if there is an imbalance. If there isn't an imbalance, and
3959 * the user has opted for power-savings, it returns a group whose
3960 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
3961 * such a group exists.
3962 *
3963 * Also calculates the amount of weighted load which should be moved
3964 * to restore balance.
3965 *
3966 * @sd: The sched_domain whose busiest group is to be returned.
3967 * @this_cpu: The cpu for which load balancing is currently being performed.
3968 * @imbalance: Variable which stores amount of weighted load which should
3969 * be moved to restore balance/put a group to idle.
3970 * @idle: The idle status of this_cpu.
3971 * @sd_idle: The idleness of sd
3972 * @cpus: The set of CPUs under consideration for load-balancing.
3973 * @balance: Pointer to a variable indicating if this_cpu
3974 * is the appropriate cpu to perform load balancing at this_level.
3975 *
3976 * Returns: - the busiest group if imbalance exists.
3977 * - If no imbalance and user has opted for power-savings balance,
3978 * return the least loaded group whose CPUs can be
3979 * put to idle by rebalancing its tasks onto our group.
3980 */
3981static struct sched_group *
3982find_busiest_group(struct sched_domain *sd, int this_cpu,
3983 unsigned long *imbalance, enum cpu_idle_type idle,
3984 int *sd_idle, const struct cpumask *cpus, int *balance)
3985{
3986 struct sd_lb_stats sds;
3987
3988 memset(&sds, 0, sizeof(sds));
3989
3990 /*
3991	 * Compute the various statistics relevant to load balancing at
3992 * this level.
3993 */
3994 update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
3995 balance, &sds);
3996
3997 /* Cases where imbalance does not exist from POV of this_cpu */
3998 /* 1) this_cpu is not the appropriate cpu to perform load balancing
3999 * at this level.
4000 * 2) There is no busy sibling group to pull from.
4001 * 3) This group is the busiest group.
4002	 * 4) This group is busier than the average busyness at this
4003 * sched_domain.
4004 * 5) The imbalance is within the specified limit.
4005 * 6) Any rebalance would lead to ping-pong
4006 */
4007 if (balance && !(*balance))
4008 goto ret;
4009
4010 if (!sds.busiest || sds.busiest_nr_running == 0)
4011 goto out_balanced;
4012
4013 if (sds.this_load >= sds.max_load)
4014 goto out_balanced;
4015
4016 sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
4017
4018 if (sds.this_load >= sds.avg_load)
4019 goto out_balanced;
4020
4021 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
4022 goto out_balanced;
4023
4024 sds.busiest_load_per_task /= sds.busiest_nr_running;
4025 if (sds.group_imb)
4026 sds.busiest_load_per_task =
4027 min(sds.busiest_load_per_task, sds.avg_load);
4028
4029 /*
4030 * We're trying to get all the cpus to the average_load, so we don't
4031 * want to push ourselves above the average load, nor do we wish to
4032 * reduce the max loaded cpu below the average load, as either of these
4033 * actions would just result in more rebalancing later, and ping-pong
4034 * tasks around. Thus we look for the minimum possible imbalance.
4035 * Negative imbalances (*we* are more loaded than anyone else) will
4036 * be counted as no imbalance for these purposes -- we can't fix that
4037 * by pulling tasks to us. Be careful of negative numbers as they'll
4038 * appear as very large values with unsigned longs.
4039 */
4040 if (sds.max_load <= sds.busiest_load_per_task)
4041 goto out_balanced;
4042
4043 /* Looks like there is an imbalance. Compute it */
4044 calculate_imbalance(&sds, this_cpu, imbalance);
4045 return sds.busiest;
4046
4047out_balanced:
4048 /*
4049 * There is no obvious imbalance. But check if we can do some balancing
4050 * to save power.
4051 */
4052 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
4053 return sds.busiest;
4054ret:
4055 *imbalance = 0;
4056 return NULL;
4057}
4058
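A hedged sketch of the early-exit filters above, with the imbalance_pct test spelled out numerically; worth_balancing() and its arguments are hypothetical names used only for this example:

#include <stdbool.h>
#include <stdio.h>

static bool worth_balancing(unsigned long this_load, unsigned long max_load,
			    unsigned long total_load, unsigned long total_power,
			    unsigned int imbalance_pct)
{
	unsigned long avg_load;

	if (!max_load || this_load >= max_load)        /* no busier sibling group */
		return false;

	avg_load = 1024UL * total_load / total_power;  /* domain-wide average     */
	if (this_load >= avg_load)                     /* already above average   */
		return false;

	/* the busiest group must beat us by the domain's imbalance percentage */
	return 100 * max_load > imbalance_pct * this_load;
}

int main(void)
{
	printf("%d\n", worth_balancing(800, 1200, 2000, 2048, 125));  /* prints 1 */
	printf("%d\n", worth_balancing(1000, 1100, 2100, 2048, 125)); /* prints 0 */
	return 0;
}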
4059/*
4060 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4061 */
4062static struct rq *
4063find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
4064 unsigned long imbalance, const struct cpumask *cpus)
4065{
4066 struct rq *busiest = NULL, *rq;
4067 unsigned long max_load = 0;
4068 int i;
4069
4070 for_each_cpu(i, sched_group_cpus(group)) {
4071 unsigned long power = power_of(i);
4072 unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
4073 unsigned long wl;
4074
4075 if (!cpumask_test_cpu(i, cpus))
4076 continue;
4077
4078 rq = cpu_rq(i);
4079 wl = weighted_cpuload(i) * SCHED_LOAD_SCALE;
4080 wl /= power;
4081
4082 if (capacity && rq->nr_running == 1 && wl > imbalance)
4083 continue;
4084
4085 if (wl > max_load) {
4086 max_load = wl;
4087 busiest = rq;
4088 }
4089 }
4090
4091 return busiest;
4092}
4093
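The per-runqueue comparison normalises raw load by cpu power before picking the busiest queue; the toy program below shows why a weak-but-loaded cpu can win over a strong one, and models the "single task heavier than the imbalance" skip. All names and power values are illustrative:

#include <stdio.h>

#define LOAD_SCALE 1024UL

struct cpu_sample {
	unsigned long load;        /* weighted runqueue load    */
	unsigned long power;       /* relative compute capacity */
	unsigned long nr_running;  /* tasks on the runqueue     */
};

int main(void)
{
	struct cpu_sample cpus[] = {
		{ .load = 900,  .power = 512,  .nr_running = 2 }, /* weak, busy   */
		{ .load = 1000, .power = 1024, .nr_running = 3 }, /* strong, busy */
	};
	unsigned long imbalance = 700, max_wl = 0;
	int busiest = -1;

	for (int i = 0; i < 2; i++) {
		unsigned long wl = cpus[i].load * LOAD_SCALE / cpus[i].power;

		/* a lone task bigger than the imbalance can't help us: skip it */
		if (cpus[i].nr_running == 1 && wl > imbalance)
			continue;

		if (wl > max_wl) {
			max_wl = wl;
			busiest = i;
		}
	}
	printf("busiest cpu: %d (normalised load %lu)\n", busiest, max_wl);
	return 0;
}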
4094/*
4095	 * Max backoff if we encounter pinned tasks. The exact value is pretty
4096	 * arbitrary, but it works so long as it is large enough.
4097 */
4098#define MAX_PINNED_INTERVAL 512
4099
4100/* Working cpumask for load_balance and load_balance_newidle. */
4101static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4102
4103/*
4104 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4105 * tasks if there is an imbalance.
4106 */
4107static int load_balance(int this_cpu, struct rq *this_rq,
4108 struct sched_domain *sd, enum cpu_idle_type idle,
4109 int *balance)
4110{
4111 int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
4112 struct sched_group *group;
4113 unsigned long imbalance;
4114 struct rq *busiest;
4115 unsigned long flags;
4116 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4117
4118 cpumask_copy(cpus, cpu_active_mask);
4119
4120 /*
4121	 * When the power savings policy is enabled for the parent domain, an idle
4122 * sibling can pick up load irrespective of busy siblings. In this case,
4123	 * let the state of the idle sibling percolate up as CPU_IDLE, instead of
4124 * portraying it as CPU_NOT_IDLE.
4125 */
4126 if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
4127 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4128 sd_idle = 1;
4129
4130 schedstat_inc(sd, lb_count[idle]);
4131
4132redo:
4133 update_shares(sd);
4134 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
4135 cpus, balance);
4136
4137 if (*balance == 0)
4138 goto out_balanced;
4139
4140 if (!group) {
4141 schedstat_inc(sd, lb_nobusyg[idle]);
4142 goto out_balanced;
4143 }
4144
4145 busiest = find_busiest_queue(group, idle, imbalance, cpus);
4146 if (!busiest) {
4147 schedstat_inc(sd, lb_nobusyq[idle]);
4148 goto out_balanced;
4149 }
4150
4151 BUG_ON(busiest == this_rq);
4152
4153 schedstat_add(sd, lb_imbalance[idle], imbalance);
4154
4155 ld_moved = 0;
4156 if (busiest->nr_running > 1) {
4157 /*
4158 * Attempt to move tasks. If find_busiest_group has found
4159 * an imbalance but busiest->nr_running <= 1, the group is
4160 * still unbalanced. ld_moved simply stays zero, so it is
4161 * correctly treated as an imbalance.
4162 */
4163 local_irq_save(flags);
4164 double_rq_lock(this_rq, busiest);
4165 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4166 imbalance, sd, idle, &all_pinned);
4167 double_rq_unlock(this_rq, busiest);
4168 local_irq_restore(flags);
4169
4170 /*
4171 * some other cpu did the load balance for us.
4172 */
4173 if (ld_moved && this_cpu != smp_processor_id())
4174 resched_cpu(this_cpu);
4175
4176 /* All tasks on this runqueue were pinned by CPU affinity */
4177 if (unlikely(all_pinned)) {
4178 cpumask_clear_cpu(cpu_of(busiest), cpus);
4179 if (!cpumask_empty(cpus))
4180 goto redo;
4181 goto out_balanced;
4182 }
4183 }
4184
4185 if (!ld_moved) {
4186 schedstat_inc(sd, lb_failed[idle]);
4187 sd->nr_balance_failed++;
4188
4189 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
4190
4191 raw_spin_lock_irqsave(&busiest->lock, flags);
4192
4193			/* don't kick the migration_thread if the curr
4194			 * task on the busiest cpu can't be moved to this_cpu
4195 */
4196 if (!cpumask_test_cpu(this_cpu,
4197 &busiest->curr->cpus_allowed)) {
4198 raw_spin_unlock_irqrestore(&busiest->lock,
4199 flags);
4200 all_pinned = 1;
4201 goto out_one_pinned;
4202 }
4203
4204 if (!busiest->active_balance) {
4205 busiest->active_balance = 1;
4206 busiest->push_cpu = this_cpu;
4207 active_balance = 1;
4208 }
4209 raw_spin_unlock_irqrestore(&busiest->lock, flags);
4210 if (active_balance)
4211 wake_up_process(busiest->migration_thread);
4212
4213 /*
4214 * We've kicked active balancing, reset the failure
4215 * counter.
4216 */
4217 sd->nr_balance_failed = sd->cache_nice_tries+1;
4218 }
4219 } else
4220 sd->nr_balance_failed = 0;
4221
4222 if (likely(!active_balance)) {
4223 /* We were unbalanced, so reset the balancing interval */
4224 sd->balance_interval = sd->min_interval;
4225 } else {
4226 /*
4227 * If we've begun active balancing, start to back off. This
4228 * case may not be covered by the all_pinned logic if there
4229 * is only 1 task on the busy runqueue (because we don't call
4230 * move_tasks).
4231 */
4232 if (sd->balance_interval < sd->max_interval)
4233 sd->balance_interval *= 2;
4234 }
4235
4236 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4237 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4238 ld_moved = -1;
4239
4240 goto out;
4241
4242out_balanced:
4243 schedstat_inc(sd, lb_balanced[idle]);
4244
4245 sd->nr_balance_failed = 0;
4246
4247out_one_pinned:
4248 /* tune up the balancing interval */
4249 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
4250 (sd->balance_interval < sd->max_interval))
4251 sd->balance_interval *= 2;
4252
4253 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4254 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4255 ld_moved = -1;
4256 else
4257 ld_moved = 0;
4258out:
4259 if (ld_moved)
4260 update_shares(sd);
4261 return ld_moved;
4262}
4263
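A sketch of the balance-interval backoff used at the end of load_balance(): double on failure (with pinned tasks allowed to push past max_interval up to MAX_PINNED_INTERVAL), reset on success. The struct below is a stand-in, not struct sched_domain:

#include <stdio.h>

#define MAX_PINNED_INTERVAL 512

struct domain {
	unsigned long balance_interval; /* current interval, in ms  */
	unsigned long min_interval;     /* reset value on success   */
	unsigned long max_interval;     /* normal cap for the backoff */
};

static void after_balance(struct domain *sd, int moved, int all_pinned)
{
	if (moved) {
		sd->balance_interval = sd->min_interval;   /* progress: retry soon   */
		return;
	}
	if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
	    sd->balance_interval < sd->max_interval)
		sd->balance_interval *= 2;                 /* no progress: back off  */
}

int main(void)
{
	struct domain sd = { .balance_interval = 8, .min_interval = 8, .max_interval = 128 };

	for (int i = 0; i < 5; i++)
		after_balance(&sd, 0, 1);                  /* five failed, pinned attempts */
	printf("after failures: %lu ms\n", sd.balance_interval);

	after_balance(&sd, 1, 0);
	printf("after success:  %lu ms\n", sd.balance_interval);
	return 0;
}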
4264/*
4265 * Check this_cpu to ensure it is balanced within domain. Attempt to move
4266 * tasks if there is an imbalance.
4267 *
4268 * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE).
4269 * this_rq is locked.
4270 */
4271static int
4272load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
4273{
4274 struct sched_group *group;
4275 struct rq *busiest = NULL;
4276 unsigned long imbalance;
4277 int ld_moved = 0;
4278 int sd_idle = 0;
4279 int all_pinned = 0;
4280 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
4281
4282 cpumask_copy(cpus, cpu_active_mask);
4283
4284 /*
4285	 * When the power savings policy is enabled for the parent domain, an idle
4286 * sibling can pick up load irrespective of busy siblings. In this case,
4287	 * let the state of the idle sibling percolate up as IDLE, instead of
4288 * portraying it as CPU_NOT_IDLE.
4289 */
4290 if (sd->flags & SD_SHARE_CPUPOWER &&
4291 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4292 sd_idle = 1;
4293
4294 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
4295redo:
4296 update_shares_locked(this_rq, sd);
4297 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
4298 &sd_idle, cpus, NULL);
4299 if (!group) {
4300 schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]);
4301 goto out_balanced;
4302 }
4303
4304 busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, cpus);
4305 if (!busiest) {
4306 schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]);
4307 goto out_balanced;
4308 }
4309
4310 BUG_ON(busiest == this_rq);
4311
4312 schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance);
4313
4314 ld_moved = 0;
4315 if (busiest->nr_running > 1) {
4316 /* Attempt to move tasks */
4317 double_lock_balance(this_rq, busiest);
4318 /* this_rq->clock is already updated */
4319 update_rq_clock(busiest);
4320 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4321 imbalance, sd, CPU_NEWLY_IDLE,
4322 &all_pinned);
4323 double_unlock_balance(this_rq, busiest);
4324
4325 if (unlikely(all_pinned)) {
4326 cpumask_clear_cpu(cpu_of(busiest), cpus);
4327 if (!cpumask_empty(cpus))
4328 goto redo;
4329 }
4330 }
4331
4332 if (!ld_moved) {
4333 int active_balance = 0;
4334
4335 schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]);
4336 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4337 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4338 return -1;
4339
4340 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4341 return -1;
4342
4343 if (sd->nr_balance_failed++ < 2)
4344 return -1;
4345
4346 /*
4347		 * The only task running on a non-idle cpu can be moved to this
4348		 * cpu in an attempt to completely free up the other CPU
4349		 * package. The same method used to move tasks in load_balance()
4350		 * has been extended for load_balance_newidle() to speed up
4351		 * consolidation at sched_mc=POWERSAVINGS_BALANCE_WAKEUP (2).
4352		 *
4353		 * The package power saving logic comes from
4354		 * find_busiest_group(). If there is no imbalance, then
4355		 * f_b_g() will return NULL. However, when sched_mc={1,2},
4356		 * f_b_g() will select a group from which a running task may be
4357		 * pulled to this cpu in order to make the other package idle.
4358		 * If there is no opportunity to make a package idle and
4359		 * there is no imbalance, then f_b_g() will return NULL and no
4360		 * action will be taken in load_balance_newidle().
4361 *
4362 * Under normal task pull operation due to imbalance, there
4363 * will be more than one task in the source run queue and
4364 * move_tasks() will succeed. ld_moved will be true and this
4365 * active balance code will not be triggered.
4366 */
4367
4368 /* Lock busiest in correct order while this_rq is held */
4369 double_lock_balance(this_rq, busiest);
4370
4371 /*
4372		 * don't kick the migration_thread if the curr
4373		 * task on the busiest cpu can't be moved to this_cpu
4374 */
4375 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
4376 double_unlock_balance(this_rq, busiest);
4377 all_pinned = 1;
4378 return ld_moved;
4379 }
4380
4381 if (!busiest->active_balance) {
4382 busiest->active_balance = 1;
4383 busiest->push_cpu = this_cpu;
4384 active_balance = 1;
4385 }
4386
4387 double_unlock_balance(this_rq, busiest);
4388 /*
4389 * Should not call ttwu while holding a rq->lock
4390 */
4391 raw_spin_unlock(&this_rq->lock);
4392 if (active_balance)
4393 wake_up_process(busiest->migration_thread);
4394 raw_spin_lock(&this_rq->lock);
4395
4396 } else
4397 sd->nr_balance_failed = 0;
4398
4399 update_shares_locked(this_rq, sd);
4400 return ld_moved;
4401
4402out_balanced:
4403 schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]);
4404 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
4405 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
4406 return -1;
4407 sd->nr_balance_failed = 0;
4408
4409 return 0;
4410}
4411
4412/*
4413 * idle_balance is called by schedule() if this_cpu is about to become
4414 * idle. Attempts to pull tasks from other CPUs.
4415 */
4416static void idle_balance(int this_cpu, struct rq *this_rq)
4417{
4418 struct sched_domain *sd;
4419 int pulled_task = 0;
4420 unsigned long next_balance = jiffies + HZ;
4421
4422 this_rq->idle_stamp = this_rq->clock;
4423
4424 if (this_rq->avg_idle < sysctl_sched_migration_cost)
4425 return;
4426
4427 for_each_domain(this_cpu, sd) {
4428 unsigned long interval;
4429
4430 if (!(sd->flags & SD_LOAD_BALANCE))
4431 continue;
4432
4433 if (sd->flags & SD_BALANCE_NEWIDLE)
4434 /* If we've pulled tasks over stop searching: */
4435 pulled_task = load_balance_newidle(this_cpu, this_rq,
4436 sd);
4437
4438 interval = msecs_to_jiffies(sd->balance_interval);
4439 if (time_after(next_balance, sd->last_balance + interval))
4440 next_balance = sd->last_balance + interval;
4441 if (pulled_task) {
4442 this_rq->idle_stamp = 0;
4443 break;
4444 }
4445 }
4446 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
4447 /*
4448 * We are going idle. next_balance may be set based on
4449		 * a busy processor, so reset next_balance.
4450 */
4451 this_rq->next_balance = next_balance;
4452 }
4453}
4454
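A small model of how idle_balance() walks the domain hierarchy: it keeps the earliest last_balance + interval seen so far as the next balance time and stops at the first successful pull. Structures and values are illustrative, not the kernel's:

#include <stdio.h>

struct dom {
	unsigned long last_balance;
	unsigned long interval;
	int will_pull;               /* pretend result of load_balance_newidle() */
};

int main(void)
{
	unsigned long jiffies = 1000, next_balance = jiffies + 250; /* ~HZ ahead */
	struct dom doms[] = {
		{ .last_balance = 990, .interval = 4,  .will_pull = 0 },
		{ .last_balance = 980, .interval = 64, .will_pull = 1 },
		{ .last_balance = 900, .interval = 8,  .will_pull = 0 }, /* never reached */
	};
	int pulled = 0;

	for (int i = 0; i < 3 && !pulled; i++) {
		pulled = doms[i].will_pull;
		if (next_balance > doms[i].last_balance + doms[i].interval)
			next_balance = doms[i].last_balance + doms[i].interval;
	}
	printf("pulled=%d next_balance=%lu\n", pulled, next_balance);
	return 0;
}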
4455/*
4456 * active_load_balance is run by migration threads. It pushes running tasks
4457 * off the busiest CPU onto idle CPUs. It requires at least 1 task to be
4458 * running on each physical CPU where possible, and avoids physical /
4459 * logical imbalances.
4460 *
4461 * Called with busiest_rq locked.
4462 */
4463static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
4464{
4465 int target_cpu = busiest_rq->push_cpu;
4466 struct sched_domain *sd;
4467 struct rq *target_rq;
4468
4469 /* Is there any task to move? */
4470 if (busiest_rq->nr_running <= 1)
4471 return;
4472
4473 target_rq = cpu_rq(target_cpu);
4474
4475 /*
4476	 * This condition is "impossible"; if it occurs
4477 * we need to fix it. Originally reported by
4478 * Bjorn Helgaas on a 128-cpu setup.
4479 */
4480 BUG_ON(busiest_rq == target_rq);
4481
4482 /* move a task from busiest_rq to target_rq */
4483 double_lock_balance(busiest_rq, target_rq);
4484 update_rq_clock(busiest_rq);
4485 update_rq_clock(target_rq);
4486
4487 /* Search for an sd spanning us and the target CPU. */
4488 for_each_domain(target_cpu, sd) {
4489 if ((sd->flags & SD_LOAD_BALANCE) &&
4490 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
4491 break;
4492 }
4493
4494 if (likely(sd)) {
4495 schedstat_inc(sd, alb_count);
4496
4497 if (move_one_task(target_rq, target_cpu, busiest_rq,
4498 sd, CPU_IDLE))
4499 schedstat_inc(sd, alb_pushed);
4500 else
4501 schedstat_inc(sd, alb_failed);
4502 }
4503 double_unlock_balance(busiest_rq, target_rq);
4504}
4505
4506#ifdef CONFIG_NO_HZ
4507static struct {
4508 atomic_t load_balancer;
4509 cpumask_var_t cpu_mask;
4510 cpumask_var_t ilb_grp_nohz_mask;
4511} nohz ____cacheline_aligned = {
4512 .load_balancer = ATOMIC_INIT(-1),
4513};
4514
4515int get_nohz_load_balancer(void)
4516{
4517 return atomic_read(&nohz.load_balancer);
4518}
4519
4520#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4521/**
4522 * lowest_flag_domain - Return lowest sched_domain containing flag.
4523 * @cpu: The cpu whose lowest level of sched domain is to
4524 * be returned.
4525 * @flag: The flag to check for the lowest sched_domain
4526 * for the given cpu.
4527 *
4528 * Returns the lowest sched_domain of a cpu which contains the given flag.
4529 */
4530static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4531{
4532 struct sched_domain *sd;
4533
4534 for_each_domain(cpu, sd)
4535 if (sd && (sd->flags & flag))
4536 break;
4537
4538 return sd;
4539}
4540
4541/**
4542 * for_each_flag_domain - Iterates over sched_domains containing the flag.
4543 * @cpu: The cpu whose domains we're iterating over.
4544 * @sd: variable holding the value of the power_savings_sd
4545 * for cpu.
4546 * @flag: The flag to filter the sched_domains to be iterated.
4547 *
4548 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
4549 * set, starting from the lowest sched_domain to the highest.
4550 */
4551#define for_each_flag_domain(cpu, sd, flag) \
4552 for (sd = lowest_flag_domain(cpu, flag); \
4553 (sd && (sd->flags & flag)); sd = sd->parent)
4554
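A toy model of lowest_flag_domain()/for_each_flag_domain(), assuming only that domains form a parent-linked chain walked bottom-up; the names below are invented for the example:

#include <stdio.h>

#define FLAG_POWERSAVE 0x1

struct toy_domain {
	const char *name;
	unsigned int flags;
	struct toy_domain *parent;
};

static struct toy_domain *lowest_with_flag(struct toy_domain *sd, unsigned int flag)
{
	for (; sd; sd = sd->parent)
		if (sd->flags & flag)
			break;
	return sd;
}

int main(void)
{
	struct toy_domain numa = { "numa", 0, NULL };
	struct toy_domain phys = { "phys", FLAG_POWERSAVE, &numa };
	struct toy_domain smt  = { "smt",  0, &phys };

	/* equivalent of: for_each_flag_domain(cpu, sd, FLAG_POWERSAVE) */
	for (struct toy_domain *sd = lowest_with_flag(&smt, FLAG_POWERSAVE);
	     sd && (sd->flags & FLAG_POWERSAVE); sd = sd->parent)
		printf("visiting %s\n", sd->name);   /* prints only "phys" */

	return 0;
}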
4555/**
4556 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4557 * @ilb_group: group to be checked for semi-idleness
4558 *
4559 * Returns: 1 if the group is semi-idle. 0 otherwise.
4560 *
4561 * We define a sched_group to be semi-idle if it has at least one idle CPU
4562 * and at least one non-idle CPU. This helper function checks if the given
4563 * sched_group is semi-idle or not.
4564 */
4565static inline int is_semi_idle_group(struct sched_group *ilb_group)
4566{
4567 cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
4568 sched_group_cpus(ilb_group));
4569
4570 /*
4571	 * A sched_group is semi-idle when it has at least one busy cpu
4572	 * and at least one idle cpu.
4573 */
4574 if (cpumask_empty(nohz.ilb_grp_nohz_mask))
4575 return 0;
4576
4577 if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
4578 return 0;
4579
4580 return 1;
4581}
4582/**
4583 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4584 * @cpu: The cpu which is nominating a new idle_load_balancer.
4585 *
4586 * Returns: the id of the idle load balancer if it exists,
4587 * otherwise a value >= nr_cpu_ids.
4588 *
4589 * This algorithm picks the idle load balancer such that it belongs to a
4590 * semi-idle powersavings sched_domain. The idea is to try and avoid
4591 * completely idle packages/cores just for the purpose of idle load balancing
4592 * when there are other idle cpus which are better suited for that job.
4593 */
4594static int find_new_ilb(int cpu)
4595{
4596 struct sched_domain *sd;
4597 struct sched_group *ilb_group;
4598
4599 /*
4600 * Have idle load balancer selection from semi-idle packages only
4601 * when power-aware load balancing is enabled
4602 */
4603 if (!(sched_smt_power_savings || sched_mc_power_savings))
4604 goto out_done;
4605
4606 /*
4607 * Optimize for the case when we have no idle CPUs or only one
4608 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4609 */
4610 if (cpumask_weight(nohz.cpu_mask) < 2)
4611 goto out_done;
4612
4613 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4614 ilb_group = sd->groups;
4615
4616 do {
4617 if (is_semi_idle_group(ilb_group))
4618 return cpumask_first(nohz.ilb_grp_nohz_mask);
4619
4620 ilb_group = ilb_group->next;
4621
4622 } while (ilb_group != sd->groups);
4623 }
4624
4625out_done:
4626 return cpumask_first(nohz.cpu_mask);
4627}
4628#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4629static inline int find_new_ilb(int call_cpu)
4630{
4631 return cpumask_first(nohz.cpu_mask);
4632}
4633#endif
4634
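The semi-idle test above reduces to two cpumask comparisons; the bitmask sketch below (plain unsigned longs instead of cpumasks, purely for illustration) shows the three possible outcomes:

#include <stdio.h>

static int semi_idle(unsigned long group_mask, unsigned long nohz_mask)
{
	unsigned long idle_in_group = group_mask & nohz_mask;

	if (!idle_in_group)                 /* nobody idle: fully busy  */
		return 0;
	if (idle_in_group == group_mask)    /* everybody idle: no point */
		return 0;
	return 1;                           /* mix of idle and busy     */
}

int main(void)
{
	printf("%d\n", semi_idle(0x0f, 0x03)); /* cpus 0-3, cpus 0-1 idle -> 1 */
	printf("%d\n", semi_idle(0x0f, 0x0f)); /* all idle                -> 0 */
	printf("%d\n", semi_idle(0x0f, 0x30)); /* none idle               -> 0 */
	return 0;
}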
4635/*
4636 * This routine will try to nominate the ilb (idle load balancing)
4637 * owner among the cpus whose ticks are stopped. The ilb owner will do the idle
4638 * load balancing on behalf of all those cpus. If all the cpus in the system
4639 * go into this tickless mode, then there will be no ilb owner (as there is
4640 * no need for one) and all the cpus will sleep till the next wakeup event
4641 * arrives...
4642 *
4643 * For the ilb owner, the tick is not stopped, and this tick will be used
4644 * for idle load balancing. The ilb owner will still be part of
4645 * nohz.cpu_mask.
4646 *
4647 * While stopping the tick, this cpu will become the ilb owner if there
4648 * is no other owner. It will remain the owner until that cpu becomes busy
4649 * or all cpus in the system stop their ticks, at which point
4650 * there is no need for an ilb owner.
4651 *
4652 * When the ilb owner becomes busy, it nominates another owner during the
4653 * next busy scheduler_tick().
4654 */
4655int select_nohz_load_balancer(int stop_tick)
4656{
4657 int cpu = smp_processor_id();
4658
4659 if (stop_tick) {
4660 cpu_rq(cpu)->in_nohz_recently = 1;
4661
4662 if (!cpu_active(cpu)) {
4663 if (atomic_read(&nohz.load_balancer) != cpu)
4664 return 0;
4665
4666 /*
4667 * If we are going offline and still the leader,
4668 * give up!
4669 */
4670 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4671 BUG();
4672
4673 return 0;
4674 }
4675
4676 cpumask_set_cpu(cpu, nohz.cpu_mask);
4677
4678 /* time for ilb owner also to sleep */
4679 if (cpumask_weight(nohz.cpu_mask) == num_active_cpus()) {
4680 if (atomic_read(&nohz.load_balancer) == cpu)
4681 atomic_set(&nohz.load_balancer, -1);
4682 return 0;
4683 }
4684
4685 if (atomic_read(&nohz.load_balancer) == -1) {
4686 /* make me the ilb owner */
4687 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
4688 return 1;
4689 } else if (atomic_read(&nohz.load_balancer) == cpu) {
4690 int new_ilb;
4691
4692 if (!(sched_smt_power_savings ||
4693 sched_mc_power_savings))
4694 return 1;
4695 /*
4696 * Check to see if there is a more power-efficient
4697 * ilb.
4698 */
4699 new_ilb = find_new_ilb(cpu);
4700 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4701 atomic_set(&nohz.load_balancer, -1);
4702 resched_cpu(new_ilb);
4703 return 0;
4704 }
4705 return 1;
4706 }
4707 } else {
4708 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
4709 return 0;
4710
4711 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4712
4713 if (atomic_read(&nohz.load_balancer) == cpu)
4714 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
4715 BUG();
4716 }
4717 return 0;
4718}
4719#endif
4720
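A userspace model of the ilb-owner election in select_nohz_load_balancer(), using C11 atomics in place of the kernel's atomic_t and cmpxchg; this is a sketch of the claim/release protocol, not the kernel code:

#include <stdatomic.h>
#include <stdio.h>

static atomic_int ilb_owner = -1;   /* -1 means "no owner" */

/* Returns 1 if @cpu ends up as the idle-load-balance owner. */
static int try_become_owner(int cpu)
{
	int expected = -1;

	if (atomic_load(&ilb_owner) == cpu)
		return 1;                               /* already the owner */

	/* claim ownership only if nobody holds it */
	return atomic_compare_exchange_strong(&ilb_owner, &expected, cpu);
}

static void give_up_ownership(int cpu)
{
	int expected = cpu;

	/* only the current owner may clear the slot */
	atomic_compare_exchange_strong(&ilb_owner, &expected, -1);
}

int main(void)
{
	printf("cpu2 owner? %d\n", try_become_owner(2)); /* 1: claimed       */
	printf("cpu3 owner? %d\n", try_become_owner(3)); /* 0: already taken */
	give_up_ownership(2);
	printf("cpu3 owner? %d\n", try_become_owner(3)); /* 1: slot was free */
	return 0;
}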
4721static DEFINE_SPINLOCK(balancing);
4722
4723/*
4724 * It checks each scheduling domain to see if it is due to be balanced,
4725 * and initiates a balancing operation if so.
4726 *
4727 * Balancing parameters are set up in arch_init_sched_domains.
4728 */
4729static void rebalance_domains(int cpu, enum cpu_idle_type idle)
4730{
4731 int balance = 1;
4732 struct rq *rq = cpu_rq(cpu);
4733 unsigned long interval;
4734 struct sched_domain *sd;
4735 /* Earliest time when we have to do rebalance again */
4736 unsigned long next_balance = jiffies + 60*HZ;
4737 int update_next_balance = 0;
4738 int need_serialize;
4739
4740 for_each_domain(cpu, sd) {
4741 if (!(sd->flags & SD_LOAD_BALANCE))
4742 continue;
4743
4744 interval = sd->balance_interval;
4745 if (idle != CPU_IDLE)
4746 interval *= sd->busy_factor;
4747
4748 /* scale ms to jiffies */
4749 interval = msecs_to_jiffies(interval);
4750 if (unlikely(!interval))
4751 interval = 1;
4752 if (interval > HZ*NR_CPUS/10)
4753 interval = HZ*NR_CPUS/10;
4754
4755 need_serialize = sd->flags & SD_SERIALIZE;
4756
4757 if (need_serialize) {
4758 if (!spin_trylock(&balancing))
4759 goto out;
4760 }
4761
4762 if (time_after_eq(jiffies, sd->last_balance + interval)) {
4763 if (load_balance(cpu, rq, sd, idle, &balance)) {
4764 /*
4765 * We've pulled tasks over so either we're no
4766 * longer idle, or one of our SMT siblings is
4767 * not idle.
4768 */
4769 idle = CPU_NOT_IDLE;
4770 }
4771 sd->last_balance = jiffies;
4772 }
4773 if (need_serialize)
4774 spin_unlock(&balancing);
4775out:
4776 if (time_after(next_balance, sd->last_balance + interval)) {
4777 next_balance = sd->last_balance + interval;
4778 update_next_balance = 1;
4779 }
4780
4781 /*
4782 * Stop the load balance at this level. There is another
4783 * CPU in our sched group which is doing load balancing more
4784 * actively.
4785 */
4786 if (!balance)
4787 break;
4788 }
4789
4790 /*
4791 * next_balance will be updated only when there is a need.
4792	 * For example, when the cpu is attached to a null domain, it will not be
4793 * updated.
4794 */
4795 if (likely(update_next_balance))
4796 rq->next_balance = next_balance;
4797}
4798
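A sketch of the interval computation at the top of the domain loop above: scale by busy_factor when not idle, convert to jiffies, clamp. HZ, NR_CPUS and the conversion helper below are illustrative stand-ins, not the kernel's definitions:

#include <stdio.h>

#define HZ      250
#define NR_CPUS 8

static unsigned long msecs_to_jiffies_approx(unsigned long ms)
{
	return (ms * HZ + 999) / 1000;      /* crude stand-in for the kernel helper */
}

static unsigned long balance_interval_jiffies(unsigned long base_ms,
					      unsigned int busy_factor, int cpu_idle)
{
	unsigned long interval = base_ms;

	if (!cpu_idle)
		interval *= busy_factor;    /* balance less often while busy */

	interval = msecs_to_jiffies_approx(interval);
	if (!interval)
		interval = 1;
	if (interval > HZ * NR_CPUS / 10)   /* upper clamp */
		interval = HZ * NR_CPUS / 10;
	return interval;
}

int main(void)
{
	printf("idle: %lu jiffies\n", balance_interval_jiffies(8, 32, 1)); /* 2  */
	printf("busy: %lu jiffies\n", balance_interval_jiffies(8, 32, 0)); /* 64 */
	return 0;
}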
4799/*
4800 * run_rebalance_domains is triggered when needed from the scheduler tick.
4801 * In CONFIG_NO_HZ case, the idle load balance owner will do the
4802 * rebalancing for all the cpus for whom scheduler ticks are stopped.
4803 */
4804static void run_rebalance_domains(struct softirq_action *h)
4805{
4806 int this_cpu = smp_processor_id();
4807 struct rq *this_rq = cpu_rq(this_cpu);
4808 enum cpu_idle_type idle = this_rq->idle_at_tick ?
4809 CPU_IDLE : CPU_NOT_IDLE;
4810
4811 rebalance_domains(this_cpu, idle);
4812
4813#ifdef CONFIG_NO_HZ
4814 /*
4815 * If this cpu is the owner for idle load balancing, then do the
4816 * balancing on behalf of the other idle cpus whose ticks are
4817 * stopped.
4818 */
4819 if (this_rq->idle_at_tick &&
4820 atomic_read(&nohz.load_balancer) == this_cpu) {
4821 struct rq *rq;
4822 int balance_cpu;
4823
4824 for_each_cpu(balance_cpu, nohz.cpu_mask) {
4825 if (balance_cpu == this_cpu)
4826 continue;
4827
4828 /*
4829 * If this cpu gets work to do, stop the load balancing
4830 * work being done for other cpus. Next load
4831 * balancing owner will pick it up.
4832 */
4833 if (need_resched())
4834 break;
4835
4836 rebalance_domains(balance_cpu, CPU_IDLE);
4837
4838 rq = cpu_rq(balance_cpu);
4839 if (time_after(this_rq->next_balance, rq->next_balance))
4840 this_rq->next_balance = rq->next_balance;
4841 }
4842 }
4843#endif
4844}
4845
4846static inline int on_null_domain(int cpu)
4847{
4848 return !rcu_dereference(cpu_rq(cpu)->sd);
4849}
4850
4851/*
4852 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4853 *
4854 * In case of CONFIG_NO_HZ, this is the place where we nominate a new
4855 * idle load balancing owner or decide to stop the periodic load balancing,
4856 * if the whole system is idle.
4857 */
4858static inline void trigger_load_balance(struct rq *rq, int cpu)
4859{
4860#ifdef CONFIG_NO_HZ
4861 /*
4862 * If we were in the nohz mode recently and busy at the current
4863	 * scheduler tick, then check if we need to nominate a new idle
4864 * load balancer.
4865 */
4866 if (rq->in_nohz_recently && !rq->idle_at_tick) {
4867 rq->in_nohz_recently = 0;
4868
4869 if (atomic_read(&nohz.load_balancer) == cpu) {
4870 cpumask_clear_cpu(cpu, nohz.cpu_mask);
4871 atomic_set(&nohz.load_balancer, -1);
4872 }
4873
4874 if (atomic_read(&nohz.load_balancer) == -1) {
4875 int ilb = find_new_ilb(cpu);
4876
4877 if (ilb < nr_cpu_ids)
4878 resched_cpu(ilb);
4879 }
4880 }
4881
4882 /*
4883 * If this cpu is idle and doing idle load balancing for all the
4884 * cpus with ticks stopped, is it time for that to stop?
4885 */
4886 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu &&
4887 cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
4888 resched_cpu(cpu);
4889 return;
4890 }
4891
4892 /*
4893 * If this cpu is idle and the idle load balancing is done by
4894	 * someone else, then there is no need to raise the SCHED_SOFTIRQ.
4895 */
4896 if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu &&
4897 cpumask_test_cpu(cpu, nohz.cpu_mask))
4898 return;
4899#endif
4900 /* Don't need to rebalance while attached to NULL domain */
4901 if (time_after_eq(jiffies, rq->next_balance) &&
4902 likely(!on_null_domain(cpu)))
4903 raise_softirq(SCHED_SOFTIRQ);
4904}
4905
4906#else /* CONFIG_SMP */
4907
4908/*
4909 * on UP we do not need to balance between CPUs:
4910 */
4911static inline void idle_balance(int cpu, struct rq *rq)
4912{
4913}
4914
4915#endif 3164#endif
4916 3165
4917DEFINE_PER_CPU(struct kernel_stat, kstat); 3166DEFINE_PER_CPU(struct kernel_stat, kstat);
@@ -5266,7 +3515,7 @@ void scheduler_tick(void)
5266 curr->sched_class->task_tick(rq, curr, 0); 3515 curr->sched_class->task_tick(rq, curr, 0);
5267 raw_spin_unlock(&rq->lock); 3516 raw_spin_unlock(&rq->lock);
5268 3517
5269 perf_event_task_tick(curr, cpu); 3518 perf_event_task_tick(curr);
5270 3519
5271#ifdef CONFIG_SMP 3520#ifdef CONFIG_SMP
5272 rq->idle_at_tick = idle_cpu(cpu); 3521 rq->idle_at_tick = idle_cpu(cpu);
@@ -5480,7 +3729,7 @@ need_resched_nonpreemptible:
5480 3729
5481 if (likely(prev != next)) { 3730 if (likely(prev != next)) {
5482 sched_info_switch(prev, next); 3731 sched_info_switch(prev, next);
5483 perf_event_task_sched_out(prev, next, cpu); 3732 perf_event_task_sched_out(prev, next);
5484 3733
5485 rq->nr_switches++; 3734 rq->nr_switches++;
5486 rq->curr = next; 3735 rq->curr = next;
@@ -5498,8 +3747,11 @@ need_resched_nonpreemptible:
5498 3747
5499 post_schedule(rq); 3748 post_schedule(rq);
5500 3749
5501 if (unlikely(reacquire_kernel_lock(current) < 0)) 3750 if (unlikely(reacquire_kernel_lock(current) < 0)) {
3751 prev = rq->curr;
3752 switch_count = &prev->nivcsw;
5502 goto need_resched_nonpreemptible; 3753 goto need_resched_nonpreemptible;
3754 }
5503 3755
5504 preempt_enable_no_resched(); 3756 preempt_enable_no_resched();
5505 if (need_resched()) 3757 if (need_resched())
@@ -5911,14 +4163,15 @@ EXPORT_SYMBOL(wait_for_completion_killable);
5911 */ 4163 */
5912bool try_wait_for_completion(struct completion *x) 4164bool try_wait_for_completion(struct completion *x)
5913{ 4165{
4166 unsigned long flags;
5914 int ret = 1; 4167 int ret = 1;
5915 4168
5916 spin_lock_irq(&x->wait.lock); 4169 spin_lock_irqsave(&x->wait.lock, flags);
5917 if (!x->done) 4170 if (!x->done)
5918 ret = 0; 4171 ret = 0;
5919 else 4172 else
5920 x->done--; 4173 x->done--;
5921 spin_unlock_irq(&x->wait.lock); 4174 spin_unlock_irqrestore(&x->wait.lock, flags);
5922 return ret; 4175 return ret;
5923} 4176}
5924EXPORT_SYMBOL(try_wait_for_completion); 4177EXPORT_SYMBOL(try_wait_for_completion);
@@ -5933,12 +4186,13 @@ EXPORT_SYMBOL(try_wait_for_completion);
5933 */ 4186 */
5934bool completion_done(struct completion *x) 4187bool completion_done(struct completion *x)
5935{ 4188{
4189 unsigned long flags;
5936 int ret = 1; 4190 int ret = 1;
5937 4191
5938 spin_lock_irq(&x->wait.lock); 4192 spin_lock_irqsave(&x->wait.lock, flags);
5939 if (!x->done) 4193 if (!x->done)
5940 ret = 0; 4194 ret = 0;
5941 spin_unlock_irq(&x->wait.lock); 4195 spin_unlock_irqrestore(&x->wait.lock, flags);
5942 return ret; 4196 return ret;
5943} 4197}
5944EXPORT_SYMBOL(completion_done); 4198EXPORT_SYMBOL(completion_done);
@@ -6006,7 +4260,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6006 unsigned long flags; 4260 unsigned long flags;
6007 int oldprio, on_rq, running; 4261 int oldprio, on_rq, running;
6008 struct rq *rq; 4262 struct rq *rq;
6009 const struct sched_class *prev_class = p->sched_class; 4263 const struct sched_class *prev_class;
6010 4264
6011 BUG_ON(prio < 0 || prio > MAX_PRIO); 4265 BUG_ON(prio < 0 || prio > MAX_PRIO);
6012 4266
@@ -6014,6 +4268,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6014 update_rq_clock(rq); 4268 update_rq_clock(rq);
6015 4269
6016 oldprio = p->prio; 4270 oldprio = p->prio;
4271 prev_class = p->sched_class;
6017 on_rq = p->se.on_rq; 4272 on_rq = p->se.on_rq;
6018 running = task_current(rq, p); 4273 running = task_current(rq, p);
6019 if (on_rq) 4274 if (on_rq)
@@ -6031,7 +4286,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
6031 if (running) 4286 if (running)
6032 p->sched_class->set_curr_task(rq); 4287 p->sched_class->set_curr_task(rq);
6033 if (on_rq) { 4288 if (on_rq) {
6034 enqueue_task(rq, p, 0); 4289 enqueue_task(rq, p, 0, oldprio < prio);
6035 4290
6036 check_class_changed(rq, p, prev_class, oldprio, running); 4291 check_class_changed(rq, p, prev_class, oldprio, running);
6037 } 4292 }
@@ -6075,7 +4330,7 @@ void set_user_nice(struct task_struct *p, long nice)
6075 delta = p->prio - old_prio; 4330 delta = p->prio - old_prio;
6076 4331
6077 if (on_rq) { 4332 if (on_rq) {
6078 enqueue_task(rq, p, 0); 4333 enqueue_task(rq, p, 0, false);
6079 /* 4334 /*
6080 * If the task increased its priority or is running and 4335 * If the task increased its priority or is running and
6081 * lowered its priority, then reschedule its CPU: 4336 * lowered its priority, then reschedule its CPU:
@@ -6098,7 +4353,7 @@ int can_nice(const struct task_struct *p, const int nice)
6098 /* convert nice value [19,-20] to rlimit style value [1,40] */ 4353 /* convert nice value [19,-20] to rlimit style value [1,40] */
6099 int nice_rlim = 20 - nice; 4354 int nice_rlim = 20 - nice;
6100 4355
6101 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || 4356 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
6102 capable(CAP_SYS_NICE)); 4357 capable(CAP_SYS_NICE));
6103} 4358}
6104 4359
@@ -6233,7 +4488,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
6233{ 4488{
6234 int retval, oldprio, oldpolicy = -1, on_rq, running; 4489 int retval, oldprio, oldpolicy = -1, on_rq, running;
6235 unsigned long flags; 4490 unsigned long flags;
6236 const struct sched_class *prev_class = p->sched_class; 4491 const struct sched_class *prev_class;
6237 struct rq *rq; 4492 struct rq *rq;
6238 int reset_on_fork; 4493 int reset_on_fork;
6239 4494
@@ -6275,7 +4530,7 @@ recheck:
6275 4530
6276 if (!lock_task_sighand(p, &flags)) 4531 if (!lock_task_sighand(p, &flags))
6277 return -ESRCH; 4532 return -ESRCH;
6278 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; 4533 rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
6279 unlock_task_sighand(p, &flags); 4534 unlock_task_sighand(p, &flags);
6280 4535
6281 /* can't set/change the rt policy */ 4536 /* can't set/change the rt policy */
@@ -6347,6 +4602,7 @@ recheck:
6347 p->sched_reset_on_fork = reset_on_fork; 4602 p->sched_reset_on_fork = reset_on_fork;
6348 4603
6349 oldprio = p->prio; 4604 oldprio = p->prio;
4605 prev_class = p->sched_class;
6350 __setscheduler(rq, p, policy, param->sched_priority); 4606 __setscheduler(rq, p, policy, param->sched_priority);
6351 4607
6352 if (running) 4608 if (running)
@@ -6457,7 +4713,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6457 return -EINVAL; 4713 return -EINVAL;
6458 4714
6459 retval = -ESRCH; 4715 retval = -ESRCH;
6460 read_lock(&tasklist_lock); 4716 rcu_read_lock();
6461 p = find_process_by_pid(pid); 4717 p = find_process_by_pid(pid);
6462 if (p) { 4718 if (p) {
6463 retval = security_task_getscheduler(p); 4719 retval = security_task_getscheduler(p);
@@ -6465,7 +4721,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6465 retval = p->policy 4721 retval = p->policy
6466 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); 4722 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
6467 } 4723 }
6468 read_unlock(&tasklist_lock); 4724 rcu_read_unlock();
6469 return retval; 4725 return retval;
6470} 4726}
6471 4727
@@ -6483,7 +4739,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6483 if (!param || pid < 0) 4739 if (!param || pid < 0)
6484 return -EINVAL; 4740 return -EINVAL;
6485 4741
6486 read_lock(&tasklist_lock); 4742 rcu_read_lock();
6487 p = find_process_by_pid(pid); 4743 p = find_process_by_pid(pid);
6488 retval = -ESRCH; 4744 retval = -ESRCH;
6489 if (!p) 4745 if (!p)
@@ -6494,7 +4750,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6494 goto out_unlock; 4750 goto out_unlock;
6495 4751
6496 lp.sched_priority = p->rt_priority; 4752 lp.sched_priority = p->rt_priority;
6497 read_unlock(&tasklist_lock); 4753 rcu_read_unlock();
6498 4754
6499 /* 4755 /*
6500 * This one might sleep, we cannot do it with a spinlock held ... 4756 * This one might sleep, we cannot do it with a spinlock held ...
@@ -6504,7 +4760,7 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6504 return retval; 4760 return retval;
6505 4761
6506out_unlock: 4762out_unlock:
6507 read_unlock(&tasklist_lock); 4763 rcu_read_unlock();
6508 return retval; 4764 return retval;
6509} 4765}
6510 4766
@@ -6515,22 +4771,18 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
6515 int retval; 4771 int retval;
6516 4772
6517 get_online_cpus(); 4773 get_online_cpus();
6518 read_lock(&tasklist_lock); 4774 rcu_read_lock();
6519 4775
6520 p = find_process_by_pid(pid); 4776 p = find_process_by_pid(pid);
6521 if (!p) { 4777 if (!p) {
6522 read_unlock(&tasklist_lock); 4778 rcu_read_unlock();
6523 put_online_cpus(); 4779 put_online_cpus();
6524 return -ESRCH; 4780 return -ESRCH;
6525 } 4781 }
6526 4782
6527 /* 4783 /* Prevent p going away */
6528 * It is not safe to call set_cpus_allowed with the
6529 * tasklist_lock held. We will bump the task_struct's
6530 * usage count and then drop tasklist_lock.
6531 */
6532 get_task_struct(p); 4784 get_task_struct(p);
6533 read_unlock(&tasklist_lock); 4785 rcu_read_unlock();
6534 4786
6535 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { 4787 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
6536 retval = -ENOMEM; 4788 retval = -ENOMEM;
@@ -6616,7 +4868,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
6616 int retval; 4868 int retval;
6617 4869
6618 get_online_cpus(); 4870 get_online_cpus();
6619 read_lock(&tasklist_lock); 4871 rcu_read_lock();
6620 4872
6621 retval = -ESRCH; 4873 retval = -ESRCH;
6622 p = find_process_by_pid(pid); 4874 p = find_process_by_pid(pid);
@@ -6632,7 +4884,7 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
6632 task_rq_unlock(rq, &flags); 4884 task_rq_unlock(rq, &flags);
6633 4885
6634out_unlock: 4886out_unlock:
6635 read_unlock(&tasklist_lock); 4887 rcu_read_unlock();
6636 put_online_cpus(); 4888 put_online_cpus();
6637 4889
6638 return retval; 4890 return retval;
@@ -6876,7 +5128,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6876 return -EINVAL; 5128 return -EINVAL;
6877 5129
6878 retval = -ESRCH; 5130 retval = -ESRCH;
6879 read_lock(&tasklist_lock); 5131 rcu_read_lock();
6880 p = find_process_by_pid(pid); 5132 p = find_process_by_pid(pid);
6881 if (!p) 5133 if (!p)
6882 goto out_unlock; 5134 goto out_unlock;
@@ -6889,13 +5141,13 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6889 time_slice = p->sched_class->get_rr_interval(rq, p); 5141 time_slice = p->sched_class->get_rr_interval(rq, p);
6890 task_rq_unlock(rq, &flags); 5142 task_rq_unlock(rq, &flags);
6891 5143
6892 read_unlock(&tasklist_lock); 5144 rcu_read_unlock();
6893 jiffies_to_timespec(time_slice, &t); 5145 jiffies_to_timespec(time_slice, &t);
6894 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 5146 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
6895 return retval; 5147 return retval;
6896 5148
6897out_unlock: 5149out_unlock:
6898 read_unlock(&tasklist_lock); 5150 rcu_read_unlock();
6899 return retval; 5151 return retval;
6900} 5152}
6901 5153
@@ -6986,6 +5238,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
6986 raw_spin_lock_irqsave(&rq->lock, flags); 5238 raw_spin_lock_irqsave(&rq->lock, flags);
6987 5239
6988 __sched_fork(idle); 5240 __sched_fork(idle);
5241 idle->state = TASK_RUNNING;
6989 idle->se.exec_start = sched_clock(); 5242 idle->se.exec_start = sched_clock();
6990 5243
6991 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 5244 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
@@ -7101,6 +5354,7 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
7101 int ret = 0; 5354 int ret = 0;
7102 5355
7103 rq = task_rq_lock(p, &flags); 5356 rq = task_rq_lock(p, &flags);
5357
7104 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5358 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
7105 ret = -EINVAL; 5359 ret = -EINVAL;
7106 goto out; 5360 goto out;
@@ -7156,7 +5410,7 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
7156static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 5410static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
7157{ 5411{
7158 struct rq *rq_dest, *rq_src; 5412 struct rq *rq_dest, *rq_src;
7159 int ret = 0, on_rq; 5413 int ret = 0;
7160 5414
7161 if (unlikely(!cpu_active(dest_cpu))) 5415 if (unlikely(!cpu_active(dest_cpu)))
7162 return ret; 5416 return ret;
@@ -7172,12 +5426,13 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
7172 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 5426 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
7173 goto fail; 5427 goto fail;
7174 5428
7175 on_rq = p->se.on_rq; 5429 /*
7176 if (on_rq) 5430 * If we're not on a rq, the next wake-up will ensure we're
5431 * placed properly.
5432 */
5433 if (p->se.on_rq) {
7177 deactivate_task(rq_src, p, 0); 5434 deactivate_task(rq_src, p, 0);
7178 5435 set_task_cpu(p, dest_cpu);
7179 set_task_cpu(p, dest_cpu);
7180 if (on_rq) {
7181 activate_task(rq_dest, p, 0); 5436 activate_task(rq_dest, p, 0);
7182 check_preempt_curr(rq_dest, p, 0); 5437 check_preempt_curr(rq_dest, p, 0);
7183 } 5438 }
@@ -7273,37 +5528,10 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
7273static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5528static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
7274{ 5529{
7275 int dest_cpu; 5530 int dest_cpu;
7276 const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(dead_cpu));
7277 5531
7278again: 5532again:
7279 /* Look for allowed, online CPU in same node. */ 5533 dest_cpu = select_fallback_rq(dead_cpu, p);
7280 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
7281 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
7282 goto move;
7283 5534
7284 /* Any allowed, online CPU? */
7285 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask);
7286 if (dest_cpu < nr_cpu_ids)
7287 goto move;
7288
7289 /* No more Mr. Nice Guy. */
7290 if (dest_cpu >= nr_cpu_ids) {
7291 cpuset_cpus_allowed_locked(p, &p->cpus_allowed);
7292 dest_cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
7293
7294 /*
7295 * Don't tell them about moving exiting tasks or
7296 * kernel threads (both mm NULL), since they never
7297 * leave kernel.
7298 */
7299 if (p->mm && printk_ratelimit()) {
7300 printk(KERN_INFO "process %d (%s) no "
7301 "longer affine to cpu%d\n",
7302 task_pid_nr(p), p->comm, dead_cpu);
7303 }
7304 }
7305
7306move:
7307 /* It can have affinity changed while we were choosing. */ 5535 /* It can have affinity changed while we were choosing. */
7308 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu))) 5536 if (unlikely(!__migrate_task_irq(p, dead_cpu, dest_cpu)))
7309 goto again; 5537 goto again;
@@ -9413,7 +7641,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
9413 tg->rt_rq[cpu] = rt_rq; 7641 tg->rt_rq[cpu] = rt_rq;
9414 init_rt_rq(rt_rq, rq); 7642 init_rt_rq(rt_rq, rq);
9415 rt_rq->tg = tg; 7643 rt_rq->tg = tg;
9416 rt_rq->rt_se = rt_se;
9417 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7644 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
9418 if (add) 7645 if (add)
9419 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); 7646 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
@@ -9444,9 +7671,6 @@ void __init sched_init(void)
9444#ifdef CONFIG_RT_GROUP_SCHED 7671#ifdef CONFIG_RT_GROUP_SCHED
9445 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7672 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
9446#endif 7673#endif
9447#ifdef CONFIG_USER_SCHED
9448 alloc_size *= 2;
9449#endif
9450#ifdef CONFIG_CPUMASK_OFFSTACK 7674#ifdef CONFIG_CPUMASK_OFFSTACK
9451 alloc_size += num_possible_cpus() * cpumask_size(); 7675 alloc_size += num_possible_cpus() * cpumask_size();
9452#endif 7676#endif
@@ -9460,13 +7684,6 @@ void __init sched_init(void)
9460 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 7684 init_task_group.cfs_rq = (struct cfs_rq **)ptr;
9461 ptr += nr_cpu_ids * sizeof(void **); 7685 ptr += nr_cpu_ids * sizeof(void **);
9462 7686
9463#ifdef CONFIG_USER_SCHED
9464 root_task_group.se = (struct sched_entity **)ptr;
9465 ptr += nr_cpu_ids * sizeof(void **);
9466
9467 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
9468 ptr += nr_cpu_ids * sizeof(void **);
9469#endif /* CONFIG_USER_SCHED */
9470#endif /* CONFIG_FAIR_GROUP_SCHED */ 7687#endif /* CONFIG_FAIR_GROUP_SCHED */
9471#ifdef CONFIG_RT_GROUP_SCHED 7688#ifdef CONFIG_RT_GROUP_SCHED
9472 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7689 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
@@ -9475,13 +7692,6 @@ void __init sched_init(void)
9475 init_task_group.rt_rq = (struct rt_rq **)ptr; 7692 init_task_group.rt_rq = (struct rt_rq **)ptr;
9476 ptr += nr_cpu_ids * sizeof(void **); 7693 ptr += nr_cpu_ids * sizeof(void **);
9477 7694
9478#ifdef CONFIG_USER_SCHED
9479 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
9480 ptr += nr_cpu_ids * sizeof(void **);
9481
9482 root_task_group.rt_rq = (struct rt_rq **)ptr;
9483 ptr += nr_cpu_ids * sizeof(void **);
9484#endif /* CONFIG_USER_SCHED */
9485#endif /* CONFIG_RT_GROUP_SCHED */ 7695#endif /* CONFIG_RT_GROUP_SCHED */
9486#ifdef CONFIG_CPUMASK_OFFSTACK 7696#ifdef CONFIG_CPUMASK_OFFSTACK
9487 for_each_possible_cpu(i) { 7697 for_each_possible_cpu(i) {
@@ -9501,22 +7711,13 @@ void __init sched_init(void)
9501#ifdef CONFIG_RT_GROUP_SCHED 7711#ifdef CONFIG_RT_GROUP_SCHED
9502 init_rt_bandwidth(&init_task_group.rt_bandwidth, 7712 init_rt_bandwidth(&init_task_group.rt_bandwidth,
9503 global_rt_period(), global_rt_runtime()); 7713 global_rt_period(), global_rt_runtime());
9504#ifdef CONFIG_USER_SCHED
9505 init_rt_bandwidth(&root_task_group.rt_bandwidth,
9506 global_rt_period(), RUNTIME_INF);
9507#endif /* CONFIG_USER_SCHED */
9508#endif /* CONFIG_RT_GROUP_SCHED */ 7714#endif /* CONFIG_RT_GROUP_SCHED */
9509 7715
9510#ifdef CONFIG_GROUP_SCHED 7716#ifdef CONFIG_CGROUP_SCHED
9511 list_add(&init_task_group.list, &task_groups); 7717 list_add(&init_task_group.list, &task_groups);
9512 INIT_LIST_HEAD(&init_task_group.children); 7718 INIT_LIST_HEAD(&init_task_group.children);
9513 7719
9514#ifdef CONFIG_USER_SCHED 7720#endif /* CONFIG_CGROUP_SCHED */
9515 INIT_LIST_HEAD(&root_task_group.children);
9516 init_task_group.parent = &root_task_group;
9517 list_add(&init_task_group.siblings, &root_task_group.children);
9518#endif /* CONFIG_USER_SCHED */
9519#endif /* CONFIG_GROUP_SCHED */
9520 7721
9521#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP 7722#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
9522 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), 7723 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
@@ -9556,25 +7757,6 @@ void __init sched_init(void)
9556 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7757 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
9557 */ 7758 */
9558 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7759 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL);
9559#elif defined CONFIG_USER_SCHED
9560 root_task_group.shares = NICE_0_LOAD;
9561 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, 0, NULL);
9562 /*
9563 * In case of task-groups formed thr' the user id of tasks,
9564 * init_task_group represents tasks belonging to root user.
9565 * Hence it forms a sibling of all subsequent groups formed.
9566 * In this case, init_task_group gets only a fraction of overall
9567 * system cpu resource, based on the weight assigned to root
9568 * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished
9569 * by letting tasks of init_task_group sit in a separate cfs_rq
9570 * (init_tg_cfs_rq) and having one entity represent this group of
9571 * tasks in rq->cfs (i.e init_task_group->se[] != NULL).
9572 */
9573 init_tg_cfs_entry(&init_task_group,
9574 &per_cpu(init_tg_cfs_rq, i),
9575 &per_cpu(init_sched_entity, i), i, 1,
9576 root_task_group.se[i]);
9577
9578#endif 7760#endif
9579#endif /* CONFIG_FAIR_GROUP_SCHED */ 7761#endif /* CONFIG_FAIR_GROUP_SCHED */
9580 7762
@@ -9583,12 +7765,6 @@ void __init sched_init(void)
9583 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7765 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
9584#ifdef CONFIG_CGROUP_SCHED 7766#ifdef CONFIG_CGROUP_SCHED
9585 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); 7767 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
9586#elif defined CONFIG_USER_SCHED
9587 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, 0, NULL);
9588 init_tg_rt_entry(&init_task_group,
9589 &per_cpu(init_rt_rq_var, i),
9590 &per_cpu(init_sched_rt_entity, i), i, 1,
9591 root_task_group.rt_se[i]);
9592#endif 7768#endif
9593#endif 7769#endif
9594 7770
@@ -9668,12 +7844,12 @@ void __init sched_init(void)
9668#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 7844#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
9669static inline int preempt_count_equals(int preempt_offset) 7845static inline int preempt_count_equals(int preempt_offset)
9670{ 7846{
9671 int nested = preempt_count() & ~PREEMPT_ACTIVE; 7847 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
9672 7848
9673 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 7849 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
9674} 7850}
9675 7851
9676void __might_sleep(char *file, int line, int preempt_offset) 7852void __might_sleep(const char *file, int line, int preempt_offset)
9677{ 7853{
9678#ifdef in_atomic 7854#ifdef in_atomic
9679 static unsigned long prev_jiffy; /* ratelimiting */ 7855 static unsigned long prev_jiffy; /* ratelimiting */
@@ -9984,7 +8160,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
9984} 8160}
9985#endif /* CONFIG_RT_GROUP_SCHED */ 8161#endif /* CONFIG_RT_GROUP_SCHED */
9986 8162
9987#ifdef CONFIG_GROUP_SCHED 8163#ifdef CONFIG_CGROUP_SCHED
9988static void free_sched_group(struct task_group *tg) 8164static void free_sched_group(struct task_group *tg)
9989{ 8165{
9990 free_fair_sched_group(tg); 8166 free_fair_sched_group(tg);
@@ -10083,17 +8259,17 @@ void sched_move_task(struct task_struct *tsk)
10083 8259
10084#ifdef CONFIG_FAIR_GROUP_SCHED 8260#ifdef CONFIG_FAIR_GROUP_SCHED
10085 if (tsk->sched_class->moved_group) 8261 if (tsk->sched_class->moved_group)
10086 tsk->sched_class->moved_group(tsk); 8262 tsk->sched_class->moved_group(tsk, on_rq);
10087#endif 8263#endif
10088 8264
10089 if (unlikely(running)) 8265 if (unlikely(running))
10090 tsk->sched_class->set_curr_task(rq); 8266 tsk->sched_class->set_curr_task(rq);
10091 if (on_rq) 8267 if (on_rq)
10092 enqueue_task(rq, tsk, 0); 8268 enqueue_task(rq, tsk, 0, false);
10093 8269
10094 task_rq_unlock(rq, &flags); 8270 task_rq_unlock(rq, &flags);
10095} 8271}
10096#endif /* CONFIG_GROUP_SCHED */ 8272#endif /* CONFIG_CGROUP_SCHED */
10097 8273
10098#ifdef CONFIG_FAIR_GROUP_SCHED 8274#ifdef CONFIG_FAIR_GROUP_SCHED
10099static void __set_se_shares(struct sched_entity *se, unsigned long shares) 8275static void __set_se_shares(struct sched_entity *se, unsigned long shares)
@@ -10235,13 +8411,6 @@ static int tg_schedulable(struct task_group *tg, void *data)
10235 runtime = d->rt_runtime; 8411 runtime = d->rt_runtime;
10236 } 8412 }
10237 8413
10238#ifdef CONFIG_USER_SCHED
10239 if (tg == &root_task_group) {
10240 period = global_rt_period();
10241 runtime = global_rt_runtime();
10242 }
10243#endif
10244
10245 /* 8414 /*
10246 * Cannot have more runtime than the period. 8415 * Cannot have more runtime than the period.
10247 */ 8416 */
@@ -10644,7 +8813,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
10644struct cpuacct { 8813struct cpuacct {
10645 struct cgroup_subsys_state css; 8814 struct cgroup_subsys_state css;
10646 /* cpuusage holds pointer to a u64-type object on every cpu */ 8815 /* cpuusage holds pointer to a u64-type object on every cpu */
10647 u64 *cpuusage; 8816 u64 __percpu *cpuusage;
10648 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; 8817 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
10649 struct cpuacct *parent; 8818 struct cpuacct *parent;
10650}; 8819};
@@ -10861,12 +9030,30 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
10861} 9030}
10862 9031
10863/* 9032/*
9033 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled, one jiffy can be very large
9034 * in cputime_t units. As a result, cpuacct_update_stats calls
9035 * percpu_counter_add with values large enough to always overflow the
9036 * per cpu batch limit, causing bad SMP scalability.
9037 *
9038 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
9039 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
9040 * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
9041 */
9042#ifdef CONFIG_SMP
9043#define CPUACCT_BATCH \
9044 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9045#else
9046#define CPUACCT_BATCH 0
9047#endif
9048
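An arithmetic sketch of the CPUACCT_BATCH scaling described above; percpu_counter_batch and cputime_one_jiffy are modeled as plain longs and the input values are made up:

#include <limits.h>
#include <stdio.h>

static long cpuacct_batch(long percpu_counter_batch, long cputime_one_jiffy)
{
	long scaled = percpu_counter_batch * cputime_one_jiffy;

	return scaled < INT_MAX ? scaled : INT_MAX;   /* min_t(long, ..., INT_MAX) */
}

int main(void)
{
	/* plain jiffies accounting: one jiffy == 1 cputime unit, batch unchanged */
	printf("batch = %ld\n", cpuacct_batch(32, 1));

	/* CONFIG_VIRT_CPU_ACCOUNTING-style units: one jiffy is huge, batch scales */
	printf("batch = %ld\n", cpuacct_batch(32, 4000000));
	return 0;
}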
9049/*
10864 * Charge the system/user time to the task's accounting group. 9050 * Charge the system/user time to the task's accounting group.
10865 */ 9051 */
10866static void cpuacct_update_stats(struct task_struct *tsk, 9052static void cpuacct_update_stats(struct task_struct *tsk,
10867 enum cpuacct_stat_index idx, cputime_t val) 9053 enum cpuacct_stat_index idx, cputime_t val)
10868{ 9054{
10869 struct cpuacct *ca; 9055 struct cpuacct *ca;
9056 int batch = CPUACCT_BATCH;
10870 9057
10871 if (unlikely(!cpuacct_subsys.active)) 9058 if (unlikely(!cpuacct_subsys.active))
10872 return; 9059 return;
@@ -10875,7 +9062,7 @@ static void cpuacct_update_stats(struct task_struct *tsk,
10875 ca = task_ca(tsk); 9062 ca = task_ca(tsk);
10876 9063
10877 do { 9064 do {
10878 percpu_counter_add(&ca->cpustat[idx], val); 9065 __percpu_counter_add(&ca->cpustat[idx], val, batch);
10879 ca = ca->parent; 9066 ca = ca->parent;
10880 } while (ca); 9067 } while (ca);
10881 rcu_read_unlock(); 9068 rcu_read_unlock();