path: root/kernel/sched.c
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  964
1 file changed, 401 insertions(+), 563 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index aa14a56f9d03..04949089e760 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,9 +75,11 @@
75 75
76#include <asm/tlb.h> 76#include <asm/tlb.h>
77#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
78#include <asm/mutex.h>
78 79
79#include "sched_cpupri.h" 80#include "sched_cpupri.h"
80#include "workqueue_sched.h" 81#include "workqueue_sched.h"
82#include "sched_autogroup.h"
81 83
82#define CREATE_TRACE_POINTS 84#define CREATE_TRACE_POINTS
83#include <trace/events/sched.h> 85#include <trace/events/sched.h>
@@ -253,6 +255,8 @@ struct task_group {
253 /* runqueue "owned" by this group on each cpu */ 255 /* runqueue "owned" by this group on each cpu */
254 struct cfs_rq **cfs_rq; 256 struct cfs_rq **cfs_rq;
255 unsigned long shares; 257 unsigned long shares;
258
259 atomic_t load_weight;
256#endif 260#endif
257 261
258#ifdef CONFIG_RT_GROUP_SCHED 262#ifdef CONFIG_RT_GROUP_SCHED
@@ -268,24 +272,19 @@ struct task_group {
268 struct task_group *parent; 272 struct task_group *parent;
269 struct list_head siblings; 273 struct list_head siblings;
270 struct list_head children; 274 struct list_head children;
275
276#ifdef CONFIG_SCHED_AUTOGROUP
277 struct autogroup *autogroup;
278#endif
271}; 279};
272 280
273#define root_task_group init_task_group 281#define root_task_group init_task_group
274 282
275/* task_group_lock serializes add/remove of task groups and also changes to 283/* task_group_lock serializes the addition/removal of task groups */
276 * a task group's cpu shares.
277 */
278static DEFINE_SPINLOCK(task_group_lock); 284static DEFINE_SPINLOCK(task_group_lock);
279 285
280#ifdef CONFIG_FAIR_GROUP_SCHED 286#ifdef CONFIG_FAIR_GROUP_SCHED
281 287
282#ifdef CONFIG_SMP
283static int root_task_group_empty(void)
284{
285 return list_empty(&root_task_group.children);
286}
287#endif
288
289# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 288# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
290 289
291/* 290/*
@@ -342,6 +341,7 @@ struct cfs_rq {
342 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 341 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
343 * list is used during load balance. 342 * list is used during load balance.
344 */ 343 */
344 int on_list;
345 struct list_head leaf_cfs_rq_list; 345 struct list_head leaf_cfs_rq_list;
346 struct task_group *tg; /* group that "owns" this runqueue */ 346 struct task_group *tg; /* group that "owns" this runqueue */
347 347
@@ -360,14 +360,17 @@ struct cfs_rq {
360 unsigned long h_load; 360 unsigned long h_load;
361 361
362 /* 362 /*
363 * this cpu's part of tg->shares 363 * Maintaining per-cpu shares distribution for group scheduling
364 *
365 * load_stamp is the last time we updated the load average
366 * load_last is the last time we updated the load average and saw load
367 * load_unacc_exec_time is currently unaccounted execution time
364 */ 368 */
365 unsigned long shares; 369 u64 load_avg;
370 u64 load_period;
371 u64 load_stamp, load_last, load_unacc_exec_time;
366 372
367 /* 373 unsigned long load_contribution;
368 * load.weight at the time we set shares
369 */
370 unsigned long rq_weight;
371#endif 374#endif
372#endif 375#endif
373}; 376};
@@ -560,18 +563,8 @@ struct rq {
560 563
561static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 564static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
562 565
563static inline
564void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
565{
566 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
567 566
568 /* 567static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
569 * A queue event has occurred, and we're going to schedule. In
570 * this case, we can save a useless back to back clock update.
571 */
572 if (test_tsk_need_resched(p))
573 rq->skip_clock_update = 1;
574}
575 568
576static inline int cpu_of(struct rq *rq) 569static inline int cpu_of(struct rq *rq)
577{ 570{
@@ -615,11 +608,14 @@ static inline int cpu_of(struct rq *rq)
615 */ 608 */
616static inline struct task_group *task_group(struct task_struct *p) 609static inline struct task_group *task_group(struct task_struct *p)
617{ 610{
611 struct task_group *tg;
618 struct cgroup_subsys_state *css; 612 struct cgroup_subsys_state *css;
619 613
620 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 614 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
621 lockdep_is_held(&task_rq(p)->lock)); 615 lockdep_is_held(&task_rq(p)->lock));
622 return container_of(css, struct task_group, css); 616 tg = container_of(css, struct task_group, css);
617
618 return autogroup_task_group(p, tg);
623} 619}
624 620
625/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 621/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
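The reworked task_group() resolves the cgroup-derived group first and then lets autogroup_task_group() substitute a different group. A rough userspace sketch of that indirection, assuming (as the new sched_autogroup.c is expected to arrange) that only tasks whose cgroup group is the root group get redirected into their session's autogroup; the structs and the effective_group() helper below are illustrative, not the kernel's:

#include <stddef.h>
#include <stdio.h>

struct task_group { const char *name; };
struct autogroup  { struct task_group *tg; };
struct task       { struct task_group *cgroup_tg; struct autogroup *ag; };

static struct task_group root_task_group = { "root" };

/* Illustrative stand-in for autogroup_task_group(): only tasks that would
 * otherwise land in the root group are moved into their autogroup. */
static struct task_group *effective_group(struct task *p, struct task_group *tg)
{
    if (p->ag && tg == &root_task_group)
        return p->ag->tg;
    return tg;
}

int main(void)
{
    struct task_group session_tg = { "autogroup-42" };
    struct autogroup ag = { &session_tg };
    struct task p = { &root_task_group, &ag };

    printf("%s\n", effective_group(&p, p.cgroup_tg)->name); /* autogroup-42 */
    return 0;
}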
@@ -646,22 +642,18 @@ static inline struct task_group *task_group(struct task_struct *p)
646 642
647#endif /* CONFIG_CGROUP_SCHED */ 643#endif /* CONFIG_CGROUP_SCHED */
648 644
649static u64 irq_time_cpu(int cpu); 645static void update_rq_clock_task(struct rq *rq, s64 delta);
650static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
651 646
652inline void update_rq_clock(struct rq *rq) 647static void update_rq_clock(struct rq *rq)
653{ 648{
654 if (!rq->skip_clock_update) { 649 s64 delta;
655 int cpu = cpu_of(rq);
656 u64 irq_time;
657 650
658 rq->clock = sched_clock_cpu(cpu); 651 if (rq->skip_clock_update)
659 irq_time = irq_time_cpu(cpu); 652 return;
660 if (rq->clock - irq_time > rq->clock_task)
661 rq->clock_task = rq->clock - irq_time;
662 653
663 sched_irq_time_avg_update(rq, irq_time); 654 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
664 } 655 rq->clock += delta;
656 update_rq_clock_task(rq, delta);
665} 657}
666 658
667/* 659/*
@@ -807,20 +799,6 @@ late_initcall(sched_init_debug);
807const_debug unsigned int sysctl_sched_nr_migrate = 32; 799const_debug unsigned int sysctl_sched_nr_migrate = 32;
808 800
809/* 801/*
810 * ratelimit for updating the group shares.
811 * default: 0.25ms
812 */
813unsigned int sysctl_sched_shares_ratelimit = 250000;
814unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
815
816/*
817 * Inject some fuzzyness into changing the per-cpu group shares
818 * this avoids remote rq-locks at the expense of fairness.
819 * default: 4
820 */
821unsigned int sysctl_sched_shares_thresh = 4;
822
823/*
824 * period over which we average the RT time consumption, measured 802 * period over which we average the RT time consumption, measured
825 * in ms. 803 * in ms.
826 * 804 *
@@ -1369,6 +1347,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1369 lw->inv_weight = 0; 1347 lw->inv_weight = 0;
1370} 1348}
1371 1349
1350static inline void update_load_set(struct load_weight *lw, unsigned long w)
1351{
1352 lw->weight = w;
1353 lw->inv_weight = 0;
1354}
1355
1372/* 1356/*
1373 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1357 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1374 * of tasks with abnormal "nice" values across CPUs the contribution that 1358 * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1557,101 +1541,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1557 1541
1558#ifdef CONFIG_FAIR_GROUP_SCHED 1542#ifdef CONFIG_FAIR_GROUP_SCHED
1559 1543
1560static __read_mostly unsigned long __percpu *update_shares_data;
1561
1562static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1563
1564/*
1565 * Calculate and set the cpu's group shares.
1566 */
1567static void update_group_shares_cpu(struct task_group *tg, int cpu,
1568 unsigned long sd_shares,
1569 unsigned long sd_rq_weight,
1570 unsigned long *usd_rq_weight)
1571{
1572 unsigned long shares, rq_weight;
1573 int boost = 0;
1574
1575 rq_weight = usd_rq_weight[cpu];
1576 if (!rq_weight) {
1577 boost = 1;
1578 rq_weight = NICE_0_LOAD;
1579 }
1580
1581 /*
1582 * \Sum_j shares_j * rq_weight_i
1583 * shares_i = -----------------------------
1584 * \Sum_j rq_weight_j
1585 */
1586 shares = (sd_shares * rq_weight) / sd_rq_weight;
1587 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1588
1589 if (abs(shares - tg->se[cpu]->load.weight) >
1590 sysctl_sched_shares_thresh) {
1591 struct rq *rq = cpu_rq(cpu);
1592 unsigned long flags;
1593
1594 raw_spin_lock_irqsave(&rq->lock, flags);
1595 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1596 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1597 __set_se_shares(tg->se[cpu], shares);
1598 raw_spin_unlock_irqrestore(&rq->lock, flags);
1599 }
1600}
1601
1602/*
1603 * Re-compute the task group their per cpu shares over the given domain.
1604 * This needs to be done in a bottom-up fashion because the rq weight of a
1605 * parent group depends on the shares of its child groups.
1606 */
1607static int tg_shares_up(struct task_group *tg, void *data)
1608{
1609 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1610 unsigned long *usd_rq_weight;
1611 struct sched_domain *sd = data;
1612 unsigned long flags;
1613 int i;
1614
1615 if (!tg->se[0])
1616 return 0;
1617
1618 local_irq_save(flags);
1619 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
1620
1621 for_each_cpu(i, sched_domain_span(sd)) {
1622 weight = tg->cfs_rq[i]->load.weight;
1623 usd_rq_weight[i] = weight;
1624
1625 rq_weight += weight;
1626 /*
1627 * If there are currently no tasks on the cpu pretend there
1628 * is one of average load so that when a new task gets to
1629 * run here it will not get delayed by group starvation.
1630 */
1631 if (!weight)
1632 weight = NICE_0_LOAD;
1633
1634 sum_weight += weight;
1635 shares += tg->cfs_rq[i]->shares;
1636 }
1637
1638 if (!rq_weight)
1639 rq_weight = sum_weight;
1640
1641 if ((!shares && rq_weight) || shares > tg->shares)
1642 shares = tg->shares;
1643
1644 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1645 shares = tg->shares;
1646
1647 for_each_cpu(i, sched_domain_span(sd))
1648 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
1649
1650 local_irq_restore(flags);
1651
1652 return 0;
1653}
1654
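The block removed above distributed tg->shares across CPUs using the formula in its own comment, shares_i = tg->shares * rq_weight_i / sum_j rq_weight_j, rate-limited by the (also removed) sysctl_sched_shares_ratelimit. A minimal userspace evaluation of that documented formula, with invented per-cpu weights, just to show the proportions it used to produce:

#include <stdio.h>

int main(void)
{
    unsigned long tg_shares = 1024;                 /* \Sum_j shares_j */
    unsigned long rq_weight[2] = { 2048, 1024 };    /* per-cpu cfs_rq weights */
    unsigned long sum = rq_weight[0] + rq_weight[1];

    for (int cpu = 0; cpu < 2; cpu++) {
        unsigned long shares = tg_shares * rq_weight[cpu] / sum;
        printf("cpu%d: shares = %lu\n", cpu, shares); /* 682, 341 */
    }
    return 0;
}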
1655/* 1544/*
1656 * Compute the cpu's hierarchical load factor for each task group. 1545 * Compute the cpu's hierarchical load factor for each task group.
1657 * This needs to be done in a top-down fashion because the load of a child 1546 * This needs to be done in a top-down fashion because the load of a child
@@ -1666,7 +1555,7 @@ static int tg_load_down(struct task_group *tg, void *data)
1666 load = cpu_rq(cpu)->load.weight; 1555 load = cpu_rq(cpu)->load.weight;
1667 } else { 1556 } else {
1668 load = tg->parent->cfs_rq[cpu]->h_load; 1557 load = tg->parent->cfs_rq[cpu]->h_load;
1669 load *= tg->cfs_rq[cpu]->shares; 1558 load *= tg->se[cpu]->load.weight;
1670 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 1559 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1671 } 1560 }
1672 1561
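tg_load_down() keeps the same closed form but now reads the group entity's weight instead of the removed cfs_rq->shares: h_load(child) = h_load(parent) * weight(child's se on this cpu) / (parent's queue load + 1). A small numeric sketch of one step of that top-down recursion, with invented weights:

#include <stdio.h>

int main(void)
{
    /* Invented numbers for a depth-2 group: the parent group's hierarchical
     * load on this cpu, the parent's own queue weight, and the weight of the
     * child group's sched_entity enqueued in that parent. */
    unsigned long parent_h_load   = 1024;
    unsigned long parent_cfs_load = 2048;
    unsigned long child_se_weight = 512;

    /* load *= tg->se[cpu]->load.weight; load /= parent->cfs_rq[cpu]->load.weight + 1; */
    unsigned long child_h_load =
        parent_h_load * child_se_weight / (parent_cfs_load + 1);

    printf("child h_load = %lu\n", child_h_load); /* 255, ~1/4 of the parent's 1024 */
    return 0;
}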
@@ -1675,34 +1564,11 @@ static int tg_load_down(struct task_group *tg, void *data)
1675 return 0; 1564 return 0;
1676} 1565}
1677 1566
1678static void update_shares(struct sched_domain *sd)
1679{
1680 s64 elapsed;
1681 u64 now;
1682
1683 if (root_task_group_empty())
1684 return;
1685
1686 now = local_clock();
1687 elapsed = now - sd->last_update;
1688
1689 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1690 sd->last_update = now;
1691 walk_tg_tree(tg_nop, tg_shares_up, sd);
1692 }
1693}
1694
1695static void update_h_load(long cpu) 1567static void update_h_load(long cpu)
1696{ 1568{
1697 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1569 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1698} 1570}
1699 1571
1700#else
1701
1702static inline void update_shares(struct sched_domain *sd)
1703{
1704}
1705
1706#endif 1572#endif
1707 1573
1708#ifdef CONFIG_PREEMPT 1574#ifdef CONFIG_PREEMPT
@@ -1824,15 +1690,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1824 1690
1825#endif 1691#endif
1826 1692
1827#ifdef CONFIG_FAIR_GROUP_SCHED
1828static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1829{
1830#ifdef CONFIG_SMP
1831 cfs_rq->shares = shares;
1832#endif
1833}
1834#endif
1835
1836static void calc_load_account_idle(struct rq *this_rq); 1693static void calc_load_account_idle(struct rq *this_rq);
1837static void update_sysctl(void); 1694static void update_sysctl(void);
1838static int get_update_sysctl_factor(void); 1695static int get_update_sysctl_factor(void);
@@ -1934,10 +1791,9 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1934 * They are read and saved off onto struct rq in update_rq_clock(). 1791 * They are read and saved off onto struct rq in update_rq_clock().
1935 * This may result in other CPU reading this CPU's irq time and can 1792 * This may result in other CPU reading this CPU's irq time and can
1936 * race with irq/account_system_vtime on this CPU. We would either get old 1793 * race with irq/account_system_vtime on this CPU. We would either get old
1937 * or new value (or semi updated value on 32 bit) with a side effect of 1794 * or new value with a side effect of accounting a slice of irq time to wrong
1938 * accounting a slice of irq time to wrong task when irq is in progress 1795 * task when irq is in progress while we read rq->clock. That is a worthy
1939 * while we read rq->clock. That is a worthy compromise in place of having 1796 * compromise in place of having locks on each irq in account_system_time.
1940 * locks on each irq in account_system_time.
1941 */ 1797 */
1942static DEFINE_PER_CPU(u64, cpu_hardirq_time); 1798static DEFINE_PER_CPU(u64, cpu_hardirq_time);
1943static DEFINE_PER_CPU(u64, cpu_softirq_time); 1799static DEFINE_PER_CPU(u64, cpu_softirq_time);
@@ -1955,19 +1811,58 @@ void disable_sched_clock_irqtime(void)
1955 sched_clock_irqtime = 0; 1811 sched_clock_irqtime = 0;
1956} 1812}
1957 1813
1958static u64 irq_time_cpu(int cpu) 1814#ifndef CONFIG_64BIT
1815static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
1816
1817static inline void irq_time_write_begin(void)
1959{ 1818{
1960 if (!sched_clock_irqtime) 1819 __this_cpu_inc(irq_time_seq.sequence);
1961 return 0; 1820 smp_wmb();
1821}
1822
1823static inline void irq_time_write_end(void)
1824{
1825 smp_wmb();
1826 __this_cpu_inc(irq_time_seq.sequence);
1827}
1828
1829static inline u64 irq_time_read(int cpu)
1830{
1831 u64 irq_time;
1832 unsigned seq;
1833
1834 do {
1835 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
1836 irq_time = per_cpu(cpu_softirq_time, cpu) +
1837 per_cpu(cpu_hardirq_time, cpu);
1838 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
1962 1839
1840 return irq_time;
1841}
1842#else /* CONFIG_64BIT */
1843static inline void irq_time_write_begin(void)
1844{
1845}
1846
1847static inline void irq_time_write_end(void)
1848{
1849}
1850
1851static inline u64 irq_time_read(int cpu)
1852{
1963 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); 1853 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1964} 1854}
1855#endif /* CONFIG_64BIT */
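The 32-bit branch exists because the per-cpu u64 accumulators cannot be loaded atomically there, so irq_time_read() retries whenever it races with a writer. The userspace snippet below only demonstrates the torn-read hazard being guarded against, by splitting one 64-bit update into two 32-bit stores; it is not the seqcount implementation itself:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t word[2];                       /* low, high halves of a u64 counter */
    uint64_t before = 0x00000000ffffffffULL;
    uint64_t after  = 0x0000000100000000ULL;

    word[0] = (uint32_t)before;
    word[1] = (uint32_t)(before >> 32);

    /* A 32-bit writer updates the low word first ... */
    word[0] = (uint32_t)after;

    /* ... and a reader that runs here, before the high word is stored,
     * reconstructs a value that was never written. */
    uint64_t torn = ((uint64_t)word[1] << 32) | word[0];
    printf("before=%llu after=%llu torn=%llu\n",
           (unsigned long long)before, (unsigned long long)after,
           (unsigned long long)torn);

    word[1] = (uint32_t)(after >> 32);      /* writer completes */
    return 0;
}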
1965 1856
1857/*
1858 * Called before incrementing preempt_count on {soft,}irq_enter
1859 * and before decrementing preempt_count on {soft,}irq_exit.
1860 */
1966void account_system_vtime(struct task_struct *curr) 1861void account_system_vtime(struct task_struct *curr)
1967{ 1862{
1968 unsigned long flags; 1863 unsigned long flags;
1864 s64 delta;
1969 int cpu; 1865 int cpu;
1970 u64 now, delta;
1971 1866
1972 if (!sched_clock_irqtime) 1867 if (!sched_clock_irqtime)
1973 return; 1868 return;
@@ -1975,9 +1870,10 @@ void account_system_vtime(struct task_struct *curr)
1975 local_irq_save(flags); 1870 local_irq_save(flags);
1976 1871
1977 cpu = smp_processor_id(); 1872 cpu = smp_processor_id();
1978 now = sched_clock_cpu(cpu); 1873 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
1979 delta = now - per_cpu(irq_start_time, cpu); 1874 __this_cpu_add(irq_start_time, delta);
1980 per_cpu(irq_start_time, cpu) = now; 1875
1876 irq_time_write_begin();
1981 /* 1877 /*
1982 * We do not account for softirq time from ksoftirqd here. 1878 * We do not account for softirq time from ksoftirqd here.
1983 * We want to continue accounting softirq time to ksoftirqd thread 1879 * We want to continue accounting softirq time to ksoftirqd thread
@@ -1985,37 +1881,60 @@ void account_system_vtime(struct task_struct *curr)
1985 * that do not consume any time, but still wants to run. 1881 * that do not consume any time, but still wants to run.
1986 */ 1882 */
1987 if (hardirq_count()) 1883 if (hardirq_count())
1988 per_cpu(cpu_hardirq_time, cpu) += delta; 1884 __this_cpu_add(cpu_hardirq_time, delta);
1989 else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) 1885 else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
1990 per_cpu(cpu_softirq_time, cpu) += delta; 1886 __this_cpu_add(cpu_softirq_time, delta);
1991 1887
1888 irq_time_write_end();
1992 local_irq_restore(flags); 1889 local_irq_restore(flags);
1993} 1890}
1994EXPORT_SYMBOL_GPL(account_system_vtime); 1891EXPORT_SYMBOL_GPL(account_system_vtime);
1995 1892
1996static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) 1893static void update_rq_clock_task(struct rq *rq, s64 delta)
1997{ 1894{
1998 if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) { 1895 s64 irq_delta;
1999 u64 delta_irq = curr_irq_time - rq->prev_irq_time; 1896
2000 rq->prev_irq_time = curr_irq_time; 1897 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
2001 sched_rt_avg_update(rq, delta_irq); 1898
2002 } 1899 /*
1900 * Since irq_time is only updated on {soft,}irq_exit, we might run into
1901 * this case when a previous update_rq_clock() happened inside a
1902 * {soft,}irq region.
1903 *
1904 * When this happens, we stop ->clock_task and only update the
1905 * prev_irq_time stamp to account for the part that fit, so that a next
1906 * update will consume the rest. This ensures ->clock_task is
1907 * monotonic.
1908 *
1909 * It does however cause some slight misattribution of {soft,}irq
1910 * time, a more accurate solution would be to update the irq_time using
1911 * the current rq->clock timestamp, except that would require using
1912 * atomic ops.
1913 */
1914 if (irq_delta > delta)
1915 irq_delta = delta;
1916
1917 rq->prev_irq_time += irq_delta;
1918 delta -= irq_delta;
1919 rq->clock_task += delta;
1920
1921 if (irq_delta && sched_feat(NONIRQ_POWER))
1922 sched_rt_avg_update(rq, irq_delta);
2003} 1923}
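update_rq_clock_task() keeps rq->clock_task monotonic by never letting the irq time subtracted in one update exceed that update's wall-clock delta; any excess is carried in prev_irq_time and consumed by the next call. A self-contained sketch of just that clamping arithmetic (field names mirror the code above, values invented):

#include <stdint.h>
#include <stdio.h>

struct fake_rq {
    uint64_t clock, clock_task, prev_irq_time;
};

/* One update step: @delta is the wall-clock advance, @irq_total the
 * accumulated irq time read for this cpu so far. */
static void clock_task_update(struct fake_rq *rq, int64_t delta, uint64_t irq_total)
{
    int64_t irq_delta = (int64_t)(irq_total - rq->prev_irq_time);

    if (irq_delta > delta)
        irq_delta = delta;          /* clamp: keep clock_task monotonic */

    rq->clock         += delta;
    rq->prev_irq_time += irq_delta;
    rq->clock_task    += delta - irq_delta;
}

int main(void)
{
    struct fake_rq rq = { 0, 0, 0 };

    clock_task_update(&rq, 100, 30);    /* 30 of 100 units were irq time */
    clock_task_update(&rq, 50, 110);    /* irq_delta (80) exceeds delta (50) */

    printf("clock=%llu clock_task=%llu carried irq=%llu\n",
           (unsigned long long)rq.clock,
           (unsigned long long)rq.clock_task,
           (unsigned long long)(110 - rq.prev_irq_time));
    return 0;
}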
2004 1924
2005#else 1925#else /* CONFIG_IRQ_TIME_ACCOUNTING */
2006 1926
2007static u64 irq_time_cpu(int cpu) 1927static void update_rq_clock_task(struct rq *rq, s64 delta)
2008{ 1928{
2009 return 0; 1929 rq->clock_task += delta;
2010} 1930}
2011 1931
2012static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { } 1932#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
2013
2014#endif
2015 1933
2016#include "sched_idletask.c" 1934#include "sched_idletask.c"
2017#include "sched_fair.c" 1935#include "sched_fair.c"
2018#include "sched_rt.c" 1936#include "sched_rt.c"
1937#include "sched_autogroup.c"
2019#include "sched_stoptask.c" 1938#include "sched_stoptask.c"
2020#ifdef CONFIG_SCHED_DEBUG 1939#ifdef CONFIG_SCHED_DEBUG
2021# include "sched_debug.c" 1940# include "sched_debug.c"
@@ -2118,6 +2037,31 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2118 p->sched_class->prio_changed(rq, p, oldprio, running); 2037 p->sched_class->prio_changed(rq, p, oldprio, running);
2119} 2038}
2120 2039
2040static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2041{
2042 const struct sched_class *class;
2043
2044 if (p->sched_class == rq->curr->sched_class) {
2045 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
2046 } else {
2047 for_each_class(class) {
2048 if (class == rq->curr->sched_class)
2049 break;
2050 if (class == p->sched_class) {
2051 resched_task(rq->curr);
2052 break;
2053 }
2054 }
2055 }
2056
2057 /*
2058 * A queue event has occurred, and we're going to schedule. In
2059 * this case, we can save a useless back to back clock update.
2060 */
2061 if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
2062 rq->skip_clock_update = 1;
2063}
2064
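The relocated check_preempt_curr() now handles wakeups across scheduling classes by walking the class list in priority order: if the waking task's class is reached before the running task's class, current is rescheduled unconditionally, and only same-class wakeups are delegated to the class callback. A toy version of that rule; the ordering stop > rt > fair > idle matches the usual class chain in this kernel but should be treated as an assumption here:

#include <stdio.h>
#include <string.h>

static const char *const classes[] = { "stop", "rt", "fair", "idle" };
#define NR_CLASSES (sizeof(classes) / sizeof(classes[0]))

static int class_rank(const char *name)
{
    for (unsigned i = 0; i < NR_CLASSES; i++)
        if (strcmp(classes[i], name) == 0)
            return (int)i;
    return (int)NR_CLASSES;
}

/* 1: resched current outright; 0: same class, let the class decide;
 * -1: waking task's class is lower, nothing to do. */
static int cross_class_preempt(const char *waking, const char *running)
{
    int w = class_rank(waking), r = class_rank(running);

    if (w == r)
        return 0;
    return w < r ? 1 : -1;
}

int main(void)
{
    printf("rt wakes over fair:   %d\n", cross_class_preempt("rt", "fair"));   /* 1 */
    printf("fair wakes over fair: %d\n", cross_class_preempt("fair", "fair")); /* 0 */
    printf("fair wakes over rt:   %d\n", cross_class_preempt("fair", "rt"));   /* -1 */
    return 0;
}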
2121#ifdef CONFIG_SMP 2065#ifdef CONFIG_SMP
2122/* 2066/*
2123 * Is this task likely cache-hot: 2067 * Is this task likely cache-hot:
@@ -2183,10 +2127,8 @@ static int migration_cpu_stop(void *data);
2183 * The task's runqueue lock must be held. 2127 * The task's runqueue lock must be held.
2184 * Returns true if you have to wait for migration thread. 2128 * Returns true if you have to wait for migration thread.
2185 */ 2129 */
2186static bool migrate_task(struct task_struct *p, int dest_cpu) 2130static bool migrate_task(struct task_struct *p, struct rq *rq)
2187{ 2131{
2188 struct rq *rq = task_rq(p);
2189
2190 /* 2132 /*
2191 * If the task is not on a runqueue (and not running), then 2133 * If the task is not on a runqueue (and not running), then
2192 * the next wake-up will properly place the task. 2134 * the next wake-up will properly place the task.
@@ -2366,18 +2308,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2366 return dest_cpu; 2308 return dest_cpu;
2367 2309
2368 /* No more Mr. Nice Guy. */ 2310 /* No more Mr. Nice Guy. */
2369 if (unlikely(dest_cpu >= nr_cpu_ids)) { 2311 dest_cpu = cpuset_cpus_allowed_fallback(p);
2370 dest_cpu = cpuset_cpus_allowed_fallback(p); 2312 /*
2371 /* 2313 * Don't tell them about moving exiting tasks or
2372 * Don't tell them about moving exiting tasks or 2314 * kernel threads (both mm NULL), since they never
2373 * kernel threads (both mm NULL), since they never 2315 * leave kernel.
2374 * leave kernel. 2316 */
2375 */ 2317 if (p->mm && printk_ratelimit()) {
2376 if (p->mm && printk_ratelimit()) { 2318 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
2377 printk(KERN_INFO "process %d (%s) no " 2319 task_pid_nr(p), p->comm, cpu);
2378 "longer affine to cpu%d\n",
2379 task_pid_nr(p), p->comm, cpu);
2380 }
2381 } 2320 }
2382 2321
2383 return dest_cpu; 2322 return dest_cpu;
@@ -2713,7 +2652,9 @@ void sched_fork(struct task_struct *p, int clone_flags)
2713 /* Want to start with kernel preemption disabled. */ 2652 /* Want to start with kernel preemption disabled. */
2714 task_thread_info(p)->preempt_count = 1; 2653 task_thread_info(p)->preempt_count = 1;
2715#endif 2654#endif
2655#ifdef CONFIG_SMP
2716 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2656 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2657#endif
2717 2658
2718 put_cpu(); 2659 put_cpu();
2719} 2660}
@@ -3104,6 +3045,15 @@ static long calc_load_fold_active(struct rq *this_rq)
3104 return delta; 3045 return delta;
3105} 3046}
3106 3047
3048static unsigned long
3049calc_load(unsigned long load, unsigned long exp, unsigned long active)
3050{
3051 load *= exp;
3052 load += active * (FIXED_1 - exp);
3053 load += 1UL << (FSHIFT - 1);
3054 return load >> FSHIFT;
3055}
3056
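calc_load() is the usual fixed-point exponential average, now with a rounding term: new = (old*exp + active*(FIXED_1 - exp) + 1<<(FSHIFT-1)) >> FSHIFT. With the standard avenrun constants (FSHIFT = 11, FIXED_1 = 2048, EXP_1 = 1884, assumed from include/linux/sched.h of this era), one step from an idle system toward two runnable tasks looks like this:

#include <stdio.h>

#define FSHIFT  11
#define FIXED_1 (1UL << FSHIFT)
#define EXP_1   1884UL          /* 1/exp(5s/1min) in fixed point */

static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
    load *= exp;
    load += active * (FIXED_1 - exp);
    load += 1UL << (FSHIFT - 1);        /* the rounding term this patch adds */
    return load >> FSHIFT;
}

int main(void)
{
    unsigned long avenrun0 = 0;                 /* loadavg 0.00 */
    unsigned long active   = 2 * FIXED_1;       /* two runnable tasks */

    avenrun0 = calc_load(avenrun0, EXP_1, active);
    printf("1-min load after one 5s tick: %lu/2048 = %lu.%02lu\n",
           avenrun0, avenrun0 >> FSHIFT,
           ((avenrun0 & (FIXED_1 - 1)) * 100) >> FSHIFT);
    return 0;
}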
3107#ifdef CONFIG_NO_HZ 3057#ifdef CONFIG_NO_HZ
3108/* 3058/*
3109 * For NO_HZ we delay the active fold to the next LOAD_FREQ update. 3059 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
@@ -3133,6 +3083,128 @@ static long calc_load_fold_idle(void)
3133 3083
3134 return delta; 3084 return delta;
3135} 3085}
3086
3087/**
3088 * fixed_power_int - compute: x^n, in O(log n) time
3089 *
3090 * @x: base of the power
3091 * @frac_bits: fractional bits of @x
3092 * @n: power to raise @x to.
3093 *
3094 * By exploiting the relation between the definition of the natural power
3095 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
3096 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
3097 * (where: n_i \elem {0, 1}, the binary vector representing n),
3098 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
3099 * of course trivially computable in O(log_2 n), the length of our binary
3100 * vector.
3101 */
3102static unsigned long
3103fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
3104{
3105 unsigned long result = 1UL << frac_bits;
3106
3107 if (n) for (;;) {
3108 if (n & 1) {
3109 result *= x;
3110 result += 1UL << (frac_bits - 1);
3111 result >>= frac_bits;
3112 }
3113 n >>= 1;
3114 if (!n)
3115 break;
3116 x *= x;
3117 x += 1UL << (frac_bits - 1);
3118 x >>= frac_bits;
3119 }
3120
3121 return result;
3122}
3123
3124/*
3125 * a1 = a0 * e + a * (1 - e)
3126 *
3127 * a2 = a1 * e + a * (1 - e)
3128 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
3129 * = a0 * e^2 + a * (1 - e) * (1 + e)
3130 *
3131 * a3 = a2 * e + a * (1 - e)
3132 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
3133 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
3134 *
3135 * ...
3136 *
3137 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
3138 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
3139 * = a0 * e^n + a * (1 - e^n)
3140 *
3141 * [1] application of the geometric series:
3142 *
3143 * n 1 - x^(n+1)
3144 * S_n := \Sum x^i = -------------
3145 * i=0 1 - x
3146 */
3147static unsigned long
3148calc_load_n(unsigned long load, unsigned long exp,
3149 unsigned long active, unsigned int n)
3150{
3151
3152 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
3153}
3154
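calc_load_n() collapses n missed (roughly 5-second) load cycles into one update via the geometric-series identity derived in the comment, with fixed_power_int() supplying e^n. The program below copies the three helpers from the hunks above and compares the closed form against applying calc_load() n times; the two agree up to fixed-point rounding (constants again assumed from include/linux/sched.h):

#include <stdio.h>

#define FSHIFT  11
#define FIXED_1 (1UL << FSHIFT)
#define EXP_1   1884UL

static unsigned long calc_load(unsigned long load, unsigned long exp,
                               unsigned long active)
{
    load *= exp;
    load += active * (FIXED_1 - exp);
    load += 1UL << (FSHIFT - 1);
    return load >> FSHIFT;
}

static unsigned long fixed_power_int(unsigned long x, unsigned int frac_bits,
                                     unsigned int n)
{
    unsigned long result = 1UL << frac_bits;

    if (n) for (;;) {
        if (n & 1) {
            result *= x;
            result += 1UL << (frac_bits - 1);
            result >>= frac_bits;
        }
        n >>= 1;
        if (!n)
            break;
        x *= x;
        x += 1UL << (frac_bits - 1);
        x >>= frac_bits;
    }
    return result;
}

static unsigned long calc_load_n(unsigned long load, unsigned long exp,
                                 unsigned long active, unsigned int n)
{
    return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
}

int main(void)
{
    unsigned long load = 3 * FIXED_1;   /* loadavg was 3.00 ... */
    unsigned long active = 0;           /* ... then the machine went idle */
    unsigned int n = 12;                /* for 12 cycles, about one minute */

    unsigned long iterated = load;
    for (unsigned int i = 0; i < n; i++)
        iterated = calc_load(iterated, EXP_1, active);

    printf("iterated: %lu  closed form: %lu\n",
           iterated, calc_load_n(load, EXP_1, active, n));
    return 0;
}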
3155/*
3156 * NO_HZ can leave us missing all per-cpu ticks calling
3157 * calc_load_account_active(), but since an idle CPU folds its delta into
3158 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
3159 * in the pending idle delta if our idle period crossed a load cycle boundary.
3160 *
3161 * Once we've updated the global active value, we need to apply the exponential
3162 * weights adjusted to the number of cycles missed.
3163 */
3164static void calc_global_nohz(unsigned long ticks)
3165{
3166 long delta, active, n;
3167
3168 if (time_before(jiffies, calc_load_update))
3169 return;
3170
3171 /*
3172 * If we crossed a calc_load_update boundary, make sure to fold
3173 * any pending idle changes, the respective CPUs might have
3174 * missed the tick driven calc_load_account_active() update
3175 * due to NO_HZ.
3176 */
3177 delta = calc_load_fold_idle();
3178 if (delta)
3179 atomic_long_add(delta, &calc_load_tasks);
3180
3181 /*
3182 * If we were idle for multiple load cycles, apply them.
3183 */
3184 if (ticks >= LOAD_FREQ) {
3185 n = ticks / LOAD_FREQ;
3186
3187 active = atomic_long_read(&calc_load_tasks);
3188 active = active > 0 ? active * FIXED_1 : 0;
3189
3190 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
3191 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
3192 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
3193
3194 calc_load_update += n * LOAD_FREQ;
3195 }
3196
3197 /*
3198 * It's possible the remainder of the above division also crosses
3199 * a LOAD_FREQ period, the regular check in calc_global_load()
3200 * which comes after this will take care of that.
3201 *
3202 * Consider us being 11 ticks before a cycle completion, and us
3203 * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
3204 * age us 4 cycles, and the test in calc_global_load() will
3205 * pick up the final one.
3206 */
3207}
3136#else 3208#else
3137static void calc_load_account_idle(struct rq *this_rq) 3209static void calc_load_account_idle(struct rq *this_rq)
3138{ 3210{
@@ -3142,6 +3214,10 @@ static inline long calc_load_fold_idle(void)
3142{ 3214{
3143 return 0; 3215 return 0;
3144} 3216}
3217
3218static void calc_global_nohz(unsigned long ticks)
3219{
3220}
3145#endif 3221#endif
3146 3222
3147/** 3223/**
@@ -3159,24 +3235,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
3159 loads[2] = (avenrun[2] + offset) << shift; 3235 loads[2] = (avenrun[2] + offset) << shift;
3160} 3236}
3161 3237
3162static unsigned long
3163calc_load(unsigned long load, unsigned long exp, unsigned long active)
3164{
3165 load *= exp;
3166 load += active * (FIXED_1 - exp);
3167 return load >> FSHIFT;
3168}
3169
3170/* 3238/*
3171 * calc_load - update the avenrun load estimates 10 ticks after the 3239 * calc_load - update the avenrun load estimates 10 ticks after the
3172 * CPUs have updated calc_load_tasks. 3240 * CPUs have updated calc_load_tasks.
3173 */ 3241 */
3174void calc_global_load(void) 3242void calc_global_load(unsigned long ticks)
3175{ 3243{
3176 unsigned long upd = calc_load_update + 10;
3177 long active; 3244 long active;
3178 3245
3179 if (time_before(jiffies, upd)) 3246 calc_global_nohz(ticks);
3247
3248 if (time_before(jiffies, calc_load_update + 10))
3180 return; 3249 return;
3181 3250
3182 active = atomic_long_read(&calc_load_tasks); 3251 active = atomic_long_read(&calc_load_tasks);
@@ -3349,7 +3418,7 @@ void sched_exec(void)
3349 * select_task_rq() can race against ->cpus_allowed 3418 * select_task_rq() can race against ->cpus_allowed
3350 */ 3419 */
3351 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && 3420 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3352 likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) { 3421 likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
3353 struct migration_arg arg = { p, dest_cpu }; 3422 struct migration_arg arg = { p, dest_cpu };
3354 3423
3355 task_rq_unlock(rq, &flags); 3424 task_rq_unlock(rq, &flags);
@@ -3830,7 +3899,6 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
3830{ 3899{
3831 if (prev->se.on_rq) 3900 if (prev->se.on_rq)
3832 update_rq_clock(rq); 3901 update_rq_clock(rq);
3833 rq->skip_clock_update = 0;
3834 prev->sched_class->put_prev_task(rq, prev); 3902 prev->sched_class->put_prev_task(rq, prev);
3835} 3903}
3836 3904
@@ -3888,7 +3956,6 @@ need_resched_nonpreemptible:
3888 hrtick_clear(rq); 3956 hrtick_clear(rq);
3889 3957
3890 raw_spin_lock_irq(&rq->lock); 3958 raw_spin_lock_irq(&rq->lock);
3891 clear_tsk_need_resched(prev);
3892 3959
3893 switch_count = &prev->nivcsw; 3960 switch_count = &prev->nivcsw;
3894 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3961 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@ -3920,6 +3987,8 @@ need_resched_nonpreemptible:
3920 3987
3921 put_prev_task(rq, prev); 3988 put_prev_task(rq, prev);
3922 next = pick_next_task(rq); 3989 next = pick_next_task(rq);
3990 clear_tsk_need_resched(prev);
3991 rq->skip_clock_update = 0;
3923 3992
3924 if (likely(prev != next)) { 3993 if (likely(prev != next)) {
3925 sched_info_switch(prev, next); 3994 sched_info_switch(prev, next);
@@ -4014,7 +4083,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
4014 if (task_thread_info(rq->curr) != owner || need_resched()) 4083 if (task_thread_info(rq->curr) != owner || need_resched())
4015 return 0; 4084 return 0;
4016 4085
4017 cpu_relax(); 4086 arch_mutex_cpu_relax();
4018 } 4087 }
4019 4088
4020 return 1; 4089 return 1;
@@ -4326,7 +4395,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
4326 * This waits for either a completion of a specific task to be signaled or for a 4395 * This waits for either a completion of a specific task to be signaled or for a
4327 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 4396 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4328 */ 4397 */
4329unsigned long __sched 4398long __sched
4330wait_for_completion_interruptible_timeout(struct completion *x, 4399wait_for_completion_interruptible_timeout(struct completion *x,
4331 unsigned long timeout) 4400 unsigned long timeout)
4332{ 4401{
@@ -4359,7 +4428,7 @@ EXPORT_SYMBOL(wait_for_completion_killable);
4359 * signaled or for a specified timeout to expire. It can be 4428 * signaled or for a specified timeout to expire. It can be
4360 * interrupted by a kill signal. The timeout is in jiffies. 4429 * interrupted by a kill signal. The timeout is in jiffies.
4361 */ 4430 */
4362unsigned long __sched 4431long __sched
4363wait_for_completion_killable_timeout(struct completion *x, 4432wait_for_completion_killable_timeout(struct completion *x,
4364 unsigned long timeout) 4433 unsigned long timeout)
4365{ 4434{
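The two timeout variants change their return type from unsigned long to long because they can return a negative error (for example -ERESTARTSYS when the wait is interrupted) as well as the remaining jiffies; with an unsigned return, a caller's ret < 0 check can never fire. A tiny userspace illustration of that sign trap, taking ERESTARTSYS as 512 (the usual asm-generic/errno.h value, stated here as an assumption):

#include <stdio.h>

#define ERESTARTSYS 512

/* Stand-ins for the old and new prototypes; both "fail" the same way. */
static unsigned long wait_old(void) { return (unsigned long)-ERESTARTSYS; }
static long          wait_new(void) { return -ERESTARTSYS; }

int main(void)
{
    unsigned long old_ret = wait_old();
    long new_ret = wait_new();

    printf("old: error detected? %s\n", old_ret < 0 ? "yes" : "no"); /* no: always false */
    printf("new: error detected? %s\n", new_ret < 0 ? "yes" : "no"); /* yes */
    return 0;
}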
@@ -4701,7 +4770,7 @@ static bool check_same_owner(struct task_struct *p)
4701} 4770}
4702 4771
4703static int __sched_setscheduler(struct task_struct *p, int policy, 4772static int __sched_setscheduler(struct task_struct *p, int policy,
4704 struct sched_param *param, bool user) 4773 const struct sched_param *param, bool user)
4705{ 4774{
4706 int retval, oldprio, oldpolicy = -1, on_rq, running; 4775 int retval, oldprio, oldpolicy = -1, on_rq, running;
4707 unsigned long flags; 4776 unsigned long flags;
@@ -4856,7 +4925,7 @@ recheck:
4856 * NOTE that the task may be already dead. 4925 * NOTE that the task may be already dead.
4857 */ 4926 */
4858int sched_setscheduler(struct task_struct *p, int policy, 4927int sched_setscheduler(struct task_struct *p, int policy,
4859 struct sched_param *param) 4928 const struct sched_param *param)
4860{ 4929{
4861 return __sched_setscheduler(p, policy, param, true); 4930 return __sched_setscheduler(p, policy, param, true);
4862} 4931}
@@ -4874,7 +4943,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
4874 * but our caller might not have that capability. 4943 * but our caller might not have that capability.
4875 */ 4944 */
4876int sched_setscheduler_nocheck(struct task_struct *p, int policy, 4945int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4877 struct sched_param *param) 4946 const struct sched_param *param)
4878{ 4947{
4879 return __sched_setscheduler(p, policy, param, false); 4948 return __sched_setscheduler(p, policy, param, false);
4880} 4949}
@@ -5390,7 +5459,7 @@ void sched_show_task(struct task_struct *p)
5390 unsigned state; 5459 unsigned state;
5391 5460
5392 state = p->state ? __ffs(p->state) + 1 : 0; 5461 state = p->state ? __ffs(p->state) + 1 : 0;
5393 printk(KERN_INFO "%-13.13s %c", p->comm, 5462 printk(KERN_INFO "%-15.15s %c", p->comm,
5394 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 5463 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5395#if BITS_PER_LONG == 32 5464#if BITS_PER_LONG == 32
5396 if (state == TASK_RUNNING) 5465 if (state == TASK_RUNNING)
@@ -5554,7 +5623,6 @@ static void update_sysctl(void)
5554 SET_SYSCTL(sched_min_granularity); 5623 SET_SYSCTL(sched_min_granularity);
5555 SET_SYSCTL(sched_latency); 5624 SET_SYSCTL(sched_latency);
5556 SET_SYSCTL(sched_wakeup_granularity); 5625 SET_SYSCTL(sched_wakeup_granularity);
5557 SET_SYSCTL(sched_shares_ratelimit);
5558#undef SET_SYSCTL 5626#undef SET_SYSCTL
5559} 5627}
5560 5628
@@ -5630,7 +5698,7 @@ again:
5630 goto out; 5698 goto out;
5631 5699
5632 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 5700 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5633 if (migrate_task(p, dest_cpu)) { 5701 if (migrate_task(p, rq)) {
5634 struct migration_arg arg = { p, dest_cpu }; 5702 struct migration_arg arg = { p, dest_cpu };
5635 /* Need help from migration thread: drop lock and wait. */ 5703 /* Need help from migration thread: drop lock and wait. */
5636 task_rq_unlock(rq, &flags); 5704 task_rq_unlock(rq, &flags);
@@ -5712,29 +5780,20 @@ static int migration_cpu_stop(void *data)
5712} 5780}
5713 5781
5714#ifdef CONFIG_HOTPLUG_CPU 5782#ifdef CONFIG_HOTPLUG_CPU
5783
5715/* 5784/*
5716 * Figure out where task on dead CPU should go, use force if necessary. 5785 * Ensures that the idle task is using init_mm right before its cpu goes
5786 * offline.
5717 */ 5787 */
5718void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5788void idle_task_exit(void)
5719{ 5789{
5720 struct rq *rq = cpu_rq(dead_cpu); 5790 struct mm_struct *mm = current->active_mm;
5721 int needs_cpu, uninitialized_var(dest_cpu);
5722 unsigned long flags;
5723 5791
5724 local_irq_save(flags); 5792 BUG_ON(cpu_online(smp_processor_id()));
5725 5793
5726 raw_spin_lock(&rq->lock); 5794 if (mm != &init_mm)
5727 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); 5795 switch_mm(mm, &init_mm, current);
5728 if (needs_cpu) 5796 mmdrop(mm);
5729 dest_cpu = select_fallback_rq(dead_cpu, p);
5730 raw_spin_unlock(&rq->lock);
5731 /*
5732 * It can only fail if we race with set_cpus_allowed(),
5733 * in the racer should migrate the task anyway.
5734 */
5735 if (needs_cpu)
5736 __migrate_task(p, dead_cpu, dest_cpu);
5737 local_irq_restore(flags);
5738} 5797}
5739 5798
5740/* 5799/*
@@ -5747,128 +5806,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5747static void migrate_nr_uninterruptible(struct rq *rq_src) 5806static void migrate_nr_uninterruptible(struct rq *rq_src)
5748{ 5807{
5749 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); 5808 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
5750 unsigned long flags;
5751 5809
5752 local_irq_save(flags);
5753 double_rq_lock(rq_src, rq_dest);
5754 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 5810 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5755 rq_src->nr_uninterruptible = 0; 5811 rq_src->nr_uninterruptible = 0;
5756 double_rq_unlock(rq_src, rq_dest);
5757 local_irq_restore(flags);
5758}
5759
5760/* Run through task list and migrate tasks from the dead cpu. */
5761static void migrate_live_tasks(int src_cpu)
5762{
5763 struct task_struct *p, *t;
5764
5765 read_lock(&tasklist_lock);
5766
5767 do_each_thread(t, p) {
5768 if (p == current)
5769 continue;
5770
5771 if (task_cpu(p) == src_cpu)
5772 move_task_off_dead_cpu(src_cpu, p);
5773 } while_each_thread(t, p);
5774
5775 read_unlock(&tasklist_lock);
5776} 5812}
5777 5813
5778/* 5814/*
5779 * Schedules idle task to be the next runnable task on current CPU. 5815 * remove the tasks which were accounted by rq from calc_load_tasks.
5780 * It does so by boosting its priority to highest possible.
5781 * Used by CPU offline code.
5782 */ 5816 */
5783void sched_idle_next(void) 5817static void calc_global_load_remove(struct rq *rq)
5784{ 5818{
5785 int this_cpu = smp_processor_id(); 5819 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5786 struct rq *rq = cpu_rq(this_cpu); 5820 rq->calc_load_active = 0;
5787 struct task_struct *p = rq->idle;
5788 unsigned long flags;
5789
5790 /* cpu has to be offline */
5791 BUG_ON(cpu_online(this_cpu));
5792
5793 /*
5794 * Strictly not necessary since rest of the CPUs are stopped by now
5795 * and interrupts disabled on the current cpu.
5796 */
5797 raw_spin_lock_irqsave(&rq->lock, flags);
5798
5799 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5800
5801 activate_task(rq, p, 0);
5802
5803 raw_spin_unlock_irqrestore(&rq->lock, flags);
5804} 5821}
5805 5822
5806/* 5823/*
5807 * Ensures that the idle task is using init_mm right before its cpu goes 5824 * Migrate all tasks from the rq, sleeping tasks will be migrated by
5808 * offline. 5825 * try_to_wake_up()->select_task_rq().
5826 *
5827 * Called with rq->lock held even though we're in stop_machine() and
5828 * there's no concurrency possible, we hold the required locks anyway
5829 * because of lock validation efforts.
5809 */ 5830 */
5810void idle_task_exit(void) 5831static void migrate_tasks(unsigned int dead_cpu)
5811{
5812 struct mm_struct *mm = current->active_mm;
5813
5814 BUG_ON(cpu_online(smp_processor_id()));
5815
5816 if (mm != &init_mm)
5817 switch_mm(mm, &init_mm, current);
5818 mmdrop(mm);
5819}
5820
5821/* called under rq->lock with disabled interrupts */
5822static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5823{ 5832{
5824 struct rq *rq = cpu_rq(dead_cpu); 5833 struct rq *rq = cpu_rq(dead_cpu);
5825 5834 struct task_struct *next, *stop = rq->stop;
5826 /* Must be exiting, otherwise would be on tasklist. */ 5835 int dest_cpu;
5827 BUG_ON(!p->exit_state);
5828
5829 /* Cannot have done final schedule yet: would have vanished. */
5830 BUG_ON(p->state == TASK_DEAD);
5831
5832 get_task_struct(p);
5833 5836
5834 /* 5837 /*
5835 * Drop lock around migration; if someone else moves it, 5838 * Fudge the rq selection such that the below task selection loop
5836 * that's OK. No task can be added to this CPU, so iteration is 5839 * doesn't get stuck on the currently eligible stop task.
5837 * fine. 5840 *
5841 * We're currently inside stop_machine() and the rq is either stuck
5842 * in the stop_machine_cpu_stop() loop, or we're executing this code,
5843 * either way we should never end up calling schedule() until we're
5844 * done here.
5838 */ 5845 */
5839 raw_spin_unlock_irq(&rq->lock); 5846 rq->stop = NULL;
5840 move_task_off_dead_cpu(dead_cpu, p);
5841 raw_spin_lock_irq(&rq->lock);
5842
5843 put_task_struct(p);
5844}
5845
5846/* release_task() removes task from tasklist, so we won't find dead tasks. */
5847static void migrate_dead_tasks(unsigned int dead_cpu)
5848{
5849 struct rq *rq = cpu_rq(dead_cpu);
5850 struct task_struct *next;
5851 5847
5852 for ( ; ; ) { 5848 for ( ; ; ) {
5853 if (!rq->nr_running) 5849 /*
5850 * There's this thread running, bail when that's the only
5851 * remaining thread.
5852 */
5853 if (rq->nr_running == 1)
5854 break; 5854 break;
5855
5855 next = pick_next_task(rq); 5856 next = pick_next_task(rq);
5856 if (!next) 5857 BUG_ON(!next);
5857 break;
5858 next->sched_class->put_prev_task(rq, next); 5858 next->sched_class->put_prev_task(rq, next);
5859 migrate_dead(dead_cpu, next);
5860 5859
5860 /* Find suitable destination for @next, with force if needed. */
5861 dest_cpu = select_fallback_rq(dead_cpu, next);
5862 raw_spin_unlock(&rq->lock);
5863
5864 __migrate_task(next, dead_cpu, dest_cpu);
5865
5866 raw_spin_lock(&rq->lock);
5861 } 5867 }
5862}
5863 5868
5864/* 5869 rq->stop = stop;
5865 * remove the tasks which were accounted by rq from calc_load_tasks.
5866 */
5867static void calc_global_load_remove(struct rq *rq)
5868{
5869 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5870 rq->calc_load_active = 0;
5871} 5870}
5871
5872#endif /* CONFIG_HOTPLUG_CPU */ 5872#endif /* CONFIG_HOTPLUG_CPU */
5873 5873
5874#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 5874#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -6078,15 +6078,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6078 unsigned long flags; 6078 unsigned long flags;
6079 struct rq *rq = cpu_rq(cpu); 6079 struct rq *rq = cpu_rq(cpu);
6080 6080
6081 switch (action) { 6081 switch (action & ~CPU_TASKS_FROZEN) {
6082 6082
6083 case CPU_UP_PREPARE: 6083 case CPU_UP_PREPARE:
6084 case CPU_UP_PREPARE_FROZEN:
6085 rq->calc_load_update = calc_load_update; 6084 rq->calc_load_update = calc_load_update;
6086 break; 6085 break;
6087 6086
6088 case CPU_ONLINE: 6087 case CPU_ONLINE:
6089 case CPU_ONLINE_FROZEN:
6090 /* Update our root-domain */ 6088 /* Update our root-domain */
6091 raw_spin_lock_irqsave(&rq->lock, flags); 6089 raw_spin_lock_irqsave(&rq->lock, flags);
6092 if (rq->rd) { 6090 if (rq->rd) {
@@ -6098,30 +6096,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6098 break; 6096 break;
6099 6097
6100#ifdef CONFIG_HOTPLUG_CPU 6098#ifdef CONFIG_HOTPLUG_CPU
6101 case CPU_DEAD:
6102 case CPU_DEAD_FROZEN:
6103 migrate_live_tasks(cpu);
6104 /* Idle task back to normal (off runqueue, low prio) */
6105 raw_spin_lock_irq(&rq->lock);
6106 deactivate_task(rq, rq->idle, 0);
6107 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
6108 rq->idle->sched_class = &idle_sched_class;
6109 migrate_dead_tasks(cpu);
6110 raw_spin_unlock_irq(&rq->lock);
6111 migrate_nr_uninterruptible(rq);
6112 BUG_ON(rq->nr_running != 0);
6113 calc_global_load_remove(rq);
6114 break;
6115
6116 case CPU_DYING: 6099 case CPU_DYING:
6117 case CPU_DYING_FROZEN:
6118 /* Update our root-domain */ 6100 /* Update our root-domain */
6119 raw_spin_lock_irqsave(&rq->lock, flags); 6101 raw_spin_lock_irqsave(&rq->lock, flags);
6120 if (rq->rd) { 6102 if (rq->rd) {
6121 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6103 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6122 set_rq_offline(rq); 6104 set_rq_offline(rq);
6123 } 6105 }
6106 migrate_tasks(cpu);
6107 BUG_ON(rq->nr_running != 1); /* the migration thread */
6124 raw_spin_unlock_irqrestore(&rq->lock, flags); 6108 raw_spin_unlock_irqrestore(&rq->lock, flags);
6109
6110 migrate_nr_uninterruptible(rq);
6111 calc_global_load_remove(rq);
6125 break; 6112 break;
6126#endif 6113#endif
6127 } 6114 }
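Switching on action & ~CPU_TASKS_FROZEN works because the _FROZEN notifier actions are just the plain actions with an extra flag bit or'ed in, so one case label now covers both the normal and the suspend/resume variant. A sketch of the idiom; the constant values mirror the usual include/linux/cpu.h definitions and are an assumption for the illustration:

#include <stdio.h>

#define CPU_ONLINE        0x0002
#define CPU_UP_PREPARE    0x0003
#define CPU_DYING         0x0008
#define CPU_TASKS_FROZEN  0x0010
#define CPU_ONLINE_FROZEN (CPU_ONLINE | CPU_TASKS_FROZEN)
#define CPU_DYING_FROZEN  (CPU_DYING  | CPU_TASKS_FROZEN)

static const char *classify(unsigned long action)
{
    switch (action & ~CPU_TASKS_FROZEN) {
    case CPU_UP_PREPARE:    return "up-prepare";
    case CPU_ONLINE:        return "online";
    case CPU_DYING:         return "dying";
    default:                return "ignored";
    }
}

int main(void)
{
    unsigned long actions[] = { CPU_ONLINE, CPU_ONLINE_FROZEN,
                                CPU_DYING,  CPU_DYING_FROZEN };

    for (unsigned i = 0; i < 4; i++)
        printf("action %#06lx -> %s\n", actions[i], classify(actions[i]));
    return 0;
}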
@@ -6960,6 +6947,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6960 if (cpu != group_first_cpu(sd->groups)) 6947 if (cpu != group_first_cpu(sd->groups))
6961 return; 6948 return;
6962 6949
6950 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
6951
6963 child = sd->child; 6952 child = sd->child;
6964 6953
6965 sd->groups->cpu_power = 0; 6954 sd->groups->cpu_power = 0;
@@ -7850,15 +7839,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7850 7839
7851#ifdef CONFIG_FAIR_GROUP_SCHED 7840#ifdef CONFIG_FAIR_GROUP_SCHED
7852static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 7841static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7853 struct sched_entity *se, int cpu, int add, 7842 struct sched_entity *se, int cpu,
7854 struct sched_entity *parent) 7843 struct sched_entity *parent)
7855{ 7844{
7856 struct rq *rq = cpu_rq(cpu); 7845 struct rq *rq = cpu_rq(cpu);
7857 tg->cfs_rq[cpu] = cfs_rq; 7846 tg->cfs_rq[cpu] = cfs_rq;
7858 init_cfs_rq(cfs_rq, rq); 7847 init_cfs_rq(cfs_rq, rq);
7859 cfs_rq->tg = tg; 7848 cfs_rq->tg = tg;
7860 if (add)
7861 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7862 7849
7863 tg->se[cpu] = se; 7850 tg->se[cpu] = se;
7864 /* se could be NULL for init_task_group */ 7851 /* se could be NULL for init_task_group */
@@ -7871,15 +7858,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7871 se->cfs_rq = parent->my_q; 7858 se->cfs_rq = parent->my_q;
7872 7859
7873 se->my_q = cfs_rq; 7860 se->my_q = cfs_rq;
7874 se->load.weight = tg->shares; 7861 update_load_set(&se->load, 0);
7875 se->load.inv_weight = 0;
7876 se->parent = parent; 7862 se->parent = parent;
7877} 7863}
7878#endif 7864#endif
7879 7865
7880#ifdef CONFIG_RT_GROUP_SCHED 7866#ifdef CONFIG_RT_GROUP_SCHED
7881static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 7867static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7882 struct sched_rt_entity *rt_se, int cpu, int add, 7868 struct sched_rt_entity *rt_se, int cpu,
7883 struct sched_rt_entity *parent) 7869 struct sched_rt_entity *parent)
7884{ 7870{
7885 struct rq *rq = cpu_rq(cpu); 7871 struct rq *rq = cpu_rq(cpu);
@@ -7888,8 +7874,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7888 init_rt_rq(rt_rq, rq); 7874 init_rt_rq(rt_rq, rq);
7889 rt_rq->tg = tg; 7875 rt_rq->tg = tg;
7890 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7876 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
7891 if (add)
7892 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7893 7877
7894 tg->rt_se[cpu] = rt_se; 7878 tg->rt_se[cpu] = rt_se;
7895 if (!rt_se) 7879 if (!rt_se)
@@ -7962,13 +7946,9 @@ void __init sched_init(void)
7962#ifdef CONFIG_CGROUP_SCHED 7946#ifdef CONFIG_CGROUP_SCHED
7963 list_add(&init_task_group.list, &task_groups); 7947 list_add(&init_task_group.list, &task_groups);
7964 INIT_LIST_HEAD(&init_task_group.children); 7948 INIT_LIST_HEAD(&init_task_group.children);
7965 7949 autogroup_init(&init_task);
7966#endif /* CONFIG_CGROUP_SCHED */ 7950#endif /* CONFIG_CGROUP_SCHED */
7967 7951
7968#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
7969 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
7970 __alignof__(unsigned long));
7971#endif
7972 for_each_possible_cpu(i) { 7952 for_each_possible_cpu(i) {
7973 struct rq *rq; 7953 struct rq *rq;
7974 7954
@@ -7982,7 +7962,6 @@ void __init sched_init(void)
7982#ifdef CONFIG_FAIR_GROUP_SCHED 7962#ifdef CONFIG_FAIR_GROUP_SCHED
7983 init_task_group.shares = init_task_group_load; 7963 init_task_group.shares = init_task_group_load;
7984 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 7964 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7985#ifdef CONFIG_CGROUP_SCHED
7986 /* 7965 /*
7987 * How much cpu bandwidth does init_task_group get? 7966 * How much cpu bandwidth does init_task_group get?
7988 * 7967 *
@@ -8002,16 +7981,13 @@ void __init sched_init(void)
8002 * We achieve this by letting init_task_group's tasks sit 7981 * We achieve this by letting init_task_group's tasks sit
8003 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7982 * directly in rq->cfs (i.e init_task_group->se[] = NULL).
8004 */ 7983 */
8005 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7984 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, NULL);
8006#endif
8007#endif /* CONFIG_FAIR_GROUP_SCHED */ 7985#endif /* CONFIG_FAIR_GROUP_SCHED */
8008 7986
8009 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 7987 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
8010#ifdef CONFIG_RT_GROUP_SCHED 7988#ifdef CONFIG_RT_GROUP_SCHED
8011 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7989 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
8012#ifdef CONFIG_CGROUP_SCHED 7990 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, NULL);
8013 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
8014#endif
8015#endif 7991#endif
8016 7992
8017 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7993 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -8091,8 +8067,6 @@ void __init sched_init(void)
8091 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 8067 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
8092#endif /* SMP */ 8068#endif /* SMP */
8093 8069
8094 perf_event_init();
8095
8096 scheduler_running = 1; 8070 scheduler_running = 1;
8097} 8071}
8098 8072
@@ -8286,7 +8260,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8286 if (!se) 8260 if (!se)
8287 goto err_free_rq; 8261 goto err_free_rq;
8288 8262
8289 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 8263 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8290 } 8264 }
8291 8265
8292 return 1; 8266 return 1;
@@ -8297,15 +8271,21 @@ err:
8297 return 0; 8271 return 0;
8298} 8272}
8299 8273
8300static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8301{
8302 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
8303 &cpu_rq(cpu)->leaf_cfs_rq_list);
8304}
8305
8306static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8274static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8307{ 8275{
8308 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8276 struct rq *rq = cpu_rq(cpu);
8277 unsigned long flags;
8278
8279 /*
8280 * Only empty task groups can be destroyed; so we can speculatively
8281 * check on_list without danger of it being re-added.
8282 */
8283 if (!tg->cfs_rq[cpu]->on_list)
8284 return;
8285
8286 raw_spin_lock_irqsave(&rq->lock, flags);
8287 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8288 raw_spin_unlock_irqrestore(&rq->lock, flags);
8309} 8289}
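unregister_fair_sched_group() can test on_list before taking the rq lock because only an empty, dying group reaches this path, so nothing can concurrently put the cfs_rq back on the leaf list; the insertion side becomes lazy and is expected to live in sched_fair.c, as the counterpart of the list_del_leaf_cfs_rq() called above, run the first time the queue gets load. A single-threaded sketch of that flag-guarded registration, with illustrative names and a plain singly linked list instead of the kernel list API:

#include <stdio.h>

struct leaf {
    int on_list;
    struct leaf *next;
};

static struct leaf *leaf_list;

static void leaf_add(struct leaf *l)        /* first enqueue registers */
{
    if (l->on_list)
        return;
    l->next = leaf_list;
    leaf_list = l;
    l->on_list = 1;
}

static void leaf_del(struct leaf *l)        /* group teardown */
{
    struct leaf **pp = &leaf_list;

    if (!l->on_list)                        /* never had load: no lock, no walk */
        return;
    while (*pp != l)
        pp = &(*pp)->next;
    *pp = l->next;                          /* the kernel does this under rq->lock */
    l->on_list = 0;
}

int main(void)
{
    struct leaf busy = { 0, NULL }, idle = { 0, NULL };

    leaf_add(&busy);
    leaf_add(&busy);    /* idempotent */
    leaf_del(&idle);    /* early-return path: was never registered */
    leaf_del(&busy);
    printf("leaf list empty: %s\n", leaf_list == NULL ? "yes" : "no");
    return 0;
}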
8310#else /* !CONFG_FAIR_GROUP_SCHED */ 8290#else /* !CONFG_FAIR_GROUP_SCHED */
8311static inline void free_fair_sched_group(struct task_group *tg) 8291static inline void free_fair_sched_group(struct task_group *tg)
@@ -8318,10 +8298,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8318 return 1; 8298 return 1;
8319} 8299}
8320 8300
8321static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8322{
8323}
8324
8325static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8301static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8326{ 8302{
8327} 8303}
@@ -8376,7 +8352,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8376 if (!rt_se) 8352 if (!rt_se)
8377 goto err_free_rq; 8353 goto err_free_rq;
8378 8354
8379 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 8355 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8380 } 8356 }
8381 8357
8382 return 1; 8358 return 1;
@@ -8386,17 +8362,6 @@ err_free_rq:
8386err: 8362err:
8387 return 0; 8363 return 0;
8388} 8364}
8389
8390static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8391{
8392 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
8393 &cpu_rq(cpu)->leaf_rt_rq_list);
8394}
8395
8396static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8397{
8398 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8399}
8400#else /* !CONFIG_RT_GROUP_SCHED */ 8365#else /* !CONFIG_RT_GROUP_SCHED */
8401static inline void free_rt_sched_group(struct task_group *tg) 8366static inline void free_rt_sched_group(struct task_group *tg)
8402{ 8367{
@@ -8407,14 +8372,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8407{ 8372{
8408 return 1; 8373 return 1;
8409} 8374}
8410
8411static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8412{
8413}
8414
8415static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8416{
8417}
8418#endif /* CONFIG_RT_GROUP_SCHED */ 8375#endif /* CONFIG_RT_GROUP_SCHED */
8419 8376
8420#ifdef CONFIG_CGROUP_SCHED 8377#ifdef CONFIG_CGROUP_SCHED
@@ -8430,7 +8387,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8430{ 8387{
8431 struct task_group *tg; 8388 struct task_group *tg;
8432 unsigned long flags; 8389 unsigned long flags;
8433 int i;
8434 8390
8435 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 8391 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8436 if (!tg) 8392 if (!tg)
@@ -8443,10 +8399,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8443 goto err; 8399 goto err;
8444 8400
8445 spin_lock_irqsave(&task_group_lock, flags); 8401 spin_lock_irqsave(&task_group_lock, flags);
8446 for_each_possible_cpu(i) {
8447 register_fair_sched_group(tg, i);
8448 register_rt_sched_group(tg, i);
8449 }
8450 list_add_rcu(&tg->list, &task_groups); 8402 list_add_rcu(&tg->list, &task_groups);
8451 8403
8452 WARN_ON(!parent); /* root should already exist */ 8404 WARN_ON(!parent); /* root should already exist */
@@ -8476,11 +8428,11 @@ void sched_destroy_group(struct task_group *tg)
8476 unsigned long flags; 8428 unsigned long flags;
8477 int i; 8429 int i;
8478 8430
8479 spin_lock_irqsave(&task_group_lock, flags); 8431 /* end participation in shares distribution */
8480 for_each_possible_cpu(i) { 8432 for_each_possible_cpu(i)
8481 unregister_fair_sched_group(tg, i); 8433 unregister_fair_sched_group(tg, i);
8482 unregister_rt_sched_group(tg, i); 8434
8483 } 8435 spin_lock_irqsave(&task_group_lock, flags);
8484 list_del_rcu(&tg->list); 8436 list_del_rcu(&tg->list);
8485 list_del_rcu(&tg->siblings); 8437 list_del_rcu(&tg->siblings);
8486 spin_unlock_irqrestore(&task_group_lock, flags); 8438 spin_unlock_irqrestore(&task_group_lock, flags);
@@ -8527,33 +8479,6 @@ void sched_move_task(struct task_struct *tsk)
8527#endif /* CONFIG_CGROUP_SCHED */ 8479#endif /* CONFIG_CGROUP_SCHED */
8528 8480
8529#ifdef CONFIG_FAIR_GROUP_SCHED 8481#ifdef CONFIG_FAIR_GROUP_SCHED
8530static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8531{
8532 struct cfs_rq *cfs_rq = se->cfs_rq;
8533 int on_rq;
8534
8535 on_rq = se->on_rq;
8536 if (on_rq)
8537 dequeue_entity(cfs_rq, se, 0);
8538
8539 se->load.weight = shares;
8540 se->load.inv_weight = 0;
8541
8542 if (on_rq)
8543 enqueue_entity(cfs_rq, se, 0);
8544}
8545
8546static void set_se_shares(struct sched_entity *se, unsigned long shares)
8547{
8548 struct cfs_rq *cfs_rq = se->cfs_rq;
8549 struct rq *rq = cfs_rq->rq;
8550 unsigned long flags;
8551
8552 raw_spin_lock_irqsave(&rq->lock, flags);
8553 __set_se_shares(se, shares);
8554 raw_spin_unlock_irqrestore(&rq->lock, flags);
8555}
8556
8557static DEFINE_MUTEX(shares_mutex); 8482static DEFINE_MUTEX(shares_mutex);
8558 8483
8559int sched_group_set_shares(struct task_group *tg, unsigned long shares) 8484int sched_group_set_shares(struct task_group *tg, unsigned long shares)
@@ -8576,37 +8501,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8576 if (tg->shares == shares) 8501 if (tg->shares == shares)
8577 goto done; 8502 goto done;
8578 8503
8579 spin_lock_irqsave(&task_group_lock, flags);
8580 for_each_possible_cpu(i)
8581 unregister_fair_sched_group(tg, i);
8582 list_del_rcu(&tg->siblings);
8583 spin_unlock_irqrestore(&task_group_lock, flags);
8584
8585 /* wait for any ongoing reference to this group to finish */
8586 synchronize_sched();
8587
8588 /*
8589 * Now we are free to modify the group's share on each cpu
8590 * w/o tripping rebalance_share or load_balance_fair.
8591 */
8592 tg->shares = shares; 8504 tg->shares = shares;
8593 for_each_possible_cpu(i) { 8505 for_each_possible_cpu(i) {
8594 /* 8506 struct rq *rq = cpu_rq(i);
8595 * force a rebalance 8507 struct sched_entity *se;
8596 */ 8508
8597 cfs_rq_set_shares(tg->cfs_rq[i], 0); 8509 se = tg->se[i];
8598 set_se_shares(tg->se[i], shares); 8510 /* Propagate contribution to hierarchy */
8511 raw_spin_lock_irqsave(&rq->lock, flags);
8512 for_each_sched_entity(se)
8513 update_cfs_shares(group_cfs_rq(se), 0);
8514 raw_spin_unlock_irqrestore(&rq->lock, flags);
8599 } 8515 }
8600 8516
8601 /*
8602 * Enable load balance activity on this group, by inserting it back on
8603 * each cpu's rq->leaf_cfs_rq_list.
8604 */
8605 spin_lock_irqsave(&task_group_lock, flags);
8606 for_each_possible_cpu(i)
8607 register_fair_sched_group(tg, i);
8608 list_add_rcu(&tg->siblings, &tg->parent->children);
8609 spin_unlock_irqrestore(&task_group_lock, flags);
8610done: 8517done:
8611 mutex_unlock(&shares_mutex); 8518 mutex_unlock(&shares_mutex);
8612 return 0; 8519 return 0;
@@ -9332,72 +9239,3 @@ struct cgroup_subsys cpuacct_subsys = {
9332}; 9239};
9333#endif /* CONFIG_CGROUP_CPUACCT */ 9240#endif /* CONFIG_CGROUP_CPUACCT */
9334 9241
9335#ifndef CONFIG_SMP
9336
9337void synchronize_sched_expedited(void)
9338{
9339 barrier();
9340}
9341EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9342
9343#else /* #ifndef CONFIG_SMP */
9344
9345static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
9346
9347static int synchronize_sched_expedited_cpu_stop(void *data)
9348{
9349 /*
9350 * There must be a full memory barrier on each affected CPU
9351 * between the time that try_stop_cpus() is called and the
9352 * time that it returns.
9353 *
9354 * In the current initial implementation of cpu_stop, the
9355 * above condition is already met when the control reaches
9356 * this point and the following smp_mb() is not strictly
9357 * necessary. Do smp_mb() anyway for documentation and
9358 * robustness against future implementation changes.
9359 */
9360 smp_mb(); /* See above comment block. */
9361 return 0;
9362}
9363
9364/*
9365 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
9366 * approach to force grace period to end quickly. This consumes
9367 * significant time on all CPUs, and is thus not recommended for
9368 * any sort of common-case code.
9369 *
9370 * Note that it is illegal to call this function while holding any
9371 * lock that is acquired by a CPU-hotplug notifier. Failing to
9372 * observe this restriction will result in deadlock.
9373 */
9374void synchronize_sched_expedited(void)
9375{
9376 int snap, trycount = 0;
9377
9378 smp_mb(); /* ensure prior mod happens before capturing snap. */
9379 snap = atomic_read(&synchronize_sched_expedited_count) + 1;
9380 get_online_cpus();
9381 while (try_stop_cpus(cpu_online_mask,
9382 synchronize_sched_expedited_cpu_stop,
9383 NULL) == -EAGAIN) {
9384 put_online_cpus();
9385 if (trycount++ < 10)
9386 udelay(trycount * num_online_cpus());
9387 else {
9388 synchronize_sched();
9389 return;
9390 }
9391 if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
9392 smp_mb(); /* ensure test happens before caller kfree */
9393 return;
9394 }
9395 get_online_cpus();
9396 }
9397 atomic_inc(&synchronize_sched_expedited_count);
9398 smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
9399 put_online_cpus();
9400}
9401EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9402
9403#endif /* #else #ifndef CONFIG_SMP */