Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c | 1033
1 file changed, 443 insertions, 590 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index aa14a56f9d03..18d38e4ec7ba 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,9 +75,11 @@
75 75
76#include <asm/tlb.h> 76#include <asm/tlb.h>
77#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
78#include <asm/mutex.h>
78 79
79#include "sched_cpupri.h" 80#include "sched_cpupri.h"
80#include "workqueue_sched.h" 81#include "workqueue_sched.h"
82#include "sched_autogroup.h"
81 83
82#define CREATE_TRACE_POINTS 84#define CREATE_TRACE_POINTS
83#include <trace/events/sched.h> 85#include <trace/events/sched.h>
@@ -253,6 +255,8 @@ struct task_group {
253 /* runqueue "owned" by this group on each cpu */ 255 /* runqueue "owned" by this group on each cpu */
254 struct cfs_rq **cfs_rq; 256 struct cfs_rq **cfs_rq;
255 unsigned long shares; 257 unsigned long shares;
258
259 atomic_t load_weight;
256#endif 260#endif
257 261
258#ifdef CONFIG_RT_GROUP_SCHED 262#ifdef CONFIG_RT_GROUP_SCHED
@@ -268,25 +272,18 @@ struct task_group {
268 struct task_group *parent; 272 struct task_group *parent;
269 struct list_head siblings; 273 struct list_head siblings;
270 struct list_head children; 274 struct list_head children;
271};
272 275
273#define root_task_group init_task_group 276#ifdef CONFIG_SCHED_AUTOGROUP
277 struct autogroup *autogroup;
278#endif
279};
274 280
275/* task_group_lock serializes add/remove of task groups and also changes to 281/* task_group_lock serializes the addition/removal of task groups */
276 * a task group's cpu shares.
277 */
278static DEFINE_SPINLOCK(task_group_lock); 282static DEFINE_SPINLOCK(task_group_lock);
279 283
280#ifdef CONFIG_FAIR_GROUP_SCHED 284#ifdef CONFIG_FAIR_GROUP_SCHED
281 285
282#ifdef CONFIG_SMP 286# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
283static int root_task_group_empty(void)
284{
285 return list_empty(&root_task_group.children);
286}
287#endif
288
289# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
290 287
291/* 288/*
292 * A weight of 0 or 1 can cause arithmetics problems. 289 * A weight of 0 or 1 can cause arithmetics problems.
@@ -299,13 +296,13 @@ static int root_task_group_empty(void)
299#define MIN_SHARES 2 296#define MIN_SHARES 2
300#define MAX_SHARES (1UL << 18) 297#define MAX_SHARES (1UL << 18)
301 298
302static int init_task_group_load = INIT_TASK_GROUP_LOAD; 299static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
303#endif 300#endif
304 301
305/* Default task group. 302/* Default task group.
306 * Every task in system belong to this group at bootup. 303 * Every task in system belong to this group at bootup.
307 */ 304 */
308struct task_group init_task_group; 305struct task_group root_task_group;
309 306
310#endif /* CONFIG_CGROUP_SCHED */ 307#endif /* CONFIG_CGROUP_SCHED */
311 308
@@ -342,6 +339,7 @@ struct cfs_rq {
342 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 339 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
343 * list is used during load balance. 340 * list is used during load balance.
344 */ 341 */
342 int on_list;
345 struct list_head leaf_cfs_rq_list; 343 struct list_head leaf_cfs_rq_list;
346 struct task_group *tg; /* group that "owns" this runqueue */ 344 struct task_group *tg; /* group that "owns" this runqueue */
347 345
@@ -360,14 +358,17 @@ struct cfs_rq {
360 unsigned long h_load; 358 unsigned long h_load;
361 359
362 /* 360 /*
363 * this cpu's part of tg->shares 361 * Maintaining per-cpu shares distribution for group scheduling
362 *
363 * load_stamp is the last time we updated the load average
364 * load_last is the last time we updated the load average and saw load
365 * load_unacc_exec_time is currently unaccounted execution time
364 */ 366 */
365 unsigned long shares; 367 u64 load_avg;
368 u64 load_period;
369 u64 load_stamp, load_last, load_unacc_exec_time;
366 370
367 /* 371 unsigned long load_contribution;
368 * load.weight at the time we set shares
369 */
370 unsigned long rq_weight;
371#endif 372#endif
372#endif 373#endif
373}; 374};
@@ -552,26 +553,13 @@ struct rq {
552 /* try_to_wake_up() stats */ 553 /* try_to_wake_up() stats */
553 unsigned int ttwu_count; 554 unsigned int ttwu_count;
554 unsigned int ttwu_local; 555 unsigned int ttwu_local;
555
556 /* BKL stats */
557 unsigned int bkl_count;
558#endif 556#endif
559}; 557};
560 558
561static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 559static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
562 560
563static inline
564void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
565{
566 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
567 561
568 /* 562static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
569 * A queue event has occurred, and we're going to schedule. In
570 * this case, we can save a useless back to back clock update.
571 */
572 if (test_tsk_need_resched(p))
573 rq->skip_clock_update = 1;
574}
575 563
576static inline int cpu_of(struct rq *rq) 564static inline int cpu_of(struct rq *rq)
577{ 565{
@@ -615,11 +603,17 @@ static inline int cpu_of(struct rq *rq)
615 */ 603 */
616static inline struct task_group *task_group(struct task_struct *p) 604static inline struct task_group *task_group(struct task_struct *p)
617{ 605{
606 struct task_group *tg;
618 struct cgroup_subsys_state *css; 607 struct cgroup_subsys_state *css;
619 608
609 if (p->flags & PF_EXITING)
610 return &root_task_group;
611
620 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 612 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
621 lockdep_is_held(&task_rq(p)->lock)); 613 lockdep_is_held(&task_rq(p)->lock));
622 return container_of(css, struct task_group, css); 614 tg = container_of(css, struct task_group, css);
615
616 return autogroup_task_group(p, tg);
623} 617}
624 618
625/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 619/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -646,22 +640,18 @@ static inline struct task_group *task_group(struct task_struct *p)
646 640
647#endif /* CONFIG_CGROUP_SCHED */ 641#endif /* CONFIG_CGROUP_SCHED */
648 642
649static u64 irq_time_cpu(int cpu); 643static void update_rq_clock_task(struct rq *rq, s64 delta);
650static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
651 644
652inline void update_rq_clock(struct rq *rq) 645static void update_rq_clock(struct rq *rq)
653{ 646{
654 if (!rq->skip_clock_update) { 647 s64 delta;
655 int cpu = cpu_of(rq);
656 u64 irq_time;
657 648
658 rq->clock = sched_clock_cpu(cpu); 649 if (rq->skip_clock_update)
659 irq_time = irq_time_cpu(cpu); 650 return;
660 if (rq->clock - irq_time > rq->clock_task)
661 rq->clock_task = rq->clock - irq_time;
662 651
663 sched_irq_time_avg_update(rq, irq_time); 652 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
664 } 653 rq->clock += delta;
654 update_rq_clock_task(rq, delta);
665} 655}
666 656
667/* 657/*
@@ -751,7 +741,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
751 buf[cnt] = 0; 741 buf[cnt] = 0;
752 cmp = strstrip(buf); 742 cmp = strstrip(buf);
753 743
754 if (strncmp(buf, "NO_", 3) == 0) { 744 if (strncmp(cmp, "NO_", 3) == 0) {
755 neg = 1; 745 neg = 1;
756 cmp += 3; 746 cmp += 3;
757 } 747 }
@@ -807,20 +797,6 @@ late_initcall(sched_init_debug);
807const_debug unsigned int sysctl_sched_nr_migrate = 32; 797const_debug unsigned int sysctl_sched_nr_migrate = 32;
808 798
809/* 799/*
810 * ratelimit for updating the group shares.
811 * default: 0.25ms
812 */
813unsigned int sysctl_sched_shares_ratelimit = 250000;
814unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
815
816/*
817 * Inject some fuzzyness into changing the per-cpu group shares
818 * this avoids remote rq-locks at the expense of fairness.
819 * default: 4
820 */
821unsigned int sysctl_sched_shares_thresh = 4;
822
823/*
824 * period over which we average the RT time consumption, measured 800 * period over which we average the RT time consumption, measured
825 * in ms. 801 * in ms.
826 * 802 *
@@ -1369,6 +1345,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1369 lw->inv_weight = 0; 1345 lw->inv_weight = 0;
1370} 1346}
1371 1347
1348static inline void update_load_set(struct load_weight *lw, unsigned long w)
1349{
1350 lw->weight = w;
1351 lw->inv_weight = 0;
1352}
1353
1372/* 1354/*
1373 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1355 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1374 * of tasks with abnormal "nice" values across CPUs the contribution that 1356 * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1557,101 +1539,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1557 1539
1558#ifdef CONFIG_FAIR_GROUP_SCHED 1540#ifdef CONFIG_FAIR_GROUP_SCHED
1559 1541
1560static __read_mostly unsigned long __percpu *update_shares_data;
1561
1562static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1563
1564/*
1565 * Calculate and set the cpu's group shares.
1566 */
1567static void update_group_shares_cpu(struct task_group *tg, int cpu,
1568 unsigned long sd_shares,
1569 unsigned long sd_rq_weight,
1570 unsigned long *usd_rq_weight)
1571{
1572 unsigned long shares, rq_weight;
1573 int boost = 0;
1574
1575 rq_weight = usd_rq_weight[cpu];
1576 if (!rq_weight) {
1577 boost = 1;
1578 rq_weight = NICE_0_LOAD;
1579 }
1580
1581 /*
1582 * \Sum_j shares_j * rq_weight_i
1583 * shares_i = -----------------------------
1584 * \Sum_j rq_weight_j
1585 */
1586 shares = (sd_shares * rq_weight) / sd_rq_weight;
1587 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1588
1589 if (abs(shares - tg->se[cpu]->load.weight) >
1590 sysctl_sched_shares_thresh) {
1591 struct rq *rq = cpu_rq(cpu);
1592 unsigned long flags;
1593
1594 raw_spin_lock_irqsave(&rq->lock, flags);
1595 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1596 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1597 __set_se_shares(tg->se[cpu], shares);
1598 raw_spin_unlock_irqrestore(&rq->lock, flags);
1599 }
1600}
1601
1602/*
1603 * Re-compute the task group their per cpu shares over the given domain.
1604 * This needs to be done in a bottom-up fashion because the rq weight of a
1605 * parent group depends on the shares of its child groups.
1606 */
1607static int tg_shares_up(struct task_group *tg, void *data)
1608{
1609 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1610 unsigned long *usd_rq_weight;
1611 struct sched_domain *sd = data;
1612 unsigned long flags;
1613 int i;
1614
1615 if (!tg->se[0])
1616 return 0;
1617
1618 local_irq_save(flags);
1619 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
1620
1621 for_each_cpu(i, sched_domain_span(sd)) {
1622 weight = tg->cfs_rq[i]->load.weight;
1623 usd_rq_weight[i] = weight;
1624
1625 rq_weight += weight;
1626 /*
1627 * If there are currently no tasks on the cpu pretend there
1628 * is one of average load so that when a new task gets to
1629 * run here it will not get delayed by group starvation.
1630 */
1631 if (!weight)
1632 weight = NICE_0_LOAD;
1633
1634 sum_weight += weight;
1635 shares += tg->cfs_rq[i]->shares;
1636 }
1637
1638 if (!rq_weight)
1639 rq_weight = sum_weight;
1640
1641 if ((!shares && rq_weight) || shares > tg->shares)
1642 shares = tg->shares;
1643
1644 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1645 shares = tg->shares;
1646
1647 for_each_cpu(i, sched_domain_span(sd))
1648 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
1649
1650 local_irq_restore(flags);
1651
1652 return 0;
1653}
1654
1655/* 1542/*
1656 * Compute the cpu's hierarchical load factor for each task group. 1543 * Compute the cpu's hierarchical load factor for each task group.
1657 * This needs to be done in a top-down fashion because the load of a child 1544 * This needs to be done in a top-down fashion because the load of a child
@@ -1666,7 +1553,7 @@ static int tg_load_down(struct task_group *tg, void *data)
1666 load = cpu_rq(cpu)->load.weight; 1553 load = cpu_rq(cpu)->load.weight;
1667 } else { 1554 } else {
1668 load = tg->parent->cfs_rq[cpu]->h_load; 1555 load = tg->parent->cfs_rq[cpu]->h_load;
1669 load *= tg->cfs_rq[cpu]->shares; 1556 load *= tg->se[cpu]->load.weight;
1670 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 1557 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1671 } 1558 }
1672 1559
@@ -1675,34 +1562,11 @@ static int tg_load_down(struct task_group *tg, void *data)
1675 return 0; 1562 return 0;
1676} 1563}
1677 1564
1678static void update_shares(struct sched_domain *sd)
1679{
1680 s64 elapsed;
1681 u64 now;
1682
1683 if (root_task_group_empty())
1684 return;
1685
1686 now = local_clock();
1687 elapsed = now - sd->last_update;
1688
1689 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1690 sd->last_update = now;
1691 walk_tg_tree(tg_nop, tg_shares_up, sd);
1692 }
1693}
1694
1695static void update_h_load(long cpu) 1565static void update_h_load(long cpu)
1696{ 1566{
1697 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1567 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1698} 1568}
1699 1569
1700#else
1701
1702static inline void update_shares(struct sched_domain *sd)
1703{
1704}
1705
1706#endif 1570#endif
1707 1571
1708#ifdef CONFIG_PREEMPT 1572#ifdef CONFIG_PREEMPT
@@ -1824,15 +1688,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1824 1688
1825#endif 1689#endif
1826 1690
1827#ifdef CONFIG_FAIR_GROUP_SCHED
1828static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1829{
1830#ifdef CONFIG_SMP
1831 cfs_rq->shares = shares;
1832#endif
1833}
1834#endif
1835
1836static void calc_load_account_idle(struct rq *this_rq); 1691static void calc_load_account_idle(struct rq *this_rq);
1837static void update_sysctl(void); 1692static void update_sysctl(void);
1838static int get_update_sysctl_factor(void); 1693static int get_update_sysctl_factor(void);
@@ -1934,10 +1789,9 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1934 * They are read and saved off onto struct rq in update_rq_clock(). 1789 * They are read and saved off onto struct rq in update_rq_clock().
1935 * This may result in other CPU reading this CPU's irq time and can 1790 * This may result in other CPU reading this CPU's irq time and can
1936 * race with irq/account_system_vtime on this CPU. We would either get old 1791 * race with irq/account_system_vtime on this CPU. We would either get old
1937 * or new value (or semi updated value on 32 bit) with a side effect of 1792 * or new value with a side effect of accounting a slice of irq time to wrong
1938 * accounting a slice of irq time to wrong task when irq is in progress 1793 * task when irq is in progress while we read rq->clock. That is a worthy
1939 * while we read rq->clock. That is a worthy compromise in place of having 1794 * compromise in place of having locks on each irq in account_system_time.
1940 * locks on each irq in account_system_time.
1941 */ 1795 */
1942static DEFINE_PER_CPU(u64, cpu_hardirq_time); 1796static DEFINE_PER_CPU(u64, cpu_hardirq_time);
1943static DEFINE_PER_CPU(u64, cpu_softirq_time); 1797static DEFINE_PER_CPU(u64, cpu_softirq_time);
@@ -1955,19 +1809,58 @@ void disable_sched_clock_irqtime(void)
1955 sched_clock_irqtime = 0; 1809 sched_clock_irqtime = 0;
1956} 1810}
1957 1811
1958static u64 irq_time_cpu(int cpu) 1812#ifndef CONFIG_64BIT
1813static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
1814
1815static inline void irq_time_write_begin(void)
1959{ 1816{
1960 if (!sched_clock_irqtime) 1817 __this_cpu_inc(irq_time_seq.sequence);
1961 return 0; 1818 smp_wmb();
1819}
1820
1821static inline void irq_time_write_end(void)
1822{
1823 smp_wmb();
1824 __this_cpu_inc(irq_time_seq.sequence);
1825}
1826
1827static inline u64 irq_time_read(int cpu)
1828{
1829 u64 irq_time;
1830 unsigned seq;
1831
1832 do {
1833 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
1834 irq_time = per_cpu(cpu_softirq_time, cpu) +
1835 per_cpu(cpu_hardirq_time, cpu);
1836 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
1962 1837
1838 return irq_time;
1839}
1840#else /* CONFIG_64BIT */
1841static inline void irq_time_write_begin(void)
1842{
1843}
1844
1845static inline void irq_time_write_end(void)
1846{
1847}
1848
1849static inline u64 irq_time_read(int cpu)
1850{
1963 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); 1851 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1964} 1852}
1853#endif /* CONFIG_64BIT */
1965 1854
1855/*
1856 * Called before incrementing preempt_count on {soft,}irq_enter
1857 * and before decrementing preempt_count on {soft,}irq_exit.
1858 */
1966void account_system_vtime(struct task_struct *curr) 1859void account_system_vtime(struct task_struct *curr)
1967{ 1860{
1968 unsigned long flags; 1861 unsigned long flags;
1862 s64 delta;
1969 int cpu; 1863 int cpu;
1970 u64 now, delta;
1971 1864
1972 if (!sched_clock_irqtime) 1865 if (!sched_clock_irqtime)
1973 return; 1866 return;
@@ -1975,9 +1868,10 @@ void account_system_vtime(struct task_struct *curr)
1975 local_irq_save(flags); 1868 local_irq_save(flags);
1976 1869
1977 cpu = smp_processor_id(); 1870 cpu = smp_processor_id();
1978 now = sched_clock_cpu(cpu); 1871 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
1979 delta = now - per_cpu(irq_start_time, cpu); 1872 __this_cpu_add(irq_start_time, delta);
1980 per_cpu(irq_start_time, cpu) = now; 1873
1874 irq_time_write_begin();
1981 /* 1875 /*
1982 * We do not account for softirq time from ksoftirqd here. 1876 * We do not account for softirq time from ksoftirqd here.
1983 * We want to continue accounting softirq time to ksoftirqd thread 1877 * We want to continue accounting softirq time to ksoftirqd thread
@@ -1985,37 +1879,60 @@ void account_system_vtime(struct task_struct *curr)
1985 * that do not consume any time, but still wants to run. 1879 * that do not consume any time, but still wants to run.
1986 */ 1880 */
1987 if (hardirq_count()) 1881 if (hardirq_count())
1988 per_cpu(cpu_hardirq_time, cpu) += delta; 1882 __this_cpu_add(cpu_hardirq_time, delta);
1989 else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) 1883 else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
1990 per_cpu(cpu_softirq_time, cpu) += delta; 1884 __this_cpu_add(cpu_softirq_time, delta);
1991 1885
1886 irq_time_write_end();
1992 local_irq_restore(flags); 1887 local_irq_restore(flags);
1993} 1888}
1994EXPORT_SYMBOL_GPL(account_system_vtime); 1889EXPORT_SYMBOL_GPL(account_system_vtime);
1995 1890
1996static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) 1891static void update_rq_clock_task(struct rq *rq, s64 delta)
1997{ 1892{
1998 if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) { 1893 s64 irq_delta;
1999 u64 delta_irq = curr_irq_time - rq->prev_irq_time; 1894
2000 rq->prev_irq_time = curr_irq_time; 1895 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
2001 sched_rt_avg_update(rq, delta_irq); 1896
2002 } 1897 /*
1898 * Since irq_time is only updated on {soft,}irq_exit, we might run into
1899 * this case when a previous update_rq_clock() happened inside a
1900 * {soft,}irq region.
1901 *
1902 * When this happens, we stop ->clock_task and only update the
1903 * prev_irq_time stamp to account for the part that fit, so that a next
1904 * update will consume the rest. This ensures ->clock_task is
1905 * monotonic.
1906 *
1907 * It does however cause some slight miss-attribution of {soft,}irq
1908 * time, a more accurate solution would be to update the irq_time using
1909 * the current rq->clock timestamp, except that would require using
1910 * atomic ops.
1911 */
1912 if (irq_delta > delta)
1913 irq_delta = delta;
1914
1915 rq->prev_irq_time += irq_delta;
1916 delta -= irq_delta;
1917 rq->clock_task += delta;
1918
1919 if (irq_delta && sched_feat(NONIRQ_POWER))
1920 sched_rt_avg_update(rq, irq_delta);
2003} 1921}
2004 1922
2005#else 1923#else /* CONFIG_IRQ_TIME_ACCOUNTING */
2006 1924
2007static u64 irq_time_cpu(int cpu) 1925static void update_rq_clock_task(struct rq *rq, s64 delta)
2008{ 1926{
2009 return 0; 1927 rq->clock_task += delta;
2010} 1928}
2011 1929
2012static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { } 1930#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
2013
2014#endif
2015 1931
2016#include "sched_idletask.c" 1932#include "sched_idletask.c"
2017#include "sched_fair.c" 1933#include "sched_fair.c"
2018#include "sched_rt.c" 1934#include "sched_rt.c"
1935#include "sched_autogroup.c"
2019#include "sched_stoptask.c" 1936#include "sched_stoptask.c"
2020#ifdef CONFIG_SCHED_DEBUG 1937#ifdef CONFIG_SCHED_DEBUG
2021# include "sched_debug.c" 1938# include "sched_debug.c"
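
As a worked illustration of the clamping in update_rq_clock_task() above: only a toy, standalone model (the toy_rq/clock_task_update names are invented for the example and are not from this patch), showing how an irq_delta larger than the elapsed delta is carried over so clock_task stays monotonic.

#include <stdio.h>

/* stand-alone model of the clamp in update_rq_clock_task() */
struct toy_rq { long long clock_task, prev_irq_time; };

static void clock_task_update(struct toy_rq *rq, long long delta, long long irq_time)
{
	long long irq_delta = irq_time - rq->prev_irq_time;

	if (irq_delta > delta)		/* never let irq time run clock_task backwards */
		irq_delta = delta;

	rq->prev_irq_time += irq_delta;
	rq->clock_task += delta - irq_delta;
}

int main(void)
{
	struct toy_rq rq = { 0, 0 };

	/* 3ms pass, but 5ms of irq time was banked meanwhile: clock_task stalls */
	clock_task_update(&rq, 3, 5);
	/* next update: 4ms pass, no new irq time: the leftover 2ms is consumed first */
	clock_task_update(&rq, 4, 5);
	printf("clock_task=%lld prev_irq_time=%lld\n", rq.clock_task, rq.prev_irq_time);
	/* prints clock_task=2 prev_irq_time=5: of 7ms wall time, 5ms went to irq */
	return 0;
}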
@@ -2118,6 +2035,31 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2118 p->sched_class->prio_changed(rq, p, oldprio, running); 2035 p->sched_class->prio_changed(rq, p, oldprio, running);
2119} 2036}
2120 2037
2038static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2039{
2040 const struct sched_class *class;
2041
2042 if (p->sched_class == rq->curr->sched_class) {
2043 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
2044 } else {
2045 for_each_class(class) {
2046 if (class == rq->curr->sched_class)
2047 break;
2048 if (class == p->sched_class) {
2049 resched_task(rq->curr);
2050 break;
2051 }
2052 }
2053 }
2054
2055 /*
2056 * A queue event has occurred, and we're going to schedule. In
2057 * this case, we can save a useless back to back clock update.
2058 */
2059 if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
2060 rq->skip_clock_update = 1;
2061}
2062
2121#ifdef CONFIG_SMP 2063#ifdef CONFIG_SMP
2122/* 2064/*
2123 * Is this task likely cache-hot: 2065 * Is this task likely cache-hot:
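
The cross-class branch in check_preempt_curr() above relies on for_each_class() visiting the scheduling classes in descending priority order; for reference, the ->next chain in this kernel (noted here only for context, after the sched_stoptask changes) is:

/*
 *	stop_sched_class -> rt_sched_class -> fair_sched_class -> idle_sched_class
 *
 * Hitting p->sched_class first therefore means the woken task belongs to a
 * higher-priority class than rq->curr, so rq->curr gets resched_task()'d;
 * hitting rq->curr's class first means no cross-class preemption is needed.
 */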
@@ -2183,10 +2125,8 @@ static int migration_cpu_stop(void *data);
2183 * The task's runqueue lock must be held. 2125 * The task's runqueue lock must be held.
2184 * Returns true if you have to wait for migration thread. 2126 * Returns true if you have to wait for migration thread.
2185 */ 2127 */
2186static bool migrate_task(struct task_struct *p, int dest_cpu) 2128static bool migrate_task(struct task_struct *p, struct rq *rq)
2187{ 2129{
2188 struct rq *rq = task_rq(p);
2189
2190 /* 2130 /*
2191 * If the task is not on a runqueue (and not running), then 2131 * If the task is not on a runqueue (and not running), then
2192 * the next wake-up will properly place the task. 2132 * the next wake-up will properly place the task.
@@ -2366,18 +2306,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2366 return dest_cpu; 2306 return dest_cpu;
2367 2307
2368 /* No more Mr. Nice Guy. */ 2308 /* No more Mr. Nice Guy. */
2369 if (unlikely(dest_cpu >= nr_cpu_ids)) { 2309 dest_cpu = cpuset_cpus_allowed_fallback(p);
2370 dest_cpu = cpuset_cpus_allowed_fallback(p); 2310 /*
2371 /* 2311 * Don't tell them about moving exiting tasks or
2372 * Don't tell them about moving exiting tasks or 2312 * kernel threads (both mm NULL), since they never
2373 * kernel threads (both mm NULL), since they never 2313 * leave kernel.
2374 * leave kernel. 2314 */
2375 */ 2315 if (p->mm && printk_ratelimit()) {
2376 if (p->mm && printk_ratelimit()) { 2316 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
2377 printk(KERN_INFO "process %d (%s) no " 2317 task_pid_nr(p), p->comm, cpu);
2378 "longer affine to cpu%d\n",
2379 task_pid_nr(p), p->comm, cpu);
2380 }
2381 } 2318 }
2382 2319
2383 return dest_cpu; 2320 return dest_cpu;
@@ -2568,7 +2505,7 @@ out:
2568 * try_to_wake_up_local - try to wake up a local task with rq lock held 2505 * try_to_wake_up_local - try to wake up a local task with rq lock held
2569 * @p: the thread to be awakened 2506 * @p: the thread to be awakened
2570 * 2507 *
2571 * Put @p on the run-queue if it's not alredy there. The caller must 2508 * Put @p on the run-queue if it's not already there. The caller must
2572 * ensure that this_rq() is locked, @p is bound to this_rq() and not 2509 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2573 * the current task. this_rq() stays locked over invocation. 2510 * the current task. this_rq() stays locked over invocation.
2574 */ 2511 */
@@ -2713,7 +2650,9 @@ void sched_fork(struct task_struct *p, int clone_flags)
2713 /* Want to start with kernel preemption disabled. */ 2650 /* Want to start with kernel preemption disabled. */
2714 task_thread_info(p)->preempt_count = 1; 2651 task_thread_info(p)->preempt_count = 1;
2715#endif 2652#endif
2653#ifdef CONFIG_SMP
2716 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2654 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2655#endif
2717 2656
2718 put_cpu(); 2657 put_cpu();
2719} 2658}
@@ -3104,6 +3043,15 @@ static long calc_load_fold_active(struct rq *this_rq)
3104 return delta; 3043 return delta;
3105} 3044}
3106 3045
3046static unsigned long
3047calc_load(unsigned long load, unsigned long exp, unsigned long active)
3048{
3049 load *= exp;
3050 load += active * (FIXED_1 - exp);
3051 load += 1UL << (FSHIFT - 1);
3052 return load >> FSHIFT;
3053}
3054
3107#ifdef CONFIG_NO_HZ 3055#ifdef CONFIG_NO_HZ
3108/* 3056/*
3109 * For NO_HZ we delay the active fold to the next LOAD_FREQ update. 3057 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
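
As a worked example of the fixed-point average that calc_load() implements above, a minimal standalone sketch (it assumes the usual FSHIFT/FIXED_1/EXP_1 constants from include/linux/sched.h and is not part of the patch):

#include <stdio.h>

#define FSHIFT   11			/* bits of fixed-point precision */
#define FIXED_1  (1 << FSHIFT)		/* 1.0 in fixed-point */
#define EXP_1    1884			/* 1/exp(5sec/1min) in fixed-point */

/* same arithmetic as the kernel's calc_load(), including the rounding term */
static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	load += 1UL << (FSHIFT - 1);
	return load >> FSHIFT;
}

int main(void)
{
	/* idle system, then two runnable tasks for one LOAD_FREQ interval */
	unsigned long avenrun_1 = 0;
	unsigned long active = 2 * FIXED_1;

	avenrun_1 = calc_load(avenrun_1, EXP_1, active);
	/* avenrun_1 becomes 328, printed as 0.16: each task pulls the
	 * 1-minute average up by roughly 8% of the gap per 5s interval */
	printf("%lu.%02lu\n", avenrun_1 >> FSHIFT,
	       (avenrun_1 & (FIXED_1 - 1)) * 100 / FIXED_1);
	return 0;
}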
@@ -3133,6 +3081,128 @@ static long calc_load_fold_idle(void)
3133 3081
3134 return delta; 3082 return delta;
3135} 3083}
3084
3085/**
3086 * fixed_power_int - compute: x^n, in O(log n) time
3087 *
3088 * @x: base of the power
3089 * @frac_bits: fractional bits of @x
3090 * @n: power to raise @x to.
3091 *
3092 * By exploiting the relation between the definition of the natural power
3093 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
3094 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
3095 * (where: n_i \elem {0, 1}, the binary vector representing n),
3096 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
3097 * of course trivially computable in O(log_2 n), the length of our binary
3098 * vector.
3099 */
3100static unsigned long
3101fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
3102{
3103 unsigned long result = 1UL << frac_bits;
3104
3105 if (n) for (;;) {
3106 if (n & 1) {
3107 result *= x;
3108 result += 1UL << (frac_bits - 1);
3109 result >>= frac_bits;
3110 }
3111 n >>= 1;
3112 if (!n)
3113 break;
3114 x *= x;
3115 x += 1UL << (frac_bits - 1);
3116 x >>= frac_bits;
3117 }
3118
3119 return result;
3120}
3121
3122/*
3123 * a1 = a0 * e + a * (1 - e)
3124 *
3125 * a2 = a1 * e + a * (1 - e)
3126 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
3127 * = a0 * e^2 + a * (1 - e) * (1 + e)
3128 *
3129 * a3 = a2 * e + a * (1 - e)
3130 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
3131 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
3132 *
3133 * ...
3134 *
3135 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
3136 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
3137 * = a0 * e^n + a * (1 - e^n)
3138 *
3139 * [1] application of the geometric series:
3140 *
3141 * n 1 - x^(n+1)
3142 * S_n := \Sum x^i = -------------
3143 * i=0 1 - x
3144 */
3145static unsigned long
3146calc_load_n(unsigned long load, unsigned long exp,
3147 unsigned long active, unsigned int n)
3148{
3149
3150 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
3151}
3152
3153/*
3154 * NO_HZ can leave us missing all per-cpu ticks calling
3155 * calc_load_account_active(), but since an idle CPU folds its delta into
3156 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
3157 * in the pending idle delta if our idle period crossed a load cycle boundary.
3158 *
3159 * Once we've updated the global active value, we need to apply the exponential
3160 * weights adjusted to the number of cycles missed.
3161 */
3162static void calc_global_nohz(unsigned long ticks)
3163{
3164 long delta, active, n;
3165
3166 if (time_before(jiffies, calc_load_update))
3167 return;
3168
3169 /*
3170 * If we crossed a calc_load_update boundary, make sure to fold
3171 * any pending idle changes, the respective CPUs might have
3172 * missed the tick driven calc_load_account_active() update
3173 * due to NO_HZ.
3174 */
3175 delta = calc_load_fold_idle();
3176 if (delta)
3177 atomic_long_add(delta, &calc_load_tasks);
3178
3179 /*
3180 * If we were idle for multiple load cycles, apply them.
3181 */
3182 if (ticks >= LOAD_FREQ) {
3183 n = ticks / LOAD_FREQ;
3184
3185 active = atomic_long_read(&calc_load_tasks);
3186 active = active > 0 ? active * FIXED_1 : 0;
3187
3188 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
3189 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
3190 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
3191
3192 calc_load_update += n * LOAD_FREQ;
3193 }
3194
3195 /*
3196 * Its possible the remainder of the above division also crosses
3197 * a LOAD_FREQ period, the regular check in calc_global_load()
3198 * which comes after this will take care of that.
3199 *
3200 * Consider us being 11 ticks before a cycle completion, and us
3201 * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
3202 * age us 4 cycles, and the test in calc_global_load() will
3203 * pick up the final one.
3204 */
3205}
3136#else 3206#else
3137static void calc_load_account_idle(struct rq *this_rq) 3207static void calc_load_account_idle(struct rq *this_rq)
3138{ 3208{
@@ -3142,6 +3212,10 @@ static inline long calc_load_fold_idle(void)
3142{ 3212{
3143 return 0; 3213 return 0;
3144} 3214}
3215
3216static void calc_global_nohz(unsigned long ticks)
3217{
3218}
3145#endif 3219#endif
3146 3220
3147/** 3221/**
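
To make the missed-cycle folding above concrete, a small standalone sketch (same assumed constants as before, fpow() is an invented stand-in mirroring fixed_power_int(); not part of the patch) that ages a load average over n = 4 missed LOAD_FREQ periods using the closed form a0*e^n + a*(1 - e^n):

#include <stdio.h>

#define FSHIFT   11
#define FIXED_1  (1UL << FSHIFT)
#define EXP_1    1884			/* 1/exp(5sec/1min), fixed-point */

/* fixed-point x^n by squaring, with rounding -- mirrors fixed_power_int() */
static unsigned long fpow(unsigned long x, unsigned int n)
{
	unsigned long r = FIXED_1;

	while (n) {
		if (n & 1)
			r = (r * x + FIXED_1 / 2) >> FSHIFT;
		n >>= 1;
		if (n)
			x = (x * x + FIXED_1 / 2) >> FSHIFT;
	}
	return r;
}

int main(void)
{
	/* idle box wakes up after missing n = 4 periods with 2 tasks runnable */
	unsigned long e_n = fpow(EXP_1, 4);		/* 1466, i.e. e^4 ~= 0.716 */
	unsigned long a   = 2 * FIXED_1;
	unsigned long avg = (0 * e_n + a * (FIXED_1 - e_n)
			     + FIXED_1 / 2) >> FSHIFT;	/* a0*e^n + a*(1 - e^n) */

	printf("e^4 = %lu/2048, avg = %lu/2048 (~0.57)\n", e_n, avg);
	return 0;
}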
@@ -3159,24 +3233,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
3159 loads[2] = (avenrun[2] + offset) << shift; 3233 loads[2] = (avenrun[2] + offset) << shift;
3160} 3234}
3161 3235
3162static unsigned long
3163calc_load(unsigned long load, unsigned long exp, unsigned long active)
3164{
3165 load *= exp;
3166 load += active * (FIXED_1 - exp);
3167 return load >> FSHIFT;
3168}
3169
3170/* 3236/*
3171 * calc_load - update the avenrun load estimates 10 ticks after the 3237 * calc_load - update the avenrun load estimates 10 ticks after the
3172 * CPUs have updated calc_load_tasks. 3238 * CPUs have updated calc_load_tasks.
3173 */ 3239 */
3174void calc_global_load(void) 3240void calc_global_load(unsigned long ticks)
3175{ 3241{
3176 unsigned long upd = calc_load_update + 10;
3177 long active; 3242 long active;
3178 3243
3179 if (time_before(jiffies, upd)) 3244 calc_global_nohz(ticks);
3245
3246 if (time_before(jiffies, calc_load_update + 10))
3180 return; 3247 return;
3181 3248
3182 active = atomic_long_read(&calc_load_tasks); 3249 active = atomic_long_read(&calc_load_tasks);
@@ -3349,7 +3416,7 @@ void sched_exec(void)
3349 * select_task_rq() can race against ->cpus_allowed 3416 * select_task_rq() can race against ->cpus_allowed
3350 */ 3417 */
3351 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && 3418 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3352 likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) { 3419 likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
3353 struct migration_arg arg = { p, dest_cpu }; 3420 struct migration_arg arg = { p, dest_cpu };
3354 3421
3355 task_rq_unlock(rq, &flags); 3422 task_rq_unlock(rq, &flags);
@@ -3820,7 +3887,7 @@ static inline void schedule_debug(struct task_struct *prev)
3820 schedstat_inc(this_rq(), sched_count); 3887 schedstat_inc(this_rq(), sched_count);
3821#ifdef CONFIG_SCHEDSTATS 3888#ifdef CONFIG_SCHEDSTATS
3822 if (unlikely(prev->lock_depth >= 0)) { 3889 if (unlikely(prev->lock_depth >= 0)) {
3823 schedstat_inc(this_rq(), bkl_count); 3890 schedstat_inc(this_rq(), rq_sched_info.bkl_count);
3824 schedstat_inc(prev, sched_info.bkl_count); 3891 schedstat_inc(prev, sched_info.bkl_count);
3825 } 3892 }
3826#endif 3893#endif
@@ -3830,7 +3897,6 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
3830{ 3897{
3831 if (prev->se.on_rq) 3898 if (prev->se.on_rq)
3832 update_rq_clock(rq); 3899 update_rq_clock(rq);
3833 rq->skip_clock_update = 0;
3834 prev->sched_class->put_prev_task(rq, prev); 3900 prev->sched_class->put_prev_task(rq, prev);
3835} 3901}
3836 3902
@@ -3888,7 +3954,6 @@ need_resched_nonpreemptible:
3888 hrtick_clear(rq); 3954 hrtick_clear(rq);
3889 3955
3890 raw_spin_lock_irq(&rq->lock); 3956 raw_spin_lock_irq(&rq->lock);
3891 clear_tsk_need_resched(prev);
3892 3957
3893 switch_count = &prev->nivcsw; 3958 switch_count = &prev->nivcsw;
3894 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3959 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@ -3920,6 +3985,8 @@ need_resched_nonpreemptible:
3920 3985
3921 put_prev_task(rq, prev); 3986 put_prev_task(rq, prev);
3922 next = pick_next_task(rq); 3987 next = pick_next_task(rq);
3988 clear_tsk_need_resched(prev);
3989 rq->skip_clock_update = 0;
3923 3990
3924 if (likely(prev != next)) { 3991 if (likely(prev != next)) {
3925 sched_info_switch(prev, next); 3992 sched_info_switch(prev, next);
@@ -4014,7 +4081,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
4014 if (task_thread_info(rq->curr) != owner || need_resched()) 4081 if (task_thread_info(rq->curr) != owner || need_resched())
4015 return 0; 4082 return 0;
4016 4083
4017 cpu_relax(); 4084 arch_mutex_cpu_relax();
4018 } 4085 }
4019 4086
4020 return 1; 4087 return 1;
@@ -4326,7 +4393,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
4326 * This waits for either a completion of a specific task to be signaled or for a 4393 * This waits for either a completion of a specific task to be signaled or for a
4327 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 4394 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4328 */ 4395 */
4329unsigned long __sched 4396long __sched
4330wait_for_completion_interruptible_timeout(struct completion *x, 4397wait_for_completion_interruptible_timeout(struct completion *x,
4331 unsigned long timeout) 4398 unsigned long timeout)
4332{ 4399{
@@ -4359,7 +4426,7 @@ EXPORT_SYMBOL(wait_for_completion_killable);
4359 * signaled or for a specified timeout to expire. It can be 4426 * signaled or for a specified timeout to expire. It can be
4360 * interrupted by a kill signal. The timeout is in jiffies. 4427 * interrupted by a kill signal. The timeout is in jiffies.
4361 */ 4428 */
4362unsigned long __sched 4429long __sched
4363wait_for_completion_killable_timeout(struct completion *x, 4430wait_for_completion_killable_timeout(struct completion *x,
4364 unsigned long timeout) 4431 unsigned long timeout)
4365{ 4432{
@@ -4701,7 +4768,7 @@ static bool check_same_owner(struct task_struct *p)
4701} 4768}
4702 4769
4703static int __sched_setscheduler(struct task_struct *p, int policy, 4770static int __sched_setscheduler(struct task_struct *p, int policy,
4704 struct sched_param *param, bool user) 4771 const struct sched_param *param, bool user)
4705{ 4772{
4706 int retval, oldprio, oldpolicy = -1, on_rq, running; 4773 int retval, oldprio, oldpolicy = -1, on_rq, running;
4707 unsigned long flags; 4774 unsigned long flags;
@@ -4804,7 +4871,8 @@ recheck:
4804 * assigned. 4871 * assigned.
4805 */ 4872 */
4806 if (rt_bandwidth_enabled() && rt_policy(policy) && 4873 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4807 task_group(p)->rt_bandwidth.rt_runtime == 0) { 4874 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
4875 !task_group_is_autogroup(task_group(p))) {
4808 __task_rq_unlock(rq); 4876 __task_rq_unlock(rq);
4809 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4877 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4810 return -EPERM; 4878 return -EPERM;
@@ -4856,7 +4924,7 @@ recheck:
4856 * NOTE that the task may be already dead. 4924 * NOTE that the task may be already dead.
4857 */ 4925 */
4858int sched_setscheduler(struct task_struct *p, int policy, 4926int sched_setscheduler(struct task_struct *p, int policy,
4859 struct sched_param *param) 4927 const struct sched_param *param)
4860{ 4928{
4861 return __sched_setscheduler(p, policy, param, true); 4929 return __sched_setscheduler(p, policy, param, true);
4862} 4930}
@@ -4874,7 +4942,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
4874 * but our caller might not have that capability. 4942 * but our caller might not have that capability.
4875 */ 4943 */
4876int sched_setscheduler_nocheck(struct task_struct *p, int policy, 4944int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4877 struct sched_param *param) 4945 const struct sched_param *param)
4878{ 4946{
4879 return __sched_setscheduler(p, policy, param, false); 4947 return __sched_setscheduler(p, policy, param, false);
4880} 4948}
@@ -5390,7 +5458,7 @@ void sched_show_task(struct task_struct *p)
5390 unsigned state; 5458 unsigned state;
5391 5459
5392 state = p->state ? __ffs(p->state) + 1 : 0; 5460 state = p->state ? __ffs(p->state) + 1 : 0;
5393 printk(KERN_INFO "%-13.13s %c", p->comm, 5461 printk(KERN_INFO "%-15.15s %c", p->comm,
5394 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 5462 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5395#if BITS_PER_LONG == 32 5463#if BITS_PER_LONG == 32
5396 if (state == TASK_RUNNING) 5464 if (state == TASK_RUNNING)
@@ -5554,7 +5622,6 @@ static void update_sysctl(void)
5554 SET_SYSCTL(sched_min_granularity); 5622 SET_SYSCTL(sched_min_granularity);
5555 SET_SYSCTL(sched_latency); 5623 SET_SYSCTL(sched_latency);
5556 SET_SYSCTL(sched_wakeup_granularity); 5624 SET_SYSCTL(sched_wakeup_granularity);
5557 SET_SYSCTL(sched_shares_ratelimit);
5558#undef SET_SYSCTL 5625#undef SET_SYSCTL
5559} 5626}
5560 5627
@@ -5630,7 +5697,7 @@ again:
5630 goto out; 5697 goto out;
5631 5698
5632 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 5699 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5633 if (migrate_task(p, dest_cpu)) { 5700 if (migrate_task(p, rq)) {
5634 struct migration_arg arg = { p, dest_cpu }; 5701 struct migration_arg arg = { p, dest_cpu };
5635 /* Need help from migration thread: drop lock and wait. */ 5702 /* Need help from migration thread: drop lock and wait. */
5636 task_rq_unlock(rq, &flags); 5703 task_rq_unlock(rq, &flags);
@@ -5712,29 +5779,20 @@ static int migration_cpu_stop(void *data)
5712} 5779}
5713 5780
5714#ifdef CONFIG_HOTPLUG_CPU 5781#ifdef CONFIG_HOTPLUG_CPU
5782
5715/* 5783/*
5716 * Figure out where task on dead CPU should go, use force if necessary. 5784 * Ensures that the idle task is using init_mm right before its cpu goes
5785 * offline.
5717 */ 5786 */
5718void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5787void idle_task_exit(void)
5719{ 5788{
5720 struct rq *rq = cpu_rq(dead_cpu); 5789 struct mm_struct *mm = current->active_mm;
5721 int needs_cpu, uninitialized_var(dest_cpu);
5722 unsigned long flags;
5723 5790
5724 local_irq_save(flags); 5791 BUG_ON(cpu_online(smp_processor_id()));
5725 5792
5726 raw_spin_lock(&rq->lock); 5793 if (mm != &init_mm)
5727 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); 5794 switch_mm(mm, &init_mm, current);
5728 if (needs_cpu) 5795 mmdrop(mm);
5729 dest_cpu = select_fallback_rq(dead_cpu, p);
5730 raw_spin_unlock(&rq->lock);
5731 /*
5732 * It can only fail if we race with set_cpus_allowed(),
5733 * in the racer should migrate the task anyway.
5734 */
5735 if (needs_cpu)
5736 __migrate_task(p, dead_cpu, dest_cpu);
5737 local_irq_restore(flags);
5738} 5796}
5739 5797
5740/* 5798/*
@@ -5747,128 +5805,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5747static void migrate_nr_uninterruptible(struct rq *rq_src) 5805static void migrate_nr_uninterruptible(struct rq *rq_src)
5748{ 5806{
5749 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); 5807 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
5750 unsigned long flags;
5751 5808
5752 local_irq_save(flags);
5753 double_rq_lock(rq_src, rq_dest);
5754 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 5809 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5755 rq_src->nr_uninterruptible = 0; 5810 rq_src->nr_uninterruptible = 0;
5756 double_rq_unlock(rq_src, rq_dest);
5757 local_irq_restore(flags);
5758}
5759
5760/* Run through task list and migrate tasks from the dead cpu. */
5761static void migrate_live_tasks(int src_cpu)
5762{
5763 struct task_struct *p, *t;
5764
5765 read_lock(&tasklist_lock);
5766
5767 do_each_thread(t, p) {
5768 if (p == current)
5769 continue;
5770
5771 if (task_cpu(p) == src_cpu)
5772 move_task_off_dead_cpu(src_cpu, p);
5773 } while_each_thread(t, p);
5774
5775 read_unlock(&tasklist_lock);
5776} 5811}
5777 5812
5778/* 5813/*
5779 * Schedules idle task to be the next runnable task on current CPU. 5814 * remove the tasks which were accounted by rq from calc_load_tasks.
5780 * It does so by boosting its priority to highest possible.
5781 * Used by CPU offline code.
5782 */ 5815 */
5783void sched_idle_next(void) 5816static void calc_global_load_remove(struct rq *rq)
5784{ 5817{
5785 int this_cpu = smp_processor_id(); 5818 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5786 struct rq *rq = cpu_rq(this_cpu); 5819 rq->calc_load_active = 0;
5787 struct task_struct *p = rq->idle;
5788 unsigned long flags;
5789
5790 /* cpu has to be offline */
5791 BUG_ON(cpu_online(this_cpu));
5792
5793 /*
5794 * Strictly not necessary since rest of the CPUs are stopped by now
5795 * and interrupts disabled on the current cpu.
5796 */
5797 raw_spin_lock_irqsave(&rq->lock, flags);
5798
5799 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5800
5801 activate_task(rq, p, 0);
5802
5803 raw_spin_unlock_irqrestore(&rq->lock, flags);
5804} 5820}
5805 5821
5806/* 5822/*
5807 * Ensures that the idle task is using init_mm right before its cpu goes 5823 * Migrate all tasks from the rq, sleeping tasks will be migrated by
5808 * offline. 5824 * try_to_wake_up()->select_task_rq().
5825 *
5826 * Called with rq->lock held even though we'er in stop_machine() and
5827 * there's no concurrency possible, we hold the required locks anyway
5828 * because of lock validation efforts.
5809 */ 5829 */
5810void idle_task_exit(void) 5830static void migrate_tasks(unsigned int dead_cpu)
5811{
5812 struct mm_struct *mm = current->active_mm;
5813
5814 BUG_ON(cpu_online(smp_processor_id()));
5815
5816 if (mm != &init_mm)
5817 switch_mm(mm, &init_mm, current);
5818 mmdrop(mm);
5819}
5820
5821/* called under rq->lock with disabled interrupts */
5822static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5823{ 5831{
5824 struct rq *rq = cpu_rq(dead_cpu); 5832 struct rq *rq = cpu_rq(dead_cpu);
5825 5833 struct task_struct *next, *stop = rq->stop;
5826 /* Must be exiting, otherwise would be on tasklist. */ 5834 int dest_cpu;
5827 BUG_ON(!p->exit_state);
5828
5829 /* Cannot have done final schedule yet: would have vanished. */
5830 BUG_ON(p->state == TASK_DEAD);
5831
5832 get_task_struct(p);
5833 5835
5834 /* 5836 /*
5835 * Drop lock around migration; if someone else moves it, 5837 * Fudge the rq selection such that the below task selection loop
5836 * that's OK. No task can be added to this CPU, so iteration is 5838 * doesn't get stuck on the currently eligible stop task.
5837 * fine. 5839 *
5840 * We're currently inside stop_machine() and the rq is either stuck
5841 * in the stop_machine_cpu_stop() loop, or we're executing this code,
5842 * either way we should never end up calling schedule() until we're
5843 * done here.
5838 */ 5844 */
5839 raw_spin_unlock_irq(&rq->lock); 5845 rq->stop = NULL;
5840 move_task_off_dead_cpu(dead_cpu, p);
5841 raw_spin_lock_irq(&rq->lock);
5842
5843 put_task_struct(p);
5844}
5845
5846/* release_task() removes task from tasklist, so we won't find dead tasks. */
5847static void migrate_dead_tasks(unsigned int dead_cpu)
5848{
5849 struct rq *rq = cpu_rq(dead_cpu);
5850 struct task_struct *next;
5851 5846
5852 for ( ; ; ) { 5847 for ( ; ; ) {
5853 if (!rq->nr_running) 5848 /*
5849 * There's this thread running, bail when that's the only
5850 * remaining thread.
5851 */
5852 if (rq->nr_running == 1)
5854 break; 5853 break;
5854
5855 next = pick_next_task(rq); 5855 next = pick_next_task(rq);
5856 if (!next) 5856 BUG_ON(!next);
5857 break;
5858 next->sched_class->put_prev_task(rq, next); 5857 next->sched_class->put_prev_task(rq, next);
5859 migrate_dead(dead_cpu, next);
5860 5858
5859 /* Find suitable destination for @next, with force if needed. */
5860 dest_cpu = select_fallback_rq(dead_cpu, next);
5861 raw_spin_unlock(&rq->lock);
5862
5863 __migrate_task(next, dead_cpu, dest_cpu);
5864
5865 raw_spin_lock(&rq->lock);
5861 } 5866 }
5862}
5863 5867
5864/* 5868 rq->stop = stop;
5865 * remove the tasks which were accounted by rq from calc_load_tasks.
5866 */
5867static void calc_global_load_remove(struct rq *rq)
5868{
5869 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5870 rq->calc_load_active = 0;
5871} 5869}
5870
5872#endif /* CONFIG_HOTPLUG_CPU */ 5871#endif /* CONFIG_HOTPLUG_CPU */
5873 5872
5874#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 5873#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -6078,15 +6077,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6078 unsigned long flags; 6077 unsigned long flags;
6079 struct rq *rq = cpu_rq(cpu); 6078 struct rq *rq = cpu_rq(cpu);
6080 6079
6081 switch (action) { 6080 switch (action & ~CPU_TASKS_FROZEN) {
6082 6081
6083 case CPU_UP_PREPARE: 6082 case CPU_UP_PREPARE:
6084 case CPU_UP_PREPARE_FROZEN:
6085 rq->calc_load_update = calc_load_update; 6083 rq->calc_load_update = calc_load_update;
6086 break; 6084 break;
6087 6085
6088 case CPU_ONLINE: 6086 case CPU_ONLINE:
6089 case CPU_ONLINE_FROZEN:
6090 /* Update our root-domain */ 6087 /* Update our root-domain */
6091 raw_spin_lock_irqsave(&rq->lock, flags); 6088 raw_spin_lock_irqsave(&rq->lock, flags);
6092 if (rq->rd) { 6089 if (rq->rd) {
@@ -6098,30 +6095,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6098 break; 6095 break;
6099 6096
6100#ifdef CONFIG_HOTPLUG_CPU 6097#ifdef CONFIG_HOTPLUG_CPU
6101 case CPU_DEAD:
6102 case CPU_DEAD_FROZEN:
6103 migrate_live_tasks(cpu);
6104 /* Idle task back to normal (off runqueue, low prio) */
6105 raw_spin_lock_irq(&rq->lock);
6106 deactivate_task(rq, rq->idle, 0);
6107 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
6108 rq->idle->sched_class = &idle_sched_class;
6109 migrate_dead_tasks(cpu);
6110 raw_spin_unlock_irq(&rq->lock);
6111 migrate_nr_uninterruptible(rq);
6112 BUG_ON(rq->nr_running != 0);
6113 calc_global_load_remove(rq);
6114 break;
6115
6116 case CPU_DYING: 6098 case CPU_DYING:
6117 case CPU_DYING_FROZEN:
6118 /* Update our root-domain */ 6099 /* Update our root-domain */
6119 raw_spin_lock_irqsave(&rq->lock, flags); 6100 raw_spin_lock_irqsave(&rq->lock, flags);
6120 if (rq->rd) { 6101 if (rq->rd) {
6121 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6102 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
6122 set_rq_offline(rq); 6103 set_rq_offline(rq);
6123 } 6104 }
6105 migrate_tasks(cpu);
6106 BUG_ON(rq->nr_running != 1); /* the migration thread */
6124 raw_spin_unlock_irqrestore(&rq->lock, flags); 6107 raw_spin_unlock_irqrestore(&rq->lock, flags);
6108
6109 migrate_nr_uninterruptible(rq);
6110 calc_global_load_remove(rq);
6125 break; 6111 break;
6126#endif 6112#endif
6127 } 6113 }
@@ -6960,6 +6946,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6960 if (cpu != group_first_cpu(sd->groups)) 6946 if (cpu != group_first_cpu(sd->groups))
6961 return; 6947 return;
6962 6948
6949 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
6950
6963 child = sd->child; 6951 child = sd->child;
6964 6952
6965 sd->groups->cpu_power = 0; 6953 sd->groups->cpu_power = 0;
@@ -7850,18 +7838,16 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7850 7838
7851#ifdef CONFIG_FAIR_GROUP_SCHED 7839#ifdef CONFIG_FAIR_GROUP_SCHED
7852static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 7840static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7853 struct sched_entity *se, int cpu, int add, 7841 struct sched_entity *se, int cpu,
7854 struct sched_entity *parent) 7842 struct sched_entity *parent)
7855{ 7843{
7856 struct rq *rq = cpu_rq(cpu); 7844 struct rq *rq = cpu_rq(cpu);
7857 tg->cfs_rq[cpu] = cfs_rq; 7845 tg->cfs_rq[cpu] = cfs_rq;
7858 init_cfs_rq(cfs_rq, rq); 7846 init_cfs_rq(cfs_rq, rq);
7859 cfs_rq->tg = tg; 7847 cfs_rq->tg = tg;
7860 if (add)
7861 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7862 7848
7863 tg->se[cpu] = se; 7849 tg->se[cpu] = se;
7864 /* se could be NULL for init_task_group */ 7850 /* se could be NULL for root_task_group */
7865 if (!se) 7851 if (!se)
7866 return; 7852 return;
7867 7853
@@ -7871,15 +7857,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7871 se->cfs_rq = parent->my_q; 7857 se->cfs_rq = parent->my_q;
7872 7858
7873 se->my_q = cfs_rq; 7859 se->my_q = cfs_rq;
7874 se->load.weight = tg->shares; 7860 update_load_set(&se->load, 0);
7875 se->load.inv_weight = 0;
7876 se->parent = parent; 7861 se->parent = parent;
7877} 7862}
7878#endif 7863#endif
7879 7864
7880#ifdef CONFIG_RT_GROUP_SCHED 7865#ifdef CONFIG_RT_GROUP_SCHED
7881static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 7866static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7882 struct sched_rt_entity *rt_se, int cpu, int add, 7867 struct sched_rt_entity *rt_se, int cpu,
7883 struct sched_rt_entity *parent) 7868 struct sched_rt_entity *parent)
7884{ 7869{
7885 struct rq *rq = cpu_rq(cpu); 7870 struct rq *rq = cpu_rq(cpu);
@@ -7888,8 +7873,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7888 init_rt_rq(rt_rq, rq); 7873 init_rt_rq(rt_rq, rq);
7889 rt_rq->tg = tg; 7874 rt_rq->tg = tg;
7890 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7875 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
7891 if (add)
7892 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7893 7876
7894 tg->rt_se[cpu] = rt_se; 7877 tg->rt_se[cpu] = rt_se;
7895 if (!rt_se) 7878 if (!rt_se)
@@ -7924,18 +7907,18 @@ void __init sched_init(void)
7924 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 7907 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
7925 7908
7926#ifdef CONFIG_FAIR_GROUP_SCHED 7909#ifdef CONFIG_FAIR_GROUP_SCHED
7927 init_task_group.se = (struct sched_entity **)ptr; 7910 root_task_group.se = (struct sched_entity **)ptr;
7928 ptr += nr_cpu_ids * sizeof(void **); 7911 ptr += nr_cpu_ids * sizeof(void **);
7929 7912
7930 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 7913 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
7931 ptr += nr_cpu_ids * sizeof(void **); 7914 ptr += nr_cpu_ids * sizeof(void **);
7932 7915
7933#endif /* CONFIG_FAIR_GROUP_SCHED */ 7916#endif /* CONFIG_FAIR_GROUP_SCHED */
7934#ifdef CONFIG_RT_GROUP_SCHED 7917#ifdef CONFIG_RT_GROUP_SCHED
7935 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7918 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
7936 ptr += nr_cpu_ids * sizeof(void **); 7919 ptr += nr_cpu_ids * sizeof(void **);
7937 7920
7938 init_task_group.rt_rq = (struct rt_rq **)ptr; 7921 root_task_group.rt_rq = (struct rt_rq **)ptr;
7939 ptr += nr_cpu_ids * sizeof(void **); 7922 ptr += nr_cpu_ids * sizeof(void **);
7940 7923
7941#endif /* CONFIG_RT_GROUP_SCHED */ 7924#endif /* CONFIG_RT_GROUP_SCHED */
@@ -7955,20 +7938,16 @@ void __init sched_init(void)
7955 global_rt_period(), global_rt_runtime()); 7938 global_rt_period(), global_rt_runtime());
7956 7939
7957#ifdef CONFIG_RT_GROUP_SCHED 7940#ifdef CONFIG_RT_GROUP_SCHED
7958 init_rt_bandwidth(&init_task_group.rt_bandwidth, 7941 init_rt_bandwidth(&root_task_group.rt_bandwidth,
7959 global_rt_period(), global_rt_runtime()); 7942 global_rt_period(), global_rt_runtime());
7960#endif /* CONFIG_RT_GROUP_SCHED */ 7943#endif /* CONFIG_RT_GROUP_SCHED */
7961 7944
7962#ifdef CONFIG_CGROUP_SCHED 7945#ifdef CONFIG_CGROUP_SCHED
7963 list_add(&init_task_group.list, &task_groups); 7946 list_add(&root_task_group.list, &task_groups);
7964 INIT_LIST_HEAD(&init_task_group.children); 7947 INIT_LIST_HEAD(&root_task_group.children);
7965 7948 autogroup_init(&init_task);
7966#endif /* CONFIG_CGROUP_SCHED */ 7949#endif /* CONFIG_CGROUP_SCHED */
7967 7950
7968#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
7969 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
7970 __alignof__(unsigned long));
7971#endif
7972 for_each_possible_cpu(i) { 7951 for_each_possible_cpu(i) {
7973 struct rq *rq; 7952 struct rq *rq;
7974 7953
@@ -7980,38 +7959,34 @@ void __init sched_init(void)
7980 init_cfs_rq(&rq->cfs, rq); 7959 init_cfs_rq(&rq->cfs, rq);
7981 init_rt_rq(&rq->rt, rq); 7960 init_rt_rq(&rq->rt, rq);
7982#ifdef CONFIG_FAIR_GROUP_SCHED 7961#ifdef CONFIG_FAIR_GROUP_SCHED
7983 init_task_group.shares = init_task_group_load; 7962 root_task_group.shares = root_task_group_load;
7984 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 7963 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7985#ifdef CONFIG_CGROUP_SCHED
7986 /* 7964 /*
7987 * How much cpu bandwidth does init_task_group get? 7965 * How much cpu bandwidth does root_task_group get?
7988 * 7966 *
7989 * In case of task-groups formed thr' the cgroup filesystem, it 7967 * In case of task-groups formed thr' the cgroup filesystem, it
7990 * gets 100% of the cpu resources in the system. This overall 7968 * gets 100% of the cpu resources in the system. This overall
7991 * system cpu resource is divided among the tasks of 7969 * system cpu resource is divided among the tasks of
7992 * init_task_group and its child task-groups in a fair manner, 7970 * root_task_group and its child task-groups in a fair manner,
7993 * based on each entity's (task or task-group's) weight 7971 * based on each entity's (task or task-group's) weight
7994 * (se->load.weight). 7972 * (se->load.weight).
7995 * 7973 *
7996 * In other words, if init_task_group has 10 tasks of weight 7974 * In other words, if root_task_group has 10 tasks of weight
7997 * 1024) and two child groups A0 and A1 (of weight 1024 each), 7975 * 1024) and two child groups A0 and A1 (of weight 1024 each),
7998 * then A0's share of the cpu resource is: 7976 * then A0's share of the cpu resource is:
7999 * 7977 *
8000 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 7978 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
8001 * 7979 *
8002 * We achieve this by letting init_task_group's tasks sit 7980 * We achieve this by letting root_task_group's tasks sit
8003 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7981 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
8004 */ 7982 */
8005 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7983 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
8006#endif
8007#endif /* CONFIG_FAIR_GROUP_SCHED */ 7984#endif /* CONFIG_FAIR_GROUP_SCHED */
8008 7985
8009 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 7986 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
8010#ifdef CONFIG_RT_GROUP_SCHED 7987#ifdef CONFIG_RT_GROUP_SCHED
8011 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7988 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
8012#ifdef CONFIG_CGROUP_SCHED 7989 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
8013 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
8014#endif
8015#endif 7990#endif
8016 7991
8017 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7992 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -8091,8 +8066,6 @@ void __init sched_init(void)
8091 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 8066 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
8092#endif /* SMP */ 8067#endif /* SMP */
8093 8068
8094 perf_event_init();
8095
8096 scheduler_running = 1; 8069 scheduler_running = 1;
8097} 8070}
8098 8071
@@ -8286,7 +8259,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8286 if (!se) 8259 if (!se)
8287 goto err_free_rq; 8260 goto err_free_rq;
8288 8261
8289 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 8262 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8290 } 8263 }
8291 8264
8292 return 1; 8265 return 1;
@@ -8297,15 +8270,21 @@ err:
8297 return 0; 8270 return 0;
8298} 8271}
8299 8272
8300static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8301{
8302 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
8303 &cpu_rq(cpu)->leaf_cfs_rq_list);
8304}
8305
8306static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8273static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8307{ 8274{
8308 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8275 struct rq *rq = cpu_rq(cpu);
8276 unsigned long flags;
8277
8278 /*
 8279 * Only empty task groups can be destroyed, so we can speculatively
8280 * check on_list without danger of it being re-added.
8281 */
8282 if (!tg->cfs_rq[cpu]->on_list)
8283 return;
8284
8285 raw_spin_lock_irqsave(&rq->lock, flags);
8286 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8287 raw_spin_unlock_irqrestore(&rq->lock, flags);
8309} 8288}
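
The on_list flag consulted above is only ever set or cleared under rq->lock by the leaf-list helpers in sched_fair.c, which is why the lockless check is safe once the group is known to be empty. A simplified sketch of what those helpers do (the real list_add_leaf_cfs_rq in this series also handles parent/child ordering on the leaf list):

static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	if (!cfs_rq->on_list) {
		/* Hook this cfs_rq onto its cpu's leaf list; caller holds rq->lock. */
		list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
			     &rq_of(cfs_rq)->leaf_cfs_rq_list);
		cfs_rq->on_list = 1;
	}
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
	if (cfs_rq->on_list) {
		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
		cfs_rq->on_list = 0;
	}
}
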
8310#else /* !CONFIG_FAIR_GROUP_SCHED */ 8289#else /* !CONFIG_FAIR_GROUP_SCHED */
8311static inline void free_fair_sched_group(struct task_group *tg) 8290static inline void free_fair_sched_group(struct task_group *tg)
@@ -8318,10 +8297,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8318 return 1; 8297 return 1;
8319} 8298}
8320 8299
8321static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8322{
8323}
8324
8325static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8300static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8326{ 8301{
8327} 8302}
@@ -8376,7 +8351,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8376 if (!rt_se) 8351 if (!rt_se)
8377 goto err_free_rq; 8352 goto err_free_rq;
8378 8353
8379 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 8354 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8380 } 8355 }
8381 8356
8382 return 1; 8357 return 1;
@@ -8386,17 +8361,6 @@ err_free_rq:
8386err: 8361err:
8387 return 0; 8362 return 0;
8388} 8363}
8389
8390static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8391{
8392 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
8393 &cpu_rq(cpu)->leaf_rt_rq_list);
8394}
8395
8396static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8397{
8398 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8399}
8400#else /* !CONFIG_RT_GROUP_SCHED */ 8364#else /* !CONFIG_RT_GROUP_SCHED */
8401static inline void free_rt_sched_group(struct task_group *tg) 8365static inline void free_rt_sched_group(struct task_group *tg)
8402{ 8366{
@@ -8407,14 +8371,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8407{ 8371{
8408 return 1; 8372 return 1;
8409} 8373}
8410
8411static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8412{
8413}
8414
8415static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8416{
8417}
8418#endif /* CONFIG_RT_GROUP_SCHED */ 8374#endif /* CONFIG_RT_GROUP_SCHED */
8419 8375
8420#ifdef CONFIG_CGROUP_SCHED 8376#ifdef CONFIG_CGROUP_SCHED
@@ -8422,6 +8378,7 @@ static void free_sched_group(struct task_group *tg)
8422{ 8378{
8423 free_fair_sched_group(tg); 8379 free_fair_sched_group(tg);
8424 free_rt_sched_group(tg); 8380 free_rt_sched_group(tg);
8381 autogroup_free(tg);
8425 kfree(tg); 8382 kfree(tg);
8426} 8383}
8427 8384
@@ -8430,7 +8387,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8430{ 8387{
8431 struct task_group *tg; 8388 struct task_group *tg;
8432 unsigned long flags; 8389 unsigned long flags;
8433 int i;
8434 8390
8435 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 8391 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8436 if (!tg) 8392 if (!tg)
@@ -8443,10 +8399,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8443 goto err; 8399 goto err;
8444 8400
8445 spin_lock_irqsave(&task_group_lock, flags); 8401 spin_lock_irqsave(&task_group_lock, flags);
8446 for_each_possible_cpu(i) {
8447 register_fair_sched_group(tg, i);
8448 register_rt_sched_group(tg, i);
8449 }
8450 list_add_rcu(&tg->list, &task_groups); 8402 list_add_rcu(&tg->list, &task_groups);
8451 8403
8452 WARN_ON(!parent); /* root should already exist */ 8404 WARN_ON(!parent); /* root should already exist */
@@ -8476,11 +8428,11 @@ void sched_destroy_group(struct task_group *tg)
8476 unsigned long flags; 8428 unsigned long flags;
8477 int i; 8429 int i;
8478 8430
8479 spin_lock_irqsave(&task_group_lock, flags); 8431 /* end participation in shares distribution */
8480 for_each_possible_cpu(i) { 8432 for_each_possible_cpu(i)
8481 unregister_fair_sched_group(tg, i); 8433 unregister_fair_sched_group(tg, i);
8482 unregister_rt_sched_group(tg, i); 8434
8483 } 8435 spin_lock_irqsave(&task_group_lock, flags);
8484 list_del_rcu(&tg->list); 8436 list_del_rcu(&tg->list);
8485 list_del_rcu(&tg->siblings); 8437 list_del_rcu(&tg->siblings);
8486 spin_unlock_irqrestore(&task_group_lock, flags); 8438 spin_unlock_irqrestore(&task_group_lock, flags);
@@ -8527,33 +8479,6 @@ void sched_move_task(struct task_struct *tsk)
8527#endif /* CONFIG_CGROUP_SCHED */ 8479#endif /* CONFIG_CGROUP_SCHED */
8528 8480
8529#ifdef CONFIG_FAIR_GROUP_SCHED 8481#ifdef CONFIG_FAIR_GROUP_SCHED
8530static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8531{
8532 struct cfs_rq *cfs_rq = se->cfs_rq;
8533 int on_rq;
8534
8535 on_rq = se->on_rq;
8536 if (on_rq)
8537 dequeue_entity(cfs_rq, se, 0);
8538
8539 se->load.weight = shares;
8540 se->load.inv_weight = 0;
8541
8542 if (on_rq)
8543 enqueue_entity(cfs_rq, se, 0);
8544}
8545
8546static void set_se_shares(struct sched_entity *se, unsigned long shares)
8547{
8548 struct cfs_rq *cfs_rq = se->cfs_rq;
8549 struct rq *rq = cfs_rq->rq;
8550 unsigned long flags;
8551
8552 raw_spin_lock_irqsave(&rq->lock, flags);
8553 __set_se_shares(se, shares);
8554 raw_spin_unlock_irqrestore(&rq->lock, flags);
8555}
8556
8557static DEFINE_MUTEX(shares_mutex); 8482static DEFINE_MUTEX(shares_mutex);
8558 8483
8559int sched_group_set_shares(struct task_group *tg, unsigned long shares) 8484int sched_group_set_shares(struct task_group *tg, unsigned long shares)
@@ -8576,37 +8501,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8576 if (tg->shares == shares) 8501 if (tg->shares == shares)
8577 goto done; 8502 goto done;
8578 8503
8579 spin_lock_irqsave(&task_group_lock, flags);
8580 for_each_possible_cpu(i)
8581 unregister_fair_sched_group(tg, i);
8582 list_del_rcu(&tg->siblings);
8583 spin_unlock_irqrestore(&task_group_lock, flags);
8584
8585 /* wait for any ongoing reference to this group to finish */
8586 synchronize_sched();
8587
8588 /*
8589 * Now we are free to modify the group's share on each cpu
8590 * w/o tripping rebalance_share or load_balance_fair.
8591 */
8592 tg->shares = shares; 8504 tg->shares = shares;
8593 for_each_possible_cpu(i) { 8505 for_each_possible_cpu(i) {
8594 /* 8506 struct rq *rq = cpu_rq(i);
8595 * force a rebalance 8507 struct sched_entity *se;
8596 */ 8508
8597 cfs_rq_set_shares(tg->cfs_rq[i], 0); 8509 se = tg->se[i];
8598 set_se_shares(tg->se[i], shares); 8510 /* Propagate contribution to hierarchy */
8511 raw_spin_lock_irqsave(&rq->lock, flags);
8512 for_each_sched_entity(se)
8513 update_cfs_shares(group_cfs_rq(se), 0);
8514 raw_spin_unlock_irqrestore(&rq->lock, flags);
8599 } 8515 }
8600 8516
8601 /*
8602 * Enable load balance activity on this group, by inserting it back on
8603 * each cpu's rq->leaf_cfs_rq_list.
8604 */
8605 spin_lock_irqsave(&task_group_lock, flags);
8606 for_each_possible_cpu(i)
8607 register_fair_sched_group(tg, i);
8608 list_add_rcu(&tg->siblings, &tg->parent->children);
8609 spin_unlock_irqrestore(&task_group_lock, flags);
8610done: 8517done:
8611 mutex_unlock(&shares_mutex); 8518 mutex_unlock(&shares_mutex);
8612 return 0; 8519 return 0;
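
The usual way into sched_group_set_shares() is a write to a group's cpu.shares cgroup file; the cftype handler (cpu_shares_write_u64, whose signature appears further below) boils down to something like the following sketch:

static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
				u64 shareval)
{
	/* Forward the cpu.shares value to the group's per-cpu weights. */
	return sched_group_set_shares(cgroup_tg(cgrp), (unsigned long)shareval);
}

So writing, say, 2048 to a group's cpu.shares re-weights that group's per-cpu sched entities through the update_cfs_shares() loop above.
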
@@ -8905,7 +8812,7 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8905 8812
8906 if (!cgrp->parent) { 8813 if (!cgrp->parent) {
8907 /* This is early initialization for the top cgroup */ 8814 /* This is early initialization for the top cgroup */
8908 return &init_task_group.css; 8815 return &root_task_group.css;
8909 } 8816 }
8910 8817
8911 parent = cgroup_tg(cgrp->parent); 8818 parent = cgroup_tg(cgrp->parent);
@@ -8976,6 +8883,20 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
8976 } 8883 }
8977} 8884}
8978 8885
8886static void
8887cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task)
8888{
8889 /*
8890 * cgroup_exit() is called in the copy_process() failure path.
 8891 * Ignore this case, since the task hasn't run yet; this avoids
 8892 * trying to poke half-freed task state from generic code.
8893 */
8894 if (!(task->flags & PF_EXITING))
8895 return;
8896
8897 sched_move_task(task);
8898}
8899
8979#ifdef CONFIG_FAIR_GROUP_SCHED 8900#ifdef CONFIG_FAIR_GROUP_SCHED
8980static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 8901static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
8981 u64 shareval) 8902 u64 shareval)
@@ -9048,6 +8969,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
9048 .destroy = cpu_cgroup_destroy, 8969 .destroy = cpu_cgroup_destroy,
9049 .can_attach = cpu_cgroup_can_attach, 8970 .can_attach = cpu_cgroup_can_attach,
9050 .attach = cpu_cgroup_attach, 8971 .attach = cpu_cgroup_attach,
8972 .exit = cpu_cgroup_exit,
9051 .populate = cpu_cgroup_populate, 8973 .populate = cpu_cgroup_populate,
9052 .subsys_id = cpu_cgroup_subsys_id, 8974 .subsys_id = cpu_cgroup_subsys_id,
9053 .early_init = 1, 8975 .early_init = 1,
@@ -9332,72 +9254,3 @@ struct cgroup_subsys cpuacct_subsys = {
9332}; 9254};
9333#endif /* CONFIG_CGROUP_CPUACCT */ 9255#endif /* CONFIG_CGROUP_CPUACCT */
9334 9256
9335#ifndef CONFIG_SMP
9336
9337void synchronize_sched_expedited(void)
9338{
9339 barrier();
9340}
9341EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9342
9343#else /* #ifndef CONFIG_SMP */
9344
9345static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
9346
9347static int synchronize_sched_expedited_cpu_stop(void *data)
9348{
9349 /*
9350 * There must be a full memory barrier on each affected CPU
9351 * between the time that try_stop_cpus() is called and the
9352 * time that it returns.
9353 *
9354 * In the current initial implementation of cpu_stop, the
9355 * above condition is already met when the control reaches
9356 * this point and the following smp_mb() is not strictly
9357 * necessary. Do smp_mb() anyway for documentation and
9358 * robustness against future implementation changes.
9359 */
9360 smp_mb(); /* See above comment block. */
9361 return 0;
9362}
9363
9364/*
9365 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
9366 * approach to force grace period to end quickly. This consumes
9367 * significant time on all CPUs, and is thus not recommended for
9368 * any sort of common-case code.
9369 *
9370 * Note that it is illegal to call this function while holding any
9371 * lock that is acquired by a CPU-hotplug notifier. Failing to
9372 * observe this restriction will result in deadlock.
9373 */
9374void synchronize_sched_expedited(void)
9375{
9376 int snap, trycount = 0;
9377
9378 smp_mb(); /* ensure prior mod happens before capturing snap. */
9379 snap = atomic_read(&synchronize_sched_expedited_count) + 1;
9380 get_online_cpus();
9381 while (try_stop_cpus(cpu_online_mask,
9382 synchronize_sched_expedited_cpu_stop,
9383 NULL) == -EAGAIN) {
9384 put_online_cpus();
9385 if (trycount++ < 10)
9386 udelay(trycount * num_online_cpus());
9387 else {
9388 synchronize_sched();
9389 return;
9390 }
9391 if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
9392 smp_mb(); /* ensure test happens before caller kfree */
9393 return;
9394 }
9395 get_online_cpus();
9396 }
9397 atomic_inc(&synchronize_sched_expedited_count);
9398 smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
9399 put_online_cpus();
9400}
9401EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9402
9403#endif /* #else #ifndef CONFIG_SMP */
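
For context on the block removed above: synchronize_sched_expedited() is an update-side "big hammer": a writer publishes a new version, forces the grace period to end quickly, and only then frees the old one. A hedged usage sketch with made-up names (struct foo, gp and gp_lock are illustrative, not from this file):

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct foo {
	int val;
};

static struct foo __rcu *gp;		/* hypothetical RCU-protected pointer */
static DEFINE_SPINLOCK(gp_lock);	/* hypothetical update-side lock */

/* Caller holds gp_lock and must not hold any lock taken by a CPU-hotplug notifier. */
static void update_foo(struct foo *newp)
{
	struct foo *old;

	old = rcu_dereference_protected(gp, lockdep_is_held(&gp_lock));
	rcu_assign_pointer(gp, newp);

	/* Expensive on all CPUs, but completes much sooner than synchronize_sched(). */
	synchronize_sched_expedited();
	kfree(old);
}
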