Diffstat (limited to 'kernel/sched.c')
-rw-r--r-- | kernel/sched.c | 964 |
1 file changed, 401 insertions, 563 deletions
diff --git a/kernel/sched.c b/kernel/sched.c | |||
index aa14a56f9d03..04949089e760 100644 | |||
--- a/kernel/sched.c | |||
+++ b/kernel/sched.c | |||
@@ -75,9 +75,11 @@ | |||
75 | 75 | ||
76 | #include <asm/tlb.h> | 76 | #include <asm/tlb.h> |
77 | #include <asm/irq_regs.h> | 77 | #include <asm/irq_regs.h> |
78 | #include <asm/mutex.h> | ||
78 | 79 | ||
79 | #include "sched_cpupri.h" | 80 | #include "sched_cpupri.h" |
80 | #include "workqueue_sched.h" | 81 | #include "workqueue_sched.h" |
82 | #include "sched_autogroup.h" | ||
81 | 83 | ||
82 | #define CREATE_TRACE_POINTS | 84 | #define CREATE_TRACE_POINTS |
83 | #include <trace/events/sched.h> | 85 | #include <trace/events/sched.h> |
@@ -253,6 +255,8 @@ struct task_group { | |||
253 | /* runqueue "owned" by this group on each cpu */ | 255 | /* runqueue "owned" by this group on each cpu */ |
254 | struct cfs_rq **cfs_rq; | 256 | struct cfs_rq **cfs_rq; |
255 | unsigned long shares; | 257 | unsigned long shares; |
258 | |||
259 | atomic_t load_weight; | ||
256 | #endif | 260 | #endif |
257 | 261 | ||
258 | #ifdef CONFIG_RT_GROUP_SCHED | 262 | #ifdef CONFIG_RT_GROUP_SCHED |
@@ -268,24 +272,19 @@ struct task_group { | |||
268 | struct task_group *parent; | 272 | struct task_group *parent; |
269 | struct list_head siblings; | 273 | struct list_head siblings; |
270 | struct list_head children; | 274 | struct list_head children; |
275 | |||
276 | #ifdef CONFIG_SCHED_AUTOGROUP | ||
277 | struct autogroup *autogroup; | ||
278 | #endif | ||
271 | }; | 279 | }; |
272 | 280 | ||
273 | #define root_task_group init_task_group | 281 | #define root_task_group init_task_group |
274 | 282 | ||
275 | /* task_group_lock serializes add/remove of task groups and also changes to | 283 | /* task_group_lock serializes the addition/removal of task groups */ |
276 | * a task group's cpu shares. | ||
277 | */ | ||
278 | static DEFINE_SPINLOCK(task_group_lock); | 284 | static DEFINE_SPINLOCK(task_group_lock); |
279 | 285 | ||
280 | #ifdef CONFIG_FAIR_GROUP_SCHED | 286 | #ifdef CONFIG_FAIR_GROUP_SCHED |
281 | 287 | ||
282 | #ifdef CONFIG_SMP | ||
283 | static int root_task_group_empty(void) | ||
284 | { | ||
285 | return list_empty(&root_task_group.children); | ||
286 | } | ||
287 | #endif | ||
288 | |||
289 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD | 288 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD |
290 | 289 | ||
291 | /* | 290 | /* |
@@ -342,6 +341,7 @@ struct cfs_rq { | |||
342 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This | 341 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This |
343 | * list is used during load balance. | 342 | * list is used during load balance. |
344 | */ | 343 | */ |
344 | int on_list; | ||
345 | struct list_head leaf_cfs_rq_list; | 345 | struct list_head leaf_cfs_rq_list; |
346 | struct task_group *tg; /* group that "owns" this runqueue */ | 346 | struct task_group *tg; /* group that "owns" this runqueue */ |
347 | 347 | ||
@@ -360,14 +360,17 @@ struct cfs_rq { | |||
360 | unsigned long h_load; | 360 | unsigned long h_load; |
361 | 361 | ||
362 | /* | 362 | /* |
363 | * this cpu's part of tg->shares | 363 | * Maintaining per-cpu shares distribution for group scheduling |
364 | * | ||
365 | * load_stamp is the last time we updated the load average | ||
366 | * load_last is the last time we updated the load average and saw load | ||
367 | * load_unacc_exec_time is currently unaccounted execution time | ||
364 | */ | 368 | */ |
365 | unsigned long shares; | 369 | u64 load_avg; |
370 | u64 load_period; | ||
371 | u64 load_stamp, load_last, load_unacc_exec_time; | ||
366 | 372 | ||
367 | /* | 373 | unsigned long load_contribution; |
368 | * load.weight at the time we set shares | ||
369 | */ | ||
370 | unsigned long rq_weight; | ||
371 | #endif | 374 | #endif |
372 | #endif | 375 | #endif |
373 | }; | 376 | }; |
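The removed per-cpu shares/rq_weight pair gives way to load tracking: each cfs_rq accumulates its load_contribution into the group-wide atomic tg->load_weight, and the companion sched_fair.c changes (not part of this hunk) split tg->shares across cpus in proportion to that contribution, with load_avg/load_period/load_stamp providing decay. A small userspace sketch of the proportional split, under that stated assumption about the formula:

#include <stdio.h>

/* Illustrative only: models "per-cpu share ~ tg->shares scaled by this cpu's
 * contribution to the group-wide load" (assumed formula, see lead-in). */
static unsigned long cpu_share(unsigned long tg_shares,
			       unsigned long tg_load_weight,	/* sum over all cpus */
			       unsigned long load_contribution)	/* this cpu's part */
{
	if (!tg_load_weight)
		return tg_shares;

	return tg_shares * load_contribution / tg_load_weight;
}

int main(void)
{
	/* A group with the default 1024 shares whose load splits 3:1 across
	 * two cpus ends up with roughly a 768/256 per-cpu share split. */
	printf("%lu %lu\n", cpu_share(1024, 4096, 3072), cpu_share(1024, 4096, 1024));
	return 0;
}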
@@ -560,18 +563,8 @@ struct rq { | |||
560 | 563 | ||
561 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 564 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
562 | 565 | ||
563 | static inline | ||
564 | void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | ||
565 | { | ||
566 | rq->curr->sched_class->check_preempt_curr(rq, p, flags); | ||
567 | 566 | ||
568 | /* | 567 | static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); |
569 | * A queue event has occurred, and we're going to schedule. In | ||
570 | * this case, we can save a useless back to back clock update. | ||
571 | */ | ||
572 | if (test_tsk_need_resched(p)) | ||
573 | rq->skip_clock_update = 1; | ||
574 | } | ||
575 | 568 | ||
576 | static inline int cpu_of(struct rq *rq) | 569 | static inline int cpu_of(struct rq *rq) |
577 | { | 570 | { |
@@ -615,11 +608,14 @@ static inline int cpu_of(struct rq *rq) | |||
615 | */ | 608 | */ |
616 | static inline struct task_group *task_group(struct task_struct *p) | 609 | static inline struct task_group *task_group(struct task_struct *p) |
617 | { | 610 | { |
611 | struct task_group *tg; | ||
618 | struct cgroup_subsys_state *css; | 612 | struct cgroup_subsys_state *css; |
619 | 613 | ||
620 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, | 614 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, |
621 | lockdep_is_held(&task_rq(p)->lock)); | 615 | lockdep_is_held(&task_rq(p)->lock)); |
622 | return container_of(css, struct task_group, css); | 616 | tg = container_of(css, struct task_group, css); |
617 | |||
618 | return autogroup_task_group(p, tg); | ||
623 | } | 619 | } |
624 | 620 | ||
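task_group() now gives autogroup a chance to override the cgroup-derived group; the helper itself lives in the newly included sched_autogroup.h/.c rather than in this hunk. A rough sketch of the intended behaviour follows, where the sysctl name and the signal-struct field are assumptions for illustration, not the series' actual code:

/* Sketch only; the real implementation ships in kernel/sched_autogroup.c. */
static inline struct task_group *
autogroup_task_group(struct task_struct *p, struct task_group *tg)
{
	/* Tasks explicitly placed into a cpu cgroup keep that group; only
	 * tasks that would land in the root group are re-parented into
	 * their session's autogroup (assumed field: p->signal->autogroup). */
	if (sysctl_sched_autogroup_enabled && tg == &root_task_group)
		return p->signal->autogroup->tg;

	return tg;
}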
625 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ | 621 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ |
@@ -646,22 +642,18 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
646 | 642 | ||
647 | #endif /* CONFIG_CGROUP_SCHED */ | 643 | #endif /* CONFIG_CGROUP_SCHED */ |
648 | 644 | ||
649 | static u64 irq_time_cpu(int cpu); | 645 | static void update_rq_clock_task(struct rq *rq, s64 delta); |
650 | static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time); | ||
651 | 646 | ||
652 | inline void update_rq_clock(struct rq *rq) | 647 | static void update_rq_clock(struct rq *rq) |
653 | { | 648 | { |
654 | if (!rq->skip_clock_update) { | 649 | s64 delta; |
655 | int cpu = cpu_of(rq); | ||
656 | u64 irq_time; | ||
657 | 650 | ||
658 | rq->clock = sched_clock_cpu(cpu); | 651 | if (rq->skip_clock_update) |
659 | irq_time = irq_time_cpu(cpu); | 652 | return; |
660 | if (rq->clock - irq_time > rq->clock_task) | ||
661 | rq->clock_task = rq->clock - irq_time; | ||
662 | 653 | ||
663 | sched_irq_time_avg_update(rq, irq_time); | 654 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; |
664 | } | 655 | rq->clock += delta; |
656 | update_rq_clock_task(rq, delta); | ||
665 | } | 657 | } |
666 | 658 | ||
667 | /* | 659 | /* |
@@ -807,20 +799,6 @@ late_initcall(sched_init_debug); | |||
807 | const_debug unsigned int sysctl_sched_nr_migrate = 32; | 799 | const_debug unsigned int sysctl_sched_nr_migrate = 32; |
808 | 800 | ||
809 | /* | 801 | /* |
810 | * ratelimit for updating the group shares. | ||
811 | * default: 0.25ms | ||
812 | */ | ||
813 | unsigned int sysctl_sched_shares_ratelimit = 250000; | ||
814 | unsigned int normalized_sysctl_sched_shares_ratelimit = 250000; | ||
815 | |||
816 | /* | ||
817 | * Inject some fuzzyness into changing the per-cpu group shares | ||
818 | * this avoids remote rq-locks at the expense of fairness. | ||
819 | * default: 4 | ||
820 | */ | ||
821 | unsigned int sysctl_sched_shares_thresh = 4; | ||
822 | |||
823 | /* | ||
824 | * period over which we average the RT time consumption, measured | 802 | * period over which we average the RT time consumption, measured |
825 | * in ms. | 803 | * in ms. |
826 | * | 804 | * |
@@ -1369,6 +1347,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec) | |||
1369 | lw->inv_weight = 0; | 1347 | lw->inv_weight = 0; |
1370 | } | 1348 | } |
1371 | 1349 | ||
1350 | static inline void update_load_set(struct load_weight *lw, unsigned long w) | ||
1351 | { | ||
1352 | lw->weight = w; | ||
1353 | lw->inv_weight = 0; | ||
1354 | } | ||
1355 | |||
1372 | /* | 1356 | /* |
1373 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | 1357 | * To aid in avoiding the subversion of "niceness" due to uneven distribution |
1374 | * of tasks with abnormal "nice" values across CPUs the contribution that | 1358 | * of tasks with abnormal "nice" values across CPUs the contribution that |
@@ -1557,101 +1541,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
1557 | 1541 | ||
1558 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1542 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1559 | 1543 | ||
1560 | static __read_mostly unsigned long __percpu *update_shares_data; | ||
1561 | |||
1562 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | ||
1563 | |||
1564 | /* | ||
1565 | * Calculate and set the cpu's group shares. | ||
1566 | */ | ||
1567 | static void update_group_shares_cpu(struct task_group *tg, int cpu, | ||
1568 | unsigned long sd_shares, | ||
1569 | unsigned long sd_rq_weight, | ||
1570 | unsigned long *usd_rq_weight) | ||
1571 | { | ||
1572 | unsigned long shares, rq_weight; | ||
1573 | int boost = 0; | ||
1574 | |||
1575 | rq_weight = usd_rq_weight[cpu]; | ||
1576 | if (!rq_weight) { | ||
1577 | boost = 1; | ||
1578 | rq_weight = NICE_0_LOAD; | ||
1579 | } | ||
1580 | |||
1581 | /* | ||
1582 | * \Sum_j shares_j * rq_weight_i | ||
1583 | * shares_i = ----------------------------- | ||
1584 | * \Sum_j rq_weight_j | ||
1585 | */ | ||
1586 | shares = (sd_shares * rq_weight) / sd_rq_weight; | ||
1587 | shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); | ||
1588 | |||
1589 | if (abs(shares - tg->se[cpu]->load.weight) > | ||
1590 | sysctl_sched_shares_thresh) { | ||
1591 | struct rq *rq = cpu_rq(cpu); | ||
1592 | unsigned long flags; | ||
1593 | |||
1594 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
1595 | tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; | ||
1596 | tg->cfs_rq[cpu]->shares = boost ? 0 : shares; | ||
1597 | __set_se_shares(tg->se[cpu], shares); | ||
1598 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
1599 | } | ||
1600 | } | ||
1601 | |||
1602 | /* | ||
1603 | * Re-compute the task group their per cpu shares over the given domain. | ||
1604 | * This needs to be done in a bottom-up fashion because the rq weight of a | ||
1605 | * parent group depends on the shares of its child groups. | ||
1606 | */ | ||
1607 | static int tg_shares_up(struct task_group *tg, void *data) | ||
1608 | { | ||
1609 | unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0; | ||
1610 | unsigned long *usd_rq_weight; | ||
1611 | struct sched_domain *sd = data; | ||
1612 | unsigned long flags; | ||
1613 | int i; | ||
1614 | |||
1615 | if (!tg->se[0]) | ||
1616 | return 0; | ||
1617 | |||
1618 | local_irq_save(flags); | ||
1619 | usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id()); | ||
1620 | |||
1621 | for_each_cpu(i, sched_domain_span(sd)) { | ||
1622 | weight = tg->cfs_rq[i]->load.weight; | ||
1623 | usd_rq_weight[i] = weight; | ||
1624 | |||
1625 | rq_weight += weight; | ||
1626 | /* | ||
1627 | * If there are currently no tasks on the cpu pretend there | ||
1628 | * is one of average load so that when a new task gets to | ||
1629 | * run here it will not get delayed by group starvation. | ||
1630 | */ | ||
1631 | if (!weight) | ||
1632 | weight = NICE_0_LOAD; | ||
1633 | |||
1634 | sum_weight += weight; | ||
1635 | shares += tg->cfs_rq[i]->shares; | ||
1636 | } | ||
1637 | |||
1638 | if (!rq_weight) | ||
1639 | rq_weight = sum_weight; | ||
1640 | |||
1641 | if ((!shares && rq_weight) || shares > tg->shares) | ||
1642 | shares = tg->shares; | ||
1643 | |||
1644 | if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) | ||
1645 | shares = tg->shares; | ||
1646 | |||
1647 | for_each_cpu(i, sched_domain_span(sd)) | ||
1648 | update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight); | ||
1649 | |||
1650 | local_irq_restore(flags); | ||
1651 | |||
1652 | return 0; | ||
1653 | } | ||
1654 | |||
1655 | /* | 1544 | /* |
1656 | * Compute the cpu's hierarchical load factor for each task group. | 1545 | * Compute the cpu's hierarchical load factor for each task group. |
1657 | * This needs to be done in a top-down fashion because the load of a child | 1546 | * This needs to be done in a top-down fashion because the load of a child |
@@ -1666,7 +1555,7 @@ static int tg_load_down(struct task_group *tg, void *data) | |||
1666 | load = cpu_rq(cpu)->load.weight; | 1555 | load = cpu_rq(cpu)->load.weight; |
1667 | } else { | 1556 | } else { |
1668 | load = tg->parent->cfs_rq[cpu]->h_load; | 1557 | load = tg->parent->cfs_rq[cpu]->h_load; |
1669 | load *= tg->cfs_rq[cpu]->shares; | 1558 | load *= tg->se[cpu]->load.weight; |
1670 | load /= tg->parent->cfs_rq[cpu]->load.weight + 1; | 1559 | load /= tg->parent->cfs_rq[cpu]->load.weight + 1; |
1671 | } | 1560 | } |
1672 | 1561 | ||
@@ -1675,34 +1564,11 @@ static int tg_load_down(struct task_group *tg, void *data) | |||
1675 | return 0; | 1564 | return 0; |
1676 | } | 1565 | } |
1677 | 1566 | ||
1678 | static void update_shares(struct sched_domain *sd) | ||
1679 | { | ||
1680 | s64 elapsed; | ||
1681 | u64 now; | ||
1682 | |||
1683 | if (root_task_group_empty()) | ||
1684 | return; | ||
1685 | |||
1686 | now = local_clock(); | ||
1687 | elapsed = now - sd->last_update; | ||
1688 | |||
1689 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { | ||
1690 | sd->last_update = now; | ||
1691 | walk_tg_tree(tg_nop, tg_shares_up, sd); | ||
1692 | } | ||
1693 | } | ||
1694 | |||
1695 | static void update_h_load(long cpu) | 1567 | static void update_h_load(long cpu) |
1696 | { | 1568 | { |
1697 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); | 1569 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); |
1698 | } | 1570 | } |
1699 | 1571 | ||
1700 | #else | ||
1701 | |||
1702 | static inline void update_shares(struct sched_domain *sd) | ||
1703 | { | ||
1704 | } | ||
1705 | |||
1706 | #endif | 1572 | #endif |
1707 | 1573 | ||
1708 | #ifdef CONFIG_PREEMPT | 1574 | #ifdef CONFIG_PREEMPT |
@@ -1824,15 +1690,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | |||
1824 | 1690 | ||
1825 | #endif | 1691 | #endif |
1826 | 1692 | ||
1827 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1828 | static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | ||
1829 | { | ||
1830 | #ifdef CONFIG_SMP | ||
1831 | cfs_rq->shares = shares; | ||
1832 | #endif | ||
1833 | } | ||
1834 | #endif | ||
1835 | |||
1836 | static void calc_load_account_idle(struct rq *this_rq); | 1693 | static void calc_load_account_idle(struct rq *this_rq); |
1837 | static void update_sysctl(void); | 1694 | static void update_sysctl(void); |
1838 | static int get_update_sysctl_factor(void); | 1695 | static int get_update_sysctl_factor(void); |
@@ -1934,10 +1791,9 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) | |||
1934 | * They are read and saved off onto struct rq in update_rq_clock(). | 1791 | * They are read and saved off onto struct rq in update_rq_clock(). |
1935 | * This may result in other CPU reading this CPU's irq time and can | 1792 | * This may result in other CPU reading this CPU's irq time and can |
1936 | * race with irq/account_system_vtime on this CPU. We would either get old | 1793 | * race with irq/account_system_vtime on this CPU. We would either get old |
1937 | * or new value (or semi updated value on 32 bit) with a side effect of | 1794 | * or new value with a side effect of accounting a slice of irq time to wrong |
1938 | * accounting a slice of irq time to wrong task when irq is in progress | 1795 | * task when irq is in progress while we read rq->clock. That is a worthy |
1939 | * while we read rq->clock. That is a worthy compromise in place of having | 1796 | * compromise in place of having locks on each irq in account_system_time. |
1940 | * locks on each irq in account_system_time. | ||
1941 | */ | 1797 | */ |
1942 | static DEFINE_PER_CPU(u64, cpu_hardirq_time); | 1798 | static DEFINE_PER_CPU(u64, cpu_hardirq_time); |
1943 | static DEFINE_PER_CPU(u64, cpu_softirq_time); | 1799 | static DEFINE_PER_CPU(u64, cpu_softirq_time); |
@@ -1955,19 +1811,58 @@ void disable_sched_clock_irqtime(void) | |||
1955 | sched_clock_irqtime = 0; | 1811 | sched_clock_irqtime = 0; |
1956 | } | 1812 | } |
1957 | 1813 | ||
1958 | static u64 irq_time_cpu(int cpu) | 1814 | #ifndef CONFIG_64BIT |
1815 | static DEFINE_PER_CPU(seqcount_t, irq_time_seq); | ||
1816 | |||
1817 | static inline void irq_time_write_begin(void) | ||
1959 | { | 1818 | { |
1960 | if (!sched_clock_irqtime) | 1819 | __this_cpu_inc(irq_time_seq.sequence); |
1961 | return 0; | 1820 | smp_wmb(); |
1821 | } | ||
1822 | |||
1823 | static inline void irq_time_write_end(void) | ||
1824 | { | ||
1825 | smp_wmb(); | ||
1826 | __this_cpu_inc(irq_time_seq.sequence); | ||
1827 | } | ||
1828 | |||
1829 | static inline u64 irq_time_read(int cpu) | ||
1830 | { | ||
1831 | u64 irq_time; | ||
1832 | unsigned seq; | ||
1833 | |||
1834 | do { | ||
1835 | seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); | ||
1836 | irq_time = per_cpu(cpu_softirq_time, cpu) + | ||
1837 | per_cpu(cpu_hardirq_time, cpu); | ||
1838 | } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); | ||
1962 | 1839 | ||
1840 | return irq_time; | ||
1841 | } | ||
1842 | #else /* CONFIG_64BIT */ | ||
1843 | static inline void irq_time_write_begin(void) | ||
1844 | { | ||
1845 | } | ||
1846 | |||
1847 | static inline void irq_time_write_end(void) | ||
1848 | { | ||
1849 | } | ||
1850 | |||
1851 | static inline u64 irq_time_read(int cpu) | ||
1852 | { | ||
1963 | return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); | 1853 | return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); |
1964 | } | 1854 | } |
1855 | #endif /* CONFIG_64BIT */ | ||
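On 32-bit, the pair of per-cpu irq time counters cannot be read atomically, so the block above wraps them in a hand-rolled seqcount: the writer bumps the sequence and issues a write barrier around every update, and readers retry until they observe an even, unchanged sequence. A minimal userspace sketch of the same protocol, using C11 atomics and fences in place of the kernel's per-cpu seqcount helpers (the fence-to-smp_wmb()/smp_rmb() mapping is illustrative, and the plain data accesses would need annotation in strictly conforming code):

#include <stdatomic.h>
#include <stdint.h>

static _Atomic unsigned int seq;
static uint64_t hardirq_time, softirq_time;	/* updated by a single writer */

static void irq_time_write(uint64_t hard_delta, uint64_t soft_delta)
{
	atomic_fetch_add_explicit(&seq, 1, memory_order_relaxed);	/* odd: in progress */
	atomic_thread_fence(memory_order_release);			/* ~ smp_wmb() */
	hardirq_time += hard_delta;
	softirq_time += soft_delta;
	atomic_thread_fence(memory_order_release);			/* ~ smp_wmb() */
	atomic_fetch_add_explicit(&seq, 1, memory_order_relaxed);	/* even: done */
}

static uint64_t irq_time_read(void)
{
	unsigned int s;
	uint64_t sum;

	do {
		s = atomic_load_explicit(&seq, memory_order_acquire);
		sum = hardirq_time + softirq_time;
		atomic_thread_fence(memory_order_acquire);		/* ~ smp_rmb() */
	} while ((s & 1) || s != atomic_load_explicit(&seq, memory_order_relaxed));

	return sum;
}

A torn 64-bit load can only happen while an update is in flight, which is exactly when the sequence check forces a retry.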
1965 | 1856 | ||
1857 | /* | ||
1858 | * Called before incrementing preempt_count on {soft,}irq_enter | ||
1859 | * and before decrementing preempt_count on {soft,}irq_exit. | ||
1860 | */ | ||
1966 | void account_system_vtime(struct task_struct *curr) | 1861 | void account_system_vtime(struct task_struct *curr) |
1967 | { | 1862 | { |
1968 | unsigned long flags; | 1863 | unsigned long flags; |
1864 | s64 delta; | ||
1969 | int cpu; | 1865 | int cpu; |
1970 | u64 now, delta; | ||
1971 | 1866 | ||
1972 | if (!sched_clock_irqtime) | 1867 | if (!sched_clock_irqtime) |
1973 | return; | 1868 | return; |
@@ -1975,9 +1870,10 @@ void account_system_vtime(struct task_struct *curr) | |||
1975 | local_irq_save(flags); | 1870 | local_irq_save(flags); |
1976 | 1871 | ||
1977 | cpu = smp_processor_id(); | 1872 | cpu = smp_processor_id(); |
1978 | now = sched_clock_cpu(cpu); | 1873 | delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); |
1979 | delta = now - per_cpu(irq_start_time, cpu); | 1874 | __this_cpu_add(irq_start_time, delta); |
1980 | per_cpu(irq_start_time, cpu) = now; | 1875 | |
1876 | irq_time_write_begin(); | ||
1981 | /* | 1877 | /* |
1982 | * We do not account for softirq time from ksoftirqd here. | 1878 | * We do not account for softirq time from ksoftirqd here. |
1983 | * We want to continue accounting softirq time to ksoftirqd thread | 1879 | * We want to continue accounting softirq time to ksoftirqd thread |
@@ -1985,37 +1881,60 @@ void account_system_vtime(struct task_struct *curr) | |||
1985 | * that do not consume any time, but still wants to run. | 1881 | * that do not consume any time, but still wants to run. |
1986 | */ | 1882 | */ |
1987 | if (hardirq_count()) | 1883 | if (hardirq_count()) |
1988 | per_cpu(cpu_hardirq_time, cpu) += delta; | 1884 | __this_cpu_add(cpu_hardirq_time, delta); |
1989 | else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) | 1885 | else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) |
1990 | per_cpu(cpu_softirq_time, cpu) += delta; | 1886 | __this_cpu_add(cpu_softirq_time, delta); |
1991 | 1887 | ||
1888 | irq_time_write_end(); | ||
1992 | local_irq_restore(flags); | 1889 | local_irq_restore(flags); |
1993 | } | 1890 | } |
1994 | EXPORT_SYMBOL_GPL(account_system_vtime); | 1891 | EXPORT_SYMBOL_GPL(account_system_vtime); |
1995 | 1892 | ||
1996 | static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) | 1893 | static void update_rq_clock_task(struct rq *rq, s64 delta) |
1997 | { | 1894 | { |
1998 | if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) { | 1895 | s64 irq_delta; |
1999 | u64 delta_irq = curr_irq_time - rq->prev_irq_time; | 1896 | |
2000 | rq->prev_irq_time = curr_irq_time; | 1897 | irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; |
2001 | sched_rt_avg_update(rq, delta_irq); | 1898 | |
2002 | } | 1899 | /* |
1900 | * Since irq_time is only updated on {soft,}irq_exit, we might run into | ||
1901 | * this case when a previous update_rq_clock() happened inside a | ||
1902 | * {soft,}irq region. | ||
1903 | * | ||
1904 | * When this happens, we stop ->clock_task and only update the | ||
1905 | * prev_irq_time stamp to account for the part that fit, so that a next | ||
1906 | * update will consume the rest. This ensures ->clock_task is | ||
1907 | * monotonic. | ||
1908 | * | ||
1909 | * It does however cause some slight mis-attribution of {soft,}irq | ||
1910 | * time, a more accurate solution would be to update the irq_time using | ||
1911 | * the current rq->clock timestamp, except that would require using | ||
1912 | * atomic ops. | ||
1913 | */ | ||
1914 | if (irq_delta > delta) | ||
1915 | irq_delta = delta; | ||
1916 | |||
1917 | rq->prev_irq_time += irq_delta; | ||
1918 | delta -= irq_delta; | ||
1919 | rq->clock_task += delta; | ||
1920 | |||
1921 | if (irq_delta && sched_feat(NONIRQ_POWER)) | ||
1922 | sched_rt_avg_update(rq, irq_delta); | ||
2003 | } | 1923 | } |
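The clamp above is easiest to see with numbers. A toy model (names merely mirror the patch's fields) in which 14us of irq time lands in a 10us window, stalling clock_task until the carried-over 4us is consumed:

#include <stdint.h>
#include <stdio.h>

struct rq_model { uint64_t clock, clock_task, prev_irq_time; };

static void model_update(struct rq_model *rq, int64_t delta, uint64_t irq_time_now)
{
	int64_t irq_delta = irq_time_now - rq->prev_irq_time;

	if (irq_delta > delta)		/* irq time ran ahead of this window... */
		irq_delta = delta;	/* ...so clock_task stalls, rest carries over */

	rq->prev_irq_time += irq_delta;
	rq->clock += delta;
	rq->clock_task += delta - irq_delta;
}

int main(void)
{
	struct rq_model rq = { 0, 0, 0 };

	model_update(&rq, 10, 14);	/* 10us elapsed, 14us of new irq time */
	model_update(&rq, 6, 14);	/* 6us elapsed, no new irq time */
	printf("clock=%llu clock_task=%llu\n",
	       (unsigned long long)rq.clock, (unsigned long long)rq.clock_task);
	/* prints clock=16 clock_task=2: clock_task stayed monotonic and only
	 * advanced once the carried-over 4us of irq time was consumed. */
	return 0;
}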
2004 | 1924 | ||
2005 | #else | 1925 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ |
2006 | 1926 | ||
2007 | static u64 irq_time_cpu(int cpu) | 1927 | static void update_rq_clock_task(struct rq *rq, s64 delta) |
2008 | { | 1928 | { |
2009 | return 0; | 1929 | rq->clock_task += delta; |
2010 | } | 1930 | } |
2011 | 1931 | ||
2012 | static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { } | 1932 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ |
2013 | |||
2014 | #endif | ||
2015 | 1933 | ||
2016 | #include "sched_idletask.c" | 1934 | #include "sched_idletask.c" |
2017 | #include "sched_fair.c" | 1935 | #include "sched_fair.c" |
2018 | #include "sched_rt.c" | 1936 | #include "sched_rt.c" |
1937 | #include "sched_autogroup.c" | ||
2019 | #include "sched_stoptask.c" | 1938 | #include "sched_stoptask.c" |
2020 | #ifdef CONFIG_SCHED_DEBUG | 1939 | #ifdef CONFIG_SCHED_DEBUG |
2021 | # include "sched_debug.c" | 1940 | # include "sched_debug.c" |
@@ -2118,6 +2037,31 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, | |||
2118 | p->sched_class->prio_changed(rq, p, oldprio, running); | 2037 | p->sched_class->prio_changed(rq, p, oldprio, running); |
2119 | } | 2038 | } |
2120 | 2039 | ||
2040 | static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | ||
2041 | { | ||
2042 | const struct sched_class *class; | ||
2043 | |||
2044 | if (p->sched_class == rq->curr->sched_class) { | ||
2045 | rq->curr->sched_class->check_preempt_curr(rq, p, flags); | ||
2046 | } else { | ||
2047 | for_each_class(class) { | ||
2048 | if (class == rq->curr->sched_class) | ||
2049 | break; | ||
2050 | if (class == p->sched_class) { | ||
2051 | resched_task(rq->curr); | ||
2052 | break; | ||
2053 | } | ||
2054 | } | ||
2055 | } | ||
2056 | |||
2057 | /* | ||
2058 | * A queue event has occurred, and we're going to schedule. In | ||
2059 | * this case, we can save a useless back to back clock update. | ||
2060 | */ | ||
2061 | if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) | ||
2062 | rq->skip_clock_update = 1; | ||
2063 | } | ||
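When the two tasks belong to different classes, the loop above decides purely by class order: walk the classes from highest to lowest and reschedule only if the waking task's class comes up before the current task's. A standalone sketch of that rule; the enum order is an assumption reflecting the classes included later in this file (stop > rt > fair > idle):

#include <stdbool.h>
#include <stdio.h>

enum sched_class_id { CLASS_STOP, CLASS_RT, CLASS_FAIR, CLASS_IDLE, NR_CLASSES };

static bool cross_class_preempts(enum sched_class_id curr, enum sched_class_id waking)
{
	for (int c = CLASS_STOP; c < NR_CLASSES; c++) {
		if (c == curr)		/* current task's class ranks at least as high */
			return false;
		if (c == waking)	/* waking task's class ranks higher: resched */
			return true;
	}
	return false;
}

int main(void)
{
	printf("%d %d\n", cross_class_preempts(CLASS_FAIR, CLASS_RT),	/* 1 */
			  cross_class_preempts(CLASS_RT, CLASS_FAIR));	/* 0 */
	return 0;
}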
2064 | |||
2121 | #ifdef CONFIG_SMP | 2065 | #ifdef CONFIG_SMP |
2122 | /* | 2066 | /* |
2123 | * Is this task likely cache-hot: | 2067 | * Is this task likely cache-hot: |
@@ -2183,10 +2127,8 @@ static int migration_cpu_stop(void *data); | |||
2183 | * The task's runqueue lock must be held. | 2127 | * The task's runqueue lock must be held. |
2184 | * Returns true if you have to wait for migration thread. | 2128 | * Returns true if you have to wait for migration thread. |
2185 | */ | 2129 | */ |
2186 | static bool migrate_task(struct task_struct *p, int dest_cpu) | 2130 | static bool migrate_task(struct task_struct *p, struct rq *rq) |
2187 | { | 2131 | { |
2188 | struct rq *rq = task_rq(p); | ||
2189 | |||
2190 | /* | 2132 | /* |
2191 | * If the task is not on a runqueue (and not running), then | 2133 | * If the task is not on a runqueue (and not running), then |
2192 | * the next wake-up will properly place the task. | 2134 | * the next wake-up will properly place the task. |
@@ -2366,18 +2308,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p) | |||
2366 | return dest_cpu; | 2308 | return dest_cpu; |
2367 | 2309 | ||
2368 | /* No more Mr. Nice Guy. */ | 2310 | /* No more Mr. Nice Guy. */ |
2369 | if (unlikely(dest_cpu >= nr_cpu_ids)) { | 2311 | dest_cpu = cpuset_cpus_allowed_fallback(p); |
2370 | dest_cpu = cpuset_cpus_allowed_fallback(p); | 2312 | /* |
2371 | /* | 2313 | * Don't tell them about moving exiting tasks or |
2372 | * Don't tell them about moving exiting tasks or | 2314 | * kernel threads (both mm NULL), since they never |
2373 | * kernel threads (both mm NULL), since they never | 2315 | * leave kernel. |
2374 | * leave kernel. | 2316 | */ |
2375 | */ | 2317 | if (p->mm && printk_ratelimit()) { |
2376 | if (p->mm && printk_ratelimit()) { | 2318 | printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", |
2377 | printk(KERN_INFO "process %d (%s) no " | 2319 | task_pid_nr(p), p->comm, cpu); |
2378 | "longer affine to cpu%d\n", | ||
2379 | task_pid_nr(p), p->comm, cpu); | ||
2380 | } | ||
2381 | } | 2320 | } |
2382 | 2321 | ||
2383 | return dest_cpu; | 2322 | return dest_cpu; |
@@ -2713,7 +2652,9 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
2713 | /* Want to start with kernel preemption disabled. */ | 2652 | /* Want to start with kernel preemption disabled. */ |
2714 | task_thread_info(p)->preempt_count = 1; | 2653 | task_thread_info(p)->preempt_count = 1; |
2715 | #endif | 2654 | #endif |
2655 | #ifdef CONFIG_SMP | ||
2716 | plist_node_init(&p->pushable_tasks, MAX_PRIO); | 2656 | plist_node_init(&p->pushable_tasks, MAX_PRIO); |
2657 | #endif | ||
2717 | 2658 | ||
2718 | put_cpu(); | 2659 | put_cpu(); |
2719 | } | 2660 | } |
@@ -3104,6 +3045,15 @@ static long calc_load_fold_active(struct rq *this_rq) | |||
3104 | return delta; | 3045 | return delta; |
3105 | } | 3046 | } |
3106 | 3047 | ||
3048 | static unsigned long | ||
3049 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | ||
3050 | { | ||
3051 | load *= exp; | ||
3052 | load += active * (FIXED_1 - exp); | ||
3053 | load += 1UL << (FSHIFT - 1); | ||
3054 | return load >> FSHIFT; | ||
3055 | } | ||
3056 | |||
3107 | #ifdef CONFIG_NO_HZ | 3057 | #ifdef CONFIG_NO_HZ |
3108 | /* | 3058 | /* |
3109 | * For NO_HZ we delay the active fold to the next LOAD_FREQ update. | 3059 | * For NO_HZ we delay the active fold to the next LOAD_FREQ update. |
@@ -3133,6 +3083,128 @@ static long calc_load_fold_idle(void) | |||
3133 | 3083 | ||
3134 | return delta; | 3084 | return delta; |
3135 | } | 3085 | } |
3086 | |||
3087 | /** | ||
3088 | * fixed_power_int - compute: x^n, in O(log n) time | ||
3089 | * | ||
3090 | * @x: base of the power | ||
3091 | * @frac_bits: fractional bits of @x | ||
3092 | * @n: power to raise @x to. | ||
3093 | * | ||
3094 | * By exploiting the relation between the definition of the natural power | ||
3095 | * function: x^n := x*x*...*x (x multiplied by itself for n times), and | ||
3096 | * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, | ||
3097 | * (where: n_i \elem {0, 1}, the binary vector representing n), | ||
3098 | * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is | ||
3099 | * of course trivially computable in O(log_2 n), the length of our binary | ||
3100 | * vector. | ||
3101 | */ | ||
3102 | static unsigned long | ||
3103 | fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) | ||
3104 | { | ||
3105 | unsigned long result = 1UL << frac_bits; | ||
3106 | |||
3107 | if (n) for (;;) { | ||
3108 | if (n & 1) { | ||
3109 | result *= x; | ||
3110 | result += 1UL << (frac_bits - 1); | ||
3111 | result >>= frac_bits; | ||
3112 | } | ||
3113 | n >>= 1; | ||
3114 | if (!n) | ||
3115 | break; | ||
3116 | x *= x; | ||
3117 | x += 1UL << (frac_bits - 1); | ||
3118 | x >>= frac_bits; | ||
3119 | } | ||
3120 | |||
3121 | return result; | ||
3122 | } | ||
3123 | |||
3124 | /* | ||
3125 | * a1 = a0 * e + a * (1 - e) | ||
3126 | * | ||
3127 | * a2 = a1 * e + a * (1 - e) | ||
3128 | * = (a0 * e + a * (1 - e)) * e + a * (1 - e) | ||
3129 | * = a0 * e^2 + a * (1 - e) * (1 + e) | ||
3130 | * | ||
3131 | * a3 = a2 * e + a * (1 - e) | ||
3132 | * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) | ||
3133 | * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) | ||
3134 | * | ||
3135 | * ... | ||
3136 | * | ||
3137 | * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] | ||
3138 | * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) | ||
3139 | * = a0 * e^n + a * (1 - e^n) | ||
3140 | * | ||
3141 | * [1] application of the geometric series: | ||
3142 | * | ||
3143 | * n 1 - x^(n+1) | ||
3144 | * S_n := \Sum x^i = ------------- | ||
3145 | * i=0 1 - x | ||
3146 | */ | ||
3147 | static unsigned long | ||
3148 | calc_load_n(unsigned long load, unsigned long exp, | ||
3149 | unsigned long active, unsigned int n) | ||
3150 | { | ||
3151 | |||
3152 | return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); | ||
3153 | } | ||
3154 | |||
3155 | /* | ||
3156 | * NO_HZ can leave us missing all per-cpu ticks calling | ||
3157 | * calc_load_account_active(), but since an idle CPU folds its delta into | ||
3158 | * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold | ||
3159 | * in the pending idle delta if our idle period crossed a load cycle boundary. | ||
3160 | * | ||
3161 | * Once we've updated the global active value, we need to apply the exponential | ||
3162 | * weights adjusted to the number of cycles missed. | ||
3163 | */ | ||
3164 | static void calc_global_nohz(unsigned long ticks) | ||
3165 | { | ||
3166 | long delta, active, n; | ||
3167 | |||
3168 | if (time_before(jiffies, calc_load_update)) | ||
3169 | return; | ||
3170 | |||
3171 | /* | ||
3172 | * If we crossed a calc_load_update boundary, make sure to fold | ||
3173 | * any pending idle changes, the respective CPUs might have | ||
3174 | * missed the tick driven calc_load_account_active() update | ||
3175 | * due to NO_HZ. | ||
3176 | */ | ||
3177 | delta = calc_load_fold_idle(); | ||
3178 | if (delta) | ||
3179 | atomic_long_add(delta, &calc_load_tasks); | ||
3180 | |||
3181 | /* | ||
3182 | * If we were idle for multiple load cycles, apply them. | ||
3183 | */ | ||
3184 | if (ticks >= LOAD_FREQ) { | ||
3185 | n = ticks / LOAD_FREQ; | ||
3186 | |||
3187 | active = atomic_long_read(&calc_load_tasks); | ||
3188 | active = active > 0 ? active * FIXED_1 : 0; | ||
3189 | |||
3190 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); | ||
3191 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); | ||
3192 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); | ||
3193 | |||
3194 | calc_load_update += n * LOAD_FREQ; | ||
3195 | } | ||
3196 | |||
3197 | /* | ||
3198 | * It's possible the remainder of the above division also crosses | ||
3199 | * a LOAD_FREQ period, the regular check in calc_global_load() | ||
3200 | * which comes after this will take care of that. | ||
3201 | * | ||
3202 | * Consider us being 11 ticks before a cycle completion, and us | ||
3203 | * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will | ||
3204 | * age us 4 cycles, and the test in calc_global_load() will | ||
3205 | * pick up the final one. | ||
3206 | */ | ||
3207 | } | ||
3136 | #else | 3208 | #else |
3137 | static void calc_load_account_idle(struct rq *this_rq) | 3209 | static void calc_load_account_idle(struct rq *this_rq) |
3138 | { | 3210 | { |
@@ -3142,6 +3214,10 @@ static inline long calc_load_fold_idle(void) | |||
3142 | { | 3214 | { |
3143 | return 0; | 3215 | return 0; |
3144 | } | 3216 | } |
3217 | |||
3218 | static void calc_global_nohz(unsigned long ticks) | ||
3219 | { | ||
3220 | } | ||
3145 | #endif | 3221 | #endif |
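The calc_load()/fixed_power_int()/calc_load_n() trio can be sanity-checked in userspace: folding n missed periods in one step should match iterating the per-period update n times, up to fixed-point rounding. The constants below are taken from include/linux/sched.h of this era and are an assumption of the sketch (FSHIFT = 11, FIXED_1 = 1 << 11, EXP_1 = 1884 for 5-second samples over a 1-minute window):

#include <stdio.h>

#define FSHIFT	11
#define FIXED_1	(1UL << FSHIFT)
#define EXP_1	1884

static unsigned long calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	load += 1UL << (FSHIFT - 1);	/* round to nearest, as in the patch */
	return load >> FSHIFT;
}

static unsigned long fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
	unsigned long result = 1UL << frac_bits;

	if (n) for (;;) {
		if (n & 1) {
			result *= x;
			result += 1UL << (frac_bits - 1);
			result >>= frac_bits;
		}
		n >>= 1;
		if (!n)
			break;
		x *= x;
		x += 1UL << (frac_bits - 1);
		x >>= frac_bits;
	}
	return result;
}

int main(void)
{
	unsigned long avg = 2 * FIXED_1;	/* current 1-min average: 2.00 */
	unsigned long active = 5 * FIXED_1;	/* 5 runnable tasks now */
	unsigned int n = 12;			/* 12 missed 5 s periods = 1 minute */
	unsigned long iter = avg;

	for (unsigned int i = 0; i < n; i++)
		iter = calc_load(iter, EXP_1, active);

	printf("iterated=%lu folded=%lu\n", iter,
	       calc_load(avg, fixed_power_int(EXP_1, FSHIFT, n), active));
	return 0;
}

The two results should agree to within a few counts of rounding error, which is what lets calc_global_nohz() age the averages in one shot instead of replaying every missed tick.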
3146 | 3222 | ||
3147 | /** | 3223 | /** |
@@ -3159,24 +3235,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | |||
3159 | loads[2] = (avenrun[2] + offset) << shift; | 3235 | loads[2] = (avenrun[2] + offset) << shift; |
3160 | } | 3236 | } |
3161 | 3237 | ||
3162 | static unsigned long | ||
3163 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | ||
3164 | { | ||
3165 | load *= exp; | ||
3166 | load += active * (FIXED_1 - exp); | ||
3167 | return load >> FSHIFT; | ||
3168 | } | ||
3169 | |||
3170 | /* | 3238 | /* |
3171 | * calc_load - update the avenrun load estimates 10 ticks after the | 3239 | * calc_load - update the avenrun load estimates 10 ticks after the |
3172 | * CPUs have updated calc_load_tasks. | 3240 | * CPUs have updated calc_load_tasks. |
3173 | */ | 3241 | */ |
3174 | void calc_global_load(void) | 3242 | void calc_global_load(unsigned long ticks) |
3175 | { | 3243 | { |
3176 | unsigned long upd = calc_load_update + 10; | ||
3177 | long active; | 3244 | long active; |
3178 | 3245 | ||
3179 | if (time_before(jiffies, upd)) | 3246 | calc_global_nohz(ticks); |
3247 | |||
3248 | if (time_before(jiffies, calc_load_update + 10)) | ||
3180 | return; | 3249 | return; |
3181 | 3250 | ||
3182 | active = atomic_long_read(&calc_load_tasks); | 3251 | active = atomic_long_read(&calc_load_tasks); |
@@ -3349,7 +3418,7 @@ void sched_exec(void) | |||
3349 | * select_task_rq() can race against ->cpus_allowed | 3418 | * select_task_rq() can race against ->cpus_allowed |
3350 | */ | 3419 | */ |
3351 | if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && | 3420 | if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && |
3352 | likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) { | 3421 | likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) { |
3353 | struct migration_arg arg = { p, dest_cpu }; | 3422 | struct migration_arg arg = { p, dest_cpu }; |
3354 | 3423 | ||
3355 | task_rq_unlock(rq, &flags); | 3424 | task_rq_unlock(rq, &flags); |
@@ -3830,7 +3899,6 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev) | |||
3830 | { | 3899 | { |
3831 | if (prev->se.on_rq) | 3900 | if (prev->se.on_rq) |
3832 | update_rq_clock(rq); | 3901 | update_rq_clock(rq); |
3833 | rq->skip_clock_update = 0; | ||
3834 | prev->sched_class->put_prev_task(rq, prev); | 3902 | prev->sched_class->put_prev_task(rq, prev); |
3835 | } | 3903 | } |
3836 | 3904 | ||
@@ -3888,7 +3956,6 @@ need_resched_nonpreemptible: | |||
3888 | hrtick_clear(rq); | 3956 | hrtick_clear(rq); |
3889 | 3957 | ||
3890 | raw_spin_lock_irq(&rq->lock); | 3958 | raw_spin_lock_irq(&rq->lock); |
3891 | clear_tsk_need_resched(prev); | ||
3892 | 3959 | ||
3893 | switch_count = &prev->nivcsw; | 3960 | switch_count = &prev->nivcsw; |
3894 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 3961 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
@@ -3920,6 +3987,8 @@ need_resched_nonpreemptible: | |||
3920 | 3987 | ||
3921 | put_prev_task(rq, prev); | 3988 | put_prev_task(rq, prev); |
3922 | next = pick_next_task(rq); | 3989 | next = pick_next_task(rq); |
3990 | clear_tsk_need_resched(prev); | ||
3991 | rq->skip_clock_update = 0; | ||
3923 | 3992 | ||
3924 | if (likely(prev != next)) { | 3993 | if (likely(prev != next)) { |
3925 | sched_info_switch(prev, next); | 3994 | sched_info_switch(prev, next); |
@@ -4014,7 +4083,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) | |||
4014 | if (task_thread_info(rq->curr) != owner || need_resched()) | 4083 | if (task_thread_info(rq->curr) != owner || need_resched()) |
4015 | return 0; | 4084 | return 0; |
4016 | 4085 | ||
4017 | cpu_relax(); | 4086 | arch_mutex_cpu_relax(); |
4018 | } | 4087 | } |
4019 | 4088 | ||
4020 | return 1; | 4089 | return 1; |
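arch_mutex_cpu_relax() (together with the new <asm/mutex.h> include at the top of the file) lets an architecture substitute a cheaper instruction than cpu_relax() while spinning on a mutex owner; s390 was the motivating case. Architectures that do not care presumably keep the old behaviour via a fallback along these lines (sketch, not the exact header text):

/* Expected fallback so non-customizing architectures see no change: */
#ifndef arch_mutex_cpu_relax
# define arch_mutex_cpu_relax()	cpu_relax()
#endif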
@@ -4326,7 +4395,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible); | |||
4326 | * This waits for either a completion of a specific task to be signaled or for a | 4395 | * This waits for either a completion of a specific task to be signaled or for a |
4327 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. | 4396 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. |
4328 | */ | 4397 | */ |
4329 | unsigned long __sched | 4398 | long __sched |
4330 | wait_for_completion_interruptible_timeout(struct completion *x, | 4399 | wait_for_completion_interruptible_timeout(struct completion *x, |
4331 | unsigned long timeout) | 4400 | unsigned long timeout) |
4332 | { | 4401 | { |
@@ -4359,7 +4428,7 @@ EXPORT_SYMBOL(wait_for_completion_killable); | |||
4359 | * signaled or for a specified timeout to expire. It can be | 4428 | * signaled or for a specified timeout to expire. It can be |
4360 | * interrupted by a kill signal. The timeout is in jiffies. | 4429 | * interrupted by a kill signal. The timeout is in jiffies. |
4361 | */ | 4430 | */ |
4362 | unsigned long __sched | 4431 | long __sched |
4363 | wait_for_completion_killable_timeout(struct completion *x, | 4432 | wait_for_completion_killable_timeout(struct completion *x, |
4364 | unsigned long timeout) | 4433 | unsigned long timeout) |
4365 | { | 4434 | { |
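The switch from unsigned long to long matters because both helpers can return a negative error (typically -ERESTARTSYS when interrupted); with an unsigned return type a caller's ret < 0 test could never fire. A kernel-context usage sketch with a hypothetical caller, assuming the usual completion/jiffies/errno headers:

static int wait_for_my_event(struct completion *done)
{
	long ret = wait_for_completion_interruptible_timeout(done, HZ / 10);

	if (ret < 0)		/* interrupted by a signal */
		return ret;
	if (ret == 0)		/* timed out */
		return -ETIMEDOUT;

	return 0;		/* ret > 0: jiffies left when the completion fired */
}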
@@ -4701,7 +4770,7 @@ static bool check_same_owner(struct task_struct *p) | |||
4701 | } | 4770 | } |
4702 | 4771 | ||
4703 | static int __sched_setscheduler(struct task_struct *p, int policy, | 4772 | static int __sched_setscheduler(struct task_struct *p, int policy, |
4704 | struct sched_param *param, bool user) | 4773 | const struct sched_param *param, bool user) |
4705 | { | 4774 | { |
4706 | int retval, oldprio, oldpolicy = -1, on_rq, running; | 4775 | int retval, oldprio, oldpolicy = -1, on_rq, running; |
4707 | unsigned long flags; | 4776 | unsigned long flags; |
@@ -4856,7 +4925,7 @@ recheck: | |||
4856 | * NOTE that the task may be already dead. | 4925 | * NOTE that the task may be already dead. |
4857 | */ | 4926 | */ |
4858 | int sched_setscheduler(struct task_struct *p, int policy, | 4927 | int sched_setscheduler(struct task_struct *p, int policy, |
4859 | struct sched_param *param) | 4928 | const struct sched_param *param) |
4860 | { | 4929 | { |
4861 | return __sched_setscheduler(p, policy, param, true); | 4930 | return __sched_setscheduler(p, policy, param, true); |
4862 | } | 4931 | } |
@@ -4874,7 +4943,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); | |||
4874 | * but our caller might not have that capability. | 4943 | * but our caller might not have that capability. |
4875 | */ | 4944 | */ |
4876 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, | 4945 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, |
4877 | struct sched_param *param) | 4946 | const struct sched_param *param) |
4878 | { | 4947 | { |
4879 | return __sched_setscheduler(p, policy, param, false); | 4948 | return __sched_setscheduler(p, policy, param, false); |
4880 | } | 4949 | } |
@@ -5390,7 +5459,7 @@ void sched_show_task(struct task_struct *p) | |||
5390 | unsigned state; | 5459 | unsigned state; |
5391 | 5460 | ||
5392 | state = p->state ? __ffs(p->state) + 1 : 0; | 5461 | state = p->state ? __ffs(p->state) + 1 : 0; |
5393 | printk(KERN_INFO "%-13.13s %c", p->comm, | 5462 | printk(KERN_INFO "%-15.15s %c", p->comm, |
5394 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); | 5463 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); |
5395 | #if BITS_PER_LONG == 32 | 5464 | #if BITS_PER_LONG == 32 |
5396 | if (state == TASK_RUNNING) | 5465 | if (state == TASK_RUNNING) |
@@ -5554,7 +5623,6 @@ static void update_sysctl(void) | |||
5554 | SET_SYSCTL(sched_min_granularity); | 5623 | SET_SYSCTL(sched_min_granularity); |
5555 | SET_SYSCTL(sched_latency); | 5624 | SET_SYSCTL(sched_latency); |
5556 | SET_SYSCTL(sched_wakeup_granularity); | 5625 | SET_SYSCTL(sched_wakeup_granularity); |
5557 | SET_SYSCTL(sched_shares_ratelimit); | ||
5558 | #undef SET_SYSCTL | 5626 | #undef SET_SYSCTL |
5559 | } | 5627 | } |
5560 | 5628 | ||
@@ -5630,7 +5698,7 @@ again: | |||
5630 | goto out; | 5698 | goto out; |
5631 | 5699 | ||
5632 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); | 5700 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); |
5633 | if (migrate_task(p, dest_cpu)) { | 5701 | if (migrate_task(p, rq)) { |
5634 | struct migration_arg arg = { p, dest_cpu }; | 5702 | struct migration_arg arg = { p, dest_cpu }; |
5635 | /* Need help from migration thread: drop lock and wait. */ | 5703 | /* Need help from migration thread: drop lock and wait. */ |
5636 | task_rq_unlock(rq, &flags); | 5704 | task_rq_unlock(rq, &flags); |
@@ -5712,29 +5780,20 @@ static int migration_cpu_stop(void *data) | |||
5712 | } | 5780 | } |
5713 | 5781 | ||
5714 | #ifdef CONFIG_HOTPLUG_CPU | 5782 | #ifdef CONFIG_HOTPLUG_CPU |
5783 | |||
5715 | /* | 5784 | /* |
5716 | * Figure out where task on dead CPU should go, use force if necessary. | 5785 | * Ensures that the idle task is using init_mm right before its cpu goes |
5786 | * offline. | ||
5717 | */ | 5787 | */ |
5718 | void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | 5788 | void idle_task_exit(void) |
5719 | { | 5789 | { |
5720 | struct rq *rq = cpu_rq(dead_cpu); | 5790 | struct mm_struct *mm = current->active_mm; |
5721 | int needs_cpu, uninitialized_var(dest_cpu); | ||
5722 | unsigned long flags; | ||
5723 | 5791 | ||
5724 | local_irq_save(flags); | 5792 | BUG_ON(cpu_online(smp_processor_id())); |
5725 | 5793 | ||
5726 | raw_spin_lock(&rq->lock); | 5794 | if (mm != &init_mm) |
5727 | needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); | 5795 | switch_mm(mm, &init_mm, current); |
5728 | if (needs_cpu) | 5796 | mmdrop(mm); |
5729 | dest_cpu = select_fallback_rq(dead_cpu, p); | ||
5730 | raw_spin_unlock(&rq->lock); | ||
5731 | /* | ||
5732 | * It can only fail if we race with set_cpus_allowed(), | ||
5733 | * in the racer should migrate the task anyway. | ||
5734 | */ | ||
5735 | if (needs_cpu) | ||
5736 | __migrate_task(p, dead_cpu, dest_cpu); | ||
5737 | local_irq_restore(flags); | ||
5738 | } | 5797 | } |
5739 | 5798 | ||
5740 | /* | 5799 | /* |
@@ -5747,128 +5806,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | |||
5747 | static void migrate_nr_uninterruptible(struct rq *rq_src) | 5806 | static void migrate_nr_uninterruptible(struct rq *rq_src) |
5748 | { | 5807 | { |
5749 | struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); | 5808 | struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); |
5750 | unsigned long flags; | ||
5751 | 5809 | ||
5752 | local_irq_save(flags); | ||
5753 | double_rq_lock(rq_src, rq_dest); | ||
5754 | rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; | 5810 | rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; |
5755 | rq_src->nr_uninterruptible = 0; | 5811 | rq_src->nr_uninterruptible = 0; |
5756 | double_rq_unlock(rq_src, rq_dest); | ||
5757 | local_irq_restore(flags); | ||
5758 | } | ||
5759 | |||
5760 | /* Run through task list and migrate tasks from the dead cpu. */ | ||
5761 | static void migrate_live_tasks(int src_cpu) | ||
5762 | { | ||
5763 | struct task_struct *p, *t; | ||
5764 | |||
5765 | read_lock(&tasklist_lock); | ||
5766 | |||
5767 | do_each_thread(t, p) { | ||
5768 | if (p == current) | ||
5769 | continue; | ||
5770 | |||
5771 | if (task_cpu(p) == src_cpu) | ||
5772 | move_task_off_dead_cpu(src_cpu, p); | ||
5773 | } while_each_thread(t, p); | ||
5774 | |||
5775 | read_unlock(&tasklist_lock); | ||
5776 | } | 5812 | } |
5777 | 5813 | ||
5778 | /* | 5814 | /* |
5779 | * Schedules idle task to be the next runnable task on current CPU. | 5815 | * remove the tasks which were accounted by rq from calc_load_tasks. |
5780 | * It does so by boosting its priority to highest possible. | ||
5781 | * Used by CPU offline code. | ||
5782 | */ | 5816 | */ |
5783 | void sched_idle_next(void) | 5817 | static void calc_global_load_remove(struct rq *rq) |
5784 | { | 5818 | { |
5785 | int this_cpu = smp_processor_id(); | 5819 | atomic_long_sub(rq->calc_load_active, &calc_load_tasks); |
5786 | struct rq *rq = cpu_rq(this_cpu); | 5820 | rq->calc_load_active = 0; |
5787 | struct task_struct *p = rq->idle; | ||
5788 | unsigned long flags; | ||
5789 | |||
5790 | /* cpu has to be offline */ | ||
5791 | BUG_ON(cpu_online(this_cpu)); | ||
5792 | |||
5793 | /* | ||
5794 | * Strictly not necessary since rest of the CPUs are stopped by now | ||
5795 | * and interrupts disabled on the current cpu. | ||
5796 | */ | ||
5797 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
5798 | |||
5799 | __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); | ||
5800 | |||
5801 | activate_task(rq, p, 0); | ||
5802 | |||
5803 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
5804 | } | 5821 | } |
5805 | 5822 | ||
5806 | /* | 5823 | /* |
5807 | * Ensures that the idle task is using init_mm right before its cpu goes | 5824 | * Migrate all tasks from the rq, sleeping tasks will be migrated by |
5808 | * offline. | 5825 | * try_to_wake_up()->select_task_rq(). |
5826 | * | ||
5827 | * Called with rq->lock held even though we're in stop_machine() and | ||
5828 | * there's no concurrency possible, we hold the required locks anyway | ||
5829 | * because of lock validation efforts. | ||
5809 | */ | 5830 | */ |
5810 | void idle_task_exit(void) | 5831 | static void migrate_tasks(unsigned int dead_cpu) |
5811 | { | ||
5812 | struct mm_struct *mm = current->active_mm; | ||
5813 | |||
5814 | BUG_ON(cpu_online(smp_processor_id())); | ||
5815 | |||
5816 | if (mm != &init_mm) | ||
5817 | switch_mm(mm, &init_mm, current); | ||
5818 | mmdrop(mm); | ||
5819 | } | ||
5820 | |||
5821 | /* called under rq->lock with disabled interrupts */ | ||
5822 | static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) | ||
5823 | { | 5832 | { |
5824 | struct rq *rq = cpu_rq(dead_cpu); | 5833 | struct rq *rq = cpu_rq(dead_cpu); |
5825 | 5834 | struct task_struct *next, *stop = rq->stop; | |
5826 | /* Must be exiting, otherwise would be on tasklist. */ | 5835 | int dest_cpu; |
5827 | BUG_ON(!p->exit_state); | ||
5828 | |||
5829 | /* Cannot have done final schedule yet: would have vanished. */ | ||
5830 | BUG_ON(p->state == TASK_DEAD); | ||
5831 | |||
5832 | get_task_struct(p); | ||
5833 | 5836 | ||
5834 | /* | 5837 | /* |
5835 | * Drop lock around migration; if someone else moves it, | 5838 | * Fudge the rq selection such that the below task selection loop |
5836 | * that's OK. No task can be added to this CPU, so iteration is | 5839 | * doesn't get stuck on the currently eligible stop task. |
5837 | * fine. | 5840 | * |
5841 | * We're currently inside stop_machine() and the rq is either stuck | ||
5842 | * in the stop_machine_cpu_stop() loop, or we're executing this code, | ||
5843 | * either way we should never end up calling schedule() until we're | ||
5844 | * done here. | ||
5838 | */ | 5845 | */ |
5839 | raw_spin_unlock_irq(&rq->lock); | 5846 | rq->stop = NULL; |
5840 | move_task_off_dead_cpu(dead_cpu, p); | ||
5841 | raw_spin_lock_irq(&rq->lock); | ||
5842 | |||
5843 | put_task_struct(p); | ||
5844 | } | ||
5845 | |||
5846 | /* release_task() removes task from tasklist, so we won't find dead tasks. */ | ||
5847 | static void migrate_dead_tasks(unsigned int dead_cpu) | ||
5848 | { | ||
5849 | struct rq *rq = cpu_rq(dead_cpu); | ||
5850 | struct task_struct *next; | ||
5851 | 5847 | ||
5852 | for ( ; ; ) { | 5848 | for ( ; ; ) { |
5853 | if (!rq->nr_running) | 5849 | /* |
5850 | * There's this thread running, bail when that's the only | ||
5851 | * remaining thread. | ||
5852 | */ | ||
5853 | if (rq->nr_running == 1) | ||
5854 | break; | 5854 | break; |
5855 | |||
5855 | next = pick_next_task(rq); | 5856 | next = pick_next_task(rq); |
5856 | if (!next) | 5857 | BUG_ON(!next); |
5857 | break; | ||
5858 | next->sched_class->put_prev_task(rq, next); | 5858 | next->sched_class->put_prev_task(rq, next); |
5859 | migrate_dead(dead_cpu, next); | ||
5860 | 5859 | ||
5860 | /* Find suitable destination for @next, with force if needed. */ | ||
5861 | dest_cpu = select_fallback_rq(dead_cpu, next); | ||
5862 | raw_spin_unlock(&rq->lock); | ||
5863 | |||
5864 | __migrate_task(next, dead_cpu, dest_cpu); | ||
5865 | |||
5866 | raw_spin_lock(&rq->lock); | ||
5861 | } | 5867 | } |
5862 | } | ||
5863 | 5868 | ||
5864 | /* | 5869 | rq->stop = stop; |
5865 | * remove the tasks which were accounted by rq from calc_load_tasks. | ||
5866 | */ | ||
5867 | static void calc_global_load_remove(struct rq *rq) | ||
5868 | { | ||
5869 | atomic_long_sub(rq->calc_load_active, &calc_load_tasks); | ||
5870 | rq->calc_load_active = 0; | ||
5871 | } | 5870 | } |
5871 | |||
5872 | #endif /* CONFIG_HOTPLUG_CPU */ | 5872 | #endif /* CONFIG_HOTPLUG_CPU */ |
5873 | 5873 | ||
5874 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) | 5874 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) |
@@ -6078,15 +6078,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
6078 | unsigned long flags; | 6078 | unsigned long flags; |
6079 | struct rq *rq = cpu_rq(cpu); | 6079 | struct rq *rq = cpu_rq(cpu); |
6080 | 6080 | ||
6081 | switch (action) { | 6081 | switch (action & ~CPU_TASKS_FROZEN) { |
6082 | 6082 | ||
6083 | case CPU_UP_PREPARE: | 6083 | case CPU_UP_PREPARE: |
6084 | case CPU_UP_PREPARE_FROZEN: | ||
6085 | rq->calc_load_update = calc_load_update; | 6084 | rq->calc_load_update = calc_load_update; |
6086 | break; | 6085 | break; |
6087 | 6086 | ||
6088 | case CPU_ONLINE: | 6087 | case CPU_ONLINE: |
6089 | case CPU_ONLINE_FROZEN: | ||
6090 | /* Update our root-domain */ | 6088 | /* Update our root-domain */ |
6091 | raw_spin_lock_irqsave(&rq->lock, flags); | 6089 | raw_spin_lock_irqsave(&rq->lock, flags); |
6092 | if (rq->rd) { | 6090 | if (rq->rd) { |
@@ -6098,30 +6096,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
6098 | break; | 6096 | break; |
6099 | 6097 | ||
6100 | #ifdef CONFIG_HOTPLUG_CPU | 6098 | #ifdef CONFIG_HOTPLUG_CPU |
6101 | case CPU_DEAD: | ||
6102 | case CPU_DEAD_FROZEN: | ||
6103 | migrate_live_tasks(cpu); | ||
6104 | /* Idle task back to normal (off runqueue, low prio) */ | ||
6105 | raw_spin_lock_irq(&rq->lock); | ||
6106 | deactivate_task(rq, rq->idle, 0); | ||
6107 | __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); | ||
6108 | rq->idle->sched_class = &idle_sched_class; | ||
6109 | migrate_dead_tasks(cpu); | ||
6110 | raw_spin_unlock_irq(&rq->lock); | ||
6111 | migrate_nr_uninterruptible(rq); | ||
6112 | BUG_ON(rq->nr_running != 0); | ||
6113 | calc_global_load_remove(rq); | ||
6114 | break; | ||
6115 | |||
6116 | case CPU_DYING: | 6099 | case CPU_DYING: |
6117 | case CPU_DYING_FROZEN: | ||
6118 | /* Update our root-domain */ | 6100 | /* Update our root-domain */ |
6119 | raw_spin_lock_irqsave(&rq->lock, flags); | 6101 | raw_spin_lock_irqsave(&rq->lock, flags); |
6120 | if (rq->rd) { | 6102 | if (rq->rd) { |
6121 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); | 6103 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); |
6122 | set_rq_offline(rq); | 6104 | set_rq_offline(rq); |
6123 | } | 6105 | } |
6106 | migrate_tasks(cpu); | ||
6107 | BUG_ON(rq->nr_running != 1); /* the migration thread */ | ||
6124 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 6108 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
6109 | |||
6110 | migrate_nr_uninterruptible(rq); | ||
6111 | calc_global_load_remove(rq); | ||
6125 | break; | 6112 | break; |
6126 | #endif | 6113 | #endif |
6127 | } | 6114 | } |
@@ -6960,6 +6947,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
6960 | if (cpu != group_first_cpu(sd->groups)) | 6947 | if (cpu != group_first_cpu(sd->groups)) |
6961 | return; | 6948 | return; |
6962 | 6949 | ||
6950 | sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); | ||
6951 | |||
6963 | child = sd->child; | 6952 | child = sd->child; |
6964 | 6953 | ||
6965 | sd->groups->cpu_power = 0; | 6954 | sd->groups->cpu_power = 0; |
@@ -7850,15 +7839,13 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | |||
7850 | 7839 | ||
7851 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7840 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7852 | static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | 7841 | static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, |
7853 | struct sched_entity *se, int cpu, int add, | 7842 | struct sched_entity *se, int cpu, |
7854 | struct sched_entity *parent) | 7843 | struct sched_entity *parent) |
7855 | { | 7844 | { |
7856 | struct rq *rq = cpu_rq(cpu); | 7845 | struct rq *rq = cpu_rq(cpu); |
7857 | tg->cfs_rq[cpu] = cfs_rq; | 7846 | tg->cfs_rq[cpu] = cfs_rq; |
7858 | init_cfs_rq(cfs_rq, rq); | 7847 | init_cfs_rq(cfs_rq, rq); |
7859 | cfs_rq->tg = tg; | 7848 | cfs_rq->tg = tg; |
7860 | if (add) | ||
7861 | list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | ||
7862 | 7849 | ||
7863 | tg->se[cpu] = se; | 7850 | tg->se[cpu] = se; |
7864 | /* se could be NULL for init_task_group */ | 7851 | /* se could be NULL for init_task_group */ |
@@ -7871,15 +7858,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | |||
7871 | se->cfs_rq = parent->my_q; | 7858 | se->cfs_rq = parent->my_q; |
7872 | 7859 | ||
7873 | se->my_q = cfs_rq; | 7860 | se->my_q = cfs_rq; |
7874 | se->load.weight = tg->shares; | 7861 | update_load_set(&se->load, 0); |
7875 | se->load.inv_weight = 0; | ||
7876 | se->parent = parent; | 7862 | se->parent = parent; |
7877 | } | 7863 | } |
7878 | #endif | 7864 | #endif |
7879 | 7865 | ||
7880 | #ifdef CONFIG_RT_GROUP_SCHED | 7866 | #ifdef CONFIG_RT_GROUP_SCHED |
7881 | static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | 7867 | static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, |
7882 | struct sched_rt_entity *rt_se, int cpu, int add, | 7868 | struct sched_rt_entity *rt_se, int cpu, |
7883 | struct sched_rt_entity *parent) | 7869 | struct sched_rt_entity *parent) |
7884 | { | 7870 | { |
7885 | struct rq *rq = cpu_rq(cpu); | 7871 | struct rq *rq = cpu_rq(cpu); |
@@ -7888,8 +7874,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | |||
7888 | init_rt_rq(rt_rq, rq); | 7874 | init_rt_rq(rt_rq, rq); |
7889 | rt_rq->tg = tg; | 7875 | rt_rq->tg = tg; |
7890 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | 7876 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; |
7891 | if (add) | ||
7892 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | ||
7893 | 7877 | ||
7894 | tg->rt_se[cpu] = rt_se; | 7878 | tg->rt_se[cpu] = rt_se; |
7895 | if (!rt_se) | 7879 | if (!rt_se) |
@@ -7962,13 +7946,9 @@ void __init sched_init(void) | |||
7962 | #ifdef CONFIG_CGROUP_SCHED | 7946 | #ifdef CONFIG_CGROUP_SCHED |
7963 | list_add(&init_task_group.list, &task_groups); | 7947 | list_add(&init_task_group.list, &task_groups); |
7964 | INIT_LIST_HEAD(&init_task_group.children); | 7948 | INIT_LIST_HEAD(&init_task_group.children); |
7965 | 7949 | autogroup_init(&init_task); | |
7966 | #endif /* CONFIG_CGROUP_SCHED */ | 7950 | #endif /* CONFIG_CGROUP_SCHED */ |
7967 | 7951 | ||
7968 | #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP | ||
7969 | update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), | ||
7970 | __alignof__(unsigned long)); | ||
7971 | #endif | ||
7972 | for_each_possible_cpu(i) { | 7952 | for_each_possible_cpu(i) { |
7973 | struct rq *rq; | 7953 | struct rq *rq; |
7974 | 7954 | ||
@@ -7982,7 +7962,6 @@ void __init sched_init(void) | |||
7982 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7962 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7983 | init_task_group.shares = init_task_group_load; | 7963 | init_task_group.shares = init_task_group_load; |
7984 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 7964 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
7985 | #ifdef CONFIG_CGROUP_SCHED | ||
7986 | /* | 7965 | /* |
7987 | * How much cpu bandwidth does init_task_group get? | 7966 | * How much cpu bandwidth does init_task_group get? |
7988 | * | 7967 | * |
@@ -8002,16 +7981,13 @@ void __init sched_init(void) | |||
8002 | * We achieve this by letting init_task_group's tasks sit | 7981 | * We achieve this by letting init_task_group's tasks sit |
8003 | * directly in rq->cfs (i.e init_task_group->se[] = NULL). | 7982 | * directly in rq->cfs (i.e init_task_group->se[] = NULL). |
8004 | */ | 7983 | */ |
8005 | init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); | 7984 | init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, NULL); |
8006 | #endif | ||
8007 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 7985 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
8008 | 7986 | ||
8009 | rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; | 7987 | rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; |
8010 | #ifdef CONFIG_RT_GROUP_SCHED | 7988 | #ifdef CONFIG_RT_GROUP_SCHED |
8011 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); | 7989 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); |
8012 | #ifdef CONFIG_CGROUP_SCHED | 7990 | init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, NULL); |
8013 | init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); | ||
8014 | #endif | ||
8015 | #endif | 7991 | #endif |
8016 | 7992 | ||
8017 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | 7993 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
@@ -8091,8 +8067,6 @@ void __init sched_init(void) | |||
8091 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | 8067 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); |
8092 | #endif /* SMP */ | 8068 | #endif /* SMP */ |
8093 | 8069 | ||
8094 | perf_event_init(); | ||
8095 | |||
8096 | scheduler_running = 1; | 8070 | scheduler_running = 1; |
8097 | } | 8071 | } |
8098 | 8072 | ||
@@ -8286,7 +8260,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8286 | if (!se) | 8260 | if (!se) |
8287 | goto err_free_rq; | 8261 | goto err_free_rq; |
8288 | 8262 | ||
8289 | init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); | 8263 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); |
8290 | } | 8264 | } |
8291 | 8265 | ||
8292 | return 1; | 8266 | return 1; |
@@ -8297,15 +8271,21 @@ err: | |||
8297 | return 0; | 8271 | return 0; |
8298 | } | 8272 | } |
8299 | 8273 | ||
8300 | static inline void register_fair_sched_group(struct task_group *tg, int cpu) | ||
8301 | { | ||
8302 | list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list, | ||
8303 | &cpu_rq(cpu)->leaf_cfs_rq_list); | ||
8304 | } | ||
8305 | |||
8306 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | 8274 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) |
8307 | { | 8275 | { |
8308 | list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); | 8276 | struct rq *rq = cpu_rq(cpu); |
8277 | unsigned long flags; | ||
8278 | |||
8279 | /* | ||
8280 | * Only empty task groups can be destroyed; so we can speculatively | ||
8281 | * check on_list without danger of it being re-added. | ||
8282 | */ | ||
8283 | if (!tg->cfs_rq[cpu]->on_list) | ||
8284 | return; | ||
8285 | |||
8286 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
8287 | list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); | ||
8288 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
8309 | } | 8289 | } |
8310 | #else /* !CONFIG_FAIR_GROUP_SCHED */ | 8290 | #else /* !CONFIG_FAIR_GROUP_SCHED */ |
8311 | static inline void free_fair_sched_group(struct task_group *tg) | 8291 | static inline void free_fair_sched_group(struct task_group *tg) |
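With register_fair_sched_group() removed, the per-cpu cfs_rq is no longer put on rq->leaf_cfs_rq_list at group creation; it is presumably added on first use elsewhere in this series, and the new on_list flag records that. Teardown can therefore test on_list without taking rq->lock, because only empty task groups are destroyed, so a concurrent re-add cannot race with the check. A minimal user-space sketch of this lazy-registration pattern, using a pthread mutex as a stand-in for rq->lock and made-up names (leaf_list, register_lazy, unregister):

#include <pthread.h>
#include <stdio.h>

struct node {
	int on_list;
	struct node *next, *prev;
};

static struct node leaf_list = { 0, &leaf_list, &leaf_list };
static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

/* Called on first use: add exactly once, guarded by on_list. */
static void register_lazy(struct node *n)
{
	pthread_mutex_lock(&list_lock);
	if (!n->on_list) {
		n->next = leaf_list.next;
		n->prev = &leaf_list;
		leaf_list.next->prev = n;
		leaf_list.next = n;
		n->on_list = 1;
	}
	pthread_mutex_unlock(&list_lock);
}

/* Called at teardown: cheap unlocked check first, lock only if needed. */
static void unregister(struct node *n)
{
	if (!n->on_list)	/* never registered: nothing to do */
		return;

	pthread_mutex_lock(&list_lock);
	n->prev->next = n->next;
	n->next->prev = n->prev;
	n->on_list = 0;
	pthread_mutex_unlock(&list_lock);
}

int main(void)
{
	struct node a = { 0, NULL, NULL };

	unregister(&a);		/* no-op: was never registered */
	register_lazy(&a);
	register_lazy(&a);	/* idempotent thanks to on_list */
	unregister(&a);
	printf("on_list=%d\n", a.on_list);
	return 0;
}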
@@ -8318,10 +8298,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8318 | return 1; | 8298 | return 1; |
8319 | } | 8299 | } |
8320 | 8300 | ||
8321 | static inline void register_fair_sched_group(struct task_group *tg, int cpu) | ||
8322 | { | ||
8323 | } | ||
8324 | |||
8325 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | 8301 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) |
8326 | { | 8302 | { |
8327 | } | 8303 | } |
@@ -8376,7 +8352,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
8376 | if (!rt_se) | 8352 | if (!rt_se) |
8377 | goto err_free_rq; | 8353 | goto err_free_rq; |
8378 | 8354 | ||
8379 | init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); | 8355 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); |
8380 | } | 8356 | } |
8381 | 8357 | ||
8382 | return 1; | 8358 | return 1; |
@@ -8386,17 +8362,6 @@ err_free_rq: | |||
8386 | err: | 8362 | err: |
8387 | return 0; | 8363 | return 0; |
8388 | } | 8364 | } |
8389 | |||
8390 | static inline void register_rt_sched_group(struct task_group *tg, int cpu) | ||
8391 | { | ||
8392 | list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list, | ||
8393 | &cpu_rq(cpu)->leaf_rt_rq_list); | ||
8394 | } | ||
8395 | |||
8396 | static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) | ||
8397 | { | ||
8398 | list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); | ||
8399 | } | ||
8400 | #else /* !CONFIG_RT_GROUP_SCHED */ | 8365 | #else /* !CONFIG_RT_GROUP_SCHED */ |
8401 | static inline void free_rt_sched_group(struct task_group *tg) | 8366 | static inline void free_rt_sched_group(struct task_group *tg) |
8402 | { | 8367 | { |
@@ -8407,14 +8372,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
8407 | { | 8372 | { |
8408 | return 1; | 8373 | return 1; |
8409 | } | 8374 | } |
8410 | |||
8411 | static inline void register_rt_sched_group(struct task_group *tg, int cpu) | ||
8412 | { | ||
8413 | } | ||
8414 | |||
8415 | static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) | ||
8416 | { | ||
8417 | } | ||
8418 | #endif /* CONFIG_RT_GROUP_SCHED */ | 8375 | #endif /* CONFIG_RT_GROUP_SCHED */ |
8419 | 8376 | ||
8420 | #ifdef CONFIG_CGROUP_SCHED | 8377 | #ifdef CONFIG_CGROUP_SCHED |
@@ -8430,7 +8387,6 @@ struct task_group *sched_create_group(struct task_group *parent) | |||
8430 | { | 8387 | { |
8431 | struct task_group *tg; | 8388 | struct task_group *tg; |
8432 | unsigned long flags; | 8389 | unsigned long flags; |
8433 | int i; | ||
8434 | 8390 | ||
8435 | tg = kzalloc(sizeof(*tg), GFP_KERNEL); | 8391 | tg = kzalloc(sizeof(*tg), GFP_KERNEL); |
8436 | if (!tg) | 8392 | if (!tg) |
@@ -8443,10 +8399,6 @@ struct task_group *sched_create_group(struct task_group *parent) | |||
8443 | goto err; | 8399 | goto err; |
8444 | 8400 | ||
8445 | spin_lock_irqsave(&task_group_lock, flags); | 8401 | spin_lock_irqsave(&task_group_lock, flags); |
8446 | for_each_possible_cpu(i) { | ||
8447 | register_fair_sched_group(tg, i); | ||
8448 | register_rt_sched_group(tg, i); | ||
8449 | } | ||
8450 | list_add_rcu(&tg->list, &task_groups); | 8402 | list_add_rcu(&tg->list, &task_groups); |
8451 | 8403 | ||
8452 | WARN_ON(!parent); /* root should already exist */ | 8404 | WARN_ON(!parent); /* root should already exist */ |
@@ -8476,11 +8428,11 @@ void sched_destroy_group(struct task_group *tg) | |||
8476 | unsigned long flags; | 8428 | unsigned long flags; |
8477 | int i; | 8429 | int i; |
8478 | 8430 | ||
8479 | spin_lock_irqsave(&task_group_lock, flags); | 8431 | /* end participation in shares distribution */ |
8480 | for_each_possible_cpu(i) { | 8432 | for_each_possible_cpu(i) |
8481 | unregister_fair_sched_group(tg, i); | 8433 | unregister_fair_sched_group(tg, i); |
8482 | unregister_rt_sched_group(tg, i); | 8434 | |
8483 | } | 8435 | spin_lock_irqsave(&task_group_lock, flags); |
8484 | list_del_rcu(&tg->list); | 8436 | list_del_rcu(&tg->list); |
8485 | list_del_rcu(&tg->siblings); | 8437 | list_del_rcu(&tg->siblings); |
8486 | spin_unlock_irqrestore(&task_group_lock, flags); | 8438 | spin_unlock_irqrestore(&task_group_lock, flags); |
@@ -8527,33 +8479,6 @@ void sched_move_task(struct task_struct *tsk) | |||
8527 | #endif /* CONFIG_CGROUP_SCHED */ | 8479 | #endif /* CONFIG_CGROUP_SCHED */ |
8528 | 8480 | ||
8529 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8481 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8530 | static void __set_se_shares(struct sched_entity *se, unsigned long shares) | ||
8531 | { | ||
8532 | struct cfs_rq *cfs_rq = se->cfs_rq; | ||
8533 | int on_rq; | ||
8534 | |||
8535 | on_rq = se->on_rq; | ||
8536 | if (on_rq) | ||
8537 | dequeue_entity(cfs_rq, se, 0); | ||
8538 | |||
8539 | se->load.weight = shares; | ||
8540 | se->load.inv_weight = 0; | ||
8541 | |||
8542 | if (on_rq) | ||
8543 | enqueue_entity(cfs_rq, se, 0); | ||
8544 | } | ||
8545 | |||
8546 | static void set_se_shares(struct sched_entity *se, unsigned long shares) | ||
8547 | { | ||
8548 | struct cfs_rq *cfs_rq = se->cfs_rq; | ||
8549 | struct rq *rq = cfs_rq->rq; | ||
8550 | unsigned long flags; | ||
8551 | |||
8552 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
8553 | __set_se_shares(se, shares); | ||
8554 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
8555 | } | ||
8556 | |||
8557 | static DEFINE_MUTEX(shares_mutex); | 8482 | static DEFINE_MUTEX(shares_mutex); |
8558 | 8483 | ||
8559 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) | 8484 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) |
@@ -8576,37 +8501,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
8576 | if (tg->shares == shares) | 8501 | if (tg->shares == shares) |
8577 | goto done; | 8502 | goto done; |
8578 | 8503 | ||
8579 | spin_lock_irqsave(&task_group_lock, flags); | ||
8580 | for_each_possible_cpu(i) | ||
8581 | unregister_fair_sched_group(tg, i); | ||
8582 | list_del_rcu(&tg->siblings); | ||
8583 | spin_unlock_irqrestore(&task_group_lock, flags); | ||
8584 | |||
8585 | /* wait for any ongoing reference to this group to finish */ | ||
8586 | synchronize_sched(); | ||
8587 | |||
8588 | /* | ||
8589 | * Now we are free to modify the group's share on each cpu | ||
8590 | * w/o tripping rebalance_share or load_balance_fair. | ||
8591 | */ | ||
8592 | tg->shares = shares; | 8504 | tg->shares = shares; |
8593 | for_each_possible_cpu(i) { | 8505 | for_each_possible_cpu(i) { |
8594 | /* | 8506 | struct rq *rq = cpu_rq(i); |
8595 | * force a rebalance | 8507 | struct sched_entity *se; |
8596 | */ | 8508 | |
8597 | cfs_rq_set_shares(tg->cfs_rq[i], 0); | 8509 | se = tg->se[i]; |
8598 | set_se_shares(tg->se[i], shares); | 8510 | /* Propagate contribution to hierarchy */ |
8511 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
8512 | for_each_sched_entity(se) | ||
8513 | update_cfs_shares(group_cfs_rq(se), 0); | ||
8514 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
8599 | } | 8515 | } |
8600 | 8516 | ||
8601 | /* | ||
8602 | * Enable load balance activity on this group, by inserting it back on | ||
8603 | * each cpu's rq->leaf_cfs_rq_list. | ||
8604 | */ | ||
8605 | spin_lock_irqsave(&task_group_lock, flags); | ||
8606 | for_each_possible_cpu(i) | ||
8607 | register_fair_sched_group(tg, i); | ||
8608 | list_add_rcu(&tg->siblings, &tg->parent->children); | ||
8609 | spin_unlock_irqrestore(&task_group_lock, flags); | ||
8610 | done: | 8517 | done: |
8611 | mutex_unlock(&shares_mutex); | 8518 | mutex_unlock(&shares_mutex); |
8612 | return 0; | 8519 | return 0; |
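The reworked sched_group_set_shares() above no longer unregisters the group from the leaf lists, waits for a grace period and re-registers it; it simply records the new value and, per cpu and under that cpu's rq->lock, walks the entity chain upward so every level is refreshed via update_cfs_shares(). A user-space sketch of that control flow, with stand-in types and a dummy per-level update (the kernel recomputes the actual weight from load at that point):

#include <pthread.h>
#include <stdio.h>

#define NR_CPUS 2

struct entity {
	struct entity *parent;	/* NULL at the top of the hierarchy */
	int updates;		/* times this level has been refreshed */
};

struct group {
	unsigned long shares;
	struct entity *se[NR_CPUS];	/* this group's entity on each cpu */
};

static pthread_mutex_t rq_lock[NR_CPUS] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER
};

/* Stand-in for update_cfs_shares(): the kernel recomputes the weight here. */
static void update_level(struct entity *se)
{
	se->updates++;
}

static void group_set_shares(struct group *tg, unsigned long shares)
{
	int cpu;

	tg->shares = shares;
	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		struct entity *se;

		pthread_mutex_lock(&rq_lock[cpu]);
		/* like for_each_sched_entity(): propagate up the hierarchy */
		for (se = tg->se[cpu]; se; se = se->parent)
			update_level(se);
		pthread_mutex_unlock(&rq_lock[cpu]);
	}
}

int main(void)
{
	struct entity parent0 = { NULL, 0 }, child0 = { &parent0, 0 };
	struct entity parent1 = { NULL, 0 }, child1 = { &parent1, 0 };
	struct group tg = { 1024, { &child0, &child1 } };

	group_set_shares(&tg, 2048);
	printf("cpu0: child updated %d, parent updated %d\n",
	       child0.updates, parent0.updates);
	return 0;
}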
@@ -9332,72 +9239,3 @@ struct cgroup_subsys cpuacct_subsys = { | |||
9332 | }; | 9239 | }; |
9333 | #endif /* CONFIG_CGROUP_CPUACCT */ | 9240 | #endif /* CONFIG_CGROUP_CPUACCT */ |
9334 | 9241 | ||
9335 | #ifndef CONFIG_SMP | ||
9336 | |||
9337 | void synchronize_sched_expedited(void) | ||
9338 | { | ||
9339 | barrier(); | ||
9340 | } | ||
9341 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
9342 | |||
9343 | #else /* #ifndef CONFIG_SMP */ | ||
9344 | |||
9345 | static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0); | ||
9346 | |||
9347 | static int synchronize_sched_expedited_cpu_stop(void *data) | ||
9348 | { | ||
9349 | /* | ||
9350 | * There must be a full memory barrier on each affected CPU | ||
9351 | * between the time that try_stop_cpus() is called and the | ||
9352 | * time that it returns. | ||
9353 | * | ||
9354 | * In the current initial implementation of cpu_stop, the | ||
9355 | * above condition is already met when the control reaches | ||
9356 | * this point and the following smp_mb() is not strictly | ||
9357 | * necessary. Do smp_mb() anyway for documentation and | ||
9358 | * robustness against future implementation changes. | ||
9359 | */ | ||
9360 | smp_mb(); /* See above comment block. */ | ||
9361 | return 0; | ||
9362 | } | ||
9363 | |||
9364 | /* | ||
9365 | * Wait for an rcu-sched grace period to elapse, but use "big hammer" | ||
9366 | * approach to force grace period to end quickly. This consumes | ||
9367 | * significant time on all CPUs, and is thus not recommended for | ||
9368 | * any sort of common-case code. | ||
9369 | * | ||
9370 | * Note that it is illegal to call this function while holding any | ||
9371 | * lock that is acquired by a CPU-hotplug notifier. Failing to | ||
9372 | * observe this restriction will result in deadlock. | ||
9373 | */ | ||
9374 | void synchronize_sched_expedited(void) | ||
9375 | { | ||
9376 | int snap, trycount = 0; | ||
9377 | |||
9378 | smp_mb(); /* ensure prior mod happens before capturing snap. */ | ||
9379 | snap = atomic_read(&synchronize_sched_expedited_count) + 1; | ||
9380 | get_online_cpus(); | ||
9381 | while (try_stop_cpus(cpu_online_mask, | ||
9382 | synchronize_sched_expedited_cpu_stop, | ||
9383 | NULL) == -EAGAIN) { | ||
9384 | put_online_cpus(); | ||
9385 | if (trycount++ < 10) | ||
9386 | udelay(trycount * num_online_cpus()); | ||
9387 | else { | ||
9388 | synchronize_sched(); | ||
9389 | return; | ||
9390 | } | ||
9391 | if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) { | ||
9392 | smp_mb(); /* ensure test happens before caller kfree */ | ||
9393 | return; | ||
9394 | } | ||
9395 | get_online_cpus(); | ||
9396 | } | ||
9397 | atomic_inc(&synchronize_sched_expedited_count); | ||
9398 | smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */ | ||
9399 | put_online_cpus(); | ||
9400 | } | ||
9401 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
9402 | |||
9403 | #endif /* #else #ifndef CONFIG_SMP */ | ||
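The synchronize_sched_expedited() implementation removed above is built around a snapshot-and-retry idiom: record the generation counter before attempting the expensive stop-cpus pass, and if another caller completes a pass beyond that snapshot while we are retrying, their grace period also covers us and we can return early. A user-space sketch of the idiom with C11 atomics and a fake try_stop_cpus() stand-in (only the counter's role mirrors the removed code; everything else is illustrative):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int expedited_count;	/* generation counter */

/* Stand-in for try_stop_cpus(): pretend it returns -EAGAIN a few times. */
static bool try_expensive_op(int *failures_left)
{
	if (*failures_left > 0) {
		(*failures_left)--;
		return false;
	}
	return true;
}

static void expedited(void)
{
	int snap = atomic_load(&expedited_count) + 1;
	int failures = 3;	/* make the demo retry a few times */

	while (!try_expensive_op(&failures)) {
		/* Did another caller complete a pass after our snapshot? */
		if (atomic_load(&expedited_count) - snap > 0) {
			printf("piggy-backed on a concurrent caller\n");
			return;
		}
	}
	atomic_fetch_add(&expedited_count, 1);
	printf("completed our own expedited pass\n");
}

int main(void)
{
	expedited();
	return 0;
}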