path: root/kernel/sched.c
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c | 1278
1 files changed, 672 insertions, 606 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index dc85ceb90832..18d38e4ec7ba 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,9 +75,11 @@
75 75
76#include <asm/tlb.h> 76#include <asm/tlb.h>
77#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
78#include <asm/mutex.h>
78 79
79#include "sched_cpupri.h" 80#include "sched_cpupri.h"
80#include "workqueue_sched.h" 81#include "workqueue_sched.h"
82#include "sched_autogroup.h"
81 83
82#define CREATE_TRACE_POINTS 84#define CREATE_TRACE_POINTS
83#include <trace/events/sched.h> 85#include <trace/events/sched.h>
@@ -253,6 +255,8 @@ struct task_group {
253 /* runqueue "owned" by this group on each cpu */ 255 /* runqueue "owned" by this group on each cpu */
254 struct cfs_rq **cfs_rq; 256 struct cfs_rq **cfs_rq;
255 unsigned long shares; 257 unsigned long shares;
258
259 atomic_t load_weight;
256#endif 260#endif
257 261
258#ifdef CONFIG_RT_GROUP_SCHED 262#ifdef CONFIG_RT_GROUP_SCHED
@@ -268,25 +272,18 @@ struct task_group {
268 struct task_group *parent; 272 struct task_group *parent;
269 struct list_head siblings; 273 struct list_head siblings;
270 struct list_head children; 274 struct list_head children;
271};
272 275
273#define root_task_group init_task_group 276#ifdef CONFIG_SCHED_AUTOGROUP
277 struct autogroup *autogroup;
278#endif
279};
274 280
275/* task_group_lock serializes add/remove of task groups and also changes to 281/* task_group_lock serializes the addition/removal of task groups */
276 * a task group's cpu shares.
277 */
278static DEFINE_SPINLOCK(task_group_lock); 282static DEFINE_SPINLOCK(task_group_lock);
279 283
280#ifdef CONFIG_FAIR_GROUP_SCHED 284#ifdef CONFIG_FAIR_GROUP_SCHED
281 285
282#ifdef CONFIG_SMP 286# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
283static int root_task_group_empty(void)
284{
285 return list_empty(&root_task_group.children);
286}
287#endif
288
289# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
290 287
291/* 288/*
292 * A weight of 0 or 1 can cause arithmetics problems. 289 * A weight of 0 or 1 can cause arithmetics problems.
@@ -299,13 +296,13 @@ static int root_task_group_empty(void)
299#define MIN_SHARES 2 296#define MIN_SHARES 2
300#define MAX_SHARES (1UL << 18) 297#define MAX_SHARES (1UL << 18)
301 298
302static int init_task_group_load = INIT_TASK_GROUP_LOAD; 299static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
303#endif 300#endif
304 301
305/* Default task group. 302/* Default task group.
306 * Every task in system belong to this group at bootup. 303 * Every task in system belong to this group at bootup.
307 */ 304 */
308struct task_group init_task_group; 305struct task_group root_task_group;
309 306
310#endif /* CONFIG_CGROUP_SCHED */ 307#endif /* CONFIG_CGROUP_SCHED */
311 308
@@ -342,6 +339,7 @@ struct cfs_rq {
342 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 339 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
343 * list is used during load balance. 340 * list is used during load balance.
344 */ 341 */
342 int on_list;
345 struct list_head leaf_cfs_rq_list; 343 struct list_head leaf_cfs_rq_list;
346 struct task_group *tg; /* group that "owns" this runqueue */ 344 struct task_group *tg; /* group that "owns" this runqueue */
347 345
@@ -360,14 +358,17 @@ struct cfs_rq {
360 unsigned long h_load; 358 unsigned long h_load;
361 359
362 /* 360 /*
363 * this cpu's part of tg->shares 361 * Maintaining per-cpu shares distribution for group scheduling
362 *
363 * load_stamp is the last time we updated the load average
364 * load_last is the last time we updated the load average and saw load
365 * load_unacc_exec_time is currently unaccounted execution time
364 */ 366 */
365 unsigned long shares; 367 u64 load_avg;
368 u64 load_period;
369 u64 load_stamp, load_last, load_unacc_exec_time;
366 370
367 /* 371 unsigned long load_contribution;
368 * load.weight at the time we set shares
369 */
370 unsigned long rq_weight;
371#endif 372#endif
372#endif 373#endif
373}; 374};
@@ -426,9 +427,7 @@ struct root_domain {
426 */ 427 */
427 cpumask_var_t rto_mask; 428 cpumask_var_t rto_mask;
428 atomic_t rto_count; 429 atomic_t rto_count;
429#ifdef CONFIG_SMP
430 struct cpupri cpupri; 430 struct cpupri cpupri;
431#endif
432}; 431};
433 432
434/* 433/*
@@ -437,7 +436,7 @@ struct root_domain {
437 */ 436 */
438static struct root_domain def_root_domain; 437static struct root_domain def_root_domain;
439 438
440#endif 439#endif /* CONFIG_SMP */
441 440
442/* 441/*
443 * This is the main, per-CPU runqueue data structure. 442 * This is the main, per-CPU runqueue data structure.
@@ -488,11 +487,12 @@ struct rq {
488 */ 487 */
489 unsigned long nr_uninterruptible; 488 unsigned long nr_uninterruptible;
490 489
491 struct task_struct *curr, *idle; 490 struct task_struct *curr, *idle, *stop;
492 unsigned long next_balance; 491 unsigned long next_balance;
493 struct mm_struct *prev_mm; 492 struct mm_struct *prev_mm;
494 493
495 u64 clock; 494 u64 clock;
495 u64 clock_task;
496 496
497 atomic_t nr_iowait; 497 atomic_t nr_iowait;
498 498
@@ -520,6 +520,10 @@ struct rq {
520 u64 avg_idle; 520 u64 avg_idle;
521#endif 521#endif
522 522
523#ifdef CONFIG_IRQ_TIME_ACCOUNTING
524 u64 prev_irq_time;
525#endif
526
523 /* calc_load related fields */ 527 /* calc_load related fields */
524 unsigned long calc_load_update; 528 unsigned long calc_load_update;
525 long calc_load_active; 529 long calc_load_active;
@@ -549,26 +553,13 @@ struct rq {
549 /* try_to_wake_up() stats */ 553 /* try_to_wake_up() stats */
550 unsigned int ttwu_count; 554 unsigned int ttwu_count;
551 unsigned int ttwu_local; 555 unsigned int ttwu_local;
552
553 /* BKL stats */
554 unsigned int bkl_count;
555#endif 556#endif
556}; 557};
557 558
558static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 559static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
559 560
560static inline
561void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
562{
563 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
564 561
565 /* 562static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
566 * A queue event has occurred, and we're going to schedule. In
567 * this case, we can save a useless back to back clock update.
568 */
569 if (test_tsk_need_resched(p))
570 rq->skip_clock_update = 1;
571}
572 563
573static inline int cpu_of(struct rq *rq) 564static inline int cpu_of(struct rq *rq)
574{ 565{
@@ -612,11 +603,17 @@ static inline int cpu_of(struct rq *rq)
612 */ 603 */
613static inline struct task_group *task_group(struct task_struct *p) 604static inline struct task_group *task_group(struct task_struct *p)
614{ 605{
606 struct task_group *tg;
615 struct cgroup_subsys_state *css; 607 struct cgroup_subsys_state *css;
616 608
609 if (p->flags & PF_EXITING)
610 return &root_task_group;
611
617 css = task_subsys_state_check(p, cpu_cgroup_subsys_id, 612 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
618 lockdep_is_held(&task_rq(p)->lock)); 613 lockdep_is_held(&task_rq(p)->lock));
619 return container_of(css, struct task_group, css); 614 tg = container_of(css, struct task_group, css);
615
616 return autogroup_task_group(p, tg);
620} 617}
621 618
622/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 619/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
@@ -643,10 +640,18 @@ static inline struct task_group *task_group(struct task_struct *p)
643 640
644#endif /* CONFIG_CGROUP_SCHED */ 641#endif /* CONFIG_CGROUP_SCHED */
645 642
646inline void update_rq_clock(struct rq *rq) 643static void update_rq_clock_task(struct rq *rq, s64 delta);
644
645static void update_rq_clock(struct rq *rq)
647{ 646{
648 if (!rq->skip_clock_update) 647 s64 delta;
649 rq->clock = sched_clock_cpu(cpu_of(rq)); 648
649 if (rq->skip_clock_update)
650 return;
651
652 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
653 rq->clock += delta;
654 update_rq_clock_task(rq, delta);
650} 655}
651 656
652/* 657/*
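The rewritten update_rq_clock() above keeps two clocks in step: rq->clock advances by the raw sched_clock_cpu() delta, while the same delta is handed to update_rq_clock_task() (declared here, defined further down in this patch), which strips any hard/soft-irq time before advancing rq->clock_task. A minimal sketch of the relationship, with irq_delta standing in for the irq-time bookkeeping done under CONFIG_IRQ_TIME_ACCOUNTING:

/* Illustrative only: how rq->clock and rq->clock_task relate. */
static void example_update_clocks(u64 *clock, u64 *clock_task,
				  u64 now, u64 irq_delta)
{
	u64 delta = now - *clock;	/* raw progress since the last update */

	*clock += delta;		/* rq->clock: includes irq time */

	if (irq_delta > delta)		/* clamp so clock_task stays monotonic */
		irq_delta = delta;

	*clock_task += delta - irq_delta;  /* rq->clock_task: task-only time */
}

Execution-time accounting switches to rq->clock_task (see the do_task_delta_exec() change further down), so time spent servicing interrupts is no longer charged to the interrupted task.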
@@ -723,7 +728,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
723 size_t cnt, loff_t *ppos) 728 size_t cnt, loff_t *ppos)
724{ 729{
725 char buf[64]; 730 char buf[64];
726 char *cmp = buf; 731 char *cmp;
727 int neg = 0; 732 int neg = 0;
728 int i; 733 int i;
729 734
@@ -734,16 +739,15 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
734 return -EFAULT; 739 return -EFAULT;
735 740
736 buf[cnt] = 0; 741 buf[cnt] = 0;
742 cmp = strstrip(buf);
737 743
738 if (strncmp(buf, "NO_", 3) == 0) { 744 if (strncmp(cmp, "NO_", 3) == 0) {
739 neg = 1; 745 neg = 1;
740 cmp += 3; 746 cmp += 3;
741 } 747 }
742 748
743 for (i = 0; sched_feat_names[i]; i++) { 749 for (i = 0; sched_feat_names[i]; i++) {
744 int len = strlen(sched_feat_names[i]); 750 if (strcmp(cmp, sched_feat_names[i]) == 0) {
745
746 if (strncmp(cmp, sched_feat_names[i], len) == 0) {
747 if (neg) 751 if (neg)
748 sysctl_sched_features &= ~(1UL << i); 752 sysctl_sched_features &= ~(1UL << i);
749 else 753 else
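Replacing the length-limited strncmp() with a full strcmp(), after strstrip() drops the trailing newline from the written buffer, means a feature can no longer be toggled by writing a string that merely starts with its name. A hypothetical illustration, using made-up feature names rather than the kernel's real list:

#include <string.h>

static int old_match(const char *cmp, const char *name)
{
	/* prefix match: writing "GENTLE_FAIR" would also have matched "GENTLE" */
	return strncmp(cmp, name, strlen(name)) == 0;
}

static int new_match(const char *cmp, const char *name)
{
	/* strstrip() already removed the '\n', so demand an exact match */
	return strcmp(cmp, name) == 0;
}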
@@ -793,20 +797,6 @@ late_initcall(sched_init_debug);
793const_debug unsigned int sysctl_sched_nr_migrate = 32; 797const_debug unsigned int sysctl_sched_nr_migrate = 32;
794 798
795/* 799/*
796 * ratelimit for updating the group shares.
797 * default: 0.25ms
798 */
799unsigned int sysctl_sched_shares_ratelimit = 250000;
800unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
801
802/*
803 * Inject some fuzzyness into changing the per-cpu group shares
804 * this avoids remote rq-locks at the expense of fairness.
805 * default: 4
806 */
807unsigned int sysctl_sched_shares_thresh = 4;
808
809/*
810 * period over which we average the RT time consumption, measured 800 * period over which we average the RT time consumption, measured
811 * in ms. 801 * in ms.
812 * 802 *
@@ -1355,6 +1345,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1355 lw->inv_weight = 0; 1345 lw->inv_weight = 0;
1356} 1346}
1357 1347
1348static inline void update_load_set(struct load_weight *lw, unsigned long w)
1349{
1350 lw->weight = w;
1351 lw->inv_weight = 0;
1352}
1353
1358/* 1354/*
1359 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1355 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1360 * of tasks with abnormal "nice" values across CPUs the contribution that 1356 * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1543,101 +1539,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1543 1539
1544#ifdef CONFIG_FAIR_GROUP_SCHED 1540#ifdef CONFIG_FAIR_GROUP_SCHED
1545 1541
1546static __read_mostly unsigned long __percpu *update_shares_data;
1547
1548static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1549
1550/*
1551 * Calculate and set the cpu's group shares.
1552 */
1553static void update_group_shares_cpu(struct task_group *tg, int cpu,
1554 unsigned long sd_shares,
1555 unsigned long sd_rq_weight,
1556 unsigned long *usd_rq_weight)
1557{
1558 unsigned long shares, rq_weight;
1559 int boost = 0;
1560
1561 rq_weight = usd_rq_weight[cpu];
1562 if (!rq_weight) {
1563 boost = 1;
1564 rq_weight = NICE_0_LOAD;
1565 }
1566
1567 /*
1568 * \Sum_j shares_j * rq_weight_i
1569 * shares_i = -----------------------------
1570 * \Sum_j rq_weight_j
1571 */
1572 shares = (sd_shares * rq_weight) / sd_rq_weight;
1573 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1574
1575 if (abs(shares - tg->se[cpu]->load.weight) >
1576 sysctl_sched_shares_thresh) {
1577 struct rq *rq = cpu_rq(cpu);
1578 unsigned long flags;
1579
1580 raw_spin_lock_irqsave(&rq->lock, flags);
1581 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1582 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1583 __set_se_shares(tg->se[cpu], shares);
1584 raw_spin_unlock_irqrestore(&rq->lock, flags);
1585 }
1586}
1587
1588/*
1589 * Re-compute the task group their per cpu shares over the given domain.
1590 * This needs to be done in a bottom-up fashion because the rq weight of a
1591 * parent group depends on the shares of its child groups.
1592 */
1593static int tg_shares_up(struct task_group *tg, void *data)
1594{
1595 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
1596 unsigned long *usd_rq_weight;
1597 struct sched_domain *sd = data;
1598 unsigned long flags;
1599 int i;
1600
1601 if (!tg->se[0])
1602 return 0;
1603
1604 local_irq_save(flags);
1605 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
1606
1607 for_each_cpu(i, sched_domain_span(sd)) {
1608 weight = tg->cfs_rq[i]->load.weight;
1609 usd_rq_weight[i] = weight;
1610
1611 rq_weight += weight;
1612 /*
1613 * If there are currently no tasks on the cpu pretend there
1614 * is one of average load so that when a new task gets to
1615 * run here it will not get delayed by group starvation.
1616 */
1617 if (!weight)
1618 weight = NICE_0_LOAD;
1619
1620 sum_weight += weight;
1621 shares += tg->cfs_rq[i]->shares;
1622 }
1623
1624 if (!rq_weight)
1625 rq_weight = sum_weight;
1626
1627 if ((!shares && rq_weight) || shares > tg->shares)
1628 shares = tg->shares;
1629
1630 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1631 shares = tg->shares;
1632
1633 for_each_cpu(i, sched_domain_span(sd))
1634 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
1635
1636 local_irq_restore(flags);
1637
1638 return 0;
1639}
1640
1641/* 1542/*
1642 * Compute the cpu's hierarchical load factor for each task group. 1543 * Compute the cpu's hierarchical load factor for each task group.
1643 * This needs to be done in a top-down fashion because the load of a child 1544 * This needs to be done in a top-down fashion because the load of a child
@@ -1652,7 +1553,7 @@ static int tg_load_down(struct task_group *tg, void *data)
1652 load = cpu_rq(cpu)->load.weight; 1553 load = cpu_rq(cpu)->load.weight;
1653 } else { 1554 } else {
1654 load = tg->parent->cfs_rq[cpu]->h_load; 1555 load = tg->parent->cfs_rq[cpu]->h_load;
1655 load *= tg->cfs_rq[cpu]->shares; 1556 load *= tg->se[cpu]->load.weight;
1656 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 1557 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1657 } 1558 }
1658 1559
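For reference, the update_group_shares_cpu()/tg_shares_up() pair deleted above distributed a group's shares across CPUs in proportion to each CPU's runqueue weight, clamped to [MIN_SHARES, MAX_SHARES], at most once per sysctl_sched_shares_ratelimit interval. A condensed restatement of that removed computation:

/* shares_i = (\Sum_j shares_j) * rq_weight_i / \Sum_j rq_weight_j,
 * as recomputed by the deleted walk_tg_tree(tg_nop, tg_shares_up, sd) pass. */
static unsigned long old_cpu_shares(unsigned long sd_shares,
				    unsigned long rq_weight_i,
				    unsigned long sd_rq_weight)
{
	unsigned long shares = (sd_shares * rq_weight_i) / sd_rq_weight;

	return clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
}

The tg_load_down() hunk just above follows suit: the hierarchical load factor now scales by tg->se[cpu]->load.weight, since cfs_rq->shares no longer exists. The per-cpu weight is instead maintained continuously from the fair-class code using the load_avg/load_period/load_contribution fields added to struct cfs_rq earlier in this diff; the actual update logic lives in the companion sched_fair.c changes, which this filtered view does not show.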
@@ -1661,34 +1562,11 @@ static int tg_load_down(struct task_group *tg, void *data)
1661 return 0; 1562 return 0;
1662} 1563}
1663 1564
1664static void update_shares(struct sched_domain *sd)
1665{
1666 s64 elapsed;
1667 u64 now;
1668
1669 if (root_task_group_empty())
1670 return;
1671
1672 now = local_clock();
1673 elapsed = now - sd->last_update;
1674
1675 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1676 sd->last_update = now;
1677 walk_tg_tree(tg_nop, tg_shares_up, sd);
1678 }
1679}
1680
1681static void update_h_load(long cpu) 1565static void update_h_load(long cpu)
1682{ 1566{
1683 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); 1567 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1684} 1568}
1685 1569
1686#else
1687
1688static inline void update_shares(struct sched_domain *sd)
1689{
1690}
1691
1692#endif 1570#endif
1693 1571
1694#ifdef CONFIG_PREEMPT 1572#ifdef CONFIG_PREEMPT
@@ -1810,15 +1688,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1810 1688
1811#endif 1689#endif
1812 1690
1813#ifdef CONFIG_FAIR_GROUP_SCHED
1814static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1815{
1816#ifdef CONFIG_SMP
1817 cfs_rq->shares = shares;
1818#endif
1819}
1820#endif
1821
1822static void calc_load_account_idle(struct rq *this_rq); 1691static void calc_load_account_idle(struct rq *this_rq);
1823static void update_sysctl(void); 1692static void update_sysctl(void);
1824static int get_update_sysctl_factor(void); 1693static int get_update_sysctl_factor(void);
@@ -1840,7 +1709,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1840 1709
1841static const struct sched_class rt_sched_class; 1710static const struct sched_class rt_sched_class;
1842 1711
1843#define sched_class_highest (&rt_sched_class) 1712#define sched_class_highest (&stop_sched_class)
1844#define for_each_class(class) \ 1713#define for_each_class(class) \
1845 for (class = sched_class_highest; class; class = class->next) 1714 for (class = sched_class_highest; class; class = class->next)
1846 1715
@@ -1858,12 +1727,6 @@ static void dec_nr_running(struct rq *rq)
1858 1727
1859static void set_load_weight(struct task_struct *p) 1728static void set_load_weight(struct task_struct *p)
1860{ 1729{
1861 if (task_has_rt_policy(p)) {
1862 p->se.load.weight = 0;
1863 p->se.load.inv_weight = WMULT_CONST;
1864 return;
1865 }
1866
1867 /* 1730 /*
1868 * SCHED_IDLE tasks get minimal weight: 1731 * SCHED_IDLE tasks get minimal weight:
1869 */ 1732 */
@@ -1917,13 +1780,194 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1917 dec_nr_running(rq); 1780 dec_nr_running(rq);
1918} 1781}
1919 1782
1783#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1784
1785/*
1786 * There are no locks covering percpu hardirq/softirq time.
1787 * They are only modified in account_system_vtime, on corresponding CPU
1788 * with interrupts disabled. So, writes are safe.
1789 * They are read and saved off onto struct rq in update_rq_clock().
1790 * This may result in other CPU reading this CPU's irq time and can
1791 * race with irq/account_system_vtime on this CPU. We would either get old
1792 * or new value with a side effect of accounting a slice of irq time to wrong
1793 * task when irq is in progress while we read rq->clock. That is a worthy
1794 * compromise in place of having locks on each irq in account_system_time.
1795 */
1796static DEFINE_PER_CPU(u64, cpu_hardirq_time);
1797static DEFINE_PER_CPU(u64, cpu_softirq_time);
1798
1799static DEFINE_PER_CPU(u64, irq_start_time);
1800static int sched_clock_irqtime;
1801
1802void enable_sched_clock_irqtime(void)
1803{
1804 sched_clock_irqtime = 1;
1805}
1806
1807void disable_sched_clock_irqtime(void)
1808{
1809 sched_clock_irqtime = 0;
1810}
1811
1812#ifndef CONFIG_64BIT
1813static DEFINE_PER_CPU(seqcount_t, irq_time_seq);
1814
1815static inline void irq_time_write_begin(void)
1816{
1817 __this_cpu_inc(irq_time_seq.sequence);
1818 smp_wmb();
1819}
1820
1821static inline void irq_time_write_end(void)
1822{
1823 smp_wmb();
1824 __this_cpu_inc(irq_time_seq.sequence);
1825}
1826
1827static inline u64 irq_time_read(int cpu)
1828{
1829 u64 irq_time;
1830 unsigned seq;
1831
1832 do {
1833 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu));
1834 irq_time = per_cpu(cpu_softirq_time, cpu) +
1835 per_cpu(cpu_hardirq_time, cpu);
1836 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
1837
1838 return irq_time;
1839}
1840#else /* CONFIG_64BIT */
1841static inline void irq_time_write_begin(void)
1842{
1843}
1844
1845static inline void irq_time_write_end(void)
1846{
1847}
1848
1849static inline u64 irq_time_read(int cpu)
1850{
1851 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1852}
1853#endif /* CONFIG_64BIT */
1854
1855/*
1856 * Called before incrementing preempt_count on {soft,}irq_enter
1857 * and before decrementing preempt_count on {soft,}irq_exit.
1858 */
1859void account_system_vtime(struct task_struct *curr)
1860{
1861 unsigned long flags;
1862 s64 delta;
1863 int cpu;
1864
1865 if (!sched_clock_irqtime)
1866 return;
1867
1868 local_irq_save(flags);
1869
1870 cpu = smp_processor_id();
1871 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
1872 __this_cpu_add(irq_start_time, delta);
1873
1874 irq_time_write_begin();
1875 /*
1876 * We do not account for softirq time from ksoftirqd here.
1877 * We want to continue accounting softirq time to ksoftirqd thread
1878 * in that case, so as not to confuse scheduler with a special task
1879 * that do not consume any time, but still wants to run.
1880 */
1881 if (hardirq_count())
1882 __this_cpu_add(cpu_hardirq_time, delta);
1883 else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
1884 __this_cpu_add(cpu_softirq_time, delta);
1885
1886 irq_time_write_end();
1887 local_irq_restore(flags);
1888}
1889EXPORT_SYMBOL_GPL(account_system_vtime);
1890
1891static void update_rq_clock_task(struct rq *rq, s64 delta)
1892{
1893 s64 irq_delta;
1894
1895 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
1896
1897 /*
1898 * Since irq_time is only updated on {soft,}irq_exit, we might run into
1899 * this case when a previous update_rq_clock() happened inside a
1900 * {soft,}irq region.
1901 *
1902 * When this happens, we stop ->clock_task and only update the
1903 * prev_irq_time stamp to account for the part that fit, so that a next
1904 * update will consume the rest. This ensures ->clock_task is
1905 * monotonic.
1906 *
1907 * It does however cause some slight miss-attribution of {soft,}irq
1908 * time, a more accurate solution would be to update the irq_time using
1909 * the current rq->clock timestamp, except that would require using
1910 * atomic ops.
1911 */
1912 if (irq_delta > delta)
1913 irq_delta = delta;
1914
1915 rq->prev_irq_time += irq_delta;
1916 delta -= irq_delta;
1917 rq->clock_task += delta;
1918
1919 if (irq_delta && sched_feat(NONIRQ_POWER))
1920 sched_rt_avg_update(rq, irq_delta);
1921}
1922
1923#else /* CONFIG_IRQ_TIME_ACCOUNTING */
1924
1925static void update_rq_clock_task(struct rq *rq, s64 delta)
1926{
1927 rq->clock_task += delta;
1928}
1929
1930#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
1931
1920#include "sched_idletask.c" 1932#include "sched_idletask.c"
1921#include "sched_fair.c" 1933#include "sched_fair.c"
1922#include "sched_rt.c" 1934#include "sched_rt.c"
1935#include "sched_autogroup.c"
1936#include "sched_stoptask.c"
1923#ifdef CONFIG_SCHED_DEBUG 1937#ifdef CONFIG_SCHED_DEBUG
1924# include "sched_debug.c" 1938# include "sched_debug.c"
1925#endif 1939#endif
1926 1940
1941void sched_set_stop_task(int cpu, struct task_struct *stop)
1942{
1943 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
1944 struct task_struct *old_stop = cpu_rq(cpu)->stop;
1945
1946 if (stop) {
1947 /*
1948 * Make it appear like a SCHED_FIFO task, its something
1949 * userspace knows about and won't get confused about.
1950 *
1951 * Also, it will make PI more or less work without too
1952 * much confusion -- but then, stop work should not
1953 * rely on PI working anyway.
1954 */
1955 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
1956
1957 stop->sched_class = &stop_sched_class;
1958 }
1959
1960 cpu_rq(cpu)->stop = stop;
1961
1962 if (old_stop) {
1963 /*
1964 * Reset it back to a normal scheduling class so that
1965 * it can die in pieces.
1966 */
1967 old_stop->sched_class = &rt_sched_class;
1968 }
1969}
1970
1927/* 1971/*
1928 * __normal_prio - return the priority that is based on the static prio 1972 * __normal_prio - return the priority that is based on the static prio
1929 */ 1973 */
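The per-cpu cpu_hardirq_time/cpu_softirq_time counters are u64, so on 32-bit kernels a cross-CPU reader could observe a torn value; that is what irq_time_seq guards against. The write side is open-coded (__this_cpu_inc() of the sequence plus smp_wmb()) because the counter is per-cpu and updated with interrupts disabled, but the logic is the ordinary seqcount pattern. A generic sketch of that pattern, outside the scheduler specifics:

static seqcount_t time_seq;	/* seqcount_init() it in real code */
static u64 total_time;

static void writer_add(u64 delta)	/* single writer, interrupts off */
{
	write_seqcount_begin(&time_seq);	/* odd count: readers retry */
	total_time += delta;
	write_seqcount_end(&time_seq);		/* even count: value stable */
}

static u64 reader_read(void)
{
	unsigned int seq;
	u64 val;

	do {
		seq = read_seqcount_begin(&time_seq);
		val = total_time;
	} while (read_seqcount_retry(&time_seq, seq));

	return val;
}

On 64-bit kernels the load/store is atomic anyway, which is why the #else branch above compiles the helpers down to nothing.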
@@ -1991,6 +2035,31 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1991 p->sched_class->prio_changed(rq, p, oldprio, running); 2035 p->sched_class->prio_changed(rq, p, oldprio, running);
1992} 2036}
1993 2037
2038static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2039{
2040 const struct sched_class *class;
2041
2042 if (p->sched_class == rq->curr->sched_class) {
2043 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
2044 } else {
2045 for_each_class(class) {
2046 if (class == rq->curr->sched_class)
2047 break;
2048 if (class == p->sched_class) {
2049 resched_task(rq->curr);
2050 break;
2051 }
2052 }
2053 }
2054
2055 /*
2056 * A queue event has occurred, and we're going to schedule. In
2057 * this case, we can save a useless back to back clock update.
2058 */
2059 if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
2060 rq->skip_clock_update = 1;
2061}
2062
1994#ifdef CONFIG_SMP 2063#ifdef CONFIG_SMP
1995/* 2064/*
1996 * Is this task likely cache-hot: 2065 * Is this task likely cache-hot:
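check_preempt_curr() is now a freestanding helper that also handles cross-class wakeups: the sched classes are linked from highest to lowest priority (with this patch the walk starts at the new stop class, then rt, fair and idle), so whichever of the two classes is reached first in the for_each_class() walk decides who wins; only same-class wakeups are delegated to the class's own check_preempt_curr hook. A compact restatement of that decision:

static bool wakee_preempts_curr(const struct sched_class *curr_class,
				const struct sched_class *wakee_class)
{
	const struct sched_class *class;

	if (wakee_class == curr_class)
		return false;	/* same class: the class itself decides */

	for_each_class(class) {
		if (class == curr_class)
			return false;	/* current task's class ranks higher */
		if (class == wakee_class)
			return true;	/* wakee's class ranks higher: resched */
	}
	return false;
}

Note also that the clock-skip optimisation moved here and now tests rq->curr rather than the wakee, so the skip only triggers when the currently running task is both on the runqueue and already marked for rescheduling.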
@@ -2003,6 +2072,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2003 if (p->sched_class != &fair_sched_class) 2072 if (p->sched_class != &fair_sched_class)
2004 return 0; 2073 return 0;
2005 2074
2075 if (unlikely(p->policy == SCHED_IDLE))
2076 return 0;
2077
2006 /* 2078 /*
2007 * Buddy candidates are cache hot: 2079 * Buddy candidates are cache hot:
2008 */ 2080 */
@@ -2053,10 +2125,8 @@ static int migration_cpu_stop(void *data);
2053 * The task's runqueue lock must be held. 2125 * The task's runqueue lock must be held.
2054 * Returns true if you have to wait for migration thread. 2126 * Returns true if you have to wait for migration thread.
2055 */ 2127 */
2056static bool migrate_task(struct task_struct *p, int dest_cpu) 2128static bool migrate_task(struct task_struct *p, struct rq *rq)
2057{ 2129{
2058 struct rq *rq = task_rq(p);
2059
2060 /* 2130 /*
2061 * If the task is not on a runqueue (and not running), then 2131 * If the task is not on a runqueue (and not running), then
2062 * the next wake-up will properly place the task. 2132 * the next wake-up will properly place the task.
@@ -2236,18 +2306,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2236 return dest_cpu; 2306 return dest_cpu;
2237 2307
2238 /* No more Mr. Nice Guy. */ 2308 /* No more Mr. Nice Guy. */
2239 if (unlikely(dest_cpu >= nr_cpu_ids)) { 2309 dest_cpu = cpuset_cpus_allowed_fallback(p);
2240 dest_cpu = cpuset_cpus_allowed_fallback(p); 2310 /*
2241 /* 2311 * Don't tell them about moving exiting tasks or
2242 * Don't tell them about moving exiting tasks or 2312 * kernel threads (both mm NULL), since they never
2243 * kernel threads (both mm NULL), since they never 2313 * leave kernel.
2244 * leave kernel. 2314 */
2245 */ 2315 if (p->mm && printk_ratelimit()) {
2246 if (p->mm && printk_ratelimit()) { 2316 printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n",
2247 printk(KERN_INFO "process %d (%s) no " 2317 task_pid_nr(p), p->comm, cpu);
2248 "longer affine to cpu%d\n",
2249 task_pid_nr(p), p->comm, cpu);
2250 }
2251 } 2318 }
2252 2319
2253 return dest_cpu; 2320 return dest_cpu;
@@ -2438,7 +2505,7 @@ out:
2438 * try_to_wake_up_local - try to wake up a local task with rq lock held 2505 * try_to_wake_up_local - try to wake up a local task with rq lock held
2439 * @p: the thread to be awakened 2506 * @p: the thread to be awakened
2440 * 2507 *
2441 * Put @p on the run-queue if it's not alredy there. The caller must 2508 * Put @p on the run-queue if it's not already there. The caller must
2442 * ensure that this_rq() is locked, @p is bound to this_rq() and not 2509 * ensure that this_rq() is locked, @p is bound to this_rq() and not
2443 * the current task. this_rq() stays locked over invocation. 2510 * the current task. this_rq() stays locked over invocation.
2444 */ 2511 */
@@ -2583,7 +2650,9 @@ void sched_fork(struct task_struct *p, int clone_flags)
2583 /* Want to start with kernel preemption disabled. */ 2650 /* Want to start with kernel preemption disabled. */
2584 task_thread_info(p)->preempt_count = 1; 2651 task_thread_info(p)->preempt_count = 1;
2585#endif 2652#endif
2653#ifdef CONFIG_SMP
2586 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2654 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2655#endif
2587 2656
2588 put_cpu(); 2657 put_cpu();
2589} 2658}
@@ -2852,14 +2921,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
2852 */ 2921 */
2853 arch_start_context_switch(prev); 2922 arch_start_context_switch(prev);
2854 2923
2855 if (likely(!mm)) { 2924 if (!mm) {
2856 next->active_mm = oldmm; 2925 next->active_mm = oldmm;
2857 atomic_inc(&oldmm->mm_count); 2926 atomic_inc(&oldmm->mm_count);
2858 enter_lazy_tlb(oldmm, next); 2927 enter_lazy_tlb(oldmm, next);
2859 } else 2928 } else
2860 switch_mm(oldmm, mm, next); 2929 switch_mm(oldmm, mm, next);
2861 2930
2862 if (likely(!prev->mm)) { 2931 if (!prev->mm) {
2863 prev->active_mm = NULL; 2932 prev->active_mm = NULL;
2864 rq->prev_mm = oldmm; 2933 rq->prev_mm = oldmm;
2865 } 2934 }
@@ -2974,6 +3043,15 @@ static long calc_load_fold_active(struct rq *this_rq)
2974 return delta; 3043 return delta;
2975} 3044}
2976 3045
3046static unsigned long
3047calc_load(unsigned long load, unsigned long exp, unsigned long active)
3048{
3049 load *= exp;
3050 load += active * (FIXED_1 - exp);
3051 load += 1UL << (FSHIFT - 1);
3052 return load >> FSHIFT;
3053}
3054
2977#ifdef CONFIG_NO_HZ 3055#ifdef CONFIG_NO_HZ
2978/* 3056/*
2979 * For NO_HZ we delay the active fold to the next LOAD_FREQ update. 3057 * For NO_HZ we delay the active fold to the next LOAD_FREQ update.
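calc_load(), hoisted above the NO_HZ code so calc_global_nohz() can reuse it, is a one-step exponential moving average in FSHIFT-bit fixed point, with a rounding term (1UL << (FSHIFT - 1)) that the old copy removed further down lacked. A worked step, assuming the stock constants from <linux/sched.h> (FSHIFT = 11, FIXED_1 = 2048, EXP_1 = 1884 for the 1-minute average):

#include <stdio.h>

#define FSHIFT	11			/* fixed-point shift */
#define FIXED_1	(1UL << FSHIFT)		/* 1.0 */
#define EXP_1	1884			/* 1/exp(5s/1min) in fixed point */

static unsigned long calc_load(unsigned long load, unsigned long exp,
			       unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	load += 1UL << (FSHIFT - 1);	/* round to nearest */
	return load >> FSHIFT;
}

int main(void)
{
	/* One LOAD_FREQ (5 s) step: avenrun[0] = 0.00, three runnable tasks. */
	unsigned long load = calc_load(0, EXP_1, 3 * FIXED_1);

	printf("%lu.%02lu\n", load >> FSHIFT,
	       ((load & (FIXED_1 - 1)) * 100) >> FSHIFT);
	/* prints 0.24: the 1-minute average creeps toward 3.00 */
	return 0;
}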
@@ -3003,6 +3081,128 @@ static long calc_load_fold_idle(void)
3003 3081
3004 return delta; 3082 return delta;
3005} 3083}
3084
3085/**
3086 * fixed_power_int - compute: x^n, in O(log n) time
3087 *
3088 * @x: base of the power
3089 * @frac_bits: fractional bits of @x
3090 * @n: power to raise @x to.
3091 *
3092 * By exploiting the relation between the definition of the natural power
3093 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
3094 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
3095 * (where: n_i \elem {0, 1}, the binary vector representing n),
3096 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
3097 * of course trivially computable in O(log_2 n), the length of our binary
3098 * vector.
3099 */
3100static unsigned long
3101fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
3102{
3103 unsigned long result = 1UL << frac_bits;
3104
3105 if (n) for (;;) {
3106 if (n & 1) {
3107 result *= x;
3108 result += 1UL << (frac_bits - 1);
3109 result >>= frac_bits;
3110 }
3111 n >>= 1;
3112 if (!n)
3113 break;
3114 x *= x;
3115 x += 1UL << (frac_bits - 1);
3116 x >>= frac_bits;
3117 }
3118
3119 return result;
3120}
3121
3122/*
3123 * a1 = a0 * e + a * (1 - e)
3124 *
3125 * a2 = a1 * e + a * (1 - e)
3126 * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
3127 * = a0 * e^2 + a * (1 - e) * (1 + e)
3128 *
3129 * a3 = a2 * e + a * (1 - e)
3130 * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
3131 * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
3132 *
3133 * ...
3134 *
3135 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
3136 * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
3137 * = a0 * e^n + a * (1 - e^n)
3138 *
3139 * [1] application of the geometric series:
3140 *
3141 * n 1 - x^(n+1)
3142 * S_n := \Sum x^i = -------------
3143 * i=0 1 - x
3144 */
3145static unsigned long
3146calc_load_n(unsigned long load, unsigned long exp,
3147 unsigned long active, unsigned int n)
3148{
3149
3150 return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
3151}
3152
3153/*
3154 * NO_HZ can leave us missing all per-cpu ticks calling
3155 * calc_load_account_active(), but since an idle CPU folds its delta into
3156 * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
3157 * in the pending idle delta if our idle period crossed a load cycle boundary.
3158 *
3159 * Once we've updated the global active value, we need to apply the exponential
3160 * weights adjusted to the number of cycles missed.
3161 */
3162static void calc_global_nohz(unsigned long ticks)
3163{
3164 long delta, active, n;
3165
3166 if (time_before(jiffies, calc_load_update))
3167 return;
3168
3169 /*
3170 * If we crossed a calc_load_update boundary, make sure to fold
3171 * any pending idle changes, the respective CPUs might have
3172 * missed the tick driven calc_load_account_active() update
3173 * due to NO_HZ.
3174 */
3175 delta = calc_load_fold_idle();
3176 if (delta)
3177 atomic_long_add(delta, &calc_load_tasks);
3178
3179 /*
3180 * If we were idle for multiple load cycles, apply them.
3181 */
3182 if (ticks >= LOAD_FREQ) {
3183 n = ticks / LOAD_FREQ;
3184
3185 active = atomic_long_read(&calc_load_tasks);
3186 active = active > 0 ? active * FIXED_1 : 0;
3187
3188 avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
3189 avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
3190 avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
3191
3192 calc_load_update += n * LOAD_FREQ;
3193 }
3194
3195 /*
3196 * Its possible the remainder of the above division also crosses
3197 * a LOAD_FREQ period, the regular check in calc_global_load()
3198 * which comes after this will take care of that.
3199 *
3200 * Consider us being 11 ticks before a cycle completion, and us
3201 * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
3202 * age us 4 cycles, and the test in calc_global_load() will
3203 * pick up the final one.
3204 */
3205}
3006#else 3206#else
3007static void calc_load_account_idle(struct rq *this_rq) 3207static void calc_load_account_idle(struct rq *this_rq)
3008{ 3208{
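calc_load_n() folds any number of missed 5-second sampling periods into a single update by raising the decay factor to the n-th power with fixed_power_int(), per the closed form a_n = a_0 * e^n + a * (1 - e^n) derived in the comment above. A worked example, again assuming EXP_1 = 1884 and FIXED_1 = 2048: e = 1884/2048 ~= 0.920, so e^3 ~= 0.778, and a CPU that slept through three load cycles with nothing runnable decays a 1-minute load of 1.00 to roughly 0.78 in one call:

/* Equivalent to three successive calc_load() steps (up to rounding),
 * but fixed_power_int() needs only O(log n) fixed-point multiplies. */
avenrun[0] = calc_load_n(avenrun[0], EXP_1, 0 /* no active tasks */, 3);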
@@ -3012,6 +3212,10 @@ static inline long calc_load_fold_idle(void)
3012{ 3212{
3013 return 0; 3213 return 0;
3014} 3214}
3215
3216static void calc_global_nohz(unsigned long ticks)
3217{
3218}
3015#endif 3219#endif
3016 3220
3017/** 3221/**
@@ -3029,24 +3233,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
3029 loads[2] = (avenrun[2] + offset) << shift; 3233 loads[2] = (avenrun[2] + offset) << shift;
3030} 3234}
3031 3235
3032static unsigned long
3033calc_load(unsigned long load, unsigned long exp, unsigned long active)
3034{
3035 load *= exp;
3036 load += active * (FIXED_1 - exp);
3037 return load >> FSHIFT;
3038}
3039
3040/* 3236/*
3041 * calc_load - update the avenrun load estimates 10 ticks after the 3237 * calc_load - update the avenrun load estimates 10 ticks after the
3042 * CPUs have updated calc_load_tasks. 3238 * CPUs have updated calc_load_tasks.
3043 */ 3239 */
3044void calc_global_load(void) 3240void calc_global_load(unsigned long ticks)
3045{ 3241{
3046 unsigned long upd = calc_load_update + 10;
3047 long active; 3242 long active;
3048 3243
3049 if (time_before(jiffies, upd)) 3244 calc_global_nohz(ticks);
3245
3246 if (time_before(jiffies, calc_load_update + 10))
3050 return; 3247 return;
3051 3248
3052 active = atomic_long_read(&calc_load_tasks); 3249 active = atomic_long_read(&calc_load_tasks);
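calc_global_load() now takes the number of ticks being folded in so that calc_global_nohz() can detect multi-cycle idle periods; the regular 10-tick-delayed update below it is unchanged. The ticks value comes from the timekeeping core; a sketch of the caller, assuming the companion kernel/timer.c change that this filtered diff does not show:

void do_timer(unsigned long ticks)
{
	jiffies_64 += ticks;
	update_wall_time();
	calc_global_load(ticks);	/* ticks > 1 after a long NO_HZ sleep */
}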
@@ -3219,7 +3416,7 @@ void sched_exec(void)
3219 * select_task_rq() can race against ->cpus_allowed 3416 * select_task_rq() can race against ->cpus_allowed
3220 */ 3417 */
3221 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && 3418 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
3222 likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) { 3419 likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
3223 struct migration_arg arg = { p, dest_cpu }; 3420 struct migration_arg arg = { p, dest_cpu };
3224 3421
3225 task_rq_unlock(rq, &flags); 3422 task_rq_unlock(rq, &flags);
@@ -3248,7 +3445,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
3248 3445
3249 if (task_current(rq, p)) { 3446 if (task_current(rq, p)) {
3250 update_rq_clock(rq); 3447 update_rq_clock(rq);
3251 ns = rq->clock - p->se.exec_start; 3448 ns = rq->clock_task - p->se.exec_start;
3252 if ((s64)ns < 0) 3449 if ((s64)ns < 0)
3253 ns = 0; 3450 ns = 0;
3254 } 3451 }
@@ -3397,7 +3594,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3397 tmp = cputime_to_cputime64(cputime); 3594 tmp = cputime_to_cputime64(cputime);
3398 if (hardirq_count() - hardirq_offset) 3595 if (hardirq_count() - hardirq_offset)
3399 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3596 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3400 else if (softirq_count()) 3597 else if (in_serving_softirq())
3401 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3598 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3402 else 3599 else
3403 cpustat->system = cputime64_add(cpustat->system, tmp); 3600 cpustat->system = cputime64_add(cpustat->system, tmp);
@@ -3584,7 +3781,7 @@ void scheduler_tick(void)
3584 curr->sched_class->task_tick(rq, curr, 0); 3781 curr->sched_class->task_tick(rq, curr, 0);
3585 raw_spin_unlock(&rq->lock); 3782 raw_spin_unlock(&rq->lock);
3586 3783
3587 perf_event_task_tick(curr); 3784 perf_event_task_tick();
3588 3785
3589#ifdef CONFIG_SMP 3786#ifdef CONFIG_SMP
3590 rq->idle_at_tick = idle_cpu(cpu); 3787 rq->idle_at_tick = idle_cpu(cpu);
@@ -3690,7 +3887,7 @@ static inline void schedule_debug(struct task_struct *prev)
3690 schedstat_inc(this_rq(), sched_count); 3887 schedstat_inc(this_rq(), sched_count);
3691#ifdef CONFIG_SCHEDSTATS 3888#ifdef CONFIG_SCHEDSTATS
3692 if (unlikely(prev->lock_depth >= 0)) { 3889 if (unlikely(prev->lock_depth >= 0)) {
3693 schedstat_inc(this_rq(), bkl_count); 3890 schedstat_inc(this_rq(), rq_sched_info.bkl_count);
3694 schedstat_inc(prev, sched_info.bkl_count); 3891 schedstat_inc(prev, sched_info.bkl_count);
3695 } 3892 }
3696#endif 3893#endif
@@ -3700,7 +3897,6 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)
3700{ 3897{
3701 if (prev->se.on_rq) 3898 if (prev->se.on_rq)
3702 update_rq_clock(rq); 3899 update_rq_clock(rq);
3703 rq->skip_clock_update = 0;
3704 prev->sched_class->put_prev_task(rq, prev); 3900 prev->sched_class->put_prev_task(rq, prev);
3705} 3901}
3706 3902
@@ -3723,17 +3919,13 @@ pick_next_task(struct rq *rq)
3723 return p; 3919 return p;
3724 } 3920 }
3725 3921
3726 class = sched_class_highest; 3922 for_each_class(class) {
3727 for ( ; ; ) {
3728 p = class->pick_next_task(rq); 3923 p = class->pick_next_task(rq);
3729 if (p) 3924 if (p)
3730 return p; 3925 return p;
3731 /*
3732 * Will never be NULL as the idle class always
3733 * returns a non-NULL p:
3734 */
3735 class = class->next;
3736 } 3926 }
3927
3928 BUG(); /* the idle class will always have a runnable task */
3737} 3929}
3738 3930
3739/* 3931/*
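pick_next_task() now reuses for_each_class() (defined earlier, with sched_class_highest pointing at the new stop class), so the open-coded "idle class is always last" loop disappears. After macro expansion the walk looks like this, and because idle_sched_class always returns rq->idle, the trailing BUG() is unreachable in practice:

/* class chain in this kernel: stop -> rt -> fair -> idle */
for (class = sched_class_highest; class; class = class->next) {
	p = class->pick_next_task(rq);
	if (p)
		return p;	/* first class with a runnable task wins */
}
BUG();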
@@ -3762,7 +3954,6 @@ need_resched_nonpreemptible:
3762 hrtick_clear(rq); 3954 hrtick_clear(rq);
3763 3955
3764 raw_spin_lock_irq(&rq->lock); 3956 raw_spin_lock_irq(&rq->lock);
3765 clear_tsk_need_resched(prev);
3766 3957
3767 switch_count = &prev->nivcsw; 3958 switch_count = &prev->nivcsw;
3768 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3959 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@ -3794,6 +3985,8 @@ need_resched_nonpreemptible:
3794 3985
3795 put_prev_task(rq, prev); 3986 put_prev_task(rq, prev);
3796 next = pick_next_task(rq); 3987 next = pick_next_task(rq);
3988 clear_tsk_need_resched(prev);
3989 rq->skip_clock_update = 0;
3797 3990
3798 if (likely(prev != next)) { 3991 if (likely(prev != next)) {
3799 sched_info_switch(prev, next); 3992 sched_info_switch(prev, next);
@@ -3888,7 +4081,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3888 if (task_thread_info(rq->curr) != owner || need_resched()) 4081 if (task_thread_info(rq->curr) != owner || need_resched())
3889 return 0; 4082 return 0;
3890 4083
3891 cpu_relax(); 4084 arch_mutex_cpu_relax();
3892 } 4085 }
3893 4086
3894 return 1; 4087 return 1;
@@ -4200,7 +4393,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
4200 * This waits for either a completion of a specific task to be signaled or for a 4393 * This waits for either a completion of a specific task to be signaled or for a
4201 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 4394 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4202 */ 4395 */
4203unsigned long __sched 4396long __sched
4204wait_for_completion_interruptible_timeout(struct completion *x, 4397wait_for_completion_interruptible_timeout(struct completion *x,
4205 unsigned long timeout) 4398 unsigned long timeout)
4206{ 4399{
@@ -4233,7 +4426,7 @@ EXPORT_SYMBOL(wait_for_completion_killable);
4233 * signaled or for a specified timeout to expire. It can be 4426 * signaled or for a specified timeout to expire. It can be
4234 * interrupted by a kill signal. The timeout is in jiffies. 4427 * interrupted by a kill signal. The timeout is in jiffies.
4235 */ 4428 */
4236unsigned long __sched 4429long __sched
4237wait_for_completion_killable_timeout(struct completion *x, 4430wait_for_completion_killable_timeout(struct completion *x,
4238 unsigned long timeout) 4431 unsigned long timeout)
4239{ 4432{
@@ -4358,6 +4551,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4358 4551
4359 rq = task_rq_lock(p, &flags); 4552 rq = task_rq_lock(p, &flags);
4360 4553
4554 trace_sched_pi_setprio(p, prio);
4361 oldprio = p->prio; 4555 oldprio = p->prio;
4362 prev_class = p->sched_class; 4556 prev_class = p->sched_class;
4363 on_rq = p->se.on_rq; 4557 on_rq = p->se.on_rq;
@@ -4574,7 +4768,7 @@ static bool check_same_owner(struct task_struct *p)
4574} 4768}
4575 4769
4576static int __sched_setscheduler(struct task_struct *p, int policy, 4770static int __sched_setscheduler(struct task_struct *p, int policy,
4577 struct sched_param *param, bool user) 4771 const struct sched_param *param, bool user)
4578{ 4772{
4579 int retval, oldprio, oldpolicy = -1, on_rq, running; 4773 int retval, oldprio, oldpolicy = -1, on_rq, running;
4580 unsigned long flags; 4774 unsigned long flags;
@@ -4645,7 +4839,7 @@ recheck:
4645 } 4839 }
4646 4840
4647 if (user) { 4841 if (user) {
4648 retval = security_task_setscheduler(p, policy, param); 4842 retval = security_task_setscheduler(p);
4649 if (retval) 4843 if (retval)
4650 return retval; 4844 return retval;
4651 } 4845 }
@@ -4661,6 +4855,15 @@ recheck:
4661 */ 4855 */
4662 rq = __task_rq_lock(p); 4856 rq = __task_rq_lock(p);
4663 4857
4858 /*
4859 * Changing the policy of the stop threads its a very bad idea
4860 */
4861 if (p == rq->stop) {
4862 __task_rq_unlock(rq);
4863 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4864 return -EINVAL;
4865 }
4866
4664#ifdef CONFIG_RT_GROUP_SCHED 4867#ifdef CONFIG_RT_GROUP_SCHED
4665 if (user) { 4868 if (user) {
4666 /* 4869 /*
@@ -4668,7 +4871,8 @@ recheck:
4668 * assigned. 4871 * assigned.
4669 */ 4872 */
4670 if (rt_bandwidth_enabled() && rt_policy(policy) && 4873 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4671 task_group(p)->rt_bandwidth.rt_runtime == 0) { 4874 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
4875 !task_group_is_autogroup(task_group(p))) {
4672 __task_rq_unlock(rq); 4876 __task_rq_unlock(rq);
4673 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 4877 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4674 return -EPERM; 4878 return -EPERM;
@@ -4720,7 +4924,7 @@ recheck:
4720 * NOTE that the task may be already dead. 4924 * NOTE that the task may be already dead.
4721 */ 4925 */
4722int sched_setscheduler(struct task_struct *p, int policy, 4926int sched_setscheduler(struct task_struct *p, int policy,
4723 struct sched_param *param) 4927 const struct sched_param *param)
4724{ 4928{
4725 return __sched_setscheduler(p, policy, param, true); 4929 return __sched_setscheduler(p, policy, param, true);
4726} 4930}
@@ -4738,7 +4942,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
4738 * but our caller might not have that capability. 4942 * but our caller might not have that capability.
4739 */ 4943 */
4740int sched_setscheduler_nocheck(struct task_struct *p, int policy, 4944int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4741 struct sched_param *param) 4945 const struct sched_param *param)
4742{ 4946{
4743 return __sched_setscheduler(p, policy, param, false); 4947 return __sched_setscheduler(p, policy, param, false);
4744} 4948}
@@ -4887,13 +5091,13 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4887 if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) 5091 if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
4888 goto out_unlock; 5092 goto out_unlock;
4889 5093
4890 retval = security_task_setscheduler(p, 0, NULL); 5094 retval = security_task_setscheduler(p);
4891 if (retval) 5095 if (retval)
4892 goto out_unlock; 5096 goto out_unlock;
4893 5097
4894 cpuset_cpus_allowed(p, cpus_allowed); 5098 cpuset_cpus_allowed(p, cpus_allowed);
4895 cpumask_and(new_mask, in_mask, cpus_allowed); 5099 cpumask_and(new_mask, in_mask, cpus_allowed);
4896 again: 5100again:
4897 retval = set_cpus_allowed_ptr(p, new_mask); 5101 retval = set_cpus_allowed_ptr(p, new_mask);
4898 5102
4899 if (!retval) { 5103 if (!retval) {
@@ -5254,7 +5458,7 @@ void sched_show_task(struct task_struct *p)
5254 unsigned state; 5458 unsigned state;
5255 5459
5256 state = p->state ? __ffs(p->state) + 1 : 0; 5460 state = p->state ? __ffs(p->state) + 1 : 0;
5257 printk(KERN_INFO "%-13.13s %c", p->comm, 5461 printk(KERN_INFO "%-15.15s %c", p->comm,
5258 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 5462 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5259#if BITS_PER_LONG == 32 5463#if BITS_PER_LONG == 32
5260 if (state == TASK_RUNNING) 5464 if (state == TASK_RUNNING)
@@ -5337,7 +5541,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5337 idle->se.exec_start = sched_clock(); 5541 idle->se.exec_start = sched_clock();
5338 5542
5339 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); 5543 cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
5544 /*
5545 * We're having a chicken and egg problem, even though we are
5546 * holding rq->lock, the cpu isn't yet set to this cpu so the
5547 * lockdep check in task_group() will fail.
5548 *
5549 * Similar case to sched_fork(). / Alternatively we could
5550 * use task_rq_lock() here and obtain the other rq->lock.
5551 *
5552 * Silence PROVE_RCU
5553 */
5554 rcu_read_lock();
5340 __set_task_cpu(idle, cpu); 5555 __set_task_cpu(idle, cpu);
5556 rcu_read_unlock();
5341 5557
5342 rq->curr = rq->idle = idle; 5558 rq->curr = rq->idle = idle;
5343#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5559#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
@@ -5406,7 +5622,6 @@ static void update_sysctl(void)
5406 SET_SYSCTL(sched_min_granularity); 5622 SET_SYSCTL(sched_min_granularity);
5407 SET_SYSCTL(sched_latency); 5623 SET_SYSCTL(sched_latency);
5408 SET_SYSCTL(sched_wakeup_granularity); 5624 SET_SYSCTL(sched_wakeup_granularity);
5409 SET_SYSCTL(sched_shares_ratelimit);
5410#undef SET_SYSCTL 5625#undef SET_SYSCTL
5411} 5626}
5412 5627
@@ -5482,7 +5697,7 @@ again:
5482 goto out; 5697 goto out;
5483 5698
5484 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 5699 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5485 if (migrate_task(p, dest_cpu)) { 5700 if (migrate_task(p, rq)) {
5486 struct migration_arg arg = { p, dest_cpu }; 5701 struct migration_arg arg = { p, dest_cpu };
5487 /* Need help from migration thread: drop lock and wait. */ 5702 /* Need help from migration thread: drop lock and wait. */
5488 task_rq_unlock(rq, &flags); 5703 task_rq_unlock(rq, &flags);
@@ -5564,29 +5779,20 @@ static int migration_cpu_stop(void *data)
5564} 5779}
5565 5780
5566#ifdef CONFIG_HOTPLUG_CPU 5781#ifdef CONFIG_HOTPLUG_CPU
5782
5567/* 5783/*
5568 * Figure out where task on dead CPU should go, use force if necessary. 5784 * Ensures that the idle task is using init_mm right before its cpu goes
5785 * offline.
5569 */ 5786 */
5570void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) 5787void idle_task_exit(void)
5571{ 5788{
5572 struct rq *rq = cpu_rq(dead_cpu); 5789 struct mm_struct *mm = current->active_mm;
5573 int needs_cpu, uninitialized_var(dest_cpu);
5574 unsigned long flags;
5575 5790
5576 local_irq_save(flags); 5791 BUG_ON(cpu_online(smp_processor_id()));
5577 5792
5578 raw_spin_lock(&rq->lock); 5793 if (mm != &init_mm)
5579 needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); 5794 switch_mm(mm, &init_mm, current);
5580 if (needs_cpu) 5795 mmdrop(mm);
5581 dest_cpu = select_fallback_rq(dead_cpu, p);
5582 raw_spin_unlock(&rq->lock);
5583 /*
5584 * It can only fail if we race with set_cpus_allowed(),
5585 * in the racer should migrate the task anyway.
5586 */
5587 if (needs_cpu)
5588 __migrate_task(p, dead_cpu, dest_cpu);
5589 local_irq_restore(flags);
5590} 5796}
5591 5797
5592/* 5798/*
@@ -5599,128 +5805,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5599static void migrate_nr_uninterruptible(struct rq *rq_src) 5805static void migrate_nr_uninterruptible(struct rq *rq_src)
5600{ 5806{
5601 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); 5807 struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask));
5602 unsigned long flags;
5603 5808
5604 local_irq_save(flags);
5605 double_rq_lock(rq_src, rq_dest);
5606 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; 5809 rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
5607 rq_src->nr_uninterruptible = 0; 5810 rq_src->nr_uninterruptible = 0;
5608 double_rq_unlock(rq_src, rq_dest);
5609 local_irq_restore(flags);
5610}
5611
5612/* Run through task list and migrate tasks from the dead cpu. */
5613static void migrate_live_tasks(int src_cpu)
5614{
5615 struct task_struct *p, *t;
5616
5617 read_lock(&tasklist_lock);
5618
5619 do_each_thread(t, p) {
5620 if (p == current)
5621 continue;
5622
5623 if (task_cpu(p) == src_cpu)
5624 move_task_off_dead_cpu(src_cpu, p);
5625 } while_each_thread(t, p);
5626
5627 read_unlock(&tasklist_lock);
5628} 5811}
5629 5812
5630/* 5813/*
5631 * Schedules idle task to be the next runnable task on current CPU. 5814 * remove the tasks which were accounted by rq from calc_load_tasks.
5632 * It does so by boosting its priority to highest possible.
5633 * Used by CPU offline code.
5634 */ 5815 */
5635void sched_idle_next(void) 5816static void calc_global_load_remove(struct rq *rq)
5636{ 5817{
5637 int this_cpu = smp_processor_id(); 5818 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5638 struct rq *rq = cpu_rq(this_cpu); 5819 rq->calc_load_active = 0;
5639 struct task_struct *p = rq->idle;
5640 unsigned long flags;
5641
5642 /* cpu has to be offline */
5643 BUG_ON(cpu_online(this_cpu));
5644
5645 /*
5646 * Strictly not necessary since rest of the CPUs are stopped by now
5647 * and interrupts disabled on the current cpu.
5648 */
5649 raw_spin_lock_irqsave(&rq->lock, flags);
5650
5651 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5652
5653 activate_task(rq, p, 0);
5654
5655 raw_spin_unlock_irqrestore(&rq->lock, flags);
5656} 5820}
5657 5821
5658/* 5822/*
5659 * Ensures that the idle task is using init_mm right before its cpu goes 5823 * Migrate all tasks from the rq, sleeping tasks will be migrated by
5660 * offline. 5824 * try_to_wake_up()->select_task_rq().
5825 *
5826 * Called with rq->lock held even though we'er in stop_machine() and
5827 * there's no concurrency possible, we hold the required locks anyway
5828 * because of lock validation efforts.
5661 */ 5829 */
5662void idle_task_exit(void) 5830static void migrate_tasks(unsigned int dead_cpu)
5663{
5664 struct mm_struct *mm = current->active_mm;
5665
5666 BUG_ON(cpu_online(smp_processor_id()));
5667
5668 if (mm != &init_mm)
5669 switch_mm(mm, &init_mm, current);
5670 mmdrop(mm);
5671}
5672
5673/* called under rq->lock with disabled interrupts */
5674static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5675{ 5831{
5676 struct rq *rq = cpu_rq(dead_cpu); 5832 struct rq *rq = cpu_rq(dead_cpu);
5677 5833 struct task_struct *next, *stop = rq->stop;
5678 /* Must be exiting, otherwise would be on tasklist. */ 5834 int dest_cpu;
5679 BUG_ON(!p->exit_state);
5680
5681 /* Cannot have done final schedule yet: would have vanished. */
5682 BUG_ON(p->state == TASK_DEAD);
5683
5684 get_task_struct(p);
5685 5835
5686 /* 5836 /*
5687 * Drop lock around migration; if someone else moves it, 5837 * Fudge the rq selection such that the below task selection loop
5688 * that's OK. No task can be added to this CPU, so iteration is 5838 * doesn't get stuck on the currently eligible stop task.
5689 * fine. 5839 *
5840 * We're currently inside stop_machine() and the rq is either stuck
5841 * in the stop_machine_cpu_stop() loop, or we're executing this code,
5842 * either way we should never end up calling schedule() until we're
5843 * done here.
5690 */ 5844 */
5691 raw_spin_unlock_irq(&rq->lock); 5845 rq->stop = NULL;
5692 move_task_off_dead_cpu(dead_cpu, p);
5693 raw_spin_lock_irq(&rq->lock);
5694
5695 put_task_struct(p);
5696}
5697
5698/* release_task() removes task from tasklist, so we won't find dead tasks. */
5699static void migrate_dead_tasks(unsigned int dead_cpu)
5700{
5701 struct rq *rq = cpu_rq(dead_cpu);
5702 struct task_struct *next;
5703 5846
5704 for ( ; ; ) { 5847 for ( ; ; ) {
5705 if (!rq->nr_running) 5848 /*
5849 * There's this thread running, bail when that's the only
5850 * remaining thread.
5851 */
5852 if (rq->nr_running == 1)
5706 break; 5853 break;
5854
5707 next = pick_next_task(rq); 5855 next = pick_next_task(rq);
5708 if (!next) 5856 BUG_ON(!next);
5709 break;
5710 next->sched_class->put_prev_task(rq, next); 5857 next->sched_class->put_prev_task(rq, next);
5711 migrate_dead(dead_cpu, next);
5712 5858
5859 /* Find suitable destination for @next, with force if needed. */
5860 dest_cpu = select_fallback_rq(dead_cpu, next);
5861 raw_spin_unlock(&rq->lock);
5862
5863 __migrate_task(next, dead_cpu, dest_cpu);
5864
5865 raw_spin_lock(&rq->lock);
5713 } 5866 }
5714}
5715 5867
5716/* 5868 rq->stop = stop;
5717 * remove the tasks which were accounted by rq from calc_load_tasks.
5718 */
5719static void calc_global_load_remove(struct rq *rq)
5720{
5721 atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
5722 rq->calc_load_active = 0;
5723} 5869}
5870
5724#endif /* CONFIG_HOTPLUG_CPU */ 5871#endif /* CONFIG_HOTPLUG_CPU */
5725 5872
5726#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) 5873#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -5930,15 +6077,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5930 unsigned long flags; 6077 unsigned long flags;
5931 struct rq *rq = cpu_rq(cpu); 6078 struct rq *rq = cpu_rq(cpu);
5932 6079
5933 switch (action) { 6080 switch (action & ~CPU_TASKS_FROZEN) {
5934 6081
5935 case CPU_UP_PREPARE: 6082 case CPU_UP_PREPARE:
5936 case CPU_UP_PREPARE_FROZEN:
5937 rq->calc_load_update = calc_load_update; 6083 rq->calc_load_update = calc_load_update;
5938 break; 6084 break;
5939 6085
5940 case CPU_ONLINE: 6086 case CPU_ONLINE:
5941 case CPU_ONLINE_FROZEN:
5942 /* Update our root-domain */ 6087 /* Update our root-domain */
5943 raw_spin_lock_irqsave(&rq->lock, flags); 6088 raw_spin_lock_irqsave(&rq->lock, flags);
5944 if (rq->rd) { 6089 if (rq->rd) {
@@ -5950,30 +6095,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5950 break; 6095 break;
5951 6096
5952#ifdef CONFIG_HOTPLUG_CPU 6097#ifdef CONFIG_HOTPLUG_CPU
5953 case CPU_DEAD:
5954 case CPU_DEAD_FROZEN:
5955 migrate_live_tasks(cpu);
5956 /* Idle task back to normal (off runqueue, low prio) */
5957 raw_spin_lock_irq(&rq->lock);
5958 deactivate_task(rq, rq->idle, 0);
5959 __setscheduler(rq, rq->idle, SCHED_NORMAL, 0);
5960 rq->idle->sched_class = &idle_sched_class;
5961 migrate_dead_tasks(cpu);
5962 raw_spin_unlock_irq(&rq->lock);
5963 migrate_nr_uninterruptible(rq);
5964 BUG_ON(rq->nr_running != 0);
5965 calc_global_load_remove(rq);
5966 break;
5967
5968 case CPU_DYING: 6098 case CPU_DYING:
5969 case CPU_DYING_FROZEN:
5970 /* Update our root-domain */ 6099 /* Update our root-domain */
5971 raw_spin_lock_irqsave(&rq->lock, flags); 6100 raw_spin_lock_irqsave(&rq->lock, flags);
5972 if (rq->rd) { 6101 if (rq->rd) {
5973 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); 6102 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5974 set_rq_offline(rq); 6103 set_rq_offline(rq);
5975 } 6104 }
6105 migrate_tasks(cpu);
6106 BUG_ON(rq->nr_running != 1); /* the migration thread */
5976 raw_spin_unlock_irqrestore(&rq->lock, flags); 6107 raw_spin_unlock_irqrestore(&rq->lock, flags);
6108
6109 migrate_nr_uninterruptible(rq);
6110 calc_global_load_remove(rq);
5977 break; 6111 break;
5978#endif 6112#endif
5979 } 6113 }
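Masking off CPU_TASKS_FROZEN lets each case label in migration_call() cover both the normal and the suspend/resume (_FROZEN) variant of a hotplug event, since the _FROZEN codes are simply the base event with that bit ORed in. A tiny helper expressing the same idea:

static inline unsigned long hotplug_base_event(unsigned long action)
{
	/* e.g. CPU_UP_PREPARE_FROZEN -> CPU_UP_PREPARE,
	 *      CPU_ONLINE_FROZEN     -> CPU_ONLINE */
	return action & ~CPU_TASKS_FROZEN;
}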
@@ -6514,6 +6648,7 @@ struct s_data {
6514 cpumask_var_t nodemask; 6648 cpumask_var_t nodemask;
6515 cpumask_var_t this_sibling_map; 6649 cpumask_var_t this_sibling_map;
6516 cpumask_var_t this_core_map; 6650 cpumask_var_t this_core_map;
6651 cpumask_var_t this_book_map;
6517 cpumask_var_t send_covered; 6652 cpumask_var_t send_covered;
6518 cpumask_var_t tmpmask; 6653 cpumask_var_t tmpmask;
6519 struct sched_group **sched_group_nodes; 6654 struct sched_group **sched_group_nodes;
@@ -6525,6 +6660,7 @@ enum s_alloc {
6525 sa_rootdomain, 6660 sa_rootdomain,
6526 sa_tmpmask, 6661 sa_tmpmask,
6527 sa_send_covered, 6662 sa_send_covered,
6663 sa_this_book_map,
6528 sa_this_core_map, 6664 sa_this_core_map,
6529 sa_this_sibling_map, 6665 sa_this_sibling_map,
6530 sa_nodemask, 6666 sa_nodemask,
@@ -6560,31 +6696,48 @@ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
6560#ifdef CONFIG_SCHED_MC 6696#ifdef CONFIG_SCHED_MC
6561static DEFINE_PER_CPU(struct static_sched_domain, core_domains); 6697static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
6562static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); 6698static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
6563#endif /* CONFIG_SCHED_MC */
6564 6699
6565#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6566static int 6700static int
6567cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 6701cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
6568 struct sched_group **sg, struct cpumask *mask) 6702 struct sched_group **sg, struct cpumask *mask)
6569{ 6703{
6570 int group; 6704 int group;
6571 6705#ifdef CONFIG_SCHED_SMT
6572 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); 6706 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6573 group = cpumask_first(mask); 6707 group = cpumask_first(mask);
6708#else
6709 group = cpu;
6710#endif
6574 if (sg) 6711 if (sg)
6575 *sg = &per_cpu(sched_group_core, group).sg; 6712 *sg = &per_cpu(sched_group_core, group).sg;
6576 return group; 6713 return group;
6577} 6714}
6578#elif defined(CONFIG_SCHED_MC) 6715#endif /* CONFIG_SCHED_MC */
6716
6717/*
6718 * book sched-domains:
6719 */
6720#ifdef CONFIG_SCHED_BOOK
6721static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
6722static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
6723
6579static int 6724static int
6580cpu_to_core_group(int cpu, const struct cpumask *cpu_map, 6725cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
6581 struct sched_group **sg, struct cpumask *unused) 6726 struct sched_group **sg, struct cpumask *mask)
6582{ 6727{
6728 int group = cpu;
6729#ifdef CONFIG_SCHED_MC
6730 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6731 group = cpumask_first(mask);
6732#elif defined(CONFIG_SCHED_SMT)
6733 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6734 group = cpumask_first(mask);
6735#endif
6583 if (sg) 6736 if (sg)
6584 *sg = &per_cpu(sched_group_core, cpu).sg; 6737 *sg = &per_cpu(sched_group_book, group).sg;
6585 return cpu; 6738 return group;
6586} 6739}
6587#endif 6740#endif /* CONFIG_SCHED_BOOK */
6588 6741
6589static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 6742static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
6590static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); 6743static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
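The cpu_to_core_group()/cpu_to_book_group() helpers above all follow one pattern: the group representative is the lowest-numbered CPU that shares the given topology level with the CPU in question, restricted to the active cpu_map. A simplified sketch of that pattern, using plain 64-bit masks and an invented two-book topology:

	#include <stdio.h>
	#include <stdint.h>

	static int first_cpu(uint64_t mask)
	{
		return mask ? __builtin_ctzll(mask) : -1;
	}

	/* hypothetical topology: CPUs 0-3 on book 0, CPUs 4-7 on book 1 */
	static uint64_t book_mask(int cpu)
	{
		return (cpu < 4) ? 0x0fULL : 0xf0ULL;
	}

	static int cpu_to_book_group(int cpu, uint64_t cpu_map)
	{
		/* first active CPU sharing this book represents the group */
		return first_cpu(book_mask(cpu) & cpu_map);
	}

	int main(void)
	{
		uint64_t online = 0xffULL & ~(1ULL << 0);	/* CPU 0 offline */

		printf("cpu 2 -> group %d\n", cpu_to_book_group(2, online));	/* 1 */
		printf("cpu 6 -> group %d\n", cpu_to_book_group(6, online));	/* 4 */
		return 0;
	}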
@@ -6594,7 +6747,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
6594 struct sched_group **sg, struct cpumask *mask) 6747 struct sched_group **sg, struct cpumask *mask)
6595{ 6748{
6596 int group; 6749 int group;
6597#ifdef CONFIG_SCHED_MC 6750#ifdef CONFIG_SCHED_BOOK
6751 cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
6752 group = cpumask_first(mask);
6753#elif defined(CONFIG_SCHED_MC)
6598 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); 6754 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6599 group = cpumask_first(mask); 6755 group = cpumask_first(mask);
6600#elif defined(CONFIG_SCHED_SMT) 6756#elif defined(CONFIG_SCHED_SMT)
@@ -6790,6 +6946,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6790 if (cpu != group_first_cpu(sd->groups)) 6946 if (cpu != group_first_cpu(sd->groups))
6791 return; 6947 return;
6792 6948
6949 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
6950
6793 child = sd->child; 6951 child = sd->child;
6794 6952
6795 sd->groups->cpu_power = 0; 6953 sd->groups->cpu_power = 0;
@@ -6855,6 +7013,9 @@ SD_INIT_FUNC(CPU)
6855#ifdef CONFIG_SCHED_MC 7013#ifdef CONFIG_SCHED_MC
6856 SD_INIT_FUNC(MC) 7014 SD_INIT_FUNC(MC)
6857#endif 7015#endif
7016#ifdef CONFIG_SCHED_BOOK
7017 SD_INIT_FUNC(BOOK)
7018#endif
6858 7019
6859static int default_relax_domain_level = -1; 7020static int default_relax_domain_level = -1;
6860 7021
@@ -6904,6 +7065,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
6904 free_cpumask_var(d->tmpmask); /* fall through */ 7065 free_cpumask_var(d->tmpmask); /* fall through */
6905 case sa_send_covered: 7066 case sa_send_covered:
6906 free_cpumask_var(d->send_covered); /* fall through */ 7067 free_cpumask_var(d->send_covered); /* fall through */
7068 case sa_this_book_map:
7069 free_cpumask_var(d->this_book_map); /* fall through */
6907 case sa_this_core_map: 7070 case sa_this_core_map:
6908 free_cpumask_var(d->this_core_map); /* fall through */ 7071 free_cpumask_var(d->this_core_map); /* fall through */
6909 case sa_this_sibling_map: 7072 case sa_this_sibling_map:
@@ -6950,8 +7113,10 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
6950 return sa_nodemask; 7113 return sa_nodemask;
6951 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) 7114 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
6952 return sa_this_sibling_map; 7115 return sa_this_sibling_map;
6953 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) 7116 if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
6954 return sa_this_core_map; 7117 return sa_this_core_map;
7118 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
7119 return sa_this_book_map;
6955 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) 7120 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
6956 return sa_send_covered; 7121 return sa_send_covered;
6957 d->rd = alloc_rootdomain(); 7122 d->rd = alloc_rootdomain();
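The new sa_this_book_map entry slots into the allocate-forward / free-backward scheme shared by __visit_domain_allocation_hell() and __free_domain_allocs(): the enum value returned on failure names how far allocation got, and the free path falls through from that point. A self-contained sketch of the idiom with invented names:

	#include <stdlib.h>

	struct bufs { void *a, *b, *c; };

	enum progress { got_none, got_a, got_b, got_c };

	static void free_bufs(struct bufs *d, enum progress p)
	{
		switch (p) {
		case got_c: free(d->c);	/* fall through */
		case got_b: free(d->b);	/* fall through */
		case got_a: free(d->a);	/* fall through */
		case got_none: break;
		}
	}

	static enum progress alloc_bufs(struct bufs *d)
	{
		if (!(d->a = malloc(16)))
			return got_none;
		if (!(d->b = malloc(16)))
			return got_a;	/* caller frees a only */
		if (!(d->c = malloc(16)))
			return got_b;	/* caller frees a and b */
		return got_c;
	}

	int main(void)
	{
		struct bufs d = { 0 };
		enum progress p = alloc_bufs(&d);

		free_bufs(&d, p);	/* unwinds exactly what was allocated */
		return p == got_c ? 0 : 1;
	}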
@@ -7009,6 +7174,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
7009 return sd; 7174 return sd;
7010} 7175}
7011 7176
7177static struct sched_domain *__build_book_sched_domain(struct s_data *d,
7178 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7179 struct sched_domain *parent, int i)
7180{
7181 struct sched_domain *sd = parent;
7182#ifdef CONFIG_SCHED_BOOK
7183 sd = &per_cpu(book_domains, i).sd;
7184 SD_INIT(sd, BOOK);
7185 set_domain_attribute(sd, attr);
7186 cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
7187 sd->parent = parent;
7188 parent->child = sd;
7189 cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
7190#endif
7191 return sd;
7192}
7193
7012static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 7194static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
7013 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7195 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7014 struct sched_domain *parent, int i) 7196 struct sched_domain *parent, int i)
@@ -7066,6 +7248,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
7066 d->send_covered, d->tmpmask); 7248 d->send_covered, d->tmpmask);
7067 break; 7249 break;
7068#endif 7250#endif
7251#ifdef CONFIG_SCHED_BOOK
7252 case SD_LV_BOOK: /* set up book groups */
7253 cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
7254 if (cpu == cpumask_first(d->this_book_map))
7255 init_sched_build_groups(d->this_book_map, cpu_map,
7256 &cpu_to_book_group,
7257 d->send_covered, d->tmpmask);
7258 break;
7259#endif
7069 case SD_LV_CPU: /* set up physical groups */ 7260 case SD_LV_CPU: /* set up physical groups */
7070 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); 7261 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
7071 if (!cpumask_empty(d->nodemask)) 7262 if (!cpumask_empty(d->nodemask))
@@ -7113,12 +7304,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
7113 7304
7114 sd = __build_numa_sched_domains(&d, cpu_map, attr, i); 7305 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
7115 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); 7306 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
7307 sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
7116 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); 7308 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
7117 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); 7309 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
7118 } 7310 }
7119 7311
7120 for_each_cpu(i, cpu_map) { 7312 for_each_cpu(i, cpu_map) {
7121 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); 7313 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
7314 build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
7122 build_sched_groups(&d, SD_LV_MC, cpu_map, i); 7315 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
7123 } 7316 }
7124 7317
@@ -7149,6 +7342,12 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
7149 init_sched_groups_power(i, sd); 7342 init_sched_groups_power(i, sd);
7150 } 7343 }
7151#endif 7344#endif
7345#ifdef CONFIG_SCHED_BOOK
7346 for_each_cpu(i, cpu_map) {
7347 sd = &per_cpu(book_domains, i).sd;
7348 init_sched_groups_power(i, sd);
7349 }
7350#endif
7152 7351
7153 for_each_cpu(i, cpu_map) { 7352 for_each_cpu(i, cpu_map) {
7154 sd = &per_cpu(phys_domains, i).sd; 7353 sd = &per_cpu(phys_domains, i).sd;
@@ -7174,6 +7373,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
7174 sd = &per_cpu(cpu_domains, i).sd; 7373 sd = &per_cpu(cpu_domains, i).sd;
7175#elif defined(CONFIG_SCHED_MC) 7374#elif defined(CONFIG_SCHED_MC)
7176 sd = &per_cpu(core_domains, i).sd; 7375 sd = &per_cpu(core_domains, i).sd;
7376#elif defined(CONFIG_SCHED_BOOK)
7377 sd = &per_cpu(book_domains, i).sd;
7177#else 7378#else
7178 sd = &per_cpu(phys_domains, i).sd; 7379 sd = &per_cpu(phys_domains, i).sd;
7179#endif 7380#endif
@@ -7637,18 +7838,16 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7637 7838
7638#ifdef CONFIG_FAIR_GROUP_SCHED 7839#ifdef CONFIG_FAIR_GROUP_SCHED
7639static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, 7840static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7640 struct sched_entity *se, int cpu, int add, 7841 struct sched_entity *se, int cpu,
7641 struct sched_entity *parent) 7842 struct sched_entity *parent)
7642{ 7843{
7643 struct rq *rq = cpu_rq(cpu); 7844 struct rq *rq = cpu_rq(cpu);
7644 tg->cfs_rq[cpu] = cfs_rq; 7845 tg->cfs_rq[cpu] = cfs_rq;
7645 init_cfs_rq(cfs_rq, rq); 7846 init_cfs_rq(cfs_rq, rq);
7646 cfs_rq->tg = tg; 7847 cfs_rq->tg = tg;
7647 if (add)
7648 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7649 7848
7650 tg->se[cpu] = se; 7849 tg->se[cpu] = se;
7651 /* se could be NULL for init_task_group */ 7850 /* se could be NULL for root_task_group */
7652 if (!se) 7851 if (!se)
7653 return; 7852 return;
7654 7853
@@ -7658,15 +7857,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7658 se->cfs_rq = parent->my_q; 7857 se->cfs_rq = parent->my_q;
7659 7858
7660 se->my_q = cfs_rq; 7859 se->my_q = cfs_rq;
7661 se->load.weight = tg->shares; 7860 update_load_set(&se->load, 0);
7662 se->load.inv_weight = 0;
7663 se->parent = parent; 7861 se->parent = parent;
7664} 7862}
7665#endif 7863#endif
7666 7864
7667#ifdef CONFIG_RT_GROUP_SCHED 7865#ifdef CONFIG_RT_GROUP_SCHED
7668static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, 7866static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7669 struct sched_rt_entity *rt_se, int cpu, int add, 7867 struct sched_rt_entity *rt_se, int cpu,
7670 struct sched_rt_entity *parent) 7868 struct sched_rt_entity *parent)
7671{ 7869{
7672 struct rq *rq = cpu_rq(cpu); 7870 struct rq *rq = cpu_rq(cpu);
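init_tg_cfs_entry() now starts a group entity with weight 0 via update_load_set() instead of copying tg->shares, leaving the per-CPU weight to be recomputed later from group load tracking. A sketch of what an update_load_set()-style helper does, assuming a (weight, cached-inverse) pair; types and field values here are illustrative only:

	#include <stdio.h>
	#include <stdint.h>

	struct load_weight {
		unsigned long weight;
		uint32_t inv_weight;	/* cached 2^32 / weight, 0 = not computed */
	};

	static void update_load_set(struct load_weight *lw, unsigned long w)
	{
		lw->weight = w;
		lw->inv_weight = 0;	/* force the inverse to be recomputed */
	}

	int main(void)
	{
		struct load_weight lw = { 1024, 4194304 };	/* 2^32 / 1024 */

		update_load_set(&lw, 0);	/* group se starts at zero weight */
		printf("%lu %u\n", lw.weight, lw.inv_weight);
		return 0;
	}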
@@ -7675,8 +7873,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
7675 init_rt_rq(rt_rq, rq); 7873 init_rt_rq(rt_rq, rq);
7676 rt_rq->tg = tg; 7874 rt_rq->tg = tg;
7677 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; 7875 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
7678 if (add)
7679 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7680 7876
7681 tg->rt_se[cpu] = rt_se; 7877 tg->rt_se[cpu] = rt_se;
7682 if (!rt_se) 7878 if (!rt_se)
@@ -7711,18 +7907,18 @@ void __init sched_init(void)
7711 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); 7907 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
7712 7908
7713#ifdef CONFIG_FAIR_GROUP_SCHED 7909#ifdef CONFIG_FAIR_GROUP_SCHED
7714 init_task_group.se = (struct sched_entity **)ptr; 7910 root_task_group.se = (struct sched_entity **)ptr;
7715 ptr += nr_cpu_ids * sizeof(void **); 7911 ptr += nr_cpu_ids * sizeof(void **);
7716 7912
7717 init_task_group.cfs_rq = (struct cfs_rq **)ptr; 7913 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
7718 ptr += nr_cpu_ids * sizeof(void **); 7914 ptr += nr_cpu_ids * sizeof(void **);
7719 7915
7720#endif /* CONFIG_FAIR_GROUP_SCHED */ 7916#endif /* CONFIG_FAIR_GROUP_SCHED */
7721#ifdef CONFIG_RT_GROUP_SCHED 7917#ifdef CONFIG_RT_GROUP_SCHED
7722 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 7918 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
7723 ptr += nr_cpu_ids * sizeof(void **); 7919 ptr += nr_cpu_ids * sizeof(void **);
7724 7920
7725 init_task_group.rt_rq = (struct rt_rq **)ptr; 7921 root_task_group.rt_rq = (struct rt_rq **)ptr;
7726 ptr += nr_cpu_ids * sizeof(void **); 7922 ptr += nr_cpu_ids * sizeof(void **);
7727 7923
7728#endif /* CONFIG_RT_GROUP_SCHED */ 7924#endif /* CONFIG_RT_GROUP_SCHED */
@@ -7742,20 +7938,16 @@ void __init sched_init(void)
7742 global_rt_period(), global_rt_runtime()); 7938 global_rt_period(), global_rt_runtime());
7743 7939
7744#ifdef CONFIG_RT_GROUP_SCHED 7940#ifdef CONFIG_RT_GROUP_SCHED
7745 init_rt_bandwidth(&init_task_group.rt_bandwidth, 7941 init_rt_bandwidth(&root_task_group.rt_bandwidth,
7746 global_rt_period(), global_rt_runtime()); 7942 global_rt_period(), global_rt_runtime());
7747#endif /* CONFIG_RT_GROUP_SCHED */ 7943#endif /* CONFIG_RT_GROUP_SCHED */
7748 7944
7749#ifdef CONFIG_CGROUP_SCHED 7945#ifdef CONFIG_CGROUP_SCHED
7750 list_add(&init_task_group.list, &task_groups); 7946 list_add(&root_task_group.list, &task_groups);
7751 INIT_LIST_HEAD(&init_task_group.children); 7947 INIT_LIST_HEAD(&root_task_group.children);
7752 7948 autogroup_init(&init_task);
7753#endif /* CONFIG_CGROUP_SCHED */ 7949#endif /* CONFIG_CGROUP_SCHED */
7754 7950
7755#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
7756 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
7757 __alignof__(unsigned long));
7758#endif
7759 for_each_possible_cpu(i) { 7951 for_each_possible_cpu(i) {
7760 struct rq *rq; 7952 struct rq *rq;
7761 7953
@@ -7767,38 +7959,34 @@ void __init sched_init(void)
7767 init_cfs_rq(&rq->cfs, rq); 7959 init_cfs_rq(&rq->cfs, rq);
7768 init_rt_rq(&rq->rt, rq); 7960 init_rt_rq(&rq->rt, rq);
7769#ifdef CONFIG_FAIR_GROUP_SCHED 7961#ifdef CONFIG_FAIR_GROUP_SCHED
7770 init_task_group.shares = init_task_group_load; 7962 root_task_group.shares = root_task_group_load;
7771 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 7963 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7772#ifdef CONFIG_CGROUP_SCHED
7773 /* 7964 /*
7774 * How much cpu bandwidth does init_task_group get? 7965 * How much cpu bandwidth does root_task_group get?
7775 * 7966 *
7776 * In case of task-groups formed thr' the cgroup filesystem, it 7967 * In case of task-groups formed thr' the cgroup filesystem, it
7777 * gets 100% of the cpu resources in the system. This overall 7968 * gets 100% of the cpu resources in the system. This overall
7778 * system cpu resource is divided among the tasks of 7969 * system cpu resource is divided among the tasks of
7779 * init_task_group and its child task-groups in a fair manner, 7970 * root_task_group and its child task-groups in a fair manner,
7780 * based on each entity's (task or task-group's) weight 7971 * based on each entity's (task or task-group's) weight
7781 * (se->load.weight). 7972 * (se->load.weight).
7782 * 7973 *
7783 * In other words, if init_task_group has 10 tasks of weight 7974 * In other words, if root_task_group has 10 tasks of weight
7784 * 1024 and two child groups A0 and A1 (of weight 1024 each), 7975 * 1024 and two child groups A0 and A1 (of weight 1024 each),
7785 * then A0's share of the cpu resource is: 7976 * then A0's share of the cpu resource is:
7786 * 7977 *
7787 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% 7978 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
7788 * 7979 *
7789 * We achieve this by letting init_task_group's tasks sit 7980 * We achieve this by letting root_task_group's tasks sit
7790 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 7981 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
7791 */ 7982 */
7792 init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); 7983 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
7793#endif
7794#endif /* CONFIG_FAIR_GROUP_SCHED */ 7984#endif /* CONFIG_FAIR_GROUP_SCHED */
7795 7985
7796 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; 7986 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
7797#ifdef CONFIG_RT_GROUP_SCHED 7987#ifdef CONFIG_RT_GROUP_SCHED
7798 INIT_LIST_HEAD(&rq->leaf_rt_rq_list); 7988 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
7799#ifdef CONFIG_CGROUP_SCHED 7989 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
7800 init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL);
7801#endif
7802#endif 7990#endif
7803 7991
7804 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7992 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
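A quick check of the arithmetic in the comment above: with 10 tasks of weight 1024 sitting directly in root_task_group plus two child groups A0 and A1 of weight 1024 each, A0's share works out to 1024 / (10*1024 + 1024 + 1024), about 8.33%:

	#include <stdio.h>

	int main(void)
	{
		double total = 10 * 1024 + 1024 + 1024;	/* 12288 */

		printf("A0 share = %.2f%%\n", 100.0 * 1024 / total);	/* 8.33% */
		return 0;
	}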
@@ -7878,8 +8066,6 @@ void __init sched_init(void)
7878 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 8066 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
7879#endif /* SMP */ 8067#endif /* SMP */
7880 8068
7881 perf_event_init();
7882
7883 scheduler_running = 1; 8069 scheduler_running = 1;
7884} 8070}
7885 8071
@@ -8073,26 +8259,32 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8073 if (!se) 8259 if (!se)
8074 goto err_free_rq; 8260 goto err_free_rq;
8075 8261
8076 init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); 8262 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8077 } 8263 }
8078 8264
8079 return 1; 8265 return 1;
8080 8266
8081 err_free_rq: 8267err_free_rq:
8082 kfree(cfs_rq); 8268 kfree(cfs_rq);
8083 err: 8269err:
8084 return 0; 8270 return 0;
8085} 8271}
8086 8272
8087static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8088{
8089 list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list,
8090 &cpu_rq(cpu)->leaf_cfs_rq_list);
8091}
8092
8093static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8273static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8094{ 8274{
8095 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8275 struct rq *rq = cpu_rq(cpu);
8276 unsigned long flags;
8277
8278 /*
8279 * Only empty task groups can be destroyed; so we can speculatively
8280 * check on_list without danger of it being re-added.
8281 */
8282 if (!tg->cfs_rq[cpu]->on_list)
8283 return;
8284
8285 raw_spin_lock_irqsave(&rq->lock, flags);
8286 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8287 raw_spin_unlock_irqrestore(&rq->lock, flags);
8096} 8288}
8097#else /* !CONFIG_FAIR_GROUP_SCHED */ 8289
8098static inline void free_fair_sched_group(struct task_group *tg) 8290static inline void free_fair_sched_group(struct task_group *tg)
@@ -8105,10 +8297,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8105 return 1; 8297 return 1;
8106} 8298}
8107 8299
8108static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8109{
8110}
8111
8112static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8300static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8113{ 8301{
8114} 8302}
@@ -8163,27 +8351,16 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8163 if (!rt_se) 8351 if (!rt_se)
8164 goto err_free_rq; 8352 goto err_free_rq;
8165 8353
8166 init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); 8354 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8167 } 8355 }
8168 8356
8169 return 1; 8357 return 1;
8170 8358
8171 err_free_rq: 8359err_free_rq:
8172 kfree(rt_rq); 8360 kfree(rt_rq);
8173 err: 8361err:
8174 return 0; 8362 return 0;
8175} 8363}
8176
8177static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8178{
8179 list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list,
8180 &cpu_rq(cpu)->leaf_rt_rq_list);
8181}
8182
8183static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8184{
8185 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8186}
8187#else /* !CONFIG_RT_GROUP_SCHED */ 8364#else /* !CONFIG_RT_GROUP_SCHED */
8188static inline void free_rt_sched_group(struct task_group *tg) 8365static inline void free_rt_sched_group(struct task_group *tg)
8189{ 8366{
@@ -8194,14 +8371,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8194{ 8371{
8195 return 1; 8372 return 1;
8196} 8373}
8197
8198static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8199{
8200}
8201
8202static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8203{
8204}
8205#endif /* CONFIG_RT_GROUP_SCHED */ 8374#endif /* CONFIG_RT_GROUP_SCHED */
8206 8375
8207#ifdef CONFIG_CGROUP_SCHED 8376#ifdef CONFIG_CGROUP_SCHED
@@ -8209,6 +8378,7 @@ static void free_sched_group(struct task_group *tg)
8209{ 8378{
8210 free_fair_sched_group(tg); 8379 free_fair_sched_group(tg);
8211 free_rt_sched_group(tg); 8380 free_rt_sched_group(tg);
8381 autogroup_free(tg);
8212 kfree(tg); 8382 kfree(tg);
8213} 8383}
8214 8384
@@ -8217,7 +8387,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8217{ 8387{
8218 struct task_group *tg; 8388 struct task_group *tg;
8219 unsigned long flags; 8389 unsigned long flags;
8220 int i;
8221 8390
8222 tg = kzalloc(sizeof(*tg), GFP_KERNEL); 8391 tg = kzalloc(sizeof(*tg), GFP_KERNEL);
8223 if (!tg) 8392 if (!tg)
@@ -8230,10 +8399,6 @@ struct task_group *sched_create_group(struct task_group *parent)
8230 goto err; 8399 goto err;
8231 8400
8232 spin_lock_irqsave(&task_group_lock, flags); 8401 spin_lock_irqsave(&task_group_lock, flags);
8233 for_each_possible_cpu(i) {
8234 register_fair_sched_group(tg, i);
8235 register_rt_sched_group(tg, i);
8236 }
8237 list_add_rcu(&tg->list, &task_groups); 8402 list_add_rcu(&tg->list, &task_groups);
8238 8403
8239 WARN_ON(!parent); /* root should already exist */ 8404 WARN_ON(!parent); /* root should already exist */
@@ -8263,11 +8428,11 @@ void sched_destroy_group(struct task_group *tg)
8263 unsigned long flags; 8428 unsigned long flags;
8264 int i; 8429 int i;
8265 8430
8266 spin_lock_irqsave(&task_group_lock, flags); 8431 /* end participation in shares distribution */
8267 for_each_possible_cpu(i) { 8432 for_each_possible_cpu(i)
8268 unregister_fair_sched_group(tg, i); 8433 unregister_fair_sched_group(tg, i);
8269 unregister_rt_sched_group(tg, i); 8434
8270 } 8435 spin_lock_irqsave(&task_group_lock, flags);
8271 list_del_rcu(&tg->list); 8436 list_del_rcu(&tg->list);
8272 list_del_rcu(&tg->siblings); 8437 list_del_rcu(&tg->siblings);
8273 spin_unlock_irqrestore(&task_group_lock, flags); 8438 spin_unlock_irqrestore(&task_group_lock, flags);
@@ -8297,12 +8462,12 @@ void sched_move_task(struct task_struct *tsk)
8297 if (unlikely(running)) 8462 if (unlikely(running))
8298 tsk->sched_class->put_prev_task(rq, tsk); 8463 tsk->sched_class->put_prev_task(rq, tsk);
8299 8464
8300 set_task_rq(tsk, task_cpu(tsk));
8301
8302#ifdef CONFIG_FAIR_GROUP_SCHED 8465#ifdef CONFIG_FAIR_GROUP_SCHED
8303 if (tsk->sched_class->moved_group) 8466 if (tsk->sched_class->task_move_group)
8304 tsk->sched_class->moved_group(tsk, on_rq); 8467 tsk->sched_class->task_move_group(tsk, on_rq);
8468 else
8305#endif 8469#endif
8470 set_task_rq(tsk, task_cpu(tsk));
8306 8471
8307 if (unlikely(running)) 8472 if (unlikely(running))
8308 tsk->sched_class->set_curr_task(rq); 8473 tsk->sched_class->set_curr_task(rq);
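sched_move_task() now dispatches to the class's optional task_move_group() hook when one exists and only falls back to the generic set_task_rq() update otherwise. A sketch of that hook-with-fallback shape, with invented structures:

	#include <stdio.h>

	struct task;

	struct sched_class {
		void (*task_move_group)(struct task *t, int on_rq);
	};

	struct task {
		const struct sched_class *cls;
		int group;
	};

	static void set_task_group(struct task *t) { t->group = 1; }

	static void fair_task_move_group(struct task *t, int on_rq)
	{
		(void)on_rq;
		set_task_group(t);	/* class-specific variant of the update */
	}

	static void move_task(struct task *t, int on_rq)
	{
		if (t->cls->task_move_group)
			t->cls->task_move_group(t, on_rq);	/* class hook */
		else
			set_task_group(t);			/* generic fallback */
	}

	int main(void)
	{
		struct sched_class fair = { fair_task_move_group };
		struct sched_class idle = { NULL };
		struct task a = { &fair, 0 }, b = { &idle, 0 };

		move_task(&a, 1);
		move_task(&b, 0);
		printf("%d %d\n", a.group, b.group);
		return 0;
	}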
@@ -8314,33 +8479,6 @@ void sched_move_task(struct task_struct *tsk)
8314#endif /* CONFIG_CGROUP_SCHED */ 8479#endif /* CONFIG_CGROUP_SCHED */
8315 8480
8316#ifdef CONFIG_FAIR_GROUP_SCHED 8481#ifdef CONFIG_FAIR_GROUP_SCHED
8317static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8318{
8319 struct cfs_rq *cfs_rq = se->cfs_rq;
8320 int on_rq;
8321
8322 on_rq = se->on_rq;
8323 if (on_rq)
8324 dequeue_entity(cfs_rq, se, 0);
8325
8326 se->load.weight = shares;
8327 se->load.inv_weight = 0;
8328
8329 if (on_rq)
8330 enqueue_entity(cfs_rq, se, 0);
8331}
8332
8333static void set_se_shares(struct sched_entity *se, unsigned long shares)
8334{
8335 struct cfs_rq *cfs_rq = se->cfs_rq;
8336 struct rq *rq = cfs_rq->rq;
8337 unsigned long flags;
8338
8339 raw_spin_lock_irqsave(&rq->lock, flags);
8340 __set_se_shares(se, shares);
8341 raw_spin_unlock_irqrestore(&rq->lock, flags);
8342}
8343
8344static DEFINE_MUTEX(shares_mutex); 8482static DEFINE_MUTEX(shares_mutex);
8345 8483
8346int sched_group_set_shares(struct task_group *tg, unsigned long shares) 8484int sched_group_set_shares(struct task_group *tg, unsigned long shares)
@@ -8363,37 +8501,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8363 if (tg->shares == shares) 8501 if (tg->shares == shares)
8364 goto done; 8502 goto done;
8365 8503
8366 spin_lock_irqsave(&task_group_lock, flags);
8367 for_each_possible_cpu(i)
8368 unregister_fair_sched_group(tg, i);
8369 list_del_rcu(&tg->siblings);
8370 spin_unlock_irqrestore(&task_group_lock, flags);
8371
8372 /* wait for any ongoing reference to this group to finish */
8373 synchronize_sched();
8374
8375 /*
8376 * Now we are free to modify the group's share on each cpu
8377 * w/o tripping rebalance_share or load_balance_fair.
8378 */
8379 tg->shares = shares; 8504 tg->shares = shares;
8380 for_each_possible_cpu(i) { 8505 for_each_possible_cpu(i) {
8381 /* 8506 struct rq *rq = cpu_rq(i);
8382 * force a rebalance 8507 struct sched_entity *se;
8383 */ 8508
8384 cfs_rq_set_shares(tg->cfs_rq[i], 0); 8509 se = tg->se[i];
8385 set_se_shares(tg->se[i], shares); 8510 /* Propagate contribution to hierarchy */
8511 raw_spin_lock_irqsave(&rq->lock, flags);
8512 for_each_sched_entity(se)
8513 update_cfs_shares(group_cfs_rq(se), 0);
8514 raw_spin_unlock_irqrestore(&rq->lock, flags);
8386 } 8515 }
8387 8516
8388 /*
8389 * Enable load balance activity on this group, by inserting it back on
8390 * each cpu's rq->leaf_cfs_rq_list.
8391 */
8392 spin_lock_irqsave(&task_group_lock, flags);
8393 for_each_possible_cpu(i)
8394 register_fair_sched_group(tg, i);
8395 list_add_rcu(&tg->siblings, &tg->parent->children);
8396 spin_unlock_irqrestore(&task_group_lock, flags);
8397done: 8517done:
8398 mutex_unlock(&shares_mutex); 8518 mutex_unlock(&shares_mutex);
8399 return 0; 8519 return 0;
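The rewritten sched_group_set_shares() no longer unregisters and re-registers the group; it stores the new shares value and, per CPU, walks the group entity and its ancestors under that CPU's runqueue lock so the change propagates up the hierarchy. The sketch below shows only that "walk every ancestor while locked" shape, not the actual share arithmetic; the entity type and update are invented:

	#include <stdio.h>

	struct entity {
		const char *name;
		struct entity *parent;
		unsigned long shares;
	};

	static void update_shares(struct entity *e, unsigned long shares)
	{
		e->shares = shares;	/* stand-in for the real recomputation */
	}

	static void propagate_up(struct entity *se, unsigned long shares)
	{
		for (; se; se = se->parent)	/* like for_each_sched_entity(se) */
			update_shares(se, shares);
	}

	int main(void)
	{
		struct entity root = { "root", NULL, 1024 };
		struct entity mid  = { "mid",  &root, 1024 };
		struct entity leaf = { "leaf", &mid,  1024 };

		propagate_up(&leaf, 2048);
		printf("%lu %lu %lu\n", leaf.shares, mid.shares, root.shares);
		return 0;
	}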
@@ -8528,7 +8648,7 @@ static int tg_set_bandwidth(struct task_group *tg,
8528 raw_spin_unlock(&rt_rq->rt_runtime_lock); 8648 raw_spin_unlock(&rt_rq->rt_runtime_lock);
8529 } 8649 }
8530 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8650 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8531 unlock: 8651unlock:
8532 read_unlock(&tasklist_lock); 8652 read_unlock(&tasklist_lock);
8533 mutex_unlock(&rt_constraints_mutex); 8653 mutex_unlock(&rt_constraints_mutex);
8534 8654
@@ -8692,7 +8812,7 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8692 8812
8693 if (!cgrp->parent) { 8813 if (!cgrp->parent) {
8694 /* This is early initialization for the top cgroup */ 8814 /* This is early initialization for the top cgroup */
8695 return &init_task_group.css; 8815 return &root_task_group.css;
8696 } 8816 }
8697 8817
8698 parent = cgroup_tg(cgrp->parent); 8818 parent = cgroup_tg(cgrp->parent);
@@ -8763,6 +8883,20 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
8763 } 8883 }
8764} 8884}
8765 8885
8886static void
8887cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task)
8888{
8889 /*
8890 * cgroup_exit() is called in the copy_process() failure path.
8891 * Ignore this case since the task hasn't run yet; this avoids
8892 * trying to poke a half freed task state from generic code.
8893 */
8894 if (!(task->flags & PF_EXITING))
8895 return;
8896
8897 sched_move_task(task);
8898}
8899
8766#ifdef CONFIG_FAIR_GROUP_SCHED 8900#ifdef CONFIG_FAIR_GROUP_SCHED
8767static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 8901static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
8768 u64 shareval) 8902 u64 shareval)
@@ -8835,6 +8969,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
8835 .destroy = cpu_cgroup_destroy, 8969 .destroy = cpu_cgroup_destroy,
8836 .can_attach = cpu_cgroup_can_attach, 8970 .can_attach = cpu_cgroup_can_attach,
8837 .attach = cpu_cgroup_attach, 8971 .attach = cpu_cgroup_attach,
8972 .exit = cpu_cgroup_exit,
8838 .populate = cpu_cgroup_populate, 8973 .populate = cpu_cgroup_populate,
8839 .subsys_id = cpu_cgroup_subsys_id, 8974 .subsys_id = cpu_cgroup_subsys_id,
8840 .early_init = 1, 8975 .early_init = 1,
@@ -9119,72 +9254,3 @@ struct cgroup_subsys cpuacct_subsys = {
9119}; 9254};
9120#endif /* CONFIG_CGROUP_CPUACCT */ 9255#endif /* CONFIG_CGROUP_CPUACCT */
9121 9256
9122#ifndef CONFIG_SMP
9123
9124void synchronize_sched_expedited(void)
9125{
9126 barrier();
9127}
9128EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9129
9130#else /* #ifndef CONFIG_SMP */
9131
9132static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
9133
9134static int synchronize_sched_expedited_cpu_stop(void *data)
9135{
9136 /*
9137 * There must be a full memory barrier on each affected CPU
9138 * between the time that try_stop_cpus() is called and the
9139 * time that it returns.
9140 *
9141 * In the current initial implementation of cpu_stop, the
9142 * above condition is already met when the control reaches
9143 * this point and the following smp_mb() is not strictly
9144 * necessary. Do smp_mb() anyway for documentation and
9145 * robustness against future implementation changes.
9146 */
9147 smp_mb(); /* See above comment block. */
9148 return 0;
9149}
9150
9151/*
9152 * Wait for an rcu-sched grace period to elapse, but use "big hammer"
9153 * approach to force grace period to end quickly. This consumes
9154 * significant time on all CPUs, and is thus not recommended for
9155 * any sort of common-case code.
9156 *
9157 * Note that it is illegal to call this function while holding any
9158 * lock that is acquired by a CPU-hotplug notifier. Failing to
9159 * observe this restriction will result in deadlock.
9160 */
9161void synchronize_sched_expedited(void)
9162{
9163 int snap, trycount = 0;
9164
9165 smp_mb(); /* ensure prior mod happens before capturing snap. */
9166 snap = atomic_read(&synchronize_sched_expedited_count) + 1;
9167 get_online_cpus();
9168 while (try_stop_cpus(cpu_online_mask,
9169 synchronize_sched_expedited_cpu_stop,
9170 NULL) == -EAGAIN) {
9171 put_online_cpus();
9172 if (trycount++ < 10)
9173 udelay(trycount * num_online_cpus());
9174 else {
9175 synchronize_sched();
9176 return;
9177 }
9178 if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
9179 smp_mb(); /* ensure test happens before caller kfree */
9180 return;
9181 }
9182 get_online_cpus();
9183 }
9184 atomic_inc(&synchronize_sched_expedited_count);
9185 smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
9186 put_online_cpus();
9187}
9188EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
9189
9190#endif /* #else #ifndef CONFIG_SMP */