Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c | 1278
1 file changed, 672 insertions(+), 606 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index dc85ceb90832..18d38e4ec7ba 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,9 +75,11 @@ | |||
75 | 75 | ||
76 | #include <asm/tlb.h> | 76 | #include <asm/tlb.h> |
77 | #include <asm/irq_regs.h> | 77 | #include <asm/irq_regs.h> |
78 | #include <asm/mutex.h> | ||
78 | 79 | ||
79 | #include "sched_cpupri.h" | 80 | #include "sched_cpupri.h" |
80 | #include "workqueue_sched.h" | 81 | #include "workqueue_sched.h" |
82 | #include "sched_autogroup.h" | ||
81 | 83 | ||
82 | #define CREATE_TRACE_POINTS | 84 | #define CREATE_TRACE_POINTS |
83 | #include <trace/events/sched.h> | 85 | #include <trace/events/sched.h> |
@@ -253,6 +255,8 @@ struct task_group { | |||
253 | /* runqueue "owned" by this group on each cpu */ | 255 | /* runqueue "owned" by this group on each cpu */ |
254 | struct cfs_rq **cfs_rq; | 256 | struct cfs_rq **cfs_rq; |
255 | unsigned long shares; | 257 | unsigned long shares; |
258 | |||
259 | atomic_t load_weight; | ||
256 | #endif | 260 | #endif |
257 | 261 | ||
258 | #ifdef CONFIG_RT_GROUP_SCHED | 262 | #ifdef CONFIG_RT_GROUP_SCHED |
@@ -268,25 +272,18 @@ struct task_group { | |||
268 | struct task_group *parent; | 272 | struct task_group *parent; |
269 | struct list_head siblings; | 273 | struct list_head siblings; |
270 | struct list_head children; | 274 | struct list_head children; |
271 | }; | ||
272 | 275 | ||
273 | #define root_task_group init_task_group | 276 | #ifdef CONFIG_SCHED_AUTOGROUP |
277 | struct autogroup *autogroup; | ||
278 | #endif | ||
279 | }; | ||
274 | 280 | ||
275 | /* task_group_lock serializes add/remove of task groups and also changes to | 281 | /* task_group_lock serializes the addition/removal of task groups */ |
276 | * a task group's cpu shares. | ||
277 | */ | ||
278 | static DEFINE_SPINLOCK(task_group_lock); | 282 | static DEFINE_SPINLOCK(task_group_lock); |
279 | 283 | ||
280 | #ifdef CONFIG_FAIR_GROUP_SCHED | 284 | #ifdef CONFIG_FAIR_GROUP_SCHED |
281 | 285 | ||
282 | #ifdef CONFIG_SMP | 286 | # define ROOT_TASK_GROUP_LOAD NICE_0_LOAD |
283 | static int root_task_group_empty(void) | ||
284 | { | ||
285 | return list_empty(&root_task_group.children); | ||
286 | } | ||
287 | #endif | ||
288 | |||
289 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD | ||
290 | 287 | ||
291 | /* | 288 | /* |
292 | * A weight of 0 or 1 can cause arithmetics problems. | 289 | * A weight of 0 or 1 can cause arithmetics problems. |
@@ -299,13 +296,13 @@ static int root_task_group_empty(void) | |||
299 | #define MIN_SHARES 2 | 296 | #define MIN_SHARES 2 |
300 | #define MAX_SHARES (1UL << 18) | 297 | #define MAX_SHARES (1UL << 18) |
301 | 298 | ||
302 | static int init_task_group_load = INIT_TASK_GROUP_LOAD; | 299 | static int root_task_group_load = ROOT_TASK_GROUP_LOAD; |
303 | #endif | 300 | #endif |
304 | 301 | ||
305 | /* Default task group. | 302 | /* Default task group. |
306 | * Every task in system belong to this group at bootup. | 303 | * Every task in system belong to this group at bootup. |
307 | */ | 304 | */ |
308 | struct task_group init_task_group; | 305 | struct task_group root_task_group; |
309 | 306 | ||
310 | #endif /* CONFIG_CGROUP_SCHED */ | 307 | #endif /* CONFIG_CGROUP_SCHED */ |
311 | 308 | ||
@@ -342,6 +339,7 @@ struct cfs_rq { | |||
342 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This | 339 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This |
343 | * list is used during load balance. | 340 | * list is used during load balance. |
344 | */ | 341 | */ |
342 | int on_list; | ||
345 | struct list_head leaf_cfs_rq_list; | 343 | struct list_head leaf_cfs_rq_list; |
346 | struct task_group *tg; /* group that "owns" this runqueue */ | 344 | struct task_group *tg; /* group that "owns" this runqueue */ |
347 | 345 | ||
@@ -360,14 +358,17 @@ struct cfs_rq { | |||
360 | unsigned long h_load; | 358 | unsigned long h_load; |
361 | 359 | ||
362 | /* | 360 | /* |
363 | * this cpu's part of tg->shares | 361 | * Maintaining per-cpu shares distribution for group scheduling |
362 | * | ||
363 | * load_stamp is the last time we updated the load average | ||
364 | * load_last is the last time we updated the load average and saw load | ||
365 | * load_unacc_exec_time is currently unaccounted execution time | ||
364 | */ | 366 | */ |
365 | unsigned long shares; | 367 | u64 load_avg; |
368 | u64 load_period; | ||
369 | u64 load_stamp, load_last, load_unacc_exec_time; | ||
366 | 370 | ||
367 | /* | 371 | unsigned long load_contribution; |
368 | * load.weight at the time we set shares | ||
369 | */ | ||
370 | unsigned long rq_weight; | ||
371 | #endif | 372 | #endif |
372 | #endif | 373 | #endif |
373 | }; | 374 | }; |
@@ -426,9 +427,7 @@ struct root_domain { | |||
426 | */ | 427 | */ |
427 | cpumask_var_t rto_mask; | 428 | cpumask_var_t rto_mask; |
428 | atomic_t rto_count; | 429 | atomic_t rto_count; |
429 | #ifdef CONFIG_SMP | ||
430 | struct cpupri cpupri; | 430 | struct cpupri cpupri; |
431 | #endif | ||
432 | }; | 431 | }; |
433 | 432 | ||
434 | /* | 433 | /* |
@@ -437,7 +436,7 @@ struct root_domain { | |||
437 | */ | 436 | */ |
438 | static struct root_domain def_root_domain; | 437 | static struct root_domain def_root_domain; |
439 | 438 | ||
440 | #endif | 439 | #endif /* CONFIG_SMP */ |
441 | 440 | ||
442 | /* | 441 | /* |
443 | * This is the main, per-CPU runqueue data structure. | 442 | * This is the main, per-CPU runqueue data structure. |
@@ -488,11 +487,12 @@ struct rq { | |||
488 | */ | 487 | */ |
489 | unsigned long nr_uninterruptible; | 488 | unsigned long nr_uninterruptible; |
490 | 489 | ||
491 | struct task_struct *curr, *idle; | 490 | struct task_struct *curr, *idle, *stop; |
492 | unsigned long next_balance; | 491 | unsigned long next_balance; |
493 | struct mm_struct *prev_mm; | 492 | struct mm_struct *prev_mm; |
494 | 493 | ||
495 | u64 clock; | 494 | u64 clock; |
495 | u64 clock_task; | ||
496 | 496 | ||
497 | atomic_t nr_iowait; | 497 | atomic_t nr_iowait; |
498 | 498 | ||
@@ -520,6 +520,10 @@ struct rq { | |||
520 | u64 avg_idle; | 520 | u64 avg_idle; |
521 | #endif | 521 | #endif |
522 | 522 | ||
523 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
524 | u64 prev_irq_time; | ||
525 | #endif | ||
526 | |||
523 | /* calc_load related fields */ | 527 | /* calc_load related fields */ |
524 | unsigned long calc_load_update; | 528 | unsigned long calc_load_update; |
525 | long calc_load_active; | 529 | long calc_load_active; |
@@ -549,26 +553,13 @@ struct rq { | |||
549 | /* try_to_wake_up() stats */ | 553 | /* try_to_wake_up() stats */ |
550 | unsigned int ttwu_count; | 554 | unsigned int ttwu_count; |
551 | unsigned int ttwu_local; | 555 | unsigned int ttwu_local; |
552 | |||
553 | /* BKL stats */ | ||
554 | unsigned int bkl_count; | ||
555 | #endif | 556 | #endif |
556 | }; | 557 | }; |
557 | 558 | ||
558 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 559 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
559 | 560 | ||
560 | static inline | ||
561 | void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | ||
562 | { | ||
563 | rq->curr->sched_class->check_preempt_curr(rq, p, flags); | ||
564 | 561 | ||
565 | /* | 562 | static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); |
566 | * A queue event has occurred, and we're going to schedule. In | ||
567 | * this case, we can save a useless back to back clock update. | ||
568 | */ | ||
569 | if (test_tsk_need_resched(p)) | ||
570 | rq->skip_clock_update = 1; | ||
571 | } | ||
572 | 563 | ||
573 | static inline int cpu_of(struct rq *rq) | 564 | static inline int cpu_of(struct rq *rq) |
574 | { | 565 | { |
@@ -612,11 +603,17 @@ static inline int cpu_of(struct rq *rq) | |||
612 | */ | 603 | */ |
613 | static inline struct task_group *task_group(struct task_struct *p) | 604 | static inline struct task_group *task_group(struct task_struct *p) |
614 | { | 605 | { |
606 | struct task_group *tg; | ||
615 | struct cgroup_subsys_state *css; | 607 | struct cgroup_subsys_state *css; |
616 | 608 | ||
609 | if (p->flags & PF_EXITING) | ||
610 | return &root_task_group; | ||
611 | |||
617 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, | 612 | css = task_subsys_state_check(p, cpu_cgroup_subsys_id, |
618 | lockdep_is_held(&task_rq(p)->lock)); | 613 | lockdep_is_held(&task_rq(p)->lock)); |
619 | return container_of(css, struct task_group, css); | 614 | tg = container_of(css, struct task_group, css); |
615 | |||
616 | return autogroup_task_group(p, tg); | ||
620 | } | 617 | } |
621 | 618 | ||
622 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ | 619 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ |
@@ -643,10 +640,18 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
643 | 640 | ||
644 | #endif /* CONFIG_CGROUP_SCHED */ | 641 | #endif /* CONFIG_CGROUP_SCHED */ |
645 | 642 | ||
646 | inline void update_rq_clock(struct rq *rq) | 643 | static void update_rq_clock_task(struct rq *rq, s64 delta); |
644 | |||
645 | static void update_rq_clock(struct rq *rq) | ||
647 | { | 646 | { |
648 | if (!rq->skip_clock_update) | 647 | s64 delta; |
649 | rq->clock = sched_clock_cpu(cpu_of(rq)); | 648 | |
649 | if (rq->skip_clock_update) | ||
650 | return; | ||
651 | |||
652 | delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; | ||
653 | rq->clock += delta; | ||
654 | update_rq_clock_task(rq, delta); | ||
650 | } | 655 | } |
651 | 656 | ||
652 | /* | 657 | /* |
@@ -723,7 +728,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
723 | size_t cnt, loff_t *ppos) | 728 | size_t cnt, loff_t *ppos) |
724 | { | 729 | { |
725 | char buf[64]; | 730 | char buf[64]; |
726 | char *cmp = buf; | 731 | char *cmp; |
727 | int neg = 0; | 732 | int neg = 0; |
728 | int i; | 733 | int i; |
729 | 734 | ||
@@ -734,16 +739,15 @@ sched_feat_write(struct file *filp, const char __user *ubuf, | |||
734 | return -EFAULT; | 739 | return -EFAULT; |
735 | 740 | ||
736 | buf[cnt] = 0; | 741 | buf[cnt] = 0; |
742 | cmp = strstrip(buf); | ||
737 | 743 | ||
738 | if (strncmp(buf, "NO_", 3) == 0) { | 744 | if (strncmp(cmp, "NO_", 3) == 0) { |
739 | neg = 1; | 745 | neg = 1; |
740 | cmp += 3; | 746 | cmp += 3; |
741 | } | 747 | } |
742 | 748 | ||
743 | for (i = 0; sched_feat_names[i]; i++) { | 749 | for (i = 0; sched_feat_names[i]; i++) { |
744 | int len = strlen(sched_feat_names[i]); | 750 | if (strcmp(cmp, sched_feat_names[i]) == 0) { |
745 | |||
746 | if (strncmp(cmp, sched_feat_names[i], len) == 0) { | ||
747 | if (neg) | 751 | if (neg) |
748 | sysctl_sched_features &= ~(1UL << i); | 752 | sysctl_sched_features &= ~(1UL << i); |
749 | else | 753 | else |
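The parsing change above swaps the old prefix-length strncmp() match for strstrip() followed by an exact strcmp(), so trailing whitespace from `echo` and feature names that are prefixes of longer names no longer toggle the wrong bit. A minimal user-space sketch of the same parse (the feature table and helper names here are invented for illustration, not taken from the kernel):

#include <stdio.h>
#include <string.h>

/* Hypothetical feature table; the real names live in sched_features.h. */
static const char * const feat_names[] = { "GENTLE_FAIR_SLEEPERS", "START_DEBIT", NULL };
static unsigned long feat_mask = 3;     /* both features start enabled */

/* Parse "NAME" or "NO_NAME" (possibly with trailing newline) and flip the bit. */
static int set_feature(char *buf)
{
        char *cmp = buf + strspn(buf, " \t");   /* crude strstrip(): skip leading blanks */
        cmp[strcspn(cmp, " \t\n")] = '\0';      /* ... and cut at the first trailing blank */
        int neg = 0;
        int i;

        if (strncmp(cmp, "NO_", 3) == 0) {
                neg = 1;
                cmp += 3;
        }
        for (i = 0; feat_names[i]; i++) {
                if (strcmp(cmp, feat_names[i]) == 0) {  /* exact match, not a prefix match */
                        if (neg)
                                feat_mask &= ~(1UL << i);
                        else
                                feat_mask |= 1UL << i;
                        return 0;
                }
        }
        return -1;
}

int main(void)
{
        char buf[] = "NO_START_DEBIT\n";
        printf("rc=%d mask=%lx\n", set_feature(buf), feat_mask);  /* rc=0 mask=1 */
        return 0;
}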
@@ -793,20 +797,6 @@ late_initcall(sched_init_debug); | |||
793 | const_debug unsigned int sysctl_sched_nr_migrate = 32; | 797 | const_debug unsigned int sysctl_sched_nr_migrate = 32; |
794 | 798 | ||
795 | /* | 799 | /* |
796 | * ratelimit for updating the group shares. | ||
797 | * default: 0.25ms | ||
798 | */ | ||
799 | unsigned int sysctl_sched_shares_ratelimit = 250000; | ||
800 | unsigned int normalized_sysctl_sched_shares_ratelimit = 250000; | ||
801 | |||
802 | /* | ||
803 | * Inject some fuzzyness into changing the per-cpu group shares | ||
804 | * this avoids remote rq-locks at the expense of fairness. | ||
805 | * default: 4 | ||
806 | */ | ||
807 | unsigned int sysctl_sched_shares_thresh = 4; | ||
808 | |||
809 | /* | ||
810 | * period over which we average the RT time consumption, measured | 800 | * period over which we average the RT time consumption, measured |
811 | * in ms. | 801 | * in ms. |
812 | * | 802 | * |
@@ -1355,6 +1345,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec) | |||
1355 | lw->inv_weight = 0; | 1345 | lw->inv_weight = 0; |
1356 | } | 1346 | } |
1357 | 1347 | ||
1348 | static inline void update_load_set(struct load_weight *lw, unsigned long w) | ||
1349 | { | ||
1350 | lw->weight = w; | ||
1351 | lw->inv_weight = 0; | ||
1352 | } | ||
1353 | |||
1358 | /* | 1354 | /* |
1359 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | 1355 | * To aid in avoiding the subversion of "niceness" due to uneven distribution |
1360 | * of tasks with abnormal "nice" values across CPUs the contribution that | 1356 | * of tasks with abnormal "nice" values across CPUs the contribution that |
@@ -1543,101 +1539,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
1543 | 1539 | ||
1544 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1540 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1545 | 1541 | ||
1546 | static __read_mostly unsigned long __percpu *update_shares_data; | ||
1547 | |||
1548 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | ||
1549 | |||
1550 | /* | ||
1551 | * Calculate and set the cpu's group shares. | ||
1552 | */ | ||
1553 | static void update_group_shares_cpu(struct task_group *tg, int cpu, | ||
1554 | unsigned long sd_shares, | ||
1555 | unsigned long sd_rq_weight, | ||
1556 | unsigned long *usd_rq_weight) | ||
1557 | { | ||
1558 | unsigned long shares, rq_weight; | ||
1559 | int boost = 0; | ||
1560 | |||
1561 | rq_weight = usd_rq_weight[cpu]; | ||
1562 | if (!rq_weight) { | ||
1563 | boost = 1; | ||
1564 | rq_weight = NICE_0_LOAD; | ||
1565 | } | ||
1566 | |||
1567 | /* | ||
1568 | * \Sum_j shares_j * rq_weight_i | ||
1569 | * shares_i = ----------------------------- | ||
1570 | * \Sum_j rq_weight_j | ||
1571 | */ | ||
1572 | shares = (sd_shares * rq_weight) / sd_rq_weight; | ||
1573 | shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); | ||
1574 | |||
1575 | if (abs(shares - tg->se[cpu]->load.weight) > | ||
1576 | sysctl_sched_shares_thresh) { | ||
1577 | struct rq *rq = cpu_rq(cpu); | ||
1578 | unsigned long flags; | ||
1579 | |||
1580 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
1581 | tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; | ||
1582 | tg->cfs_rq[cpu]->shares = boost ? 0 : shares; | ||
1583 | __set_se_shares(tg->se[cpu], shares); | ||
1584 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
1585 | } | ||
1586 | } | ||
1587 | |||
1588 | /* | ||
1589 | * Re-compute the task group their per cpu shares over the given domain. | ||
1590 | * This needs to be done in a bottom-up fashion because the rq weight of a | ||
1591 | * parent group depends on the shares of its child groups. | ||
1592 | */ | ||
1593 | static int tg_shares_up(struct task_group *tg, void *data) | ||
1594 | { | ||
1595 | unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0; | ||
1596 | unsigned long *usd_rq_weight; | ||
1597 | struct sched_domain *sd = data; | ||
1598 | unsigned long flags; | ||
1599 | int i; | ||
1600 | |||
1601 | if (!tg->se[0]) | ||
1602 | return 0; | ||
1603 | |||
1604 | local_irq_save(flags); | ||
1605 | usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id()); | ||
1606 | |||
1607 | for_each_cpu(i, sched_domain_span(sd)) { | ||
1608 | weight = tg->cfs_rq[i]->load.weight; | ||
1609 | usd_rq_weight[i] = weight; | ||
1610 | |||
1611 | rq_weight += weight; | ||
1612 | /* | ||
1613 | * If there are currently no tasks on the cpu pretend there | ||
1614 | * is one of average load so that when a new task gets to | ||
1615 | * run here it will not get delayed by group starvation. | ||
1616 | */ | ||
1617 | if (!weight) | ||
1618 | weight = NICE_0_LOAD; | ||
1619 | |||
1620 | sum_weight += weight; | ||
1621 | shares += tg->cfs_rq[i]->shares; | ||
1622 | } | ||
1623 | |||
1624 | if (!rq_weight) | ||
1625 | rq_weight = sum_weight; | ||
1626 | |||
1627 | if ((!shares && rq_weight) || shares > tg->shares) | ||
1628 | shares = tg->shares; | ||
1629 | |||
1630 | if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) | ||
1631 | shares = tg->shares; | ||
1632 | |||
1633 | for_each_cpu(i, sched_domain_span(sd)) | ||
1634 | update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight); | ||
1635 | |||
1636 | local_irq_restore(flags); | ||
1637 | |||
1638 | return 0; | ||
1639 | } | ||
1640 | |||
1641 | /* | 1542 | /* |
1642 | * Compute the cpu's hierarchical load factor for each task group. | 1543 | * Compute the cpu's hierarchical load factor for each task group. |
1643 | * This needs to be done in a top-down fashion because the load of a child | 1544 | * This needs to be done in a top-down fashion because the load of a child |
@@ -1652,7 +1553,7 @@ static int tg_load_down(struct task_group *tg, void *data) | |||
1652 | load = cpu_rq(cpu)->load.weight; | 1553 | load = cpu_rq(cpu)->load.weight; |
1653 | } else { | 1554 | } else { |
1654 | load = tg->parent->cfs_rq[cpu]->h_load; | 1555 | load = tg->parent->cfs_rq[cpu]->h_load; |
1655 | load *= tg->cfs_rq[cpu]->shares; | 1556 | load *= tg->se[cpu]->load.weight; |
1656 | load /= tg->parent->cfs_rq[cpu]->load.weight + 1; | 1557 | load /= tg->parent->cfs_rq[cpu]->load.weight + 1; |
1657 | } | 1558 | } |
1658 | 1559 | ||
@@ -1661,34 +1562,11 @@ static int tg_load_down(struct task_group *tg, void *data) | |||
1661 | return 0; | 1562 | return 0; |
1662 | } | 1563 | } |
1663 | 1564 | ||
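With the per-cpu shares field gone, tg_load_down() above scales the parent's hierarchical load by the group entity's load.weight and divides by the parent cfs_rq weight plus one (the +1 guards against dividing by zero on an empty parent). A rough illustration of the arithmetic with made-up weights:

#include <stdio.h>

int main(void)
{
        /* Hypothetical numbers: the parent group carries 2048 units of
         * hierarchical load, this group's entity weighs 1024 on a parent
         * runqueue whose total weight is 3072. */
        unsigned long parent_h_load = 2048;
        unsigned long se_weight = 1024;
        unsigned long parent_cfs_weight = 3072;

        unsigned long h_load = parent_h_load * se_weight / (parent_cfs_weight + 1);

        printf("h_load = %lu\n", h_load);       /* ~682: this group's share of the cpu load */
        return 0;
}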
1664 | static void update_shares(struct sched_domain *sd) | ||
1665 | { | ||
1666 | s64 elapsed; | ||
1667 | u64 now; | ||
1668 | |||
1669 | if (root_task_group_empty()) | ||
1670 | return; | ||
1671 | |||
1672 | now = local_clock(); | ||
1673 | elapsed = now - sd->last_update; | ||
1674 | |||
1675 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { | ||
1676 | sd->last_update = now; | ||
1677 | walk_tg_tree(tg_nop, tg_shares_up, sd); | ||
1678 | } | ||
1679 | } | ||
1680 | |||
1681 | static void update_h_load(long cpu) | 1565 | static void update_h_load(long cpu) |
1682 | { | 1566 | { |
1683 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); | 1567 | walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); |
1684 | } | 1568 | } |
1685 | 1569 | ||
1686 | #else | ||
1687 | |||
1688 | static inline void update_shares(struct sched_domain *sd) | ||
1689 | { | ||
1690 | } | ||
1691 | |||
1692 | #endif | 1570 | #endif |
1693 | 1571 | ||
1694 | #ifdef CONFIG_PREEMPT | 1572 | #ifdef CONFIG_PREEMPT |
@@ -1810,15 +1688,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | |||
1810 | 1688 | ||
1811 | #endif | 1689 | #endif |
1812 | 1690 | ||
1813 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1814 | static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | ||
1815 | { | ||
1816 | #ifdef CONFIG_SMP | ||
1817 | cfs_rq->shares = shares; | ||
1818 | #endif | ||
1819 | } | ||
1820 | #endif | ||
1821 | |||
1822 | static void calc_load_account_idle(struct rq *this_rq); | 1691 | static void calc_load_account_idle(struct rq *this_rq); |
1823 | static void update_sysctl(void); | 1692 | static void update_sysctl(void); |
1824 | static int get_update_sysctl_factor(void); | 1693 | static int get_update_sysctl_factor(void); |
@@ -1840,7 +1709,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | |||
1840 | 1709 | ||
1841 | static const struct sched_class rt_sched_class; | 1710 | static const struct sched_class rt_sched_class; |
1842 | 1711 | ||
1843 | #define sched_class_highest (&rt_sched_class) | 1712 | #define sched_class_highest (&stop_sched_class) |
1844 | #define for_each_class(class) \ | 1713 | #define for_each_class(class) \ |
1845 | for (class = sched_class_highest; class; class = class->next) | 1714 | for (class = sched_class_highest; class; class = class->next) |
1846 | 1715 | ||
@@ -1858,12 +1727,6 @@ static void dec_nr_running(struct rq *rq) | |||
1858 | 1727 | ||
1859 | static void set_load_weight(struct task_struct *p) | 1728 | static void set_load_weight(struct task_struct *p) |
1860 | { | 1729 | { |
1861 | if (task_has_rt_policy(p)) { | ||
1862 | p->se.load.weight = 0; | ||
1863 | p->se.load.inv_weight = WMULT_CONST; | ||
1864 | return; | ||
1865 | } | ||
1866 | |||
1867 | /* | 1730 | /* |
1868 | * SCHED_IDLE tasks get minimal weight: | 1731 | * SCHED_IDLE tasks get minimal weight: |
1869 | */ | 1732 | */ |
@@ -1917,13 +1780,194 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) | |||
1917 | dec_nr_running(rq); | 1780 | dec_nr_running(rq); |
1918 | } | 1781 | } |
1919 | 1782 | ||
1783 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | ||
1784 | |||
1785 | /* | ||
1786 | * There are no locks covering percpu hardirq/softirq time. | ||
1787 | * They are only modified in account_system_vtime, on corresponding CPU | ||
1788 | * with interrupts disabled. So, writes are safe. | ||
1789 | * They are read and saved off onto struct rq in update_rq_clock(). | ||
1790 | * This may result in other CPU reading this CPU's irq time and can | ||
1791 | * race with irq/account_system_vtime on this CPU. We would either get old | ||
1792 | * or new value with a side effect of accounting a slice of irq time to wrong | ||
1793 | * task when irq is in progress while we read rq->clock. That is a worthy | ||
1794 | * compromise in place of having locks on each irq in account_system_time. | ||
1795 | */ | ||
1796 | static DEFINE_PER_CPU(u64, cpu_hardirq_time); | ||
1797 | static DEFINE_PER_CPU(u64, cpu_softirq_time); | ||
1798 | |||
1799 | static DEFINE_PER_CPU(u64, irq_start_time); | ||
1800 | static int sched_clock_irqtime; | ||
1801 | |||
1802 | void enable_sched_clock_irqtime(void) | ||
1803 | { | ||
1804 | sched_clock_irqtime = 1; | ||
1805 | } | ||
1806 | |||
1807 | void disable_sched_clock_irqtime(void) | ||
1808 | { | ||
1809 | sched_clock_irqtime = 0; | ||
1810 | } | ||
1811 | |||
1812 | #ifndef CONFIG_64BIT | ||
1813 | static DEFINE_PER_CPU(seqcount_t, irq_time_seq); | ||
1814 | |||
1815 | static inline void irq_time_write_begin(void) | ||
1816 | { | ||
1817 | __this_cpu_inc(irq_time_seq.sequence); | ||
1818 | smp_wmb(); | ||
1819 | } | ||
1820 | |||
1821 | static inline void irq_time_write_end(void) | ||
1822 | { | ||
1823 | smp_wmb(); | ||
1824 | __this_cpu_inc(irq_time_seq.sequence); | ||
1825 | } | ||
1826 | |||
1827 | static inline u64 irq_time_read(int cpu) | ||
1828 | { | ||
1829 | u64 irq_time; | ||
1830 | unsigned seq; | ||
1831 | |||
1832 | do { | ||
1833 | seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); | ||
1834 | irq_time = per_cpu(cpu_softirq_time, cpu) + | ||
1835 | per_cpu(cpu_hardirq_time, cpu); | ||
1836 | } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq)); | ||
1837 | |||
1838 | return irq_time; | ||
1839 | } | ||
1840 | #else /* CONFIG_64BIT */ | ||
1841 | static inline void irq_time_write_begin(void) | ||
1842 | { | ||
1843 | } | ||
1844 | |||
1845 | static inline void irq_time_write_end(void) | ||
1846 | { | ||
1847 | } | ||
1848 | |||
1849 | static inline u64 irq_time_read(int cpu) | ||
1850 | { | ||
1851 | return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu); | ||
1852 | } | ||
1853 | #endif /* CONFIG_64BIT */ | ||
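Because a 32-bit kernel cannot load a 64-bit per-cpu counter atomically, the writer above bumps a sequence count around each update and readers retry whenever the count is odd or has changed underneath them. A stripped-down user-space model of that retry protocol, using C11 atomics instead of the kernel seqcount API (the barriers are deliberately simplified; this is a sketch, not a drop-in replacement):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic unsigned int seq;
static uint64_t irq_time;       /* 64-bit value a 32-bit CPU cannot read atomically */

static void writer_add(uint64_t delta)
{
        atomic_fetch_add_explicit(&seq, 1, memory_order_relaxed);  /* odd: write in progress */
        atomic_thread_fence(memory_order_release);
        irq_time += delta;
        atomic_thread_fence(memory_order_release);
        atomic_fetch_add_explicit(&seq, 1, memory_order_relaxed);  /* even again */
}

static uint64_t reader(void)
{
        unsigned int s;
        uint64_t val;

        do {
                s = atomic_load_explicit(&seq, memory_order_acquire);
                val = irq_time;
                atomic_thread_fence(memory_order_acquire);
                /* retry if a write was in flight or completed while we read */
        } while ((s & 1) || s != atomic_load_explicit(&seq, memory_order_relaxed));

        return val;
}

int main(void)
{
        writer_add(123);
        printf("%llu\n", (unsigned long long)reader());
        return 0;
}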
1854 | |||
1855 | /* | ||
1856 | * Called before incrementing preempt_count on {soft,}irq_enter | ||
1857 | * and before decrementing preempt_count on {soft,}irq_exit. | ||
1858 | */ | ||
1859 | void account_system_vtime(struct task_struct *curr) | ||
1860 | { | ||
1861 | unsigned long flags; | ||
1862 | s64 delta; | ||
1863 | int cpu; | ||
1864 | |||
1865 | if (!sched_clock_irqtime) | ||
1866 | return; | ||
1867 | |||
1868 | local_irq_save(flags); | ||
1869 | |||
1870 | cpu = smp_processor_id(); | ||
1871 | delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); | ||
1872 | __this_cpu_add(irq_start_time, delta); | ||
1873 | |||
1874 | irq_time_write_begin(); | ||
1875 | /* | ||
1876 | * We do not account for softirq time from ksoftirqd here. | ||
1877 | * We want to continue accounting softirq time to ksoftirqd thread | ||
1878 | * in that case, so as not to confuse scheduler with a special task | ||
1879 | * that do not consume any time, but still wants to run. | ||
1880 | */ | ||
1881 | if (hardirq_count()) | ||
1882 | __this_cpu_add(cpu_hardirq_time, delta); | ||
1883 | else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD)) | ||
1884 | __this_cpu_add(cpu_softirq_time, delta); | ||
1885 | |||
1886 | irq_time_write_end(); | ||
1887 | local_irq_restore(flags); | ||
1888 | } | ||
1889 | EXPORT_SYMBOL_GPL(account_system_vtime); | ||
1890 | |||
1891 | static void update_rq_clock_task(struct rq *rq, s64 delta) | ||
1892 | { | ||
1893 | s64 irq_delta; | ||
1894 | |||
1895 | irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; | ||
1896 | |||
1897 | /* | ||
1898 | * Since irq_time is only updated on {soft,}irq_exit, we might run into | ||
1899 | * this case when a previous update_rq_clock() happened inside a | ||
1900 | * {soft,}irq region. | ||
1901 | * | ||
1902 | * When this happens, we stop ->clock_task and only update the | ||
1903 | * prev_irq_time stamp to account for the part that fit, so that a next | ||
1904 | * update will consume the rest. This ensures ->clock_task is | ||
1905 | * monotonic. | ||
1906 | * | ||
1907 | * It does however cause some slight miss-attribution of {soft,}irq | ||
1908 | * time, a more accurate solution would be to update the irq_time using | ||
1909 | * the current rq->clock timestamp, except that would require using | ||
1910 | * atomic ops. | ||
1911 | */ | ||
1912 | if (irq_delta > delta) | ||
1913 | irq_delta = delta; | ||
1914 | |||
1915 | rq->prev_irq_time += irq_delta; | ||
1916 | delta -= irq_delta; | ||
1917 | rq->clock_task += delta; | ||
1918 | |||
1919 | if (irq_delta && sched_feat(NONIRQ_POWER)) | ||
1920 | sched_rt_avg_update(rq, irq_delta); | ||
1921 | } | ||
1922 | |||
1923 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
1924 | |||
1925 | static void update_rq_clock_task(struct rq *rq, s64 delta) | ||
1926 | { | ||
1927 | rq->clock_task += delta; | ||
1928 | } | ||
1929 | |||
1930 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | ||
1931 | |||
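update_rq_clock_task() clamps the observed irq time delta to the rq clock delta so ->clock_task never moves backwards; any irq time that did not fit is simply not added to prev_irq_time yet and is consumed by the next update. A small standalone model of that clamp (the struct and function names here are invented):

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

struct fake_rq { uint64_t clock, clock_task, prev_irq_time; };

static void clock_task_update(struct fake_rq *rq, int64_t delta, uint64_t irq_time_now)
{
        int64_t irq_delta = irq_time_now - rq->prev_irq_time;

        if (irq_delta > delta)
                irq_delta = delta;      /* never let clock_task go backwards */

        rq->prev_irq_time += irq_delta;
        rq->clock += delta;
        rq->clock_task += delta - irq_delta;
}

int main(void)
{
        struct fake_rq rq = { 0, 0, 0 };

        clock_task_update(&rq, 3000000, 5000000);       /* 3ms tick, 5ms of irq time seen */
        printf("clock=%" PRIu64 " clock_task=%" PRIu64 " prev_irq=%" PRIu64 "\n",
               rq.clock, rq.clock_task, rq.prev_irq_time);
        /* clock=3000000 clock_task=0 prev_irq=3000000: the extra 2ms waits for the next update */
        return 0;
}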
1920 | #include "sched_idletask.c" | 1932 | #include "sched_idletask.c" |
1921 | #include "sched_fair.c" | 1933 | #include "sched_fair.c" |
1922 | #include "sched_rt.c" | 1934 | #include "sched_rt.c" |
1935 | #include "sched_autogroup.c" | ||
1936 | #include "sched_stoptask.c" | ||
1923 | #ifdef CONFIG_SCHED_DEBUG | 1937 | #ifdef CONFIG_SCHED_DEBUG |
1924 | # include "sched_debug.c" | 1938 | # include "sched_debug.c" |
1925 | #endif | 1939 | #endif |
1926 | 1940 | ||
1941 | void sched_set_stop_task(int cpu, struct task_struct *stop) | ||
1942 | { | ||
1943 | struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; | ||
1944 | struct task_struct *old_stop = cpu_rq(cpu)->stop; | ||
1945 | |||
1946 | if (stop) { | ||
1947 | /* | ||
1948 | * Make it appear like a SCHED_FIFO task, its something | ||
1949 | * userspace knows about and won't get confused about. | ||
1950 | * | ||
1951 | * Also, it will make PI more or less work without too | ||
1952 | * much confusion -- but then, stop work should not | ||
1953 | * rely on PI working anyway. | ||
1954 | */ | ||
1955 | sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); | ||
1956 | |||
1957 | stop->sched_class = &stop_sched_class; | ||
1958 | } | ||
1959 | |||
1960 | cpu_rq(cpu)->stop = stop; | ||
1961 | |||
1962 | if (old_stop) { | ||
1963 | /* | ||
1964 | * Reset it back to a normal scheduling class so that | ||
1965 | * it can die in pieces. | ||
1966 | */ | ||
1967 | old_stop->sched_class = &rt_sched_class; | ||
1968 | } | ||
1969 | } | ||
1970 | |||
1927 | /* | 1971 | /* |
1928 | * __normal_prio - return the priority that is based on the static prio | 1972 | * __normal_prio - return the priority that is based on the static prio |
1929 | */ | 1973 | */ |
@@ -1991,6 +2035,31 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, | |||
1991 | p->sched_class->prio_changed(rq, p, oldprio, running); | 2035 | p->sched_class->prio_changed(rq, p, oldprio, running); |
1992 | } | 2036 | } |
1993 | 2037 | ||
2038 | static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) | ||
2039 | { | ||
2040 | const struct sched_class *class; | ||
2041 | |||
2042 | if (p->sched_class == rq->curr->sched_class) { | ||
2043 | rq->curr->sched_class->check_preempt_curr(rq, p, flags); | ||
2044 | } else { | ||
2045 | for_each_class(class) { | ||
2046 | if (class == rq->curr->sched_class) | ||
2047 | break; | ||
2048 | if (class == p->sched_class) { | ||
2049 | resched_task(rq->curr); | ||
2050 | break; | ||
2051 | } | ||
2052 | } | ||
2053 | } | ||
2054 | |||
2055 | /* | ||
2056 | * A queue event has occurred, and we're going to schedule. In | ||
2057 | * this case, we can save a useless back to back clock update. | ||
2058 | */ | ||
2059 | if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr)) | ||
2060 | rq->skip_clock_update = 1; | ||
2061 | } | ||
2062 | |||
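The new check_preempt_curr() handles cross-class wakeups by walking the class list from the highest priority class downwards: whichever of the two classes is reached first wins, and the current task is rescheduled only if the waking task's class comes first. A toy model of that ordering rule (class names only, no real callbacks):

#include <stdio.h>
#include <string.h>

/* Earlier in the list means higher priority: stop > rt > fair > idle. */
static const char * const classes[] = { "stop", "rt", "fair", "idle" };

static int preempts(const char *waking, const char *curr)
{
        size_t i;

        if (strcmp(waking, curr) == 0)
                return 0;       /* same class: defer to that class's own test */

        for (i = 0; i < sizeof(classes) / sizeof(classes[0]); i++) {
                if (strcmp(classes[i], curr) == 0)
                        return 0;       /* current task's class ranks higher: no preemption */
                if (strcmp(classes[i], waking) == 0)
                        return 1;       /* waking task's class ranks higher: resched */
        }
        return 0;
}

int main(void)
{
        printf("rt waking vs fair curr: %d\n", preempts("rt", "fair"));  /* 1 */
        printf("fair waking vs rt curr: %d\n", preempts("fair", "rt"));  /* 0 */
        return 0;
}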
1994 | #ifdef CONFIG_SMP | 2063 | #ifdef CONFIG_SMP |
1995 | /* | 2064 | /* |
1996 | * Is this task likely cache-hot: | 2065 | * Is this task likely cache-hot: |
@@ -2003,6 +2072,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | |||
2003 | if (p->sched_class != &fair_sched_class) | 2072 | if (p->sched_class != &fair_sched_class) |
2004 | return 0; | 2073 | return 0; |
2005 | 2074 | ||
2075 | if (unlikely(p->policy == SCHED_IDLE)) | ||
2076 | return 0; | ||
2077 | |||
2006 | /* | 2078 | /* |
2007 | * Buddy candidates are cache hot: | 2079 | * Buddy candidates are cache hot: |
2008 | */ | 2080 | */ |
@@ -2053,10 +2125,8 @@ static int migration_cpu_stop(void *data); | |||
2053 | * The task's runqueue lock must be held. | 2125 | * The task's runqueue lock must be held. |
2054 | * Returns true if you have to wait for migration thread. | 2126 | * Returns true if you have to wait for migration thread. |
2055 | */ | 2127 | */ |
2056 | static bool migrate_task(struct task_struct *p, int dest_cpu) | 2128 | static bool migrate_task(struct task_struct *p, struct rq *rq) |
2057 | { | 2129 | { |
2058 | struct rq *rq = task_rq(p); | ||
2059 | |||
2060 | /* | 2130 | /* |
2061 | * If the task is not on a runqueue (and not running), then | 2131 | * If the task is not on a runqueue (and not running), then |
2062 | * the next wake-up will properly place the task. | 2132 | * the next wake-up will properly place the task. |
@@ -2236,18 +2306,15 @@ static int select_fallback_rq(int cpu, struct task_struct *p) | |||
2236 | return dest_cpu; | 2306 | return dest_cpu; |
2237 | 2307 | ||
2238 | /* No more Mr. Nice Guy. */ | 2308 | /* No more Mr. Nice Guy. */ |
2239 | if (unlikely(dest_cpu >= nr_cpu_ids)) { | 2309 | dest_cpu = cpuset_cpus_allowed_fallback(p); |
2240 | dest_cpu = cpuset_cpus_allowed_fallback(p); | 2310 | /* |
2241 | /* | 2311 | * Don't tell them about moving exiting tasks or |
2242 | * Don't tell them about moving exiting tasks or | 2312 | * kernel threads (both mm NULL), since they never |
2243 | * kernel threads (both mm NULL), since they never | 2313 | * leave kernel. |
2244 | * leave kernel. | 2314 | */ |
2245 | */ | 2315 | if (p->mm && printk_ratelimit()) { |
2246 | if (p->mm && printk_ratelimit()) { | 2316 | printk(KERN_INFO "process %d (%s) no longer affine to cpu%d\n", |
2247 | printk(KERN_INFO "process %d (%s) no " | 2317 | task_pid_nr(p), p->comm, cpu); |
2248 | "longer affine to cpu%d\n", | ||
2249 | task_pid_nr(p), p->comm, cpu); | ||
2250 | } | ||
2251 | } | 2318 | } |
2252 | 2319 | ||
2253 | return dest_cpu; | 2320 | return dest_cpu; |
@@ -2438,7 +2505,7 @@ out: | |||
2438 | * try_to_wake_up_local - try to wake up a local task with rq lock held | 2505 | * try_to_wake_up_local - try to wake up a local task with rq lock held |
2439 | * @p: the thread to be awakened | 2506 | * @p: the thread to be awakened |
2440 | * | 2507 | * |
2441 | * Put @p on the run-queue if it's not alredy there. The caller must | 2508 | * Put @p on the run-queue if it's not already there. The caller must |
2442 | * ensure that this_rq() is locked, @p is bound to this_rq() and not | 2509 | * ensure that this_rq() is locked, @p is bound to this_rq() and not |
2443 | * the current task. this_rq() stays locked over invocation. | 2510 | * the current task. this_rq() stays locked over invocation. |
2444 | */ | 2511 | */ |
@@ -2583,7 +2650,9 @@ void sched_fork(struct task_struct *p, int clone_flags) | |||
2583 | /* Want to start with kernel preemption disabled. */ | 2650 | /* Want to start with kernel preemption disabled. */ |
2584 | task_thread_info(p)->preempt_count = 1; | 2651 | task_thread_info(p)->preempt_count = 1; |
2585 | #endif | 2652 | #endif |
2653 | #ifdef CONFIG_SMP | ||
2586 | plist_node_init(&p->pushable_tasks, MAX_PRIO); | 2654 | plist_node_init(&p->pushable_tasks, MAX_PRIO); |
2655 | #endif | ||
2587 | 2656 | ||
2588 | put_cpu(); | 2657 | put_cpu(); |
2589 | } | 2658 | } |
@@ -2852,14 +2921,14 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
2852 | */ | 2921 | */ |
2853 | arch_start_context_switch(prev); | 2922 | arch_start_context_switch(prev); |
2854 | 2923 | ||
2855 | if (likely(!mm)) { | 2924 | if (!mm) { |
2856 | next->active_mm = oldmm; | 2925 | next->active_mm = oldmm; |
2857 | atomic_inc(&oldmm->mm_count); | 2926 | atomic_inc(&oldmm->mm_count); |
2858 | enter_lazy_tlb(oldmm, next); | 2927 | enter_lazy_tlb(oldmm, next); |
2859 | } else | 2928 | } else |
2860 | switch_mm(oldmm, mm, next); | 2929 | switch_mm(oldmm, mm, next); |
2861 | 2930 | ||
2862 | if (likely(!prev->mm)) { | 2931 | if (!prev->mm) { |
2863 | prev->active_mm = NULL; | 2932 | prev->active_mm = NULL; |
2864 | rq->prev_mm = oldmm; | 2933 | rq->prev_mm = oldmm; |
2865 | } | 2934 | } |
@@ -2974,6 +3043,15 @@ static long calc_load_fold_active(struct rq *this_rq) | |||
2974 | return delta; | 3043 | return delta; |
2975 | } | 3044 | } |
2976 | 3045 | ||
3046 | static unsigned long | ||
3047 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | ||
3048 | { | ||
3049 | load *= exp; | ||
3050 | load += active * (FIXED_1 - exp); | ||
3051 | load += 1UL << (FSHIFT - 1); | ||
3052 | return load >> FSHIFT; | ||
3053 | } | ||
3054 | |||
2977 | #ifdef CONFIG_NO_HZ | 3055 | #ifdef CONFIG_NO_HZ |
2978 | /* | 3056 | /* |
2979 | * For NO_HZ we delay the active fold to the next LOAD_FREQ update. | 3057 | * For NO_HZ we delay the active fold to the next LOAD_FREQ update. |
@@ -3003,6 +3081,128 @@ static long calc_load_fold_idle(void) | |||
3003 | 3081 | ||
3004 | return delta; | 3082 | return delta; |
3005 | } | 3083 | } |
3084 | |||
3085 | /** | ||
3086 | * fixed_power_int - compute: x^n, in O(log n) time | ||
3087 | * | ||
3088 | * @x: base of the power | ||
3089 | * @frac_bits: fractional bits of @x | ||
3090 | * @n: power to raise @x to. | ||
3091 | * | ||
3092 | * By exploiting the relation between the definition of the natural power | ||
3093 | * function: x^n := x*x*...*x (x multiplied by itself for n times), and | ||
3094 | * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, | ||
3095 | * (where: n_i \elem {0, 1}, the binary vector representing n), | ||
3096 | * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is | ||
3097 | * of course trivially computable in O(log_2 n), the length of our binary | ||
3098 | * vector. | ||
3099 | */ | ||
3100 | static unsigned long | ||
3101 | fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n) | ||
3102 | { | ||
3103 | unsigned long result = 1UL << frac_bits; | ||
3104 | |||
3105 | if (n) for (;;) { | ||
3106 | if (n & 1) { | ||
3107 | result *= x; | ||
3108 | result += 1UL << (frac_bits - 1); | ||
3109 | result >>= frac_bits; | ||
3110 | } | ||
3111 | n >>= 1; | ||
3112 | if (!n) | ||
3113 | break; | ||
3114 | x *= x; | ||
3115 | x += 1UL << (frac_bits - 1); | ||
3116 | x >>= frac_bits; | ||
3117 | } | ||
3118 | |||
3119 | return result; | ||
3120 | } | ||
3121 | |||
3122 | /* | ||
3123 | * a1 = a0 * e + a * (1 - e) | ||
3124 | * | ||
3125 | * a2 = a1 * e + a * (1 - e) | ||
3126 | * = (a0 * e + a * (1 - e)) * e + a * (1 - e) | ||
3127 | * = a0 * e^2 + a * (1 - e) * (1 + e) | ||
3128 | * | ||
3129 | * a3 = a2 * e + a * (1 - e) | ||
3130 | * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e) | ||
3131 | * = a0 * e^3 + a * (1 - e) * (1 + e + e^2) | ||
3132 | * | ||
3133 | * ... | ||
3134 | * | ||
3135 | * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1] | ||
3136 | * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e) | ||
3137 | * = a0 * e^n + a * (1 - e^n) | ||
3138 | * | ||
3139 | * [1] application of the geometric series: | ||
3140 | * | ||
3141 | * n 1 - x^(n+1) | ||
3142 | * S_n := \Sum x^i = ------------- | ||
3143 | * i=0 1 - x | ||
3144 | */ | ||
3145 | static unsigned long | ||
3146 | calc_load_n(unsigned long load, unsigned long exp, | ||
3147 | unsigned long active, unsigned int n) | ||
3148 | { | ||
3149 | |||
3150 | return calc_load(load, fixed_power_int(exp, FSHIFT, n), active); | ||
3151 | } | ||
3152 | |||
3153 | /* | ||
3154 | * NO_HZ can leave us missing all per-cpu ticks calling | ||
3155 | * calc_load_account_active(), but since an idle CPU folds its delta into | ||
3156 | * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold | ||
3157 | * in the pending idle delta if our idle period crossed a load cycle boundary. | ||
3158 | * | ||
3159 | * Once we've updated the global active value, we need to apply the exponential | ||
3160 | * weights adjusted to the number of cycles missed. | ||
3161 | */ | ||
3162 | static void calc_global_nohz(unsigned long ticks) | ||
3163 | { | ||
3164 | long delta, active, n; | ||
3165 | |||
3166 | if (time_before(jiffies, calc_load_update)) | ||
3167 | return; | ||
3168 | |||
3169 | /* | ||
3170 | * If we crossed a calc_load_update boundary, make sure to fold | ||
3171 | * any pending idle changes, the respective CPUs might have | ||
3172 | * missed the tick driven calc_load_account_active() update | ||
3173 | * due to NO_HZ. | ||
3174 | */ | ||
3175 | delta = calc_load_fold_idle(); | ||
3176 | if (delta) | ||
3177 | atomic_long_add(delta, &calc_load_tasks); | ||
3178 | |||
3179 | /* | ||
3180 | * If we were idle for multiple load cycles, apply them. | ||
3181 | */ | ||
3182 | if (ticks >= LOAD_FREQ) { | ||
3183 | n = ticks / LOAD_FREQ; | ||
3184 | |||
3185 | active = atomic_long_read(&calc_load_tasks); | ||
3186 | active = active > 0 ? active * FIXED_1 : 0; | ||
3187 | |||
3188 | avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n); | ||
3189 | avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n); | ||
3190 | avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n); | ||
3191 | |||
3192 | calc_load_update += n * LOAD_FREQ; | ||
3193 | } | ||
3194 | |||
3195 | /* | ||
3196 | * Its possible the remainder of the above division also crosses | ||
3197 | * a LOAD_FREQ period, the regular check in calc_global_load() | ||
3198 | * which comes after this will take care of that. | ||
3199 | * | ||
3200 | * Consider us being 11 ticks before a cycle completion, and us | ||
3201 | * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will | ||
3202 | * age us 4 cycles, and the test in calc_global_load() will | ||
3203 | * pick up the final one. | ||
3204 | */ | ||
3205 | } | ||
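calc_load() above is one step of the usual fixed-point exponential average, and calc_load_n() folds n missed LOAD_FREQ periods into a single step by raising the decay factor to the n-th power with fixed_power_int() (square-and-multiply), per the geometric-series identity in the comment. A standalone check that the one-shot catch-up matches n individual decay steps, using the stock FSHIFT/FIXED_1/EXP_1 loadavg constants:

#include <stdio.h>

#define FSHIFT  11
#define FIXED_1 (1UL << FSHIFT)
#define EXP_1   1884            /* 1/exp(5sec/1min) in fixed point, as in the kernel */

static unsigned long calc_load(unsigned long load, unsigned long exp, unsigned long active)
{
        load *= exp;
        load += active * (FIXED_1 - exp);
        load += 1UL << (FSHIFT - 1);    /* round to nearest */
        return load >> FSHIFT;
}

/* x^n in fixed point, O(log n) multiplications (square-and-multiply). */
static unsigned long fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
        unsigned long result = 1UL << frac_bits;

        while (n) {
                if (n & 1) {
                        result *= x;
                        result += 1UL << (frac_bits - 1);
                        result >>= frac_bits;
                }
                n >>= 1;
                if (!n)
                        break;
                x *= x;
                x += 1UL << (frac_bits - 1);
                x >>= frac_bits;
        }
        return result;
}

static unsigned long calc_load_n(unsigned long load, unsigned long exp,
                                 unsigned long active, unsigned int n)
{
        return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
}

int main(void)
{
        unsigned long avg = 2 * FIXED_1;        /* 1-minute load of 2.00 */
        unsigned long step, jump;
        int i;

        /* Going idle (active = 0) for 4 cycles: apply 4 single steps ... */
        for (step = avg, i = 0; i < 4; i++)
                step = calc_load(step, EXP_1, 0);

        /* ... or one combined catch-up step; both decay the average the same way. */
        jump = calc_load_n(avg, EXP_1, 0, 4);

        printf("stepwise=%lu.%02lu combined=%lu.%02lu\n",
               step >> FSHIFT, (step & (FIXED_1 - 1)) * 100 / FIXED_1,
               jump >> FSHIFT, (jump & (FIXED_1 - 1)) * 100 / FIXED_1);
        return 0;
}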
3006 | #else | 3206 | #else |
3007 | static void calc_load_account_idle(struct rq *this_rq) | 3207 | static void calc_load_account_idle(struct rq *this_rq) |
3008 | { | 3208 | { |
@@ -3012,6 +3212,10 @@ static inline long calc_load_fold_idle(void) | |||
3012 | { | 3212 | { |
3013 | return 0; | 3213 | return 0; |
3014 | } | 3214 | } |
3215 | |||
3216 | static void calc_global_nohz(unsigned long ticks) | ||
3217 | { | ||
3218 | } | ||
3015 | #endif | 3219 | #endif |
3016 | 3220 | ||
3017 | /** | 3221 | /** |
@@ -3029,24 +3233,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift) | |||
3029 | loads[2] = (avenrun[2] + offset) << shift; | 3233 | loads[2] = (avenrun[2] + offset) << shift; |
3030 | } | 3234 | } |
3031 | 3235 | ||
3032 | static unsigned long | ||
3033 | calc_load(unsigned long load, unsigned long exp, unsigned long active) | ||
3034 | { | ||
3035 | load *= exp; | ||
3036 | load += active * (FIXED_1 - exp); | ||
3037 | return load >> FSHIFT; | ||
3038 | } | ||
3039 | |||
3040 | /* | 3236 | /* |
3041 | * calc_load - update the avenrun load estimates 10 ticks after the | 3237 | * calc_load - update the avenrun load estimates 10 ticks after the |
3042 | * CPUs have updated calc_load_tasks. | 3238 | * CPUs have updated calc_load_tasks. |
3043 | */ | 3239 | */ |
3044 | void calc_global_load(void) | 3240 | void calc_global_load(unsigned long ticks) |
3045 | { | 3241 | { |
3046 | unsigned long upd = calc_load_update + 10; | ||
3047 | long active; | 3242 | long active; |
3048 | 3243 | ||
3049 | if (time_before(jiffies, upd)) | 3244 | calc_global_nohz(ticks); |
3245 | |||
3246 | if (time_before(jiffies, calc_load_update + 10)) | ||
3050 | return; | 3247 | return; |
3051 | 3248 | ||
3052 | active = atomic_long_read(&calc_load_tasks); | 3249 | active = atomic_long_read(&calc_load_tasks); |
@@ -3219,7 +3416,7 @@ void sched_exec(void) | |||
3219 | * select_task_rq() can race against ->cpus_allowed | 3416 | * select_task_rq() can race against ->cpus_allowed |
3220 | */ | 3417 | */ |
3221 | if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && | 3418 | if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) && |
3222 | likely(cpu_active(dest_cpu)) && migrate_task(p, dest_cpu)) { | 3419 | likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) { |
3223 | struct migration_arg arg = { p, dest_cpu }; | 3420 | struct migration_arg arg = { p, dest_cpu }; |
3224 | 3421 | ||
3225 | task_rq_unlock(rq, &flags); | 3422 | task_rq_unlock(rq, &flags); |
@@ -3248,7 +3445,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) | |||
3248 | 3445 | ||
3249 | if (task_current(rq, p)) { | 3446 | if (task_current(rq, p)) { |
3250 | update_rq_clock(rq); | 3447 | update_rq_clock(rq); |
3251 | ns = rq->clock - p->se.exec_start; | 3448 | ns = rq->clock_task - p->se.exec_start; |
3252 | if ((s64)ns < 0) | 3449 | if ((s64)ns < 0) |
3253 | ns = 0; | 3450 | ns = 0; |
3254 | } | 3451 | } |
@@ -3397,7 +3594,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
3397 | tmp = cputime_to_cputime64(cputime); | 3594 | tmp = cputime_to_cputime64(cputime); |
3398 | if (hardirq_count() - hardirq_offset) | 3595 | if (hardirq_count() - hardirq_offset) |
3399 | cpustat->irq = cputime64_add(cpustat->irq, tmp); | 3596 | cpustat->irq = cputime64_add(cpustat->irq, tmp); |
3400 | else if (softirq_count()) | 3597 | else if (in_serving_softirq()) |
3401 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); | 3598 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); |
3402 | else | 3599 | else |
3403 | cpustat->system = cputime64_add(cpustat->system, tmp); | 3600 | cpustat->system = cputime64_add(cpustat->system, tmp); |
@@ -3584,7 +3781,7 @@ void scheduler_tick(void) | |||
3584 | curr->sched_class->task_tick(rq, curr, 0); | 3781 | curr->sched_class->task_tick(rq, curr, 0); |
3585 | raw_spin_unlock(&rq->lock); | 3782 | raw_spin_unlock(&rq->lock); |
3586 | 3783 | ||
3587 | perf_event_task_tick(curr); | 3784 | perf_event_task_tick(); |
3588 | 3785 | ||
3589 | #ifdef CONFIG_SMP | 3786 | #ifdef CONFIG_SMP |
3590 | rq->idle_at_tick = idle_cpu(cpu); | 3787 | rq->idle_at_tick = idle_cpu(cpu); |
@@ -3690,7 +3887,7 @@ static inline void schedule_debug(struct task_struct *prev) | |||
3690 | schedstat_inc(this_rq(), sched_count); | 3887 | schedstat_inc(this_rq(), sched_count); |
3691 | #ifdef CONFIG_SCHEDSTATS | 3888 | #ifdef CONFIG_SCHEDSTATS |
3692 | if (unlikely(prev->lock_depth >= 0)) { | 3889 | if (unlikely(prev->lock_depth >= 0)) { |
3693 | schedstat_inc(this_rq(), bkl_count); | 3890 | schedstat_inc(this_rq(), rq_sched_info.bkl_count); |
3694 | schedstat_inc(prev, sched_info.bkl_count); | 3891 | schedstat_inc(prev, sched_info.bkl_count); |
3695 | } | 3892 | } |
3696 | #endif | 3893 | #endif |
@@ -3700,7 +3897,6 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev) | |||
3700 | { | 3897 | { |
3701 | if (prev->se.on_rq) | 3898 | if (prev->se.on_rq) |
3702 | update_rq_clock(rq); | 3899 | update_rq_clock(rq); |
3703 | rq->skip_clock_update = 0; | ||
3704 | prev->sched_class->put_prev_task(rq, prev); | 3900 | prev->sched_class->put_prev_task(rq, prev); |
3705 | } | 3901 | } |
3706 | 3902 | ||
@@ -3723,17 +3919,13 @@ pick_next_task(struct rq *rq) | |||
3723 | return p; | 3919 | return p; |
3724 | } | 3920 | } |
3725 | 3921 | ||
3726 | class = sched_class_highest; | 3922 | for_each_class(class) { |
3727 | for ( ; ; ) { | ||
3728 | p = class->pick_next_task(rq); | 3923 | p = class->pick_next_task(rq); |
3729 | if (p) | 3924 | if (p) |
3730 | return p; | 3925 | return p; |
3731 | /* | ||
3732 | * Will never be NULL as the idle class always | ||
3733 | * returns a non-NULL p: | ||
3734 | */ | ||
3735 | class = class->next; | ||
3736 | } | 3926 | } |
3927 | |||
3928 | BUG(); /* the idle class will always have a runnable task */ | ||
3737 | } | 3929 | } |
3738 | 3930 | ||
3739 | /* | 3931 | /* |
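pick_next_task() now starts the class walk at the stop class (the new sched_class_highest) and only hits BUG() if even the idle class fails to return a task, which should never happen. A toy model of that walk, with invented class structures standing in for the real sched_class chain:

#include <stdio.h>
#include <stdlib.h>

/* Toy model: each class either offers a task name or defers to the next class. */
struct toy_class {
        const char *name;
        const char *(*pick)(void);
        const struct toy_class *next;
};

static const char *pick_none(void) { return NULL; }
static const char *pick_idle(void) { return "swapper"; }        /* idle never returns NULL */

static const struct toy_class idle_class = { "idle", pick_idle, NULL };
static const struct toy_class fair_class = { "fair", pick_none, &idle_class };
static const struct toy_class rt_class   = { "rt",   pick_none, &fair_class };
static const struct toy_class stop_class = { "stop", pick_none, &rt_class };

#define class_highest (&stop_class)
#define for_each_toy_class(c) for ((c) = class_highest; (c); (c) = (c)->next)

int main(void)
{
        const struct toy_class *class;
        const char *p;

        for_each_toy_class(class) {
                p = class->pick();
                if (p) {
                        printf("next task: %s (from %s class)\n", p, class->name);
                        return 0;
                }
        }
        abort();        /* mirrors the new BUG(): the idle class should have answered */
}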
@@ -3762,7 +3954,6 @@ need_resched_nonpreemptible: | |||
3762 | hrtick_clear(rq); | 3954 | hrtick_clear(rq); |
3763 | 3955 | ||
3764 | raw_spin_lock_irq(&rq->lock); | 3956 | raw_spin_lock_irq(&rq->lock); |
3765 | clear_tsk_need_resched(prev); | ||
3766 | 3957 | ||
3767 | switch_count = &prev->nivcsw; | 3958 | switch_count = &prev->nivcsw; |
3768 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { | 3959 | if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { |
@@ -3794,6 +3985,8 @@ need_resched_nonpreemptible: | |||
3794 | 3985 | ||
3795 | put_prev_task(rq, prev); | 3986 | put_prev_task(rq, prev); |
3796 | next = pick_next_task(rq); | 3987 | next = pick_next_task(rq); |
3988 | clear_tsk_need_resched(prev); | ||
3989 | rq->skip_clock_update = 0; | ||
3797 | 3990 | ||
3798 | if (likely(prev != next)) { | 3991 | if (likely(prev != next)) { |
3799 | sched_info_switch(prev, next); | 3992 | sched_info_switch(prev, next); |
@@ -3888,7 +4081,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) | |||
3888 | if (task_thread_info(rq->curr) != owner || need_resched()) | 4081 | if (task_thread_info(rq->curr) != owner || need_resched()) |
3889 | return 0; | 4082 | return 0; |
3890 | 4083 | ||
3891 | cpu_relax(); | 4084 | arch_mutex_cpu_relax(); |
3892 | } | 4085 | } |
3893 | 4086 | ||
3894 | return 1; | 4087 | return 1; |
@@ -4200,7 +4393,7 @@ EXPORT_SYMBOL(wait_for_completion_interruptible); | |||
4200 | * This waits for either a completion of a specific task to be signaled or for a | 4393 | * This waits for either a completion of a specific task to be signaled or for a |
4201 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. | 4394 | * specified timeout to expire. It is interruptible. The timeout is in jiffies. |
4202 | */ | 4395 | */ |
4203 | unsigned long __sched | 4396 | long __sched |
4204 | wait_for_completion_interruptible_timeout(struct completion *x, | 4397 | wait_for_completion_interruptible_timeout(struct completion *x, |
4205 | unsigned long timeout) | 4398 | unsigned long timeout) |
4206 | { | 4399 | { |
@@ -4233,7 +4426,7 @@ EXPORT_SYMBOL(wait_for_completion_killable); | |||
4233 | * signaled or for a specified timeout to expire. It can be | 4426 | * signaled or for a specified timeout to expire. It can be |
4234 | * interrupted by a kill signal. The timeout is in jiffies. | 4427 | * interrupted by a kill signal. The timeout is in jiffies. |
4235 | */ | 4428 | */ |
4236 | unsigned long __sched | 4429 | long __sched |
4237 | wait_for_completion_killable_timeout(struct completion *x, | 4430 | wait_for_completion_killable_timeout(struct completion *x, |
4238 | unsigned long timeout) | 4431 | unsigned long timeout) |
4239 | { | 4432 | { |
@@ -4358,6 +4551,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
4358 | 4551 | ||
4359 | rq = task_rq_lock(p, &flags); | 4552 | rq = task_rq_lock(p, &flags); |
4360 | 4553 | ||
4554 | trace_sched_pi_setprio(p, prio); | ||
4361 | oldprio = p->prio; | 4555 | oldprio = p->prio; |
4362 | prev_class = p->sched_class; | 4556 | prev_class = p->sched_class; |
4363 | on_rq = p->se.on_rq; | 4557 | on_rq = p->se.on_rq; |
@@ -4574,7 +4768,7 @@ static bool check_same_owner(struct task_struct *p) | |||
4574 | } | 4768 | } |
4575 | 4769 | ||
4576 | static int __sched_setscheduler(struct task_struct *p, int policy, | 4770 | static int __sched_setscheduler(struct task_struct *p, int policy, |
4577 | struct sched_param *param, bool user) | 4771 | const struct sched_param *param, bool user) |
4578 | { | 4772 | { |
4579 | int retval, oldprio, oldpolicy = -1, on_rq, running; | 4773 | int retval, oldprio, oldpolicy = -1, on_rq, running; |
4580 | unsigned long flags; | 4774 | unsigned long flags; |
@@ -4645,7 +4839,7 @@ recheck: | |||
4645 | } | 4839 | } |
4646 | 4840 | ||
4647 | if (user) { | 4841 | if (user) { |
4648 | retval = security_task_setscheduler(p, policy, param); | 4842 | retval = security_task_setscheduler(p); |
4649 | if (retval) | 4843 | if (retval) |
4650 | return retval; | 4844 | return retval; |
4651 | } | 4845 | } |
@@ -4661,6 +4855,15 @@ recheck: | |||
4661 | */ | 4855 | */ |
4662 | rq = __task_rq_lock(p); | 4856 | rq = __task_rq_lock(p); |
4663 | 4857 | ||
4858 | /* | ||
4859 | * Changing the policy of the stop threads its a very bad idea | ||
4860 | */ | ||
4861 | if (p == rq->stop) { | ||
4862 | __task_rq_unlock(rq); | ||
4863 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | ||
4864 | return -EINVAL; | ||
4865 | } | ||
4866 | |||
4664 | #ifdef CONFIG_RT_GROUP_SCHED | 4867 | #ifdef CONFIG_RT_GROUP_SCHED |
4665 | if (user) { | 4868 | if (user) { |
4666 | /* | 4869 | /* |
@@ -4668,7 +4871,8 @@ recheck: | |||
4668 | * assigned. | 4871 | * assigned. |
4669 | */ | 4872 | */ |
4670 | if (rt_bandwidth_enabled() && rt_policy(policy) && | 4873 | if (rt_bandwidth_enabled() && rt_policy(policy) && |
4671 | task_group(p)->rt_bandwidth.rt_runtime == 0) { | 4874 | task_group(p)->rt_bandwidth.rt_runtime == 0 && |
4875 | !task_group_is_autogroup(task_group(p))) { | ||
4672 | __task_rq_unlock(rq); | 4876 | __task_rq_unlock(rq); |
4673 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); | 4877 | raw_spin_unlock_irqrestore(&p->pi_lock, flags); |
4674 | return -EPERM; | 4878 | return -EPERM; |
@@ -4720,7 +4924,7 @@ recheck: | |||
4720 | * NOTE that the task may be already dead. | 4924 | * NOTE that the task may be already dead. |
4721 | */ | 4925 | */ |
4722 | int sched_setscheduler(struct task_struct *p, int policy, | 4926 | int sched_setscheduler(struct task_struct *p, int policy, |
4723 | struct sched_param *param) | 4927 | const struct sched_param *param) |
4724 | { | 4928 | { |
4725 | return __sched_setscheduler(p, policy, param, true); | 4929 | return __sched_setscheduler(p, policy, param, true); |
4726 | } | 4930 | } |
@@ -4738,7 +4942,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler); | |||
4738 | * but our caller might not have that capability. | 4942 | * but our caller might not have that capability. |
4739 | */ | 4943 | */ |
4740 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, | 4944 | int sched_setscheduler_nocheck(struct task_struct *p, int policy, |
4741 | struct sched_param *param) | 4945 | const struct sched_param *param) |
4742 | { | 4946 | { |
4743 | return __sched_setscheduler(p, policy, param, false); | 4947 | return __sched_setscheduler(p, policy, param, false); |
4744 | } | 4948 | } |
@@ -4887,13 +5091,13 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) | |||
4887 | if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) | 5091 | if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) |
4888 | goto out_unlock; | 5092 | goto out_unlock; |
4889 | 5093 | ||
4890 | retval = security_task_setscheduler(p, 0, NULL); | 5094 | retval = security_task_setscheduler(p); |
4891 | if (retval) | 5095 | if (retval) |
4892 | goto out_unlock; | 5096 | goto out_unlock; |
4893 | 5097 | ||
4894 | cpuset_cpus_allowed(p, cpus_allowed); | 5098 | cpuset_cpus_allowed(p, cpus_allowed); |
4895 | cpumask_and(new_mask, in_mask, cpus_allowed); | 5099 | cpumask_and(new_mask, in_mask, cpus_allowed); |
4896 | again: | 5100 | again: |
4897 | retval = set_cpus_allowed_ptr(p, new_mask); | 5101 | retval = set_cpus_allowed_ptr(p, new_mask); |
4898 | 5102 | ||
4899 | if (!retval) { | 5103 | if (!retval) { |
@@ -5254,7 +5458,7 @@ void sched_show_task(struct task_struct *p) | |||
5254 | unsigned state; | 5458 | unsigned state; |
5255 | 5459 | ||
5256 | state = p->state ? __ffs(p->state) + 1 : 0; | 5460 | state = p->state ? __ffs(p->state) + 1 : 0; |
5257 | printk(KERN_INFO "%-13.13s %c", p->comm, | 5461 | printk(KERN_INFO "%-15.15s %c", p->comm, |
5258 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); | 5462 | state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); |
5259 | #if BITS_PER_LONG == 32 | 5463 | #if BITS_PER_LONG == 32 |
5260 | if (state == TASK_RUNNING) | 5464 | if (state == TASK_RUNNING) |
@@ -5337,7 +5541,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
5337 | idle->se.exec_start = sched_clock(); | 5541 | idle->se.exec_start = sched_clock(); |
5338 | 5542 | ||
5339 | cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); | 5543 | cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu)); |
5544 | /* | ||
5545 | * We're having a chicken and egg problem, even though we are | ||
5546 | * holding rq->lock, the cpu isn't yet set to this cpu so the | ||
5547 | * lockdep check in task_group() will fail. | ||
5548 | * | ||
5549 | * Similar case to sched_fork(). / Alternatively we could | ||
5550 | * use task_rq_lock() here and obtain the other rq->lock. | ||
5551 | * | ||
5552 | * Silence PROVE_RCU | ||
5553 | */ | ||
5554 | rcu_read_lock(); | ||
5340 | __set_task_cpu(idle, cpu); | 5555 | __set_task_cpu(idle, cpu); |
5556 | rcu_read_unlock(); | ||
5341 | 5557 | ||
5342 | rq->curr = rq->idle = idle; | 5558 | rq->curr = rq->idle = idle; |
5343 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) | 5559 | #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) |
@@ -5406,7 +5622,6 @@ static void update_sysctl(void) | |||
5406 | SET_SYSCTL(sched_min_granularity); | 5622 | SET_SYSCTL(sched_min_granularity); |
5407 | SET_SYSCTL(sched_latency); | 5623 | SET_SYSCTL(sched_latency); |
5408 | SET_SYSCTL(sched_wakeup_granularity); | 5624 | SET_SYSCTL(sched_wakeup_granularity); |
5409 | SET_SYSCTL(sched_shares_ratelimit); | ||
5410 | #undef SET_SYSCTL | 5625 | #undef SET_SYSCTL |
5411 | } | 5626 | } |
5412 | 5627 | ||
@@ -5482,7 +5697,7 @@ again: | |||
5482 | goto out; | 5697 | goto out; |
5483 | 5698 | ||
5484 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); | 5699 | dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); |
5485 | if (migrate_task(p, dest_cpu)) { | 5700 | if (migrate_task(p, rq)) { |
5486 | struct migration_arg arg = { p, dest_cpu }; | 5701 | struct migration_arg arg = { p, dest_cpu }; |
5487 | /* Need help from migration thread: drop lock and wait. */ | 5702 | /* Need help from migration thread: drop lock and wait. */ |
5488 | task_rq_unlock(rq, &flags); | 5703 | task_rq_unlock(rq, &flags); |
@@ -5564,29 +5779,20 @@ static int migration_cpu_stop(void *data) | |||
5564 | } | 5779 | } |
5565 | 5780 | ||
5566 | #ifdef CONFIG_HOTPLUG_CPU | 5781 | #ifdef CONFIG_HOTPLUG_CPU |
5782 | |||
5567 | /* | 5783 | /* |
5568 | * Figure out where task on dead CPU should go, use force if necessary. | 5784 | * Ensures that the idle task is using init_mm right before its cpu goes |
5785 | * offline. | ||
5569 | */ | 5786 | */ |
5570 | void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | 5787 | void idle_task_exit(void) |
5571 | { | 5788 | { |
5572 | struct rq *rq = cpu_rq(dead_cpu); | 5789 | struct mm_struct *mm = current->active_mm; |
5573 | int needs_cpu, uninitialized_var(dest_cpu); | ||
5574 | unsigned long flags; | ||
5575 | 5790 | ||
5576 | local_irq_save(flags); | 5791 | BUG_ON(cpu_online(smp_processor_id())); |
5577 | 5792 | ||
5578 | raw_spin_lock(&rq->lock); | 5793 | if (mm != &init_mm) |
5579 | needs_cpu = (task_cpu(p) == dead_cpu) && (p->state != TASK_WAKING); | 5794 | switch_mm(mm, &init_mm, current); |
5580 | if (needs_cpu) | 5795 | mmdrop(mm); |
5581 | dest_cpu = select_fallback_rq(dead_cpu, p); | ||
5582 | raw_spin_unlock(&rq->lock); | ||
5583 | /* | ||
5584 | * It can only fail if we race with set_cpus_allowed(), | ||
5585 | * in the racer should migrate the task anyway. | ||
5586 | */ | ||
5587 | if (needs_cpu) | ||
5588 | __migrate_task(p, dead_cpu, dest_cpu); | ||
5589 | local_irq_restore(flags); | ||
5590 | } | 5796 | } |
5591 | 5797 | ||
5592 | /* | 5798 | /* |
@@ -5599,128 +5805,69 @@ void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | |||
5599 | static void migrate_nr_uninterruptible(struct rq *rq_src) | 5805 | static void migrate_nr_uninterruptible(struct rq *rq_src) |
5600 | { | 5806 | { |
5601 | struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); | 5807 | struct rq *rq_dest = cpu_rq(cpumask_any(cpu_active_mask)); |
5602 | unsigned long flags; | ||
5603 | 5808 | ||
5604 | local_irq_save(flags); | ||
5605 | double_rq_lock(rq_src, rq_dest); | ||
5606 | rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; | 5809 | rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible; |
5607 | rq_src->nr_uninterruptible = 0; | 5810 | rq_src->nr_uninterruptible = 0; |
5608 | double_rq_unlock(rq_src, rq_dest); | ||
5609 | local_irq_restore(flags); | ||
5610 | } | ||
5611 | |||
5612 | /* Run through task list and migrate tasks from the dead cpu. */ | ||
5613 | static void migrate_live_tasks(int src_cpu) | ||
5614 | { | ||
5615 | struct task_struct *p, *t; | ||
5616 | |||
5617 | read_lock(&tasklist_lock); | ||
5618 | |||
5619 | do_each_thread(t, p) { | ||
5620 | if (p == current) | ||
5621 | continue; | ||
5622 | |||
5623 | if (task_cpu(p) == src_cpu) | ||
5624 | move_task_off_dead_cpu(src_cpu, p); | ||
5625 | } while_each_thread(t, p); | ||
5626 | |||
5627 | read_unlock(&tasklist_lock); | ||
5628 | } | 5811 | } |
5629 | 5812 | ||
5630 | /* | 5813 | /* |
5631 | * Schedules idle task to be the next runnable task on current CPU. | 5814 | * remove the tasks which were accounted by rq from calc_load_tasks. |
5632 | * It does so by boosting its priority to highest possible. | ||
5633 | * Used by CPU offline code. | ||
5634 | */ | 5815 | */ |
5635 | void sched_idle_next(void) | 5816 | static void calc_global_load_remove(struct rq *rq) |
5636 | { | 5817 | { |
5637 | int this_cpu = smp_processor_id(); | 5818 | atomic_long_sub(rq->calc_load_active, &calc_load_tasks); |
5638 | struct rq *rq = cpu_rq(this_cpu); | 5819 | rq->calc_load_active = 0; |
5639 | struct task_struct *p = rq->idle; | ||
5640 | unsigned long flags; | ||
5641 | |||
5642 | /* cpu has to be offline */ | ||
5643 | BUG_ON(cpu_online(this_cpu)); | ||
5644 | |||
5645 | /* | ||
5646 | * Strictly not necessary since rest of the CPUs are stopped by now | ||
5647 | * and interrupts disabled on the current cpu. | ||
5648 | */ | ||
5649 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
5650 | |||
5651 | __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); | ||
5652 | |||
5653 | activate_task(rq, p, 0); | ||
5654 | |||
5655 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
5656 | } | 5820 | } |
5657 | 5821 | ||
5658 | /* | 5822 | /* |
5659 | * Ensures that the idle task is using init_mm right before its cpu goes | 5823 | * Migrate all tasks from the rq; sleeping tasks will be migrated by |
5660 | * offline. | 5824 | * try_to_wake_up()->select_task_rq(). |
5825 | * | ||
5826 | * Called with rq->lock held even though we're in stop_machine() and | ||
5827 | * there's no concurrency possible, we hold the required locks anyway | ||
5828 | * because of lock validation efforts. | ||
5661 | */ | 5829 | */ |
5662 | void idle_task_exit(void) | 5830 | static void migrate_tasks(unsigned int dead_cpu) |
5663 | { | ||
5664 | struct mm_struct *mm = current->active_mm; | ||
5665 | |||
5666 | BUG_ON(cpu_online(smp_processor_id())); | ||
5667 | |||
5668 | if (mm != &init_mm) | ||
5669 | switch_mm(mm, &init_mm, current); | ||
5670 | mmdrop(mm); | ||
5671 | } | ||
5672 | |||
5673 | /* called under rq->lock with disabled interrupts */ | ||
5674 | static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) | ||
5675 | { | 5831 | { |
5676 | struct rq *rq = cpu_rq(dead_cpu); | 5832 | struct rq *rq = cpu_rq(dead_cpu); |
5677 | 5833 | struct task_struct *next, *stop = rq->stop; | |
5678 | /* Must be exiting, otherwise would be on tasklist. */ | 5834 | int dest_cpu; |
5679 | BUG_ON(!p->exit_state); | ||
5680 | |||
5681 | /* Cannot have done final schedule yet: would have vanished. */ | ||
5682 | BUG_ON(p->state == TASK_DEAD); | ||
5683 | |||
5684 | get_task_struct(p); | ||
5685 | 5835 | ||
5686 | /* | 5836 | /* |
5687 | * Drop lock around migration; if someone else moves it, | 5837 | * Fudge the rq selection such that the below task selection loop |
5688 | * that's OK. No task can be added to this CPU, so iteration is | 5838 | * doesn't get stuck on the currently eligible stop task. |
5689 | * fine. | 5839 | * |
5840 | * We're currently inside stop_machine() and the rq is either stuck | ||
5841 | * in the stop_machine_cpu_stop() loop, or we're executing this code, | ||
5842 | * either way we should never end up calling schedule() until we're | ||
5843 | * done here. | ||
5690 | */ | 5844 | */ |
5691 | raw_spin_unlock_irq(&rq->lock); | 5845 | rq->stop = NULL; |
5692 | move_task_off_dead_cpu(dead_cpu, p); | ||
5693 | raw_spin_lock_irq(&rq->lock); | ||
5694 | |||
5695 | put_task_struct(p); | ||
5696 | } | ||
5697 | |||
5698 | /* release_task() removes task from tasklist, so we won't find dead tasks. */ | ||
5699 | static void migrate_dead_tasks(unsigned int dead_cpu) | ||
5700 | { | ||
5701 | struct rq *rq = cpu_rq(dead_cpu); | ||
5702 | struct task_struct *next; | ||
5703 | 5846 | ||
5704 | for ( ; ; ) { | 5847 | for ( ; ; ) { |
5705 | if (!rq->nr_running) | 5848 | /* |
5849 | * There's this thread running; bail when that's the only | ||
5850 | * remaining thread. | ||
5851 | */ | ||
5852 | if (rq->nr_running == 1) | ||
5706 | break; | 5853 | break; |
5854 | |||
5707 | next = pick_next_task(rq); | 5855 | next = pick_next_task(rq); |
5708 | if (!next) | 5856 | BUG_ON(!next); |
5709 | break; | ||
5710 | next->sched_class->put_prev_task(rq, next); | 5857 | next->sched_class->put_prev_task(rq, next); |
5711 | migrate_dead(dead_cpu, next); | ||
5712 | 5858 | ||
5859 | /* Find suitable destination for @next, with force if needed. */ | ||
5860 | dest_cpu = select_fallback_rq(dead_cpu, next); | ||
5861 | raw_spin_unlock(&rq->lock); | ||
5862 | |||
5863 | __migrate_task(next, dead_cpu, dest_cpu); | ||
5864 | |||
5865 | raw_spin_lock(&rq->lock); | ||
5713 | } | 5866 | } |
5714 | } | ||
5715 | 5867 | ||
5716 | /* | 5868 | rq->stop = stop; |
5717 | * remove the tasks which were accounted by rq from calc_load_tasks. | ||
5718 | */ | ||
5719 | static void calc_global_load_remove(struct rq *rq) | ||
5720 | { | ||
5721 | atomic_long_sub(rq->calc_load_active, &calc_load_tasks); | ||
5722 | rq->calc_load_active = 0; | ||
5723 | } | 5869 | } |
5870 | |||
5724 | #endif /* CONFIG_HOTPLUG_CPU */ | 5871 | #endif /* CONFIG_HOTPLUG_CPU */ |
5725 | 5872 | ||
5726 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) | 5873 | #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) |
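The rewritten migrate_tasks() replaces the old migrate_live_tasks()/migrate_dead_tasks() pair with a single pick-and-push loop, temporarily clearing rq->stop so the highest-priority stop task cannot be picked. Below is a minimal userspace sketch of that loop, assuming a toy runqueue modelled as an array of task ids; none of the names are kernel APIs. In the real code the destination comes from select_fallback_rq() and each move is done by __migrate_task() with rq->lock dropped around it.

#include <stdio.h>

#define NR_SLOTS 8

struct toy_rq {
	int task[NR_SLOTS];	/* task ids; 0 means an empty slot */
	int nr_running;
	int stop;		/* id of the per-cpu stop task, 0 if hidden */
	int curr;		/* id of the task running this code */
};

/* the stop task, when visible, is always preferred (highest class) */
static int pick_next(struct toy_rq *rq)
{
	for (int i = 0; i < NR_SLOTS; i++)
		if (rq->task[i] && rq->task[i] == rq->stop)
			return i;
	for (int i = 0; i < NR_SLOTS; i++)
		if (rq->task[i] && rq->task[i] != rq->curr)
			return i;
	return -1;
}

static void toy_migrate_tasks(struct toy_rq *rq, int dest_cpu)
{
	int stop = rq->stop;

	rq->stop = 0;			/* fudge: hide the stop task */
	while (rq->nr_running > 1) {	/* bail when only we remain */
		int slot = pick_next(rq);

		if (slot < 0)
			break;
		printf("task %d -> cpu %d\n", rq->task[slot], dest_cpu);
		rq->task[slot] = 0;	/* "migrated" away */
		rq->nr_running--;
	}
	rq->stop = stop;		/* restore it for the dying cpu */
}

int main(void)
{
	/* task 7 is both the stop task and the one running this loop */
	struct toy_rq rq = { .task = { 7, 2, 3, 4 }, .nr_running = 4,
			     .stop = 7, .curr = 7 };

	toy_migrate_tasks(&rq, 1);
	return 0;
}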
@@ -5930,15 +6077,13 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
5930 | unsigned long flags; | 6077 | unsigned long flags; |
5931 | struct rq *rq = cpu_rq(cpu); | 6078 | struct rq *rq = cpu_rq(cpu); |
5932 | 6079 | ||
5933 | switch (action) { | 6080 | switch (action & ~CPU_TASKS_FROZEN) { |
5934 | 6081 | ||
5935 | case CPU_UP_PREPARE: | 6082 | case CPU_UP_PREPARE: |
5936 | case CPU_UP_PREPARE_FROZEN: | ||
5937 | rq->calc_load_update = calc_load_update; | 6083 | rq->calc_load_update = calc_load_update; |
5938 | break; | 6084 | break; |
5939 | 6085 | ||
5940 | case CPU_ONLINE: | 6086 | case CPU_ONLINE: |
5941 | case CPU_ONLINE_FROZEN: | ||
5942 | /* Update our root-domain */ | 6087 | /* Update our root-domain */ |
5943 | raw_spin_lock_irqsave(&rq->lock, flags); | 6088 | raw_spin_lock_irqsave(&rq->lock, flags); |
5944 | if (rq->rd) { | 6089 | if (rq->rd) { |
@@ -5950,30 +6095,19 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
5950 | break; | 6095 | break; |
5951 | 6096 | ||
5952 | #ifdef CONFIG_HOTPLUG_CPU | 6097 | #ifdef CONFIG_HOTPLUG_CPU |
5953 | case CPU_DEAD: | ||
5954 | case CPU_DEAD_FROZEN: | ||
5955 | migrate_live_tasks(cpu); | ||
5956 | /* Idle task back to normal (off runqueue, low prio) */ | ||
5957 | raw_spin_lock_irq(&rq->lock); | ||
5958 | deactivate_task(rq, rq->idle, 0); | ||
5959 | __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); | ||
5960 | rq->idle->sched_class = &idle_sched_class; | ||
5961 | migrate_dead_tasks(cpu); | ||
5962 | raw_spin_unlock_irq(&rq->lock); | ||
5963 | migrate_nr_uninterruptible(rq); | ||
5964 | BUG_ON(rq->nr_running != 0); | ||
5965 | calc_global_load_remove(rq); | ||
5966 | break; | ||
5967 | |||
5968 | case CPU_DYING: | 6098 | case CPU_DYING: |
5969 | case CPU_DYING_FROZEN: | ||
5970 | /* Update our root-domain */ | 6099 | /* Update our root-domain */ |
5971 | raw_spin_lock_irqsave(&rq->lock, flags); | 6100 | raw_spin_lock_irqsave(&rq->lock, flags); |
5972 | if (rq->rd) { | 6101 | if (rq->rd) { |
5973 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); | 6102 | BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); |
5974 | set_rq_offline(rq); | 6103 | set_rq_offline(rq); |
5975 | } | 6104 | } |
6105 | migrate_tasks(cpu); | ||
6106 | BUG_ON(rq->nr_running != 1); /* the migration thread */ | ||
5976 | raw_spin_unlock_irqrestore(&rq->lock, flags); | 6107 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
6108 | |||
6109 | migrate_nr_uninterruptible(rq); | ||
6110 | calc_global_load_remove(rq); | ||
5977 | break; | 6111 | break; |
5978 | #endif | 6112 | #endif |
5979 | } | 6113 | } |
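The notifier switch now strips CPU_TASKS_FROZEN from the action, so the *_FROZEN (suspend/resume) case labels collapse into their plain counterparts. A small standalone illustration of the idiom follows; the constants mirror the usual include/linux/cpu.h values but are hard-coded here only for the sketch.

#include <stdio.h>

#define CPU_ONLINE		0x0002
#define CPU_UP_PREPARE		0x0003
#define CPU_TASKS_FROZEN	0x0010	/* OR'ed in during suspend/resume */
#define CPU_ONLINE_FROZEN	(CPU_ONLINE | CPU_TASKS_FROZEN)
#define CPU_UP_PREPARE_FROZEN	(CPU_UP_PREPARE | CPU_TASKS_FROZEN)

static const char *classify(unsigned long action)
{
	switch (action & ~CPU_TASKS_FROZEN) {	/* one label serves both */
	case CPU_UP_PREPARE:
		return "up-prepare";
	case CPU_ONLINE:
		return "online";
	default:
		return "other";
	}
}

int main(void)
{
	printf("%s\n", classify(CPU_ONLINE));		/* online */
	printf("%s\n", classify(CPU_ONLINE_FROZEN));	/* online */
	printf("%s\n", classify(CPU_UP_PREPARE_FROZEN));/* up-prepare */
	return 0;
}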
@@ -6514,6 +6648,7 @@ struct s_data { | |||
6514 | cpumask_var_t nodemask; | 6648 | cpumask_var_t nodemask; |
6515 | cpumask_var_t this_sibling_map; | 6649 | cpumask_var_t this_sibling_map; |
6516 | cpumask_var_t this_core_map; | 6650 | cpumask_var_t this_core_map; |
6651 | cpumask_var_t this_book_map; | ||
6517 | cpumask_var_t send_covered; | 6652 | cpumask_var_t send_covered; |
6518 | cpumask_var_t tmpmask; | 6653 | cpumask_var_t tmpmask; |
6519 | struct sched_group **sched_group_nodes; | 6654 | struct sched_group **sched_group_nodes; |
@@ -6525,6 +6660,7 @@ enum s_alloc { | |||
6525 | sa_rootdomain, | 6660 | sa_rootdomain, |
6526 | sa_tmpmask, | 6661 | sa_tmpmask, |
6527 | sa_send_covered, | 6662 | sa_send_covered, |
6663 | sa_this_book_map, | ||
6528 | sa_this_core_map, | 6664 | sa_this_core_map, |
6529 | sa_this_sibling_map, | 6665 | sa_this_sibling_map, |
6530 | sa_nodemask, | 6666 | sa_nodemask, |
@@ -6560,31 +6696,48 @@ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, | |||
6560 | #ifdef CONFIG_SCHED_MC | 6696 | #ifdef CONFIG_SCHED_MC |
6561 | static DEFINE_PER_CPU(struct static_sched_domain, core_domains); | 6697 | static DEFINE_PER_CPU(struct static_sched_domain, core_domains); |
6562 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); | 6698 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_core); |
6563 | #endif /* CONFIG_SCHED_MC */ | ||
6564 | 6699 | ||
6565 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) | ||
6566 | static int | 6700 | static int |
6567 | cpu_to_core_group(int cpu, const struct cpumask *cpu_map, | 6701 | cpu_to_core_group(int cpu, const struct cpumask *cpu_map, |
6568 | struct sched_group **sg, struct cpumask *mask) | 6702 | struct sched_group **sg, struct cpumask *mask) |
6569 | { | 6703 | { |
6570 | int group; | 6704 | int group; |
6571 | 6705 | #ifdef CONFIG_SCHED_SMT | |
6572 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); | 6706 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); |
6573 | group = cpumask_first(mask); | 6707 | group = cpumask_first(mask); |
6708 | #else | ||
6709 | group = cpu; | ||
6710 | #endif | ||
6574 | if (sg) | 6711 | if (sg) |
6575 | *sg = &per_cpu(sched_group_core, group).sg; | 6712 | *sg = &per_cpu(sched_group_core, group).sg; |
6576 | return group; | 6713 | return group; |
6577 | } | 6714 | } |
6578 | #elif defined(CONFIG_SCHED_MC) | 6715 | #endif /* CONFIG_SCHED_MC */ |
6716 | |||
6717 | /* | ||
6718 | * book sched-domains: | ||
6719 | */ | ||
6720 | #ifdef CONFIG_SCHED_BOOK | ||
6721 | static DEFINE_PER_CPU(struct static_sched_domain, book_domains); | ||
6722 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_book); | ||
6723 | |||
6579 | static int | 6724 | static int |
6580 | cpu_to_core_group(int cpu, const struct cpumask *cpu_map, | 6725 | cpu_to_book_group(int cpu, const struct cpumask *cpu_map, |
6581 | struct sched_group **sg, struct cpumask *unused) | 6726 | struct sched_group **sg, struct cpumask *mask) |
6582 | { | 6727 | { |
6728 | int group = cpu; | ||
6729 | #ifdef CONFIG_SCHED_MC | ||
6730 | cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); | ||
6731 | group = cpumask_first(mask); | ||
6732 | #elif defined(CONFIG_SCHED_SMT) | ||
6733 | cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map); | ||
6734 | group = cpumask_first(mask); | ||
6735 | #endif | ||
6583 | if (sg) | 6736 | if (sg) |
6584 | *sg = &per_cpu(sched_group_core, cpu).sg; | 6737 | *sg = &per_cpu(sched_group_book, group).sg; |
6585 | return cpu; | 6738 | return group; |
6586 | } | 6739 | } |
6587 | #endif | 6740 | #endif /* CONFIG_SCHED_BOOK */ |
6588 | 6741 | ||
6589 | static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); | 6742 | static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); |
6590 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); | 6743 | static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); |
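cpu_to_book_group() follows the same shape as the other cpu_to_*_group() helpers: intersect the cpu's topology sibling mask with cpu_map and let the first cpu of the result represent the group. A toy version with cpumasks reduced to plain bitmasks, assuming a hypothetical 8-cpu machine with 2 threads per core and 4 cpus per book:

#include <stdio.h>

/* hypothetical topology: cores = {0,1},{2,3},... books = {0..3},{4..7} */
static unsigned int core_mask(int cpu) { return 0x3u << (cpu & ~1); }
static unsigned int book_mask(int cpu) { return 0xFu << (cpu & ~3); }

static int first_cpu(unsigned int mask)
{
	for (int i = 0; i < 8; i++)
		if (mask & (1u << i))
			return i;
	return -1;
}

/* same shape as cpu_to_book_group(): first cpu of (sibling mask & map) */
static int to_group(unsigned int sibling_mask, unsigned int cpu_map)
{
	return first_cpu(sibling_mask & cpu_map);
}

int main(void)
{
	unsigned int cpu_map = 0xFF;	/* all 8 cpus in the map */

	printf("core group of cpu 6 -> cpu %d\n", to_group(core_mask(6), cpu_map));
	printf("book group of cpu 6 -> cpu %d\n", to_group(book_mask(6), cpu_map));
	return 0;
}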
@@ -6594,7 +6747,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map, | |||
6594 | struct sched_group **sg, struct cpumask *mask) | 6747 | struct sched_group **sg, struct cpumask *mask) |
6595 | { | 6748 | { |
6596 | int group; | 6749 | int group; |
6597 | #ifdef CONFIG_SCHED_MC | 6750 | #ifdef CONFIG_SCHED_BOOK |
6751 | cpumask_and(mask, cpu_book_mask(cpu), cpu_map); | ||
6752 | group = cpumask_first(mask); | ||
6753 | #elif defined(CONFIG_SCHED_MC) | ||
6598 | cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); | 6754 | cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map); |
6599 | group = cpumask_first(mask); | 6755 | group = cpumask_first(mask); |
6600 | #elif defined(CONFIG_SCHED_SMT) | 6756 | #elif defined(CONFIG_SCHED_SMT) |
@@ -6790,6 +6946,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
6790 | if (cpu != group_first_cpu(sd->groups)) | 6946 | if (cpu != group_first_cpu(sd->groups)) |
6791 | return; | 6947 | return; |
6792 | 6948 | ||
6949 | sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); | ||
6950 | |||
6793 | child = sd->child; | 6951 | child = sd->child; |
6794 | 6952 | ||
6795 | sd->groups->cpu_power = 0; | 6953 | sd->groups->cpu_power = 0; |
@@ -6855,6 +7013,9 @@ SD_INIT_FUNC(CPU) | |||
6855 | #ifdef CONFIG_SCHED_MC | 7013 | #ifdef CONFIG_SCHED_MC |
6856 | SD_INIT_FUNC(MC) | 7014 | SD_INIT_FUNC(MC) |
6857 | #endif | 7015 | #endif |
7016 | #ifdef CONFIG_SCHED_BOOK | ||
7017 | SD_INIT_FUNC(BOOK) | ||
7018 | #endif | ||
6858 | 7019 | ||
6859 | static int default_relax_domain_level = -1; | 7020 | static int default_relax_domain_level = -1; |
6860 | 7021 | ||
@@ -6904,6 +7065,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what, | |||
6904 | free_cpumask_var(d->tmpmask); /* fall through */ | 7065 | free_cpumask_var(d->tmpmask); /* fall through */ |
6905 | case sa_send_covered: | 7066 | case sa_send_covered: |
6906 | free_cpumask_var(d->send_covered); /* fall through */ | 7067 | free_cpumask_var(d->send_covered); /* fall through */ |
7068 | case sa_this_book_map: | ||
7069 | free_cpumask_var(d->this_book_map); /* fall through */ | ||
6907 | case sa_this_core_map: | 7070 | case sa_this_core_map: |
6908 | free_cpumask_var(d->this_core_map); /* fall through */ | 7071 | free_cpumask_var(d->this_core_map); /* fall through */ |
6909 | case sa_this_sibling_map: | 7072 | case sa_this_sibling_map: |
@@ -6950,8 +7113,10 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, | |||
6950 | return sa_nodemask; | 7113 | return sa_nodemask; |
6951 | if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) | 7114 | if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL)) |
6952 | return sa_this_sibling_map; | 7115 | return sa_this_sibling_map; |
6953 | if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) | 7116 | if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL)) |
6954 | return sa_this_core_map; | 7117 | return sa_this_core_map; |
7118 | if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL)) | ||
7119 | return sa_this_book_map; | ||
6955 | if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) | 7120 | if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL)) |
6956 | return sa_send_covered; | 7121 | return sa_send_covered; |
6957 | d->rd = alloc_rootdomain(); | 7122 | d->rd = alloc_rootdomain(); |
@@ -7009,6 +7174,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, | |||
7009 | return sd; | 7174 | return sd; |
7010 | } | 7175 | } |
7011 | 7176 | ||
7177 | static struct sched_domain *__build_book_sched_domain(struct s_data *d, | ||
7178 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | ||
7179 | struct sched_domain *parent, int i) | ||
7180 | { | ||
7181 | struct sched_domain *sd = parent; | ||
7182 | #ifdef CONFIG_SCHED_BOOK | ||
7183 | sd = &per_cpu(book_domains, i).sd; | ||
7184 | SD_INIT(sd, BOOK); | ||
7185 | set_domain_attribute(sd, attr); | ||
7186 | cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i)); | ||
7187 | sd->parent = parent; | ||
7188 | parent->child = sd; | ||
7189 | cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask); | ||
7190 | #endif | ||
7191 | return sd; | ||
7192 | } | ||
7193 | |||
7012 | static struct sched_domain *__build_mc_sched_domain(struct s_data *d, | 7194 | static struct sched_domain *__build_mc_sched_domain(struct s_data *d, |
7013 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, | 7195 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, |
7014 | struct sched_domain *parent, int i) | 7196 | struct sched_domain *parent, int i) |
@@ -7066,6 +7248,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l, | |||
7066 | d->send_covered, d->tmpmask); | 7248 | d->send_covered, d->tmpmask); |
7067 | break; | 7249 | break; |
7068 | #endif | 7250 | #endif |
7251 | #ifdef CONFIG_SCHED_BOOK | ||
7252 | case SD_LV_BOOK: /* set up book groups */ | ||
7253 | cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu)); | ||
7254 | if (cpu == cpumask_first(d->this_book_map)) | ||
7255 | init_sched_build_groups(d->this_book_map, cpu_map, | ||
7256 | &cpu_to_book_group, | ||
7257 | d->send_covered, d->tmpmask); | ||
7258 | break; | ||
7259 | #endif | ||
7069 | case SD_LV_CPU: /* set up physical groups */ | 7260 | case SD_LV_CPU: /* set up physical groups */ |
7070 | cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); | 7261 | cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map); |
7071 | if (!cpumask_empty(d->nodemask)) | 7262 | if (!cpumask_empty(d->nodemask)) |
@@ -7113,12 +7304,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map, | |||
7113 | 7304 | ||
7114 | sd = __build_numa_sched_domains(&d, cpu_map, attr, i); | 7305 | sd = __build_numa_sched_domains(&d, cpu_map, attr, i); |
7115 | sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); | 7306 | sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i); |
7307 | sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i); | ||
7116 | sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); | 7308 | sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i); |
7117 | sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); | 7309 | sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i); |
7118 | } | 7310 | } |
7119 | 7311 | ||
7120 | for_each_cpu(i, cpu_map) { | 7312 | for_each_cpu(i, cpu_map) { |
7121 | build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); | 7313 | build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i); |
7314 | build_sched_groups(&d, SD_LV_BOOK, cpu_map, i); | ||
7122 | build_sched_groups(&d, SD_LV_MC, cpu_map, i); | 7315 | build_sched_groups(&d, SD_LV_MC, cpu_map, i); |
7123 | } | 7316 | } |
7124 | 7317 | ||
@@ -7149,6 +7342,12 @@ static int __build_sched_domains(const struct cpumask *cpu_map, | |||
7149 | init_sched_groups_power(i, sd); | 7342 | init_sched_groups_power(i, sd); |
7150 | } | 7343 | } |
7151 | #endif | 7344 | #endif |
7345 | #ifdef CONFIG_SCHED_BOOK | ||
7346 | for_each_cpu(i, cpu_map) { | ||
7347 | sd = &per_cpu(book_domains, i).sd; | ||
7348 | init_sched_groups_power(i, sd); | ||
7349 | } | ||
7350 | #endif | ||
7152 | 7351 | ||
7153 | for_each_cpu(i, cpu_map) { | 7352 | for_each_cpu(i, cpu_map) { |
7154 | sd = &per_cpu(phys_domains, i).sd; | 7353 | sd = &per_cpu(phys_domains, i).sd; |
@@ -7174,6 +7373,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map, | |||
7174 | sd = &per_cpu(cpu_domains, i).sd; | 7373 | sd = &per_cpu(cpu_domains, i).sd; |
7175 | #elif defined(CONFIG_SCHED_MC) | 7374 | #elif defined(CONFIG_SCHED_MC) |
7176 | sd = &per_cpu(core_domains, i).sd; | 7375 | sd = &per_cpu(core_domains, i).sd; |
7376 | #elif defined(CONFIG_SCHED_BOOK) | ||
7377 | sd = &per_cpu(book_domains, i).sd; | ||
7177 | #else | 7378 | #else |
7178 | sd = &per_cpu(phys_domains, i).sd; | 7379 | sd = &per_cpu(phys_domains, i).sd; |
7179 | #endif | 7380 | #endif |
@@ -7637,18 +7838,16 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | |||
7637 | 7838 | ||
7638 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7839 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7639 | static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | 7840 | static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, |
7640 | struct sched_entity *se, int cpu, int add, | 7841 | struct sched_entity *se, int cpu, |
7641 | struct sched_entity *parent) | 7842 | struct sched_entity *parent) |
7642 | { | 7843 | { |
7643 | struct rq *rq = cpu_rq(cpu); | 7844 | struct rq *rq = cpu_rq(cpu); |
7644 | tg->cfs_rq[cpu] = cfs_rq; | 7845 | tg->cfs_rq[cpu] = cfs_rq; |
7645 | init_cfs_rq(cfs_rq, rq); | 7846 | init_cfs_rq(cfs_rq, rq); |
7646 | cfs_rq->tg = tg; | 7847 | cfs_rq->tg = tg; |
7647 | if (add) | ||
7648 | list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | ||
7649 | 7848 | ||
7650 | tg->se[cpu] = se; | 7849 | tg->se[cpu] = se; |
7651 | /* se could be NULL for init_task_group */ | 7850 | /* se could be NULL for root_task_group */ |
7652 | if (!se) | 7851 | if (!se) |
7653 | return; | 7852 | return; |
7654 | 7853 | ||
@@ -7658,15 +7857,14 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | |||
7658 | se->cfs_rq = parent->my_q; | 7857 | se->cfs_rq = parent->my_q; |
7659 | 7858 | ||
7660 | se->my_q = cfs_rq; | 7859 | se->my_q = cfs_rq; |
7661 | se->load.weight = tg->shares; | 7860 | update_load_set(&se->load, 0); |
7662 | se->load.inv_weight = 0; | ||
7663 | se->parent = parent; | 7861 | se->parent = parent; |
7664 | } | 7862 | } |
7665 | #endif | 7863 | #endif |
7666 | 7864 | ||
7667 | #ifdef CONFIG_RT_GROUP_SCHED | 7865 | #ifdef CONFIG_RT_GROUP_SCHED |
7668 | static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | 7866 | static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, |
7669 | struct sched_rt_entity *rt_se, int cpu, int add, | 7867 | struct sched_rt_entity *rt_se, int cpu, |
7670 | struct sched_rt_entity *parent) | 7868 | struct sched_rt_entity *parent) |
7671 | { | 7869 | { |
7672 | struct rq *rq = cpu_rq(cpu); | 7870 | struct rq *rq = cpu_rq(cpu); |
@@ -7675,8 +7873,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, | |||
7675 | init_rt_rq(rt_rq, rq); | 7873 | init_rt_rq(rt_rq, rq); |
7676 | rt_rq->tg = tg; | 7874 | rt_rq->tg = tg; |
7677 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; | 7875 | rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; |
7678 | if (add) | ||
7679 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | ||
7680 | 7876 | ||
7681 | tg->rt_se[cpu] = rt_se; | 7877 | tg->rt_se[cpu] = rt_se; |
7682 | if (!rt_se) | 7878 | if (!rt_se) |
@@ -7711,18 +7907,18 @@ void __init sched_init(void) | |||
7711 | ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); | 7907 | ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT); |
7712 | 7908 | ||
7713 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7909 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7714 | init_task_group.se = (struct sched_entity **)ptr; | 7910 | root_task_group.se = (struct sched_entity **)ptr; |
7715 | ptr += nr_cpu_ids * sizeof(void **); | 7911 | ptr += nr_cpu_ids * sizeof(void **); |
7716 | 7912 | ||
7717 | init_task_group.cfs_rq = (struct cfs_rq **)ptr; | 7913 | root_task_group.cfs_rq = (struct cfs_rq **)ptr; |
7718 | ptr += nr_cpu_ids * sizeof(void **); | 7914 | ptr += nr_cpu_ids * sizeof(void **); |
7719 | 7915 | ||
7720 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 7916 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
7721 | #ifdef CONFIG_RT_GROUP_SCHED | 7917 | #ifdef CONFIG_RT_GROUP_SCHED |
7722 | init_task_group.rt_se = (struct sched_rt_entity **)ptr; | 7918 | root_task_group.rt_se = (struct sched_rt_entity **)ptr; |
7723 | ptr += nr_cpu_ids * sizeof(void **); | 7919 | ptr += nr_cpu_ids * sizeof(void **); |
7724 | 7920 | ||
7725 | init_task_group.rt_rq = (struct rt_rq **)ptr; | 7921 | root_task_group.rt_rq = (struct rt_rq **)ptr; |
7726 | ptr += nr_cpu_ids * sizeof(void **); | 7922 | ptr += nr_cpu_ids * sizeof(void **); |
7727 | 7923 | ||
7728 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7924 | #endif /* CONFIG_RT_GROUP_SCHED */ |
@@ -7742,20 +7938,16 @@ void __init sched_init(void) | |||
7742 | global_rt_period(), global_rt_runtime()); | 7938 | global_rt_period(), global_rt_runtime()); |
7743 | 7939 | ||
7744 | #ifdef CONFIG_RT_GROUP_SCHED | 7940 | #ifdef CONFIG_RT_GROUP_SCHED |
7745 | init_rt_bandwidth(&init_task_group.rt_bandwidth, | 7941 | init_rt_bandwidth(&root_task_group.rt_bandwidth, |
7746 | global_rt_period(), global_rt_runtime()); | 7942 | global_rt_period(), global_rt_runtime()); |
7747 | #endif /* CONFIG_RT_GROUP_SCHED */ | 7943 | #endif /* CONFIG_RT_GROUP_SCHED */ |
7748 | 7944 | ||
7749 | #ifdef CONFIG_CGROUP_SCHED | 7945 | #ifdef CONFIG_CGROUP_SCHED |
7750 | list_add(&init_task_group.list, &task_groups); | 7946 | list_add(&root_task_group.list, &task_groups); |
7751 | INIT_LIST_HEAD(&init_task_group.children); | 7947 | INIT_LIST_HEAD(&root_task_group.children); |
7752 | 7948 | autogroup_init(&init_task); | |
7753 | #endif /* CONFIG_CGROUP_SCHED */ | 7949 | #endif /* CONFIG_CGROUP_SCHED */ |
7754 | 7950 | ||
7755 | #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP | ||
7756 | update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), | ||
7757 | __alignof__(unsigned long)); | ||
7758 | #endif | ||
7759 | for_each_possible_cpu(i) { | 7951 | for_each_possible_cpu(i) { |
7760 | struct rq *rq; | 7952 | struct rq *rq; |
7761 | 7953 | ||
@@ -7767,38 +7959,34 @@ void __init sched_init(void) | |||
7767 | init_cfs_rq(&rq->cfs, rq); | 7959 | init_cfs_rq(&rq->cfs, rq); |
7768 | init_rt_rq(&rq->rt, rq); | 7960 | init_rt_rq(&rq->rt, rq); |
7769 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7961 | #ifdef CONFIG_FAIR_GROUP_SCHED |
7770 | init_task_group.shares = init_task_group_load; | 7962 | root_task_group.shares = root_task_group_load; |
7771 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | 7963 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
7772 | #ifdef CONFIG_CGROUP_SCHED | ||
7773 | /* | 7964 | /* |
7774 | * How much cpu bandwidth does init_task_group get? | 7965 | * How much cpu bandwidth does root_task_group get? |
7775 | * | 7966 | * |
7776 | * In case of task-groups formed through the cgroup filesystem, it | 7967 | * In case of task-groups formed through the cgroup filesystem, it |
7777 | * gets 100% of the cpu resources in the system. This overall | 7968 | * gets 100% of the cpu resources in the system. This overall |
7778 | * system cpu resource is divided among the tasks of | 7969 | * system cpu resource is divided among the tasks of |
7779 | * init_task_group and its child task-groups in a fair manner, | 7970 | * root_task_group and its child task-groups in a fair manner, |
7780 | * based on each entity's (task or task-group's) weight | 7971 | * based on each entity's (task or task-group's) weight |
7781 | * (se->load.weight). | 7972 | * (se->load.weight). |
7782 | * | 7973 | * |
7783 | * In other words, if init_task_group has 10 tasks of weight | 7974 | * In other words, if root_task_group has 10 tasks of weight |
7784 | * 1024 and two child groups A0 and A1 (of weight 1024 each), | 7975 | * 1024 and two child groups A0 and A1 (of weight 1024 each), |
7785 | * then A0's share of the cpu resource is: | 7976 | * then A0's share of the cpu resource is: |
7786 | * | 7977 | * |
7787 | * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% | 7978 | * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% |
7788 | * | 7979 | * |
7789 | * We achieve this by letting init_task_group's tasks sit | 7980 | * We achieve this by letting root_task_group's tasks sit |
7790 | * directly in rq->cfs (i.e init_task_group->se[] = NULL). | 7981 | * directly in rq->cfs (i.e root_task_group->se[] = NULL). |
7791 | */ | 7982 | */ |
7792 | init_tg_cfs_entry(&init_task_group, &rq->cfs, NULL, i, 1, NULL); | 7983 | init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); |
7793 | #endif | ||
7794 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 7984 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
7795 | 7985 | ||
7796 | rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; | 7986 | rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; |
7797 | #ifdef CONFIG_RT_GROUP_SCHED | 7987 | #ifdef CONFIG_RT_GROUP_SCHED |
7798 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); | 7988 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); |
7799 | #ifdef CONFIG_CGROUP_SCHED | 7989 | init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); |
7800 | init_tg_rt_entry(&init_task_group, &rq->rt, NULL, i, 1, NULL); | ||
7801 | #endif | ||
7802 | #endif | 7990 | #endif |
7803 | 7991 | ||
7804 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | 7992 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
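The 8.33% in the comment above is just the weight fraction of one entity against the sum of all sibling weights at the root level; a tiny check of the arithmetic, with 1024 standing in for the nice-0 weight:

#include <stdio.h>

int main(void)
{
	double w = 1024.0;			/* nice-0 weight of each entity */
	double total = 10 * w + w + w;		/* 10 tasks + groups A0, A1 */

	printf("A0 bandwidth = %.2f%%\n", 100.0 * w / total);	/* 8.33% */
	return 0;
}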
@@ -7878,8 +8066,6 @@ void __init sched_init(void) | |||
7878 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); | 8066 | zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); |
7879 | #endif /* SMP */ | 8067 | #endif /* SMP */ |
7880 | 8068 | ||
7881 | perf_event_init(); | ||
7882 | |||
7883 | scheduler_running = 1; | 8069 | scheduler_running = 1; |
7884 | } | 8070 | } |
7885 | 8071 | ||
@@ -8073,26 +8259,32 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8073 | if (!se) | 8259 | if (!se) |
8074 | goto err_free_rq; | 8260 | goto err_free_rq; |
8075 | 8261 | ||
8076 | init_tg_cfs_entry(tg, cfs_rq, se, i, 0, parent->se[i]); | 8262 | init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); |
8077 | } | 8263 | } |
8078 | 8264 | ||
8079 | return 1; | 8265 | return 1; |
8080 | 8266 | ||
8081 | err_free_rq: | 8267 | err_free_rq: |
8082 | kfree(cfs_rq); | 8268 | kfree(cfs_rq); |
8083 | err: | 8269 | err: |
8084 | return 0; | 8270 | return 0; |
8085 | } | 8271 | } |
8086 | 8272 | ||
8087 | static inline void register_fair_sched_group(struct task_group *tg, int cpu) | ||
8088 | { | ||
8089 | list_add_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list, | ||
8090 | &cpu_rq(cpu)->leaf_cfs_rq_list); | ||
8091 | } | ||
8092 | |||
8093 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | 8273 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) |
8094 | { | 8274 | { |
8095 | list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); | 8275 | struct rq *rq = cpu_rq(cpu); |
8276 | unsigned long flags; | ||
8277 | |||
8278 | /* | ||
8279 | * Only empty task groups can be destroyed; so we can speculatively | ||
8280 | * check on_list without danger of it being re-added. | ||
8281 | */ | ||
8282 | if (!tg->cfs_rq[cpu]->on_list) | ||
8283 | return; | ||
8284 | |||
8285 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
8286 | list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); | ||
8287 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
8096 | } | 8288 | } |
8097 | #else /* !CONFIG_FAIR_GROUP_SCHED */ | 8289 | #else /* !CONFIG_FAIR_GROUP_SCHED */ |
8098 | static inline void free_fair_sched_group(struct task_group *tg) | 8290 | static inline void free_fair_sched_group(struct task_group *tg) |
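unregister_fair_sched_group() relies on a destroy-only invariant: once a group is being torn down its cfs_rq can never be re-added, so on_list may be tested without the lock and rq->lock is only taken when there is something to unlink. A userspace analogue of that check-then-lock pattern, using plain pthreads and a hypothetical node list (nothing kernel-specific):

#include <pthread.h>
#include <stdio.h>

struct node {
	struct node *next;
	int on_list;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *head;

static void add_node(struct node *n)
{
	pthread_mutex_lock(&list_lock);
	n->next = head;
	head = n;
	n->on_list = 1;
	pthread_mutex_unlock(&list_lock);
}

static void remove_node(struct node *n)
{
	/* speculative check: once off the list, a dying node is never re-added */
	if (!n->on_list)
		return;

	pthread_mutex_lock(&list_lock);
	for (struct node **p = &head; *p; p = &(*p)->next) {
		if (*p == n) {
			*p = n->next;
			n->on_list = 0;
			break;
		}
	}
	pthread_mutex_unlock(&list_lock);
}

int main(void)
{
	struct node a = { 0 }, b = { 0 };

	add_node(&a);
	add_node(&b);
	remove_node(&a);
	remove_node(&a);	/* second call bails on the unlocked check */
	printf("a on_list=%d, b on_list=%d\n", a.on_list, b.on_list);
	return 0;
}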
@@ -8105,10 +8297,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) | |||
8105 | return 1; | 8297 | return 1; |
8106 | } | 8298 | } |
8107 | 8299 | ||
8108 | static inline void register_fair_sched_group(struct task_group *tg, int cpu) | ||
8109 | { | ||
8110 | } | ||
8111 | |||
8112 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) | 8300 | static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) |
8113 | { | 8301 | { |
8114 | } | 8302 | } |
@@ -8163,27 +8351,16 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
8163 | if (!rt_se) | 8351 | if (!rt_se) |
8164 | goto err_free_rq; | 8352 | goto err_free_rq; |
8165 | 8353 | ||
8166 | init_tg_rt_entry(tg, rt_rq, rt_se, i, 0, parent->rt_se[i]); | 8354 | init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); |
8167 | } | 8355 | } |
8168 | 8356 | ||
8169 | return 1; | 8357 | return 1; |
8170 | 8358 | ||
8171 | err_free_rq: | 8359 | err_free_rq: |
8172 | kfree(rt_rq); | 8360 | kfree(rt_rq); |
8173 | err: | 8361 | err: |
8174 | return 0; | 8362 | return 0; |
8175 | } | 8363 | } |
8176 | |||
8177 | static inline void register_rt_sched_group(struct task_group *tg, int cpu) | ||
8178 | { | ||
8179 | list_add_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list, | ||
8180 | &cpu_rq(cpu)->leaf_rt_rq_list); | ||
8181 | } | ||
8182 | |||
8183 | static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) | ||
8184 | { | ||
8185 | list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); | ||
8186 | } | ||
8187 | #else /* !CONFIG_RT_GROUP_SCHED */ | 8364 | #else /* !CONFIG_RT_GROUP_SCHED */ |
8188 | static inline void free_rt_sched_group(struct task_group *tg) | 8365 | static inline void free_rt_sched_group(struct task_group *tg) |
8189 | { | 8366 | { |
@@ -8194,14 +8371,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) | |||
8194 | { | 8371 | { |
8195 | return 1; | 8372 | return 1; |
8196 | } | 8373 | } |
8197 | |||
8198 | static inline void register_rt_sched_group(struct task_group *tg, int cpu) | ||
8199 | { | ||
8200 | } | ||
8201 | |||
8202 | static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) | ||
8203 | { | ||
8204 | } | ||
8205 | #endif /* CONFIG_RT_GROUP_SCHED */ | 8374 | #endif /* CONFIG_RT_GROUP_SCHED */ |
8206 | 8375 | ||
8207 | #ifdef CONFIG_CGROUP_SCHED | 8376 | #ifdef CONFIG_CGROUP_SCHED |
@@ -8209,6 +8378,7 @@ static void free_sched_group(struct task_group *tg) | |||
8209 | { | 8378 | { |
8210 | free_fair_sched_group(tg); | 8379 | free_fair_sched_group(tg); |
8211 | free_rt_sched_group(tg); | 8380 | free_rt_sched_group(tg); |
8381 | autogroup_free(tg); | ||
8212 | kfree(tg); | 8382 | kfree(tg); |
8213 | } | 8383 | } |
8214 | 8384 | ||
@@ -8217,7 +8387,6 @@ struct task_group *sched_create_group(struct task_group *parent) | |||
8217 | { | 8387 | { |
8218 | struct task_group *tg; | 8388 | struct task_group *tg; |
8219 | unsigned long flags; | 8389 | unsigned long flags; |
8220 | int i; | ||
8221 | 8390 | ||
8222 | tg = kzalloc(sizeof(*tg), GFP_KERNEL); | 8391 | tg = kzalloc(sizeof(*tg), GFP_KERNEL); |
8223 | if (!tg) | 8392 | if (!tg) |
@@ -8230,10 +8399,6 @@ struct task_group *sched_create_group(struct task_group *parent) | |||
8230 | goto err; | 8399 | goto err; |
8231 | 8400 | ||
8232 | spin_lock_irqsave(&task_group_lock, flags); | 8401 | spin_lock_irqsave(&task_group_lock, flags); |
8233 | for_each_possible_cpu(i) { | ||
8234 | register_fair_sched_group(tg, i); | ||
8235 | register_rt_sched_group(tg, i); | ||
8236 | } | ||
8237 | list_add_rcu(&tg->list, &task_groups); | 8402 | list_add_rcu(&tg->list, &task_groups); |
8238 | 8403 | ||
8239 | WARN_ON(!parent); /* root should already exist */ | 8404 | WARN_ON(!parent); /* root should already exist */ |
@@ -8263,11 +8428,11 @@ void sched_destroy_group(struct task_group *tg) | |||
8263 | unsigned long flags; | 8428 | unsigned long flags; |
8264 | int i; | 8429 | int i; |
8265 | 8430 | ||
8266 | spin_lock_irqsave(&task_group_lock, flags); | 8431 | /* end participation in shares distribution */ |
8267 | for_each_possible_cpu(i) { | 8432 | for_each_possible_cpu(i) |
8268 | unregister_fair_sched_group(tg, i); | 8433 | unregister_fair_sched_group(tg, i); |
8269 | unregister_rt_sched_group(tg, i); | 8434 | |
8270 | } | 8435 | spin_lock_irqsave(&task_group_lock, flags); |
8271 | list_del_rcu(&tg->list); | 8436 | list_del_rcu(&tg->list); |
8272 | list_del_rcu(&tg->siblings); | 8437 | list_del_rcu(&tg->siblings); |
8273 | spin_unlock_irqrestore(&task_group_lock, flags); | 8438 | spin_unlock_irqrestore(&task_group_lock, flags); |
@@ -8297,12 +8462,12 @@ void sched_move_task(struct task_struct *tsk) | |||
8297 | if (unlikely(running)) | 8462 | if (unlikely(running)) |
8298 | tsk->sched_class->put_prev_task(rq, tsk); | 8463 | tsk->sched_class->put_prev_task(rq, tsk); |
8299 | 8464 | ||
8300 | set_task_rq(tsk, task_cpu(tsk)); | ||
8301 | |||
8302 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8465 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8303 | if (tsk->sched_class->moved_group) | 8466 | if (tsk->sched_class->task_move_group) |
8304 | tsk->sched_class->moved_group(tsk, on_rq); | 8467 | tsk->sched_class->task_move_group(tsk, on_rq); |
8468 | else | ||
8305 | #endif | 8469 | #endif |
8470 | set_task_rq(tsk, task_cpu(tsk)); | ||
8306 | 8471 | ||
8307 | if (unlikely(running)) | 8472 | if (unlikely(running)) |
8308 | tsk->sched_class->set_curr_task(rq); | 8473 | tsk->sched_class->set_curr_task(rq); |
@@ -8314,33 +8479,6 @@ void sched_move_task(struct task_struct *tsk) | |||
8314 | #endif /* CONFIG_CGROUP_SCHED */ | 8479 | #endif /* CONFIG_CGROUP_SCHED */ |
8315 | 8480 | ||
8316 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8481 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8317 | static void __set_se_shares(struct sched_entity *se, unsigned long shares) | ||
8318 | { | ||
8319 | struct cfs_rq *cfs_rq = se->cfs_rq; | ||
8320 | int on_rq; | ||
8321 | |||
8322 | on_rq = se->on_rq; | ||
8323 | if (on_rq) | ||
8324 | dequeue_entity(cfs_rq, se, 0); | ||
8325 | |||
8326 | se->load.weight = shares; | ||
8327 | se->load.inv_weight = 0; | ||
8328 | |||
8329 | if (on_rq) | ||
8330 | enqueue_entity(cfs_rq, se, 0); | ||
8331 | } | ||
8332 | |||
8333 | static void set_se_shares(struct sched_entity *se, unsigned long shares) | ||
8334 | { | ||
8335 | struct cfs_rq *cfs_rq = se->cfs_rq; | ||
8336 | struct rq *rq = cfs_rq->rq; | ||
8337 | unsigned long flags; | ||
8338 | |||
8339 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
8340 | __set_se_shares(se, shares); | ||
8341 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
8342 | } | ||
8343 | |||
8344 | static DEFINE_MUTEX(shares_mutex); | 8482 | static DEFINE_MUTEX(shares_mutex); |
8345 | 8483 | ||
8346 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) | 8484 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) |
@@ -8363,37 +8501,19 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
8363 | if (tg->shares == shares) | 8501 | if (tg->shares == shares) |
8364 | goto done; | 8502 | goto done; |
8365 | 8503 | ||
8366 | spin_lock_irqsave(&task_group_lock, flags); | ||
8367 | for_each_possible_cpu(i) | ||
8368 | unregister_fair_sched_group(tg, i); | ||
8369 | list_del_rcu(&tg->siblings); | ||
8370 | spin_unlock_irqrestore(&task_group_lock, flags); | ||
8371 | |||
8372 | /* wait for any ongoing reference to this group to finish */ | ||
8373 | synchronize_sched(); | ||
8374 | |||
8375 | /* | ||
8376 | * Now we are free to modify the group's share on each cpu | ||
8377 | * w/o tripping rebalance_share or load_balance_fair. | ||
8378 | */ | ||
8379 | tg->shares = shares; | 8504 | tg->shares = shares; |
8380 | for_each_possible_cpu(i) { | 8505 | for_each_possible_cpu(i) { |
8381 | /* | 8506 | struct rq *rq = cpu_rq(i); |
8382 | * force a rebalance | 8507 | struct sched_entity *se; |
8383 | */ | 8508 | |
8384 | cfs_rq_set_shares(tg->cfs_rq[i], 0); | 8509 | se = tg->se[i]; |
8385 | set_se_shares(tg->se[i], shares); | 8510 | /* Propagate contribution to hierarchy */ |
8511 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
8512 | for_each_sched_entity(se) | ||
8513 | update_cfs_shares(group_cfs_rq(se), 0); | ||
8514 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
8386 | } | 8515 | } |
8387 | 8516 | ||
8388 | /* | ||
8389 | * Enable load balance activity on this group, by inserting it back on | ||
8390 | * each cpu's rq->leaf_cfs_rq_list. | ||
8391 | */ | ||
8392 | spin_lock_irqsave(&task_group_lock, flags); | ||
8393 | for_each_possible_cpu(i) | ||
8394 | register_fair_sched_group(tg, i); | ||
8395 | list_add_rcu(&tg->siblings, &tg->parent->children); | ||
8396 | spin_unlock_irqrestore(&task_group_lock, flags); | ||
8397 | done: | 8517 | done: |
8398 | mutex_unlock(&shares_mutex); | 8518 | mutex_unlock(&shares_mutex); |
8399 | return 0; | 8519 | return 0; |
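sched_group_set_shares() no longer unregisters the group and waits for a grace period; it walks each cpu's entity hierarchy under rq->lock and lets update_cfs_shares() recompute the per-cpu entity weights from the new tg->shares. A toy model of that recomputation, assuming a hypothetical two-cpu group whose per-cpu weight is tg->shares scaled by that cpu's share of the group load; the real scaling lives in sched_fair.c and is more involved:

#include <stdio.h>

struct toy_tg {
	unsigned long shares;
	unsigned long cpu_load[2];	/* per-cpu runnable load of the group */
	unsigned long se_weight[2];	/* per-cpu entity weight (the result) */
};

static void set_shares(struct toy_tg *tg, unsigned long shares)
{
	unsigned long total = tg->cpu_load[0] + tg->cpu_load[1];

	tg->shares = shares;
	for (int cpu = 0; cpu < 2; cpu++) {
		/* in the kernel this runs under rq->lock, one cpu at a time */
		tg->se_weight[cpu] = total ?
			shares * tg->cpu_load[cpu] / total : shares;
	}
}

int main(void)
{
	struct toy_tg tg = { .shares = 1024, .cpu_load = { 3072, 1024 } };

	set_shares(&tg, 2048);
	printf("cpu0 weight %lu, cpu1 weight %lu\n",
	       tg.se_weight[0], tg.se_weight[1]);	/* 1536, 512 */
	return 0;
}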
@@ -8528,7 +8648,7 @@ static int tg_set_bandwidth(struct task_group *tg, | |||
8528 | raw_spin_unlock(&rt_rq->rt_runtime_lock); | 8648 | raw_spin_unlock(&rt_rq->rt_runtime_lock); |
8529 | } | 8649 | } |
8530 | raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); | 8650 | raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock); |
8531 | unlock: | 8651 | unlock: |
8532 | read_unlock(&tasklist_lock); | 8652 | read_unlock(&tasklist_lock); |
8533 | mutex_unlock(&rt_constraints_mutex); | 8653 | mutex_unlock(&rt_constraints_mutex); |
8534 | 8654 | ||
@@ -8692,7 +8812,7 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
8692 | 8812 | ||
8693 | if (!cgrp->parent) { | 8813 | if (!cgrp->parent) { |
8694 | /* This is early initialization for the top cgroup */ | 8814 | /* This is early initialization for the top cgroup */ |
8695 | return &init_task_group.css; | 8815 | return &root_task_group.css; |
8696 | } | 8816 | } |
8697 | 8817 | ||
8698 | parent = cgroup_tg(cgrp->parent); | 8818 | parent = cgroup_tg(cgrp->parent); |
@@ -8763,6 +8883,20 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, | |||
8763 | } | 8883 | } |
8764 | } | 8884 | } |
8765 | 8885 | ||
8886 | static void | ||
8887 | cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task) | ||
8888 | { | ||
8889 | /* | ||
8890 | * cgroup_exit() is called in the copy_process() failure path. | ||
8891 | * Ignore this case since the task hasn't run yet; this avoids | ||
8892 | * trying to poke a half-freed task state from generic code. | ||
8893 | */ | ||
8894 | if (!(task->flags & PF_EXITING)) | ||
8895 | return; | ||
8896 | |||
8897 | sched_move_task(task); | ||
8898 | } | ||
8899 | |||
8766 | #ifdef CONFIG_FAIR_GROUP_SCHED | 8900 | #ifdef CONFIG_FAIR_GROUP_SCHED |
8767 | static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, | 8901 | static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, |
8768 | u64 shareval) | 8902 | u64 shareval) |
@@ -8835,6 +8969,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { | |||
8835 | .destroy = cpu_cgroup_destroy, | 8969 | .destroy = cpu_cgroup_destroy, |
8836 | .can_attach = cpu_cgroup_can_attach, | 8970 | .can_attach = cpu_cgroup_can_attach, |
8837 | .attach = cpu_cgroup_attach, | 8971 | .attach = cpu_cgroup_attach, |
8972 | .exit = cpu_cgroup_exit, | ||
8838 | .populate = cpu_cgroup_populate, | 8973 | .populate = cpu_cgroup_populate, |
8839 | .subsys_id = cpu_cgroup_subsys_id, | 8974 | .subsys_id = cpu_cgroup_subsys_id, |
8840 | .early_init = 1, | 8975 | .early_init = 1, |
@@ -9119,72 +9254,3 @@ struct cgroup_subsys cpuacct_subsys = { | |||
9119 | }; | 9254 | }; |
9120 | #endif /* CONFIG_CGROUP_CPUACCT */ | 9255 | #endif /* CONFIG_CGROUP_CPUACCT */ |
9121 | 9256 | ||
9122 | #ifndef CONFIG_SMP | ||
9123 | |||
9124 | void synchronize_sched_expedited(void) | ||
9125 | { | ||
9126 | barrier(); | ||
9127 | } | ||
9128 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
9129 | |||
9130 | #else /* #ifndef CONFIG_SMP */ | ||
9131 | |||
9132 | static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0); | ||
9133 | |||
9134 | static int synchronize_sched_expedited_cpu_stop(void *data) | ||
9135 | { | ||
9136 | /* | ||
9137 | * There must be a full memory barrier on each affected CPU | ||
9138 | * between the time that try_stop_cpus() is called and the | ||
9139 | * time that it returns. | ||
9140 | * | ||
9141 | * In the current initial implementation of cpu_stop, the | ||
9142 | * above condition is already met when the control reaches | ||
9143 | * this point and the following smp_mb() is not strictly | ||
9144 | * necessary. Do smp_mb() anyway for documentation and | ||
9145 | * robustness against future implementation changes. | ||
9146 | */ | ||
9147 | smp_mb(); /* See above comment block. */ | ||
9148 | return 0; | ||
9149 | } | ||
9150 | |||
9151 | /* | ||
9152 | * Wait for an rcu-sched grace period to elapse, but use "big hammer" | ||
9153 | * approach to force grace period to end quickly. This consumes | ||
9154 | * significant time on all CPUs, and is thus not recommended for | ||
9155 | * any sort of common-case code. | ||
9156 | * | ||
9157 | * Note that it is illegal to call this function while holding any | ||
9158 | * lock that is acquired by a CPU-hotplug notifier. Failing to | ||
9159 | * observe this restriction will result in deadlock. | ||
9160 | */ | ||
9161 | void synchronize_sched_expedited(void) | ||
9162 | { | ||
9163 | int snap, trycount = 0; | ||
9164 | |||
9165 | smp_mb(); /* ensure prior mod happens before capturing snap. */ | ||
9166 | snap = atomic_read(&synchronize_sched_expedited_count) + 1; | ||
9167 | get_online_cpus(); | ||
9168 | while (try_stop_cpus(cpu_online_mask, | ||
9169 | synchronize_sched_expedited_cpu_stop, | ||
9170 | NULL) == -EAGAIN) { | ||
9171 | put_online_cpus(); | ||
9172 | if (trycount++ < 10) | ||
9173 | udelay(trycount * num_online_cpus()); | ||
9174 | else { | ||
9175 | synchronize_sched(); | ||
9176 | return; | ||
9177 | } | ||
9178 | if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) { | ||
9179 | smp_mb(); /* ensure test happens before caller kfree */ | ||
9180 | return; | ||
9181 | } | ||
9182 | get_online_cpus(); | ||
9183 | } | ||
9184 | atomic_inc(&synchronize_sched_expedited_count); | ||
9185 | smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */ | ||
9186 | put_online_cpus(); | ||
9187 | } | ||
9188 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
9189 | |||
9190 | #endif /* #else #ifndef CONFIG_SMP */ | ||