author     Peter Zijlstra <a.p.zijlstra@chello.nl>   2008-06-27 07:41:14 -0400
committer  Ingo Molnar <mingo@elte.hu>               2008-06-27 08:31:29 -0400
commit     c09595f63bb1909c5dc4dca288f4fe818561b5f3 (patch)
tree       42631e6986f3ea4543b125ca62a99df8548e0eb9 /kernel
parent     ced8aa16e1db55c33c507174c1b1f9e107445865 (diff)
sched: revert revert of: fair-group: SMP-nice for group scheduling
Try again..
Initial commit: 18d95a2832c1392a2d63227a7a6d433cb9f2037e
Revert: 6363ca57c76b7b83639ca8c83fc285fa26a7880e
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/sched.c        430
-rw-r--r--  kernel/sched_debug.c    5
-rw-r--r--  kernel/sched_fair.c   124
-rw-r--r--  kernel/sched_rt.c       4
4 files changed, 488 insertions, 75 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index f653af684fb3..874b6da15430 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -403,6 +403,43 @@ struct cfs_rq {
 	 */
 	struct list_head leaf_cfs_rq_list;
 	struct task_group *tg;	/* group that "owns" this runqueue */
+
+#ifdef CONFIG_SMP
+	unsigned long task_weight;
+	unsigned long shares;
+	/*
+	 * We need space to build a sched_domain wide view of the full task
+	 * group tree, in order to avoid depending on dynamic memory allocation
+	 * during the load balancing we place this in the per cpu task group
+	 * hierarchy. This limits the load balancing to one instance per cpu,
+	 * but more should not be needed anyway.
+	 */
+	struct aggregate_struct {
+		/*
+		 *   load = weight(cpus) * f(tg)
+		 *
+		 * Where f(tg) is the recursive weight fraction assigned to
+		 * this group.
+		 */
+		unsigned long load;
+
+		/*
+		 * part of the group weight distributed to this span.
+		 */
+		unsigned long shares;
+
+		/*
+		 * The sum of all runqueue weights within this span.
+		 */
+		unsigned long rq_weight;
+
+		/*
+		 * Weight contributed by tasks; this is the part we can
+		 * influence by moving tasks around.
+		 */
+		unsigned long task_weight;
+	} aggregate;
+#endif
 #endif
 };
 
@@ -1484,6 +1521,326 @@ static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
 static unsigned long cpu_avg_load_per_task(int cpu);
 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/*
+ * Group load balancing.
+ *
+ * We calculate a few balance domain wide aggregate numbers; load and weight.
+ * Given the pictures below, and assuming each item has equal weight:
+ *
+ *         root          1 - thread
+ *        / | \          A - group
+ *       A  1  B
+ *      /|\   / \
+ *     C 2 D 3   4
+ *     |   |
+ *     5   6
+ *
+ * load:
+ *    A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
+ *    which equals 1/9-th of the total load.
+ *
+ * shares:
+ *    The weight of this group on the selected cpus.
+ *
+ * rq_weight:
+ *    Direct sum of all the cpu's their rq weight, e.g. A would get 3 while
+ *    B would get 2.
+ *
+ * task_weight:
+ *    Part of the rq_weight contributed by tasks; all groups except B would
+ *    get 1, B gets 2.
+ */
+
+static inline struct aggregate_struct *
+aggregate(struct task_group *tg, struct sched_domain *sd)
+{
+	return &tg->cfs_rq[sd->first_cpu]->aggregate;
+}
+
+typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
+
+/*
+ * Iterate the full tree, calling @down when first entering a node and @up when
+ * leaving it for the final time.
+ */
+static
+void aggregate_walk_tree(aggregate_func down, aggregate_func up,
+			 struct sched_domain *sd)
+{
+	struct task_group *parent, *child;
+
+	rcu_read_lock();
+	parent = &root_task_group;
+down:
+	(*down)(parent, sd);
+	list_for_each_entry_rcu(child, &parent->children, siblings) {
+		parent = child;
+		goto down;
+
+up:
+		continue;
+	}
+	(*up)(parent, sd);
+
+	child = parent;
+	parent = parent->parent;
+	if (parent)
+		goto up;
+	rcu_read_unlock();
+}
+
+/*
+ * Calculate the aggregate runqueue weight.
+ */
+static
+void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
+{
+	unsigned long rq_weight = 0;
+	unsigned long task_weight = 0;
+	int i;
+
+	for_each_cpu_mask(i, sd->span) {
+		rq_weight += tg->cfs_rq[i]->load.weight;
+		task_weight += tg->cfs_rq[i]->task_weight;
+	}
+
+	aggregate(tg, sd)->rq_weight = rq_weight;
+	aggregate(tg, sd)->task_weight = task_weight;
+}
+
+/*
+ * Compute the weight of this group on the given cpus.
+ */
+static
+void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
+{
+	unsigned long shares = 0;
+	int i;
+
+	for_each_cpu_mask(i, sd->span)
+		shares += tg->cfs_rq[i]->shares;
+
+	if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares)
+		shares = tg->shares;
+
+	aggregate(tg, sd)->shares = shares;
+}
+
+/*
+ * Compute the load fraction assigned to this group, relies on the aggregate
+ * weight and this group's parent's load, i.e. top-down.
+ */
+static
+void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
+{
+	unsigned long load;
+
+	if (!tg->parent) {
+		int i;
+
+		load = 0;
+		for_each_cpu_mask(i, sd->span)
+			load += cpu_rq(i)->load.weight;
+
+	} else {
+		load = aggregate(tg->parent, sd)->load;
+
+		/*
+		 * shares is our weight in the parent's rq so
+		 * shares/parent->rq_weight gives our fraction of the load
+		 */
+		load *= aggregate(tg, sd)->shares;
+		load /= aggregate(tg->parent, sd)->rq_weight + 1;
+	}
+
+	aggregate(tg, sd)->load = load;
+}
+
+static void __set_se_shares(struct sched_entity *se, unsigned long shares);
+
+/*
+ * Calculate and set the cpu's group shares.
+ */
+static void
+__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
+			  int tcpu)
+{
+	int boost = 0;
+	unsigned long shares;
+	unsigned long rq_weight;
+
+	if (!tg->se[tcpu])
+		return;
+
+	rq_weight = tg->cfs_rq[tcpu]->load.weight;
+
+	/*
+	 * If there are currently no tasks on the cpu pretend there is one of
+	 * average load so that when a new task gets to run here it will not
+	 * get delayed by group starvation.
+	 */
+	if (!rq_weight) {
+		boost = 1;
+		rq_weight = NICE_0_LOAD;
+	}
+
+	/*
+	 *           \Sum shares * rq_weight
+	 * shares =  -----------------------
+	 *               \Sum rq_weight
+	 *
+	 */
+	shares = aggregate(tg, sd)->shares * rq_weight;
+	shares /= aggregate(tg, sd)->rq_weight + 1;
+
+	/*
+	 * record the actual number of shares, not the boosted amount.
+	 */
+	tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
+
+	if (shares < MIN_SHARES)
+		shares = MIN_SHARES;
+	else if (shares > MAX_SHARES)
+		shares = MAX_SHARES;
+
+	__set_se_shares(tg->se[tcpu], shares);
+}
+
+/*
+ * Re-adjust the weights on the cpu the task came from and on the cpu the
+ * task went to.
+ */
+static void
+__move_group_shares(struct task_group *tg, struct sched_domain *sd,
+		    int scpu, int dcpu)
+{
+	unsigned long shares;
+
+	shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
+
+	__update_group_shares_cpu(tg, sd, scpu);
+	__update_group_shares_cpu(tg, sd, dcpu);
+
+	/*
+	 * ensure we never loose shares due to rounding errors in the
+	 * above redistribution.
+	 */
+	shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
+	if (shares)
+		tg->cfs_rq[dcpu]->shares += shares;
+}
+
+/*
+ * Because changing a group's shares changes the weight of the super-group
+ * we need to walk up the tree and change all shares until we hit the root.
+ */
+static void
+move_group_shares(struct task_group *tg, struct sched_domain *sd,
+		  int scpu, int dcpu)
+{
+	while (tg) {
+		__move_group_shares(tg, sd, scpu, dcpu);
+		tg = tg->parent;
+	}
+}
+
+static
+void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
+{
+	unsigned long shares = aggregate(tg, sd)->shares;
+	int i;
+
+	for_each_cpu_mask(i, sd->span) {
+		struct rq *rq = cpu_rq(i);
+		unsigned long flags;
+
+		spin_lock_irqsave(&rq->lock, flags);
+		__update_group_shares_cpu(tg, sd, i);
+		spin_unlock_irqrestore(&rq->lock, flags);
+	}
+
+	aggregate_group_shares(tg, sd);
+
+	/*
+	 * ensure we never loose shares due to rounding errors in the
+	 * above redistribution.
+	 */
+	shares -= aggregate(tg, sd)->shares;
+	if (shares) {
+		tg->cfs_rq[sd->first_cpu]->shares += shares;
+		aggregate(tg, sd)->shares += shares;
+	}
+}
+
+/*
+ * Calculate the accumulative weight and recursive load of each task group
+ * while walking down the tree.
+ */
+static
+void aggregate_get_down(struct task_group *tg, struct sched_domain *sd)
+{
+	aggregate_group_weight(tg, sd);
+	aggregate_group_shares(tg, sd);
+	aggregate_group_load(tg, sd);
+}
+
+/*
+ * Rebalance the cpu shares while walking back up the tree.
+ */
+static
+void aggregate_get_up(struct task_group *tg, struct sched_domain *sd)
+{
+	aggregate_group_set_shares(tg, sd);
+}
+
+static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
+
+static void __init init_aggregate(void)
+{
+	int i;
+
+	for_each_possible_cpu(i)
+		spin_lock_init(&per_cpu(aggregate_lock, i));
+}
+
+static int get_aggregate(struct sched_domain *sd)
+{
+	if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu)))
+		return 0;
+
+	aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd);
+	return 1;
+}
+
+static void put_aggregate(struct sched_domain *sd)
+{
+	spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu));
+}
+
+static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+{
+	cfs_rq->shares = shares;
+}
+
+#else
+
+static inline void init_aggregate(void)
+{
+}
+
+static inline int get_aggregate(struct sched_domain *sd)
+{
+	return 0;
+}
+
+static inline void put_aggregate(struct sched_domain *sd)
+{
+}
+#endif
+
 #endif
 
 #include "sched_stats.h"
@@ -1498,26 +1855,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 #define for_each_class(class) \
    for (class = sched_class_highest; class; class = class->next)
 
-static inline void inc_load(struct rq *rq, const struct task_struct *p)
-{
-	update_load_add(&rq->load, p->se.load.weight);
-}
-
-static inline void dec_load(struct rq *rq, const struct task_struct *p)
-{
-	update_load_sub(&rq->load, p->se.load.weight);
-}
-
-static void inc_nr_running(struct task_struct *p, struct rq *rq)
+static void inc_nr_running(struct rq *rq)
 {
 	rq->nr_running++;
-	inc_load(rq, p);
 }
 
-static void dec_nr_running(struct task_struct *p, struct rq *rq)
+static void dec_nr_running(struct rq *rq)
 {
 	rq->nr_running--;
-	dec_load(rq, p);
 }
 
 static void set_load_weight(struct task_struct *p)
@@ -1609,7 +1954,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
 		rq->nr_uninterruptible--;
 
 	enqueue_task(rq, p, wakeup);
-	inc_nr_running(p, rq);
+	inc_nr_running(rq);
 }
 
 /*
@@ -1621,7 +1966,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
 		rq->nr_uninterruptible++;
 
 	dequeue_task(rq, p, sleep);
-	dec_nr_running(p, rq);
+	dec_nr_running(rq);
 }
 
 /**
@@ -2274,7 +2619,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		 * management (if any):
 		 */
 		p->sched_class->task_new(rq, p);
-		inc_nr_running(p, rq);
+		inc_nr_running(rq);
 	}
 	check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
@@ -3265,9 +3610,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	unsigned long imbalance;
 	struct rq *busiest;
 	unsigned long flags;
+	int unlock_aggregate;
 
 	cpus_setall(*cpus);
 
+	unlock_aggregate = get_aggregate(sd);
+
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
 	 * sibling can pick up load irrespective of busy siblings. In this case,
@@ -3383,8 +3731,9 @@ redo:
 
 	if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
 	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		return -1;
-	return ld_moved;
+		ld_moved = -1;
+
+	goto out;
 
 out_balanced:
 	schedstat_inc(sd, lb_balanced[idle]);
@@ -3399,8 +3748,13 @@ out_one_pinned:
 
 	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
 	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		return -1;
-	return 0;
+		ld_moved = -1;
+	else
+		ld_moved = 0;
+out:
+	if (unlock_aggregate)
+		put_aggregate(sd);
+	return ld_moved;
 }
 
 /*
@@ -4588,10 +4942,8 @@ void set_user_nice(struct task_struct *p, long nice)
 		goto out_unlock;
 	}
 	on_rq = p->se.on_rq;
-	if (on_rq) {
+	if (on_rq)
 		dequeue_task(rq, p, 0);
-		dec_load(rq, p);
-	}
 
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
@@ -4601,7 +4953,6 @@ void set_user_nice(struct task_struct *p, long nice)
 
 	if (on_rq) {
 		enqueue_task(rq, p, 0);
-		inc_load(rq, p);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -7016,6 +7367,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 			SD_INIT(sd, ALLNODES);
 			set_domain_attribute(sd, attr);
 			sd->span = *cpu_map;
+			sd->first_cpu = first_cpu(sd->span);
 			cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
 			p = sd;
 			sd_allnodes = 1;
@@ -7026,6 +7378,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, NODE);
 		set_domain_attribute(sd, attr);
 		sched_domain_node_span(cpu_to_node(i), &sd->span);
+		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -7037,6 +7390,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, CPU);
 		set_domain_attribute(sd, attr);
 		sd->span = *nodemask;
+		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -7048,6 +7402,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, MC);
 		set_domain_attribute(sd, attr);
 		sd->span = cpu_coregroup_map(i);
+		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
@@ -7060,6 +7415,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, SIBLING);
 		set_domain_attribute(sd, attr);
 		sd->span = per_cpu(cpu_sibling_map, i);
+		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
@@ -7757,6 +8113,7 @@ void __init sched_init(void)
 	}
 
 #ifdef CONFIG_SMP
+	init_aggregate();
 	init_defrootdomain();
 #endif
 
@@ -8322,14 +8679,11 @@ void sched_move_task(struct task_struct *tsk)
 #endif /* CONFIG_GROUP_SCHED */
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void set_se_shares(struct sched_entity *se, unsigned long shares)
+static void __set_se_shares(struct sched_entity *se, unsigned long shares)
 {
 	struct cfs_rq *cfs_rq = se->cfs_rq;
-	struct rq *rq = cfs_rq->rq;
 	int on_rq;
 
-	spin_lock_irq(&rq->lock);
-
 	on_rq = se->on_rq;
 	if (on_rq)
 		dequeue_entity(cfs_rq, se, 0);
@@ -8339,8 +8693,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
 
 	if (on_rq)
 		enqueue_entity(cfs_rq, se, 0);
+}
 
-	spin_unlock_irq(&rq->lock);
+static void set_se_shares(struct sched_entity *se, unsigned long shares)
+{
+	struct cfs_rq *cfs_rq = se->cfs_rq;
+	struct rq *rq = cfs_rq->rq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rq->lock, flags);
+	__set_se_shares(se, shares);
+	spin_unlock_irqrestore(&rq->lock, flags);
 }
 
 static DEFINE_MUTEX(shares_mutex);
@@ -8379,8 +8742,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	 * w/o tripping rebalance_share or load_balance_fair.
 	 */
 	tg->shares = shares;
-	for_each_possible_cpu(i)
+	for_each_possible_cpu(i) {
+		/*
+		 * force a rebalance
+		 */
+		cfs_rq_set_shares(tg->cfs_rq[i], 0);
 		set_se_shares(tg->se[i], shares);
+	}
 
 	/*
 	 * Enable load balance activity on this group, by inserting it back on
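
The per-cpu redistribution done by __update_group_shares_cpu() above follows the formula quoted in the patch comment: each cpu receives a slice of the group's aggregate shares proportional to its runqueue weight, with an idle cpu boosted to one average (NICE_0_LOAD) task so a newly woken task is not starved. A standalone userspace sketch of that arithmetic, with invented weights and no kernel internals, could look like this:

#include <stdio.h>

#define NICE_0_LOAD 1024UL

/* Split a group's aggregate shares across cpus in proportion to each
 * cpu's runqueue weight, mirroring the formula in the patch:
 *   shares = \Sum shares * rq_weight / \Sum rq_weight   (+1 avoids /0) */
static unsigned long cpu_shares(unsigned long group_shares,
				unsigned long rq_weight,
				unsigned long total_rq_weight)
{
	if (!rq_weight)			/* idle cpu: pretend one average task */
		rq_weight = NICE_0_LOAD;

	return group_shares * rq_weight / (total_rq_weight + 1);
}

int main(void)
{
	/* example: a 2048-share group, two busy cpus and one idle cpu */
	unsigned long weights[] = { 3 * NICE_0_LOAD, 1 * NICE_0_LOAD, 0 };
	unsigned long total = weights[0] + weights[1];
	int i;

	for (i = 0; i < 3; i++)
		printf("cpu%d gets %lu shares\n", i,
		       cpu_shares(2048, weights[i], total));
	return 0;
}

With these numbers the busiest cpu ends up with roughly three times the shares of the other two, which is the intended SMP-nice behaviour of the group scheduler.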
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 8e077b9c91cb..04394ccac88d 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -167,6 +167,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 #endif
 	SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over",
 			cfs_rq->nr_spread_over);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_SMP
+	SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares);
+#endif
+#endif
 }
 
 void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 2e197b8e43f1..183388c4dead 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -567,10 +567,27 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+static void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+	cfs_rq->task_weight += weight;
+}
+#else
+static inline void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+}
+#endif
+
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_add(&cfs_rq->load, se->load.weight);
+	if (!parent_entity(se))
+		inc_cpu_load(rq_of(cfs_rq), se->load.weight);
+	if (entity_is_task(se))
+		add_cfs_task_weight(cfs_rq, se->load.weight);
 	cfs_rq->nr_running++;
 	se->on_rq = 1;
 	list_add(&se->group_node, &cfs_rq->tasks);
@@ -580,6 +597,10 @@ static void
 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_sub(&cfs_rq->load, se->load.weight);
+	if (!parent_entity(se))
+		dec_cpu_load(rq_of(cfs_rq), se->load.weight);
+	if (entity_is_task(se))
+		add_cfs_task_weight(cfs_rq, -se->load.weight);
 	cfs_rq->nr_running--;
 	se->on_rq = 0;
 	list_del_init(&se->group_node);
@@ -1372,75 +1393,90 @@ static struct task_struct *load_balance_next_fair(void *arg)
 	return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
+static unsigned long
+__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		unsigned long max_load_move, struct sched_domain *sd,
+		enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
+		struct cfs_rq *cfs_rq)
 {
-	struct sched_entity *curr;
-	struct task_struct *p;
-
-	if (!cfs_rq->nr_running || !first_fair(cfs_rq))
-		return MAX_PRIO;
-
-	curr = cfs_rq->curr;
-	if (!curr)
-		curr = __pick_next_entity(cfs_rq);
+	struct rq_iterator cfs_rq_iterator;
 
-	p = task_of(curr);
+	cfs_rq_iterator.start = load_balance_start_fair;
+	cfs_rq_iterator.next = load_balance_next_fair;
+	cfs_rq_iterator.arg = cfs_rq;
 
-	return p->prio;
+	return balance_tasks(this_rq, this_cpu, busiest,
+			max_load_move, sd, idle, all_pinned,
+			this_best_prio, &cfs_rq_iterator);
 }
-#endif
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
 		  struct sched_domain *sd, enum cpu_idle_type idle,
 		  int *all_pinned, int *this_best_prio)
 {
-	struct cfs_rq *busy_cfs_rq;
 	long rem_load_move = max_load_move;
-	struct rq_iterator cfs_rq_iterator;
-
-	cfs_rq_iterator.start = load_balance_start_fair;
-	cfs_rq_iterator.next = load_balance_next_fair;
+	int busiest_cpu = cpu_of(busiest);
+	struct task_group *tg;
 
-	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
-#ifdef CONFIG_FAIR_GROUP_SCHED
-		struct cfs_rq *this_cfs_rq;
+	rcu_read_lock();
+	list_for_each_entry(tg, &task_groups, list) {
 		long imbalance;
-		unsigned long maxload;
+		unsigned long this_weight, busiest_weight;
+		long rem_load, max_load, moved_load;
+
+		/*
+		 * empty group
+		 */
+		if (!aggregate(tg, sd)->task_weight)
+			continue;
+
+		rem_load = rem_load_move * aggregate(tg, sd)->rq_weight;
+		rem_load /= aggregate(tg, sd)->load + 1;
+
+		this_weight = tg->cfs_rq[this_cpu]->task_weight;
+		busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
 
-		this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
+		imbalance = (busiest_weight - this_weight) / 2;
 
-		imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
-		/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
-		if (imbalance <= 0)
+		if (imbalance < 0)
+			imbalance = busiest_weight;
+
+		max_load = max(rem_load, imbalance);
+		moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
+				max_load, sd, idle, all_pinned, this_best_prio,
+				tg->cfs_rq[busiest_cpu]);
+
+		if (!moved_load)
 			continue;
 
-		/* Don't pull more than imbalance/2 */
-		imbalance /= 2;
-		maxload = min(rem_load_move, imbalance);
+		move_group_shares(tg, sd, busiest_cpu, this_cpu);
 
-		*this_best_prio = cfs_rq_best_prio(this_cfs_rq);
-#else
-# define maxload rem_load_move
-#endif
-		/*
-		 * pass busy_cfs_rq argument into
-		 * load_balance_[start|next]_fair iterators
-		 */
-		cfs_rq_iterator.arg = busy_cfs_rq;
-		rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
-				maxload, sd, idle, all_pinned,
-				this_best_prio,
-				&cfs_rq_iterator);
+		moved_load *= aggregate(tg, sd)->load;
+		moved_load /= aggregate(tg, sd)->rq_weight + 1;
 
-		if (rem_load_move <= 0)
+		rem_load_move -= moved_load;
+		if (rem_load_move < 0)
 			break;
 	}
+	rcu_read_unlock();
 
 	return max_load_move - rem_load_move;
 }
+#else
+static unsigned long
+load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		  unsigned long max_load_move,
+		  struct sched_domain *sd, enum cpu_idle_type idle,
+		  int *all_pinned, int *this_best_prio)
+{
+	return __load_balance_fair(this_rq, this_cpu, busiest,
+			max_load_move, sd, idle, all_pinned,
+			this_best_prio, &busiest->cfs);
+}
+#endif
 
 static int
 move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
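
The reworked load_balance_fair() above converts the requested load delta into each group's runqueue-weight units before running the iterator (rem_load = rem_load_move * rq_weight / (load + 1)) and scales the moved weight back into domain-wide load afterwards (moved_load * load / (rq_weight + 1)). A small sketch of that unit conversion with invented numbers, not kernel code, is shown below:

#include <stdio.h>

/* Convert a domain-wide load delta into one group's weight units and
 * back, the way the group-aware load_balance_fair() does. */
int main(void)
{
	unsigned long rem_load_move = 512;   /* load still to be moved        */
	unsigned long grp_rq_weight = 2048;  /* aggregate(tg, sd)->rq_weight  */
	unsigned long grp_load      = 1024;  /* aggregate(tg, sd)->load       */

	/* how much of this group's weight corresponds to rem_load_move */
	unsigned long rem_load = rem_load_move * grp_rq_weight / (grp_load + 1);

	/* pretend the iterator managed to move half of that weight */
	unsigned long moved_weight = rem_load / 2;

	/* convert the moved weight back into domain-wide load */
	unsigned long moved_load = moved_weight * grp_load / (grp_rq_weight + 1);

	printf("rem_load=%lu moved_weight=%lu moved_load=%lu\n",
	       rem_load, moved_weight, moved_load);
	return 0;
}

The round trip through the group's own units is what lets a group whose weight fraction is small contribute proportionally less to the amount of load considered moved.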
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 6b4a6b5a4167..765932d0399d 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -670,6 +670,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
 		rt_se->timeout = 0;
 
 	enqueue_rt_entity(rt_se);
+
+	inc_cpu_load(rq, p->se.load.weight);
 }
 
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -678,6 +680,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 
 	update_curr_rt(rq);
 	dequeue_rt_entity(rt_se);
+
+	dec_cpu_load(rq, p->se.load.weight);
 }
 
 /*
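
Taken together, the three files move the rq->load bookkeeping out of inc_nr_running()/dec_nr_running() and into the scheduling classes: CFS updates it only for entities without a parent in account_entity_enqueue()/account_entity_dequeue(), and the RT class updates it in enqueue_task_rt()/dequeue_task_rt(). A toy model of the resulting invariant, that rq->load counts only top-level entity weight, with invented names and numbers:

#include <assert.h>
#include <stdio.h>

struct toy_rq { unsigned long load; };

/* Only entities without a parent (top-level CFS entities and RT tasks)
 * touch rq->load, as in the patched account_entity_enqueue() and
 * enqueue_task_rt() paths. */
static void toy_enqueue(struct toy_rq *rq, unsigned long weight, int has_parent)
{
	if (!has_parent)
		rq->load += weight;
}

static void toy_dequeue(struct toy_rq *rq, unsigned long weight, int has_parent)
{
	if (!has_parent)
		rq->load -= weight;
}

int main(void)
{
	struct toy_rq rq = { 0 };

	toy_enqueue(&rq, 1024, 0);	/* top-level task                        */
	toy_enqueue(&rq, 2048, 0);	/* a group's top-level entity            */
	toy_enqueue(&rq, 1024, 1);	/* task inside the group: no load change */
	toy_dequeue(&rq, 1024, 0);	/* top-level task leaves                 */

	printf("rq->load = %lu\n", rq.load);
	assert(rq.load == 2048);
	return 0;
}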