author     Ingo Molnar <mingo@elte.hu>   2008-05-29 05:28:57 -0400
committer  Ingo Molnar <mingo@elte.hu>   2008-05-29 05:28:57 -0400
commit     6363ca57c76b7b83639ca8c83fc285fa26a7880e
tree       b8630b4af286409efdd648920a546fae24d4db88
parent     4285f594f84d1f0641fc962d00e6638dec4a19c4
revert ("sched: fair-group: SMP-nice for group scheduling")
Yanmin Zhang reported:
Comparing with 2.6.25, volanoMark has big regression with kernel 2.6.26-rc1.
It's about 50% on my 8-core stoakley, 16-core tigerton, and Itanium Montecito.
With bisect, I located the following patch:
| 18d95a2832c1392a2d63227a7a6d433cb9f2037e is first bad commit
| commit 18d95a2832c1392a2d63227a7a6d433cb9f2037e
| Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
| Date: Sat Apr 19 19:45:00 2008 +0200
|
| sched: fair-group: SMP-nice for group scheduling
Revert it so that we get v2.6.25 behavior.
Bisected-by: Yanmin Zhang <yanmin_zhang@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--   include/linux/sched.h  |   1
-rw-r--r--   kernel/sched.c         | 430
-rw-r--r--   kernel/sched_debug.c   |   5
-rw-r--r--   kernel/sched_fair.c    | 124
-rw-r--r--   kernel/sched_rt.c      |   4
5 files changed, 75 insertions, 489 deletions
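The mechanism being reverted sized each CPU's slice of a task group's shares in proportion to that CPU's runqueue weight within a sched_domain (see the removed __update_group_shares_cpu() in kernel/sched.c below). The following is a minimal, self-contained sketch of that proportional split, not kernel code; the 4-CPU domain and all weights are made up for illustration, and it only loosely mirrors the removed formula:

```c
#include <stdio.h>

#define NICE_0_LOAD 1024UL	/* stand-in for the kernel's nice-0 load unit */

int main(void)
{
	/* hypothetical per-cpu runqueue weights for one task group */
	unsigned long rq_weight[4] = { 3072, 1024, 0, 1024 };
	unsigned long tg_shares = 1024;	/* the group's configured shares */
	unsigned long sum = 0;
	int i;

	/* an idle cpu is treated as if it ran one nice-0 task (the "boost" case) */
	for (i = 0; i < 4; i++) {
		if (!rq_weight[i])
			rq_weight[i] = NICE_0_LOAD;
		sum += rq_weight[i];
	}

	/* shares_i = tg_shares * rq_weight_i / sum, loosely following the removed code */
	for (i = 0; i < 4; i++)
		printf("cpu%d: %lu shares\n", i, tg_shares * rq_weight[i] / (sum + 1));

	return 0;
}
```

With these example numbers the most loaded CPU ends up with roughly half of the group's shares, which is the per-domain redistribution that the aggregate machinery removed below was performing.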
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5395a6176f4b..8a888499954e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -766,7 +766,6 @@ struct sched_domain {
 	struct sched_domain *child;	/* bottom domain must be null terminated */
 	struct sched_group *groups;	/* the balancing groups of the domain */
 	cpumask_t span;			/* span of all CPUs in this domain */
-	int first_cpu;			/* cache of the first cpu in this domain */
 	unsigned long min_interval;	/* Minimum balance interval ms */
 	unsigned long max_interval;	/* Maximum balance interval ms */
 	unsigned int busy_factor;	/* less balancing by factor if busy */
diff --git a/kernel/sched.c b/kernel/sched.c
index 3dc13f05b10e..bfb8ad8ed171 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -398,43 +398,6 @@ struct cfs_rq {
 	 */
 	struct list_head leaf_cfs_rq_list;
 	struct task_group *tg;	/* group that "owns" this runqueue */
-
-#ifdef CONFIG_SMP
-	unsigned long task_weight;
-	unsigned long shares;
-	/*
-	 * We need space to build a sched_domain wide view of the full task
-	 * group tree, in order to avoid depending on dynamic memory allocation
-	 * during the load balancing we place this in the per cpu task group
-	 * hierarchy. This limits the load balancing to one instance per cpu,
-	 * but more should not be needed anyway.
-	 */
-	struct aggregate_struct {
-		/*
-		 * load = weight(cpus) * f(tg)
-		 *
-		 * Where f(tg) is the recursive weight fraction assigned to
-		 * this group.
-		 */
-		unsigned long load;
-
-		/*
-		 * part of the group weight distributed to this span.
-		 */
-		unsigned long shares;
-
-		/*
-		 * The sum of all runqueue weights within this span.
-		 */
-		unsigned long rq_weight;
-
-		/*
-		 * Weight contributed by tasks; this is the part we can
-		 * influence by moving tasks around.
-		 */
-		unsigned long task_weight;
-	} aggregate;
-#endif
 #endif
 };
 
@@ -1508,326 +1471,6 @@ static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
 static unsigned long cpu_avg_load_per_task(int cpu);
 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-/*
- * Group load balancing.
- *
- * We calculate a few balance domain wide aggregate numbers; load and weight.
- * Given the pictures below, and assuming each item has equal weight:
- *
- *         root          1 - thread
- *        / | \          A - group
- *       A  1  B
- *      /|\   / \
- *     C 2 D 3   4
- *     |   |
- *     5   6
- *
- * load:
- *    A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
- *    which equals 1/9-th of the total load.
- *
- * shares:
- *    The weight of this group on the selected cpus.
- *
- * rq_weight:
- *    Direct sum of all the cpu's their rq weight, e.g. A would get 3 while
- *    B would get 2.
- *
- * task_weight:
- *    Part of the rq_weight contributed by tasks; all groups except B would
- *    get 1, B gets 2.
- */
-
-static inline struct aggregate_struct *
-aggregate(struct task_group *tg, struct sched_domain *sd)
-{
-	return &tg->cfs_rq[sd->first_cpu]->aggregate;
-}
-
-typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
-
-/*
- * Iterate the full tree, calling @down when first entering a node and @up when
- * leaving it for the final time.
- */
-static
-void aggregate_walk_tree(aggregate_func down, aggregate_func up,
-			 struct sched_domain *sd)
-{
-	struct task_group *parent, *child;
-
-	rcu_read_lock();
-	parent = &root_task_group;
-down:
-	(*down)(parent, sd);
-	list_for_each_entry_rcu(child, &parent->children, siblings) {
-		parent = child;
-		goto down;
-
-up:
-		continue;
-	}
-	(*up)(parent, sd);
-
-	child = parent;
-	parent = parent->parent;
-	if (parent)
-		goto up;
-	rcu_read_unlock();
-}
-
-/*
- * Calculate the aggregate runqueue weight.
- */
-static
-void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
-{
-	unsigned long rq_weight = 0;
-	unsigned long task_weight = 0;
-	int i;
-
-	for_each_cpu_mask(i, sd->span) {
-		rq_weight += tg->cfs_rq[i]->load.weight;
-		task_weight += tg->cfs_rq[i]->task_weight;
-	}
-
-	aggregate(tg, sd)->rq_weight = rq_weight;
-	aggregate(tg, sd)->task_weight = task_weight;
-}
-
-/*
- * Compute the weight of this group on the given cpus.
- */
-static
-void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
-{
-	unsigned long shares = 0;
-	int i;
-
-	for_each_cpu_mask(i, sd->span)
-		shares += tg->cfs_rq[i]->shares;
-
-	if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares)
-		shares = tg->shares;
-
-	aggregate(tg, sd)->shares = shares;
-}
-
-/*
- * Compute the load fraction assigned to this group, relies on the aggregate
- * weight and this group's parent's load, i.e. top-down.
- */
-static
-void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
-{
-	unsigned long load;
-
-	if (!tg->parent) {
-		int i;
-
-		load = 0;
-		for_each_cpu_mask(i, sd->span)
-			load += cpu_rq(i)->load.weight;
-
-	} else {
-		load = aggregate(tg->parent, sd)->load;
-
-		/*
-		 * shares is our weight in the parent's rq so
-		 * shares/parent->rq_weight gives our fraction of the load
-		 */
-		load *= aggregate(tg, sd)->shares;
-		load /= aggregate(tg->parent, sd)->rq_weight + 1;
-	}
-
-	aggregate(tg, sd)->load = load;
-}
-
-static void __set_se_shares(struct sched_entity *se, unsigned long shares);
-
-/*
- * Calculate and set the cpu's group shares.
- */
-static void
-__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
-			  int tcpu)
-{
-	int boost = 0;
-	unsigned long shares;
-	unsigned long rq_weight;
-
-	if (!tg->se[tcpu])
-		return;
-
-	rq_weight = tg->cfs_rq[tcpu]->load.weight;
-
-	/*
-	 * If there are currently no tasks on the cpu pretend there is one of
-	 * average load so that when a new task gets to run here it will not
-	 * get delayed by group starvation.
-	 */
-	if (!rq_weight) {
-		boost = 1;
-		rq_weight = NICE_0_LOAD;
-	}
-
-	/*
-	 *           \Sum shares * rq_weight
-	 * shares =  -----------------------
-	 *               \Sum rq_weight
-	 *
-	 */
-	shares = aggregate(tg, sd)->shares * rq_weight;
-	shares /= aggregate(tg, sd)->rq_weight + 1;
-
-	/*
-	 * record the actual number of shares, not the boosted amount.
-	 */
-	tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
-
-	if (shares < MIN_SHARES)
-		shares = MIN_SHARES;
-	else if (shares > MAX_SHARES)
-		shares = MAX_SHARES;
-
-	__set_se_shares(tg->se[tcpu], shares);
-}
-
-/*
- * Re-adjust the weights on the cpu the task came from and on the cpu the
- * task went to.
- */
-static void
-__move_group_shares(struct task_group *tg, struct sched_domain *sd,
-		    int scpu, int dcpu)
-{
-	unsigned long shares;
-
-	shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
-
-	__update_group_shares_cpu(tg, sd, scpu);
-	__update_group_shares_cpu(tg, sd, dcpu);
-
-	/*
-	 * ensure we never loose shares due to rounding errors in the
-	 * above redistribution.
-	 */
-	shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
-	if (shares)
-		tg->cfs_rq[dcpu]->shares += shares;
-}
-
-/*
- * Because changing a group's shares changes the weight of the super-group
- * we need to walk up the tree and change all shares until we hit the root.
- */
-static void
-move_group_shares(struct task_group *tg, struct sched_domain *sd,
-		  int scpu, int dcpu)
-{
-	while (tg) {
-		__move_group_shares(tg, sd, scpu, dcpu);
-		tg = tg->parent;
-	}
-}
-
-static
-void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
-{
-	unsigned long shares = aggregate(tg, sd)->shares;
-	int i;
-
-	for_each_cpu_mask(i, sd->span) {
-		struct rq *rq = cpu_rq(i);
-		unsigned long flags;
-
-		spin_lock_irqsave(&rq->lock, flags);
-		__update_group_shares_cpu(tg, sd, i);
-		spin_unlock_irqrestore(&rq->lock, flags);
-	}
-
-	aggregate_group_shares(tg, sd);
-
-	/*
-	 * ensure we never loose shares due to rounding errors in the
-	 * above redistribution.
-	 */
-	shares -= aggregate(tg, sd)->shares;
-	if (shares) {
-		tg->cfs_rq[sd->first_cpu]->shares += shares;
-		aggregate(tg, sd)->shares += shares;
-	}
-}
-
-/*
- * Calculate the accumulative weight and recursive load of each task group
- * while walking down the tree.
- */
-static
-void aggregate_get_down(struct task_group *tg, struct sched_domain *sd)
-{
-	aggregate_group_weight(tg, sd);
-	aggregate_group_shares(tg, sd);
-	aggregate_group_load(tg, sd);
-}
-
-/*
- * Rebalance the cpu shares while walking back up the tree.
- */
-static
-void aggregate_get_up(struct task_group *tg, struct sched_domain *sd)
-{
-	aggregate_group_set_shares(tg, sd);
-}
-
-static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
-
-static void __init init_aggregate(void)
-{
-	int i;
-
-	for_each_possible_cpu(i)
-		spin_lock_init(&per_cpu(aggregate_lock, i));
-}
-
-static int get_aggregate(struct sched_domain *sd)
-{
-	if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu)))
-		return 0;
-
-	aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd);
-	return 1;
-}
-
-static void put_aggregate(struct sched_domain *sd)
-{
-	spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu));
-}
-
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
-{
-	cfs_rq->shares = shares;
-}
-
-#else
-
-static inline void init_aggregate(void)
-{
-}
-
-static inline int get_aggregate(struct sched_domain *sd)
-{
-	return 0;
-}
-
-static inline void put_aggregate(struct sched_domain *sd)
-{
-}
-#endif
-
 #else /* CONFIG_SMP */
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1848,14 +1491,26 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 
 #define sched_class_highest (&rt_sched_class)
 
-static void inc_nr_running(struct rq *rq)
+static inline void inc_load(struct rq *rq, const struct task_struct *p)
+{
+	update_load_add(&rq->load, p->se.load.weight);
+}
+
+static inline void dec_load(struct rq *rq, const struct task_struct *p)
+{
+	update_load_sub(&rq->load, p->se.load.weight);
+}
+
+static void inc_nr_running(struct task_struct *p, struct rq *rq)
 {
 	rq->nr_running++;
+	inc_load(rq, p);
 }
 
-static void dec_nr_running(struct rq *rq)
+static void dec_nr_running(struct task_struct *p, struct rq *rq)
 {
 	rq->nr_running--;
+	dec_load(rq, p);
 }
 
 static void set_load_weight(struct task_struct *p)
@@ -1947,7 +1602,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
 		rq->nr_uninterruptible--;
 
 	enqueue_task(rq, p, wakeup);
-	inc_nr_running(rq);
+	inc_nr_running(p, rq);
 }
 
 /*
@@ -1959,7 +1614,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
 		rq->nr_uninterruptible++;
 
 	dequeue_task(rq, p, sleep);
-	dec_nr_running(rq);
+	dec_nr_running(p, rq);
 }
 
 /**
@@ -2612,7 +2267,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		 * management (if any):
 		 */
 		p->sched_class->task_new(rq, p);
-		inc_nr_running(rq);
+		inc_nr_running(p, rq);
 	}
 	check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
@@ -3603,12 +3258,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	unsigned long imbalance;
 	struct rq *busiest;
 	unsigned long flags;
-	int unlock_aggregate;
 
 	cpus_setall(*cpus);
 
-	unlock_aggregate = get_aggregate(sd);
-
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
 	 * sibling can pick up load irrespective of busy siblings. In this case,
@@ -3724,9 +3376,8 @@ redo:
 
 	if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
 	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		ld_moved = -1;
-
-	goto out;
+		return -1;
+	return ld_moved;
 
 out_balanced:
 	schedstat_inc(sd, lb_balanced[idle]);
@@ -3741,13 +3392,8 @@ out_one_pinned:
 
 	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
 	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		ld_moved = -1;
-	else
-		ld_moved = 0;
-out:
-	if (unlock_aggregate)
-		put_aggregate(sd);
-	return ld_moved;
+		return -1;
+	return 0;
 }
 
 /*
@@ -4934,8 +4580,10 @@ void set_user_nice(struct task_struct *p, long nice)
 		goto out_unlock;
 	}
 	on_rq = p->se.on_rq;
-	if (on_rq)
+	if (on_rq) {
 		dequeue_task(rq, p, 0);
+		dec_load(rq, p);
+	}
 
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
@@ -4945,6 +4593,7 @@ void set_user_nice(struct task_struct *p, long nice)
 
 	if (on_rq) {
 		enqueue_task(rq, p, 0);
+		inc_load(rq, p);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -7319,7 +6968,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, ALLNODES);
 		set_domain_attribute(sd, attr);
 		sd->span = *cpu_map;
-		sd->first_cpu = first_cpu(sd->span);
 		cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
 		p = sd;
 		sd_allnodes = 1;
@@ -7330,7 +6978,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, NODE);
 		set_domain_attribute(sd, attr);
 		sched_domain_node_span(cpu_to_node(i), &sd->span);
-		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -7342,7 +6989,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, CPU);
 		set_domain_attribute(sd, attr);
 		sd->span = *nodemask;
-		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -7354,7 +7000,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, MC);
 		set_domain_attribute(sd, attr);
 		sd->span = cpu_coregroup_map(i);
-		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
@@ -7367,7 +7012,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, SIBLING);
 		set_domain_attribute(sd, attr);
 		sd->span = per_cpu(cpu_sibling_map, i);
-		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
@@ -8037,7 +7681,6 @@ void __init sched_init(void)
 	}
 
 #ifdef CONFIG_SMP
-	init_aggregate();
 	init_defrootdomain();
 #endif
 
@@ -8602,11 +8245,14 @@ void sched_move_task(struct task_struct *tsk)
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void __set_se_shares(struct sched_entity *se, unsigned long shares)
+static void set_se_shares(struct sched_entity *se, unsigned long shares)
 {
 	struct cfs_rq *cfs_rq = se->cfs_rq;
+	struct rq *rq = cfs_rq->rq;
 	int on_rq;
 
+	spin_lock_irq(&rq->lock);
+
 	on_rq = se->on_rq;
 	if (on_rq)
 		dequeue_entity(cfs_rq, se, 0);
@@ -8616,17 +8262,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares)
 
 	if (on_rq)
 		enqueue_entity(cfs_rq, se, 0);
-}
 
-static void set_se_shares(struct sched_entity *se, unsigned long shares)
-{
-	struct cfs_rq *cfs_rq = se->cfs_rq;
-	struct rq *rq = cfs_rq->rq;
-	unsigned long flags;
-
-	spin_lock_irqsave(&rq->lock, flags);
-	__set_se_shares(se, shares);
-	spin_unlock_irqrestore(&rq->lock, flags);
+	spin_unlock_irq(&rq->lock);
 }
 
 static DEFINE_MUTEX(shares_mutex);
@@ -8665,13 +8302,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	 * w/o tripping rebalance_share or load_balance_fair.
 	 */
 	tg->shares = shares;
-	for_each_possible_cpu(i) {
-		/*
-		 * force a rebalance
-		 */
-		cfs_rq_set_shares(tg->cfs_rq[i], 0);
+	for_each_possible_cpu(i)
 		set_se_shares(tg->se[i], shares);
-	}
 
 	/*
 	 * Enable load balance activity on this group, by inserting it back on
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 5f06118fbc31..8bb713040ac9 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -167,11 +167,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 #endif
 	SEQ_printf(m, "  .%-30s: %ld\n", "nr_spread_over",
 			cfs_rq->nr_spread_over);
-#ifdef CONFIG_FAIR_GROUP_SCHED
-#ifdef CONFIG_SMP
-	SEQ_printf(m, "  .%-30s: %lu\n", "shares", cfs_rq->shares);
-#endif
-#endif
 }
 
 static void print_cpu(struct seq_file *m, int cpu)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0eb0ae879542..f0f25fc12d0a 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -510,27 +510,10 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
-static void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
-{
-	cfs_rq->task_weight += weight;
-}
-#else
-static inline void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
-{
-}
-#endif
-
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_add(&cfs_rq->load, se->load.weight);
-	if (!parent_entity(se))
-		inc_cpu_load(rq_of(cfs_rq), se->load.weight);
-	if (entity_is_task(se))
-		add_cfs_task_weight(cfs_rq, se->load.weight);
 	cfs_rq->nr_running++;
 	se->on_rq = 1;
 	list_add(&se->group_node, &cfs_rq->tasks);
@@ -540,10 +523,6 @@ static void
 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_sub(&cfs_rq->load, se->load.weight);
-	if (!parent_entity(se))
-		dec_cpu_load(rq_of(cfs_rq), se->load.weight);
-	if (entity_is_task(se))
-		add_cfs_task_weight(cfs_rq, -se->load.weight);
 	cfs_rq->nr_running--;
 	se->on_rq = 0;
 	list_del_init(&se->group_node);
@@ -1327,90 +1306,75 @@ static struct task_struct *load_balance_next_fair(void *arg)
 	return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
 }
 
-static unsigned long
-__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
-		unsigned long max_load_move, struct sched_domain *sd,
-		enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
-		struct cfs_rq *cfs_rq)
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
 {
-	struct rq_iterator cfs_rq_iterator;
+	struct sched_entity *curr;
+	struct task_struct *p;
 
-	cfs_rq_iterator.start = load_balance_start_fair;
-	cfs_rq_iterator.next = load_balance_next_fair;
-	cfs_rq_iterator.arg = cfs_rq;
+	if (!cfs_rq->nr_running || !first_fair(cfs_rq))
+		return MAX_PRIO;
+
+	curr = cfs_rq->curr;
+	if (!curr)
+		curr = __pick_next_entity(cfs_rq);
 
-	return balance_tasks(this_rq, this_cpu, busiest,
-			max_load_move, sd, idle, all_pinned,
-			this_best_prio, &cfs_rq_iterator);
+	p = task_of(curr);
+
+	return p->prio;
 }
+#endif
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
 		  struct sched_domain *sd, enum cpu_idle_type idle,
 		  int *all_pinned, int *this_best_prio)
 {
+	struct cfs_rq *busy_cfs_rq;
 	long rem_load_move = max_load_move;
-	int busiest_cpu = cpu_of(busiest);
-	struct task_group *tg;
-
-	rcu_read_lock();
-	list_for_each_entry(tg, &task_groups, list) {
-		long imbalance;
-		unsigned long this_weight, busiest_weight;
-		long rem_load, max_load, moved_load;
-
-		/*
-		 * empty group
-		 */
-		if (!aggregate(tg, sd)->task_weight)
-			continue;
-
-		rem_load = rem_load_move * aggregate(tg, sd)->rq_weight;
-		rem_load /= aggregate(tg, sd)->load + 1;
-
-		this_weight = tg->cfs_rq[this_cpu]->task_weight;
-		busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
+	struct rq_iterator cfs_rq_iterator;
 
-		imbalance = (busiest_weight - this_weight) / 2;
+	cfs_rq_iterator.start = load_balance_start_fair;
+	cfs_rq_iterator.next = load_balance_next_fair;
 
-		if (imbalance < 0)
-			imbalance = busiest_weight;
+	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+		struct cfs_rq *this_cfs_rq;
+		long imbalance;
+		unsigned long maxload;
 
-		max_load = max(rem_load, imbalance);
-		moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
-				max_load, sd, idle, all_pinned, this_best_prio,
-				tg->cfs_rq[busiest_cpu]);
+		this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
 
-		if (!moved_load)
+		imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
+		/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
+		if (imbalance <= 0)
 			continue;
 
-		move_group_shares(tg, sd, busiest_cpu, this_cpu);
+		/* Don't pull more than imbalance/2 */
+		imbalance /= 2;
+		maxload = min(rem_load_move, imbalance);
 
-		moved_load *= aggregate(tg, sd)->load;
-		moved_load /= aggregate(tg, sd)->rq_weight + 1;
+		*this_best_prio = cfs_rq_best_prio(this_cfs_rq);
+#else
+# define maxload rem_load_move
+#endif
+		/*
+		 * pass busy_cfs_rq argument into
+		 * load_balance_[start|next]_fair iterators
+		 */
+		cfs_rq_iterator.arg = busy_cfs_rq;
+		rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
+					       maxload, sd, idle, all_pinned,
+					       this_best_prio,
+					       &cfs_rq_iterator);
 
-		rem_load_move -= moved_load;
-		if (rem_load_move < 0)
+		if (rem_load_move <= 0)
 			break;
 	}
-	rcu_read_unlock();
 
 	return max_load_move - rem_load_move;
 }
-#else
-static unsigned long
-load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
-		  unsigned long max_load_move,
-		  struct sched_domain *sd, enum cpu_idle_type idle,
-		  int *all_pinned, int *this_best_prio)
-{
-	return __load_balance_fair(this_rq, this_cpu, busiest,
-			max_load_move, sd, idle, all_pinned,
-			this_best_prio, &busiest->cfs);
-}
-#endif
 
 static int
 move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 060e87b0cb1c..3432d573205d 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -513,8 +513,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
 	 */
 	for_each_sched_rt_entity(rt_se)
 		enqueue_rt_entity(rt_se);
-
-	inc_cpu_load(rq, p->se.load.weight);
 }
 
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -534,8 +532,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 		if (rt_rq && rt_rq->rt_nr_running)
 			enqueue_rt_entity(rt_se);
 	}
-
-	dec_cpu_load(rq, p->se.load.weight);
 }
 
 /*