author		Peter Zijlstra <a.p.zijlstra@chello.nl>	2008-06-27 07:41:14 -0400
committer	Ingo Molnar <mingo@elte.hu>			2008-06-27 08:31:29 -0400
commit		c09595f63bb1909c5dc4dca288f4fe818561b5f3 (patch)
tree		42631e6986f3ea4543b125ca62a99df8548e0eb9
parent		ced8aa16e1db55c33c507174c1b1f9e107445865 (diff)

sched: revert revert of: fair-group: SMP-nice for group scheduling

Try again..

Initial commit: 18d95a2832c1392a2d63227a7a6d433cb9f2037e
Revert: 6363ca57c76b7b83639ca8c83fc285fa26a7880e

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--	include/linux/sched.h	  1
-rw-r--r--	kernel/sched.c		430
-rw-r--r--	kernel/sched_debug.c	  5
-rw-r--r--	kernel/sched_fair.c	124
-rw-r--r--	kernel/sched_rt.c	  4

5 files changed, 489 insertions, 75 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index eaf821072dbd..97a58b622ee1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -765,6 +765,7 @@ struct sched_domain {
 	struct sched_domain *child;	/* bottom domain must be null terminated */
 	struct sched_group *groups;	/* the balancing groups of the domain */
 	cpumask_t span;			/* span of all CPUs in this domain */
+	int first_cpu;			/* cache of the first cpu in this domain */
 	unsigned long min_interval;	/* Minimum balance interval ms */
 	unsigned long max_interval;	/* Maximum balance interval ms */
 	unsigned int busy_factor;	/* less balancing by factor if busy */
diff --git a/kernel/sched.c b/kernel/sched.c
index f653af684fb3..874b6da15430 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -403,6 +403,43 @@ struct cfs_rq {
  */
 	struct list_head leaf_cfs_rq_list;
 	struct task_group *tg;	/* group that "owns" this runqueue */
+
+#ifdef CONFIG_SMP
+	unsigned long task_weight;
+	unsigned long shares;
+	/*
+	 * We need space to build a sched_domain wide view of the full task
+	 * group tree. To avoid depending on dynamic memory allocation during
+	 * load balancing, we place this in the per cpu task group hierarchy.
+	 * This limits the load balancing to one instance per cpu, but more
+	 * should not be needed anyway.
+	 */
+	struct aggregate_struct {
+		/*
+		 * load = weight(cpus) * f(tg)
+		 *
+		 * Where f(tg) is the recursive weight fraction assigned to
+		 * this group.
+		 */
+		unsigned long load;
+
+		/*
+		 * part of the group weight distributed to this span.
+		 */
+		unsigned long shares;
+
+		/*
+		 * The sum of all runqueue weights within this span.
+		 */
+		unsigned long rq_weight;
+
+		/*
+		 * Weight contributed by tasks; this is the part we can
+		 * influence by moving tasks around.
+		 */
+		unsigned long task_weight;
+	} aggregate;
+#endif
 #endif
 };
 
@@ -1484,6 +1521,326 @@ static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
 static unsigned long cpu_avg_load_per_task(int cpu);
 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/*
+ * Group load balancing.
+ *
+ * We calculate a few balance domain wide aggregate numbers; load and weight.
+ * Given the pictures below, and assuming each item has equal weight:
+ *
+ *         root          1 - thread
+ *        / | \          A - group
+ *       A  1  B
+ *      /|\   / \
+ *     C 2 D 3   4
+ *     |   |
+ *     5   6
+ *
+ * load:
+ *    A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
+ *    which equals 1/9-th of the total load.
+ *
+ * shares:
+ *    The weight of this group on the selected cpus.
+ *
+ * rq_weight:
+ *    Direct sum of all the cpus' rq weights, e.g. A would get 3 while
+ *    B would get 2.
+ *
+ * task_weight:
+ *    Part of the rq_weight contributed by tasks; all groups except B would
+ *    get 1, B gets 2.
+ */
+
+static inline struct aggregate_struct *
+aggregate(struct task_group *tg, struct sched_domain *sd)
+{
+	return &tg->cfs_rq[sd->first_cpu]->aggregate;
+}
+
+typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
+
+/*
+ * Iterate the full tree, calling @down when first entering a node and @up when
+ * leaving it for the final time.
+ */
+static
+void aggregate_walk_tree(aggregate_func down, aggregate_func up,
+			 struct sched_domain *sd)
+{
+	struct task_group *parent, *child;
+
+	rcu_read_lock();
+	parent = &root_task_group;
+down:
+	(*down)(parent, sd);
+	list_for_each_entry_rcu(child, &parent->children, siblings) {
+		parent = child;
+		goto down;
+
+up:
+		continue;
+	}
+	(*up)(parent, sd);
+
+	child = parent;
+	parent = parent->parent;
+	if (parent)
+		goto up;
+	rcu_read_unlock();
+}
+
+/*
+ * Calculate the aggregate runqueue weight.
+ */
+static
+void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
+{
+	unsigned long rq_weight = 0;
+	unsigned long task_weight = 0;
+	int i;
+
+	for_each_cpu_mask(i, sd->span) {
+		rq_weight += tg->cfs_rq[i]->load.weight;
+		task_weight += tg->cfs_rq[i]->task_weight;
+	}
+
+	aggregate(tg, sd)->rq_weight = rq_weight;
+	aggregate(tg, sd)->task_weight = task_weight;
+}
+
+/*
+ * Compute the weight of this group on the given cpus.
+ */
+static
+void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
+{
+	unsigned long shares = 0;
+	int i;
+
+	for_each_cpu_mask(i, sd->span)
+		shares += tg->cfs_rq[i]->shares;
+
+	if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares)
+		shares = tg->shares;
+
+	aggregate(tg, sd)->shares = shares;
+}
+
+/*
+ * Compute the load fraction assigned to this group, relies on the aggregate
+ * weight and this group's parent's load, i.e. top-down.
+ */
+static
+void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
+{
+	unsigned long load;
+
+	if (!tg->parent) {
+		int i;
+
+		load = 0;
+		for_each_cpu_mask(i, sd->span)
+			load += cpu_rq(i)->load.weight;
+
+	} else {
+		load = aggregate(tg->parent, sd)->load;
+
+		/*
+		 * shares is our weight in the parent's rq so
+		 * shares/parent->rq_weight gives our fraction of the load
+		 */
+		load *= aggregate(tg, sd)->shares;
+		load /= aggregate(tg->parent, sd)->rq_weight + 1;
+	}
+
+	aggregate(tg, sd)->load = load;
+}
+
+static void __set_se_shares(struct sched_entity *se, unsigned long shares);
+
+/*
+ * Calculate and set the cpu's group shares.
+ */
+static void
+__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
+			  int tcpu)
+{
+	int boost = 0;
+	unsigned long shares;
+	unsigned long rq_weight;
+
+	if (!tg->se[tcpu])
+		return;
+
+	rq_weight = tg->cfs_rq[tcpu]->load.weight;
+
+	/*
+	 * If there are currently no tasks on the cpu pretend there is one of
+	 * average load so that when a new task gets to run here it will not
+	 * get delayed by group starvation.
+	 */
+	if (!rq_weight) {
+		boost = 1;
+		rq_weight = NICE_0_LOAD;
+	}
+
+	/*
+	 *           \Sum shares * rq_weight
+	 * shares =  -----------------------
+	 *               \Sum rq_weight
+	 *
+	 */
+	shares = aggregate(tg, sd)->shares * rq_weight;
+	shares /= aggregate(tg, sd)->rq_weight + 1;
+
+	/*
+	 * record the actual number of shares, not the boosted amount.
+	 */
+	tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
+
+	if (shares < MIN_SHARES)
+		shares = MIN_SHARES;
+	else if (shares > MAX_SHARES)
+		shares = MAX_SHARES;
+
+	__set_se_shares(tg->se[tcpu], shares);
+}
+
+/*
+ * Re-adjust the weights on the cpu the task came from and on the cpu the
+ * task went to.
+ */
+static void
+__move_group_shares(struct task_group *tg, struct sched_domain *sd,
+		    int scpu, int dcpu)
+{
+	unsigned long shares;
+
+	shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
+
+	__update_group_shares_cpu(tg, sd, scpu);
+	__update_group_shares_cpu(tg, sd, dcpu);
+
+	/*
+	 * ensure we never lose shares due to rounding errors in the
+	 * above redistribution.
+	 */
+	shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
+	if (shares)
+		tg->cfs_rq[dcpu]->shares += shares;
+}
+
+/*
+ * Because changing a group's shares changes the weight of the super-group
+ * we need to walk up the tree and change all shares until we hit the root.
+ */
+static void
+move_group_shares(struct task_group *tg, struct sched_domain *sd,
+		  int scpu, int dcpu)
+{
+	while (tg) {
+		__move_group_shares(tg, sd, scpu, dcpu);
+		tg = tg->parent;
+	}
+}
+
+static
+void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
+{
+	unsigned long shares = aggregate(tg, sd)->shares;
+	int i;
+
+	for_each_cpu_mask(i, sd->span) {
+		struct rq *rq = cpu_rq(i);
+		unsigned long flags;
+
+		spin_lock_irqsave(&rq->lock, flags);
+		__update_group_shares_cpu(tg, sd, i);
+		spin_unlock_irqrestore(&rq->lock, flags);
+	}
+
+	aggregate_group_shares(tg, sd);
+
+	/*
+	 * ensure we never lose shares due to rounding errors in the
+	 * above redistribution.
+	 */
+	shares -= aggregate(tg, sd)->shares;
+	if (shares) {
+		tg->cfs_rq[sd->first_cpu]->shares += shares;
+		aggregate(tg, sd)->shares += shares;
+	}
+}
+
+/*
+ * Calculate the accumulative weight and recursive load of each task group
+ * while walking down the tree.
+ */
+static
+void aggregate_get_down(struct task_group *tg, struct sched_domain *sd)
+{
+	aggregate_group_weight(tg, sd);
+	aggregate_group_shares(tg, sd);
+	aggregate_group_load(tg, sd);
+}
+
+/*
+ * Rebalance the cpu shares while walking back up the tree.
+ */
+static
+void aggregate_get_up(struct task_group *tg, struct sched_domain *sd)
+{
+	aggregate_group_set_shares(tg, sd);
+}
+
+static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
+
+static void __init init_aggregate(void)
+{
+	int i;
+
+	for_each_possible_cpu(i)
+		spin_lock_init(&per_cpu(aggregate_lock, i));
+}
+
+static int get_aggregate(struct sched_domain *sd)
+{
+	if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu)))
+		return 0;
+
+	aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd);
+	return 1;
+}
+
+static void put_aggregate(struct sched_domain *sd)
+{
+	spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu));
+}
+
+static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+{
+	cfs_rq->shares = shares;
+}
+
+#else
+
+static inline void init_aggregate(void)
+{
+}
+
+static inline int get_aggregate(struct sched_domain *sd)
+{
+	return 0;
+}
+
+static inline void put_aggregate(struct sched_domain *sd)
+{
+}
+#endif
+
 #endif
 
 #include "sched_stats.h"
@@ -1498,26 +1855,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 #define for_each_class(class) \
    for (class = sched_class_highest; class; class = class->next)
 
-static inline void inc_load(struct rq *rq, const struct task_struct *p)
-{
-	update_load_add(&rq->load, p->se.load.weight);
-}
-
-static inline void dec_load(struct rq *rq, const struct task_struct *p)
-{
-	update_load_sub(&rq->load, p->se.load.weight);
-}
-
-static void inc_nr_running(struct task_struct *p, struct rq *rq)
+static void inc_nr_running(struct rq *rq)
 {
 	rq->nr_running++;
-	inc_load(rq, p);
 }
 
-static void dec_nr_running(struct task_struct *p, struct rq *rq)
+static void dec_nr_running(struct rq *rq)
 {
 	rq->nr_running--;
-	dec_load(rq, p);
 }
 
 static void set_load_weight(struct task_struct *p)
@@ -1609,7 +1954,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
 	rq->nr_uninterruptible--;
 
 	enqueue_task(rq, p, wakeup);
-	inc_nr_running(p, rq);
+	inc_nr_running(rq);
 }
 
 /*
@@ -1621,7 +1966,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
 	rq->nr_uninterruptible++;
 
 	dequeue_task(rq, p, sleep);
-	dec_nr_running(p, rq);
+	dec_nr_running(rq);
 }
 
 /**
@@ -2274,7 +2619,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 	 * management (if any):
 	 */
 		p->sched_class->task_new(rq, p);
-		inc_nr_running(p, rq);
+		inc_nr_running(rq);
 	}
 	check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
@@ -3265,9 +3610,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	unsigned long imbalance;
 	struct rq *busiest;
 	unsigned long flags;
+	int unlock_aggregate;
 
 	cpus_setall(*cpus);
 
+	unlock_aggregate = get_aggregate(sd);
+
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
 	 * sibling can pick up load irrespective of busy siblings. In this case,
@@ -3383,8 +3731,9 @@ redo:
 
 	if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
 	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		return -1;
-	return ld_moved;
+		ld_moved = -1;
+
+	goto out;
 
 out_balanced:
 	schedstat_inc(sd, lb_balanced[idle]);
@@ -3399,8 +3748,13 @@ out_one_pinned:
 
 	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
 	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		return -1;
-	return 0;
+		ld_moved = -1;
+	else
+		ld_moved = 0;
+out:
+	if (unlock_aggregate)
+		put_aggregate(sd);
+	return ld_moved;
 }
 
 /*
@@ -4588,10 +4942,8 @@ void set_user_nice(struct task_struct *p, long nice)
 		goto out_unlock;
 	}
 	on_rq = p->se.on_rq;
-	if (on_rq) {
+	if (on_rq)
 		dequeue_task(rq, p, 0);
-		dec_load(rq, p);
-	}
 
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
@@ -4601,7 +4953,6 @@ void set_user_nice(struct task_struct *p, long nice)
 
 	if (on_rq) {
 		enqueue_task(rq, p, 0);
-		inc_load(rq, p);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -7016,6 +7367,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, ALLNODES);
 		set_domain_attribute(sd, attr);
 		sd->span = *cpu_map;
+		sd->first_cpu = first_cpu(sd->span);
 		cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
 		p = sd;
 		sd_allnodes = 1;
@@ -7026,6 +7378,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, NODE);
 		set_domain_attribute(sd, attr);
 		sched_domain_node_span(cpu_to_node(i), &sd->span);
+		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -7037,6 +7390,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, CPU);
 		set_domain_attribute(sd, attr);
 		sd->span = *nodemask;
+		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -7048,6 +7402,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, MC);
 		set_domain_attribute(sd, attr);
 		sd->span = cpu_coregroup_map(i);
+		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
@@ -7060,6 +7415,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, SIBLING);
 		set_domain_attribute(sd, attr);
 		sd->span = per_cpu(cpu_sibling_map, i);
+		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
@@ -7757,6 +8113,7 @@ void __init sched_init(void)
 	}
 
 #ifdef CONFIG_SMP
+	init_aggregate();
 	init_defrootdomain();
 #endif
 
@@ -8322,14 +8679,11 @@ void sched_move_task(struct task_struct *tsk)
 #endif	/* CONFIG_GROUP_SCHED */
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void set_se_shares(struct sched_entity *se, unsigned long shares)
+static void __set_se_shares(struct sched_entity *se, unsigned long shares)
 {
 	struct cfs_rq *cfs_rq = se->cfs_rq;
-	struct rq *rq = cfs_rq->rq;
 	int on_rq;
 
-	spin_lock_irq(&rq->lock);
-
 	on_rq = se->on_rq;
 	if (on_rq)
 		dequeue_entity(cfs_rq, se, 0);
@@ -8339,8 +8693,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
 
 	if (on_rq)
 		enqueue_entity(cfs_rq, se, 0);
+}
 
-	spin_unlock_irq(&rq->lock);
+static void set_se_shares(struct sched_entity *se, unsigned long shares)
+{
+	struct cfs_rq *cfs_rq = se->cfs_rq;
+	struct rq *rq = cfs_rq->rq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rq->lock, flags);
+	__set_se_shares(se, shares);
+	spin_unlock_irqrestore(&rq->lock, flags);
 }
 
 static DEFINE_MUTEX(shares_mutex);
@@ -8379,8 +8742,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	 * w/o tripping rebalance_share or load_balance_fair.
 	 */
 	tg->shares = shares;
-	for_each_possible_cpu(i)
+	for_each_possible_cpu(i) {
+		/*
+		 * force a rebalance
+		 */
+		cfs_rq_set_shares(tg->cfs_rq[i], 0);
 		set_se_shares(tg->se[i], shares);
+	}
 
 	/*
 	 * Enable load balance activity on this group, by inserting it back on
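
The share and load arithmetic introduced above (in __update_group_shares_cpu() and aggregate_group_load()) can be checked with a small worked example. The sketch below is plain user-space C with invented weights, not kernel code; only the two formulas are taken from the patch: a cpu's portion of the group shares is shares * rq_weight[cpu] / \Sum rq_weight, and a group's load is its parent's load scaled by shares / parent rq_weight (both with a +1 in the divisor to avoid dividing by zero).

/*
 * Worked example of the aggregate share/load arithmetic above.
 * All names and numbers are invented for illustration; only the formulas
 * mirror __update_group_shares_cpu() and aggregate_group_load().
 */
#include <stdio.h>

int main(void)
{
	unsigned long rq_weight[2] = { 2048, 1024 };	/* group weight on cpu0/cpu1 */
	unsigned long group_shares = 1024;		/* aggregate shares of the group */
	unsigned long parent_load = 3072;		/* aggregate load of the parent */
	unsigned long parent_rq_weight = 4096;		/* parent's summed rq weight */
	unsigned long sum_rq_weight = rq_weight[0] + rq_weight[1];
	int i;

	/* per-cpu share: shares * rq_weight[cpu] / \Sum rq_weight (+1 avoids /0) */
	for (i = 0; i < 2; i++)
		printf("cpu%d shares = %lu\n", i,
		       group_shares * rq_weight[i] / (sum_rq_weight + 1));

	/* top-down load fraction: parent load scaled by shares / parent rq_weight */
	printf("group load = %lu\n",
	       parent_load * group_shares / (parent_rq_weight + 1));

	return 0;
}
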
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 8e077b9c91cb..04394ccac88d 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -167,6 +167,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 #endif
 	SEQ_printf(m, "  .%-30s: %ld\n", "nr_spread_over",
 			cfs_rq->nr_spread_over);
+#ifdef CONFIG_FAIR_GROUP_SCHED
+#ifdef CONFIG_SMP
+	SEQ_printf(m, "  .%-30s: %lu\n", "shares", cfs_rq->shares);
+#endif
+#endif
 }
 
 void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 2e197b8e43f1..183388c4dead 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -567,10 +567,27 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+static void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+	cfs_rq->task_weight += weight;
+}
+#else
+static inline void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+}
+#endif
+
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_add(&cfs_rq->load, se->load.weight);
+	if (!parent_entity(se))
+		inc_cpu_load(rq_of(cfs_rq), se->load.weight);
+	if (entity_is_task(se))
+		add_cfs_task_weight(cfs_rq, se->load.weight);
 	cfs_rq->nr_running++;
 	se->on_rq = 1;
 	list_add(&se->group_node, &cfs_rq->tasks);
@@ -580,6 +597,10 @@ static void
 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_sub(&cfs_rq->load, se->load.weight);
+	if (!parent_entity(se))
+		dec_cpu_load(rq_of(cfs_rq), se->load.weight);
+	if (entity_is_task(se))
+		add_cfs_task_weight(cfs_rq, -se->load.weight);
 	cfs_rq->nr_running--;
 	se->on_rq = 0;
 	list_del_init(&se->group_node);
@@ -1372,75 +1393,90 @@ static struct task_struct *load_balance_next_fair(void *arg)
 	return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
+static unsigned long
+__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		unsigned long max_load_move, struct sched_domain *sd,
+		enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
+		struct cfs_rq *cfs_rq)
 {
-	struct sched_entity *curr;
-	struct task_struct *p;
-
-	if (!cfs_rq->nr_running || !first_fair(cfs_rq))
-		return MAX_PRIO;
-
-	curr = cfs_rq->curr;
-	if (!curr)
-		curr = __pick_next_entity(cfs_rq);
+	struct rq_iterator cfs_rq_iterator;
 
-	p = task_of(curr);
+	cfs_rq_iterator.start = load_balance_start_fair;
+	cfs_rq_iterator.next = load_balance_next_fair;
+	cfs_rq_iterator.arg = cfs_rq;
 
-	return p->prio;
+	return balance_tasks(this_rq, this_cpu, busiest,
+			max_load_move, sd, idle, all_pinned,
+			this_best_prio, &cfs_rq_iterator);
 }
-#endif
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
 		  struct sched_domain *sd, enum cpu_idle_type idle,
 		  int *all_pinned, int *this_best_prio)
 {
-	struct cfs_rq *busy_cfs_rq;
 	long rem_load_move = max_load_move;
-	struct rq_iterator cfs_rq_iterator;
-
-	cfs_rq_iterator.start = load_balance_start_fair;
-	cfs_rq_iterator.next = load_balance_next_fair;
+	int busiest_cpu = cpu_of(busiest);
+	struct task_group *tg;
 
-	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
-#ifdef CONFIG_FAIR_GROUP_SCHED
-		struct cfs_rq *this_cfs_rq;
+	rcu_read_lock();
+	list_for_each_entry(tg, &task_groups, list) {
 		long imbalance;
-		unsigned long maxload;
+		unsigned long this_weight, busiest_weight;
+		long rem_load, max_load, moved_load;
+
+		/*
+		 * empty group
+		 */
+		if (!aggregate(tg, sd)->task_weight)
+			continue;
+
+		rem_load = rem_load_move * aggregate(tg, sd)->rq_weight;
+		rem_load /= aggregate(tg, sd)->load + 1;
+
+		this_weight = tg->cfs_rq[this_cpu]->task_weight;
+		busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
 
-		this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
+		imbalance = (busiest_weight - this_weight) / 2;
 
-		imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
-		/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
-		if (imbalance <= 0)
+		if (imbalance < 0)
+			imbalance = busiest_weight;
+
+		max_load = max(rem_load, imbalance);
+		moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
+				max_load, sd, idle, all_pinned, this_best_prio,
+				tg->cfs_rq[busiest_cpu]);
+
+		if (!moved_load)
 			continue;
 
-		/* Don't pull more than imbalance/2 */
-		imbalance /= 2;
-		maxload = min(rem_load_move, imbalance);
+		move_group_shares(tg, sd, busiest_cpu, this_cpu);
 
-		*this_best_prio = cfs_rq_best_prio(this_cfs_rq);
-#else
-# define maxload rem_load_move
-#endif
-		/*
-		 * pass busy_cfs_rq argument into
-		 * load_balance_[start|next]_fair iterators
-		 */
-		cfs_rq_iterator.arg = busy_cfs_rq;
-		rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
-					       maxload, sd, idle, all_pinned,
-					       this_best_prio,
-					       &cfs_rq_iterator);
+		moved_load *= aggregate(tg, sd)->load;
+		moved_load /= aggregate(tg, sd)->rq_weight + 1;
 
-		if (rem_load_move <= 0)
+		rem_load_move -= moved_load;
+		if (rem_load_move < 0)
 			break;
 	}
+	rcu_read_unlock();
 
 	return max_load_move - rem_load_move;
 }
+#else
+static unsigned long
+load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		  unsigned long max_load_move,
+		  struct sched_domain *sd, enum cpu_idle_type idle,
+		  int *all_pinned, int *this_best_prio)
+{
+	return __load_balance_fair(this_rq, this_cpu, busiest,
+			max_load_move, sd, idle, all_pinned,
+			this_best_prio, &busiest->cfs);
+}
+#endif
 
 static int
 move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
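
The unit conversions in the rewritten load_balance_fair() above are also easy to follow with numbers. The following user-space sketch (invented weights, not kernel code) mirrors the two scalings: the remaining load to move is first translated into the group's rq_weight scale (rem_load = rem_load_move * rq_weight / (load + 1)), and whatever was actually moved is translated back into global load units (moved_load * load / (rq_weight + 1)) before being subtracted from rem_load_move.

/*
 * Worked example of the scaling done around __load_balance_fair() above.
 * Weights are invented; the two conversions mirror load_balance_fair().
 */
#include <stdio.h>

int main(void)
{
	long rem_load_move = 512;	/* load left to move, global load units */
	unsigned long rq_weight = 3072;	/* group's summed rq weight in the domain */
	unsigned long load = 1024;	/* group's aggregate (global) load */
	long rem_load, moved_load, moved_global;

	/* translate the global target into the group's own weight scale */
	rem_load = rem_load_move * (long)rq_weight / (long)(load + 1);

	/* assume the iterator managed to move half of that */
	moved_load = rem_load / 2;

	/* translate what was moved back into global load units */
	moved_global = moved_load * (long)load / (long)(rq_weight + 1);

	printf("rem_load=%ld moved=%ld moved_global=%ld\n",
	       rem_load, moved_load, moved_global);
	return 0;
}
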
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 6b4a6b5a4167..765932d0399d 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -670,6 +670,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
 	rt_se->timeout = 0;
 
 	enqueue_rt_entity(rt_se);
+
+	inc_cpu_load(rq, p->se.load.weight);
 }
 
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -678,6 +680,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 
 	update_curr_rt(rq);
 	dequeue_rt_entity(rt_se);
+
+	dec_cpu_load(rq, p->se.load.weight);
 }
 
 /*