author	Ingo Molnar <mingo@elte.hu>	2008-05-29 05:28:57 -0400
committer	Ingo Molnar <mingo@elte.hu>	2008-05-29 05:28:57 -0400
commit	6363ca57c76b7b83639ca8c83fc285fa26a7880e (patch)
tree	b8630b4af286409efdd648920a546fae24d4db88
parent	4285f594f84d1f0641fc962d00e6638dec4a19c4 (diff)
revert ("sched: fair-group: SMP-nice for group scheduling")
Yanmin Zhang reported:

  Comparing with 2.6.25, volanoMark has big regression with kernel
  2.6.26-rc1. It's about 50% on my 8-core stoakley, 16-core tigerton,
  and Itanium Montecito. With bisect, I located the following patch:

  | 18d95a2832c1392a2d63227a7a6d433cb9f2037e is first bad commit
  | commit 18d95a2832c1392a2d63227a7a6d433cb9f2037e
  | Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
  | Date:   Sat Apr 19 19:45:00 2008 +0200
  |
  |     sched: fair-group: SMP-nice for group scheduling

Revert it so that we get v2.6.25 behavior.

Bisected-by: Yanmin Zhang <yanmin_zhang@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r--	include/linux/sched.h	1
-rw-r--r--	kernel/sched.c	430
-rw-r--r--	kernel/sched_debug.c	5
-rw-r--r--	kernel/sched_fair.c	124
-rw-r--r--	kernel/sched_rt.c	4
5 files changed, 75 insertions, 489 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5395a6176f4b..8a888499954e 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -766,7 +766,6 @@ struct sched_domain {
 	struct sched_domain *child;	/* bottom domain must be null terminated */
 	struct sched_group *groups;	/* the balancing groups of the domain */
 	cpumask_t span;			/* span of all CPUs in this domain */
-	int first_cpu;			/* cache of the first cpu in this domain */
 	unsigned long min_interval;	/* Minimum balance interval ms */
 	unsigned long max_interval;	/* Maximum balance interval ms */
 	unsigned int busy_factor;	/* less balancing by factor if busy */
diff --git a/kernel/sched.c b/kernel/sched.c
index 3dc13f05b10e..bfb8ad8ed171 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -398,43 +398,6 @@ struct cfs_rq {
 	 */
 	struct list_head leaf_cfs_rq_list;
 	struct task_group *tg;	/* group that "owns" this runqueue */
-
-#ifdef CONFIG_SMP
-	unsigned long task_weight;
-	unsigned long shares;
-	/*
-	 * We need space to build a sched_domain wide view of the full task
-	 * group tree, in order to avoid depending on dynamic memory allocation
-	 * during the load balancing we place this in the per cpu task group
-	 * hierarchy. This limits the load balancing to one instance per cpu,
-	 * but more should not be needed anyway.
-	 */
-	struct aggregate_struct {
-		/*
-		 * load = weight(cpus) * f(tg)
-		 *
-		 * Where f(tg) is the recursive weight fraction assigned to
-		 * this group.
-		 */
-		unsigned long load;
-
-		/*
-		 * part of the group weight distributed to this span.
-		 */
-		unsigned long shares;
-
-		/*
-		 * The sum of all runqueue weights within this span.
-		 */
-		unsigned long rq_weight;
-
-		/*
-		 * Weight contributed by tasks; this is the part we can
-		 * influence by moving tasks around.
-		 */
-		unsigned long task_weight;
-	} aggregate;
-#endif
 #endif
 };
 
@@ -1508,326 +1471,6 @@ static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
 static unsigned long cpu_avg_load_per_task(int cpu);
 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-/*
- * Group load balancing.
- *
- * We calculate a few balance domain wide aggregate numbers; load and weight.
- * Given the pictures below, and assuming each item has equal weight:
- *
- *         root          1 - thread
- *         / | \         A - group
- *        A  1  B
- *       /|\   / \
- *      C 2 D 3   4
- *      |   |
- *      5   6
- *
- * load:
- *    A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
- *    which equals 1/9-th of the total load.
- *
- * shares:
- *    The weight of this group on the selected cpus.
- *
- * rq_weight:
- *    Direct sum of all the cpu's their rq weight, e.g. A would get 3 while
- *    B would get 2.
- *
- * task_weight:
- *    Part of the rq_weight contributed by tasks; all groups except B would
- *    get 1, B gets 2.
- */
-
-static inline struct aggregate_struct *
-aggregate(struct task_group *tg, struct sched_domain *sd)
-{
-	return &tg->cfs_rq[sd->first_cpu]->aggregate;
-}
-
-typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
-
-/*
- * Iterate the full tree, calling @down when first entering a node and @up when
- * leaving it for the final time.
- */
-static
-void aggregate_walk_tree(aggregate_func down, aggregate_func up,
-			 struct sched_domain *sd)
-{
-	struct task_group *parent, *child;
-
-	rcu_read_lock();
-	parent = &root_task_group;
-down:
-	(*down)(parent, sd);
-	list_for_each_entry_rcu(child, &parent->children, siblings) {
-		parent = child;
-		goto down;
-
-up:
-		continue;
-	}
-	(*up)(parent, sd);
-
-	child = parent;
-	parent = parent->parent;
-	if (parent)
-		goto up;
-	rcu_read_unlock();
-}
-
-/*
- * Calculate the aggregate runqueue weight.
- */
-static
-void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
-{
-	unsigned long rq_weight = 0;
-	unsigned long task_weight = 0;
-	int i;
-
-	for_each_cpu_mask(i, sd->span) {
-		rq_weight += tg->cfs_rq[i]->load.weight;
-		task_weight += tg->cfs_rq[i]->task_weight;
-	}
-
-	aggregate(tg, sd)->rq_weight = rq_weight;
-	aggregate(tg, sd)->task_weight = task_weight;
-}
-
-/*
- * Compute the weight of this group on the given cpus.
- */
-static
-void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
-{
-	unsigned long shares = 0;
-	int i;
-
-	for_each_cpu_mask(i, sd->span)
-		shares += tg->cfs_rq[i]->shares;
-
-	if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares)
-		shares = tg->shares;
-
-	aggregate(tg, sd)->shares = shares;
-}
-
-/*
- * Compute the load fraction assigned to this group, relies on the aggregate
- * weight and this group's parent's load, i.e. top-down.
- */
-static
-void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
-{
-	unsigned long load;
-
-	if (!tg->parent) {
-		int i;
-
-		load = 0;
-		for_each_cpu_mask(i, sd->span)
-			load += cpu_rq(i)->load.weight;
-
-	} else {
-		load = aggregate(tg->parent, sd)->load;
-
-		/*
-		 * shares is our weight in the parent's rq so
-		 * shares/parent->rq_weight gives our fraction of the load
-		 */
-		load *= aggregate(tg, sd)->shares;
-		load /= aggregate(tg->parent, sd)->rq_weight + 1;
-	}
-
-	aggregate(tg, sd)->load = load;
-}
-
-static void __set_se_shares(struct sched_entity *se, unsigned long shares);
-
-/*
- * Calculate and set the cpu's group shares.
- */
-static void
-__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
-			  int tcpu)
-{
-	int boost = 0;
-	unsigned long shares;
-	unsigned long rq_weight;
-
-	if (!tg->se[tcpu])
-		return;
-
-	rq_weight = tg->cfs_rq[tcpu]->load.weight;
-
-	/*
-	 * If there are currently no tasks on the cpu pretend there is one of
-	 * average load so that when a new task gets to run here it will not
-	 * get delayed by group starvation.
-	 */
-	if (!rq_weight) {
-		boost = 1;
-		rq_weight = NICE_0_LOAD;
-	}
-
-	/*
-	 *           \Sum shares * rq_weight
-	 * shares =  -----------------------
-	 *               \Sum rq_weight
-	 *
-	 */
-	shares = aggregate(tg, sd)->shares * rq_weight;
-	shares /= aggregate(tg, sd)->rq_weight + 1;
-
-	/*
-	 * record the actual number of shares, not the boosted amount.
-	 */
-	tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
-
-	if (shares < MIN_SHARES)
-		shares = MIN_SHARES;
-	else if (shares > MAX_SHARES)
-		shares = MAX_SHARES;
-
-	__set_se_shares(tg->se[tcpu], shares);
-}
-
-/*
- * Re-adjust the weights on the cpu the task came from and on the cpu the
- * task went to.
- */
-static void
-__move_group_shares(struct task_group *tg, struct sched_domain *sd,
-		    int scpu, int dcpu)
-{
-	unsigned long shares;
-
-	shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
-
-	__update_group_shares_cpu(tg, sd, scpu);
-	__update_group_shares_cpu(tg, sd, dcpu);
-
-	/*
-	 * ensure we never loose shares due to rounding errors in the
-	 * above redistribution.
-	 */
-	shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
-	if (shares)
-		tg->cfs_rq[dcpu]->shares += shares;
-}
-
-/*
- * Because changing a group's shares changes the weight of the super-group
- * we need to walk up the tree and change all shares until we hit the root.
- */
-static void
-move_group_shares(struct task_group *tg, struct sched_domain *sd,
-		  int scpu, int dcpu)
-{
-	while (tg) {
-		__move_group_shares(tg, sd, scpu, dcpu);
-		tg = tg->parent;
-	}
-}
-
-static
-void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
-{
-	unsigned long shares = aggregate(tg, sd)->shares;
-	int i;
-
-	for_each_cpu_mask(i, sd->span) {
-		struct rq *rq = cpu_rq(i);
-		unsigned long flags;
-
-		spin_lock_irqsave(&rq->lock, flags);
-		__update_group_shares_cpu(tg, sd, i);
-		spin_unlock_irqrestore(&rq->lock, flags);
-	}
-
-	aggregate_group_shares(tg, sd);
-
-	/*
-	 * ensure we never loose shares due to rounding errors in the
-	 * above redistribution.
-	 */
-	shares -= aggregate(tg, sd)->shares;
-	if (shares) {
-		tg->cfs_rq[sd->first_cpu]->shares += shares;
-		aggregate(tg, sd)->shares += shares;
-	}
-}
-
-/*
- * Calculate the accumulative weight and recursive load of each task group
- * while walking down the tree.
- */
-static
-void aggregate_get_down(struct task_group *tg, struct sched_domain *sd)
-{
-	aggregate_group_weight(tg, sd);
-	aggregate_group_shares(tg, sd);
-	aggregate_group_load(tg, sd);
-}
-
-/*
- * Rebalance the cpu shares while walking back up the tree.
- */
-static
-void aggregate_get_up(struct task_group *tg, struct sched_domain *sd)
-{
-	aggregate_group_set_shares(tg, sd);
-}
-
-static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
-
-static void __init init_aggregate(void)
-{
-	int i;
-
-	for_each_possible_cpu(i)
-		spin_lock_init(&per_cpu(aggregate_lock, i));
-}
-
-static int get_aggregate(struct sched_domain *sd)
-{
-	if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu)))
-		return 0;
-
-	aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd);
-	return 1;
-}
-
-static void put_aggregate(struct sched_domain *sd)
-{
-	spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu));
-}
-
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
-{
-	cfs_rq->shares = shares;
-}
-
-#else
-
-static inline void init_aggregate(void)
-{
-}
-
-static inline int get_aggregate(struct sched_domain *sd)
-{
-	return 0;
-}
-
-static inline void put_aggregate(struct sched_domain *sd)
-{
-}
-#endif
-
 #else /* CONFIG_SMP */
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1848,14 +1491,26 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 
 #define sched_class_highest (&rt_sched_class)
 
-static void inc_nr_running(struct rq *rq)
+static inline void inc_load(struct rq *rq, const struct task_struct *p)
+{
+	update_load_add(&rq->load, p->se.load.weight);
+}
+
+static inline void dec_load(struct rq *rq, const struct task_struct *p)
+{
+	update_load_sub(&rq->load, p->se.load.weight);
+}
+
+static void inc_nr_running(struct task_struct *p, struct rq *rq)
 {
 	rq->nr_running++;
+	inc_load(rq, p);
 }
 
-static void dec_nr_running(struct rq *rq)
+static void dec_nr_running(struct task_struct *p, struct rq *rq)
 {
 	rq->nr_running--;
+	dec_load(rq, p);
 }
 
 static void set_load_weight(struct task_struct *p)
@@ -1947,7 +1602,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
 		rq->nr_uninterruptible--;
 
 	enqueue_task(rq, p, wakeup);
-	inc_nr_running(rq);
+	inc_nr_running(p, rq);
 }
 
 /*
@@ -1959,7 +1614,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
 		rq->nr_uninterruptible++;
 
 	dequeue_task(rq, p, sleep);
-	dec_nr_running(rq);
+	dec_nr_running(p, rq);
 }
 
 /**
@@ -2612,7 +2267,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		 * management (if any):
 		 */
 		p->sched_class->task_new(rq, p);
-		inc_nr_running(rq);
+		inc_nr_running(p, rq);
 	}
 	check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
@@ -3603,12 +3258,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	unsigned long imbalance;
 	struct rq *busiest;
 	unsigned long flags;
-	int unlock_aggregate;
 
 	cpus_setall(*cpus);
 
-	unlock_aggregate = get_aggregate(sd);
-
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
 	 * sibling can pick up load irrespective of busy siblings. In this case,
@@ -3724,9 +3376,8 @@ redo:
 
 	if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
 	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		ld_moved = -1;
-
-	goto out;
+		return -1;
+	return ld_moved;
 
 out_balanced:
 	schedstat_inc(sd, lb_balanced[idle]);
@@ -3741,13 +3392,8 @@ out_one_pinned:
 
 	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
 	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		ld_moved = -1;
-	else
-		ld_moved = 0;
-out:
-	if (unlock_aggregate)
-		put_aggregate(sd);
-	return ld_moved;
+		return -1;
+	return 0;
 }
 
 /*
@@ -4934,8 +4580,10 @@ void set_user_nice(struct task_struct *p, long nice)
 		goto out_unlock;
 	}
 	on_rq = p->se.on_rq;
-	if (on_rq)
+	if (on_rq) {
 		dequeue_task(rq, p, 0);
+		dec_load(rq, p);
+	}
 
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
@@ -4945,6 +4593,7 @@ void set_user_nice(struct task_struct *p, long nice)
 
 	if (on_rq) {
 		enqueue_task(rq, p, 0);
+		inc_load(rq, p);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -7319,7 +6968,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, ALLNODES);
 		set_domain_attribute(sd, attr);
 		sd->span = *cpu_map;
-		sd->first_cpu = first_cpu(sd->span);
 		cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
 		p = sd;
 		sd_allnodes = 1;
@@ -7330,7 +6978,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, NODE);
 		set_domain_attribute(sd, attr);
 		sched_domain_node_span(cpu_to_node(i), &sd->span);
-		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -7342,7 +6989,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, CPU);
 		set_domain_attribute(sd, attr);
 		sd->span = *nodemask;
-		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -7354,7 +7000,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, MC);
 		set_domain_attribute(sd, attr);
 		sd->span = cpu_coregroup_map(i);
-		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
@@ -7367,7 +7012,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, SIBLING);
 		set_domain_attribute(sd, attr);
 		sd->span = per_cpu(cpu_sibling_map, i);
-		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
@@ -8037,7 +7681,6 @@ void __init sched_init(void)
 	}
 
 #ifdef CONFIG_SMP
-	init_aggregate();
 	init_defrootdomain();
 #endif
 
@@ -8602,11 +8245,14 @@ void sched_move_task(struct task_struct *tsk)
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void __set_se_shares(struct sched_entity *se, unsigned long shares)
+static void set_se_shares(struct sched_entity *se, unsigned long shares)
 {
 	struct cfs_rq *cfs_rq = se->cfs_rq;
+	struct rq *rq = cfs_rq->rq;
 	int on_rq;
 
+	spin_lock_irq(&rq->lock);
+
 	on_rq = se->on_rq;
 	if (on_rq)
 		dequeue_entity(cfs_rq, se, 0);
@@ -8616,17 +8262,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares)
 
 	if (on_rq)
 		enqueue_entity(cfs_rq, se, 0);
-}
 
-static void set_se_shares(struct sched_entity *se, unsigned long shares)
-{
-	struct cfs_rq *cfs_rq = se->cfs_rq;
-	struct rq *rq = cfs_rq->rq;
-	unsigned long flags;
-
-	spin_lock_irqsave(&rq->lock, flags);
-	__set_se_shares(se, shares);
-	spin_unlock_irqrestore(&rq->lock, flags);
+	spin_unlock_irq(&rq->lock);
 }
 
 static DEFINE_MUTEX(shares_mutex);
@@ -8665,13 +8302,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	 * w/o tripping rebalance_share or load_balance_fair.
 	 */
 	tg->shares = shares;
-	for_each_possible_cpu(i) {
-		/*
-		 * force a rebalance
-		 */
-		cfs_rq_set_shares(tg->cfs_rq[i], 0);
+	for_each_possible_cpu(i)
 		set_se_shares(tg->se[i], shares);
-	}
 
 	/*
 	 * Enable load balance activity on this group, by inserting it back on
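
Note: the reverted __update_group_shares_cpu() distributed a group's shares across the CPUs of a sched_domain in proportion to each runqueue's weight, per the formula quoted in its comment: shares_i = shares_total * rq_weight_i / Sum(rq_weight), clamped to [MIN_SHARES, MAX_SHARES]. A stand-alone sketch of that arithmetic follows; the weights, group share value, and clamp constants are invented for illustration and are not part of this patch.

#include <stdio.h>

/* Illustration only: proportional split of a group's shares, clamped. */
#define MIN_SHARES	2UL
#define MAX_SHARES	(1UL << 18)

int main(void)
{
	unsigned long tg_shares = 1024;			/* hypothetical group weight */
	unsigned long rq_weight[] = { 3072, 1024, 0, 2048 };	/* hypothetical per-cpu cfs_rq load */
	unsigned long total = 0;
	unsigned long i;

	for (i = 0; i < 4; i++)
		total += rq_weight[i];

	for (i = 0; i < 4; i++) {
		/* "+ 1" mirrors the reverted code's guard against dividing by zero */
		unsigned long shares = tg_shares * rq_weight[i] / (total + 1);

		if (shares < MIN_SHARES)
			shares = MIN_SHARES;
		else if (shares > MAX_SHARES)
			shares = MAX_SHARES;
		printf("cpu%lu: shares = %lu\n", i, shares);
	}
	return 0;
}
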
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 5f06118fbc31..8bb713040ac9 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -167,11 +167,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 #endif
 	SEQ_printf(m, "  .%-30s: %ld\n", "nr_spread_over",
 			cfs_rq->nr_spread_over);
-#ifdef CONFIG_FAIR_GROUP_SCHED
-#ifdef CONFIG_SMP
-	SEQ_printf(m, "  .%-30s: %lu\n", "shares", cfs_rq->shares);
-#endif
-#endif
 }
 
 static void print_cpu(struct seq_file *m, int cpu)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0eb0ae879542..f0f25fc12d0a 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -510,27 +510,10 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
-static void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
-{
-	cfs_rq->task_weight += weight;
-}
-#else
-static inline void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
-{
-}
-#endif
-
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_add(&cfs_rq->load, se->load.weight);
-	if (!parent_entity(se))
-		inc_cpu_load(rq_of(cfs_rq), se->load.weight);
-	if (entity_is_task(se))
-		add_cfs_task_weight(cfs_rq, se->load.weight);
 	cfs_rq->nr_running++;
 	se->on_rq = 1;
 	list_add(&se->group_node, &cfs_rq->tasks);
@@ -540,10 +523,6 @@ static void
 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_sub(&cfs_rq->load, se->load.weight);
-	if (!parent_entity(se))
-		dec_cpu_load(rq_of(cfs_rq), se->load.weight);
-	if (entity_is_task(se))
-		add_cfs_task_weight(cfs_rq, -se->load.weight);
 	cfs_rq->nr_running--;
 	se->on_rq = 0;
 	list_del_init(&se->group_node);
@@ -1327,90 +1306,75 @@ static struct task_struct *load_balance_next_fair(void *arg)
 	return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
 }
 
-static unsigned long
-__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
-		unsigned long max_load_move, struct sched_domain *sd,
-		enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
-		struct cfs_rq *cfs_rq)
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
 {
-	struct rq_iterator cfs_rq_iterator;
+	struct sched_entity *curr;
+	struct task_struct *p;
 
-	cfs_rq_iterator.start = load_balance_start_fair;
-	cfs_rq_iterator.next = load_balance_next_fair;
-	cfs_rq_iterator.arg = cfs_rq;
+	if (!cfs_rq->nr_running || !first_fair(cfs_rq))
+		return MAX_PRIO;
+
+	curr = cfs_rq->curr;
+	if (!curr)
+		curr = __pick_next_entity(cfs_rq);
 
-	return balance_tasks(this_rq, this_cpu, busiest,
-			max_load_move, sd, idle, all_pinned,
-			this_best_prio, &cfs_rq_iterator);
+	p = task_of(curr);
+
+	return p->prio;
 }
+#endif
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		unsigned long max_load_move,
 		struct sched_domain *sd, enum cpu_idle_type idle,
 		int *all_pinned, int *this_best_prio)
 {
+	struct cfs_rq *busy_cfs_rq;
 	long rem_load_move = max_load_move;
-	int busiest_cpu = cpu_of(busiest);
-	struct task_group *tg;
-
-	rcu_read_lock();
-	list_for_each_entry(tg, &task_groups, list) {
-		long imbalance;
-		unsigned long this_weight, busiest_weight;
-		long rem_load, max_load, moved_load;
-
-		/*
-		 * empty group
-		 */
-		if (!aggregate(tg, sd)->task_weight)
-			continue;
-
-		rem_load = rem_load_move * aggregate(tg, sd)->rq_weight;
-		rem_load /= aggregate(tg, sd)->load + 1;
-
-		this_weight = tg->cfs_rq[this_cpu]->task_weight;
-		busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
+	struct rq_iterator cfs_rq_iterator;
 
-		imbalance = (busiest_weight - this_weight) / 2;
+	cfs_rq_iterator.start = load_balance_start_fair;
+	cfs_rq_iterator.next = load_balance_next_fair;
 
-		if (imbalance < 0)
-			imbalance = busiest_weight;
+	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+		struct cfs_rq *this_cfs_rq;
+		long imbalance;
+		unsigned long maxload;
 
-		max_load = max(rem_load, imbalance);
-		moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
-				max_load, sd, idle, all_pinned, this_best_prio,
-				tg->cfs_rq[busiest_cpu]);
+		this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
 
-		if (!moved_load)
+		imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
+		/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
+		if (imbalance <= 0)
 			continue;
 
-		move_group_shares(tg, sd, busiest_cpu, this_cpu);
+		/* Don't pull more than imbalance/2 */
+		imbalance /= 2;
+		maxload = min(rem_load_move, imbalance);
 
-		moved_load *= aggregate(tg, sd)->load;
-		moved_load /= aggregate(tg, sd)->rq_weight + 1;
+		*this_best_prio = cfs_rq_best_prio(this_cfs_rq);
+#else
+# define maxload rem_load_move
+#endif
+		/*
+		 * pass busy_cfs_rq argument into
+		 * load_balance_[start|next]_fair iterators
+		 */
+		cfs_rq_iterator.arg = busy_cfs_rq;
+		rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
+					       maxload, sd, idle, all_pinned,
+					       this_best_prio,
+					       &cfs_rq_iterator);
 
-		rem_load_move -= moved_load;
-		if (rem_load_move < 0)
+		if (rem_load_move <= 0)
 			break;
 	}
-	rcu_read_unlock();
 
 	return max_load_move - rem_load_move;
 }
-#else
-static unsigned long
-load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
-		unsigned long max_load_move,
-		struct sched_domain *sd, enum cpu_idle_type idle,
-		int *all_pinned, int *this_best_prio)
-{
-	return __load_balance_fair(this_rq, this_cpu, busiest,
-			max_load_move, sd, idle, all_pinned,
-			this_best_prio, &busiest->cfs);
-}
-#endif
 
 static int
 move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
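
Note: the restored load_balance_fair() above limits how much is pulled from each busy cfs_rq: nothing if the busy queue is not heavier than the local one, otherwise at most half the weight difference, and never more than the load still to be moved. A stand-alone sketch of that clamping rule follows; the helper name and the numbers are invented for illustration and are not part of this patch.

#include <stdio.h>

/* Illustration only: per-cfs_rq pull limit used by the restored balancer. */
static long max_pull(long busy_weight, long this_weight, long rem_load_move)
{
	long imbalance = busy_weight - this_weight;

	if (imbalance <= 0)		/* busy queue is not heavier: pull nothing */
		return 0;
	imbalance /= 2;			/* don't pull more than imbalance/2 */
	return imbalance < rem_load_move ? imbalance : rem_load_move;
}

int main(void)
{
	printf("%ld\n", max_pull(3072, 1024, 4096));	/* -> 1024 */
	printf("%ld\n", max_pull(1024, 2048, 4096));	/* -> 0    */
	printf("%ld\n", max_pull(8192, 0, 1000));	/* -> 1000 */
	return 0;
}
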
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 060e87b0cb1c..3432d573205d 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -513,8 +513,6 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
 	 */
 	for_each_sched_rt_entity(rt_se)
 		enqueue_rt_entity(rt_se);
-
-	inc_cpu_load(rq, p->se.load.weight);
 }
 
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -534,8 +532,6 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 		if (rt_rq && rt_rq->rt_nr_running)
 			enqueue_rt_entity(rt_se);
 	}
-
-	dec_cpu_load(rq, p->se.load.weight);
 }
 
 /*