author		Peter Zijlstra <a.p.zijlstra@chello.nl>	2008-04-19 13:45:00 -0400
committer	Ingo Molnar <mingo@elte.hu>		2008-04-19 13:45:00 -0400
commit		18d95a2832c1392a2d63227a7a6d433cb9f2037e
tree		fa85b700aa3caac5b1309edd8e31d9b957957a83
parent		1d3504fcf5606579d60b649d19f44b3871c1ddae
sched: fair-group: SMP-nice for group scheduling
Implement SMP nice support for the full group hierarchy.

On each load-balance action, compile a sched_domain wide view of the full
task_group tree. We compute the domain wide view when walking down the
hierarchy, and readjust the weights when walking back up.

After collecting and readjusting the domain wide view, we try to balance the
tasks within the task_groups. The current approach naively balances each task
group until we've moved the targeted amount of load.

Inspired by Srivatsa Vaddagiri's previous code and Abhishek Chandra's H-SMP
paper.

XXX: there will be some numerical issues due to the limited nature of
SCHED_LOAD_SCALE with respect to representing a task_group's influence on the
total weight. When the tree is deep enough, or the task weight small enough,
we'll run out of bits.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Abhishek Chandra <chandra@cs.umn.edu>
CC: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
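[Illustration, not part of the patch: the per-cpu share distribution described above splits tg->shares over the CPUs of the balance domain in proportion to each CPU's runqueue weight, with a +1 in the divisor to avoid dividing by zero. The standalone userspace sketch below replays that arithmetic; the file name and the weights are made up for the example.]

/* shares-demo.c -- standalone sketch, not part of the patch */
#include <stdio.h>

int main(void)
{
	/* example numbers only: one nice-0 group spread over two CPUs */
	unsigned long tg_shares = 1024;			/* tg->shares */
	unsigned long rq_weight[2] = { 3072, 1024 };	/* per-cpu cfs_rq weight */
	unsigned long sum = rq_weight[0] + rq_weight[1];
	int i;

	/* shares_cpu = tg->shares * rq_weight_cpu / (\Sum rq_weight + 1) */
	for (i = 0; i < 2; i++)
		printf("cpu%d gets %lu of the group's shares\n",
		       i, tg_shares * rq_weight[i] / (sum + 1));

	return 0;
}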
-rw-r--r--	include/linux/sched.h	  1
-rw-r--r--	kernel/sched.c		497
-rw-r--r--	kernel/sched_fair.c	124
-rw-r--r--	kernel/sched_rt.c	  4
4 files changed, 548 insertions, 78 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 11f47249cdd2..0a32059e6ed4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -758,6 +758,7 @@ struct sched_domain {
 	struct sched_domain *child;	/* bottom domain must be null terminated */
 	struct sched_group *groups;	/* the balancing groups of the domain */
 	cpumask_t span;			/* span of all CPUs in this domain */
+	int first_cpu;			/* cache of the first cpu in this domain */
 	unsigned long min_interval;	/* Minimum balance interval ms */
 	unsigned long max_interval;	/* Maximum balance interval ms */
 	unsigned int busy_factor;	/* less balancing by factor if busy */
diff --git a/kernel/sched.c b/kernel/sched.c
index 62d7481caca5..ae1a3e936d28 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -316,6 +316,8 @@ static DEFINE_MUTEX(doms_cur_mutex);
 # define INIT_TASK_GROUP_LOAD	NICE_0_LOAD
 #endif
 
+#define MIN_SHARES	2
+
 static int init_task_group_load = INIT_TASK_GROUP_LOAD;
 #endif
 
@@ -403,6 +405,43 @@ struct cfs_rq {
 	 */
 	struct list_head leaf_cfs_rq_list;
 	struct task_group *tg;	/* group that "owns" this runqueue */
+
+#ifdef CONFIG_SMP
+	unsigned long task_weight;
+	unsigned long shares;
+	/*
+	 * We need space to build a sched_domain wide view of the full task
+	 * group tree, in order to avoid depending on dynamic memory allocation
+	 * during the load balancing we place this in the per cpu task group
+	 * hierarchy. This limits the load balancing to one instance per cpu,
+	 * but more should not be needed anyway.
+	 */
+	struct aggregate_struct {
+		/*
+		 *   load = weight(cpus) * f(tg)
+		 *
+		 * Where f(tg) is the recursive weight fraction assigned to
+		 * this group.
+		 */
+		unsigned long load;
+
+		/*
+		 * part of the group weight distributed to this span.
+		 */
+		unsigned long shares;
+
+		/*
+		 * The sum of all runqueue weights within this span.
+		 */
+		unsigned long rq_weight;
+
+		/*
+		 * Weight contributed by tasks; this is the part we can
+		 * influence by moving tasks around.
+		 */
+		unsigned long task_weight;
+	} aggregate;
+#endif
 #endif
 };
 
@@ -1402,11 +1441,390 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
 #endif
 
+static inline void inc_cpu_load(struct rq *rq, unsigned long load)
+{
+	update_load_add(&rq->load, load);
+}
+
+static inline void dec_cpu_load(struct rq *rq, unsigned long load)
+{
+	update_load_sub(&rq->load, load);
+}
+
 #ifdef CONFIG_SMP
 static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
 static unsigned long cpu_avg_load_per_task(int cpu);
 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/*
+ * Group load balancing.
+ *
+ * We calculate a few balance domain wide aggregate numbers; load and weight.
+ * Given the pictures below, and assuming each item has equal weight:
+ *
+ *         root          1 - thread
+ *        / | \          A - group
+ *       A  1  B
+ *      /|\   / \
+ *     C 2 D 3   4
+ *     |   |
+ *     5   6
+ *
+ * load:
+ *    A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
+ *    which equals 1/9-th of the total load.
+ *
+ * shares:
+ *    The weight of this group on the selected cpus.
+ *
+ * rq_weight:
+ *    Direct sum of all the cpu's their rq weight, e.g. A would get 3 while
+ *    B would get 2.
+ *
+ * task_weight:
+ *    Part of the rq_weight contributed by tasks; all groups except B would
+ *    get 1, B gets 2.
+ */
+
+static inline struct aggregate_struct *
+aggregate(struct task_group *tg, struct sched_domain *sd)
+{
+	return &tg->cfs_rq[sd->first_cpu]->aggregate;
+}
+
+typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
+
+/*
+ * Iterate the full tree, calling @down when first entering a node and @up when
+ * leaving it for the final time.
+ */
+static
+void aggregate_walk_tree(aggregate_func down, aggregate_func up,
+			 struct sched_domain *sd)
+{
+	struct task_group *parent, *child;
+
+	rcu_read_lock();
+	parent = &root_task_group;
+down:
+	(*down)(parent, sd);
+	list_for_each_entry_rcu(child, &parent->children, siblings) {
+		parent = child;
+		goto down;
+
+up:
+		continue;
+	}
+	(*up)(parent, sd);
+
+	child = parent;
+	parent = parent->parent;
+	if (parent)
+		goto up;
+	rcu_read_unlock();
+}
+
+/*
+ * Calculate the aggregate runqueue weight.
+ */
+static
+void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
+{
+	unsigned long rq_weight = 0;
+	unsigned long task_weight = 0;
+	int i;
+
+	for_each_cpu_mask(i, sd->span) {
+		rq_weight += tg->cfs_rq[i]->load.weight;
+		task_weight += tg->cfs_rq[i]->task_weight;
+	}
+
+	aggregate(tg, sd)->rq_weight = rq_weight;
+	aggregate(tg, sd)->task_weight = task_weight;
+}
+
+/*
+ * Redistribute tg->shares amongst all tg->cfs_rq[]s.
+ */
+static void __aggregate_redistribute_shares(struct task_group *tg)
+{
+	int i, max_cpu = smp_processor_id();
+	unsigned long rq_weight = 0;
+	unsigned long shares, max_shares = 0, shares_rem = tg->shares;
+
+	for_each_possible_cpu(i)
+		rq_weight += tg->cfs_rq[i]->load.weight;
+
+	for_each_possible_cpu(i) {
+		/*
+		 * divide shares proportional to the rq_weights.
+		 */
+		shares = tg->shares * tg->cfs_rq[i]->load.weight;
+		shares /= rq_weight + 1;
+
+		tg->cfs_rq[i]->shares = shares;
+
+		if (shares > max_shares) {
+			max_shares = shares;
+			max_cpu = i;
+		}
+		shares_rem -= shares;
+	}
+
+	/*
+	 * Ensure it all adds up to tg->shares; we can loose a few
+	 * due to rounding down when computing the per-cpu shares.
+	 */
+	if (shares_rem)
+		tg->cfs_rq[max_cpu]->shares += shares_rem;
+}
+
+/*
+ * Compute the weight of this group on the given cpus.
+ */
+static
+void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
+{
+	unsigned long shares = 0;
+	int i;
+
+again:
+	for_each_cpu_mask(i, sd->span)
+		shares += tg->cfs_rq[i]->shares;
+
+	/*
+	 * When the span doesn't have any shares assigned, but does have
+	 * tasks to run do a machine wide rebalance (should be rare).
+	 */
+	if (unlikely(!shares && aggregate(tg, sd)->rq_weight)) {
+		__aggregate_redistribute_shares(tg);
+		goto again;
+	}
+
+	aggregate(tg, sd)->shares = shares;
+}
+
+/*
+ * Compute the load fraction assigned to this group, relies on the aggregate
+ * weight and this group's parent's load, i.e. top-down.
+ */
+static
+void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
+{
+	unsigned long load;
+
+	if (!tg->parent) {
+		int i;
+
+		load = 0;
+		for_each_cpu_mask(i, sd->span)
+			load += cpu_rq(i)->load.weight;
+
+	} else {
+		load = aggregate(tg->parent, sd)->load;
+
+		/*
+		 * shares is our weight in the parent's rq so
+		 * shares/parent->rq_weight gives our fraction of the load
+		 */
+		load *= aggregate(tg, sd)->shares;
+		load /= aggregate(tg->parent, sd)->rq_weight + 1;
+	}
+
+	aggregate(tg, sd)->load = load;
+}
+
+static void __set_se_shares(struct sched_entity *se, unsigned long shares);
+
+/*
+ * Calculate and set the cpu's group shares.
+ */
+static void
+__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
+			  int tcpu)
+{
+	int boost = 0;
+	unsigned long shares;
+	unsigned long rq_weight;
+
+	if (!tg->se[tcpu])
+		return;
+
+	rq_weight = tg->cfs_rq[tcpu]->load.weight;
+
+	/*
+	 * If there are currently no tasks on the cpu pretend there is one of
+	 * average load so that when a new task gets to run here it will not
+	 * get delayed by group starvation.
+	 */
+	if (!rq_weight) {
+		boost = 1;
+		rq_weight = NICE_0_LOAD;
+	}
+
+	/*
+	 *           \Sum shares * rq_weight
+	 * shares =  -----------------------
+	 *               \Sum rq_weight
+	 *
+	 */
+	shares = aggregate(tg, sd)->shares * rq_weight;
+	shares /= aggregate(tg, sd)->rq_weight + 1;
+
+	/*
+	 * record the actual number of shares, not the boosted amount.
+	 */
+	tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
+
+	if (shares < MIN_SHARES)
+		shares = MIN_SHARES;
+
+	__set_se_shares(tg->se[tcpu], shares);
+}
+
+/*
+ * Re-adjust the weights on the cpu the task came from and on the cpu the
+ * task went to.
+ */
+static void
+__move_group_shares(struct task_group *tg, struct sched_domain *sd,
+		    int scpu, int dcpu)
+{
+	unsigned long shares;
+
+	shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
+
+	__update_group_shares_cpu(tg, sd, scpu);
+	__update_group_shares_cpu(tg, sd, dcpu);
+
+	/*
+	 * ensure we never loose shares due to rounding errors in the
+	 * above redistribution.
+	 */
+	shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
+	if (shares)
+		tg->cfs_rq[dcpu]->shares += shares;
+}
+
+/*
+ * Because changing a group's shares changes the weight of the super-group
+ * we need to walk up the tree and change all shares until we hit the root.
+ */
+static void
+move_group_shares(struct task_group *tg, struct sched_domain *sd,
+		  int scpu, int dcpu)
+{
+	while (tg) {
+		__move_group_shares(tg, sd, scpu, dcpu);
+		tg = tg->parent;
+	}
+}
+
+static
+void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
+{
+	unsigned long shares = aggregate(tg, sd)->shares;
+	int i;
+
+	for_each_cpu_mask(i, sd->span) {
+		struct rq *rq = cpu_rq(i);
+		unsigned long flags;
+
+		spin_lock_irqsave(&rq->lock, flags);
+		__update_group_shares_cpu(tg, sd, i);
+		spin_unlock_irqrestore(&rq->lock, flags);
+	}
+
+	aggregate_group_shares(tg, sd);
+
+	/*
+	 * ensure we never loose shares due to rounding errors in the
+	 * above redistribution.
+	 */
+	shares -= aggregate(tg, sd)->shares;
+	if (shares) {
+		tg->cfs_rq[sd->first_cpu]->shares += shares;
+		aggregate(tg, sd)->shares += shares;
+	}
+}
+
+/*
+ * Calculate the accumulative weight and recursive load of each task group
+ * while walking down the tree.
+ */
+static
+void aggregate_get_down(struct task_group *tg, struct sched_domain *sd)
+{
+	aggregate_group_weight(tg, sd);
+	aggregate_group_shares(tg, sd);
+	aggregate_group_load(tg, sd);
+}
+
+/*
+ * Rebalance the cpu shares while walking back up the tree.
+ */
+static
+void aggregate_get_up(struct task_group *tg, struct sched_domain *sd)
+{
+	aggregate_group_set_shares(tg, sd);
+}
+
+static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
+
+static void __init init_aggregate(void)
+{
+	int i;
+
+	for_each_possible_cpu(i)
+		spin_lock_init(&per_cpu(aggregate_lock, i));
+}
+
+static int get_aggregate(struct sched_domain *sd)
+{
+	if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu)))
+		return 0;
+
+	aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd);
+	return 1;
+}
+
+static void put_aggregate(struct sched_domain *sd)
+{
+	spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu));
+}
+
+static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+{
+	cfs_rq->shares = shares;
+}
+
+#else
+
+static inline void init_aggregate(void)
+{
+}
+
+static inline int get_aggregate(struct sched_domain *sd)
+{
+	return 0;
+}
+
+static inline void put_aggregate(struct sched_domain *sd)
+{
+}
+#endif
+
+#else /* CONFIG_SMP */
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+{
+}
+#endif
+
 #endif /* CONFIG_SMP */
 
 #include "sched_stats.h"
@@ -1419,26 +1837,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 
 #define sched_class_highest (&rt_sched_class)
 
-static inline void inc_load(struct rq *rq, const struct task_struct *p)
-{
-	update_load_add(&rq->load, p->se.load.weight);
-}
-
-static inline void dec_load(struct rq *rq, const struct task_struct *p)
-{
-	update_load_sub(&rq->load, p->se.load.weight);
-}
-
-static void inc_nr_running(struct task_struct *p, struct rq *rq)
+static void inc_nr_running(struct rq *rq)
 {
 	rq->nr_running++;
-	inc_load(rq, p);
 }
 
-static void dec_nr_running(struct task_struct *p, struct rq *rq)
+static void dec_nr_running(struct rq *rq)
 {
 	rq->nr_running--;
-	dec_load(rq, p);
 }
 
 static void set_load_weight(struct task_struct *p)
@@ -1530,7 +1936,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
 		rq->nr_uninterruptible--;
 
 	enqueue_task(rq, p, wakeup);
-	inc_nr_running(p, rq);
+	inc_nr_running(rq);
 }
 
 /*
@@ -1542,7 +1948,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
 		rq->nr_uninterruptible++;
 
 	dequeue_task(rq, p, sleep);
-	dec_nr_running(p, rq);
+	dec_nr_running(rq);
 }
 
 /**
@@ -2194,7 +2600,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		 * management (if any):
 		 */
 		p->sched_class->task_new(rq, p);
-		inc_nr_running(p, rq);
+		inc_nr_running(rq);
 	}
 	check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
@@ -3185,9 +3591,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	unsigned long imbalance;
 	struct rq *busiest;
 	unsigned long flags;
+	int unlock_aggregate;
 
 	cpus_setall(*cpus);
 
+	unlock_aggregate = get_aggregate(sd);
+
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
 	 * sibling can pick up load irrespective of busy siblings. In this case,
@@ -3303,8 +3712,9 @@ redo:
 
 	if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
 	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		return -1;
-	return ld_moved;
+		ld_moved = -1;
+
+	goto out;
 
 out_balanced:
 	schedstat_inc(sd, lb_balanced[idle]);
@@ -3319,8 +3729,13 @@ out_one_pinned:
 
 	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
 	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		return -1;
-	return 0;
+		ld_moved = -1;
+	else
+		ld_moved = 0;
+out:
+	if (unlock_aggregate)
+		put_aggregate(sd);
+	return ld_moved;
 }
 
 /*
@@ -4535,10 +4950,8 @@ void set_user_nice(struct task_struct *p, long nice)
 		goto out_unlock;
 	}
 	on_rq = p->se.on_rq;
-	if (on_rq) {
+	if (on_rq)
 		dequeue_task(rq, p, 0);
-		dec_load(rq, p);
-	}
 
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
@@ -4548,7 +4961,6 @@ void set_user_nice(struct task_struct *p, long nice)
 
 	if (on_rq) {
 		enqueue_task(rq, p, 0);
-		inc_load(rq, p);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -6921,6 +7333,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 			SD_INIT(sd, ALLNODES);
 			set_domain_attribute(sd, attr);
 			sd->span = *cpu_map;
+			sd->first_cpu = first_cpu(sd->span);
 			cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
 			p = sd;
 			sd_allnodes = 1;
@@ -6931,6 +7344,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, NODE);
 		set_domain_attribute(sd, attr);
 		sched_domain_node_span(cpu_to_node(i), &sd->span);
+		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -6942,6 +7356,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, CPU);
 		set_domain_attribute(sd, attr);
 		sd->span = *nodemask;
+		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -6953,6 +7368,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, MC);
 		set_domain_attribute(sd, attr);
 		sd->span = cpu_coregroup_map(i);
+		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
@@ -6965,6 +7381,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, SIBLING);
 		set_domain_attribute(sd, attr);
 		sd->span = per_cpu(cpu_sibling_map, i);
+		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
@@ -7633,6 +8050,7 @@ void __init sched_init(void)
 	}
 
 #ifdef CONFIG_SMP
+	init_aggregate();
 	init_defrootdomain();
 #endif
 
@@ -8199,14 +8617,11 @@ void sched_move_task(struct task_struct *tsk)
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void set_se_shares(struct sched_entity *se, unsigned long shares)
+static void __set_se_shares(struct sched_entity *se, unsigned long shares)
 {
 	struct cfs_rq *cfs_rq = se->cfs_rq;
-	struct rq *rq = cfs_rq->rq;
 	int on_rq;
 
-	spin_lock_irq(&rq->lock);
-
 	on_rq = se->on_rq;
 	if (on_rq)
 		dequeue_entity(cfs_rq, se, 0);
@@ -8216,8 +8631,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
 
 	if (on_rq)
 		enqueue_entity(cfs_rq, se, 0);
+}
 
-	spin_unlock_irq(&rq->lock);
+static void set_se_shares(struct sched_entity *se, unsigned long shares)
+{
+	struct cfs_rq *cfs_rq = se->cfs_rq;
+	struct rq *rq = cfs_rq->rq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rq->lock, flags);
+	__set_se_shares(se, shares);
+	spin_unlock_irqrestore(&rq->lock, flags);
 }
 
 static DEFINE_MUTEX(shares_mutex);
@@ -8238,8 +8662,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	 * (The default weight is 1024 - so there's no practical
 	 * limitation from this.)
 	 */
-	if (shares < 2)
-		shares = 2;
+	if (shares < MIN_SHARES)
+		shares = MIN_SHARES;
 
 	mutex_lock(&shares_mutex);
 	if (tg->shares == shares)
@@ -8259,8 +8683,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	 * w/o tripping rebalance_share or load_balance_fair.
 	 */
 	tg->shares = shares;
-	for_each_possible_cpu(i)
-		set_se_shares(tg->se[i], shares);
+	for_each_possible_cpu(i) {
+		/*
+		 * force a rebalance
+		 */
+		cfs_rq_set_shares(tg->cfs_rq[i], 0);
+		set_se_shares(tg->se[i], shares/nr_cpu_ids);
+	}
 
 	/*
 	 * Enable load balance activity on this group, by inserting it back on
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index b43748efaa7f..b89fec93a237 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -492,10 +492,27 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+static void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+	cfs_rq->task_weight += weight;
+}
+#else
+static inline void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+}
+#endif
+
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_add(&cfs_rq->load, se->load.weight);
+	if (!parent_entity(se))
+		inc_cpu_load(rq_of(cfs_rq), se->load.weight);
+	if (entity_is_task(se))
+		add_cfs_task_weight(cfs_rq, se->load.weight);
 	cfs_rq->nr_running++;
 	se->on_rq = 1;
 }
@@ -504,6 +521,10 @@ static void
 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_sub(&cfs_rq->load, se->load.weight);
+	if (!parent_entity(se))
+		dec_cpu_load(rq_of(cfs_rq), se->load.weight);
+	if (entity_is_task(se))
+		add_cfs_task_weight(cfs_rq, -se->load.weight);
 	cfs_rq->nr_running--;
 	se->on_rq = 0;
 }
@@ -1286,75 +1307,90 @@ static struct task_struct *load_balance_next_fair(void *arg)
 	return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
+static unsigned long
+__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		unsigned long max_load_move, struct sched_domain *sd,
+		enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
+		struct cfs_rq *cfs_rq)
 {
-	struct sched_entity *curr;
-	struct task_struct *p;
-
-	if (!cfs_rq->nr_running || !first_fair(cfs_rq))
-		return MAX_PRIO;
-
-	curr = cfs_rq->curr;
-	if (!curr)
-		curr = __pick_next_entity(cfs_rq);
+	struct rq_iterator cfs_rq_iterator;
 
-	p = task_of(curr);
+	cfs_rq_iterator.start = load_balance_start_fair;
+	cfs_rq_iterator.next = load_balance_next_fair;
+	cfs_rq_iterator.arg = cfs_rq;
 
-	return p->prio;
+	return balance_tasks(this_rq, this_cpu, busiest,
+			max_load_move, sd, idle, all_pinned,
+			this_best_prio, &cfs_rq_iterator);
 }
-#endif
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
 		  struct sched_domain *sd, enum cpu_idle_type idle,
 		  int *all_pinned, int *this_best_prio)
 {
-	struct cfs_rq *busy_cfs_rq;
 	long rem_load_move = max_load_move;
-	struct rq_iterator cfs_rq_iterator;
+	int busiest_cpu = cpu_of(busiest);
+	struct task_group *tg;
 
-	cfs_rq_iterator.start = load_balance_start_fair;
-	cfs_rq_iterator.next = load_balance_next_fair;
-
-	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
-#ifdef CONFIG_FAIR_GROUP_SCHED
-		struct cfs_rq *this_cfs_rq;
+	rcu_read_lock();
+	list_for_each_entry(tg, &task_groups, list) {
 		long imbalance;
-		unsigned long maxload;
+		unsigned long this_weight, busiest_weight;
+		long rem_load, max_load, moved_load;
+
+		/*
+		 * empty group
+		 */
+		if (!aggregate(tg, sd)->task_weight)
+			continue;
+
+		rem_load = rem_load_move * aggregate(tg, sd)->rq_weight;
+		rem_load /= aggregate(tg, sd)->load + 1;
 
-		this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
+		this_weight = tg->cfs_rq[this_cpu]->task_weight;
+		busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
 
-		imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
-		/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
-		if (imbalance <= 0)
+		imbalance = (busiest_weight - this_weight) / 2;
+
+		if (imbalance < 0)
+			imbalance = busiest_weight;
+
+		max_load = max(rem_load, imbalance);
+		moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
+				max_load, sd, idle, all_pinned, this_best_prio,
+				tg->cfs_rq[busiest_cpu]);
+
+		if (!moved_load)
 			continue;
 
-		/* Don't pull more than imbalance/2 */
-		imbalance /= 2;
-		maxload = min(rem_load_move, imbalance);
+		move_group_shares(tg, sd, busiest_cpu, this_cpu);
 
-		*this_best_prio = cfs_rq_best_prio(this_cfs_rq);
-#else
-# define maxload rem_load_move
-#endif
-		/*
-		 * pass busy_cfs_rq argument into
-		 * load_balance_[start|next]_fair iterators
-		 */
-		cfs_rq_iterator.arg = busy_cfs_rq;
-		rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
-				maxload, sd, idle, all_pinned,
-				this_best_prio,
-				&cfs_rq_iterator);
+		moved_load *= aggregate(tg, sd)->load;
+		moved_load /= aggregate(tg, sd)->rq_weight + 1;
 
-		if (rem_load_move <= 0)
+		rem_load_move -= moved_load;
+		if (rem_load_move < 0)
 			break;
 	}
+	rcu_read_unlock();
 
 	return max_load_move - rem_load_move;
 }
+#else
+static unsigned long
+load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		  unsigned long max_load_move,
+		  struct sched_domain *sd, enum cpu_idle_type idle,
+		  int *all_pinned, int *this_best_prio)
+{
+	return __load_balance_fair(this_rq, this_cpu, busiest,
+			max_load_move, sd, idle, all_pinned,
+			this_best_prio, &busiest->cfs);
+}
+#endif
 
 static int
 move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 201a69382a42..736fb8fd8977 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -518,6 +518,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
 	 */
 	for_each_sched_rt_entity(rt_se)
 		enqueue_rt_entity(rt_se);
+
+	inc_cpu_load(rq, p->se.load.weight);
 }
 
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -537,6 +539,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 		if (rt_rq && rt_rq->rt_nr_running)
 			enqueue_rt_entity(rt_se);
 	}
+
+	dec_cpu_load(rq, p->se.load.weight);
 }
 
 /*
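
[Closing illustration, not part of the patch: aggregate_group_load() hands each group its parent's load scaled by shares/rq_weight, which is how the 1/3rd and 1/9th fractions in the kernel/sched.c comment come about. The standalone sketch below replays that arithmetic for the example tree, assuming every entity has the default weight of 1024; the helper and numbers are made up for the example.]

/* load-fraction-demo.c -- standalone sketch, not part of the patch */
#include <stdio.h>

/* mirrors the else-branch of aggregate_group_load() */
static unsigned long group_load(unsigned long parent_load,
				unsigned long shares,
				unsigned long parent_rq_weight)
{
	return parent_load * shares / (parent_rq_weight + 1);
}

int main(void)
{
	/* root runqueue holds group A, thread 1 and group B, each weight 1024 */
	unsigned long root_load = 3 * 1024;
	unsigned long a_load, c_load;

	a_load = group_load(root_load, 1024, 3 * 1024);	/* ~1/3 of the total */
	c_load = group_load(a_load, 1024, 3 * 1024);	/* ~1/9 of the total */

	printf("A ~ %lu, C ~ %lu (out of %lu)\n", a_load, c_load, root_load);
	return 0;
}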