 include/linux/sched.h |   1
 kernel/sched.c        | 497
 kernel/sched_fair.c   | 124
 kernel/sched_rt.c     |   4
 4 files changed, 548 insertions(+), 78 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 11f47249cdd2..0a32059e6ed4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -758,6 +758,7 @@ struct sched_domain {
 	struct sched_domain *child;	/* bottom domain must be null terminated */
 	struct sched_group *groups;	/* the balancing groups of the domain */
 	cpumask_t span;			/* span of all CPUs in this domain */
+	int first_cpu;			/* cache of the first cpu in this domain */
 	unsigned long min_interval;	/* Minimum balance interval ms */
 	unsigned long max_interval;	/* Maximum balance interval ms */
 	unsigned int busy_factor;	/* less balancing by factor if busy */
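
Aside, not part of the patch: first_cpu caches first_cpu(sd->span), a bitmap scan over the domain's cpumask. The kernel/sched.c changes below use it as a stable per-domain index, e.g. tg->cfs_rq[sd->first_cpu], so the scan is not repeated on every balance pass. A minimal illustration, with an invented variable name:

	int anchor = first_cpu(sd->span);		/* scan of the span, done once at domain build time */
	struct cfs_rq *agg = tg->cfs_rq[anchor];	/* per-domain home for the aggregate data */
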
diff --git a/kernel/sched.c b/kernel/sched.c
index 62d7481caca5..ae1a3e936d28 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -316,6 +316,8 @@ static DEFINE_MUTEX(doms_cur_mutex);
 # define INIT_TASK_GROUP_LOAD	NICE_0_LOAD
 #endif
 
+#define MIN_SHARES	2
+
 static int init_task_group_load = INIT_TASK_GROUP_LOAD;
 #endif
 
@@ -403,6 +405,43 @@ struct cfs_rq {
 	 */
 	struct list_head leaf_cfs_rq_list;
 	struct task_group *tg;	/* group that "owns" this runqueue */
+
+#ifdef CONFIG_SMP
+	unsigned long task_weight;
+	unsigned long shares;
+	/*
+	 * We need space to build a sched_domain wide view of the full task
+	 * group tree, in order to avoid depending on dynamic memory allocation
+	 * during the load balancing we place this in the per cpu task group
+	 * hierarchy. This limits the load balancing to one instance per cpu,
+	 * but more should not be needed anyway.
+	 */
+	struct aggregate_struct {
+		/*
+		 *   load = weight(cpus) * f(tg)
+		 *
+		 * Where f(tg) is the recursive weight fraction assigned to
+		 * this group.
+		 */
+		unsigned long load;
+
+		/*
+		 * part of the group weight distributed to this span.
+		 */
+		unsigned long shares;
+
+		/*
+		 * The sum of all runqueue weights within this span.
+		 */
+		unsigned long rq_weight;
+
+		/*
+		 * Weight contributed by tasks; this is the part we can
+		 * influence by moving tasks around.
+		 */
+		unsigned long task_weight;
+	} aggregate;
+#endif
 #endif
 };
 
@@ -1402,11 +1441,390 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
 #endif
 
+static inline void inc_cpu_load(struct rq *rq, unsigned long load)
+{
+	update_load_add(&rq->load, load);
+}
+
+static inline void dec_cpu_load(struct rq *rq, unsigned long load)
+{
+	update_load_sub(&rq->load, load);
+}
+
 #ifdef CONFIG_SMP
 static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
 static unsigned long cpu_avg_load_per_task(int cpu);
 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/*
+ * Group load balancing.
+ *
+ * We calculate a few balance domain wide aggregate numbers; load and weight.
+ * Given the picture below, and assuming each item has equal weight:
+ *
+ *        root          1 - thread
+ *        / | \         A - group
+ *       A  1  B
+ *      /|\   / \
+ *     C 2 D 3   4
+ *     |   |
+ *     5   6
+ *
+ * load:
+ *    A and B get 1/3rd of the total load. C and D get 1/3rd of A's 1/3rd,
+ *    which equals 1/9th of the total load.
+ *
+ * shares:
+ *    The weight of this group on the selected cpus.
+ *
+ * rq_weight:
+ *    Direct sum of all the cpus' rq weights, e.g. A would get 3 while
+ *    B would get 2.
+ *
+ * task_weight:
+ *    Part of the rq_weight contributed by tasks; all groups except B would
+ *    get 1, B gets 2.
+ */
+
+static inline struct aggregate_struct *
+aggregate(struct task_group *tg, struct sched_domain *sd)
+{
+	return &tg->cfs_rq[sd->first_cpu]->aggregate;
+}
+
+typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
+
+/*
+ * Iterate the full tree, calling @down when first entering a node and @up when
+ * leaving it for the final time.
+ */
+static
+void aggregate_walk_tree(aggregate_func down, aggregate_func up,
+			 struct sched_domain *sd)
+{
+	struct task_group *parent, *child;
+
+	rcu_read_lock();
+	parent = &root_task_group;
+down:
+	(*down)(parent, sd);
+	list_for_each_entry_rcu(child, &parent->children, siblings) {
+		parent = child;
+		goto down;
+
+up:
+		continue;
+	}
+	(*up)(parent, sd);
+
+	child = parent;
+	parent = parent->parent;
+	if (parent)
+		goto up;
+	rcu_read_unlock();
+}
+
+/*
+ * Calculate the aggregate runqueue weight.
+ */
+static
+void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
+{
+	unsigned long rq_weight = 0;
+	unsigned long task_weight = 0;
+	int i;
+
+	for_each_cpu_mask(i, sd->span) {
+		rq_weight += tg->cfs_rq[i]->load.weight;
+		task_weight += tg->cfs_rq[i]->task_weight;
+	}
+
+	aggregate(tg, sd)->rq_weight = rq_weight;
+	aggregate(tg, sd)->task_weight = task_weight;
+}
+
+/*
+ * Redistribute tg->shares amongst all tg->cfs_rq[]s.
+ */
+static void __aggregate_redistribute_shares(struct task_group *tg)
+{
+	int i, max_cpu = smp_processor_id();
+	unsigned long rq_weight = 0;
+	unsigned long shares, max_shares = 0, shares_rem = tg->shares;
+
+	for_each_possible_cpu(i)
+		rq_weight += tg->cfs_rq[i]->load.weight;
+
+	for_each_possible_cpu(i) {
+		/*
+		 * divide shares proportional to the rq_weights.
+		 */
+		shares = tg->shares * tg->cfs_rq[i]->load.weight;
+		shares /= rq_weight + 1;
+
+		tg->cfs_rq[i]->shares = shares;
+
+		if (shares > max_shares) {
+			max_shares = shares;
+			max_cpu = i;
+		}
+		shares_rem -= shares;
+	}
+
+	/*
+	 * Ensure it all adds up to tg->shares; we can lose a few
+	 * due to rounding down when computing the per-cpu shares.
+	 */
+	if (shares_rem)
+		tg->cfs_rq[max_cpu]->shares += shares_rem;
+}
+
+/*
+ * Compute the weight of this group on the given cpus.
+ */
+static
+void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
+{
+	unsigned long shares = 0;
+	int i;
+
+again:
+	for_each_cpu_mask(i, sd->span)
+		shares += tg->cfs_rq[i]->shares;
+
+	/*
+	 * When the span doesn't have any shares assigned, but does have
+	 * tasks to run, do a machine wide rebalance (should be rare).
+	 */
+	if (unlikely(!shares && aggregate(tg, sd)->rq_weight)) {
+		__aggregate_redistribute_shares(tg);
+		goto again;
+	}
+
+	aggregate(tg, sd)->shares = shares;
+}
+
+/*
+ * Compute the load fraction assigned to this group; relies on the aggregate
+ * weight and this group's parent's load, i.e. it is computed top-down.
+ */
+static
+void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
+{
+	unsigned long load;
+
+	if (!tg->parent) {
+		int i;
+
+		load = 0;
+		for_each_cpu_mask(i, sd->span)
+			load += cpu_rq(i)->load.weight;
+
+	} else {
+		load = aggregate(tg->parent, sd)->load;
+
+		/*
+		 * shares is our weight in the parent's rq, so
+		 * shares/parent->rq_weight gives our fraction of the load
+		 */
+		load *= aggregate(tg, sd)->shares;
+		load /= aggregate(tg->parent, sd)->rq_weight + 1;
+	}
+
+	aggregate(tg, sd)->load = load;
+}
+
+static void __set_se_shares(struct sched_entity *se, unsigned long shares);
+
+/*
+ * Calculate and set the cpu's group shares.
+ */
+static void
+__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
+			  int tcpu)
+{
+	int boost = 0;
+	unsigned long shares;
+	unsigned long rq_weight;
+
+	if (!tg->se[tcpu])
+		return;
+
+	rq_weight = tg->cfs_rq[tcpu]->load.weight;
+
+	/*
+	 * If there are currently no tasks on the cpu, pretend there is one of
+	 * average load so that when a new task gets to run here it will not
+	 * get delayed by group starvation.
+	 */
+	if (!rq_weight) {
+		boost = 1;
+		rq_weight = NICE_0_LOAD;
+	}
+
+	/*
+	 *           \Sum shares * rq_weight
+	 * shares =  -----------------------
+	 *               \Sum rq_weight
+	 *
+	 */
+	shares = aggregate(tg, sd)->shares * rq_weight;
+	shares /= aggregate(tg, sd)->rq_weight + 1;
+
+	/*
+	 * record the actual number of shares, not the boosted amount.
+	 */
+	tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
+
+	if (shares < MIN_SHARES)
+		shares = MIN_SHARES;
+
+	__set_se_shares(tg->se[tcpu], shares);
+}
+
+/*
+ * Re-adjust the weights on the cpu the task came from and on the cpu the
+ * task went to.
+ */
+static void
+__move_group_shares(struct task_group *tg, struct sched_domain *sd,
+		    int scpu, int dcpu)
+{
+	unsigned long shares;
+
+	shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
+
+	__update_group_shares_cpu(tg, sd, scpu);
+	__update_group_shares_cpu(tg, sd, dcpu);
+
+	/*
+	 * ensure we never lose shares due to rounding errors in the
+	 * above redistribution.
+	 */
+	shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
+	if (shares)
+		tg->cfs_rq[dcpu]->shares += shares;
+}
+
+/*
+ * Because changing a group's shares changes the weight of the super-group
+ * we need to walk up the tree and change all shares until we hit the root.
+ */
+static void
+move_group_shares(struct task_group *tg, struct sched_domain *sd,
+		  int scpu, int dcpu)
+{
+	while (tg) {
+		__move_group_shares(tg, sd, scpu, dcpu);
+		tg = tg->parent;
+	}
+}
+
+static
+void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
+{
+	unsigned long shares = aggregate(tg, sd)->shares;
+	int i;
+
+	for_each_cpu_mask(i, sd->span) {
+		struct rq *rq = cpu_rq(i);
+		unsigned long flags;
+
+		spin_lock_irqsave(&rq->lock, flags);
+		__update_group_shares_cpu(tg, sd, i);
+		spin_unlock_irqrestore(&rq->lock, flags);
+	}
+
+	aggregate_group_shares(tg, sd);
+
+	/*
+	 * ensure we never lose shares due to rounding errors in the
+	 * above redistribution.
+	 */
+	shares -= aggregate(tg, sd)->shares;
+	if (shares) {
+		tg->cfs_rq[sd->first_cpu]->shares += shares;
+		aggregate(tg, sd)->shares += shares;
+	}
+}
+
+/*
+ * Calculate the cumulative weight and recursive load of each task group
+ * while walking down the tree.
+ */
+static
+void aggregate_get_down(struct task_group *tg, struct sched_domain *sd)
+{
+	aggregate_group_weight(tg, sd);
+	aggregate_group_shares(tg, sd);
+	aggregate_group_load(tg, sd);
+}
+
+/*
+ * Rebalance the cpu shares while walking back up the tree.
+ */
+static
+void aggregate_get_up(struct task_group *tg, struct sched_domain *sd)
+{
+	aggregate_group_set_shares(tg, sd);
+}
+
+static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
+
+static void __init init_aggregate(void)
+{
+	int i;
+
+	for_each_possible_cpu(i)
+		spin_lock_init(&per_cpu(aggregate_lock, i));
+}
+
+static int get_aggregate(struct sched_domain *sd)
+{
+	if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu)))
+		return 0;
+
+	aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd);
+	return 1;
+}
+
+static void put_aggregate(struct sched_domain *sd)
+{
+	spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu));
+}
+
+static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+{
+	cfs_rq->shares = shares;
+}
+
+#else
+
+static inline void init_aggregate(void)
+{
+}
+
+static inline int get_aggregate(struct sched_domain *sd)
+{
+	return 0;
+}
+
+static inline void put_aggregate(struct sched_domain *sd)
+{
+}
+#endif
+
+#else /* CONFIG_SMP */
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+{
+}
+#endif
+
 #endif /* CONFIG_SMP */
 
 #include "sched_stats.h"
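
To make the share computation above concrete: each cpu in the span receives the group's shares scaled by that cpu's runqueue weight over the span's total runqueue weight, the "+ 1" in the divisor guards against a zero sum, and NICE_0_LOAD stands in for an idle cpu's weight so a newly arriving task is not starved. A stand-alone sketch of the same arithmetic (illustrative only, not kernel code; the numbers are invented):

	#include <stdio.h>

	#define NICE_0_LOAD	1024UL

	int main(void)
	{
		unsigned long tg_shares = 2048;				/* the group's total weight */
		unsigned long rq_weight[] = { 3072, 1024, 0, 2048 };	/* per-cpu cfs_rq weight */
		unsigned long sum = 0, shares, w;
		int i;

		for (i = 0; i < 4; i++)
			sum += rq_weight[i];

		for (i = 0; i < 4; i++) {
			/* an idle cpu is boosted to one average task's worth of weight */
			w = rq_weight[i] ? rq_weight[i] : NICE_0_LOAD;
			shares = tg_shares * w / (sum + 1);
			printf("cpu%d: %lu shares\n", i, shares);
		}
		return 0;
	}

Applied to the example tree in the comment above, B's two equally loaded runqueues would each end up with roughly half of B's shares.
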
@@ -1419,26 +1837,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 
 #define sched_class_highest (&rt_sched_class)
 
-static inline void inc_load(struct rq *rq, const struct task_struct *p)
-{
-	update_load_add(&rq->load, p->se.load.weight);
-}
-
-static inline void dec_load(struct rq *rq, const struct task_struct *p)
-{
-	update_load_sub(&rq->load, p->se.load.weight);
-}
-
-static void inc_nr_running(struct task_struct *p, struct rq *rq)
+static void inc_nr_running(struct rq *rq)
 {
 	rq->nr_running++;
-	inc_load(rq, p);
 }
 
-static void dec_nr_running(struct task_struct *p, struct rq *rq)
+static void dec_nr_running(struct rq *rq)
 {
 	rq->nr_running--;
-	dec_load(rq, p);
 }
 
 static void set_load_weight(struct task_struct *p)
@@ -1530,7 +1936,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
 		rq->nr_uninterruptible--;
 
 	enqueue_task(rq, p, wakeup);
-	inc_nr_running(p, rq);
+	inc_nr_running(rq);
 }
 
 /*
@@ -1542,7 +1948,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
 		rq->nr_uninterruptible++;
 
 	dequeue_task(rq, p, sleep);
-	dec_nr_running(p, rq);
+	dec_nr_running(rq);
 }
 
 /**
@@ -2194,7 +2600,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		 * management (if any):
 		 */
 		p->sched_class->task_new(rq, p);
-		inc_nr_running(p, rq);
+		inc_nr_running(rq);
 	}
 	check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
@@ -3185,9 +3591,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	unsigned long imbalance;
 	struct rq *busiest;
 	unsigned long flags;
+	int unlock_aggregate;
 
 	cpus_setall(*cpus);
 
+	unlock_aggregate = get_aggregate(sd);
+
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
 	 * sibling can pick up load irrespective of busy siblings. In this case,
@@ -3303,8 +3712,9 @@ redo:
 
 	if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
 	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		return -1;
-	return ld_moved;
+		ld_moved = -1;
+
+	goto out;
 
 out_balanced:
 	schedstat_inc(sd, lb_balanced[idle]);
@@ -3319,8 +3729,13 @@ out_one_pinned:
 
 	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
 	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		return -1;
-	return 0;
+		ld_moved = -1;
+	else
+		ld_moved = 0;
+out:
+	if (unlock_aggregate)
+		put_aggregate(sd);
+	return ld_moved;
 }
 
 /*
@@ -4535,10 +4950,8 @@ void set_user_nice(struct task_struct *p, long nice)
 		goto out_unlock;
 	}
 	on_rq = p->se.on_rq;
-	if (on_rq) {
+	if (on_rq)
 		dequeue_task(rq, p, 0);
-		dec_load(rq, p);
-	}
 
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
@@ -4548,7 +4961,6 @@ void set_user_nice(struct task_struct *p, long nice)
 
 	if (on_rq) {
 		enqueue_task(rq, p, 0);
-		inc_load(rq, p);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -6921,6 +7333,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 			SD_INIT(sd, ALLNODES);
 			set_domain_attribute(sd, attr);
 			sd->span = *cpu_map;
+			sd->first_cpu = first_cpu(sd->span);
 			cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
 			p = sd;
 			sd_allnodes = 1;
@@ -6931,6 +7344,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, NODE);
 		set_domain_attribute(sd, attr);
 		sched_domain_node_span(cpu_to_node(i), &sd->span);
+		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -6942,6 +7356,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, CPU);
 		set_domain_attribute(sd, attr);
 		sd->span = *nodemask;
+		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -6953,6 +7368,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, MC);
 		set_domain_attribute(sd, attr);
 		sd->span = cpu_coregroup_map(i);
+		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
@@ -6965,6 +7381,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, SIBLING);
 		set_domain_attribute(sd, attr);
 		sd->span = per_cpu(cpu_sibling_map, i);
+		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
@@ -7633,6 +8050,7 @@ void __init sched_init(void)
 	}
 
 #ifdef CONFIG_SMP
+	init_aggregate();
 	init_defrootdomain();
 #endif
 
@@ -8199,14 +8617,11 @@ void sched_move_task(struct task_struct *tsk)
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void set_se_shares(struct sched_entity *se, unsigned long shares)
+static void __set_se_shares(struct sched_entity *se, unsigned long shares)
 {
 	struct cfs_rq *cfs_rq = se->cfs_rq;
-	struct rq *rq = cfs_rq->rq;
 	int on_rq;
 
-	spin_lock_irq(&rq->lock);
-
 	on_rq = se->on_rq;
 	if (on_rq)
 		dequeue_entity(cfs_rq, se, 0);
@@ -8216,8 +8631,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
 
 	if (on_rq)
 		enqueue_entity(cfs_rq, se, 0);
+}
 
-	spin_unlock_irq(&rq->lock);
+static void set_se_shares(struct sched_entity *se, unsigned long shares)
+{
+	struct cfs_rq *cfs_rq = se->cfs_rq;
+	struct rq *rq = cfs_rq->rq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rq->lock, flags);
+	__set_se_shares(se, shares);
+	spin_unlock_irqrestore(&rq->lock, flags);
 }
 
 static DEFINE_MUTEX(shares_mutex);
@@ -8238,8 +8662,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	 * (The default weight is 1024 - so there's no practical
 	 * limitation from this.)
 	 */
-	if (shares < 2)
-		shares = 2;
+	if (shares < MIN_SHARES)
+		shares = MIN_SHARES;
 
 	mutex_lock(&shares_mutex);
 	if (tg->shares == shares)
@@ -8259,8 +8683,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	 * w/o tripping rebalance_share or load_balance_fair.
 	 */
 	tg->shares = shares;
-	for_each_possible_cpu(i)
-		set_se_shares(tg->se[i], shares);
+	for_each_possible_cpu(i) {
+		/*
+		 * force a rebalance
+		 */
+		cfs_rq_set_shares(tg->cfs_rq[i], 0);
+		set_se_shares(tg->se[i], shares/nr_cpu_ids);
+	}
 
 	/*
 	 * Enable load balance activity on this group, by inserting it back on
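
Two patterns in the hunks above are worth spelling out. sched_group_set_shares() now only seeds each cpu with an equal slice (shares/nr_cpu_ids) and zeroes cfs_rq->shares; the next balance pass, via aggregate_group_shares() and __aggregate_redistribute_shares(), redistributes the weight toward the cpus where the group's tasks actually run. And the set_se_shares()/__set_se_shares() split follows the usual locked/unlocked wrapper convention: the double-underscore variant assumes the caller already holds rq->lock, the plain one takes it itself. A generic sketch of that convention (illustrative only, plain pthreads rather than kernel locking):

	#include <pthread.h>

	static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
	static unsigned long value;

	/* __set_value(): caller must already hold 'lock' */
	static void __set_value(unsigned long v)
	{
		value = v;
	}

	/* set_value(): stand-alone entry point, takes the lock itself */
	static void set_value(unsigned long v)
	{
		pthread_mutex_lock(&lock);
		__set_value(v);
		pthread_mutex_unlock(&lock);
	}

	int main(void)
	{
		set_value(42);
		return 0;
	}
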
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index b43748efaa7f..b89fec93a237 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -492,10 +492,27 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+static void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+	cfs_rq->task_weight += weight;
+}
+#else
+static inline void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+}
+#endif
+
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_add(&cfs_rq->load, se->load.weight);
+	if (!parent_entity(se))
+		inc_cpu_load(rq_of(cfs_rq), se->load.weight);
+	if (entity_is_task(se))
+		add_cfs_task_weight(cfs_rq, se->load.weight);
 	cfs_rq->nr_running++;
 	se->on_rq = 1;
 }
@@ -504,6 +521,10 @@ static void
 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_sub(&cfs_rq->load, se->load.weight);
+	if (!parent_entity(se))
+		dec_cpu_load(rq_of(cfs_rq), se->load.weight);
+	if (entity_is_task(se))
+		add_cfs_task_weight(cfs_rq, -se->load.weight);
 	cfs_rq->nr_running--;
 	se->on_rq = 0;
 }
@@ -1286,75 +1307,90 @@ static struct task_struct *load_balance_next_fair(void *arg)
 	return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
+static unsigned long
+__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		unsigned long max_load_move, struct sched_domain *sd,
+		enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
+		struct cfs_rq *cfs_rq)
 {
-	struct sched_entity *curr;
-	struct task_struct *p;
-
-	if (!cfs_rq->nr_running || !first_fair(cfs_rq))
-		return MAX_PRIO;
-
-	curr = cfs_rq->curr;
-	if (!curr)
-		curr = __pick_next_entity(cfs_rq);
+	struct rq_iterator cfs_rq_iterator;
 
-	p = task_of(curr);
+	cfs_rq_iterator.start = load_balance_start_fair;
+	cfs_rq_iterator.next = load_balance_next_fair;
+	cfs_rq_iterator.arg = cfs_rq;
 
-	return p->prio;
+	return balance_tasks(this_rq, this_cpu, busiest,
+			max_load_move, sd, idle, all_pinned,
+			this_best_prio, &cfs_rq_iterator);
 }
-#endif
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		unsigned long max_load_move,
 		struct sched_domain *sd, enum cpu_idle_type idle,
 		int *all_pinned, int *this_best_prio)
 {
-	struct cfs_rq *busy_cfs_rq;
 	long rem_load_move = max_load_move;
-	struct rq_iterator cfs_rq_iterator;
+	int busiest_cpu = cpu_of(busiest);
+	struct task_group *tg;
 
-	cfs_rq_iterator.start = load_balance_start_fair;
-	cfs_rq_iterator.next = load_balance_next_fair;
-
-	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
-#ifdef CONFIG_FAIR_GROUP_SCHED
-		struct cfs_rq *this_cfs_rq;
+	rcu_read_lock();
+	list_for_each_entry(tg, &task_groups, list) {
 		long imbalance;
-		unsigned long maxload;
+		unsigned long this_weight, busiest_weight;
+		long rem_load, max_load, moved_load;
+
+		/*
+		 * empty group
+		 */
+		if (!aggregate(tg, sd)->task_weight)
+			continue;
+
+		rem_load = rem_load_move * aggregate(tg, sd)->rq_weight;
+		rem_load /= aggregate(tg, sd)->load + 1;
 
-		this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
+		this_weight = tg->cfs_rq[this_cpu]->task_weight;
+		busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
 
-		imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
-		/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
-		if (imbalance <= 0)
+		imbalance = (busiest_weight - this_weight) / 2;
+
+		if (imbalance < 0)
+			imbalance = busiest_weight;
+
+		max_load = max(rem_load, imbalance);
+		moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
+				max_load, sd, idle, all_pinned, this_best_prio,
+				tg->cfs_rq[busiest_cpu]);
+
+		if (!moved_load)
 			continue;
 
-		/* Don't pull more than imbalance/2 */
-		imbalance /= 2;
-		maxload = min(rem_load_move, imbalance);
+		move_group_shares(tg, sd, busiest_cpu, this_cpu);
 
-		*this_best_prio = cfs_rq_best_prio(this_cfs_rq);
-#else
-# define maxload rem_load_move
-#endif
-		/*
-		 * pass busy_cfs_rq argument into
-		 * load_balance_[start|next]_fair iterators
-		 */
-		cfs_rq_iterator.arg = busy_cfs_rq;
-		rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
-				maxload, sd, idle, all_pinned,
-				this_best_prio,
-				&cfs_rq_iterator);
+		moved_load *= aggregate(tg, sd)->load;
+		moved_load /= aggregate(tg, sd)->rq_weight + 1;
 
-		if (rem_load_move <= 0)
+		rem_load_move -= moved_load;
+		if (rem_load_move < 0)
 			break;
 	}
+	rcu_read_unlock();
 
 	return max_load_move - rem_load_move;
 }
+#else
+static unsigned long
+load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		unsigned long max_load_move,
+		struct sched_domain *sd, enum cpu_idle_type idle,
+		int *all_pinned, int *this_best_prio)
+{
+	return __load_balance_fair(this_rq, this_cpu, busiest,
+			max_load_move, sd, idle, all_pinned,
+			this_best_prio, &busiest->cfs);
+}
+#endif
 
 static int
 move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
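
A note on units in load_balance_fair() above: max_load_move and rem_load_move are expressed in root-level ("global") load, while balance_tasks() run on a group runqueue moves group-local weight. rem_load is therefore scaled up by rq_weight/load before the move, and the amount actually moved is scaled back down by load/rq_weight before being charged against rem_load_move. A stand-alone sketch of that round trip (illustrative only, numbers invented):

	#include <stdio.h>

	int main(void)
	{
		unsigned long rem_load_move = 512;	/* still to move, in global load units */
		unsigned long grp_rq_weight = 3072;	/* sum of the group's per-cpu rq weights */
		unsigned long grp_load = 256;		/* the group's slice of the global load */

		/* global load units -> group-local weight units */
		unsigned long rem_load = rem_load_move * grp_rq_weight / (grp_load + 1);

		/* pretend the balance pass managed to move half of that */
		unsigned long moved_weight = rem_load / 2;

		/* group-local weight units -> global load units */
		unsigned long moved_load = moved_weight * grp_load / (grp_rq_weight + 1);

		printf("rem_load=%lu, charged back as %lu global load\n",
		       rem_load, moved_load);
		return 0;
	}
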
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 201a69382a42..736fb8fd8977 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -518,6 +518,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
 	 */
 	for_each_sched_rt_entity(rt_se)
 		enqueue_rt_entity(rt_se);
+
+	inc_cpu_load(rq, p->se.load.weight);
 }
 
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -537,6 +539,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 		if (rt_rq && rt_rq->rt_nr_running)
 			enqueue_rt_entity(rt_se);
 	}
+
+	dec_cpu_load(rq, p->se.load.weight);
 }
 
 /*
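
Taken together, the sched_rt.c and sched_fair.c hunks move rq->load accounting out of inc_nr_running()/dec_nr_running() and into the scheduling classes: the fair class charges rq->load only for entities without a parent, so a task inside a group contributes through its group entity's weight rather than twice, and the RT class charges the task's weight directly on enqueue/dequeue. A toy illustration of the double counting this avoids (illustrative only, invented types and numbers):

	#include <stdio.h>

	struct entity { unsigned long weight; struct entity *parent; };

	/* charge the cpu-level load only for top-level entities */
	static void account_enqueue(unsigned long *rq_load, struct entity *se)
	{
		if (!se->parent)
			*rq_load += se->weight;
	}

	int main(void)
	{
		unsigned long rq_load = 0;
		struct entity group = { .weight = 2048, .parent = NULL };
		struct entity task  = { .weight = 1024, .parent = &group };

		account_enqueue(&rq_load, &group);	/* group entity carries the task's share */
		account_enqueue(&rq_load, &task);	/* skipped: would double count */

		printf("rq load = %lu\n", rq_load);	/* 2048, not 3072 */
		return 0;
	}
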