 include/linux/sched.h |   1
 kernel/sched.c        | 497
 kernel/sched_fair.c   | 124
 kernel/sched_rt.c     |   4
 4 files changed, 548 insertions(+), 78 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 11f47249cdd2..0a32059e6ed4 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -758,6 +758,7 @@ struct sched_domain {
 	struct sched_domain *child;	/* bottom domain must be null terminated */
 	struct sched_group *groups;	/* the balancing groups of the domain */
 	cpumask_t span;			/* span of all CPUs in this domain */
+	int first_cpu;			/* cache of the first cpu in this domain */
 	unsigned long min_interval;	/* Minimum balance interval ms */
 	unsigned long max_interval;	/* Maximum balance interval ms */
 	unsigned int busy_factor;	/* less balancing by factor if busy */
diff --git a/kernel/sched.c b/kernel/sched.c
index 62d7481caca5..ae1a3e936d28 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -316,6 +316,8 @@ static DEFINE_MUTEX(doms_cur_mutex);
 # define INIT_TASK_GROUP_LOAD	NICE_0_LOAD
 #endif
 
+#define MIN_SHARES	2
+
 static int init_task_group_load = INIT_TASK_GROUP_LOAD;
 #endif
 
@@ -403,6 +405,43 @@ struct cfs_rq {
 	 */
 	struct list_head leaf_cfs_rq_list;
 	struct task_group *tg;	/* group that "owns" this runqueue */
+
+#ifdef CONFIG_SMP
+	unsigned long task_weight;
+	unsigned long shares;
+	/*
+	 * We need space to build a sched_domain wide view of the full task
+	 * group tree; to avoid depending on dynamic memory allocation during
+	 * load balancing we place this in the per cpu task group hierarchy.
+	 * This limits the load balancing to one instance per cpu, but more
+	 * should not be needed anyway.
+	 */
+	struct aggregate_struct {
+		/*
+		 *   load = weight(cpus) * f(tg)
+		 *
+		 * Where f(tg) is the recursive weight fraction assigned to
+		 * this group.
+		 */
+		unsigned long load;
+
+		/*
+		 * part of the group weight distributed to this span.
+		 */
+		unsigned long shares;
+
+		/*
+		 * The sum of all runqueue weights within this span.
+		 */
+		unsigned long rq_weight;
+
+		/*
+		 * Weight contributed by tasks; this is the part we can
+		 * influence by moving tasks around.
+		 */
+		unsigned long task_weight;
+	} aggregate;
+#endif
 #endif
 };
 
@@ -1402,11 +1441,390 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
 #endif
 
+static inline void inc_cpu_load(struct rq *rq, unsigned long load)
+{
+	update_load_add(&rq->load, load);
+}
+
+static inline void dec_cpu_load(struct rq *rq, unsigned long load)
+{
+	update_load_sub(&rq->load, load);
+}
+
 #ifdef CONFIG_SMP
 static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
 static unsigned long cpu_avg_load_per_task(int cpu);
 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/*
+ * Group load balancing.
+ *
+ * We calculate a few balance domain wide aggregate numbers; load and weight.
+ * Given the pictures below, and assuming each item has equal weight:
+ *
+ *         root          1 - thread
+ *        / | \          A - group
+ *       A  1  B
+ *      /|\   / \
+ *     C 2 D 3   4
+ *     |   |
+ *     5   6
+ *
+ * load:
+ *    A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
+ *    which equals 1/9-th of the total load.
+ *
+ * shares:
+ *    The weight of this group on the selected cpus.
+ *
+ * rq_weight:
+ *    Direct sum of all the cpus' rq weights, e.g. A would get 3 while
+ *    B would get 2.
+ *
+ * task_weight:
+ *    Part of the rq_weight contributed by tasks; all groups except B would
+ *    get 1, B gets 2.
+ */
+
+static inline struct aggregate_struct *
+aggregate(struct task_group *tg, struct sched_domain *sd)
+{
+	return &tg->cfs_rq[sd->first_cpu]->aggregate;
+}
+
+typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
+
+/*
+ * Iterate the full tree, calling @down when first entering a node and @up when
+ * leaving it for the final time.
+ */
+static
+void aggregate_walk_tree(aggregate_func down, aggregate_func up,
+			 struct sched_domain *sd)
+{
+	struct task_group *parent, *child;
+
+	rcu_read_lock();
+	parent = &root_task_group;
+down:
+	(*down)(parent, sd);
+	list_for_each_entry_rcu(child, &parent->children, siblings) {
+		parent = child;
+		goto down;
+
+up:
+		continue;
+	}
+	(*up)(parent, sd);
+
+	child = parent;
+	parent = parent->parent;
+	if (parent)
+		goto up;
+	rcu_read_unlock();
+}
+
+/*
+ * Calculate the aggregate runqueue weight.
+ */
+static
+void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
+{
+	unsigned long rq_weight = 0;
+	unsigned long task_weight = 0;
+	int i;
+
+	for_each_cpu_mask(i, sd->span) {
+		rq_weight += tg->cfs_rq[i]->load.weight;
+		task_weight += tg->cfs_rq[i]->task_weight;
+	}
+
+	aggregate(tg, sd)->rq_weight = rq_weight;
+	aggregate(tg, sd)->task_weight = task_weight;
+}
+
+/*
+ * Redistribute tg->shares amongst all tg->cfs_rq[]s.
+ */
+static void __aggregate_redistribute_shares(struct task_group *tg)
+{
+	int i, max_cpu = smp_processor_id();
+	unsigned long rq_weight = 0;
+	unsigned long shares, max_shares = 0, shares_rem = tg->shares;
+
+	for_each_possible_cpu(i)
+		rq_weight += tg->cfs_rq[i]->load.weight;
+
+	for_each_possible_cpu(i) {
+		/*
+		 * divide shares proportional to the rq_weights.
+		 */
+		shares = tg->shares * tg->cfs_rq[i]->load.weight;
+		shares /= rq_weight + 1;
+
+		tg->cfs_rq[i]->shares = shares;
+
+		if (shares > max_shares) {
+			max_shares = shares;
+			max_cpu = i;
+		}
+		shares_rem -= shares;
+	}
+
+	/*
+	 * Ensure it all adds up to tg->shares; we can lose a few
+	 * due to rounding down when computing the per-cpu shares.
+	 */
+	if (shares_rem)
+		tg->cfs_rq[max_cpu]->shares += shares_rem;
+}
+
+/*
+ * Compute the weight of this group on the given cpus.
+ */
+static
+void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
+{
+	unsigned long shares = 0;
+	int i;
+
+again:
+	for_each_cpu_mask(i, sd->span)
+		shares += tg->cfs_rq[i]->shares;
+
+	/*
+	 * When the span doesn't have any shares assigned, but does have
+	 * tasks to run do a machine wide rebalance (should be rare).
+	 */
+	if (unlikely(!shares && aggregate(tg, sd)->rq_weight)) {
+		__aggregate_redistribute_shares(tg);
+		goto again;
+	}
+
+	aggregate(tg, sd)->shares = shares;
+}
+
+/*
+ * Compute the load fraction assigned to this group, relies on the aggregate
+ * weight and this group's parent's load, i.e. top-down.
+ */
+static
+void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
+{
+	unsigned long load;
+
+	if (!tg->parent) {
+		int i;
+
+		load = 0;
+		for_each_cpu_mask(i, sd->span)
+			load += cpu_rq(i)->load.weight;
+
+	} else {
+		load = aggregate(tg->parent, sd)->load;
+
+		/*
+		 * shares is our weight in the parent's rq so
+		 * shares/parent->rq_weight gives our fraction of the load
+		 */
+		load *= aggregate(tg, sd)->shares;
+		load /= aggregate(tg->parent, sd)->rq_weight + 1;
+	}
+
+	aggregate(tg, sd)->load = load;
+}
+
+static void __set_se_shares(struct sched_entity *se, unsigned long shares);
+
+/*
+ * Calculate and set the cpu's group shares.
+ */
+static void
+__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
+			  int tcpu)
+{
+	int boost = 0;
+	unsigned long shares;
+	unsigned long rq_weight;
+
+	if (!tg->se[tcpu])
+		return;
+
+	rq_weight = tg->cfs_rq[tcpu]->load.weight;
+
+	/*
+	 * If there are currently no tasks on the cpu pretend there is one of
+	 * average load so that when a new task gets to run here it will not
+	 * get delayed by group starvation.
+	 */
+	if (!rq_weight) {
+		boost = 1;
+		rq_weight = NICE_0_LOAD;
+	}
+
+	/*
+	 *           \Sum shares * rq_weight
+	 * shares =  -----------------------
+	 *               \Sum rq_weight
+	 *
+	 */
+	shares = aggregate(tg, sd)->shares * rq_weight;
+	shares /= aggregate(tg, sd)->rq_weight + 1;
+
+	/*
+	 * record the actual number of shares, not the boosted amount.
+	 */
+	tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
+
+	if (shares < MIN_SHARES)
+		shares = MIN_SHARES;
+
+	__set_se_shares(tg->se[tcpu], shares);
+}
+
+/*
+ * Re-adjust the weights on the cpu the task came from and on the cpu the
+ * task went to.
+ */
+static void
+__move_group_shares(struct task_group *tg, struct sched_domain *sd,
+		    int scpu, int dcpu)
+{
+	unsigned long shares;
+
+	shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
+
+	__update_group_shares_cpu(tg, sd, scpu);
+	__update_group_shares_cpu(tg, sd, dcpu);
+
+	/*
+	 * ensure we never lose shares due to rounding errors in the
+	 * above redistribution.
+	 */
+	shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
+	if (shares)
+		tg->cfs_rq[dcpu]->shares += shares;
+}
+
+/*
+ * Because changing a group's shares changes the weight of the super-group
+ * we need to walk up the tree and change all shares until we hit the root.
+ */
+static void
+move_group_shares(struct task_group *tg, struct sched_domain *sd,
+		  int scpu, int dcpu)
+{
+	while (tg) {
+		__move_group_shares(tg, sd, scpu, dcpu);
+		tg = tg->parent;
+	}
+}
+
+static
+void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
+{
+	unsigned long shares = aggregate(tg, sd)->shares;
+	int i;
+
+	for_each_cpu_mask(i, sd->span) {
+		struct rq *rq = cpu_rq(i);
+		unsigned long flags;
+
+		spin_lock_irqsave(&rq->lock, flags);
+		__update_group_shares_cpu(tg, sd, i);
+		spin_unlock_irqrestore(&rq->lock, flags);
+	}
+
+	aggregate_group_shares(tg, sd);
+
+	/*
+	 * ensure we never lose shares due to rounding errors in the
+	 * above redistribution.
+	 */
+	shares -= aggregate(tg, sd)->shares;
+	if (shares) {
+		tg->cfs_rq[sd->first_cpu]->shares += shares;
+		aggregate(tg, sd)->shares += shares;
+	}
+}
+
+/*
+ * Calculate the accumulative weight and recursive load of each task group
+ * while walking down the tree.
+ */
+static
+void aggregate_get_down(struct task_group *tg, struct sched_domain *sd)
+{
+	aggregate_group_weight(tg, sd);
+	aggregate_group_shares(tg, sd);
+	aggregate_group_load(tg, sd);
+}
+
+/*
+ * Rebalance the cpu shares while walking back up the tree.
+ */
+static
+void aggregate_get_up(struct task_group *tg, struct sched_domain *sd)
+{
+	aggregate_group_set_shares(tg, sd);
+}
+
+static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
+
+static void __init init_aggregate(void)
+{
+	int i;
+
+	for_each_possible_cpu(i)
+		spin_lock_init(&per_cpu(aggregate_lock, i));
+}
+
+static int get_aggregate(struct sched_domain *sd)
+{
+	if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu)))
+		return 0;
+
+	aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd);
+	return 1;
+}
+
+static void put_aggregate(struct sched_domain *sd)
+{
+	spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu));
+}
+
+static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+{
+	cfs_rq->shares = shares;
+}
+
+#else
+
+static inline void init_aggregate(void)
+{
+}
+
+static inline int get_aggregate(struct sched_domain *sd)
+{
+	return 0;
+}
+
+static inline void put_aggregate(struct sched_domain *sd)
+{
+}
+#endif
+
+#else /* CONFIG_SMP */
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+{
+}
+#endif
+
 #endif /* CONFIG_SMP */
 
 #include "sched_stats.h"
@@ -1419,26 +1837,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 
 #define sched_class_highest	(&rt_sched_class)
 
-static inline void inc_load(struct rq *rq, const struct task_struct *p)
-{
-	update_load_add(&rq->load, p->se.load.weight);
-}
-
-static inline void dec_load(struct rq *rq, const struct task_struct *p)
-{
-	update_load_sub(&rq->load, p->se.load.weight);
-}
-
-static void inc_nr_running(struct task_struct *p, struct rq *rq)
+static void inc_nr_running(struct rq *rq)
 {
 	rq->nr_running++;
-	inc_load(rq, p);
 }
 
-static void dec_nr_running(struct task_struct *p, struct rq *rq)
+static void dec_nr_running(struct rq *rq)
 {
 	rq->nr_running--;
-	dec_load(rq, p);
 }
 
 static void set_load_weight(struct task_struct *p)
@@ -1530,7 +1936,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
 		rq->nr_uninterruptible--;
 
 	enqueue_task(rq, p, wakeup);
-	inc_nr_running(p, rq);
+	inc_nr_running(rq);
 }
 
 /*
@@ -1542,7 +1948,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
 		rq->nr_uninterruptible++;
 
 	dequeue_task(rq, p, sleep);
-	dec_nr_running(p, rq);
+	dec_nr_running(rq);
 }
 
 /**
@@ -2194,7 +2600,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		 * management (if any):
 		 */
 		p->sched_class->task_new(rq, p);
-		inc_nr_running(p, rq);
+		inc_nr_running(rq);
 	}
 	check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
@@ -3185,9 +3591,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	unsigned long imbalance;
 	struct rq *busiest;
 	unsigned long flags;
+	int unlock_aggregate;
 
 	cpus_setall(*cpus);
 
+	unlock_aggregate = get_aggregate(sd);
+
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
 	 * sibling can pick up load irrespective of busy siblings. In this case,
@@ -3303,8 +3712,9 @@ redo:
 
 	if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
 	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		return -1;
-	return ld_moved;
+		ld_moved = -1;
+
+	goto out;
 
 out_balanced:
 	schedstat_inc(sd, lb_balanced[idle]);
@@ -3319,8 +3729,13 @@ out_one_pinned:
 
 	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
 	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		return -1;
-	return 0;
+		ld_moved = -1;
+	else
+		ld_moved = 0;
+out:
+	if (unlock_aggregate)
+		put_aggregate(sd);
+	return ld_moved;
 }
 
 /*
@@ -4535,10 +4950,8 @@ void set_user_nice(struct task_struct *p, long nice)
 		goto out_unlock;
 	}
 	on_rq = p->se.on_rq;
-	if (on_rq) {
+	if (on_rq)
 		dequeue_task(rq, p, 0);
-		dec_load(rq, p);
-	}
 
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
@@ -4548,7 +4961,6 @@ void set_user_nice(struct task_struct *p, long nice)
 
 	if (on_rq) {
 		enqueue_task(rq, p, 0);
-		inc_load(rq, p);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -6921,6 +7333,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 			SD_INIT(sd, ALLNODES);
 			set_domain_attribute(sd, attr);
 			sd->span = *cpu_map;
+			sd->first_cpu = first_cpu(sd->span);
 			cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
 			p = sd;
 			sd_allnodes = 1;
@@ -6931,6 +7344,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, NODE);
 		set_domain_attribute(sd, attr);
 		sched_domain_node_span(cpu_to_node(i), &sd->span);
+		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -6942,6 +7356,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, CPU);
 		set_domain_attribute(sd, attr);
 		sd->span = *nodemask;
+		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -6953,6 +7368,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, MC);
 		set_domain_attribute(sd, attr);
 		sd->span = cpu_coregroup_map(i);
+		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
@@ -6965,6 +7381,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, SIBLING);
 		set_domain_attribute(sd, attr);
 		sd->span = per_cpu(cpu_sibling_map, i);
+		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
@@ -7633,6 +8050,7 @@ void __init sched_init(void)
 	}
 
 #ifdef CONFIG_SMP
+	init_aggregate();
 	init_defrootdomain();
 #endif
 
@@ -8199,14 +8617,11 @@ void sched_move_task(struct task_struct *tsk)
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void set_se_shares(struct sched_entity *se, unsigned long shares)
+static void __set_se_shares(struct sched_entity *se, unsigned long shares)
 {
 	struct cfs_rq *cfs_rq = se->cfs_rq;
-	struct rq *rq = cfs_rq->rq;
 	int on_rq;
 
-	spin_lock_irq(&rq->lock);
-
 	on_rq = se->on_rq;
 	if (on_rq)
 		dequeue_entity(cfs_rq, se, 0);
@@ -8216,8 +8631,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
 
 	if (on_rq)
 		enqueue_entity(cfs_rq, se, 0);
+}
 
-	spin_unlock_irq(&rq->lock);
+static void set_se_shares(struct sched_entity *se, unsigned long shares)
+{
+	struct cfs_rq *cfs_rq = se->cfs_rq;
+	struct rq *rq = cfs_rq->rq;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rq->lock, flags);
+	__set_se_shares(se, shares);
+	spin_unlock_irqrestore(&rq->lock, flags);
 }
 
 static DEFINE_MUTEX(shares_mutex);
@@ -8238,8 +8662,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	 * (The default weight is 1024 - so there's no practical
 	 * limitation from this.)
 	 */
-	if (shares < 2)
-		shares = 2;
+	if (shares < MIN_SHARES)
+		shares = MIN_SHARES;
 
 	mutex_lock(&shares_mutex);
 	if (tg->shares == shares)
@@ -8259,8 +8683,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	 * w/o tripping rebalance_share or load_balance_fair.
 	 */
 	tg->shares = shares;
-	for_each_possible_cpu(i)
-		set_se_shares(tg->se[i], shares);
+	for_each_possible_cpu(i) {
+		/*
+		 * force a rebalance
+		 */
+		cfs_rq_set_shares(tg->cfs_rq[i], 0);
+		set_se_shares(tg->se[i], shares/nr_cpu_ids);
+	}
 
 	/*
 	 * Enable load balance activity on this group, by inserting it back on
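The arithmetic behind __update_group_shares_cpu() above can be looked at in isolation: every cpu in the span gets a slice of the group's shares proportional to its runqueue weight, an idle cpu is boosted to one average task, and the result is clamped to MIN_SHARES. The following standalone sketch (plain userspace C, not kernel code; the weights are invented for illustration) reproduces that split:

/*
 * Sketch of the proportional split used by the patch:
 *   shares_i = group_shares * rq_weight_i / (sum(rq_weight) + 1)
 * Integer truncation is why the kernel code tops the total back up elsewhere.
 */
#include <stdio.h>

#define MIN_SHARES 2

int main(void)
{
	unsigned long group_shares = 1024;			/* tg->shares */
	unsigned long rq_weight[] = { 2048, 1024, 0, 1024 };	/* per-cpu cfs_rq weights (made up) */
	unsigned long nice_0_load = 1024;			/* stand-in for NICE_0_LOAD */
	unsigned long sum = 0;
	int ncpu = 4, i;

	for (i = 0; i < ncpu; i++)
		sum += rq_weight[i];

	for (i = 0; i < ncpu; i++) {
		/* an idle cpu is pretended to run one average task, as in the patch */
		unsigned long w = rq_weight[i] ? rq_weight[i] : nice_0_load;
		unsigned long shares = group_shares * w / (sum + 1);

		if (shares < MIN_SHARES)
			shares = MIN_SHARES;
		printf("cpu%d: rq_weight %lu -> shares %lu\n", i, rq_weight[i], shares);
	}
	return 0;
}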
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index b43748efaa7f..b89fec93a237 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -492,10 +492,27 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+static void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+	cfs_rq->task_weight += weight;
+}
+#else
+static inline void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+}
+#endif
+
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_add(&cfs_rq->load, se->load.weight);
+	if (!parent_entity(se))
+		inc_cpu_load(rq_of(cfs_rq), se->load.weight);
+	if (entity_is_task(se))
+		add_cfs_task_weight(cfs_rq, se->load.weight);
 	cfs_rq->nr_running++;
 	se->on_rq = 1;
 }
@@ -504,6 +521,10 @@ static void
 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 	update_load_sub(&cfs_rq->load, se->load.weight);
+	if (!parent_entity(se))
+		dec_cpu_load(rq_of(cfs_rq), se->load.weight);
+	if (entity_is_task(se))
+		add_cfs_task_weight(cfs_rq, -se->load.weight);
 	cfs_rq->nr_running--;
 	se->on_rq = 0;
 }
@@ -1286,75 +1307,90 @@ static struct task_struct *load_balance_next_fair(void *arg)
 	return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
+static unsigned long
+__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		unsigned long max_load_move, struct sched_domain *sd,
+		enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
+		struct cfs_rq *cfs_rq)
 {
-	struct sched_entity *curr;
-	struct task_struct *p;
-
-	if (!cfs_rq->nr_running || !first_fair(cfs_rq))
-		return MAX_PRIO;
-
-	curr = cfs_rq->curr;
-	if (!curr)
-		curr = __pick_next_entity(cfs_rq);
+	struct rq_iterator cfs_rq_iterator;
 
-	p = task_of(curr);
+	cfs_rq_iterator.start = load_balance_start_fair;
+	cfs_rq_iterator.next = load_balance_next_fair;
+	cfs_rq_iterator.arg = cfs_rq;
 
-	return p->prio;
+	return balance_tasks(this_rq, this_cpu, busiest,
+			max_load_move, sd, idle, all_pinned,
+			this_best_prio, &cfs_rq_iterator);
 }
-#endif
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		  unsigned long max_load_move,
 		  struct sched_domain *sd, enum cpu_idle_type idle,
 		  int *all_pinned, int *this_best_prio)
 {
-	struct cfs_rq *busy_cfs_rq;
 	long rem_load_move = max_load_move;
-	struct rq_iterator cfs_rq_iterator;
+	int busiest_cpu = cpu_of(busiest);
+	struct task_group *tg;
 
-	cfs_rq_iterator.start = load_balance_start_fair;
-	cfs_rq_iterator.next = load_balance_next_fair;
-
-	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
-#ifdef CONFIG_FAIR_GROUP_SCHED
-		struct cfs_rq *this_cfs_rq;
+	rcu_read_lock();
+	list_for_each_entry(tg, &task_groups, list) {
 		long imbalance;
-		unsigned long maxload;
+		unsigned long this_weight, busiest_weight;
+		long rem_load, max_load, moved_load;
+
+		/*
+		 * empty group
+		 */
+		if (!aggregate(tg, sd)->task_weight)
+			continue;
+
+		rem_load = rem_load_move * aggregate(tg, sd)->rq_weight;
+		rem_load /= aggregate(tg, sd)->load + 1;
 
-		this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
+		this_weight = tg->cfs_rq[this_cpu]->task_weight;
+		busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
 
-		imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
-		/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
-		if (imbalance <= 0)
+		imbalance = (busiest_weight - this_weight) / 2;
+
+		if (imbalance < 0)
+			imbalance = busiest_weight;
+
+		max_load = max(rem_load, imbalance);
+		moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
+				max_load, sd, idle, all_pinned, this_best_prio,
+				tg->cfs_rq[busiest_cpu]);
+
+		if (!moved_load)
 			continue;
 
-		/* Don't pull more than imbalance/2 */
-		imbalance /= 2;
-		maxload = min(rem_load_move, imbalance);
+		move_group_shares(tg, sd, busiest_cpu, this_cpu);
 
-		*this_best_prio = cfs_rq_best_prio(this_cfs_rq);
-#else
-# define maxload rem_load_move
-#endif
-		/*
-		 * pass busy_cfs_rq argument into
-		 * load_balance_[start|next]_fair iterators
-		 */
-		cfs_rq_iterator.arg = busy_cfs_rq;
-		rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
-					       maxload, sd, idle, all_pinned,
-					       this_best_prio,
-					       &cfs_rq_iterator);
+		moved_load *= aggregate(tg, sd)->load;
+		moved_load /= aggregate(tg, sd)->rq_weight + 1;
 
-		if (rem_load_move <= 0)
+		rem_load_move -= moved_load;
+		if (rem_load_move < 0)
 			break;
 	}
+	rcu_read_unlock();
 
 	return max_load_move - rem_load_move;
 }
+#else
+static unsigned long
+load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+		  unsigned long max_load_move,
+		  struct sched_domain *sd, enum cpu_idle_type idle,
+		  int *all_pinned, int *this_best_prio)
+{
+	return __load_balance_fair(this_rq, this_cpu, busiest,
+			max_load_move, sd, idle, all_pinned,
+			this_best_prio, &busiest->cfs);
+}
+#endif
 
 static int
 move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
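The new load_balance_fair() above converts the requested load twice: rem_load_move is scaled from global load units into the group's local rq_weight units before the iterator pulls tasks, and the weight actually moved is scaled back into global load units before being subtracted from rem_load_move. A minimal userspace sketch of those two conversions, with invented figures, looks like this:

/*
 * Sketch of the unit conversions in load_balance_fair() (not kernel code):
 *   rem_load   = rem_load_move * rq_weight / (load + 1)
 *   moved_load = moved_weight  * load      / (rq_weight + 1)
 */
#include <stdio.h>

int main(void)
{
	unsigned long rem_load_move = 512;	/* load still to move, global units */
	unsigned long grp_rq_weight = 3072;	/* aggregate(tg, sd)->rq_weight (made up) */
	unsigned long grp_load = 1024;		/* aggregate(tg, sd)->load (made up) */

	/* global load -> this group's queue-weight units */
	unsigned long rem_load = rem_load_move * grp_rq_weight / (grp_load + 1);

	/* pretend the iterator managed to move half of that weight */
	unsigned long moved_weight = rem_load / 2;

	/* queue-weight units -> global load units, credited against rem_load_move */
	unsigned long moved_load = moved_weight * grp_load / (grp_rq_weight + 1);

	printf("ask %lu -> local target %lu, moved %lu -> credited %lu\n",
	       rem_load_move, rem_load, moved_weight, moved_load);
	return 0;
}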
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 201a69382a42..736fb8fd8977 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -518,6 +518,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
 	 */
 	for_each_sched_rt_entity(rt_se)
 		enqueue_rt_entity(rt_se);
+
+	inc_cpu_load(rq, p->se.load.weight);
 }
 
 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -537,6 +539,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 		if (rt_rq && rt_rq->rt_nr_running)
 			enqueue_rt_entity(rt_se);
 	}
+
+	dec_cpu_load(rq, p->se.load.weight);
 }
 
 /*
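Taken together, the fair-class and rt-class hunks change where rq->load is maintained: it is no longer adjusted in inc_nr_running()/dec_nr_running() but at enqueue/dequeue time, and only top-level weights are added directly. A task inside a group is represented through its group's share, while an RT task still contributes its full weight, which is what the sched_rt.c hunks add. A toy model of that bookkeeping (plain userspace C, not kernel code, names invented):

/*
 * Toy model of the rq->load accounting after this patch:
 * only entities without a parent group touch rq->load directly.
 */
#include <stdio.h>

struct toy_rq { unsigned long load; unsigned long nr_running; };

static void inc_cpu_load(struct toy_rq *rq, unsigned long w) { rq->load += w; }
static void dec_cpu_load(struct toy_rq *rq, unsigned long w) { rq->load -= w; }

/* fair entity: only a top-level entity (no parent group) hits rq->load */
static void enqueue_fair(struct toy_rq *rq, unsigned long w, int has_parent)
{
	if (!has_parent)
		inc_cpu_load(rq, w);
	rq->nr_running++;
}

/* rt task: always accounted directly, as in enqueue_task_rt() */
static void enqueue_rt(struct toy_rq *rq, unsigned long w)
{
	inc_cpu_load(rq, w);
	rq->nr_running++;
}

static void dequeue_rt(struct toy_rq *rq, unsigned long w)
{
	dec_cpu_load(rq, w);
	rq->nr_running--;
}

int main(void)
{
	struct toy_rq rq = { 0, 0 };

	enqueue_fair(&rq, 1024, 0);	/* top-level fair task */
	enqueue_fair(&rq, 1024, 1);	/* task inside a group: no direct rq->load */
	enqueue_rt(&rq, 1024);		/* rt task */
	dequeue_rt(&rq, 1024);
	printf("nr_running=%lu load=%lu\n", rq.nr_running, rq.load);
	return 0;
}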