Diffstat (limited to 'kernel/sched.c')

 -rw-r--r--  kernel/sched.c | 521
 1 files changed, 96 insertions, 425 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index e2e985eeee78..70cb127e3495 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -137,7 +137,7 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
 
 static inline int rt_policy(int policy)
 {
-	if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR))
+	if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
 		return 1;
 	return 0;
 }
@@ -313,12 +313,15 @@ static DEFINE_SPINLOCK(task_group_lock);
 #endif
 
 /*
- * A weight of 0, 1 or ULONG_MAX can cause arithmetics problems.
+ * A weight of 0 or 1 can cause arithmetics problems.
+ * A weight of a cfs_rq is the sum of weights of which entities
+ * are queued on this cfs_rq, so a weight of a entity should not be
+ * too large, so as the shares value of a task group.
  * (The default weight is 1024 - so there's no practical
  * limitation from this.)
  */
 #define MIN_SHARES	2
-#define MAX_SHARES	(ULONG_MAX - 1)
+#define MAX_SHARES	(1UL << 18)
 
 static int init_task_group_load = INIT_TASK_GROUP_LOAD;
 #endif
@@ -399,43 +402,6 @@ struct cfs_rq {
 	 */
 	struct list_head leaf_cfs_rq_list;
 	struct task_group *tg;	/* group that "owns" this runqueue */
-
-#ifdef CONFIG_SMP
-	unsigned long task_weight;
-	unsigned long shares;
-	/*
-	 * We need space to build a sched_domain wide view of the full task
-	 * group tree, in order to avoid depending on dynamic memory allocation
-	 * during the load balancing we place this in the per cpu task group
-	 * hierarchy. This limits the load balancing to one instance per cpu,
-	 * but more should not be needed anyway.
-	 */
-	struct aggregate_struct {
-		/*
-		 * load = weight(cpus) * f(tg)
-		 *
-		 * Where f(tg) is the recursive weight fraction assigned to
-		 * this group.
-		 */
-		unsigned long load;
-
-		/*
-		 * part of the group weight distributed to this span.
-		 */
-		unsigned long shares;
-
-		/*
-		 * The sum of all runqueue weights within this span.
-		 */
-		unsigned long rq_weight;
-
-		/*
-		 * Weight contributed by tasks; this is the part we can
-		 * influence by moving tasks around.
-		 */
-		unsigned long task_weight;
-	} aggregate;
-#endif
 #endif
 };
 
@@ -1180,6 +1146,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
 	return HRTIMER_NORESTART;
 }
 
+#ifdef CONFIG_SMP
 static void hotplug_hrtick_disable(int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
@@ -1235,6 +1202,7 @@ static void init_hrtick(void)
 {
 	hotcpu_notifier(hotplug_hrtick, 0);
 }
+#endif /* CONFIG_SMP */
 
 static void init_rq_hrtick(struct rq *rq)
 {
@@ -1387,17 +1355,19 @@ static void __resched_task(struct task_struct *p, int tif_bit)
  */
 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
 
-/*
- * delta *= weight / lw
- */
 static unsigned long
 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 		struct load_weight *lw)
 {
 	u64 tmp;
 
-	if (!lw->inv_weight)
-		lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)/(lw->weight+1);
+	if (!lw->inv_weight) {
+		if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
+			lw->inv_weight = 1;
+		else
+			lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
+				/ (lw->weight+1);
+	}
 
 	tmp = (u64)delta_exec * weight;
 	/*
@@ -1412,6 +1382,12 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
 }
 
+static inline unsigned long
+calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
+{
+	return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
+}
+
 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
 {
 	lw->weight += inc;
@@ -1524,326 +1500,6 @@ static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
 static unsigned long cpu_avg_load_per_task(int cpu);
 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
-
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-/*
- * Group load balancing.
- *
- * We calculate a few balance domain wide aggregate numbers; load and weight.
- * Given the pictures below, and assuming each item has equal weight:
- *
- *         root          1 - thread
- *        /  |  \        A - group
- *       A   1   B
- *      /|\   / \
- *     C 2 D 3   4
- *     |   |
- *     5   6
- *
- * load:
- *    A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
- *    which equals 1/9-th of the total load.
- *
- * shares:
- *    The weight of this group on the selected cpus.
- *
- * rq_weight:
- *    Direct sum of all the cpu's their rq weight, e.g. A would get 3 while
- *    B would get 2.
- *
- * task_weight:
- *    Part of the rq_weight contributed by tasks; all groups except B would
- *    get 1, B gets 2.
- */
-
-static inline struct aggregate_struct *
-aggregate(struct task_group *tg, struct sched_domain *sd)
-{
-	return &tg->cfs_rq[sd->first_cpu]->aggregate;
-}
-
-typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
-
-/*
- * Iterate the full tree, calling @down when first entering a node and @up when
- * leaving it for the final time.
- */
-static
-void aggregate_walk_tree(aggregate_func down, aggregate_func up,
-			 struct sched_domain *sd)
-{
-	struct task_group *parent, *child;
-
-	rcu_read_lock();
-	parent = &root_task_group;
-down:
-	(*down)(parent, sd);
-	list_for_each_entry_rcu(child, &parent->children, siblings) {
-		parent = child;
-		goto down;
-
-up:
-		continue;
-	}
-	(*up)(parent, sd);
-
-	child = parent;
-	parent = parent->parent;
-	if (parent)
-		goto up;
-	rcu_read_unlock();
-}
-
-/*
- * Calculate the aggregate runqueue weight.
- */
-static
-void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
-{
-	unsigned long rq_weight = 0;
-	unsigned long task_weight = 0;
-	int i;
-
-	for_each_cpu_mask(i, sd->span) {
-		rq_weight += tg->cfs_rq[i]->load.weight;
-		task_weight += tg->cfs_rq[i]->task_weight;
-	}
-
-	aggregate(tg, sd)->rq_weight = rq_weight;
-	aggregate(tg, sd)->task_weight = task_weight;
-}
-
-/*
- * Compute the weight of this group on the given cpus.
- */
-static
-void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
-{
-	unsigned long shares = 0;
-	int i;
-
-	for_each_cpu_mask(i, sd->span)
-		shares += tg->cfs_rq[i]->shares;
-
-	if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares)
-		shares = tg->shares;
-
-	aggregate(tg, sd)->shares = shares;
-}
-
-/*
- * Compute the load fraction assigned to this group, relies on the aggregate
- * weight and this group's parent's load, i.e. top-down.
- */
-static
-void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
-{
-	unsigned long load;
-
-	if (!tg->parent) {
-		int i;
-
-		load = 0;
-		for_each_cpu_mask(i, sd->span)
-			load += cpu_rq(i)->load.weight;
-
-	} else {
-		load = aggregate(tg->parent, sd)->load;
-
-		/*
-		 * shares is our weight in the parent's rq so
-		 * shares/parent->rq_weight gives our fraction of the load
-		 */
-		load *= aggregate(tg, sd)->shares;
-		load /= aggregate(tg->parent, sd)->rq_weight + 1;
-	}
-
-	aggregate(tg, sd)->load = load;
-}
-
-static void __set_se_shares(struct sched_entity *se, unsigned long shares);
-
-/*
- * Calculate and set the cpu's group shares.
- */
-static void
-__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
-			  int tcpu)
-{
-	int boost = 0;
-	unsigned long shares;
-	unsigned long rq_weight;
-
-	if (!tg->se[tcpu])
-		return;
-
-	rq_weight = tg->cfs_rq[tcpu]->load.weight;
-
-	/*
-	 * If there are currently no tasks on the cpu pretend there is one of
-	 * average load so that when a new task gets to run here it will not
-	 * get delayed by group starvation.
-	 */
-	if (!rq_weight) {
-		boost = 1;
-		rq_weight = NICE_0_LOAD;
-	}
-
-	/*
-	 *           \Sum shares * rq_weight
-	 * shares =  -----------------------
-	 *               \Sum rq_weight
-	 *
-	 */
-	shares = aggregate(tg, sd)->shares * rq_weight;
-	shares /= aggregate(tg, sd)->rq_weight + 1;
-
-	/*
-	 * record the actual number of shares, not the boosted amount.
-	 */
-	tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
-
-	if (shares < MIN_SHARES)
-		shares = MIN_SHARES;
-	else if (shares > MAX_SHARES)
-		shares = MAX_SHARES;
-
-	__set_se_shares(tg->se[tcpu], shares);
-}
-
-/*
- * Re-adjust the weights on the cpu the task came from and on the cpu the
- * task went to.
- */
-static void
-__move_group_shares(struct task_group *tg, struct sched_domain *sd,
-		    int scpu, int dcpu)
-{
-	unsigned long shares;
-
-	shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
-
-	__update_group_shares_cpu(tg, sd, scpu);
-	__update_group_shares_cpu(tg, sd, dcpu);
-
-	/*
-	 * ensure we never loose shares due to rounding errors in the
-	 * above redistribution.
-	 */
-	shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
-	if (shares)
-		tg->cfs_rq[dcpu]->shares += shares;
-}
-
-/*
- * Because changing a group's shares changes the weight of the super-group
- * we need to walk up the tree and change all shares until we hit the root.
- */
-static void
-move_group_shares(struct task_group *tg, struct sched_domain *sd,
-		  int scpu, int dcpu)
-{
-	while (tg) {
-		__move_group_shares(tg, sd, scpu, dcpu);
-		tg = tg->parent;
-	}
-}
-
-static
-void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
-{
-	unsigned long shares = aggregate(tg, sd)->shares;
-	int i;
-
-	for_each_cpu_mask(i, sd->span) {
-		struct rq *rq = cpu_rq(i);
-		unsigned long flags;
-
-		spin_lock_irqsave(&rq->lock, flags);
-		__update_group_shares_cpu(tg, sd, i);
-		spin_unlock_irqrestore(&rq->lock, flags);
-	}
-
-	aggregate_group_shares(tg, sd);
-
-	/*
-	 * ensure we never loose shares due to rounding errors in the
-	 * above redistribution.
-	 */
-	shares -= aggregate(tg, sd)->shares;
-	if (shares) {
-		tg->cfs_rq[sd->first_cpu]->shares += shares;
-		aggregate(tg, sd)->shares += shares;
-	}
-}
-
-/*
- * Calculate the accumulative weight and recursive load of each task group
- * while walking down the tree.
- */
-static
-void aggregate_get_down(struct task_group *tg, struct sched_domain *sd)
-{
-	aggregate_group_weight(tg, sd);
-	aggregate_group_shares(tg, sd);
-	aggregate_group_load(tg, sd);
-}
-
-/*
- * Rebalance the cpu shares while walking back up the tree.
- */
-static
-void aggregate_get_up(struct task_group *tg, struct sched_domain *sd)
-{
-	aggregate_group_set_shares(tg, sd);
-}
-
-static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
-
-static void __init init_aggregate(void)
-{
-	int i;
-
-	for_each_possible_cpu(i)
-		spin_lock_init(&per_cpu(aggregate_lock, i));
-}
-
-static int get_aggregate(struct sched_domain *sd)
-{
-	if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu)))
-		return 0;
-
-	aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd);
-	return 1;
-}
-
-static void put_aggregate(struct sched_domain *sd)
-{
-	spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu));
-}
-
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
-{
-	cfs_rq->shares = shares;
-}
-
-#else
-
-static inline void init_aggregate(void)
-{
-}
-
-static inline int get_aggregate(struct sched_domain *sd)
-{
-	return 0;
-}
-
-static inline void put_aggregate(struct sched_domain *sd)
-{
-}
-#endif
-
 #else /* CONFIG_SMP */
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1864,14 +1520,26 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 
 #define sched_class_highest (&rt_sched_class)
 
-static void inc_nr_running(struct rq *rq)
+static inline void inc_load(struct rq *rq, const struct task_struct *p)
+{
+	update_load_add(&rq->load, p->se.load.weight);
+}
+
+static inline void dec_load(struct rq *rq, const struct task_struct *p)
+{
+	update_load_sub(&rq->load, p->se.load.weight);
+}
+
+static void inc_nr_running(struct task_struct *p, struct rq *rq)
 {
 	rq->nr_running++;
+	inc_load(rq, p);
 }
 
-static void dec_nr_running(struct rq *rq)
+static void dec_nr_running(struct task_struct *p, struct rq *rq)
 {
 	rq->nr_running--;
+	dec_load(rq, p);
 }
 
 static void set_load_weight(struct task_struct *p)
@@ -1963,7 +1631,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
 		rq->nr_uninterruptible--;
 
 	enqueue_task(rq, p, wakeup);
-	inc_nr_running(rq);
+	inc_nr_running(p, rq);
 }
 
 /*
@@ -1975,7 +1643,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
 		rq->nr_uninterruptible++;
 
 	dequeue_task(rq, p, sleep);
-	dec_nr_running(rq);
+	dec_nr_running(p, rq);
 }
 
 /**
@@ -2631,7 +2299,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		 * management (if any):
 		 */
 		p->sched_class->task_new(rq, p);
-		inc_nr_running(rq);
+		inc_nr_running(p, rq);
 	}
 	trace_mark(kernel_sched_wakeup_new,
 		"pid %d state %ld ## rq %p task %p rq->curr %p",
@@ -3630,12 +3298,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	unsigned long imbalance;
 	struct rq *busiest;
 	unsigned long flags;
-	int unlock_aggregate;
 
 	cpus_setall(*cpus);
 
-	unlock_aggregate = get_aggregate(sd);
-
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
 	 * sibling can pick up load irrespective of busy siblings. In this case,
@@ -3751,9 +3416,8 @@ redo:
 
 	if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
 	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		ld_moved = -1;
-
-	goto out;
+		return -1;
+	return ld_moved;
 
 out_balanced:
 	schedstat_inc(sd, lb_balanced[idle]);
@@ -3768,13 +3432,8 @@ out_one_pinned:
 
 	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
 	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-		ld_moved = -1;
-	else
-		ld_moved = 0;
-out:
-	if (unlock_aggregate)
-		put_aggregate(sd);
-	return ld_moved;
+		return -1;
+	return 0;
 }
 
 /*
@@ -4481,7 +4140,7 @@ static inline void schedule_debug(struct task_struct *prev)
 	 * schedule() atomically, we ignore that path for now.
 	 * Otherwise, whine if we are scheduling when we should not be.
 	 */
-	if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state))
+	if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
 		__schedule_bug(prev);
 
 	profile_hit(SCHED_PROFILING, __builtin_return_address(0));
@@ -4561,12 +4220,10 @@ need_resched_nonpreemptible:
 	clear_tsk_need_resched(prev);
 
 	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
-		if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
-				signal_pending(prev))) {
+		if (unlikely(signal_pending_state(prev->state, prev)))
 			prev->state = TASK_RUNNING;
-		} else {
+		else
 			deactivate_task(rq, prev, 1);
-		}
 		switch_count = &prev->nvcsw;
 	}
 
@@ -4792,22 +4449,20 @@ do_wait_for_common(struct completion *x, long timeout, int state)
 			     signal_pending(current)) ||
 			    (state == TASK_KILLABLE &&
 			     fatal_signal_pending(current))) {
-				__remove_wait_queue(&x->wait, &wait);
-				return -ERESTARTSYS;
+				timeout = -ERESTARTSYS;
+				break;
 			}
 			__set_current_state(state);
 			spin_unlock_irq(&x->wait.lock);
 			timeout = schedule_timeout(timeout);
 			spin_lock_irq(&x->wait.lock);
-			if (!timeout) {
-				__remove_wait_queue(&x->wait, &wait);
-				return timeout;
-			}
-		} while (!x->done);
+		} while (!x->done && timeout);
 		__remove_wait_queue(&x->wait, &wait);
+		if (!x->done)
+			return timeout;
 	}
 	x->done--;
-	return timeout;
+	return timeout ?: 1;
 }
 
 static long __sched
@@ -4982,8 +4637,10 @@ void set_user_nice(struct task_struct *p, long nice)
 		goto out_unlock;
 	}
 	on_rq = p->se.on_rq;
-	if (on_rq)
+	if (on_rq) {
 		dequeue_task(rq, p, 0);
+		dec_load(rq, p);
+	}
 
 	p->static_prio = NICE_TO_PRIO(nice);
 	set_load_weight(p);
@@ -4993,6 +4650,7 @@ void set_user_nice(struct task_struct *p, long nice)
 
 	if (on_rq) {
 		enqueue_task(rq, p, 0);
+		inc_load(rq, p);
 		/*
 		 * If the task increased its priority or is running and
 		 * lowered its priority, then reschedule its CPU:
@@ -6280,6 +5938,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 		next = pick_next_task(rq, rq->curr);
 		if (!next)
 			break;
+		next->sched_class->put_prev_task(rq, next);
 		migrate_dead(dead_cpu, next);
 
 	}
@@ -7270,7 +6929,12 @@ static int default_relax_domain_level = -1;
 
 static int __init setup_relax_domain_level(char *str)
 {
-	default_relax_domain_level = simple_strtoul(str, NULL, 0);
+	unsigned long val;
+
+	val = simple_strtoul(str, NULL, 0);
+	if (val < SD_LV_MAX)
+		default_relax_domain_level = val;
+
 	return 1;
 }
 __setup("relax_domain_level=", setup_relax_domain_level);
@@ -7367,7 +7031,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 			SD_INIT(sd, ALLNODES);
 			set_domain_attribute(sd, attr);
 			sd->span = *cpu_map;
-			sd->first_cpu = first_cpu(sd->span);
 			cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
 			p = sd;
 			sd_allnodes = 1;
@@ -7378,7 +7041,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, NODE);
 		set_domain_attribute(sd, attr);
 		sched_domain_node_span(cpu_to_node(i), &sd->span);
-		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -7390,7 +7052,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, CPU);
 		set_domain_attribute(sd, attr);
 		sd->span = *nodemask;
-		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -7402,7 +7063,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, MC);
 		set_domain_attribute(sd, attr);
 		sd->span = cpu_coregroup_map(i);
-		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
@@ -7415,7 +7075,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, SIBLING);
 		set_domain_attribute(sd, attr);
 		sd->span = per_cpu(cpu_sibling_map, i);
-		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
@@ -7619,8 +7278,8 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 
 static cpumask_t *doms_cur;	/* current sched domains */
 static int ndoms_cur;		/* number of sched domains in 'doms_cur' */
-static struct sched_domain_attr *dattr_cur;	/* attribues of custom domains
-						   in 'doms_cur' */
+static struct sched_domain_attr *dattr_cur;
+				/* attribues of custom domains in 'doms_cur' */
 
 /*
  * Special case: If a kmalloc of a doms_cur partition (array of
@@ -7634,6 +7293,18 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
 }
 
 /*
+ * Free current domain masks.
+ * Called after all cpus are attached to NULL domain.
+ */
+static void free_sched_domains(void)
+{
+	ndoms_cur = 0;
+	if (doms_cur != &fallback_doms)
+		kfree(doms_cur);
+	doms_cur = &fallback_doms;
+}
+
+/*
  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
  * For now this just excludes isolated cpus, but could be used to
  * exclude other special cases in the future.
@@ -7780,6 +7451,7 @@ int arch_reinit_sched_domains(void)
 	get_online_cpus();
 	mutex_lock(&sched_domains_mutex);
 	detach_destroy_domains(&cpu_online_map);
+	free_sched_domains();
 	err = arch_init_sched_domains(&cpu_online_map);
 	mutex_unlock(&sched_domains_mutex);
 	put_online_cpus();
@@ -7865,6 +7537,7 @@ static int update_sched_domains(struct notifier_block *nfb,
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
 		detach_destroy_domains(&cpu_online_map);
+		free_sched_domains();
 		return NOTIFY_OK;
 
 	case CPU_UP_CANCELED:
@@ -7883,8 +7556,16 @@ static int update_sched_domains(struct notifier_block *nfb,
 		return NOTIFY_DONE;
 	}
 
+#ifndef CONFIG_CPUSETS
+	/*
+	 * Create default domain partitioning if cpusets are disabled.
+	 * Otherwise we let cpusets rebuild the domains based on the
+	 * current setup.
+	 */
+
 	/* The hotplug lock is already held by cpu_up/cpu_down */
 	arch_init_sched_domains(&cpu_online_map);
+#endif
 
 	return NOTIFY_OK;
 }
@@ -8024,7 +7705,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
 	else
 		rt_se->rt_rq = parent->my_q;
 
-	rt_se->rt_rq = &rq->rt;
 	rt_se->my_q = rt_rq;
 	rt_se->parent = parent;
 	INIT_LIST_HEAD(&rt_se->run_list);
@@ -8085,7 +7765,6 @@ void __init sched_init(void)
 	}
 
 #ifdef CONFIG_SMP
-	init_aggregate();
 	init_defrootdomain();
 #endif
 
@@ -8650,11 +8329,14 @@ void sched_move_task(struct task_struct *tsk)
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void __set_se_shares(struct sched_entity *se, unsigned long shares)
+static void set_se_shares(struct sched_entity *se, unsigned long shares)
 {
 	struct cfs_rq *cfs_rq = se->cfs_rq;
+	struct rq *rq = cfs_rq->rq;
 	int on_rq;
 
+	spin_lock_irq(&rq->lock);
+
 	on_rq = se->on_rq;
 	if (on_rq)
 		dequeue_entity(cfs_rq, se, 0);
@@ -8664,17 +8346,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares)
 
 	if (on_rq)
 		enqueue_entity(cfs_rq, se, 0);
-}
 
-static void set_se_shares(struct sched_entity *se, unsigned long shares)
-{
-	struct cfs_rq *cfs_rq = se->cfs_rq;
-	struct rq *rq = cfs_rq->rq;
-	unsigned long flags;
-
-	spin_lock_irqsave(&rq->lock, flags);
-	__set_se_shares(se, shares);
-	spin_unlock_irqrestore(&rq->lock, flags);
+	spin_unlock_irq(&rq->lock);
 }
 
 static DEFINE_MUTEX(shares_mutex);
@@ -8713,13 +8386,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	 * w/o tripping rebalance_share or load_balance_fair.
 	 */
 	tg->shares = shares;
-	for_each_possible_cpu(i) {
-		/*
-		 * force a rebalance
-		 */
-		cfs_rq_set_shares(tg->cfs_rq[i], 0);
+	for_each_possible_cpu(i)
 		set_se_shares(tg->se[i], shares);
-	}
 
 	/*
 	 * Enable load balance activity on this group, by inserting it back on
@@ -8758,7 +8426,7 @@ static unsigned long to_ratio(u64 period, u64 runtime)
 #ifdef CONFIG_CGROUP_SCHED
 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 {
-	struct task_group *tgi, *parent = tg->parent;
+	struct task_group *tgi, *parent = tg ? tg->parent : NULL;
 	unsigned long total = 0;
 
 	if (!parent) {
@@ -8885,6 +8553,9 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
 	rt_period = (u64)rt_period_us * NSEC_PER_USEC;
 	rt_runtime = tg->rt_bandwidth.rt_runtime;
 
+	if (rt_period == 0)
+		return -EINVAL;
+
 	return tg_set_bandwidth(tg, rt_period, rt_runtime);
 }
 