author     Peter Zijlstra <a.p.zijlstra@chello.nl>  2008-06-27 07:41:18 -0400
committer  Ingo Molnar <mingo@elte.hu>              2008-06-27 08:31:32 -0400
commit     b6a86c746f5b708012809958462234d19e9c8177 (patch)
tree       38654c70da6382f50779ede1e973d2d395f38e54 /kernel/sched.c
parent     32df2ee86a580f70f2dbb90cf81f413aa655f838 (diff)
sched: fix sched_domain aggregation
Keeping the aggregate on the first cpu of the sched domain has two problems:
- it could collide between different sched domains on different cpus
- it could slow things down because of the remote accesses
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
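The interface change itself is mechanical: every aggregate helper gains an explicit cpu argument and indexes the per-cpu aggregate slot directly instead of going through sd->first_cpu. A minimal userspace sketch of the before/after lookup (simplified stand-in types, not the kernel structures; NR_CPUS and the array layout are assumptions for illustration only):

/*
 * Illustrative userspace sketch only -- the types and names are simplified
 * stand-ins for the kernel's task_group / aggregate_struct, not the real API.
 */
#include <stdio.h>

#define NR_CPUS 4

struct aggregate_struct {
	unsigned long rq_weight;
	unsigned long shares;
	unsigned long load;
};

struct task_group {
	/* one aggregate slot per CPU, mirroring tg->cfs_rq[cpu]->aggregate */
	struct aggregate_struct aggregate[NR_CPUS];
};

/* Before the patch: every domain used its first CPU's slot -> possible collisions. */
static struct aggregate_struct *aggregate_old(struct task_group *tg,
					      int first_cpu_of_domain)
{
	return &tg->aggregate[first_cpu_of_domain];
}

/* After the patch: the balancing CPU uses its own slot -> local and collision-free. */
static struct aggregate_struct *aggregate_new(struct task_group *tg, int cpu)
{
	return &tg->aggregate[cpu];
}

int main(void)
{
	struct task_group tg = { 0 };

	/* Two domains whose spans both start at CPU 0 used to share one slot. */
	printf("old: domain A -> %p, domain B -> %p\n",
	       (void *)aggregate_old(&tg, 0), (void *)aggregate_old(&tg, 0));

	/* With the per-cpu lookup, CPUs 1 and 2 each update their own slot. */
	printf("new: cpu 1 -> %p, cpu 2 -> %p\n",
	       (void *)aggregate_new(&tg, 1), (void *)aggregate_new(&tg, 2));
	return 0;
}

With the old lookup, two domains whose spans begin at the same CPU resolve to the same slot; with the per-cpu lookup, each balancing CPU reads and writes its own cache-local slot.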
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c  113
1 file changed, 54 insertions(+), 59 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 7d282c52bd42..160d3c209b8f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1480,12 +1480,12 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
  */
 
 static inline struct aggregate_struct *
-aggregate(struct task_group *tg, struct sched_domain *sd)
+aggregate(struct task_group *tg, int cpu)
 {
-	return &tg->cfs_rq[sd->first_cpu]->aggregate;
+	return &tg->cfs_rq[cpu]->aggregate;
 }
 
-typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
+typedef void (*aggregate_func)(struct task_group *, int, struct sched_domain *);
 
 /*
  * Iterate the full tree, calling @down when first entering a node and @up when
@@ -1493,14 +1493,14 @@ typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
  */
 static
 void aggregate_walk_tree(aggregate_func down, aggregate_func up,
-			 struct sched_domain *sd)
+			 int cpu, struct sched_domain *sd)
 {
 	struct task_group *parent, *child;
 
 	rcu_read_lock();
 	parent = &root_task_group;
 down:
-	(*down)(parent, sd);
+	(*down)(parent, cpu, sd);
 	list_for_each_entry_rcu(child, &parent->children, siblings) {
 		parent = child;
 		goto down;
@@ -1508,7 +1508,7 @@ down:
 up:
 		continue;
 	}
-	(*up)(parent, sd);
+	(*up)(parent, cpu, sd);
 
 	child = parent;
 	parent = parent->parent;
@@ -1520,8 +1520,8 @@ up:
 /*
  * Calculate the aggregate runqueue weight.
  */
-static
-void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
+static void
+aggregate_group_weight(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
 	unsigned long rq_weight = 0;
 	unsigned long task_weight = 0;
@@ -1532,15 +1532,15 @@ void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
 		task_weight += tg->cfs_rq[i]->task_weight;
 	}
 
-	aggregate(tg, sd)->rq_weight = rq_weight;
-	aggregate(tg, sd)->task_weight = task_weight;
+	aggregate(tg, cpu)->rq_weight = rq_weight;
+	aggregate(tg, cpu)->task_weight = task_weight;
 }
 
 /*
  * Compute the weight of this group on the given cpus.
  */
-static
-void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
+static void
+aggregate_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
 	unsigned long shares = 0;
 	int i;
@@ -1548,18 +1548,18 @@ void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
 	for_each_cpu_mask(i, sd->span)
 		shares += tg->cfs_rq[i]->shares;
 
-	if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares)
+	if ((!shares && aggregate(tg, cpu)->rq_weight) || shares > tg->shares)
 		shares = tg->shares;
 
-	aggregate(tg, sd)->shares = shares;
+	aggregate(tg, cpu)->shares = shares;
 }
 
 /*
  * Compute the load fraction assigned to this group, relies on the aggregate
  * weight and this group's parent's load, i.e. top-down.
  */
-static
-void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
+static void
+aggregate_group_load(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
 	unsigned long load;
 
@@ -1571,17 +1571,17 @@ void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
 		load += cpu_rq(i)->load.weight;
 
 	} else {
-		load = aggregate(tg->parent, sd)->load;
+		load = aggregate(tg->parent, cpu)->load;
 
 		/*
 		 * shares is our weight in the parent's rq so
 		 * shares/parent->rq_weight gives our fraction of the load
 		 */
-		load *= aggregate(tg, sd)->shares;
-		load /= aggregate(tg->parent, sd)->rq_weight + 1;
+		load *= aggregate(tg, cpu)->shares;
+		load /= aggregate(tg->parent, cpu)->rq_weight + 1;
 	}
 
-	aggregate(tg, sd)->load = load;
+	aggregate(tg, cpu)->load = load;
 }
 
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
@@ -1590,8 +1590,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
  * Calculate and set the cpu's group shares.
  */
 static void
-__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
-			  int tcpu)
+__update_group_shares_cpu(struct task_group *tg, int cpu,
+			  struct sched_domain *sd, int tcpu)
 {
 	int boost = 0;
 	unsigned long shares;
@@ -1618,8 +1618,8 @@ __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
 	 *               \Sum rq_weight
 	 *
 	 */
-	shares = aggregate(tg, sd)->shares * rq_weight;
-	shares /= aggregate(tg, sd)->rq_weight + 1;
+	shares = aggregate(tg, cpu)->shares * rq_weight;
+	shares /= aggregate(tg, cpu)->rq_weight + 1;
 
 	/*
 	 * record the actual number of shares, not the boosted amount.
@@ -1639,15 +1639,15 @@ __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
  * task went to.
  */
 static void
-__move_group_shares(struct task_group *tg, struct sched_domain *sd,
+__move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd,
 		    int scpu, int dcpu)
 {
 	unsigned long shares;
 
 	shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
 
-	__update_group_shares_cpu(tg, sd, scpu);
-	__update_group_shares_cpu(tg, sd, dcpu);
+	__update_group_shares_cpu(tg, cpu, sd, scpu);
+	__update_group_shares_cpu(tg, cpu, sd, dcpu);
 
 	/*
 	 * ensure we never loose shares due to rounding errors in the
@@ -1663,19 +1663,19 @@ __move_group_shares(struct task_group *tg, struct sched_domain *sd,
  * we need to walk up the tree and change all shares until we hit the root.
  */
 static void
-move_group_shares(struct task_group *tg, struct sched_domain *sd,
+move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd,
 		  int scpu, int dcpu)
 {
 	while (tg) {
-		__move_group_shares(tg, sd, scpu, dcpu);
+		__move_group_shares(tg, cpu, sd, scpu, dcpu);
 		tg = tg->parent;
 	}
 }
 
-static
-void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
+static void
+aggregate_group_set_shares(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	unsigned long shares = aggregate(tg, sd)->shares;
+	unsigned long shares = aggregate(tg, cpu)->shares;
 	int i;
 
 	for_each_cpu_mask(i, sd->span) {
@@ -1683,20 +1683,20 @@ void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
 		unsigned long flags;
 
 		spin_lock_irqsave(&rq->lock, flags);
-		__update_group_shares_cpu(tg, sd, i);
+		__update_group_shares_cpu(tg, cpu, sd, i);
 		spin_unlock_irqrestore(&rq->lock, flags);
 	}
 
-	aggregate_group_shares(tg, sd);
+	aggregate_group_shares(tg, cpu, sd);
 
 	/*
 	 * ensure we never loose shares due to rounding errors in the
 	 * above redistribution.
 	 */
-	shares -= aggregate(tg, sd)->shares;
+	shares -= aggregate(tg, cpu)->shares;
 	if (shares) {
-		tg->cfs_rq[sd->first_cpu]->shares += shares;
-		aggregate(tg, sd)->shares += shares;
+		tg->cfs_rq[cpu]->shares += shares;
+		aggregate(tg, cpu)->shares += shares;
 	}
 }
 
@@ -1704,21 +1704,21 @@ void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
  * Calculate the accumulative weight and recursive load of each task group
  * while walking down the tree.
  */
-static
-void aggregate_get_down(struct task_group *tg, struct sched_domain *sd)
+static void
+aggregate_get_down(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	aggregate_group_weight(tg, sd);
-	aggregate_group_shares(tg, sd);
-	aggregate_group_load(tg, sd);
+	aggregate_group_weight(tg, cpu, sd);
+	aggregate_group_shares(tg, cpu, sd);
+	aggregate_group_load(tg, cpu, sd);
 }
 
 /*
  * Rebalance the cpu shares while walking back up the tree.
  */
-static
-void aggregate_get_up(struct task_group *tg, struct sched_domain *sd)
+static void
+aggregate_get_up(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	aggregate_group_set_shares(tg, sd);
+	aggregate_group_set_shares(tg, cpu, sd);
 }
 
 static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
@@ -1731,18 +1731,18 @@ static void __init init_aggregate(void)
 		spin_lock_init(&per_cpu(aggregate_lock, i));
 }
 
-static int get_aggregate(struct sched_domain *sd)
+static int get_aggregate(int cpu, struct sched_domain *sd)
 {
-	if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu)))
+	if (!spin_trylock(&per_cpu(aggregate_lock, cpu)))
 		return 0;
 
-	aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd);
+	aggregate_walk_tree(aggregate_get_down, aggregate_get_up, cpu, sd);
 	return 1;
 }
 
-static void put_aggregate(struct sched_domain *sd)
+static void put_aggregate(int cpu, struct sched_domain *sd)
 {
-	spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu));
+	spin_unlock(&per_cpu(aggregate_lock, cpu));
 }
 
 static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
@@ -1756,12 +1756,12 @@ static inline void init_aggregate(void)
 {
 }
 
-static inline int get_aggregate(struct sched_domain *sd)
+static inline int get_aggregate(int cpu, struct sched_domain *sd)
 {
 	return 0;
 }
 
-static inline void put_aggregate(struct sched_domain *sd)
+static inline void put_aggregate(int cpu, struct sched_domain *sd)
 {
 }
 #endif
@@ -3539,7 +3539,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 
 	cpus_setall(*cpus);
 
-	unlock_aggregate = get_aggregate(sd);
+	unlock_aggregate = get_aggregate(this_cpu, sd);
 
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
@@ -3678,7 +3678,7 @@ out_one_pinned:
 	ld_moved = 0;
 out:
 	if (unlock_aggregate)
-		put_aggregate(sd);
+		put_aggregate(this_cpu, sd);
 	return ld_moved;
 }
 
@@ -7292,7 +7292,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, ALLNODES);
 		set_domain_attribute(sd, attr);
 		sd->span = *cpu_map;
-		sd->first_cpu = first_cpu(sd->span);
 		cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
 		p = sd;
 		sd_allnodes = 1;
@@ -7303,7 +7302,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, NODE);
 		set_domain_attribute(sd, attr);
 		sched_domain_node_span(cpu_to_node(i), &sd->span);
-		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -7315,7 +7313,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, CPU);
 		set_domain_attribute(sd, attr);
 		sd->span = *nodemask;
-		sd->first_cpu = first_cpu(sd->span);
 		sd->parent = p;
 		if (p)
 			p->child = sd;
@@ -7327,7 +7324,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, MC);
 		set_domain_attribute(sd, attr);
 		sd->span = cpu_coregroup_map(i);
-		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
@@ -7340,7 +7336,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
 		SD_INIT(sd, SIBLING);
 		set_domain_attribute(sd, attr);
 		sd->span = per_cpu(cpu_sibling_map, i);
-		sd->first_cpu = first_cpu(sd->span);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
 		p->child = sd;
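The locking in load_balance() follows the same shift: get_aggregate()/put_aggregate() now take the balancing CPU's own entry in aggregate_lock rather than the lock belonging to sd->first_cpu. A rough userspace analogue of that trylock-or-skip pattern, using pthread spinlocks as a stand-in for the kernel's per-CPU spinlock_t (names mirror the patch, but the code is illustrative only; build with cc file.c -lpthread):

/* Illustrative userspace analogue of get_aggregate()/put_aggregate(). */
#include <pthread.h>
#include <stdio.h>

#define NR_CPUS 4

/* stands in for DEFINE_PER_CPU(spinlock_t, aggregate_lock) */
static pthread_spinlock_t aggregate_lock[NR_CPUS];

static void init_aggregate(void)
{
	for (int i = 0; i < NR_CPUS; i++)
		pthread_spin_init(&aggregate_lock[i], PTHREAD_PROCESS_PRIVATE);
}

/* Take the balancing CPU's own lock; skip the update if it is contended. */
static int get_aggregate(int cpu)
{
	return pthread_spin_trylock(&aggregate_lock[cpu]) == 0;
}

static void put_aggregate(int cpu)
{
	pthread_spin_unlock(&aggregate_lock[cpu]);
}

int main(void)
{
	init_aggregate();

	int this_cpu = 1;
	int unlock_aggregate = get_aggregate(this_cpu);

	/* ... walk the task-group tree and refresh the aggregates here ... */
	printf("cpu %d %s the aggregate update\n", this_cpu,
	       unlock_aggregate ? "performed" : "skipped");

	if (unlock_aggregate)
		put_aggregate(this_cpu);
	return 0;
}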