author     Peter Zijlstra <a.p.zijlstra@chello.nl>    2008-06-27 07:41:18 -0400
committer  Ingo Molnar <mingo@elte.hu>                2008-06-27 08:31:32 -0400
commit     b6a86c746f5b708012809958462234d19e9c8177 (patch)
tree       38654c70da6382f50779ede1e973d2d395f38e54
parent     32df2ee86a580f70f2dbb90cf81f413aa655f838 (diff)
sched: fix sched_domain aggregation
Keeping the aggregate on the first cpu of the sched domain has two problems:

 - it could collide between different sched domains on different cpus
 - it could slow things down because of the remote accesses

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
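The core of the change, condensed from the diff below: the aggregate accessor stops being keyed on sd->first_cpu and is instead keyed on the cpu doing the balancing, so each cpu works on its own, local copy. This is a before/after excerpt for illustration only (both variants share a name, so it is not compilable as-is); the matching locking change is visible in get_aggregate()/put_aggregate() in the diff.

/*
 * Before: the aggregate was always stored on the first cpu of the
 * sched domain, so balancing in different sched domains on different
 * cpus could collide on the same aggregate and always went through
 * remote memory accesses.
 */
static inline struct aggregate_struct *
aggregate(struct task_group *tg, struct sched_domain *sd)
{
        return &tg->cfs_rq[sd->first_cpu]->aggregate;
}

/*
 * After: the aggregate is indexed by the balancing cpu itself, and
 * get_aggregate()/put_aggregate() serialize on that cpu's
 * per-cpu aggregate_lock instead of the first cpu's.
 */
static inline struct aggregate_struct *
aggregate(struct task_group *tg, int cpu)
{
        return &tg->cfs_rq[cpu]->aggregate;
}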
-rw-r--r--  include/linux/sched.h  |   1
-rw-r--r--  kernel/sched.c         | 113
-rw-r--r--  kernel/sched_fair.c    |  12
3 files changed, 60 insertions(+), 66 deletions(-)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 97a58b622ee1..eaf821072dbd 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -765,7 +765,6 @@ struct sched_domain {
         struct sched_domain *child;    /* bottom domain must be null terminated */
         struct sched_group *groups;    /* the balancing groups of the domain */
         cpumask_t span;                /* span of all CPUs in this domain */
-        int first_cpu;                 /* cache of the first cpu in this domain */
         unsigned long min_interval;    /* Minimum balance interval ms */
         unsigned long max_interval;    /* Maximum balance interval ms */
         unsigned int busy_factor;      /* less balancing by factor if busy */
diff --git a/kernel/sched.c b/kernel/sched.c
index 7d282c52bd42..160d3c209b8f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1480,12 +1480,12 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
  */

 static inline struct aggregate_struct *
-aggregate(struct task_group *tg, struct sched_domain *sd)
+aggregate(struct task_group *tg, int cpu)
 {
-        return &tg->cfs_rq[sd->first_cpu]->aggregate;
+        return &tg->cfs_rq[cpu]->aggregate;
 }

-typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
+typedef void (*aggregate_func)(struct task_group *, int, struct sched_domain *);

 /*
  * Iterate the full tree, calling @down when first entering a node and @up when
@@ -1493,14 +1493,14 @@ typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
  */
 static
 void aggregate_walk_tree(aggregate_func down, aggregate_func up,
-                         struct sched_domain *sd)
+                         int cpu, struct sched_domain *sd)
 {
         struct task_group *parent, *child;

         rcu_read_lock();
         parent = &root_task_group;
 down:
-        (*down)(parent, sd);
+        (*down)(parent, cpu, sd);
         list_for_each_entry_rcu(child, &parent->children, siblings) {
                 parent = child;
                 goto down;
@@ -1508,7 +1508,7 @@ down:
 up:
                 continue;
         }
-        (*up)(parent, sd);
+        (*up)(parent, cpu, sd);

         child = parent;
         parent = parent->parent;
@@ -1520,8 +1520,8 @@ up:
 /*
  * Calculate the aggregate runqueue weight.
  */
-static
-void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
+static void
+aggregate_group_weight(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
         unsigned long rq_weight = 0;
         unsigned long task_weight = 0;
@@ -1532,15 +1532,15 @@ void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
                 task_weight += tg->cfs_rq[i]->task_weight;
         }

-        aggregate(tg, sd)->rq_weight = rq_weight;
-        aggregate(tg, sd)->task_weight = task_weight;
+        aggregate(tg, cpu)->rq_weight = rq_weight;
+        aggregate(tg, cpu)->task_weight = task_weight;
 }

 /*
  * Compute the weight of this group on the given cpus.
  */
-static
-void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
+static void
+aggregate_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
         unsigned long shares = 0;
         int i;
@@ -1548,18 +1548,18 @@ void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
         for_each_cpu_mask(i, sd->span)
                 shares += tg->cfs_rq[i]->shares;

-        if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares)
+        if ((!shares && aggregate(tg, cpu)->rq_weight) || shares > tg->shares)
                 shares = tg->shares;

-        aggregate(tg, sd)->shares = shares;
+        aggregate(tg, cpu)->shares = shares;
 }

 /*
  * Compute the load fraction assigned to this group, relies on the aggregate
  * weight and this group's parent's load, i.e. top-down.
  */
-static
-void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
+static void
+aggregate_group_load(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
         unsigned long load;

@@ -1571,17 +1571,17 @@ void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
                         load += cpu_rq(i)->load.weight;

         } else {
-                load = aggregate(tg->parent, sd)->load;
+                load = aggregate(tg->parent, cpu)->load;

                 /*
                  * shares is our weight in the parent's rq so
                  * shares/parent->rq_weight gives our fraction of the load
                  */
-                load *= aggregate(tg, sd)->shares;
-                load /= aggregate(tg->parent, sd)->rq_weight + 1;
+                load *= aggregate(tg, cpu)->shares;
+                load /= aggregate(tg->parent, cpu)->rq_weight + 1;
         }

-        aggregate(tg, sd)->load = load;
+        aggregate(tg, cpu)->load = load;
 }

 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
@@ -1590,8 +1590,8 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
  * Calculate and set the cpu's group shares.
  */
 static void
-__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
-                          int tcpu)
+__update_group_shares_cpu(struct task_group *tg, int cpu,
+                          struct sched_domain *sd, int tcpu)
 {
         int boost = 0;
         unsigned long shares;
@@ -1618,8 +1618,8 @@ __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
          *           \Sum rq_weight
          *
          */
-        shares = aggregate(tg, sd)->shares * rq_weight;
-        shares /= aggregate(tg, sd)->rq_weight + 1;
+        shares = aggregate(tg, cpu)->shares * rq_weight;
+        shares /= aggregate(tg, cpu)->rq_weight + 1;

         /*
          * record the actual number of shares, not the boosted amount.
@@ -1639,15 +1639,15 @@ __update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
  * task went to.
  */
 static void
-__move_group_shares(struct task_group *tg, struct sched_domain *sd,
+__move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd,
                     int scpu, int dcpu)
 {
         unsigned long shares;

         shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;

-        __update_group_shares_cpu(tg, sd, scpu);
-        __update_group_shares_cpu(tg, sd, dcpu);
+        __update_group_shares_cpu(tg, cpu, sd, scpu);
+        __update_group_shares_cpu(tg, cpu, sd, dcpu);

         /*
          * ensure we never loose shares due to rounding errors in the
@@ -1663,19 +1663,19 @@ __move_group_shares(struct task_group *tg, struct sched_domain *sd,
  * we need to walk up the tree and change all shares until we hit the root.
  */
 static void
-move_group_shares(struct task_group *tg, struct sched_domain *sd,
+move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd,
                   int scpu, int dcpu)
 {
         while (tg) {
-                __move_group_shares(tg, sd, scpu, dcpu);
+                __move_group_shares(tg, cpu, sd, scpu, dcpu);
                 tg = tg->parent;
         }
 }

-static
-void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
+static void
+aggregate_group_set_shares(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-        unsigned long shares = aggregate(tg, sd)->shares;
+        unsigned long shares = aggregate(tg, cpu)->shares;
         int i;

         for_each_cpu_mask(i, sd->span) {
@@ -1683,20 +1683,20 @@ void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
                 unsigned long flags;

                 spin_lock_irqsave(&rq->lock, flags);
-                __update_group_shares_cpu(tg, sd, i);
+                __update_group_shares_cpu(tg, cpu, sd, i);
                 spin_unlock_irqrestore(&rq->lock, flags);
         }

-        aggregate_group_shares(tg, sd);
+        aggregate_group_shares(tg, cpu, sd);

         /*
          * ensure we never loose shares due to rounding errors in the
          * above redistribution.
          */
-        shares -= aggregate(tg, sd)->shares;
+        shares -= aggregate(tg, cpu)->shares;
         if (shares) {
-                tg->cfs_rq[sd->first_cpu]->shares += shares;
-                aggregate(tg, sd)->shares += shares;
+                tg->cfs_rq[cpu]->shares += shares;
+                aggregate(tg, cpu)->shares += shares;
         }
 }

@@ -1704,21 +1704,21 @@ void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
  * Calculate the accumulative weight and recursive load of each task group
  * while walking down the tree.
  */
-static
-void aggregate_get_down(struct task_group *tg, struct sched_domain *sd)
+static void
+aggregate_get_down(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-        aggregate_group_weight(tg, sd);
-        aggregate_group_shares(tg, sd);
-        aggregate_group_load(tg, sd);
+        aggregate_group_weight(tg, cpu, sd);
+        aggregate_group_shares(tg, cpu, sd);
+        aggregate_group_load(tg, cpu, sd);
 }

 /*
  * Rebalance the cpu shares while walking back up the tree.
  */
-static
-void aggregate_get_up(struct task_group *tg, struct sched_domain *sd)
+static void
+aggregate_get_up(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-        aggregate_group_set_shares(tg, sd);
+        aggregate_group_set_shares(tg, cpu, sd);
 }

 static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
@@ -1731,18 +1731,18 @@ static void __init init_aggregate(void)
                 spin_lock_init(&per_cpu(aggregate_lock, i));
 }

-static int get_aggregate(struct sched_domain *sd)
+static int get_aggregate(int cpu, struct sched_domain *sd)
 {
-        if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu)))
+        if (!spin_trylock(&per_cpu(aggregate_lock, cpu)))
                 return 0;

-        aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd);
+        aggregate_walk_tree(aggregate_get_down, aggregate_get_up, cpu, sd);
         return 1;
 }

-static void put_aggregate(struct sched_domain *sd)
+static void put_aggregate(int cpu, struct sched_domain *sd)
 {
-        spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu));
+        spin_unlock(&per_cpu(aggregate_lock, cpu));
 }

 static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
@@ -1756,12 +1756,12 @@ static inline void init_aggregate(void)
 {
 }

-static inline int get_aggregate(struct sched_domain *sd)
+static inline int get_aggregate(int cpu, struct sched_domain *sd)
 {
         return 0;
 }

-static inline void put_aggregate(struct sched_domain *sd)
+static inline void put_aggregate(int cpu, struct sched_domain *sd)
 {
 }
 #endif
@@ -3539,7 +3539,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,

         cpus_setall(*cpus);

-        unlock_aggregate = get_aggregate(sd);
+        unlock_aggregate = get_aggregate(this_cpu, sd);

         /*
          * When power savings policy is enabled for the parent domain, idle
@@ -3678,7 +3678,7 @@ out_one_pinned:
         ld_moved = 0;
 out:
         if (unlock_aggregate)
-                put_aggregate(sd);
+                put_aggregate(this_cpu, sd);
         return ld_moved;
 }

@@ -7292,7 +7292,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
                 SD_INIT(sd, ALLNODES);
                 set_domain_attribute(sd, attr);
                 sd->span = *cpu_map;
-                sd->first_cpu = first_cpu(sd->span);
                 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
                 p = sd;
                 sd_allnodes = 1;
@@ -7303,7 +7302,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
                 SD_INIT(sd, NODE);
                 set_domain_attribute(sd, attr);
                 sched_domain_node_span(cpu_to_node(i), &sd->span);
-                sd->first_cpu = first_cpu(sd->span);
                 sd->parent = p;
                 if (p)
                         p->child = sd;
@@ -7315,7 +7313,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
                 SD_INIT(sd, CPU);
                 set_domain_attribute(sd, attr);
                 sd->span = *nodemask;
-                sd->first_cpu = first_cpu(sd->span);
                 sd->parent = p;
                 if (p)
                         p->child = sd;
@@ -7327,7 +7324,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
                 SD_INIT(sd, MC);
                 set_domain_attribute(sd, attr);
                 sd->span = cpu_coregroup_map(i);
-                sd->first_cpu = first_cpu(sd->span);
                 cpus_and(sd->span, sd->span, *cpu_map);
                 sd->parent = p;
                 p->child = sd;
@@ -7340,7 +7336,6 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
                 SD_INIT(sd, SIBLING);
                 set_domain_attribute(sd, attr);
                 sd->span = per_cpu(cpu_sibling_map, i);
-                sd->first_cpu = first_cpu(sd->span);
                 cpus_and(sd->span, sd->span, *cpu_map);
                 sd->parent = p;
                 p->child = sd;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 509092af0330..40cf24ab4de8 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1429,11 +1429,11 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                 /*
                  * empty group
                  */
-                if (!aggregate(tg, sd)->task_weight)
+                if (!aggregate(tg, this_cpu)->task_weight)
                         continue;

-                rem_load = rem_load_move * aggregate(tg, sd)->rq_weight;
-                rem_load /= aggregate(tg, sd)->load + 1;
+                rem_load = rem_load_move * aggregate(tg, this_cpu)->rq_weight;
+                rem_load /= aggregate(tg, this_cpu)->load + 1;

                 this_weight = tg->cfs_rq[this_cpu]->task_weight;
                 busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
@@ -1451,10 +1451,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                 if (!moved_load)
                         continue;

-                move_group_shares(tg, sd, busiest_cpu, this_cpu);
+                move_group_shares(tg, this_cpu, sd, busiest_cpu, this_cpu);

-                moved_load *= aggregate(tg, sd)->load;
-                moved_load /= aggregate(tg, sd)->rq_weight + 1;
+                moved_load *= aggregate(tg, this_cpu)->load;
+                moved_load /= aggregate(tg, this_cpu)->rq_weight + 1;

                 rem_load_move -= moved_load;
                 if (rem_load_move < 0)