author     Peter Zijlstra <a.p.zijlstra@chello.nl>    2008-06-27 07:41:23 -0400
committer  Ingo Molnar <mingo@elte.hu>                2008-06-27 08:31:36 -0400
commit     c8cba857b4997d5b00451d01474638f6a153f713 (patch)
tree       a784dce37d72ae20a0efb81b8e498b504a207650
parent     a25b5aca8740ea99d5e18dfc71235a52b685dcf7 (diff)
sched: simplify the group load balancer
While thinking about the previous patch - I realized that using per domain
aggregate load values in load_balance_fair() is wrong. We should use the
load value for that CPU.

By not needing per domain hierarchical load values we don't need to store
per domain aggregate shares, which greatly simplifies all the math.

It basically falls apart in two separate computations:
 - per domain update of the shares
 - per CPU update of the hierarchical load

Also get rid of the move_group_shares() stuff - just re-compute the shares
again after a successful load balance.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
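For readers skimming the patch, here is a minimal standalone C sketch of the two computations the changelog describes: the per domain distribution of a group's shares (what tg_shares_up does) and the per CPU hierarchical load (what tg_load_down does). The array names, CPU count and example weights are invented for illustration; this is not kernel code and it ignores locking, the NICE_0_LOAD boosting and the MIN_SHARES/MAX_SHARES clamps.

#include <stdio.h>

#define NCPUS 4

int main(void)
{
	/* hypothetical per-cpu runqueue weights of one task group (cfs_rq->load.weight) */
	unsigned long cpu_weight[NCPUS] = { 3072, 1024, 0, 2048 };
	unsigned long group_shares = 1024;	/* stands in for tg->shares */
	unsigned long per_cpu_shares[NCPUS];
	unsigned long rq_weight = 0;
	int i;

	/* 1) per domain update of the shares (cf. tg_shares_up):
	 *    shares_i = tg->shares * weight_i / \Sum weight
	 */
	for (i = 0; i < NCPUS; i++)
		rq_weight += cpu_weight[i];
	for (i = 0; i < NCPUS; i++)
		per_cpu_shares[i] = (group_shares * cpu_weight[i]) / (rq_weight + 1);

	/* 2) per CPU update of the hierarchical load (cf. tg_load_down):
	 *    h_load = parent->h_load * shares / parent->load.weight
	 */
	unsigned long parent_h_load = 4096;	/* example parent value */
	unsigned long parent_weight = 6144;	/* example parent rq weight on cpu 0 */
	unsigned long h_load = parent_h_load * per_cpu_shares[0] / (parent_weight + 1);

	for (i = 0; i < NCPUS; i++)
		printf("cpu%d: shares=%lu\n", i, per_cpu_shares[i]);
	printf("cpu0: h_load=%lu\n", h_load);
	return 0;
}

The "+ 1" in each divisor mirrors the guard against a zero denominator used in the patch below.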
-rw-r--r--  kernel/sched.c       | 286
-rw-r--r--  kernel/sched_fair.c  |  15
2 files changed, 72 insertions(+), 229 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 716cfc8e099e..f864b751fd19 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -406,34 +406,23 @@ struct cfs_rq {
 	struct task_group *tg;	/* group that "owns" this runqueue */
 
 #ifdef CONFIG_SMP
-	unsigned long task_weight;
-	unsigned long shares;
-	/*
-	 * We need space to build a sched_domain wide view of the full task
-	 * group tree, in order to avoid depending on dynamic memory allocation
-	 * during the load balancing we place this in the per cpu task group
-	 * hierarchy. This limits the load balancing to one instance per cpu,
-	 * but more should not be needed anyway.
-	 */
-	struct aggregate_struct {
-		/*
-		 *   load = weight(cpus) * f(tg)
-		 *
-		 * Where f(tg) is the recursive weight fraction assigned to
-		 * this group.
-		 */
-		unsigned long load;
-
-		/*
-		 * part of the group weight distributed to this span.
-		 */
-		unsigned long shares;
-
-		/*
-		 * The sum of all runqueue weights within this span.
-		 */
-		unsigned long rq_weight;
-	} aggregate;
+	/*
+	 * the part of load.weight contributed by tasks
+	 */
+	unsigned long task_weight;
+
+	/*
+	 *   h_load = weight * f(tg)
+	 *
+	 * Where f(tg) is the recursive weight fraction assigned to
+	 * this group.
+	 */
+	unsigned long h_load;
+
+	/*
+	 * this cpu's part of tg->shares
+	 */
+	unsigned long shares;
 #endif
 #endif
 };
@@ -1443,47 +1432,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
-/*
- * Group load balancing.
- *
- * We calculate a few balance domain wide aggregate numbers; load and weight.
- * Given the pictures below, and assuming each item has equal weight:
- *
- *         root          1 - thread
- *        / | \          A - group
- *       A  1  B
- *      /|\   / \
- *     C 2 D 3   4
- *     |   |
- *     5   6
- *
- * load:
- *    A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
- *    which equals 1/9-th of the total load.
- *
- * shares:
- *    The weight of this group on the selected cpus.
- *
- * rq_weight:
- *    Direct sum of all the cpu's their rq weight, e.g. A would get 3 while
- *    B would get 2.
- */
-
-static inline struct aggregate_struct *
-aggregate(struct task_group *tg, int cpu)
-{
-	return &tg->cfs_rq[cpu]->aggregate;
-}
-
-typedef void (*aggregate_func)(struct task_group *, int, struct sched_domain *);
+typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
 
 /*
  * Iterate the full tree, calling @down when first entering a node and @up when
  * leaving it for the final time.
  */
-static
-void aggregate_walk_tree(aggregate_func down, aggregate_func up,
-			 int cpu, struct sched_domain *sd)
+static void
+walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
 {
 	struct task_group *parent, *child;
 
@@ -1507,72 +1463,6 @@ up:
 	rcu_read_unlock();
 }
 
-/*
- * Calculate the aggregate runqueue weight.
- */
-static void
-aggregate_group_weight(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	unsigned long rq_weight = 0;
-	int i;
-
-	for_each_cpu_mask(i, sd->span)
-		rq_weight += tg->cfs_rq[i]->load.weight;
-
-	aggregate(tg, cpu)->rq_weight = rq_weight;
-}
-
-/*
- * Compute the weight of this group on the given cpus.
- */
-static void
-aggregate_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	unsigned long shares = 0;
-	int i;
-
-	for_each_cpu_mask(i, sd->span)
-		shares += tg->cfs_rq[i]->shares;
-
-	if ((!shares && aggregate(tg, cpu)->rq_weight) || shares > tg->shares)
-		shares = tg->shares;
-
-	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
-		shares = tg->shares;
-
-	aggregate(tg, cpu)->shares = shares;
-}
-
-/*
- * Compute the load fraction assigned to this group, relies on the aggregate
- * weight and this group's parent's load, i.e. top-down.
- */
-static void
-aggregate_group_load(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	unsigned long load;
-
-	if (!tg->parent) {
-		int i;
-
-		load = 0;
-		for_each_cpu_mask(i, sd->span)
-			load += cpu_rq(i)->load.weight;
-
-	} else {
-		load = aggregate(tg->parent, cpu)->load;
-
-		/*
-		 * shares is our weight in the parent's rq so
-		 * shares/parent->rq_weight gives our fraction of the load
-		 */
-		load *= aggregate(tg, cpu)->shares;
-		load /= aggregate(tg->parent, cpu)->rq_weight + 1;
-	}
-
-	aggregate(tg, cpu)->load = load;
-}
-
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 
 /*
@@ -1580,16 +1470,16 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
  */
 static void
 __update_group_shares_cpu(struct task_group *tg, int cpu,
-			  struct sched_domain *sd, int tcpu)
+			  unsigned long sd_shares, unsigned long sd_rq_weight)
 {
 	int boost = 0;
 	unsigned long shares;
 	unsigned long rq_weight;
 
-	if (!tg->se[tcpu])
+	if (!tg->se[cpu])
 		return;
 
-	rq_weight = tg->cfs_rq[tcpu]->load.weight;
+	rq_weight = tg->cfs_rq[cpu]->load.weight;
 
 	/*
 	 * If there are currently no tasks on the cpu pretend there is one of
@@ -1601,124 +1491,97 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
 		rq_weight = NICE_0_LOAD;
 	}
 
+	if (unlikely(rq_weight > sd_rq_weight))
+		rq_weight = sd_rq_weight;
+
 	/*
 	 *           \Sum shares * rq_weight
 	 * shares =  -----------------------
 	 *               \Sum rq_weight
 	 *
 	 */
-	shares = aggregate(tg, cpu)->shares * rq_weight;
-	shares /= aggregate(tg, cpu)->rq_weight + 1;
+	shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
 
 	/*
 	 * record the actual number of shares, not the boosted amount.
 	 */
-	tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
+	tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
 
 	if (shares < MIN_SHARES)
 		shares = MIN_SHARES;
 	else if (shares > MAX_SHARES)
 		shares = MAX_SHARES;
 
-	__set_se_shares(tg->se[tcpu], shares);
+	__set_se_shares(tg->se[cpu], shares);
 }
 
 /*
- * Re-adjust the weights on the cpu the task came from and on the cpu the
- * task went to.
+ * Re-compute the task group their per cpu shares over the given domain.
+ * This needs to be done in a bottom-up fashion because the rq weight of a
+ * parent group depends on the shares of its child groups.
  */
 static void
-__move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd,
-		    int scpu, int dcpu)
+tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	__update_group_shares_cpu(tg, cpu, sd, scpu);
-	__update_group_shares_cpu(tg, cpu, sd, dcpu);
-}
+	unsigned long rq_weight = 0;
+	unsigned long shares = 0;
+	int i;
 
-/*
- * Because changing a group's shares changes the weight of the super-group
- * we need to walk up the tree and change all shares until we hit the root.
- */
-static void
-move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd,
-		  int scpu, int dcpu)
-{
-	while (tg) {
-		__move_group_shares(tg, cpu, sd, scpu, dcpu);
-		tg = tg->parent;
+	for_each_cpu_mask(i, sd->span) {
+		rq_weight += tg->cfs_rq[i]->load.weight;
+		shares += tg->cfs_rq[i]->shares;
 	}
-}
 
-static void
-aggregate_group_set_shares(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	int i;
+	if ((!shares && rq_weight) || shares > tg->shares)
+		shares = tg->shares;
+
+	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
+		shares = tg->shares;
 
 	for_each_cpu_mask(i, sd->span) {
 		struct rq *rq = cpu_rq(i);
 		unsigned long flags;
 
 		spin_lock_irqsave(&rq->lock, flags);
-		__update_group_shares_cpu(tg, cpu, sd, i);
+		__update_group_shares_cpu(tg, i, shares, rq_weight);
 		spin_unlock_irqrestore(&rq->lock, flags);
 	}
-
-	aggregate_group_shares(tg, cpu, sd);
 }
 
 /*
- * Calculate the accumulative weight and recursive load of each task group
- * while walking down the tree.
+ * Compute the cpu's hierarchical load factor for each task group.
+ * This needs to be done in a top-down fashion because the load of a child
+ * group is a fraction of its parents load.
  */
 static void
-aggregate_get_down(struct task_group *tg, int cpu, struct sched_domain *sd)
+tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	aggregate_group_weight(tg, cpu, sd);
-	aggregate_group_shares(tg, cpu, sd);
-	aggregate_group_load(tg, cpu, sd);
-}
-
-/*
- * Rebalance the cpu shares while walking back up the tree.
- */
-static void
-aggregate_get_up(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	aggregate_group_set_shares(tg, cpu, sd);
-}
-
-static void
-aggregate_get_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-}
-
-static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
+	unsigned long load;
 
-static void __init init_aggregate(void)
-{
-	int i;
+	if (!tg->parent) {
+		load = cpu_rq(cpu)->load.weight;
+	} else {
+		load = tg->parent->cfs_rq[cpu]->h_load;
+		load *= tg->cfs_rq[cpu]->shares;
+		load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
+	}
 
-	for_each_possible_cpu(i)
-		spin_lock_init(&per_cpu(aggregate_lock, i));
+	tg->cfs_rq[cpu]->h_load = load;
 }
 
-static int get_aggregate(int cpu, struct sched_domain *sd)
+static void
+tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	if (!spin_trylock(&per_cpu(aggregate_lock, cpu)))
-		return 0;
-
-	aggregate_walk_tree(aggregate_get_down, aggregate_get_up, cpu, sd);
-	return 1;
 }
 
-static void update_aggregate(int cpu, struct sched_domain *sd)
+static void update_shares(struct sched_domain *sd)
 {
-	aggregate_walk_tree(aggregate_get_down, aggregate_get_nop, cpu, sd);
+	walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
 }
 
-static void put_aggregate(int cpu, struct sched_domain *sd)
+static void update_h_load(int cpu)
 {
-	spin_unlock(&per_cpu(aggregate_lock, cpu));
+	walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
 }
 
 static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
@@ -1728,22 +1591,10 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 
 #else
 
-static inline void init_aggregate(void)
-{
-}
-
-static inline int get_aggregate(int cpu, struct sched_domain *sd)
-{
-	return 0;
-}
-
-static inline void update_aggregate(int cpu, struct sched_domain *sd)
+static inline void update_shares(struct sched_domain *sd)
 {
 }
 
-static inline void put_aggregate(int cpu, struct sched_domain *sd)
-{
-}
 #endif
 
 #endif
@@ -2172,12 +2023,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 	int load_idx = sd->forkexec_idx;
 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
 
-	/*
-	 * now that we have both rqs locked the rq weight won't change
-	 * anymore - so update the stats.
-	 */
-	update_aggregate(this_cpu, sd);
-
 	do {
 		unsigned long load, avg_load;
 		int local_group;
@@ -3521,12 +3366,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	unsigned long imbalance;
 	struct rq *busiest;
 	unsigned long flags;
-	int unlock_aggregate;
 
 	cpus_setall(*cpus);
 
-	unlock_aggregate = get_aggregate(this_cpu, sd);
-
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
 	 * sibling can pick up load irrespective of busy siblings. In this case,
@@ -3540,6 +3382,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	schedstat_inc(sd, lb_count[idle]);
 
 redo:
+	update_shares(sd);
 	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
 				   cpus, balance);
 
@@ -3663,8 +3506,8 @@ out_one_pinned:
 	else
 		ld_moved = 0;
 out:
-	if (unlock_aggregate)
-		put_aggregate(this_cpu, sd);
+	if (ld_moved)
+		update_shares(sd);
 	return ld_moved;
 }
 
@@ -8019,7 +7862,6 @@ void __init sched_init(void)
 	}
 
 #ifdef CONFIG_SMP
-	init_aggregate();
 	init_defrootdomain();
 #endif
 
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 03b9fbd9d648..7b8d664d6f22 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1421,17 +1421,20 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	struct task_group *tg;
 
 	rcu_read_lock();
+	update_h_load(busiest_cpu);
+
 	list_for_each_entry(tg, &task_groups, list) {
+		struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
 		long rem_load, moved_load;
 
 		/*
 		 * empty group
 		 */
-		if (!tg->cfs_rq[busiest_cpu]->task_weight)
+		if (!busiest_cfs_rq->task_weight)
 			continue;
 
-		rem_load = rem_load_move * aggregate(tg, this_cpu)->rq_weight;
-		rem_load /= aggregate(tg, this_cpu)->load + 1;
+		rem_load = rem_load_move * busiest_cfs_rq->load.weight;
+		rem_load /= busiest_cfs_rq->h_load + 1;
 
 		moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
 				rem_load, sd, idle, all_pinned, this_best_prio,
@@ -1440,10 +1443,8 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
 		if (!moved_load)
 			continue;
 
-		move_group_shares(tg, this_cpu, sd, busiest_cpu, this_cpu);
-
-		moved_load *= aggregate(tg, this_cpu)->load;
-		moved_load /= aggregate(tg, this_cpu)->rq_weight + 1;
+		moved_load *= busiest_cfs_rq->h_load;
+		moved_load /= busiest_cfs_rq->load.weight + 1;
 
 		rem_load_move -= moved_load;
 		if (rem_load_move < 0)