Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	286
1 file changed, 64 insertions(+), 222 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 716cfc8e099e..f864b751fd19 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -406,34 +406,23 @@ struct cfs_rq {
 	struct task_group *tg;	/* group that "owns" this runqueue */
 
 #ifdef CONFIG_SMP
-	unsigned long task_weight;
-	unsigned long shares;
 	/*
-	 * We need space to build a sched_domain wide view of the full task
-	 * group tree, in order to avoid depending on dynamic memory allocation
-	 * during the load balancing we place this in the per cpu task group
-	 * hierarchy. This limits the load balancing to one instance per cpu,
-	 * but more should not be needed anyway.
+	 * the part of load.weight contributed by tasks
 	 */
-	struct aggregate_struct {
-		/*
-		 *   load = weight(cpus) * f(tg)
-		 *
-		 * Where f(tg) is the recursive weight fraction assigned to
-		 * this group.
-		 */
-		unsigned long load;
+	unsigned long task_weight;
 
-		/*
-		 * part of the group weight distributed to this span.
-		 */
-		unsigned long shares;
+	/*
+	 *   h_load = weight * f(tg)
+	 *
+	 * Where f(tg) is the recursive weight fraction assigned to
+	 * this group.
+	 */
+	unsigned long h_load;
 
-		/*
-		 * The sum of all runqueue weights within this span.
-		 */
-		unsigned long rq_weight;
-	} aggregate;
+	/*
+	 * this cpu's part of tg->shares
+	 */
+	unsigned long shares;
 #endif
 #endif
 };
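For orientation, the new per-cpu fields compose top-down: a group's h_load is its parent's h_load scaled by this group's share of the parent runqueue, which is what tg_load_down() computes later in this patch. A standalone sketch of that composition, with made-up types and weights rather than kernel code (the kernel walks the tree once per node via walk_tg_tree() instead of re-recursing per child):

/*
 * Sketch of how h_load composes down the group hierarchy.
 * Toy types and numbers; not kernel code.
 */
#include <stdio.h>

struct grp {
	struct grp *parent;
	unsigned long rq_weight;	/* this group's cfs_rq load.weight on the cpu */
	unsigned long shares;		/* this cpu's part of the group's shares */
	unsigned long h_load;
};

static void compute_h_load(struct grp *g, unsigned long cpu_rq_weight)
{
	if (!g->parent) {
		g->h_load = cpu_rq_weight;	/* cpu_rq(cpu)->load.weight */
		return;
	}
	compute_h_load(g->parent, cpu_rq_weight);
	/* load = parent->h_load * shares / (parent rq weight + 1) */
	g->h_load = g->parent->h_load * g->shares / (g->parent->rq_weight + 1);
}

int main(void)
{
	struct grp root  = { .parent = NULL, .rq_weight = 2048 };
	struct grp child = { .parent = &root, .rq_weight = 1024, .shares = 1024 };

	compute_h_load(&child, 3072);
	printf("child h_load = %lu\n", child.h_load);	/* 1535, roughly half of 3072 */
	return 0;
}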
@@ -1443,47 +1432,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
-/*
- * Group load balancing.
- *
- * We calculate a few balance domain wide aggregate numbers; load and weight.
- * Given the pictures below, and assuming each item has equal weight:
- *
- *         root          1 - thread
- *        / | \           A - group
- *       A  1  B
- *      /|\   / \
- *     C 2 D 3   4
- *     |   |
- *     5   6
- *
- * load:
- *    A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
- *    which equals 1/9-th of the total load.
- *
- * shares:
- *    The weight of this group on the selected cpus.
- *
- * rq_weight:
- *    Direct sum of all the cpu's their rq weight, e.g. A would get 3 while
- *    B would get 2.
- */
-
-static inline struct aggregate_struct *
-aggregate(struct task_group *tg, int cpu)
-{
-	return &tg->cfs_rq[cpu]->aggregate;
-}
-
-typedef void (*aggregate_func)(struct task_group *, int, struct sched_domain *);
+typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
 
 /*
  * Iterate the full tree, calling @down when first entering a node and @up when
  * leaving it for the final time.
  */
-static
-void aggregate_walk_tree(aggregate_func down, aggregate_func up,
-			 int cpu, struct sched_domain *sd)
+static void
+walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
 {
 	struct task_group *parent, *child;
 
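walk_tg_tree() above is a plain pre/post-order visitor over the task_group tree: @down runs when a node is first entered, @up after all of its children have been handled. A recursive sketch of the same pattern with a toy node type, under the caveat that the kernel walk is iterative over the ->children/->siblings lists under rcu_read_lock() and passes (tg, cpu, sd) to each visitor:

/*
 * Sketch of the down/up visitor pattern used by walk_tg_tree().
 * Toy node type; not kernel code.
 */
#include <stdio.h>

struct node {
	const char *name;
	struct node *child[4];	/* NULL-terminated list of children */
};

typedef void (*visitor)(struct node *);

static void walk_tree(visitor down, visitor up, struct node *n)
{
	down(n);			/* top-down work, e.g. tg_load_down() */
	for (int i = 0; i < 4 && n->child[i]; i++)
		walk_tree(down, up, n->child[i]);
	up(n);				/* bottom-up work, e.g. tg_shares_up() */
}

static void enter(struct node *n) { printf("down %s\n", n->name); }
static void leave(struct node *n) { printf("up   %s\n", n->name); }

int main(void)
{
	struct node c = { "child", { NULL } };
	struct node root = { "root", { &c, NULL } };

	/* pass a no-op visitor (cf. tg_nop) for the unused direction */
	walk_tree(enter, leave, &root);
	return 0;
}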
@@ -1507,72 +1463,6 @@ up:
 	rcu_read_unlock();
 }
 
-/*
- * Calculate the aggregate runqueue weight.
- */
-static void
-aggregate_group_weight(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	unsigned long rq_weight = 0;
-	int i;
-
-	for_each_cpu_mask(i, sd->span)
-		rq_weight += tg->cfs_rq[i]->load.weight;
-
-	aggregate(tg, cpu)->rq_weight = rq_weight;
-}
-
-/*
- * Compute the weight of this group on the given cpus.
- */
-static void
-aggregate_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	unsigned long shares = 0;
-	int i;
-
-	for_each_cpu_mask(i, sd->span)
-		shares += tg->cfs_rq[i]->shares;
-
-	if ((!shares && aggregate(tg, cpu)->rq_weight) || shares > tg->shares)
-		shares = tg->shares;
-
-	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
-		shares = tg->shares;
-
-	aggregate(tg, cpu)->shares = shares;
-}
-
-/*
- * Compute the load fraction assigned to this group, relies on the aggregate
- * weight and this group's parent's load, i.e. top-down.
- */
-static void
-aggregate_group_load(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	unsigned long load;
-
-	if (!tg->parent) {
-		int i;
-
-		load = 0;
-		for_each_cpu_mask(i, sd->span)
-			load += cpu_rq(i)->load.weight;
-
-	} else {
-		load = aggregate(tg->parent, cpu)->load;
-
-		/*
-		 * shares is our weight in the parent's rq so
-		 * shares/parent->rq_weight gives our fraction of the load
-		 */
-		load *= aggregate(tg, cpu)->shares;
-		load /= aggregate(tg->parent, cpu)->rq_weight + 1;
-	}
-
-	aggregate(tg, cpu)->load = load;
-}
-
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 
 /*
@@ -1580,16 +1470,16 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
  */
 static void
 __update_group_shares_cpu(struct task_group *tg, int cpu,
-			  struct sched_domain *sd, int tcpu)
+			  unsigned long sd_shares, unsigned long sd_rq_weight)
 {
 	int boost = 0;
 	unsigned long shares;
 	unsigned long rq_weight;
 
-	if (!tg->se[tcpu])
+	if (!tg->se[cpu])
 		return;
 
-	rq_weight = tg->cfs_rq[tcpu]->load.weight;
+	rq_weight = tg->cfs_rq[cpu]->load.weight;
 
 	/*
 	 * If there are currently no tasks on the cpu pretend there is one of
@@ -1601,124 +1491,97 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
 		rq_weight = NICE_0_LOAD;
 	}
 
+	if (unlikely(rq_weight > sd_rq_weight))
+		rq_weight = sd_rq_weight;
+
 	/*
 	 *           \Sum shares * rq_weight
 	 * shares =  -----------------------
 	 *               \Sum rq_weight
 	 *
 	 */
-	shares = aggregate(tg, cpu)->shares * rq_weight;
-	shares /= aggregate(tg, cpu)->rq_weight + 1;
+	shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
 
 	/*
 	 * record the actual number of shares, not the boosted amount.
 	 */
-	tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
+	tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
 
 	if (shares < MIN_SHARES)
 		shares = MIN_SHARES;
 	else if (shares > MAX_SHARES)
 		shares = MAX_SHARES;
 
-	__set_se_shares(tg->se[tcpu], shares);
+	__set_se_shares(tg->se[cpu], shares);
 }
 
 /*
- * Re-adjust the weights on the cpu the task came from and on the cpu the
- * task went to.
+ * Re-compute the task group their per cpu shares over the given domain.
+ * This needs to be done in a bottom-up fashion because the rq weight of a
+ * parent group depends on the shares of its child groups.
  */
 static void
-__move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd,
-		    int scpu, int dcpu)
+tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	__update_group_shares_cpu(tg, cpu, sd, scpu);
-	__update_group_shares_cpu(tg, cpu, sd, dcpu);
-}
+	unsigned long rq_weight = 0;
+	unsigned long shares = 0;
+	int i;
 
-/*
- * Because changing a group's shares changes the weight of the super-group
- * we need to walk up the tree and change all shares until we hit the root.
- */
-static void
-move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd,
-		  int scpu, int dcpu)
-{
-	while (tg) {
-		__move_group_shares(tg, cpu, sd, scpu, dcpu);
-		tg = tg->parent;
+	for_each_cpu_mask(i, sd->span) {
+		rq_weight += tg->cfs_rq[i]->load.weight;
+		shares += tg->cfs_rq[i]->shares;
 	}
-}
 
-static void
-aggregate_group_set_shares(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	int i;
+	if ((!shares && rq_weight) || shares > tg->shares)
+		shares = tg->shares;
+
+	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
+		shares = tg->shares;
 
 	for_each_cpu_mask(i, sd->span) {
 		struct rq *rq = cpu_rq(i);
 		unsigned long flags;
 
 		spin_lock_irqsave(&rq->lock, flags);
-		__update_group_shares_cpu(tg, cpu, sd, i);
+		__update_group_shares_cpu(tg, i, shares, rq_weight);
 		spin_unlock_irqrestore(&rq->lock, flags);
 	}
-
-	aggregate_group_shares(tg, cpu, sd);
 }
 
 /*
- * Calculate the accumulative weight and recursive load of each task group
- * while walking down the tree.
+ * Compute the cpu's hierarchical load factor for each task group.
+ * This needs to be done in a top-down fashion because the load of a child
+ * group is a fraction of its parents load.
  */
 static void
-aggregate_get_down(struct task_group *tg, int cpu, struct sched_domain *sd)
+tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	aggregate_group_weight(tg, cpu, sd);
-	aggregate_group_shares(tg, cpu, sd);
-	aggregate_group_load(tg, cpu, sd);
-}
-
-/*
- * Rebalance the cpu shares while walking back up the tree.
- */
-static void
-aggregate_get_up(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	aggregate_group_set_shares(tg, cpu, sd);
-}
-
-static void
-aggregate_get_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-}
-
-static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
+	unsigned long load;
 
-static void __init init_aggregate(void)
-{
-	int i;
+	if (!tg->parent) {
+		load = cpu_rq(cpu)->load.weight;
+	} else {
+		load = tg->parent->cfs_rq[cpu]->h_load;
+		load *= tg->cfs_rq[cpu]->shares;
+		load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
+	}
 
-	for_each_possible_cpu(i)
-		spin_lock_init(&per_cpu(aggregate_lock, i));
+	tg->cfs_rq[cpu]->h_load = load;
 }
 
-static int get_aggregate(int cpu, struct sched_domain *sd)
+static void
+tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	if (!spin_trylock(&per_cpu(aggregate_lock, cpu)))
-		return 0;
-
-	aggregate_walk_tree(aggregate_get_down, aggregate_get_up, cpu, sd);
-	return 1;
 }
 
-static void update_aggregate(int cpu, struct sched_domain *sd)
+static void update_shares(struct sched_domain *sd)
 {
-	aggregate_walk_tree(aggregate_get_down, aggregate_get_nop, cpu, sd);
+	walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
 }
 
-static void put_aggregate(int cpu, struct sched_domain *sd)
+static void update_h_load(int cpu)
 {
-	spin_unlock(&per_cpu(aggregate_lock, cpu));
+	walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
 }
 
 static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
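To make the split done by tg_shares_up() and __update_group_shares_cpu() concrete: the group's tg->shares are handed out to each cpu in proportion to that cpu's runqueue weight within the domain, and the +1 in the divisor only guards against a zero denominator. A worked example with made-up inputs (two cpus and arbitrary weights, not kernel data):

/*
 * Worked example of the per-cpu shares split:
 * shares_i = sd_shares * w_i / (sum of w + 1).
 */
#include <stdio.h>

int main(void)
{
	unsigned long tg_shares = 1024;			/* tg->shares */
	unsigned long rq_weight[2] = { 3072, 1024 };	/* cfs_rq[i]->load.weight */
	unsigned long sd_rq_weight = 0;

	for (int i = 0; i < 2; i++)
		sd_rq_weight += rq_weight[i];		/* 4096 */

	for (int i = 0; i < 2; i++) {
		unsigned long shares =
			(tg_shares * rq_weight[i]) / (sd_rq_weight + 1);
		/* cpu0 gets ~768, cpu1 gets ~256: 3/4 and 1/4 of tg->shares */
		printf("cpu%d shares = %lu\n", i, shares);
	}
	return 0;
}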
@@ -1728,22 +1591,10 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 
 #else
 
-static inline void init_aggregate(void)
-{
-}
-
-static inline int get_aggregate(int cpu, struct sched_domain *sd)
-{
-	return 0;
-}
-
-static inline void update_aggregate(int cpu, struct sched_domain *sd)
+static inline void update_shares(struct sched_domain *sd)
 {
 }
 
-static inline void put_aggregate(int cpu, struct sched_domain *sd)
-{
-}
 #endif
 
 #endif
@@ -2172,12 +2023,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 	int load_idx = sd->forkexec_idx;
 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
 
-	/*
-	 * now that we have both rqs locked the rq weight won't change
-	 * anymore - so update the stats.
-	 */
-	update_aggregate(this_cpu, sd);
-
 	do {
 		unsigned long load, avg_load;
 		int local_group;
@@ -3521,12 +3366,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	unsigned long imbalance;
 	struct rq *busiest;
 	unsigned long flags;
-	int unlock_aggregate;
 
 	cpus_setall(*cpus);
 
-	unlock_aggregate = get_aggregate(this_cpu, sd);
-
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
 	 * sibling can pick up load irrespective of busy siblings. In this case,
@@ -3540,6 +3382,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	schedstat_inc(sd, lb_count[idle]);
 
 redo:
+	update_shares(sd);
 	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
 				   cpus, balance);
 
@@ -3663,8 +3506,8 @@ out_one_pinned:
 	else
 		ld_moved = 0;
 out:
-	if (unlock_aggregate)
-		put_aggregate(this_cpu, sd);
+	if (ld_moved)
+		update_shares(sd);
 	return ld_moved;
 }
 
@@ -8019,7 +7862,6 @@ void __init sched_init(void)
 	}
 
 #ifdef CONFIG_SMP
-	init_aggregate();
 	init_defrootdomain();
 #endif
 