Diffstat (limited to 'kernel/sched.c')
 kernel/sched.c | 286 +++++++--------------------------
 1 file changed, 64 insertions(+), 222 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 716cfc8e099e..f864b751fd19 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -406,34 +406,23 @@ struct cfs_rq {
 	struct task_group *tg;	/* group that "owns" this runqueue */
 
 #ifdef CONFIG_SMP
-	unsigned long task_weight;
-	unsigned long shares;
 	/*
-	 * We need space to build a sched_domain wide view of the full task
-	 * group tree, in order to avoid depending on dynamic memory allocation
-	 * during the load balancing we place this in the per cpu task group
-	 * hierarchy. This limits the load balancing to one instance per cpu,
-	 * but more should not be needed anyway.
+	 * the part of load.weight contributed by tasks
 	 */
-	struct aggregate_struct {
-		/*
-		 * load = weight(cpus) * f(tg)
-		 *
-		 * Where f(tg) is the recursive weight fraction assigned to
-		 * this group.
-		 */
-		unsigned long load;
+	unsigned long task_weight;
 
-		/*
-		 * part of the group weight distributed to this span.
-		 */
-		unsigned long shares;
+	/*
+	 * h_load = weight * f(tg)
+	 *
+	 * Where f(tg) is the recursive weight fraction assigned to
+	 * this group.
+	 */
+	unsigned long h_load;
 
-		/*
-		 * The sum of all runqueue weights within this span.
-		 */
-		unsigned long rq_weight;
-	} aggregate;
+	/*
+	 * this cpu's part of tg->shares
+	 */
+	unsigned long shares;
 #endif
 #endif
 };
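
[Illustration, not part of the patch] The new per-cpu h_load field carries the "weight * f(tg)" value described in the comment above, computed top-down by tg_load_down() later in this patch. A minimal user-space sketch of that computation, with invented group names and weights; the "+ 1" in the divisor mirrors the patch's guard against a zero denominator:

/*
 * Toy model of the hierarchical load (h_load) idea: each level scales its
 * parent's load by shares / parent_rq_weight.  Not kernel code.
 */
#include <stdio.h>

struct grp {
	const char *name;
	struct grp *parent;
	unsigned long shares;		/* this cpu's part of the group's shares */
	unsigned long rq_weight;	/* this cpu's runqueue weight for the group */
	unsigned long h_load;		/* computed below, parents first */
};

static void compute_h_load(struct grp *g)
{
	if (!g->parent)
		g->h_load = g->rq_weight;	/* root: just the cpu's rq weight */
	else
		g->h_load = g->parent->h_load * g->shares
				/ (g->parent->rq_weight + 1);
	printf("%-4s h_load=%lu\n", g->name, g->h_load);
}

int main(void)
{
	struct grp root = { "root", NULL,  0,    3072, 0 };
	struct grp A    = { "A",    &root, 1024, 2048, 0 };
	struct grp C    = { "C",    &A,    512,  1024, 0 };

	compute_h_load(&root);	/* 3072 */
	compute_h_load(&A);	/* ~1023: about a third of the root load */
	compute_h_load(&C);	/* ~255: about a quarter of A's load */
	return 0;
}
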
@@ -1443,47 +1432,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
-/*
- * Group load balancing.
- *
- * We calculate a few balance domain wide aggregate numbers; load and weight.
- * Given the pictures below, and assuming each item has equal weight:
- *
- *         root          1 - thread
- *         / | \         A - group
- *        A  1  B
- *       /|\   / \
- *      C 2 D 3   4
- *      |   |
- *      5   6
- *
- * load:
- *    A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
- *    which equals 1/9-th of the total load.
- *
- * shares:
- *    The weight of this group on the selected cpus.
- *
- * rq_weight:
- *    Direct sum of all the cpu's their rq weight, e.g. A would get 3 while
- *    B would get 2.
- */
-
-static inline struct aggregate_struct *
-aggregate(struct task_group *tg, int cpu)
-{
-	return &tg->cfs_rq[cpu]->aggregate;
-}
-
-typedef void (*aggregate_func)(struct task_group *, int, struct sched_domain *);
+typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
 
 /*
  * Iterate the full tree, calling @down when first entering a node and @up when
  * leaving it for the final time.
  */
-static
-void aggregate_walk_tree(aggregate_func down, aggregate_func up,
-			 int cpu, struct sched_domain *sd)
+static void
+walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
 {
 	struct task_group *parent, *child;
 
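
[Illustration, not part of the patch] walk_tg_tree() calls @down when a group is first entered and @up when it is left for the final time, so every parent sees @down before its children and @up after them. A toy model of that visiting order (recursive here, unlike the kernel's iterative walk; the tree and names are invented):

#include <stdio.h>

struct node {
	const char *name;
	struct node *child[2];
};

typedef void (*visitor)(struct node *);

/* call @down on the way in, @up on the way out, like walk_tg_tree() */
static void walk(struct node *n, visitor down, visitor up)
{
	int i;

	down(n);
	for (i = 0; i < 2; i++)
		if (n->child[i])
			walk(n->child[i], down, up);
	up(n);
}

static void print_down(struct node *n) { printf("down %s\n", n->name); }
static void print_up(struct node *n)   { printf("up   %s\n", n->name); }

int main(void)
{
	struct node C    = { "C",    { NULL, NULL } };
	struct node A    = { "A",    { &C, NULL } };
	struct node B    = { "B",    { NULL, NULL } };
	struct node root = { "root", { &A, &B } };

	walk(&root, print_down, print_up);
	return 0;
}

This ordering is why, further down in the patch, tg_load_down() runs as a @down visitor (a child's load needs its parent's load first) and tg_shares_up() as an @up visitor (a parent's rq weight depends on the shares its children were just given).
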
@@ -1507,72 +1463,6 @@ up:
 	rcu_read_unlock();
 }
 
-/*
- * Calculate the aggregate runqueue weight.
- */
-static void
-aggregate_group_weight(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	unsigned long rq_weight = 0;
-	int i;
-
-	for_each_cpu_mask(i, sd->span)
-		rq_weight += tg->cfs_rq[i]->load.weight;
-
-	aggregate(tg, cpu)->rq_weight = rq_weight;
-}
-
-/*
- * Compute the weight of this group on the given cpus.
- */
-static void
-aggregate_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	unsigned long shares = 0;
-	int i;
-
-	for_each_cpu_mask(i, sd->span)
-		shares += tg->cfs_rq[i]->shares;
-
-	if ((!shares && aggregate(tg, cpu)->rq_weight) || shares > tg->shares)
-		shares = tg->shares;
-
-	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
-		shares = tg->shares;
-
-	aggregate(tg, cpu)->shares = shares;
-}
-
-/*
- * Compute the load fraction assigned to this group, relies on the aggregate
- * weight and this group's parent's load, i.e. top-down.
- */
-static void
-aggregate_group_load(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	unsigned long load;
-
-	if (!tg->parent) {
-		int i;
-
-		load = 0;
-		for_each_cpu_mask(i, sd->span)
-			load += cpu_rq(i)->load.weight;
-
-	} else {
-		load = aggregate(tg->parent, cpu)->load;
-
-		/*
-		 * shares is our weight in the parent's rq so
-		 * shares/parent->rq_weight gives our fraction of the load
-		 */
-		load *= aggregate(tg, cpu)->shares;
-		load /= aggregate(tg->parent, cpu)->rq_weight + 1;
-	}
-
-	aggregate(tg, cpu)->load = load;
-}
-
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 
 /*
@@ -1580,16 +1470,16 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
  */
 static void
 __update_group_shares_cpu(struct task_group *tg, int cpu,
-			  struct sched_domain *sd, int tcpu)
+			  unsigned long sd_shares, unsigned long sd_rq_weight)
 {
 	int boost = 0;
 	unsigned long shares;
 	unsigned long rq_weight;
 
-	if (!tg->se[tcpu])
+	if (!tg->se[cpu])
 		return;
 
-	rq_weight = tg->cfs_rq[tcpu]->load.weight;
+	rq_weight = tg->cfs_rq[cpu]->load.weight;
 
 	/*
 	 * If there are currently no tasks on the cpu pretend there is one of
@@ -1601,124 +1491,97 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
 		rq_weight = NICE_0_LOAD;
 	}
 
+	if (unlikely(rq_weight > sd_rq_weight))
+		rq_weight = sd_rq_weight;
+
 	/*
 	 *           \Sum shares * rq_weight
 	 * shares =  -----------------------
 	 *               \Sum rq_weight
 	 *
 	 */
-	shares = aggregate(tg, cpu)->shares * rq_weight;
-	shares /= aggregate(tg, cpu)->rq_weight + 1;
+	shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
 
 	/*
 	 * record the actual number of shares, not the boosted amount.
 	 */
-	tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
+	tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
 
 	if (shares < MIN_SHARES)
 		shares = MIN_SHARES;
 	else if (shares > MAX_SHARES)
 		shares = MAX_SHARES;
 
-	__set_se_shares(tg->se[tcpu], shares);
+	__set_se_shares(tg->se[cpu], shares);
 }
 
 /*
- * Re-adjust the weights on the cpu the task came from and on the cpu the
- * task went to.
+ * Re-compute the task group their per cpu shares over the given domain.
+ * This needs to be done in a bottom-up fashion because the rq weight of a
+ * parent group depends on the shares of its child groups.
  */
 static void
-__move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd,
-		    int scpu, int dcpu)
+tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	__update_group_shares_cpu(tg, cpu, sd, scpu);
-	__update_group_shares_cpu(tg, cpu, sd, dcpu);
-}
+	unsigned long rq_weight = 0;
+	unsigned long shares = 0;
+	int i;
 
-/*
- * Because changing a group's shares changes the weight of the super-group
- * we need to walk up the tree and change all shares until we hit the root.
- */
-static void
-move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd,
-		  int scpu, int dcpu)
-{
-	while (tg) {
-		__move_group_shares(tg, cpu, sd, scpu, dcpu);
-		tg = tg->parent;
+	for_each_cpu_mask(i, sd->span) {
+		rq_weight += tg->cfs_rq[i]->load.weight;
+		shares += tg->cfs_rq[i]->shares;
 	}
-}
 
-static void
-aggregate_group_set_shares(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	int i;
+	if ((!shares && rq_weight) || shares > tg->shares)
+		shares = tg->shares;
+
+	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
+		shares = tg->shares;
 
 	for_each_cpu_mask(i, sd->span) {
 		struct rq *rq = cpu_rq(i);
 		unsigned long flags;
 
 		spin_lock_irqsave(&rq->lock, flags);
-		__update_group_shares_cpu(tg, cpu, sd, i);
+		__update_group_shares_cpu(tg, i, shares, rq_weight);
 		spin_unlock_irqrestore(&rq->lock, flags);
 	}
-
-	aggregate_group_shares(tg, cpu, sd);
 }
 
 /*
- * Calculate the accumulative weight and recursive load of each task group
- * while walking down the tree.
+ * Compute the cpu's hierarchical load factor for each task group.
+ * This needs to be done in a top-down fashion because the load of a child
+ * group is a fraction of its parents load.
  */
 static void
-aggregate_get_down(struct task_group *tg, int cpu, struct sched_domain *sd)
+tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	aggregate_group_weight(tg, cpu, sd);
-	aggregate_group_shares(tg, cpu, sd);
-	aggregate_group_load(tg, cpu, sd);
-}
-
-/*
- * Rebalance the cpu shares while walking back up the tree.
- */
-static void
-aggregate_get_up(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	aggregate_group_set_shares(tg, cpu, sd);
-}
-
-static void
-aggregate_get_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-}
-
-static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
+	unsigned long load;
 
-static void __init init_aggregate(void)
-{
-	int i;
+	if (!tg->parent) {
+		load = cpu_rq(cpu)->load.weight;
+	} else {
+		load = tg->parent->cfs_rq[cpu]->h_load;
+		load *= tg->cfs_rq[cpu]->shares;
+		load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
+	}
 
-	for_each_possible_cpu(i)
-		spin_lock_init(&per_cpu(aggregate_lock, i));
+	tg->cfs_rq[cpu]->h_load = load;
 }
 
-static int get_aggregate(int cpu, struct sched_domain *sd)
+static void
+tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	if (!spin_trylock(&per_cpu(aggregate_lock, cpu)))
-		return 0;
-
-	aggregate_walk_tree(aggregate_get_down, aggregate_get_up, cpu, sd);
-	return 1;
 }
 
-static void update_aggregate(int cpu, struct sched_domain *sd)
+static void update_shares(struct sched_domain *sd)
 {
-	aggregate_walk_tree(aggregate_get_down, aggregate_get_nop, cpu, sd);
+	walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
 }
 
-static void put_aggregate(int cpu, struct sched_domain *sd)
+static void update_h_load(int cpu)
 {
-	spin_unlock(&per_cpu(aggregate_lock, cpu));
+	walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
 }
 
 static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
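
[Illustration, not part of the patch] tg_shares_up() sums rq_weight and shares over the domain span, then __update_group_shares_cpu() gives each cpu the slice tg->shares * rq_weight[cpu] / \Sum rq_weight. A stand-alone sketch with invented numbers; it leaves out the NICE_0_LOAD "boost" the real code applies to cpus with no tasks:

#include <stdio.h>

#define NR_CPUS 4

int main(void)
{
	unsigned long tg_shares = 1024;		/* group weight to split across cpus */
	unsigned long rq_weight[NR_CPUS] = { 2048, 1024, 1024, 0 };
	unsigned long sum = 0, shares;
	int i;

	for (i = 0; i < NR_CPUS; i++)
		sum += rq_weight[i];

	for (i = 0; i < NR_CPUS; i++) {
		/* the "+ 1" mirrors the kernel's guard against dividing by zero */
		shares = (tg_shares * rq_weight[i]) / (sum + 1);
		printf("cpu%d: rq_weight=%4lu -> shares=%lu\n",
		       i, rq_weight[i], shares);
	}
	return 0;
}

A cpu carrying half of the group's runqueue weight ends up with roughly half of tg->shares (511 of 1024 here), which is the per-cpu cfs_rq->shares value that tg_load_down() then folds into h_load.
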
@@ -1728,22 +1591,10 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 
 #else
 
-static inline void init_aggregate(void)
-{
-}
-
-static inline int get_aggregate(int cpu, struct sched_domain *sd)
-{
-	return 0;
-}
-
-static inline void update_aggregate(int cpu, struct sched_domain *sd)
+static inline void update_shares(struct sched_domain *sd)
 {
 }
 
-static inline void put_aggregate(int cpu, struct sched_domain *sd)
-{
-}
 #endif
 
 #endif
@@ -2172,12 +2023,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 	int load_idx = sd->forkexec_idx;
 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
 
-	/*
-	 * now that we have both rqs locked the rq weight won't change
-	 * anymore - so update the stats.
-	 */
-	update_aggregate(this_cpu, sd);
-
 	do {
 		unsigned long load, avg_load;
 		int local_group;
@@ -3521,12 +3366,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	unsigned long imbalance;
 	struct rq *busiest;
 	unsigned long flags;
-	int unlock_aggregate;
 
 	cpus_setall(*cpus);
 
-	unlock_aggregate = get_aggregate(this_cpu, sd);
-
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
 	 * sibling can pick up load irrespective of busy siblings. In this case,
@@ -3540,6 +3382,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	schedstat_inc(sd, lb_count[idle]);
 
 redo:
+	update_shares(sd);
 	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
 				   cpus, balance);
 
@@ -3663,8 +3506,8 @@ out_one_pinned:
 	else
 		ld_moved = 0;
 out:
-	if (unlock_aggregate)
-		put_aggregate(this_cpu, sd);
+	if (ld_moved)
+		update_shares(sd);
 	return ld_moved;
 }
 
@@ -8019,7 +7862,6 @@ void __init sched_init(void)
 	}
 
 #ifdef CONFIG_SMP
-	init_aggregate();
 	init_defrootdomain();
 #endif
 