author		Peter Zijlstra <a.p.zijlstra@chello.nl>	2010-11-15 18:47:00 -0500
committer	Ingo Molnar <mingo@elte.hu>			2010-11-18 07:27:46 -0500
commit		2069dd75c7d0f49355939e5586daf5a9ab216db7 (patch)
tree		c221747420e47b194a2a634024438a55420224d5 /kernel/sched.c
parent		48c5ccae88dcd989d9de507e8510313c6cbd352b (diff)
sched: Rewrite tg_shares_up
By tracking a per-cpu load-avg for each cfs_rq and folding it into a
global task_group load on each tick we can rework tg_shares_up to be
strictly per-cpu.
This should improve cpu-cgroup performance for smp systems
significantly.
[ Paul: changed to use queueing cfs_rq + bug fixes ]
Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20101115234937.580480400@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
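To make the folding scheme described in the changelog concrete, here is a minimal, self-contained C sketch (user-space, not kernel code): each per-cpu cfs_rq keeps a local load average plus the contribution it last reported, and only the delta since that report is added to the group-wide total, which is what makes the update strictly per-cpu. The names fake_cfs_rq, fake_task_group and fold_cpu_load are illustrative stand-ins; the real patch uses the load_avg, load_period, load_contribution and tg->load_weight fields visible in the diff below, under rq->lock and with atomic_add().

/*
 * Sketch of the per-cpu load folding introduced by this patch.
 * Not kernel code: locking, clocks and fixed-point details omitted.
 */
#include <stdint.h>
#include <stdio.h>

struct fake_cfs_rq {
        uint64_t load_avg;              /* accumulated load over time        */
        uint64_t load_period;           /* length of time the average covers */
        int64_t  load_contribution;     /* what this cpu last told the group */
};

struct fake_task_group {
        int64_t load_weight;            /* sum of all per-cpu contributions  */
};

/* Fold one cpu's current average into the group total, using only local state. */
static void fold_cpu_load(struct fake_task_group *tg, struct fake_cfs_rq *cfs_rq)
{
        /* current average; the +1 avoids dividing by zero, as in the patch */
        int64_t load_avg = cfs_rq->load_avg / (cfs_rq->load_period + 1);

        /* only the change since the previous fold is propagated */
        int64_t delta = load_avg - cfs_rq->load_contribution;

        tg->load_weight += delta;               /* atomic_add() in the kernel */
        cfs_rq->load_contribution += delta;
}

int main(void)
{
        struct fake_task_group tg = { 0 };
        struct fake_cfs_rq cpu0 = { .load_avg = 4096 * 10, .load_period = 9 };

        fold_cpu_load(&tg, &cpu0);      /* first fold contributes 4096        */
        cpu0.load_avg += 4096 * 5;      /* more load observed at the same rate */
        cpu0.load_period += 5;
        fold_cpu_load(&tg, &cpu0);      /* average unchanged: delta is 0      */

        printf("tg.load_weight = %lld\n", (long long)tg.load_weight);
        return 0;
}

Running the sketch prints tg.load_weight = 4096; the second fold adds nothing because the cpu's average did not change, so the group total is touched only when a cpu's contribution actually moves.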
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	173
1 file changed, 44 insertions(+), 129 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index b0d5f1b24a39..e2f1a3024a99 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -253,6 +253,8 @@ struct task_group {
 	/* runqueue "owned" by this group on each cpu */
 	struct cfs_rq **cfs_rq;
 	unsigned long shares;
+
+	atomic_t load_weight;
 #endif
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -359,15 +361,11 @@ struct cfs_rq {
 	 */
 	unsigned long h_load;
 
-	/*
-	 * this cpu's part of tg->shares
-	 */
-	unsigned long shares;
+	u64 load_avg;
+	u64 load_period;
+	u64 load_stamp;
 
-	/*
-	 * load.weight at the time we set shares
-	 */
-	unsigned long rq_weight;
+	unsigned long load_contribution;
 #endif
 #endif
 };
@@ -807,20 +805,6 @@ late_initcall(sched_init_debug);
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 
 /*
- * ratelimit for updating the group shares.
- * default: 0.25ms
- */
-unsigned int sysctl_sched_shares_ratelimit = 250000;
-unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
-
-/*
- * Inject some fuzzyness into changing the per-cpu group shares
- * this avoids remote rq-locks at the expense of fairness.
- * default: 4
- */
-unsigned int sysctl_sched_shares_thresh = 4;
-
-/*
  * period over which we average the RT time consumption, measured
  * in ms.
  *
@@ -1369,6 +1353,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
 	lw->inv_weight = 0;
 }
 
+static inline void update_load_set(struct load_weight *lw, unsigned long w)
+{
+	lw->weight = w;
+	lw->inv_weight = 0;
+}
+
 /*
  * To aid in avoiding the subversion of "niceness" due to uneven distribution
  * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1557,97 +1547,44 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
-static __read_mostly unsigned long __percpu *update_shares_data;
-
-static void __set_se_shares(struct sched_entity *se, unsigned long shares);
-
-/*
- * Calculate and set the cpu's group shares.
- */
-static void update_group_shares_cpu(struct task_group *tg, int cpu,
-				    unsigned long sd_shares,
-				    unsigned long sd_rq_weight,
-				    unsigned long *usd_rq_weight)
-{
-	unsigned long shares, rq_weight;
-	int boost = 0;
-
-	rq_weight = usd_rq_weight[cpu];
-	if (!rq_weight) {
-		boost = 1;
-		rq_weight = NICE_0_LOAD;
-	}
-
-	/*
-	 *             \Sum_j shares_j * rq_weight_i
-	 * shares_i =  -----------------------------
-	 *                  \Sum_j rq_weight_j
-	 */
-	shares = (sd_shares * rq_weight) / sd_rq_weight;
-	shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
-
-	if (abs(shares - tg->se[cpu]->load.weight) >
-			sysctl_sched_shares_thresh) {
-		struct rq *rq = cpu_rq(cpu);
-		unsigned long flags;
-
-		raw_spin_lock_irqsave(&rq->lock, flags);
-		tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
-		tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-		__set_se_shares(tg->se[cpu], shares);
-		raw_spin_unlock_irqrestore(&rq->lock, flags);
-	}
-}
+static void update_cfs_load(struct cfs_rq *cfs_rq);
+static void update_cfs_shares(struct cfs_rq *cfs_rq);
 
 /*
- * Re-compute the task group their per cpu shares over the given domain.
- * This needs to be done in a bottom-up fashion because the rq weight of a
- * parent group depends on the shares of its child groups.
+ * update tg->load_weight by folding this cpu's load_avg
  */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-	unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
-	unsigned long *usd_rq_weight;
-	struct sched_domain *sd = data;
+	long load_avg;
+	struct cfs_rq *cfs_rq;
 	unsigned long flags;
-	int i;
+	int cpu = (long)data;
+	struct rq *rq;
 
-	if (!tg->se[0])
+	if (!tg->se[cpu])
 		return 0;
 
-	local_irq_save(flags);
-	usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
-
-	for_each_cpu(i, sched_domain_span(sd)) {
-		weight = tg->cfs_rq[i]->load.weight;
-		usd_rq_weight[i] = weight;
-
-		rq_weight += weight;
-		/*
-		 * If there are currently no tasks on the cpu pretend there
-		 * is one of average load so that when a new task gets to
-		 * run here it will not get delayed by group starvation.
-		 */
-		if (!weight)
-			weight = NICE_0_LOAD;
+	rq = cpu_rq(cpu);
+	cfs_rq = tg->cfs_rq[cpu];
 
-		sum_weight += weight;
-		shares += tg->cfs_rq[i]->shares;
-	}
+	raw_spin_lock_irqsave(&rq->lock, flags);
 
-	if (!rq_weight)
-		rq_weight = sum_weight;
+	update_rq_clock(rq);
+	update_cfs_load(cfs_rq);
 
-	if ((!shares && rq_weight) || shares > tg->shares)
-		shares = tg->shares;
+	load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
+	load_avg -= cfs_rq->load_contribution;
 
-	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
-		shares = tg->shares;
+	atomic_add(load_avg, &tg->load_weight);
+	cfs_rq->load_contribution += load_avg;
 
-	for_each_cpu(i, sched_domain_span(sd))
-		update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
+	/*
+	 * We need to update shares after updating tg->load_weight in
+	 * order to adjust the weight of groups with long running tasks.
+	 */
+	update_cfs_shares(cfs_rq);
 
-	local_irq_restore(flags);
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
 
 	return 0;
 }
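The rewritten tg_shares_up() above is where the folding happens: the cpu-local average load_avg / (load_period + 1) is compared with load_contribution, the value this cfs_rq last reported, and only the difference is added to tg->load_weight with atomic_add() before update_cfs_shares() recomputes this cpu's share. As a worked example with assumed numbers: if the average comes out to 2048 and the previous contribution was 1536, only +512 is folded in and load_contribution becomes 2048, so the per-cpu contributions always sum to tg->load_weight, which is what lets the update be done strictly per cpu as the changelog describes.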
@@ -1666,7 +1603,7 @@ static int tg_load_down(struct task_group *tg, void *data)
 		load = cpu_rq(cpu)->load.weight;
 	} else {
 		load = tg->parent->cfs_rq[cpu]->h_load;
-		load *= tg->cfs_rq[cpu]->shares;
+		load *= tg->se[cpu]->load.weight;
 		load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
 	}
 
@@ -1675,21 +1612,16 @@ static int tg_load_down(struct task_group *tg, void *data)
 	return 0;
 }
 
-static void update_shares(struct sched_domain *sd)
+static void update_shares(long cpu)
 {
-	s64 elapsed;
-	u64 now;
-
 	if (root_task_group_empty())
 		return;
 
-	now = local_clock();
-	elapsed = now - sd->last_update;
+	/*
+	 * XXX: replace with an on-demand list
+	 */
 
-	if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
-		sd->last_update = now;
-		walk_tg_tree(tg_nop, tg_shares_up, sd);
-	}
+	walk_tg_tree(tg_nop, tg_shares_up, (void *)cpu);
 }
 
 static void update_h_load(long cpu)
@@ -1699,7 +1631,7 @@ static void update_h_load(long cpu)
 
 #else
 
-static inline void update_shares(struct sched_domain *sd)
+static inline void update_shares(int cpu)
 {
 }
 
@@ -1824,15 +1756,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 
 #endif
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
-{
-#ifdef CONFIG_SMP
-	cfs_rq->shares = shares;
-#endif
-}
-#endif
-
 static void calc_load_account_idle(struct rq *this_rq);
 static void update_sysctl(void);
 static int get_update_sysctl_factor(void);
@@ -5551,7 +5474,6 @@ static void update_sysctl(void)
 	SET_SYSCTL(sched_min_granularity);
 	SET_SYSCTL(sched_latency);
 	SET_SYSCTL(sched_wakeup_granularity);
-	SET_SYSCTL(sched_shares_ratelimit);
 #undef SET_SYSCTL
 }
 
@@ -7787,8 +7709,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 		se->cfs_rq = parent->my_q;
 
 	se->my_q = cfs_rq;
-	se->load.weight = tg->shares;
-	se->load.inv_weight = 0;
+	update_load_set(&se->load, tg->shares);
 	se->parent = parent;
 }
 #endif
@@ -7881,10 +7802,6 @@ void __init sched_init(void)
 
 #endif /* CONFIG_CGROUP_SCHED */
 
-#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
-	update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
-					    __alignof__(unsigned long));
-#endif
 	for_each_possible_cpu(i) {
 		struct rq *rq;
 
@@ -8452,8 +8369,7 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares)
 	if (on_rq)
 		dequeue_entity(cfs_rq, se, 0);
 
-	se->load.weight = shares;
-	se->load.inv_weight = 0;
+	update_load_set(&se->load, shares);
 
 	if (on_rq)
 		enqueue_entity(cfs_rq, se, 0);
@@ -8510,7 +8426,6 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 		/*
 		 * force a rebalance
 		 */
-		cfs_rq_set_shares(tg->cfs_rq[i], 0);
 		set_se_shares(tg->se[i], shares);
 	}
 