diff options
author | Peter Zijlstra <a.p.zijlstra@chello.nl> | 2010-11-15 18:47:00 -0500 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2010-11-18 07:27:46 -0500 |
commit | 2069dd75c7d0f49355939e5586daf5a9ab216db7 (patch) | |
tree | c221747420e47b194a2a634024438a55420224d5 | |
parent | 48c5ccae88dcd989d9de507e8510313c6cbd352b (diff) |
sched: Rewrite tg_shares_up
By tracking a per-cpu load-avg for each cfs_rq and folding it into a
global task_group load on each tick, we can rework tg_shares_up to be
strictly per-cpu.
This should improve cpu-cgroup performance for smp systems
significantly.
[ Paul: changed to use queueing cfs_rq + bug fixes ]
Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20101115234937.580480400@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r-- | include/linux/sched.h | 2 | ||||
-rw-r--r-- | kernel/sched.c | 173 | ||||
-rw-r--r-- | kernel/sched_debug.c | 15 | ||||
-rw-r--r-- | kernel/sched_fair.c | 164 | ||||
-rw-r--r-- | kernel/sched_features.h | 2 | ||||
-rw-r--r-- | kernel/sysctl.c | 19 |
6 files changed, 162 insertions, 213 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h index 29d953abb5ad..8abb8aa59664 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -1885,8 +1885,6 @@ static inline void wake_up_idle_cpu(int cpu) { } | |||
1885 | extern unsigned int sysctl_sched_latency; | 1885 | extern unsigned int sysctl_sched_latency; |
1886 | extern unsigned int sysctl_sched_min_granularity; | 1886 | extern unsigned int sysctl_sched_min_granularity; |
1887 | extern unsigned int sysctl_sched_wakeup_granularity; | 1887 | extern unsigned int sysctl_sched_wakeup_granularity; |
1888 | extern unsigned int sysctl_sched_shares_ratelimit; | ||
1889 | extern unsigned int sysctl_sched_shares_thresh; | ||
1890 | extern unsigned int sysctl_sched_child_runs_first; | 1888 | extern unsigned int sysctl_sched_child_runs_first; |
1891 | 1889 | ||
1892 | enum sched_tunable_scaling { | 1890 | enum sched_tunable_scaling { |
diff --git a/kernel/sched.c b/kernel/sched.c index b0d5f1b24a39..e2f1a3024a99 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -253,6 +253,8 @@ struct task_group { | |||
253 | /* runqueue "owned" by this group on each cpu */ | 253 | /* runqueue "owned" by this group on each cpu */ |
254 | struct cfs_rq **cfs_rq; | 254 | struct cfs_rq **cfs_rq; |
255 | unsigned long shares; | 255 | unsigned long shares; |
256 | |||
257 | atomic_t load_weight; | ||
256 | #endif | 258 | #endif |
257 | 259 | ||
258 | #ifdef CONFIG_RT_GROUP_SCHED | 260 | #ifdef CONFIG_RT_GROUP_SCHED |
@@ -359,15 +361,11 @@ struct cfs_rq { | |||
359 | */ | 361 | */ |
360 | unsigned long h_load; | 362 | unsigned long h_load; |
361 | 363 | ||
362 | /* | 364 | u64 load_avg; |
363 | * this cpu's part of tg->shares | 365 | u64 load_period; |
364 | */ | 366 | u64 load_stamp; |
365 | unsigned long shares; | ||
366 | 367 | ||
367 | /* | 368 | unsigned long load_contribution; |
368 | * load.weight at the time we set shares | ||
369 | */ | ||
370 | unsigned long rq_weight; | ||
371 | #endif | 369 | #endif |
372 | #endif | 370 | #endif |
373 | }; | 371 | }; |
@@ -807,20 +805,6 @@ late_initcall(sched_init_debug); | |||
807 | const_debug unsigned int sysctl_sched_nr_migrate = 32; | 805 | const_debug unsigned int sysctl_sched_nr_migrate = 32; |
808 | 806 | ||
809 | /* | 807 | /* |
810 | * ratelimit for updating the group shares. | ||
811 | * default: 0.25ms | ||
812 | */ | ||
813 | unsigned int sysctl_sched_shares_ratelimit = 250000; | ||
814 | unsigned int normalized_sysctl_sched_shares_ratelimit = 250000; | ||
815 | |||
816 | /* | ||
817 | * Inject some fuzzyness into changing the per-cpu group shares | ||
818 | * this avoids remote rq-locks at the expense of fairness. | ||
819 | * default: 4 | ||
820 | */ | ||
821 | unsigned int sysctl_sched_shares_thresh = 4; | ||
822 | |||
823 | /* | ||
824 | * period over which we average the RT time consumption, measured | 808 | * period over which we average the RT time consumption, measured |
825 | * in ms. | 809 | * in ms. |
826 | * | 810 | * |
@@ -1369,6 +1353,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec) | |||
1369 | lw->inv_weight = 0; | 1353 | lw->inv_weight = 0; |
1370 | } | 1354 | } |
1371 | 1355 | ||
1356 | static inline void update_load_set(struct load_weight *lw, unsigned long w) | ||
1357 | { | ||
1358 | lw->weight = w; | ||
1359 | lw->inv_weight = 0; | ||
1360 | } | ||
1361 | |||
1372 | /* | 1362 | /* |
1373 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | 1363 | * To aid in avoiding the subversion of "niceness" due to uneven distribution |
1374 | * of tasks with abnormal "nice" values across CPUs the contribution that | 1364 | * of tasks with abnormal "nice" values across CPUs the contribution that |
@@ -1557,97 +1547,44 @@ static unsigned long cpu_avg_load_per_task(int cpu) | |||
1557 | 1547 | ||
1558 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1548 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1559 | 1549 | ||
1560 | static __read_mostly unsigned long __percpu *update_shares_data; | 1550 | static void update_cfs_load(struct cfs_rq *cfs_rq); |
1561 | 1551 | static void update_cfs_shares(struct cfs_rq *cfs_rq); | |
1562 | static void __set_se_shares(struct sched_entity *se, unsigned long shares); | ||
1563 | |||
1564 | /* | ||
1565 | * Calculate and set the cpu's group shares. | ||
1566 | */ | ||
1567 | static void update_group_shares_cpu(struct task_group *tg, int cpu, | ||
1568 | unsigned long sd_shares, | ||
1569 | unsigned long sd_rq_weight, | ||
1570 | unsigned long *usd_rq_weight) | ||
1571 | { | ||
1572 | unsigned long shares, rq_weight; | ||
1573 | int boost = 0; | ||
1574 | |||
1575 | rq_weight = usd_rq_weight[cpu]; | ||
1576 | if (!rq_weight) { | ||
1577 | boost = 1; | ||
1578 | rq_weight = NICE_0_LOAD; | ||
1579 | } | ||
1580 | |||
1581 | /* | ||
1582 | * \Sum_j shares_j * rq_weight_i | ||
1583 | * shares_i = ----------------------------- | ||
1584 | * \Sum_j rq_weight_j | ||
1585 | */ | ||
1586 | shares = (sd_shares * rq_weight) / sd_rq_weight; | ||
1587 | shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES); | ||
1588 | |||
1589 | if (abs(shares - tg->se[cpu]->load.weight) > | ||
1590 | sysctl_sched_shares_thresh) { | ||
1591 | struct rq *rq = cpu_rq(cpu); | ||
1592 | unsigned long flags; | ||
1593 | |||
1594 | raw_spin_lock_irqsave(&rq->lock, flags); | ||
1595 | tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight; | ||
1596 | tg->cfs_rq[cpu]->shares = boost ? 0 : shares; | ||
1597 | __set_se_shares(tg->se[cpu], shares); | ||
1598 | raw_spin_unlock_irqrestore(&rq->lock, flags); | ||
1599 | } | ||
1600 | } | ||
1601 | 1552 | ||
1602 | /* | 1553 | /* |
1603 | * Re-compute the task group their per cpu shares over the given domain. | 1554 | * update tg->load_weight by folding this cpu's load_avg |
1604 | * This needs to be done in a bottom-up fashion because the rq weight of a | ||
1605 | * parent group depends on the shares of its child groups. | ||
1606 | */ | 1555 | */ |
1607 | static int tg_shares_up(struct task_group *tg, void *data) | 1556 | static int tg_shares_up(struct task_group *tg, void *data) |
1608 | { | 1557 | { |
1609 | unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0; | 1558 | long load_avg; |
1610 | unsigned long *usd_rq_weight; | 1559 | struct cfs_rq *cfs_rq; |
1611 | struct sched_domain *sd = data; | ||
1612 | unsigned long flags; | 1560 | unsigned long flags; |
1613 | int i; | 1561 | int cpu = (long)data; |
1562 | struct rq *rq; | ||
1614 | 1563 | ||
1615 | if (!tg->se[0]) | 1564 | if (!tg->se[cpu]) |
1616 | return 0; | 1565 | return 0; |
1617 | 1566 | ||
1618 | local_irq_save(flags); | 1567 | rq = cpu_rq(cpu); |
1619 | usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id()); | 1568 | cfs_rq = tg->cfs_rq[cpu]; |
1620 | |||
1621 | for_each_cpu(i, sched_domain_span(sd)) { | ||
1622 | weight = tg->cfs_rq[i]->load.weight; | ||
1623 | usd_rq_weight[i] = weight; | ||
1624 | |||
1625 | rq_weight += weight; | ||
1626 | /* | ||
1627 | * If there are currently no tasks on the cpu pretend there | ||
1628 | * is one of average load so that when a new task gets to | ||
1629 | * run here it will not get delayed by group starvation. | ||
1630 | */ | ||
1631 | if (!weight) | ||
1632 | weight = NICE_0_LOAD; | ||
1633 | 1569 | ||
1634 | sum_weight += weight; | 1570 | raw_spin_lock_irqsave(&rq->lock, flags); |
1635 | shares += tg->cfs_rq[i]->shares; | ||
1636 | } | ||
1637 | 1571 | ||
1638 | if (!rq_weight) | 1572 | update_rq_clock(rq); |
1639 | rq_weight = sum_weight; | 1573 | update_cfs_load(cfs_rq); |
1640 | 1574 | ||
1641 | if ((!shares && rq_weight) || shares > tg->shares) | 1575 | load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1); |
1642 | shares = tg->shares; | 1576 | load_avg -= cfs_rq->load_contribution; |
1643 | 1577 | ||
1644 | if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) | 1578 | atomic_add(load_avg, &tg->load_weight); |
1645 | shares = tg->shares; | 1579 | cfs_rq->load_contribution += load_avg; |
1646 | 1580 | ||
1647 | for_each_cpu(i, sched_domain_span(sd)) | 1581 | /* |
1648 | update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight); | 1582 | * We need to update shares after updating tg->load_weight in |
1583 | * order to adjust the weight of groups with long running tasks. | ||
1584 | */ | ||
1585 | update_cfs_shares(cfs_rq); | ||
1649 | 1586 | ||
1650 | local_irq_restore(flags); | 1587 | raw_spin_unlock_irqrestore(&rq->lock, flags); |
1651 | 1588 | ||
1652 | return 0; | 1589 | return 0; |
1653 | } | 1590 | } |
@@ -1666,7 +1603,7 @@ static int tg_load_down(struct task_group *tg, void *data) | |||
1666 | load = cpu_rq(cpu)->load.weight; | 1603 | load = cpu_rq(cpu)->load.weight; |
1667 | } else { | 1604 | } else { |
1668 | load = tg->parent->cfs_rq[cpu]->h_load; | 1605 | load = tg->parent->cfs_rq[cpu]->h_load; |
1669 | load *= tg->cfs_rq[cpu]->shares; | 1606 | load *= tg->se[cpu]->load.weight; |
1670 | load /= tg->parent->cfs_rq[cpu]->load.weight + 1; | 1607 | load /= tg->parent->cfs_rq[cpu]->load.weight + 1; |
1671 | } | 1608 | } |
1672 | 1609 | ||
@@ -1675,21 +1612,16 @@ static int tg_load_down(struct task_group *tg, void *data) | |||
1675 | return 0; | 1612 | return 0; |
1676 | } | 1613 | } |
1677 | 1614 | ||
1678 | static void update_shares(struct sched_domain *sd) | 1615 | static void update_shares(long cpu) |
1679 | { | 1616 | { |
1680 | s64 elapsed; | ||
1681 | u64 now; | ||
1682 | |||
1683 | if (root_task_group_empty()) | 1617 | if (root_task_group_empty()) |
1684 | return; | 1618 | return; |
1685 | 1619 | ||
1686 | now = local_clock(); | 1620 | /* |
1687 | elapsed = now - sd->last_update; | 1621 | * XXX: replace with an on-demand list |
1622 | */ | ||
1688 | 1623 | ||
1689 | if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { | 1624 | walk_tg_tree(tg_nop, tg_shares_up, (void *)cpu); |
1690 | sd->last_update = now; | ||
1691 | walk_tg_tree(tg_nop, tg_shares_up, sd); | ||
1692 | } | ||
1693 | } | 1625 | } |
1694 | 1626 | ||
1695 | static void update_h_load(long cpu) | 1627 | static void update_h_load(long cpu) |
@@ -1699,7 +1631,7 @@ static void update_h_load(long cpu) | |||
1699 | 1631 | ||
1700 | #else | 1632 | #else |
1701 | 1633 | ||
1702 | static inline void update_shares(struct sched_domain *sd) | 1634 | static inline void update_shares(int cpu) |
1703 | { | 1635 | { |
1704 | } | 1636 | } |
1705 | 1637 | ||
@@ -1824,15 +1756,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | |||
1824 | 1756 | ||
1825 | #endif | 1757 | #endif |
1826 | 1758 | ||
1827 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1828 | static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) | ||
1829 | { | ||
1830 | #ifdef CONFIG_SMP | ||
1831 | cfs_rq->shares = shares; | ||
1832 | #endif | ||
1833 | } | ||
1834 | #endif | ||
1835 | |||
1836 | static void calc_load_account_idle(struct rq *this_rq); | 1759 | static void calc_load_account_idle(struct rq *this_rq); |
1837 | static void update_sysctl(void); | 1760 | static void update_sysctl(void); |
1838 | static int get_update_sysctl_factor(void); | 1761 | static int get_update_sysctl_factor(void); |
@@ -5551,7 +5474,6 @@ static void update_sysctl(void) | |||
5551 | SET_SYSCTL(sched_min_granularity); | 5474 | SET_SYSCTL(sched_min_granularity); |
5552 | SET_SYSCTL(sched_latency); | 5475 | SET_SYSCTL(sched_latency); |
5553 | SET_SYSCTL(sched_wakeup_granularity); | 5476 | SET_SYSCTL(sched_wakeup_granularity); |
5554 | SET_SYSCTL(sched_shares_ratelimit); | ||
5555 | #undef SET_SYSCTL | 5477 | #undef SET_SYSCTL |
5556 | } | 5478 | } |
5557 | 5479 | ||
@@ -7787,8 +7709,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, | |||
7787 | se->cfs_rq = parent->my_q; | 7709 | se->cfs_rq = parent->my_q; |
7788 | 7710 | ||
7789 | se->my_q = cfs_rq; | 7711 | se->my_q = cfs_rq; |
7790 | se->load.weight = tg->shares; | 7712 | update_load_set(&se->load, tg->shares); |
7791 | se->load.inv_weight = 0; | ||
7792 | se->parent = parent; | 7713 | se->parent = parent; |
7793 | } | 7714 | } |
7794 | #endif | 7715 | #endif |
@@ -7881,10 +7802,6 @@ void __init sched_init(void) | |||
7881 | 7802 | ||
7882 | #endif /* CONFIG_CGROUP_SCHED */ | 7803 | #endif /* CONFIG_CGROUP_SCHED */ |
7883 | 7804 | ||
7884 | #if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP | ||
7885 | update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long), | ||
7886 | __alignof__(unsigned long)); | ||
7887 | #endif | ||
7888 | for_each_possible_cpu(i) { | 7805 | for_each_possible_cpu(i) { |
7889 | struct rq *rq; | 7806 | struct rq *rq; |
7890 | 7807 | ||
@@ -8452,8 +8369,7 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares) | |||
8452 | if (on_rq) | 8369 | if (on_rq) |
8453 | dequeue_entity(cfs_rq, se, 0); | 8370 | dequeue_entity(cfs_rq, se, 0); |
8454 | 8371 | ||
8455 | se->load.weight = shares; | 8372 | update_load_set(&se->load, shares); |
8456 | se->load.inv_weight = 0; | ||
8457 | 8373 | ||
8458 | if (on_rq) | 8374 | if (on_rq) |
8459 | enqueue_entity(cfs_rq, se, 0); | 8375 | enqueue_entity(cfs_rq, se, 0); |
@@ -8510,7 +8426,6 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) | |||
8510 | /* | 8426 | /* |
8511 | * force a rebalance | 8427 | * force a rebalance |
8512 | */ | 8428 | */ |
8513 | cfs_rq_set_shares(tg->cfs_rq[i], 0); | ||
8514 | set_se_shares(tg->se[i], shares); | 8429 | set_se_shares(tg->se[i], shares); |
8515 | } | 8430 | } |
8516 | 8431 | ||
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 2e1b0d17dd9b..e6590e7312e8 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c | |||
@@ -202,15 +202,22 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | |||
202 | spread0 = min_vruntime - rq0_min_vruntime; | 202 | spread0 = min_vruntime - rq0_min_vruntime; |
203 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", | 203 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", |
204 | SPLIT_NS(spread0)); | 204 | SPLIT_NS(spread0)); |
205 | SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); | ||
206 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); | ||
207 | |||
208 | SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", | 205 | SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", |
209 | cfs_rq->nr_spread_over); | 206 | cfs_rq->nr_spread_over); |
207 | SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); | ||
208 | SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); | ||
210 | #ifdef CONFIG_FAIR_GROUP_SCHED | 209 | #ifdef CONFIG_FAIR_GROUP_SCHED |
211 | #ifdef CONFIG_SMP | 210 | #ifdef CONFIG_SMP |
212 | SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); | 211 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg", |
212 | SPLIT_NS(cfs_rq->load_avg)); | ||
213 | SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period", | ||
214 | SPLIT_NS(cfs_rq->load_period)); | ||
215 | SEQ_printf(m, " .%-30s: %ld\n", "load_contrib", | ||
216 | cfs_rq->load_contribution); | ||
217 | SEQ_printf(m, " .%-30s: %d\n", "load_tg", | ||
218 | atomic_read(&tg->load_weight)); | ||
213 | #endif | 219 | #endif |
220 | |||
214 | print_cfs_group_stats(m, cpu, cfs_rq->tg); | 221 | print_cfs_group_stats(m, cpu, cfs_rq->tg); |
215 | #endif | 222 | #endif |
216 | } | 223 | } |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index f4f6a8326dd0..d86544b4151c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -417,7 +417,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write, | |||
417 | WRT_SYSCTL(sched_min_granularity); | 417 | WRT_SYSCTL(sched_min_granularity); |
418 | WRT_SYSCTL(sched_latency); | 418 | WRT_SYSCTL(sched_latency); |
419 | WRT_SYSCTL(sched_wakeup_granularity); | 419 | WRT_SYSCTL(sched_wakeup_granularity); |
420 | WRT_SYSCTL(sched_shares_ratelimit); | ||
421 | #undef WRT_SYSCTL | 420 | #undef WRT_SYSCTL |
422 | 421 | ||
423 | return 0; | 422 | return 0; |
@@ -633,7 +632,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
633 | list_add(&se->group_node, &cfs_rq->tasks); | 632 | list_add(&se->group_node, &cfs_rq->tasks); |
634 | } | 633 | } |
635 | cfs_rq->nr_running++; | 634 | cfs_rq->nr_running++; |
636 | se->on_rq = 1; | ||
637 | } | 635 | } |
638 | 636 | ||
639 | static void | 637 | static void |
@@ -647,9 +645,89 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
647 | list_del_init(&se->group_node); | 645 | list_del_init(&se->group_node); |
648 | } | 646 | } |
649 | cfs_rq->nr_running--; | 647 | cfs_rq->nr_running--; |
650 | se->on_rq = 0; | ||
651 | } | 648 | } |
652 | 649 | ||
650 | #if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED | ||
651 | static void update_cfs_load(struct cfs_rq *cfs_rq) | ||
652 | { | ||
653 | u64 period = sched_avg_period(); | ||
654 | u64 now, delta; | ||
655 | |||
656 | if (!cfs_rq) | ||
657 | return; | ||
658 | |||
659 | now = rq_of(cfs_rq)->clock; | ||
660 | delta = now - cfs_rq->load_stamp; | ||
661 | |||
662 | cfs_rq->load_stamp = now; | ||
663 | cfs_rq->load_period += delta; | ||
664 | cfs_rq->load_avg += delta * cfs_rq->load.weight; | ||
665 | |||
666 | while (cfs_rq->load_period > period) { | ||
667 | /* | ||
668 | * Inline assembly required to prevent the compiler | ||
669 | * optimising this loop into a divmod call. | ||
670 | * See __iter_div_u64_rem() for another example of this. | ||
671 | */ | ||
672 | asm("" : "+rm" (cfs_rq->load_period)); | ||
673 | cfs_rq->load_period /= 2; | ||
674 | cfs_rq->load_avg /= 2; | ||
675 | } | ||
676 | } | ||
677 | |||
678 | static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, | ||
679 | unsigned long weight) | ||
680 | { | ||
681 | if (se->on_rq) | ||
682 | account_entity_dequeue(cfs_rq, se); | ||
683 | |||
684 | update_load_set(&se->load, weight); | ||
685 | |||
686 | if (se->on_rq) | ||
687 | account_entity_enqueue(cfs_rq, se); | ||
688 | } | ||
689 | |||
690 | static void update_cfs_shares(struct cfs_rq *cfs_rq) | ||
691 | { | ||
692 | struct task_group *tg; | ||
693 | struct sched_entity *se; | ||
694 | long load_weight, load, shares; | ||
695 | |||
696 | if (!cfs_rq) | ||
697 | return; | ||
698 | |||
699 | tg = cfs_rq->tg; | ||
700 | se = tg->se[cpu_of(rq_of(cfs_rq))]; | ||
701 | if (!se) | ||
702 | return; | ||
703 | |||
704 | load = cfs_rq->load.weight; | ||
705 | |||
706 | load_weight = atomic_read(&tg->load_weight); | ||
707 | load_weight -= cfs_rq->load_contribution; | ||
708 | load_weight += load; | ||
709 | |||
710 | shares = (tg->shares * load); | ||
711 | if (load_weight) | ||
712 | shares /= load_weight; | ||
713 | |||
714 | if (shares < MIN_SHARES) | ||
715 | shares = MIN_SHARES; | ||
716 | if (shares > tg->shares) | ||
717 | shares = tg->shares; | ||
718 | |||
719 | reweight_entity(cfs_rq_of(se), se, shares); | ||
720 | } | ||
721 | #else /* CONFIG_FAIR_GROUP_SCHED */ | ||
722 | static inline void update_cfs_load(struct cfs_rq *cfs_rq) | ||
723 | { | ||
724 | } | ||
725 | |||
726 | static inline void update_cfs_shares(struct cfs_rq *cfs_rq) | ||
727 | { | ||
728 | } | ||
729 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | ||
730 | |||
653 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | 731 | static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) |
654 | { | 732 | { |
655 | #ifdef CONFIG_SCHEDSTATS | 733 | #ifdef CONFIG_SCHEDSTATS |
@@ -771,7 +849,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
771 | * Update run-time statistics of the 'current'. | 849 | * Update run-time statistics of the 'current'. |
772 | */ | 850 | */ |
773 | update_curr(cfs_rq); | 851 | update_curr(cfs_rq); |
852 | update_cfs_load(cfs_rq); | ||
774 | account_entity_enqueue(cfs_rq, se); | 853 | account_entity_enqueue(cfs_rq, se); |
854 | update_cfs_shares(cfs_rq); | ||
775 | 855 | ||
776 | if (flags & ENQUEUE_WAKEUP) { | 856 | if (flags & ENQUEUE_WAKEUP) { |
777 | place_entity(cfs_rq, se, 0); | 857 | place_entity(cfs_rq, se, 0); |
@@ -782,6 +862,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
782 | check_spread(cfs_rq, se); | 862 | check_spread(cfs_rq, se); |
783 | if (se != cfs_rq->curr) | 863 | if (se != cfs_rq->curr) |
784 | __enqueue_entity(cfs_rq, se); | 864 | __enqueue_entity(cfs_rq, se); |
865 | se->on_rq = 1; | ||
785 | } | 866 | } |
786 | 867 | ||
787 | static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) | 868 | static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) |
@@ -825,8 +906,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) | |||
825 | 906 | ||
826 | if (se != cfs_rq->curr) | 907 | if (se != cfs_rq->curr) |
827 | __dequeue_entity(cfs_rq, se); | 908 | __dequeue_entity(cfs_rq, se); |
909 | se->on_rq = 0; | ||
910 | update_cfs_load(cfs_rq); | ||
828 | account_entity_dequeue(cfs_rq, se); | 911 | account_entity_dequeue(cfs_rq, se); |
829 | update_min_vruntime(cfs_rq); | 912 | update_min_vruntime(cfs_rq); |
913 | update_cfs_shares(cfs_rq); | ||
830 | 914 | ||
831 | /* | 915 | /* |
832 | * Normalize the entity after updating the min_vruntime because the | 916 | * Normalize the entity after updating the min_vruntime because the |
@@ -1055,6 +1139,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1055 | flags = ENQUEUE_WAKEUP; | 1139 | flags = ENQUEUE_WAKEUP; |
1056 | } | 1140 | } |
1057 | 1141 | ||
1142 | for_each_sched_entity(se) { | ||
1143 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
1144 | |||
1145 | update_cfs_load(cfs_rq); | ||
1146 | update_cfs_shares(cfs_rq); | ||
1147 | } | ||
1148 | |||
1058 | hrtick_update(rq); | 1149 | hrtick_update(rq); |
1059 | } | 1150 | } |
1060 | 1151 | ||
@@ -1071,12 +1162,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) | |||
1071 | for_each_sched_entity(se) { | 1162 | for_each_sched_entity(se) { |
1072 | cfs_rq = cfs_rq_of(se); | 1163 | cfs_rq = cfs_rq_of(se); |
1073 | dequeue_entity(cfs_rq, se, flags); | 1164 | dequeue_entity(cfs_rq, se, flags); |
1165 | |||
1074 | /* Don't dequeue parent if it has other entities besides us */ | 1166 | /* Don't dequeue parent if it has other entities besides us */ |
1075 | if (cfs_rq->load.weight) | 1167 | if (cfs_rq->load.weight) |
1076 | break; | 1168 | break; |
1077 | flags |= DEQUEUE_SLEEP; | 1169 | flags |= DEQUEUE_SLEEP; |
1078 | } | 1170 | } |
1079 | 1171 | ||
1172 | for_each_sched_entity(se) { | ||
1173 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
1174 | |||
1175 | update_cfs_load(cfs_rq); | ||
1176 | update_cfs_shares(cfs_rq); | ||
1177 | } | ||
1178 | |||
1080 | hrtick_update(rq); | 1179 | hrtick_update(rq); |
1081 | } | 1180 | } |
1082 | 1181 | ||
@@ -1143,51 +1242,20 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p) | |||
1143 | * Adding load to a group doesn't make a group heavier, but can cause movement | 1242 | * Adding load to a group doesn't make a group heavier, but can cause movement |
1144 | * of group shares between cpus. Assuming the shares were perfectly aligned one | 1243 | * of group shares between cpus. Assuming the shares were perfectly aligned one |
1145 | * can calculate the shift in shares. | 1244 | * can calculate the shift in shares. |
1146 | * | ||
1147 | * The problem is that perfectly aligning the shares is rather expensive, hence | ||
1148 | * we try to avoid doing that too often - see update_shares(), which ratelimits | ||
1149 | * this change. | ||
1150 | * | ||
1151 | * We compensate this by not only taking the current delta into account, but | ||
1152 | * also considering the delta between when the shares were last adjusted and | ||
1153 | * now. | ||
1154 | * | ||
1155 | * We still saw a performance dip, some tracing learned us that between | ||
1156 | * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased | ||
1157 | * significantly. Therefore try to bias the error in direction of failing | ||
1158 | * the affine wakeup. | ||
1159 | * | ||
1160 | */ | 1245 | */ |
1161 | static long effective_load(struct task_group *tg, int cpu, | 1246 | static long effective_load(struct task_group *tg, int cpu, long wl, long wg) |
1162 | long wl, long wg) | ||
1163 | { | 1247 | { |
1164 | struct sched_entity *se = tg->se[cpu]; | 1248 | struct sched_entity *se = tg->se[cpu]; |
1165 | 1249 | ||
1166 | if (!tg->parent) | 1250 | if (!tg->parent) |
1167 | return wl; | 1251 | return wl; |
1168 | 1252 | ||
1169 | /* | ||
1170 | * By not taking the decrease of shares on the other cpu into | ||
1171 | * account our error leans towards reducing the affine wakeups. | ||
1172 | */ | ||
1173 | if (!wl && sched_feat(ASYM_EFF_LOAD)) | ||
1174 | return wl; | ||
1175 | |||
1176 | for_each_sched_entity(se) { | 1253 | for_each_sched_entity(se) { |
1177 | long S, rw, s, a, b; | 1254 | long S, rw, s, a, b; |
1178 | long more_w; | ||
1179 | |||
1180 | /* | ||
1181 | * Instead of using this increment, also add the difference | ||
1182 | * between when the shares were last updated and now. | ||
1183 | */ | ||
1184 | more_w = se->my_q->load.weight - se->my_q->rq_weight; | ||
1185 | wl += more_w; | ||
1186 | wg += more_w; | ||
1187 | 1255 | ||
1188 | S = se->my_q->tg->shares; | 1256 | S = se->my_q->tg->shares; |
1189 | s = se->my_q->shares; | 1257 | s = se->load.weight; |
1190 | rw = se->my_q->rq_weight; | 1258 | rw = se->my_q->load.weight; |
1191 | 1259 | ||
1192 | a = S*(rw + wl); | 1260 | a = S*(rw + wl); |
1193 | b = S*rw + s*wg; | 1261 | b = S*rw + s*wg; |
@@ -1508,23 +1576,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_ | |||
1508 | sd = tmp; | 1576 | sd = tmp; |
1509 | } | 1577 | } |
1510 | 1578 | ||
1511 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1512 | if (sched_feat(LB_SHARES_UPDATE)) { | ||
1513 | /* | ||
1514 | * Pick the largest domain to update shares over | ||
1515 | */ | ||
1516 | tmp = sd; | ||
1517 | if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight)) | ||
1518 | tmp = affine_sd; | ||
1519 | |||
1520 | if (tmp) { | ||
1521 | raw_spin_unlock(&rq->lock); | ||
1522 | update_shares(tmp); | ||
1523 | raw_spin_lock(&rq->lock); | ||
1524 | } | ||
1525 | } | ||
1526 | #endif | ||
1527 | |||
1528 | if (affine_sd) { | 1579 | if (affine_sd) { |
1529 | if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) | 1580 | if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) |
1530 | return select_idle_sibling(p, cpu); | 1581 | return select_idle_sibling(p, cpu); |
@@ -3014,7 +3065,6 @@ static int load_balance(int this_cpu, struct rq *this_rq, | |||
3014 | schedstat_inc(sd, lb_count[idle]); | 3065 | schedstat_inc(sd, lb_count[idle]); |
3015 | 3066 | ||
3016 | redo: | 3067 | redo: |
3017 | update_shares(sd); | ||
3018 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, | 3068 | group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, |
3019 | cpus, balance); | 3069 | cpus, balance); |
3020 | 3070 | ||
@@ -3156,8 +3206,6 @@ out_one_pinned: | |||
3156 | else | 3206 | else |
3157 | ld_moved = 0; | 3207 | ld_moved = 0; |
3158 | out: | 3208 | out: |
3159 | if (ld_moved) | ||
3160 | update_shares(sd); | ||
3161 | return ld_moved; | 3209 | return ld_moved; |
3162 | } | 3210 | } |
3163 | 3211 | ||
@@ -3549,6 +3597,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) | |||
3549 | int update_next_balance = 0; | 3597 | int update_next_balance = 0; |
3550 | int need_serialize; | 3598 | int need_serialize; |
3551 | 3599 | ||
3600 | update_shares(cpu); | ||
3601 | |||
3552 | for_each_domain(cpu, sd) { | 3602 | for_each_domain(cpu, sd) { |
3553 | if (!(sd->flags & SD_LOAD_BALANCE)) | 3603 | if (!(sd->flags & SD_LOAD_BALANCE)) |
3554 | continue; | 3604 | continue; |
diff --git a/kernel/sched_features.h b/kernel/sched_features.h index 185f920ec1a2..68e69acc29b9 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h | |||
@@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0) | |||
52 | SCHED_FEAT(HRTICK, 0) | 52 | SCHED_FEAT(HRTICK, 0) |
53 | SCHED_FEAT(DOUBLE_TICK, 0) | 53 | SCHED_FEAT(DOUBLE_TICK, 0) |
54 | SCHED_FEAT(LB_BIAS, 1) | 54 | SCHED_FEAT(LB_BIAS, 1) |
55 | SCHED_FEAT(LB_SHARES_UPDATE, 1) | ||
56 | SCHED_FEAT(ASYM_EFF_LOAD, 1) | ||
57 | 55 | ||
58 | /* | 56 | /* |
59 | * Spin-wait on mutex acquisition when the mutex owner is running on | 57 | * Spin-wait on mutex acquisition when the mutex owner is running on |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index b65bf634035e..3132b25193db 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -259,8 +259,6 @@ static int min_wakeup_granularity_ns; /* 0 usecs */ | |||
259 | static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ | 259 | static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ |
260 | static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; | 260 | static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; |
261 | static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; | 261 | static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; |
262 | static int min_sched_shares_ratelimit = 100000; /* 100 usec */ | ||
263 | static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */ | ||
264 | #endif | 262 | #endif |
265 | 263 | ||
266 | #ifdef CONFIG_COMPACTION | 264 | #ifdef CONFIG_COMPACTION |
@@ -305,15 +303,6 @@ static struct ctl_table kern_table[] = { | |||
305 | .extra2 = &max_wakeup_granularity_ns, | 303 | .extra2 = &max_wakeup_granularity_ns, |
306 | }, | 304 | }, |
307 | { | 305 | { |
308 | .procname = "sched_shares_ratelimit", | ||
309 | .data = &sysctl_sched_shares_ratelimit, | ||
310 | .maxlen = sizeof(unsigned int), | ||
311 | .mode = 0644, | ||
312 | .proc_handler = sched_proc_update_handler, | ||
313 | .extra1 = &min_sched_shares_ratelimit, | ||
314 | .extra2 = &max_sched_shares_ratelimit, | ||
315 | }, | ||
316 | { | ||
317 | .procname = "sched_tunable_scaling", | 306 | .procname = "sched_tunable_scaling", |
318 | .data = &sysctl_sched_tunable_scaling, | 307 | .data = &sysctl_sched_tunable_scaling, |
319 | .maxlen = sizeof(enum sched_tunable_scaling), | 308 | .maxlen = sizeof(enum sched_tunable_scaling), |
@@ -323,14 +312,6 @@ static struct ctl_table kern_table[] = { | |||
323 | .extra2 = &max_sched_tunable_scaling, | 312 | .extra2 = &max_sched_tunable_scaling, |
324 | }, | 313 | }, |
325 | { | 314 | { |
326 | .procname = "sched_shares_thresh", | ||
327 | .data = &sysctl_sched_shares_thresh, | ||
328 | .maxlen = sizeof(unsigned int), | ||
329 | .mode = 0644, | ||
330 | .proc_handler = proc_dointvec_minmax, | ||
331 | .extra1 = &zero, | ||
332 | }, | ||
333 | { | ||
334 | .procname = "sched_migration_cost", | 315 | .procname = "sched_migration_cost", |
335 | .data = &sysctl_sched_migration_cost, | 316 | .data = &sysctl_sched_migration_cost, |
336 | .maxlen = sizeof(unsigned int), | 317 | .maxlen = sizeof(unsigned int), |