about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Peter Zijlstra <a.p.zijlstra@chello.nl> 2010-11-15 18:47:00 -0500
committer: Ingo Molnar <mingo@elte.hu> 2010-11-18 07:27:46 -0500
commit: 2069dd75c7d0f49355939e5586daf5a9ab216db7 (patch)
tree: c221747420e47b194a2a634024438a55420224d5
parent: 48c5ccae88dcd989d9de507e8510313c6cbd352b (diff)
sched: Rewrite tg_shares_up()
By tracking a per-cpu load-avg for each cfs_rq and folding it into a global task_group load on each tick we can rework tg_shares_up to be strictly per-cpu.

This should improve cpu-cgroup performance for smp systems significantly.

[ Paul: changed to use queueing cfs_rq + bug fixes ]

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20101115234937.580480400@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
-rw-r--r-- include/linux/sched.h 2
-rw-r--r-- kernel/sched.c 173
-rw-r--r-- kernel/sched_debug.c 15
-rw-r--r-- kernel/sched_fair.c 164
-rw-r--r-- kernel/sched_features.h 2
-rw-r--r-- kernel/sysctl.c 19
6 files changed, 162 insertions, 213 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 29d953abb5ad..8abb8aa59664 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1885,8 +1885,6 @@ static inline void wake_up_idle_cpu(int cpu) { }
1885extern unsigned int sysctl_sched_latency; 1885extern unsigned int sysctl_sched_latency;
1886extern unsigned int sysctl_sched_min_granularity; 1886extern unsigned int sysctl_sched_min_granularity;
1887extern unsigned int sysctl_sched_wakeup_granularity; 1887extern unsigned int sysctl_sched_wakeup_granularity;
1888extern unsigned int sysctl_sched_shares_ratelimit;
1889extern unsigned int sysctl_sched_shares_thresh;
1890extern unsigned int sysctl_sched_child_runs_first; 1888extern unsigned int sysctl_sched_child_runs_first;
1891 1889
1892enum sched_tunable_scaling { 1890enum sched_tunable_scaling {
diff --git a/kernel/sched.c b/kernel/sched.c
index b0d5f1b24a39..e2f1a3024a99 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -253,6 +253,8 @@ struct task_group {
253 /* runqueue "owned" by this group on each cpu */ 253 /* runqueue "owned" by this group on each cpu */
254 struct cfs_rq **cfs_rq; 254 struct cfs_rq **cfs_rq;
255 unsigned long shares; 255 unsigned long shares;
256
257 atomic_t load_weight;
256#endif 258#endif
257 259
258#ifdef CONFIG_RT_GROUP_SCHED 260#ifdef CONFIG_RT_GROUP_SCHED
@@ -359,15 +361,11 @@ struct cfs_rq {
359 */ 361 */
360 unsigned long h_load; 362 unsigned long h_load;
361 363
362 /* 364 u64 load_avg;
363 * this cpu's part of tg->shares 365 u64 load_period;
364 */ 366 u64 load_stamp;
365 unsigned long shares;
366 367
367 /* 368 unsigned long load_contribution;
368 * load.weight at the time we set shares
369 */
370 unsigned long rq_weight;
371#endif 369#endif
372#endif 370#endif
373}; 371};
@@ -807,20 +805,6 @@ late_initcall(sched_init_debug);
807const_debug unsigned int sysctl_sched_nr_migrate = 32; 805const_debug unsigned int sysctl_sched_nr_migrate = 32;
808 806
809/* 807/*
810 * ratelimit for updating the group shares.
811 * default: 0.25ms
812 */
813unsigned int sysctl_sched_shares_ratelimit = 250000;
814unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
815
816/*
817 * Inject some fuzzyness into changing the per-cpu group shares
818 * this avoids remote rq-locks at the expense of fairness.
819 * default: 4
820 */
821unsigned int sysctl_sched_shares_thresh = 4;
822
823/*
824 * period over which we average the RT time consumption, measured 808 * period over which we average the RT time consumption, measured
825 * in ms. 809 * in ms.
826 * 810 *
@@ -1369,6 +1353,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1369 lw->inv_weight = 0; 1353 lw->inv_weight = 0;
1370} 1354}
1371 1355
1356static inline void update_load_set(struct load_weight *lw, unsigned long w)
1357{
1358 lw->weight = w;
1359 lw->inv_weight = 0;
1360}
1361
1372/* 1362/*
1373 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1363 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1374 * of tasks with abnormal "nice" values across CPUs the contribution that 1364 * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1557,97 +1547,44 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1557 1547
1558#ifdef CONFIG_FAIR_GROUP_SCHED 1548#ifdef CONFIG_FAIR_GROUP_SCHED
1559 1549
1560static __read_mostly unsigned long __percpu *update_shares_data; 1550static void update_cfs_load(struct cfs_rq *cfs_rq);
1561 1551static void update_cfs_shares(struct cfs_rq *cfs_rq);
1562static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1563
1564/*
1565 * Calculate and set the cpu's group shares.
1566 */
1567static void update_group_shares_cpu(struct task_group *tg, int cpu,
1568 unsigned long sd_shares,
1569 unsigned long sd_rq_weight,
1570 unsigned long *usd_rq_weight)
1571{
1572 unsigned long shares, rq_weight;
1573 int boost = 0;
1574
1575 rq_weight = usd_rq_weight[cpu];
1576 if (!rq_weight) {
1577 boost = 1;
1578 rq_weight = NICE_0_LOAD;
1579 }
1580
1581 /*
1582 * \Sum_j shares_j * rq_weight_i
1583 * shares_i = -----------------------------
1584 * \Sum_j rq_weight_j
1585 */
1586 shares = (sd_shares * rq_weight) / sd_rq_weight;
1587 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1588
1589 if (abs(shares - tg->se[cpu]->load.weight) >
1590 sysctl_sched_shares_thresh) {
1591 struct rq *rq = cpu_rq(cpu);
1592 unsigned long flags;
1593
1594 raw_spin_lock_irqsave(&rq->lock, flags);
1595 tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
1596 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1597 __set_se_shares(tg->se[cpu], shares);
1598 raw_spin_unlock_irqrestore(&rq->lock, flags);
1599 }
1600}
1601 1552
1602/* 1553/*
1603 * Re-compute the task group their per cpu shares over the given domain. 1554 * update tg->load_weight by folding this cpu's load_avg
1604 * This needs to be done in a bottom-up fashion because the rq weight of a
1605 * parent group depends on the shares of its child groups.
1606 */ 1555 */
1607static int tg_shares_up(struct task_group *tg, void *data) 1556static int tg_shares_up(struct task_group *tg, void *data)
1608{ 1557{
1609 unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0; 1558 long load_avg;
1610 unsigned long *usd_rq_weight; 1559 struct cfs_rq *cfs_rq;
1611 struct sched_domain *sd = data;
1612 unsigned long flags; 1560 unsigned long flags;
1613 int i; 1561 int cpu = (long)data;
1562 struct rq *rq;
1614 1563
1615 if (!tg->se[0]) 1564 if (!tg->se[cpu])
1616 return 0; 1565 return 0;
1617 1566
1618 local_irq_save(flags); 1567 rq = cpu_rq(cpu);
1619 usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id()); 1568 cfs_rq = tg->cfs_rq[cpu];
1620
1621 for_each_cpu(i, sched_domain_span(sd)) {
1622 weight = tg->cfs_rq[i]->load.weight;
1623 usd_rq_weight[i] = weight;
1624
1625 rq_weight += weight;
1626 /*
1627 * If there are currently no tasks on the cpu pretend there
1628 * is one of average load so that when a new task gets to
1629 * run here it will not get delayed by group starvation.
1630 */
1631 if (!weight)
1632 weight = NICE_0_LOAD;
1633 1569
1634 sum_weight += weight; 1570 raw_spin_lock_irqsave(&rq->lock, flags);
1635 shares += tg->cfs_rq[i]->shares;
1636 }
1637 1571
1638 if (!rq_weight) 1572 update_rq_clock(rq);
1639 rq_weight = sum_weight; 1573 update_cfs_load(cfs_rq);
1640 1574
1641 if ((!shares && rq_weight) || shares > tg->shares) 1575 load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
1642 shares = tg->shares; 1576 load_avg -= cfs_rq->load_contribution;
1643 1577
1644 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE)) 1578 atomic_add(load_avg, &tg->load_weight);
1645 shares = tg->shares; 1579 cfs_rq->load_contribution += load_avg;
1646 1580
1647 for_each_cpu(i, sched_domain_span(sd)) 1581 /*
1648 update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight); 1582 * We need to update shares after updating tg->load_weight in
1583 * order to adjust the weight of groups with long running tasks.
1584 */
1585 update_cfs_shares(cfs_rq);
1649 1586
1650 local_irq_restore(flags); 1587 raw_spin_unlock_irqrestore(&rq->lock, flags);
1651 1588
1652 return 0; 1589 return 0;
1653} 1590}
@@ -1666,7 +1603,7 @@ static int tg_load_down(struct task_group *tg, void *data)
1666 load = cpu_rq(cpu)->load.weight; 1603 load = cpu_rq(cpu)->load.weight;
1667 } else { 1604 } else {
1668 load = tg->parent->cfs_rq[cpu]->h_load; 1605 load = tg->parent->cfs_rq[cpu]->h_load;
1669 load *= tg->cfs_rq[cpu]->shares; 1606 load *= tg->se[cpu]->load.weight;
1670 load /= tg->parent->cfs_rq[cpu]->load.weight + 1; 1607 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1671 } 1608 }
1672 1609
@@ -1675,21 +1612,16 @@ static int tg_load_down(struct task_group *tg, void *data)
1675 return 0; 1612 return 0;
1676} 1613}
1677 1614
1678static void update_shares(struct sched_domain *sd) 1615static void update_shares(long cpu)
1679{ 1616{
1680 s64 elapsed;
1681 u64 now;
1682
1683 if (root_task_group_empty()) 1617 if (root_task_group_empty())
1684 return; 1618 return;
1685 1619
1686 now = local_clock(); 1620 /*
1687 elapsed = now - sd->last_update; 1621 * XXX: replace with an on-demand list
1622 */
1688 1623
1689 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1624 walk_tg_tree(tg_nop, tg_shares_up, (void *)cpu);
1690 sd->last_update = now;
1691 walk_tg_tree(tg_nop, tg_shares_up, sd);
1692 }
1693} 1625}
1694 1626
1695static void update_h_load(long cpu) 1627static void update_h_load(long cpu)
@@ -1699,7 +1631,7 @@ static void update_h_load(long cpu)
1699 1631
1700#else 1632#else
1701 1633
1702static inline void update_shares(struct sched_domain *sd) 1634static inline void update_shares(int cpu)
1703{ 1635{
1704} 1636}
1705 1637
@@ -1824,15 +1756,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1824 1756
1825#endif 1757#endif
1826 1758
1827#ifdef CONFIG_FAIR_GROUP_SCHED
1828static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1829{
1830#ifdef CONFIG_SMP
1831 cfs_rq->shares = shares;
1832#endif
1833}
1834#endif
1835
1836static void calc_load_account_idle(struct rq *this_rq); 1759static void calc_load_account_idle(struct rq *this_rq);
1837static void update_sysctl(void); 1760static void update_sysctl(void);
1838static int get_update_sysctl_factor(void); 1761static int get_update_sysctl_factor(void);
@@ -5551,7 +5474,6 @@ static void update_sysctl(void)
5551 SET_SYSCTL(sched_min_granularity); 5474 SET_SYSCTL(sched_min_granularity);
5552 SET_SYSCTL(sched_latency); 5475 SET_SYSCTL(sched_latency);
5553 SET_SYSCTL(sched_wakeup_granularity); 5476 SET_SYSCTL(sched_wakeup_granularity);
5554 SET_SYSCTL(sched_shares_ratelimit);
5555#undef SET_SYSCTL 5477#undef SET_SYSCTL
5556} 5478}
5557 5479
@@ -7787,8 +7709,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7787 se->cfs_rq = parent->my_q; 7709 se->cfs_rq = parent->my_q;
7788 7710
7789 se->my_q = cfs_rq; 7711 se->my_q = cfs_rq;
7790 se->load.weight = tg->shares; 7712 update_load_set(&se->load, tg->shares);
7791 se->load.inv_weight = 0;
7792 se->parent = parent; 7713 se->parent = parent;
7793} 7714}
7794#endif 7715#endif
@@ -7881,10 +7802,6 @@ void __init sched_init(void)
7881 7802
7882#endif /* CONFIG_CGROUP_SCHED */ 7803#endif /* CONFIG_CGROUP_SCHED */
7883 7804
7884#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
7885 update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
7886 __alignof__(unsigned long));
7887#endif
7888 for_each_possible_cpu(i) { 7805 for_each_possible_cpu(i) {
7889 struct rq *rq; 7806 struct rq *rq;
7890 7807
@@ -8452,8 +8369,7 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8452 if (on_rq) 8369 if (on_rq)
8453 dequeue_entity(cfs_rq, se, 0); 8370 dequeue_entity(cfs_rq, se, 0);
8454 8371
8455 se->load.weight = shares; 8372 update_load_set(&se->load, shares);
8456 se->load.inv_weight = 0;
8457 8373
8458 if (on_rq) 8374 if (on_rq)
8459 enqueue_entity(cfs_rq, se, 0); 8375 enqueue_entity(cfs_rq, se, 0);
@@ -8510,7 +8426,6 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8510 /* 8426 /*
8511 * force a rebalance 8427 * force a rebalance
8512 */ 8428 */
8513 cfs_rq_set_shares(tg->cfs_rq[i], 0);
8514 set_se_shares(tg->se[i], shares); 8429 set_se_shares(tg->se[i], shares);
8515 } 8430 }
8516 8431
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 2e1b0d17dd9b..e6590e7312e8 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -202,15 +202,22 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
202 spread0 = min_vruntime - rq0_min_vruntime; 202 spread0 = min_vruntime - rq0_min_vruntime;
203 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0", 203 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread0",
204 SPLIT_NS(spread0)); 204 SPLIT_NS(spread0));
205 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
207
208 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", 205 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
209 cfs_rq->nr_spread_over); 206 cfs_rq->nr_spread_over);
207 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
208 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
210#ifdef CONFIG_FAIR_GROUP_SCHED 209#ifdef CONFIG_FAIR_GROUP_SCHED
211#ifdef CONFIG_SMP 210#ifdef CONFIG_SMP
212 SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares); 211 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_avg",
212 SPLIT_NS(cfs_rq->load_avg));
213 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "load_period",
214 SPLIT_NS(cfs_rq->load_period));
215 SEQ_printf(m, " .%-30s: %ld\n", "load_contrib",
216 cfs_rq->load_contribution);
217 SEQ_printf(m, " .%-30s: %d\n", "load_tg",
218 atomic_read(&tg->load_weight));
213#endif 219#endif
220
214 print_cfs_group_stats(m, cpu, cfs_rq->tg); 221 print_cfs_group_stats(m, cpu, cfs_rq->tg);
215#endif 222#endif
216} 223}
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f4f6a8326dd0..d86544b4151c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -417,7 +417,6 @@ int sched_proc_update_handler(struct ctl_table *table, int write,
417 WRT_SYSCTL(sched_min_granularity); 417 WRT_SYSCTL(sched_min_granularity);
418 WRT_SYSCTL(sched_latency); 418 WRT_SYSCTL(sched_latency);
419 WRT_SYSCTL(sched_wakeup_granularity); 419 WRT_SYSCTL(sched_wakeup_granularity);
420 WRT_SYSCTL(sched_shares_ratelimit);
421#undef WRT_SYSCTL 420#undef WRT_SYSCTL
422 421
423 return 0; 422 return 0;
@@ -633,7 +632,6 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
633 list_add(&se->group_node, &cfs_rq->tasks); 632 list_add(&se->group_node, &cfs_rq->tasks);
634 } 633 }
635 cfs_rq->nr_running++; 634 cfs_rq->nr_running++;
636 se->on_rq = 1;
637} 635}
638 636
639static void 637static void
@@ -647,9 +645,89 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
647 list_del_init(&se->group_node); 645 list_del_init(&se->group_node);
648 } 646 }
649 cfs_rq->nr_running--; 647 cfs_rq->nr_running--;
650 se->on_rq = 0;
651} 648}
652 649
650#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
651static void update_cfs_load(struct cfs_rq *cfs_rq)
652{
653 u64 period = sched_avg_period();
654 u64 now, delta;
655
656 if (!cfs_rq)
657 return;
658
659 now = rq_of(cfs_rq)->clock;
660 delta = now - cfs_rq->load_stamp;
661
662 cfs_rq->load_stamp = now;
663 cfs_rq->load_period += delta;
664 cfs_rq->load_avg += delta * cfs_rq->load.weight;
665
666 while (cfs_rq->load_period > period) {
667 /*
668 * Inline assembly required to prevent the compiler
669 * optimising this loop into a divmod call.
670 * See __iter_div_u64_rem() for another example of this.
671 */
672 asm("" : "+rm" (cfs_rq->load_period));
673 cfs_rq->load_period /= 2;
674 cfs_rq->load_avg /= 2;
675 }
676}
677
678static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
679 unsigned long weight)
680{
681 if (se->on_rq)
682 account_entity_dequeue(cfs_rq, se);
683
684 update_load_set(&se->load, weight);
685
686 if (se->on_rq)
687 account_entity_enqueue(cfs_rq, se);
688}
689
690static void update_cfs_shares(struct cfs_rq *cfs_rq)
691{
692 struct task_group *tg;
693 struct sched_entity *se;
694 long load_weight, load, shares;
695
696 if (!cfs_rq)
697 return;
698
699 tg = cfs_rq->tg;
700 se = tg->se[cpu_of(rq_of(cfs_rq))];
701 if (!se)
702 return;
703
704 load = cfs_rq->load.weight;
705
706 load_weight = atomic_read(&tg->load_weight);
707 load_weight -= cfs_rq->load_contribution;
708 load_weight += load;
709
710 shares = (tg->shares * load);
711 if (load_weight)
712 shares /= load_weight;
713
714 if (shares < MIN_SHARES)
715 shares = MIN_SHARES;
716 if (shares > tg->shares)
717 shares = tg->shares;
718
719 reweight_entity(cfs_rq_of(se), se, shares);
720}
721#else /* CONFIG_FAIR_GROUP_SCHED */
722static inline void update_cfs_load(struct cfs_rq *cfs_rq)
723{
724}
725
726static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
727{
728}
729#endif /* CONFIG_FAIR_GROUP_SCHED */
730
653static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 731static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
654{ 732{
655#ifdef CONFIG_SCHEDSTATS 733#ifdef CONFIG_SCHEDSTATS
@@ -771,7 +849,9 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
771 * Update run-time statistics of the 'current'. 849 * Update run-time statistics of the 'current'.
772 */ 850 */
773 update_curr(cfs_rq); 851 update_curr(cfs_rq);
852 update_cfs_load(cfs_rq);
774 account_entity_enqueue(cfs_rq, se); 853 account_entity_enqueue(cfs_rq, se);
854 update_cfs_shares(cfs_rq);
775 855
776 if (flags & ENQUEUE_WAKEUP) { 856 if (flags & ENQUEUE_WAKEUP) {
777 place_entity(cfs_rq, se, 0); 857 place_entity(cfs_rq, se, 0);
@@ -782,6 +862,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
782 check_spread(cfs_rq, se); 862 check_spread(cfs_rq, se);
783 if (se != cfs_rq->curr) 863 if (se != cfs_rq->curr)
784 __enqueue_entity(cfs_rq, se); 864 __enqueue_entity(cfs_rq, se);
865 se->on_rq = 1;
785} 866}
786 867
787static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) 868static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -825,8 +906,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
825 906
826 if (se != cfs_rq->curr) 907 if (se != cfs_rq->curr)
827 __dequeue_entity(cfs_rq, se); 908 __dequeue_entity(cfs_rq, se);
909 se->on_rq = 0;
910 update_cfs_load(cfs_rq);
828 account_entity_dequeue(cfs_rq, se); 911 account_entity_dequeue(cfs_rq, se);
829 update_min_vruntime(cfs_rq); 912 update_min_vruntime(cfs_rq);
913 update_cfs_shares(cfs_rq);
830 914
831 /* 915 /*
832 * Normalize the entity after updating the min_vruntime because the 916 * Normalize the entity after updating the min_vruntime because the
@@ -1055,6 +1139,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1055 flags = ENQUEUE_WAKEUP; 1139 flags = ENQUEUE_WAKEUP;
1056 } 1140 }
1057 1141
1142 for_each_sched_entity(se) {
1143 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1144
1145 update_cfs_load(cfs_rq);
1146 update_cfs_shares(cfs_rq);
1147 }
1148
1058 hrtick_update(rq); 1149 hrtick_update(rq);
1059} 1150}
1060 1151
@@ -1071,12 +1162,20 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1071 for_each_sched_entity(se) { 1162 for_each_sched_entity(se) {
1072 cfs_rq = cfs_rq_of(se); 1163 cfs_rq = cfs_rq_of(se);
1073 dequeue_entity(cfs_rq, se, flags); 1164 dequeue_entity(cfs_rq, se, flags);
1165
1074 /* Don't dequeue parent if it has other entities besides us */ 1166 /* Don't dequeue parent if it has other entities besides us */
1075 if (cfs_rq->load.weight) 1167 if (cfs_rq->load.weight)
1076 break; 1168 break;
1077 flags |= DEQUEUE_SLEEP; 1169 flags |= DEQUEUE_SLEEP;
1078 } 1170 }
1079 1171
1172 for_each_sched_entity(se) {
1173 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1174
1175 update_cfs_load(cfs_rq);
1176 update_cfs_shares(cfs_rq);
1177 }
1178
1080 hrtick_update(rq); 1179 hrtick_update(rq);
1081} 1180}
1082 1181
@@ -1143,51 +1242,20 @@ static void task_waking_fair(struct rq *rq, struct task_struct *p)
1143 * Adding load to a group doesn't make a group heavier, but can cause movement 1242 * Adding load to a group doesn't make a group heavier, but can cause movement
1144 * of group shares between cpus. Assuming the shares were perfectly aligned one 1243 * of group shares between cpus. Assuming the shares were perfectly aligned one
1145 * can calculate the shift in shares. 1244 * can calculate the shift in shares.
1146 *
1147 * The problem is that perfectly aligning the shares is rather expensive, hence
1148 * we try to avoid doing that too often - see update_shares(), which ratelimits
1149 * this change.
1150 *
1151 * We compensate this by not only taking the current delta into account, but
1152 * also considering the delta between when the shares were last adjusted and
1153 * now.
1154 *
1155 * We still saw a performance dip, some tracing learned us that between
1156 * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
1157 * significantly. Therefore try to bias the error in direction of failing
1158 * the affine wakeup.
1159 *
1160 */ 1245 */
1161static long effective_load(struct task_group *tg, int cpu, 1246static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
1162 long wl, long wg)
1163{ 1247{
1164 struct sched_entity *se = tg->se[cpu]; 1248 struct sched_entity *se = tg->se[cpu];
1165 1249
1166 if (!tg->parent) 1250 if (!tg->parent)
1167 return wl; 1251 return wl;
1168 1252
1169 /*
1170 * By not taking the decrease of shares on the other cpu into
1171 * account our error leans towards reducing the affine wakeups.
1172 */
1173 if (!wl && sched_feat(ASYM_EFF_LOAD))
1174 return wl;
1175
1176 for_each_sched_entity(se) { 1253 for_each_sched_entity(se) {
1177 long S, rw, s, a, b; 1254 long S, rw, s, a, b;
1178 long more_w;
1179
1180 /*
1181 * Instead of using this increment, also add the difference
1182 * between when the shares were last updated and now.
1183 */
1184 more_w = se->my_q->load.weight - se->my_q->rq_weight;
1185 wl += more_w;
1186 wg += more_w;
1187 1255
1188 S = se->my_q->tg->shares; 1256 S = se->my_q->tg->shares;
1189 s = se->my_q->shares; 1257 s = se->load.weight;
1190 rw = se->my_q->rq_weight; 1258 rw = se->my_q->load.weight;
1191 1259
1192 a = S*(rw + wl); 1260 a = S*(rw + wl);
1193 b = S*rw + s*wg; 1261 b = S*rw + s*wg;
@@ -1508,23 +1576,6 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
1508 sd = tmp; 1576 sd = tmp;
1509 } 1577 }
1510 1578
1511#ifdef CONFIG_FAIR_GROUP_SCHED
1512 if (sched_feat(LB_SHARES_UPDATE)) {
1513 /*
1514 * Pick the largest domain to update shares over
1515 */
1516 tmp = sd;
1517 if (affine_sd && (!tmp || affine_sd->span_weight > sd->span_weight))
1518 tmp = affine_sd;
1519
1520 if (tmp) {
1521 raw_spin_unlock(&rq->lock);
1522 update_shares(tmp);
1523 raw_spin_lock(&rq->lock);
1524 }
1525 }
1526#endif
1527
1528 if (affine_sd) { 1579 if (affine_sd) {
1529 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync)) 1580 if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
1530 return select_idle_sibling(p, cpu); 1581 return select_idle_sibling(p, cpu);
@@ -3014,7 +3065,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3014 schedstat_inc(sd, lb_count[idle]); 3065 schedstat_inc(sd, lb_count[idle]);
3015 3066
3016redo: 3067redo:
3017 update_shares(sd);
3018 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 3068 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
3019 cpus, balance); 3069 cpus, balance);
3020 3070
@@ -3156,8 +3206,6 @@ out_one_pinned:
3156 else 3206 else
3157 ld_moved = 0; 3207 ld_moved = 0;
3158out: 3208out:
3159 if (ld_moved)
3160 update_shares(sd);
3161 return ld_moved; 3209 return ld_moved;
3162} 3210}
3163 3211
@@ -3549,6 +3597,8 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3549 int update_next_balance = 0; 3597 int update_next_balance = 0;
3550 int need_serialize; 3598 int need_serialize;
3551 3599
3600 update_shares(cpu);
3601
3552 for_each_domain(cpu, sd) { 3602 for_each_domain(cpu, sd) {
3553 if (!(sd->flags & SD_LOAD_BALANCE)) 3603 if (!(sd->flags & SD_LOAD_BALANCE))
3554 continue; 3604 continue;
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 185f920ec1a2..68e69acc29b9 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -52,8 +52,6 @@ SCHED_FEAT(ARCH_POWER, 0)
52SCHED_FEAT(HRTICK, 0) 52SCHED_FEAT(HRTICK, 0)
53SCHED_FEAT(DOUBLE_TICK, 0) 53SCHED_FEAT(DOUBLE_TICK, 0)
54SCHED_FEAT(LB_BIAS, 1) 54SCHED_FEAT(LB_BIAS, 1)
55SCHED_FEAT(LB_SHARES_UPDATE, 1)
56SCHED_FEAT(ASYM_EFF_LOAD, 1)
57 55
58/* 56/*
59 * Spin-wait on mutex acquisition when the mutex owner is running on 57 * Spin-wait on mutex acquisition when the mutex owner is running on
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b65bf634035e..3132b25193db 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -259,8 +259,6 @@ static int min_wakeup_granularity_ns; /* 0 usecs */
259static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ 259static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
260static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; 260static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
261static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; 261static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
262static int min_sched_shares_ratelimit = 100000; /* 100 usec */
263static int max_sched_shares_ratelimit = NSEC_PER_SEC; /* 1 second */
264#endif 262#endif
265 263
266#ifdef CONFIG_COMPACTION 264#ifdef CONFIG_COMPACTION
@@ -305,15 +303,6 @@ static struct ctl_table kern_table[] = {
305 .extra2 = &max_wakeup_granularity_ns, 303 .extra2 = &max_wakeup_granularity_ns,
306 }, 304 },
307 { 305 {
308 .procname = "sched_shares_ratelimit",
309 .data = &sysctl_sched_shares_ratelimit,
310 .maxlen = sizeof(unsigned int),
311 .mode = 0644,
312 .proc_handler = sched_proc_update_handler,
313 .extra1 = &min_sched_shares_ratelimit,
314 .extra2 = &max_sched_shares_ratelimit,
315 },
316 {
317 .procname = "sched_tunable_scaling", 306 .procname = "sched_tunable_scaling",
318 .data = &sysctl_sched_tunable_scaling, 307 .data = &sysctl_sched_tunable_scaling,
319 .maxlen = sizeof(enum sched_tunable_scaling), 308 .maxlen = sizeof(enum sched_tunable_scaling),
@@ -323,14 +312,6 @@ static struct ctl_table kern_table[] = {
323 .extra2 = &max_sched_tunable_scaling, 312 .extra2 = &max_sched_tunable_scaling,
324 }, 313 },
325 { 314 {
326 .procname = "sched_shares_thresh",
327 .data = &sysctl_sched_shares_thresh,
328 .maxlen = sizeof(unsigned int),
329 .mode = 0644,
330 .proc_handler = proc_dointvec_minmax,
331 .extra1 = &zero,
332 },
333 {
334 .procname = "sched_migration_cost", 315 .procname = "sched_migration_cost",
335 .data = &sysctl_sched_migration_cost, 316 .data = &sysctl_sched_migration_cost,
336 .maxlen = sizeof(unsigned int), 317 .maxlen = sizeof(unsigned int),