author		Peter Zijlstra <a.p.zijlstra@chello.nl>	2010-11-15 18:47:00 -0500
committer	Ingo Molnar <mingo@elte.hu>	2010-11-18 07:27:46 -0500
commit		2069dd75c7d0f49355939e5586daf5a9ab216db7 (patch)
tree		c221747420e47b194a2a634024438a55420224d5 /kernel/sched.c
parent		48c5ccae88dcd989d9de507e8510313c6cbd352b (diff)
sched: Rewrite tg_shares_up()
By tracking a per-cpu load-avg for each cfs_rq and folding it into a
global task_group load on each tick we can rework tg_shares_up to be
strictly per-cpu.

This should improve cpu-cgroup performance for smp systems
significantly.

[ Paul: changed to use queueing cfs_rq + bug fixes ]

Signed-off-by: Paul Turner <pjt@google.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20101115234937.580480400@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	173
1 file changed, 44 insertions(+), 129 deletions(-)
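The mechanism the commit message describes — each cpu tracks its own load
average and folds only the delta since its last contribution into one shared
task_group counter, so group weights can be recomputed without taking remote
runqueue locks — can be illustrated with a small standalone sketch. This is
plain userspace C with made-up names (group_load_weight, fold_cpu_load,
per_cpu_load), not the kernel code below:

/*
 * Standalone sketch of the per-cpu folding idea from the commit message.
 * Illustrative only; names and data layout are hypothetical.
 */
#include <stdatomic.h>
#include <stdio.h>

#define NR_CPUS 4

static atomic_long group_load_weight;	/* shared group-wide load sum */

struct cpu_load {
	long load_avg;		/* this cpu's locally tracked load average */
	long contribution;	/* portion already folded into the group sum */
};

static struct cpu_load per_cpu_load[NR_CPUS];

/* Fold this cpu's delta into the group total; strictly per-cpu work. */
static void fold_cpu_load(int cpu)
{
	struct cpu_load *cl = &per_cpu_load[cpu];
	long delta = cl->load_avg - cl->contribution;

	atomic_fetch_add(&group_load_weight, delta);
	cl->contribution += delta;
}

int main(void)
{
	per_cpu_load[0].load_avg = 1024;
	per_cpu_load[1].load_avg = 2048;

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		fold_cpu_load(cpu);

	printf("group load = %ld\n", atomic_load(&group_load_weight));
	return 0;
}

Because each cpu only ever adds the difference between its current average and
what it last reported, repeated folds stay cheap and the shared sum converges
to the sum of the per-cpu averages, which is the property the rewritten
tg_shares_up() in the diff below relies on.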
diff --git a/kernel/sched.c b/kernel/sched.c
index b0d5f1b24a39..e2f1a3024a99 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -253,6 +253,8 @@ struct task_group {
 	/* runqueue "owned" by this group on each cpu */
 	struct cfs_rq **cfs_rq;
 	unsigned long shares;
+
+	atomic_t load_weight;
 #endif
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -359,15 +361,11 @@ struct cfs_rq {
 	 */
 	unsigned long h_load;
 
-	/*
-	 * this cpu's part of tg->shares
-	 */
-	unsigned long shares;
+	u64 load_avg;
+	u64 load_period;
+	u64 load_stamp;
 
-	/*
-	 * load.weight at the time we set shares
-	 */
-	unsigned long rq_weight;
+	unsigned long load_contribution;
 #endif
 #endif
 };
@@ -807,20 +805,6 @@ late_initcall(sched_init_debug);
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 
 /*
- * ratelimit for updating the group shares.
- * default: 0.25ms
- */
-unsigned int sysctl_sched_shares_ratelimit = 250000;
-unsigned int normalized_sysctl_sched_shares_ratelimit = 250000;
-
-/*
- * Inject some fuzzyness into changing the per-cpu group shares
- * this avoids remote rq-locks at the expense of fairness.
- * default: 4
- */
-unsigned int sysctl_sched_shares_thresh = 4;
-
-/*
  * period over which we average the RT time consumption, measured
  * in ms.
  *
@@ -1369,6 +1353,12 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
 	lw->inv_weight = 0;
 }
 
+static inline void update_load_set(struct load_weight *lw, unsigned long w)
+{
+	lw->weight = w;
+	lw->inv_weight = 0;
+}
+
 /*
  * To aid in avoiding the subversion of "niceness" due to uneven distribution
  * of tasks with abnormal "nice" values across CPUs the contribution that
@@ -1557,97 +1547,44 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
-static __read_mostly unsigned long __percpu *update_shares_data;
-
-static void __set_se_shares(struct sched_entity *se, unsigned long shares);
-
-/*
- * Calculate and set the cpu's group shares.
- */
-static void update_group_shares_cpu(struct task_group *tg, int cpu,
-				    unsigned long sd_shares,
-				    unsigned long sd_rq_weight,
-				    unsigned long *usd_rq_weight)
-{
-	unsigned long shares, rq_weight;
-	int boost = 0;
-
-	rq_weight = usd_rq_weight[cpu];
-	if (!rq_weight) {
-		boost = 1;
-		rq_weight = NICE_0_LOAD;
-	}
-
-	/*
-	 *             \Sum_j shares_j * rq_weight_i
-	 * shares_i =  -----------------------------
-	 *                  \Sum_j rq_weight_j
-	 */
-	shares = (sd_shares * rq_weight) / sd_rq_weight;
-	shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
-
-	if (abs(shares - tg->se[cpu]->load.weight) >
-			sysctl_sched_shares_thresh) {
-		struct rq *rq = cpu_rq(cpu);
-		unsigned long flags;
-
-		raw_spin_lock_irqsave(&rq->lock, flags);
-		tg->cfs_rq[cpu]->rq_weight = boost ? 0 : rq_weight;
-		tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-		__set_se_shares(tg->se[cpu], shares);
-		raw_spin_unlock_irqrestore(&rq->lock, flags);
-	}
-}
+static void update_cfs_load(struct cfs_rq *cfs_rq);
+static void update_cfs_shares(struct cfs_rq *cfs_rq);
 
 /*
- * Re-compute the task group their per cpu shares over the given domain.
- * This needs to be done in a bottom-up fashion because the rq weight of a
- * parent group depends on the shares of its child groups.
+ * update tg->load_weight by folding this cpu's load_avg
  */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-	unsigned long weight, rq_weight = 0, sum_weight = 0, shares = 0;
-	unsigned long *usd_rq_weight;
-	struct sched_domain *sd = data;
+	long load_avg;
+	struct cfs_rq *cfs_rq;
 	unsigned long flags;
-	int i;
+	int cpu = (long)data;
+	struct rq *rq;
 
-	if (!tg->se[0])
+	if (!tg->se[cpu])
 		return 0;
 
-	local_irq_save(flags);
-	usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
-
-	for_each_cpu(i, sched_domain_span(sd)) {
-		weight = tg->cfs_rq[i]->load.weight;
-		usd_rq_weight[i] = weight;
-
-		rq_weight += weight;
-		/*
-		 * If there are currently no tasks on the cpu pretend there
-		 * is one of average load so that when a new task gets to
-		 * run here it will not get delayed by group starvation.
-		 */
-		if (!weight)
-			weight = NICE_0_LOAD;
+	rq = cpu_rq(cpu);
+	cfs_rq = tg->cfs_rq[cpu];
 
-		sum_weight += weight;
-		shares += tg->cfs_rq[i]->shares;
-	}
+	raw_spin_lock_irqsave(&rq->lock, flags);
 
-	if (!rq_weight)
-		rq_weight = sum_weight;
+	update_rq_clock(rq);
+	update_cfs_load(cfs_rq);
 
-	if ((!shares && rq_weight) || shares > tg->shares)
-		shares = tg->shares;
+	load_avg = div64_u64(cfs_rq->load_avg, cfs_rq->load_period+1);
+	load_avg -= cfs_rq->load_contribution;
 
-	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
-		shares = tg->shares;
+	atomic_add(load_avg, &tg->load_weight);
+	cfs_rq->load_contribution += load_avg;
 
-	for_each_cpu(i, sched_domain_span(sd))
-		update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
+	/*
+	 * We need to update shares after updating tg->load_weight in
+	 * order to adjust the weight of groups with long running tasks.
+	 */
+	update_cfs_shares(cfs_rq);
 
-	local_irq_restore(flags);
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
 
 	return 0;
 }
@@ -1666,7 +1603,7 @@ static int tg_load_down(struct task_group *tg, void *data)
 		load = cpu_rq(cpu)->load.weight;
 	} else {
 		load = tg->parent->cfs_rq[cpu]->h_load;
-		load *= tg->cfs_rq[cpu]->shares;
+		load *= tg->se[cpu]->load.weight;
 		load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
 	}
 
@@ -1675,21 +1612,16 @@ static int tg_load_down(struct task_group *tg, void *data)
 	return 0;
 }
 
-static void update_shares(struct sched_domain *sd)
+static void update_shares(long cpu)
 {
-	s64 elapsed;
-	u64 now;
-
 	if (root_task_group_empty())
 		return;
 
-	now = local_clock();
-	elapsed = now - sd->last_update;
+	/*
+	 * XXX: replace with an on-demand list
+	 */
 
-	if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
-		sd->last_update = now;
-		walk_tg_tree(tg_nop, tg_shares_up, sd);
-	}
+	walk_tg_tree(tg_nop, tg_shares_up, (void *)cpu);
 }
 
 static void update_h_load(long cpu)
@@ -1699,7 +1631,7 @@ static void update_h_load(long cpu)
 
 #else
 
-static inline void update_shares(struct sched_domain *sd)
+static inline void update_shares(int cpu)
 {
 }
 
@@ -1824,15 +1756,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 
 #endif
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
-{
-#ifdef CONFIG_SMP
-	cfs_rq->shares = shares;
-#endif
-}
-#endif
-
 static void calc_load_account_idle(struct rq *this_rq);
 static void update_sysctl(void);
 static int get_update_sysctl_factor(void);
@@ -5551,7 +5474,6 @@ static void update_sysctl(void)
 	SET_SYSCTL(sched_min_granularity);
 	SET_SYSCTL(sched_latency);
 	SET_SYSCTL(sched_wakeup_granularity);
-	SET_SYSCTL(sched_shares_ratelimit);
 #undef SET_SYSCTL
 }
 
@@ -7787,8 +7709,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 	se->cfs_rq = parent->my_q;
 
 	se->my_q = cfs_rq;
-	se->load.weight = tg->shares;
-	se->load.inv_weight = 0;
+	update_load_set(&se->load, tg->shares);
 	se->parent = parent;
 }
 #endif
@@ -7881,10 +7802,6 @@ void __init sched_init(void)
 
 #endif /* CONFIG_CGROUP_SCHED */
 
-#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
-	update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
-					    __alignof__(unsigned long));
-#endif
 	for_each_possible_cpu(i) {
 		struct rq *rq;
 
@@ -8452,8 +8369,7 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares)
 	if (on_rq)
 		dequeue_entity(cfs_rq, se, 0);
 
-	se->load.weight = shares;
-	se->load.inv_weight = 0;
+	update_load_set(&se->load, shares);
 
 	if (on_rq)
 		enqueue_entity(cfs_rq, se, 0);
@@ -8510,7 +8426,6 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 		/*
 		 * force a rebalance
 		 */
-		cfs_rq_set_shares(tg->cfs_rq[i], 0);
 		set_se_shares(tg->se[i], shares);
 	}
 