sched: fix share (re)distribution

Fix the __aggregate_redistribute_shares() related lockup reported by David S. Miller. The problem this code tries to solve is 'accurately' calculating the 'fair' share of the group weight for each cpu. The current code falls back to a global group rebalance in case the sched_domain's span it looks at has no shares, but does have tasks. The reason it gets stuck here is that it's inherently racy - if someone steals the last task after we compute the agg->rq_weight, but before we rebalance, we'll never get out of the loop. We could of course go fix that, but while looking at this issue I found that this 'fallback' wasn't nearly as rare as I'd hoped it to be. In fact it's quite common - and given it walks the whole machine, that's very bad. The new approach is simple (why didn't I think of it before?): we set the aggregate shares to the full task group weight, and each larger sched domain that encounters an aggregate shares larger than the weight clips it (it already re-distributes anyway). This nicely converges to the desired global picture where the sum of all shares equals the task group weight. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Ingo Molnar <mingo@elte.hu>
author: Peter Zijlstra <a.p.zijlstra@chello.nl> 2008-04-24 18:25:08 -0400
committer: Ingo Molnar <mingo@elte.hu> 2008-04-24 18:25:08 -0400
commit: 3f5087a2bae5d1ce10a3d698dec8f879a96f5419 (patch)
tree: ad28e2dd5d36e7ea435032dd8a5fbd94340342ca
parent: 126e01bf92dfc5f0ba91e88be02c473e1506d7d9 (diff)
1 files changed, 2 insertions, 45 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 0014b03adaca..85e1721594f0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1657,42 +1657,6 @@ void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
 }
 /*
- * Redistribute tg->shares amongst all tg->cfs_rq[]s.
- */
-static void __aggregate_redistribute_shares(struct task_group *tg)
-{
-        int i, max_cpu = smp_processor_id();
-        unsigned long rq_weight = 0;
-        unsigned long shares, max_shares = 0, shares_rem = tg->shares;
-        for_each_possible_cpu(i)
-                rq_weight += tg->cfs_rq[i]->load.weight;
-        for_each_possible_cpu(i) {
-                /*
-                 * divide shares proportional to the rq_weights.
-                 */
-                shares = tg->shares * tg->cfs_rq[i]->load.weight;
-                shares /= rq_weight + 1;
-                tg->cfs_rq[i]->shares = shares;
-                if (shares > max_shares) {
-                        max_shares = shares;
-                        max_cpu = i;
-                }
-                shares_rem -= shares;
-        }
-        /*
-         * Ensure it all adds up to tg->shares; we can loose a few
-         * due to rounding down when computing the per-cpu shares.
-         */
-        if (shares_rem)
-                tg->cfs_rq[max_cpu]->shares += shares_rem;
-}
-/*
 * Compute the weight of this group on the given cpus.
 */
 static
@@ -1701,18 +1665,11 @@ void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
        unsigned long shares = 0;
        int i;
-again:
        for_each_cpu_mask(i, sd->span)
                shares += tg->cfs_rq[i]->shares;
-        /*
-         * When the span doesn't have any shares assigned, but does have
-         * tasks to run do a machine wide rebalance (should be rare).
-         */
-        if (unlikely(!shares && aggregate(tg, sd)->rq_weight)) {
-                __aggregate_redistribute_shares(tg);
-                goto again;
-        }
+        if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares)
+                shares = tg->shares;
        aggregate(tg, sd)->shares = shares;
 }
author	Peter Zijlstra <a.p.zijlstra@chello.nl>	2008-04-24 18:25:08 -0400
committer	Ingo Molnar <mingo@elte.hu>	2008-04-24 18:25:08 -0400
commit	3f5087a2bae5d1ce10a3d698dec8f879a96f5419 (patch)
tree	ad28e2dd5d36e7ea435032dd8a5fbd94340342ca
parent	126e01bf92dfc5f0ba91e88be02c473e1506d7d9 (diff)

diff --git a/kernel/sched.c b/kernel/sched.c index 0014b03adaca..85e1721594f0 100644 --- a/kernel/sched.c +++ b/kernel/sched.c
@@ -1657,42 +1657,6 @@ void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
1657	}	1657	}
1658		1658
1659	/*	1659	/*
1660	* Redistribute tg->shares amongst all tg->cfs_rq[]s.
1661	*/
1662	static void __aggregate_redistribute_shares(struct task_group *tg)
1663	{
1664	int i, max_cpu = smp_processor_id();
1665	unsigned long rq_weight = 0;
1666	unsigned long shares, max_shares = 0, shares_rem = tg->shares;
1667
1668	for_each_possible_cpu(i)
1669	rq_weight += tg->cfs_rq[i]->load.weight;
1670
1671	for_each_possible_cpu(i) {
1672	/*
1673	* divide shares proportional to the rq_weights.
1674	*/
1675	shares = tg->shares * tg->cfs_rq[i]->load.weight;
1676	shares /= rq_weight + 1;
1677
1678	tg->cfs_rq[i]->shares = shares;
1679
1680	if (shares > max_shares) {
1681	max_shares = shares;
1682	max_cpu = i;
1683	}
1684	shares_rem -= shares;
1685	}
1686
1687	/*
1688	* Ensure it all adds up to tg->shares; we can loose a few
1689	* due to rounding down when computing the per-cpu shares.
1690	*/
1691	if (shares_rem)
1692	tg->cfs_rq[max_cpu]->shares += shares_rem;
1693	}
1694
1695	/*
1696	* Compute the weight of this group on the given cpus.	1660	* Compute the weight of this group on the given cpus.
1697	*/	1661	*/
1698	static	1662	static
@@ -1701,18 +1665,11 @@ void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
1701	unsigned long shares = 0;	1665	unsigned long shares = 0;
1702	int i;	1666	int i;
1703		1667
1704	again:
1705	for_each_cpu_mask(i, sd->span)	1668	for_each_cpu_mask(i, sd->span)
1706	shares += tg->cfs_rq[i]->shares;	1669	shares += tg->cfs_rq[i]->shares;
1707		1670
1708	/*	1671	if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares)
1709	* When the span doesn't have any shares assigned, but does have	1672	shares = tg->shares;
1710	* tasks to run do a machine wide rebalance (should be rare).
1711	*/
1712	if (unlikely(!shares && aggregate(tg, sd)->rq_weight)) {
1713	__aggregate_redistribute_shares(tg);
1714	goto again;
1715	}
1716		1673
1717	aggregate(tg, sd)->shares = shares;	1674	aggregate(tg, sd)->shares = shares;
1718	}	1675	}