sched: simplify the group load balancer

While thinking about the previous patch - I realized that using per domain aggregate load values in load_balance_fair() is wrong. We should use the load value for that CPU. By not needing per domain hierarchical load values we don't need to store per domain aggregate shares, which greatly simplifies all the math. It basically falls apart in two separate computations: - per domain update of the shares - per CPU update of the hierarchical load Also get rid of the move_group_shares() stuff - just re-compute the shares again after a successful load balance. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com> Cc: Mike Galbraith <efault@gmx.de> Signed-off-by: Ingo Molnar <mingo@elte.hu>
author: Peter Zijlstra <a.p.zijlstra@chello.nl> 2008-06-27 07:41:23 -0400
committer: Ingo Molnar <mingo@elte.hu> 2008-06-27 08:31:36 -0400
commit: c8cba857b4997d5b00451d01474638f6a153f713 (patch)
tree: a784dce37d72ae20a0efb81b8e498b504a207650 /kernel/sched.c
parent: a25b5aca8740ea99d5e18dfc71235a52b685dcf7 (diff)
1 files changed, 64 insertions, 222 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index 716cfc8e099e..f864b751fd19 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -406,34 +406,23 @@ struct cfs_rq {
        struct task_group *tg;  /* group that "owns" this runqueue */
 #ifdef CONFIG_SMP
-        unsigned long task_weight;
-        unsigned long shares;
        /*
-         * We need space to build a sched_domain wide view of the full task
+         * the part of load.weight contributed by tasks
-         * group tree, in order to avoid depending on dynamic memory allocation
-         * during the load balancing we place this in the per cpu task group
-         * hierarchy. This limits the load balancing to one instance per cpu,
-         * but more should not be needed anyway.
         */
-        struct aggregate_struct {
+        unsigned long task_weight;
-                /*
-                 *   load = weight(cpus) * f(tg)
-                 *
-                 * Where f(tg) is the recursive weight fraction assigned to
-                 * this group.
-                 */
-                unsigned long load;
-                /*
+        /*
-                 * part of the group weight distributed to this span.
+         *   h_load = weight * f(tg)
-                 */
+         *
-                unsigned long shares;
+         * Where f(tg) is the recursive weight fraction assigned to
+         * this group.
+         */
+        unsigned long h_load;
-                /*
+        /*
-                 * The sum of all runqueue weights within this span.
+         * this cpu's part of tg->shares
-                 */
+         */
-                unsigned long rq_weight;
+        unsigned long shares;
-        } aggregate;
 #endif
 #endif
 };
@@ -1443,47 +1432,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 #ifdef CONFIG_FAIR_GROUP_SCHED
-/*
+typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
- * Group load balancing.
- *
- * We calculate a few balance domain wide aggregate numbers; load and weight.
- * Given the pictures below, and assuming each item has equal weight:
- *
- *         root          1 - thread
- *         / | \         A - group
- *        A  1  B
- *       /|\   / \
- *      C 2 D 3   4
- *      |   |
- *      5   6
- *
- * load:
- *    A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
- *    which equals 1/9-th of the total load.
- *
- * shares:
- *    The weight of this group on the selected cpus.
- *
- * rq_weight:
- *    Direct sum of all the cpu's their rq weight, e.g. A would get 3 while
- *    B would get 2.
- */
-static inline struct aggregate_struct *
-aggregate(struct task_group *tg, int cpu)
-{
-        return &tg->cfs_rq[cpu]->aggregate;
-}
-typedef void (*aggregate_func)(struct task_group *, int, struct sched_domain *);
 /*
 * Iterate the full tree, calling @down when first entering a node and @up when
 * leaving it for the final time.
 */
-static
+static void
-void aggregate_walk_tree(aggregate_func down, aggregate_func up,
+walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
-                         int cpu, struct sched_domain *sd)
 {
        struct task_group *parent, *child;
@@ -1507,72 +1463,6 @@ up:
        rcu_read_unlock();
 }
-/*
- * Calculate the aggregate runqueue weight.
- */
-static void
-aggregate_group_weight(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-        unsigned long rq_weight = 0;
-        int i;
-        for_each_cpu_mask(i, sd->span)
-                rq_weight += tg->cfs_rq[i]->load.weight;
-        aggregate(tg, cpu)->rq_weight = rq_weight;
-}
-/*
- * Compute the weight of this group on the given cpus.
- */
-static void
-aggregate_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-        unsigned long shares = 0;
-        int i;
-        for_each_cpu_mask(i, sd->span)
-                shares += tg->cfs_rq[i]->shares;
-        if ((!shares && aggregate(tg, cpu)->rq_weight) || shares > tg->shares)
-                shares = tg->shares;
-        if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
-                shares = tg->shares;
-        aggregate(tg, cpu)->shares = shares;
-}
-/*
- * Compute the load fraction assigned to this group, relies on the aggregate
- * weight and this group's parent's load, i.e. top-down.
- */
-static void
-aggregate_group_load(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-        unsigned long load;
-        if (!tg->parent) {
-                int i;
-                load = 0;
-                for_each_cpu_mask(i, sd->span)
-                        load += cpu_rq(i)->load.weight;
-        } else {
-                load = aggregate(tg->parent, cpu)->load;
-                /*
-                 * shares is our weight in the parent's rq so
-                 * shares/parent->rq_weight gives our fraction of the load
-                 */
-                load *= aggregate(tg, cpu)->shares;
-                load /= aggregate(tg->parent, cpu)->rq_weight + 1;
-        }
-        aggregate(tg, cpu)->load = load;
-}
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 /*
@@ -1580,16 +1470,16 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 */
 static void
 __update_group_shares_cpu(struct task_group *tg, int cpu,
-                          struct sched_domain *sd, int tcpu)
+                          unsigned long sd_shares, unsigned long sd_rq_weight)
 {
        int boost = 0;
        unsigned long shares;
        unsigned long rq_weight;
-        if (!tg->se[tcpu])
+        if (!tg->se[cpu])
                return;
-        rq_weight = tg->cfs_rq[tcpu]->load.weight;
+        rq_weight = tg->cfs_rq[cpu]->load.weight;
        /*
         * If there are currently no tasks on the cpu pretend there is one of
@@ -1601,124 +1491,97 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
                rq_weight = NICE_0_LOAD;
        }
+        if (unlikely(rq_weight > sd_rq_weight))
+                rq_weight = sd_rq_weight;
        /*
         *           \Sum shares * rq_weight
         * shares =  -----------------------
         *               \Sum rq_weight
         *
         */
-        shares = aggregate(tg, cpu)->shares * rq_weight;
+        shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
-        shares /= aggregate(tg, cpu)->rq_weight + 1;
        /*
         * record the actual number of shares, not the boosted amount.
         */
-        tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
+        tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
        if (shares < MIN_SHARES)
                shares = MIN_SHARES;
        else if (shares > MAX_SHARES)
                shares = MAX_SHARES;
-        __set_se_shares(tg->se[tcpu], shares);
+        __set_se_shares(tg->se[cpu], shares);
 }
 /*
- * Re-adjust the weights on the cpu the task came from and on the cpu the
+ * Re-compute the task group their per cpu shares over the given domain.
- * task went to.
+ * This needs to be done in a bottom-up fashion because the rq weight of a
+ * parent group depends on the shares of its child groups.
 */
 static void
-__move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd,
+tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
-                    int scpu, int dcpu)
 {
-        __update_group_shares_cpu(tg, cpu, sd, scpu);
+        unsigned long rq_weight = 0;
-        __update_group_shares_cpu(tg, cpu, sd, dcpu);
+        unsigned long shares = 0;
-}
+        int i;
-/*
+        for_each_cpu_mask(i, sd->span) {
- * Because changing a group's shares changes the weight of the super-group
+                rq_weight += tg->cfs_rq[i]->load.weight;
- * we need to walk up the tree and change all shares until we hit the root.
+                shares += tg->cfs_rq[i]->shares;
- */
-static void
-move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd,
-                  int scpu, int dcpu)
-{
-        while (tg) {
-                __move_group_shares(tg, cpu, sd, scpu, dcpu);
-                tg = tg->parent;
        }
-}
-static void
+        if ((!shares && rq_weight) || shares > tg->shares)
-aggregate_group_set_shares(struct task_group *tg, int cpu, struct sched_domain *sd)
+                shares = tg->shares;
-{
-        int i;
+        if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
+                shares = tg->shares;
        for_each_cpu_mask(i, sd->span) {
                struct rq *rq = cpu_rq(i);
                unsigned long flags;
                spin_lock_irqsave(&rq->lock, flags);
-                __update_group_shares_cpu(tg, cpu, sd, i);
+                __update_group_shares_cpu(tg, i, shares, rq_weight);
                spin_unlock_irqrestore(&rq->lock, flags);
        }
-        aggregate_group_shares(tg, cpu, sd);
 }
 /*
- * Calculate the accumulative weight and recursive load of each task group
+ * Compute the cpu's hierarchical load factor for each task group.
- * while walking down the tree.
+ * This needs to be done in a top-down fashion because the load of a child
+ * group is a fraction of its parents load.
 */
 static void
-aggregate_get_down(struct task_group *tg, int cpu, struct sched_domain *sd)
+tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-        aggregate_group_weight(tg, cpu, sd);
+        unsigned long load;
-        aggregate_group_shares(tg, cpu, sd);
-        aggregate_group_load(tg, cpu, sd);
-}
-/*
- * Rebalance the cpu shares while walking back up the tree.
- */
-static void
-aggregate_get_up(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-        aggregate_group_set_shares(tg, cpu, sd);
-}
-static void
-aggregate_get_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-}
-static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
-static void __init init_aggregate(void)
+        if (!tg->parent) {
-{
+                load = cpu_rq(cpu)->load.weight;
-        int i;
+        } else {
+                load = tg->parent->cfs_rq[cpu]->h_load;
+                load *= tg->cfs_rq[cpu]->shares;
+                load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
+        }
-        for_each_possible_cpu(i)
+        tg->cfs_rq[cpu]->h_load = load;
-                spin_lock_init(&per_cpu(aggregate_lock, i));
 }
-static int get_aggregate(int cpu, struct sched_domain *sd)
+static void
+tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-        if (!spin_trylock(&per_cpu(aggregate_lock, cpu)))
-                return 0;
-        aggregate_walk_tree(aggregate_get_down, aggregate_get_up, cpu, sd);
-        return 1;
 }
-static void update_aggregate(int cpu, struct sched_domain *sd)
+static void update_shares(struct sched_domain *sd)
 {
-        aggregate_walk_tree(aggregate_get_down, aggregate_get_nop, cpu, sd);
+        walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
 }
-static void put_aggregate(int cpu, struct sched_domain *sd)
+static void update_h_load(int cpu)
 {
-        spin_unlock(&per_cpu(aggregate_lock, cpu));
+        walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
 }
 static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
@@ -1728,22 +1591,10 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 #else
-static inline void init_aggregate(void)
+static inline void update_shares(struct sched_domain *sd)
-{
-}
-static inline int get_aggregate(int cpu, struct sched_domain *sd)
-{
-        return 0;
-}
-static inline void update_aggregate(int cpu, struct sched_domain *sd)
 {
 }
-static inline void put_aggregate(int cpu, struct sched_domain *sd)
-{
-}
 #endif
 #endif
@@ -2172,12 +2023,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
        int load_idx = sd->forkexec_idx;
        int imbalance = 100 + (sd->imbalance_pct-100)/2;
-        /*
-         * now that we have both rqs locked the rq weight won't change
-         * anymore - so update the stats.
-         */
-        update_aggregate(this_cpu, sd);
        do {
                unsigned long load, avg_load;
                int local_group;
@@ -3521,12 +3366,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
        unsigned long imbalance;
        struct rq *busiest;
        unsigned long flags;
-        int unlock_aggregate;
        cpus_setall(*cpus);
-        unlock_aggregate = get_aggregate(this_cpu, sd);
        /*
         * When power savings policy is enabled for the parent domain, idle
         * sibling can pick up load irrespective of busy siblings. In this case,
@@ -3540,6 +3382,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
        schedstat_inc(sd, lb_count[idle]);
 redo:
+        update_shares(sd);
        group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
                                   cpus, balance);
@@ -3663,8 +3506,8 @@ out_one_pinned:
        else
                ld_moved = 0;
 out:
-        if (unlock_aggregate)
+        if (ld_moved)
-                put_aggregate(this_cpu, sd);
+                update_shares(sd);
        return ld_moved;
 }
@@ -8019,7 +7862,6 @@ void __init sched_init(void)
        }
 #ifdef CONFIG_SMP
-        init_aggregate();
        init_defrootdomain();
 #endif
author	Peter Zijlstra <a.p.zijlstra@chello.nl>	2008-06-27 07:41:23 -0400
committer	Ingo Molnar <mingo@elte.hu>	2008-06-27 08:31:36 -0400
commit	c8cba857b4997d5b00451d01474638f6a153f713 (patch)
tree	a784dce37d72ae20a0efb81b8e498b504a207650 /kernel/sched.c
parent	a25b5aca8740ea99d5e18dfc71235a52b685dcf7 (diff)

diff --git a/kernel/sched.c b/kernel/sched.c index 716cfc8e099e..f864b751fd19 100644 --- a/kernel/sched.c +++ b/kernel/sched.c
@@ -406,34 +406,23 @@ struct cfs_rq {
406	struct task_group tg; / group that "owns" this runqueue */	406	struct task_group tg; / group that "owns" this runqueue */
407		407
408	#ifdef CONFIG_SMP	408	#ifdef CONFIG_SMP
409	unsigned long task_weight;
410	unsigned long shares;
411	/*	409	/*
412	* We need space to build a sched_domain wide view of the full task	410	* the part of load.weight contributed by tasks
413	* group tree, in order to avoid depending on dynamic memory allocation
414	* during the load balancing we place this in the per cpu task group
415	* hierarchy. This limits the load balancing to one instance per cpu,
416	* but more should not be needed anyway.
417	*/	411	*/
418	struct aggregate_struct {	412	unsigned long task_weight;
419	/*
420	* load = weight(cpus) * f(tg)
421	*
422	* Where f(tg) is the recursive weight fraction assigned to
423	* this group.
424	*/
425	unsigned long load;
426		413
427	/*	414	/*
428	* part of the group weight distributed to this span.	415	* h_load = weight * f(tg)
429	*/	416	*
430	unsigned long shares;	417	* Where f(tg) is the recursive weight fraction assigned to
		418	* this group.
		419	*/
		420	unsigned long h_load;
431		421
432	/*	422	/*
433	* The sum of all runqueue weights within this span.	423	* this cpu's part of tg->shares
434	*/	424	*/
435	unsigned long rq_weight;	425	unsigned long shares;
436	} aggregate;
437	#endif	426	#endif
438	#endif	427	#endif
439	};	428	};
@@ -1443,47 +1432,14 @@ static int task_hot(struct task_struct p, u64 now, struct sched_domain sd);
1443		1432
1444	#ifdef CONFIG_FAIR_GROUP_SCHED	1433	#ifdef CONFIG_FAIR_GROUP_SCHED
1445		1434
1446	/*	1435	typedef void (tg_visitor)(struct task_group , int, struct sched_domain *);
1447	* Group load balancing.
1448	*
1449	* We calculate a few balance domain wide aggregate numbers; load and weight.
1450	* Given the pictures below, and assuming each item has equal weight:
1451	*
1452	* root 1 - thread
1453	* / \| \ A - group
1454	* A 1 B
1455	* /\|\ / \
1456	* C 2 D 3 4
1457	* \| \|
1458	* 5 6
1459	*
1460	* load:
1461	* A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
1462	* which equals 1/9-th of the total load.
1463	*
1464	* shares:
1465	* The weight of this group on the selected cpus.
1466	*
1467	* rq_weight:
1468	* Direct sum of all the cpu's their rq weight, e.g. A would get 3 while
1469	* B would get 2.
1470	*/
1471
1472	static inline struct aggregate_struct *
1473	aggregate(struct task_group *tg, int cpu)
1474	{
1475	return &tg->cfs_rq[cpu]->aggregate;
1476	}
1477
1478	typedef void (aggregate_func)(struct task_group , int, struct sched_domain *);
1479		1436
1480	/*	1437	/*
1481	* Iterate the full tree, calling @down when first entering a node and @up when	1438	* Iterate the full tree, calling @down when first entering a node and @up when
1482	* leaving it for the final time.	1439	* leaving it for the final time.
1483	*/	1440	*/
1484	static	1441	static void
1485	void aggregate_walk_tree(aggregate_func down, aggregate_func up,	1442	walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
1486	int cpu, struct sched_domain *sd)
1487	{	1443	{
1488	struct task_group parent, child;	1444	struct task_group parent, child;
1489		1445
@@ -1507,72 +1463,6 @@ up:
1507	rcu_read_unlock();	1463	rcu_read_unlock();
1508	}	1464	}
1509		1465
1510	/*
1511	* Calculate the aggregate runqueue weight.
1512	*/
1513	static void
1514	aggregate_group_weight(struct task_group tg, int cpu, struct sched_domain sd)
1515	{
1516	unsigned long rq_weight = 0;
1517	int i;
1518
1519	for_each_cpu_mask(i, sd->span)
1520	rq_weight += tg->cfs_rq[i]->load.weight;
1521
1522	aggregate(tg, cpu)->rq_weight = rq_weight;
1523	}
1524
1525	/*
1526	* Compute the weight of this group on the given cpus.
1527	*/
1528	static void
1529	aggregate_group_shares(struct task_group tg, int cpu, struct sched_domain sd)
1530	{
1531	unsigned long shares = 0;
1532	int i;
1533
1534	for_each_cpu_mask(i, sd->span)
1535	shares += tg->cfs_rq[i]->shares;
1536
1537	if ((!shares && aggregate(tg, cpu)->rq_weight) \|\| shares > tg->shares)
1538	shares = tg->shares;
1539
1540	if (!sd->parent \|\| !(sd->parent->flags & SD_LOAD_BALANCE))
1541	shares = tg->shares;
1542
1543	aggregate(tg, cpu)->shares = shares;
1544	}
1545
1546	/*
1547	* Compute the load fraction assigned to this group, relies on the aggregate
1548	* weight and this group's parent's load, i.e. top-down.
1549	*/
1550	static void
1551	aggregate_group_load(struct task_group tg, int cpu, struct sched_domain sd)
1552	{
1553	unsigned long load;
1554
1555	if (!tg->parent) {
1556	int i;
1557
1558	load = 0;
1559	for_each_cpu_mask(i, sd->span)
1560	load += cpu_rq(i)->load.weight;
1561
1562	} else {
1563	load = aggregate(tg->parent, cpu)->load;
1564
1565	/*
1566	* shares is our weight in the parent's rq so
1567	* shares/parent->rq_weight gives our fraction of the load
1568	*/
1569	load *= aggregate(tg, cpu)->shares;
1570	load /= aggregate(tg->parent, cpu)->rq_weight + 1;
1571	}
1572
1573	aggregate(tg, cpu)->load = load;
1574	}
1575
1576	static void __set_se_shares(struct sched_entity *se, unsigned long shares);	1466	static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1577		1467
1578	/*	1468	/*
@@ -1580,16 +1470,16 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1580	*/	1470	*/
1581	static void	1471	static void
1582	__update_group_shares_cpu(struct task_group *tg, int cpu,	1472	__update_group_shares_cpu(struct task_group *tg, int cpu,
1583	struct sched_domain *sd, int tcpu)	1473	unsigned long sd_shares, unsigned long sd_rq_weight)
1584	{	1474	{
1585	int boost = 0;	1475	int boost = 0;
1586	unsigned long shares;	1476	unsigned long shares;
1587	unsigned long rq_weight;	1477	unsigned long rq_weight;
1588		1478
1589	if (!tg->se[tcpu])	1479	if (!tg->se[cpu])
1590	return;	1480	return;
1591		1481
1592	rq_weight = tg->cfs_rq[tcpu]->load.weight;	1482	rq_weight = tg->cfs_rq[cpu]->load.weight;
1593		1483
1594	/*	1484	/*
1595	* If there are currently no tasks on the cpu pretend there is one of	1485	* If there are currently no tasks on the cpu pretend there is one of
@@ -1601,124 +1491,97 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
1601	rq_weight = NICE_0_LOAD;	1491	rq_weight = NICE_0_LOAD;
1602	}	1492	}
1603		1493
		1494	if (unlikely(rq_weight > sd_rq_weight))
		1495	rq_weight = sd_rq_weight;
		1496
1604	/*	1497	/*
1605	* \Sum shares * rq_weight	1498	* \Sum shares * rq_weight
1606	* shares = -----------------------	1499	* shares = -----------------------
1607	* \Sum rq_weight	1500	* \Sum rq_weight
1608	*	1501	*
1609	*/	1502	*/
1610	shares = aggregate(tg, cpu)->shares * rq_weight;	1503	shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
1611	shares /= aggregate(tg, cpu)->rq_weight + 1;
1612		1504
1613	/*	1505	/*
1614	* record the actual number of shares, not the boosted amount.	1506	* record the actual number of shares, not the boosted amount.
1615	*/	1507	*/
1616	tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;	1508	tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1617		1509
1618	if (shares < MIN_SHARES)	1510	if (shares < MIN_SHARES)
1619	shares = MIN_SHARES;	1511	shares = MIN_SHARES;
1620	else if (shares > MAX_SHARES)	1512	else if (shares > MAX_SHARES)
1621	shares = MAX_SHARES;	1513	shares = MAX_SHARES;
1622		1514
1623	__set_se_shares(tg->se[tcpu], shares);	1515	__set_se_shares(tg->se[cpu], shares);
1624	}	1516	}
1625		1517
1626	/*	1518	/*
1627	* Re-adjust the weights on the cpu the task came from and on the cpu the	1519	* Re-compute the task group their per cpu shares over the given domain.
1628	* task went to.	1520	* This needs to be done in a bottom-up fashion because the rq weight of a
		1521	* parent group depends on the shares of its child groups.
1629	*/	1522	*/
1630	static void	1523	static void
1631	__move_group_shares(struct task_group tg, int cpu, struct sched_domain sd,	1524	tg_shares_up(struct task_group tg, int cpu, struct sched_domain sd)
1632	int scpu, int dcpu)
1633	{	1525	{
1634	__update_group_shares_cpu(tg, cpu, sd, scpu);	1526	unsigned long rq_weight = 0;
1635	__update_group_shares_cpu(tg, cpu, sd, dcpu);	1527	unsigned long shares = 0;
1636	}	1528	int i;
1637		1529
1638	/*	1530	for_each_cpu_mask(i, sd->span) {
1639	* Because changing a group's shares changes the weight of the super-group	1531	rq_weight += tg->cfs_rq[i]->load.weight;
1640	* we need to walk up the tree and change all shares until we hit the root.	1532	shares += tg->cfs_rq[i]->shares;
1641	*/
1642	static void
1643	move_group_shares(struct task_group tg, int cpu, struct sched_domain sd,
1644	int scpu, int dcpu)
1645	{
1646	while (tg) {
1647	__move_group_shares(tg, cpu, sd, scpu, dcpu);
1648	tg = tg->parent;
1649	}	1533	}
1650	}
1651		1534
1652	static void	1535	if ((!shares && rq_weight) \|\| shares > tg->shares)
1653	aggregate_group_set_shares(struct task_group tg, int cpu, struct sched_domain sd)	1536	shares = tg->shares;
1654	{	1537
1655	int i;	1538	if (!sd->parent \|\| !(sd->parent->flags & SD_LOAD_BALANCE))
		1539	shares = tg->shares;
1656		1540
1657	for_each_cpu_mask(i, sd->span) {	1541	for_each_cpu_mask(i, sd->span) {
1658	struct rq *rq = cpu_rq(i);	1542	struct rq *rq = cpu_rq(i);
1659	unsigned long flags;	1543	unsigned long flags;
1660		1544
1661	spin_lock_irqsave(&rq->lock, flags);	1545	spin_lock_irqsave(&rq->lock, flags);
1662	__update_group_shares_cpu(tg, cpu, sd, i);	1546	__update_group_shares_cpu(tg, i, shares, rq_weight);
1663	spin_unlock_irqrestore(&rq->lock, flags);	1547	spin_unlock_irqrestore(&rq->lock, flags);
1664	}	1548	}
1665
1666	aggregate_group_shares(tg, cpu, sd);
1667	}	1549	}
1668		1550
1669	/*	1551	/*
1670	* Calculate the accumulative weight and recursive load of each task group	1552	* Compute the cpu's hierarchical load factor for each task group.
1671	* while walking down the tree.	1553	* This needs to be done in a top-down fashion because the load of a child
		1554	* group is a fraction of its parents load.
1672	*/	1555	*/
1673	static void	1556	static void
1674	aggregate_get_down(struct task_group tg, int cpu, struct sched_domain sd)	1557	tg_load_down(struct task_group tg, int cpu, struct sched_domain sd)
1675	{	1558	{
1676	aggregate_group_weight(tg, cpu, sd);	1559	unsigned long load;
1677	aggregate_group_shares(tg, cpu, sd);
1678	aggregate_group_load(tg, cpu, sd);
1679	}
1680
1681	/*
1682	* Rebalance the cpu shares while walking back up the tree.
1683	*/
1684	static void
1685	aggregate_get_up(struct task_group tg, int cpu, struct sched_domain sd)
1686	{
1687	aggregate_group_set_shares(tg, cpu, sd);
1688	}
1689
1690	static void
1691	aggregate_get_nop(struct task_group tg, int cpu, struct sched_domain sd)
1692	{
1693	}
1694
1695	static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
1696		1560
1697	static void __init init_aggregate(void)	1561	if (!tg->parent) {
1698	{	1562	load = cpu_rq(cpu)->load.weight;
1699	int i;	1563	} else {
		1564	load = tg->parent->cfs_rq[cpu]->h_load;
		1565	load *= tg->cfs_rq[cpu]->shares;
		1566	load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
		1567	}
1700		1568
1701	for_each_possible_cpu(i)	1569	tg->cfs_rq[cpu]->h_load = load;
1702	spin_lock_init(&per_cpu(aggregate_lock, i));
1703	}	1570	}
1704		1571
1705	static int get_aggregate(int cpu, struct sched_domain *sd)	1572	static void
		1573	tg_nop(struct task_group tg, int cpu, struct sched_domain sd)
1706	{	1574	{
1707	if (!spin_trylock(&per_cpu(aggregate_lock, cpu)))
1708	return 0;
1709
1710	aggregate_walk_tree(aggregate_get_down, aggregate_get_up, cpu, sd);
1711	return 1;
1712	}	1575	}
1713		1576
1714	static void update_aggregate(int cpu, struct sched_domain *sd)	1577	static void update_shares(struct sched_domain *sd)
1715	{	1578	{
1716	aggregate_walk_tree(aggregate_get_down, aggregate_get_nop, cpu, sd);	1579	walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
1717	}	1580	}
1718		1581
1719	static void put_aggregate(int cpu, struct sched_domain *sd)	1582	static void update_h_load(int cpu)
1720	{	1583	{
1721	spin_unlock(&per_cpu(aggregate_lock, cpu));	1584	walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
1722	}	1585	}
1723		1586
1724	static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)	1587	static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
@@ -1728,22 +1591,10 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1728		1591
1729	#else	1592	#else
1730		1593
1731	static inline void init_aggregate(void)	1594	static inline void update_shares(struct sched_domain *sd)
1732	{
1733	}
1734
1735	static inline int get_aggregate(int cpu, struct sched_domain *sd)
1736	{
1737	return 0;
1738	}
1739
1740	static inline void update_aggregate(int cpu, struct sched_domain *sd)
1741	{	1595	{
1742	}	1596	}
1743		1597
1744	static inline void put_aggregate(int cpu, struct sched_domain *sd)
1745	{
1746	}
1747	#endif	1598	#endif
1748		1599
1749	#endif	1600	#endif
@@ -2172,12 +2023,6 @@ find_idlest_group(struct sched_domain sd, struct task_struct p, int this_cpu)
2172	int load_idx = sd->forkexec_idx;	2023	int load_idx = sd->forkexec_idx;
2173	int imbalance = 100 + (sd->imbalance_pct-100)/2;	2024	int imbalance = 100 + (sd->imbalance_pct-100)/2;
2174		2025
2175	/*
2176	* now that we have both rqs locked the rq weight won't change
2177	* anymore - so update the stats.
2178	*/
2179	update_aggregate(this_cpu, sd);
2180
2181	do {	2026	do {
2182	unsigned long load, avg_load;	2027	unsigned long load, avg_load;
2183	int local_group;	2028	int local_group;
@@ -3521,12 +3366,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3521	unsigned long imbalance;	3366	unsigned long imbalance;
3522	struct rq *busiest;	3367	struct rq *busiest;
3523	unsigned long flags;	3368	unsigned long flags;
3524	int unlock_aggregate;
3525		3369
3526	cpus_setall(*cpus);	3370	cpus_setall(*cpus);
3527		3371
3528	unlock_aggregate = get_aggregate(this_cpu, sd);
3529
3530	/*	3372	/*
3531	* When power savings policy is enabled for the parent domain, idle	3373	* When power savings policy is enabled for the parent domain, idle
3532	* sibling can pick up load irrespective of busy siblings. In this case,	3374	* sibling can pick up load irrespective of busy siblings. In this case,
@@ -3540,6 +3382,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3540	schedstat_inc(sd, lb_count[idle]);	3382	schedstat_inc(sd, lb_count[idle]);
3541		3383
3542	redo:	3384	redo:
		3385	update_shares(sd);
3543	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,	3386	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
3544	cpus, balance);	3387	cpus, balance);
3545		3388
@@ -3663,8 +3506,8 @@ out_one_pinned:
3663	else	3506	else
3664	ld_moved = 0;	3507	ld_moved = 0;
3665	out:	3508	out:
3666	if (unlock_aggregate)	3509	if (ld_moved)
3667	put_aggregate(this_cpu, sd);	3510	update_shares(sd);
3668	return ld_moved;	3511	return ld_moved;
3669	}	3512	}
3670		3513
@@ -8019,7 +7862,6 @@ void __init sched_init(void)
8019	}	7862	}
8020		7863
8021	#ifdef CONFIG_SMP	7864	#ifdef CONFIG_SMP
8022	init_aggregate();
8023	init_defrootdomain();	7865	init_defrootdomain();
8024	#endif	7866	#endif
8025		7867