Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	286
1 file changed, 64 insertions(+), 222 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 716cfc8e099e..f864b751fd19 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -406,34 +406,23 @@ struct cfs_rq {
 	struct task_group *tg;	/* group that "owns" this runqueue */
 
 #ifdef CONFIG_SMP
-	unsigned long task_weight;
-	unsigned long shares;
 	/*
-	 * We need space to build a sched_domain wide view of the full task
-	 * group tree, in order to avoid depending on dynamic memory allocation
-	 * during the load balancing we place this in the per cpu task group
-	 * hierarchy. This limits the load balancing to one instance per cpu,
-	 * but more should not be needed anyway.
+	 * the part of load.weight contributed by tasks
 	 */
-	struct aggregate_struct {
-		/*
-		 *   load = weight(cpus) * f(tg)
-		 *
-		 * Where f(tg) is the recursive weight fraction assigned to
-		 * this group.
-		 */
-		unsigned long load;
+	unsigned long task_weight;
 
-		/*
-		 * part of the group weight distributed to this span.
-		 */
-		unsigned long shares;
+	/*
+	 *   h_load = weight * f(tg)
+	 *
+	 * Where f(tg) is the recursive weight fraction assigned to
+	 * this group.
+	 */
+	unsigned long h_load;
 
-		/*
-		 * The sum of all runqueue weights within this span.
-		 */
-		unsigned long rq_weight;
-	} aggregate;
+	/*
+	 * this cpu's part of tg->shares
+	 */
+	unsigned long shares;
 #endif
 #endif
 };
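For orientation, the new per-cpu fields compose top-down: a group's h_load is its parent's h_load scaled by this group's share of the parent runqueue, which is what tg_load_down() computes later in this patch. A standalone sketch of that composition, with made-up types and weights rather than kernel code (the kernel walks the tree once per node via walk_tg_tree() instead of re-recursing per child):

/*
 * Sketch of how h_load composes down the group hierarchy.
 * Toy types and numbers; not kernel code.
 */
#include <stdio.h>

struct grp {
	struct grp *parent;
	unsigned long rq_weight;	/* this group's cfs_rq load.weight on the cpu */
	unsigned long shares;		/* this cpu's part of the group's shares */
	unsigned long h_load;
};

static void compute_h_load(struct grp *g, unsigned long cpu_rq_weight)
{
	if (!g->parent) {
		g->h_load = cpu_rq_weight;	/* cpu_rq(cpu)->load.weight */
		return;
	}
	compute_h_load(g->parent, cpu_rq_weight);
	/* load = parent->h_load * shares / (parent rq weight + 1) */
	g->h_load = g->parent->h_load * g->shares / (g->parent->rq_weight + 1);
}

int main(void)
{
	struct grp root  = { .parent = NULL, .rq_weight = 2048 };
	struct grp child = { .parent = &root, .rq_weight = 1024, .shares = 1024 };

	compute_h_load(&child, 3072);
	printf("child h_load = %lu\n", child.h_load);	/* 1535, roughly half of 3072 */
	return 0;
}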
@@ -1443,47 +1432,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
-/*
- * Group load balancing.
- *
- * We calculate a few balance domain wide aggregate numbers; load and weight.
- * Given the pictures below, and assuming each item has equal weight:
- *
- *         root          1 - thread
- *        / | \           A - group
- *       A  1  B
- *      /|\   / \
- *     C 2 D 3   4
- *     |   |
- *     5   6
- *
- * load:
- *    A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
- *    which equals 1/9-th of the total load.
- *
- * shares:
- *    The weight of this group on the selected cpus.
- *
- * rq_weight:
- *    Direct sum of all the cpu's their rq weight, e.g. A would get 3 while
- *    B would get 2.
- */
-
-static inline struct aggregate_struct *
-aggregate(struct task_group *tg, int cpu)
-{
-	return &tg->cfs_rq[cpu]->aggregate;
-}
-
-typedef void (*aggregate_func)(struct task_group *, int, struct sched_domain *);
+typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
 
 /*
  * Iterate the full tree, calling @down when first entering a node and @up when
  * leaving it for the final time.
  */
-static
-void aggregate_walk_tree(aggregate_func down, aggregate_func up,
-			 int cpu, struct sched_domain *sd)
+static void
+walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
 {
 	struct task_group *parent, *child;
 
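walk_tg_tree() above is a plain pre/post-order visitor over the task_group tree: @down runs when a node is first entered, @up after all of its children have been handled. A recursive sketch of the same pattern with a toy node type, under the caveat that the kernel walk is iterative over the ->children/->siblings lists under rcu_read_lock() and passes (tg, cpu, sd) to each visitor:

/*
 * Sketch of the down/up visitor pattern used by walk_tg_tree().
 * Toy node type; not kernel code.
 */
#include <stdio.h>

struct node {
	const char *name;
	struct node *child[4];	/* NULL-terminated list of children */
};

typedef void (*visitor)(struct node *);

static void walk_tree(visitor down, visitor up, struct node *n)
{
	down(n);			/* top-down work, e.g. tg_load_down() */
	for (int i = 0; i < 4 && n->child[i]; i++)
		walk_tree(down, up, n->child[i]);
	up(n);				/* bottom-up work, e.g. tg_shares_up() */
}

static void enter(struct node *n) { printf("down %s\n", n->name); }
static void leave(struct node *n) { printf("up   %s\n", n->name); }

int main(void)
{
	struct node c = { "child", { NULL } };
	struct node root = { "root", { &c, NULL } };

	/* pass a no-op visitor (cf. tg_nop) for the unused direction */
	walk_tree(enter, leave, &root);
	return 0;
}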
@@ -1507,72 +1463,6 @@ up:
 	rcu_read_unlock();
 }
 
-/*
- * Calculate the aggregate runqueue weight.
- */
-static void
-aggregate_group_weight(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	unsigned long rq_weight = 0;
-	int i;
-
-	for_each_cpu_mask(i, sd->span)
-		rq_weight += tg->cfs_rq[i]->load.weight;
-
-	aggregate(tg, cpu)->rq_weight = rq_weight;
-}
-
-/*
- * Compute the weight of this group on the given cpus.
- */
-static void
-aggregate_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	unsigned long shares = 0;
-	int i;
-
-	for_each_cpu_mask(i, sd->span)
-		shares += tg->cfs_rq[i]->shares;
-
-	if ((!shares && aggregate(tg, cpu)->rq_weight) || shares > tg->shares)
-		shares = tg->shares;
-
-	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
-		shares = tg->shares;
-
-	aggregate(tg, cpu)->shares = shares;
-}
-
-/*
- * Compute the load fraction assigned to this group, relies on the aggregate
- * weight and this group's parent's load, i.e. top-down.
- */
-static void
-aggregate_group_load(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	unsigned long load;
-
-	if (!tg->parent) {
-		int i;
-
-		load = 0;
-		for_each_cpu_mask(i, sd->span)
-			load += cpu_rq(i)->load.weight;
-
-	} else {
-		load = aggregate(tg->parent, cpu)->load;
-
-		/*
-		 * shares is our weight in the parent's rq so
-		 * shares/parent->rq_weight gives our fraction of the load
-		 */
-		load *= aggregate(tg, cpu)->shares;
-		load /= aggregate(tg->parent, cpu)->rq_weight + 1;
-	}
-
-	aggregate(tg, cpu)->load = load;
-}
-
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 
 /*
@@ -1580,16 +1470,16 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
  */
 static void
 __update_group_shares_cpu(struct task_group *tg, int cpu,
-			  struct sched_domain *sd, int tcpu)
+			  unsigned long sd_shares, unsigned long sd_rq_weight)
 {
 	int boost = 0;
 	unsigned long shares;
 	unsigned long rq_weight;
 
-	if (!tg->se[tcpu])
+	if (!tg->se[cpu])
 		return;
 
-	rq_weight = tg->cfs_rq[tcpu]->load.weight;
+	rq_weight = tg->cfs_rq[cpu]->load.weight;
 
 	/*
 	 * If there are currently no tasks on the cpu pretend there is one of
@@ -1601,124 +1491,97 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
 		rq_weight = NICE_0_LOAD;
 	}
 
+	if (unlikely(rq_weight > sd_rq_weight))
+		rq_weight = sd_rq_weight;
+
 	/*
 	 *           \Sum shares * rq_weight
 	 * shares =  -----------------------
 	 *               \Sum rq_weight
 	 *
 	 */
-	shares = aggregate(tg, cpu)->shares * rq_weight;
-	shares /= aggregate(tg, cpu)->rq_weight + 1;
+	shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
 
 	/*
 	 * record the actual number of shares, not the boosted amount.
 	 */
-	tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
+	tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
 
 	if (shares < MIN_SHARES)
 		shares = MIN_SHARES;
 	else if (shares > MAX_SHARES)
 		shares = MAX_SHARES;
 
-	__set_se_shares(tg->se[tcpu], shares);
+	__set_se_shares(tg->se[cpu], shares);
 }
 
 /*
- * Re-adjust the weights on the cpu the task came from and on the cpu the
- * task went to.
+ * Re-compute the task group their per cpu shares over the given domain.
+ * This needs to be done in a bottom-up fashion because the rq weight of a
+ * parent group depends on the shares of its child groups.
  */
 static void
-__move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd,
-		    int scpu, int dcpu)
+tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	__update_group_shares_cpu(tg, cpu, sd, scpu);
-	__update_group_shares_cpu(tg, cpu, sd, dcpu);
-}
+	unsigned long rq_weight = 0;
+	unsigned long shares = 0;
+	int i;
 
-/*
- * Because changing a group's shares changes the weight of the super-group
- * we need to walk up the tree and change all shares until we hit the root.
- */
-static void
-move_group_shares(struct task_group *tg, int cpu, struct sched_domain *sd,
-		  int scpu, int dcpu)
-{
-	while (tg) {
-		__move_group_shares(tg, cpu, sd, scpu, dcpu);
-		tg = tg->parent;
+	for_each_cpu_mask(i, sd->span) {
+		rq_weight += tg->cfs_rq[i]->load.weight;
+		shares += tg->cfs_rq[i]->shares;
 	}
-}
 
-static void
-aggregate_group_set_shares(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	int i;
+	if ((!shares && rq_weight) || shares > tg->shares)
+		shares = tg->shares;
+
+	if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
+		shares = tg->shares;
 
 	for_each_cpu_mask(i, sd->span) {
 		struct rq *rq = cpu_rq(i);
 		unsigned long flags;
 
 		spin_lock_irqsave(&rq->lock, flags);
-		__update_group_shares_cpu(tg, cpu, sd, i);
+		__update_group_shares_cpu(tg, i, shares, rq_weight);
 		spin_unlock_irqrestore(&rq->lock, flags);
 	}
-
-	aggregate_group_shares(tg, cpu, sd);
 }
 
 /*
- * Calculate the accumulative weight and recursive load of each task group
- * while walking down the tree.
+ * Compute the cpu's hierarchical load factor for each task group.
+ * This needs to be done in a top-down fashion because the load of a child
+ * group is a fraction of its parents load.
  */
 static void
-aggregate_get_down(struct task_group *tg, int cpu, struct sched_domain *sd)
+tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	aggregate_group_weight(tg, cpu, sd);
-	aggregate_group_shares(tg, cpu, sd);
-	aggregate_group_load(tg, cpu, sd);
-}
-
-/*
- * Rebalance the cpu shares while walking back up the tree.
- */
-static void
-aggregate_get_up(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-	aggregate_group_set_shares(tg, cpu, sd);
-}
-
-static void
-aggregate_get_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
-{
-}
-
-static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
+	unsigned long load;
 
-static void __init init_aggregate(void)
-{
-	int i;
+	if (!tg->parent) {
+		load = cpu_rq(cpu)->load.weight;
+	} else {
+		load = tg->parent->cfs_rq[cpu]->h_load;
+		load *= tg->cfs_rq[cpu]->shares;
+		load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
+	}
 
-	for_each_possible_cpu(i)
-		spin_lock_init(&per_cpu(aggregate_lock, i));
+	tg->cfs_rq[cpu]->h_load = load;
 }
 
-static int get_aggregate(int cpu, struct sched_domain *sd)
+static void
+tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
 {
-	if (!spin_trylock(&per_cpu(aggregate_lock, cpu)))
-		return 0;
-
-	aggregate_walk_tree(aggregate_get_down, aggregate_get_up, cpu, sd);
-	return 1;
 }
 
-static void update_aggregate(int cpu, struct sched_domain *sd)
+static void update_shares(struct sched_domain *sd)
 {
-	aggregate_walk_tree(aggregate_get_down, aggregate_get_nop, cpu, sd);
+	walk_tg_tree(tg_nop, tg_shares_up, 0, sd);
 }
 
-static void put_aggregate(int cpu, struct sched_domain *sd)
+static void update_h_load(int cpu)
 {
-	spin_unlock(&per_cpu(aggregate_lock, cpu));
+	walk_tg_tree(tg_load_down, tg_nop, cpu, NULL);
 }
 
 static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
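To make the split done by tg_shares_up() and __update_group_shares_cpu() concrete: the group's tg->shares are handed out to each cpu in proportion to that cpu's runqueue weight within the domain, and the +1 in the divisor only guards against a zero denominator. A worked example with made-up inputs (two cpus and arbitrary weights, not kernel data):

/*
 * Worked example of the per-cpu shares split:
 * shares_i = sd_shares * w_i / (sum of w + 1).
 */
#include <stdio.h>

int main(void)
{
	unsigned long tg_shares = 1024;			/* tg->shares */
	unsigned long rq_weight[2] = { 3072, 1024 };	/* cfs_rq[i]->load.weight */
	unsigned long sd_rq_weight = 0;

	for (int i = 0; i < 2; i++)
		sd_rq_weight += rq_weight[i];		/* 4096 */

	for (int i = 0; i < 2; i++) {
		unsigned long shares =
			(tg_shares * rq_weight[i]) / (sd_rq_weight + 1);
		/* cpu0 gets ~768, cpu1 gets ~256: 3/4 and 1/4 of tg->shares */
		printf("cpu%d shares = %lu\n", i, shares);
	}
	return 0;
}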
@@ -1728,22 +1591,10 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 
 #else
 
-static inline void init_aggregate(void)
-{
-}
-
-static inline int get_aggregate(int cpu, struct sched_domain *sd)
-{
-	return 0;
-}
-
-static inline void update_aggregate(int cpu, struct sched_domain *sd)
+static inline void update_shares(struct sched_domain *sd)
 {
 }
 
-static inline void put_aggregate(int cpu, struct sched_domain *sd)
-{
-}
 #endif
 
 #endif
@@ -2172,12 +2023,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 	int load_idx = sd->forkexec_idx;
 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
 
-	/*
-	 * now that we have both rqs locked the rq weight won't change
-	 * anymore - so update the stats.
-	 */
-	update_aggregate(this_cpu, sd);
-
 	do {
 		unsigned long load, avg_load;
 		int local_group;
@@ -3521,12 +3366,9 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	unsigned long imbalance;
 	struct rq *busiest;
 	unsigned long flags;
-	int unlock_aggregate;
 
 	cpus_setall(*cpus);
 
-	unlock_aggregate = get_aggregate(this_cpu, sd);
-
 	/*
 	 * When power savings policy is enabled for the parent domain, idle
 	 * sibling can pick up load irrespective of busy siblings. In this case,
@@ -3540,6 +3382,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	schedstat_inc(sd, lb_count[idle]);
 
 redo:
+	update_shares(sd);
 	group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
 				   cpus, balance);
 
@@ -3663,8 +3506,8 @@ out_one_pinned:
 	else
 		ld_moved = 0;
 out:
-	if (unlock_aggregate)
-		put_aggregate(this_cpu, sd);
+	if (ld_moved)
+		update_shares(sd);
 	return ld_moved;
 }
 
@@ -8019,7 +7862,6 @@ void __init sched_init(void)
 	}
 
 #ifdef CONFIG_SMP
-	init_aggregate();
 	init_defrootdomain();
 #endif
 