author     Peter Zijlstra <a.p.zijlstra@chello.nl>   2008-06-27 07:41:14 -0400
committer  Ingo Molnar <mingo@elte.hu>               2008-06-27 08:31:29 -0400
commit     c09595f63bb1909c5dc4dca288f4fe818561b5f3 (patch)
tree       42631e6986f3ea4543b125ca62a99df8548e0eb9 /kernel/sched.c
parent     ced8aa16e1db55c33c507174c1b1f9e107445865 (diff)
sched: revert revert of: fair-group: SMP-nice for group scheduling
Try again..
Initial commit: 18d95a2832c1392a2d63227a7a6d433cb9f2037e
Revert: 6363ca57c76b7b83639ca8c83fc285fa26a7880e
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c | 430
1 file changed, 399 insertions, 31 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index f653af684fb3..874b6da15430 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -403,6 +403,43 @@ struct cfs_rq {
          */
         struct list_head leaf_cfs_rq_list;
         struct task_group *tg;  /* group that "owns" this runqueue */
+
+#ifdef CONFIG_SMP
+        unsigned long task_weight;
+        unsigned long shares;
+        /*
+         * We need space to build a sched_domain wide view of the full task
+         * group tree, in order to avoid depending on dynamic memory allocation
+         * during the load balancing we place this in the per cpu task group
+         * hierarchy. This limits the load balancing to one instance per cpu,
+         * but more should not be needed anyway.
+         */
+        struct aggregate_struct {
+                /*
+                 *   load = weight(cpus) * f(tg)
+                 *
+                 * Where f(tg) is the recursive weight fraction assigned to
+                 * this group.
+                 */
+                unsigned long load;
+
+                /*
+                 * part of the group weight distributed to this span.
+                 */
+                unsigned long shares;
+
+                /*
+                 * The sum of all runqueue weights within this span.
+                 */
+                unsigned long rq_weight;
+
+                /*
+                 * Weight contributed by tasks; this is the part we can
+                 * influence by moving tasks around.
+                 */
+                unsigned long task_weight;
+        } aggregate;
+#endif
 #endif
 };
 
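
To see the load formula above with concrete numbers (invented for illustration, not taken from the patch): aggregate_group_load(), added further down, computes load(tg) = load(parent) * shares(tg) / rq_weight(parent), with load(root) being the summed runqueue weight of the cpus in the span. Ignoring the +1 the code adds to avoid a division by zero, a root runqueue weight of 4096 and a top-level group holding 2048 shares give load(tg) = 4096 * 2048 / 4096 = 2048, and a child owning 1024 of that group's 2048 shares gets 2048 * 1024 / 2048 = 1024. The product of these shares/rq_weight fractions down the hierarchy is the f(tg) referred to in the comment.
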
@@ -1484,6 +1521,326 @@ static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
 static unsigned long cpu_avg_load_per_task(int cpu);
 static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/*
+ * Group load balancing.
+ *
+ * We calculate a few balance domain wide aggregate numbers; load and weight.
+ * Given the pictures below, and assuming each item has equal weight:
+ *
+ *          root          1 - thread
+ *         / | \          A - group
+ *        A  1  B
+ *       /|\   / \
+ *      C 2 D 3   4
+ *      |   |
+ *      5   6
+ *
+ * load:
+ *    A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
+ *    which equals 1/9-th of the total load.
+ *
+ * shares:
+ *    The weight of this group on the selected cpus.
+ *
+ * rq_weight:
+ *    Direct sum of all the cpu's their rq weight, e.g. A would get 3 while
+ *    B would get 2.
+ *
+ * task_weight:
+ *    Part of the rq_weight contributed by tasks; all groups except B would
+ *    get 1, B gets 2.
+ */
+
+static inline struct aggregate_struct *
+aggregate(struct task_group *tg, struct sched_domain *sd)
+{
+        return &tg->cfs_rq[sd->first_cpu]->aggregate;
+}
+
+typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
+
+/*
+ * Iterate the full tree, calling @down when first entering a node and @up when
+ * leaving it for the final time.
+ */
+static
+void aggregate_walk_tree(aggregate_func down, aggregate_func up,
+                         struct sched_domain *sd)
+{
+        struct task_group *parent, *child;
+
+        rcu_read_lock();
+        parent = &root_task_group;
+down:
+        (*down)(parent, sd);
+        list_for_each_entry_rcu(child, &parent->children, siblings) {
+                parent = child;
+                goto down;
+
+up:
+                continue;
+        }
+        (*up)(parent, sd);
+
+        child = parent;
+        parent = parent->parent;
+        if (parent)
+                goto up;
+        rcu_read_unlock();
+}
+
+/*
+ * Calculate the aggregate runqueue weight.
+ */
+static
+void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
+{
+        unsigned long rq_weight = 0;
+        unsigned long task_weight = 0;
+        int i;
+
+        for_each_cpu_mask(i, sd->span) {
+                rq_weight += tg->cfs_rq[i]->load.weight;
+                task_weight += tg->cfs_rq[i]->task_weight;
+        }
+
+        aggregate(tg, sd)->rq_weight = rq_weight;
+        aggregate(tg, sd)->task_weight = task_weight;
+}
+
+/*
+ * Compute the weight of this group on the given cpus.
+ */
+static
+void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
+{
+        unsigned long shares = 0;
+        int i;
+
+        for_each_cpu_mask(i, sd->span)
+                shares += tg->cfs_rq[i]->shares;
+
+        if ((!shares && aggregate(tg, sd)->rq_weight) || shares > tg->shares)
+                shares = tg->shares;
+
+        aggregate(tg, sd)->shares = shares;
+}
+
+/*
+ * Compute the load fraction assigned to this group, relies on the aggregate
+ * weight and this group's parent's load, i.e. top-down.
+ */
+static
+void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
+{
+        unsigned long load;
+
+        if (!tg->parent) {
+                int i;
+
+                load = 0;
+                for_each_cpu_mask(i, sd->span)
+                        load += cpu_rq(i)->load.weight;
+
+        } else {
+                load = aggregate(tg->parent, sd)->load;
+
+                /*
+                 * shares is our weight in the parent's rq so
+                 * shares/parent->rq_weight gives our fraction of the load
+                 */
+                load *= aggregate(tg, sd)->shares;
+                load /= aggregate(tg->parent, sd)->rq_weight + 1;
+        }
+
+        aggregate(tg, sd)->load = load;
+}
+
+static void __set_se_shares(struct sched_entity *se, unsigned long shares);
+
+/*
+ * Calculate and set the cpu's group shares.
+ */
+static void
+__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
+                          int tcpu)
+{
+        int boost = 0;
+        unsigned long shares;
+        unsigned long rq_weight;
+
+        if (!tg->se[tcpu])
+                return;
+
+        rq_weight = tg->cfs_rq[tcpu]->load.weight;
+
+        /*
+         * If there are currently no tasks on the cpu pretend there is one of
+         * average load so that when a new task gets to run here it will not
+         * get delayed by group starvation.
+         */
+        if (!rq_weight) {
+                boost = 1;
+                rq_weight = NICE_0_LOAD;
+        }
+
+        /*
+         *           \Sum shares * rq_weight
+         * shares =  -----------------------
+         *               \Sum rq_weight
+         *
+         */
+        shares = aggregate(tg, sd)->shares * rq_weight;
+        shares /= aggregate(tg, sd)->rq_weight + 1;
+
+        /*
+         * record the actual number of shares, not the boosted amount.
+         */
+        tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
+
+        if (shares < MIN_SHARES)
+                shares = MIN_SHARES;
+        else if (shares > MAX_SHARES)
+                shares = MAX_SHARES;
+
+        __set_se_shares(tg->se[tcpu], shares);
+}
+
+/*
+ * Re-adjust the weights on the cpu the task came from and on the cpu the
+ * task went to.
+ */
+static void
+__move_group_shares(struct task_group *tg, struct sched_domain *sd,
+                    int scpu, int dcpu)
+{
+        unsigned long shares;
+
+        shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
+
+        __update_group_shares_cpu(tg, sd, scpu);
+        __update_group_shares_cpu(tg, sd, dcpu);
+
+        /*
+         * ensure we never loose shares due to rounding errors in the
+         * above redistribution.
+         */
+        shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
+        if (shares)
+                tg->cfs_rq[dcpu]->shares += shares;
+}
+
+/*
+ * Because changing a group's shares changes the weight of the super-group
+ * we need to walk up the tree and change all shares until we hit the root.
+ */
+static void
+move_group_shares(struct task_group *tg, struct sched_domain *sd,
+                  int scpu, int dcpu)
+{
+        while (tg) {
+                __move_group_shares(tg, sd, scpu, dcpu);
+                tg = tg->parent;
+        }
+}
+
+static
+void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
+{
+        unsigned long shares = aggregate(tg, sd)->shares;
+        int i;
+
+        for_each_cpu_mask(i, sd->span) {
+                struct rq *rq = cpu_rq(i);
+                unsigned long flags;
+
+                spin_lock_irqsave(&rq->lock, flags);
+                __update_group_shares_cpu(tg, sd, i);
+                spin_unlock_irqrestore(&rq->lock, flags);
+        }
+
+        aggregate_group_shares(tg, sd);
+
+        /*
+         * ensure we never loose shares due to rounding errors in the
+         * above redistribution.
+         */
+        shares -= aggregate(tg, sd)->shares;
+        if (shares) {
+                tg->cfs_rq[sd->first_cpu]->shares += shares;
+                aggregate(tg, sd)->shares += shares;
+        }
+}
+
+/*
+ * Calculate the accumulative weight and recursive load of each task group
+ * while walking down the tree.
+ */
+static
+void aggregate_get_down(struct task_group *tg, struct sched_domain *sd)
+{
+        aggregate_group_weight(tg, sd);
+        aggregate_group_shares(tg, sd);
+        aggregate_group_load(tg, sd);
+}
+
+/*
+ * Rebalance the cpu shares while walking back up the tree.
+ */
+static
+void aggregate_get_up(struct task_group *tg, struct sched_domain *sd)
+{
+        aggregate_group_set_shares(tg, sd);
+}
+
+static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
+
+static void __init init_aggregate(void)
+{
+        int i;
+
+        for_each_possible_cpu(i)
+                spin_lock_init(&per_cpu(aggregate_lock, i));
+}
+
+static int get_aggregate(struct sched_domain *sd)
+{
+        if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu)))
+                return 0;
+
+        aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd);
+        return 1;
+}
+
+static void put_aggregate(struct sched_domain *sd)
+{
+        spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu));
+}
+
+static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+{
+        cfs_rq->shares = shares;
+}
+
+#else
+
+static inline void init_aggregate(void)
+{
+}
+
+static inline int get_aggregate(struct sched_domain *sd)
+{
+        return 0;
+}
+
+static inline void put_aggregate(struct sched_domain *sd)
+{
+}
+#endif
+
 #endif
 
 #include "sched_stats.h"
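
The example tree in the comment above can be checked with a small stand-alone model. This is only an illustrative user-space sketch: the struct, the plain recursion and the uniform weight of 1 per item are invented for the example and stand in for the kernel's per-cpu cfs_rq bookkeeping and the goto-based walk in aggregate_walk_tree().

    #include <stdio.h>

    /* Toy model of the hierarchy in the comment: every task and every
     * group entity is assumed to carry a weight of 1. */
    struct node {
            const char *name;
            int is_task;
            struct node *child[4];
            int nr_child;
    };

    /* rq_weight: weight of everything queued directly on this group. */
    static int rq_weight(const struct node *g)
    {
            return g->nr_child;     /* each item has equal weight 1 */
    }

    /* task_weight: only the tasks queued directly on this group. */
    static int task_weight(const struct node *g)
    {
            int i, w = 0;

            for (i = 0; i < g->nr_child; i++)
                    w += g->child[i]->is_task;
            return w;
    }

    /* frac is the recursive load fraction f(tg): the parent's fraction
     * times this entity's share of the parent's runqueue weight. */
    static void walk(const struct node *g, double frac)
    {
            int i;

            if (!g->is_task)
                    printf("%-4s load=%5.3f rq_weight=%d task_weight=%d\n",
                           g->name, frac, rq_weight(g), task_weight(g));
            for (i = 0; i < g->nr_child; i++)
                    walk(g->child[i], frac / rq_weight(g));
    }

    int main(void)
    {
            struct node t1 = { "1", 1 }, t2 = { "2", 1 }, t3 = { "3", 1 };
            struct node t4 = { "4", 1 }, t5 = { "5", 1 }, t6 = { "6", 1 };
            struct node C = { "C", 0, { &t5 }, 1 };
            struct node D = { "D", 0, { &t6 }, 1 };
            struct node A = { "A", 0, { &C, &t2, &D }, 3 };
            struct node B = { "B", 0, { &t3, &t4 }, 2 };
            struct node root = { "root", 0, { &A, &t1, &B }, 3 };

            walk(&root, 1.0);       /* prints the 1/3 and 1/9 fractions */
            return 0;
    }

Compiled and run, it prints a load fraction of 0.333 for A and B, 0.111 for C and D, rq_weight 3 for A and 2 for B, and task_weight 1 for every group except B, which gets 2 -- the same numbers the comment quotes.
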
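The per-cpu share distribution performed by __update_group_shares_cpu() can be illustrated the same way. A hedged sketch in plain C: the three-cpu weights are made up, NICE_0_LOAD is the kernel's nice-0 load weight, and the MIN_SHARES/MAX_SHARES clamp values are assumptions for the example rather than values quoted from this patch.

    #include <stdio.h>

    #define NICE_0_LOAD     1024            /* nice-0 load weight */
    #define MIN_SHARES      2               /* assumed clamp values */
    #define MAX_SHARES      (1UL << 18)

    int main(void)
    {
            /* made-up scenario: one group of weight 1024 over three cpus */
            unsigned long tg_shares = 1024;
            unsigned long rq_weight[3] = { 3072, 1024, 0 };
            unsigned long sum = 0, w, shares;
            int i, boost;

            /* aggregate rq_weight: plain sum of the per-cpu rq weights */
            for (i = 0; i < 3; i++)
                    sum += rq_weight[i];

            for (i = 0; i < 3; i++) {
                    /* an idle cpu is treated as if it ran one nice-0 task */
                    boost = !rq_weight[i];
                    w = boost ? NICE_0_LOAD : rq_weight[i];

                    /* shares_i = tg_shares * rq_weight_i / \Sum rq_weight */
                    shares = tg_shares * w / (sum + 1);
                    if (shares < MIN_SHARES)
                            shares = MIN_SHARES;
                    else if (shares > MAX_SHARES)
                            shares = MAX_SHARES;

                    printf("cpu%d: se weight %lu, recorded group shares %lu\n",
                           i, shares, boost ? 0UL : shares);
            }
            return 0;
    }

With these numbers the busy cpu receives a sched-entity weight of 767, the lightly loaded cpu 255, and the idle cpu is boosted to 255 while its recorded share stays 0, so the next aggregation pass does not see inflated group weight; the few shares lost to integer rounding are what aggregate_group_set_shares() hands back to sd->first_cpu.
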
@@ -1498,26 +1855,14 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 #define for_each_class(class) \
         for (class = sched_class_highest; class; class = class->next)
 
-static inline void inc_load(struct rq *rq, const struct task_struct *p)
-{
-        update_load_add(&rq->load, p->se.load.weight);
-}
-
-static inline void dec_load(struct rq *rq, const struct task_struct *p)
-{
-        update_load_sub(&rq->load, p->se.load.weight);
-}
-
-static void inc_nr_running(struct task_struct *p, struct rq *rq)
+static void inc_nr_running(struct rq *rq)
 {
         rq->nr_running++;
-        inc_load(rq, p);
 }
 
-static void dec_nr_running(struct task_struct *p, struct rq *rq)
+static void dec_nr_running(struct rq *rq)
 {
         rq->nr_running--;
-        dec_load(rq, p);
 }
 
 static void set_load_weight(struct task_struct *p)
@@ -1609,7 +1954,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
                 rq->nr_uninterruptible--;
 
         enqueue_task(rq, p, wakeup);
-        inc_nr_running(p, rq);
+        inc_nr_running(rq);
 }
 
 /*
@@ -1621,7 +1966,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
                 rq->nr_uninterruptible++;
 
         dequeue_task(rq, p, sleep);
-        dec_nr_running(p, rq);
+        dec_nr_running(rq);
 }
 
 /**
@@ -2274,7 +2619,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
                  * management (if any):
                  */
                 p->sched_class->task_new(rq, p);
-                inc_nr_running(p, rq);
+                inc_nr_running(rq);
         }
         check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
@@ -3265,9 +3610,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
         unsigned long imbalance;
         struct rq *busiest;
         unsigned long flags;
+        int unlock_aggregate;
 
         cpus_setall(*cpus);
 
+        unlock_aggregate = get_aggregate(sd);
+
         /*
          * When power savings policy is enabled for the parent domain, idle
          * sibling can pick up load irrespective of busy siblings. In this case,
@@ -3383,8 +3731,9 @@ redo:
 
         if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
             !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-                return -1;
-        return ld_moved;
+                ld_moved = -1;
+
+        goto out;
 
 out_balanced:
         schedstat_inc(sd, lb_balanced[idle]);
@@ -3399,8 +3748,13 @@ out_one_pinned:
 
         if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
             !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
-                return -1;
-        return 0;
+                ld_moved = -1;
+        else
+                ld_moved = 0;
+out:
+        if (unlock_aggregate)
+                put_aggregate(sd);
+        return ld_moved;
 }
 
 /*
@@ -4588,10 +4942,8 @@ void set_user_nice(struct task_struct *p, long nice)
                 goto out_unlock;
         }
         on_rq = p->se.on_rq;
-        if (on_rq) {
+        if (on_rq)
                 dequeue_task(rq, p, 0);
-                dec_load(rq, p);
-        }
 
         p->static_prio = NICE_TO_PRIO(nice);
         set_load_weight(p);
@@ -4601,7 +4953,6 @@ void set_user_nice(struct task_struct *p, long nice)
 
         if (on_rq) {
                 enqueue_task(rq, p, 0);
-                inc_load(rq, p);
                 /*
                  * If the task increased its priority or is running and
                  * lowered its priority, then reschedule its CPU:
@@ -7016,6 +7367,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
                 SD_INIT(sd, ALLNODES);
                 set_domain_attribute(sd, attr);
                 sd->span = *cpu_map;
+                sd->first_cpu = first_cpu(sd->span);
                 cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
                 p = sd;
                 sd_allnodes = 1;
@@ -7026,6 +7378,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
                 SD_INIT(sd, NODE);
                 set_domain_attribute(sd, attr);
                 sched_domain_node_span(cpu_to_node(i), &sd->span);
+                sd->first_cpu = first_cpu(sd->span);
                 sd->parent = p;
                 if (p)
                         p->child = sd;
@@ -7037,6 +7390,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
                 SD_INIT(sd, CPU);
                 set_domain_attribute(sd, attr);
                 sd->span = *nodemask;
+                sd->first_cpu = first_cpu(sd->span);
                 sd->parent = p;
                 if (p)
                         p->child = sd;
@@ -7048,6 +7402,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
                 SD_INIT(sd, MC);
                 set_domain_attribute(sd, attr);
                 sd->span = cpu_coregroup_map(i);
+                sd->first_cpu = first_cpu(sd->span);
                 cpus_and(sd->span, sd->span, *cpu_map);
                 sd->parent = p;
                 p->child = sd;
@@ -7060,6 +7415,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
                 SD_INIT(sd, SIBLING);
                 set_domain_attribute(sd, attr);
                 sd->span = per_cpu(cpu_sibling_map, i);
+                sd->first_cpu = first_cpu(sd->span);
                 cpus_and(sd->span, sd->span, *cpu_map);
                 sd->parent = p;
                 p->child = sd;
@@ -7757,6 +8113,7 @@ void __init sched_init(void)
         }
 
 #ifdef CONFIG_SMP
+        init_aggregate();
         init_defrootdomain();
 #endif
 
@@ -8322,14 +8679,11 @@ void sched_move_task(struct task_struct *tsk)
 #endif /* CONFIG_GROUP_SCHED */
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-static void set_se_shares(struct sched_entity *se, unsigned long shares)
+static void __set_se_shares(struct sched_entity *se, unsigned long shares)
 {
         struct cfs_rq *cfs_rq = se->cfs_rq;
-        struct rq *rq = cfs_rq->rq;
         int on_rq;
 
-        spin_lock_irq(&rq->lock);
-
         on_rq = se->on_rq;
         if (on_rq)
                 dequeue_entity(cfs_rq, se, 0);
@@ -8339,8 +8693,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
 
         if (on_rq)
                 enqueue_entity(cfs_rq, se, 0);
+}
 
-        spin_unlock_irq(&rq->lock);
+static void set_se_shares(struct sched_entity *se, unsigned long shares)
+{
+        struct cfs_rq *cfs_rq = se->cfs_rq;
+        struct rq *rq = cfs_rq->rq;
+        unsigned long flags;
+
+        spin_lock_irqsave(&rq->lock, flags);
+        __set_se_shares(se, shares);
+        spin_unlock_irqrestore(&rq->lock, flags);
 }
 
 static DEFINE_MUTEX(shares_mutex);
@@ -8379,8 +8742,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
          * w/o tripping rebalance_share or load_balance_fair.
          */
         tg->shares = shares;
-        for_each_possible_cpu(i)
+        for_each_possible_cpu(i) {
+                /*
+                 * force a rebalance
+                 */
+                cfs_rq_set_shares(tg->cfs_rq[i], 0);
                 set_se_shares(tg->se[i], shares);
+        }
 
         /*
          * Enable load balance activity on this group, by inserting it back on