1 files changed, 35 insertions, 248 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index f06950c8a6ce..dcd553cc4ee8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -174,41 +174,6 @@ struct task_group {
        struct sched_entity **se;
        /* runqueue "owned" by this group on each cpu */
        struct cfs_rq **cfs_rq;
-        /*
-         * shares assigned to a task group governs how much of cpu bandwidth
-         * is allocated to the group. The more shares a group has, the more is
-         * the cpu bandwidth allocated to it.
-         *
-         * For ex, lets say that there are three task groups, A, B and C which
-         * have been assigned shares 1000, 2000 and 3000 respectively. Then,
-         * cpu bandwidth allocated by the scheduler to task groups A, B and C
-         * should be:
-         *
-         *      Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66%
-         *      Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33%
-         *      Bw(C) = 3000/(1000+2000+3000) * 100 = 50%
-         *
-         * The weight assigned to a task group's schedulable entities on every
-         * cpu (task_group.se[a_cpu]->load.weight) is derived from the task
-         * group's shares. For ex: lets say that task group A has been
-         * assigned shares of 1000 and there are two CPUs in a system. Then,
-         *
-         *  tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000;
-         *
-         * Note: It's not necessary that each of a task's group schedulable
-         *       entity have the same weight on all CPUs. If the group
-         *       has 2 of its tasks on CPU0 and 1 task on CPU1, then a
-         *       better distribution of weight could be:
-         *
-         *      tg_A->se[0]->load.weight = 2/3 * 2000 = 1333
-         *      tg_A->se[1]->load.weight = 1/2 * 2000 =  667
-         *
-         * rebalance_shares() is responsible for distributing the shares of a
-         * task groups like this among the group's schedulable entities across
-         * cpus.
-         *
-         */
        unsigned long shares;
 #endif
@@ -250,22 +215,12 @@ static DEFINE_SPINLOCK(task_group_lock);
 static DEFINE_MUTEX(doms_cur_mutex);
 #ifdef CONFIG_FAIR_GROUP_SCHED
-#ifdef CONFIG_SMP
-/* kernel thread that runs rebalance_shares() periodically */
-static struct task_struct *lb_monitor_task;
-static int load_balance_monitor(void *unused);
-#endif
-static void set_se_shares(struct sched_entity *se, unsigned long shares);
 #ifdef CONFIG_USER_SCHED
 # define INIT_TASK_GROUP_LOAD   (2*NICE_0_LOAD)
 #else
 # define INIT_TASK_GROUP_LOAD   NICE_0_LOAD
 #endif
-#define MIN_GROUP_SHARES        2
 static int init_task_group_load = INIT_TASK_GROUP_LOAD;
 #endif
@@ -1245,16 +1200,6 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
 #endif
-static inline void inc_cpu_load(struct rq *rq, unsigned long load)
-{
-        update_load_add(&rq->load, load);
-}
-static inline void dec_cpu_load(struct rq *rq, unsigned long load)
-{
-        update_load_sub(&rq->load, load);
-}
 #ifdef CONFIG_SMP
 static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
@@ -1272,14 +1217,26 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
 #define sched_class_highest (&rt_sched_class)
-static void inc_nr_running(struct rq *rq)
+static inline void inc_load(struct rq *rq, const struct task_struct *p)
+{
+        update_load_add(&rq->load, p->se.load.weight);
+}
+static inline void dec_load(struct rq *rq, const struct task_struct *p)
+{
+        update_load_sub(&rq->load, p->se.load.weight);
+}
+static void inc_nr_running(struct task_struct *p, struct rq *rq)
 {
        rq->nr_running++;
+        inc_load(rq, p);
 }
-static void dec_nr_running(struct rq *rq)
+static void dec_nr_running(struct task_struct *p, struct rq *rq)
 {
        rq->nr_running--;
+        dec_load(rq, p);
 }
 static void set_load_weight(struct task_struct *p)
@@ -1371,7 +1328,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
                rq->nr_uninterruptible--;
        enqueue_task(rq, p, wakeup);
-        inc_nr_running(rq);
+        inc_nr_running(p, rq);
 }
 /*
@@ -1383,7 +1340,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
                rq->nr_uninterruptible++;
        dequeue_task(rq, p, sleep);
-        dec_nr_running(rq);
+        dec_nr_running(p, rq);
 }
 /**
@@ -2023,7 +1980,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
                 * management (if any):
                 */
                p->sched_class->task_new(rq, p);
-                inc_nr_running(rq);
+                inc_nr_running(p, rq);
        }
        check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
@@ -4362,8 +4319,10 @@ void set_user_nice(struct task_struct *p, long nice)
                goto out_unlock;
        }
        on_rq = p->se.on_rq;
-        if (on_rq)
+        if (on_rq) {
                dequeue_task(rq, p, 0);
+                dec_load(rq, p);
+        }
        p->static_prio = NICE_TO_PRIO(nice);
        set_load_weight(p);
@@ -4373,6 +4332,7 @@ void set_user_nice(struct task_struct *p, long nice)
        if (on_rq) {
                enqueue_task(rq, p, 0);
+                inc_load(rq, p);
                /*
                 * If the task increased its priority or is running and
                 * lowered its priority, then reschedule its CPU:
@@ -7087,21 +7047,6 @@ void __init sched_init_smp(void)
        if (set_cpus_allowed(current, non_isolated_cpus) < 0)
                BUG();
        sched_init_granularity();
-#ifdef CONFIG_FAIR_GROUP_SCHED
-        if (nr_cpu_ids == 1)
-                return;
-        lb_monitor_task = kthread_create(load_balance_monitor, NULL,
-                                         "group_balance");
-        if (!IS_ERR(lb_monitor_task)) {
-                lb_monitor_task->flags |= PF_NOFREEZE;
-                wake_up_process(lb_monitor_task);
-        } else {
-                printk(KERN_ERR "Could not create load balance monitor thread"
-                        "(error = %ld) \n", PTR_ERR(lb_monitor_task));
-        }
-#endif
 }
 #else
 void __init sched_init_smp(void)
@@ -7424,157 +7369,6 @@ void set_curr_task(int cpu, struct task_struct *p)
 #ifdef CONFIG_GROUP_SCHED
-#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
-/*
- * distribute shares of all task groups among their schedulable entities,
- * to reflect load distribution across cpus.
- */
-static int rebalance_shares(struct sched_domain *sd, int this_cpu)
-{
-        struct cfs_rq *cfs_rq;
-        struct rq *rq = cpu_rq(this_cpu);
-        cpumask_t sdspan = sd->span;
-        int balanced = 1;
-        /* Walk thr' all the task groups that we have */
-        for_each_leaf_cfs_rq(rq, cfs_rq) {
-                int i;
-                unsigned long total_load = 0, total_shares;
-                struct task_group *tg = cfs_rq->tg;
-                /* Gather total task load of this group across cpus */
-                for_each_cpu_mask(i, sdspan)
-                        total_load += tg->cfs_rq[i]->load.weight;
-                /* Nothing to do if this group has no load */
-                if (!total_load)
-                        continue;
-                /*
-                 * tg->shares represents the number of cpu shares the task group
-                 * is eligible to hold on a single cpu. On N cpus, it is
-                 * eligible to hold (N * tg->shares) number of cpu shares.
-                 */
-                total_shares = tg->shares * cpus_weight(sdspan);
-                /*
-                 * redistribute total_shares across cpus as per the task load
-                 * distribution.
-                 */
-                for_each_cpu_mask(i, sdspan) {
-                        unsigned long local_load, local_shares;
-                        local_load = tg->cfs_rq[i]->load.weight;
-                        local_shares = (local_load * total_shares) / total_load;
-                        if (!local_shares)
-                                local_shares = MIN_GROUP_SHARES;
-                        if (local_shares == tg->se[i]->load.weight)
-                                continue;
-                        spin_lock_irq(&cpu_rq(i)->lock);
-                        set_se_shares(tg->se[i], local_shares);
-                        spin_unlock_irq(&cpu_rq(i)->lock);
-                        balanced = 0;
-                }
-        }
-        return balanced;
-}
-/*
- * How frequently should we rebalance_shares() across cpus?
- *
- * The more frequently we rebalance shares, the more accurate is the fairness
- * of cpu bandwidth distribution between task groups. However higher frequency
- * also implies increased scheduling overhead.
- *
- * sysctl_sched_min_bal_int_shares represents the minimum interval between
- * consecutive calls to rebalance_shares() in the same sched domain.
- *
- * sysctl_sched_max_bal_int_shares represents the maximum interval between
- * consecutive calls to rebalance_shares() in the same sched domain.
- *
- * These settings allows for the appropriate trade-off between accuracy of
- * fairness and the associated overhead.
- *
- */
-/* default: 8ms, units: milliseconds */
-const_debug unsigned int sysctl_sched_min_bal_int_shares = 8;
-/* default: 128ms, units: milliseconds */
-const_debug unsigned int sysctl_sched_max_bal_int_shares = 128;
-/* kernel thread that runs rebalance_shares() periodically */
-static int load_balance_monitor(void *unused)
-{
-        unsigned int timeout = sysctl_sched_min_bal_int_shares;
-        struct sched_param schedparm;
-        int ret;
-        /*
-         * We don't want this thread's execution to be limited by the shares
-         * assigned to default group (init_task_group). Hence make it run
-         * as a SCHED_RR RT task at the lowest priority.
-         */
-        schedparm.sched_priority = 1;
-        ret = sched_setscheduler(current, SCHED_RR, &schedparm);
-        if (ret)
-                printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance"
-                                " monitor thread (error = %d) \n", ret);
-        while (!kthread_should_stop()) {
-                int i, cpu, balanced = 1;
-                /* Prevent cpus going down or coming up */
-                get_online_cpus();
-                /* lockout changes to doms_cur[] array */
-                lock_doms_cur();
-                /*
-                 * Enter a rcu read-side critical section to safely walk rq->sd
-                 * chain on various cpus and to walk task group list
-                 * (rq->leaf_cfs_rq_list) in rebalance_shares().
-                 */
-                rcu_read_lock();
-                for (i = 0; i < ndoms_cur; i++) {
-                        cpumask_t cpumap = doms_cur[i];
-                        struct sched_domain *sd = NULL, *sd_prev = NULL;
-                        cpu = first_cpu(cpumap);
-                        /* Find the highest domain at which to balance shares */
-                        for_each_domain(cpu, sd) {
-                                if (!(sd->flags & SD_LOAD_BALANCE))
-                                        continue;
-                                sd_prev = sd;
-                        }
-                        sd = sd_prev;
-                        /* sd == NULL? No load balance reqd in this domain */
-                        if (!sd)
-                                continue;
-                        balanced &= rebalance_shares(sd, cpu);
-                }
-                rcu_read_unlock();
-                unlock_doms_cur();
-                put_online_cpus();
-                if (!balanced)
-                        timeout = sysctl_sched_min_bal_int_shares;
-                else if (timeout < sysctl_sched_max_bal_int_shares)
-                        timeout *= 2;
-                msleep_interruptible(timeout);
-        }
-        return 0;
-}
-#endif  /* CONFIG_SMP */
 #ifdef CONFIG_FAIR_GROUP_SCHED
 static void free_fair_sched_group(struct task_group *tg)
 {
@@ -7841,29 +7635,25 @@ void sched_move_task(struct task_struct *tsk)
 }
 #ifdef CONFIG_FAIR_GROUP_SCHED
-/* rq->lock to be locked by caller */
 static void set_se_shares(struct sched_entity *se, unsigned long shares)
 {
        struct cfs_rq *cfs_rq = se->cfs_rq;
        struct rq *rq = cfs_rq->rq;
        int on_rq;
-        if (!shares)
+        spin_lock_irq(&rq->lock);
-                shares = MIN_GROUP_SHARES;
        on_rq = se->on_rq;
-        if (on_rq) {
+        if (on_rq)
                dequeue_entity(cfs_rq, se, 0);
-                dec_cpu_load(rq, se->load.weight);
-        }
        se->load.weight = shares;
        se->load.inv_weight = div64_64((1ULL<<32), shares);
-        if (on_rq) {
+        if (on_rq)
                enqueue_entity(cfs_rq, se, 0);
-                inc_cpu_load(rq, se->load.weight);
-        }
+        spin_unlock_irq(&rq->lock);
 }
 static DEFINE_MUTEX(shares_mutex);
@@ -7873,18 +7663,18 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
        int i;
        unsigned long flags;
+        /*
+         * A weight of 0 or 1 can cause arithmetics problems.
+         * (The default weight is 1024 - so there's no practical
+         *  limitation from this.)
+         */
+        if (shares < 2)
+                shares = 2;
        mutex_lock(&shares_mutex);
        if (tg->shares == shares)
                goto done;
-        if (shares < MIN_GROUP_SHARES)
-                shares = MIN_GROUP_SHARES;
-        /*
-         * Prevent any load balance activity (rebalance_shares,
-         * load_balance_fair) from referring to this group first,
-         * by taking it off the rq->leaf_cfs_rq_list on each cpu.
-         */
        spin_lock_irqsave(&task_group_lock, flags);
        for_each_possible_cpu(i)
                unregister_fair_sched_group(tg, i);
@@ -7898,11 +7688,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
         * w/o tripping rebalance_share or load_balance_fair.
         */
        tg->shares = shares;
-        for_each_possible_cpu(i) {
+        for_each_possible_cpu(i)
-                spin_lock_irq(&cpu_rq(i)->lock);
                set_se_shares(tg->se[i], shares);
-                spin_unlock_irq(&cpu_rq(i)->lock);
-        }
        /*
         * Enable load balance activity on this group, by inserting it back on

diff --git a/kernel/sched.c b/kernel/sched.c
index f06950c8a6ce..dcd553cc4ee8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -174,41 +174,6 @@ struct task_group {
174	struct sched_entity **se;	174	struct sched_entity **se;
175	/* runqueue "owned" by this group on each cpu */	175	/* runqueue "owned" by this group on each cpu */
176	struct cfs_rq **cfs_rq;	176	struct cfs_rq **cfs_rq;
177
178	/*
179	* shares assigned to a task group governs how much of cpu bandwidth
180	* is allocated to the group. The more shares a group has, the more is
181	* the cpu bandwidth allocated to it.
182	*
183	* For ex, lets say that there are three task groups, A, B and C which
184	* have been assigned shares 1000, 2000 and 3000 respectively. Then,
185	* cpu bandwidth allocated by the scheduler to task groups A, B and C
186	* should be:
187	*
188	* Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66%
189	* Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33%
190	* Bw(C) = 3000/(1000+2000+3000) * 100 = 50%
191	*
192	* The weight assigned to a task group's schedulable entities on every
193	* cpu (task_group.se[a_cpu]->load.weight) is derived from the task
194	* group's shares. For ex: lets say that task group A has been
195	* assigned shares of 1000 and there are two CPUs in a system. Then,
196	*
197	* tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000;
198	*
199	* Note: It's not necessary that each of a task's group schedulable
200	* entity have the same weight on all CPUs. If the group
201	* has 2 of its tasks on CPU0 and 1 task on CPU1, then a
202	* better distribution of weight could be:
203	*
204	* tg_A->se[0]->load.weight = 2/3 * 2000 = 1333
205	* tg_A->se[1]->load.weight = 1/2 * 2000 = 667
206	*
207	* rebalance_shares() is responsible for distributing the shares of a
208	* task groups like this among the group's schedulable entities across
209	* cpus.
210	*
211	*/
212	unsigned long shares;	177	unsigned long shares;
213	#endif	178	#endif
214		179
@@ -250,22 +215,12 @@ static DEFINE_SPINLOCK(task_group_lock);
250	static DEFINE_MUTEX(doms_cur_mutex);	215	static DEFINE_MUTEX(doms_cur_mutex);
251		216
252	#ifdef CONFIG_FAIR_GROUP_SCHED	217	#ifdef CONFIG_FAIR_GROUP_SCHED
253	#ifdef CONFIG_SMP
254	/* kernel thread that runs rebalance_shares() periodically */
255	static struct task_struct *lb_monitor_task;
256	static int load_balance_monitor(void *unused);
257	#endif
258
259	static void set_se_shares(struct sched_entity *se, unsigned long shares);
260
261	#ifdef CONFIG_USER_SCHED	218	#ifdef CONFIG_USER_SCHED
262	# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)	219	# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
263	#else	220	#else
264	# define INIT_TASK_GROUP_LOAD NICE_0_LOAD	221	# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
265	#endif	222	#endif
266		223
267	#define MIN_GROUP_SHARES 2
268
269	static int init_task_group_load = INIT_TASK_GROUP_LOAD;	224	static int init_task_group_load = INIT_TASK_GROUP_LOAD;
270	#endif	225	#endif
271		226
@@ -1245,16 +1200,6 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1245	static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}	1200	static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1246	#endif	1201	#endif
1247		1202
1248	static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1249	{
1250	update_load_add(&rq->load, load);
1251	}
1252
1253	static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1254	{
1255	update_load_sub(&rq->load, load);
1256	}
1257
1258	#ifdef CONFIG_SMP	1203	#ifdef CONFIG_SMP
1259	static unsigned long source_load(int cpu, int type);	1204	static unsigned long source_load(int cpu, int type);
1260	static unsigned long target_load(int cpu, int type);	1205	static unsigned long target_load(int cpu, int type);
@@ -1272,14 +1217,26 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1272		1217
1273	#define sched_class_highest (&rt_sched_class)	1218	#define sched_class_highest (&rt_sched_class)
1274		1219
1275	static void inc_nr_running(struct rq *rq)	1220	static inline void inc_load(struct rq *rq, const struct task_struct *p)
		1221	{
		1222	update_load_add(&rq->load, p->se.load.weight);
		1223	}
		1224
		1225	static inline void dec_load(struct rq *rq, const struct task_struct *p)
		1226	{
		1227	update_load_sub(&rq->load, p->se.load.weight);
		1228	}
		1229
		1230	static void inc_nr_running(struct task_struct *p, struct rq *rq)
1276	{	1231	{
1277	rq->nr_running++;	1232	rq->nr_running++;
		1233	inc_load(rq, p);
1278	}	1234	}
1279		1235
1280	static void dec_nr_running(struct rq *rq)	1236	static void dec_nr_running(struct task_struct *p, struct rq *rq)
1281	{	1237	{
1282	rq->nr_running--;	1238	rq->nr_running--;
		1239	dec_load(rq, p);
1283	}	1240	}
1284		1241
1285	static void set_load_weight(struct task_struct *p)	1242	static void set_load_weight(struct task_struct *p)
@@ -1371,7 +1328,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1371	rq->nr_uninterruptible--;	1328	rq->nr_uninterruptible--;
1372		1329
1373	enqueue_task(rq, p, wakeup);	1330	enqueue_task(rq, p, wakeup);
1374	inc_nr_running(rq);	1331	inc_nr_running(p, rq);
1375	}	1332	}
1376		1333
1377	/*	1334	/*
@@ -1383,7 +1340,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1383	rq->nr_uninterruptible++;	1340	rq->nr_uninterruptible++;
1384		1341
1385	dequeue_task(rq, p, sleep);	1342	dequeue_task(rq, p, sleep);
1386	dec_nr_running(rq);	1343	dec_nr_running(p, rq);
1387	}	1344	}
1388		1345
1389	/**	1346	/**
@@ -2023,7 +1980,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2023	* management (if any):	1980	* management (if any):
2024	*/	1981	*/
2025	p->sched_class->task_new(rq, p);	1982	p->sched_class->task_new(rq, p);
2026	inc_nr_running(rq);	1983	inc_nr_running(p, rq);
2027	}	1984	}
2028	check_preempt_curr(rq, p);	1985	check_preempt_curr(rq, p);
2029	#ifdef CONFIG_SMP	1986	#ifdef CONFIG_SMP
@@ -4362,8 +4319,10 @@ void set_user_nice(struct task_struct *p, long nice)
4362	goto out_unlock;	4319	goto out_unlock;
4363	}	4320	}
4364	on_rq = p->se.on_rq;	4321	on_rq = p->se.on_rq;
4365	if (on_rq)	4322	if (on_rq) {
4366	dequeue_task(rq, p, 0);	4323	dequeue_task(rq, p, 0);
		4324	dec_load(rq, p);
		4325	}
4367		4326
4368	p->static_prio = NICE_TO_PRIO(nice);	4327	p->static_prio = NICE_TO_PRIO(nice);
4369	set_load_weight(p);	4328	set_load_weight(p);
@@ -4373,6 +4332,7 @@ void set_user_nice(struct task_struct *p, long nice)
4373		4332
4374	if (on_rq) {	4333	if (on_rq) {
4375	enqueue_task(rq, p, 0);	4334	enqueue_task(rq, p, 0);
		4335	inc_load(rq, p);
4376	/*	4336	/*
4377	* If the task increased its priority or is running and	4337	* If the task increased its priority or is running and
4378	* lowered its priority, then reschedule its CPU:	4338	* lowered its priority, then reschedule its CPU:
@@ -7087,21 +7047,6 @@ void __init sched_init_smp(void)
7087	if (set_cpus_allowed(current, non_isolated_cpus) < 0)	7047	if (set_cpus_allowed(current, non_isolated_cpus) < 0)
7088	BUG();	7048	BUG();
7089	sched_init_granularity();	7049	sched_init_granularity();
7090
7091	#ifdef CONFIG_FAIR_GROUP_SCHED
7092	if (nr_cpu_ids == 1)
7093	return;
7094
7095	lb_monitor_task = kthread_create(load_balance_monitor, NULL,
7096	"group_balance");
7097	if (!IS_ERR(lb_monitor_task)) {
7098	lb_monitor_task->flags |= PF_NOFREEZE;
7099	wake_up_process(lb_monitor_task);
7100	} else {
7101	printk(KERN_ERR "Could not create load balance monitor thread"
7102	"(error = %ld) \n", PTR_ERR(lb_monitor_task));
7103	}
7104	#endif
7105	}	7050	}
7106	#else	7051	#else
7107	void __init sched_init_smp(void)	7052	void __init sched_init_smp(void)
@@ -7424,157 +7369,6 @@ void set_curr_task(int cpu, struct task_struct *p)
7424		7369
7425	#ifdef CONFIG_GROUP_SCHED	7370	#ifdef CONFIG_GROUP_SCHED
7426		7371
7427	#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
7428	/*
7429	* distribute shares of all task groups among their schedulable entities,
7430	* to reflect load distribution across cpus.
7431	*/
7432	static int rebalance_shares(struct sched_domain *sd, int this_cpu)
7433	{
7434	struct cfs_rq *cfs_rq;
7435	struct rq *rq = cpu_rq(this_cpu);
7436	cpumask_t sdspan = sd->span;
7437	int balanced = 1;
7438
7439	/* Walk thr' all the task groups that we have */
7440	for_each_leaf_cfs_rq(rq, cfs_rq) {
7441	int i;
7442	unsigned long total_load = 0, total_shares;
7443	struct task_group *tg = cfs_rq->tg;
7444
7445	/* Gather total task load of this group across cpus */
7446	for_each_cpu_mask(i, sdspan)
7447	total_load += tg->cfs_rq[i]->load.weight;
7448
7449	/* Nothing to do if this group has no load */
7450	if (!total_load)
7451	continue;
7452
7453	/*
7454	* tg->shares represents the number of cpu shares the task group
7455	* is eligible to hold on a single cpu. On N cpus, it is
7456	* eligible to hold (N * tg->shares) number of cpu shares.
7457	*/
7458	total_shares = tg->shares * cpus_weight(sdspan);
7459
7460	/*
7461	* redistribute total_shares across cpus as per the task load
7462	* distribution.
7463	*/
7464	for_each_cpu_mask(i, sdspan) {
7465	unsigned long local_load, local_shares;
7466
7467	local_load = tg->cfs_rq[i]->load.weight;
7468	local_shares = (local_load * total_shares) / total_load;
7469	if (!local_shares)
7470	local_shares = MIN_GROUP_SHARES;
7471	if (local_shares == tg->se[i]->load.weight)
7472	continue;
7473
7474	spin_lock_irq(&cpu_rq(i)->lock);
7475	set_se_shares(tg->se[i], local_shares);
7476	spin_unlock_irq(&cpu_rq(i)->lock);
7477	balanced = 0;
7478	}
7479	}
7480
7481	return balanced;
7482	}
7483
7484	/*
7485	* How frequently should we rebalance_shares() across cpus?
7486	*
7487	* The more frequently we rebalance shares, the more accurate is the fairness
7488	* of cpu bandwidth distribution between task groups. However higher frequency
7489	* also implies increased scheduling overhead.
7490	*
7491	* sysctl_sched_min_bal_int_shares represents the minimum interval between
7492	* consecutive calls to rebalance_shares() in the same sched domain.
7493	*
7494	* sysctl_sched_max_bal_int_shares represents the maximum interval between
7495	* consecutive calls to rebalance_shares() in the same sched domain.
7496	*
7497	* These settings allows for the appropriate trade-off between accuracy of
7498	* fairness and the associated overhead.
7499	*
7500	*/
7501
7502	/* default: 8ms, units: milliseconds */
7503	const_debug unsigned int sysctl_sched_min_bal_int_shares = 8;
7504
7505	/* default: 128ms, units: milliseconds */
7506	const_debug unsigned int sysctl_sched_max_bal_int_shares = 128;
7507
7508	/* kernel thread that runs rebalance_shares() periodically */
7509	static int load_balance_monitor(void *unused)
7510	{
7511	unsigned int timeout = sysctl_sched_min_bal_int_shares;
7512	struct sched_param schedparm;
7513	int ret;
7514
7515	/*
7516	* We don't want this thread's execution to be limited by the shares
7517	* assigned to default group (init_task_group). Hence make it run
7518	* as a SCHED_RR RT task at the lowest priority.
7519	*/
7520	schedparm.sched_priority = 1;
7521	ret = sched_setscheduler(current, SCHED_RR, &schedparm);
7522	if (ret)
7523	printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance"
7524	" monitor thread (error = %d) \n", ret);
7525
7526	while (!kthread_should_stop()) {
7527	int i, cpu, balanced = 1;
7528
7529	/* Prevent cpus going down or coming up */
7530	get_online_cpus();
7531	/* lockout changes to doms_cur[] array */
7532	lock_doms_cur();
7533	/*
7534	* Enter a rcu read-side critical section to safely walk rq->sd
7535	* chain on various cpus and to walk task group list
7536	* (rq->leaf_cfs_rq_list) in rebalance_shares().
7537	*/
7538	rcu_read_lock();
7539
7540	for (i = 0; i < ndoms_cur; i++) {
7541	cpumask_t cpumap = doms_cur[i];
7542	struct sched_domain *sd = NULL, *sd_prev = NULL;
7543
7544	cpu = first_cpu(cpumap);
7545
7546	/* Find the highest domain at which to balance shares */
7547	for_each_domain(cpu, sd) {
7548	if (!(sd->flags & SD_LOAD_BALANCE))
7549	continue;
7550	sd_prev = sd;
7551	}
7552
7553	sd = sd_prev;
7554	/* sd == NULL? No load balance reqd in this domain */
7555	if (!sd)
7556	continue;
7557
7558	balanced &= rebalance_shares(sd, cpu);
7559	}
7560
7561	rcu_read_unlock();
7562
7563	unlock_doms_cur();
7564	put_online_cpus();
7565
7566	if (!balanced)
7567	timeout = sysctl_sched_min_bal_int_shares;
7568	else if (timeout < sysctl_sched_max_bal_int_shares)
7569	timeout *= 2;
7570
7571	msleep_interruptible(timeout);
7572	}
7573
7574	return 0;
7575	}
7576	#endif /* CONFIG_SMP */
7577
7578	#ifdef CONFIG_FAIR_GROUP_SCHED	7372	#ifdef CONFIG_FAIR_GROUP_SCHED
7579	static void free_fair_sched_group(struct task_group *tg)	7373	static void free_fair_sched_group(struct task_group *tg)
7580	{	7374	{
@@ -7841,29 +7635,25 @@ void sched_move_task(struct task_struct *tsk)
7841	}	7635	}
7842		7636
7843	#ifdef CONFIG_FAIR_GROUP_SCHED	7637	#ifdef CONFIG_FAIR_GROUP_SCHED
7844	/* rq->lock to be locked by caller */
7845	static void set_se_shares(struct sched_entity *se, unsigned long shares)	7638	static void set_se_shares(struct sched_entity *se, unsigned long shares)
7846	{	7639	{
7847	struct cfs_rq *cfs_rq = se->cfs_rq;	7640	struct cfs_rq *cfs_rq = se->cfs_rq;
7848	struct rq *rq = cfs_rq->rq;	7641	struct rq *rq = cfs_rq->rq;
7849	int on_rq;	7642	int on_rq;
7850		7643
7851	if (!shares)	7644	spin_lock_irq(&rq->lock);
7852	shares = MIN_GROUP_SHARES;
7853		7645
7854	on_rq = se->on_rq;	7646	on_rq = se->on_rq;
7855	if (on_rq) {	7647	if (on_rq)
7856	dequeue_entity(cfs_rq, se, 0);	7648	dequeue_entity(cfs_rq, se, 0);
7857	dec_cpu_load(rq, se->load.weight);
7858	}
7859		7649
7860	se->load.weight = shares;	7650	se->load.weight = shares;
7861	se->load.inv_weight = div64_64((1ULL<<32), shares);	7651	se->load.inv_weight = div64_64((1ULL<<32), shares);
7862		7652
7863	if (on_rq) {	7653	if (on_rq)
7864	enqueue_entity(cfs_rq, se, 0);	7654	enqueue_entity(cfs_rq, se, 0);
7865	inc_cpu_load(rq, se->load.weight);	7655
7866	}	7656	spin_unlock_irq(&rq->lock);
7867	}	7657	}
7868		7658
7869	static DEFINE_MUTEX(shares_mutex);	7659	static DEFINE_MUTEX(shares_mutex);
@@ -7873,18 +7663,18 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7873	int i;	7663	int i;
7874	unsigned long flags;	7664	unsigned long flags;
7875		7665
		7666	/*
		7667	* A weight of 0 or 1 can cause arithmetics problems.
		7668	* (The default weight is 1024 - so there's no practical
		7669	* limitation from this.)
		7670	*/
		7671	if (shares < 2)
		7672	shares = 2;
		7673
7876	mutex_lock(&shares_mutex);	7674	mutex_lock(&shares_mutex);
7877	if (tg->shares == shares)	7675	if (tg->shares == shares)
7878	goto done;	7676	goto done;
7879		7677
7880	if (shares < MIN_GROUP_SHARES)
7881	shares = MIN_GROUP_SHARES;
7882
7883	/*
7884	* Prevent any load balance activity (rebalance_shares,
7885	* load_balance_fair) from referring to this group first,
7886	* by taking it off the rq->leaf_cfs_rq_list on each cpu.
7887	*/
7888	spin_lock_irqsave(&task_group_lock, flags);	7678	spin_lock_irqsave(&task_group_lock, flags);
7889	for_each_possible_cpu(i)	7679	for_each_possible_cpu(i)
7890	unregister_fair_sched_group(tg, i);	7680	unregister_fair_sched_group(tg, i);
@@ -7898,11 +7688,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7898	* w/o tripping rebalance_share or load_balance_fair.	7688	* w/o tripping rebalance_share or load_balance_fair.
7899	*/	7689	*/
7900	tg->shares = shares;	7690	tg->shares = shares;
7901	for_each_possible_cpu(i) {	7691	for_each_possible_cpu(i)
7902	spin_lock_irq(&cpu_rq(i)->lock);
7903	set_se_shares(tg->se[i], shares);	7692	set_se_shares(tg->se[i], shares);
7904	spin_unlock_irq(&cpu_rq(i)->lock);
7905	}
7906		7693
7907	/*	7694	/*
7908	* Enable load balance activity on this group, by inserting it back on	7695	* Enable load balance activity on this group, by inserting it back on