1 files changed, 91 insertions, 163 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index e24ecd39c4b8..08ae848b71d4 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -334,34 +334,6 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
 #endif
 /*
- * delta *= w / rw
- */
-static inline unsigned long
-calc_delta_weight(unsigned long delta, struct sched_entity *se)
-{
-        for_each_sched_entity(se) {
-                delta = calc_delta_mine(delta,
-                                se->load.weight, &cfs_rq_of(se)->load);
-        }
-        return delta;
-}
-/*
- * delta *= rw / w
- */
-static inline unsigned long
-calc_delta_fair(unsigned long delta, struct sched_entity *se)
-{
-        for_each_sched_entity(se) {
-                delta = calc_delta_mine(delta,
-                                cfs_rq_of(se)->load.weight, &se->load);
-        }
-        return delta;
-}
-/*
 * The idea is to set a period in which each task runs once.
 *
 * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
@@ -390,54 +362,47 @@ static u64 __sched_period(unsigned long nr_running)
 */
 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-        return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
+        u64 slice = __sched_period(cfs_rq->nr_running);
+        for_each_sched_entity(se) {
+                cfs_rq = cfs_rq_of(se);
+                slice *= se->load.weight;
+                do_div(slice, cfs_rq->load.weight);
+        }
+        return slice;
 }
 /*
 * We calculate the vruntime slice of a to be inserted task
 *
- * vs = s*rw/w = p
+ * vs = s/w = p/rw
 */
 static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
        unsigned long nr_running = cfs_rq->nr_running;
+        unsigned long weight;
+        u64 vslice;
        if (!se->on_rq)
                nr_running++;
-        return __sched_period(nr_running);
+        vslice = __sched_period(nr_running);
-}
-/*
- * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
- * that it favours >=0 over <0.
- *
- *   -20         |
- *               |
- *     0 --------+-------
- *             .'
- *    19     .'
- *
- */
-static unsigned long
-calc_delta_asym(unsigned long delta, struct sched_entity *se)
-{
-        struct load_weight lw = {
-                .weight = NICE_0_LOAD,
-                .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
-        };
        for_each_sched_entity(se) {
-                struct load_weight *se_lw = &se->load;
+                cfs_rq = cfs_rq_of(se);
-                if (se->load.weight < NICE_0_LOAD)
+                weight = cfs_rq->load.weight;
-                        se_lw = &lw;
+                if (!se->on_rq)
+                        weight += se->load.weight;
-                delta = calc_delta_mine(delta,
+                vslice *= NICE_0_LOAD;
-                                cfs_rq_of(se)->load.weight, se_lw);
+                do_div(vslice, weight);
        }
-        return delta;
+        return vslice;
 }
 /*
@@ -454,7 +419,11 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
        curr->sum_exec_runtime += delta_exec;
        schedstat_add(cfs_rq, exec_clock, delta_exec);
-        delta_exec_weighted = calc_delta_fair(delta_exec, curr);
+        delta_exec_weighted = delta_exec;
+        if (unlikely(curr->load.weight != NICE_0_LOAD)) {
+                delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
+                                                        &curr->load);
+        }
        curr->vruntime += delta_exec_weighted;
 }
@@ -541,27 +510,10 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 * Scheduling class queueing methods:
 */
-#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
-static void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
-{
-        cfs_rq->task_weight += weight;
-}
-#else
-static inline void
-add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
-{
-}
-#endif
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
        update_load_add(&cfs_rq->load, se->load.weight);
-        if (!parent_entity(se))
-                inc_cpu_load(rq_of(cfs_rq), se->load.weight);
-        if (entity_is_task(se))
-                add_cfs_task_weight(cfs_rq, se->load.weight);
        cfs_rq->nr_running++;
        se->on_rq = 1;
        list_add(&se->group_node, &cfs_rq->tasks);
@@ -571,10 +523,6 @@ static void
 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
        update_load_sub(&cfs_rq->load, se->load.weight);
-        if (!parent_entity(se))
-                dec_cpu_load(rq_of(cfs_rq), se->load.weight);
-        if (entity_is_task(se))
-                add_cfs_task_weight(cfs_rq, -se->load.weight);
        cfs_rq->nr_running--;
        se->on_rq = 0;
        list_del_init(&se->group_node);
@@ -661,17 +609,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
        if (!initial) {
                /* sleeps upto a single latency don't count. */
-                if (sched_feat(NEW_FAIR_SLEEPERS)) {
+                if (sched_feat(NEW_FAIR_SLEEPERS))
-                        unsigned long thresh = sysctl_sched_latency;
+                        vruntime -= sysctl_sched_latency;
-                        /*
-                         * convert the sleeper threshold into virtual time
-                         */
-                        if (sched_feat(NORMALIZED_SLEEPER))
-                                thresh = calc_delta_fair(thresh, se);
-                        vruntime -= thresh;
-                }
                /* ensure we never gain time by being placed backwards. */
                vruntime = max_vruntime(se->vruntime, vruntime);
@@ -1057,16 +996,27 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
        struct task_struct *curr = this_rq->curr;
        unsigned long tl = this_load;
        unsigned long tl_per_task;
+        int balanced;
-        if (!(this_sd->flags & SD_WAKE_AFFINE))
+        if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
                return 0;
        /*
+         * If sync wakeup then subtract the (maximum possible)
+         * effect of the currently running task from the load
+         * of the current CPU:
+         */
+        if (sync)
+                tl -= current->se.load.weight;
+        balanced = 100*(tl + p->se.load.weight) <= imbalance*load;
+        /*
         * If the currently running task will sleep within
         * a reasonable amount of time then attract this newly
         * woken task:
         */
-        if (sync && curr->sched_class == &fair_sched_class) {
+        if (sync && balanced && curr->sched_class == &fair_sched_class) {
                if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
                                p->se.avg_overlap < sysctl_sched_migration_cost)
                        return 1;
@@ -1075,16 +1025,8 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
        schedstat_inc(p, se.nr_wakeups_affine_attempts);
        tl_per_task = cpu_avg_load_per_task(this_cpu);
-        /*
-         * If sync wakeup then subtract the (maximum possible)
-         * effect of the currently running task from the load
-         * of the current CPU:
-         */
-        if (sync)
-                tl -= current->se.load.weight;
        if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) ||
-                        100*(tl + p->se.load.weight) <= imbalance*load) {
+                        balanced) {
                /*
                 * This domain has SD_WAKE_AFFINE and
                 * p is cache cold in this domain, and
@@ -1169,10 +1111,11 @@ static unsigned long wakeup_gran(struct sched_entity *se)
        unsigned long gran = sysctl_sched_wakeup_granularity;
        /*
-         * More easily preempt - nice tasks, while not making it harder for
+         * More easily preempt - nice tasks, while not making
-         * + nice tasks.
+         * it harder for + nice tasks.
         */
-        gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);
+        if (unlikely(se->load.weight > NICE_0_LOAD))
+                gran = calc_delta_fair(gran, &se->load);
        return gran;
 }
@@ -1366,90 +1309,75 @@ static struct task_struct *load_balance_next_fair(void *arg)
        return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
 }
-static unsigned long
+#ifdef CONFIG_FAIR_GROUP_SCHED
-__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
-                unsigned long max_load_move, struct sched_domain *sd,
-                enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
-                struct cfs_rq *cfs_rq)
 {
-        struct rq_iterator cfs_rq_iterator;
+        struct sched_entity *curr;
+        struct task_struct *p;
-        cfs_rq_iterator.start = load_balance_start_fair;
+        if (!cfs_rq->nr_running || !first_fair(cfs_rq))
-        cfs_rq_iterator.next = load_balance_next_fair;
+                return MAX_PRIO;
-        cfs_rq_iterator.arg = cfs_rq;
+        curr = cfs_rq->curr;
+        if (!curr)
+                curr = __pick_next_entity(cfs_rq);
+        p = task_of(curr);
-        return balance_tasks(this_rq, this_cpu, busiest,
+        return p->prio;
-                        max_load_move, sd, idle, all_pinned,
-                        this_best_prio, &cfs_rq_iterator);
 }
+#endif
-#ifdef CONFIG_FAIR_GROUP_SCHED
 static unsigned long
 load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                  unsigned long max_load_move,
                  struct sched_domain *sd, enum cpu_idle_type idle,
                  int *all_pinned, int *this_best_prio)
 {
+        struct cfs_rq *busy_cfs_rq;
        long rem_load_move = max_load_move;
-        int busiest_cpu = cpu_of(busiest);
+        struct rq_iterator cfs_rq_iterator;
-        struct task_group *tg;
-        rcu_read_lock();
-        list_for_each_entry(tg, &task_groups, list) {
-                long imbalance;
-                unsigned long this_weight, busiest_weight;
-                long rem_load, max_load, moved_load;
-                /*
-                 * empty group
-                 */
-                if (!aggregate(tg, sd)->task_weight)
-                        continue;
-                rem_load = rem_load_move * aggregate(tg, sd)->rq_weight;
-                rem_load /= aggregate(tg, sd)->load + 1;
-                this_weight = tg->cfs_rq[this_cpu]->task_weight;
-                busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
-                imbalance = (busiest_weight - this_weight) / 2;
+        cfs_rq_iterator.start = load_balance_start_fair;
+        cfs_rq_iterator.next = load_balance_next_fair;
-                if (imbalance < 0)
+        for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
-                        imbalance = busiest_weight;
+#ifdef CONFIG_FAIR_GROUP_SCHED
+                struct cfs_rq *this_cfs_rq;
+                long imbalance;
+                unsigned long maxload;
-                max_load = max(rem_load, imbalance);
+                this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
-                moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
-                                max_load, sd, idle, all_pinned, this_best_prio,
-                                tg->cfs_rq[busiest_cpu]);
-                if (!moved_load)
+                imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
+                /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
+                if (imbalance <= 0)
                        continue;
-                move_group_shares(tg, sd, busiest_cpu, this_cpu);
+                /* Don't pull more than imbalance/2 */
+                imbalance /= 2;
+                maxload = min(rem_load_move, imbalance);
-                moved_load *= aggregate(tg, sd)->load;
+                *this_best_prio = cfs_rq_best_prio(this_cfs_rq);
-                moved_load /= aggregate(tg, sd)->rq_weight + 1;
+#else
+# define maxload rem_load_move
+#endif
+                /*
+                 * pass busy_cfs_rq argument into
+                 * load_balance_[start|next]_fair iterators
+                 */
+                cfs_rq_iterator.arg = busy_cfs_rq;
+                rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
+                                               maxload, sd, idle, all_pinned,
+                                               this_best_prio,
+                                               &cfs_rq_iterator);
-                rem_load_move -= moved_load;
+                if (rem_load_move <= 0)
-                if (rem_load_move < 0)
                        break;
        }
-        rcu_read_unlock();
        return max_load_move - rem_load_move;
 }
-#else
-static unsigned long
-load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
-                  unsigned long max_load_move,
-                  struct sched_domain *sd, enum cpu_idle_type idle,
-                  int *all_pinned, int *this_best_prio)
-{
-        return __load_balance_fair(this_rq, this_cpu, busiest,
-                        max_load_move, sd, idle, all_pinned,
-                        this_best_prio, &busiest->cfs);
-}
-#endif
 static int
 move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index e24ecd39c4b8..08ae848b71d4 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c
@@ -334,34 +334,6 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
334	#endif	334	#endif
335		335
336	/*	336	/*
337	* delta *= w / rw
338	*/
339	static inline unsigned long
340	calc_delta_weight(unsigned long delta, struct sched_entity *se)
341	{
342	for_each_sched_entity(se) {
343	delta = calc_delta_mine(delta,
344	se->load.weight, &cfs_rq_of(se)->load);
345	}
346
347	return delta;
348	}
349
350	/*
351	* delta *= rw / w
352	*/
353	static inline unsigned long
354	calc_delta_fair(unsigned long delta, struct sched_entity *se)
355	{
356	for_each_sched_entity(se) {
357	delta = calc_delta_mine(delta,
358	cfs_rq_of(se)->load.weight, &se->load);
359	}
360
361	return delta;
362	}
363
364	/*
365	* The idea is to set a period in which each task runs once.	337	* The idea is to set a period in which each task runs once.
366	*	338	*
367	* When there are too many tasks (sysctl_sched_nr_latency) we have to stretch	339	* When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
@@ -390,54 +362,47 @@ static u64 __sched_period(unsigned long nr_running)
390	*/	362	*/
391	static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)	363	static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
392	{	364	{
393	return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);	365	u64 slice = __sched_period(cfs_rq->nr_running);
		366
		367	for_each_sched_entity(se) {
		368	cfs_rq = cfs_rq_of(se);
		369
		370	slice *= se->load.weight;
		371	do_div(slice, cfs_rq->load.weight);
		372	}
		373
		374
		375	return slice;
394	}	376	}
395		377
396	/*	378	/*
397	* We calculate the vruntime slice of a to be inserted task	379	* We calculate the vruntime slice of a to be inserted task
398	*	380	*
399	* vs = s*rw/w = p	381	* vs = s/w = p/rw
400	*/	382	*/
401	static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)	383	static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
402	{	384	{
403	unsigned long nr_running = cfs_rq->nr_running;	385	unsigned long nr_running = cfs_rq->nr_running;
		386	unsigned long weight;
		387	u64 vslice;
404		388
405	if (!se->on_rq)	389	if (!se->on_rq)
406	nr_running++;	390	nr_running++;
407		391
408	return __sched_period(nr_running);	392	vslice = __sched_period(nr_running);
409	}
410
411	/*
412	* The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in
413	* that it favours >=0 over <0.
414	*
415	*   -20         |
416	*               |
417	*     0 --------+-------
418	*             .'
419	*    19     .'
420	*
421	*/
422	static unsigned long
423	calc_delta_asym(unsigned long delta, struct sched_entity *se)
424	{
425	struct load_weight lw = {
426	.weight = NICE_0_LOAD,
427	.inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
428	};
429		393
430	for_each_sched_entity(se) {	394	for_each_sched_entity(se) {
431	struct load_weight *se_lw = &se->load;	395	cfs_rq = cfs_rq_of(se);
432		396
433	if (se->load.weight < NICE_0_LOAD)	397	weight = cfs_rq->load.weight;
434	se_lw = &lw;	398	if (!se->on_rq)
		399	weight += se->load.weight;
435		400
436	delta = calc_delta_mine(delta,	401	vslice *= NICE_0_LOAD;
437	cfs_rq_of(se)->load.weight, se_lw);	402	do_div(vslice, weight);
438	}	403	}
439		404
440	return delta;	405	return vslice;
441	}	406	}
442		407
443	/*	408	/*
@@ -454,7 +419,11 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
454		419
455	curr->sum_exec_runtime += delta_exec;	420	curr->sum_exec_runtime += delta_exec;
456	schedstat_add(cfs_rq, exec_clock, delta_exec);	421	schedstat_add(cfs_rq, exec_clock, delta_exec);
457	delta_exec_weighted = calc_delta_fair(delta_exec, curr);	422	delta_exec_weighted = delta_exec;
		423	if (unlikely(curr->load.weight != NICE_0_LOAD)) {
		424	delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
		425	&curr->load);
		426	}
458	curr->vruntime += delta_exec_weighted;	427	curr->vruntime += delta_exec_weighted;
459	}	428	}
460		429
@@ -541,27 +510,10 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
541	* Scheduling class queueing methods:	510	* Scheduling class queueing methods:
542	*/	511	*/
543		512
544	#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
545	static void
546	add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
547	{
548	cfs_rq->task_weight += weight;
549	}
550	#else
551	static inline void
552	add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
553	{
554	}
555	#endif
556
557	static void	513	static void
558	account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)	514	account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
559	{	515	{
560	update_load_add(&cfs_rq->load, se->load.weight);	516	update_load_add(&cfs_rq->load, se->load.weight);
561	if (!parent_entity(se))
562	inc_cpu_load(rq_of(cfs_rq), se->load.weight);
563	if (entity_is_task(se))
564	add_cfs_task_weight(cfs_rq, se->load.weight);
565	cfs_rq->nr_running++;	517	cfs_rq->nr_running++;
566	se->on_rq = 1;	518	se->on_rq = 1;
567	list_add(&se->group_node, &cfs_rq->tasks);	519	list_add(&se->group_node, &cfs_rq->tasks);
@@ -571,10 +523,6 @@ static void
571	account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)	523	account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
572	{	524	{
573	update_load_sub(&cfs_rq->load, se->load.weight);	525	update_load_sub(&cfs_rq->load, se->load.weight);
574	if (!parent_entity(se))
575	dec_cpu_load(rq_of(cfs_rq), se->load.weight);
576	if (entity_is_task(se))
577	add_cfs_task_weight(cfs_rq, -se->load.weight);
578	cfs_rq->nr_running--;	526	cfs_rq->nr_running--;
579	se->on_rq = 0;	527	se->on_rq = 0;
580	list_del_init(&se->group_node);	528	list_del_init(&se->group_node);
@@ -661,17 +609,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
661		609
662	if (!initial) {	610	if (!initial) {
663	/* sleeps upto a single latency don't count. */	611	/* sleeps upto a single latency don't count. */
664	if (sched_feat(NEW_FAIR_SLEEPERS)) {	612	if (sched_feat(NEW_FAIR_SLEEPERS))
665	unsigned long thresh = sysctl_sched_latency;	613	vruntime -= sysctl_sched_latency;
666
667	/*
668	* convert the sleeper threshold into virtual time
669	*/
670	if (sched_feat(NORMALIZED_SLEEPER))
671	thresh = calc_delta_fair(thresh, se);
672
673	vruntime -= thresh;
674	}
675		614
676	/* ensure we never gain time by being placed backwards. */	615	/* ensure we never gain time by being placed backwards. */
677	vruntime = max_vruntime(se->vruntime, vruntime);	616	vruntime = max_vruntime(se->vruntime, vruntime);
@@ -1057,16 +996,27 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
1057	struct task_struct *curr = this_rq->curr;	996	struct task_struct *curr = this_rq->curr;
1058	unsigned long tl = this_load;	997	unsigned long tl = this_load;
1059	unsigned long tl_per_task;	998	unsigned long tl_per_task;
		999	int balanced;
1060		1000
1061	if (!(this_sd->flags & SD_WAKE_AFFINE))	1001	if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
1062	return 0;	1002	return 0;
1063		1003
1064	/*	1004	/*
		1005	* If sync wakeup then subtract the (maximum possible)
		1006	* effect of the currently running task from the load
		1007	* of the current CPU:
		1008	*/
		1009	if (sync)
		1010	tl -= current->se.load.weight;
		1011
		1012	balanced = 100*(tl + p->se.load.weight) <= imbalance*load;
		1013
		1014	/*
1065	* If the currently running task will sleep within	1015	* If the currently running task will sleep within
1066	* a reasonable amount of time then attract this newly	1016	* a reasonable amount of time then attract this newly
1067	* woken task:	1017	* woken task:
1068	*/	1018	*/
1069	if (sync && curr->sched_class == &fair_sched_class) {	1019	if (sync && balanced && curr->sched_class == &fair_sched_class) {
1070	if (curr->se.avg_overlap < sysctl_sched_migration_cost &&	1020	if (curr->se.avg_overlap < sysctl_sched_migration_cost &&
1071	p->se.avg_overlap < sysctl_sched_migration_cost)	1021	p->se.avg_overlap < sysctl_sched_migration_cost)
1072	return 1;	1022	return 1;
@@ -1075,16 +1025,8 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
1075	schedstat_inc(p, se.nr_wakeups_affine_attempts);	1025	schedstat_inc(p, se.nr_wakeups_affine_attempts);
1076	tl_per_task = cpu_avg_load_per_task(this_cpu);	1026	tl_per_task = cpu_avg_load_per_task(this_cpu);
1077		1027
1078	/*
1079	* If sync wakeup then subtract the (maximum possible)
1080	* effect of the currently running task from the load
1081	* of the current CPU:
1082	*/
1083	if (sync)
1084	tl -= current->se.load.weight;
1085
1086	if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) ||	1028	if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) ||
1087	100*(tl + p->se.load.weight) <= imbalance*load) {	1029	balanced) {
1088	/*	1030	/*
1089	* This domain has SD_WAKE_AFFINE and	1031	* This domain has SD_WAKE_AFFINE and
1090	* p is cache cold in this domain, and	1032	* p is cache cold in this domain, and
@@ -1169,10 +1111,11 @@ static unsigned long wakeup_gran(struct sched_entity *se)
1169	unsigned long gran = sysctl_sched_wakeup_granularity;	1111	unsigned long gran = sysctl_sched_wakeup_granularity;
1170		1112
1171	/*	1113	/*
1172	* More easily preempt - nice tasks, while not making it harder for	1114	* More easily preempt - nice tasks, while not making
1173	* + nice tasks.	1115	* it harder for + nice tasks.
1174	*/	1116	*/
1175	gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se);	1117	if (unlikely(se->load.weight > NICE_0_LOAD))
		1118	gran = calc_delta_fair(gran, &se->load);
1176		1119
1177	return gran;	1120	return gran;
1178	}	1121	}
@@ -1366,90 +1309,75 @@ static struct task_struct *load_balance_next_fair(void *arg)
1366	return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);	1309	return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
1367	}	1310	}
1368		1311
1369	static unsigned long	1312	#ifdef CONFIG_FAIR_GROUP_SCHED
1370	__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,	1313	static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
1371	unsigned long max_load_move, struct sched_domain *sd,
1372	enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
1373	struct cfs_rq *cfs_rq)
1374	{	1314	{
1375	struct rq_iterator cfs_rq_iterator;	1315	struct sched_entity *curr;
		1316	struct task_struct *p;
1376		1317
1377	cfs_rq_iterator.start = load_balance_start_fair;	1318	if (!cfs_rq->nr_running || !first_fair(cfs_rq))
1378	cfs_rq_iterator.next = load_balance_next_fair;	1319	return MAX_PRIO;
1379	cfs_rq_iterator.arg = cfs_rq;	1320
		1321	curr = cfs_rq->curr;
		1322	if (!curr)
		1323	curr = __pick_next_entity(cfs_rq);
		1324
		1325	p = task_of(curr);
1380		1326
1381	return balance_tasks(this_rq, this_cpu, busiest,	1327	return p->prio;
1382	max_load_move, sd, idle, all_pinned,
1383	this_best_prio, &cfs_rq_iterator);
1384	}	1328	}
		1329	#endif
1385		1330
1386	#ifdef CONFIG_FAIR_GROUP_SCHED
1387	static unsigned long	1331	static unsigned long
1388	load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,	1332	load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1389	unsigned long max_load_move,	1333	unsigned long max_load_move,
1390	struct sched_domain *sd, enum cpu_idle_type idle,	1334	struct sched_domain *sd, enum cpu_idle_type idle,
1391	int *all_pinned, int *this_best_prio)	1335	int *all_pinned, int *this_best_prio)
1392	{	1336	{
		1337	struct cfs_rq *busy_cfs_rq;
1393	long rem_load_move = max_load_move;	1338	long rem_load_move = max_load_move;
1394	int busiest_cpu = cpu_of(busiest);	1339	struct rq_iterator cfs_rq_iterator;
1395	struct task_group *tg;
1396
1397	rcu_read_lock();
1398	list_for_each_entry(tg, &task_groups, list) {
1399	long imbalance;
1400	unsigned long this_weight, busiest_weight;
1401	long rem_load, max_load, moved_load;
1402
1403	/*
1404	* empty group
1405	*/
1406	if (!aggregate(tg, sd)->task_weight)
1407	continue;
1408
1409	rem_load = rem_load_move * aggregate(tg, sd)->rq_weight;
1410	rem_load /= aggregate(tg, sd)->load + 1;
1411
1412	this_weight = tg->cfs_rq[this_cpu]->task_weight;
1413	busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
1414		1340
1415	imbalance = (busiest_weight - this_weight) / 2;	1341	cfs_rq_iterator.start = load_balance_start_fair;
		1342	cfs_rq_iterator.next = load_balance_next_fair;
1416		1343
1417	if (imbalance < 0)	1344	for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
1418	imbalance = busiest_weight;	1345	#ifdef CONFIG_FAIR_GROUP_SCHED
		1346	struct cfs_rq *this_cfs_rq;
		1347	long imbalance;
		1348	unsigned long maxload;
1419		1349
1420	max_load = max(rem_load, imbalance);	1350	this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
1421	moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
1422	max_load, sd, idle, all_pinned, this_best_prio,
1423	tg->cfs_rq[busiest_cpu]);
1424		1351
1425	if (!moved_load)	1352	imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
		1353	/* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
		1354	if (imbalance <= 0)
1426	continue;	1355	continue;
1427		1356
1428	move_group_shares(tg, sd, busiest_cpu, this_cpu);	1357	/* Don't pull more than imbalance/2 */
		1358	imbalance /= 2;
		1359	maxload = min(rem_load_move, imbalance);
1429		1360
1430	moved_load *= aggregate(tg, sd)->load;	1361	*this_best_prio = cfs_rq_best_prio(this_cfs_rq);
1431	moved_load /= aggregate(tg, sd)->rq_weight + 1;	1362	#else
		1363	# define maxload rem_load_move
		1364	#endif
		1365	/*
		1366	* pass busy_cfs_rq argument into
		1367	* load_balance_[start|next]_fair iterators
		1368	*/
		1369	cfs_rq_iterator.arg = busy_cfs_rq;
		1370	rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
		1371	maxload, sd, idle, all_pinned,
		1372	this_best_prio,
		1373	&cfs_rq_iterator);
1432		1374
1433	rem_load_move -= moved_load;	1375	if (rem_load_move <= 0)
1434	if (rem_load_move < 0)
1435	break;	1376	break;
1436	}	1377	}
1437	rcu_read_unlock();
1438		1378
1439	return max_load_move - rem_load_move;	1379	return max_load_move - rem_load_move;
1440	}	1380	}
1441	#else
1442	static unsigned long
1443	load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1444	unsigned long max_load_move,
1445	struct sched_domain *sd, enum cpu_idle_type idle,
1446	int *all_pinned, int *this_best_prio)
1447	{
1448	return __load_balance_fair(this_rq, this_cpu, busiest,
1449	max_load_move, sd, idle, all_pinned,
1450	this_best_prio, &busiest->cfs);
1451	}
1452	#endif
1453		1381
1454	static int	1382	static int
1455	move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,	1383	move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,