1 files changed, 261 insertions, 153 deletions
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index aa7f84121016..10d218ab69f2 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -711,7 +711,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
        if (!initial) {
                /* sleeps upto a single latency don't count. */
-                if (sched_feat(NEW_FAIR_SLEEPERS)) {
+                if (sched_feat(FAIR_SLEEPERS)) {
                        unsigned long thresh = sysctl_sched_latency;
                        /*
@@ -725,6 +725,13 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
                                         task_of(se)->policy != SCHED_IDLE))
                                thresh = calc_delta_fair(thresh, se);
+                        /*
+                         * Halve their sleep time's effect, to allow
+                         * for a gentler effect of sleepers:
+                         */
+                        if (sched_feat(GENTLE_FAIR_SLEEPERS))
+                                thresh >>= 1;
                        vruntime -= thresh;
                }
        }
@@ -757,10 +764,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
 static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-        if (cfs_rq->last == se)
+        if (!se || cfs_rq->last == se)
                cfs_rq->last = NULL;
-        if (cfs_rq->next == se)
+        if (!se || cfs_rq->next == se)
                cfs_rq->next = NULL;
 }
@@ -1062,83 +1069,6 @@ static void yield_task_fair(struct rq *rq)
        se->vruntime = rightmost->vruntime + 1;
 }
-/*
- * wake_idle() will wake a task on an idle cpu if task->cpu is
- * not idle and an idle cpu is available.  The span of cpus to
- * search starts with cpus closest then further out as needed,
- * so we always favor a closer, idle cpu.
- * Domains may include CPUs that are not usable for migration,
- * hence we need to mask them out (rq->rd->online)
- *
- * Returns the CPU we should wake onto.
- */
-#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
-#define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online)
-static int wake_idle(int cpu, struct task_struct *p)
-{
-        struct sched_domain *sd;
-        int i;
-        unsigned int chosen_wakeup_cpu;
-        int this_cpu;
-        struct rq *task_rq = task_rq(p);
-        /*
-         * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
-         * are idle and this is not a kernel thread and this task's affinity
-         * allows it to be moved to preferred cpu, then just move!
-         */
-        this_cpu = smp_processor_id();
-        chosen_wakeup_cpu =
-                cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
-        if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
-                idle_cpu(cpu) && idle_cpu(this_cpu) &&
-                p->mm && !(p->flags & PF_KTHREAD) &&
-                cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
-                return chosen_wakeup_cpu;
-        /*
-         * If it is idle, then it is the best cpu to run this task.
-         *
-         * This cpu is also the best, if it has more than one task already.
-         * Siblings must be also busy(in most cases) as they didn't already
-         * pickup the extra load from this cpu and hence we need not check
-         * sibling runqueue info. This will avoid the checks and cache miss
-         * penalities associated with that.
-         */
-        if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
-                return cpu;
-        for_each_domain(cpu, sd) {
-                if ((sd->flags & SD_WAKE_IDLE)
-                    || ((sd->flags & SD_WAKE_IDLE_FAR)
-                        && !task_hot(p, task_rq->clock, sd))) {
-                        for_each_cpu_and(i, sched_domain_span(sd),
-                                         &p->cpus_allowed) {
-                                if (cpu_rd_active(i, task_rq) && idle_cpu(i)) {
-                                        if (i != task_cpu(p)) {
-                                                schedstat_inc(p,
-                                                       se.nr_wakeups_idle);
-                                        }
-                                        return i;
-                                }
-                        }
-                } else {
-                        break;
-                }
-        }
-        return cpu;
-}
-#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/
-static inline int wake_idle(int cpu, struct task_struct *p)
-{
-        return cpu;
-}
-#endif
 #ifdef CONFIG_SMP
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1225,25 +1155,34 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
 #endif
-static int
+static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
-wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
-            struct task_struct *p, int prev_cpu, int this_cpu, int sync,
-            int idx, unsigned long load, unsigned long this_load,
-            unsigned int imbalance)
 {
-        struct task_struct *curr = this_rq->curr;
+        struct task_struct *curr = current;
-        struct task_group *tg;
+        unsigned long this_load, load;
-        unsigned long tl = this_load;
+        int idx, this_cpu, prev_cpu;
        unsigned long tl_per_task;
+        unsigned int imbalance;
+        struct task_group *tg;
        unsigned long weight;
        int balanced;
-        if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
+        idx       = sd->wake_idx;
-                return 0;
+        this_cpu  = smp_processor_id();
+        prev_cpu  = task_cpu(p);
+        load      = source_load(prev_cpu, idx);
+        this_load = target_load(this_cpu, idx);
-        if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
+        if (sync) {
-                        p->se.avg_overlap > sysctl_sched_migration_cost))
+               if (sched_feat(SYNC_LESS) &&
-                sync = 0;
+                   (curr->se.avg_overlap > sysctl_sched_migration_cost ||
+                    p->se.avg_overlap > sysctl_sched_migration_cost))
+                       sync = 0;
+        } else {
+                if (sched_feat(SYNC_MORE) &&
+                    (curr->se.avg_overlap < sysctl_sched_migration_cost &&
+                     p->se.avg_overlap < sysctl_sched_migration_cost))
+                        sync = 1;
+        }
        /*
         * If sync wakeup then subtract the (maximum possible)
@@ -1254,24 +1193,26 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
                tg = task_group(current);
                weight = current->se.load.weight;
-                tl += effective_load(tg, this_cpu, -weight, -weight);
+                this_load += effective_load(tg, this_cpu, -weight, -weight);
                load += effective_load(tg, prev_cpu, 0, -weight);
        }
        tg = task_group(p);
        weight = p->se.load.weight;
+        imbalance = 100 + (sd->imbalance_pct - 100) / 2;
        /*
         * In low-load situations, where prev_cpu is idle and this_cpu is idle
-         * due to the sync cause above having dropped tl to 0, we'll always have
+         * due to the sync cause above having dropped this_load to 0, we'll
-         * an imbalance, but there's really nothing you can do about that, so
+         * always have an imbalance, but there's really nothing you can do
-         * that's good too.
+         * about that, so that's good too.
         *
         * Otherwise check if either cpus are near enough in load to allow this
         * task to be woken on this_cpu.
         */
-        balanced = !tl ||
+        balanced = !this_load ||
-                100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
+                100*(this_load + effective_load(tg, this_cpu, weight, weight)) <=
                imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
        /*
@@ -1285,14 +1226,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
        schedstat_inc(p, se.nr_wakeups_affine_attempts);
        tl_per_task = cpu_avg_load_per_task(this_cpu);
-        if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <=
+        if (balanced ||
-                        tl_per_task)) {
+            (this_load <= load &&
+             this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
                /*
                 * This domain has SD_WAKE_AFFINE and
                 * p is cache cold in this domain, and
                 * there is no bad imbalance.
                 */
-                schedstat_inc(this_sd, ttwu_move_affine);
+                schedstat_inc(sd, ttwu_move_affine);
                schedstat_inc(p, se.nr_wakeups_affine);
                return 1;
@@ -1300,65 +1242,215 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
        return 0;
 }
-static int select_task_rq_fair(struct task_struct *p, int sync)
+/*
+ * find_idlest_group finds and returns the least busy CPU group within the
+ * domain.
+ */
+static struct sched_group *
+find_idlest_group(struct sched_domain *sd, struct task_struct *p,
+                  int this_cpu, int load_idx)
 {
-        struct sched_domain *sd, *this_sd = NULL;
+        struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
-        int prev_cpu, this_cpu, new_cpu;
+        unsigned long min_load = ULONG_MAX, this_load = 0;
-        unsigned long load, this_load;
+        int imbalance = 100 + (sd->imbalance_pct-100)/2;
-        struct rq *this_rq;
-        unsigned int imbalance;
-        int idx;
-        prev_cpu        = task_cpu(p);
+        do {
-        this_cpu        = smp_processor_id();
+                unsigned long load, avg_load;
-        this_rq         = cpu_rq(this_cpu);
+                int local_group;
-        new_cpu         = prev_cpu;
+                int i;
-        /*
+                /* Skip over this group if it has no CPUs allowed */
-         * 'this_sd' is the first domain that both
+                if (!cpumask_intersects(sched_group_cpus(group),
-         * this_cpu and prev_cpu are present in:
+                                        &p->cpus_allowed))
-         */
+                        continue;
-        for_each_domain(this_cpu, sd) {
-                if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) {
+                local_group = cpumask_test_cpu(this_cpu,
-                        this_sd = sd;
+                                               sched_group_cpus(group));
-                        break;
+                /* Tally up the load of all CPUs in the group */
+                avg_load = 0;
+                for_each_cpu(i, sched_group_cpus(group)) {
+                        /* Bias balancing toward cpus of our domain */
+                        if (local_group)
+                                load = source_load(i, load_idx);
+                        else
+                                load = target_load(i, load_idx);
+                        avg_load += load;
+                }
+                /* Adjust by relative CPU power of the group */
+                avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+                if (local_group) {
+                        this_load = avg_load;
+                        this = group;
+                } else if (avg_load < min_load) {
+                        min_load = avg_load;
+                        idlest = group;
+                }
+        } while (group = group->next, group != sd->groups);
+        if (!idlest || 100*this_load < imbalance*min_load)
+                return NULL;
+        return idlest;
+}
+/*
+ * find_idlest_cpu - find the idlest cpu among the cpus in group.
+ */
+static int
+find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+{
+        unsigned long load, min_load = ULONG_MAX;
+        int idlest = -1;
+        int i;
+        /* Traverse only the allowed CPUs */
+        for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
+                load = weighted_cpuload(i);
+                if (load < min_load || (load == min_load && i == this_cpu)) {
+                        min_load = load;
+                        idlest = i;
                }
        }
-        if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed)))
+        return idlest;
-                goto out;
+}
-        /*
+/*
-         * Check for affine wakeup and passive balancing possibilities.
+ * select_task_rq_fair: balance the current task (running on cpu) in domains
-         */
+ * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_FORK and
-        if (!this_sd)
+ * SD_BALANCE_EXEC.
+ *
+ * Balance, ie. select the least loaded group.
+ *
+ * Returns the target CPU number, or the same CPU if no balancing is needed.
+ *
+ * preempt must be disabled.
+ */
+static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
+{
+        struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
+        int cpu = smp_processor_id();
+        int prev_cpu = task_cpu(p);
+        int new_cpu = cpu;
+        int want_affine = 0;
+        int want_sd = 1;
+        int sync = wake_flags & WF_SYNC;
+        if (sd_flag & SD_BALANCE_WAKE) {
+                if (sched_feat(AFFINE_WAKEUPS))
+                        want_affine = 1;
+                new_cpu = prev_cpu;
+        }
+        rcu_read_lock();
+        for_each_domain(cpu, tmp) {
+                /*
+                 * If power savings logic is enabled for a domain, see if we
+                 * are not overloaded, if so, don't balance wider.
+                 */
+                if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
+                        unsigned long power = 0;
+                        unsigned long nr_running = 0;
+                        unsigned long capacity;
+                        int i;
+                        for_each_cpu(i, sched_domain_span(tmp)) {
+                                power += power_of(i);
+                                nr_running += cpu_rq(i)->cfs.nr_running;
+                        }
+                        capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
+                        if (tmp->flags & SD_POWERSAVINGS_BALANCE)
+                                nr_running /= 2;
+                        if (nr_running < capacity)
+                                want_sd = 0;
+                }
+                if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
+                    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+                        affine_sd = tmp;
+                        want_affine = 0;
+                }
+                if (!want_sd && !want_affine)
+                        break;
+                if (!(tmp->flags & sd_flag))
+                        continue;
+                if (want_sd)
+                        sd = tmp;
+        }
+        if (sched_feat(LB_SHARES_UPDATE)) {
+                /*
+                 * Pick the largest domain to update shares over
+                 */
+                tmp = sd;
+                if (affine_sd && (!tmp ||
+                                  cpumask_weight(sched_domain_span(affine_sd)) >
+                                  cpumask_weight(sched_domain_span(sd))))
+                        tmp = affine_sd;
+                if (tmp)
+                        update_shares(tmp);
+        }
+        if (affine_sd && wake_affine(affine_sd, p, sync)) {
+                new_cpu = cpu;
                goto out;
+        }
-        idx = this_sd->wake_idx;
+        while (sd) {
+                int load_idx = sd->forkexec_idx;
+                struct sched_group *group;
+                int weight;
-        imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
+                if (!(sd->flags & sd_flag)) {
+                        sd = sd->child;
+                        continue;
+                }
-        load = source_load(prev_cpu, idx);
+                if (sd_flag & SD_BALANCE_WAKE)
-        this_load = target_load(this_cpu, idx);
+                        load_idx = sd->wake_idx;
-        if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
+                group = find_idlest_group(sd, p, cpu, load_idx);
-                                     load, this_load, imbalance))
+                if (!group) {
-                return this_cpu;
+                        sd = sd->child;
+                        continue;
+                }
-        /*
+                new_cpu = find_idlest_cpu(group, p, cpu);
-         * Start passive balancing when half the imbalance_pct
+                if (new_cpu == -1 || new_cpu == cpu) {
-         * limit is reached.
+                        /* Now try balancing at a lower domain level of cpu */
-         */
+                        sd = sd->child;
-        if (this_sd->flags & SD_WAKE_BALANCE) {
+                        continue;
-                if (imbalance*this_load <= 100*load) {
-                        schedstat_inc(this_sd, ttwu_move_balance);
-                        schedstat_inc(p, se.nr_wakeups_passive);
-                        return this_cpu;
                }
+                /* Now try balancing at a lower domain level of new_cpu */
+                cpu = new_cpu;
+                weight = cpumask_weight(sched_domain_span(sd));
+                sd = NULL;
+                for_each_domain(cpu, tmp) {
+                        if (weight <= cpumask_weight(sched_domain_span(tmp)))
+                                break;
+                        if (tmp->flags & sd_flag)
+                                sd = tmp;
+                }
+                /* while loop will break here if sd == NULL */
        }
 out:
-        return wake_idle(new_cpu, p);
+        rcu_read_unlock();
+        return new_cpu;
 }
 #endif /* CONFIG_SMP */
@@ -1471,11 +1563,12 @@ static void set_next_buddy(struct sched_entity *se)
 /*
 * Preempt the current task with a newly woken task if needed:
 */
-static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
+static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 {
        struct task_struct *curr = rq->curr;
        struct sched_entity *se = &curr->se, *pse = &p->se;
        struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+        int sync = wake_flags & WF_SYNC;
        update_curr(cfs_rq);
@@ -1501,7 +1594,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
         */
        if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
                set_last_buddy(se);
-        set_next_buddy(pse);
+        if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK))
+                set_next_buddy(pse);
        /*
         * We can come here with TIF_NEED_RESCHED already set from new task
@@ -1523,16 +1617,25 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
                return;
        }
-        if (!sched_feat(WAKEUP_PREEMPT))
+        if ((sched_feat(WAKEUP_SYNC) && sync) ||
-                return;
+            (sched_feat(WAKEUP_OVERLAP) &&
+             (se->avg_overlap < sysctl_sched_migration_cost &&
-        if (sched_feat(WAKEUP_OVERLAP) && (sync ||
+              pse->avg_overlap < sysctl_sched_migration_cost))) {
-                        (se->avg_overlap < sysctl_sched_migration_cost &&
-                         pse->avg_overlap < sysctl_sched_migration_cost))) {
                resched_task(curr);
                return;
        }
+        if (sched_feat(WAKEUP_RUNNING)) {
+                if (pse->avg_running < se->avg_running) {
+                        set_next_buddy(pse);
+                        resched_task(curr);
+                        return;
+                }
+        }
+        if (!sched_feat(WAKEUP_PREEMPT))
+                return;
        find_matching_se(&se, &pse);
        BUG_ON(!pse);
@@ -1555,8 +1658,13 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
                /*
                 * If se was a buddy, clear it so that it will have to earn
                 * the favour again.
+                 *
+                 * If se was not a buddy, clear the buddies because neither
+                 * was eligible to run, let them earn it again.
+                 *
+                 * IOW. unconditionally clear buddies.
                 */
-                __clear_buddies(cfs_rq, se);
+                __clear_buddies(cfs_rq, NULL);
                set_next_entity(cfs_rq, se);
                cfs_rq = group_cfs_rq(se);
        } while (cfs_rq);

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index aa7f84121016..10d218ab69f2 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c
@@ -711,7 +711,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
711		711
712	if (!initial) {	712	if (!initial) {
713	/* sleeps upto a single latency don't count. */	713	/* sleeps upto a single latency don't count. */
714	if (sched_feat(NEW_FAIR_SLEEPERS)) {	714	if (sched_feat(FAIR_SLEEPERS)) {
715	unsigned long thresh = sysctl_sched_latency;	715	unsigned long thresh = sysctl_sched_latency;
716		716
717	/*	717	/*
@@ -725,6 +725,13 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
725	task_of(se)->policy != SCHED_IDLE))	725	task_of(se)->policy != SCHED_IDLE))
726	thresh = calc_delta_fair(thresh, se);	726	thresh = calc_delta_fair(thresh, se);
727		727
		728	/*
		729	* Halve their sleep time's effect, to allow
		730	* for a gentler effect of sleepers:
		731	*/
		732	if (sched_feat(GENTLE_FAIR_SLEEPERS))
		733	thresh >>= 1;
		734
728	vruntime -= thresh;	735	vruntime -= thresh;
729	}	736	}
730	}	737	}
@@ -757,10 +764,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
757		764
758	static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)	765	static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
759	{	766	{
760	if (cfs_rq->last == se)	767	if (!se || cfs_rq->last == se)
761	cfs_rq->last = NULL;	768	cfs_rq->last = NULL;
762		769
763	if (cfs_rq->next == se)	770	if (!se || cfs_rq->next == se)
764	cfs_rq->next = NULL;	771	cfs_rq->next = NULL;
765	}	772	}
766		773
@@ -1062,83 +1069,6 @@ static void yield_task_fair(struct rq *rq)
1062	se->vruntime = rightmost->vruntime + 1;	1069	se->vruntime = rightmost->vruntime + 1;
1063	}	1070	}
1064		1071
1065	/*
1066	* wake_idle() will wake a task on an idle cpu if task->cpu is
1067	* not idle and an idle cpu is available. The span of cpus to
1068	* search starts with cpus closest then further out as needed,
1069	* so we always favor a closer, idle cpu.
1070	* Domains may include CPUs that are not usable for migration,
1071	* hence we need to mask them out (rq->rd->online)
1072	*
1073	* Returns the CPU we should wake onto.
1074	*/
1075	#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1076
1077	#define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online)
1078
1079	static int wake_idle(int cpu, struct task_struct *p)
1080	{
1081	struct sched_domain *sd;
1082	int i;
1083	unsigned int chosen_wakeup_cpu;
1084	int this_cpu;
1085	struct rq *task_rq = task_rq(p);
1086
1087	/*
1088	* At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
1089	* are idle and this is not a kernel thread and this task's affinity
1090	* allows it to be moved to preferred cpu, then just move!
1091	*/
1092
1093	this_cpu = smp_processor_id();
1094	chosen_wakeup_cpu =
1095	cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
1096
1097	if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
1098	idle_cpu(cpu) && idle_cpu(this_cpu) &&
1099	p->mm && !(p->flags & PF_KTHREAD) &&
1100	cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
1101	return chosen_wakeup_cpu;
1102
1103	/*
1104	* If it is idle, then it is the best cpu to run this task.
1105	*
1106	* This cpu is also the best, if it has more than one task already.
1107	* Siblings must be also busy(in most cases) as they didn't already
1108	* pickup the extra load from this cpu and hence we need not check
1109	* sibling runqueue info. This will avoid the checks and cache miss
1110	* penalities associated with that.
1111	*/
1112	if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
1113	return cpu;
1114
1115	for_each_domain(cpu, sd) {
1116	if ((sd->flags & SD_WAKE_IDLE)
1117	|| ((sd->flags & SD_WAKE_IDLE_FAR)
1118	&& !task_hot(p, task_rq->clock, sd))) {
1119	for_each_cpu_and(i, sched_domain_span(sd),
1120	&p->cpus_allowed) {
1121	if (cpu_rd_active(i, task_rq) && idle_cpu(i)) {
1122	if (i != task_cpu(p)) {
1123	schedstat_inc(p,
1124	se.nr_wakeups_idle);
1125	}
1126	return i;
1127	}
1128	}
1129	} else {
1130	break;
1131	}
1132	}
1133	return cpu;
1134	}
1135	#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/
1136	static inline int wake_idle(int cpu, struct task_struct *p)
1137	{
1138	return cpu;
1139	}
1140	#endif
1141
1142	#ifdef CONFIG_SMP	1072	#ifdef CONFIG_SMP
1143		1073
1144	#ifdef CONFIG_FAIR_GROUP_SCHED	1074	#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1225,25 +1155,34 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1225		1155
1226	#endif	1156	#endif
1227		1157
1228	static int	1158	static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
1229	wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1230	struct task_struct *p, int prev_cpu, int this_cpu, int sync,
1231	int idx, unsigned long load, unsigned long this_load,
1232	unsigned int imbalance)
1233	{	1159	{
1234	struct task_struct *curr = this_rq->curr;	1160	struct task_struct *curr = current;
1235	struct task_group *tg;	1161	unsigned long this_load, load;
1236	unsigned long tl = this_load;	1162	int idx, this_cpu, prev_cpu;
1237	unsigned long tl_per_task;	1163	unsigned long tl_per_task;
		1164	unsigned int imbalance;
		1165	struct task_group *tg;
1238	unsigned long weight;	1166	unsigned long weight;
1239	int balanced;	1167	int balanced;
1240		1168
1241	if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))	1169	idx = sd->wake_idx;
1242	return 0;	1170	this_cpu = smp_processor_id();
		1171	prev_cpu = task_cpu(p);
		1172	load = source_load(prev_cpu, idx);
		1173	this_load = target_load(this_cpu, idx);
1243		1174
1244	if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||	1175	if (sync) {
1245	p->se.avg_overlap > sysctl_sched_migration_cost))	1176	if (sched_feat(SYNC_LESS) &&
1246	sync = 0;	1177	(curr->se.avg_overlap > sysctl_sched_migration_cost ||
		1178	p->se.avg_overlap > sysctl_sched_migration_cost))
		1179	sync = 0;
		1180	} else {
		1181	if (sched_feat(SYNC_MORE) &&
		1182	(curr->se.avg_overlap < sysctl_sched_migration_cost &&
		1183	p->se.avg_overlap < sysctl_sched_migration_cost))
		1184	sync = 1;
		1185	}
1247		1186
1248	/*	1187	/*
1249	* If sync wakeup then subtract the (maximum possible)	1188	* If sync wakeup then subtract the (maximum possible)
@@ -1254,24 +1193,26 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1254	tg = task_group(current);	1193	tg = task_group(current);
1255	weight = current->se.load.weight;	1194	weight = current->se.load.weight;
1256		1195
1257	tl += effective_load(tg, this_cpu, -weight, -weight);	1196	this_load += effective_load(tg, this_cpu, -weight, -weight);
1258	load += effective_load(tg, prev_cpu, 0, -weight);	1197	load += effective_load(tg, prev_cpu, 0, -weight);
1259	}	1198	}
1260		1199
1261	tg = task_group(p);	1200	tg = task_group(p);
1262	weight = p->se.load.weight;	1201	weight = p->se.load.weight;
1263		1202
		1203	imbalance = 100 + (sd->imbalance_pct - 100) / 2;
		1204
1264	/*	1205	/*
1265	* In low-load situations, where prev_cpu is idle and this_cpu is idle	1206	* In low-load situations, where prev_cpu is idle and this_cpu is idle
1266	* due to the sync cause above having dropped tl to 0, we'll always have	1207	* due to the sync cause above having dropped this_load to 0, we'll
1267	* an imbalance, but there's really nothing you can do about that, so	1208	* always have an imbalance, but there's really nothing you can do
1268	* that's good too.	1209	* about that, so that's good too.
1269	*	1210	*
1270	* Otherwise check if either cpus are near enough in load to allow this	1211	* Otherwise check if either cpus are near enough in load to allow this
1271	* task to be woken on this_cpu.	1212	* task to be woken on this_cpu.
1272	*/	1213	*/
1273	balanced = !tl ||	1214	balanced = !this_load ||
1274	100*(tl + effective_load(tg, this_cpu, weight, weight)) <=	1215	100*(this_load + effective_load(tg, this_cpu, weight, weight)) <=
1275	imbalance*(load + effective_load(tg, prev_cpu, 0, weight));	1216	imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
1276		1217
1277	/*	1218	/*
@@ -1285,14 +1226,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1285	schedstat_inc(p, se.nr_wakeups_affine_attempts);	1226	schedstat_inc(p, se.nr_wakeups_affine_attempts);
1286	tl_per_task = cpu_avg_load_per_task(this_cpu);	1227	tl_per_task = cpu_avg_load_per_task(this_cpu);
1287		1228
1288	if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <=	1229	if (balanced ||
1289	tl_per_task)) {	1230	(this_load <= load &&
		1231	this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
1290	/*	1232	/*
1291	* This domain has SD_WAKE_AFFINE and	1233	* This domain has SD_WAKE_AFFINE and
1292	* p is cache cold in this domain, and	1234	* p is cache cold in this domain, and
1293	* there is no bad imbalance.	1235	* there is no bad imbalance.
1294	*/	1236	*/
1295	schedstat_inc(this_sd, ttwu_move_affine);	1237	schedstat_inc(sd, ttwu_move_affine);
1296	schedstat_inc(p, se.nr_wakeups_affine);	1238	schedstat_inc(p, se.nr_wakeups_affine);
1297		1239
1298	return 1;	1240	return 1;
@@ -1300,65 +1242,215 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1300	return 0;	1242	return 0;
1301	}	1243	}
1302		1244
1303	static int select_task_rq_fair(struct task_struct *p, int sync)	1245	/*
		1246	* find_idlest_group finds and returns the least busy CPU group within the
		1247	* domain.
		1248	*/
		1249	static struct sched_group *
		1250	find_idlest_group(struct sched_domain *sd, struct task_struct *p,
		1251	int this_cpu, int load_idx)
1304	{	1252	{
1305	struct sched_domain *sd, *this_sd = NULL;	1253	struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
1306	int prev_cpu, this_cpu, new_cpu;	1254	unsigned long min_load = ULONG_MAX, this_load = 0;
1307	unsigned long load, this_load;	1255	int imbalance = 100 + (sd->imbalance_pct-100)/2;
1308	struct rq *this_rq;
1309	unsigned int imbalance;
1310	int idx;
1311		1256
1312	prev_cpu = task_cpu(p);	1257	do {
1313	this_cpu = smp_processor_id();	1258	unsigned long load, avg_load;
1314	this_rq = cpu_rq(this_cpu);	1259	int local_group;
1315	new_cpu = prev_cpu;	1260	int i;
1316		1261
1317	/*	1262	/* Skip over this group if it has no CPUs allowed */
1318	* 'this_sd' is the first domain that both	1263	if (!cpumask_intersects(sched_group_cpus(group),
1319	* this_cpu and prev_cpu are present in:	1264	&p->cpus_allowed))
1320	*/	1265	continue;
1321	for_each_domain(this_cpu, sd) {	1266
1322	if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) {	1267	local_group = cpumask_test_cpu(this_cpu,
1323	this_sd = sd;	1268	sched_group_cpus(group));
1324	break;	1269
		1270	/* Tally up the load of all CPUs in the group */
		1271	avg_load = 0;
		1272
		1273	for_each_cpu(i, sched_group_cpus(group)) {
		1274	/* Bias balancing toward cpus of our domain */
		1275	if (local_group)
		1276	load = source_load(i, load_idx);
		1277	else
		1278	load = target_load(i, load_idx);
		1279
		1280	avg_load += load;
		1281	}
		1282
		1283	/* Adjust by relative CPU power of the group */
		1284	avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
		1285
		1286	if (local_group) {
		1287	this_load = avg_load;
		1288	this = group;
		1289	} else if (avg_load < min_load) {
		1290	min_load = avg_load;
		1291	idlest = group;
		1292	}
		1293	} while (group = group->next, group != sd->groups);
		1294
		1295	if (!idlest || 100*this_load < imbalance*min_load)
		1296	return NULL;
		1297	return idlest;
		1298	}
		1299
		1300	/*
		1301	* find_idlest_cpu - find the idlest cpu among the cpus in group.
		1302	*/
		1303	static int
		1304	find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
		1305	{
		1306	unsigned long load, min_load = ULONG_MAX;
		1307	int idlest = -1;
		1308	int i;
		1309
		1310	/* Traverse only the allowed CPUs */
		1311	for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
		1312	load = weighted_cpuload(i);
		1313
		1314	if (load < min_load || (load == min_load && i == this_cpu)) {
		1315	min_load = load;
		1316	idlest = i;
1325	}	1317	}
1326	}	1318	}
1327		1319
1328	if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed)))	1320	return idlest;
1329	goto out;	1321	}
1330		1322
1331	/*	1323	/*
1332	* Check for affine wakeup and passive balancing possibilities.	1324	* sched_balance_self: balance the current task (running on cpu) in domains
1333	*/	1325	* that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
1334	if (!this_sd)	1326	* SD_BALANCE_EXEC.
		1327	*
		1328	* Balance, ie. select the least loaded group.
		1329	*
		1330	* Returns the target CPU number, or the same CPU if no balancing is needed.
		1331	*
		1332	* preempt must be disabled.
		1333	*/
		1334	static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
		1335	{
		1336	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
		1337	int cpu = smp_processor_id();
		1338	int prev_cpu = task_cpu(p);
		1339	int new_cpu = cpu;
		1340	int want_affine = 0;
		1341	int want_sd = 1;
		1342	int sync = wake_flags & WF_SYNC;
		1343
		1344	if (sd_flag & SD_BALANCE_WAKE) {
		1345	if (sched_feat(AFFINE_WAKEUPS))
		1346	want_affine = 1;
		1347	new_cpu = prev_cpu;
		1348	}
		1349
		1350	rcu_read_lock();
		1351	for_each_domain(cpu, tmp) {
		1352	/*
		1353	* If power savings logic is enabled for a domain, see if we
		1354	* are not overloaded, if so, don't balance wider.
		1355	*/
		1356	if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
		1357	unsigned long power = 0;
		1358	unsigned long nr_running = 0;
		1359	unsigned long capacity;
		1360	int i;
		1361
		1362	for_each_cpu(i, sched_domain_span(tmp)) {
		1363	power += power_of(i);
		1364	nr_running += cpu_rq(i)->cfs.nr_running;
		1365	}
		1366
		1367	capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
		1368
		1369	if (tmp->flags & SD_POWERSAVINGS_BALANCE)
		1370	nr_running /= 2;
		1371
		1372	if (nr_running < capacity)
		1373	want_sd = 0;
		1374	}
		1375
		1376	if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
		1377	cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
		1378
		1379	affine_sd = tmp;
		1380	want_affine = 0;
		1381	}
		1382
		1383	if (!want_sd && !want_affine)
		1384	break;
		1385
		1386	if (!(tmp->flags & sd_flag))
		1387	continue;
		1388
		1389	if (want_sd)
		1390	sd = tmp;
		1391	}
		1392
		1393	if (sched_feat(LB_SHARES_UPDATE)) {
		1394	/*
		1395	* Pick the largest domain to update shares over
		1396	*/
		1397	tmp = sd;
		1398	if (affine_sd && (!tmp ||
		1399	cpumask_weight(sched_domain_span(affine_sd)) >
		1400	cpumask_weight(sched_domain_span(sd))))
		1401	tmp = affine_sd;
		1402
		1403	if (tmp)
		1404	update_shares(tmp);
		1405	}
		1406
		1407	if (affine_sd && wake_affine(affine_sd, p, sync)) {
		1408	new_cpu = cpu;
1335	goto out;	1409	goto out;
		1410	}
1336		1411
1337	idx = this_sd->wake_idx;	1412	while (sd) {
		1413	int load_idx = sd->forkexec_idx;
		1414	struct sched_group *group;
		1415	int weight;
1338		1416
1339	imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;	1417	if (!(sd->flags & sd_flag)) {
		1418	sd = sd->child;
		1419	continue;
		1420	}
1340		1421
1341	load = source_load(prev_cpu, idx);	1422	if (sd_flag & SD_BALANCE_WAKE)
1342	this_load = target_load(this_cpu, idx);	1423	load_idx = sd->wake_idx;
1343		1424
1344	if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,	1425	group = find_idlest_group(sd, p, cpu, load_idx);
1345	load, this_load, imbalance))	1426	if (!group) {
1346	return this_cpu;	1427	sd = sd->child;
		1428	continue;
		1429	}
1347		1430
1348	/*	1431	new_cpu = find_idlest_cpu(group, p, cpu);
1349	* Start passive balancing when half the imbalance_pct	1432	if (new_cpu == -1 || new_cpu == cpu) {
1350	* limit is reached.	1433	/* Now try balancing at a lower domain level of cpu */
1351	*/	1434	sd = sd->child;
1352	if (this_sd->flags & SD_WAKE_BALANCE) {	1435	continue;
1353	if (imbalance*this_load <= 100*load) {
1354	schedstat_inc(this_sd, ttwu_move_balance);
1355	schedstat_inc(p, se.nr_wakeups_passive);
1356	return this_cpu;
1357	}	1436	}
		1437
		1438	/* Now try balancing at a lower domain level of new_cpu */
		1439	cpu = new_cpu;
		1440	weight = cpumask_weight(sched_domain_span(sd));
		1441	sd = NULL;
		1442	for_each_domain(cpu, tmp) {
		1443	if (weight <= cpumask_weight(sched_domain_span(tmp)))
		1444	break;
		1445	if (tmp->flags & sd_flag)
		1446	sd = tmp;
		1447	}
		1448	/* while loop will break here if sd == NULL */
1358	}	1449	}
1359		1450
1360	out:	1451	out:
1361	return wake_idle(new_cpu, p);	1452	rcu_read_unlock();
		1453	return new_cpu;
1362	}	1454	}
1363	#endif /* CONFIG_SMP */	1455	#endif /* CONFIG_SMP */
1364		1456
@@ -1471,11 +1563,12 @@ static void set_next_buddy(struct sched_entity *se)
1471	/*	1563	/*
1472	* Preempt the current task with a newly woken task if needed:	1564	* Preempt the current task with a newly woken task if needed:
1473	*/	1565	*/
1474	static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)	1566	static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1475	{	1567	{
1476	struct task_struct *curr = rq->curr;	1568	struct task_struct *curr = rq->curr;
1477	struct sched_entity *se = &curr->se, *pse = &p->se;	1569	struct sched_entity *se = &curr->se, *pse = &p->se;
1478	struct cfs_rq *cfs_rq = task_cfs_rq(curr);	1570	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
		1571	int sync = wake_flags & WF_SYNC;
1479		1572
1480	update_curr(cfs_rq);	1573	update_curr(cfs_rq);
1481		1574
@@ -1501,7 +1594,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1501	*/	1594	*/
1502	if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))	1595	if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
1503	set_last_buddy(se);	1596	set_last_buddy(se);
1504	set_next_buddy(pse);	1597	if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK))
		1598	set_next_buddy(pse);
1505		1599
1506	/*	1600	/*
1507	* We can come here with TIF_NEED_RESCHED already set from new task	1601	* We can come here with TIF_NEED_RESCHED already set from new task
@@ -1523,16 +1617,25 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1523	return;	1617	return;
1524	}	1618	}
1525		1619
1526	if (!sched_feat(WAKEUP_PREEMPT))	1620	if ((sched_feat(WAKEUP_SYNC) && sync) ||
1527	return;	1621	(sched_feat(WAKEUP_OVERLAP) &&
1528		1622	(se->avg_overlap < sysctl_sched_migration_cost &&
1529	if (sched_feat(WAKEUP_OVERLAP) && (sync ||	1623	pse->avg_overlap < sysctl_sched_migration_cost))) {
1530	(se->avg_overlap < sysctl_sched_migration_cost &&
1531	pse->avg_overlap < sysctl_sched_migration_cost))) {
1532	resched_task(curr);	1624	resched_task(curr);
1533	return;	1625	return;
1534	}	1626	}
1535		1627
		1628	if (sched_feat(WAKEUP_RUNNING)) {
		1629	if (pse->avg_running < se->avg_running) {
		1630	set_next_buddy(pse);
		1631	resched_task(curr);
		1632	return;
		1633	}
		1634	}
		1635
		1636	if (!sched_feat(WAKEUP_PREEMPT))
		1637	return;
		1638
1536	find_matching_se(&se, &pse);	1639	find_matching_se(&se, &pse);
1537		1640
1538	BUG_ON(!pse);	1641	BUG_ON(!pse);
@@ -1555,8 +1658,13 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
1555	/*	1658	/*
1556	* If se was a buddy, clear it so that it will have to earn	1659	* If se was a buddy, clear it so that it will have to earn
1557	* the favour again.	1660	* the favour again.
		1661	*
		1662	* If se was not a buddy, clear the buddies because neither
		1663	* was elegible to run, let them earn it again.
		1664	*
		1665	* IOW. unconditionally clear buddies.
1558	*/	1666	*/
1559	__clear_buddies(cfs_rq, se);	1667	__clear_buddies(cfs_rq, NULL);
1560	set_next_entity(cfs_rq, se);	1668	set_next_entity(cfs_rq, se);
1561	cfs_rq = group_cfs_rq(se);	1669	cfs_rq = group_cfs_rq(se);
1562	} while (cfs_rq);	1670	} while (cfs_rq);