author		Ingo Molnar <mingo@elte.hu>	2009-09-19 05:27:32 -0400
committer	Ingo Molnar <mingo@elte.hu>	2009-09-19 05:28:41 -0400
commit		929bf0d0156562ce631728b6fa53d68004d456d2 (patch)
tree		739063990a8077b29ef97e69d73bce94573daae4 /kernel/sched_fair.c
parent		def0a9b2573e00ab0b486cb5382625203ab4c4a6 (diff)
parent		202c4675c55ddf6b443c7e057d2dff6b42ef71aa (diff)
Merge branch 'linus' into perfcounters/core
Merge reason: Bring in tracing changes we depend on.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel/sched_fair.c')
-rw-r--r--	kernel/sched_fair.c	414
1 file changed, 261 insertions(+), 153 deletions(-)
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a097e909e80f..990b188803ce 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -712,7 +712,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 
 	if (!initial) {
 		/* sleeps upto a single latency don't count. */
-		if (sched_feat(NEW_FAIR_SLEEPERS)) {
+		if (sched_feat(FAIR_SLEEPERS)) {
 			unsigned long thresh = sysctl_sched_latency;
 
 			/*
@@ -726,6 +726,13 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 					 task_of(se)->policy != SCHED_IDLE))
 				thresh = calc_delta_fair(thresh, se);
 
+			/*
+			 * Halve their sleep time's effect, to allow
+			 * for a gentler effect of sleepers:
+			 */
+			if (sched_feat(GENTLE_FAIR_SLEEPERS))
+				thresh >>= 1;
+
 			vruntime -= thresh;
 		}
 	}
@@ -758,10 +765,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
 
 static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-	if (cfs_rq->last == se)
+	if (!se || cfs_rq->last == se)
 		cfs_rq->last = NULL;
 
-	if (cfs_rq->next == se)
+	if (!se || cfs_rq->next == se)
 		cfs_rq->next = NULL;
 }
 
@@ -1063,83 +1070,6 @@ static void yield_task_fair(struct rq *rq)
 	se->vruntime = rightmost->vruntime + 1;
 }
 
-/*
- * wake_idle() will wake a task on an idle cpu if task->cpu is
- * not idle and an idle cpu is available. The span of cpus to
- * search starts with cpus closest then further out as needed,
- * so we always favor a closer, idle cpu.
- * Domains may include CPUs that are not usable for migration,
- * hence we need to mask them out (rq->rd->online)
- *
- * Returns the CPU we should wake onto.
- */
-#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
-
-#define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online)
-
-static int wake_idle(int cpu, struct task_struct *p)
-{
-	struct sched_domain *sd;
-	int i;
-	unsigned int chosen_wakeup_cpu;
-	int this_cpu;
-	struct rq *task_rq = task_rq(p);
-
-	/*
-	 * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu
-	 * are idle and this is not a kernel thread and this task's affinity
-	 * allows it to be moved to preferred cpu, then just move!
-	 */
-
-	this_cpu = smp_processor_id();
-	chosen_wakeup_cpu =
-		cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu;
-
-	if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP &&
-	    idle_cpu(cpu) && idle_cpu(this_cpu) &&
-	    p->mm && !(p->flags & PF_KTHREAD) &&
-	    cpu_isset(chosen_wakeup_cpu, p->cpus_allowed))
-		return chosen_wakeup_cpu;
-
-	/*
-	 * If it is idle, then it is the best cpu to run this task.
-	 *
-	 * This cpu is also the best, if it has more than one task already.
-	 * Siblings must be also busy(in most cases) as they didn't already
-	 * pickup the extra load from this cpu and hence we need not check
-	 * sibling runqueue info. This will avoid the checks and cache miss
-	 * penalities associated with that.
-	 */
-	if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1)
-		return cpu;
-
-	for_each_domain(cpu, sd) {
-		if ((sd->flags & SD_WAKE_IDLE)
-		    || ((sd->flags & SD_WAKE_IDLE_FAR)
-			&& !task_hot(p, task_rq->clock, sd))) {
-			for_each_cpu_and(i, sched_domain_span(sd),
-					 &p->cpus_allowed) {
-				if (cpu_rd_active(i, task_rq) && idle_cpu(i)) {
-					if (i != task_cpu(p)) {
-						schedstat_inc(p,
-							      se.nr_wakeups_idle);
-					}
-					return i;
-				}
-			}
-		} else {
-			break;
-		}
-	}
-	return cpu;
-}
-#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/
-static inline int wake_idle(int cpu, struct task_struct *p)
-{
-	return cpu;
-}
-#endif
-
 #ifdef CONFIG_SMP
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1226,25 +1156,34 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
 
 #endif
 
-static int
-wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
-	    struct task_struct *p, int prev_cpu, int this_cpu, int sync,
-	    int idx, unsigned long load, unsigned long this_load,
-	    unsigned int imbalance)
+static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 {
-	struct task_struct *curr = this_rq->curr;
-	struct task_group *tg;
-	unsigned long tl = this_load;
+	struct task_struct *curr = current;
+	unsigned long this_load, load;
+	int idx, this_cpu, prev_cpu;
 	unsigned long tl_per_task;
+	unsigned int imbalance;
+	struct task_group *tg;
 	unsigned long weight;
 	int balanced;
 
-	if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
-		return 0;
+	idx = sd->wake_idx;
+	this_cpu = smp_processor_id();
+	prev_cpu = task_cpu(p);
+	load = source_load(prev_cpu, idx);
+	this_load = target_load(this_cpu, idx);
 
-	if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
-			p->se.avg_overlap > sysctl_sched_migration_cost))
-		sync = 0;
+	if (sync) {
+		if (sched_feat(SYNC_LESS) &&
+		    (curr->se.avg_overlap > sysctl_sched_migration_cost ||
+		     p->se.avg_overlap > sysctl_sched_migration_cost))
+			sync = 0;
+	} else {
+		if (sched_feat(SYNC_MORE) &&
+		    (curr->se.avg_overlap < sysctl_sched_migration_cost &&
+		     p->se.avg_overlap < sysctl_sched_migration_cost))
+			sync = 1;
+	}
 
 	/*
 	 * If sync wakeup then subtract the (maximum possible)
@@ -1255,24 +1194,26 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
 		tg = task_group(current);
 		weight = current->se.load.weight;
 
-		tl += effective_load(tg, this_cpu, -weight, -weight);
+		this_load += effective_load(tg, this_cpu, -weight, -weight);
 		load += effective_load(tg, prev_cpu, 0, -weight);
 	}
 
 	tg = task_group(p);
 	weight = p->se.load.weight;
 
+	imbalance = 100 + (sd->imbalance_pct - 100) / 2;
+
 	/*
 	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
-	 * due to the sync cause above having dropped tl to 0, we'll always have
-	 * an imbalance, but there's really nothing you can do about that, so
-	 * that's good too.
+	 * due to the sync cause above having dropped this_load to 0, we'll
+	 * always have an imbalance, but there's really nothing you can do
+	 * about that, so that's good too.
 	 *
 	 * Otherwise check if either cpus are near enough in load to allow this
 	 * task to be woken on this_cpu.
 	 */
-	balanced = !tl ||
-		100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
+	balanced = !this_load ||
+		100*(this_load + effective_load(tg, this_cpu, weight, weight)) <=
 		imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
 
 	/*
@@ -1286,14 +1227,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
 	schedstat_inc(p, se.nr_wakeups_affine_attempts);
 	tl_per_task = cpu_avg_load_per_task(this_cpu);
 
-	if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <=
-			tl_per_task)) {
+	if (balanced ||
+	    (this_load <= load &&
+	     this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
 		/*
 		 * This domain has SD_WAKE_AFFINE and
 		 * p is cache cold in this domain, and
 		 * there is no bad imbalance.
 		 */
-		schedstat_inc(this_sd, ttwu_move_affine);
+		schedstat_inc(sd, ttwu_move_affine);
 		schedstat_inc(p, se.nr_wakeups_affine);
 
 		return 1;
@@ -1301,65 +1243,215 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
 	return 0;
 }
 
-static int select_task_rq_fair(struct task_struct *p, int sync)
+/*
+ * find_idlest_group finds and returns the least busy CPU group within the
+ * domain.
+ */
+static struct sched_group *
+find_idlest_group(struct sched_domain *sd, struct task_struct *p,
+		  int this_cpu, int load_idx)
 {
-	struct sched_domain *sd, *this_sd = NULL;
-	int prev_cpu, this_cpu, new_cpu;
-	unsigned long load, this_load;
-	struct rq *this_rq;
-	unsigned int imbalance;
-	int idx;
+	struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
+	unsigned long min_load = ULONG_MAX, this_load = 0;
+	int imbalance = 100 + (sd->imbalance_pct-100)/2;
 
-	prev_cpu = task_cpu(p);
-	this_cpu = smp_processor_id();
-	this_rq = cpu_rq(this_cpu);
-	new_cpu = prev_cpu;
+	do {
+		unsigned long load, avg_load;
+		int local_group;
+		int i;
 
-	/*
-	 * 'this_sd' is the first domain that both
-	 * this_cpu and prev_cpu are present in:
-	 */
-	for_each_domain(this_cpu, sd) {
-		if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) {
-			this_sd = sd;
-			break;
+		/* Skip over this group if it has no CPUs allowed */
+		if (!cpumask_intersects(sched_group_cpus(group),
+					&p->cpus_allowed))
+			continue;
+
+		local_group = cpumask_test_cpu(this_cpu,
+					       sched_group_cpus(group));
+
+		/* Tally up the load of all CPUs in the group */
+		avg_load = 0;
+
+		for_each_cpu(i, sched_group_cpus(group)) {
+			/* Bias balancing toward cpus of our domain */
+			if (local_group)
+				load = source_load(i, load_idx);
+			else
+				load = target_load(i, load_idx);
+
+			avg_load += load;
+		}
+
+		/* Adjust by relative CPU power of the group */
+		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+
+		if (local_group) {
+			this_load = avg_load;
+			this = group;
+		} else if (avg_load < min_load) {
+			min_load = avg_load;
+			idlest = group;
+		}
+	} while (group = group->next, group != sd->groups);
+
+	if (!idlest || 100*this_load < imbalance*min_load)
+		return NULL;
+	return idlest;
+}
+
+/*
+ * find_idlest_cpu - find the idlest cpu among the cpus in group.
+ */
+static int
+find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+{
+	unsigned long load, min_load = ULONG_MAX;
+	int idlest = -1;
+	int i;
+
+	/* Traverse only the allowed CPUs */
+	for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) {
+		load = weighted_cpuload(i);
+
+		if (load < min_load || (load == min_load && i == this_cpu)) {
+			min_load = load;
+			idlest = i;
 		}
 	}
 
-	if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed)))
-		goto out;
+	return idlest;
+}
 
-	/*
-	 * Check for affine wakeup and passive balancing possibilities.
-	 */
-	if (!this_sd)
+/*
+ * sched_balance_self: balance the current task (running on cpu) in domains
+ * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
+ * SD_BALANCE_EXEC.
+ *
+ * Balance, ie. select the least loaded group.
+ *
+ * Returns the target CPU number, or the same CPU if no balancing is needed.
+ *
+ * preempt must be disabled.
+ */
+static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
+{
+	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
+	int cpu = smp_processor_id();
+	int prev_cpu = task_cpu(p);
+	int new_cpu = cpu;
+	int want_affine = 0;
+	int want_sd = 1;
+	int sync = wake_flags & WF_SYNC;
+
+	if (sd_flag & SD_BALANCE_WAKE) {
+		if (sched_feat(AFFINE_WAKEUPS))
+			want_affine = 1;
+		new_cpu = prev_cpu;
+	}
+
+	rcu_read_lock();
+	for_each_domain(cpu, tmp) {
+		/*
+		 * If power savings logic is enabled for a domain, see if we
+		 * are not overloaded, if so, don't balance wider.
+		 */
+		if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) {
+			unsigned long power = 0;
+			unsigned long nr_running = 0;
+			unsigned long capacity;
+			int i;
+
+			for_each_cpu(i, sched_domain_span(tmp)) {
+				power += power_of(i);
+				nr_running += cpu_rq(i)->cfs.nr_running;
+			}
+
+			capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
+
+			if (tmp->flags & SD_POWERSAVINGS_BALANCE)
+				nr_running /= 2;
+
+			if (nr_running < capacity)
+				want_sd = 0;
+		}
+
+		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
+		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
+
+			affine_sd = tmp;
+			want_affine = 0;
+		}
+
+		if (!want_sd && !want_affine)
+			break;
+
+		if (!(tmp->flags & sd_flag))
+			continue;
+
+		if (want_sd)
+			sd = tmp;
+	}
+
+	if (sched_feat(LB_SHARES_UPDATE)) {
+		/*
+		 * Pick the largest domain to update shares over
+		 */
+		tmp = sd;
+		if (affine_sd && (!tmp ||
+				  cpumask_weight(sched_domain_span(affine_sd)) >
+				  cpumask_weight(sched_domain_span(sd))))
+			tmp = affine_sd;
+
+		if (tmp)
+			update_shares(tmp);
+	}
+
+	if (affine_sd && wake_affine(affine_sd, p, sync)) {
+		new_cpu = cpu;
 		goto out;
+	}
 
-	idx = this_sd->wake_idx;
+	while (sd) {
+		int load_idx = sd->forkexec_idx;
+		struct sched_group *group;
+		int weight;
 
-	imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
+		if (!(sd->flags & sd_flag)) {
+			sd = sd->child;
+			continue;
+		}
 
-	load = source_load(prev_cpu, idx);
-	this_load = target_load(this_cpu, idx);
+		if (sd_flag & SD_BALANCE_WAKE)
+			load_idx = sd->wake_idx;
 
-	if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
-			load, this_load, imbalance))
-		return this_cpu;
+		group = find_idlest_group(sd, p, cpu, load_idx);
+		if (!group) {
+			sd = sd->child;
+			continue;
+		}
 
-	/*
-	 * Start passive balancing when half the imbalance_pct
-	 * limit is reached.
-	 */
-	if (this_sd->flags & SD_WAKE_BALANCE) {
-		if (imbalance*this_load <= 100*load) {
-			schedstat_inc(this_sd, ttwu_move_balance);
-			schedstat_inc(p, se.nr_wakeups_passive);
-			return this_cpu;
+		new_cpu = find_idlest_cpu(group, p, cpu);
+		if (new_cpu == -1 || new_cpu == cpu) {
+			/* Now try balancing at a lower domain level of cpu */
+			sd = sd->child;
+			continue;
 		}
+
+		/* Now try balancing at a lower domain level of new_cpu */
+		cpu = new_cpu;
+		weight = cpumask_weight(sched_domain_span(sd));
+		sd = NULL;
+		for_each_domain(cpu, tmp) {
+			if (weight <= cpumask_weight(sched_domain_span(tmp)))
+				break;
+			if (tmp->flags & sd_flag)
+				sd = tmp;
+		}
+		/* while loop will break here if sd == NULL */
 	}
 
 out:
-	return wake_idle(new_cpu, p);
+	rcu_read_unlock();
+	return new_cpu;
 }
 #endif /* CONFIG_SMP */
 
@@ -1472,11 +1564,12 @@ static void set_next_buddy(struct sched_entity *se)
 /*
  * Preempt the current task with a newly woken task if needed:
  */
-static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
+static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 {
 	struct task_struct *curr = rq->curr;
 	struct sched_entity *se = &curr->se, *pse = &p->se;
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+	int sync = wake_flags & WF_SYNC;
 
 	update_curr(cfs_rq);
 
@@ -1502,7 +1595,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 	 */
 	if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
 		set_last_buddy(se);
-	set_next_buddy(pse);
+	if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK))
+		set_next_buddy(pse);
 
 	/*
 	 * We can come here with TIF_NEED_RESCHED already set from new task
@@ -1524,16 +1618,25 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 		return;
 	}
 
-	if (!sched_feat(WAKEUP_PREEMPT))
-		return;
-
-	if (sched_feat(WAKEUP_OVERLAP) && (sync ||
-			(se->avg_overlap < sysctl_sched_migration_cost &&
-			 pse->avg_overlap < sysctl_sched_migration_cost))) {
+	if ((sched_feat(WAKEUP_SYNC) && sync) ||
+	    (sched_feat(WAKEUP_OVERLAP) &&
+	     (se->avg_overlap < sysctl_sched_migration_cost &&
+	      pse->avg_overlap < sysctl_sched_migration_cost))) {
 		resched_task(curr);
 		return;
 	}
 
+	if (sched_feat(WAKEUP_RUNNING)) {
+		if (pse->avg_running < se->avg_running) {
+			set_next_buddy(pse);
+			resched_task(curr);
+			return;
+		}
+	}
+
+	if (!sched_feat(WAKEUP_PREEMPT))
+		return;
+
 	find_matching_se(&se, &pse);
 
 	BUG_ON(!pse);
@@ -1556,8 +1659,13 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
 		/*
 		 * If se was a buddy, clear it so that it will have to earn
 		 * the favour again.
+		 *
+		 * If se was not a buddy, clear the buddies because neither
+		 * was elegible to run, let them earn it again.
+		 *
+		 * IOW. unconditionally clear buddies.
 		 */
-		__clear_buddies(cfs_rq, se);
+		__clear_buddies(cfs_rq, NULL);
 		set_next_entity(cfs_rq, se);
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);