Diffstat (limited to 'kernel/sched/fair.c')
 -rw-r--r--  kernel/sched/fair.c  632
 1 file changed, 373 insertions(+), 259 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f77f9c527449..7f0a5e6cdae0 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -851,7 +851,7 @@ void task_numa_fault(int node, int pages, bool migrated)
 {
 	struct task_struct *p = current;
 
-	if (!sched_feat_numa(NUMA))
+	if (!numabalancing_enabled)
 		return;
 
 	/* FIXME: Allocate task-specific structure for placement policy here */
@@ -2032,6 +2032,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 	 */
 	update_entity_load_avg(curr, 1);
 	update_cfs_rq_blocked_load(cfs_rq, 1);
+	update_cfs_shares(cfs_rq);
 
 #ifdef CONFIG_SCHED_HRTICK
 	/*
@@ -3017,6 +3018,23 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 	return 0;
 }
 
+static void record_wakee(struct task_struct *p)
+{
+	/*
+	 * Rough decay (wiping) for cost saving, don't worry
+	 * about the boundary, really active task won't care
+	 * about the loss.
+	 */
+	if (jiffies > current->wakee_flip_decay_ts + HZ) {
+		current->wakee_flips = 0;
+		current->wakee_flip_decay_ts = jiffies;
+	}
+
+	if (current->last_wakee != p) {
+		current->last_wakee = p;
+		current->wakee_flips++;
+	}
+}
 
 static void task_waking_fair(struct task_struct *p)
 {
@@ -3037,6 +3055,7 @@ static void task_waking_fair(struct task_struct *p)
 #endif
 
 	se->vruntime -= min_vruntime;
+	record_wakee(p);
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -3155,6 +3174,28 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
 
 #endif
 
+static int wake_wide(struct task_struct *p)
+{
+	int factor = this_cpu_read(sd_llc_size);
+
+	/*
+	 * Yeah, it's the switching-frequency, could means many wakee or
+	 * rapidly switch, use factor here will just help to automatically
+	 * adjust the loose-degree, so bigger node will lead to more pull.
+	 */
+	if (p->wakee_flips > factor) {
+		/*
+		 * wakee is somewhat hot, it needs certain amount of cpu
+		 * resource, so if waker is far more hot, prefer to leave
+		 * it alone.
+		 */
+		if (current->wakee_flips > (factor * p->wakee_flips))
+			return 1;
+	}
+
+	return 0;
+}
+
 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 {
 	s64 this_load, load;
@@ -3164,6 +3205,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
 	unsigned long weight;
 	int balanced;
 
+	/*
+	 * If we wake multiple tasks be careful to not bounce
+	 * ourselves around too much.
+	 */
+	if (wake_wide(p))
+		return 0;
+
 	idx = sd->wake_idx;
 	this_cpu = smp_processor_id();
 	prev_cpu = task_cpu(p);
@@ -4171,47 +4219,48 @@ static void update_blocked_averages(int cpu)
 }
 
 /*
- * Compute the cpu's hierarchical load factor for each task group.
+ * Compute the hierarchical load factor for cfs_rq and all its ascendants.
  * This needs to be done in a top-down fashion because the load of a child
  * group is a fraction of its parents load.
  */
-static int tg_load_down(struct task_group *tg, void *data)
-{
-	unsigned long load;
-	long cpu = (long)data;
-
-	if (!tg->parent) {
-		load = cpu_rq(cpu)->avg.load_avg_contrib;
-	} else {
-		load = tg->parent->cfs_rq[cpu]->h_load;
-		load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib,
-				tg->parent->cfs_rq[cpu]->runnable_load_avg + 1);
-	}
-
-	tg->cfs_rq[cpu]->h_load = load;
-
-	return 0;
-}
-
-static void update_h_load(long cpu)
+static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
 {
-	struct rq *rq = cpu_rq(cpu);
+	struct rq *rq = rq_of(cfs_rq);
+	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
 	unsigned long now = jiffies;
+	unsigned long load;
 
-	if (rq->h_load_throttle == now)
+	if (cfs_rq->last_h_load_update == now)
 		return;
 
-	rq->h_load_throttle = now;
+	cfs_rq->h_load_next = NULL;
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+		cfs_rq->h_load_next = se;
+		if (cfs_rq->last_h_load_update == now)
+			break;
+	}
 
-	rcu_read_lock();
-	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
-	rcu_read_unlock();
+	if (!se) {
+		cfs_rq->h_load = rq->avg.load_avg_contrib;
+		cfs_rq->last_h_load_update = now;
+	}
+
+	while ((se = cfs_rq->h_load_next) != NULL) {
+		load = cfs_rq->h_load;
+		load = div64_ul(load * se->avg.load_avg_contrib,
+				cfs_rq->runnable_load_avg + 1);
+		cfs_rq = group_cfs_rq(se);
+		cfs_rq->h_load = load;
+		cfs_rq->last_h_load_update = now;
+	}
 }
 
 static unsigned long task_h_load(struct task_struct *p)
 {
 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
 
+	update_cfs_rq_h_load(cfs_rq);
 	return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
 			cfs_rq->runnable_load_avg + 1);
 }
@@ -4220,10 +4269,6 @@ static inline void update_blocked_averages(int cpu)
 {
 }
 
-static inline void update_h_load(long cpu)
-{
-}
-
 static unsigned long task_h_load(struct task_struct *p)
 {
 	return p->se.avg.load_avg_contrib;
@@ -4232,54 +4277,62 @@ static unsigned long task_h_load(struct task_struct *p)
 
 /********** Helpers for find_busiest_group ************************/
 /*
- * sd_lb_stats - Structure to store the statistics of a sched_domain
- *		during load balancing.
- */
-struct sd_lb_stats {
-	struct sched_group *busiest; /* Busiest group in this sd */
-	struct sched_group *this;  /* Local group in this sd */
-	unsigned long total_load;  /* Total load of all groups in sd */
-	unsigned long total_pwr;   /* Total power of all groups in sd */
-	unsigned long avg_load;	   /* Average load across all groups in sd */
-
-	/** Statistics of this group */
-	unsigned long this_load;
-	unsigned long this_load_per_task;
-	unsigned long this_nr_running;
-	unsigned long this_has_capacity;
-	unsigned int  this_idle_cpus;
-
-	/* Statistics of the busiest group */
-	unsigned int  busiest_idle_cpus;
-	unsigned long max_load;
-	unsigned long busiest_load_per_task;
-	unsigned long busiest_nr_running;
-	unsigned long busiest_group_capacity;
-	unsigned long busiest_has_capacity;
-	unsigned int  busiest_group_weight;
-
-	int group_imb; /* Is there imbalance in this sd */
-};
-
-/*
  * sg_lb_stats - stats of a sched_group required for load_balancing
  */
 struct sg_lb_stats {
 	unsigned long avg_load; /*Avg load across the CPUs of the group */
 	unsigned long group_load; /* Total load over the CPUs of the group */
-	unsigned long sum_nr_running; /* Nr tasks running in the group */
 	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
-	unsigned long group_capacity;
-	unsigned long idle_cpus;
-	unsigned long group_weight;
+	unsigned long load_per_task;
+	unsigned long group_power;
+	unsigned int sum_nr_running; /* Nr tasks running in the group */
+	unsigned int group_capacity;
+	unsigned int idle_cpus;
+	unsigned int group_weight;
 	int group_imb; /* Is there an imbalance in the group ? */
 	int group_has_capacity; /* Is there extra capacity in the group? */
 };
 
+/*
+ * sd_lb_stats - Structure to store the statistics of a sched_domain
+ *		during load balancing.
+ */
+struct sd_lb_stats {
+	struct sched_group *busiest;	/* Busiest group in this sd */
+	struct sched_group *local;	/* Local group in this sd */
+	unsigned long total_load;	/* Total load of all groups in sd */
+	unsigned long total_pwr;	/* Total power of all groups in sd */
+	unsigned long avg_load;	/* Average load across all groups in sd */
+
+	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
+	struct sg_lb_stats local_stat;	/* Statistics of the local group */
+};
+
+static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
+{
+	/*
+	 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
+	 * local_stat because update_sg_lb_stats() does a full clear/assignment.
+	 * We must however clear busiest_stat::avg_load because
+	 * update_sd_pick_busiest() reads this before assignment.
+	 */
+	*sds = (struct sd_lb_stats){
+		.busiest = NULL,
+		.local = NULL,
+		.total_load = 0UL,
+		.total_pwr = 0UL,
+		.busiest_stat = {
+			.avg_load = 0UL,
+		},
+	};
+}
+
 /**
  * get_sd_load_idx - Obtain the load index for a given sched domain.
  * @sd: The sched_domain whose load_idx is to be obtained.
  * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
+ *
+ * Return: The load index.
  */
 static inline int get_sd_load_idx(struct sched_domain *sd,
 				  enum cpu_idle_type idle)
@@ -4457,33 +4510,99 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
 	return 0;
 }
 
+/*
+ * Group imbalance indicates (and tries to solve) the problem where balancing
+ * groups is inadequate due to tsk_cpus_allowed() constraints.
+ *
+ * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
+ * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
+ * Something like:
+ *
+ *	{ 0 1 2 3 } { 4 5 6 7 }
+ *	        *     * * *
+ *
+ * If we were to balance group-wise we'd place two tasks in the first group and
+ * two tasks in the second group. Clearly this is undesired as it will overload
+ * cpu 3 and leave one of the cpus in the second group unused.
+ *
+ * The current solution to this issue is detecting the skew in the first group
+ * by noticing it has a cpu that is overloaded while the remaining cpus are
+ * idle -- or rather, there's a distinct imbalance in the cpus; see
+ * sg_imbalanced().
+ *
+ * When this is so detected; this group becomes a candidate for busiest; see
+ * update_sd_pick_busiest(). And calculcate_imbalance() and
+ * find_busiest_group() avoid some of the usual balance conditional to allow it
+ * to create an effective group imbalance.
+ *
+ * This is a somewhat tricky proposition since the next run might not find the
+ * group imbalance and decide the groups need to be balanced again. A most
+ * subtle and fragile situation.
+ */
+
+struct sg_imb_stats {
+	unsigned long max_nr_running, min_nr_running;
+	unsigned long max_cpu_load, min_cpu_load;
+};
+
+static inline void init_sg_imb_stats(struct sg_imb_stats *sgi)
+{
+	sgi->max_cpu_load = sgi->max_nr_running = 0UL;
+	sgi->min_cpu_load = sgi->min_nr_running = ~0UL;
+}
+
+static inline void
+update_sg_imb_stats(struct sg_imb_stats *sgi,
+		    unsigned long load, unsigned long nr_running)
+{
+	if (load > sgi->max_cpu_load)
+		sgi->max_cpu_load = load;
+	if (sgi->min_cpu_load > load)
+		sgi->min_cpu_load = load;
+
+	if (nr_running > sgi->max_nr_running)
+		sgi->max_nr_running = nr_running;
+	if (sgi->min_nr_running > nr_running)
+		sgi->min_nr_running = nr_running;
+}
+
+static inline int
+sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi)
+{
+	/*
+	 * Consider the group unbalanced when the imbalance is larger
+	 * than the average weight of a task.
+	 *
+	 * APZ: with cgroup the avg task weight can vary wildly and
+	 * might not be a suitable number - should we keep a
+	 * normalized nr_running number somewhere that negates
+	 * the hierarchy?
+	 */
+	if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task &&
+	    (sgi->max_nr_running - sgi->min_nr_running) > 1)
+		return 1;
+
+	return 0;
+}
+
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @env: The load balancing environment.
  * @group: sched_group whose statistics are to be updated.
  * @load_idx: Load index of sched_domain of this_cpu for load calc.
  * @local_group: Does group contain this_cpu.
- * @balance: Should we balance.
 * @sgs: variable to hold the statistics for this group.
  */
 static inline void update_sg_lb_stats(struct lb_env *env,
			struct sched_group *group, int load_idx,
-			int local_group, int *balance, struct sg_lb_stats *sgs)
+			int local_group, struct sg_lb_stats *sgs)
 {
-	unsigned long nr_running, max_nr_running, min_nr_running;
-	unsigned long load, max_cpu_load, min_cpu_load;
-	unsigned int balance_cpu = -1, first_idle_cpu = 0;
-	unsigned long avg_load_per_task = 0;
+	struct sg_imb_stats sgi;
+	unsigned long nr_running;
+	unsigned long load;
 	int i;
 
-	if (local_group)
-		balance_cpu = group_balance_cpu(group);
-
-	/* Tally up the load of all CPUs in the group */
-	max_cpu_load = 0;
-	min_cpu_load = ~0UL;
-	max_nr_running = 0;
-	min_nr_running = ~0UL;
+	init_sg_imb_stats(&sgi);
 
 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
 		struct rq *rq = cpu_rq(i);
@@ -4492,24 +4611,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 
 		/* Bias balancing toward cpus of our domain */
 		if (local_group) {
-			if (idle_cpu(i) && !first_idle_cpu &&
-			    cpumask_test_cpu(i, sched_group_mask(group))) {
-				first_idle_cpu = 1;
-				balance_cpu = i;
-			}
-
 			load = target_load(i, load_idx);
 		} else {
 			load = source_load(i, load_idx);
-			if (load > max_cpu_load)
-				max_cpu_load = load;
-			if (min_cpu_load > load)
-				min_cpu_load = load;
-
-			if (nr_running > max_nr_running)
-				max_nr_running = nr_running;
-			if (min_nr_running > nr_running)
-				min_nr_running = nr_running;
+			update_sg_imb_stats(&sgi, load, nr_running);
 		}
 
 		sgs->group_load += load;
@@ -4519,46 +4624,25 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 		sgs->idle_cpus++;
 	}
 
-	/*
-	 * First idle cpu or the first cpu(busiest) in this sched group
-	 * is eligible for doing load balancing at this and above
-	 * domains. In the newly idle case, we will allow all the cpu's
-	 * to do the newly idle load balance.
-	 */
-	if (local_group) {
-		if (env->idle != CPU_NEWLY_IDLE) {
-			if (balance_cpu != env->dst_cpu) {
-				*balance = 0;
-				return;
-			}
-			update_group_power(env->sd, env->dst_cpu);
-		} else if (time_after_eq(jiffies, group->sgp->next_update))
-			update_group_power(env->sd, env->dst_cpu);
-	}
+	if (local_group && (env->idle != CPU_NEWLY_IDLE ||
+			time_after_eq(jiffies, group->sgp->next_update)))
+		update_group_power(env->sd, env->dst_cpu);
 
 	/* Adjust by relative CPU power of the group */
-	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
+	sgs->group_power = group->sgp->power;
+	sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
 
-	/*
-	 * Consider the group unbalanced when the imbalance is larger
-	 * than the average weight of a task.
-	 *
-	 * APZ: with cgroup the avg task weight can vary wildly and
-	 * might not be a suitable number - should we keep a
-	 * normalized nr_running number somewhere that negates
-	 * the hierarchy?
-	 */
 	if (sgs->sum_nr_running)
-		avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
+		sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
+
+	sgs->group_imb = sg_imbalanced(sgs, &sgi);
 
-	if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
-	    (max_nr_running - min_nr_running) > 1)
-		sgs->group_imb = 1;
+	sgs->group_capacity =
+		DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE);
 
-	sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
-						SCHED_POWER_SCALE);
 	if (!sgs->group_capacity)
 		sgs->group_capacity = fix_small_capacity(env->sd, group);
+
 	sgs->group_weight = group->group_weight;
 
 	if (sgs->group_capacity > sgs->sum_nr_running)
@@ -4574,13 +4658,16 @@ static inline void update_sg_lb_stats(struct lb_env *env,
  *
  * Determine if @sg is a busier group than the previously selected
  * busiest group.
+ *
+ * Return: %true if @sg is a busier group than the previously selected
+ * busiest group. %false otherwise.
  */
 static bool update_sd_pick_busiest(struct lb_env *env,
				   struct sd_lb_stats *sds,
				   struct sched_group *sg,
				   struct sg_lb_stats *sgs)
 {
-	if (sgs->avg_load <= sds->max_load)
+	if (sgs->avg_load <= sds->busiest_stat.avg_load)
 		return false;
 
 	if (sgs->sum_nr_running > sgs->group_capacity)
@@ -4613,11 +4700,11 @@ static bool update_sd_pick_busiest(struct lb_env *env,
  * @sds: variable to hold the statistics for this sched_domain.
  */
 static inline void update_sd_lb_stats(struct lb_env *env,
-					int *balance, struct sd_lb_stats *sds)
+					struct sd_lb_stats *sds)
 {
 	struct sched_domain *child = env->sd->child;
 	struct sched_group *sg = env->sd->groups;
-	struct sg_lb_stats sgs;
+	struct sg_lb_stats tmp_sgs;
 	int load_idx, prefer_sibling = 0;
 
 	if (child && child->flags & SD_PREFER_SIBLING)
@@ -4626,17 +4713,17 @@ static inline void update_sd_lb_stats(struct lb_env *env,
 	load_idx = get_sd_load_idx(env->sd, env->idle);
 
 	do {
+		struct sg_lb_stats *sgs = &tmp_sgs;
 		int local_group;
 
 		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
-		memset(&sgs, 0, sizeof(sgs));
-		update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
-
-		if (local_group && !(*balance))
-			return;
+		if (local_group) {
+			sds->local = sg;
+			sgs = &sds->local_stat;
+		}
 
-		sds->total_load += sgs.group_load;
-		sds->total_pwr += sg->sgp->power;
+		memset(sgs, 0, sizeof(*sgs));
+		update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
 
 		/*
 		 * In case the child domain prefers tasks go to siblings
@@ -4648,26 +4735,17 @@ static inline void update_sd_lb_stats(struct lb_env *env,
 		 * heaviest group when it is already under-utilized (possible
 		 * with a large weight task outweighs the tasks on the system).
 		 */
-		if (prefer_sibling && !local_group && sds->this_has_capacity)
-			sgs.group_capacity = min(sgs.group_capacity, 1UL);
+		if (prefer_sibling && !local_group &&
+		    sds->local && sds->local_stat.group_has_capacity)
+			sgs->group_capacity = min(sgs->group_capacity, 1U);
 
-		if (local_group) {
-			sds->this_load = sgs.avg_load;
-			sds->this = sg;
-			sds->this_nr_running = sgs.sum_nr_running;
-			sds->this_load_per_task = sgs.sum_weighted_load;
-			sds->this_has_capacity = sgs.group_has_capacity;
-			sds->this_idle_cpus = sgs.idle_cpus;
-		} else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
-			sds->max_load = sgs.avg_load;
+		/* Now, start updating sd_lb_stats */
+		sds->total_load += sgs->group_load;
+		sds->total_pwr += sgs->group_power;
+
+		if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
 			sds->busiest = sg;
-			sds->busiest_nr_running = sgs.sum_nr_running;
-			sds->busiest_idle_cpus = sgs.idle_cpus;
-			sds->busiest_group_capacity = sgs.group_capacity;
-			sds->busiest_load_per_task = sgs.sum_weighted_load;
-			sds->busiest_has_capacity = sgs.group_has_capacity;
-			sds->busiest_group_weight = sgs.group_weight;
-			sds->group_imb = sgs.group_imb;
+			sds->busiest_stat = *sgs;
 		}
 
 		sg = sg->next;
@@ -4691,7 +4769,7 @@ static inline void update_sd_lb_stats(struct lb_env *env,
  * assuming lower CPU number will be equivalent to lower a SMT thread
  * number.
  *
- * Returns 1 when packing is required and a task should be moved to
+ * Return: 1 when packing is required and a task should be moved to
  * this CPU. The amount of the imbalance is returned in *imbalance.
  *
  * @env: The load balancing environment.
@@ -4712,7 +4790,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
 		return 0;
 
 	env->imbalance = DIV_ROUND_CLOSEST(
-		sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
+		sds->busiest_stat.avg_load * sds->busiest_stat.group_power,
+		SCHED_POWER_SCALE);
 
 	return 1;
 }
@@ -4730,24 +4809,23 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
 	unsigned long tmp, pwr_now = 0, pwr_move = 0;
 	unsigned int imbn = 2;
 	unsigned long scaled_busy_load_per_task;
+	struct sg_lb_stats *local, *busiest;
 
-	if (sds->this_nr_running) {
-		sds->this_load_per_task /= sds->this_nr_running;
-		if (sds->busiest_load_per_task >
-				sds->this_load_per_task)
-			imbn = 1;
-	} else {
-		sds->this_load_per_task =
-			cpu_avg_load_per_task(env->dst_cpu);
-	}
+	local = &sds->local_stat;
+	busiest = &sds->busiest_stat;
+
+	if (!local->sum_nr_running)
+		local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
+	else if (busiest->load_per_task > local->load_per_task)
+		imbn = 1;
 
-	scaled_busy_load_per_task = sds->busiest_load_per_task
-					 * SCHED_POWER_SCALE;
-	scaled_busy_load_per_task /= sds->busiest->sgp->power;
+	scaled_busy_load_per_task =
+		(busiest->load_per_task * SCHED_POWER_SCALE) /
+		busiest->group_power;
 
-	if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
+	if (busiest->avg_load - local->avg_load + scaled_busy_load_per_task >=
 			(scaled_busy_load_per_task * imbn)) {
-		env->imbalance = sds->busiest_load_per_task;
+		env->imbalance = busiest->load_per_task;
 		return;
 	}
 
@@ -4757,34 +4835,37 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
 	 * moving them.
 	 */
 
-	pwr_now += sds->busiest->sgp->power *
-			min(sds->busiest_load_per_task, sds->max_load);
-	pwr_now += sds->this->sgp->power *
-			min(sds->this_load_per_task, sds->this_load);
+	pwr_now += busiest->group_power *
+			min(busiest->load_per_task, busiest->avg_load);
+	pwr_now += local->group_power *
+			min(local->load_per_task, local->avg_load);
 	pwr_now /= SCHED_POWER_SCALE;
 
 	/* Amount of load we'd subtract */
-	tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
-		sds->busiest->sgp->power;
-	if (sds->max_load > tmp)
-		pwr_move += sds->busiest->sgp->power *
-			min(sds->busiest_load_per_task, sds->max_load - tmp);
+	tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
+		busiest->group_power;
+	if (busiest->avg_load > tmp) {
+		pwr_move += busiest->group_power *
+			    min(busiest->load_per_task,
+				busiest->avg_load - tmp);
+	}
 
 	/* Amount of load we'd add */
-	if (sds->max_load * sds->busiest->sgp->power <
-		sds->busiest_load_per_task * SCHED_POWER_SCALE)
-		tmp = (sds->max_load * sds->busiest->sgp->power) /
-			sds->this->sgp->power;
-	else
-		tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
-			sds->this->sgp->power;
-	pwr_move += sds->this->sgp->power *
-		min(sds->this_load_per_task, sds->this_load + tmp);
+	if (busiest->avg_load * busiest->group_power <
+	    busiest->load_per_task * SCHED_POWER_SCALE) {
+		tmp = (busiest->avg_load * busiest->group_power) /
+			local->group_power;
+	} else {
+		tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
+			local->group_power;
+	}
+	pwr_move += local->group_power *
+		    min(local->load_per_task, local->avg_load + tmp);
 	pwr_move /= SCHED_POWER_SCALE;
 
 	/* Move if we gain throughput */
 	if (pwr_move > pwr_now)
-		env->imbalance = sds->busiest_load_per_task;
+		env->imbalance = busiest->load_per_task;
 }
 
 /**
@@ -4796,11 +4877,18 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
 {
 	unsigned long max_pull, load_above_capacity = ~0UL;
+	struct sg_lb_stats *local, *busiest;
 
-	sds->busiest_load_per_task /= sds->busiest_nr_running;
-	if (sds->group_imb) {
-		sds->busiest_load_per_task =
-			min(sds->busiest_load_per_task, sds->avg_load);
+	local = &sds->local_stat;
+	busiest = &sds->busiest_stat;
+
+	if (busiest->group_imb) {
+		/*
+		 * In the group_imb case we cannot rely on group-wide averages
+		 * to ensure cpu-load equilibrium, look at wider averages. XXX
+		 */
+		busiest->load_per_task =
+			min(busiest->load_per_task, sds->avg_load);
 	}
 
 	/*
@@ -4808,21 +4896,22 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	 * max load less than avg load(as we skip the groups at or below
 	 * its cpu_power, while calculating max_load..)
 	 */
-	if (sds->max_load < sds->avg_load) {
+	if (busiest->avg_load < sds->avg_load) {
 		env->imbalance = 0;
 		return fix_small_imbalance(env, sds);
 	}
 
-	if (!sds->group_imb) {
+	if (!busiest->group_imb) {
 		/*
 		 * Don't want to pull so many tasks that a group would go idle.
+		 * Except of course for the group_imb case, since then we might
+		 * have to drop below capacity to reach cpu-load equilibrium.
 		 */
-		load_above_capacity = (sds->busiest_nr_running -
-						sds->busiest_group_capacity);
+		load_above_capacity =
+			(busiest->sum_nr_running - busiest->group_capacity);
 
 		load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
-
-		load_above_capacity /= sds->busiest->sgp->power;
+		load_above_capacity /= busiest->group_power;
 	}
 
 	/*
@@ -4832,15 +4921,14 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	 * we also don't want to reduce the group load below the group capacity
 	 * (so that we can implement power-savings policies etc). Thus we look
 	 * for the minimum possible imbalance.
-	 * Be careful of negative numbers as they'll appear as very large values
-	 * with unsigned longs.
 	 */
-	max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
+	max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
 
 	/* How much load to actually move to equalise the imbalance */
-	env->imbalance = min(max_pull * sds->busiest->sgp->power,
-		(sds->avg_load - sds->this_load) * sds->this->sgp->power)
-		/ SCHED_POWER_SCALE;
+	env->imbalance = min(
+		max_pull * busiest->group_power,
+		(sds->avg_load - local->avg_load) * local->group_power
+	) / SCHED_POWER_SCALE;
 
 	/*
 	 * if *imbalance is less than the average load per runnable task
@@ -4848,9 +4936,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	 * a think about bumping its value to force at least one task to be
 	 * moved
 	 */
-	if (env->imbalance < sds->busiest_load_per_task)
+	if (env->imbalance < busiest->load_per_task)
 		return fix_small_imbalance(env, sds);
-
 }
 
 /******* find_busiest_group() helpers end here *********************/
@@ -4866,69 +4953,62 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
  * to restore balance.
  *
  * @env: The load balancing environment.
- * @balance: Pointer to a variable indicating if this_cpu
- *	is the appropriate cpu to perform load balancing at this_level.
  *
- * Returns:	- the busiest group if imbalance exists.
+ * Return:	- The busiest group if imbalance exists.
  *		- If no imbalance and user has opted for power-savings balance,
  *		   return the least loaded group whose CPUs can be
  *		   put to idle by rebalancing its tasks onto our group.
  */
-static struct sched_group *
-find_busiest_group(struct lb_env *env, int *balance)
+static struct sched_group *find_busiest_group(struct lb_env *env)
 {
+	struct sg_lb_stats *local, *busiest;
 	struct sd_lb_stats sds;
 
-	memset(&sds, 0, sizeof(sds));
+	init_sd_lb_stats(&sds);
 
 	/*
 	 * Compute the various statistics relavent for load balancing at
 	 * this level.
 	 */
-	update_sd_lb_stats(env, balance, &sds);
-
-	/*
-	 * this_cpu is not the appropriate cpu to perform load balancing at
-	 * this level.
-	 */
-	if (!(*balance))
-		goto ret;
+	update_sd_lb_stats(env, &sds);
+	local = &sds.local_stat;
+	busiest = &sds.busiest_stat;
 
 	if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
 	    check_asym_packing(env, &sds))
 		return sds.busiest;
 
 	/* There is no busy sibling group to pull tasks from */
-	if (!sds.busiest || sds.busiest_nr_running == 0)
+	if (!sds.busiest || busiest->sum_nr_running == 0)
 		goto out_balanced;
 
 	sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
 
 	/*
 	 * If the busiest group is imbalanced the below checks don't
-	 * work because they assumes all things are equal, which typically
+	 * work because they assume all things are equal, which typically
 	 * isn't true due to cpus_allowed constraints and the like.
 	 */
-	if (sds.group_imb)
+	if (busiest->group_imb)
 		goto force_balance;
 
 	/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
-	if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
-	    !sds.busiest_has_capacity)
+	if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity &&
+	    !busiest->group_has_capacity)
 		goto force_balance;
 
 	/*
 	 * If the local group is more busy than the selected busiest group
 	 * don't try and pull any tasks.
 	 */
-	if (sds.this_load >= sds.max_load)
+	if (local->avg_load >= busiest->avg_load)
 		goto out_balanced;
 
 	/*
 	 * Don't pull any tasks if this group is already above the domain
 	 * average load.
 	 */
-	if (sds.this_load >= sds.avg_load)
+	if (local->avg_load >= sds.avg_load)
 		goto out_balanced;
 
 	if (env->idle == CPU_IDLE) {
@@ -4938,15 +5018,16 @@ find_busiest_group(struct lb_env *env, int *balance)
 		 * there is no imbalance between this and busiest group
 		 * wrt to idle cpu's, it is balanced.
 		 */
-		if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
-		    sds.busiest_nr_running <= sds.busiest_group_weight)
+		if ((local->idle_cpus < busiest->idle_cpus) &&
+		    busiest->sum_nr_running <= busiest->group_weight)
 			goto out_balanced;
 	} else {
 		/*
 		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
 		 * imbalance_pct to be conservative.
 		 */
-		if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
+		if (100 * busiest->avg_load <=
+				env->sd->imbalance_pct * local->avg_load)
 			goto out_balanced;
 	}
 
@@ -4956,7 +5037,6 @@ force_balance:
 	return sds.busiest;
 
 out_balanced:
-ret:
 	env->imbalance = 0;
 	return NULL;
 }
@@ -4968,10 +5048,10 @@ static struct rq *find_busiest_queue(struct lb_env *env,
				     struct sched_group *group)
 {
 	struct rq *busiest = NULL, *rq;
-	unsigned long max_load = 0;
+	unsigned long busiest_load = 0, busiest_power = 1;
 	int i;
 
-	for_each_cpu(i, sched_group_cpus(group)) {
+	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
 		unsigned long power = power_of(i);
 		unsigned long capacity = DIV_ROUND_CLOSEST(power,
							   SCHED_POWER_SCALE);
@@ -4980,9 +5060,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 		if (!capacity)
 			capacity = fix_small_capacity(env->sd, group);
 
-		if (!cpumask_test_cpu(i, env->cpus))
-			continue;
-
 		rq = cpu_rq(i);
 		wl = weighted_cpuload(i);
 
@@ -4998,11 +5075,15 @@ static struct rq *find_busiest_queue(struct lb_env *env,
 		 * the weighted_cpuload() scaled with the cpu power, so that
 		 * the load can be moved away from the cpu that is potentially
 		 * running at a lower capacity.
+		 *
+		 * Thus we're looking for max(wl_i / power_i), crosswise
+		 * multiplication to rid ourselves of the division works out
+		 * to: wl_i * power_j > wl_j * power_i;  where j is our
+		 * previous maximum.
 		 */
-		wl = (wl * SCHED_POWER_SCALE) / power;
-
-		if (wl > max_load) {
-			max_load = wl;
+		if (wl * busiest_power > busiest_load * power) {
+			busiest_load = wl;
+			busiest_power = power;
 			busiest = rq;
 		}
 	}
@@ -5039,13 +5120,47 @@ static int need_active_balance(struct lb_env *env)
 
 static int active_load_balance_cpu_stop(void *data);
 
+static int should_we_balance(struct lb_env *env)
+{
+	struct sched_group *sg = env->sd->groups;
+	struct cpumask *sg_cpus, *sg_mask;
+	int cpu, balance_cpu = -1;
+
+	/*
+	 * In the newly idle case, we will allow all the cpu's
+	 * to do the newly idle load balance.
+	 */
+	if (env->idle == CPU_NEWLY_IDLE)
+		return 1;
+
+	sg_cpus = sched_group_cpus(sg);
+	sg_mask = sched_group_mask(sg);
+	/* Try to find first idle cpu */
+	for_each_cpu_and(cpu, sg_cpus, env->cpus) {
+		if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
+			continue;
+
+		balance_cpu = cpu;
+		break;
+	}
+
+	if (balance_cpu == -1)
+		balance_cpu = group_balance_cpu(sg);
+
+	/*
+	 * First idle cpu or the first cpu(busiest) in this sched group
+	 * is eligible for doing load balancing at this and above domains.
+	 */
+	return balance_cpu != env->dst_cpu;
+}
+
 /*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 */
 static int load_balance(int this_cpu, struct rq *this_rq,
			struct sched_domain *sd, enum cpu_idle_type idle,
-			int *balance)
+			int *continue_balancing)
 {
 	int ld_moved, cur_ld_moved, active_balance = 0;
 	struct sched_group *group;
@@ -5075,11 +5190,12 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	schedstat_inc(sd, lb_count[idle]);
 
 redo:
-	group = find_busiest_group(&env, balance);
-
-	if (*balance == 0)
+	if (!should_we_balance(&env)) {
+		*continue_balancing = 0;
 		goto out_balanced;
+	}
 
+	group = find_busiest_group(&env);
 	if (!group) {
 		schedstat_inc(sd, lb_nobusyg[idle]);
 		goto out_balanced;
@@ -5108,7 +5224,6 @@ redo:
 	env.src_rq = busiest;
 	env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
 
-	update_h_load(env.src_cpu);
 more_balance:
 	local_irq_save(flags);
 	double_rq_lock(env.dst_rq, busiest);
@@ -5292,7 +5407,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)
 	rcu_read_lock();
 	for_each_domain(this_cpu, sd) {
 		unsigned long interval;
-		int balance = 1;
+		int continue_balancing = 1;
 
 		if (!(sd->flags & SD_LOAD_BALANCE))
 			continue;
@@ -5300,7 +5415,8 @@ void idle_balance(int this_cpu, struct rq *this_rq)
 		if (sd->flags & SD_BALANCE_NEWIDLE) {
 			/* If we've pulled tasks over stop searching: */
 			pulled_task = load_balance(this_cpu, this_rq,
-						   sd, CPU_NEWLY_IDLE, &balance);
+						   sd, CPU_NEWLY_IDLE,
+						   &continue_balancing);
 		}
 
 		interval = msecs_to_jiffies(sd->balance_interval);
@@ -5506,7 +5622,7 @@ void nohz_balance_enter_idle(int cpu)
 	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
 }
 
-static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
+static int sched_ilb_notifier(struct notifier_block *nfb,
					unsigned long action, void *hcpu)
 {
 	switch (action & ~CPU_TASKS_FROZEN) {
@@ -5538,7 +5654,7 @@ void update_max_interval(void)
 */
 static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 {
-	int balance = 1;
+	int continue_balancing = 1;
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long interval;
 	struct sched_domain *sd;
@@ -5570,7 +5686,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 		}
 
 		if (time_after_eq(jiffies, sd->last_balance + interval)) {
-			if (load_balance(cpu, rq, sd, idle, &balance)) {
+			if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
 				/*
 				 * The LBF_SOME_PINNED logic could have changed
 				 * env->dst_cpu, so we can't know our idle
@@ -5593,7 +5709,7 @@ out:
 		 * CPU in our sched group which is doing load balancing more
 		 * actively.
 		 */
-		if (!balance)
+		if (!continue_balancing)
			break;
 	}
 	rcu_read_unlock();
@@ -5786,7 +5902,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 		entity_tick(cfs_rq, se, queued);
 	}
 
-	if (sched_feat_numa(NUMA))
+	if (numabalancing_enabled)
 		task_tick_numa(rq, curr);
 
 	update_rq_runnable_avg(rq, 1);
@@ -5889,11 +6005,9 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
 	 * and ensure we don't carry in an old decay_count if we
 	 * switch back.
 	 */
-	if (p->se.avg.decay_count) {
-		struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
-		__synchronize_entity_decay(&p->se);
-		subtract_blocked_load_contrib(cfs_rq,
-				p->se.avg.load_avg_contrib);
+	if (se->avg.decay_count) {
+		__synchronize_entity_decay(se);
+		subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
 	}
 #endif
 }