Diffstat (limited to 'kernel/sched')

 kernel/sched/Makefile    |   2
 kernel/sched/core.c      | 667
 kernel/sched/debug.c     |  12
 kernel/sched/fair.c      | 543
 kernel/sched/features.h  |   1
 kernel/sched/idle_task.c |   2
 kernel/sched/rt.c        | 107
 kernel/sched/sched.h     |  10
 8 files changed, 635 insertions(+), 709 deletions(-)
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 9a7dd35102a3..173ea52f3af0 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -16,5 +16,3 @@ obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o 16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o 17obj-$(CONFIG_SCHEDSTATS) += stats.o
18obj-$(CONFIG_SCHED_DEBUG) += debug.o 18obj-$(CONFIG_SCHED_DEBUG) += debug.o
19
20
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 4603b9d8f30a..d5594a4268d4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -83,6 +83,7 @@
83 83
84#include "sched.h" 84#include "sched.h"
85#include "../workqueue_sched.h" 85#include "../workqueue_sched.h"
86#include "../smpboot.h"
86 87
87#define CREATE_TRACE_POINTS 88#define CREATE_TRACE_POINTS
88#include <trace/events/sched.h> 89#include <trace/events/sched.h>
@@ -141,9 +142,8 @@ const_debug unsigned int sysctl_sched_features =
141#define SCHED_FEAT(name, enabled) \ 142#define SCHED_FEAT(name, enabled) \
142 #name , 143 #name ,
143 144
144static __read_mostly char *sched_feat_names[] = { 145static const char * const sched_feat_names[] = {
145#include "features.h" 146#include "features.h"
146 NULL
147}; 147};
148 148
149#undef SCHED_FEAT 149#undef SCHED_FEAT
@@ -692,8 +692,6 @@ int tg_nop(struct task_group *tg, void *data)
692} 692}
693#endif 693#endif
694 694
695void update_cpu_load(struct rq *this_rq);
696
697static void set_load_weight(struct task_struct *p) 695static void set_load_weight(struct task_struct *p)
698{ 696{
699 int prio = p->static_prio - MAX_RT_PRIO; 697 int prio = p->static_prio - MAX_RT_PRIO;
@@ -2083,6 +2081,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2083#endif 2081#endif
2084 2082
2085 /* Here we just switch the register state and the stack. */ 2083 /* Here we just switch the register state and the stack. */
2084 rcu_switch_from(prev);
2086 switch_to(prev, next, prev); 2085 switch_to(prev, next, prev);
2087 2086
2088 barrier(); 2087 barrier();
@@ -2486,22 +2485,13 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
2486 * scheduler tick (TICK_NSEC). With tickless idle this will not be called 2485 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
2487 * every tick. We fix it up based on jiffies. 2486 * every tick. We fix it up based on jiffies.
2488 */ 2487 */
2489void update_cpu_load(struct rq *this_rq) 2488static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
2489 unsigned long pending_updates)
2490{ 2490{
2491 unsigned long this_load = this_rq->load.weight;
2492 unsigned long curr_jiffies = jiffies;
2493 unsigned long pending_updates;
2494 int i, scale; 2491 int i, scale;
2495 2492
2496 this_rq->nr_load_updates++; 2493 this_rq->nr_load_updates++;
2497 2494
2498 /* Avoid repeated calls on same jiffy, when moving in and out of idle */
2499 if (curr_jiffies == this_rq->last_load_update_tick)
2500 return;
2501
2502 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2503 this_rq->last_load_update_tick = curr_jiffies;
2504
2505 /* Update our load: */ 2495 /* Update our load: */
2506 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */ 2496 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
2507 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 2497 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
@@ -2526,9 +2516,78 @@ void update_cpu_load(struct rq *this_rq)
2526 sched_avg_update(this_rq); 2516 sched_avg_update(this_rq);
2527} 2517}
2528 2518
2519#ifdef CONFIG_NO_HZ
2520/*
2521 * There is no sane way to deal with nohz on smp when using jiffies because the
2522 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
2523 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
2524 *
2525 * Therefore we cannot use the delta approach from the regular tick since that
2526 * would seriously skew the load calculation. However we'll make do for those
2527 * updates happening while idle (nohz_idle_balance) or coming out of idle
2528 * (tick_nohz_idle_exit).
2529 *
2530 * This means we might still be one tick off for nohz periods.
2531 */
2532
2533/*
2534 * Called from nohz_idle_balance() to update the load ratings before doing the
2535 * idle balance.
2536 */
2537void update_idle_cpu_load(struct rq *this_rq)
2538{
2539 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2540 unsigned long load = this_rq->load.weight;
2541 unsigned long pending_updates;
2542
2543 /*
2544 * bail if there's load or we're actually up-to-date.
2545 */
2546 if (load || curr_jiffies == this_rq->last_load_update_tick)
2547 return;
2548
2549 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2550 this_rq->last_load_update_tick = curr_jiffies;
2551
2552 __update_cpu_load(this_rq, load, pending_updates);
2553}
2554
2555/*
2556 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
2557 */
2558void update_cpu_load_nohz(void)
2559{
2560 struct rq *this_rq = this_rq();
2561 unsigned long curr_jiffies = ACCESS_ONCE(jiffies);
2562 unsigned long pending_updates;
2563
2564 if (curr_jiffies == this_rq->last_load_update_tick)
2565 return;
2566
2567 raw_spin_lock(&this_rq->lock);
2568 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
2569 if (pending_updates) {
2570 this_rq->last_load_update_tick = curr_jiffies;
2571 /*
2572 * We were idle, this means load 0, the current load might be
2573 * !0 due to remote wakeups and the sort.
2574 */
2575 __update_cpu_load(this_rq, 0, pending_updates);
2576 }
2577 raw_spin_unlock(&this_rq->lock);
2578}
2579#endif /* CONFIG_NO_HZ */
2580
2581/*
2582 * Called from scheduler_tick()
2583 */
2529static void update_cpu_load_active(struct rq *this_rq) 2584static void update_cpu_load_active(struct rq *this_rq)
2530{ 2585{
2531 update_cpu_load(this_rq); 2586 /*
2587 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
2588 */
2589 this_rq->last_load_update_tick = jiffies;
2590 __update_cpu_load(this_rq, this_rq->load.weight, 1);
2532 2591
2533 calc_load_account_active(this_rq); 2592 calc_load_account_active(this_rq);
2534} 2593}
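
Note: the cpu_load[] bookkeeping that __update_cpu_load() now takes a pending_updates argument for can be illustrated with a small userspace sketch. This is an approximation rather than the kernel code: the real decay_load_missed() uses a precomputed lookup table and slightly different rounding, and the names below (update_cpu_load_sketch and friends) are invented for the example.

#include <stdio.h>

#define CPU_LOAD_IDX_MAX 5

static unsigned long cpu_load[CPU_LOAD_IDX_MAX];

/*
 * Index 0 tracks the instantaneous load; higher indexes decay with
 * longer half-lives (scale = 2^i).  Ticks missed while idle are made
 * up for by decaying the old value once per missed tick before the
 * new sample is blended in.
 */
static void update_cpu_load_sketch(unsigned long this_load,
                                   unsigned long pending_updates)
{
        unsigned long i, j, scale;

        cpu_load[0] = this_load;
        for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
                unsigned long old_load = cpu_load[i];

                for (j = 1; j < pending_updates; j++)
                        old_load = old_load * (scale - 1) / scale;

                cpu_load[i] = (old_load * (scale - 1) + this_load) / scale;
        }
}

int main(void)
{
        unsigned long i;

        for (i = 0; i < CPU_LOAD_IDX_MAX; i++)
                cpu_load[i] = 1024;

        /* wake up after 8 idle ticks with zero load, as update_cpu_load_nohz() would */
        update_cpu_load_sketch(0, 8);
        for (i = 0; i < CPU_LOAD_IDX_MAX; i++)
                printf("cpu_load[%lu] = %lu\n", i, cpu_load[i]);
        return 0;
}

Coming out of idle, the higher indexes decay progressively more slowly, which is the behaviour update_idle_cpu_load() and update_cpu_load_nohz() preserve when they fix up the ticks missed while idle.
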
@@ -3113,6 +3172,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
3113 if (irqs_disabled()) 3172 if (irqs_disabled())
3114 print_irqtrace_events(prev); 3173 print_irqtrace_events(prev);
3115 dump_stack(); 3174 dump_stack();
3175 add_taint(TAINT_WARN);
3116} 3176}
3117 3177
3118/* 3178/*
@@ -4042,11 +4102,8 @@ static bool check_same_owner(struct task_struct *p)
4042 4102
4043 rcu_read_lock(); 4103 rcu_read_lock();
4044 pcred = __task_cred(p); 4104 pcred = __task_cred(p);
4045 if (cred->user->user_ns == pcred->user->user_ns) 4105 match = (uid_eq(cred->euid, pcred->euid) ||
4046 match = (cred->euid == pcred->euid || 4106 uid_eq(cred->euid, pcred->uid));
4047 cred->euid == pcred->uid);
4048 else
4049 match = false;
4050 rcu_read_unlock(); 4107 rcu_read_unlock();
4051 return match; 4108 return match;
4052} 4109}
@@ -4957,7 +5014,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4957 p->sched_class->set_cpus_allowed(p, new_mask); 5014 p->sched_class->set_cpus_allowed(p, new_mask);
4958 5015
4959 cpumask_copy(&p->cpus_allowed, new_mask); 5016 cpumask_copy(&p->cpus_allowed, new_mask);
4960 p->rt.nr_cpus_allowed = cpumask_weight(new_mask); 5017 p->nr_cpus_allowed = cpumask_weight(new_mask);
4961} 5018}
4962 5019
4963/* 5020/*
@@ -5499,15 +5556,20 @@ static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
5499 5556
5500#ifdef CONFIG_SCHED_DEBUG 5557#ifdef CONFIG_SCHED_DEBUG
5501 5558
5502static __read_mostly int sched_domain_debug_enabled; 5559static __read_mostly int sched_debug_enabled;
5503 5560
5504static int __init sched_domain_debug_setup(char *str) 5561static int __init sched_debug_setup(char *str)
5505{ 5562{
5506 sched_domain_debug_enabled = 1; 5563 sched_debug_enabled = 1;
5507 5564
5508 return 0; 5565 return 0;
5509} 5566}
5510early_param("sched_debug", sched_domain_debug_setup); 5567early_param("sched_debug", sched_debug_setup);
5568
5569static inline bool sched_debug(void)
5570{
5571 return sched_debug_enabled;
5572}
5511 5573
5512static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 5574static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5513 struct cpumask *groupmask) 5575 struct cpumask *groupmask)
@@ -5547,7 +5609,12 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5547 break; 5609 break;
5548 } 5610 }
5549 5611
5550 if (!group->sgp->power) { 5612 /*
5613 * Even though we initialize ->power to something semi-sane,
5614 * we leave power_orig unset. This allows us to detect if
5615 * domain iteration is still funny without causing /0 traps.
5616 */
5617 if (!group->sgp->power_orig) {
5551 printk(KERN_CONT "\n"); 5618 printk(KERN_CONT "\n");
5552 printk(KERN_ERR "ERROR: domain->cpu_power not " 5619 printk(KERN_ERR "ERROR: domain->cpu_power not "
5553 "set\n"); 5620 "set\n");
@@ -5560,7 +5627,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
5560 break; 5627 break;
5561 } 5628 }
5562 5629
5563 if (cpumask_intersects(groupmask, sched_group_cpus(group))) { 5630 if (!(sd->flags & SD_OVERLAP) &&
5631 cpumask_intersects(groupmask, sched_group_cpus(group))) {
5564 printk(KERN_CONT "\n"); 5632 printk(KERN_CONT "\n");
5565 printk(KERN_ERR "ERROR: repeated CPUs\n"); 5633 printk(KERN_ERR "ERROR: repeated CPUs\n");
5566 break; 5634 break;
@@ -5594,7 +5662,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5594{ 5662{
5595 int level = 0; 5663 int level = 0;
5596 5664
5597 if (!sched_domain_debug_enabled) 5665 if (!sched_debug_enabled)
5598 return; 5666 return;
5599 5667
5600 if (!sd) { 5668 if (!sd) {
@@ -5615,6 +5683,10 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5615} 5683}
5616#else /* !CONFIG_SCHED_DEBUG */ 5684#else /* !CONFIG_SCHED_DEBUG */
5617# define sched_domain_debug(sd, cpu) do { } while (0) 5685# define sched_domain_debug(sd, cpu) do { } while (0)
5686static inline bool sched_debug(void)
5687{
5688 return false;
5689}
5618#endif /* CONFIG_SCHED_DEBUG */ 5690#endif /* CONFIG_SCHED_DEBUG */
5619 5691
5620static int sd_degenerate(struct sched_domain *sd) 5692static int sd_degenerate(struct sched_domain *sd)
@@ -5898,99 +5970,11 @@ static int __init isolated_cpu_setup(char *str)
5898 5970
5899__setup("isolcpus=", isolated_cpu_setup); 5971__setup("isolcpus=", isolated_cpu_setup);
5900 5972
5901#ifdef CONFIG_NUMA
5902
5903/**
5904 * find_next_best_node - find the next node to include in a sched_domain
5905 * @node: node whose sched_domain we're building
5906 * @used_nodes: nodes already in the sched_domain
5907 *
5908 * Find the next node to include in a given scheduling domain. Simply
5909 * finds the closest node not already in the @used_nodes map.
5910 *
5911 * Should use nodemask_t.
5912 */
5913static int find_next_best_node(int node, nodemask_t *used_nodes)
5914{
5915 int i, n, val, min_val, best_node = -1;
5916
5917 min_val = INT_MAX;
5918
5919 for (i = 0; i < nr_node_ids; i++) {
5920 /* Start at @node */
5921 n = (node + i) % nr_node_ids;
5922
5923 if (!nr_cpus_node(n))
5924 continue;
5925
5926 /* Skip already used nodes */
5927 if (node_isset(n, *used_nodes))
5928 continue;
5929
5930 /* Simple min distance search */
5931 val = node_distance(node, n);
5932
5933 if (val < min_val) {
5934 min_val = val;
5935 best_node = n;
5936 }
5937 }
5938
5939 if (best_node != -1)
5940 node_set(best_node, *used_nodes);
5941 return best_node;
5942}
5943
5944/**
5945 * sched_domain_node_span - get a cpumask for a node's sched_domain
5946 * @node: node whose cpumask we're constructing
5947 * @span: resulting cpumask
5948 *
5949 * Given a node, construct a good cpumask for its sched_domain to span. It
5950 * should be one that prevents unnecessary balancing, but also spreads tasks
5951 * out optimally.
5952 */
5953static void sched_domain_node_span(int node, struct cpumask *span)
5954{
5955 nodemask_t used_nodes;
5956 int i;
5957
5958 cpumask_clear(span);
5959 nodes_clear(used_nodes);
5960
5961 cpumask_or(span, span, cpumask_of_node(node));
5962 node_set(node, used_nodes);
5963
5964 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5965 int next_node = find_next_best_node(node, &used_nodes);
5966 if (next_node < 0)
5967 break;
5968 cpumask_or(span, span, cpumask_of_node(next_node));
5969 }
5970}
5971
5972static const struct cpumask *cpu_node_mask(int cpu)
5973{
5974 lockdep_assert_held(&sched_domains_mutex);
5975
5976 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
5977
5978 return sched_domains_tmpmask;
5979}
5980
5981static const struct cpumask *cpu_allnodes_mask(int cpu)
5982{
5983 return cpu_possible_mask;
5984}
5985#endif /* CONFIG_NUMA */
5986
5987static const struct cpumask *cpu_cpu_mask(int cpu) 5973static const struct cpumask *cpu_cpu_mask(int cpu)
5988{ 5974{
5989 return cpumask_of_node(cpu_to_node(cpu)); 5975 return cpumask_of_node(cpu_to_node(cpu));
5990} 5976}
5991 5977
5992int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
5993
5994struct sd_data { 5978struct sd_data {
5995 struct sched_domain **__percpu sd; 5979 struct sched_domain **__percpu sd;
5996 struct sched_group **__percpu sg; 5980 struct sched_group **__percpu sg;
@@ -6020,9 +6004,48 @@ struct sched_domain_topology_level {
6020 sched_domain_init_f init; 6004 sched_domain_init_f init;
6021 sched_domain_mask_f mask; 6005 sched_domain_mask_f mask;
6022 int flags; 6006 int flags;
6007 int numa_level;
6023 struct sd_data data; 6008 struct sd_data data;
6024}; 6009};
6025 6010
6011/*
6012 * Build an iteration mask that can exclude certain CPUs from the upwards
6013 * domain traversal.
6014 *
6015 * Asymmetric node setups can result in situations where the domain tree is of
6016 * unequal depth, make sure to skip domains that already cover the entire
6017 * range.
6018 *
6019 * In that case build_sched_domains() will have terminated the iteration early
6020 * and our sibling sd spans will be empty. Domains should always include the
6021 * cpu they're built on, so check that.
6022 *
6023 */
6024static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
6025{
6026 const struct cpumask *span = sched_domain_span(sd);
6027 struct sd_data *sdd = sd->private;
6028 struct sched_domain *sibling;
6029 int i;
6030
6031 for_each_cpu(i, span) {
6032 sibling = *per_cpu_ptr(sdd->sd, i);
6033 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
6034 continue;
6035
6036 cpumask_set_cpu(i, sched_group_mask(sg));
6037 }
6038}
6039
6040/*
6041 * Return the canonical balance cpu for this group, this is the first cpu
6042 * of this group that's also in the iteration mask.
6043 */
6044int group_balance_cpu(struct sched_group *sg)
6045{
6046 return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg));
6047}
6048
6026static int 6049static int
6027build_overlap_sched_groups(struct sched_domain *sd, int cpu) 6050build_overlap_sched_groups(struct sched_domain *sd, int cpu)
6028{ 6051{
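
Note: a quick illustration of the new group_balance_cpu() semantics -- the canonical balance cpu is the first cpu that lies both in the group's span and in the iteration mask that build_group_mask() constructs. The sketch below is purely illustrative: it uses plain bitmasks instead of struct cpumask, relies on the GCC/Clang __builtin_ctzl() bit scan, and toy_group / toy_group_balance_cpu are invented names.

#include <stdio.h>

/*
 * Toy model: a group is a pair of bitmasks, one for the cpus it spans
 * and one for the cpus usable for the upward domain iteration.  The
 * balance cpu is the lowest bit set in both, mirroring
 * cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)).
 */
struct toy_group {
        unsigned long span;
        unsigned long mask;
};

static int toy_group_balance_cpu(const struct toy_group *sg)
{
        unsigned long both = sg->span & sg->mask;

        return both ? __builtin_ctzl(both) : -1;
}

int main(void)
{
        /* spans cpus 2-5, but only cpus 4 and 5 survive the group mask */
        struct toy_group sg = { .span = 0x3c, .mask = 0x30 };

        printf("balance cpu = %d\n", toy_group_balance_cpu(&sg));
        return 0;
}

With a group spanning cpus 2-5 but an iteration mask of {4,5}, the balance cpu is 4.
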
@@ -6041,6 +6064,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
6041 if (cpumask_test_cpu(i, covered)) 6064 if (cpumask_test_cpu(i, covered))
6042 continue; 6065 continue;
6043 6066
6067 child = *per_cpu_ptr(sdd->sd, i);
6068
6069 /* See the comment near build_group_mask(). */
6070 if (!cpumask_test_cpu(i, sched_domain_span(child)))
6071 continue;
6072
6044 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 6073 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6045 GFP_KERNEL, cpu_to_node(cpu)); 6074 GFP_KERNEL, cpu_to_node(cpu));
6046 6075
@@ -6048,8 +6077,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
6048 goto fail; 6077 goto fail;
6049 6078
6050 sg_span = sched_group_cpus(sg); 6079 sg_span = sched_group_cpus(sg);
6051
6052 child = *per_cpu_ptr(sdd->sd, i);
6053 if (child->child) { 6080 if (child->child) {
6054 child = child->child; 6081 child = child->child;
6055 cpumask_copy(sg_span, sched_domain_span(child)); 6082 cpumask_copy(sg_span, sched_domain_span(child));
@@ -6058,10 +6085,24 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
6058 6085
6059 cpumask_or(covered, covered, sg_span); 6086 cpumask_or(covered, covered, sg_span);
6060 6087
6061 sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span)); 6088 sg->sgp = *per_cpu_ptr(sdd->sgp, i);
6062 atomic_inc(&sg->sgp->ref); 6089 if (atomic_inc_return(&sg->sgp->ref) == 1)
6090 build_group_mask(sd, sg);
6063 6091
6064 if (cpumask_test_cpu(cpu, sg_span)) 6092 /*
6093 * Initialize sgp->power such that even if we mess up the
6094 * domains and no possible iteration will get us here, we won't
6095 * die on a /0 trap.
6096 */
6097 sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
6098
6099 /*
6100 * Make sure the first group of this domain contains the
6101 * canonical balance cpu. Otherwise the sched_domain iteration
6102 * breaks. See update_sg_lb_stats().
6103 */
6104 if ((!groups && cpumask_test_cpu(cpu, sg_span)) ||
6105 group_balance_cpu(sg) == cpu)
6065 groups = sg; 6106 groups = sg;
6066 6107
6067 if (!first) 6108 if (!first)
@@ -6135,6 +6176,7 @@ build_sched_groups(struct sched_domain *sd, int cpu)
6135 6176
6136 cpumask_clear(sched_group_cpus(sg)); 6177 cpumask_clear(sched_group_cpus(sg));
6137 sg->sgp->power = 0; 6178 sg->sgp->power = 0;
6179 cpumask_setall(sched_group_mask(sg));
6138 6180
6139 for_each_cpu(j, span) { 6181 for_each_cpu(j, span) {
6140 if (get_group(j, sdd, NULL) != group) 6182 if (get_group(j, sdd, NULL) != group)
@@ -6176,7 +6218,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6176 sg = sg->next; 6218 sg = sg->next;
6177 } while (sg != sd->groups); 6219 } while (sg != sd->groups);
6178 6220
6179 if (cpu != group_first_cpu(sg)) 6221 if (cpu != group_balance_cpu(sg))
6180 return; 6222 return;
6181 6223
6182 update_group_power(sd, cpu); 6224 update_group_power(sd, cpu);
@@ -6211,10 +6253,6 @@ sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
6211} 6253}
6212 6254
6213SD_INIT_FUNC(CPU) 6255SD_INIT_FUNC(CPU)
6214#ifdef CONFIG_NUMA
6215 SD_INIT_FUNC(ALLNODES)
6216 SD_INIT_FUNC(NODE)
6217#endif
6218#ifdef CONFIG_SCHED_SMT 6256#ifdef CONFIG_SCHED_SMT
6219 SD_INIT_FUNC(SIBLING) 6257 SD_INIT_FUNC(SIBLING)
6220#endif 6258#endif
@@ -6230,11 +6268,8 @@ int sched_domain_level_max;
6230 6268
6231static int __init setup_relax_domain_level(char *str) 6269static int __init setup_relax_domain_level(char *str)
6232{ 6270{
6233 unsigned long val; 6271 if (kstrtoint(str, 0, &default_relax_domain_level))
6234 6272 pr_warn("Unable to set relax_domain_level\n");
6235 val = simple_strtoul(str, NULL, 0);
6236 if (val < sched_domain_level_max)
6237 default_relax_domain_level = val;
6238 6273
6239 return 1; 6274 return 1;
6240} 6275}
@@ -6336,15 +6371,236 @@ static struct sched_domain_topology_level default_topology[] = {
6336 { sd_init_BOOK, cpu_book_mask, }, 6371 { sd_init_BOOK, cpu_book_mask, },
6337#endif 6372#endif
6338 { sd_init_CPU, cpu_cpu_mask, }, 6373 { sd_init_CPU, cpu_cpu_mask, },
6339#ifdef CONFIG_NUMA
6340 { sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
6341 { sd_init_ALLNODES, cpu_allnodes_mask, },
6342#endif
6343 { NULL, }, 6374 { NULL, },
6344}; 6375};
6345 6376
6346static struct sched_domain_topology_level *sched_domain_topology = default_topology; 6377static struct sched_domain_topology_level *sched_domain_topology = default_topology;
6347 6378
6379#ifdef CONFIG_NUMA
6380
6381static int sched_domains_numa_levels;
6382static int *sched_domains_numa_distance;
6383static struct cpumask ***sched_domains_numa_masks;
6384static int sched_domains_curr_level;
6385
6386static inline int sd_local_flags(int level)
6387{
6388 if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
6389 return 0;
6390
6391 return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
6392}
6393
6394static struct sched_domain *
6395sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
6396{
6397 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
6398 int level = tl->numa_level;
6399 int sd_weight = cpumask_weight(
6400 sched_domains_numa_masks[level][cpu_to_node(cpu)]);
6401
6402 *sd = (struct sched_domain){
6403 .min_interval = sd_weight,
6404 .max_interval = 2*sd_weight,
6405 .busy_factor = 32,
6406 .imbalance_pct = 125,
6407 .cache_nice_tries = 2,
6408 .busy_idx = 3,
6409 .idle_idx = 2,
6410 .newidle_idx = 0,
6411 .wake_idx = 0,
6412 .forkexec_idx = 0,
6413
6414 .flags = 1*SD_LOAD_BALANCE
6415 | 1*SD_BALANCE_NEWIDLE
6416 | 0*SD_BALANCE_EXEC
6417 | 0*SD_BALANCE_FORK
6418 | 0*SD_BALANCE_WAKE
6419 | 0*SD_WAKE_AFFINE
6420 | 0*SD_PREFER_LOCAL
6421 | 0*SD_SHARE_CPUPOWER
6422 | 0*SD_SHARE_PKG_RESOURCES
6423 | 1*SD_SERIALIZE
6424 | 0*SD_PREFER_SIBLING
6425 | sd_local_flags(level)
6426 ,
6427 .last_balance = jiffies,
6428 .balance_interval = sd_weight,
6429 };
6430 SD_INIT_NAME(sd, NUMA);
6431 sd->private = &tl->data;
6432
6433 /*
6434 * Ugly hack to pass state to sd_numa_mask()...
6435 */
6436 sched_domains_curr_level = tl->numa_level;
6437
6438 return sd;
6439}
6440
6441static const struct cpumask *sd_numa_mask(int cpu)
6442{
6443 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
6444}
6445
6446static void sched_numa_warn(const char *str)
6447{
6448 static int done = false;
6449 int i,j;
6450
6451 if (done)
6452 return;
6453
6454 done = true;
6455
6456 printk(KERN_WARNING "ERROR: %s\n\n", str);
6457
6458 for (i = 0; i < nr_node_ids; i++) {
6459 printk(KERN_WARNING " ");
6460 for (j = 0; j < nr_node_ids; j++)
6461 printk(KERN_CONT "%02d ", node_distance(i,j));
6462 printk(KERN_CONT "\n");
6463 }
6464 printk(KERN_WARNING "\n");
6465}
6466
6467static bool find_numa_distance(int distance)
6468{
6469 int i;
6470
6471 if (distance == node_distance(0, 0))
6472 return true;
6473
6474 for (i = 0; i < sched_domains_numa_levels; i++) {
6475 if (sched_domains_numa_distance[i] == distance)
6476 return true;
6477 }
6478
6479 return false;
6480}
6481
6482static void sched_init_numa(void)
6483{
6484 int next_distance, curr_distance = node_distance(0, 0);
6485 struct sched_domain_topology_level *tl;
6486 int level = 0;
6487 int i, j, k;
6488
6489 sched_domains_numa_distance = kzalloc(sizeof(int) * nr_node_ids, GFP_KERNEL);
6490 if (!sched_domains_numa_distance)
6491 return;
6492
6493 /*
6494 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
6495 * unique distances in the node_distance() table.
6496 *
6497 * Assumes node_distance(0,j) includes all distances in
6498 * node_distance(i,j) in order to avoid cubic time.
6499 */
6500 next_distance = curr_distance;
6501 for (i = 0; i < nr_node_ids; i++) {
6502 for (j = 0; j < nr_node_ids; j++) {
6503 for (k = 0; k < nr_node_ids; k++) {
6504 int distance = node_distance(i, k);
6505
6506 if (distance > curr_distance &&
6507 (distance < next_distance ||
6508 next_distance == curr_distance))
6509 next_distance = distance;
6510
6511 /*
6512 * While not a strong assumption it would be nice to know
6513 * about cases where if node A is connected to B, B is not
6514 * equally connected to A.
6515 */
6516 if (sched_debug() && node_distance(k, i) != distance)
6517 sched_numa_warn("Node-distance not symmetric");
6518
6519 if (sched_debug() && i && !find_numa_distance(distance))
6520 sched_numa_warn("Node-0 not representative");
6521 }
6522 if (next_distance != curr_distance) {
6523 sched_domains_numa_distance[level++] = next_distance;
6524 sched_domains_numa_levels = level;
6525 curr_distance = next_distance;
6526 } else break;
6527 }
6528
6529 /*
6530 * In case of sched_debug() we verify the above assumption.
6531 */
6532 if (!sched_debug())
6533 break;
6534 }
6535 /*
6536 * 'level' contains the number of unique distances, excluding the
6537 * identity distance node_distance(i,i).
6538 *
 6539 * The sched_domains_numa_distance[] array includes the actual distance
6540 * numbers.
6541 */
6542
6543 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
6544 if (!sched_domains_numa_masks)
6545 return;
6546
6547 /*
6548 * Now for each level, construct a mask per node which contains all
6549 * cpus of nodes that are that many hops away from us.
6550 */
6551 for (i = 0; i < level; i++) {
6552 sched_domains_numa_masks[i] =
6553 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
6554 if (!sched_domains_numa_masks[i])
6555 return;
6556
6557 for (j = 0; j < nr_node_ids; j++) {
6558 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
6559 if (!mask)
6560 return;
6561
6562 sched_domains_numa_masks[i][j] = mask;
6563
6564 for (k = 0; k < nr_node_ids; k++) {
6565 if (node_distance(j, k) > sched_domains_numa_distance[i])
6566 continue;
6567
6568 cpumask_or(mask, mask, cpumask_of_node(k));
6569 }
6570 }
6571 }
6572
6573 tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
6574 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
6575 if (!tl)
6576 return;
6577
6578 /*
6579 * Copy the default topology bits..
6580 */
6581 for (i = 0; default_topology[i].init; i++)
6582 tl[i] = default_topology[i];
6583
6584 /*
6585 * .. and append 'j' levels of NUMA goodness.
6586 */
6587 for (j = 0; j < level; i++, j++) {
6588 tl[i] = (struct sched_domain_topology_level){
6589 .init = sd_numa_init,
6590 .mask = sd_numa_mask,
6591 .flags = SDTL_OVERLAP,
6592 .numa_level = j,
6593 };
6594 }
6595
6596 sched_domain_topology = tl;
6597}
6598#else
6599static inline void sched_init_numa(void)
6600{
6601}
6602#endif /* CONFIG_NUMA */
6603
6348static int __sdt_alloc(const struct cpumask *cpu_map) 6604static int __sdt_alloc(const struct cpumask *cpu_map)
6349{ 6605{
6350 struct sched_domain_topology_level *tl; 6606 struct sched_domain_topology_level *tl;
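
Note: the deduplicating selection sort in sched_init_numa() is easier to follow in isolation. The sketch below is illustrative only: it hard-codes a 4-node SLIT-style distance table instead of querying node_distance(), and it scans the full table rather than relying on the node-0-is-representative assumption, but it extracts the same thing -- the unique distances in increasing order that end up in sched_domains_numa_distance[] and decide how many NUMA levels get appended to the topology.

#include <stdio.h>
#include <limits.h>

#define NR_NODES 4

/* SLIT-style table: 10 = local node, larger = further away */
static const int dist[NR_NODES][NR_NODES] = {
        { 10, 20, 20, 30 },
        { 20, 10, 30, 20 },
        { 20, 30, 10, 20 },
        { 30, 20, 20, 10 },
};

int main(void)
{
        int curr = dist[0][0];          /* identity distance, excluded */
        int levels = 0;

        for (;;) {
                int next = INT_MAX;
                int i, j;

                /* smallest distance strictly larger than 'curr' */
                for (i = 0; i < NR_NODES; i++)
                        for (j = 0; j < NR_NODES; j++)
                                if (dist[i][j] > curr && dist[i][j] < next)
                                        next = dist[i][j];

                if (next == INT_MAX)
                        break;

                printf("numa level %d: distance %d\n", levels++, next);
                curr = next;
        }
        printf("-> %d extra topology level(s) appended\n", levels);
        return 0;
}

For this table the program reports distances 20 and 30, i.e. two NUMA levels on top of the default topology.
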
@@ -6382,9 +6638,11 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
6382 if (!sg) 6638 if (!sg)
6383 return -ENOMEM; 6639 return -ENOMEM;
6384 6640
6641 sg->next = sg;
6642
6385 *per_cpu_ptr(sdd->sg, j) = sg; 6643 *per_cpu_ptr(sdd->sg, j) = sg;
6386 6644
6387 sgp = kzalloc_node(sizeof(struct sched_group_power), 6645 sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
6388 GFP_KERNEL, cpu_to_node(j)); 6646 GFP_KERNEL, cpu_to_node(j));
6389 if (!sgp) 6647 if (!sgp)
6390 return -ENOMEM; 6648 return -ENOMEM;
@@ -6405,16 +6663,26 @@ static void __sdt_free(const struct cpumask *cpu_map)
6405 struct sd_data *sdd = &tl->data; 6663 struct sd_data *sdd = &tl->data;
6406 6664
6407 for_each_cpu(j, cpu_map) { 6665 for_each_cpu(j, cpu_map) {
6408 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); 6666 struct sched_domain *sd;
6409 if (sd && (sd->flags & SD_OVERLAP)) 6667
6410 free_sched_groups(sd->groups, 0); 6668 if (sdd->sd) {
6411 kfree(*per_cpu_ptr(sdd->sd, j)); 6669 sd = *per_cpu_ptr(sdd->sd, j);
6412 kfree(*per_cpu_ptr(sdd->sg, j)); 6670 if (sd && (sd->flags & SD_OVERLAP))
6413 kfree(*per_cpu_ptr(sdd->sgp, j)); 6671 free_sched_groups(sd->groups, 0);
6672 kfree(*per_cpu_ptr(sdd->sd, j));
6673 }
6674
6675 if (sdd->sg)
6676 kfree(*per_cpu_ptr(sdd->sg, j));
6677 if (sdd->sgp)
6678 kfree(*per_cpu_ptr(sdd->sgp, j));
6414 } 6679 }
6415 free_percpu(sdd->sd); 6680 free_percpu(sdd->sd);
6681 sdd->sd = NULL;
6416 free_percpu(sdd->sg); 6682 free_percpu(sdd->sg);
6683 sdd->sg = NULL;
6417 free_percpu(sdd->sgp); 6684 free_percpu(sdd->sgp);
6685 sdd->sgp = NULL;
6418 } 6686 }
6419} 6687}
6420 6688
@@ -6427,7 +6695,6 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6427 if (!sd) 6695 if (!sd)
6428 return child; 6696 return child;
6429 6697
6430 set_domain_attribute(sd, attr);
6431 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu)); 6698 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6432 if (child) { 6699 if (child) {
6433 sd->level = child->level + 1; 6700 sd->level = child->level + 1;
@@ -6435,6 +6702,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6435 child->parent = sd; 6702 child->parent = sd;
6436 } 6703 }
6437 sd->child = child; 6704 sd->child = child;
6705 set_domain_attribute(sd, attr);
6438 6706
6439 return sd; 6707 return sd;
6440} 6708}
@@ -6575,7 +6843,6 @@ static int init_sched_domains(const struct cpumask *cpu_map)
6575 if (!doms_cur) 6843 if (!doms_cur)
6576 doms_cur = &fallback_doms; 6844 doms_cur = &fallback_doms;
6577 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 6845 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
6578 dattr_cur = NULL;
6579 err = build_sched_domains(doms_cur[0], NULL); 6846 err = build_sched_domains(doms_cur[0], NULL);
6580 register_sched_domain_sysctl(); 6847 register_sched_domain_sysctl();
6581 6848
@@ -6700,97 +6967,6 @@ match2:
6700 mutex_unlock(&sched_domains_mutex); 6967 mutex_unlock(&sched_domains_mutex);
6701} 6968}
6702 6969
6703#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6704static void reinit_sched_domains(void)
6705{
6706 get_online_cpus();
6707
6708 /* Destroy domains first to force the rebuild */
6709 partition_sched_domains(0, NULL, NULL);
6710
6711 rebuild_sched_domains();
6712 put_online_cpus();
6713}
6714
6715static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6716{
6717 unsigned int level = 0;
6718
6719 if (sscanf(buf, "%u", &level) != 1)
6720 return -EINVAL;
6721
6722 /*
6723 * level is always be positive so don't check for
6724 * level < POWERSAVINGS_BALANCE_NONE which is 0
6725 * What happens on 0 or 1 byte write,
6726 * need to check for count as well?
6727 */
6728
6729 if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
6730 return -EINVAL;
6731
6732 if (smt)
6733 sched_smt_power_savings = level;
6734 else
6735 sched_mc_power_savings = level;
6736
6737 reinit_sched_domains();
6738
6739 return count;
6740}
6741
6742#ifdef CONFIG_SCHED_MC
6743static ssize_t sched_mc_power_savings_show(struct device *dev,
6744 struct device_attribute *attr,
6745 char *buf)
6746{
6747 return sprintf(buf, "%u\n", sched_mc_power_savings);
6748}
6749static ssize_t sched_mc_power_savings_store(struct device *dev,
6750 struct device_attribute *attr,
6751 const char *buf, size_t count)
6752{
6753 return sched_power_savings_store(buf, count, 0);
6754}
6755static DEVICE_ATTR(sched_mc_power_savings, 0644,
6756 sched_mc_power_savings_show,
6757 sched_mc_power_savings_store);
6758#endif
6759
6760#ifdef CONFIG_SCHED_SMT
6761static ssize_t sched_smt_power_savings_show(struct device *dev,
6762 struct device_attribute *attr,
6763 char *buf)
6764{
6765 return sprintf(buf, "%u\n", sched_smt_power_savings);
6766}
6767static ssize_t sched_smt_power_savings_store(struct device *dev,
6768 struct device_attribute *attr,
6769 const char *buf, size_t count)
6770{
6771 return sched_power_savings_store(buf, count, 1);
6772}
6773static DEVICE_ATTR(sched_smt_power_savings, 0644,
6774 sched_smt_power_savings_show,
6775 sched_smt_power_savings_store);
6776#endif
6777
6778int __init sched_create_sysfs_power_savings_entries(struct device *dev)
6779{
6780 int err = 0;
6781
6782#ifdef CONFIG_SCHED_SMT
6783 if (smt_capable())
6784 err = device_create_file(dev, &dev_attr_sched_smt_power_savings);
6785#endif
6786#ifdef CONFIG_SCHED_MC
6787 if (!err && mc_capable())
6788 err = device_create_file(dev, &dev_attr_sched_mc_power_savings);
6789#endif
6790 return err;
6791}
6792#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
6793
6794/* 6970/*
6795 * Update cpusets according to cpu_active mask. If cpusets are 6971 * Update cpusets according to cpu_active mask. If cpusets are
6796 * disabled, cpuset_update_active_cpus() becomes a simple wrapper 6972 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
@@ -6828,6 +7004,8 @@ void __init sched_init_smp(void)
6828 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 7004 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
6829 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 7005 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
6830 7006
7007 sched_init_numa();
7008
6831 get_online_cpus(); 7009 get_online_cpus();
6832 mutex_lock(&sched_domains_mutex); 7010 mutex_lock(&sched_domains_mutex);
6833 init_sched_domains(cpu_active_mask); 7011 init_sched_domains(cpu_active_mask);
@@ -7049,6 +7227,7 @@ void __init sched_init(void)
7049 /* May be allocated at isolcpus cmdline parse time */ 7227 /* May be allocated at isolcpus cmdline parse time */
7050 if (cpu_isolated_map == NULL) 7228 if (cpu_isolated_map == NULL)
7051 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 7229 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
7230 idle_thread_set_boot_cpu();
7052#endif 7231#endif
7053 init_sched_fair_class(); 7232 init_sched_fair_class();
7054 7233
@@ -7970,13 +8149,9 @@ static struct cftype cpu_files[] = {
7970 .write_u64 = cpu_rt_period_write_uint, 8149 .write_u64 = cpu_rt_period_write_uint,
7971 }, 8150 },
7972#endif 8151#endif
8152 { } /* terminate */
7973}; 8153};
7974 8154
7975static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
7976{
7977 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
7978}
7979
7980struct cgroup_subsys cpu_cgroup_subsys = { 8155struct cgroup_subsys cpu_cgroup_subsys = {
7981 .name = "cpu", 8156 .name = "cpu",
7982 .create = cpu_cgroup_create, 8157 .create = cpu_cgroup_create,
@@ -7984,8 +8159,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
7984 .can_attach = cpu_cgroup_can_attach, 8159 .can_attach = cpu_cgroup_can_attach,
7985 .attach = cpu_cgroup_attach, 8160 .attach = cpu_cgroup_attach,
7986 .exit = cpu_cgroup_exit, 8161 .exit = cpu_cgroup_exit,
7987 .populate = cpu_cgroup_populate,
7988 .subsys_id = cpu_cgroup_subsys_id, 8162 .subsys_id = cpu_cgroup_subsys_id,
8163 .base_cftypes = cpu_files,
7989 .early_init = 1, 8164 .early_init = 1,
7990}; 8165};
7991 8166
@@ -8170,13 +8345,9 @@ static struct cftype files[] = {
8170 .name = "stat", 8345 .name = "stat",
8171 .read_map = cpuacct_stats_show, 8346 .read_map = cpuacct_stats_show,
8172 }, 8347 },
8348 { } /* terminate */
8173}; 8349};
8174 8350
8175static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
8176{
8177 return cgroup_add_files(cgrp, ss, files, ARRAY_SIZE(files));
8178}
8179
8180/* 8351/*
8181 * charge this task's execution time to its accounting group. 8352 * charge this task's execution time to its accounting group.
8182 * 8353 *
@@ -8208,7 +8379,7 @@ struct cgroup_subsys cpuacct_subsys = {
8208 .name = "cpuacct", 8379 .name = "cpuacct",
8209 .create = cpuacct_create, 8380 .create = cpuacct_create,
8210 .destroy = cpuacct_destroy, 8381 .destroy = cpuacct_destroy,
8211 .populate = cpuacct_populate,
8212 .subsys_id = cpuacct_subsys_id, 8382 .subsys_id = cpuacct_subsys_id,
8383 .base_cftypes = files,
8213}; 8384};
8214#endif /* CONFIG_CGROUP_CPUACCT */ 8385#endif /* CONFIG_CGROUP_CPUACCT */
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 09acaa15161d..6f79596e0ea9 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -202,7 +202,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
202 SPLIT_NS(spread0)); 202 SPLIT_NS(spread0));
203 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", 203 SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
204 cfs_rq->nr_spread_over); 204 cfs_rq->nr_spread_over);
205 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); 205 SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 206 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
207#ifdef CONFIG_FAIR_GROUP_SCHED 207#ifdef CONFIG_FAIR_GROUP_SCHED
208#ifdef CONFIG_SMP 208#ifdef CONFIG_SMP
@@ -260,8 +260,14 @@ static void print_cpu(struct seq_file *m, int cpu)
260 SEQ_printf(m, "\ncpu#%d\n", cpu); 260 SEQ_printf(m, "\ncpu#%d\n", cpu);
261#endif 261#endif
262 262
263#define P(x) \ 263#define P(x) \
264 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x)) 264do { \
265 if (sizeof(rq->x) == 4) \
266 SEQ_printf(m, " .%-30s: %ld\n", #x, (long)(rq->x)); \
267 else \
268 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rq->x));\
269} while (0)
270
265#define PN(x) \ 271#define PN(x) \
266 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) 272 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
267 273
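
Note: the rationale for the new P(x) macro is the sizeof() dispatch, so that 32-bit rq fields such as nr_running are not blindly widened through a 64-bit cast. A userspace analogue is sketched below; rq_sketch is an invented struct, and plain printf with %lld stands in for the kernel's SEQ_printf/%Ld.

#include <stdio.h>

/* invented struct standing in for struct rq */
struct rq_sketch {
        unsigned int nr_running;        /* 4-byte field */
        unsigned long long clock;       /* 8-byte field */
};

/*
 * Same trick as the new P(x) macro: pick the print format from the
 * field's size instead of always widening to 64 bit.
 */
#define P(rq, x)                                                        \
do {                                                                    \
        if (sizeof((rq)->x) == 4)                                       \
                printf("  .%-30s: %ld\n", #x, (long)(rq)->x);           \
        else                                                            \
                printf("  .%-30s: %lld\n", #x, (long long)(rq)->x);     \
} while (0)

int main(void)
{
        struct rq_sketch rq = { .nr_running = 3, .clock = 123456789ULL };

        P(&rq, nr_running);
        P(&rq, clock);
        return 0;
}

Because sizeof() is a compile-time constant, only one branch of the if survives per field, so the dispatch costs nothing at runtime.
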
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0d97ebdc58f0..c099cc6eebe3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -784,7 +784,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
784 update_load_add(&rq_of(cfs_rq)->load, se->load.weight); 784 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
785#ifdef CONFIG_SMP 785#ifdef CONFIG_SMP
786 if (entity_is_task(se)) 786 if (entity_is_task(se))
787 list_add_tail(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); 787 list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
788#endif 788#endif
789 cfs_rq->nr_running++; 789 cfs_rq->nr_running++;
790} 790}
@@ -2703,7 +2703,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2703 int want_sd = 1; 2703 int want_sd = 1;
2704 int sync = wake_flags & WF_SYNC; 2704 int sync = wake_flags & WF_SYNC;
2705 2705
2706 if (p->rt.nr_cpus_allowed == 1) 2706 if (p->nr_cpus_allowed == 1)
2707 return prev_cpu; 2707 return prev_cpu;
2708 2708
2709 if (sd_flag & SD_BALANCE_WAKE) { 2709 if (sd_flag & SD_BALANCE_WAKE) {
@@ -2721,7 +2721,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2721 * If power savings logic is enabled for a domain, see if we 2721 * If power savings logic is enabled for a domain, see if we
2722 * are not overloaded, if so, don't balance wider. 2722 * are not overloaded, if so, don't balance wider.
2723 */ 2723 */
2724 if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) { 2724 if (tmp->flags & (SD_PREFER_LOCAL)) {
2725 unsigned long power = 0; 2725 unsigned long power = 0;
2726 unsigned long nr_running = 0; 2726 unsigned long nr_running = 0;
2727 unsigned long capacity; 2727 unsigned long capacity;
@@ -2734,9 +2734,6 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2734 2734
2735 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE); 2735 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
2736 2736
2737 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
2738 nr_running /= 2;
2739
2740 if (nr_running < capacity) 2737 if (nr_running < capacity)
2741 want_sd = 0; 2738 want_sd = 0;
2742 } 2739 }
@@ -3082,7 +3079,7 @@ struct lb_env {
3082 struct rq *dst_rq; 3079 struct rq *dst_rq;
3083 3080
3084 enum cpu_idle_type idle; 3081 enum cpu_idle_type idle;
3085 long load_move; 3082 long imbalance;
3086 unsigned int flags; 3083 unsigned int flags;
3087 3084
3088 unsigned int loop; 3085 unsigned int loop;
@@ -3215,8 +3212,10 @@ static int move_one_task(struct lb_env *env)
3215 3212
3216static unsigned long task_h_load(struct task_struct *p); 3213static unsigned long task_h_load(struct task_struct *p);
3217 3214
3215static const unsigned int sched_nr_migrate_break = 32;
3216
3218/* 3217/*
3219 * move_tasks tries to move up to load_move weighted load from busiest to 3218 * move_tasks tries to move up to imbalance weighted load from busiest to
3220 * this_rq, as part of a balancing operation within domain "sd". 3219 * this_rq, as part of a balancing operation within domain "sd".
3221 * Returns 1 if successful and 0 otherwise. 3220 * Returns 1 if successful and 0 otherwise.
3222 * 3221 *
@@ -3229,7 +3228,7 @@ static int move_tasks(struct lb_env *env)
3229 unsigned long load; 3228 unsigned long load;
3230 int pulled = 0; 3229 int pulled = 0;
3231 3230
3232 if (env->load_move <= 0) 3231 if (env->imbalance <= 0)
3233 return 0; 3232 return 0;
3234 3233
3235 while (!list_empty(tasks)) { 3234 while (!list_empty(tasks)) {
@@ -3242,7 +3241,7 @@ static int move_tasks(struct lb_env *env)
3242 3241
3243 /* take a breather every nr_migrate tasks */ 3242 /* take a breather every nr_migrate tasks */
3244 if (env->loop > env->loop_break) { 3243 if (env->loop > env->loop_break) {
3245 env->loop_break += sysctl_sched_nr_migrate; 3244 env->loop_break += sched_nr_migrate_break;
3246 env->flags |= LBF_NEED_BREAK; 3245 env->flags |= LBF_NEED_BREAK;
3247 break; 3246 break;
3248 } 3247 }
@@ -3252,10 +3251,10 @@ static int move_tasks(struct lb_env *env)
3252 3251
3253 load = task_h_load(p); 3252 load = task_h_load(p);
3254 3253
3255 if (load < 16 && !env->sd->nr_balance_failed) 3254 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
3256 goto next; 3255 goto next;
3257 3256
3258 if ((load / 2) > env->load_move) 3257 if ((load / 2) > env->imbalance)
3259 goto next; 3258 goto next;
3260 3259
3261 if (!can_migrate_task(p, env)) 3260 if (!can_migrate_task(p, env))
@@ -3263,7 +3262,7 @@ static int move_tasks(struct lb_env *env)
3263 3262
3264 move_task(p, env); 3263 move_task(p, env);
3265 pulled++; 3264 pulled++;
3266 env->load_move -= load; 3265 env->imbalance -= load;
3267 3266
3268#ifdef CONFIG_PREEMPT 3267#ifdef CONFIG_PREEMPT
3269 /* 3268 /*
@@ -3279,7 +3278,7 @@ static int move_tasks(struct lb_env *env)
3279 * We only want to steal up to the prescribed amount of 3278 * We only want to steal up to the prescribed amount of
3280 * weighted load. 3279 * weighted load.
3281 */ 3280 */
3282 if (env->load_move <= 0) 3281 if (env->imbalance <= 0)
3283 break; 3282 break;
3284 3283
3285 continue; 3284 continue;
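
Note: the load_move -> imbalance rename above is purely cosmetic, but the shape of the move_tasks() loop is worth spelling out. The toy pull_tasks() below (an invented name, working on plain integers instead of task_structs) keeps only the two decisions visible in this hunk: skip a task when it is too big a chunk for the remaining imbalance, and stop as soon as the imbalance is covered.

#include <stdio.h>

/*
 * Walk a list of task loads and "migrate" tasks until the requested
 * imbalance is covered.  As in the real code, a task is skipped when
 * load / 2 > remaining imbalance, and the loop stops as soon as the
 * imbalance drops to zero or below.
 */
static int pull_tasks(const long *load, int nr, long imbalance)
{
        int i, pulled = 0;

        for (i = 0; i < nr && imbalance > 0; i++) {
                if (load[i] / 2 > imbalance)
                        continue;

                imbalance -= load[i];
                pulled++;
        }
        return pulled;
}

int main(void)
{
        const long loads[] = { 1024, 512, 2048, 256, 128 };

        printf("pulled %d tasks\n", pull_tasks(loads, 5, 900));
        return 0;
}
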
@@ -3433,14 +3432,6 @@ struct sd_lb_stats {
3433 unsigned int busiest_group_weight; 3432 unsigned int busiest_group_weight;
3434 3433
3435 int group_imb; /* Is there imbalance in this sd */ 3434 int group_imb; /* Is there imbalance in this sd */
3436#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3437 int power_savings_balance; /* Is powersave balance needed for this sd */
3438 struct sched_group *group_min; /* Least loaded group in sd */
3439 struct sched_group *group_leader; /* Group which relieves group_min */
3440 unsigned long min_load_per_task; /* load_per_task in group_min */
3441 unsigned long leader_nr_running; /* Nr running of group_leader */
3442 unsigned long min_nr_running; /* Nr running of group_min */
3443#endif
3444}; 3435};
3445 3436
3446/* 3437/*
@@ -3484,148 +3475,6 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
3484 return load_idx; 3475 return load_idx;
3485} 3476}
3486 3477
3487
3488#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
3489/**
3490 * init_sd_power_savings_stats - Initialize power savings statistics for
3491 * the given sched_domain, during load balancing.
3492 *
3493 * @sd: Sched domain whose power-savings statistics are to be initialized.
3494 * @sds: Variable containing the statistics for sd.
3495 * @idle: Idle status of the CPU at which we're performing load-balancing.
3496 */
3497static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3498 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3499{
3500 /*
3501 * Busy processors will not participate in power savings
3502 * balance.
3503 */
3504 if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
3505 sds->power_savings_balance = 0;
3506 else {
3507 sds->power_savings_balance = 1;
3508 sds->min_nr_running = ULONG_MAX;
3509 sds->leader_nr_running = 0;
3510 }
3511}
3512
3513/**
3514 * update_sd_power_savings_stats - Update the power saving stats for a
3515 * sched_domain while performing load balancing.
3516 *
3517 * @group: sched_group belonging to the sched_domain under consideration.
3518 * @sds: Variable containing the statistics of the sched_domain
3519 * @local_group: Does group contain the CPU for which we're performing
3520 * load balancing ?
3521 * @sgs: Variable containing the statistics of the group.
3522 */
3523static inline void update_sd_power_savings_stats(struct sched_group *group,
3524 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3525{
3526
3527 if (!sds->power_savings_balance)
3528 return;
3529
3530 /*
3531 * If the local group is idle or completely loaded
3532 * no need to do power savings balance at this domain
3533 */
3534 if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
3535 !sds->this_nr_running))
3536 sds->power_savings_balance = 0;
3537
3538 /*
3539 * If a group is already running at full capacity or idle,
3540 * don't include that group in power savings calculations
3541 */
3542 if (!sds->power_savings_balance ||
3543 sgs->sum_nr_running >= sgs->group_capacity ||
3544 !sgs->sum_nr_running)
3545 return;
3546
3547 /*
3548 * Calculate the group which has the least non-idle load.
3549 * This is the group from where we need to pick up the load
3550 * for saving power
3551 */
3552 if ((sgs->sum_nr_running < sds->min_nr_running) ||
3553 (sgs->sum_nr_running == sds->min_nr_running &&
3554 group_first_cpu(group) > group_first_cpu(sds->group_min))) {
3555 sds->group_min = group;
3556 sds->min_nr_running = sgs->sum_nr_running;
3557 sds->min_load_per_task = sgs->sum_weighted_load /
3558 sgs->sum_nr_running;
3559 }
3560
3561 /*
3562 * Calculate the group which is almost near its
3563 * capacity but still has some space to pick up some load
3564 * from other group and save more power
3565 */
3566 if (sgs->sum_nr_running + 1 > sgs->group_capacity)
3567 return;
3568
3569 if (sgs->sum_nr_running > sds->leader_nr_running ||
3570 (sgs->sum_nr_running == sds->leader_nr_running &&
3571 group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
3572 sds->group_leader = group;
3573 sds->leader_nr_running = sgs->sum_nr_running;
3574 }
3575}
3576
3577/**
3578 * check_power_save_busiest_group - see if there is potential for some power-savings balance
3579 * @sds: Variable containing the statistics of the sched_domain
3580 * under consideration.
3581 * @this_cpu: Cpu at which we're currently performing load-balancing.
3582 * @imbalance: Variable to store the imbalance.
3583 *
3584 * Description:
3585 * Check if we have potential to perform some power-savings balance.
3586 * If yes, set the busiest group to be the least loaded group in the
3587 * sched_domain, so that it's CPUs can be put to idle.
3588 *
3589 * Returns 1 if there is potential to perform power-savings balance.
3590 * Else returns 0.
3591 */
3592static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3593 int this_cpu, unsigned long *imbalance)
3594{
3595 if (!sds->power_savings_balance)
3596 return 0;
3597
3598 if (sds->this != sds->group_leader ||
3599 sds->group_leader == sds->group_min)
3600 return 0;
3601
3602 *imbalance = sds->min_load_per_task;
3603 sds->busiest = sds->group_min;
3604
3605 return 1;
3606
3607}
3608#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3609static inline void init_sd_power_savings_stats(struct sched_domain *sd,
3610 struct sd_lb_stats *sds, enum cpu_idle_type idle)
3611{
3612 return;
3613}
3614
3615static inline void update_sd_power_savings_stats(struct sched_group *group,
3616 struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
3617{
3618 return;
3619}
3620
3621static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
3622 int this_cpu, unsigned long *imbalance)
3623{
3624 return 0;
3625}
3626#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
3627
3628
3629unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) 3478unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
3630{ 3479{
3631 return SCHED_POWER_SCALE; 3480 return SCHED_POWER_SCALE;
@@ -3654,15 +3503,22 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
3654unsigned long scale_rt_power(int cpu) 3503unsigned long scale_rt_power(int cpu)
3655{ 3504{
3656 struct rq *rq = cpu_rq(cpu); 3505 struct rq *rq = cpu_rq(cpu);
3657 u64 total, available; 3506 u64 total, available, age_stamp, avg;
3507
3508 /*
3509 * Since we're reading these variables without serialization make sure
3510 * we read them once before doing sanity checks on them.
3511 */
3512 age_stamp = ACCESS_ONCE(rq->age_stamp);
3513 avg = ACCESS_ONCE(rq->rt_avg);
3658 3514
3659 total = sched_avg_period() + (rq->clock - rq->age_stamp); 3515 total = sched_avg_period() + (rq->clock - age_stamp);
3660 3516
3661 if (unlikely(total < rq->rt_avg)) { 3517 if (unlikely(total < avg)) {
3662 /* Ensures that power won't end up being negative */ 3518 /* Ensures that power won't end up being negative */
3663 available = 0; 3519 available = 0;
3664 } else { 3520 } else {
3665 available = total - rq->rt_avg; 3521 available = total - avg;
3666 } 3522 }
3667 3523
3668 if (unlikely((s64)total < SCHED_POWER_SCALE)) 3524 if (unlikely((s64)total < SCHED_POWER_SCALE))
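
Note: the scale_rt_power() change is about snapshotting values that another CPU may update concurrently before doing arithmetic on them. Below is a minimal userspace illustration of the pattern; READ_ONCE_U64 and available_power are invented stand-ins, while the kernel itself uses ACCESS_ONCE() on rq->age_stamp and rq->rt_avg.

#include <stdint.h>
#include <stdio.h>

/* userspace stand-in for the kernel's ACCESS_ONCE() */
#define READ_ONCE_U64(x)        (*(volatile uint64_t *)&(x))

static uint64_t rt_avg;         /* updated concurrently in the real code */
static uint64_t age_stamp;      /* ditto */

static uint64_t available_power(uint64_t clock, uint64_t avg_period)
{
        /* read each shared value exactly once, then use the local copies */
        uint64_t stamp = READ_ONCE_U64(age_stamp);
        uint64_t avg = READ_ONCE_U64(rt_avg);
        uint64_t total = avg_period + (clock - stamp);

        if (total < avg)
                return 0;       /* keeps "power" from going negative */
        return total - avg;
}

int main(void)
{
        age_stamp = 1000;
        rt_avg = 300;
        printf("available = %llu\n",
               (unsigned long long)available_power(1500, 1000));
        return 0;
}

Re-reading rt_avg after the total < avg check could observe a newer, larger value and make the subtraction underflow, which is exactly what reading it once into a local avoids.
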
@@ -3725,13 +3581,28 @@ void update_group_power(struct sched_domain *sd, int cpu)
3725 3581
3726 power = 0; 3582 power = 0;
3727 3583
3728 group = child->groups; 3584 if (child->flags & SD_OVERLAP) {
3729 do { 3585 /*
3730 power += group->sgp->power; 3586 * SD_OVERLAP domains cannot assume that child groups
3731 group = group->next; 3587 * span the current group.
3732 } while (group != child->groups); 3588 */
3733 3589
3734 sdg->sgp->power = power; 3590 for_each_cpu(cpu, sched_group_cpus(sdg))
3591 power += power_of(cpu);
3592 } else {
3593 /*
3594 * !SD_OVERLAP domains can assume that child groups
3595 * span the current group.
3596 */
3597
3598 group = child->groups;
3599 do {
3600 power += group->sgp->power;
3601 group = group->next;
3602 } while (group != child->groups);
3603 }
3604
3605 sdg->sgp->power_orig = sdg->sgp->power = power;
3735} 3606}
3736 3607
3737/* 3608/*
@@ -3761,41 +3632,43 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
3761 3632
3762/** 3633/**
3763 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 3634 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
3764 * @sd: The sched_domain whose statistics are to be updated. 3635 * @env: The load balancing environment.
3765 * @group: sched_group whose statistics are to be updated. 3636 * @group: sched_group whose statistics are to be updated.
3766 * @this_cpu: Cpu for which load balance is currently performed.
3767 * @idle: Idle status of this_cpu
3768 * @load_idx: Load index of sched_domain of this_cpu for load calc. 3637 * @load_idx: Load index of sched_domain of this_cpu for load calc.
3769 * @local_group: Does group contain this_cpu. 3638 * @local_group: Does group contain this_cpu.
3770 * @cpus: Set of cpus considered for load balancing. 3639 * @cpus: Set of cpus considered for load balancing.
3771 * @balance: Should we balance. 3640 * @balance: Should we balance.
3772 * @sgs: variable to hold the statistics for this group. 3641 * @sgs: variable to hold the statistics for this group.
3773 */ 3642 */
3774static inline void update_sg_lb_stats(struct sched_domain *sd, 3643static inline void update_sg_lb_stats(struct lb_env *env,
3775 struct sched_group *group, int this_cpu, 3644 struct sched_group *group, int load_idx,
3776 enum cpu_idle_type idle, int load_idx,
3777 int local_group, const struct cpumask *cpus, 3645 int local_group, const struct cpumask *cpus,
3778 int *balance, struct sg_lb_stats *sgs) 3646 int *balance, struct sg_lb_stats *sgs)
3779{ 3647{
3780 unsigned long load, max_cpu_load, min_cpu_load, max_nr_running; 3648 unsigned long nr_running, max_nr_running, min_nr_running;
3781 int i; 3649 unsigned long load, max_cpu_load, min_cpu_load;
3782 unsigned int balance_cpu = -1, first_idle_cpu = 0; 3650 unsigned int balance_cpu = -1, first_idle_cpu = 0;
3783 unsigned long avg_load_per_task = 0; 3651 unsigned long avg_load_per_task = 0;
3652 int i;
3784 3653
3785 if (local_group) 3654 if (local_group)
3786 balance_cpu = group_first_cpu(group); 3655 balance_cpu = group_balance_cpu(group);
3787 3656
3788 /* Tally up the load of all CPUs in the group */ 3657 /* Tally up the load of all CPUs in the group */
3789 max_cpu_load = 0; 3658 max_cpu_load = 0;
3790 min_cpu_load = ~0UL; 3659 min_cpu_load = ~0UL;
3791 max_nr_running = 0; 3660 max_nr_running = 0;
3661 min_nr_running = ~0UL;
3792 3662
3793 for_each_cpu_and(i, sched_group_cpus(group), cpus) { 3663 for_each_cpu_and(i, sched_group_cpus(group), cpus) {
3794 struct rq *rq = cpu_rq(i); 3664 struct rq *rq = cpu_rq(i);
3795 3665
3666 nr_running = rq->nr_running;
3667
3796 /* Bias balancing toward cpus of our domain */ 3668 /* Bias balancing toward cpus of our domain */
3797 if (local_group) { 3669 if (local_group) {
3798 if (idle_cpu(i) && !first_idle_cpu) { 3670 if (idle_cpu(i) && !first_idle_cpu &&
3671 cpumask_test_cpu(i, sched_group_mask(group))) {
3799 first_idle_cpu = 1; 3672 first_idle_cpu = 1;
3800 balance_cpu = i; 3673 balance_cpu = i;
3801 } 3674 }
@@ -3803,16 +3676,19 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3803 load = target_load(i, load_idx); 3676 load = target_load(i, load_idx);
3804 } else { 3677 } else {
3805 load = source_load(i, load_idx); 3678 load = source_load(i, load_idx);
3806 if (load > max_cpu_load) { 3679 if (load > max_cpu_load)
3807 max_cpu_load = load; 3680 max_cpu_load = load;
3808 max_nr_running = rq->nr_running;
3809 }
3810 if (min_cpu_load > load) 3681 if (min_cpu_load > load)
3811 min_cpu_load = load; 3682 min_cpu_load = load;
3683
3684 if (nr_running > max_nr_running)
3685 max_nr_running = nr_running;
3686 if (min_nr_running > nr_running)
3687 min_nr_running = nr_running;
3812 } 3688 }
3813 3689
3814 sgs->group_load += load; 3690 sgs->group_load += load;
3815 sgs->sum_nr_running += rq->nr_running; 3691 sgs->sum_nr_running += nr_running;
3816 sgs->sum_weighted_load += weighted_cpuload(i); 3692 sgs->sum_weighted_load += weighted_cpuload(i);
3817 if (idle_cpu(i)) 3693 if (idle_cpu(i))
3818 sgs->idle_cpus++; 3694 sgs->idle_cpus++;
@@ -3825,14 +3701,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3825 * to do the newly idle load balance. 3701 * to do the newly idle load balance.
3826 */ 3702 */
3827 if (local_group) { 3703 if (local_group) {
3828 if (idle != CPU_NEWLY_IDLE) { 3704 if (env->idle != CPU_NEWLY_IDLE) {
3829 if (balance_cpu != this_cpu) { 3705 if (balance_cpu != env->dst_cpu) {
3830 *balance = 0; 3706 *balance = 0;
3831 return; 3707 return;
3832 } 3708 }
3833 update_group_power(sd, this_cpu); 3709 update_group_power(env->sd, env->dst_cpu);
3834 } else if (time_after_eq(jiffies, group->sgp->next_update)) 3710 } else if (time_after_eq(jiffies, group->sgp->next_update))
3835 update_group_power(sd, this_cpu); 3711 update_group_power(env->sd, env->dst_cpu);
3836 } 3712 }
3837 3713
3838 /* Adjust by relative CPU power of the group */ 3714 /* Adjust by relative CPU power of the group */
@@ -3850,13 +3726,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3850 if (sgs->sum_nr_running) 3726 if (sgs->sum_nr_running)
3851 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 3727 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
3852 3728
3853 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1) 3729 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
3730 (max_nr_running - min_nr_running) > 1)
3854 sgs->group_imb = 1; 3731 sgs->group_imb = 1;
3855 3732
3856 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power, 3733 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
3857 SCHED_POWER_SCALE); 3734 SCHED_POWER_SCALE);
3858 if (!sgs->group_capacity) 3735 if (!sgs->group_capacity)
3859 sgs->group_capacity = fix_small_capacity(sd, group); 3736 sgs->group_capacity = fix_small_capacity(env->sd, group);
3860 sgs->group_weight = group->group_weight; 3737 sgs->group_weight = group->group_weight;
3861 3738
3862 if (sgs->group_capacity > sgs->sum_nr_running) 3739 if (sgs->group_capacity > sgs->sum_nr_running)
@@ -3865,20 +3742,18 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
3865 3742
3866/** 3743/**
3867 * update_sd_pick_busiest - return 1 on busiest group 3744 * update_sd_pick_busiest - return 1 on busiest group
3868 * @sd: sched_domain whose statistics are to be checked 3745 * @env: The load balancing environment.
3869 * @sds: sched_domain statistics 3746 * @sds: sched_domain statistics
3870 * @sg: sched_group candidate to be checked for being the busiest 3747 * @sg: sched_group candidate to be checked for being the busiest
3871 * @sgs: sched_group statistics 3748 * @sgs: sched_group statistics
3872 * @this_cpu: the current cpu
3873 * 3749 *
3874 * Determine if @sg is a busier group than the previously selected 3750 * Determine if @sg is a busier group than the previously selected
3875 * busiest group. 3751 * busiest group.
3876 */ 3752 */
3877static bool update_sd_pick_busiest(struct sched_domain *sd, 3753static bool update_sd_pick_busiest(struct lb_env *env,
3878 struct sd_lb_stats *sds, 3754 struct sd_lb_stats *sds,
3879 struct sched_group *sg, 3755 struct sched_group *sg,
3880 struct sg_lb_stats *sgs, 3756 struct sg_lb_stats *sgs)
3881 int this_cpu)
3882{ 3757{
3883 if (sgs->avg_load <= sds->max_load) 3758 if (sgs->avg_load <= sds->max_load)
3884 return false; 3759 return false;
@@ -3894,8 +3769,8 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
3894 * numbered CPUs in the group, therefore mark all groups 3769 * numbered CPUs in the group, therefore mark all groups
3895 * higher than ourself as busy. 3770 * higher than ourself as busy.
3896 */ 3771 */
3897 if ((sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && 3772 if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
3898 this_cpu < group_first_cpu(sg)) { 3773 env->dst_cpu < group_first_cpu(sg)) {
3899 if (!sds->busiest) 3774 if (!sds->busiest)
3900 return true; 3775 return true;
3901 3776
@@ -3908,35 +3783,32 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
3908 3783
3909/** 3784/**
3910 * update_sd_lb_stats - Update sched_domain's statistics for load balancing. 3785 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
3911 * @sd: sched_domain whose statistics are to be updated. 3786 * @env: The load balancing environment.
3912 * @this_cpu: Cpu for which load balance is currently performed.
3913 * @idle: Idle status of this_cpu
3914 * @cpus: Set of cpus considered for load balancing. 3787 * @cpus: Set of cpus considered for load balancing.
3915 * @balance: Should we balance. 3788 * @balance: Should we balance.
3916 * @sds: variable to hold the statistics for this sched_domain. 3789 * @sds: variable to hold the statistics for this sched_domain.
3917 */ 3790 */
3918static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu, 3791static inline void update_sd_lb_stats(struct lb_env *env,
3919 enum cpu_idle_type idle, const struct cpumask *cpus, 3792 const struct cpumask *cpus,
3920 int *balance, struct sd_lb_stats *sds) 3793 int *balance, struct sd_lb_stats *sds)
3921{ 3794{
3922 struct sched_domain *child = sd->child; 3795 struct sched_domain *child = env->sd->child;
3923 struct sched_group *sg = sd->groups; 3796 struct sched_group *sg = env->sd->groups;
3924 struct sg_lb_stats sgs; 3797 struct sg_lb_stats sgs;
3925 int load_idx, prefer_sibling = 0; 3798 int load_idx, prefer_sibling = 0;
3926 3799
3927 if (child && child->flags & SD_PREFER_SIBLING) 3800 if (child && child->flags & SD_PREFER_SIBLING)
3928 prefer_sibling = 1; 3801 prefer_sibling = 1;
3929 3802
3930 init_sd_power_savings_stats(sd, sds, idle); 3803 load_idx = get_sd_load_idx(env->sd, env->idle);
3931 load_idx = get_sd_load_idx(sd, idle);
3932 3804
3933 do { 3805 do {
3934 int local_group; 3806 int local_group;
3935 3807
3936 local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg)); 3808 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
3937 memset(&sgs, 0, sizeof(sgs)); 3809 memset(&sgs, 0, sizeof(sgs));
3938 update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, 3810 update_sg_lb_stats(env, sg, load_idx, local_group,
3939 local_group, cpus, balance, &sgs); 3811 cpus, balance, &sgs);
3940 3812
3941 if (local_group && !(*balance)) 3813 if (local_group && !(*balance))
3942 return; 3814 return;
@@ -3964,7 +3836,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3964 sds->this_load_per_task = sgs.sum_weighted_load; 3836 sds->this_load_per_task = sgs.sum_weighted_load;
3965 sds->this_has_capacity = sgs.group_has_capacity; 3837 sds->this_has_capacity = sgs.group_has_capacity;
3966 sds->this_idle_cpus = sgs.idle_cpus; 3838 sds->this_idle_cpus = sgs.idle_cpus;
3967 } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) { 3839 } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
3968 sds->max_load = sgs.avg_load; 3840 sds->max_load = sgs.avg_load;
3969 sds->busiest = sg; 3841 sds->busiest = sg;
3970 sds->busiest_nr_running = sgs.sum_nr_running; 3842 sds->busiest_nr_running = sgs.sum_nr_running;
@@ -3976,9 +3848,8 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3976 sds->group_imb = sgs.group_imb; 3848 sds->group_imb = sgs.group_imb;
3977 } 3849 }
3978 3850
3979 update_sd_power_savings_stats(sg, sds, local_group, &sgs);
3980 sg = sg->next; 3851 sg = sg->next;
3981 } while (sg != sd->groups); 3852 } while (sg != env->sd->groups);
3982} 3853}
3983 3854
3984/** 3855/**
@@ -4001,29 +3872,26 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
4001 * Returns 1 when packing is required and a task should be moved to 3872 * Returns 1 when packing is required and a task should be moved to
4002 * this CPU. The amount of the imbalance is returned in *imbalance. 3873 * this CPU. The amount of the imbalance is returned in *imbalance.
4003 * 3874 *
4004 * @sd: The sched_domain whose packing is to be checked. 3875 * @env: The load balancing environment.
4005 * @sds: Statistics of the sched_domain which is to be packed 3876 * @sds: Statistics of the sched_domain which is to be packed
4006 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
4007 * @imbalance: returns amount of imbalanced due to packing.
4008 */ 3877 */
4009static int check_asym_packing(struct sched_domain *sd, 3878static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
4010 struct sd_lb_stats *sds,
4011 int this_cpu, unsigned long *imbalance)
4012{ 3879{
4013 int busiest_cpu; 3880 int busiest_cpu;
4014 3881
4015 if (!(sd->flags & SD_ASYM_PACKING)) 3882 if (!(env->sd->flags & SD_ASYM_PACKING))
4016 return 0; 3883 return 0;
4017 3884
4018 if (!sds->busiest) 3885 if (!sds->busiest)
4019 return 0; 3886 return 0;
4020 3887
4021 busiest_cpu = group_first_cpu(sds->busiest); 3888 busiest_cpu = group_first_cpu(sds->busiest);
4022 if (this_cpu > busiest_cpu) 3889 if (env->dst_cpu > busiest_cpu)
4023 return 0; 3890 return 0;
4024 3891
4025 *imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->sgp->power, 3892 env->imbalance = DIV_ROUND_CLOSEST(
4026 SCHED_POWER_SCALE); 3893 sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
3894
4027 return 1; 3895 return 1;
4028} 3896}
4029 3897
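check_asym_packing() now stores its result in env->imbalance rather than through an *imbalance pointer; the value computed is unchanged: the busiest group's load scaled by that group's cpu power. A standalone arithmetic sketch, assuming SCHED_POWER_SCALE is 1024 (its customary value) and using invented load and power numbers.

#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL	/* assumed: 1 << 10 */

/* Round-to-nearest division for positive operands, in the spirit of
 * DIV_ROUND_CLOSEST. */
static unsigned long div_round_closest(unsigned long x, unsigned long d)
{
	return (x + d / 2) / d;
}

int main(void)
{
	unsigned long max_load = 1536;		/* busiest group's avg_load    */
	unsigned long group_power = 2048;	/* e.g. two default-power CPUs */

	unsigned long imbalance =
		div_round_closest(max_load * group_power, SCHED_POWER_SCALE);

	printf("imbalance = %lu\n", imbalance);	/* 3072 */
	return 0;
}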
@@ -4031,12 +3899,11 @@ static int check_asym_packing(struct sched_domain *sd,
4031 * fix_small_imbalance - Calculate the minor imbalance that exists 3899 * fix_small_imbalance - Calculate the minor imbalance that exists
4032 * amongst the groups of a sched_domain, during 3900 * amongst the groups of a sched_domain, during
4033 * load balancing. 3901 * load balancing.
3902 * @env: The load balancing environment.
4034 * @sds: Statistics of the sched_domain whose imbalance is to be calculated. 3903 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
4035 * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
4036 * @imbalance: Variable to store the imbalance.
4037 */ 3904 */
4038static inline void fix_small_imbalance(struct sd_lb_stats *sds, 3905static inline
4039 int this_cpu, unsigned long *imbalance) 3906void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4040{ 3907{
4041 unsigned long tmp, pwr_now = 0, pwr_move = 0; 3908 unsigned long tmp, pwr_now = 0, pwr_move = 0;
4042 unsigned int imbn = 2; 3909 unsigned int imbn = 2;
@@ -4047,9 +3914,10 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
4047 if (sds->busiest_load_per_task > 3914 if (sds->busiest_load_per_task >
4048 sds->this_load_per_task) 3915 sds->this_load_per_task)
4049 imbn = 1; 3916 imbn = 1;
4050 } else 3917 } else {
4051 sds->this_load_per_task = 3918 sds->this_load_per_task =
4052 cpu_avg_load_per_task(this_cpu); 3919 cpu_avg_load_per_task(env->dst_cpu);
3920 }
4053 3921
4054 scaled_busy_load_per_task = sds->busiest_load_per_task 3922 scaled_busy_load_per_task = sds->busiest_load_per_task
4055 * SCHED_POWER_SCALE; 3923 * SCHED_POWER_SCALE;
@@ -4057,7 +3925,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
4057 3925
4058 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >= 3926 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
4059 (scaled_busy_load_per_task * imbn)) { 3927 (scaled_busy_load_per_task * imbn)) {
4060 *imbalance = sds->busiest_load_per_task; 3928 env->imbalance = sds->busiest_load_per_task;
4061 return; 3929 return;
4062 } 3930 }
4063 3931
@@ -4094,18 +3962,16 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
4094 3962
4095 /* Move if we gain throughput */ 3963 /* Move if we gain throughput */
4096 if (pwr_move > pwr_now) 3964 if (pwr_move > pwr_now)
4097 *imbalance = sds->busiest_load_per_task; 3965 env->imbalance = sds->busiest_load_per_task;
4098} 3966}
4099 3967
4100/** 3968/**
4101 * calculate_imbalance - Calculate the amount of imbalance present within the 3969 * calculate_imbalance - Calculate the amount of imbalance present within the
4102 * groups of a given sched_domain during load balance. 3970 * groups of a given sched_domain during load balance.
3971 * @env: load balance environment
4103 * @sds: statistics of the sched_domain whose imbalance is to be calculated. 3972 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
4104 * @this_cpu: Cpu for which currently load balance is being performed.
4105 * @imbalance: The variable to store the imbalance.
4106 */ 3973 */
4107static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu, 3974static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
4108 unsigned long *imbalance)
4109{ 3975{
4110 unsigned long max_pull, load_above_capacity = ~0UL; 3976 unsigned long max_pull, load_above_capacity = ~0UL;
4111 3977
@@ -4121,8 +3987,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4121 * its cpu_power, while calculating max_load..) 3987 * its cpu_power, while calculating max_load..)
4122 */ 3988 */
4123 if (sds->max_load < sds->avg_load) { 3989 if (sds->max_load < sds->avg_load) {
4124 *imbalance = 0; 3990 env->imbalance = 0;
4125 return fix_small_imbalance(sds, this_cpu, imbalance); 3991 return fix_small_imbalance(env, sds);
4126 } 3992 }
4127 3993
4128 if (!sds->group_imb) { 3994 if (!sds->group_imb) {
@@ -4150,7 +4016,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4150 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity); 4016 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
4151 4017
4152 /* How much load to actually move to equalise the imbalance */ 4018 /* How much load to actually move to equalise the imbalance */
4153 *imbalance = min(max_pull * sds->busiest->sgp->power, 4019 env->imbalance = min(max_pull * sds->busiest->sgp->power,
4154 (sds->avg_load - sds->this_load) * sds->this->sgp->power) 4020 (sds->avg_load - sds->this_load) * sds->this->sgp->power)
4155 / SCHED_POWER_SCALE; 4021 / SCHED_POWER_SCALE;
4156 4022
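The imbalance formula itself is untouched by the conversion; only its destination changes from *imbalance to env->imbalance. A worked standalone example of the min() between what the busiest group has above the average and what the local group can absorb below it, using invented statistics and ignoring the load_above_capacity clamp shown in the surrounding context.

#include <stdio.h>

#define SCHED_POWER_SCALE 1024UL	/* assumed scale, as in the sketch above */

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned long max_load  = 2048;	/* busiest group  */
	unsigned long this_load = 512;	/* local group    */
	unsigned long avg_load  = 1280;	/* domain average */
	unsigned long busiest_power = 1024, this_power = 1024;

	unsigned long max_pull = max_load - avg_load;	/* 768 */

	/* Move the smaller of: the busiest group's excess over the average,
	 * and the local group's head-room below it. */
	unsigned long imbalance =
		min_ul(max_pull * busiest_power,
		       (avg_load - this_load) * this_power) / SCHED_POWER_SCALE;

	printf("imbalance = %lu\n", imbalance);	/* 768 */
	return 0;
}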
@@ -4160,8 +4026,8 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4160 * a think about bumping its value to force at least one task to be 4026 * a think about bumping its value to force at least one task to be
4161 * moved 4027 * moved
4162 */ 4028 */
4163 if (*imbalance < sds->busiest_load_per_task) 4029 if (env->imbalance < sds->busiest_load_per_task)
4164 return fix_small_imbalance(sds, this_cpu, imbalance); 4030 return fix_small_imbalance(env, sds);
4165 4031
4166} 4032}
4167 4033
@@ -4177,11 +4043,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4177 * Also calculates the amount of weighted load which should be moved 4043 * Also calculates the amount of weighted load which should be moved
4178 * to restore balance. 4044 * to restore balance.
4179 * 4045 *
4180 * @sd: The sched_domain whose busiest group is to be returned. 4046 * @env: The load balancing environment.
4181 * @this_cpu: The cpu for which load balancing is currently being performed.
4182 * @imbalance: Variable which stores amount of weighted load which should
4183 * be moved to restore balance/put a group to idle.
4184 * @idle: The idle status of this_cpu.
4185 * @cpus: The set of CPUs under consideration for load-balancing. 4047 * @cpus: The set of CPUs under consideration for load-balancing.
4186 * @balance: Pointer to a variable indicating if this_cpu 4048 * @balance: Pointer to a variable indicating if this_cpu
4187 * is the appropriate cpu to perform load balancing at this_level. 4049 * is the appropriate cpu to perform load balancing at this_level.
@@ -4192,9 +4054,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
4192 * put to idle by rebalancing its tasks onto our group. 4054 * put to idle by rebalancing its tasks onto our group.
4193 */ 4055 */
4194static struct sched_group * 4056static struct sched_group *
4195find_busiest_group(struct sched_domain *sd, int this_cpu, 4057find_busiest_group(struct lb_env *env, const struct cpumask *cpus, int *balance)
4196 unsigned long *imbalance, enum cpu_idle_type idle,
4197 const struct cpumask *cpus, int *balance)
4198{ 4058{
4199 struct sd_lb_stats sds; 4059 struct sd_lb_stats sds;
4200 4060
@@ -4204,7 +4064,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4204 * Compute the various statistics relavent for load balancing at 4064 * Compute the various statistics relavent for load balancing at
4205 * this level. 4065 * this level.
4206 */ 4066 */
4207 update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds); 4067 update_sd_lb_stats(env, cpus, balance, &sds);
4208 4068
4209 /* 4069 /*
4210 * this_cpu is not the appropriate cpu to perform load balancing at 4070 * this_cpu is not the appropriate cpu to perform load balancing at
@@ -4213,8 +4073,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4213 if (!(*balance)) 4073 if (!(*balance))
4214 goto ret; 4074 goto ret;
4215 4075
4216 if ((idle == CPU_IDLE || idle == CPU_NEWLY_IDLE) && 4076 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
4217 check_asym_packing(sd, &sds, this_cpu, imbalance)) 4077 check_asym_packing(env, &sds))
4218 return sds.busiest; 4078 return sds.busiest;
4219 4079
4220 /* There is no busy sibling group to pull tasks from */ 4080 /* There is no busy sibling group to pull tasks from */
@@ -4232,7 +4092,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4232 goto force_balance; 4092 goto force_balance;
4233 4093
4234 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ 4094 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
4235 if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity && 4095 if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
4236 !sds.busiest_has_capacity) 4096 !sds.busiest_has_capacity)
4237 goto force_balance; 4097 goto force_balance;
4238 4098
@@ -4250,7 +4110,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4250 if (sds.this_load >= sds.avg_load) 4110 if (sds.this_load >= sds.avg_load)
4251 goto out_balanced; 4111 goto out_balanced;
4252 4112
4253 if (idle == CPU_IDLE) { 4113 if (env->idle == CPU_IDLE) {
4254 /* 4114 /*
4255 * This cpu is idle. If the busiest group load doesn't 4115 * This cpu is idle. If the busiest group load doesn't
4256 * have more tasks than the number of available cpu's and 4116 * have more tasks than the number of available cpu's and
@@ -4265,34 +4125,27 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
4265 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use 4125 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
4266 * imbalance_pct to be conservative. 4126 * imbalance_pct to be conservative.
4267 */ 4127 */
4268 if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load) 4128 if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
4269 goto out_balanced; 4129 goto out_balanced;
4270 } 4130 }
4271 4131
4272force_balance: 4132force_balance:
4273 /* Looks like there is an imbalance. Compute it */ 4133 /* Looks like there is an imbalance. Compute it */
4274 calculate_imbalance(&sds, this_cpu, imbalance); 4134 calculate_imbalance(env, &sds);
4275 return sds.busiest; 4135 return sds.busiest;
4276 4136
4277out_balanced: 4137out_balanced:
4278 /*
4279 * There is no obvious imbalance. But check if we can do some balancing
4280 * to save power.
4281 */
4282 if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
4283 return sds.busiest;
4284ret: 4138ret:
4285 *imbalance = 0; 4139 env->imbalance = 0;
4286 return NULL; 4140 return NULL;
4287} 4141}
4288 4142
4289/* 4143/*
4290 * find_busiest_queue - find the busiest runqueue among the cpus in group. 4144 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4291 */ 4145 */
4292static struct rq * 4146static struct rq *find_busiest_queue(struct lb_env *env,
4293find_busiest_queue(struct sched_domain *sd, struct sched_group *group, 4147 struct sched_group *group,
4294 enum cpu_idle_type idle, unsigned long imbalance, 4148 const struct cpumask *cpus)
4295 const struct cpumask *cpus)
4296{ 4149{
4297 struct rq *busiest = NULL, *rq; 4150 struct rq *busiest = NULL, *rq;
4298 unsigned long max_load = 0; 4151 unsigned long max_load = 0;
@@ -4305,7 +4158,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4305 unsigned long wl; 4158 unsigned long wl;
4306 4159
4307 if (!capacity) 4160 if (!capacity)
4308 capacity = fix_small_capacity(sd, group); 4161 capacity = fix_small_capacity(env->sd, group);
4309 4162
4310 if (!cpumask_test_cpu(i, cpus)) 4163 if (!cpumask_test_cpu(i, cpus))
4311 continue; 4164 continue;
@@ -4317,7 +4170,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4317 * When comparing with imbalance, use weighted_cpuload() 4170 * When comparing with imbalance, use weighted_cpuload()
4318 * which is not scaled with the cpu power. 4171 * which is not scaled with the cpu power.
4319 */ 4172 */
4320 if (capacity && rq->nr_running == 1 && wl > imbalance) 4173 if (capacity && rq->nr_running == 1 && wl > env->imbalance)
4321 continue; 4174 continue;
4322 4175
4323 /* 4176 /*
@@ -4346,40 +4199,19 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4346/* Working cpumask for load_balance and load_balance_newidle. */ 4199/* Working cpumask for load_balance and load_balance_newidle. */
4347DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 4200DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4348 4201
4349static int need_active_balance(struct sched_domain *sd, int idle, 4202static int need_active_balance(struct lb_env *env)
4350 int busiest_cpu, int this_cpu)
4351{ 4203{
4352 if (idle == CPU_NEWLY_IDLE) { 4204 struct sched_domain *sd = env->sd;
4205
4206 if (env->idle == CPU_NEWLY_IDLE) {
4353 4207
4354 /* 4208 /*
4355 * ASYM_PACKING needs to force migrate tasks from busy but 4209 * ASYM_PACKING needs to force migrate tasks from busy but
4356 * higher numbered CPUs in order to pack all tasks in the 4210 * higher numbered CPUs in order to pack all tasks in the
4357 * lowest numbered CPUs. 4211 * lowest numbered CPUs.
4358 */ 4212 */
4359 if ((sd->flags & SD_ASYM_PACKING) && busiest_cpu > this_cpu) 4213 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
4360 return 1; 4214 return 1;
4361
4362 /*
4363 * The only task running in a non-idle cpu can be moved to this
4364 * cpu in an attempt to completely freeup the other CPU
4365 * package.
4366 *
4367 * The package power saving logic comes from
4368 * find_busiest_group(). If there are no imbalance, then
4369 * f_b_g() will return NULL. However when sched_mc={1,2} then
4370 * f_b_g() will select a group from which a running task may be
4371 * pulled to this cpu in order to make the other package idle.
4372 * If there is no opportunity to make a package idle and if
4373 * there are no imbalance, then f_b_g() will return NULL and no
4374 * action will be taken in load_balance_newidle().
4375 *
4376 * Under normal task pull operation due to imbalance, there
4377 * will be more than one task in the source run queue and
4378 * move_tasks() will succeed. ld_moved will be true and this
4379 * active balance code will not be triggered.
4380 */
4381 if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
4382 return 0;
4383 } 4215 }
4384 4216
4385 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); 4217 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
@@ -4397,7 +4229,6 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4397{ 4229{
4398 int ld_moved, active_balance = 0; 4230 int ld_moved, active_balance = 0;
4399 struct sched_group *group; 4231 struct sched_group *group;
4400 unsigned long imbalance;
4401 struct rq *busiest; 4232 struct rq *busiest;
4402 unsigned long flags; 4233 unsigned long flags;
4403 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); 4234 struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
@@ -4407,7 +4238,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4407 .dst_cpu = this_cpu, 4238 .dst_cpu = this_cpu,
4408 .dst_rq = this_rq, 4239 .dst_rq = this_rq,
4409 .idle = idle, 4240 .idle = idle,
4410 .loop_break = sysctl_sched_nr_migrate, 4241 .loop_break = sched_nr_migrate_break,
4411 }; 4242 };
4412 4243
4413 cpumask_copy(cpus, cpu_active_mask); 4244 cpumask_copy(cpus, cpu_active_mask);
@@ -4415,8 +4246,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4415 schedstat_inc(sd, lb_count[idle]); 4246 schedstat_inc(sd, lb_count[idle]);
4416 4247
4417redo: 4248redo:
4418 group = find_busiest_group(sd, this_cpu, &imbalance, idle, 4249 group = find_busiest_group(&env, cpus, balance);
4419 cpus, balance);
4420 4250
4421 if (*balance == 0) 4251 if (*balance == 0)
4422 goto out_balanced; 4252 goto out_balanced;
@@ -4426,7 +4256,7 @@ redo:
4426 goto out_balanced; 4256 goto out_balanced;
4427 } 4257 }
4428 4258
4429 busiest = find_busiest_queue(sd, group, idle, imbalance, cpus); 4259 busiest = find_busiest_queue(&env, group, cpus);
4430 if (!busiest) { 4260 if (!busiest) {
4431 schedstat_inc(sd, lb_nobusyq[idle]); 4261 schedstat_inc(sd, lb_nobusyq[idle]);
4432 goto out_balanced; 4262 goto out_balanced;
@@ -4434,7 +4264,7 @@ redo:
4434 4264
4435 BUG_ON(busiest == this_rq); 4265 BUG_ON(busiest == this_rq);
4436 4266
4437 schedstat_add(sd, lb_imbalance[idle], imbalance); 4267 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
4438 4268
4439 ld_moved = 0; 4269 ld_moved = 0;
4440 if (busiest->nr_running > 1) { 4270 if (busiest->nr_running > 1) {
@@ -4445,10 +4275,9 @@ redo:
4445 * correctly treated as an imbalance. 4275 * correctly treated as an imbalance.
4446 */ 4276 */
4447 env.flags |= LBF_ALL_PINNED; 4277 env.flags |= LBF_ALL_PINNED;
4448 env.load_move = imbalance; 4278 env.src_cpu = busiest->cpu;
4449 env.src_cpu = busiest->cpu; 4279 env.src_rq = busiest;
4450 env.src_rq = busiest; 4280 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
4451 env.loop_max = busiest->nr_running;
4452 4281
4453more_balance: 4282more_balance:
4454 local_irq_save(flags); 4283 local_irq_save(flags);
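Besides moving src_cpu/src_rq into the environment, the hunk above caps env.loop_max at sysctl_sched_nr_migrate instead of scanning every task on the busiest runqueue. A trivial standalone illustration of the cap, assuming the sysctl's usual default of 32 (it is tunable at runtime).

#include <stdio.h>

static unsigned int min_u(unsigned int a, unsigned int b)
{
	return a < b ? a : b;
}

int main(void)
{
	unsigned int nr_migrate = 32;	/* assumed sysctl_sched_nr_migrate default */
	unsigned int nr_running[] = { 3, 32, 500 };

	for (int i = 0; i < 3; i++)
		printf("busiest nr_running=%3u -> loop_max=%u\n",
		       nr_running[i], min_u(nr_migrate, nr_running[i]));
	return 0;
}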
@@ -4490,7 +4319,7 @@ more_balance:
4490 if (idle != CPU_NEWLY_IDLE) 4319 if (idle != CPU_NEWLY_IDLE)
4491 sd->nr_balance_failed++; 4320 sd->nr_balance_failed++;
4492 4321
4493 if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) { 4322 if (need_active_balance(&env)) {
4494 raw_spin_lock_irqsave(&busiest->lock, flags); 4323 raw_spin_lock_irqsave(&busiest->lock, flags);
4495 4324
4496 /* don't kick the active_load_balance_cpu_stop, 4325 /* don't kick the active_load_balance_cpu_stop,
@@ -4517,10 +4346,11 @@ more_balance:
4517 } 4346 }
4518 raw_spin_unlock_irqrestore(&busiest->lock, flags); 4347 raw_spin_unlock_irqrestore(&busiest->lock, flags);
4519 4348
4520 if (active_balance) 4349 if (active_balance) {
4521 stop_one_cpu_nowait(cpu_of(busiest), 4350 stop_one_cpu_nowait(cpu_of(busiest),
4522 active_load_balance_cpu_stop, busiest, 4351 active_load_balance_cpu_stop, busiest,
4523 &busiest->active_balance_work); 4352 &busiest->active_balance_work);
4353 }
4524 4354
4525 /* 4355 /*
4526 * We've kicked active balancing, reset the failure 4356 * We've kicked active balancing, reset the failure
@@ -4701,104 +4531,15 @@ static struct {
4701 unsigned long next_balance; /* in jiffy units */ 4531 unsigned long next_balance; /* in jiffy units */
4702} nohz ____cacheline_aligned; 4532} nohz ____cacheline_aligned;
4703 4533
4704#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 4534static inline int find_new_ilb(int call_cpu)
4705/**
4706 * lowest_flag_domain - Return lowest sched_domain containing flag.
4707 * @cpu: The cpu whose lowest level of sched domain is to
4708 * be returned.
4709 * @flag: The flag to check for the lowest sched_domain
4710 * for the given cpu.
4711 *
4712 * Returns the lowest sched_domain of a cpu which contains the given flag.
4713 */
4714static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4715{
4716 struct sched_domain *sd;
4717
4718 for_each_domain(cpu, sd)
4719 if (sd->flags & flag)
4720 break;
4721
4722 return sd;
4723}
4724
4725/**
4726 * for_each_flag_domain - Iterates over sched_domains containing the flag.
4727 * @cpu: The cpu whose domains we're iterating over.
4728 * @sd: variable holding the value of the power_savings_sd
4729 * for cpu.
4730 * @flag: The flag to filter the sched_domains to be iterated.
4731 *
4732 * Iterates over all the scheduler domains for a given cpu that has the 'flag'
4733 * set, starting from the lowest sched_domain to the highest.
4734 */
4735#define for_each_flag_domain(cpu, sd, flag) \
4736 for (sd = lowest_flag_domain(cpu, flag); \
4737 (sd && (sd->flags & flag)); sd = sd->parent)
4738
4739/**
4740 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4741 * @cpu: The cpu which is nominating a new idle_load_balancer.
4742 *
4743 * Returns: Returns the id of the idle load balancer if it exists,
4744 * Else, returns >= nr_cpu_ids.
4745 *
4746 * This algorithm picks the idle load balancer such that it belongs to a
4747 * semi-idle powersavings sched_domain. The idea is to try and avoid
4748 * completely idle packages/cores just for the purpose of idle load balancing
4749 * when there are other idle cpu's which are better suited for that job.
4750 */
4751static int find_new_ilb(int cpu)
4752{ 4535{
4753 int ilb = cpumask_first(nohz.idle_cpus_mask); 4536 int ilb = cpumask_first(nohz.idle_cpus_mask);
4754 struct sched_group *ilbg;
4755 struct sched_domain *sd;
4756
4757 /*
4758 * Have idle load balancer selection from semi-idle packages only
4759 * when power-aware load balancing is enabled
4760 */
4761 if (!(sched_smt_power_savings || sched_mc_power_savings))
4762 goto out_done;
4763
4764 /*
4765 * Optimize for the case when we have no idle CPUs or only one
4766 * idle CPU. Don't walk the sched_domain hierarchy in such cases
4767 */
4768 if (cpumask_weight(nohz.idle_cpus_mask) < 2)
4769 goto out_done;
4770 4537
4771 rcu_read_lock();
4772 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4773 ilbg = sd->groups;
4774
4775 do {
4776 if (ilbg->group_weight !=
4777 atomic_read(&ilbg->sgp->nr_busy_cpus)) {
4778 ilb = cpumask_first_and(nohz.idle_cpus_mask,
4779 sched_group_cpus(ilbg));
4780 goto unlock;
4781 }
4782
4783 ilbg = ilbg->next;
4784
4785 } while (ilbg != sd->groups);
4786 }
4787unlock:
4788 rcu_read_unlock();
4789
4790out_done:
4791 if (ilb < nr_cpu_ids && idle_cpu(ilb)) 4538 if (ilb < nr_cpu_ids && idle_cpu(ilb))
4792 return ilb; 4539 return ilb;
4793 4540
4794 return nr_cpu_ids; 4541 return nr_cpu_ids;
4795} 4542}
4796#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4797static inline int find_new_ilb(int call_cpu)
4798{
4799 return nr_cpu_ids;
4800}
4801#endif
4802 4543
4803/* 4544/*
4804 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the 4545 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
@@ -5021,7 +4762,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
5021 4762
5022 raw_spin_lock_irq(&this_rq->lock); 4763 raw_spin_lock_irq(&this_rq->lock);
5023 update_rq_clock(this_rq); 4764 update_rq_clock(this_rq);
5024 update_cpu_load(this_rq); 4765 update_idle_cpu_load(this_rq);
5025 raw_spin_unlock_irq(&this_rq->lock); 4766 raw_spin_unlock_irq(&this_rq->lock);
5026 4767
5027 rebalance_domains(balance_cpu, CPU_IDLE); 4768 rebalance_domains(balance_cpu, CPU_IDLE);
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index e61fd73913d0..de00a486c5c6 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -68,3 +68,4 @@ SCHED_FEAT(TTWU_QUEUE, true)
68 68
69SCHED_FEAT(FORCE_SD_OVERLAP, false) 69SCHED_FEAT(FORCE_SD_OVERLAP, false)
70SCHED_FEAT(RT_RUNTIME_SHARE, true) 70SCHED_FEAT(RT_RUNTIME_SHARE, true)
71SCHED_FEAT(LB_MIN, false)
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 91b4c957f289..b44d604b35d1 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -4,7 +4,7 @@
4 * idle-task scheduling class. 4 * idle-task scheduling class.
5 * 5 *
6 * (NOTE: these are not related to SCHED_IDLE tasks which are 6 * (NOTE: these are not related to SCHED_IDLE tasks which are
7 * handled in sched_fair.c) 7 * handled in sched/fair.c)
8 */ 8 */
9 9
10#ifdef CONFIG_SMP 10#ifdef CONFIG_SMP
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 44af55e6d5d0..573e1ca01102 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -274,13 +274,16 @@ static void update_rt_migration(struct rt_rq *rt_rq)
274 274
275static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 275static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
276{ 276{
277 struct task_struct *p;
278
277 if (!rt_entity_is_task(rt_se)) 279 if (!rt_entity_is_task(rt_se))
278 return; 280 return;
279 281
282 p = rt_task_of(rt_se);
280 rt_rq = &rq_of_rt_rq(rt_rq)->rt; 283 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
281 284
282 rt_rq->rt_nr_total++; 285 rt_rq->rt_nr_total++;
283 if (rt_se->nr_cpus_allowed > 1) 286 if (p->nr_cpus_allowed > 1)
284 rt_rq->rt_nr_migratory++; 287 rt_rq->rt_nr_migratory++;
285 288
286 update_rt_migration(rt_rq); 289 update_rt_migration(rt_rq);
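Throughout rt.c the allowed-CPU count is now read from the task itself (p->nr_cpus_allowed) rather than from its RT entity (p->rt.nr_cpus_allowed), which is why inc_rt_migration()/dec_rt_migration() first resolve the owning task with rt_task_of(). A compile-only mock of the field's relocation; both structures are placeholders, not the kernel's definitions.

/* Mock layout: only the position of nr_cpus_allowed mirrors the change. */
struct sched_rt_entity_sketch {
	int dummy;			/* before: the count lived here */
};

struct task_struct_sketch {
	int nr_cpus_allowed;		/* after: kept on the task, visible to
					 * every scheduling class, not just RT */
	struct sched_rt_entity_sketch rt;
};

static inline int task_is_migratable(const struct task_struct_sketch *p)
{
	return p->nr_cpus_allowed > 1;	/* the test the diff keeps using */
}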
@@ -288,13 +291,16 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
288 291
289static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 292static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
290{ 293{
294 struct task_struct *p;
295
291 if (!rt_entity_is_task(rt_se)) 296 if (!rt_entity_is_task(rt_se))
292 return; 297 return;
293 298
299 p = rt_task_of(rt_se);
294 rt_rq = &rq_of_rt_rq(rt_rq)->rt; 300 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
295 301
296 rt_rq->rt_nr_total--; 302 rt_rq->rt_nr_total--;
297 if (rt_se->nr_cpus_allowed > 1) 303 if (p->nr_cpus_allowed > 1)
298 rt_rq->rt_nr_migratory--; 304 rt_rq->rt_nr_migratory--;
299 305
300 update_rt_migration(rt_rq); 306 update_rt_migration(rt_rq);
@@ -1161,7 +1167,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1161 1167
1162 enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); 1168 enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
1163 1169
1164 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 1170 if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1165 enqueue_pushable_task(rq, p); 1171 enqueue_pushable_task(rq, p);
1166 1172
1167 inc_nr_running(rq); 1173 inc_nr_running(rq);
@@ -1225,7 +1231,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1225 1231
1226 cpu = task_cpu(p); 1232 cpu = task_cpu(p);
1227 1233
1228 if (p->rt.nr_cpus_allowed == 1) 1234 if (p->nr_cpus_allowed == 1)
1229 goto out; 1235 goto out;
1230 1236
1231 /* For anything but wake ups, just return the task_cpu */ 1237 /* For anything but wake ups, just return the task_cpu */
@@ -1260,9 +1266,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1260 * will have to sort it out. 1266 * will have to sort it out.
1261 */ 1267 */
1262 if (curr && unlikely(rt_task(curr)) && 1268 if (curr && unlikely(rt_task(curr)) &&
1263 (curr->rt.nr_cpus_allowed < 2 || 1269 (curr->nr_cpus_allowed < 2 ||
1264 curr->prio <= p->prio) && 1270 curr->prio <= p->prio) &&
1265 (p->rt.nr_cpus_allowed > 1)) { 1271 (p->nr_cpus_allowed > 1)) {
1266 int target = find_lowest_rq(p); 1272 int target = find_lowest_rq(p);
1267 1273
1268 if (target != -1) 1274 if (target != -1)
@@ -1276,10 +1282,10 @@ out:
1276 1282
1277static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) 1283static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1278{ 1284{
1279 if (rq->curr->rt.nr_cpus_allowed == 1) 1285 if (rq->curr->nr_cpus_allowed == 1)
1280 return; 1286 return;
1281 1287
1282 if (p->rt.nr_cpus_allowed != 1 1288 if (p->nr_cpus_allowed != 1
1283 && cpupri_find(&rq->rd->cpupri, p, NULL)) 1289 && cpupri_find(&rq->rd->cpupri, p, NULL))
1284 return; 1290 return;
1285 1291
@@ -1395,7 +1401,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1395 * The previous task needs to be made eligible for pushing 1401 * The previous task needs to be made eligible for pushing
1396 * if it is still active 1402 * if it is still active
1397 */ 1403 */
1398 if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) 1404 if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
1399 enqueue_pushable_task(rq, p); 1405 enqueue_pushable_task(rq, p);
1400} 1406}
1401 1407
@@ -1408,7 +1414,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1408{ 1414{
1409 if (!task_running(rq, p) && 1415 if (!task_running(rq, p) &&
1410 (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && 1416 (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
1411 (p->rt.nr_cpus_allowed > 1)) 1417 (p->nr_cpus_allowed > 1))
1412 return 1; 1418 return 1;
1413 return 0; 1419 return 0;
1414} 1420}
@@ -1464,7 +1470,7 @@ static int find_lowest_rq(struct task_struct *task)
1464 if (unlikely(!lowest_mask)) 1470 if (unlikely(!lowest_mask))
1465 return -1; 1471 return -1;
1466 1472
1467 if (task->rt.nr_cpus_allowed == 1) 1473 if (task->nr_cpus_allowed == 1)
1468 return -1; /* No other targets possible */ 1474 return -1; /* No other targets possible */
1469 1475
1470 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) 1476 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
@@ -1556,7 +1562,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1556 task_running(rq, task) || 1562 task_running(rq, task) ||
1557 !task->on_rq)) { 1563 !task->on_rq)) {
1558 1564
1559 raw_spin_unlock(&lowest_rq->lock); 1565 double_unlock_balance(rq, lowest_rq);
1560 lowest_rq = NULL; 1566 lowest_rq = NULL;
1561 break; 1567 break;
1562 } 1568 }
@@ -1586,7 +1592,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
1586 1592
1587 BUG_ON(rq->cpu != task_cpu(p)); 1593 BUG_ON(rq->cpu != task_cpu(p));
1588 BUG_ON(task_current(rq, p)); 1594 BUG_ON(task_current(rq, p));
1589 BUG_ON(p->rt.nr_cpus_allowed <= 1); 1595 BUG_ON(p->nr_cpus_allowed <= 1);
1590 1596
1591 BUG_ON(!p->on_rq); 1597 BUG_ON(!p->on_rq);
1592 BUG_ON(!rt_task(p)); 1598 BUG_ON(!rt_task(p));
@@ -1793,9 +1799,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1793 if (!task_running(rq, p) && 1799 if (!task_running(rq, p) &&
1794 !test_tsk_need_resched(rq->curr) && 1800 !test_tsk_need_resched(rq->curr) &&
1795 has_pushable_tasks(rq) && 1801 has_pushable_tasks(rq) &&
1796 p->rt.nr_cpus_allowed > 1 && 1802 p->nr_cpus_allowed > 1 &&
1797 rt_task(rq->curr) && 1803 rt_task(rq->curr) &&
1798 (rq->curr->rt.nr_cpus_allowed < 2 || 1804 (rq->curr->nr_cpus_allowed < 2 ||
1799 rq->curr->prio <= p->prio)) 1805 rq->curr->prio <= p->prio))
1800 push_rt_tasks(rq); 1806 push_rt_tasks(rq);
1801} 1807}
@@ -1803,44 +1809,40 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1803static void set_cpus_allowed_rt(struct task_struct *p, 1809static void set_cpus_allowed_rt(struct task_struct *p,
1804 const struct cpumask *new_mask) 1810 const struct cpumask *new_mask)
1805{ 1811{
1806 int weight = cpumask_weight(new_mask); 1812 struct rq *rq;
1813 int weight;
1807 1814
1808 BUG_ON(!rt_task(p)); 1815 BUG_ON(!rt_task(p));
1809 1816
1810 /* 1817 if (!p->on_rq)
1811 * Update the migration status of the RQ if we have an RT task 1818 return;
1812 * which is running AND changing its weight value.
1813 */
1814 if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
1815 struct rq *rq = task_rq(p);
1816 1819
1817 if (!task_current(rq, p)) { 1820 weight = cpumask_weight(new_mask);
1818 /*
1819 * Make sure we dequeue this task from the pushable list
1820 * before going further. It will either remain off of
1821 * the list because we are no longer pushable, or it
1822 * will be requeued.
1823 */
1824 if (p->rt.nr_cpus_allowed > 1)
1825 dequeue_pushable_task(rq, p);
1826 1821
1827 /* 1822 /*
1828 * Requeue if our weight is changing and still > 1 1823 * Only update if the process changes its state from whether it
1829 */ 1824 * can migrate or not.
1830 if (weight > 1) 1825 */
1831 enqueue_pushable_task(rq, p); 1826 if ((p->nr_cpus_allowed > 1) == (weight > 1))
1832 1827 return;
1833 }
1834 1828
1835 if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { 1829 rq = task_rq(p);
1836 rq->rt.rt_nr_migratory++;
1837 } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
1838 BUG_ON(!rq->rt.rt_nr_migratory);
1839 rq->rt.rt_nr_migratory--;
1840 }
1841 1830
1842 update_rt_migration(&rq->rt); 1831 /*
1832 * The process used to be able to migrate OR it can now migrate
1833 */
1834 if (weight <= 1) {
1835 if (!task_current(rq, p))
1836 dequeue_pushable_task(rq, p);
1837 BUG_ON(!rq->rt.rt_nr_migratory);
1838 rq->rt.rt_nr_migratory--;
1839 } else {
1840 if (!task_current(rq, p))
1841 enqueue_pushable_task(rq, p);
1842 rq->rt.rt_nr_migratory++;
1843 } 1843 }
1844
1845 update_rt_migration(&rq->rt);
1844} 1846}
1845 1847
1846/* Assumes rq->lock is held */ 1848/* Assumes rq->lock is held */
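The rewritten set_cpus_allowed_rt() above only touches the rt_nr_migratory counters when the new mask actually flips the task between "pinned to one CPU" and "free to migrate"; if (p->nr_cpus_allowed > 1) and (weight > 1) agree, it returns early. A standalone truth table of that guard with a few sample widths.

#include <stdio.h>

/* 1 when the migratory bookkeeping must change: the task flips between
 * single-CPU and multi-CPU affinity. */
static int migratability_changes(int old_nr_cpus_allowed, int new_weight)
{
	return (old_nr_cpus_allowed > 1) != (new_weight > 1);
}

int main(void)
{
	int old_w[] = { 1, 1, 4, 4 };
	int new_w[] = { 1, 4, 1, 8 };

	for (int i = 0; i < 4; i++)
		printf("old=%d new=%d -> update=%d\n",
		       old_w[i], new_w[i],
		       migratability_changes(old_w[i], new_w[i]));
	/* Only the 1->4 and 4->1 cases report an update. */
	return 0;
}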
@@ -1983,6 +1985,8 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1983 1985
1984static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) 1986static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1985{ 1987{
1988 struct sched_rt_entity *rt_se = &p->rt;
1989
1986 update_curr_rt(rq); 1990 update_curr_rt(rq);
1987 1991
1988 watchdog(rq, p); 1992 watchdog(rq, p);
@@ -2000,12 +2004,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2000 p->rt.time_slice = RR_TIMESLICE; 2004 p->rt.time_slice = RR_TIMESLICE;
2001 2005
2002 /* 2006 /*
2003 * Requeue to the end of queue if we are not the only element 2007 * Requeue to the end of queue if we (and all of our ancestors) are the
2004 * on the queue: 2008 * only element on the queue
2005 */ 2009 */
2006 if (p->rt.run_list.prev != p->rt.run_list.next) { 2010 for_each_sched_rt_entity(rt_se) {
2007 requeue_task_rt(rq, p, 0); 2011 if (rt_se->run_list.prev != rt_se->run_list.next) {
2008 set_tsk_need_resched(p); 2012 requeue_task_rt(rq, p, 0);
2013 set_tsk_need_resched(p);
2014 return;
2015 }
2009 } 2016 }
2010} 2017}
2011 2018
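The round-robin requeue in task_tick_rt() now walks every level of the task's RT group hierarchy and requeues as soon as any level has a sibling on its runlist. The "do I have a sibling?" test is the circular-list idiom the diff keeps: on a one-element kernel list, prev and next point at the same node. A standalone illustration with a minimal doubly linked list (not the kernel's list.h).

#include <stdio.h>

/* Minimal circular doubly-linked list node, shaped like a list_head for
 * this one property only. */
struct node {
	struct node *prev, *next;
};

static void list_init(struct node *head)
{
	head->prev = head->next = head;
}

static void list_add_tail(struct node *n, struct node *head)
{
	n->prev = head->prev;
	n->next = head;
	head->prev->next = n;
	head->prev = n;
}

/* prev and next of an entry differ only when more than one element
 * (besides the list head) sits on the ring. */
static int has_sibling(const struct node *entry)
{
	return entry->prev != entry->next;
}

int main(void)
{
	struct node queue, a, b;

	list_init(&queue);
	list_add_tail(&a, &queue);
	printf("alone: has_sibling=%d\n", has_sibling(&a));	/* 0 */

	list_add_tail(&b, &queue);
	printf("two:   has_sibling=%d\n", has_sibling(&a));	/* 1 */
	return 0;
}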
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fb3acba4d52e..6d52cea7f33d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -201,7 +201,7 @@ struct cfs_bandwidth { };
201/* CFS-related fields in a runqueue */ 201/* CFS-related fields in a runqueue */
202struct cfs_rq { 202struct cfs_rq {
203 struct load_weight load; 203 struct load_weight load;
204 unsigned long nr_running, h_nr_running; 204 unsigned int nr_running, h_nr_running;
205 205
206 u64 exec_clock; 206 u64 exec_clock;
207 u64 min_vruntime; 207 u64 min_vruntime;
@@ -279,7 +279,7 @@ static inline int rt_bandwidth_enabled(void)
279/* Real-Time classes' related field in a runqueue: */ 279/* Real-Time classes' related field in a runqueue: */
280struct rt_rq { 280struct rt_rq {
281 struct rt_prio_array active; 281 struct rt_prio_array active;
282 unsigned long rt_nr_running; 282 unsigned int rt_nr_running;
283#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 283#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
284 struct { 284 struct {
285 int curr; /* highest queued rt task prio */ 285 int curr; /* highest queued rt task prio */
@@ -353,7 +353,7 @@ struct rq {
353 * nr_running and cpu_load should be in the same cacheline because 353 * nr_running and cpu_load should be in the same cacheline because
354 * remote CPUs use both these fields when doing load calculation. 354 * remote CPUs use both these fields when doing load calculation.
355 */ 355 */
356 unsigned long nr_running; 356 unsigned int nr_running;
357 #define CPU_LOAD_IDX_MAX 5 357 #define CPU_LOAD_IDX_MAX 5
358 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 358 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
359 unsigned long last_load_update_tick; 359 unsigned long last_load_update_tick;
@@ -526,6 +526,8 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
526DECLARE_PER_CPU(struct sched_domain *, sd_llc); 526DECLARE_PER_CPU(struct sched_domain *, sd_llc);
527DECLARE_PER_CPU(int, sd_llc_id); 527DECLARE_PER_CPU(int, sd_llc_id);
528 528
529extern int group_balance_cpu(struct sched_group *sg);
530
529#endif /* CONFIG_SMP */ 531#endif /* CONFIG_SMP */
530 532
531#include "stats.h" 533#include "stats.h"
@@ -876,7 +878,7 @@ extern void resched_cpu(int cpu);
876extern struct rt_bandwidth def_rt_bandwidth; 878extern struct rt_bandwidth def_rt_bandwidth;
877extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); 879extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
878 880
879extern void update_cpu_load(struct rq *this_rq); 881extern void update_idle_cpu_load(struct rq *this_rq);
880 882
881#ifdef CONFIG_CGROUP_CPUACCT 883#ifdef CONFIG_CGROUP_CPUACCT
882#include <linux/cgroup.h> 884#include <linux/cgroup.h>