1 files changed, 845 insertions, 340 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index c1d0ed360088..6cc1fd5d5072 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -223,7 +223,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 {
        ktime_t now;
-        if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
+        if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
                return;
        if (hrtimer_active(&rt_b->rt_period_timer))
@@ -231,13 +231,20 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
        spin_lock(&rt_b->rt_runtime_lock);
        for (;;) {
+                unsigned long delta;
+                ktime_t soft, hard;
                if (hrtimer_active(&rt_b->rt_period_timer))
                        break;
                now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
                hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
-                hrtimer_start_expires(&rt_b->rt_period_timer,
-                                HRTIMER_MODE_ABS);
+                soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
+                hard = hrtimer_get_expires(&rt_b->rt_period_timer);
+                delta = ktime_to_ns(ktime_sub(hard, soft));
+                __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
+                                HRTIMER_MODE_ABS, 0);
        }
        spin_unlock(&rt_b->rt_runtime_lock);
 }
@@ -331,6 +338,13 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
 */
 static DEFINE_SPINLOCK(task_group_lock);
+#ifdef CONFIG_SMP
+static int root_task_group_empty(void)
+{
+        return list_empty(&root_task_group.children);
+}
+#endif
 #ifdef CONFIG_FAIR_GROUP_SCHED
 #ifdef CONFIG_USER_SCHED
 # define INIT_TASK_GROUP_LOAD   (2*NICE_0_LOAD)
@@ -391,6 +405,13 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
 #else
+#ifdef CONFIG_SMP
+static int root_task_group_empty(void)
+{
+        return 1;
+}
+#endif
 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
 static inline struct task_group *task_group(struct task_struct *p)
 {
@@ -467,11 +488,17 @@ struct rt_rq {
        struct rt_prio_array active;
        unsigned long rt_nr_running;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-        int highest_prio; /* highest queued rt task prio */
+        struct {
+                int curr; /* highest queued rt task prio */
+#ifdef CONFIG_SMP
+                int next; /* next highest */
+#endif
+        } highest_prio;
 #endif
 #ifdef CONFIG_SMP
        unsigned long rt_nr_migratory;
        int overloaded;
+        struct plist_head pushable_tasks;
 #endif
        int rt_throttled;
        u64 rt_time;
@@ -549,7 +576,6 @@ struct rq {
        unsigned long nr_running;
        #define CPU_LOAD_IDX_MAX 5
        unsigned long cpu_load[CPU_LOAD_IDX_MAX];
-        unsigned char idle_at_tick;
 #ifdef CONFIG_NO_HZ
        unsigned long last_tick_seen;
        unsigned char in_nohz_recently;
@@ -590,6 +616,7 @@ struct rq {
        struct root_domain *rd;
        struct sched_domain *sd;
+        unsigned char idle_at_tick;
        /* For active balancing */
        int active_balance;
        int push_cpu;
@@ -618,9 +645,6 @@ struct rq {
        /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
        /* sys_sched_yield() stats */
-        unsigned int yld_exp_empty;
-        unsigned int yld_act_empty;
-        unsigned int yld_both_empty;
        unsigned int yld_count;
        /* schedule() stats */
@@ -1093,7 +1117,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
        if (rq == this_rq()) {
                hrtimer_restart(timer);
        } else if (!rq->hrtick_csd_pending) {
-                __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd);
+                __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
                rq->hrtick_csd_pending = 1;
        }
 }
@@ -1129,7 +1153,8 @@ static __init void init_hrtick(void)
 */
 static void hrtick_start(struct rq *rq, u64 delay)
 {
-        hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
+        __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
+                        HRTIMER_MODE_REL, 0);
 }
 static inline void init_hrtick(void)
@@ -1183,10 +1208,10 @@ static void resched_task(struct task_struct *p)
        assert_spin_locked(&task_rq(p)->lock);
-        if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
+        if (test_tsk_need_resched(p))
                return;
-        set_tsk_thread_flag(p, TIF_NEED_RESCHED);
+        set_tsk_need_resched(p);
        cpu = task_cpu(p);
        if (cpu == smp_processor_id())
@@ -1242,7 +1267,7 @@ void wake_up_idle_cpu(int cpu)
         * lockless. The worst case is that the other CPU runs the
         * idle task through an additional NOOP schedule()
         */
-        set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
+        set_tsk_need_resched(rq->idle);
        /* NEED_RESCHED must be visible before we test polling */
        smp_mb();
@@ -1610,21 +1635,42 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 #endif
+#ifdef CONFIG_PREEMPT
 /*
- * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ * fair double_lock_balance: Safely acquires both rq->locks in a fair
+ * way at the expense of forcing extra atomic operations in all
+ * invocations.  This assures that the double_lock is acquired using the
+ * same underlying policy as the spinlock_t on this architecture, which
+ * reduces latency compared to the unfair variant below.  However, it
+ * also adds more overhead and therefore may reduce throughput.
 */
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
+        __releases(this_rq->lock)
+        __acquires(busiest->lock)
+        __acquires(this_rq->lock)
+{
+        spin_unlock(&this_rq->lock);
+        double_rq_lock(this_rq, busiest);
+        return 1;
+}
+#else
+/*
+ * Unfair double_lock_balance: Optimizes throughput at the expense of
+ * latency by eliminating extra atomic operations when the locks are
+ * already in proper order on entry.  This favors lower cpu-ids and will
+ * grant the double lock to lower cpus over higher ids under contention,
+ * regardless of entry order into the function.
+ */
+static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
        __releases(this_rq->lock)
        __acquires(busiest->lock)
        __acquires(this_rq->lock)
 {
        int ret = 0;
-        if (unlikely(!irqs_disabled())) {
-                /* printk() doesn't work good under rq->lock */
-                spin_unlock(&this_rq->lock);
-                BUG_ON(1);
-        }
        if (unlikely(!spin_trylock(&busiest->lock))) {
                if (busiest < this_rq) {
                        spin_unlock(&this_rq->lock);
@@ -1637,6 +1683,22 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
        return ret;
 }
+#endif /* CONFIG_PREEMPT */
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+{
+        if (unlikely(!irqs_disabled())) {
+                /* printk() doesn't work well under rq->lock */
+                spin_unlock(&this_rq->lock);
+                BUG_ON(1);
+        }
+        return _double_lock_balance(this_rq, busiest);
+}
 static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
        __releases(busiest->lock)
 {
@@ -1705,6 +1767,9 @@ static void update_avg(u64 *avg, u64 sample)
 static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
 {
+        if (wakeup)
+                p->se.start_runtime = p->se.sum_exec_runtime;
        sched_info_queued(p);
        p->sched_class->enqueue_task(rq, p, wakeup);
        p->se.on_rq = 1;
@@ -1712,10 +1777,15 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
 static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
 {
-        if (sleep && p->se.last_wakeup) {
+        if (sleep) {
-                update_avg(&p->se.avg_overlap,
+                if (p->se.last_wakeup) {
-                           p->se.sum_exec_runtime - p->se.last_wakeup);
+                        update_avg(&p->se.avg_overlap,
-                p->se.last_wakeup = 0;
+                                p->se.sum_exec_runtime - p->se.last_wakeup);
+                        p->se.last_wakeup = 0;
+                } else {
+                        update_avg(&p->se.avg_wakeup,
+                                sysctl_sched_wakeup_granularity);
+                }
        }
        sched_info_dequeued(p);
@@ -2017,7 +2087,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
                 * it must be off the runqueue _entirely_, and not
                 * preempted!
                 *
-                 * So if it wa still runnable (but just not actively
+                 * So if it was still runnable (but just not actively
                 * running right now), it's preempted, and we should
                 * yield - it could be a while.
                 */
@@ -2267,7 +2337,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
                sync = 0;
 #ifdef CONFIG_SMP
-        if (sched_feat(LB_WAKEUP_UPDATE)) {
+        if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
                struct sched_domain *sd;
                this_cpu = raw_smp_processor_id();
@@ -2345,6 +2415,22 @@ out_activate:
        activate_task(rq, p, 1);
        success = 1;
+        /*
+         * Only attribute actual wakeups done by this task.
+         */
+        if (!in_interrupt()) {
+                struct sched_entity *se = &current->se;
+                u64 sample = se->sum_exec_runtime;
+                if (se->last_wakeup)
+                        sample -= se->last_wakeup;
+                else
+                        sample -= se->start_runtime;
+                update_avg(&se->avg_wakeup, sample);
+                se->last_wakeup = se->sum_exec_runtime;
+        }
 out_running:
        trace_sched_wakeup(rq, p, success);
        check_preempt_curr(rq, p, sync);
@@ -2355,8 +2441,6 @@ out_running:
                p->sched_class->task_wake_up(rq, p);
 #endif
 out:
-        current->se.last_wakeup = current->se.sum_exec_runtime;
        task_rq_unlock(rq, &flags);
        return success;
@@ -2386,6 +2470,8 @@ static void __sched_fork(struct task_struct *p)
        p->se.prev_sum_exec_runtime     = 0;
        p->se.last_wakeup               = 0;
        p->se.avg_overlap               = 0;
+        p->se.start_runtime             = 0;
+        p->se.avg_wakeup                = sysctl_sched_wakeup_granularity;
 #ifdef CONFIG_SCHEDSTATS
        p->se.wait_start                = 0;
@@ -2448,6 +2534,8 @@ void sched_fork(struct task_struct *p, int clone_flags)
        /* Want to start with kernel preemption disabled. */
        task_thread_info(p)->preempt_count = 1;
 #endif
+        plist_node_init(&p->pushable_tasks, MAX_PRIO);
        put_cpu();
 }
@@ -2491,7 +2579,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 /**
- * preempt_notifier_register - tell me when current is being being preempted & rescheduled
+ * preempt_notifier_register - tell me when current is being preempted & rescheduled
 * @notifier: notifier struct to register
 */
 void preempt_notifier_register(struct preempt_notifier *notifier)
@@ -2588,6 +2676,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 {
        struct mm_struct *mm = rq->prev_mm;
        long prev_state;
+#ifdef CONFIG_SMP
+        int post_schedule = 0;
+        if (current->sched_class->needs_post_schedule)
+                post_schedule = current->sched_class->needs_post_schedule(rq);
+#endif
        rq->prev_mm = NULL;
@@ -2606,7 +2700,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
        finish_arch_switch(prev);
        finish_lock_switch(rq, prev);
 #ifdef CONFIG_SMP
-        if (current->sched_class->post_schedule)
+        if (post_schedule)
                current->sched_class->post_schedule(rq);
 #endif
@@ -2913,6 +3007,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
                     struct sched_domain *sd, enum cpu_idle_type idle,
                     int *all_pinned)
 {
+        int tsk_cache_hot = 0;
        /*
         * We do not migrate tasks that are:
         * 1) running (obviously), or
@@ -2936,10 +3031,11 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
         * 2) too many balance attempts have failed.
         */
-        if (!task_hot(p, rq->clock, sd) ||
+        tsk_cache_hot = task_hot(p, rq->clock, sd);
-                        sd->nr_balance_failed > sd->cache_nice_tries) {
+        if (!tsk_cache_hot ||
+                sd->nr_balance_failed > sd->cache_nice_tries) {
 #ifdef CONFIG_SCHEDSTATS
-                if (task_hot(p, rq->clock, sd)) {
+                if (tsk_cache_hot) {
                        schedstat_inc(sd, lb_hot_gained[idle]);
                        schedstat_inc(p, se.nr_forced_migrations);
                }
@@ -2947,7 +3043,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
                return 1;
        }
-        if (task_hot(p, rq->clock, sd)) {
+        if (tsk_cache_hot) {
                schedstat_inc(p, se.nr_failed_migrations_hot);
                return 0;
        }
@@ -2987,6 +3083,16 @@ next:
        pulled++;
        rem_load_move -= p->se.load.weight;
+#ifdef CONFIG_PREEMPT
+        /*
+         * NEWIDLE balancing is a source of latency, so preemptible kernels
+         * will stop after the first task is pulled to minimize the critical
+         * section.
+         */
+        if (idle == CPU_NEWLY_IDLE)
+                goto out;
+#endif
        /*
         * We only want to steal up to the prescribed amount of weighted load.
         */
@@ -3033,9 +3139,15 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
                                sd, idle, all_pinned, &this_best_prio);
                class = class->next;
+#ifdef CONFIG_PREEMPT
+                /*
+                 * NEWIDLE balancing is a source of latency, so preemptible
+                 * kernels will stop after the first task is pulled to minimize
+                 * the critical section.
+                 */
                if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
                        break;
+#endif
        } while (class && max_load_move > total_load_moved);
        return total_load_moved > 0;
@@ -3085,246 +3197,480 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
        return 0;
 }
+/********** Helpers for find_busiest_group ************************/
 /*
- * find_busiest_group finds and returns the busiest CPU group within the
+ * sd_lb_stats - Structure to store the statistics of a sched_domain
- * domain. It calculates and returns the amount of weighted load which
+ *              during load balancing.
- * should be moved to restore balance via the imbalance parameter.
 */
-static struct sched_group *
+struct sd_lb_stats {
-find_busiest_group(struct sched_domain *sd, int this_cpu,
+        struct sched_group *busiest; /* Busiest group in this sd */
-                   unsigned long *imbalance, enum cpu_idle_type idle,
+        struct sched_group *this;  /* Local group in this sd */
-                   int *sd_idle, const struct cpumask *cpus, int *balance)
+        unsigned long total_load;  /* Total load of all groups in sd */
-{
+        unsigned long total_pwr;   /*   Total power of all groups in sd */
-        struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
+        unsigned long avg_load;    /* Average load across all groups in sd */
-        unsigned long max_load, avg_load, total_load, this_load, total_pwr;
-        unsigned long max_pull;
+        /* Statistics of this group */
-        unsigned long busiest_load_per_task, busiest_nr_running;
+        unsigned long this_load;
-        unsigned long this_load_per_task, this_nr_running;
+        unsigned long this_load_per_task;
-        int load_idx, group_imb = 0;
+        unsigned long this_nr_running;
+        /* Statistics of the busiest group */
+        unsigned long max_load;
+        unsigned long busiest_load_per_task;
+        unsigned long busiest_nr_running;
+        int group_imb; /* Is there imbalance in this sd */
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-        int power_savings_balance = 1;
+        int power_savings_balance; /* Is powersave balance needed for this sd */
-        unsigned long leader_nr_running = 0, min_load_per_task = 0;
+        struct sched_group *group_min; /* Least loaded group in sd */
-        unsigned long min_nr_running = ULONG_MAX;
+        struct sched_group *group_leader; /* Group which relieves group_min */
-        struct sched_group *group_min = NULL, *group_leader = NULL;
+        unsigned long min_load_per_task; /* load_per_task in group_min */
+        unsigned long leader_nr_running; /* Nr running of group_leader */
+        unsigned long min_nr_running; /* Nr running of group_min */
 #endif
+};
+/*
+ * sg_lb_stats - stats of a sched_group required for load_balancing
+ */
+struct sg_lb_stats {
+        unsigned long avg_load; /*Avg load across the CPUs of the group */
+        unsigned long group_load; /* Total load over the CPUs of the group */
+        unsigned long sum_nr_running; /* Nr tasks running in the group */
+        unsigned long sum_weighted_load; /* Weighted load of group's tasks */
+        unsigned long group_capacity;
+        int group_imb; /* Is there an imbalance in the group ? */
+};
-        max_load = this_load = total_load = total_pwr = 0;
+/**
-        busiest_load_per_task = busiest_nr_running = 0;
+ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
-        this_load_per_task = this_nr_running = 0;
+ * @group: The group whose first cpu is to be returned.
+ */
+static inline unsigned int group_first_cpu(struct sched_group *group)
+{
+        return cpumask_first(sched_group_cpus(group));
+}
-        if (idle == CPU_NOT_IDLE)
+/**
+ * get_sd_load_idx - Obtain the load index for a given sched domain.
+ * @sd: The sched_domain whose load_idx is to be obtained.
+ * @idle: The idle status of the CPU for whose sd load_idx is obtained.
+ */
+static inline int get_sd_load_idx(struct sched_domain *sd,
+                                        enum cpu_idle_type idle)
+{
+        int load_idx;
+        switch (idle) {
+        case CPU_NOT_IDLE:
                load_idx = sd->busy_idx;
-        else if (idle == CPU_NEWLY_IDLE)
+                break;
+        case CPU_NEWLY_IDLE:
                load_idx = sd->newidle_idx;
-        else
+                break;
+        default:
                load_idx = sd->idle_idx;
+                break;
+        }
-        do {
+        return load_idx;
-                unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
+}
-                int local_group;
-                int i;
-                int __group_imb = 0;
-                unsigned int balance_cpu = -1, first_idle_cpu = 0;
-                unsigned long sum_nr_running, sum_weighted_load;
-                unsigned long sum_avg_load_per_task;
-                unsigned long avg_load_per_task;
-                local_group = cpumask_test_cpu(this_cpu,
-                                               sched_group_cpus(group));
-                if (local_group)
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-                        balance_cpu = cpumask_first(sched_group_cpus(group));
+/**
+ * init_sd_power_savings_stats - Initialize power savings statistics for
+ * the given sched_domain, during load balancing.
+ *
+ * @sd: Sched domain whose power-savings statistics are to be initialized.
+ * @sds: Variable containing the statistics for sd.
+ * @idle: Idle status of the CPU at which we're performing load-balancing.
+ */
+static inline void init_sd_power_savings_stats(struct sched_domain *sd,
+        struct sd_lb_stats *sds, enum cpu_idle_type idle)
+{
+        /*
+         * Busy processors will not participate in power savings
+         * balance.
+         */
+        if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+                sds->power_savings_balance = 0;
+        else {
+                sds->power_savings_balance = 1;
+                sds->min_nr_running = ULONG_MAX;
+                sds->leader_nr_running = 0;
+        }
+}
-                /* Tally up the load of all CPUs in the group */
+/**
-                sum_weighted_load = sum_nr_running = avg_load = 0;
+ * update_sd_power_savings_stats - Update the power saving stats for a
-                sum_avg_load_per_task = avg_load_per_task = 0;
+ * sched_domain while performing load balancing.
+ *
+ * @group: sched_group belonging to the sched_domain under consideration.
+ * @sds: Variable containing the statistics of the sched_domain
+ * @local_group: Does group contain the CPU for which we're performing
+ *              load balancing ?
+ * @sgs: Variable containing the statistics of the group.
+ */
+static inline void update_sd_power_savings_stats(struct sched_group *group,
+        struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
+{
-                max_cpu_load = 0;
+        if (!sds->power_savings_balance)
-                min_cpu_load = ~0UL;
+                return;
-                for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+        /*
-                        struct rq *rq = cpu_rq(i);
+         * If the local group is idle or completely loaded
+         * no need to do power savings balance at this domain
+         */
+        if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
+                                !sds->this_nr_running))
+                sds->power_savings_balance = 0;
-                        if (*sd_idle && rq->nr_running)
+        /*
-                                *sd_idle = 0;
+         * If a group is already running at full capacity or idle,
+         * don't include that group in power savings calculations
+         */
+        if (!sds->power_savings_balance ||
+                sgs->sum_nr_running >= sgs->group_capacity ||
+                !sgs->sum_nr_running)
+                return;
-                        /* Bias balancing toward cpus of our domain */
+        /*
-                        if (local_group) {
+         * Calculate the group which has the least non-idle load.
-                                if (idle_cpu(i) && !first_idle_cpu) {
+         * This is the group from where we need to pick up the load
-                                        first_idle_cpu = 1;
+         * for saving power
-                                        balance_cpu = i;
+         */
-                                }
+        if ((sgs->sum_nr_running < sds->min_nr_running) ||
+            (sgs->sum_nr_running == sds->min_nr_running &&
+             group_first_cpu(group) > group_first_cpu(sds->group_min))) {
+                sds->group_min = group;
+                sds->min_nr_running = sgs->sum_nr_running;
+                sds->min_load_per_task = sgs->sum_weighted_load /
+                                                sgs->sum_nr_running;
+        }
-                                load = target_load(i, load_idx);
+        /*
-                        } else {
+         * Calculate the group which is almost near its
-                                load = source_load(i, load_idx);
+         * capacity but still has some space to pick up some load
-                                if (load > max_cpu_load)
+         * from other group and save more power
-                                        max_cpu_load = load;
+         */
-                                if (min_cpu_load > load)
+        if (sgs->sum_nr_running > sgs->group_capacity - 1)
-                                        min_cpu_load = load;
+                return;
-                        }
-                        avg_load += load;
+        if (sgs->sum_nr_running > sds->leader_nr_running ||
-                        sum_nr_running += rq->nr_running;
+            (sgs->sum_nr_running == sds->leader_nr_running &&
-                        sum_weighted_load += weighted_cpuload(i);
+             group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
+                sds->group_leader = group;
+                sds->leader_nr_running = sgs->sum_nr_running;
+        }
+}
-                        sum_avg_load_per_task += cpu_avg_load_per_task(i);
+/**
-                }
+ * check_power_save_busiest_group - see if there is potential for some power-savings balance
+ * @sds: Variable containing the statistics of the sched_domain
+ *      under consideration.
+ * @this_cpu: Cpu at which we're currently performing load-balancing.
+ * @imbalance: Variable to store the imbalance.
+ *
+ * Description:
+ * Check if we have potential to perform some power-savings balance.
+ * If yes, set the busiest group to be the least loaded group in the
+ * sched_domain, so that its CPUs can be put to idle.
+ *
+ * Returns 1 if there is potential to perform power-savings balance.
+ * Else returns 0.
+ */
+static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
+                                        int this_cpu, unsigned long *imbalance)
+{
+        if (!sds->power_savings_balance)
+                return 0;
-                /*
+        if (sds->this != sds->group_leader ||
-                 * First idle cpu or the first cpu(busiest) in this sched group
+                        sds->group_leader == sds->group_min)
-                 * is eligible for doing load balancing at this and above
+                return 0;
-                 * domains. In the newly idle case, we will allow all the cpu's
-                 * to do the newly idle load balance.
-                 */
-                if (idle != CPU_NEWLY_IDLE && local_group &&
-                    balance_cpu != this_cpu && balance) {
-                        *balance = 0;
-                        goto ret;
-                }
-                total_load += avg_load;
+        *imbalance = sds->min_load_per_task;
-                total_pwr += group->__cpu_power;
+        sds->busiest = sds->group_min;
-                /* Adjust by relative CPU power of the group */
+        if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
-                avg_load = sg_div_cpu_power(group,
+                cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
-                                avg_load * SCHED_LOAD_SCALE);
+                        group_first_cpu(sds->group_leader);
+        }
+        return 1;
-                /*
+}
-                 * Consider the group unbalanced when the imbalance is larger
+#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
-                 * than the average weight of two tasks.
+static inline void init_sd_power_savings_stats(struct sched_domain *sd,
-                 *
+        struct sd_lb_stats *sds, enum cpu_idle_type idle)
-                 * APZ: with cgroup the avg task weight can vary wildly and
+{
-                 *      might not be a suitable number - should we keep a
+        return;
-                 *      normalized nr_running number somewhere that negates
+}
-                 *      the hierarchy?
-                 */
+static inline void update_sd_power_savings_stats(struct sched_group *group,
-                avg_load_per_task = sg_div_cpu_power(group,
+        struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
-                                sum_avg_load_per_task * SCHED_LOAD_SCALE);
+{
+        return;
+}
+static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
+                                        int this_cpu, unsigned long *imbalance)
+{
+        return 0;
+}
+#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+/**
+ * update_sg_lb_stats - Update sched_group's statistics for load balancing.
+ * @group: sched_group whose statistics are to be updated.
+ * @this_cpu: Cpu for which load balance is currently performed.
+ * @idle: Idle status of this_cpu
+ * @load_idx: Load index of sched_domain of this_cpu for load calc.
+ * @sd_idle: Idle status of the sched_domain containing group.
+ * @local_group: Does group contain this_cpu.
+ * @cpus: Set of cpus considered for load balancing.
+ * @balance: Should we balance.
+ * @sgs: variable to hold the statistics for this group.
+ */
+static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
+                        enum cpu_idle_type idle, int load_idx, int *sd_idle,
+                        int local_group, const struct cpumask *cpus,
+                        int *balance, struct sg_lb_stats *sgs)
+{
+        unsigned long load, max_cpu_load, min_cpu_load;
+        int i;
+        unsigned int balance_cpu = -1, first_idle_cpu = 0;
+        unsigned long sum_avg_load_per_task;
+        unsigned long avg_load_per_task;
+        if (local_group)
+                balance_cpu = group_first_cpu(group);
-                if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+        /* Tally up the load of all CPUs in the group */
-                        __group_imb = 1;
+        sum_avg_load_per_task = avg_load_per_task = 0;
+        max_cpu_load = 0;
+        min_cpu_load = ~0UL;
-                group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
+        for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+                struct rq *rq = cpu_rq(i);
+                if (*sd_idle && rq->nr_running)
+                        *sd_idle = 0;
+                /* Bias balancing toward cpus of our domain */
                if (local_group) {
-                        this_load = avg_load;
+                        if (idle_cpu(i) && !first_idle_cpu) {
-                        this = group;
+                                first_idle_cpu = 1;
-                        this_nr_running = sum_nr_running;
+                                balance_cpu = i;
-                        this_load_per_task = sum_weighted_load;
+                        }
-                } else if (avg_load > max_load &&
-                           (sum_nr_running > group_capacity || __group_imb)) {
+                        load = target_load(i, load_idx);
-                        max_load = avg_load;
+                } else {
-                        busiest = group;
+                        load = source_load(i, load_idx);
-                        busiest_nr_running = sum_nr_running;
+                        if (load > max_cpu_load)
-                        busiest_load_per_task = sum_weighted_load;
+                                max_cpu_load = load;
-                        group_imb = __group_imb;
+                        if (min_cpu_load > load)
+                                min_cpu_load = load;
                }
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+                sgs->group_load += load;
-                /*
+                sgs->sum_nr_running += rq->nr_running;
-                 * Busy processors will not participate in power savings
+                sgs->sum_weighted_load += weighted_cpuload(i);
-                 * balance.
-                 */
-                if (idle == CPU_NOT_IDLE ||
-                                !(sd->flags & SD_POWERSAVINGS_BALANCE))
-                        goto group_next;
-                /*
+                sum_avg_load_per_task += cpu_avg_load_per_task(i);
-                 * If the local group is idle or completely loaded
+        }
-                 * no need to do power savings balance at this domain
-                 */
-                if (local_group && (this_nr_running >= group_capacity ||
-                                    !this_nr_running))
-                        power_savings_balance = 0;
-                /*
+        /*
-                 * If a group is already running at full capacity or idle,
+         * First idle cpu or the first cpu(busiest) in this sched group
-                 * don't include that group in power savings calculations
+         * is eligible for doing load balancing at this and above
-                 */
+         * domains. In the newly idle case, we will allow all the cpus
-                if (!power_savings_balance || sum_nr_running >= group_capacity
+         * to do the newly idle load balance.
-                    || !sum_nr_running)
+         */
-                        goto group_next;
+        if (idle != CPU_NEWLY_IDLE && local_group &&
+            balance_cpu != this_cpu && balance) {
+                *balance = 0;
+                return;
+        }
-                /*
+        /* Adjust by relative CPU power of the group */
-                 * Calculate the group which has the least non-idle load.
+        sgs->avg_load = sg_div_cpu_power(group,
-                 * This is the group from where we need to pick up the load
+                        sgs->group_load * SCHED_LOAD_SCALE);
-                 * for saving power
-                 */
-                if ((sum_nr_running < min_nr_running) ||
-                    (sum_nr_running == min_nr_running &&
-                     cpumask_first(sched_group_cpus(group)) >
-                     cpumask_first(sched_group_cpus(group_min)))) {
-                        group_min = group;
-                        min_nr_running = sum_nr_running;
-                        min_load_per_task = sum_weighted_load /
-                                                sum_nr_running;
-                }
-                /*
-                 * Calculate the group which is almost near its
+        /*
-                 * capacity but still has some space to pick up some load
+         * Consider the group unbalanced when the imbalance is larger
-                 * from other group and save more power
+         * than the average weight of two tasks.
-                 */
+         *
-                if (sum_nr_running <= group_capacity - 1) {
+         * APZ: with cgroup the avg task weight can vary wildly and
-                        if (sum_nr_running > leader_nr_running ||
+         *      might not be a suitable number - should we keep a
-                            (sum_nr_running == leader_nr_running &&
+         *      normalized nr_running number somewhere that negates
-                             cpumask_first(sched_group_cpus(group)) <
+         *      the hierarchy?
-                             cpumask_first(sched_group_cpus(group_leader)))) {
+         */
-                                group_leader = group;
+        avg_load_per_task = sg_div_cpu_power(group,
-                                leader_nr_running = sum_nr_running;
+                        sum_avg_load_per_task * SCHED_LOAD_SCALE);
-                        }
+        if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+                sgs->group_imb = 1;
+        sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
+}
+/**
+ * update_sd_lb_stats - Update sched_group's statistics for load balancing.
+ * @sd: sched_domain whose statistics are to be updated.
+ * @this_cpu: Cpu for which load balance is currently performed.
+ * @idle: Idle status of this_cpu
+ * @sd_idle: Idle status of the sched_domain containing group.
+ * @cpus: Set of cpus considered for load balancing.
+ * @balance: Should we balance.
+ * @sds: variable to hold the statistics for this sched_domain.
+ */
+static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
+                        enum cpu_idle_type idle, int *sd_idle,
+                        const struct cpumask *cpus, int *balance,
+                        struct sd_lb_stats *sds)
+{
+        struct sched_group *group = sd->groups;
+        struct sg_lb_stats sgs;
+        int load_idx;
+        init_sd_power_savings_stats(sd, sds, idle);
+        load_idx = get_sd_load_idx(sd, idle);
+        do {
+                int local_group;
+                local_group = cpumask_test_cpu(this_cpu,
+                                               sched_group_cpus(group));
+                memset(&sgs, 0, sizeof(sgs));
+                update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
+                                local_group, cpus, balance, &sgs);
+                if (local_group && balance && !(*balance))
+                        return;
+                sds->total_load += sgs.group_load;
+                sds->total_pwr += group->__cpu_power;
+                if (local_group) {
+                        sds->this_load = sgs.avg_load;
+                        sds->this = group;
+                        sds->this_nr_running = sgs.sum_nr_running;
+                        sds->this_load_per_task = sgs.sum_weighted_load;
+                } else if (sgs.avg_load > sds->max_load &&
+                           (sgs.sum_nr_running > sgs.group_capacity ||
+                                sgs.group_imb)) {
+                        sds->max_load = sgs.avg_load;
+                        sds->busiest = group;
+                        sds->busiest_nr_running = sgs.sum_nr_running;
+                        sds->busiest_load_per_task = sgs.sum_weighted_load;
+                        sds->group_imb = sgs.group_imb;
                }
-group_next:
-#endif
+                update_sd_power_savings_stats(group, sds, local_group, &sgs);
                group = group->next;
        } while (group != sd->groups);
-        if (!busiest || this_load >= max_load || busiest_nr_running == 0)
+}
-                goto out_balanced;
-        avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
-        if (this_load >= avg_load ||
+/**
-                        100*max_load <= sd->imbalance_pct*this_load)
+ * fix_small_imbalance - Calculate the minor imbalance that exists
-                goto out_balanced;
+ *                      amongst the groups of a sched_domain, during
+ *                      load balancing.
+ * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
+ * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
+ * @imbalance: Variable to store the imbalance.
+ */
+static inline void fix_small_imbalance(struct sd_lb_stats *sds,
+                                int this_cpu, unsigned long *imbalance)
+{
+        unsigned long tmp, pwr_now = 0, pwr_move = 0;
+        unsigned int imbn = 2;
+        if (sds->this_nr_running) {
+                sds->this_load_per_task /= sds->this_nr_running;
+                if (sds->busiest_load_per_task >
+                                sds->this_load_per_task)
+                        imbn = 1;
+        } else
+                sds->this_load_per_task =
+                        cpu_avg_load_per_task(this_cpu);
-        busiest_load_per_task /= busiest_nr_running;
+        if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
-        if (group_imb)
+                        sds->busiest_load_per_task * imbn) {
-                busiest_load_per_task = min(busiest_load_per_task, avg_load);
+                *imbalance = sds->busiest_load_per_task;
+                return;
+        }
        /*
-         * We're trying to get all the cpus to the average_load, so we don't
+         * OK, we don't have enough imbalance to justify moving tasks,
-         * want to push ourselves above the average load, nor do we wish to
+         * however we may be able to increase total CPU power used by
-         * reduce the max loaded cpu below the average load, as either of these
+         * moving them.
-         * actions would just result in more rebalancing later, and ping-pong
-         * tasks around. Thus we look for the minimum possible imbalance.
-         * Negative imbalances (*we* are more loaded than anyone else) will
-         * be counted as no imbalance for these purposes -- we can't fix that
-         * by pulling tasks to us. Be careful of negative numbers as they'll
-         * appear as very large values with unsigned longs.
         */
-        if (max_load <= busiest_load_per_task)
-                goto out_balanced;
+        pwr_now += sds->busiest->__cpu_power *
+                        min(sds->busiest_load_per_task, sds->max_load);
+        pwr_now += sds->this->__cpu_power *
+                        min(sds->this_load_per_task, sds->this_load);
+        pwr_now /= SCHED_LOAD_SCALE;
+        /* Amount of load we'd subtract */
+        tmp = sg_div_cpu_power(sds->busiest,
+                        sds->busiest_load_per_task * SCHED_LOAD_SCALE);
+        if (sds->max_load > tmp)
+                pwr_move += sds->busiest->__cpu_power *
+                        min(sds->busiest_load_per_task, sds->max_load - tmp);
+        /* Amount of load we'd add */
+        if (sds->max_load * sds->busiest->__cpu_power <
+                sds->busiest_load_per_task * SCHED_LOAD_SCALE)
+                tmp = sg_div_cpu_power(sds->this,
+                        sds->max_load * sds->busiest->__cpu_power);
+        else
+                tmp = sg_div_cpu_power(sds->this,
+                        sds->busiest_load_per_task * SCHED_LOAD_SCALE);
+        pwr_move += sds->this->__cpu_power *
+                        min(sds->this_load_per_task, sds->this_load + tmp);
+        pwr_move /= SCHED_LOAD_SCALE;
+        /* Move if we gain throughput */
+        if (pwr_move > pwr_now)
+                *imbalance = sds->busiest_load_per_task;
+}
+/**
+ * calculate_imbalance - Calculate the amount of imbalance present within the
+ *                       groups of a given sched_domain during load balance.
+ * @sds: statistics of the sched_domain whose imbalance is to be calculated.
+ * @this_cpu: Cpu for which currently load balance is being performed.
+ * @imbalance: The variable to store the imbalance.
+ */
+static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
+                unsigned long *imbalance)
+{
+        unsigned long max_pull;
        /*
         * In the presence of smp nice balancing, certain scenarios can have
         * max load less than avg load(as we skip the groups at or below
         * its cpu_power, while calculating max_load..)
         */
-        if (max_load < avg_load) {
+        if (sds->max_load < sds->avg_load) {
                *imbalance = 0;
-                goto small_imbalance;
+                return fix_small_imbalance(sds, this_cpu, imbalance);
        }
        /* Don't want to pull so many tasks that a group would go idle */
-        max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
+        max_pull = min(sds->max_load - sds->avg_load,
+                        sds->max_load - sds->busiest_load_per_task);
        /* How much load to actually move to equalise the imbalance */
-        *imbalance = min(max_pull * busiest->__cpu_power,
+        *imbalance = min(max_pull * sds->busiest->__cpu_power,
-                                (avg_load - this_load) * this->__cpu_power)
+                (sds->avg_load - sds->this_load) * sds->this->__cpu_power)
                        / SCHED_LOAD_SCALE;
        /*
@@ -3333,78 +3679,110 @@ group_next:
         * a think about bumping its value to force at least one task to be
         * moved
         */
-        if (*imbalance < busiest_load_per_task) {
+        if (*imbalance < sds->busiest_load_per_task)
-                unsigned long tmp, pwr_now, pwr_move;
+                return fix_small_imbalance(sds, this_cpu, imbalance);
-                unsigned int imbn;
-small_imbalance:
-                pwr_move = pwr_now = 0;
-                imbn = 2;
-                if (this_nr_running) {
-                        this_load_per_task /= this_nr_running;
-                        if (busiest_load_per_task > this_load_per_task)
-                                imbn = 1;
-                } else
-                        this_load_per_task = cpu_avg_load_per_task(this_cpu);
-                if (max_load - this_load + busiest_load_per_task >=
+}
-                                        busiest_load_per_task * imbn) {
+/******* find_busiest_group() helpers end here *********************/
-                        *imbalance = busiest_load_per_task;
-                        return busiest;
-                }
-                /*
+/**
-                 * OK, we don't have enough imbalance to justify moving tasks,
+ * find_busiest_group - Returns the busiest group within the sched_domain
-                 * however we may be able to increase total CPU power used by
+ * if there is an imbalance. If there isn't an imbalance, and
-                 * moving them.
+ * the user has opted for power-savings, it returns a group whose
-                 */
+ * CPUs can be put to idle by rebalancing those tasks elsewhere, if
+ * such a group exists.
+ *
+ * Also calculates the amount of weighted load which should be moved
+ * to restore balance.
+ *
+ * @sd: The sched_domain whose busiest group is to be returned.
+ * @this_cpu: The cpu for which load balancing is currently being performed.
+ * @imbalance: Variable which stores amount of weighted load which should
+ *              be moved to restore balance/put a group to idle.
+ * @idle: The idle status of this_cpu.
+ * @sd_idle: The idleness of sd
+ * @cpus: The set of CPUs under consideration for load-balancing.
+ * @balance: Pointer to a variable indicating if this_cpu
+ *      is the appropriate cpu to perform load balancing at this_level.
+ *
+ * Returns:     - the busiest group if imbalance exists.
+ *              - If no imbalance and user has opted for power-savings balance,
+ *                 return the least loaded group whose CPUs can be
+ *                 put to idle by rebalancing its tasks onto our group.
+ */
+static struct sched_group *
+find_busiest_group(struct sched_domain *sd, int this_cpu,
+                   unsigned long *imbalance, enum cpu_idle_type idle,
+                   int *sd_idle, const struct cpumask *cpus, int *balance)
+{
+        struct sd_lb_stats sds;
-                pwr_now += busiest->__cpu_power *
+        memset(&sds, 0, sizeof(sds));
-                                min(busiest_load_per_task, max_load);
-                pwr_now += this->__cpu_power *
-                                min(this_load_per_task, this_load);
-                pwr_now /= SCHED_LOAD_SCALE;
-                /* Amount of load we'd subtract */
-                tmp = sg_div_cpu_power(busiest,
-                                busiest_load_per_task * SCHED_LOAD_SCALE);
-                if (max_load > tmp)
-                        pwr_move += busiest->__cpu_power *
-                                min(busiest_load_per_task, max_load - tmp);
-                /* Amount of load we'd add */
-                if (max_load * busiest->__cpu_power <
-                                busiest_load_per_task * SCHED_LOAD_SCALE)
-                        tmp = sg_div_cpu_power(this,
-                                        max_load * busiest->__cpu_power);
-                else
-                        tmp = sg_div_cpu_power(this,
-                                busiest_load_per_task * SCHED_LOAD_SCALE);
-                pwr_move += this->__cpu_power *
-                                min(this_load_per_task, this_load + tmp);
-                pwr_move /= SCHED_LOAD_SCALE;
-                /* Move if we gain throughput */
+        /*
-                if (pwr_move > pwr_now)
+         * Compute the various statistics relevant for load balancing at
-                        *imbalance = busiest_load_per_task;
+         * this level.
-        }
+         */
+        update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
+                                        balance, &sds);
+        /* Cases where imbalance does not exist from POV of this_cpu */
+        /* 1) this_cpu is not the appropriate cpu to perform load balancing
+         *    at this level.
+         * 2) There is no busy sibling group to pull from.
+         * 3) This group is the busiest group.
+         * 4) This group is more busy than the avg busyness at this
+         *    sched_domain.
+         * 5) The imbalance is within the specified limit.
+         * 6) Any rebalance would lead to ping-pong
+         */
+        if (balance && !(*balance))
+                goto ret;
-        return busiest;
+        if (!sds.busiest || sds.busiest_nr_running == 0)
+                goto out_balanced;
-out_balanced:
+        if (sds.this_load >= sds.max_load)
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+                goto out_balanced;
-        if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
-                goto ret;
-        if (this == group_leader && group_leader != group_min) {
+        sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
-                *imbalance = min_load_per_task;
-                if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
+        if (sds.this_load >= sds.avg_load)
-                        cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
+                goto out_balanced;
-                                cpumask_first(sched_group_cpus(group_leader));
-                }
+        if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
-                return group_min;
+                goto out_balanced;
-        }
-#endif
+        sds.busiest_load_per_task /= sds.busiest_nr_running;
+        if (sds.group_imb)
+                sds.busiest_load_per_task =
+                        min(sds.busiest_load_per_task, sds.avg_load);
+        /*
+         * We're trying to get all the cpus to the average_load, so we don't
+         * want to push ourselves above the average load, nor do we wish to
+         * reduce the max loaded cpu below the average load, as either of these
+         * actions would just result in more rebalancing later, and ping-pong
+         * tasks around. Thus we look for the minimum possible imbalance.
+         * Negative imbalances (*we* are more loaded than anyone else) will
+         * be counted as no imbalance for these purposes -- we can't fix that
+         * by pulling tasks to us. Be careful of negative numbers as they'll
+         * appear as very large values with unsigned longs.
+         */
+        if (sds.max_load <= sds.busiest_load_per_task)
+                goto out_balanced;
+        /* Looks like there is an imbalance. Compute it */
+        calculate_imbalance(&sds, this_cpu, imbalance);
+        return sds.busiest;
+out_balanced:
+        /*
+         * There is no obvious imbalance. But check if we can do some balancing
+         * to save power.
+         */
+        if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
+                return sds.busiest;
 ret:
        *imbalance = 0;
        return NULL;
@@ -3448,19 +3826,23 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
 */
 #define MAX_PINNED_INTERVAL     512
+/* Working cpumask for load_balance and load_balance_newidle. */
+static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
 /*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 */
 static int load_balance(int this_cpu, struct rq *this_rq,
                        struct sched_domain *sd, enum cpu_idle_type idle,
-                        int *balance, struct cpumask *cpus)
+                        int *balance)
 {
        int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
        struct sched_group *group;
        unsigned long imbalance;
        struct rq *busiest;
        unsigned long flags;
+        struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
        cpumask_setall(cpus);
@@ -3615,8 +3997,7 @@ out:
 * this_rq is locked.
 */
 static int
-load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
+load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
-                        struct cpumask *cpus)
 {
        struct sched_group *group;
        struct rq *busiest = NULL;
@@ -3624,6 +4005,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
        int ld_moved = 0;
        int sd_idle = 0;
        int all_pinned = 0;
+        struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
        cpumask_setall(cpus);
@@ -3764,10 +4146,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
        struct sched_domain *sd;
        int pulled_task = 0;
        unsigned long next_balance = jiffies + HZ;
-        cpumask_var_t tmpmask;
-        if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
-                return;
        for_each_domain(this_cpu, sd) {
                unsigned long interval;
@@ -3778,7 +4156,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
                if (sd->flags & SD_BALANCE_NEWIDLE)
                        /* If we've pulled tasks over stop searching: */
                        pulled_task = load_balance_newidle(this_cpu, this_rq,
-                                                           sd, tmpmask);
+                                                           sd);
                interval = msecs_to_jiffies(sd->balance_interval);
                if (time_after(next_balance, sd->last_balance + interval))
@@ -3793,7 +4171,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
                 */
                this_rq->next_balance = next_balance;
        }
-        free_cpumask_var(tmpmask);
 }
 /*
@@ -3943,11 +4320,6 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
        unsigned long next_balance = jiffies + 60*HZ;
        int update_next_balance = 0;
        int need_serialize;
-        cpumask_var_t tmp;
-        /* Fails alloc?  Rebalancing probably not a priority right now. */
-        if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
-                return;
        for_each_domain(cpu, sd) {
                if (!(sd->flags & SD_LOAD_BALANCE))
@@ -3972,7 +4344,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
                }
                if (time_after_eq(jiffies, sd->last_balance + interval)) {
-                        if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
+                        if (load_balance(cpu, rq, sd, idle, &balance)) {
                                /*
                                 * We've pulled tasks over so either we're no
                                 * longer idle, or one of our SMT siblings is
@@ -4006,8 +4378,6 @@ out:
         */
        if (likely(update_next_balance))
                rq->next_balance = next_balance;
-        free_cpumask_var(tmp);
 }
 /*
@@ -4057,6 +4427,11 @@ static void run_rebalance_domains(struct softirq_action *h)
 #endif
 }
+/*
+ * Return non-zero when @cpu's runqueue has no sched_domain attached
+ * (rq->sd is NULL). The pointer is read with rcu_dereference().
+ */
+static inline int on_null_domain(int cpu)
+{
+        return !rcu_dereference(cpu_rq(cpu)->sd);
+}
 /*
 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
 *
@@ -4114,7 +4489,9 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
            cpumask_test_cpu(cpu, nohz.cpu_mask))
                return;
 #endif
-        if (time_after_eq(jiffies, rq->next_balance))
+        /* Don't need to rebalance while attached to NULL domain */
+        if (time_after_eq(jiffies, rq->next_balance) &&
+            likely(!on_null_domain(cpu)))
                raise_softirq(SCHED_SOFTIRQ);
 }
@@ -4404,10 +4781,7 @@ void scheduler_tick(void)
 #endif
 }
-#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+unsigned long get_parent_ip(unsigned long addr)
-                                defined(CONFIG_PREEMPT_TRACER))
-static inline unsigned long get_parent_ip(unsigned long addr)
 {
        if (in_lock_functions(addr)) {
                addr = CALLER_ADDR2;
@@ -4417,6 +4791,9 @@ static inline unsigned long get_parent_ip(unsigned long addr)
        return addr;
 }
+#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+                                defined(CONFIG_PREEMPT_TRACER))
 void __kprobes add_preempt_count(int val)
 {
 #ifdef CONFIG_DEBUG_PREEMPT
@@ -4508,11 +4885,33 @@ static inline void schedule_debug(struct task_struct *prev)
 #endif
 }
+/*
+ * Hand @prev back to its scheduling class via sched_class->put_prev_task().
+ *
+ * If @prev is still TASK_RUNNING, the difference between its
+ * se.sum_exec_runtime and se.prev_sum_exec_runtime is first folded into
+ * se.avg_overlap (see the comment in the body for the rationale).
+ */
+static void put_prev_task(struct rq *rq, struct task_struct *prev)
+{
+        if (prev->state == TASK_RUNNING) {
+                u64 runtime = prev->se.sum_exec_runtime;
+                runtime -= prev->se.prev_sum_exec_runtime;
+                /* Cap the sample at twice sysctl_sched_migration_cost. */
+                runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
+                /*
+                 * In order to avoid avg_overlap growing stale when we are
+                 * indeed overlapping and hence not getting put to sleep, grow
+                 * the avg_overlap on preemption.
+                 *
+                 * We use the average preemption runtime because that
+                 * correlates to the amount of cache footprint a task can
+                 * build up.
+                 */
+                update_avg(&prev->se.avg_overlap, runtime);
+        }
+        prev->sched_class->put_prev_task(rq, prev);
+}
 /*
 * Pick up the highest-prio task:
 */
 static inline struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev)
+pick_next_task(struct rq *rq)
 {
        const struct sched_class *class;
        struct task_struct *p;
@@ -4543,15 +4942,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
 /*
 * schedule() is the main scheduler function.
 */
-asmlinkage void __sched schedule(void)
+asmlinkage void __sched __schedule(void)
 {
        struct task_struct *prev, *next;
        unsigned long *switch_count;
        struct rq *rq;
        int cpu;
-need_resched:
-        preempt_disable();
        cpu = smp_processor_id();
        rq = cpu_rq(cpu);
        rcu_qsctr_inc(cpu);
@@ -4586,8 +4983,8 @@ need_resched_nonpreemptible:
        if (unlikely(!rq->nr_running))
                idle_balance(cpu, rq);
-        prev->sched_class->put_prev_task(rq, prev);
+        put_prev_task(rq, prev);
-        next = pick_next_task(rq, prev);
+        next = pick_next_task(rq);
        if (likely(prev != next)) {
                sched_info_switch(prev, next);
@@ -4608,13 +5005,80 @@ need_resched_nonpreemptible:
        if (unlikely(reacquire_kernel_lock(current) < 0))
                goto need_resched_nonpreemptible;
+}
+/*
+ * schedule() wraps __schedule(): preemption is disabled around the call,
+ * re-enabled with preempt_enable_no_resched(), and the whole sequence is
+ * repeated for as long as TIF_NEED_RESCHED is found set afterwards.
+ */
+asmlinkage void __sched schedule(void)
+{
+need_resched:
+        preempt_disable();
+        __schedule();
         preempt_enable_no_resched();
         if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
                 goto need_resched;
 }
 EXPORT_SYMBOL(schedule);
+#ifdef CONFIG_SMP
+/*
+ * Look out! "owner" is an entirely speculative pointer
+ * access and not reliable.
+ *
+ * Busy-waits with cpu_relax() for as long as lock->owner still equals
+ * @owner and @owner is the current task on the runqueue of the cpu
+ * recorded in owner->cpu.
+ *
+ * Returns 0 if the OWNER_SPIN scheduler feature is off, if @owner is not
+ * the current task of that runqueue, or if need_resched() is true.
+ * Returns 1 if lock->owner changed, or if owner->cpu could not be read
+ * (CONFIG_DEBUG_PAGEALLOC only), is >= nr_cpumask_bits, or names a cpu
+ * that is not online.
+ */
+int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
+{
+        unsigned int cpu;
+        struct rq *rq;
+        if (!sched_feat(OWNER_SPIN))
+                return 0;
+#ifdef CONFIG_DEBUG_PAGEALLOC
+        /*
+         * Need to access the cpu field knowing that
+         * DEBUG_PAGEALLOC could have unmapped it if
+         * the mutex owner just released it and exited.
+         */
+        if (probe_kernel_address(&owner->cpu, cpu))
+                goto out;
+#else
+        cpu = owner->cpu;
+#endif
+        /*
+         * Even if the access succeeded (likely case),
+         * the cpu field may no longer be valid.
+         */
+        if (cpu >= nr_cpumask_bits)
+                goto out;
+        /*
+         * We need to validate that we can do a
+         * get_cpu() and that we have the percpu area.
+         */
+        if (!cpu_online(cpu))
+                goto out;
+        rq = cpu_rq(cpu);
+        for (;;) {
+                /*
+                 * Owner changed, break to re-assess state.
+                 */
+                if (lock->owner != owner)
+                        break;
+                /*
+                 * Is that owner really running on that cpu?
+                 */
+                if (task_thread_info(rq->curr) != owner || need_resched())
+                        return 0;
+                cpu_relax();
+        }
+out:
+        return 1;
+}
+#endif
 #ifdef CONFIG_PREEMPT
 /*
 * this is the entry point to schedule() from in-kernel preemption
@@ -4642,7 +5106,7 @@ asmlinkage void __sched preempt_schedule(void)
                 * between schedule and now.
                 */
                barrier();
-        } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
+        } while (need_resched());
 }
 EXPORT_SYMBOL(preempt_schedule);
@@ -4671,7 +5135,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
                 * between schedule and now.
                 */
                barrier();
-        } while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
+        } while (need_resched());
 }
 #endif /* CONFIG_PREEMPT */
@@ -4732,11 +5196,17 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
        __wake_up_common(q, mode, 1, 0, NULL);
 }
+/*
+ * Call __wake_up_common() on @q with nr_exclusive = 1 and sync = 0, passing
+ * @key through to it. No locking is done here.
+ * NOTE(review): presumably the caller already holds q->lock, as the
+ * "_locked" suffix suggests - confirm against callers.
+ */
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
+{
+        __wake_up_common(q, mode, 1, 0, key);
+}
 /**
- * __wake_up_sync - wake up threads blocked on a waitqueue.
+ * __wake_up_sync_key - wake up threads blocked on a waitqueue.
 * @q: the waitqueue
 * @mode: which threads
 * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: opaque value to be passed to wakeup targets
 *
 * The sync wakeup differs that the waker knows that it will schedule
 * away soon, so while the target thread will be woken up, it will not
@@ -4745,8 +5215,8 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
 *
 * On UP it can prevent extra preemption.
 */
-void
+void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
-__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+                        int nr_exclusive, void *key)
 {
        unsigned long flags;
        int sync = 1;
@@ -4758,9 +5228,18 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
                sync = 0;
        spin_lock_irqsave(&q->lock, flags);
-        __wake_up_common(q, mode, nr_exclusive, sync, NULL);
+        __wake_up_common(q, mode, nr_exclusive, sync, key);
        spin_unlock_irqrestore(&q->lock, flags);
 }
+EXPORT_SYMBOL_GPL(__wake_up_sync_key);
+/*
+ * __wake_up_sync - see __wake_up_sync_key()
+ *
+ * Thin wrapper: forwards @q, @mode and @nr_exclusive to
+ * __wake_up_sync_key() with a NULL key.
+ */
+void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+{
+        __wake_up_sync_key(q, mode, nr_exclusive, NULL);
+}
 EXPORT_SYMBOL_GPL(__wake_up_sync);      /* For internal use only */
 /**
@@ -5145,7 +5624,7 @@ SYSCALL_DEFINE1(nice, int, increment)
        if (increment > 40)
                increment = 40;
-        nice = PRIO_TO_NICE(current->static_prio) + increment;
+        nice = TASK_NICE(current) + increment;
        if (nice < -20)
                nice = -20;
        if (nice > 19)
@@ -5944,12 +6423,7 @@ void sched_show_task(struct task_struct *p)
                printk(KERN_CONT " %016lx ", thread_saved_pc(p));
 #endif
 #ifdef CONFIG_DEBUG_STACK_USAGE
-        {
+        free = stack_not_used(p);
-                unsigned long *n = end_of_stack(p);
-                while (!*n)
-                        n++;
-                free = (unsigned long)n - (unsigned long)end_of_stack(p);
-        }
 #endif
        printk(KERN_CONT "%5lu %5d %6d\n", free,
                task_pid_nr(p), task_pid_nr(p->real_parent));
@@ -6423,7 +6897,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
                if (!rq->nr_running)
                        break;
                update_rq_clock(rq);
-                next = pick_next_task(rq, rq->curr);
+                next = pick_next_task(rq);
                if (!next)
                        break;
                next->sched_class->put_prev_task(rq, next);
@@ -6944,20 +7418,26 @@ static void free_rootdomain(struct root_domain *rd)
 static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 {
+        struct root_domain *old_rd = NULL;
        unsigned long flags;
        spin_lock_irqsave(&rq->lock, flags);
        if (rq->rd) {
-                struct root_domain *old_rd = rq->rd;
+                old_rd = rq->rd;
                if (cpumask_test_cpu(rq->cpu, old_rd->online))
                        set_rq_offline(rq);
                cpumask_clear_cpu(rq->cpu, old_rd->span);
-                if (atomic_dec_and_test(&old_rd->refcount))
+                /*
-                        free_rootdomain(old_rd);
+                 * If we don't want to free the old_rd yet then
+                 * set old_rd to NULL to skip the freeing later
+                 * in this function:
+                 */
+                if (!atomic_dec_and_test(&old_rd->refcount))
+                        old_rd = NULL;
        }
        atomic_inc(&rd->refcount);
@@ -6968,6 +7448,9 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
                set_rq_online(rq);
        spin_unlock_irqrestore(&rq->lock, flags);
+        if (old_rd)
+                free_rootdomain(old_rd);
 }
 static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
@@ -7245,7 +7728,7 @@ cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
 {
        int group;
-        cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+        cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
        group = cpumask_first(mask);
        if (sg)
                *sg = &per_cpu(sched_group_core, group).sg;
@@ -7274,7 +7757,7 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
        cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
        group = cpumask_first(mask);
 #elif defined(CONFIG_SCHED_SMT)
-        cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+        cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
        group = cpumask_first(mask);
 #else
        group = cpu;
@@ -7617,7 +8100,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
                SD_INIT(sd, SIBLING);
                set_domain_attribute(sd, attr);
                cpumask_and(sched_domain_span(sd),
-                            &per_cpu(cpu_sibling_map, i), cpu_map);
+                            topology_thread_cpumask(i), cpu_map);
                sd->parent = p;
                p->child = sd;
                cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7628,7 +8111,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
        /* Set up CPU (sibling) groups */
        for_each_cpu(i, cpu_map) {
                cpumask_and(this_sibling_map,
-                            &per_cpu(cpu_sibling_map, i), cpu_map);
+                            topology_thread_cpumask(i), cpu_map);
                if (i != cpumask_first(this_sibling_map))
                        continue;
@@ -8209,11 +8692,15 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
        __set_bit(MAX_RT_PRIO, array->bitmap);
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-        rt_rq->highest_prio = MAX_RT_PRIO;
+        rt_rq->highest_prio.curr = MAX_RT_PRIO;
+#ifdef CONFIG_SMP
+        rt_rq->highest_prio.next = MAX_RT_PRIO;
+#endif
 #endif
 #ifdef CONFIG_SMP
        rt_rq->rt_nr_migratory = 0;
        rt_rq->overloaded = 0;
+        plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
 #endif
        rt_rq->rt_time = 0;
@@ -8300,6 +8787,9 @@ void __init sched_init(void)
 #ifdef CONFIG_USER_SCHED
        alloc_size *= 2;
 #endif
+#ifdef CONFIG_CPUMASK_OFFSTACK
+        alloc_size += num_possible_cpus() * cpumask_size();
+#endif
        /*
         * As sched_init() is called before page_alloc is setup,
         * we use alloc_bootmem().
@@ -8337,6 +8827,12 @@ void __init sched_init(void)
                ptr += nr_cpu_ids * sizeof(void **);
 #endif /* CONFIG_USER_SCHED */
 #endif /* CONFIG_RT_GROUP_SCHED */
+#ifdef CONFIG_CPUMASK_OFFSTACK
+                for_each_possible_cpu(i) {
+                        per_cpu(load_balance_tmpmask, i) = (void *)ptr;
+                        ptr += cpumask_size();
+                }
+#endif /* CONFIG_CPUMASK_OFFSTACK */
        }
 #ifdef CONFIG_SMP
@@ -9215,6 +9711,16 @@ static int sched_rt_global_constraints(void)
        return ret;
 }
+int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
+{
+        /* Don't accept realtime tasks when there is no way for them to run */
+        if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
+                return 0;
+        return 1;
+}
 #else /* !CONFIG_RT_GROUP_SCHED */
 static int sched_rt_global_constraints(void)
 {
@@ -9308,8 +9814,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
                      struct task_struct *tsk)
 {
 #ifdef CONFIG_RT_GROUP_SCHED
-        /* Don't accept realtime tasks when there is no way for them to run */
+        if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
-        if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0)
                return -EINVAL;
 #else
        /* We don't support RT-tasks being in separate groups */
@@ -9472,7 +9977,7 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
 {
-        u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+        u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
        u64 data;
 #ifndef CONFIG_64BIT
@@ -9491,7 +9996,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
 static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
 {
-        u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+        u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
 #ifndef CONFIG_64BIT
        /*
@@ -9580,14 +10085,14 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
        struct cpuacct *ca;
        int cpu;
-        if (!cpuacct_subsys.active)
+        if (unlikely(!cpuacct_subsys.active))
                return;
        cpu = task_cpu(tsk);
        ca = task_ca(tsk);
        for (; ca; ca = ca->parent) {
-                u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+                u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
                *cpuusage += cputime;
        }
 }