path: root/kernel/sched.c
Diffstat (limited to 'kernel/sched.c')
-rw-r--r--  kernel/sched.c | 473
1 file changed, 394 insertions(+), 79 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 26efa475bdc1..7c9098d186e6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -39,6 +39,7 @@
 #include <linux/completion.h>
 #include <linux/kernel_stat.h>
 #include <linux/debug_locks.h>
+#include <linux/perf_counter.h>
 #include <linux/security.h>
 #include <linux/notifier.h>
 #include <linux/profile.h>
@@ -68,17 +69,18 @@
 #include <linux/pagemap.h>
 #include <linux/hrtimer.h>
 #include <linux/tick.h>
-#include <linux/bootmem.h>
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
 #include <linux/ftrace.h>
-#include <trace/sched.h>
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
 
 #include "sched_cpupri.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched.h>
+
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -118,12 +120,6 @@
  */
 #define RUNTIME_INF     ((u64)~0ULL)
 
-DEFINE_TRACE(sched_wait_task);
-DEFINE_TRACE(sched_wakeup);
-DEFINE_TRACE(sched_wakeup_new);
-DEFINE_TRACE(sched_switch);
-DEFINE_TRACE(sched_migrate_task);
-
 #ifdef CONFIG_SMP
 
 static void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -244,7 +240,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
                 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
                 delta = ktime_to_ns(ktime_sub(hard, soft));
                 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
-                                HRTIMER_MODE_ABS, 0);
+                                HRTIMER_MODE_ABS_PINNED, 0);
         }
         spin_unlock(&rt_b->rt_runtime_lock);
 }
@@ -584,6 +580,7 @@ struct rq {
         struct load_weight load;
         unsigned long nr_load_updates;
         u64 nr_switches;
+        u64 nr_migrations_in;
 
         struct cfs_rq cfs;
         struct rt_rq rt;
@@ -630,6 +627,10 @@ struct rq {
         struct list_head migration_queue;
 #endif
 
+        /* calc_load related fields */
+        unsigned long calc_load_update;
+        long calc_load_active;
+
 #ifdef CONFIG_SCHED_HRTICK
 #ifdef CONFIG_SMP
         int hrtick_csd_pending;
@@ -692,7 +693,7 @@ static inline int cpu_of(struct rq *rq)
 #define task_rq(p)              cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)           (cpu_rq(cpu)->curr)
 
-static inline void update_rq_clock(struct rq *rq)
+inline void update_rq_clock(struct rq *rq)
 {
         rq->clock = sched_clock_cpu(cpu_of(rq));
 }
@@ -1154,7 +1155,7 @@ static __init void init_hrtick(void)
 static void hrtick_start(struct rq *rq, u64 delay)
 {
         __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
-                        HRTIMER_MODE_REL, 0);
+                        HRTIMER_MODE_REL_PINNED, 0);
 }
 
 static inline void init_hrtick(void)
@@ -1728,6 +1729,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 }
 #endif
 
+static void calc_load_account_active(struct rq *this_rq);
+
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -1958,7 +1961,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
         clock_offset = old_rq->clock - new_rq->clock;
 
-        trace_sched_migrate_task(p, task_cpu(p), new_cpu);
+        trace_sched_migrate_task(p, new_cpu);
 
 #ifdef CONFIG_SCHEDSTATS
         if (p->se.wait_start)
@@ -1967,12 +1970,17 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
                 p->se.sleep_start -= clock_offset;
         if (p->se.block_start)
                 p->se.block_start -= clock_offset;
+#endif
         if (old_cpu != new_cpu) {
-                schedstat_inc(p, se.nr_migrations);
+                p->se.nr_migrations++;
+                new_rq->nr_migrations_in++;
+#ifdef CONFIG_SCHEDSTATS
                 if (task_hot(p, old_rq->clock, NULL))
                         schedstat_inc(p, se.nr_forced2_migrations);
-        }
 #endif
+                perf_swcounter_event(PERF_COUNT_SW_CPU_MIGRATIONS,
+                                     1, 1, NULL, 0);
+        }
         p->se.vruntime -= old_cfsrq->min_vruntime -
                                          new_cfsrq->min_vruntime;
 
@@ -2015,6 +2023,49 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
 }
 
 /*
+ * wait_task_context_switch -   wait for a thread to complete at least one
+ *                              context switch.
+ *
+ * @p must not be current.
+ */
+void wait_task_context_switch(struct task_struct *p)
+{
+        unsigned long nvcsw, nivcsw, flags;
+        int running;
+        struct rq *rq;
+
+        nvcsw   = p->nvcsw;
+        nivcsw  = p->nivcsw;
+        for (;;) {
+                /*
+                 * The runqueue is assigned before the actual context
+                 * switch. We need to take the runqueue lock.
+                 *
+                 * We could check initially without the lock but it is
+                 * very likely that we need to take the lock in every
+                 * iteration.
+                 */
+                rq = task_rq_lock(p, &flags);
+                running = task_running(rq, p);
+                task_rq_unlock(rq, &flags);
+
+                if (likely(!running))
+                        break;
+                /*
+                 * The switch count is incremented before the actual
+                 * context switch. We thus wait for two switches to be
+                 * sure at least one completed.
+                 */
+                if ((p->nvcsw - nvcsw) > 1)
+                        break;
+                if ((p->nivcsw - nivcsw) > 1)
+                        break;
+
+                cpu_relax();
+        }
+}
+
+/*
  * wait_task_inactive - wait for a thread to unschedule.
  *
  * If @match_state is nonzero, it's the @p->state value just checked and
@@ -2142,6 +2193,7 @@ void kick_process(struct task_struct *p)
         smp_send_reschedule(cpu);
         preempt_enable();
 }
+EXPORT_SYMBOL_GPL(kick_process);
 
 /*
  * Return a low guess at the load of a migration-source cpu weighted
@@ -2324,6 +2376,27 @@ static int sched_balance_self(int cpu, int flag)
 
 #endif /* CONFIG_SMP */
 
+/**
+ * task_oncpu_function_call - call a function on the cpu on which a task runs
+ * @p:          the task to evaluate
+ * @func:       the function to be called
+ * @info:       the function call argument
+ *
+ * Calls the function @func when the task is currently running. This might
+ * be on the current CPU, which just calls the function directly
+ */
+void task_oncpu_function_call(struct task_struct *p,
+                              void (*func) (void *info), void *info)
+{
+        int cpu;
+
+        preempt_disable();
+        cpu = task_cpu(p);
+        if (task_curr(p))
+                smp_call_function_single(cpu, func, info, 1);
+        preempt_enable();
+}
+
 /***
  * try_to_wake_up - wake up a thread
  * @p: the to-be-woken-up thread
@@ -2458,6 +2531,17 @@ out:
         return success;
 }
 
+/**
+ * wake_up_process - Wake up a specific process
+ * @p: The process to be woken up.
+ *
+ * Attempt to wake up the nominated process and move it to the set of runnable
+ * processes.  Returns 1 if the process was woken up, 0 if it was already
+ * running.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
 int wake_up_process(struct task_struct *p)
 {
         return try_to_wake_up(p, TASK_ALL, 0);
@@ -2480,6 +2564,7 @@ static void __sched_fork(struct task_struct *p)
         p->se.exec_start                = 0;
         p->se.sum_exec_runtime          = 0;
         p->se.prev_sum_exec_runtime     = 0;
+        p->se.nr_migrations             = 0;
         p->se.last_wakeup               = 0;
         p->se.avg_overlap               = 0;
         p->se.start_runtime             = 0;
@@ -2710,6 +2795,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
          */
         prev_state = prev->state;
         finish_arch_switch(prev);
+        perf_counter_task_sched_in(current, cpu_of(rq));
         finish_lock_switch(rq, prev);
 #ifdef CONFIG_SMP
         if (post_schedule)
@@ -2766,7 +2852,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
          * combine the page table reload and the switch backend into
          * one hypercall.
          */
-        arch_enter_lazy_cpu_mode();
+        arch_start_context_switch(prev);
 
         if (unlikely(!mm)) {
                 next->active_mm = oldmm;
@@ -2856,19 +2942,81 @@ unsigned long nr_iowait(void)
         return sum;
 }
 
-unsigned long nr_active(void)
+/* Variables and functions for calc_load */
+static atomic_long_t calc_load_tasks;
+static unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun);
+
+/**
+ * get_avenrun - get the load average array
+ * @loads:      pointer to dest load array
+ * @offset:     offset to add
+ * @shift:      shift count to shift the result left
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
 {
-        unsigned long i, running = 0, uninterruptible = 0;
+        loads[0] = (avenrun[0] + offset) << shift;
+        loads[1] = (avenrun[1] + offset) << shift;
+        loads[2] = (avenrun[2] + offset) << shift;
+}
 
-        for_each_online_cpu(i) {
-                running += cpu_rq(i)->nr_running;
-                uninterruptible += cpu_rq(i)->nr_uninterruptible;
-        }
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+        load *= exp;
+        load += active * (FIXED_1 - exp);
+        return load >> FSHIFT;
+}
+
+/*
+ * calc_load - update the avenrun load estimates 10 ticks after the
+ * CPUs have updated calc_load_tasks.
+ */
+void calc_global_load(void)
+{
+        unsigned long upd = calc_load_update + 10;
+        long active;
+
+        if (time_before(jiffies, upd))
+                return;
+
+        active = atomic_long_read(&calc_load_tasks);
+        active = active > 0 ? active * FIXED_1 : 0;
+
+        avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+        avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+        avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+        calc_load_update += LOAD_FREQ;
+}
+
+/*
+ * Either called from update_cpu_load() or from a cpu going idle
+ */
+static void calc_load_account_active(struct rq *this_rq)
+{
+        long nr_active, delta;
+
+        nr_active = this_rq->nr_running;
+        nr_active += (long) this_rq->nr_uninterruptible;
 
-        if (unlikely((long)uninterruptible < 0))
-                uninterruptible = 0;
+        if (nr_active != this_rq->calc_load_active) {
+                delta = nr_active - this_rq->calc_load_active;
+                this_rq->calc_load_active = nr_active;
+                atomic_long_add(delta, &calc_load_tasks);
+        }
+}
 
-        return running + uninterruptible;
+/*
+ * Externally visible per-cpu scheduler statistics:
+ * cpu_nr_migrations(cpu) - number of migrations into that cpu
+ */
+u64 cpu_nr_migrations(int cpu)
+{
+        return cpu_rq(cpu)->nr_migrations_in;
 }
 
 /*
@@ -2899,6 +3047,11 @@ static void update_cpu_load(struct rq *this_rq)
                 new_load += scale-1;
                 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
         }
+
+        if (time_after_eq(jiffies, this_rq->calc_load_update)) {
+                this_rq->calc_load_update += LOAD_FREQ;
+                calc_load_account_active(this_rq);
+        }
 }
 
 #ifdef CONFIG_SMP
@@ -4240,10 +4393,131 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 static struct {
         atomic_t load_balancer;
         cpumask_var_t cpu_mask;
+        cpumask_var_t ilb_grp_nohz_mask;
 } nohz ____cacheline_aligned = {
         .load_balancer = ATOMIC_INIT(-1),
 };
 
+int get_nohz_load_balancer(void)
+{
+        return atomic_read(&nohz.load_balancer);
+}
+
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * lowest_flag_domain - Return lowest sched_domain containing flag.
+ * @cpu:        The cpu whose lowest level of sched domain is to
+ *              be returned.
+ * @flag:       The flag to check for the lowest sched_domain
+ *              for the given cpu.
+ *
+ * Returns the lowest sched_domain of a cpu which contains the given flag.
+ */
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+        struct sched_domain *sd;
+
+        for_each_domain(cpu, sd)
+                if (sd && (sd->flags & flag))
+                        break;
+
+        return sd;
+}
+
+/**
+ * for_each_flag_domain - Iterates over sched_domains containing the flag.
+ * @cpu:        The cpu whose domains we're iterating over.
+ * @sd:         variable holding the value of the power_savings_sd
+ *              for cpu.
+ * @flag:       The flag to filter the sched_domains to be iterated.
+ *
+ * Iterates over all the scheduler domains for a given cpu that has the 'flag'
+ * set, starting from the lowest sched_domain to the highest.
+ */
+#define for_each_flag_domain(cpu, sd, flag) \
+        for (sd = lowest_flag_domain(cpu, flag); \
+                (sd && (sd->flags & flag)); sd = sd->parent)
+
+/**
+ * is_semi_idle_group - Checks if the given sched_group is semi-idle.
+ * @ilb_group:  group to be checked for semi-idleness
+ *
+ * Returns:     1 if the group is semi-idle. 0 otherwise.
+ *
+ * We define a sched_group to be semi idle if it has atleast one idle-CPU
+ * and atleast one non-idle CPU. This helper function checks if the given
+ * sched_group is semi-idle or not.
+ */
+static inline int is_semi_idle_group(struct sched_group *ilb_group)
+{
+        cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+                                        sched_group_cpus(ilb_group));
+
+        /*
+         * A sched_group is semi-idle when it has atleast one busy cpu
+         * and atleast one idle cpu.
+         */
+        if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+                return 0;
+
+        if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+                return 0;
+
+        return 1;
+}
+/**
+ * find_new_ilb - Finds the optimum idle load balancer for nomination.
+ * @cpu:        The cpu which is nominating a new idle_load_balancer.
+ *
+ * Returns:     Returns the id of the idle load balancer if it exists,
+ *              Else, returns >= nr_cpu_ids.
+ *
+ * This algorithm picks the idle load balancer such that it belongs to a
+ * semi-idle powersavings sched_domain. The idea is to try and avoid
+ * completely idle packages/cores just for the purpose of idle load balancing
+ * when there are other idle cpu's which are better suited for that job.
+ */
+static int find_new_ilb(int cpu)
+{
+        struct sched_domain *sd;
+        struct sched_group *ilb_group;
+
+        /*
+         * Have idle load balancer selection from semi-idle packages only
+         * when power-aware load balancing is enabled
+         */
+        if (!(sched_smt_power_savings || sched_mc_power_savings))
+                goto out_done;
+
+        /*
+         * Optimize for the case when we have no idle CPUs or only one
+         * idle CPU. Don't walk the sched_domain hierarchy in such cases
+         */
+        if (cpumask_weight(nohz.cpu_mask) < 2)
+                goto out_done;
+
+        for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
+                ilb_group = sd->groups;
+
+                do {
+                        if (is_semi_idle_group(ilb_group))
+                                return cpumask_first(nohz.ilb_grp_nohz_mask);
+
+                        ilb_group = ilb_group->next;
+
+                } while (ilb_group != sd->groups);
+        }
+
+out_done:
+        return cpumask_first(nohz.cpu_mask);
+}
+#else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+static inline int find_new_ilb(int call_cpu)
+{
+        return cpumask_first(nohz.cpu_mask);
+}
+#endif
+
 /*
  * This routine will try to nominate the ilb (idle load balancing)
  * owner among the cpus whose ticks are stopped. ilb owner will do the idle
@@ -4298,8 +4572,24 @@ int select_nohz_load_balancer(int stop_tick)
                 /* make me the ilb owner */
                 if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
                         return 1;
-        } else if (atomic_read(&nohz.load_balancer) == cpu)
+        } else if (atomic_read(&nohz.load_balancer) == cpu) {
+                int new_ilb;
+
+                if (!(sched_smt_power_savings ||
+                                        sched_mc_power_savings))
+                        return 1;
+                /*
+                 * Check to see if there is a more power-efficient
+                 * ilb.
+                 */
+                new_ilb = find_new_ilb(cpu);
+                if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
+                        atomic_set(&nohz.load_balancer, -1);
+                        resched_cpu(new_ilb);
+                        return 0;
+                }
                 return 1;
+        }
         } else {
                 if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
                         return 0;
@@ -4468,15 +4758,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
                 }
 
                 if (atomic_read(&nohz.load_balancer) == -1) {
-                        /*
-                         * simple selection for now: Nominate the
-                         * first cpu in the nohz list to be the next
-                         * ilb owner.
-                         *
-                         * TBD: Traverse the sched domains and nominate
-                         * the nearest cpu in the nohz.cpu_mask.
-                         */
-                        int ilb = cpumask_first(nohz.cpu_mask);
+                        int ilb = find_new_ilb(cpu);
 
                         if (ilb < nr_cpu_ids)
                                 resched_cpu(ilb);
@@ -4840,6 +5122,8 @@ void scheduler_tick(void)
         curr->sched_class->task_tick(rq, curr, 0);
         spin_unlock(&rq->lock);
 
+        perf_counter_task_tick(curr, cpu);
+
 #ifdef CONFIG_SMP
         rq->idle_at_tick = idle_cpu(cpu);
         trigger_load_balance(rq, cpu);
@@ -5007,13 +5291,15 @@ pick_next_task(struct rq *rq)
 /*
  * schedule() is the main scheduler function.
  */
-asmlinkage void __sched __schedule(void)
+asmlinkage void __sched schedule(void)
 {
         struct task_struct *prev, *next;
         unsigned long *switch_count;
         struct rq *rq;
         int cpu;
 
+need_resched:
+        preempt_disable();
         cpu = smp_processor_id();
         rq = cpu_rq(cpu);
         rcu_qsctr_inc(cpu);
@@ -5053,6 +5339,7 @@ need_resched_nonpreemptible:
 
         if (likely(prev != next)) {
                 sched_info_switch(prev, next);
+                perf_counter_task_sched_out(prev, next, cpu);
 
                 rq->nr_switches++;
                 rq->curr = next;
@@ -5070,15 +5357,9 @@ need_resched_nonpreemptible:
 
         if (unlikely(reacquire_kernel_lock(current) < 0))
                 goto need_resched_nonpreemptible;
-}
 
-asmlinkage void __sched schedule(void)
-{
-need_resched:
-        preempt_disable();
-        __schedule();
         preempt_enable_no_resched();
-        if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+        if (need_resched())
                 goto need_resched;
 }
 EXPORT_SYMBOL(schedule);
@@ -5221,7 +5502,7 @@ EXPORT_SYMBOL(default_wake_function);
  * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
-void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
                         int nr_exclusive, int sync, void *key)
 {
         wait_queue_t *curr, *next;
@@ -5241,6 +5522,9 @@ void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
  * @mode: which threads
  * @nr_exclusive: how many wake-one or wake-many threads to wake up
  * @key: is directly passed to the wakeup function
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void __wake_up(wait_queue_head_t *q, unsigned int mode,
                         int nr_exclusive, void *key)
@@ -5279,6 +5563,9 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
  * with each other. This can prevent needless bouncing between CPUs.
  *
  * On UP it can prevent extra preemption.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
                         int nr_exclusive, void *key)
@@ -5315,6 +5602,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
  * awakened in the same order in which they were queued.
  *
  * See also complete_all(), wait_for_completion() and related routines.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void complete(struct completion *x)
 {
@@ -5332,6 +5622,9 @@ EXPORT_SYMBOL(complete);
  * @x:  holds the state of this particular completion
  *
  * This will wake up all threads waiting on this particular completion event.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void complete_all(struct completion *x)
 {
@@ -6490,8 +6783,9 @@ void sched_show_task(struct task_struct *p)
 #ifdef CONFIG_DEBUG_STACK_USAGE
         free = stack_not_used(p);
 #endif
-        printk(KERN_CONT "%5lu %5d %6d\n", free,
-                task_pid_nr(p), task_pid_nr(p->real_parent));
+        printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
+                task_pid_nr(p), task_pid_nr(p->real_parent),
+                (unsigned long)task_thread_info(p)->flags);
 
         show_stack(p, NULL);
 }
@@ -6752,7 +7046,7 @@ static int migration_thread(void *data)
 
                 if (cpu_is_offline(cpu)) {
                         spin_unlock_irq(&rq->lock);
-                        goto wait_to_die;
+                        break;
                 }
 
                 if (rq->active_balance) {
@@ -6778,16 +7072,7 @@ static int migration_thread(void *data)
                 complete(&req->done);
         }
         __set_current_state(TASK_RUNNING);
-        return 0;
 
-wait_to_die:
-        /* Wait for kthread_stop */
-        set_current_state(TASK_INTERRUPTIBLE);
-        while (!kthread_should_stop()) {
-                schedule();
-                set_current_state(TASK_INTERRUPTIBLE);
-        }
-        __set_current_state(TASK_RUNNING);
         return 0;
 }
 
@@ -6970,6 +7255,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 
         }
 }
+
+/*
+ * remove the tasks which were accounted by rq from calc_load_tasks.
+ */
+static void calc_global_load_remove(struct rq *rq)
+{
+        atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+}
 #endif /* CONFIG_HOTPLUG_CPU */
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -7193,6 +7486,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 rq = task_rq_lock(p, &flags);
                 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
                 task_rq_unlock(rq, &flags);
+                get_task_struct(p);
                 cpu_rq(cpu)->migration_thread = p;
                 break;
 
@@ -7204,6 +7498,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 /* Update our root-domain */
                 rq = cpu_rq(cpu);
                 spin_lock_irqsave(&rq->lock, flags);
+                rq->calc_load_update = calc_load_update;
+                rq->calc_load_active = 0;
                 if (rq->rd) {
                         BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 
@@ -7221,6 +7517,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 kthread_bind(cpu_rq(cpu)->migration_thread,
                              cpumask_any(cpu_online_mask));
                 kthread_stop(cpu_rq(cpu)->migration_thread);
+                put_task_struct(cpu_rq(cpu)->migration_thread);
                 cpu_rq(cpu)->migration_thread = NULL;
                 break;
 
@@ -7230,6 +7527,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 migrate_live_tasks(cpu);
                 rq = cpu_rq(cpu);
                 kthread_stop(rq->migration_thread);
+                put_task_struct(rq->migration_thread);
                 rq->migration_thread = NULL;
                 /* Idle task back to normal (off runqueue, low prio) */
                 spin_lock_irq(&rq->lock);
@@ -7243,7 +7541,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                 cpuset_unlock();
                 migrate_nr_uninterruptible(rq);
                 BUG_ON(rq->nr_running != 0);
-
+                calc_global_load_remove(rq);
                 /*
                  * No need to migrate the tasks: it was best-effort if
                  * they didn't take sched_hotcpu_mutex. Just wake up
@@ -7279,8 +7577,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
         return NOTIFY_OK;
 }
 
-/* Register at highest priority so that task migration (migrate_all_tasks)
- * happens before everything else.
+/*
+ * Register at high priority so that task migration (migrate_all_tasks)
+ * happens before everything else.  This has to be lower priority than
+ * the notifier in the perf_counter subsystem, though.
  */
 static struct notifier_block __cpuinitdata migration_notifier = {
         .notifier_call = migration_call,
@@ -7523,26 +7823,23 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
         free_rootdomain(old_rd);
 }
 
-static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
+static int init_rootdomain(struct root_domain *rd, bool bootmem)
 {
+        gfp_t gfp = GFP_KERNEL;
+
         memset(rd, 0, sizeof(*rd));
 
-        if (bootmem) {
-                alloc_bootmem_cpumask_var(&def_root_domain.span);
-                alloc_bootmem_cpumask_var(&def_root_domain.online);
-                alloc_bootmem_cpumask_var(&def_root_domain.rto_mask);
-                cpupri_init(&rd->cpupri, true);
-                return 0;
-        }
+        if (bootmem)
+                gfp = GFP_NOWAIT;
 
-        if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
+        if (!alloc_cpumask_var(&rd->span, gfp))
                 goto out;
-        if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
+        if (!alloc_cpumask_var(&rd->online, gfp))
                 goto free_span;
-        if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+        if (!alloc_cpumask_var(&rd->rto_mask, gfp))
                 goto free_online;
 
-        if (cpupri_init(&rd->cpupri, false) != 0)
+        if (cpupri_init(&rd->cpupri, bootmem) != 0)
                 goto free_rto_mask;
         return 0;
 
@@ -7753,8 +8050,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 
 /*
  * The cpus mask in sched_group and sched_domain hangs off the end.
- * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
- * for nr_cpu_ids < CONFIG_NR_CPUS.
+ *
+ * ( See the the comments in include/linux/sched.h:struct sched_group
+ *   and struct sched_domain. )
  */
 struct static_sched_group {
         struct sched_group sg;
@@ -7875,7 +8173,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
                         struct sched_domain *sd;
 
                         sd = &per_cpu(phys_domains, j).sd;
-                        if (j != cpumask_first(sched_group_cpus(sd->groups))) {
+                        if (j != group_first_cpu(sd->groups)) {
                                 /*
                                  * Only add "power" once for each
                                  * physical package.
@@ -7953,7 +8251,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 
         WARN_ON(!sd || !sd->groups);
 
-        if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
+        if (cpu != group_first_cpu(sd->groups))
                 return;
 
         child = sd->child;
@@ -8731,6 +9029,8 @@ void __init sched_init_smp(void)
 }
 #endif /* CONFIG_SMP */
 
+const_debug unsigned int sysctl_timer_migration = 1;
+
 int in_sched_functions(unsigned long addr)
 {
         return in_lock_functions(addr) ||
@@ -8865,7 +9165,7 @@ void __init sched_init(void)
          * we use alloc_bootmem().
          */
         if (alloc_size) {
-                ptr = (unsigned long)alloc_bootmem(alloc_size);
+                ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
                 init_task_group.se = (struct sched_entity **)ptr;
@@ -8938,6 +9238,8 @@ void __init sched_init(void)
                 rq = cpu_rq(i);
                 spin_lock_init(&rq->lock);
                 rq->nr_running = 0;
+                rq->calc_load_active = 0;
+                rq->calc_load_update = jiffies + LOAD_FREQ;
                 init_cfs_rq(&rq->cfs, rq);
                 init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8958,7 +9260,7 @@ void __init sched_init(void)
          * 1024) and two child groups A0 and A1 (of weight 1024 each),
          * then A0's share of the cpu resource is:
          *
-         *      A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
+         * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
          *
          * We achieve this by letting init_task_group's tasks sit
          * directly in rq->cfs (i.e init_task_group->se[] = NULL).
@@ -9045,20 +9347,26 @@ void __init sched_init(void)
          * when this runqueue becomes "idle".
          */
         init_idle(current, smp_processor_id());
+
+        calc_load_update = jiffies + LOAD_FREQ;
+
         /*
          * During early bootup we pretend to be a normal task:
          */
         current->sched_class = &fair_sched_class;
 
         /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
-        alloc_bootmem_cpumask_var(&nohz_cpu_mask);
+        alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ
-        alloc_bootmem_cpumask_var(&nohz.cpu_mask);
+        alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
+        alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
 #endif
-        alloc_bootmem_cpumask_var(&cpu_isolated_map);
+        alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 #endif /* SMP */
 
+        perf_counter_init();
+
         scheduler_running = 1;
 }
 
@@ -9800,6 +10108,13 @@ static int sched_rt_global_constraints(void)
         if (sysctl_sched_rt_period <= 0)
                 return -EINVAL;
 
+        /*
+         * There's always some RT tasks in the root group
+         * -- migration, kstopmachine etc..
+         */
+        if (sysctl_sched_rt_runtime == 0)
+                return -EBUSY;
+
         spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
         for_each_possible_cpu(i) {
                 struct rt_rq *rt_rq = &cpu_rq(i)->rt;