Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	421
1 file changed, 315 insertions, 106 deletions
diff --git a/kernel/sched.c b/kernel/sched.c
index f52a8801b7a2..dc85ceb90832 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -77,6 +77,7 @@
77#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
78 78
79#include "sched_cpupri.h" 79#include "sched_cpupri.h"
80#include "workqueue_sched.h"
80 81
81#define CREATE_TRACE_POINTS 82#define CREATE_TRACE_POINTS
82#include <trace/events/sched.h> 83#include <trace/events/sched.h>
@@ -456,9 +457,10 @@ struct rq {
456 unsigned long nr_running; 457 unsigned long nr_running;
457 #define CPU_LOAD_IDX_MAX 5 458 #define CPU_LOAD_IDX_MAX 5
458 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 459 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
460 unsigned long last_load_update_tick;
459#ifdef CONFIG_NO_HZ 461#ifdef CONFIG_NO_HZ
460 u64 nohz_stamp; 462 u64 nohz_stamp;
461 unsigned char in_nohz_recently; 463 unsigned char nohz_balance_kick;
462#endif 464#endif
463 unsigned int skip_clock_update; 465 unsigned int skip_clock_update;
464 466
@@ -1193,6 +1195,27 @@ static void resched_cpu(int cpu)
1193 1195
1194#ifdef CONFIG_NO_HZ 1196#ifdef CONFIG_NO_HZ
1195/* 1197/*
 1198 * In the semi-idle case, use the nearest busy cpu for migrating timers
 1199 * from an idle cpu. This is good for power-savings.
 1200 *
 1201 * We don't do a similar optimization for a completely idle system, as
 1202 * selecting an idle cpu will add more delay to the timers than intended
 1203 * (as that cpu's timer base may not be up to date wrt jiffies etc).
1204 */
1205int get_nohz_timer_target(void)
1206{
1207 int cpu = smp_processor_id();
1208 int i;
1209 struct sched_domain *sd;
1210
1211 for_each_domain(cpu, sd) {
1212 for_each_cpu(i, sched_domain_span(sd))
1213 if (!idle_cpu(i))
1214 return i;
1215 }
1216 return cpu;
1217}
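
For orientation only (not part of the patch): the kind of caller this helper targets is a timer-enqueue path that notices it is running on an idle CPU and redirects the timer to the nearest busy one. The pick_timer_cpu() wrapper below is a hypothetical name used purely for illustration; only idle_cpu() and get_nohz_timer_target() are real.

/* Illustrative sketch -- pick_timer_cpu() is a made-up helper name. */
static int pick_timer_cpu(void)
{
        int cpu = smp_processor_id();

        /* On a semi-idle system, prefer a busy CPU's timer base. */
        if (idle_cpu(cpu))
                cpu = get_nohz_timer_target();
        return cpu;
}
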
1218/*
1196 * When add_timer_on() enqueues a timer into the timer wheel of an 1219 * When add_timer_on() enqueues a timer into the timer wheel of an
1197 * idle CPU then this timer might expire before the next timer event 1220 * idle CPU then this timer might expire before the next timer event
1198 * which is scheduled to wake up that CPU. In case of a completely 1221 * which is scheduled to wake up that CPU. In case of a completely
@@ -1232,16 +1255,6 @@ void wake_up_idle_cpu(int cpu)
1232 smp_send_reschedule(cpu); 1255 smp_send_reschedule(cpu);
1233} 1256}
1234 1257
1235int nohz_ratelimit(int cpu)
1236{
1237 struct rq *rq = cpu_rq(cpu);
1238 u64 diff = rq->clock - rq->nohz_stamp;
1239
1240 rq->nohz_stamp = rq->clock;
1241
1242 return diff < (NSEC_PER_SEC / HZ) >> 1;
1243}
1244
1245#endif /* CONFIG_NO_HZ */ 1258#endif /* CONFIG_NO_HZ */
1246 1259
1247static u64 sched_avg_period(void) 1260static u64 sched_avg_period(void)
@@ -1281,6 +1294,10 @@ static void resched_task(struct task_struct *p)
1281static void sched_rt_avg_update(struct rq *rq, u64 rt_delta) 1294static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1282{ 1295{
1283} 1296}
1297
1298static void sched_avg_update(struct rq *rq)
1299{
1300}
1284#endif /* CONFIG_SMP */ 1301#endif /* CONFIG_SMP */
1285 1302
1286#if BITS_PER_LONG == 32 1303#if BITS_PER_LONG == 32
@@ -1652,7 +1669,7 @@ static void update_shares(struct sched_domain *sd)
1652 if (root_task_group_empty()) 1669 if (root_task_group_empty())
1653 return; 1670 return;
1654 1671
1655 now = cpu_clock(raw_smp_processor_id()); 1672 now = local_clock();
1656 elapsed = now - sd->last_update; 1673 elapsed = now - sd->last_update;
1657 1674
1658 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1675 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
@@ -1805,6 +1822,7 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1805static void calc_load_account_idle(struct rq *this_rq); 1822static void calc_load_account_idle(struct rq *this_rq);
1806static void update_sysctl(void); 1823static void update_sysctl(void);
1807static int get_update_sysctl_factor(void); 1824static int get_update_sysctl_factor(void);
1825static void update_cpu_load(struct rq *this_rq);
1808 1826
1809static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1827static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1810{ 1828{
@@ -2267,11 +2285,55 @@ static void update_avg(u64 *avg, u64 sample)
2267} 2285}
2268#endif 2286#endif
2269 2287
2270/*** 2288static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
2289 bool is_sync, bool is_migrate, bool is_local,
2290 unsigned long en_flags)
2291{
2292 schedstat_inc(p, se.statistics.nr_wakeups);
2293 if (is_sync)
2294 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2295 if (is_migrate)
2296 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2297 if (is_local)
2298 schedstat_inc(p, se.statistics.nr_wakeups_local);
2299 else
2300 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2301
2302 activate_task(rq, p, en_flags);
2303}
2304
2305static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
2306 int wake_flags, bool success)
2307{
2308 trace_sched_wakeup(p, success);
2309 check_preempt_curr(rq, p, wake_flags);
2310
2311 p->state = TASK_RUNNING;
2312#ifdef CONFIG_SMP
2313 if (p->sched_class->task_woken)
2314 p->sched_class->task_woken(rq, p);
2315
2316 if (unlikely(rq->idle_stamp)) {
2317 u64 delta = rq->clock - rq->idle_stamp;
2318 u64 max = 2*sysctl_sched_migration_cost;
2319
2320 if (delta > max)
2321 rq->avg_idle = max;
2322 else
2323 update_avg(&rq->avg_idle, delta);
2324 rq->idle_stamp = 0;
2325 }
2326#endif
2327 /* if a worker is waking up, notify workqueue */
2328 if ((p->flags & PF_WQ_WORKER) && success)
2329 wq_worker_waking_up(p, cpu_of(rq));
2330}
2331
2332/**
2271 * try_to_wake_up - wake up a thread 2333 * try_to_wake_up - wake up a thread
2272 * @p: the to-be-woken-up thread 2334 * @p: the thread to be awakened
2273 * @state: the mask of task states that can be woken 2335 * @state: the mask of task states that can be woken
2274 * @sync: do a synchronous wakeup? 2336 * @wake_flags: wake modifier flags (WF_*)
2275 * 2337 *
2276 * Put it on the run-queue if it's not already there. The "current" 2338 * Put it on the run-queue if it's not already there. The "current"
2277 * thread is always on the run-queue (except when the actual 2339 * thread is always on the run-queue (except when the actual
@@ -2279,7 +2341,8 @@ static void update_avg(u64 *avg, u64 sample)
2279 * the simpler "current->state = TASK_RUNNING" to mark yourself 2341 * the simpler "current->state = TASK_RUNNING" to mark yourself
2280 * runnable without the overhead of this. 2342 * runnable without the overhead of this.
2281 * 2343 *
2282 * returns failure only if the task is already active. 2344 * Returns %true if @p was woken up, %false if it was already running
2345 * or @state didn't match @p's state.
2283 */ 2346 */
2284static int try_to_wake_up(struct task_struct *p, unsigned int state, 2347static int try_to_wake_up(struct task_struct *p, unsigned int state,
2285 int wake_flags) 2348 int wake_flags)
@@ -2359,38 +2422,11 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
2359 2422
2360out_activate: 2423out_activate:
2361#endif /* CONFIG_SMP */ 2424#endif /* CONFIG_SMP */
2362 schedstat_inc(p, se.statistics.nr_wakeups); 2425 ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
2363 if (wake_flags & WF_SYNC) 2426 cpu == this_cpu, en_flags);
2364 schedstat_inc(p, se.statistics.nr_wakeups_sync);
2365 if (orig_cpu != cpu)
2366 schedstat_inc(p, se.statistics.nr_wakeups_migrate);
2367 if (cpu == this_cpu)
2368 schedstat_inc(p, se.statistics.nr_wakeups_local);
2369 else
2370 schedstat_inc(p, se.statistics.nr_wakeups_remote);
2371 activate_task(rq, p, en_flags);
2372 success = 1; 2427 success = 1;
2373
2374out_running: 2428out_running:
2375 trace_sched_wakeup(p, success); 2429 ttwu_post_activation(p, rq, wake_flags, success);
2376 check_preempt_curr(rq, p, wake_flags);
2377
2378 p->state = TASK_RUNNING;
2379#ifdef CONFIG_SMP
2380 if (p->sched_class->task_woken)
2381 p->sched_class->task_woken(rq, p);
2382
2383 if (unlikely(rq->idle_stamp)) {
2384 u64 delta = rq->clock - rq->idle_stamp;
2385 u64 max = 2*sysctl_sched_migration_cost;
2386
2387 if (delta > max)
2388 rq->avg_idle = max;
2389 else
2390 update_avg(&rq->avg_idle, delta);
2391 rq->idle_stamp = 0;
2392 }
2393#endif
2394out: 2430out:
2395 task_rq_unlock(rq, &flags); 2431 task_rq_unlock(rq, &flags);
2396 put_cpu(); 2432 put_cpu();
@@ -2399,6 +2435,37 @@ out:
2399} 2435}
2400 2436
2401/** 2437/**
2438 * try_to_wake_up_local - try to wake up a local task with rq lock held
2439 * @p: the thread to be awakened
2440 *
 2441 * Put @p on the run-queue if it's not already there. The caller must
 2442 * ensure that this_rq() is locked, @p is bound to this_rq() and is not
2443 * the current task. this_rq() stays locked over invocation.
2444 */
2445static void try_to_wake_up_local(struct task_struct *p)
2446{
2447 struct rq *rq = task_rq(p);
2448 bool success = false;
2449
2450 BUG_ON(rq != this_rq());
2451 BUG_ON(p == current);
2452 lockdep_assert_held(&rq->lock);
2453
2454 if (!(p->state & TASK_NORMAL))
2455 return;
2456
2457 if (!p->se.on_rq) {
2458 if (likely(!task_running(rq, p))) {
2459 schedstat_inc(rq, ttwu_count);
2460 schedstat_inc(rq, ttwu_local);
2461 }
2462 ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
2463 success = true;
2464 }
2465 ttwu_post_activation(p, rq, 0, success);
2466}
2467
2468/**
2402 * wake_up_process - Wake up a specific process 2469 * wake_up_process - Wake up a specific process
2403 * @p: The process to be woken up. 2470 * @p: The process to be woken up.
2404 * 2471 *
@@ -3012,23 +3079,102 @@ static void calc_load_account_active(struct rq *this_rq)
3012} 3079}
3013 3080
3014/* 3081/*
 3082 * The exact cpuload at various idx values, calculated at every tick, would be
 3083 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
 3084 *
 3085 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
 3086 * on the nth tick, when the cpu may be busy, then we have:
 3087 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
 3088 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
3089 *
3090 * decay_load_missed() below does efficient calculation of
3091 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
3092 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
3093 *
3094 * The calculation is approximated on a 128 point scale.
3095 * degrade_zero_ticks is the number of ticks after which load at any
3096 * particular idx is approximated to be zero.
3097 * degrade_factor is a precomputed table, a row for each load idx.
 3098 * Each column corresponds to the degradation factor for a power-of-two number
 3099 * of ticks, based on the 128 point scale.
3100 * Example:
3101 * row 2, col 3 (=12) says that the degradation at load idx 2 after
3102 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
3103 *
 3104 * With these power-of-2 load factors, we can degrade the load n times
 3105 * by looking at the 1 bits in n and doing that many mult/shifts instead of
 3106 * the n mult/shifts needed by the exact degradation.
3107 */
3108#define DEGRADE_SHIFT 7
3109static const unsigned char
3110 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
3111static const unsigned char
3112 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
3113 {0, 0, 0, 0, 0, 0, 0, 0},
3114 {64, 32, 8, 0, 0, 0, 0, 0},
3115 {96, 72, 40, 12, 1, 0, 0},
3116 {112, 98, 75, 43, 15, 1, 0},
3117 {120, 112, 98, 76, 45, 16, 2} };
3118
3119/*
3120 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
3121 * would be when CPU is idle and so we just decay the old load without
3122 * adding any new load.
3123 */
3124static unsigned long
3125decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3126{
3127 int j = 0;
3128
3129 if (!missed_updates)
3130 return load;
3131
3132 if (missed_updates >= degrade_zero_ticks[idx])
3133 return 0;
3134
3135 if (idx == 1)
3136 return load >> missed_updates;
3137
3138 while (missed_updates) {
3139 if (missed_updates % 2)
3140 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
3141
3142 missed_updates >>= 1;
3143 j++;
3144 }
3145 return load;
3146}
3147
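
As a sanity check of the table above, here is a small standalone user-space re-implementation (not part of the patch) showing that 8 missed ticks at load idx 2 degrade the load by 12/128, exactly the example given in the comment:

#include <stdio.h>

/* Standalone copy of the decay_load_missed() logic, for illustration only. */
#define DEGRADE_SHIFT 7
static const unsigned char degrade_zero_ticks[5] = {0, 8, 32, 64, 128};
static const unsigned char degrade_factor[5][DEGRADE_SHIFT + 1] = {
        {0, 0, 0, 0, 0, 0, 0, 0},
        {64, 32, 8, 0, 0, 0, 0, 0},
        {96, 72, 40, 12, 1, 0, 0},
        {112, 98, 75, 43, 15, 1, 0},
        {120, 112, 98, 76, 45, 16, 2} };

static unsigned long decay(unsigned long load, unsigned long missed, int idx)
{
        int j = 0;

        if (!missed)
                return load;
        if (missed >= degrade_zero_ticks[idx])
                return 0;
        if (idx == 1)
                return load >> missed;
        while (missed) {
                if (missed % 2)
                        load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
                missed >>= 1;
                j++;
        }
        return load;
}

int main(void)
{
        /* 8 missed ticks at idx 2: one multiply by row 2, column 3 (= 12). */
        printf("%lu\n", decay(128, 8, 2));      /* prints 12, i.e. 12/128 of the load */
        return 0;
}

Each set bit in the missed-tick count costs one multiply/shift, which is the whole point of the power-of-two table.
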
3148/*
3015 * Update rq->cpu_load[] statistics. This function is usually called every 3149 * Update rq->cpu_load[] statistics. This function is usually called every
3016 * scheduler tick (TICK_NSEC). 3150 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
3151 * every tick. We fix it up based on jiffies.
3017 */ 3152 */
3018static void update_cpu_load(struct rq *this_rq) 3153static void update_cpu_load(struct rq *this_rq)
3019{ 3154{
3020 unsigned long this_load = this_rq->load.weight; 3155 unsigned long this_load = this_rq->load.weight;
3156 unsigned long curr_jiffies = jiffies;
3157 unsigned long pending_updates;
3021 int i, scale; 3158 int i, scale;
3022 3159
3023 this_rq->nr_load_updates++; 3160 this_rq->nr_load_updates++;
3024 3161
3162 /* Avoid repeated calls on same jiffy, when moving in and out of idle */
3163 if (curr_jiffies == this_rq->last_load_update_tick)
3164 return;
3165
3166 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
3167 this_rq->last_load_update_tick = curr_jiffies;
3168
3025 /* Update our load: */ 3169 /* Update our load: */
3026 for (i = 0, scale = 1; i < CPU_LOAD_IDX_MAX; i++, scale += scale) { 3170 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
3171 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
3027 unsigned long old_load, new_load; 3172 unsigned long old_load, new_load;
3028 3173
3029 /* scale is effectively 1 << i now, and >> i divides by scale */ 3174 /* scale is effectively 1 << i now, and >> i divides by scale */
3030 3175
3031 old_load = this_rq->cpu_load[i]; 3176 old_load = this_rq->cpu_load[i];
3177 old_load = decay_load_missed(old_load, pending_updates - 1, i);
3032 new_load = this_load; 3178 new_load = this_load;
3033 /* 3179 /*
3034 * Round up the averaging division if load is increasing. This 3180 * Round up the averaging division if load is increasing. This
@@ -3036,10 +3182,18 @@ static void update_cpu_load(struct rq *this_rq)
3036 * example. 3182 * example.
3037 */ 3183 */
3038 if (new_load > old_load) 3184 if (new_load > old_load)
3039 new_load += scale-1; 3185 new_load += scale - 1;
3040 this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; 3186
3187 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
3041 } 3188 }
3042 3189
3190 sched_avg_update(this_rq);
3191}
3192
3193static void update_cpu_load_active(struct rq *this_rq)
3194{
3195 update_cpu_load(this_rq);
3196
3043 calc_load_account_active(this_rq); 3197 calc_load_account_active(this_rq);
3044} 3198}
3045 3199
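
A quick numeric illustration of the per-index averaging above (made-up numbers; the decayed old load comes from decay_load_missed()): at idx 2 the scale is 4, so the old load keeps a weight of 3/4, the new load gets 1/4, and the scale-1 rounding term only applies when the load is rising.

/* Illustrative values only; mirrors the formula in update_cpu_load(). */
unsigned long old_load = 100, this_load = 200;
int i = 2, scale = 1 << i;                      /* scale == 4 */
unsigned long new_load = this_load;

if (new_load > old_load)
        new_load += scale - 1;                  /* round up: 200 -> 203 */

/* (100 * 3 + 203) >> 2 == 125: three quarters old, one quarter new */
unsigned long cpu_load_i = (old_load * (scale - 1) + new_load) >> i;
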
@@ -3359,9 +3513,9 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3359 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 3513 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
3360 3514
3361 if (total) { 3515 if (total) {
3362 u64 temp; 3516 u64 temp = rtime;
3363 3517
3364 temp = (u64)(rtime * utime); 3518 temp *= utime;
3365 do_div(temp, total); 3519 do_div(temp, total);
3366 utime = (cputime_t)temp; 3520 utime = (cputime_t)temp;
3367 } else 3521 } else
@@ -3392,9 +3546,9 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3392 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 3546 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
3393 3547
3394 if (total) { 3548 if (total) {
3395 u64 temp; 3549 u64 temp = rtime;
3396 3550
3397 temp = (u64)(rtime * cputime.utime); 3551 temp *= cputime.utime;
3398 do_div(temp, total); 3552 do_div(temp, total);
3399 utime = (cputime_t)temp; 3553 utime = (cputime_t)temp;
3400 } else 3554 } else
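
Both cputime hunks above make the same change: temp is now seeded with rtime and then multiplied, instead of casting the finished product. Presumably this is about 32-bit kernels, where cputime_t is an unsigned long and the old rtime * utime was computed, and truncated, in 32 bits before the cast ever happened. A standalone sketch of the difference, with uint32_t standing in for a 32-bit cputime_t and made-up values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* Stand-ins for cputime_t on a 32-bit kernel (illustrative values). */
        uint32_t rtime = 100000, utime = 90000;

        uint64_t bad = (uint64_t)(rtime * utime);       /* product truncated to 32 bits first */
        uint64_t good = rtime;
        good *= utime;                                  /* widened before multiplying */

        printf("%llu vs %llu\n",
               (unsigned long long)bad, (unsigned long long)good);
        return 0;
}
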
@@ -3426,7 +3580,7 @@ void scheduler_tick(void)
3426 3580
3427 raw_spin_lock(&rq->lock); 3581 raw_spin_lock(&rq->lock);
3428 update_rq_clock(rq); 3582 update_rq_clock(rq);
3429 update_cpu_load(rq); 3583 update_cpu_load_active(rq);
3430 curr->sched_class->task_tick(rq, curr, 0); 3584 curr->sched_class->task_tick(rq, curr, 0);
3431 raw_spin_unlock(&rq->lock); 3585 raw_spin_unlock(&rq->lock);
3432 3586
@@ -3598,7 +3752,6 @@ need_resched:
3598 rq = cpu_rq(cpu); 3752 rq = cpu_rq(cpu);
3599 rcu_note_context_switch(cpu); 3753 rcu_note_context_switch(cpu);
3600 prev = rq->curr; 3754 prev = rq->curr;
3601 switch_count = &prev->nivcsw;
3602 3755
3603 release_kernel_lock(prev); 3756 release_kernel_lock(prev);
3604need_resched_nonpreemptible: 3757need_resched_nonpreemptible:
@@ -3611,11 +3764,26 @@ need_resched_nonpreemptible:
3611 raw_spin_lock_irq(&rq->lock); 3764 raw_spin_lock_irq(&rq->lock);
3612 clear_tsk_need_resched(prev); 3765 clear_tsk_need_resched(prev);
3613 3766
3767 switch_count = &prev->nivcsw;
3614 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3768 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
3615 if (unlikely(signal_pending_state(prev->state, prev))) 3769 if (unlikely(signal_pending_state(prev->state, prev))) {
3616 prev->state = TASK_RUNNING; 3770 prev->state = TASK_RUNNING;
3617 else 3771 } else {
3772 /*
3773 * If a worker is going to sleep, notify and
3774 * ask workqueue whether it wants to wake up a
3775 * task to maintain concurrency. If so, wake
3776 * up the task.
3777 */
3778 if (prev->flags & PF_WQ_WORKER) {
3779 struct task_struct *to_wakeup;
3780
3781 to_wakeup = wq_worker_sleeping(prev, cpu);
3782 if (to_wakeup)
3783 try_to_wake_up_local(to_wakeup);
3784 }
3618 deactivate_task(rq, prev, DEQUEUE_SLEEP); 3785 deactivate_task(rq, prev, DEQUEUE_SLEEP);
3786 }
3619 switch_count = &prev->nvcsw; 3787 switch_count = &prev->nvcsw;
3620 } 3788 }
3621 3789
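
Read together with try_to_wake_up_local() and ttwu_post_activation() earlier in the patch, this hook gives the workqueue code a matched sleep/wake pair around the scheduler. Condensed from the two call sites already shown in this diff (no new code):

/* Sleep side, in schedule(), under rq->lock: */
if (prev->flags & PF_WQ_WORKER) {
        struct task_struct *to_wakeup = wq_worker_sleeping(prev, cpu);

        if (to_wakeup)
                try_to_wake_up_local(to_wakeup);        /* keep the worker pool busy */
}

/* Wake side, in ttwu_post_activation(): */
if ((p->flags & PF_WQ_WORKER) && success)
        wq_worker_waking_up(p, cpu_of(rq));
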
@@ -3637,8 +3805,10 @@ need_resched_nonpreemptible:
3637 3805
3638 context_switch(rq, prev, next); /* unlocks the rq */ 3806 context_switch(rq, prev, next); /* unlocks the rq */
3639 /* 3807 /*
 3640 * the context switch might have flipped the stack from under 3808 * The context switch has flipped the stack from under us
 3641 * us, hence refresh the local variables. 3809 * and restored the local variables that were saved when
 3810 * this task called schedule() in the past. prev == current
 3811 * is still correct, but the task may have moved to another cpu/rq.
3642 */ 3812 */
3643 cpu = smp_processor_id(); 3813 cpu = smp_processor_id();
3644 rq = cpu_rq(cpu); 3814 rq = cpu_rq(cpu);
@@ -3647,11 +3817,8 @@ need_resched_nonpreemptible:
3647 3817
3648 post_schedule(rq); 3818 post_schedule(rq);
3649 3819
3650 if (unlikely(reacquire_kernel_lock(current) < 0)) { 3820 if (unlikely(reacquire_kernel_lock(prev)))
3651 prev = rq->curr;
3652 switch_count = &prev->nivcsw;
3653 goto need_resched_nonpreemptible; 3821 goto need_resched_nonpreemptible;
3654 }
3655 3822
3656 preempt_enable_no_resched(); 3823 preempt_enable_no_resched();
3657 if (need_resched()) 3824 if (need_resched())
@@ -3704,8 +3871,16 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3704 /* 3871 /*
3705 * Owner changed, break to re-assess state. 3872 * Owner changed, break to re-assess state.
3706 */ 3873 */
3707 if (lock->owner != owner) 3874 if (lock->owner != owner) {
3875 /*
3876 * If the lock has switched to a different owner,
3877 * we likely have heavy contention. Return 0 to quit
3878 * optimistic spinning and not contend further:
3879 */
3880 if (lock->owner)
3881 return 0;
3708 break; 3882 break;
3883 }
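
The effect of the early return above, going by the surrounding spin loop: if ownership moved to another task the mutex is heavily contended, so returning 0 makes the caller stop optimistic spinning and block, while an owner of NULL means the lock was released, so the existing break path re-assesses state and the caller can typically take the lock. A compressed view of that decision (annotation only, mirroring the code above):

if (lock->owner != owner) {     /* ownership changed while we were spinning */
        if (lock->owner)        /* ...to another task: contended, give up spinning */
                return 0;
        break;                  /* ...to NULL: released, break and re-assess */
}
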
3709 3884
3710 /* 3885 /*
3711 * Is that owner really running on that cpu? 3886 * Is that owner really running on that cpu?
@@ -3726,7 +3901,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
3726 * off of preempt_enable. Kernel preemptions off return from interrupt 3901 * off of preempt_enable. Kernel preemptions off return from interrupt
3727 * occur there and call schedule directly. 3902 * occur there and call schedule directly.
3728 */ 3903 */
3729asmlinkage void __sched preempt_schedule(void) 3904asmlinkage void __sched notrace preempt_schedule(void)
3730{ 3905{
3731 struct thread_info *ti = current_thread_info(); 3906 struct thread_info *ti = current_thread_info();
3732 3907
@@ -3738,9 +3913,9 @@ asmlinkage void __sched preempt_schedule(void)
3738 return; 3913 return;
3739 3914
3740 do { 3915 do {
3741 add_preempt_count(PREEMPT_ACTIVE); 3916 add_preempt_count_notrace(PREEMPT_ACTIVE);
3742 schedule(); 3917 schedule();
3743 sub_preempt_count(PREEMPT_ACTIVE); 3918 sub_preempt_count_notrace(PREEMPT_ACTIVE);
3744 3919
3745 /* 3920 /*
3746 * Check again in case we missed a preemption opportunity 3921 * Check again in case we missed a preemption opportunity
@@ -4441,12 +4616,8 @@ recheck:
4441 */ 4616 */
4442 if (user && !capable(CAP_SYS_NICE)) { 4617 if (user && !capable(CAP_SYS_NICE)) {
4443 if (rt_policy(policy)) { 4618 if (rt_policy(policy)) {
4444 unsigned long rlim_rtprio; 4619 unsigned long rlim_rtprio =
4445 4620 task_rlimit(p, RLIMIT_RTPRIO);
4446 if (!lock_task_sighand(p, &flags))
4447 return -ESRCH;
4448 rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
4449 unlock_task_sighand(p, &flags);
4450 4621
4451 /* can't set/change the rt policy */ 4622 /* can't set/change the rt policy */
4452 if (policy != p->policy && !rlim_rtprio) 4623 if (policy != p->policy && !rlim_rtprio)
@@ -5816,20 +5987,49 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5816 */ 5987 */
5817static struct notifier_block __cpuinitdata migration_notifier = { 5988static struct notifier_block __cpuinitdata migration_notifier = {
5818 .notifier_call = migration_call, 5989 .notifier_call = migration_call,
5819 .priority = 10 5990 .priority = CPU_PRI_MIGRATION,
5820}; 5991};
5821 5992
5993static int __cpuinit sched_cpu_active(struct notifier_block *nfb,
5994 unsigned long action, void *hcpu)
5995{
5996 switch (action & ~CPU_TASKS_FROZEN) {
5997 case CPU_ONLINE:
5998 case CPU_DOWN_FAILED:
5999 set_cpu_active((long)hcpu, true);
6000 return NOTIFY_OK;
6001 default:
6002 return NOTIFY_DONE;
6003 }
6004}
6005
6006static int __cpuinit sched_cpu_inactive(struct notifier_block *nfb,
6007 unsigned long action, void *hcpu)
6008{
6009 switch (action & ~CPU_TASKS_FROZEN) {
6010 case CPU_DOWN_PREPARE:
6011 set_cpu_active((long)hcpu, false);
6012 return NOTIFY_OK;
6013 default:
6014 return NOTIFY_DONE;
6015 }
6016}
6017
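
All four new hotplug notifiers in this patch (the two above and the two cpuset ones further down) mask off CPU_TASKS_FROZEN before switching on the action. To the best of my recollection the *_FROZEN events in <linux/cpu.h> are simply the base events OR'ed with CPU_TASKS_FROZEN, so the mask lets one case label handle both the runtime and the suspend/resume variant of each event; schematically:

/* Schematic only; relies on CPU_ONLINE_FROZEN == (CPU_ONLINE | CPU_TASKS_FROZEN). */
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_ONLINE:                /* matches CPU_ONLINE and CPU_ONLINE_FROZEN */
case CPU_DOWN_FAILED:           /* matches CPU_DOWN_FAILED and its _FROZEN variant */
        set_cpu_active((long)hcpu, true);
        return NOTIFY_OK;
default:
        return NOTIFY_DONE;
}
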
5822static int __init migration_init(void) 6018static int __init migration_init(void)
5823{ 6019{
5824 void *cpu = (void *)(long)smp_processor_id(); 6020 void *cpu = (void *)(long)smp_processor_id();
5825 int err; 6021 int err;
5826 6022
5827 /* Start one for the boot CPU: */ 6023 /* Initialize migration for the boot CPU */
5828 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 6024 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5829 BUG_ON(err == NOTIFY_BAD); 6025 BUG_ON(err == NOTIFY_BAD);
5830 migration_call(&migration_notifier, CPU_ONLINE, cpu); 6026 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5831 register_cpu_notifier(&migration_notifier); 6027 register_cpu_notifier(&migration_notifier);
5832 6028
6029 /* Register cpu active notifiers */
6030 cpu_notifier(sched_cpu_active, CPU_PRI_SCHED_ACTIVE);
6031 cpu_notifier(sched_cpu_inactive, CPU_PRI_SCHED_INACTIVE);
6032
5833 return 0; 6033 return 0;
5834} 6034}
5835early_initcall(migration_init); 6035early_initcall(migration_init);
@@ -6064,23 +6264,18 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6064 free_rootdomain(old_rd); 6264 free_rootdomain(old_rd);
6065} 6265}
6066 6266
6067static int init_rootdomain(struct root_domain *rd, bool bootmem) 6267static int init_rootdomain(struct root_domain *rd)
6068{ 6268{
6069 gfp_t gfp = GFP_KERNEL;
6070
6071 memset(rd, 0, sizeof(*rd)); 6269 memset(rd, 0, sizeof(*rd));
6072 6270
6073 if (bootmem) 6271 if (!alloc_cpumask_var(&rd->span, GFP_KERNEL))
6074 gfp = GFP_NOWAIT;
6075
6076 if (!alloc_cpumask_var(&rd->span, gfp))
6077 goto out; 6272 goto out;
6078 if (!alloc_cpumask_var(&rd->online, gfp)) 6273 if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
6079 goto free_span; 6274 goto free_span;
6080 if (!alloc_cpumask_var(&rd->rto_mask, gfp)) 6275 if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
6081 goto free_online; 6276 goto free_online;
6082 6277
6083 if (cpupri_init(&rd->cpupri, bootmem) != 0) 6278 if (cpupri_init(&rd->cpupri) != 0)
6084 goto free_rto_mask; 6279 goto free_rto_mask;
6085 return 0; 6280 return 0;
6086 6281
@@ -6096,7 +6291,7 @@ out:
6096 6291
6097static void init_defrootdomain(void) 6292static void init_defrootdomain(void)
6098{ 6293{
6099 init_rootdomain(&def_root_domain, true); 6294 init_rootdomain(&def_root_domain);
6100 6295
6101 atomic_set(&def_root_domain.refcount, 1); 6296 atomic_set(&def_root_domain.refcount, 1);
6102} 6297}
@@ -6109,7 +6304,7 @@ static struct root_domain *alloc_rootdomain(void)
6109 if (!rd) 6304 if (!rd)
6110 return NULL; 6305 return NULL;
6111 6306
6112 if (init_rootdomain(rd, false) != 0) { 6307 if (init_rootdomain(rd) != 0) {
6113 kfree(rd); 6308 kfree(rd);
6114 return NULL; 6309 return NULL;
6115 } 6310 }
@@ -7288,29 +7483,35 @@ int __init sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7288} 7483}
7289#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 7484#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
7290 7485
7291#ifndef CONFIG_CPUSETS
7292/* 7486/*
7293 * Add online and remove offline CPUs from the scheduler domains. 7487 * Update cpusets according to cpu_active mask. If cpusets are
7294 * When cpusets are enabled they take over this function. 7488 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
7489 * around partition_sched_domains().
7295 */ 7490 */
7296static int update_sched_domains(struct notifier_block *nfb, 7491static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
7297 unsigned long action, void *hcpu) 7492 void *hcpu)
7298{ 7493{
7299 switch (action) { 7494 switch (action & ~CPU_TASKS_FROZEN) {
7300 case CPU_ONLINE: 7495 case CPU_ONLINE:
7301 case CPU_ONLINE_FROZEN:
7302 case CPU_DOWN_PREPARE:
7303 case CPU_DOWN_PREPARE_FROZEN:
7304 case CPU_DOWN_FAILED: 7496 case CPU_DOWN_FAILED:
7305 case CPU_DOWN_FAILED_FROZEN: 7497 cpuset_update_active_cpus();
7306 partition_sched_domains(1, NULL, NULL);
7307 return NOTIFY_OK; 7498 return NOTIFY_OK;
7499 default:
7500 return NOTIFY_DONE;
7501 }
7502}
7308 7503
7504static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
7505 void *hcpu)
7506{
7507 switch (action & ~CPU_TASKS_FROZEN) {
7508 case CPU_DOWN_PREPARE:
7509 cpuset_update_active_cpus();
7510 return NOTIFY_OK;
7309 default: 7511 default:
7310 return NOTIFY_DONE; 7512 return NOTIFY_DONE;
7311 } 7513 }
7312} 7514}
7313#endif
7314 7515
7315static int update_runtime(struct notifier_block *nfb, 7516static int update_runtime(struct notifier_block *nfb,
7316 unsigned long action, void *hcpu) 7517 unsigned long action, void *hcpu)
@@ -7356,10 +7557,8 @@ void __init sched_init_smp(void)
7356 mutex_unlock(&sched_domains_mutex); 7557 mutex_unlock(&sched_domains_mutex);
7357 put_online_cpus(); 7558 put_online_cpus();
7358 7559
7359#ifndef CONFIG_CPUSETS 7560 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
7360 /* XXX: Theoretical race here - CPU may be hotplugged now */ 7561 hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
7361 hotcpu_notifier(update_sched_domains, 0);
7362#endif
7363 7562
7364 /* RT runtime code needs to handle some hotplug events */ 7563 /* RT runtime code needs to handle some hotplug events */
7365 hotcpu_notifier(update_runtime, 0); 7564 hotcpu_notifier(update_runtime, 0);
@@ -7604,6 +7803,9 @@ void __init sched_init(void)
7604 7803
7605 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7804 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
7606 rq->cpu_load[j] = 0; 7805 rq->cpu_load[j] = 0;
7806
7807 rq->last_load_update_tick = jiffies;
7808
7607#ifdef CONFIG_SMP 7809#ifdef CONFIG_SMP
7608 rq->sd = NULL; 7810 rq->sd = NULL;
7609 rq->rd = NULL; 7811 rq->rd = NULL;
@@ -7617,6 +7819,10 @@ void __init sched_init(void)
7617 rq->idle_stamp = 0; 7819 rq->idle_stamp = 0;
7618 rq->avg_idle = 2*sysctl_sched_migration_cost; 7820 rq->avg_idle = 2*sysctl_sched_migration_cost;
7619 rq_attach_root(rq, &def_root_domain); 7821 rq_attach_root(rq, &def_root_domain);
7822#ifdef CONFIG_NO_HZ
7823 rq->nohz_balance_kick = 0;
7824 init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
7825#endif
7620#endif 7826#endif
7621 init_rq_hrtick(rq); 7827 init_rq_hrtick(rq);
7622 atomic_set(&rq->nr_iowait, 0); 7828 atomic_set(&rq->nr_iowait, 0);
@@ -7661,8 +7867,11 @@ void __init sched_init(void)
7661 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 7867 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
7662#ifdef CONFIG_SMP 7868#ifdef CONFIG_SMP
7663#ifdef CONFIG_NO_HZ 7869#ifdef CONFIG_NO_HZ
7664 zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT); 7870 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
7665 alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT); 7871 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
7872 atomic_set(&nohz.load_balancer, nr_cpu_ids);
7873 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
7874 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
7666#endif 7875#endif
7667 /* May be allocated at isolcpus cmdline parse time */ 7876 /* May be allocated at isolcpus cmdline parse time */
7668 if (cpu_isolated_map == NULL) 7877 if (cpu_isolated_map == NULL)