Diffstat (limited to 'kernel/sched.c'):
 kernel/sched.c | 2122
 1 file changed, 1005 insertions(+), 1117 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index 18d38e4ec7ba..5e43e9dc65d1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -32,7 +32,6 @@
 #include <linux/init.h>
 #include <linux/uaccess.h>
 #include <linux/highmem.h>
-#include <linux/smp_lock.h>
 #include <asm/mmu_context.h>
 #include <linux/interrupt.h>
 #include <linux/capability.h>
@@ -232,7 +231,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
 #endif
 
 /*
- * sched_domains_mutex serializes calls to arch_init_sched_domains,
+ * sched_domains_mutex serializes calls to init_sched_domains,
  * detach_destroy_domains and partition_sched_domains.
  */
 static DEFINE_MUTEX(sched_domains_mutex);
@@ -294,7 +293,7 @@ static DEFINE_SPINLOCK(task_group_lock);
  * limitation from this.)
  */
 #define MIN_SHARES	2
-#define MAX_SHARES	(1UL << 18)
+#define MAX_SHARES	(1UL << (18 + SCHED_LOAD_RESOLUTION))
 
 static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
 #endif
@@ -313,6 +312,9 @@ struct cfs_rq {
 
 	u64 exec_clock;
 	u64 min_vruntime;
+#ifndef CONFIG_64BIT
+	u64 min_vruntime_copy;
+#endif
 
 	struct rb_root tasks_timeline;
 	struct rb_node *rb_leftmost;
@@ -324,9 +326,11 @@ struct cfs_rq {
 	 * 'curr' points to currently running entity on this cfs_rq.
 	 * It is set to NULL otherwise (i.e when none are currently running).
 	 */
-	struct sched_entity *curr, *next, *last;
+	struct sched_entity *curr, *next, *last, *skip;
 
+#ifdef CONFIG_SCHED_DEBUG
 	unsigned int nr_spread_over;
+#endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	struct rq *rq;	/* cpu runqueue to which this cfs_rq is attached */
@@ -418,6 +422,7 @@ struct rt_rq {
  */
 struct root_domain {
 	atomic_t refcount;
+	struct rcu_head rcu;
 	cpumask_var_t span;
 	cpumask_var_t online;
 
@@ -461,7 +466,7 @@ struct rq {
 	u64 nohz_stamp;
 	unsigned char nohz_balance_kick;
 #endif
-	unsigned int skip_clock_update;
+	int skip_clock_update;
 
 	/* capture load from *all* tasks on this cpu: */
 	struct load_weight load;
@@ -554,6 +559,10 @@ struct rq {
 	unsigned int ttwu_count;
 	unsigned int ttwu_local;
 #endif
+
+#ifdef CONFIG_SMP
+	struct task_struct *wake_list;
+#endif
 };
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -572,7 +581,7 @@ static inline int cpu_of(struct rq *rq)
 
 #define rcu_dereference_check_sched_domain(p) \
 	rcu_dereference_check((p), \
-			      rcu_read_lock_sched_held() || \
+			      rcu_read_lock_held() || \
			      lockdep_is_held(&sched_domains_mutex))
 
 /*
@@ -597,7 +606,7 @@ static inline int cpu_of(struct rq *rq)
  * Return the group to which this tasks belongs.
  *
  * We use task_subsys_state_check() and extend the RCU verification
- * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach()
+ * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach()
  * holds that lock for each task it moves into the cgroup. Therefore
  * by holding that lock, we pin the task to the current cgroup.
  */
@@ -606,11 +615,8 @@ static inline struct task_group *task_group(struct task_struct *p)
 	struct task_group *tg;
 	struct cgroup_subsys_state *css;
 
-	if (p->flags & PF_EXITING)
-		return &root_task_group;
-
 	css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
-			lockdep_is_held(&task_rq(p)->lock));
+			lockdep_is_held(&p->pi_lock));
 	tg = container_of(css, struct task_group, css);
 
 	return autogroup_task_group(p, tg);
@@ -646,7 +652,7 @@ static void update_rq_clock(struct rq *rq)
 {
 	s64 delta;
 
-	if (rq->skip_clock_update)
+	if (rq->skip_clock_update > 0)
 		return;
 
 	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
@@ -664,10 +670,9 @@ static void update_rq_clock(struct rq *rq)
 #endif
 
 /**
- * runqueue_is_locked
+ * runqueue_is_locked - Returns true if the current cpu runqueue is locked
  * @cpu: the processor in question.
  *
- * Returns true if the current cpu runqueue is locked.
  * This interface allows printk to be called with the runqueue lock
  * held and know whether or not it is OK to wake up the klogd.
  */
@@ -843,18 +848,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p)
 	return rq->curr == p;
 }
 
-#ifndef __ARCH_WANT_UNLOCKED_CTXSW
 static inline int task_running(struct rq *rq, struct task_struct *p)
 {
+#ifdef CONFIG_SMP
+	return p->on_cpu;
+#else
 	return task_current(rq, p);
+#endif
 }
 
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 {
+#ifdef CONFIG_SMP
+	/*
+	 * We can optimise this out completely for !SMP, because the
+	 * SMP rebalancing from interrupt is the only thing that cares
+	 * here.
+	 */
+	next->on_cpu = 1;
+#endif
 }
 
 static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 {
+#ifdef CONFIG_SMP
+	/*
+	 * After ->on_cpu is cleared, the task can be moved to a different CPU.
+	 * We must ensure this doesn't happen until the switch is completely
+	 * finished.
+	 */
+	smp_wmb();
+	prev->on_cpu = 0;
+#endif
 #ifdef CONFIG_DEBUG_SPINLOCK
 	/* this is a valid case when another task releases the spinlock */
 	rq->lock.owner = current;
@@ -870,15 +896,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 }
 
 #else /* __ARCH_WANT_UNLOCKED_CTXSW */
-static inline int task_running(struct rq *rq, struct task_struct *p)
-{
-#ifdef CONFIG_SMP
-	return p->oncpu;
-#else
-	return task_current(rq, p);
-#endif
-}
-
 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 {
 #ifdef CONFIG_SMP
@@ -887,7 +904,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
 	 * SMP rebalancing from interrupt is the only thing that cares
 	 * here.
 	 */
-	next->oncpu = 1;
+	next->on_cpu = 1;
 #endif
 #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 	raw_spin_unlock_irq(&rq->lock);
@@ -900,12 +917,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 {
 #ifdef CONFIG_SMP
 	/*
-	 * After ->oncpu is cleared, the task can be moved to a different CPU.
+	 * After ->on_cpu is cleared, the task can be moved to a different CPU.
 	 * We must ensure this doesn't happen until the switch is completely
 	 * finished.
 	 */
 	smp_wmb();
-	prev->oncpu = 0;
+	prev->on_cpu = 0;
 #endif
 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
 	local_irq_enable();
@@ -914,23 +931,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
 
 /*
- * Check whether the task is waking, we use this to synchronize ->cpus_allowed
- * against ttwu().
- */
-static inline int task_is_waking(struct task_struct *p)
-{
-	return unlikely(p->state == TASK_WAKING);
-}
-
-/*
- * __task_rq_lock - lock the runqueue a given task resides on.
- * Must be called interrupts disabled.
+ * __task_rq_lock - lock the rq @p resides on.
 */
 static inline struct rq *__task_rq_lock(struct task_struct *p)
 	__acquires(rq->lock)
 {
 	struct rq *rq;
 
+	lockdep_assert_held(&p->pi_lock);
+
 	for (;;) {
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
@@ -941,22 +950,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
 }
 
 /*
- * task_rq_lock - lock the runqueue a given task resides on and disable
- * interrupts. Note the ordering: we can safely lookup the task_rq without
- * explicitly disabling preemption.
+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 */
 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+	__acquires(p->pi_lock)
 	__acquires(rq->lock)
 {
 	struct rq *rq;
 
 	for (;;) {
-		local_irq_save(*flags);
+		raw_spin_lock_irqsave(&p->pi_lock, *flags);
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
 		if (likely(rq == task_rq(p)))
 			return rq;
-		raw_spin_unlock_irqrestore(&rq->lock, *flags);
+		raw_spin_unlock(&rq->lock);
+		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
 	}
 }
 
@@ -966,10 +975,13 @@ static void __task_rq_unlock(struct rq *rq)
 	raw_spin_unlock(&rq->lock);
 }
 
-static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
+static inline void
+task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
 	__releases(rq->lock)
+	__releases(p->pi_lock)
 {
-	raw_spin_unlock_irqrestore(&rq->lock, *flags);
+	raw_spin_unlock(&rq->lock);
+	raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
 }
 
 /*
@@ -1198,11 +1210,17 @@ int get_nohz_timer_target(void)
 	int i;
 	struct sched_domain *sd;
 
+	rcu_read_lock();
 	for_each_domain(cpu, sd) {
-		for_each_cpu(i, sched_domain_span(sd))
-			if (!idle_cpu(i))
-				return i;
+		for_each_cpu(i, sched_domain_span(sd)) {
+			if (!idle_cpu(i)) {
+				cpu = i;
+				goto unlock;
+			}
+		}
 	}
+unlock:
+	rcu_read_unlock();
 	return cpu;
 }
 /*
@@ -1312,15 +1330,27 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 {
 	u64 tmp;
 
+	/*
+	 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
+	 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
+	 * 2^SCHED_LOAD_RESOLUTION.
+	 */
+	if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
+		tmp = (u64)delta_exec * scale_load_down(weight);
+	else
+		tmp = (u64)delta_exec;
+
 	if (!lw->inv_weight) {
-		if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
+		unsigned long w = scale_load_down(lw->weight);
+
+		if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
 			lw->inv_weight = 1;
+		else if (unlikely(!w))
+			lw->inv_weight = WMULT_CONST;
 		else
-			lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
-				/ (lw->weight+1);
+			lw->inv_weight = WMULT_CONST / w;
 	}
 
-	tmp = (u64)delta_exec * weight;
 	/*
 	 * Check whether we'd overflow the 64-bit multiplication:
 	 */
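Note: the calc_delta_mine() hunk above keeps the reciprocal of the scaled-down weight so that delta_exec * weight / lw->weight becomes a multiply and a shift. A rough, self-contained C model of that arithmetic is sketched below; it assumes WMULT_CONST is ~0U and SCHED_LOAD_RESOLUTION is 10 as in the hunk, and it omits the 64-bit overflow handling the real code performs.

#include <stdint.h>
#include <stdio.h>

#define SCHED_LOAD_RESOLUTION	10
#define WMULT_CONST		(~0U)	/* close to 2^32 */

static unsigned long scale_load_down(unsigned long w)
{
	return w >> SCHED_LOAD_RESOLUTION;
}

/* Approximates delta_exec * weight / lw_weight via a precomputed reciprocal. */
static uint64_t calc_delta_sketch(uint64_t delta_exec, unsigned long weight,
				  unsigned long lw_weight)
{
	unsigned long w = scale_load_down(lw_weight);
	uint32_t inv = w ? WMULT_CONST / w : WMULT_CONST;
	uint64_t tmp;

	/* group entities can have a weight below one full unit */
	if (weight > (1UL << SCHED_LOAD_RESOLUTION))
		tmp = delta_exec * (uint64_t)scale_load_down(weight);
	else
		tmp = delta_exec;

	return (tmp * inv) >> 32;	/* tmp / w, done as multiply + shift */
}

int main(void)
{
	/* 1ms of runtime for a unit-weight task against a doubly loaded queue */
	printf("%llu\n", (unsigned long long)
	       calc_delta_sketch(1000000, 1024UL << SCHED_LOAD_RESOLUTION,
				 2048UL << SCHED_LOAD_RESOLUTION));
	return 0;
}

Keeping weights at a higher internal resolution (the new scale_load()/scale_load_down() pair) reduces rounding error for deep cgroup hierarchies while the division itself stays a cheap reciprocal multiply.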
@@ -1686,6 +1716,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
 	__release(rq2->lock);
 }
 
+#else /* CONFIG_SMP */
+
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+static void double_rq_lock(struct rq *rq1, struct rq *rq2)
+	__acquires(rq1->lock)
+	__acquires(rq2->lock)
+{
+	BUG_ON(!irqs_disabled());
+	BUG_ON(rq1 != rq2);
+	raw_spin_lock(&rq1->lock);
+	__acquire(rq2->lock);	/* Fake it out ;) */
+}
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+	__releases(rq1->lock)
+	__releases(rq2->lock)
+{
+	BUG_ON(rq1 != rq2);
+	raw_spin_unlock(&rq1->lock);
+	__release(rq2->lock);
+}
+
 #endif
 
 static void calc_load_account_idle(struct rq *this_rq);
@@ -1727,17 +1790,20 @@ static void dec_nr_running(struct rq *rq)
 
 static void set_load_weight(struct task_struct *p)
 {
+	int prio = p->static_prio - MAX_RT_PRIO;
+	struct load_weight *load = &p->se.load;
+
 	/*
 	 * SCHED_IDLE tasks get minimal weight:
 	 */
 	if (p->policy == SCHED_IDLE) {
-		p->se.load.weight = WEIGHT_IDLEPRIO;
-		p->se.load.inv_weight = WMULT_IDLEPRIO;
+		load->weight = scale_load(WEIGHT_IDLEPRIO);
+		load->inv_weight = WMULT_IDLEPRIO;
 		return;
 	}
 
-	p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
-	p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
+	load->weight = scale_load(prio_to_weight[prio]);
+	load->inv_weight = prio_to_wmult[prio];
 }
 
 static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1745,7 +1811,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 	update_rq_clock(rq);
 	sched_info_queued(p);
 	p->sched_class->enqueue_task(rq, p, flags);
-	p->se.on_rq = 1;
 }
 
 static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1753,7 +1818,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 	update_rq_clock(rq);
 	sched_info_dequeued(p);
 	p->sched_class->dequeue_task(rq, p, flags);
-	p->se.on_rq = 0;
 }
 
 /*
@@ -1880,7 +1944,7 @@ void account_system_vtime(struct task_struct *curr)
 	 */
 	if (hardirq_count())
 		__this_cpu_add(cpu_hardirq_time, delta);
-	else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
+	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
 		__this_cpu_add(cpu_softirq_time, delta);
 
 	irq_time_write_end();
@@ -1920,8 +1984,40 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 	sched_rt_avg_update(rq, irq_delta);
 }
 
+static int irqtime_account_hi_update(void)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	unsigned long flags;
+	u64 latest_ns;
+	int ret = 0;
+
+	local_irq_save(flags);
+	latest_ns = this_cpu_read(cpu_hardirq_time);
+	if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
+		ret = 1;
+	local_irq_restore(flags);
+	return ret;
+}
+
+static int irqtime_account_si_update(void)
+{
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+	unsigned long flags;
+	u64 latest_ns;
+	int ret = 0;
+
+	local_irq_save(flags);
+	latest_ns = this_cpu_read(cpu_softirq_time);
+	if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
+		ret = 1;
+	local_irq_restore(flags);
+	return ret;
+}
+
 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
 
+#define sched_clock_irqtime	(0)
+
 static void update_rq_clock_task(struct rq *rq, s64 delta)
 {
 	rq->clock_task += delta;
@@ -2025,14 +2121,14 @@ inline int task_curr(const struct task_struct *p)
 
 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 				       const struct sched_class *prev_class,
-				       int oldprio, int running)
+				       int oldprio)
 {
 	if (prev_class != p->sched_class) {
 		if (prev_class->switched_from)
-			prev_class->switched_from(rq, p, running);
-		p->sched_class->switched_to(rq, p, running);
-	} else
-		p->sched_class->prio_changed(rq, p, oldprio, running);
+			prev_class->switched_from(rq, p);
+		p->sched_class->switched_to(rq, p);
+	} else if (oldprio != p->prio)
+		p->sched_class->prio_changed(rq, p, oldprio);
 }
 
 static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
@@ -2056,7 +2152,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 	 * A queue event has occurred, and we're going to schedule. In
 	 * this case, we can save a useless back to back clock update.
 	 */
-	if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
+	if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
 		rq->skip_clock_update = 1;
 }
 
@@ -2102,6 +2198,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 	 */
 	WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
 			!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
+
+#ifdef CONFIG_LOCKDEP
+	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
+				      lockdep_is_held(&task_rq(p)->lock)));
+#endif
 #endif
 
 	trace_sched_migrate_task(p, new_cpu);
@@ -2122,19 +2223,6 @@ struct migration_arg {
 static int migration_cpu_stop(void *data);
 
 /*
- * The task's runqueue lock must be held.
- * Returns true if you have to wait for migration thread.
- */
-static bool migrate_task(struct task_struct *p, struct rq *rq)
-{
-	/*
-	 * If the task is not on a runqueue (and not running), then
-	 * the next wake-up will properly place the task.
-	 */
-	return p->se.on_rq || task_running(rq, p);
-}
-
-/*
 * wait_task_inactive - wait for a thread to unschedule.
 *
 * If @match_state is nonzero, it's the @p->state value just checked and
@@ -2191,11 +2279,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 		rq = task_rq_lock(p, &flags);
 		trace_sched_wait_task(p);
 		running = task_running(rq, p);
-		on_rq = p->se.on_rq;
+		on_rq = p->on_rq;
 		ncsw = 0;
 		if (!match_state || p->state == match_state)
 			ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
-		task_rq_unlock(rq, &flags);
+		task_rq_unlock(rq, p, &flags);
 
 		/*
 		 * If it changed from the expected state, bail out now.
@@ -2224,7 +2312,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 		 * yield - it could be a while.
 		 */
 		if (unlikely(on_rq)) {
-			schedule_timeout_uninterruptible(1);
+			ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
+
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule_hrtimeout(&to, HRTIMER_MODE_REL);
 			continue;
 		}
 
@@ -2246,7 +2337,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 *
- * NOTE: this function doesnt have to take the runqueue lock,
+ * NOTE: this function doesn't have to take the runqueue lock,
 * because all it wants to ensure is that the remote task enters
 * the kernel. If the IPI races and the task has been migrated
 * to another CPU then no harm is done and the purpose has been
@@ -2265,30 +2356,9 @@ void kick_process(struct task_struct *p)
 EXPORT_SYMBOL_GPL(kick_process);
 #endif /* CONFIG_SMP */
 
-/**
- * task_oncpu_function_call - call a function on the cpu on which a task runs
- * @p: the task to evaluate
- * @func: the function to be called
- * @info: the function call argument
- *
- * Calls the function @func when the task is currently running. This might
- * be on the current CPU, which just calls the function directly
- */
-void task_oncpu_function_call(struct task_struct *p,
-			      void (*func) (void *info), void *info)
-{
-	int cpu;
-
-	preempt_disable();
-	cpu = task_cpu(p);
-	if (task_curr(p))
-		smp_call_function_single(cpu, func, info, 1);
-	preempt_enable();
-}
-
 #ifdef CONFIG_SMP
 /*
- * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
+ * ->cpus_allowed is protected by both rq->lock and p->pi_lock
 */
 static int select_fallback_rq(int cpu, struct task_struct *p)
 {
@@ -2321,12 +2391,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 }
 
 /*
- * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
+ * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
 */
 static inline
-int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
+int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
 {
-	int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
+	int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
 
 	/*
 	 * In order not to call set_task_cpu() on a blocking task we need
@@ -2352,27 +2422,62 @@ static void update_avg(u64 *avg, u64 sample)
 }
 #endif
 
-static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
-				 bool is_sync, bool is_migrate, bool is_local,
-				 unsigned long en_flags)
+static void
+ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 {
+#ifdef CONFIG_SCHEDSTATS
+	struct rq *rq = this_rq();
+
+#ifdef CONFIG_SMP
+	int this_cpu = smp_processor_id();
+
+	if (cpu == this_cpu) {
+		schedstat_inc(rq, ttwu_local);
+		schedstat_inc(p, se.statistics.nr_wakeups_local);
+	} else {
+		struct sched_domain *sd;
+
+		schedstat_inc(p, se.statistics.nr_wakeups_remote);
+		rcu_read_lock();
+		for_each_domain(this_cpu, sd) {
+			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+				schedstat_inc(sd, ttwu_wake_remote);
+				break;
+			}
+		}
+		rcu_read_unlock();
+	}
+#endif /* CONFIG_SMP */
+
+	schedstat_inc(rq, ttwu_count);
 	schedstat_inc(p, se.statistics.nr_wakeups);
-	if (is_sync)
+
+	if (wake_flags & WF_SYNC)
 		schedstat_inc(p, se.statistics.nr_wakeups_sync);
-	if (is_migrate)
+
+	if (cpu != task_cpu(p))
 		schedstat_inc(p, se.statistics.nr_wakeups_migrate);
-	if (is_local)
-		schedstat_inc(p, se.statistics.nr_wakeups_local);
-	else
-		schedstat_inc(p, se.statistics.nr_wakeups_remote);
 
+#endif /* CONFIG_SCHEDSTATS */
+}
+
+static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
+{
 	activate_task(rq, p, en_flags);
+	p->on_rq = 1;
+
+	/* if a worker is waking up, notify workqueue */
+	if (p->flags & PF_WQ_WORKER)
+		wq_worker_waking_up(p, cpu_of(rq));
 }
 
-static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
-					int wake_flags, bool success)
+/*
+ * Mark the task runnable and perform wakeup-preemption.
+ */
+static void
+ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 {
-	trace_sched_wakeup(p, success);
+	trace_sched_wakeup(p, true);
 	check_preempt_curr(rq, p, wake_flags);
 
 	p->state = TASK_RUNNING;
@@ -2391,9 +2496,99 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
 		rq->idle_stamp = 0;
 	}
 #endif
-	/* if a worker is waking up, notify workqueue */
-	if ((p->flags & PF_WQ_WORKER) && success)
-		wq_worker_waking_up(p, cpu_of(rq));
+}
+
+static void
+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
+{
+#ifdef CONFIG_SMP
+	if (p->sched_contributes_to_load)
+		rq->nr_uninterruptible--;
+#endif
+
+	ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
+	ttwu_do_wakeup(rq, p, wake_flags);
+}
+
+/*
+ * Called in case the task @p isn't fully descheduled from its runqueue,
+ * in this case we must do a remote wakeup. Its a 'light' wakeup though,
+ * since all we need to do is flip p->state to TASK_RUNNING, since
+ * the task is still ->on_rq.
+ */
+static int ttwu_remote(struct task_struct *p, int wake_flags)
+{
+	struct rq *rq;
+	int ret = 0;
+
+	rq = __task_rq_lock(p);
+	if (p->on_rq) {
+		ttwu_do_wakeup(rq, p, wake_flags);
+		ret = 1;
+	}
+	__task_rq_unlock(rq);
+
+	return ret;
+}
+
+#ifdef CONFIG_SMP
+static void sched_ttwu_pending(void)
+{
+	struct rq *rq = this_rq();
+	struct task_struct *list = xchg(&rq->wake_list, NULL);
+
+	if (!list)
+		return;
+
+	raw_spin_lock(&rq->lock);
+
+	while (list) {
+		struct task_struct *p = list;
+		list = list->wake_entry;
+		ttwu_do_activate(rq, p, 0);
+	}
+
+	raw_spin_unlock(&rq->lock);
+}
+
+void scheduler_ipi(void)
+{
+	sched_ttwu_pending();
+}
+
+static void ttwu_queue_remote(struct task_struct *p, int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	struct task_struct *next = rq->wake_list;
+
+	for (;;) {
+		struct task_struct *old = next;
+
+		p->wake_entry = next;
+		next = cmpxchg(&rq->wake_list, old, p);
+		if (next == old)
+			break;
+	}
+
+	if (!next)
+		smp_send_reschedule(cpu);
+}
+#endif
+
+static void ttwu_queue(struct task_struct *p, int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+
+#if defined(CONFIG_SMP)
+	if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
+		ttwu_queue_remote(p, cpu);
+		return;
+	}
+#endif
+
+	raw_spin_lock(&rq->lock);
+	ttwu_do_activate(rq, p, 0);
+	raw_spin_unlock(&rq->lock);
 }
 
 /**
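Note: the wake-list code added above lets a remote wakeup push a task onto the target runqueue's wake_list with cmpxchg() and have scheduler_ipi() drain the whole list with a single xchg(). The sketch below models that producer/consumer protocol in userspace with C11 atomics; the struct and function names are invented for illustration and are not kernel APIs.

#include <stdatomic.h>
#include <stddef.h>

struct node {
	struct node *next;
	/* per-task wakeup state would live here */
};

struct wake_queue {
	_Atomic(struct node *) head;
};

/* Like ttwu_queue_remote(): push one entry, report whether a kick is needed. */
int wake_queue_push(struct wake_queue *q, struct node *n)
{
	struct node *old = atomic_load(&q->head);

	do {
		n->next = old;		/* link to the current head */
	} while (!atomic_compare_exchange_weak(&q->head, &old, n));

	return old == NULL;	/* list was empty: send the "IPI" */
}

/* Like sched_ttwu_pending(): detach the whole list at once, then walk it. */
void wake_queue_drain(struct wake_queue *q, void (*activate)(struct node *))
{
	struct node *n = atomic_exchange(&q->head, NULL);

	while (n) {
		struct node *next = n->next;
		activate(n);
		n = next;
	}
}

The single exchange in the drain path means the waking CPU never has to take the remote rq->lock; only the IPI handler on the owning CPU does, which is the contention the TTWU_QUEUE feature is meant to remove.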
@@ -2411,92 +2606,64 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
 * Returns %true if @p was woken up, %false if it was already running
 * or @state didn't match @p's state.
 */
-static int try_to_wake_up(struct task_struct *p, unsigned int state,
-			  int wake_flags)
+static int
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 {
-	int cpu, orig_cpu, this_cpu, success = 0;
 	unsigned long flags;
-	unsigned long en_flags = ENQUEUE_WAKEUP;
-	struct rq *rq;
-
-	this_cpu = get_cpu();
+	int cpu, success = 0;
 
 	smp_wmb();
-	rq = task_rq_lock(p, &flags);
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	if (!(p->state & state))
 		goto out;
 
-	if (p->se.on_rq)
-		goto out_running;
-
+	success = 1; /* we're going to change ->state */
 	cpu = task_cpu(p);
-	orig_cpu = cpu;
 
-#ifdef CONFIG_SMP
-	if (unlikely(task_running(rq, p)))
-		goto out_activate;
+	if (p->on_rq && ttwu_remote(p, wake_flags))
+		goto stat;
 
+#ifdef CONFIG_SMP
 	/*
-	 * In order to handle concurrent wakeups and release the rq->lock
-	 * we put the task in TASK_WAKING state.
-	 *
-	 * First fix up the nr_uninterruptible count:
+	 * If the owning (remote) cpu is still in the middle of schedule() with
+	 * this task as prev, wait until its done referencing the task.
 	 */
-	if (task_contributes_to_load(p)) {
-		if (likely(cpu_online(orig_cpu)))
-			rq->nr_uninterruptible--;
-		else
-			this_rq()->nr_uninterruptible--;
-	}
-	p->state = TASK_WAKING;
-
-	if (p->sched_class->task_waking) {
-		p->sched_class->task_waking(rq, p);
-		en_flags |= ENQUEUE_WAKING;
+	while (p->on_cpu) {
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+		/*
+		 * If called from interrupt context we could have landed in the
+		 * middle of schedule(), in this case we should take care not
+		 * to spin on ->on_cpu if p is current, since that would
+		 * deadlock.
+		 */
+		if (p == current) {
+			ttwu_queue(p, cpu);
+			goto stat;
+		}
+#endif
+		cpu_relax();
 	}
-
-	cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
-	if (cpu != orig_cpu)
-		set_task_cpu(p, cpu);
-	__task_rq_unlock(rq);
-
-	rq = cpu_rq(cpu);
-	raw_spin_lock(&rq->lock);
-
 	/*
-	 * We migrated the task without holding either rq->lock, however
-	 * since the task is not on the task list itself, nobody else
-	 * will try and migrate the task, hence the rq should match the
-	 * cpu we just moved it to.
+	 * Pairs with the smp_wmb() in finish_lock_switch().
 	 */
-	WARN_ON(task_cpu(p) != cpu);
-	WARN_ON(p->state != TASK_WAKING);
+	smp_rmb();
 
-#ifdef CONFIG_SCHEDSTATS
-	schedstat_inc(rq, ttwu_count);
-	if (cpu == this_cpu)
-		schedstat_inc(rq, ttwu_local);
-	else {
-		struct sched_domain *sd;
-		for_each_domain(this_cpu, sd) {
-			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
-				schedstat_inc(sd, ttwu_wake_remote);
-				break;
-			}
-		}
-	}
-#endif /* CONFIG_SCHEDSTATS */
+	p->sched_contributes_to_load = !!task_contributes_to_load(p);
+	p->state = TASK_WAKING;
 
-out_activate:
+	if (p->sched_class->task_waking)
+		p->sched_class->task_waking(p);
+
+	cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
+	if (task_cpu(p) != cpu)
+		set_task_cpu(p, cpu);
 #endif /* CONFIG_SMP */
-	ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
-		      cpu == this_cpu, en_flags);
-	success = 1;
-out_running:
-	ttwu_post_activation(p, rq, wake_flags, success);
+
+	ttwu_queue(p, cpu);
+stat:
+	ttwu_stat(p, cpu, wake_flags);
 out:
-	task_rq_unlock(rq, &flags);
-	put_cpu();
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
 	return success;
 }
@@ -2505,31 +2672,34 @@ out:
 * try_to_wake_up_local - try to wake up a local task with rq lock held
 * @p: the thread to be awakened
 *
 * Put @p on the run-queue if it's not already there. The caller must
 * ensure that this_rq() is locked, @p is bound to this_rq() and not
- * the current task. this_rq() stays locked over invocation.
+ * the current task.
 */
 static void try_to_wake_up_local(struct task_struct *p)
 {
 	struct rq *rq = task_rq(p);
-	bool success = false;
 
 	BUG_ON(rq != this_rq());
 	BUG_ON(p == current);
 	lockdep_assert_held(&rq->lock);
 
+	if (!raw_spin_trylock(&p->pi_lock)) {
+		raw_spin_unlock(&rq->lock);
+		raw_spin_lock(&p->pi_lock);
+		raw_spin_lock(&rq->lock);
+	}
+
 	if (!(p->state & TASK_NORMAL))
-		return;
+		goto out;
 
-	if (!p->se.on_rq) {
-		if (likely(!task_running(rq, p))) {
-			schedstat_inc(rq, ttwu_count);
-			schedstat_inc(rq, ttwu_local);
-		}
-		ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
-		success = true;
-	}
-	ttwu_post_activation(p, rq, 0, success);
+	if (!p->on_rq)
+		ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+
+	ttwu_do_wakeup(rq, p, 0);
+	ttwu_stat(p, smp_processor_id(), 0);
+out:
+	raw_spin_unlock(&p->pi_lock);
 }
 
 /**
@@ -2562,18 +2732,21 @@ int wake_up_state(struct task_struct *p, unsigned int state)
 */
 static void __sched_fork(struct task_struct *p)
 {
+	p->on_rq = 0;
+
+	p->se.on_rq = 0;
 	p->se.exec_start = 0;
 	p->se.sum_exec_runtime = 0;
 	p->se.prev_sum_exec_runtime = 0;
 	p->se.nr_migrations = 0;
+	p->se.vruntime = 0;
+	INIT_LIST_HEAD(&p->se.group_node);
 
 #ifdef CONFIG_SCHEDSTATS
 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
 
 	INIT_LIST_HEAD(&p->rt.run_list);
-	p->se.on_rq = 0;
-	INIT_LIST_HEAD(&p->se.group_node);
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -2583,8 +2756,9 @@ static void __sched_fork(struct task_struct *p)
 /*
 * fork()/clone()-time setup:
 */
-void sched_fork(struct task_struct *p, int clone_flags)
+void sched_fork(struct task_struct *p)
 {
+	unsigned long flags;
 	int cpu = get_cpu();
 
 	__sched_fork(p);
@@ -2635,16 +2809,16 @@ void sched_fork(struct task_struct *p, int clone_flags)
 	 *
 	 * Silence PROVE_RCU.
 	 */
-	rcu_read_lock();
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	set_task_cpu(p, cpu);
-	rcu_read_unlock();
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 	if (likely(sched_info_on()))
 		memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
-	p->oncpu = 0;
+#if defined(CONFIG_SMP)
+	p->on_cpu = 0;
 #endif
 #ifdef CONFIG_PREEMPT
 	/* Want to start with kernel preemption disabled. */
@@ -2664,41 +2838,31 @@ void sched_fork(struct task_struct *p, int clone_flags)
 * that must be done for every newly created context, then puts the task
 * on the runqueue and wakes it.
 */
-void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
+void wake_up_new_task(struct task_struct *p)
 {
 	unsigned long flags;
 	struct rq *rq;
-	int cpu __maybe_unused = get_cpu();
 
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
 #ifdef CONFIG_SMP
-	rq = task_rq_lock(p, &flags);
-	p->state = TASK_WAKING;
-
 	/*
 	 * Fork balancing, do it here and not earlier because:
 	 * - cpus_allowed can change in the fork path
 	 * - any previously selected cpu might disappear through hotplug
-	 *
-	 * We set TASK_WAKING so that select_task_rq() can drop rq->lock
-	 * without people poking at ->cpus_allowed.
 	 */
-	cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
-	set_task_cpu(p, cpu);
-
-	p->state = TASK_RUNNING;
-	task_rq_unlock(rq, &flags);
+	set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
 #endif
 
-	rq = task_rq_lock(p, &flags);
+	rq = __task_rq_lock(p);
 	activate_task(rq, p, 0);
-	trace_sched_wakeup_new(p, 1);
+	p->on_rq = 1;
+	trace_sched_wakeup_new(p, true);
 	check_preempt_curr(rq, p, WF_FORK);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_woken)
 		p->sched_class->task_woken(rq, p);
 #endif
-	task_rq_unlock(rq, &flags);
-	put_cpu();
+	task_rq_unlock(rq, p, &flags);
 }
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -2776,9 +2940,12 @@ static inline void
 prepare_task_switch(struct rq *rq, struct task_struct *prev,
		    struct task_struct *next)
 {
+	sched_info_switch(prev, next);
+	perf_event_task_sched_out(prev, next);
 	fire_sched_out_preempt_notifiers(prev, next);
 	prepare_lock_switch(rq, next);
 	prepare_arch_switch(next);
+	trace_sched_switch(prev, next);
 }
 
 /**
@@ -2911,7 +3078,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	struct mm_struct *mm, *oldmm;
 
 	prepare_task_switch(rq, prev, next);
-	trace_sched_switch(prev, next);
+
 	mm = next->mm;
 	oldmm = prev->active_mm;
 	/*
@@ -3404,27 +3571,22 @@ void sched_exec(void)
 {
 	struct task_struct *p = current;
 	unsigned long flags;
-	struct rq *rq;
 	int dest_cpu;
 
-	rq = task_rq_lock(p, &flags);
-	dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
+	raw_spin_lock_irqsave(&p->pi_lock, flags);
+	dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
 	if (dest_cpu == smp_processor_id())
 		goto unlock;
 
-	/*
-	 * select_task_rq() can race against ->cpus_allowed
-	 */
-	if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
-	    likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
+	if (likely(cpu_active(dest_cpu))) {
 		struct migration_arg arg = { p, dest_cpu };
 
-		task_rq_unlock(rq, &flags);
-		stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+		raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+		stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
 		return;
 	}
 unlock:
-	task_rq_unlock(rq, &flags);
+	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 }
 
 #endif
@@ -3461,7 +3623,7 @@ unsigned long long task_delta_exec(struct task_struct *p)
 
 	rq = task_rq_lock(p, &flags);
 	ns = do_task_delta_exec(p, rq);
-	task_rq_unlock(rq, &flags);
+	task_rq_unlock(rq, p, &flags);
 
 	return ns;
 }
@@ -3479,7 +3641,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 
 	rq = task_rq_lock(p, &flags);
 	ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
-	task_rq_unlock(rq, &flags);
+	task_rq_unlock(rq, p, &flags);
 
 	return ns;
 }
@@ -3503,7 +3665,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p)
 	rq = task_rq_lock(p, &flags);
 	thread_group_cputime(p, &totals);
 	ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
-	task_rq_unlock(rq, &flags);
+	task_rq_unlock(rq, p, &flags);
 
 	return ns;
 }
@@ -3568,6 +3730,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
 }
 
 /*
+ * Account system cpu time to a process and desired cpustat field
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ * @target_cputime64: pointer to cpustat field that has to be updated
+ */
+static inline
+void __account_system_time(struct task_struct *p, cputime_t cputime,
+			cputime_t cputime_scaled, cputime64_t *target_cputime64)
+{
+	cputime64_t tmp = cputime_to_cputime64(cputime);
+
+	/* Add system time to process. */
+	p->stime = cputime_add(p->stime, cputime);
+	p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
+	account_group_system_time(p, cputime);
+
+	/* Add system time to cpustat. */
+	*target_cputime64 = cputime64_add(*target_cputime64, tmp);
+	cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+
+	/* Account for system time used */
+	acct_update_integrals(p);
+}
+
+/*
 * Account system cpu time to a process.
 * @p: the process that the cpu time gets accounted to
 * @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3578,36 +3766,26 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
			 cputime_t cputime, cputime_t cputime_scaled)
 {
 	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
-	cputime64_t tmp;
+	cputime64_t *target_cputime64;
 
 	if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
 		account_guest_time(p, cputime, cputime_scaled);
 		return;
 	}
 
-	/* Add system time to process. */
-	p->stime = cputime_add(p->stime, cputime);
-	p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
-	account_group_system_time(p, cputime);
-
-	/* Add system time to cpustat. */
-	tmp = cputime_to_cputime64(cputime);
 	if (hardirq_count() - hardirq_offset)
-		cpustat->irq = cputime64_add(cpustat->irq, tmp);
+		target_cputime64 = &cpustat->irq;
 	else if (in_serving_softirq())
-		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+		target_cputime64 = &cpustat->softirq;
 	else
-		cpustat->system = cputime64_add(cpustat->system, tmp);
+		target_cputime64 = &cpustat->system;
 
-	cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
-
-	/* Account for system time used */
-	acct_update_integrals(p);
+	__account_system_time(p, cputime, cputime_scaled, target_cputime64);
 }
 
 /*
 * Account for involuntary wait time.
- * @steal: the cpu time spent in involuntary wait
+ * @cputime: the cpu time spent in involuntary wait
 */
 void account_steal_time(cputime_t cputime)
 {
@@ -3635,6 +3813,73 @@ void account_idle_time(cputime_t cputime)
 
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * Account a tick to a process and cpustat
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: is the tick from userspace
+ * @rq: the pointer to rq
+ *
+ * Tick demultiplexing follows the order
+ * - pending hardirq update
+ * - pending softirq update
+ * - user_time
+ * - idle_time
+ * - system time
+ *   - check for guest_time
+ *   - else account as system_time
+ *
+ * Check for hardirq is done both for system and user time as there is
+ * no timer going off while we are on hardirq and hence we may never get an
+ * opportunity to update it solely in system time.
+ * p->stime and friends are only updated on system time and not on irq
+ * softirq as those do not count in task exec_runtime any more.
+ */
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+						struct rq *rq)
+{
+	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+	cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
+	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+
+	if (irqtime_account_hi_update()) {
+		cpustat->irq = cputime64_add(cpustat->irq, tmp);
+	} else if (irqtime_account_si_update()) {
+		cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+	} else if (this_cpu_ksoftirqd() == p) {
+		/*
+		 * ksoftirqd time do not get accounted in cpu_softirq_time.
+		 * So, we have to handle it separately here.
+		 * Also, p->stime needs to be updated for ksoftirqd.
+		 */
+		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+					&cpustat->softirq);
+	} else if (user_tick) {
+		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+	} else if (p == rq->idle) {
+		account_idle_time(cputime_one_jiffy);
+	} else if (p->flags & PF_VCPU) { /* System time or guest time */
+		account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+	} else {
+		__account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+					&cpustat->system);
+	}
+}
+
+static void irqtime_account_idle_ticks(int ticks)
+{
+	int i;
+	struct rq *rq = this_rq();
+
+	for (i = 0; i < ticks; i++)
+		irqtime_account_process_tick(current, 0, rq);
+}
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+static void irqtime_account_idle_ticks(int ticks) {}
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+						struct rq *rq) {}
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
 /*
 * Account a single tick of cpu time.
 * @p: the process that the cpu time gets accounted to
@@ -3645,6 +3890,11 @@ void account_process_tick(struct task_struct *p, int user_tick)
 	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
 	struct rq *rq = this_rq();
 
+	if (sched_clock_irqtime) {
+		irqtime_account_process_tick(p, user_tick, rq);
+		return;
+	}
+
 	if (user_tick)
 		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
 	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -3670,6 +3920,12 @@ void account_steal_ticks(unsigned long ticks)
 */
 void account_idle_ticks(unsigned long ticks)
 {
+
+	if (sched_clock_irqtime) {
+		irqtime_account_idle_ticks(ticks);
+		return;
+	}
+
 	account_idle_time(jiffies_to_cputime(ticks));
 }
 
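Note: the CONFIG_IRQ_TIME_ACCOUNTING hunks above demultiplex each tick in a fixed priority order (pending hardirq, pending softirq or ksoftirqd, user, idle, guest, then system). The sketch below restates that decision order as plain C; the enum, struct and helper names are invented for illustration, not kernel APIs.

enum tick_class {
	TICK_HARDIRQ,
	TICK_SOFTIRQ,
	TICK_USER,
	TICK_IDLE,
	TICK_GUEST,
	TICK_SYSTEM,
};

struct tick_info {
	int pending_hardirq;	/* models irqtime_account_hi_update() */
	int pending_softirq;	/* models irqtime_account_si_update() */
	int is_ksoftirqd;	/* ksoftirqd is charged as softirq time */
	int user_tick;
	int is_idle_task;
	int is_vcpu;		/* PF_VCPU: guest time */
};

enum tick_class classify_tick(const struct tick_info *t)
{
	if (t->pending_hardirq)
		return TICK_HARDIRQ;
	if (t->pending_softirq || t->is_ksoftirqd)
		return TICK_SOFTIRQ;
	if (t->user_tick)
		return TICK_USER;
	if (t->is_idle_task)
		return TICK_IDLE;
	if (t->is_vcpu)
		return TICK_GUEST;
	return TICK_SYSTEM;
}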
@@ -3763,9 +4019,6 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
3763/* 4019/*
3764 * This function gets called by the timer code, with HZ frequency. 4020 * This function gets called by the timer code, with HZ frequency.
3765 * We call it with interrupts disabled. 4021 * We call it with interrupts disabled.
3766 *
3767 * It also gets called by the fork code, when changing the parent's
3768 * timeslices.
3769 */ 4022 */
3770void scheduler_tick(void) 4023void scheduler_tick(void)
3771{ 4024{
@@ -3885,17 +4138,11 @@ static inline void schedule_debug(struct task_struct *prev)
3885 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 4138 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3886 4139
3887 schedstat_inc(this_rq(), sched_count); 4140 schedstat_inc(this_rq(), sched_count);
3888#ifdef CONFIG_SCHEDSTATS
3889 if (unlikely(prev->lock_depth >= 0)) {
3890 schedstat_inc(this_rq(), rq_sched_info.bkl_count);
3891 schedstat_inc(prev, sched_info.bkl_count);
3892 }
3893#endif
3894} 4141}
3895 4142
3896static void put_prev_task(struct rq *rq, struct task_struct *prev) 4143static void put_prev_task(struct rq *rq, struct task_struct *prev)
3897{ 4144{
3898 if (prev->se.on_rq) 4145 if (prev->on_rq || rq->skip_clock_update < 0)
3899 update_rq_clock(rq); 4146 update_rq_clock(rq);
3900 prev->sched_class->put_prev_task(rq, prev); 4147 prev->sched_class->put_prev_task(rq, prev);
3901} 4148}
@@ -3945,9 +4192,6 @@ need_resched:
3945 rcu_note_context_switch(cpu); 4192 rcu_note_context_switch(cpu);
3946 prev = rq->curr; 4193 prev = rq->curr;
3947 4194
3948 release_kernel_lock(prev);
3949need_resched_nonpreemptible:
3950
3951 schedule_debug(prev); 4195 schedule_debug(prev);
3952 4196
3953 if (sched_feat(HRTICK)) 4197 if (sched_feat(HRTICK))
@@ -3960,11 +4204,13 @@ need_resched_nonpreemptible:
3960 if (unlikely(signal_pending_state(prev->state, prev))) { 4204 if (unlikely(signal_pending_state(prev->state, prev))) {
3961 prev->state = TASK_RUNNING; 4205 prev->state = TASK_RUNNING;
3962 } else { 4206 } else {
4207 deactivate_task(rq, prev, DEQUEUE_SLEEP);
4208 prev->on_rq = 0;
4209
3963 /* 4210 /*
3964 * If a worker is going to sleep, notify and 4211 * If a worker went to sleep, notify and ask workqueue
3965 * ask workqueue whether it wants to wake up a 4212 * whether it wants to wake up a task to maintain
3966 * task to maintain concurrency. If so, wake 4213 * concurrency.
3967 * up the task.
3968 */ 4214 */
3969 if (prev->flags & PF_WQ_WORKER) { 4215 if (prev->flags & PF_WQ_WORKER) {
3970 struct task_struct *to_wakeup; 4216 struct task_struct *to_wakeup;
@@ -3973,7 +4219,16 @@ need_resched_nonpreemptible:
3973 if (to_wakeup) 4219 if (to_wakeup)
3974 try_to_wake_up_local(to_wakeup); 4220 try_to_wake_up_local(to_wakeup);
3975 } 4221 }
3976 deactivate_task(rq, prev, DEQUEUE_SLEEP); 4222
4223 /*
4224 * If we are going to sleep and we have plugged IO
4225 * queued, make sure to submit it to avoid deadlocks.
4226 */
4227 if (blk_needs_flush_plug(prev)) {
4228 raw_spin_unlock(&rq->lock);
4229 blk_schedule_flush_plug(prev);
4230 raw_spin_lock(&rq->lock);
4231 }
3977 } 4232 }
3978 switch_count = &prev->nvcsw; 4233 switch_count = &prev->nvcsw;
3979 } 4234 }
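
The hunk above drops rq->lock around blk_schedule_flush_plug() because submitting the plugged block requests may itself block, which is not allowed under a raw spinlock. A minimal standalone C sketch of that drop-and-retake pattern, using pthread mutexes and hypothetical names rather than the kernel API:

#include <pthread.h>

struct runqueue_like { pthread_mutex_t lock; };

static void flush_pending_io(void)
{
	/* stands in for blk_schedule_flush_plug(): submit queued block I/O */
}

static void sleep_prepare(struct runqueue_like *rq, int has_plugged_io)
{
	/* caller holds rq->lock */
	if (has_plugged_io) {
		pthread_mutex_unlock(&rq->lock);
		flush_pending_io();		/* may block; must not run under rq->lock */
		pthread_mutex_lock(&rq->lock);
		/* any state read before the unlock must be re-validated here */
	}
}

The last comment is the important part of the idiom: once the lock has been dropped, nothing derived from it before the call can be trusted without being checked again.
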
@@ -3989,9 +4244,6 @@ need_resched_nonpreemptible:
3989 rq->skip_clock_update = 0; 4244 rq->skip_clock_update = 0;
3990 4245
3991 if (likely(prev != next)) { 4246 if (likely(prev != next)) {
3992 sched_info_switch(prev, next);
3993 perf_event_task_sched_out(prev, next);
3994
3995 rq->nr_switches++; 4247 rq->nr_switches++;
3996 rq->curr = next; 4248 rq->curr = next;
3997 ++*switch_count; 4249 ++*switch_count;
@@ -4010,9 +4262,6 @@ need_resched_nonpreemptible:
4010 4262
4011 post_schedule(rq); 4263 post_schedule(rq);
4012 4264
4013 if (unlikely(reacquire_kernel_lock(prev)))
4014 goto need_resched_nonpreemptible;
4015
4016 preempt_enable_no_resched(); 4265 preempt_enable_no_resched();
4017 if (need_resched()) 4266 if (need_resched())
4018 goto need_resched; 4267 goto need_resched;
@@ -4020,70 +4269,53 @@ need_resched_nonpreemptible:
4020EXPORT_SYMBOL(schedule); 4269EXPORT_SYMBOL(schedule);
4021 4270
4022#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 4271#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
4023/*
4024 * Look out! "owner" is an entirely speculative pointer
4025 * access and not reliable.
4026 */
4027int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
4028{
4029 unsigned int cpu;
4030 struct rq *rq;
4031 4272
4032 if (!sched_feat(OWNER_SPIN)) 4273static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
4033 return 0; 4274{
4275 bool ret = false;
4034 4276
4035#ifdef CONFIG_DEBUG_PAGEALLOC 4277 rcu_read_lock();
4036 /* 4278 if (lock->owner != owner)
4037 * Need to access the cpu field knowing that 4279 goto fail;
4038 * DEBUG_PAGEALLOC could have unmapped it if
4039 * the mutex owner just released it and exited.
4040 */
4041 if (probe_kernel_address(&owner->cpu, cpu))
4042 return 0;
4043#else
4044 cpu = owner->cpu;
4045#endif
4046 4280
4047 /* 4281 /*
4048 * Even if the access succeeded (likely case), 4282 * Ensure we emit the owner->on_cpu, dereference _after_ checking
4049 * the cpu field may no longer be valid. 4283 * lock->owner still matches owner, if that fails, owner might
4284 * point to free()d memory, if it still matches, the rcu_read_lock()
4285 * ensures the memory stays valid.
4050 */ 4286 */
4051 if (cpu >= nr_cpumask_bits) 4287 barrier();
4052 return 0;
4053 4288
4054 /* 4289 ret = owner->on_cpu;
4055 * We need to validate that we can do a 4290fail:
4056 * get_cpu() and that we have the percpu area. 4291 rcu_read_unlock();
4057 */
4058 if (!cpu_online(cpu))
4059 return 0;
4060 4292
4061 rq = cpu_rq(cpu); 4293 return ret;
4294}
4062 4295
4063 for (;;) { 4296/*
4064 /* 4297 * Look out! "owner" is an entirely speculative pointer
4065 * Owner changed, break to re-assess state. 4298 * access and not reliable.
4066 */ 4299 */
4067 if (lock->owner != owner) { 4300int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
4068 /* 4301{
4069 * If the lock has switched to a different owner, 4302 if (!sched_feat(OWNER_SPIN))
4070 * we likely have heavy contention. Return 0 to quit 4303 return 0;
4071 * optimistic spinning and not contend further:
4072 */
4073 if (lock->owner)
4074 return 0;
4075 break;
4076 }
4077 4304
4078 /* 4305 while (owner_running(lock, owner)) {
4079 * Is that owner really running on that cpu? 4306 if (need_resched())
4080 */
4081 if (task_thread_info(rq->curr) != owner || need_resched())
4082 return 0; 4307 return 0;
4083 4308
4084 arch_mutex_cpu_relax(); 4309 arch_mutex_cpu_relax();
4085 } 4310 }
4086 4311
4312 /*
4313 * If the owner changed to another task there is likely
4314 * heavy contention, stop spinning.
4315 */
4316 if (lock->owner)
4317 return 0;
4318
4087 return 1; 4319 return 1;
4088} 4320}
4089#endif 4321#endif
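
The rewrite above splits the optimistic-spin path into owner_running(), which re-checks lock->owner before dereferencing it (under rcu_read_lock(), with barrier() forcing that order), and a much simpler mutex_spin_on_owner() loop. A rough userspace sketch of that shape follows, with C11 atomics standing in for the kernel's barrier() and RCU; every name is hypothetical and the use-after-free protection RCU provides is not reproduced here:

/*
 * Spin only while the lock still records the same owner and that owner
 * is on a CPU. In the kernel, RCU keeps the owner's task_struct from
 * being freed while it is inspected; this sketch omits that.
 */
#include <stdatomic.h>
#include <stdbool.h>

struct owner_info {
	atomic_bool on_cpu;			/* is the owner currently running? */
};

struct spinnable_lock {
	_Atomic(struct owner_info *) owner;	/* NULL when unlocked */
};

static bool owner_running(struct spinnable_lock *lock, struct owner_info *owner)
{
	/* Re-check the owner first; if it changed, 'owner' may be stale. */
	if (atomic_load_explicit(&lock->owner, memory_order_acquire) != owner)
		return false;

	return atomic_load_explicit(&owner->on_cpu, memory_order_relaxed);
}

static int spin_on_owner(struct spinnable_lock *lock, struct owner_info *owner)
{
	while (owner_running(lock, owner))
		;	/* the kernel also checks need_resched() and relaxes the cpu */

	/*
	 * Owner changed to another task: likely heavy contention,
	 * so give up optimistic spinning (mirrors the tail of the new code).
	 */
	if (atomic_load_explicit(&lock->owner, memory_order_acquire))
		return 0;

	return 1;	/* lock looks free: worth trying to acquire it */
}

The design point is that the owner pointer is trusted only for as long as lock->owner still matches it; the moment it changes, the spinner either quits or attempts the acquisition itself.
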
@@ -4213,6 +4445,7 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
4213{ 4445{
4214 __wake_up_common(q, mode, 1, 0, key); 4446 __wake_up_common(q, mode, 1, 0, key);
4215} 4447}
4448EXPORT_SYMBOL_GPL(__wake_up_locked_key);
4216 4449
4217/** 4450/**
4218 * __wake_up_sync_key - wake up threads blocked on a waitqueue. 4451 * __wake_up_sync_key - wake up threads blocked on a waitqueue.
@@ -4542,19 +4775,18 @@ EXPORT_SYMBOL(sleep_on_timeout);
4542 */ 4775 */
4543void rt_mutex_setprio(struct task_struct *p, int prio) 4776void rt_mutex_setprio(struct task_struct *p, int prio)
4544{ 4777{
4545 unsigned long flags;
4546 int oldprio, on_rq, running; 4778 int oldprio, on_rq, running;
4547 struct rq *rq; 4779 struct rq *rq;
4548 const struct sched_class *prev_class; 4780 const struct sched_class *prev_class;
4549 4781
4550 BUG_ON(prio < 0 || prio > MAX_PRIO); 4782 BUG_ON(prio < 0 || prio > MAX_PRIO);
4551 4783
4552 rq = task_rq_lock(p, &flags); 4784 rq = __task_rq_lock(p);
4553 4785
4554 trace_sched_pi_setprio(p, prio); 4786 trace_sched_pi_setprio(p, prio);
4555 oldprio = p->prio; 4787 oldprio = p->prio;
4556 prev_class = p->sched_class; 4788 prev_class = p->sched_class;
4557 on_rq = p->se.on_rq; 4789 on_rq = p->on_rq;
4558 running = task_current(rq, p); 4790 running = task_current(rq, p);
4559 if (on_rq) 4791 if (on_rq)
4560 dequeue_task(rq, p, 0); 4792 dequeue_task(rq, p, 0);
@@ -4570,12 +4802,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4570 4802
4571 if (running) 4803 if (running)
4572 p->sched_class->set_curr_task(rq); 4804 p->sched_class->set_curr_task(rq);
4573 if (on_rq) { 4805 if (on_rq)
4574 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0); 4806 enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
4575 4807
4576 check_class_changed(rq, p, prev_class, oldprio, running); 4808 check_class_changed(rq, p, prev_class, oldprio);
4577 } 4809 __task_rq_unlock(rq);
4578 task_rq_unlock(rq, &flags);
4579} 4810}
4580 4811
4581#endif 4812#endif
@@ -4603,7 +4834,7 @@ void set_user_nice(struct task_struct *p, long nice)
4603 p->static_prio = NICE_TO_PRIO(nice); 4834 p->static_prio = NICE_TO_PRIO(nice);
4604 goto out_unlock; 4835 goto out_unlock;
4605 } 4836 }
4606 on_rq = p->se.on_rq; 4837 on_rq = p->on_rq;
4607 if (on_rq) 4838 if (on_rq)
4608 dequeue_task(rq, p, 0); 4839 dequeue_task(rq, p, 0);
4609 4840
@@ -4623,7 +4854,7 @@ void set_user_nice(struct task_struct *p, long nice)
4623 resched_task(rq->curr); 4854 resched_task(rq->curr);
4624 } 4855 }
4625out_unlock: 4856out_unlock:
4626 task_rq_unlock(rq, &flags); 4857 task_rq_unlock(rq, p, &flags);
4627} 4858}
4628EXPORT_SYMBOL(set_user_nice); 4859EXPORT_SYMBOL(set_user_nice);
4629 4860
@@ -4737,8 +4968,6 @@ static struct task_struct *find_process_by_pid(pid_t pid)
4737static void 4968static void
4738__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) 4969__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4739{ 4970{
4740 BUG_ON(p->se.on_rq);
4741
4742 p->policy = policy; 4971 p->policy = policy;
4743 p->rt_priority = prio; 4972 p->rt_priority = prio;
4744 p->normal_prio = normal_prio(p); 4973 p->normal_prio = normal_prio(p);
@@ -4761,8 +4990,11 @@ static bool check_same_owner(struct task_struct *p)
4761 4990
4762 rcu_read_lock(); 4991 rcu_read_lock();
4763 pcred = __task_cred(p); 4992 pcred = __task_cred(p);
4764 match = (cred->euid == pcred->euid || 4993 if (cred->user->user_ns == pcred->user->user_ns)
4765 cred->euid == pcred->uid); 4994 match = (cred->euid == pcred->euid ||
4995 cred->euid == pcred->uid);
4996 else
4997 match = false;
4766 rcu_read_unlock(); 4998 rcu_read_unlock();
4767 return match; 4999 return match;
4768} 5000}
@@ -4822,12 +5054,15 @@ recheck:
4822 param->sched_priority > rlim_rtprio) 5054 param->sched_priority > rlim_rtprio)
4823 return -EPERM; 5055 return -EPERM;
4824 } 5056 }
5057
4825 /* 5058 /*
4826 * Like positive nice levels, dont allow tasks to 5059 * Treat SCHED_IDLE as nice 20. Only allow a switch to
4827 * move out of SCHED_IDLE either: 5060 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
4828 */ 5061 */
4829 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) 5062 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
4830 return -EPERM; 5063 if (!can_nice(p, TASK_NICE(p)))
5064 return -EPERM;
5065 }
4831 5066
4832 /* can't change other user's priorities */ 5067 /* can't change other user's priorities */
4833 if (!check_same_owner(p)) 5068 if (!check_same_owner(p))
@@ -4847,21 +5082,29 @@ recheck:
4847 /* 5082 /*
4848 * make sure no PI-waiters arrive (or leave) while we are 5083 * make sure no PI-waiters arrive (or leave) while we are
4849 * changing the priority of the task: 5084 * changing the priority of the task:
4850 */ 5085 *
4851 raw_spin_lock_irqsave(&p->pi_lock, flags); 5086 * To be able to change p->policy safely, the appropriate
4852 /*
4853 * To be able to change p->policy safely, the apropriate
4854 * runqueue lock must be held. 5087 * runqueue lock must be held.
4855 */ 5088 */
4856 rq = __task_rq_lock(p); 5089 rq = task_rq_lock(p, &flags);
4857 5090
4858 /* 5091 /*
4859 * Changing the policy of the stop threads its a very bad idea 5092 * Changing the policy of the stop threads its a very bad idea
4860 */ 5093 */
4861 if (p == rq->stop) { 5094 if (p == rq->stop) {
5095 task_rq_unlock(rq, p, &flags);
5096 return -EINVAL;
5097 }
5098
5099 /*
5100 * If not changing anything there's no need to proceed further:
5101 */
5102 if (unlikely(policy == p->policy && (!rt_policy(policy) ||
5103 param->sched_priority == p->rt_priority))) {
5104
4862 __task_rq_unlock(rq); 5105 __task_rq_unlock(rq);
4863 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 5106 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4864 return -EINVAL; 5107 return 0;
4865 } 5108 }
4866 5109
4867#ifdef CONFIG_RT_GROUP_SCHED 5110#ifdef CONFIG_RT_GROUP_SCHED
@@ -4873,8 +5116,7 @@ recheck:
4873 if (rt_bandwidth_enabled() && rt_policy(policy) && 5116 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4874 task_group(p)->rt_bandwidth.rt_runtime == 0 && 5117 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
4875 !task_group_is_autogroup(task_group(p))) { 5118 !task_group_is_autogroup(task_group(p))) {
4876 __task_rq_unlock(rq); 5119 task_rq_unlock(rq, p, &flags);
4877 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4878 return -EPERM; 5120 return -EPERM;
4879 } 5121 }
4880 } 5122 }
@@ -4883,11 +5125,10 @@ recheck:
4883 /* recheck policy now with rq lock held */ 5125 /* recheck policy now with rq lock held */
4884 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 5126 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4885 policy = oldpolicy = -1; 5127 policy = oldpolicy = -1;
4886 __task_rq_unlock(rq); 5128 task_rq_unlock(rq, p, &flags);
4887 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4888 goto recheck; 5129 goto recheck;
4889 } 5130 }
4890 on_rq = p->se.on_rq; 5131 on_rq = p->on_rq;
4891 running = task_current(rq, p); 5132 running = task_current(rq, p);
4892 if (on_rq) 5133 if (on_rq)
4893 deactivate_task(rq, p, 0); 5134 deactivate_task(rq, p, 0);
@@ -4902,13 +5143,11 @@ recheck:
4902 5143
4903 if (running) 5144 if (running)
4904 p->sched_class->set_curr_task(rq); 5145 p->sched_class->set_curr_task(rq);
4905 if (on_rq) { 5146 if (on_rq)
4906 activate_task(rq, p, 0); 5147 activate_task(rq, p, 0);
4907 5148
4908 check_class_changed(rq, p, prev_class, oldprio, running); 5149 check_class_changed(rq, p, prev_class, oldprio);
4909 } 5150 task_rq_unlock(rq, p, &flags);
4910 __task_rq_unlock(rq);
4911 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4912 5151
4913 rt_mutex_adjust_pi(p); 5152 rt_mutex_adjust_pi(p);
4914 5153
@@ -5088,7 +5327,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
5088 goto out_free_cpus_allowed; 5327 goto out_free_cpus_allowed;
5089 } 5328 }
5090 retval = -EPERM; 5329 retval = -EPERM;
5091 if (!check_same_owner(p) && !capable(CAP_SYS_NICE)) 5330 if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
5092 goto out_unlock; 5331 goto out_unlock;
5093 5332
5094 retval = security_task_setscheduler(p); 5333 retval = security_task_setscheduler(p);
@@ -5159,7 +5398,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
5159{ 5398{
5160 struct task_struct *p; 5399 struct task_struct *p;
5161 unsigned long flags; 5400 unsigned long flags;
5162 struct rq *rq;
5163 int retval; 5401 int retval;
5164 5402
5165 get_online_cpus(); 5403 get_online_cpus();
@@ -5174,9 +5412,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
5174 if (retval) 5412 if (retval)
5175 goto out_unlock; 5413 goto out_unlock;
5176 5414
5177 rq = task_rq_lock(p, &flags); 5415 raw_spin_lock_irqsave(&p->pi_lock, flags);
5178 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 5416 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
5179 task_rq_unlock(rq, &flags); 5417 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
5180 5418
5181out_unlock: 5419out_unlock:
5182 rcu_read_unlock(); 5420 rcu_read_unlock();
@@ -5323,6 +5561,67 @@ void __sched yield(void)
5323} 5561}
5324EXPORT_SYMBOL(yield); 5562EXPORT_SYMBOL(yield);
5325 5563
5564/**
5565 * yield_to - yield the current processor to another thread in
5566 * your thread group, or accelerate that thread toward the
5567 * processor it's on.
5568 * @p: target task
5569 * @preempt: whether task preemption is allowed or not
5570 *
5571 * It's the caller's job to ensure that the target task struct
5572 * can't go away on us before we can do any checks.
5573 *
5574 * Returns true if we indeed boosted the target task.
5575 */
5576bool __sched yield_to(struct task_struct *p, bool preempt)
5577{
5578 struct task_struct *curr = current;
5579 struct rq *rq, *p_rq;
5580 unsigned long flags;
5581 bool yielded = 0;
5582
5583 local_irq_save(flags);
5584 rq = this_rq();
5585
5586again:
5587 p_rq = task_rq(p);
5588 double_rq_lock(rq, p_rq);
5589 while (task_rq(p) != p_rq) {
5590 double_rq_unlock(rq, p_rq);
5591 goto again;
5592 }
5593
5594 if (!curr->sched_class->yield_to_task)
5595 goto out;
5596
5597 if (curr->sched_class != p->sched_class)
5598 goto out;
5599
5600 if (task_running(p_rq, p) || p->state)
5601 goto out;
5602
5603 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
5604 if (yielded) {
5605 schedstat_inc(rq, yld_count);
5606 /*
5607 * Make p's CPU reschedule; pick_next_entity takes care of
5608 * fairness.
5609 */
5610 if (preempt && rq != p_rq)
5611 resched_task(p_rq->curr);
5612 }
5613
5614out:
5615 double_rq_unlock(rq, p_rq);
5616 local_irq_restore(flags);
5617
5618 if (yielded)
5619 schedule();
5620
5621 return yielded;
5622}
5623EXPORT_SYMBOL_GPL(yield_to);
5624
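
The again: loop at the top of yield_to() reads task_rq(p), takes both runqueue locks with double_rq_lock(), and then re-checks task_rq(p), retrying if the task migrated in between; only once both locks are held is p pinned to p_rq. A small userspace sketch of that lock-and-revalidate idiom, with pthread mutexes and hypothetical types (the kernel's double_rq_lock() takes the two locks in a fixed order to avoid ABBA deadlock, mimicked here by address ordering):

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>

struct runqueue { pthread_mutex_t lock; };
struct task { struct runqueue *_Atomic rq; };	/* may change under migration */

static void lock_two(struct runqueue *a, struct runqueue *b)
{
	if (a == b) {
		pthread_mutex_lock(&a->lock);
	} else if ((uintptr_t)a < (uintptr_t)b) {
		pthread_mutex_lock(&a->lock);
		pthread_mutex_lock(&b->lock);
	} else {
		pthread_mutex_lock(&b->lock);
		pthread_mutex_lock(&a->lock);
	}
}

static void unlock_two(struct runqueue *a, struct runqueue *b)
{
	pthread_mutex_unlock(&a->lock);
	if (a != b)
		pthread_mutex_unlock(&b->lock);
}

/* Returns with this_rq and the task's runqueue locked; *the* runqueue is
 * then stable because a migration would need the lock we now hold. */
static struct runqueue *lock_task_rq(struct runqueue *this_rq, struct task *p)
{
	struct runqueue *p_rq;

	for (;;) {
		p_rq = atomic_load(&p->rq);
		lock_two(this_rq, p_rq);
		if (atomic_load(&p->rq) == p_rq)
			return p_rq;
		unlock_two(this_rq, p_rq);	/* task moved meanwhile: retry */
	}
}
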
5326/* 5625/*
5327 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 5626 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
5328 * that process accounting knows that this is a task in IO wait state. 5627 * that process accounting knows that this is a task in IO wait state.
@@ -5333,6 +5632,7 @@ void __sched io_schedule(void)
5333 5632
5334 delayacct_blkio_start(); 5633 delayacct_blkio_start();
5335 atomic_inc(&rq->nr_iowait); 5634 atomic_inc(&rq->nr_iowait);
5635 blk_flush_plug(current);
5336 current->in_iowait = 1; 5636 current->in_iowait = 1;
5337 schedule(); 5637 schedule();
5338 current->in_iowait = 0; 5638 current->in_iowait = 0;
@@ -5348,6 +5648,7 @@ long __sched io_schedule_timeout(long timeout)
5348 5648
5349 delayacct_blkio_start(); 5649 delayacct_blkio_start();
5350 atomic_inc(&rq->nr_iowait); 5650 atomic_inc(&rq->nr_iowait);
5651 blk_flush_plug(current);
5351 current->in_iowait = 1; 5652 current->in_iowait = 1;
5352 ret = schedule_timeout(timeout); 5653 ret = schedule_timeout(timeout);
5353 current->in_iowait = 0; 5654 current->in_iowait = 0;
@@ -5438,7 +5739,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5438 5739
5439 rq = task_rq_lock(p, &flags); 5740 rq = task_rq_lock(p, &flags);
5440 time_slice = p->sched_class->get_rr_interval(rq, p); 5741 time_slice = p->sched_class->get_rr_interval(rq, p);
5441 task_rq_unlock(rq, &flags); 5742 task_rq_unlock(rq, p, &flags);
5442 5743
5443 rcu_read_unlock(); 5744 rcu_read_unlock();
5444 jiffies_to_timespec(time_slice, &t); 5745 jiffies_to_timespec(time_slice, &t);
@@ -5496,7 +5797,7 @@ void show_state_filter(unsigned long state_filter)
5496 do_each_thread(g, p) { 5797 do_each_thread(g, p) {
5497 /* 5798 /*
5498 * reset the NMI-timeout, listing all files on a slow 5799 * reset the NMI-timeout, listing all files on a slow
5499 * console might take alot of time: 5800 * console might take a lot of time:
5500 */ 5801 */
5501 touch_nmi_watchdog(); 5802 touch_nmi_watchdog();
5502 if (!state_filter || (p->state & state_filter)) 5803 if (!state_filter || (p->state & state_filter))
@@ -5556,22 +5857,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5556 rcu_read_unlock(); 5857 rcu_read_unlock();
5557 5858
5558 rq->curr = rq->idle = idle; 5859 rq->curr = rq->idle = idle;
5559#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 5860#if defined(CONFIG_SMP)
5560 idle->oncpu = 1; 5861 idle->on_cpu = 1;
5561#endif 5862#endif
5562 raw_spin_unlock_irqrestore(&rq->lock, flags); 5863 raw_spin_unlock_irqrestore(&rq->lock, flags);
5563 5864
5564 /* Set the preempt count _outside_ the spinlocks! */ 5865 /* Set the preempt count _outside_ the spinlocks! */
5565#if defined(CONFIG_PREEMPT)
5566 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
5567#else
5568 task_thread_info(idle)->preempt_count = 0; 5866 task_thread_info(idle)->preempt_count = 0;
5569#endif 5867
5570 /* 5868 /*
5571 * The idle tasks have their own, simple scheduling class: 5869 * The idle tasks have their own, simple scheduling class:
5572 */ 5870 */
5573 idle->sched_class = &idle_sched_class; 5871 idle->sched_class = &idle_sched_class;
5574 ftrace_graph_init_task(idle); 5872 ftrace_graph_init_idle_task(idle, cpu);
5575} 5873}
5576 5874
5577/* 5875/*
@@ -5661,26 +5959,17 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
5661 unsigned int dest_cpu; 5959 unsigned int dest_cpu;
5662 int ret = 0; 5960 int ret = 0;
5663 5961
5664 /*
5665 * Serialize against TASK_WAKING so that ttwu() and wunt() can
5666 * drop the rq->lock and still rely on ->cpus_allowed.
5667 */
5668again:
5669 while (task_is_waking(p))
5670 cpu_relax();
5671 rq = task_rq_lock(p, &flags); 5962 rq = task_rq_lock(p, &flags);
5672 if (task_is_waking(p)) { 5963
5673 task_rq_unlock(rq, &flags); 5964 if (cpumask_equal(&p->cpus_allowed, new_mask))
5674 goto again; 5965 goto out;
5675 }
5676 5966
5677 if (!cpumask_intersects(new_mask, cpu_active_mask)) { 5967 if (!cpumask_intersects(new_mask, cpu_active_mask)) {
5678 ret = -EINVAL; 5968 ret = -EINVAL;
5679 goto out; 5969 goto out;
5680 } 5970 }
5681 5971
5682 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current && 5972 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
5683 !cpumask_equal(&p->cpus_allowed, new_mask))) {
5684 ret = -EINVAL; 5973 ret = -EINVAL;
5685 goto out; 5974 goto out;
5686 } 5975 }
@@ -5697,16 +5986,16 @@ again:
5697 goto out; 5986 goto out;
5698 5987
5699 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 5988 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
5700 if (migrate_task(p, rq)) { 5989 if (p->on_rq) {
5701 struct migration_arg arg = { p, dest_cpu }; 5990 struct migration_arg arg = { p, dest_cpu };
5702 /* Need help from migration thread: drop lock and wait. */ 5991 /* Need help from migration thread: drop lock and wait. */
5703 task_rq_unlock(rq, &flags); 5992 task_rq_unlock(rq, p, &flags);
5704 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 5993 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
5705 tlb_migrate_finish(p->mm); 5994 tlb_migrate_finish(p->mm);
5706 return 0; 5995 return 0;
5707 } 5996 }
5708out: 5997out:
5709 task_rq_unlock(rq, &flags); 5998 task_rq_unlock(rq, p, &flags);
5710 5999
5711 return ret; 6000 return ret;
5712} 6001}
@@ -5734,6 +6023,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5734 rq_src = cpu_rq(src_cpu); 6023 rq_src = cpu_rq(src_cpu);
5735 rq_dest = cpu_rq(dest_cpu); 6024 rq_dest = cpu_rq(dest_cpu);
5736 6025
6026 raw_spin_lock(&p->pi_lock);
5737 double_rq_lock(rq_src, rq_dest); 6027 double_rq_lock(rq_src, rq_dest);
5738 /* Already moved. */ 6028 /* Already moved. */
5739 if (task_cpu(p) != src_cpu) 6029 if (task_cpu(p) != src_cpu)
@@ -5746,7 +6036,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5746 * If we're not on a rq, the next wake-up will ensure we're 6036 * If we're not on a rq, the next wake-up will ensure we're
5747 * placed properly. 6037 * placed properly.
5748 */ 6038 */
5749 if (p->se.on_rq) { 6039 if (p->on_rq) {
5750 deactivate_task(rq_src, p, 0); 6040 deactivate_task(rq_src, p, 0);
5751 set_task_cpu(p, dest_cpu); 6041 set_task_cpu(p, dest_cpu);
5752 activate_task(rq_dest, p, 0); 6042 activate_task(rq_dest, p, 0);
@@ -5756,6 +6046,7 @@ done:
5756 ret = 1; 6046 ret = 1;
5757fail: 6047fail:
5758 double_rq_unlock(rq_src, rq_dest); 6048 double_rq_unlock(rq_src, rq_dest);
6049 raw_spin_unlock(&p->pi_lock);
5759 return ret; 6050 return ret;
5760} 6051}
5761 6052
@@ -6096,6 +6387,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6096 6387
6097#ifdef CONFIG_HOTPLUG_CPU 6388#ifdef CONFIG_HOTPLUG_CPU
6098 case CPU_DYING: 6389 case CPU_DYING:
6390 sched_ttwu_pending();
6099 /* Update our root-domain */ 6391 /* Update our root-domain */
6100 raw_spin_lock_irqsave(&rq->lock, flags); 6392 raw_spin_lock_irqsave(&rq->lock, flags);
6101 if (rq->rd) { 6393 if (rq->rd) {
@@ -6111,6 +6403,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6111 break; 6403 break;
6112#endif 6404#endif
6113 } 6405 }
6406
6407 update_max_interval();
6408
6114 return NOTIFY_OK; 6409 return NOTIFY_OK;
6115} 6410}
6116 6411
@@ -6171,6 +6466,8 @@ early_initcall(migration_init);
6171 6466
6172#ifdef CONFIG_SMP 6467#ifdef CONFIG_SMP
6173 6468
6469static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
6470
6174#ifdef CONFIG_SCHED_DEBUG 6471#ifdef CONFIG_SCHED_DEBUG
6175 6472
6176static __read_mostly int sched_domain_debug_enabled; 6473static __read_mostly int sched_domain_debug_enabled;
@@ -6245,7 +6542,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6245 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group)); 6542 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
6246 6543
6247 printk(KERN_CONT " %s", str); 6544 printk(KERN_CONT " %s", str);
6248 if (group->cpu_power != SCHED_LOAD_SCALE) { 6545 if (group->cpu_power != SCHED_POWER_SCALE) {
6249 printk(KERN_CONT " (cpu_power = %d)", 6546 printk(KERN_CONT " (cpu_power = %d)",
6250 group->cpu_power); 6547 group->cpu_power);
6251 } 6548 }
@@ -6266,7 +6563,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6266 6563
6267static void sched_domain_debug(struct sched_domain *sd, int cpu) 6564static void sched_domain_debug(struct sched_domain *sd, int cpu)
6268{ 6565{
6269 cpumask_var_t groupmask;
6270 int level = 0; 6566 int level = 0;
6271 6567
6272 if (!sched_domain_debug_enabled) 6568 if (!sched_domain_debug_enabled)
@@ -6279,20 +6575,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6279 6575
6280 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu); 6576 printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
6281 6577
6282 if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
6283 printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
6284 return;
6285 }
6286
6287 for (;;) { 6578 for (;;) {
6288 if (sched_domain_debug_one(sd, cpu, level, groupmask)) 6579 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
6289 break; 6580 break;
6290 level++; 6581 level++;
6291 sd = sd->parent; 6582 sd = sd->parent;
6292 if (!sd) 6583 if (!sd)
6293 break; 6584 break;
6294 } 6585 }
6295 free_cpumask_var(groupmask);
6296} 6586}
6297#else /* !CONFIG_SCHED_DEBUG */ 6587#else /* !CONFIG_SCHED_DEBUG */
6298# define sched_domain_debug(sd, cpu) do { } while (0) 6588# define sched_domain_debug(sd, cpu) do { } while (0)
@@ -6349,12 +6639,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6349 return 1; 6639 return 1;
6350} 6640}
6351 6641
6352static void free_rootdomain(struct root_domain *rd) 6642static void free_rootdomain(struct rcu_head *rcu)
6353{ 6643{
6354 synchronize_sched(); 6644 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
6355 6645
6356 cpupri_cleanup(&rd->cpupri); 6646 cpupri_cleanup(&rd->cpupri);
6357
6358 free_cpumask_var(rd->rto_mask); 6647 free_cpumask_var(rd->rto_mask);
6359 free_cpumask_var(rd->online); 6648 free_cpumask_var(rd->online);
6360 free_cpumask_var(rd->span); 6649 free_cpumask_var(rd->span);
@@ -6395,7 +6684,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6395 raw_spin_unlock_irqrestore(&rq->lock, flags); 6684 raw_spin_unlock_irqrestore(&rq->lock, flags);
6396 6685
6397 if (old_rd) 6686 if (old_rd)
6398 free_rootdomain(old_rd); 6687 call_rcu_sched(&old_rd->rcu, free_rootdomain);
6399} 6688}
6400 6689
6401static int init_rootdomain(struct root_domain *rd) 6690static int init_rootdomain(struct root_domain *rd)
@@ -6446,6 +6735,25 @@ static struct root_domain *alloc_rootdomain(void)
6446 return rd; 6735 return rd;
6447} 6736}
6448 6737
6738static void free_sched_domain(struct rcu_head *rcu)
6739{
6740 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
6741 if (atomic_dec_and_test(&sd->groups->ref))
6742 kfree(sd->groups);
6743 kfree(sd);
6744}
6745
6746static void destroy_sched_domain(struct sched_domain *sd, int cpu)
6747{
6748 call_rcu(&sd->rcu, free_sched_domain);
6749}
6750
6751static void destroy_sched_domains(struct sched_domain *sd, int cpu)
6752{
6753 for (; sd; sd = sd->parent)
6754 destroy_sched_domain(sd, cpu);
6755}
6756
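
free_rootdomain() is now an RCU callback (queued with call_rcu_sched() in rq_attach_root()) and the new free_sched_domain() follows the same pattern: an rcu_head is embedded in the object, and the callback recovers the enclosing structure with container_of() once the grace period has elapsed, which is why the old synchronize_sched() call could be dropped. A stripped-down plain-C illustration of that embed-and-recover pattern (the *_like types are stand-ins, not kernel definitions):

#include <stddef.h>
#include <stdlib.h>

struct rcu_head_like {
	void (*func)(struct rcu_head_like *head);
};

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct root_domain_like {
	int refcount;
	struct rcu_head_like rcu;	/* embedded callback node */
};

static void free_rootdomain_cb(struct rcu_head_like *rcu)
{
	struct root_domain_like *rd =
		container_of(rcu, struct root_domain_like, rcu);

	free(rd);	/* runs only after all pre-existing readers are done */
}

In the kernel, call_rcu()/call_rcu_sched() queue the callback to run after a grace period, so lockless readers that still hold a pointer to the old root_domain or sched_domain finish before the memory is returned.
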
6449/* 6757/*
6450 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 6758 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
6451 * hold the hotplug lock. 6759 * hold the hotplug lock.
@@ -6456,9 +6764,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6456 struct rq *rq = cpu_rq(cpu); 6764 struct rq *rq = cpu_rq(cpu);
6457 struct sched_domain *tmp; 6765 struct sched_domain *tmp;
6458 6766
6459 for (tmp = sd; tmp; tmp = tmp->parent)
6460 tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
6461
6462 /* Remove the sched domains which do not contribute to scheduling. */ 6767 /* Remove the sched domains which do not contribute to scheduling. */
6463 for (tmp = sd; tmp; ) { 6768 for (tmp = sd; tmp; ) {
6464 struct sched_domain *parent = tmp->parent; 6769 struct sched_domain *parent = tmp->parent;
@@ -6469,12 +6774,15 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6469 tmp->parent = parent->parent; 6774 tmp->parent = parent->parent;
6470 if (parent->parent) 6775 if (parent->parent)
6471 parent->parent->child = tmp; 6776 parent->parent->child = tmp;
6777 destroy_sched_domain(parent, cpu);
6472 } else 6778 } else
6473 tmp = tmp->parent; 6779 tmp = tmp->parent;
6474 } 6780 }
6475 6781
6476 if (sd && sd_degenerate(sd)) { 6782 if (sd && sd_degenerate(sd)) {
6783 tmp = sd;
6477 sd = sd->parent; 6784 sd = sd->parent;
6785 destroy_sched_domain(tmp, cpu);
6478 if (sd) 6786 if (sd)
6479 sd->child = NULL; 6787 sd->child = NULL;
6480 } 6788 }
@@ -6482,7 +6790,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6482 sched_domain_debug(sd, cpu); 6790 sched_domain_debug(sd, cpu);
6483 6791
6484 rq_attach_root(rq, rd); 6792 rq_attach_root(rq, rd);
6793 tmp = rq->sd;
6485 rcu_assign_pointer(rq->sd, sd); 6794 rcu_assign_pointer(rq->sd, sd);
6795 destroy_sched_domains(tmp, cpu);
6486} 6796}
6487 6797
6488/* cpus with isolated domains */ 6798/* cpus with isolated domains */
@@ -6498,56 +6808,6 @@ static int __init isolated_cpu_setup(char *str)
6498 6808
6499__setup("isolcpus=", isolated_cpu_setup); 6809__setup("isolcpus=", isolated_cpu_setup);
6500 6810
6501/*
6502 * init_sched_build_groups takes the cpumask we wish to span, and a pointer
6503 * to a function which identifies what group(along with sched group) a CPU
6504 * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
6505 * (due to the fact that we keep track of groups covered with a struct cpumask).
6506 *
6507 * init_sched_build_groups will build a circular linked list of the groups
6508 * covered by the given span, and will set each group's ->cpumask correctly,
6509 * and ->cpu_power to 0.
6510 */
6511static void
6512init_sched_build_groups(const struct cpumask *span,
6513 const struct cpumask *cpu_map,
6514 int (*group_fn)(int cpu, const struct cpumask *cpu_map,
6515 struct sched_group **sg,
6516 struct cpumask *tmpmask),
6517 struct cpumask *covered, struct cpumask *tmpmask)
6518{
6519 struct sched_group *first = NULL, *last = NULL;
6520 int i;
6521
6522 cpumask_clear(covered);
6523
6524 for_each_cpu(i, span) {
6525 struct sched_group *sg;
6526 int group = group_fn(i, cpu_map, &sg, tmpmask);
6527 int j;
6528
6529 if (cpumask_test_cpu(i, covered))
6530 continue;
6531
6532 cpumask_clear(sched_group_cpus(sg));
6533 sg->cpu_power = 0;
6534
6535 for_each_cpu(j, span) {
6536 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6537 continue;
6538
6539 cpumask_set_cpu(j, covered);
6540 cpumask_set_cpu(j, sched_group_cpus(sg));
6541 }
6542 if (!first)
6543 first = sg;
6544 if (last)
6545 last->next = sg;
6546 last = sg;
6547 }
6548 last->next = first;
6549}
6550
6551#define SD_NODES_PER_DOMAIN 16 6811#define SD_NODES_PER_DOMAIN 16
6552 6812
6553#ifdef CONFIG_NUMA 6813#ifdef CONFIG_NUMA
@@ -6564,7 +6824,7 @@ init_sched_build_groups(const struct cpumask *span,
6564 */ 6824 */
6565static int find_next_best_node(int node, nodemask_t *used_nodes) 6825static int find_next_best_node(int node, nodemask_t *used_nodes)
6566{ 6826{
6567 int i, n, val, min_val, best_node = 0; 6827 int i, n, val, min_val, best_node = -1;
6568 6828
6569 min_val = INT_MAX; 6829 min_val = INT_MAX;
6570 6830
@@ -6588,7 +6848,8 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
6588 } 6848 }
6589 } 6849 }
6590 6850
6591 node_set(best_node, *used_nodes); 6851 if (best_node != -1)
6852 node_set(best_node, *used_nodes);
6592 return best_node; 6853 return best_node;
6593} 6854}
6594 6855
@@ -6614,315 +6875,130 @@ static void sched_domain_node_span(int node, struct cpumask *span)
6614 6875
6615 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 6876 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
6616 int next_node = find_next_best_node(node, &used_nodes); 6877 int next_node = find_next_best_node(node, &used_nodes);
6617 6878 if (next_node < 0)
6879 break;
6618 cpumask_or(span, span, cpumask_of_node(next_node)); 6880 cpumask_or(span, span, cpumask_of_node(next_node));
6619 } 6881 }
6620} 6882}
6883
6884static const struct cpumask *cpu_node_mask(int cpu)
6885{
6886 lockdep_assert_held(&sched_domains_mutex);
6887
6888 sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
6889
6890 return sched_domains_tmpmask;
6891}
6892
6893static const struct cpumask *cpu_allnodes_mask(int cpu)
6894{
6895 return cpu_possible_mask;
6896}
6621#endif /* CONFIG_NUMA */ 6897#endif /* CONFIG_NUMA */
6622 6898
6623int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 6899static const struct cpumask *cpu_cpu_mask(int cpu)
6900{
6901 return cpumask_of_node(cpu_to_node(cpu));
6902}
6624 6903
6625/* 6904int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6626 * The cpus mask in sched_group and sched_domain hangs off the end.
6627 *
6628 * ( See the the comments in include/linux/sched.h:struct sched_group
6629 * and struct sched_domain. )
6630 */
6631struct static_sched_group {
6632 struct sched_group sg;
6633 DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
6634};
6635 6905
6636struct static_sched_domain { 6906struct sd_data {
6637 struct sched_domain sd; 6907 struct sched_domain **__percpu sd;
6638 DECLARE_BITMAP(span, CONFIG_NR_CPUS); 6908 struct sched_group **__percpu sg;
6639}; 6909};
6640 6910
6641struct s_data { 6911struct s_data {
6642#ifdef CONFIG_NUMA 6912 struct sched_domain ** __percpu sd;
6643 int sd_allnodes;
6644 cpumask_var_t domainspan;
6645 cpumask_var_t covered;
6646 cpumask_var_t notcovered;
6647#endif
6648 cpumask_var_t nodemask;
6649 cpumask_var_t this_sibling_map;
6650 cpumask_var_t this_core_map;
6651 cpumask_var_t this_book_map;
6652 cpumask_var_t send_covered;
6653 cpumask_var_t tmpmask;
6654 struct sched_group **sched_group_nodes;
6655 struct root_domain *rd; 6913 struct root_domain *rd;
6656}; 6914};
6657 6915
6658enum s_alloc { 6916enum s_alloc {
6659 sa_sched_groups = 0,
6660 sa_rootdomain, 6917 sa_rootdomain,
6661 sa_tmpmask, 6918 sa_sd,
6662 sa_send_covered, 6919 sa_sd_storage,
6663 sa_this_book_map,
6664 sa_this_core_map,
6665 sa_this_sibling_map,
6666 sa_nodemask,
6667 sa_sched_group_nodes,
6668#ifdef CONFIG_NUMA
6669 sa_notcovered,
6670 sa_covered,
6671 sa_domainspan,
6672#endif
6673 sa_none, 6920 sa_none,
6674}; 6921};
6675 6922
6676/* 6923struct sched_domain_topology_level;
6677 * SMT sched-domains:
6678 */
6679#ifdef CONFIG_SCHED_SMT
6680static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
6681static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
6682 6924
6683static int 6925typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
6684cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map, 6926typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
6685 struct sched_group **sg, struct cpumask *unused)
6686{
6687 if (sg)
6688 *sg = &per_cpu(sched_groups, cpu).sg;
6689 return cpu;
6690}
6691#endif /* CONFIG_SCHED_SMT */
6692 6927
6693/* 6928struct sched_domain_topology_level {
6694 * multi-core sched-domains: 6929 sched_domain_init_f init;
6695 */ 6930 sched_domain_mask_f mask;
6696#ifdef CONFIG_SCHED_MC 6931 struct sd_data data;
6697static DEFINE_PER_CPU(struct static_sched_domain, core_domains); 6932};
6698static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
6699
6700static int
6701cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
6702 struct sched_group **sg, struct cpumask *mask)
6703{
6704 int group;
6705#ifdef CONFIG_SCHED_SMT
6706 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6707 group = cpumask_first(mask);
6708#else
6709 group = cpu;
6710#endif
6711 if (sg)
6712 *sg = &per_cpu(sched_group_core, group).sg;
6713 return group;
6714}
6715#endif /* CONFIG_SCHED_MC */
6716 6933
6717/* 6934/*
6718 * book sched-domains: 6935 * Assumes the sched_domain tree is fully constructed
6719 */ 6936 */
6720#ifdef CONFIG_SCHED_BOOK 6937static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
6721static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
6722static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
6723
6724static int
6725cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
6726 struct sched_group **sg, struct cpumask *mask)
6727{ 6938{
6728 int group = cpu; 6939 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
6729#ifdef CONFIG_SCHED_MC 6940 struct sched_domain *child = sd->child;
6730 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6731 group = cpumask_first(mask);
6732#elif defined(CONFIG_SCHED_SMT)
6733 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6734 group = cpumask_first(mask);
6735#endif
6736 if (sg)
6737 *sg = &per_cpu(sched_group_book, group).sg;
6738 return group;
6739}
6740#endif /* CONFIG_SCHED_BOOK */
6741 6941
6742static DEFINE_PER_CPU(struct static_sched_domain, phys_domains); 6942 if (child)
6743static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys); 6943 cpu = cpumask_first(sched_domain_span(child));
6744 6944
6745static int
6746cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
6747 struct sched_group **sg, struct cpumask *mask)
6748{
6749 int group;
6750#ifdef CONFIG_SCHED_BOOK
6751 cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
6752 group = cpumask_first(mask);
6753#elif defined(CONFIG_SCHED_MC)
6754 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
6755 group = cpumask_first(mask);
6756#elif defined(CONFIG_SCHED_SMT)
6757 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
6758 group = cpumask_first(mask);
6759#else
6760 group = cpu;
6761#endif
6762 if (sg) 6945 if (sg)
6763 *sg = &per_cpu(sched_group_phys, group).sg; 6946 *sg = *per_cpu_ptr(sdd->sg, cpu);
6764 return group; 6947
6948 return cpu;
6765} 6949}
6766 6950
6767#ifdef CONFIG_NUMA
6768/* 6951/*
6769 * The init_sched_build_groups can't handle what we want to do with node 6952 * build_sched_groups takes the cpumask we wish to span, and a pointer
6770 * groups, so roll our own. Now each node has its own list of groups which 6953 * to a function which identifies what group(along with sched group) a CPU
6771 * gets dynamically allocated. 6954 * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
6955 * (due to the fact that we keep track of groups covered with a struct cpumask).
6956 *
6957 * build_sched_groups will build a circular linked list of the groups
6958 * covered by the given span, and will set each group's ->cpumask correctly,
6959 * and ->cpu_power to 0.
6772 */ 6960 */
6773static DEFINE_PER_CPU(struct static_sched_domain, node_domains); 6961static void
6774static struct sched_group ***sched_group_nodes_bycpu; 6962build_sched_groups(struct sched_domain *sd)
6775
6776static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
6777static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
6778
6779static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
6780 struct sched_group **sg,
6781 struct cpumask *nodemask)
6782{
6783 int group;
6784
6785 cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
6786 group = cpumask_first(nodemask);
6787
6788 if (sg)
6789 *sg = &per_cpu(sched_group_allnodes, group).sg;
6790 return group;
6791}
6792
6793static void init_numa_sched_groups_power(struct sched_group *group_head)
6794{
6795 struct sched_group *sg = group_head;
6796 int j;
6797
6798 if (!sg)
6799 return;
6800 do {
6801 for_each_cpu(j, sched_group_cpus(sg)) {
6802 struct sched_domain *sd;
6803
6804 sd = &per_cpu(phys_domains, j).sd;
6805 if (j != group_first_cpu(sd->groups)) {
6806 /*
6807 * Only add "power" once for each
6808 * physical package.
6809 */
6810 continue;
6811 }
6812
6813 sg->cpu_power += sd->groups->cpu_power;
6814 }
6815 sg = sg->next;
6816 } while (sg != group_head);
6817}
6818
6819static int build_numa_sched_groups(struct s_data *d,
6820 const struct cpumask *cpu_map, int num)
6821{ 6963{
6822 struct sched_domain *sd; 6964 struct sched_group *first = NULL, *last = NULL;
6823 struct sched_group *sg, *prev; 6965 struct sd_data *sdd = sd->private;
6824 int n, j; 6966 const struct cpumask *span = sched_domain_span(sd);
6825 6967 struct cpumask *covered;
6826 cpumask_clear(d->covered); 6968 int i;
6827 cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
6828 if (cpumask_empty(d->nodemask)) {
6829 d->sched_group_nodes[num] = NULL;
6830 goto out;
6831 }
6832
6833 sched_domain_node_span(num, d->domainspan);
6834 cpumask_and(d->domainspan, d->domainspan, cpu_map);
6835
6836 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
6837 GFP_KERNEL, num);
6838 if (!sg) {
6839 printk(KERN_WARNING "Can not alloc domain group for node %d\n",
6840 num);
6841 return -ENOMEM;
6842 }
6843 d->sched_group_nodes[num] = sg;
6844
6845 for_each_cpu(j, d->nodemask) {
6846 sd = &per_cpu(node_domains, j).sd;
6847 sd->groups = sg;
6848 }
6849
6850 sg->cpu_power = 0;
6851 cpumask_copy(sched_group_cpus(sg), d->nodemask);
6852 sg->next = sg;
6853 cpumask_or(d->covered, d->covered, d->nodemask);
6854 6969
6855 prev = sg; 6970 lockdep_assert_held(&sched_domains_mutex);
6856 for (j = 0; j < nr_node_ids; j++) { 6971 covered = sched_domains_tmpmask;
6857 n = (num + j) % nr_node_ids;
6858 cpumask_complement(d->notcovered, d->covered);
6859 cpumask_and(d->tmpmask, d->notcovered, cpu_map);
6860 cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
6861 if (cpumask_empty(d->tmpmask))
6862 break;
6863 cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
6864 if (cpumask_empty(d->tmpmask))
6865 continue;
6866 sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
6867 GFP_KERNEL, num);
6868 if (!sg) {
6869 printk(KERN_WARNING
6870 "Can not alloc domain group for node %d\n", j);
6871 return -ENOMEM;
6872 }
6873 sg->cpu_power = 0;
6874 cpumask_copy(sched_group_cpus(sg), d->tmpmask);
6875 sg->next = prev->next;
6876 cpumask_or(d->covered, d->covered, d->tmpmask);
6877 prev->next = sg;
6878 prev = sg;
6879 }
6880out:
6881 return 0;
6882}
6883#endif /* CONFIG_NUMA */
6884 6972
6885#ifdef CONFIG_NUMA 6973 cpumask_clear(covered);
6886/* Free memory allocated for various sched_group structures */
6887static void free_sched_groups(const struct cpumask *cpu_map,
6888 struct cpumask *nodemask)
6889{
6890 int cpu, i;
6891 6974
6892 for_each_cpu(cpu, cpu_map) { 6975 for_each_cpu(i, span) {
6893 struct sched_group **sched_group_nodes 6976 struct sched_group *sg;
6894 = sched_group_nodes_bycpu[cpu]; 6977 int group = get_group(i, sdd, &sg);
6978 int j;
6895 6979
6896 if (!sched_group_nodes) 6980 if (cpumask_test_cpu(i, covered))
6897 continue; 6981 continue;
6898 6982
6899 for (i = 0; i < nr_node_ids; i++) { 6983 cpumask_clear(sched_group_cpus(sg));
6900 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 6984 sg->cpu_power = 0;
6901 6985
6902 cpumask_and(nodemask, cpumask_of_node(i), cpu_map); 6986 for_each_cpu(j, span) {
6903 if (cpumask_empty(nodemask)) 6987 if (get_group(j, sdd, NULL) != group)
6904 continue; 6988 continue;
6905 6989
6906 if (sg == NULL) 6990 cpumask_set_cpu(j, covered);
6907 continue; 6991 cpumask_set_cpu(j, sched_group_cpus(sg));
6908 sg = sg->next;
6909next_sg:
6910 oldsg = sg;
6911 sg = sg->next;
6912 kfree(oldsg);
6913 if (oldsg != sched_group_nodes[i])
6914 goto next_sg;
6915 } 6992 }
6916 kfree(sched_group_nodes); 6993
6917 sched_group_nodes_bycpu[cpu] = NULL; 6994 if (!first)
6995 first = sg;
6996 if (last)
6997 last->next = sg;
6998 last = sg;
6918 } 6999 }
7000 last->next = first;
6919} 7001}
6920#else /* !CONFIG_NUMA */
6921static void free_sched_groups(const struct cpumask *cpu_map,
6922 struct cpumask *nodemask)
6923{
6924}
6925#endif /* CONFIG_NUMA */
6926 7002
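
The new build_sched_groups() keeps the old convention: the groups covering a domain's span are linked into a circular singly-linked list via first/last pointers and closed with last->next = first, so callers can keep iterating with the usual do/while over sg->next. A tiny self-contained version of just that list construction, with hypothetical types:

#include <stdio.h>

struct group {
	int id;
	struct group *next;
};

static struct group *link_groups(struct group *pool, int n)
{
	struct group *first = NULL, *last = NULL;
	int i;

	for (i = 0; i < n; i++) {
		struct group *sg = &pool[i];

		sg->id = i;
		if (!first)
			first = sg;
		if (last)
			last->next = sg;
		last = sg;
	}
	last->next = first;		/* close the ring, as the kernel code does */
	return first;
}

int main(void)
{
	struct group pool[3];
	struct group *sg, *first = link_groups(pool, 3);

	sg = first;
	do {				/* the usual traversal over sg->next */
		printf("group %d\n", sg->id);
		sg = sg->next;
	} while (sg != first);
	return 0;
}
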
6927/* 7003/*
6928 * Initialize sched groups cpu_power. 7004 * Initialize sched groups cpu_power.
@@ -6936,11 +7012,6 @@ static void free_sched_groups(const struct cpumask *cpu_map,
6936 */ 7012 */
6937static void init_sched_groups_power(int cpu, struct sched_domain *sd) 7013static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6938{ 7014{
6939 struct sched_domain *child;
6940 struct sched_group *group;
6941 long power;
6942 int weight;
6943
6944 WARN_ON(!sd || !sd->groups); 7015 WARN_ON(!sd || !sd->groups);
6945 7016
6946 if (cpu != group_first_cpu(sd->groups)) 7017 if (cpu != group_first_cpu(sd->groups))
@@ -6948,36 +7019,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6948 7019
6949 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups)); 7020 sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
6950 7021
6951 child = sd->child; 7022 update_group_power(sd, cpu);
6952
6953 sd->groups->cpu_power = 0;
6954
6955 if (!child) {
6956 power = SCHED_LOAD_SCALE;
6957 weight = cpumask_weight(sched_domain_span(sd));
6958 /*
6959 * SMT siblings share the power of a single core.
6960 * Usually multiple threads get a better yield out of
6961 * that one core than a single thread would have,
6962 * reflect that in sd->smt_gain.
6963 */
6964 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
6965 power *= sd->smt_gain;
6966 power /= weight;
6967 power >>= SCHED_LOAD_SHIFT;
6968 }
6969 sd->groups->cpu_power += power;
6970 return;
6971 }
6972
6973 /*
6974 * Add cpu_power of each child group to this groups cpu_power.
6975 */
6976 group = child->groups;
6977 do {
6978 sd->groups->cpu_power += group->cpu_power;
6979 group = group->next;
6980 } while (group != child->groups);
6981} 7023}
6982 7024
6983/* 7025/*
@@ -6991,15 +7033,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6991# define SD_INIT_NAME(sd, type) do { } while (0) 7033# define SD_INIT_NAME(sd, type) do { } while (0)
6992#endif 7034#endif
6993 7035
6994#define SD_INIT(sd, type) sd_init_##type(sd) 7036#define SD_INIT_FUNC(type) \
6995 7037static noinline struct sched_domain * \
6996#define SD_INIT_FUNC(type) \ 7038sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
6997static noinline void sd_init_##type(struct sched_domain *sd) \ 7039{ \
6998{ \ 7040 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
6999 memset(sd, 0, sizeof(*sd)); \ 7041 *sd = SD_##type##_INIT; \
7000 *sd = SD_##type##_INIT; \ 7042 SD_INIT_NAME(sd, type); \
7001 sd->level = SD_LV_##type; \ 7043 sd->private = &tl->data; \
7002 SD_INIT_NAME(sd, type); \ 7044 return sd; \
7003} 7045}
7004 7046
7005SD_INIT_FUNC(CPU) 7047SD_INIT_FUNC(CPU)
@@ -7018,13 +7060,14 @@ SD_INIT_FUNC(CPU)
7018#endif 7060#endif
7019 7061
7020static int default_relax_domain_level = -1; 7062static int default_relax_domain_level = -1;
7063int sched_domain_level_max;
7021 7064
7022static int __init setup_relax_domain_level(char *str) 7065static int __init setup_relax_domain_level(char *str)
7023{ 7066{
7024 unsigned long val; 7067 unsigned long val;
7025 7068
7026 val = simple_strtoul(str, NULL, 0); 7069 val = simple_strtoul(str, NULL, 0);
7027 if (val < SD_LV_MAX) 7070 if (val < sched_domain_level_max)
7028 default_relax_domain_level = val; 7071 default_relax_domain_level = val;
7029 7072
7030 return 1; 7073 return 1;
@@ -7052,37 +7095,20 @@ static void set_domain_attribute(struct sched_domain *sd,
7052 } 7095 }
7053} 7096}
7054 7097
7098static void __sdt_free(const struct cpumask *cpu_map);
7099static int __sdt_alloc(const struct cpumask *cpu_map);
7100
7055static void __free_domain_allocs(struct s_data *d, enum s_alloc what, 7101static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7056 const struct cpumask *cpu_map) 7102 const struct cpumask *cpu_map)
7057{ 7103{
7058 switch (what) { 7104 switch (what) {
7059 case sa_sched_groups:
7060 free_sched_groups(cpu_map, d->tmpmask); /* fall through */
7061 d->sched_group_nodes = NULL;
7062 case sa_rootdomain: 7105 case sa_rootdomain:
7063 free_rootdomain(d->rd); /* fall through */ 7106 if (!atomic_read(&d->rd->refcount))
7064 case sa_tmpmask: 7107 free_rootdomain(&d->rd->rcu); /* fall through */
7065 free_cpumask_var(d->tmpmask); /* fall through */ 7108 case sa_sd:
7066 case sa_send_covered: 7109 free_percpu(d->sd); /* fall through */
7067 free_cpumask_var(d->send_covered); /* fall through */ 7110 case sa_sd_storage:
7068 case sa_this_book_map: 7111 __sdt_free(cpu_map); /* fall through */
7069 free_cpumask_var(d->this_book_map); /* fall through */
7070 case sa_this_core_map:
7071 free_cpumask_var(d->this_core_map); /* fall through */
7072 case sa_this_sibling_map:
7073 free_cpumask_var(d->this_sibling_map); /* fall through */
7074 case sa_nodemask:
7075 free_cpumask_var(d->nodemask); /* fall through */
7076 case sa_sched_group_nodes:
7077#ifdef CONFIG_NUMA
7078 kfree(d->sched_group_nodes); /* fall through */
7079 case sa_notcovered:
7080 free_cpumask_var(d->notcovered); /* fall through */
7081 case sa_covered:
7082 free_cpumask_var(d->covered); /* fall through */
7083 case sa_domainspan:
7084 free_cpumask_var(d->domainspan); /* fall through */
7085#endif
7086 case sa_none: 7112 case sa_none:
7087 break; 7113 break;
7088 } 7114 }
@@ -7091,308 +7117,212 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
7091static enum s_alloc __visit_domain_allocation_hell(struct s_data *d, 7117static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
7092 const struct cpumask *cpu_map) 7118 const struct cpumask *cpu_map)
7093{ 7119{
7094#ifdef CONFIG_NUMA 7120 memset(d, 0, sizeof(*d));
7095 if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL)) 7121
7096 return sa_none; 7122 if (__sdt_alloc(cpu_map))
7097 if (!alloc_cpumask_var(&d->covered, GFP_KERNEL)) 7123 return sa_sd_storage;
7098 return sa_domainspan; 7124 d->sd = alloc_percpu(struct sched_domain *);
7099 if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL)) 7125 if (!d->sd)
7100 return sa_covered; 7126 return sa_sd_storage;
7101 /* Allocate the per-node list of sched groups */
7102 d->sched_group_nodes = kcalloc(nr_node_ids,
7103 sizeof(struct sched_group *), GFP_KERNEL);
7104 if (!d->sched_group_nodes) {
7105 printk(KERN_WARNING "Can not alloc sched group node list\n");
7106 return sa_notcovered;
7107 }
7108 sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
7109#endif
7110 if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
7111 return sa_sched_group_nodes;
7112 if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
7113 return sa_nodemask;
7114 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
7115 return sa_this_sibling_map;
7116 if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
7117 return sa_this_core_map;
7118 if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
7119 return sa_this_book_map;
7120 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
7121 return sa_send_covered;
7122 d->rd = alloc_rootdomain(); 7127 d->rd = alloc_rootdomain();
7123 if (!d->rd) { 7128 if (!d->rd)
7124 printk(KERN_WARNING "Cannot alloc root domain\n"); 7129 return sa_sd;
7125 return sa_tmpmask;
7126 }
7127 return sa_rootdomain; 7130 return sa_rootdomain;
7128} 7131}
7129 7132
7130static struct sched_domain *__build_numa_sched_domains(struct s_data *d, 7133/*
7131 const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i) 7134 * NULL the sd_data elements we've used to build the sched_domain and
7135 * sched_group structure so that the subsequent __free_domain_allocs()
7136 * will not free the data we're using.
7137 */
7138static void claim_allocations(int cpu, struct sched_domain *sd)
7132{ 7139{
7133 struct sched_domain *sd = NULL; 7140 struct sd_data *sdd = sd->private;
7134#ifdef CONFIG_NUMA 7141 struct sched_group *sg = sd->groups;
7135 struct sched_domain *parent;
7136
7137 d->sd_allnodes = 0;
7138 if (cpumask_weight(cpu_map) >
7139 SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
7140 sd = &per_cpu(allnodes_domains, i).sd;
7141 SD_INIT(sd, ALLNODES);
7142 set_domain_attribute(sd, attr);
7143 cpumask_copy(sched_domain_span(sd), cpu_map);
7144 cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
7145 d->sd_allnodes = 1;
7146 }
7147 parent = sd;
7148
7149 sd = &per_cpu(node_domains, i).sd;
7150 SD_INIT(sd, NODE);
7151 set_domain_attribute(sd, attr);
7152 sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
7153 sd->parent = parent;
7154 if (parent)
7155 parent->child = sd;
7156 cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
7157#endif
7158 return sd;
7159}
7160 7142
7161static struct sched_domain *__build_cpu_sched_domain(struct s_data *d, 7143 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
7162 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7144 *per_cpu_ptr(sdd->sd, cpu) = NULL;
7163 struct sched_domain *parent, int i)
7164{
7165 struct sched_domain *sd;
7166 sd = &per_cpu(phys_domains, i).sd;
7167 SD_INIT(sd, CPU);
7168 set_domain_attribute(sd, attr);
7169 cpumask_copy(sched_domain_span(sd), d->nodemask);
7170 sd->parent = parent;
7171 if (parent)
7172 parent->child = sd;
7173 cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
7174 return sd;
7175}
7176 7145
7177static struct sched_domain *__build_book_sched_domain(struct s_data *d, 7146 if (cpu == cpumask_first(sched_group_cpus(sg))) {
7178 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7147 WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
7179 struct sched_domain *parent, int i) 7148 *per_cpu_ptr(sdd->sg, cpu) = NULL;
7180{ 7149 }
7181 struct sched_domain *sd = parent;
7182#ifdef CONFIG_SCHED_BOOK
7183 sd = &per_cpu(book_domains, i).sd;
7184 SD_INIT(sd, BOOK);
7185 set_domain_attribute(sd, attr);
7186 cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
7187 sd->parent = parent;
7188 parent->child = sd;
7189 cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
7190#endif
7191 return sd;
7192} 7150}
7193 7151
7194static struct sched_domain *__build_mc_sched_domain(struct s_data *d, 7152#ifdef CONFIG_SCHED_SMT
7195 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 7153static const struct cpumask *cpu_smt_mask(int cpu)
7196 struct sched_domain *parent, int i)
7197{ 7154{
7198 struct sched_domain *sd = parent; 7155 return topology_thread_cpumask(cpu);
7199#ifdef CONFIG_SCHED_MC
7200 sd = &per_cpu(core_domains, i).sd;
7201 SD_INIT(sd, MC);
7202 set_domain_attribute(sd, attr);
7203 cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
7204 sd->parent = parent;
7205 parent->child = sd;
7206 cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
7207#endif
7208 return sd;
7209} 7156}
7210
7211static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
7212 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
7213 struct sched_domain *parent, int i)
7214{
7215 struct sched_domain *sd = parent;
7216#ifdef CONFIG_SCHED_SMT
7217 sd = &per_cpu(cpu_domains, i).sd;
7218 SD_INIT(sd, SIBLING);
7219 set_domain_attribute(sd, attr);
7220 cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
7221 sd->parent = parent;
7222 parent->child = sd;
7223 cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
7224#endif 7157#endif
7225 return sd;
7226}
7227 7158
7228static void build_sched_groups(struct s_data *d, enum sched_domain_level l, 7159/*
7229 const struct cpumask *cpu_map, int cpu) 7160 * Topology list, bottom-up.
7230{ 7161 */
7231 switch (l) { 7162static struct sched_domain_topology_level default_topology[] = {
7232#ifdef CONFIG_SCHED_SMT 7163#ifdef CONFIG_SCHED_SMT
7233 case SD_LV_SIBLING: /* set up CPU (sibling) groups */ 7164 { sd_init_SIBLING, cpu_smt_mask, },
7234 cpumask_and(d->this_sibling_map, cpu_map,
7235 topology_thread_cpumask(cpu));
7236 if (cpu == cpumask_first(d->this_sibling_map))
7237 init_sched_build_groups(d->this_sibling_map, cpu_map,
7238 &cpu_to_cpu_group,
7239 d->send_covered, d->tmpmask);
7240 break;
7241#endif 7165#endif
7242#ifdef CONFIG_SCHED_MC 7166#ifdef CONFIG_SCHED_MC
7243 case SD_LV_MC: /* set up multi-core groups */ 7167 { sd_init_MC, cpu_coregroup_mask, },
7244 cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
7245 if (cpu == cpumask_first(d->this_core_map))
7246 init_sched_build_groups(d->this_core_map, cpu_map,
7247 &cpu_to_core_group,
7248 d->send_covered, d->tmpmask);
7249 break;
7250#endif 7168#endif
7251#ifdef CONFIG_SCHED_BOOK 7169#ifdef CONFIG_SCHED_BOOK
7252 case SD_LV_BOOK: /* set up book groups */ 7170 { sd_init_BOOK, cpu_book_mask, },
7253 cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
7254 if (cpu == cpumask_first(d->this_book_map))
7255 init_sched_build_groups(d->this_book_map, cpu_map,
7256 &cpu_to_book_group,
7257 d->send_covered, d->tmpmask);
7258 break;
7259#endif 7171#endif
7260 case SD_LV_CPU: /* set up physical groups */ 7172 { sd_init_CPU, cpu_cpu_mask, },
7261 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
7262 if (!cpumask_empty(d->nodemask))
7263 init_sched_build_groups(d->nodemask, cpu_map,
7264 &cpu_to_phys_group,
7265 d->send_covered, d->tmpmask);
7266 break;
7267#ifdef CONFIG_NUMA 7173#ifdef CONFIG_NUMA
7268 case SD_LV_ALLNODES: 7174 { sd_init_NODE, cpu_node_mask, },
7269 init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group, 7175 { sd_init_ALLNODES, cpu_allnodes_mask, },
7270 d->send_covered, d->tmpmask);
7271 break;
7272#endif 7176#endif
7273 default: 7177 { NULL, },
7274 break; 7178};
7179
7180static struct sched_domain_topology_level *sched_domain_topology = default_topology;
7181
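
default_topology[] and the per-level init/mask callbacks are the heart of this rework: instead of one hand-written #ifdef block per domain level, build_sched_domains() walks the { NULL, }-terminated table bottom-up and stacks each level on top of the one below it (see build_sched_domain() further down). A self-contained sketch of that table-driven, bottom-up construction; the level names and types are illustrative only:

#include <stdio.h>

struct level {
	const char *name;
	struct level *parent, *child;
};

static const char *default_topology_names[] = {
	"SIBLING",	/* SMT threads (bottom level)          */
	"MC",		/* cores sharing a cache               */
	"BOOK",		/* s390 books, when configured         */
	"CPU",		/* all cpus in a package/node          */
	"NODE",		/* NUMA (top level)                    */
	NULL,		/* terminator, like the { NULL, } row  */
};

int main(void)
{
	static struct level levels[8];
	struct level *sd = NULL;	/* the growing per-cpu domain chain */
	int i;

	/* Mirrors: for (tl = sched_domain_topology; tl->init; tl++)
	 *                  sd = build_sched_domain(tl, ..., sd, cpu);  */
	for (i = 0; default_topology_names[i]; i++) {
		struct level *lvl = &levels[i];

		lvl->name = default_topology_names[i];
		lvl->child = sd;
		if (sd)
			sd->parent = lvl;
		sd = lvl;
	}

	for (; sd; sd = sd->child)	/* walk the result top-down */
		printf("%s\n", sd->name);

	return 0;
}

Adding a new level then means adding one row to the table rather than another hand-rolled __build_*_sched_domain() helper.
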
7182static int __sdt_alloc(const struct cpumask *cpu_map)
7183{
7184 struct sched_domain_topology_level *tl;
7185 int j;
7186
7187 for (tl = sched_domain_topology; tl->init; tl++) {
7188 struct sd_data *sdd = &tl->data;
7189
7190 sdd->sd = alloc_percpu(struct sched_domain *);
7191 if (!sdd->sd)
7192 return -ENOMEM;
7193
7194 sdd->sg = alloc_percpu(struct sched_group *);
7195 if (!sdd->sg)
7196 return -ENOMEM;
7197
7198 for_each_cpu(j, cpu_map) {
7199 struct sched_domain *sd;
7200 struct sched_group *sg;
7201
7202 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
7203 GFP_KERNEL, cpu_to_node(j));
7204 if (!sd)
7205 return -ENOMEM;
7206
7207 *per_cpu_ptr(sdd->sd, j) = sd;
7208
7209 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7210 GFP_KERNEL, cpu_to_node(j));
7211 if (!sg)
7212 return -ENOMEM;
7213
7214 *per_cpu_ptr(sdd->sg, j) = sg;
7215 }
7216 }
7217
7218 return 0;
7219}
7220
7221static void __sdt_free(const struct cpumask *cpu_map)
7222{
7223 struct sched_domain_topology_level *tl;
7224 int j;
7225
7226 for (tl = sched_domain_topology; tl->init; tl++) {
7227 struct sd_data *sdd = &tl->data;
7228
7229 for_each_cpu(j, cpu_map) {
7230 kfree(*per_cpu_ptr(sdd->sd, j));
7231 kfree(*per_cpu_ptr(sdd->sg, j));
7232 }
7233 free_percpu(sdd->sd);
7234 free_percpu(sdd->sg);
7275 } 7235 }
7276} 7236}
7277 7237
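__sdt_alloc() above bails out with -ENOMEM at the first failed allocation, and the intent is that the caller's free path, __sdt_free(), unwinds whatever was already set up by revisiting the same per-CPU slots. A small libc-only model of that alloc/teardown pairing (structure, sizes and names are placeholders for illustration):

#include <stdlib.h>
#include <string.h>

#define NCPU 4

struct slot { void *sd; void *sg; };

/* Mirrors the shape of __sdt_alloc(): stop at the first failure and
 * let the caller run the free path over the partial result. */
static int sdt_alloc_model(struct slot *slots)
{
	int j;

	for (j = 0; j < NCPU; j++) {
		slots[j].sd = calloc(1, 64);
		if (!slots[j].sd)
			return -1;
		slots[j].sg = calloc(1, 64);
		if (!slots[j].sg)
			return -1;
	}
	return 0;
}

/* Mirrors the shape of __sdt_free(): walk every slot; free() tolerates
 * NULL, so this model is safe even after a partial allocation. */
static void sdt_free_model(struct slot *slots)
{
	int j;

	for (j = 0; j < NCPU; j++) {
		free(slots[j].sd);
		free(slots[j].sg);
	}
}

int main(void)
{
	struct slot slots[NCPU];

	memset(slots, 0, sizeof(slots));
	(void)sdt_alloc_model(slots);	/* even a partial result is freeable */
	sdt_free_model(slots);
	return 0;
}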
7238struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
7239 struct s_data *d, const struct cpumask *cpu_map,
7240 struct sched_domain_attr *attr, struct sched_domain *child,
7241 int cpu)
7242{
7243 struct sched_domain *sd = tl->init(tl, cpu);
7244 if (!sd)
7245 return child;
7246
7247 set_domain_attribute(sd, attr);
7248 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
7249 if (child) {
7250 sd->level = child->level + 1;
7251 sched_domain_level_max = max(sched_domain_level_max, sd->level);
7252 child->parent = sd;
7253 }
7254 sd->child = child;
7255
7256 return sd;
7257}
7258
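build_sched_domain() above stitches each new level onto the previous one: the freshly built domain becomes the parent of the level below, the child pointer is set the other way, and the running maximum level is recorded. A toy model of just that wiring (struct fields and names invented for illustration; the throwaway program does not bother freeing):

#include <stdio.h>
#include <stdlib.h>

/* Only the linkage and level fields touched by build_sched_domain()
 * are modelled here. */
struct domain {
	struct domain *parent, *child;
	int level;
};

static int level_max;

/* Mirrors the child/parent wiring: on allocation failure fall back to
 * returning the child unchanged, as the kernel code does. */
static struct domain *build_domain(struct domain *child)
{
	struct domain *d = calloc(1, sizeof(*d));

	if (!d)
		return child;

	if (child) {
		d->level = child->level + 1;
		if (d->level > level_max)
			level_max = d->level;
		child->parent = d;
	}
	d->child = child;
	return d;
}

int main(void)
{
	struct domain *d = NULL;
	int i;

	/* Three topology levels, bottom-up (think SMT -> MC -> CPU). */
	for (i = 0; i < 3; i++)
		d = build_domain(d);
	if (!d)
		return 1;

	/* Walk back to the base level, like "while (sd->child) sd = sd->child;". */
	while (d->child)
		d = d->child;
	printf("base level %d, highest level %d\n", d->level, level_max);
	return 0;
}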
7278/* 7259/*
7279 * Build sched domains for a given set of cpus and attach the sched domains 7260 * Build sched domains for a given set of cpus and attach the sched domains
7280 * to the individual cpus 7261 * to the individual cpus
7281 */ 7262 */
7282static int __build_sched_domains(const struct cpumask *cpu_map, 7263static int build_sched_domains(const struct cpumask *cpu_map,
7283 struct sched_domain_attr *attr) 7264 struct sched_domain_attr *attr)
7284{ 7265{
7285 enum s_alloc alloc_state = sa_none; 7266 enum s_alloc alloc_state = sa_none;
7286 struct s_data d;
7287 struct sched_domain *sd; 7267 struct sched_domain *sd;
7288 int i; 7268 struct s_data d;
7289#ifdef CONFIG_NUMA 7269 int i, ret = -ENOMEM;
7290 d.sd_allnodes = 0;
7291#endif
7292 7270
7293 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 7271 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
7294 if (alloc_state != sa_rootdomain) 7272 if (alloc_state != sa_rootdomain)
7295 goto error; 7273 goto error;
7296 alloc_state = sa_sched_groups;
7297 7274
7298 /* 7275 /* Set up domains for cpus specified by the cpu_map. */
7299 * Set up domains for cpus specified by the cpu_map.
7300 */
7301 for_each_cpu(i, cpu_map) { 7276 for_each_cpu(i, cpu_map) {
7302 cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)), 7277 struct sched_domain_topology_level *tl;
7303 cpu_map);
7304
7305 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
7306 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
7307 sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
7308 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
7309 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
7310 }
7311
7312 for_each_cpu(i, cpu_map) {
7313 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
7314 build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
7315 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
7316 }
7317 7278
7318 /* Set up physical groups */ 7279 sd = NULL;
7319 for (i = 0; i < nr_node_ids; i++) 7280 for (tl = sched_domain_topology; tl->init; tl++)
7320 build_sched_groups(&d, SD_LV_CPU, cpu_map, i); 7281 sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
7321 7282
7322#ifdef CONFIG_NUMA 7283 while (sd->child)
7323 /* Set up node groups */ 7284 sd = sd->child;
7324 if (d.sd_allnodes)
7325 build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
7326
7327 for (i = 0; i < nr_node_ids; i++)
7328 if (build_numa_sched_groups(&d, cpu_map, i))
7329 goto error;
7330#endif
7331 7285
7332 /* Calculate CPU power for physical packages and nodes */ 7286 *per_cpu_ptr(d.sd, i) = sd;
7333#ifdef CONFIG_SCHED_SMT
7334 for_each_cpu(i, cpu_map) {
7335 sd = &per_cpu(cpu_domains, i).sd;
7336 init_sched_groups_power(i, sd);
7337 }
7338#endif
7339#ifdef CONFIG_SCHED_MC
7340 for_each_cpu(i, cpu_map) {
7341 sd = &per_cpu(core_domains, i).sd;
7342 init_sched_groups_power(i, sd);
7343 }
7344#endif
7345#ifdef CONFIG_SCHED_BOOK
7346 for_each_cpu(i, cpu_map) {
7347 sd = &per_cpu(book_domains, i).sd;
7348 init_sched_groups_power(i, sd);
7349 } 7287 }
7350#endif
7351 7288
7289 /* Build the groups for the domains */
7352 for_each_cpu(i, cpu_map) { 7290 for_each_cpu(i, cpu_map) {
7353 sd = &per_cpu(phys_domains, i).sd; 7291 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7354 init_sched_groups_power(i, sd); 7292 sd->span_weight = cpumask_weight(sched_domain_span(sd));
7355 } 7293 get_group(i, sd->private, &sd->groups);
7294 atomic_inc(&sd->groups->ref);
7356 7295
7357#ifdef CONFIG_NUMA 7296 if (i != cpumask_first(sched_domain_span(sd)))
7358 for (i = 0; i < nr_node_ids; i++) 7297 continue;
7359 init_numa_sched_groups_power(d.sched_group_nodes[i]);
7360 7298
7361 if (d.sd_allnodes) { 7299 build_sched_groups(sd);
7362 struct sched_group *sg; 7300 }
7301 }
7363 7302
7364 cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg, 7303 /* Calculate CPU power for physical packages and nodes */
7365 d.tmpmask); 7304 for (i = nr_cpumask_bits-1; i >= 0; i--) {
7366 init_numa_sched_groups_power(sg); 7305 if (!cpumask_test_cpu(i, cpu_map))
7306 continue;
7307
7308 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
7309 claim_allocations(i, sd);
7310 init_sched_groups_power(i, sd);
7311 }
7367 } 7312 }
7368#endif
7369 7313
7370 /* Attach the domains */ 7314 /* Attach the domains */
7315 rcu_read_lock();
7371 for_each_cpu(i, cpu_map) { 7316 for_each_cpu(i, cpu_map) {
7372#ifdef CONFIG_SCHED_SMT 7317 sd = *per_cpu_ptr(d.sd, i);
7373 sd = &per_cpu(cpu_domains, i).sd;
7374#elif defined(CONFIG_SCHED_MC)
7375 sd = &per_cpu(core_domains, i).sd;
7376#elif defined(CONFIG_SCHED_BOOK)
7377 sd = &per_cpu(book_domains, i).sd;
7378#else
7379 sd = &per_cpu(phys_domains, i).sd;
7380#endif
7381 cpu_attach_domain(sd, d.rd, i); 7318 cpu_attach_domain(sd, d.rd, i);
7382 } 7319 }
7320 rcu_read_unlock();
7383 7321
7384 d.sched_group_nodes = NULL; /* don't free this we still need it */ 7322 ret = 0;
7385 __free_domain_allocs(&d, sa_tmpmask, cpu_map);
7386 return 0;
7387
7388error: 7323error:
7389 __free_domain_allocs(&d, alloc_state, cpu_map); 7324 __free_domain_allocs(&d, alloc_state, cpu_map);
7390 return -ENOMEM; 7325 return ret;
7391}
7392
7393static int build_sched_domains(const struct cpumask *cpu_map)
7394{
7395 return __build_sched_domains(cpu_map, NULL);
7396} 7326}
7397 7327
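In the new build_sched_domains() above, ret starts out as -ENOMEM, every failure jumps to the error label, and the success path simply sets ret = 0 and falls into the same label, so __free_domain_allocs() runs exactly once on either outcome. A compact standalone model of that shared-exit idiom (names and the fake allocator are invented for the sketch):

#include <stdio.h>
#include <stdlib.h>

/* Pretend allocation step that can be told to fail. */
static void *grab(int fail)
{
	return fail ? NULL : malloc(16);
}

static int build(int fail)
{
	int ret = -1;		/* assume failure until proven otherwise */
	void *state;

	state = grab(fail);
	if (!state)
		goto error;

	/* ... real work would go here ... */

	ret = 0;		/* success: fall through to the shared exit */
error:
	free(state ? state : NULL);	/* cleanup runs on both paths */
	return ret;
}

int main(void)
{
	printf("ok path: %d, failure path: %d\n", build(0), build(1));
	return 0;
}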
7398static cpumask_var_t *doms_cur; /* current sched domains */ 7328static cpumask_var_t *doms_cur; /* current sched domains */
@@ -7447,7 +7377,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
7447 * For now this just excludes isolated cpus, but could be used to 7377 * For now this just excludes isolated cpus, but could be used to
7448 * exclude other special cases in the future. 7378 * exclude other special cases in the future.
7449 */ 7379 */
7450static int arch_init_sched_domains(const struct cpumask *cpu_map) 7380static int init_sched_domains(const struct cpumask *cpu_map)
7451{ 7381{
7452 int err; 7382 int err;
7453 7383
@@ -7458,32 +7388,24 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
7458 doms_cur = &fallback_doms; 7388 doms_cur = &fallback_doms;
7459 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map); 7389 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
7460 dattr_cur = NULL; 7390 dattr_cur = NULL;
7461 err = build_sched_domains(doms_cur[0]); 7391 err = build_sched_domains(doms_cur[0], NULL);
7462 register_sched_domain_sysctl(); 7392 register_sched_domain_sysctl();
7463 7393
7464 return err; 7394 return err;
7465} 7395}
7466 7396
7467static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
7468 struct cpumask *tmpmask)
7469{
7470 free_sched_groups(cpu_map, tmpmask);
7471}
7472
7473/* 7397/*
7474 * Detach sched domains from a group of cpus specified in cpu_map 7398 * Detach sched domains from a group of cpus specified in cpu_map
7475 * These cpus will now be attached to the NULL domain 7399 * These cpus will now be attached to the NULL domain
7476 */ 7400 */
7477static void detach_destroy_domains(const struct cpumask *cpu_map) 7401static void detach_destroy_domains(const struct cpumask *cpu_map)
7478{ 7402{
7479 /* Save because hotplug lock held. */
7480 static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
7481 int i; 7403 int i;
7482 7404
7405 rcu_read_lock();
7483 for_each_cpu(i, cpu_map) 7406 for_each_cpu(i, cpu_map)
7484 cpu_attach_domain(NULL, &def_root_domain, i); 7407 cpu_attach_domain(NULL, &def_root_domain, i);
7485 synchronize_sched(); 7408 rcu_read_unlock();
7486 arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
7487} 7409}
7488 7410
7489/* handle null as "default" */ 7411/* handle null as "default" */
@@ -7572,8 +7494,7 @@ match1:
7572 goto match2; 7494 goto match2;
7573 } 7495 }
7574 /* no match - add a new doms_new */ 7496 /* no match - add a new doms_new */
7575 __build_sched_domains(doms_new[i], 7497 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
7576 dattr_new ? dattr_new + i : NULL);
7577match2: 7498match2:
7578 ; 7499 ;
7579 } 7500 }
@@ -7592,7 +7513,7 @@ match2:
7592} 7513}
7593 7514
7594#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7515#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7595static void arch_reinit_sched_domains(void) 7516static void reinit_sched_domains(void)
7596{ 7517{
7597 get_online_cpus(); 7518 get_online_cpus();
7598 7519
@@ -7625,7 +7546,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7625 else 7546 else
7626 sched_mc_power_savings = level; 7547 sched_mc_power_savings = level;
7627 7548
7628 arch_reinit_sched_domains(); 7549 reinit_sched_domains();
7629 7550
7630 return count; 7551 return count;
7631} 7552}
@@ -7744,14 +7665,9 @@ void __init sched_init_smp(void)
7744 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL); 7665 alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
7745 alloc_cpumask_var(&fallback_doms, GFP_KERNEL); 7666 alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
7746 7667
7747#if defined(CONFIG_NUMA)
7748 sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
7749 GFP_KERNEL);
7750 BUG_ON(sched_group_nodes_bycpu == NULL);
7751#endif
7752 get_online_cpus(); 7668 get_online_cpus();
7753 mutex_lock(&sched_domains_mutex); 7669 mutex_lock(&sched_domains_mutex);
7754 arch_init_sched_domains(cpu_active_mask); 7670 init_sched_domains(cpu_active_mask);
7755 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 7671 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
7756 if (cpumask_empty(non_isolated_cpus)) 7672 if (cpumask_empty(non_isolated_cpus))
7757 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 7673 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -7796,6 +7712,10 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
7796 INIT_LIST_HEAD(&cfs_rq->tasks); 7712 INIT_LIST_HEAD(&cfs_rq->tasks);
7797#ifdef CONFIG_FAIR_GROUP_SCHED 7713#ifdef CONFIG_FAIR_GROUP_SCHED
7798 cfs_rq->rq = rq; 7714 cfs_rq->rq = rq;
7715 /* allow initial update_cfs_load() to truncate */
7716#ifdef CONFIG_SMP
7717 cfs_rq->load_stamp = 1;
7718#endif
7799#endif 7719#endif
7800 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 7720 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
7801} 7721}
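The unchanged initializer at the end of this hunk starts min_vruntime roughly a millisecond's worth of nanoseconds below the 64-bit wrap point, which is generally read as a way to make non-wrap-safe vruntime comparisons fail almost immediately after boot. A quick standalone check of the signed-delta comparison this relies on (illustrative only):

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

int main(void)
{
	/* Same starting point as init_cfs_rq(): just below the u64 wrap. */
	uint64_t min_vruntime = (uint64_t)(-(1LL << 20));
	uint64_t vruntime = min_vruntime + (1 << 21);	/* has wrapped past zero */

	/* Comparing through a signed delta stays correct across the wrap. */
	int64_t delta = (int64_t)(vruntime - min_vruntime);

	printf("delta = %" PRId64 ", so vruntime still orders %s min_vruntime\n",
	       delta, delta > 0 ? "after" : "before");
	return 0;
}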
@@ -7997,7 +7917,7 @@ void __init sched_init(void)
7997#ifdef CONFIG_SMP 7917#ifdef CONFIG_SMP
7998 rq->sd = NULL; 7918 rq->sd = NULL;
7999 rq->rd = NULL; 7919 rq->rd = NULL;
8000 rq->cpu_power = SCHED_LOAD_SCALE; 7920 rq->cpu_power = SCHED_POWER_SCALE;
8001 rq->post_schedule = 0; 7921 rq->post_schedule = 0;
8002 rq->active_balance = 0; 7922 rq->active_balance = 0;
8003 rq->next_balance = jiffies; 7923 rq->next_balance = jiffies;
@@ -8054,6 +7974,7 @@ void __init sched_init(void)
8054 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */ 7974 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
8055 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT); 7975 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
8056#ifdef CONFIG_SMP 7976#ifdef CONFIG_SMP
7977 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
8057#ifdef CONFIG_NO_HZ 7978#ifdef CONFIG_NO_HZ
8058 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT); 7979 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8059 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT); 7980 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
@@ -8074,7 +7995,7 @@ static inline int preempt_count_equals(int preempt_offset)
8074{ 7995{
8075 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); 7996 int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
8076 7997
8077 return (nested == PREEMPT_INATOMIC_BASE + preempt_offset); 7998 return (nested == preempt_offset);
8078} 7999}
8079 8000
8080void __might_sleep(const char *file, int line, int preempt_offset) 8001void __might_sleep(const char *file, int line, int preempt_offset)
@@ -8109,9 +8030,11 @@ EXPORT_SYMBOL(__might_sleep);
8109#ifdef CONFIG_MAGIC_SYSRQ 8030#ifdef CONFIG_MAGIC_SYSRQ
8110static void normalize_task(struct rq *rq, struct task_struct *p) 8031static void normalize_task(struct rq *rq, struct task_struct *p)
8111{ 8032{
8033 const struct sched_class *prev_class = p->sched_class;
8034 int old_prio = p->prio;
8112 int on_rq; 8035 int on_rq;
8113 8036
8114 on_rq = p->se.on_rq; 8037 on_rq = p->on_rq;
8115 if (on_rq) 8038 if (on_rq)
8116 deactivate_task(rq, p, 0); 8039 deactivate_task(rq, p, 0);
8117 __setscheduler(rq, p, SCHED_NORMAL, 0); 8040 __setscheduler(rq, p, SCHED_NORMAL, 0);
@@ -8119,6 +8042,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
8119 activate_task(rq, p, 0); 8042 activate_task(rq, p, 0);
8120 resched_task(rq->curr); 8043 resched_task(rq->curr);
8121 } 8044 }
8045
8046 check_class_changed(rq, p, prev_class, old_prio);
8122} 8047}
8123 8048
8124void normalize_rt_tasks(void) 8049void normalize_rt_tasks(void)
@@ -8234,7 +8159,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8234{ 8159{
8235 struct cfs_rq *cfs_rq; 8160 struct cfs_rq *cfs_rq;
8236 struct sched_entity *se; 8161 struct sched_entity *se;
8237 struct rq *rq;
8238 int i; 8162 int i;
8239 8163
8240 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); 8164 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8247,8 +8171,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8247 tg->shares = NICE_0_LOAD; 8171 tg->shares = NICE_0_LOAD;
8248 8172
8249 for_each_possible_cpu(i) { 8173 for_each_possible_cpu(i) {
8250 rq = cpu_rq(i);
8251
8252 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 8174 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8253 GFP_KERNEL, cpu_to_node(i)); 8175 GFP_KERNEL, cpu_to_node(i));
8254 if (!cfs_rq) 8176 if (!cfs_rq)
@@ -8325,7 +8247,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8325{ 8247{
8326 struct rt_rq *rt_rq; 8248 struct rt_rq *rt_rq;
8327 struct sched_rt_entity *rt_se; 8249 struct sched_rt_entity *rt_se;
8328 struct rq *rq;
8329 int i; 8250 int i;
8330 8251
8331 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL); 8252 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8339,8 +8260,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8339 ktime_to_ns(def_rt_bandwidth.rt_period), 0); 8260 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8340 8261
8341 for_each_possible_cpu(i) { 8262 for_each_possible_cpu(i) {
8342 rq = cpu_rq(i);
8343
8344 rt_rq = kzalloc_node(sizeof(struct rt_rq), 8263 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8345 GFP_KERNEL, cpu_to_node(i)); 8264 GFP_KERNEL, cpu_to_node(i));
8346 if (!rt_rq) 8265 if (!rt_rq)
@@ -8455,7 +8374,7 @@ void sched_move_task(struct task_struct *tsk)
8455 rq = task_rq_lock(tsk, &flags); 8374 rq = task_rq_lock(tsk, &flags);
8456 8375
8457 running = task_current(rq, tsk); 8376 running = task_current(rq, tsk);
8458 on_rq = tsk->se.on_rq; 8377 on_rq = tsk->on_rq;
8459 8378
8460 if (on_rq) 8379 if (on_rq)
8461 dequeue_task(rq, tsk, 0); 8380 dequeue_task(rq, tsk, 0);
@@ -8474,7 +8393,7 @@ void sched_move_task(struct task_struct *tsk)
8474 if (on_rq) 8393 if (on_rq)
8475 enqueue_task(rq, tsk, 0); 8394 enqueue_task(rq, tsk, 0);
8476 8395
8477 task_rq_unlock(rq, &flags); 8396 task_rq_unlock(rq, tsk, &flags);
8478} 8397}
8479#endif /* CONFIG_CGROUP_SCHED */ 8398#endif /* CONFIG_CGROUP_SCHED */
8480 8399
@@ -8510,7 +8429,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8510 /* Propagate contribution to hierarchy */ 8429 /* Propagate contribution to hierarchy */
8511 raw_spin_lock_irqsave(&rq->lock, flags); 8430 raw_spin_lock_irqsave(&rq->lock, flags);
8512 for_each_sched_entity(se) 8431 for_each_sched_entity(se)
8513 update_cfs_shares(group_cfs_rq(se), 0); 8432 update_cfs_shares(group_cfs_rq(se));
8514 raw_spin_unlock_irqrestore(&rq->lock, flags); 8433 raw_spin_unlock_irqrestore(&rq->lock, flags);
8515 } 8434 }
8516 8435
@@ -8845,46 +8764,15 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
8845 return 0; 8764 return 0;
8846} 8765}
8847 8766
8848static int
8849cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
8850 struct task_struct *tsk, bool threadgroup)
8851{
8852 int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
8853 if (retval)
8854 return retval;
8855 if (threadgroup) {
8856 struct task_struct *c;
8857 rcu_read_lock();
8858 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
8859 retval = cpu_cgroup_can_attach_task(cgrp, c);
8860 if (retval) {
8861 rcu_read_unlock();
8862 return retval;
8863 }
8864 }
8865 rcu_read_unlock();
8866 }
8867 return 0;
8868}
8869
8870static void 8767static void
8871cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, 8768cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
8872 struct cgroup *old_cont, struct task_struct *tsk,
8873 bool threadgroup)
8874{ 8769{
8875 sched_move_task(tsk); 8770 sched_move_task(tsk);
8876 if (threadgroup) {
8877 struct task_struct *c;
8878 rcu_read_lock();
8879 list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
8880 sched_move_task(c);
8881 }
8882 rcu_read_unlock();
8883 }
8884} 8771}
8885 8772
8886static void 8773static void
8887cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task) 8774cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
8775 struct cgroup *old_cgrp, struct task_struct *task)
8888{ 8776{
8889 /* 8777 /*
8890 * cgroup_exit() is called in the copy_process() failure path. 8778 * cgroup_exit() is called in the copy_process() failure path.
@@ -8901,14 +8789,14 @@ cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task)
8901static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, 8789static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
8902 u64 shareval) 8790 u64 shareval)
8903{ 8791{
8904 return sched_group_set_shares(cgroup_tg(cgrp), shareval); 8792 return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
8905} 8793}
8906 8794
8907static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) 8795static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
8908{ 8796{
8909 struct task_group *tg = cgroup_tg(cgrp); 8797 struct task_group *tg = cgroup_tg(cgrp);
8910 8798
8911 return (u64) tg->shares; 8799 return (u64) scale_load_down(tg->shares);
8912} 8800}
8913#endif /* CONFIG_FAIR_GROUP_SCHED */ 8801#endif /* CONFIG_FAIR_GROUP_SCHED */
8914 8802
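The cpu.shares read and write paths in the hunk above now convert between the user-visible unit and a higher-resolution internal fixed-point unit: values are shifted up on the way in and shifted back down on the way out, so userspace round-trips the number it wrote. A standalone illustration of that conversion (the shift width here is an assumption for the sketch; the kernel picks the resolution per configuration):

#include <stdio.h>

#define LOAD_RESOLUTION 10	/* assumed extra fixed-point bits */

static unsigned long scale_up(unsigned long w)   { return w << LOAD_RESOLUTION; }
static unsigned long scale_down(unsigned long w) { return w >> LOAD_RESOLUTION; }

int main(void)
{
	unsigned long user_shares = 1024;	/* value written to cpu.shares */
	unsigned long internal = scale_up(user_shares);

	/* Write path stores the scaled value; read path undoes it. */
	printf("stored internally: %lu, reported back: %lu\n",
	       internal, scale_down(internal));
	return 0;
}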
@@ -8967,8 +8855,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
8967 .name = "cpu", 8855 .name = "cpu",
8968 .create = cpu_cgroup_create, 8856 .create = cpu_cgroup_create,
8969 .destroy = cpu_cgroup_destroy, 8857 .destroy = cpu_cgroup_destroy,
8970 .can_attach = cpu_cgroup_can_attach, 8858 .can_attach_task = cpu_cgroup_can_attach_task,
8971 .attach = cpu_cgroup_attach, 8859 .attach_task = cpu_cgroup_attach_task,
8972 .exit = cpu_cgroup_exit, 8860 .exit = cpu_cgroup_exit,
8973 .populate = cpu_cgroup_populate, 8861 .populate = cpu_cgroup_populate,
8974 .subsys_id = cpu_cgroup_subsys_id, 8862 .subsys_id = cpu_cgroup_subsys_id,