author    Linus Torvalds <torvalds@linux-foundation.org>    2014-10-13 10:23:15 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2014-10-13 10:23:15 -0400
commit    faafcba3b5e15999cf75d5c5a513ac8e47e2545f (patch)
tree      47d58d1c00e650e820506c91eb9a41268756bdda /kernel
parent    13ead805c5a14b0e7ecd34f61404a5bfba655895 (diff)
parent    f10e00f4bf360c36edbe6bf18a6c75b171cbe012 (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
 "The main changes in this cycle were:

   - Optimized support for Intel "Cluster-on-Die" (CoD) topologies (Dave Hansen)

   - Various sched/idle refinements for better idle handling (Nicolas Pitre, Daniel Lezcano, Chuansheng Liu, Vincent Guittot)

   - sched/numa updates and optimizations (Rik van Riel)

   - sysbench speedup (Vincent Guittot)

   - capacity calculation cleanups/refactoring (Vincent Guittot)

   - Various cleanups to thread group iteration (Oleg Nesterov)

   - Double-rq-lock removal optimization and various refactorings (Kirill Tkhai)

   - various sched/deadline fixes

  ... and lots of other changes"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (72 commits)
  sched/dl: Use dl_bw_of() under rcu_read_lock_sched()
  sched/fair: Delete resched_cpu() from idle_balance()
  sched, time: Fix build error with 64 bit cputime_t on 32 bit systems
  sched: Improve sysbench performance by fixing spurious active migration
  sched/x86: Fix up typo in topology detection
  x86, sched: Add new topology for multi-NUMA-node CPUs
  sched/rt: Use resched_curr() in task_tick_rt()
  sched: Use rq->rd in sched_setaffinity() under RCU read lock
  sched: cleanup: Rename 'out_unlock' to 'out_free_new_mask'
  sched: Use dl_bw_of() under RCU read lock
  sched/fair: Remove duplicate code from can_migrate_task()
  sched, mips, ia64: Remove __ARCH_WANT_UNLOCKED_CTXSW
  sched: print_rq(): Don't use tasklist_lock
  sched: normalize_rt_tasks(): Don't use _irqsave for tasklist_lock, use task_rq_lock()
  sched: Fix the task-group check in tg_has_rt_tasks()
  sched/fair: Leverage the idle state info when choosing the "idlest" cpu
  sched: Let the scheduler see CPU idle states
  sched/deadline: Fix inter- exclusive cpusets migrations
  sched/deadline: Clear dl_entity params when setscheduling to different class
  sched/numa: Kill the wrong/dead TASK_DEAD check in task_numa_fault()
  ...
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/exit.c                          47
-rw-r--r--  kernel/fork.c                          13
-rw-r--r--  kernel/sched/auto_group.c               5
-rw-r--r--  kernel/sched/core.c                   295
-rw-r--r--  kernel/sched/cpudeadline.c              4
-rw-r--r--  kernel/sched/cputime.c                 64
-rw-r--r--  kernel/sched/deadline.c                33
-rw-r--r--  kernel/sched/debug.c                   13
-rw-r--r--  kernel/sched/fair.c                   479
-rw-r--r--  kernel/sched/idle.c                     6
-rw-r--r--  kernel/sched/rt.c                      21
-rw-r--r--  kernel/sched/sched.h                   80
-rw-r--r--  kernel/sched/stop_task.c                2
-rw-r--r--  kernel/smp.c                           22
-rw-r--r--  kernel/sys.c                            2
-rw-r--r--  kernel/time/hrtimer.c                   1
-rw-r--r--  kernel/time/posix-cpu-timers.c         14
-rw-r--r--  kernel/trace/ring_buffer_benchmark.c    3
-rw-r--r--  kernel/trace/trace_stack.c              4
19 files changed, 667 insertions, 441 deletions
diff --git a/kernel/exit.c b/kernel/exit.c
index d13f2eec4bb8..5d30019ff953 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -115,32 +115,33 @@ static void __exit_signal(struct task_struct *tsk)
115 115
116 if (tsk == sig->curr_target) 116 if (tsk == sig->curr_target)
117 sig->curr_target = next_thread(tsk); 117 sig->curr_target = next_thread(tsk);
118 /*
119 * Accumulate here the counters for all threads but the
120 * group leader as they die, so they can be added into
121 * the process-wide totals when those are taken.
122 * The group leader stays around as a zombie as long
123 * as there are other threads. When it gets reaped,
124 * the exit.c code will add its counts into these totals.
125 * We won't ever get here for the group leader, since it
126 * will have been the last reference on the signal_struct.
127 */
128 task_cputime(tsk, &utime, &stime);
129 sig->utime += utime;
130 sig->stime += stime;
131 sig->gtime += task_gtime(tsk);
132 sig->min_flt += tsk->min_flt;
133 sig->maj_flt += tsk->maj_flt;
134 sig->nvcsw += tsk->nvcsw;
135 sig->nivcsw += tsk->nivcsw;
136 sig->inblock += task_io_get_inblock(tsk);
137 sig->oublock += task_io_get_oublock(tsk);
138 task_io_accounting_add(&sig->ioac, &tsk->ioac);
139 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
140 } 118 }
141 119
120 /*
121 * Accumulate here the counters for all threads but the group leader
122 * as they die, so they can be added into the process-wide totals
123 * when those are taken. The group leader stays around as a zombie as
124 * long as there are other threads. When it gets reaped, the exit.c
125 * code will add its counts into these totals. We won't ever get here
126 * for the group leader, since it will have been the last reference on
127 * the signal_struct.
128 */
129 task_cputime(tsk, &utime, &stime);
130 write_seqlock(&sig->stats_lock);
131 sig->utime += utime;
132 sig->stime += stime;
133 sig->gtime += task_gtime(tsk);
134 sig->min_flt += tsk->min_flt;
135 sig->maj_flt += tsk->maj_flt;
136 sig->nvcsw += tsk->nvcsw;
137 sig->nivcsw += tsk->nivcsw;
138 sig->inblock += task_io_get_inblock(tsk);
139 sig->oublock += task_io_get_oublock(tsk);
140 task_io_accounting_add(&sig->ioac, &tsk->ioac);
141 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
142 sig->nr_threads--; 142 sig->nr_threads--;
143 __unhash_process(tsk, group_dead); 143 __unhash_process(tsk, group_dead);
144 write_sequnlock(&sig->stats_lock);
144 145
145 /* 146 /*
146 * Do this under ->siglock, we can race with another thread 147 * Do this under ->siglock, we can race with another thread
@@ -1046,6 +1047,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1046 spin_lock_irq(&p->real_parent->sighand->siglock); 1047 spin_lock_irq(&p->real_parent->sighand->siglock);
1047 psig = p->real_parent->signal; 1048 psig = p->real_parent->signal;
1048 sig = p->signal; 1049 sig = p->signal;
1050 write_seqlock(&psig->stats_lock);
1049 psig->cutime += tgutime + sig->cutime; 1051 psig->cutime += tgutime + sig->cutime;
1050 psig->cstime += tgstime + sig->cstime; 1052 psig->cstime += tgstime + sig->cstime;
1051 psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; 1053 psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
@@ -1068,6 +1070,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1068 psig->cmaxrss = maxrss; 1070 psig->cmaxrss = maxrss;
1069 task_io_accounting_add(&psig->ioac, &p->ioac); 1071 task_io_accounting_add(&psig->ioac, &p->ioac);
1070 task_io_accounting_add(&psig->ioac, &sig->ioac); 1072 task_io_accounting_add(&psig->ioac, &sig->ioac);
1073 write_sequnlock(&psig->stats_lock);
1071 spin_unlock_irq(&p->real_parent->sighand->siglock); 1074 spin_unlock_irq(&p->real_parent->sighand->siglock);
1072 } 1075 }
1073 1076
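The two kernel/exit.c hunks above bracket the signal_struct accounting updates in __exit_signal() and wait_task_zombie() with the new write_seqlock(&...->stats_lock)/write_sequnlock() pair; the matching lockless reader appears further down in the kernel/sched/cputime.c hunk for thread_group_cputime(). As a rough illustration of why the writer brackets the update and why a reader retries, here is a minimal userspace sketch of the seqcount idea using C11 atomics and hypothetical names; it is not the kernel's seqlock_t (which also serializes writers with a spinlock), and the plain field reads are tolerated only because a changed sequence number discards the snapshot:

#include <stdatomic.h>
#include <stdint.h>

struct stats_seq {
	atomic_uint seq;		/* odd while a writer is inside its critical section */
	uint64_t utime, stime;		/* the accumulated totals being protected */
};

/* Writer side: a single writer is assumed here; the kernel's seqlock_t adds a spinlock. */
static void stats_write(struct stats_seq *s, uint64_t du, uint64_t ds)
{
	atomic_fetch_add_explicit(&s->seq, 1, memory_order_acq_rel);	/* begin: seq goes odd */
	s->utime += du;
	s->stime += ds;
	atomic_fetch_add_explicit(&s->seq, 1, memory_order_acq_rel);	/* end: seq goes even */
}

/* Reader side: take a snapshot, then retry if a writer ran in the meantime. */
static void stats_read(struct stats_seq *s, uint64_t *u, uint64_t *st)
{
	unsigned int start;

	for (;;) {
		start = atomic_load_explicit(&s->seq, memory_order_acquire);
		if (start & 1)
			continue;	/* writer in progress, try again */
		*u  = s->utime;
		*st = s->stime;
		if (atomic_load_explicit(&s->seq, memory_order_acquire) == start)
			return;		/* sequence unchanged: the snapshot is consistent */
	}
}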
diff --git a/kernel/fork.c b/kernel/fork.c
index 8c162d102740..9b7d746d6d62 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -294,11 +294,18 @@ int __weak arch_dup_task_struct(struct task_struct *dst,
294 return 0; 294 return 0;
295} 295}
296 296
297void set_task_stack_end_magic(struct task_struct *tsk)
298{
299 unsigned long *stackend;
300
301 stackend = end_of_stack(tsk);
302 *stackend = STACK_END_MAGIC; /* for overflow detection */
303}
304
297static struct task_struct *dup_task_struct(struct task_struct *orig) 305static struct task_struct *dup_task_struct(struct task_struct *orig)
298{ 306{
299 struct task_struct *tsk; 307 struct task_struct *tsk;
300 struct thread_info *ti; 308 struct thread_info *ti;
301 unsigned long *stackend;
302 int node = tsk_fork_get_node(orig); 309 int node = tsk_fork_get_node(orig);
303 int err; 310 int err;
304 311
@@ -328,8 +335,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
328 setup_thread_stack(tsk, orig); 335 setup_thread_stack(tsk, orig);
329 clear_user_return_notifier(tsk); 336 clear_user_return_notifier(tsk);
330 clear_tsk_need_resched(tsk); 337 clear_tsk_need_resched(tsk);
331 stackend = end_of_stack(tsk); 338 set_task_stack_end_magic(tsk);
332 *stackend = STACK_END_MAGIC; /* for overflow detection */
333 339
334#ifdef CONFIG_CC_STACKPROTECTOR 340#ifdef CONFIG_CC_STACKPROTECTOR
335 tsk->stack_canary = get_random_int(); 341 tsk->stack_canary = get_random_int();
@@ -1067,6 +1073,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
1067 sig->curr_target = tsk; 1073 sig->curr_target = tsk;
1068 init_sigpending(&sig->shared_pending); 1074 init_sigpending(&sig->shared_pending);
1069 INIT_LIST_HEAD(&sig->posix_timers); 1075 INIT_LIST_HEAD(&sig->posix_timers);
1076 seqlock_init(&sig->stats_lock);
1070 1077
1071 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1078 hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1072 sig->real_timer.function = it_real_fn; 1079 sig->real_timer.function = it_real_fn;
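Two things happen in the kernel/fork.c hunks above: copy_signal() initializes the new sig->stats_lock used by the exit.c change, and dup_task_struct() now calls the exported set_task_stack_end_magic() helper, whose canary is later checked by the new CONFIG_SCHED_STACK_END_CHECK test in schedule_debug() (see the kernel/sched/core.c hunk below). A tiny userspace sketch of the canary idea follows, with hypothetical stand-in types; the real end_of_stack() depends on stack growth direction and where thread_info lives:

#define DEMO_STACK_WORDS	(8192 / sizeof(unsigned long))
#define DEMO_STACK_END_MAGIC	0x57AC6E9DUL	/* any unlikely constant works for the sketch */

struct fake_task {
	unsigned long stack[DEMO_STACK_WORDS];	/* overflows run toward stack[0] in this model */
};

static unsigned long *demo_end_of_stack(struct fake_task *t)
{
	return &t->stack[0];			/* the word an overflow clobbers first */
}

static void demo_set_stack_end_magic(struct fake_task *t)
{
	*demo_end_of_stack(t) = DEMO_STACK_END_MAGIC;	/* written once when the task is set up */
}

static int demo_stack_end_corrupted(struct fake_task *t)
{
	/* checked on every pass through schedule_debug() when the option is enabled */
	return *demo_end_of_stack(t) != DEMO_STACK_END_MAGIC;
}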
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index e73efba98301..8a2e230fb86a 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -148,11 +148,8 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
148 if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled)) 148 if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
149 goto out; 149 goto out;
150 150
151 t = p; 151 for_each_thread(p, t)
152 do {
153 sched_move_task(t); 152 sched_move_task(t);
154 } while_each_thread(p, t);
155
156out: 153out:
157 unlock_task_sighand(p, &flags); 154 unlock_task_sighand(p, &flags);
158 autogroup_kref_put(prev); 155 autogroup_kref_put(prev);
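This autogroup hunk is one instance of the thread-group iteration cleanup called out in the pull message: the open-coded "t = p; do { ... } while_each_thread(p, t);" loop becomes "for_each_thread(p, t)". Purely to show the shape of such a for-style iterator over a circular sibling list, here is a self-contained userspace toy; the names are made up, and the kernel's for_each_thread() actually walks the thread list hanging off signal_struct under RCU or siglock:

#include <stdio.h>

struct toy_thread {
	int tid;
	struct toy_thread *next;	/* circular list linking a thread group */
};

/* for-style iterator over every thread in the group that 'leader' heads */
#define for_each_toy_thread(leader, t) \
	for ((t) = (leader); (t); (t) = ((t)->next == (leader) ? NULL : (t)->next))

int main(void)
{
	struct toy_thread c = { .tid = 3 }, b = { .tid = 2, .next = &c },
			  a = { .tid = 1, .next = &b };
	struct toy_thread *t;

	c.next = &a;			/* close the circle: a -> b -> c -> a */

	for_each_toy_thread(&a, t)	/* replaces: t = &a; do { ... } while_each_thread(&a, t); */
		printf("tid %d\n", t->tid);

	return 0;
}

The same conversion shows up below in show_state_filter(), normalize_rt_tasks(), tg_has_rt_tasks(), print_rq() and thread_group_cputime().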
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f235c41a3532..44999505e1bf 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -317,9 +317,12 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
317 for (;;) { 317 for (;;) {
318 rq = task_rq(p); 318 rq = task_rq(p);
319 raw_spin_lock(&rq->lock); 319 raw_spin_lock(&rq->lock);
320 if (likely(rq == task_rq(p))) 320 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
321 return rq; 321 return rq;
322 raw_spin_unlock(&rq->lock); 322 raw_spin_unlock(&rq->lock);
323
324 while (unlikely(task_on_rq_migrating(p)))
325 cpu_relax();
323 } 326 }
324} 327}
325 328
@@ -336,10 +339,13 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
336 raw_spin_lock_irqsave(&p->pi_lock, *flags); 339 raw_spin_lock_irqsave(&p->pi_lock, *flags);
337 rq = task_rq(p); 340 rq = task_rq(p);
338 raw_spin_lock(&rq->lock); 341 raw_spin_lock(&rq->lock);
339 if (likely(rq == task_rq(p))) 342 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
340 return rq; 343 return rq;
341 raw_spin_unlock(&rq->lock); 344 raw_spin_unlock(&rq->lock);
342 raw_spin_unlock_irqrestore(&p->pi_lock, *flags); 345 raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
346
347 while (unlikely(task_on_rq_migrating(p)))
348 cpu_relax();
343 } 349 }
344} 350}
345 351
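__task_rq_lock() and task_rq_lock() above are the entry points of Kirill Tkhai's double-rq-lock removal: instead of holding both runqueue locks across a migration, a task in flight is marked as migrating and lockers simply spin in cpu_relax() until it lands on the new runqueue (see move_queued_task() further down). The task_on_rq_queued()/task_on_rq_migrating() helpers and the TASK_ON_RQ_* values live in kernel/sched/sched.h, whose hunk is not part of this excerpt; judging from their use throughout this diff they amount to a small state machine on p->on_rq, roughly like the sketch below (stand-in struct, not the verbatim sched.h change):

/* Sketch only: 'struct task_like' stands in for the kernel's task_struct. */
struct task_like {
	int on_rq;
};

#define TASK_ON_RQ_QUEUED	1	/* enqueued on some runqueue */
#define TASK_ON_RQ_MIGRATING	2	/* dequeued from the old rq, not yet enqueued on the new one */

static inline int task_on_rq_queued(const struct task_like *p)
{
	return p->on_rq == TASK_ON_RQ_QUEUED;	/* 0 still means "not on a runqueue at all" */
}

static inline int task_on_rq_migrating(const struct task_like *p)
{
	return p->on_rq == TASK_ON_RQ_MIGRATING;
}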
@@ -433,7 +439,15 @@ static void __hrtick_start(void *arg)
433void hrtick_start(struct rq *rq, u64 delay) 439void hrtick_start(struct rq *rq, u64 delay)
434{ 440{
435 struct hrtimer *timer = &rq->hrtick_timer; 441 struct hrtimer *timer = &rq->hrtick_timer;
436 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); 442 ktime_t time;
443 s64 delta;
444
445 /*
446 * Don't schedule slices shorter than 10000ns, that just
447 * doesn't make sense and can cause timer DoS.
448 */
449 delta = max_t(s64, delay, 10000LL);
450 time = ktime_add_ns(timer->base->get_time(), delta);
437 451
438 hrtimer_set_expires(timer, time); 452 hrtimer_set_expires(timer, time);
439 453
@@ -1027,7 +1041,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
1027 * A queue event has occurred, and we're going to schedule. In 1041 * A queue event has occurred, and we're going to schedule. In
1028 * this case, we can save a useless back to back clock update. 1042 * this case, we can save a useless back to back clock update.
1029 */ 1043 */
1030 if (rq->curr->on_rq && test_tsk_need_resched(rq->curr)) 1044 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
1031 rq->skip_clock_update = 1; 1045 rq->skip_clock_update = 1;
1032} 1046}
1033 1047
@@ -1072,7 +1086,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1072 1086
1073static void __migrate_swap_task(struct task_struct *p, int cpu) 1087static void __migrate_swap_task(struct task_struct *p, int cpu)
1074{ 1088{
1075 if (p->on_rq) { 1089 if (task_on_rq_queued(p)) {
1076 struct rq *src_rq, *dst_rq; 1090 struct rq *src_rq, *dst_rq;
1077 1091
1078 src_rq = task_rq(p); 1092 src_rq = task_rq(p);
@@ -1198,7 +1212,7 @@ static int migration_cpu_stop(void *data);
1198unsigned long wait_task_inactive(struct task_struct *p, long match_state) 1212unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1199{ 1213{
1200 unsigned long flags; 1214 unsigned long flags;
1201 int running, on_rq; 1215 int running, queued;
1202 unsigned long ncsw; 1216 unsigned long ncsw;
1203 struct rq *rq; 1217 struct rq *rq;
1204 1218
@@ -1236,7 +1250,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1236 rq = task_rq_lock(p, &flags); 1250 rq = task_rq_lock(p, &flags);
1237 trace_sched_wait_task(p); 1251 trace_sched_wait_task(p);
1238 running = task_running(rq, p); 1252 running = task_running(rq, p);
1239 on_rq = p->on_rq; 1253 queued = task_on_rq_queued(p);
1240 ncsw = 0; 1254 ncsw = 0;
1241 if (!match_state || p->state == match_state) 1255 if (!match_state || p->state == match_state)
1242 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ 1256 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
@@ -1268,7 +1282,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1268 * running right now), it's preempted, and we should 1282 * running right now), it's preempted, and we should
1269 * yield - it could be a while. 1283 * yield - it could be a while.
1270 */ 1284 */
1271 if (unlikely(on_rq)) { 1285 if (unlikely(queued)) {
1272 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ); 1286 ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
1273 1287
1274 set_current_state(TASK_UNINTERRUPTIBLE); 1288 set_current_state(TASK_UNINTERRUPTIBLE);
@@ -1462,7 +1476,7 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1462static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) 1476static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1463{ 1477{
1464 activate_task(rq, p, en_flags); 1478 activate_task(rq, p, en_flags);
1465 p->on_rq = 1; 1479 p->on_rq = TASK_ON_RQ_QUEUED;
1466 1480
1467 /* if a worker is waking up, notify workqueue */ 1481 /* if a worker is waking up, notify workqueue */
1468 if (p->flags & PF_WQ_WORKER) 1482 if (p->flags & PF_WQ_WORKER)
@@ -1521,7 +1535,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
1521 int ret = 0; 1535 int ret = 0;
1522 1536
1523 rq = __task_rq_lock(p); 1537 rq = __task_rq_lock(p);
1524 if (p->on_rq) { 1538 if (task_on_rq_queued(p)) {
1525 /* check_preempt_curr() may use rq clock */ 1539 /* check_preempt_curr() may use rq clock */
1526 update_rq_clock(rq); 1540 update_rq_clock(rq);
1527 ttwu_do_wakeup(rq, p, wake_flags); 1541 ttwu_do_wakeup(rq, p, wake_flags);
@@ -1604,6 +1618,25 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu)
1604 } 1618 }
1605} 1619}
1606 1620
1621void wake_up_if_idle(int cpu)
1622{
1623 struct rq *rq = cpu_rq(cpu);
1624 unsigned long flags;
1625
1626 if (!is_idle_task(rq->curr))
1627 return;
1628
1629 if (set_nr_if_polling(rq->idle)) {
1630 trace_sched_wake_idle_without_ipi(cpu);
1631 } else {
1632 raw_spin_lock_irqsave(&rq->lock, flags);
1633 if (is_idle_task(rq->curr))
1634 smp_send_reschedule(cpu);
1635 /* Else cpu is not in idle, do nothing here */
1636 raw_spin_unlock_irqrestore(&rq->lock, flags);
1637 }
1638}
1639
1607bool cpus_share_cache(int this_cpu, int that_cpu) 1640bool cpus_share_cache(int this_cpu, int that_cpu)
1608{ 1641{
1609 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); 1642 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
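The new wake_up_if_idle() above only sends a reschedule IPI as a fallback: if set_nr_if_polling() can flag the remote idle task while its idle loop is still polling, the wakeup is traced as sched_wake_idle_without_ipi and no interrupt is needed. A hedged userspace sketch of that check-and-set, with made-up flag names and a C11 atomic standing in for the kernel's thread_info flag operations:

#include <stdatomic.h>
#include <stdbool.h>

#define DEMO_POLLING		(1u << 0)	/* remote CPU's idle loop is polling this word */
#define DEMO_NEED_RESCHED	(1u << 1)

/*
 * Returns true if the resched flag could be set while the remote CPU was
 * still polling, in which case no IPI is needed; false means the caller
 * must fall back to sending a reschedule interrupt.
 */
static bool demo_set_nr_if_polling(_Atomic unsigned int *flags)
{
	unsigned int old = atomic_load(flags);

	for (;;) {
		if (!(old & DEMO_POLLING))
			return false;
		if (atomic_compare_exchange_weak(flags, &old,
						 old | DEMO_NEED_RESCHED))
			return true;
	}
}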
@@ -1726,7 +1759,7 @@ static void try_to_wake_up_local(struct task_struct *p)
1726 if (!(p->state & TASK_NORMAL)) 1759 if (!(p->state & TASK_NORMAL))
1727 goto out; 1760 goto out;
1728 1761
1729 if (!p->on_rq) 1762 if (!task_on_rq_queued(p))
1730 ttwu_activate(rq, p, ENQUEUE_WAKEUP); 1763 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
1731 1764
1732 ttwu_do_wakeup(rq, p, 0); 1765 ttwu_do_wakeup(rq, p, 0);
@@ -1760,6 +1793,20 @@ int wake_up_state(struct task_struct *p, unsigned int state)
1760} 1793}
1761 1794
1762/* 1795/*
1796 * This function clears the sched_dl_entity static params.
1797 */
1798void __dl_clear_params(struct task_struct *p)
1799{
1800 struct sched_dl_entity *dl_se = &p->dl;
1801
1802 dl_se->dl_runtime = 0;
1803 dl_se->dl_deadline = 0;
1804 dl_se->dl_period = 0;
1805 dl_se->flags = 0;
1806 dl_se->dl_bw = 0;
1807}
1808
1809/*
1763 * Perform scheduler related setup for a newly forked process p. 1810 * Perform scheduler related setup for a newly forked process p.
1764 * p is forked by current. 1811 * p is forked by current.
1765 * 1812 *
@@ -1783,10 +1830,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1783 1830
1784 RB_CLEAR_NODE(&p->dl.rb_node); 1831 RB_CLEAR_NODE(&p->dl.rb_node);
1785 hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 1832 hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1786 p->dl.dl_runtime = p->dl.runtime = 0; 1833 __dl_clear_params(p);
1787 p->dl.dl_deadline = p->dl.deadline = 0;
1788 p->dl.dl_period = 0;
1789 p->dl.flags = 0;
1790 1834
1791 INIT_LIST_HEAD(&p->rt.run_list); 1835 INIT_LIST_HEAD(&p->rt.run_list);
1792 1836
@@ -1961,6 +2005,8 @@ unsigned long to_ratio(u64 period, u64 runtime)
1961#ifdef CONFIG_SMP 2005#ifdef CONFIG_SMP
1962inline struct dl_bw *dl_bw_of(int i) 2006inline struct dl_bw *dl_bw_of(int i)
1963{ 2007{
2008 rcu_lockdep_assert(rcu_read_lock_sched_held(),
2009 "sched RCU must be held");
1964 return &cpu_rq(i)->rd->dl_bw; 2010 return &cpu_rq(i)->rd->dl_bw;
1965} 2011}
1966 2012
@@ -1969,6 +2015,8 @@ static inline int dl_bw_cpus(int i)
1969 struct root_domain *rd = cpu_rq(i)->rd; 2015 struct root_domain *rd = cpu_rq(i)->rd;
1970 int cpus = 0; 2016 int cpus = 0;
1971 2017
2018 rcu_lockdep_assert(rcu_read_lock_sched_held(),
2019 "sched RCU must be held");
1972 for_each_cpu_and(i, rd->span, cpu_active_mask) 2020 for_each_cpu_and(i, rd->span, cpu_active_mask)
1973 cpus++; 2021 cpus++;
1974 2022
@@ -2079,7 +2127,7 @@ void wake_up_new_task(struct task_struct *p)
2079 init_task_runnable_average(p); 2127 init_task_runnable_average(p);
2080 rq = __task_rq_lock(p); 2128 rq = __task_rq_lock(p);
2081 activate_task(rq, p, 0); 2129 activate_task(rq, p, 0);
2082 p->on_rq = 1; 2130 p->on_rq = TASK_ON_RQ_QUEUED;
2083 trace_sched_wakeup_new(p, true); 2131 trace_sched_wakeup_new(p, true);
2084 check_preempt_curr(rq, p, WF_FORK); 2132 check_preempt_curr(rq, p, WF_FORK);
2085#ifdef CONFIG_SMP 2133#ifdef CONFIG_SMP
@@ -2271,10 +2319,6 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev)
2271 */ 2319 */
2272 post_schedule(rq); 2320 post_schedule(rq);
2273 2321
2274#ifdef __ARCH_WANT_UNLOCKED_CTXSW
2275 /* In this case, finish_task_switch does not reenable preemption */
2276 preempt_enable();
2277#endif
2278 if (current->set_child_tid) 2322 if (current->set_child_tid)
2279 put_user(task_pid_vnr(current), current->set_child_tid); 2323 put_user(task_pid_vnr(current), current->set_child_tid);
2280} 2324}
@@ -2317,9 +2361,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2317 * of the scheduler it's an obvious special-case), so we 2361 * of the scheduler it's an obvious special-case), so we
2318 * do an early lockdep release here: 2362 * do an early lockdep release here:
2319 */ 2363 */
2320#ifndef __ARCH_WANT_UNLOCKED_CTXSW
2321 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 2364 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2322#endif
2323 2365
2324 context_tracking_task_switch(prev, next); 2366 context_tracking_task_switch(prev, next);
2325 /* Here we just switch the register state and the stack. */ 2367 /* Here we just switch the register state and the stack. */
@@ -2447,7 +2489,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
2447 * project cycles that may never be accounted to this 2489 * project cycles that may never be accounted to this
2448 * thread, breaking clock_gettime(). 2490 * thread, breaking clock_gettime().
2449 */ 2491 */
2450 if (task_current(rq, p) && p->on_rq) { 2492 if (task_current(rq, p) && task_on_rq_queued(p)) {
2451 update_rq_clock(rq); 2493 update_rq_clock(rq);
2452 ns = rq_clock_task(rq) - p->se.exec_start; 2494 ns = rq_clock_task(rq) - p->se.exec_start;
2453 if ((s64)ns < 0) 2495 if ((s64)ns < 0)
@@ -2493,7 +2535,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
2493 * If we see ->on_cpu without ->on_rq, the task is leaving, and has 2535 * If we see ->on_cpu without ->on_rq, the task is leaving, and has
2494 * been accounted, so we're correct here as well. 2536 * been accounted, so we're correct here as well.
2495 */ 2537 */
2496 if (!p->on_cpu || !p->on_rq) 2538 if (!p->on_cpu || !task_on_rq_queued(p))
2497 return p->se.sum_exec_runtime; 2539 return p->se.sum_exec_runtime;
2498#endif 2540#endif
2499 2541
@@ -2656,6 +2698,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
2656 */ 2698 */
2657static inline void schedule_debug(struct task_struct *prev) 2699static inline void schedule_debug(struct task_struct *prev)
2658{ 2700{
2701#ifdef CONFIG_SCHED_STACK_END_CHECK
2702 BUG_ON(unlikely(task_stack_end_corrupted(prev)));
2703#endif
2659 /* 2704 /*
2660 * Test if we are atomic. Since do_exit() needs to call into 2705 * Test if we are atomic. Since do_exit() needs to call into
2661 * schedule() atomically, we ignore that path. Otherwise whine 2706 * schedule() atomically, we ignore that path. Otherwise whine
@@ -2797,7 +2842,7 @@ need_resched:
2797 switch_count = &prev->nvcsw; 2842 switch_count = &prev->nvcsw;
2798 } 2843 }
2799 2844
2800 if (prev->on_rq || rq->skip_clock_update < 0) 2845 if (task_on_rq_queued(prev) || rq->skip_clock_update < 0)
2801 update_rq_clock(rq); 2846 update_rq_clock(rq);
2802 2847
2803 next = pick_next_task(rq, prev); 2848 next = pick_next_task(rq, prev);
@@ -2962,7 +3007,7 @@ EXPORT_SYMBOL(default_wake_function);
2962 */ 3007 */
2963void rt_mutex_setprio(struct task_struct *p, int prio) 3008void rt_mutex_setprio(struct task_struct *p, int prio)
2964{ 3009{
2965 int oldprio, on_rq, running, enqueue_flag = 0; 3010 int oldprio, queued, running, enqueue_flag = 0;
2966 struct rq *rq; 3011 struct rq *rq;
2967 const struct sched_class *prev_class; 3012 const struct sched_class *prev_class;
2968 3013
@@ -2991,12 +3036,12 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
2991 trace_sched_pi_setprio(p, prio); 3036 trace_sched_pi_setprio(p, prio);
2992 oldprio = p->prio; 3037 oldprio = p->prio;
2993 prev_class = p->sched_class; 3038 prev_class = p->sched_class;
2994 on_rq = p->on_rq; 3039 queued = task_on_rq_queued(p);
2995 running = task_current(rq, p); 3040 running = task_current(rq, p);
2996 if (on_rq) 3041 if (queued)
2997 dequeue_task(rq, p, 0); 3042 dequeue_task(rq, p, 0);
2998 if (running) 3043 if (running)
2999 p->sched_class->put_prev_task(rq, p); 3044 put_prev_task(rq, p);
3000 3045
3001 /* 3046 /*
3002 * Boosting condition are: 3047 * Boosting condition are:
@@ -3033,7 +3078,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3033 3078
3034 if (running) 3079 if (running)
3035 p->sched_class->set_curr_task(rq); 3080 p->sched_class->set_curr_task(rq);
3036 if (on_rq) 3081 if (queued)
3037 enqueue_task(rq, p, enqueue_flag); 3082 enqueue_task(rq, p, enqueue_flag);
3038 3083
3039 check_class_changed(rq, p, prev_class, oldprio); 3084 check_class_changed(rq, p, prev_class, oldprio);
@@ -3044,7 +3089,7 @@ out_unlock:
3044 3089
3045void set_user_nice(struct task_struct *p, long nice) 3090void set_user_nice(struct task_struct *p, long nice)
3046{ 3091{
3047 int old_prio, delta, on_rq; 3092 int old_prio, delta, queued;
3048 unsigned long flags; 3093 unsigned long flags;
3049 struct rq *rq; 3094 struct rq *rq;
3050 3095
@@ -3065,8 +3110,8 @@ void set_user_nice(struct task_struct *p, long nice)
3065 p->static_prio = NICE_TO_PRIO(nice); 3110 p->static_prio = NICE_TO_PRIO(nice);
3066 goto out_unlock; 3111 goto out_unlock;
3067 } 3112 }
3068 on_rq = p->on_rq; 3113 queued = task_on_rq_queued(p);
3069 if (on_rq) 3114 if (queued)
3070 dequeue_task(rq, p, 0); 3115 dequeue_task(rq, p, 0);
3071 3116
3072 p->static_prio = NICE_TO_PRIO(nice); 3117 p->static_prio = NICE_TO_PRIO(nice);
@@ -3075,7 +3120,7 @@ void set_user_nice(struct task_struct *p, long nice)
3075 p->prio = effective_prio(p); 3120 p->prio = effective_prio(p);
3076 delta = p->prio - old_prio; 3121 delta = p->prio - old_prio;
3077 3122
3078 if (on_rq) { 3123 if (queued) {
3079 enqueue_task(rq, p, 0); 3124 enqueue_task(rq, p, 0);
3080 /* 3125 /*
3081 * If the task increased its priority or is running and 3126 * If the task increased its priority or is running and
@@ -3347,7 +3392,7 @@ static int __sched_setscheduler(struct task_struct *p,
3347{ 3392{
3348 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 : 3393 int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
3349 MAX_RT_PRIO - 1 - attr->sched_priority; 3394 MAX_RT_PRIO - 1 - attr->sched_priority;
3350 int retval, oldprio, oldpolicy = -1, on_rq, running; 3395 int retval, oldprio, oldpolicy = -1, queued, running;
3351 int policy = attr->sched_policy; 3396 int policy = attr->sched_policy;
3352 unsigned long flags; 3397 unsigned long flags;
3353 const struct sched_class *prev_class; 3398 const struct sched_class *prev_class;
@@ -3544,19 +3589,19 @@ change:
3544 return 0; 3589 return 0;
3545 } 3590 }
3546 3591
3547 on_rq = p->on_rq; 3592 queued = task_on_rq_queued(p);
3548 running = task_current(rq, p); 3593 running = task_current(rq, p);
3549 if (on_rq) 3594 if (queued)
3550 dequeue_task(rq, p, 0); 3595 dequeue_task(rq, p, 0);
3551 if (running) 3596 if (running)
3552 p->sched_class->put_prev_task(rq, p); 3597 put_prev_task(rq, p);
3553 3598
3554 prev_class = p->sched_class; 3599 prev_class = p->sched_class;
3555 __setscheduler(rq, p, attr); 3600 __setscheduler(rq, p, attr);
3556 3601
3557 if (running) 3602 if (running)
3558 p->sched_class->set_curr_task(rq); 3603 p->sched_class->set_curr_task(rq);
3559 if (on_rq) { 3604 if (queued) {
3560 /* 3605 /*
3561 * We enqueue to tail when the priority of a task is 3606 * We enqueue to tail when the priority of a task is
3562 * increased (user space view). 3607 * increased (user space view).
@@ -3980,14 +4025,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3980 rcu_read_lock(); 4025 rcu_read_lock();
3981 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { 4026 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
3982 rcu_read_unlock(); 4027 rcu_read_unlock();
3983 goto out_unlock; 4028 goto out_free_new_mask;
3984 } 4029 }
3985 rcu_read_unlock(); 4030 rcu_read_unlock();
3986 } 4031 }
3987 4032
3988 retval = security_task_setscheduler(p); 4033 retval = security_task_setscheduler(p);
3989 if (retval) 4034 if (retval)
3990 goto out_unlock; 4035 goto out_free_new_mask;
3991 4036
3992 4037
3993 cpuset_cpus_allowed(p, cpus_allowed); 4038 cpuset_cpus_allowed(p, cpus_allowed);
@@ -4000,13 +4045,14 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4000 * root_domain. 4045 * root_domain.
4001 */ 4046 */
4002#ifdef CONFIG_SMP 4047#ifdef CONFIG_SMP
4003 if (task_has_dl_policy(p)) { 4048 if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
4004 const struct cpumask *span = task_rq(p)->rd->span; 4049 rcu_read_lock();
4005 4050 if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
4006 if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) {
4007 retval = -EBUSY; 4051 retval = -EBUSY;
4008 goto out_unlock; 4052 rcu_read_unlock();
4053 goto out_free_new_mask;
4009 } 4054 }
4055 rcu_read_unlock();
4010 } 4056 }
4011#endif 4057#endif
4012again: 4058again:
@@ -4024,7 +4070,7 @@ again:
4024 goto again; 4070 goto again;
4025 } 4071 }
4026 } 4072 }
4027out_unlock: 4073out_free_new_mask:
4028 free_cpumask_var(new_mask); 4074 free_cpumask_var(new_mask);
4029out_free_cpus_allowed: 4075out_free_cpus_allowed:
4030 free_cpumask_var(cpus_allowed); 4076 free_cpumask_var(cpus_allowed);
@@ -4508,7 +4554,7 @@ void show_state_filter(unsigned long state_filter)
4508 " task PC stack pid father\n"); 4554 " task PC stack pid father\n");
4509#endif 4555#endif
4510 rcu_read_lock(); 4556 rcu_read_lock();
4511 do_each_thread(g, p) { 4557 for_each_process_thread(g, p) {
4512 /* 4558 /*
4513 * reset the NMI-timeout, listing all files on a slow 4559 * reset the NMI-timeout, listing all files on a slow
4514 * console might take a lot of time: 4560 * console might take a lot of time:
@@ -4516,7 +4562,7 @@ void show_state_filter(unsigned long state_filter)
4516 touch_nmi_watchdog(); 4562 touch_nmi_watchdog();
4517 if (!state_filter || (p->state & state_filter)) 4563 if (!state_filter || (p->state & state_filter))
4518 sched_show_task(p); 4564 sched_show_task(p);
4519 } while_each_thread(g, p); 4565 }
4520 4566
4521 touch_all_softlockup_watchdogs(); 4567 touch_all_softlockup_watchdogs();
4522 4568
@@ -4571,7 +4617,7 @@ void init_idle(struct task_struct *idle, int cpu)
4571 rcu_read_unlock(); 4617 rcu_read_unlock();
4572 4618
4573 rq->curr = rq->idle = idle; 4619 rq->curr = rq->idle = idle;
4574 idle->on_rq = 1; 4620 idle->on_rq = TASK_ON_RQ_QUEUED;
4575#if defined(CONFIG_SMP) 4621#if defined(CONFIG_SMP)
4576 idle->on_cpu = 1; 4622 idle->on_cpu = 1;
4577#endif 4623#endif
@@ -4592,6 +4638,33 @@ void init_idle(struct task_struct *idle, int cpu)
4592} 4638}
4593 4639
4594#ifdef CONFIG_SMP 4640#ifdef CONFIG_SMP
4641/*
4642 * move_queued_task - move a queued task to new rq.
4643 *
4644 * Returns (locked) new rq. Old rq's lock is released.
4645 */
4646static struct rq *move_queued_task(struct task_struct *p, int new_cpu)
4647{
4648 struct rq *rq = task_rq(p);
4649
4650 lockdep_assert_held(&rq->lock);
4651
4652 dequeue_task(rq, p, 0);
4653 p->on_rq = TASK_ON_RQ_MIGRATING;
4654 set_task_cpu(p, new_cpu);
4655 raw_spin_unlock(&rq->lock);
4656
4657 rq = cpu_rq(new_cpu);
4658
4659 raw_spin_lock(&rq->lock);
4660 BUG_ON(task_cpu(p) != new_cpu);
4661 p->on_rq = TASK_ON_RQ_QUEUED;
4662 enqueue_task(rq, p, 0);
4663 check_preempt_curr(rq, p, 0);
4664
4665 return rq;
4666}
4667
4595void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 4668void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
4596{ 4669{
4597 if (p->sched_class && p->sched_class->set_cpus_allowed) 4670 if (p->sched_class && p->sched_class->set_cpus_allowed)
@@ -4648,14 +4721,15 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
4648 goto out; 4721 goto out;
4649 4722
4650 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask); 4723 dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
4651 if (p->on_rq) { 4724 if (task_running(rq, p) || p->state == TASK_WAKING) {
4652 struct migration_arg arg = { p, dest_cpu }; 4725 struct migration_arg arg = { p, dest_cpu };
4653 /* Need help from migration thread: drop lock and wait. */ 4726 /* Need help from migration thread: drop lock and wait. */
4654 task_rq_unlock(rq, p, &flags); 4727 task_rq_unlock(rq, p, &flags);
4655 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); 4728 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
4656 tlb_migrate_finish(p->mm); 4729 tlb_migrate_finish(p->mm);
4657 return 0; 4730 return 0;
4658 } 4731 } else if (task_on_rq_queued(p))
4732 rq = move_queued_task(p, dest_cpu);
4659out: 4733out:
4660 task_rq_unlock(rq, p, &flags); 4734 task_rq_unlock(rq, p, &flags);
4661 4735
@@ -4676,20 +4750,20 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
4676 */ 4750 */
4677static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4751static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4678{ 4752{
4679 struct rq *rq_dest, *rq_src; 4753 struct rq *rq;
4680 int ret = 0; 4754 int ret = 0;
4681 4755
4682 if (unlikely(!cpu_active(dest_cpu))) 4756 if (unlikely(!cpu_active(dest_cpu)))
4683 return ret; 4757 return ret;
4684 4758
4685 rq_src = cpu_rq(src_cpu); 4759 rq = cpu_rq(src_cpu);
4686 rq_dest = cpu_rq(dest_cpu);
4687 4760
4688 raw_spin_lock(&p->pi_lock); 4761 raw_spin_lock(&p->pi_lock);
4689 double_rq_lock(rq_src, rq_dest); 4762 raw_spin_lock(&rq->lock);
4690 /* Already moved. */ 4763 /* Already moved. */
4691 if (task_cpu(p) != src_cpu) 4764 if (task_cpu(p) != src_cpu)
4692 goto done; 4765 goto done;
4766
4693 /* Affinity changed (again). */ 4767 /* Affinity changed (again). */
4694 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p))) 4768 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
4695 goto fail; 4769 goto fail;
@@ -4698,16 +4772,12 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4698 * If we're not on a rq, the next wake-up will ensure we're 4772 * If we're not on a rq, the next wake-up will ensure we're
4699 * placed properly. 4773 * placed properly.
4700 */ 4774 */
4701 if (p->on_rq) { 4775 if (task_on_rq_queued(p))
4702 dequeue_task(rq_src, p, 0); 4776 rq = move_queued_task(p, dest_cpu);
4703 set_task_cpu(p, dest_cpu);
4704 enqueue_task(rq_dest, p, 0);
4705 check_preempt_curr(rq_dest, p, 0);
4706 }
4707done: 4777done:
4708 ret = 1; 4778 ret = 1;
4709fail: 4779fail:
4710 double_rq_unlock(rq_src, rq_dest); 4780 raw_spin_unlock(&rq->lock);
4711 raw_spin_unlock(&p->pi_lock); 4781 raw_spin_unlock(&p->pi_lock);
4712 return ret; 4782 return ret;
4713} 4783}
@@ -4739,22 +4809,22 @@ void sched_setnuma(struct task_struct *p, int nid)
4739{ 4809{
4740 struct rq *rq; 4810 struct rq *rq;
4741 unsigned long flags; 4811 unsigned long flags;
4742 bool on_rq, running; 4812 bool queued, running;
4743 4813
4744 rq = task_rq_lock(p, &flags); 4814 rq = task_rq_lock(p, &flags);
4745 on_rq = p->on_rq; 4815 queued = task_on_rq_queued(p);
4746 running = task_current(rq, p); 4816 running = task_current(rq, p);
4747 4817
4748 if (on_rq) 4818 if (queued)
4749 dequeue_task(rq, p, 0); 4819 dequeue_task(rq, p, 0);
4750 if (running) 4820 if (running)
4751 p->sched_class->put_prev_task(rq, p); 4821 put_prev_task(rq, p);
4752 4822
4753 p->numa_preferred_nid = nid; 4823 p->numa_preferred_nid = nid;
4754 4824
4755 if (running) 4825 if (running)
4756 p->sched_class->set_curr_task(rq); 4826 p->sched_class->set_curr_task(rq);
4757 if (on_rq) 4827 if (queued)
4758 enqueue_task(rq, p, 0); 4828 enqueue_task(rq, p, 0);
4759 task_rq_unlock(rq, p, &flags); 4829 task_rq_unlock(rq, p, &flags);
4760} 4830}
@@ -4774,6 +4844,12 @@ static int migration_cpu_stop(void *data)
4774 * be on another cpu but it doesn't matter. 4844 * be on another cpu but it doesn't matter.
4775 */ 4845 */
4776 local_irq_disable(); 4846 local_irq_disable();
4847 /*
4848 * We need to explicitly wake pending tasks before running
4849 * __migrate_task() such that we will not miss enforcing cpus_allowed
4850 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
4851 */
4852 sched_ttwu_pending();
4777 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu); 4853 __migrate_task(arg->task, raw_smp_processor_id(), arg->dest_cpu);
4778 local_irq_enable(); 4854 local_irq_enable();
4779 return 0; 4855 return 0;
@@ -5184,6 +5260,7 @@ static int sched_cpu_inactive(struct notifier_block *nfb,
5184{ 5260{
5185 unsigned long flags; 5261 unsigned long flags;
5186 long cpu = (long)hcpu; 5262 long cpu = (long)hcpu;
5263 struct dl_bw *dl_b;
5187 5264
5188 switch (action & ~CPU_TASKS_FROZEN) { 5265 switch (action & ~CPU_TASKS_FROZEN) {
5189 case CPU_DOWN_PREPARE: 5266 case CPU_DOWN_PREPARE:
@@ -5191,15 +5268,19 @@ static int sched_cpu_inactive(struct notifier_block *nfb,
5191 5268
5192 /* explicitly allow suspend */ 5269 /* explicitly allow suspend */
5193 if (!(action & CPU_TASKS_FROZEN)) { 5270 if (!(action & CPU_TASKS_FROZEN)) {
5194 struct dl_bw *dl_b = dl_bw_of(cpu);
5195 bool overflow; 5271 bool overflow;
5196 int cpus; 5272 int cpus;
5197 5273
5274 rcu_read_lock_sched();
5275 dl_b = dl_bw_of(cpu);
5276
5198 raw_spin_lock_irqsave(&dl_b->lock, flags); 5277 raw_spin_lock_irqsave(&dl_b->lock, flags);
5199 cpus = dl_bw_cpus(cpu); 5278 cpus = dl_bw_cpus(cpu);
5200 overflow = __dl_overflow(dl_b, cpus, 0, 0); 5279 overflow = __dl_overflow(dl_b, cpus, 0, 0);
5201 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 5280 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
5202 5281
5282 rcu_read_unlock_sched();
5283
5203 if (overflow) 5284 if (overflow)
5204 return notifier_from_errno(-EBUSY); 5285 return notifier_from_errno(-EBUSY);
5205 } 5286 }
@@ -5742,7 +5823,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5742 const struct cpumask *span = sched_domain_span(sd); 5823 const struct cpumask *span = sched_domain_span(sd);
5743 struct cpumask *covered = sched_domains_tmpmask; 5824 struct cpumask *covered = sched_domains_tmpmask;
5744 struct sd_data *sdd = sd->private; 5825 struct sd_data *sdd = sd->private;
5745 struct sched_domain *child; 5826 struct sched_domain *sibling;
5746 int i; 5827 int i;
5747 5828
5748 cpumask_clear(covered); 5829 cpumask_clear(covered);
@@ -5753,10 +5834,10 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5753 if (cpumask_test_cpu(i, covered)) 5834 if (cpumask_test_cpu(i, covered))
5754 continue; 5835 continue;
5755 5836
5756 child = *per_cpu_ptr(sdd->sd, i); 5837 sibling = *per_cpu_ptr(sdd->sd, i);
5757 5838
5758 /* See the comment near build_group_mask(). */ 5839 /* See the comment near build_group_mask(). */
5759 if (!cpumask_test_cpu(i, sched_domain_span(child))) 5840 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
5760 continue; 5841 continue;
5761 5842
5762 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 5843 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
@@ -5766,10 +5847,9 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
5766 goto fail; 5847 goto fail;
5767 5848
5768 sg_span = sched_group_cpus(sg); 5849 sg_span = sched_group_cpus(sg);
5769 if (child->child) { 5850 if (sibling->child)
5770 child = child->child; 5851 cpumask_copy(sg_span, sched_domain_span(sibling->child));
5771 cpumask_copy(sg_span, sched_domain_span(child)); 5852 else
5772 } else
5773 cpumask_set_cpu(i, sg_span); 5853 cpumask_set_cpu(i, sg_span);
5774 5854
5775 cpumask_or(covered, covered, sg_span); 5855 cpumask_or(covered, covered, sg_span);
@@ -7120,13 +7200,13 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
7120 .sched_policy = SCHED_NORMAL, 7200 .sched_policy = SCHED_NORMAL,
7121 }; 7201 };
7122 int old_prio = p->prio; 7202 int old_prio = p->prio;
7123 int on_rq; 7203 int queued;
7124 7204
7125 on_rq = p->on_rq; 7205 queued = task_on_rq_queued(p);
7126 if (on_rq) 7206 if (queued)
7127 dequeue_task(rq, p, 0); 7207 dequeue_task(rq, p, 0);
7128 __setscheduler(rq, p, &attr); 7208 __setscheduler(rq, p, &attr);
7129 if (on_rq) { 7209 if (queued) {
7130 enqueue_task(rq, p, 0); 7210 enqueue_task(rq, p, 0);
7131 resched_curr(rq); 7211 resched_curr(rq);
7132 } 7212 }
@@ -7140,12 +7220,12 @@ void normalize_rt_tasks(void)
7140 unsigned long flags; 7220 unsigned long flags;
7141 struct rq *rq; 7221 struct rq *rq;
7142 7222
7143 read_lock_irqsave(&tasklist_lock, flags); 7223 read_lock(&tasklist_lock);
7144 do_each_thread(g, p) { 7224 for_each_process_thread(g, p) {
7145 /* 7225 /*
7146 * Only normalize user tasks: 7226 * Only normalize user tasks:
7147 */ 7227 */
7148 if (!p->mm) 7228 if (p->flags & PF_KTHREAD)
7149 continue; 7229 continue;
7150 7230
7151 p->se.exec_start = 0; 7231 p->se.exec_start = 0;
@@ -7160,21 +7240,16 @@ void normalize_rt_tasks(void)
7160 * Renice negative nice level userspace 7240 * Renice negative nice level userspace
7161 * tasks back to 0: 7241 * tasks back to 0:
7162 */ 7242 */
7163 if (task_nice(p) < 0 && p->mm) 7243 if (task_nice(p) < 0)
7164 set_user_nice(p, 0); 7244 set_user_nice(p, 0);
7165 continue; 7245 continue;
7166 } 7246 }
7167 7247
7168 raw_spin_lock(&p->pi_lock); 7248 rq = task_rq_lock(p, &flags);
7169 rq = __task_rq_lock(p);
7170
7171 normalize_task(rq, p); 7249 normalize_task(rq, p);
7172 7250 task_rq_unlock(rq, p, &flags);
7173 __task_rq_unlock(rq); 7251 }
7174 raw_spin_unlock(&p->pi_lock); 7252 read_unlock(&tasklist_lock);
7175 } while_each_thread(g, p);
7176
7177 read_unlock_irqrestore(&tasklist_lock, flags);
7178} 7253}
7179 7254
7180#endif /* CONFIG_MAGIC_SYSRQ */ 7255#endif /* CONFIG_MAGIC_SYSRQ */
@@ -7314,19 +7389,19 @@ void sched_offline_group(struct task_group *tg)
7314void sched_move_task(struct task_struct *tsk) 7389void sched_move_task(struct task_struct *tsk)
7315{ 7390{
7316 struct task_group *tg; 7391 struct task_group *tg;
7317 int on_rq, running; 7392 int queued, running;
7318 unsigned long flags; 7393 unsigned long flags;
7319 struct rq *rq; 7394 struct rq *rq;
7320 7395
7321 rq = task_rq_lock(tsk, &flags); 7396 rq = task_rq_lock(tsk, &flags);
7322 7397
7323 running = task_current(rq, tsk); 7398 running = task_current(rq, tsk);
7324 on_rq = tsk->on_rq; 7399 queued = task_on_rq_queued(tsk);
7325 7400
7326 if (on_rq) 7401 if (queued)
7327 dequeue_task(rq, tsk, 0); 7402 dequeue_task(rq, tsk, 0);
7328 if (unlikely(running)) 7403 if (unlikely(running))
7329 tsk->sched_class->put_prev_task(rq, tsk); 7404 put_prev_task(rq, tsk);
7330 7405
7331 tg = container_of(task_css_check(tsk, cpu_cgrp_id, 7406 tg = container_of(task_css_check(tsk, cpu_cgrp_id,
7332 lockdep_is_held(&tsk->sighand->siglock)), 7407 lockdep_is_held(&tsk->sighand->siglock)),
@@ -7336,14 +7411,14 @@ void sched_move_task(struct task_struct *tsk)
7336 7411
7337#ifdef CONFIG_FAIR_GROUP_SCHED 7412#ifdef CONFIG_FAIR_GROUP_SCHED
7338 if (tsk->sched_class->task_move_group) 7413 if (tsk->sched_class->task_move_group)
7339 tsk->sched_class->task_move_group(tsk, on_rq); 7414 tsk->sched_class->task_move_group(tsk, queued);
7340 else 7415 else
7341#endif 7416#endif
7342 set_task_rq(tsk, task_cpu(tsk)); 7417 set_task_rq(tsk, task_cpu(tsk));
7343 7418
7344 if (unlikely(running)) 7419 if (unlikely(running))
7345 tsk->sched_class->set_curr_task(rq); 7420 tsk->sched_class->set_curr_task(rq);
7346 if (on_rq) 7421 if (queued)
7347 enqueue_task(rq, tsk, 0); 7422 enqueue_task(rq, tsk, 0);
7348 7423
7349 task_rq_unlock(rq, tsk, &flags); 7424 task_rq_unlock(rq, tsk, &flags);
@@ -7361,10 +7436,10 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
7361{ 7436{
7362 struct task_struct *g, *p; 7437 struct task_struct *g, *p;
7363 7438
7364 do_each_thread(g, p) { 7439 for_each_process_thread(g, p) {
7365 if (rt_task(p) && task_rq(p)->rt.tg == tg) 7440 if (rt_task(p) && task_group(p) == tg)
7366 return 1; 7441 return 1;
7367 } while_each_thread(g, p); 7442 }
7368 7443
7369 return 0; 7444 return 0;
7370} 7445}
@@ -7573,6 +7648,7 @@ static int sched_dl_global_constraints(void)
7573 u64 runtime = global_rt_runtime(); 7648 u64 runtime = global_rt_runtime();
7574 u64 period = global_rt_period(); 7649 u64 period = global_rt_period();
7575 u64 new_bw = to_ratio(period, runtime); 7650 u64 new_bw = to_ratio(period, runtime);
7651 struct dl_bw *dl_b;
7576 int cpu, ret = 0; 7652 int cpu, ret = 0;
7577 unsigned long flags; 7653 unsigned long flags;
7578 7654
@@ -7586,13 +7662,16 @@ static int sched_dl_global_constraints(void)
7586 * solutions is welcome! 7662 * solutions is welcome!
7587 */ 7663 */
7588 for_each_possible_cpu(cpu) { 7664 for_each_possible_cpu(cpu) {
7589 struct dl_bw *dl_b = dl_bw_of(cpu); 7665 rcu_read_lock_sched();
7666 dl_b = dl_bw_of(cpu);
7590 7667
7591 raw_spin_lock_irqsave(&dl_b->lock, flags); 7668 raw_spin_lock_irqsave(&dl_b->lock, flags);
7592 if (new_bw < dl_b->total_bw) 7669 if (new_bw < dl_b->total_bw)
7593 ret = -EBUSY; 7670 ret = -EBUSY;
7594 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 7671 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7595 7672
7673 rcu_read_unlock_sched();
7674
7596 if (ret) 7675 if (ret)
7597 break; 7676 break;
7598 } 7677 }
@@ -7603,6 +7682,7 @@ static int sched_dl_global_constraints(void)
7603static void sched_dl_do_global(void) 7682static void sched_dl_do_global(void)
7604{ 7683{
7605 u64 new_bw = -1; 7684 u64 new_bw = -1;
7685 struct dl_bw *dl_b;
7606 int cpu; 7686 int cpu;
7607 unsigned long flags; 7687 unsigned long flags;
7608 7688
@@ -7616,11 +7696,14 @@ static void sched_dl_do_global(void)
7616 * FIXME: As above... 7696 * FIXME: As above...
7617 */ 7697 */
7618 for_each_possible_cpu(cpu) { 7698 for_each_possible_cpu(cpu) {
7619 struct dl_bw *dl_b = dl_bw_of(cpu); 7699 rcu_read_lock_sched();
7700 dl_b = dl_bw_of(cpu);
7620 7701
7621 raw_spin_lock_irqsave(&dl_b->lock, flags); 7702 raw_spin_lock_irqsave(&dl_b->lock, flags);
7622 dl_b->bw = new_bw; 7703 dl_b->bw = new_bw;
7623 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 7704 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
7705
7706 rcu_read_unlock_sched();
7624 } 7707 }
7625} 7708}
7626 7709
@@ -8001,7 +8084,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
8001 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth; 8084 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
8002 8085
8003 quota = normalize_cfs_quota(tg, d); 8086 quota = normalize_cfs_quota(tg, d);
8004 parent_quota = parent_b->hierarchal_quota; 8087 parent_quota = parent_b->hierarchical_quota;
8005 8088
8006 /* 8089 /*
8007 * ensure max(child_quota) <= parent_quota, inherit when no 8090 * ensure max(child_quota) <= parent_quota, inherit when no
@@ -8012,7 +8095,7 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
8012 else if (parent_quota != RUNTIME_INF && quota > parent_quota) 8095 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
8013 return -EINVAL; 8096 return -EINVAL;
8014 } 8097 }
8015 cfs_b->hierarchal_quota = quota; 8098 cfs_b->hierarchical_quota = quota;
8016 8099
8017 return 0; 8100 return 0;
8018} 8101}
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index bd95963dae80..539ca3ce071b 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -107,9 +107,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
107 int best_cpu = -1; 107 int best_cpu = -1;
108 const struct sched_dl_entity *dl_se = &p->dl; 108 const struct sched_dl_entity *dl_se = &p->dl;
109 109
110 if (later_mask && cpumask_and(later_mask, cp->free_cpus, 110 if (later_mask && cpumask_and(later_mask, later_mask, cp->free_cpus)) {
111 &p->cpus_allowed) && cpumask_and(later_mask,
112 later_mask, cpu_active_mask)) {
113 best_cpu = cpumask_any(later_mask); 111 best_cpu = cpumask_any(later_mask);
114 goto out; 112 goto out;
115 } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) && 113 } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 72fdf06ef865..8394b1ee600c 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -288,24 +288,29 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
288 struct signal_struct *sig = tsk->signal; 288 struct signal_struct *sig = tsk->signal;
289 cputime_t utime, stime; 289 cputime_t utime, stime;
290 struct task_struct *t; 290 struct task_struct *t;
291 291 unsigned int seq, nextseq;
292 times->utime = sig->utime; 292 unsigned long flags;
293 times->stime = sig->stime;
294 times->sum_exec_runtime = sig->sum_sched_runtime;
295 293
296 rcu_read_lock(); 294 rcu_read_lock();
297 /* make sure we can trust tsk->thread_group list */ 295 /* Attempt a lockless read on the first round. */
298 if (!likely(pid_alive(tsk))) 296 nextseq = 0;
299 goto out;
300
301 t = tsk;
302 do { 297 do {
303 task_cputime(t, &utime, &stime); 298 seq = nextseq;
304 times->utime += utime; 299 flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
305 times->stime += stime; 300 times->utime = sig->utime;
306 times->sum_exec_runtime += task_sched_runtime(t); 301 times->stime = sig->stime;
307 } while_each_thread(tsk, t); 302 times->sum_exec_runtime = sig->sum_sched_runtime;
308out: 303
304 for_each_thread(tsk, t) {
305 task_cputime(t, &utime, &stime);
306 times->utime += utime;
307 times->stime += stime;
308 times->sum_exec_runtime += task_sched_runtime(t);
309 }
310 /* If lockless access failed, take the lock. */
311 nextseq = 1;
312 } while (need_seqretry(&sig->stats_lock, seq));
313 done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
309 rcu_read_unlock(); 314 rcu_read_unlock();
310} 315}
311 316
@@ -550,6 +555,23 @@ drop_precision:
550} 555}
551 556
552/* 557/*
558 * Atomically advance counter to the new value. Interrupts, vcpu
559 * scheduling, and scaling inaccuracies can cause cputime_advance
560 * to be occasionally called with a new value smaller than counter.
561 * Let's enforce atomicity.
562 *
563 * Normally a caller will only go through this loop once, or not
564 * at all in case a previous caller updated counter the same jiffy.
565 */
566static void cputime_advance(cputime_t *counter, cputime_t new)
567{
568 cputime_t old;
569
570 while (new > (old = ACCESS_ONCE(*counter)))
571 cmpxchg_cputime(counter, old, new);
572}
573
574/*
553 * Adjust tick based cputime random precision against scheduler 575 * Adjust tick based cputime random precision against scheduler
554 * runtime accounting. 576 * runtime accounting.
555 */ 577 */
@@ -594,13 +616,8 @@ static void cputime_adjust(struct task_cputime *curr,
594 utime = rtime - stime; 616 utime = rtime - stime;
595 } 617 }
596 618
597 /* 619 cputime_advance(&prev->stime, stime);
598 * If the tick based count grows faster than the scheduler one, 620 cputime_advance(&prev->utime, utime);
599 * the result of the scaling may go backward.
600 * Let's enforce monotonicity.
601 */
602 prev->stime = max(prev->stime, stime);
603 prev->utime = max(prev->utime, utime);
604 621
605out: 622out:
606 *ut = prev->utime; 623 *ut = prev->utime;
@@ -617,9 +634,6 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
617 cputime_adjust(&cputime, &p->prev_cputime, ut, st); 634 cputime_adjust(&cputime, &p->prev_cputime, ut, st);
618} 635}
619 636
620/*
621 * Must be called with siglock held.
622 */
623void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st) 637void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
624{ 638{
625 struct task_cputime cputime; 639 struct task_cputime cputime;
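Besides reworking thread_group_cputime() around the seqlock reader loop (read_seqbegin_or_lock_irqsave() with a lockless first pass), the cputime.c hunk above introduces cputime_advance(), which replaces the two max() updates in cputime_adjust() with a compare-and-swap loop so concurrent callers can only ever move prev->utime/prev->stime forward. A minimal userspace rendering of that loop, with a C11 atomic in place of cmpxchg_cputime() and 64-bit counters assumed:

#include <stdatomic.h>
#include <stdint.h>

/* Advance *counter to 'new' only if that moves it forward; a lost race just
 * reloads 'old' and re-checks, so the counter never goes backwards. */
static void demo_cputime_advance(_Atomic uint64_t *counter, uint64_t new)
{
	uint64_t old = atomic_load(counter);

	while (new > old &&
	       !atomic_compare_exchange_weak(counter, &old, new))
		;	/* failed CAS updated 'old'; the loop re-checks new > old */
}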
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 255ce138b652..abfaf3d9a29f 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -530,7 +530,7 @@ again:
530 update_rq_clock(rq); 530 update_rq_clock(rq);
531 dl_se->dl_throttled = 0; 531 dl_se->dl_throttled = 0;
532 dl_se->dl_yielded = 0; 532 dl_se->dl_yielded = 0;
533 if (p->on_rq) { 533 if (task_on_rq_queued(p)) {
534 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); 534 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
535 if (task_has_dl_policy(rq->curr)) 535 if (task_has_dl_policy(rq->curr))
536 check_preempt_curr_dl(rq, p, 0); 536 check_preempt_curr_dl(rq, p, 0);
@@ -997,10 +997,7 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
997#ifdef CONFIG_SCHED_HRTICK 997#ifdef CONFIG_SCHED_HRTICK
998static void start_hrtick_dl(struct rq *rq, struct task_struct *p) 998static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
999{ 999{
1000 s64 delta = p->dl.dl_runtime - p->dl.runtime; 1000 hrtick_start(rq, p->dl.runtime);
1001
1002 if (delta > 10000)
1003 hrtick_start(rq, p->dl.runtime);
1004} 1001}
1005#endif 1002#endif
1006 1003
@@ -1030,7 +1027,7 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
1030 * means a stop task can slip in, in which case we need to 1027 * means a stop task can slip in, in which case we need to
1031 * re-start task selection. 1028 * re-start task selection.
1032 */ 1029 */
1033 if (rq->stop && rq->stop->on_rq) 1030 if (rq->stop && task_on_rq_queued(rq->stop))
1034 return RETRY_TASK; 1031 return RETRY_TASK;
1035 } 1032 }
1036 1033
@@ -1124,10 +1121,8 @@ static void set_curr_task_dl(struct rq *rq)
1124static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) 1121static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
1125{ 1122{
1126 if (!task_running(rq, p) && 1123 if (!task_running(rq, p) &&
1127 (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && 1124 cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
1128 (p->nr_cpus_allowed > 1))
1129 return 1; 1125 return 1;
1130
1131 return 0; 1126 return 0;
1132} 1127}
1133 1128
@@ -1169,6 +1164,13 @@ static int find_later_rq(struct task_struct *task)
1169 if (task->nr_cpus_allowed == 1) 1164 if (task->nr_cpus_allowed == 1)
1170 return -1; 1165 return -1;
1171 1166
1167 /*
1168 * We have to consider system topology and task affinity
1169 * first, then we can look for a suitable cpu.
1170 */
1171 cpumask_copy(later_mask, task_rq(task)->rd->span);
1172 cpumask_and(later_mask, later_mask, cpu_active_mask);
1173 cpumask_and(later_mask, later_mask, &task->cpus_allowed);
1172 best_cpu = cpudl_find(&task_rq(task)->rd->cpudl, 1174 best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
1173 task, later_mask); 1175 task, later_mask);
1174 if (best_cpu == -1) 1176 if (best_cpu == -1)
@@ -1257,7 +1259,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
1257 if (unlikely(task_rq(task) != rq || 1259 if (unlikely(task_rq(task) != rq ||
1258 !cpumask_test_cpu(later_rq->cpu, 1260 !cpumask_test_cpu(later_rq->cpu,
1259 &task->cpus_allowed) || 1261 &task->cpus_allowed) ||
1260 task_running(rq, task) || !task->on_rq)) { 1262 task_running(rq, task) ||
1263 !task_on_rq_queued(task))) {
1261 double_unlock_balance(rq, later_rq); 1264 double_unlock_balance(rq, later_rq);
1262 later_rq = NULL; 1265 later_rq = NULL;
1263 break; 1266 break;
@@ -1296,7 +1299,7 @@ static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
1296 BUG_ON(task_current(rq, p)); 1299 BUG_ON(task_current(rq, p));
1297 BUG_ON(p->nr_cpus_allowed <= 1); 1300 BUG_ON(p->nr_cpus_allowed <= 1);
1298 1301
1299 BUG_ON(!p->on_rq); 1302 BUG_ON(!task_on_rq_queued(p));
1300 BUG_ON(!dl_task(p)); 1303 BUG_ON(!dl_task(p));
1301 1304
1302 return p; 1305 return p;
@@ -1443,7 +1446,7 @@ static int pull_dl_task(struct rq *this_rq)
1443 dl_time_before(p->dl.deadline, 1446 dl_time_before(p->dl.deadline,
1444 this_rq->dl.earliest_dl.curr))) { 1447 this_rq->dl.earliest_dl.curr))) {
1445 WARN_ON(p == src_rq->curr); 1448 WARN_ON(p == src_rq->curr);
1446 WARN_ON(!p->on_rq); 1449 WARN_ON(!task_on_rq_queued(p));
1447 1450
1448 /* 1451 /*
1449 * Then we pull iff p has actually an earlier 1452 * Then we pull iff p has actually an earlier
@@ -1569,6 +1572,8 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
1569 if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy)) 1572 if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy))
1570 hrtimer_try_to_cancel(&p->dl.dl_timer); 1573 hrtimer_try_to_cancel(&p->dl.dl_timer);
1571 1574
1575 __dl_clear_params(p);
1576
1572#ifdef CONFIG_SMP 1577#ifdef CONFIG_SMP
1573 /* 1578 /*
1574 * Since this might be the only -deadline task on the rq, 1579 * Since this might be the only -deadline task on the rq,
@@ -1596,7 +1601,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
1596 if (unlikely(p->dl.dl_throttled)) 1601 if (unlikely(p->dl.dl_throttled))
1597 return; 1602 return;
1598 1603
1599 if (p->on_rq && rq->curr != p) { 1604 if (task_on_rq_queued(p) && rq->curr != p) {
1600#ifdef CONFIG_SMP 1605#ifdef CONFIG_SMP
1601 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) 1606 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
1602 /* Only reschedule if pushing failed */ 1607 /* Only reschedule if pushing failed */
@@ -1614,7 +1619,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
1614static void prio_changed_dl(struct rq *rq, struct task_struct *p, 1619static void prio_changed_dl(struct rq *rq, struct task_struct *p,
1615 int oldprio) 1620 int oldprio)
1616{ 1621{
1617 if (p->on_rq || rq->curr == p) { 1622 if (task_on_rq_queued(p) || rq->curr == p) {
1618#ifdef CONFIG_SMP 1623#ifdef CONFIG_SMP
1619 /* 1624 /*
1620 * This might be too much, but unfortunately 1625 * This might be too much, but unfortunately
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 627b3c34b821..ce33780d8f20 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -150,7 +150,6 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
150static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) 150static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
151{ 151{
152 struct task_struct *g, *p; 152 struct task_struct *g, *p;
153 unsigned long flags;
154 153
155 SEQ_printf(m, 154 SEQ_printf(m,
156 "\nrunnable tasks:\n" 155 "\nrunnable tasks:\n"
@@ -159,16 +158,14 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
159 "------------------------------------------------------" 158 "------------------------------------------------------"
160 "----------------------------------------------------\n"); 159 "----------------------------------------------------\n");
161 160
162 read_lock_irqsave(&tasklist_lock, flags); 161 rcu_read_lock();
163 162 for_each_process_thread(g, p) {
164 do_each_thread(g, p) {
165 if (task_cpu(p) != rq_cpu) 163 if (task_cpu(p) != rq_cpu)
166 continue; 164 continue;
167 165
168 print_task(m, rq, p); 166 print_task(m, rq, p);
169 } while_each_thread(g, p); 167 }
170 168 rcu_read_unlock();
171 read_unlock_irqrestore(&tasklist_lock, flags);
172} 169}
173 170
174void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) 171void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
@@ -333,9 +330,7 @@ do { \
333 print_cfs_stats(m, cpu); 330 print_cfs_stats(m, cpu);
334 print_rt_stats(m, cpu); 331 print_rt_stats(m, cpu);
335 332
336 rcu_read_lock();
337 print_rq(m, rq, cpu); 333 print_rq(m, rq, cpu);
338 rcu_read_unlock();
339 spin_unlock_irqrestore(&sched_debug_lock, flags); 334 spin_unlock_irqrestore(&sched_debug_lock, flags);
340 SEQ_printf(m, "\n"); 335 SEQ_printf(m, "\n");
341} 336}
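Stripped of the printing details, the replacement above is the usual RCU-protected thread walk, and because print_rq() now takes the read-side lock itself, the rcu_read_lock()/rcu_read_unlock() pair around the call site is dropped in the second hunk. A compact sketch of the pattern (names as in the hunk):

	rcu_read_lock();
	for_each_process_thread(g, p) {
		if (task_cpu(p) != rq_cpu)
			continue;
		print_task(m, rq, p);	/* p stays valid inside the read section */
	}
	rcu_read_unlock();

The dump may now race with fork/exit, which is acceptable for a debug listing and avoids holding tasklist_lock with interrupts disabled.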
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 82088b29704e..b78280c59b46 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -23,6 +23,7 @@
23#include <linux/latencytop.h> 23#include <linux/latencytop.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/cpumask.h> 25#include <linux/cpumask.h>
26#include <linux/cpuidle.h>
26#include <linux/slab.h> 27#include <linux/slab.h>
27#include <linux/profile.h> 28#include <linux/profile.h>
28#include <linux/interrupt.h> 29#include <linux/interrupt.h>
@@ -665,6 +666,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
665} 666}
666 667
667#ifdef CONFIG_SMP 668#ifdef CONFIG_SMP
669static int select_idle_sibling(struct task_struct *p, int cpu);
668static unsigned long task_h_load(struct task_struct *p); 670static unsigned long task_h_load(struct task_struct *p);
669 671
670static inline void __update_task_entity_contrib(struct sched_entity *se); 672static inline void __update_task_entity_contrib(struct sched_entity *se);
@@ -1038,7 +1040,8 @@ struct numa_stats {
1038 */ 1040 */
1039static void update_numa_stats(struct numa_stats *ns, int nid) 1041static void update_numa_stats(struct numa_stats *ns, int nid)
1040{ 1042{
1041 int cpu, cpus = 0; 1043 int smt, cpu, cpus = 0;
1044 unsigned long capacity;
1042 1045
1043 memset(ns, 0, sizeof(*ns)); 1046 memset(ns, 0, sizeof(*ns));
1044 for_each_cpu(cpu, cpumask_of_node(nid)) { 1047 for_each_cpu(cpu, cpumask_of_node(nid)) {
@@ -1062,8 +1065,12 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
1062 if (!cpus) 1065 if (!cpus)
1063 return; 1066 return;
1064 1067
1065 ns->task_capacity = 1068 /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
1066 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE); 1069 smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1070 capacity = cpus / smt; /* cores */
1071
1072 ns->task_capacity = min_t(unsigned, capacity,
1073 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1067 ns->has_free_capacity = (ns->nr_running < ns->task_capacity); 1074 ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
1068} 1075}
1069 1076
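To see what the SMT correction buys, here is the arithmetic with invented numbers for a hypothetical SMT-2 node: 8 logical CPUs whose per-CPU capacity works out to roughly 589. A standalone sketch, not kernel code:

#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define DIV_ROUND_CLOSEST(n, d)	(((n) + (d) / 2) / (d))

int main(void)
{
	unsigned long cpus = 8, compute_capacity = 8 * 589;	/* 4712 */
	unsigned long smt, capacity, old_cap, new_cap;

	smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, compute_capacity); /* ceil(8192/4712) = 2 */
	capacity = cpus / smt;						   /* 4 cores */

	old_cap = DIV_ROUND_CLOSEST(compute_capacity, SCHED_CAPACITY_SCALE); /* 5 */
	new_cap = capacity < old_cap ? capacity : old_cap;		      /* 4 */

	printf("old task_capacity=%lu, new task_capacity=%lu\n", old_cap, new_cap);
	return 0;
}

The old formula counted SMT siblings as extra task slots (5 here); the new one caps the estimate at the number of cores, so NUMA balancing stops piling a fifth task onto a four-core node.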
@@ -1206,7 +1213,7 @@ static void task_numa_compare(struct task_numa_env *env,
1206 1213
1207 if (!cur) { 1214 if (!cur) {
1208 /* Is there capacity at our destination? */ 1215 /* Is there capacity at our destination? */
1209 if (env->src_stats.has_free_capacity && 1216 if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
1210 !env->dst_stats.has_free_capacity) 1217 !env->dst_stats.has_free_capacity)
1211 goto unlock; 1218 goto unlock;
1212 1219
@@ -1252,6 +1259,13 @@ balance:
1252 if (load_too_imbalanced(src_load, dst_load, env)) 1259 if (load_too_imbalanced(src_load, dst_load, env))
1253 goto unlock; 1260 goto unlock;
1254 1261
1262 /*
1263 * One idle CPU per node is evaluated for a task numa move.
1264 * Call select_idle_sibling to maybe find a better one.
1265 */
1266 if (!cur)
1267 env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
1268
1255assign: 1269assign:
1256 task_numa_assign(env, cur, imp); 1270 task_numa_assign(env, cur, imp);
1257unlock: 1271unlock:
@@ -1775,7 +1789,7 @@ void task_numa_free(struct task_struct *p)
1775 list_del(&p->numa_entry); 1789 list_del(&p->numa_entry);
1776 grp->nr_tasks--; 1790 grp->nr_tasks--;
1777 spin_unlock_irqrestore(&grp->lock, flags); 1791 spin_unlock_irqrestore(&grp->lock, flags);
1778 rcu_assign_pointer(p->numa_group, NULL); 1792 RCU_INIT_POINTER(p->numa_group, NULL);
1779 put_numa_group(grp); 1793 put_numa_group(grp);
1780 } 1794 }
1781 1795
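The rcu_assign_pointer() to RCU_INIT_POINTER() change above is purely about memory ordering; the general rule, as a reminder rather than code from this patch:

	/*
	 * Publishing a structure needs release ordering so readers that see
	 * the pointer also see its initialized contents:
	 */
	rcu_assign_pointer(p->numa_group, grp);

	/*
	 * Storing NULL publishes nothing, so the barrier is unnecessary and
	 * RCU_INIT_POINTER() (a plain assignment) suffices:
	 */
	RCU_INIT_POINTER(p->numa_group, NULL);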
@@ -1804,10 +1818,6 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1804 if (!p->mm) 1818 if (!p->mm)
1805 return; 1819 return;
1806 1820
1807 /* Do not worry about placement if exiting */
1808 if (p->state == TASK_DEAD)
1809 return;
1810
1811 /* Allocate buffer to track faults on a per-node basis */ 1821 /* Allocate buffer to track faults on a per-node basis */
1812 if (unlikely(!p->numa_faults_memory)) { 1822 if (unlikely(!p->numa_faults_memory)) {
1813 int size = sizeof(*p->numa_faults_memory) * 1823 int size = sizeof(*p->numa_faults_memory) *
@@ -2211,8 +2221,8 @@ static __always_inline u64 decay_load(u64 val, u64 n)
2211 2221
2212 /* 2222 /*
2213 * As y^PERIOD = 1/2, we can combine 2223 * As y^PERIOD = 1/2, we can combine
2214 * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD) 2224 * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
2215 * With a look-up table which covers k^n (n<PERIOD) 2225 * With a look-up table which covers y^n (n<PERIOD)
2216 * 2226 *
2217 * To achieve constant time decay_load. 2227 * To achieve constant time decay_load.
2218 */ 2228 */
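The corrected comment describes the decomposition decay_load() has always used. A condensed sketch of it, assuming the usual LOAD_AVG_PERIOD of 32 and the runnable_avg_yN_inv[] table of y^k values scaled by 2^32; the real function additionally clamps very large n:

static u64 decay_load_sketch(u64 val, u64 n)
{
	/* y^PERIOD = 1/2, so whole periods are plain right shifts ... */
	val >>= n / LOAD_AVG_PERIOD;				/* 1/2^(n/PERIOD) */
	/* ... and the remainder comes from the precomputed table. */
	return (val * runnable_avg_yN_inv[n % LOAD_AVG_PERIOD]) >> 32;	/* y^(n%PERIOD) */
}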
@@ -2377,6 +2387,9 @@ static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
2377 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg; 2387 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
2378 tg_contrib -= cfs_rq->tg_load_contrib; 2388 tg_contrib -= cfs_rq->tg_load_contrib;
2379 2389
2390 if (!tg_contrib)
2391 return;
2392
2380 if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) { 2393 if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
2381 atomic_long_add(tg_contrib, &tg->load_avg); 2394 atomic_long_add(tg_contrib, &tg->load_avg);
2382 cfs_rq->tg_load_contrib += tg_contrib; 2395 cfs_rq->tg_load_contrib += tg_contrib;
@@ -3892,14 +3905,6 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
3892 resched_curr(rq); 3905 resched_curr(rq);
3893 return; 3906 return;
3894 } 3907 }
3895
3896 /*
3897 * Don't schedule slices shorter than 10000ns, that just
3898 * doesn't make sense. Rely on vruntime for fairness.
3899 */
3900 if (rq->curr != p)
3901 delta = max_t(s64, 10000LL, delta);
3902
3903 hrtick_start(rq, delta); 3908 hrtick_start(rq, delta);
3904 } 3909 }
3905} 3910}
@@ -4087,7 +4092,7 @@ static unsigned long capacity_of(int cpu)
4087static unsigned long cpu_avg_load_per_task(int cpu) 4092static unsigned long cpu_avg_load_per_task(int cpu)
4088{ 4093{
4089 struct rq *rq = cpu_rq(cpu); 4094 struct rq *rq = cpu_rq(cpu);
4090 unsigned long nr_running = ACCESS_ONCE(rq->nr_running); 4095 unsigned long nr_running = ACCESS_ONCE(rq->cfs.h_nr_running);
4091 unsigned long load_avg = rq->cfs.runnable_load_avg; 4096 unsigned long load_avg = rq->cfs.runnable_load_avg;
4092 4097
4093 if (nr_running) 4098 if (nr_running)
@@ -4276,8 +4281,8 @@ static int wake_wide(struct task_struct *p)
4276static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 4281static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4277{ 4282{
4278 s64 this_load, load; 4283 s64 this_load, load;
4284 s64 this_eff_load, prev_eff_load;
4279 int idx, this_cpu, prev_cpu; 4285 int idx, this_cpu, prev_cpu;
4280 unsigned long tl_per_task;
4281 struct task_group *tg; 4286 struct task_group *tg;
4282 unsigned long weight; 4287 unsigned long weight;
4283 int balanced; 4288 int balanced;
@@ -4320,47 +4325,30 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4320 * Otherwise check if either cpus are near enough in load to allow this 4325 * Otherwise check if either cpus are near enough in load to allow this
4321 * task to be woken on this_cpu. 4326 * task to be woken on this_cpu.
4322 */ 4327 */
4323 if (this_load > 0) { 4328 this_eff_load = 100;
4324 s64 this_eff_load, prev_eff_load; 4329 this_eff_load *= capacity_of(prev_cpu);
4325 4330
4326 this_eff_load = 100; 4331 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
4327 this_eff_load *= capacity_of(prev_cpu); 4332 prev_eff_load *= capacity_of(this_cpu);
4333
4334 if (this_load > 0) {
4328 this_eff_load *= this_load + 4335 this_eff_load *= this_load +
4329 effective_load(tg, this_cpu, weight, weight); 4336 effective_load(tg, this_cpu, weight, weight);
4330 4337
4331 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
4332 prev_eff_load *= capacity_of(this_cpu);
4333 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); 4338 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
4339 }
4334 4340
4335 balanced = this_eff_load <= prev_eff_load; 4341 balanced = this_eff_load <= prev_eff_load;
4336 } else
4337 balanced = true;
4338
4339 /*
4340 * If the currently running task will sleep within
4341 * a reasonable amount of time then attract this newly
4342 * woken task:
4343 */
4344 if (sync && balanced)
4345 return 1;
4346 4342
4347 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); 4343 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
4348 tl_per_task = cpu_avg_load_per_task(this_cpu);
4349 4344
4350 if (balanced || 4345 if (!balanced)
4351 (this_load <= load && 4346 return 0;
4352 this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
4353 /*
4354 * This domain has SD_WAKE_AFFINE and
4355 * p is cache cold in this domain, and
4356 * there is no bad imbalance.
4357 */
4358 schedstat_inc(sd, ttwu_move_affine);
4359 schedstat_inc(p, se.statistics.nr_wakeups_affine);
4360 4347
4361 return 1; 4348 schedstat_inc(sd, ttwu_move_affine);
4362 } 4349 schedstat_inc(p, se.statistics.nr_wakeups_affine);
4363 return 0; 4350
4351 return 1;
4364} 4352}
4365 4353
4366/* 4354/*
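Plugging invented numbers into the reworked comparison above makes the effect visible. Assume imbalance_pct of 125, equal CPU capacities of 1024, this_load > 0, and arbitrary load figures; a standalone sketch:

	s64 this_eff_load = 100;			/* weight of the waking side */
	s64 prev_eff_load = 100 + (125 - 100) / 2;	/* 112: imbalance tolerance */

	this_eff_load *= 1024;		/* capacity_of(prev_cpu) */
	prev_eff_load *= 1024;		/* capacity_of(this_cpu) */

	this_eff_load *= 2048;		/* this_load + effective_load(tg, this_cpu, ...) */
	prev_eff_load *= 1900;		/* load + effective_load(tg, prev_cpu, ...) */

	/*
	 * 100 * 1024 * 2048 <= 112 * 1024 * 1900, so balanced == 1 and the
	 * wakeup may be pulled to the waking cpu even though it carries more
	 * load, because the difference stays within the domain's tolerance.
	 */

Compared with the old code, the sync-wakeup shortcut and the tl_per_task fallback are gone; the balanced test alone now decides.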
@@ -4428,20 +4416,46 @@ static int
4428find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) 4416find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
4429{ 4417{
4430 unsigned long load, min_load = ULONG_MAX; 4418 unsigned long load, min_load = ULONG_MAX;
4431 int idlest = -1; 4419 unsigned int min_exit_latency = UINT_MAX;
4420 u64 latest_idle_timestamp = 0;
4421 int least_loaded_cpu = this_cpu;
4422 int shallowest_idle_cpu = -1;
4432 int i; 4423 int i;
4433 4424
4434 /* Traverse only the allowed CPUs */ 4425 /* Traverse only the allowed CPUs */
4435 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { 4426 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
4436 load = weighted_cpuload(i); 4427 if (idle_cpu(i)) {
4437 4428 struct rq *rq = cpu_rq(i);
4438 if (load < min_load || (load == min_load && i == this_cpu)) { 4429 struct cpuidle_state *idle = idle_get_state(rq);
4439 min_load = load; 4430 if (idle && idle->exit_latency < min_exit_latency) {
4440 idlest = i; 4431 /*
4432 * We give priority to a CPU whose idle state
4433 * has the smallest exit latency irrespective
4434 * of any idle timestamp.
4435 */
4436 min_exit_latency = idle->exit_latency;
4437 latest_idle_timestamp = rq->idle_stamp;
4438 shallowest_idle_cpu = i;
4439 } else if ((!idle || idle->exit_latency == min_exit_latency) &&
4440 rq->idle_stamp > latest_idle_timestamp) {
4441 /*
4442 * If equal or no active idle state, then
4443 * the most recently idled CPU might have
4444 * a warmer cache.
4445 */
4446 latest_idle_timestamp = rq->idle_stamp;
4447 shallowest_idle_cpu = i;
4448 }
4449 } else {
4450 load = weighted_cpuload(i);
4451 if (load < min_load || (load == min_load && i == this_cpu)) {
4452 min_load = load;
4453 least_loaded_cpu = i;
4454 }
4441 } 4455 }
4442 } 4456 }
4443 4457
4444 return idlest; 4458 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
4445} 4459}
4446 4460
4447/* 4461/*
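A toy scenario (all numbers invented) showing the new preference order in the loop above:

	/*
	 *   cpu0: idle, exit_latency   2us, idle_stamp 1000   <- picked
	 *   cpu1: idle, exit_latency 133us, idle_stamp 2000
	 *   cpu2: busy, weighted_cpuload() == 512
	 *
	 * Old code: only load is compared, so the two idle cpus tie at 0 and
	 * iteration order decides.
	 * New code: cpu0 wins because its idle state is the cheapest to leave;
	 * cpu1 would only win on idle_stamp if its exit latency matched; cpu2's
	 * load is consulted only when no allowed cpu is idle at all.
	 */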
@@ -4513,11 +4527,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
4513 if (p->nr_cpus_allowed == 1) 4527 if (p->nr_cpus_allowed == 1)
4514 return prev_cpu; 4528 return prev_cpu;
4515 4529
4516 if (sd_flag & SD_BALANCE_WAKE) { 4530 if (sd_flag & SD_BALANCE_WAKE)
4517 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) 4531 want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
4518 want_affine = 1;
4519 new_cpu = prev_cpu;
4520 }
4521 4532
4522 rcu_read_lock(); 4533 rcu_read_lock();
4523 for_each_domain(cpu, tmp) { 4534 for_each_domain(cpu, tmp) {
@@ -4704,7 +4715,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
4704 return; 4715 return;
4705 4716
4706 /* 4717 /*
4707 * This is possible from callers such as move_task(), in which we 4718 * This is possible from callers such as attach_tasks(), in which we
4708 * unconditionally check_prempt_curr() after an enqueue (which may have 4719 * unconditionally check_prempt_curr() after an enqueue (which may have
4709 * lead to a throttle). This both saves work and prevents false 4720 * lead to a throttle). This both saves work and prevents false
4710 * next-buddy nomination below. 4721 * next-buddy nomination below.
@@ -5112,27 +5123,18 @@ struct lb_env {
5112 unsigned int loop_max; 5123 unsigned int loop_max;
5113 5124
5114 enum fbq_type fbq_type; 5125 enum fbq_type fbq_type;
5126 struct list_head tasks;
5115}; 5127};
5116 5128
5117/* 5129/*
5118 * move_task - move a task from one runqueue to another runqueue.
5119 * Both runqueues must be locked.
5120 */
5121static void move_task(struct task_struct *p, struct lb_env *env)
5122{
5123 deactivate_task(env->src_rq, p, 0);
5124 set_task_cpu(p, env->dst_cpu);
5125 activate_task(env->dst_rq, p, 0);
5126 check_preempt_curr(env->dst_rq, p, 0);
5127}
5128
5129/*
5130 * Is this task likely cache-hot: 5130 * Is this task likely cache-hot:
5131 */ 5131 */
5132static int task_hot(struct task_struct *p, struct lb_env *env) 5132static int task_hot(struct task_struct *p, struct lb_env *env)
5133{ 5133{
5134 s64 delta; 5134 s64 delta;
5135 5135
5136 lockdep_assert_held(&env->src_rq->lock);
5137
5136 if (p->sched_class != &fair_sched_class) 5138 if (p->sched_class != &fair_sched_class)
5137 return 0; 5139 return 0;
5138 5140
@@ -5252,6 +5254,9 @@ static
5252int can_migrate_task(struct task_struct *p, struct lb_env *env) 5254int can_migrate_task(struct task_struct *p, struct lb_env *env)
5253{ 5255{
5254 int tsk_cache_hot = 0; 5256 int tsk_cache_hot = 0;
5257
5258 lockdep_assert_held(&env->src_rq->lock);
5259
5255 /* 5260 /*
5256 * We do not migrate tasks that are: 5261 * We do not migrate tasks that are:
5257 * 1) throttled_lb_pair, or 5262 * 1) throttled_lb_pair, or
@@ -5310,24 +5315,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
5310 if (!tsk_cache_hot) 5315 if (!tsk_cache_hot)
5311 tsk_cache_hot = migrate_degrades_locality(p, env); 5316 tsk_cache_hot = migrate_degrades_locality(p, env);
5312 5317
5313 if (migrate_improves_locality(p, env)) { 5318 if (migrate_improves_locality(p, env) || !tsk_cache_hot ||
5314#ifdef CONFIG_SCHEDSTATS 5319 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
5315 if (tsk_cache_hot) { 5320 if (tsk_cache_hot) {
5316 schedstat_inc(env->sd, lb_hot_gained[env->idle]); 5321 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
5317 schedstat_inc(p, se.statistics.nr_forced_migrations); 5322 schedstat_inc(p, se.statistics.nr_forced_migrations);
5318 } 5323 }
5319#endif
5320 return 1;
5321 }
5322
5323 if (!tsk_cache_hot ||
5324 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
5325
5326 if (tsk_cache_hot) {
5327 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
5328 schedstat_inc(p, se.statistics.nr_forced_migrations);
5329 }
5330
5331 return 1; 5324 return 1;
5332 } 5325 }
5333 5326
@@ -5336,47 +5329,63 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
5336} 5329}
5337 5330
5338/* 5331/*
5339 * move_one_task tries to move exactly one task from busiest to this_rq, as 5332 * detach_task() -- detach the task for the migration specified in env
5333 */
5334static void detach_task(struct task_struct *p, struct lb_env *env)
5335{
5336 lockdep_assert_held(&env->src_rq->lock);
5337
5338 deactivate_task(env->src_rq, p, 0);
5339 p->on_rq = TASK_ON_RQ_MIGRATING;
5340 set_task_cpu(p, env->dst_cpu);
5341}
5342
5343/*
5344 * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
5340 * part of active balancing operations within "domain". 5345 * part of active balancing operations within "domain".
5341 * Returns 1 if successful and 0 otherwise.
5342 * 5346 *
5343 * Called with both runqueues locked. 5347 * Returns a task if successful and NULL otherwise.
5344 */ 5348 */
5345static int move_one_task(struct lb_env *env) 5349static struct task_struct *detach_one_task(struct lb_env *env)
5346{ 5350{
5347 struct task_struct *p, *n; 5351 struct task_struct *p, *n;
5348 5352
5353 lockdep_assert_held(&env->src_rq->lock);
5354
5349 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { 5355 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
5350 if (!can_migrate_task(p, env)) 5356 if (!can_migrate_task(p, env))
5351 continue; 5357 continue;
5352 5358
5353 move_task(p, env); 5359 detach_task(p, env);
5360
5354 /* 5361 /*
5355 * Right now, this is only the second place move_task() 5362 * Right now, this is only the second place where
5356 * is called, so we can safely collect move_task() 5363 * lb_gained[env->idle] is updated (other is detach_tasks)
5357 * stats here rather than inside move_task(). 5364 * so we can safely collect stats here rather than
5365 * inside detach_tasks().
5358 */ 5366 */
5359 schedstat_inc(env->sd, lb_gained[env->idle]); 5367 schedstat_inc(env->sd, lb_gained[env->idle]);
5360 return 1; 5368 return p;
5361 } 5369 }
5362 return 0; 5370 return NULL;
5363} 5371}
5364 5372
5365static const unsigned int sched_nr_migrate_break = 32; 5373static const unsigned int sched_nr_migrate_break = 32;
5366 5374
5367/* 5375/*
5368 * move_tasks tries to move up to imbalance weighted load from busiest to 5376 * detach_tasks() -- tries to detach up to imbalance weighted load from
5369 * this_rq, as part of a balancing operation within domain "sd". 5377 * busiest_rq, as part of a balancing operation within domain "sd".
5370 * Returns 1 if successful and 0 otherwise.
5371 * 5378 *
5372 * Called with both runqueues locked. 5379 * Returns number of detached tasks if successful and 0 otherwise.
5373 */ 5380 */
5374static int move_tasks(struct lb_env *env) 5381static int detach_tasks(struct lb_env *env)
5375{ 5382{
5376 struct list_head *tasks = &env->src_rq->cfs_tasks; 5383 struct list_head *tasks = &env->src_rq->cfs_tasks;
5377 struct task_struct *p; 5384 struct task_struct *p;
5378 unsigned long load; 5385 unsigned long load;
5379 int pulled = 0; 5386 int detached = 0;
5387
5388 lockdep_assert_held(&env->src_rq->lock);
5380 5389
5381 if (env->imbalance <= 0) 5390 if (env->imbalance <= 0)
5382 return 0; 5391 return 0;
@@ -5407,14 +5416,16 @@ static int move_tasks(struct lb_env *env)
5407 if ((load / 2) > env->imbalance) 5416 if ((load / 2) > env->imbalance)
5408 goto next; 5417 goto next;
5409 5418
5410 move_task(p, env); 5419 detach_task(p, env);
5411 pulled++; 5420 list_add(&p->se.group_node, &env->tasks);
5421
5422 detached++;
5412 env->imbalance -= load; 5423 env->imbalance -= load;
5413 5424
5414#ifdef CONFIG_PREEMPT 5425#ifdef CONFIG_PREEMPT
5415 /* 5426 /*
5416 * NEWIDLE balancing is a source of latency, so preemptible 5427 * NEWIDLE balancing is a source of latency, so preemptible
5417 * kernels will stop after the first task is pulled to minimize 5428 * kernels will stop after the first task is detached to minimize
5418 * the critical section. 5429 * the critical section.
5419 */ 5430 */
5420 if (env->idle == CPU_NEWLY_IDLE) 5431 if (env->idle == CPU_NEWLY_IDLE)
@@ -5434,13 +5445,58 @@ next:
5434 } 5445 }
5435 5446
5436 /* 5447 /*
5437 * Right now, this is one of only two places move_task() is called, 5448 * Right now, this is one of only two places we collect this stat
5438 * so we can safely collect move_task() stats here rather than 5449 * so we can safely collect detach_one_task() stats here rather
5439 * inside move_task(). 5450 * than inside detach_one_task().
5440 */ 5451 */
5441 schedstat_add(env->sd, lb_gained[env->idle], pulled); 5452 schedstat_add(env->sd, lb_gained[env->idle], detached);
5453
5454 return detached;
5455}
5456
5457/*
5458 * attach_task() -- attach the task detached by detach_task() to its new rq.
5459 */
5460static void attach_task(struct rq *rq, struct task_struct *p)
5461{
5462 lockdep_assert_held(&rq->lock);
5463
5464 BUG_ON(task_rq(p) != rq);
5465 p->on_rq = TASK_ON_RQ_QUEUED;
5466 activate_task(rq, p, 0);
5467 check_preempt_curr(rq, p, 0);
5468}
5469
5470/*
5471 * attach_one_task() -- attaches the task returned from detach_one_task() to
5472 * its new rq.
5473 */
5474static void attach_one_task(struct rq *rq, struct task_struct *p)
5475{
5476 raw_spin_lock(&rq->lock);
5477 attach_task(rq, p);
5478 raw_spin_unlock(&rq->lock);
5479}
5480
5481/*
5482 * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
5483 * new rq.
5484 */
5485static void attach_tasks(struct lb_env *env)
5486{
5487 struct list_head *tasks = &env->tasks;
5488 struct task_struct *p;
5489
5490 raw_spin_lock(&env->dst_rq->lock);
5491
5492 while (!list_empty(tasks)) {
5493 p = list_first_entry(tasks, struct task_struct, se.group_node);
5494 list_del_init(&p->se.group_node);
5442 5495
5443 return pulled; 5496 attach_task(env->dst_rq, p);
5497 }
5498
5499 raw_spin_unlock(&env->dst_rq->lock);
5444} 5500}
5445 5501
5446#ifdef CONFIG_FAIR_GROUP_SCHED 5502#ifdef CONFIG_FAIR_GROUP_SCHED
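The load_balance() hunk further down wires these helpers together; reduced to its locking skeleton it looks like this (a sketch of that hunk, not additional code):

	raw_spin_lock_irqsave(&busiest->lock, flags);

	cur_ld_moved = detach_tasks(&env);	/* src lock only; tasks parked on
						 * env.tasks as TASK_ON_RQ_MIGRATING */
	raw_spin_unlock(&busiest->lock);

	if (cur_ld_moved) {
		attach_tasks(&env);		/* dst lock only */
		ld_moved += cur_ld_moved;
	}

	local_irq_restore(flags);

The point of the split is that the two runqueue locks are never held at the same time, which is what lets the double_rq_lock()/move_tasks() pair disappear.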
@@ -5559,6 +5615,13 @@ static unsigned long task_h_load(struct task_struct *p)
5559#endif 5615#endif
5560 5616
5561/********** Helpers for find_busiest_group ************************/ 5617/********** Helpers for find_busiest_group ************************/
5618
5619enum group_type {
5620 group_other = 0,
5621 group_imbalanced,
5622 group_overloaded,
5623};
5624
5562/* 5625/*
5563 * sg_lb_stats - stats of a sched_group required for load_balancing 5626 * sg_lb_stats - stats of a sched_group required for load_balancing
5564 */ 5627 */
@@ -5572,7 +5635,7 @@ struct sg_lb_stats {
5572 unsigned int group_capacity_factor; 5635 unsigned int group_capacity_factor;
5573 unsigned int idle_cpus; 5636 unsigned int idle_cpus;
5574 unsigned int group_weight; 5637 unsigned int group_weight;
5575 int group_imb; /* Is there an imbalance in the group ? */ 5638 enum group_type group_type;
5576 int group_has_free_capacity; 5639 int group_has_free_capacity;
5577#ifdef CONFIG_NUMA_BALANCING 5640#ifdef CONFIG_NUMA_BALANCING
5578 unsigned int nr_numa_running; 5641 unsigned int nr_numa_running;
@@ -5610,6 +5673,8 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
5610 .total_capacity = 0UL, 5673 .total_capacity = 0UL,
5611 .busiest_stat = { 5674 .busiest_stat = {
5612 .avg_load = 0UL, 5675 .avg_load = 0UL,
5676 .sum_nr_running = 0,
5677 .group_type = group_other,
5613 }, 5678 },
5614 }; 5679 };
5615} 5680}
@@ -5652,19 +5717,17 @@ unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
5652 return default_scale_capacity(sd, cpu); 5717 return default_scale_capacity(sd, cpu);
5653} 5718}
5654 5719
5655static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu) 5720static unsigned long default_scale_cpu_capacity(struct sched_domain *sd, int cpu)
5656{ 5721{
5657 unsigned long weight = sd->span_weight; 5722 if ((sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1))
5658 unsigned long smt_gain = sd->smt_gain; 5723 return sd->smt_gain / sd->span_weight;
5659 5724
5660 smt_gain /= weight; 5725 return SCHED_CAPACITY_SCALE;
5661
5662 return smt_gain;
5663} 5726}
5664 5727
5665unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu) 5728unsigned long __weak arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
5666{ 5729{
5667 return default_scale_smt_capacity(sd, cpu); 5730 return default_scale_cpu_capacity(sd, cpu);
5668} 5731}
5669 5732
5670static unsigned long scale_rt_capacity(int cpu) 5733static unsigned long scale_rt_capacity(int cpu)
@@ -5703,18 +5766,15 @@ static unsigned long scale_rt_capacity(int cpu)
5703 5766
5704static void update_cpu_capacity(struct sched_domain *sd, int cpu) 5767static void update_cpu_capacity(struct sched_domain *sd, int cpu)
5705{ 5768{
5706 unsigned long weight = sd->span_weight;
5707 unsigned long capacity = SCHED_CAPACITY_SCALE; 5769 unsigned long capacity = SCHED_CAPACITY_SCALE;
5708 struct sched_group *sdg = sd->groups; 5770 struct sched_group *sdg = sd->groups;
5709 5771
5710 if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) { 5772 if (sched_feat(ARCH_CAPACITY))
5711 if (sched_feat(ARCH_CAPACITY)) 5773 capacity *= arch_scale_cpu_capacity(sd, cpu);
5712 capacity *= arch_scale_smt_capacity(sd, cpu); 5774 else
5713 else 5775 capacity *= default_scale_cpu_capacity(sd, cpu);
5714 capacity *= default_scale_smt_capacity(sd, cpu);
5715 5776
5716 capacity >>= SCHED_CAPACITY_SHIFT; 5777 capacity >>= SCHED_CAPACITY_SHIFT;
5717 }
5718 5778
5719 sdg->sgc->capacity_orig = capacity; 5779 sdg->sgc->capacity_orig = capacity;
5720 5780
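With the rename, every topology level goes through one capacity hook instead of an SMT-only one. Illustrative values below; smt_gain is commonly 1178 on x86, but treat the numbers as assumptions rather than part of the patch:

	/*
	 * SMT level, SD_SHARE_CPUCAPACITY set, span_weight == 2:
	 *	default_scale_cpu_capacity() = 1178 / 2 = 589 per thread
	 *
	 * Core/package levels (flag clear or span_weight == 1):
	 *	default_scale_cpu_capacity() = SCHED_CAPACITY_SCALE = 1024
	 *
	 * update_cpu_capacity() multiplies SCHED_CAPACITY_SCALE by that value
	 * and shifts by SCHED_CAPACITY_SHIFT, matching the old SMT-only special
	 * case while letting arch_scale_cpu_capacity() cover non-SMT asymmetry.
	 */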
@@ -5891,6 +5951,18 @@ static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *gro
5891 return capacity_factor; 5951 return capacity_factor;
5892} 5952}
5893 5953
5954static enum group_type
5955group_classify(struct sched_group *group, struct sg_lb_stats *sgs)
5956{
5957 if (sgs->sum_nr_running > sgs->group_capacity_factor)
5958 return group_overloaded;
5959
5960 if (sg_imbalanced(group))
5961 return group_imbalanced;
5962
5963 return group_other;
5964}
5965
5894/** 5966/**
5895 * update_sg_lb_stats - Update sched_group's statistics for load balancing. 5967 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
5896 * @env: The load balancing environment. 5968 * @env: The load balancing environment.
@@ -5920,7 +5992,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5920 load = source_load(i, load_idx); 5992 load = source_load(i, load_idx);
5921 5993
5922 sgs->group_load += load; 5994 sgs->group_load += load;
5923 sgs->sum_nr_running += rq->nr_running; 5995 sgs->sum_nr_running += rq->cfs.h_nr_running;
5924 5996
5925 if (rq->nr_running > 1) 5997 if (rq->nr_running > 1)
5926 *overload = true; 5998 *overload = true;
@@ -5942,9 +6014,8 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5942 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 6014 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
5943 6015
5944 sgs->group_weight = group->group_weight; 6016 sgs->group_weight = group->group_weight;
5945
5946 sgs->group_imb = sg_imbalanced(group);
5947 sgs->group_capacity_factor = sg_capacity_factor(env, group); 6017 sgs->group_capacity_factor = sg_capacity_factor(env, group);
6018 sgs->group_type = group_classify(group, sgs);
5948 6019
5949 if (sgs->group_capacity_factor > sgs->sum_nr_running) 6020 if (sgs->group_capacity_factor > sgs->sum_nr_running)
5950 sgs->group_has_free_capacity = 1; 6021 sgs->group_has_free_capacity = 1;
@@ -5968,13 +6039,19 @@ static bool update_sd_pick_busiest(struct lb_env *env,
5968 struct sched_group *sg, 6039 struct sched_group *sg,
5969 struct sg_lb_stats *sgs) 6040 struct sg_lb_stats *sgs)
5970{ 6041{
5971 if (sgs->avg_load <= sds->busiest_stat.avg_load) 6042 struct sg_lb_stats *busiest = &sds->busiest_stat;
5972 return false;
5973 6043
5974 if (sgs->sum_nr_running > sgs->group_capacity_factor) 6044 if (sgs->group_type > busiest->group_type)
5975 return true; 6045 return true;
5976 6046
5977 if (sgs->group_imb) 6047 if (sgs->group_type < busiest->group_type)
6048 return false;
6049
6050 if (sgs->avg_load <= busiest->avg_load)
6051 return false;
6052
6053 /* This is the busiest node in its class. */
6054 if (!(env->sd->flags & SD_ASYM_PACKING))
5978 return true; 6055 return true;
5979 6056
5980 /* 6057 /*
@@ -5982,8 +6059,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
5982 * numbered CPUs in the group, therefore mark all groups 6059 * numbered CPUs in the group, therefore mark all groups
5983 * higher than ourself as busy. 6060 * higher than ourself as busy.
5984 */ 6061 */
5985 if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running && 6062 if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
5986 env->dst_cpu < group_first_cpu(sg)) {
5987 if (!sds->busiest) 6063 if (!sds->busiest)
5988 return true; 6064 return true;
5989 6065
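Ignoring the SD_ASYM_PACKING tail, the busiest-group choice above becomes a two-key comparison: group_type first (group_other < group_imbalanced < group_overloaded, per the enum added earlier in this file), average load as the tie breaker. A sketch of just that ordering; the function name is made up:

static bool pick_busiest_sketch(struct sg_lb_stats *busiest,
				struct sg_lb_stats *sgs)
{
	if (sgs->group_type != busiest->group_type)
		return sgs->group_type > busiest->group_type;

	/* Same class: fall back to the old average-load comparison. */
	return sgs->avg_load > busiest->avg_load;
}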
@@ -6228,7 +6304,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
6228 local = &sds->local_stat; 6304 local = &sds->local_stat;
6229 busiest = &sds->busiest_stat; 6305 busiest = &sds->busiest_stat;
6230 6306
6231 if (busiest->group_imb) { 6307 if (busiest->group_type == group_imbalanced) {
6232 /* 6308 /*
6233 * In the group_imb case we cannot rely on group-wide averages 6309 * In the group_imb case we cannot rely on group-wide averages
6234 * to ensure cpu-load equilibrium, look at wider averages. XXX 6310 * to ensure cpu-load equilibrium, look at wider averages. XXX
@@ -6248,12 +6324,11 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
6248 return fix_small_imbalance(env, sds); 6324 return fix_small_imbalance(env, sds);
6249 } 6325 }
6250 6326
6251 if (!busiest->group_imb) { 6327 /*
6252 /* 6328 * If there aren't any idle cpus, avoid creating some.
6253 * Don't want to pull so many tasks that a group would go idle. 6329 */
6254 * Except of course for the group_imb case, since then we might 6330 if (busiest->group_type == group_overloaded &&
6255 * have to drop below capacity to reach cpu-load equilibrium. 6331 local->group_type == group_overloaded) {
6256 */
6257 load_above_capacity = 6332 load_above_capacity =
6258 (busiest->sum_nr_running - busiest->group_capacity_factor); 6333 (busiest->sum_nr_running - busiest->group_capacity_factor);
6259 6334
@@ -6337,7 +6412,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
6337 * work because they assume all things are equal, which typically 6412 * work because they assume all things are equal, which typically
6338 * isn't true due to cpus_allowed constraints and the like. 6413 * isn't true due to cpus_allowed constraints and the like.
6339 */ 6414 */
6340 if (busiest->group_imb) 6415 if (busiest->group_type == group_imbalanced)
6341 goto force_balance; 6416 goto force_balance;
6342 6417
6343 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ 6418 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
@@ -6346,7 +6421,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
6346 goto force_balance; 6421 goto force_balance;
6347 6422
6348 /* 6423 /*
6349 * If the local group is more busy than the selected busiest group 6424 * If the local group is busier than the selected busiest group
6350 * don't try and pull any tasks. 6425 * don't try and pull any tasks.
6351 */ 6426 */
6352 if (local->avg_load >= busiest->avg_load) 6427 if (local->avg_load >= busiest->avg_load)
@@ -6361,13 +6436,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
6361 6436
6362 if (env->idle == CPU_IDLE) { 6437 if (env->idle == CPU_IDLE) {
6363 /* 6438 /*
6364 * This cpu is idle. If the busiest group load doesn't 6439 * This cpu is idle. If the busiest group is not overloaded
6365 * have more tasks than the number of available cpu's and 6440 * and there is no imbalance between this and busiest group
6366 * there is no imbalance between this and busiest group 6441 * wrt idle cpus, it is balanced. The imbalance becomes
6367 * wrt to idle cpu's, it is balanced. 6442 * significant if the diff is greater than 1 otherwise we
 6443 * might end up just moving the imbalance to another group
6368 */ 6444 */
6369 if ((local->idle_cpus < busiest->idle_cpus) && 6445 if ((busiest->group_type != group_overloaded) &&
6370 busiest->sum_nr_running <= busiest->group_weight) 6446 (local->idle_cpus <= (busiest->idle_cpus + 1)))
6371 goto out_balanced; 6447 goto out_balanced;
6372 } else { 6448 } else {
6373 /* 6449 /*
@@ -6550,6 +6626,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
6550 .loop_break = sched_nr_migrate_break, 6626 .loop_break = sched_nr_migrate_break,
6551 .cpus = cpus, 6627 .cpus = cpus,
6552 .fbq_type = all, 6628 .fbq_type = all,
6629 .tasks = LIST_HEAD_INIT(env.tasks),
6553 }; 6630 };
6554 6631
6555 /* 6632 /*
@@ -6599,23 +6676,30 @@ redo:
6599 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); 6676 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
6600 6677
6601more_balance: 6678more_balance:
6602 local_irq_save(flags); 6679 raw_spin_lock_irqsave(&busiest->lock, flags);
6603 double_rq_lock(env.dst_rq, busiest);
6604 6680
6605 /* 6681 /*
6606 * cur_ld_moved - load moved in current iteration 6682 * cur_ld_moved - load moved in current iteration
6607 * ld_moved - cumulative load moved across iterations 6683 * ld_moved - cumulative load moved across iterations
6608 */ 6684 */
6609 cur_ld_moved = move_tasks(&env); 6685 cur_ld_moved = detach_tasks(&env);
6610 ld_moved += cur_ld_moved;
6611 double_rq_unlock(env.dst_rq, busiest);
6612 local_irq_restore(flags);
6613 6686
6614 /* 6687 /*
6615 * some other cpu did the load balance for us. 6688 * We've detached some tasks from busiest_rq. Every
6689 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
6690 * unlock busiest->lock, and we are able to be sure
6691 * that nobody can manipulate the tasks in parallel.
6692 * See task_rq_lock() family for the details.
6616 */ 6693 */
6617 if (cur_ld_moved && env.dst_cpu != smp_processor_id()) 6694
6618 resched_cpu(env.dst_cpu); 6695 raw_spin_unlock(&busiest->lock);
6696
6697 if (cur_ld_moved) {
6698 attach_tasks(&env);
6699 ld_moved += cur_ld_moved;
6700 }
6701
6702 local_irq_restore(flags);
6619 6703
6620 if (env.flags & LBF_NEED_BREAK) { 6704 if (env.flags & LBF_NEED_BREAK) {
6621 env.flags &= ~LBF_NEED_BREAK; 6705 env.flags &= ~LBF_NEED_BREAK;
@@ -6665,10 +6749,8 @@ more_balance:
6665 if (sd_parent) { 6749 if (sd_parent) {
6666 int *group_imbalance = &sd_parent->groups->sgc->imbalance; 6750 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
6667 6751
6668 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { 6752 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
6669 *group_imbalance = 1; 6753 *group_imbalance = 1;
6670 } else if (*group_imbalance)
6671 *group_imbalance = 0;
6672 } 6754 }
6673 6755
6674 /* All tasks on this runqueue were pinned by CPU affinity */ 6756 /* All tasks on this runqueue were pinned by CPU affinity */
@@ -6679,7 +6761,7 @@ more_balance:
6679 env.loop_break = sched_nr_migrate_break; 6761 env.loop_break = sched_nr_migrate_break;
6680 goto redo; 6762 goto redo;
6681 } 6763 }
6682 goto out_balanced; 6764 goto out_all_pinned;
6683 } 6765 }
6684 } 6766 }
6685 6767
@@ -6744,7 +6826,7 @@ more_balance:
6744 * If we've begun active balancing, start to back off. This 6826 * If we've begun active balancing, start to back off. This
6745 * case may not be covered by the all_pinned logic if there 6827 * case may not be covered by the all_pinned logic if there
6746 * is only 1 task on the busy runqueue (because we don't call 6828 * is only 1 task on the busy runqueue (because we don't call
6747 * move_tasks). 6829 * detach_tasks).
6748 */ 6830 */
6749 if (sd->balance_interval < sd->max_interval) 6831 if (sd->balance_interval < sd->max_interval)
6750 sd->balance_interval *= 2; 6832 sd->balance_interval *= 2;
@@ -6753,6 +6835,23 @@ more_balance:
6753 goto out; 6835 goto out;
6754 6836
6755out_balanced: 6837out_balanced:
6838 /*
6839 * We reach balance although we may have faced some affinity
6840 * constraints. Clear the imbalance flag if it was set.
6841 */
6842 if (sd_parent) {
6843 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
6844
6845 if (*group_imbalance)
6846 *group_imbalance = 0;
6847 }
6848
6849out_all_pinned:
6850 /*
6851 * We reach balance because all tasks are pinned at this level so
 6852 * we can't migrate them. Leave the imbalance flag set so the parent level
6853 * can try to migrate them.
6854 */
6756 schedstat_inc(sd, lb_balanced[idle]); 6855 schedstat_inc(sd, lb_balanced[idle]);
6757 6856
6758 sd->nr_balance_failed = 0; 6857 sd->nr_balance_failed = 0;
@@ -6914,6 +7013,7 @@ static int active_load_balance_cpu_stop(void *data)
6914 int target_cpu = busiest_rq->push_cpu; 7013 int target_cpu = busiest_rq->push_cpu;
6915 struct rq *target_rq = cpu_rq(target_cpu); 7014 struct rq *target_rq = cpu_rq(target_cpu);
6916 struct sched_domain *sd; 7015 struct sched_domain *sd;
7016 struct task_struct *p = NULL;
6917 7017
6918 raw_spin_lock_irq(&busiest_rq->lock); 7018 raw_spin_lock_irq(&busiest_rq->lock);
6919 7019
@@ -6933,9 +7033,6 @@ static int active_load_balance_cpu_stop(void *data)
6933 */ 7033 */
6934 BUG_ON(busiest_rq == target_rq); 7034 BUG_ON(busiest_rq == target_rq);
6935 7035
6936 /* move a task from busiest_rq to target_rq */
6937 double_lock_balance(busiest_rq, target_rq);
6938
6939 /* Search for an sd spanning us and the target CPU. */ 7036 /* Search for an sd spanning us and the target CPU. */
6940 rcu_read_lock(); 7037 rcu_read_lock();
6941 for_each_domain(target_cpu, sd) { 7038 for_each_domain(target_cpu, sd) {
@@ -6956,16 +7053,22 @@ static int active_load_balance_cpu_stop(void *data)
6956 7053
6957 schedstat_inc(sd, alb_count); 7054 schedstat_inc(sd, alb_count);
6958 7055
6959 if (move_one_task(&env)) 7056 p = detach_one_task(&env);
7057 if (p)
6960 schedstat_inc(sd, alb_pushed); 7058 schedstat_inc(sd, alb_pushed);
6961 else 7059 else
6962 schedstat_inc(sd, alb_failed); 7060 schedstat_inc(sd, alb_failed);
6963 } 7061 }
6964 rcu_read_unlock(); 7062 rcu_read_unlock();
6965 double_unlock_balance(busiest_rq, target_rq);
6966out_unlock: 7063out_unlock:
6967 busiest_rq->active_balance = 0; 7064 busiest_rq->active_balance = 0;
6968 raw_spin_unlock_irq(&busiest_rq->lock); 7065 raw_spin_unlock(&busiest_rq->lock);
7066
7067 if (p)
7068 attach_one_task(target_rq, p);
7069
7070 local_irq_enable();
7071
6969 return 0; 7072 return 0;
6970} 7073}
6971 7074
@@ -7465,7 +7568,7 @@ static void task_fork_fair(struct task_struct *p)
7465static void 7568static void
7466prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) 7569prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
7467{ 7570{
7468 if (!p->se.on_rq) 7571 if (!task_on_rq_queued(p))
7469 return; 7572 return;
7470 7573
7471 /* 7574 /*
@@ -7490,11 +7593,11 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
7490 * switched back to the fair class the enqueue_entity(.flags=0) will 7593 * switched back to the fair class the enqueue_entity(.flags=0) will
7491 * do the right thing. 7594 * do the right thing.
7492 * 7595 *
7493 * If it's on_rq, then the dequeue_entity(.flags=0) will already 7596 * If it's queued, then the dequeue_entity(.flags=0) will already
7494 * have normalized the vruntime, if it's !on_rq, then only when 7597 * have normalized the vruntime, if it's !queued, then only when
7495 * the task is sleeping will it still have non-normalized vruntime. 7598 * the task is sleeping will it still have non-normalized vruntime.
7496 */ 7599 */
7497 if (!p->on_rq && p->state != TASK_RUNNING) { 7600 if (!task_on_rq_queued(p) && p->state != TASK_RUNNING) {
7498 /* 7601 /*
7499 * Fix up our vruntime so that the current sleep doesn't 7602 * Fix up our vruntime so that the current sleep doesn't
7500 * cause 'unlimited' sleep bonus. 7603 * cause 'unlimited' sleep bonus.
@@ -7521,15 +7624,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
7521 */ 7624 */
7522static void switched_to_fair(struct rq *rq, struct task_struct *p) 7625static void switched_to_fair(struct rq *rq, struct task_struct *p)
7523{ 7626{
7524 struct sched_entity *se = &p->se;
7525#ifdef CONFIG_FAIR_GROUP_SCHED 7627#ifdef CONFIG_FAIR_GROUP_SCHED
7628 struct sched_entity *se = &p->se;
7526 /* 7629 /*
7527 * Since the real-depth could have been changed (only FAIR 7630 * Since the real-depth could have been changed (only FAIR
7528 * class maintain depth value), reset depth properly. 7631 * class maintain depth value), reset depth properly.
7529 */ 7632 */
7530 se->depth = se->parent ? se->parent->depth + 1 : 0; 7633 se->depth = se->parent ? se->parent->depth + 1 : 0;
7531#endif 7634#endif
7532 if (!se->on_rq) 7635 if (!task_on_rq_queued(p))
7533 return; 7636 return;
7534 7637
7535 /* 7638 /*
@@ -7575,7 +7678,7 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
7575} 7678}
7576 7679
7577#ifdef CONFIG_FAIR_GROUP_SCHED 7680#ifdef CONFIG_FAIR_GROUP_SCHED
7578static void task_move_group_fair(struct task_struct *p, int on_rq) 7681static void task_move_group_fair(struct task_struct *p, int queued)
7579{ 7682{
7580 struct sched_entity *se = &p->se; 7683 struct sched_entity *se = &p->se;
7581 struct cfs_rq *cfs_rq; 7684 struct cfs_rq *cfs_rq;
@@ -7594,7 +7697,7 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
7594 * fair sleeper stuff for the first placement, but who cares. 7697 * fair sleeper stuff for the first placement, but who cares.
7595 */ 7698 */
7596 /* 7699 /*
7597 * When !on_rq, vruntime of the task has usually NOT been normalized. 7700 * When !queued, vruntime of the task has usually NOT been normalized.
7598 * But there are some cases where it has already been normalized: 7701 * But there are some cases where it has already been normalized:
7599 * 7702 *
7600 * - Moving a forked child which is waiting for being woken up by 7703 * - Moving a forked child which is waiting for being woken up by
@@ -7605,14 +7708,14 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
7605 * To prevent boost or penalty in the new cfs_rq caused by delta 7708 * To prevent boost or penalty in the new cfs_rq caused by delta
7606 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. 7709 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
7607 */ 7710 */
7608 if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING)) 7711 if (!queued && (!se->sum_exec_runtime || p->state == TASK_WAKING))
7609 on_rq = 1; 7712 queued = 1;
7610 7713
7611 if (!on_rq) 7714 if (!queued)
7612 se->vruntime -= cfs_rq_of(se)->min_vruntime; 7715 se->vruntime -= cfs_rq_of(se)->min_vruntime;
7613 set_task_rq(p, task_cpu(p)); 7716 set_task_rq(p, task_cpu(p));
7614 se->depth = se->parent ? se->parent->depth + 1 : 0; 7717 se->depth = se->parent ? se->parent->depth + 1 : 0;
7615 if (!on_rq) { 7718 if (!queued) {
7616 cfs_rq = cfs_rq_of(se); 7719 cfs_rq = cfs_rq_of(se);
7617 se->vruntime += cfs_rq->min_vruntime; 7720 se->vruntime += cfs_rq->min_vruntime;
7618#ifdef CONFIG_SMP 7721#ifdef CONFIG_SMP
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 11e7bc434f43..c47fce75e666 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -147,6 +147,9 @@ use_default:
147 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu)) 147 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &dev->cpu))
148 goto use_default; 148 goto use_default;
149 149
150 /* Take note of the planned idle state. */
151 idle_set_state(this_rq(), &drv->states[next_state]);
152
150 /* 153 /*
151 * Enter the idle state previously returned by the governor decision. 154 * Enter the idle state previously returned by the governor decision.
152 * This function will block until an interrupt occurs and will take 155 * This function will block until an interrupt occurs and will take
@@ -154,6 +157,9 @@ use_default:
154 */ 157 */
155 entered_state = cpuidle_enter(drv, dev, next_state); 158 entered_state = cpuidle_enter(drv, dev, next_state);
156 159
160 /* The cpu is no longer idle or about to enter idle. */
161 idle_set_state(this_rq(), NULL);
162
157 if (broadcast) 163 if (broadcast)
158 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu); 164 clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &dev->cpu);
159 165
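The state recorded here is what the reworked find_idlest_cpu() reads back through idle_get_state(), which the sched.h hunk below guards with WARN_ON(!rcu_read_lock_held()). A sketch of the consumer side under that constraint:

	struct cpuidle_state *state;
	unsigned int latency = UINT_MAX;

	rcu_read_lock();
	state = idle_get_state(cpu_rq(cpu));
	if (state)
		latency = state->exit_latency;	/* cost of leaving this idle state */
	rcu_read_unlock();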
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 5f6edca4fafd..87ea5bf1b87f 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1448,7 +1448,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
1448 * means a dl or stop task can slip in, in which case we need 1448 * means a dl or stop task can slip in, in which case we need
1449 * to re-start task selection. 1449 * to re-start task selection.
1450 */ 1450 */
1451 if (unlikely((rq->stop && rq->stop->on_rq) || 1451 if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
1452 rq->dl.dl_nr_running)) 1452 rq->dl.dl_nr_running))
1453 return RETRY_TASK; 1453 return RETRY_TASK;
1454 } 1454 }
@@ -1468,8 +1468,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev)
1468 p = _pick_next_task_rt(rq); 1468 p = _pick_next_task_rt(rq);
1469 1469
1470 /* The running task is never eligible for pushing */ 1470 /* The running task is never eligible for pushing */
1471 if (p) 1471 dequeue_pushable_task(rq, p);
1472 dequeue_pushable_task(rq, p);
1473 1472
1474 set_post_schedule(rq); 1473 set_post_schedule(rq);
1475 1474
@@ -1624,7 +1623,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1624 !cpumask_test_cpu(lowest_rq->cpu, 1623 !cpumask_test_cpu(lowest_rq->cpu,
1625 tsk_cpus_allowed(task)) || 1624 tsk_cpus_allowed(task)) ||
1626 task_running(rq, task) || 1625 task_running(rq, task) ||
1627 !task->on_rq)) { 1626 !task_on_rq_queued(task))) {
1628 1627
1629 double_unlock_balance(rq, lowest_rq); 1628 double_unlock_balance(rq, lowest_rq);
1630 lowest_rq = NULL; 1629 lowest_rq = NULL;
@@ -1658,7 +1657,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
1658 BUG_ON(task_current(rq, p)); 1657 BUG_ON(task_current(rq, p));
1659 BUG_ON(p->nr_cpus_allowed <= 1); 1658 BUG_ON(p->nr_cpus_allowed <= 1);
1660 1659
1661 BUG_ON(!p->on_rq); 1660 BUG_ON(!task_on_rq_queued(p));
1662 BUG_ON(!rt_task(p)); 1661 BUG_ON(!rt_task(p));
1663 1662
1664 return p; 1663 return p;
@@ -1809,7 +1808,7 @@ static int pull_rt_task(struct rq *this_rq)
1809 */ 1808 */
1810 if (p && (p->prio < this_rq->rt.highest_prio.curr)) { 1809 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
1811 WARN_ON(p == src_rq->curr); 1810 WARN_ON(p == src_rq->curr);
1812 WARN_ON(!p->on_rq); 1811 WARN_ON(!task_on_rq_queued(p));
1813 1812
1814 /* 1813 /*
1815 * There's a chance that p is higher in priority 1814 * There's a chance that p is higher in priority
@@ -1870,7 +1869,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1870 1869
1871 BUG_ON(!rt_task(p)); 1870 BUG_ON(!rt_task(p));
1872 1871
1873 if (!p->on_rq) 1872 if (!task_on_rq_queued(p))
1874 return; 1873 return;
1875 1874
1876 weight = cpumask_weight(new_mask); 1875 weight = cpumask_weight(new_mask);
@@ -1936,7 +1935,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1936 * we may need to handle the pulling of RT tasks 1935 * we may need to handle the pulling of RT tasks
1937 * now. 1936 * now.
1938 */ 1937 */
1939 if (!p->on_rq || rq->rt.rt_nr_running) 1938 if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
1940 return; 1939 return;
1941 1940
1942 if (pull_rt_task(rq)) 1941 if (pull_rt_task(rq))
@@ -1970,7 +1969,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1970 * If that current running task is also an RT task 1969 * If that current running task is also an RT task
1971 * then see if we can move to another run queue. 1970 * then see if we can move to another run queue.
1972 */ 1971 */
1973 if (p->on_rq && rq->curr != p) { 1972 if (task_on_rq_queued(p) && rq->curr != p) {
1974#ifdef CONFIG_SMP 1973#ifdef CONFIG_SMP
1975 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded && 1974 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
1976 /* Don't resched if we changed runqueues */ 1975 /* Don't resched if we changed runqueues */
@@ -1989,7 +1988,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
1989static void 1988static void
1990prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) 1989prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
1991{ 1990{
1992 if (!p->on_rq) 1991 if (!task_on_rq_queued(p))
1993 return; 1992 return;
1994 1993
1995 if (rq->curr == p) { 1994 if (rq->curr == p) {
@@ -2073,7 +2072,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2073 for_each_sched_rt_entity(rt_se) { 2072 for_each_sched_rt_entity(rt_se) {
2074 if (rt_se->run_list.prev != rt_se->run_list.next) { 2073 if (rt_se->run_list.prev != rt_se->run_list.next) {
2075 requeue_task_rt(rq, p, 0); 2074 requeue_task_rt(rq, p, 0);
2076 set_tsk_need_resched(p); 2075 resched_curr(rq);
2077 return; 2076 return;
2078 } 2077 }
2079 } 2078 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 579712f4e9d5..6130251de280 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -14,6 +14,11 @@
14#include "cpuacct.h" 14#include "cpuacct.h"
15 15
16struct rq; 16struct rq;
17struct cpuidle_state;
18
19/* task_struct::on_rq states: */
20#define TASK_ON_RQ_QUEUED 1
21#define TASK_ON_RQ_MIGRATING 2
17 22
18extern __read_mostly int scheduler_running; 23extern __read_mostly int scheduler_running;
19 24
@@ -126,6 +131,9 @@ struct rt_bandwidth {
126 u64 rt_runtime; 131 u64 rt_runtime;
127 struct hrtimer rt_period_timer; 132 struct hrtimer rt_period_timer;
128}; 133};
134
135void __dl_clear_params(struct task_struct *p);
136
129/* 137/*
130 * To keep the bandwidth of -deadline tasks and groups under control 138 * To keep the bandwidth of -deadline tasks and groups under control
131 * we need some place where: 139 * we need some place where:
@@ -184,7 +192,7 @@ struct cfs_bandwidth {
184 raw_spinlock_t lock; 192 raw_spinlock_t lock;
185 ktime_t period; 193 ktime_t period;
186 u64 quota, runtime; 194 u64 quota, runtime;
187 s64 hierarchal_quota; 195 s64 hierarchical_quota;
188 u64 runtime_expires; 196 u64 runtime_expires;
189 197
190 int idle, timer_active; 198 int idle, timer_active;
@@ -636,6 +644,11 @@ struct rq {
636#ifdef CONFIG_SMP 644#ifdef CONFIG_SMP
637 struct llist_head wake_list; 645 struct llist_head wake_list;
638#endif 646#endif
647
648#ifdef CONFIG_CPU_IDLE
649 /* Must be inspected within a rcu lock section */
650 struct cpuidle_state *idle_state;
651#endif
639}; 652};
640 653
641static inline int cpu_of(struct rq *rq) 654static inline int cpu_of(struct rq *rq)
@@ -647,7 +660,7 @@ static inline int cpu_of(struct rq *rq)
647#endif 660#endif
648} 661}
649 662
650DECLARE_PER_CPU(struct rq, runqueues); 663DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
651 664
652#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 665#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
653#define this_rq() (&__get_cpu_var(runqueues)) 666#define this_rq() (&__get_cpu_var(runqueues))
@@ -942,6 +955,15 @@ static inline int task_running(struct rq *rq, struct task_struct *p)
942#endif 955#endif
943} 956}
944 957
958static inline int task_on_rq_queued(struct task_struct *p)
959{
960 return p->on_rq == TASK_ON_RQ_QUEUED;
961}
962
963static inline int task_on_rq_migrating(struct task_struct *p)
964{
965 return p->on_rq == TASK_ON_RQ_MIGRATING;
966}
945 967
946#ifndef prepare_arch_switch 968#ifndef prepare_arch_switch
947# define prepare_arch_switch(next) do { } while (0) 969# define prepare_arch_switch(next) do { } while (0)
@@ -953,7 +975,6 @@ static inline int task_running(struct rq *rq, struct task_struct *p)
953# define finish_arch_post_lock_switch() do { } while (0) 975# define finish_arch_post_lock_switch() do { } while (0)
954#endif 976#endif
955 977
956#ifndef __ARCH_WANT_UNLOCKED_CTXSW
957static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 978static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
958{ 979{
959#ifdef CONFIG_SMP 980#ifdef CONFIG_SMP
@@ -991,35 +1012,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
991 raw_spin_unlock_irq(&rq->lock); 1012 raw_spin_unlock_irq(&rq->lock);
992} 1013}
993 1014
994#else /* __ARCH_WANT_UNLOCKED_CTXSW */
995static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
996{
997#ifdef CONFIG_SMP
998 /*
999 * We can optimise this out completely for !SMP, because the
1000 * SMP rebalancing from interrupt is the only thing that cares
1001 * here.
1002 */
1003 next->on_cpu = 1;
1004#endif
1005 raw_spin_unlock(&rq->lock);
1006}
1007
1008static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1009{
1010#ifdef CONFIG_SMP
1011 /*
1012 * After ->on_cpu is cleared, the task can be moved to a different CPU.
1013 * We must ensure this doesn't happen until the switch is completely
1014 * finished.
1015 */
1016 smp_wmb();
1017 prev->on_cpu = 0;
1018#endif
1019 local_irq_enable();
1020}
1021#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
1022
1023/* 1015/*
1024 * wake flags 1016 * wake flags
1025 */ 1017 */
@@ -1180,6 +1172,30 @@ static inline void idle_exit_fair(struct rq *rq) { }
1180 1172
1181#endif 1173#endif
1182 1174
1175#ifdef CONFIG_CPU_IDLE
1176static inline void idle_set_state(struct rq *rq,
1177 struct cpuidle_state *idle_state)
1178{
1179 rq->idle_state = idle_state;
1180}
1181
1182static inline struct cpuidle_state *idle_get_state(struct rq *rq)
1183{
1184 WARN_ON(!rcu_read_lock_held());
1185 return rq->idle_state;
1186}
1187#else
1188static inline void idle_set_state(struct rq *rq,
1189 struct cpuidle_state *idle_state)
1190{
1191}
1192
1193static inline struct cpuidle_state *idle_get_state(struct rq *rq)
1194{
1195 return NULL;
1196}
1197#endif
1198
1183extern void sysrq_sched_debug_show(void); 1199extern void sysrq_sched_debug_show(void);
1184extern void sched_init_granularity(void); 1200extern void sched_init_granularity(void);
1185extern void update_max_interval(void); 1201extern void update_max_interval(void);
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index bfe0edadbfbb..67426e529f59 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -28,7 +28,7 @@ pick_next_task_stop(struct rq *rq, struct task_struct *prev)
28{ 28{
29 struct task_struct *stop = rq->stop; 29 struct task_struct *stop = rq->stop;
30 30
31 if (!stop || !stop->on_rq) 31 if (!stop || !task_on_rq_queued(stop))
32 return NULL; 32 return NULL;
33 33
34 put_prev_task(rq, prev); 34 put_prev_task(rq, prev);
diff --git a/kernel/smp.c b/kernel/smp.c
index aff8aa14f547..9e0d0b289118 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -13,6 +13,7 @@
13#include <linux/gfp.h> 13#include <linux/gfp.h>
14#include <linux/smp.h> 14#include <linux/smp.h>
15#include <linux/cpu.h> 15#include <linux/cpu.h>
16#include <linux/sched.h>
16 17
17#include "smpboot.h" 18#include "smpboot.h"
18 19
@@ -699,3 +700,24 @@ void kick_all_cpus_sync(void)
699 smp_call_function(do_nothing, NULL, 1); 700 smp_call_function(do_nothing, NULL, 1);
700} 701}
701EXPORT_SYMBOL_GPL(kick_all_cpus_sync); 702EXPORT_SYMBOL_GPL(kick_all_cpus_sync);
703
704/**
705 * wake_up_all_idle_cpus - break all cpus out of idle
706 * wake_up_all_idle_cpus tries to break every cpu out of idle, including
707 * cpus that are idle-polling; for cpus that are not idle, nothing is
708 * done.
709 */
710void wake_up_all_idle_cpus(void)
711{
712 int cpu;
713
714 preempt_disable();
715 for_each_online_cpu(cpu) {
716 if (cpu == smp_processor_id())
717 continue;
718
719 wake_up_if_idle(cpu);
720 }
721 preempt_enable();
722}
723EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus);
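A hypothetical caller, only to show the intended shape of use (the function, not this caller, is what the patch adds): code that has just changed a system-wide constraint and wants every idle cpu to pass through the idle path again.

/* Hypothetical example -- not part of this series. */
static void example_constraint_changed(void)
{
	/*
	 * Wake every idle cpu (including ones polling in idle) so its next
	 * trip through the idle path sees the new constraint; busy cpus are
	 * left untouched.
	 */
	wake_up_all_idle_cpus();
}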
diff --git a/kernel/sys.c b/kernel/sys.c
index dfce4debd138..1eaa2f0b0246 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -869,11 +869,9 @@ void do_sys_times(struct tms *tms)
869{ 869{
870 cputime_t tgutime, tgstime, cutime, cstime; 870 cputime_t tgutime, tgstime, cutime, cstime;
871 871
872 spin_lock_irq(&current->sighand->siglock);
873 thread_group_cputime_adjusted(current, &tgutime, &tgstime); 872 thread_group_cputime_adjusted(current, &tgutime, &tgstime);
874 cutime = current->signal->cutime; 873 cutime = current->signal->cutime;
875 cstime = current->signal->cstime; 874 cstime = current->signal->cstime;
876 spin_unlock_irq(&current->sighand->siglock);
877 tms->tms_utime = cputime_to_clock_t(tgutime); 875 tms->tms_utime = cputime_to_clock_t(tgutime);
878 tms->tms_stime = cputime_to_clock_t(tgstime); 876 tms->tms_stime = cputime_to_clock_t(tgstime);
879 tms->tms_cutime = cputime_to_clock_t(cutime); 877 tms->tms_cutime = cputime_to_clock_t(cutime);
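Dropping siglock here relies on thread_group_cputime_adjusted() handling its own consistency internally (elsewhere in this pull the group totals appear to be guarded by a dedicated seqlock, judging by the cputime.c changes in the diffstat). The generic seqlock read-side pattern that lets a caller take a consistent snapshot without an external lock looks like this; illustrative only, not the cputime.c code:

#include <linux/seqlock.h>
#include <linux/types.h>

/* Illustrative: a writer updates both fields under write_seqlock(); readers
 * retry until they observe an unchanged, even sequence count, so they never
 * see a half-updated pair and never block the writer.
 */
struct times_snapshot {
	seqlock_t lock;
	u64 utime;
	u64 stime;
};

static void times_read(struct times_snapshot *t, u64 *ut, u64 *st)
{
	unsigned int seq;

	do {
		seq = read_seqbegin(&t->lock);
		*ut = t->utime;
		*st = t->stime;
	} while (read_seqretry(&t->lock, seq));
}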
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 1c2fe7de2842..ab370ffffd53 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -1776,7 +1776,6 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
1776 */ 1776 */
1777 if (!expires) { 1777 if (!expires) {
1778 schedule(); 1778 schedule();
1779 __set_current_state(TASK_RUNNING);
1780 return -EINTR; 1779 return -EINTR;
1781 } 1780 }
1782 1781
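This removal (and the matching ones in the ring_buffer_benchmark.c hunks further down) drops a __set_current_state(TASK_RUNNING) issued right after schedule(). It is redundant: by the time schedule() returns, either the waker has already set the task back to TASK_RUNNING via try_to_wake_up(), or schedule() itself did so on seeing a pending signal. The usual pattern, with the dead store marked (sketch; helper name is illustrative):

#include <linux/sched.h>
#include <linux/wait.h>

/* Typical sleep/wake loop. Once schedule() returns, current->state is
 * already TASK_RUNNING again, so resetting it explicitly is a no-op.
 */
static void wait_for_flag(wait_queue_head_t *wq, bool *flag)
{
	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(wq, &wait, TASK_INTERRUPTIBLE);
		if (*flag)
			break;
		schedule();
		/* __set_current_state(TASK_RUNNING);   <-- redundant */
	}
	finish_wait(wq, &wait);
}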
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 3b8946416a5f..492b986195d5 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -272,22 +272,8 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk,
272 if (same_thread_group(tsk, current)) 272 if (same_thread_group(tsk, current))
273 err = cpu_clock_sample(which_clock, tsk, &rtn); 273 err = cpu_clock_sample(which_clock, tsk, &rtn);
274 } else { 274 } else {
275 unsigned long flags;
276 struct sighand_struct *sighand;
277
278 /*
279 * while_each_thread() is not yet entirely RCU safe,
280 * keep locking the group while sampling process
281 * clock for now.
282 */
283 sighand = lock_task_sighand(tsk, &flags);
284 if (!sighand)
285 return err;
286
287 if (tsk == current || thread_group_leader(tsk)) 275 if (tsk == current || thread_group_leader(tsk))
288 err = cpu_clock_sample_group(which_clock, tsk, &rtn); 276 err = cpu_clock_sample_group(which_clock, tsk, &rtn);
289
290 unlock_task_sighand(tsk, &flags);
291 } 277 }
292 278
293 if (!err) 279 if (!err)
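The removed comment explains why the sighand lock was there: while_each_thread() was not RCU-safe. With the group clock sampled by walking the thread list via the RCU-safe for_each_thread(), pinning the group with lock_task_sighand() is no longer required. A simplified sketch of that kind of summation (not the literal thread_group_cputime() body):

#include <linux/rcupdate.h>
#include <linux/sched.h>

/* Illustrative: walk the thread group under rcu_read_lock() using the
 * RCU-safe for_each_thread() iterator and sum per-thread user time.
 * Threads exiting during the walk are handled by RCU; no sighand locking
 * is needed just to keep the list stable.
 */
static cputime_t group_utime_sketch(struct task_struct *tsk)
{
	struct task_struct *t;
	cputime_t sum = 0;

	rcu_read_lock();
	for_each_thread(tsk, t)
		sum += t->utime;
	rcu_read_unlock();

	return sum;
}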
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index 0434ff1b808e..3f9e328c30b5 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -205,7 +205,6 @@ static void ring_buffer_consumer(void)
205 break; 205 break;
206 206
207 schedule(); 207 schedule();
208 __set_current_state(TASK_RUNNING);
209 } 208 }
210 reader_finish = 0; 209 reader_finish = 0;
211 complete(&read_done); 210 complete(&read_done);
@@ -379,7 +378,6 @@ static int ring_buffer_consumer_thread(void *arg)
379 break; 378 break;
380 379
381 schedule(); 380 schedule();
382 __set_current_state(TASK_RUNNING);
383 } 381 }
384 __set_current_state(TASK_RUNNING); 382 __set_current_state(TASK_RUNNING);
385 383
@@ -407,7 +405,6 @@ static int ring_buffer_producer_thread(void *arg)
407 trace_printk("Sleeping for 10 secs\n"); 405 trace_printk("Sleeping for 10 secs\n");
408 set_current_state(TASK_INTERRUPTIBLE); 406 set_current_state(TASK_INTERRUPTIBLE);
409 schedule_timeout(HZ * SLEEP_TIME); 407 schedule_timeout(HZ * SLEEP_TIME);
410 __set_current_state(TASK_RUNNING);
411 } 408 }
412 409
413 if (kill_test) 410 if (kill_test)
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 8a4e5cb66a4c..16eddb308c33 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -13,7 +13,6 @@
13#include <linux/sysctl.h> 13#include <linux/sysctl.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/fs.h> 15#include <linux/fs.h>
16#include <linux/magic.h>
17 16
18#include <asm/setup.h> 17#include <asm/setup.h>
19 18
@@ -171,8 +170,7 @@ check_stack(unsigned long ip, unsigned long *stack)
171 i++; 170 i++;
172 } 171 }
173 172
174 if ((current != &init_task && 173 if (task_stack_end_corrupted(current)) {
175 *(end_of_stack(current)) != STACK_END_MAGIC)) {
176 print_max_stack(); 174 print_max_stack();
177 BUG(); 175 BUG();
178 } 176 }
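task_stack_end_corrupted() folds the open-coded canary check into one helper; the dropped init_task exclusion suggests the init task's stack end now carries the canary as well. Assuming the helper has the obvious shape, it boils down to roughly:

#include <linux/magic.h>	/* STACK_END_MAGIC */
#include <linux/sched.h>	/* end_of_stack() */

/* Presumed shape of the helper: the word at the end of the task's stack is
 * initialised to STACK_END_MAGIC, so any other value there means the stack
 * has overflowed into the canary.
 */
#ifndef task_stack_end_corrupted
#define task_stack_end_corrupted(task) \
	(*(end_of_stack(task)) != STACK_END_MAGIC)
#endif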