Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/auto_group.c        |  36
-rw-r--r--  kernel/sched/core.c              | 362
-rw-r--r--  kernel/sched/cpudeadline.c       | 153
-rw-r--r--  kernel/sched/cpudeadline.h       |   3
-rw-r--r--  kernel/sched/cpufreq.c           |   2
-rw-r--r--  kernel/sched/cpufreq_schedutil.c | 122
-rw-r--r--  kernel/sched/cputime.c           |  87
-rw-r--r--  kernel/sched/deadline.c          |  83
-rw-r--r--  kernel/sched/debug.c             | 106
-rw-r--r--  kernel/sched/fair.c              | 794
-rw-r--r--  kernel/sched/idle.c              |  13
-rw-r--r--  kernel/sched/idle_task.c         |   4
-rw-r--r--  kernel/sched/rt.c                |   5
-rw-r--r--  kernel/sched/sched.h             | 136
-rw-r--r--  kernel/sched/stats.h             |  24
-rw-r--r--  kernel/sched/wait.c              | 123
16 files changed, 1286 insertions(+), 767 deletions(-)
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index a5d966cb8891..f1c8fd566246 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -111,10 +111,13 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
111{ 111{
112 if (tg != &root_task_group) 112 if (tg != &root_task_group)
113 return false; 113 return false;
114
115 /* 114 /*
116 * We can only assume the task group can't go away on us if 115 * If we race with autogroup_move_group() the caller can use the old
117 * autogroup_move_group() can see us on ->thread_group list. 116 * value of signal->autogroup but in this case sched_move_task() will
117 * be called again before autogroup_kref_put().
118 *
119 * However, there is no way sched_autogroup_exit_task() could tell us
120 * to avoid autogroup->tg, so we abuse PF_EXITING flag for this case.
118 */ 121 */
119 if (p->flags & PF_EXITING) 122 if (p->flags & PF_EXITING)
120 return false; 123 return false;
@@ -122,6 +125,16 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
122 return true; 125 return true;
123} 126}
124 127
128void sched_autogroup_exit_task(struct task_struct *p)
129{
130 /*
131 * We are going to call exit_notify() and autogroup_move_group() can't
132 * see this thread after that: we can no longer use signal->autogroup.
133 * See the PF_EXITING check in task_wants_autogroup().
134 */
135 sched_move_task(p);
136}
137
125static void 138static void
126autogroup_move_group(struct task_struct *p, struct autogroup *ag) 139autogroup_move_group(struct task_struct *p, struct autogroup *ag)
127{ 140{
@@ -138,13 +151,20 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
138 } 151 }
139 152
140 p->signal->autogroup = autogroup_kref_get(ag); 153 p->signal->autogroup = autogroup_kref_get(ag);
141 154 /*
142 if (!READ_ONCE(sysctl_sched_autogroup_enabled)) 155 * We can't avoid sched_move_task() after we changed signal->autogroup,
143 goto out; 156 * this process can already run with task_group() == prev->tg or we can
144 157 * race with cgroup code which can read autogroup = prev under rq->lock.
158 * In the latter case for_each_thread() can not miss a migrating thread,
159 * cpu_cgroup_attach() must not be possible after cgroup_exit() and it
160 * can't be removed from thread list, we hold ->siglock.
161 *
162 * If an exiting thread was already removed from thread list we rely on
163 * sched_autogroup_exit_task().
164 */
145 for_each_thread(p, t) 165 for_each_thread(p, t)
146 sched_move_task(t); 166 sched_move_task(t);
147out: 167
148 unlock_task_sighand(p, &flags); 168 unlock_task_sighand(p, &flags);
149 autogroup_kref_put(prev); 169 autogroup_kref_put(prev);
150} 170}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 44817c640e99..154fd689fe02 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -581,6 +581,8 @@ static bool wake_up_full_nohz_cpu(int cpu)
581 * If needed we can still optimize that later with an 581 * If needed we can still optimize that later with an
582 * empty IRQ. 582 * empty IRQ.
583 */ 583 */
584 if (cpu_is_offline(cpu))
585 return true; /* Don't try to wake offline CPUs. */
584 if (tick_nohz_full_cpu(cpu)) { 586 if (tick_nohz_full_cpu(cpu)) {
585 if (cpu != smp_processor_id() || 587 if (cpu != smp_processor_id() ||
586 tick_nohz_tick_stopped()) 588 tick_nohz_tick_stopped())
@@ -591,6 +593,11 @@ static bool wake_up_full_nohz_cpu(int cpu)
591 return false; 593 return false;
592} 594}
593 595
596/*
597 * Wake up the specified CPU. If the CPU is going offline, it is the
598 * caller's responsibility to deal with the lost wakeup, for example,
599 * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
600 */
594void wake_up_nohz_cpu(int cpu) 601void wake_up_nohz_cpu(int cpu)
595{ 602{
596 if (!wake_up_full_nohz_cpu(cpu)) 603 if (!wake_up_full_nohz_cpu(cpu))
@@ -1063,8 +1070,12 @@ static int migration_cpu_stop(void *data)
1063 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because 1070 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
1064 * we're holding p->pi_lock. 1071 * we're holding p->pi_lock.
1065 */ 1072 */
1066 if (task_rq(p) == rq && task_on_rq_queued(p)) 1073 if (task_rq(p) == rq) {
1067 rq = __migrate_task(rq, p, arg->dest_cpu); 1074 if (task_on_rq_queued(p))
1075 rq = __migrate_task(rq, p, arg->dest_cpu);
1076 else
1077 p->wake_cpu = arg->dest_cpu;
1078 }
1068 raw_spin_unlock(&rq->lock); 1079 raw_spin_unlock(&rq->lock);
1069 raw_spin_unlock(&p->pi_lock); 1080 raw_spin_unlock(&p->pi_lock);
1070 1081
@@ -1105,10 +1116,10 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
1105 1116
1106 p->sched_class->set_cpus_allowed(p, new_mask); 1117 p->sched_class->set_cpus_allowed(p, new_mask);
1107 1118
1108 if (running)
1109 p->sched_class->set_curr_task(rq);
1110 if (queued) 1119 if (queued)
1111 enqueue_task(rq, p, ENQUEUE_RESTORE); 1120 enqueue_task(rq, p, ENQUEUE_RESTORE);
1121 if (running)
1122 set_curr_task(rq, p);
1112} 1123}
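Several hunks in this file make the same ordering change: the running task is now re-selected (via the new rq-level set_curr_task() helper) only after it has been enqueued again, instead of before. Below is a small, self-contained sketch of that save/dequeue/modify/enqueue/restore pattern, not the kernel code; struct task, struct runqueue and all helpers are names made up for this example.

#include <stdbool.h>

struct task {
	int prio;
	bool queued;
};

struct runqueue {
	struct task *curr;
	/* ... queue of runnable tasks, elided ... */
};

static void dequeue_task(struct runqueue *rq, struct task *p) { p->queued = false; }
static void enqueue_task(struct runqueue *rq, struct task *p) { p->queued = true; }
static void put_prev_task(struct runqueue *rq, struct task *p) { }
static void set_curr_task(struct runqueue *rq, struct task *p) { rq->curr = p; }

static void change_prio(struct runqueue *rq, struct task *p, int prio)
{
	bool queued = p->queued;
	bool running = (rq->curr == p);

	if (queued)
		dequeue_task(rq, p);
	if (running)
		put_prev_task(rq, p);

	p->prio = prio;			/* the actual parameter change */

	if (queued)
		enqueue_task(rq, p);	/* enqueue first ...              */
	if (running)
		set_curr_task(rq, p);	/* ... then re-select it, so the  */
					/* current task is seen as queued */
}

int main(void)
{
	struct runqueue rq = { 0 };
	struct task p = { .prio = 120, .queued = true };

	rq.curr = &p;
	change_prio(&rq, &p, 100);
	return (p.prio == 100 && p.queued && rq.curr == &p) ? 0 : 1;
}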
1113 1124
1114/* 1125/*
@@ -1265,7 +1276,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu)
1265 /* 1276 /*
1266 * Task isn't running anymore; make it appear like we migrated 1277 * Task isn't running anymore; make it appear like we migrated
1267 * it before it went to sleep. This means on wakeup we make the 1278 * it before it went to sleep. This means on wakeup we make the
1268 * previous cpu our targer instead of where it really is. 1279 * previous cpu our target instead of where it really is.
1269 */ 1280 */
1270 p->wake_cpu = cpu; 1281 p->wake_cpu = cpu;
1271 } 1282 }
@@ -1629,23 +1640,25 @@ static inline int __set_cpus_allowed_ptr(struct task_struct *p,
1629static void 1640static void
1630ttwu_stat(struct task_struct *p, int cpu, int wake_flags) 1641ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1631{ 1642{
1632#ifdef CONFIG_SCHEDSTATS 1643 struct rq *rq;
1633 struct rq *rq = this_rq();
1634 1644
1635#ifdef CONFIG_SMP 1645 if (!schedstat_enabled())
1636 int this_cpu = smp_processor_id(); 1646 return;
1637 1647
1638 if (cpu == this_cpu) { 1648 rq = this_rq();
1639 schedstat_inc(rq, ttwu_local); 1649
1640 schedstat_inc(p, se.statistics.nr_wakeups_local); 1650#ifdef CONFIG_SMP
1651 if (cpu == rq->cpu) {
1652 schedstat_inc(rq->ttwu_local);
1653 schedstat_inc(p->se.statistics.nr_wakeups_local);
1641 } else { 1654 } else {
1642 struct sched_domain *sd; 1655 struct sched_domain *sd;
1643 1656
1644 schedstat_inc(p, se.statistics.nr_wakeups_remote); 1657 schedstat_inc(p->se.statistics.nr_wakeups_remote);
1645 rcu_read_lock(); 1658 rcu_read_lock();
1646 for_each_domain(this_cpu, sd) { 1659 for_each_domain(rq->cpu, sd) {
1647 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { 1660 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
1648 schedstat_inc(sd, ttwu_wake_remote); 1661 schedstat_inc(sd->ttwu_wake_remote);
1649 break; 1662 break;
1650 } 1663 }
1651 } 1664 }
@@ -1653,17 +1666,14 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1653 } 1666 }
1654 1667
1655 if (wake_flags & WF_MIGRATED) 1668 if (wake_flags & WF_MIGRATED)
1656 schedstat_inc(p, se.statistics.nr_wakeups_migrate); 1669 schedstat_inc(p->se.statistics.nr_wakeups_migrate);
1657
1658#endif /* CONFIG_SMP */ 1670#endif /* CONFIG_SMP */
1659 1671
1660 schedstat_inc(rq, ttwu_count); 1672 schedstat_inc(rq->ttwu_count);
1661 schedstat_inc(p, se.statistics.nr_wakeups); 1673 schedstat_inc(p->se.statistics.nr_wakeups);
1662 1674
1663 if (wake_flags & WF_SYNC) 1675 if (wake_flags & WF_SYNC)
1664 schedstat_inc(p, se.statistics.nr_wakeups_sync); 1676 schedstat_inc(p->se.statistics.nr_wakeups_sync);
1665
1666#endif /* CONFIG_SCHEDSTATS */
1667} 1677}
1668 1678
1669static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags) 1679static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
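The ttwu_stat() hunk above converts the old two-argument schedstat_inc(rq, field) calls to the one-argument schedstat_inc(rq->field) form and moves the schedstat_enabled() check into the function itself. As a rough illustration of that one-argument style only (not the kernel's actual schedstat.h, which uses a static key and compiles away without CONFIG_SCHEDSTATS), a counter macro gated by a runtime flag could look like the sketch below; stats_enabled, stat_inc and wakeup_stats are invented names.

#include <stdbool.h>
#include <stdio.h>

static bool stats_enabled = true;

#define stat_inc(var)				\
	do {					\
		if (stats_enabled)		\
			(var)++;		\
	} while (0)

struct wakeup_stats {
	unsigned long ttwu_count;
	unsigned long ttwu_local;
};

static struct wakeup_stats ws;

static void record_local_wakeup(void)
{
	/* Both increments do nothing at run time when stats_enabled is false. */
	stat_inc(ws.ttwu_count);
	stat_inc(ws.ttwu_local);
}

int main(void)
{
	record_local_wakeup();
	printf("ttwu_count=%lu ttwu_local=%lu\n", ws.ttwu_count, ws.ttwu_local);
	return 0;
}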
@@ -2084,8 +2094,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
2084 2094
2085 ttwu_queue(p, cpu, wake_flags); 2095 ttwu_queue(p, cpu, wake_flags);
2086stat: 2096stat:
2087 if (schedstat_enabled()) 2097 ttwu_stat(p, cpu, wake_flags);
2088 ttwu_stat(p, cpu, wake_flags);
2089out: 2098out:
2090 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2099 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2091 2100
@@ -2095,6 +2104,7 @@ out:
2095/** 2104/**
2096 * try_to_wake_up_local - try to wake up a local task with rq lock held 2105 * try_to_wake_up_local - try to wake up a local task with rq lock held
2097 * @p: the thread to be awakened 2106 * @p: the thread to be awakened
2107 * @cookie: context's cookie for pinning
2098 * 2108 *
2099 * Put @p on the run-queue if it's not already there. The caller must 2109 * Put @p on the run-queue if it's not already there. The caller must
2100 * ensure that this_rq() is locked, @p is bound to this_rq() and not 2110 * ensure that this_rq() is locked, @p is bound to this_rq() and not
@@ -2133,8 +2143,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct pin_cookie cookie
2133 ttwu_activate(rq, p, ENQUEUE_WAKEUP); 2143 ttwu_activate(rq, p, ENQUEUE_WAKEUP);
2134 2144
2135 ttwu_do_wakeup(rq, p, 0, cookie); 2145 ttwu_do_wakeup(rq, p, 0, cookie);
2136 if (schedstat_enabled()) 2146 ttwu_stat(p, smp_processor_id(), 0);
2137 ttwu_stat(p, smp_processor_id(), 0);
2138out: 2147out:
2139 raw_spin_unlock(&p->pi_lock); 2148 raw_spin_unlock(&p->pi_lock);
2140} 2149}
@@ -2772,6 +2781,10 @@ static struct rq *finish_task_switch(struct task_struct *prev)
2772 * task and put them back on the free list. 2781 * task and put them back on the free list.
2773 */ 2782 */
2774 kprobe_flush_task(prev); 2783 kprobe_flush_task(prev);
2784
2785 /* Task is done with its stack. */
2786 put_task_stack(prev);
2787
2775 put_task_struct(prev); 2788 put_task_struct(prev);
2776 } 2789 }
2777 2790
@@ -3192,6 +3205,9 @@ static inline void preempt_latency_stop(int val) { }
3192 */ 3205 */
3193static noinline void __schedule_bug(struct task_struct *prev) 3206static noinline void __schedule_bug(struct task_struct *prev)
3194{ 3207{
3208 /* Save this before calling printk(), since that will clobber it */
3209 unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
3210
3195 if (oops_in_progress) 3211 if (oops_in_progress)
3196 return; 3212 return;
3197 3213
@@ -3202,13 +3218,12 @@ static noinline void __schedule_bug(struct task_struct *prev)
3202 print_modules(); 3218 print_modules();
3203 if (irqs_disabled()) 3219 if (irqs_disabled())
3204 print_irqtrace_events(prev); 3220 print_irqtrace_events(prev);
3205#ifdef CONFIG_DEBUG_PREEMPT 3221 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
3206 if (in_atomic_preempt_off()) { 3222 && in_atomic_preempt_off()) {
3207 pr_err("Preemption disabled at:"); 3223 pr_err("Preemption disabled at:");
3208 print_ip_sym(current->preempt_disable_ip); 3224 print_ip_sym(preempt_disable_ip);
3209 pr_cont("\n"); 3225 pr_cont("\n");
3210 } 3226 }
3211#endif
3212 if (panic_on_warn) 3227 if (panic_on_warn)
3213 panic("scheduling while atomic\n"); 3228 panic("scheduling while atomic\n");
3214 3229
@@ -3234,7 +3249,7 @@ static inline void schedule_debug(struct task_struct *prev)
3234 3249
3235 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 3250 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3236 3251
3237 schedstat_inc(this_rq(), sched_count); 3252 schedstat_inc(this_rq()->sched_count);
3238} 3253}
3239 3254
3240/* 3255/*
@@ -3327,17 +3342,6 @@ static void __sched notrace __schedule(bool preempt)
3327 rq = cpu_rq(cpu); 3342 rq = cpu_rq(cpu);
3328 prev = rq->curr; 3343 prev = rq->curr;
3329 3344
3330 /*
3331 * do_exit() calls schedule() with preemption disabled as an exception;
3332 * however we must fix that up, otherwise the next task will see an
3333 * inconsistent (higher) preempt count.
3334 *
3335 * It also avoids the below schedule_debug() test from complaining
3336 * about this.
3337 */
3338 if (unlikely(prev->state == TASK_DEAD))
3339 preempt_enable_no_resched_notrace();
3340
3341 schedule_debug(prev); 3345 schedule_debug(prev);
3342 3346
3343 if (sched_feat(HRTICK)) 3347 if (sched_feat(HRTICK))
@@ -3403,7 +3407,33 @@ static void __sched notrace __schedule(bool preempt)
3403 3407
3404 balance_callback(rq); 3408 balance_callback(rq);
3405} 3409}
3406STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */ 3410
3411void __noreturn do_task_dead(void)
3412{
3413 /*
3414 * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
3415 * when the following two conditions become true.
 3416 * - There is a race condition on mmap_sem (it is acquired by
 3417 * exit_mm()), and
 3418 * - An SMI occurs before setting TASK_RUNNING
 3419 * (or the hypervisor of a virtual machine switches to another guest).
3420 * As a result, we may become TASK_RUNNING after becoming TASK_DEAD
3421 *
3422 * To avoid it, we have to wait for releasing tsk->pi_lock which
3423 * is held by try_to_wake_up()
3424 */
3425 smp_mb();
3426 raw_spin_unlock_wait(&current->pi_lock);
3427
3428 /* causes final put_task_struct in finish_task_switch(). */
3429 __set_current_state(TASK_DEAD);
3430 current->flags |= PF_NOFREEZE; /* tell freezer to ignore us */
3431 __schedule(false);
3432 BUG();
3433 /* Avoid "noreturn function does return". */
3434 for (;;)
3435 cpu_relax(); /* For when BUG is null */
3436}
3407 3437
3408static inline void sched_submit_work(struct task_struct *tsk) 3438static inline void sched_submit_work(struct task_struct *tsk)
3409{ 3439{
@@ -3687,10 +3717,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
3687 3717
3688 p->prio = prio; 3718 p->prio = prio;
3689 3719
3690 if (running)
3691 p->sched_class->set_curr_task(rq);
3692 if (queued) 3720 if (queued)
3693 enqueue_task(rq, p, queue_flag); 3721 enqueue_task(rq, p, queue_flag);
3722 if (running)
3723 set_curr_task(rq, p);
3694 3724
3695 check_class_changed(rq, p, prev_class, oldprio); 3725 check_class_changed(rq, p, prev_class, oldprio);
3696out_unlock: 3726out_unlock:
@@ -3704,7 +3734,8 @@ out_unlock:
3704 3734
3705void set_user_nice(struct task_struct *p, long nice) 3735void set_user_nice(struct task_struct *p, long nice)
3706{ 3736{
3707 int old_prio, delta, queued; 3737 bool queued, running;
3738 int old_prio, delta;
3708 struct rq_flags rf; 3739 struct rq_flags rf;
3709 struct rq *rq; 3740 struct rq *rq;
3710 3741
@@ -3726,8 +3757,11 @@ void set_user_nice(struct task_struct *p, long nice)
3726 goto out_unlock; 3757 goto out_unlock;
3727 } 3758 }
3728 queued = task_on_rq_queued(p); 3759 queued = task_on_rq_queued(p);
3760 running = task_current(rq, p);
3729 if (queued) 3761 if (queued)
3730 dequeue_task(rq, p, DEQUEUE_SAVE); 3762 dequeue_task(rq, p, DEQUEUE_SAVE);
3763 if (running)
3764 put_prev_task(rq, p);
3731 3765
3732 p->static_prio = NICE_TO_PRIO(nice); 3766 p->static_prio = NICE_TO_PRIO(nice);
3733 set_load_weight(p); 3767 set_load_weight(p);
@@ -3744,6 +3778,8 @@ void set_user_nice(struct task_struct *p, long nice)
3744 if (delta < 0 || (delta > 0 && task_running(rq, p))) 3778 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3745 resched_curr(rq); 3779 resched_curr(rq);
3746 } 3780 }
3781 if (running)
3782 set_curr_task(rq, p);
3747out_unlock: 3783out_unlock:
3748 task_rq_unlock(rq, p, &rf); 3784 task_rq_unlock(rq, p, &rf);
3749} 3785}
@@ -4243,8 +4279,6 @@ change:
4243 prev_class = p->sched_class; 4279 prev_class = p->sched_class;
4244 __setscheduler(rq, p, attr, pi); 4280 __setscheduler(rq, p, attr, pi);
4245 4281
4246 if (running)
4247 p->sched_class->set_curr_task(rq);
4248 if (queued) { 4282 if (queued) {
4249 /* 4283 /*
4250 * We enqueue to tail when the priority of a task is 4284 * We enqueue to tail when the priority of a task is
@@ -4255,6 +4289,8 @@ change:
4255 4289
4256 enqueue_task(rq, p, queue_flags); 4290 enqueue_task(rq, p, queue_flags);
4257 } 4291 }
4292 if (running)
4293 set_curr_task(rq, p);
4258 4294
4259 check_class_changed(rq, p, prev_class, oldprio); 4295 check_class_changed(rq, p, prev_class, oldprio);
4260 preempt_disable(); /* avoid rq from going away on us */ 4296 preempt_disable(); /* avoid rq from going away on us */
@@ -4846,7 +4882,7 @@ SYSCALL_DEFINE0(sched_yield)
4846{ 4882{
4847 struct rq *rq = this_rq_lock(); 4883 struct rq *rq = this_rq_lock();
4848 4884
4849 schedstat_inc(rq, yld_count); 4885 schedstat_inc(rq->yld_count);
4850 current->sched_class->yield_task(rq); 4886 current->sched_class->yield_task(rq);
4851 4887
4852 /* 4888 /*
@@ -4863,6 +4899,7 @@ SYSCALL_DEFINE0(sched_yield)
4863 return 0; 4899 return 0;
4864} 4900}
4865 4901
4902#ifndef CONFIG_PREEMPT
4866int __sched _cond_resched(void) 4903int __sched _cond_resched(void)
4867{ 4904{
4868 if (should_resched(0)) { 4905 if (should_resched(0)) {
@@ -4872,6 +4909,7 @@ int __sched _cond_resched(void)
4872 return 0; 4909 return 0;
4873} 4910}
4874EXPORT_SYMBOL(_cond_resched); 4911EXPORT_SYMBOL(_cond_resched);
4912#endif
4875 4913
4876/* 4914/*
4877 * __cond_resched_lock() - if a reschedule is pending, drop the given lock, 4915 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
@@ -4997,7 +5035,7 @@ again:
4997 5035
4998 yielded = curr->sched_class->yield_to_task(rq, p, preempt); 5036 yielded = curr->sched_class->yield_to_task(rq, p, preempt);
4999 if (yielded) { 5037 if (yielded) {
5000 schedstat_inc(rq, yld_count); 5038 schedstat_inc(rq->yld_count);
5001 /* 5039 /*
5002 * Make p's CPU reschedule; pick_next_entity takes care of 5040 * Make p's CPU reschedule; pick_next_entity takes care of
5003 * fairness. 5041 * fairness.
@@ -5154,21 +5192,14 @@ void sched_show_task(struct task_struct *p)
5154 int ppid; 5192 int ppid;
5155 unsigned long state = p->state; 5193 unsigned long state = p->state;
5156 5194
5195 if (!try_get_task_stack(p))
5196 return;
5157 if (state) 5197 if (state)
5158 state = __ffs(state) + 1; 5198 state = __ffs(state) + 1;
5159 printk(KERN_INFO "%-15.15s %c", p->comm, 5199 printk(KERN_INFO "%-15.15s %c", p->comm,
5160 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?'); 5200 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
5161#if BITS_PER_LONG == 32
5162 if (state == TASK_RUNNING)
5163 printk(KERN_CONT " running ");
5164 else
5165 printk(KERN_CONT " %08lx ", thread_saved_pc(p));
5166#else
5167 if (state == TASK_RUNNING) 5201 if (state == TASK_RUNNING)
5168 printk(KERN_CONT " running task "); 5202 printk(KERN_CONT " running task ");
5169 else
5170 printk(KERN_CONT " %016lx ", thread_saved_pc(p));
5171#endif
5172#ifdef CONFIG_DEBUG_STACK_USAGE 5203#ifdef CONFIG_DEBUG_STACK_USAGE
5173 free = stack_not_used(p); 5204 free = stack_not_used(p);
5174#endif 5205#endif
@@ -5183,6 +5214,7 @@ void sched_show_task(struct task_struct *p)
5183 5214
5184 print_worker_info(KERN_INFO, p); 5215 print_worker_info(KERN_INFO, p);
5185 show_stack(p, NULL); 5216 show_stack(p, NULL);
5217 put_task_stack(p);
5186} 5218}
5187 5219
5188void show_state_filter(unsigned long state_filter) 5220void show_state_filter(unsigned long state_filter)
@@ -5417,10 +5449,10 @@ void sched_setnuma(struct task_struct *p, int nid)
5417 5449
5418 p->numa_preferred_nid = nid; 5450 p->numa_preferred_nid = nid;
5419 5451
5420 if (running)
5421 p->sched_class->set_curr_task(rq);
5422 if (queued) 5452 if (queued)
5423 enqueue_task(rq, p, ENQUEUE_RESTORE); 5453 enqueue_task(rq, p, ENQUEUE_RESTORE);
5454 if (running)
5455 set_curr_task(rq, p);
5424 task_rq_unlock(rq, p, &rf); 5456 task_rq_unlock(rq, p, &rf);
5425} 5457}
5426#endif /* CONFIG_NUMA_BALANCING */ 5458#endif /* CONFIG_NUMA_BALANCING */
@@ -5717,6 +5749,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
5717 } 5749 }
5718} 5750}
5719#else /* !CONFIG_SCHED_DEBUG */ 5751#else /* !CONFIG_SCHED_DEBUG */
5752
5753# define sched_debug_enabled 0
5720# define sched_domain_debug(sd, cpu) do { } while (0) 5754# define sched_domain_debug(sd, cpu) do { } while (0)
5721static inline bool sched_debug(void) 5755static inline bool sched_debug(void)
5722{ 5756{
@@ -5735,6 +5769,7 @@ static int sd_degenerate(struct sched_domain *sd)
5735 SD_BALANCE_FORK | 5769 SD_BALANCE_FORK |
5736 SD_BALANCE_EXEC | 5770 SD_BALANCE_EXEC |
5737 SD_SHARE_CPUCAPACITY | 5771 SD_SHARE_CPUCAPACITY |
5772 SD_ASYM_CPUCAPACITY |
5738 SD_SHARE_PKG_RESOURCES | 5773 SD_SHARE_PKG_RESOURCES |
5739 SD_SHARE_POWERDOMAIN)) { 5774 SD_SHARE_POWERDOMAIN)) {
5740 if (sd->groups != sd->groups->next) 5775 if (sd->groups != sd->groups->next)
@@ -5765,6 +5800,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5765 SD_BALANCE_NEWIDLE | 5800 SD_BALANCE_NEWIDLE |
5766 SD_BALANCE_FORK | 5801 SD_BALANCE_FORK |
5767 SD_BALANCE_EXEC | 5802 SD_BALANCE_EXEC |
5803 SD_ASYM_CPUCAPACITY |
5768 SD_SHARE_CPUCAPACITY | 5804 SD_SHARE_CPUCAPACITY |
5769 SD_SHARE_PKG_RESOURCES | 5805 SD_SHARE_PKG_RESOURCES |
5770 SD_PREFER_SIBLING | 5806 SD_PREFER_SIBLING |
@@ -5909,10 +5945,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgc)
5909 } while (sg != first); 5945 } while (sg != first);
5910} 5946}
5911 5947
5912static void free_sched_domain(struct rcu_head *rcu) 5948static void destroy_sched_domain(struct sched_domain *sd)
5913{ 5949{
5914 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
5915
5916 /* 5950 /*
5917 * If its an overlapping domain it has private groups, iterate and 5951 * If its an overlapping domain it has private groups, iterate and
5918 * nuke them all. 5952 * nuke them all.
@@ -5923,18 +5957,26 @@ static void free_sched_domain(struct rcu_head *rcu)
5923 kfree(sd->groups->sgc); 5957 kfree(sd->groups->sgc);
5924 kfree(sd->groups); 5958 kfree(sd->groups);
5925 } 5959 }
5960 if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
5961 kfree(sd->shared);
5926 kfree(sd); 5962 kfree(sd);
5927} 5963}
5928 5964
5929static void destroy_sched_domain(struct sched_domain *sd, int cpu) 5965static void destroy_sched_domains_rcu(struct rcu_head *rcu)
5930{ 5966{
5931 call_rcu(&sd->rcu, free_sched_domain); 5967 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
5968
5969 while (sd) {
5970 struct sched_domain *parent = sd->parent;
5971 destroy_sched_domain(sd);
5972 sd = parent;
5973 }
5932} 5974}
5933 5975
5934static void destroy_sched_domains(struct sched_domain *sd, int cpu) 5976static void destroy_sched_domains(struct sched_domain *sd)
5935{ 5977{
5936 for (; sd; sd = sd->parent) 5978 if (sd)
5937 destroy_sched_domain(sd, cpu); 5979 call_rcu(&sd->rcu, destroy_sched_domains_rcu);
5938} 5980}
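destroy_sched_domains() above now queues a single RCU callback for the base domain and lets destroy_sched_domains_rcu() walk the ->parent chain, rather than queuing one callback per level. A minimal userspace sketch of that "free the whole chain from one deferred callback" idea, with made-up names (struct node, free_chain) and plain free() standing in for the RCU machinery:

#include <stdlib.h>

struct node {
	struct node *parent;
	/* payload elided */
};

/* Stands in for the single deferred callback: frees every level at once. */
static void free_chain(struct node *n)
{
	while (n) {
		struct node *parent = n->parent;

		free(n);
		n = parent;
	}
}

int main(void)
{
	/* Build a three-level parent chain, then release it with one call. */
	struct node *top = calloc(1, sizeof(*top));
	struct node *mid = calloc(1, sizeof(*mid));
	struct node *base = calloc(1, sizeof(*base));

	mid->parent = top;
	base->parent = mid;
	free_chain(base);
	return 0;
}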
5939 5981
5940/* 5982/*
@@ -5949,14 +5991,14 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5949DEFINE_PER_CPU(struct sched_domain *, sd_llc); 5991DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5950DEFINE_PER_CPU(int, sd_llc_size); 5992DEFINE_PER_CPU(int, sd_llc_size);
5951DEFINE_PER_CPU(int, sd_llc_id); 5993DEFINE_PER_CPU(int, sd_llc_id);
5994DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
5952DEFINE_PER_CPU(struct sched_domain *, sd_numa); 5995DEFINE_PER_CPU(struct sched_domain *, sd_numa);
5953DEFINE_PER_CPU(struct sched_domain *, sd_busy);
5954DEFINE_PER_CPU(struct sched_domain *, sd_asym); 5996DEFINE_PER_CPU(struct sched_domain *, sd_asym);
5955 5997
5956static void update_top_cache_domain(int cpu) 5998static void update_top_cache_domain(int cpu)
5957{ 5999{
6000 struct sched_domain_shared *sds = NULL;
5958 struct sched_domain *sd; 6001 struct sched_domain *sd;
5959 struct sched_domain *busy_sd = NULL;
5960 int id = cpu; 6002 int id = cpu;
5961 int size = 1; 6003 int size = 1;
5962 6004
@@ -5964,13 +6006,13 @@ static void update_top_cache_domain(int cpu)
5964 if (sd) { 6006 if (sd) {
5965 id = cpumask_first(sched_domain_span(sd)); 6007 id = cpumask_first(sched_domain_span(sd));
5966 size = cpumask_weight(sched_domain_span(sd)); 6008 size = cpumask_weight(sched_domain_span(sd));
5967 busy_sd = sd->parent; /* sd_busy */ 6009 sds = sd->shared;
5968 } 6010 }
5969 rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
5970 6011
5971 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 6012 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5972 per_cpu(sd_llc_size, cpu) = size; 6013 per_cpu(sd_llc_size, cpu) = size;
5973 per_cpu(sd_llc_id, cpu) = id; 6014 per_cpu(sd_llc_id, cpu) = id;
6015 rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
5974 6016
5975 sd = lowest_flag_domain(cpu, SD_NUMA); 6017 sd = lowest_flag_domain(cpu, SD_NUMA);
5976 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); 6018 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
@@ -6006,7 +6048,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6006 */ 6048 */
6007 if (parent->flags & SD_PREFER_SIBLING) 6049 if (parent->flags & SD_PREFER_SIBLING)
6008 tmp->flags |= SD_PREFER_SIBLING; 6050 tmp->flags |= SD_PREFER_SIBLING;
6009 destroy_sched_domain(parent, cpu); 6051 destroy_sched_domain(parent);
6010 } else 6052 } else
6011 tmp = tmp->parent; 6053 tmp = tmp->parent;
6012 } 6054 }
@@ -6014,7 +6056,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6014 if (sd && sd_degenerate(sd)) { 6056 if (sd && sd_degenerate(sd)) {
6015 tmp = sd; 6057 tmp = sd;
6016 sd = sd->parent; 6058 sd = sd->parent;
6017 destroy_sched_domain(tmp, cpu); 6059 destroy_sched_domain(tmp);
6018 if (sd) 6060 if (sd)
6019 sd->child = NULL; 6061 sd->child = NULL;
6020 } 6062 }
@@ -6024,7 +6066,7 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
6024 rq_attach_root(rq, rd); 6066 rq_attach_root(rq, rd);
6025 tmp = rq->sd; 6067 tmp = rq->sd;
6026 rcu_assign_pointer(rq->sd, sd); 6068 rcu_assign_pointer(rq->sd, sd);
6027 destroy_sched_domains(tmp, cpu); 6069 destroy_sched_domains(tmp);
6028 6070
6029 update_top_cache_domain(cpu); 6071 update_top_cache_domain(cpu);
6030} 6072}
@@ -6267,7 +6309,6 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
6267 return; 6309 return;
6268 6310
6269 update_group_capacity(sd, cpu); 6311 update_group_capacity(sd, cpu);
6270 atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
6271} 6312}
6272 6313
6273/* 6314/*
@@ -6355,6 +6396,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
6355 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); 6396 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
6356 *per_cpu_ptr(sdd->sd, cpu) = NULL; 6397 *per_cpu_ptr(sdd->sd, cpu) = NULL;
6357 6398
6399 if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
6400 *per_cpu_ptr(sdd->sds, cpu) = NULL;
6401
6358 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref)) 6402 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
6359 *per_cpu_ptr(sdd->sg, cpu) = NULL; 6403 *per_cpu_ptr(sdd->sg, cpu) = NULL;
6360 6404
@@ -6374,26 +6418,37 @@ static int sched_domains_curr_level;
6374/* 6418/*
6375 * SD_flags allowed in topology descriptions. 6419 * SD_flags allowed in topology descriptions.
6376 * 6420 *
6377 * SD_SHARE_CPUCAPACITY - describes SMT topologies 6421 * These flags are purely descriptive of the topology and do not prescribe
6378 * SD_SHARE_PKG_RESOURCES - describes shared caches 6422 * behaviour. Behaviour is artificial and mapped in the below sd_init()
6379 * SD_NUMA - describes NUMA topologies 6423 * function:
6380 * SD_SHARE_POWERDOMAIN - describes shared power domain 6424 *
6425 * SD_SHARE_CPUCAPACITY - describes SMT topologies
6426 * SD_SHARE_PKG_RESOURCES - describes shared caches
6427 * SD_NUMA - describes NUMA topologies
6428 * SD_SHARE_POWERDOMAIN - describes shared power domain
6429 * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies
6381 * 6430 *
 6382 * Odd one out: 6431 * Odd one out, which besides describing the topology has a quirk also
6383 * SD_ASYM_PACKING - describes SMT quirks 6432 * prescribes the desired behaviour that goes along with it:
6433 *
6434 * SD_ASYM_PACKING - describes SMT quirks
6384 */ 6435 */
6385#define TOPOLOGY_SD_FLAGS \ 6436#define TOPOLOGY_SD_FLAGS \
6386 (SD_SHARE_CPUCAPACITY | \ 6437 (SD_SHARE_CPUCAPACITY | \
6387 SD_SHARE_PKG_RESOURCES | \ 6438 SD_SHARE_PKG_RESOURCES | \
6388 SD_NUMA | \ 6439 SD_NUMA | \
6389 SD_ASYM_PACKING | \ 6440 SD_ASYM_PACKING | \
6441 SD_ASYM_CPUCAPACITY | \
6390 SD_SHARE_POWERDOMAIN) 6442 SD_SHARE_POWERDOMAIN)
6391 6443
6392static struct sched_domain * 6444static struct sched_domain *
6393sd_init(struct sched_domain_topology_level *tl, int cpu) 6445sd_init(struct sched_domain_topology_level *tl,
6446 const struct cpumask *cpu_map,
6447 struct sched_domain *child, int cpu)
6394{ 6448{
6395 struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); 6449 struct sd_data *sdd = &tl->data;
6396 int sd_weight, sd_flags = 0; 6450 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
6451 int sd_id, sd_weight, sd_flags = 0;
6397 6452
6398#ifdef CONFIG_NUMA 6453#ifdef CONFIG_NUMA
6399 /* 6454 /*
@@ -6442,15 +6497,26 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
6442 .smt_gain = 0, 6497 .smt_gain = 0,
6443 .max_newidle_lb_cost = 0, 6498 .max_newidle_lb_cost = 0,
6444 .next_decay_max_lb_cost = jiffies, 6499 .next_decay_max_lb_cost = jiffies,
6500 .child = child,
6445#ifdef CONFIG_SCHED_DEBUG 6501#ifdef CONFIG_SCHED_DEBUG
6446 .name = tl->name, 6502 .name = tl->name,
6447#endif 6503#endif
6448 }; 6504 };
6449 6505
6506 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6507 sd_id = cpumask_first(sched_domain_span(sd));
6508
6450 /* 6509 /*
6451 * Convert topological properties into behaviour. 6510 * Convert topological properties into behaviour.
6452 */ 6511 */
6453 6512
6513 if (sd->flags & SD_ASYM_CPUCAPACITY) {
6514 struct sched_domain *t = sd;
6515
6516 for_each_lower_domain(t)
6517 t->flags |= SD_BALANCE_WAKE;
6518 }
6519
6454 if (sd->flags & SD_SHARE_CPUCAPACITY) { 6520 if (sd->flags & SD_SHARE_CPUCAPACITY) {
6455 sd->flags |= SD_PREFER_SIBLING; 6521 sd->flags |= SD_PREFER_SIBLING;
6456 sd->imbalance_pct = 110; 6522 sd->imbalance_pct = 110;
@@ -6482,7 +6548,17 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
6482 sd->idle_idx = 1; 6548 sd->idle_idx = 1;
6483 } 6549 }
6484 6550
6485 sd->private = &tl->data; 6551 /*
6552 * For all levels sharing cache; connect a sched_domain_shared
6553 * instance.
6554 */
6555 if (sd->flags & SD_SHARE_PKG_RESOURCES) {
6556 sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
6557 atomic_inc(&sd->shared->ref);
6558 atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
6559 }
6560
6561 sd->private = sdd;
6486 6562
6487 return sd; 6563 return sd;
6488} 6564}
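sd_init() above attaches one sched_domain_shared instance to every domain of a cache-sharing level and takes an atomic reference on it; destroy_sched_domain() drops that reference and the last put frees the structure. A hedged sketch of the same get/put pattern using C11 atomics (domain_shared, shared_get and shared_put are invented names, not kernel APIs):

#include <stdatomic.h>
#include <stdlib.h>

struct domain_shared {
	atomic_int ref;
	atomic_int nr_busy_cpus;
};

static struct domain_shared *shared_alloc(void)
{
	struct domain_shared *sds = calloc(1, sizeof(*sds));

	atomic_store(&sds->ref, 1);	/* reference held by the first owner */
	return sds;
}

static void shared_get(struct domain_shared *sds)
{
	atomic_fetch_add(&sds->ref, 1);
}

static void shared_put(struct domain_shared *sds)
{
	/* fetch_sub returns the previous value: 1 means we were the last user. */
	if (atomic_fetch_sub(&sds->ref, 1) == 1)
		free(sds);
}

int main(void)
{
	struct domain_shared *sds = shared_alloc();

	shared_get(sds);	/* a second "domain" starts sharing it */
	shared_put(sds);	/* ... and later drops its reference   */
	shared_put(sds);	/* last put frees the structure        */
	return 0;
}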
@@ -6509,6 +6585,9 @@ static struct sched_domain_topology_level *sched_domain_topology =
6509 6585
6510void set_sched_topology(struct sched_domain_topology_level *tl) 6586void set_sched_topology(struct sched_domain_topology_level *tl)
6511{ 6587{
6588 if (WARN_ON_ONCE(sched_smp_initialized))
6589 return;
6590
6512 sched_domain_topology = tl; 6591 sched_domain_topology = tl;
6513} 6592}
6514 6593
@@ -6789,6 +6868,10 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
6789 if (!sdd->sd) 6868 if (!sdd->sd)
6790 return -ENOMEM; 6869 return -ENOMEM;
6791 6870
6871 sdd->sds = alloc_percpu(struct sched_domain_shared *);
6872 if (!sdd->sds)
6873 return -ENOMEM;
6874
6792 sdd->sg = alloc_percpu(struct sched_group *); 6875 sdd->sg = alloc_percpu(struct sched_group *);
6793 if (!sdd->sg) 6876 if (!sdd->sg)
6794 return -ENOMEM; 6877 return -ENOMEM;
@@ -6799,6 +6882,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
6799 6882
6800 for_each_cpu(j, cpu_map) { 6883 for_each_cpu(j, cpu_map) {
6801 struct sched_domain *sd; 6884 struct sched_domain *sd;
6885 struct sched_domain_shared *sds;
6802 struct sched_group *sg; 6886 struct sched_group *sg;
6803 struct sched_group_capacity *sgc; 6887 struct sched_group_capacity *sgc;
6804 6888
@@ -6809,6 +6893,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
6809 6893
6810 *per_cpu_ptr(sdd->sd, j) = sd; 6894 *per_cpu_ptr(sdd->sd, j) = sd;
6811 6895
6896 sds = kzalloc_node(sizeof(struct sched_domain_shared),
6897 GFP_KERNEL, cpu_to_node(j));
6898 if (!sds)
6899 return -ENOMEM;
6900
6901 *per_cpu_ptr(sdd->sds, j) = sds;
6902
6812 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 6903 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
6813 GFP_KERNEL, cpu_to_node(j)); 6904 GFP_KERNEL, cpu_to_node(j));
6814 if (!sg) 6905 if (!sg)
@@ -6848,6 +6939,8 @@ static void __sdt_free(const struct cpumask *cpu_map)
6848 kfree(*per_cpu_ptr(sdd->sd, j)); 6939 kfree(*per_cpu_ptr(sdd->sd, j));
6849 } 6940 }
6850 6941
6942 if (sdd->sds)
6943 kfree(*per_cpu_ptr(sdd->sds, j));
6851 if (sdd->sg) 6944 if (sdd->sg)
6852 kfree(*per_cpu_ptr(sdd->sg, j)); 6945 kfree(*per_cpu_ptr(sdd->sg, j));
6853 if (sdd->sgc) 6946 if (sdd->sgc)
@@ -6855,6 +6948,8 @@ static void __sdt_free(const struct cpumask *cpu_map)
6855 } 6948 }
6856 free_percpu(sdd->sd); 6949 free_percpu(sdd->sd);
6857 sdd->sd = NULL; 6950 sdd->sd = NULL;
6951 free_percpu(sdd->sds);
6952 sdd->sds = NULL;
6858 free_percpu(sdd->sg); 6953 free_percpu(sdd->sg);
6859 sdd->sg = NULL; 6954 sdd->sg = NULL;
6860 free_percpu(sdd->sgc); 6955 free_percpu(sdd->sgc);
@@ -6866,16 +6961,12 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
6866 const struct cpumask *cpu_map, struct sched_domain_attr *attr, 6961 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
6867 struct sched_domain *child, int cpu) 6962 struct sched_domain *child, int cpu)
6868{ 6963{
6869 struct sched_domain *sd = sd_init(tl, cpu); 6964 struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
6870 if (!sd)
6871 return child;
6872 6965
6873 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
6874 if (child) { 6966 if (child) {
6875 sd->level = child->level + 1; 6967 sd->level = child->level + 1;
6876 sched_domain_level_max = max(sched_domain_level_max, sd->level); 6968 sched_domain_level_max = max(sched_domain_level_max, sd->level);
6877 child->parent = sd; 6969 child->parent = sd;
6878 sd->child = child;
6879 6970
6880 if (!cpumask_subset(sched_domain_span(child), 6971 if (!cpumask_subset(sched_domain_span(child),
6881 sched_domain_span(sd))) { 6972 sched_domain_span(sd))) {
@@ -6906,6 +6997,7 @@ static int build_sched_domains(const struct cpumask *cpu_map,
6906 enum s_alloc alloc_state; 6997 enum s_alloc alloc_state;
6907 struct sched_domain *sd; 6998 struct sched_domain *sd;
6908 struct s_data d; 6999 struct s_data d;
7000 struct rq *rq = NULL;
6909 int i, ret = -ENOMEM; 7001 int i, ret = -ENOMEM;
6910 7002
6911 alloc_state = __visit_domain_allocation_hell(&d, cpu_map); 7003 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
@@ -6956,11 +7048,22 @@ static int build_sched_domains(const struct cpumask *cpu_map,
6956 /* Attach the domains */ 7048 /* Attach the domains */
6957 rcu_read_lock(); 7049 rcu_read_lock();
6958 for_each_cpu(i, cpu_map) { 7050 for_each_cpu(i, cpu_map) {
7051 rq = cpu_rq(i);
6959 sd = *per_cpu_ptr(d.sd, i); 7052 sd = *per_cpu_ptr(d.sd, i);
7053
7054 /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
7055 if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
7056 WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
7057
6960 cpu_attach_domain(sd, d.rd, i); 7058 cpu_attach_domain(sd, d.rd, i);
6961 } 7059 }
6962 rcu_read_unlock(); 7060 rcu_read_unlock();
6963 7061
7062 if (rq && sched_debug_enabled) {
7063 pr_info("span: %*pbl (max cpu_capacity = %lu)\n",
7064 cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
7065 }
7066
6964 ret = 0; 7067 ret = 0;
6965error: 7068error:
6966 __free_domain_allocs(&d, alloc_state, cpu_map); 7069 __free_domain_allocs(&d, alloc_state, cpu_map);
@@ -7319,6 +7422,22 @@ int sched_cpu_dying(unsigned int cpu)
7319} 7422}
7320#endif 7423#endif
7321 7424
7425#ifdef CONFIG_SCHED_SMT
7426DEFINE_STATIC_KEY_FALSE(sched_smt_present);
7427
7428static void sched_init_smt(void)
7429{
7430 /*
7431 * We've enumerated all CPUs and will assume that if any CPU
7432 * has SMT siblings, CPU0 will too.
7433 */
7434 if (cpumask_weight(cpu_smt_mask(0)) > 1)
7435 static_branch_enable(&sched_smt_present);
7436}
7437#else
7438static inline void sched_init_smt(void) { }
7439#endif
7440
7322void __init sched_init_smp(void) 7441void __init sched_init_smp(void)
7323{ 7442{
7324 cpumask_var_t non_isolated_cpus; 7443 cpumask_var_t non_isolated_cpus;
@@ -7348,6 +7467,9 @@ void __init sched_init_smp(void)
7348 7467
7349 init_sched_rt_class(); 7468 init_sched_rt_class();
7350 init_sched_dl_class(); 7469 init_sched_dl_class();
7470
7471 sched_init_smt();
7472
7351 sched_smp_initialized = true; 7473 sched_smp_initialized = true;
7352} 7474}
7353 7475
@@ -7385,12 +7507,29 @@ static struct kmem_cache *task_group_cache __read_mostly;
7385#endif 7507#endif
7386 7508
7387DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); 7509DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
7510DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
7511
7512#define WAIT_TABLE_BITS 8
7513#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
7514static wait_queue_head_t bit_wait_table[WAIT_TABLE_SIZE] __cacheline_aligned;
7515
7516wait_queue_head_t *bit_waitqueue(void *word, int bit)
7517{
7518 const int shift = BITS_PER_LONG == 32 ? 5 : 6;
7519 unsigned long val = (unsigned long)word << shift | bit;
7520
7521 return bit_wait_table + hash_long(val, WAIT_TABLE_BITS);
7522}
7523EXPORT_SYMBOL(bit_waitqueue);
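bit_waitqueue() above folds the word's address and the bit number into a single value and hashes it into one of 2^WAIT_TABLE_BITS wait-queue heads. The userspace approximation below shows only the bucket selection; hash_bucket() and the golden-ratio multiplier stand in for the kernel's hash_long(), and the shift of 6 assumes 64-bit longs.

#include <stdio.h>
#include <stdint.h>

#define TABLE_BITS 8	/* mirrors WAIT_TABLE_BITS: 256 buckets */

static unsigned int hash_bucket(const void *word, int bit)
{
	/* Mix the word address and bit number into one value, as above. */
	uint64_t val = ((uint64_t)(uintptr_t)word << 6) | (unsigned int)bit;

	/* Golden-ratio multiplicative hash; keep the top TABLE_BITS bits. */
	return (unsigned int)((val * 0x61C8864680B583EBULL) >> (64 - TABLE_BITS));
}

int main(void)
{
	long flags = 0;

	/* Different bits of the same word can land in different buckets. */
	printf("bit 0 -> bucket %u\n", hash_bucket(&flags, 0));
	printf("bit 5 -> bucket %u\n", hash_bucket(&flags, 5));
	return 0;
}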
7388 7524
7389void __init sched_init(void) 7525void __init sched_init(void)
7390{ 7526{
7391 int i, j; 7527 int i, j;
7392 unsigned long alloc_size = 0, ptr; 7528 unsigned long alloc_size = 0, ptr;
7393 7529
7530 for (i = 0; i < WAIT_TABLE_SIZE; i++)
7531 init_waitqueue_head(bit_wait_table + i);
7532
7394#ifdef CONFIG_FAIR_GROUP_SCHED 7533#ifdef CONFIG_FAIR_GROUP_SCHED
7395 alloc_size += 2 * nr_cpu_ids * sizeof(void **); 7534 alloc_size += 2 * nr_cpu_ids * sizeof(void **);
7396#endif 7535#endif
@@ -7421,6 +7560,8 @@ void __init sched_init(void)
7421 for_each_possible_cpu(i) { 7560 for_each_possible_cpu(i) {
7422 per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node( 7561 per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
7423 cpumask_size(), GFP_KERNEL, cpu_to_node(i)); 7562 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
7563 per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
7564 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
7424 } 7565 }
7425#endif /* CONFIG_CPUMASK_OFFSTACK */ 7566#endif /* CONFIG_CPUMASK_OFFSTACK */
7426 7567
@@ -7523,10 +7664,6 @@ void __init sched_init(void)
7523 7664
7524 set_load_weight(&init_task); 7665 set_load_weight(&init_task);
7525 7666
7526#ifdef CONFIG_PREEMPT_NOTIFIERS
7527 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
7528#endif
7529
7530 /* 7667 /*
7531 * The boot idle thread does lazy MMU switching as well: 7668 * The boot idle thread does lazy MMU switching as well:
7532 */ 7669 */
@@ -7534,11 +7671,6 @@ void __init sched_init(void)
7534 enter_lazy_tlb(&init_mm, current); 7671 enter_lazy_tlb(&init_mm, current);
7535 7672
7536 /* 7673 /*
7537 * During early bootup we pretend to be a normal task:
7538 */
7539 current->sched_class = &fair_sched_class;
7540
7541 /*
7542 * Make us the idle thread. Technically, schedule() should not be 7674 * Make us the idle thread. Technically, schedule() should not be
7543 * called from this thread, however somewhere below it might be, 7675 * called from this thread, however somewhere below it might be,
7544 * but because we are the idle thread, we just pick up running again 7676 * but because we are the idle thread, we just pick up running again
@@ -7592,6 +7724,7 @@ EXPORT_SYMBOL(__might_sleep);
7592void ___might_sleep(const char *file, int line, int preempt_offset) 7724void ___might_sleep(const char *file, int line, int preempt_offset)
7593{ 7725{
7594 static unsigned long prev_jiffy; /* ratelimiting */ 7726 static unsigned long prev_jiffy; /* ratelimiting */
7727 unsigned long preempt_disable_ip;
7595 7728
7596 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */ 7729 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
7597 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && 7730 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
@@ -7602,6 +7735,9 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
7602 return; 7735 return;
7603 prev_jiffy = jiffies; 7736 prev_jiffy = jiffies;
7604 7737
7738 /* Save this before calling printk(), since that will clobber it */
7739 preempt_disable_ip = get_preempt_disable_ip(current);
7740
7605 printk(KERN_ERR 7741 printk(KERN_ERR
7606 "BUG: sleeping function called from invalid context at %s:%d\n", 7742 "BUG: sleeping function called from invalid context at %s:%d\n",
7607 file, line); 7743 file, line);
@@ -7616,14 +7752,14 @@ void ___might_sleep(const char *file, int line, int preempt_offset)
7616 debug_show_held_locks(current); 7752 debug_show_held_locks(current);
7617 if (irqs_disabled()) 7753 if (irqs_disabled())
7618 print_irqtrace_events(current); 7754 print_irqtrace_events(current);
7619#ifdef CONFIG_DEBUG_PREEMPT 7755 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
7620 if (!preempt_count_equals(preempt_offset)) { 7756 && !preempt_count_equals(preempt_offset)) {
7621 pr_err("Preemption disabled at:"); 7757 pr_err("Preemption disabled at:");
7622 print_ip_sym(current->preempt_disable_ip); 7758 print_ip_sym(preempt_disable_ip);
7623 pr_cont("\n"); 7759 pr_cont("\n");
7624 } 7760 }
7625#endif
7626 dump_stack(); 7761 dump_stack();
7762 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
7627} 7763}
7628EXPORT_SYMBOL(___might_sleep); 7764EXPORT_SYMBOL(___might_sleep);
7629#endif 7765#endif
@@ -7644,12 +7780,10 @@ void normalize_rt_tasks(void)
7644 if (p->flags & PF_KTHREAD) 7780 if (p->flags & PF_KTHREAD)
7645 continue; 7781 continue;
7646 7782
7647 p->se.exec_start = 0; 7783 p->se.exec_start = 0;
7648#ifdef CONFIG_SCHEDSTATS 7784 schedstat_set(p->se.statistics.wait_start, 0);
7649 p->se.statistics.wait_start = 0; 7785 schedstat_set(p->se.statistics.sleep_start, 0);
7650 p->se.statistics.sleep_start = 0; 7786 schedstat_set(p->se.statistics.block_start, 0);
7651 p->se.statistics.block_start = 0;
7652#endif
7653 7787
7654 if (!dl_task(p) && !rt_task(p)) { 7788 if (!dl_task(p) && !rt_task(p)) {
7655 /* 7789 /*
@@ -7710,7 +7844,7 @@ struct task_struct *curr_task(int cpu)
7710 * 7844 *
7711 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 7845 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
7712 */ 7846 */
7713void set_curr_task(int cpu, struct task_struct *p) 7847void ia64_set_curr_task(int cpu, struct task_struct *p)
7714{ 7848{
7715 cpu_curr(cpu) = p; 7849 cpu_curr(cpu) = p;
7716} 7850}
@@ -7841,10 +7975,10 @@ void sched_move_task(struct task_struct *tsk)
7841 7975
7842 sched_change_group(tsk, TASK_MOVE_GROUP); 7976 sched_change_group(tsk, TASK_MOVE_GROUP);
7843 7977
7844 if (unlikely(running))
7845 tsk->sched_class->set_curr_task(rq);
7846 if (queued) 7978 if (queued)
7847 enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); 7979 enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
7980 if (unlikely(running))
7981 set_curr_task(rq, tsk);
7848 7982
7849 task_rq_unlock(rq, tsk, &rf); 7983 task_rq_unlock(rq, tsk, &rf);
7850} 7984}
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index d4184498c9f5..e73119013c53 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -31,56 +31,81 @@ static inline int right_child(int i)
31 return (i << 1) + 2; 31 return (i << 1) + 2;
32} 32}
33 33
34static void cpudl_exchange(struct cpudl *cp, int a, int b) 34static void cpudl_heapify_down(struct cpudl *cp, int idx)
35{ 35{
36 int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu; 36 int l, r, largest;
37 37
38 swap(cp->elements[a].cpu, cp->elements[b].cpu); 38 int orig_cpu = cp->elements[idx].cpu;
39 swap(cp->elements[a].dl , cp->elements[b].dl ); 39 u64 orig_dl = cp->elements[idx].dl;
40 40
41 swap(cp->elements[cpu_a].idx, cp->elements[cpu_b].idx); 41 if (left_child(idx) >= cp->size)
42} 42 return;
43
44static void cpudl_heapify(struct cpudl *cp, int idx)
45{
46 int l, r, largest;
47 43
48 /* adapted from lib/prio_heap.c */ 44 /* adapted from lib/prio_heap.c */
49 while(1) { 45 while(1) {
46 u64 largest_dl;
50 l = left_child(idx); 47 l = left_child(idx);
51 r = right_child(idx); 48 r = right_child(idx);
52 largest = idx; 49 largest = idx;
50 largest_dl = orig_dl;
53 51
54 if ((l < cp->size) && dl_time_before(cp->elements[idx].dl, 52 if ((l < cp->size) && dl_time_before(orig_dl,
55 cp->elements[l].dl)) 53 cp->elements[l].dl)) {
56 largest = l; 54 largest = l;
57 if ((r < cp->size) && dl_time_before(cp->elements[largest].dl, 55 largest_dl = cp->elements[l].dl;
58 cp->elements[r].dl)) 56 }
57 if ((r < cp->size) && dl_time_before(largest_dl,
58 cp->elements[r].dl))
59 largest = r; 59 largest = r;
60
60 if (largest == idx) 61 if (largest == idx)
61 break; 62 break;
62 63
63 /* Push idx down the heap one level and bump one up */ 64 /* pull largest child onto idx */
64 cpudl_exchange(cp, largest, idx); 65 cp->elements[idx].cpu = cp->elements[largest].cpu;
66 cp->elements[idx].dl = cp->elements[largest].dl;
67 cp->elements[cp->elements[idx].cpu].idx = idx;
65 idx = largest; 68 idx = largest;
66 } 69 }
70 /* actual push down of saved original values orig_* */
71 cp->elements[idx].cpu = orig_cpu;
72 cp->elements[idx].dl = orig_dl;
73 cp->elements[cp->elements[idx].cpu].idx = idx;
67} 74}
68 75
69static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl) 76static void cpudl_heapify_up(struct cpudl *cp, int idx)
70{ 77{
71 WARN_ON(idx == IDX_INVALID || !cpu_present(idx)); 78 int p;
72 79
73 if (dl_time_before(new_dl, cp->elements[idx].dl)) { 80 int orig_cpu = cp->elements[idx].cpu;
74 cp->elements[idx].dl = new_dl; 81 u64 orig_dl = cp->elements[idx].dl;
75 cpudl_heapify(cp, idx); 82
76 } else { 83 if (idx == 0)
77 cp->elements[idx].dl = new_dl; 84 return;
78 while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl, 85
79 cp->elements[idx].dl)) { 86 do {
80 cpudl_exchange(cp, idx, parent(idx)); 87 p = parent(idx);
81 idx = parent(idx); 88 if (dl_time_before(orig_dl, cp->elements[p].dl))
82 } 89 break;
83 } 90 /* pull parent onto idx */
91 cp->elements[idx].cpu = cp->elements[p].cpu;
92 cp->elements[idx].dl = cp->elements[p].dl;
93 cp->elements[cp->elements[idx].cpu].idx = idx;
94 idx = p;
95 } while (idx != 0);
96 /* actual push up of saved original values orig_* */
97 cp->elements[idx].cpu = orig_cpu;
98 cp->elements[idx].dl = orig_dl;
99 cp->elements[cp->elements[idx].cpu].idx = idx;
100}
101
102static void cpudl_heapify(struct cpudl *cp, int idx)
103{
104 if (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
105 cp->elements[idx].dl))
106 cpudl_heapify_up(cp, idx);
107 else
108 cpudl_heapify_down(cp, idx);
84} 109}
85 110
86static inline int cpudl_maximum(struct cpudl *cp) 111static inline int cpudl_maximum(struct cpudl *cp)
@@ -120,16 +145,15 @@ out:
120} 145}
121 146
122/* 147/*
123 * cpudl_set - update the cpudl max-heap 148 * cpudl_clear - remove a cpu from the cpudl max-heap
124 * @cp: the cpudl max-heap context 149 * @cp: the cpudl max-heap context
125 * @cpu: the target cpu 150 * @cpu: the target cpu
126 * @dl: the new earliest deadline for this cpu
127 * 151 *
128 * Notes: assumes cpu_rq(cpu)->lock is locked 152 * Notes: assumes cpu_rq(cpu)->lock is locked
129 * 153 *
130 * Returns: (void) 154 * Returns: (void)
131 */ 155 */
132void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid) 156void cpudl_clear(struct cpudl *cp, int cpu)
133{ 157{
134 int old_idx, new_cpu; 158 int old_idx, new_cpu;
135 unsigned long flags; 159 unsigned long flags;
@@ -137,47 +161,60 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
137 WARN_ON(!cpu_present(cpu)); 161 WARN_ON(!cpu_present(cpu));
138 162
139 raw_spin_lock_irqsave(&cp->lock, flags); 163 raw_spin_lock_irqsave(&cp->lock, flags);
164
140 old_idx = cp->elements[cpu].idx; 165 old_idx = cp->elements[cpu].idx;
141 if (!is_valid) { 166 if (old_idx == IDX_INVALID) {
142 /* remove item */ 167 /*
143 if (old_idx == IDX_INVALID) { 168 * Nothing to remove if old_idx was invalid.
144 /* 169 * This could happen if a rq_offline_dl is
145 * Nothing to remove if old_idx was invalid. 170 * called for a CPU without -dl tasks running.
146 * This could happen if a rq_offline_dl is 171 */
147 * called for a CPU without -dl tasks running. 172 } else {
148 */
149 goto out;
150 }
151 new_cpu = cp->elements[cp->size - 1].cpu; 173 new_cpu = cp->elements[cp->size - 1].cpu;
152 cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl; 174 cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
153 cp->elements[old_idx].cpu = new_cpu; 175 cp->elements[old_idx].cpu = new_cpu;
154 cp->size--; 176 cp->size--;
155 cp->elements[new_cpu].idx = old_idx; 177 cp->elements[new_cpu].idx = old_idx;
156 cp->elements[cpu].idx = IDX_INVALID; 178 cp->elements[cpu].idx = IDX_INVALID;
157 while (old_idx > 0 && dl_time_before( 179 cpudl_heapify(cp, old_idx);
158 cp->elements[parent(old_idx)].dl,
159 cp->elements[old_idx].dl)) {
160 cpudl_exchange(cp, old_idx, parent(old_idx));
161 old_idx = parent(old_idx);
162 }
163 cpumask_set_cpu(cpu, cp->free_cpus);
164 cpudl_heapify(cp, old_idx);
165 180
166 goto out; 181 cpumask_set_cpu(cpu, cp->free_cpus);
167 } 182 }
183 raw_spin_unlock_irqrestore(&cp->lock, flags);
184}
185
186/*
187 * cpudl_set - update the cpudl max-heap
188 * @cp: the cpudl max-heap context
189 * @cpu: the target cpu
190 * @dl: the new earliest deadline for this cpu
191 *
192 * Notes: assumes cpu_rq(cpu)->lock is locked
193 *
194 * Returns: (void)
195 */
196void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
197{
198 int old_idx;
199 unsigned long flags;
168 200
201 WARN_ON(!cpu_present(cpu));
202
203 raw_spin_lock_irqsave(&cp->lock, flags);
204
205 old_idx = cp->elements[cpu].idx;
169 if (old_idx == IDX_INVALID) { 206 if (old_idx == IDX_INVALID) {
170 cp->size++; 207 int new_idx = cp->size++;
171 cp->elements[cp->size - 1].dl = dl; 208 cp->elements[new_idx].dl = dl;
172 cp->elements[cp->size - 1].cpu = cpu; 209 cp->elements[new_idx].cpu = cpu;
173 cp->elements[cpu].idx = cp->size - 1; 210 cp->elements[cpu].idx = new_idx;
174 cpudl_change_key(cp, cp->size - 1, dl); 211 cpudl_heapify_up(cp, new_idx);
175 cpumask_clear_cpu(cpu, cp->free_cpus); 212 cpumask_clear_cpu(cpu, cp->free_cpus);
176 } else { 213 } else {
177 cpudl_change_key(cp, old_idx, dl); 214 cp->elements[old_idx].dl = dl;
215 cpudl_heapify(cp, old_idx);
178 } 216 }
179 217
180out:
181 raw_spin_unlock_irqrestore(&cp->lock, flags); 218 raw_spin_unlock_irqrestore(&cp->lock, flags);
182} 219}
183 220
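The cpudeadline.c rewrite above replaces the swap-based cpudl_exchange()/cpudl_heapify() pair with sift-up/sift-down routines that save the displaced element once, pull children (or parents) towards the hole, and write the saved element back with a single final store. A self-contained sketch of that sift-down technique on a plain array max-heap; heap, nr and sift_down are names made up for this example.

#include <stdio.h>
#include <stdint.h>

static void sift_down(uint64_t *heap, int nr, int idx)
{
	uint64_t orig = heap[idx];	/* save the displaced key once */

	while (1) {
		int l = 2 * idx + 1;
		int r = l + 1;
		int largest = idx;
		uint64_t largest_key = orig;

		if (l < nr && heap[l] > largest_key) {
			largest = l;
			largest_key = heap[l];
		}
		if (r < nr && heap[r] > largest_key)
			largest = r;
		if (largest == idx)
			break;

		heap[idx] = heap[largest];	/* pull the child up, no swap */
		idx = largest;
	}
	heap[idx] = orig;			/* single final store */
}

int main(void)
{
	uint64_t heap[] = { 3, 90, 80, 40, 50, 70, 60 };
	int nr = sizeof(heap) / sizeof(heap[0]);

	sift_down(heap, nr, 0);		/* restore the heap property at the root */
	for (int i = 0; i < nr; i++)
		printf("%llu ", (unsigned long long)heap[i]);
	printf("\n");
	return 0;
}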
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index fcbdf83fed7e..f7da8c55bba0 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -23,7 +23,8 @@ struct cpudl {
23#ifdef CONFIG_SMP 23#ifdef CONFIG_SMP
24int cpudl_find(struct cpudl *cp, struct task_struct *p, 24int cpudl_find(struct cpudl *cp, struct task_struct *p,
25 struct cpumask *later_mask); 25 struct cpumask *later_mask);
26void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid); 26void cpudl_set(struct cpudl *cp, int cpu, u64 dl);
27void cpudl_clear(struct cpudl *cp, int cpu);
27int cpudl_init(struct cpudl *cp); 28int cpudl_init(struct cpudl *cp);
28void cpudl_set_freecpu(struct cpudl *cp, int cpu); 29void cpudl_set_freecpu(struct cpudl *cp, int cpu);
29void cpudl_clear_freecpu(struct cpudl *cp, int cpu); 30void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index 1141954e73b4..dbc51442ecbc 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -33,7 +33,7 @@ DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
33 */ 33 */
34void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data, 34void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
35 void (*func)(struct update_util_data *data, u64 time, 35 void (*func)(struct update_util_data *data, u64 time,
36 unsigned long util, unsigned long max)) 36 unsigned int flags))
37{ 37{
38 if (WARN_ON(!data || !func)) 38 if (WARN_ON(!data || !func))
39 return; 39 return;
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index a84641b222c1..69e06898997d 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -12,7 +12,6 @@
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13 13
14#include <linux/cpufreq.h> 14#include <linux/cpufreq.h>
15#include <linux/module.h>
16#include <linux/slab.h> 15#include <linux/slab.h>
17#include <trace/events/power.h> 16#include <trace/events/power.h>
18 17
@@ -48,11 +47,14 @@ struct sugov_cpu {
48 struct sugov_policy *sg_policy; 47 struct sugov_policy *sg_policy;
49 48
50 unsigned int cached_raw_freq; 49 unsigned int cached_raw_freq;
50 unsigned long iowait_boost;
51 unsigned long iowait_boost_max;
52 u64 last_update;
51 53
52 /* The fields below are only needed when sharing a policy. */ 54 /* The fields below are only needed when sharing a policy. */
53 unsigned long util; 55 unsigned long util;
54 unsigned long max; 56 unsigned long max;
55 u64 last_update; 57 unsigned int flags;
56}; 58};
57 59
58static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); 60static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
@@ -144,24 +146,75 @@ static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util,
144 return cpufreq_driver_resolve_freq(policy, freq); 146 return cpufreq_driver_resolve_freq(policy, freq);
145} 147}
146 148
149static void sugov_get_util(unsigned long *util, unsigned long *max)
150{
151 struct rq *rq = this_rq();
152 unsigned long cfs_max;
153
154 cfs_max = arch_scale_cpu_capacity(NULL, smp_processor_id());
155
156 *util = min(rq->cfs.avg.util_avg, cfs_max);
157 *max = cfs_max;
158}
159
160static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
161 unsigned int flags)
162{
163 if (flags & SCHED_CPUFREQ_IOWAIT) {
164 sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
165 } else if (sg_cpu->iowait_boost) {
166 s64 delta_ns = time - sg_cpu->last_update;
167
 168 /* Clear iowait_boost if the CPU appears to have been idle. */
169 if (delta_ns > TICK_NSEC)
170 sg_cpu->iowait_boost = 0;
171 }
172}
173
174static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util,
175 unsigned long *max)
176{
177 unsigned long boost_util = sg_cpu->iowait_boost;
178 unsigned long boost_max = sg_cpu->iowait_boost_max;
179
180 if (!boost_util)
181 return;
182
183 if (*util * boost_max < *max * boost_util) {
184 *util = boost_util;
185 *max = boost_max;
186 }
187 sg_cpu->iowait_boost >>= 1;
188}
189
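sugov_set_iowait_boost() and sugov_iowait_boost() above implement a boost that jumps to iowait_boost_max when a wakeup carries SCHED_CPUFREQ_IOWAIT and is halved on each following update (and dropped entirely after a tick of idleness). The toy model below shows only the jump-and-halve behaviour; the util/max capacity scaling of the real comparison is deliberately omitted, and BOOST_MAX and apply_boost() are invented for this sketch.

#include <stdio.h>
#include <stdbool.h>

#define BOOST_MAX 1024UL

static unsigned long boost;

static unsigned long apply_boost(unsigned long util, bool iowait_wakeup)
{
	if (iowait_wakeup)
		boost = BOOST_MAX;	/* jump straight to the maximum */

	if (boost > util)
		util = boost;		/* the boost wins while it is larger */

	boost >>= 1;			/* decay by half on every update */
	return util;
}

int main(void)
{
	unsigned long util = 100;

	printf("update 0: %lu\n", apply_boost(util, true));
	for (int i = 1; i <= 5; i++)
		printf("update %d: %lu\n", i, apply_boost(util, false));
	return 0;	/* prints 1024, 512, 256, 128, 100, 100 */
}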
147static void sugov_update_single(struct update_util_data *hook, u64 time, 190static void sugov_update_single(struct update_util_data *hook, u64 time,
148 unsigned long util, unsigned long max) 191 unsigned int flags)
149{ 192{
150 struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); 193 struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
151 struct sugov_policy *sg_policy = sg_cpu->sg_policy; 194 struct sugov_policy *sg_policy = sg_cpu->sg_policy;
152 struct cpufreq_policy *policy = sg_policy->policy; 195 struct cpufreq_policy *policy = sg_policy->policy;
196 unsigned long util, max;
153 unsigned int next_f; 197 unsigned int next_f;
154 198
199 sugov_set_iowait_boost(sg_cpu, time, flags);
200 sg_cpu->last_update = time;
201
155 if (!sugov_should_update_freq(sg_policy, time)) 202 if (!sugov_should_update_freq(sg_policy, time))
156 return; 203 return;
157 204
158 next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq : 205 if (flags & SCHED_CPUFREQ_RT_DL) {
159 get_next_freq(sg_cpu, util, max); 206 next_f = policy->cpuinfo.max_freq;
207 } else {
208 sugov_get_util(&util, &max);
209 sugov_iowait_boost(sg_cpu, &util, &max);
210 next_f = get_next_freq(sg_cpu, util, max);
211 }
160 sugov_update_commit(sg_policy, time, next_f); 212 sugov_update_commit(sg_policy, time, next_f);
161} 213}
162 214
163static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, 215static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
164 unsigned long util, unsigned long max) 216 unsigned long util, unsigned long max,
217 unsigned int flags)
165{ 218{
166 struct sugov_policy *sg_policy = sg_cpu->sg_policy; 219 struct sugov_policy *sg_policy = sg_cpu->sg_policy;
167 struct cpufreq_policy *policy = sg_policy->policy; 220 struct cpufreq_policy *policy = sg_policy->policy;
@@ -169,9 +222,11 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
169 u64 last_freq_update_time = sg_policy->last_freq_update_time; 222 u64 last_freq_update_time = sg_policy->last_freq_update_time;
170 unsigned int j; 223 unsigned int j;
171 224
172 if (util == ULONG_MAX) 225 if (flags & SCHED_CPUFREQ_RT_DL)
173 return max_f; 226 return max_f;
174 227
228 sugov_iowait_boost(sg_cpu, &util, &max);
229
175 for_each_cpu(j, policy->cpus) { 230 for_each_cpu(j, policy->cpus) {
176 struct sugov_cpu *j_sg_cpu; 231 struct sugov_cpu *j_sg_cpu;
177 unsigned long j_util, j_max; 232 unsigned long j_util, j_max;
@@ -186,41 +241,50 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu,
186 * frequency update and the time elapsed between the last update 241 * frequency update and the time elapsed between the last update
187 * of the CPU utilization and the last frequency update is long 242 * of the CPU utilization and the last frequency update is long
188 * enough, don't take the CPU into account as it probably is 243 * enough, don't take the CPU into account as it probably is
189 * idle now. 244 * idle now (and clear iowait_boost for it).
190 */ 245 */
191 delta_ns = last_freq_update_time - j_sg_cpu->last_update; 246 delta_ns = last_freq_update_time - j_sg_cpu->last_update;
192 if (delta_ns > TICK_NSEC) 247 if (delta_ns > TICK_NSEC) {
248 j_sg_cpu->iowait_boost = 0;
193 continue; 249 continue;
194 250 }
195 j_util = j_sg_cpu->util; 251 if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
196 if (j_util == ULONG_MAX)
197 return max_f; 252 return max_f;
198 253
254 j_util = j_sg_cpu->util;
199 j_max = j_sg_cpu->max; 255 j_max = j_sg_cpu->max;
200 if (j_util * max > j_max * util) { 256 if (j_util * max > j_max * util) {
201 util = j_util; 257 util = j_util;
202 max = j_max; 258 max = j_max;
203 } 259 }
260
261 sugov_iowait_boost(j_sg_cpu, &util, &max);
204 } 262 }
205 263
206 return get_next_freq(sg_cpu, util, max); 264 return get_next_freq(sg_cpu, util, max);
207} 265}
208 266
209static void sugov_update_shared(struct update_util_data *hook, u64 time, 267static void sugov_update_shared(struct update_util_data *hook, u64 time,
210 unsigned long util, unsigned long max) 268 unsigned int flags)
211{ 269{
212 struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); 270 struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
213 struct sugov_policy *sg_policy = sg_cpu->sg_policy; 271 struct sugov_policy *sg_policy = sg_cpu->sg_policy;
272 unsigned long util, max;
214 unsigned int next_f; 273 unsigned int next_f;
215 274
275 sugov_get_util(&util, &max);
276
216 raw_spin_lock(&sg_policy->update_lock); 277 raw_spin_lock(&sg_policy->update_lock);
217 278
218 sg_cpu->util = util; 279 sg_cpu->util = util;
219 sg_cpu->max = max; 280 sg_cpu->max = max;
281 sg_cpu->flags = flags;
282
283 sugov_set_iowait_boost(sg_cpu, time, flags);
220 sg_cpu->last_update = time; 284 sg_cpu->last_update = time;
221 285
222 if (sugov_should_update_freq(sg_policy, time)) { 286 if (sugov_should_update_freq(sg_policy, time)) {
223 next_f = sugov_next_freq_shared(sg_cpu, util, max); 287 next_f = sugov_next_freq_shared(sg_cpu, util, max, flags);
224 sugov_update_commit(sg_policy, time, next_f); 288 sugov_update_commit(sg_policy, time, next_f);
225 } 289 }
226 290
@@ -444,10 +508,13 @@ static int sugov_start(struct cpufreq_policy *policy)
444 508
445 sg_cpu->sg_policy = sg_policy; 509 sg_cpu->sg_policy = sg_policy;
446 if (policy_is_shared(policy)) { 510 if (policy_is_shared(policy)) {
447 sg_cpu->util = ULONG_MAX; 511 sg_cpu->util = 0;
448 sg_cpu->max = 0; 512 sg_cpu->max = 0;
513 sg_cpu->flags = SCHED_CPUFREQ_RT;
449 sg_cpu->last_update = 0; 514 sg_cpu->last_update = 0;
450 sg_cpu->cached_raw_freq = 0; 515 sg_cpu->cached_raw_freq = 0;
516 sg_cpu->iowait_boost = 0;
517 sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
451 cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util, 518 cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
452 sugov_update_shared); 519 sugov_update_shared);
453 } else { 520 } else {
@@ -495,28 +562,15 @@ static struct cpufreq_governor schedutil_gov = {
495 .limits = sugov_limits, 562 .limits = sugov_limits,
496}; 563};
497 564
498static int __init sugov_module_init(void)
499{
500 return cpufreq_register_governor(&schedutil_gov);
501}
502
503static void __exit sugov_module_exit(void)
504{
505 cpufreq_unregister_governor(&schedutil_gov);
506}
507
508MODULE_AUTHOR("Rafael J. Wysocki <rafael.j.wysocki@intel.com>");
509MODULE_DESCRIPTION("Utilization-based CPU frequency selection");
510MODULE_LICENSE("GPL");
511
512#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL 565#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
513struct cpufreq_governor *cpufreq_default_governor(void) 566struct cpufreq_governor *cpufreq_default_governor(void)
514{ 567{
515 return &schedutil_gov; 568 return &schedutil_gov;
516} 569}
517
518fs_initcall(sugov_module_init);
519#else
520module_init(sugov_module_init);
521#endif 570#endif
522module_exit(sugov_module_exit); 571
572static int __init sugov_register(void)
573{
574 return cpufreq_register_governor(&schedutil_gov);
575}
576fs_initcall(sugov_register);
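
The iowait boost added to cpufreq_schedutil.c above is easier to see in isolation. Below is a minimal userspace sketch of the policy, not the kernel code: the field names mirror the patch, but the tick length, flag value and boost maximum are stand-ins chosen only for the example.

#include <stdint.h>
#include <stdio.h>

#define TICK_NSEC_SKETCH		4000000ULL	/* 250 HZ tick, illustration only */
#define SCHED_CPUFREQ_IOWAIT_SKETCH	0x1		/* stand-in flag value */

struct sg_cpu_sketch {
	unsigned long iowait_boost;
	unsigned long iowait_boost_max;
	uint64_t last_update;
};

/* A wakeup from I/O wait jumps the boost to its maximum; a CPU that looks
 * idle for more than a tick loses the boost entirely. */
static void set_iowait_boost(struct sg_cpu_sketch *sg, uint64_t time,
			     unsigned int flags)
{
	if (flags & SCHED_CPUFREQ_IOWAIT_SKETCH)
		sg->iowait_boost = sg->iowait_boost_max;
	else if (sg->iowait_boost && time - sg->last_update > TICK_NSEC_SKETCH)
		sg->iowait_boost = 0;
	sg->last_update = time;
}

/* The boost is only taken when it implies a higher util/max ratio than the
 * measured utilization, and it decays by half on every use. */
static void apply_iowait_boost(struct sg_cpu_sketch *sg,
			       unsigned long *util, unsigned long *max)
{
	unsigned long boost_util = sg->iowait_boost;
	unsigned long boost_max = sg->iowait_boost_max;

	if (!boost_util)
		return;
	if (*util * boost_max < *max * boost_util) {
		*util = boost_util;
		*max = boost_max;
	}
	sg->iowait_boost >>= 1;
}

int main(void)
{
	struct sg_cpu_sketch sg = { .iowait_boost_max = 1024 };
	unsigned long util = 200, max = 1024;

	set_iowait_boost(&sg, 1000, SCHED_CPUFREQ_IOWAIT_SKETCH);
	apply_iowait_boost(&sg, &util, &max);
	printf("util=%lu max=%lu next_boost=%lu\n", util, max, sg.iowait_boost);
	return 0;
}

In the shared-policy path above the same boost helper runs for each CPU inside the loop, so a recently boosted sibling can still lift the frequency selected for the whole policy.
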
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index a846cf89eb96..5ebee3164e64 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -23,10 +23,8 @@
23 * task when irq is in progress while we read rq->clock. That is a worthy 23 * task when irq is in progress while we read rq->clock. That is a worthy
24 * compromise in place of having locks on each irq in account_system_time. 24 * compromise in place of having locks on each irq in account_system_time.
25 */ 25 */
26DEFINE_PER_CPU(u64, cpu_hardirq_time); 26DEFINE_PER_CPU(struct irqtime, cpu_irqtime);
27DEFINE_PER_CPU(u64, cpu_softirq_time);
28 27
29static DEFINE_PER_CPU(u64, irq_start_time);
30static int sched_clock_irqtime; 28static int sched_clock_irqtime;
31 29
32void enable_sched_clock_irqtime(void) 30void enable_sched_clock_irqtime(void)
@@ -39,16 +37,13 @@ void disable_sched_clock_irqtime(void)
39 sched_clock_irqtime = 0; 37 sched_clock_irqtime = 0;
40} 38}
41 39
42#ifndef CONFIG_64BIT
43DEFINE_PER_CPU(seqcount_t, irq_time_seq);
44#endif /* CONFIG_64BIT */
45
46/* 40/*
47 * Called before incrementing preempt_count on {soft,}irq_enter 41 * Called before incrementing preempt_count on {soft,}irq_enter
48 * and before decrementing preempt_count on {soft,}irq_exit. 42 * and before decrementing preempt_count on {soft,}irq_exit.
49 */ 43 */
50void irqtime_account_irq(struct task_struct *curr) 44void irqtime_account_irq(struct task_struct *curr)
51{ 45{
46 struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
52 s64 delta; 47 s64 delta;
53 int cpu; 48 int cpu;
54 49
@@ -56,10 +51,10 @@ void irqtime_account_irq(struct task_struct *curr)
56 return; 51 return;
57 52
58 cpu = smp_processor_id(); 53 cpu = smp_processor_id();
59 delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time); 54 delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
60 __this_cpu_add(irq_start_time, delta); 55 irqtime->irq_start_time += delta;
61 56
62 irq_time_write_begin(); 57 u64_stats_update_begin(&irqtime->sync);
63 /* 58 /*
64 * We do not account for softirq time from ksoftirqd here. 59 * We do not account for softirq time from ksoftirqd here.
65 * We want to continue accounting softirq time to ksoftirqd thread 60 * We want to continue accounting softirq time to ksoftirqd thread
@@ -67,42 +62,36 @@ void irqtime_account_irq(struct task_struct *curr)
67 * that do not consume any time, but still wants to run. 62 * that do not consume any time, but still wants to run.
68 */ 63 */
69 if (hardirq_count()) 64 if (hardirq_count())
70 __this_cpu_add(cpu_hardirq_time, delta); 65 irqtime->hardirq_time += delta;
71 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) 66 else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
72 __this_cpu_add(cpu_softirq_time, delta); 67 irqtime->softirq_time += delta;
73 68
74 irq_time_write_end(); 69 u64_stats_update_end(&irqtime->sync);
75} 70}
76EXPORT_SYMBOL_GPL(irqtime_account_irq); 71EXPORT_SYMBOL_GPL(irqtime_account_irq);
77 72
78static cputime_t irqtime_account_hi_update(cputime_t maxtime) 73static cputime_t irqtime_account_update(u64 irqtime, int idx, cputime_t maxtime)
79{ 74{
80 u64 *cpustat = kcpustat_this_cpu->cpustat; 75 u64 *cpustat = kcpustat_this_cpu->cpustat;
81 unsigned long flags;
82 cputime_t irq_cputime; 76 cputime_t irq_cputime;
83 77
84 local_irq_save(flags); 78 irq_cputime = nsecs_to_cputime64(irqtime) - cpustat[idx];
85 irq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_hardirq_time)) -
86 cpustat[CPUTIME_IRQ];
87 irq_cputime = min(irq_cputime, maxtime); 79 irq_cputime = min(irq_cputime, maxtime);
88 cpustat[CPUTIME_IRQ] += irq_cputime; 80 cpustat[idx] += irq_cputime;
89 local_irq_restore(flags); 81
90 return irq_cputime; 82 return irq_cputime;
91} 83}
92 84
93static cputime_t irqtime_account_si_update(cputime_t maxtime) 85static cputime_t irqtime_account_hi_update(cputime_t maxtime)
94{ 86{
95 u64 *cpustat = kcpustat_this_cpu->cpustat; 87 return irqtime_account_update(__this_cpu_read(cpu_irqtime.hardirq_time),
96 unsigned long flags; 88 CPUTIME_IRQ, maxtime);
97 cputime_t softirq_cputime; 89}
98 90
99 local_irq_save(flags); 91static cputime_t irqtime_account_si_update(cputime_t maxtime)
100 softirq_cputime = nsecs_to_cputime64(this_cpu_read(cpu_softirq_time)) - 92{
101 cpustat[CPUTIME_SOFTIRQ]; 93 return irqtime_account_update(__this_cpu_read(cpu_irqtime.softirq_time),
102 softirq_cputime = min(softirq_cputime, maxtime); 94 CPUTIME_SOFTIRQ, maxtime);
103 cpustat[CPUTIME_SOFTIRQ] += softirq_cputime;
104 local_irq_restore(flags);
105 return softirq_cputime;
106} 95}
107 96
108#else /* CONFIG_IRQ_TIME_ACCOUNTING */ 97#else /* CONFIG_IRQ_TIME_ACCOUNTING */
@@ -295,6 +284,9 @@ static inline cputime_t account_other_time(cputime_t max)
295{ 284{
296 cputime_t accounted; 285 cputime_t accounted;
297 286
287 /* Shall be converted to a lockdep-enabled lightweight check */
288 WARN_ON_ONCE(!irqs_disabled());
289
298 accounted = steal_account_process_time(max); 290 accounted = steal_account_process_time(max);
299 291
300 if (accounted < max) 292 if (accounted < max)
@@ -306,6 +298,26 @@ static inline cputime_t account_other_time(cputime_t max)
306 return accounted; 298 return accounted;
307} 299}
308 300
301#ifdef CONFIG_64BIT
302static inline u64 read_sum_exec_runtime(struct task_struct *t)
303{
304 return t->se.sum_exec_runtime;
305}
306#else
307static u64 read_sum_exec_runtime(struct task_struct *t)
308{
309 u64 ns;
310 struct rq_flags rf;
311 struct rq *rq;
312
313 rq = task_rq_lock(t, &rf);
314 ns = t->se.sum_exec_runtime;
315 task_rq_unlock(rq, t, &rf);
316
317 return ns;
318}
319#endif
320
309/* 321/*
310 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live 322 * Accumulate raw cputime values of dead tasks (sig->[us]time) and live
311 * tasks (sum on group iteration) belonging to @tsk's group. 323 * tasks (sum on group iteration) belonging to @tsk's group.
@@ -318,6 +330,17 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
318 unsigned int seq, nextseq; 330 unsigned int seq, nextseq;
319 unsigned long flags; 331 unsigned long flags;
320 332
333 /*
334 * Update current task runtime to account pending time since last
335 * scheduler action or thread_group_cputime() call. This thread group
336 * might have other running tasks on different CPUs, but updating
337 * their runtime can affect syscall performance, so we skip accounting
338 * those pending times and rely only on values updated on tick or
339 * other scheduler action.
340 */
341 if (same_thread_group(current, tsk))
342 (void) task_sched_runtime(current);
343
321 rcu_read_lock(); 344 rcu_read_lock();
322 /* Attempt a lockless read on the first round. */ 345 /* Attempt a lockless read on the first round. */
323 nextseq = 0; 346 nextseq = 0;
@@ -332,7 +355,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
332 task_cputime(t, &utime, &stime); 355 task_cputime(t, &utime, &stime);
333 times->utime += utime; 356 times->utime += utime;
334 times->stime += stime; 357 times->stime += stime;
335 times->sum_exec_runtime += task_sched_runtime(t); 358 times->sum_exec_runtime += read_sum_exec_runtime(t);
336 } 359 }
337 /* If lockless access failed, take the lock. */ 360 /* If lockless access failed, take the lock. */
338 nextseq = 1; 361 nextseq = 1;
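
Two of the cputime.c changes above lean on the same idea: a 64-bit counter cannot be read atomically on a 32-bit machine, so either the reader retries around a sequence counter (what u64_stats_update_begin()/end() provide for the new struct irqtime) or it takes a lock (what the !CONFIG_64BIT read_sum_exec_runtime() does via task_rq_lock()). A bare-bones userspace sketch of the retry pattern; the memory barriers the real u64_stats helpers supply are omitted here for brevity.

#include <stdint.h>

struct irqtime_sketch {
	uint64_t hardirq_time;
	uint64_t softirq_time;
	unsigned int seq;	/* stands in for u64_stats_sync */
};

/* Writer: keep the sequence odd while the 64-bit fields are in flux. */
void irqtime_add(struct irqtime_sketch *it, uint64_t delta, int hardirq)
{
	it->seq++;				/* odd: update in progress */
	if (hardirq)
		it->hardirq_time += delta;
	else
		it->softirq_time += delta;
	it->seq++;				/* even: update complete */
}

/* Reader: retry until a stable, even sequence brackets the load. */
uint64_t irqtime_read(const struct irqtime_sketch *it, int hardirq)
{
	unsigned int start;
	uint64_t val;

	do {
		start = it->seq;
		val = hardirq ? it->hardirq_time : it->softirq_time;
	} while ((start & 1) || it->seq != start);

	return val;
}

The local_irq_save()/restore() pairs in the old hi/si update helpers could be dropped because their callers already run with interrupts disabled, which is exactly what the new WARN_ON_ONCE(!irqs_disabled()) in account_other_time() asserts.
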
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 1ce8867283dc..37e2449186c4 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -243,10 +243,8 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq);
243static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p) 243static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p)
244{ 244{
245 struct rq *later_rq = NULL; 245 struct rq *later_rq = NULL;
246 bool fallback = false;
247 246
248 later_rq = find_lock_later_rq(p, rq); 247 later_rq = find_lock_later_rq(p, rq);
249
250 if (!later_rq) { 248 if (!later_rq) {
251 int cpu; 249 int cpu;
252 250
@@ -254,7 +252,6 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
254 * If we cannot preempt any rq, fall back to pick any 252 * If we cannot preempt any rq, fall back to pick any
255 * online cpu. 253 * online cpu.
256 */ 254 */
257 fallback = true;
258 cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p)); 255 cpu = cpumask_any_and(cpu_active_mask, tsk_cpus_allowed(p));
259 if (cpu >= nr_cpu_ids) { 256 if (cpu >= nr_cpu_ids) {
260 /* 257 /*
@@ -274,16 +271,7 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
274 double_lock_balance(rq, later_rq); 271 double_lock_balance(rq, later_rq);
275 } 272 }
276 273
277 /*
278 * By now the task is replenished and enqueued; migrate it.
279 */
280 deactivate_task(rq, p, 0);
281 set_task_cpu(p, later_rq->cpu); 274 set_task_cpu(p, later_rq->cpu);
282 activate_task(later_rq, p, 0);
283
284 if (!fallback)
285 resched_curr(later_rq);
286
287 double_unlock_balance(later_rq, rq); 275 double_unlock_balance(later_rq, rq);
288 276
289 return later_rq; 277 return later_rq;
@@ -346,12 +334,12 @@ static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
346 * one, and to (try to!) reconcile itself with its own scheduling 334 * one, and to (try to!) reconcile itself with its own scheduling
347 * parameters. 335 * parameters.
348 */ 336 */
349static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, 337static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se)
350 struct sched_dl_entity *pi_se)
351{ 338{
352 struct dl_rq *dl_rq = dl_rq_of_se(dl_se); 339 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
353 struct rq *rq = rq_of_dl_rq(dl_rq); 340 struct rq *rq = rq_of_dl_rq(dl_rq);
354 341
342 WARN_ON(dl_se->dl_boosted);
355 WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); 343 WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
356 344
357 /* 345 /*
@@ -367,8 +355,8 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
367 * future; in fact, we must consider execution overheads (time 355 * future; in fact, we must consider execution overheads (time
368 * spent on hardirq context, etc.). 356 * spent on hardirq context, etc.).
369 */ 357 */
370 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; 358 dl_se->deadline = rq_clock(rq) + dl_se->dl_deadline;
371 dl_se->runtime = pi_se->dl_runtime; 359 dl_se->runtime = dl_se->dl_runtime;
372} 360}
373 361
374/* 362/*
@@ -641,29 +629,31 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
641 goto unlock; 629 goto unlock;
642 } 630 }
643 631
644 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
645 if (dl_task(rq->curr))
646 check_preempt_curr_dl(rq, p, 0);
647 else
648 resched_curr(rq);
649
650#ifdef CONFIG_SMP 632#ifdef CONFIG_SMP
651 /*
652 * Perform balancing operations here; after the replenishments. We
653 * cannot drop rq->lock before this, otherwise the assertion in
654 * start_dl_timer() about not missing updates is not true.
655 *
656 * If we find that the rq the task was on is no longer available, we
657 * need to select a new rq.
658 *
659 * XXX figure out if select_task_rq_dl() deals with offline cpus.
660 */
661 if (unlikely(!rq->online)) { 633 if (unlikely(!rq->online)) {
634 /*
635 * If the runqueue is no longer available, migrate the
636 * task elsewhere. This necessarily changes rq.
637 */
662 lockdep_unpin_lock(&rq->lock, rf.cookie); 638 lockdep_unpin_lock(&rq->lock, rf.cookie);
663 rq = dl_task_offline_migration(rq, p); 639 rq = dl_task_offline_migration(rq, p);
664 rf.cookie = lockdep_pin_lock(&rq->lock); 640 rf.cookie = lockdep_pin_lock(&rq->lock);
641
642 /*
643 * Now that the task has been migrated to the new RQ and we
644 * have that locked, proceed as normal and enqueue the task
645 * there.
646 */
665 } 647 }
648#endif
649
650 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
651 if (dl_task(rq->curr))
652 check_preempt_curr_dl(rq, p, 0);
653 else
654 resched_curr(rq);
666 655
656#ifdef CONFIG_SMP
667 /* 657 /*
668 * Queueing this task back might have overloaded rq, check if we need 658 * Queueing this task back might have overloaded rq, check if we need
669 * to kick someone away. 659 * to kick someone away.
@@ -735,9 +725,8 @@ static void update_curr_dl(struct rq *rq)
735 return; 725 return;
736 } 726 }
737 727
738 /* kick cpufreq (see the comment in linux/cpufreq.h). */ 728 /* kick cpufreq (see the comment in kernel/sched/sched.h). */
739 if (cpu_of(rq) == smp_processor_id()) 729 cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_DL);
740 cpufreq_trigger_update(rq_clock(rq));
741 730
742 schedstat_set(curr->se.statistics.exec_max, 731 schedstat_set(curr->se.statistics.exec_max,
743 max(curr->se.statistics.exec_max, delta_exec)); 732 max(curr->se.statistics.exec_max, delta_exec));
@@ -798,7 +787,7 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
798 if (dl_rq->earliest_dl.curr == 0 || 787 if (dl_rq->earliest_dl.curr == 0 ||
799 dl_time_before(deadline, dl_rq->earliest_dl.curr)) { 788 dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
800 dl_rq->earliest_dl.curr = deadline; 789 dl_rq->earliest_dl.curr = deadline;
801 cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1); 790 cpudl_set(&rq->rd->cpudl, rq->cpu, deadline);
802 } 791 }
803} 792}
804 793
@@ -813,14 +802,14 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
813 if (!dl_rq->dl_nr_running) { 802 if (!dl_rq->dl_nr_running) {
814 dl_rq->earliest_dl.curr = 0; 803 dl_rq->earliest_dl.curr = 0;
815 dl_rq->earliest_dl.next = 0; 804 dl_rq->earliest_dl.next = 0;
816 cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); 805 cpudl_clear(&rq->rd->cpudl, rq->cpu);
817 } else { 806 } else {
818 struct rb_node *leftmost = dl_rq->rb_leftmost; 807 struct rb_node *leftmost = dl_rq->rb_leftmost;
819 struct sched_dl_entity *entry; 808 struct sched_dl_entity *entry;
820 809
821 entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); 810 entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
822 dl_rq->earliest_dl.curr = entry->deadline; 811 dl_rq->earliest_dl.curr = entry->deadline;
823 cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1); 812 cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline);
824 } 813 }
825} 814}
826 815
@@ -1671,7 +1660,7 @@ static void rq_online_dl(struct rq *rq)
1671 1660
1672 cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu); 1661 cpudl_set_freecpu(&rq->rd->cpudl, rq->cpu);
1673 if (rq->dl.dl_nr_running > 0) 1662 if (rq->dl.dl_nr_running > 0)
1674 cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1); 1663 cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr);
1675} 1664}
1676 1665
1677/* Assumes rq->lock is held */ 1666/* Assumes rq->lock is held */
@@ -1680,7 +1669,7 @@ static void rq_offline_dl(struct rq *rq)
1680 if (rq->dl.overloaded) 1669 if (rq->dl.overloaded)
1681 dl_clear_overload(rq); 1670 dl_clear_overload(rq);
1682 1671
1683 cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0); 1672 cpudl_clear(&rq->rd->cpudl, rq->cpu);
1684 cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu); 1673 cpudl_clear_freecpu(&rq->rd->cpudl, rq->cpu);
1685} 1674}
1686 1675
@@ -1723,10 +1712,20 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
1723 */ 1712 */
1724static void switched_to_dl(struct rq *rq, struct task_struct *p) 1713static void switched_to_dl(struct rq *rq, struct task_struct *p)
1725{ 1714{
1715
1716 /* If p is not queued we will update its parameters at next wakeup. */
1717 if (!task_on_rq_queued(p))
1718 return;
1719
1720 /*
1721 * If p is boosted we already updated its params in
1722 * rt_mutex_setprio()->enqueue_task(..., ENQUEUE_REPLENISH),
1723 * p's deadline being now already after rq_clock(rq).
1724 */
1726 if (dl_time_before(p->dl.deadline, rq_clock(rq))) 1725 if (dl_time_before(p->dl.deadline, rq_clock(rq)))
1727 setup_new_dl_entity(&p->dl, &p->dl); 1726 setup_new_dl_entity(&p->dl);
1728 1727
1729 if (task_on_rq_queued(p) && rq->curr != p) { 1728 if (rq->curr != p) {
1730#ifdef CONFIG_SMP 1729#ifdef CONFIG_SMP
1731 if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded) 1730 if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded)
1732 queue_push_tasks(rq); 1731 queue_push_tasks(rq);
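
The deadline.c hunks above repeatedly use dl_time_before() and now re-arm a task from its own dl_deadline/dl_runtime rather than from a priority-inheritance donor. A small standalone sketch of those two pieces, illustrative only, with the structure trimmed to the fields the logic needs:

#include <stdint.h>
#include <stdbool.h>

struct dl_sketch {
	uint64_t deadline;	/* absolute deadline, ns */
	uint64_t runtime;	/* remaining budget, ns */
	uint64_t dl_deadline;	/* relative deadline parameter */
	uint64_t dl_runtime;	/* per-period budget parameter */
};

/* Wrap-safe "a is before b", the same trick dl_time_before() uses. */
bool dl_before(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

/* Re-arm a stale entity from its own parameters, relative to now. */
void setup_new_dl_sketch(struct dl_sketch *dl, uint64_t now)
{
	dl->deadline = now + dl->dl_deadline;
	dl->runtime = dl->dl_runtime;
}

/* The switched_to_dl() policy above: only touch parameters that are stale. */
void switched_to_dl_sketch(struct dl_sketch *dl, uint64_t now)
{
	if (dl_before(dl->deadline, now))
		setup_new_dl_sketch(dl, now);
}

The boosted case never reaches this point, which is what the new WARN_ON(dl_se->dl_boosted) in setup_new_dl_entity() documents.
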
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2a0a9995256d..fa178b62ea79 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -369,8 +369,12 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
369 369
370#define P(F) \ 370#define P(F) \
371 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) 371 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
372#define P_SCHEDSTAT(F) \
373 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F))
372#define PN(F) \ 374#define PN(F) \
373 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) 375 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
376#define PN_SCHEDSTAT(F) \
377 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
374 378
375 if (!se) 379 if (!se)
376 return; 380 return;
@@ -378,26 +382,27 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
378 PN(se->exec_start); 382 PN(se->exec_start);
379 PN(se->vruntime); 383 PN(se->vruntime);
380 PN(se->sum_exec_runtime); 384 PN(se->sum_exec_runtime);
381#ifdef CONFIG_SCHEDSTATS
382 if (schedstat_enabled()) { 385 if (schedstat_enabled()) {
383 PN(se->statistics.wait_start); 386 PN_SCHEDSTAT(se->statistics.wait_start);
384 PN(se->statistics.sleep_start); 387 PN_SCHEDSTAT(se->statistics.sleep_start);
385 PN(se->statistics.block_start); 388 PN_SCHEDSTAT(se->statistics.block_start);
386 PN(se->statistics.sleep_max); 389 PN_SCHEDSTAT(se->statistics.sleep_max);
387 PN(se->statistics.block_max); 390 PN_SCHEDSTAT(se->statistics.block_max);
388 PN(se->statistics.exec_max); 391 PN_SCHEDSTAT(se->statistics.exec_max);
389 PN(se->statistics.slice_max); 392 PN_SCHEDSTAT(se->statistics.slice_max);
390 PN(se->statistics.wait_max); 393 PN_SCHEDSTAT(se->statistics.wait_max);
391 PN(se->statistics.wait_sum); 394 PN_SCHEDSTAT(se->statistics.wait_sum);
392 P(se->statistics.wait_count); 395 P_SCHEDSTAT(se->statistics.wait_count);
393 } 396 }
394#endif
395 P(se->load.weight); 397 P(se->load.weight);
396#ifdef CONFIG_SMP 398#ifdef CONFIG_SMP
397 P(se->avg.load_avg); 399 P(se->avg.load_avg);
398 P(se->avg.util_avg); 400 P(se->avg.util_avg);
399#endif 401#endif
402
403#undef PN_SCHEDSTAT
400#undef PN 404#undef PN
405#undef P_SCHEDSTAT
401#undef P 406#undef P
402} 407}
403#endif 408#endif
@@ -410,7 +415,8 @@ static char *task_group_path(struct task_group *tg)
410 if (autogroup_path(tg, group_path, PATH_MAX)) 415 if (autogroup_path(tg, group_path, PATH_MAX))
411 return group_path; 416 return group_path;
412 417
413 return cgroup_path(tg->css.cgroup, group_path, PATH_MAX); 418 cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
419 return group_path;
414} 420}
415#endif 421#endif
416 422
@@ -429,9 +435,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
429 p->prio); 435 p->prio);
430 436
431 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", 437 SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
432 SPLIT_NS(schedstat_val(p, se.statistics.wait_sum)), 438 SPLIT_NS(schedstat_val_or_zero(p->se.statistics.wait_sum)),
433 SPLIT_NS(p->se.sum_exec_runtime), 439 SPLIT_NS(p->se.sum_exec_runtime),
434 SPLIT_NS(schedstat_val(p, se.statistics.sum_sleep_runtime))); 440 SPLIT_NS(schedstat_val_or_zero(p->se.statistics.sum_sleep_runtime)));
435 441
436#ifdef CONFIG_NUMA_BALANCING 442#ifdef CONFIG_NUMA_BALANCING
437 SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); 443 SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p));
@@ -626,9 +632,7 @@ do { \
626#undef P64 632#undef P64
627#endif 633#endif
628 634
629#ifdef CONFIG_SCHEDSTATS 635#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, schedstat_val(rq->n));
630#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
631
632 if (schedstat_enabled()) { 636 if (schedstat_enabled()) {
633 P(yld_count); 637 P(yld_count);
634 P(sched_count); 638 P(sched_count);
@@ -636,9 +640,8 @@ do { \
636 P(ttwu_count); 640 P(ttwu_count);
637 P(ttwu_local); 641 P(ttwu_local);
638 } 642 }
639
640#undef P 643#undef P
641#endif 644
642 spin_lock_irqsave(&sched_debug_lock, flags); 645 spin_lock_irqsave(&sched_debug_lock, flags);
643 print_cfs_stats(m, cpu); 646 print_cfs_stats(m, cpu);
644 print_rt_stats(m, cpu); 647 print_rt_stats(m, cpu);
@@ -868,10 +871,14 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
868 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) 871 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
869#define P(F) \ 872#define P(F) \
870 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) 873 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
874#define P_SCHEDSTAT(F) \
875 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)schedstat_val(p->F))
871#define __PN(F) \ 876#define __PN(F) \
872 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) 877 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
873#define PN(F) \ 878#define PN(F) \
874 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) 879 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
880#define PN_SCHEDSTAT(F) \
881 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(p->F)))
875 882
876 PN(se.exec_start); 883 PN(se.exec_start);
877 PN(se.vruntime); 884 PN(se.vruntime);
@@ -881,37 +888,36 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
881 888
882 P(se.nr_migrations); 889 P(se.nr_migrations);
883 890
884#ifdef CONFIG_SCHEDSTATS
885 if (schedstat_enabled()) { 891 if (schedstat_enabled()) {
886 u64 avg_atom, avg_per_cpu; 892 u64 avg_atom, avg_per_cpu;
887 893
888 PN(se.statistics.sum_sleep_runtime); 894 PN_SCHEDSTAT(se.statistics.sum_sleep_runtime);
889 PN(se.statistics.wait_start); 895 PN_SCHEDSTAT(se.statistics.wait_start);
890 PN(se.statistics.sleep_start); 896 PN_SCHEDSTAT(se.statistics.sleep_start);
891 PN(se.statistics.block_start); 897 PN_SCHEDSTAT(se.statistics.block_start);
892 PN(se.statistics.sleep_max); 898 PN_SCHEDSTAT(se.statistics.sleep_max);
893 PN(se.statistics.block_max); 899 PN_SCHEDSTAT(se.statistics.block_max);
894 PN(se.statistics.exec_max); 900 PN_SCHEDSTAT(se.statistics.exec_max);
895 PN(se.statistics.slice_max); 901 PN_SCHEDSTAT(se.statistics.slice_max);
896 PN(se.statistics.wait_max); 902 PN_SCHEDSTAT(se.statistics.wait_max);
897 PN(se.statistics.wait_sum); 903 PN_SCHEDSTAT(se.statistics.wait_sum);
898 P(se.statistics.wait_count); 904 P_SCHEDSTAT(se.statistics.wait_count);
899 PN(se.statistics.iowait_sum); 905 PN_SCHEDSTAT(se.statistics.iowait_sum);
900 P(se.statistics.iowait_count); 906 P_SCHEDSTAT(se.statistics.iowait_count);
901 P(se.statistics.nr_migrations_cold); 907 P_SCHEDSTAT(se.statistics.nr_migrations_cold);
902 P(se.statistics.nr_failed_migrations_affine); 908 P_SCHEDSTAT(se.statistics.nr_failed_migrations_affine);
903 P(se.statistics.nr_failed_migrations_running); 909 P_SCHEDSTAT(se.statistics.nr_failed_migrations_running);
904 P(se.statistics.nr_failed_migrations_hot); 910 P_SCHEDSTAT(se.statistics.nr_failed_migrations_hot);
905 P(se.statistics.nr_forced_migrations); 911 P_SCHEDSTAT(se.statistics.nr_forced_migrations);
906 P(se.statistics.nr_wakeups); 912 P_SCHEDSTAT(se.statistics.nr_wakeups);
907 P(se.statistics.nr_wakeups_sync); 913 P_SCHEDSTAT(se.statistics.nr_wakeups_sync);
908 P(se.statistics.nr_wakeups_migrate); 914 P_SCHEDSTAT(se.statistics.nr_wakeups_migrate);
909 P(se.statistics.nr_wakeups_local); 915 P_SCHEDSTAT(se.statistics.nr_wakeups_local);
910 P(se.statistics.nr_wakeups_remote); 916 P_SCHEDSTAT(se.statistics.nr_wakeups_remote);
911 P(se.statistics.nr_wakeups_affine); 917 P_SCHEDSTAT(se.statistics.nr_wakeups_affine);
912 P(se.statistics.nr_wakeups_affine_attempts); 918 P_SCHEDSTAT(se.statistics.nr_wakeups_affine_attempts);
913 P(se.statistics.nr_wakeups_passive); 919 P_SCHEDSTAT(se.statistics.nr_wakeups_passive);
914 P(se.statistics.nr_wakeups_idle); 920 P_SCHEDSTAT(se.statistics.nr_wakeups_idle);
915 921
916 avg_atom = p->se.sum_exec_runtime; 922 avg_atom = p->se.sum_exec_runtime;
917 if (nr_switches) 923 if (nr_switches)
@@ -930,7 +936,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
930 __PN(avg_atom); 936 __PN(avg_atom);
931 __PN(avg_per_cpu); 937 __PN(avg_per_cpu);
932 } 938 }
933#endif 939
934 __P(nr_switches); 940 __P(nr_switches);
935 SEQ_printf(m, "%-45s:%21Ld\n", 941 SEQ_printf(m, "%-45s:%21Ld\n",
936 "nr_voluntary_switches", (long long)p->nvcsw); 942 "nr_voluntary_switches", (long long)p->nvcsw);
@@ -947,8 +953,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
947#endif 953#endif
948 P(policy); 954 P(policy);
949 P(prio); 955 P(prio);
956#undef PN_SCHEDSTAT
950#undef PN 957#undef PN
951#undef __PN 958#undef __PN
959#undef P_SCHEDSTAT
952#undef P 960#undef P
953#undef __P 961#undef __P
954 962
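
The debug.c conversion above works because the schedstat accessors expand to either a real field access or a constant, so the CONFIG_SCHEDSTATS #ifdef blocks can disappear from the callers. A compile-time sketch of that idiom; the exact kernel definitions live in kernel/sched/stats.h and differ in detail, so the names here carry a _sketch suffix.

#include <stdio.h>

#define SCHEDSTATS_SKETCH 1	/* flip to 0 to watch the macros compile away */

#if SCHEDSTATS_SKETCH
# define schedstat_val_sketch(var)	(var)
# define schedstat_set_sketch(var, val)	((var) = (val))
# define schedstat_inc_sketch(var)	((var)++)
#else
# define schedstat_val_sketch(var)	0
# define schedstat_set_sketch(var, val)	do { } while (0)
# define schedstat_inc_sketch(var)	do { } while (0)
#endif

struct wait_stats_sketch {
	unsigned long long wait_sum;
	unsigned long long wait_count;
};

int main(void)
{
	struct wait_stats_sketch st = { 0, 0 };

	schedstat_set_sketch(st.wait_sum, 42);
	schedstat_inc_sketch(st.wait_count);
	printf("wait_sum=%llu wait_count=%llu\n",
	       (unsigned long long)schedstat_val_sketch(st.wait_sum),
	       (unsigned long long)schedstat_val_sketch(st.wait_count));
	return 0;
}

The schedstat_val_or_zero() variant used in print_task() additionally guards the access with the runtime schedstat_enabled() check, so a kernel with stats compiled in but disabled reports zeroes rather than stale values.
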
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 039de34f1521..c242944f5cbd 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -114,6 +114,12 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
114unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; 114unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
115#endif 115#endif
116 116
117/*
118 * The margin used when comparing utilization with CPU capacity:
119 * util * 1024 < capacity * margin
120 */
121unsigned int capacity_margin = 1280; /* ~20% */
122
117static inline void update_load_add(struct load_weight *lw, unsigned long inc) 123static inline void update_load_add(struct load_weight *lw, unsigned long inc)
118{ 124{
119 lw->weight += inc; 125 lw->weight += inc;
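
The capacity_margin constant added above is a fixed-point factor: the comparison in the comment, util * 1024 < capacity * margin, is an integer-only way of testing util < capacity * (1280/1024) = capacity * 1.25, and 1/1.25 = 0.8 is where the "~20%" annotation comes from. A tiny illustration of the arithmetic, not the kernel's use site (which is outside this hunk):

#include <stdbool.h>
#include <stdio.h>

#define SCALE_SKETCH 1024UL				/* capacity-scale unit */

static const unsigned long capacity_margin_sketch = 1280;	/* 1.25 in fixed point */

bool within_margin_sketch(unsigned long util, unsigned long capacity)
{
	/* util * 1024 < capacity * 1280  <=>  util < 1.25 * capacity */
	return util * SCALE_SKETCH < capacity * capacity_margin_sketch;
}

int main(void)
{
	printf("%d %d\n",
	       within_margin_sketch(800, 1024),	/* 819200 < 1310720 -> true */
	       within_margin_sketch(1300, 1024));	/* 1331200 >= 1310720 -> false */
	return 0;
}
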
@@ -256,9 +262,7 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
256 262
257static inline struct task_struct *task_of(struct sched_entity *se) 263static inline struct task_struct *task_of(struct sched_entity *se)
258{ 264{
259#ifdef CONFIG_SCHED_DEBUG 265 SCHED_WARN_ON(!entity_is_task(se));
260 WARN_ON_ONCE(!entity_is_task(se));
261#endif
262 return container_of(se, struct task_struct, se); 266 return container_of(se, struct task_struct, se);
263} 267}
264 268
@@ -456,17 +460,23 @@ static inline int entity_before(struct sched_entity *a,
456 460
457static void update_min_vruntime(struct cfs_rq *cfs_rq) 461static void update_min_vruntime(struct cfs_rq *cfs_rq)
458{ 462{
463 struct sched_entity *curr = cfs_rq->curr;
464
459 u64 vruntime = cfs_rq->min_vruntime; 465 u64 vruntime = cfs_rq->min_vruntime;
460 466
461 if (cfs_rq->curr) 467 if (curr) {
462 vruntime = cfs_rq->curr->vruntime; 468 if (curr->on_rq)
469 vruntime = curr->vruntime;
470 else
471 curr = NULL;
472 }
463 473
464 if (cfs_rq->rb_leftmost) { 474 if (cfs_rq->rb_leftmost) {
465 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost, 475 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
466 struct sched_entity, 476 struct sched_entity,
467 run_node); 477 run_node);
468 478
469 if (!cfs_rq->curr) 479 if (!curr)
470 vruntime = se->vruntime; 480 vruntime = se->vruntime;
471 else 481 else
472 vruntime = min_vruntime(vruntime, se->vruntime); 482 vruntime = min_vruntime(vruntime, se->vruntime);
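
The reworked update_min_vruntime() above only lets the running entity contribute while it is still on the runqueue. Condensed into a standalone helper (userspace sketch; the tail of the kernel function, which keeps min_vruntime monotonic, lies outside this hunk and is omitted):

#include <stdint.h>
#include <stddef.h>

struct se_sketch {
	uint64_t vruntime;
	int on_rq;
};

uint64_t pick_min_vruntime(uint64_t min_vruntime,
			   const struct se_sketch *curr,
			   const struct se_sketch *leftmost)
{
	uint64_t vruntime = min_vruntime;

	/* A current entity that was just dequeued no longer counts. */
	if (curr && !curr->on_rq)
		curr = NULL;

	if (curr)
		vruntime = curr->vruntime;

	if (leftmost) {
		if (!curr)
			vruntime = leftmost->vruntime;
		else if ((int64_t)(leftmost->vruntime - vruntime) < 0)
			vruntime = leftmost->vruntime;	/* wrap-safe min */
	}

	return vruntime;
}
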
@@ -656,7 +666,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
656} 666}
657 667
658#ifdef CONFIG_SMP 668#ifdef CONFIG_SMP
659static int select_idle_sibling(struct task_struct *p, int cpu); 669static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
660static unsigned long task_h_load(struct task_struct *p); 670static unsigned long task_h_load(struct task_struct *p);
661 671
662/* 672/*
@@ -680,7 +690,14 @@ void init_entity_runnable_average(struct sched_entity *se)
680 * will definitely be update (after enqueue). 690 * will definitely be update (after enqueue).
681 */ 691 */
682 sa->period_contrib = 1023; 692 sa->period_contrib = 1023;
683 sa->load_avg = scale_load_down(se->load.weight); 693 /*
694 * Tasks are initialized with full load to be seen as heavy tasks until
695 * they get a chance to stabilize to their real load level.
696 * Group entities are initialized with zero load to reflect the fact that
697 * nothing has been attached to the task group yet.
698 */
699 if (entity_is_task(se))
700 sa->load_avg = scale_load_down(se->load.weight);
684 sa->load_sum = sa->load_avg * LOAD_AVG_MAX; 701 sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
685 /* 702 /*
686 * At this point, util_avg won't be used in select_task_rq_fair anyway 703 * At this point, util_avg won't be used in select_task_rq_fair anyway
@@ -726,7 +743,6 @@ void post_init_entity_util_avg(struct sched_entity *se)
726 struct sched_avg *sa = &se->avg; 743 struct sched_avg *sa = &se->avg;
727 long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; 744 long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
728 u64 now = cfs_rq_clock_task(cfs_rq); 745 u64 now = cfs_rq_clock_task(cfs_rq);
729 int tg_update;
730 746
731 if (cap > 0) { 747 if (cap > 0) {
732 if (cfs_rq->avg.util_avg != 0) { 748 if (cfs_rq->avg.util_avg != 0) {
@@ -759,10 +775,9 @@ void post_init_entity_util_avg(struct sched_entity *se)
759 } 775 }
760 } 776 }
761 777
762 tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); 778 update_cfs_rq_load_avg(now, cfs_rq, false);
763 attach_entity_load_avg(cfs_rq, se); 779 attach_entity_load_avg(cfs_rq, se);
764 if (tg_update) 780 update_tg_load_avg(cfs_rq, false);
765 update_tg_load_avg(cfs_rq, false);
766} 781}
767 782
768#else /* !CONFIG_SMP */ 783#else /* !CONFIG_SMP */
@@ -799,7 +814,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
799 max(delta_exec, curr->statistics.exec_max)); 814 max(delta_exec, curr->statistics.exec_max));
800 815
801 curr->sum_exec_runtime += delta_exec; 816 curr->sum_exec_runtime += delta_exec;
802 schedstat_add(cfs_rq, exec_clock, delta_exec); 817 schedstat_add(cfs_rq->exec_clock, delta_exec);
803 818
804 curr->vruntime += calc_delta_fair(delta_exec, curr); 819 curr->vruntime += calc_delta_fair(delta_exec, curr);
805 update_min_vruntime(cfs_rq); 820 update_min_vruntime(cfs_rq);
@@ -820,26 +835,34 @@ static void update_curr_fair(struct rq *rq)
820 update_curr(cfs_rq_of(&rq->curr->se)); 835 update_curr(cfs_rq_of(&rq->curr->se));
821} 836}
822 837
823#ifdef CONFIG_SCHEDSTATS
824static inline void 838static inline void
825update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) 839update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
826{ 840{
827 u64 wait_start = rq_clock(rq_of(cfs_rq)); 841 u64 wait_start, prev_wait_start;
842
843 if (!schedstat_enabled())
844 return;
845
846 wait_start = rq_clock(rq_of(cfs_rq));
847 prev_wait_start = schedstat_val(se->statistics.wait_start);
828 848
829 if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && 849 if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) &&
830 likely(wait_start > se->statistics.wait_start)) 850 likely(wait_start > prev_wait_start))
831 wait_start -= se->statistics.wait_start; 851 wait_start -= prev_wait_start;
832 852
833 se->statistics.wait_start = wait_start; 853 schedstat_set(se->statistics.wait_start, wait_start);
834} 854}
835 855
836static void 856static inline void
837update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) 857update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
838{ 858{
839 struct task_struct *p; 859 struct task_struct *p;
840 u64 delta; 860 u64 delta;
841 861
842 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; 862 if (!schedstat_enabled())
863 return;
864
865 delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
843 866
844 if (entity_is_task(se)) { 867 if (entity_is_task(se)) {
845 p = task_of(se); 868 p = task_of(se);
@@ -849,35 +872,114 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
849 * time stamp can be adjusted to accumulate wait time 872 * time stamp can be adjusted to accumulate wait time
850 * prior to migration. 873 * prior to migration.
851 */ 874 */
852 se->statistics.wait_start = delta; 875 schedstat_set(se->statistics.wait_start, delta);
853 return; 876 return;
854 } 877 }
855 trace_sched_stat_wait(p, delta); 878 trace_sched_stat_wait(p, delta);
856 } 879 }
857 880
858 se->statistics.wait_max = max(se->statistics.wait_max, delta); 881 schedstat_set(se->statistics.wait_max,
859 se->statistics.wait_count++; 882 max(schedstat_val(se->statistics.wait_max), delta));
860 se->statistics.wait_sum += delta; 883 schedstat_inc(se->statistics.wait_count);
861 se->statistics.wait_start = 0; 884 schedstat_add(se->statistics.wait_sum, delta);
885 schedstat_set(se->statistics.wait_start, 0);
886}
887
888static inline void
889update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
890{
891 struct task_struct *tsk = NULL;
892 u64 sleep_start, block_start;
893
894 if (!schedstat_enabled())
895 return;
896
897 sleep_start = schedstat_val(se->statistics.sleep_start);
898 block_start = schedstat_val(se->statistics.block_start);
899
900 if (entity_is_task(se))
901 tsk = task_of(se);
902
903 if (sleep_start) {
904 u64 delta = rq_clock(rq_of(cfs_rq)) - sleep_start;
905
906 if ((s64)delta < 0)
907 delta = 0;
908
909 if (unlikely(delta > schedstat_val(se->statistics.sleep_max)))
910 schedstat_set(se->statistics.sleep_max, delta);
911
912 schedstat_set(se->statistics.sleep_start, 0);
913 schedstat_add(se->statistics.sum_sleep_runtime, delta);
914
915 if (tsk) {
916 account_scheduler_latency(tsk, delta >> 10, 1);
917 trace_sched_stat_sleep(tsk, delta);
918 }
919 }
920 if (block_start) {
921 u64 delta = rq_clock(rq_of(cfs_rq)) - block_start;
922
923 if ((s64)delta < 0)
924 delta = 0;
925
926 if (unlikely(delta > schedstat_val(se->statistics.block_max)))
927 schedstat_set(se->statistics.block_max, delta);
928
929 schedstat_set(se->statistics.block_start, 0);
930 schedstat_add(se->statistics.sum_sleep_runtime, delta);
931
932 if (tsk) {
933 if (tsk->in_iowait) {
934 schedstat_add(se->statistics.iowait_sum, delta);
935 schedstat_inc(se->statistics.iowait_count);
936 trace_sched_stat_iowait(tsk, delta);
937 }
938
939 trace_sched_stat_blocked(tsk, delta);
940
941 /*
942 * Blocking time is in units of nanosecs, so shift by
943 * 20 to get a milliseconds-range estimation of the
944 * amount of time that the task spent sleeping:
945 */
946 if (unlikely(prof_on == SLEEP_PROFILING)) {
947 profile_hits(SLEEP_PROFILING,
948 (void *)get_wchan(tsk),
949 delta >> 20);
950 }
951 account_scheduler_latency(tsk, delta >> 10, 0);
952 }
953 }
862} 954}
863 955
864/* 956/*
865 * Task is being enqueued - update stats: 957 * Task is being enqueued - update stats:
866 */ 958 */
867static inline void 959static inline void
868update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 960update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
869{ 961{
962 if (!schedstat_enabled())
963 return;
964
870 /* 965 /*
871 * Are we enqueueing a waiting task? (for current tasks 966 * Are we enqueueing a waiting task? (for current tasks
872 * a dequeue/enqueue event is a NOP) 967 * a dequeue/enqueue event is a NOP)
873 */ 968 */
874 if (se != cfs_rq->curr) 969 if (se != cfs_rq->curr)
875 update_stats_wait_start(cfs_rq, se); 970 update_stats_wait_start(cfs_rq, se);
971
972 if (flags & ENQUEUE_WAKEUP)
973 update_stats_enqueue_sleeper(cfs_rq, se);
876} 974}
877 975
878static inline void 976static inline void
879update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 977update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
880{ 978{
979
980 if (!schedstat_enabled())
981 return;
982
881 /* 983 /*
882 * Mark the end of the wait period if dequeueing a 984 * Mark the end of the wait period if dequeueing a
883 * waiting task: 985 * waiting task:
@@ -885,40 +987,18 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
885 if (se != cfs_rq->curr) 987 if (se != cfs_rq->curr)
886 update_stats_wait_end(cfs_rq, se); 988 update_stats_wait_end(cfs_rq, se);
887 989
888 if (flags & DEQUEUE_SLEEP) { 990 if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) {
889 if (entity_is_task(se)) { 991 struct task_struct *tsk = task_of(se);
890 struct task_struct *tsk = task_of(se);
891 992
892 if (tsk->state & TASK_INTERRUPTIBLE) 993 if (tsk->state & TASK_INTERRUPTIBLE)
893 se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); 994 schedstat_set(se->statistics.sleep_start,
894 if (tsk->state & TASK_UNINTERRUPTIBLE) 995 rq_clock(rq_of(cfs_rq)));
895 se->statistics.block_start = rq_clock(rq_of(cfs_rq)); 996 if (tsk->state & TASK_UNINTERRUPTIBLE)
896 } 997 schedstat_set(se->statistics.block_start,
998 rq_clock(rq_of(cfs_rq)));
897 } 999 }
898
899}
900#else
901static inline void
902update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
903{
904} 1000}
905 1001
906static inline void
907update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
908{
909}
910
911static inline void
912update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
913{
914}
915
916static inline void
917update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
918{
919}
920#endif
921
922/* 1002/*
923 * We are picking a new current task - update its stats: 1003 * We are picking a new current task - update its stats:
924 */ 1004 */
@@ -1513,8 +1593,16 @@ balance:
1513 * One idle CPU per node is evaluated for a task numa move. 1593 * One idle CPU per node is evaluated for a task numa move.
1514 * Call select_idle_sibling to maybe find a better one. 1594 * Call select_idle_sibling to maybe find a better one.
1515 */ 1595 */
1516 if (!cur) 1596 if (!cur) {
1517 env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); 1597 /*
1598 * select_idle_sibling() uses a per-CPU cpumask that can
1599 * also be used from IRQ context, so call it with IRQs disabled.
1600 */
1601 local_irq_disable();
1602 env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
1603 env->dst_cpu);
1604 local_irq_enable();
1605 }
1518 1606
1519assign: 1607assign:
1520 task_numa_assign(env, cur, imp); 1608 task_numa_assign(env, cur, imp);
@@ -2292,7 +2380,7 @@ void task_numa_work(struct callback_head *work)
2292 unsigned long nr_pte_updates = 0; 2380 unsigned long nr_pte_updates = 0;
2293 long pages, virtpages; 2381 long pages, virtpages;
2294 2382
2295 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); 2383 SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work));
2296 2384
2297 work->next = work; /* protect against double add */ 2385 work->next = work; /* protect against double add */
2298 /* 2386 /*
@@ -2803,9 +2891,21 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2803} 2891}
2804 2892
2805#ifdef CONFIG_FAIR_GROUP_SCHED 2893#ifdef CONFIG_FAIR_GROUP_SCHED
2806/* 2894/**
2807 * Updating tg's load_avg is necessary before update_cfs_share (which is done) 2895 * update_tg_load_avg - update the tg's load avg
2808 * and effective_load (which is not done because it is too costly). 2896 * @cfs_rq: the cfs_rq whose avg changed
2897 * @force: update regardless of how small the difference
2898 *
2899 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
2900 * However, because tg->load_avg is a global value there are performance
2901 * considerations.
2902 *
2903 * In order to avoid having to look at the other cfs_rq's, we use a
2904 * differential update where we store the last value we propagated. This in
2905 * turn allows skipping updates if the differential is 'small'.
2906 *
2907 * Updating tg's load_avg is necessary before update_cfs_share() (which is
2908 * done) and effective_load() (which is not done because it is too costly).
2809 */ 2909 */
2810static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) 2910static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
2811{ 2911{
@@ -2875,12 +2975,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
2875 2975
2876static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) 2976static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
2877{ 2977{
2878 struct rq *rq = rq_of(cfs_rq); 2978 if (&this_rq()->cfs == cfs_rq) {
2879 int cpu = cpu_of(rq);
2880
2881 if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
2882 unsigned long max = rq->cpu_capacity_orig;
2883
2884 /* 2979 /*
2885 * There are a few boundary cases this might miss but it should 2980 * There are a few boundary cases this might miss but it should
2886 * get called often enough that that should (hopefully) not be 2981 * get called often enough that that should (hopefully) not be
@@ -2897,8 +2992,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
2897 * 2992 *
2898 * See cpu_util(). 2993 * See cpu_util().
2899 */ 2994 */
2900 cpufreq_update_util(rq_clock(rq), 2995 cpufreq_update_util(rq_of(cfs_rq), 0);
2901 min(cfs_rq->avg.util_avg, max), max);
2902 } 2996 }
2903} 2997}
2904 2998
@@ -2931,10 +3025,10 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
2931 * 3025 *
2932 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example. 3026 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
2933 * 3027 *
2934 * Returns true if the load decayed or we removed utilization. It is expected 3028 * Returns true if the load decayed or we removed load.
2935 * that one calls update_tg_load_avg() on this condition, but after you've 3029 *
2936 * modified the cfs_rq avg (attach/detach), such that we propagate the new 3030 * Since both these conditions indicate a changed cfs_rq->avg.load we should
2937 * avg up. 3031 * call update_tg_load_avg() when this function returns true.
2938 */ 3032 */
2939static inline int 3033static inline int
2940update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) 3034update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
@@ -3159,10 +3253,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
3159 3253
3160static inline void update_load_avg(struct sched_entity *se, int not_used) 3254static inline void update_load_avg(struct sched_entity *se, int not_used)
3161{ 3255{
3162 struct cfs_rq *cfs_rq = cfs_rq_of(se); 3256 cpufreq_update_util(rq_of(cfs_rq_of(se)), 0);
3163 struct rq *rq = rq_of(cfs_rq);
3164
3165 cpufreq_trigger_update(rq_clock(rq));
3166} 3257}
3167 3258
3168static inline void 3259static inline void
@@ -3183,68 +3274,6 @@ static inline int idle_balance(struct rq *rq)
3183 3274
3184#endif /* CONFIG_SMP */ 3275#endif /* CONFIG_SMP */
3185 3276
3186static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
3187{
3188#ifdef CONFIG_SCHEDSTATS
3189 struct task_struct *tsk = NULL;
3190
3191 if (entity_is_task(se))
3192 tsk = task_of(se);
3193
3194 if (se->statistics.sleep_start) {
3195 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
3196
3197 if ((s64)delta < 0)
3198 delta = 0;
3199
3200 if (unlikely(delta > se->statistics.sleep_max))
3201 se->statistics.sleep_max = delta;
3202
3203 se->statistics.sleep_start = 0;
3204 se->statistics.sum_sleep_runtime += delta;
3205
3206 if (tsk) {
3207 account_scheduler_latency(tsk, delta >> 10, 1);
3208 trace_sched_stat_sleep(tsk, delta);
3209 }
3210 }
3211 if (se->statistics.block_start) {
3212 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
3213
3214 if ((s64)delta < 0)
3215 delta = 0;
3216
3217 if (unlikely(delta > se->statistics.block_max))
3218 se->statistics.block_max = delta;
3219
3220 se->statistics.block_start = 0;
3221 se->statistics.sum_sleep_runtime += delta;
3222
3223 if (tsk) {
3224 if (tsk->in_iowait) {
3225 se->statistics.iowait_sum += delta;
3226 se->statistics.iowait_count++;
3227 trace_sched_stat_iowait(tsk, delta);
3228 }
3229
3230 trace_sched_stat_blocked(tsk, delta);
3231
3232 /*
3233 * Blocking time is in units of nanosecs, so shift by
3234 * 20 to get a milliseconds-range estimation of the
3235 * amount of time that the task spent sleeping:
3236 */
3237 if (unlikely(prof_on == SLEEP_PROFILING)) {
3238 profile_hits(SLEEP_PROFILING,
3239 (void *)get_wchan(tsk),
3240 delta >> 20);
3241 }
3242 account_scheduler_latency(tsk, delta >> 10, 0);
3243 }
3244 }
3245#endif
3246}
3247
3248static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se) 3277static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
3249{ 3278{
3250#ifdef CONFIG_SCHED_DEBUG 3279#ifdef CONFIG_SCHED_DEBUG
@@ -3254,7 +3283,7 @@ static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
3254 d = -d; 3283 d = -d;
3255 3284
3256 if (d > 3*sysctl_sched_latency) 3285 if (d > 3*sysctl_sched_latency)
3257 schedstat_inc(cfs_rq, nr_spread_over); 3286 schedstat_inc(cfs_rq->nr_spread_over);
3258#endif 3287#endif
3259} 3288}
3260 3289
@@ -3371,17 +3400,12 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3371 account_entity_enqueue(cfs_rq, se); 3400 account_entity_enqueue(cfs_rq, se);
3372 update_cfs_shares(cfs_rq); 3401 update_cfs_shares(cfs_rq);
3373 3402
3374 if (flags & ENQUEUE_WAKEUP) { 3403 if (flags & ENQUEUE_WAKEUP)
3375 place_entity(cfs_rq, se, 0); 3404 place_entity(cfs_rq, se, 0);
3376 if (schedstat_enabled())
3377 enqueue_sleeper(cfs_rq, se);
3378 }
3379 3405
3380 check_schedstat_required(); 3406 check_schedstat_required();
3381 if (schedstat_enabled()) { 3407 update_stats_enqueue(cfs_rq, se, flags);
3382 update_stats_enqueue(cfs_rq, se); 3408 check_spread(cfs_rq, se);
3383 check_spread(cfs_rq, se);
3384 }
3385 if (!curr) 3409 if (!curr)
3386 __enqueue_entity(cfs_rq, se); 3410 __enqueue_entity(cfs_rq, se);
3387 se->on_rq = 1; 3411 se->on_rq = 1;
@@ -3448,8 +3472,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3448 update_curr(cfs_rq); 3472 update_curr(cfs_rq);
3449 dequeue_entity_load_avg(cfs_rq, se); 3473 dequeue_entity_load_avg(cfs_rq, se);
3450 3474
3451 if (schedstat_enabled()) 3475 update_stats_dequeue(cfs_rq, se, flags);
3452 update_stats_dequeue(cfs_rq, se, flags);
3453 3476
3454 clear_buddies(cfs_rq, se); 3477 clear_buddies(cfs_rq, se);
3455 3478
@@ -3459,9 +3482,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3459 account_entity_dequeue(cfs_rq, se); 3482 account_entity_dequeue(cfs_rq, se);
3460 3483
3461 /* 3484 /*
3462 * Normalize the entity after updating the min_vruntime because the 3485 * Normalize after update_curr(); which will also have moved
3463 * update can refer to the ->curr item and we need to reflect this 3486 * min_vruntime if @se is the one holding it back. But before doing
3464 * movement in our normalized position. 3487 * update_min_vruntime() again, which will discount @se's position and
3488 * can move min_vruntime forward still more.
3465 */ 3489 */
3466 if (!(flags & DEQUEUE_SLEEP)) 3490 if (!(flags & DEQUEUE_SLEEP))
3467 se->vruntime -= cfs_rq->min_vruntime; 3491 se->vruntime -= cfs_rq->min_vruntime;
@@ -3469,8 +3493,16 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3469 /* return excess runtime on last dequeue */ 3493 /* return excess runtime on last dequeue */
3470 return_cfs_rq_runtime(cfs_rq); 3494 return_cfs_rq_runtime(cfs_rq);
3471 3495
3472 update_min_vruntime(cfs_rq);
3473 update_cfs_shares(cfs_rq); 3496 update_cfs_shares(cfs_rq);
3497
3498 /*
3499 * Now advance min_vruntime if @se was the entity holding it back,
3500 * except when: DEQUEUE_SAVE && !DEQUEUE_MOVE, in this case we'll be
3501 * put back on, and if we advance min_vruntime, we'll be placed back
3502 * further than we started -- ie. we'll be penalized.
3503 */
3504 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
3505 update_min_vruntime(cfs_rq);
3474} 3506}
3475 3507
3476/* 3508/*
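
The new condition at the end of dequeue_entity() is a plain bit-mask idiom: masking flags with (DEQUEUE_SAVE | DEQUEUE_MOVE) and comparing the result to DEQUEUE_SAVE is true only when SAVE is set and MOVE is clear, i.e. the entity is parked temporarily and will be put straight back. Spelled out with made-up flag values:

#include <stdbool.h>

#define DEQUEUE_SAVE_SKETCH	0x02	/* stand-in values, not the kernel's */
#define DEQUEUE_MOVE_SKETCH	0x04

bool save_without_move(unsigned int flags)
{
	return (flags & (DEQUEUE_SAVE_SKETCH | DEQUEUE_MOVE_SKETCH)) ==
	       DEQUEUE_SAVE_SKETCH;
}
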
@@ -3523,25 +3555,25 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
3523 * a CPU. So account for the time it spent waiting on the 3555 * a CPU. So account for the time it spent waiting on the
3524 * runqueue. 3556 * runqueue.
3525 */ 3557 */
3526 if (schedstat_enabled()) 3558 update_stats_wait_end(cfs_rq, se);
3527 update_stats_wait_end(cfs_rq, se);
3528 __dequeue_entity(cfs_rq, se); 3559 __dequeue_entity(cfs_rq, se);
3529 update_load_avg(se, 1); 3560 update_load_avg(se, 1);
3530 } 3561 }
3531 3562
3532 update_stats_curr_start(cfs_rq, se); 3563 update_stats_curr_start(cfs_rq, se);
3533 cfs_rq->curr = se; 3564 cfs_rq->curr = se;
3534#ifdef CONFIG_SCHEDSTATS 3565
3535 /* 3566 /*
3536 * Track our maximum slice length, if the CPU's load is at 3567 * Track our maximum slice length, if the CPU's load is at
3537 * least twice that of our own weight (i.e. dont track it 3568 * least twice that of our own weight (i.e. dont track it
3538 * when there are only lesser-weight tasks around): 3569 * when there are only lesser-weight tasks around):
3539 */ 3570 */
3540 if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { 3571 if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
3541 se->statistics.slice_max = max(se->statistics.slice_max, 3572 schedstat_set(se->statistics.slice_max,
3542 se->sum_exec_runtime - se->prev_sum_exec_runtime); 3573 max((u64)schedstat_val(se->statistics.slice_max),
3574 se->sum_exec_runtime - se->prev_sum_exec_runtime));
3543 } 3575 }
3544#endif 3576
3545 se->prev_sum_exec_runtime = se->sum_exec_runtime; 3577 se->prev_sum_exec_runtime = se->sum_exec_runtime;
3546} 3578}
3547 3579
@@ -3620,13 +3652,10 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
3620 /* throttle cfs_rqs exceeding runtime */ 3652 /* throttle cfs_rqs exceeding runtime */
3621 check_cfs_rq_runtime(cfs_rq); 3653 check_cfs_rq_runtime(cfs_rq);
3622 3654
3623 if (schedstat_enabled()) { 3655 check_spread(cfs_rq, prev);
3624 check_spread(cfs_rq, prev);
3625 if (prev->on_rq)
3626 update_stats_wait_start(cfs_rq, prev);
3627 }
3628 3656
3629 if (prev->on_rq) { 3657 if (prev->on_rq) {
3658 update_stats_wait_start(cfs_rq, prev);
3630 /* Put 'current' back into the tree. */ 3659 /* Put 'current' back into the tree. */
3631 __enqueue_entity(cfs_rq, prev); 3660 __enqueue_entity(cfs_rq, prev);
3632 /* in !on_rq case, update occurred at dequeue */ 3661 /* in !on_rq case, update occurred at dequeue */
@@ -4456,9 +4485,9 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
4456 struct sched_entity *se = &p->se; 4485 struct sched_entity *se = &p->se;
4457 struct cfs_rq *cfs_rq = cfs_rq_of(se); 4486 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4458 4487
4459 WARN_ON(task_rq(p) != rq); 4488 SCHED_WARN_ON(task_rq(p) != rq);
4460 4489
4461 if (cfs_rq->nr_running > 1) { 4490 if (rq->cfs.h_nr_running > 1) {
4462 u64 slice = sched_slice(cfs_rq, se); 4491 u64 slice = sched_slice(cfs_rq, se);
4463 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; 4492 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
4464 s64 delta = slice - ran; 4493 s64 delta = slice - ran;
@@ -4509,6 +4538,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4509 struct cfs_rq *cfs_rq; 4538 struct cfs_rq *cfs_rq;
4510 struct sched_entity *se = &p->se; 4539 struct sched_entity *se = &p->se;
4511 4540
4541 /*
4542 * If in_iowait is set, the code below may not trigger any cpufreq
4543 * utilization updates, so do it here explicitly with the IOWAIT flag
4544 * passed.
4545 */
4546 if (p->in_iowait)
4547 cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
4548
4512 for_each_sched_entity(se) { 4549 for_each_sched_entity(se) {
4513 if (se->on_rq) 4550 if (se->on_rq)
4514 break; 4551 break;
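The in_iowait hunk above gives the cpufreq hook an explicit SCHED_CPUFREQ_IOWAIT hint when an I/O-blocked task is enqueued, because such a task's decayed utilization alone would under-request frequency. A stand-alone C sketch of that idea follows; cpufreq_update(), enqueue_task() and the flag value are illustrative stand-ins, not kernel API.

#include <stdbool.h>
#include <stdio.h>

#define SCHED_CPUFREQ_IOWAIT (1U << 0)	/* illustrative value, not the kernel's */

static void cpufreq_update(unsigned int flags)
{
	if (flags & SCHED_CPUFREQ_IOWAIT)
		printf("governor: task woke from I/O wait, jump to a high frequency\n");
	else
		printf("governor: recompute frequency from current utilization\n");
}

static void enqueue_task(bool in_iowait)
{
	/*
	 * A task that slept in I/O wait had its utilization decay while it
	 * was blocked, so a plain update would pick too low a frequency;
	 * send the IOWAIT hint first so the governor can boost.
	 */
	if (in_iowait)
		cpufreq_update(SCHED_CPUFREQ_IOWAIT);

	/* ...normal enqueue work and utilization updates follow... */
	cpufreq_update(0);
}

int main(void)
{
	enqueue_task(true);
	enqueue_task(false);
	return 0;
}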
@@ -4605,6 +4642,11 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4605} 4642}
4606 4643
4607#ifdef CONFIG_SMP 4644#ifdef CONFIG_SMP
4645
4646/* Working cpumask for: load_balance, load_balance_newidle. */
4647DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
4648DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
4649
4608#ifdef CONFIG_NO_HZ_COMMON 4650#ifdef CONFIG_NO_HZ_COMMON
4609/* 4651/*
4610 * per rq 'load' arrray crap; XXX kill this. 4652 * per rq 'load' arrray crap; XXX kill this.
@@ -5006,9 +5048,9 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
5006 * wl = S * s'_i; see (2) 5048 * wl = S * s'_i; see (2)
5007 */ 5049 */
5008 if (W > 0 && w < W) 5050 if (W > 0 && w < W)
5009 wl = (w * (long)tg->shares) / W; 5051 wl = (w * (long)scale_load_down(tg->shares)) / W;
5010 else 5052 else
5011 wl = tg->shares; 5053 wl = scale_load_down(tg->shares);
5012 5054
5013 /* 5055 /*
5014 * Per the above, wl is the new se->load.weight value; since 5056 * Per the above, wl is the new se->load.weight value; since
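The scale_load_down() conversion above matters because, on 64-bit kernels, group shares carry extra fixed-point resolution; mixing them with unscaled weights without shifting back down would skew the result. A minimal sketch of that convention, where the 10-bit shift and the sample weights are assumptions chosen for illustration:

#include <stdio.h>

#define SCHED_RES_SHIFT 10	/* extra fixed-point resolution, assumed here */
#define scale_load(w)		((unsigned long)(w) << SCHED_RES_SHIFT)
#define scale_load_down(w)	((unsigned long)(w) >> SCHED_RES_SHIFT)

int main(void)
{
	unsigned long nice0  = 1024;			/* external NICE_0-style weight */
	unsigned long shares = scale_load(nice0);	/* how group shares are stored */
	unsigned long w = 512, W = 2048;		/* entity weight and group total */

	/* wl = (w * shares) / W, computed in the external (unscaled) unit: */
	unsigned long wl = (w * scale_load_down(shares)) / W;

	printf("wl = %lu\n", wl);	/* 512 * 1024 / 2048 = 256 */
	return 0;
}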
@@ -5091,18 +5133,18 @@ static int wake_wide(struct task_struct *p)
5091 return 1; 5133 return 1;
5092} 5134}
5093 5135
5094static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) 5136static int wake_affine(struct sched_domain *sd, struct task_struct *p,
5137 int prev_cpu, int sync)
5095{ 5138{
5096 s64 this_load, load; 5139 s64 this_load, load;
5097 s64 this_eff_load, prev_eff_load; 5140 s64 this_eff_load, prev_eff_load;
5098 int idx, this_cpu, prev_cpu; 5141 int idx, this_cpu;
5099 struct task_group *tg; 5142 struct task_group *tg;
5100 unsigned long weight; 5143 unsigned long weight;
5101 int balanced; 5144 int balanced;
5102 5145
5103 idx = sd->wake_idx; 5146 idx = sd->wake_idx;
5104 this_cpu = smp_processor_id(); 5147 this_cpu = smp_processor_id();
5105 prev_cpu = task_cpu(p);
5106 load = source_load(prev_cpu, idx); 5148 load = source_load(prev_cpu, idx);
5107 this_load = target_load(this_cpu, idx); 5149 this_load = target_load(this_cpu, idx);
5108 5150
@@ -5146,13 +5188,13 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
5146 5188
5147 balanced = this_eff_load <= prev_eff_load; 5189 balanced = this_eff_load <= prev_eff_load;
5148 5190
5149 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts); 5191 schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
5150 5192
5151 if (!balanced) 5193 if (!balanced)
5152 return 0; 5194 return 0;
5153 5195
5154 schedstat_inc(sd, ttwu_move_affine); 5196 schedstat_inc(sd->ttwu_move_affine);
5155 schedstat_inc(p, se.statistics.nr_wakeups_affine); 5197 schedstat_inc(p->se.statistics.nr_wakeups_affine);
5156 5198
5157 return 1; 5199 return 1;
5158} 5200}
@@ -5228,6 +5270,10 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
5228 int shallowest_idle_cpu = -1; 5270 int shallowest_idle_cpu = -1;
5229 int i; 5271 int i;
5230 5272
5273 /* Check if we have any choice: */
5274 if (group->group_weight == 1)
5275 return cpumask_first(sched_group_cpus(group));
5276
5231 /* Traverse only the allowed CPUs */ 5277 /* Traverse only the allowed CPUs */
5232 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) { 5278 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
5233 if (idle_cpu(i)) { 5279 if (idle_cpu(i)) {
@@ -5265,64 +5311,242 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
5265} 5311}
5266 5312
5267/* 5313/*
5268 * Try and locate an idle CPU in the sched_domain. 5314 * Implement a for_each_cpu() variant that starts the scan at a given cpu
5315 * (@start), and wraps around.
5316 *
5317 * This is used to scan for idle CPUs; such that not all CPUs looking for an
5318 * idle CPU find the same CPU. The down-side is that tasks tend to cycle
5319 * through the LLC domain.
5320 *
5321 * Especially tbench is found sensitive to this.
5322 */
5323
5324static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped)
5325{
5326 int next;
5327
5328again:
5329 next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
5330
5331 if (*wrapped) {
5332 if (next >= start)
5333 return nr_cpumask_bits;
5334 } else {
5335 if (next >= nr_cpumask_bits) {
5336 *wrapped = 1;
5337 n = -1;
5338 goto again;
5339 }
5340 }
5341
5342 return next;
5343}
5344
5345#define for_each_cpu_wrap(cpu, mask, start, wrap) \
5346 for ((wrap) = 0, (cpu) = (start)-1; \
5347 (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \
5348 (cpu) < nr_cpumask_bits; )
5349
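The cpumask_next_wrap()/for_each_cpu_wrap() pair defined above is easiest to see with a plain bit mask: start the scan at a chosen CPU, walk upward, wrap to bit 0, and stop before revisiting the starting point. In the sketch below a 64-bit word stands in for struct cpumask, and next_set_bit()/scan_wrap() are invented helper names.

#include <stdio.h>

#define NR_CPUS 64

static int next_set_bit(unsigned long long mask, int from)
{
	for (int i = from; i < NR_CPUS; i++)
		if (mask & (1ULL << i))
			return i;
	return NR_CPUS;
}

static void scan_wrap(unsigned long long mask, int start)
{
	int cpu, wrapped = 0;

	for (cpu = next_set_bit(mask, start); ; cpu = next_set_bit(mask, cpu + 1)) {
		if (cpu >= NR_CPUS) {
			if (wrapped)
				break;			/* fell off the end twice: done */
			wrapped = 1;
			cpu = next_set_bit(mask, 0);	/* wrap around to the bottom */
		}
		if (wrapped && cpu >= start)
			break;				/* back at the starting point */
		printf("visit cpu %d\n", cpu);
	}
}

int main(void)
{
	/* CPUs 1, 3, 5 and 60 are in the mask; start scanning at CPU 4. */
	scan_wrap((1ULL << 1) | (1ULL << 3) | (1ULL << 5) | (1ULL << 60), 4);
	return 0;
}

Starting different wakers at different CPUs is what keeps concurrent wakeups from all converging on the same idle CPU, at the cost (noted above) of tasks tending to cycle through the LLC.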
5350#ifdef CONFIG_SCHED_SMT
5351
5352static inline void set_idle_cores(int cpu, int val)
5353{
5354 struct sched_domain_shared *sds;
5355
5356 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
5357 if (sds)
5358 WRITE_ONCE(sds->has_idle_cores, val);
5359}
5360
5361static inline bool test_idle_cores(int cpu, bool def)
5362{
5363 struct sched_domain_shared *sds;
5364
5365 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
5366 if (sds)
5367 return READ_ONCE(sds->has_idle_cores);
5368
5369 return def;
5370}
5371
5372/*
5373 * Scans the local SMT mask to see if the entire core is idle, and records this
5374 * information in sd_llc_shared->has_idle_cores.
5375 *
5376 * Since SMT siblings share all cache levels, inspecting this limited remote
5377 * state should be fairly cheap.
5378 */
5379void __update_idle_core(struct rq *rq)
5380{
5381 int core = cpu_of(rq);
5382 int cpu;
5383
5384 rcu_read_lock();
5385 if (test_idle_cores(core, true))
5386 goto unlock;
5387
5388 for_each_cpu(cpu, cpu_smt_mask(core)) {
5389 if (cpu == core)
5390 continue;
5391
5392 if (!idle_cpu(cpu))
5393 goto unlock;
5394 }
5395
5396 set_idle_cores(core, 1);
5397unlock:
5398 rcu_read_unlock();
5399}
5400
5401/*
5402 * Scan the entire LLC domain for idle cores; this dynamically switches off if
5403 * there are no idle cores left in the system; tracked through
5404 * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above.
5405 */
5406static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
5407{
5408 struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
5409 int core, cpu, wrap;
5410
5411 if (!static_branch_likely(&sched_smt_present))
5412 return -1;
5413
5414 if (!test_idle_cores(target, false))
5415 return -1;
5416
5417 cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
5418
5419 for_each_cpu_wrap(core, cpus, target, wrap) {
5420 bool idle = true;
5421
5422 for_each_cpu(cpu, cpu_smt_mask(core)) {
5423 cpumask_clear_cpu(cpu, cpus);
5424 if (!idle_cpu(cpu))
5425 idle = false;
5426 }
5427
5428 if (idle)
5429 return core;
5430 }
5431
5432 /*
5433 * Failed to find an idle core; stop looking for one.
5434 */
5435 set_idle_cores(target, 0);
5436
5437 return -1;
5438}
5439
5440/*
5441 * Scan the local SMT mask for idle CPUs.
5442 */
5443static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
5444{
5445 int cpu;
5446
5447 if (!static_branch_likely(&sched_smt_present))
5448 return -1;
5449
5450 for_each_cpu(cpu, cpu_smt_mask(target)) {
5451 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
5452 continue;
5453 if (idle_cpu(cpu))
5454 return cpu;
5455 }
5456
5457 return -1;
5458}
5459
5460#else /* CONFIG_SCHED_SMT */
5461
5462static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
5463{
5464 return -1;
5465}
5466
5467static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
5468{
5469 return -1;
5470}
5471
5472#endif /* CONFIG_SCHED_SMT */
5473
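The SMT block above hangs off one per-LLC hint, has_idle_cores: the expensive whole-core scan only runs while the hint is set, and a full scan that finds nothing clears it. The toy model below keeps the hint in a global flag and uses fixed arrays instead of cpumasks and domain pointers; the 2-way SMT layout and all names are assumptions, and the __update_idle_core() side that re-arms the hint when a core goes idle is left out.

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS   8
#define SMT_WIDTH 2	/* two hardware threads per core */

static bool cpu_is_idle[NR_CPUS] = { false, false, true, false, true, true, false, false };
static bool llc_has_idle_cores = true;	/* the sd_llc_shared->has_idle_cores analogue */

static int find_idle_core(void)
{
	if (!llc_has_idle_cores)
		return -1;		/* cheap exit: the last scan found nothing */

	for (int core = 0; core < NR_CPUS; core += SMT_WIDTH) {
		bool idle = true;

		for (int cpu = core; cpu < core + SMT_WIDTH; cpu++)
			if (!cpu_is_idle[cpu])
				idle = false;
		if (idle)
			return core;	/* every sibling of this core is idle */
	}

	llc_has_idle_cores = false;	/* remember the failure for later wakeups */
	return -1;
}

int main(void)
{
	printf("idle core: %d\n", find_idle_core());	/* CPUs 4+5 idle -> 4 */
	cpu_is_idle[5] = false;
	printf("idle core: %d\n", find_idle_core());	/* full scan fails -> -1 */
	printf("idle core: %d\n", find_idle_core());	/* -1 again, no scan this time */
	return 0;
}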
5474/*
5475 * Scan the LLC domain for idle CPUs; this is dynamically regulated by
5476 * comparing the average scan cost (tracked in sd->avg_scan_cost) against the
5477 * average idle time for this rq (as found in rq->avg_idle).
5478 */
5479static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target)
5480{
5481 struct sched_domain *this_sd;
5482 u64 avg_cost, avg_idle = this_rq()->avg_idle;
5483 u64 time, cost;
5484 s64 delta;
5485 int cpu, wrap;
5486
5487 this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
5488 if (!this_sd)
5489 return -1;
5490
5491 avg_cost = this_sd->avg_scan_cost;
5492
5493 /*
5494 * Due to large variance we need a large fuzz factor; hackbench in
5495 * particularly is sensitive here.
5496 */
5497 if ((avg_idle / 512) < avg_cost)
5498 return -1;
5499
5500 time = local_clock();
5501
5502 for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
5503 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
5504 continue;
5505 if (idle_cpu(cpu))
5506 break;
5507 }
5508
5509 time = local_clock() - time;
5510 cost = this_sd->avg_scan_cost;
5511 delta = (s64)(time - cost) / 8;
5512 this_sd->avg_scan_cost += delta;
5513
5514 return cpu;
5515}
5516
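select_idle_cpu() above throttles itself: the scan only runs when the CPU's average idle time comfortably exceeds a running estimate of the scan's own cost, and that estimate is updated as a 1/8-weight moving average. The arithmetic in isolation, where maybe_scan() is an invented wrapper and the /512 fuzz factor and /8 weight are taken from the hunk:

#include <stdio.h>

static long long avg_scan_cost;	/* ns, exponential average over previous scans */

static int maybe_scan(long long avg_idle_ns, long long this_scan_ns)
{
	/* Expected idle time nowhere near the scan cost: skip the scan. */
	if (avg_idle_ns / 512 < avg_scan_cost)
		return 0;

	/* ...the real code walks the LLC span here and times it... */

	/* avg += (sample - avg) / 8 */
	avg_scan_cost += (this_scan_ns - avg_scan_cost) / 8;
	return 1;
}

int main(void)
{
	long long samples[] = { 4000, 6000, 3000, 8000 };

	for (int i = 0; i < 4; i++) {
		int scanned = maybe_scan(3000000, samples[i]);

		printf("scanned=%d avg_scan_cost=%lld\n", scanned, avg_scan_cost);
	}
	return 0;
}

With a 3 ms average idle period every scan runs and the estimate closes one eighth of the gap to each sample; shrink avg_idle_ns to a few microseconds and maybe_scan() starts returning 0 as soon as the estimate exceeds avg_idle/512.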
5517/*
5518 * Try and locate an idle core/thread in the LLC cache domain.
5269 */ 5519 */
5270static int select_idle_sibling(struct task_struct *p, int target) 5520static int select_idle_sibling(struct task_struct *p, int prev, int target)
5271{ 5521{
5272 struct sched_domain *sd; 5522 struct sched_domain *sd;
5273 struct sched_group *sg; 5523 int i;
5274 int i = task_cpu(p);
5275 5524
5276 if (idle_cpu(target)) 5525 if (idle_cpu(target))
5277 return target; 5526 return target;
5278 5527
5279 /* 5528 /*
5280 * If the prevous cpu is cache affine and idle, don't be stupid. 5529 * If the previous cpu is cache affine and idle, don't be stupid.
5281 */ 5530 */
5282 if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) 5531 if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
5283 return i; 5532 return prev;
5284 5533
5285 /*
5286 * Otherwise, iterate the domains and find an eligible idle cpu.
5287 *
5288 * A completely idle sched group at higher domains is more
5289 * desirable than an idle group at a lower level, because lower
5290 * domains have smaller groups and usually share hardware
5291 * resources which causes tasks to contend on them, e.g. x86
5292 * hyperthread siblings in the lowest domain (SMT) can contend
5293 * on the shared cpu pipeline.
5294 *
5295 * However, while we prefer idle groups at higher domains
5296 * finding an idle cpu at the lowest domain is still better than
5297 * returning 'target', which we've already established, isn't
5298 * idle.
5299 */
5300 sd = rcu_dereference(per_cpu(sd_llc, target)); 5534 sd = rcu_dereference(per_cpu(sd_llc, target));
5301 for_each_lower_domain(sd) { 5535 if (!sd)
5302 sg = sd->groups; 5536 return target;
5303 do { 5537
5304 if (!cpumask_intersects(sched_group_cpus(sg), 5538 i = select_idle_core(p, sd, target);
5305 tsk_cpus_allowed(p))) 5539 if ((unsigned)i < nr_cpumask_bits)
5306 goto next; 5540 return i;
5307 5541
5308 /* Ensure the entire group is idle */ 5542 i = select_idle_cpu(p, sd, target);
5309 for_each_cpu(i, sched_group_cpus(sg)) { 5543 if ((unsigned)i < nr_cpumask_bits)
5310 if (i == target || !idle_cpu(i)) 5544 return i;
5311 goto next; 5545
5312 } 5546 i = select_idle_smt(p, sd, target);
5547 if ((unsigned)i < nr_cpumask_bits)
5548 return i;
5313 5549
5314 /*
5315 * It doesn't matter which cpu we pick, the
5316 * whole group is idle.
5317 */
5318 target = cpumask_first_and(sched_group_cpus(sg),
5319 tsk_cpus_allowed(p));
5320 goto done;
5321next:
5322 sg = sg->next;
5323 } while (sg != sd->groups);
5324 }
5325done:
5326 return target; 5550 return target;
5327} 5551}
5328 5552
@@ -5360,6 +5584,32 @@ static int cpu_util(int cpu)
5360 return (util >= capacity) ? capacity : util; 5584 return (util >= capacity) ? capacity : util;
5361} 5585}
5362 5586
5587static inline int task_util(struct task_struct *p)
5588{
5589 return p->se.avg.util_avg;
5590}
5591
5592/*
5593 * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
5594 * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
5595 *
5596 * In that case WAKE_AFFINE doesn't make sense and we'll let
5597 * BALANCE_WAKE sort things out.
5598 */
5599static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
5600{
5601 long min_cap, max_cap;
5602
5603 min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
5604 max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
5605
5606 /* Minimum capacity is close to max, no need to abort wake_affine */
5607 if (max_cap - min_cap < max_cap >> 3)
5608 return 0;
5609
5610 return min_cap * 1024 < task_util(p) * capacity_margin;
5611}
5612
5363/* 5613/*
5364 * select_task_rq_fair: Select target runqueue for the waking task in domains 5614 * select_task_rq_fair: Select target runqueue for the waking task in domains
5365 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, 5615 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
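wake_cap() above is plain integer arithmetic: on an asymmetric system, abort the affine fast path whenever the smaller of the two candidate CPUs cannot hold the task's utilization with roughly 25% headroom. The sketch below compresses it into one function; using the larger of the two CPUs instead of the root domain's maximum capacity, the example capacities, and capacity_margin = 1280 against a 1024 scale are all simplifying assumptions.

#include <stdio.h>

#define SCHED_CAPACITY_SCALE	1024
#define CAPACITY_MARGIN		1280	/* ~1.25x headroom, assumed value */

static int wake_cap(long task_util, long cap_prev, long cap_this)
{
	long min_cap = cap_prev < cap_this ? cap_prev : cap_this;
	long max_cap = cap_prev > cap_this ? cap_prev : cap_this;

	/* Roughly symmetric capacities (within 12.5%): never abort wake_affine. */
	if (max_cap - min_cap < max_cap >> 3)
		return 0;

	/* Abort if util * 1.25 does not fit into the smaller CPU's capacity. */
	return min_cap * SCHED_CAPACITY_SCALE < task_util * CAPACITY_MARGIN;
}

int main(void)
{
	/* big.LITTLE-like pair: little CPU capacity 430, big CPU 1024. */
	printf("util 200: %s\n", wake_cap(200, 430, 1024) ? "abort affine" : "affine ok");
	printf("util 500: %s\n", wake_cap(500, 430, 1024) ? "abort affine" : "affine ok");
	return 0;
}

For the 430-vs-1024 pair a task with utilization 200 keeps the affine path (440320 >= 256000) while one at 500 does not (440320 < 640000), which is exactly when letting BALANCE_WAKE pick a bigger CPU pays off.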
@@ -5383,7 +5633,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
5383 5633
5384 if (sd_flag & SD_BALANCE_WAKE) { 5634 if (sd_flag & SD_BALANCE_WAKE) {
5385 record_wakee(p); 5635 record_wakee(p);
5386 want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p)); 5636 want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
5637 && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
5387 } 5638 }
5388 5639
5389 rcu_read_lock(); 5640 rcu_read_lock();
@@ -5409,13 +5660,13 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
5409 5660
5410 if (affine_sd) { 5661 if (affine_sd) {
5411 sd = NULL; /* Prefer wake_affine over balance flags */ 5662 sd = NULL; /* Prefer wake_affine over balance flags */
5412 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync)) 5663 if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
5413 new_cpu = cpu; 5664 new_cpu = cpu;
5414 } 5665 }
5415 5666
5416 if (!sd) { 5667 if (!sd) {
5417 if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ 5668 if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
5418 new_cpu = select_idle_sibling(p, new_cpu); 5669 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
5419 5670
5420 } else while (sd) { 5671 } else while (sd) {
5421 struct sched_group *group; 5672 struct sched_group *group;
@@ -5939,7 +6190,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
5939 * 6190 *
5940 * The adjacency matrix of the resulting graph is given by: 6191 * The adjacency matrix of the resulting graph is given by:
5941 * 6192 *
5942 * log_2 n 6193 * log_2 n
5943 * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6) 6194 * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
5944 * k = 0 6195 * k = 0
5945 * 6196 *
@@ -5985,7 +6236,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
5985 * 6236 *
5986 * [XXX write more on how we solve this.. _after_ merging pjt's patches that 6237 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
5987 * rewrite all of this once again.] 6238 * rewrite all of this once again.]
5988 */ 6239 */
5989 6240
5990static unsigned long __read_mostly max_load_balance_interval = HZ/10; 6241static unsigned long __read_mostly max_load_balance_interval = HZ/10;
5991 6242
@@ -6133,7 +6384,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
6133 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { 6384 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
6134 int cpu; 6385 int cpu;
6135 6386
6136 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 6387 schedstat_inc(p->se.statistics.nr_failed_migrations_affine);
6137 6388
6138 env->flags |= LBF_SOME_PINNED; 6389 env->flags |= LBF_SOME_PINNED;
6139 6390
@@ -6164,7 +6415,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
6164 env->flags &= ~LBF_ALL_PINNED; 6415 env->flags &= ~LBF_ALL_PINNED;
6165 6416
6166 if (task_running(env->src_rq, p)) { 6417 if (task_running(env->src_rq, p)) {
6167 schedstat_inc(p, se.statistics.nr_failed_migrations_running); 6418 schedstat_inc(p->se.statistics.nr_failed_migrations_running);
6168 return 0; 6419 return 0;
6169 } 6420 }
6170 6421
@@ -6181,13 +6432,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
6181 if (tsk_cache_hot <= 0 || 6432 if (tsk_cache_hot <= 0 ||
6182 env->sd->nr_balance_failed > env->sd->cache_nice_tries) { 6433 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
6183 if (tsk_cache_hot == 1) { 6434 if (tsk_cache_hot == 1) {
6184 schedstat_inc(env->sd, lb_hot_gained[env->idle]); 6435 schedstat_inc(env->sd->lb_hot_gained[env->idle]);
6185 schedstat_inc(p, se.statistics.nr_forced_migrations); 6436 schedstat_inc(p->se.statistics.nr_forced_migrations);
6186 } 6437 }
6187 return 1; 6438 return 1;
6188 } 6439 }
6189 6440
6190 schedstat_inc(p, se.statistics.nr_failed_migrations_hot); 6441 schedstat_inc(p->se.statistics.nr_failed_migrations_hot);
6191 return 0; 6442 return 0;
6192} 6443}
6193 6444
@@ -6227,7 +6478,7 @@ static struct task_struct *detach_one_task(struct lb_env *env)
6227 * so we can safely collect stats here rather than 6478 * so we can safely collect stats here rather than
6228 * inside detach_tasks(). 6479 * inside detach_tasks().
6229 */ 6480 */
6230 schedstat_inc(env->sd, lb_gained[env->idle]); 6481 schedstat_inc(env->sd->lb_gained[env->idle]);
6231 return p; 6482 return p;
6232 } 6483 }
6233 return NULL; 6484 return NULL;
@@ -6319,7 +6570,7 @@ next:
6319 * so we can safely collect detach_one_task() stats here rather 6570 * so we can safely collect detach_one_task() stats here rather
6320 * than inside detach_one_task(). 6571 * than inside detach_one_task().
6321 */ 6572 */
6322 schedstat_add(env->sd, lb_gained[env->idle], detached); 6573 schedstat_add(env->sd->lb_gained[env->idle], detached);
6323 6574
6324 return detached; 6575 return detached;
6325} 6576}
@@ -6647,7 +6898,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
6647 /* 6898 /*
6648 * !SD_OVERLAP domains can assume that child groups 6899 * !SD_OVERLAP domains can assume that child groups
6649 * span the current group. 6900 * span the current group.
6650 */ 6901 */
6651 6902
6652 group = child->groups; 6903 group = child->groups;
6653 do { 6904 do {
@@ -7147,7 +7398,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
7147 load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE; 7398 load_above_capacity = busiest->sum_nr_running * SCHED_CAPACITY_SCALE;
7148 if (load_above_capacity > busiest->group_capacity) { 7399 if (load_above_capacity > busiest->group_capacity) {
7149 load_above_capacity -= busiest->group_capacity; 7400 load_above_capacity -= busiest->group_capacity;
7150 load_above_capacity *= NICE_0_LOAD; 7401 load_above_capacity *= scale_load_down(NICE_0_LOAD);
7151 load_above_capacity /= busiest->group_capacity; 7402 load_above_capacity /= busiest->group_capacity;
7152 } else 7403 } else
7153 load_above_capacity = ~0UL; 7404 load_above_capacity = ~0UL;
@@ -7354,9 +7605,6 @@ static struct rq *find_busiest_queue(struct lb_env *env,
7354 */ 7605 */
7355#define MAX_PINNED_INTERVAL 512 7606#define MAX_PINNED_INTERVAL 512
7356 7607
7357/* Working cpumask for load_balance and load_balance_newidle. */
7358DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
7359
7360static int need_active_balance(struct lb_env *env) 7608static int need_active_balance(struct lb_env *env)
7361{ 7609{
7362 struct sched_domain *sd = env->sd; 7610 struct sched_domain *sd = env->sd;
@@ -7460,7 +7708,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
7460 7708
7461 cpumask_copy(cpus, cpu_active_mask); 7709 cpumask_copy(cpus, cpu_active_mask);
7462 7710
7463 schedstat_inc(sd, lb_count[idle]); 7711 schedstat_inc(sd->lb_count[idle]);
7464 7712
7465redo: 7713redo:
7466 if (!should_we_balance(&env)) { 7714 if (!should_we_balance(&env)) {
@@ -7470,19 +7718,19 @@ redo:
7470 7718
7471 group = find_busiest_group(&env); 7719 group = find_busiest_group(&env);
7472 if (!group) { 7720 if (!group) {
7473 schedstat_inc(sd, lb_nobusyg[idle]); 7721 schedstat_inc(sd->lb_nobusyg[idle]);
7474 goto out_balanced; 7722 goto out_balanced;
7475 } 7723 }
7476 7724
7477 busiest = find_busiest_queue(&env, group); 7725 busiest = find_busiest_queue(&env, group);
7478 if (!busiest) { 7726 if (!busiest) {
7479 schedstat_inc(sd, lb_nobusyq[idle]); 7727 schedstat_inc(sd->lb_nobusyq[idle]);
7480 goto out_balanced; 7728 goto out_balanced;
7481 } 7729 }
7482 7730
7483 BUG_ON(busiest == env.dst_rq); 7731 BUG_ON(busiest == env.dst_rq);
7484 7732
7485 schedstat_add(sd, lb_imbalance[idle], env.imbalance); 7733 schedstat_add(sd->lb_imbalance[idle], env.imbalance);
7486 7734
7487 env.src_cpu = busiest->cpu; 7735 env.src_cpu = busiest->cpu;
7488 env.src_rq = busiest; 7736 env.src_rq = busiest;
@@ -7589,7 +7837,7 @@ more_balance:
7589 } 7837 }
7590 7838
7591 if (!ld_moved) { 7839 if (!ld_moved) {
7592 schedstat_inc(sd, lb_failed[idle]); 7840 schedstat_inc(sd->lb_failed[idle]);
7593 /* 7841 /*
7594 * Increment the failure counter only on periodic balance. 7842 * Increment the failure counter only on periodic balance.
7595 * We do not want newidle balance, which can be very 7843 * We do not want newidle balance, which can be very
@@ -7672,7 +7920,7 @@ out_all_pinned:
7672 * we can't migrate them. Let the imbalance flag set so parent level 7920 * we can't migrate them. Let the imbalance flag set so parent level
7673 * can try to migrate them. 7921 * can try to migrate them.
7674 */ 7922 */
7675 schedstat_inc(sd, lb_balanced[idle]); 7923 schedstat_inc(sd->lb_balanced[idle]);
7676 7924
7677 sd->nr_balance_failed = 0; 7925 sd->nr_balance_failed = 0;
7678 7926
@@ -7704,11 +7952,12 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
7704} 7952}
7705 7953
7706static inline void 7954static inline void
7707update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance) 7955update_next_balance(struct sched_domain *sd, unsigned long *next_balance)
7708{ 7956{
7709 unsigned long interval, next; 7957 unsigned long interval, next;
7710 7958
7711 interval = get_sd_balance_interval(sd, cpu_busy); 7959 /* used by idle balance, so cpu_busy = 0 */
7960 interval = get_sd_balance_interval(sd, 0);
7712 next = sd->last_balance + interval; 7961 next = sd->last_balance + interval;
7713 7962
7714 if (time_after(*next_balance, next)) 7963 if (time_after(*next_balance, next))
@@ -7738,7 +7987,7 @@ static int idle_balance(struct rq *this_rq)
7738 rcu_read_lock(); 7987 rcu_read_lock();
7739 sd = rcu_dereference_check_sched_domain(this_rq->sd); 7988 sd = rcu_dereference_check_sched_domain(this_rq->sd);
7740 if (sd) 7989 if (sd)
7741 update_next_balance(sd, 0, &next_balance); 7990 update_next_balance(sd, &next_balance);
7742 rcu_read_unlock(); 7991 rcu_read_unlock();
7743 7992
7744 goto out; 7993 goto out;
@@ -7756,7 +8005,7 @@ static int idle_balance(struct rq *this_rq)
7756 continue; 8005 continue;
7757 8006
7758 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) { 8007 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
7759 update_next_balance(sd, 0, &next_balance); 8008 update_next_balance(sd, &next_balance);
7760 break; 8009 break;
7761 } 8010 }
7762 8011
@@ -7774,7 +8023,7 @@ static int idle_balance(struct rq *this_rq)
7774 curr_cost += domain_cost; 8023 curr_cost += domain_cost;
7775 } 8024 }
7776 8025
7777 update_next_balance(sd, 0, &next_balance); 8026 update_next_balance(sd, &next_balance);
7778 8027
7779 /* 8028 /*
7780 * Stop searching for tasks to pull if there are 8029 * Stop searching for tasks to pull if there are
@@ -7864,15 +8113,15 @@ static int active_load_balance_cpu_stop(void *data)
7864 .idle = CPU_IDLE, 8113 .idle = CPU_IDLE,
7865 }; 8114 };
7866 8115
7867 schedstat_inc(sd, alb_count); 8116 schedstat_inc(sd->alb_count);
7868 8117
7869 p = detach_one_task(&env); 8118 p = detach_one_task(&env);
7870 if (p) { 8119 if (p) {
7871 schedstat_inc(sd, alb_pushed); 8120 schedstat_inc(sd->alb_pushed);
7872 /* Active balancing done, reset the failure counter. */ 8121 /* Active balancing done, reset the failure counter. */
7873 sd->nr_balance_failed = 0; 8122 sd->nr_balance_failed = 0;
7874 } else { 8123 } else {
7875 schedstat_inc(sd, alb_failed); 8124 schedstat_inc(sd->alb_failed);
7876 } 8125 }
7877 } 8126 }
7878 rcu_read_unlock(); 8127 rcu_read_unlock();
@@ -7964,13 +8213,13 @@ static inline void set_cpu_sd_state_busy(void)
7964 int cpu = smp_processor_id(); 8213 int cpu = smp_processor_id();
7965 8214
7966 rcu_read_lock(); 8215 rcu_read_lock();
7967 sd = rcu_dereference(per_cpu(sd_busy, cpu)); 8216 sd = rcu_dereference(per_cpu(sd_llc, cpu));
7968 8217
7969 if (!sd || !sd->nohz_idle) 8218 if (!sd || !sd->nohz_idle)
7970 goto unlock; 8219 goto unlock;
7971 sd->nohz_idle = 0; 8220 sd->nohz_idle = 0;
7972 8221
7973 atomic_inc(&sd->groups->sgc->nr_busy_cpus); 8222 atomic_inc(&sd->shared->nr_busy_cpus);
7974unlock: 8223unlock:
7975 rcu_read_unlock(); 8224 rcu_read_unlock();
7976} 8225}
@@ -7981,13 +8230,13 @@ void set_cpu_sd_state_idle(void)
7981 int cpu = smp_processor_id(); 8230 int cpu = smp_processor_id();
7982 8231
7983 rcu_read_lock(); 8232 rcu_read_lock();
7984 sd = rcu_dereference(per_cpu(sd_busy, cpu)); 8233 sd = rcu_dereference(per_cpu(sd_llc, cpu));
7985 8234
7986 if (!sd || sd->nohz_idle) 8235 if (!sd || sd->nohz_idle)
7987 goto unlock; 8236 goto unlock;
7988 sd->nohz_idle = 1; 8237 sd->nohz_idle = 1;
7989 8238
7990 atomic_dec(&sd->groups->sgc->nr_busy_cpus); 8239 atomic_dec(&sd->shared->nr_busy_cpus);
7991unlock: 8240unlock:
7992 rcu_read_unlock(); 8241 rcu_read_unlock();
7993} 8242}
@@ -8214,8 +8463,8 @@ end:
8214static inline bool nohz_kick_needed(struct rq *rq) 8463static inline bool nohz_kick_needed(struct rq *rq)
8215{ 8464{
8216 unsigned long now = jiffies; 8465 unsigned long now = jiffies;
8466 struct sched_domain_shared *sds;
8217 struct sched_domain *sd; 8467 struct sched_domain *sd;
8218 struct sched_group_capacity *sgc;
8219 int nr_busy, cpu = rq->cpu; 8468 int nr_busy, cpu = rq->cpu;
8220 bool kick = false; 8469 bool kick = false;
8221 8470
@@ -8243,11 +8492,13 @@ static inline bool nohz_kick_needed(struct rq *rq)
8243 return true; 8492 return true;
8244 8493
8245 rcu_read_lock(); 8494 rcu_read_lock();
8246 sd = rcu_dereference(per_cpu(sd_busy, cpu)); 8495 sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
8247 if (sd) { 8496 if (sds) {
8248 sgc = sd->groups->sgc; 8497 /*
8249 nr_busy = atomic_read(&sgc->nr_busy_cpus); 8498 * XXX: write a coherent comment on why we do this.
8250 8499 * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
8500 */
8501 nr_busy = atomic_read(&sds->nr_busy_cpus);
8251 if (nr_busy > 1) { 8502 if (nr_busy > 1) {
8252 kick = true; 8503 kick = true;
8253 goto unlock; 8504 goto unlock;
@@ -8283,7 +8534,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
8283 * run_rebalance_domains is triggered when needed from the scheduler tick. 8534 * run_rebalance_domains is triggered when needed from the scheduler tick.
8284 * Also triggered for nohz idle balancing (with nohz_balancing_kick set). 8535 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
8285 */ 8536 */
8286static void run_rebalance_domains(struct softirq_action *h) 8537static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
8287{ 8538{
8288 struct rq *this_rq = this_rq(); 8539 struct rq *this_rq = this_rq();
8289 enum cpu_idle_type idle = this_rq->idle_balance ? 8540 enum cpu_idle_type idle = this_rq->idle_balance ?
@@ -8441,7 +8692,6 @@ static void detach_task_cfs_rq(struct task_struct *p)
8441 struct sched_entity *se = &p->se; 8692 struct sched_entity *se = &p->se;
8442 struct cfs_rq *cfs_rq = cfs_rq_of(se); 8693 struct cfs_rq *cfs_rq = cfs_rq_of(se);
8443 u64 now = cfs_rq_clock_task(cfs_rq); 8694 u64 now = cfs_rq_clock_task(cfs_rq);
8444 int tg_update;
8445 8695
8446 if (!vruntime_normalized(p)) { 8696 if (!vruntime_normalized(p)) {
8447 /* 8697 /*
@@ -8453,10 +8703,9 @@ static void detach_task_cfs_rq(struct task_struct *p)
8453 } 8703 }
8454 8704
8455 /* Catch up with the cfs_rq and remove our load when we leave */ 8705 /* Catch up with the cfs_rq and remove our load when we leave */
8456 tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); 8706 update_cfs_rq_load_avg(now, cfs_rq, false);
8457 detach_entity_load_avg(cfs_rq, se); 8707 detach_entity_load_avg(cfs_rq, se);
8458 if (tg_update) 8708 update_tg_load_avg(cfs_rq, false);
8459 update_tg_load_avg(cfs_rq, false);
8460} 8709}
8461 8710
8462static void attach_task_cfs_rq(struct task_struct *p) 8711static void attach_task_cfs_rq(struct task_struct *p)
@@ -8464,7 +8713,6 @@ static void attach_task_cfs_rq(struct task_struct *p)
8464 struct sched_entity *se = &p->se; 8713 struct sched_entity *se = &p->se;
8465 struct cfs_rq *cfs_rq = cfs_rq_of(se); 8714 struct cfs_rq *cfs_rq = cfs_rq_of(se);
8466 u64 now = cfs_rq_clock_task(cfs_rq); 8715 u64 now = cfs_rq_clock_task(cfs_rq);
8467 int tg_update;
8468 8716
8469#ifdef CONFIG_FAIR_GROUP_SCHED 8717#ifdef CONFIG_FAIR_GROUP_SCHED
8470 /* 8718 /*
@@ -8475,10 +8723,9 @@ static void attach_task_cfs_rq(struct task_struct *p)
8475#endif 8723#endif
8476 8724
8477 /* Synchronize task with its cfs_rq */ 8725 /* Synchronize task with its cfs_rq */
8478 tg_update = update_cfs_rq_load_avg(now, cfs_rq, false); 8726 update_cfs_rq_load_avg(now, cfs_rq, false);
8479 attach_entity_load_avg(cfs_rq, se); 8727 attach_entity_load_avg(cfs_rq, se);
8480 if (tg_update) 8728 update_tg_load_avg(cfs_rq, false);
8481 update_tg_load_avg(cfs_rq, false);
8482 8729
8483 if (!vruntime_normalized(p)) 8730 if (!vruntime_normalized(p))
8484 se->vruntime += cfs_rq->min_vruntime; 8731 se->vruntime += cfs_rq->min_vruntime;
@@ -8592,7 +8839,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8592{ 8839{
8593 struct sched_entity *se; 8840 struct sched_entity *se;
8594 struct cfs_rq *cfs_rq; 8841 struct cfs_rq *cfs_rq;
8595 struct rq *rq;
8596 int i; 8842 int i;
8597 8843
8598 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL); 8844 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8607,8 +8853,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8607 init_cfs_bandwidth(tg_cfs_bandwidth(tg)); 8853 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
8608 8854
8609 for_each_possible_cpu(i) { 8855 for_each_possible_cpu(i) {
8610 rq = cpu_rq(i);
8611
8612 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 8856 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8613 GFP_KERNEL, cpu_to_node(i)); 8857 GFP_KERNEL, cpu_to_node(i));
8614 if (!cfs_rq) 8858 if (!cfs_rq)
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 9fb873cfc75c..1d8718d5300d 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -16,6 +16,9 @@
16 16
17#include "sched.h" 17#include "sched.h"
18 18
19/* Linker adds these: start and end of __cpuidle functions */
20extern char __cpuidle_text_start[], __cpuidle_text_end[];
21
19/** 22/**
20 * sched_idle_set_state - Record idle state for the current CPU. 23 * sched_idle_set_state - Record idle state for the current CPU.
21 * @idle_state: State to record. 24 * @idle_state: State to record.
@@ -53,7 +56,7 @@ static int __init cpu_idle_nopoll_setup(char *__unused)
53__setup("hlt", cpu_idle_nopoll_setup); 56__setup("hlt", cpu_idle_nopoll_setup);
54#endif 57#endif
55 58
56static inline int cpu_idle_poll(void) 59static noinline int __cpuidle cpu_idle_poll(void)
57{ 60{
58 rcu_idle_enter(); 61 rcu_idle_enter();
59 trace_cpu_idle_rcuidle(0, smp_processor_id()); 62 trace_cpu_idle_rcuidle(0, smp_processor_id());
@@ -84,7 +87,7 @@ void __weak arch_cpu_idle(void)
84 * 87 *
85 * To use when the cpuidle framework cannot be used. 88 * To use when the cpuidle framework cannot be used.
86 */ 89 */
87void default_idle_call(void) 90void __cpuidle default_idle_call(void)
88{ 91{
89 if (current_clr_polling_and_test()) { 92 if (current_clr_polling_and_test()) {
90 local_irq_enable(); 93 local_irq_enable();
@@ -271,6 +274,12 @@ static void cpu_idle_loop(void)
271 } 274 }
272} 275}
273 276
277bool cpu_in_idle(unsigned long pc)
278{
279 return pc >= (unsigned long)__cpuidle_text_start &&
280 pc < (unsigned long)__cpuidle_text_end;
281}
282
274void cpu_startup_entry(enum cpuhp_state state) 283void cpu_startup_entry(enum cpuhp_state state)
275{ 284{
276 /* 285 /*
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 2ce5458bbe1d..5405d3feb112 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -27,8 +27,8 @@ static struct task_struct *
27pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie) 27pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct pin_cookie cookie)
28{ 28{
29 put_prev_task(rq, prev); 29 put_prev_task(rq, prev);
30 30 update_idle_core(rq);
31 schedstat_inc(rq, sched_goidle); 31 schedstat_inc(rq->sched_goidle);
32 return rq->idle; 32 return rq->idle;
33} 33}
34 34
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index d5690b722691..2516b8df6dbb 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -957,9 +957,8 @@ static void update_curr_rt(struct rq *rq)
957 if (unlikely((s64)delta_exec <= 0)) 957 if (unlikely((s64)delta_exec <= 0))
958 return; 958 return;
959 959
960 /* Kick cpufreq (see the comment in linux/cpufreq.h). */ 960 /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
961 if (cpu_of(rq) == smp_processor_id()) 961 cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_RT);
962 cpufreq_trigger_update(rq_clock(rq));
963 962
964 schedstat_set(curr->se.statistics.exec_max, 963 schedstat_set(curr->se.statistics.exec_max,
965 max(curr->se.statistics.exec_max, delta_exec)); 964 max(curr->se.statistics.exec_max, delta_exec));
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c64fc5114004..055f935d4421 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2,6 +2,7 @@
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include <linux/sched/sysctl.h> 3#include <linux/sched/sysctl.h>
4#include <linux/sched/rt.h> 4#include <linux/sched/rt.h>
5#include <linux/u64_stats_sync.h>
5#include <linux/sched/deadline.h> 6#include <linux/sched/deadline.h>
6#include <linux/binfmts.h> 7#include <linux/binfmts.h>
7#include <linux/mutex.h> 8#include <linux/mutex.h>
@@ -15,6 +16,12 @@
15#include "cpudeadline.h" 16#include "cpudeadline.h"
16#include "cpuacct.h" 17#include "cpuacct.h"
17 18
19#ifdef CONFIG_SCHED_DEBUG
20#define SCHED_WARN_ON(x) WARN_ONCE(x, #x)
21#else
22#define SCHED_WARN_ON(x) ((void)(x))
23#endif
24
18struct rq; 25struct rq;
19struct cpuidle_state; 26struct cpuidle_state;
20 27
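SCHED_WARN_ON() above only produces a warning under CONFIG_SCHED_DEBUG, yet the non-debug variant still evaluates its argument, so a condition with side effects behaves the same in both builds. A userspace sketch of the same shape; MY_DEBUG and MY_WARN_ON are invented names, the once-only latch mimics WARN_ONCE(), and the ({ ... }) statement expression is a GNU C extension the kernel itself relies on.

#include <stdio.h>

#define MY_DEBUG 1

#if MY_DEBUG
#define MY_WARN_ON(x)						\
	({							\
		static int __warned;				\
		int __ret = !!(x);				\
		if (__ret && !__warned) {			\
			__warned = 1;				\
			fprintf(stderr, "warning: %s\n", #x);	\
		}						\
		__ret;						\
	})
#else
#define MY_WARN_ON(x) ((void)(x))	/* still evaluates x, emits nothing */
#endif

int main(void)
{
	int calls = 0;

	for (int i = 0; i < 3; i++)
		MY_WARN_ON(++calls > 1);	/* warns once, but always increments */

	printf("calls = %d\n", calls);		/* 3 in both build flavours */
	return 0;
}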
@@ -565,6 +572,8 @@ struct root_domain {
565 */ 572 */
566 cpumask_var_t rto_mask; 573 cpumask_var_t rto_mask;
567 struct cpupri cpupri; 574 struct cpupri cpupri;
575
576 unsigned long max_cpu_capacity;
568}; 577};
569 578
570extern struct root_domain def_root_domain; 579extern struct root_domain def_root_domain;
@@ -597,7 +606,6 @@ struct rq {
597#ifdef CONFIG_SMP 606#ifdef CONFIG_SMP
598 unsigned long last_load_update_tick; 607 unsigned long last_load_update_tick;
599#endif /* CONFIG_SMP */ 608#endif /* CONFIG_SMP */
600 u64 nohz_stamp;
601 unsigned long nohz_flags; 609 unsigned long nohz_flags;
602#endif /* CONFIG_NO_HZ_COMMON */ 610#endif /* CONFIG_NO_HZ_COMMON */
603#ifdef CONFIG_NO_HZ_FULL 611#ifdef CONFIG_NO_HZ_FULL
@@ -723,6 +731,23 @@ static inline int cpu_of(struct rq *rq)
723#endif 731#endif
724} 732}
725 733
734
735#ifdef CONFIG_SCHED_SMT
736
737extern struct static_key_false sched_smt_present;
738
739extern void __update_idle_core(struct rq *rq);
740
741static inline void update_idle_core(struct rq *rq)
742{
743 if (static_branch_unlikely(&sched_smt_present))
744 __update_idle_core(rq);
745}
746
747#else
748static inline void update_idle_core(struct rq *rq) { }
749#endif
750
726DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 751DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
727 752
728#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 753#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
@@ -857,8 +882,8 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
857DECLARE_PER_CPU(struct sched_domain *, sd_llc); 882DECLARE_PER_CPU(struct sched_domain *, sd_llc);
858DECLARE_PER_CPU(int, sd_llc_size); 883DECLARE_PER_CPU(int, sd_llc_size);
859DECLARE_PER_CPU(int, sd_llc_id); 884DECLARE_PER_CPU(int, sd_llc_id);
885DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
860DECLARE_PER_CPU(struct sched_domain *, sd_numa); 886DECLARE_PER_CPU(struct sched_domain *, sd_numa);
861DECLARE_PER_CPU(struct sched_domain *, sd_busy);
862DECLARE_PER_CPU(struct sched_domain *, sd_asym); 887DECLARE_PER_CPU(struct sched_domain *, sd_asym);
863 888
864struct sched_group_capacity { 889struct sched_group_capacity {
@@ -870,10 +895,6 @@ struct sched_group_capacity {
870 unsigned int capacity; 895 unsigned int capacity;
871 unsigned long next_update; 896 unsigned long next_update;
872 int imbalance; /* XXX unrelated to capacity but shared group state */ 897 int imbalance; /* XXX unrelated to capacity but shared group state */
873 /*
874 * Number of busy cpus in this group.
875 */
876 atomic_t nr_busy_cpus;
877 898
878 unsigned long cpumask[0]; /* iteration mask */ 899 unsigned long cpumask[0]; /* iteration mask */
879}; 900};
@@ -1000,7 +1021,11 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1000 * per-task data have been completed by this moment. 1021 * per-task data have been completed by this moment.
1001 */ 1022 */
1002 smp_wmb(); 1023 smp_wmb();
1024#ifdef CONFIG_THREAD_INFO_IN_TASK
1025 p->cpu = cpu;
1026#else
1003 task_thread_info(p)->cpu = cpu; 1027 task_thread_info(p)->cpu = cpu;
1028#endif
1004 p->wake_cpu = cpu; 1029 p->wake_cpu = cpu;
1005#endif 1030#endif
1006} 1031}
@@ -1260,6 +1285,11 @@ static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
1260 prev->sched_class->put_prev_task(rq, prev); 1285 prev->sched_class->put_prev_task(rq, prev);
1261} 1286}
1262 1287
1288static inline void set_curr_task(struct rq *rq, struct task_struct *curr)
1289{
1290 curr->sched_class->set_curr_task(rq);
1291}
1292
1263#define sched_class_highest (&stop_sched_class) 1293#define sched_class_highest (&stop_sched_class)
1264#define for_each_class(class) \ 1294#define for_each_class(class) \
1265 for (class = sched_class_highest; class; class = class->next) 1295 for (class = sched_class_highest; class; class = class->next)
@@ -1290,7 +1320,7 @@ static inline void idle_set_state(struct rq *rq,
1290 1320
1291static inline struct cpuidle_state *idle_get_state(struct rq *rq) 1321static inline struct cpuidle_state *idle_get_state(struct rq *rq)
1292{ 1322{
1293 WARN_ON(!rcu_read_lock_held()); 1323 SCHED_WARN_ON(!rcu_read_lock_held());
1294 return rq->idle_state; 1324 return rq->idle_state;
1295} 1325}
1296#else 1326#else
@@ -1710,52 +1740,28 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { }
1710#endif 1740#endif
1711 1741
1712#ifdef CONFIG_IRQ_TIME_ACCOUNTING 1742#ifdef CONFIG_IRQ_TIME_ACCOUNTING
1743struct irqtime {
1744 u64 hardirq_time;
1745 u64 softirq_time;
1746 u64 irq_start_time;
1747 struct u64_stats_sync sync;
1748};
1713 1749
1714DECLARE_PER_CPU(u64, cpu_hardirq_time); 1750DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
1715DECLARE_PER_CPU(u64, cpu_softirq_time);
1716
1717#ifndef CONFIG_64BIT
1718DECLARE_PER_CPU(seqcount_t, irq_time_seq);
1719
1720static inline void irq_time_write_begin(void)
1721{
1722 __this_cpu_inc(irq_time_seq.sequence);
1723 smp_wmb();
1724}
1725
1726static inline void irq_time_write_end(void)
1727{
1728 smp_wmb();
1729 __this_cpu_inc(irq_time_seq.sequence);
1730}
1731 1751
1732static inline u64 irq_time_read(int cpu) 1752static inline u64 irq_time_read(int cpu)
1733{ 1753{
1734 u64 irq_time; 1754 struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
1735 unsigned seq; 1755 unsigned int seq;
1756 u64 total;
1736 1757
1737 do { 1758 do {
1738 seq = read_seqcount_begin(&per_cpu(irq_time_seq, cpu)); 1759 seq = __u64_stats_fetch_begin(&irqtime->sync);
1739 irq_time = per_cpu(cpu_softirq_time, cpu) + 1760 total = irqtime->softirq_time + irqtime->hardirq_time;
1740 per_cpu(cpu_hardirq_time, cpu); 1761 } while (__u64_stats_fetch_retry(&irqtime->sync, seq));
1741 } while (read_seqcount_retry(&per_cpu(irq_time_seq, cpu), seq));
1742
1743 return irq_time;
1744}
1745#else /* CONFIG_64BIT */
1746static inline void irq_time_write_begin(void)
1747{
1748}
1749
1750static inline void irq_time_write_end(void)
1751{
1752}
1753 1762
1754static inline u64 irq_time_read(int cpu) 1763 return total;
1755{
1756 return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
1757} 1764}
1758#endif /* CONFIG_64BIT */
1759#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 1765#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
1760 1766
1761#ifdef CONFIG_CPU_FREQ 1767#ifdef CONFIG_CPU_FREQ
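The irqtime rewrite above swaps the open-coded per-CPU seqcount for u64_stats_sync, but the reader-side idea is unchanged: on 32-bit, retry the pair of 64-bit reads until the writer's sequence number is even and has not moved (on 64-bit the synchronization compiles away). A single-threaded model of just that protocol, with no real barriers or atomics; struct irqtime_model and both helpers are invented for the sketch.

#include <stdio.h>

struct irqtime_model {
	unsigned int seq;		/* even: stable, odd: write in progress */
	unsigned long long hardirq_time;
	unsigned long long softirq_time;
};

static void account_hardirq(struct irqtime_model *t, unsigned long long delta)
{
	t->seq++;			/* begin write: sequence goes odd */
	t->hardirq_time += delta;
	t->seq++;			/* end write: sequence is even again */
}

static unsigned long long irq_time_read(const struct irqtime_model *t)
{
	unsigned int seq;
	unsigned long long total;

	do {
		seq = t->seq;
		total = t->softirq_time + t->hardirq_time;
	} while ((seq & 1) || seq != t->seq);	/* retry torn or concurrent reads */

	return total;
}

int main(void)
{
	struct irqtime_model t = { 0, 0, 0 };

	account_hardirq(&t, 1200);
	account_hardirq(&t, 300);
	printf("irq time: %llu\n", irq_time_read(&t));	/* 1500 */
	return 0;
}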
@@ -1763,27 +1769,13 @@ DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
1763 1769
1764/** 1770/**
1765 * cpufreq_update_util - Take a note about CPU utilization changes. 1771 * cpufreq_update_util - Take a note about CPU utilization changes.
1766 * @time: Current time. 1772 * @rq: Runqueue to carry out the update for.
1767 * @util: Current utilization. 1773 * @flags: Update reason flags.
1768 * @max: Utilization ceiling.
1769 * 1774 *
1770 * This function is called by the scheduler on every invocation of 1775 * This function is called by the scheduler on the CPU whose utilization is
1771 * update_load_avg() on the CPU whose utilization is being updated. 1776 * being updated.
1772 * 1777 *
1773 * It can only be called from RCU-sched read-side critical sections. 1778 * It can only be called from RCU-sched read-side critical sections.
1774 */
1775static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max)
1776{
1777 struct update_util_data *data;
1778
1779 data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
1780 if (data)
1781 data->func(data, time, util, max);
1782}
1783
1784/**
1785 * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed.
1786 * @time: Current time.
1787 * 1779 *
1788 * The way cpufreq is currently arranged requires it to evaluate the CPU 1780 * The way cpufreq is currently arranged requires it to evaluate the CPU
1789 * performance state (frequency/voltage) on a regular basis to prevent it from 1781 * performance state (frequency/voltage) on a regular basis to prevent it from
@@ -1797,13 +1789,23 @@ static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned lo
1797 * but that really is a band-aid. Going forward it should be replaced with 1789 * but that really is a band-aid. Going forward it should be replaced with
1798 * solutions targeted more specifically at RT and DL tasks. 1790 * solutions targeted more specifically at RT and DL tasks.
1799 */ 1791 */
1800static inline void cpufreq_trigger_update(u64 time) 1792static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
1793{
1794 struct update_util_data *data;
1795
1796 data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
1797 if (data)
1798 data->func(data, rq_clock(rq), flags);
1799}
1800
1801static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags)
1801{ 1802{
1802 cpufreq_update_util(time, ULONG_MAX, 0); 1803 if (cpu_of(rq) == smp_processor_id())
1804 cpufreq_update_util(rq, flags);
1803} 1805}
1804#else 1806#else
1805static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {} 1807static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
1806static inline void cpufreq_trigger_update(u64 time) {} 1808static inline void cpufreq_update_this_cpu(struct rq *rq, unsigned int flags) {}
1807#endif /* CONFIG_CPU_FREQ */ 1809#endif /* CONFIG_CPU_FREQ */
1808 1810
1809#ifdef arch_scale_freq_capacity 1811#ifdef arch_scale_freq_capacity
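The reworked hook above hands governors a timestamp plus reason flags through a registered per-CPU callback, instead of pre-digested util/max numbers, and cpufreq_update_this_cpu() absorbs the "only on the local CPU" check that callers such as update_curr_rt() used to open-code. The shape of that interface, modelled with an ordinary function pointer; the flag values and governor_cb() are illustrative, only the SCHED_CPUFREQ_* names come from this diff.

#include <stdio.h>

#define SCHED_CPUFREQ_RT	(1U << 1)	/* values are illustrative */
#define SCHED_CPUFREQ_IOWAIT	(1U << 2)

struct update_util_data {
	void (*func)(struct update_util_data *data, unsigned long long time,
		     unsigned int flags);
};

static struct update_util_data *hook;	/* per-CPU pointer in the kernel */

static void governor_cb(struct update_util_data *data, unsigned long long time,
			unsigned int flags)
{
	(void)data;
	if (flags & (SCHED_CPUFREQ_RT | SCHED_CPUFREQ_IOWAIT))
		printf("t=%llu: go straight to a high frequency (flags=%#x)\n", time, flags);
	else
		printf("t=%llu: recompute frequency from utilization\n", time);
}

static void cpufreq_update_util(unsigned long long now, unsigned int flags)
{
	if (hook)
		hook->func(hook, now, flags);
}

int main(void)
{
	static struct update_util_data gov = { .func = governor_cb };

	hook = &gov;
	cpufreq_update_util(100, 0);
	cpufreq_update_util(200, SCHED_CPUFREQ_RT);
	return 0;
}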
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 78955cbea31c..34659a853505 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -29,11 +29,12 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
29 if (rq) 29 if (rq)
30 rq->rq_sched_info.run_delay += delta; 30 rq->rq_sched_info.run_delay += delta;
31} 31}
32# define schedstat_enabled() static_branch_unlikely(&sched_schedstats) 32#define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
33# define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0) 33#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0)
34# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0) 34#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0)
35# define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) 35#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
36# define schedstat_val(rq, field) ((schedstat_enabled()) ? (rq)->field : 0) 36#define schedstat_val(var) (var)
37#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0)
37 38
38#else /* !CONFIG_SCHEDSTATS */ 39#else /* !CONFIG_SCHEDSTATS */
39static inline void 40static inline void
@@ -45,12 +46,13 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
45static inline void 46static inline void
46rq_sched_info_depart(struct rq *rq, unsigned long long delta) 47rq_sched_info_depart(struct rq *rq, unsigned long long delta)
47{} 48{}
48# define schedstat_enabled() 0 49#define schedstat_enabled() 0
49# define schedstat_inc(rq, field) do { } while (0) 50#define schedstat_inc(var) do { } while (0)
50# define schedstat_add(rq, field, amt) do { } while (0) 51#define schedstat_add(var, amt) do { } while (0)
51# define schedstat_set(var, val) do { } while (0) 52#define schedstat_set(var, val) do { } while (0)
52# define schedstat_val(rq, field) 0 53#define schedstat_val(var) 0
53#endif 54#define schedstat_val_or_zero(var) 0
55#endif /* CONFIG_SCHEDSTATS */
54 56
55#ifdef CONFIG_SCHED_INFO 57#ifdef CONFIG_SCHED_INFO
56static inline void sched_info_reset_dequeued(struct task_struct *t) 58static inline void sched_info_reset_dequeued(struct task_struct *t)
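The stats.h change above makes every schedstat macro take the counter l-value directly (schedstat_inc(p->se.statistics.x) rather than schedstat_inc(p, se.statistics.x)), which is what the call-site churn in fair.c and idle_task.c earlier in this diff is about. A userspace sketch of the enabled-side macros, with a plain global standing in for the sched_schedstats static branch:

#include <stdio.h>

static int schedstats_on = 1;	/* stand-in for static_branch_unlikely(&sched_schedstats) */

#define schedstat_enabled()	(schedstats_on)
#define schedstat_inc(var)	do { if (schedstat_enabled()) { (var)++; } } while (0)
#define schedstat_add(var, amt)	do { if (schedstat_enabled()) { (var) += (amt); } } while (0)
#define schedstat_set(var, val)	do { if (schedstat_enabled()) { (var) = (val); } } while (0)
#define schedstat_val(var)	(var)

struct stats {
	unsigned long long wait_count;
	unsigned long long slice_max;
};

int main(void)
{
	struct stats st = { 0, 0 };
	unsigned long long slice = 1500;

	schedstat_inc(st.wait_count);
	schedstat_add(st.wait_count, 2);
	if (slice > schedstat_val(st.slice_max))
		schedstat_set(st.slice_max, slice);
	printf("wait_count=%llu slice_max=%llu\n", st.wait_count, st.slice_max);

	schedstats_on = 0;			/* disabled: the macros become no-ops */
	schedstat_inc(st.wait_count);
	printf("wait_count=%llu\n", st.wait_count);	/* still 3 */
	return 0;
}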
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index f15d6b6a538a..9453efe9b25a 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -196,27 +196,48 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
196} 196}
197EXPORT_SYMBOL(prepare_to_wait_exclusive); 197EXPORT_SYMBOL(prepare_to_wait_exclusive);
198 198
199long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state) 199void init_wait_entry(wait_queue_t *wait, int flags)
200{ 200{
201 unsigned long flags; 201 wait->flags = flags;
202
203 if (signal_pending_state(state, current))
204 return -ERESTARTSYS;
205
206 wait->private = current; 202 wait->private = current;
207 wait->func = autoremove_wake_function; 203 wait->func = autoremove_wake_function;
204 INIT_LIST_HEAD(&wait->task_list);
205}
206EXPORT_SYMBOL(init_wait_entry);
207
208long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
209{
210 unsigned long flags;
211 long ret = 0;
208 212
209 spin_lock_irqsave(&q->lock, flags); 213 spin_lock_irqsave(&q->lock, flags);
210 if (list_empty(&wait->task_list)) { 214 if (unlikely(signal_pending_state(state, current))) {
211 if (wait->flags & WQ_FLAG_EXCLUSIVE) 215 /*
212 __add_wait_queue_tail(q, wait); 216 * Exclusive waiter must not fail if it was selected by wakeup,
213 else 217 * it should "consume" the condition we were waiting for.
214 __add_wait_queue(q, wait); 218 *
219 * The caller will recheck the condition and return success if
220 * we were already woken up, we can not miss the event because
221 * wakeup locks/unlocks the same q->lock.
222 *
223 * But we need to ensure that set-condition + wakeup after that
224 * can't see us, it should wake up another exclusive waiter if
225 * we fail.
226 */
227 list_del_init(&wait->task_list);
228 ret = -ERESTARTSYS;
229 } else {
230 if (list_empty(&wait->task_list)) {
231 if (wait->flags & WQ_FLAG_EXCLUSIVE)
232 __add_wait_queue_tail(q, wait);
233 else
234 __add_wait_queue(q, wait);
235 }
236 set_current_state(state);
215 } 237 }
216 set_current_state(state);
217 spin_unlock_irqrestore(&q->lock, flags); 238 spin_unlock_irqrestore(&q->lock, flags);
218 239
219 return 0; 240 return ret;
220} 241}
221EXPORT_SYMBOL(prepare_to_wait_event); 242EXPORT_SYMBOL(prepare_to_wait_event);
222 243
@@ -255,39 +276,6 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
255} 276}
256EXPORT_SYMBOL(finish_wait); 277EXPORT_SYMBOL(finish_wait);
257 278
258/**
259 * abort_exclusive_wait - abort exclusive waiting in a queue
260 * @q: waitqueue waited on
261 * @wait: wait descriptor
262 * @mode: runstate of the waiter to be woken
263 * @key: key to identify a wait bit queue or %NULL
264 *
265 * Sets current thread back to running state and removes
266 * the wait descriptor from the given waitqueue if still
267 * queued.
268 *
269 * Wakes up the next waiter if the caller is concurrently
270 * woken up through the queue.
271 *
272 * This prevents waiter starvation where an exclusive waiter
273 * aborts and is woken up concurrently and no one wakes up
274 * the next waiter.
275 */
276void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
277 unsigned int mode, void *key)
278{
279 unsigned long flags;
280
281 __set_current_state(TASK_RUNNING);
282 spin_lock_irqsave(&q->lock, flags);
283 if (!list_empty(&wait->task_list))
284 list_del_init(&wait->task_list);
285 else if (waitqueue_active(q))
286 __wake_up_locked_key(q, mode, key);
287 spin_unlock_irqrestore(&q->lock, flags);
288}
289EXPORT_SYMBOL(abort_exclusive_wait);
290
291int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) 279int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
292{ 280{
293 int ret = default_wake_function(wait, mode, sync, key); 281 int ret = default_wake_function(wait, mode, sync, key);
@@ -425,20 +413,29 @@ int __sched
425__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, 413__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
426 wait_bit_action_f *action, unsigned mode) 414 wait_bit_action_f *action, unsigned mode)
427{ 415{
428 do { 416 int ret = 0;
429 int ret;
430 417
418 for (;;) {
431 prepare_to_wait_exclusive(wq, &q->wait, mode); 419 prepare_to_wait_exclusive(wq, &q->wait, mode);
432 if (!test_bit(q->key.bit_nr, q->key.flags)) 420 if (test_bit(q->key.bit_nr, q->key.flags)) {
433 continue; 421 ret = action(&q->key, mode);
434 ret = action(&q->key, mode); 422 /*
435 if (!ret) 423 * See the comment in prepare_to_wait_event().
436 continue; 424 * finish_wait() does not necessarily takes wq->lock,
437 abort_exclusive_wait(wq, &q->wait, mode, &q->key); 425 * finish_wait() does not necessarily take wq->lock,
438 return ret; 426 * smp_mb__after_atomic() before wake_up_page().
439 } while (test_and_set_bit(q->key.bit_nr, q->key.flags)); 427 */
440 finish_wait(wq, &q->wait); 428 if (ret)
441 return 0; 429 finish_wait(wq, &q->wait);
430 }
431 if (!test_and_set_bit(q->key.bit_nr, q->key.flags)) {
432 if (!ret)
433 finish_wait(wq, &q->wait);
434 return 0;
435 } else if (ret) {
436 return ret;
437 }
438 }
442} 439}
443EXPORT_SYMBOL(__wait_on_bit_lock); 440EXPORT_SYMBOL(__wait_on_bit_lock);
444 441
@@ -483,16 +480,6 @@ void wake_up_bit(void *word, int bit)
483} 480}
484EXPORT_SYMBOL(wake_up_bit); 481EXPORT_SYMBOL(wake_up_bit);
485 482
486wait_queue_head_t *bit_waitqueue(void *word, int bit)
487{
488 const int shift = BITS_PER_LONG == 32 ? 5 : 6;
489 const struct zone *zone = page_zone(virt_to_page(word));
490 unsigned long val = (unsigned long)word << shift | bit;
491
492 return &zone->wait_table[hash_long(val, zone->wait_table_bits)];
493}
494EXPORT_SYMBOL(bit_waitqueue);
495
496/* 483/*
497 * Manipulate the atomic_t address to produce a better bit waitqueue table hash 484 * Manipulate the atomic_t address to produce a better bit waitqueue table hash
498 * index (we're keying off bit -1, but that would produce a horrible hash 485 * index (we're keying off bit -1, but that would produce a horrible hash