author	Linus Torvalds <torvalds@linux-foundation.org>	2018-08-13 14:25:07 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2018-08-13 14:25:07 -0400
commit	f7951c33f0fed14ee26651a70a46899a59a31e18 (patch)
tree	dff372035ceaa7b3a01e2f15c885ff0ff2510e68
parent	2406fb8d94fb17fee3ace0c09427c08825eacb16 (diff)
parent	1b6266ebe3da8198e9a02fbad77bbb56e2f7ce2e (diff)
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Thomas Gleixner:

 - Cleanup and improvement of NUMA balancing

 - Refactoring and improvements to the PELT (Per Entity Load Tracking)
   code

 - Watchdog simplification and related cleanups

 - The usual pile of small incremental fixes and improvements

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (41 commits)
  watchdog: Reduce message verbosity
  stop_machine: Reflow cpu_stop_queue_two_works()
  sched/numa: Move task_numa_placement() closer to numa_migrate_preferred()
  sched/numa: Use group_weights to identify if migration degrades locality
  sched/numa: Update the scan period without holding the numa_group lock
  sched/numa: Remove numa_has_capacity()
  sched/numa: Modify migrate_swap() to accept additional parameters
  sched/numa: Remove unused task_capacity from 'struct numa_stats'
  sched/numa: Skip nodes that are at 'hoplimit'
  sched/debug: Reverse the order of printing faults
  sched/numa: Use task faults only if numa_group is not yet set up
  sched/numa: Set preferred_node based on best_cpu
  sched/numa: Simplify load_too_imbalanced()
  sched/numa: Evaluate move once per node
  sched/numa: Remove redundant field
  sched/debug: Show the sum wait time of a task group
  sched/fair: Remove #ifdefs from scale_rt_capacity()
  sched/core: Remove get_cpu() from sched_fork()
  sched/cpufreq: Clarify sugov_get_util()
  sched/sysctl: Remove unused sched_time_avg_ms sysctl
  ...
-rw-r--r--  arch/mips/kvm/mips.c | 4
-rw-r--r--  arch/powerpc/kvm/book3s_hv.c | 6
-rw-r--r--  arch/s390/kvm/interrupt.c | 2
-rw-r--r--  arch/x86/kernel/kvm.c | 4
-rw-r--r--  arch/x86/kvm/lapic.c | 2
-rw-r--r--  include/linux/cpuhotplug.h | 1
-rw-r--r--  include/linux/nmi.h | 10
-rw-r--r--  include/linux/sched.h | 1
-rw-r--r--  include/linux/sched/sysctl.h | 1
-rw-r--r--  include/linux/smpboot.h | 15
-rw-r--r--  include/linux/swait.h | 36
-rw-r--r--  kernel/cpu.c | 5
-rw-r--r--  kernel/kthread.c | 6
-rw-r--r--  kernel/power/suspend.c | 4
-rw-r--r--  kernel/rcu/srcutiny.c | 4
-rw-r--r--  kernel/rcu/tree.c | 8
-rw-r--r--  kernel/rcu/tree_exp.h | 4
-rw-r--r--  kernel/rcu/tree_plugin.h | 12
-rw-r--r--  kernel/sched/Makefile | 2
-rw-r--r--  kernel/sched/core.c | 72
-rw-r--r--  kernel/sched/cpufreq_schedutil.c | 103
-rw-r--r--  kernel/sched/deadline.c | 8
-rw-r--r--  kernel/sched/debug.c | 35
-rw-r--r--  kernel/sched/fair.c | 663
-rw-r--r--  kernel/sched/pelt.c | 399
-rw-r--r--  kernel/sched/pelt.h | 72
-rw-r--r--  kernel/sched/rt.c | 15
-rw-r--r--  kernel/sched/sched.h | 87
-rw-r--r--  kernel/sched/swait.c | 32
-rw-r--r--  kernel/smpboot.c | 54
-rw-r--r--  kernel/stop_machine.c | 41
-rw-r--r--  kernel/sysctl.c | 8
-rw-r--r--  kernel/watchdog.c | 147
-rw-r--r--  kernel/watchdog_hld.c | 4
-rw-r--r--  virt/kvm/arm/arm.c | 4
-rw-r--r--  virt/kvm/arm/psci.c | 2
-rw-r--r--  virt/kvm/async_pf.c | 2
-rw-r--r--  virt/kvm/kvm_main.c | 4
38 files changed, 1009 insertions(+), 870 deletions(-)
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 7cd76f93a438..f7ea8e21656b 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -515,7 +515,7 @@ int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
515 dvcpu->arch.wait = 0; 515 dvcpu->arch.wait = 0;
516 516
517 if (swq_has_sleeper(&dvcpu->wq)) 517 if (swq_has_sleeper(&dvcpu->wq))
518 swake_up(&dvcpu->wq); 518 swake_up_one(&dvcpu->wq);
519 519
520 return 0; 520 return 0;
521} 521}
@@ -1204,7 +1204,7 @@ static void kvm_mips_comparecount_func(unsigned long data)
1204 1204
1205 vcpu->arch.wait = 0; 1205 vcpu->arch.wait = 0;
1206 if (swq_has_sleeper(&vcpu->wq)) 1206 if (swq_has_sleeper(&vcpu->wq))
1207 swake_up(&vcpu->wq); 1207 swake_up_one(&vcpu->wq);
1208} 1208}
1209 1209
1210/* low level hrtimer wake routine */ 1210/* low level hrtimer wake routine */
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index de686b340f4a..ee4a8854985e 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -216,7 +216,7 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
216 216
217 wqp = kvm_arch_vcpu_wq(vcpu); 217 wqp = kvm_arch_vcpu_wq(vcpu);
218 if (swq_has_sleeper(wqp)) { 218 if (swq_has_sleeper(wqp)) {
219 swake_up(wqp); 219 swake_up_one(wqp);
220 ++vcpu->stat.halt_wakeup; 220 ++vcpu->stat.halt_wakeup;
221 } 221 }
222 222
@@ -3188,7 +3188,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
3188 } 3188 }
3189 } 3189 }
3190 3190
3191 prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE); 3191 prepare_to_swait_exclusive(&vc->wq, &wait, TASK_INTERRUPTIBLE);
3192 3192
3193 if (kvmppc_vcore_check_block(vc)) { 3193 if (kvmppc_vcore_check_block(vc)) {
3194 finish_swait(&vc->wq, &wait); 3194 finish_swait(&vc->wq, &wait);
@@ -3311,7 +3311,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3311 kvmppc_start_thread(vcpu, vc); 3311 kvmppc_start_thread(vcpu, vc);
3312 trace_kvm_guest_enter(vcpu); 3312 trace_kvm_guest_enter(vcpu);
3313 } else if (vc->vcore_state == VCORE_SLEEPING) { 3313 } else if (vc->vcore_state == VCORE_SLEEPING) {
3314 swake_up(&vc->wq); 3314 swake_up_one(&vc->wq);
3315 } 3315 }
3316 3316
3317 } 3317 }
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index daa09f89ca2d..fcb55b02990e 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -1145,7 +1145,7 @@ void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
1145 * yield-candidate. 1145 * yield-candidate.
1146 */ 1146 */
1147 vcpu->preempted = true; 1147 vcpu->preempted = true;
1148 swake_up(&vcpu->wq); 1148 swake_up_one(&vcpu->wq);
1149 vcpu->stat.halt_wakeup++; 1149 vcpu->stat.halt_wakeup++;
1150 } 1150 }
1151 /* 1151 /*
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 5b2300b818af..a37bda38d205 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -154,7 +154,7 @@ void kvm_async_pf_task_wait(u32 token, int interrupt_kernel)
154 154
155 for (;;) { 155 for (;;) {
156 if (!n.halted) 156 if (!n.halted)
157 prepare_to_swait(&n.wq, &wait, TASK_UNINTERRUPTIBLE); 157 prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
158 if (hlist_unhashed(&n.link)) 158 if (hlist_unhashed(&n.link))
159 break; 159 break;
160 160
@@ -188,7 +188,7 @@ static void apf_task_wake_one(struct kvm_task_sleep_node *n)
188 if (n->halted) 188 if (n->halted)
189 smp_send_reschedule(n->cpu); 189 smp_send_reschedule(n->cpu);
190 else if (swq_has_sleeper(&n->wq)) 190 else if (swq_has_sleeper(&n->wq))
191 swake_up(&n->wq); 191 swake_up_one(&n->wq);
192} 192}
193 193
194static void apf_task_wake_all(void) 194static void apf_task_wake_all(void)
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index b5cd8465d44f..d536d457517b 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1379,7 +1379,7 @@ static void apic_timer_expired(struct kvm_lapic *apic)
1379 * using swait_active() is safe. 1379 * using swait_active() is safe.
1380 */ 1380 */
1381 if (swait_active(q)) 1381 if (swait_active(q))
1382 swake_up(q); 1382 swake_up_one(q);
1383 1383
1384 if (apic_lvtt_tscdeadline(apic)) 1384 if (apic_lvtt_tscdeadline(apic))
1385 ktimer->expired_tscdeadline = ktimer->tscdeadline; 1385 ktimer->expired_tscdeadline = ktimer->tscdeadline;
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 8796ba387152..4cf06a64bc02 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -164,6 +164,7 @@ enum cpuhp_state {
164 CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE, 164 CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE,
165 CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE, 165 CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE,
166 CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE, 166 CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE,
167 CPUHP_AP_WATCHDOG_ONLINE,
167 CPUHP_AP_WORKQUEUE_ONLINE, 168 CPUHP_AP_WORKQUEUE_ONLINE,
168 CPUHP_AP_RCUTREE_ONLINE, 169 CPUHP_AP_RCUTREE_ONLINE,
169 CPUHP_AP_ONLINE_DYN, 170 CPUHP_AP_ONLINE_DYN,
diff --git a/include/linux/nmi.h b/include/linux/nmi.h
index b8d868d23e79..08f9247e9827 100644
--- a/include/linux/nmi.h
+++ b/include/linux/nmi.h
@@ -45,12 +45,18 @@ extern void touch_softlockup_watchdog(void);
45extern void touch_softlockup_watchdog_sync(void); 45extern void touch_softlockup_watchdog_sync(void);
46extern void touch_all_softlockup_watchdogs(void); 46extern void touch_all_softlockup_watchdogs(void);
47extern unsigned int softlockup_panic; 47extern unsigned int softlockup_panic;
48#else 48
49extern int lockup_detector_online_cpu(unsigned int cpu);
50extern int lockup_detector_offline_cpu(unsigned int cpu);
51#else /* CONFIG_SOFTLOCKUP_DETECTOR */
49static inline void touch_softlockup_watchdog_sched(void) { } 52static inline void touch_softlockup_watchdog_sched(void) { }
50static inline void touch_softlockup_watchdog(void) { } 53static inline void touch_softlockup_watchdog(void) { }
51static inline void touch_softlockup_watchdog_sync(void) { } 54static inline void touch_softlockup_watchdog_sync(void) { }
52static inline void touch_all_softlockup_watchdogs(void) { } 55static inline void touch_all_softlockup_watchdogs(void) { }
53#endif 56
57#define lockup_detector_online_cpu NULL
58#define lockup_detector_offline_cpu NULL
59#endif /* CONFIG_SOFTLOCKUP_DETECTOR */
54 60
55#ifdef CONFIG_DETECT_HUNG_TASK 61#ifdef CONFIG_DETECT_HUNG_TASK
56void reset_hung_task_detector(void); 62void reset_hung_task_detector(void);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 43731fe51c97..e0f4f56c9310 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1017,7 +1017,6 @@ struct task_struct {
1017 u64 last_sum_exec_runtime; 1017 u64 last_sum_exec_runtime;
1018 struct callback_head numa_work; 1018 struct callback_head numa_work;
1019 1019
1020 struct list_head numa_entry;
1021 struct numa_group *numa_group; 1020 struct numa_group *numa_group;
1022 1021
1023 /* 1022 /*
diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h
index 1c1a1512ec55..913488d828cb 100644
--- a/include/linux/sched/sysctl.h
+++ b/include/linux/sched/sysctl.h
@@ -40,7 +40,6 @@ extern unsigned int sysctl_numa_balancing_scan_size;
40#ifdef CONFIG_SCHED_DEBUG 40#ifdef CONFIG_SCHED_DEBUG
41extern __read_mostly unsigned int sysctl_sched_migration_cost; 41extern __read_mostly unsigned int sysctl_sched_migration_cost;
42extern __read_mostly unsigned int sysctl_sched_nr_migrate; 42extern __read_mostly unsigned int sysctl_sched_nr_migrate;
43extern __read_mostly unsigned int sysctl_sched_time_avg;
44 43
45int sched_proc_update_handler(struct ctl_table *table, int write, 44int sched_proc_update_handler(struct ctl_table *table, int write,
46 void __user *buffer, size_t *length, 45 void __user *buffer, size_t *length,
diff --git a/include/linux/smpboot.h b/include/linux/smpboot.h
index c174844cf663..d0884b525001 100644
--- a/include/linux/smpboot.h
+++ b/include/linux/smpboot.h
@@ -25,8 +25,6 @@ struct smpboot_thread_data;
25 * parked (cpu offline) 25 * parked (cpu offline)
26 * @unpark: Optional unpark function, called when the thread is 26 * @unpark: Optional unpark function, called when the thread is
27 * unparked (cpu online) 27 * unparked (cpu online)
28 * @cpumask: Internal state. To update which threads are unparked,
29 * call smpboot_update_cpumask_percpu_thread().
30 * @selfparking: Thread is not parked by the park function. 28 * @selfparking: Thread is not parked by the park function.
31 * @thread_comm: The base name of the thread 29 * @thread_comm: The base name of the thread
32 */ 30 */
@@ -40,23 +38,12 @@ struct smp_hotplug_thread {
40 void (*cleanup)(unsigned int cpu, bool online); 38 void (*cleanup)(unsigned int cpu, bool online);
41 void (*park)(unsigned int cpu); 39 void (*park)(unsigned int cpu);
42 void (*unpark)(unsigned int cpu); 40 void (*unpark)(unsigned int cpu);
43 cpumask_var_t cpumask;
44 bool selfparking; 41 bool selfparking;
45 const char *thread_comm; 42 const char *thread_comm;
46}; 43};
47 44
48int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread, 45int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread);
49 const struct cpumask *cpumask);
50
51static inline int
52smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
53{
54 return smpboot_register_percpu_thread_cpumask(plug_thread,
55 cpu_possible_mask);
56}
57 46
58void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread); 47void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread);
59void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
60 const struct cpumask *);
61 48
62#endif 49#endif
diff --git a/include/linux/swait.h b/include/linux/swait.h
index bf8cb0dee23c..73e06e9986d4 100644
--- a/include/linux/swait.h
+++ b/include/linux/swait.h
@@ -16,7 +16,7 @@
16 * wait-queues, but the semantics are actually completely different, and 16 * wait-queues, but the semantics are actually completely different, and
17 * every single user we have ever had has been buggy (or pointless). 17 * every single user we have ever had has been buggy (or pointless).
18 * 18 *
19 * A "swake_up()" only wakes up _one_ waiter, which is not at all what 19 * A "swake_up_one()" only wakes up _one_ waiter, which is not at all what
20 * "wake_up()" does, and has led to problems. In other cases, it has 20 * "wake_up()" does, and has led to problems. In other cases, it has
21 * been fine, because there's only ever one waiter (kvm), but in that 21 * been fine, because there's only ever one waiter (kvm), but in that
22 * case gthe whole "simple" wait-queue is just pointless to begin with, 22 * case gthe whole "simple" wait-queue is just pointless to begin with,
@@ -38,8 +38,8 @@
38 * all wakeups are TASK_NORMAL in order to avoid O(n) lookups for the right 38 * all wakeups are TASK_NORMAL in order to avoid O(n) lookups for the right
39 * sleeper state. 39 * sleeper state.
40 * 40 *
41 * - the exclusive mode; because this requires preserving the list order 41 * - the !exclusive mode; because that leads to O(n) wakeups, everything is
42 * and this is hard. 42 * exclusive.
43 * 43 *
44 * - custom wake callback functions; because you cannot give any guarantees 44 * - custom wake callback functions; because you cannot give any guarantees
45 * about random code. This also allows swait to be used in RT, such that 45 * about random code. This also allows swait to be used in RT, such that
@@ -115,7 +115,7 @@ extern void __init_swait_queue_head(struct swait_queue_head *q, const char *name
115 * CPU0 - waker CPU1 - waiter 115 * CPU0 - waker CPU1 - waiter
116 * 116 *
117 * for (;;) { 117 * for (;;) {
118 * @cond = true; prepare_to_swait(&wq_head, &wait, state); 118 * @cond = true; prepare_to_swait_exclusive(&wq_head, &wait, state);
119 * smp_mb(); // smp_mb() from set_current_state() 119 * smp_mb(); // smp_mb() from set_current_state()
120 * if (swait_active(wq_head)) if (@cond) 120 * if (swait_active(wq_head)) if (@cond)
121 * wake_up(wq_head); break; 121 * wake_up(wq_head); break;
@@ -157,20 +157,20 @@ static inline bool swq_has_sleeper(struct swait_queue_head *wq)
157 return swait_active(wq); 157 return swait_active(wq);
158} 158}
159 159
160extern void swake_up(struct swait_queue_head *q); 160extern void swake_up_one(struct swait_queue_head *q);
161extern void swake_up_all(struct swait_queue_head *q); 161extern void swake_up_all(struct swait_queue_head *q);
162extern void swake_up_locked(struct swait_queue_head *q); 162extern void swake_up_locked(struct swait_queue_head *q);
163 163
164extern void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait); 164extern void prepare_to_swait_exclusive(struct swait_queue_head *q, struct swait_queue *wait, int state);
165extern void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state);
166extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state); 165extern long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state);
167 166
168extern void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait); 167extern void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
169extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait); 168extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
170 169
171/* as per ___wait_event() but for swait, therefore "exclusive == 0" */ 170/* as per ___wait_event() but for swait, therefore "exclusive == 1" */
172#define ___swait_event(wq, condition, state, ret, cmd) \ 171#define ___swait_event(wq, condition, state, ret, cmd) \
173({ \ 172({ \
173 __label__ __out; \
174 struct swait_queue __wait; \ 174 struct swait_queue __wait; \
175 long __ret = ret; \ 175 long __ret = ret; \
176 \ 176 \
@@ -183,20 +183,20 @@ extern void finish_swait(struct swait_queue_head *q, struct swait_queue *wait);
183 \ 183 \
184 if (___wait_is_interruptible(state) && __int) { \ 184 if (___wait_is_interruptible(state) && __int) { \
185 __ret = __int; \ 185 __ret = __int; \
186 break; \ 186 goto __out; \
187 } \ 187 } \
188 \ 188 \
189 cmd; \ 189 cmd; \
190 } \ 190 } \
191 finish_swait(&wq, &__wait); \ 191 finish_swait(&wq, &__wait); \
192 __ret; \ 192__out: __ret; \
193}) 193})
194 194
195#define __swait_event(wq, condition) \ 195#define __swait_event(wq, condition) \
196 (void)___swait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, \ 196 (void)___swait_event(wq, condition, TASK_UNINTERRUPTIBLE, 0, \
197 schedule()) 197 schedule())
198 198
199#define swait_event(wq, condition) \ 199#define swait_event_exclusive(wq, condition) \
200do { \ 200do { \
201 if (condition) \ 201 if (condition) \
202 break; \ 202 break; \
@@ -208,7 +208,7 @@ do { \
208 TASK_UNINTERRUPTIBLE, timeout, \ 208 TASK_UNINTERRUPTIBLE, timeout, \
209 __ret = schedule_timeout(__ret)) 209 __ret = schedule_timeout(__ret))
210 210
211#define swait_event_timeout(wq, condition, timeout) \ 211#define swait_event_timeout_exclusive(wq, condition, timeout) \
212({ \ 212({ \
213 long __ret = timeout; \ 213 long __ret = timeout; \
214 if (!___wait_cond_timeout(condition)) \ 214 if (!___wait_cond_timeout(condition)) \
@@ -220,7 +220,7 @@ do { \
220 ___swait_event(wq, condition, TASK_INTERRUPTIBLE, 0, \ 220 ___swait_event(wq, condition, TASK_INTERRUPTIBLE, 0, \
221 schedule()) 221 schedule())
222 222
223#define swait_event_interruptible(wq, condition) \ 223#define swait_event_interruptible_exclusive(wq, condition) \
224({ \ 224({ \
225 int __ret = 0; \ 225 int __ret = 0; \
226 if (!(condition)) \ 226 if (!(condition)) \
@@ -233,7 +233,7 @@ do { \
233 TASK_INTERRUPTIBLE, timeout, \ 233 TASK_INTERRUPTIBLE, timeout, \
234 __ret = schedule_timeout(__ret)) 234 __ret = schedule_timeout(__ret))
235 235
236#define swait_event_interruptible_timeout(wq, condition, timeout) \ 236#define swait_event_interruptible_timeout_exclusive(wq, condition, timeout)\
237({ \ 237({ \
238 long __ret = timeout; \ 238 long __ret = timeout; \
239 if (!___wait_cond_timeout(condition)) \ 239 if (!___wait_cond_timeout(condition)) \
@@ -246,7 +246,7 @@ do { \
246 (void)___swait_event(wq, condition, TASK_IDLE, 0, schedule()) 246 (void)___swait_event(wq, condition, TASK_IDLE, 0, schedule())
247 247
248/** 248/**
249 * swait_event_idle - wait without system load contribution 249 * swait_event_idle_exclusive - wait without system load contribution
250 * @wq: the waitqueue to wait on 250 * @wq: the waitqueue to wait on
251 * @condition: a C expression for the event to wait for 251 * @condition: a C expression for the event to wait for
252 * 252 *
@@ -257,7 +257,7 @@ do { \
257 * condition and doesn't want to contribute to system load. Signals are 257 * condition and doesn't want to contribute to system load. Signals are
258 * ignored. 258 * ignored.
259 */ 259 */
260#define swait_event_idle(wq, condition) \ 260#define swait_event_idle_exclusive(wq, condition) \
261do { \ 261do { \
262 if (condition) \ 262 if (condition) \
263 break; \ 263 break; \
@@ -270,7 +270,7 @@ do { \
270 __ret = schedule_timeout(__ret)) 270 __ret = schedule_timeout(__ret))
271 271
272/** 272/**
273 * swait_event_idle_timeout - wait up to timeout without load contribution 273 * swait_event_idle_timeout_exclusive - wait up to timeout without load contribution
274 * @wq: the waitqueue to wait on 274 * @wq: the waitqueue to wait on
275 * @condition: a C expression for the event to wait for 275 * @condition: a C expression for the event to wait for
276 * @timeout: timeout at which we'll give up in jiffies 276 * @timeout: timeout at which we'll give up in jiffies
@@ -288,7 +288,7 @@ do { \
288 * or the remaining jiffies (at least 1) if the @condition evaluated 288 * or the remaining jiffies (at least 1) if the @condition evaluated
289 * to %true before the @timeout elapsed. 289 * to %true before the @timeout elapsed.
290 */ 290 */
291#define swait_event_idle_timeout(wq, condition, timeout) \ 291#define swait_event_idle_timeout_exclusive(wq, condition, timeout) \
292({ \ 292({ \
293 long __ret = timeout; \ 293 long __ret = timeout; \
294 if (!___wait_cond_timeout(condition)) \ 294 if (!___wait_cond_timeout(condition)) \
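For reference, the renamed swait API above pairs up as in the following minimal sketch, assuming a hypothetical single-waiter user; the queue head, flag and function names are illustrative and not taken from this series:

	#include <linux/swait.h>

	static DECLARE_SWAIT_QUEUE_HEAD(demo_wq);	/* hypothetical queue */
	static bool demo_cond;

	/* Waiter side: there is only ever one sleeper, which is what swait is for. */
	static int demo_wait(void)
	{
		/* returns 0 once demo_cond is true, -ERESTARTSYS on signal */
		return swait_event_interruptible_exclusive(demo_wq, READ_ONCE(demo_cond));
	}

	/* Waker side: publish the condition, then wake the single sleeper. */
	static void demo_wake(void)
	{
		WRITE_ONCE(demo_cond, true);
		swake_up_one(&demo_wq);		/* was swake_up() before this series */
	}

The _exclusive/_one naming matches the semantics documented in the header comment: every swait waiter is an exclusive waiter, so a wake-up only ever wakes one task.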
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2f8f338e77cf..15be70aae8ac 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1344,6 +1344,11 @@ static struct cpuhp_step cpuhp_hp_states[] = {
1344 .startup.single = perf_event_init_cpu, 1344 .startup.single = perf_event_init_cpu,
1345 .teardown.single = perf_event_exit_cpu, 1345 .teardown.single = perf_event_exit_cpu,
1346 }, 1346 },
1347 [CPUHP_AP_WATCHDOG_ONLINE] = {
1348 .name = "lockup_detector:online",
1349 .startup.single = lockup_detector_online_cpu,
1350 .teardown.single = lockup_detector_offline_cpu,
1351 },
1347 [CPUHP_AP_WORKQUEUE_ONLINE] = { 1352 [CPUHP_AP_WORKQUEUE_ONLINE] = {
1348 .name = "workqueue:online", 1353 .name = "workqueue:online",
1349 .startup.single = workqueue_online_cpu, 1354 .startup.single = workqueue_online_cpu,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 486dedbd9af5..087d18d771b5 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -190,7 +190,7 @@ static void __kthread_parkme(struct kthread *self)
190 if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags)) 190 if (!test_bit(KTHREAD_SHOULD_PARK, &self->flags))
191 break; 191 break;
192 192
193 complete_all(&self->parked); 193 complete(&self->parked);
194 schedule(); 194 schedule();
195 } 195 }
196 __set_current_state(TASK_RUNNING); 196 __set_current_state(TASK_RUNNING);
@@ -471,7 +471,6 @@ void kthread_unpark(struct task_struct *k)
471 if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) 471 if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
472 __kthread_bind(k, kthread->cpu, TASK_PARKED); 472 __kthread_bind(k, kthread->cpu, TASK_PARKED);
473 473
474 reinit_completion(&kthread->parked);
475 clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); 474 clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
476 /* 475 /*
477 * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup. 476 * __kthread_parkme() will either see !SHOULD_PARK or get the wakeup.
@@ -499,6 +498,9 @@ int kthread_park(struct task_struct *k)
499 if (WARN_ON(k->flags & PF_EXITING)) 498 if (WARN_ON(k->flags & PF_EXITING))
500 return -ENOSYS; 499 return -ENOSYS;
501 500
501 if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags)))
502 return -EBUSY;
503
502 set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); 504 set_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
503 if (k != current) { 505 if (k != current) {
504 wake_up_process(k); 506 wake_up_process(k);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 87331565e505..70178f6ffdc4 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -92,7 +92,7 @@ static void s2idle_enter(void)
92 /* Push all the CPUs into the idle loop. */ 92 /* Push all the CPUs into the idle loop. */
93 wake_up_all_idle_cpus(); 93 wake_up_all_idle_cpus();
94 /* Make the current CPU wait so it can enter the idle loop too. */ 94 /* Make the current CPU wait so it can enter the idle loop too. */
95 swait_event(s2idle_wait_head, 95 swait_event_exclusive(s2idle_wait_head,
96 s2idle_state == S2IDLE_STATE_WAKE); 96 s2idle_state == S2IDLE_STATE_WAKE);
97 97
98 cpuidle_pause(); 98 cpuidle_pause();
@@ -160,7 +160,7 @@ void s2idle_wake(void)
160 raw_spin_lock_irqsave(&s2idle_lock, flags); 160 raw_spin_lock_irqsave(&s2idle_lock, flags);
161 if (s2idle_state > S2IDLE_STATE_NONE) { 161 if (s2idle_state > S2IDLE_STATE_NONE) {
162 s2idle_state = S2IDLE_STATE_WAKE; 162 s2idle_state = S2IDLE_STATE_WAKE;
163 swake_up(&s2idle_wait_head); 163 swake_up_one(&s2idle_wait_head);
164 } 164 }
165 raw_spin_unlock_irqrestore(&s2idle_lock, flags); 165 raw_spin_unlock_irqrestore(&s2idle_lock, flags);
166} 166}
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 622792abe41a..04fc2ed71af8 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -110,7 +110,7 @@ void __srcu_read_unlock(struct srcu_struct *sp, int idx)
110 110
111 WRITE_ONCE(sp->srcu_lock_nesting[idx], newval); 111 WRITE_ONCE(sp->srcu_lock_nesting[idx], newval);
112 if (!newval && READ_ONCE(sp->srcu_gp_waiting)) 112 if (!newval && READ_ONCE(sp->srcu_gp_waiting))
113 swake_up(&sp->srcu_wq); 113 swake_up_one(&sp->srcu_wq);
114} 114}
115EXPORT_SYMBOL_GPL(__srcu_read_unlock); 115EXPORT_SYMBOL_GPL(__srcu_read_unlock);
116 116
@@ -140,7 +140,7 @@ void srcu_drive_gp(struct work_struct *wp)
140 idx = sp->srcu_idx; 140 idx = sp->srcu_idx;
141 WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx); 141 WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx);
142 WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ 142 WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */
143 swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx])); 143 swait_event_exclusive(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx]));
144 WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ 144 WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
145 145
146 /* Invoke the callbacks we removed above. */ 146 /* Invoke the callbacks we removed above. */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 6930934e8b9f..0b760c1369f7 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1701,7 +1701,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
1701 !READ_ONCE(rsp->gp_flags) || 1701 !READ_ONCE(rsp->gp_flags) ||
1702 !rsp->gp_kthread) 1702 !rsp->gp_kthread)
1703 return; 1703 return;
1704 swake_up(&rsp->gp_wq); 1704 swake_up_one(&rsp->gp_wq);
1705} 1705}
1706 1706
1707/* 1707/*
@@ -2015,7 +2015,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
2015} 2015}
2016 2016
2017/* 2017/*
2018 * Helper function for swait_event_idle() wakeup at force-quiescent-state 2018 * Helper function for swait_event_idle_exclusive() wakeup at force-quiescent-state
2019 * time. 2019 * time.
2020 */ 2020 */
2021static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) 2021static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)
@@ -2163,7 +2163,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
2163 READ_ONCE(rsp->gp_seq), 2163 READ_ONCE(rsp->gp_seq),
2164 TPS("reqwait")); 2164 TPS("reqwait"));
2165 rsp->gp_state = RCU_GP_WAIT_GPS; 2165 rsp->gp_state = RCU_GP_WAIT_GPS;
2166 swait_event_idle(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & 2166 swait_event_idle_exclusive(rsp->gp_wq, READ_ONCE(rsp->gp_flags) &
2167 RCU_GP_FLAG_INIT); 2167 RCU_GP_FLAG_INIT);
2168 rsp->gp_state = RCU_GP_DONE_GPS; 2168 rsp->gp_state = RCU_GP_DONE_GPS;
2169 /* Locking provides needed memory barrier. */ 2169 /* Locking provides needed memory barrier. */
@@ -2191,7 +2191,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
2191 READ_ONCE(rsp->gp_seq), 2191 READ_ONCE(rsp->gp_seq),
2192 TPS("fqswait")); 2192 TPS("fqswait"));
2193 rsp->gp_state = RCU_GP_WAIT_FQS; 2193 rsp->gp_state = RCU_GP_WAIT_FQS;
2194 ret = swait_event_idle_timeout(rsp->gp_wq, 2194 ret = swait_event_idle_timeout_exclusive(rsp->gp_wq,
2195 rcu_gp_fqs_check_wake(rsp, &gf), j); 2195 rcu_gp_fqs_check_wake(rsp, &gf), j);
2196 rsp->gp_state = RCU_GP_DOING_FQS; 2196 rsp->gp_state = RCU_GP_DOING_FQS;
2197 /* Locking provides needed memory barriers. */ 2197 /* Locking provides needed memory barriers. */
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index b3df3b770afb..0b2c2ad69629 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -212,7 +212,7 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
212 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 212 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
213 if (wake) { 213 if (wake) {
214 smp_mb(); /* EGP done before wake_up(). */ 214 smp_mb(); /* EGP done before wake_up(). */
215 swake_up(&rsp->expedited_wq); 215 swake_up_one(&rsp->expedited_wq);
216 } 216 }
217 break; 217 break;
218 } 218 }
@@ -526,7 +526,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
526 jiffies_start = jiffies; 526 jiffies_start = jiffies;
527 527
528 for (;;) { 528 for (;;) {
529 ret = swait_event_timeout( 529 ret = swait_event_timeout_exclusive(
530 rsp->expedited_wq, 530 rsp->expedited_wq,
531 sync_rcu_preempt_exp_done_unlocked(rnp_root), 531 sync_rcu_preempt_exp_done_unlocked(rnp_root),
532 jiffies_stall); 532 jiffies_stall);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index c1b17f5b9361..a97c20ea9bce 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1926,8 +1926,8 @@ static void __wake_nocb_leader(struct rcu_data *rdp, bool force,
1926 WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); 1926 WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
1927 del_timer(&rdp->nocb_timer); 1927 del_timer(&rdp->nocb_timer);
1928 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); 1928 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
1929 smp_mb(); /* ->nocb_leader_sleep before swake_up(). */ 1929 smp_mb(); /* ->nocb_leader_sleep before swake_up_one(). */
1930 swake_up(&rdp_leader->nocb_wq); 1930 swake_up_one(&rdp_leader->nocb_wq);
1931 } else { 1931 } else {
1932 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); 1932 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
1933 } 1933 }
@@ -2159,7 +2159,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2159 */ 2159 */
2160 trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait")); 2160 trace_rcu_this_gp(rnp, rdp, c, TPS("StartWait"));
2161 for (;;) { 2161 for (;;) {
2162 swait_event_interruptible( 2162 swait_event_interruptible_exclusive(
2163 rnp->nocb_gp_wq[rcu_seq_ctr(c) & 0x1], 2163 rnp->nocb_gp_wq[rcu_seq_ctr(c) & 0x1],
2164 (d = rcu_seq_done(&rnp->gp_seq, c))); 2164 (d = rcu_seq_done(&rnp->gp_seq, c)));
2165 if (likely(d)) 2165 if (likely(d))
@@ -2188,7 +2188,7 @@ wait_again:
2188 /* Wait for callbacks to appear. */ 2188 /* Wait for callbacks to appear. */
2189 if (!rcu_nocb_poll) { 2189 if (!rcu_nocb_poll) {
2190 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Sleep")); 2190 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Sleep"));
2191 swait_event_interruptible(my_rdp->nocb_wq, 2191 swait_event_interruptible_exclusive(my_rdp->nocb_wq,
2192 !READ_ONCE(my_rdp->nocb_leader_sleep)); 2192 !READ_ONCE(my_rdp->nocb_leader_sleep));
2193 raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags); 2193 raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags);
2194 my_rdp->nocb_leader_sleep = true; 2194 my_rdp->nocb_leader_sleep = true;
@@ -2253,7 +2253,7 @@ wait_again:
2253 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags); 2253 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
2254 if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { 2254 if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
2255 /* List was empty, so wake up the follower. */ 2255 /* List was empty, so wake up the follower. */
2256 swake_up(&rdp->nocb_wq); 2256 swake_up_one(&rdp->nocb_wq);
2257 } 2257 }
2258 } 2258 }
2259 2259
@@ -2270,7 +2270,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
2270{ 2270{
2271 for (;;) { 2271 for (;;) {
2272 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("FollowerSleep")); 2272 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("FollowerSleep"));
2273 swait_event_interruptible(rdp->nocb_wq, 2273 swait_event_interruptible_exclusive(rdp->nocb_wq,
2274 READ_ONCE(rdp->nocb_follower_head)); 2274 READ_ONCE(rdp->nocb_follower_head));
2275 if (smp_load_acquire(&rdp->nocb_follower_head)) { 2275 if (smp_load_acquire(&rdp->nocb_follower_head)) {
2276 /* ^^^ Ensure CB invocation follows _head test. */ 2276 /* ^^^ Ensure CB invocation follows _head test. */
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index d9a02b318108..7fe183404c38 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -20,7 +20,7 @@ obj-y += core.o loadavg.o clock.o cputime.o
20obj-y += idle.o fair.o rt.o deadline.o 20obj-y += idle.o fair.o rt.o deadline.o
21obj-y += wait.o wait_bit.o swait.o completion.o 21obj-y += wait.o wait_bit.o swait.o completion.o
22 22
23obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o 23obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o pelt.o
24obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o 24obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
25obj-$(CONFIG_SCHEDSTATS) += stats.o 25obj-$(CONFIG_SCHEDSTATS) += stats.o
26obj-$(CONFIG_SCHED_DEBUG) += debug.o 26obj-$(CONFIG_SCHED_DEBUG) += debug.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fe365c9a08e9..deafa9fe602b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -17,6 +17,8 @@
17#include "../workqueue_internal.h" 17#include "../workqueue_internal.h"
18#include "../smpboot.h" 18#include "../smpboot.h"
19 19
20#include "pelt.h"
21
20#define CREATE_TRACE_POINTS 22#define CREATE_TRACE_POINTS
21#include <trace/events/sched.h> 23#include <trace/events/sched.h>
22 24
@@ -45,14 +47,6 @@ const_debug unsigned int sysctl_sched_features =
45const_debug unsigned int sysctl_sched_nr_migrate = 32; 47const_debug unsigned int sysctl_sched_nr_migrate = 32;
46 48
47/* 49/*
48 * period over which we average the RT time consumption, measured
49 * in ms.
50 *
51 * default: 1s
52 */
53const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
54
55/*
56 * period over which we measure -rt task CPU usage in us. 50 * period over which we measure -rt task CPU usage in us.
57 * default: 1s 51 * default: 1s
58 */ 52 */
@@ -183,9 +177,9 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
183 177
184 rq->clock_task += delta; 178 rq->clock_task += delta;
185 179
186#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) 180#ifdef HAVE_SCHED_AVG_IRQ
187 if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) 181 if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
188 sched_rt_avg_update(rq, irq_delta + steal); 182 update_irq_load_avg(rq, irq_delta + steal);
189#endif 183#endif
190} 184}
191 185
@@ -649,23 +643,6 @@ bool sched_can_stop_tick(struct rq *rq)
649 return true; 643 return true;
650} 644}
651#endif /* CONFIG_NO_HZ_FULL */ 645#endif /* CONFIG_NO_HZ_FULL */
652
653void sched_avg_update(struct rq *rq)
654{
655 s64 period = sched_avg_period();
656
657 while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
658 /*
659 * Inline assembly required to prevent the compiler
660 * optimising this loop into a divmod call.
661 * See __iter_div_u64_rem() for another example of this.
662 */
663 asm("" : "+rm" (rq->age_stamp));
664 rq->age_stamp += period;
665 rq->rt_avg /= 2;
666 }
667}
668
669#endif /* CONFIG_SMP */ 646#endif /* CONFIG_SMP */
670 647
671#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ 648#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
@@ -1199,6 +1176,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1199 __set_task_cpu(p, new_cpu); 1176 __set_task_cpu(p, new_cpu);
1200} 1177}
1201 1178
1179#ifdef CONFIG_NUMA_BALANCING
1202static void __migrate_swap_task(struct task_struct *p, int cpu) 1180static void __migrate_swap_task(struct task_struct *p, int cpu)
1203{ 1181{
1204 if (task_on_rq_queued(p)) { 1182 if (task_on_rq_queued(p)) {
@@ -1280,16 +1258,17 @@ unlock:
1280/* 1258/*
1281 * Cross migrate two tasks 1259 * Cross migrate two tasks
1282 */ 1260 */
1283int migrate_swap(struct task_struct *cur, struct task_struct *p) 1261int migrate_swap(struct task_struct *cur, struct task_struct *p,
1262 int target_cpu, int curr_cpu)
1284{ 1263{
1285 struct migration_swap_arg arg; 1264 struct migration_swap_arg arg;
1286 int ret = -EINVAL; 1265 int ret = -EINVAL;
1287 1266
1288 arg = (struct migration_swap_arg){ 1267 arg = (struct migration_swap_arg){
1289 .src_task = cur, 1268 .src_task = cur,
1290 .src_cpu = task_cpu(cur), 1269 .src_cpu = curr_cpu,
1291 .dst_task = p, 1270 .dst_task = p,
1292 .dst_cpu = task_cpu(p), 1271 .dst_cpu = target_cpu,
1293 }; 1272 };
1294 1273
1295 if (arg.src_cpu == arg.dst_cpu) 1274 if (arg.src_cpu == arg.dst_cpu)
@@ -1314,6 +1293,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
1314out: 1293out:
1315 return ret; 1294 return ret;
1316} 1295}
1296#endif /* CONFIG_NUMA_BALANCING */
1317 1297
1318/* 1298/*
1319 * wait_task_inactive - wait for a thread to unschedule. 1299 * wait_task_inactive - wait for a thread to unschedule.
@@ -2317,7 +2297,6 @@ static inline void init_schedstats(void) {}
2317int sched_fork(unsigned long clone_flags, struct task_struct *p) 2297int sched_fork(unsigned long clone_flags, struct task_struct *p)
2318{ 2298{
2319 unsigned long flags; 2299 unsigned long flags;
2320 int cpu = get_cpu();
2321 2300
2322 __sched_fork(clone_flags, p); 2301 __sched_fork(clone_flags, p);
2323 /* 2302 /*
@@ -2353,14 +2332,12 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
2353 p->sched_reset_on_fork = 0; 2332 p->sched_reset_on_fork = 0;
2354 } 2333 }
2355 2334
2356 if (dl_prio(p->prio)) { 2335 if (dl_prio(p->prio))
2357 put_cpu();
2358 return -EAGAIN; 2336 return -EAGAIN;
2359 } else if (rt_prio(p->prio)) { 2337 else if (rt_prio(p->prio))
2360 p->sched_class = &rt_sched_class; 2338 p->sched_class = &rt_sched_class;
2361 } else { 2339 else
2362 p->sched_class = &fair_sched_class; 2340 p->sched_class = &fair_sched_class;
2363 }
2364 2341
2365 init_entity_runnable_average(&p->se); 2342 init_entity_runnable_average(&p->se);
2366 2343
@@ -2376,7 +2353,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
2376 * We're setting the CPU for the first time, we don't migrate, 2353 * We're setting the CPU for the first time, we don't migrate,
2377 * so use __set_task_cpu(). 2354 * so use __set_task_cpu().
2378 */ 2355 */
2379 __set_task_cpu(p, cpu); 2356 __set_task_cpu(p, smp_processor_id());
2380 if (p->sched_class->task_fork) 2357 if (p->sched_class->task_fork)
2381 p->sched_class->task_fork(p); 2358 p->sched_class->task_fork(p);
2382 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 2359 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -2393,8 +2370,6 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
2393 plist_node_init(&p->pushable_tasks, MAX_PRIO); 2370 plist_node_init(&p->pushable_tasks, MAX_PRIO);
2394 RB_CLEAR_NODE(&p->pushable_dl_tasks); 2371 RB_CLEAR_NODE(&p->pushable_dl_tasks);
2395#endif 2372#endif
2396
2397 put_cpu();
2398 return 0; 2373 return 0;
2399} 2374}
2400 2375
@@ -5714,13 +5689,6 @@ void set_rq_offline(struct rq *rq)
5714 } 5689 }
5715} 5690}
5716 5691
5717static void set_cpu_rq_start_time(unsigned int cpu)
5718{
5719 struct rq *rq = cpu_rq(cpu);
5720
5721 rq->age_stamp = sched_clock_cpu(cpu);
5722}
5723
5724/* 5692/*
5725 * used to mark begin/end of suspend/resume: 5693 * used to mark begin/end of suspend/resume:
5726 */ 5694 */
@@ -5838,7 +5806,6 @@ static void sched_rq_cpu_starting(unsigned int cpu)
5838 5806
5839int sched_cpu_starting(unsigned int cpu) 5807int sched_cpu_starting(unsigned int cpu)
5840{ 5808{
5841 set_cpu_rq_start_time(cpu);
5842 sched_rq_cpu_starting(cpu); 5809 sched_rq_cpu_starting(cpu);
5843 sched_tick_start(cpu); 5810 sched_tick_start(cpu);
5844 return 0; 5811 return 0;
@@ -6106,7 +6073,6 @@ void __init sched_init(void)
6106 6073
6107#ifdef CONFIG_SMP 6074#ifdef CONFIG_SMP
6108 idle_thread_set_boot_cpu(); 6075 idle_thread_set_boot_cpu();
6109 set_cpu_rq_start_time(smp_processor_id());
6110#endif 6076#endif
6111 init_sched_fair_class(); 6077 init_sched_fair_class();
6112 6078
@@ -6785,6 +6751,16 @@ static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
6785 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled); 6751 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
6786 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time); 6752 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
6787 6753
6754 if (schedstat_enabled() && tg != &root_task_group) {
6755 u64 ws = 0;
6756 int i;
6757
6758 for_each_possible_cpu(i)
6759 ws += schedstat_val(tg->se[i]->statistics.wait_sum);
6760
6761 seq_printf(sf, "wait_sum %llu\n", ws);
6762 }
6763
6788 return 0; 6764 return 0;
6789} 6765}
6790#endif /* CONFIG_CFS_BANDWIDTH */ 6766#endif /* CONFIG_CFS_BANDWIDTH */
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index c907fde01eaa..3fffad3bc8a8 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -53,9 +53,7 @@ struct sugov_cpu {
53 unsigned int iowait_boost_max; 53 unsigned int iowait_boost_max;
54 u64 last_update; 54 u64 last_update;
55 55
56 /* The fields below are only needed when sharing a policy: */ 56 unsigned long bw_dl;
57 unsigned long util_cfs;
58 unsigned long util_dl;
59 unsigned long max; 57 unsigned long max;
60 58
61 /* The field below is for single-CPU policies only: */ 59 /* The field below is for single-CPU policies only: */
@@ -179,33 +177,90 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
179 return cpufreq_driver_resolve_freq(policy, freq); 177 return cpufreq_driver_resolve_freq(policy, freq);
180} 178}
181 179
182static void sugov_get_util(struct sugov_cpu *sg_cpu) 180/*
181 * This function computes an effective utilization for the given CPU, to be
182 * used for frequency selection given the linear relation: f = u * f_max.
183 *
184 * The scheduler tracks the following metrics:
185 *
186 * cpu_util_{cfs,rt,dl,irq}()
187 * cpu_bw_dl()
188 *
189 * Where the cfs,rt and dl util numbers are tracked with the same metric and
190 * synchronized windows and are thus directly comparable.
191 *
192 * The cfs,rt,dl utilization are the running times measured with rq->clock_task
193 * which excludes things like IRQ and steal-time. These latter are then accrued
194 * in the irq utilization.
195 *
196 * The DL bandwidth number otoh is not a measured metric but a value computed
197 * based on the task model parameters and gives the minimal utilization
198 * required to meet deadlines.
199 */
200static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
183{ 201{
184 struct rq *rq = cpu_rq(sg_cpu->cpu); 202 struct rq *rq = cpu_rq(sg_cpu->cpu);
203 unsigned long util, irq, max;
185 204
186 sg_cpu->max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); 205 sg_cpu->max = max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu);
187 sg_cpu->util_cfs = cpu_util_cfs(rq); 206 sg_cpu->bw_dl = cpu_bw_dl(rq);
188 sg_cpu->util_dl = cpu_util_dl(rq);
189}
190
191static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu)
192{
193 struct rq *rq = cpu_rq(sg_cpu->cpu);
194 207
195 if (rt_rq_is_runnable(&rq->rt)) 208 if (rt_rq_is_runnable(&rq->rt))
196 return sg_cpu->max; 209 return max;
210
211 /*
212 * Early check to see if IRQ/steal time saturates the CPU, can be
213 * because of inaccuracies in how we track these -- see
214 * update_irq_load_avg().
215 */
216 irq = cpu_util_irq(rq);
217 if (unlikely(irq >= max))
218 return max;
219
220 /*
221 * Because the time spend on RT/DL tasks is visible as 'lost' time to
222 * CFS tasks and we use the same metric to track the effective
223 * utilization (PELT windows are synchronized) we can directly add them
224 * to obtain the CPU's actual utilization.
225 */
226 util = cpu_util_cfs(rq);
227 util += cpu_util_rt(rq);
228
229 /*
230 * We do not make cpu_util_dl() a permanent part of this sum because we
231 * want to use cpu_bw_dl() later on, but we need to check if the
232 * CFS+RT+DL sum is saturated (ie. no idle time) such that we select
233 * f_max when there is no idle time.
234 *
235 * NOTE: numerical errors or stop class might cause us to not quite hit
236 * saturation when we should -- something for later.
237 */
238 if ((util + cpu_util_dl(rq)) >= max)
239 return max;
240
241 /*
242 * There is still idle time; further improve the number by using the
243 * irq metric. Because IRQ/steal time is hidden from the task clock we
244 * need to scale the task numbers:
245 *
246 * 1 - irq
247 * U' = irq + ------- * U
248 * max
249 */
250 util = scale_irq_capacity(util, irq, max);
251 util += irq;
197 252
198 /* 253 /*
199 * Utilization required by DEADLINE must always be granted while, for 254 * Bandwidth required by DEADLINE must always be granted while, for
200 * FAIR, we use blocked utilization of IDLE CPUs as a mechanism to 255 * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
201 * gracefully reduce the frequency when no tasks show up for longer 256 * to gracefully reduce the frequency when no tasks show up for longer
202 * periods of time. 257 * periods of time.
203 * 258 *
204 * Ideally we would like to set util_dl as min/guaranteed freq and 259 * Ideally we would like to set bw_dl as min/guaranteed freq and util +
205 * util_cfs + util_dl as requested freq. However, cpufreq is not yet 260 * bw_dl as requested freq. However, cpufreq is not yet ready for such
206 * ready for such an interface. So, we only do the latter for now. 261 * an interface. So, we only do the latter for now.
207 */ 262 */
208 return min(sg_cpu->max, (sg_cpu->util_dl + sg_cpu->util_cfs)); 263 return min(max, util + sg_cpu->bw_dl);
209} 264}
210 265
211/** 266/**
@@ -360,7 +415,7 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
360 */ 415 */
361static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy) 416static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy)
362{ 417{
363 if (cpu_util_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->util_dl) 418 if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
364 sg_policy->need_freq_update = true; 419 sg_policy->need_freq_update = true;
365} 420}
366 421
@@ -383,9 +438,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
383 438
384 busy = sugov_cpu_is_busy(sg_cpu); 439 busy = sugov_cpu_is_busy(sg_cpu);
385 440
386 sugov_get_util(sg_cpu); 441 util = sugov_get_util(sg_cpu);
387 max = sg_cpu->max; 442 max = sg_cpu->max;
388 util = sugov_aggregate_util(sg_cpu);
389 sugov_iowait_apply(sg_cpu, time, &util, &max); 443 sugov_iowait_apply(sg_cpu, time, &util, &max);
390 next_f = get_next_freq(sg_policy, util, max); 444 next_f = get_next_freq(sg_policy, util, max);
391 /* 445 /*
@@ -424,9 +478,8 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
424 struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); 478 struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j);
425 unsigned long j_util, j_max; 479 unsigned long j_util, j_max;
426 480
427 sugov_get_util(j_sg_cpu); 481 j_util = sugov_get_util(j_sg_cpu);
428 j_max = j_sg_cpu->max; 482 j_max = j_sg_cpu->max;
429 j_util = sugov_aggregate_util(j_sg_cpu);
430 sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max); 483 sugov_iowait_apply(j_sg_cpu, time, &j_util, &j_max);
431 484
432 if (j_util * max > j_max * util) { 485 if (j_util * max > j_max * util) {
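The comment block added to sugov_get_util() above states the IRQ scaling rule, U' = irq + U * (1 - irq/max) in normalized form. Below is a standalone arithmetic sketch of that formula with made-up numbers, written as plain userspace C purely for illustration; it is not the kernel's scale_irq_capacity() code itself:

	#include <stdio.h>

	/* U' = irq + U * (max - irq) / max: the CFS+RT utilization is scaled by the
	 * fraction of time not consumed by IRQ/steal, then the IRQ utilization is added. */
	static unsigned long scale_for_irq(unsigned long util, unsigned long irq,
					   unsigned long max)
	{
		return irq + util * (max - irq) / max;
	}

	int main(void)
	{
		unsigned long max = 1024;	/* CPU capacity */
		unsigned long util = 512;	/* CFS + RT utilization */
		unsigned long irq = 128;	/* IRQ/steal utilization */

		/* 128 + 512 * 896 / 1024 = 128 + 448 = 576 */
		printf("effective utilization = %lu\n", scale_for_irq(util, irq, max));
		return 0;
	}

The governor then applies its f = u * f_max relation to this effective utilization, capped at max as the code above does when no idle time remains.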
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b5fbdde6afa9..997ea7b839fa 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -16,6 +16,7 @@
16 * Fabio Checconi <fchecconi@gmail.com> 16 * Fabio Checconi <fchecconi@gmail.com>
17 */ 17 */
18#include "sched.h" 18#include "sched.h"
19#include "pelt.h"
19 20
20struct dl_bandwidth def_dl_bandwidth; 21struct dl_bandwidth def_dl_bandwidth;
21 22
@@ -1179,8 +1180,6 @@ static void update_curr_dl(struct rq *rq)
1179 curr->se.exec_start = now; 1180 curr->se.exec_start = now;
1180 cgroup_account_cputime(curr, delta_exec); 1181 cgroup_account_cputime(curr, delta_exec);
1181 1182
1182 sched_rt_avg_update(rq, delta_exec);
1183
1184 if (dl_entity_is_special(dl_se)) 1183 if (dl_entity_is_special(dl_se))
1185 return; 1184 return;
1186 1185
@@ -1761,6 +1760,9 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
1761 1760
1762 deadline_queue_push_tasks(rq); 1761 deadline_queue_push_tasks(rq);
1763 1762
1763 if (rq->curr->sched_class != &dl_sched_class)
1764 update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
1765
1764 return p; 1766 return p;
1765} 1767}
1766 1768
@@ -1768,6 +1770,7 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
1768{ 1770{
1769 update_curr_dl(rq); 1771 update_curr_dl(rq);
1770 1772
1773 update_dl_rq_load_avg(rq_clock_task(rq), rq, 1);
1771 if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1) 1774 if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
1772 enqueue_pushable_dl_task(rq, p); 1775 enqueue_pushable_dl_task(rq, p);
1773} 1776}
@@ -1784,6 +1787,7 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
1784{ 1787{
1785 update_curr_dl(rq); 1788 update_curr_dl(rq);
1786 1789
1790 update_dl_rq_load_avg(rq_clock_task(rq), rq, 1);
1787 /* 1791 /*
1788 * Even when we have runtime, update_curr_dl() might have resulted in us 1792 * Even when we have runtime, update_curr_dl() might have resulted in us
1789 * not being the leftmost task anymore. In that case NEED_RESCHED will 1793 * not being the leftmost task anymore. In that case NEED_RESCHED will
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index e593b4118578..870d4f3da285 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -111,20 +111,19 @@ static int sched_feat_set(char *cmp)
111 cmp += 3; 111 cmp += 3;
112 } 112 }
113 113
114 for (i = 0; i < __SCHED_FEAT_NR; i++) { 114 i = match_string(sched_feat_names, __SCHED_FEAT_NR, cmp);
115 if (strcmp(cmp, sched_feat_names[i]) == 0) { 115 if (i < 0)
116 if (neg) { 116 return i;
117 sysctl_sched_features &= ~(1UL << i); 117
118 sched_feat_disable(i); 118 if (neg) {
119 } else { 119 sysctl_sched_features &= ~(1UL << i);
120 sysctl_sched_features |= (1UL << i); 120 sched_feat_disable(i);
121 sched_feat_enable(i); 121 } else {
122 } 122 sysctl_sched_features |= (1UL << i);
123 break; 123 sched_feat_enable(i);
124 }
125 } 124 }
126 125
127 return i; 126 return 0;
128} 127}
129 128
130static ssize_t 129static ssize_t
@@ -133,7 +132,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
133{ 132{
134 char buf[64]; 133 char buf[64];
135 char *cmp; 134 char *cmp;
136 int i; 135 int ret;
137 struct inode *inode; 136 struct inode *inode;
138 137
139 if (cnt > 63) 138 if (cnt > 63)
@@ -148,10 +147,10 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
148 /* Ensure the static_key remains in a consistent state */ 147 /* Ensure the static_key remains in a consistent state */
149 inode = file_inode(filp); 148 inode = file_inode(filp);
150 inode_lock(inode); 149 inode_lock(inode);
151 i = sched_feat_set(cmp); 150 ret = sched_feat_set(cmp);
152 inode_unlock(inode); 151 inode_unlock(inode);
153 if (i == __SCHED_FEAT_NR) 152 if (ret < 0)
154 return -EINVAL; 153 return ret;
155 154
156 *ppos += cnt; 155 *ppos += cnt;
157 156
@@ -843,8 +842,8 @@ void print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
843 unsigned long tpf, unsigned long gsf, unsigned long gpf) 842 unsigned long tpf, unsigned long gsf, unsigned long gpf)
844{ 843{
845 SEQ_printf(m, "numa_faults node=%d ", node); 844 SEQ_printf(m, "numa_faults node=%d ", node);
846 SEQ_printf(m, "task_private=%lu task_shared=%lu ", tsf, tpf); 845 SEQ_printf(m, "task_private=%lu task_shared=%lu ", tpf, tsf);
847 SEQ_printf(m, "group_private=%lu group_shared=%lu\n", gsf, gpf); 846 SEQ_printf(m, "group_private=%lu group_shared=%lu\n", gpf, gsf);
848} 847}
849#endif 848#endif
850 849
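The sched_feat_set() rework above replaces an open-coded strcmp() loop with a match_string() lookup that returns the index of the matching name or a negative error. A minimal userspace analogue of that contract, using illustrative feature names (the real table lives in kernel/sched/features.h and is not shown in this diff):

	#include <stdio.h>
	#include <string.h>

	/* Same contract as the lookup used above: index on match, -22 (-EINVAL) otherwise. */
	static int match_string_like(const char * const *array, size_t n, const char *string)
	{
		for (size_t i = 0; i < n; i++) {
			if (strcmp(array[i], string) == 0)
				return (int)i;
		}
		return -22;
	}

	int main(void)
	{
		static const char * const names[] = { "FEAT_A", "FEAT_B", "FEAT_C" };

		printf("%d\n", match_string_like(names, 3, "FEAT_B"));	/* 1 */
		printf("%d\n", match_string_like(names, 3, "FEAT_X"));	/* -22 */
		return 0;
	}

With that contract, sched_feat_write() can simply propagate the negative return value instead of comparing the loop counter against __SCHED_FEAT_NR, which is exactly the simplification in the hunk above.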
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 2f0a0be4d344..309c93fcc604 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -255,9 +255,6 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
255 return cfs_rq->rq; 255 return cfs_rq->rq;
256} 256}
257 257
258/* An entity is a task if it doesn't "own" a runqueue */
259#define entity_is_task(se) (!se->my_q)
260
261static inline struct task_struct *task_of(struct sched_entity *se) 258static inline struct task_struct *task_of(struct sched_entity *se)
262{ 259{
263 SCHED_WARN_ON(!entity_is_task(se)); 260 SCHED_WARN_ON(!entity_is_task(se));
@@ -419,7 +416,6 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
419 return container_of(cfs_rq, struct rq, cfs); 416 return container_of(cfs_rq, struct rq, cfs);
420} 417}
421 418
422#define entity_is_task(se) 1
423 419
424#define for_each_sched_entity(se) \ 420#define for_each_sched_entity(se) \
425 for (; se; se = NULL) 421 for (; se; se = NULL)
@@ -692,7 +688,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
692} 688}
693 689
694#ifdef CONFIG_SMP 690#ifdef CONFIG_SMP
695 691#include "pelt.h"
696#include "sched-pelt.h" 692#include "sched-pelt.h"
697 693
698static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); 694static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
@@ -735,11 +731,12 @@ static void attach_entity_cfs_rq(struct sched_entity *se);
735 * To solve this problem, we also cap the util_avg of successive tasks to 731 * To solve this problem, we also cap the util_avg of successive tasks to
736 * only 1/2 of the left utilization budget: 732 * only 1/2 of the left utilization budget:
737 * 733 *
738 * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n 734 * util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
739 * 735 *
740 * where n denotes the nth task. 736 * where n denotes the nth task and cpu_scale the CPU capacity.
741 * 737 *
742 * For example, a simplest series from the beginning would be like: 738 * For example, for a CPU with 1024 of capacity, a simplest series from
739 * the beginning would be like:
743 * 740 *
744 * task util_avg: 512, 256, 128, 64, 32, 16, 8, ... 741 * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
745 * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ... 742 * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
@@ -751,7 +748,8 @@ void post_init_entity_util_avg(struct sched_entity *se)
751{ 748{
752 struct cfs_rq *cfs_rq = cfs_rq_of(se); 749 struct cfs_rq *cfs_rq = cfs_rq_of(se);
753 struct sched_avg *sa = &se->avg; 750 struct sched_avg *sa = &se->avg;
754 long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2; 751 long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
752 long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
755 753
756 if (cap > 0) { 754 if (cap > 0) {
757 if (cfs_rq->avg.util_avg != 0) { 755 if (cfs_rq->avg.util_avg != 0) {
@@ -1314,7 +1312,7 @@ static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1314 * of each group. Skip other nodes. 1312 * of each group. Skip other nodes.
1315 */ 1313 */
1316 if (sched_numa_topology_type == NUMA_BACKPLANE && 1314 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1317 dist > maxdist) 1315 dist >= maxdist)
1318 continue; 1316 continue;
1319 1317
1320 /* Add up the faults from nearby nodes. */ 1318 /* Add up the faults from nearby nodes. */
@@ -1452,15 +1450,12 @@ static unsigned long capacity_of(int cpu);
1452 1450
1453/* Cached statistics for all CPUs within a node */ 1451/* Cached statistics for all CPUs within a node */
1454struct numa_stats { 1452struct numa_stats {
1455 unsigned long nr_running;
1456 unsigned long load; 1453 unsigned long load;
1457 1454
1458 /* Total compute capacity of CPUs on a node */ 1455 /* Total compute capacity of CPUs on a node */
1459 unsigned long compute_capacity; 1456 unsigned long compute_capacity;
1460 1457
1461 /* Approximate capacity in terms of runnable tasks on a node */ 1458 unsigned int nr_running;
1462 unsigned long task_capacity;
1463 int has_free_capacity;
1464}; 1459};
1465 1460
1466/* 1461/*
@@ -1487,8 +1482,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
1487 * the @ns structure is NULL'ed and task_numa_compare() will 1482 * the @ns structure is NULL'ed and task_numa_compare() will
1488 * not find this node attractive. 1483 * not find this node attractive.
1489 * 1484 *
1490 * We'll either bail at !has_free_capacity, or we'll detect a huge 1485 * We'll detect a huge imbalance and bail there.
1491 * imbalance and bail there.
1492 */ 1486 */
1493 if (!cpus) 1487 if (!cpus)
1494 return; 1488 return;
@@ -1497,9 +1491,8 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
1497 smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity); 1491 smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1498 capacity = cpus / smt; /* cores */ 1492 capacity = cpus / smt; /* cores */
1499 1493
1500 ns->task_capacity = min_t(unsigned, capacity, 1494 capacity = min_t(unsigned, capacity,
1501 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE)); 1495 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1502 ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
1503} 1496}
1504 1497
1505struct task_numa_env { 1498struct task_numa_env {
@@ -1548,28 +1541,12 @@ static bool load_too_imbalanced(long src_load, long dst_load,
1548 src_capacity = env->src_stats.compute_capacity; 1541 src_capacity = env->src_stats.compute_capacity;
1549 dst_capacity = env->dst_stats.compute_capacity; 1542 dst_capacity = env->dst_stats.compute_capacity;
1550 1543
1551 /* We care about the slope of the imbalance, not the direction. */ 1544 imb = abs(dst_load * src_capacity - src_load * dst_capacity);
1552 if (dst_load < src_load)
1553 swap(dst_load, src_load);
1554 1545
1555 /* Is the difference below the threshold? */
1556 imb = dst_load * src_capacity * 100 -
1557 src_load * dst_capacity * env->imbalance_pct;
1558 if (imb <= 0)
1559 return false;
1560
1561 /*
1562 * The imbalance is above the allowed threshold.
1563 * Compare it with the old imbalance.
1564 */
1565 orig_src_load = env->src_stats.load; 1546 orig_src_load = env->src_stats.load;
1566 orig_dst_load = env->dst_stats.load; 1547 orig_dst_load = env->dst_stats.load;
1567 1548
1568 if (orig_dst_load < orig_src_load) 1549 old_imb = abs(orig_dst_load * src_capacity - orig_src_load * dst_capacity);
1569 swap(orig_dst_load, orig_src_load);
1570
1571 old_imb = orig_dst_load * src_capacity * 100 -
1572 orig_src_load * dst_capacity * env->imbalance_pct;
1573 1550
1574 /* Would this change make things worse? */ 1551 /* Would this change make things worse? */
1575 return (imb > old_imb); 1552 return (imb > old_imb);
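
A standalone sketch of the simplified check, with made-up loads and capacities; comparing |dst_load*src_capacity - src_load*dst_capacity| before and after a move is a division-free way of comparing capacity-normalized load gaps:

#include <stdio.h>
#include <stdlib.h>

/* Division-free imbalance metric: |dst_load/dst_cap - src_load/src_cap|
 * scaled by src_cap*dst_cap. A move is rejected only if it makes this
 * metric worse than it already was. */
static long imbalance(long src_load, long dst_load, long src_cap, long dst_cap)
{
	return labs(dst_load * src_cap - src_load * dst_cap);
}

int main(void)
{
	long src_cap = 2048, dst_cap = 1024;	/* assumed node capacities */
	long src_load = 900, dst_load = 300;	/* current node loads */
	long task_load = 400;			/* load the move would shift */

	long old_imb = imbalance(src_load, dst_load, src_cap, dst_cap);
	long new_imb = imbalance(src_load - task_load, dst_load + task_load,
				 src_cap, dst_cap);

	printf("old=%ld new=%ld -> %s\n", old_imb, new_imb,
	       new_imb > old_imb ? "too imbalanced, reject" : "allowed");
	return 0;
}
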
@@ -1582,9 +1559,8 @@ static bool load_too_imbalanced(long src_load, long dst_load,
1582 * be exchanged with the source task 1559 * be exchanged with the source task
1583 */ 1560 */
1584static void task_numa_compare(struct task_numa_env *env, 1561static void task_numa_compare(struct task_numa_env *env,
1585 long taskimp, long groupimp) 1562 long taskimp, long groupimp, bool maymove)
1586{ 1563{
1587 struct rq *src_rq = cpu_rq(env->src_cpu);
1588 struct rq *dst_rq = cpu_rq(env->dst_cpu); 1564 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1589 struct task_struct *cur; 1565 struct task_struct *cur;
1590 long src_load, dst_load; 1566 long src_load, dst_load;
@@ -1605,97 +1581,73 @@ static void task_numa_compare(struct task_numa_env *env,
1605 if (cur == env->p) 1581 if (cur == env->p)
1606 goto unlock; 1582 goto unlock;
1607 1583
1584 if (!cur) {
1585 if (maymove || imp > env->best_imp)
1586 goto assign;
1587 else
1588 goto unlock;
1589 }
1590
1608 /* 1591 /*
1609 * "imp" is the fault differential for the source task between the 1592 * "imp" is the fault differential for the source task between the
1610 * source and destination node. Calculate the total differential for 1593 * source and destination node. Calculate the total differential for
1611 * the source task and potential destination task. The more negative 1594 * the source task and potential destination task. The more negative
1612 * the value is, the more rmeote accesses that would be expected to 1595 * the value is, the more remote accesses that would be expected to
1613 * be incurred if the tasks were swapped. 1596 * be incurred if the tasks were swapped.
1614 */ 1597 */
1615 if (cur) { 1598 /* Skip this swap candidate if cannot move to the source cpu */
1616 /* Skip this swap candidate if cannot move to the source CPU: */ 1599 if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
1617 if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) 1600 goto unlock;
1618 goto unlock;
1619 1601
1602 /*
1603 * If dst and source tasks are in the same NUMA group, or not
1604 * in any group then look only at task weights.
1605 */
1606 if (cur->numa_group == env->p->numa_group) {
1607 imp = taskimp + task_weight(cur, env->src_nid, dist) -
1608 task_weight(cur, env->dst_nid, dist);
1620 /* 1609 /*
1621 * If dst and source tasks are in the same NUMA group, or not 1610 * Add some hysteresis to prevent swapping the
1622 * in any group then look only at task weights. 1611 * tasks within a group over tiny differences.
1623 */ 1612 */
1624 if (cur->numa_group == env->p->numa_group) { 1613 if (cur->numa_group)
1625 imp = taskimp + task_weight(cur, env->src_nid, dist) - 1614 imp -= imp / 16;
1626 task_weight(cur, env->dst_nid, dist); 1615 } else {
1627 /* 1616 /*
1628 * Add some hysteresis to prevent swapping the 1617 * Compare the group weights. If a task is all by itself
1629 * tasks within a group over tiny differences. 1618 * (not part of a group), use the task weight instead.
1630 */ 1619 */
1631 if (cur->numa_group) 1620 if (cur->numa_group && env->p->numa_group)
1632 imp -= imp/16; 1621 imp += group_weight(cur, env->src_nid, dist) -
1633 } else { 1622 group_weight(cur, env->dst_nid, dist);
1634 /* 1623 else
1635 * Compare the group weights. If a task is all by 1624 imp += task_weight(cur, env->src_nid, dist) -
1636 * itself (not part of a group), use the task weight 1625 task_weight(cur, env->dst_nid, dist);
1637 * instead.
1638 */
1639 if (cur->numa_group)
1640 imp += group_weight(cur, env->src_nid, dist) -
1641 group_weight(cur, env->dst_nid, dist);
1642 else
1643 imp += task_weight(cur, env->src_nid, dist) -
1644 task_weight(cur, env->dst_nid, dist);
1645 }
1646 } 1626 }
1647 1627
1648 if (imp <= env->best_imp && moveimp <= env->best_imp) 1628 if (imp <= env->best_imp)
1649 goto unlock; 1629 goto unlock;
1650 1630
1651 if (!cur) { 1631 if (maymove && moveimp > imp && moveimp > env->best_imp) {
1652 /* Is there capacity at our destination? */ 1632 imp = moveimp - 1;
1653 if (env->src_stats.nr_running <= env->src_stats.task_capacity && 1633 cur = NULL;
1654 !env->dst_stats.has_free_capacity)
1655 goto unlock;
1656
1657 goto balance;
1658 }
1659
1660 /* Balance doesn't matter much if we're running a task per CPU: */
1661 if (imp > env->best_imp && src_rq->nr_running == 1 &&
1662 dst_rq->nr_running == 1)
1663 goto assign; 1634 goto assign;
1635 }
1664 1636
1665 /* 1637 /*
1666 * In the overloaded case, try and keep the load balanced. 1638 * In the overloaded case, try and keep the load balanced.
1667 */ 1639 */
1668balance: 1640 load = task_h_load(env->p) - task_h_load(cur);
1669 load = task_h_load(env->p); 1641 if (!load)
1642 goto assign;
1643
1670 dst_load = env->dst_stats.load + load; 1644 dst_load = env->dst_stats.load + load;
1671 src_load = env->src_stats.load - load; 1645 src_load = env->src_stats.load - load;
1672 1646
1673 if (moveimp > imp && moveimp > env->best_imp) {
1674 /*
1675 * If the improvement from just moving env->p direction is
1676 * better than swapping tasks around, check if a move is
1677 * possible. Store a slightly smaller score than moveimp,
1678 * so an actually idle CPU will win.
1679 */
1680 if (!load_too_imbalanced(src_load, dst_load, env)) {
1681 imp = moveimp - 1;
1682 cur = NULL;
1683 goto assign;
1684 }
1685 }
1686
1687 if (imp <= env->best_imp)
1688 goto unlock;
1689
1690 if (cur) {
1691 load = task_h_load(cur);
1692 dst_load -= load;
1693 src_load += load;
1694 }
1695
1696 if (load_too_imbalanced(src_load, dst_load, env)) 1647 if (load_too_imbalanced(src_load, dst_load, env))
1697 goto unlock; 1648 goto unlock;
1698 1649
1650assign:
1699 /* 1651 /*
1700 * One idle CPU per node is evaluated for a task numa move. 1652 * One idle CPU per node is evaluated for a task numa move.
1701 * Call select_idle_sibling to maybe find a better one. 1653 * Call select_idle_sibling to maybe find a better one.
@@ -1711,7 +1663,6 @@ balance:
1711 local_irq_enable(); 1663 local_irq_enable();
1712 } 1664 }
1713 1665
1714assign:
1715 task_numa_assign(env, cur, imp); 1666 task_numa_assign(env, cur, imp);
1716unlock: 1667unlock:
1717 rcu_read_unlock(); 1668 rcu_read_unlock();
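
A minimal sketch of the in-group hysteresis above ("imp -= imp / 16"), with invented improvement scores; the roughly 6% discount keeps near-equal candidates from displacing the current best swap:

#include <stdio.h>

/* Tasks in the same numa_group get their improvement discounted by 1/16,
 * so tiny fault differences do not cause endless swapping. */
int main(void)
{
	long imp = 40;		/* assumed raw fault-based improvement */
	long best_imp = 38;	/* improvement of the current best candidate */

	imp -= imp / 16;	/* 40 - 2 = 38: no longer beats best_imp */
	printf("discounted imp=%ld best_imp=%ld -> %s\n", imp, best_imp,
	       imp > best_imp ? "replace best" : "keep current best");
	return 0;
}
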
@@ -1720,43 +1671,30 @@ unlock:
1720static void task_numa_find_cpu(struct task_numa_env *env, 1671static void task_numa_find_cpu(struct task_numa_env *env,
1721 long taskimp, long groupimp) 1672 long taskimp, long groupimp)
1722{ 1673{
1674 long src_load, dst_load, load;
1675 bool maymove = false;
1723 int cpu; 1676 int cpu;
1724 1677
1678 load = task_h_load(env->p);
1679 dst_load = env->dst_stats.load + load;
1680 src_load = env->src_stats.load - load;
1681
1682 /*
1683 * If the improvement from just moving env->p direction is better
1684 * than swapping tasks around, check if a move is possible.
1685 */
1686 maymove = !load_too_imbalanced(src_load, dst_load, env);
1687
1725 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) { 1688 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1726 /* Skip this CPU if the source task cannot migrate */ 1689 /* Skip this CPU if the source task cannot migrate */
1727 if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed)) 1690 if (!cpumask_test_cpu(cpu, &env->p->cpus_allowed))
1728 continue; 1691 continue;
1729 1692
1730 env->dst_cpu = cpu; 1693 env->dst_cpu = cpu;
1731 task_numa_compare(env, taskimp, groupimp); 1694 task_numa_compare(env, taskimp, groupimp, maymove);
1732 } 1695 }
1733} 1696}
1734 1697
1735/* Only move tasks to a NUMA node less busy than the current node. */
1736static bool numa_has_capacity(struct task_numa_env *env)
1737{
1738 struct numa_stats *src = &env->src_stats;
1739 struct numa_stats *dst = &env->dst_stats;
1740
1741 if (src->has_free_capacity && !dst->has_free_capacity)
1742 return false;
1743
1744 /*
1745 * Only consider a task move if the source has a higher load
1746 * than the destination, corrected for CPU capacity on each node.
1747 *
1748 * src->load dst->load
1749 * --------------------- vs ---------------------
1750 * src->compute_capacity dst->compute_capacity
1751 */
1752 if (src->load * dst->compute_capacity * env->imbalance_pct >
1753
1754 dst->load * src->compute_capacity * 100)
1755 return true;
1756
1757 return false;
1758}
1759
1760static int task_numa_migrate(struct task_struct *p) 1698static int task_numa_migrate(struct task_struct *p)
1761{ 1699{
1762 struct task_numa_env env = { 1700 struct task_numa_env env = {
@@ -1797,7 +1735,7 @@ static int task_numa_migrate(struct task_struct *p)
1797 * elsewhere, so there is no point in (re)trying. 1735 * elsewhere, so there is no point in (re)trying.
1798 */ 1736 */
1799 if (unlikely(!sd)) { 1737 if (unlikely(!sd)) {
1800 p->numa_preferred_nid = task_node(p); 1738 sched_setnuma(p, task_node(p));
1801 return -EINVAL; 1739 return -EINVAL;
1802 } 1740 }
1803 1741
@@ -1811,8 +1749,7 @@ static int task_numa_migrate(struct task_struct *p)
1811 update_numa_stats(&env.dst_stats, env.dst_nid); 1749 update_numa_stats(&env.dst_stats, env.dst_nid);
1812 1750
1813 /* Try to find a spot on the preferred nid. */ 1751 /* Try to find a spot on the preferred nid. */
1814 if (numa_has_capacity(&env)) 1752 task_numa_find_cpu(&env, taskimp, groupimp);
1815 task_numa_find_cpu(&env, taskimp, groupimp);
1816 1753
1817 /* 1754 /*
1818 * Look at other nodes in these cases: 1755 * Look at other nodes in these cases:
@@ -1842,8 +1779,7 @@ static int task_numa_migrate(struct task_struct *p)
1842 env.dist = dist; 1779 env.dist = dist;
1843 env.dst_nid = nid; 1780 env.dst_nid = nid;
1844 update_numa_stats(&env.dst_stats, env.dst_nid); 1781 update_numa_stats(&env.dst_stats, env.dst_nid);
1845 if (numa_has_capacity(&env)) 1782 task_numa_find_cpu(&env, taskimp, groupimp);
1846 task_numa_find_cpu(&env, taskimp, groupimp);
1847 } 1783 }
1848 } 1784 }
1849 1785
@@ -1856,15 +1792,13 @@ static int task_numa_migrate(struct task_struct *p)
1856 * trying for a better one later. Do not set the preferred node here. 1792 * trying for a better one later. Do not set the preferred node here.
1857 */ 1793 */
1858 if (p->numa_group) { 1794 if (p->numa_group) {
1859 struct numa_group *ng = p->numa_group;
1860
1861 if (env.best_cpu == -1) 1795 if (env.best_cpu == -1)
1862 nid = env.src_nid; 1796 nid = env.src_nid;
1863 else 1797 else
1864 nid = env.dst_nid; 1798 nid = cpu_to_node(env.best_cpu);
1865 1799
1866 if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng)) 1800 if (nid != p->numa_preferred_nid)
1867 sched_setnuma(p, env.dst_nid); 1801 sched_setnuma(p, nid);
1868 } 1802 }
1869 1803
1870 /* No better CPU than the current one was found. */ 1804 /* No better CPU than the current one was found. */
@@ -1884,7 +1818,8 @@ static int task_numa_migrate(struct task_struct *p)
1884 return ret; 1818 return ret;
1885 } 1819 }
1886 1820
1887 ret = migrate_swap(p, env.best_task); 1821 ret = migrate_swap(p, env.best_task, env.best_cpu, env.src_cpu);
1822
1888 if (ret != 0) 1823 if (ret != 0)
1889 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task)); 1824 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1890 put_task_struct(env.best_task); 1825 put_task_struct(env.best_task);
@@ -2144,8 +2079,8 @@ static int preferred_group_nid(struct task_struct *p, int nid)
2144 2079
2145static void task_numa_placement(struct task_struct *p) 2080static void task_numa_placement(struct task_struct *p)
2146{ 2081{
2147 int seq, nid, max_nid = -1, max_group_nid = -1; 2082 int seq, nid, max_nid = -1;
2148 unsigned long max_faults = 0, max_group_faults = 0; 2083 unsigned long max_faults = 0;
2149 unsigned long fault_types[2] = { 0, 0 }; 2084 unsigned long fault_types[2] = { 0, 0 };
2150 unsigned long total_faults; 2085 unsigned long total_faults;
2151 u64 runtime, period; 2086 u64 runtime, period;
@@ -2224,33 +2159,30 @@ static void task_numa_placement(struct task_struct *p)
2224 } 2159 }
2225 } 2160 }
2226 2161
2227 if (faults > max_faults) { 2162 if (!p->numa_group) {
2228 max_faults = faults; 2163 if (faults > max_faults) {
2164 max_faults = faults;
2165 max_nid = nid;
2166 }
2167 } else if (group_faults > max_faults) {
2168 max_faults = group_faults;
2229 max_nid = nid; 2169 max_nid = nid;
2230 } 2170 }
2231
2232 if (group_faults > max_group_faults) {
2233 max_group_faults = group_faults;
2234 max_group_nid = nid;
2235 }
2236 } 2171 }
2237 2172
2238 update_task_scan_period(p, fault_types[0], fault_types[1]);
2239
2240 if (p->numa_group) { 2173 if (p->numa_group) {
2241 numa_group_count_active_nodes(p->numa_group); 2174 numa_group_count_active_nodes(p->numa_group);
2242 spin_unlock_irq(group_lock); 2175 spin_unlock_irq(group_lock);
2243 max_nid = preferred_group_nid(p, max_group_nid); 2176 max_nid = preferred_group_nid(p, max_nid);
2244 } 2177 }
2245 2178
2246 if (max_faults) { 2179 if (max_faults) {
2247 /* Set the new preferred node */ 2180 /* Set the new preferred node */
2248 if (max_nid != p->numa_preferred_nid) 2181 if (max_nid != p->numa_preferred_nid)
2249 sched_setnuma(p, max_nid); 2182 sched_setnuma(p, max_nid);
2250
2251 if (task_node(p) != p->numa_preferred_nid)
2252 numa_migrate_preferred(p);
2253 } 2183 }
2184
2185 update_task_scan_period(p, fault_types[0], fault_types[1]);
2254} 2186}
2255 2187
2256static inline int get_numa_group(struct numa_group *grp) 2188static inline int get_numa_group(struct numa_group *grp)
@@ -2450,14 +2382,14 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2450 numa_is_active_node(mem_node, ng)) 2382 numa_is_active_node(mem_node, ng))
2451 local = 1; 2383 local = 1;
2452 2384
2453 task_numa_placement(p);
2454
2455 /* 2385 /*
2456 * Retry task to preferred node migration periodically, in case it 2386 * Retry task to preferred node migration periodically, in case it
2457 * case it previously failed, or the scheduler moved us. 2387 * case it previously failed, or the scheduler moved us.
2458 */ 2388 */
2459 if (time_after(jiffies, p->numa_migrate_retry)) 2389 if (time_after(jiffies, p->numa_migrate_retry)) {
2390 task_numa_placement(p);
2460 numa_migrate_preferred(p); 2391 numa_migrate_preferred(p);
2392 }
2461 2393
2462 if (migrated) 2394 if (migrated)
2463 p->numa_pages_migrated += pages; 2395 p->numa_pages_migrated += pages;
@@ -2749,19 +2681,6 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2749} while (0) 2681} while (0)
2750 2682
2751#ifdef CONFIG_SMP 2683#ifdef CONFIG_SMP
2752/*
2753 * XXX we want to get rid of these helpers and use the full load resolution.
2754 */
2755static inline long se_weight(struct sched_entity *se)
2756{
2757 return scale_load_down(se->load.weight);
2758}
2759
2760static inline long se_runnable(struct sched_entity *se)
2761{
2762 return scale_load_down(se->runnable_weight);
2763}
2764
2765static inline void 2684static inline void
2766enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) 2685enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
2767{ 2686{
@@ -3062,314 +2981,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq, int flags)
3062} 2981}
3063 2982
3064#ifdef CONFIG_SMP 2983#ifdef CONFIG_SMP
3065/*
3066 * Approximate:
3067 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
3068 */
3069static u64 decay_load(u64 val, u64 n)
3070{
3071 unsigned int local_n;
3072
3073 if (unlikely(n > LOAD_AVG_PERIOD * 63))
3074 return 0;
3075
3076 /* after bounds checking we can collapse to 32-bit */
3077 local_n = n;
3078
3079 /*
3080 * As y^PERIOD = 1/2, we can combine
3081 * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
3082 * With a look-up table which covers y^n (n<PERIOD)
3083 *
3084 * To achieve constant time decay_load.
3085 */
3086 if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
3087 val >>= local_n / LOAD_AVG_PERIOD;
3088 local_n %= LOAD_AVG_PERIOD;
3089 }
3090
3091 val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
3092 return val;
3093}
3094
3095static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
3096{
3097 u32 c1, c2, c3 = d3; /* y^0 == 1 */
3098
3099 /*
3100 * c1 = d1 y^p
3101 */
3102 c1 = decay_load((u64)d1, periods);
3103
3104 /*
3105 * p-1
3106 * c2 = 1024 \Sum y^n
3107 * n=1
3108 *
3109 * inf inf
3110 * = 1024 ( \Sum y^n - \Sum y^n - y^0 )
3111 * n=0 n=p
3112 */
3113 c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
3114
3115 return c1 + c2 + c3;
3116}
3117
3118/*
3119 * Accumulate the three separate parts of the sum; d1 the remainder
3120 * of the last (incomplete) period, d2 the span of full periods and d3
3121 * the remainder of the (incomplete) current period.
3122 *
3123 * d1 d2 d3
3124 * ^ ^ ^
3125 * | | |
3126 * |<->|<----------------->|<--->|
3127 * ... |---x---|------| ... |------|-----x (now)
3128 *
3129 * p-1
3130 * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
3131 * n=1
3132 *
3133 * = u y^p + (Step 1)
3134 *
3135 * p-1
3136 * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2)
3137 * n=1
3138 */
3139static __always_inline u32
3140accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
3141 unsigned long load, unsigned long runnable, int running)
3142{
3143 unsigned long scale_freq, scale_cpu;
3144 u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
3145 u64 periods;
3146
3147 scale_freq = arch_scale_freq_capacity(cpu);
3148 scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
3149
3150 delta += sa->period_contrib;
3151 periods = delta / 1024; /* A period is 1024us (~1ms) */
3152
3153 /*
3154 * Step 1: decay old *_sum if we crossed period boundaries.
3155 */
3156 if (periods) {
3157 sa->load_sum = decay_load(sa->load_sum, periods);
3158 sa->runnable_load_sum =
3159 decay_load(sa->runnable_load_sum, periods);
3160 sa->util_sum = decay_load((u64)(sa->util_sum), periods);
3161
3162 /*
3163 * Step 2
3164 */
3165 delta %= 1024;
3166 contrib = __accumulate_pelt_segments(periods,
3167 1024 - sa->period_contrib, delta);
3168 }
3169 sa->period_contrib = delta;
3170
3171 contrib = cap_scale(contrib, scale_freq);
3172 if (load)
3173 sa->load_sum += load * contrib;
3174 if (runnable)
3175 sa->runnable_load_sum += runnable * contrib;
3176 if (running)
3177 sa->util_sum += contrib * scale_cpu;
3178
3179 return periods;
3180}
3181
3182/*
3183 * We can represent the historical contribution to runnable average as the
3184 * coefficients of a geometric series. To do this we sub-divide our runnable
3185 * history into segments of approximately 1ms (1024us); label the segment that
3186 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
3187 *
3188 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
3189 * p0 p1 p2
3190 * (now) (~1ms ago) (~2ms ago)
3191 *
3192 * Let u_i denote the fraction of p_i that the entity was runnable.
3193 *
3194 * We then designate the fractions u_i as our co-efficients, yielding the
3195 * following representation of historical load:
3196 * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
3197 *
3198 * We choose y based on the with of a reasonably scheduling period, fixing:
3199 * y^32 = 0.5
3200 *
3201 * This means that the contribution to load ~32ms ago (u_32) will be weighted
3202 * approximately half as much as the contribution to load within the last ms
3203 * (u_0).
3204 *
3205 * When a period "rolls over" and we have new u_0`, multiplying the previous
3206 * sum again by y is sufficient to update:
3207 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
3208 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
3209 */
3210static __always_inline int
3211___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
3212 unsigned long load, unsigned long runnable, int running)
3213{
3214 u64 delta;
3215
3216 delta = now - sa->last_update_time;
3217 /*
3218 * This should only happen when time goes backwards, which it
3219 * unfortunately does during sched clock init when we swap over to TSC.
3220 */
3221 if ((s64)delta < 0) {
3222 sa->last_update_time = now;
3223 return 0;
3224 }
3225
3226 /*
3227 * Use 1024ns as the unit of measurement since it's a reasonable
3228 * approximation of 1us and fast to compute.
3229 */
3230 delta >>= 10;
3231 if (!delta)
3232 return 0;
3233
3234 sa->last_update_time += delta << 10;
3235
3236 /*
3237 * running is a subset of runnable (weight) so running can't be set if
3238 * runnable is clear. But there are some corner cases where the current
3239 * se has been already dequeued but cfs_rq->curr still points to it.
3240 * This means that weight will be 0 but not running for a sched_entity
3241 * but also for a cfs_rq if the latter becomes idle. As an example,
3242 * this happens during idle_balance() which calls
3243 * update_blocked_averages()
3244 */
3245 if (!load)
3246 runnable = running = 0;
3247
3248 /*
3249 * Now we know we crossed measurement unit boundaries. The *_avg
3250 * accrues by two steps:
3251 *
3252 * Step 1: accumulate *_sum since last_update_time. If we haven't
3253 * crossed period boundaries, finish.
3254 */
3255 if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
3256 return 0;
3257
3258 return 1;
3259}
3260
3261static __always_inline void
3262___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
3263{
3264 u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
3265
3266 /*
3267 * Step 2: update *_avg.
3268 */
3269 sa->load_avg = div_u64(load * sa->load_sum, divider);
3270 sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
3271 sa->util_avg = sa->util_sum / divider;
3272}
3273
3274/*
3275 * When a task is dequeued, its estimated utilization should not be update if
3276 * its util_avg has not been updated at least once.
3277 * This flag is used to synchronize util_avg updates with util_est updates.
3278 * We map this information into the LSB bit of the utilization saved at
3279 * dequeue time (i.e. util_est.dequeued).
3280 */
3281#define UTIL_AVG_UNCHANGED 0x1
3282
3283static inline void cfs_se_util_change(struct sched_avg *avg)
3284{
3285 unsigned int enqueued;
3286
3287 if (!sched_feat(UTIL_EST))
3288 return;
3289
3290 /* Avoid store if the flag has been already set */
3291 enqueued = avg->util_est.enqueued;
3292 if (!(enqueued & UTIL_AVG_UNCHANGED))
3293 return;
3294
3295 /* Reset flag to report util_avg has been updated */
3296 enqueued &= ~UTIL_AVG_UNCHANGED;
3297 WRITE_ONCE(avg->util_est.enqueued, enqueued);
3298}
3299
3300/*
3301 * sched_entity:
3302 *
3303 * task:
3304 * se_runnable() == se_weight()
3305 *
3306 * group: [ see update_cfs_group() ]
3307 * se_weight() = tg->weight * grq->load_avg / tg->load_avg
3308 * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
3309 *
3310 * load_sum := runnable_sum
3311 * load_avg = se_weight(se) * runnable_avg
3312 *
3313 * runnable_load_sum := runnable_sum
3314 * runnable_load_avg = se_runnable(se) * runnable_avg
3315 *
3316 * XXX collapse load_sum and runnable_load_sum
3317 *
3318 * cfs_rq:
3319 *
3320 * load_sum = \Sum se_weight(se) * se->avg.load_sum
3321 * load_avg = \Sum se->avg.load_avg
3322 *
3323 * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
3324 * runnable_load_avg = \Sum se->avg.runable_load_avg
3325 */
3326
3327static int
3328__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
3329{
3330 if (entity_is_task(se))
3331 se->runnable_weight = se->load.weight;
3332
3333 if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
3334 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
3335 return 1;
3336 }
3337
3338 return 0;
3339}
3340
3341static int
3342__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
3343{
3344 if (entity_is_task(se))
3345 se->runnable_weight = se->load.weight;
3346
3347 if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
3348 cfs_rq->curr == se)) {
3349
3350 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
3351 cfs_se_util_change(&se->avg);
3352 return 1;
3353 }
3354
3355 return 0;
3356}
3357
3358static int
3359__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
3360{
3361 if (___update_load_sum(now, cpu, &cfs_rq->avg,
3362 scale_load_down(cfs_rq->load.weight),
3363 scale_load_down(cfs_rq->runnable_weight),
3364 cfs_rq->curr != NULL)) {
3365
3366 ___update_load_avg(&cfs_rq->avg, 1, 1);
3367 return 1;
3368 }
3369
3370 return 0;
3371}
3372
3373#ifdef CONFIG_FAIR_GROUP_SCHED 2984#ifdef CONFIG_FAIR_GROUP_SCHED
3374/** 2985/**
3375 * update_tg_load_avg - update the tg's load avg 2986 * update_tg_load_avg - update the tg's load avg
@@ -4037,12 +3648,6 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
4037 3648
4038#else /* CONFIG_SMP */ 3649#else /* CONFIG_SMP */
4039 3650
4040static inline int
4041update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
4042{
4043 return 0;
4044}
4045
4046#define UPDATE_TG 0x0 3651#define UPDATE_TG 0x0
4047#define SKIP_AGE_LOAD 0x0 3652#define SKIP_AGE_LOAD 0x0
4048#define DO_ATTACH 0x0 3653#define DO_ATTACH 0x0
@@ -4726,7 +4331,6 @@ static inline int throttled_lb_pair(struct task_group *tg,
4726 throttled_hierarchy(dest_cfs_rq); 4331 throttled_hierarchy(dest_cfs_rq);
4727} 4332}
4728 4333
4729/* updated child weight may affect parent so we have to do this bottom up */
4730static int tg_unthrottle_up(struct task_group *tg, void *data) 4334static int tg_unthrottle_up(struct task_group *tg, void *data)
4731{ 4335{
4732 struct rq *rq = data; 4336 struct rq *rq = data;
@@ -5653,8 +5257,6 @@ static void cpu_load_update(struct rq *this_rq, unsigned long this_load,
5653 5257
5654 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i; 5258 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
5655 } 5259 }
5656
5657 sched_avg_update(this_rq);
5658} 5260}
5659 5261
5660/* Used instead of source_load when we know the type == 0 */ 5262/* Used instead of source_load when we know the type == 0 */
@@ -7294,8 +6896,8 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
7294static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env) 6896static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
7295{ 6897{
7296 struct numa_group *numa_group = rcu_dereference(p->numa_group); 6898 struct numa_group *numa_group = rcu_dereference(p->numa_group);
7297 unsigned long src_faults, dst_faults; 6899 unsigned long src_weight, dst_weight;
7298 int src_nid, dst_nid; 6900 int src_nid, dst_nid, dist;
7299 6901
7300 if (!static_branch_likely(&sched_numa_balancing)) 6902 if (!static_branch_likely(&sched_numa_balancing))
7301 return -1; 6903 return -1;
@@ -7322,18 +6924,19 @@ static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
7322 return 0; 6924 return 0;
7323 6925
7324 /* Leaving a core idle is often worse than degrading locality. */ 6926 /* Leaving a core idle is often worse than degrading locality. */
7325 if (env->idle != CPU_NOT_IDLE) 6927 if (env->idle == CPU_IDLE)
7326 return -1; 6928 return -1;
7327 6929
6930 dist = node_distance(src_nid, dst_nid);
7328 if (numa_group) { 6931 if (numa_group) {
7329 src_faults = group_faults(p, src_nid); 6932 src_weight = group_weight(p, src_nid, dist);
7330 dst_faults = group_faults(p, dst_nid); 6933 dst_weight = group_weight(p, dst_nid, dist);
7331 } else { 6934 } else {
7332 src_faults = task_faults(p, src_nid); 6935 src_weight = task_weight(p, src_nid, dist);
7333 dst_faults = task_faults(p, dst_nid); 6936 dst_weight = task_weight(p, dst_nid, dist);
7334 } 6937 }
7335 6938
7336 return dst_faults < src_faults; 6939 return dst_weight < src_weight;
7337} 6940}
7338 6941
7339#else 6942#else
@@ -7620,6 +7223,22 @@ static inline bool cfs_rq_has_blocked(struct cfs_rq *cfs_rq)
7620 return false; 7223 return false;
7621} 7224}
7622 7225
7226static inline bool others_have_blocked(struct rq *rq)
7227{
7228 if (READ_ONCE(rq->avg_rt.util_avg))
7229 return true;
7230
7231 if (READ_ONCE(rq->avg_dl.util_avg))
7232 return true;
7233
7234#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
7235 if (READ_ONCE(rq->avg_irq.util_avg))
7236 return true;
7237#endif
7238
7239 return false;
7240}
7241
7623#ifdef CONFIG_FAIR_GROUP_SCHED 7242#ifdef CONFIG_FAIR_GROUP_SCHED
7624 7243
7625static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) 7244static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
@@ -7679,6 +7298,12 @@ static void update_blocked_averages(int cpu)
7679 if (cfs_rq_has_blocked(cfs_rq)) 7298 if (cfs_rq_has_blocked(cfs_rq))
7680 done = false; 7299 done = false;
7681 } 7300 }
7301 update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
7302 update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
7303 update_irq_load_avg(rq, 0);
7304 /* Don't need periodic decay once load/util_avg are null */
7305 if (others_have_blocked(rq))
7306 done = false;
7682 7307
7683#ifdef CONFIG_NO_HZ_COMMON 7308#ifdef CONFIG_NO_HZ_COMMON
7684 rq->last_blocked_load_update_tick = jiffies; 7309 rq->last_blocked_load_update_tick = jiffies;
@@ -7744,9 +7369,12 @@ static inline void update_blocked_averages(int cpu)
7744 rq_lock_irqsave(rq, &rf); 7369 rq_lock_irqsave(rq, &rf);
7745 update_rq_clock(rq); 7370 update_rq_clock(rq);
7746 update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq); 7371 update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
7372 update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
7373 update_dl_rq_load_avg(rq_clock_task(rq), rq, 0);
7374 update_irq_load_avg(rq, 0);
7747#ifdef CONFIG_NO_HZ_COMMON 7375#ifdef CONFIG_NO_HZ_COMMON
7748 rq->last_blocked_load_update_tick = jiffies; 7376 rq->last_blocked_load_update_tick = jiffies;
7749 if (!cfs_rq_has_blocked(cfs_rq)) 7377 if (!cfs_rq_has_blocked(cfs_rq) && !others_have_blocked(rq))
7750 rq->has_blocked_load = 0; 7378 rq->has_blocked_load = 0;
7751#endif 7379#endif
7752 rq_unlock_irqrestore(rq, &rf); 7380 rq_unlock_irqrestore(rq, &rf);
@@ -7856,39 +7484,32 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
7856static unsigned long scale_rt_capacity(int cpu) 7484static unsigned long scale_rt_capacity(int cpu)
7857{ 7485{
7858 struct rq *rq = cpu_rq(cpu); 7486 struct rq *rq = cpu_rq(cpu);
7859 u64 total, used, age_stamp, avg; 7487 unsigned long max = arch_scale_cpu_capacity(NULL, cpu);
7860 s64 delta; 7488 unsigned long used, free;
7489 unsigned long irq;
7861 7490
7862 /* 7491 irq = cpu_util_irq(rq);
7863 * Since we're reading these variables without serialization make sure
7864 * we read them once before doing sanity checks on them.
7865 */
7866 age_stamp = READ_ONCE(rq->age_stamp);
7867 avg = READ_ONCE(rq->rt_avg);
7868 delta = __rq_clock_broken(rq) - age_stamp;
7869 7492
7870 if (unlikely(delta < 0)) 7493 if (unlikely(irq >= max))
7871 delta = 0; 7494 return 1;
7872 7495
7873 total = sched_avg_period() + delta; 7496 used = READ_ONCE(rq->avg_rt.util_avg);
7497 used += READ_ONCE(rq->avg_dl.util_avg);
7874 7498
7875 used = div_u64(avg, total); 7499 if (unlikely(used >= max))
7500 return 1;
7876 7501
7877 if (likely(used < SCHED_CAPACITY_SCALE)) 7502 free = max - used;
7878 return SCHED_CAPACITY_SCALE - used;
7879 7503
7880 return 1; 7504 return scale_irq_capacity(free, irq, max);
7881} 7505}
7882 7506
7883static void update_cpu_capacity(struct sched_domain *sd, int cpu) 7507static void update_cpu_capacity(struct sched_domain *sd, int cpu)
7884{ 7508{
7885 unsigned long capacity = arch_scale_cpu_capacity(sd, cpu); 7509 unsigned long capacity = scale_rt_capacity(cpu);
7886 struct sched_group *sdg = sd->groups; 7510 struct sched_group *sdg = sd->groups;
7887 7511
7888 cpu_rq(cpu)->cpu_capacity_orig = capacity; 7512 cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(sd, cpu);
7889
7890 capacity *= scale_rt_capacity(cpu);
7891 capacity >>= SCHED_CAPACITY_SHIFT;
7892 7513
7893 if (!capacity) 7514 if (!capacity)
7894 capacity = 1; 7515 capacity = 1;
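
For illustration, a standalone sketch of the reworked capacity estimate with assumed PELT values; CFS gets what is left of the CPU after RT and DL utilization, further reduced by the fraction of time stolen by IRQs:

#include <stdio.h>

/* capacity = (max - rt_util - dl_util) * (max - irq_util) / max,
 * clamped to 1 in the degenerate cases, mirroring the new flow above. */
int main(void)
{
	unsigned long max = 1024;			/* assumed CPU scale */
	unsigned long rt = 100, dl = 50, irq = 64;	/* assumed PELT averages */

	unsigned long used = rt + dl;
	if (irq >= max || used >= max) {
		printf("capacity = 1\n");
		return 0;
	}

	unsigned long free = max - used;
	unsigned long capacity = free * (max - irq) / max;
	printf("capacity left for CFS: %lu of %lu\n", capacity, max);
	return 0;	/* 874 * 960 / 1024 = 819 */
}
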
diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c
new file mode 100644
index 000000000000..35475c0c5419
--- /dev/null
+++ b/kernel/sched/pelt.c
@@ -0,0 +1,399 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Per Entity Load Tracking
4 *
5 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
6 *
7 * Interactivity improvements by Mike Galbraith
8 * (C) 2007 Mike Galbraith <efault@gmx.de>
9 *
10 * Various enhancements by Dmitry Adamushko.
11 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
12 *
13 * Group scheduling enhancements by Srivatsa Vaddagiri
14 * Copyright IBM Corporation, 2007
15 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
16 *
17 * Scaled math optimizations by Thomas Gleixner
18 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
19 *
20 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
21 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
22 *
23 * Move PELT related code from fair.c into this pelt.c file
24 * Author: Vincent Guittot <vincent.guittot@linaro.org>
25 */
26
27#include <linux/sched.h>
28#include "sched.h"
29#include "sched-pelt.h"
30#include "pelt.h"
31
32/*
33 * Approximate:
34 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
35 */
36static u64 decay_load(u64 val, u64 n)
37{
38 unsigned int local_n;
39
40 if (unlikely(n > LOAD_AVG_PERIOD * 63))
41 return 0;
42
43 /* after bounds checking we can collapse to 32-bit */
44 local_n = n;
45
46 /*
47 * As y^PERIOD = 1/2, we can combine
48 * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
49 * With a look-up table which covers y^n (n<PERIOD)
50 *
51 * To achieve constant time decay_load.
52 */
53 if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
54 val >>= local_n / LOAD_AVG_PERIOD;
55 local_n %= LOAD_AVG_PERIOD;
56 }
57
58 val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
59 return val;
60}
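
A floating-point userspace sketch (not the kernel's fixed-point table) of why the decay is constant time: with y^32 = 0.5, every full block of 32 periods is a halving, i.e. a shift, and only the remainder needs one lookup:

#include <stdio.h>
#include <math.h>

/* val * y^n with y^32 = 0.5: handle the n/32 full half-life blocks with a
 * divide-by-power-of-two, then the n%32 remainder with one lookup
 * (approximated here with pow()). n is assumed < 32*64, as in decay_load().
 * Build with -lm. */
static double decay(double val, unsigned int n)
{
	const unsigned int period = 32;

	val /= (double)(1ULL << (n / period));		      /* y^(32k) == 1/2^k */
	return val * pow(0.5, (double)(n % period) / period); /* y^(n%32) */
}

int main(void)
{
	printf("1024 after  32 periods: %.1f\n", decay(1024.0, 32));
	printf("1024 after 100 periods: %.1f\n", decay(1024.0, 100));
	return 0;	/* ~512.0 and ~117.4 */
}
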
61
62static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
63{
64 u32 c1, c2, c3 = d3; /* y^0 == 1 */
65
66 /*
67 * c1 = d1 y^p
68 */
69 c1 = decay_load((u64)d1, periods);
70
71 /*
72 * p-1
73 * c2 = 1024 \Sum y^n
74 * n=1
75 *
76 * inf inf
77 * = 1024 ( \Sum y^n - \Sum y^n - y^0 )
78 * n=0 n=p
79 */
80 c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
81
82 return c1 + c2 + c3;
83}
84
85#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
86
87/*
88 * Accumulate the three separate parts of the sum; d1 the remainder
89 * of the last (incomplete) period, d2 the span of full periods and d3
90 * the remainder of the (incomplete) current period.
91 *
92 * d1 d2 d3
93 * ^ ^ ^
94 * | | |
95 * |<->|<----------------->|<--->|
96 * ... |---x---|------| ... |------|-----x (now)
97 *
98 * p-1
99 * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
100 * n=1
101 *
102 * = u y^p + (Step 1)
103 *
104 * p-1
105 * d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2)
106 * n=1
107 */
108static __always_inline u32
109accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
110 unsigned long load, unsigned long runnable, int running)
111{
112 unsigned long scale_freq, scale_cpu;
113 u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
114 u64 periods;
115
116 scale_freq = arch_scale_freq_capacity(cpu);
117 scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
118
119 delta += sa->period_contrib;
120 periods = delta / 1024; /* A period is 1024us (~1ms) */
121
122 /*
123 * Step 1: decay old *_sum if we crossed period boundaries.
124 */
125 if (periods) {
126 sa->load_sum = decay_load(sa->load_sum, periods);
127 sa->runnable_load_sum =
128 decay_load(sa->runnable_load_sum, periods);
129 sa->util_sum = decay_load((u64)(sa->util_sum), periods);
130
131 /*
132 * Step 2
133 */
134 delta %= 1024;
135 contrib = __accumulate_pelt_segments(periods,
136 1024 - sa->period_contrib, delta);
137 }
138 sa->period_contrib = delta;
139
140 contrib = cap_scale(contrib, scale_freq);
141 if (load)
142 sa->load_sum += load * contrib;
143 if (runnable)
144 sa->runnable_load_sum += runnable * contrib;
145 if (running)
146 sa->util_sum += contrib * scale_cpu;
147
148 return periods;
149}
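
A standalone sketch, with invented numbers, of how a new delta is split into the d1/d2/d3 segments drawn in the comment above:

#include <stdio.h>

/* d1 completes the previously started 1024us period, d2 spans the whole
 * periods in between, d3 is the tail that becomes the new period_contrib. */
int main(void)
{
	unsigned int period_contrib = 300;	/* us already accrued in p0 */
	unsigned int delta = 3000;		/* new time to account, in us */

	unsigned int total = period_contrib + delta;
	unsigned int periods = total / 1024;	/* boundaries crossed */
	unsigned int d1 = periods ? 1024 - period_contrib : 0;
	unsigned int d3 = total % 1024;		/* new period_contrib */
	unsigned int d2 = periods ? delta - d1 - d3 : delta;

	printf("periods=%u d1=%u d2=%u d3=%u\n", periods, d1, d2, d3);
	return 0;	/* periods=3 d1=724 d2=2048 d3=228 */
}
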
150
151/*
152 * We can represent the historical contribution to runnable average as the
153 * coefficients of a geometric series. To do this we sub-divide our runnable
154 * history into segments of approximately 1ms (1024us); label the segment that
155 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
156 *
157 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
158 * p0 p1 p2
159 * (now) (~1ms ago) (~2ms ago)
160 *
161 * Let u_i denote the fraction of p_i that the entity was runnable.
162 *
163 * We then designate the fractions u_i as our co-efficients, yielding the
164 * following representation of historical load:
165 * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
166 *
167 * We choose y based on the width of a reasonable scheduling period, fixing:
168 * y^32 = 0.5
169 *
170 * This means that the contribution to load ~32ms ago (u_32) will be weighted
171 * approximately half as much as the contribution to load within the last ms
172 * (u_0).
173 *
174 * When a period "rolls over" and we have new u_0`, multiplying the previous
175 * sum again by y is sufficient to update:
176 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
177 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
178 */
179static __always_inline int
180___update_load_sum(u64 now, int cpu, struct sched_avg *sa,
181 unsigned long load, unsigned long runnable, int running)
182{
183 u64 delta;
184
185 delta = now - sa->last_update_time;
186 /*
187 * This should only happen when time goes backwards, which it
188 * unfortunately does during sched clock init when we swap over to TSC.
189 */
190 if ((s64)delta < 0) {
191 sa->last_update_time = now;
192 return 0;
193 }
194
195 /*
196 * Use 1024ns as the unit of measurement since it's a reasonable
197 * approximation of 1us and fast to compute.
198 */
199 delta >>= 10;
200 if (!delta)
201 return 0;
202
203 sa->last_update_time += delta << 10;
204
205 /*
206 * running is a subset of runnable (weight) so running can't be set if
207 * runnable is clear. But there are some corner cases where the current
208 * se has been already dequeued but cfs_rq->curr still points to it.
209 * This means that weight will be 0 but not running for a sched_entity
210 * but also for a cfs_rq if the latter becomes idle. As an example,
211 * this happens during idle_balance() which calls
212 * update_blocked_averages()
213 */
214 if (!load)
215 runnable = running = 0;
216
217 /*
218 * Now we know we crossed measurement unit boundaries. The *_avg
219 * accrues by two steps:
220 *
221 * Step 1: accumulate *_sum since last_update_time. If we haven't
222 * crossed period boundaries, finish.
223 */
224 if (!accumulate_sum(delta, cpu, sa, load, runnable, running))
225 return 0;
226
227 return 1;
228}
229
230static __always_inline void
231___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runnable)
232{
233 u32 divider = LOAD_AVG_MAX - 1024 + sa->period_contrib;
234
235 /*
236 * Step 2: update *_avg.
237 */
238 sa->load_avg = div_u64(load * sa->load_sum, divider);
239 sa->runnable_load_avg = div_u64(runnable * sa->runnable_load_sum, divider);
240 WRITE_ONCE(sa->util_avg, sa->util_sum / divider);
241}
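
A floating-point sketch of why the divider is LOAD_AVG_MAX - 1024 + period_contrib: it is the largest value the sum can have reached given the current partial period, so a task that never stopped running decodes to util_avg equal to the CPU scale. The 400-iteration warm-up and the 512us partial period are arbitrary:

#include <stdio.h>
#include <math.h>

/* Saturate the sum as if the entity ran flat out, then divide by the
 * maximum reachable sum for this period_contrib. Build with -lm. */
int main(void)
{
	const double y = pow(0.5, 1.0 / 32.0);	/* y^32 = 0.5 */
	const double scale_cpu = 1024.0;	/* assumed CPU capacity */
	double period_contrib = 512.0;		/* half-way through a period */
	double util_sum = 0.0;

	for (int i = 0; i < 400; i++)		/* fully busy, whole periods */
		util_sum = util_sum * y + 1024.0 * scale_cpu;
	util_sum = util_sum * y + period_contrib * scale_cpu; /* partial one */

	double load_avg_max = 1024.0 / (1.0 - y);	/* ~47742 in the kernel */
	double divider = load_avg_max - 1024.0 + period_contrib;

	printf("util_avg ~= %.0f\n", util_sum / divider);	/* ~1024 */
	return 0;
}
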
242
243/*
244 * sched_entity:
245 *
246 * task:
247 * se_runnable() == se_weight()
248 *
249 * group: [ see update_cfs_group() ]
250 * se_weight() = tg->weight * grq->load_avg / tg->load_avg
251 * se_runnable() = se_weight(se) * grq->runnable_load_avg / grq->load_avg
252 *
253 * load_sum := runnable_sum
254 * load_avg = se_weight(se) * runnable_avg
255 *
256 * runnable_load_sum := runnable_sum
257 * runnable_load_avg = se_runnable(se) * runnable_avg
258 *
259 * XXX collapse load_sum and runnable_load_sum
260 *
261 * cfs_rq:
262 *
263 * load_sum = \Sum se_weight(se) * se->avg.load_sum
264 * load_avg = \Sum se->avg.load_avg
265 *
266 * runnable_load_sum = \Sum se_runnable(se) * se->avg.runnable_load_sum
267 * runnable_load_avg = \Sum se->avg.runnable_load_avg
268 */
269
270int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
271{
272 if (entity_is_task(se))
273 se->runnable_weight = se->load.weight;
274
275 if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
276 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
277 return 1;
278 }
279
280 return 0;
281}
282
283int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
284{
285 if (entity_is_task(se))
286 se->runnable_weight = se->load.weight;
287
288 if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
289 cfs_rq->curr == se)) {
290
291 ___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
292 cfs_se_util_change(&se->avg);
293 return 1;
294 }
295
296 return 0;
297}
298
299int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
300{
301 if (___update_load_sum(now, cpu, &cfs_rq->avg,
302 scale_load_down(cfs_rq->load.weight),
303 scale_load_down(cfs_rq->runnable_weight),
304 cfs_rq->curr != NULL)) {
305
306 ___update_load_avg(&cfs_rq->avg, 1, 1);
307 return 1;
308 }
309
310 return 0;
311}
312
313/*
314 * rt_rq:
315 *
316 * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
317 * util_sum = cpu_scale * load_sum
318 * runnable_load_sum = load_sum
319 *
320 * load_avg and runnable_load_avg are not supported and meaningless.
321 *
322 */
323
324int update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
325{
326 if (___update_load_sum(now, rq->cpu, &rq->avg_rt,
327 running,
328 running,
329 running)) {
330
331 ___update_load_avg(&rq->avg_rt, 1, 1);
332 return 1;
333 }
334
335 return 0;
336}
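
A standalone sketch of what the rq-level RT signal converges to: since only a running/not-running flag is fed in, util_avg settles near (RT duty cycle) * CPU scale. The 25% duty cycle and the iteration count are invented:

#include <stdio.h>
#include <math.h>

/* Feed the PELT recurrence one fully busy 1024us period out of every four
 * and read back roughly a quarter of the 1024 scale. Build with -lm. */
int main(void)
{
	const double y = pow(0.5, 1.0 / 32.0);	/* y^32 = 0.5 */
	double util_sum = 0.0;

	for (int period = 0; period < 2000; period++) {
		int running = (period % 4 == 0);	/* 25% duty cycle */
		util_sum = util_sum * y + (running ? 1024.0 * 1024.0 : 0.0);
	}

	double divider = 1024.0 / (1.0 - y) - 1024.0;	/* period_contrib == 0 */
	printf("rt util_avg ~= %.0f (about 25%% of 1024)\n",
	       util_sum / divider);
	return 0;
}
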
337
338/*
339 * dl_rq:
340 *
341 * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
342 * util_sum = cpu_scale * load_sum
343 * runnable_load_sum = load_sum
344 *
345 */
346
347int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
348{
349 if (___update_load_sum(now, rq->cpu, &rq->avg_dl,
350 running,
351 running,
352 running)) {
353
354 ___update_load_avg(&rq->avg_dl, 1, 1);
355 return 1;
356 }
357
358 return 0;
359}
360
361#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
362/*
363 * irq:
364 *
365 * util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
366 * util_sum = cpu_scale * load_sum
367 * runnable_load_sum = load_sum
368 *
369 */
370
371int update_irq_load_avg(struct rq *rq, u64 running)
372{
373 int ret = 0;
374 /*
375 * We know the time that has been used by interrupt since last update
376 * but we don't know when. Let's be pessimistic and assume that the
377 * interrupt happened just before the update. This is not so far from
378 * reality because the interrupt will most probably wake up a task and
379 * trigger an update of rq clock during which the metric is updated.
380 * We start to decay with normal context time and then we add the
381 * interrupt context time.
382 * We can safely remove running from rq->clock because
383 * rq->clock += delta with delta >= running
384 */
385 ret = ___update_load_sum(rq->clock - running, rq->cpu, &rq->avg_irq,
386 0,
387 0,
388 0);
389 ret += ___update_load_sum(rq->clock, rq->cpu, &rq->avg_irq,
390 1,
391 1,
392 1);
393
394 if (ret)
395 ___update_load_avg(&rq->avg_irq, 1, 1);
396
397 return ret;
398}
399#endif
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
new file mode 100644
index 000000000000..d2894db28955
--- /dev/null
+++ b/kernel/sched/pelt.h
@@ -0,0 +1,72 @@
1#ifdef CONFIG_SMP
2
3int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se);
4int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se);
5int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq);
6int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
7int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
8
9#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
10int update_irq_load_avg(struct rq *rq, u64 running);
11#else
12static inline int
13update_irq_load_avg(struct rq *rq, u64 running)
14{
15 return 0;
16}
17#endif
18
19/*
20 * When a task is dequeued, its estimated utilization should not be updated if
21 * its util_avg has not been updated at least once.
22 * This flag is used to synchronize util_avg updates with util_est updates.
23 * We map this information into the LSB bit of the utilization saved at
24 * dequeue time (i.e. util_est.dequeued).
25 */
26#define UTIL_AVG_UNCHANGED 0x1
27
28static inline void cfs_se_util_change(struct sched_avg *avg)
29{
30 unsigned int enqueued;
31
32 if (!sched_feat(UTIL_EST))
33 return;
34
35 /* Avoid store if the flag has been already set */
36 enqueued = avg->util_est.enqueued;
37 if (!(enqueued & UTIL_AVG_UNCHANGED))
38 return;
39
40 /* Reset flag to report util_avg has been updated */
41 enqueued &= ~UTIL_AVG_UNCHANGED;
42 WRITE_ONCE(avg->util_est.enqueued, enqueued);
43}
44
45#else
46
47static inline int
48update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
49{
50 return 0;
51}
52
53static inline int
54update_rt_rq_load_avg(u64 now, struct rq *rq, int running)
55{
56 return 0;
57}
58
59static inline int
60update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
61{
62 return 0;
63}
64
65static inline int
66update_irq_load_avg(struct rq *rq, u64 running)
67{
68 return 0;
69}
70#endif
71
72
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index eaaec8364f96..2e2955a8cf8f 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -5,6 +5,8 @@
5 */ 5 */
6#include "sched.h" 6#include "sched.h"
7 7
8#include "pelt.h"
9
8int sched_rr_timeslice = RR_TIMESLICE; 10int sched_rr_timeslice = RR_TIMESLICE;
9int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; 11int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
10 12
@@ -973,8 +975,6 @@ static void update_curr_rt(struct rq *rq)
973 curr->se.exec_start = now; 975 curr->se.exec_start = now;
974 cgroup_account_cputime(curr, delta_exec); 976 cgroup_account_cputime(curr, delta_exec);
975 977
976 sched_rt_avg_update(rq, delta_exec);
977
978 if (!rt_bandwidth_enabled()) 978 if (!rt_bandwidth_enabled())
979 return; 979 return;
980 980
@@ -1578,6 +1578,14 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
1578 1578
1579 rt_queue_push_tasks(rq); 1579 rt_queue_push_tasks(rq);
1580 1580
1581 /*
1582 * If prev task was rt, put_prev_task() has already updated the
1583 * utilization. We only care about the case where we start to schedule
1584 * an rt task
1585 */
1586 if (rq->curr->sched_class != &rt_sched_class)
1587 update_rt_rq_load_avg(rq_clock_task(rq), rq, 0);
1588
1581 return p; 1589 return p;
1582} 1590}
1583 1591
@@ -1585,6 +1593,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1585{ 1593{
1586 update_curr_rt(rq); 1594 update_curr_rt(rq);
1587 1595
1596 update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);
1597
1588 /* 1598 /*
1589 * The previous task needs to be made eligible for pushing 1599 * The previous task needs to be made eligible for pushing
1590 * if it is still active 1600 * if it is still active
@@ -2314,6 +2324,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2314 struct sched_rt_entity *rt_se = &p->rt; 2324 struct sched_rt_entity *rt_se = &p->rt;
2315 2325
2316 update_curr_rt(rq); 2326 update_curr_rt(rq);
2327 update_rt_rq_load_avg(rq_clock_task(rq), rq, 1);
2317 2328
2318 watchdog(rq, p); 2329 watchdog(rq, p);
2319 2330
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c7742dcc136c..4a2e8cae63c4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -594,6 +594,7 @@ struct rt_rq {
594 unsigned long rt_nr_total; 594 unsigned long rt_nr_total;
595 int overloaded; 595 int overloaded;
596 struct plist_head pushable_tasks; 596 struct plist_head pushable_tasks;
597
597#endif /* CONFIG_SMP */ 598#endif /* CONFIG_SMP */
598 int rt_queued; 599 int rt_queued;
599 600
@@ -673,7 +674,26 @@ struct dl_rq {
673 u64 bw_ratio; 674 u64 bw_ratio;
674}; 675};
675 676
677#ifdef CONFIG_FAIR_GROUP_SCHED
678/* An entity is a task if it doesn't "own" a runqueue */
679#define entity_is_task(se) (!se->my_q)
680#else
681#define entity_is_task(se) 1
682#endif
683
676#ifdef CONFIG_SMP 684#ifdef CONFIG_SMP
685/*
686 * XXX we want to get rid of these helpers and use the full load resolution.
687 */
688static inline long se_weight(struct sched_entity *se)
689{
690 return scale_load_down(se->load.weight);
691}
692
693static inline long se_runnable(struct sched_entity *se)
694{
695 return scale_load_down(se->runnable_weight);
696}
677 697
678static inline bool sched_asym_prefer(int a, int b) 698static inline bool sched_asym_prefer(int a, int b)
679{ 699{
@@ -833,8 +853,12 @@ struct rq {
833 853
834 struct list_head cfs_tasks; 854 struct list_head cfs_tasks;
835 855
836 u64 rt_avg; 856 struct sched_avg avg_rt;
837 u64 age_stamp; 857 struct sched_avg avg_dl;
858#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
859#define HAVE_SCHED_AVG_IRQ
860 struct sched_avg avg_irq;
861#endif
838 u64 idle_stamp; 862 u64 idle_stamp;
839 u64 avg_idle; 863 u64 avg_idle;
840 864
@@ -1075,7 +1099,8 @@ enum numa_faults_stats {
1075}; 1099};
1076extern void sched_setnuma(struct task_struct *p, int node); 1100extern void sched_setnuma(struct task_struct *p, int node);
1077extern int migrate_task_to(struct task_struct *p, int cpu); 1101extern int migrate_task_to(struct task_struct *p, int cpu);
1078extern int migrate_swap(struct task_struct *, struct task_struct *); 1102extern int migrate_swap(struct task_struct *p, struct task_struct *t,
1103 int cpu, int scpu);
1079extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p); 1104extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p);
1080#else 1105#else
1081static inline void 1106static inline void
@@ -1690,15 +1715,9 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
1690 1715
1691extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); 1716extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
1692 1717
1693extern const_debug unsigned int sysctl_sched_time_avg;
1694extern const_debug unsigned int sysctl_sched_nr_migrate; 1718extern const_debug unsigned int sysctl_sched_nr_migrate;
1695extern const_debug unsigned int sysctl_sched_migration_cost; 1719extern const_debug unsigned int sysctl_sched_migration_cost;
1696 1720
1697static inline u64 sched_avg_period(void)
1698{
1699 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1700}
1701
1702#ifdef CONFIG_SCHED_HRTICK 1721#ifdef CONFIG_SCHED_HRTICK
1703 1722
1704/* 1723/*
@@ -1735,8 +1754,6 @@ unsigned long arch_scale_freq_capacity(int cpu)
1735#endif 1754#endif
1736 1755
1737#ifdef CONFIG_SMP 1756#ifdef CONFIG_SMP
1738extern void sched_avg_update(struct rq *rq);
1739
1740#ifndef arch_scale_cpu_capacity 1757#ifndef arch_scale_cpu_capacity
1741static __always_inline 1758static __always_inline
1742unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) 1759unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
@@ -1747,12 +1764,6 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
1747 return SCHED_CAPACITY_SCALE; 1764 return SCHED_CAPACITY_SCALE;
1748} 1765}
1749#endif 1766#endif
1750
1751static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1752{
1753 rq->rt_avg += rt_delta * arch_scale_freq_capacity(cpu_of(rq));
1754 sched_avg_update(rq);
1755}
1756#else 1767#else
1757#ifndef arch_scale_cpu_capacity 1768#ifndef arch_scale_cpu_capacity
1758static __always_inline 1769static __always_inline
@@ -1761,8 +1772,6 @@ unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu)
1761 return SCHED_CAPACITY_SCALE; 1772 return SCHED_CAPACITY_SCALE;
1762} 1773}
1763#endif 1774#endif
1764static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
1765static inline void sched_avg_update(struct rq *rq) { }
1766#endif 1775#endif
1767 1776
1768struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf) 1777struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
@@ -2177,11 +2186,16 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
2177#endif 2186#endif
2178 2187
2179#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL 2188#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
2180static inline unsigned long cpu_util_dl(struct rq *rq) 2189static inline unsigned long cpu_bw_dl(struct rq *rq)
2181{ 2190{
2182 return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; 2191 return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
2183} 2192}
2184 2193
2194static inline unsigned long cpu_util_dl(struct rq *rq)
2195{
2196 return READ_ONCE(rq->avg_dl.util_avg);
2197}
2198
2185static inline unsigned long cpu_util_cfs(struct rq *rq) 2199static inline unsigned long cpu_util_cfs(struct rq *rq)
2186{ 2200{
2187 unsigned long util = READ_ONCE(rq->cfs.avg.util_avg); 2201 unsigned long util = READ_ONCE(rq->cfs.avg.util_avg);
@@ -2193,4 +2207,37 @@ static inline unsigned long cpu_util_cfs(struct rq *rq)
2193 2207
2194 return util; 2208 return util;
2195} 2209}
2210
2211static inline unsigned long cpu_util_rt(struct rq *rq)
2212{
2213 return READ_ONCE(rq->avg_rt.util_avg);
2214}
2215#endif
2216
2217#ifdef HAVE_SCHED_AVG_IRQ
2218static inline unsigned long cpu_util_irq(struct rq *rq)
2219{
2220 return rq->avg_irq.util_avg;
2221}
2222
2223static inline
2224unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max)
2225{
2226 util *= (max - irq);
2227 util /= max;
2228
2229 return util;
2230
2231}
2232#else
2233static inline unsigned long cpu_util_irq(struct rq *rq)
2234{
2235 return 0;
2236}
2237
2238static inline
2239unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max)
2240{
2241 return util;
2242}
2196#endif 2243#endif
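
A minimal sketch of the scale_irq_capacity() helper added above, with assumed numbers; the remaining capacity is scaled by the fraction of time not spent in interrupt context:

#include <stdio.h>

/* util * (max - irq) / max: if IRQs ate 1/8 of the CPU, only 7/8 of the
 * leftover capacity is really usable by the scheduler classes. */
static unsigned long scale_irq_capacity(unsigned long util,
					unsigned long irq, unsigned long max)
{
	util *= (max - irq);
	util /= max;
	return util;
}

int main(void)
{
	unsigned long max = 1024;	/* assumed CPU scale */
	unsigned long free = 800;	/* capacity left after RT/DL */
	unsigned long irq = 128;	/* IRQ utilization: 1/8 of the time */

	printf("usable capacity: %lu\n", scale_irq_capacity(free, irq, max));
	return 0;	/* 800 * 896 / 1024 = 700 */
}
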
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index b6fb2c3b3ff7..66b59ac77c22 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -32,7 +32,7 @@ void swake_up_locked(struct swait_queue_head *q)
32} 32}
33EXPORT_SYMBOL(swake_up_locked); 33EXPORT_SYMBOL(swake_up_locked);
34 34
35void swake_up(struct swait_queue_head *q) 35void swake_up_one(struct swait_queue_head *q)
36{ 36{
37 unsigned long flags; 37 unsigned long flags;
38 38
@@ -40,7 +40,7 @@ void swake_up(struct swait_queue_head *q)
40 swake_up_locked(q); 40 swake_up_locked(q);
41 raw_spin_unlock_irqrestore(&q->lock, flags); 41 raw_spin_unlock_irqrestore(&q->lock, flags);
42} 42}
43EXPORT_SYMBOL(swake_up); 43EXPORT_SYMBOL(swake_up_one);
44 44
45/* 45/*
46 * Does not allow usage from IRQ disabled, since we must be able to 46 * Does not allow usage from IRQ disabled, since we must be able to
@@ -69,14 +69,14 @@ void swake_up_all(struct swait_queue_head *q)
69} 69}
70EXPORT_SYMBOL(swake_up_all); 70EXPORT_SYMBOL(swake_up_all);
71 71
72void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) 72static void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
73{ 73{
74 wait->task = current; 74 wait->task = current;
75 if (list_empty(&wait->task_list)) 75 if (list_empty(&wait->task_list))
76 list_add(&wait->task_list, &q->task_list); 76 list_add_tail(&wait->task_list, &q->task_list);
77} 77}
78 78
79void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state) 79void prepare_to_swait_exclusive(struct swait_queue_head *q, struct swait_queue *wait, int state)
80{ 80{
81 unsigned long flags; 81 unsigned long flags;
82 82
@@ -85,16 +85,28 @@ void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int
85 set_current_state(state); 85 set_current_state(state);
86 raw_spin_unlock_irqrestore(&q->lock, flags); 86 raw_spin_unlock_irqrestore(&q->lock, flags);
87} 87}
88EXPORT_SYMBOL(prepare_to_swait); 88EXPORT_SYMBOL(prepare_to_swait_exclusive);
89 89
90long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state) 90long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state)
91{ 91{
92 if (signal_pending_state(state, current)) 92 unsigned long flags;
93 return -ERESTARTSYS; 93 long ret = 0;
94 94
95 prepare_to_swait(q, wait, state); 95 raw_spin_lock_irqsave(&q->lock, flags);
96 if (unlikely(signal_pending_state(state, current))) {
97 /*
98 * See prepare_to_wait_event(). TL;DR, subsequent swake_up_one()
99 * must not see us.
100 */
101 list_del_init(&wait->task_list);
102 ret = -ERESTARTSYS;
103 } else {
104 __prepare_to_swait(q, wait);
105 set_current_state(state);
106 }
107 raw_spin_unlock_irqrestore(&q->lock, flags);
96 108
97 return 0; 109 return ret;
98} 110}
99EXPORT_SYMBOL(prepare_to_swait_event); 111EXPORT_SYMBOL(prepare_to_swait_event);
100 112
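
At a call site, the renamed exclusive swait API reads as in the sketch below. This is a hedged illustration, not code from this series: the queue, flag and functions are hypothetical, but the waiter/waker pairing mirrors the KVM users converted further down in this diff.

#include <linux/swait.h>

static DECLARE_SWAIT_QUEUE_HEAD(demo_wq);       /* hypothetical queue */
static bool demo_ready;                         /* hypothetical condition */

/*
 * Waiter: sleep until demo_ready is set or a signal arrives.  The
 * _exclusive suffix documents that swait waiters are always enqueued as
 * exclusive (tail) waiters, matching the list_add_tail() change above.
 */
static int demo_wait(void)
{
        return swait_event_interruptible_exclusive(demo_wq, demo_ready);
}

/* Waker: publish the condition first, then wake exactly one waiter. */
static void demo_wake(void)
{
        demo_ready = true;
        swake_up_one(&demo_wq);
}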
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 5043e7433f4b..c230c2dd48e1 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -238,8 +238,7 @@ int smpboot_unpark_threads(unsigned int cpu)
238 238
239 mutex_lock(&smpboot_threads_lock); 239 mutex_lock(&smpboot_threads_lock);
240 list_for_each_entry(cur, &hotplug_threads, list) 240 list_for_each_entry(cur, &hotplug_threads, list)
241 if (cpumask_test_cpu(cpu, cur->cpumask)) 241 smpboot_unpark_thread(cur, cpu);
242 smpboot_unpark_thread(cur, cpu);
243 mutex_unlock(&smpboot_threads_lock); 242 mutex_unlock(&smpboot_threads_lock);
244 return 0; 243 return 0;
245} 244}
@@ -280,34 +279,26 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
280} 279}
281 280
282/** 281/**
283 * smpboot_register_percpu_thread_cpumask - Register a per_cpu thread related 282 * smpboot_register_percpu_thread - Register a per_cpu thread related
284 * to hotplug 283 * to hotplug
285 * @plug_thread: Hotplug thread descriptor 284 * @plug_thread: Hotplug thread descriptor
286 * @cpumask: The cpumask where threads run
287 * 285 *
288 * Creates and starts the threads on all online cpus. 286 * Creates and starts the threads on all online cpus.
289 */ 287 */
290int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread, 288int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
291 const struct cpumask *cpumask)
292{ 289{
293 unsigned int cpu; 290 unsigned int cpu;
294 int ret = 0; 291 int ret = 0;
295 292
296 if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL))
297 return -ENOMEM;
298 cpumask_copy(plug_thread->cpumask, cpumask);
299
300 get_online_cpus(); 293 get_online_cpus();
301 mutex_lock(&smpboot_threads_lock); 294 mutex_lock(&smpboot_threads_lock);
302 for_each_online_cpu(cpu) { 295 for_each_online_cpu(cpu) {
303 ret = __smpboot_create_thread(plug_thread, cpu); 296 ret = __smpboot_create_thread(plug_thread, cpu);
304 if (ret) { 297 if (ret) {
305 smpboot_destroy_threads(plug_thread); 298 smpboot_destroy_threads(plug_thread);
306 free_cpumask_var(plug_thread->cpumask);
307 goto out; 299 goto out;
308 } 300 }
309 if (cpumask_test_cpu(cpu, cpumask)) 301 smpboot_unpark_thread(plug_thread, cpu);
310 smpboot_unpark_thread(plug_thread, cpu);
311 } 302 }
312 list_add(&plug_thread->list, &hotplug_threads); 303 list_add(&plug_thread->list, &hotplug_threads);
313out: 304out:
@@ -315,7 +306,7 @@ out:
315 put_online_cpus(); 306 put_online_cpus();
316 return ret; 307 return ret;
317} 308}
318EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread_cpumask); 309EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
319 310
320/** 311/**
321 * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug 312 * smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
@@ -331,44 +322,9 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
331 smpboot_destroy_threads(plug_thread); 322 smpboot_destroy_threads(plug_thread);
332 mutex_unlock(&smpboot_threads_lock); 323 mutex_unlock(&smpboot_threads_lock);
333 put_online_cpus(); 324 put_online_cpus();
334 free_cpumask_var(plug_thread->cpumask);
335} 325}
336EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread); 326EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
337 327
338/**
339 * smpboot_update_cpumask_percpu_thread - Adjust which per_cpu hotplug threads stay parked
340 * @plug_thread: Hotplug thread descriptor
341 * @new: Revised mask to use
342 *
343 * The cpumask field in the smp_hotplug_thread must not be updated directly
344 * by the client, but only by calling this function.
345 * This function can only be called on a registered smp_hotplug_thread.
346 */
347void smpboot_update_cpumask_percpu_thread(struct smp_hotplug_thread *plug_thread,
348 const struct cpumask *new)
349{
350 struct cpumask *old = plug_thread->cpumask;
351 static struct cpumask tmp;
352 unsigned int cpu;
353
354 lockdep_assert_cpus_held();
355 mutex_lock(&smpboot_threads_lock);
356
357 /* Park threads that were exclusively enabled on the old mask. */
358 cpumask_andnot(&tmp, old, new);
359 for_each_cpu_and(cpu, &tmp, cpu_online_mask)
360 smpboot_park_thread(plug_thread, cpu);
361
362 /* Unpark threads that are exclusively enabled on the new mask. */
363 cpumask_andnot(&tmp, new, old);
364 for_each_cpu_and(cpu, &tmp, cpu_online_mask)
365 smpboot_unpark_thread(plug_thread, cpu);
366
367 cpumask_copy(old, new);
368
369 mutex_unlock(&smpboot_threads_lock);
370}
371
372static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD); 328static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
373 329
374/* 330/*
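
With the cpumask variant gone, registering a per-CPU hotplug thread now looks like the sketch below. The struct fields match the watchdog_threads initializer removed from kernel/watchdog.c later in this diff; the callbacks themselves are hypothetical.

#include <linux/smpboot.h>
#include <linux/percpu.h>
#include <linux/sched.h>
#include <linux/init.h>

static DEFINE_PER_CPU(struct task_struct *, demo_task);

static int demo_should_run(unsigned int cpu)
{
        return 0;       /* hypothetical: no pending work in this sketch */
}

static void demo_fn(unsigned int cpu)
{
        /* per-CPU work would run here, with the thread parked across hotplug */
}

static struct smp_hotplug_thread demo_threads = {
        .store             = &demo_task,
        .thread_should_run = demo_should_run,
        .thread_fn         = demo_fn,
        .thread_comm       = "demo/%u",
};

static int __init demo_init(void)
{
        /*
         * No cpumask argument any more: the threads are created and
         * unparked on all online CPUs.
         */
        return smpboot_register_percpu_thread(&demo_threads);
}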
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 69eb76daed34..067cb83f37ea 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -238,13 +238,24 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
238 struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2); 238 struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
239 DEFINE_WAKE_Q(wakeq); 239 DEFINE_WAKE_Q(wakeq);
240 int err; 240 int err;
241
241retry: 242retry:
243 /*
244 * The waking up of stopper threads has to happen in the same
245 * scheduling context as the queueing. Otherwise, there is a
246 * possibility of one of the above stoppers being woken up by another
247 * CPU, and preempting us. This will cause us to not wake up the other
248 * stopper forever.
249 */
250 preempt_disable();
242 raw_spin_lock_irq(&stopper1->lock); 251 raw_spin_lock_irq(&stopper1->lock);
243 raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING); 252 raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
244 253
245 err = -ENOENT; 254 if (!stopper1->enabled || !stopper2->enabled) {
246 if (!stopper1->enabled || !stopper2->enabled) 255 err = -ENOENT;
247 goto unlock; 256 goto unlock;
257 }
258
248 /* 259 /*
249 * Ensure that if we race with __stop_cpus() the stoppers won't get 260 * Ensure that if we race with __stop_cpus() the stoppers won't get
250 * queued up in reverse order leading to system deadlock. 261 * queued up in reverse order leading to system deadlock.
@@ -255,36 +266,30 @@ retry:
255 * It can be falsely true but it is safe to spin until it is cleared, 266 * It can be falsely true but it is safe to spin until it is cleared,
256 * queue_stop_cpus_work() does everything under preempt_disable(). 267 * queue_stop_cpus_work() does everything under preempt_disable().
257 */ 268 */
258 err = -EDEADLK; 269 if (unlikely(stop_cpus_in_progress)) {
259 if (unlikely(stop_cpus_in_progress)) 270 err = -EDEADLK;
260 goto unlock; 271 goto unlock;
272 }
261 273
262 err = 0; 274 err = 0;
263 __cpu_stop_queue_work(stopper1, work1, &wakeq); 275 __cpu_stop_queue_work(stopper1, work1, &wakeq);
264 __cpu_stop_queue_work(stopper2, work2, &wakeq); 276 __cpu_stop_queue_work(stopper2, work2, &wakeq);
265 /* 277
266 * The waking up of stopper threads has to happen
267 * in the same scheduling context as the queueing.
268 * Otherwise, there is a possibility of one of the
269 * above stoppers being woken up by another CPU,
270 * and preempting us. This will cause us to n ot
271 * wake up the other stopper forever.
272 */
273 preempt_disable();
274unlock: 278unlock:
275 raw_spin_unlock(&stopper2->lock); 279 raw_spin_unlock(&stopper2->lock);
276 raw_spin_unlock_irq(&stopper1->lock); 280 raw_spin_unlock_irq(&stopper1->lock);
277 281
278 if (unlikely(err == -EDEADLK)) { 282 if (unlikely(err == -EDEADLK)) {
283 preempt_enable();
284
279 while (stop_cpus_in_progress) 285 while (stop_cpus_in_progress)
280 cpu_relax(); 286 cpu_relax();
287
281 goto retry; 288 goto retry;
282 } 289 }
283 290
284 if (!err) { 291 wake_up_q(&wakeq);
285 wake_up_q(&wakeq); 292 preempt_enable();
286 preempt_enable();
287 }
288 293
289 return err; 294 return err;
290} 295}
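
The constraint the moved preempt_disable() enforces, shown in isolation: record both wakeups and issue them within one non-preemptible region, so the first woken stopper cannot preempt us before the second has been woken. The following is a hedged sketch of that wake_q pattern with a hypothetical lock and tasks; the real code above collects the wakeups via __cpu_stop_queue_work().

#include <linux/sched.h>
#include <linux/sched/wake_q.h>
#include <linux/spinlock.h>

/* Hypothetical pair-wakeup helper illustrating the pattern above. */
static void queue_and_wake_pair(raw_spinlock_t *lock,
                                struct task_struct *t1,
                                struct task_struct *t2)
{
        DEFINE_WAKE_Q(wakeq);

        preempt_disable();              /* stay put until both wakeups are issued */
        raw_spin_lock_irq(lock);

        wake_q_add(&wakeq, t1);         /* record the wakeups under the lock ... */
        wake_q_add(&wakeq, t2);

        raw_spin_unlock_irq(lock);

        wake_up_q(&wakeq);              /* ... and issue them after dropping it */
        preempt_enable();
}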
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 2d9837c0aff4..f22f76b7a138 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -368,14 +368,6 @@ static struct ctl_table kern_table[] = {
368 .mode = 0644, 368 .mode = 0644,
369 .proc_handler = proc_dointvec, 369 .proc_handler = proc_dointvec,
370 }, 370 },
371 {
372 .procname = "sched_time_avg_ms",
373 .data = &sysctl_sched_time_avg,
374 .maxlen = sizeof(unsigned int),
375 .mode = 0644,
376 .proc_handler = proc_dointvec_minmax,
377 .extra1 = &one,
378 },
379#ifdef CONFIG_SCHEDSTATS 371#ifdef CONFIG_SCHEDSTATS
380 { 372 {
381 .procname = "sched_schedstats", 373 .procname = "sched_schedstats",
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 576d18045811..5470dce212c0 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -18,18 +18,14 @@
18#include <linux/init.h> 18#include <linux/init.h>
19#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/sysctl.h> 20#include <linux/sysctl.h>
21#include <linux/smpboot.h>
22#include <linux/sched/rt.h>
23#include <uapi/linux/sched/types.h>
24#include <linux/tick.h> 21#include <linux/tick.h>
25#include <linux/workqueue.h>
26#include <linux/sched/clock.h> 22#include <linux/sched/clock.h>
27#include <linux/sched/debug.h> 23#include <linux/sched/debug.h>
28#include <linux/sched/isolation.h> 24#include <linux/sched/isolation.h>
25#include <linux/stop_machine.h>
29 26
30#include <asm/irq_regs.h> 27#include <asm/irq_regs.h>
31#include <linux/kvm_para.h> 28#include <linux/kvm_para.h>
32#include <linux/kthread.h>
33 29
34static DEFINE_MUTEX(watchdog_mutex); 30static DEFINE_MUTEX(watchdog_mutex);
35 31
@@ -169,11 +165,10 @@ static void lockup_detector_update_enable(void)
169unsigned int __read_mostly softlockup_panic = 165unsigned int __read_mostly softlockup_panic =
170 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; 166 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
171 167
172static bool softlockup_threads_initialized __read_mostly; 168static bool softlockup_initialized __read_mostly;
173static u64 __read_mostly sample_period; 169static u64 __read_mostly sample_period;
174 170
175static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); 171static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
176static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
177static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer); 172static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
178static DEFINE_PER_CPU(bool, softlockup_touch_sync); 173static DEFINE_PER_CPU(bool, softlockup_touch_sync);
179static DEFINE_PER_CPU(bool, soft_watchdog_warn); 174static DEFINE_PER_CPU(bool, soft_watchdog_warn);
@@ -335,6 +330,27 @@ static void watchdog_interrupt_count(void)
335 __this_cpu_inc(hrtimer_interrupts); 330 __this_cpu_inc(hrtimer_interrupts);
336} 331}
337 332
333static DEFINE_PER_CPU(struct completion, softlockup_completion);
334static DEFINE_PER_CPU(struct cpu_stop_work, softlockup_stop_work);
335
336/*
337 * The watchdog thread function - touches the timestamp.
338 *
339 * It only runs once every sample_period seconds (4 seconds by
340 * default) to reset the softlockup timestamp. If this gets delayed
341 * for more than 2*watchdog_thresh seconds then the debug-printout
342 * triggers in watchdog_timer_fn().
343 */
344static int softlockup_fn(void *data)
345{
346 __this_cpu_write(soft_lockup_hrtimer_cnt,
347 __this_cpu_read(hrtimer_interrupts));
348 __touch_watchdog();
349 complete(this_cpu_ptr(&softlockup_completion));
350
351 return 0;
352}
353
338/* watchdog kicker functions */ 354/* watchdog kicker functions */
339static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer) 355static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
340{ 356{
@@ -350,7 +366,12 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
350 watchdog_interrupt_count(); 366 watchdog_interrupt_count();
351 367
352 /* kick the softlockup detector */ 368 /* kick the softlockup detector */
353 wake_up_process(__this_cpu_read(softlockup_watchdog)); 369 if (completion_done(this_cpu_ptr(&softlockup_completion))) {
370 reinit_completion(this_cpu_ptr(&softlockup_completion));
371 stop_one_cpu_nowait(smp_processor_id(),
372 softlockup_fn, NULL,
373 this_cpu_ptr(&softlockup_stop_work));
374 }
354 375
355 /* .. and repeat */ 376 /* .. and repeat */
356 hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period)); 377 hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
@@ -448,16 +469,15 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
448 return HRTIMER_RESTART; 469 return HRTIMER_RESTART;
449} 470}
450 471
451static void watchdog_set_prio(unsigned int policy, unsigned int prio)
452{
453 struct sched_param param = { .sched_priority = prio };
454
455 sched_setscheduler(current, policy, &param);
456}
457
458static void watchdog_enable(unsigned int cpu) 472static void watchdog_enable(unsigned int cpu)
459{ 473{
460 struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); 474 struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
475 struct completion *done = this_cpu_ptr(&softlockup_completion);
476
477 WARN_ON_ONCE(cpu != smp_processor_id());
478
479 init_completion(done);
480 complete(done);
461 481
462 /* 482 /*
463 * Start the timer first to prevent the NMI watchdog triggering 483 * Start the timer first to prevent the NMI watchdog triggering
@@ -473,15 +493,14 @@ static void watchdog_enable(unsigned int cpu)
473 /* Enable the perf event */ 493 /* Enable the perf event */
474 if (watchdog_enabled & NMI_WATCHDOG_ENABLED) 494 if (watchdog_enabled & NMI_WATCHDOG_ENABLED)
475 watchdog_nmi_enable(cpu); 495 watchdog_nmi_enable(cpu);
476
477 watchdog_set_prio(SCHED_FIFO, MAX_RT_PRIO - 1);
478} 496}
479 497
480static void watchdog_disable(unsigned int cpu) 498static void watchdog_disable(unsigned int cpu)
481{ 499{
482 struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer); 500 struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
483 501
484 watchdog_set_prio(SCHED_NORMAL, 0); 502 WARN_ON_ONCE(cpu != smp_processor_id());
503
485 /* 504 /*
486 * Disable the perf event first. That prevents that a large delay 505 * Disable the perf event first. That prevents that a large delay
487 * between disabling the timer and disabling the perf event causes 506 * between disabling the timer and disabling the perf event causes
@@ -489,79 +508,66 @@ static void watchdog_disable(unsigned int cpu)
489 */ 508 */
490 watchdog_nmi_disable(cpu); 509 watchdog_nmi_disable(cpu);
491 hrtimer_cancel(hrtimer); 510 hrtimer_cancel(hrtimer);
511 wait_for_completion(this_cpu_ptr(&softlockup_completion));
492} 512}
493 513
494static void watchdog_cleanup(unsigned int cpu, bool online) 514static int softlockup_stop_fn(void *data)
495{ 515{
496 watchdog_disable(cpu); 516 watchdog_disable(smp_processor_id());
517 return 0;
497} 518}
498 519
499static int watchdog_should_run(unsigned int cpu) 520static void softlockup_stop_all(void)
500{ 521{
501 return __this_cpu_read(hrtimer_interrupts) != 522 int cpu;
502 __this_cpu_read(soft_lockup_hrtimer_cnt); 523
524 if (!softlockup_initialized)
525 return;
526
527 for_each_cpu(cpu, &watchdog_allowed_mask)
528 smp_call_on_cpu(cpu, softlockup_stop_fn, NULL, false);
529
530 cpumask_clear(&watchdog_allowed_mask);
503} 531}
504 532
505/* 533static int softlockup_start_fn(void *data)
506 * The watchdog thread function - touches the timestamp.
507 *
508 * It only runs once every sample_period seconds (4 seconds by
509 * default) to reset the softlockup timestamp. If this gets delayed
510 * for more than 2*watchdog_thresh seconds then the debug-printout
511 * triggers in watchdog_timer_fn().
512 */
513static void watchdog(unsigned int cpu)
514{ 534{
515 __this_cpu_write(soft_lockup_hrtimer_cnt, 535 watchdog_enable(smp_processor_id());
516 __this_cpu_read(hrtimer_interrupts)); 536 return 0;
517 __touch_watchdog();
518} 537}
519 538
520static struct smp_hotplug_thread watchdog_threads = { 539static void softlockup_start_all(void)
521 .store = &softlockup_watchdog,
522 .thread_should_run = watchdog_should_run,
523 .thread_fn = watchdog,
524 .thread_comm = "watchdog/%u",
525 .setup = watchdog_enable,
526 .cleanup = watchdog_cleanup,
527 .park = watchdog_disable,
528 .unpark = watchdog_enable,
529};
530
531static void softlockup_update_smpboot_threads(void)
532{ 540{
533 lockdep_assert_held(&watchdog_mutex); 541 int cpu;
534
535 if (!softlockup_threads_initialized)
536 return;
537 542
538 smpboot_update_cpumask_percpu_thread(&watchdog_threads, 543 cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask);
539 &watchdog_allowed_mask); 544 for_each_cpu(cpu, &watchdog_allowed_mask)
545 smp_call_on_cpu(cpu, softlockup_start_fn, NULL, false);
540} 546}
541 547
542/* Temporarily park all watchdog threads */ 548int lockup_detector_online_cpu(unsigned int cpu)
543static void softlockup_park_all_threads(void)
544{ 549{
545 cpumask_clear(&watchdog_allowed_mask); 550 watchdog_enable(cpu);
546 softlockup_update_smpboot_threads(); 551 return 0;
547} 552}
548 553
549/* Unpark enabled threads */ 554int lockup_detector_offline_cpu(unsigned int cpu)
550static void softlockup_unpark_threads(void)
551{ 555{
552 cpumask_copy(&watchdog_allowed_mask, &watchdog_cpumask); 556 watchdog_disable(cpu);
553 softlockup_update_smpboot_threads(); 557 return 0;
554} 558}
555 559
556static void lockup_detector_reconfigure(void) 560static void lockup_detector_reconfigure(void)
557{ 561{
558 cpus_read_lock(); 562 cpus_read_lock();
559 watchdog_nmi_stop(); 563 watchdog_nmi_stop();
560 softlockup_park_all_threads(); 564
565 softlockup_stop_all();
561 set_sample_period(); 566 set_sample_period();
562 lockup_detector_update_enable(); 567 lockup_detector_update_enable();
563 if (watchdog_enabled && watchdog_thresh) 568 if (watchdog_enabled && watchdog_thresh)
564 softlockup_unpark_threads(); 569 softlockup_start_all();
570
565 watchdog_nmi_start(); 571 watchdog_nmi_start();
566 cpus_read_unlock(); 572 cpus_read_unlock();
567 /* 573 /*
@@ -580,8 +586,6 @@ static void lockup_detector_reconfigure(void)
580 */ 586 */
581static __init void lockup_detector_setup(void) 587static __init void lockup_detector_setup(void)
582{ 588{
583 int ret;
584
585 /* 589 /*
586 * If sysctl is off and watchdog got disabled on the command line, 590 * If sysctl is off and watchdog got disabled on the command line,
587 * nothing to do here. 591 * nothing to do here.
@@ -592,24 +596,13 @@ static __init void lockup_detector_setup(void)
592 !(watchdog_enabled && watchdog_thresh)) 596 !(watchdog_enabled && watchdog_thresh))
593 return; 597 return;
594 598
595 ret = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
596 &watchdog_allowed_mask);
597 if (ret) {
598 pr_err("Failed to initialize soft lockup detector threads\n");
599 return;
600 }
601
602 mutex_lock(&watchdog_mutex); 599 mutex_lock(&watchdog_mutex);
603 softlockup_threads_initialized = true;
604 lockup_detector_reconfigure(); 600 lockup_detector_reconfigure();
601 softlockup_initialized = true;
605 mutex_unlock(&watchdog_mutex); 602 mutex_unlock(&watchdog_mutex);
606} 603}
607 604
608#else /* CONFIG_SOFTLOCKUP_DETECTOR */ 605#else /* CONFIG_SOFTLOCKUP_DETECTOR */
609static inline int watchdog_park_threads(void) { return 0; }
610static inline void watchdog_unpark_threads(void) { }
611static inline int watchdog_enable_all_cpus(void) { return 0; }
612static inline void watchdog_disable_all_cpus(void) { }
613static void lockup_detector_reconfigure(void) 606static void lockup_detector_reconfigure(void)
614{ 607{
615 cpus_read_lock(); 608 cpus_read_lock();
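
The thread-less softlockup machinery above boils down to a small completion handshake between the hrtimer, the stopper callback and the disable path. Below is a hedged sketch with hypothetical names; the real code keeps one completion and one cpu_stop_work per CPU.

#include <linux/completion.h>
#include <linux/stop_machine.h>
#include <linux/smp.h>

static struct completion work_done;     /* per CPU in the real code */
static struct cpu_stop_work stop_work;

static int demo_fn(void *data)
{
        /* ... reset the softlockup timestamp ... */
        complete(&work_done);           /* signal the disable path */
        return 0;
}

static void demo_enable(void)
{
        init_completion(&work_done);
        complete(&work_done);           /* start out in the "done" state */
}

/* Timer path: only queue new work if the previous run has finished. */
static void demo_kick(void)
{
        if (completion_done(&work_done)) {
                reinit_completion(&work_done);
                stop_one_cpu_nowait(smp_processor_id(), demo_fn, NULL,
                                    &stop_work);
        }
}

/* Disable path: wait until any in-flight work has completed. */
static void demo_disable(void)
{
        wait_for_completion(&work_done);
}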
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
index e449a23e9d59..1f7020d65d0a 100644
--- a/kernel/watchdog_hld.c
+++ b/kernel/watchdog_hld.c
@@ -175,8 +175,8 @@ static int hardlockup_detector_event_create(void)
175 evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL, 175 evt = perf_event_create_kernel_counter(wd_attr, cpu, NULL,
176 watchdog_overflow_callback, NULL); 176 watchdog_overflow_callback, NULL);
177 if (IS_ERR(evt)) { 177 if (IS_ERR(evt)) {
178 pr_info("Perf event create on CPU %d failed with %ld\n", cpu, 178 pr_debug("Perf event create on CPU %d failed with %ld\n", cpu,
179 PTR_ERR(evt)); 179 PTR_ERR(evt));
180 return PTR_ERR(evt); 180 return PTR_ERR(evt);
181 } 181 }
182 this_cpu_write(watchdog_ev, evt); 182 this_cpu_write(watchdog_ev, evt);
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 04e554cae3a2..108250e4d376 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -604,7 +604,7 @@ void kvm_arm_resume_guest(struct kvm *kvm)
604 604
605 kvm_for_each_vcpu(i, vcpu, kvm) { 605 kvm_for_each_vcpu(i, vcpu, kvm) {
606 vcpu->arch.pause = false; 606 vcpu->arch.pause = false;
607 swake_up(kvm_arch_vcpu_wq(vcpu)); 607 swake_up_one(kvm_arch_vcpu_wq(vcpu));
608 } 608 }
609} 609}
610 610
@@ -612,7 +612,7 @@ static void vcpu_req_sleep(struct kvm_vcpu *vcpu)
612{ 612{
613 struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu); 613 struct swait_queue_head *wq = kvm_arch_vcpu_wq(vcpu);
614 614
615 swait_event_interruptible(*wq, ((!vcpu->arch.power_off) && 615 swait_event_interruptible_exclusive(*wq, ((!vcpu->arch.power_off) &&
616 (!vcpu->arch.pause))); 616 (!vcpu->arch.pause)));
617 617
618 if (vcpu->arch.power_off || vcpu->arch.pause) { 618 if (vcpu->arch.power_off || vcpu->arch.pause) {
diff --git a/virt/kvm/arm/psci.c b/virt/kvm/arm/psci.c
index c95ab4c5a475..9b73d3ad918a 100644
--- a/virt/kvm/arm/psci.c
+++ b/virt/kvm/arm/psci.c
@@ -155,7 +155,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
155 smp_mb(); /* Make sure the above is visible */ 155 smp_mb(); /* Make sure the above is visible */
156 156
157 wq = kvm_arch_vcpu_wq(vcpu); 157 wq = kvm_arch_vcpu_wq(vcpu);
158 swake_up(wq); 158 swake_up_one(wq);
159 159
160 return PSCI_RET_SUCCESS; 160 return PSCI_RET_SUCCESS;
161} 161}
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 57bcb27dcf30..23c2519c5b32 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -107,7 +107,7 @@ static void async_pf_execute(struct work_struct *work)
107 trace_kvm_async_pf_completed(addr, gva); 107 trace_kvm_async_pf_completed(addr, gva);
108 108
109 if (swq_has_sleeper(&vcpu->wq)) 109 if (swq_has_sleeper(&vcpu->wq))
110 swake_up(&vcpu->wq); 110 swake_up_one(&vcpu->wq);
111 111
112 mmput(mm); 112 mmput(mm);
113 kvm_put_kvm(vcpu->kvm); 113 kvm_put_kvm(vcpu->kvm);
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 8b47507faab5..3d233ebfbee9 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2172,7 +2172,7 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu)
2172 kvm_arch_vcpu_blocking(vcpu); 2172 kvm_arch_vcpu_blocking(vcpu);
2173 2173
2174 for (;;) { 2174 for (;;) {
2175 prepare_to_swait(&vcpu->wq, &wait, TASK_INTERRUPTIBLE); 2175 prepare_to_swait_exclusive(&vcpu->wq, &wait, TASK_INTERRUPTIBLE);
2176 2176
2177 if (kvm_vcpu_check_block(vcpu) < 0) 2177 if (kvm_vcpu_check_block(vcpu) < 0)
2178 break; 2178 break;
@@ -2214,7 +2214,7 @@ bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
2214 2214
2215 wqp = kvm_arch_vcpu_wq(vcpu); 2215 wqp = kvm_arch_vcpu_wq(vcpu);
2216 if (swq_has_sleeper(wqp)) { 2216 if (swq_has_sleeper(wqp)) {
2217 swake_up(wqp); 2217 swake_up_one(wqp);
2218 ++vcpu->stat.halt_wakeup; 2218 ++vcpu->stat.halt_wakeup;
2219 return true; 2219 return true;
2220 } 2220 }
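
For completeness, the open-coded wait loop that kvm_vcpu_block() builds around prepare_to_swait_exclusive() follows the usual prepare/check/schedule/finish shape. A hedged sketch with a hypothetical queue and predicate:

#include <linux/swait.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>

/* Hypothetical wait loop mirroring the kvm_vcpu_block() hunk above. */
static void wait_for_event(struct swait_queue_head *wq, bool (*done)(void))
{
        DECLARE_SWAITQUEUE(wait);

        for (;;) {
                prepare_to_swait_exclusive(wq, &wait, TASK_INTERRUPTIBLE);

                if (done() || signal_pending(current))
                        break;

                schedule();
        }

        finish_swait(wq, &wait);
}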