aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bounds.c4
-rw-r--r--kernel/context_tracking.c2
-rw-r--r--kernel/cpu.c17
-rw-r--r--kernel/cpu/idle.c16
-rw-r--r--kernel/fork.c5
-rw-r--r--kernel/rcutree.c15
-rw-r--r--kernel/sched/core.c290
-rw-r--r--kernel/sched/debug.c68
-rw-r--r--kernel/sched/fair.c1359
-rw-r--r--kernel/sched/features.h19
-rw-r--r--kernel/sched/idle_task.c2
-rw-r--r--kernel/sched/rt.c22
-rw-r--r--kernel/sched/sched.h52
-rw-r--r--kernel/sched/stats.h46
-rw-r--r--kernel/sched/stop_task.c2
-rw-r--r--kernel/softirq.c16
-rw-r--r--kernel/stop_machine.c288
-rw-r--r--kernel/sysctl.c21
-rw-r--r--kernel/timer.c8
-rw-r--r--kernel/wait.c24
20 files changed, 1860 insertions, 416 deletions
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 0c9b862292b2..e8ca97b5c386 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -10,6 +10,7 @@
10#include <linux/mmzone.h> 10#include <linux/mmzone.h>
11#include <linux/kbuild.h> 11#include <linux/kbuild.h>
12#include <linux/page_cgroup.h> 12#include <linux/page_cgroup.h>
13#include <linux/log2.h>
13 14
14void foo(void) 15void foo(void)
15{ 16{
@@ -17,5 +18,8 @@ void foo(void)
17 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS); 18 DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
18 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES); 19 DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
19 DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS); 20 DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
21#ifdef CONFIG_SMP
22 DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
23#endif
20 /* End of constants */ 24 /* End of constants */
21} 25}
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 859c8dfd78a1..e5f3917aa05b 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -120,7 +120,7 @@ void context_tracking_user_enter(void)
120 * instead of preempt_schedule() to exit user context if needed before 120 * instead of preempt_schedule() to exit user context if needed before
121 * calling the scheduler. 121 * calling the scheduler.
122 */ 122 */
123void __sched notrace preempt_schedule_context(void) 123asmlinkage void __sched notrace preempt_schedule_context(void)
124{ 124{
125 enum ctx_state prev_ctx; 125 enum ctx_state prev_ctx;
126 126
diff --git a/kernel/cpu.c b/kernel/cpu.c
index d7f07a2da5a6..63aa50d7ce1e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -308,6 +308,23 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
308 } 308 }
309 smpboot_park_threads(cpu); 309 smpboot_park_threads(cpu);
310 310
311 /*
312 * By now we've cleared cpu_active_mask, wait for all preempt-disabled
313 * and RCU users of this state to go away such that all new such users
314 * will observe it.
315 *
316 * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
317 * not imply sync_sched(), so explicitly call both.
318 */
319#ifdef CONFIG_PREEMPT
320 synchronize_sched();
321#endif
322 synchronize_rcu();
323
324 /*
325 * So now all preempt/rcu users must observe !cpu_active().
326 */
327
311 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); 328 err = __stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
312 if (err) { 329 if (err) {
313 /* CPU didn't die: tell everyone. Can't complain. */ 330 /* CPU didn't die: tell everyone. Can't complain. */
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index e695c0a0bcb5..988573a9a387 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -44,7 +44,7 @@ static inline int cpu_idle_poll(void)
44 rcu_idle_enter(); 44 rcu_idle_enter();
45 trace_cpu_idle_rcuidle(0, smp_processor_id()); 45 trace_cpu_idle_rcuidle(0, smp_processor_id());
46 local_irq_enable(); 46 local_irq_enable();
47 while (!need_resched()) 47 while (!tif_need_resched())
48 cpu_relax(); 48 cpu_relax();
49 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); 49 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
50 rcu_idle_exit(); 50 rcu_idle_exit();
@@ -92,8 +92,7 @@ static void cpu_idle_loop(void)
92 if (cpu_idle_force_poll || tick_check_broadcast_expired()) { 92 if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
93 cpu_idle_poll(); 93 cpu_idle_poll();
94 } else { 94 } else {
95 current_clr_polling(); 95 if (!current_clr_polling_and_test()) {
96 if (!need_resched()) {
97 stop_critical_timings(); 96 stop_critical_timings();
98 rcu_idle_enter(); 97 rcu_idle_enter();
99 arch_cpu_idle(); 98 arch_cpu_idle();
@@ -103,9 +102,16 @@ static void cpu_idle_loop(void)
103 } else { 102 } else {
104 local_irq_enable(); 103 local_irq_enable();
105 } 104 }
106 current_set_polling(); 105 __current_set_polling();
107 } 106 }
108 arch_cpu_idle_exit(); 107 arch_cpu_idle_exit();
108 /*
109 * We need to test and propagate the TIF_NEED_RESCHED
110 * bit here because we might not have send the
111 * reschedule IPI to idle tasks.
112 */
113 if (tif_need_resched())
114 set_preempt_need_resched();
109 } 115 }
110 tick_nohz_idle_exit(); 116 tick_nohz_idle_exit();
111 schedule_preempt_disabled(); 117 schedule_preempt_disabled();
@@ -129,7 +135,7 @@ void cpu_startup_entry(enum cpuhp_state state)
129 */ 135 */
130 boot_init_stack_canary(); 136 boot_init_stack_canary();
131#endif 137#endif
132 current_set_polling(); 138 __current_set_polling();
133 arch_cpu_idle_prepare(); 139 arch_cpu_idle_prepare();
134 cpu_idle_loop(); 140 cpu_idle_loop();
135} 141}
diff --git a/kernel/fork.c b/kernel/fork.c
index 086fe73ad6bd..c93be06dee87 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -817,9 +817,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
817#ifdef CONFIG_TRANSPARENT_HUGEPAGE 817#ifdef CONFIG_TRANSPARENT_HUGEPAGE
818 mm->pmd_huge_pte = NULL; 818 mm->pmd_huge_pte = NULL;
819#endif 819#endif
820#ifdef CONFIG_NUMA_BALANCING
821 mm->first_nid = NUMA_PTE_SCAN_INIT;
822#endif
823 if (!mm_init(mm, tsk)) 820 if (!mm_init(mm, tsk))
824 goto fail_nomem; 821 goto fail_nomem;
825 822
@@ -1313,7 +1310,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1313#endif 1310#endif
1314 1311
1315 /* Perform scheduler related setup. Assign this task to a CPU. */ 1312 /* Perform scheduler related setup. Assign this task to a CPU. */
1316 sched_fork(p); 1313 sched_fork(clone_flags, p);
1317 1314
1318 retval = perf_event_init_task(p); 1315 retval = perf_event_init_task(p);
1319 if (retval) 1316 if (retval)
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 32618b3fe4e6..1dc9f3604ad8 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -898,6 +898,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
898 force_quiescent_state(rsp); /* Kick them all. */ 898 force_quiescent_state(rsp); /* Kick them all. */
899} 899}
900 900
901/*
902 * This function really isn't for public consumption, but RCU is special in
903 * that context switches can allow the state machine to make progress.
904 */
905extern void resched_cpu(int cpu);
906
901static void print_cpu_stall(struct rcu_state *rsp) 907static void print_cpu_stall(struct rcu_state *rsp)
902{ 908{
903 int cpu; 909 int cpu;
@@ -927,7 +933,14 @@ static void print_cpu_stall(struct rcu_state *rsp)
927 3 * rcu_jiffies_till_stall_check() + 3; 933 3 * rcu_jiffies_till_stall_check() + 3;
928 raw_spin_unlock_irqrestore(&rnp->lock, flags); 934 raw_spin_unlock_irqrestore(&rnp->lock, flags);
929 935
930 set_need_resched(); /* kick ourselves to get things going. */ 936 /*
937 * Attempt to revive the RCU machinery by forcing a context switch.
938 *
939 * A context switch would normally allow the RCU state machine to make
940 * progress and it could be we're stuck in kernel space without context
941 * switches for an entirely unreasonable amount of time.
942 */
943 resched_cpu(smp_processor_id());
931} 944}
932 945
933static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) 946static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5ac63c9a995a..450a34b2a637 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -513,12 +513,11 @@ static inline void init_hrtick(void)
513 * might also involve a cross-CPU call to trigger the scheduler on 513 * might also involve a cross-CPU call to trigger the scheduler on
514 * the target CPU. 514 * the target CPU.
515 */ 515 */
516#ifdef CONFIG_SMP
517void resched_task(struct task_struct *p) 516void resched_task(struct task_struct *p)
518{ 517{
519 int cpu; 518 int cpu;
520 519
521 assert_raw_spin_locked(&task_rq(p)->lock); 520 lockdep_assert_held(&task_rq(p)->lock);
522 521
523 if (test_tsk_need_resched(p)) 522 if (test_tsk_need_resched(p))
524 return; 523 return;
@@ -526,8 +525,10 @@ void resched_task(struct task_struct *p)
526 set_tsk_need_resched(p); 525 set_tsk_need_resched(p);
527 526
528 cpu = task_cpu(p); 527 cpu = task_cpu(p);
529 if (cpu == smp_processor_id()) 528 if (cpu == smp_processor_id()) {
529 set_preempt_need_resched();
530 return; 530 return;
531 }
531 532
532 /* NEED_RESCHED must be visible before we test polling */ 533 /* NEED_RESCHED must be visible before we test polling */
533 smp_mb(); 534 smp_mb();
@@ -546,6 +547,7 @@ void resched_cpu(int cpu)
546 raw_spin_unlock_irqrestore(&rq->lock, flags); 547 raw_spin_unlock_irqrestore(&rq->lock, flags);
547} 548}
548 549
550#ifdef CONFIG_SMP
549#ifdef CONFIG_NO_HZ_COMMON 551#ifdef CONFIG_NO_HZ_COMMON
550/* 552/*
551 * In the semi idle case, use the nearest busy cpu for migrating timers 553 * In the semi idle case, use the nearest busy cpu for migrating timers
@@ -693,12 +695,6 @@ void sched_avg_update(struct rq *rq)
693 } 695 }
694} 696}
695 697
696#else /* !CONFIG_SMP */
697void resched_task(struct task_struct *p)
698{
699 assert_raw_spin_locked(&task_rq(p)->lock);
700 set_tsk_need_resched(p);
701}
702#endif /* CONFIG_SMP */ 698#endif /* CONFIG_SMP */
703 699
704#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ 700#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
@@ -767,14 +763,14 @@ static void set_load_weight(struct task_struct *p)
767static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) 763static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
768{ 764{
769 update_rq_clock(rq); 765 update_rq_clock(rq);
770 sched_info_queued(p); 766 sched_info_queued(rq, p);
771 p->sched_class->enqueue_task(rq, p, flags); 767 p->sched_class->enqueue_task(rq, p, flags);
772} 768}
773 769
774static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) 770static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
775{ 771{
776 update_rq_clock(rq); 772 update_rq_clock(rq);
777 sched_info_dequeued(p); 773 sched_info_dequeued(rq, p);
778 p->sched_class->dequeue_task(rq, p, flags); 774 p->sched_class->dequeue_task(rq, p, flags);
779} 775}
780 776
@@ -987,7 +983,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
987 * ttwu() will sort out the placement. 983 * ttwu() will sort out the placement.
988 */ 984 */
989 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && 985 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
990 !(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)); 986 !(task_preempt_count(p) & PREEMPT_ACTIVE));
991 987
992#ifdef CONFIG_LOCKDEP 988#ifdef CONFIG_LOCKDEP
993 /* 989 /*
@@ -1017,6 +1013,107 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1017 __set_task_cpu(p, new_cpu); 1013 __set_task_cpu(p, new_cpu);
1018} 1014}
1019 1015
1016static void __migrate_swap_task(struct task_struct *p, int cpu)
1017{
1018 if (p->on_rq) {
1019 struct rq *src_rq, *dst_rq;
1020
1021 src_rq = task_rq(p);
1022 dst_rq = cpu_rq(cpu);
1023
1024 deactivate_task(src_rq, p, 0);
1025 set_task_cpu(p, cpu);
1026 activate_task(dst_rq, p, 0);
1027 check_preempt_curr(dst_rq, p, 0);
1028 } else {
1029 /*
1030 * Task isn't running anymore; make it appear like we migrated
1031 * it before it went to sleep. This means on wakeup we make the
1032 * previous cpu our targer instead of where it really is.
1033 */
1034 p->wake_cpu = cpu;
1035 }
1036}
1037
1038struct migration_swap_arg {
1039 struct task_struct *src_task, *dst_task;
1040 int src_cpu, dst_cpu;
1041};
1042
1043static int migrate_swap_stop(void *data)
1044{
1045 struct migration_swap_arg *arg = data;
1046 struct rq *src_rq, *dst_rq;
1047 int ret = -EAGAIN;
1048
1049 src_rq = cpu_rq(arg->src_cpu);
1050 dst_rq = cpu_rq(arg->dst_cpu);
1051
1052 double_raw_lock(&arg->src_task->pi_lock,
1053 &arg->dst_task->pi_lock);
1054 double_rq_lock(src_rq, dst_rq);
1055 if (task_cpu(arg->dst_task) != arg->dst_cpu)
1056 goto unlock;
1057
1058 if (task_cpu(arg->src_task) != arg->src_cpu)
1059 goto unlock;
1060
1061 if (!cpumask_test_cpu(arg->dst_cpu, tsk_cpus_allowed(arg->src_task)))
1062 goto unlock;
1063
1064 if (!cpumask_test_cpu(arg->src_cpu, tsk_cpus_allowed(arg->dst_task)))
1065 goto unlock;
1066
1067 __migrate_swap_task(arg->src_task, arg->dst_cpu);
1068 __migrate_swap_task(arg->dst_task, arg->src_cpu);
1069
1070 ret = 0;
1071
1072unlock:
1073 double_rq_unlock(src_rq, dst_rq);
1074 raw_spin_unlock(&arg->dst_task->pi_lock);
1075 raw_spin_unlock(&arg->src_task->pi_lock);
1076
1077 return ret;
1078}
1079
1080/*
1081 * Cross migrate two tasks
1082 */
1083int migrate_swap(struct task_struct *cur, struct task_struct *p)
1084{
1085 struct migration_swap_arg arg;
1086 int ret = -EINVAL;
1087
1088 arg = (struct migration_swap_arg){
1089 .src_task = cur,
1090 .src_cpu = task_cpu(cur),
1091 .dst_task = p,
1092 .dst_cpu = task_cpu(p),
1093 };
1094
1095 if (arg.src_cpu == arg.dst_cpu)
1096 goto out;
1097
1098 /*
1099 * These three tests are all lockless; this is OK since all of them
1100 * will be re-checked with proper locks held further down the line.
1101 */
1102 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
1103 goto out;
1104
1105 if (!cpumask_test_cpu(arg.dst_cpu, tsk_cpus_allowed(arg.src_task)))
1106 goto out;
1107
1108 if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
1109 goto out;
1110
1111 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
1112
1113out:
1114 return ret;
1115}
1116
1020struct migration_arg { 1117struct migration_arg {
1021 struct task_struct *task; 1118 struct task_struct *task;
1022 int dest_cpu; 1119 int dest_cpu;
@@ -1236,9 +1333,9 @@ out:
1236 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable. 1333 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
1237 */ 1334 */
1238static inline 1335static inline
1239int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) 1336int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
1240{ 1337{
1241 int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags); 1338 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
1242 1339
1243 /* 1340 /*
1244 * In order not to call set_task_cpu() on a blocking task we need 1341 * In order not to call set_task_cpu() on a blocking task we need
@@ -1330,12 +1427,13 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
1330 1427
1331 if (rq->idle_stamp) { 1428 if (rq->idle_stamp) {
1332 u64 delta = rq_clock(rq) - rq->idle_stamp; 1429 u64 delta = rq_clock(rq) - rq->idle_stamp;
1333 u64 max = 2*sysctl_sched_migration_cost; 1430 u64 max = 2*rq->max_idle_balance_cost;
1334 1431
1335 if (delta > max) 1432 update_avg(&rq->avg_idle, delta);
1433
1434 if (rq->avg_idle > max)
1336 rq->avg_idle = max; 1435 rq->avg_idle = max;
1337 else 1436
1338 update_avg(&rq->avg_idle, delta);
1339 rq->idle_stamp = 0; 1437 rq->idle_stamp = 0;
1340 } 1438 }
1341#endif 1439#endif
@@ -1396,6 +1494,14 @@ static void sched_ttwu_pending(void)
1396 1494
1397void scheduler_ipi(void) 1495void scheduler_ipi(void)
1398{ 1496{
1497 /*
1498 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
1499 * TIF_NEED_RESCHED remotely (for the first time) will also send
1500 * this IPI.
1501 */
1502 if (tif_need_resched())
1503 set_preempt_need_resched();
1504
1399 if (llist_empty(&this_rq()->wake_list) 1505 if (llist_empty(&this_rq()->wake_list)
1400 && !tick_nohz_full_cpu(smp_processor_id()) 1506 && !tick_nohz_full_cpu(smp_processor_id())
1401 && !got_nohz_idle_kick()) 1507 && !got_nohz_idle_kick())
@@ -1513,7 +1619,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
1513 if (p->sched_class->task_waking) 1619 if (p->sched_class->task_waking)
1514 p->sched_class->task_waking(p); 1620 p->sched_class->task_waking(p);
1515 1621
1516 cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags); 1622 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
1517 if (task_cpu(p) != cpu) { 1623 if (task_cpu(p) != cpu) {
1518 wake_flags |= WF_MIGRATED; 1624 wake_flags |= WF_MIGRATED;
1519 set_task_cpu(p, cpu); 1625 set_task_cpu(p, cpu);
@@ -1595,7 +1701,7 @@ int wake_up_state(struct task_struct *p, unsigned int state)
1595 * 1701 *
1596 * __sched_fork() is basic setup used by init_idle() too: 1702 * __sched_fork() is basic setup used by init_idle() too:
1597 */ 1703 */
1598static void __sched_fork(struct task_struct *p) 1704static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1599{ 1705{
1600 p->on_rq = 0; 1706 p->on_rq = 0;
1601 1707
@@ -1619,16 +1725,24 @@ static void __sched_fork(struct task_struct *p)
1619 1725
1620#ifdef CONFIG_NUMA_BALANCING 1726#ifdef CONFIG_NUMA_BALANCING
1621 if (p->mm && atomic_read(&p->mm->mm_users) == 1) { 1727 if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
1622 p->mm->numa_next_scan = jiffies; 1728 p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
1623 p->mm->numa_next_reset = jiffies;
1624 p->mm->numa_scan_seq = 0; 1729 p->mm->numa_scan_seq = 0;
1625 } 1730 }
1626 1731
1732 if (clone_flags & CLONE_VM)
1733 p->numa_preferred_nid = current->numa_preferred_nid;
1734 else
1735 p->numa_preferred_nid = -1;
1736
1627 p->node_stamp = 0ULL; 1737 p->node_stamp = 0ULL;
1628 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; 1738 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1629 p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
1630 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 1739 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1631 p->numa_work.next = &p->numa_work; 1740 p->numa_work.next = &p->numa_work;
1741 p->numa_faults = NULL;
1742 p->numa_faults_buffer = NULL;
1743
1744 INIT_LIST_HEAD(&p->numa_entry);
1745 p->numa_group = NULL;
1632#endif /* CONFIG_NUMA_BALANCING */ 1746#endif /* CONFIG_NUMA_BALANCING */
1633} 1747}
1634 1748
@@ -1654,12 +1768,12 @@ void set_numabalancing_state(bool enabled)
1654/* 1768/*
1655 * fork()/clone()-time setup: 1769 * fork()/clone()-time setup:
1656 */ 1770 */
1657void sched_fork(struct task_struct *p) 1771void sched_fork(unsigned long clone_flags, struct task_struct *p)
1658{ 1772{
1659 unsigned long flags; 1773 unsigned long flags;
1660 int cpu = get_cpu(); 1774 int cpu = get_cpu();
1661 1775
1662 __sched_fork(p); 1776 __sched_fork(clone_flags, p);
1663 /* 1777 /*
1664 * We mark the process as running here. This guarantees that 1778 * We mark the process as running here. This guarantees that
1665 * nobody will actually run it, and a signal or other external 1779 * nobody will actually run it, and a signal or other external
@@ -1717,10 +1831,7 @@ void sched_fork(struct task_struct *p)
1717#if defined(CONFIG_SMP) 1831#if defined(CONFIG_SMP)
1718 p->on_cpu = 0; 1832 p->on_cpu = 0;
1719#endif 1833#endif
1720#ifdef CONFIG_PREEMPT_COUNT 1834 init_task_preempt_count(p);
1721 /* Want to start with kernel preemption disabled. */
1722 task_thread_info(p)->preempt_count = 1;
1723#endif
1724#ifdef CONFIG_SMP 1835#ifdef CONFIG_SMP
1725 plist_node_init(&p->pushable_tasks, MAX_PRIO); 1836 plist_node_init(&p->pushable_tasks, MAX_PRIO);
1726#endif 1837#endif
@@ -1747,7 +1858,7 @@ void wake_up_new_task(struct task_struct *p)
1747 * - cpus_allowed can change in the fork path 1858 * - cpus_allowed can change in the fork path
1748 * - any previously selected cpu might disappear through hotplug 1859 * - any previously selected cpu might disappear through hotplug
1749 */ 1860 */
1750 set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0)); 1861 set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
1751#endif 1862#endif
1752 1863
1753 /* Initialize new task's runnable average */ 1864 /* Initialize new task's runnable average */
@@ -1838,7 +1949,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
1838 struct task_struct *next) 1949 struct task_struct *next)
1839{ 1950{
1840 trace_sched_switch(prev, next); 1951 trace_sched_switch(prev, next);
1841 sched_info_switch(prev, next); 1952 sched_info_switch(rq, prev, next);
1842 perf_event_task_sched_out(prev, next); 1953 perf_event_task_sched_out(prev, next);
1843 fire_sched_out_preempt_notifiers(prev, next); 1954 fire_sched_out_preempt_notifiers(prev, next);
1844 prepare_lock_switch(rq, next); 1955 prepare_lock_switch(rq, next);
@@ -1890,6 +2001,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1890 if (mm) 2001 if (mm)
1891 mmdrop(mm); 2002 mmdrop(mm);
1892 if (unlikely(prev_state == TASK_DEAD)) { 2003 if (unlikely(prev_state == TASK_DEAD)) {
2004 task_numa_free(prev);
2005
1893 /* 2006 /*
1894 * Remove function-return probe instances associated with this 2007 * Remove function-return probe instances associated with this
1895 * task and put them back on the free list. 2008 * task and put them back on the free list.
@@ -2073,7 +2186,7 @@ void sched_exec(void)
2073 int dest_cpu; 2186 int dest_cpu;
2074 2187
2075 raw_spin_lock_irqsave(&p->pi_lock, flags); 2188 raw_spin_lock_irqsave(&p->pi_lock, flags);
2076 dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0); 2189 dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
2077 if (dest_cpu == smp_processor_id()) 2190 if (dest_cpu == smp_processor_id())
2078 goto unlock; 2191 goto unlock;
2079 2192
@@ -2215,7 +2328,7 @@ notrace unsigned long get_parent_ip(unsigned long addr)
2215#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 2328#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
2216 defined(CONFIG_PREEMPT_TRACER)) 2329 defined(CONFIG_PREEMPT_TRACER))
2217 2330
2218void __kprobes add_preempt_count(int val) 2331void __kprobes preempt_count_add(int val)
2219{ 2332{
2220#ifdef CONFIG_DEBUG_PREEMPT 2333#ifdef CONFIG_DEBUG_PREEMPT
2221 /* 2334 /*
@@ -2224,7 +2337,7 @@ void __kprobes add_preempt_count(int val)
2224 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 2337 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
2225 return; 2338 return;
2226#endif 2339#endif
2227 preempt_count() += val; 2340 __preempt_count_add(val);
2228#ifdef CONFIG_DEBUG_PREEMPT 2341#ifdef CONFIG_DEBUG_PREEMPT
2229 /* 2342 /*
2230 * Spinlock count overflowing soon? 2343 * Spinlock count overflowing soon?
@@ -2235,9 +2348,9 @@ void __kprobes add_preempt_count(int val)
2235 if (preempt_count() == val) 2348 if (preempt_count() == val)
2236 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2349 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2237} 2350}
2238EXPORT_SYMBOL(add_preempt_count); 2351EXPORT_SYMBOL(preempt_count_add);
2239 2352
2240void __kprobes sub_preempt_count(int val) 2353void __kprobes preempt_count_sub(int val)
2241{ 2354{
2242#ifdef CONFIG_DEBUG_PREEMPT 2355#ifdef CONFIG_DEBUG_PREEMPT
2243 /* 2356 /*
@@ -2255,9 +2368,9 @@ void __kprobes sub_preempt_count(int val)
2255 2368
2256 if (preempt_count() == val) 2369 if (preempt_count() == val)
2257 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); 2370 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
2258 preempt_count() -= val; 2371 __preempt_count_sub(val);
2259} 2372}
2260EXPORT_SYMBOL(sub_preempt_count); 2373EXPORT_SYMBOL(preempt_count_sub);
2261 2374
2262#endif 2375#endif
2263 2376
@@ -2430,6 +2543,7 @@ need_resched:
2430 put_prev_task(rq, prev); 2543 put_prev_task(rq, prev);
2431 next = pick_next_task(rq); 2544 next = pick_next_task(rq);
2432 clear_tsk_need_resched(prev); 2545 clear_tsk_need_resched(prev);
2546 clear_preempt_need_resched();
2433 rq->skip_clock_update = 0; 2547 rq->skip_clock_update = 0;
2434 2548
2435 if (likely(prev != next)) { 2549 if (likely(prev != next)) {
@@ -2520,9 +2634,9 @@ asmlinkage void __sched notrace preempt_schedule(void)
2520 return; 2634 return;
2521 2635
2522 do { 2636 do {
2523 add_preempt_count_notrace(PREEMPT_ACTIVE); 2637 __preempt_count_add(PREEMPT_ACTIVE);
2524 __schedule(); 2638 __schedule();
2525 sub_preempt_count_notrace(PREEMPT_ACTIVE); 2639 __preempt_count_sub(PREEMPT_ACTIVE);
2526 2640
2527 /* 2641 /*
2528 * Check again in case we missed a preemption opportunity 2642 * Check again in case we missed a preemption opportunity
@@ -2541,20 +2655,19 @@ EXPORT_SYMBOL(preempt_schedule);
2541 */ 2655 */
2542asmlinkage void __sched preempt_schedule_irq(void) 2656asmlinkage void __sched preempt_schedule_irq(void)
2543{ 2657{
2544 struct thread_info *ti = current_thread_info();
2545 enum ctx_state prev_state; 2658 enum ctx_state prev_state;
2546 2659
2547 /* Catch callers which need to be fixed */ 2660 /* Catch callers which need to be fixed */
2548 BUG_ON(ti->preempt_count || !irqs_disabled()); 2661 BUG_ON(preempt_count() || !irqs_disabled());
2549 2662
2550 prev_state = exception_enter(); 2663 prev_state = exception_enter();
2551 2664
2552 do { 2665 do {
2553 add_preempt_count(PREEMPT_ACTIVE); 2666 __preempt_count_add(PREEMPT_ACTIVE);
2554 local_irq_enable(); 2667 local_irq_enable();
2555 __schedule(); 2668 __schedule();
2556 local_irq_disable(); 2669 local_irq_disable();
2557 sub_preempt_count(PREEMPT_ACTIVE); 2670 __preempt_count_sub(PREEMPT_ACTIVE);
2558 2671
2559 /* 2672 /*
2560 * Check again in case we missed a preemption opportunity 2673 * Check again in case we missed a preemption opportunity
@@ -3598,13 +3711,11 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
3598 struct task_struct *p; 3711 struct task_struct *p;
3599 int retval; 3712 int retval;
3600 3713
3601 get_online_cpus();
3602 rcu_read_lock(); 3714 rcu_read_lock();
3603 3715
3604 p = find_process_by_pid(pid); 3716 p = find_process_by_pid(pid);
3605 if (!p) { 3717 if (!p) {
3606 rcu_read_unlock(); 3718 rcu_read_unlock();
3607 put_online_cpus();
3608 return -ESRCH; 3719 return -ESRCH;
3609 } 3720 }
3610 3721
@@ -3661,7 +3772,6 @@ out_free_cpus_allowed:
3661 free_cpumask_var(cpus_allowed); 3772 free_cpumask_var(cpus_allowed);
3662out_put_task: 3773out_put_task:
3663 put_task_struct(p); 3774 put_task_struct(p);
3664 put_online_cpus();
3665 return retval; 3775 return retval;
3666} 3776}
3667 3777
@@ -3706,7 +3816,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
3706 unsigned long flags; 3816 unsigned long flags;
3707 int retval; 3817 int retval;
3708 3818
3709 get_online_cpus();
3710 rcu_read_lock(); 3819 rcu_read_lock();
3711 3820
3712 retval = -ESRCH; 3821 retval = -ESRCH;
@@ -3719,12 +3828,11 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
3719 goto out_unlock; 3828 goto out_unlock;
3720 3829
3721 raw_spin_lock_irqsave(&p->pi_lock, flags); 3830 raw_spin_lock_irqsave(&p->pi_lock, flags);
3722 cpumask_and(mask, &p->cpus_allowed, cpu_online_mask); 3831 cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
3723 raw_spin_unlock_irqrestore(&p->pi_lock, flags); 3832 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3724 3833
3725out_unlock: 3834out_unlock:
3726 rcu_read_unlock(); 3835 rcu_read_unlock();
3727 put_online_cpus();
3728 3836
3729 return retval; 3837 return retval;
3730} 3838}
@@ -3794,16 +3902,11 @@ SYSCALL_DEFINE0(sched_yield)
3794 return 0; 3902 return 0;
3795} 3903}
3796 3904
3797static inline int should_resched(void)
3798{
3799 return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
3800}
3801
3802static void __cond_resched(void) 3905static void __cond_resched(void)
3803{ 3906{
3804 add_preempt_count(PREEMPT_ACTIVE); 3907 __preempt_count_add(PREEMPT_ACTIVE);
3805 __schedule(); 3908 __schedule();
3806 sub_preempt_count(PREEMPT_ACTIVE); 3909 __preempt_count_sub(PREEMPT_ACTIVE);
3807} 3910}
3808 3911
3809int __sched _cond_resched(void) 3912int __sched _cond_resched(void)
@@ -4186,7 +4289,7 @@ void init_idle(struct task_struct *idle, int cpu)
4186 4289
4187 raw_spin_lock_irqsave(&rq->lock, flags); 4290 raw_spin_lock_irqsave(&rq->lock, flags);
4188 4291
4189 __sched_fork(idle); 4292 __sched_fork(0, idle);
4190 idle->state = TASK_RUNNING; 4293 idle->state = TASK_RUNNING;
4191 idle->se.exec_start = sched_clock(); 4294 idle->se.exec_start = sched_clock();
4192 4295
@@ -4212,7 +4315,7 @@ void init_idle(struct task_struct *idle, int cpu)
4212 raw_spin_unlock_irqrestore(&rq->lock, flags); 4315 raw_spin_unlock_irqrestore(&rq->lock, flags);
4213 4316
4214 /* Set the preempt count _outside_ the spinlocks! */ 4317 /* Set the preempt count _outside_ the spinlocks! */
4215 task_thread_info(idle)->preempt_count = 0; 4318 init_idle_preempt_count(idle, cpu);
4216 4319
4217 /* 4320 /*
4218 * The idle tasks have their own, simple scheduling class: 4321 * The idle tasks have their own, simple scheduling class:
@@ -4346,6 +4449,53 @@ fail:
4346 return ret; 4449 return ret;
4347} 4450}
4348 4451
4452#ifdef CONFIG_NUMA_BALANCING
4453/* Migrate current task p to target_cpu */
4454int migrate_task_to(struct task_struct *p, int target_cpu)
4455{
4456 struct migration_arg arg = { p, target_cpu };
4457 int curr_cpu = task_cpu(p);
4458
4459 if (curr_cpu == target_cpu)
4460 return 0;
4461
4462 if (!cpumask_test_cpu(target_cpu, tsk_cpus_allowed(p)))
4463 return -EINVAL;
4464
4465 /* TODO: This is not properly updating schedstats */
4466
4467 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
4468}
4469
4470/*
4471 * Requeue a task on a given node and accurately track the number of NUMA
4472 * tasks on the runqueues
4473 */
4474void sched_setnuma(struct task_struct *p, int nid)
4475{
4476 struct rq *rq;
4477 unsigned long flags;
4478 bool on_rq, running;
4479
4480 rq = task_rq_lock(p, &flags);
4481 on_rq = p->on_rq;
4482 running = task_current(rq, p);
4483
4484 if (on_rq)
4485 dequeue_task(rq, p, 0);
4486 if (running)
4487 p->sched_class->put_prev_task(rq, p);
4488
4489 p->numa_preferred_nid = nid;
4490
4491 if (running)
4492 p->sched_class->set_curr_task(rq);
4493 if (on_rq)
4494 enqueue_task(rq, p, 0);
4495 task_rq_unlock(rq, p, &flags);
4496}
4497#endif
4498
4349/* 4499/*
4350 * migration_cpu_stop - this will be executed by a highprio stopper thread 4500 * migration_cpu_stop - this will be executed by a highprio stopper thread
4351 * and performs thread migration by bumping thread off CPU then 4501 * and performs thread migration by bumping thread off CPU then
@@ -5119,6 +5269,7 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
5119DEFINE_PER_CPU(struct sched_domain *, sd_llc); 5269DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5120DEFINE_PER_CPU(int, sd_llc_size); 5270DEFINE_PER_CPU(int, sd_llc_size);
5121DEFINE_PER_CPU(int, sd_llc_id); 5271DEFINE_PER_CPU(int, sd_llc_id);
5272DEFINE_PER_CPU(struct sched_domain *, sd_numa);
5122 5273
5123static void update_top_cache_domain(int cpu) 5274static void update_top_cache_domain(int cpu)
5124{ 5275{
@@ -5135,6 +5286,9 @@ static void update_top_cache_domain(int cpu)
5135 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); 5286 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5136 per_cpu(sd_llc_size, cpu) = size; 5287 per_cpu(sd_llc_size, cpu) = size;
5137 per_cpu(sd_llc_id, cpu) = id; 5288 per_cpu(sd_llc_id, cpu) = id;
5289
5290 sd = lowest_flag_domain(cpu, SD_NUMA);
5291 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
5138} 5292}
5139 5293
5140/* 5294/*
@@ -5654,6 +5808,7 @@ sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
5654 | 0*SD_SHARE_PKG_RESOURCES 5808 | 0*SD_SHARE_PKG_RESOURCES
5655 | 1*SD_SERIALIZE 5809 | 1*SD_SERIALIZE
5656 | 0*SD_PREFER_SIBLING 5810 | 0*SD_PREFER_SIBLING
5811 | 1*SD_NUMA
5657 | sd_local_flags(level) 5812 | sd_local_flags(level)
5658 , 5813 ,
5659 .last_balance = jiffies, 5814 .last_balance = jiffies,
@@ -6335,14 +6490,17 @@ void __init sched_init_smp(void)
6335 6490
6336 sched_init_numa(); 6491 sched_init_numa();
6337 6492
6338 get_online_cpus(); 6493 /*
6494 * There's no userspace yet to cause hotplug operations; hence all the
6495 * cpu masks are stable and all blatant races in the below code cannot
6496 * happen.
6497 */
6339 mutex_lock(&sched_domains_mutex); 6498 mutex_lock(&sched_domains_mutex);
6340 init_sched_domains(cpu_active_mask); 6499 init_sched_domains(cpu_active_mask);
6341 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map); 6500 cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
6342 if (cpumask_empty(non_isolated_cpus)) 6501 if (cpumask_empty(non_isolated_cpus))
6343 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus); 6502 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
6344 mutex_unlock(&sched_domains_mutex); 6503 mutex_unlock(&sched_domains_mutex);
6345 put_online_cpus();
6346 6504
6347 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE); 6505 hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
6348 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE); 6506 hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
@@ -6505,6 +6663,7 @@ void __init sched_init(void)
6505 rq->online = 0; 6663 rq->online = 0;
6506 rq->idle_stamp = 0; 6664 rq->idle_stamp = 0;
6507 rq->avg_idle = 2*sysctl_sched_migration_cost; 6665 rq->avg_idle = 2*sysctl_sched_migration_cost;
6666 rq->max_idle_balance_cost = sysctl_sched_migration_cost;
6508 6667
6509 INIT_LIST_HEAD(&rq->cfs_tasks); 6668 INIT_LIST_HEAD(&rq->cfs_tasks);
6510 6669
@@ -7277,7 +7436,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7277 7436
7278 runtime_enabled = quota != RUNTIME_INF; 7437 runtime_enabled = quota != RUNTIME_INF;
7279 runtime_was_enabled = cfs_b->quota != RUNTIME_INF; 7438 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
7280 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled); 7439 /*
7440 * If we need to toggle cfs_bandwidth_used, off->on must occur
7441 * before making related changes, and on->off must occur afterwards
7442 */
7443 if (runtime_enabled && !runtime_was_enabled)
7444 cfs_bandwidth_usage_inc();
7281 raw_spin_lock_irq(&cfs_b->lock); 7445 raw_spin_lock_irq(&cfs_b->lock);
7282 cfs_b->period = ns_to_ktime(period); 7446 cfs_b->period = ns_to_ktime(period);
7283 cfs_b->quota = quota; 7447 cfs_b->quota = quota;
@@ -7303,6 +7467,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
7303 unthrottle_cfs_rq(cfs_rq); 7467 unthrottle_cfs_rq(cfs_rq);
7304 raw_spin_unlock_irq(&rq->lock); 7468 raw_spin_unlock_irq(&rq->lock);
7305 } 7469 }
7470 if (runtime_was_enabled && !runtime_enabled)
7471 cfs_bandwidth_usage_dec();
7306out_unlock: 7472out_unlock:
7307 mutex_unlock(&cfs_constraints_mutex); 7473 mutex_unlock(&cfs_constraints_mutex);
7308 7474
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 196559994f7c..5c34d1817e8f 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -15,6 +15,7 @@
15#include <linux/seq_file.h> 15#include <linux/seq_file.h>
16#include <linux/kallsyms.h> 16#include <linux/kallsyms.h>
17#include <linux/utsname.h> 17#include <linux/utsname.h>
18#include <linux/mempolicy.h>
18 19
19#include "sched.h" 20#include "sched.h"
20 21
@@ -137,6 +138,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
137 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld", 138 SEQ_printf(m, "%15Ld %15Ld %15Ld.%06ld %15Ld.%06ld %15Ld.%06ld",
138 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 139 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
139#endif 140#endif
141#ifdef CONFIG_NUMA_BALANCING
142 SEQ_printf(m, " %d", cpu_to_node(task_cpu(p)));
143#endif
140#ifdef CONFIG_CGROUP_SCHED 144#ifdef CONFIG_CGROUP_SCHED
141 SEQ_printf(m, " %s", task_group_path(task_group(p))); 145 SEQ_printf(m, " %s", task_group_path(task_group(p)));
142#endif 146#endif
@@ -159,7 +163,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
159 read_lock_irqsave(&tasklist_lock, flags); 163 read_lock_irqsave(&tasklist_lock, flags);
160 164
161 do_each_thread(g, p) { 165 do_each_thread(g, p) {
162 if (!p->on_rq || task_cpu(p) != rq_cpu) 166 if (task_cpu(p) != rq_cpu)
163 continue; 167 continue;
164 168
165 print_task(m, rq, p); 169 print_task(m, rq, p);
@@ -225,6 +229,14 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
225 atomic_read(&cfs_rq->tg->runnable_avg)); 229 atomic_read(&cfs_rq->tg->runnable_avg));
226#endif 230#endif
227#endif 231#endif
232#ifdef CONFIG_CFS_BANDWIDTH
233 SEQ_printf(m, " .%-30s: %d\n", "tg->cfs_bandwidth.timer_active",
234 cfs_rq->tg->cfs_bandwidth.timer_active);
235 SEQ_printf(m, " .%-30s: %d\n", "throttled",
236 cfs_rq->throttled);
237 SEQ_printf(m, " .%-30s: %d\n", "throttle_count",
238 cfs_rq->throttle_count);
239#endif
228 240
229#ifdef CONFIG_FAIR_GROUP_SCHED 241#ifdef CONFIG_FAIR_GROUP_SCHED
230 print_cfs_group_stats(m, cpu, cfs_rq->tg); 242 print_cfs_group_stats(m, cpu, cfs_rq->tg);
@@ -345,7 +357,7 @@ static void sched_debug_header(struct seq_file *m)
345 cpu_clk = local_clock(); 357 cpu_clk = local_clock();
346 local_irq_restore(flags); 358 local_irq_restore(flags);
347 359
348 SEQ_printf(m, "Sched Debug Version: v0.10, %s %.*s\n", 360 SEQ_printf(m, "Sched Debug Version: v0.11, %s %.*s\n",
349 init_utsname()->release, 361 init_utsname()->release,
350 (int)strcspn(init_utsname()->version, " "), 362 (int)strcspn(init_utsname()->version, " "),
351 init_utsname()->version); 363 init_utsname()->version);
@@ -488,6 +500,56 @@ static int __init init_sched_debug_procfs(void)
488 500
489__initcall(init_sched_debug_procfs); 501__initcall(init_sched_debug_procfs);
490 502
503#define __P(F) \
504 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
505#define P(F) \
506 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
507#define __PN(F) \
508 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
509#define PN(F) \
510 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
511
512
513static void sched_show_numa(struct task_struct *p, struct seq_file *m)
514{
515#ifdef CONFIG_NUMA_BALANCING
516 struct mempolicy *pol;
517 int node, i;
518
519 if (p->mm)
520 P(mm->numa_scan_seq);
521
522 task_lock(p);
523 pol = p->mempolicy;
524 if (pol && !(pol->flags & MPOL_F_MORON))
525 pol = NULL;
526 mpol_get(pol);
527 task_unlock(p);
528
529 SEQ_printf(m, "numa_migrations, %ld\n", xchg(&p->numa_pages_migrated, 0));
530
531 for_each_online_node(node) {
532 for (i = 0; i < 2; i++) {
533 unsigned long nr_faults = -1;
534 int cpu_current, home_node;
535
536 if (p->numa_faults)
537 nr_faults = p->numa_faults[2*node + i];
538
539 cpu_current = !i ? (task_node(p) == node) :
540 (pol && node_isset(node, pol->v.nodes));
541
542 home_node = (p->numa_preferred_nid == node);
543
544 SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n",
545 i, node, cpu_current, home_node, nr_faults);
546 }
547 }
548
549 mpol_put(pol);
550#endif
551}
552
491void proc_sched_show_task(struct task_struct *p, struct seq_file *m) 553void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
492{ 554{
493 unsigned long nr_switches; 555 unsigned long nr_switches;
@@ -591,6 +653,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
591 SEQ_printf(m, "%-45s:%21Ld\n", 653 SEQ_printf(m, "%-45s:%21Ld\n",
592 "clock-delta", (long long)(t1-t0)); 654 "clock-delta", (long long)(t1-t0));
593 } 655 }
656
657 sched_show_numa(p, m);
594} 658}
595 659
596void proc_sched_set_task(struct task_struct *p) 660void proc_sched_set_task(struct task_struct *p)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7c70201fbc61..41c02b6b090e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -681,6 +681,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
681} 681}
682 682
683#ifdef CONFIG_SMP 683#ifdef CONFIG_SMP
684static unsigned long task_h_load(struct task_struct *p);
685
684static inline void __update_task_entity_contrib(struct sched_entity *se); 686static inline void __update_task_entity_contrib(struct sched_entity *se);
685 687
686/* Give new task start runnable values to heavy its load in infant time */ 688/* Give new task start runnable values to heavy its load in infant time */
@@ -818,11 +820,12 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
818 820
819#ifdef CONFIG_NUMA_BALANCING 821#ifdef CONFIG_NUMA_BALANCING
820/* 822/*
821 * numa task sample period in ms 823 * Approximate time to scan a full NUMA task in ms. The task scan period is
824 * calculated based on the tasks virtual memory size and
825 * numa_balancing_scan_size.
822 */ 826 */
823unsigned int sysctl_numa_balancing_scan_period_min = 100; 827unsigned int sysctl_numa_balancing_scan_period_min = 1000;
824unsigned int sysctl_numa_balancing_scan_period_max = 100*50; 828unsigned int sysctl_numa_balancing_scan_period_max = 60000;
825unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
826 829
827/* Portion of address space to scan in MB */ 830/* Portion of address space to scan in MB */
828unsigned int sysctl_numa_balancing_scan_size = 256; 831unsigned int sysctl_numa_balancing_scan_size = 256;
@@ -830,41 +833,810 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
830/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ 833/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
831unsigned int sysctl_numa_balancing_scan_delay = 1000; 834unsigned int sysctl_numa_balancing_scan_delay = 1000;
832 835
833static void task_numa_placement(struct task_struct *p) 836/*
837 * After skipping a page migration on a shared page, skip N more numa page
838 * migrations unconditionally. This reduces the number of NUMA migrations
839 * in shared memory workloads, and has the effect of pulling tasks towards
840 * where their memory lives, over pulling the memory towards the task.
841 */
842unsigned int sysctl_numa_balancing_migrate_deferred = 16;
843
844static unsigned int task_nr_scan_windows(struct task_struct *p)
845{
846 unsigned long rss = 0;
847 unsigned long nr_scan_pages;
848
849 /*
850 * Calculations based on RSS as non-present and empty pages are skipped
851 * by the PTE scanner and NUMA hinting faults should be trapped based
852 * on resident pages
853 */
854 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
855 rss = get_mm_rss(p->mm);
856 if (!rss)
857 rss = nr_scan_pages;
858
859 rss = round_up(rss, nr_scan_pages);
860 return rss / nr_scan_pages;
861}
862
863/* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
864#define MAX_SCAN_WINDOW 2560
865
866static unsigned int task_scan_min(struct task_struct *p)
867{
868 unsigned int scan, floor;
869 unsigned int windows = 1;
870
871 if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
872 windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
873 floor = 1000 / windows;
874
875 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
876 return max_t(unsigned int, floor, scan);
877}
878
879static unsigned int task_scan_max(struct task_struct *p)
880{
881 unsigned int smin = task_scan_min(p);
882 unsigned int smax;
883
884 /* Watch for min being lower than max due to floor calculations */
885 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
886 return max(smin, smax);
887}
888
889/*
890 * Once a preferred node is selected the scheduler balancer will prefer moving
891 * a task to that node for sysctl_numa_balancing_settle_count number of PTE
892 * scans. This will give the process the chance to accumulate more faults on
893 * the preferred node but still allow the scheduler to move the task again if
894 * the nodes CPUs are overloaded.
895 */
896unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
897
898static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
899{
900 rq->nr_numa_running += (p->numa_preferred_nid != -1);
901 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
902}
903
904static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
905{
906 rq->nr_numa_running -= (p->numa_preferred_nid != -1);
907 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
908}
909
910struct numa_group {
911 atomic_t refcount;
912
913 spinlock_t lock; /* nr_tasks, tasks */
914 int nr_tasks;
915 pid_t gid;
916 struct list_head task_list;
917
918 struct rcu_head rcu;
919 unsigned long total_faults;
920 unsigned long faults[0];
921};
922
923pid_t task_numa_group_id(struct task_struct *p)
924{
925 return p->numa_group ? p->numa_group->gid : 0;
926}
927
928static inline int task_faults_idx(int nid, int priv)
929{
930 return 2 * nid + priv;
931}
932
933static inline unsigned long task_faults(struct task_struct *p, int nid)
934{
935 if (!p->numa_faults)
936 return 0;
937
938 return p->numa_faults[task_faults_idx(nid, 0)] +
939 p->numa_faults[task_faults_idx(nid, 1)];
940}
941
942static inline unsigned long group_faults(struct task_struct *p, int nid)
943{
944 if (!p->numa_group)
945 return 0;
946
947 return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1];
948}
949
950/*
951 * These return the fraction of accesses done by a particular task, or
952 * task group, on a particular numa node. The group weight is given a
953 * larger multiplier, in order to group tasks together that are almost
954 * evenly spread out between numa nodes.
955 */
956static inline unsigned long task_weight(struct task_struct *p, int nid)
957{
958 unsigned long total_faults;
959
960 if (!p->numa_faults)
961 return 0;
962
963 total_faults = p->total_numa_faults;
964
965 if (!total_faults)
966 return 0;
967
968 return 1000 * task_faults(p, nid) / total_faults;
969}
970
971static inline unsigned long group_weight(struct task_struct *p, int nid)
834{ 972{
835 int seq; 973 if (!p->numa_group || !p->numa_group->total_faults)
974 return 0;
975
976 return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
977}
978
979static unsigned long weighted_cpuload(const int cpu);
980static unsigned long source_load(int cpu, int type);
981static unsigned long target_load(int cpu, int type);
982static unsigned long power_of(int cpu);
983static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
984
985/* Cached statistics for all CPUs within a node */
986struct numa_stats {
987 unsigned long nr_running;
988 unsigned long load;
989
990 /* Total compute capacity of CPUs on a node */
991 unsigned long power;
992
993 /* Approximate capacity in terms of runnable tasks on a node */
994 unsigned long capacity;
995 int has_capacity;
996};
997
998/*
999 * XXX borrowed from update_sg_lb_stats
1000 */
1001static void update_numa_stats(struct numa_stats *ns, int nid)
1002{
1003 int cpu;
1004
1005 memset(ns, 0, sizeof(*ns));
1006 for_each_cpu(cpu, cpumask_of_node(nid)) {
1007 struct rq *rq = cpu_rq(cpu);
1008
1009 ns->nr_running += rq->nr_running;
1010 ns->load += weighted_cpuload(cpu);
1011 ns->power += power_of(cpu);
1012 }
1013
1014 ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
1015 ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
1016 ns->has_capacity = (ns->nr_running < ns->capacity);
1017}
1018
1019struct task_numa_env {
1020 struct task_struct *p;
1021
1022 int src_cpu, src_nid;
1023 int dst_cpu, dst_nid;
1024
1025 struct numa_stats src_stats, dst_stats;
1026
1027 int imbalance_pct, idx;
1028
1029 struct task_struct *best_task;
1030 long best_imp;
1031 int best_cpu;
1032};
1033
1034static void task_numa_assign(struct task_numa_env *env,
1035 struct task_struct *p, long imp)
1036{
1037 if (env->best_task)
1038 put_task_struct(env->best_task);
1039 if (p)
1040 get_task_struct(p);
1041
1042 env->best_task = p;
1043 env->best_imp = imp;
1044 env->best_cpu = env->dst_cpu;
1045}
1046
1047/*
1048 * This checks if the overall compute and NUMA accesses of the system would
1049 * be improved if the source tasks was migrated to the target dst_cpu taking
1050 * into account that it might be best if task running on the dst_cpu should
1051 * be exchanged with the source task
1052 */
1053static void task_numa_compare(struct task_numa_env *env,
1054 long taskimp, long groupimp)
1055{
1056 struct rq *src_rq = cpu_rq(env->src_cpu);
1057 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1058 struct task_struct *cur;
1059 long dst_load, src_load;
1060 long load;
1061 long imp = (groupimp > 0) ? groupimp : taskimp;
1062
1063 rcu_read_lock();
1064 cur = ACCESS_ONCE(dst_rq->curr);
1065 if (cur->pid == 0) /* idle */
1066 cur = NULL;
1067
1068 /*
1069 * "imp" is the fault differential for the source task between the
1070 * source and destination node. Calculate the total differential for
1071 * the source task and potential destination task. The more negative
1072 * the value is, the more rmeote accesses that would be expected to
1073 * be incurred if the tasks were swapped.
1074 */
1075 if (cur) {
1076 /* Skip this swap candidate if cannot move to the source cpu */
1077 if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
1078 goto unlock;
1079
1080 /*
1081 * If dst and source tasks are in the same NUMA group, or not
1082 * in any group then look only at task weights.
1083 */
1084 if (cur->numa_group == env->p->numa_group) {
1085 imp = taskimp + task_weight(cur, env->src_nid) -
1086 task_weight(cur, env->dst_nid);
1087 /*
1088 * Add some hysteresis to prevent swapping the
1089 * tasks within a group over tiny differences.
1090 */
1091 if (cur->numa_group)
1092 imp -= imp/16;
1093 } else {
1094 /*
1095 * Compare the group weights. If a task is all by
1096 * itself (not part of a group), use the task weight
1097 * instead.
1098 */
1099 if (env->p->numa_group)
1100 imp = groupimp;
1101 else
1102 imp = taskimp;
1103
1104 if (cur->numa_group)
1105 imp += group_weight(cur, env->src_nid) -
1106 group_weight(cur, env->dst_nid);
1107 else
1108 imp += task_weight(cur, env->src_nid) -
1109 task_weight(cur, env->dst_nid);
1110 }
1111 }
1112
1113 if (imp < env->best_imp)
1114 goto unlock;
1115
1116 if (!cur) {
1117 /* Is there capacity at our destination? */
1118 if (env->src_stats.has_capacity &&
1119 !env->dst_stats.has_capacity)
1120 goto unlock;
1121
1122 goto balance;
1123 }
1124
1125 /* Balance doesn't matter much if we're running a task per cpu */
1126 if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
1127 goto assign;
1128
1129 /*
1130 * In the overloaded case, try and keep the load balanced.
1131 */
1132balance:
1133 dst_load = env->dst_stats.load;
1134 src_load = env->src_stats.load;
1135
1136 /* XXX missing power terms */
1137 load = task_h_load(env->p);
1138 dst_load += load;
1139 src_load -= load;
1140
1141 if (cur) {
1142 load = task_h_load(cur);
1143 dst_load -= load;
1144 src_load += load;
1145 }
1146
1147 /* make src_load the smaller */
1148 if (dst_load < src_load)
1149 swap(dst_load, src_load);
1150
1151 if (src_load * env->imbalance_pct < dst_load * 100)
1152 goto unlock;
1153
1154assign:
1155 task_numa_assign(env, cur, imp);
1156unlock:
1157 rcu_read_unlock();
1158}
1159
1160static void task_numa_find_cpu(struct task_numa_env *env,
1161 long taskimp, long groupimp)
1162{
1163 int cpu;
1164
1165 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1166 /* Skip this CPU if the source task cannot migrate */
1167 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
1168 continue;
1169
1170 env->dst_cpu = cpu;
1171 task_numa_compare(env, taskimp, groupimp);
1172 }
1173}
1174
1175static int task_numa_migrate(struct task_struct *p)
1176{
1177 struct task_numa_env env = {
1178 .p = p,
1179
1180 .src_cpu = task_cpu(p),
1181 .src_nid = task_node(p),
1182
1183 .imbalance_pct = 112,
1184
1185 .best_task = NULL,
1186 .best_imp = 0,
1187 .best_cpu = -1
1188 };
1189 struct sched_domain *sd;
1190 unsigned long taskweight, groupweight;
1191 int nid, ret;
1192 long taskimp, groupimp;
1193
1194 /*
1195 * Pick the lowest SD_NUMA domain, as that would have the smallest
1196 * imbalance and would be the first to start moving tasks about.
1197 *
1198 * And we want to avoid any moving of tasks about, as that would create
1199 * random movement of tasks -- counter the numa conditions we're trying
1200 * to satisfy here.
1201 */
1202 rcu_read_lock();
1203 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1204 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1205 rcu_read_unlock();
1206
1207 taskweight = task_weight(p, env.src_nid);
1208 groupweight = group_weight(p, env.src_nid);
1209 update_numa_stats(&env.src_stats, env.src_nid);
1210 env.dst_nid = p->numa_preferred_nid;
1211 taskimp = task_weight(p, env.dst_nid) - taskweight;
1212 groupimp = group_weight(p, env.dst_nid) - groupweight;
1213 update_numa_stats(&env.dst_stats, env.dst_nid);
1214
1215 /* If the preferred nid has capacity, try to use it. */
1216 if (env.dst_stats.has_capacity)
1217 task_numa_find_cpu(&env, taskimp, groupimp);
1218
1219 /* No space available on the preferred nid. Look elsewhere. */
1220 if (env.best_cpu == -1) {
1221 for_each_online_node(nid) {
1222 if (nid == env.src_nid || nid == p->numa_preferred_nid)
1223 continue;
1224
1225 /* Only consider nodes where both task and groups benefit */
1226 taskimp = task_weight(p, nid) - taskweight;
1227 groupimp = group_weight(p, nid) - groupweight;
1228 if (taskimp < 0 && groupimp < 0)
1229 continue;
1230
1231 env.dst_nid = nid;
1232 update_numa_stats(&env.dst_stats, env.dst_nid);
1233 task_numa_find_cpu(&env, taskimp, groupimp);
1234 }
1235 }
1236
1237 /* No better CPU than the current one was found. */
1238 if (env.best_cpu == -1)
1239 return -EAGAIN;
1240
1241 sched_setnuma(p, env.dst_nid);
1242
1243 /*
1244 * Reset the scan period if the task is being rescheduled on an
1245 * alternative node to recheck if the tasks is now properly placed.
1246 */
1247 p->numa_scan_period = task_scan_min(p);
1248
1249 if (env.best_task == NULL) {
1250 int ret = migrate_task_to(p, env.best_cpu);
1251 return ret;
1252 }
1253
1254 ret = migrate_swap(p, env.best_task);
1255 put_task_struct(env.best_task);
1256 return ret;
1257}
1258
1259/* Attempt to migrate a task to a CPU on the preferred node. */
1260static void numa_migrate_preferred(struct task_struct *p)
1261{
1262 /* This task has no NUMA fault statistics yet */
1263 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1264 return;
1265
1266 /* Periodically retry migrating the task to the preferred node */
1267 p->numa_migrate_retry = jiffies + HZ;
1268
1269 /* Success if task is already running on preferred CPU */
1270 if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid)
1271 return;
1272
1273 /* Otherwise, try migrate to a CPU on the preferred node */
1274 task_numa_migrate(p);
1275}
1276
1277/*
1278 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1279 * increments. The more local the fault statistics are, the higher the scan
1280 * period will be for the next scan window. If local/remote ratio is below
1281 * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the
1282 * scan period will decrease
1283 */
1284#define NUMA_PERIOD_SLOTS 10
1285#define NUMA_PERIOD_THRESHOLD 3
1286
1287/*
1288 * Increase the scan period (slow down scanning) if the majority of
1289 * our memory is already on our local node, or if the majority of
1290 * the page accesses are shared with other processes.
1291 * Otherwise, decrease the scan period.
1292 */
1293static void update_task_scan_period(struct task_struct *p,
1294 unsigned long shared, unsigned long private)
1295{
1296 unsigned int period_slot;
1297 int ratio;
1298 int diff;
1299
1300 unsigned long remote = p->numa_faults_locality[0];
1301 unsigned long local = p->numa_faults_locality[1];
1302
1303 /*
1304 * If there were no record hinting faults then either the task is
1305 * completely idle or all activity is areas that are not of interest
1306 * to automatic numa balancing. Scan slower
1307 */
1308 if (local + shared == 0) {
1309 p->numa_scan_period = min(p->numa_scan_period_max,
1310 p->numa_scan_period << 1);
1311
1312 p->mm->numa_next_scan = jiffies +
1313 msecs_to_jiffies(p->numa_scan_period);
836 1314
837 if (!p->mm) /* for example, ksmd faulting in a user's mm */
838 return; 1315 return;
1316 }
1317
1318 /*
1319 * Prepare to scale scan period relative to the current period.
1320 * == NUMA_PERIOD_THRESHOLD scan period stays the same
1321 * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1322 * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
1323 */
1324 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1325 ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1326 if (ratio >= NUMA_PERIOD_THRESHOLD) {
1327 int slot = ratio - NUMA_PERIOD_THRESHOLD;
1328 if (!slot)
1329 slot = 1;
1330 diff = slot * period_slot;
1331 } else {
1332 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1333
1334 /*
1335 * Scale scan rate increases based on sharing. There is an
1336 * inverse relationship between the degree of sharing and
1337 * the adjustment made to the scanning period. Broadly
1338 * speaking the intent is that there is little point
1339 * scanning faster if shared accesses dominate as it may
1340 * simply bounce migrations uselessly
1341 */
1342 period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS);
1343 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
1344 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1345 }
1346
1347 p->numa_scan_period = clamp(p->numa_scan_period + diff,
1348 task_scan_min(p), task_scan_max(p));
1349 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1350}
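
The NUMA_PERIOD_SLOTS arithmetic above is easiest to see with concrete numbers. Below is a small user-space model of update_task_scan_period(); the sample fault counts and the 100/60000 ms clamp bounds are made-up stand-ins for task_scan_min()/task_scan_max(), not values from this patch:

#include <stdio.h>

#define NUMA_PERIOD_SLOTS	10
#define NUMA_PERIOD_THRESHOLD	3
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

static int clampi(int v, int lo, int hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
	int period = 1000;		/* current numa_scan_period, in ms */
	long local = 800, remote = 200;	/* hinting faults by locality */
	long private = 600, shared = 400;
	int period_slot, ratio, diff, slot;

	period_slot = DIV_ROUND_UP(period, NUMA_PERIOD_SLOTS);
	ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
	if (ratio >= NUMA_PERIOD_THRESHOLD) {
		/* mostly local: slow scanning down */
		slot = ratio - NUMA_PERIOD_THRESHOLD;
		if (!slot)
			slot = 1;
		diff = slot * period_slot;
	} else {
		/* mostly remote: speed up, scaled by the private share */
		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
		period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS);
		ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS,
				     private + shared);
		diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
	}
	printf("new period: %d ms\n", clampi(period + diff, 100, 60000));
	return 0;
}

With 80% local faults the ratio is 8, so the period grows by five slots (500 ms here) and scanning slows down, which is exactly the behaviour the comment block above describes.
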
1351
1352static void task_numa_placement(struct task_struct *p)
1353{
1354 int seq, nid, max_nid = -1, max_group_nid = -1;
1355 unsigned long max_faults = 0, max_group_faults = 0;
1356 unsigned long fault_types[2] = { 0, 0 };
1357 spinlock_t *group_lock = NULL;
1358
839 seq = ACCESS_ONCE(p->mm->numa_scan_seq); 1359 seq = ACCESS_ONCE(p->mm->numa_scan_seq);
840 if (p->numa_scan_seq == seq) 1360 if (p->numa_scan_seq == seq)
841 return; 1361 return;
842 p->numa_scan_seq = seq; 1362 p->numa_scan_seq = seq;
1363 p->numa_scan_period_max = task_scan_max(p);
1364
1365 /* If the task is part of a group prevent parallel updates to group stats */
1366 if (p->numa_group) {
1367 group_lock = &p->numa_group->lock;
1368 spin_lock(group_lock);
1369 }
1370
1371 /* Find the node with the highest number of faults */
1372 for_each_online_node(nid) {
1373 unsigned long faults = 0, group_faults = 0;
1374 int priv, i;
1375
1376 for (priv = 0; priv < 2; priv++) {
1377 long diff;
1378
1379 i = task_faults_idx(nid, priv);
1380 diff = -p->numa_faults[i];
1381
1382 /* Decay existing window, copy faults since last scan */
1383 p->numa_faults[i] >>= 1;
1384 p->numa_faults[i] += p->numa_faults_buffer[i];
1385 fault_types[priv] += p->numa_faults_buffer[i];
1386 p->numa_faults_buffer[i] = 0;
1387
1388 faults += p->numa_faults[i];
1389 diff += p->numa_faults[i];
1390 p->total_numa_faults += diff;
1391 if (p->numa_group) {
1392 /* safe because we can only change our own group */
1393 p->numa_group->faults[i] += diff;
1394 p->numa_group->total_faults += diff;
1395 group_faults += p->numa_group->faults[i];
1396 }
1397 }
1398
1399 if (faults > max_faults) {
1400 max_faults = faults;
1401 max_nid = nid;
1402 }
1403
1404 if (group_faults > max_group_faults) {
1405 max_group_faults = group_faults;
1406 max_group_nid = nid;
1407 }
1408 }
1409
1410 update_task_scan_period(p, fault_types[0], fault_types[1]);
1411
1412 if (p->numa_group) {
1413 /*
1414 * If the preferred task and group nids are different,
1415 * iterate over the nodes again to find the best place.
1416 */
1417 if (max_nid != max_group_nid) {
1418 unsigned long weight, max_weight = 0;
1419
1420 for_each_online_node(nid) {
1421 weight = task_weight(p, nid) + group_weight(p, nid);
1422 if (weight > max_weight) {
1423 max_weight = weight;
1424 max_nid = nid;
1425 }
1426 }
1427 }
1428
1429 spin_unlock(group_lock);
1430 }
843 1431
844 /* FIXME: Scheduling placement policy hints go here */ 1432 /* Preferred node as the node with the most faults */
1433 if (max_faults && max_nid != p->numa_preferred_nid) {
1434 /* Update the preferred nid and migrate task if possible */
1435 sched_setnuma(p, max_nid);
1436 numa_migrate_preferred(p);
1437 }
1438}
1439
1440static inline int get_numa_group(struct numa_group *grp)
1441{
1442 return atomic_inc_not_zero(&grp->refcount);
1443}
1444
1445static inline void put_numa_group(struct numa_group *grp)
1446{
1447 if (atomic_dec_and_test(&grp->refcount))
1448 kfree_rcu(grp, rcu);
1449}
1450
1451static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1452 int *priv)
1453{
1454 struct numa_group *grp, *my_grp;
1455 struct task_struct *tsk;
1456 bool join = false;
1457 int cpu = cpupid_to_cpu(cpupid);
1458 int i;
1459
1460 if (unlikely(!p->numa_group)) {
1461 unsigned int size = sizeof(struct numa_group) +
1462 2*nr_node_ids*sizeof(unsigned long);
1463
1464 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
1465 if (!grp)
1466 return;
1467
1468 atomic_set(&grp->refcount, 1);
1469 spin_lock_init(&grp->lock);
1470 INIT_LIST_HEAD(&grp->task_list);
1471 grp->gid = p->pid;
1472
1473 for (i = 0; i < 2*nr_node_ids; i++)
1474 grp->faults[i] = p->numa_faults[i];
1475
1476 grp->total_faults = p->total_numa_faults;
1477
1478 list_add(&p->numa_entry, &grp->task_list);
1479 grp->nr_tasks++;
1480 rcu_assign_pointer(p->numa_group, grp);
1481 }
1482
1483 rcu_read_lock();
1484 tsk = ACCESS_ONCE(cpu_rq(cpu)->curr);
1485
1486 if (!cpupid_match_pid(tsk, cpupid))
1487 goto no_join;
1488
1489 grp = rcu_dereference(tsk->numa_group);
1490 if (!grp)
1491 goto no_join;
1492
1493 my_grp = p->numa_group;
1494 if (grp == my_grp)
1495 goto no_join;
1496
1497 /*
1498 * Only join the other group if it's bigger; if we're the bigger group,
1499 * the other task will join us.
1500 */
1501 if (my_grp->nr_tasks > grp->nr_tasks)
1502 goto no_join;
1503
1504 /*
1505 * Tie-break on the grp address.
1506 */
1507 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
1508 goto no_join;
1509
1510 /* Always join threads in the same process. */
1511 if (tsk->mm == current->mm)
1512 join = true;
1513
1514 /* Simple filter to avoid false positives due to PID collisions */
1515 if (flags & TNF_SHARED)
1516 join = true;
1517
1518 /* Update priv based on whether false sharing was detected */
1519 *priv = !join;
1520
1521 if (join && !get_numa_group(grp))
1522 goto no_join;
1523
1524 rcu_read_unlock();
1525
1526 if (!join)
1527 return;
1528
1529 double_lock(&my_grp->lock, &grp->lock);
1530
1531 for (i = 0; i < 2*nr_node_ids; i++) {
1532 my_grp->faults[i] -= p->numa_faults[i];
1533 grp->faults[i] += p->numa_faults[i];
1534 }
1535 my_grp->total_faults -= p->total_numa_faults;
1536 grp->total_faults += p->total_numa_faults;
1537
1538 list_move(&p->numa_entry, &grp->task_list);
1539 my_grp->nr_tasks--;
1540 grp->nr_tasks++;
1541
1542 spin_unlock(&my_grp->lock);
1543 spin_unlock(&grp->lock);
1544
1545 rcu_assign_pointer(p->numa_group, grp);
1546
1547 put_numa_group(my_grp);
1548 return;
1549
1550no_join:
1551 rcu_read_unlock();
1552 return;
1553}
1554
1555void task_numa_free(struct task_struct *p)
1556{
1557 struct numa_group *grp = p->numa_group;
1558 int i;
1559 void *numa_faults = p->numa_faults;
1560
1561 if (grp) {
1562 spin_lock(&grp->lock);
1563 for (i = 0; i < 2*nr_node_ids; i++)
1564 grp->faults[i] -= p->numa_faults[i];
1565 grp->total_faults -= p->total_numa_faults;
1566
1567 list_del(&p->numa_entry);
1568 grp->nr_tasks--;
1569 spin_unlock(&grp->lock);
1570 rcu_assign_pointer(p->numa_group, NULL);
1571 put_numa_group(grp);
1572 }
1573
1574 p->numa_faults = NULL;
1575 p->numa_faults_buffer = NULL;
1576 kfree(numa_faults);
845} 1577}
846 1578
847/* 1579/*
848 * Got a PROT_NONE fault for a page on @node. 1580 * Got a PROT_NONE fault for a page on @node.
849 */ 1581 */
850void task_numa_fault(int node, int pages, bool migrated) 1582void task_numa_fault(int last_cpupid, int node, int pages, int flags)
851{ 1583{
852 struct task_struct *p = current; 1584 struct task_struct *p = current;
1585 bool migrated = flags & TNF_MIGRATED;
1586 int priv;
853 1587
854 if (!numabalancing_enabled) 1588 if (!numabalancing_enabled)
855 return; 1589 return;
856 1590
857 /* FIXME: Allocate task-specific structure for placement policy here */ 1591 /* for example, ksmd faulting in a user's mm */
1592 if (!p->mm)
1593 return;
1594
1595 /* Do not worry about placement if exiting */
1596 if (p->state == TASK_DEAD)
1597 return;
1598
1599 /* Allocate buffer to track faults on a per-node basis */
1600 if (unlikely(!p->numa_faults)) {
1601 int size = sizeof(*p->numa_faults) * 2 * nr_node_ids;
1602
1603 /* numa_faults and numa_faults_buffer share the allocation */
1604 p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN);
1605 if (!p->numa_faults)
1606 return;
1607
1608 BUG_ON(p->numa_faults_buffer);
1609 p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
1610 p->total_numa_faults = 0;
1611 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1612 }
858 1613
859 /* 1614 /*
860 * If pages are properly placed (did not migrate) then scan slower. 1615 * First accesses are treated as private, otherwise consider accesses
861 * This is reset periodically in case of phase changes 1616 * to be private if the accessing pid has not changed
862 */ 1617 */
863 if (!migrated) 1618 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
864 p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max, 1619 priv = 1;
865 p->numa_scan_period + jiffies_to_msecs(10)); 1620 } else {
1621 priv = cpupid_match_pid(p, last_cpupid);
1622 if (!priv && !(flags & TNF_NO_GROUP))
1623 task_numa_group(p, last_cpupid, flags, &priv);
1624 }
866 1625
867 task_numa_placement(p); 1626 task_numa_placement(p);
1627
1628 /*
1629 * Retry task to preferred node migration periodically, in case it
1630 * previously failed, or the scheduler moved us.
1631 */
1632 if (time_after(jiffies, p->numa_migrate_retry))
1633 numa_migrate_preferred(p);
1634
1635 if (migrated)
1636 p->numa_pages_migrated += pages;
1637
1638 p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
1639 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
868} 1640}
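
The fault statistics recorded here use a single allocation split in two halves: the decayed per-node counters in numa_faults[] and the raw counters gathered since the last scan in numa_faults_buffer[], which task_numa_placement() folds in and clears. A minimal user-space sketch of that layout, assuming the usual 2*nid + priv mapping for task_faults_idx() (the node count and fault numbers are made up):

#include <stdio.h>
#include <stdlib.h>

static int task_faults_idx(int nid, int priv)
{
	return 2 * nid + priv;
}

int main(void)
{
	int nr_node_ids = 4;	/* pretend 4-node machine */
	int size = sizeof(unsigned long) * 2 * nr_node_ids;
	unsigned long *numa_faults, *numa_faults_buffer;
	int i;

	/* one allocation; the buffer lives in the second half */
	numa_faults = calloc(2, size);
	if (!numa_faults)
		return 1;
	numa_faults_buffer = numa_faults + 2 * nr_node_ids;

	/* record a private fault of 8 pages on node 2 */
	numa_faults_buffer[task_faults_idx(2, 1)] += 8;

	/* task_numa_placement() later decays and folds the buffer in */
	for (i = 0; i < 2 * nr_node_ids; i++) {
		numa_faults[i] >>= 1;
		numa_faults[i] += numa_faults_buffer[i];
		numa_faults_buffer[i] = 0;
	}
	printf("node 2 private faults: %lu\n",
	       numa_faults[task_faults_idx(2, 1)]);
	free(numa_faults);
	return 0;
}
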
869 1641
870static void reset_ptenuma_scan(struct task_struct *p) 1642static void reset_ptenuma_scan(struct task_struct *p)
@@ -884,6 +1656,7 @@ void task_numa_work(struct callback_head *work)
884 struct mm_struct *mm = p->mm; 1656 struct mm_struct *mm = p->mm;
885 struct vm_area_struct *vma; 1657 struct vm_area_struct *vma;
886 unsigned long start, end; 1658 unsigned long start, end;
1659 unsigned long nr_pte_updates = 0;
887 long pages; 1660 long pages;
888 1661
889 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work)); 1662 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
@@ -900,35 +1673,9 @@ void task_numa_work(struct callback_head *work)
900 if (p->flags & PF_EXITING) 1673 if (p->flags & PF_EXITING)
901 return; 1674 return;
902 1675
903 /* 1676 if (!mm->numa_next_scan) {
904 * We do not care about task placement until a task runs on a node 1677 mm->numa_next_scan = now +
905 * other than the first one used by the address space. This is 1678 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
906 * largely because migrations are driven by what CPU the task
907 * is running on. If it's never scheduled on another node, it'll
908 * not migrate so why bother trapping the fault.
909 */
910 if (mm->first_nid == NUMA_PTE_SCAN_INIT)
911 mm->first_nid = numa_node_id();
912 if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
913 /* Are we running on a new node yet? */
914 if (numa_node_id() == mm->first_nid &&
915 !sched_feat_numa(NUMA_FORCE))
916 return;
917
918 mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
919 }
920
921 /*
922 * Reset the scan period if enough time has gone by. Objective is that
923 * scanning will be reduced if pages are properly placed. As tasks
924 * can enter different phases this needs to be re-examined. Lacking
925 * proper tracking of reference behaviour, this blunt hammer is used.
926 */
927 migrate = mm->numa_next_reset;
928 if (time_after(now, migrate)) {
929 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
930 next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
931 xchg(&mm->numa_next_reset, next_scan);
932 } 1679 }
933 1680
934 /* 1681 /*
@@ -938,20 +1685,20 @@ void task_numa_work(struct callback_head *work)
938 if (time_before(now, migrate)) 1685 if (time_before(now, migrate))
939 return; 1686 return;
940 1687
941 if (p->numa_scan_period == 0) 1688 if (p->numa_scan_period == 0) {
942 p->numa_scan_period = sysctl_numa_balancing_scan_period_min; 1689 p->numa_scan_period_max = task_scan_max(p);
1690 p->numa_scan_period = task_scan_min(p);
1691 }
943 1692
944 next_scan = now + msecs_to_jiffies(p->numa_scan_period); 1693 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
945 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate) 1694 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
946 return; 1695 return;
947 1696
948 /* 1697 /*
949 * Do not set pte_numa if the current running node is rate-limited. 1698 * Delay this task enough that another task of this mm will likely win
950 * This loses statistics on the fault but if we are unwilling to 1699 * the next time around.
951 * migrate to this node, it is less likely we can do useful work
952 */ 1700 */
953 if (migrate_ratelimited(numa_node_id())) 1701 p->node_stamp += 2 * TICK_NSEC;
954 return;
955 1702
956 start = mm->numa_scan_offset; 1703 start = mm->numa_scan_offset;
957 pages = sysctl_numa_balancing_scan_size; 1704 pages = sysctl_numa_balancing_scan_size;
@@ -967,18 +1714,32 @@ void task_numa_work(struct callback_head *work)
967 vma = mm->mmap; 1714 vma = mm->mmap;
968 } 1715 }
969 for (; vma; vma = vma->vm_next) { 1716 for (; vma; vma = vma->vm_next) {
970 if (!vma_migratable(vma)) 1717 if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
971 continue; 1718 continue;
972 1719
973 /* Skip small VMAs. They are not likely to be of relevance */ 1720 /*
974 if (vma->vm_end - vma->vm_start < HPAGE_SIZE) 1721 * Shared library pages mapped by multiple processes are not
1722 * migrated as it is expected they are cache replicated. Avoid
1723 * hinting faults in read-only file-backed mappings or the vdso
1724 * as migrating the pages will be of marginal benefit.
1725 */
1726 if (!vma->vm_mm ||
1727 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
975 continue; 1728 continue;
976 1729
977 do { 1730 do {
978 start = max(start, vma->vm_start); 1731 start = max(start, vma->vm_start);
979 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE); 1732 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
980 end = min(end, vma->vm_end); 1733 end = min(end, vma->vm_end);
981 pages -= change_prot_numa(vma, start, end); 1734 nr_pte_updates += change_prot_numa(vma, start, end);
1735
1736 /*
1737 * Scan sysctl_numa_balancing_scan_size but ensure that
1738 * at least one PTE is updated so that unused virtual
1739 * address space is quickly skipped.
1740 */
1741 if (nr_pte_updates)
1742 pages -= (end - start) >> PAGE_SHIFT;
982 1743
983 start = end; 1744 start = end;
984 if (pages <= 0) 1745 if (pages <= 0)
@@ -988,10 +1749,10 @@ void task_numa_work(struct callback_head *work)
988 1749
989out: 1750out:
990 /* 1751 /*
991 * It is possible to reach the end of the VMA list but the last few VMAs are 1752 * It is possible to reach the end of the VMA list but the last few
992 * not guaranteed to the vma_migratable. If they are not, we would find the 1753 * VMAs are not guaranteed to be vma_migratable. If they are not, we
993 * !migratable VMA on the next scan but not reset the scanner to the start 1754 * would find the !migratable VMA on the next scan but not reset the
994 * so check it now. 1755 * scanner to the start so check it now.
995 */ 1756 */
996 if (vma) 1757 if (vma)
997 mm->numa_scan_offset = start; 1758 mm->numa_scan_offset = start;
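
The nr_pte_updates logic above means the scan budget is only charged once the walk has actually found populated PTEs, so large unused stretches of virtual address space are skipped almost for free. A toy user-space model of that idea, with change_prot_numa() replaced by a fake and made-up numbers for the populated range, chunk size and budget:

#include <stdio.h>

#define PAGE_SHIFT 12

/* pretend only pages [512, 576) of the address space are populated */
static long fake_change_prot_numa(unsigned long start, unsigned long end)
{
	unsigned long lo = 512UL << PAGE_SHIFT, hi = 576UL << PAGE_SHIFT;

	if (end <= lo || start >= hi)
		return 0;
	if (start < lo)
		start = lo;
	if (end > hi)
		end = hi;
	return (end - start) >> PAGE_SHIFT;
}

int main(void)
{
	unsigned long start = 0, end, span = 1024UL << PAGE_SHIFT;
	long pages = 256;			/* scan budget, in pages */
	unsigned long nr_pte_updates = 0;
	int chunks = 0;

	while (start < span && pages > 0) {
		end = start + (32UL << PAGE_SHIFT);	/* 32-page chunks */
		nr_pte_updates += fake_change_prot_numa(start, end);

		/* budget is only charged once something was updated */
		if (nr_pte_updates)
			pages -= (end - start) >> PAGE_SHIFT;

		start = end;
		chunks++;
	}
	printf("scanned %d chunks (%lu pages), updated %lu PTEs\n",
	       chunks, start >> PAGE_SHIFT, nr_pte_updates);
	return 0;
}

The 512 leading empty pages cost nothing; once the populated range is reached the 256-page budget is consumed and the scan stops, mirroring the comment about skipping unused address space quickly.
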
@@ -1025,8 +1786,8 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
1025 1786
1026 if (now - curr->node_stamp > period) { 1787 if (now - curr->node_stamp > period) {
1027 if (!curr->node_stamp) 1788 if (!curr->node_stamp)
1028 curr->numa_scan_period = sysctl_numa_balancing_scan_period_min; 1789 curr->numa_scan_period = task_scan_min(curr);
1029 curr->node_stamp = now; 1790 curr->node_stamp += period;
1030 1791
1031 if (!time_before(jiffies, curr->mm->numa_next_scan)) { 1792 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
1032 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */ 1793 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
@@ -1038,6 +1799,14 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
1038static void task_tick_numa(struct rq *rq, struct task_struct *curr) 1799static void task_tick_numa(struct rq *rq, struct task_struct *curr)
1039{ 1800{
1040} 1801}
1802
1803static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1804{
1805}
1806
1807static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1808{
1809}
1041#endif /* CONFIG_NUMA_BALANCING */ 1810#endif /* CONFIG_NUMA_BALANCING */
1042 1811
1043static void 1812static void
@@ -1047,8 +1816,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1047 if (!parent_entity(se)) 1816 if (!parent_entity(se))
1048 update_load_add(&rq_of(cfs_rq)->load, se->load.weight); 1817 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
1049#ifdef CONFIG_SMP 1818#ifdef CONFIG_SMP
1050 if (entity_is_task(se)) 1819 if (entity_is_task(se)) {
1051 list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks); 1820 struct rq *rq = rq_of(cfs_rq);
1821
1822 account_numa_enqueue(rq, task_of(se));
1823 list_add(&se->group_node, &rq->cfs_tasks);
1824 }
1052#endif 1825#endif
1053 cfs_rq->nr_running++; 1826 cfs_rq->nr_running++;
1054} 1827}
@@ -1059,8 +1832,10 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1059 update_load_sub(&cfs_rq->load, se->load.weight); 1832 update_load_sub(&cfs_rq->load, se->load.weight);
1060 if (!parent_entity(se)) 1833 if (!parent_entity(se))
1061 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight); 1834 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
1062 if (entity_is_task(se)) 1835 if (entity_is_task(se)) {
1836 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
1063 list_del_init(&se->group_node); 1837 list_del_init(&se->group_node);
1838 }
1064 cfs_rq->nr_running--; 1839 cfs_rq->nr_running--;
1065} 1840}
1066 1841
@@ -2070,13 +2845,14 @@ static inline bool cfs_bandwidth_used(void)
2070 return static_key_false(&__cfs_bandwidth_used); 2845 return static_key_false(&__cfs_bandwidth_used);
2071} 2846}
2072 2847
2073void account_cfs_bandwidth_used(int enabled, int was_enabled) 2848void cfs_bandwidth_usage_inc(void)
2074{ 2849{
2075 /* only need to count groups transitioning between enabled/!enabled */ 2850 static_key_slow_inc(&__cfs_bandwidth_used);
2076 if (enabled && !was_enabled) 2851}
2077 static_key_slow_inc(&__cfs_bandwidth_used); 2852
2078 else if (!enabled && was_enabled) 2853void cfs_bandwidth_usage_dec(void)
2079 static_key_slow_dec(&__cfs_bandwidth_used); 2854{
2855 static_key_slow_dec(&__cfs_bandwidth_used);
2080} 2856}
2081#else /* HAVE_JUMP_LABEL */ 2857#else /* HAVE_JUMP_LABEL */
2082static bool cfs_bandwidth_used(void) 2858static bool cfs_bandwidth_used(void)
@@ -2084,7 +2860,8 @@ static bool cfs_bandwidth_used(void)
2084 return true; 2860 return true;
2085} 2861}
2086 2862
2087void account_cfs_bandwidth_used(int enabled, int was_enabled) {} 2863void cfs_bandwidth_usage_inc(void) {}
2864void cfs_bandwidth_usage_dec(void) {}
2088#endif /* HAVE_JUMP_LABEL */ 2865#endif /* HAVE_JUMP_LABEL */
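
With account_cfs_bandwidth_used() gone, the enabled/!enabled transition counting has to live at the call site of the two new helpers. A small user-space model of that caller-side pattern; the function and variable names below are stand-ins, not taken from this patch:

#include <stdio.h>

static int bandwidth_used;	/* stands in for the static key count */

static void cfs_bandwidth_usage_inc(void) { bandwidth_used++; }
static void cfs_bandwidth_usage_dec(void) { bandwidth_used--; }

static void set_bandwidth(int runtime_enabled, int runtime_was_enabled)
{
	/* only count groups transitioning between enabled/!enabled */
	if (runtime_enabled && !runtime_was_enabled)
		cfs_bandwidth_usage_inc();
	else if (!runtime_enabled && runtime_was_enabled)
		cfs_bandwidth_usage_dec();
}

int main(void)
{
	set_bandwidth(1, 0);	/* group gains a quota: count 0 -> 1 */
	set_bandwidth(1, 1);	/* quota changed but stays enabled: no change */
	set_bandwidth(0, 1);	/* quota removed: count 1 -> 0 */
	printf("static key count: %d\n", bandwidth_used);
	return 0;
}
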
2089 2866
2090/* 2867/*
@@ -2335,6 +3112,8 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
2335 cfs_rq->throttled_clock = rq_clock(rq); 3112 cfs_rq->throttled_clock = rq_clock(rq);
2336 raw_spin_lock(&cfs_b->lock); 3113 raw_spin_lock(&cfs_b->lock);
2337 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq); 3114 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3115 if (!cfs_b->timer_active)
3116 __start_cfs_bandwidth(cfs_b);
2338 raw_spin_unlock(&cfs_b->lock); 3117 raw_spin_unlock(&cfs_b->lock);
2339} 3118}
2340 3119
@@ -2448,6 +3227,13 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
2448 if (idle) 3227 if (idle)
2449 goto out_unlock; 3228 goto out_unlock;
2450 3229
3230 /*
3231 * if we have relooped after returning idle once, we need to update our
3232 * status as actually running, so that other cpus doing
3233 * __start_cfs_bandwidth will stop trying to cancel us.
3234 */
3235 cfs_b->timer_active = 1;
3236
2451 __refill_cfs_bandwidth_runtime(cfs_b); 3237 __refill_cfs_bandwidth_runtime(cfs_b);
2452 3238
2453 if (!throttled) { 3239 if (!throttled) {
@@ -2508,7 +3294,13 @@ static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
2508/* how long we wait to gather additional slack before distributing */ 3294/* how long we wait to gather additional slack before distributing */
2509static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC; 3295static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
2510 3296
2511/* are we near the end of the current quota period? */ 3297/*
3298 * Are we near the end of the current quota period?
3299 *
3300 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
3301 * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
3302 * migrate_hrtimers, base is never cleared, so we are fine.
3303 */
2512static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire) 3304static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
2513{ 3305{
2514 struct hrtimer *refresh_timer = &cfs_b->period_timer; 3306 struct hrtimer *refresh_timer = &cfs_b->period_timer;
@@ -2584,10 +3376,12 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
2584 u64 expires; 3376 u64 expires;
2585 3377
2586 /* confirm we're still not at a refresh boundary */ 3378 /* confirm we're still not at a refresh boundary */
2587 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) 3379 raw_spin_lock(&cfs_b->lock);
3380 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
3381 raw_spin_unlock(&cfs_b->lock);
2588 return; 3382 return;
3383 }
2589 3384
2590 raw_spin_lock(&cfs_b->lock);
2591 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) { 3385 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
2592 runtime = cfs_b->runtime; 3386 runtime = cfs_b->runtime;
2593 cfs_b->runtime = 0; 3387 cfs_b->runtime = 0;
@@ -2708,11 +3502,11 @@ void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2708 * (timer_active==0 becomes visible before the hrtimer call-back 3502 * (timer_active==0 becomes visible before the hrtimer call-back
2709 * terminates). In either case we ensure that it's re-programmed 3503 * terminates). In either case we ensure that it's re-programmed
2710 */ 3504 */
2711 while (unlikely(hrtimer_active(&cfs_b->period_timer))) { 3505 while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
3506 hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
3507 /* bounce the lock to allow do_sched_cfs_period_timer to run */
2712 raw_spin_unlock(&cfs_b->lock); 3508 raw_spin_unlock(&cfs_b->lock);
2713 /* ensure cfs_b->lock is available while we wait */ 3509 cpu_relax();
2714 hrtimer_cancel(&cfs_b->period_timer);
2715
2716 raw_spin_lock(&cfs_b->lock); 3510 raw_spin_lock(&cfs_b->lock);
2717 /* if someone else restarted the timer then we're done */ 3511 /* if someone else restarted the timer then we're done */
2718 if (cfs_b->timer_active) 3512 if (cfs_b->timer_active)
@@ -3113,7 +3907,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3113{ 3907{
3114 struct sched_entity *se = tg->se[cpu]; 3908 struct sched_entity *se = tg->se[cpu];
3115 3909
3116 if (!tg->parent) /* the trivial, non-cgroup case */ 3910 if (!tg->parent || !wl) /* the trivial, non-cgroup case */
3117 return wl; 3911 return wl;
3118 3912
3119 for_each_sched_entity(se) { 3913 for_each_sched_entity(se) {
@@ -3166,8 +3960,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3166} 3960}
3167#else 3961#else
3168 3962
3169static inline unsigned long effective_load(struct task_group *tg, int cpu, 3963static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3170 unsigned long wl, unsigned long wg)
3171{ 3964{
3172 return wl; 3965 return wl;
3173} 3966}
@@ -3420,11 +4213,10 @@ done:
3420 * preempt must be disabled. 4213 * preempt must be disabled.
3421 */ 4214 */
3422static int 4215static int
3423select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) 4216select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
3424{ 4217{
3425 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; 4218 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
3426 int cpu = smp_processor_id(); 4219 int cpu = smp_processor_id();
3427 int prev_cpu = task_cpu(p);
3428 int new_cpu = cpu; 4220 int new_cpu = cpu;
3429 int want_affine = 0; 4221 int want_affine = 0;
3430 int sync = wake_flags & WF_SYNC; 4222 int sync = wake_flags & WF_SYNC;
@@ -3904,9 +4696,12 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
3904 4696
3905static unsigned long __read_mostly max_load_balance_interval = HZ/10; 4697static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3906 4698
4699enum fbq_type { regular, remote, all };
4700
3907#define LBF_ALL_PINNED 0x01 4701#define LBF_ALL_PINNED 0x01
3908#define LBF_NEED_BREAK 0x02 4702#define LBF_NEED_BREAK 0x02
3909#define LBF_SOME_PINNED 0x04 4703#define LBF_DST_PINNED 0x04
4704#define LBF_SOME_PINNED 0x08
3910 4705
3911struct lb_env { 4706struct lb_env {
3912 struct sched_domain *sd; 4707 struct sched_domain *sd;
@@ -3929,6 +4724,8 @@ struct lb_env {
3929 unsigned int loop; 4724 unsigned int loop;
3930 unsigned int loop_break; 4725 unsigned int loop_break;
3931 unsigned int loop_max; 4726 unsigned int loop_max;
4727
4728 enum fbq_type fbq_type;
3932}; 4729};
3933 4730
3934/* 4731/*
@@ -3975,6 +4772,78 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3975 return delta < (s64)sysctl_sched_migration_cost; 4772 return delta < (s64)sysctl_sched_migration_cost;
3976} 4773}
3977 4774
4775#ifdef CONFIG_NUMA_BALANCING
4776/* Returns true if the destination node has incurred more faults */
4777static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
4778{
4779 int src_nid, dst_nid;
4780
4781 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
4782 !(env->sd->flags & SD_NUMA)) {
4783 return false;
4784 }
4785
4786 src_nid = cpu_to_node(env->src_cpu);
4787 dst_nid = cpu_to_node(env->dst_cpu);
4788
4789 if (src_nid == dst_nid)
4790 return false;
4791
4792 /* Always encourage migration to the preferred node. */
4793 if (dst_nid == p->numa_preferred_nid)
4794 return true;
4795
4796 /* If both task and group weight improve, this move is a winner. */
4797 if (task_weight(p, dst_nid) > task_weight(p, src_nid) &&
4798 group_weight(p, dst_nid) > group_weight(p, src_nid))
4799 return true;
4800
4801 return false;
4802}
4803
4804
4805static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
4806{
4807 int src_nid, dst_nid;
4808
4809 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
4810 return false;
4811
4812 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
4813 return false;
4814
4815 src_nid = cpu_to_node(env->src_cpu);
4816 dst_nid = cpu_to_node(env->dst_cpu);
4817
4818 if (src_nid == dst_nid)
4819 return false;
4820
4821 /* Migrating away from the preferred node is always bad. */
4822 if (src_nid == p->numa_preferred_nid)
4823 return true;
4824
4825 /* If either task or group weight get worse, don't do it. */
4826 if (task_weight(p, dst_nid) < task_weight(p, src_nid) ||
4827 group_weight(p, dst_nid) < group_weight(p, src_nid))
4828 return true;
4829
4830 return false;
4831}
4832
4833#else
4834static inline bool migrate_improves_locality(struct task_struct *p,
4835 struct lb_env *env)
4836{
4837 return false;
4838}
4839
4840static inline bool migrate_degrades_locality(struct task_struct *p,
4841 struct lb_env *env)
4842{
4843 return false;
4844}
4845#endif
4846
3978/* 4847/*
3979 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 4848 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3980 */ 4849 */
@@ -3997,6 +4866,8 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
3997 4866
3998 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 4867 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
3999 4868
4869 env->flags |= LBF_SOME_PINNED;
4870
4000 /* 4871 /*
4001 * Remember if this task can be migrated to any other cpu in 4872 * Remember if this task can be migrated to any other cpu in
4002 * our sched_group. We may want to revisit it if we couldn't 4873 * our sched_group. We may want to revisit it if we couldn't
@@ -4005,13 +4876,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
4005 * Also avoid computing new_dst_cpu if we have already computed 4876 * Also avoid computing new_dst_cpu if we have already computed
4006 * one in current iteration. 4877 * one in current iteration.
4007 */ 4878 */
4008 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) 4879 if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
4009 return 0; 4880 return 0;
4010 4881
4011 /* Prevent to re-select dst_cpu via env's cpus */ 4882 /* Prevent to re-select dst_cpu via env's cpus */
4012 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { 4883 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
4013 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { 4884 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
4014 env->flags |= LBF_SOME_PINNED; 4885 env->flags |= LBF_DST_PINNED;
4015 env->new_dst_cpu = cpu; 4886 env->new_dst_cpu = cpu;
4016 break; 4887 break;
4017 } 4888 }
@@ -4030,11 +4901,24 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
4030 4901
4031 /* 4902 /*
4032 * Aggressive migration if: 4903 * Aggressive migration if:
4033 * 1) task is cache cold, or 4904 * 1) destination numa is preferred
4034 * 2) too many balance attempts have failed. 4905 * 2) task is cache cold, or
4906 * 3) too many balance attempts have failed.
4035 */ 4907 */
4036
4037 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd); 4908 tsk_cache_hot = task_hot(p, rq_clock_task(env->src_rq), env->sd);
4909 if (!tsk_cache_hot)
4910 tsk_cache_hot = migrate_degrades_locality(p, env);
4911
4912 if (migrate_improves_locality(p, env)) {
4913#ifdef CONFIG_SCHEDSTATS
4914 if (tsk_cache_hot) {
4915 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
4916 schedstat_inc(p, se.statistics.nr_forced_migrations);
4917 }
4918#endif
4919 return 1;
4920 }
4921
4038 if (!tsk_cache_hot || 4922 if (!tsk_cache_hot ||
4039 env->sd->nr_balance_failed > env->sd->cache_nice_tries) { 4923 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
4040 4924
@@ -4077,8 +4961,6 @@ static int move_one_task(struct lb_env *env)
4077 return 0; 4961 return 0;
4078} 4962}
4079 4963
4080static unsigned long task_h_load(struct task_struct *p);
4081
4082static const unsigned int sched_nr_migrate_break = 32; 4964static const unsigned int sched_nr_migrate_break = 32;
4083 4965
4084/* 4966/*
@@ -4291,6 +5173,10 @@ struct sg_lb_stats {
4291 unsigned int group_weight; 5173 unsigned int group_weight;
4292 int group_imb; /* Is there an imbalance in the group ? */ 5174 int group_imb; /* Is there an imbalance in the group ? */
4293 int group_has_capacity; /* Is there extra capacity in the group? */ 5175 int group_has_capacity; /* Is there extra capacity in the group? */
5176#ifdef CONFIG_NUMA_BALANCING
5177 unsigned int nr_numa_running;
5178 unsigned int nr_preferred_running;
5179#endif
4294}; 5180};
4295 5181
4296/* 5182/*
@@ -4330,7 +5216,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
4330/** 5216/**
4331 * get_sd_load_idx - Obtain the load index for a given sched domain. 5217 * get_sd_load_idx - Obtain the load index for a given sched domain.
4332 * @sd: The sched_domain whose load_idx is to be obtained. 5218 * @sd: The sched_domain whose load_idx is to be obtained.
4333 * @idle: The Idle status of the CPU for whose sd load_icx is obtained. 5219 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
4334 * 5220 *
4335 * Return: The load index. 5221 * Return: The load index.
4336 */ 5222 */
@@ -4447,7 +5333,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
4447{ 5333{
4448 struct sched_domain *child = sd->child; 5334 struct sched_domain *child = sd->child;
4449 struct sched_group *group, *sdg = sd->groups; 5335 struct sched_group *group, *sdg = sd->groups;
4450 unsigned long power; 5336 unsigned long power, power_orig;
4451 unsigned long interval; 5337 unsigned long interval;
4452 5338
4453 interval = msecs_to_jiffies(sd->balance_interval); 5339 interval = msecs_to_jiffies(sd->balance_interval);
@@ -4459,7 +5345,7 @@ void update_group_power(struct sched_domain *sd, int cpu)
4459 return; 5345 return;
4460 } 5346 }
4461 5347
4462 power = 0; 5348 power_orig = power = 0;
4463 5349
4464 if (child->flags & SD_OVERLAP) { 5350 if (child->flags & SD_OVERLAP) {
4465 /* 5351 /*
@@ -4467,8 +5353,12 @@ void update_group_power(struct sched_domain *sd, int cpu)
4467 * span the current group. 5353 * span the current group.
4468 */ 5354 */
4469 5355
4470 for_each_cpu(cpu, sched_group_cpus(sdg)) 5356 for_each_cpu(cpu, sched_group_cpus(sdg)) {
4471 power += power_of(cpu); 5357 struct sched_group *sg = cpu_rq(cpu)->sd->groups;
5358
5359 power_orig += sg->sgp->power_orig;
5360 power += sg->sgp->power;
5361 }
4472 } else { 5362 } else {
4473 /* 5363 /*
4474 * !SD_OVERLAP domains can assume that child groups 5364 * !SD_OVERLAP domains can assume that child groups
@@ -4477,12 +5367,14 @@ void update_group_power(struct sched_domain *sd, int cpu)
4477 5367
4478 group = child->groups; 5368 group = child->groups;
4479 do { 5369 do {
5370 power_orig += group->sgp->power_orig;
4480 power += group->sgp->power; 5371 power += group->sgp->power;
4481 group = group->next; 5372 group = group->next;
4482 } while (group != child->groups); 5373 } while (group != child->groups);
4483 } 5374 }
4484 5375
4485 sdg->sgp->power_orig = sdg->sgp->power = power; 5376 sdg->sgp->power_orig = power_orig;
5377 sdg->sgp->power = power;
4486} 5378}
4487 5379
4488/* 5380/*
@@ -4526,13 +5418,12 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
4526 * cpu 3 and leave one of the cpus in the second group unused. 5418 * cpu 3 and leave one of the cpus in the second group unused.
4527 * 5419 *
4528 * The current solution to this issue is detecting the skew in the first group 5420 * The current solution to this issue is detecting the skew in the first group
4529 * by noticing it has a cpu that is overloaded while the remaining cpus are 5421 * by noticing the lower domain failed to reach balance and had difficulty
4530 * idle -- or rather, there's a distinct imbalance in the cpus; see 5422 * moving tasks due to affinity constraints.
4531 * sg_imbalanced().
4532 * 5423 *
4533 * When this is so detected; this group becomes a candidate for busiest; see 5424 * When this is so detected; this group becomes a candidate for busiest; see
4534 * update_sd_pick_busiest(). And calculcate_imbalance() and 5425 * update_sd_pick_busiest(). And calculate_imbalance() and
4535 * find_busiest_group() avoid some of the usual balance conditional to allow it 5426 * find_busiest_group() avoid some of the usual balance conditions to allow it
4536 * to create an effective group imbalance. 5427 * to create an effective group imbalance.
4537 * 5428 *
4538 * This is a somewhat tricky proposition since the next run might not find the 5429 * This is a somewhat tricky proposition since the next run might not find the
@@ -4540,49 +5431,36 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
4540 * subtle and fragile situation. 5431 * subtle and fragile situation.
4541 */ 5432 */
4542 5433
4543struct sg_imb_stats { 5434static inline int sg_imbalanced(struct sched_group *group)
4544 unsigned long max_nr_running, min_nr_running;
4545 unsigned long max_cpu_load, min_cpu_load;
4546};
4547
4548static inline void init_sg_imb_stats(struct sg_imb_stats *sgi)
4549{ 5435{
4550 sgi->max_cpu_load = sgi->max_nr_running = 0UL; 5436 return group->sgp->imbalance;
4551 sgi->min_cpu_load = sgi->min_nr_running = ~0UL;
4552} 5437}
4553 5438
4554static inline void 5439/*
4555update_sg_imb_stats(struct sg_imb_stats *sgi, 5440 * Compute the group capacity.
4556 unsigned long load, unsigned long nr_running) 5441 *
5442 * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by
5443 * first dividing out the smt factor and computing the actual number of cores
5444 * and limit power unit capacity with that.
5445 */
5446static inline int sg_capacity(struct lb_env *env, struct sched_group *group)
4557{ 5447{
4558 if (load > sgi->max_cpu_load) 5448 unsigned int capacity, smt, cpus;
4559 sgi->max_cpu_load = load; 5449 unsigned int power, power_orig;
4560 if (sgi->min_cpu_load > load)
4561 sgi->min_cpu_load = load;
4562 5450
4563 if (nr_running > sgi->max_nr_running) 5451 power = group->sgp->power;
4564 sgi->max_nr_running = nr_running; 5452 power_orig = group->sgp->power_orig;
4565 if (sgi->min_nr_running > nr_running) 5453 cpus = group->group_weight;
4566 sgi->min_nr_running = nr_running;
4567}
4568 5454
4569static inline int 5455 /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */
4570sg_imbalanced(struct sg_lb_stats *sgs, struct sg_imb_stats *sgi) 5456 smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
4571{ 5457 capacity = cpus / smt; /* cores */
4572 /*
4573 * Consider the group unbalanced when the imbalance is larger
4574 * than the average weight of a task.
4575 *
4576 * APZ: with cgroup the avg task weight can vary wildly and
4577 * might not be a suitable number - should we keep a
4578 * normalized nr_running number somewhere that negates
4579 * the hierarchy?
4580 */
4581 if ((sgi->max_cpu_load - sgi->min_cpu_load) >= sgs->load_per_task &&
4582 (sgi->max_nr_running - sgi->min_nr_running) > 1)
4583 return 1;
4584 5458
4585 return 0; 5459 capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));
5460 if (!capacity)
5461 capacity = fix_small_capacity(env->sd, group);
5462
5463 return capacity;
4586} 5464}
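
A worked example makes the 'phantom core' avoidance in sg_capacity() concrete. The sketch below uses illustrative power numbers for a group of four cores with two SMT siblings each (roughly 1178 power per core, not a value taken from this patch):

#include <stdio.h>

#define SCHED_POWER_SCALE	1024
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define DIV_ROUND_CLOSEST(n, d)	(((n) + ((d) / 2)) / (d))

int main(void)
{
	unsigned int cpus = 8;		/* 4 cores, 2 SMT siblings each */
	unsigned int power_orig = 4712;	/* illustrative: ~1178 per core */
	unsigned int power = 4712;
	unsigned int smt, capacity, old_rounding;

	/* smt := ceil(cpus / power), assuming 1 < smt_power < 2 */
	smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);	/* 2 */
	capacity = cpus / smt;						/* 4 cores */

	old_rounding = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);	/* 5 */
	if (capacity > old_rounding)
		capacity = old_rounding;

	printf("capacity %u (plain rounding would say %u)\n",
	       capacity, old_rounding);
	return 0;
}

Rounding the summed power directly manufactures a fifth 'core' out of the SMT headroom; dividing the SMT factor out first caps the group at its real core count, which is what the new helper is for.
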
4587 5465
4588/** 5466/**
@@ -4597,12 +5475,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4597 struct sched_group *group, int load_idx, 5475 struct sched_group *group, int load_idx,
4598 int local_group, struct sg_lb_stats *sgs) 5476 int local_group, struct sg_lb_stats *sgs)
4599{ 5477{
4600 struct sg_imb_stats sgi;
4601 unsigned long nr_running; 5478 unsigned long nr_running;
4602 unsigned long load; 5479 unsigned long load;
4603 int i; 5480 int i;
4604 5481
4605 init_sg_imb_stats(&sgi); 5482 memset(sgs, 0, sizeof(*sgs));
4606 5483
4607 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 5484 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
4608 struct rq *rq = cpu_rq(i); 5485 struct rq *rq = cpu_rq(i);
@@ -4610,24 +5487,22 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4610 nr_running = rq->nr_running; 5487 nr_running = rq->nr_running;
4611 5488
4612 /* Bias balancing toward cpus of our domain */ 5489 /* Bias balancing toward cpus of our domain */
4613 if (local_group) { 5490 if (local_group)
4614 load = target_load(i, load_idx); 5491 load = target_load(i, load_idx);
4615 } else { 5492 else
4616 load = source_load(i, load_idx); 5493 load = source_load(i, load_idx);
4617 update_sg_imb_stats(&sgi, load, nr_running);
4618 }
4619 5494
4620 sgs->group_load += load; 5495 sgs->group_load += load;
4621 sgs->sum_nr_running += nr_running; 5496 sgs->sum_nr_running += nr_running;
5497#ifdef CONFIG_NUMA_BALANCING
5498 sgs->nr_numa_running += rq->nr_numa_running;
5499 sgs->nr_preferred_running += rq->nr_preferred_running;
5500#endif
4622 sgs->sum_weighted_load += weighted_cpuload(i); 5501 sgs->sum_weighted_load += weighted_cpuload(i);
4623 if (idle_cpu(i)) 5502 if (idle_cpu(i))
4624 sgs->idle_cpus++; 5503 sgs->idle_cpus++;
4625 } 5504 }
4626 5505
4627 if (local_group && (env->idle != CPU_NEWLY_IDLE ||
4628 time_after_eq(jiffies, group->sgp->next_update)))
4629 update_group_power(env->sd, env->dst_cpu);
4630
4631 /* Adjust by relative CPU power of the group */ 5506 /* Adjust by relative CPU power of the group */
4632 sgs->group_power = group->sgp->power; 5507 sgs->group_power = group->sgp->power;
4633 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power; 5508 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
@@ -4635,16 +5510,11 @@ static inline void update_sg_lb_stats(struct lb_env *env,
4635 if (sgs->sum_nr_running) 5510 if (sgs->sum_nr_running)
4636 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running; 5511 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
4637 5512
4638 sgs->group_imb = sg_imbalanced(sgs, &sgi);
4639
4640 sgs->group_capacity =
4641 DIV_ROUND_CLOSEST(sgs->group_power, SCHED_POWER_SCALE);
4642
4643 if (!sgs->group_capacity)
4644 sgs->group_capacity = fix_small_capacity(env->sd, group);
4645
4646 sgs->group_weight = group->group_weight; 5513 sgs->group_weight = group->group_weight;
4647 5514
5515 sgs->group_imb = sg_imbalanced(group);
5516 sgs->group_capacity = sg_capacity(env, group);
5517
4648 if (sgs->group_capacity > sgs->sum_nr_running) 5518 if (sgs->group_capacity > sgs->sum_nr_running)
4649 sgs->group_has_capacity = 1; 5519 sgs->group_has_capacity = 1;
4650} 5520}
@@ -4693,14 +5563,42 @@ static bool update_sd_pick_busiest(struct lb_env *env,
4693 return false; 5563 return false;
4694} 5564}
4695 5565
5566#ifdef CONFIG_NUMA_BALANCING
5567static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
5568{
5569 if (sgs->sum_nr_running > sgs->nr_numa_running)
5570 return regular;
5571 if (sgs->sum_nr_running > sgs->nr_preferred_running)
5572 return remote;
5573 return all;
5574}
5575
5576static inline enum fbq_type fbq_classify_rq(struct rq *rq)
5577{
5578 if (rq->nr_running > rq->nr_numa_running)
5579 return regular;
5580 if (rq->nr_running > rq->nr_preferred_running)
5581 return remote;
5582 return all;
5583}
5584#else
5585static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
5586{
5587 return all;
5588}
5589
5590static inline enum fbq_type fbq_classify_rq(struct rq *rq)
5591{
5592 return regular;
5593}
5594#endif /* CONFIG_NUMA_BALANCING */
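
The three classes are ordered regular < remote < all so that a single comparison in find_busiest_queue() can skip runqueues whose NUMA placement is already as good as the busiest group's. A minimal user-space sketch of that filter, with made-up counters:

#include <stdio.h>

enum fbq_type { regular, remote, all };

struct counts { unsigned int nr_running, nr_numa, nr_preferred; };

static enum fbq_type classify(const struct counts *c)
{
	if (c->nr_running > c->nr_numa)
		return regular;		/* has !numa tasks to move */
	if (c->nr_running > c->nr_preferred)
		return remote;		/* numa tasks on the wrong node */
	return all;			/* everything is well placed */
}

int main(void)
{
	struct counts rq = { .nr_running = 3, .nr_numa = 3, .nr_preferred = 3 };
	enum fbq_type busiest_group = remote;	/* env->fbq_type */

	/* mirrors "if (rt > env->fbq_type) continue;" in find_busiest_queue() */
	if (classify(&rq) > busiest_group)
		printf("skip this rq: its tasks are already well placed\n");
	return 0;
}
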
5595
4696/** 5596/**
4697 * update_sd_lb_stats - Update sched_domain's statistics for load balancing. 5597 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
4698 * @env: The load balancing environment. 5598 * @env: The load balancing environment.
4699 * @balance: Should we balance.
4700 * @sds: variable to hold the statistics for this sched_domain. 5599 * @sds: variable to hold the statistics for this sched_domain.
4701 */ 5600 */
4702static inline void update_sd_lb_stats(struct lb_env *env, 5601static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
4703 struct sd_lb_stats *sds)
4704{ 5602{
4705 struct sched_domain *child = env->sd->child; 5603 struct sched_domain *child = env->sd->child;
4706 struct sched_group *sg = env->sd->groups; 5604 struct sched_group *sg = env->sd->groups;
@@ -4720,11 +5618,17 @@ static inline void update_sd_lb_stats(struct lb_env *env,
4720 if (local_group) { 5618 if (local_group) {
4721 sds->local = sg; 5619 sds->local = sg;
4722 sgs = &sds->local_stat; 5620 sgs = &sds->local_stat;
5621
5622 if (env->idle != CPU_NEWLY_IDLE ||
5623 time_after_eq(jiffies, sg->sgp->next_update))
5624 update_group_power(env->sd, env->dst_cpu);
4723 } 5625 }
4724 5626
4725 memset(sgs, 0, sizeof(*sgs));
4726 update_sg_lb_stats(env, sg, load_idx, local_group, sgs); 5627 update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
4727 5628
5629 if (local_group)
5630 goto next_group;
5631
4728 /* 5632 /*
4729 * In case the child domain prefers tasks go to siblings 5633 * In case the child domain prefers tasks go to siblings
4730 * first, lower the sg capacity to one so that we'll try 5634 * first, lower the sg capacity to one so that we'll try
@@ -4735,21 +5639,25 @@ static inline void update_sd_lb_stats(struct lb_env *env,
4735 * heaviest group when it is already under-utilized (possible 5639 * heaviest group when it is already under-utilized (possible
4736 * with a large weight task outweighs the tasks on the system). 5640 * with a large weight task outweighs the tasks on the system).
4737 */ 5641 */
4738 if (prefer_sibling && !local_group && 5642 if (prefer_sibling && sds->local &&
4739 sds->local && sds->local_stat.group_has_capacity) 5643 sds->local_stat.group_has_capacity)
4740 sgs->group_capacity = min(sgs->group_capacity, 1U); 5644 sgs->group_capacity = min(sgs->group_capacity, 1U);
4741 5645
4742 /* Now, start updating sd_lb_stats */ 5646 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
4743 sds->total_load += sgs->group_load;
4744 sds->total_pwr += sgs->group_power;
4745
4746 if (!local_group && update_sd_pick_busiest(env, sds, sg, sgs)) {
4747 sds->busiest = sg; 5647 sds->busiest = sg;
4748 sds->busiest_stat = *sgs; 5648 sds->busiest_stat = *sgs;
4749 } 5649 }
4750 5650
5651next_group:
5652 /* Now, start updating sd_lb_stats */
5653 sds->total_load += sgs->group_load;
5654 sds->total_pwr += sgs->group_power;
5655
4751 sg = sg->next; 5656 sg = sg->next;
4752 } while (sg != env->sd->groups); 5657 } while (sg != env->sd->groups);
5658
5659 if (env->sd->flags & SD_NUMA)
5660 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
4753} 5661}
4754 5662
4755/** 5663/**
@@ -5053,15 +5961,39 @@ static struct rq *find_busiest_queue(struct lb_env *env,
5053 int i; 5961 int i;
5054 5962
5055 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 5963 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
5056 unsigned long power = power_of(i); 5964 unsigned long power, capacity, wl;
5057 unsigned long capacity = DIV_ROUND_CLOSEST(power, 5965 enum fbq_type rt;
5058 SCHED_POWER_SCALE);
5059 unsigned long wl;
5060 5966
5967 rq = cpu_rq(i);
5968 rt = fbq_classify_rq(rq);
5969
5970 /*
5971 * We classify groups/runqueues into three groups:
5972 * - regular: there are !numa tasks
5973 * - remote: there are numa tasks that run on the 'wrong' node
5974 * - all: there is no distinction
5975 *
5976 * In order to avoid migrating ideally placed numa tasks,
5977 * ignore those when there are better options.
5978 *
5979 * If we ignore the actual busiest queue to migrate another
5980 * task, the next balance pass can still reduce the busiest
5981 * queue by moving tasks around inside the node.
5982 *
5983 * If we cannot move enough load due to this classification
5984 * the next pass will adjust the group classification and
5985 * allow migration of more tasks.
5986 *
5987 * Both cases only affect the total convergence complexity.
5988 */
5989 if (rt > env->fbq_type)
5990 continue;
5991
5992 power = power_of(i);
5993 capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
5061 if (!capacity) 5994 if (!capacity)
5062 capacity = fix_small_capacity(env->sd, group); 5995 capacity = fix_small_capacity(env->sd, group);
5063 5996
5064 rq = cpu_rq(i);
5065 wl = weighted_cpuload(i); 5997 wl = weighted_cpuload(i);
5066 5998
5067 /* 5999 /*
@@ -5164,6 +6096,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
5164 int *continue_balancing) 6096 int *continue_balancing)
5165{ 6097{
5166 int ld_moved, cur_ld_moved, active_balance = 0; 6098 int ld_moved, cur_ld_moved, active_balance = 0;
6099 struct sched_domain *sd_parent = sd->parent;
5167 struct sched_group *group; 6100 struct sched_group *group;
5168 struct rq *busiest; 6101 struct rq *busiest;
5169 unsigned long flags; 6102 unsigned long flags;
@@ -5177,6 +6110,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
5177 .idle = idle, 6110 .idle = idle,
5178 .loop_break = sched_nr_migrate_break, 6111 .loop_break = sched_nr_migrate_break,
5179 .cpus = cpus, 6112 .cpus = cpus,
6113 .fbq_type = all,
5180 }; 6114 };
5181 6115
5182 /* 6116 /*
@@ -5268,17 +6202,17 @@ more_balance:
5268 * moreover subsequent load balance cycles should correct the 6202 * moreover subsequent load balance cycles should correct the
5269 * excess load moved. 6203 * excess load moved.
5270 */ 6204 */
5271 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { 6205 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
6206
6207 /* Prevent to re-select dst_cpu via env's cpus */
6208 cpumask_clear_cpu(env.dst_cpu, env.cpus);
5272 6209
5273 env.dst_rq = cpu_rq(env.new_dst_cpu); 6210 env.dst_rq = cpu_rq(env.new_dst_cpu);
5274 env.dst_cpu = env.new_dst_cpu; 6211 env.dst_cpu = env.new_dst_cpu;
5275 env.flags &= ~LBF_SOME_PINNED; 6212 env.flags &= ~LBF_DST_PINNED;
5276 env.loop = 0; 6213 env.loop = 0;
5277 env.loop_break = sched_nr_migrate_break; 6214 env.loop_break = sched_nr_migrate_break;
5278 6215
5279 /* Prevent to re-select dst_cpu via env's cpus */
5280 cpumask_clear_cpu(env.dst_cpu, env.cpus);
5281
5282 /* 6216 /*
5283 * Go back to "more_balance" rather than "redo" since we 6217 * Go back to "more_balance" rather than "redo" since we
5284 * need to continue with same src_cpu. 6218 * need to continue with same src_cpu.
@@ -5286,6 +6220,18 @@ more_balance:
5286 goto more_balance; 6220 goto more_balance;
5287 } 6221 }
5288 6222
6223 /*
6224 * We failed to reach balance because of affinity.
6225 */
6226 if (sd_parent) {
6227 int *group_imbalance = &sd_parent->groups->sgp->imbalance;
6228
6229 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
6230 *group_imbalance = 1;
6231 } else if (*group_imbalance)
6232 *group_imbalance = 0;
6233 }
6234
5289 /* All tasks on this runqueue were pinned by CPU affinity */ 6235 /* All tasks on this runqueue were pinned by CPU affinity */
5290 if (unlikely(env.flags & LBF_ALL_PINNED)) { 6236 if (unlikely(env.flags & LBF_ALL_PINNED)) {
5291 cpumask_clear_cpu(cpu_of(busiest), cpus); 6237 cpumask_clear_cpu(cpu_of(busiest), cpus);
@@ -5393,6 +6339,7 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5393 struct sched_domain *sd; 6339 struct sched_domain *sd;
5394 int pulled_task = 0; 6340 int pulled_task = 0;
5395 unsigned long next_balance = jiffies + HZ; 6341 unsigned long next_balance = jiffies + HZ;
6342 u64 curr_cost = 0;
5396 6343
5397 this_rq->idle_stamp = rq_clock(this_rq); 6344 this_rq->idle_stamp = rq_clock(this_rq);
5398 6345
@@ -5409,15 +6356,27 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5409 for_each_domain(this_cpu, sd) { 6356 for_each_domain(this_cpu, sd) {
5410 unsigned long interval; 6357 unsigned long interval;
5411 int continue_balancing = 1; 6358 int continue_balancing = 1;
6359 u64 t0, domain_cost;
5412 6360
5413 if (!(sd->flags & SD_LOAD_BALANCE)) 6361 if (!(sd->flags & SD_LOAD_BALANCE))
5414 continue; 6362 continue;
5415 6363
6364 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost)
6365 break;
6366
5416 if (sd->flags & SD_BALANCE_NEWIDLE) { 6367 if (sd->flags & SD_BALANCE_NEWIDLE) {
6368 t0 = sched_clock_cpu(this_cpu);
6369
5417 /* If we've pulled tasks over stop searching: */ 6370 /* If we've pulled tasks over stop searching: */
5418 pulled_task = load_balance(this_cpu, this_rq, 6371 pulled_task = load_balance(this_cpu, this_rq,
5419 sd, CPU_NEWLY_IDLE, 6372 sd, CPU_NEWLY_IDLE,
5420 &continue_balancing); 6373 &continue_balancing);
6374
6375 domain_cost = sched_clock_cpu(this_cpu) - t0;
6376 if (domain_cost > sd->max_newidle_lb_cost)
6377 sd->max_newidle_lb_cost = domain_cost;
6378
6379 curr_cost += domain_cost;
5421 } 6380 }
5422 6381
5423 interval = msecs_to_jiffies(sd->balance_interval); 6382 interval = msecs_to_jiffies(sd->balance_interval);
@@ -5439,6 +6398,9 @@ void idle_balance(int this_cpu, struct rq *this_rq)
5439 */ 6398 */
5440 this_rq->next_balance = next_balance; 6399 this_rq->next_balance = next_balance;
5441 } 6400 }
6401
6402 if (curr_cost > this_rq->max_idle_balance_cost)
6403 this_rq->max_idle_balance_cost = curr_cost;
5442} 6404}
5443 6405
5444/* 6406/*
@@ -5662,15 +6624,39 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5662 /* Earliest time when we have to do rebalance again */ 6624 /* Earliest time when we have to do rebalance again */
5663 unsigned long next_balance = jiffies + 60*HZ; 6625 unsigned long next_balance = jiffies + 60*HZ;
5664 int update_next_balance = 0; 6626 int update_next_balance = 0;
5665 int need_serialize; 6627 int need_serialize, need_decay = 0;
6628 u64 max_cost = 0;
5666 6629
5667 update_blocked_averages(cpu); 6630 update_blocked_averages(cpu);
5668 6631
5669 rcu_read_lock(); 6632 rcu_read_lock();
5670 for_each_domain(cpu, sd) { 6633 for_each_domain(cpu, sd) {
6634 /*
6635 * Decay the newidle max times here because this is a regular
6636 * visit to all the domains. Decay ~1% per second.
6637 */
6638 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
6639 sd->max_newidle_lb_cost =
6640 (sd->max_newidle_lb_cost * 253) / 256;
6641 sd->next_decay_max_lb_cost = jiffies + HZ;
6642 need_decay = 1;
6643 }
6644 max_cost += sd->max_newidle_lb_cost;
6645
5671 if (!(sd->flags & SD_LOAD_BALANCE)) 6646 if (!(sd->flags & SD_LOAD_BALANCE))
5672 continue; 6647 continue;
5673 6648
6649 /*
6650 * Stop the load balance at this level. There is another
6651 * CPU in our sched group which is doing load balancing more
6652 * actively.
6653 */
6654 if (!continue_balancing) {
6655 if (need_decay)
6656 continue;
6657 break;
6658 }
6659
5674 interval = sd->balance_interval; 6660 interval = sd->balance_interval;
5675 if (idle != CPU_IDLE) 6661 if (idle != CPU_IDLE)
5676 interval *= sd->busy_factor; 6662 interval *= sd->busy_factor;
@@ -5689,7 +6675,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5689 if (time_after_eq(jiffies, sd->last_balance + interval)) { 6675 if (time_after_eq(jiffies, sd->last_balance + interval)) {
5690 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) { 6676 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
5691 /* 6677 /*
5692 * The LBF_SOME_PINNED logic could have changed 6678 * The LBF_DST_PINNED logic could have changed
5693 * env->dst_cpu, so we can't know our idle 6679 * env->dst_cpu, so we can't know our idle
5694 * state even if we migrated tasks. Update it. 6680 * state even if we migrated tasks. Update it.
5695 */ 6681 */
@@ -5704,14 +6690,14 @@ out:
5704 next_balance = sd->last_balance + interval; 6690 next_balance = sd->last_balance + interval;
5705 update_next_balance = 1; 6691 update_next_balance = 1;
5706 } 6692 }
5707 6693 }
6694 if (need_decay) {
5708 /* 6695 /*
5709 * Stop the load balance at this level. There is another 6696 * Ensure the rq-wide value also decays but keep it at a
5710 * CPU in our sched group which is doing load balancing more 6697 * reasonable floor to avoid funnies with rq->avg_idle.
5711 * actively.
5712 */ 6698 */
5713 if (!continue_balancing) 6699 rq->max_idle_balance_cost =
5714 break; 6700 max((u64)sysctl_sched_migration_cost, max_cost);
5715 } 6701 }
5716 rcu_read_unlock(); 6702 rcu_read_unlock();
5717 6703
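
The "decay ~1% per second" in the comment above is the 253/256 factor applied once per second (253/256 is about 0.988, so strictly a bit over 1%). A quick user-space check of how fast a recorded newidle balance cost fades; the starting cost is an arbitrary example value:

#include <stdio.h>

int main(void)
{
	unsigned long long cost = 500000;	/* 500us recorded, in ns */
	int sec;

	for (sec = 1; sec <= 60; sec++)
		cost = (cost * 253) / 256;	/* one decay step per second */

	printf("after 60s: %llu ns (~49%% of the original)\n", cost);
	return 0;
}

So a one-off expensive newidle balance stops dominating sd->max_newidle_lb_cost after roughly a minute, while rq->max_idle_balance_cost is kept from decaying below sysctl_sched_migration_cost by the floor applied above.
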
@@ -6214,7 +7200,8 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
6214 se->cfs_rq = parent->my_q; 7200 se->cfs_rq = parent->my_q;
6215 7201
6216 se->my_q = cfs_rq; 7202 se->my_q = cfs_rq;
6217 update_load_set(&se->load, 0); 7203 /* guarantee group entities always have weight */
7204 update_load_set(&se->load, NICE_0_LOAD);
6218 se->parent = parent; 7205 se->parent = parent;
6219} 7206}
6220 7207
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 99399f8e4799..5716929a2e3a 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -63,10 +63,23 @@ SCHED_FEAT(LB_MIN, false)
63/* 63/*
64 * Apply the automatic NUMA scheduling policy. Enabled automatically 64 * Apply the automatic NUMA scheduling policy. Enabled automatically
65 * at runtime if running on a NUMA machine. Can be controlled via 65 * at runtime if running on a NUMA machine. Can be controlled via
66 * numa_balancing=. Allow PTE scanning to be forced on UMA machines 66 * numa_balancing=
67 * for debugging the core machinery.
68 */ 67 */
69#ifdef CONFIG_NUMA_BALANCING 68#ifdef CONFIG_NUMA_BALANCING
70SCHED_FEAT(NUMA, false) 69SCHED_FEAT(NUMA, false)
71SCHED_FEAT(NUMA_FORCE, false) 70
71/*
72 * NUMA_FAVOUR_HIGHER will favor moving tasks towards nodes where a
73 * higher number of hinting faults are recorded during active load
74 * balancing.
75 */
76SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
77
78/*
79 * NUMA_RESIST_LOWER will resist moving tasks towards nodes where a
80 * lower number of hinting faults have been recorded. As this has
81 * the potential to prevent a task ever migrating to a new node
82 * due to CPU overload it is disabled by default.
83 */
84SCHED_FEAT(NUMA_RESIST_LOWER, false)
72#endif 85#endif
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index d8da01008d39..516c3d9ceea1 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -9,7 +9,7 @@
9 9
10#ifdef CONFIG_SMP 10#ifdef CONFIG_SMP
11static int 11static int
12select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) 12select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
13{ 13{
14 return task_cpu(p); /* IDLE tasks are never migrated */ 14
15} 15}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 01970c8e64df..7d57275fc396 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -246,8 +246,10 @@ static inline void rt_set_overload(struct rq *rq)
246 * if we should look at the mask. It would be a shame 246 * if we should look at the mask. It would be a shame
247 * if we looked at the mask, but the mask was not 247 * if we looked at the mask, but the mask was not
248 * updated yet. 248 * updated yet.
249 *
250 * Matched by the barrier in pull_rt_task().
249 */ 251 */
250 wmb(); 252 smp_wmb();
251 atomic_inc(&rq->rd->rto_count); 253 atomic_inc(&rq->rd->rto_count);
252} 254}
253 255
@@ -1169,13 +1171,10 @@ static void yield_task_rt(struct rq *rq)
1169static int find_lowest_rq(struct task_struct *task); 1171static int find_lowest_rq(struct task_struct *task);
1170 1172
1171static int 1173static int
1172select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) 1174select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1173{ 1175{
1174 struct task_struct *curr; 1176 struct task_struct *curr;
1175 struct rq *rq; 1177 struct rq *rq;
1176 int cpu;
1177
1178 cpu = task_cpu(p);
1179 1178
1180 if (p->nr_cpus_allowed == 1) 1179 if (p->nr_cpus_allowed == 1)
1181 goto out; 1180 goto out;
@@ -1213,8 +1212,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1213 */ 1212 */
1214 if (curr && unlikely(rt_task(curr)) && 1213 if (curr && unlikely(rt_task(curr)) &&
1215 (curr->nr_cpus_allowed < 2 || 1214 (curr->nr_cpus_allowed < 2 ||
1216 curr->prio <= p->prio) && 1215 curr->prio <= p->prio)) {
1217 (p->nr_cpus_allowed > 1)) {
1218 int target = find_lowest_rq(p); 1216 int target = find_lowest_rq(p);
1219 1217
1220 if (target != -1) 1218 if (target != -1)
@@ -1630,6 +1628,12 @@ static int pull_rt_task(struct rq *this_rq)
1630 if (likely(!rt_overloaded(this_rq))) 1628 if (likely(!rt_overloaded(this_rq)))
1631 return 0; 1629 return 0;
1632 1630
1631 /*
 1632 * Match the barrier from rt_set_overload(); this guarantees that if we
1633 * see overloaded we must also see the rto_mask bit.
1634 */
1635 smp_rmb();
1636
1633 for_each_cpu(cpu, this_rq->rd->rto_mask) { 1637 for_each_cpu(cpu, this_rq->rd->rto_mask) {
1634 if (this_cpu == cpu) 1638 if (this_cpu == cpu)
1635 continue; 1639 continue;
@@ -1931,8 +1935,8 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1931 p->rt.time_slice = sched_rr_timeslice; 1935 p->rt.time_slice = sched_rr_timeslice;
1932 1936
1933 /* 1937 /*
1934 * Requeue to the end of queue if we (and all of our ancestors) are the 1938 * Requeue to the end of queue if we (and all of our ancestors) are not
1935 * only element on the queue 1939 * the only element on the queue
1936 */ 1940 */
1937 for_each_sched_rt_entity(rt_se) { 1941 for_each_sched_rt_entity(rt_se) {
1938 if (rt_se->run_list.prev != rt_se->run_list.next) { 1942 if (rt_se->run_list.prev != rt_se->run_list.next) {
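The rt.c hunks pair the smp_wmb() in rt_set_overload() with a new smp_rmb() in pull_rt_task(): the writer must publish the rto_mask bit before bumping rto_count, and a reader that observes the count must then also observe the bit. A userspace sketch of that publish/consume pairing using C11 fences; the names are stand-ins, not the kernel objects.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static _Atomic unsigned long rto_mask;	/* stand-in for rd->rto_mask */
static atomic_int rto_count;		/* stand-in for rd->rto_count */

static void set_overload(int cpu)
{
	atomic_fetch_or_explicit(&rto_mask, 1UL << cpu, memory_order_relaxed);
	/* pairs with the acquire fence in pull_would_scan(), like smp_wmb() */
	atomic_thread_fence(memory_order_release);
	atomic_fetch_add_explicit(&rto_count, 1, memory_order_relaxed);
}

static bool pull_would_scan(int cpu)
{
	if (!atomic_load_explicit(&rto_count, memory_order_relaxed))
		return false;
	/* like smp_rmb(): seeing the count guarantees we see the mask bit */
	atomic_thread_fence(memory_order_acquire);
	return atomic_load_explicit(&rto_mask, memory_order_relaxed) & (1UL << cpu);
}

int main(void)
{
	set_overload(1);
	printf("would scan cpu1: %d\n", pull_would_scan(1));
	return 0;
}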
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b3c5653e1dca..4e650acffed7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -6,6 +6,7 @@
6#include <linux/spinlock.h> 6#include <linux/spinlock.h>
7#include <linux/stop_machine.h> 7#include <linux/stop_machine.h>
8#include <linux/tick.h> 8#include <linux/tick.h>
9#include <linux/slab.h>
9 10
10#include "cpupri.h" 11#include "cpupri.h"
11#include "cpuacct.h" 12#include "cpuacct.h"
@@ -408,6 +409,10 @@ struct rq {
408 * remote CPUs use both these fields when doing load calculation. 409 * remote CPUs use both these fields when doing load calculation.
409 */ 410 */
410 unsigned int nr_running; 411 unsigned int nr_running;
412#ifdef CONFIG_NUMA_BALANCING
413 unsigned int nr_numa_running;
414 unsigned int nr_preferred_running;
415#endif
411 #define CPU_LOAD_IDX_MAX 5 416 #define CPU_LOAD_IDX_MAX 5
412 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 417 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
413 unsigned long last_load_update_tick; 418 unsigned long last_load_update_tick;
@@ -476,6 +481,9 @@ struct rq {
476 u64 age_stamp; 481 u64 age_stamp;
477 u64 idle_stamp; 482 u64 idle_stamp;
478 u64 avg_idle; 483 u64 avg_idle;
484
485 /* This is used to determine avg_idle's max value */
486 u64 max_idle_balance_cost;
479#endif 487#endif
480 488
481#ifdef CONFIG_IRQ_TIME_ACCOUNTING 489#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -552,6 +560,12 @@ static inline u64 rq_clock_task(struct rq *rq)
552 return rq->clock_task; 560 return rq->clock_task;
553} 561}
554 562
563#ifdef CONFIG_NUMA_BALANCING
564extern void sched_setnuma(struct task_struct *p, int node);
565extern int migrate_task_to(struct task_struct *p, int cpu);
566extern int migrate_swap(struct task_struct *, struct task_struct *);
567#endif /* CONFIG_NUMA_BALANCING */
568
555#ifdef CONFIG_SMP 569#ifdef CONFIG_SMP
556 570
557#define rcu_dereference_check_sched_domain(p) \ 571#define rcu_dereference_check_sched_domain(p) \
@@ -593,9 +607,22 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
593 return hsd; 607 return hsd;
594} 608}
595 609
610static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
611{
612 struct sched_domain *sd;
613
614 for_each_domain(cpu, sd) {
615 if (sd->flags & flag)
616 break;
617 }
618
619 return sd;
620}
621
596DECLARE_PER_CPU(struct sched_domain *, sd_llc); 622DECLARE_PER_CPU(struct sched_domain *, sd_llc);
597DECLARE_PER_CPU(int, sd_llc_size); 623DECLARE_PER_CPU(int, sd_llc_size);
598DECLARE_PER_CPU(int, sd_llc_id); 624DECLARE_PER_CPU(int, sd_llc_id);
625DECLARE_PER_CPU(struct sched_domain *, sd_numa);
599 626
600struct sched_group_power { 627struct sched_group_power {
601 atomic_t ref; 628 atomic_t ref;
@@ -605,6 +632,7 @@ struct sched_group_power {
605 */ 632 */
606 unsigned int power, power_orig; 633 unsigned int power, power_orig;
607 unsigned long next_update; 634 unsigned long next_update;
635 int imbalance; /* XXX unrelated to power but shared group state */
608 /* 636 /*
609 * Number of busy cpus in this group. 637 * Number of busy cpus in this group.
610 */ 638 */
@@ -719,6 +747,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
719 */ 747 */
720 smp_wmb(); 748 smp_wmb();
721 task_thread_info(p)->cpu = cpu; 749 task_thread_info(p)->cpu = cpu;
750 p->wake_cpu = cpu;
722#endif 751#endif
723} 752}
724 753
@@ -974,7 +1003,7 @@ struct sched_class {
974 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 1003 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
975 1004
976#ifdef CONFIG_SMP 1005#ifdef CONFIG_SMP
977 int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); 1006 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
978 void (*migrate_task_rq)(struct task_struct *p, int next_cpu); 1007 void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
979 1008
980 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); 1009 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
@@ -1220,6 +1249,24 @@ static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1220 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_); 1249 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1221} 1250}
1222 1251
1252static inline void double_lock(spinlock_t *l1, spinlock_t *l2)
1253{
1254 if (l1 > l2)
1255 swap(l1, l2);
1256
1257 spin_lock(l1);
1258 spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
1259}
1260
1261static inline void double_raw_lock(raw_spinlock_t *l1, raw_spinlock_t *l2)
1262{
1263 if (l1 > l2)
1264 swap(l1, l2);
1265
1266 raw_spin_lock(l1);
1267 raw_spin_lock_nested(l2, SINGLE_DEPTH_NESTING);
1268}
1269
1223/* 1270/*
1224 * double_rq_lock - safely lock two runqueues 1271 * double_rq_lock - safely lock two runqueues
1225 * 1272 *
@@ -1305,7 +1352,8 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
1305extern void init_cfs_rq(struct cfs_rq *cfs_rq); 1352extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1306extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); 1353extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
1307 1354
1308extern void account_cfs_bandwidth_used(int enabled, int was_enabled); 1355extern void cfs_bandwidth_usage_inc(void);
1356extern void cfs_bandwidth_usage_dec(void);
1309 1357
1310#ifdef CONFIG_NO_HZ_COMMON 1358#ifdef CONFIG_NO_HZ_COMMON
1311enum rq_nohz_flag_bits { 1359enum rq_nohz_flag_bits {
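The double_lock() and double_raw_lock() helpers added to sched.h avoid ABBA deadlock by always acquiring the lower-addressed lock first, so any two callers agree on the order regardless of which lock they name first. The same idea in a small pthread sketch, for illustration only:

#include <pthread.h>
#include <stdint.h>

/* always take the lower-addressed lock first so any two callers agree */
static void double_lock_ordered(pthread_mutex_t *l1, pthread_mutex_t *l2)
{
	if ((uintptr_t)l1 > (uintptr_t)l2) {
		pthread_mutex_t *tmp = l1;

		l1 = l2;
		l2 = tmp;
	}
	pthread_mutex_lock(l1);
	pthread_mutex_lock(l2);
}

static void double_unlock_ordered(pthread_mutex_t *l1, pthread_mutex_t *l2)
{
	pthread_mutex_unlock(l1);
	pthread_mutex_unlock(l2);
}

int main(void)
{
	pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;

	/* both call orders acquire a and b in the same underlying order */
	double_lock_ordered(&a, &b);
	double_unlock_ordered(&a, &b);
	double_lock_ordered(&b, &a);
	double_unlock_ordered(&b, &a);
	return 0;
}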
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index c7edee71bce8..4ab704339656 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -59,9 +59,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
59 * from dequeue_task() to account for possible rq->clock skew across cpus. The 59 * from dequeue_task() to account for possible rq->clock skew across cpus. The
60 * delta taken on each cpu would annul the skew. 60 * delta taken on each cpu would annul the skew.
61 */ 61 */
62static inline void sched_info_dequeued(struct task_struct *t) 62static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
63{ 63{
64 unsigned long long now = rq_clock(task_rq(t)), delta = 0; 64 unsigned long long now = rq_clock(rq), delta = 0;
65 65
66 if (unlikely(sched_info_on())) 66 if (unlikely(sched_info_on()))
67 if (t->sched_info.last_queued) 67 if (t->sched_info.last_queued)
@@ -69,7 +69,7 @@ static inline void sched_info_dequeued(struct task_struct *t)
69 sched_info_reset_dequeued(t); 69 sched_info_reset_dequeued(t);
70 t->sched_info.run_delay += delta; 70 t->sched_info.run_delay += delta;
71 71
72 rq_sched_info_dequeued(task_rq(t), delta); 72 rq_sched_info_dequeued(rq, delta);
73} 73}
74 74
75/* 75/*
@@ -77,9 +77,9 @@ static inline void sched_info_dequeued(struct task_struct *t)
77 * long it was waiting to run. We also note when it began so that we 77 * long it was waiting to run. We also note when it began so that we
78 * can keep stats on how long its timeslice is. 78 * can keep stats on how long its timeslice is.
79 */ 79 */
80static void sched_info_arrive(struct task_struct *t) 80static void sched_info_arrive(struct rq *rq, struct task_struct *t)
81{ 81{
82 unsigned long long now = rq_clock(task_rq(t)), delta = 0; 82 unsigned long long now = rq_clock(rq), delta = 0;
83 83
84 if (t->sched_info.last_queued) 84 if (t->sched_info.last_queued)
85 delta = now - t->sched_info.last_queued; 85 delta = now - t->sched_info.last_queued;
@@ -88,7 +88,7 @@ static void sched_info_arrive(struct task_struct *t)
88 t->sched_info.last_arrival = now; 88 t->sched_info.last_arrival = now;
89 t->sched_info.pcount++; 89 t->sched_info.pcount++;
90 90
91 rq_sched_info_arrive(task_rq(t), delta); 91 rq_sched_info_arrive(rq, delta);
92} 92}
93 93
94/* 94/*
@@ -96,11 +96,11 @@ static void sched_info_arrive(struct task_struct *t)
96 * the timestamp if it is already not set. It's assumed that 96 * the timestamp if it is already not set. It's assumed that
97 * sched_info_dequeued() will clear that stamp when appropriate. 97 * sched_info_dequeued() will clear that stamp when appropriate.
98 */ 98 */
99static inline void sched_info_queued(struct task_struct *t) 99static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
100{ 100{
101 if (unlikely(sched_info_on())) 101 if (unlikely(sched_info_on()))
102 if (!t->sched_info.last_queued) 102 if (!t->sched_info.last_queued)
103 t->sched_info.last_queued = rq_clock(task_rq(t)); 103 t->sched_info.last_queued = rq_clock(rq);
104} 104}
105 105
106/* 106/*
@@ -111,15 +111,15 @@ static inline void sched_info_queued(struct task_struct *t)
111 * sched_info_queued() to mark that it has now again started waiting on 111 * sched_info_queued() to mark that it has now again started waiting on
112 * the runqueue. 112 * the runqueue.
113 */ 113 */
114static inline void sched_info_depart(struct task_struct *t) 114static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
115{ 115{
116 unsigned long long delta = rq_clock(task_rq(t)) - 116 unsigned long long delta = rq_clock(rq) -
117 t->sched_info.last_arrival; 117 t->sched_info.last_arrival;
118 118
119 rq_sched_info_depart(task_rq(t), delta); 119 rq_sched_info_depart(rq, delta);
120 120
121 if (t->state == TASK_RUNNING) 121 if (t->state == TASK_RUNNING)
122 sched_info_queued(t); 122 sched_info_queued(rq, t);
123} 123}
124 124
125/* 125/*
@@ -128,32 +128,34 @@ static inline void sched_info_depart(struct task_struct *t)
128 * the idle task.) We are only called when prev != next. 128 * the idle task.) We are only called when prev != next.
129 */ 129 */
130static inline void 130static inline void
131__sched_info_switch(struct task_struct *prev, struct task_struct *next) 131__sched_info_switch(struct rq *rq,
132 struct task_struct *prev, struct task_struct *next)
132{ 133{
133 struct rq *rq = task_rq(prev);
134
135 /* 134 /*
136 * prev now departs the cpu. It's not interesting to record 135 * prev now departs the cpu. It's not interesting to record
137 * stats about how efficient we were at scheduling the idle 136 * stats about how efficient we were at scheduling the idle
138 * process, however. 137 * process, however.
139 */ 138 */
140 if (prev != rq->idle) 139 if (prev != rq->idle)
141 sched_info_depart(prev); 140 sched_info_depart(rq, prev);
142 141
143 if (next != rq->idle) 142 if (next != rq->idle)
144 sched_info_arrive(next); 143 sched_info_arrive(rq, next);
145} 144}
146static inline void 145static inline void
147sched_info_switch(struct task_struct *prev, struct task_struct *next) 146sched_info_switch(struct rq *rq,
147 struct task_struct *prev, struct task_struct *next)
148{ 148{
149 if (unlikely(sched_info_on())) 149 if (unlikely(sched_info_on()))
150 __sched_info_switch(prev, next); 150 __sched_info_switch(rq, prev, next);
151} 151}
152#else 152#else
153#define sched_info_queued(t) do { } while (0) 153#define sched_info_queued(rq, t) do { } while (0)
154#define sched_info_reset_dequeued(t) do { } while (0) 154#define sched_info_reset_dequeued(t) do { } while (0)
155#define sched_info_dequeued(t) do { } while (0) 155#define sched_info_dequeued(rq, t) do { } while (0)
156#define sched_info_switch(t, next) do { } while (0) 156#define sched_info_depart(rq, t) do { } while (0)
157#define sched_info_arrive(rq, next) do { } while (0)
158#define sched_info_switch(rq, t, next) do { } while (0)
157#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ 159#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
158 160
159/* 161/*
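The stats.h changes thread the rq through the sched_info helpers so every call site reads rq_clock(rq) directly instead of re-deriving task_rq(t). The accounting itself is simple delta arithmetic on the task's last_queued stamp; a reduced sketch with userspace types and trimmed field names:

typedef unsigned long long u64;

struct sched_info_sketch {
	u64 last_queued;	/* rq clock when the task last became runnable */
	u64 run_delay;		/* total time spent runnable but not running */
	u64 last_arrival;	/* rq clock when it last got the CPU */
	unsigned long pcount;	/* times it has been scheduled in */
};

/* mirrors sched_info_arrive(): charge the wait accrued since last_queued */
static void sketch_arrive(struct sched_info_sketch *si, u64 rq_clock_now)
{
	u64 delta = 0;

	if (si->last_queued)
		delta = rq_clock_now - si->last_queued;
	si->last_queued = 0;
	si->run_delay += delta;
	si->last_arrival = rq_clock_now;
	si->pcount++;
}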
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index e08fbeeb54b9..47197de8abd9 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -11,7 +11,7 @@
11 11
12#ifdef CONFIG_SMP 12#ifdef CONFIG_SMP
13static int 13static int
14select_task_rq_stop(struct task_struct *p, int sd_flag, int flags) 14select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
15{ 15{
16 return task_cpu(p); /* stop tasks never migrate */ 16
17} 17}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index d7d498d8cc4f..dcab1d3fb53d 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -100,13 +100,13 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
100 100
101 raw_local_irq_save(flags); 101 raw_local_irq_save(flags);
102 /* 102 /*
103 * The preempt tracer hooks into add_preempt_count and will break 103 * The preempt tracer hooks into preempt_count_add and will break
104 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET 104 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET
105 * is set and before current->softirq_enabled is cleared. 105 * is set and before current->softirq_enabled is cleared.
106 * We must manually increment preempt_count here and manually 106 * We must manually increment preempt_count here and manually
107 * call the trace_preempt_off later. 107 * call the trace_preempt_off later.
108 */ 108 */
109 preempt_count() += cnt; 109 __preempt_count_add(cnt);
110 /* 110 /*
111 * Were softirqs turned off above: 111 * Were softirqs turned off above:
112 */ 112 */
@@ -120,7 +120,7 @@ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
120#else /* !CONFIG_TRACE_IRQFLAGS */ 120#else /* !CONFIG_TRACE_IRQFLAGS */
121static inline void __local_bh_disable(unsigned long ip, unsigned int cnt) 121static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
122{ 122{
123 add_preempt_count(cnt); 123 preempt_count_add(cnt);
124 barrier(); 124 barrier();
125} 125}
126#endif /* CONFIG_TRACE_IRQFLAGS */ 126#endif /* CONFIG_TRACE_IRQFLAGS */
@@ -139,7 +139,7 @@ static void __local_bh_enable(unsigned int cnt)
139 139
140 if (softirq_count() == cnt) 140 if (softirq_count() == cnt)
141 trace_softirqs_on(_RET_IP_); 141 trace_softirqs_on(_RET_IP_);
142 sub_preempt_count(cnt); 142 preempt_count_sub(cnt);
143} 143}
144 144
145/* 145/*
@@ -169,12 +169,12 @@ static inline void _local_bh_enable_ip(unsigned long ip)
169 * Keep preemption disabled until we are done with 169 * Keep preemption disabled until we are done with
170 * softirq processing: 170 * softirq processing:
171 */ 171 */
172 sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1); 172 preempt_count_sub(SOFTIRQ_DISABLE_OFFSET - 1);
173 173
174 if (unlikely(!in_interrupt() && local_softirq_pending())) 174 if (unlikely(!in_interrupt() && local_softirq_pending()))
175 do_softirq(); 175 do_softirq();
176 176
177 dec_preempt_count(); 177 preempt_count_dec();
178#ifdef CONFIG_TRACE_IRQFLAGS 178#ifdef CONFIG_TRACE_IRQFLAGS
179 local_irq_enable(); 179 local_irq_enable();
180#endif 180#endif
@@ -256,7 +256,7 @@ restart:
256 " exited with %08x?\n", vec_nr, 256 " exited with %08x?\n", vec_nr,
257 softirq_to_name[vec_nr], h->action, 257 softirq_to_name[vec_nr], h->action,
258 prev_count, preempt_count()); 258 prev_count, preempt_count());
259 preempt_count() = prev_count; 259 preempt_count_set(prev_count);
260 } 260 }
261 261
262 rcu_bh_qs(cpu); 262 rcu_bh_qs(cpu);
@@ -369,7 +369,7 @@ void irq_exit(void)
369 369
370 account_irq_exit_time(current); 370 account_irq_exit_time(current);
371 trace_hardirq_exit(); 371 trace_hardirq_exit();
372 sub_preempt_count(HARDIRQ_OFFSET); 372 preempt_count_sub(HARDIRQ_OFFSET);
373 if (!in_interrupt() && local_softirq_pending()) 373 if (!in_interrupt() && local_softirq_pending())
374 invoke_softirq(); 374 invoke_softirq();
375 375
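The softirq.c hunks replace lvalue arithmetic on preempt_count() with the preempt_count_add()/preempt_count_sub()/preempt_count_set() accessors. The counter still encodes preempt, softirq and hardirq nesting in distinct bit ranges, which is why bottom halves are disabled by adding a softirq-sized offset. A toy model with made-up offsets; the real field layout lives in the kernel headers and differs.

#include <stdio.h>

#define TOY_PREEMPT_OFFSET	1		/* low bits: preempt disable depth */
#define TOY_SOFTIRQ_OFFSET	(1 << 8)	/* next byte: softirq disable depth */

static int toy_count;

static int toy_softirq_count(void)
{
	return toy_count & 0xff00;
}

static void toy_bh_disable(void)
{
	toy_count += TOY_SOFTIRQ_OFFSET;	/* like preempt_count_add(cnt) */
}

static void toy_bh_enable(void)
{
	toy_count -= TOY_SOFTIRQ_OFFSET;	/* like preempt_count_sub(cnt) */
}

int main(void)
{
	toy_bh_disable();
	printf("softirq_count=%#x\n", toy_softirq_count());
	toy_bh_enable();
	return 0;
}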
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index c09f2955ae30..c530bc5be7cf 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -115,6 +115,182 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg)
115 return done.executed ? done.ret : -ENOENT; 115 return done.executed ? done.ret : -ENOENT;
116} 116}
117 117
118/* This controls the threads on each CPU. */
119enum multi_stop_state {
120 /* Dummy starting state for thread. */
121 MULTI_STOP_NONE,
122 /* Awaiting everyone to be scheduled. */
123 MULTI_STOP_PREPARE,
124 /* Disable interrupts. */
125 MULTI_STOP_DISABLE_IRQ,
126 /* Run the function */
127 MULTI_STOP_RUN,
128 /* Exit */
129 MULTI_STOP_EXIT,
130};
131
132struct multi_stop_data {
133 int (*fn)(void *);
134 void *data;
135 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
136 unsigned int num_threads;
137 const struct cpumask *active_cpus;
138
139 enum multi_stop_state state;
140 atomic_t thread_ack;
141};
142
143static void set_state(struct multi_stop_data *msdata,
144 enum multi_stop_state newstate)
145{
146 /* Reset ack counter. */
147 atomic_set(&msdata->thread_ack, msdata->num_threads);
148 smp_wmb();
149 msdata->state = newstate;
150}
151
152/* Last one to ack a state moves to the next state. */
153static void ack_state(struct multi_stop_data *msdata)
154{
155 if (atomic_dec_and_test(&msdata->thread_ack))
156 set_state(msdata, msdata->state + 1);
157}
158
159/* This is the cpu_stop function which stops the CPU. */
160static int multi_cpu_stop(void *data)
161{
162 struct multi_stop_data *msdata = data;
163 enum multi_stop_state curstate = MULTI_STOP_NONE;
164 int cpu = smp_processor_id(), err = 0;
165 unsigned long flags;
166 bool is_active;
167
168 /*
169 * When called from stop_machine_from_inactive_cpu(), irq might
170 * already be disabled. Save the state and restore it on exit.
171 */
172 local_save_flags(flags);
173
174 if (!msdata->active_cpus)
175 is_active = cpu == cpumask_first(cpu_online_mask);
176 else
177 is_active = cpumask_test_cpu(cpu, msdata->active_cpus);
178
179 /* Simple state machine */
180 do {
181 /* Chill out and ensure we re-read multi_stop_state. */
182 cpu_relax();
183 if (msdata->state != curstate) {
184 curstate = msdata->state;
185 switch (curstate) {
186 case MULTI_STOP_DISABLE_IRQ:
187 local_irq_disable();
188 hard_irq_disable();
189 break;
190 case MULTI_STOP_RUN:
191 if (is_active)
192 err = msdata->fn(msdata->data);
193 break;
194 default:
195 break;
196 }
197 ack_state(msdata);
198 }
199 } while (curstate != MULTI_STOP_EXIT);
200
201 local_irq_restore(flags);
202 return err;
203}
204
205struct irq_cpu_stop_queue_work_info {
206 int cpu1;
207 int cpu2;
208 struct cpu_stop_work *work1;
209 struct cpu_stop_work *work2;
210};
211
212/*
213 * This function is always run with irqs and preemption disabled.
214 * This guarantees that both work1 and work2 get queued, before
215 * our local migrate thread gets the chance to preempt us.
216 */
217static void irq_cpu_stop_queue_work(void *arg)
218{
219 struct irq_cpu_stop_queue_work_info *info = arg;
220 cpu_stop_queue_work(info->cpu1, info->work1);
221 cpu_stop_queue_work(info->cpu2, info->work2);
222}
223
224/**
225 * stop_two_cpus - stops two cpus
226 * @cpu1: the cpu to stop
227 * @cpu2: the other cpu to stop
228 * @fn: function to execute
229 * @arg: argument to @fn
230 *
 231 * Stops the two specified CPUs and runs @fn on one of them.
 232 *
 233 * Returns when both have completed.
234 */
235int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void *arg)
236{
237 struct cpu_stop_done done;
238 struct cpu_stop_work work1, work2;
239 struct irq_cpu_stop_queue_work_info call_args;
240 struct multi_stop_data msdata;
241
242 preempt_disable();
243 msdata = (struct multi_stop_data){
244 .fn = fn,
245 .data = arg,
246 .num_threads = 2,
247 .active_cpus = cpumask_of(cpu1),
248 };
249
250 work1 = work2 = (struct cpu_stop_work){
251 .fn = multi_cpu_stop,
252 .arg = &msdata,
253 .done = &done
254 };
255
256 call_args = (struct irq_cpu_stop_queue_work_info){
257 .cpu1 = cpu1,
258 .cpu2 = cpu2,
259 .work1 = &work1,
260 .work2 = &work2,
261 };
262
263 cpu_stop_init_done(&done, 2);
264 set_state(&msdata, MULTI_STOP_PREPARE);
265
266 /*
267 * If we observe both CPUs active we know _cpu_down() cannot yet have
268 * queued its stop_machine works and therefore ours will get executed
 269 first. Or it's not either one of our CPUs that's getting unplugged,
270 * in which case we don't care.
271 *
272 * This relies on the stopper workqueues to be FIFO.
273 */
274 if (!cpu_active(cpu1) || !cpu_active(cpu2)) {
275 preempt_enable();
276 return -ENOENT;
277 }
278
279 /*
280 * Queuing needs to be done by the lowest numbered CPU, to ensure
281 * that works are always queued in the same order on every CPU.
282 * This prevents deadlocks.
283 */
284 smp_call_function_single(min(cpu1, cpu2),
285 &irq_cpu_stop_queue_work,
286 &call_args, 0);
287 preempt_enable();
288
289 wait_for_completion(&done.completion);
290
291 return done.executed ? done.ret : -ENOENT;
292}
293
118/** 294/**
119 * stop_one_cpu_nowait - stop a cpu but don't wait for completion 295 * stop_one_cpu_nowait - stop a cpu but don't wait for completion
120 * @cpu: cpu to stop 296 * @cpu: cpu to stop
@@ -359,98 +535,14 @@ early_initcall(cpu_stop_init);
359 535
360#ifdef CONFIG_STOP_MACHINE 536#ifdef CONFIG_STOP_MACHINE
361 537
362/* This controls the threads on each CPU. */
363enum stopmachine_state {
364 /* Dummy starting state for thread. */
365 STOPMACHINE_NONE,
366 /* Awaiting everyone to be scheduled. */
367 STOPMACHINE_PREPARE,
368 /* Disable interrupts. */
369 STOPMACHINE_DISABLE_IRQ,
370 /* Run the function */
371 STOPMACHINE_RUN,
372 /* Exit */
373 STOPMACHINE_EXIT,
374};
375
376struct stop_machine_data {
377 int (*fn)(void *);
378 void *data;
379 /* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
380 unsigned int num_threads;
381 const struct cpumask *active_cpus;
382
383 enum stopmachine_state state;
384 atomic_t thread_ack;
385};
386
387static void set_state(struct stop_machine_data *smdata,
388 enum stopmachine_state newstate)
389{
390 /* Reset ack counter. */
391 atomic_set(&smdata->thread_ack, smdata->num_threads);
392 smp_wmb();
393 smdata->state = newstate;
394}
395
396/* Last one to ack a state moves to the next state. */
397static void ack_state(struct stop_machine_data *smdata)
398{
399 if (atomic_dec_and_test(&smdata->thread_ack))
400 set_state(smdata, smdata->state + 1);
401}
402
403/* This is the cpu_stop function which stops the CPU. */
404static int stop_machine_cpu_stop(void *data)
405{
406 struct stop_machine_data *smdata = data;
407 enum stopmachine_state curstate = STOPMACHINE_NONE;
408 int cpu = smp_processor_id(), err = 0;
409 unsigned long flags;
410 bool is_active;
411
412 /*
413 * When called from stop_machine_from_inactive_cpu(), irq might
414 * already be disabled. Save the state and restore it on exit.
415 */
416 local_save_flags(flags);
417
418 if (!smdata->active_cpus)
419 is_active = cpu == cpumask_first(cpu_online_mask);
420 else
421 is_active = cpumask_test_cpu(cpu, smdata->active_cpus);
422
423 /* Simple state machine */
424 do {
425 /* Chill out and ensure we re-read stopmachine_state. */
426 cpu_relax();
427 if (smdata->state != curstate) {
428 curstate = smdata->state;
429 switch (curstate) {
430 case STOPMACHINE_DISABLE_IRQ:
431 local_irq_disable();
432 hard_irq_disable();
433 break;
434 case STOPMACHINE_RUN:
435 if (is_active)
436 err = smdata->fn(smdata->data);
437 break;
438 default:
439 break;
440 }
441 ack_state(smdata);
442 }
443 } while (curstate != STOPMACHINE_EXIT);
444
445 local_irq_restore(flags);
446 return err;
447}
448
449int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 538int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
450{ 539{
451 struct stop_machine_data smdata = { .fn = fn, .data = data, 540 struct multi_stop_data msdata = {
452 .num_threads = num_online_cpus(), 541 .fn = fn,
453 .active_cpus = cpus }; 542 .data = data,
543 .num_threads = num_online_cpus(),
544 .active_cpus = cpus,
545 };
454 546
455 if (!stop_machine_initialized) { 547 if (!stop_machine_initialized) {
456 /* 548 /*
@@ -461,7 +553,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
461 unsigned long flags; 553 unsigned long flags;
462 int ret; 554 int ret;
463 555
464 WARN_ON_ONCE(smdata.num_threads != 1); 556 WARN_ON_ONCE(msdata.num_threads != 1);
465 557
466 local_irq_save(flags); 558 local_irq_save(flags);
467 hard_irq_disable(); 559 hard_irq_disable();
@@ -472,8 +564,8 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
472 } 564 }
473 565
474 /* Set the initial state and stop all online cpus. */ 566 /* Set the initial state and stop all online cpus. */
475 set_state(&smdata, STOPMACHINE_PREPARE); 567 set_state(&msdata, MULTI_STOP_PREPARE);
476 return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata); 568 return stop_cpus(cpu_online_mask, multi_cpu_stop, &msdata);
477} 569}
478 570
479int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus) 571int stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
@@ -513,25 +605,25 @@ EXPORT_SYMBOL_GPL(stop_machine);
513int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data, 605int stop_machine_from_inactive_cpu(int (*fn)(void *), void *data,
514 const struct cpumask *cpus) 606 const struct cpumask *cpus)
515{ 607{
516 struct stop_machine_data smdata = { .fn = fn, .data = data, 608 struct multi_stop_data msdata = { .fn = fn, .data = data,
517 .active_cpus = cpus }; 609 .active_cpus = cpus };
518 struct cpu_stop_done done; 610 struct cpu_stop_done done;
519 int ret; 611 int ret;
520 612
521 /* Local CPU must be inactive and CPU hotplug in progress. */ 613 /* Local CPU must be inactive and CPU hotplug in progress. */
522 BUG_ON(cpu_active(raw_smp_processor_id())); 614 BUG_ON(cpu_active(raw_smp_processor_id()));
523 smdata.num_threads = num_active_cpus() + 1; /* +1 for local */ 615 msdata.num_threads = num_active_cpus() + 1; /* +1 for local */
524 616
525 /* No proper task established and can't sleep - busy wait for lock. */ 617 /* No proper task established and can't sleep - busy wait for lock. */
526 while (!mutex_trylock(&stop_cpus_mutex)) 618 while (!mutex_trylock(&stop_cpus_mutex))
527 cpu_relax(); 619 cpu_relax();
528 620
529 /* Schedule work on other CPUs and execute directly for local CPU */ 621 /* Schedule work on other CPUs and execute directly for local CPU */
530 set_state(&smdata, STOPMACHINE_PREPARE); 622 set_state(&msdata, MULTI_STOP_PREPARE);
531 cpu_stop_init_done(&done, num_active_cpus()); 623 cpu_stop_init_done(&done, num_active_cpus());
532 queue_stop_cpus_work(cpu_active_mask, stop_machine_cpu_stop, &smdata, 624 queue_stop_cpus_work(cpu_active_mask, multi_cpu_stop, &msdata,
533 &done); 625 &done);
534 ret = stop_machine_cpu_stop(&smdata); 626 ret = multi_cpu_stop(&msdata);
535 627
536 /* Busy wait for completion. */ 628 /* Busy wait for completion. */
537 while (!completion_done(&done.completion)) 629 while (!completion_done(&done.completion))
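stop_two_cpus() reuses the old stop_machine state machine (now multi_cpu_stop()) for exactly two CPUs: both spin through MULTI_STOP_PREPARE/DISABLE_IRQ/RUN/EXIT, and only the CPU in active_cpus executes @fn with interrupts off. A hedged usage sketch, assuming the declaration is exported via <linux/stop_machine.h>; swap_cpu_state() and its argument struct are hypothetical.

#include <linux/stop_machine.h>

struct swap_args {
	int src_cpu;
	int dst_cpu;
};

/* runs on cpu1 (the active CPU) while both CPUs sit in multi_cpu_stop()
 * with interrupts disabled */
static int do_swap(void *data)
{
	struct swap_args *args = data;

	/* ... exchange per-CPU state between args->src_cpu and args->dst_cpu ... */
	(void)args;
	return 0;
}

static int swap_cpu_state(int cpu1, int cpu2)
{
	struct swap_args args = { .src_cpu = cpu1, .dst_cpu = cpu2 };

	/* returns -ENOENT if either CPU is going away (the !cpu_active() check) */
	return stop_two_cpus(cpu1, cpu2, do_swap, &args);
}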
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b2f06f3c6a3f..a159e1fd2013 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -371,13 +371,6 @@ static struct ctl_table kern_table[] = {
371 .proc_handler = proc_dointvec, 371 .proc_handler = proc_dointvec,
372 }, 372 },
373 { 373 {
374 .procname = "numa_balancing_scan_period_reset",
375 .data = &sysctl_numa_balancing_scan_period_reset,
376 .maxlen = sizeof(unsigned int),
377 .mode = 0644,
378 .proc_handler = proc_dointvec,
379 },
380 {
381 .procname = "numa_balancing_scan_period_max_ms", 374 .procname = "numa_balancing_scan_period_max_ms",
382 .data = &sysctl_numa_balancing_scan_period_max, 375 .data = &sysctl_numa_balancing_scan_period_max,
383 .maxlen = sizeof(unsigned int), 376 .maxlen = sizeof(unsigned int),
@@ -391,6 +384,20 @@ static struct ctl_table kern_table[] = {
391 .mode = 0644, 384 .mode = 0644,
392 .proc_handler = proc_dointvec, 385 .proc_handler = proc_dointvec,
393 }, 386 },
387 {
388 .procname = "numa_balancing_settle_count",
389 .data = &sysctl_numa_balancing_settle_count,
390 .maxlen = sizeof(unsigned int),
391 .mode = 0644,
392 .proc_handler = proc_dointvec,
393 },
394 {
395 .procname = "numa_balancing_migrate_deferred",
396 .data = &sysctl_numa_balancing_migrate_deferred,
397 .maxlen = sizeof(unsigned int),
398 .mode = 0644,
399 .proc_handler = proc_dointvec,
400 },
394#endif /* CONFIG_NUMA_BALANCING */ 401#endif /* CONFIG_NUMA_BALANCING */
395#endif /* CONFIG_SCHED_DEBUG */ 402#endif /* CONFIG_SCHED_DEBUG */
396 { 403 {
diff --git a/kernel/timer.c b/kernel/timer.c
index 4296d13db3d1..6582b82fa966 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1092,7 +1092,7 @@ static int cascade(struct tvec_base *base, struct tvec *tv, int index)
1092static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long), 1092static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1093 unsigned long data) 1093 unsigned long data)
1094{ 1094{
1095 int preempt_count = preempt_count(); 1095 int count = preempt_count();
1096 1096
1097#ifdef CONFIG_LOCKDEP 1097#ifdef CONFIG_LOCKDEP
1098 /* 1098 /*
@@ -1119,16 +1119,16 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
1119 1119
1120 lock_map_release(&lockdep_map); 1120 lock_map_release(&lockdep_map);
1121 1121
1122 if (preempt_count != preempt_count()) { 1122 if (count != preempt_count()) {
1123 WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n", 1123 WARN_ONCE(1, "timer: %pF preempt leak: %08x -> %08x\n",
1124 fn, preempt_count, preempt_count()); 1124 fn, count, preempt_count());
1125 /* 1125 /*
1126 * Restore the preempt count. That gives us a decent 1126 * Restore the preempt count. That gives us a decent
1127 * chance to survive and extract information. If the 1127 * chance to survive and extract information. If the
1128 * callback kept a lock held, bad luck, but not worse 1128 * callback kept a lock held, bad luck, but not worse
1129 * than the BUG() we had. 1129 * than the BUG() we had.
1130 */ 1130 */
1131 preempt_count() = preempt_count; 1131 preempt_count_set(count);
1132 } 1132 }
1133} 1133}
1134 1134
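The timer.c hunk renames the shadowing local (preempt_count becomes count) and uses preempt_count_set() to repair the imbalance when a timer callback returns with a different preempt count than it entered with. The detection pattern itself, reduced to a userspace toy:

#include <stdio.h>

static int toy_count;	/* stand-in for preempt_count() */

static void leaky_callback(void *arg)
{
	(void)arg;
	toy_count++;	/* simulate a callback that forgets to re-enable */
}

static void run_callback_checked(void (*fn)(void *), void *arg)
{
	int count = toy_count;		/* snapshot before the callback */

	fn(arg);

	if (count != toy_count) {
		fprintf(stderr, "callback leaked count: %d -> %d\n",
			count, toy_count);
		toy_count = count;	/* restore so we can limp on */
	}
}

int main(void)
{
	run_callback_checked(leaky_callback, NULL);
	return 0;
}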
diff --git a/kernel/wait.c b/kernel/wait.c
index d550920e040c..de21c6305a44 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -92,6 +92,30 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
92} 92}
93EXPORT_SYMBOL(prepare_to_wait_exclusive); 93EXPORT_SYMBOL(prepare_to_wait_exclusive);
94 94
95long prepare_to_wait_event(wait_queue_head_t *q, wait_queue_t *wait, int state)
96{
97 unsigned long flags;
98
99 if (signal_pending_state(state, current))
100 return -ERESTARTSYS;
101
102 wait->private = current;
103 wait->func = autoremove_wake_function;
104
105 spin_lock_irqsave(&q->lock, flags);
106 if (list_empty(&wait->task_list)) {
107 if (wait->flags & WQ_FLAG_EXCLUSIVE)
108 __add_wait_queue_tail(q, wait);
109 else
110 __add_wait_queue(q, wait);
111 }
112 set_current_state(state);
113 spin_unlock_irqrestore(&q->lock, flags);
114
115 return 0;
116}
117EXPORT_SYMBOL(prepare_to_wait_event);
118
95/** 119/**
96 * finish_wait - clean up after waiting in a queue 120 * finish_wait - clean up after waiting in a queue
97 * @q: waitqueue waited on 121 * @q: waitqueue waited on
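prepare_to_wait_event() folds the signal check, queueing and wait-state setup of a wait_event()-style loop into one helper that returns -ERESTARTSYS when a signal is already pending. A sketch of the loop it is presumably meant to back (kernel context; demo_wq, demo_condition and wait_for_demo_condition() are hypothetical names):

#include <linux/sched.h>
#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
static int demo_condition;

static long wait_for_demo_condition(void)
{
	DEFINE_WAIT(wait);
	long ret = 0;

	for (;;) {
		ret = prepare_to_wait_event(&demo_wq, &wait, TASK_INTERRUPTIBLE);
		if (demo_condition)
			break;
		if (ret)	/* -ERESTARTSYS: a signal is pending */
			break;
		schedule();
	}
	finish_wait(&demo_wq, &wait);
	return demo_condition ? 0 : ret;
}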