-rw-r--r--  Documentation/sysctl/kernel.txt            |  10
-rw-r--r--  arch/arm/kernel/process.c                  |  16
-rw-r--r--  arch/powerpc/platforms/cell/spufs/sched.c  |   1
-rw-r--r--  arch/powerpc/platforms/pseries/setup.c     |  34
-rw-r--r--  arch/sh/kernel/idle.c                      |   4
-rw-r--r--  arch/x86/kernel/process.c                  |   5
-rw-r--r--  drivers/cpuidle/cpuidle-pseries.c          |   6
-rw-r--r--  include/linux/sched.h                      |  41
-rw-r--r--  include/linux/sched/prio.h                 |  40
-rw-r--r--  include/linux/sched/rt.h                   |  19
-rw-r--r--  kernel/Makefile                            |   1
-rw-r--r--  kernel/cpu/Makefile                        |   1
-rw-r--r--  kernel/cpu/idle.c                          |   7
-rw-r--r--  kernel/sched/Makefile                      |   2
-rw-r--r--  kernel/sched/core.c                        |  79
-rw-r--r--  kernel/sched/cputime.c                     |   4
-rw-r--r--  kernel/sched/deadline.c                    |  22
-rw-r--r--  kernel/sched/debug.c                       |   7
-rw-r--r--  kernel/sched/fair.c                        | 503
-rw-r--r--  kernel/sched/idle.c                        | 144
-rw-r--r--  kernel/sched/idle_task.c                   |  27
-rw-r--r--  kernel/sched/rt.c                          |  43
-rw-r--r--  kernel/sched/sched.h                       |  29
-rw-r--r--  kernel/sched/stop_task.c                   |  16
-rw-r--r--  kernel/sysctl.c                            |   7
-rw-r--r--  mm/mempolicy.c                             |  74
26 files changed, 746 insertions(+), 396 deletions(-)
diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt
index e55124e7c40c..04bf16ad8561 100644
--- a/Documentation/sysctl/kernel.txt
+++ b/Documentation/sysctl/kernel.txt
@@ -441,8 +441,7 @@ feature should be disabled. Otherwise, if the system overhead from the
 feature is too high then the rate the kernel samples for NUMA hinting
 faults may be controlled by the numa_balancing_scan_period_min_ms,
 numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms,
-numa_balancing_scan_size_mb, numa_balancing_settle_count sysctls and
-numa_balancing_migrate_deferred.
+numa_balancing_scan_size_mb, and numa_balancing_settle_count sysctls.
 
 ==============================================================
 
@@ -483,13 +482,6 @@ rate for each task.
 numa_balancing_scan_size_mb is how many megabytes worth of pages are
 scanned for a given scan.
 
-numa_balancing_migrate_deferred is how many page migrations get skipped
-unconditionally, after a page migration is skipped because a page is shared
-with other tasks. This reduces page migration overhead, and determines
-how much stronger the "move task near its memory" policy scheduler becomes,
-versus the "move memory near its task" memory management policy, for workloads
-with shared memory.
-
 ==============================================================
 
 osrelease, ostype & version:
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 92f7b15dd221..adabeababeb0 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -30,7 +30,6 @@
 #include <linux/uaccess.h>
 #include <linux/random.h>
 #include <linux/hw_breakpoint.h>
-#include <linux/cpuidle.h>
 #include <linux/leds.h>
 #include <linux/reboot.h>
 
@@ -133,7 +132,11 @@ EXPORT_SYMBOL_GPL(arm_pm_restart);
 
 void (*arm_pm_idle)(void);
 
-static void default_idle(void)
+/*
+ * Called from the core idle loop.
+ */
+
+void arch_cpu_idle(void)
 {
 	if (arm_pm_idle)
 		arm_pm_idle();
@@ -168,15 +171,6 @@ void arch_cpu_idle_dead(void)
 #endif
 
 /*
- * Called from the core idle loop.
- */
-void arch_cpu_idle(void)
-{
-	if (cpuidle_idle_call())
-		default_idle();
-}
-
-/*
  * Called by kexec, immediately prior to machine_kexec().
  *
  * This must completely disable all secondary CPUs; simply causing those CPUs
diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 49318385d4fa..4a0a64fe25df 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -83,7 +83,6 @@ static struct timer_list spuloadavg_timer;
 #define MIN_SPU_TIMESLICE	max(5 * HZ / (1000 * SPUSCHED_TICK), 1)
 #define DEF_SPU_TIMESLICE	(100 * HZ / (1000 * SPUSCHED_TICK))
 
-#define MAX_USER_PRIO		(MAX_PRIO - MAX_RT_PRIO)
 #define SCALE_PRIO(x, prio) \
 	max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_SPU_TIMESLICE)
 
diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c
index 972df0ffd4dc..2db8cc691bf4 100644
--- a/arch/powerpc/platforms/pseries/setup.c
+++ b/arch/powerpc/platforms/pseries/setup.c
@@ -39,7 +39,6 @@
 #include <linux/irq.h>
 #include <linux/seq_file.h>
 #include <linux/root_dev.h>
-#include <linux/cpuidle.h>
 #include <linux/of.h>
 #include <linux/kexec.h>
 
@@ -356,29 +355,24 @@ early_initcall(alloc_dispatch_log_kmem_cache);
 
 static void pseries_lpar_idle(void)
 {
-	/* This would call on the cpuidle framework, and the back-end pseries
-	 * driver to go to idle states
+	/*
+	 * Default handler to go into low thread priority and possibly
+	 * low power mode by cedeing processor to hypervisor
 	 */
-	if (cpuidle_idle_call()) {
-		/* On error, execute default handler
-		 * to go into low thread priority and possibly
-		 * low power mode by cedeing processor to hypervisor
-		 */
 
 	/* Indicate to hypervisor that we are idle. */
 	get_lppaca()->idle = 1;
 
 	/*
 	 * Yield the processor to the hypervisor.  We return if
 	 * an external interrupt occurs (which are driven prior
	 * to returning here) or if a prod occurs from another
 	 * processor. When returning here, external interrupts
 	 * are enabled.
 	 */
 	cede_processor();
 
 	get_lppaca()->idle = 0;
-	}
 }
 
 /*
diff --git a/arch/sh/kernel/idle.c b/arch/sh/kernel/idle.c
index 2ea4483fd722..be616ee0cf87 100644
--- a/arch/sh/kernel/idle.c
+++ b/arch/sh/kernel/idle.c
@@ -16,7 +16,6 @@
 #include <linux/thread_info.h>
 #include <linux/irqflags.h>
 #include <linux/smp.h>
-#include <linux/cpuidle.h>
 #include <linux/atomic.h>
 #include <asm/pgalloc.h>
 #include <asm/smp.h>
@@ -40,8 +39,7 @@ void arch_cpu_idle_dead(void)
 
 void arch_cpu_idle(void)
 {
-	if (cpuidle_idle_call())
-		sh_idle();
+	sh_idle();
 }
 
 void __init select_idle_routine(void)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 3fb8d95ab8b5..4505e2a950d8 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -298,10 +298,7 @@ void arch_cpu_idle_dead(void)
  */
 void arch_cpu_idle(void)
 {
-	if (cpuidle_idle_call())
-		x86_idle();
-	else
-		local_irq_enable();
+	x86_idle();
 }
 
 /*
diff --git a/drivers/cpuidle/cpuidle-pseries.c b/drivers/cpuidle/cpuidle-pseries.c
index 7ab564aa0b1c..6f7b01956885 100644
--- a/drivers/cpuidle/cpuidle-pseries.c
+++ b/drivers/cpuidle/cpuidle-pseries.c
@@ -17,6 +17,7 @@
 #include <asm/reg.h>
 #include <asm/machdep.h>
 #include <asm/firmware.h>
+#include <asm/runlatch.h>
 #include <asm/plpar_wrappers.h>
 
 struct cpuidle_driver pseries_idle_driver = {
@@ -29,6 +30,7 @@ static struct cpuidle_state *cpuidle_state_table;
 
 static inline void idle_loop_prolog(unsigned long *in_purr)
 {
+	ppc64_runlatch_off();
 	*in_purr = mfspr(SPRN_PURR);
 	/*
 	 * Indicate to the HV that we are idle. Now would be
@@ -45,6 +47,10 @@ static inline void idle_loop_epilog(unsigned long in_purr)
 	wait_cycles += mfspr(SPRN_PURR) - in_purr;
 	get_lppaca()->wait_state_cycles = cpu_to_be64(wait_cycles);
 	get_lppaca()->idle = 0;
+
+	if (irqs_disabled())
+		local_irq_enable();
+	ppc64_runlatch_on();
 }
 
 static int snooze_loop(struct cpuidle_device *dev,
diff --git a/include/linux/sched.h b/include/linux/sched.h
index a781dec1cd0b..c49a2585ff7d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3,6 +3,8 @@
 
 #include <uapi/linux/sched.h>
 
+#include <linux/sched/prio.h>
+
 
 struct sched_param {
 	int sched_priority;
@@ -1077,6 +1079,7 @@ struct sched_entity {
 #endif
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
+	int depth;
 	struct sched_entity *parent;
 	/* rq on which this entity is (to be) queued: */
 	struct cfs_rq *cfs_rq;
@@ -1470,9 +1473,10 @@ struct task_struct {
 	unsigned int numa_scan_period;
 	unsigned int numa_scan_period_max;
 	int numa_preferred_nid;
-	int numa_migrate_deferred;
 	unsigned long numa_migrate_retry;
 	u64 node_stamp;			/* migration stamp */
+	u64 last_task_numa_placement;
+	u64 last_sum_exec_runtime;
 	struct callback_head numa_work;
 
 	struct list_head numa_entry;
@@ -1483,15 +1487,22 @@ struct task_struct {
 	 * Scheduling placement decisions are made based on the these counts.
 	 * The values remain static for the duration of a PTE scan
 	 */
-	unsigned long *numa_faults;
+	unsigned long *numa_faults_memory;
 	unsigned long total_numa_faults;
 
 	/*
 	 * numa_faults_buffer records faults per node during the current
-	 * scan window. When the scan completes, the counts in numa_faults
-	 * decay and these values are copied.
+	 * scan window. When the scan completes, the counts in
+	 * numa_faults_memory decay and these values are copied.
+	 */
+	unsigned long *numa_faults_buffer_memory;
+
+	/*
+	 * Track the nodes the process was running on when a NUMA hinting
+	 * fault was incurred.
 	 */
-	unsigned long *numa_faults_buffer;
+	unsigned long *numa_faults_cpu;
+	unsigned long *numa_faults_buffer_cpu;
 
 	/*
	 * numa_faults_locality tracks if faults recorded during the last
@@ -1596,8 +1607,8 @@ extern void task_numa_fault(int last_node, int node, int pages, int flags);
 extern pid_t task_numa_group_id(struct task_struct *p);
 extern void set_numabalancing_state(bool enabled);
 extern void task_numa_free(struct task_struct *p);
-
-extern unsigned int sysctl_numa_balancing_migrate_deferred;
+extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
+					int src_nid, int dst_cpu);
 #else
 static inline void task_numa_fault(int last_node, int node, int pages,
 				   int flags)
@@ -1613,6 +1624,11 @@ static inline void set_numabalancing_state(bool enabled)
 static inline void task_numa_free(struct task_struct *p)
 {
 }
+static inline bool should_numa_migrate_memory(struct task_struct *p,
+				struct page *page, int src_nid, int dst_cpu)
+{
+	return true;
+}
 #endif
 
 static inline struct pid *task_pid(struct task_struct *task)
@@ -2080,7 +2096,16 @@ static inline void sched_autogroup_exit(struct signal_struct *sig) { }
 extern bool yield_to(struct task_struct *p, bool preempt);
 extern void set_user_nice(struct task_struct *p, long nice);
 extern int task_prio(const struct task_struct *p);
-extern int task_nice(const struct task_struct *p);
+/**
+ * task_nice - return the nice value of a given task.
+ * @p: the task in question.
+ *
+ * Return: The nice value [ -20 ... 0 ... 19 ].
+ */
+static inline int task_nice(const struct task_struct *p)
+{
+	return PRIO_TO_NICE((p)->static_prio);
+}
 extern int can_nice(const struct task_struct *p, const int nice);
 extern int task_curr(const struct task_struct *p);
 extern int idle_cpu(int cpu);
diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h
new file mode 100644
index 000000000000..410ccb74c9e6
--- /dev/null
+++ b/include/linux/sched/prio.h
@@ -0,0 +1,40 @@
+#ifndef _SCHED_PRIO_H
+#define _SCHED_PRIO_H
+
+/*
+ * Priority of a process goes from 0..MAX_PRIO-1, valid RT
+ * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
+ * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
+ * values are inverted: lower p->prio value means higher priority.
+ *
+ * The MAX_USER_RT_PRIO value allows the actual maximum
+ * RT priority to be separate from the value exported to
+ * user-space.  This allows kernel threads to set their
+ * priority to a value higher than any user task. Note:
+ * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
+ */
+
+#define MAX_USER_RT_PRIO	100
+#define MAX_RT_PRIO		MAX_USER_RT_PRIO
+
+#define MAX_PRIO		(MAX_RT_PRIO + 40)
+#define DEFAULT_PRIO		(MAX_RT_PRIO + 20)
+
+/*
+ * Convert user-nice values [ -20 ... 0 ... 19 ]
+ * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
+ * and back.
+ */
+#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
+#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
+
+/*
+ * 'User priority' is the nice value converted to something we
+ * can work with better when scaling various scheduler parameters,
+ * it's a [ 0 ... 39 ] range.
+ */
+#define USER_PRIO(p)		((p)-MAX_RT_PRIO)
+#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
+#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))
+
+#endif /* _SCHED_PRIO_H */
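
As a quick illustration of the conversion macros introduced by the new header above (a minimal standalone sketch, not part of the patch; it copies the relevant definitions so it can be compiled on its own):

#include <stdio.h>

/* Copied from the new <linux/sched/prio.h> above, for a standalone demo. */
#define MAX_USER_RT_PRIO	100
#define MAX_RT_PRIO		MAX_USER_RT_PRIO
#define MAX_PRIO		(MAX_RT_PRIO + 40)
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
#define USER_PRIO(p)		((p)-MAX_RT_PRIO)
#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))

int main(void)
{
	/* nice -20..19 maps onto static priorities 100..139; nice 0 -> 120. */
	printf("NICE_TO_PRIO(-20) = %d\n", NICE_TO_PRIO(-20));	/* 100 */
	printf("NICE_TO_PRIO(0)   = %d\n", NICE_TO_PRIO(0));	/* 120 */
	printf("NICE_TO_PRIO(19)  = %d\n", NICE_TO_PRIO(19));	/* 139 */
	/* ...and back again. */
	printf("PRIO_TO_NICE(139) = %d\n", PRIO_TO_NICE(139));	/* 19 */
	/* MAX_USER_PRIO is the width of the nice range: 40. */
	printf("MAX_USER_PRIO     = %d\n", MAX_USER_PRIO);	/* 40 */
	return 0;
}
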
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index 34e4ebea8fce..f7453d4c5613 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -1,24 +1,7 @@
 #ifndef _SCHED_RT_H
 #define _SCHED_RT_H
 
-/*
- * Priority of a process goes from 0..MAX_PRIO-1, valid RT
- * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
- * tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
- * values are inverted: lower p->prio value means higher priority.
- *
- * The MAX_USER_RT_PRIO value allows the actual maximum
- * RT priority to be separate from the value exported to
- * user-space.  This allows kernel threads to set their
- * priority to a value higher than any user task. Note:
- * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
- */
-
-#define MAX_USER_RT_PRIO	100
-#define MAX_RT_PRIO		MAX_USER_RT_PRIO
-
-#define MAX_PRIO		(MAX_RT_PRIO + 40)
-#define DEFAULT_PRIO		(MAX_RT_PRIO + 20)
+#include <linux/sched/prio.h>
 
 static inline int rt_prio(int prio)
 {
diff --git a/kernel/Makefile b/kernel/Makefile
index bc010ee272b6..6f1c7e5cfca1 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -22,7 +22,6 @@ obj-y += sched/
 obj-y += locking/
 obj-y += power/
 obj-y += printk/
-obj-y += cpu/
 obj-y += irq/
 obj-y += rcu/
 
diff --git a/kernel/cpu/Makefile b/kernel/cpu/Makefile
deleted file mode 100644
index 59ab052ef7a0..000000000000
--- a/kernel/cpu/Makefile
+++ /dev/null
@@ -1 +0,0 @@
-obj-y = idle.o
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index 277f494c2a9a..b7976a127178 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -3,6 +3,7 @@
  */
 #include <linux/sched.h>
 #include <linux/cpu.h>
+#include <linux/cpuidle.h>
 #include <linux/tick.h>
 #include <linux/mm.h>
 #include <linux/stackprotector.h>
@@ -95,8 +96,10 @@ static void cpu_idle_loop(void)
 		if (!current_clr_polling_and_test()) {
 			stop_critical_timings();
 			rcu_idle_enter();
-			arch_cpu_idle();
-			WARN_ON_ONCE(irqs_disabled());
+			if (cpuidle_idle_call())
+				arch_cpu_idle();
+			if (WARN_ON_ONCE(irqs_disabled()))
+				local_irq_enable();
 			rcu_idle_exit();
 			start_critical_timings();
 		} else {
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 9a95c8c2af2a..ab32b7b0db5c 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -13,7 +13,7 @@ endif
 
 obj-y += core.o proc.o clock.o cputime.o
 obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
-obj-y += wait.o completion.o
+obj-y += wait.o completion.o idle.o
 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b46131ef6aab..fb9764fbc537 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1745,8 +1745,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
 	p->numa_scan_period = sysctl_numa_balancing_scan_delay;
 	p->numa_work.next = &p->numa_work;
-	p->numa_faults = NULL;
-	p->numa_faults_buffer = NULL;
+	p->numa_faults_memory = NULL;
+	p->numa_faults_buffer_memory = NULL;
+	p->last_task_numa_placement = 0;
+	p->last_sum_exec_runtime = 0;
 
 	INIT_LIST_HEAD(&p->numa_entry);
 	p->numa_group = NULL;
@@ -2167,13 +2169,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 
 #ifdef CONFIG_SMP
 
-/* assumes rq->lock is held */
-static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
-{
-	if (prev->sched_class->pre_schedule)
-		prev->sched_class->pre_schedule(rq, prev);
-}
-
 /* rq->lock is NOT held, but preemption is disabled */
 static inline void post_schedule(struct rq *rq)
 {
@@ -2191,10 +2186,6 @@ static inline void post_schedule(struct rq *rq)
 
 #else
 
-static inline void pre_schedule(struct rq *rq, struct task_struct *p)
-{
-}
-
 static inline void post_schedule(struct rq *rq)
 {
 }
@@ -2577,18 +2568,11 @@ static inline void schedule_debug(struct task_struct *prev)
 	schedstat_inc(this_rq(), sched_count);
 }
 
-static void put_prev_task(struct rq *rq, struct task_struct *prev)
-{
-	if (prev->on_rq || rq->skip_clock_update < 0)
-		update_rq_clock(rq);
-	prev->sched_class->put_prev_task(rq, prev);
-}
-
 /*
 * Pick up the highest-prio task:
 */
 static inline struct task_struct *
-pick_next_task(struct rq *rq)
+pick_next_task(struct rq *rq, struct task_struct *prev)
 {
 	const struct sched_class *class;
 	struct task_struct *p;
@@ -2597,14 +2581,15 @@ pick_next_task(struct rq *rq)
 	 * Optimization: we know that if all tasks are in
 	 * the fair class we can call that function directly:
 	 */
-	if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
-		p = fair_sched_class.pick_next_task(rq);
+	if (likely(prev->sched_class == &fair_sched_class &&
+		   rq->nr_running == rq->cfs.h_nr_running)) {
+		p = fair_sched_class.pick_next_task(rq, prev);
 		if (likely(p))
 			return p;
 	}
 
 	for_each_class(class) {
-		p = class->pick_next_task(rq);
+		p = class->pick_next_task(rq, prev);
 		if (p)
 			return p;
 	}
@@ -2700,13 +2685,10 @@ need_resched:
 		switch_count = &prev->nvcsw;
 	}
 
-	pre_schedule(rq, prev);
-
-	if (unlikely(!rq->nr_running))
-		idle_balance(cpu, rq);
+	if (prev->on_rq || rq->skip_clock_update < 0)
+		update_rq_clock(rq);
 
-	put_prev_task(rq, prev);
-	next = pick_next_task(rq);
+	next = pick_next_task(rq, prev);
 	clear_tsk_need_resched(prev);
 	clear_preempt_need_resched();
 	rq->skip_clock_update = 0;
@@ -2998,7 +2980,7 @@ void set_user_nice(struct task_struct *p, long nice)
 	unsigned long flags;
 	struct rq *rq;
 
-	if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
+	if (task_nice(p) == nice || nice < -20 || nice > 19)
 		return;
 	/*
 	 * We have to be careful, if called from sys_setpriority(),
@@ -3076,7 +3058,7 @@ SYSCALL_DEFINE1(nice, int, increment)
 	if (increment > 40)
 		increment = 40;
 
-	nice = TASK_NICE(current) + increment;
+	nice = task_nice(current) + increment;
 	if (nice < -20)
 		nice = -20;
 	if (nice > 19)
@@ -3109,18 +3091,6 @@ int task_prio(const struct task_struct *p)
 }
 
 /**
- * task_nice - return the nice value of a given task.
- * @p: the task in question.
- *
- * Return: The nice value [ -20 ... 0 ... 19 ].
- */
-int task_nice(const struct task_struct *p)
-{
-	return TASK_NICE(p);
-}
-EXPORT_SYMBOL(task_nice);
-
-/**
  * idle_cpu - is a given cpu idle currently?
  * @cpu: the processor in question.
 *
@@ -3319,7 +3289,7 @@ recheck:
 	 */
 	if (user && !capable(CAP_SYS_NICE)) {
 		if (fair_policy(policy)) {
-			if (attr->sched_nice < TASK_NICE(p) &&
+			if (attr->sched_nice < task_nice(p) &&
 			    !can_nice(p, attr->sched_nice))
 				return -EPERM;
 		}
@@ -3343,7 +3313,7 @@ recheck:
 		 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
 		 */
 		if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
-			if (!can_nice(p, TASK_NICE(p)))
+			if (!can_nice(p, task_nice(p)))
 				return -EPERM;
 		}
 
@@ -3383,7 +3353,7 @@ recheck:
 	 * If not changing anything there's no need to proceed further:
 	 */
 	if (unlikely(policy == p->policy)) {
-		if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p))
+		if (fair_policy(policy) && attr->sched_nice != task_nice(p))
 			goto change;
 		if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
 			goto change;
@@ -3835,7 +3805,7 @@ SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
 	else if (task_has_rt_policy(p))
 		attr.sched_priority = p->rt_priority;
 	else
-		attr.sched_nice = TASK_NICE(p);
+		attr.sched_nice = task_nice(p);
 
 	rcu_read_unlock();
 
@@ -4751,7 +4721,7 @@ static void migrate_tasks(unsigned int dead_cpu)
 		if (rq->nr_running == 1)
 			break;
 
-		next = pick_next_task(rq);
+		next = pick_next_task(rq, NULL);
 		BUG_ON(!next);
 		next->sched_class->put_prev_task(rq, next);
 
@@ -4841,7 +4811,7 @@ set_table_entry(struct ctl_table *entry,
 static struct ctl_table *
 sd_alloc_ctl_domain_table(struct sched_domain *sd)
 {
-	struct ctl_table *table = sd_alloc_ctl_entry(13);
+	struct ctl_table *table = sd_alloc_ctl_entry(14);
 
 	if (table == NULL)
 		return NULL;
@@ -4869,9 +4839,12 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
 		sizeof(int), 0644, proc_dointvec_minmax, false);
 	set_table_entry(&table[10], "flags", &sd->flags,
 		sizeof(int), 0644, proc_dointvec_minmax, false);
-	set_table_entry(&table[11], "name", sd->name,
+	set_table_entry(&table[11], "max_newidle_lb_cost",
+		&sd->max_newidle_lb_cost,
+		sizeof(long), 0644, proc_doulongvec_minmax, false);
+	set_table_entry(&table[12], "name", sd->name,
 		CORENAME_MAX_SIZE, 0444, proc_dostring, false);
-	/* &table[12] is terminator */
+	/* &table[13] is terminator */
 
 	return table;
 }
@@ -7008,7 +6981,7 @@ void normalize_rt_tasks(void)
 			 * Renice negative nice level userspace
 			 * tasks back to 0:
 			 */
-			if (TASK_NICE(p) < 0 && p->mm)
+			if (task_nice(p) < 0 && p->mm)
 				set_user_nice(p, 0);
 			continue;
 		}
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 99947919e30b..58624a65f124 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -142,7 +142,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
 	p->utimescaled += cputime_scaled;
 	account_group_user_time(p, cputime);
 
-	index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
+	index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
 
 	/* Add user time to cpustat. */
 	task_group_account_field(p, index, (__force u64) cputime);
@@ -169,7 +169,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
 	p->gtime += cputime;
 
 	/* Add guest time to cpustat. */
-	if (TASK_NICE(p) > 0) {
+	if (task_nice(p) > 0) {
 		cpustat[CPUTIME_NICE] += (__force u64) cputime;
 		cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
 	} else {
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 0dd5e0971a07..ed31ef66ab9d 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -944,6 +944,8 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
 		resched_task(rq->curr);
 }
 
+static int pull_dl_task(struct rq *this_rq);
+
 #endif /* CONFIG_SMP */
 
 /*
@@ -990,7 +992,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
 	return rb_entry(left, struct sched_dl_entity, rb_node);
 }
 
-struct task_struct *pick_next_task_dl(struct rq *rq)
+struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
 {
 	struct sched_dl_entity *dl_se;
 	struct task_struct *p;
@@ -998,9 +1000,17 @@ struct task_struct *pick_next_task_dl(struct rq *rq)
 
 	dl_rq = &rq->dl;
 
+#ifdef CONFIG_SMP
+	if (dl_task(prev))
+		pull_dl_task(rq);
+#endif
+
 	if (unlikely(!dl_rq->dl_nr_running))
 		return NULL;
 
+	if (prev)
+		prev->sched_class->put_prev_task(rq, prev);
+
 	dl_se = pick_next_dl_entity(rq, dl_rq);
 	BUG_ON(!dl_se);
 
@@ -1426,13 +1436,6 @@ skip:
 	return ret;
 }
 
-static void pre_schedule_dl(struct rq *rq, struct task_struct *prev)
-{
-	/* Try to pull other tasks here */
-	if (dl_task(prev))
-		pull_dl_task(rq);
-}
-
 static void post_schedule_dl(struct rq *rq)
 {
 	push_dl_tasks(rq);
@@ -1560,7 +1563,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
 	if (unlikely(p->dl.dl_throttled))
 		return;
 
-	if (p->on_rq || rq->curr != p) {
+	if (p->on_rq && rq->curr != p) {
 #ifdef CONFIG_SMP
 		if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
 			/* Only reschedule if pushing failed */
@@ -1625,7 +1628,6 @@ const struct sched_class dl_sched_class = {
 	.set_cpus_allowed	= set_cpus_allowed_dl,
 	.rq_online		= rq_online_dl,
 	.rq_offline		= rq_offline_dl,
-	.pre_schedule		= pre_schedule_dl,
 	.post_schedule		= post_schedule_dl,
 	.task_woken		= task_woken_dl,
 #endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index dd52e7ffb10e..f3344c31632a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -321,6 +321,7 @@ do {									\
 	P(sched_goidle);
 #ifdef CONFIG_SMP
 	P64(avg_idle);
+	P64(max_idle_balance_cost);
 #endif
 
 	P(ttwu_count);
@@ -533,15 +534,15 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
 			unsigned long nr_faults = -1;
 			int cpu_current, home_node;
 
-			if (p->numa_faults)
-				nr_faults = p->numa_faults[2*node + i];
+			if (p->numa_faults_memory)
+				nr_faults = p->numa_faults_memory[2*node + i];
 
 			cpu_current = !i ? (task_node(p) == node) :
 				(pol && node_isset(node, pol->v.nodes));
 
 			home_node = (p->numa_preferred_nid == node);
 
-			SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n",
+			SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n",
 				i, node, cpu_current, home_node, nr_faults);
 		}
 	}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 966cc2bfcb77..235cfa7ad8fc 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -322,13 +322,13 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
 
 /* Do the two (enqueued) entities belong to the same group ? */
-static inline int
+static inline struct cfs_rq *
 is_same_group(struct sched_entity *se, struct sched_entity *pse)
 {
 	if (se->cfs_rq == pse->cfs_rq)
-		return 1;
+		return se->cfs_rq;
 
-	return 0;
+	return NULL;
 }
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -336,17 +336,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 	return se->parent;
 }
 
-/* return depth at which a sched entity is present in the hierarchy */
-static inline int depth_se(struct sched_entity *se)
-{
-	int depth = 0;
-
-	for_each_sched_entity(se)
-		depth++;
-
-	return depth;
-}
-
 static void
 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 {
@@ -360,8 +349,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 	 */
 
 	/* First walk up until both entities are at same depth */
-	se_depth = depth_se(*se);
-	pse_depth = depth_se(*pse);
+	se_depth = (*se)->depth;
+	pse_depth = (*pse)->depth;
 
 	while (se_depth > pse_depth) {
 		se_depth--;
@@ -426,12 +415,6 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 
-static inline int
-is_same_group(struct sched_entity *se, struct sched_entity *pse)
-{
-	return 1;
-}
-
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 {
 	return NULL;
@@ -819,14 +802,6 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
 unsigned int sysctl_numa_balancing_scan_delay = 1000;
 
-/*
- * After skipping a page migration on a shared page, skip N more numa page
- * migrations unconditionally. This reduces the number of NUMA migrations
- * in shared memory workloads, and has the effect of pulling tasks towards
- * where their memory lives, over pulling the memory towards the task.
- */
-unsigned int sysctl_numa_balancing_migrate_deferred = 16;
-
 static unsigned int task_nr_scan_windows(struct task_struct *p)
 {
 	unsigned long rss = 0;
@@ -893,10 +868,26 @@ struct numa_group {
 	struct list_head task_list;
 
 	struct rcu_head rcu;
+	nodemask_t active_nodes;
 	unsigned long total_faults;
+	/*
+	 * Faults_cpu is used to decide whether memory should move
+	 * towards the CPU. As a consequence, these stats are weighted
+	 * more by CPU use than by memory faults.
+	 */
+	unsigned long *faults_cpu;
 	unsigned long faults[0];
 };
 
+/* Shared or private faults. */
+#define NR_NUMA_HINT_FAULT_TYPES 2
+
+/* Memory and CPU locality */
+#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
+
+/* Averaged statistics, and temporary buffers. */
+#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
+
 pid_t task_numa_group_id(struct task_struct *p)
 {
 	return p->numa_group ? p->numa_group->gid : 0;
@@ -904,16 +895,16 @@ pid_t task_numa_group_id(struct task_struct *p)
 
 static inline int task_faults_idx(int nid, int priv)
 {
-	return 2 * nid + priv;
+	return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
 }
 
 static inline unsigned long task_faults(struct task_struct *p, int nid)
 {
-	if (!p->numa_faults)
+	if (!p->numa_faults_memory)
 		return 0;
 
-	return p->numa_faults[task_faults_idx(nid, 0)] +
-		p->numa_faults[task_faults_idx(nid, 1)];
+	return p->numa_faults_memory[task_faults_idx(nid, 0)] +
+		p->numa_faults_memory[task_faults_idx(nid, 1)];
 }
 
 static inline unsigned long group_faults(struct task_struct *p, int nid)
@@ -925,6 +916,12 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
 		p->numa_group->faults[task_faults_idx(nid, 1)];
 }
 
+static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
+{
+	return group->faults_cpu[task_faults_idx(nid, 0)] +
+		group->faults_cpu[task_faults_idx(nid, 1)];
+}
+
 /*
 * These return the fraction of accesses done by a particular task, or
 * task group, on a particular numa node.  The group weight is given a
@@ -935,7 +932,7 @@ static inline unsigned long task_weight(struct task_struct *p, int nid)
 {
 	unsigned long total_faults;
 
-	if (!p->numa_faults)
+	if (!p->numa_faults_memory)
 		return 0;
 
 	total_faults = p->total_numa_faults;
@@ -954,6 +951,69 @@ static inline unsigned long group_weight(struct task_struct *p, int nid)
 	return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
 }
 
+bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
+				int src_nid, int dst_cpu)
+{
+	struct numa_group *ng = p->numa_group;
+	int dst_nid = cpu_to_node(dst_cpu);
+	int last_cpupid, this_cpupid;
+
+	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
+
+	/*
+	 * Multi-stage node selection is used in conjunction with a periodic
+	 * migration fault to build a temporal task<->page relation. By using
+	 * a two-stage filter we remove short/unlikely relations.
+	 *
+	 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
+	 * a task's usage of a particular page (n_p) per total usage of this
+	 * page (n_t) (in a given time-span) to a probability.
+	 *
+	 * Our periodic faults will sample this probability and getting the
+	 * same result twice in a row, given these samples are fully
+	 * independent, is then given by P(n)^2, provided our sample period
+	 * is sufficiently short compared to the usage pattern.
+	 *
	 * This quadric squishes small probabilities, making it less likely we
+	 * act on an unlikely task<->page relation.
+	 */
+	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+	if (!cpupid_pid_unset(last_cpupid) &&
+				cpupid_to_nid(last_cpupid) != dst_nid)
+		return false;
+
+	/* Always allow migrate on private faults */
+	if (cpupid_match_pid(p, last_cpupid))
+		return true;
+
+	/* A shared fault, but p->numa_group has not been set up yet. */
+	if (!ng)
+		return true;
+
+	/*
+	 * Do not migrate if the destination is not a node that
+	 * is actively used by this numa group.
+	 */
+	if (!node_isset(dst_nid, ng->active_nodes))
+		return false;
+
+	/*
+	 * Source is a node that is not actively used by this
+	 * numa group, while the destination is. Migrate.
+	 */
+	if (!node_isset(src_nid, ng->active_nodes))
+		return true;
+
+	/*
+	 * Both source and destination are nodes in active
+	 * use by this numa group. Maximize memory bandwidth
+	 * by migrating from more heavily used groups, to less
+	 * heavily used ones, spreading the load around.
+	 * Use a 1/4 hysteresis to avoid spurious page movement.
+	 */
+	return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
+}
+
 static unsigned long weighted_cpuload(const int cpu);
 static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
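
To put a number on the two-stage filter described in the comment above (an illustrative calculation, not part of the patch): if a task is responsible for only 10% of the accesses to a shared page, P(p) is roughly 0.1, so the chance that two consecutive, independent hinting-fault samples both land on that task is about 0.1^2 = 0.01, and such a weak task<->page relation rarely triggers a migration; a task doing 90% of the accesses passes the filter with probability around 0.81.
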
@@ -1267,7 +1327,7 @@ static int task_numa_migrate(struct task_struct *p)
 static void numa_migrate_preferred(struct task_struct *p)
 {
 	/* This task has no NUMA fault statistics yet */
-	if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
+	if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
 		return;
 
 	/* Periodically retry migrating the task to the preferred node */
@@ -1282,6 +1342,38 @@ static void numa_migrate_preferred(struct task_struct *p)
 }
 
 /*
+ * Find the nodes on which the workload is actively running. We do this by
+ * tracking the nodes from which NUMA hinting faults are triggered. This can
+ * be different from the set of nodes where the workload's memory is currently
+ * located.
+ *
+ * The bitmask is used to make smarter decisions on when to do NUMA page
+ * migrations, To prevent flip-flopping, and excessive page migrations, nodes
+ * are added when they cause over 6/16 of the maximum number of faults, but
+ * only removed when they drop below 3/16.
+ */
+static void update_numa_active_node_mask(struct numa_group *numa_group)
+{
+	unsigned long faults, max_faults = 0;
+	int nid;
+
+	for_each_online_node(nid) {
+		faults = group_faults_cpu(numa_group, nid);
+		if (faults > max_faults)
+			max_faults = faults;
+	}
+
+	for_each_online_node(nid) {
+		faults = group_faults_cpu(numa_group, nid);
+		if (!node_isset(nid, numa_group->active_nodes)) {
+			if (faults > max_faults * 6 / 16)
+				node_set(nid, numa_group->active_nodes);
+		} else if (faults < max_faults * 3 / 16)
+			node_clear(nid, numa_group->active_nodes);
+	}
+}
+
+/*
 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
 * increments. The more local the fault statistics are, the higher the scan
 * period will be for the next scan window. If local/remote ratio is below
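
Worked example of the 6/16 and 3/16 hysteresis above (illustrative numbers, not from the patch): if the busiest node in a group shows max_faults = 1600 CPU faults, a node is only added to active_nodes once it exceeds 1600 * 6 / 16 = 600 faults, and it is only cleared again once it falls below 1600 * 3 / 16 = 300, so a node hovering around 450 faults does not flap in and out of the mask.
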
@@ -1355,11 +1447,41 @@ static void update_task_scan_period(struct task_struct *p,
 	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
 }
 
+/*
+ * Get the fraction of time the task has been running since the last
+ * NUMA placement cycle. The scheduler keeps similar statistics, but
+ * decays those on a 32ms period, which is orders of magnitude off
+ * from the dozens-of-seconds NUMA balancing period. Use the scheduler
+ * stats only if the task is so new there are no NUMA statistics yet.
+ */
+static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
+{
+	u64 runtime, delta, now;
+	/* Use the start of this time slice to avoid calculations. */
+	now = p->se.exec_start;
+	runtime = p->se.sum_exec_runtime;
+
+	if (p->last_task_numa_placement) {
+		delta = runtime - p->last_sum_exec_runtime;
+		*period = now - p->last_task_numa_placement;
+	} else {
+		delta = p->se.avg.runnable_avg_sum;
+		*period = p->se.avg.runnable_avg_period;
+	}
+
+	p->last_sum_exec_runtime = runtime;
+	p->last_task_numa_placement = now;
+
+	return delta;
+}
+
 static void task_numa_placement(struct task_struct *p)
 {
 	int seq, nid, max_nid = -1, max_group_nid = -1;
 	unsigned long max_faults = 0, max_group_faults = 0;
 	unsigned long fault_types[2] = { 0, 0 };
+	unsigned long total_faults;
+	u64 runtime, period;
 	spinlock_t *group_lock = NULL;
 
 	seq = ACCESS_ONCE(p->mm->numa_scan_seq);
@@ -1368,6 +1490,10 @@ static void task_numa_placement(struct task_struct *p)
 	p->numa_scan_seq = seq;
 	p->numa_scan_period_max = task_scan_max(p);
 
+	total_faults = p->numa_faults_locality[0] +
+		       p->numa_faults_locality[1];
+	runtime = numa_get_avg_runtime(p, &period);
+
 	/* If the task is part of a group prevent parallel updates to group stats */
 	if (p->numa_group) {
 		group_lock = &p->numa_group->lock;
@@ -1379,24 +1505,37 @@ static void task_numa_placement(struct task_struct *p)
 		unsigned long faults = 0, group_faults = 0;
 		int priv, i;
 
-		for (priv = 0; priv < 2; priv++) {
-			long diff;
+		for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
+			long diff, f_diff, f_weight;
 
 			i = task_faults_idx(nid, priv);
-			diff = -p->numa_faults[i];
 
 			/* Decay existing window, copy faults since last scan */
-			p->numa_faults[i] >>= 1;
-			p->numa_faults[i] += p->numa_faults_buffer[i];
-			fault_types[priv] += p->numa_faults_buffer[i];
-			p->numa_faults_buffer[i] = 0;
+			diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2;
+			fault_types[priv] += p->numa_faults_buffer_memory[i];
+			p->numa_faults_buffer_memory[i] = 0;
 
-			faults += p->numa_faults[i];
-			diff += p->numa_faults[i];
+			/*
+			 * Normalize the faults_from, so all tasks in a group
+			 * count according to CPU use, instead of by the raw
+			 * number of faults. Tasks with little runtime have
+			 * little over-all impact on throughput, and thus their
+			 * faults are less important.
+			 */
+			f_weight = div64_u64(runtime << 16, period + 1);
+			f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /
+				   (total_faults + 1);
+			f_diff = f_weight - p->numa_faults_cpu[i] / 2;
+			p->numa_faults_buffer_cpu[i] = 0;
+
+			p->numa_faults_memory[i] += diff;
+			p->numa_faults_cpu[i] += f_diff;
+			faults += p->numa_faults_memory[i];
 			p->total_numa_faults += diff;
 			if (p->numa_group) {
 				/* safe because we can only change our own group */
 				p->numa_group->faults[i] += diff;
+				p->numa_group->faults_cpu[i] += f_diff;
 				p->numa_group->total_faults += diff;
 				group_faults += p->numa_group->faults[i];
 			}
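
As a rough worked example of the normalization above (illustrative numbers, not from the patch): a task that ran for about half of the measured period gets f_weight = div64_u64(runtime << 16, period + 1) of roughly 32768, i.e. half of 1 << 16, so its buffered CPU faults carry about half the weight of those from a task that was running for the whole period, before being scaled by total_faults + 1 and folded into numa_faults_cpu[].
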
@@ -1416,6 +1555,7 @@ static void task_numa_placement(struct task_struct *p)
1416 update_task_scan_period(p, fault_types[0], fault_types[1]); 1555 update_task_scan_period(p, fault_types[0], fault_types[1]);
1417 1556
1418 if (p->numa_group) { 1557 if (p->numa_group) {
1558 update_numa_active_node_mask(p->numa_group);
1419 /* 1559 /*
1420 * If the preferred task and group nids are different, 1560 * If the preferred task and group nids are different,
1421 * iterate over the nodes again to find the best place. 1561 * iterate over the nodes again to find the best place.
@@ -1465,7 +1605,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1465 1605
1466 if (unlikely(!p->numa_group)) { 1606 if (unlikely(!p->numa_group)) {
1467 unsigned int size = sizeof(struct numa_group) + 1607 unsigned int size = sizeof(struct numa_group) +
1468 2*nr_node_ids*sizeof(unsigned long); 1608 4*nr_node_ids*sizeof(unsigned long);
1469 1609
1470 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); 1610 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
1471 if (!grp) 1611 if (!grp)
@@ -1475,9 +1615,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1475 spin_lock_init(&grp->lock); 1615 spin_lock_init(&grp->lock);
1476 INIT_LIST_HEAD(&grp->task_list); 1616 INIT_LIST_HEAD(&grp->task_list);
1477 grp->gid = p->pid; 1617 grp->gid = p->pid;
1618 /* Second half of the array tracks nids where faults happen */
1619 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
1620 nr_node_ids;
1621
1622 node_set(task_node(current), grp->active_nodes);
1478 1623
1479 for (i = 0; i < 2*nr_node_ids; i++) 1624 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1480 grp->faults[i] = p->numa_faults[i]; 1625 grp->faults[i] = p->numa_faults_memory[i];
1481 1626
1482 grp->total_faults = p->total_numa_faults; 1627 grp->total_faults = p->total_numa_faults;
1483 1628
@@ -1534,9 +1679,9 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1534 1679
1535 double_lock(&my_grp->lock, &grp->lock); 1680 double_lock(&my_grp->lock, &grp->lock);
1536 1681
1537 for (i = 0; i < 2*nr_node_ids; i++) { 1682 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
1538 my_grp->faults[i] -= p->numa_faults[i]; 1683 my_grp->faults[i] -= p->numa_faults_memory[i];
1539 grp->faults[i] += p->numa_faults[i]; 1684 grp->faults[i] += p->numa_faults_memory[i];
1540 } 1685 }
1541 my_grp->total_faults -= p->total_numa_faults; 1686 my_grp->total_faults -= p->total_numa_faults;
1542 grp->total_faults += p->total_numa_faults; 1687 grp->total_faults += p->total_numa_faults;
@@ -1562,12 +1707,12 @@ void task_numa_free(struct task_struct *p)
1562{ 1707{
1563 struct numa_group *grp = p->numa_group; 1708 struct numa_group *grp = p->numa_group;
1564 int i; 1709 int i;
1565 void *numa_faults = p->numa_faults; 1710 void *numa_faults = p->numa_faults_memory;
1566 1711
1567 if (grp) { 1712 if (grp) {
1568 spin_lock(&grp->lock); 1713 spin_lock(&grp->lock);
1569 for (i = 0; i < 2*nr_node_ids; i++) 1714 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1570 grp->faults[i] -= p->numa_faults[i]; 1715 grp->faults[i] -= p->numa_faults_memory[i];
1571 grp->total_faults -= p->total_numa_faults; 1716 grp->total_faults -= p->total_numa_faults;
1572 1717
1573 list_del(&p->numa_entry); 1718 list_del(&p->numa_entry);
@@ -1577,18 +1722,21 @@ void task_numa_free(struct task_struct *p)
1577 put_numa_group(grp); 1722 put_numa_group(grp);
1578 } 1723 }
1579 1724
1580 p->numa_faults = NULL; 1725 p->numa_faults_memory = NULL;
1581 p->numa_faults_buffer = NULL; 1726 p->numa_faults_buffer_memory = NULL;
1727 p->numa_faults_cpu= NULL;
1728 p->numa_faults_buffer_cpu = NULL;
1582 kfree(numa_faults); 1729 kfree(numa_faults);
1583} 1730}
1584 1731
1585/* 1732/*
1586 * Got a PROT_NONE fault for a page on @node. 1733 * Got a PROT_NONE fault for a page on @node.
1587 */ 1734 */
1588void task_numa_fault(int last_cpupid, int node, int pages, int flags) 1735void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1589{ 1736{
1590 struct task_struct *p = current; 1737 struct task_struct *p = current;
1591 bool migrated = flags & TNF_MIGRATED; 1738 bool migrated = flags & TNF_MIGRATED;
1739 int cpu_node = task_node(current);
1592 int priv; 1740 int priv;
1593 1741
1594 if (!numabalancing_enabled) 1742 if (!numabalancing_enabled)
@@ -1603,16 +1751,24 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
1603 return; 1751 return;
1604 1752
1605 /* Allocate buffer to track faults on a per-node basis */ 1753 /* Allocate buffer to track faults on a per-node basis */
1606 if (unlikely(!p->numa_faults)) { 1754 if (unlikely(!p->numa_faults_memory)) {
1607 int size = sizeof(*p->numa_faults) * 2 * nr_node_ids; 1755 int size = sizeof(*p->numa_faults_memory) *
1756 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
1608 1757
1609 /* numa_faults and numa_faults_buffer share the allocation */ 1758 p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
1610 p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); 1759 if (!p->numa_faults_memory)
1611 if (!p->numa_faults)
1612 return; 1760 return;
1613 1761
1614 BUG_ON(p->numa_faults_buffer); 1762 BUG_ON(p->numa_faults_buffer_memory);
1615 p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); 1763 /*
1764 * The averaged statistics, shared & private, memory & cpu,
1765 * occupy the first half of the array. The second half of the
1766 * array is for current counters, which are averaged into the
1767 * first set by task_numa_placement.
1768 */
1769 p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
1770 p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
1771 p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
1616 p->total_numa_faults = 0; 1772 p->total_numa_faults = 0;
1617 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); 1773 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1618 } 1774 }
@@ -1641,7 +1797,8 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
1641 if (migrated) 1797 if (migrated)
1642 p->numa_pages_migrated += pages; 1798 p->numa_pages_migrated += pages;
1643 1799
1644 p->numa_faults_buffer[task_faults_idx(node, priv)] += pages; 1800 p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
1801 p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
1645 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; 1802 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
1646} 1803}
1647 1804
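
The hunk above carves the per-task NUMA fault statistics out of one allocation with four regions: averaged memory faults, averaged cpu faults, then the per-scan buffers for each, every region holding two counters (private and shared) per node. A minimal userspace sketch of that layout and of the index calculation, assuming the two-counters-per-node convention of task_faults_idx(); the names below are stand-ins, not the kernel's:

    #include <stdlib.h>

    /*
     * Sketch: one allocation carved into four per-node regions, in the
     * same order as the offsets in the hunk above:
     *   [0 * nr_node_ids]  averaged memory faults
     *   [2 * nr_node_ids]  averaged cpu faults
     *   [4 * nr_node_ids]  memory fault buffer (current scan window)
     *   [6 * nr_node_ids]  cpu fault buffer
     * Each region holds 2 counters per node: [private, shared].
     */
    struct fault_stats {
        unsigned long *faults_memory;
        unsigned long *faults_cpu;
        unsigned long *buffer_memory;
        unsigned long *buffer_cpu;
    };

    static int faults_idx(int nid, int priv)    /* role of task_faults_idx() */
    {
        return 2 * nid + priv;
    }

    static int fault_stats_init(struct fault_stats *s, int nr_node_ids)
    {
        unsigned long *buf = calloc(8 * nr_node_ids, sizeof(*buf));

        if (!buf)
            return -1;
        s->faults_memory = buf;
        s->faults_cpu    = buf + 2 * nr_node_ids;
        s->buffer_memory = buf + 4 * nr_node_ids;
        s->buffer_cpu    = buf + 6 * nr_node_ids;
        return 0;
    }

Recording a fault then becomes s->buffer_memory[faults_idx(mem_node, priv)] += pages, mirroring the last hunk above.
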
@@ -2414,7 +2571,8 @@ void idle_exit_fair(struct rq *this_rq)
2414 update_rq_runnable_avg(this_rq, 0); 2571 update_rq_runnable_avg(this_rq, 0);
2415} 2572}
2416 2573
2417#else 2574#else /* CONFIG_SMP */
2575
2418static inline void update_entity_load_avg(struct sched_entity *se, 2576static inline void update_entity_load_avg(struct sched_entity *se,
2419 int update_cfs_rq) {} 2577 int update_cfs_rq) {}
2420static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} 2578static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
@@ -2426,7 +2584,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
2426 int sleep) {} 2584 int sleep) {}
2427static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, 2585static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
2428 int force_update) {} 2586 int force_update) {}
2429#endif 2587#endif /* CONFIG_SMP */
2430 2588
2431static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 2589static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
2432{ 2590{
@@ -2576,10 +2734,10 @@ static void __clear_buddies_last(struct sched_entity *se)
2576{ 2734{
2577 for_each_sched_entity(se) { 2735 for_each_sched_entity(se) {
2578 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2736 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2579 if (cfs_rq->last == se) 2737 if (cfs_rq->last != se)
2580 cfs_rq->last = NULL;
2581 else
2582 break; 2738 break;
2739
2740 cfs_rq->last = NULL;
2583 } 2741 }
2584} 2742}
2585 2743
@@ -2587,10 +2745,10 @@ static void __clear_buddies_next(struct sched_entity *se)
2587{ 2745{
2588 for_each_sched_entity(se) { 2746 for_each_sched_entity(se) {
2589 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2747 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2590 if (cfs_rq->next == se) 2748 if (cfs_rq->next != se)
2591 cfs_rq->next = NULL;
2592 else
2593 break; 2749 break;
2750
2751 cfs_rq->next = NULL;
2594 } 2752 }
2595} 2753}
2596 2754
@@ -2598,10 +2756,10 @@ static void __clear_buddies_skip(struct sched_entity *se)
2598{ 2756{
2599 for_each_sched_entity(se) { 2757 for_each_sched_entity(se) {
2600 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2758 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2601 if (cfs_rq->skip == se) 2759 if (cfs_rq->skip != se)
2602 cfs_rq->skip = NULL;
2603 else
2604 break; 2760 break;
2761
2762 cfs_rq->skip = NULL;
2605 } 2763 }
2606} 2764}
2607 2765
@@ -2744,17 +2902,36 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
2744 * 3) pick the "last" process, for cache locality 2902 * 3) pick the "last" process, for cache locality
2745 * 4) do not run the "skip" process, if something else is available 2903 * 4) do not run the "skip" process, if something else is available
2746 */ 2904 */
2747static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) 2905static struct sched_entity *
2906pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
2748{ 2907{
2749 struct sched_entity *se = __pick_first_entity(cfs_rq); 2908 struct sched_entity *left = __pick_first_entity(cfs_rq);
2750 struct sched_entity *left = se; 2909 struct sched_entity *se;
2910
2911 /*
 2912	 * If curr is set we have to see if it's left of the leftmost entity
2913 * still in the tree, provided there was anything in the tree at all.
2914 */
2915 if (!left || (curr && entity_before(curr, left)))
2916 left = curr;
2917
2918 se = left; /* ideally we run the leftmost entity */
2751 2919
2752 /* 2920 /*
2753 * Avoid running the skip buddy, if running something else can 2921 * Avoid running the skip buddy, if running something else can
2754 * be done without getting too unfair. 2922 * be done without getting too unfair.
2755 */ 2923 */
2756 if (cfs_rq->skip == se) { 2924 if (cfs_rq->skip == se) {
2757 struct sched_entity *second = __pick_next_entity(se); 2925 struct sched_entity *second;
2926
2927 if (se == curr) {
2928 second = __pick_first_entity(cfs_rq);
2929 } else {
2930 second = __pick_next_entity(se);
2931 if (!second || (curr && entity_before(curr, second)))
2932 second = curr;
2933 }
2934
2758 if (second && wakeup_preempt_entity(second, left) < 1) 2935 if (second && wakeup_preempt_entity(second, left) < 1)
2759 se = second; 2936 se = second;
2760 } 2937 }
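
Because the optimized pick path below can reach this code without put_prev_entity() having run, the currently running entity is not in the rbtree and has to be compared against the tree's leftmost entity by hand. A self-contained sketch of just that comparison, with buddy handling left out and stand-in types instead of the kernel's:

    #include <stddef.h>

    struct entity {
        unsigned long long vruntime;
    };

    /* Same signedness trick as entity_before(): compare vruntimes modulo wrap. */
    static int before(const struct entity *a, const struct entity *b)
    {
        return (long long)(a->vruntime - b->vruntime) < 0;
    }

    /*
     * The running entity (curr) is kept out of the tree, so the effective
     * "leftmost" is whichever of (first-in-tree, curr) has the smaller
     * vruntime; curr also covers the case of an otherwise empty tree.
     */
    static struct entity *pick_leftmost(struct entity *tree_first, struct entity *curr)
    {
        struct entity *left = tree_first;

        if (!left || (curr && before(curr, left)))
            left = curr;

        return left;
    }
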
@@ -2776,7 +2953,7 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
2776 return se; 2953 return se;
2777} 2954}
2778 2955
2779static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq); 2956static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
2780 2957
2781static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) 2958static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
2782{ 2959{
@@ -3431,22 +3608,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
3431} 3608}
3432 3609
3433/* conditionally throttle active cfs_rq's from put_prev_entity() */ 3610/* conditionally throttle active cfs_rq's from put_prev_entity() */
3434static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) 3611static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3435{ 3612{
3436 if (!cfs_bandwidth_used()) 3613 if (!cfs_bandwidth_used())
3437 return; 3614 return false;
3438 3615
3439 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) 3616 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
3440 return; 3617 return false;
3441 3618
3442 /* 3619 /*
3443 * it's possible for a throttled entity to be forced into a running 3620 * it's possible for a throttled entity to be forced into a running
3444 * state (e.g. set_curr_task), in this case we're finished. 3621 * state (e.g. set_curr_task), in this case we're finished.
3445 */ 3622 */
3446 if (cfs_rq_throttled(cfs_rq)) 3623 if (cfs_rq_throttled(cfs_rq))
3447 return; 3624 return true;
3448 3625
3449 throttle_cfs_rq(cfs_rq); 3626 throttle_cfs_rq(cfs_rq);
3627 return true;
3450} 3628}
3451 3629
3452static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) 3630static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
@@ -3556,7 +3734,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3556} 3734}
3557 3735
3558static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} 3736static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
3559static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 3737static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
3560static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} 3738static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
3561static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 3739static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
3562 3740
@@ -4492,26 +4670,125 @@ preempt:
4492 set_last_buddy(se); 4670 set_last_buddy(se);
4493} 4671}
4494 4672
4495static struct task_struct *pick_next_task_fair(struct rq *rq) 4673static struct task_struct *
4674pick_next_task_fair(struct rq *rq, struct task_struct *prev)
4496{ 4675{
4497 struct task_struct *p;
4498 struct cfs_rq *cfs_rq = &rq->cfs; 4676 struct cfs_rq *cfs_rq = &rq->cfs;
4499 struct sched_entity *se; 4677 struct sched_entity *se;
4678 struct task_struct *p;
4500 4679
4680again: __maybe_unused
4681#ifdef CONFIG_FAIR_GROUP_SCHED
4501 if (!cfs_rq->nr_running) 4682 if (!cfs_rq->nr_running)
4502 return NULL; 4683 goto idle;
4684
4685 if (!prev || prev->sched_class != &fair_sched_class)
4686 goto simple;
4687
4688 /*
4689 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
4690 * likely that a next task is from the same cgroup as the current.
4691 *
4692 * Therefore attempt to avoid putting and setting the entire cgroup
4693 * hierarchy, only change the part that actually changes.
4694 */
4503 4695
4504 do { 4696 do {
4505 se = pick_next_entity(cfs_rq); 4697 struct sched_entity *curr = cfs_rq->curr;
4698
4699 /*
4700 * Since we got here without doing put_prev_entity() we also
4701 * have to consider cfs_rq->curr. If it is still a runnable
4702 * entity, update_curr() will update its vruntime, otherwise
4703 * forget we've ever seen it.
4704 */
4705 if (curr && curr->on_rq)
4706 update_curr(cfs_rq);
4707 else
4708 curr = NULL;
4709
4710 /*
4711 * This call to check_cfs_rq_runtime() will do the throttle and
4712 * dequeue its entity in the parent(s). Therefore the 'simple'
4713 * nr_running test will indeed be correct.
4714 */
4715 if (unlikely(check_cfs_rq_runtime(cfs_rq)))
4716 goto simple;
4717
4718 se = pick_next_entity(cfs_rq, curr);
4719 cfs_rq = group_cfs_rq(se);
4720 } while (cfs_rq);
4721
4722 p = task_of(se);
4723
4724 /*
4725 * Since we haven't yet done put_prev_entity and if the selected task
4726 * is a different task than we started out with, try and touch the
4727 * least amount of cfs_rqs.
4728 */
4729 if (prev != p) {
4730 struct sched_entity *pse = &prev->se;
4731
4732 while (!(cfs_rq = is_same_group(se, pse))) {
4733 int se_depth = se->depth;
4734 int pse_depth = pse->depth;
4735
4736 if (se_depth <= pse_depth) {
4737 put_prev_entity(cfs_rq_of(pse), pse);
4738 pse = parent_entity(pse);
4739 }
4740 if (se_depth >= pse_depth) {
4741 set_next_entity(cfs_rq_of(se), se);
4742 se = parent_entity(se);
4743 }
4744 }
4745
4746 put_prev_entity(cfs_rq, pse);
4747 set_next_entity(cfs_rq, se);
4748 }
4749
4750 if (hrtick_enabled(rq))
4751 hrtick_start_fair(rq, p);
4752
4753 return p;
4754simple:
4755 cfs_rq = &rq->cfs;
4756#endif
4757
4758 if (!cfs_rq->nr_running)
4759 goto idle;
4760
4761 if (prev)
4762 prev->sched_class->put_prev_task(rq, prev);
4763
4764 do {
4765 se = pick_next_entity(cfs_rq, NULL);
4506 set_next_entity(cfs_rq, se); 4766 set_next_entity(cfs_rq, se);
4507 cfs_rq = group_cfs_rq(se); 4767 cfs_rq = group_cfs_rq(se);
4508 } while (cfs_rq); 4768 } while (cfs_rq);
4509 4769
4510 p = task_of(se); 4770 p = task_of(se);
4771
4511 if (hrtick_enabled(rq)) 4772 if (hrtick_enabled(rq))
4512 hrtick_start_fair(rq, p); 4773 hrtick_start_fair(rq, p);
4513 4774
4514 return p; 4775 return p;
4776
4777idle:
4778#ifdef CONFIG_SMP
4779 idle_enter_fair(rq);
4780 /*
4781 * We must set idle_stamp _before_ calling idle_balance(), such that we
4782 * measure the duration of idle_balance() as idle time.
4783 */
4784 rq->idle_stamp = rq_clock(rq);
4785 if (idle_balance(rq)) { /* drops rq->lock */
4786 rq->idle_stamp = 0;
4787 goto again;
4788 }
4789#endif
4790
4791 return NULL;
4515} 4792}
4516 4793
4517/* 4794/*
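
The interesting part of the new fast path is the fix-up at the end: when prev and the freshly picked task live in different parts of the cgroup hierarchy, the two entity chains are walked upward in lockstep, guided by the new se->depth field, so only the levels that actually differ get put and set. A runnable sketch of that meet-in-the-middle walk over a stand-in entity type (the printf calls stand in for put_prev_entity()/set_next_entity()):

    #include <stdio.h>

    struct ent {
        struct ent *parent;   /* NULL at the root */
        int depth;            /* cached distance from the root */
        const char *name;
    };

    /*
     * Walk prev's chain (put) and next's chain (set) upward until both
     * sit in the same group, i.e. share a parent. The deeper chain
     * advances first; equal depths advance together, as in the hunk above.
     */
    static void switch_entities(struct ent *se, struct ent *pse)
    {
        while (se->parent != pse->parent) {
            int se_depth = se->depth;
            int pse_depth = pse->depth;

            if (se_depth <= pse_depth) {
                printf("put %s\n", pse->name);   /* put_prev_entity() */
                pse = pse->parent;
            }
            if (se_depth >= pse_depth) {
                printf("set %s\n", se->name);    /* set_next_entity() */
                se = se->parent;
            }
        }
        /* same group reached: one final put/set pair */
        printf("put %s\nset %s\n", pse->name, se->name);
    }

    int main(void)
    {
        struct ent group_a = { NULL, 0, "groupA" }, group_b = { NULL, 0, "groupB" };
        struct ent prev    = { &group_a, 1, "prev task" };
        struct ent next    = { &group_b, 1, "next task" };

        switch_entities(&next, &prev);
        return 0;
    }

For this example the walk puts and sets the two tasks and then their differing parent groups, but touches nothing above the level where the hierarchies meet.
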
@@ -4783,7 +5060,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
4783{ 5060{
4784 int src_nid, dst_nid; 5061 int src_nid, dst_nid;
4785 5062
4786 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || 5063 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
4787 !(env->sd->flags & SD_NUMA)) { 5064 !(env->sd->flags & SD_NUMA)) {
4788 return false; 5065 return false;
4789 } 5066 }
@@ -4814,7 +5091,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
4814 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) 5091 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
4815 return false; 5092 return false;
4816 5093
4817 if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) 5094 if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA))
4818 return false; 5095 return false;
4819 5096
4820 src_nid = cpu_to_node(env->src_cpu); 5097 src_nid = cpu_to_node(env->src_cpu);
@@ -6357,17 +6634,16 @@ out:
6357 * idle_balance is called by schedule() if this_cpu is about to become 6634 * idle_balance is called by schedule() if this_cpu is about to become
6358 * idle. Attempts to pull tasks from other CPUs. 6635 * idle. Attempts to pull tasks from other CPUs.
6359 */ 6636 */
6360void idle_balance(int this_cpu, struct rq *this_rq) 6637int idle_balance(struct rq *this_rq)
6361{ 6638{
6362 struct sched_domain *sd; 6639 struct sched_domain *sd;
6363 int pulled_task = 0; 6640 int pulled_task = 0;
6364 unsigned long next_balance = jiffies + HZ; 6641 unsigned long next_balance = jiffies + HZ;
6365 u64 curr_cost = 0; 6642 u64 curr_cost = 0;
6366 6643 int this_cpu = this_rq->cpu;
6367 this_rq->idle_stamp = rq_clock(this_rq);
6368 6644
6369 if (this_rq->avg_idle < sysctl_sched_migration_cost) 6645 if (this_rq->avg_idle < sysctl_sched_migration_cost)
6370 return; 6646 return 0;
6371 6647
6372 /* 6648 /*
6373 * Drop the rq->lock, but keep IRQ/preempt disabled. 6649 * Drop the rq->lock, but keep IRQ/preempt disabled.
@@ -6405,15 +6681,20 @@ void idle_balance(int this_cpu, struct rq *this_rq)
6405 interval = msecs_to_jiffies(sd->balance_interval); 6681 interval = msecs_to_jiffies(sd->balance_interval);
6406 if (time_after(next_balance, sd->last_balance + interval)) 6682 if (time_after(next_balance, sd->last_balance + interval))
6407 next_balance = sd->last_balance + interval; 6683 next_balance = sd->last_balance + interval;
6408 if (pulled_task) { 6684 if (pulled_task)
6409 this_rq->idle_stamp = 0;
6410 break; 6685 break;
6411 }
6412 } 6686 }
6413 rcu_read_unlock(); 6687 rcu_read_unlock();
6414 6688
6415 raw_spin_lock(&this_rq->lock); 6689 raw_spin_lock(&this_rq->lock);
6416 6690
6691 /*
6692 * While browsing the domains, we released the rq lock.
 6693 * A task could have been enqueued in the meantime
6694 */
6695 if (this_rq->nr_running && !pulled_task)
6696 return 1;
6697
6417 if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 6698 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
6418 /* 6699 /*
6419 * We are going idle. next_balance may be set based on 6700 * We are going idle. next_balance may be set based on
@@ -6424,6 +6705,8 @@ void idle_balance(int this_cpu, struct rq *this_rq)
6424 6705
6425 if (curr_cost > this_rq->max_idle_balance_cost) 6706 if (curr_cost > this_rq->max_idle_balance_cost)
6426 this_rq->max_idle_balance_cost = curr_cost; 6707 this_rq->max_idle_balance_cost = curr_cost;
6708
6709 return pulled_task;
6427} 6710}
6428 6711
6429/* 6712/*
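
Two things change around idle_balance(): the caller now owns rq->idle_stamp (set before the call, cleared when a task was pulled, so the time spent balancing is accounted as idle time), and the function reports whether it found work so the fair pick path above can retry. The re-check of nr_running after re-taking the lock is the usual drop-lock pattern; a small pthread sketch of that pattern in isolation, with illustrative names only:

    #include <pthread.h>

    struct runqueue {
        pthread_mutex_t lock;
        int nr_running;
    };

    /*
     * Drop the lock to do slow balancing work, re-take it, then re-check
     * the protected state: a task may have been enqueued locally while
     * the lock was not held, and that counts as success too, so the
     * caller goes back and picks a task instead of idling.
     */
    static int try_pull_work(struct runqueue *rq)
    {
        int pulled = 0;

        pthread_mutex_unlock(&rq->lock);
        /* ... expensive cross-CPU balancing would happen here ... */
        pthread_mutex_lock(&rq->lock);

        if (rq->nr_running && !pulled)
            return 1;

        return pulled;
    }
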
@@ -7082,7 +7365,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
7082#ifdef CONFIG_FAIR_GROUP_SCHED 7365#ifdef CONFIG_FAIR_GROUP_SCHED
7083static void task_move_group_fair(struct task_struct *p, int on_rq) 7366static void task_move_group_fair(struct task_struct *p, int on_rq)
7084{ 7367{
7368 struct sched_entity *se = &p->se;
7085 struct cfs_rq *cfs_rq; 7369 struct cfs_rq *cfs_rq;
7370
7086 /* 7371 /*
7087 * If the task was not on the rq at the time of this cgroup movement 7372 * If the task was not on the rq at the time of this cgroup movement
7088 * it must have been asleep, sleeping tasks keep their ->vruntime 7373 * it must have been asleep, sleeping tasks keep their ->vruntime
@@ -7108,23 +7393,24 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
7108 * To prevent boost or penalty in the new cfs_rq caused by delta 7393 * To prevent boost or penalty in the new cfs_rq caused by delta
7109 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. 7394 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
7110 */ 7395 */
7111 if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING)) 7396 if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
7112 on_rq = 1; 7397 on_rq = 1;
7113 7398
7114 if (!on_rq) 7399 if (!on_rq)
7115 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; 7400 se->vruntime -= cfs_rq_of(se)->min_vruntime;
7116 set_task_rq(p, task_cpu(p)); 7401 set_task_rq(p, task_cpu(p));
7402 se->depth = se->parent ? se->parent->depth + 1 : 0;
7117 if (!on_rq) { 7403 if (!on_rq) {
7118 cfs_rq = cfs_rq_of(&p->se); 7404 cfs_rq = cfs_rq_of(se);
7119 p->se.vruntime += cfs_rq->min_vruntime; 7405 se->vruntime += cfs_rq->min_vruntime;
7120#ifdef CONFIG_SMP 7406#ifdef CONFIG_SMP
7121 /* 7407 /*
7122 * migrate_task_rq_fair() will have removed our previous 7408 * migrate_task_rq_fair() will have removed our previous
7123 * contribution, but we must synchronize for ongoing future 7409 * contribution, but we must synchronize for ongoing future
7124 * decay. 7410 * decay.
7125 */ 7411 */
7126 p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter); 7412 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
7127 cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib; 7413 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
7128#endif 7414#endif
7129 } 7415 }
7130} 7416}
@@ -7220,10 +7506,13 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7220 if (!se) 7506 if (!se)
7221 return; 7507 return;
7222 7508
7223 if (!parent) 7509 if (!parent) {
7224 se->cfs_rq = &rq->cfs; 7510 se->cfs_rq = &rq->cfs;
7225 else 7511 se->depth = 0;
7512 } else {
7226 se->cfs_rq = parent->my_q; 7513 se->cfs_rq = parent->my_q;
7514 se->depth = parent->depth + 1;
7515 }
7227 7516
7228 se->my_q = cfs_rq; 7517 se->my_q = cfs_rq;
7229 /* guarantee group entities always have weight */ 7518 /* guarantee group entities always have weight */
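
Both hunks above maintain the same invariant for the new se->depth field: root-level entities sit at depth 0 and every child is one deeper than its parent, which is what lets the lockstep walk earlier in this file align two chains without counting parents each time. A minimal sketch of that invariant, with a stand-in type:

    struct gent {
        struct gent *parent;   /* NULL at the root */
        int depth;
    };

    /* Same rule as init_tg_cfs_entry()/task_move_group_fair() above. */
    static void attach_entity(struct gent *se, struct gent *parent)
    {
        se->parent = parent;
        se->depth = parent ? parent->depth + 1 : 0;
    }
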
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
new file mode 100644
index 000000000000..14ca43430aee
--- /dev/null
+++ b/kernel/sched/idle.c
@@ -0,0 +1,144 @@
1/*
2 * Generic entry point for the idle threads
3 */
4#include <linux/sched.h>
5#include <linux/cpu.h>
6#include <linux/cpuidle.h>
7#include <linux/tick.h>
8#include <linux/mm.h>
9#include <linux/stackprotector.h>
10
11#include <asm/tlb.h>
12
13#include <trace/events/power.h>
14
15static int __read_mostly cpu_idle_force_poll;
16
17void cpu_idle_poll_ctrl(bool enable)
18{
19 if (enable) {
20 cpu_idle_force_poll++;
21 } else {
22 cpu_idle_force_poll--;
23 WARN_ON_ONCE(cpu_idle_force_poll < 0);
24 }
25}
26
27#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP
28static int __init cpu_idle_poll_setup(char *__unused)
29{
30 cpu_idle_force_poll = 1;
31 return 1;
32}
33__setup("nohlt", cpu_idle_poll_setup);
34
35static int __init cpu_idle_nopoll_setup(char *__unused)
36{
37 cpu_idle_force_poll = 0;
38 return 1;
39}
40__setup("hlt", cpu_idle_nopoll_setup);
41#endif
42
43static inline int cpu_idle_poll(void)
44{
45 rcu_idle_enter();
46 trace_cpu_idle_rcuidle(0, smp_processor_id());
47 local_irq_enable();
48 while (!tif_need_resched())
49 cpu_relax();
50 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
51 rcu_idle_exit();
52 return 1;
53}
54
55/* Weak implementations for optional arch specific functions */
56void __weak arch_cpu_idle_prepare(void) { }
57void __weak arch_cpu_idle_enter(void) { }
58void __weak arch_cpu_idle_exit(void) { }
59void __weak arch_cpu_idle_dead(void) { }
60void __weak arch_cpu_idle(void)
61{
62 cpu_idle_force_poll = 1;
63 local_irq_enable();
64}
65
66/*
67 * Generic idle loop implementation
68 */
69static void cpu_idle_loop(void)
70{
71 while (1) {
72 tick_nohz_idle_enter();
73
74 while (!need_resched()) {
75 check_pgt_cache();
76 rmb();
77
78 if (cpu_is_offline(smp_processor_id()))
79 arch_cpu_idle_dead();
80
81 local_irq_disable();
82 arch_cpu_idle_enter();
83
84 /*
85 * In poll mode we reenable interrupts and spin.
86 *
87 * Also if we detected in the wakeup from idle
88 * path that the tick broadcast device expired
89 * for us, we don't want to go deep idle as we
90 * know that the IPI is going to arrive right
91 * away
92 */
93 if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
94 cpu_idle_poll();
95 } else {
96 if (!current_clr_polling_and_test()) {
97 stop_critical_timings();
98 rcu_idle_enter();
99 if (cpuidle_idle_call())
100 arch_cpu_idle();
101 if (WARN_ON_ONCE(irqs_disabled()))
102 local_irq_enable();
103 rcu_idle_exit();
104 start_critical_timings();
105 } else {
106 local_irq_enable();
107 }
108 __current_set_polling();
109 }
110 arch_cpu_idle_exit();
111 /*
112 * We need to test and propagate the TIF_NEED_RESCHED
113 * bit here because we might not have send the
114 * reschedule IPI to idle tasks.
115 */
116 if (tif_need_resched())
117 set_preempt_need_resched();
118 }
119 tick_nohz_idle_exit();
120 schedule_preempt_disabled();
121 }
122}
123
124void cpu_startup_entry(enum cpuhp_state state)
125{
126 /*
127 * This #ifdef needs to die, but it's too late in the cycle to
128 * make this generic (arm and sh have never invoked the canary
129 * init for the non boot cpus!). Will be fixed in 3.11
130 */
131#ifdef CONFIG_X86
132 /*
133 * If we're the non-boot CPU, nothing set the stack canary up
134 * for us. The boot CPU already has it initialized but no harm
135 * in doing it again. This is a good place for updating it, as
 136 * we won't ever return from this function (so the invalid
 137 * canaries already on the stack won't ever trigger).
138 */
139 boot_init_stack_canary();
140#endif
141 __current_set_polling();
142 arch_cpu_idle_prepare();
143 cpu_idle_loop();
144}
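
cpu_idle_poll_ctrl() above is a counted switch rather than a flag: every enable must be paired with a disable, and while the count is non-zero the loop spins in cpu_idle_poll() instead of going through cpuidle. A hedged usage sketch; the surrounding function is illustrative and not part of this patch:

    /*
     * Illustrative only: bracket a latency-critical window so idle CPUs
     * poll instead of entering deep idle states. Enables and disables
     * must balance, since cpu_idle_force_poll is a counter.
     */
    static void run_latency_critical(void (*work)(void *), void *arg)
    {
        cpu_idle_poll_ctrl(true);    /* force polling idle */
        work(arg);
        cpu_idle_poll_ctrl(false);   /* back to normal idle selection */
    }
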
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 516c3d9ceea1..f7d03af79a5b 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -13,18 +13,8 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
13{ 13{
14 return task_cpu(p); /* IDLE tasks as never migrated */ 14 return task_cpu(p); /* IDLE tasks as never migrated */
15} 15}
16
17static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
18{
19 idle_exit_fair(rq);
20 rq_last_tick_reset(rq);
21}
22
23static void post_schedule_idle(struct rq *rq)
24{
25 idle_enter_fair(rq);
26}
27#endif /* CONFIG_SMP */ 16#endif /* CONFIG_SMP */
17
28/* 18/*
29 * Idle tasks are unconditionally rescheduled: 19 * Idle tasks are unconditionally rescheduled:
30 */ 20 */
@@ -33,12 +23,15 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
33 resched_task(rq->idle); 23 resched_task(rq->idle);
34} 24}
35 25
36static struct task_struct *pick_next_task_idle(struct rq *rq) 26static struct task_struct *
27pick_next_task_idle(struct rq *rq, struct task_struct *prev)
37{ 28{
29 if (prev)
30 prev->sched_class->put_prev_task(rq, prev);
31
38 schedstat_inc(rq, sched_goidle); 32 schedstat_inc(rq, sched_goidle);
39#ifdef CONFIG_SMP 33#ifdef CONFIG_SMP
40 /* Trigger the post schedule to do an idle_enter for CFS */ 34 idle_enter_fair(rq);
41 rq->post_schedule = 1;
42#endif 35#endif
43 return rq->idle; 36 return rq->idle;
44} 37}
@@ -58,6 +51,10 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
58 51
59static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) 52static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
60{ 53{
54#ifdef CONFIG_SMP
55 idle_exit_fair(rq);
56 rq_last_tick_reset(rq);
57#endif
61} 58}
62 59
63static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) 60static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
@@ -101,8 +98,6 @@ const struct sched_class idle_sched_class = {
101 98
102#ifdef CONFIG_SMP 99#ifdef CONFIG_SMP
103 .select_task_rq = select_task_rq_idle, 100 .select_task_rq = select_task_rq_idle,
104 .pre_schedule = pre_schedule_idle,
105 .post_schedule = post_schedule_idle,
106#endif 101#endif
107 102
108 .set_curr_task = set_curr_task_idle, 103 .set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a2740b775b45..72f9ec759972 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -229,6 +229,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
229 229
230#ifdef CONFIG_SMP 230#ifdef CONFIG_SMP
231 231
232static int pull_rt_task(struct rq *this_rq);
233
232static inline int rt_overloaded(struct rq *rq) 234static inline int rt_overloaded(struct rq *rq)
233{ 235{
234 return atomic_read(&rq->rd->rto_count); 236 return atomic_read(&rq->rd->rto_count);
@@ -1310,15 +1312,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1310{ 1312{
1311 struct sched_rt_entity *rt_se; 1313 struct sched_rt_entity *rt_se;
1312 struct task_struct *p; 1314 struct task_struct *p;
1313 struct rt_rq *rt_rq; 1315 struct rt_rq *rt_rq = &rq->rt;
1314
1315 rt_rq = &rq->rt;
1316
1317 if (!rt_rq->rt_nr_running)
1318 return NULL;
1319
1320 if (rt_rq_throttled(rt_rq))
1321 return NULL;
1322 1316
1323 do { 1317 do {
1324 rt_se = pick_next_rt_entity(rq, rt_rq); 1318 rt_se = pick_next_rt_entity(rq, rt_rq);
@@ -1332,9 +1326,28 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1332 return p; 1326 return p;
1333} 1327}
1334 1328
1335static struct task_struct *pick_next_task_rt(struct rq *rq) 1329static struct task_struct *
1330pick_next_task_rt(struct rq *rq, struct task_struct *prev)
1336{ 1331{
1337 struct task_struct *p = _pick_next_task_rt(rq); 1332 struct task_struct *p;
1333 struct rt_rq *rt_rq = &rq->rt;
1334
1335#ifdef CONFIG_SMP
1336 /* Try to pull RT tasks here if we lower this rq's prio */
1337 if (rq->rt.highest_prio.curr > prev->prio)
1338 pull_rt_task(rq);
1339#endif
1340
1341 if (!rt_rq->rt_nr_running)
1342 return NULL;
1343
1344 if (rt_rq_throttled(rt_rq))
1345 return NULL;
1346
1347 if (prev)
1348 prev->sched_class->put_prev_task(rq, prev);
1349
1350 p = _pick_next_task_rt(rq);
1338 1351
1339 /* The running task is never eligible for pushing */ 1352 /* The running task is never eligible for pushing */
1340 if (p) 1353 if (p)
@@ -1716,13 +1729,6 @@ skip:
1716 return ret; 1729 return ret;
1717} 1730}
1718 1731
1719static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1720{
1721 /* Try to pull RT tasks here if we lower this rq's prio */
1722 if (rq->rt.highest_prio.curr > prev->prio)
1723 pull_rt_task(rq);
1724}
1725
1726static void post_schedule_rt(struct rq *rq) 1732static void post_schedule_rt(struct rq *rq)
1727{ 1733{
1728 push_rt_tasks(rq); 1734 push_rt_tasks(rq);
@@ -1999,7 +2005,6 @@ const struct sched_class rt_sched_class = {
1999 .set_cpus_allowed = set_cpus_allowed_rt, 2005 .set_cpus_allowed = set_cpus_allowed_rt,
2000 .rq_online = rq_online_rt, 2006 .rq_online = rq_online_rt,
2001 .rq_offline = rq_offline_rt, 2007 .rq_offline = rq_offline_rt,
2002 .pre_schedule = pre_schedule_rt,
2003 .post_schedule = post_schedule_rt, 2008 .post_schedule = post_schedule_rt,
2004 .task_woken = task_woken_rt, 2009 .task_woken = task_woken_rt,
2005 .switched_from = switched_from_rt, 2010 .switched_from = switched_from_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c2119fd20f8b..1bf34c257d3b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -24,24 +24,6 @@ extern long calc_load_fold_active(struct rq *this_rq);
24extern void update_cpu_load_active(struct rq *this_rq); 24extern void update_cpu_load_active(struct rq *this_rq);
25 25
26/* 26/*
27 * Convert user-nice values [ -20 ... 0 ... 19 ]
28 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
29 * and back.
30 */
31#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
32#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
33#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
34
35/*
36 * 'User priority' is the nice value converted to something we
37 * can work with better when scaling various scheduler parameters,
38 * it's a [ 0 ... 39 ] range.
39 */
40#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
41#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
42#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
43
44/*
45 * Helpers for converting nanosecond timing to jiffy resolution 27 * Helpers for converting nanosecond timing to jiffy resolution
46 */ 28 */
47#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) 29#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
@@ -1123,14 +1105,19 @@ struct sched_class {
1123 1105
1124 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); 1106 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
1125 1107
1126 struct task_struct * (*pick_next_task) (struct rq *rq); 1108 /*
1109 * It is the responsibility of the pick_next_task() method that will
1110 * return the next task to call put_prev_task() on the @prev task or
1111 * something equivalent.
1112 */
1113 struct task_struct * (*pick_next_task) (struct rq *rq,
1114 struct task_struct *prev);
1127 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 1115 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
1128 1116
1129#ifdef CONFIG_SMP 1117#ifdef CONFIG_SMP
1130 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); 1118 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
1131 void (*migrate_task_rq)(struct task_struct *p, int next_cpu); 1119 void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
1132 1120
1133 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
1134 void (*post_schedule) (struct rq *this_rq); 1121 void (*post_schedule) (struct rq *this_rq);
1135 void (*task_waking) (struct task_struct *task); 1122 void (*task_waking) (struct task_struct *task);
1136 void (*task_woken) (struct rq *this_rq, struct task_struct *task); 1123 void (*task_woken) (struct rq *this_rq, struct task_struct *task);
@@ -1176,7 +1163,7 @@ extern const struct sched_class idle_sched_class;
1176extern void update_group_power(struct sched_domain *sd, int cpu); 1163extern void update_group_power(struct sched_domain *sd, int cpu);
1177 1164
1178extern void trigger_load_balance(struct rq *rq); 1165extern void trigger_load_balance(struct rq *rq);
1179extern void idle_balance(int this_cpu, struct rq *this_rq); 1166extern int idle_balance(struct rq *this_rq);
1180 1167
1181extern void idle_enter_fair(struct rq *this_rq); 1168extern void idle_enter_fair(struct rq *this_rq);
1182extern void idle_exit_fair(struct rq *this_rq); 1169extern void idle_exit_fair(struct rq *this_rq);
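
The comment added to struct sched_class captures the contract the rest of this series relies on: pick_next_task() now receives @prev and must itself put it (or do something equivalent) before returning a task, and must leave it untouched when it returns NULL so the core code can fall through to the next class. A minimal sketch of a method honoring that contract; the example_* helpers are hypothetical and do not name an existing class:

    static struct task_struct *
    pick_next_task_example(struct rq *rq, struct task_struct *prev)
    {
        struct task_struct *next = example_peek_task(rq);   /* hypothetical */

        if (!next)
            return NULL;             /* nothing runnable: leave prev alone */

        if (prev)
            prev->sched_class->put_prev_task(rq, prev);

        example_set_next_task(rq, next);                     /* hypothetical */
        return next;
    }

This is the same shape as the rt and stop_task versions in this diff; the fair class is the one that uses the "or something equivalent" latitude to avoid the full put/set walk.
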
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index fdb6bb0b3356..a4147c9d2017 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -23,16 +23,20 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
23 /* we're never preempted */ 23 /* we're never preempted */
24} 24}
25 25
26static struct task_struct *pick_next_task_stop(struct rq *rq) 26static struct task_struct *
27pick_next_task_stop(struct rq *rq, struct task_struct *prev)
27{ 28{
28 struct task_struct *stop = rq->stop; 29 struct task_struct *stop = rq->stop;
29 30
30 if (stop && stop->on_rq) { 31 if (!stop || !stop->on_rq)
31 stop->se.exec_start = rq_clock_task(rq); 32 return NULL;
32 return stop;
33 }
34 33
35 return NULL; 34 if (prev)
35 prev->sched_class->put_prev_task(rq, prev);
36
37 stop->se.exec_start = rq_clock_task(rq);
38
39 return stop;
36} 40}
37 41
38static void 42static void
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 49e13e1f8fe6..7754ff16f334 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -386,13 +386,6 @@ static struct ctl_table kern_table[] = {
386 .proc_handler = proc_dointvec, 386 .proc_handler = proc_dointvec,
387 }, 387 },
388 { 388 {
389 .procname = "numa_balancing_migrate_deferred",
390 .data = &sysctl_numa_balancing_migrate_deferred,
391 .maxlen = sizeof(unsigned int),
392 .mode = 0644,
393 .proc_handler = proc_dointvec,
394 },
395 {
396 .procname = "numa_balancing", 389 .procname = "numa_balancing",
397 .data = NULL, /* filled in by handler */ 390 .data = NULL, /* filled in by handler */
398 .maxlen = sizeof(unsigned int), 391 .maxlen = sizeof(unsigned int),
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index ae3c8f3595d4..f520b9da9c1f 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2301,35 +2301,6 @@ static void sp_free(struct sp_node *n)
2301 kmem_cache_free(sn_cache, n); 2301 kmem_cache_free(sn_cache, n);
2302} 2302}
2303 2303
2304#ifdef CONFIG_NUMA_BALANCING
2305static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
2306{
2307 /* Never defer a private fault */
2308 if (cpupid_match_pid(p, last_cpupid))
2309 return false;
2310
2311 if (p->numa_migrate_deferred) {
2312 p->numa_migrate_deferred--;
2313 return true;
2314 }
2315 return false;
2316}
2317
2318static inline void defer_numa_migrate(struct task_struct *p)
2319{
2320 p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred;
2321}
2322#else
2323static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
2324{
2325 return false;
2326}
2327
2328static inline void defer_numa_migrate(struct task_struct *p)
2329{
2330}
2331#endif /* CONFIG_NUMA_BALANCING */
2332
2333/** 2304/**
2334 * mpol_misplaced - check whether current page node is valid in policy 2305 * mpol_misplaced - check whether current page node is valid in policy
2335 * 2306 *
@@ -2403,52 +2374,9 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
2403 2374
2404 /* Migrate the page towards the node whose CPU is referencing it */ 2375 /* Migrate the page towards the node whose CPU is referencing it */
2405 if (pol->flags & MPOL_F_MORON) { 2376 if (pol->flags & MPOL_F_MORON) {
2406 int last_cpupid;
2407 int this_cpupid;
2408
2409 polnid = thisnid; 2377 polnid = thisnid;
2410 this_cpupid = cpu_pid_to_cpupid(thiscpu, current->pid);
2411
2412 /*
2413 * Multi-stage node selection is used in conjunction
2414 * with a periodic migration fault to build a temporal
2415 * task<->page relation. By using a two-stage filter we
2416 * remove short/unlikely relations.
2417 *
2418 * Using P(p) ~ n_p / n_t as per frequentist
2419 * probability, we can equate a task's usage of a
2420 * particular page (n_p) per total usage of this
2421 * page (n_t) (in a given time-span) to a probability.
2422 *
2423 * Our periodic faults will sample this probability and
2424 * getting the same result twice in a row, given these
2425 * samples are fully independent, is then given by
2426 * P(n)^2, provided our sample period is sufficiently
2427 * short compared to the usage pattern.
2428 *
2429 * This quadric squishes small probabilities, making
2430 * it less likely we act on an unlikely task<->page
2431 * relation.
2432 */
2433 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
2434 if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) {
2435 2378
2436 /* See sysctl_numa_balancing_migrate_deferred comment */ 2379 if (!should_numa_migrate_memory(current, page, curnid, thiscpu))
2437 if (!cpupid_match_pid(current, last_cpupid))
2438 defer_numa_migrate(current);
2439
2440 goto out;
2441 }
2442
2443 /*
2444 * The quadratic filter above reduces extraneous migration
2445 * of shared pages somewhat. This code reduces it even more,
2446 * reducing the overhead of page migrations of shared pages.
2447 * This makes workloads with shared pages rely more on
2448 * "move task near its memory", and less on "move memory
2449 * towards its task", which is exactly what we want.
2450 */
2451 if (numa_migrate_deferred(current, last_cpupid))
2452 goto out; 2380 goto out;
2453 } 2381 }
2454 2382
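
The policy that used to live inline here, the multi-stage cpupid filter described in the removed comment, moves behind should_numa_migrate_memory() in the scheduler, where it can also consult the per-group fault data added earlier in this diff. A standalone sketch of just the two-stage idea, with the cpupid reduced to a single stored stamp; the real helper keys on the node encoded in the page's last_cpupid and layers further private- and group-based checks on top:

    #include <stdbool.h>

    /*
     * Record where the last hinting fault on this page came from and only
     * migrate once two consecutive faults agree: a single stray access is
     * not enough evidence of a stable task<->page relation.
     */
    static bool two_stage_filter(int *last_stamp, int this_stamp)
    {
        int prev = *last_stamp;

        *last_stamp = this_stamp;        /* like page_cpupid_xchg_last() */

        return prev == this_stamp;       /* migrate only on a repeat */
    }
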