Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile            1
-rw-r--r--  kernel/cpu/Makefile        1
-rw-r--r--  kernel/cpu/idle.c          7
-rw-r--r--  kernel/sched/Makefile      2
-rw-r--r--  kernel/sched/core.c       79
-rw-r--r--  kernel/sched/cputime.c     4
-rw-r--r--  kernel/sched/deadline.c   22
-rw-r--r--  kernel/sched/debug.c       7
-rw-r--r--  kernel/sched/fair.c      503
-rw-r--r--  kernel/sched/idle.c      144
-rw-r--r--  kernel/sched/idle_task.c  27
-rw-r--r--  kernel/sched/rt.c         43
-rw-r--r--  kernel/sched/sched.h      29
-rw-r--r--  kernel/sched/stop_task.c  16
-rw-r--r--  kernel/sysctl.c            7
15 files changed, 643 insertions, 249 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index bc010ee272b6..6f1c7e5cfca1 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -22,7 +22,6 @@ obj-y += sched/
 obj-y += locking/
 obj-y += power/
 obj-y += printk/
-obj-y += cpu/
 obj-y += irq/
 obj-y += rcu/
 
diff --git a/kernel/cpu/Makefile b/kernel/cpu/Makefile
deleted file mode 100644
index 59ab052ef7a0..000000000000
--- a/kernel/cpu/Makefile
+++ /dev/null
@@ -1 +0,0 @@
-obj-y = idle.o
diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c
index 277f494c2a9a..b7976a127178 100644
--- a/kernel/cpu/idle.c
+++ b/kernel/cpu/idle.c
@@ -3,6 +3,7 @@
  */
 #include <linux/sched.h>
 #include <linux/cpu.h>
+#include <linux/cpuidle.h>
 #include <linux/tick.h>
 #include <linux/mm.h>
 #include <linux/stackprotector.h>
@@ -95,8 +96,10 @@ static void cpu_idle_loop(void)
 		if (!current_clr_polling_and_test()) {
 			stop_critical_timings();
 			rcu_idle_enter();
-			arch_cpu_idle();
-			WARN_ON_ONCE(irqs_disabled());
+			if (cpuidle_idle_call())
+				arch_cpu_idle();
+			if (WARN_ON_ONCE(irqs_disabled()))
+				local_irq_enable();
 			rcu_idle_exit();
 			start_critical_timings();
 		} else {
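
The hunk above makes the generic idle loop try the cpuidle framework first and only fall back to the architecture default when cpuidle_idle_call() reports it could not handle the request; it also recovers, with a one-time warning, if an idle routine returns with interrupts still disabled. A minimal userspace sketch of that control flow, with stubbed helpers standing in for cpuidle_idle_call(), arch_cpu_idle() and the local IRQ flag (all names here are placeholders, not the kernel API):

#include <stdbool.h>
#include <stdio.h>

static bool irqs_enabled;          /* models the local IRQ flag */

static int cpuidle_enter(void)     /* stand-in for cpuidle_idle_call() */
{
	return -1;                 /* pretend no cpuidle driver is registered */
}

static void default_idle(void)     /* stand-in for arch_cpu_idle() */
{
	irqs_enabled = true;       /* real arch idle routines re-enable IRQs */
}

static void idle_once(void)
{
	irqs_enabled = false;      /* idle is entered with IRQs off */

	if (cpuidle_enter())       /* non-zero: cpuidle could not handle it */
		default_idle();    /* fall back to the architecture default */

	if (!irqs_enabled) {       /* kernel: WARN_ON_ONCE(irqs_disabled()) */
		fprintf(stderr, "idle routine left IRQs disabled\n");
		irqs_enabled = true;   /* recover instead of stalling the CPU */
	}
}

int main(void)
{
	idle_once();
	return 0;
}
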
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 9a95c8c2af2a..ab32b7b0db5c 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -13,7 +13,7 @@ endif
 
 obj-y += core.o proc.o clock.o cputime.o
 obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
-obj-y += wait.o completion.o
+obj-y += wait.o completion.o idle.o
 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b46131ef6aab..fb9764fbc537 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1745,8 +1745,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
1745 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; 1745 p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
1746 p->numa_scan_period = sysctl_numa_balancing_scan_delay; 1746 p->numa_scan_period = sysctl_numa_balancing_scan_delay;
1747 p->numa_work.next = &p->numa_work; 1747 p->numa_work.next = &p->numa_work;
1748 p->numa_faults = NULL; 1748 p->numa_faults_memory = NULL;
1749 p->numa_faults_buffer = NULL; 1749 p->numa_faults_buffer_memory = NULL;
1750 p->last_task_numa_placement = 0;
1751 p->last_sum_exec_runtime = 0;
1750 1752
1751 INIT_LIST_HEAD(&p->numa_entry); 1753 INIT_LIST_HEAD(&p->numa_entry);
1752 p->numa_group = NULL; 1754 p->numa_group = NULL;
@@ -2167,13 +2169,6 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
2167 2169
2168#ifdef CONFIG_SMP 2170#ifdef CONFIG_SMP
2169 2171
2170/* assumes rq->lock is held */
2171static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
2172{
2173 if (prev->sched_class->pre_schedule)
2174 prev->sched_class->pre_schedule(rq, prev);
2175}
2176
2177/* rq->lock is NOT held, but preemption is disabled */ 2172/* rq->lock is NOT held, but preemption is disabled */
2178static inline void post_schedule(struct rq *rq) 2173static inline void post_schedule(struct rq *rq)
2179{ 2174{
@@ -2191,10 +2186,6 @@ static inline void post_schedule(struct rq *rq)
2191 2186
2192#else 2187#else
2193 2188
2194static inline void pre_schedule(struct rq *rq, struct task_struct *p)
2195{
2196}
2197
2198static inline void post_schedule(struct rq *rq) 2189static inline void post_schedule(struct rq *rq)
2199{ 2190{
2200} 2191}
@@ -2577,18 +2568,11 @@ static inline void schedule_debug(struct task_struct *prev)
2577 schedstat_inc(this_rq(), sched_count); 2568 schedstat_inc(this_rq(), sched_count);
2578} 2569}
2579 2570
2580static void put_prev_task(struct rq *rq, struct task_struct *prev)
2581{
2582 if (prev->on_rq || rq->skip_clock_update < 0)
2583 update_rq_clock(rq);
2584 prev->sched_class->put_prev_task(rq, prev);
2585}
2586
2587/* 2571/*
2588 * Pick up the highest-prio task: 2572 * Pick up the highest-prio task:
2589 */ 2573 */
2590static inline struct task_struct * 2574static inline struct task_struct *
2591pick_next_task(struct rq *rq) 2575pick_next_task(struct rq *rq, struct task_struct *prev)
2592{ 2576{
2593 const struct sched_class *class; 2577 const struct sched_class *class;
2594 struct task_struct *p; 2578 struct task_struct *p;
@@ -2597,14 +2581,15 @@ pick_next_task(struct rq *rq)
2597 * Optimization: we know that if all tasks are in 2581 * Optimization: we know that if all tasks are in
2598 * the fair class we can call that function directly: 2582 * the fair class we can call that function directly:
2599 */ 2583 */
2600 if (likely(rq->nr_running == rq->cfs.h_nr_running)) { 2584 if (likely(prev->sched_class == &fair_sched_class &&
2601 p = fair_sched_class.pick_next_task(rq); 2585 rq->nr_running == rq->cfs.h_nr_running)) {
2586 p = fair_sched_class.pick_next_task(rq, prev);
2602 if (likely(p)) 2587 if (likely(p))
2603 return p; 2588 return p;
2604 } 2589 }
2605 2590
2606 for_each_class(class) { 2591 for_each_class(class) {
2607 p = class->pick_next_task(rq); 2592 p = class->pick_next_task(rq, prev);
2608 if (p) 2593 if (p)
2609 return p; 2594 return p;
2610 } 2595 }
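
The rewritten pick_next_task() now hands the outgoing task to each scheduling class and keeps the old optimization: when prev was a CFS task and every runnable task on the runqueue belongs to CFS, the fair class is asked directly instead of walking the whole class list. A simplified, self-contained sketch of that dispatch order, with toy types in place of struct rq and the real class table:

#include <stddef.h>
#include <stdio.h>

struct rq { unsigned int nr_running, cfs_nr_running; };
struct task { const char *name; };

typedef struct task *(*pick_fn)(struct rq *rq, struct task *prev);

static struct task fair_task = { "cfs" }, idle_task = { "idle" };

static struct task *pick_fair(struct rq *rq, struct task *prev)
{
	return rq->cfs_nr_running ? &fair_task : NULL;
}
static struct task *pick_rt(struct rq *rq, struct task *prev)   { return NULL; }
static struct task *pick_idle(struct rq *rq, struct task *prev) { return &idle_task; }

/* Class order is highest to lowest priority, as in the kernel's class list. */
static const pick_fn classes[] = { pick_rt, pick_fair, pick_idle };

static struct task *pick_next(struct rq *rq, struct task *prev, int prev_is_fair)
{
	struct task *p;
	size_t i;

	/* Fast path: prev was CFS and every runnable task is CFS. */
	if (prev_is_fair && rq->nr_running == rq->cfs_nr_running) {
		p = pick_fair(rq, prev);
		if (p)
			return p;
	}
	for (i = 0; i < sizeof(classes) / sizeof(classes[0]); i++) {
		p = classes[i](rq, prev);
		if (p)
			return p;       /* the idle "class" never returns NULL */
	}
	return NULL;
}

int main(void)
{
	struct rq rq = { .nr_running = 2, .cfs_nr_running = 2 };
	printf("picked: %s\n", pick_next(&rq, NULL, 1)->name);
	return 0;
}
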
@@ -2700,13 +2685,10 @@ need_resched:
2700 switch_count = &prev->nvcsw; 2685 switch_count = &prev->nvcsw;
2701 } 2686 }
2702 2687
2703 pre_schedule(rq, prev); 2688 if (prev->on_rq || rq->skip_clock_update < 0)
2704 2689 update_rq_clock(rq);
2705 if (unlikely(!rq->nr_running))
2706 idle_balance(cpu, rq);
2707 2690
2708 put_prev_task(rq, prev); 2691 next = pick_next_task(rq, prev);
2709 next = pick_next_task(rq);
2710 clear_tsk_need_resched(prev); 2692 clear_tsk_need_resched(prev);
2711 clear_preempt_need_resched(); 2693 clear_preempt_need_resched();
2712 rq->skip_clock_update = 0; 2694 rq->skip_clock_update = 0;
@@ -2998,7 +2980,7 @@ void set_user_nice(struct task_struct *p, long nice)
2998 unsigned long flags; 2980 unsigned long flags;
2999 struct rq *rq; 2981 struct rq *rq;
3000 2982
3001 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 2983 if (task_nice(p) == nice || nice < -20 || nice > 19)
3002 return; 2984 return;
3003 /* 2985 /*
3004 * We have to be careful, if called from sys_setpriority(), 2986 * We have to be careful, if called from sys_setpriority(),
@@ -3076,7 +3058,7 @@ SYSCALL_DEFINE1(nice, int, increment)
3076 if (increment > 40) 3058 if (increment > 40)
3077 increment = 40; 3059 increment = 40;
3078 3060
3079 nice = TASK_NICE(current) + increment; 3061 nice = task_nice(current) + increment;
3080 if (nice < -20) 3062 if (nice < -20)
3081 nice = -20; 3063 nice = -20;
3082 if (nice > 19) 3064 if (nice > 19)
@@ -3109,18 +3091,6 @@ int task_prio(const struct task_struct *p)
3109} 3091}
3110 3092
3111/** 3093/**
3112 * task_nice - return the nice value of a given task.
3113 * @p: the task in question.
3114 *
3115 * Return: The nice value [ -20 ... 0 ... 19 ].
3116 */
3117int task_nice(const struct task_struct *p)
3118{
3119 return TASK_NICE(p);
3120}
3121EXPORT_SYMBOL(task_nice);
3122
3123/**
3124 * idle_cpu - is a given cpu idle currently? 3094 * idle_cpu - is a given cpu idle currently?
3125 * @cpu: the processor in question. 3095 * @cpu: the processor in question.
3126 * 3096 *
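
Several of the hunks above replace the TASK_NICE() macro with the task_nice() helper, whose out-of-line, exported copy is removed here because the series turns it into a static inline in a header. The value is derived from the task's static priority; a tiny standalone illustration of that mapping, assuming the usual kernel priority layout (0..99 real-time, 100..139 normal, 120 corresponding to nice 0):

#include <stdio.h>

#define MAX_RT_PRIO   100                  /* priorities 0..99 are real-time */
#define DEFAULT_PRIO  (MAX_RT_PRIO + 20)   /* 120 == nice 0 */

/* nice -20..19 maps onto static priority 100..139 */
static int prio_to_nice(int static_prio)
{
	return static_prio - DEFAULT_PRIO;
}

int main(void)
{
	printf("prio 100 -> nice %d\n", prio_to_nice(100));  /* -20 */
	printf("prio 120 -> nice %d\n", prio_to_nice(120));  /*   0 */
	printf("prio 139 -> nice %d\n", prio_to_nice(139));  /*  19 */
	return 0;
}
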
@@ -3319,7 +3289,7 @@ recheck:
3319 */ 3289 */
3320 if (user && !capable(CAP_SYS_NICE)) { 3290 if (user && !capable(CAP_SYS_NICE)) {
3321 if (fair_policy(policy)) { 3291 if (fair_policy(policy)) {
3322 if (attr->sched_nice < TASK_NICE(p) && 3292 if (attr->sched_nice < task_nice(p) &&
3323 !can_nice(p, attr->sched_nice)) 3293 !can_nice(p, attr->sched_nice))
3324 return -EPERM; 3294 return -EPERM;
3325 } 3295 }
@@ -3343,7 +3313,7 @@ recheck:
3343 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it. 3313 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
3344 */ 3314 */
3345 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) { 3315 if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
3346 if (!can_nice(p, TASK_NICE(p))) 3316 if (!can_nice(p, task_nice(p)))
3347 return -EPERM; 3317 return -EPERM;
3348 } 3318 }
3349 3319
@@ -3383,7 +3353,7 @@ recheck:
3383 * If not changing anything there's no need to proceed further: 3353 * If not changing anything there's no need to proceed further:
3384 */ 3354 */
3385 if (unlikely(policy == p->policy)) { 3355 if (unlikely(policy == p->policy)) {
3386 if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p)) 3356 if (fair_policy(policy) && attr->sched_nice != task_nice(p))
3387 goto change; 3357 goto change;
3388 if (rt_policy(policy) && attr->sched_priority != p->rt_priority) 3358 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
3389 goto change; 3359 goto change;
@@ -3835,7 +3805,7 @@ SYSCALL_DEFINE3(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
3835 else if (task_has_rt_policy(p)) 3805 else if (task_has_rt_policy(p))
3836 attr.sched_priority = p->rt_priority; 3806 attr.sched_priority = p->rt_priority;
3837 else 3807 else
3838 attr.sched_nice = TASK_NICE(p); 3808 attr.sched_nice = task_nice(p);
3839 3809
3840 rcu_read_unlock(); 3810 rcu_read_unlock();
3841 3811
@@ -4751,7 +4721,7 @@ static void migrate_tasks(unsigned int dead_cpu)
4751 if (rq->nr_running == 1) 4721 if (rq->nr_running == 1)
4752 break; 4722 break;
4753 4723
4754 next = pick_next_task(rq); 4724 next = pick_next_task(rq, NULL);
4755 BUG_ON(!next); 4725 BUG_ON(!next);
4756 next->sched_class->put_prev_task(rq, next); 4726 next->sched_class->put_prev_task(rq, next);
4757 4727
@@ -4841,7 +4811,7 @@ set_table_entry(struct ctl_table *entry,
4841static struct ctl_table * 4811static struct ctl_table *
4842sd_alloc_ctl_domain_table(struct sched_domain *sd) 4812sd_alloc_ctl_domain_table(struct sched_domain *sd)
4843{ 4813{
4844 struct ctl_table *table = sd_alloc_ctl_entry(13); 4814 struct ctl_table *table = sd_alloc_ctl_entry(14);
4845 4815
4846 if (table == NULL) 4816 if (table == NULL)
4847 return NULL; 4817 return NULL;
@@ -4869,9 +4839,12 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
4869 sizeof(int), 0644, proc_dointvec_minmax, false); 4839 sizeof(int), 0644, proc_dointvec_minmax, false);
4870 set_table_entry(&table[10], "flags", &sd->flags, 4840 set_table_entry(&table[10], "flags", &sd->flags,
4871 sizeof(int), 0644, proc_dointvec_minmax, false); 4841 sizeof(int), 0644, proc_dointvec_minmax, false);
4872 set_table_entry(&table[11], "name", sd->name, 4842 set_table_entry(&table[11], "max_newidle_lb_cost",
4843 &sd->max_newidle_lb_cost,
4844 sizeof(long), 0644, proc_doulongvec_minmax, false);
4845 set_table_entry(&table[12], "name", sd->name,
4873 CORENAME_MAX_SIZE, 0444, proc_dostring, false); 4846 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
4874 /* &table[12] is terminator */ 4847 /* &table[13] is terminator */
4875 4848
4876 return table; 4849 return table;
4877} 4850}
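
The table grows from 13 to 14 slots because a new "max_newidle_lb_cost" entry is inserted ahead of "name", and the array always carries one extra zeroed slot as a terminator. A small sketch of that "N entries plus terminator" convention, using a toy entry type instead of struct ctl_table:

#include <stdlib.h>

struct entry { const char *name; void *data; };

/* Allocate n zeroed slots; callers fill n-1 of them and leave the last
 * one untouched so that iteration can stop at the first NULL name. */
static struct entry *alloc_table(size_t n)
{
	return calloc(n, sizeof(struct entry));
}

static size_t count_entries(const struct entry *t)
{
	size_t n = 0;
	while (t[n].name)
		n++;
	return n;
}

int main(void)
{
	struct entry *t = alloc_table(14);      /* 13 entries + 1 terminator */
	static int dummy;
	size_t i, n;

	if (!t)
		return 1;
	for (i = 0; i < 13; i++)
		t[i] = (struct entry){ "some_field", &dummy };
	n = count_entries(t);                   /* 13; t[13] marks the end */
	free(t);
	return n == 13 ? 0 : 1;
}
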
@@ -7008,7 +6981,7 @@ void normalize_rt_tasks(void)
7008 * Renice negative nice level userspace 6981 * Renice negative nice level userspace
7009 * tasks back to 0: 6982 * tasks back to 0:
7010 */ 6983 */
7011 if (TASK_NICE(p) < 0 && p->mm) 6984 if (task_nice(p) < 0 && p->mm)
7012 set_user_nice(p, 0); 6985 set_user_nice(p, 0);
7013 continue; 6986 continue;
7014 } 6987 }
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 99947919e30b..58624a65f124 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -142,7 +142,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
 	p->utimescaled += cputime_scaled;
 	account_group_user_time(p, cputime);
 
-	index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
+	index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
 
 	/* Add user time to cpustat. */
 	task_group_account_field(p, index, (__force u64) cputime);
@@ -169,7 +169,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
 	p->gtime += cputime;
 
 	/* Add guest time to cpustat. */
-	if (TASK_NICE(p) > 0) {
+	if (task_nice(p) > 0) {
 		cpustat[CPUTIME_NICE] += (__force u64) cputime;
 		cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
 	} else {
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 0dd5e0971a07..ed31ef66ab9d 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -944,6 +944,8 @@ static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
944 resched_task(rq->curr); 944 resched_task(rq->curr);
945} 945}
946 946
947static int pull_dl_task(struct rq *this_rq);
948
947#endif /* CONFIG_SMP */ 949#endif /* CONFIG_SMP */
948 950
949/* 951/*
@@ -990,7 +992,7 @@ static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
990 return rb_entry(left, struct sched_dl_entity, rb_node); 992 return rb_entry(left, struct sched_dl_entity, rb_node);
991} 993}
992 994
993struct task_struct *pick_next_task_dl(struct rq *rq) 995struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
994{ 996{
995 struct sched_dl_entity *dl_se; 997 struct sched_dl_entity *dl_se;
996 struct task_struct *p; 998 struct task_struct *p;
@@ -998,9 +1000,17 @@ struct task_struct *pick_next_task_dl(struct rq *rq)
998 1000
999 dl_rq = &rq->dl; 1001 dl_rq = &rq->dl;
1000 1002
1003#ifdef CONFIG_SMP
1004 if (dl_task(prev))
1005 pull_dl_task(rq);
1006#endif
1007
1001 if (unlikely(!dl_rq->dl_nr_running)) 1008 if (unlikely(!dl_rq->dl_nr_running))
1002 return NULL; 1009 return NULL;
1003 1010
1011 if (prev)
1012 prev->sched_class->put_prev_task(rq, prev);
1013
1004 dl_se = pick_next_dl_entity(rq, dl_rq); 1014 dl_se = pick_next_dl_entity(rq, dl_rq);
1005 BUG_ON(!dl_se); 1015 BUG_ON(!dl_se);
1006 1016
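
With the .pre_schedule hook gone, pick_next_task_dl() itself is now responsible for pulling deadline tasks from other runqueues before deciding whether it has anything to run, and for calling put_prev_task() on the outgoing task only once it knows it will return something. A condensed sketch of that ordering, with toy types and stand-ins for the kernel helpers:

#include <stddef.h>

struct task { int is_dl; };
struct rq   { unsigned int dl_nr_running; struct task *queued; };

static void pull_dl_tasks(struct rq *rq)            { /* may raise dl_nr_running */ }
static void put_prev(struct rq *rq, struct task *p) { /* bookkeeping for prev */ }

static struct task *pick_next_dl(struct rq *rq, struct task *prev)
{
	/* 1. If prev was a deadline task, try to pull more DL work first
	 *    (this used to live in the removed pre_schedule hook). */
	if (prev && prev->is_dl)
		pull_dl_tasks(rq);

	/* 2. Nothing to run in this class? Let the next class try. */
	if (!rq->dl_nr_running)
		return NULL;

	/* 3. Only now retire the outgoing task... */
	if (prev)
		put_prev(rq, prev);

	/* 4. ...and hand back the earliest-deadline task. */
	return rq->queued;
}
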
@@ -1426,13 +1436,6 @@ skip:
1426 return ret; 1436 return ret;
1427} 1437}
1428 1438
1429static void pre_schedule_dl(struct rq *rq, struct task_struct *prev)
1430{
1431 /* Try to pull other tasks here */
1432 if (dl_task(prev))
1433 pull_dl_task(rq);
1434}
1435
1436static void post_schedule_dl(struct rq *rq) 1439static void post_schedule_dl(struct rq *rq)
1437{ 1440{
1438 push_dl_tasks(rq); 1441 push_dl_tasks(rq);
@@ -1560,7 +1563,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
1560 if (unlikely(p->dl.dl_throttled)) 1563 if (unlikely(p->dl.dl_throttled))
1561 return; 1564 return;
1562 1565
1563 if (p->on_rq || rq->curr != p) { 1566 if (p->on_rq && rq->curr != p) {
1564#ifdef CONFIG_SMP 1567#ifdef CONFIG_SMP
1565 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p)) 1568 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
1566 /* Only reschedule if pushing failed */ 1569 /* Only reschedule if pushing failed */
@@ -1625,7 +1628,6 @@ const struct sched_class dl_sched_class = {
1625 .set_cpus_allowed = set_cpus_allowed_dl, 1628 .set_cpus_allowed = set_cpus_allowed_dl,
1626 .rq_online = rq_online_dl, 1629 .rq_online = rq_online_dl,
1627 .rq_offline = rq_offline_dl, 1630 .rq_offline = rq_offline_dl,
1628 .pre_schedule = pre_schedule_dl,
1629 .post_schedule = post_schedule_dl, 1631 .post_schedule = post_schedule_dl,
1630 .task_woken = task_woken_dl, 1632 .task_woken = task_woken_dl,
1631#endif 1633#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index dd52e7ffb10e..f3344c31632a 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -321,6 +321,7 @@ do { \
 	P(sched_goidle);
 #ifdef CONFIG_SMP
 	P64(avg_idle);
+	P64(max_idle_balance_cost);
 #endif
 
 	P(ttwu_count);
@@ -533,15 +534,15 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
 		unsigned long nr_faults = -1;
 		int cpu_current, home_node;
 
-		if (p->numa_faults)
-			nr_faults = p->numa_faults[2*node + i];
+		if (p->numa_faults_memory)
+			nr_faults = p->numa_faults_memory[2*node + i];
 
 		cpu_current = !i ? (task_node(p) == node) :
 			(pol && node_isset(node, pol->v.nodes));
 
 		home_node = (p->numa_preferred_nid == node);
 
-		SEQ_printf(m, "numa_faults, %d, %d, %d, %d, %ld\n",
+		SEQ_printf(m, "numa_faults_memory, %d, %d, %d, %d, %ld\n",
 			i, node, cpu_current, home_node, nr_faults);
 	}
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 966cc2bfcb77..235cfa7ad8fc 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -322,13 +322,13 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
322 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) 322 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
323 323
324/* Do the two (enqueued) entities belong to the same group ? */ 324/* Do the two (enqueued) entities belong to the same group ? */
325static inline int 325static inline struct cfs_rq *
326is_same_group(struct sched_entity *se, struct sched_entity *pse) 326is_same_group(struct sched_entity *se, struct sched_entity *pse)
327{ 327{
328 if (se->cfs_rq == pse->cfs_rq) 328 if (se->cfs_rq == pse->cfs_rq)
329 return 1; 329 return se->cfs_rq;
330 330
331 return 0; 331 return NULL;
332} 332}
333 333
334static inline struct sched_entity *parent_entity(struct sched_entity *se) 334static inline struct sched_entity *parent_entity(struct sched_entity *se)
@@ -336,17 +336,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
336 return se->parent; 336 return se->parent;
337} 337}
338 338
339/* return depth at which a sched entity is present in the hierarchy */
340static inline int depth_se(struct sched_entity *se)
341{
342 int depth = 0;
343
344 for_each_sched_entity(se)
345 depth++;
346
347 return depth;
348}
349
350static void 339static void
351find_matching_se(struct sched_entity **se, struct sched_entity **pse) 340find_matching_se(struct sched_entity **se, struct sched_entity **pse)
352{ 341{
@@ -360,8 +349,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
360 */ 349 */
361 350
362 /* First walk up until both entities are at same depth */ 351 /* First walk up until both entities are at same depth */
363 se_depth = depth_se(*se); 352 se_depth = (*se)->depth;
364 pse_depth = depth_se(*pse); 353 pse_depth = (*pse)->depth;
365 354
366 while (se_depth > pse_depth) { 355 while (se_depth > pse_depth) {
367 se_depth--; 356 se_depth--;
@@ -426,12 +415,6 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
426#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 415#define for_each_leaf_cfs_rq(rq, cfs_rq) \
427 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL) 416 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
428 417
429static inline int
430is_same_group(struct sched_entity *se, struct sched_entity *pse)
431{
432 return 1;
433}
434
435static inline struct sched_entity *parent_entity(struct sched_entity *se) 418static inline struct sched_entity *parent_entity(struct sched_entity *se)
436{ 419{
437 return NULL; 420 return NULL;
@@ -819,14 +802,6 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
819/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ 802/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
820unsigned int sysctl_numa_balancing_scan_delay = 1000; 803unsigned int sysctl_numa_balancing_scan_delay = 1000;
821 804
822/*
823 * After skipping a page migration on a shared page, skip N more numa page
824 * migrations unconditionally. This reduces the number of NUMA migrations
825 * in shared memory workloads, and has the effect of pulling tasks towards
826 * where their memory lives, over pulling the memory towards the task.
827 */
828unsigned int sysctl_numa_balancing_migrate_deferred = 16;
829
830static unsigned int task_nr_scan_windows(struct task_struct *p) 805static unsigned int task_nr_scan_windows(struct task_struct *p)
831{ 806{
832 unsigned long rss = 0; 807 unsigned long rss = 0;
@@ -893,10 +868,26 @@ struct numa_group {
893 struct list_head task_list; 868 struct list_head task_list;
894 869
895 struct rcu_head rcu; 870 struct rcu_head rcu;
871 nodemask_t active_nodes;
896 unsigned long total_faults; 872 unsigned long total_faults;
873 /*
874 * Faults_cpu is used to decide whether memory should move
875 * towards the CPU. As a consequence, these stats are weighted
876 * more by CPU use than by memory faults.
877 */
878 unsigned long *faults_cpu;
897 unsigned long faults[0]; 879 unsigned long faults[0];
898}; 880};
899 881
882/* Shared or private faults. */
883#define NR_NUMA_HINT_FAULT_TYPES 2
884
885/* Memory and CPU locality */
886#define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
887
888/* Averaged statistics, and temporary buffers. */
889#define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
890
900pid_t task_numa_group_id(struct task_struct *p) 891pid_t task_numa_group_id(struct task_struct *p)
901{ 892{
902 return p->numa_group ? p->numa_group->gid : 0; 893 return p->numa_group ? p->numa_group->gid : 0;
@@ -904,16 +895,16 @@ pid_t task_numa_group_id(struct task_struct *p)
904 895
905static inline int task_faults_idx(int nid, int priv) 896static inline int task_faults_idx(int nid, int priv)
906{ 897{
907 return 2 * nid + priv; 898 return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
908} 899}
909 900
910static inline unsigned long task_faults(struct task_struct *p, int nid) 901static inline unsigned long task_faults(struct task_struct *p, int nid)
911{ 902{
912 if (!p->numa_faults) 903 if (!p->numa_faults_memory)
913 return 0; 904 return 0;
914 905
915 return p->numa_faults[task_faults_idx(nid, 0)] + 906 return p->numa_faults_memory[task_faults_idx(nid, 0)] +
916 p->numa_faults[task_faults_idx(nid, 1)]; 907 p->numa_faults_memory[task_faults_idx(nid, 1)];
917} 908}
918 909
919static inline unsigned long group_faults(struct task_struct *p, int nid) 910static inline unsigned long group_faults(struct task_struct *p, int nid)
@@ -925,6 +916,12 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
925 p->numa_group->faults[task_faults_idx(nid, 1)]; 916 p->numa_group->faults[task_faults_idx(nid, 1)];
926} 917}
927 918
919static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
920{
921 return group->faults_cpu[task_faults_idx(nid, 0)] +
922 group->faults_cpu[task_faults_idx(nid, 1)];
923}
924
928/* 925/*
929 * These return the fraction of accesses done by a particular task, or 926 * These return the fraction of accesses done by a particular task, or
930 * task group, on a particular numa node. The group weight is given a 927 * task group, on a particular numa node. The group weight is given a
@@ -935,7 +932,7 @@ static inline unsigned long task_weight(struct task_struct *p, int nid)
935{ 932{
936 unsigned long total_faults; 933 unsigned long total_faults;
937 934
938 if (!p->numa_faults) 935 if (!p->numa_faults_memory)
939 return 0; 936 return 0;
940 937
941 total_faults = p->total_numa_faults; 938 total_faults = p->total_numa_faults;
@@ -954,6 +951,69 @@ static inline unsigned long group_weight(struct task_struct *p, int nid)
954 return 1000 * group_faults(p, nid) / p->numa_group->total_faults; 951 return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
955} 952}
956 953
954bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
955 int src_nid, int dst_cpu)
956{
957 struct numa_group *ng = p->numa_group;
958 int dst_nid = cpu_to_node(dst_cpu);
959 int last_cpupid, this_cpupid;
960
961 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
962
963 /*
964 * Multi-stage node selection is used in conjunction with a periodic
965 * migration fault to build a temporal task<->page relation. By using
966 * a two-stage filter we remove short/unlikely relations.
967 *
968 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
969 * a task's usage of a particular page (n_p) per total usage of this
970 * page (n_t) (in a given time-span) to a probability.
971 *
972 * Our periodic faults will sample this probability and getting the
973 * same result twice in a row, given these samples are fully
974 * independent, is then given by P(n)^2, provided our sample period
975 * is sufficiently short compared to the usage pattern.
976 *
977 * This quadric squishes small probabilities, making it less likely we
978 * act on an unlikely task<->page relation.
979 */
980 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
981 if (!cpupid_pid_unset(last_cpupid) &&
982 cpupid_to_nid(last_cpupid) != dst_nid)
983 return false;
984
985 /* Always allow migrate on private faults */
986 if (cpupid_match_pid(p, last_cpupid))
987 return true;
988
989 /* A shared fault, but p->numa_group has not been set up yet. */
990 if (!ng)
991 return true;
992
993 /*
994 * Do not migrate if the destination is not a node that
995 * is actively used by this numa group.
996 */
997 if (!node_isset(dst_nid, ng->active_nodes))
998 return false;
999
1000 /*
1001 * Source is a node that is not actively used by this
1002 * numa group, while the destination is. Migrate.
1003 */
1004 if (!node_isset(src_nid, ng->active_nodes))
1005 return true;
1006
1007 /*
1008 * Both source and destination are nodes in active
1009 * use by this numa group. Maximize memory bandwidth
1010 * by migrating from more heavily used groups, to less
1011 * heavily used ones, spreading the load around.
1012 * Use a 1/4 hysteresis to avoid spurious page movement.
1013 */
1014 return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
1015}
1016
957static unsigned long weighted_cpuload(const int cpu); 1017static unsigned long weighted_cpuload(const int cpu);
958static unsigned long source_load(int cpu, int type); 1018static unsigned long source_load(int cpu, int type);
959static unsigned long target_load(int cpu, int type); 1019static unsigned long target_load(int cpu, int type);
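
The comment block in should_numa_migrate_memory() describes a two-stage filter: a page is only considered for migration toward the faulting CPU's node once two consecutive NUMA hinting faults on it came from that same node, which squashes short-lived task<->page relations (roughly P(n)^2 per the comment). A toy model of just that first filtering step, using a per-page "last faulting node" field in place of the kernel's packed cpupid value (the later group and active-node checks are omitted):

#include <stdbool.h>
#include <stdio.h>

struct page { int last_nid; };          /* stand-in for the packed cpupid field */

#define NID_UNSET (-1)

/* Returns true if this fault may trigger a migration toward dst_nid. */
static bool should_migrate(struct page *pg, int dst_nid)
{
	int last = pg->last_nid;

	pg->last_nid = dst_nid;         /* record this fault for next time */

	/* Only migrate if the previous fault already came from the same
	 * destination node (or no history has been recorded yet). */
	return last == NID_UNSET || last == dst_nid;
}

int main(void)
{
	struct page pg = { NID_UNSET };

	printf("%d\n", should_migrate(&pg, 1));   /* 1: no history yet */
	printf("%d\n", should_migrate(&pg, 0));   /* 0: last fault was nid 1 */
	printf("%d\n", should_migrate(&pg, 0));   /* 1: two faults from nid 0 */
	return 0;
}
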
@@ -1267,7 +1327,7 @@ static int task_numa_migrate(struct task_struct *p)
1267static void numa_migrate_preferred(struct task_struct *p) 1327static void numa_migrate_preferred(struct task_struct *p)
1268{ 1328{
1269 /* This task has no NUMA fault statistics yet */ 1329 /* This task has no NUMA fault statistics yet */
1270 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) 1330 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
1271 return; 1331 return;
1272 1332
1273 /* Periodically retry migrating the task to the preferred node */ 1333 /* Periodically retry migrating the task to the preferred node */
@@ -1282,6 +1342,38 @@ static void numa_migrate_preferred(struct task_struct *p)
1282} 1342}
1283 1343
1284/* 1344/*
1345 * Find the nodes on which the workload is actively running. We do this by
1346 * tracking the nodes from which NUMA hinting faults are triggered. This can
1347 * be different from the set of nodes where the workload's memory is currently
1348 * located.
1349 *
1350 * The bitmask is used to make smarter decisions on when to do NUMA page
1351 * migrations, To prevent flip-flopping, and excessive page migrations, nodes
1352 * are added when they cause over 6/16 of the maximum number of faults, but
1353 * only removed when they drop below 3/16.
1354 */
1355static void update_numa_active_node_mask(struct numa_group *numa_group)
1356{
1357 unsigned long faults, max_faults = 0;
1358 int nid;
1359
1360 for_each_online_node(nid) {
1361 faults = group_faults_cpu(numa_group, nid);
1362 if (faults > max_faults)
1363 max_faults = faults;
1364 }
1365
1366 for_each_online_node(nid) {
1367 faults = group_faults_cpu(numa_group, nid);
1368 if (!node_isset(nid, numa_group->active_nodes)) {
1369 if (faults > max_faults * 6 / 16)
1370 node_set(nid, numa_group->active_nodes);
1371 } else if (faults < max_faults * 3 / 16)
1372 node_clear(nid, numa_group->active_nodes);
1373 }
1374}
1375
1376/*
1285 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS 1377 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1286 * increments. The more local the fault statistics are, the higher the scan 1378 * increments. The more local the fault statistics are, the higher the scan
1287 * period will be for the next scan window. If local/remote ratio is below 1379 * period will be for the next scan window. If local/remote ratio is below
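
update_numa_active_node_mask() builds the group's set of actively used nodes with hysteresis: a node joins the mask once its CPU-fault count exceeds 6/16 of the busiest node's, and only leaves again when it drops below 3/16, which keeps the mask from flapping. The same thresholding in a small standalone form, with plain arrays instead of nodemask_t:

#include <stdbool.h>
#include <stdio.h>

#define NR_NODES 4

static void update_active_nodes(const unsigned long faults[NR_NODES],
				bool active[NR_NODES])
{
	unsigned long max_faults = 0;
	int nid;

	for (nid = 0; nid < NR_NODES; nid++)
		if (faults[nid] > max_faults)
			max_faults = faults[nid];

	for (nid = 0; nid < NR_NODES; nid++) {
		if (!active[nid]) {
			if (faults[nid] > max_faults * 6 / 16)
				active[nid] = true;     /* join above 6/16 */
		} else if (faults[nid] < max_faults * 3 / 16) {
			active[nid] = false;            /* leave below 3/16 */
		}
	}
}

int main(void)
{
	unsigned long faults[NR_NODES] = { 1600, 700, 200, 0 };
	bool active[NR_NODES] = { false };
	int nid;

	update_active_nodes(faults, active);
	for (nid = 0; nid < NR_NODES; nid++)
		printf("node %d: %s\n", nid, active[nid] ? "active" : "idle");
	return 0;
}
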
@@ -1355,11 +1447,41 @@ static void update_task_scan_period(struct task_struct *p,
1355 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); 1447 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1356} 1448}
1357 1449
1450/*
1451 * Get the fraction of time the task has been running since the last
1452 * NUMA placement cycle. The scheduler keeps similar statistics, but
1453 * decays those on a 32ms period, which is orders of magnitude off
1454 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
1455 * stats only if the task is so new there are no NUMA statistics yet.
1456 */
1457static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1458{
1459 u64 runtime, delta, now;
1460 /* Use the start of this time slice to avoid calculations. */
1461 now = p->se.exec_start;
1462 runtime = p->se.sum_exec_runtime;
1463
1464 if (p->last_task_numa_placement) {
1465 delta = runtime - p->last_sum_exec_runtime;
1466 *period = now - p->last_task_numa_placement;
1467 } else {
1468 delta = p->se.avg.runnable_avg_sum;
1469 *period = p->se.avg.runnable_avg_period;
1470 }
1471
1472 p->last_sum_exec_runtime = runtime;
1473 p->last_task_numa_placement = now;
1474
1475 return delta;
1476}
1477
1358static void task_numa_placement(struct task_struct *p) 1478static void task_numa_placement(struct task_struct *p)
1359{ 1479{
1360 int seq, nid, max_nid = -1, max_group_nid = -1; 1480 int seq, nid, max_nid = -1, max_group_nid = -1;
1361 unsigned long max_faults = 0, max_group_faults = 0; 1481 unsigned long max_faults = 0, max_group_faults = 0;
1362 unsigned long fault_types[2] = { 0, 0 }; 1482 unsigned long fault_types[2] = { 0, 0 };
1483 unsigned long total_faults;
1484 u64 runtime, period;
1363 spinlock_t *group_lock = NULL; 1485 spinlock_t *group_lock = NULL;
1364 1486
1365 seq = ACCESS_ONCE(p->mm->numa_scan_seq); 1487 seq = ACCESS_ONCE(p->mm->numa_scan_seq);
@@ -1368,6 +1490,10 @@ static void task_numa_placement(struct task_struct *p)
1368 p->numa_scan_seq = seq; 1490 p->numa_scan_seq = seq;
1369 p->numa_scan_period_max = task_scan_max(p); 1491 p->numa_scan_period_max = task_scan_max(p);
1370 1492
1493 total_faults = p->numa_faults_locality[0] +
1494 p->numa_faults_locality[1];
1495 runtime = numa_get_avg_runtime(p, &period);
1496
1371 /* If the task is part of a group prevent parallel updates to group stats */ 1497 /* If the task is part of a group prevent parallel updates to group stats */
1372 if (p->numa_group) { 1498 if (p->numa_group) {
1373 group_lock = &p->numa_group->lock; 1499 group_lock = &p->numa_group->lock;
@@ -1379,24 +1505,37 @@ static void task_numa_placement(struct task_struct *p)
1379 unsigned long faults = 0, group_faults = 0; 1505 unsigned long faults = 0, group_faults = 0;
1380 int priv, i; 1506 int priv, i;
1381 1507
1382 for (priv = 0; priv < 2; priv++) { 1508 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
1383 long diff; 1509 long diff, f_diff, f_weight;
1384 1510
1385 i = task_faults_idx(nid, priv); 1511 i = task_faults_idx(nid, priv);
1386 diff = -p->numa_faults[i];
1387 1512
1388 /* Decay existing window, copy faults since last scan */ 1513 /* Decay existing window, copy faults since last scan */
1389 p->numa_faults[i] >>= 1; 1514 diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2;
1390 p->numa_faults[i] += p->numa_faults_buffer[i]; 1515 fault_types[priv] += p->numa_faults_buffer_memory[i];
1391 fault_types[priv] += p->numa_faults_buffer[i]; 1516 p->numa_faults_buffer_memory[i] = 0;
1392 p->numa_faults_buffer[i] = 0;
1393 1517
1394 faults += p->numa_faults[i]; 1518 /*
1395 diff += p->numa_faults[i]; 1519 * Normalize the faults_from, so all tasks in a group
1520 * count according to CPU use, instead of by the raw
1521 * number of faults. Tasks with little runtime have
1522 * little over-all impact on throughput, and thus their
1523 * faults are less important.
1524 */
1525 f_weight = div64_u64(runtime << 16, period + 1);
1526 f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /
1527 (total_faults + 1);
1528 f_diff = f_weight - p->numa_faults_cpu[i] / 2;
1529 p->numa_faults_buffer_cpu[i] = 0;
1530
1531 p->numa_faults_memory[i] += diff;
1532 p->numa_faults_cpu[i] += f_diff;
1533 faults += p->numa_faults_memory[i];
1396 p->total_numa_faults += diff; 1534 p->total_numa_faults += diff;
1397 if (p->numa_group) { 1535 if (p->numa_group) {
1398 /* safe because we can only change our own group */ 1536 /* safe because we can only change our own group */
1399 p->numa_group->faults[i] += diff; 1537 p->numa_group->faults[i] += diff;
1538 p->numa_group->faults_cpu[i] += f_diff;
1400 p->numa_group->total_faults += diff; 1539 p->numa_group->total_faults += diff;
1401 group_faults += p->numa_group->faults[i]; 1540 group_faults += p->numa_group->faults[i];
1402 } 1541 }
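
Each scan window folds the freshly collected per-node counters into long-running averages by halving the old value and adding the new one (diff = buffer - old/2), and the CPU-side counters are additionally scaled by how much CPU time the task actually used over the placement period, so mostly idle tasks do not dominate the group statistics. A compact sketch of both update rules; the variable names are made up and only mirror the numa_faults_* arrays:

#include <stdint.h>
#include <stdio.h>

/* One bucket of the memory-fault average: halve the old value, add the
 * counts collected since the last placement pass. */
static long decay_mem(unsigned long *avg, unsigned long *buf)
{
	long diff = (long)*buf - (long)(*avg / 2);

	*avg += diff;           /* equivalent to: avg = avg/2 + buf */
	*buf  = 0;
	return diff;            /* also folded into the group totals */
}

/* CPU-fault buckets are first normalized by the task's recent CPU use
 * (runtime/period in 16.16 fixed point) and by its total fault count,
 * so they live in a weighted domain rather than raw fault counts. */
static long decay_cpu(unsigned long *avg, unsigned long *buf,
		      uint64_t runtime, uint64_t period,
		      unsigned long total_faults)
{
	uint64_t w = (runtime << 16) / (period + 1);
	long f_weight = (long)((w * *buf) / (total_faults + 1));
	long f_diff = f_weight - (long)(*avg / 2);

	*avg += f_diff;
	*buf  = 0;
	return f_diff;
}

int main(void)
{
	unsigned long mem_avg = 100, mem_buf = 40;
	unsigned long cpu_avg = 0, cpu_buf = 40;

	decay_mem(&mem_avg, &mem_buf);                /* 100 -> 100/2 + 40 = 90 */
	decay_cpu(&cpu_avg, &cpu_buf, 50, 100, 80);   /* ran ~50% of the period */
	printf("mem %lu cpu %lu\n", mem_avg, cpu_avg);
	return 0;
}
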
@@ -1416,6 +1555,7 @@ static void task_numa_placement(struct task_struct *p)
1416 update_task_scan_period(p, fault_types[0], fault_types[1]); 1555 update_task_scan_period(p, fault_types[0], fault_types[1]);
1417 1556
1418 if (p->numa_group) { 1557 if (p->numa_group) {
1558 update_numa_active_node_mask(p->numa_group);
1419 /* 1559 /*
1420 * If the preferred task and group nids are different, 1560 * If the preferred task and group nids are different,
1421 * iterate over the nodes again to find the best place. 1561 * iterate over the nodes again to find the best place.
@@ -1465,7 +1605,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1465 1605
1466 if (unlikely(!p->numa_group)) { 1606 if (unlikely(!p->numa_group)) {
1467 unsigned int size = sizeof(struct numa_group) + 1607 unsigned int size = sizeof(struct numa_group) +
1468 2*nr_node_ids*sizeof(unsigned long); 1608 4*nr_node_ids*sizeof(unsigned long);
1469 1609
1470 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); 1610 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
1471 if (!grp) 1611 if (!grp)
@@ -1475,9 +1615,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1475 spin_lock_init(&grp->lock); 1615 spin_lock_init(&grp->lock);
1476 INIT_LIST_HEAD(&grp->task_list); 1616 INIT_LIST_HEAD(&grp->task_list);
1477 grp->gid = p->pid; 1617 grp->gid = p->pid;
1618 /* Second half of the array tracks nids where faults happen */
1619 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
1620 nr_node_ids;
1621
1622 node_set(task_node(current), grp->active_nodes);
1478 1623
1479 for (i = 0; i < 2*nr_node_ids; i++) 1624 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1480 grp->faults[i] = p->numa_faults[i]; 1625 grp->faults[i] = p->numa_faults_memory[i];
1481 1626
1482 grp->total_faults = p->total_numa_faults; 1627 grp->total_faults = p->total_numa_faults;
1483 1628
@@ -1534,9 +1679,9 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1534 1679
1535 double_lock(&my_grp->lock, &grp->lock); 1680 double_lock(&my_grp->lock, &grp->lock);
1536 1681
1537 for (i = 0; i < 2*nr_node_ids; i++) { 1682 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
1538 my_grp->faults[i] -= p->numa_faults[i]; 1683 my_grp->faults[i] -= p->numa_faults_memory[i];
1539 grp->faults[i] += p->numa_faults[i]; 1684 grp->faults[i] += p->numa_faults_memory[i];
1540 } 1685 }
1541 my_grp->total_faults -= p->total_numa_faults; 1686 my_grp->total_faults -= p->total_numa_faults;
1542 grp->total_faults += p->total_numa_faults; 1687 grp->total_faults += p->total_numa_faults;
@@ -1562,12 +1707,12 @@ void task_numa_free(struct task_struct *p)
1562{ 1707{
1563 struct numa_group *grp = p->numa_group; 1708 struct numa_group *grp = p->numa_group;
1564 int i; 1709 int i;
1565 void *numa_faults = p->numa_faults; 1710 void *numa_faults = p->numa_faults_memory;
1566 1711
1567 if (grp) { 1712 if (grp) {
1568 spin_lock(&grp->lock); 1713 spin_lock(&grp->lock);
1569 for (i = 0; i < 2*nr_node_ids; i++) 1714 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
1570 grp->faults[i] -= p->numa_faults[i]; 1715 grp->faults[i] -= p->numa_faults_memory[i];
1571 grp->total_faults -= p->total_numa_faults; 1716 grp->total_faults -= p->total_numa_faults;
1572 1717
1573 list_del(&p->numa_entry); 1718 list_del(&p->numa_entry);
@@ -1577,18 +1722,21 @@ void task_numa_free(struct task_struct *p)
1577 put_numa_group(grp); 1722 put_numa_group(grp);
1578 } 1723 }
1579 1724
1580 p->numa_faults = NULL; 1725 p->numa_faults_memory = NULL;
1581 p->numa_faults_buffer = NULL; 1726 p->numa_faults_buffer_memory = NULL;
1727 p->numa_faults_cpu= NULL;
1728 p->numa_faults_buffer_cpu = NULL;
1582 kfree(numa_faults); 1729 kfree(numa_faults);
1583} 1730}
1584 1731
1585/* 1732/*
1586 * Got a PROT_NONE fault for a page on @node. 1733 * Got a PROT_NONE fault for a page on @node.
1587 */ 1734 */
1588void task_numa_fault(int last_cpupid, int node, int pages, int flags) 1735void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
1589{ 1736{
1590 struct task_struct *p = current; 1737 struct task_struct *p = current;
1591 bool migrated = flags & TNF_MIGRATED; 1738 bool migrated = flags & TNF_MIGRATED;
1739 int cpu_node = task_node(current);
1592 int priv; 1740 int priv;
1593 1741
1594 if (!numabalancing_enabled) 1742 if (!numabalancing_enabled)
@@ -1603,16 +1751,24 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
1603 return; 1751 return;
1604 1752
1605 /* Allocate buffer to track faults on a per-node basis */ 1753 /* Allocate buffer to track faults on a per-node basis */
1606 if (unlikely(!p->numa_faults)) { 1754 if (unlikely(!p->numa_faults_memory)) {
1607 int size = sizeof(*p->numa_faults) * 2 * nr_node_ids; 1755 int size = sizeof(*p->numa_faults_memory) *
1756 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
1608 1757
1609 /* numa_faults and numa_faults_buffer share the allocation */ 1758 p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
1610 p->numa_faults = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); 1759 if (!p->numa_faults_memory)
1611 if (!p->numa_faults)
1612 return; 1760 return;
1613 1761
1614 BUG_ON(p->numa_faults_buffer); 1762 BUG_ON(p->numa_faults_buffer_memory);
1615 p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids); 1763 /*
1764 * The averaged statistics, shared & private, memory & cpu,
1765 * occupy the first half of the array. The second half of the
1766 * array is for current counters, which are averaged into the
1767 * first set by task_numa_placement.
1768 */
1769 p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
1770 p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
1771 p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
1616 p->total_numa_faults = 0; 1772 p->total_numa_faults = 0;
1617 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); 1773 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1618 } 1774 }
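
The allocation comment above is worth spelling out: a single kzalloc'd buffer of NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids entries is carved into four equal regions, the decayed memory-fault and CPU-fault averages in the first half and the per-scan buffers for each in the second half, with the individual pointers simply offset into it. A userspace sketch of that carving; the field names mirror the task_struct members but the struct itself is only an illustration:

#include <stdio.h>
#include <stdlib.h>

#define FAULT_TYPES     2                   /* shared, private      */
#define FAULT_STATS     (FAULT_TYPES * 2)   /* memory + cpu         */
#define FAULT_BUCKETS   (FAULT_STATS * 2)   /* averages + buffers   */

struct numa_stats {
	unsigned long *faults_memory;        /* decayed averages        */
	unsigned long *faults_cpu;
	unsigned long *faults_buffer_memory; /* counters for this scan  */
	unsigned long *faults_buffer_cpu;
};

static int alloc_faults(struct numa_stats *s, int nr_node_ids)
{
	unsigned long *buf = calloc(FAULT_BUCKETS * nr_node_ids, sizeof(*buf));

	if (!buf)
		return -1;
	s->faults_memory        = buf;
	s->faults_cpu           = buf + 2 * nr_node_ids;
	s->faults_buffer_memory = buf + 4 * nr_node_ids;
	s->faults_buffer_cpu    = buf + 6 * nr_node_ids;
	return 0;
}

int main(void)
{
	struct numa_stats s;

	if (alloc_faults(&s, 4))
		return 1;
	/* freeing faults_memory releases all four regions at once */
	free(s.faults_memory);
	return 0;
}
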
@@ -1641,7 +1797,8 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
1641 if (migrated) 1797 if (migrated)
1642 p->numa_pages_migrated += pages; 1798 p->numa_pages_migrated += pages;
1643 1799
1644 p->numa_faults_buffer[task_faults_idx(node, priv)] += pages; 1800 p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
1801 p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
1645 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; 1802 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
1646} 1803}
1647 1804
@@ -2414,7 +2571,8 @@ void idle_exit_fair(struct rq *this_rq)
2414 update_rq_runnable_avg(this_rq, 0); 2571 update_rq_runnable_avg(this_rq, 0);
2415} 2572}
2416 2573
2417#else 2574#else /* CONFIG_SMP */
2575
2418static inline void update_entity_load_avg(struct sched_entity *se, 2576static inline void update_entity_load_avg(struct sched_entity *se,
2419 int update_cfs_rq) {} 2577 int update_cfs_rq) {}
2420static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {} 2578static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
@@ -2426,7 +2584,7 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
2426 int sleep) {} 2584 int sleep) {}
2427static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, 2585static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
2428 int force_update) {} 2586 int force_update) {}
2429#endif 2587#endif /* CONFIG_SMP */
2430 2588
2431static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 2589static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
2432{ 2590{
@@ -2576,10 +2734,10 @@ static void __clear_buddies_last(struct sched_entity *se)
2576{ 2734{
2577 for_each_sched_entity(se) { 2735 for_each_sched_entity(se) {
2578 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2736 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2579 if (cfs_rq->last == se) 2737 if (cfs_rq->last != se)
2580 cfs_rq->last = NULL;
2581 else
2582 break; 2738 break;
2739
2740 cfs_rq->last = NULL;
2583 } 2741 }
2584} 2742}
2585 2743
@@ -2587,10 +2745,10 @@ static void __clear_buddies_next(struct sched_entity *se)
2587{ 2745{
2588 for_each_sched_entity(se) { 2746 for_each_sched_entity(se) {
2589 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2747 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2590 if (cfs_rq->next == se) 2748 if (cfs_rq->next != se)
2591 cfs_rq->next = NULL;
2592 else
2593 break; 2749 break;
2750
2751 cfs_rq->next = NULL;
2594 } 2752 }
2595} 2753}
2596 2754
@@ -2598,10 +2756,10 @@ static void __clear_buddies_skip(struct sched_entity *se)
2598{ 2756{
2599 for_each_sched_entity(se) { 2757 for_each_sched_entity(se) {
2600 struct cfs_rq *cfs_rq = cfs_rq_of(se); 2758 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2601 if (cfs_rq->skip == se) 2759 if (cfs_rq->skip != se)
2602 cfs_rq->skip = NULL;
2603 else
2604 break; 2760 break;
2761
2762 cfs_rq->skip = NULL;
2605 } 2763 }
2606} 2764}
2607 2765
@@ -2744,17 +2902,36 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
2744 * 3) pick the "last" process, for cache locality 2902 * 3) pick the "last" process, for cache locality
2745 * 4) do not run the "skip" process, if something else is available 2903 * 4) do not run the "skip" process, if something else is available
2746 */ 2904 */
2747static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) 2905static struct sched_entity *
2906pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
2748{ 2907{
2749 struct sched_entity *se = __pick_first_entity(cfs_rq); 2908 struct sched_entity *left = __pick_first_entity(cfs_rq);
2750 struct sched_entity *left = se; 2909 struct sched_entity *se;
2910
2911 /*
2912 * If curr is set we have to see if its left of the leftmost entity
2913 * still in the tree, provided there was anything in the tree at all.
2914 */
2915 if (!left || (curr && entity_before(curr, left)))
2916 left = curr;
2917
2918 se = left; /* ideally we run the leftmost entity */
2751 2919
2752 /* 2920 /*
2753 * Avoid running the skip buddy, if running something else can 2921 * Avoid running the skip buddy, if running something else can
2754 * be done without getting too unfair. 2922 * be done without getting too unfair.
2755 */ 2923 */
2756 if (cfs_rq->skip == se) { 2924 if (cfs_rq->skip == se) {
2757 struct sched_entity *second = __pick_next_entity(se); 2925 struct sched_entity *second;
2926
2927 if (se == curr) {
2928 second = __pick_first_entity(cfs_rq);
2929 } else {
2930 second = __pick_next_entity(se);
2931 if (!second || (curr && entity_before(curr, second)))
2932 second = curr;
2933 }
2934
2758 if (second && wakeup_preempt_entity(second, left) < 1) 2935 if (second && wakeup_preempt_entity(second, left) < 1)
2759 se = second; 2936 se = second;
2760 } 2937 }
@@ -2776,7 +2953,7 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
2776 return se; 2953 return se;
2777} 2954}
2778 2955
2779static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq); 2956static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
2780 2957
2781static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) 2958static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
2782{ 2959{
@@ -3431,22 +3608,23 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
3431} 3608}
3432 3609
3433/* conditionally throttle active cfs_rq's from put_prev_entity() */ 3610/* conditionally throttle active cfs_rq's from put_prev_entity() */
3434static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) 3611static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3435{ 3612{
3436 if (!cfs_bandwidth_used()) 3613 if (!cfs_bandwidth_used())
3437 return; 3614 return false;
3438 3615
3439 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) 3616 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
3440 return; 3617 return false;
3441 3618
3442 /* 3619 /*
3443 * it's possible for a throttled entity to be forced into a running 3620 * it's possible for a throttled entity to be forced into a running
3444 * state (e.g. set_curr_task), in this case we're finished. 3621 * state (e.g. set_curr_task), in this case we're finished.
3445 */ 3622 */
3446 if (cfs_rq_throttled(cfs_rq)) 3623 if (cfs_rq_throttled(cfs_rq))
3447 return; 3624 return true;
3448 3625
3449 throttle_cfs_rq(cfs_rq); 3626 throttle_cfs_rq(cfs_rq);
3627 return true;
3450} 3628}
3451 3629
3452static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) 3630static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
@@ -3556,7 +3734,7 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3556} 3734}
3557 3735
3558static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} 3736static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
3559static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 3737static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
3560static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} 3738static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
3561static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 3739static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
3562 3740
@@ -4492,26 +4670,125 @@ preempt:
4492 set_last_buddy(se); 4670 set_last_buddy(se);
4493} 4671}
4494 4672
4495static struct task_struct *pick_next_task_fair(struct rq *rq) 4673static struct task_struct *
4674pick_next_task_fair(struct rq *rq, struct task_struct *prev)
4496{ 4675{
4497 struct task_struct *p;
4498 struct cfs_rq *cfs_rq = &rq->cfs; 4676 struct cfs_rq *cfs_rq = &rq->cfs;
4499 struct sched_entity *se; 4677 struct sched_entity *se;
4678 struct task_struct *p;
4500 4679
4680again: __maybe_unused
4681#ifdef CONFIG_FAIR_GROUP_SCHED
4501 if (!cfs_rq->nr_running) 4682 if (!cfs_rq->nr_running)
4502 return NULL; 4683 goto idle;
4684
4685 if (!prev || prev->sched_class != &fair_sched_class)
4686 goto simple;
4687
4688 /*
4689 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
4690 * likely that a next task is from the same cgroup as the current.
4691 *
4692 * Therefore attempt to avoid putting and setting the entire cgroup
4693 * hierarchy, only change the part that actually changes.
4694 */
4503 4695
4504 do { 4696 do {
4505 se = pick_next_entity(cfs_rq); 4697 struct sched_entity *curr = cfs_rq->curr;
4698
4699 /*
4700 * Since we got here without doing put_prev_entity() we also
4701 * have to consider cfs_rq->curr. If it is still a runnable
4702 * entity, update_curr() will update its vruntime, otherwise
4703 * forget we've ever seen it.
4704 */
4705 if (curr && curr->on_rq)
4706 update_curr(cfs_rq);
4707 else
4708 curr = NULL;
4709
4710 /*
4711 * This call to check_cfs_rq_runtime() will do the throttle and
4712 * dequeue its entity in the parent(s). Therefore the 'simple'
4713 * nr_running test will indeed be correct.
4714 */
4715 if (unlikely(check_cfs_rq_runtime(cfs_rq)))
4716 goto simple;
4717
4718 se = pick_next_entity(cfs_rq, curr);
4719 cfs_rq = group_cfs_rq(se);
4720 } while (cfs_rq);
4721
4722 p = task_of(se);
4723
4724 /*
4725 * Since we haven't yet done put_prev_entity and if the selected task
4726 * is a different task than we started out with, try and touch the
4727 * least amount of cfs_rqs.
4728 */
4729 if (prev != p) {
4730 struct sched_entity *pse = &prev->se;
4731
4732 while (!(cfs_rq = is_same_group(se, pse))) {
4733 int se_depth = se->depth;
4734 int pse_depth = pse->depth;
4735
4736 if (se_depth <= pse_depth) {
4737 put_prev_entity(cfs_rq_of(pse), pse);
4738 pse = parent_entity(pse);
4739 }
4740 if (se_depth >= pse_depth) {
4741 set_next_entity(cfs_rq_of(se), se);
4742 se = parent_entity(se);
4743 }
4744 }
4745
4746 put_prev_entity(cfs_rq, pse);
4747 set_next_entity(cfs_rq, se);
4748 }
4749
4750 if (hrtick_enabled(rq))
4751 hrtick_start_fair(rq, p);
4752
4753 return p;
4754simple:
4755 cfs_rq = &rq->cfs;
4756#endif
4757
4758 if (!cfs_rq->nr_running)
4759 goto idle;
4760
4761 if (prev)
4762 prev->sched_class->put_prev_task(rq, prev);
4763
4764 do {
4765 se = pick_next_entity(cfs_rq, NULL);
4506 set_next_entity(cfs_rq, se); 4766 set_next_entity(cfs_rq, se);
4507 cfs_rq = group_cfs_rq(se); 4767 cfs_rq = group_cfs_rq(se);
4508 } while (cfs_rq); 4768 } while (cfs_rq);
4509 4769
4510 p = task_of(se); 4770 p = task_of(se);
4771
4511 if (hrtick_enabled(rq)) 4772 if (hrtick_enabled(rq))
4512 hrtick_start_fair(rq, p); 4773 hrtick_start_fair(rq, p);
4513 4774
4514 return p; 4775 return p;
4776
4777idle:
4778#ifdef CONFIG_SMP
4779 idle_enter_fair(rq);
4780 /*
4781 * We must set idle_stamp _before_ calling idle_balance(), such that we
4782 * measure the duration of idle_balance() as idle time.
4783 */
4784 rq->idle_stamp = rq_clock(rq);
4785 if (idle_balance(rq)) { /* drops rq->lock */
4786 rq->idle_stamp = 0;
4787 goto again;
4788 }
4789#endif
4790
4791 return NULL;
4515} 4792}
4516 4793
4517/* 4794/*
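
When the newly picked task differs from prev, the code above avoids touching the whole cgroup hierarchy: it walks the two sched_entity chains upward, comparing the cached ->depth values, until both sit in the same cfs_rq, calling put_prev_entity() on prev's side and set_next_entity() on the new side only for the levels that actually change. The walk itself, reduced to a standalone sketch with toy node types:

#include <stdio.h>
#include <stddef.h>

struct se {
	int depth;              /* 0 at the root, +1 per nested cgroup  */
	int group;              /* id of the cfs_rq this entity runs on */
	struct se *parent;
};

static void put_prev(struct se *se) { printf("put_prev at depth %d\n", se->depth); }
static void set_next(struct se *se) { printf("set_next at depth %d\n", se->depth); }

static int same_group(const struct se *a, const struct se *b)
{
	return a->group == b->group;    /* toy stand-in for comparing se->cfs_rq */
}

/* Retire pse's chain and install se's chain, but only for the levels
 * of the hierarchy where the two actually differ. */
static void switch_entities(struct se *se, struct se *pse)
{
	while (!same_group(se, pse)) {
		int sd = se->depth, pd = pse->depth;

		if (sd <= pd) {
			put_prev(pse);
			pse = pse->parent;
		}
		if (sd >= pd) {
			set_next(se);
			se = se->parent;
		}
	}
	put_prev(pse);                  /* common cfs_rq reached */
	set_next(se);
}

int main(void)
{
	/* Two sibling cgroups (cfs_rq ids 1 and 2) under the root (id 0). */
	struct se grp_a = { 0, 0, NULL }, grp_b = { 0, 0, NULL };
	struct se prev  = { 1, 1, &grp_a }, next = { 1, 2, &grp_b };

	switch_entities(&next, &prev);
	return 0;
}
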
@@ -4783,7 +5060,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
4783{ 5060{
4784 int src_nid, dst_nid; 5061 int src_nid, dst_nid;
4785 5062
4786 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults || 5063 if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
4787 !(env->sd->flags & SD_NUMA)) { 5064 !(env->sd->flags & SD_NUMA)) {
4788 return false; 5065 return false;
4789 } 5066 }
@@ -4814,7 +5091,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
4814 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER)) 5091 if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
4815 return false; 5092 return false;
4816 5093
4817 if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) 5094 if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA))
4818 return false; 5095 return false;
4819 5096
4820 src_nid = cpu_to_node(env->src_cpu); 5097 src_nid = cpu_to_node(env->src_cpu);
@@ -6357,17 +6634,16 @@ out:
6357 * idle_balance is called by schedule() if this_cpu is about to become 6634 * idle_balance is called by schedule() if this_cpu is about to become
6358 * idle. Attempts to pull tasks from other CPUs. 6635 * idle. Attempts to pull tasks from other CPUs.
6359 */ 6636 */
6360void idle_balance(int this_cpu, struct rq *this_rq) 6637int idle_balance(struct rq *this_rq)
6361{ 6638{
6362 struct sched_domain *sd; 6639 struct sched_domain *sd;
6363 int pulled_task = 0; 6640 int pulled_task = 0;
6364 unsigned long next_balance = jiffies + HZ; 6641 unsigned long next_balance = jiffies + HZ;
6365 u64 curr_cost = 0; 6642 u64 curr_cost = 0;
6366 6643 int this_cpu = this_rq->cpu;
6367 this_rq->idle_stamp = rq_clock(this_rq);
6368 6644
6369 if (this_rq->avg_idle < sysctl_sched_migration_cost) 6645 if (this_rq->avg_idle < sysctl_sched_migration_cost)
6370 return; 6646 return 0;
6371 6647
6372 /* 6648 /*
6373 * Drop the rq->lock, but keep IRQ/preempt disabled. 6649 * Drop the rq->lock, but keep IRQ/preempt disabled.
@@ -6405,15 +6681,20 @@ void idle_balance(int this_cpu, struct rq *this_rq)
6405 interval = msecs_to_jiffies(sd->balance_interval); 6681 interval = msecs_to_jiffies(sd->balance_interval);
6406 if (time_after(next_balance, sd->last_balance + interval)) 6682 if (time_after(next_balance, sd->last_balance + interval))
6407 next_balance = sd->last_balance + interval; 6683 next_balance = sd->last_balance + interval;
6408 if (pulled_task) { 6684 if (pulled_task)
6409 this_rq->idle_stamp = 0;
6410 break; 6685 break;
6411 }
6412 } 6686 }
6413 rcu_read_unlock(); 6687 rcu_read_unlock();
6414 6688
6415 raw_spin_lock(&this_rq->lock); 6689 raw_spin_lock(&this_rq->lock);
6416 6690
6691 /*
6692 * While browsing the domains, we released the rq lock.
6693 * A task could have be enqueued in the meantime
6694 */
6695 if (this_rq->nr_running && !pulled_task)
6696 return 1;
6697
6417 if (pulled_task || time_after(jiffies, this_rq->next_balance)) { 6698 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
6418 /* 6699 /*
6419 * We are going idle. next_balance may be set based on 6700 * We are going idle. next_balance may be set based on
@@ -6424,6 +6705,8 @@ void idle_balance(int this_cpu, struct rq *this_rq)
6424 6705
6425 if (curr_cost > this_rq->max_idle_balance_cost) 6706 if (curr_cost > this_rq->max_idle_balance_cost)
6426 this_rq->max_idle_balance_cost = curr_cost; 6707 this_rq->max_idle_balance_cost = curr_cost;
6708
6709 return pulled_task;
6427} 6710}
6428 6711
6429/* 6712/*
@@ -7082,7 +7365,9 @@ void init_cfs_rq(struct cfs_rq *cfs_rq)
7082#ifdef CONFIG_FAIR_GROUP_SCHED 7365#ifdef CONFIG_FAIR_GROUP_SCHED
7083static void task_move_group_fair(struct task_struct *p, int on_rq) 7366static void task_move_group_fair(struct task_struct *p, int on_rq)
7084{ 7367{
7368 struct sched_entity *se = &p->se;
7085 struct cfs_rq *cfs_rq; 7369 struct cfs_rq *cfs_rq;
7370
7086 /* 7371 /*
7087 * If the task was not on the rq at the time of this cgroup movement 7372 * If the task was not on the rq at the time of this cgroup movement
7088 * it must have been asleep, sleeping tasks keep their ->vruntime 7373 * it must have been asleep, sleeping tasks keep their ->vruntime
@@ -7108,23 +7393,24 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
7108 * To prevent boost or penalty in the new cfs_rq caused by delta 7393 * To prevent boost or penalty in the new cfs_rq caused by delta
7109 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment. 7394 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
7110 */ 7395 */
7111 if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING)) 7396 if (!on_rq && (!se->sum_exec_runtime || p->state == TASK_WAKING))
7112 on_rq = 1; 7397 on_rq = 1;
7113 7398
7114 if (!on_rq) 7399 if (!on_rq)
7115 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; 7400 se->vruntime -= cfs_rq_of(se)->min_vruntime;
7116 set_task_rq(p, task_cpu(p)); 7401 set_task_rq(p, task_cpu(p));
7402 se->depth = se->parent ? se->parent->depth + 1 : 0;
7117 if (!on_rq) { 7403 if (!on_rq) {
7118 cfs_rq = cfs_rq_of(&p->se); 7404 cfs_rq = cfs_rq_of(se);
7119 p->se.vruntime += cfs_rq->min_vruntime; 7405 se->vruntime += cfs_rq->min_vruntime;
7120#ifdef CONFIG_SMP 7406#ifdef CONFIG_SMP
7121 /* 7407 /*
7122 * migrate_task_rq_fair() will have removed our previous 7408 * migrate_task_rq_fair() will have removed our previous
7123 * contribution, but we must synchronize for ongoing future 7409 * contribution, but we must synchronize for ongoing future
7124 * decay. 7410 * decay.
7125 */ 7411 */
7126 p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter); 7412 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
7127 cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib; 7413 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
7128#endif 7414#endif
7129 } 7415 }
7130} 7416}
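
To make the vruntime handling in task_move_group_fair() above concrete, here is a small worked example in comment form (the numbers are invented). The point of the subtract/re-add pair is that a sleeping task keeps its offset from min_vruntime, not its absolute vruntime, when it changes groups.

/*
 * Worked example (invented numbers) for the renormalization above:
 *
 *   old cfs_rq:  min_vruntime = 1000, se->vruntime = 1040
 *                -> the task sits 40 ahead of the queue's floor
 *
 *   se->vruntime -= 1000;             -> 40, now relative
 *   set_task_rq(p, task_cpu(p));      -> switch to the new cfs_rq
 *   se->vruntime += 5000;             -> 5040, re-based on the new floor
 *
 * The 40-unit offset is preserved.  Keeping the absolute value instead
 * would hand the task a large boost or penalty, depending on how far
 * apart the two queues' min_vruntime values are.
 */
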
@@ -7220,10 +7506,13 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7220 if (!se) 7506 if (!se)
7221 return; 7507 return;
7222 7508
7223 if (!parent) 7509 if (!parent) {
7224 se->cfs_rq = &rq->cfs; 7510 se->cfs_rq = &rq->cfs;
7225 else 7511 se->depth = 0;
7512 } else {
7226 se->cfs_rq = parent->my_q; 7513 se->cfs_rq = parent->my_q;
7514 se->depth = parent->depth + 1;
7515 }
7227 7516
7228 se->my_q = cfs_rq; 7517 se->my_q = cfs_rq;
7229 /* guarantee group entities always have weight */ 7518 /* guarantee group entities always have weight */
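
The se->depth field maintained in the two hunks above (0 for entities attached to the root cfs_rq, parent->depth + 1 otherwise) records each entity's nesting level in the group hierarchy. As far as I can tell, it exists so that two entities from different subtrees can be walked up to a common ancestor without re-counting their depths each time, as find_matching_se() does for wakeup preemption. The following is a sketch of that idea under those assumptions, reusing the existing parent_entity() and is_same_group() helpers from fair.c; it is not a copy of the actual function.

static void walk_to_common_ancestor(struct sched_entity **se,
				    struct sched_entity **pse)
{
	/* First bring the deeper entity up to the shallower one's level. */
	while ((*se)->depth > (*pse)->depth)
		*se = parent_entity(*se);

	while ((*pse)->depth > (*se)->depth)
		*pse = parent_entity(*pse);

	/* Same depth: step both up in lockstep until they share a cfs_rq. */
	while (!is_same_group(*se, *pse)) {
		*se = parent_entity(*se);
		*pse = parent_entity(*pse);
	}
}
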
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
new file mode 100644
index 000000000000..14ca43430aee
--- /dev/null
+++ b/kernel/sched/idle.c
@@ -0,0 +1,144 @@
1/*
2 * Generic entry point for the idle threads
3 */
4#include <linux/sched.h>
5#include <linux/cpu.h>
6#include <linux/cpuidle.h>
7#include <linux/tick.h>
8#include <linux/mm.h>
9#include <linux/stackprotector.h>
10
11#include <asm/tlb.h>
12
13#include <trace/events/power.h>
14
15static int __read_mostly cpu_idle_force_poll;
16
17void cpu_idle_poll_ctrl(bool enable)
18{
19 if (enable) {
20 cpu_idle_force_poll++;
21 } else {
22 cpu_idle_force_poll--;
23 WARN_ON_ONCE(cpu_idle_force_poll < 0);
24 }
25}
26
27#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP
28static int __init cpu_idle_poll_setup(char *__unused)
29{
30 cpu_idle_force_poll = 1;
31 return 1;
32}
33__setup("nohlt", cpu_idle_poll_setup);
34
35static int __init cpu_idle_nopoll_setup(char *__unused)
36{
37 cpu_idle_force_poll = 0;
38 return 1;
39}
40__setup("hlt", cpu_idle_nopoll_setup);
41#endif
42
43static inline int cpu_idle_poll(void)
44{
45 rcu_idle_enter();
46 trace_cpu_idle_rcuidle(0, smp_processor_id());
47 local_irq_enable();
48 while (!tif_need_resched())
49 cpu_relax();
50 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
51 rcu_idle_exit();
52 return 1;
53}
54
55/* Weak implementations for optional arch specific functions */
56void __weak arch_cpu_idle_prepare(void) { }
57void __weak arch_cpu_idle_enter(void) { }
58void __weak arch_cpu_idle_exit(void) { }
59void __weak arch_cpu_idle_dead(void) { }
60void __weak arch_cpu_idle(void)
61{
62 cpu_idle_force_poll = 1;
63 local_irq_enable();
64}
65
66/*
67 * Generic idle loop implementation
68 */
69static void cpu_idle_loop(void)
70{
71 while (1) {
72 tick_nohz_idle_enter();
73
74 while (!need_resched()) {
75 check_pgt_cache();
76 rmb();
77
78 if (cpu_is_offline(smp_processor_id()))
79 arch_cpu_idle_dead();
80
81 local_irq_disable();
82 arch_cpu_idle_enter();
83
84 /*
85 * In poll mode we reenable interrupts and spin.
86 *
87 * Also, if we detected in the wakeup-from-idle
88 * path that the tick broadcast device expired
89 * for us, we don't want to go into deep idle,
90 * since we know that the IPI is going to
91 * arrive right away.
92 */
93 if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
94 cpu_idle_poll();
95 } else {
96 if (!current_clr_polling_and_test()) {
97 stop_critical_timings();
98 rcu_idle_enter();
99 if (cpuidle_idle_call())
100 arch_cpu_idle();
101 if (WARN_ON_ONCE(irqs_disabled()))
102 local_irq_enable();
103 rcu_idle_exit();
104 start_critical_timings();
105 } else {
106 local_irq_enable();
107 }
108 __current_set_polling();
109 }
110 arch_cpu_idle_exit();
111 /*
112 * We need to test and propagate the TIF_NEED_RESCHED
113 * bit here because we might not have sent the
114 * reschedule IPI to idle tasks.
115 */
116 if (tif_need_resched())
117 set_preempt_need_resched();
118 }
119 tick_nohz_idle_exit();
120 schedule_preempt_disabled();
121 }
122}
123
124void cpu_startup_entry(enum cpuhp_state state)
125{
126 /*
127 * This #ifdef needs to die, but it's too late in the cycle to
128 * make this generic (arm and sh have never invoked the canary
129 * init for the non-boot CPUs!). Will be fixed in 3.11.
130 */
131#ifdef CONFIG_X86
132 /*
133 * If we're the non-boot CPU, nothing set the stack canary up
134 * for us. The boot CPU already has it initialized but no harm
135 * in doing it again. This is a good place for updating it, as
136 * we won't ever return from this function (so the invalid
137 * canaries already on the stack won't ever trigger).
138 */
139 boot_init_stack_canary();
140#endif
141 __current_set_polling();
142 arch_cpu_idle_prepare();
143 cpu_idle_loop();
144}
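
The __weak hooks near the top of the new file are the extension points for architectures; the generic loop calls arch_cpu_idle() with interrupts disabled and, as the WARN_ON_ONCE()/local_irq_enable() pair shows, expects them to be enabled again by the time it returns. A hypothetical, x86-flavoured override might look like the sketch below; it is illustrative only and not part of this patch.

/* Hypothetical arch override of the weak default (illustration only). */
void arch_cpu_idle(void)
{
	/*
	 * Halt until the next interrupt.  "sti; hlt" re-enables
	 * interrupts and stops the CPU so the wakeup interrupt brings
	 * us out of the halt, satisfying the generic loop's
	 * expectation that IRQs are on when this returns.
	 */
	asm volatile("sti; hlt" : : : "memory");
}
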
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 516c3d9ceea1..f7d03af79a5b 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -13,18 +13,8 @@ select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
13{ 13{
14 return task_cpu(p); /* IDLE tasks are never migrated */ 14 return task_cpu(p); /* IDLE tasks are never migrated */
15} 15}
16
17static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
18{
19 idle_exit_fair(rq);
20 rq_last_tick_reset(rq);
21}
22
23static void post_schedule_idle(struct rq *rq)
24{
25 idle_enter_fair(rq);
26}
27#endif /* CONFIG_SMP */ 16#endif /* CONFIG_SMP */
17
28/* 18/*
29 * Idle tasks are unconditionally rescheduled: 19 * Idle tasks are unconditionally rescheduled:
30 */ 20 */
@@ -33,12 +23,15 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
33 resched_task(rq->idle); 23 resched_task(rq->idle);
34} 24}
35 25
36static struct task_struct *pick_next_task_idle(struct rq *rq) 26static struct task_struct *
27pick_next_task_idle(struct rq *rq, struct task_struct *prev)
37{ 28{
29 if (prev)
30 prev->sched_class->put_prev_task(rq, prev);
31
38 schedstat_inc(rq, sched_goidle); 32 schedstat_inc(rq, sched_goidle);
39#ifdef CONFIG_SMP 33#ifdef CONFIG_SMP
40 /* Trigger the post schedule to do an idle_enter for CFS */ 34 idle_enter_fair(rq);
41 rq->post_schedule = 1;
42#endif 35#endif
43 return rq->idle; 36 return rq->idle;
44} 37}
@@ -58,6 +51,10 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
58 51
59static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) 52static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
60{ 53{
54#ifdef CONFIG_SMP
55 idle_exit_fair(rq);
56 rq_last_tick_reset(rq);
57#endif
61} 58}
62 59
63static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) 60static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
@@ -101,8 +98,6 @@ const struct sched_class idle_sched_class = {
101 98
102#ifdef CONFIG_SMP 99#ifdef CONFIG_SMP
103 .select_task_rq = select_task_rq_idle, 100 .select_task_rq = select_task_rq_idle,
104 .pre_schedule = pre_schedule_idle,
105 .post_schedule = post_schedule_idle,
106#endif 101#endif
107 102
108 .set_curr_task = set_curr_task_idle, 103 .set_curr_task = set_curr_task_idle,
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a2740b775b45..72f9ec759972 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -229,6 +229,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
229 229
230#ifdef CONFIG_SMP 230#ifdef CONFIG_SMP
231 231
232static int pull_rt_task(struct rq *this_rq);
233
232static inline int rt_overloaded(struct rq *rq) 234static inline int rt_overloaded(struct rq *rq)
233{ 235{
234 return atomic_read(&rq->rd->rto_count); 236 return atomic_read(&rq->rd->rto_count);
@@ -1310,15 +1312,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1310{ 1312{
1311 struct sched_rt_entity *rt_se; 1313 struct sched_rt_entity *rt_se;
1312 struct task_struct *p; 1314 struct task_struct *p;
1313 struct rt_rq *rt_rq; 1315 struct rt_rq *rt_rq = &rq->rt;
1314
1315 rt_rq = &rq->rt;
1316
1317 if (!rt_rq->rt_nr_running)
1318 return NULL;
1319
1320 if (rt_rq_throttled(rt_rq))
1321 return NULL;
1322 1316
1323 do { 1317 do {
1324 rt_se = pick_next_rt_entity(rq, rt_rq); 1318 rt_se = pick_next_rt_entity(rq, rt_rq);
@@ -1332,9 +1326,28 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
1332 return p; 1326 return p;
1333} 1327}
1334 1328
1335static struct task_struct *pick_next_task_rt(struct rq *rq) 1329static struct task_struct *
1330pick_next_task_rt(struct rq *rq, struct task_struct *prev)
1336{ 1331{
1337 struct task_struct *p = _pick_next_task_rt(rq); 1332 struct task_struct *p;
1333 struct rt_rq *rt_rq = &rq->rt;
1334
1335#ifdef CONFIG_SMP
1336 /* Try to pull RT tasks here if we lower this rq's prio */
1337 if (rq->rt.highest_prio.curr > prev->prio)
1338 pull_rt_task(rq);
1339#endif
1340
1341 if (!rt_rq->rt_nr_running)
1342 return NULL;
1343
1344 if (rt_rq_throttled(rt_rq))
1345 return NULL;
1346
1347 if (prev)
1348 prev->sched_class->put_prev_task(rq, prev);
1349
1350 p = _pick_next_task_rt(rq);
1338 1351
1339 /* The running task is never eligible for pushing */ 1352 /* The running task is never eligible for pushing */
1340 if (p) 1353 if (p)
@@ -1716,13 +1729,6 @@ skip:
1716 return ret; 1729 return ret;
1717} 1730}
1718 1731
1719static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
1720{
1721 /* Try to pull RT tasks here if we lower this rq's prio */
1722 if (rq->rt.highest_prio.curr > prev->prio)
1723 pull_rt_task(rq);
1724}
1725
1726static void post_schedule_rt(struct rq *rq) 1732static void post_schedule_rt(struct rq *rq)
1727{ 1733{
1728 push_rt_tasks(rq); 1734 push_rt_tasks(rq);
@@ -1999,7 +2005,6 @@ const struct sched_class rt_sched_class = {
1999 .set_cpus_allowed = set_cpus_allowed_rt, 2005 .set_cpus_allowed = set_cpus_allowed_rt,
2000 .rq_online = rq_online_rt, 2006 .rq_online = rq_online_rt,
2001 .rq_offline = rq_offline_rt, 2007 .rq_offline = rq_offline_rt,
2002 .pre_schedule = pre_schedule_rt,
2003 .post_schedule = post_schedule_rt, 2008 .post_schedule = post_schedule_rt,
2004 .task_woken = task_woken_rt, 2009 .task_woken = task_woken_rt,
2005 .switched_from = switched_from_rt, 2010 .switched_from = switched_from_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c2119fd20f8b..1bf34c257d3b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -24,24 +24,6 @@ extern long calc_load_fold_active(struct rq *this_rq);
24extern void update_cpu_load_active(struct rq *this_rq); 24extern void update_cpu_load_active(struct rq *this_rq);
25 25
26/* 26/*
27 * Convert user-nice values [ -20 ... 0 ... 19 ]
28 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
29 * and back.
30 */
31#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
32#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
33#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
34
35/*
36 * 'User priority' is the nice value converted to something we
37 * can work with better when scaling various scheduler parameters,
38 * it's a [ 0 ... 39 ] range.
39 */
40#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
41#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
42#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
43
44/*
45 * Helpers for converting nanosecond timing to jiffy resolution 27 * Helpers for converting nanosecond timing to jiffy resolution
46 */ 28 */
47#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) 29#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
@@ -1123,14 +1105,19 @@ struct sched_class {
1123 1105
1124 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); 1106 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
1125 1107
1126 struct task_struct * (*pick_next_task) (struct rq *rq); 1108 /*
1109 * It is the responsibility of the pick_next_task() method that will
1110 * return the next task to call put_prev_task() on the @prev task or
1111 * something equivalent.
1112 */
1113 struct task_struct * (*pick_next_task) (struct rq *rq,
1114 struct task_struct *prev);
1127 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 1115 void (*put_prev_task) (struct rq *rq, struct task_struct *p);
1128 1116
1129#ifdef CONFIG_SMP 1117#ifdef CONFIG_SMP
1130 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); 1118 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
1131 void (*migrate_task_rq)(struct task_struct *p, int next_cpu); 1119 void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
1132 1120
1133 void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
1134 void (*post_schedule) (struct rq *this_rq); 1121 void (*post_schedule) (struct rq *this_rq);
1135 void (*task_waking) (struct task_struct *task); 1122 void (*task_waking) (struct task_struct *task);
1136 void (*task_woken) (struct rq *this_rq, struct task_struct *task); 1123 void (*task_woken) (struct rq *this_rq, struct task_struct *task);
@@ -1176,7 +1163,7 @@ extern const struct sched_class idle_sched_class;
1176extern void update_group_power(struct sched_domain *sd, int cpu); 1163extern void update_group_power(struct sched_domain *sd, int cpu);
1177 1164
1178extern void trigger_load_balance(struct rq *rq); 1165extern void trigger_load_balance(struct rq *rq);
1179extern void idle_balance(int this_cpu, struct rq *this_rq); 1166extern int idle_balance(struct rq *this_rq);
1180 1167
1181extern void idle_enter_fair(struct rq *this_rq); 1168extern void idle_enter_fair(struct rq *this_rq);
1182extern void idle_exit_fair(struct rq *this_rq); 1169extern void idle_exit_fair(struct rq *this_rq);
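
The comment added to struct sched_class above is the heart of the new contract: a pick_next_task() implementation only calls put_prev_task() on @prev once it has committed to returning a task of its own, and a NULL return hands the decision to the next class with @prev still intact. That is why pick_next_task_rt() and pick_next_task_stop() in this patch take their early NULL returns before touching @prev. A simplified model of the caller side, under the assumption that the core loop stays a straightforward walk over the classes (this is a sketch, not the actual core.c change), could look like:

struct task_struct *model_pick_next_task(struct rq *rq, struct task_struct *prev)
{
	const struct sched_class *class;
	struct task_struct *p;

	for_each_class(class) {
		/*
		 * A class that returns NULL has not put @prev, so the
		 * next class down still sees a consistent previous task.
		 */
		p = class->pick_next_task(rq, prev);
		if (p)
			return p;
	}

	/* The idle class always returns a task, so we never get here. */
	BUG();
}
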
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index fdb6bb0b3356..a4147c9d2017 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -23,16 +23,20 @@ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
23 /* we're never preempted */ 23 /* we're never preempted */
24} 24}
25 25
26static struct task_struct *pick_next_task_stop(struct rq *rq) 26static struct task_struct *
27pick_next_task_stop(struct rq *rq, struct task_struct *prev)
27{ 28{
28 struct task_struct *stop = rq->stop; 29 struct task_struct *stop = rq->stop;
29 30
30 if (stop && stop->on_rq) { 31 if (!stop || !stop->on_rq)
31 stop->se.exec_start = rq_clock_task(rq); 32 return NULL;
32 return stop;
33 }
34 33
35 return NULL; 34 if (prev)
35 prev->sched_class->put_prev_task(rq, prev);
36
37 stop->se.exec_start = rq_clock_task(rq);
38
39 return stop;
36} 40}
37 41
38static void 42static void
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 49e13e1f8fe6..7754ff16f334 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -386,13 +386,6 @@ static struct ctl_table kern_table[] = {
386 .proc_handler = proc_dointvec, 386 .proc_handler = proc_dointvec,
387 }, 387 },
388 { 388 {
389 .procname = "numa_balancing_migrate_deferred",
390 .data = &sysctl_numa_balancing_migrate_deferred,
391 .maxlen = sizeof(unsigned int),
392 .mode = 0644,
393 .proc_handler = proc_dointvec,
394 },
395 {
396 .procname = "numa_balancing", 389 .procname = "numa_balancing",
397 .data = NULL, /* filled in by handler */ 390 .data = NULL, /* filled in by handler */
398 .maxlen = sizeof(unsigned int), 391 .maxlen = sizeof(unsigned int),