Diffstat (limited to 'kernel/sched.c')
-rw-r--r--	kernel/sched.c	492
1 file changed, 342 insertions(+), 150 deletions(-)
diff --git a/kernel/sched.c b/kernel/sched.c
index cbb3a0eee58e..ccacdbdecf45 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -75,6 +75,9 @@
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
 #include <asm/mutex.h>
+#ifdef CONFIG_PARAVIRT
+#include <asm/paravirt.h>
+#endif
 
 #include "sched_cpupri.h"
 #include "workqueue_sched.h"
@@ -124,7 +127,7 @@
 
 static inline int rt_policy(int policy)
 {
-	if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
+	if (policy == SCHED_FIFO || policy == SCHED_RR)
 		return 1;
 	return 0;
 }
@@ -292,8 +295,8 @@ static DEFINE_SPINLOCK(task_group_lock);
  * (The default weight is 1024 - so there's no practical
  * limitation from this.)
  */
-#define MIN_SHARES	2
-#define MAX_SHARES	(1UL << (18 + SCHED_LOAD_RESOLUTION))
+#define MIN_SHARES	(1UL << 1)
+#define MAX_SHARES	(1UL << 18)
 
 static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
 #endif
@@ -422,6 +425,7 @@ struct rt_rq {
  */
 struct root_domain {
 	atomic_t refcount;
+	atomic_t rto_count;
 	struct rcu_head rcu;
 	cpumask_var_t span;
 	cpumask_var_t online;
@@ -431,7 +435,6 @@ struct root_domain {
 	 * one runnable RT task.
 	 */
 	cpumask_var_t rto_mask;
-	atomic_t rto_count;
 	struct cpupri cpupri;
 };
 
@@ -528,6 +531,12 @@ struct rq {
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 	u64 prev_irq_time;
 #endif
+#ifdef CONFIG_PARAVIRT
+	u64 prev_steal_time;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+	u64 prev_steal_time_rq;
+#endif
 
 	/* calc_load related fields */
 	unsigned long calc_load_update;
@@ -581,7 +590,6 @@ static inline int cpu_of(struct rq *rq)
 
 #define rcu_dereference_check_sched_domain(p) \
 	rcu_dereference_check((p), \
-			      rcu_read_lock_held() || \
 			      lockdep_is_held(&sched_domains_mutex))
 
 /*
@@ -605,10 +613,10 @@ static inline int cpu_of(struct rq *rq)
 /*
  * Return the group to which this tasks belongs.
  *
- * We use task_subsys_state_check() and extend the RCU verification
- * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach()
- * holds that lock for each task it moves into the cgroup. Therefore
- * by holding that lock, we pin the task to the current cgroup.
+ * We use task_subsys_state_check() and extend the RCU verification with
+ * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
+ * task it moves into the cgroup. Therefore by holding either of those locks,
+ * we pin the task to the current cgroup.
  */
 static inline struct task_group *task_group(struct task_struct *p)
 {
@@ -616,7 +624,8 @@ static inline struct task_group *task_group(struct task_struct *p)
 	struct cgroup_subsys_state *css;
 
 	css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
			lockdep_is_held(&p->pi_lock) ||
+			lockdep_is_held(&task_rq(p)->lock));
 	tg = container_of(css, struct task_group, css);
 
 	return autogroup_task_group(p, tg);
@@ -1567,38 +1576,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 	return rq->avg_load_per_task;
 }
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
-
-/*
- * Compute the cpu's hierarchical load factor for each task group.
- * This needs to be done in a top-down fashion because the load of a child
- * group is a fraction of its parents load.
- */
-static int tg_load_down(struct task_group *tg, void *data)
-{
-	unsigned long load;
-	long cpu = (long)data;
-
-	if (!tg->parent) {
-		load = cpu_rq(cpu)->load.weight;
-	} else {
-		load = tg->parent->cfs_rq[cpu]->h_load;
-		load *= tg->se[cpu]->load.weight;
-		load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
-	}
-
-	tg->cfs_rq[cpu]->h_load = load;
-
-	return 0;
-}
-
-static void update_h_load(long cpu)
-{
-	walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
-}
-
-#endif
-
 #ifdef CONFIG_PREEMPT
 
 static void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -1952,10 +1929,28 @@ void account_system_vtime(struct task_struct *curr)
 }
 EXPORT_SYMBOL_GPL(account_system_vtime);
 
-static void update_rq_clock_task(struct rq *rq, s64 delta)
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#ifdef CONFIG_PARAVIRT
+static inline u64 steal_ticks(u64 steal)
 {
-	s64 irq_delta;
+	if (unlikely(steal > NSEC_PER_SEC))
+		return div_u64(steal, TICK_NSEC);
 
+	return __iter_div_u64_rem(steal, TICK_NSEC, &steal);
+}
+#endif
+
+static void update_rq_clock_task(struct rq *rq, s64 delta)
+{
+/*
+ * In theory, the compile should just see 0 here, and optimize out the call
+ * to sched_rt_avg_update. But I don't trust it...
+ */
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+	s64 steal = 0, irq_delta = 0;
+#endif
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
 	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
 
 	/*
@@ -1978,12 +1973,35 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
 
 	rq->prev_irq_time += irq_delta;
 	delta -= irq_delta;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+	if (static_branch((&paravirt_steal_rq_enabled))) {
+		u64 st;
+
+		steal = paravirt_steal_clock(cpu_of(rq));
+		steal -= rq->prev_steal_time_rq;
+
+		if (unlikely(steal > delta))
+			steal = delta;
+
+		st = steal_ticks(steal);
+		steal = st * TICK_NSEC;
+
+		rq->prev_steal_time_rq += steal;
+
+		delta -= steal;
+	}
+#endif
+
 	rq->clock_task += delta;
 
-	if (irq_delta && sched_feat(NONIRQ_POWER))
-		sched_rt_avg_update(rq, irq_delta);
+#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+	if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
+		sched_rt_avg_update(rq, irq_delta + steal);
+#endif
 }
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
 static int irqtime_account_hi_update(void)
 {
 	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
@@ -2018,12 +2036,7 @@ static int irqtime_account_si_update(void)
 
 #define sched_clock_irqtime (0)
 
-static void update_rq_clock_task(struct rq *rq, s64 delta)
-{
-	rq->clock_task += delta;
-}
-
-#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+#endif
 
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -2200,6 +2213,16 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 			!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
 
 #ifdef CONFIG_LOCKDEP
+	/*
+	 * The caller should hold either p->pi_lock or rq->lock, when changing
+	 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
+	 *
+	 * sched_move_task() holds both and thus holding either pins the cgroup,
+	 * see set_task_rq().
+	 *
+	 * Furthermore, all task_rq users should acquire both locks, see
+	 * task_rq_lock().
+	 */
 	WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
 				      lockdep_is_held(&task_rq(p)->lock)));
 #endif
@@ -2209,7 +2232,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
 	if (task_cpu(p) != new_cpu) {
 		p->se.nr_migrations++;
-		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, 1, NULL, 0);
+		perf_sw_event(PERF_COUNT_SW_CPU_MIGRATIONS, 1, NULL, 0);
 	}
 
 	__set_task_cpu(p, new_cpu);
@@ -2447,6 +2470,10 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 		}
 		rcu_read_unlock();
 	}
+
+	if (wake_flags & WF_MIGRATED)
+		schedstat_inc(p, se.statistics.nr_wakeups_migrate);
+
 #endif /* CONFIG_SMP */
 
 	schedstat_inc(rq, ttwu_count);
@@ -2455,9 +2482,6 @@ ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
 	if (wake_flags & WF_SYNC)
 		schedstat_inc(p, se.statistics.nr_wakeups_sync);
 
-	if (cpu != task_cpu(p))
-		schedstat_inc(p, se.statistics.nr_wakeups_migrate);
-
 #endif /* CONFIG_SCHEDSTATS */
 }
 
@@ -2485,7 +2509,7 @@ ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
 	if (p->sched_class->task_woken)
 		p->sched_class->task_woken(rq, p);
 
-	if (unlikely(rq->idle_stamp)) {
+	if (rq->idle_stamp) {
 		u64 delta = rq->clock - rq->idle_stamp;
 		u64 max = 2*sysctl_sched_migration_cost;
 
@@ -2532,13 +2556,9 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
 }
 
 #ifdef CONFIG_SMP
-static void sched_ttwu_pending(void)
+static void sched_ttwu_do_pending(struct task_struct *list)
 {
 	struct rq *rq = this_rq();
-	struct task_struct *list = xchg(&rq->wake_list, NULL);
-
-	if (!list)
-		return;
 
 	raw_spin_lock(&rq->lock);
 
@@ -2551,9 +2571,45 @@ static void sched_ttwu_pending(void)
 	raw_spin_unlock(&rq->lock);
 }
 
+#ifdef CONFIG_HOTPLUG_CPU
+
+static void sched_ttwu_pending(void)
+{
+	struct rq *rq = this_rq();
+	struct task_struct *list = xchg(&rq->wake_list, NULL);
+
+	if (!list)
+		return;
+
+	sched_ttwu_do_pending(list);
+}
+
+#endif /* CONFIG_HOTPLUG_CPU */
+
 void scheduler_ipi(void)
 {
-	sched_ttwu_pending();
+	struct rq *rq = this_rq();
+	struct task_struct *list = xchg(&rq->wake_list, NULL);
+
+	if (!list)
+		return;
+
+	/*
+	 * Not all reschedule IPI handlers call irq_enter/irq_exit, since
+	 * traditionally all their work was done from the interrupt return
+	 * path. Now that we actually do some work, we need to make sure
+	 * we do call them.
+	 *
+	 * Some archs already do call them, luckily irq_enter/exit nest
+	 * properly.
+	 *
+	 * Arguably we should visit all archs and update all handlers,
+	 * however a fair share of IPIs are still resched only so this would
+	 * somewhat pessimize the simple resched case.
+	 */
+	irq_enter();
+	sched_ttwu_do_pending(list);
+	irq_exit();
 }
 
 static void ttwu_queue_remote(struct task_struct *p, int cpu)
@@ -2600,6 +2656,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
 
 #if defined(CONFIG_SMP)
 	if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
+		sched_clock_cpu(cpu); /* sync clocks x-cpu */
 		ttwu_queue_remote(p, cpu);
 		return;
 	}
@@ -2674,8 +2731,10 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 		p->sched_class->task_waking(p);
 
 	cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
-	if (task_cpu(p) != cpu)
+	if (task_cpu(p) != cpu) {
+		wake_flags |= WF_MIGRATED;
 		set_task_cpu(p, cpu);
+	}
 #endif /* CONFIG_SMP */
 
 	ttwu_queue(p, cpu);
@@ -2839,7 +2898,7 @@ void sched_fork(struct task_struct *p)
 #if defined(CONFIG_SMP)
 	p->on_cpu = 0;
 #endif
-#ifdef CONFIG_PREEMPT
+#ifdef CONFIG_PREEMPT_COUNT
 	/* Want to start with kernel preemption disabled. */
 	task_thread_info(p)->preempt_count = 1;
 #endif
@@ -3830,6 +3889,25 @@ void account_idle_time(cputime_t cputime)
 		cpustat->idle = cputime64_add(cpustat->idle, cputime64);
 }
 
+static __always_inline bool steal_account_process_tick(void)
+{
+#ifdef CONFIG_PARAVIRT
+	if (static_branch(&paravirt_steal_enabled)) {
+		u64 steal, st = 0;
+
+		steal = paravirt_steal_clock(smp_processor_id());
+		steal -= this_rq()->prev_steal_time;
+
+		st = steal_ticks(steal);
+		this_rq()->prev_steal_time += st * TICK_NSEC;
+
+		account_steal_time(st);
+		return st;
+	}
+#endif
+	return false;
+}
+
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -3861,6 +3939,9 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 	cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
 	struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
 
+	if (steal_account_process_tick())
+		return;
+
 	if (irqtime_account_hi_update()) {
 		cpustat->irq = cputime64_add(cpustat->irq, tmp);
 	} else if (irqtime_account_si_update()) {
@@ -3914,6 +3995,9 @@ void account_process_tick(struct task_struct *p, int user_tick)
 		return;
 	}
 
+	if (steal_account_process_tick())
+		return;
+
 	if (user_tick)
 		account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
 	else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -4291,11 +4375,8 @@ EXPORT_SYMBOL(schedule);
 
 static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
 {
-	bool ret = false;
-
-	rcu_read_lock();
 	if (lock->owner != owner)
-		goto fail;
+		return false;
 
 	/*
 	 * Ensure we emit the owner->on_cpu, dereference _after_ checking
@@ -4305,11 +4386,7 @@ static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
 	 */
 	barrier();
 
-	ret = owner->on_cpu;
-fail:
-	rcu_read_unlock();
-
-	return ret;
+	return owner->on_cpu;
 }
 
 /*
@@ -4321,21 +4398,21 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
 	if (!sched_feat(OWNER_SPIN))
 		return 0;
 
+	rcu_read_lock();
 	while (owner_running(lock, owner)) {
 		if (need_resched())
-			return 0;
+			break;
 
 		arch_mutex_cpu_relax();
 	}
+	rcu_read_unlock();
 
 	/*
-	 * If the owner changed to another task there is likely
-	 * heavy contention, stop spinning.
+	 * We break out the loop above on need_resched() and when the
+	 * owner changed, which is a sign for heavy contention. Return
+	 * success only when lock->owner is NULL.
 	 */
-	if (lock->owner)
-		return 0;
-
-	return 1;
+	return lock->owner == NULL;
 }
 #endif
 
@@ -6542,7 +6619,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 			break;
 		}
 
-		if (!group->cpu_power) {
+		if (!group->sgp->power) {
 			printk(KERN_CONT "\n");
 			printk(KERN_ERR "ERROR: domain->cpu_power not "
 					"set\n");
@@ -6566,9 +6643,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 		cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
 
 		printk(KERN_CONT " %s", str);
-		if (group->cpu_power != SCHED_POWER_SCALE) {
+		if (group->sgp->power != SCHED_POWER_SCALE) {
 			printk(KERN_CONT " (cpu_power = %d)",
-				group->cpu_power);
+				group->sgp->power);
 		}
 
 		group = group->next;
@@ -6759,11 +6836,39 @@ static struct root_domain *alloc_rootdomain(void)
 	return rd;
 }
 
+static void free_sched_groups(struct sched_group *sg, int free_sgp)
+{
+	struct sched_group *tmp, *first;
+
+	if (!sg)
+		return;
+
+	first = sg;
+	do {
+		tmp = sg->next;
+
+		if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
+			kfree(sg->sgp);
+
+		kfree(sg);
+		sg = tmp;
+	} while (sg != first);
+}
+
 static void free_sched_domain(struct rcu_head *rcu)
 {
 	struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
-	if (atomic_dec_and_test(&sd->groups->ref))
+
+	/*
+	 * If its an overlapping domain it has private groups, iterate and
+	 * nuke them all.
+	 */
+	if (sd->flags & SD_OVERLAP) {
+		free_sched_groups(sd->groups, 1);
+	} else if (atomic_dec_and_test(&sd->groups->ref)) {
+		kfree(sd->groups->sgp);
 		kfree(sd->groups);
+	}
 	kfree(sd);
 }
 
@@ -6930,6 +7035,7 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 struct sd_data {
 	struct sched_domain **__percpu sd;
 	struct sched_group **__percpu sg;
+	struct sched_group_power **__percpu sgp;
 };
 
 struct s_data {
@@ -6949,15 +7055,73 @@ struct sched_domain_topology_level;
 typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
 
+#define SDTL_OVERLAP	0x01
+
 struct sched_domain_topology_level {
 	sched_domain_init_f init;
 	sched_domain_mask_f mask;
+	int flags;
 	struct sd_data data;
 };
 
-/*
- * Assumes the sched_domain tree is fully constructed
- */
+static int
+build_overlap_sched_groups(struct sched_domain *sd, int cpu)
+{
+	struct sched_group *first = NULL, *last = NULL, *groups = NULL, *sg;
+	const struct cpumask *span = sched_domain_span(sd);
+	struct cpumask *covered = sched_domains_tmpmask;
+	struct sd_data *sdd = sd->private;
+	struct sched_domain *child;
+	int i;
+
+	cpumask_clear(covered);
+
+	for_each_cpu(i, span) {
+		struct cpumask *sg_span;
+
+		if (cpumask_test_cpu(i, covered))
+			continue;
+
+		sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+				GFP_KERNEL, cpu_to_node(i));
+
+		if (!sg)
+			goto fail;
+
+		sg_span = sched_group_cpus(sg);
+
+		child = *per_cpu_ptr(sdd->sd, i);
+		if (child->child) {
+			child = child->child;
+			cpumask_copy(sg_span, sched_domain_span(child));
+		} else
+			cpumask_set_cpu(i, sg_span);
+
+		cpumask_or(covered, covered, sg_span);
+
+		sg->sgp = *per_cpu_ptr(sdd->sgp, cpumask_first(sg_span));
+		atomic_inc(&sg->sgp->ref);
+
+		if (cpumask_test_cpu(cpu, sg_span))
+			groups = sg;
+
+		if (!first)
+			first = sg;
+		if (last)
+			last->next = sg;
+		last = sg;
+		last->next = first;
+	}
+	sd->groups = groups;
+
+	return 0;
+
+fail:
+	free_sched_groups(first, 0);
+
+	return -ENOMEM;
+}
+
 static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
 {
 	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
@@ -6966,24 +7130,24 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
 	if (child)
 		cpu = cpumask_first(sched_domain_span(child));
 
-	if (sg)
+	if (sg) {
 		*sg = *per_cpu_ptr(sdd->sg, cpu);
+		(*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
+		atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
+	}
 
 	return cpu;
 }
 
 /*
- * build_sched_groups takes the cpumask we wish to span, and a pointer
- * to a function which identifies what group(along with sched group) a CPU
- * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
- * (due to the fact that we keep track of groups covered with a struct cpumask).
- *
  * build_sched_groups will build a circular linked list of the groups
  * covered by the given span, and will set each group's ->cpumask correctly,
  * and ->cpu_power to 0.
+ *
+ * Assumes the sched_domain tree is fully constructed
  */
-static void
-build_sched_groups(struct sched_domain *sd)
+static int
+build_sched_groups(struct sched_domain *sd, int cpu)
 {
 	struct sched_group *first = NULL, *last = NULL;
 	struct sd_data *sdd = sd->private;
@@ -6991,6 +7155,12 @@ build_sched_groups(struct sched_domain *sd)
 	struct cpumask *covered;
 	int i;
 
+	get_group(cpu, sdd, &sd->groups);
+	atomic_inc(&sd->groups->ref);
+
+	if (cpu != cpumask_first(sched_domain_span(sd)))
+		return 0;
+
 	lockdep_assert_held(&sched_domains_mutex);
 	covered = sched_domains_tmpmask;
 
@@ -7005,7 +7175,7 @@ build_sched_groups(struct sched_domain *sd)
 			continue;
 
 		cpumask_clear(sched_group_cpus(sg));
-		sg->cpu_power = 0;
+		sg->sgp->power = 0;
 
 		for_each_cpu(j, span) {
 			if (get_group(j, sdd, NULL) != group)
@@ -7022,6 +7192,8 @@ build_sched_groups(struct sched_domain *sd)
 		last = sg;
 	}
 	last->next = first;
+
+	return 0;
 }
 
 /*
@@ -7036,12 +7208,17 @@ build_sched_groups(struct sched_domain *sd)
  */
 static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 {
-	WARN_ON(!sd || !sd->groups);
+	struct sched_group *sg = sd->groups;
 
-	if (cpu != group_first_cpu(sd->groups))
-		return;
+	WARN_ON(!sd || !sg);
 
-	sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
+	do {
+		sg->group_weight = cpumask_weight(sched_group_cpus(sg));
+		sg = sg->next;
+	} while (sg != sd->groups);
+
+	if (cpu != group_first_cpu(sg))
+		return;
 
 	update_group_power(sd, cpu);
 }
@@ -7162,15 +7339,15 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
 static void claim_allocations(int cpu, struct sched_domain *sd)
 {
 	struct sd_data *sdd = sd->private;
-	struct sched_group *sg = sd->groups;
 
 	WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
 	*per_cpu_ptr(sdd->sd, cpu) = NULL;
 
-	if (cpu == cpumask_first(sched_group_cpus(sg))) {
-		WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
+	if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
 		*per_cpu_ptr(sdd->sg, cpu) = NULL;
-	}
+
+	if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
+		*per_cpu_ptr(sdd->sgp, cpu) = NULL;
 }
 
 #ifdef CONFIG_SCHED_SMT
@@ -7195,7 +7372,7 @@ static struct sched_domain_topology_level default_topology[] = {
 #endif
 	{ sd_init_CPU, cpu_cpu_mask, },
 #ifdef CONFIG_NUMA
-	{ sd_init_NODE, cpu_node_mask, },
+	{ sd_init_NODE, cpu_node_mask, SDTL_OVERLAP, },
 	{ sd_init_ALLNODES, cpu_allnodes_mask, },
 #endif
 	{ NULL, },
@@ -7219,9 +7396,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 		if (!sdd->sg)
 			return -ENOMEM;
 
+		sdd->sgp = alloc_percpu(struct sched_group_power *);
+		if (!sdd->sgp)
+			return -ENOMEM;
+
 		for_each_cpu(j, cpu_map) {
 			struct sched_domain *sd;
 			struct sched_group *sg;
+			struct sched_group_power *sgp;
 
 			sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
 					GFP_KERNEL, cpu_to_node(j));
@@ -7236,6 +7418,13 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
 				return -ENOMEM;
 
 			*per_cpu_ptr(sdd->sg, j) = sg;
+
+			sgp = kzalloc_node(sizeof(struct sched_group_power),
+					GFP_KERNEL, cpu_to_node(j));
+			if (!sgp)
+				return -ENOMEM;
+
+			*per_cpu_ptr(sdd->sgp, j) = sgp;
 		}
 	}
 
@@ -7251,11 +7440,15 @@ static void __sdt_free(const struct cpumask *cpu_map)
 		struct sd_data *sdd = &tl->data;
 
 		for_each_cpu(j, cpu_map) {
-			kfree(*per_cpu_ptr(sdd->sd, j));
+			struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
+			if (sd && (sd->flags & SD_OVERLAP))
+				free_sched_groups(sd->groups, 0);
 			kfree(*per_cpu_ptr(sdd->sg, j));
+			kfree(*per_cpu_ptr(sdd->sgp, j));
 		}
 		free_percpu(sdd->sd);
 		free_percpu(sdd->sg);
+		free_percpu(sdd->sgp);
 	}
 }
 
@@ -7301,8 +7494,13 @@ static int build_sched_domains(const struct cpumask *cpu_map,
 		struct sched_domain_topology_level *tl;
 
 		sd = NULL;
-		for (tl = sched_domain_topology; tl->init; tl++)
+		for (tl = sched_domain_topology; tl->init; tl++) {
 			sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
+			if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
+				sd->flags |= SD_OVERLAP;
+			if (cpumask_equal(cpu_map, sched_domain_span(sd)))
+				break;
+		}
 
 		while (sd->child)
 			sd = sd->child;
@@ -7314,13 +7512,13 @@ static int build_sched_domains(const struct cpumask *cpu_map,
 	for_each_cpu(i, cpu_map) {
 		for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
 			sd->span_weight = cpumask_weight(sched_domain_span(sd));
-			get_group(i, sd->private, &sd->groups);
-			atomic_inc(&sd->groups->ref);
-
-			if (i != cpumask_first(sched_domain_span(sd)))
-				continue;
-
-			build_sched_groups(sd);
+			if (sd->flags & SD_OVERLAP) {
+				if (build_overlap_sched_groups(sd, i))
+					goto error;
+			} else {
+				if (build_sched_groups(sd, i))
+					goto error;
+			}
 		}
 	}
 
@@ -7730,18 +7928,14 @@ int in_sched_functions(unsigned long addr)
 		&& addr < (unsigned long)__sched_text_end);
 }
 
-static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
+static void init_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	cfs_rq->tasks_timeline = RB_ROOT;
 	INIT_LIST_HEAD(&cfs_rq->tasks);
-#ifdef CONFIG_FAIR_GROUP_SCHED
-	cfs_rq->rq = rq;
-	/* allow initial update_cfs_load() to truncate */
-#ifdef CONFIG_SMP
-	cfs_rq->load_stamp = 1;
-#endif
-#endif
 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
+#ifndef CONFIG_64BIT
+	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
+#endif
 }
 
 static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
@@ -7757,27 +7951,18 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 	/* delimiter for bitsearch: */
 	__set_bit(MAX_RT_PRIO, array->bitmap);
 
-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+#if defined CONFIG_SMP
 	rt_rq->highest_prio.curr = MAX_RT_PRIO;
-#ifdef CONFIG_SMP
 	rt_rq->highest_prio.next = MAX_RT_PRIO;
-#endif
-#endif
-#ifdef CONFIG_SMP
 	rt_rq->rt_nr_migratory = 0;
 	rt_rq->overloaded = 0;
-	plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock);
+	plist_head_init(&rt_rq->pushable_tasks);
 #endif
 
 	rt_rq->rt_time = 0;
 	rt_rq->rt_throttled = 0;
 	rt_rq->rt_runtime = 0;
 	raw_spin_lock_init(&rt_rq->rt_runtime_lock);
-
-#ifdef CONFIG_RT_GROUP_SCHED
-	rt_rq->rt_nr_boosted = 0;
-	rt_rq->rq = rq;
-#endif
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -7786,11 +7971,17 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 				struct sched_entity *parent)
 {
 	struct rq *rq = cpu_rq(cpu);
-	tg->cfs_rq[cpu] = cfs_rq;
-	init_cfs_rq(cfs_rq, rq);
+
 	cfs_rq->tg = tg;
+	cfs_rq->rq = rq;
+#ifdef CONFIG_SMP
+	/* allow initial update_cfs_load() to truncate */
+	cfs_rq->load_stamp = 1;
+#endif
 
+	tg->cfs_rq[cpu] = cfs_rq;
 	tg->se[cpu] = se;
+
 	/* se could be NULL for root_task_group */
 	if (!se)
 		return;
@@ -7813,12 +8004,14 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
 {
 	struct rq *rq = cpu_rq(cpu);
 
-	tg->rt_rq[cpu] = rt_rq;
-	init_rt_rq(rt_rq, rq);
+	rt_rq->highest_prio.curr = MAX_RT_PRIO;
+	rt_rq->rt_nr_boosted = 0;
+	rt_rq->rq = rq;
 	rt_rq->tg = tg;
-	rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
 
+	tg->rt_rq[cpu] = rt_rq;
 	tg->rt_se[cpu] = rt_se;
+
 	if (!rt_se)
 		return;
 
@@ -7900,7 +8093,7 @@ void __init sched_init(void)
 		rq->nr_running = 0;
 		rq->calc_load_active = 0;
 		rq->calc_load_update = jiffies + LOAD_FREQ;
-		init_cfs_rq(&rq->cfs, rq);
+		init_cfs_rq(&rq->cfs);
 		init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 		root_task_group.shares = root_task_group_load;
@@ -7971,7 +8164,7 @@ void __init sched_init(void)
 #endif
 
 #ifdef CONFIG_RT_MUTEXES
-	plist_head_init_raw(&init_task.pi_waiters, &init_task.pi_lock);
+	plist_head_init(&init_task.pi_waiters);
 #endif
 
 	/*
@@ -8014,7 +8207,7 @@ void __init sched_init(void)
 	scheduler_running = 1;
 }
 
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 static inline int preempt_count_equals(int preempt_offset)
 {
 	int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
@@ -8024,7 +8217,6 @@ static inline int preempt_count_equals(int preempt_offset)
 
 void __might_sleep(const char *file, int line, int preempt_offset)
 {
-#ifdef in_atomic
 	static unsigned long prev_jiffy;	/* ratelimiting */
 
 	if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
@@ -8046,7 +8238,6 @@ void __might_sleep(const char *file, int line, int preempt_offset)
 	if (irqs_disabled())
 		print_irqtrace_events(current);
 	dump_stack();
-#endif
 }
 EXPORT_SYMBOL(__might_sleep);
 #endif
@@ -8205,6 +8396,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 		if (!se)
 			goto err_free_rq;
 
+		init_cfs_rq(cfs_rq);
 		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
 	}
 
@@ -8232,7 +8424,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
 		list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
-#else /* !CONFG_FAIR_GROUP_SCHED */
+#else /* !CONFIG_FAIR_GROUP_SCHED */
 static inline void free_fair_sched_group(struct task_group *tg)
 {
 }
@@ -8253,7 +8445,8 @@ static void free_rt_sched_group(struct task_group *tg)
 {
 	int i;
 
-	destroy_rt_bandwidth(&tg->rt_bandwidth);
+	if (tg->rt_se)
+		destroy_rt_bandwidth(&tg->rt_bandwidth);
 
 	for_each_possible_cpu(i) {
 		if (tg->rt_rq)
@@ -8294,6 +8487,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 		if (!rt_se)
 			goto err_free_rq;
 
+		init_rt_rq(rt_rq, cpu_rq(i));
+		rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
 		init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
 	}
 
@@ -8435,10 +8630,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
 	if (!tg->se[0])
 		return -EINVAL;
 
-	if (shares < MIN_SHARES)
-		shares = MIN_SHARES;
-	else if (shares > MAX_SHARES)
-		shares = MAX_SHARES;
+	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
 
 	mutex_lock(&shares_mutex);
 	if (tg->shares == shares)