Diffstat (limited to 'kernel/sched/core.c')
 -rw-r--r--  kernel/sched/core.c  432
 1 file changed, 257 insertions, 175 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 268a45ea238c..c6b98793d647 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -90,6 +90,22 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
 
+#ifdef smp_mb__before_atomic
+void __smp_mb__before_atomic(void)
+{
+	smp_mb__before_atomic();
+}
+EXPORT_SYMBOL(__smp_mb__before_atomic);
+#endif
+
+#ifdef smp_mb__after_atomic
+void __smp_mb__after_atomic(void)
+{
+	smp_mb__after_atomic();
+}
+EXPORT_SYMBOL(__smp_mb__after_atomic);
+#endif
+
 void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
 {
 	unsigned long delta;
@@ -506,6 +522,39 @@ static inline void init_hrtick(void)
 #endif	/* CONFIG_SCHED_HRTICK */
 
 /*
+ * cmpxchg based fetch_or, macro so it works for different integer types
+ */
+#define fetch_or(ptr, val)						\
+({	typeof(*(ptr)) __old, __val = *(ptr);				\
+	for (;;) {							\
+		__old = cmpxchg((ptr), __val, __val | (val));		\
+		if (__old == __val)					\
+			break;						\
+		__val = __old;						\
+	}								\
+	__old;								\
+})
+
+#ifdef TIF_POLLING_NRFLAG
+/*
+ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
+ * this avoids any races wrt polling state changes and thereby avoids
+ * spurious IPIs.
+ */
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+	struct thread_info *ti = task_thread_info(p);
+	return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
+}
+#else
+static bool set_nr_and_not_polling(struct task_struct *p)
+{
+	set_tsk_need_resched(p);
+	return true;
+}
+#endif
+
+/*
  * resched_task - mark a task 'to be rescheduled now'.
  *
  * On UP this means the setting of the need_resched flag, on SMP it
@@ -521,17 +570,15 @@ void resched_task(struct task_struct *p)
 	if (test_tsk_need_resched(p))
 		return;
 
-	set_tsk_need_resched(p);
-
 	cpu = task_cpu(p);
+
 	if (cpu == smp_processor_id()) {
+		set_tsk_need_resched(p);
 		set_preempt_need_resched();
 		return;
 	}
 
-	/* NEED_RESCHED must be visible before we test polling */
-	smp_mb();
-	if (!tsk_is_polling(p))
+	if (set_nr_and_not_polling(p))
 		smp_send_reschedule(cpu);
 }
 
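Why fetch_or() returns the old value: a single atomic RMW both sets TIF_NEED_RESCHED and tells the caller whether TIF_POLLING_NRFLAG was already set, which is what lets resched_task() drop the separate smp_mb()/tsk_is_polling() pair above. A minimal userspace sketch of the same pattern, with made-up flag values and a GCC builtin standing in for the kernel's cmpxchg():

#include <stdio.h>

#define TIF_NEED_RESCHED	(1u << 1)	/* illustrative values only */
#define TIF_POLLING_NRFLAG	(1u << 2)

/* cmpxchg-based fetch_or, mirroring the kernel macro: returns the OLD word. */
static unsigned int fetch_or(unsigned int *ptr, unsigned int val)
{
	unsigned int old = *ptr, prev;

	for (;;) {
		prev = __sync_val_compare_and_swap(ptr, old, old | val);
		if (prev == old)
			return prev;
		old = prev;
	}
}

/* Set NEED_RESCHED; report true only if the target was NOT polling (IPI needed). */
static int set_nr_and_not_polling(unsigned int *flags)
{
	return !(fetch_or(flags, TIF_NEED_RESCHED) & TIF_POLLING_NRFLAG);
}

int main(void)
{
	unsigned int polling = TIF_POLLING_NRFLAG;	/* idle task spinning on its flags */
	unsigned int running = 0;			/* task that needs a real IPI */

	printf("polling task -> send IPI? %d\n", set_nr_and_not_polling(&polling));	/* 0 */
	printf("running task -> send IPI? %d\n", set_nr_and_not_polling(&running));	/* 1 */
	return 0;
}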
@@ -1320,7 +1367,7 @@ out:
 		 * leave kernel.
 		 */
 		if (p->mm && printk_ratelimit()) {
-			printk_sched("process %d (%s) no longer affine to cpu%d\n",
+			printk_deferred("process %d (%s) no longer affine to cpu%d\n",
 					task_pid_nr(p), p->comm, cpu);
 		}
 	}
@@ -2192,7 +2239,7 @@ static inline void post_schedule(struct rq *rq)
  * schedule_tail - first thing a freshly forked thread must call.
  * @prev: the thread we just switched away from.
  */
-asmlinkage void schedule_tail(struct task_struct *prev)
+asmlinkage __visible void schedule_tail(struct task_struct *prev)
 	__releases(rq->lock)
 {
 	struct rq *rq = this_rq();
@@ -2592,8 +2639,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev)
 	if (likely(prev->sched_class == class &&
 		   rq->nr_running == rq->cfs.h_nr_running)) {
 		p = fair_sched_class.pick_next_task(rq, prev);
-		if (likely(p && p != RETRY_TASK))
-			return p;
+		if (unlikely(p == RETRY_TASK))
+			goto again;
+
+		/* assumes fair_sched_class->next == idle_sched_class */
+		if (unlikely(!p))
+			p = idle_sched_class.pick_next_task(rq, prev);
+
+		return p;
 	}
 
 again:
@@ -2741,7 +2794,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
 		blk_schedule_flush_plug(tsk);
 }
 
-asmlinkage void __sched schedule(void)
+asmlinkage __visible void __sched schedule(void)
 {
 	struct task_struct *tsk = current;
 
@@ -2751,7 +2804,7 @@ asmlinkage void __sched schedule(void)
 EXPORT_SYMBOL(schedule);
 
 #ifdef CONFIG_CONTEXT_TRACKING
-asmlinkage void __sched schedule_user(void)
+asmlinkage __visible void __sched schedule_user(void)
 {
 	/*
 	 * If we come here after a random call to set_need_resched(),
@@ -2783,7 +2836,7 @@ void __sched schedule_preempt_disabled(void)
  * off of preempt_enable. Kernel preemptions off return from interrupt
  * occur there and call schedule directly.
  */
-asmlinkage void __sched notrace preempt_schedule(void)
+asmlinkage __visible void __sched notrace preempt_schedule(void)
 {
 	/*
 	 * If there is a non-zero preempt_count or interrupts are disabled,
@@ -2813,7 +2866,7 @@ EXPORT_SYMBOL(preempt_schedule);
  * Note, that this is called and return with irqs disabled. This will
  * protect us against recursive calling from irq.
  */
-asmlinkage void __sched preempt_schedule_irq(void)
+asmlinkage __visible void __sched preempt_schedule_irq(void)
 {
 	enum ctx_state prev_state;
 
@@ -2996,7 +3049,7 @@ EXPORT_SYMBOL(set_user_nice);
 int can_nice(const struct task_struct *p, const int nice)
 {
 	/* convert nice value [19,-20] to rlimit style value [1,40] */
-	int nice_rlim = 20 - nice;
+	int nice_rlim = nice_to_rlimit(nice);
 
 	return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
 		capable(CAP_SYS_NICE));
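nice_to_rlimit() is a new helper (added to include/linux/sched/prio.h by the same series) that names the open-coded 20 - nice conversion; assuming the obvious MAX_NICE - nice + 1 definition, the mapping from nice [19,-20] to rlimit-style [1,40] is unchanged. A quick standalone check:

#include <stdio.h>

#define MAX_NICE  19
#define MIN_NICE -20

/* Assumed definition; the real helper lives in include/linux/sched/prio.h. */
static long nice_to_rlimit(long nice)
{
	return MAX_NICE - nice + 1;	/* identical to the old "20 - nice" */
}

int main(void)
{
	long nice;

	for (nice = MIN_NICE; nice <= MAX_NICE; nice++)
		if (nice_to_rlimit(nice) != 20 - nice)
			printf("mismatch at nice=%ld\n", nice);	/* never triggers */

	printf("nice  19 -> rlimit %ld\n", nice_to_rlimit(19));		/* 1  */
	printf("nice -20 -> rlimit %ld\n", nice_to_rlimit(-20));	/* 40 */
	return 0;
}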
@@ -3020,17 +3073,10 @@ SYSCALL_DEFINE1(nice, int, increment)
 	 * We don't have to worry. Conceptually one call occurs first
 	 * and we have a single winner.
 	 */
-	if (increment < -40)
-		increment = -40;
-	if (increment > 40)
-		increment = 40;
-
+	increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
 	nice = task_nice(current) + increment;
-	if (nice < MIN_NICE)
-		nice = MIN_NICE;
-	if (nice > MAX_NICE)
-		nice = MAX_NICE;
 
+	nice = clamp_val(nice, MIN_NICE, MAX_NICE);
 	if (increment < 0 && !can_nice(current, nice))
 		return -EPERM;
 
@@ -3124,6 +3170,7 @@ __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
 	dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
 	dl_se->dl_throttled = 0;
 	dl_se->dl_new = 1;
+	dl_se->dl_yielded = 0;
 }
 
 static void __setscheduler_params(struct task_struct *p,
@@ -3188,17 +3235,40 @@ __getparam_dl(struct task_struct *p, struct sched_attr *attr)
  * We ask for the deadline not being zero, and greater or equal
  * than the runtime, as well as the period of being zero or
  * greater than deadline. Furthermore, we have to be sure that
- * user parameters are above the internal resolution (1us); we
- * check sched_runtime only since it is always the smaller one.
+ * user parameters are above the internal resolution of 1us (we
+ * check sched_runtime only since it is always the smaller one) and
+ * below 2^63 ns (we have to check both sched_deadline and
+ * sched_period, as the latter can be zero).
  */
 static bool
 __checkparam_dl(const struct sched_attr *attr)
 {
-	return attr && attr->sched_deadline != 0 &&
-		(attr->sched_period == 0 ||
-		(s64)(attr->sched_period - attr->sched_deadline) >= 0) &&
-		(s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 &&
-		attr->sched_runtime >= (2 << (DL_SCALE - 1));
+	/* deadline != 0 */
+	if (attr->sched_deadline == 0)
+		return false;
+
+	/*
+	 * Since we truncate DL_SCALE bits, make sure we're at least
+	 * that big.
+	 */
+	if (attr->sched_runtime < (1ULL << DL_SCALE))
+		return false;
+
+	/*
+	 * Since we use the MSB for wrap-around and sign issues, make
+	 * sure it's not set (mind that period can be equal to zero).
+	 */
+	if (attr->sched_deadline & (1ULL << 63) ||
+	    attr->sched_period & (1ULL << 63))
+		return false;
+
+	/* runtime <= deadline <= period (if period != 0) */
+	if ((attr->sched_period != 0 &&
+	     attr->sched_period < attr->sched_deadline) ||
+	    attr->sched_deadline < attr->sched_runtime)
+		return false;
+
+	return true;
 }
 
 /*
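In concrete terms the rewritten checks accept the usual runtime <= deadline <= period triples given in nanoseconds, and reject anything below the 1 << DL_SCALE resolution (DL_SCALE is 10, so roughly 1us) or with bit 63 set. A standalone restatement of the predicate with a few sample parameter sets, for illustration only:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define DL_SCALE 10	/* matches kernel/sched/sched.h */

struct dl_params { uint64_t runtime, deadline, period; };

/* Re-statement of the checks above, outside the kernel, for illustration. */
static bool checkparam_dl(const struct dl_params *p)
{
	if (p->deadline == 0)
		return false;
	if (p->runtime < (1ULL << DL_SCALE))
		return false;
	if ((p->deadline & (1ULL << 63)) || (p->period & (1ULL << 63)))
		return false;
	if ((p->period != 0 && p->period < p->deadline) ||
	    p->deadline < p->runtime)
		return false;
	return true;
}

int main(void)
{
	struct dl_params ok   = { 10000000, 30000000, 100000000 };	/* 10ms/30ms/100ms: valid  */
	struct dl_params tiny = { 500, 30000000, 100000000 };		/* runtime below 1us: no   */
	struct dl_params rev  = { 10000000, 5000000, 100000000 };	/* deadline < runtime: no  */

	printf("%d %d %d\n", checkparam_dl(&ok), checkparam_dl(&tiny), checkparam_dl(&rev));
	return 0;	/* prints: 1 0 0 */
}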
@@ -3596,13 +3666,11 @@ static int sched_copy_attr(struct sched_attr __user *uattr,
 	 */
 	attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
 
-out:
-	return ret;
+	return 0;
 
 err_size:
 	put_user(sizeof(*attr), &uattr->size);
-	ret = -E2BIG;
-	goto out;
+	return -E2BIG;
 }
 
 /**
@@ -3639,6 +3707,7 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
  * sys_sched_setattr - same as above, but with extended sched_attr
  * @pid: the pid in question.
  * @uattr: structure containing the extended parameters.
+ * @flags: for future extension.
  */
 SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
 			       unsigned int, flags)
@@ -3650,8 +3719,12 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
 	if (!uattr || pid < 0 || flags)
 		return -EINVAL;
 
-	if (sched_copy_attr(uattr, &attr))
-		return -EFAULT;
+	retval = sched_copy_attr(uattr, &attr);
+	if (retval)
+		return retval;
+
+	if ((int)attr.sched_policy < 0)
+		return -EINVAL;
 
 	rcu_read_lock();
 	retval = -ESRCH;
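For context, glibc in this era has no sched_setattr() wrapper, so userspace invokes the syscall directly with a hand-rolled struct sched_attr (layout mirroring the uapi header, SCHED_DEADLINE = 6; __NR_sched_setattr needs 3.14+ kernel headers). A hypothetical caller looks roughly like the sketch below; passing a negative sched_policy here is exactly what the new (int)attr.sched_policy < 0 test rejects with -EINVAL:

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>	/* __NR_sched_setattr, present with 3.14+ headers */

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE 6
#endif

struct sched_attr {			/* mirrors include/uapi/linux/sched.h */
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;
	uint32_t sched_priority;
	uint64_t sched_runtime;		/* ns */
	uint64_t sched_deadline;	/* ns */
	uint64_t sched_period;		/* ns */
};

int main(void)
{
	struct sched_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.size	    = sizeof(attr);
	attr.sched_policy   = SCHED_DEADLINE;
	attr.sched_runtime  = 10 * 1000 * 1000;		/* 10ms of budget ...  */
	attr.sched_deadline = 30 * 1000 * 1000;		/* ... within 30ms ... */
	attr.sched_period   = 100 * 1000 * 1000;	/* ... every 100ms     */

	if (syscall(__NR_sched_setattr, 0 /* current task */, &attr, 0))
		perror("sched_setattr");	/* typically needs root/CAP_SYS_NICE */
	return 0;
}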
@@ -3701,7 +3774,7 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
  */
 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
 {
-	struct sched_param lp;
+	struct sched_param lp = { .sched_priority = 0 };
 	struct task_struct *p;
 	int retval;
 
@@ -3718,11 +3791,8 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
 	if (retval)
 		goto out_unlock;
 
-	if (task_has_dl_policy(p)) {
-		retval = -EINVAL;
-		goto out_unlock;
-	}
-	lp.sched_priority = p->rt_priority;
+	if (task_has_rt_policy(p))
+		lp.sched_priority = p->rt_priority;
 	rcu_read_unlock();
 
 	/*
@@ -3760,7 +3830,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
 
 		for (; addr < end; addr++) {
 			if (*addr)
-				goto err_size;
+				return -EFBIG;
 		}
 
 		attr->size = usize;
@@ -3770,12 +3840,7 @@ static int sched_read_attr(struct sched_attr __user *uattr,
 	if (ret)
 		return -EFAULT;
 
-out:
-	return ret;
-
-err_size:
-	ret = -E2BIG;
-	goto out;
+	return 0;
 }
 
 /**
@@ -3783,6 +3848,7 @@ err_size:
  * @pid: the pid in question.
  * @uattr: structure containing the extended parameters.
  * @size: sizeof(attr) for fwd/bwd comp.
+ * @flags: for future extension.
  */
 SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
 		unsigned int, size, unsigned int, flags)
@@ -4051,6 +4117,7 @@ static void __cond_resched(void)
 
 int __sched _cond_resched(void)
 {
+	rcu_cond_resched();
 	if (should_resched()) {
 		__cond_resched();
 		return 1;
@@ -4069,15 +4136,18 @@ EXPORT_SYMBOL(_cond_resched);
  */
 int __cond_resched_lock(spinlock_t *lock)
 {
+	bool need_rcu_resched = rcu_should_resched();
 	int resched = should_resched();
 	int ret = 0;
 
 	lockdep_assert_held(lock);
 
-	if (spin_needbreak(lock) || resched) {
+	if (spin_needbreak(lock) || resched || need_rcu_resched) {
 		spin_unlock(lock);
 		if (resched)
 			__cond_resched();
+		else if (unlikely(need_rcu_resched))
+			rcu_resched();
 		else
 			cpu_relax();
 		ret = 1;
@@ -4091,6 +4161,7 @@ int __sched __cond_resched_softirq(void)
 {
 	BUG_ON(!in_softirq());
 
+	rcu_cond_resched();  /* BH disabled OK, just recording QSes. */
 	if (should_resched()) {
 		local_bh_enable();
 		__cond_resched();
@@ -5039,11 +5110,20 @@ static struct notifier_block migration_notifier = {
 	.priority = CPU_PRI_MIGRATION,
 };
 
+static void __cpuinit set_cpu_rq_start_time(void)
+{
+	int cpu = smp_processor_id();
+	struct rq *rq = cpu_rq(cpu);
+	rq->age_stamp = sched_clock_cpu(cpu);
+}
+
 static int sched_cpu_active(struct notifier_block *nfb,
 				      unsigned long action, void *hcpu)
 {
 	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_STARTING:
+		set_cpu_rq_start_time();
+		return NOTIFY_OK;
 	case CPU_DOWN_FAILED:
 		set_cpu_active((long)hcpu, true);
 		return NOTIFY_OK;
@@ -5252,7 +5332,8 @@ static int sd_degenerate(struct sched_domain *sd)
 			 SD_BALANCE_FORK |
 			 SD_BALANCE_EXEC |
 			 SD_SHARE_CPUPOWER |
-			 SD_SHARE_PKG_RESOURCES)) {
+			 SD_SHARE_PKG_RESOURCES |
+			 SD_SHARE_POWERDOMAIN)) {
 		if (sd->groups != sd->groups->next)
 			return 0;
 	}
@@ -5283,7 +5364,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 				SD_BALANCE_EXEC |
 				SD_SHARE_CPUPOWER |
 				SD_SHARE_PKG_RESOURCES |
-				SD_PREFER_SIBLING);
+				SD_PREFER_SIBLING |
+				SD_SHARE_POWERDOMAIN);
 		if (nr_node_ids == 1)
 			pflags &= ~SD_SERIALIZE;
 	}
@@ -5557,17 +5639,6 @@ static int __init isolated_cpu_setup(char *str)
 
 __setup("isolcpus=", isolated_cpu_setup);
 
-static const struct cpumask *cpu_cpu_mask(int cpu)
-{
-	return cpumask_of_node(cpu_to_node(cpu));
-}
-
-struct sd_data {
-	struct sched_domain **__percpu sd;
-	struct sched_group **__percpu sg;
-	struct sched_group_power **__percpu sgp;
-};
-
 struct s_data {
 	struct sched_domain ** __percpu sd;
 	struct root_domain	*rd;
@@ -5580,21 +5651,6 @@ enum s_alloc {
 	sa_none,
 };
 
-struct sched_domain_topology_level;
-
-typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
-typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
-
-#define SDTL_OVERLAP		0x01
-
-struct sched_domain_topology_level {
-	sched_domain_init_f init;
-	sched_domain_mask_f mask;
-	int		    flags;
-	int		    numa_level;
-	struct sd_data      data;
-};
-
 /*
  * Build an iteration mask that can exclude certain CPUs from the upwards
  * domain traversal.
@@ -5762,8 +5818,6 @@ build_sched_groups(struct sched_domain *sd, int cpu)
 			continue;
 
 		group = get_group(i, sdd, &sg);
-		cpumask_clear(sched_group_cpus(sg));
-		sg->sgp->power = 0;
 		cpumask_setall(sched_group_mask(sg));
 
 		for_each_cpu(j, span) {
@@ -5813,44 +5867,11 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 	atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
 }
 
-int __weak arch_sd_sibling_asym_packing(void)
-{
-	return 0*SD_ASYM_PACKING;
-}
-
 /*
  * Initializers for schedule domains
  * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
  */
 
-#ifdef CONFIG_SCHED_DEBUG
-# define SD_INIT_NAME(sd, type)		sd->name = #type
-#else
-# define SD_INIT_NAME(sd, type)		do { } while (0)
-#endif
-
-#define SD_INIT_FUNC(type)						\
-static noinline struct sched_domain *					\
-sd_init_##type(struct sched_domain_topology_level *tl, int cpu) 	\
-{									\
-	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); 	\
-	*sd = SD_##type##_INIT;						\
-	SD_INIT_NAME(sd, type);						\
-	sd->private = &tl->data;					\
-	return sd;							\
-}
-
-SD_INIT_FUNC(CPU)
-#ifdef CONFIG_SCHED_SMT
- SD_INIT_FUNC(SIBLING)
-#endif
-#ifdef CONFIG_SCHED_MC
- SD_INIT_FUNC(MC)
-#endif
-#ifdef CONFIG_SCHED_BOOK
- SD_INIT_FUNC(BOOK)
-#endif
-
 static int default_relax_domain_level = -1;
 int sched_domain_level_max;
 
@@ -5938,97 +5959,154 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
 	*per_cpu_ptr(sdd->sgp, cpu) = NULL;
 }
 
-#ifdef CONFIG_SCHED_SMT
-static const struct cpumask *cpu_smt_mask(int cpu)
-{
-	return topology_thread_cpumask(cpu);
-}
-#endif
-
-/*
- * Topology list, bottom-up.
- */
-static struct sched_domain_topology_level default_topology[] = {
-#ifdef CONFIG_SCHED_SMT
-	{ sd_init_SIBLING, cpu_smt_mask, },
-#endif
-#ifdef CONFIG_SCHED_MC
-	{ sd_init_MC, cpu_coregroup_mask, },
-#endif
-#ifdef CONFIG_SCHED_BOOK
-	{ sd_init_BOOK, cpu_book_mask, },
-#endif
-	{ sd_init_CPU, cpu_cpu_mask, },
-	{ NULL, },
-};
-
-static struct sched_domain_topology_level *sched_domain_topology = default_topology;
-
-#define for_each_sd_topology(tl)			\
-	for (tl = sched_domain_topology; tl->init; tl++)
-
 #ifdef CONFIG_NUMA
-
 static int sched_domains_numa_levels;
 static int *sched_domains_numa_distance;
 static struct cpumask ***sched_domains_numa_masks;
 static int sched_domains_curr_level;
+#endif
 
-static inline int sd_local_flags(int level)
-{
-	if (sched_domains_numa_distance[level] > RECLAIM_DISTANCE)
-		return 0;
-
-	return SD_BALANCE_EXEC | SD_BALANCE_FORK | SD_WAKE_AFFINE;
-}
+/*
+ * SD_flags allowed in topology descriptions.
+ *
+ * SD_SHARE_CPUPOWER      - describes SMT topologies
+ * SD_SHARE_PKG_RESOURCES - describes shared caches
+ * SD_NUMA                - describes NUMA topologies
+ * SD_SHARE_POWERDOMAIN   - describes shared power domain
+ *
+ * Odd one out:
+ * SD_ASYM_PACKING        - describes SMT quirks
+ */
+#define TOPOLOGY_SD_FLAGS		\
+	(SD_SHARE_CPUPOWER |		\
+	 SD_SHARE_PKG_RESOURCES |	\
+	 SD_NUMA |			\
+	 SD_ASYM_PACKING |		\
+	 SD_SHARE_POWERDOMAIN)
 
 static struct sched_domain *
-sd_numa_init(struct sched_domain_topology_level *tl, int cpu)
+sd_init(struct sched_domain_topology_level *tl, int cpu)
 {
 	struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu);
-	int level = tl->numa_level;
-	int sd_weight = cpumask_weight(
-			sched_domains_numa_masks[level][cpu_to_node(cpu)]);
+	int sd_weight, sd_flags = 0;
+
+#ifdef CONFIG_NUMA
+	/*
+	 * Ugly hack to pass state to sd_numa_mask()...
+	 */
+	sched_domains_curr_level = tl->numa_level;
+#endif
+
+	sd_weight = cpumask_weight(tl->mask(cpu));
+
+	if (tl->sd_flags)
+		sd_flags = (*tl->sd_flags)();
+	if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
+			"wrong sd_flags in topology description\n"))
+		sd_flags &= ~TOPOLOGY_SD_FLAGS;
 
 	*sd = (struct sched_domain){
 		.min_interval		= sd_weight,
 		.max_interval		= 2*sd_weight,
 		.busy_factor		= 32,
 		.imbalance_pct		= 125,
-		.cache_nice_tries	= 2,
-		.busy_idx		= 3,
-		.idle_idx		= 2,
+
+		.cache_nice_tries	= 0,
+		.busy_idx		= 0,
+		.idle_idx		= 0,
 		.newidle_idx		= 0,
 		.wake_idx		= 0,
 		.forkexec_idx		= 0,
 
 		.flags			= 1*SD_LOAD_BALANCE
 					| 1*SD_BALANCE_NEWIDLE
-					| 0*SD_BALANCE_EXEC
-					| 0*SD_BALANCE_FORK
+					| 1*SD_BALANCE_EXEC
+					| 1*SD_BALANCE_FORK
 					| 0*SD_BALANCE_WAKE
-					| 0*SD_WAKE_AFFINE
+					| 1*SD_WAKE_AFFINE
 					| 0*SD_SHARE_CPUPOWER
 					| 0*SD_SHARE_PKG_RESOURCES
-					| 1*SD_SERIALIZE
+					| 0*SD_SERIALIZE
 					| 0*SD_PREFER_SIBLING
-					| 1*SD_NUMA
-					| sd_local_flags(level)
+					| 0*SD_NUMA
+					| sd_flags
 					,
+
 		.last_balance		= jiffies,
 		.balance_interval	= sd_weight,
+		.smt_gain		= 0,
+		.max_newidle_lb_cost	= 0,
+		.next_decay_max_lb_cost	= jiffies,
+#ifdef CONFIG_SCHED_DEBUG
+		.name			= tl->name,
+#endif
 	};
-	SD_INIT_NAME(sd, NUMA);
-	sd->private = &tl->data;
 
 	/*
-	 * Ugly hack to pass state to sd_numa_mask()...
+	 * Convert topological properties into behaviour.
 	 */
-	sched_domains_curr_level = tl->numa_level;
+
+	if (sd->flags & SD_SHARE_CPUPOWER) {
+		sd->imbalance_pct = 110;
+		sd->smt_gain = 1178; /* ~15% */
+
+	} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
+		sd->imbalance_pct = 117;
+		sd->cache_nice_tries = 1;
+		sd->busy_idx = 2;
+
+#ifdef CONFIG_NUMA
+	} else if (sd->flags & SD_NUMA) {
+		sd->cache_nice_tries = 2;
+		sd->busy_idx = 3;
+		sd->idle_idx = 2;
+
+		sd->flags |= SD_SERIALIZE;
+		if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
+			sd->flags &= ~(SD_BALANCE_EXEC |
+				       SD_BALANCE_FORK |
+				       SD_WAKE_AFFINE);
+		}
+
+#endif
+	} else {
+		sd->flags |= SD_PREFER_SIBLING;
+		sd->cache_nice_tries = 1;
+		sd->busy_idx = 2;
+		sd->idle_idx = 1;
+	}
+
+	sd->private = &tl->data;
 
 	return sd;
 }
 
+/*
+ * Topology list, bottom-up.
+ */
+static struct sched_domain_topology_level default_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+	{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+#endif
+	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
+	{ NULL, },
+};
+
+struct sched_domain_topology_level *sched_domain_topology = default_topology;
+
+#define for_each_sd_topology(tl)			\
+	for (tl = sched_domain_topology; tl->mask; tl++)
+
+void set_sched_topology(struct sched_domain_topology_level *tl)
+{
+	sched_domain_topology = tl;
+}
+
+#ifdef CONFIG_NUMA
+
 static const struct cpumask *sd_numa_mask(int cpu)
 {
 	return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
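With sd_init() driven purely by the topology table, an architecture that wants different levels no longer patches SD_*_INIT templates; it installs its own table through set_sched_topology(). The sketch below is illustrative only: cpu_cluster_mask()/cpu_cluster_flags() and the CLS level are invented names, and struct sched_domain_topology_level plus the SD_INIT_NAME()/cpu_smt_*() helpers now live in include/linux/sched.h:

/* Hypothetical arch code, not part of this patch. */
#include <linux/sched.h>
#include <linux/topology.h>

static inline int cpu_cluster_flags(void)
{
	/* cores in a cluster share caches and a power domain */
	return SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN;
}

static const struct cpumask *cpu_cluster_mask(int cpu)
{
	return topology_core_cpumask(cpu);	/* stand-in for a real cluster mask */
}

static struct sched_domain_topology_level my_arch_topology[] = {
#ifdef CONFIG_SCHED_SMT
	{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
	{ cpu_cluster_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) },
	{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
	{ NULL, },
};

static void __init my_arch_init_sched_topology(void)
{
	/* must run before the scheduler builds its domains */
	set_sched_topology(my_arch_topology);
}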
@@ -6172,7 +6250,10 @@ static void sched_init_numa(void)
 		}
 	}
 
-	tl = kzalloc((ARRAY_SIZE(default_topology) + level) *
+	/* Compute default topology size */
+	for (i = 0; sched_domain_topology[i].mask; i++);
+
+	tl = kzalloc((i + level + 1) *
 			sizeof(struct sched_domain_topology_level), GFP_KERNEL);
 	if (!tl)
 		return;
@@ -6180,18 +6261,19 @@ static void sched_init_numa(void)
 	/*
 	 * Copy the default topology bits..
 	 */
-	for (i = 0; default_topology[i].init; i++)
-		tl[i] = default_topology[i];
+	for (i = 0; sched_domain_topology[i].mask; i++)
+		tl[i] = sched_domain_topology[i];
 
 	/*
 	 * .. and append 'j' levels of NUMA goodness.
 	 */
 	for (j = 0; j < level; i++, j++) {
 		tl[i] = (struct sched_domain_topology_level){
-			.init = sd_numa_init,
 			.mask = sd_numa_mask,
+			.sd_flags = cpu_numa_flags,
 			.flags = SDTL_OVERLAP,
 			.numa_level = j,
+			SD_INIT_NAME(NUMA)
 		};
 	}
 
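The new sizing simply counts whatever table is currently installed instead of hard-coding ARRAY_SIZE(default_topology): with SMT and MC configured, the default table above has three entries (SMT, MC, DIE), so with, say, two NUMA distance levels i ends up 3 and kzalloc() gets 3 + 2 + 1 slots, the trailing +1 preserving the NULL terminator. The counting-and-extend idiom in isolation (plain C sketch, not kernel code):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct level { const char *name; };	/* stand-in for sched_domain_topology_level */

int main(void)
{
	struct level base[] = { {"SMT"}, {"MC"}, {"DIE"}, { NULL } };
	int numa_levels = 2, i;
	struct level *tl;

	for (i = 0; base[i].name; i++)	/* count entries, terminator excluded */
		;
	tl = calloc(i + numa_levels + 1, sizeof(*tl));	/* +1 keeps the terminator */
	memcpy(tl, base, i * sizeof(*tl));

	printf("copied %d entries, room left for %d NUMA levels\n", i, numa_levels);
	free(tl);
	return 0;
}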
@@ -6349,7 +6431,7 @@ struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
 		const struct cpumask *cpu_map, struct sched_domain_attr *attr,
 		struct sched_domain *child, int cpu)
 {
-	struct sched_domain *sd = tl->init(tl, cpu);
+	struct sched_domain *sd = sd_init(tl, cpu);
 	if (!sd)
 		return child;
 
@@ -6919,6 +7001,7 @@ void __init sched_init(void)
 	if (cpu_isolated_map == NULL)
 		zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 	idle_thread_set_boot_cpu();
+	set_cpu_rq_start_time();
 #endif
 	init_sched_fair_class();
 
@@ -7586,7 +7669,7 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
 {
 	struct task_group *tg = css_tg(css);
-	struct task_group *parent = css_tg(css_parent(css));
+	struct task_group *parent = css_tg(css->parent);
 
 	if (parent)
 		sched_online_group(tg, parent);
@@ -7717,8 +7800,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 	/* restart the period timer (if active) to handle new period expiry */
 	if (runtime_enabled && cfs_b->timer_active) {
 		/* force a reprogram */
-		cfs_b->timer_active = 0;
-		__start_cfs_bandwidth(cfs_b);
+		__start_cfs_bandwidth(cfs_b, true);
 	}
 	raw_spin_unlock_irq(&cfs_b->lock);
 