-rw-r--r--  Documentation/scheduler/sched-bwc.txt   122
-rw-r--r--  include/linux/sched.h                      4
-rw-r--r--  include/trace/events/sched.h               9
-rw-r--r--  init/Kconfig                              12
-rw-r--r--  kernel/sched.c                           559
-rw-r--r--  kernel/sched_cpupri.c                     89
-rw-r--r--  kernel/sched_cpupri.h                      7
-rw-r--r--  kernel/sched_fair.c                      716
-rw-r--r--  kernel/sched_features.h                    5
-rw-r--r--  kernel/sched_rt.c                         91
-rw-r--r--  kernel/sched_stoptask.c                    2
-rw-r--r--  kernel/sysctl.c                           10
12 files changed, 1439 insertions, 187 deletions
diff --git a/Documentation/scheduler/sched-bwc.txt b/Documentation/scheduler/sched-bwc.txt
new file mode 100644
index 000000000000..f6b1873f68ab
--- /dev/null
+++ b/Documentation/scheduler/sched-bwc.txt
@@ -0,0 +1,122 @@
CFS Bandwidth Control
=====================

[ This document only discusses CPU bandwidth control for SCHED_NORMAL.
  The SCHED_RT case is covered in Documentation/scheduler/sched-rt-group.txt ]

CFS bandwidth control is a CONFIG_FAIR_GROUP_SCHED extension which allows the
specification of the maximum CPU bandwidth available to a group or hierarchy.

The bandwidth allowed for a group is specified using a quota and period. Within
each given "period" (microseconds), a group is allowed to consume only up to
"quota" microseconds of CPU time. When the CPU bandwidth consumption of a
group exceeds this limit (for that period), the tasks belonging to its
hierarchy will be throttled and are not allowed to run again until the next
period.

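As a quick worked illustration of the ratio this implies (the figures are
chosen purely for illustration): with period=200ms, a quota of 100ms allows the
group at most quota/period = 50% of one CPU each period, while a quota of 400ms
over the same period allows up to 2 CPUs worth of runtime.
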
A group's unused runtime is globally tracked, being refreshed to the quota
amount described above at each period boundary. As threads consume this
bandwidth it is transferred to cpu-local "silos" on a demand basis. The amount
transferred within each of these updates is tunable and described as the
"slice".

Management
----------
Quota and period are managed within the cpu subsystem via cgroupfs.

cpu.cfs_quota_us: the total available run-time within a period (in microseconds)
cpu.cfs_period_us: the length of a period (in microseconds)
cpu.stat: exports throttling statistics [explained further below]

The default values are:
    cpu.cfs_period_us=100ms
    cpu.cfs_quota_us=-1

A value of -1 for cpu.cfs_quota_us indicates that the group does not have any
bandwidth restriction in place; such a group is described as an unconstrained
bandwidth group. This represents the traditional work-conserving behavior for
CFS.

Writing any (valid) positive value(s) will enact the specified bandwidth limit.
The minimum allowed value for either quota or period is 1ms. There is also an
upper bound on the period length of 1s. Additional restrictions exist when
bandwidth limits are used in a hierarchical fashion; these are explained in
more detail below.

Writing any negative value to cpu.cfs_quota_us will remove the bandwidth limit
and return the group to an unconstrained state once more.

Any updates to a group's bandwidth specification will result in it becoming
unthrottled if it is in a constrained state.

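For illustration, assuming the cpu controller is mounted at /sys/fs/cgroup/cpu
and a child group named "limited" has already been created beneath it (both the
mount point and the group name are assumptions made for this example), the
files above can be exercised as follows:

    # cd /sys/fs/cgroup/cpu/limited
    # cat cpu.cfs_period_us                 /* 100000, i.e. the 100ms default */
    # cat cpu.cfs_quota_us                  /* -1, i.e. unconstrained */
    # echo 50000 > cpu.cfs_quota_us         /* allow 50ms of run-time per period */
    # echo -1 > cpu.cfs_quota_us            /* remove the limit again */
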
System wide settings
--------------------
For efficiency, run-time is transferred between the global pool and CPU local
"silos" in a batch fashion. This greatly reduces global accounting pressure
on large systems. The amount transferred each time such an update is required
is described as the "slice".

This is tunable via procfs:
    /proc/sys/kernel/sched_cfs_bandwidth_slice_us (default=5ms)

Larger slice values will reduce transfer overheads, while smaller values allow
for more fine-grained consumption.

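For example, the current slice can be inspected and adjusted like so (the new
value below is purely illustrative):

    # cat /proc/sys/kernel/sched_cfs_bandwidth_slice_us             /* 5000 (5ms) by default */
    # echo 10000 > /proc/sys/kernel/sched_cfs_bandwidth_slice_us    /* switch to 10ms slices */
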
Statistics
----------
A group's bandwidth statistics are exported via 3 fields in cpu.stat.

cpu.stat:
- nr_periods: Number of enforcement intervals that have elapsed.
- nr_throttled: Number of times the group has been throttled/limited.
- throttled_time: The total time duration (in nanoseconds) for which entities
  of the group have been throttled.

This interface is read-only.

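A hypothetical reading for a group that has been throttled a handful of times
(the values are invented for illustration) might look like:

    # cat cpu.stat
    nr_periods 1000
    nr_throttled 30
    throttled_time 45000000
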
Hierarchical considerations
---------------------------
The interface enforces that an individual entity's bandwidth is always
attainable, that is: max(c_i) <= C. However, over-subscription in the
aggregate case is explicitly allowed to enable work-conserving semantics
within a hierarchy.
    e.g. \Sum (c_i) may exceed C
[ Where C is the parent's bandwidth, and c_i its children ]


There are two ways in which a group may become throttled:
    a. it fully consumes its own quota within a period
    b. a parent's quota is fully consumed within its period

In case b) above, even though the child may have runtime remaining it will not
be allowed to run until the parent's runtime is refreshed.

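To make case b) concrete (the figures are illustrative only): with a 100ms
period throughout, a parent limited to 100ms of quota may have two children
that are each granted 100ms. This satisfies max(c_i) <= C even though
\Sum (c_i) = 200ms exceeds C. If both children are busy they will jointly
exhaust the parent's 100ms partway through the period, and both will then be
throttled, despite having local quota remaining, until the parent's runtime is
refreshed at the next period boundary.
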
Examples
--------
1. Limit a group to 1 CPU worth of runtime.

    If period is 250ms and quota is also 250ms, the group will get
    1 CPU worth of runtime every 250ms.

    # echo 250000 > cpu.cfs_quota_us /* quota = 250ms */
    # echo 250000 > cpu.cfs_period_us /* period = 250ms */

2. Limit a group to 2 CPUs worth of runtime on a multi-CPU machine.

    With 500ms period and 1000ms quota, the group can get 2 CPUs worth of
    runtime every 500ms.

    # echo 1000000 > cpu.cfs_quota_us /* quota = 1000ms */
    # echo 500000 > cpu.cfs_period_us /* period = 500ms */

    The larger period here allows for increased burst capacity.

3. Limit a group to 20% of 1 CPU.

    With 50ms period, 10ms quota will be equivalent to 20% of 1 CPU.

    # echo 10000 > cpu.cfs_quota_us /* quota = 10ms */
    # echo 50000 > cpu.cfs_period_us /* period = 50ms */

    By using a small period here we are ensuring a consistent latency
    response at the expense of burst capacity.

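One further point worth illustrating: quota is consumed by all threads of the
group in aggregate, across all CPUs. For example (figures illustrative only),
on an 8-CPU machine a group running 8 busy threads against quota=400ms and
period=100ms will burn through its quota in roughly 50ms of wall time each
period and spend the remainder of the period throttled.
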
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 41d0237fd449..9fda2888a6ab 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2039,6 +2039,10 @@ static inline void sched_autogroup_fork(struct signal_struct *sig) { }
2039static inline void sched_autogroup_exit(struct signal_struct *sig) { } 2039static inline void sched_autogroup_exit(struct signal_struct *sig) { }
2040#endif 2040#endif
2041 2041
2042#ifdef CONFIG_CFS_BANDWIDTH
2043extern unsigned int sysctl_sched_cfs_bandwidth_slice;
2044#endif
2045
2042#ifdef CONFIG_RT_MUTEXES 2046#ifdef CONFIG_RT_MUTEXES
2043extern int rt_mutex_getprio(struct task_struct *p); 2047extern int rt_mutex_getprio(struct task_struct *p);
2044extern void rt_mutex_setprio(struct task_struct *p, int prio); 2048extern void rt_mutex_setprio(struct task_struct *p, int prio);
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index f6334782a593..959ff18b63b6 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -100,7 +100,7 @@ static inline long __trace_sched_switch_state(struct task_struct *p)
100 * For all intents and purposes a preempted task is a running task. 100 * For all intents and purposes a preempted task is a running task.
101 */ 101 */
102 if (task_thread_info(p)->preempt_count & PREEMPT_ACTIVE) 102 if (task_thread_info(p)->preempt_count & PREEMPT_ACTIVE)
103 state = TASK_RUNNING; 103 state = TASK_RUNNING | TASK_STATE_MAX;
104#endif 104#endif
105 105
106 return state; 106 return state;
@@ -137,13 +137,14 @@ TRACE_EVENT(sched_switch,
137 __entry->next_prio = next->prio; 137 __entry->next_prio = next->prio;
138 ), 138 ),
139 139
140 TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s ==> next_comm=%s next_pid=%d next_prio=%d", 140 TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d",
141 __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, 141 __entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
142 __entry->prev_state ? 142 __entry->prev_state & (TASK_STATE_MAX-1) ?
143 __print_flags(__entry->prev_state, "|", 143 __print_flags(__entry->prev_state & (TASK_STATE_MAX-1), "|",
144 { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" }, 144 { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" },
145 { 16, "Z" }, { 32, "X" }, { 64, "x" }, 145 { 16, "Z" }, { 32, "X" }, { 64, "x" },
146 { 128, "W" }) : "R", 146 { 128, "W" }) : "R",
147 __entry->prev_state & TASK_STATE_MAX ? "+" : "",
147 __entry->next_comm, __entry->next_pid, __entry->next_prio) 148 __entry->next_comm, __entry->next_pid, __entry->next_prio)
148); 149);
149 150
diff --git a/init/Kconfig b/init/Kconfig
index d62778390e55..d19b3a77ab44 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -715,6 +715,18 @@ config FAIR_GROUP_SCHED
715 depends on CGROUP_SCHED 715 depends on CGROUP_SCHED
716 default CGROUP_SCHED 716 default CGROUP_SCHED
717 717
718config CFS_BANDWIDTH
719 bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED"
720 depends on EXPERIMENTAL
721 depends on FAIR_GROUP_SCHED
722 default n
723 help
724 This option allows users to define CPU bandwidth rates (limits) for
725 tasks running within the fair group scheduler. Groups with no limit
726 set are considered to be unconstrained and will run with no
727 restriction.
728 See tip/Documentation/scheduler/sched-bwc.txt for more information.
729
718config RT_GROUP_SCHED 730config RT_GROUP_SCHED
719 bool "Group scheduling for SCHED_RR/FIFO" 731 bool "Group scheduling for SCHED_RR/FIFO"
720 depends on EXPERIMENTAL 732 depends on EXPERIMENTAL
diff --git a/kernel/sched.c b/kernel/sched.c
index b50b0f0c9aa9..c5cf15e1eb57 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -196,10 +196,28 @@ static inline int rt_bandwidth_enabled(void)
196 return sysctl_sched_rt_runtime >= 0; 196 return sysctl_sched_rt_runtime >= 0;
197} 197}
198 198
199static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 199static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
200{ 200{
201 ktime_t now; 201 unsigned long delta;
202 ktime_t soft, hard, now;
203
204 for (;;) {
205 if (hrtimer_active(period_timer))
206 break;
207
208 now = hrtimer_cb_get_time(period_timer);
209 hrtimer_forward(period_timer, now, period);
210
211 soft = hrtimer_get_softexpires(period_timer);
212 hard = hrtimer_get_expires(period_timer);
213 delta = ktime_to_ns(ktime_sub(hard, soft));
214 __hrtimer_start_range_ns(period_timer, soft, delta,
215 HRTIMER_MODE_ABS_PINNED, 0);
216 }
217}
202 218
219static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
220{
203 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) 221 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
204 return; 222 return;
205 223
@@ -207,22 +225,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
207 return; 225 return;
208 226
209 raw_spin_lock(&rt_b->rt_runtime_lock); 227 raw_spin_lock(&rt_b->rt_runtime_lock);
210 for (;;) { 228 start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
211 unsigned long delta;
212 ktime_t soft, hard;
213
214 if (hrtimer_active(&rt_b->rt_period_timer))
215 break;
216
217 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
218 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
219
220 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
221 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
222 delta = ktime_to_ns(ktime_sub(hard, soft));
223 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
224 HRTIMER_MODE_ABS_PINNED, 0);
225 }
226 raw_spin_unlock(&rt_b->rt_runtime_lock); 229 raw_spin_unlock(&rt_b->rt_runtime_lock);
227} 230}
228 231
@@ -247,6 +250,24 @@ struct cfs_rq;
247 250
248static LIST_HEAD(task_groups); 251static LIST_HEAD(task_groups);
249 252
253struct cfs_bandwidth {
254#ifdef CONFIG_CFS_BANDWIDTH
255 raw_spinlock_t lock;
256 ktime_t period;
257 u64 quota, runtime;
258 s64 hierarchal_quota;
259 u64 runtime_expires;
260
261 int idle, timer_active;
262 struct hrtimer period_timer, slack_timer;
263 struct list_head throttled_cfs_rq;
264
265 /* statistics */
266 int nr_periods, nr_throttled;
267 u64 throttled_time;
268#endif
269};
270
250/* task group related information */ 271/* task group related information */
251struct task_group { 272struct task_group {
252 struct cgroup_subsys_state css; 273 struct cgroup_subsys_state css;
@@ -278,6 +299,8 @@ struct task_group {
278#ifdef CONFIG_SCHED_AUTOGROUP 299#ifdef CONFIG_SCHED_AUTOGROUP
279 struct autogroup *autogroup; 300 struct autogroup *autogroup;
280#endif 301#endif
302
303 struct cfs_bandwidth cfs_bandwidth;
281}; 304};
282 305
283/* task_group_lock serializes the addition/removal of task groups */ 306/* task_group_lock serializes the addition/removal of task groups */
@@ -311,7 +334,7 @@ struct task_group root_task_group;
311/* CFS-related fields in a runqueue */ 334/* CFS-related fields in a runqueue */
312struct cfs_rq { 335struct cfs_rq {
313 struct load_weight load; 336 struct load_weight load;
314 unsigned long nr_running; 337 unsigned long nr_running, h_nr_running;
315 338
316 u64 exec_clock; 339 u64 exec_clock;
317 u64 min_vruntime; 340 u64 min_vruntime;
@@ -377,9 +400,120 @@ struct cfs_rq {
377 400
378 unsigned long load_contribution; 401 unsigned long load_contribution;
379#endif 402#endif
403#ifdef CONFIG_CFS_BANDWIDTH
404 int runtime_enabled;
405 u64 runtime_expires;
406 s64 runtime_remaining;
407
408 u64 throttled_timestamp;
409 int throttled, throttle_count;
410 struct list_head throttled_list;
411#endif
380#endif 412#endif
381}; 413};
382 414
415#ifdef CONFIG_FAIR_GROUP_SCHED
416#ifdef CONFIG_CFS_BANDWIDTH
417static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
418{
419 return &tg->cfs_bandwidth;
420}
421
422static inline u64 default_cfs_period(void);
423static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
424static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
425
426static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
427{
428 struct cfs_bandwidth *cfs_b =
429 container_of(timer, struct cfs_bandwidth, slack_timer);
430 do_sched_cfs_slack_timer(cfs_b);
431
432 return HRTIMER_NORESTART;
433}
434
435static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
436{
437 struct cfs_bandwidth *cfs_b =
438 container_of(timer, struct cfs_bandwidth, period_timer);
439 ktime_t now;
440 int overrun;
441 int idle = 0;
442
443 for (;;) {
444 now = hrtimer_cb_get_time(timer);
445 overrun = hrtimer_forward(timer, now, cfs_b->period);
446
447 if (!overrun)
448 break;
449
450 idle = do_sched_cfs_period_timer(cfs_b, overrun);
451 }
452
453 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
454}
455
456static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
457{
458 raw_spin_lock_init(&cfs_b->lock);
459 cfs_b->runtime = 0;
460 cfs_b->quota = RUNTIME_INF;
461 cfs_b->period = ns_to_ktime(default_cfs_period());
462
463 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
464 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
465 cfs_b->period_timer.function = sched_cfs_period_timer;
466 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
467 cfs_b->slack_timer.function = sched_cfs_slack_timer;
468}
469
470static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
471{
472 cfs_rq->runtime_enabled = 0;
473 INIT_LIST_HEAD(&cfs_rq->throttled_list);
474}
475
476/* requires cfs_b->lock, may release to reprogram timer */
477static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
478{
479 /*
480 * The timer may be active because we're trying to set a new bandwidth
481 * period or because we're racing with the tear-down path
482 * (timer_active==0 becomes visible before the hrtimer call-back
483 * terminates). In either case we ensure that it's re-programmed
484 */
485 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
486 raw_spin_unlock(&cfs_b->lock);
487 /* ensure cfs_b->lock is available while we wait */
488 hrtimer_cancel(&cfs_b->period_timer);
489
490 raw_spin_lock(&cfs_b->lock);
491 /* if someone else restarted the timer then we're done */
492 if (cfs_b->timer_active)
493 return;
494 }
495
496 cfs_b->timer_active = 1;
497 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
498}
499
500static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
501{
502 hrtimer_cancel(&cfs_b->period_timer);
503 hrtimer_cancel(&cfs_b->slack_timer);
504}
505#else
506static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
507static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
508static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
509
510static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
511{
512 return NULL;
513}
514#endif /* CONFIG_CFS_BANDWIDTH */
515#endif /* CONFIG_FAIR_GROUP_SCHED */
516
383/* Real-Time classes' related field in a runqueue: */ 517/* Real-Time classes' related field in a runqueue: */
384struct rt_rq { 518struct rt_rq {
385 struct rt_prio_array active; 519 struct rt_prio_array active;
@@ -520,8 +654,6 @@ struct rq {
520 int cpu; 654 int cpu;
521 int online; 655 int online;
522 656
523 unsigned long avg_load_per_task;
524
525 u64 rt_avg; 657 u64 rt_avg;
526 u64 age_stamp; 658 u64 age_stamp;
527 u64 idle_stamp; 659 u64 idle_stamp;
@@ -1471,24 +1603,28 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1471 update_load_sub(&rq->load, load); 1603 update_load_sub(&rq->load, load);
1472} 1604}
1473 1605
1474#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) 1606#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
1607 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
1475typedef int (*tg_visitor)(struct task_group *, void *); 1608typedef int (*tg_visitor)(struct task_group *, void *);
1476 1609
1477/* 1610/*
1478 * Iterate the full tree, calling @down when first entering a node and @up when 1611 * Iterate task_group tree rooted at *from, calling @down when first entering a
1479 * leaving it for the final time. 1612 * node and @up when leaving it for the final time.
1613 *
1614 * Caller must hold rcu_lock or sufficient equivalent.
1480 */ 1615 */
1481static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) 1616static int walk_tg_tree_from(struct task_group *from,
1617 tg_visitor down, tg_visitor up, void *data)
1482{ 1618{
1483 struct task_group *parent, *child; 1619 struct task_group *parent, *child;
1484 int ret; 1620 int ret;
1485 1621
1486 rcu_read_lock(); 1622 parent = from;
1487 parent = &root_task_group; 1623
1488down: 1624down:
1489 ret = (*down)(parent, data); 1625 ret = (*down)(parent, data);
1490 if (ret) 1626 if (ret)
1491 goto out_unlock; 1627 goto out;
1492 list_for_each_entry_rcu(child, &parent->children, siblings) { 1628 list_for_each_entry_rcu(child, &parent->children, siblings) {
1493 parent = child; 1629 parent = child;
1494 goto down; 1630 goto down;
@@ -1497,19 +1633,29 @@ up:
1497 continue; 1633 continue;
1498 } 1634 }
1499 ret = (*up)(parent, data); 1635 ret = (*up)(parent, data);
1500 if (ret) 1636 if (ret || parent == from)
1501 goto out_unlock; 1637 goto out;
1502 1638
1503 child = parent; 1639 child = parent;
1504 parent = parent->parent; 1640 parent = parent->parent;
1505 if (parent) 1641 if (parent)
1506 goto up; 1642 goto up;
1507out_unlock: 1643out:
1508 rcu_read_unlock();
1509
1510 return ret; 1644 return ret;
1511} 1645}
1512 1646
1647/*
1648 * Iterate the full tree, calling @down when first entering a node and @up when
1649 * leaving it for the final time.
1650 *
1651 * Caller must hold rcu_lock or sufficient equivalent.
1652 */
1653
1654static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1655{
1656 return walk_tg_tree_from(&root_task_group, down, up, data);
1657}
1658
1513static int tg_nop(struct task_group *tg, void *data) 1659static int tg_nop(struct task_group *tg, void *data)
1514{ 1660{
1515 return 0; 1661 return 0;
@@ -1569,11 +1715,9 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1569 unsigned long nr_running = ACCESS_ONCE(rq->nr_running); 1715 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1570 1716
1571 if (nr_running) 1717 if (nr_running)
1572 rq->avg_load_per_task = rq->load.weight / nr_running; 1718 return rq->load.weight / nr_running;
1573 else
1574 rq->avg_load_per_task = 0;
1575 1719
1576 return rq->avg_load_per_task; 1720 return 0;
1577} 1721}
1578 1722
1579#ifdef CONFIG_PREEMPT 1723#ifdef CONFIG_PREEMPT
@@ -1806,7 +1950,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1806 rq->nr_uninterruptible--; 1950 rq->nr_uninterruptible--;
1807 1951
1808 enqueue_task(rq, p, flags); 1952 enqueue_task(rq, p, flags);
1809 inc_nr_running(rq);
1810} 1953}
1811 1954
1812/* 1955/*
@@ -1818,7 +1961,6 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1818 rq->nr_uninterruptible++; 1961 rq->nr_uninterruptible++;
1819 1962
1820 dequeue_task(rq, p, flags); 1963 dequeue_task(rq, p, flags);
1821 dec_nr_running(rq);
1822} 1964}
1823 1965
1824#ifdef CONFIG_IRQ_TIME_ACCOUNTING 1966#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -2848,19 +2990,23 @@ void sched_fork(struct task_struct *p)
2848 p->state = TASK_RUNNING; 2990 p->state = TASK_RUNNING;
2849 2991
2850 /* 2992 /*
2993 * Make sure we do not leak PI boosting priority to the child.
2994 */
2995 p->prio = current->normal_prio;
2996
2997 /*
2851 * Revert to default priority/policy on fork if requested. 2998 * Revert to default priority/policy on fork if requested.
2852 */ 2999 */
2853 if (unlikely(p->sched_reset_on_fork)) { 3000 if (unlikely(p->sched_reset_on_fork)) {
2854 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { 3001 if (task_has_rt_policy(p)) {
2855 p->policy = SCHED_NORMAL; 3002 p->policy = SCHED_NORMAL;
2856 p->normal_prio = p->static_prio;
2857 }
2858
2859 if (PRIO_TO_NICE(p->static_prio) < 0) {
2860 p->static_prio = NICE_TO_PRIO(0); 3003 p->static_prio = NICE_TO_PRIO(0);
2861 p->normal_prio = p->static_prio; 3004 p->rt_priority = 0;
2862 set_load_weight(p); 3005 } else if (PRIO_TO_NICE(p->static_prio) < 0)
2863 } 3006 p->static_prio = NICE_TO_PRIO(0);
3007
3008 p->prio = p->normal_prio = __normal_prio(p);
3009 set_load_weight(p);
2864 3010
2865 /* 3011 /*
2866 * We don't need the reset flag anymore after the fork. It has 3012 * We don't need the reset flag anymore after the fork. It has
@@ -2869,11 +3015,6 @@ void sched_fork(struct task_struct *p)
2869 p->sched_reset_on_fork = 0; 3015 p->sched_reset_on_fork = 0;
2870 } 3016 }
2871 3017
2872 /*
2873 * Make sure we do not leak PI boosting priority to the child.
2874 */
2875 p->prio = current->normal_prio;
2876
2877 if (!rt_prio(p->prio)) 3018 if (!rt_prio(p->prio))
2878 p->sched_class = &fair_sched_class; 3019 p->sched_class = &fair_sched_class;
2879 3020
@@ -4239,7 +4380,7 @@ pick_next_task(struct rq *rq)
4239 * Optimization: we know that if all tasks are in 4380 * Optimization: we know that if all tasks are in
4240 * the fair class we can call that function directly: 4381 * the fair class we can call that function directly:
4241 */ 4382 */
4242 if (likely(rq->nr_running == rq->cfs.nr_running)) { 4383 if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
4243 p = fair_sched_class.pick_next_task(rq); 4384 p = fair_sched_class.pick_next_task(rq);
4244 if (likely(p)) 4385 if (likely(p))
4245 return p; 4386 return p;
@@ -6197,6 +6338,30 @@ static void calc_global_load_remove(struct rq *rq)
6197 rq->calc_load_active = 0; 6338 rq->calc_load_active = 0;
6198} 6339}
6199 6340
6341#ifdef CONFIG_CFS_BANDWIDTH
6342static void unthrottle_offline_cfs_rqs(struct rq *rq)
6343{
6344 struct cfs_rq *cfs_rq;
6345
6346 for_each_leaf_cfs_rq(rq, cfs_rq) {
6347 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
6348
6349 if (!cfs_rq->runtime_enabled)
6350 continue;
6351
6352 /*
6353 * clock_task is not advancing so we just need to make sure
6354 * there's some valid quota amount
6355 */
6356 cfs_rq->runtime_remaining = cfs_b->quota;
6357 if (cfs_rq_throttled(cfs_rq))
6358 unthrottle_cfs_rq(cfs_rq);
6359 }
6360}
6361#else
6362static void unthrottle_offline_cfs_rqs(struct rq *rq) {}
6363#endif
6364
6200/* 6365/*
6201 * Migrate all tasks from the rq, sleeping tasks will be migrated by 6366 * Migrate all tasks from the rq, sleeping tasks will be migrated by
6202 * try_to_wake_up()->select_task_rq(). 6367 * try_to_wake_up()->select_task_rq().
@@ -6222,6 +6387,9 @@ static void migrate_tasks(unsigned int dead_cpu)
6222 */ 6387 */
6223 rq->stop = NULL; 6388 rq->stop = NULL;
6224 6389
6390 /* Ensure any throttled groups are reachable by pick_next_task */
6391 unthrottle_offline_cfs_rqs(rq);
6392
6225 for ( ; ; ) { 6393 for ( ; ; ) {
6226 /* 6394 /*
6227 * There's this thread running, bail when that's the only 6395 * There's this thread running, bail when that's the only
@@ -7965,6 +8133,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7965 /* allow initial update_cfs_load() to truncate */ 8133 /* allow initial update_cfs_load() to truncate */
7966 cfs_rq->load_stamp = 1; 8134 cfs_rq->load_stamp = 1;
7967#endif 8135#endif
8136 init_cfs_rq_runtime(cfs_rq);
7968 8137
7969 tg->cfs_rq[cpu] = cfs_rq; 8138 tg->cfs_rq[cpu] = cfs_rq;
7970 tg->se[cpu] = se; 8139 tg->se[cpu] = se;
@@ -8104,6 +8273,7 @@ void __init sched_init(void)
8104 * We achieve this by letting root_task_group's tasks sit 8273 * We achieve this by letting root_task_group's tasks sit
8105 * directly in rq->cfs (i.e root_task_group->se[] = NULL). 8274 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
8106 */ 8275 */
8276 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
8107 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); 8277 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
8108#endif /* CONFIG_FAIR_GROUP_SCHED */ 8278#endif /* CONFIG_FAIR_GROUP_SCHED */
8109 8279
@@ -8345,6 +8515,8 @@ static void free_fair_sched_group(struct task_group *tg)
8345{ 8515{
8346 int i; 8516 int i;
8347 8517
8518 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
8519
8348 for_each_possible_cpu(i) { 8520 for_each_possible_cpu(i) {
8349 if (tg->cfs_rq) 8521 if (tg->cfs_rq)
8350 kfree(tg->cfs_rq[i]); 8522 kfree(tg->cfs_rq[i]);
@@ -8372,6 +8544,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8372 8544
8373 tg->shares = NICE_0_LOAD; 8545 tg->shares = NICE_0_LOAD;
8374 8546
8547 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
8548
8375 for_each_possible_cpu(i) { 8549 for_each_possible_cpu(i) {
8376 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 8550 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8377 GFP_KERNEL, cpu_to_node(i)); 8551 GFP_KERNEL, cpu_to_node(i));
@@ -8647,12 +8821,7 @@ unsigned long sched_group_shares(struct task_group *tg)
8647} 8821}
8648#endif 8822#endif
8649 8823
8650#ifdef CONFIG_RT_GROUP_SCHED 8824#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
8651/*
8652 * Ensure that the real time constraints are schedulable.
8653 */
8654static DEFINE_MUTEX(rt_constraints_mutex);
8655
8656static unsigned long to_ratio(u64 period, u64 runtime) 8825static unsigned long to_ratio(u64 period, u64 runtime)
8657{ 8826{
8658 if (runtime == RUNTIME_INF) 8827 if (runtime == RUNTIME_INF)
@@ -8660,6 +8829,13 @@ static unsigned long to_ratio(u64 period, u64 runtime)
8660 8829
8661 return div64_u64(runtime << 20, period); 8830 return div64_u64(runtime << 20, period);
8662} 8831}
8832#endif
8833
8834#ifdef CONFIG_RT_GROUP_SCHED
8835/*
8836 * Ensure that the real time constraints are schedulable.
8837 */
8838static DEFINE_MUTEX(rt_constraints_mutex);
8663 8839
8664/* Must be called with tasklist_lock held */ 8840/* Must be called with tasklist_lock held */
8665static inline int tg_has_rt_tasks(struct task_group *tg) 8841static inline int tg_has_rt_tasks(struct task_group *tg)
@@ -8680,7 +8856,7 @@ struct rt_schedulable_data {
8680 u64 rt_runtime; 8856 u64 rt_runtime;
8681}; 8857};
8682 8858
8683static int tg_schedulable(struct task_group *tg, void *data) 8859static int tg_rt_schedulable(struct task_group *tg, void *data)
8684{ 8860{
8685 struct rt_schedulable_data *d = data; 8861 struct rt_schedulable_data *d = data;
8686 struct task_group *child; 8862 struct task_group *child;
@@ -8738,16 +8914,22 @@ static int tg_schedulable(struct task_group *tg, void *data)
8738 8914
8739static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8915static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8740{ 8916{
8917 int ret;
8918
8741 struct rt_schedulable_data data = { 8919 struct rt_schedulable_data data = {
8742 .tg = tg, 8920 .tg = tg,
8743 .rt_period = period, 8921 .rt_period = period,
8744 .rt_runtime = runtime, 8922 .rt_runtime = runtime,
8745 }; 8923 };
8746 8924
8747 return walk_tg_tree(tg_schedulable, tg_nop, &data); 8925 rcu_read_lock();
8926 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
8927 rcu_read_unlock();
8928
8929 return ret;
8748} 8930}
8749 8931
8750static int tg_set_bandwidth(struct task_group *tg, 8932static int tg_set_rt_bandwidth(struct task_group *tg,
8751 u64 rt_period, u64 rt_runtime) 8933 u64 rt_period, u64 rt_runtime)
8752{ 8934{
8753 int i, err = 0; 8935 int i, err = 0;
@@ -8786,7 +8968,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
8786 if (rt_runtime_us < 0) 8968 if (rt_runtime_us < 0)
8787 rt_runtime = RUNTIME_INF; 8969 rt_runtime = RUNTIME_INF;
8788 8970
8789 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8971 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
8790} 8972}
8791 8973
8792long sched_group_rt_runtime(struct task_group *tg) 8974long sched_group_rt_runtime(struct task_group *tg)
@@ -8811,7 +8993,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
8811 if (rt_period == 0) 8993 if (rt_period == 0)
8812 return -EINVAL; 8994 return -EINVAL;
8813 8995
8814 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8996 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
8815} 8997}
8816 8998
8817long sched_group_rt_period(struct task_group *tg) 8999long sched_group_rt_period(struct task_group *tg)
@@ -9001,6 +9183,238 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
9001 9183
9002 return (u64) scale_load_down(tg->shares); 9184 return (u64) scale_load_down(tg->shares);
9003} 9185}
9186
9187#ifdef CONFIG_CFS_BANDWIDTH
9188static DEFINE_MUTEX(cfs_constraints_mutex);
9189
9190const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
9191const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
9192
9193static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
9194
9195static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9196{
9197 int i, ret = 0, runtime_enabled;
9198 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
9199
9200 if (tg == &root_task_group)
9201 return -EINVAL;
9202
9203 /*
 9204	 * Ensure we have at least some amount of bandwidth every period. This is
9205 * to prevent reaching a state of large arrears when throttled via
9206 * entity_tick() resulting in prolonged exit starvation.
9207 */
9208 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
9209 return -EINVAL;
9210
9211 /*
 9212	 * Likewise, bound things on the other side by preventing insane quota
9213 * periods. This also allows us to normalize in computing quota
9214 * feasibility.
9215 */
9216 if (period > max_cfs_quota_period)
9217 return -EINVAL;
9218
9219 mutex_lock(&cfs_constraints_mutex);
9220 ret = __cfs_schedulable(tg, period, quota);
9221 if (ret)
9222 goto out_unlock;
9223
9224 runtime_enabled = quota != RUNTIME_INF;
9225 raw_spin_lock_irq(&cfs_b->lock);
9226 cfs_b->period = ns_to_ktime(period);
9227 cfs_b->quota = quota;
9228
9229 __refill_cfs_bandwidth_runtime(cfs_b);
9230 /* restart the period timer (if active) to handle new period expiry */
9231 if (runtime_enabled && cfs_b->timer_active) {
9232 /* force a reprogram */
9233 cfs_b->timer_active = 0;
9234 __start_cfs_bandwidth(cfs_b);
9235 }
9236 raw_spin_unlock_irq(&cfs_b->lock);
9237
9238 for_each_possible_cpu(i) {
9239 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
9240 struct rq *rq = rq_of(cfs_rq);
9241
9242 raw_spin_lock_irq(&rq->lock);
9243 cfs_rq->runtime_enabled = runtime_enabled;
9244 cfs_rq->runtime_remaining = 0;
9245
9246 if (cfs_rq_throttled(cfs_rq))
9247 unthrottle_cfs_rq(cfs_rq);
9248 raw_spin_unlock_irq(&rq->lock);
9249 }
9250out_unlock:
9251 mutex_unlock(&cfs_constraints_mutex);
9252
9253 return ret;
9254}
9255
9256int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
9257{
9258 u64 quota, period;
9259
9260 period = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
9261 if (cfs_quota_us < 0)
9262 quota = RUNTIME_INF;
9263 else
9264 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
9265
9266 return tg_set_cfs_bandwidth(tg, period, quota);
9267}
9268
9269long tg_get_cfs_quota(struct task_group *tg)
9270{
9271 u64 quota_us;
9272
9273 if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF)
9274 return -1;
9275
9276 quota_us = tg_cfs_bandwidth(tg)->quota;
9277 do_div(quota_us, NSEC_PER_USEC);
9278
9279 return quota_us;
9280}
9281
9282int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
9283{
9284 u64 quota, period;
9285
9286 period = (u64)cfs_period_us * NSEC_PER_USEC;
9287 quota = tg_cfs_bandwidth(tg)->quota;
9288
9289 if (period <= 0)
9290 return -EINVAL;
9291
9292 return tg_set_cfs_bandwidth(tg, period, quota);
9293}
9294
9295long tg_get_cfs_period(struct task_group *tg)
9296{
9297 u64 cfs_period_us;
9298
9299 cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
9300 do_div(cfs_period_us, NSEC_PER_USEC);
9301
9302 return cfs_period_us;
9303}
9304
9305static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
9306{
9307 return tg_get_cfs_quota(cgroup_tg(cgrp));
9308}
9309
9310static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
9311 s64 cfs_quota_us)
9312{
9313 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
9314}
9315
9316static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
9317{
9318 return tg_get_cfs_period(cgroup_tg(cgrp));
9319}
9320
9321static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
9322 u64 cfs_period_us)
9323{
9324 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
9325}
9326
9327struct cfs_schedulable_data {
9328 struct task_group *tg;
9329 u64 period, quota;
9330};
9331
9332/*
9333 * normalize group quota/period to be quota/max_period
9334 * note: units are usecs
9335 */
9336static u64 normalize_cfs_quota(struct task_group *tg,
9337 struct cfs_schedulable_data *d)
9338{
9339 u64 quota, period;
9340
9341 if (tg == d->tg) {
9342 period = d->period;
9343 quota = d->quota;
9344 } else {
9345 period = tg_get_cfs_period(tg);
9346 quota = tg_get_cfs_quota(tg);
9347 }
9348
9349 /* note: these should typically be equivalent */
9350 if (quota == RUNTIME_INF || quota == -1)
9351 return RUNTIME_INF;
9352
9353 return to_ratio(period, quota);
9354}
9355
9356static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
9357{
9358 struct cfs_schedulable_data *d = data;
9359 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
9360 s64 quota = 0, parent_quota = -1;
9361
9362 if (!tg->parent) {
9363 quota = RUNTIME_INF;
9364 } else {
9365 struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent);
9366
9367 quota = normalize_cfs_quota(tg, d);
9368 parent_quota = parent_b->hierarchal_quota;
9369
9370 /*
9371 * ensure max(child_quota) <= parent_quota, inherit when no
9372 * limit is set
9373 */
9374 if (quota == RUNTIME_INF)
9375 quota = parent_quota;
9376 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
9377 return -EINVAL;
9378 }
9379 cfs_b->hierarchal_quota = quota;
9380
9381 return 0;
9382}
9383
9384static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
9385{
9386 int ret;
9387 struct cfs_schedulable_data data = {
9388 .tg = tg,
9389 .period = period,
9390 .quota = quota,
9391 };
9392
9393 if (quota != RUNTIME_INF) {
9394 do_div(data.period, NSEC_PER_USEC);
9395 do_div(data.quota, NSEC_PER_USEC);
9396 }
9397
9398 rcu_read_lock();
9399 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
9400 rcu_read_unlock();
9401
9402 return ret;
9403}
9404
9405static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
9406 struct cgroup_map_cb *cb)
9407{
9408 struct task_group *tg = cgroup_tg(cgrp);
9409 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
9410
9411 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
9412 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
9413 cb->fill(cb, "throttled_time", cfs_b->throttled_time);
9414
9415 return 0;
9416}
9417#endif /* CONFIG_CFS_BANDWIDTH */
9004#endif /* CONFIG_FAIR_GROUP_SCHED */ 9418#endif /* CONFIG_FAIR_GROUP_SCHED */
9005 9419
9006#ifdef CONFIG_RT_GROUP_SCHED 9420#ifdef CONFIG_RT_GROUP_SCHED
@@ -9035,6 +9449,22 @@ static struct cftype cpu_files[] = {
9035 .write_u64 = cpu_shares_write_u64, 9449 .write_u64 = cpu_shares_write_u64,
9036 }, 9450 },
9037#endif 9451#endif
9452#ifdef CONFIG_CFS_BANDWIDTH
9453 {
9454 .name = "cfs_quota_us",
9455 .read_s64 = cpu_cfs_quota_read_s64,
9456 .write_s64 = cpu_cfs_quota_write_s64,
9457 },
9458 {
9459 .name = "cfs_period_us",
9460 .read_u64 = cpu_cfs_period_read_u64,
9461 .write_u64 = cpu_cfs_period_write_u64,
9462 },
9463 {
9464 .name = "stat",
9465 .read_map = cpu_stats_show,
9466 },
9467#endif
9038#ifdef CONFIG_RT_GROUP_SCHED 9468#ifdef CONFIG_RT_GROUP_SCHED
9039 { 9469 {
9040 .name = "rt_runtime_us", 9470 .name = "rt_runtime_us",
@@ -9344,4 +9774,3 @@ struct cgroup_subsys cpuacct_subsys = {
9344 .subsys_id = cpuacct_subsys_id, 9774 .subsys_id = cpuacct_subsys_id,
9345}; 9775};
9346#endif /* CONFIG_CGROUP_CPUACCT */ 9776#endif /* CONFIG_CGROUP_CPUACCT */
9347
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 2722dc1b4138..a86cf9d9eb11 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -47,9 +47,6 @@ static int convert_prio(int prio)
47 return cpupri; 47 return cpupri;
48} 48}
49 49
50#define for_each_cpupri_active(array, idx) \
51 for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES)
52
53/** 50/**
54 * cpupri_find - find the best (lowest-pri) CPU in the system 51 * cpupri_find - find the best (lowest-pri) CPU in the system
55 * @cp: The cpupri context 52 * @cp: The cpupri context
@@ -71,11 +68,38 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
71 int idx = 0; 68 int idx = 0;
72 int task_pri = convert_prio(p->prio); 69 int task_pri = convert_prio(p->prio);
73 70
74 for_each_cpupri_active(cp->pri_active, idx) { 71 if (task_pri >= MAX_RT_PRIO)
75 struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; 72 return 0;
76 73
77 if (idx >= task_pri) 74 for (idx = 0; idx < task_pri; idx++) {
78 break; 75 struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
76 int skip = 0;
77
78 if (!atomic_read(&(vec)->count))
79 skip = 1;
80 /*
81 * When looking at the vector, we need to read the counter,
82 * do a memory barrier, then read the mask.
83 *
 84	 * Note: This is still all racy, but we can deal with it.
85 * Ideally, we only want to look at masks that are set.
86 *
87 * If a mask is not set, then the only thing wrong is that we
88 * did a little more work than necessary.
89 *
90 * If we read a zero count but the mask is set, because of the
91 * memory barriers, that can only happen when the highest prio
92 * task for a run queue has left the run queue, in which case,
93 * it will be followed by a pull. If the task we are processing
94 * fails to find a proper place to go, that pull request will
95 * pull this task if the run queue is running at a lower
96 * priority.
97 */
98 smp_rmb();
99
100 /* Need to do the rmb for every iteration */
101 if (skip)
102 continue;
79 103
80 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) 104 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
81 continue; 105 continue;
@@ -115,7 +139,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
115{ 139{
116 int *currpri = &cp->cpu_to_pri[cpu]; 140 int *currpri = &cp->cpu_to_pri[cpu];
117 int oldpri = *currpri; 141 int oldpri = *currpri;
118 unsigned long flags; 142 int do_mb = 0;
119 143
120 newpri = convert_prio(newpri); 144 newpri = convert_prio(newpri);
121 145
@@ -128,32 +152,46 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
128 * If the cpu was currently mapped to a different value, we 152 * If the cpu was currently mapped to a different value, we
129 * need to map it to the new value then remove the old value. 153 * need to map it to the new value then remove the old value.
130 * Note, we must add the new value first, otherwise we risk the 154 * Note, we must add the new value first, otherwise we risk the
131 * cpu being cleared from pri_active, and this cpu could be 155 * cpu being missed by the priority loop in cpupri_find.
132 * missed for a push or pull.
133 */ 156 */
134 if (likely(newpri != CPUPRI_INVALID)) { 157 if (likely(newpri != CPUPRI_INVALID)) {
135 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; 158 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
136 159
137 raw_spin_lock_irqsave(&vec->lock, flags);
138
139 cpumask_set_cpu(cpu, vec->mask); 160 cpumask_set_cpu(cpu, vec->mask);
140 vec->count++; 161 /*
141 if (vec->count == 1) 162 * When adding a new vector, we update the mask first,
142 set_bit(newpri, cp->pri_active); 163 * do a write memory barrier, and then update the count, to
143 164 * make sure the vector is visible when count is set.
144 raw_spin_unlock_irqrestore(&vec->lock, flags); 165 */
166 smp_mb__before_atomic_inc();
167 atomic_inc(&(vec)->count);
168 do_mb = 1;
145 } 169 }
146 if (likely(oldpri != CPUPRI_INVALID)) { 170 if (likely(oldpri != CPUPRI_INVALID)) {
147 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; 171 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
148 172
149 raw_spin_lock_irqsave(&vec->lock, flags); 173 /*
150 174 * Because the order of modification of the vec->count
151 vec->count--; 175 * is important, we must make sure that the update
152 if (!vec->count) 176 * of the new prio is seen before we decrement the
153 clear_bit(oldpri, cp->pri_active); 177 * old prio. This makes sure that the loop sees
178 * one or the other when we raise the priority of
179 * the run queue. We don't care about when we lower the
180 * priority, as that will trigger an rt pull anyway.
181 *
182 * We only need to do a memory barrier if we updated
183 * the new priority vec.
184 */
185 if (do_mb)
186 smp_mb__after_atomic_inc();
187
188 /*
189 * When removing from the vector, we decrement the counter first
190 * do a memory barrier and then clear the mask.
191 */
192 atomic_dec(&(vec)->count);
193 smp_mb__after_atomic_inc();
154 cpumask_clear_cpu(cpu, vec->mask); 194 cpumask_clear_cpu(cpu, vec->mask);
155
156 raw_spin_unlock_irqrestore(&vec->lock, flags);
157 } 195 }
158 196
159 *currpri = newpri; 197 *currpri = newpri;
@@ -175,8 +213,7 @@ int cpupri_init(struct cpupri *cp)
175 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { 213 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
176 struct cpupri_vec *vec = &cp->pri_to_cpu[i]; 214 struct cpupri_vec *vec = &cp->pri_to_cpu[i];
177 215
178 raw_spin_lock_init(&vec->lock); 216 atomic_set(&vec->count, 0);
179 vec->count = 0;
180 if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) 217 if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
181 goto cleanup; 218 goto cleanup;
182 } 219 }
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 9fc7d386fea4..f6d756173491 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -4,7 +4,6 @@
4#include <linux/sched.h> 4#include <linux/sched.h>
5 5
6#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) 6#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
7#define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES)
8 7
9#define CPUPRI_INVALID -1 8#define CPUPRI_INVALID -1
10#define CPUPRI_IDLE 0 9#define CPUPRI_IDLE 0
@@ -12,14 +11,12 @@
12/* values 2-101 are RT priorities 0-99 */ 11/* values 2-101 are RT priorities 0-99 */
13 12
14struct cpupri_vec { 13struct cpupri_vec {
15 raw_spinlock_t lock; 14 atomic_t count;
16 int count; 15 cpumask_var_t mask;
17 cpumask_var_t mask;
18}; 16};
19 17
20struct cpupri { 18struct cpupri {
21 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; 19 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
22 long pri_active[CPUPRI_NR_PRI_WORDS];
23 int cpu_to_pri[NR_CPUS]; 20 int cpu_to_pri[NR_CPUS];
24}; 21};
25 22
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index bc8ee9993814..fef0bfde7c8c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -89,6 +89,20 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
89 */ 89 */
90unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; 90unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
91 91
92#ifdef CONFIG_CFS_BANDWIDTH
93/*
94 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
95 * each time a cfs_rq requests quota.
96 *
97 * Note: in the case that the slice exceeds the runtime remaining (either due
98 * to consumption or the quota being specified to be smaller than the slice)
99 * we will always only issue the remaining available time.
100 *
101 * default: 5 msec, units: microseconds
102 */
103unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
104#endif
105
92static const struct sched_class fair_sched_class; 106static const struct sched_class fair_sched_class;
93 107
94/************************************************************** 108/**************************************************************
@@ -292,6 +306,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
292 306
293#endif /* CONFIG_FAIR_GROUP_SCHED */ 307#endif /* CONFIG_FAIR_GROUP_SCHED */
294 308
309static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
310 unsigned long delta_exec);
295 311
296/************************************************************** 312/**************************************************************
297 * Scheduling class tree data structure manipulation methods: 313 * Scheduling class tree data structure manipulation methods:
@@ -583,6 +599,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
583 cpuacct_charge(curtask, delta_exec); 599 cpuacct_charge(curtask, delta_exec);
584 account_group_exec_runtime(curtask, delta_exec); 600 account_group_exec_runtime(curtask, delta_exec);
585 } 601 }
602
603 account_cfs_rq_runtime(cfs_rq, delta_exec);
586} 604}
587 605
588static inline void 606static inline void
@@ -688,6 +706,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
688} 706}
689 707
690#ifdef CONFIG_FAIR_GROUP_SCHED 708#ifdef CONFIG_FAIR_GROUP_SCHED
709/* we need this in update_cfs_load and load-balance functions below */
710static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
691# ifdef CONFIG_SMP 711# ifdef CONFIG_SMP
692static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, 712static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
693 int global_update) 713 int global_update)
@@ -710,7 +730,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
710 u64 now, delta; 730 u64 now, delta;
711 unsigned long load = cfs_rq->load.weight; 731 unsigned long load = cfs_rq->load.weight;
712 732
713 if (cfs_rq->tg == &root_task_group) 733 if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
714 return; 734 return;
715 735
716 now = rq_of(cfs_rq)->clock_task; 736 now = rq_of(cfs_rq)->clock_task;
@@ -819,7 +839,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
819 839
820 tg = cfs_rq->tg; 840 tg = cfs_rq->tg;
821 se = tg->se[cpu_of(rq_of(cfs_rq))]; 841 se = tg->se[cpu_of(rq_of(cfs_rq))];
822 if (!se) 842 if (!se || throttled_hierarchy(cfs_rq))
823 return; 843 return;
824#ifndef CONFIG_SMP 844#ifndef CONFIG_SMP
825 if (likely(se->load.weight == tg->shares)) 845 if (likely(se->load.weight == tg->shares))
@@ -950,6 +970,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
950 se->vruntime = vruntime; 970 se->vruntime = vruntime;
951} 971}
952 972
973static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
974
953static void 975static void
954enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 976enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
955{ 977{
@@ -979,8 +1001,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
979 __enqueue_entity(cfs_rq, se); 1001 __enqueue_entity(cfs_rq, se);
980 se->on_rq = 1; 1002 se->on_rq = 1;
981 1003
982 if (cfs_rq->nr_running == 1) 1004 if (cfs_rq->nr_running == 1) {
983 list_add_leaf_cfs_rq(cfs_rq); 1005 list_add_leaf_cfs_rq(cfs_rq);
1006 check_enqueue_throttle(cfs_rq);
1007 }
984} 1008}
985 1009
986static void __clear_buddies_last(struct sched_entity *se) 1010static void __clear_buddies_last(struct sched_entity *se)
@@ -1028,6 +1052,8 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1028 __clear_buddies_skip(se); 1052 __clear_buddies_skip(se);
1029} 1053}
1030 1054
1055static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1056
1031static void 1057static void
1032dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 1058dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1033{ 1059{
@@ -1066,6 +1092,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1066 if (!(flags & DEQUEUE_SLEEP)) 1092 if (!(flags & DEQUEUE_SLEEP))
1067 se->vruntime -= cfs_rq->min_vruntime; 1093 se->vruntime -= cfs_rq->min_vruntime;
1068 1094
1095 /* return excess runtime on last dequeue */
1096 return_cfs_rq_runtime(cfs_rq);
1097
1069 update_min_vruntime(cfs_rq); 1098 update_min_vruntime(cfs_rq);
1070 update_cfs_shares(cfs_rq); 1099 update_cfs_shares(cfs_rq);
1071} 1100}
@@ -1077,6 +1106,8 @@ static void
1077check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) 1106check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1078{ 1107{
1079 unsigned long ideal_runtime, delta_exec; 1108 unsigned long ideal_runtime, delta_exec;
1109 struct sched_entity *se;
1110 s64 delta;
1080 1111
1081 ideal_runtime = sched_slice(cfs_rq, curr); 1112 ideal_runtime = sched_slice(cfs_rq, curr);
1082 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; 1113 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
@@ -1095,22 +1126,17 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1095 * narrow margin doesn't have to wait for a full slice. 1126 * narrow margin doesn't have to wait for a full slice.
1096 * This also mitigates buddy induced latencies under load. 1127 * This also mitigates buddy induced latencies under load.
1097 */ 1128 */
1098 if (!sched_feat(WAKEUP_PREEMPT))
1099 return;
1100
1101 if (delta_exec < sysctl_sched_min_granularity) 1129 if (delta_exec < sysctl_sched_min_granularity)
1102 return; 1130 return;
1103 1131
1104 if (cfs_rq->nr_running > 1) { 1132 se = __pick_first_entity(cfs_rq);
1105 struct sched_entity *se = __pick_first_entity(cfs_rq); 1133 delta = curr->vruntime - se->vruntime;
1106 s64 delta = curr->vruntime - se->vruntime;
1107 1134
1108 if (delta < 0) 1135 if (delta < 0)
1109 return; 1136 return;
1110 1137
1111 if (delta > ideal_runtime) 1138 if (delta > ideal_runtime)
1112 resched_task(rq_of(cfs_rq)->curr); 1139 resched_task(rq_of(cfs_rq)->curr);
1113 }
1114} 1140}
1115 1141
1116static void 1142static void
@@ -1185,6 +1211,8 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1185 return se; 1211 return se;
1186} 1212}
1187 1213
1214static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1215
1188static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) 1216static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1189{ 1217{
1190 /* 1218 /*
@@ -1194,6 +1222,9 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1194 if (prev->on_rq) 1222 if (prev->on_rq)
1195 update_curr(cfs_rq); 1223 update_curr(cfs_rq);
1196 1224
1225 /* throttle cfs_rqs exceeding runtime */
1226 check_cfs_rq_runtime(cfs_rq);
1227
1197 check_spread(cfs_rq, prev); 1228 check_spread(cfs_rq, prev);
1198 if (prev->on_rq) { 1229 if (prev->on_rq) {
1199 update_stats_wait_start(cfs_rq, prev); 1230 update_stats_wait_start(cfs_rq, prev);
@@ -1233,10 +1264,583 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1233 return; 1264 return;
1234#endif 1265#endif
1235 1266
1236 if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) 1267 if (cfs_rq->nr_running > 1)
1237 check_preempt_tick(cfs_rq, curr); 1268 check_preempt_tick(cfs_rq, curr);
1238} 1269}
1239 1270
1271
1272/**************************************************
1273 * CFS bandwidth control machinery
1274 */
1275
1276#ifdef CONFIG_CFS_BANDWIDTH
1277/*
1278 * default period for cfs group bandwidth.
1279 * default: 0.1s, units: nanoseconds
1280 */
1281static inline u64 default_cfs_period(void)
1282{
1283 return 100000000ULL;
1284}
1285
1286static inline u64 sched_cfs_bandwidth_slice(void)
1287{
1288 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
1289}
1290
1291/*
1292 * Replenish runtime according to assigned quota and update expiration time.
1293 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
1294 * additional synchronization around rq->lock.
1295 *
1296 * requires cfs_b->lock
1297 */
1298static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
1299{
1300 u64 now;
1301
1302 if (cfs_b->quota == RUNTIME_INF)
1303 return;
1304
1305 now = sched_clock_cpu(smp_processor_id());
1306 cfs_b->runtime = cfs_b->quota;
1307 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
1308}
1309
1310/* returns 0 on failure to allocate runtime */
1311static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1312{
1313 struct task_group *tg = cfs_rq->tg;
1314 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
1315 u64 amount = 0, min_amount, expires;
1316
1317 /* note: this is a positive sum as runtime_remaining <= 0 */
1318 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
1319
1320 raw_spin_lock(&cfs_b->lock);
1321 if (cfs_b->quota == RUNTIME_INF)
1322 amount = min_amount;
1323 else {
1324 /*
1325 * If the bandwidth pool has become inactive, then at least one
1326 * period must have elapsed since the last consumption.
1327 * Refresh the global state and ensure bandwidth timer becomes
1328 * active.
1329 */
1330 if (!cfs_b->timer_active) {
1331 __refill_cfs_bandwidth_runtime(cfs_b);
1332 __start_cfs_bandwidth(cfs_b);
1333 }
1334
1335 if (cfs_b->runtime > 0) {
1336 amount = min(cfs_b->runtime, min_amount);
1337 cfs_b->runtime -= amount;
1338 cfs_b->idle = 0;
1339 }
1340 }
1341 expires = cfs_b->runtime_expires;
1342 raw_spin_unlock(&cfs_b->lock);
1343
1344 cfs_rq->runtime_remaining += amount;
1345 /*
1346 * we may have advanced our local expiration to account for allowed
1347 * spread between our sched_clock and the one on which runtime was
1348 * issued.
1349 */
1350 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
1351 cfs_rq->runtime_expires = expires;
1352
1353 return cfs_rq->runtime_remaining > 0;
1354}
1355
1356/*
1357 * Note: This depends on the synchronization provided by sched_clock and the
1358 * fact that rq->clock snapshots this value.
1359 */
1360static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1361{
1362 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1363 struct rq *rq = rq_of(cfs_rq);
1364
1365 /* if the deadline is ahead of our clock, nothing to do */
1366 if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
1367 return;
1368
1369 if (cfs_rq->runtime_remaining < 0)
1370 return;
1371
1372 /*
1373 * If the local deadline has passed we have to consider the
1374 * possibility that our sched_clock is 'fast' and the global deadline
1375 * has not truly expired.
1376 *
 1377	 * Fortunately we can determine whether this is the case by checking
1378 * whether the global deadline has advanced.
1379 */
1380
1381 if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
1382 /* extend local deadline, drift is bounded above by 2 ticks */
1383 cfs_rq->runtime_expires += TICK_NSEC;
1384 } else {
1385 /* global deadline is ahead, expiration has passed */
1386 cfs_rq->runtime_remaining = 0;
1387 }
1388}
1389
1390static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1391 unsigned long delta_exec)
1392{
1393 /* dock delta_exec before expiring quota (as it could span periods) */
1394 cfs_rq->runtime_remaining -= delta_exec;
1395 expire_cfs_rq_runtime(cfs_rq);
1396
1397 if (likely(cfs_rq->runtime_remaining > 0))
1398 return;
1399
1400 /*
1401 * if we're unable to extend our runtime we resched so that the active
1402 * hierarchy can be throttled
1403 */
1404 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
1405 resched_task(rq_of(cfs_rq)->curr);
1406}
1407
1408static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1409 unsigned long delta_exec)
1410{
1411 if (!cfs_rq->runtime_enabled)
1412 return;
1413
1414 __account_cfs_rq_runtime(cfs_rq, delta_exec);
1415}
1416
1417static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
1418{
1419 return cfs_rq->throttled;
1420}
1421
1422/* check whether cfs_rq, or any parent, is throttled */
1423static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
1424{
1425 return cfs_rq->throttle_count;
1426}
1427
1428/*
1429 * Ensure that neither of the group entities corresponding to src_cpu or
1430 * dest_cpu are members of a throttled hierarchy when performing group
1431 * load-balance operations.
1432 */
1433static inline int throttled_lb_pair(struct task_group *tg,
1434 int src_cpu, int dest_cpu)
1435{
1436 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
1437
1438 src_cfs_rq = tg->cfs_rq[src_cpu];
1439 dest_cfs_rq = tg->cfs_rq[dest_cpu];
1440
1441 return throttled_hierarchy(src_cfs_rq) ||
1442 throttled_hierarchy(dest_cfs_rq);
1443}
1444
1445/* updated child weight may affect parent so we have to do this bottom up */
1446static int tg_unthrottle_up(struct task_group *tg, void *data)
1447{
1448 struct rq *rq = data;
1449 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1450
1451 cfs_rq->throttle_count--;
1452#ifdef CONFIG_SMP
1453 if (!cfs_rq->throttle_count) {
1454 u64 delta = rq->clock_task - cfs_rq->load_stamp;
1455
1456 /* leaving throttled state, advance shares averaging windows */
1457 cfs_rq->load_stamp += delta;
1458 cfs_rq->load_last += delta;
1459
1460 /* update entity weight now that we are on_rq again */
1461 update_cfs_shares(cfs_rq);
1462 }
1463#endif
1464
1465 return 0;
1466}
1467
1468static int tg_throttle_down(struct task_group *tg, void *data)
1469{
1470 struct rq *rq = data;
1471 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1472
1473 /* group is entering throttled state, record last load */
1474 if (!cfs_rq->throttle_count)
1475 update_cfs_load(cfs_rq, 0);
1476 cfs_rq->throttle_count++;
1477
1478 return 0;
1479}
1480
1481static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1482{
1483 struct rq *rq = rq_of(cfs_rq);
1484 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1485 struct sched_entity *se;
1486 long task_delta, dequeue = 1;
1487
1488 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1489
1490 /* account load preceding throttle */
1491 rcu_read_lock();
1492 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
1493 rcu_read_unlock();
1494
1495 task_delta = cfs_rq->h_nr_running;
1496 for_each_sched_entity(se) {
1497 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
1498 /* throttled entity or throttle-on-deactivate */
1499 if (!se->on_rq)
1500 break;
1501
1502 if (dequeue)
1503 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
1504 qcfs_rq->h_nr_running -= task_delta;
1505
1506 if (qcfs_rq->load.weight)
1507 dequeue = 0;
1508 }
1509
1510 if (!se)
1511 rq->nr_running -= task_delta;
1512
1513 cfs_rq->throttled = 1;
1514 cfs_rq->throttled_timestamp = rq->clock;
1515 raw_spin_lock(&cfs_b->lock);
1516 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
1517 raw_spin_unlock(&cfs_b->lock);
1518}
1519
1520static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
1521{
1522 struct rq *rq = rq_of(cfs_rq);
1523 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1524 struct sched_entity *se;
1525 int enqueue = 1;
1526 long task_delta;
1527
1528 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1529
1530 cfs_rq->throttled = 0;
1531 raw_spin_lock(&cfs_b->lock);
1532 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
1533 list_del_rcu(&cfs_rq->throttled_list);
1534 raw_spin_unlock(&cfs_b->lock);
1535 cfs_rq->throttled_timestamp = 0;
1536
1537 update_rq_clock(rq);
1538 /* update hierarchical throttle state */
1539 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
1540
1541 if (!cfs_rq->load.weight)
1542 return;
1543
1544 task_delta = cfs_rq->h_nr_running;
1545 for_each_sched_entity(se) {
1546 if (se->on_rq)
1547 enqueue = 0;
1548
1549 cfs_rq = cfs_rq_of(se);
1550 if (enqueue)
1551 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
1552 cfs_rq->h_nr_running += task_delta;
1553
1554 if (cfs_rq_throttled(cfs_rq))
1555 break;
1556 }
1557
1558 if (!se)
1559 rq->nr_running += task_delta;
1560
1561	/* determine whether we need to wake up a potentially idle cpu */
1562 if (rq->curr == rq->idle && rq->cfs.nr_running)
1563 resched_task(rq->curr);
1564}
1565
1566static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
1567 u64 remaining, u64 expires)
1568{
1569 struct cfs_rq *cfs_rq;
1570 u64 runtime = remaining;
1571
1572 rcu_read_lock();
1573 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
1574 throttled_list) {
1575 struct rq *rq = rq_of(cfs_rq);
1576
1577 raw_spin_lock(&rq->lock);
1578 if (!cfs_rq_throttled(cfs_rq))
1579 goto next;
1580
1581 runtime = -cfs_rq->runtime_remaining + 1;
1582 if (runtime > remaining)
1583 runtime = remaining;
1584 remaining -= runtime;
1585
1586 cfs_rq->runtime_remaining += runtime;
1587 cfs_rq->runtime_expires = expires;
1588
1589 /* we check whether we're throttled above */
1590 if (cfs_rq->runtime_remaining > 0)
1591 unthrottle_cfs_rq(cfs_rq);
1592
1593next:
1594 raw_spin_unlock(&rq->lock);
1595
1596 if (!remaining)
1597 break;
1598 }
1599 rcu_read_unlock();
1600
1601 return remaining;
1602}
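
Each throttled cfs_rq in the loop above is topped up to exactly one nanosecond of positive runtime, so the available pool unthrottles as many cfs_rqs as possible. A standalone sketch with assumed per-cfs_rq debts and pool size:

#include <stdio.h>

int main(void)
{
	long long remaining = 3000000LL;                /* 3ms left in the global pool */
	long long debts[] = { -1500000LL, -2500000LL }; /* per-cfs_rq runtime_remaining */

	for (int i = 0; i < 2 && remaining; i++) {
		long long runtime = -debts[i] + 1;      /* just enough to reach +1 ns */

		if (runtime > remaining)
			runtime = remaining;
		remaining -= runtime;
		debts[i] += runtime;

		printf("cfs_rq %d: granted %lld ns, runtime_remaining %lld ns\n",
		       i, runtime, debts[i]);
	}
	printf("pool remaining: %lld ns\n", remaining);
	return 0;
}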
1603
1604/*
1605 * Responsible for refilling a task_group's bandwidth and unthrottling its
1606 * cfs_rqs as appropriate. If there has been no activity within the last
1607 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
1608 * used to track this state.
1609 */
1610static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
1611{
1612 u64 runtime, runtime_expires;
1613 int idle = 1, throttled;
1614
1615 raw_spin_lock(&cfs_b->lock);
1616 /* no need to continue the timer with no bandwidth constraint */
1617 if (cfs_b->quota == RUNTIME_INF)
1618 goto out_unlock;
1619
1620 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
1621 /* idle depends on !throttled (for the case of a large deficit) */
1622 idle = cfs_b->idle && !throttled;
1623 cfs_b->nr_periods += overrun;
1624
1625 /* if we're going inactive then everything else can be deferred */
1626 if (idle)
1627 goto out_unlock;
1628
1629 __refill_cfs_bandwidth_runtime(cfs_b);
1630
1631 if (!throttled) {
1632 /* mark as potentially idle for the upcoming period */
1633 cfs_b->idle = 1;
1634 goto out_unlock;
1635 }
1636
1637 /* account preceding periods in which throttling occurred */
1638 cfs_b->nr_throttled += overrun;
1639
1640 /*
1641 * There are throttled entities so we must first use the new bandwidth
1642 * to unthrottle them before making it generally available. This
1643 * ensures that all existing debts will be paid before a new cfs_rq is
1644 * allowed to run.
1645 */
1646 runtime = cfs_b->runtime;
1647 runtime_expires = cfs_b->runtime_expires;
1648 cfs_b->runtime = 0;
1649
1650 /*
1651 * This check is repeated as we are holding onto the new bandwidth
1652 * while we unthrottle. This can potentially race with an unthrottled
1653 * group trying to acquire new bandwidth from the global pool.
1654 */
1655 while (throttled && runtime > 0) {
1656 raw_spin_unlock(&cfs_b->lock);
1657 /* we can't nest cfs_b->lock while distributing bandwidth */
1658 runtime = distribute_cfs_runtime(cfs_b, runtime,
1659 runtime_expires);
1660 raw_spin_lock(&cfs_b->lock);
1661
1662 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
1663 }
1664
1665 /* return (any) remaining runtime */
1666 cfs_b->runtime = runtime;
1667 /*
1668	 * While we are guaranteed activity in the period following an
1669 * unthrottle, this also covers the case in which the new bandwidth is
1670 * insufficient to cover the existing bandwidth deficit. (Forcing the
1671 * timer to remain active while there are any throttled entities.)
1672 */
1673 cfs_b->idle = 0;
1674out_unlock:
1675 if (idle)
1676 cfs_b->timer_active = 0;
1677 raw_spin_unlock(&cfs_b->lock);
1678
1679 return idle;
1680}
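
The timer may only go inactive when the previous period was idle and nothing is currently throttled. A small sketch enumerating that decision (illustrative only, not kernel code):

#include <stdio.h>

int main(void)
{
	for (int was_idle = 0; was_idle <= 1; was_idle++) {
		for (int throttled = 0; throttled <= 1; throttled++) {
			/* idle depends on !throttled, as in the handler above */
			int idle = was_idle && !throttled;

			printf("cfs_b->idle=%d throttled=%d -> timer %s\n",
			       was_idle, throttled,
			       idle ? "deactivates" : "stays active");
		}
	}
	return 0;
}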
1681
1682/* a cfs_rq won't donate quota below this amount */
1683static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
1684/* minimum remaining period time to redistribute slack quota */
1685static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
1686/* how long we wait to gather additional slack before distributing */
1687static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
1688
1689/* are we near the end of the current quota period? */
1690static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
1691{
1692 struct hrtimer *refresh_timer = &cfs_b->period_timer;
1693 u64 remaining;
1694
1695	/* if the callback is running, a quota refresh is already occurring */
1696 if (hrtimer_callback_running(refresh_timer))
1697 return 1;
1698
1699 /* is a quota refresh about to occur? */
1700 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
1701 if (remaining < min_expire)
1702 return 1;
1703
1704 return 0;
1705}
1706
1707static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
1708{
1709 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
1710
1711 /* if there's a quota refresh soon don't bother with slack */
1712 if (runtime_refresh_within(cfs_b, min_left))
1713 return;
1714
1715 start_bandwidth_timer(&cfs_b->slack_timer,
1716 ns_to_ktime(cfs_bandwidth_slack_period));
1717}
1718
1719/* we know any runtime found here is valid as update_curr() precedes return */
1720static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1721{
1722 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1723 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
1724
1725 if (slack_runtime <= 0)
1726 return;
1727
1728 raw_spin_lock(&cfs_b->lock);
1729 if (cfs_b->quota != RUNTIME_INF &&
1730 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
1731 cfs_b->runtime += slack_runtime;
1732
1733 /* we are under rq->lock, defer unthrottling using a timer */
1734 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
1735 !list_empty(&cfs_b->throttled_cfs_rq))
1736 start_cfs_slack_bandwidth(cfs_b);
1737 }
1738 raw_spin_unlock(&cfs_b->lock);
1739
1740 /* even if it's not valid for return we don't want to try again */
1741 cfs_rq->runtime_remaining -= slack_runtime;
1742}
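
Everything beyond the min_cfs_rq_runtime a cfs_rq keeps for itself is offered back to the global pool, and the local balance is docked either way. A minimal sketch of that computation with assumed values:

#include <stdio.h>

int main(void)
{
	long long min_cfs_rq_runtime = 1000000LL;  /* 1ms kept locally */
	long long runtime_remaining = 3500000LL;   /* unused local runtime */

	long long slack = runtime_remaining - min_cfs_rq_runtime;
	if (slack > 0) {
		runtime_remaining -= slack;        /* donated back to the global pool */
		printf("returned %lld ns, kept %lld ns\n", slack, runtime_remaining);
	}
	return 0;
}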
1743
1744static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1745{
1746 if (!cfs_rq->runtime_enabled || !cfs_rq->nr_running)
1747 return;
1748
1749 __return_cfs_rq_runtime(cfs_rq);
1750}
1751
1752/*
1753 * This is done with a timer (instead of inline with bandwidth return) since
1754 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
1755 */
1756static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
1757{
1758 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
1759 u64 expires;
1760
1761 /* confirm we're still not at a refresh boundary */
1762 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
1763 return;
1764
1765 raw_spin_lock(&cfs_b->lock);
1766 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
1767 runtime = cfs_b->runtime;
1768 cfs_b->runtime = 0;
1769 }
1770 expires = cfs_b->runtime_expires;
1771 raw_spin_unlock(&cfs_b->lock);
1772
1773 if (!runtime)
1774 return;
1775
1776 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
1777
1778 raw_spin_lock(&cfs_b->lock);
1779 if (expires == cfs_b->runtime_expires)
1780 cfs_b->runtime = runtime;
1781 raw_spin_unlock(&cfs_b->lock);
1782}
1783
1784/*
1785 * When a group wakes up we want to make sure that its quota is not already
1786 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
1787 * runtime, since update_curr() throttling cannot trigger until it's on-rq.
1788 */
1789static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
1790{
1791 /* an active group must be handled by the update_curr()->put() path */
1792 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
1793 return;
1794
1795 /* ensure the group is not already throttled */
1796 if (cfs_rq_throttled(cfs_rq))
1797 return;
1798
1799 /* update runtime allocation */
1800 account_cfs_rq_runtime(cfs_rq, 0);
1801 if (cfs_rq->runtime_remaining <= 0)
1802 throttle_cfs_rq(cfs_rq);
1803}
1804
1805/* conditionally throttle active cfs_rq's from put_prev_entity() */
1806static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1807{
1808 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
1809 return;
1810
1811 /*
1812 * it's possible for a throttled entity to be forced into a running
1813	 * state (e.g. set_curr_task); in this case we're finished.
1814 */
1815 if (cfs_rq_throttled(cfs_rq))
1816 return;
1817
1818 throttle_cfs_rq(cfs_rq);
1819}
1820#else
1821static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1822 unsigned long delta_exec) {}
1823static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
1824static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
1825static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
1826
1827static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
1828{
1829 return 0;
1830}
1831
1832static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
1833{
1834 return 0;
1835}
1836
1837static inline int throttled_lb_pair(struct task_group *tg,
1838 int src_cpu, int dest_cpu)
1839{
1840 return 0;
1841}
1842#endif
1843
1240/************************************************** 1844/**************************************************
1241 * CFS operations on tasks: 1845 * CFS operations on tasks:
1242 */ 1846 */
@@ -1313,16 +1917,33 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1313 break; 1917 break;
1314 cfs_rq = cfs_rq_of(se); 1918 cfs_rq = cfs_rq_of(se);
1315 enqueue_entity(cfs_rq, se, flags); 1919 enqueue_entity(cfs_rq, se, flags);
1920
1921 /*
1922 * end evaluation on encountering a throttled cfs_rq
1923 *
1924 * note: in the case of encountering a throttled cfs_rq we will
1925 * post the final h_nr_running increment below.
1926 */
1927 if (cfs_rq_throttled(cfs_rq))
1928 break;
1929 cfs_rq->h_nr_running++;
1930
1316 flags = ENQUEUE_WAKEUP; 1931 flags = ENQUEUE_WAKEUP;
1317 } 1932 }
1318 1933
1319 for_each_sched_entity(se) { 1934 for_each_sched_entity(se) {
1320 cfs_rq = cfs_rq_of(se); 1935 cfs_rq = cfs_rq_of(se);
1936 cfs_rq->h_nr_running++;
1937
1938 if (cfs_rq_throttled(cfs_rq))
1939 break;
1321 1940
1322 update_cfs_load(cfs_rq, 0); 1941 update_cfs_load(cfs_rq, 0);
1323 update_cfs_shares(cfs_rq); 1942 update_cfs_shares(cfs_rq);
1324 } 1943 }
1325 1944
1945 if (!se)
1946 inc_nr_running(rq);
1326 hrtick_update(rq); 1947 hrtick_update(rq);
1327} 1948}
1328 1949
@@ -1343,6 +1964,16 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1343 cfs_rq = cfs_rq_of(se); 1964 cfs_rq = cfs_rq_of(se);
1344 dequeue_entity(cfs_rq, se, flags); 1965 dequeue_entity(cfs_rq, se, flags);
1345 1966
1967 /*
1968 * end evaluation on encountering a throttled cfs_rq
1969 *
1970 * note: in the case of encountering a throttled cfs_rq we will
1971 * post the final h_nr_running decrement below.
1972 */
1973 if (cfs_rq_throttled(cfs_rq))
1974 break;
1975 cfs_rq->h_nr_running--;
1976
1346 /* Don't dequeue parent if it has other entities besides us */ 1977 /* Don't dequeue parent if it has other entities besides us */
1347 if (cfs_rq->load.weight) { 1978 if (cfs_rq->load.weight) {
1348 /* 1979 /*
@@ -1361,11 +1992,17 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1361 1992
1362 for_each_sched_entity(se) { 1993 for_each_sched_entity(se) {
1363 cfs_rq = cfs_rq_of(se); 1994 cfs_rq = cfs_rq_of(se);
1995 cfs_rq->h_nr_running--;
1996
1997 if (cfs_rq_throttled(cfs_rq))
1998 break;
1364 1999
1365 update_cfs_load(cfs_rq, 0); 2000 update_cfs_load(cfs_rq, 0);
1366 update_cfs_shares(cfs_rq); 2001 update_cfs_shares(cfs_rq);
1367 } 2002 }
1368 2003
2004 if (!se)
2005 dec_nr_running(rq);
1369 hrtick_update(rq); 2006 hrtick_update(rq);
1370} 2007}
1371 2008
@@ -1434,7 +2071,6 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
1434 2071
1435 return wl; 2072 return wl;
1436} 2073}
1437
1438#else 2074#else
1439 2075
1440static inline unsigned long effective_load(struct task_group *tg, int cpu, 2076static inline unsigned long effective_load(struct task_group *tg, int cpu,
@@ -1875,6 +2511,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1875 if (unlikely(se == pse)) 2511 if (unlikely(se == pse))
1876 return; 2512 return;
1877 2513
2514 /*
2515 * This is possible from callers such as pull_task(), in which we
2516	 * unconditionally check_preempt_curr() after an enqueue (which may have
2517	 * led to a throttle). This both saves work and prevents false
2518 * next-buddy nomination below.
2519 */
2520 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
2521 return;
2522
1878 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { 2523 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
1879 set_next_buddy(pse); 2524 set_next_buddy(pse);
1880 next_buddy_marked = 1; 2525 next_buddy_marked = 1;
@@ -1883,6 +2528,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1883 /* 2528 /*
1884 * We can come here with TIF_NEED_RESCHED already set from new task 2529 * We can come here with TIF_NEED_RESCHED already set from new task
1885 * wake up path. 2530 * wake up path.
2531 *
2532 * Note: this also catches the edge-case of curr being in a throttled
2533 * group (e.g. via set_curr_task), since update_curr() (in the
2534 * enqueue of curr) will have resulted in resched being set. This
2535 * prevents us from potentially nominating it as a false LAST_BUDDY
2536 * below.
1886 */ 2537 */
1887 if (test_tsk_need_resched(curr)) 2538 if (test_tsk_need_resched(curr))
1888 return; 2539 return;
@@ -1899,10 +2550,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1899 if (unlikely(p->policy != SCHED_NORMAL)) 2550 if (unlikely(p->policy != SCHED_NORMAL))
1900 return; 2551 return;
1901 2552
1902
1903 if (!sched_feat(WAKEUP_PREEMPT))
1904 return;
1905
1906 find_matching_se(&se, &pse); 2553 find_matching_se(&se, &pse);
1907 update_curr(cfs_rq_of(se)); 2554 update_curr(cfs_rq_of(se));
1908 BUG_ON(!pse); 2555 BUG_ON(!pse);
@@ -2005,7 +2652,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
2005{ 2652{
2006 struct sched_entity *se = &p->se; 2653 struct sched_entity *se = &p->se;
2007 2654
2008 if (!se->on_rq) 2655 /* throttled hierarchies are not runnable */
2656 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
2009 return false; 2657 return false;
2010 2658
2011 /* Tell the scheduler that we'd really like pse to run next. */ 2659 /* Tell the scheduler that we'd really like pse to run next. */
@@ -2102,6 +2750,9 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2102 2750
2103 for_each_leaf_cfs_rq(busiest, cfs_rq) { 2751 for_each_leaf_cfs_rq(busiest, cfs_rq) {
2104 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { 2752 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
2753 if (throttled_lb_pair(task_group(p),
2754 busiest->cpu, this_cpu))
2755 break;
2105 2756
2106 if (!can_migrate_task(p, busiest, this_cpu, 2757 if (!can_migrate_task(p, busiest, this_cpu,
2107 sd, idle, &pinned)) 2758 sd, idle, &pinned))
@@ -2217,8 +2868,13 @@ static void update_shares(int cpu)
2217 * Iterates the task_group tree in a bottom up fashion, see 2868 * Iterates the task_group tree in a bottom up fashion, see
2218 * list_add_leaf_cfs_rq() for details. 2869 * list_add_leaf_cfs_rq() for details.
2219 */ 2870 */
2220 for_each_leaf_cfs_rq(rq, cfs_rq) 2871 for_each_leaf_cfs_rq(rq, cfs_rq) {
2872 /* throttled entities do not contribute to load */
2873 if (throttled_hierarchy(cfs_rq))
2874 continue;
2875
2221 update_shares_cpu(cfs_rq->tg, cpu); 2876 update_shares_cpu(cfs_rq->tg, cpu);
2877 }
2222 rcu_read_unlock(); 2878 rcu_read_unlock();
2223} 2879}
2224 2880
@@ -2268,9 +2924,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2268 u64 rem_load, moved_load; 2924 u64 rem_load, moved_load;
2269 2925
2270 /* 2926 /*
2271 * empty group 2927 * empty group or part of a throttled hierarchy
2272 */ 2928 */
2273 if (!busiest_cfs_rq->task_weight) 2929 if (!busiest_cfs_rq->task_weight ||
2930 throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu))
2274 continue; 2931 continue;
2275 2932
2276 rem_load = (u64)rem_load_move * busiest_weight; 2933 rem_load = (u64)rem_load_move * busiest_weight;
@@ -3667,7 +4324,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3667 struct sched_domain *sd; 4324 struct sched_domain *sd;
3668 4325
3669 for_each_domain(cpu, sd) 4326 for_each_domain(cpu, sd)
3670 if (sd && (sd->flags & flag)) 4327 if (sd->flags & flag)
3671 break; 4328 break;
3672 4329
3673 return sd; 4330 return sd;
@@ -4251,8 +4908,13 @@ static void set_curr_task_fair(struct rq *rq)
4251{ 4908{
4252 struct sched_entity *se = &rq->curr->se; 4909 struct sched_entity *se = &rq->curr->se;
4253 4910
4254 for_each_sched_entity(se) 4911 for_each_sched_entity(se) {
4255 set_next_entity(cfs_rq_of(se), se); 4912 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4913
4914 set_next_entity(cfs_rq, se);
4915 /* ensure bandwidth has been allocated on our new cfs_rq */
4916 account_cfs_rq_runtime(cfs_rq, 0);
4917 }
4256} 4918}
4257 4919
4258#ifdef CONFIG_FAIR_GROUP_SCHED 4920#ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 2e74677cb040..efa0a7b75dde 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -12,11 +12,6 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
12SCHED_FEAT(START_DEBIT, 1) 12SCHED_FEAT(START_DEBIT, 1)
13 13
14/* 14/*
15 * Should wakeups try to preempt running tasks.
16 */
17SCHED_FEAT(WAKEUP_PREEMPT, 1)
18
19/*
20 * Based on load and program behaviour, see if it makes sense to place 15 * Based on load and program behaviour, see if it makes sense to place
21 * a newly woken task on the same cpu as the task that woke it -- 16 * a newly woken task on the same cpu as the task that woke it --
22 * improve cache locality. Typically used with SYNC wakeups as 17 * improve cache locality. Typically used with SYNC wakeups as
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index af1177858be3..0cc188cf7664 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -124,21 +124,33 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
124 update_rt_migration(rt_rq); 124 update_rt_migration(rt_rq);
125} 125}
126 126
127static inline int has_pushable_tasks(struct rq *rq)
128{
129 return !plist_head_empty(&rq->rt.pushable_tasks);
130}
131
127static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) 132static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
128{ 133{
129 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 134 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
130 plist_node_init(&p->pushable_tasks, p->prio); 135 plist_node_init(&p->pushable_tasks, p->prio);
131 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); 136 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
137
138 /* Update the highest prio pushable task */
139 if (p->prio < rq->rt.highest_prio.next)
140 rq->rt.highest_prio.next = p->prio;
132} 141}
133 142
134static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) 143static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
135{ 144{
136 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 145 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
137}
138 146
139static inline int has_pushable_tasks(struct rq *rq) 147 /* Update the new highest prio pushable task */
140{ 148 if (has_pushable_tasks(rq)) {
141 return !plist_head_empty(&rq->rt.pushable_tasks); 149 p = plist_first_entry(&rq->rt.pushable_tasks,
150 struct task_struct, pushable_tasks);
151 rq->rt.highest_prio.next = p->prio;
152 } else
153 rq->rt.highest_prio.next = MAX_RT_PRIO;
142} 154}
143 155
144#else 156#else
@@ -698,47 +710,13 @@ static void update_curr_rt(struct rq *rq)
698 710
699#if defined CONFIG_SMP 711#if defined CONFIG_SMP
700 712
701static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
702
703static inline int next_prio(struct rq *rq)
704{
705 struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
706
707 if (next && rt_prio(next->prio))
708 return next->prio;
709 else
710 return MAX_RT_PRIO;
711}
712
713static void 713static void
714inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) 714inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
715{ 715{
716 struct rq *rq = rq_of_rt_rq(rt_rq); 716 struct rq *rq = rq_of_rt_rq(rt_rq);
717 717
718 if (prio < prev_prio) { 718 if (rq->online && prio < prev_prio)
719 719 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
720 /*
721 * If the new task is higher in priority than anything on the
722 * run-queue, we know that the previous high becomes our
723 * next-highest.
724 */
725 rt_rq->highest_prio.next = prev_prio;
726
727 if (rq->online)
728 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
729
730 } else if (prio == rt_rq->highest_prio.curr)
731 /*
732 * If the next task is equal in priority to the highest on
733 * the run-queue, then we implicitly know that the next highest
734 * task cannot be any lower than current
735 */
736 rt_rq->highest_prio.next = prio;
737 else if (prio < rt_rq->highest_prio.next)
738 /*
739 * Otherwise, we need to recompute next-highest
740 */
741 rt_rq->highest_prio.next = next_prio(rq);
742} 720}
743 721
744static void 722static void
@@ -746,9 +724,6 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
746{ 724{
747 struct rq *rq = rq_of_rt_rq(rt_rq); 725 struct rq *rq = rq_of_rt_rq(rt_rq);
748 726
749 if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next))
750 rt_rq->highest_prio.next = next_prio(rq);
751
752 if (rq->online && rt_rq->highest_prio.curr != prev_prio) 727 if (rq->online && rt_rq->highest_prio.curr != prev_prio)
753 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); 728 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
754} 729}
@@ -961,6 +936,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
961 936
962 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 937 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
963 enqueue_pushable_task(rq, p); 938 enqueue_pushable_task(rq, p);
939
940 inc_nr_running(rq);
964} 941}
965 942
966static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) 943static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -971,6 +948,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
971 dequeue_rt_entity(rt_se); 948 dequeue_rt_entity(rt_se);
972 949
973 dequeue_pushable_task(rq, p); 950 dequeue_pushable_task(rq, p);
951
952 dec_nr_running(rq);
974} 953}
975 954
976/* 955/*
@@ -1017,10 +996,12 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1017 struct rq *rq; 996 struct rq *rq;
1018 int cpu; 997 int cpu;
1019 998
1020 if (sd_flag != SD_BALANCE_WAKE)
1021 return smp_processor_id();
1022
1023 cpu = task_cpu(p); 999 cpu = task_cpu(p);
1000
1001 /* For anything but wake ups, just return the task_cpu */
1002 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1003 goto out;
1004
1024 rq = cpu_rq(cpu); 1005 rq = cpu_rq(cpu);
1025 1006
1026 rcu_read_lock(); 1007 rcu_read_lock();
@@ -1059,6 +1040,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1059 } 1040 }
1060 rcu_read_unlock(); 1041 rcu_read_unlock();
1061 1042
1043out:
1062 return cpu; 1044 return cpu;
1063} 1045}
1064 1046
@@ -1178,7 +1160,6 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
1178static void put_prev_task_rt(struct rq *rq, struct task_struct *p) 1160static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1179{ 1161{
1180 update_curr_rt(rq); 1162 update_curr_rt(rq);
1181 p->se.exec_start = 0;
1182 1163
1183 /* 1164 /*
1184 * The previous task needs to be made eligible for pushing 1165 * The previous task needs to be made eligible for pushing
@@ -1394,6 +1375,7 @@ static int push_rt_task(struct rq *rq)
1394{ 1375{
1395 struct task_struct *next_task; 1376 struct task_struct *next_task;
1396 struct rq *lowest_rq; 1377 struct rq *lowest_rq;
1378 int ret = 0;
1397 1379
1398 if (!rq->rt.overloaded) 1380 if (!rq->rt.overloaded)
1399 return 0; 1381 return 0;
@@ -1426,7 +1408,7 @@ retry:
1426 if (!lowest_rq) { 1408 if (!lowest_rq) {
1427 struct task_struct *task; 1409 struct task_struct *task;
1428 /* 1410 /*
1429 * find lock_lowest_rq releases rq->lock 1411 * find_lock_lowest_rq releases rq->lock
1430 * so it is possible that next_task has migrated. 1412 * so it is possible that next_task has migrated.
1431 * 1413 *
1432 * We need to make sure that the task is still on the same 1414 * We need to make sure that the task is still on the same
@@ -1436,12 +1418,11 @@ retry:
1436 task = pick_next_pushable_task(rq); 1418 task = pick_next_pushable_task(rq);
1437 if (task_cpu(next_task) == rq->cpu && task == next_task) { 1419 if (task_cpu(next_task) == rq->cpu && task == next_task) {
1438 /* 1420 /*
1439 * If we get here, the task hasn't moved at all, but 1421 * The task hasn't migrated, and is still the next
1440 * it has failed to push. We will not try again, 1422 * eligible task, but we failed to find a run-queue
1441 * since the other cpus will pull from us when they 1423 * to push it to. Do not retry in this case, since
1442 * are ready. 1424 * other cpus will pull from us when ready.
1443 */ 1425 */
1444 dequeue_pushable_task(rq, next_task);
1445 goto out; 1426 goto out;
1446 } 1427 }
1447 1428
@@ -1460,6 +1441,7 @@ retry:
1460 deactivate_task(rq, next_task, 0); 1441 deactivate_task(rq, next_task, 0);
1461 set_task_cpu(next_task, lowest_rq->cpu); 1442 set_task_cpu(next_task, lowest_rq->cpu);
1462 activate_task(lowest_rq, next_task, 0); 1443 activate_task(lowest_rq, next_task, 0);
1444 ret = 1;
1463 1445
1464 resched_task(lowest_rq->curr); 1446 resched_task(lowest_rq->curr);
1465 1447
@@ -1468,7 +1450,7 @@ retry:
1468out: 1450out:
1469 put_task_struct(next_task); 1451 put_task_struct(next_task);
1470 1452
1471 return 1; 1453 return ret;
1472} 1454}
1473 1455
1474static void push_rt_tasks(struct rq *rq) 1456static void push_rt_tasks(struct rq *rq)
@@ -1863,4 +1845,3 @@ static void print_rt_stats(struct seq_file *m, int cpu)
1863 rcu_read_unlock(); 1845 rcu_read_unlock();
1864} 1846}
1865#endif /* CONFIG_SCHED_DEBUG */ 1847#endif /* CONFIG_SCHED_DEBUG */
1866
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 6f437632afab..8b44e7fa7fb3 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -34,11 +34,13 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
34static void 34static void
35enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) 35enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
36{ 36{
37 inc_nr_running(rq);
37} 38}
38 39
39static void 40static void
40dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) 41dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
41{ 42{
43 dec_nr_running(rq);
42} 44}
43 45
44static void yield_task_stop(struct rq *rq) 46static void yield_task_stop(struct rq *rq)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 11d65b531e50..2d2ecdcc8cdb 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -379,6 +379,16 @@ static struct ctl_table kern_table[] = {
379 .extra2 = &one, 379 .extra2 = &one,
380 }, 380 },
381#endif 381#endif
382#ifdef CONFIG_CFS_BANDWIDTH
383 {
384 .procname = "sched_cfs_bandwidth_slice_us",
385 .data = &sysctl_sched_cfs_bandwidth_slice,
386 .maxlen = sizeof(unsigned int),
387 .mode = 0644,
388 .proc_handler = proc_dointvec_minmax,
389 .extra1 = &one,
390 },
391#endif
382#ifdef CONFIG_PROVE_LOCKING 392#ifdef CONFIG_PROVE_LOCKING
383 { 393 {
384 .procname = "prove_locking", 394 .procname = "prove_locking",