Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/Makefile      |    5
-rw-r--r--  kernel/sched/clock.c       |  107
-rw-r--r--  kernel/sched/core.c        |  880
-rw-r--r--  kernel/sched/cpuacct.c     |   18
-rw-r--r--  kernel/sched/cpudeadline.c |  216
-rw-r--r--  kernel/sched/cpudeadline.h |   33
-rw-r--r--  kernel/sched/deadline.c    | 1639
-rw-r--r--  kernel/sched/debug.c       |    4
-rw-r--r--  kernel/sched/fair.c        |  109
-rw-r--r--  kernel/sched/rt.c          |   10
-rw-r--r--  kernel/sched/sched.h       |  145
-rw-r--r--  kernel/sched/stop_task.c   |    2
12 files changed, 2931 insertions, 237 deletions
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 7b621409cf15..9a95c8c2af2a 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -11,9 +11,10 @@ ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
-obj-y += core.o proc.o clock.o cputime.o idle_task.o fair.o rt.o stop_task.o
+obj-y += core.o proc.o clock.o cputime.o
+obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
 obj-y += wait.o completion.o
-obj-$(CONFIG_SMP) += cpupri.o
+obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c3ae1446461c..b30a2924ef14 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -26,9 +26,10 @@
  * at 0 on boot (but people really shouldn't rely on that).
  *
  * cpu_clock(i) -- can be used from any context, including NMI.
- * sched_clock_cpu(i) -- must be used with local IRQs disabled (implied by NMI)
  * local_clock() -- is cpu_clock() on the current cpu.
  *
+ * sched_clock_cpu(i)
+ *
  * How:
  *
  * The implementation either uses sched_clock() when
@@ -50,15 +51,6 @@
  * Furthermore, explicit sleep and wakeup hooks allow us to account for time
  * that is otherwise invisible (TSC gets stopped).
  *
- *
- * Notes:
- *
- * The !IRQ-safetly of sched_clock() and sched_clock_cpu() comes from things
- * like cpufreq interrupts that can change the base clock (TSC) multiplier
- * and cause funny jumps in time -- although the filtering provided by
- * sched_clock_cpu() should mitigate serious artifacts we cannot rely on it
- * in general since for !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK we fully rely on
- * sched_clock().
  */
 #include <linux/spinlock.h>
 #include <linux/hardirq.h>
@@ -66,6 +58,8 @@
 #include <linux/percpu.h>
 #include <linux/ktime.h>
 #include <linux/sched.h>
+#include <linux/static_key.h>
+#include <linux/workqueue.h>
 
 /*
  * Scheduler clock - returns current time in nanosec units.
@@ -82,7 +76,52 @@ EXPORT_SYMBOL_GPL(sched_clock);
 __read_mostly int sched_clock_running;
 
 #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-__read_mostly int sched_clock_stable;
+static struct static_key __sched_clock_stable = STATIC_KEY_INIT;
+static int __sched_clock_stable_early;
+
+int sched_clock_stable(void)
+{
+	return static_key_false(&__sched_clock_stable);
+}
+
+static void __set_sched_clock_stable(void)
+{
+	if (!sched_clock_stable())
+		static_key_slow_inc(&__sched_clock_stable);
+}
+
+void set_sched_clock_stable(void)
+{
+	__sched_clock_stable_early = 1;
+
+	smp_mb(); /* matches sched_clock_init() */
+
+	if (!sched_clock_running)
+		return;
+
+	__set_sched_clock_stable();
+}
+
+static void __clear_sched_clock_stable(struct work_struct *work)
+{
+	/* XXX worry about clock continuity */
+	if (sched_clock_stable())
+		static_key_slow_dec(&__sched_clock_stable);
+}
+
+static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable);
+
+void clear_sched_clock_stable(void)
+{
+	__sched_clock_stable_early = 0;
+
+	smp_mb(); /* matches sched_clock_init() */
+
+	if (!sched_clock_running)
+		return;
+
+	schedule_work(&sched_clock_work);
+}
 
 struct sched_clock_data {
 	u64 tick_raw;
@@ -116,6 +155,20 @@ void sched_clock_init(void)
 	}
 
 	sched_clock_running = 1;
+
+	/*
+	 * Ensure that it is impossible to not do a static_key update.
+	 *
+	 * Either {set,clear}_sched_clock_stable() must see sched_clock_running
+	 * and do the update, or we must see their __sched_clock_stable_early
+	 * and do the update, or both.
+	 */
+	smp_mb(); /* matches {set,clear}_sched_clock_stable() */
+
+	if (__sched_clock_stable_early)
+		__set_sched_clock_stable();
+	else
+		__clear_sched_clock_stable(NULL);
 }
 
 /*
@@ -242,20 +295,20 @@ u64 sched_clock_cpu(int cpu)
 	struct sched_clock_data *scd;
 	u64 clock;
 
-	WARN_ON_ONCE(!irqs_disabled());
-
-	if (sched_clock_stable)
+	if (sched_clock_stable())
 		return sched_clock();
 
 	if (unlikely(!sched_clock_running))
 		return 0ull;
 
+	preempt_disable_notrace();
 	scd = cpu_sdc(cpu);
 
 	if (cpu != smp_processor_id())
 		clock = sched_clock_remote(scd);
 	else
 		clock = sched_clock_local(scd);
+	preempt_enable_notrace();
 
 	return clock;
 }
@@ -265,7 +318,7 @@ void sched_clock_tick(void)
 	struct sched_clock_data *scd;
 	u64 now, now_gtod;
 
-	if (sched_clock_stable)
+	if (sched_clock_stable())
 		return;
 
 	if (unlikely(!sched_clock_running))
@@ -316,14 +369,10 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
  */
 u64 cpu_clock(int cpu)
 {
-	u64 clock;
-	unsigned long flags;
-
-	local_irq_save(flags);
-	clock = sched_clock_cpu(cpu);
-	local_irq_restore(flags);
+	if (!sched_clock_stable())
+		return sched_clock_cpu(cpu);
 
-	return clock;
+	return sched_clock();
 }
 
 /*
@@ -335,14 +384,10 @@ u64 cpu_clock(int cpu)
  */
 u64 local_clock(void)
 {
-	u64 clock;
-	unsigned long flags;
-
-	local_irq_save(flags);
-	clock = sched_clock_cpu(smp_processor_id());
-	local_irq_restore(flags);
+	if (!sched_clock_stable())
+		return sched_clock_cpu(raw_smp_processor_id());
 
-	return clock;
+	return sched_clock();
 }
 
 #else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
@@ -362,12 +407,12 @@ u64 sched_clock_cpu(int cpu)
 
 u64 cpu_clock(int cpu)
 {
-	return sched_clock_cpu(cpu);
+	return sched_clock();
}
 
 u64 local_clock(void)
 {
-	return sched_clock_cpu(0);
+	return sched_clock();
 }
 
 #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
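
For illustration (not part of the patch): the clock.c hunks above turn sched_clock_stable into a static-key-backed function and drop the requirement that cpu_clock()/local_clock() be called with IRQs disabled -- the unstable path now only needs preemption disabled, which sched_clock_cpu() handles itself. A caller after this change can therefore sample the clock directly from any context; the function name and the timed work below are made up for the example:

	u64 t0, delta_ns;

	t0 = local_clock();		/* nanoseconds, callable from any context */
	do_some_work();			/* hypothetical section being timed */
	delta_ns = local_clock() - t0;
	pr_info("work took %llu ns\n", delta_ns);
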
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a88f4a485c5e..f5c6635b806c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -296,8 +296,6 @@ __read_mostly int scheduler_running;
  */
 int sysctl_sched_rt_runtime = 950000;
 
-
-
 /*
  * __task_rq_lock - lock the rq @p resides on.
  */
@@ -899,7 +897,9 @@ static inline int normal_prio(struct task_struct *p)
 {
 	int prio;
 
-	if (task_has_rt_policy(p))
+	if (task_has_dl_policy(p))
+		prio = MAX_DL_PRIO-1;
+	else if (task_has_rt_policy(p))
 		prio = MAX_RT_PRIO-1 - p->rt_priority;
 	else
 		prio = __normal_prio(p);
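
For reference, the resulting prio layout (taking MAX_DL_PRIO as 0, which is defined elsewhere in this series and not visible in this hunk): a SCHED_DEADLINE task gets normal_prio = MAX_DL_PRIO-1 = -1, SCHED_FIFO/SCHED_RR tasks map to MAX_RT_PRIO-1 - rt_priority = 0..98, and SCHED_NORMAL tasks keep their static_prio of 100..139. dl_prio()/dl_task() therefore reduce to a "prio < 0" test, so -deadline tasks always win plain priority comparisons against -rt and fair tasks.
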
@@ -945,7 +945,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 		if (prev_class->switched_from)
 			prev_class->switched_from(rq, p);
 		p->sched_class->switched_to(rq, p);
-	} else if (oldprio != p->prio)
+	} else if (oldprio != p->prio || dl_task(p))
 		p->sched_class->prio_changed(rq, p, oldprio);
 }
 
@@ -1108,6 +1108,7 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p)
 	if (!cpumask_test_cpu(arg.src_cpu, tsk_cpus_allowed(arg.dst_task)))
 		goto out;
 
+	trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
 	ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
 
 out:
@@ -1499,8 +1500,7 @@ void scheduler_ipi(void)
 	 * TIF_NEED_RESCHED remotely (for the first time) will also send
 	 * this IPI.
 	 */
-	if (tif_need_resched())
-		set_preempt_need_resched();
+	preempt_fold_need_resched();
 
 	if (llist_empty(&this_rq()->wake_list)
 	    && !tick_nohz_full_cpu(smp_processor_id())
@@ -1717,6 +1717,13 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	memset(&p->se.statistics, 0, sizeof(p->se.statistics));
 #endif
 
+	RB_CLEAR_NODE(&p->dl.rb_node);
+	hrtimer_init(&p->dl.dl_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	p->dl.dl_runtime = p->dl.runtime = 0;
+	p->dl.dl_deadline = p->dl.deadline = 0;
+	p->dl.dl_period = 0;
+	p->dl.flags = 0;
+
 	INIT_LIST_HEAD(&p->rt.run_list);
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -1763,12 +1770,34 @@ void set_numabalancing_state(bool enabled)
 	numabalancing_enabled = enabled;
 }
 #endif /* CONFIG_SCHED_DEBUG */
-#endif /* CONFIG_NUMA_BALANCING */
+
+#ifdef CONFIG_PROC_SYSCTL
+int sysctl_numa_balancing(struct ctl_table *table, int write,
+			 void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table t;
+	int err;
+	int state = numabalancing_enabled;
+
+	if (write && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	t = *table;
+	t.data = &state;
+	err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+	if (err < 0)
+		return err;
+	if (write)
+		set_numabalancing_state(state);
+	return err;
+}
+#endif
+#endif
 
 /*
  * fork()/clone()-time setup:
 */
-void sched_fork(unsigned long clone_flags, struct task_struct *p)
+int sched_fork(unsigned long clone_flags, struct task_struct *p)
 {
 	unsigned long flags;
 	int cpu = get_cpu();
@@ -1790,7 +1819,7 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)
 	 * Revert to default priority/policy on fork if requested.
 	 */
 	if (unlikely(p->sched_reset_on_fork)) {
-		if (task_has_rt_policy(p)) {
+		if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
 			p->policy = SCHED_NORMAL;
 			p->static_prio = NICE_TO_PRIO(0);
 			p->rt_priority = 0;
@@ -1807,8 +1836,14 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)
 		p->sched_reset_on_fork = 0;
 	}
 
-	if (!rt_prio(p->prio))
+	if (dl_prio(p->prio)) {
+		put_cpu();
+		return -EAGAIN;
+	} else if (rt_prio(p->prio)) {
+		p->sched_class = &rt_sched_class;
+	} else {
 		p->sched_class = &fair_sched_class;
+	}
 
 	if (p->sched_class->task_fork)
 		p->sched_class->task_fork(p);
@@ -1834,11 +1869,124 @@ void sched_fork(unsigned long clone_flags, struct task_struct *p)
 	init_task_preempt_count(p);
 #ifdef CONFIG_SMP
 	plist_node_init(&p->pushable_tasks, MAX_PRIO);
+	RB_CLEAR_NODE(&p->pushable_dl_tasks);
 #endif
 
 	put_cpu();
+	return 0;
+}
+
+unsigned long to_ratio(u64 period, u64 runtime)
+{
+	if (runtime == RUNTIME_INF)
+		return 1ULL << 20;
+
+	/*
+	 * Doing this here saves a lot of checks in all
+	 * the calling paths, and returning zero seems
+	 * safe for them anyway.
+	 */
+	if (period == 0)
+		return 0;
+
+	return div64_u64(runtime << 20, period);
 }
 
+#ifdef CONFIG_SMP
+inline struct dl_bw *dl_bw_of(int i)
+{
+	return &cpu_rq(i)->rd->dl_bw;
+}
+
+static inline int dl_bw_cpus(int i)
+{
+	struct root_domain *rd = cpu_rq(i)->rd;
+	int cpus = 0;
+
+	for_each_cpu_and(i, rd->span, cpu_active_mask)
+		cpus++;
+
+	return cpus;
+}
+#else
+inline struct dl_bw *dl_bw_of(int i)
+{
+	return &cpu_rq(i)->dl.dl_bw;
+}
+
+static inline int dl_bw_cpus(int i)
+{
+	return 1;
+}
+#endif
+
+static inline
+void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
+{
+	dl_b->total_bw -= tsk_bw;
+}
+
+static inline
+void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
+{
+	dl_b->total_bw += tsk_bw;
+}
+
+static inline
+bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
+{
+	return dl_b->bw != -1 &&
+	       dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
+}
+
+/*
+ * We must be sure that accepting a new task (or allowing changing the
+ * parameters of an existing one) is consistent with the bandwidth
+ * constraints. If yes, this function also accordingly updates the currently
+ * allocated bandwidth to reflect the new situation.
+ *
+ * This function is called while holding p's rq->lock.
+ */
+static int dl_overflow(struct task_struct *p, int policy,
+		       const struct sched_attr *attr)
+{
+
+	struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+	u64 period = attr->sched_period ?: attr->sched_deadline;
+	u64 runtime = attr->sched_runtime;
+	u64 new_bw = dl_policy(policy) ? to_ratio(period, runtime) : 0;
+	int cpus, err = -1;
+
+	if (new_bw == p->dl.dl_bw)
+		return 0;
+
+	/*
+	 * Either if a task, enters, leave, or stays -deadline but changes
+	 * its parameters, we may need to update accordingly the total
+	 * allocated bandwidth of the container.
+	 */
+	raw_spin_lock(&dl_b->lock);
+	cpus = dl_bw_cpus(task_cpu(p));
+	if (dl_policy(policy) && !task_has_dl_policy(p) &&
+	    !__dl_overflow(dl_b, cpus, 0, new_bw)) {
+		__dl_add(dl_b, new_bw);
+		err = 0;
+	} else if (dl_policy(policy) && task_has_dl_policy(p) &&
+		   !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
+		__dl_clear(dl_b, p->dl.dl_bw);
+		__dl_add(dl_b, new_bw);
+		err = 0;
+	} else if (!dl_policy(policy) && task_has_dl_policy(p)) {
+		__dl_clear(dl_b, p->dl.dl_bw);
+		err = 0;
	}
+	raw_spin_unlock(&dl_b->lock);
+
+	return err;
+}
+
+extern void init_dl_bw(struct dl_bw *dl_b);
+
 /*
  * wake_up_new_task - wake up a newly created task for the first time.
  *
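
A worked example of the admission arithmetic above (illustrative numbers only): to_ratio() expresses utilization in 1/2^20 units, so a task with sched_runtime = 10 ms and sched_period = 100 ms contributes new_bw = (10000000 << 20) / 100000000 = 104857, i.e. roughly 0.1 of a CPU. Assuming the usual sched_rt_period_us/sched_rt_runtime_us defaults of 1000000/950000, each root domain's limit is dl_b->bw = to_ratio(1000000, 950000) = 996147 (about 0.95), and on a 4-CPU root domain __dl_overflow() keeps admitting tasks as long as total_bw - old_bw + new_bw stays below 4 * 996147 -- roughly 3.8 CPUs' worth of -deadline bandwidth.
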
@@ -2003,6 +2151,9 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	if (unlikely(prev_state == TASK_DEAD)) {
 		task_numa_free(prev);
 
+		if (prev->sched_class->task_dead)
+			prev->sched_class->task_dead(prev);
+
 		/*
 		 * Remove function-return probe instances associated with this
 		 * task and put them back on the free list.
@@ -2296,7 +2447,7 @@ void scheduler_tick(void)
 
 #ifdef CONFIG_SMP
 	rq->idle_balance = idle_cpu(cpu);
-	trigger_load_balance(rq, cpu);
+	trigger_load_balance(rq);
 #endif
 	rq_last_tick_reset(rq);
 }
@@ -2325,7 +2476,7 @@ u64 scheduler_tick_max_deferment(void)
 	if (time_before_eq(next, now))
 		return 0;
 
-	return jiffies_to_usecs(next - now) * NSEC_PER_USEC;
+	return jiffies_to_nsecs(next - now);
 }
 #endif
 
@@ -2414,10 +2565,10 @@ static inline void schedule_debug(struct task_struct *prev)
 {
 	/*
 	 * Test if we are atomic. Since do_exit() needs to call into
-	 * schedule() atomically, we ignore that path for now.
-	 * Otherwise, whine if we are scheduling when we should not be.
+	 * schedule() atomically, we ignore that path. Otherwise whine
+	 * if we are scheduling when we should not.
 	 */
-	if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
+	if (unlikely(in_atomic_preempt_off() && prev->state != TASK_DEAD))
 		__schedule_bug(prev);
 	rcu_sleep_check();
 
@@ -2761,11 +2912,11 @@ EXPORT_SYMBOL(sleep_on_timeout);
  */
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
-	int oldprio, on_rq, running;
+	int oldprio, on_rq, running, enqueue_flag = 0;
 	struct rq *rq;
 	const struct sched_class *prev_class;
 
-	BUG_ON(prio < 0 || prio > MAX_PRIO);
+	BUG_ON(prio > MAX_PRIO);
 
 	rq = __task_rq_lock(p);
 
@@ -2788,6 +2939,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	}
 
 	trace_sched_pi_setprio(p, prio);
+	p->pi_top_task = rt_mutex_get_top_task(p);
 	oldprio = p->prio;
 	prev_class = p->sched_class;
 	on_rq = p->on_rq;
@@ -2797,23 +2949,49 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 	if (running)
 		p->sched_class->put_prev_task(rq, p);
 
-	if (rt_prio(prio))
+	/*
+	 * Boosting condition are:
+	 * 1. -rt task is running and holds mutex A
+	 *      --> -dl task blocks on mutex A
+	 *
+	 * 2. -dl task is running and holds mutex A
+	 *      --> -dl task blocks on mutex A and could preempt the
+	 *          running task
+	 */
+	if (dl_prio(prio)) {
+		if (!dl_prio(p->normal_prio) || (p->pi_top_task &&
+			dl_entity_preempt(&p->pi_top_task->dl, &p->dl))) {
+			p->dl.dl_boosted = 1;
+			p->dl.dl_throttled = 0;
+			enqueue_flag = ENQUEUE_REPLENISH;
+		} else
+			p->dl.dl_boosted = 0;
+		p->sched_class = &dl_sched_class;
+	} else if (rt_prio(prio)) {
+		if (dl_prio(oldprio))
+			p->dl.dl_boosted = 0;
+		if (oldprio < prio)
+			enqueue_flag = ENQUEUE_HEAD;
 		p->sched_class = &rt_sched_class;
-	else
+	} else {
+		if (dl_prio(oldprio))
+			p->dl.dl_boosted = 0;
 		p->sched_class = &fair_sched_class;
+	}
 
 	p->prio = prio;
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
 	if (on_rq)
-		enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
+		enqueue_task(rq, p, enqueue_flag);
 
 	check_class_changed(rq, p, prev_class, oldprio);
 out_unlock:
 	__task_rq_unlock(rq);
 }
 #endif
+
 void set_user_nice(struct task_struct *p, long nice)
 {
 	int old_prio, delta, on_rq;
@@ -2831,9 +3009,9 @@ void set_user_nice(struct task_struct *p, long nice)
 	 * The RT priorities are set via sched_setscheduler(), but we still
 	 * allow the 'normal' nice value to be set - but as expected
 	 * it wont have any effect on scheduling until the task is
-	 * SCHED_FIFO/SCHED_RR:
+	 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
 	 */
-	if (task_has_rt_policy(p)) {
+	if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
 		p->static_prio = NICE_TO_PRIO(nice);
 		goto out_unlock;
 	}
@@ -2988,22 +3166,95 @@ static struct task_struct *find_process_by_pid(pid_t pid)
 	return pid ? find_task_by_vpid(pid) : current;
 }
 
-/* Actually do priority change: must hold rq lock. */
+/*
+ * This function initializes the sched_dl_entity of a newly becoming
+ * SCHED_DEADLINE task.
+ *
+ * Only the static values are considered here, the actual runtime and the
+ * absolute deadline will be properly calculated when the task is enqueued
+ * for the first time with its new policy.
+ */
 static void
-__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
+__setparam_dl(struct task_struct *p, const struct sched_attr *attr)
 {
+	struct sched_dl_entity *dl_se = &p->dl;
+
+	init_dl_task_timer(dl_se);
+	dl_se->dl_runtime = attr->sched_runtime;
+	dl_se->dl_deadline = attr->sched_deadline;
+	dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
+	dl_se->flags = attr->sched_flags;
+	dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
+	dl_se->dl_throttled = 0;
+	dl_se->dl_new = 1;
+}
+
+/* Actually do priority change: must hold pi & rq lock. */
+static void __setscheduler(struct rq *rq, struct task_struct *p,
+			   const struct sched_attr *attr)
+{
+	int policy = attr->sched_policy;
+
+	if (policy == -1) /* setparam */
+		policy = p->policy;
+
 	p->policy = policy;
-	p->rt_priority = prio;
+
+	if (dl_policy(policy))
+		__setparam_dl(p, attr);
+	else if (fair_policy(policy))
+		p->static_prio = NICE_TO_PRIO(attr->sched_nice);
+
+	/*
+	 * __sched_setscheduler() ensures attr->sched_priority == 0 when
+	 * !rt_policy. Always setting this ensures that things like
+	 * getparam()/getattr() don't report silly values for !rt tasks.
+	 */
+	p->rt_priority = attr->sched_priority;
+
 	p->normal_prio = normal_prio(p);
-	/* we are holding p->pi_lock already */
 	p->prio = rt_mutex_getprio(p);
-	if (rt_prio(p->prio))
+
+	if (dl_prio(p->prio))
+		p->sched_class = &dl_sched_class;
+	else if (rt_prio(p->prio))
 		p->sched_class = &rt_sched_class;
 	else
 		p->sched_class = &fair_sched_class;
+
 	set_load_weight(p);
 }
 
+static void
+__getparam_dl(struct task_struct *p, struct sched_attr *attr)
+{
+	struct sched_dl_entity *dl_se = &p->dl;
+
+	attr->sched_priority = p->rt_priority;
+	attr->sched_runtime = dl_se->dl_runtime;
+	attr->sched_deadline = dl_se->dl_deadline;
+	attr->sched_period = dl_se->dl_period;
+	attr->sched_flags = dl_se->flags;
+}
+
+/*
+ * This function validates the new parameters of a -deadline task.
+ * We ask for the deadline not being zero, and greater or equal
+ * than the runtime, as well as the period of being zero or
+ * greater than deadline. Furthermore, we have to be sure that
+ * user parameters are above the internal resolution (1us); we
+ * check sched_runtime only since it is always the smaller one.
+ */
+static bool
+__checkparam_dl(const struct sched_attr *attr)
+{
+	return attr && attr->sched_deadline != 0 &&
+		(attr->sched_period == 0 ||
+		(s64)(attr->sched_period - attr->sched_deadline) >= 0) &&
+		(s64)(attr->sched_deadline - attr->sched_runtime ) >= 0 &&
+		attr->sched_runtime >= (2 << (DL_SCALE - 1));
+}
+
 /*
  * check the target process has a UID that matches the current process's
  */
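
To make the constraint in __checkparam_dl() concrete (example values only): all three fields are in nanoseconds and must satisfy runtime <= deadline <= period, where sched_period == 0 means "use sched_deadline as the period" and sched_runtime must be at least 2^DL_SCALE ns -- about 1 us, taking DL_SCALE = 10 as defined elsewhere in this series. So sched_runtime = 10000000, sched_deadline = 30000000, sched_period = 100000000 (10 ms of budget every 100 ms, with a 30 ms relative deadline) passes, while swapping deadline and period, or a zero deadline, makes __sched_setscheduler() return -EINVAL.
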
@@ -3020,10 +3271,12 @@ static bool check_same_owner(struct task_struct *p)
 	return match;
 }
 
-static int __sched_setscheduler(struct task_struct *p, int policy,
-				const struct sched_param *param, bool user)
+static int __sched_setscheduler(struct task_struct *p,
+				const struct sched_attr *attr,
+				bool user)
 {
 	int retval, oldprio, oldpolicy = -1, on_rq, running;
+	int policy = attr->sched_policy;
 	unsigned long flags;
 	const struct sched_class *prev_class;
 	struct rq *rq;
@@ -3037,31 +3290,40 @@ recheck:
 		reset_on_fork = p->sched_reset_on_fork;
 		policy = oldpolicy = p->policy;
 	} else {
-		reset_on_fork = !!(policy & SCHED_RESET_ON_FORK);
-		policy &= ~SCHED_RESET_ON_FORK;
+		reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
 
-		if (policy != SCHED_FIFO && policy != SCHED_RR &&
+		if (policy != SCHED_DEADLINE &&
+		    policy != SCHED_FIFO && policy != SCHED_RR &&
 		    policy != SCHED_NORMAL && policy != SCHED_BATCH &&
 		    policy != SCHED_IDLE)
 			return -EINVAL;
 	}
 
+	if (attr->sched_flags & ~(SCHED_FLAG_RESET_ON_FORK))
+		return -EINVAL;
+
 	/*
 	 * Valid priorities for SCHED_FIFO and SCHED_RR are
 	 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
 	 * SCHED_BATCH and SCHED_IDLE is 0.
 	 */
-	if (param->sched_priority < 0 ||
-	    (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
-	    (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
+	if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
+	    (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
 		return -EINVAL;
-	if (rt_policy(policy) != (param->sched_priority != 0))
+	if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
+	    (rt_policy(policy) != (attr->sched_priority != 0)))
 		return -EINVAL;
 
 	/*
 	 * Allow unprivileged RT tasks to decrease priority:
 	 */
 	if (user && !capable(CAP_SYS_NICE)) {
+		if (fair_policy(policy)) {
+			if (attr->sched_nice < TASK_NICE(p) &&
+			    !can_nice(p, attr->sched_nice))
+				return -EPERM;
+		}
+
 		if (rt_policy(policy)) {
 			unsigned long rlim_rtprio =
 					task_rlimit(p, RLIMIT_RTPRIO);
@@ -3071,11 +3333,20 @@ recheck:
 				return -EPERM;
 
 			/* can't increase priority */
-			if (param->sched_priority > p->rt_priority &&
-			    param->sched_priority > rlim_rtprio)
+			if (attr->sched_priority > p->rt_priority &&
+			    attr->sched_priority > rlim_rtprio)
 				return -EPERM;
 		}
 
+		/*
+		 * Can't set/change SCHED_DEADLINE policy at all for now
+		 * (safest behavior); in the future we would like to allow
+		 * unprivileged DL tasks to increase their relative deadline
+		 * or reduce their runtime (both ways reducing utilization)
+		 */
+		if (dl_policy(policy))
+			return -EPERM;
+
 		/*
 		 * Treat SCHED_IDLE as nice 20. Only allow a switch to
 		 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
@@ -3120,14 +3391,21 @@ recheck:
 	/*
 	 * If not changing anything there's no need to proceed further:
 	 */
-	if (unlikely(policy == p->policy && (!rt_policy(policy) ||
-		     param->sched_priority == p->rt_priority))) {
+	if (unlikely(policy == p->policy)) {
+		if (fair_policy(policy) && attr->sched_nice != TASK_NICE(p))
+			goto change;
+		if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
+			goto change;
+		if (dl_policy(policy))
+			goto change;
+
 		task_rq_unlock(rq, p, &flags);
 		return 0;
 	}
+change:
 
-#ifdef CONFIG_RT_GROUP_SCHED
 	if (user) {
+#ifdef CONFIG_RT_GROUP_SCHED
 		/*
 		 * Do not allow realtime tasks into groups that have no runtime
 		 * assigned.
@@ -3138,8 +3416,24 @@ recheck:
 			task_rq_unlock(rq, p, &flags);
 			return -EPERM;
 		}
-	}
 #endif
+#ifdef CONFIG_SMP
+		if (dl_bandwidth_enabled() && dl_policy(policy)) {
+			cpumask_t *span = rq->rd->span;
+
+			/*
+			 * Don't allow tasks with an affinity mask smaller than
+			 * the entire root_domain to become SCHED_DEADLINE. We
+			 * will also fail if there's no bandwidth available.
+			 */
+			if (!cpumask_subset(span, &p->cpus_allowed) ||
+			    rq->rd->dl_bw.bw == 0) {
+				task_rq_unlock(rq, p, &flags);
+				return -EPERM;
+			}
+		}
+#endif
+	}
 
 	/* recheck policy now with rq lock held */
 	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
@@ -3147,6 +3441,17 @@ recheck:
 		task_rq_unlock(rq, p, &flags);
 		goto recheck;
 	}
+
+	/*
+	 * If setscheduling to SCHED_DEADLINE (or changing the parameters
+	 * of a SCHED_DEADLINE task) we need to check if enough bandwidth
+	 * is available.
+	 */
+	if ((dl_policy(policy) || dl_task(p)) && dl_overflow(p, policy, attr)) {
+		task_rq_unlock(rq, p, &flags);
+		return -EBUSY;
+	}
+
 	on_rq = p->on_rq;
 	running = task_current(rq, p);
 	if (on_rq)
@@ -3158,7 +3463,7 @@ recheck:
 
 	oldprio = p->prio;
 	prev_class = p->sched_class;
-	__setscheduler(rq, p, policy, param->sched_priority);
+	__setscheduler(rq, p, attr);
 
 	if (running)
 		p->sched_class->set_curr_task(rq);
@@ -3173,6 +3478,26 @@ recheck:
 	return 0;
 }
 
+static int _sched_setscheduler(struct task_struct *p, int policy,
+			       const struct sched_param *param, bool check)
+{
+	struct sched_attr attr = {
+		.sched_policy   = policy,
+		.sched_priority = param->sched_priority,
+		.sched_nice	= PRIO_TO_NICE(p->static_prio),
+	};
+
+	/*
+	 * Fixup the legacy SCHED_RESET_ON_FORK hack
+	 */
+	if (policy & SCHED_RESET_ON_FORK) {
+		attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+		policy &= ~SCHED_RESET_ON_FORK;
+		attr.sched_policy = policy;
+	}
+
+	return __sched_setscheduler(p, &attr, check);
+}
 /**
 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
 * @p: the task in question.
@@ -3186,10 +3511,16 @@ recheck:
 int sched_setscheduler(struct task_struct *p, int policy,
 		       const struct sched_param *param)
 {
-	return __sched_setscheduler(p, policy, param, true);
+	return _sched_setscheduler(p, policy, param, true);
 }
 EXPORT_SYMBOL_GPL(sched_setscheduler);
 
+int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
+{
+	return __sched_setscheduler(p, attr, true);
+}
+EXPORT_SYMBOL_GPL(sched_setattr);
+
 /**
 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
 * @p: the task in question.
@@ -3206,7 +3537,7 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
 			       const struct sched_param *param)
 {
-	return __sched_setscheduler(p, policy, param, false);
+	return _sched_setscheduler(p, policy, param, false);
 }
 
 static int
@@ -3231,6 +3562,79 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
 	return retval;
 }
 
+/*
+ * Mimics kernel/events/core.c perf_copy_attr().
+ */
+static int sched_copy_attr(struct sched_attr __user *uattr,
+			   struct sched_attr *attr)
+{
+	u32 size;
+	int ret;
+
+	if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
+		return -EFAULT;
+
+	/*
+	 * zero the full structure, so that a short copy will be nice.
+	 */
+	memset(attr, 0, sizeof(*attr));
+
+	ret = get_user(size, &uattr->size);
+	if (ret)
+		return ret;
+
+	if (size > PAGE_SIZE)	/* silly large */
+		goto err_size;
+
+	if (!size)		/* abi compat */
+		size = SCHED_ATTR_SIZE_VER0;
+
+	if (size < SCHED_ATTR_SIZE_VER0)
+		goto err_size;
+
+	/*
+	 * If we're handed a bigger struct than we know of,
+	 * ensure all the unknown bits are 0 - i.e. new
+	 * user-space does not rely on any kernel feature
+	 * extensions we dont know about yet.
+	 */
+	if (size > sizeof(*attr)) {
+		unsigned char __user *addr;
+		unsigned char __user *end;
+		unsigned char val;
+
+		addr = (void __user *)uattr + sizeof(*attr);
+		end  = (void __user *)uattr + size;
+
+		for (; addr < end; addr++) {
+			ret = get_user(val, addr);
+			if (ret)
+				return ret;
+			if (val)
+				goto err_size;
+		}
+		size = sizeof(*attr);
+	}
+
+	ret = copy_from_user(attr, uattr, size);
+	if (ret)
+		return -EFAULT;
+
+	/*
+	 * XXX: do we want to be lenient like existing syscalls; or do we want
+	 * to be strict and return an error on out-of-bounds values?
+	 */
+	attr->sched_nice = clamp(attr->sched_nice, -20, 19);
+
+out:
+	return ret;
+
+err_size:
+	put_user(sizeof(*attr), &uattr->size);
+	ret = -E2BIG;
+	goto out;
+}
+
 /**
 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
 * @pid: the pid in question.
@@ -3262,6 +3666,34 @@ SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
 }
 
 /**
+ * sys_sched_setattr - same as above, but with extended sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ */
+SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
+		unsigned int, flags)
+{
+	struct sched_attr attr;
+	struct task_struct *p;
+	int retval;
+
+	if (!uattr || pid < 0 || flags)
+		return -EINVAL;
+
+	if (sched_copy_attr(uattr, &attr))
+		return -EFAULT;
+
+	rcu_read_lock();
+	retval = -ESRCH;
+	p = find_process_by_pid(pid);
+	if (p != NULL)
+		retval = sched_setattr(p, &attr);
+	rcu_read_unlock();
+
+	return retval;
+}
+
+/**
 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
 * @pid: the pid in question.
 *
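
Since there is no glibc wrapper for the new call, a minimal user-space sketch of driving it looks roughly as follows. This is illustrative only and not part of the patch: the local struct mirrors the uapi sched_attr layout introduced by this series, SCHED_DEADLINE is 6, __NR_sched_setattr is assumed to come from the installed kernel headers, and the call needs CAP_SYS_NICE.

#define _GNU_SOURCE
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

struct sched_attr {
	uint32_t size;			/* sizeof(struct sched_attr), for ABI extension */
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;		/* SCHED_NORMAL, SCHED_BATCH */
	uint32_t sched_priority;	/* SCHED_FIFO, SCHED_RR */
	uint64_t sched_runtime;		/* SCHED_DEADLINE, all three in nanoseconds */
	uint64_t sched_deadline;
	uint64_t sched_period;
};

#define SCHED_DEADLINE	6

int main(void)
{
	struct sched_attr attr = {
		.size		= sizeof(attr),
		.sched_policy	= SCHED_DEADLINE,
		.sched_runtime	=  10 * 1000 * 1000,	/* 10 ms of budget ...  */
		.sched_deadline	=  30 * 1000 * 1000,	/* ... within 30 ms ... */
		.sched_period	= 100 * 1000 * 1000,	/* ... every 100 ms     */
	};

	/* pid 0 means the calling task; the last argument is the (unused) flags word */
	if (syscall(__NR_sched_setattr, 0, &attr, 0)) {
		perror("sched_setattr");
		return 1;
	}

	/* ... periodic -deadline work would run here ... */
	return 0;
}

sched_getattr() is the symmetric read side: it takes an explicit size argument and uses the size-based checks in sched_copy_attr()/sched_read_attr() above so that older and newer user-space binaries keep working as the structure grows.
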
@@ -3316,6 +3748,10 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
 	if (retval)
 		goto out_unlock;
 
+	if (task_has_dl_policy(p)) {
+		retval = -EINVAL;
+		goto out_unlock;
+	}
 	lp.sched_priority = p->rt_priority;
 	rcu_read_unlock();
 
@@ -3331,6 +3767,96 @@ out_unlock:
 	return retval;
 }
 
+static int sched_read_attr(struct sched_attr __user *uattr,
+			   struct sched_attr *attr,
+			   unsigned int usize)
+{
+	int ret;
+
+	if (!access_ok(VERIFY_WRITE, uattr, usize))
+		return -EFAULT;
+
+	/*
+	 * If we're handed a smaller struct than we know of,
+	 * ensure all the unknown bits are 0 - i.e. old
+	 * user-space does not get uncomplete information.
+	 */
+	if (usize < sizeof(*attr)) {
+		unsigned char *addr;
+		unsigned char *end;
+
+		addr = (void *)attr + usize;
+		end  = (void *)attr + sizeof(*attr);
+
+		for (; addr < end; addr++) {
+			if (*addr)
+				goto err_size;
+		}
+
+		attr->size = usize;
+	}
+
+	ret = copy_to_user(uattr, attr, attr->size);
+	if (ret)
+		return -EFAULT;
+
+out:
+	return ret;
+
+err_size:
+	ret = -E2BIG;
+	goto out;
+}
+
+/**
+ * sys_sched_getattr - similar to sched_getparam, but with sched_attr
+ * @pid: the pid in question.
+ * @uattr: structure containing the extended parameters.
+ * @size: sizeof(attr) for fwd/bwd comp.
+ */
+SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
+		unsigned int, size, unsigned int, flags)
+{
+	struct sched_attr attr = {
+		.size = sizeof(struct sched_attr),
+	};
+	struct task_struct *p;
+	int retval;
+
+	if (!uattr || pid < 0 || size > PAGE_SIZE ||
+	    size < SCHED_ATTR_SIZE_VER0 || flags)
+		return -EINVAL;
+
+	rcu_read_lock();
+	p = find_process_by_pid(pid);
+	retval = -ESRCH;
+	if (!p)
+		goto out_unlock;
+
+	retval = security_task_getscheduler(p);
+	if (retval)
+		goto out_unlock;
+
+	attr.sched_policy = p->policy;
+	if (p->sched_reset_on_fork)
+		attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
+	if (task_has_dl_policy(p))
+		__getparam_dl(p, &attr);
+	else if (task_has_rt_policy(p))
+		attr.sched_priority = p->rt_priority;
+	else
+		attr.sched_nice = TASK_NICE(p);
+
+	rcu_read_unlock();
+
+	retval = sched_read_attr(uattr, &attr, size);
+	return retval;
+
+out_unlock:
+	rcu_read_unlock();
+	return retval;
+}
+
 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 {
 	cpumask_var_t cpus_allowed, new_mask;
@@ -3375,8 +3901,26 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 	if (retval)
 		goto out_unlock;
 
+
 	cpuset_cpus_allowed(p, cpus_allowed);
 	cpumask_and(new_mask, in_mask, cpus_allowed);
+
+	/*
+	 * Since bandwidth control happens on root_domain basis,
+	 * if admission test is enabled, we only admit -deadline
+	 * tasks allowed to run on all the CPUs in the task's
+	 * root_domain.
+	 */
+#ifdef CONFIG_SMP
+	if (task_has_dl_policy(p)) {
+		const struct cpumask *span = task_rq(p)->rd->span;
+
+		if (dl_bandwidth_enabled() && !cpumask_subset(span, new_mask)) {
+			retval = -EBUSY;
+			goto out_unlock;
+		}
+	}
+#endif
 again:
 	retval = set_cpus_allowed_ptr(p, new_mask);
 
@@ -3653,7 +4197,7 @@ again:
 	}
 
 	double_rq_lock(rq, p_rq);
-	while (task_rq(p) != p_rq) {
+	if (task_rq(p) != p_rq) {
 		double_rq_unlock(rq, p_rq);
 		goto again;
 	}
@@ -3742,6 +4286,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
 	case SCHED_RR:
 		ret = MAX_USER_RT_PRIO-1;
 		break;
+	case SCHED_DEADLINE:
 	case SCHED_NORMAL:
 	case SCHED_BATCH:
 	case SCHED_IDLE:
@@ -3768,6 +4313,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
 	case SCHED_RR:
 		ret = 1;
 		break;
+	case SCHED_DEADLINE:
 	case SCHED_NORMAL:
 	case SCHED_BATCH:
 	case SCHED_IDLE:
@@ -3811,7 +4357,9 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
 		goto out_unlock;
 
 	rq = task_rq_lock(p, &flags);
-	time_slice = p->sched_class->get_rr_interval(rq, p);
+	time_slice = 0;
+	if (p->sched_class->get_rr_interval)
+		time_slice = p->sched_class->get_rr_interval(rq, p);
 	task_rq_unlock(rq, p, &flags);
 
 	rcu_read_unlock();
@@ -4090,6 +4638,7 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
 
 	/* TODO: This is not properly updating schedstats */
 
+	trace_sched_move_numa(p, curr_cpu, target_cpu);
 	return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
 }
 
@@ -4514,13 +5063,31 @@ static int sched_cpu_active(struct notifier_block *nfb,
 static int sched_cpu_inactive(struct notifier_block *nfb,
 					unsigned long action, void *hcpu)
 {
+	unsigned long flags;
+	long cpu = (long)hcpu;
+
 	switch (action & ~CPU_TASKS_FROZEN) {
 	case CPU_DOWN_PREPARE:
-		set_cpu_active((long)hcpu, false);
+		set_cpu_active(cpu, false);
+
+		/* explicitly allow suspend */
+		if (!(action & CPU_TASKS_FROZEN)) {
+			struct dl_bw *dl_b = dl_bw_of(cpu);
+			bool overflow;
+			int cpus;
+
+			raw_spin_lock_irqsave(&dl_b->lock, flags);
+			cpus = dl_bw_cpus(cpu);
+			overflow = __dl_overflow(dl_b, cpus, 0, 0);
+			raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+			if (overflow)
+				return notifier_from_errno(-EBUSY);
+		}
 		return NOTIFY_OK;
-	default:
-		return NOTIFY_DONE;
 	}
+
+	return NOTIFY_DONE;
 }
 
 static int __init migration_init(void)
@@ -4739,6 +5306,8 @@ static void free_rootdomain(struct rcu_head *rcu)
 	struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
 
 	cpupri_cleanup(&rd->cpupri);
+	cpudl_cleanup(&rd->cpudl);
+	free_cpumask_var(rd->dlo_mask);
 	free_cpumask_var(rd->rto_mask);
 	free_cpumask_var(rd->online);
 	free_cpumask_var(rd->span);
@@ -4790,8 +5359,14 @@ static int init_rootdomain(struct root_domain *rd)
 		goto out;
 	if (!alloc_cpumask_var(&rd->online, GFP_KERNEL))
 		goto free_span;
-	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+	if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
 		goto free_online;
+	if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
+		goto free_dlo_mask;
+
+	init_dl_bw(&rd->dl_bw);
+	if (cpudl_init(&rd->cpudl) != 0)
+		goto free_dlo_mask;
 
 	if (cpupri_init(&rd->cpupri) != 0)
 		goto free_rto_mask;
@@ -4799,6 +5374,8 @@ static int init_rootdomain(struct root_domain *rd)
 
 free_rto_mask:
 	free_cpumask_var(rd->rto_mask);
+free_dlo_mask:
+	free_cpumask_var(rd->dlo_mask);
 free_online:
 	free_cpumask_var(rd->online);
 free_span:
@@ -6150,6 +6727,7 @@ void __init sched_init_smp(void)
 	free_cpumask_var(non_isolated_cpus);
 
 	init_sched_rt_class();
+	init_sched_dl_class();
 }
 #else
 void __init sched_init_smp(void)
@@ -6219,13 +6797,15 @@ void __init sched_init(void)
 #endif /* CONFIG_CPUMASK_OFFSTACK */
 	}
 
+	init_rt_bandwidth(&def_rt_bandwidth,
+			global_rt_period(), global_rt_runtime());
+	init_dl_bandwidth(&def_dl_bandwidth,
+			global_rt_period(), global_rt_runtime());
+
 #ifdef CONFIG_SMP
 	init_defrootdomain();
 #endif
 
-	init_rt_bandwidth(&def_rt_bandwidth,
-			global_rt_period(), global_rt_runtime());
-
 #ifdef CONFIG_RT_GROUP_SCHED
 	init_rt_bandwidth(&root_task_group.rt_bandwidth,
 			global_rt_period(), global_rt_runtime());
@@ -6249,6 +6829,7 @@ void __init sched_init(void)
 		rq->calc_load_update = jiffies + LOAD_FREQ;
 		init_cfs_rq(&rq->cfs);
 		init_rt_rq(&rq->rt, rq);
+		init_dl_rq(&rq->dl, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
 		root_task_group.shares = ROOT_TASK_GROUP_LOAD;
 		INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
@@ -6320,10 +6901,6 @@ void __init sched_init(void)
 	INIT_HLIST_HEAD(&init_task.preempt_notifiers);
 #endif
 
-#ifdef CONFIG_RT_MUTEXES
-	plist_head_init(&init_task.pi_waiters);
-#endif
-
 	/*
 	 * The boot idle thread does lazy MMU switching as well:
 	 */
@@ -6397,13 +6974,16 @@ EXPORT_SYMBOL(__might_sleep);
 static void normalize_task(struct rq *rq, struct task_struct *p)
 {
 	const struct sched_class *prev_class = p->sched_class;
+	struct sched_attr attr = {
+		.sched_policy = SCHED_NORMAL,
+	};
 	int old_prio = p->prio;
 	int on_rq;
 
 	on_rq = p->on_rq;
 	if (on_rq)
 		dequeue_task(rq, p, 0);
-	__setscheduler(rq, p, SCHED_NORMAL, 0);
+	__setscheduler(rq, p, &attr);
 	if (on_rq) {
 		enqueue_task(rq, p, 0);
 		resched_task(rq->curr);
@@ -6433,7 +7013,7 @@ void normalize_rt_tasks(void)
 		p->se.statistics.block_start = 0;
 #endif
 
-		if (!rt_task(p)) {
+		if (!dl_task(p) && !rt_task(p)) {
 			/*
 			 * Renice negative nice level userspace
 			 * tasks back to 0:
@@ -6628,16 +7208,6 @@ void sched_move_task(struct task_struct *tsk)
 }
 #endif /* CONFIG_CGROUP_SCHED */
 
-#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
-static unsigned long to_ratio(u64 period, u64 runtime)
-{
-	if (runtime == RUNTIME_INF)
-		return 1ULL << 20;
-
-	return div64_u64(runtime << 20, period);
-}
-#endif
-
 #ifdef CONFIG_RT_GROUP_SCHED
 /*
 * Ensure that the real time constraints are schedulable.
@@ -6811,24 +7381,13 @@ static long sched_group_rt_period(struct task_group *tg)
 	do_div(rt_period_us, NSEC_PER_USEC);
 	return rt_period_us;
 }
+#endif /* CONFIG_RT_GROUP_SCHED */
 
+#ifdef CONFIG_RT_GROUP_SCHED
 static int sched_rt_global_constraints(void)
 {
-	u64 runtime, period;
 	int ret = 0;
 
-	if (sysctl_sched_rt_period <= 0)
-		return -EINVAL;
-
-	runtime = global_rt_runtime();
-	period = global_rt_period();
-
-	/*
-	 * Sanity check on the sysctl variables.
-	 */
-	if (runtime > period && runtime != RUNTIME_INF)
-		return -EINVAL;
-
 	mutex_lock(&rt_constraints_mutex);
 	read_lock(&tasklist_lock);
 	ret = __rt_schedulable(NULL, 0, 0);
@@ -6851,17 +7410,7 @@ static int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
 static int sched_rt_global_constraints(void)
 {
 	unsigned long flags;
-	int i;
-
-	if (sysctl_sched_rt_period <= 0)
-		return -EINVAL;
-
-	/*
-	 * There's always some RT tasks in the root group
-	 * -- migration, kstopmachine etc..
-	 */
-	if (sysctl_sched_rt_runtime == 0)
-		return -EBUSY;
+	int i, ret = 0;
 
 	raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
 	for_each_possible_cpu(i) {
@@ -6873,36 +7422,91 @@ static int sched_rt_global_constraints(void)
 	}
 	raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
 
-	return 0;
+	return ret;
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
-int sched_rr_handler(struct ctl_table *table, int write,
-		void __user *buffer, size_t *lenp,
-		loff_t *ppos)
+static int sched_dl_global_constraints(void)
 {
-	int ret;
-	static DEFINE_MUTEX(mutex);
+	u64 runtime = global_rt_runtime();
+	u64 period = global_rt_period();
+	u64 new_bw = to_ratio(period, runtime);
+	int cpu, ret = 0;
+	unsigned long flags;
 
-	mutex_lock(&mutex);
-	ret = proc_dointvec(table, write, buffer, lenp, ppos);
-	/* make sure that internally we keep jiffies */
-	/* also, writing zero resets timeslice to default */
-	if (!ret && write) {
-		sched_rr_timeslice = sched_rr_timeslice <= 0 ?
-			RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
+	/*
+	 * Here we want to check the bandwidth not being set to some
	 * value smaller than the currently allocated bandwidth in
+	 * any of the root_domains.
+	 *
+	 * FIXME: Cycling on all the CPUs is overdoing, but simpler than
+	 * cycling on root_domains... Discussion on different/better
+	 * solutions is welcome!
+	 */
+	for_each_possible_cpu(cpu) {
+		struct dl_bw *dl_b = dl_bw_of(cpu);
+
+		raw_spin_lock_irqsave(&dl_b->lock, flags);
+		if (new_bw < dl_b->total_bw)
+			ret = -EBUSY;
+		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+
+		if (ret)
+			break;
 	}
-	mutex_unlock(&mutex);
+
 	return ret;
 }
 
+static void sched_dl_do_global(void)
+{
+	u64 new_bw = -1;
+	int cpu;
+	unsigned long flags;
+
+	def_dl_bandwidth.dl_period = global_rt_period();
+	def_dl_bandwidth.dl_runtime = global_rt_runtime();
+
+	if (global_rt_runtime() != RUNTIME_INF)
+		new_bw = to_ratio(global_rt_period(), global_rt_runtime());
+
+	/*
+	 * FIXME: As above...
+	 */
+	for_each_possible_cpu(cpu) {
+		struct dl_bw *dl_b = dl_bw_of(cpu);
+
+		raw_spin_lock_irqsave(&dl_b->lock, flags);
+		dl_b->bw = new_bw;
+		raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+	}
+}
+
+static int sched_rt_global_validate(void)
+{
+	if (sysctl_sched_rt_period <= 0)
+		return -EINVAL;
+
+	if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
+		(sysctl_sched_rt_runtime > sysctl_sched_rt_period))
+		return -EINVAL;
+
+	return 0;
+}
+
+static void sched_rt_do_global(void)
+{
+	def_rt_bandwidth.rt_runtime = global_rt_runtime();
+	def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
+}
+
 int sched_rt_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
 {
-	int ret;
 	int old_period, old_runtime;
 	static DEFINE_MUTEX(mutex);
+	int ret;
 
 	mutex_lock(&mutex);
 	old_period = sysctl_sched_rt_period;
@@ -6911,21 +7515,50 @@ int sched_rt_handler(struct ctl_table *table, int write,
6911 ret = proc_dointvec(table, write, buffer, lenp, ppos); 7515 ret = proc_dointvec(table, write, buffer, lenp, ppos);
6912 7516
6913 if (!ret && write) { 7517 if (!ret && write) {
7518 ret = sched_rt_global_validate();
7519 if (ret)
7520 goto undo;
7521
6914 ret = sched_rt_global_constraints(); 7522 ret = sched_rt_global_constraints();
6915 if (ret) { 7523 if (ret)
6916 sysctl_sched_rt_period = old_period; 7524 goto undo;
6917 sysctl_sched_rt_runtime = old_runtime; 7525
6918 } else { 7526 ret = sched_dl_global_constraints();
6919 def_rt_bandwidth.rt_runtime = global_rt_runtime(); 7527 if (ret)
6920 def_rt_bandwidth.rt_period = 7528 goto undo;
6921 ns_to_ktime(global_rt_period()); 7529
6922 } 7530 sched_rt_do_global();
7531 sched_dl_do_global();
7532 }
7533 if (0) {
7534undo:
7535 sysctl_sched_rt_period = old_period;
7536 sysctl_sched_rt_runtime = old_runtime;
6923 } 7537 }
6924 mutex_unlock(&mutex); 7538 mutex_unlock(&mutex);
6925 7539
6926 return ret; 7540 return ret;
6927} 7541}
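/*
 * A standalone sketch, not from this patch, of the "if (0) { undo: }"
 * idiom used in sched_rt_handler() above: the undo block is unreachable
 * except via goto, so the rollback of the sysctl values only runs when
 * validation fails. All names and values below are made up for
 * illustration.
 */
#include <stdio.h>

static int apply_settings(int *period, int *runtime, int new_period, int new_runtime)
{
	int old_period = *period, old_runtime = *runtime;
	int ret = 0;

	/* tentatively apply, mirroring proc_dointvec() writing the sysctls */
	*period = new_period;
	*runtime = new_runtime;

	if (new_period <= 0 || new_runtime > new_period) {
		ret = -1;
		goto undo;		/* validation failed */
	}
	if (0) {
undo:
		/* only reachable via the goto: restore the old values */
		*period = old_period;
		*runtime = old_runtime;
	}
	return ret;
}

int main(void)
{
	int period = 1000, runtime = 950;

	/* prints "-1 1000 950": the invalid update was rolled back */
	printf("%d %d %d\n", apply_settings(&period, &runtime, 100, 200), period, runtime);
	return 0;
}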
6928 7542
7543int sched_rr_handler(struct ctl_table *table, int write,
7544 void __user *buffer, size_t *lenp,
7545 loff_t *ppos)
7546{
7547 int ret;
7548 static DEFINE_MUTEX(mutex);
7549
7550 mutex_lock(&mutex);
7551 ret = proc_dointvec(table, write, buffer, lenp, ppos);
7552 /* make sure that internally we keep jiffies */
7553 /* also, writing zero resets timeslice to default */
7554 if (!ret && write) {
7555 sched_rr_timeslice = sched_rr_timeslice <= 0 ?
7556 RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
7557 }
7558 mutex_unlock(&mutex);
7559 return ret;
7560}
7561
6929#ifdef CONFIG_CGROUP_SCHED 7562#ifdef CONFIG_CGROUP_SCHED
6930 7563
6931static inline struct task_group *css_tg(struct cgroup_subsys_state *css) 7564static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
@@ -7258,15 +7891,14 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
7258 return ret; 7891 return ret;
7259} 7892}
7260 7893
7261static int cpu_stats_show(struct cgroup_subsys_state *css, struct cftype *cft, 7894static int cpu_stats_show(struct seq_file *sf, void *v)
7262 struct cgroup_map_cb *cb)
7263{ 7895{
7264 struct task_group *tg = css_tg(css); 7896 struct task_group *tg = css_tg(seq_css(sf));
7265 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; 7897 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
7266 7898
7267 cb->fill(cb, "nr_periods", cfs_b->nr_periods); 7899 seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
7268 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); 7900 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
7269 cb->fill(cb, "throttled_time", cfs_b->throttled_time); 7901 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
7270 7902
7271 return 0; 7903 return 0;
7272} 7904}
@@ -7320,7 +7952,7 @@ static struct cftype cpu_files[] = {
7320 }, 7952 },
7321 { 7953 {
7322 .name = "stat", 7954 .name = "stat",
7323 .read_map = cpu_stats_show, 7955 .seq_show = cpu_stats_show,
7324 }, 7956 },
7325#endif 7957#endif
7326#ifdef CONFIG_RT_GROUP_SCHED 7958#ifdef CONFIG_RT_GROUP_SCHED
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index f64722ff0299..622e0818f905 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -163,10 +163,9 @@ out:
163 return err; 163 return err;
164} 164}
165 165
166static int cpuacct_percpu_seq_read(struct cgroup_subsys_state *css, 166static int cpuacct_percpu_seq_show(struct seq_file *m, void *V)
167 struct cftype *cft, struct seq_file *m)
168{ 167{
169 struct cpuacct *ca = css_ca(css); 168 struct cpuacct *ca = css_ca(seq_css(m));
170 u64 percpu; 169 u64 percpu;
171 int i; 170 int i;
172 171
@@ -183,10 +182,9 @@ static const char * const cpuacct_stat_desc[] = {
183 [CPUACCT_STAT_SYSTEM] = "system", 182 [CPUACCT_STAT_SYSTEM] = "system",
184}; 183};
185 184
186static int cpuacct_stats_show(struct cgroup_subsys_state *css, 185static int cpuacct_stats_show(struct seq_file *sf, void *v)
187 struct cftype *cft, struct cgroup_map_cb *cb)
188{ 186{
189 struct cpuacct *ca = css_ca(css); 187 struct cpuacct *ca = css_ca(seq_css(sf));
190 int cpu; 188 int cpu;
191 s64 val = 0; 189 s64 val = 0;
192 190
@@ -196,7 +194,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css,
196 val += kcpustat->cpustat[CPUTIME_NICE]; 194 val += kcpustat->cpustat[CPUTIME_NICE];
197 } 195 }
198 val = cputime64_to_clock_t(val); 196 val = cputime64_to_clock_t(val);
199 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); 197 seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_USER], val);
200 198
201 val = 0; 199 val = 0;
202 for_each_online_cpu(cpu) { 200 for_each_online_cpu(cpu) {
@@ -207,7 +205,7 @@ static int cpuacct_stats_show(struct cgroup_subsys_state *css,
207 } 205 }
208 206
209 val = cputime64_to_clock_t(val); 207 val = cputime64_to_clock_t(val);
210 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); 208 seq_printf(sf, "%s %lld\n", cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
211 209
212 return 0; 210 return 0;
213} 211}
@@ -220,11 +218,11 @@ static struct cftype files[] = {
220 }, 218 },
221 { 219 {
222 .name = "usage_percpu", 220 .name = "usage_percpu",
223 .read_seq_string = cpuacct_percpu_seq_read, 221 .seq_show = cpuacct_percpu_seq_show,
224 }, 222 },
225 { 223 {
226 .name = "stat", 224 .name = "stat",
227 .read_map = cpuacct_stats_show, 225 .seq_show = cpuacct_stats_show,
228 }, 226 },
229 { } /* terminate */ 227 { } /* terminate */
230}; 228};
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
new file mode 100644
index 000000000000..5b9bb42b2d47
--- /dev/null
+++ b/kernel/sched/cpudeadline.c
@@ -0,0 +1,216 @@
1/*
2 * kernel/sched/cpudl.c
3 *
4 * Global CPU deadline management
5 *
6 * Author: Juri Lelli <j.lelli@sssup.it>
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; version 2
11 * of the License.
12 */
13
14#include <linux/gfp.h>
15#include <linux/kernel.h>
16#include "cpudeadline.h"
17
18static inline int parent(int i)
19{
20 return (i - 1) >> 1;
21}
22
23static inline int left_child(int i)
24{
25 return (i << 1) + 1;
26}
27
28static inline int right_child(int i)
29{
30 return (i << 1) + 2;
31}
32
33static inline int dl_time_before(u64 a, u64 b)
34{
35 return (s64)(a - b) < 0;
36}
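/*
 * A standalone sketch, not from this patch, of why dl_time_before()
 * compares deadlines through a signed difference: the cast keeps the
 * ordering correct across a wrap of the u64 nanosecond clock, where a
 * plain "a < b" would give the wrong answer. Values are made up.
 */
#include <stdint.h>
#include <stdio.h>

static int dl_time_before_sketch(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	uint64_t near_wrap = UINT64_MAX - 50;	/* deadline just before the wrap */
	uint64_t wrapped   = 10;		/* deadline just after the wrap  */

	/* near_wrap is "before" wrapped even though it is numerically larger */
	printf("%d\n", dl_time_before_sketch(near_wrap, wrapped));	/* 1 */
	printf("%d\n", near_wrap < wrapped);				/* 0 */
	return 0;
}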
37
38static void cpudl_exchange(struct cpudl *cp, int a, int b)
39{
40 int cpu_a = cp->elements[a].cpu, cpu_b = cp->elements[b].cpu;
41
42 swap(cp->elements[a], cp->elements[b]);
43 swap(cp->cpu_to_idx[cpu_a], cp->cpu_to_idx[cpu_b]);
44}
45
46static void cpudl_heapify(struct cpudl *cp, int idx)
47{
48 int l, r, largest;
49
50 /* adapted from lib/prio_heap.c */
51 while (1) {
52 l = left_child(idx);
53 r = right_child(idx);
54 largest = idx;
55
56 if ((l < cp->size) && dl_time_before(cp->elements[idx].dl,
57 cp->elements[l].dl))
58 largest = l;
59 if ((r < cp->size) && dl_time_before(cp->elements[largest].dl,
60 cp->elements[r].dl))
61 largest = r;
62 if (largest == idx)
63 break;
64
65 /* Push idx down the heap one level and bump one up */
66 cpudl_exchange(cp, largest, idx);
67 idx = largest;
68 }
69}
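/*
 * A standalone sketch, not from this patch, of the same 0-based binary
 * heap layout and sift-down used by cpudl_heapify() above, on an array
 * of plain keys: the largest (latest) deadline bubbles to index 0.
 * Values are made up for illustration.
 */
#include <stdint.h>
#include <stdio.h>

#define LEFT(i)		(((i) << 1) + 1)
#define RIGHT(i)	(((i) << 1) + 2)

static void sift_down(uint64_t *h, int size, int idx)
{
	for (;;) {
		int l = LEFT(idx), r = RIGHT(idx), largest = idx;

		if (l < size && h[l] > h[largest])
			largest = l;
		if (r < size && h[r] > h[largest])
			largest = r;
		if (largest == idx)
			break;

		/* push idx down one level, as cpudl_exchange() does */
		uint64_t tmp = h[idx];
		h[idx] = h[largest];
		h[largest] = tmp;
		idx = largest;
	}
}

int main(void)
{
	uint64_t h[] = { 10, 300, 200, 40, 50 };	/* root is out of place */

	sift_down(h, 5, 0);
	printf("%llu\n", (unsigned long long)h[0]);	/* prints 300 */
	return 0;
}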
70
71static void cpudl_change_key(struct cpudl *cp, int idx, u64 new_dl)
72{
73 WARN_ON(idx == IDX_INVALID || !cpu_present(idx));
74
75 if (dl_time_before(new_dl, cp->elements[idx].dl)) {
76 cp->elements[idx].dl = new_dl;
77 cpudl_heapify(cp, idx);
78 } else {
79 cp->elements[idx].dl = new_dl;
80 while (idx > 0 && dl_time_before(cp->elements[parent(idx)].dl,
81 cp->elements[idx].dl)) {
82 cpudl_exchange(cp, idx, parent(idx));
83 idx = parent(idx);
84 }
85 }
86}
87
88static inline int cpudl_maximum(struct cpudl *cp)
89{
90 return cp->elements[0].cpu;
91}
92
93/*
94 * cpudl_find - find the best (later-dl) CPU in the system
95 * @cp: the cpudl max-heap context
96 * @p: the task
97 * @later_mask: a mask to fill in with the selected CPUs (or NULL)
98 *
99 * Returns: int - best CPU (heap maximum if suitable)
100 */
101int cpudl_find(struct cpudl *cp, struct task_struct *p,
102 struct cpumask *later_mask)
103{
104 int best_cpu = -1;
105 const struct sched_dl_entity *dl_se = &p->dl;
106
107 if (later_mask && cpumask_and(later_mask, cp->free_cpus,
108 &p->cpus_allowed) && cpumask_and(later_mask,
109 later_mask, cpu_active_mask)) {
110 best_cpu = cpumask_any(later_mask);
111 goto out;
112 } else if (cpumask_test_cpu(cpudl_maximum(cp), &p->cpus_allowed) &&
113 dl_time_before(dl_se->deadline, cp->elements[0].dl)) {
114 best_cpu = cpudl_maximum(cp);
115 if (later_mask)
116 cpumask_set_cpu(best_cpu, later_mask);
117 }
118
119out:
120 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
121
122 return best_cpu;
123}
124
125/*
126 * cpudl_set - update the cpudl max-heap
127 * @cp: the cpudl max-heap context
128 * @cpu: the target cpu
129 * @dl: the new earliest deadline for this cpu
130 *
131 * Notes: assumes cpu_rq(cpu)->lock is locked
132 *
133 * Returns: (void)
134 */
135void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
136{
137 int old_idx, new_cpu;
138 unsigned long flags;
139
140 WARN_ON(!cpu_present(cpu));
141
142 raw_spin_lock_irqsave(&cp->lock, flags);
143 old_idx = cp->cpu_to_idx[cpu];
144 if (!is_valid) {
145 /* remove item */
146 if (old_idx == IDX_INVALID) {
147 /*
148 * Nothing to remove if old_idx was invalid.
149 * This could happen if rq_offline_dl is
150 * called for a CPU without -dl tasks running.
151 */
152 goto out;
153 }
154 new_cpu = cp->elements[cp->size - 1].cpu;
155 cp->elements[old_idx].dl = cp->elements[cp->size - 1].dl;
156 cp->elements[old_idx].cpu = new_cpu;
157 cp->size--;
158 cp->cpu_to_idx[new_cpu] = old_idx;
159 cp->cpu_to_idx[cpu] = IDX_INVALID;
160 while (old_idx > 0 && dl_time_before(
161 cp->elements[parent(old_idx)].dl,
162 cp->elements[old_idx].dl)) {
163 cpudl_exchange(cp, old_idx, parent(old_idx));
164 old_idx = parent(old_idx);
165 }
166 cpumask_set_cpu(cpu, cp->free_cpus);
167 cpudl_heapify(cp, old_idx);
168
169 goto out;
170 }
171
172 if (old_idx == IDX_INVALID) {
173 cp->size++;
174 cp->elements[cp->size - 1].dl = 0;
175 cp->elements[cp->size - 1].cpu = cpu;
176 cp->cpu_to_idx[cpu] = cp->size - 1;
177 cpudl_change_key(cp, cp->size - 1, dl);
178 cpumask_clear_cpu(cpu, cp->free_cpus);
179 } else {
180 cpudl_change_key(cp, old_idx, dl);
181 }
182
183out:
184 raw_spin_unlock_irqrestore(&cp->lock, flags);
185}
186
187/*
188 * cpudl_init - initialize the cpudl structure
189 * @cp: the cpudl max-heap context
190 */
191int cpudl_init(struct cpudl *cp)
192{
193 int i;
194
195 memset(cp, 0, sizeof(*cp));
196 raw_spin_lock_init(&cp->lock);
197 cp->size = 0;
198 for (i = 0; i < NR_CPUS; i++)
199 cp->cpu_to_idx[i] = IDX_INVALID;
200 if (!alloc_cpumask_var(&cp->free_cpus, GFP_KERNEL))
201 return -ENOMEM;
202 cpumask_setall(cp->free_cpus);
203
204 return 0;
205}
206
207/*
208 * cpudl_cleanup - clean up the cpudl structure
209 * @cp: the cpudl max-heap context
210 */
211void cpudl_cleanup(struct cpudl *cp)
212{
213 /*
214 * nothing to do for the moment
215 */
216}
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
new file mode 100644
index 000000000000..a202789a412c
--- /dev/null
+++ b/kernel/sched/cpudeadline.h
@@ -0,0 +1,33 @@
1#ifndef _LINUX_CPUDL_H
2#define _LINUX_CPUDL_H
3
4#include <linux/sched.h>
5
6#define IDX_INVALID -1
7
8struct array_item {
9 u64 dl;
10 int cpu;
11};
12
13struct cpudl {
14 raw_spinlock_t lock;
15 int size;
16 int cpu_to_idx[NR_CPUS];
17 struct array_item elements[NR_CPUS];
18 cpumask_var_t free_cpus;
19};
20
21
22#ifdef CONFIG_SMP
23int cpudl_find(struct cpudl *cp, struct task_struct *p,
24 struct cpumask *later_mask);
25void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
26int cpudl_init(struct cpudl *cp);
27void cpudl_cleanup(struct cpudl *cp);
28#else
29#define cpudl_set(cp, cpu, dl) do { } while (0)
30#define cpudl_init() do { } while (0)
31#endif /* CONFIG_SMP */
32
33#endif /* _LINUX_CPUDL_H */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
new file mode 100644
index 000000000000..6e79b3faa4cd
--- /dev/null
+++ b/kernel/sched/deadline.c
@@ -0,0 +1,1639 @@
1/*
2 * Deadline Scheduling Class (SCHED_DEADLINE)
3 *
4 * Earliest Deadline First (EDF) + Constant Bandwidth Server (CBS).
5 *
6 * Tasks that periodically execute their instances for less than their
7 * runtime won't miss any of their deadlines.
8 * Tasks that are not periodic or sporadic or that try to execute more
9 * than their reserved bandwidth will be slowed down (and may potentially
10 * miss some of their deadlines), and won't affect any other task.
11 *
12 * Copyright (C) 2012 Dario Faggioli <raistlin@linux.it>,
13 * Juri Lelli <juri.lelli@gmail.com>,
14 * Michael Trimarchi <michael@amarulasolutions.com>,
15 * Fabio Checconi <fchecconi@gmail.com>
16 */
17#include "sched.h"
18
19#include <linux/slab.h>
20
21struct dl_bandwidth def_dl_bandwidth;
22
23static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
24{
25 return container_of(dl_se, struct task_struct, dl);
26}
27
28static inline struct rq *rq_of_dl_rq(struct dl_rq *dl_rq)
29{
30 return container_of(dl_rq, struct rq, dl);
31}
32
33static inline struct dl_rq *dl_rq_of_se(struct sched_dl_entity *dl_se)
34{
35 struct task_struct *p = dl_task_of(dl_se);
36 struct rq *rq = task_rq(p);
37
38 return &rq->dl;
39}
40
41static inline int on_dl_rq(struct sched_dl_entity *dl_se)
42{
43 return !RB_EMPTY_NODE(&dl_se->rb_node);
44}
45
46static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
47{
48 struct sched_dl_entity *dl_se = &p->dl;
49
50 return dl_rq->rb_leftmost == &dl_se->rb_node;
51}
52
53void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
54{
55 raw_spin_lock_init(&dl_b->dl_runtime_lock);
56 dl_b->dl_period = period;
57 dl_b->dl_runtime = runtime;
58}
59
60extern unsigned long to_ratio(u64 period, u64 runtime);
61
62void init_dl_bw(struct dl_bw *dl_b)
63{
64 raw_spin_lock_init(&dl_b->lock);
65 raw_spin_lock(&def_dl_bandwidth.dl_runtime_lock);
66 if (global_rt_runtime() == RUNTIME_INF)
67 dl_b->bw = -1;
68 else
69 dl_b->bw = to_ratio(global_rt_period(), global_rt_runtime());
70 raw_spin_unlock(&def_dl_bandwidth.dl_runtime_lock);
71 dl_b->total_bw = 0;
72}
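/*
 * A standalone sketch, not from this patch, of the bandwidth bookkeeping
 * that init_dl_bw() above sets up. to_ratio() is only declared extern
 * here; the sketch assumes the usual fixed-point encoding with a 2^20
 * scale, and the admission test mirrors the new_bw/total_bw comparison
 * done in sched_dl_global_constraints(). Names and numbers are made up.
 */
#include <stdint.h>
#include <stdio.h>

#define BW_SHIFT 20	/* assumed fixed-point scale for bandwidth ratios */

static uint64_t to_ratio_sketch(uint64_t period, uint64_t runtime)
{
	return (runtime << BW_SHIFT) / period;
}

int main(void)
{
	/* global limit: 950ms every 1s, mirroring the default rt sysctls */
	uint64_t bw = to_ratio_sketch(1000000000ULL, 950000000ULL);
	/* one -deadline reservation asking for 10ms every 100ms */
	uint64_t task_bw = to_ratio_sketch(100000000ULL, 10000000ULL);
	uint64_t total_bw = 0;

	if (total_bw + task_bw <= bw) {
		total_bw += task_bw;	/* admitted: roughly 10% of one CPU */
		printf("admitted: total_bw=%llu bw=%llu\n",
		       (unsigned long long)total_bw, (unsigned long long)bw);
	}
	return 0;
}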
73
74void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq)
75{
76 dl_rq->rb_root = RB_ROOT;
77
78#ifdef CONFIG_SMP
79 /* zero means no -deadline tasks */
80 dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0;
81
82 dl_rq->dl_nr_migratory = 0;
83 dl_rq->overloaded = 0;
84 dl_rq->pushable_dl_tasks_root = RB_ROOT;
85#else
86 init_dl_bw(&dl_rq->dl_bw);
87#endif
88}
89
90#ifdef CONFIG_SMP
91
92static inline int dl_overloaded(struct rq *rq)
93{
94 return atomic_read(&rq->rd->dlo_count);
95}
96
97static inline void dl_set_overload(struct rq *rq)
98{
99 if (!rq->online)
100 return;
101
102 cpumask_set_cpu(rq->cpu, rq->rd->dlo_mask);
103 /*
104 * Must be visible before the overload count is
105 * set (as in sched_rt.c).
106 *
107 * Matched by the barrier in pull_dl_task().
108 */
109 smp_wmb();
110 atomic_inc(&rq->rd->dlo_count);
111}
112
113static inline void dl_clear_overload(struct rq *rq)
114{
115 if (!rq->online)
116 return;
117
118 atomic_dec(&rq->rd->dlo_count);
119 cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask);
120}
121
122static void update_dl_migration(struct dl_rq *dl_rq)
123{
124 if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) {
125 if (!dl_rq->overloaded) {
126 dl_set_overload(rq_of_dl_rq(dl_rq));
127 dl_rq->overloaded = 1;
128 }
129 } else if (dl_rq->overloaded) {
130 dl_clear_overload(rq_of_dl_rq(dl_rq));
131 dl_rq->overloaded = 0;
132 }
133}
134
135static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
136{
137 struct task_struct *p = dl_task_of(dl_se);
138
139 if (p->nr_cpus_allowed > 1)
140 dl_rq->dl_nr_migratory++;
141
142 update_dl_migration(dl_rq);
143}
144
145static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
146{
147 struct task_struct *p = dl_task_of(dl_se);
148
149 if (p->nr_cpus_allowed > 1)
150 dl_rq->dl_nr_migratory--;
151
152 update_dl_migration(dl_rq);
153}
154
155/*
156 * The list of pushable -deadline task is not a plist, like in
157 * sched_rt.c, it is an rb-tree with tasks ordered by deadline.
158 */
159static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
160{
161 struct dl_rq *dl_rq = &rq->dl;
162 struct rb_node **link = &dl_rq->pushable_dl_tasks_root.rb_node;
163 struct rb_node *parent = NULL;
164 struct task_struct *entry;
165 int leftmost = 1;
166
167 BUG_ON(!RB_EMPTY_NODE(&p->pushable_dl_tasks));
168
169 while (*link) {
170 parent = *link;
171 entry = rb_entry(parent, struct task_struct,
172 pushable_dl_tasks);
173 if (dl_entity_preempt(&p->dl, &entry->dl))
174 link = &parent->rb_left;
175 else {
176 link = &parent->rb_right;
177 leftmost = 0;
178 }
179 }
180
181 if (leftmost)
182 dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks;
183
184 rb_link_node(&p->pushable_dl_tasks, parent, link);
185 rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
186}
187
188static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
189{
190 struct dl_rq *dl_rq = &rq->dl;
191
192 if (RB_EMPTY_NODE(&p->pushable_dl_tasks))
193 return;
194
195 if (dl_rq->pushable_dl_tasks_leftmost == &p->pushable_dl_tasks) {
196 struct rb_node *next_node;
197
198 next_node = rb_next(&p->pushable_dl_tasks);
199 dl_rq->pushable_dl_tasks_leftmost = next_node;
200 }
201
202 rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root);
203 RB_CLEAR_NODE(&p->pushable_dl_tasks);
204}
205
206static inline int has_pushable_dl_tasks(struct rq *rq)
207{
208 return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root);
209}
210
211static int push_dl_task(struct rq *rq);
212
213#else
214
215static inline
216void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p)
217{
218}
219
220static inline
221void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p)
222{
223}
224
225static inline
226void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
227{
228}
229
230static inline
231void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
232{
233}
234
235#endif /* CONFIG_SMP */
236
237static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
238static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
239static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
240 int flags);
241
242/*
243 * We are being explicitly informed that a new instance is starting,
244 * and this means that:
245 * - the absolute deadline of the entity has to be placed at
246 * current time + relative deadline;
247 * - the runtime of the entity has to be set to the maximum value.
248 *
249 * The capability of specifying such an event is useful whenever a -deadline
250 * entity wants to (try to!) synchronize its behaviour with the scheduler's
251 * one, and to (try to!) reconcile itself with its own scheduling
252 * parameters.
253 */
254static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
255 struct sched_dl_entity *pi_se)
256{
257 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
258 struct rq *rq = rq_of_dl_rq(dl_rq);
259
260 WARN_ON(!dl_se->dl_new || dl_se->dl_throttled);
261
262 /*
263 * We use the regular wall clock time to set deadlines in the
264 * future; in fact, we must consider execution overheads (time
265 * spent on hardirq context, etc.).
266 */
267 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
268 dl_se->runtime = pi_se->dl_runtime;
269 dl_se->dl_new = 0;
270}
271
272/*
273 * Pure Earliest Deadline First (EDF) scheduling does not deal with the
274 * possibility of an entity lasting more than what it declared, and thus
275 * exhausting its runtime.
276 *
277 * Here we are interested in making runtime overrun possible, but we do
278 * not want an entity that is misbehaving to affect the scheduling of all
279 * other entities.
280 * Therefore, a budgeting strategy called Constant Bandwidth Server (CBS)
281 * is used, in order to confine each entity within its own bandwidth.
282 *
283 * This function deals exactly with that, and ensures that when the runtime
284 * of an entity is replenished, its deadline is also postponed. That ensures
285 * the overrunning entity can't interfere with other entities in the system and
286 * can't make them miss their deadlines. Reasons why this kind of overrun
287 * could happen are, typically, an entity voluntarily trying to exceed its
288 * runtime, or just having underestimated it during sched_setscheduler_ex().
289 */
290static void replenish_dl_entity(struct sched_dl_entity *dl_se,
291 struct sched_dl_entity *pi_se)
292{
293 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
294 struct rq *rq = rq_of_dl_rq(dl_rq);
295
296 BUG_ON(pi_se->dl_runtime <= 0);
297
298 /*
299 * This could be the case for a !-dl task that is boosted.
300 * Just go with full inherited parameters.
301 */
302 if (dl_se->dl_deadline == 0) {
303 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
304 dl_se->runtime = pi_se->dl_runtime;
305 }
306
307 /*
308 * We keep moving the deadline away until we get some
309 * available runtime for the entity. This ensures correct
310 * handling of situations where the runtime overrun is
311 * arbitrary large.
312 * arbitrarily large.
313 while (dl_se->runtime <= 0) {
314 dl_se->deadline += pi_se->dl_period;
315 dl_se->runtime += pi_se->dl_runtime;
316 }
317
318 /*
319 * At this point, the deadline really should be "in
320 * the future" with respect to rq->clock. If it's
321 * not, we are, for some reason, lagging too much!
322 * Anyway, after having warned userspace about that,
323 * we still try to keep things running by
324 * resetting the deadline and the budget of the
325 * entity.
326 */
327 if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
328 static bool lag_once = false;
329
330 if (!lag_once) {
331 lag_once = true;
332 printk_sched("sched: DL replenish lagged too much\n");
333 }
334 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
335 dl_se->runtime = pi_se->dl_runtime;
336 }
337}
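/*
 * A standalone sketch, not from this patch, of the replenishment loop
 * above with made-up numbers: an entity that overran by 7ms, with a 3ms
 * runtime and a 10ms period, gets its deadline pushed three periods
 * ahead before its budget is positive again.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t  runtime     = -7000000;	/* 7ms of overrun to pay back */
	uint64_t deadline    = 100000000;	/* current absolute deadline  */
	const int64_t  dl_runtime = 3000000;	/* 3ms budget per period      */
	const uint64_t dl_period  = 10000000;	/* 10ms period                */

	while (runtime <= 0) {
		deadline += dl_period;
		runtime  += dl_runtime;
	}

	/* prints runtime=2000000 deadline=130000000: three periods later */
	printf("runtime=%lld deadline=%llu\n",
	       (long long)runtime, (unsigned long long)deadline);
	return 0;
}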
338
339/*
340 * Here we check if --at time t-- an entity (which is probably being
341 * [re]activated or, in general, enqueued) can use its remaining runtime
342 * and its current deadline _without_ exceeding the bandwidth it is
343 * assigned (function returns true if it can't). We are in fact applying
344 * one of the CBS rules: when a task wakes up, if the residual runtime
345 * over residual deadline fits within the allocated bandwidth, then we
346 * can keep the current (absolute) deadline and residual budget without
347 * disrupting the schedulability of the system. Otherwise, we should
348 * refill the runtime and set the deadline a period in the future,
349 * because keeping the current (absolute) deadline of the task would
350 * result in breaking guarantees promised to other tasks (refer to
351 * Documentation/scheduler/sched-deadline.txt for more information).
352 *
353 * This function returns true if:
354 *
355 * runtime / (deadline - t) > dl_runtime / dl_period ,
356 *
357 * IOW we can't recycle current parameters.
358 *
359 * Notice that the bandwidth check is done against the period. For
360 * tasks with deadline equal to period this is the same as using
361 * dl_deadline instead of dl_period in the equation above.
362 */
363static bool dl_entity_overflow(struct sched_dl_entity *dl_se,
364 struct sched_dl_entity *pi_se, u64 t)
365{
366 u64 left, right;
367
368 /*
369 * left and right are the two sides of the equation above,
370 * after a bit of shuffling to use multiplications instead
371 * of divisions.
372 *
373 * Note that none of the time values involved in the two
374 * multiplications are absolute: dl_deadline and dl_runtime
375 * are the relative deadline and the maximum runtime of each
376 * instance, runtime is the runtime left for the last instance
377 * and (deadline - t), since t is rq->clock, is the time left
378 * to the (absolute) deadline. Even if overflowing the u64 type
379 * is very unlikely to occur in both cases, here we scale down
380 * as we want to avoid that risk at all. Scaling down by 10
381 * means that we reduce granularity to 1us. We are fine with it,
382 * since this is only a true/false check and, anyway, thinking
383 * of anything below microsecond resolution is actually fiction
384 * (but still we want to give the user that illusion >;).
385 */
386 left = (pi_se->dl_period >> DL_SCALE) * (dl_se->runtime >> DL_SCALE);
387 right = ((dl_se->deadline - t) >> DL_SCALE) *
388 (pi_se->dl_runtime >> DL_SCALE);
389
390 return dl_time_before(right, left);
391}
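/*
 * A standalone re-run, not from this patch, of the scaled comparison
 * above with concrete numbers. DL_SCALE is assumed to be 10, matching
 * the "reduce granularity to 1us" comment. A 10ms/100ms reservation
 * that wakes up with 5ms of runtime left but only 20ms to its deadline
 * would need 25% of the CPU to finish, so the check reports an overflow
 * and the parameters get refreshed.
 */
#include <stdint.h>
#include <stdio.h>

#define DL_SCALE 10	/* assumed scaling factor (~1us granularity) */

int main(void)
{
	uint64_t dl_period  = 100000000;	/* 100ms relative period    */
	uint64_t dl_runtime =  10000000;	/*  10ms runtime per period */
	uint64_t runtime    =   5000000;	/* 5ms left in the budget   */
	uint64_t time_to_dl =  20000000;	/* deadline - t = 20ms      */

	uint64_t left  = (dl_period >> DL_SCALE) * (runtime >> DL_SCALE);
	uint64_t right = (time_to_dl >> DL_SCALE) * (dl_runtime >> DL_SCALE);

	/* right < left  <=>  runtime / (deadline - t) > dl_runtime / dl_period */
	printf("overflow=%d\n", right < left);	/* prints overflow=1 */
	return 0;
}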
392
393/*
394 * When a -deadline entity is queued back on the runqueue, its runtime and
395 * deadline might need updating.
396 *
397 * The policy here is that we update the deadline of the entity only if:
398 * - the current deadline is in the past,
399 * - using the remaining runtime with the current deadline would make
400 * the entity exceed its bandwidth.
401 */
402static void update_dl_entity(struct sched_dl_entity *dl_se,
403 struct sched_dl_entity *pi_se)
404{
405 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
406 struct rq *rq = rq_of_dl_rq(dl_rq);
407
408 /*
409 * The arrival of a new instance needs special treatment, i.e.,
410 * the actual scheduling parameters have to be "renewed".
411 */
412 if (dl_se->dl_new) {
413 setup_new_dl_entity(dl_se, pi_se);
414 return;
415 }
416
417 if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
418 dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
419 dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
420 dl_se->runtime = pi_se->dl_runtime;
421 }
422}
423
424/*
425 * If the entity depleted all its runtime, and if we want it to sleep
426 * while waiting for some new execution time to become available, we
427 * set the bandwidth enforcement timer to the replenishment instant
428 * and try to activate it.
429 *
430 * Notice that it is important for the caller to know if the timer
431 * actually started or not (i.e., the replenishment instant is in
432 * the future or in the past).
433 */
434static int start_dl_timer(struct sched_dl_entity *dl_se, bool boosted)
435{
436 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
437 struct rq *rq = rq_of_dl_rq(dl_rq);
438 ktime_t now, act;
439 ktime_t soft, hard;
440 unsigned long range;
441 s64 delta;
442
443 if (boosted)
444 return 0;
445 /*
446 * We want the timer to fire at the deadline, but keep in mind
447 * that the deadline is expressed in rq->clock time and not in
448 * the hrtimer's time base.
449 */
450 act = ns_to_ktime(dl_se->deadline);
451 now = hrtimer_cb_get_time(&dl_se->dl_timer);
452 delta = ktime_to_ns(now) - rq_clock(rq);
453 act = ktime_add_ns(act, delta);
454
455 /*
456 * If the expiry time already passed, e.g., because the value
457 * chosen as the deadline is too small, don't even try to
458 * start the timer in the past!
459 */
460 if (ktime_us_delta(act, now) < 0)
461 return 0;
462
463 hrtimer_set_expires(&dl_se->dl_timer, act);
464
465 soft = hrtimer_get_softexpires(&dl_se->dl_timer);
466 hard = hrtimer_get_expires(&dl_se->dl_timer);
467 range = ktime_to_ns(ktime_sub(hard, soft));
468 __hrtimer_start_range_ns(&dl_se->dl_timer, soft,
469 range, HRTIMER_MODE_ABS, 0);
470
471 return hrtimer_active(&dl_se->dl_timer);
472}
473
474/*
475 * This is the bandwidth enforcement timer callback. If here, we know
476 * a task is not on its dl_rq, since the fact that the timer was running
477 * means the task is throttled and needs a runtime replenishment.
478 *
479 * However, what we actually do depends on whether the task is still
480 * active (i.e. it is on its rq) or has been removed from there by a call to
481 * dequeue_task_dl(). In the former case we must issue the runtime
482 * replenishment and add the task back to the dl_rq; in the latter, we just
483 * do nothing but clear dl_throttled, so that runtime and deadline
484 * updating (and the queueing back to dl_rq) will be done by the
485 * next call to enqueue_task_dl().
486 */
487static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
488{
489 struct sched_dl_entity *dl_se = container_of(timer,
490 struct sched_dl_entity,
491 dl_timer);
492 struct task_struct *p = dl_task_of(dl_se);
493 struct rq *rq = task_rq(p);
494 raw_spin_lock(&rq->lock);
495
496 /*
497 * We need to take care of possible races here. In fact, the
498 * task might have changed its scheduling policy to something
499 * different from SCHED_DEADLINE or changed its reservation
500 * parameters (through sched_setscheduler()).
501 */
502 if (!dl_task(p) || dl_se->dl_new)
503 goto unlock;
504
505 sched_clock_tick();
506 update_rq_clock(rq);
507 dl_se->dl_throttled = 0;
508 if (p->on_rq) {
509 enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
510 if (task_has_dl_policy(rq->curr))
511 check_preempt_curr_dl(rq, p, 0);
512 else
513 resched_task(rq->curr);
514#ifdef CONFIG_SMP
515 /*
516 * Queueing this task back might have overloaded rq,
517 * check if we need to kick someone away.
518 */
519 if (has_pushable_dl_tasks(rq))
520 push_dl_task(rq);
521#endif
522 }
523unlock:
524 raw_spin_unlock(&rq->lock);
525
526 return HRTIMER_NORESTART;
527}
528
529void init_dl_task_timer(struct sched_dl_entity *dl_se)
530{
531 struct hrtimer *timer = &dl_se->dl_timer;
532
533 if (hrtimer_active(timer)) {
534 hrtimer_try_to_cancel(timer);
535 return;
536 }
537
538 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
539 timer->function = dl_task_timer;
540}
541
542static
543int dl_runtime_exceeded(struct rq *rq, struct sched_dl_entity *dl_se)
544{
545 int dmiss = dl_time_before(dl_se->deadline, rq_clock(rq));
546 int rorun = dl_se->runtime <= 0;
547
548 if (!rorun && !dmiss)
549 return 0;
550
551 /*
552 * If we are beyond our current deadline and we are still
553 * executing, then we have already used some of the runtime of
554 * the next instance. Thus, if we do not account that, we are
555 * stealing bandwidth from the system at each deadline miss!
556 */
557 if (dmiss) {
558 dl_se->runtime = rorun ? dl_se->runtime : 0;
559 dl_se->runtime -= rq_clock(rq) - dl_se->deadline;
560 }
561
562 return 1;
563}
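/*
 * A standalone sketch, not from this patch, of the deadline-miss
 * accounting above: if the deadline was missed while runtime was still
 * nominally available, the stale budget is dropped and the time run past
 * the deadline is charged as negative runtime, which the replenishment
 * loop later pays back by pushing the next deadline further out.
 * Numbers are made up.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int64_t  runtime  = 1000000;	/* 1ms of budget nominally left */
	uint64_t deadline = 50000000;	/* absolute deadline at 50ms    */
	uint64_t now      = 54000000;	/* we are 4ms past the deadline */

	int dmiss = (int64_t)(deadline - now) < 0;	/* deadline missed */
	int rorun = runtime <= 0;			/* runtime overrun */

	if (dmiss) {
		runtime  = rorun ? runtime : 0;		/* drop the stale budget */
		runtime -= (int64_t)(now - deadline);	/* charge the miss       */
	}

	/* prints runtime=-4000000 exceeded=1 */
	printf("runtime=%lld exceeded=%d\n", (long long)runtime, rorun || dmiss);
	return 0;
}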
564
565extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
566
567/*
568 * Update the current task's runtime statistics (provided it is still
569 * a -deadline task and has not been removed from the dl_rq).
570 */
571static void update_curr_dl(struct rq *rq)
572{
573 struct task_struct *curr = rq->curr;
574 struct sched_dl_entity *dl_se = &curr->dl;
575 u64 delta_exec;
576
577 if (!dl_task(curr) || !on_dl_rq(dl_se))
578 return;
579
580 /*
581 * Consumed budget is computed considering the time as
582 * observed by schedulable tasks (excluding time spent
583 * in hardirq context, etc.). Deadlines are instead
584 * computed using hard walltime. This seems to be the more
585 * natural solution, but the full ramifications of this
586 * approach need further study.
587 */
588 delta_exec = rq_clock_task(rq) - curr->se.exec_start;
589 if (unlikely((s64)delta_exec < 0))
590 delta_exec = 0;
591
592 schedstat_set(curr->se.statistics.exec_max,
593 max(curr->se.statistics.exec_max, delta_exec));
594
595 curr->se.sum_exec_runtime += delta_exec;
596 account_group_exec_runtime(curr, delta_exec);
597
598 curr->se.exec_start = rq_clock_task(rq);
599 cpuacct_charge(curr, delta_exec);
600
601 sched_rt_avg_update(rq, delta_exec);
602
603 dl_se->runtime -= delta_exec;
604 if (dl_runtime_exceeded(rq, dl_se)) {
605 __dequeue_task_dl(rq, curr, 0);
606 if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted)))
607 dl_se->dl_throttled = 1;
608 else
609 enqueue_task_dl(rq, curr, ENQUEUE_REPLENISH);
610
611 if (!is_leftmost(curr, &rq->dl))
612 resched_task(curr);
613 }
614
615 /*
616 * Because -- for now -- we share the rt bandwidth, we need to
617 * account our runtime there too, otherwise actual rt tasks
618 * would be able to exceed the shared quota.
619 *
620 * Account to the root rt group for now.
621 *
622 * The solution we're working towards is having the RT groups scheduled
623 * using deadline servers -- however there's a few nasties to figure
624 * out before that can happen.
625 */
626 if (rt_bandwidth_enabled()) {
627 struct rt_rq *rt_rq = &rq->rt;
628
629 raw_spin_lock(&rt_rq->rt_runtime_lock);
630 /*
631 * We'll let actual RT tasks worry about the overflow here, we
632 * have our own CBS to keep us in line; only account when RT
633 * bandwidth is relevant.
634 */
635 if (sched_rt_bandwidth_account(rt_rq))
636 rt_rq->rt_time += delta_exec;
637 raw_spin_unlock(&rt_rq->rt_runtime_lock);
638 }
639}
640
641#ifdef CONFIG_SMP
642
643static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu);
644
645static inline u64 next_deadline(struct rq *rq)
646{
647 struct task_struct *next = pick_next_earliest_dl_task(rq, rq->cpu);
648
649 if (next && dl_prio(next->prio))
650 return next->dl.deadline;
651 else
652 return 0;
653}
654
655static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
656{
657 struct rq *rq = rq_of_dl_rq(dl_rq);
658
659 if (dl_rq->earliest_dl.curr == 0 ||
660 dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
661 /*
662 * If the dl_rq had no -deadline tasks, or if the new task
663 * has shorter deadline than the current one on dl_rq, we
664 * know that the previous earliest becomes our next earliest,
665 * as the new task becomes the earliest itself.
666 */
667 dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr;
668 dl_rq->earliest_dl.curr = deadline;
669 cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1);
670 } else if (dl_rq->earliest_dl.next == 0 ||
671 dl_time_before(deadline, dl_rq->earliest_dl.next)) {
672 /*
673 * On the other hand, if the new -deadline task has a
674 * later deadline than the earliest one on dl_rq, but
675 * it is earlier than the next (if any), we must
676 * recompute the next-earliest.
677 */
678 dl_rq->earliest_dl.next = next_deadline(rq);
679 }
680}
681
682static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
683{
684 struct rq *rq = rq_of_dl_rq(dl_rq);
685
686 /*
687 * Since we may have removed our earliest (and/or next earliest)
688 * task we must recompute them.
689 */
690 if (!dl_rq->dl_nr_running) {
691 dl_rq->earliest_dl.curr = 0;
692 dl_rq->earliest_dl.next = 0;
693 cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
694 } else {
695 struct rb_node *leftmost = dl_rq->rb_leftmost;
696 struct sched_dl_entity *entry;
697
698 entry = rb_entry(leftmost, struct sched_dl_entity, rb_node);
699 dl_rq->earliest_dl.curr = entry->deadline;
700 dl_rq->earliest_dl.next = next_deadline(rq);
701 cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1);
702 }
703}
704
705#else
706
707static inline void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
708static inline void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) {}
709
710#endif /* CONFIG_SMP */
711
712static inline
713void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
714{
715 int prio = dl_task_of(dl_se)->prio;
716 u64 deadline = dl_se->deadline;
717
718 WARN_ON(!dl_prio(prio));
719 dl_rq->dl_nr_running++;
720 inc_nr_running(rq_of_dl_rq(dl_rq));
721
722 inc_dl_deadline(dl_rq, deadline);
723 inc_dl_migration(dl_se, dl_rq);
724}
725
726static inline
727void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
728{
729 int prio = dl_task_of(dl_se)->prio;
730
731 WARN_ON(!dl_prio(prio));
732 WARN_ON(!dl_rq->dl_nr_running);
733 dl_rq->dl_nr_running--;
734 dec_nr_running(rq_of_dl_rq(dl_rq));
735
736 dec_dl_deadline(dl_rq, dl_se->deadline);
737 dec_dl_migration(dl_se, dl_rq);
738}
739
740static void __enqueue_dl_entity(struct sched_dl_entity *dl_se)
741{
742 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
743 struct rb_node **link = &dl_rq->rb_root.rb_node;
744 struct rb_node *parent = NULL;
745 struct sched_dl_entity *entry;
746 int leftmost = 1;
747
748 BUG_ON(!RB_EMPTY_NODE(&dl_se->rb_node));
749
750 while (*link) {
751 parent = *link;
752 entry = rb_entry(parent, struct sched_dl_entity, rb_node);
753 if (dl_time_before(dl_se->deadline, entry->deadline))
754 link = &parent->rb_left;
755 else {
756 link = &parent->rb_right;
757 leftmost = 0;
758 }
759 }
760
761 if (leftmost)
762 dl_rq->rb_leftmost = &dl_se->rb_node;
763
764 rb_link_node(&dl_se->rb_node, parent, link);
765 rb_insert_color(&dl_se->rb_node, &dl_rq->rb_root);
766
767 inc_dl_tasks(dl_se, dl_rq);
768}
769
770static void __dequeue_dl_entity(struct sched_dl_entity *dl_se)
771{
772 struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
773
774 if (RB_EMPTY_NODE(&dl_se->rb_node))
775 return;
776
777 if (dl_rq->rb_leftmost == &dl_se->rb_node) {
778 struct rb_node *next_node;
779
780 next_node = rb_next(&dl_se->rb_node);
781 dl_rq->rb_leftmost = next_node;
782 }
783
784 rb_erase(&dl_se->rb_node, &dl_rq->rb_root);
785 RB_CLEAR_NODE(&dl_se->rb_node);
786
787 dec_dl_tasks(dl_se, dl_rq);
788}
789
790static void
791enqueue_dl_entity(struct sched_dl_entity *dl_se,
792 struct sched_dl_entity *pi_se, int flags)
793{
794 BUG_ON(on_dl_rq(dl_se));
795
796 /*
797 * If this is a wakeup or a new instance, the scheduling
798 * parameters of the task might need updating. Otherwise,
799 * we want a replenishment of its runtime.
800 */
801 if (!dl_se->dl_new && flags & ENQUEUE_REPLENISH)
802 replenish_dl_entity(dl_se, pi_se);
803 else
804 update_dl_entity(dl_se, pi_se);
805
806 __enqueue_dl_entity(dl_se);
807}
808
809static void dequeue_dl_entity(struct sched_dl_entity *dl_se)
810{
811 __dequeue_dl_entity(dl_se);
812}
813
814static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
815{
816 struct task_struct *pi_task = rt_mutex_get_top_task(p);
817 struct sched_dl_entity *pi_se = &p->dl;
818
819 /*
820 * Use the scheduling parameters of the top pi-waiter
821 * task if we have one and its (relative) deadline is
822 * smaller than ours... otherwise we keep our runtime and
823 * deadline.
824 */
825 if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio))
826 pi_se = &pi_task->dl;
827
828 /*
829 * If p is throttled, we do nothing. In fact, if it exhausted
830 * its budget it needs a replenishment and, since it now is on
831 * its rq, the bandwidth timer callback (which clearly has not
832 * run yet) will take care of this.
833 */
834 if (p->dl.dl_throttled)
835 return;
836
837 enqueue_dl_entity(&p->dl, pi_se, flags);
838
839 if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
840 enqueue_pushable_dl_task(rq, p);
841}
842
843static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
844{
845 dequeue_dl_entity(&p->dl);
846 dequeue_pushable_dl_task(rq, p);
847}
848
849static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags)
850{
851 update_curr_dl(rq);
852 __dequeue_task_dl(rq, p, flags);
853}
854
855/*
856 * Yield task semantic for -deadline tasks is:
857 *
858 * get off the CPU until our next instance, with
859 * a new runtime. This is of little use now, since we
860 * don't have a bandwidth reclaiming mechanism. Anyway,
861 * bandwidth reclaiming is planned for the future, and
862 * yield_task_dl will indicate that some spare budget
863 * is available for other task instances to use.
864 */
865static void yield_task_dl(struct rq *rq)
866{
867 struct task_struct *p = rq->curr;
868
869 /*
870 * We make the task go to sleep until its current deadline by
871 * forcing its runtime to zero. This way, update_curr_dl() stops
872 * it and the bandwidth timer will wake it up and will give it
873 * new scheduling parameters (thanks to dl_new=1).
874 */
875 if (p->dl.runtime > 0) {
876 rq->curr->dl.dl_new = 1;
877 p->dl.runtime = 0;
878 }
879 update_curr_dl(rq);
880}
881
882#ifdef CONFIG_SMP
883
884static int find_later_rq(struct task_struct *task);
885
886static int
887select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
888{
889 struct task_struct *curr;
890 struct rq *rq;
891
892 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
893 goto out;
894
895 rq = cpu_rq(cpu);
896
897 rcu_read_lock();
898 curr = ACCESS_ONCE(rq->curr); /* unlocked access */
899
900 /*
901 * If we are dealing with a -deadline task, we must
902 * decide where to wake it up.
903 * If it has a later deadline and the current task
904 * on this rq can't move (provided the waking task
905 * can!) we prefer to send it somewhere else. On the
906 * other hand, if it has a shorter deadline, we
907 * try to make it stay here, it might be important.
908 */
909 if (unlikely(dl_task(curr)) &&
910 (curr->nr_cpus_allowed < 2 ||
911 !dl_entity_preempt(&p->dl, &curr->dl)) &&
912 (p->nr_cpus_allowed > 1)) {
913 int target = find_later_rq(p);
914
915 if (target != -1)
916 cpu = target;
917 }
918 rcu_read_unlock();
919
920out:
921 return cpu;
922}
923
924static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
925{
926 /*
927 * Current can't be migrated, useless to reschedule,
928 * let's hope p can move out.
929 */
930 if (rq->curr->nr_cpus_allowed == 1 ||
931 cpudl_find(&rq->rd->cpudl, rq->curr, NULL) == -1)
932 return;
933
934 /*
935 * p is migratable, so let's not schedule it and
936 * see if it is pushed or pulled somewhere else.
937 */
938 if (p->nr_cpus_allowed != 1 &&
939 cpudl_find(&rq->rd->cpudl, p, NULL) != -1)
940 return;
941
942 resched_task(rq->curr);
943}
944
945#endif /* CONFIG_SMP */
946
947/*
948 * Only called when both the current and waking task are -deadline
949 * tasks.
950 */
951static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p,
952 int flags)
953{
954 if (dl_entity_preempt(&p->dl, &rq->curr->dl)) {
955 resched_task(rq->curr);
956 return;
957 }
958
959#ifdef CONFIG_SMP
960 /*
961 * In the unlikely case current and p have the same deadline
962 * let us try to decide what's the best thing to do...
963 */
964 if ((p->dl.deadline == rq->curr->dl.deadline) &&
965 !test_tsk_need_resched(rq->curr))
966 check_preempt_equal_dl(rq, p);
967#endif /* CONFIG_SMP */
968}
969
970#ifdef CONFIG_SCHED_HRTICK
971static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
972{
973 s64 delta = p->dl.dl_runtime - p->dl.runtime;
974
975 if (delta > 10000)
976 hrtick_start(rq, p->dl.runtime);
977}
978#endif
979
980static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
981 struct dl_rq *dl_rq)
982{
983 struct rb_node *left = dl_rq->rb_leftmost;
984
985 if (!left)
986 return NULL;
987
988 return rb_entry(left, struct sched_dl_entity, rb_node);
989}
990
991struct task_struct *pick_next_task_dl(struct rq *rq)
992{
993 struct sched_dl_entity *dl_se;
994 struct task_struct *p;
995 struct dl_rq *dl_rq;
996
997 dl_rq = &rq->dl;
998
999 if (unlikely(!dl_rq->dl_nr_running))
1000 return NULL;
1001
1002 dl_se = pick_next_dl_entity(rq, dl_rq);
1003 BUG_ON(!dl_se);
1004
1005 p = dl_task_of(dl_se);
1006 p->se.exec_start = rq_clock_task(rq);
1007
1008 /* Running task will never be pushed. */
1009 dequeue_pushable_dl_task(rq, p);
1010
1011#ifdef CONFIG_SCHED_HRTICK
1012 if (hrtick_enabled(rq))
1013 start_hrtick_dl(rq, p);
1014#endif
1015
1016#ifdef CONFIG_SMP
1017 rq->post_schedule = has_pushable_dl_tasks(rq);
1018#endif /* CONFIG_SMP */
1019
1020 return p;
1021}
1022
1023static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
1024{
1025 update_curr_dl(rq);
1026
1027 if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
1028 enqueue_pushable_dl_task(rq, p);
1029}
1030
1031static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
1032{
1033 update_curr_dl(rq);
1034
1035#ifdef CONFIG_SCHED_HRTICK
1036 if (hrtick_enabled(rq) && queued && p->dl.runtime > 0)
1037 start_hrtick_dl(rq, p);
1038#endif
1039}
1040
1041static void task_fork_dl(struct task_struct *p)
1042{
1043 /*
1044 * SCHED_DEADLINE tasks cannot fork and this is achieved through
1045 * sched_fork()
1046 */
1047}
1048
1049static void task_dead_dl(struct task_struct *p)
1050{
1051 struct hrtimer *timer = &p->dl.dl_timer;
1052 struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
1053
1054 /*
1055 * Since we are TASK_DEAD we won't slip out of the domain!
1056 */
1057 raw_spin_lock_irq(&dl_b->lock);
1058 dl_b->total_bw -= p->dl.dl_bw;
1059 raw_spin_unlock_irq(&dl_b->lock);
1060
1061 hrtimer_cancel(timer);
1062}
1063
1064static void set_curr_task_dl(struct rq *rq)
1065{
1066 struct task_struct *p = rq->curr;
1067
1068 p->se.exec_start = rq_clock_task(rq);
1069
1070 /* You can't push away the running task */
1071 dequeue_pushable_dl_task(rq, p);
1072}
1073
1074#ifdef CONFIG_SMP
1075
1076/* Only try algorithms three times */
1077#define DL_MAX_TRIES 3
1078
1079static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu)
1080{
1081 if (!task_running(rq, p) &&
1082 (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) &&
1083 (p->nr_cpus_allowed > 1))
1084 return 1;
1085
1086 return 0;
1087}
1088
1089/* Returns the second earliest -deadline task, NULL otherwise */
1090static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu)
1091{
1092 struct rb_node *next_node = rq->dl.rb_leftmost;
1093 struct sched_dl_entity *dl_se;
1094 struct task_struct *p = NULL;
1095
1096next_node:
1097 next_node = rb_next(next_node);
1098 if (next_node) {
1099 dl_se = rb_entry(next_node, struct sched_dl_entity, rb_node);
1100 p = dl_task_of(dl_se);
1101
1102 if (pick_dl_task(rq, p, cpu))
1103 return p;
1104
1105 goto next_node;
1106 }
1107
1108 return NULL;
1109}
1110
1111static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask_dl);
1112
1113static int find_later_rq(struct task_struct *task)
1114{
1115 struct sched_domain *sd;
1116 struct cpumask *later_mask = __get_cpu_var(local_cpu_mask_dl);
1117 int this_cpu = smp_processor_id();
1118 int best_cpu, cpu = task_cpu(task);
1119
1120 /* Make sure the mask is initialized first */
1121 if (unlikely(!later_mask))
1122 return -1;
1123
1124 if (task->nr_cpus_allowed == 1)
1125 return -1;
1126
1127 best_cpu = cpudl_find(&task_rq(task)->rd->cpudl,
1128 task, later_mask);
1129 if (best_cpu == -1)
1130 return -1;
1131
1132 /*
1133 * If we are here, some target has been found,
1134 * the most suitable of which is cached in best_cpu.
1135 * This is, among the runqueues whose current tasks
1136 * have a later deadline than our task's, the rq
1137 * with the latest possible one.
1138 *
1139 * Now we check how well this matches with task's
1140 * affinity and system topology.
1141 *
1142 * The last cpu where the task ran is our first
1143 * guess, since it is most likely cache-hot there.
1144 */
1145 if (cpumask_test_cpu(cpu, later_mask))
1146 return cpu;
1147 /*
1148 * Check if this_cpu is to be skipped (i.e., it is
1149 * not in the mask) or not.
1150 */
1151 if (!cpumask_test_cpu(this_cpu, later_mask))
1152 this_cpu = -1;
1153
1154 rcu_read_lock();
1155 for_each_domain(cpu, sd) {
1156 if (sd->flags & SD_WAKE_AFFINE) {
1157
1158 /*
1159 * If possible, preempting this_cpu is
1160 * cheaper than migrating.
1161 */
1162 if (this_cpu != -1 &&
1163 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1164 rcu_read_unlock();
1165 return this_cpu;
1166 }
1167
1168 /*
1169 * Last chance: if best_cpu is valid and is
1170 * in the mask, that becomes our choice.
1171 */
1172 if (best_cpu < nr_cpu_ids &&
1173 cpumask_test_cpu(best_cpu, sched_domain_span(sd))) {
1174 rcu_read_unlock();
1175 return best_cpu;
1176 }
1177 }
1178 }
1179 rcu_read_unlock();
1180
1181 /*
1182 * At this point, all our guesses failed, we just return
1183 * 'something', and let the caller sort the things out.
1184 * 'something', and let the caller sort things out.
1185 if (this_cpu != -1)
1186 return this_cpu;
1187
1188 cpu = cpumask_any(later_mask);
1189 if (cpu < nr_cpu_ids)
1190 return cpu;
1191
1192 return -1;
1193}
1194
1195/* Locks the rq it finds */
1196static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq)
1197{
1198 struct rq *later_rq = NULL;
1199 int tries;
1200 int cpu;
1201
1202 for (tries = 0; tries < DL_MAX_TRIES; tries++) {
1203 cpu = find_later_rq(task);
1204
1205 if ((cpu == -1) || (cpu == rq->cpu))
1206 break;
1207
1208 later_rq = cpu_rq(cpu);
1209
1210 /* Retry if something changed. */
1211 if (double_lock_balance(rq, later_rq)) {
1212 if (unlikely(task_rq(task) != rq ||
1213 !cpumask_test_cpu(later_rq->cpu,
1214 &task->cpus_allowed) ||
1215 task_running(rq, task) || !task->on_rq)) {
1216 double_unlock_balance(rq, later_rq);
1217 later_rq = NULL;
1218 break;
1219 }
1220 }
1221
1222 /*
1223 * If the rq we found has no -deadline task, or
1224 * its earliest one has a later deadline than our
1225 * task, the rq is a good one.
1226 */
1227 if (!later_rq->dl.dl_nr_running ||
1228 dl_time_before(task->dl.deadline,
1229 later_rq->dl.earliest_dl.curr))
1230 break;
1231
1232 /* Otherwise we try again. */
1233 double_unlock_balance(rq, later_rq);
1234 later_rq = NULL;
1235 }
1236
1237 return later_rq;
1238}
1239
1240static struct task_struct *pick_next_pushable_dl_task(struct rq *rq)
1241{
1242 struct task_struct *p;
1243
1244 if (!has_pushable_dl_tasks(rq))
1245 return NULL;
1246
1247 p = rb_entry(rq->dl.pushable_dl_tasks_leftmost,
1248 struct task_struct, pushable_dl_tasks);
1249
1250 BUG_ON(rq->cpu != task_cpu(p));
1251 BUG_ON(task_current(rq, p));
1252 BUG_ON(p->nr_cpus_allowed <= 1);
1253
1254 BUG_ON(!p->on_rq);
1255 BUG_ON(!dl_task(p));
1256
1257 return p;
1258}
1259
1260/*
1261 * See if the non-running -deadline tasks on this rq
1262 * can be sent to some other CPU where they can preempt
1263 * and start executing.
1264 */
1265static int push_dl_task(struct rq *rq)
1266{
1267 struct task_struct *next_task;
1268 struct rq *later_rq;
1269
1270 if (!rq->dl.overloaded)
1271 return 0;
1272
1273 next_task = pick_next_pushable_dl_task(rq);
1274 if (!next_task)
1275 return 0;
1276
1277retry:
1278 if (unlikely(next_task == rq->curr)) {
1279 WARN_ON(1);
1280 return 0;
1281 }
1282
1283 /*
1284 * If next_task preempts rq->curr, and rq->curr
1285 * can move away, it makes sense to just reschedule
1286 * without going further in pushing next_task.
1287 */
1288 if (dl_task(rq->curr) &&
1289 dl_time_before(next_task->dl.deadline, rq->curr->dl.deadline) &&
1290 rq->curr->nr_cpus_allowed > 1) {
1291 resched_task(rq->curr);
1292 return 0;
1293 }
1294
1295 /* We might release rq lock */
1296 get_task_struct(next_task);
1297
1298 /* Will lock the rq it'll find */
1299 later_rq = find_lock_later_rq(next_task, rq);
1300 if (!later_rq) {
1301 struct task_struct *task;
1302
1303 /*
1304 * We must check all this again, since
1305 * find_lock_later_rq releases rq->lock and it is
1306 * then possible that next_task has migrated.
1307 */
1308 task = pick_next_pushable_dl_task(rq);
1309 if (task_cpu(next_task) == rq->cpu && task == next_task) {
1310 /*
1311 * The task is still there. We don't try
1312 * again, some other cpu will pull it when ready.
1313 */
1314 dequeue_pushable_dl_task(rq, next_task);
1315 goto out;
1316 }
1317
1318 if (!task)
1319 /* No more tasks */
1320 goto out;
1321
1322 put_task_struct(next_task);
1323 next_task = task;
1324 goto retry;
1325 }
1326
1327 deactivate_task(rq, next_task, 0);
1328 set_task_cpu(next_task, later_rq->cpu);
1329 activate_task(later_rq, next_task, 0);
1330
1331 resched_task(later_rq->curr);
1332
1333 double_unlock_balance(rq, later_rq);
1334
1335out:
1336 put_task_struct(next_task);
1337
1338 return 1;
1339}
1340
1341static void push_dl_tasks(struct rq *rq)
1342{
1343 /* Terminates as it moves a -deadline task */
1344 while (push_dl_task(rq))
1345 ;
1346}
1347
1348static int pull_dl_task(struct rq *this_rq)
1349{
1350 int this_cpu = this_rq->cpu, ret = 0, cpu;
1351 struct task_struct *p;
1352 struct rq *src_rq;
1353 u64 dmin = LONG_MAX;
1354
1355 if (likely(!dl_overloaded(this_rq)))
1356 return 0;
1357
1358 /*
1359 * Match the barrier from dl_set_overload(); this guarantees that if we
1360 * see overloaded we must also see the dlo_mask bit.
1361 */
1362 smp_rmb();
1363
1364 for_each_cpu(cpu, this_rq->rd->dlo_mask) {
1365 if (this_cpu == cpu)
1366 continue;
1367
1368 src_rq = cpu_rq(cpu);
1369
1370 /*
1371 * It looks racy, and it is! However, as in sched_rt.c,
1372 * we are fine with this.
1373 */
1374 if (this_rq->dl.dl_nr_running &&
1375 dl_time_before(this_rq->dl.earliest_dl.curr,
1376 src_rq->dl.earliest_dl.next))
1377 continue;
1378
1379 /* Might drop this_rq->lock */
1380 double_lock_balance(this_rq, src_rq);
1381
1382 /*
1383 * If there are no more pullable tasks on the
1384 * rq, we're done with it.
1385 */
1386 if (src_rq->dl.dl_nr_running <= 1)
1387 goto skip;
1388
1389 p = pick_next_earliest_dl_task(src_rq, this_cpu);
1390
1391 /*
1392 * We found a task to be pulled if:
1393 * - it preempts our current (if there's one),
1394 * - it will preempt the last one we pulled (if any).
1395 */
1396 if (p && dl_time_before(p->dl.deadline, dmin) &&
1397 (!this_rq->dl.dl_nr_running ||
1398 dl_time_before(p->dl.deadline,
1399 this_rq->dl.earliest_dl.curr))) {
1400 WARN_ON(p == src_rq->curr);
1401 WARN_ON(!p->on_rq);
1402
1403 /*
1404 * Then we pull iff p has actually an earlier
1405 * deadline than the current task of its runqueue.
1406 */
1407 if (dl_time_before(p->dl.deadline,
1408 src_rq->curr->dl.deadline))
1409 goto skip;
1410
1411 ret = 1;
1412
1413 deactivate_task(src_rq, p, 0);
1414 set_task_cpu(p, this_cpu);
1415 activate_task(this_rq, p, 0);
1416 dmin = p->dl.deadline;
1417
1418 /* Is there any other task even earlier? */
1419 }
1420skip:
1421 double_unlock_balance(this_rq, src_rq);
1422 }
1423
1424 return ret;
1425}
1426
1427static void pre_schedule_dl(struct rq *rq, struct task_struct *prev)
1428{
1429 /* Try to pull other tasks here */
1430 if (dl_task(prev))
1431 pull_dl_task(rq);
1432}
1433
1434static void post_schedule_dl(struct rq *rq)
1435{
1436 push_dl_tasks(rq);
1437}
1438
1439/*
1440 * Since the task is not running and a reschedule is not going to happen
1441 * anytime soon on its runqueue, we try pushing it away now.
1442 */
1443static void task_woken_dl(struct rq *rq, struct task_struct *p)
1444{
1445 if (!task_running(rq, p) &&
1446 !test_tsk_need_resched(rq->curr) &&
1447 has_pushable_dl_tasks(rq) &&
1448 p->nr_cpus_allowed > 1 &&
1449 dl_task(rq->curr) &&
1450 (rq->curr->nr_cpus_allowed < 2 ||
1451 dl_entity_preempt(&rq->curr->dl, &p->dl))) {
1452 push_dl_tasks(rq);
1453 }
1454}
1455
1456static void set_cpus_allowed_dl(struct task_struct *p,
1457 const struct cpumask *new_mask)
1458{
1459 struct rq *rq;
1460 int weight;
1461
1462 BUG_ON(!dl_task(p));
1463
1464 /*
1465 * Update only if the task is actually running (i.e.,
1466 * it is on the rq AND it is not throttled).
1467 */
1468 if (!on_dl_rq(&p->dl))
1469 return;
1470
1471 weight = cpumask_weight(new_mask);
1472
1473 /*
1474 * Only update if the process changes whether or not it
1475 * can migrate.
1476 */
1477 if ((p->nr_cpus_allowed > 1) == (weight > 1))
1478 return;
1479
1480 rq = task_rq(p);
1481
1482 /*
1483 * The process used to be able to migrate OR it can now migrate
1484 */
1485 if (weight <= 1) {
1486 if (!task_current(rq, p))
1487 dequeue_pushable_dl_task(rq, p);
1488 BUG_ON(!rq->dl.dl_nr_migratory);
1489 rq->dl.dl_nr_migratory--;
1490 } else {
1491 if (!task_current(rq, p))
1492 enqueue_pushable_dl_task(rq, p);
1493 rq->dl.dl_nr_migratory++;
1494 }
1495
1496 update_dl_migration(&rq->dl);
1497}
1498
1499/* Assumes rq->lock is held */
1500static void rq_online_dl(struct rq *rq)
1501{
1502 if (rq->dl.overloaded)
1503 dl_set_overload(rq);
1504
1505 if (rq->dl.dl_nr_running > 0)
1506 cpudl_set(&rq->rd->cpudl, rq->cpu, rq->dl.earliest_dl.curr, 1);
1507}
1508
1509/* Assumes rq->lock is held */
1510static void rq_offline_dl(struct rq *rq)
1511{
1512 if (rq->dl.overloaded)
1513 dl_clear_overload(rq);
1514
1515 cpudl_set(&rq->rd->cpudl, rq->cpu, 0, 0);
1516}
1517
1518void init_sched_dl_class(void)
1519{
1520 unsigned int i;
1521
1522 for_each_possible_cpu(i)
1523 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask_dl, i),
1524 GFP_KERNEL, cpu_to_node(i));
1525}
1526
1527#endif /* CONFIG_SMP */
1528
1529static void switched_from_dl(struct rq *rq, struct task_struct *p)
1530{
1531 if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy))
1532 hrtimer_try_to_cancel(&p->dl.dl_timer);
1533
1534#ifdef CONFIG_SMP
1535 /*
1536 * Since this might be the only -deadline task on the rq,
1537 * this is the right place to try to pull some other one
1538 * from an overloaded cpu, if any.
1539 */
1540 if (!rq->dl.dl_nr_running)
1541 pull_dl_task(rq);
1542#endif
1543}
1544
1545/*
1546 * When switching to -deadline, we may overload the rq, then
1547 * we try to push someone off, if possible.
1548 */
1549static void switched_to_dl(struct rq *rq, struct task_struct *p)
1550{
1551 int check_resched = 1;
1552
1553 /*
1554 * If p is throttled, don't consider the possibility
1555	 * of preempting rq->curr; the check will be done right
1556	 * after its runtime is replenished.
1557 */
1558 if (unlikely(p->dl.dl_throttled))
1559 return;
1560
1561 if (p->on_rq || rq->curr != p) {
1562#ifdef CONFIG_SMP
1563 if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
1564 /* Only reschedule if pushing failed */
1565 check_resched = 0;
1566#endif /* CONFIG_SMP */
1567 if (check_resched && task_has_dl_policy(rq->curr))
1568 check_preempt_curr_dl(rq, p, 0);
1569 }
1570}
1571
1572/*
1573 * If the scheduling parameters of a -deadline task changed,
1574 * a push or pull operation might be needed.
1575 */
1576static void prio_changed_dl(struct rq *rq, struct task_struct *p,
1577 int oldprio)
1578{
1579 if (p->on_rq || rq->curr == p) {
1580#ifdef CONFIG_SMP
1581 /*
1582 * This might be too much, but unfortunately
1583 * we don't have the old deadline value, and
1584	 * we can't tell whether the task's deadline got
1585	 * earlier or later, so...
1586 */
1587 if (!rq->dl.overloaded)
1588 pull_dl_task(rq);
1589
1590 /*
1591	 * If we now have an earlier deadline task than p,
1592 * then reschedule, provided p is still on this
1593 * runqueue.
1594 */
1595 if (dl_time_before(rq->dl.earliest_dl.curr, p->dl.deadline) &&
1596 rq->curr == p)
1597 resched_task(p);
1598#else
1599 /*
1600	 * Again, we don't know if p has an earlier
1601	 * or later deadline, so let's blindly set a
1602	 * (possibly unnecessary) rescheduling point.
1603 */
1604 resched_task(p);
1605#endif /* CONFIG_SMP */
1606 } else
1607 switched_to_dl(rq, p);
1608}
1609
1610const struct sched_class dl_sched_class = {
1611 .next = &rt_sched_class,
1612 .enqueue_task = enqueue_task_dl,
1613 .dequeue_task = dequeue_task_dl,
1614 .yield_task = yield_task_dl,
1615
1616 .check_preempt_curr = check_preempt_curr_dl,
1617
1618 .pick_next_task = pick_next_task_dl,
1619 .put_prev_task = put_prev_task_dl,
1620
1621#ifdef CONFIG_SMP
1622 .select_task_rq = select_task_rq_dl,
1623 .set_cpus_allowed = set_cpus_allowed_dl,
1624 .rq_online = rq_online_dl,
1625 .rq_offline = rq_offline_dl,
1626 .pre_schedule = pre_schedule_dl,
1627 .post_schedule = post_schedule_dl,
1628 .task_woken = task_woken_dl,
1629#endif
1630
1631 .set_curr_task = set_curr_task_dl,
1632 .task_tick = task_tick_dl,
1633 .task_fork = task_fork_dl,
1634 .task_dead = task_dead_dl,
1635
1636 .prio_changed = prio_changed_dl,
1637 .switched_from = switched_from_dl,
1638 .switched_to = switched_to_dl,
1639};
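
For readers who want to see the class above driven from userspace, here is a minimal sketch, assuming the sched_setattr() syscall and struct sched_attr ABI introduced alongside this series. The struct layout and the SCHED_DEADLINE policy value below are assumptions copied from that ABI, __NR_sched_setattr must be provided by your kernel headers, and running it typically requires root; the request is also subject to the dl_bw admission test described further down in sched.h.

#define _GNU_SOURCE
#include <stdio.h>
#include <stdint.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>

#ifndef SCHED_DEADLINE
#define SCHED_DEADLINE	6	/* assumed policy value from this series */
#endif

/* Assumed layout of the new sched_setattr() ABI. */
struct sched_attr {
	uint32_t size;
	uint32_t sched_policy;
	uint64_t sched_flags;
	int32_t  sched_nice;		/* SCHED_NORMAL/SCHED_BATCH */
	uint32_t sched_priority;	/* SCHED_FIFO/SCHED_RR */
	/* SCHED_DEADLINE parameters, in nanoseconds */
	uint64_t sched_runtime;
	uint64_t sched_deadline;
	uint64_t sched_period;
};

static int sched_setattr(pid_t pid, const struct sched_attr *attr,
			 unsigned int flags)
{
	return syscall(__NR_sched_setattr, pid, attr, flags);
}

int main(void)
{
	struct sched_attr attr = {
		.size		= sizeof(attr),
		.sched_policy	= SCHED_DEADLINE,
		.sched_runtime	= 10 * 1000 * 1000,	/* 10ms of budget ... */
		.sched_deadline	= 30 * 1000 * 1000,	/* ... due within 30ms ... */
		.sched_period	= 30 * 1000 * 1000,	/* ... every 30ms */
	};

	if (sched_setattr(0, &attr, 0)) {	/* pid 0 == calling thread */
		perror("sched_setattr");
		return 1;
	}

	/* Burn CPU; enqueue/dequeue, throttling and replenishment are
	 * now handled by the dl_sched_class hooks listed above. */
	for (;;)
		;
}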
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 5c34d1817e8f..dd52e7ffb10e 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -139,7 +139,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
139 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L); 139 0LL, 0LL, 0LL, 0L, 0LL, 0L, 0LL, 0L);
140#endif 140#endif
141#ifdef CONFIG_NUMA_BALANCING 141#ifdef CONFIG_NUMA_BALANCING
142 SEQ_printf(m, " %d", cpu_to_node(task_cpu(p))); 142 SEQ_printf(m, " %d", task_node(p));
143#endif 143#endif
144#ifdef CONFIG_CGROUP_SCHED 144#ifdef CONFIG_CGROUP_SCHED
145 SEQ_printf(m, " %s", task_group_path(task_group(p))); 145 SEQ_printf(m, " %s", task_group_path(task_group(p)));
@@ -371,7 +371,7 @@ static void sched_debug_header(struct seq_file *m)
371 PN(cpu_clk); 371 PN(cpu_clk);
372 P(jiffies); 372 P(jiffies);
373#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 373#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
374 P(sched_clock_stable); 374 P(sched_clock_stable());
375#endif 375#endif
376#undef PN 376#undef PN
377#undef P 377#undef P
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c7395d97e4cb..9b4c4f320130 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -872,15 +872,6 @@ static unsigned int task_scan_max(struct task_struct *p)
872 return max(smin, smax); 872 return max(smin, smax);
873} 873}
874 874
875/*
876 * Once a preferred node is selected the scheduler balancer will prefer moving
877 * a task to that node for sysctl_numa_balancing_settle_count number of PTE
878 * scans. This will give the process the chance to accumulate more faults on
879 * the preferred node but still allow the scheduler to move the task again if
880 * the nodes CPUs are overloaded.
881 */
882unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
883
884static void account_numa_enqueue(struct rq *rq, struct task_struct *p) 875static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
885{ 876{
886 rq->nr_numa_running += (p->numa_preferred_nid != -1); 877 rq->nr_numa_running += (p->numa_preferred_nid != -1);
@@ -930,7 +921,8 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
930 if (!p->numa_group) 921 if (!p->numa_group)
931 return 0; 922 return 0;
932 923
933 return p->numa_group->faults[2*nid] + p->numa_group->faults[2*nid+1]; 924 return p->numa_group->faults[task_faults_idx(nid, 0)] +
925 p->numa_group->faults[task_faults_idx(nid, 1)];
934} 926}
935 927
936/* 928/*
@@ -1023,7 +1015,7 @@ struct task_numa_env {
1023 1015
1024 struct numa_stats src_stats, dst_stats; 1016 struct numa_stats src_stats, dst_stats;
1025 1017
1026 int imbalance_pct, idx; 1018 int imbalance_pct;
1027 1019
1028 struct task_struct *best_task; 1020 struct task_struct *best_task;
1029 long best_imp; 1021 long best_imp;
@@ -1211,7 +1203,7 @@ static int task_numa_migrate(struct task_struct *p)
1211 * elsewhere, so there is no point in (re)trying. 1203 * elsewhere, so there is no point in (re)trying.
1212 */ 1204 */
1213 if (unlikely(!sd)) { 1205 if (unlikely(!sd)) {
1214 p->numa_preferred_nid = cpu_to_node(task_cpu(p)); 1206 p->numa_preferred_nid = task_node(p);
1215 return -EINVAL; 1207 return -EINVAL;
1216 } 1208 }
1217 1209
@@ -1258,11 +1250,15 @@ static int task_numa_migrate(struct task_struct *p)
1258 p->numa_scan_period = task_scan_min(p); 1250 p->numa_scan_period = task_scan_min(p);
1259 1251
1260 if (env.best_task == NULL) { 1252 if (env.best_task == NULL) {
1261 int ret = migrate_task_to(p, env.best_cpu); 1253 ret = migrate_task_to(p, env.best_cpu);
1254 if (ret != 0)
1255 trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1262 return ret; 1256 return ret;
1263 } 1257 }
1264 1258
1265 ret = migrate_swap(p, env.best_task); 1259 ret = migrate_swap(p, env.best_task);
1260 if (ret != 0)
1261 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1266 put_task_struct(env.best_task); 1262 put_task_struct(env.best_task);
1267 return ret; 1263 return ret;
1268} 1264}
@@ -1278,7 +1274,7 @@ static void numa_migrate_preferred(struct task_struct *p)
1278 p->numa_migrate_retry = jiffies + HZ; 1274 p->numa_migrate_retry = jiffies + HZ;
1279 1275
1280 /* Success if task is already running on preferred CPU */ 1276 /* Success if task is already running on preferred CPU */
1281 if (cpu_to_node(task_cpu(p)) == p->numa_preferred_nid) 1277 if (task_node(p) == p->numa_preferred_nid)
1282 return; 1278 return;
1283 1279
1284 /* Otherwise, try migrate to a CPU on the preferred node */ 1280 /* Otherwise, try migrate to a CPU on the preferred node */
@@ -1350,7 +1346,6 @@ static void update_task_scan_period(struct task_struct *p,
1350 * scanning faster if shared accesses dominate as it may 1346 * scanning faster if shared accesses dominate as it may
1351 * simply bounce migrations uselessly 1347 * simply bounce migrations uselessly
1352 */ 1348 */
1353 period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS);
1354 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared)); 1349 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
1355 diff = (diff * ratio) / NUMA_PERIOD_SLOTS; 1350 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1356 } 1351 }
@@ -1762,6 +1757,8 @@ void task_numa_work(struct callback_head *work)
1762 start = end; 1757 start = end;
1763 if (pages <= 0) 1758 if (pages <= 0)
1764 goto out; 1759 goto out;
1760
1761 cond_resched();
1765 } while (end != vma->vm_end); 1762 } while (end != vma->vm_end);
1766 } 1763 }
1767 1764
@@ -2365,13 +2362,7 @@ static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
2365 } 2362 }
2366 wakeup = 0; 2363 wakeup = 0;
2367 } else { 2364 } else {
2368 /* 2365 __synchronize_entity_decay(se);
2369 * Task re-woke on same cpu (or else migrate_task_rq_fair()
2370 * would have made count negative); we must be careful to avoid
2371 * double-accounting blocked time after synchronizing decays.
2372 */
2373 se->avg.last_runnable_update += __synchronize_entity_decay(se)
2374 << 20;
2375 } 2366 }
2376 2367
2377 /* migrated tasks did not contribute to our blocked load */ 2368 /* migrated tasks did not contribute to our blocked load */
@@ -3923,7 +3914,7 @@ static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
3923{ 3914{
3924 struct sched_entity *se = tg->se[cpu]; 3915 struct sched_entity *se = tg->se[cpu];
3925 3916
3926 if (!tg->parent || !wl) /* the trivial, non-cgroup case */ 3917 if (!tg->parent) /* the trivial, non-cgroup case */
3927 return wl; 3918 return wl;
3928 3919
3929 for_each_sched_entity(se) { 3920 for_each_sched_entity(se) {
@@ -4101,12 +4092,16 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4101 */ 4092 */
4102static struct sched_group * 4093static struct sched_group *
4103find_idlest_group(struct sched_domain *sd, struct task_struct *p, 4094find_idlest_group(struct sched_domain *sd, struct task_struct *p,
4104 int this_cpu, int load_idx) 4095 int this_cpu, int sd_flag)
4105{ 4096{
4106 struct sched_group *idlest = NULL, *group = sd->groups; 4097 struct sched_group *idlest = NULL, *group = sd->groups;
4107 unsigned long min_load = ULONG_MAX, this_load = 0; 4098 unsigned long min_load = ULONG_MAX, this_load = 0;
4099 int load_idx = sd->forkexec_idx;
4108 int imbalance = 100 + (sd->imbalance_pct-100)/2; 4100 int imbalance = 100 + (sd->imbalance_pct-100)/2;
4109 4101
4102 if (sd_flag & SD_BALANCE_WAKE)
4103 load_idx = sd->wake_idx;
4104
4110 do { 4105 do {
4111 unsigned long load, avg_load; 4106 unsigned long load, avg_load;
4112 int local_group; 4107 int local_group;
@@ -4274,7 +4269,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
4274 } 4269 }
4275 4270
4276 while (sd) { 4271 while (sd) {
4277 int load_idx = sd->forkexec_idx;
4278 struct sched_group *group; 4272 struct sched_group *group;
4279 int weight; 4273 int weight;
4280 4274
@@ -4283,10 +4277,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
4283 continue; 4277 continue;
4284 } 4278 }
4285 4279
4286 if (sd_flag & SD_BALANCE_WAKE) 4280 group = find_idlest_group(sd, p, cpu, sd_flag);
4287 load_idx = sd->wake_idx;
4288
4289 group = find_idlest_group(sd, p, cpu, load_idx);
4290 if (!group) { 4281 if (!group) {
4291 sd = sd->child; 4282 sd = sd->child;
4292 continue; 4283 continue;
@@ -5512,7 +5503,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5512 struct sched_group *group, int load_idx, 5503 struct sched_group *group, int load_idx,
5513 int local_group, struct sg_lb_stats *sgs) 5504 int local_group, struct sg_lb_stats *sgs)
5514{ 5505{
5515 unsigned long nr_running;
5516 unsigned long load; 5506 unsigned long load;
5517 int i; 5507 int i;
5518 5508
@@ -5521,8 +5511,6 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5521 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) { 5511 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
5522 struct rq *rq = cpu_rq(i); 5512 struct rq *rq = cpu_rq(i);
5523 5513
5524 nr_running = rq->nr_running;
5525
5526 /* Bias balancing toward cpus of our domain */ 5514 /* Bias balancing toward cpus of our domain */
5527 if (local_group) 5515 if (local_group)
5528 load = target_load(i, load_idx); 5516 load = target_load(i, load_idx);
@@ -5530,7 +5518,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
5530 load = source_load(i, load_idx); 5518 load = source_load(i, load_idx);
5531 5519
5532 sgs->group_load += load; 5520 sgs->group_load += load;
5533 sgs->sum_nr_running += nr_running; 5521 sgs->sum_nr_running += rq->nr_running;
5534#ifdef CONFIG_NUMA_BALANCING 5522#ifdef CONFIG_NUMA_BALANCING
5535 sgs->nr_numa_running += rq->nr_numa_running; 5523 sgs->nr_numa_running += rq->nr_numa_running;
5536 sgs->nr_preferred_running += rq->nr_preferred_running; 5524 sgs->nr_preferred_running += rq->nr_preferred_running;
@@ -6521,7 +6509,7 @@ static struct {
6521 unsigned long next_balance; /* in jiffy units */ 6509 unsigned long next_balance; /* in jiffy units */
6522} nohz ____cacheline_aligned; 6510} nohz ____cacheline_aligned;
6523 6511
6524static inline int find_new_ilb(int call_cpu) 6512static inline int find_new_ilb(void)
6525{ 6513{
6526 int ilb = cpumask_first(nohz.idle_cpus_mask); 6514 int ilb = cpumask_first(nohz.idle_cpus_mask);
6527 6515
@@ -6536,13 +6524,13 @@ static inline int find_new_ilb(int call_cpu)
6536 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle 6524 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
6537 * CPU (if there is one). 6525 * CPU (if there is one).
6538 */ 6526 */
6539static void nohz_balancer_kick(int cpu) 6527static void nohz_balancer_kick(void)
6540{ 6528{
6541 int ilb_cpu; 6529 int ilb_cpu;
6542 6530
6543 nohz.next_balance++; 6531 nohz.next_balance++;
6544 6532
6545 ilb_cpu = find_new_ilb(cpu); 6533 ilb_cpu = find_new_ilb();
6546 6534
6547 if (ilb_cpu >= nr_cpu_ids) 6535 if (ilb_cpu >= nr_cpu_ids)
6548 return; 6536 return;
@@ -6652,10 +6640,10 @@ void update_max_interval(void)
6652 * 6640 *
6653 * Balancing parameters are set up in init_sched_domains. 6641 * Balancing parameters are set up in init_sched_domains.
6654 */ 6642 */
6655static void rebalance_domains(int cpu, enum cpu_idle_type idle) 6643static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
6656{ 6644{
6657 int continue_balancing = 1; 6645 int continue_balancing = 1;
6658 struct rq *rq = cpu_rq(cpu); 6646 int cpu = rq->cpu;
6659 unsigned long interval; 6647 unsigned long interval;
6660 struct sched_domain *sd; 6648 struct sched_domain *sd;
6661 /* Earliest time when we have to do rebalance again */ 6649 /* Earliest time when we have to do rebalance again */
@@ -6752,9 +6740,9 @@ out:
6752 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the 6740 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
6753 * rebalancing for all the cpus for whom scheduler ticks are stopped. 6741 * rebalancing for all the cpus for whom scheduler ticks are stopped.
6754 */ 6742 */
6755static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) 6743static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
6756{ 6744{
6757 struct rq *this_rq = cpu_rq(this_cpu); 6745 int this_cpu = this_rq->cpu;
6758 struct rq *rq; 6746 struct rq *rq;
6759 int balance_cpu; 6747 int balance_cpu;
6760 6748
@@ -6781,7 +6769,7 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
6781 update_idle_cpu_load(rq); 6769 update_idle_cpu_load(rq);
6782 raw_spin_unlock_irq(&rq->lock); 6770 raw_spin_unlock_irq(&rq->lock);
6783 6771
6784 rebalance_domains(balance_cpu, CPU_IDLE); 6772 rebalance_domains(rq, CPU_IDLE);
6785 6773
6786 if (time_after(this_rq->next_balance, rq->next_balance)) 6774 if (time_after(this_rq->next_balance, rq->next_balance))
6787 this_rq->next_balance = rq->next_balance; 6775 this_rq->next_balance = rq->next_balance;
@@ -6800,14 +6788,14 @@ end:
6800 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler 6788 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
6801 * domain span are idle. 6789 * domain span are idle.
6802 */ 6790 */
6803static inline int nohz_kick_needed(struct rq *rq, int cpu) 6791static inline int nohz_kick_needed(struct rq *rq)
6804{ 6792{
6805 unsigned long now = jiffies; 6793 unsigned long now = jiffies;
6806 struct sched_domain *sd; 6794 struct sched_domain *sd;
6807 struct sched_group_power *sgp; 6795 struct sched_group_power *sgp;
6808 int nr_busy; 6796 int nr_busy, cpu = rq->cpu;
6809 6797
6810 if (unlikely(idle_cpu(cpu))) 6798 if (unlikely(rq->idle_balance))
6811 return 0; 6799 return 0;
6812 6800
6813 /* 6801 /*
@@ -6856,7 +6844,7 @@ need_kick:
6856 return 1; 6844 return 1;
6857} 6845}
6858#else 6846#else
6859static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } 6847static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
6860#endif 6848#endif
6861 6849
6862/* 6850/*
@@ -6865,38 +6853,39 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
6865 */ 6853 */
6866static void run_rebalance_domains(struct softirq_action *h) 6854static void run_rebalance_domains(struct softirq_action *h)
6867{ 6855{
6868 int this_cpu = smp_processor_id(); 6856 struct rq *this_rq = this_rq();
6869 struct rq *this_rq = cpu_rq(this_cpu);
6870 enum cpu_idle_type idle = this_rq->idle_balance ? 6857 enum cpu_idle_type idle = this_rq->idle_balance ?
6871 CPU_IDLE : CPU_NOT_IDLE; 6858 CPU_IDLE : CPU_NOT_IDLE;
6872 6859
6873 rebalance_domains(this_cpu, idle); 6860 rebalance_domains(this_rq, idle);
6874 6861
6875 /* 6862 /*
6876 * If this cpu has a pending nohz_balance_kick, then do the 6863 * If this cpu has a pending nohz_balance_kick, then do the
6877 * balancing on behalf of the other idle cpus whose ticks are 6864 * balancing on behalf of the other idle cpus whose ticks are
6878 * stopped. 6865 * stopped.
6879 */ 6866 */
6880 nohz_idle_balance(this_cpu, idle); 6867 nohz_idle_balance(this_rq, idle);
6881} 6868}
6882 6869
6883static inline int on_null_domain(int cpu) 6870static inline int on_null_domain(struct rq *rq)
6884{ 6871{
6885 return !rcu_dereference_sched(cpu_rq(cpu)->sd); 6872 return !rcu_dereference_sched(rq->sd);
6886} 6873}
6887 6874
6888/* 6875/*
6889 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 6876 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
6890 */ 6877 */
6891void trigger_load_balance(struct rq *rq, int cpu) 6878void trigger_load_balance(struct rq *rq)
6892{ 6879{
6893 /* Don't need to rebalance while attached to NULL domain */ 6880 /* Don't need to rebalance while attached to NULL domain */
6894 if (time_after_eq(jiffies, rq->next_balance) && 6881 if (unlikely(on_null_domain(rq)))
6895 likely(!on_null_domain(cpu))) 6882 return;
6883
6884 if (time_after_eq(jiffies, rq->next_balance))
6896 raise_softirq(SCHED_SOFTIRQ); 6885 raise_softirq(SCHED_SOFTIRQ);
6897#ifdef CONFIG_NO_HZ_COMMON 6886#ifdef CONFIG_NO_HZ_COMMON
6898 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) 6887 if (nohz_kick_needed(rq))
6899 nohz_balancer_kick(cpu); 6888 nohz_balancer_kick();
6900#endif 6889#endif
6901} 6890}
6902 6891
@@ -7012,15 +7001,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
7012 struct cfs_rq *cfs_rq = cfs_rq_of(se); 7001 struct cfs_rq *cfs_rq = cfs_rq_of(se);
7013 7002
7014 /* 7003 /*
7015 * Ensure the task's vruntime is normalized, so that when its 7004 * Ensure the task's vruntime is normalized, so that when it's
7016 * switched back to the fair class the enqueue_entity(.flags=0) will 7005 * switched back to the fair class the enqueue_entity(.flags=0) will
7017 * do the right thing. 7006 * do the right thing.
7018 * 7007 *
7019 * If it was on_rq, then the dequeue_entity(.flags=0) will already 7008 * If it's on_rq, then the dequeue_entity(.flags=0) will already
7020 * have normalized the vruntime, if it was !on_rq, then only when 7009 * have normalized the vruntime, if it's !on_rq, then only when
7021 * the task is sleeping will it still have non-normalized vruntime. 7010 * the task is sleeping will it still have non-normalized vruntime.
7022 */ 7011 */
7023 if (!se->on_rq && p->state != TASK_RUNNING) { 7012 if (!p->on_rq && p->state != TASK_RUNNING) {
7024 /* 7013 /*
7025 * Fix up our vruntime so that the current sleep doesn't 7014 * Fix up our vruntime so that the current sleep doesn't
7026 * cause 'unlimited' sleep bonus. 7015 * cause 'unlimited' sleep bonus.
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 1c4065575fa2..1999021042c7 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -538,6 +538,14 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
538 538
539#endif /* CONFIG_RT_GROUP_SCHED */ 539#endif /* CONFIG_RT_GROUP_SCHED */
540 540
541bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
542{
543 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
544
545 return (hrtimer_active(&rt_b->rt_period_timer) ||
546 rt_rq->rt_time < rt_b->rt_runtime);
547}
548
541#ifdef CONFIG_SMP 549#ifdef CONFIG_SMP
542/* 550/*
543 * We ran out of runtime, see if we can borrow some from our neighbours. 551 * We ran out of runtime, see if we can borrow some from our neighbours.
@@ -1738,7 +1746,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1738 !test_tsk_need_resched(rq->curr) && 1746 !test_tsk_need_resched(rq->curr) &&
1739 has_pushable_tasks(rq) && 1747 has_pushable_tasks(rq) &&
1740 p->nr_cpus_allowed > 1 && 1748 p->nr_cpus_allowed > 1 &&
1741 rt_task(rq->curr) && 1749 (dl_task(rq->curr) || rt_task(rq->curr)) &&
1742 (rq->curr->nr_cpus_allowed < 2 || 1750 (rq->curr->nr_cpus_allowed < 2 ||
1743 rq->curr->prio <= p->prio)) 1751 rq->curr->prio <= p->prio))
1744 push_rt_tasks(rq); 1752 push_rt_tasks(rq);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 88c85b21d633..f964add50f38 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2,6 +2,7 @@
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include <linux/sched/sysctl.h> 3#include <linux/sched/sysctl.h>
4#include <linux/sched/rt.h> 4#include <linux/sched/rt.h>
5#include <linux/sched/deadline.h>
5#include <linux/mutex.h> 6#include <linux/mutex.h>
6#include <linux/spinlock.h> 7#include <linux/spinlock.h>
7#include <linux/stop_machine.h> 8#include <linux/stop_machine.h>
@@ -9,6 +10,7 @@
9#include <linux/slab.h> 10#include <linux/slab.h>
10 11
11#include "cpupri.h" 12#include "cpupri.h"
13#include "cpudeadline.h"
12#include "cpuacct.h" 14#include "cpuacct.h"
13 15
14struct rq; 16struct rq;
@@ -73,6 +75,13 @@ extern void update_cpu_load_active(struct rq *this_rq);
73#define NICE_0_SHIFT SCHED_LOAD_SHIFT 75#define NICE_0_SHIFT SCHED_LOAD_SHIFT
74 76
75/* 77/*
78 * Single value that decides SCHED_DEADLINE internal math precision.
79 * 10 -> just above 1us
80 * 9 -> just above 0.5us
81 */
82#define DL_SCALE (10)
83
84/*
76 * These are the 'tuning knobs' of the scheduler: 85 * These are the 'tuning knobs' of the scheduler:
77 */ 86 */
78 87
@@ -81,11 +90,19 @@ extern void update_cpu_load_active(struct rq *this_rq);
81 */ 90 */
82#define RUNTIME_INF ((u64)~0ULL) 91#define RUNTIME_INF ((u64)~0ULL)
83 92
93static inline int fair_policy(int policy)
94{
95 return policy == SCHED_NORMAL || policy == SCHED_BATCH;
96}
97
84static inline int rt_policy(int policy) 98static inline int rt_policy(int policy)
85{ 99{
86 if (policy == SCHED_FIFO || policy == SCHED_RR) 100 return policy == SCHED_FIFO || policy == SCHED_RR;
87 return 1; 101}
88 return 0; 102
103static inline int dl_policy(int policy)
104{
105 return policy == SCHED_DEADLINE;
89} 106}
90 107
91static inline int task_has_rt_policy(struct task_struct *p) 108static inline int task_has_rt_policy(struct task_struct *p)
@@ -93,6 +110,25 @@ static inline int task_has_rt_policy(struct task_struct *p)
93 return rt_policy(p->policy); 110 return rt_policy(p->policy);
94} 111}
95 112
113static inline int task_has_dl_policy(struct task_struct *p)
114{
115 return dl_policy(p->policy);
116}
117
118static inline bool dl_time_before(u64 a, u64 b)
119{
120 return (s64)(a - b) < 0;
121}
122
123/*
124 * Tells if entity @a should preempt entity @b.
125 */
126static inline bool
127dl_entity_preempt(struct sched_dl_entity *a, struct sched_dl_entity *b)
128{
129 return dl_time_before(a->deadline, b->deadline);
130}
131
96/* 132/*
97 * This is the priority-queue data structure of the RT scheduling class: 133 * This is the priority-queue data structure of the RT scheduling class:
98 */ 134 */
@@ -108,6 +144,47 @@ struct rt_bandwidth {
108 u64 rt_runtime; 144 u64 rt_runtime;
109 struct hrtimer rt_period_timer; 145 struct hrtimer rt_period_timer;
110}; 146};
147/*
148 * To keep the bandwidth of -deadline tasks and groups under control
149 * we need some place where we can:
150 * - store the maximum -deadline bandwidth of the system (the group);
151 * - cache the fraction of that bandwidth that is currently allocated.
152 *
153 * This is all done in the data structure below. It is similar to the
154 * one used for RT-throttling (rt_bandwidth), with the main difference
155 * that, since here we are only interested in admission control, we
156 * do not decrease any runtime while the group "executes", nor do we
157 * need a timer to replenish it.
158 *
159 * With respect to SMP, the bandwidth is given on a per-CPU basis,
160 * meaning that:
161 * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
162 * - dl_total_bw array contains, in the i-th element, the currently
163 *   allocated bandwidth on the i-th CPU.
164 * Moreover, groups consume bandwidth on each CPU, while tasks only
165 *   consume bandwidth on the CPU they're running on.
166 * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw
167 *   that will be shown the next time the proc or cgroup controls are
168 *   read. It can in turn be changed by writing to its own
169 *   control.
170 */
171struct dl_bandwidth {
172 raw_spinlock_t dl_runtime_lock;
173 u64 dl_runtime;
174 u64 dl_period;
175};
176
177static inline int dl_bandwidth_enabled(void)
178{
179 return sysctl_sched_rt_runtime >= 0;
180}
181
182extern struct dl_bw *dl_bw_of(int i);
183
184struct dl_bw {
185 raw_spinlock_t lock;
186 u64 bw, total_bw;
187};
111 188
112extern struct mutex sched_domains_mutex; 189extern struct mutex sched_domains_mutex;
113 190
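
The comment above describes what dl_bw stores; the arithmetic behind it is plain fixed-point math. The sketch below assumes to_ratio() keeps its usual runtime<<20/period definition (the helper is only declared later in this header), assumes the cap defaults to the RT throttling values (95% of each CPU, cf. dl_bandwidth_enabled() above), and uses a made-up dl_overflow() helper modeled on, but simpler than, the admission test in core.c:

#include <stdio.h>
#include <stdint.h>

#define BW_SHIFT 20	/* 1.0 == 1 << 20, the scale to_ratio() is assumed to use */

/* Fraction of one CPU consumed by 'runtime' every 'period' (fixed point). */
static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	return (runtime << BW_SHIFT) / period;
}

struct dl_bw {
	uint64_t bw;		/* cap: bandwidth usable by -deadline per CPU */
	uint64_t total_bw;	/* bandwidth already admitted */
};

/* Would admitting new_bw on a 'cpus'-wide root domain exceed the cap? */
static int dl_overflow(const struct dl_bw *b, int cpus, uint64_t new_bw)
{
	return b->bw * cpus < b->total_bw + new_bw;
}

int main(void)
{
	/* Assumed default cap: 950000us of runtime every 1000000us. */
	struct dl_bw b = { .bw = to_ratio(1000000, 950000), .total_bw = 0 };
	/* A task asking for 10ms every 30ms uses about 1/3 of a CPU. */
	uint64_t task_bw = to_ratio(30000000, 10000000);
	int i;

	for (i = 1; !dl_overflow(&b, 1, task_bw); i++) {
		b.total_bw += task_bw;
		printf("admitted task %d (total %llu of %llu)\n", i,
		       (unsigned long long)b.total_bw,
		       (unsigned long long)b.bw);
	}
	printf("task %d rejected: bandwidth cap reached\n", i);
	return 0;
}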
@@ -364,6 +441,41 @@ struct rt_rq {
364#endif 441#endif
365}; 442};
366 443
444/* Deadline class' related fields in a runqueue */
445struct dl_rq {
446 /* runqueue is an rbtree, ordered by deadline */
447 struct rb_root rb_root;
448 struct rb_node *rb_leftmost;
449
450 unsigned long dl_nr_running;
451
452#ifdef CONFIG_SMP
453 /*
454 * Deadline values of the currently executing and the
455 * earliest ready task on this rq. Caching these facilitates
456	 * the decision whether or not a ready but not running task
457 * should migrate somewhere else.
458 */
459 struct {
460 u64 curr;
461 u64 next;
462 } earliest_dl;
463
464 unsigned long dl_nr_migratory;
465 int overloaded;
466
467 /*
468 * Tasks on this rq that can be pushed away. They are kept in
469 * an rb-tree, ordered by tasks' deadlines, with caching
470 * of the leftmost (earliest deadline) element.
471 */
472 struct rb_root pushable_dl_tasks_root;
473 struct rb_node *pushable_dl_tasks_leftmost;
474#else
475 struct dl_bw dl_bw;
476#endif
477};
478
367#ifdef CONFIG_SMP 479#ifdef CONFIG_SMP
368 480
369/* 481/*
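
The dl_rq above is, at its core, an EDF ready-queue: entities ordered by absolute deadline, with the leftmost (earliest) node cached so the next task to run is found in O(1). The toy below mirrors that ordering with a sorted array instead of an rb-tree; the task names and deadline values are made up for illustration:

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

struct toy_dl_task {
	const char *name;
	uint64_t deadline;	/* absolute deadline, like dl_se->deadline */
};

/* Earliest absolute deadline first; same wrap-safe ordering as dl_time_before(). */
static int by_deadline(const void *pa, const void *pb)
{
	const struct toy_dl_task *a = pa, *b = pb;

	if (a->deadline == b->deadline)
		return 0;
	return (int64_t)(a->deadline - b->deadline) < 0 ? -1 : 1;
}

int main(void)
{
	struct toy_dl_task rq[] = {
		{ "video",  30000000 },
		{ "audio",  10000000 },
		{ "logger", 90000000 },
	};
	size_t n = sizeof(rq) / sizeof(rq[0]);

	qsort(rq, n, sizeof(rq[0]), by_deadline);

	/* rq[0] plays the role of the cached leftmost node. */
	printf("pick next: %s (deadline %llu)\n",
	       rq[0].name, (unsigned long long)rq[0].deadline);
	return 0;
}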
@@ -382,6 +494,15 @@ struct root_domain {
382 cpumask_var_t online; 494 cpumask_var_t online;
383 495
384 /* 496 /*
497	 * The bit corresponding to a CPU gets set here if that CPU has more
498	 * than one runnable -deadline task (as is done below for RT tasks).
499 */
500 cpumask_var_t dlo_mask;
501 atomic_t dlo_count;
502 struct dl_bw dl_bw;
503 struct cpudl cpudl;
504
505 /*
385 * The "RT overload" flag: it gets set if a CPU has more than 506 * The "RT overload" flag: it gets set if a CPU has more than
386 * one runnable RT task. 507 * one runnable RT task.
387 */ 508 */
@@ -432,6 +553,7 @@ struct rq {
432 553
433 struct cfs_rq cfs; 554 struct cfs_rq cfs;
434 struct rt_rq rt; 555 struct rt_rq rt;
556 struct dl_rq dl;
435 557
436#ifdef CONFIG_FAIR_GROUP_SCHED 558#ifdef CONFIG_FAIR_GROUP_SCHED
437 /* list of leaf cfs_rq on this cpu: */ 559 /* list of leaf cfs_rq on this cpu: */
@@ -827,8 +949,6 @@ static inline u64 global_rt_runtime(void)
827 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 949 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
828} 950}
829 951
830
831
832static inline int task_current(struct rq *rq, struct task_struct *p) 952static inline int task_current(struct rq *rq, struct task_struct *p)
833{ 953{
834 return rq->curr == p; 954 return rq->curr == p;
@@ -988,6 +1108,7 @@ static const u32 prio_to_wmult[40] = {
988#else 1108#else
989#define ENQUEUE_WAKING 0 1109#define ENQUEUE_WAKING 0
990#endif 1110#endif
1111#define ENQUEUE_REPLENISH 8
991 1112
992#define DEQUEUE_SLEEP 1 1113#define DEQUEUE_SLEEP 1
993 1114
@@ -1023,6 +1144,7 @@ struct sched_class {
1023 void (*set_curr_task) (struct rq *rq); 1144 void (*set_curr_task) (struct rq *rq);
1024 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); 1145 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
1025 void (*task_fork) (struct task_struct *p); 1146 void (*task_fork) (struct task_struct *p);
1147 void (*task_dead) (struct task_struct *p);
1026 1148
1027 void (*switched_from) (struct rq *this_rq, struct task_struct *task); 1149 void (*switched_from) (struct rq *this_rq, struct task_struct *task);
1028 void (*switched_to) (struct rq *this_rq, struct task_struct *task); 1150 void (*switched_to) (struct rq *this_rq, struct task_struct *task);
@@ -1042,6 +1164,7 @@ struct sched_class {
1042 for (class = sched_class_highest; class; class = class->next) 1164 for (class = sched_class_highest; class; class = class->next)
1043 1165
1044extern const struct sched_class stop_sched_class; 1166extern const struct sched_class stop_sched_class;
1167extern const struct sched_class dl_sched_class;
1045extern const struct sched_class rt_sched_class; 1168extern const struct sched_class rt_sched_class;
1046extern const struct sched_class fair_sched_class; 1169extern const struct sched_class fair_sched_class;
1047extern const struct sched_class idle_sched_class; 1170extern const struct sched_class idle_sched_class;
@@ -1051,7 +1174,7 @@ extern const struct sched_class idle_sched_class;
1051 1174
1052extern void update_group_power(struct sched_domain *sd, int cpu); 1175extern void update_group_power(struct sched_domain *sd, int cpu);
1053 1176
1054extern void trigger_load_balance(struct rq *rq, int cpu); 1177extern void trigger_load_balance(struct rq *rq);
1055extern void idle_balance(int this_cpu, struct rq *this_rq); 1178extern void idle_balance(int this_cpu, struct rq *this_rq);
1056 1179
1057extern void idle_enter_fair(struct rq *this_rq); 1180extern void idle_enter_fair(struct rq *this_rq);
@@ -1068,8 +1191,11 @@ static inline void idle_balance(int cpu, struct rq *rq)
1068extern void sysrq_sched_debug_show(void); 1191extern void sysrq_sched_debug_show(void);
1069extern void sched_init_granularity(void); 1192extern void sched_init_granularity(void);
1070extern void update_max_interval(void); 1193extern void update_max_interval(void);
1194
1195extern void init_sched_dl_class(void);
1071extern void init_sched_rt_class(void); 1196extern void init_sched_rt_class(void);
1072extern void init_sched_fair_class(void); 1197extern void init_sched_fair_class(void);
1198extern void init_sched_dl_class(void);
1073 1199
1074extern void resched_task(struct task_struct *p); 1200extern void resched_task(struct task_struct *p);
1075extern void resched_cpu(int cpu); 1201extern void resched_cpu(int cpu);
@@ -1077,6 +1203,12 @@ extern void resched_cpu(int cpu);
1077extern struct rt_bandwidth def_rt_bandwidth; 1203extern struct rt_bandwidth def_rt_bandwidth;
1078extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); 1204extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
1079 1205
1206extern struct dl_bandwidth def_dl_bandwidth;
1207extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
1208extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
1209
1210unsigned long to_ratio(u64 period, u64 runtime);
1211
1080extern void update_idle_cpu_load(struct rq *this_rq); 1212extern void update_idle_cpu_load(struct rq *this_rq);
1081 1213
1082extern void init_task_runnable_average(struct task_struct *p); 1214extern void init_task_runnable_average(struct task_struct *p);
@@ -1353,6 +1485,7 @@ extern void print_rt_stats(struct seq_file *m, int cpu);
1353 1485
1354extern void init_cfs_rq(struct cfs_rq *cfs_rq); 1486extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1355extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq); 1487extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
1488extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq);
1356 1489
1357extern void cfs_bandwidth_usage_inc(void); 1490extern void cfs_bandwidth_usage_inc(void);
1358extern void cfs_bandwidth_usage_dec(void); 1491extern void cfs_bandwidth_usage_dec(void);
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 47197de8abd9..fdb6bb0b3356 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -103,7 +103,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task)
103 * Simple, special scheduling class for the per-CPU stop tasks: 103 * Simple, special scheduling class for the per-CPU stop tasks:
104 */ 104 */
105const struct sched_class stop_sched_class = { 105const struct sched_class stop_sched_class = {
106 .next = &rt_sched_class, 106 .next = &dl_sched_class,
107 107
108 .enqueue_task = enqueue_task_stop, 108 .enqueue_task = enqueue_task_stop,
109 .dequeue_task = dequeue_task_stop, 109 .dequeue_task = dequeue_task_stop,
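
With stop_sched_class now chaining to dl_sched_class, the class list becomes stop -> deadline -> rt -> fair -> idle, i.e. -deadline tasks rank above everything except the per-CPU stop tasks, since the core scheduler consults the classes in list order. A toy userspace model of that chain (not kernel code; the ordering is the only thing it demonstrates):

#include <stdio.h>
#include <stddef.h>

/* Mirrors the .next links wired up by this series. */
struct toy_class {
	const char *name;
	const struct toy_class *next;
};

static const struct toy_class idle_class = { "idle",     NULL };
static const struct toy_class fair_class = { "fair",     &idle_class };
static const struct toy_class rt_class   = { "rt",       &fair_class };
static const struct toy_class dl_class   = { "deadline", &rt_class };
static const struct toy_class stop_class = { "stop",     &dl_class };

int main(void)
{
	const struct toy_class *c;

	/* The scheduler picks the first class in this order that has a runnable task. */
	for (c = &stop_class; c; c = c->next)
		printf("%s%s", c->name, c->next ? " -> " : "\n");
	return 0;
}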