Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/auto_group.c |   3
-rw-r--r--  kernel/sched/core.c       | 159
-rw-r--r--  kernel/sched/cpupri.c     |   2
-rw-r--r--  kernel/sched/cputime.c    | 314
-rw-r--r--  kernel/sched/debug.c      | 101
-rw-r--r--  kernel/sched/fair.c       |  29
-rw-r--r--  kernel/sched/rt.c         |  28
-rw-r--r--  kernel/sched/sched.h      |   2
-rw-r--r--  kernel/sched/stats.c      |  79
9 files changed, 552 insertions, 165 deletions
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
index 0984a21076a3..64de5f8b0c9e 100644
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@@ -35,6 +35,7 @@ static inline void autogroup_destroy(struct kref *kref)
 	ag->tg->rt_se = NULL;
 	ag->tg->rt_rq = NULL;
 #endif
+	sched_offline_group(ag->tg);
 	sched_destroy_group(ag->tg);
 }
 
@@ -76,6 +77,8 @@ static inline struct autogroup *autogroup_create(void)
 	if (IS_ERR(tg))
 		goto out_free;
 
+	sched_online_group(tg, &root_task_group);
+
 	kref_init(&ag->kref);
 	init_rwsem(&ag->lock);
 	ag->id = atomic_inc_return(&autogroup_seq_nr);
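
The autogroup change above splits task-group setup into a separate allocation step and a publication step. A minimal kernel-context sketch of the intended call order follows; example_group_lifecycle() is a hypothetical caller and not part of this patch set, only the four sched_*_group() calls are from the diff.

/* Hypothetical caller; illustrates create -> online -> offline -> destroy. */
static int example_group_lifecycle(void)
{
	struct task_group *tg;

	tg = sched_create_group(&root_task_group);	/* allocate only */
	if (IS_ERR(tg))
		return PTR_ERR(tg);

	sched_online_group(tg, &root_task_group);	/* publish on the task_groups list */

	/* ... the group is now visible to the scheduler ... */

	sched_offline_group(tg);	/* unlink; stops new references */
	sched_destroy_group(tg);	/* free via RCU once readers are done */
	return 0;
}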
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d6fdcdcbb9b1..7f12624a393c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -83,7 +83,7 @@
 #endif
 
 #include "sched.h"
-#include "../workqueue_sched.h"
+#include "../workqueue_internal.h"
 #include "../smpboot.h"
 
 #define CREATE_TRACE_POINTS
@@ -1132,18 +1132,28 @@ EXPORT_SYMBOL_GPL(kick_process);
  */
 static int select_fallback_rq(int cpu, struct task_struct *p)
 {
-	const struct cpumask *nodemask = cpumask_of_node(cpu_to_node(cpu));
+	int nid = cpu_to_node(cpu);
+	const struct cpumask *nodemask = NULL;
 	enum { cpuset, possible, fail } state = cpuset;
 	int dest_cpu;
 
-	/* Look for allowed, online CPU in same node. */
-	for_each_cpu(dest_cpu, nodemask) {
-		if (!cpu_online(dest_cpu))
-			continue;
-		if (!cpu_active(dest_cpu))
-			continue;
-		if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
-			return dest_cpu;
+	/*
+	 * If the node that the cpu is on has been offlined, cpu_to_node()
+	 * will return -1. There is no cpu on the node, and we should
+	 * select the cpu on the other node.
+	 */
+	if (nid != -1) {
+		nodemask = cpumask_of_node(nid);
+
+		/* Look for allowed, online CPU in same node. */
+		for_each_cpu(dest_cpu, nodemask) {
+			if (!cpu_online(dest_cpu))
+				continue;
+			if (!cpu_active(dest_cpu))
+				continue;
+			if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
+				return dest_cpu;
+		}
 	}
 
 	for (;;) {
@@ -1523,7 +1533,8 @@ out:
  */
 int wake_up_process(struct task_struct *p)
 {
-	return try_to_wake_up(p, TASK_ALL, 0);
+	WARN_ON(task_is_stopped_or_traced(p));
+	return try_to_wake_up(p, TASK_NORMAL, 0);
 }
 EXPORT_SYMBOL(wake_up_process);
 
@@ -1741,9 +1752,8 @@ EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
 static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
 {
 	struct preempt_notifier *notifier;
-	struct hlist_node *node;
 
-	hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
 		notifier->ops->sched_in(notifier, raw_smp_processor_id());
 }
 
@@ -1752,9 +1762,8 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
 			       struct task_struct *next)
 {
 	struct preempt_notifier *notifier;
-	struct hlist_node *node;
 
-	hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link)
+	hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
 		notifier->ops->sched_out(notifier, next);
 }
 
@@ -1968,11 +1977,10 @@ context_switch(struct rq *rq, struct task_struct *prev,
 }
 
 /*
- * nr_running, nr_uninterruptible and nr_context_switches:
+ * nr_running and nr_context_switches:
  *
  * externally visible scheduler statistics: current number of runnable
- * threads, current number of uninterruptible-sleeping threads, total
- * number of context switches performed since bootup.
+ * threads, total number of context switches performed since bootup.
  */
 unsigned long nr_running(void)
 {
@@ -1984,23 +1992,6 @@ unsigned long nr_running(void)
 	return sum;
 }
 
-unsigned long nr_uninterruptible(void)
-{
-	unsigned long i, sum = 0;
-
-	for_each_possible_cpu(i)
-		sum += cpu_rq(i)->nr_uninterruptible;
-
-	/*
-	 * Since we read the counters lockless, it might be slightly
-	 * inaccurate. Do not allow it to go below zero though:
-	 */
-	if (unlikely((long)sum < 0))
-		sum = 0;
-
-	return sum;
-}
-
 unsigned long long nr_context_switches(void)
 {
 	int i;
@@ -2785,7 +2776,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
 	if (irqs_disabled())
 		print_irqtrace_events(prev);
 	dump_stack();
-	add_taint(TAINT_WARN);
+	add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
 }
 
 /*
@@ -4410,20 +4401,32 @@ EXPORT_SYMBOL(yield);
  * It's the caller's job to ensure that the target task struct
  * can't go away on us before we can do any checks.
  *
- * Returns true if we indeed boosted the target task.
+ * Returns:
+ *	true (>0) if we indeed boosted the target task.
+ *	false (0) if we failed to boost the target.
+ *	-ESRCH if there's no task to yield to.
  */
 bool __sched yield_to(struct task_struct *p, bool preempt)
 {
 	struct task_struct *curr = current;
 	struct rq *rq, *p_rq;
 	unsigned long flags;
-	bool yielded = 0;
+	int yielded = 0;
 
 	local_irq_save(flags);
 	rq = this_rq();
 
 again:
 	p_rq = task_rq(p);
+	/*
+	 * If we're the only runnable task on the rq and target rq also
+	 * has only one task, there's absolutely no point in yielding.
+	 */
+	if (rq->nr_running == 1 && p_rq->nr_running == 1) {
+		yielded = -ESRCH;
+		goto out_irq;
+	}
+
 	double_rq_lock(rq, p_rq);
 	while (task_rq(p) != p_rq) {
 		double_rq_unlock(rq, p_rq);
@@ -4431,13 +4434,13 @@ again:
 	}
 
 	if (!curr->sched_class->yield_to_task)
-		goto out;
+		goto out_unlock;
 
 	if (curr->sched_class != p->sched_class)
-		goto out;
+		goto out_unlock;
 
 	if (task_running(p_rq, p) || p->state)
-		goto out;
+		goto out_unlock;
 
 	yielded = curr->sched_class->yield_to_task(rq, p, preempt);
 	if (yielded) {
@@ -4450,11 +4453,12 @@ again:
 			resched_task(p_rq->curr);
 	}
 
-out:
+out_unlock:
 	double_rq_unlock(rq, p_rq);
+out_irq:
 	local_irq_restore(flags);
 
-	if (yielded)
+	if (yielded > 0)
 		schedule();
 
 	return yielded;
@@ -4713,6 +4717,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	 */
 	idle->sched_class = &idle_sched_class;
 	ftrace_graph_init_idle_task(idle, cpu);
+	vtime_init_idle(idle);
 #if defined(CONFIG_SMP)
 	sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
 #endif
@@ -7206,7 +7211,6 @@ static void free_sched_group(struct task_group *tg)
 struct task_group *sched_create_group(struct task_group *parent)
 {
 	struct task_group *tg;
-	unsigned long flags;
 
 	tg = kzalloc(sizeof(*tg), GFP_KERNEL);
 	if (!tg)
@@ -7218,6 +7222,17 @@ struct task_group *sched_create_group(struct task_group *parent)
 	if (!alloc_rt_sched_group(tg, parent))
 		goto err;
 
+	return tg;
+
+err:
+	free_sched_group(tg);
+	return ERR_PTR(-ENOMEM);
+}
+
+void sched_online_group(struct task_group *tg, struct task_group *parent)
+{
+	unsigned long flags;
+
 	spin_lock_irqsave(&task_group_lock, flags);
 	list_add_rcu(&tg->list, &task_groups);
 
@@ -7227,12 +7242,6 @@ struct task_group *sched_create_group(struct task_group *parent)
 	INIT_LIST_HEAD(&tg->children);
 	list_add_rcu(&tg->siblings, &parent->children);
 	spin_unlock_irqrestore(&task_group_lock, flags);
-
-	return tg;
-
-err:
-	free_sched_group(tg);
-	return ERR_PTR(-ENOMEM);
 }
 
 /* rcu callback to free various structures associated with a task group */
@@ -7245,6 +7254,12 @@ static void free_sched_group_rcu(struct rcu_head *rhp)
 /* Destroy runqueue etc associated with a task group */
 void sched_destroy_group(struct task_group *tg)
 {
+	/* wait for possible concurrent references to cfs_rqs complete */
+	call_rcu(&tg->rcu, free_sched_group_rcu);
+}
+
+void sched_offline_group(struct task_group *tg)
+{
 	unsigned long flags;
 	int i;
 
@@ -7256,9 +7271,6 @@ void sched_destroy_group(struct task_group *tg)
 	list_del_rcu(&tg->list);
 	list_del_rcu(&tg->siblings);
 	spin_unlock_irqrestore(&task_group_lock, flags);
-
-	/* wait for possible concurrent references to cfs_rqs complete */
-	call_rcu(&tg->rcu, free_sched_group_rcu);
 }
 
 /* change task's runqueue when it moves between groups.
@@ -7554,6 +7566,25 @@ static int sched_rt_global_constraints(void)
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
+int sched_rr_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp,
+		loff_t *ppos)
+{
+	int ret;
+	static DEFINE_MUTEX(mutex);
+
+	mutex_lock(&mutex);
+	ret = proc_dointvec(table, write, buffer, lenp, ppos);
+	/* make sure that internally we keep jiffies */
+	/* also, writing zero resets timeslice to default */
+	if (!ret && write) {
+		sched_rr_timeslice = sched_rr_timeslice <= 0 ?
+			RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
+	}
+	mutex_unlock(&mutex);
+	return ret;
+}
+
 int sched_rt_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
@@ -7610,6 +7641,19 @@ static struct cgroup_subsys_state *cpu_cgroup_css_alloc(struct cgroup *cgrp)
 	return &tg->css;
 }
 
+static int cpu_cgroup_css_online(struct cgroup *cgrp)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+	struct task_group *parent;
+
+	if (!cgrp->parent)
+		return 0;
+
+	parent = cgroup_tg(cgrp->parent);
+	sched_online_group(tg, parent);
+	return 0;
+}
+
 static void cpu_cgroup_css_free(struct cgroup *cgrp)
 {
 	struct task_group *tg = cgroup_tg(cgrp);
@@ -7617,6 +7661,13 @@ static void cpu_cgroup_css_free(struct cgroup *cgrp)
 	sched_destroy_group(tg);
 }
 
+static void cpu_cgroup_css_offline(struct cgroup *cgrp)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+
+	sched_offline_group(tg);
+}
+
 static int cpu_cgroup_can_attach(struct cgroup *cgrp,
 				 struct cgroup_taskset *tset)
 {
@@ -7972,6 +8023,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
 	.name = "cpu",
 	.css_alloc = cpu_cgroup_css_alloc,
 	.css_free = cpu_cgroup_css_free,
+	.css_online = cpu_cgroup_css_online,
+	.css_offline = cpu_cgroup_css_offline,
 	.can_attach = cpu_cgroup_can_attach,
 	.attach = cpu_cgroup_attach,
 	.exit = cpu_cgroup_exit,
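
The new sched_rr_handler() above takes the SCHED_RR timeslice from userspace in milliseconds, stores it internally in jiffies, and treats a non-positive write as a request to restore the RR_TIMESLICE default. A small self-contained userspace sketch of that value handling follows; HZ is an assumed tick rate and example_msecs_to_jiffies() deliberately simplifies the kernel's rounding.

#include <stdio.h>

#define HZ 250				/* assumed tick rate for the example */
#define RR_TIMESLICE (100 * HZ / 1000)	/* default timeslice: 100 ms in jiffies */

/* Simplified conversion; the kernel's msecs_to_jiffies() also rounds up. */
static long example_msecs_to_jiffies(long ms)
{
	return ms * HZ / 1000;
}

/* Mirrors the decision sched_rr_handler() makes after proc_dointvec(). */
static long apply_rr_write(long written_ms)
{
	return written_ms <= 0 ? RR_TIMESLICE : example_msecs_to_jiffies(written_ms);
}

int main(void)
{
	printf("write 30 ms -> %ld jiffies\n", apply_rr_write(30));
	printf("write 0     -> %ld jiffies (default restored)\n", apply_rr_write(0));
	return 0;
}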
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 23aa789c53ee..1095e878a46f 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -28,6 +28,8 @@
  */
 
 #include <linux/gfp.h>
+#include <linux/sched.h>
+#include <linux/sched/rt.h>
 #include "cpupri.h"
 
 /* Convert between a 140 based task->prio, and our 102 based cpupri */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 293b202fcf79..ed12cbb135f4 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -3,6 +3,7 @@
 #include <linux/tsacct_kern.h>
 #include <linux/kernel_stat.h>
 #include <linux/static_key.h>
+#include <linux/context_tracking.h>
 #include "sched.h"
 
 
@@ -163,7 +164,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
 	task_group_account_field(p, index, (__force u64) cputime);
 
 	/* Account for user time used */
-	acct_update_integrals(p);
+	acct_account_cputime(p);
 }
 
 /*
@@ -213,7 +214,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
 	task_group_account_field(p, index, (__force u64) cputime);
 
 	/* Account for system time used */
-	acct_update_integrals(p);
+	acct_account_cputime(p);
 }
 
 /*
@@ -295,6 +296,7 @@ static __always_inline bool steal_account_process_tick(void)
 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 {
 	struct signal_struct *sig = tsk->signal;
+	cputime_t utime, stime;
 	struct task_struct *t;
 
 	times->utime = sig->utime;
@@ -308,16 +310,15 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 
 	t = tsk;
 	do {
-		times->utime += t->utime;
-		times->stime += t->stime;
+		task_cputime(tsk, &utime, &stime);
+		times->utime += utime;
+		times->stime += stime;
 		times->sum_exec_runtime += task_sched_runtime(t);
 	} while_each_thread(tsk, t);
 out:
 	rcu_read_unlock();
 }
 
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
-
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 /*
  * Account a tick to a process and cpustat
@@ -382,11 +383,12 @@ static void irqtime_account_idle_ticks(int ticks)
 	irqtime_account_process_tick(current, 0, rq);
 }
 #else /* CONFIG_IRQ_TIME_ACCOUNTING */
-static void irqtime_account_idle_ticks(int ticks) {}
-static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+static inline void irqtime_account_idle_ticks(int ticks) {}
+static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
 						struct rq *rq) {}
 #endif /* CONFIG_IRQ_TIME_ACCOUNTING */
 
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 /*
  * Account a single tick of cpu time.
  * @p: the process that the cpu time gets accounted to
@@ -397,6 +399,9 @@ void account_process_tick(struct task_struct *p, int user_tick)
 	cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
 	struct rq *rq = this_rq();
 
+	if (vtime_accounting_enabled())
+		return;
+
 	if (sched_clock_irqtime) {
 		irqtime_account_process_tick(p, user_tick, rq);
 		return;
@@ -438,8 +443,7 @@ void account_idle_ticks(unsigned long ticks)
 
 	account_idle_time(jiffies_to_cputime(ticks));
 }
-
-#endif
+#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
 
 /*
  * Use precise platform statistics if available:
@@ -461,25 +465,20 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
 	*st = cputime.stime;
 }
 
-void vtime_account_system_irqsafe(struct task_struct *tsk)
-{
-	unsigned long flags;
-
-	local_irq_save(flags);
-	vtime_account_system(tsk);
-	local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(vtime_account_system_irqsafe);
-
 #ifndef __ARCH_HAS_VTIME_TASK_SWITCH
 void vtime_task_switch(struct task_struct *prev)
 {
+	if (!vtime_accounting_enabled())
+		return;
+
 	if (is_idle_task(prev))
 		vtime_account_idle(prev);
 	else
 		vtime_account_system(prev);
 
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
 	vtime_account_user(prev);
+#endif
 	arch_vtime_task_switch(prev);
 }
 #endif
@@ -493,27 +492,40 @@ void vtime_task_switch(struct task_struct *prev)
  * vtime_account().
  */
 #ifndef __ARCH_HAS_VTIME_ACCOUNT
-void vtime_account(struct task_struct *tsk)
+void vtime_account_irq_enter(struct task_struct *tsk)
 {
-	if (in_interrupt() || !is_idle_task(tsk))
-		vtime_account_system(tsk);
-	else
-		vtime_account_idle(tsk);
+	if (!vtime_accounting_enabled())
+		return;
+
+	if (!in_interrupt()) {
+		/*
+		 * If we interrupted user, context_tracking_in_user()
+		 * is 1 because the context tracking don't hook
+		 * on irq entry/exit. This way we know if
+		 * we need to flush user time on kernel entry.
+		 */
+		if (context_tracking_in_user()) {
+			vtime_account_user(tsk);
+			return;
+		}
+
+		if (is_idle_task(tsk)) {
+			vtime_account_idle(tsk);
+			return;
+		}
+	}
+	vtime_account_system(tsk);
 }
-EXPORT_SYMBOL_GPL(vtime_account);
+EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
 #endif /* __ARCH_HAS_VTIME_ACCOUNT */
 
-#else
-
-#ifndef nsecs_to_cputime
-# define nsecs_to_cputime(__nsecs)	nsecs_to_jiffies(__nsecs)
-#endif
+#else /* !CONFIG_VIRT_CPU_ACCOUNTING */
 
-static cputime_t scale_utime(cputime_t utime, cputime_t rtime, cputime_t total)
+static cputime_t scale_stime(cputime_t stime, cputime_t rtime, cputime_t total)
 {
 	u64 temp = (__force u64) rtime;
 
-	temp *= (__force u64) utime;
+	temp *= (__force u64) stime;
 
 	if (sizeof(cputime_t) == 4)
 		temp = div_u64(temp, (__force u32) total);
@@ -531,10 +543,10 @@ static void cputime_adjust(struct task_cputime *curr,
 			   struct cputime *prev,
 			   cputime_t *ut, cputime_t *st)
 {
-	cputime_t rtime, utime, total;
+	cputime_t rtime, stime, total;
 
-	utime = curr->utime;
-	total = utime + curr->stime;
+	stime = curr->stime;
+	total = stime + curr->utime;
 
 	/*
 	 * Tick based cputime accounting depend on random scheduling
@@ -549,17 +561,17 @@ static void cputime_adjust(struct task_cputime *curr,
 	rtime = nsecs_to_cputime(curr->sum_exec_runtime);
 
 	if (total)
-		utime = scale_utime(utime, rtime, total);
+		stime = scale_stime(stime, rtime, total);
 	else
-		utime = rtime;
+		stime = rtime;
 
 	/*
 	 * If the tick based count grows faster than the scheduler one,
 	 * the result of the scaling may go backward.
 	 * Let's enforce monotonicity.
 	 */
-	prev->utime = max(prev->utime, utime);
-	prev->stime = max(prev->stime, rtime - prev->utime);
+	prev->stime = max(prev->stime, stime);
+	prev->utime = max(prev->utime, rtime - prev->stime);
 
 	*ut = prev->utime;
 	*st = prev->stime;
@@ -568,11 +580,10 @@ static void cputime_adjust(struct task_cputime *curr,
 void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
 	struct task_cputime cputime = {
-		.utime = p->utime,
-		.stime = p->stime,
 		.sum_exec_runtime = p->se.sum_exec_runtime,
 	};
 
+	task_cputime(p, &cputime.utime, &cputime.stime);
 	cputime_adjust(&cputime, &p->prev_cputime, ut, st);
 }
 
@@ -586,4 +597,221 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
 	thread_group_cputime(p, &cputime);
 	cputime_adjust(&cputime, &p->signal->prev_cputime, ut, st);
 }
-#endif
+#endif /* !CONFIG_VIRT_CPU_ACCOUNTING */
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
+static unsigned long long vtime_delta(struct task_struct *tsk)
+{
+	unsigned long long clock;
+
+	clock = local_clock();
+	if (clock < tsk->vtime_snap)
+		return 0;
+
+	return clock - tsk->vtime_snap;
+}
+
+static cputime_t get_vtime_delta(struct task_struct *tsk)
+{
+	unsigned long long delta = vtime_delta(tsk);
+
+	WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING);
+	tsk->vtime_snap += delta;
+
+	/* CHECKME: always safe to convert nsecs to cputime? */
+	return nsecs_to_cputime(delta);
+}
+
+static void __vtime_account_system(struct task_struct *tsk)
+{
+	cputime_t delta_cpu = get_vtime_delta(tsk);
+
+	account_system_time(tsk, irq_count(), delta_cpu, cputime_to_scaled(delta_cpu));
+}
+
+void vtime_account_system(struct task_struct *tsk)
+{
+	if (!vtime_accounting_enabled())
+		return;
+
+	write_seqlock(&tsk->vtime_seqlock);
+	__vtime_account_system(tsk);
+	write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_account_irq_exit(struct task_struct *tsk)
+{
+	if (!vtime_accounting_enabled())
+		return;
+
+	write_seqlock(&tsk->vtime_seqlock);
+	if (context_tracking_in_user())
+		tsk->vtime_snap_whence = VTIME_USER;
+	__vtime_account_system(tsk);
+	write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_account_user(struct task_struct *tsk)
+{
+	cputime_t delta_cpu;
+
+	if (!vtime_accounting_enabled())
+		return;
+
+	delta_cpu = get_vtime_delta(tsk);
+
+	write_seqlock(&tsk->vtime_seqlock);
+	tsk->vtime_snap_whence = VTIME_SYS;
+	account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
+	write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_user_enter(struct task_struct *tsk)
+{
+	if (!vtime_accounting_enabled())
+		return;
+
+	write_seqlock(&tsk->vtime_seqlock);
+	tsk->vtime_snap_whence = VTIME_USER;
+	__vtime_account_system(tsk);
+	write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_guest_enter(struct task_struct *tsk)
+{
+	write_seqlock(&tsk->vtime_seqlock);
+	__vtime_account_system(tsk);
+	current->flags |= PF_VCPU;
+	write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_guest_exit(struct task_struct *tsk)
+{
+	write_seqlock(&tsk->vtime_seqlock);
+	__vtime_account_system(tsk);
+	current->flags &= ~PF_VCPU;
+	write_sequnlock(&tsk->vtime_seqlock);
+}
+
+void vtime_account_idle(struct task_struct *tsk)
+{
+	cputime_t delta_cpu = get_vtime_delta(tsk);
+
+	account_idle_time(delta_cpu);
+}
+
+bool vtime_accounting_enabled(void)
+{
+	return context_tracking_active();
+}
+
+void arch_vtime_task_switch(struct task_struct *prev)
+{
+	write_seqlock(&prev->vtime_seqlock);
+	prev->vtime_snap_whence = VTIME_SLEEPING;
+	write_sequnlock(&prev->vtime_seqlock);
+
+	write_seqlock(&current->vtime_seqlock);
+	current->vtime_snap_whence = VTIME_SYS;
+	current->vtime_snap = sched_clock();
+	write_sequnlock(&current->vtime_seqlock);
+}
+
+void vtime_init_idle(struct task_struct *t)
+{
+	unsigned long flags;
+
+	write_seqlock_irqsave(&t->vtime_seqlock, flags);
+	t->vtime_snap_whence = VTIME_SYS;
+	t->vtime_snap = sched_clock();
+	write_sequnlock_irqrestore(&t->vtime_seqlock, flags);
+}
+
+cputime_t task_gtime(struct task_struct *t)
+{
+	unsigned int seq;
+	cputime_t gtime;
+
+	do {
+		seq = read_seqbegin(&t->vtime_seqlock);
+
+		gtime = t->gtime;
+		if (t->flags & PF_VCPU)
+			gtime += vtime_delta(t);
+
+	} while (read_seqretry(&t->vtime_seqlock, seq));
+
+	return gtime;
+}
+
+/*
+ * Fetch cputime raw values from fields of task_struct and
+ * add up the pending nohz execution time since the last
+ * cputime snapshot.
+ */
+static void
+fetch_task_cputime(struct task_struct *t,
+		   cputime_t *u_dst, cputime_t *s_dst,
+		   cputime_t *u_src, cputime_t *s_src,
+		   cputime_t *udelta, cputime_t *sdelta)
+{
+	unsigned int seq;
+	unsigned long long delta;
+
+	do {
+		*udelta = 0;
+		*sdelta = 0;
+
+		seq = read_seqbegin(&t->vtime_seqlock);
+
+		if (u_dst)
+			*u_dst = *u_src;
+		if (s_dst)
+			*s_dst = *s_src;
+
+		/* Task is sleeping, nothing to add */
+		if (t->vtime_snap_whence == VTIME_SLEEPING ||
+		    is_idle_task(t))
+			continue;
+
+		delta = vtime_delta(t);
+
+		/*
+		 * Task runs either in user or kernel space, add pending nohz time to
+		 * the right place.
+		 */
+		if (t->vtime_snap_whence == VTIME_USER || t->flags & PF_VCPU) {
+			*udelta = delta;
+		} else {
+			if (t->vtime_snap_whence == VTIME_SYS)
+				*sdelta = delta;
+		}
+	} while (read_seqretry(&t->vtime_seqlock, seq));
+}
+
+
+void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime)
+{
+	cputime_t udelta, sdelta;
+
+	fetch_task_cputime(t, utime, stime, &t->utime,
+			   &t->stime, &udelta, &sdelta);
+	if (utime)
+		*utime += udelta;
+	if (stime)
+		*stime += sdelta;
+}
+
+void task_cputime_scaled(struct task_struct *t,
+			 cputime_t *utimescaled, cputime_t *stimescaled)
+{
+	cputime_t udelta, sdelta;
+
+	fetch_task_cputime(t, utimescaled, stimescaled,
+			   &t->utimescaled, &t->stimescaled, &udelta, &sdelta);
+	if (utimescaled)
+		*utimescaled += cputime_to_scaled(udelta);
+	if (stimescaled)
+		*stimescaled += cputime_to_scaled(sdelta);
+}
+#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
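
The new task_gtime() and fetch_task_cputime() readers above snapshot the vtime state under a seqlock retry loop so they never observe a half-updated update from the writer side. A self-contained userspace sketch of that read-retry pattern follows; it is a simplified model using C11 atomics, not the kernel's seqlock implementation, and the struct and function names are illustrative.

/*
 * Writer bumps the sequence to odd, updates the data, bumps it back to
 * even; readers retry until they see a stable, even sequence value.
 */
#include <stdatomic.h>
#include <stdio.h>

struct vtime_sample {
	atomic_uint seq;            /* even = stable, odd = write in progress */
	unsigned long long utime;   /* pretend cputime values */
	unsigned long long stime;
};

static void writer_update(struct vtime_sample *s, unsigned long long u,
			  unsigned long long st)
{
	atomic_fetch_add_explicit(&s->seq, 1, memory_order_acquire); /* now odd */
	s->utime = u;
	s->stime = st;
	atomic_fetch_add_explicit(&s->seq, 1, memory_order_release); /* even again */
}

static void reader_snapshot(struct vtime_sample *s,
			    unsigned long long *u, unsigned long long *st)
{
	unsigned int seq;

	do {
		seq = atomic_load_explicit(&s->seq, memory_order_acquire);
		*u = s->utime;
		*st = s->stime;
	} while ((seq & 1) ||
		 seq != atomic_load_explicit(&s->seq, memory_order_acquire));
}

int main(void)
{
	struct vtime_sample s = { .seq = 0 };
	unsigned long long u, st;

	writer_update(&s, 100, 40);
	reader_snapshot(&s, &u, &st);
	printf("utime=%llu stime=%llu\n", u, st);
	return 0;
}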
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2cd3c1b4e582..75024a673520 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -110,13 +110,6 @@ static char *task_group_path(struct task_group *tg)
 	if (autogroup_path(tg, group_path, PATH_MAX))
 		return group_path;
 
-	/*
-	 * May be NULL if the underlying cgroup isn't fully-created yet
-	 */
-	if (!tg->css.cgroup) {
-		group_path[0] = '\0';
-		return group_path;
-	}
 	cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
 	return group_path;
 }
@@ -222,8 +215,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			cfs_rq->runnable_load_avg);
 	SEQ_printf(m, " .%-30s: %lld\n", "blocked_load_avg",
 			cfs_rq->blocked_load_avg);
-	SEQ_printf(m, " .%-30s: %ld\n", "tg_load_avg",
-			atomic64_read(&cfs_rq->tg->load_avg));
+	SEQ_printf(m, " .%-30s: %lld\n", "tg_load_avg",
+			(unsigned long long)atomic64_read(&cfs_rq->tg->load_avg));
 	SEQ_printf(m, " .%-30s: %lld\n", "tg_load_contrib",
 			cfs_rq->tg_load_contrib);
 	SEQ_printf(m, " .%-30s: %d\n", "tg_runnable_contrib",
@@ -269,11 +262,11 @@ static void print_cpu(struct seq_file *m, int cpu)
 	{
 		unsigned int freq = cpu_khz ? : 1;
 
-		SEQ_printf(m, "\ncpu#%d, %u.%03u MHz\n",
+		SEQ_printf(m, "cpu#%d, %u.%03u MHz\n",
 			   cpu, freq / 1000, (freq % 1000));
 	}
 #else
-	SEQ_printf(m, "\ncpu#%d\n", cpu);
+	SEQ_printf(m, "cpu#%d\n", cpu);
 #endif
 
 #define P(x) \
@@ -330,6 +323,7 @@ do { \
 	print_rq(m, rq, cpu);
 	rcu_read_unlock();
 	spin_unlock_irqrestore(&sched_debug_lock, flags);
+	SEQ_printf(m, "\n");
 }
 
 static const char *sched_tunable_scaling_names[] = {
@@ -338,11 +332,10 @@ static const char *sched_tunable_scaling_names[] = {
 	"linear"
 };
 
-static int sched_debug_show(struct seq_file *m, void *v)
+static void sched_debug_header(struct seq_file *m)
 {
 	u64 ktime, sched_clk, cpu_clk;
 	unsigned long flags;
-	int cpu;
 
 	local_irq_save(flags);
 	ktime = ktime_to_ns(ktime_get());
@@ -384,33 +377,101 @@ static int sched_debug_show(struct seq_file *m, void *v)
 #undef PN
 #undef P
 
-	SEQ_printf(m, " .%-40s: %d (%s)\n", "sysctl_sched_tunable_scaling",
+	SEQ_printf(m, " .%-40s: %d (%s)\n",
+		"sysctl_sched_tunable_scaling",
 		sysctl_sched_tunable_scaling,
 		sched_tunable_scaling_names[sysctl_sched_tunable_scaling]);
+	SEQ_printf(m, "\n");
+}
 
-	for_each_online_cpu(cpu)
-		print_cpu(m, cpu);
+static int sched_debug_show(struct seq_file *m, void *v)
+{
+	int cpu = (unsigned long)(v - 2);
 
-	SEQ_printf(m, "\n");
+	if (cpu != -1)
+		print_cpu(m, cpu);
+	else
+		sched_debug_header(m);
 
 	return 0;
 }
 
 void sysrq_sched_debug_show(void)
 {
-	sched_debug_show(NULL, NULL);
+	int cpu;
+
+	sched_debug_header(NULL);
+	for_each_online_cpu(cpu)
+		print_cpu(NULL, cpu);
+
+}
+
+/*
+ * This itererator needs some explanation.
+ * It returns 1 for the header position.
+ * This means 2 is cpu 0.
+ * In a hotplugged system some cpus, including cpu 0, may be missing so we have
+ * to use cpumask_* to iterate over the cpus.
+ */
+static void *sched_debug_start(struct seq_file *file, loff_t *offset)
+{
+	unsigned long n = *offset;
+
+	if (n == 0)
+		return (void *) 1;
+
+	n--;
+
+	if (n > 0)
+		n = cpumask_next(n - 1, cpu_online_mask);
+	else
+		n = cpumask_first(cpu_online_mask);
+
+	*offset = n + 1;
+
+	if (n < nr_cpu_ids)
+		return (void *)(unsigned long)(n + 2);
+	return NULL;
+}
+
+static void *sched_debug_next(struct seq_file *file, void *data, loff_t *offset)
+{
+	(*offset)++;
+	return sched_debug_start(file, offset);
+}
+
+static void sched_debug_stop(struct seq_file *file, void *data)
+{
+}
+
+static const struct seq_operations sched_debug_sops = {
+	.start = sched_debug_start,
+	.next = sched_debug_next,
+	.stop = sched_debug_stop,
+	.show = sched_debug_show,
+};
+
+static int sched_debug_release(struct inode *inode, struct file *file)
+{
+	seq_release(inode, file);
+
+	return 0;
 }
 
 static int sched_debug_open(struct inode *inode, struct file *filp)
 {
-	return single_open(filp, sched_debug_show, NULL);
+	int ret = 0;
+
+	ret = seq_open(filp, &sched_debug_sops);
+
+	return ret;
 }
 
 static const struct file_operations sched_debug_fops = {
 	.open = sched_debug_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
-	.release = single_release,
+	.release = sched_debug_release,
 };
 
 static int __init init_sched_debug_procfs(void)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5eea8707234a..7a33e5986fc5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1680,9 +1680,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 	}
 
 	/* ensure we never gain time by being placed backwards. */
-	vruntime = max_vruntime(se->vruntime, vruntime);
-
-	se->vruntime = vruntime;
+	se->vruntime = max_vruntime(se->vruntime, vruntime);
 }
 
 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
@@ -2663,7 +2661,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	hrtimer_cancel(&cfs_b->slack_timer);
 }
 
-static void unthrottle_offline_cfs_rqs(struct rq *rq)
+static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
 {
 	struct cfs_rq *cfs_rq;
 
@@ -3254,25 +3252,18 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
  */
 static int select_idle_sibling(struct task_struct *p, int target)
 {
-	int cpu = smp_processor_id();
-	int prev_cpu = task_cpu(p);
 	struct sched_domain *sd;
 	struct sched_group *sg;
-	int i;
+	int i = task_cpu(p);
 
-	/*
-	 * If the task is going to be woken-up on this cpu and if it is
-	 * already idle, then it is the right target.
-	 */
-	if (target == cpu && idle_cpu(cpu))
-		return cpu;
+	if (idle_cpu(target))
+		return target;
 
 	/*
-	 * If the task is going to be woken-up on the cpu where it previously
-	 * ran and if it is currently idle, then it the right target.
+	 * If the prevous cpu is cache affine and idle, don't be stupid.
 	 */
-	if (target == prev_cpu && idle_cpu(prev_cpu))
-		return prev_cpu;
+	if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
+		return i;
 
 	/*
 	 * Otherwise, iterate the domains and find an elegible idle cpu.
@@ -3286,7 +3277,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
 				goto next;
 
 			for_each_cpu(i, sched_group_cpus(sg)) {
-				if (!idle_cpu(i))
+				if (i == target || !idle_cpu(i))
 					goto next;
 			}
 
@@ -6101,7 +6092,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
 	 * idle runqueue:
 	 */
 	if (rq->cfs.load.weight)
-		rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
+		rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
 
 	return rr_interval;
 }
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 418feb01344e..127a2c4cf4ab 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -7,6 +7,8 @@
 
 #include <linux/slab.h>
 
+int sched_rr_timeslice = RR_TIMESLICE;
+
 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
 
 struct rt_bandwidth def_rt_bandwidth;
@@ -566,7 +568,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 static int do_balance_runtime(struct rt_rq *rt_rq)
 {
 	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
-	struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+	struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
 	int i, weight, more = 0;
 	u64 rt_period;
 
@@ -925,8 +927,8 @@ static void update_curr_rt(struct rq *rq)
 		return;
 
 	delta_exec = rq->clock_task - curr->se.exec_start;
-	if (unlikely((s64)delta_exec < 0))
-		delta_exec = 0;
+	if (unlikely((s64)delta_exec <= 0))
+		return;
 
 	schedstat_set(curr->se.statistics.exec_max,
 		      max(curr->se.statistics.exec_max, delta_exec));
@@ -1427,8 +1429,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
 {
 	if (!task_running(rq, p) &&
-	    (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
-	    (p->nr_cpus_allowed > 1))
+	    cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
 		return 1;
 	return 0;
 }
@@ -1889,8 +1890,11 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
 	 * we may need to handle the pulling of RT tasks
 	 * now.
 	 */
-	if (p->on_rq && !rq->rt.rt_nr_running)
-		pull_rt_task(rq);
+	if (!p->on_rq || rq->rt.rt_nr_running)
+		return;
+
+	if (pull_rt_task(rq))
+		resched_task(rq->curr);
 }
 
 void init_sched_rt_class(void)
@@ -1985,7 +1989,11 @@ static void watchdog(struct rq *rq, struct task_struct *p)
 	if (soft != RLIM_INFINITY) {
 		unsigned long next;
 
-		p->rt.timeout++;
+		if (p->rt.watchdog_stamp != jiffies) {
+			p->rt.timeout++;
+			p->rt.watchdog_stamp = jiffies;
+		}
+
 		next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
 		if (p->rt.timeout > next)
 			p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
@@ -2010,7 +2018,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
 	if (--p->rt.time_slice)
 		return;
 
-	p->rt.time_slice = RR_TIMESLICE;
+	p->rt.time_slice = sched_rr_timeslice;
 
 	/*
 	 * Requeue to the end of queue if we (and all of our ancestors) are the
@@ -2041,7 +2049,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
 	 * Time slice is 0 for SCHED_FIFO tasks
 	 */
 	if (task->policy == SCHED_RR)
-		return RR_TIMESLICE;
+		return sched_rr_timeslice;
 	else
 		return 0;
 }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fc886441436a..cc03cfdf469f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1,5 +1,7 @@
 
 #include <linux/sched.h>
+#include <linux/sched/sysctl.h>
+#include <linux/sched/rt.h>
 #include <linux/mutex.h>
 #include <linux/spinlock.h>
 #include <linux/stop_machine.h>
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 903ffa9e8872..e036eda1a9c9 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -21,14 +21,17 @@ static int show_schedstat(struct seq_file *seq, void *v)
 	if (mask_str == NULL)
 		return -ENOMEM;
 
-	seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
-	seq_printf(seq, "timestamp %lu\n", jiffies);
-	for_each_online_cpu(cpu) {
-		struct rq *rq = cpu_rq(cpu);
+	if (v == (void *)1) {
+		seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
+		seq_printf(seq, "timestamp %lu\n", jiffies);
+	} else {
+		struct rq *rq;
 #ifdef CONFIG_SMP
 		struct sched_domain *sd;
 		int dcount = 0;
 #endif
+		cpu = (unsigned long)(v - 2);
+		rq = cpu_rq(cpu);
 
 		/* runqueue-specific stats */
 		seq_printf(seq,
@@ -77,30 +80,66 @@ static int show_schedstat(struct seq_file *seq, void *v)
 	return 0;
 }
 
-static int schedstat_open(struct inode *inode, struct file *file)
+/*
+ * This itererator needs some explanation.
+ * It returns 1 for the header position.
+ * This means 2 is cpu 0.
+ * In a hotplugged system some cpus, including cpu 0, may be missing so we have
+ * to use cpumask_* to iterate over the cpus.
+ */
+static void *schedstat_start(struct seq_file *file, loff_t *offset)
 {
-	unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
-	char *buf = kmalloc(size, GFP_KERNEL);
-	struct seq_file *m;
-	int res;
+	unsigned long n = *offset;
 
-	if (!buf)
-		return -ENOMEM;
-	res = single_open(file, show_schedstat, NULL);
-	if (!res) {
-		m = file->private_data;
-		m->buf = buf;
-		m->size = size;
-	} else
-		kfree(buf);
-	return res;
+	if (n == 0)
+		return (void *) 1;
+
+	n--;
+
+	if (n > 0)
+		n = cpumask_next(n - 1, cpu_online_mask);
+	else
+		n = cpumask_first(cpu_online_mask);
+
+	*offset = n + 1;
+
+	if (n < nr_cpu_ids)
+		return (void *)(unsigned long)(n + 2);
+	return NULL;
+}
+
+static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset)
+{
+	(*offset)++;
+	return schedstat_start(file, offset);
+}
+
+static void schedstat_stop(struct seq_file *file, void *data)
+{
+}
+
+static const struct seq_operations schedstat_sops = {
+	.start = schedstat_start,
+	.next = schedstat_next,
+	.stop = schedstat_stop,
+	.show = show_schedstat,
+};
+
+static int schedstat_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &schedstat_sops);
 }
 
+static int schedstat_release(struct inode *inode, struct file *file)
+{
+	return 0;
+};
+
 static const struct file_operations proc_schedstat_operations = {
 	.open = schedstat_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
-	.release = single_release,
+	.release = schedstat_release,
 };
 
 static int __init proc_schedstat_init(void)
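
Both new iterators (sched_debug_sops and schedstat_sops) encode the seq_file position the same way: offset 0 yields the header token 1, and an online cpu c is returned as the token c + 2 while the offset is left at c + 1, so the next call resumes after c even when cpus are hotplugged out. A self-contained userspace model of that encoding follows; the online-cpu mask, NR_CPU_IDS and the helper names are assumptions made for the example.

#include <stdio.h>

#define NR_CPU_IDS 8
/* Assumed hotplug state for the example: cpu 2 is offline. */
static const int cpu_online[NR_CPU_IDS] = { 1, 1, 0, 1, 0, 0, 0, 0 };

static unsigned long cpumask_first_example(void)
{
	unsigned long c;

	for (c = 0; c < NR_CPU_IDS; c++)
		if (cpu_online[c])
			return c;
	return NR_CPU_IDS;
}

static unsigned long cpumask_next_example(unsigned long prev)
{
	unsigned long c;

	for (c = prev + 1; c < NR_CPU_IDS; c++)
		if (cpu_online[c])
			return c;
	return NR_CPU_IDS;
}

/* Same position encoding as schedstat_start()/sched_debug_start(). */
static unsigned long iter_start(unsigned long *offset)
{
	unsigned long n = *offset;

	if (n == 0)
		return 1;			/* header token */

	n--;
	if (n > 0)
		n = cpumask_next_example(n - 1);
	else
		n = cpumask_first_example();

	*offset = n + 1;
	if (n < NR_CPU_IDS)
		return n + 2;			/* cpu n encoded as n + 2 */
	return 0;				/* NULL: iteration done */
}

int main(void)
{
	unsigned long pos = 0, v;

	while ((v = iter_start(&pos)) != 0) {
		if (v == 1)
			puts("header");
		else
			printf("cpu %lu\n", v - 2);
		pos++;				/* what seq_file's ->next does */
	}
	return 0;				/* prints: header, cpu 0, cpu 1, cpu 3 */
}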