aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile20
-rw-r--r--kernel/acct.c4
-rw-r--r--kernel/cpu.c4
-rw-r--r--kernel/debug/kdb/kdb_support.c2
-rw-r--r--kernel/events/Makefile2
-rw-r--r--kernel/events/callchain.c191
-rw-r--r--kernel/events/core.c298
-rw-r--r--kernel/events/internal.h39
-rw-r--r--kernel/exit.c22
-rw-r--r--kernel/fork.c14
-rw-r--r--kernel/irq/irqdomain.c12
-rw-r--r--kernel/itimer.c15
-rw-r--r--kernel/jump_label.c49
-rw-r--r--kernel/lockdep.c83
-rw-r--r--kernel/panic.c17
-rw-r--r--kernel/posix-cpu-timers.c132
-rw-r--r--kernel/printk.c11
-rw-r--r--kernel/rcu.h7
-rw-r--r--kernel/rcupdate.c12
-rw-r--r--kernel/rcutiny.c149
-rw-r--r--kernel/rcutiny_plugin.h29
-rw-r--r--kernel/rcutorture.c225
-rw-r--r--kernel/rcutree.c290
-rw-r--r--kernel/rcutree.h26
-rw-r--r--kernel/rcutree_plugin.h289
-rw-r--r--kernel/rcutree_trace.c12
-rw-r--r--kernel/rtmutex-debug.c1
-rw-r--r--kernel/rtmutex.c8
-rw-r--r--kernel/sched/Makefile20
-rw-r--r--kernel/sched/auto_group.c (renamed from kernel/sched_autogroup.c)33
-rw-r--r--kernel/sched/auto_group.h (renamed from kernel/sched_autogroup.h)26
-rw-r--r--kernel/sched/clock.c (renamed from kernel/sched_clock.c)0
-rw-r--r--kernel/sched/core.c (renamed from kernel/sched.c)2187
-rw-r--r--kernel/sched/cpupri.c (renamed from kernel/sched_cpupri.c)4
-rw-r--r--kernel/sched/cpupri.h (renamed from kernel/sched_cpupri.h)0
-rw-r--r--kernel/sched/debug.c (renamed from kernel/sched_debug.c)6
-rw-r--r--kernel/sched/fair.c (renamed from kernel/sched_fair.c)1000
-rw-r--r--kernel/sched/features.h (renamed from kernel/sched_features.h)30
-rw-r--r--kernel/sched/idle_task.c (renamed from kernel/sched_idletask.c)4
-rw-r--r--kernel/sched/rt.c (renamed from kernel/sched_rt.c)218
-rw-r--r--kernel/sched/sched.h1166
-rw-r--r--kernel/sched/stats.c111
-rw-r--r--kernel/sched/stats.h (renamed from kernel/sched_stats.h)109
-rw-r--r--kernel/sched/stop_task.c (renamed from kernel/sched_stoptask.c)4
-rw-r--r--kernel/signal.c6
-rw-r--r--kernel/softirq.c4
-rw-r--r--kernel/sys.c6
-rw-r--r--kernel/time/tick-sched.c105
-rw-r--r--kernel/time/timekeeping.c10
-rw-r--r--kernel/timer.c62
-rw-r--r--kernel/trace/trace.c106
-rw-r--r--kernel/trace/trace.h2
-rw-r--r--kernel/trace/trace_events_filter.c26
-rw-r--r--kernel/trace/trace_irqsoff.c13
-rw-r--r--kernel/trace/trace_output.c16
-rw-r--r--kernel/trace/trace_sched_wakeup.c13
-rw-r--r--kernel/tsacct.c2
-rw-r--r--kernel/wait.c4
58 files changed, 4206 insertions, 3050 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index e898c5b9d02c..f70396e5a24b 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -2,16 +2,15 @@
2# Makefile for the linux kernel. 2# Makefile for the linux kernel.
3# 3#
4 4
5obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ 5obj-y = fork.o exec_domain.o panic.o printk.o \
6 cpu.o exit.o itimer.o time.o softirq.o resource.o \ 6 cpu.o exit.o itimer.o time.o softirq.o resource.o \
7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o sysctl_binary.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o cred.o \
13 async.o range.o 13 async.o range.o groups.o
14obj-y += groups.o
15 14
16ifdef CONFIG_FUNCTION_TRACER 15ifdef CONFIG_FUNCTION_TRACER
17# Do not trace debug files and internal ftrace files 16# Do not trace debug files and internal ftrace files
@@ -20,10 +19,11 @@ CFLAGS_REMOVE_lockdep_proc.o = -pg
20CFLAGS_REMOVE_mutex-debug.o = -pg 19CFLAGS_REMOVE_mutex-debug.o = -pg
21CFLAGS_REMOVE_rtmutex-debug.o = -pg 20CFLAGS_REMOVE_rtmutex-debug.o = -pg
22CFLAGS_REMOVE_cgroup-debug.o = -pg 21CFLAGS_REMOVE_cgroup-debug.o = -pg
23CFLAGS_REMOVE_sched_clock.o = -pg
24CFLAGS_REMOVE_irq_work.o = -pg 22CFLAGS_REMOVE_irq_work.o = -pg
25endif 23endif
26 24
25obj-y += sched/
26
27obj-$(CONFIG_FREEZER) += freezer.o 27obj-$(CONFIG_FREEZER) += freezer.o
28obj-$(CONFIG_PROFILING) += profile.o 28obj-$(CONFIG_PROFILING) += profile.o
29obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o 29obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
@@ -99,7 +99,6 @@ obj-$(CONFIG_TRACING) += trace/
99obj-$(CONFIG_X86_DS) += trace/ 99obj-$(CONFIG_X86_DS) += trace/
100obj-$(CONFIG_RING_BUFFER) += trace/ 100obj-$(CONFIG_RING_BUFFER) += trace/
101obj-$(CONFIG_TRACEPOINTS) += trace/ 101obj-$(CONFIG_TRACEPOINTS) += trace/
102obj-$(CONFIG_SMP) += sched_cpupri.o
103obj-$(CONFIG_IRQ_WORK) += irq_work.o 102obj-$(CONFIG_IRQ_WORK) += irq_work.o
104obj-$(CONFIG_CPU_PM) += cpu_pm.o 103obj-$(CONFIG_CPU_PM) += cpu_pm.o
105 104
@@ -110,15 +109,6 @@ obj-$(CONFIG_PADATA) += padata.o
110obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 109obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
111obj-$(CONFIG_JUMP_LABEL) += jump_label.o 110obj-$(CONFIG_JUMP_LABEL) += jump_label.o
112 111
113ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
114# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
115# needed for x86 only. Why this used to be enabled for all architectures is beyond
116# me. I suspect most platforms don't need this, but until we know that for sure
117# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
118# to get a correct value for the wait-channel (WCHAN in ps). --davidm
119CFLAGS_sched.o := $(PROFILING) -fno-omit-frame-pointer
120endif
121
122$(obj)/configs.o: $(obj)/config_data.h 112$(obj)/configs.o: $(obj)/config_data.h
123 113
124# config_data.h contains the same information as ikconfig.h but gzipped. 114# config_data.h contains the same information as ikconfig.h but gzipped.
diff --git a/kernel/acct.c b/kernel/acct.c
index fa7eb3de2ddc..203dfead2e06 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -613,8 +613,8 @@ void acct_collect(long exitcode, int group_dead)
613 pacct->ac_flag |= ACORE; 613 pacct->ac_flag |= ACORE;
614 if (current->flags & PF_SIGNALED) 614 if (current->flags & PF_SIGNALED)
615 pacct->ac_flag |= AXSIG; 615 pacct->ac_flag |= AXSIG;
616 pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime); 616 pacct->ac_utime += current->utime;
617 pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime); 617 pacct->ac_stime += current->stime;
618 pacct->ac_minflt += current->min_flt; 618 pacct->ac_minflt += current->min_flt;
619 pacct->ac_majflt += current->maj_flt; 619 pacct->ac_majflt += current->maj_flt;
620 spin_unlock_irq(&current->sighand->siglock); 620 spin_unlock_irq(&current->sighand->siglock);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 563f13609470..5ca38d5d238a 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -178,8 +178,7 @@ static inline void check_for_tasks(int cpu)
178 write_lock_irq(&tasklist_lock); 178 write_lock_irq(&tasklist_lock);
179 for_each_process(p) { 179 for_each_process(p) {
180 if (task_cpu(p) == cpu && p->state == TASK_RUNNING && 180 if (task_cpu(p) == cpu && p->state == TASK_RUNNING &&
181 (!cputime_eq(p->utime, cputime_zero) || 181 (p->utime || p->stime))
182 !cputime_eq(p->stime, cputime_zero)))
183 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " 182 printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d "
184 "(state = %ld, flags = %x)\n", 183 "(state = %ld, flags = %x)\n",
185 p->comm, task_pid_nr(p), cpu, 184 p->comm, task_pid_nr(p), cpu,
@@ -380,6 +379,7 @@ out:
380 cpu_maps_update_done(); 379 cpu_maps_update_done();
381 return err; 380 return err;
382} 381}
382EXPORT_SYMBOL_GPL(cpu_up);
383 383
384#ifdef CONFIG_PM_SLEEP_SMP 384#ifdef CONFIG_PM_SLEEP_SMP
385static cpumask_var_t frozen_cpus; 385static cpumask_var_t frozen_cpus;
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 5532dd37aa86..7d6fb40d2188 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -636,7 +636,7 @@ char kdb_task_state_char (const struct task_struct *p)
636 (p->exit_state & EXIT_ZOMBIE) ? 'Z' : 636 (p->exit_state & EXIT_ZOMBIE) ? 'Z' :
637 (p->exit_state & EXIT_DEAD) ? 'E' : 637 (p->exit_state & EXIT_DEAD) ? 'E' :
638 (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?'; 638 (p->state & TASK_INTERRUPTIBLE) ? 'S' : '?';
639 if (p->pid == 0) { 639 if (is_idle_task(p)) {
640 /* Idle task. Is it really idle, apart from the kdb 640 /* Idle task. Is it really idle, apart from the kdb
641 * interrupt? */ 641 * interrupt? */
642 if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) { 642 if (!kdb_task_has_cpu(p) || kgdb_info[cpu].irq_depth == 1) {
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
index 89e5e8aa4c36..22d901f9caf4 100644
--- a/kernel/events/Makefile
+++ b/kernel/events/Makefile
@@ -2,5 +2,5 @@ ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_core.o = -pg 2CFLAGS_REMOVE_core.o = -pg
3endif 3endif
4 4
5obj-y := core.o ring_buffer.o 5obj-y := core.o ring_buffer.o callchain.o
6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o 6obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
new file mode 100644
index 000000000000..057e24b665cf
--- /dev/null
+++ b/kernel/events/callchain.c
@@ -0,0 +1,191 @@
1/*
2 * Performance events callchain code, extracted from core.c:
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12#include <linux/perf_event.h>
13#include <linux/slab.h>
14#include "internal.h"
15
16struct callchain_cpus_entries {
17 struct rcu_head rcu_head;
18 struct perf_callchain_entry *cpu_entries[0];
19};
20
21static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
22static atomic_t nr_callchain_events;
23static DEFINE_MUTEX(callchain_mutex);
24static struct callchain_cpus_entries *callchain_cpus_entries;
25
26
27__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
28 struct pt_regs *regs)
29{
30}
31
32__weak void perf_callchain_user(struct perf_callchain_entry *entry,
33 struct pt_regs *regs)
34{
35}
36
37static void release_callchain_buffers_rcu(struct rcu_head *head)
38{
39 struct callchain_cpus_entries *entries;
40 int cpu;
41
42 entries = container_of(head, struct callchain_cpus_entries, rcu_head);
43
44 for_each_possible_cpu(cpu)
45 kfree(entries->cpu_entries[cpu]);
46
47 kfree(entries);
48}
49
50static void release_callchain_buffers(void)
51{
52 struct callchain_cpus_entries *entries;
53
54 entries = callchain_cpus_entries;
55 rcu_assign_pointer(callchain_cpus_entries, NULL);
56 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
57}
58
59static int alloc_callchain_buffers(void)
60{
61 int cpu;
62 int size;
63 struct callchain_cpus_entries *entries;
64
65 /*
66 * We can't use the percpu allocation API for data that can be
67 * accessed from NMI. Use a temporary manual per cpu allocation
68 * until that gets sorted out.
69 */
70 size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
71
72 entries = kzalloc(size, GFP_KERNEL);
73 if (!entries)
74 return -ENOMEM;
75
76 size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
77
78 for_each_possible_cpu(cpu) {
79 entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
80 cpu_to_node(cpu));
81 if (!entries->cpu_entries[cpu])
82 goto fail;
83 }
84
85 rcu_assign_pointer(callchain_cpus_entries, entries);
86
87 return 0;
88
89fail:
90 for_each_possible_cpu(cpu)
91 kfree(entries->cpu_entries[cpu]);
92 kfree(entries);
93
94 return -ENOMEM;
95}
96
97int get_callchain_buffers(void)
98{
99 int err = 0;
100 int count;
101
102 mutex_lock(&callchain_mutex);
103
104 count = atomic_inc_return(&nr_callchain_events);
105 if (WARN_ON_ONCE(count < 1)) {
106 err = -EINVAL;
107 goto exit;
108 }
109
110 if (count > 1) {
111 /* If the allocation failed, give up */
112 if (!callchain_cpus_entries)
113 err = -ENOMEM;
114 goto exit;
115 }
116
117 err = alloc_callchain_buffers();
118 if (err)
119 release_callchain_buffers();
120exit:
121 mutex_unlock(&callchain_mutex);
122
123 return err;
124}
125
126void put_callchain_buffers(void)
127{
128 if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
129 release_callchain_buffers();
130 mutex_unlock(&callchain_mutex);
131 }
132}
133
134static struct perf_callchain_entry *get_callchain_entry(int *rctx)
135{
136 int cpu;
137 struct callchain_cpus_entries *entries;
138
139 *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
140 if (*rctx == -1)
141 return NULL;
142
143 entries = rcu_dereference(callchain_cpus_entries);
144 if (!entries)
145 return NULL;
146
147 cpu = smp_processor_id();
148
149 return &entries->cpu_entries[cpu][*rctx];
150}
151
152static void
153put_callchain_entry(int rctx)
154{
155 put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
156}
157
158struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
159{
160 int rctx;
161 struct perf_callchain_entry *entry;
162
163
164 entry = get_callchain_entry(&rctx);
165 if (rctx == -1)
166 return NULL;
167
168 if (!entry)
169 goto exit_put;
170
171 entry->nr = 0;
172
173 if (!user_mode(regs)) {
174 perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
175 perf_callchain_kernel(entry, regs);
176 if (current->mm)
177 regs = task_pt_regs(current);
178 else
179 regs = NULL;
180 }
181
182 if (regs) {
183 perf_callchain_store(entry, PERF_CONTEXT_USER);
184 perf_callchain_user(entry, regs);
185 }
186
187exit_put:
188 put_callchain_entry(rctx);
189
190 return entry;
191}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 58690af323e4..890eb02c2f21 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -128,7 +128,7 @@ enum event_type_t {
128 * perf_sched_events : >0 events exist 128 * perf_sched_events : >0 events exist
129 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu 129 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
130 */ 130 */
131struct jump_label_key perf_sched_events __read_mostly; 131struct jump_label_key_deferred perf_sched_events __read_mostly;
132static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); 132static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
133 133
134static atomic_t nr_mmap_events __read_mostly; 134static atomic_t nr_mmap_events __read_mostly;
@@ -1130,6 +1130,8 @@ event_sched_out(struct perf_event *event,
1130 if (!is_software_event(event)) 1130 if (!is_software_event(event))
1131 cpuctx->active_oncpu--; 1131 cpuctx->active_oncpu--;
1132 ctx->nr_active--; 1132 ctx->nr_active--;
1133 if (event->attr.freq && event->attr.sample_freq)
1134 ctx->nr_freq--;
1133 if (event->attr.exclusive || !cpuctx->active_oncpu) 1135 if (event->attr.exclusive || !cpuctx->active_oncpu)
1134 cpuctx->exclusive = 0; 1136 cpuctx->exclusive = 0;
1135} 1137}
@@ -1325,6 +1327,7 @@ retry:
1325 } 1327 }
1326 raw_spin_unlock_irq(&ctx->lock); 1328 raw_spin_unlock_irq(&ctx->lock);
1327} 1329}
1330EXPORT_SYMBOL_GPL(perf_event_disable);
1328 1331
1329static void perf_set_shadow_time(struct perf_event *event, 1332static void perf_set_shadow_time(struct perf_event *event,
1330 struct perf_event_context *ctx, 1333 struct perf_event_context *ctx,
@@ -1406,6 +1409,8 @@ event_sched_in(struct perf_event *event,
1406 if (!is_software_event(event)) 1409 if (!is_software_event(event))
1407 cpuctx->active_oncpu++; 1410 cpuctx->active_oncpu++;
1408 ctx->nr_active++; 1411 ctx->nr_active++;
1412 if (event->attr.freq && event->attr.sample_freq)
1413 ctx->nr_freq++;
1409 1414
1410 if (event->attr.exclusive) 1415 if (event->attr.exclusive)
1411 cpuctx->exclusive = 1; 1416 cpuctx->exclusive = 1;
@@ -1662,8 +1667,7 @@ retry:
1662 * Note: this works for group members as well as group leaders 1667 * Note: this works for group members as well as group leaders
1663 * since the non-leader members' sibling_lists will be empty. 1668 * since the non-leader members' sibling_lists will be empty.
1664 */ 1669 */
1665static void __perf_event_mark_enabled(struct perf_event *event, 1670static void __perf_event_mark_enabled(struct perf_event *event)
1666 struct perf_event_context *ctx)
1667{ 1671{
1668 struct perf_event *sub; 1672 struct perf_event *sub;
1669 u64 tstamp = perf_event_time(event); 1673 u64 tstamp = perf_event_time(event);
@@ -1701,7 +1705,7 @@ static int __perf_event_enable(void *info)
1701 */ 1705 */
1702 perf_cgroup_set_timestamp(current, ctx); 1706 perf_cgroup_set_timestamp(current, ctx);
1703 1707
1704 __perf_event_mark_enabled(event, ctx); 1708 __perf_event_mark_enabled(event);
1705 1709
1706 if (!event_filter_match(event)) { 1710 if (!event_filter_match(event)) {
1707 if (is_cgroup_event(event)) 1711 if (is_cgroup_event(event))
@@ -1782,7 +1786,7 @@ void perf_event_enable(struct perf_event *event)
1782 1786
1783retry: 1787retry:
1784 if (!ctx->is_active) { 1788 if (!ctx->is_active) {
1785 __perf_event_mark_enabled(event, ctx); 1789 __perf_event_mark_enabled(event);
1786 goto out; 1790 goto out;
1787 } 1791 }
1788 1792
@@ -1809,6 +1813,7 @@ retry:
1809out: 1813out:
1810 raw_spin_unlock_irq(&ctx->lock); 1814 raw_spin_unlock_irq(&ctx->lock);
1811} 1815}
1816EXPORT_SYMBOL_GPL(perf_event_enable);
1812 1817
1813int perf_event_refresh(struct perf_event *event, int refresh) 1818int perf_event_refresh(struct perf_event *event, int refresh)
1814{ 1819{
@@ -2327,6 +2332,9 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period)
2327 u64 interrupts, now; 2332 u64 interrupts, now;
2328 s64 delta; 2333 s64 delta;
2329 2334
2335 if (!ctx->nr_freq)
2336 return;
2337
2330 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { 2338 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
2331 if (event->state != PERF_EVENT_STATE_ACTIVE) 2339 if (event->state != PERF_EVENT_STATE_ACTIVE)
2332 continue; 2340 continue;
@@ -2382,12 +2390,14 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2382{ 2390{
2383 u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC; 2391 u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC;
2384 struct perf_event_context *ctx = NULL; 2392 struct perf_event_context *ctx = NULL;
2385 int rotate = 0, remove = 1; 2393 int rotate = 0, remove = 1, freq = 0;
2386 2394
2387 if (cpuctx->ctx.nr_events) { 2395 if (cpuctx->ctx.nr_events) {
2388 remove = 0; 2396 remove = 0;
2389 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) 2397 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
2390 rotate = 1; 2398 rotate = 1;
2399 if (cpuctx->ctx.nr_freq)
2400 freq = 1;
2391 } 2401 }
2392 2402
2393 ctx = cpuctx->task_ctx; 2403 ctx = cpuctx->task_ctx;
@@ -2395,33 +2405,40 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2395 remove = 0; 2405 remove = 0;
2396 if (ctx->nr_events != ctx->nr_active) 2406 if (ctx->nr_events != ctx->nr_active)
2397 rotate = 1; 2407 rotate = 1;
2408 if (ctx->nr_freq)
2409 freq = 1;
2398 } 2410 }
2399 2411
2412 if (!rotate && !freq)
2413 goto done;
2414
2400 perf_ctx_lock(cpuctx, cpuctx->task_ctx); 2415 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2401 perf_pmu_disable(cpuctx->ctx.pmu); 2416 perf_pmu_disable(cpuctx->ctx.pmu);
2402 perf_ctx_adjust_freq(&cpuctx->ctx, interval);
2403 if (ctx)
2404 perf_ctx_adjust_freq(ctx, interval);
2405 2417
2406 if (!rotate) 2418 if (freq) {
2407 goto done; 2419 perf_ctx_adjust_freq(&cpuctx->ctx, interval);
2420 if (ctx)
2421 perf_ctx_adjust_freq(ctx, interval);
2422 }
2408 2423
2409 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); 2424 if (rotate) {
2410 if (ctx) 2425 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2411 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE); 2426 if (ctx)
2427 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
2412 2428
2413 rotate_ctx(&cpuctx->ctx); 2429 rotate_ctx(&cpuctx->ctx);
2414 if (ctx) 2430 if (ctx)
2415 rotate_ctx(ctx); 2431 rotate_ctx(ctx);
2416 2432
2417 perf_event_sched_in(cpuctx, ctx, current); 2433 perf_event_sched_in(cpuctx, ctx, current);
2434 }
2435
2436 perf_pmu_enable(cpuctx->ctx.pmu);
2437 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2418 2438
2419done: 2439done:
2420 if (remove) 2440 if (remove)
2421 list_del_init(&cpuctx->rotation_list); 2441 list_del_init(&cpuctx->rotation_list);
2422
2423 perf_pmu_enable(cpuctx->ctx.pmu);
2424 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2425} 2442}
2426 2443
2427void perf_event_task_tick(void) 2444void perf_event_task_tick(void)
@@ -2448,7 +2465,7 @@ static int event_enable_on_exec(struct perf_event *event,
2448 if (event->state >= PERF_EVENT_STATE_INACTIVE) 2465 if (event->state >= PERF_EVENT_STATE_INACTIVE)
2449 return 0; 2466 return 0;
2450 2467
2451 __perf_event_mark_enabled(event, ctx); 2468 __perf_event_mark_enabled(event);
2452 2469
2453 return 1; 2470 return 1;
2454} 2471}
@@ -2480,13 +2497,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
2480 raw_spin_lock(&ctx->lock); 2497 raw_spin_lock(&ctx->lock);
2481 task_ctx_sched_out(ctx); 2498 task_ctx_sched_out(ctx);
2482 2499
2483 list_for_each_entry(event, &ctx->pinned_groups, group_entry) { 2500 list_for_each_entry(event, &ctx->event_list, event_entry) {
2484 ret = event_enable_on_exec(event, ctx);
2485 if (ret)
2486 enabled = 1;
2487 }
2488
2489 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2490 ret = event_enable_on_exec(event, ctx); 2501 ret = event_enable_on_exec(event, ctx);
2491 if (ret) 2502 if (ret)
2492 enabled = 1; 2503 enabled = 1;
@@ -2574,215 +2585,6 @@ static u64 perf_event_read(struct perf_event *event)
2574} 2585}
2575 2586
2576/* 2587/*
2577 * Callchain support
2578 */
2579
2580struct callchain_cpus_entries {
2581 struct rcu_head rcu_head;
2582 struct perf_callchain_entry *cpu_entries[0];
2583};
2584
2585static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]);
2586static atomic_t nr_callchain_events;
2587static DEFINE_MUTEX(callchain_mutex);
2588struct callchain_cpus_entries *callchain_cpus_entries;
2589
2590
2591__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
2592 struct pt_regs *regs)
2593{
2594}
2595
2596__weak void perf_callchain_user(struct perf_callchain_entry *entry,
2597 struct pt_regs *regs)
2598{
2599}
2600
2601static void release_callchain_buffers_rcu(struct rcu_head *head)
2602{
2603 struct callchain_cpus_entries *entries;
2604 int cpu;
2605
2606 entries = container_of(head, struct callchain_cpus_entries, rcu_head);
2607
2608 for_each_possible_cpu(cpu)
2609 kfree(entries->cpu_entries[cpu]);
2610
2611 kfree(entries);
2612}
2613
2614static void release_callchain_buffers(void)
2615{
2616 struct callchain_cpus_entries *entries;
2617
2618 entries = callchain_cpus_entries;
2619 rcu_assign_pointer(callchain_cpus_entries, NULL);
2620 call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
2621}
2622
2623static int alloc_callchain_buffers(void)
2624{
2625 int cpu;
2626 int size;
2627 struct callchain_cpus_entries *entries;
2628
2629 /*
2630 * We can't use the percpu allocation API for data that can be
2631 * accessed from NMI. Use a temporary manual per cpu allocation
2632 * until that gets sorted out.
2633 */
2634 size = offsetof(struct callchain_cpus_entries, cpu_entries[nr_cpu_ids]);
2635
2636 entries = kzalloc(size, GFP_KERNEL);
2637 if (!entries)
2638 return -ENOMEM;
2639
2640 size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS;
2641
2642 for_each_possible_cpu(cpu) {
2643 entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
2644 cpu_to_node(cpu));
2645 if (!entries->cpu_entries[cpu])
2646 goto fail;
2647 }
2648
2649 rcu_assign_pointer(callchain_cpus_entries, entries);
2650
2651 return 0;
2652
2653fail:
2654 for_each_possible_cpu(cpu)
2655 kfree(entries->cpu_entries[cpu]);
2656 kfree(entries);
2657
2658 return -ENOMEM;
2659}
2660
2661static int get_callchain_buffers(void)
2662{
2663 int err = 0;
2664 int count;
2665
2666 mutex_lock(&callchain_mutex);
2667
2668 count = atomic_inc_return(&nr_callchain_events);
2669 if (WARN_ON_ONCE(count < 1)) {
2670 err = -EINVAL;
2671 goto exit;
2672 }
2673
2674 if (count > 1) {
2675 /* If the allocation failed, give up */
2676 if (!callchain_cpus_entries)
2677 err = -ENOMEM;
2678 goto exit;
2679 }
2680
2681 err = alloc_callchain_buffers();
2682 if (err)
2683 release_callchain_buffers();
2684exit:
2685 mutex_unlock(&callchain_mutex);
2686
2687 return err;
2688}
2689
2690static void put_callchain_buffers(void)
2691{
2692 if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
2693 release_callchain_buffers();
2694 mutex_unlock(&callchain_mutex);
2695 }
2696}
2697
2698static int get_recursion_context(int *recursion)
2699{
2700 int rctx;
2701
2702 if (in_nmi())
2703 rctx = 3;
2704 else if (in_irq())
2705 rctx = 2;
2706 else if (in_softirq())
2707 rctx = 1;
2708 else
2709 rctx = 0;
2710
2711 if (recursion[rctx])
2712 return -1;
2713
2714 recursion[rctx]++;
2715 barrier();
2716
2717 return rctx;
2718}
2719
2720static inline void put_recursion_context(int *recursion, int rctx)
2721{
2722 barrier();
2723 recursion[rctx]--;
2724}
2725
2726static struct perf_callchain_entry *get_callchain_entry(int *rctx)
2727{
2728 int cpu;
2729 struct callchain_cpus_entries *entries;
2730
2731 *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
2732 if (*rctx == -1)
2733 return NULL;
2734
2735 entries = rcu_dereference(callchain_cpus_entries);
2736 if (!entries)
2737 return NULL;
2738
2739 cpu = smp_processor_id();
2740
2741 return &entries->cpu_entries[cpu][*rctx];
2742}
2743
2744static void
2745put_callchain_entry(int rctx)
2746{
2747 put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
2748}
2749
2750static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2751{
2752 int rctx;
2753 struct perf_callchain_entry *entry;
2754
2755
2756 entry = get_callchain_entry(&rctx);
2757 if (rctx == -1)
2758 return NULL;
2759
2760 if (!entry)
2761 goto exit_put;
2762
2763 entry->nr = 0;
2764
2765 if (!user_mode(regs)) {
2766 perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
2767 perf_callchain_kernel(entry, regs);
2768 if (current->mm)
2769 regs = task_pt_regs(current);
2770 else
2771 regs = NULL;
2772 }
2773
2774 if (regs) {
2775 perf_callchain_store(entry, PERF_CONTEXT_USER);
2776 perf_callchain_user(entry, regs);
2777 }
2778
2779exit_put:
2780 put_callchain_entry(rctx);
2781
2782 return entry;
2783}
2784
2785/*
2786 * Initialize the perf_event context in a task_struct: 2588 * Initialize the perf_event context in a task_struct:
2787 */ 2589 */
2788static void __perf_event_init_context(struct perf_event_context *ctx) 2590static void __perf_event_init_context(struct perf_event_context *ctx)
@@ -2946,7 +2748,7 @@ static void free_event(struct perf_event *event)
2946 2748
2947 if (!event->parent) { 2749 if (!event->parent) {
2948 if (event->attach_state & PERF_ATTACH_TASK) 2750 if (event->attach_state & PERF_ATTACH_TASK)
2949 jump_label_dec(&perf_sched_events); 2751 jump_label_dec_deferred(&perf_sched_events);
2950 if (event->attr.mmap || event->attr.mmap_data) 2752 if (event->attr.mmap || event->attr.mmap_data)
2951 atomic_dec(&nr_mmap_events); 2753 atomic_dec(&nr_mmap_events);
2952 if (event->attr.comm) 2754 if (event->attr.comm)
@@ -2957,7 +2759,7 @@ static void free_event(struct perf_event *event)
2957 put_callchain_buffers(); 2759 put_callchain_buffers();
2958 if (is_cgroup_event(event)) { 2760 if (is_cgroup_event(event)) {
2959 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu)); 2761 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
2960 jump_label_dec(&perf_sched_events); 2762 jump_label_dec_deferred(&perf_sched_events);
2961 } 2763 }
2962 } 2764 }
2963 2765
@@ -4820,7 +4622,6 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
4820 struct hw_perf_event *hwc = &event->hw; 4622 struct hw_perf_event *hwc = &event->hw;
4821 int throttle = 0; 4623 int throttle = 0;
4822 4624
4823 data->period = event->hw.last_period;
4824 if (!overflow) 4625 if (!overflow)
4825 overflow = perf_swevent_set_period(event); 4626 overflow = perf_swevent_set_period(event);
4826 4627
@@ -4854,6 +4655,12 @@ static void perf_swevent_event(struct perf_event *event, u64 nr,
4854 if (!is_sampling_event(event)) 4655 if (!is_sampling_event(event))
4855 return; 4656 return;
4856 4657
4658 if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
4659 data->period = nr;
4660 return perf_swevent_overflow(event, 1, data, regs);
4661 } else
4662 data->period = event->hw.last_period;
4663
4857 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq) 4664 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
4858 return perf_swevent_overflow(event, 1, data, regs); 4665 return perf_swevent_overflow(event, 1, data, regs);
4859 4666
@@ -5366,7 +5173,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
5366 regs = get_irq_regs(); 5173 regs = get_irq_regs();
5367 5174
5368 if (regs && !perf_exclude_event(event, regs)) { 5175 if (regs && !perf_exclude_event(event, regs)) {
5369 if (!(event->attr.exclude_idle && current->pid == 0)) 5176 if (!(event->attr.exclude_idle && is_idle_task(current)))
5370 if (perf_event_overflow(event, &data, regs)) 5177 if (perf_event_overflow(event, &data, regs))
5371 ret = HRTIMER_NORESTART; 5178 ret = HRTIMER_NORESTART;
5372 } 5179 }
@@ -5981,7 +5788,7 @@ done:
5981 5788
5982 if (!event->parent) { 5789 if (!event->parent) {
5983 if (event->attach_state & PERF_ATTACH_TASK) 5790 if (event->attach_state & PERF_ATTACH_TASK)
5984 jump_label_inc(&perf_sched_events); 5791 jump_label_inc(&perf_sched_events.key);
5985 if (event->attr.mmap || event->attr.mmap_data) 5792 if (event->attr.mmap || event->attr.mmap_data)
5986 atomic_inc(&nr_mmap_events); 5793 atomic_inc(&nr_mmap_events);
5987 if (event->attr.comm) 5794 if (event->attr.comm)
@@ -6219,7 +6026,7 @@ SYSCALL_DEFINE5(perf_event_open,
6219 * - that may need work on context switch 6026 * - that may need work on context switch
6220 */ 6027 */
6221 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu)); 6028 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
6222 jump_label_inc(&perf_sched_events); 6029 jump_label_inc(&perf_sched_events.key);
6223 } 6030 }
6224 6031
6225 /* 6032 /*
@@ -7065,6 +6872,9 @@ void __init perf_event_init(void)
7065 6872
7066 ret = init_hw_breakpoint(); 6873 ret = init_hw_breakpoint();
7067 WARN(ret, "hw_breakpoint initialization failed with: %d", ret); 6874 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
6875
6876 /* do not patch jump label more than once per second */
6877 jump_label_rate_limit(&perf_sched_events, HZ);
7068} 6878}
7069 6879
7070static int __init perf_event_sysfs_init(void) 6880static int __init perf_event_sysfs_init(void)
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 64568a699375..b0b107f90afc 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -1,6 +1,10 @@
1#ifndef _KERNEL_EVENTS_INTERNAL_H 1#ifndef _KERNEL_EVENTS_INTERNAL_H
2#define _KERNEL_EVENTS_INTERNAL_H 2#define _KERNEL_EVENTS_INTERNAL_H
3 3
4#include <linux/hardirq.h>
5
6/* Buffer handling */
7
4#define RING_BUFFER_WRITABLE 0x01 8#define RING_BUFFER_WRITABLE 0x01
5 9
6struct ring_buffer { 10struct ring_buffer {
@@ -67,7 +71,7 @@ static inline int page_order(struct ring_buffer *rb)
67} 71}
68#endif 72#endif
69 73
70static unsigned long perf_data_size(struct ring_buffer *rb) 74static inline unsigned long perf_data_size(struct ring_buffer *rb)
71{ 75{
72 return rb->nr_pages << (PAGE_SHIFT + page_order(rb)); 76 return rb->nr_pages << (PAGE_SHIFT + page_order(rb));
73} 77}
@@ -96,4 +100,37 @@ __output_copy(struct perf_output_handle *handle,
96 } while (len); 100 } while (len);
97} 101}
98 102
103/* Callchain handling */
104extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs);
105extern int get_callchain_buffers(void);
106extern void put_callchain_buffers(void);
107
108static inline int get_recursion_context(int *recursion)
109{
110 int rctx;
111
112 if (in_nmi())
113 rctx = 3;
114 else if (in_irq())
115 rctx = 2;
116 else if (in_softirq())
117 rctx = 1;
118 else
119 rctx = 0;
120
121 if (recursion[rctx])
122 return -1;
123
124 recursion[rctx]++;
125 barrier();
126
127 return rctx;
128}
129
130static inline void put_recursion_context(int *recursion, int rctx)
131{
132 barrier();
133 recursion[rctx]--;
134}
135
99#endif /* _KERNEL_EVENTS_INTERNAL_H */ 136#endif /* _KERNEL_EVENTS_INTERNAL_H */
diff --git a/kernel/exit.c b/kernel/exit.c
index e6e01b959a0e..d579a459309d 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -121,9 +121,9 @@ static void __exit_signal(struct task_struct *tsk)
121 * We won't ever get here for the group leader, since it 121 * We won't ever get here for the group leader, since it
122 * will have been the last reference on the signal_struct. 122 * will have been the last reference on the signal_struct.
123 */ 123 */
124 sig->utime = cputime_add(sig->utime, tsk->utime); 124 sig->utime += tsk->utime;
125 sig->stime = cputime_add(sig->stime, tsk->stime); 125 sig->stime += tsk->stime;
126 sig->gtime = cputime_add(sig->gtime, tsk->gtime); 126 sig->gtime += tsk->gtime;
127 sig->min_flt += tsk->min_flt; 127 sig->min_flt += tsk->min_flt;
128 sig->maj_flt += tsk->maj_flt; 128 sig->maj_flt += tsk->maj_flt;
129 sig->nvcsw += tsk->nvcsw; 129 sig->nvcsw += tsk->nvcsw;
@@ -1255,19 +1255,9 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
1255 spin_lock_irq(&p->real_parent->sighand->siglock); 1255 spin_lock_irq(&p->real_parent->sighand->siglock);
1256 psig = p->real_parent->signal; 1256 psig = p->real_parent->signal;
1257 sig = p->signal; 1257 sig = p->signal;
1258 psig->cutime = 1258 psig->cutime += tgutime + sig->cutime;
1259 cputime_add(psig->cutime, 1259 psig->cstime += tgstime + sig->cstime;
1260 cputime_add(tgutime, 1260 psig->cgtime += p->gtime + sig->gtime + sig->cgtime;
1261 sig->cutime));
1262 psig->cstime =
1263 cputime_add(psig->cstime,
1264 cputime_add(tgstime,
1265 sig->cstime));
1266 psig->cgtime =
1267 cputime_add(psig->cgtime,
1268 cputime_add(p->gtime,
1269 cputime_add(sig->gtime,
1270 sig->cgtime)));
1271 psig->cmin_flt += 1261 psig->cmin_flt +=
1272 p->min_flt + sig->min_flt + sig->cmin_flt; 1262 p->min_flt + sig->min_flt + sig->cmin_flt;
1273 psig->cmaj_flt += 1263 psig->cmaj_flt +=
diff --git a/kernel/fork.c b/kernel/fork.c
index da4a6a10d088..b058c5820ecd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1023,8 +1023,8 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
1023 */ 1023 */
1024static void posix_cpu_timers_init(struct task_struct *tsk) 1024static void posix_cpu_timers_init(struct task_struct *tsk)
1025{ 1025{
1026 tsk->cputime_expires.prof_exp = cputime_zero; 1026 tsk->cputime_expires.prof_exp = 0;
1027 tsk->cputime_expires.virt_exp = cputime_zero; 1027 tsk->cputime_expires.virt_exp = 0;
1028 tsk->cputime_expires.sched_exp = 0; 1028 tsk->cputime_expires.sched_exp = 0;
1029 INIT_LIST_HEAD(&tsk->cpu_timers[0]); 1029 INIT_LIST_HEAD(&tsk->cpu_timers[0]);
1030 INIT_LIST_HEAD(&tsk->cpu_timers[1]); 1030 INIT_LIST_HEAD(&tsk->cpu_timers[1]);
@@ -1132,14 +1132,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1132 1132
1133 init_sigpending(&p->pending); 1133 init_sigpending(&p->pending);
1134 1134
1135 p->utime = cputime_zero; 1135 p->utime = p->stime = p->gtime = 0;
1136 p->stime = cputime_zero; 1136 p->utimescaled = p->stimescaled = 0;
1137 p->gtime = cputime_zero;
1138 p->utimescaled = cputime_zero;
1139 p->stimescaled = cputime_zero;
1140#ifndef CONFIG_VIRT_CPU_ACCOUNTING 1137#ifndef CONFIG_VIRT_CPU_ACCOUNTING
1141 p->prev_utime = cputime_zero; 1138 p->prev_utime = p->prev_stime = 0;
1142 p->prev_stime = cputime_zero;
1143#endif 1139#endif
1144#if defined(SPLIT_RSS_COUNTING) 1140#if defined(SPLIT_RSS_COUNTING)
1145 memset(&p->rss_stat, 0, sizeof(p->rss_stat)); 1141 memset(&p->rss_stat, 0, sizeof(p->rss_stat));
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 200ce832c585..7ca523b249ef 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -143,11 +143,6 @@ int irq_domain_simple_dt_translate(struct irq_domain *d,
143 return 0; 143 return 0;
144} 144}
145 145
146struct irq_domain_ops irq_domain_simple_ops = {
147 .dt_translate = irq_domain_simple_dt_translate,
148};
149EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
150
151/** 146/**
152 * irq_domain_create_simple() - Set up a 'simple' translation range 147 * irq_domain_create_simple() - Set up a 'simple' translation range
153 */ 148 */
@@ -182,3 +177,10 @@ void irq_domain_generate_simple(const struct of_device_id *match,
182} 177}
183EXPORT_SYMBOL_GPL(irq_domain_generate_simple); 178EXPORT_SYMBOL_GPL(irq_domain_generate_simple);
184#endif /* CONFIG_OF_IRQ */ 179#endif /* CONFIG_OF_IRQ */
180
181struct irq_domain_ops irq_domain_simple_ops = {
182#ifdef CONFIG_OF_IRQ
183 .dt_translate = irq_domain_simple_dt_translate,
184#endif /* CONFIG_OF_IRQ */
185};
186EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
diff --git a/kernel/itimer.c b/kernel/itimer.c
index d802883153da..22000c3db0dd 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -52,22 +52,22 @@ static void get_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
52 52
53 cval = it->expires; 53 cval = it->expires;
54 cinterval = it->incr; 54 cinterval = it->incr;
55 if (!cputime_eq(cval, cputime_zero)) { 55 if (cval) {
56 struct task_cputime cputime; 56 struct task_cputime cputime;
57 cputime_t t; 57 cputime_t t;
58 58
59 thread_group_cputimer(tsk, &cputime); 59 thread_group_cputimer(tsk, &cputime);
60 if (clock_id == CPUCLOCK_PROF) 60 if (clock_id == CPUCLOCK_PROF)
61 t = cputime_add(cputime.utime, cputime.stime); 61 t = cputime.utime + cputime.stime;
62 else 62 else
63 /* CPUCLOCK_VIRT */ 63 /* CPUCLOCK_VIRT */
64 t = cputime.utime; 64 t = cputime.utime;
65 65
66 if (cputime_le(cval, t)) 66 if (cval < t)
67 /* about to fire */ 67 /* about to fire */
68 cval = cputime_one_jiffy; 68 cval = cputime_one_jiffy;
69 else 69 else
70 cval = cputime_sub(cval, t); 70 cval = cval - t;
71 } 71 }
72 72
73 spin_unlock_irq(&tsk->sighand->siglock); 73 spin_unlock_irq(&tsk->sighand->siglock);
@@ -161,10 +161,9 @@ static void set_cpu_itimer(struct task_struct *tsk, unsigned int clock_id,
161 161
162 cval = it->expires; 162 cval = it->expires;
163 cinterval = it->incr; 163 cinterval = it->incr;
164 if (!cputime_eq(cval, cputime_zero) || 164 if (cval || nval) {
165 !cputime_eq(nval, cputime_zero)) { 165 if (nval > 0)
166 if (cputime_gt(nval, cputime_zero)) 166 nval += cputime_one_jiffy;
167 nval = cputime_add(nval, cputime_one_jiffy);
168 set_process_cpu_timer(tsk, clock_id, &nval, &cval); 167 set_process_cpu_timer(tsk, clock_id, &nval, &cval);
169 } 168 }
170 it->expires = nval; 169 it->expires = nval;
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 66ff7109f697..30c3c7708132 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -72,15 +72,46 @@ void jump_label_inc(struct jump_label_key *key)
72 jump_label_unlock(); 72 jump_label_unlock();
73} 73}
74 74
75void jump_label_dec(struct jump_label_key *key) 75static void __jump_label_dec(struct jump_label_key *key,
76 unsigned long rate_limit, struct delayed_work *work)
76{ 77{
77 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex)) 78 if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex))
78 return; 79 return;
79 80
80 jump_label_update(key, JUMP_LABEL_DISABLE); 81 if (rate_limit) {
82 atomic_inc(&key->enabled);
83 schedule_delayed_work(work, rate_limit);
84 } else
85 jump_label_update(key, JUMP_LABEL_DISABLE);
86
81 jump_label_unlock(); 87 jump_label_unlock();
82} 88}
83 89
90static void jump_label_update_timeout(struct work_struct *work)
91{
92 struct jump_label_key_deferred *key =
93 container_of(work, struct jump_label_key_deferred, work.work);
94 __jump_label_dec(&key->key, 0, NULL);
95}
96
97void jump_label_dec(struct jump_label_key *key)
98{
99 __jump_label_dec(key, 0, NULL);
100}
101
102void jump_label_dec_deferred(struct jump_label_key_deferred *key)
103{
104 __jump_label_dec(&key->key, key->timeout, &key->work);
105}
106
107
108void jump_label_rate_limit(struct jump_label_key_deferred *key,
109 unsigned long rl)
110{
111 key->timeout = rl;
112 INIT_DELAYED_WORK(&key->work, jump_label_update_timeout);
113}
114
84static int addr_conflict(struct jump_entry *entry, void *start, void *end) 115static int addr_conflict(struct jump_entry *entry, void *start, void *end)
85{ 116{
86 if (entry->code <= (unsigned long)end && 117 if (entry->code <= (unsigned long)end &&
@@ -111,7 +142,7 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
111 * running code can override this to make the non-live update case 142 * running code can override this to make the non-live update case
112 * cheaper. 143 * cheaper.
113 */ 144 */
114void __weak arch_jump_label_transform_static(struct jump_entry *entry, 145void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry,
115 enum jump_label_type type) 146 enum jump_label_type type)
116{ 147{
117 arch_jump_label_transform(entry, type); 148 arch_jump_label_transform(entry, type);
@@ -217,8 +248,13 @@ void jump_label_apply_nops(struct module *mod)
217 if (iter_start == iter_stop) 248 if (iter_start == iter_stop)
218 return; 249 return;
219 250
220 for (iter = iter_start; iter < iter_stop; iter++) 251 for (iter = iter_start; iter < iter_stop; iter++) {
221 arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE); 252 struct jump_label_key *iterk;
253
254 iterk = (struct jump_label_key *)(unsigned long)iter->key;
255 arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ?
256 JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE);
257 }
222} 258}
223 259
224static int jump_label_add_module(struct module *mod) 260static int jump_label_add_module(struct module *mod)
@@ -258,8 +294,7 @@ static int jump_label_add_module(struct module *mod)
258 key->next = jlm; 294 key->next = jlm;
259 295
260 if (jump_label_enabled(key)) 296 if (jump_label_enabled(key))
261 __jump_label_update(key, iter, iter_stop, 297 __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE);
262 JUMP_LABEL_ENABLE);
263 } 298 }
264 299
265 return 0; 300 return 0;
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index b2e08c932d91..8889f7dd7c46 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -431,6 +431,7 @@ unsigned int max_lockdep_depth;
431 * about it later on, in lockdep_info(). 431 * about it later on, in lockdep_info().
432 */ 432 */
433static int lockdep_init_error; 433static int lockdep_init_error;
434static const char *lock_init_error;
434static unsigned long lockdep_init_trace_data[20]; 435static unsigned long lockdep_init_trace_data[20];
435static struct stack_trace lockdep_init_trace = { 436static struct stack_trace lockdep_init_trace = {
436 .max_entries = ARRAY_SIZE(lockdep_init_trace_data), 437 .max_entries = ARRAY_SIZE(lockdep_init_trace_data),
@@ -499,36 +500,32 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
499 usage[i] = '\0'; 500 usage[i] = '\0';
500} 501}
501 502
502static int __print_lock_name(struct lock_class *class) 503static void __print_lock_name(struct lock_class *class)
503{ 504{
504 char str[KSYM_NAME_LEN]; 505 char str[KSYM_NAME_LEN];
505 const char *name; 506 const char *name;
506 507
507 name = class->name; 508 name = class->name;
508 if (!name)
509 name = __get_key_name(class->key, str);
510
511 return printk("%s", name);
512}
513
514static void print_lock_name(struct lock_class *class)
515{
516 char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS];
517 const char *name;
518
519 get_usage_chars(class, usage);
520
521 name = class->name;
522 if (!name) { 509 if (!name) {
523 name = __get_key_name(class->key, str); 510 name = __get_key_name(class->key, str);
524 printk(" (%s", name); 511 printk("%s", name);
525 } else { 512 } else {
526 printk(" (%s", name); 513 printk("%s", name);
527 if (class->name_version > 1) 514 if (class->name_version > 1)
528 printk("#%d", class->name_version); 515 printk("#%d", class->name_version);
529 if (class->subclass) 516 if (class->subclass)
530 printk("/%d", class->subclass); 517 printk("/%d", class->subclass);
531 } 518 }
519}
520
521static void print_lock_name(struct lock_class *class)
522{
523 char usage[LOCK_USAGE_CHARS];
524
525 get_usage_chars(class, usage);
526
527 printk(" (");
528 __print_lock_name(class);
532 printk("){%s}", usage); 529 printk("){%s}", usage);
533} 530}
534 531
@@ -568,11 +565,12 @@ static void lockdep_print_held_locks(struct task_struct *curr)
568 } 565 }
569} 566}
570 567
571static void print_kernel_version(void) 568static void print_kernel_ident(void)
572{ 569{
573 printk("%s %.*s\n", init_utsname()->release, 570 printk("%s %.*s %s\n", init_utsname()->release,
574 (int)strcspn(init_utsname()->version, " "), 571 (int)strcspn(init_utsname()->version, " "),
575 init_utsname()->version); 572 init_utsname()->version,
573 print_tainted());
576} 574}
577 575
578static int very_verbose(struct lock_class *class) 576static int very_verbose(struct lock_class *class)
@@ -656,6 +654,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
656 if (unlikely(!lockdep_initialized)) { 654 if (unlikely(!lockdep_initialized)) {
657 lockdep_init(); 655 lockdep_init();
658 lockdep_init_error = 1; 656 lockdep_init_error = 1;
657 lock_init_error = lock->name;
659 save_stack_trace(&lockdep_init_trace); 658 save_stack_trace(&lockdep_init_trace);
660 } 659 }
661#endif 660#endif
@@ -723,7 +722,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
723 722
724 class = look_up_lock_class(lock, subclass); 723 class = look_up_lock_class(lock, subclass);
725 if (likely(class)) 724 if (likely(class))
726 return class; 725 goto out_set_class_cache;
727 726
728 /* 727 /*
729 * Debug-check: all keys must be persistent! 728 * Debug-check: all keys must be persistent!
@@ -808,6 +807,7 @@ out_unlock_set:
808 graph_unlock(); 807 graph_unlock();
809 raw_local_irq_restore(flags); 808 raw_local_irq_restore(flags);
810 809
810out_set_class_cache:
811 if (!subclass || force) 811 if (!subclass || force)
812 lock->class_cache[0] = class; 812 lock->class_cache[0] = class;
813 else if (subclass < NR_LOCKDEP_CACHING_CLASSES) 813 else if (subclass < NR_LOCKDEP_CACHING_CLASSES)
@@ -1149,7 +1149,7 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
1149 printk("\n"); 1149 printk("\n");
1150 printk("======================================================\n"); 1150 printk("======================================================\n");
1151 printk("[ INFO: possible circular locking dependency detected ]\n"); 1151 printk("[ INFO: possible circular locking dependency detected ]\n");
1152 print_kernel_version(); 1152 print_kernel_ident();
1153 printk("-------------------------------------------------------\n"); 1153 printk("-------------------------------------------------------\n");
1154 printk("%s/%d is trying to acquire lock:\n", 1154 printk("%s/%d is trying to acquire lock:\n",
1155 curr->comm, task_pid_nr(curr)); 1155 curr->comm, task_pid_nr(curr));
@@ -1488,7 +1488,7 @@ print_bad_irq_dependency(struct task_struct *curr,
1488 printk("======================================================\n"); 1488 printk("======================================================\n");
1489 printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", 1489 printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
1490 irqclass, irqclass); 1490 irqclass, irqclass);
1491 print_kernel_version(); 1491 print_kernel_ident();
1492 printk("------------------------------------------------------\n"); 1492 printk("------------------------------------------------------\n");
1493 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", 1493 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
1494 curr->comm, task_pid_nr(curr), 1494 curr->comm, task_pid_nr(curr),
@@ -1717,7 +1717,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1717 printk("\n"); 1717 printk("\n");
1718 printk("=============================================\n"); 1718 printk("=============================================\n");
1719 printk("[ INFO: possible recursive locking detected ]\n"); 1719 printk("[ INFO: possible recursive locking detected ]\n");
1720 print_kernel_version(); 1720 print_kernel_ident();
1721 printk("---------------------------------------------\n"); 1721 printk("---------------------------------------------\n");
1722 printk("%s/%d is trying to acquire lock:\n", 1722 printk("%s/%d is trying to acquire lock:\n",
1723 curr->comm, task_pid_nr(curr)); 1723 curr->comm, task_pid_nr(curr));
@@ -2224,7 +2224,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
2224 printk("\n"); 2224 printk("\n");
2225 printk("=================================\n"); 2225 printk("=================================\n");
2226 printk("[ INFO: inconsistent lock state ]\n"); 2226 printk("[ INFO: inconsistent lock state ]\n");
2227 print_kernel_version(); 2227 print_kernel_ident();
2228 printk("---------------------------------\n"); 2228 printk("---------------------------------\n");
2229 2229
2230 printk("inconsistent {%s} -> {%s} usage.\n", 2230 printk("inconsistent {%s} -> {%s} usage.\n",
@@ -2289,7 +2289,7 @@ print_irq_inversion_bug(struct task_struct *curr,
2289 printk("\n"); 2289 printk("\n");
2290 printk("=========================================================\n"); 2290 printk("=========================================================\n");
2291 printk("[ INFO: possible irq lock inversion dependency detected ]\n"); 2291 printk("[ INFO: possible irq lock inversion dependency detected ]\n");
2292 print_kernel_version(); 2292 print_kernel_ident();
2293 printk("---------------------------------------------------------\n"); 2293 printk("---------------------------------------------------------\n");
2294 printk("%s/%d just changed the state of lock:\n", 2294 printk("%s/%d just changed the state of lock:\n",
2295 curr->comm, task_pid_nr(curr)); 2295 curr->comm, task_pid_nr(curr));
@@ -3175,6 +3175,7 @@ print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
3175 printk("\n"); 3175 printk("\n");
3176 printk("=====================================\n"); 3176 printk("=====================================\n");
3177 printk("[ BUG: bad unlock balance detected! ]\n"); 3177 printk("[ BUG: bad unlock balance detected! ]\n");
3178 print_kernel_ident();
3178 printk("-------------------------------------\n"); 3179 printk("-------------------------------------\n");
3179 printk("%s/%d is trying to release lock (", 3180 printk("%s/%d is trying to release lock (",
3180 curr->comm, task_pid_nr(curr)); 3181 curr->comm, task_pid_nr(curr));
@@ -3619,6 +3620,7 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
3619 printk("\n"); 3620 printk("\n");
3620 printk("=================================\n"); 3621 printk("=================================\n");
3621 printk("[ BUG: bad contention detected! ]\n"); 3622 printk("[ BUG: bad contention detected! ]\n");
3623 print_kernel_ident();
3622 printk("---------------------------------\n"); 3624 printk("---------------------------------\n");
3623 printk("%s/%d is trying to contend lock (", 3625 printk("%s/%d is trying to contend lock (",
3624 curr->comm, task_pid_nr(curr)); 3626 curr->comm, task_pid_nr(curr));
@@ -3974,7 +3976,8 @@ void __init lockdep_info(void)
3974 3976
3975#ifdef CONFIG_DEBUG_LOCKDEP 3977#ifdef CONFIG_DEBUG_LOCKDEP
3976 if (lockdep_init_error) { 3978 if (lockdep_init_error) {
3977 printk("WARNING: lockdep init error! Arch code didn't call lockdep_init() early enough?\n"); 3979 printk("WARNING: lockdep init error! lock-%s was acquired"
3980 "before lockdep_init\n", lock_init_error);
3978 printk("Call stack leading to lockdep invocation was:\n"); 3981 printk("Call stack leading to lockdep invocation was:\n");
3979 print_stack_trace(&lockdep_init_trace, 0); 3982 print_stack_trace(&lockdep_init_trace, 0);
3980 } 3983 }
@@ -3993,6 +3996,7 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
3993 printk("\n"); 3996 printk("\n");
3994 printk("=========================\n"); 3997 printk("=========================\n");
3995 printk("[ BUG: held lock freed! ]\n"); 3998 printk("[ BUG: held lock freed! ]\n");
3999 print_kernel_ident();
3996 printk("-------------------------\n"); 4000 printk("-------------------------\n");
3997 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", 4001 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
3998 curr->comm, task_pid_nr(curr), mem_from, mem_to-1); 4002 curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
@@ -4050,6 +4054,7 @@ static void print_held_locks_bug(struct task_struct *curr)
4050 printk("\n"); 4054 printk("\n");
4051 printk("=====================================\n"); 4055 printk("=====================================\n");
4052 printk("[ BUG: lock held at task exit time! ]\n"); 4056 printk("[ BUG: lock held at task exit time! ]\n");
4057 print_kernel_ident();
4053 printk("-------------------------------------\n"); 4058 printk("-------------------------------------\n");
4054 printk("%s/%d is exiting with locks still held!\n", 4059 printk("%s/%d is exiting with locks still held!\n",
4055 curr->comm, task_pid_nr(curr)); 4060 curr->comm, task_pid_nr(curr));
@@ -4147,6 +4152,7 @@ void lockdep_sys_exit(void)
4147 printk("\n"); 4152 printk("\n");
4148 printk("================================================\n"); 4153 printk("================================================\n");
4149 printk("[ BUG: lock held when returning to user space! ]\n"); 4154 printk("[ BUG: lock held when returning to user space! ]\n");
4155 print_kernel_ident();
4150 printk("------------------------------------------------\n"); 4156 printk("------------------------------------------------\n");
4151 printk("%s/%d is leaving the kernel with locks still held!\n", 4157 printk("%s/%d is leaving the kernel with locks still held!\n",
4152 curr->comm, curr->pid); 4158 curr->comm, curr->pid);
@@ -4166,10 +4172,33 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
4166 printk("\n"); 4172 printk("\n");
4167 printk("===============================\n"); 4173 printk("===============================\n");
4168 printk("[ INFO: suspicious RCU usage. ]\n"); 4174 printk("[ INFO: suspicious RCU usage. ]\n");
4175 print_kernel_ident();
4169 printk("-------------------------------\n"); 4176 printk("-------------------------------\n");
4170 printk("%s:%d %s!\n", file, line, s); 4177 printk("%s:%d %s!\n", file, line, s);
4171 printk("\nother info that might help us debug this:\n\n"); 4178 printk("\nother info that might help us debug this:\n\n");
4172 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); 4179 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks);
4180
4181 /*
4182 * If a CPU is in the RCU-free window in idle (ie: in the section
4183 * between rcu_idle_enter() and rcu_idle_exit(), then RCU
4184 * considers that CPU to be in an "extended quiescent state",
4185 * which means that RCU will be completely ignoring that CPU.
4186 * Therefore, rcu_read_lock() and friends have absolutely no
4187 * effect on a CPU running in that state. In other words, even if
4188 * such an RCU-idle CPU has called rcu_read_lock(), RCU might well
4189 * delete data structures out from under it. RCU really has no
4190 * choice here: we need to keep an RCU-free window in idle where
4191 * the CPU may possibly enter into low power mode. This way we can
4192 * notice an extended quiescent state to other CPUs that started a grace
4193 * period. Otherwise we would delay any grace period as long as we run
4194 * in the idle task.
4195 *
4196 * So complain bitterly if someone does call rcu_read_lock(),
4197 * rcu_read_lock_bh() and so on from extended quiescent states.
4198 */
4199 if (rcu_is_cpu_idle())
4200 printk("RCU used illegally from extended quiescent state!\n");
4201
4173 lockdep_print_held_locks(curr); 4202 lockdep_print_held_locks(curr);
4174 printk("\nstack backtrace:\n"); 4203 printk("\nstack backtrace:\n");
4175 dump_stack(); 4204 dump_stack();
diff --git a/kernel/panic.c b/kernel/panic.c
index b26593604214..3458469eb7c3 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -237,11 +237,20 @@ void add_taint(unsigned flag)
237 * Can't trust the integrity of the kernel anymore. 237 * Can't trust the integrity of the kernel anymore.
238 * We don't call directly debug_locks_off() because the issue 238 * We don't call directly debug_locks_off() because the issue
239 * is not necessarily serious enough to set oops_in_progress to 1 239 * is not necessarily serious enough to set oops_in_progress to 1
240 * Also we want to keep up lockdep for staging development and 240 * Also we want to keep up lockdep for staging/out-of-tree
241 * post-warning case. 241 * development and post-warning case.
242 */ 242 */
243 if (flag != TAINT_CRAP && flag != TAINT_WARN && __debug_locks_off()) 243 switch (flag) {
244 printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n"); 244 case TAINT_CRAP:
245 case TAINT_OOT_MODULE:
246 case TAINT_WARN:
247 case TAINT_FIRMWARE_WORKAROUND:
248 break;
249
250 default:
251 if (__debug_locks_off())
252 printk(KERN_WARNING "Disabling lock debugging due to kernel taint\n");
253 }
245 254
246 set_bit(flag, &tainted_mask); 255 set_bit(flag, &tainted_mask);
247} 256}
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index e7cb76dc18f5..125cb67daa21 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -78,7 +78,7 @@ static inline int cpu_time_before(const clockid_t which_clock,
78 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 78 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
79 return now.sched < then.sched; 79 return now.sched < then.sched;
80 } else { 80 } else {
81 return cputime_lt(now.cpu, then.cpu); 81 return now.cpu < then.cpu;
82 } 82 }
83} 83}
84static inline void cpu_time_add(const clockid_t which_clock, 84static inline void cpu_time_add(const clockid_t which_clock,
@@ -88,7 +88,7 @@ static inline void cpu_time_add(const clockid_t which_clock,
88 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 88 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
89 acc->sched += val.sched; 89 acc->sched += val.sched;
90 } else { 90 } else {
91 acc->cpu = cputime_add(acc->cpu, val.cpu); 91 acc->cpu += val.cpu;
92 } 92 }
93} 93}
94static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock, 94static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
@@ -98,25 +98,12 @@ static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
98 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) { 98 if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
99 a.sched -= b.sched; 99 a.sched -= b.sched;
100 } else { 100 } else {
101 a.cpu = cputime_sub(a.cpu, b.cpu); 101 a.cpu -= b.cpu;
102 } 102 }
103 return a; 103 return a;
104} 104}
105 105
106/* 106/*
107 * Divide and limit the result to res >= 1
108 *
109 * This is necessary to prevent signal delivery starvation, when the result of
110 * the division would be rounded down to 0.
111 */
112static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div)
113{
114 cputime_t res = cputime_div(time, div);
115
116 return max_t(cputime_t, res, 1);
117}
118
119/*
120 * Update expiry time from increment, and increase overrun count, 107 * Update expiry time from increment, and increase overrun count,
121 * given the current clock sample. 108 * given the current clock sample.
122 */ 109 */
@@ -148,28 +135,26 @@ static void bump_cpu_timer(struct k_itimer *timer,
148 } else { 135 } else {
149 cputime_t delta, incr; 136 cputime_t delta, incr;
150 137
151 if (cputime_lt(now.cpu, timer->it.cpu.expires.cpu)) 138 if (now.cpu < timer->it.cpu.expires.cpu)
152 return; 139 return;
153 incr = timer->it.cpu.incr.cpu; 140 incr = timer->it.cpu.incr.cpu;
154 delta = cputime_sub(cputime_add(now.cpu, incr), 141 delta = now.cpu + incr - timer->it.cpu.expires.cpu;
155 timer->it.cpu.expires.cpu);
156 /* Don't use (incr*2 < delta), incr*2 might overflow. */ 142 /* Don't use (incr*2 < delta), incr*2 might overflow. */
157 for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++) 143 for (i = 0; incr < delta - incr; i++)
158 incr = cputime_add(incr, incr); 144 incr += incr;
159 for (; i >= 0; incr = cputime_halve(incr), i--) { 145 for (; i >= 0; incr = incr >> 1, i--) {
160 if (cputime_lt(delta, incr)) 146 if (delta < incr)
161 continue; 147 continue;
162 timer->it.cpu.expires.cpu = 148 timer->it.cpu.expires.cpu += incr;
163 cputime_add(timer->it.cpu.expires.cpu, incr);
164 timer->it_overrun += 1 << i; 149 timer->it_overrun += 1 << i;
165 delta = cputime_sub(delta, incr); 150 delta -= incr;
166 } 151 }
167 } 152 }
168} 153}
169 154
170static inline cputime_t prof_ticks(struct task_struct *p) 155static inline cputime_t prof_ticks(struct task_struct *p)
171{ 156{
172 return cputime_add(p->utime, p->stime); 157 return p->utime + p->stime;
173} 158}
174static inline cputime_t virt_ticks(struct task_struct *p) 159static inline cputime_t virt_ticks(struct task_struct *p)
175{ 160{
@@ -248,8 +233,8 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
248 233
249 t = tsk; 234 t = tsk;
250 do { 235 do {
251 times->utime = cputime_add(times->utime, t->utime); 236 times->utime += t->utime;
252 times->stime = cputime_add(times->stime, t->stime); 237 times->stime += t->stime;
253 times->sum_exec_runtime += task_sched_runtime(t); 238 times->sum_exec_runtime += task_sched_runtime(t);
254 } while_each_thread(tsk, t); 239 } while_each_thread(tsk, t);
255out: 240out:
@@ -258,10 +243,10 @@ out:
258 243
259static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b) 244static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
260{ 245{
261 if (cputime_gt(b->utime, a->utime)) 246 if (b->utime > a->utime)
262 a->utime = b->utime; 247 a->utime = b->utime;
263 248
264 if (cputime_gt(b->stime, a->stime)) 249 if (b->stime > a->stime)
265 a->stime = b->stime; 250 a->stime = b->stime;
266 251
267 if (b->sum_exec_runtime > a->sum_exec_runtime) 252 if (b->sum_exec_runtime > a->sum_exec_runtime)
@@ -306,7 +291,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
306 return -EINVAL; 291 return -EINVAL;
307 case CPUCLOCK_PROF: 292 case CPUCLOCK_PROF:
308 thread_group_cputime(p, &cputime); 293 thread_group_cputime(p, &cputime);
309 cpu->cpu = cputime_add(cputime.utime, cputime.stime); 294 cpu->cpu = cputime.utime + cputime.stime;
310 break; 295 break;
311 case CPUCLOCK_VIRT: 296 case CPUCLOCK_VIRT:
312 thread_group_cputime(p, &cputime); 297 thread_group_cputime(p, &cputime);
@@ -470,26 +455,24 @@ static void cleanup_timers(struct list_head *head,
470 unsigned long long sum_exec_runtime) 455 unsigned long long sum_exec_runtime)
471{ 456{
472 struct cpu_timer_list *timer, *next; 457 struct cpu_timer_list *timer, *next;
473 cputime_t ptime = cputime_add(utime, stime); 458 cputime_t ptime = utime + stime;
474 459
475 list_for_each_entry_safe(timer, next, head, entry) { 460 list_for_each_entry_safe(timer, next, head, entry) {
476 list_del_init(&timer->entry); 461 list_del_init(&timer->entry);
477 if (cputime_lt(timer->expires.cpu, ptime)) { 462 if (timer->expires.cpu < ptime) {
478 timer->expires.cpu = cputime_zero; 463 timer->expires.cpu = 0;
479 } else { 464 } else {
480 timer->expires.cpu = cputime_sub(timer->expires.cpu, 465 timer->expires.cpu -= ptime;
481 ptime);
482 } 466 }
483 } 467 }
484 468
485 ++head; 469 ++head;
486 list_for_each_entry_safe(timer, next, head, entry) { 470 list_for_each_entry_safe(timer, next, head, entry) {
487 list_del_init(&timer->entry); 471 list_del_init(&timer->entry);
488 if (cputime_lt(timer->expires.cpu, utime)) { 472 if (timer->expires.cpu < utime) {
489 timer->expires.cpu = cputime_zero; 473 timer->expires.cpu = 0;
490 } else { 474 } else {
491 timer->expires.cpu = cputime_sub(timer->expires.cpu, 475 timer->expires.cpu -= utime;
492 utime);
493 } 476 }
494 } 477 }
495 478
@@ -520,8 +503,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
520 struct signal_struct *const sig = tsk->signal; 503 struct signal_struct *const sig = tsk->signal;
521 504
522 cleanup_timers(tsk->signal->cpu_timers, 505 cleanup_timers(tsk->signal->cpu_timers,
523 cputime_add(tsk->utime, sig->utime), 506 tsk->utime + sig->utime, tsk->stime + sig->stime,
524 cputime_add(tsk->stime, sig->stime),
525 tsk->se.sum_exec_runtime + sig->sum_sched_runtime); 507 tsk->se.sum_exec_runtime + sig->sum_sched_runtime);
526} 508}
527 509
@@ -540,8 +522,7 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
540 522
541static inline int expires_gt(cputime_t expires, cputime_t new_exp) 523static inline int expires_gt(cputime_t expires, cputime_t new_exp)
542{ 524{
543 return cputime_eq(expires, cputime_zero) || 525 return expires == 0 || expires > new_exp;
544 cputime_gt(expires, new_exp);
545} 526}
546 527
547/* 528/*
@@ -651,7 +632,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
651 default: 632 default:
652 return -EINVAL; 633 return -EINVAL;
653 case CPUCLOCK_PROF: 634 case CPUCLOCK_PROF:
654 cpu->cpu = cputime_add(cputime.utime, cputime.stime); 635 cpu->cpu = cputime.utime + cputime.stime;
655 break; 636 break;
656 case CPUCLOCK_VIRT: 637 case CPUCLOCK_VIRT:
657 cpu->cpu = cputime.utime; 638 cpu->cpu = cputime.utime;
@@ -918,12 +899,12 @@ static void check_thread_timers(struct task_struct *tsk,
918 unsigned long soft; 899 unsigned long soft;
919 900
920 maxfire = 20; 901 maxfire = 20;
921 tsk->cputime_expires.prof_exp = cputime_zero; 902 tsk->cputime_expires.prof_exp = 0;
922 while (!list_empty(timers)) { 903 while (!list_empty(timers)) {
923 struct cpu_timer_list *t = list_first_entry(timers, 904 struct cpu_timer_list *t = list_first_entry(timers,
924 struct cpu_timer_list, 905 struct cpu_timer_list,
925 entry); 906 entry);
926 if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { 907 if (!--maxfire || prof_ticks(tsk) < t->expires.cpu) {
927 tsk->cputime_expires.prof_exp = t->expires.cpu; 908 tsk->cputime_expires.prof_exp = t->expires.cpu;
928 break; 909 break;
929 } 910 }
@@ -933,12 +914,12 @@ static void check_thread_timers(struct task_struct *tsk,
933 914
934 ++timers; 915 ++timers;
935 maxfire = 20; 916 maxfire = 20;
936 tsk->cputime_expires.virt_exp = cputime_zero; 917 tsk->cputime_expires.virt_exp = 0;
937 while (!list_empty(timers)) { 918 while (!list_empty(timers)) {
938 struct cpu_timer_list *t = list_first_entry(timers, 919 struct cpu_timer_list *t = list_first_entry(timers,
939 struct cpu_timer_list, 920 struct cpu_timer_list,
940 entry); 921 entry);
941 if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { 922 if (!--maxfire || virt_ticks(tsk) < t->expires.cpu) {
942 tsk->cputime_expires.virt_exp = t->expires.cpu; 923 tsk->cputime_expires.virt_exp = t->expires.cpu;
943 break; 924 break;
944 } 925 }
@@ -1009,20 +990,19 @@ static u32 onecputick;
1009static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it, 990static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1010 cputime_t *expires, cputime_t cur_time, int signo) 991 cputime_t *expires, cputime_t cur_time, int signo)
1011{ 992{
1012 if (cputime_eq(it->expires, cputime_zero)) 993 if (!it->expires)
1013 return; 994 return;
1014 995
1015 if (cputime_ge(cur_time, it->expires)) { 996 if (cur_time >= it->expires) {
1016 if (!cputime_eq(it->incr, cputime_zero)) { 997 if (it->incr) {
1017 it->expires = cputime_add(it->expires, it->incr); 998 it->expires += it->incr;
1018 it->error += it->incr_error; 999 it->error += it->incr_error;
1019 if (it->error >= onecputick) { 1000 if (it->error >= onecputick) {
1020 it->expires = cputime_sub(it->expires, 1001 it->expires -= cputime_one_jiffy;
1021 cputime_one_jiffy);
1022 it->error -= onecputick; 1002 it->error -= onecputick;
1023 } 1003 }
1024 } else { 1004 } else {
1025 it->expires = cputime_zero; 1005 it->expires = 0;
1026 } 1006 }
1027 1007
1028 trace_itimer_expire(signo == SIGPROF ? 1008 trace_itimer_expire(signo == SIGPROF ?
@@ -1031,9 +1011,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1031 __group_send_sig_info(signo, SEND_SIG_PRIV, tsk); 1011 __group_send_sig_info(signo, SEND_SIG_PRIV, tsk);
1032 } 1012 }
1033 1013
1034 if (!cputime_eq(it->expires, cputime_zero) && 1014 if (it->expires && (!*expires || it->expires < *expires)) {
1035 (cputime_eq(*expires, cputime_zero) ||
1036 cputime_lt(it->expires, *expires))) {
1037 *expires = it->expires; 1015 *expires = it->expires;
1038 } 1016 }
1039} 1017}
@@ -1048,9 +1026,7 @@ static void check_cpu_itimer(struct task_struct *tsk, struct cpu_itimer *it,
1048 */ 1026 */
1049static inline int task_cputime_zero(const struct task_cputime *cputime) 1027static inline int task_cputime_zero(const struct task_cputime *cputime)
1050{ 1028{
1051 if (cputime_eq(cputime->utime, cputime_zero) && 1029 if (!cputime->utime && !cputime->stime && !cputime->sum_exec_runtime)
1052 cputime_eq(cputime->stime, cputime_zero) &&
1053 cputime->sum_exec_runtime == 0)
1054 return 1; 1030 return 1;
1055 return 0; 1031 return 0;
1056} 1032}
@@ -1076,15 +1052,15 @@ static void check_process_timers(struct task_struct *tsk,
1076 */ 1052 */
1077 thread_group_cputimer(tsk, &cputime); 1053 thread_group_cputimer(tsk, &cputime);
1078 utime = cputime.utime; 1054 utime = cputime.utime;
1079 ptime = cputime_add(utime, cputime.stime); 1055 ptime = utime + cputime.stime;
1080 sum_sched_runtime = cputime.sum_exec_runtime; 1056 sum_sched_runtime = cputime.sum_exec_runtime;
1081 maxfire = 20; 1057 maxfire = 20;
1082 prof_expires = cputime_zero; 1058 prof_expires = 0;
1083 while (!list_empty(timers)) { 1059 while (!list_empty(timers)) {
1084 struct cpu_timer_list *tl = list_first_entry(timers, 1060 struct cpu_timer_list *tl = list_first_entry(timers,
1085 struct cpu_timer_list, 1061 struct cpu_timer_list,
1086 entry); 1062 entry);
1087 if (!--maxfire || cputime_lt(ptime, tl->expires.cpu)) { 1063 if (!--maxfire || ptime < tl->expires.cpu) {
1088 prof_expires = tl->expires.cpu; 1064 prof_expires = tl->expires.cpu;
1089 break; 1065 break;
1090 } 1066 }
@@ -1094,12 +1070,12 @@ static void check_process_timers(struct task_struct *tsk,
1094 1070
1095 ++timers; 1071 ++timers;
1096 maxfire = 20; 1072 maxfire = 20;
1097 virt_expires = cputime_zero; 1073 virt_expires = 0;
1098 while (!list_empty(timers)) { 1074 while (!list_empty(timers)) {
1099 struct cpu_timer_list *tl = list_first_entry(timers, 1075 struct cpu_timer_list *tl = list_first_entry(timers,
1100 struct cpu_timer_list, 1076 struct cpu_timer_list,
1101 entry); 1077 entry);
1102 if (!--maxfire || cputime_lt(utime, tl->expires.cpu)) { 1078 if (!--maxfire || utime < tl->expires.cpu) {
1103 virt_expires = tl->expires.cpu; 1079 virt_expires = tl->expires.cpu;
1104 break; 1080 break;
1105 } 1081 }
@@ -1154,8 +1130,7 @@ static void check_process_timers(struct task_struct *tsk,
1154 } 1130 }
1155 } 1131 }
1156 x = secs_to_cputime(soft); 1132 x = secs_to_cputime(soft);
1157 if (cputime_eq(prof_expires, cputime_zero) || 1133 if (!prof_expires || x < prof_expires) {
1158 cputime_lt(x, prof_expires)) {
1159 prof_expires = x; 1134 prof_expires = x;
1160 } 1135 }
1161 } 1136 }
@@ -1249,12 +1224,9 @@ out:
1249static inline int task_cputime_expired(const struct task_cputime *sample, 1224static inline int task_cputime_expired(const struct task_cputime *sample,
1250 const struct task_cputime *expires) 1225 const struct task_cputime *expires)
1251{ 1226{
1252 if (!cputime_eq(expires->utime, cputime_zero) && 1227 if (expires->utime && sample->utime >= expires->utime)
1253 cputime_ge(sample->utime, expires->utime))
1254 return 1; 1228 return 1;
1255 if (!cputime_eq(expires->stime, cputime_zero) && 1229 if (expires->stime && sample->utime + sample->stime >= expires->stime)
1256 cputime_ge(cputime_add(sample->utime, sample->stime),
1257 expires->stime))
1258 return 1; 1230 return 1;
1259 if (expires->sum_exec_runtime != 0 && 1231 if (expires->sum_exec_runtime != 0 &&
1260 sample->sum_exec_runtime >= expires->sum_exec_runtime) 1232 sample->sum_exec_runtime >= expires->sum_exec_runtime)
@@ -1389,18 +1361,18 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1389 * it to be relative, *newval argument is relative and we update 1361 * it to be relative, *newval argument is relative and we update
1390 * it to be absolute. 1362 * it to be absolute.
1391 */ 1363 */
1392 if (!cputime_eq(*oldval, cputime_zero)) { 1364 if (*oldval) {
1393 if (cputime_le(*oldval, now.cpu)) { 1365 if (*oldval <= now.cpu) {
1394 /* Just about to fire. */ 1366 /* Just about to fire. */
1395 *oldval = cputime_one_jiffy; 1367 *oldval = cputime_one_jiffy;
1396 } else { 1368 } else {
1397 *oldval = cputime_sub(*oldval, now.cpu); 1369 *oldval -= now.cpu;
1398 } 1370 }
1399 } 1371 }
1400 1372
1401 if (cputime_eq(*newval, cputime_zero)) 1373 if (!*newval)
1402 return; 1374 return;
1403 *newval = cputime_add(*newval, now.cpu); 1375 *newval += now.cpu;
1404 } 1376 }
1405 1377
1406 /* 1378 /*
diff --git a/kernel/printk.c b/kernel/printk.c
index 7982a0a841ea..989e4a52da76 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -199,7 +199,7 @@ void __init setup_log_buf(int early)
199 unsigned long mem; 199 unsigned long mem;
200 200
201 mem = memblock_alloc(new_log_buf_len, PAGE_SIZE); 201 mem = memblock_alloc(new_log_buf_len, PAGE_SIZE);
202 if (mem == MEMBLOCK_ERROR) 202 if (!mem)
203 return; 203 return;
204 new_log_buf = __va(mem); 204 new_log_buf = __va(mem);
205 } else { 205 } else {
@@ -688,6 +688,7 @@ static void zap_locks(void)
688 688
689 oops_timestamp = jiffies; 689 oops_timestamp = jiffies;
690 690
691 debug_locks_off();
691 /* If a crash is occurring, make sure we can't deadlock */ 692 /* If a crash is occurring, make sure we can't deadlock */
692 raw_spin_lock_init(&logbuf_lock); 693 raw_spin_lock_init(&logbuf_lock);
693 /* And make sure that we print immediately */ 694 /* And make sure that we print immediately */
@@ -840,9 +841,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
840 boot_delay_msec(); 841 boot_delay_msec();
841 printk_delay(); 842 printk_delay();
842 843
843 preempt_disable();
844 /* This stops the holder of console_sem just where we want him */ 844 /* This stops the holder of console_sem just where we want him */
845 raw_local_irq_save(flags); 845 local_irq_save(flags);
846 this_cpu = smp_processor_id(); 846 this_cpu = smp_processor_id();
847 847
848 /* 848 /*
@@ -856,7 +856,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
856 * recursion and return - but flag the recursion so that 856 * recursion and return - but flag the recursion so that
857 * it can be printed at the next appropriate moment: 857 * it can be printed at the next appropriate moment:
858 */ 858 */
859 if (!oops_in_progress) { 859 if (!oops_in_progress && !lockdep_recursing(current)) {
860 recursion_bug = 1; 860 recursion_bug = 1;
861 goto out_restore_irqs; 861 goto out_restore_irqs;
862 } 862 }
@@ -962,9 +962,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
962 962
963 lockdep_on(); 963 lockdep_on();
964out_restore_irqs: 964out_restore_irqs:
965 raw_local_irq_restore(flags); 965 local_irq_restore(flags);
966 966
967 preempt_enable();
968 return printed_len; 967 return printed_len;
969} 968}
970EXPORT_SYMBOL(printk); 969EXPORT_SYMBOL(printk);
diff --git a/kernel/rcu.h b/kernel/rcu.h
index f600868d550d..aa88baab5f78 100644
--- a/kernel/rcu.h
+++ b/kernel/rcu.h
@@ -30,6 +30,13 @@
30#endif /* #else #ifdef CONFIG_RCU_TRACE */ 30#endif /* #else #ifdef CONFIG_RCU_TRACE */
31 31
32/* 32/*
33 * Process-level increment to ->dynticks_nesting field. This allows for
34 * architectures that use half-interrupts and half-exceptions from
35 * process context.
36 */
37#define DYNTICK_TASK_NESTING (LLONG_MAX / 2 - 1)
38
39/*
33 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally 40 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
34 * by call_rcu() and rcu callback execution, and are therefore not part of the 41 * by call_rcu() and rcu callback execution, and are therefore not part of the
35 * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors. 42 * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors.
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index c5b98e565aee..2bc4e135ff23 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -93,6 +93,8 @@ int rcu_read_lock_bh_held(void)
93{ 93{
94 if (!debug_lockdep_rcu_enabled()) 94 if (!debug_lockdep_rcu_enabled())
95 return 1; 95 return 1;
96 if (rcu_is_cpu_idle())
97 return 0;
96 return in_softirq() || irqs_disabled(); 98 return in_softirq() || irqs_disabled();
97} 99}
98EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); 100EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
@@ -316,3 +318,13 @@ struct debug_obj_descr rcuhead_debug_descr = {
316}; 318};
317EXPORT_SYMBOL_GPL(rcuhead_debug_descr); 319EXPORT_SYMBOL_GPL(rcuhead_debug_descr);
318#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ 320#endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */
321
322#if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE)
323void do_trace_rcu_torture_read(char *rcutorturename, struct rcu_head *rhp)
324{
325 trace_rcu_torture_read(rcutorturename, rhp);
326}
327EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read);
328#else
329#define do_trace_rcu_torture_read(rcutorturename, rhp) do { } while (0)
330#endif
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 636af6d9c6e5..977296dca0a4 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -53,31 +53,137 @@ static void __call_rcu(struct rcu_head *head,
53 53
54#include "rcutiny_plugin.h" 54#include "rcutiny_plugin.h"
55 55
56#ifdef CONFIG_NO_HZ 56static long long rcu_dynticks_nesting = DYNTICK_TASK_NESTING;
57 57
58static long rcu_dynticks_nesting = 1; 58/* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcutree.c. */
59static void rcu_idle_enter_common(long long oldval)
60{
61 if (rcu_dynticks_nesting) {
62 RCU_TRACE(trace_rcu_dyntick("--=",
63 oldval, rcu_dynticks_nesting));
64 return;
65 }
66 RCU_TRACE(trace_rcu_dyntick("Start", oldval, rcu_dynticks_nesting));
67 if (!is_idle_task(current)) {
68 struct task_struct *idle = idle_task(smp_processor_id());
69
70 RCU_TRACE(trace_rcu_dyntick("Error on entry: not idle task",
71 oldval, rcu_dynticks_nesting));
72 ftrace_dump(DUMP_ALL);
73 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
74 current->pid, current->comm,
75 idle->pid, idle->comm); /* must be idle task! */
76 }
77 rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */
78}
59 79
60/* 80/*
61 * Enter dynticks-idle mode, which is an extended quiescent state 81 * Enter idle, which is an extended quiescent state if we have fully
62 * if we have fully entered that mode (i.e., if the new value of 82 * entered that mode (i.e., if the new value of dynticks_nesting is zero).
63 * dynticks_nesting is zero).
64 */ 83 */
65void rcu_enter_nohz(void) 84void rcu_idle_enter(void)
66{ 85{
67 if (--rcu_dynticks_nesting == 0) 86 unsigned long flags;
68 rcu_sched_qs(0); /* implies rcu_bh_qsctr_inc(0) */ 87 long long oldval;
88
89 local_irq_save(flags);
90 oldval = rcu_dynticks_nesting;
91 rcu_dynticks_nesting = 0;
92 rcu_idle_enter_common(oldval);
93 local_irq_restore(flags);
69} 94}
70 95
71/* 96/*
72 * Exit dynticks-idle mode, so that we are no longer in an extended 97 * Exit an interrupt handler towards idle.
73 * quiescent state.
74 */ 98 */
75void rcu_exit_nohz(void) 99void rcu_irq_exit(void)
100{
101 unsigned long flags;
102 long long oldval;
103
104 local_irq_save(flags);
105 oldval = rcu_dynticks_nesting;
106 rcu_dynticks_nesting--;
107 WARN_ON_ONCE(rcu_dynticks_nesting < 0);
108 rcu_idle_enter_common(oldval);
109 local_irq_restore(flags);
110}
111
112/* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcutree.c. */
113static void rcu_idle_exit_common(long long oldval)
76{ 114{
115 if (oldval) {
116 RCU_TRACE(trace_rcu_dyntick("++=",
117 oldval, rcu_dynticks_nesting));
118 return;
119 }
120 RCU_TRACE(trace_rcu_dyntick("End", oldval, rcu_dynticks_nesting));
121 if (!is_idle_task(current)) {
122 struct task_struct *idle = idle_task(smp_processor_id());
123
124 RCU_TRACE(trace_rcu_dyntick("Error on exit: not idle task",
125 oldval, rcu_dynticks_nesting));
126 ftrace_dump(DUMP_ALL);
127 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
128 current->pid, current->comm,
129 idle->pid, idle->comm); /* must be idle task! */
130 }
131}
132
133/*
134 * Exit idle, so that we are no longer in an extended quiescent state.
135 */
136void rcu_idle_exit(void)
137{
138 unsigned long flags;
139 long long oldval;
140
141 local_irq_save(flags);
142 oldval = rcu_dynticks_nesting;
143 WARN_ON_ONCE(oldval != 0);
144 rcu_dynticks_nesting = DYNTICK_TASK_NESTING;
145 rcu_idle_exit_common(oldval);
146 local_irq_restore(flags);
147}
148
149/*
150 * Enter an interrupt handler, moving away from idle.
151 */
152void rcu_irq_enter(void)
153{
154 unsigned long flags;
155 long long oldval;
156
157 local_irq_save(flags);
158 oldval = rcu_dynticks_nesting;
77 rcu_dynticks_nesting++; 159 rcu_dynticks_nesting++;
160 WARN_ON_ONCE(rcu_dynticks_nesting == 0);
161 rcu_idle_exit_common(oldval);
162 local_irq_restore(flags);
163}
164
165#ifdef CONFIG_PROVE_RCU
166
167/*
168 * Test whether RCU thinks that the current CPU is idle.
169 */
170int rcu_is_cpu_idle(void)
171{
172 return !rcu_dynticks_nesting;
78} 173}
174EXPORT_SYMBOL(rcu_is_cpu_idle);
175
176#endif /* #ifdef CONFIG_PROVE_RCU */
79 177
80#endif /* #ifdef CONFIG_NO_HZ */ 178/*
179 * Test whether the current CPU was interrupted from idle. Nested
180 * interrupts don't count, we must be running at the first interrupt
181 * level.
182 */
183int rcu_is_cpu_rrupt_from_idle(void)
184{
185 return rcu_dynticks_nesting <= 0;
186}
81 187
82/* 188/*
83 * Helper function for rcu_sched_qs() and rcu_bh_qs(). 189 * Helper function for rcu_sched_qs() and rcu_bh_qs().
@@ -126,14 +232,13 @@ void rcu_bh_qs(int cpu)
126 232
127/* 233/*
128 * Check to see if the scheduling-clock interrupt came from an extended 234 * Check to see if the scheduling-clock interrupt came from an extended
129 * quiescent state, and, if so, tell RCU about it. 235 * quiescent state, and, if so, tell RCU about it. This function must
236 * be called from hardirq context. It is normally called from the
237 * scheduling-clock interrupt.
130 */ 238 */
131void rcu_check_callbacks(int cpu, int user) 239void rcu_check_callbacks(int cpu, int user)
132{ 240{
133 if (user || 241 if (user || rcu_is_cpu_rrupt_from_idle())
134 (idle_cpu(cpu) &&
135 !in_softirq() &&
136 hardirq_count() <= (1 << HARDIRQ_SHIFT)))
137 rcu_sched_qs(cpu); 242 rcu_sched_qs(cpu);
138 else if (!in_softirq()) 243 else if (!in_softirq())
139 rcu_bh_qs(cpu); 244 rcu_bh_qs(cpu);
@@ -154,7 +259,11 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
154 /* If no RCU callbacks ready to invoke, just return. */ 259 /* If no RCU callbacks ready to invoke, just return. */
155 if (&rcp->rcucblist == rcp->donetail) { 260 if (&rcp->rcucblist == rcp->donetail) {
156 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1)); 261 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1));
157 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0)); 262 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0,
263 ACCESS_ONCE(rcp->rcucblist),
264 need_resched(),
265 is_idle_task(current),
266 rcu_is_callbacks_kthread()));
158 return; 267 return;
159 } 268 }
160 269
@@ -183,7 +292,9 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
183 RCU_TRACE(cb_count++); 292 RCU_TRACE(cb_count++);
184 } 293 }
185 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); 294 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
186 RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count)); 295 RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(),
296 is_idle_task(current),
297 rcu_is_callbacks_kthread()));
187} 298}
188 299
189static void rcu_process_callbacks(struct softirq_action *unused) 300static void rcu_process_callbacks(struct softirq_action *unused)
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 2b0484a5dc28..9cb1ae4aabdd 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -312,8 +312,8 @@ static int rcu_boost(void)
312 rt_mutex_lock(&mtx); 312 rt_mutex_lock(&mtx);
313 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ 313 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
314 314
315 return rcu_preempt_ctrlblk.boost_tasks != NULL || 315 return ACCESS_ONCE(rcu_preempt_ctrlblk.boost_tasks) != NULL ||
316 rcu_preempt_ctrlblk.exp_tasks != NULL; 316 ACCESS_ONCE(rcu_preempt_ctrlblk.exp_tasks) != NULL;
317} 317}
318 318
319/* 319/*
@@ -885,6 +885,19 @@ static void invoke_rcu_callbacks(void)
885 wake_up(&rcu_kthread_wq); 885 wake_up(&rcu_kthread_wq);
886} 886}
887 887
888#ifdef CONFIG_RCU_TRACE
889
890/*
891 * Is the current CPU running the RCU-callbacks kthread?
892 * Caller must have preemption disabled.
893 */
894static bool rcu_is_callbacks_kthread(void)
895{
896 return rcu_kthread_task == current;
897}
898
899#endif /* #ifdef CONFIG_RCU_TRACE */
900
888/* 901/*
889 * This kthread invokes RCU callbacks whose grace periods have 902 * This kthread invokes RCU callbacks whose grace periods have
890 * elapsed. It is awakened as needed, and takes the place of the 903 * elapsed. It is awakened as needed, and takes the place of the
@@ -938,6 +951,18 @@ void invoke_rcu_callbacks(void)
938 raise_softirq(RCU_SOFTIRQ); 951 raise_softirq(RCU_SOFTIRQ);
939} 952}
940 953
954#ifdef CONFIG_RCU_TRACE
955
956/*
957 * There is no callback kthread, so this thread is never it.
958 */
959static bool rcu_is_callbacks_kthread(void)
960{
961 return false;
962}
963
964#endif /* #ifdef CONFIG_RCU_TRACE */
965
941void rcu_init(void) 966void rcu_init(void)
942{ 967{
943 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 968 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 764825c2685c..88f17b8a3b1d 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -61,9 +61,11 @@ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ 61static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
62static int stutter = 5; /* Start/stop testing interval (in sec) */ 62static int stutter = 5; /* Start/stop testing interval (in sec) */
63static int irqreader = 1; /* RCU readers from irq (timers). */ 63static int irqreader = 1; /* RCU readers from irq (timers). */
64static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ 64static int fqs_duration; /* Duration of bursts (us), 0 to disable. */
65static int fqs_holdoff = 0; /* Hold time within burst (us). */ 65static int fqs_holdoff; /* Hold time within burst (us). */
66static int fqs_stutter = 3; /* Wait time between bursts (s). */ 66static int fqs_stutter = 3; /* Wait time between bursts (s). */
67static int onoff_interval; /* Wait time between CPU hotplugs, 0=disable. */
68static int shutdown_secs; /* Shutdown time (s). <=0 for no shutdown. */
67static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */ 69static int test_boost = 1; /* Test RCU prio boost: 0=no, 1=maybe, 2=yes. */
68static int test_boost_interval = 7; /* Interval between boost tests, seconds. */ 70static int test_boost_interval = 7; /* Interval between boost tests, seconds. */
69static int test_boost_duration = 4; /* Duration of each boost test, seconds. */ 71static int test_boost_duration = 4; /* Duration of each boost test, seconds. */
@@ -91,6 +93,10 @@ module_param(fqs_holdoff, int, 0444);
91MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); 93MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)");
92module_param(fqs_stutter, int, 0444); 94module_param(fqs_stutter, int, 0444);
93MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); 95MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)");
96module_param(onoff_interval, int, 0444);
97MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable");
98module_param(shutdown_secs, int, 0444);
99MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), zero to disable.");
94module_param(test_boost, int, 0444); 100module_param(test_boost, int, 0444);
95MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); 101MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes.");
96module_param(test_boost_interval, int, 0444); 102module_param(test_boost_interval, int, 0444);
@@ -119,6 +125,10 @@ static struct task_struct *shuffler_task;
119static struct task_struct *stutter_task; 125static struct task_struct *stutter_task;
120static struct task_struct *fqs_task; 126static struct task_struct *fqs_task;
121static struct task_struct *boost_tasks[NR_CPUS]; 127static struct task_struct *boost_tasks[NR_CPUS];
128static struct task_struct *shutdown_task;
129#ifdef CONFIG_HOTPLUG_CPU
130static struct task_struct *onoff_task;
131#endif /* #ifdef CONFIG_HOTPLUG_CPU */
122 132
123#define RCU_TORTURE_PIPE_LEN 10 133#define RCU_TORTURE_PIPE_LEN 10
124 134
@@ -149,6 +159,10 @@ static long n_rcu_torture_boost_rterror;
149static long n_rcu_torture_boost_failure; 159static long n_rcu_torture_boost_failure;
150static long n_rcu_torture_boosts; 160static long n_rcu_torture_boosts;
151static long n_rcu_torture_timers; 161static long n_rcu_torture_timers;
162static long n_offline_attempts;
163static long n_offline_successes;
164static long n_online_attempts;
165static long n_online_successes;
152static struct list_head rcu_torture_removed; 166static struct list_head rcu_torture_removed;
153static cpumask_var_t shuffle_tmp_mask; 167static cpumask_var_t shuffle_tmp_mask;
154 168
@@ -160,6 +174,8 @@ static int stutter_pause_test;
160#define RCUTORTURE_RUNNABLE_INIT 0 174#define RCUTORTURE_RUNNABLE_INIT 0
161#endif 175#endif
162int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; 176int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
177module_param(rcutorture_runnable, int, 0444);
178MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot");
163 179
164#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) 180#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
165#define rcu_can_boost() 1 181#define rcu_can_boost() 1
@@ -167,6 +183,7 @@ int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
167#define rcu_can_boost() 0 183#define rcu_can_boost() 0
168#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ 184#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
169 185
186static unsigned long shutdown_time; /* jiffies to system shutdown. */
170static unsigned long boost_starttime; /* jiffies of next boost test start. */ 187static unsigned long boost_starttime; /* jiffies of next boost test start. */
171DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ 188DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
172 /* and boost task create/destroy. */ 189 /* and boost task create/destroy. */
@@ -182,6 +199,9 @@ static int fullstop = FULLSTOP_RMMOD;
182 */ 199 */
183static DEFINE_MUTEX(fullstop_mutex); 200static DEFINE_MUTEX(fullstop_mutex);
184 201
202/* Forward reference. */
203static void rcu_torture_cleanup(void);
204
185/* 205/*
186 * Detect and respond to a system shutdown. 206 * Detect and respond to a system shutdown.
187 */ 207 */
@@ -612,6 +632,30 @@ static struct rcu_torture_ops srcu_ops = {
612 .name = "srcu" 632 .name = "srcu"
613}; 633};
614 634
635static int srcu_torture_read_lock_raw(void) __acquires(&srcu_ctl)
636{
637 return srcu_read_lock_raw(&srcu_ctl);
638}
639
640static void srcu_torture_read_unlock_raw(int idx) __releases(&srcu_ctl)
641{
642 srcu_read_unlock_raw(&srcu_ctl, idx);
643}
644
645static struct rcu_torture_ops srcu_raw_ops = {
646 .init = srcu_torture_init,
647 .cleanup = srcu_torture_cleanup,
648 .readlock = srcu_torture_read_lock_raw,
649 .read_delay = srcu_read_delay,
650 .readunlock = srcu_torture_read_unlock_raw,
651 .completed = srcu_torture_completed,
652 .deferred_free = rcu_sync_torture_deferred_free,
653 .sync = srcu_torture_synchronize,
654 .cb_barrier = NULL,
655 .stats = srcu_torture_stats,
656 .name = "srcu_raw"
657};
658
615static void srcu_torture_synchronize_expedited(void) 659static void srcu_torture_synchronize_expedited(void)
616{ 660{
617 synchronize_srcu_expedited(&srcu_ctl); 661 synchronize_srcu_expedited(&srcu_ctl);
@@ -913,6 +957,18 @@ rcu_torture_fakewriter(void *arg)
913 return 0; 957 return 0;
914} 958}
915 959
960void rcutorture_trace_dump(void)
961{
962 static atomic_t beenhere = ATOMIC_INIT(0);
963
964 if (atomic_read(&beenhere))
965 return;
966 if (atomic_xchg(&beenhere, 1) != 0)
967 return;
968 do_trace_rcu_torture_read(cur_ops->name, (struct rcu_head *)~0UL);
969 ftrace_dump(DUMP_ALL);
970}
971
916/* 972/*
917 * RCU torture reader from timer handler. Dereferences rcu_torture_current, 973 * RCU torture reader from timer handler. Dereferences rcu_torture_current,
918 * incrementing the corresponding element of the pipeline array. The 974 * incrementing the corresponding element of the pipeline array. The
@@ -934,6 +990,7 @@ static void rcu_torture_timer(unsigned long unused)
934 rcu_read_lock_bh_held() || 990 rcu_read_lock_bh_held() ||
935 rcu_read_lock_sched_held() || 991 rcu_read_lock_sched_held() ||
936 srcu_read_lock_held(&srcu_ctl)); 992 srcu_read_lock_held(&srcu_ctl));
993 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
937 if (p == NULL) { 994 if (p == NULL) {
938 /* Leave because rcu_torture_writer is not yet underway */ 995 /* Leave because rcu_torture_writer is not yet underway */
939 cur_ops->readunlock(idx); 996 cur_ops->readunlock(idx);
@@ -951,6 +1008,8 @@ static void rcu_torture_timer(unsigned long unused)
951 /* Should not happen, but... */ 1008 /* Should not happen, but... */
952 pipe_count = RCU_TORTURE_PIPE_LEN; 1009 pipe_count = RCU_TORTURE_PIPE_LEN;
953 } 1010 }
1011 if (pipe_count > 1)
1012 rcutorture_trace_dump();
954 __this_cpu_inc(rcu_torture_count[pipe_count]); 1013 __this_cpu_inc(rcu_torture_count[pipe_count]);
955 completed = cur_ops->completed() - completed; 1014 completed = cur_ops->completed() - completed;
956 if (completed > RCU_TORTURE_PIPE_LEN) { 1015 if (completed > RCU_TORTURE_PIPE_LEN) {
@@ -994,6 +1053,7 @@ rcu_torture_reader(void *arg)
994 rcu_read_lock_bh_held() || 1053 rcu_read_lock_bh_held() ||
995 rcu_read_lock_sched_held() || 1054 rcu_read_lock_sched_held() ||
996 srcu_read_lock_held(&srcu_ctl)); 1055 srcu_read_lock_held(&srcu_ctl));
1056 do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu);
997 if (p == NULL) { 1057 if (p == NULL) {
998 /* Wait for rcu_torture_writer to get underway */ 1058 /* Wait for rcu_torture_writer to get underway */
999 cur_ops->readunlock(idx); 1059 cur_ops->readunlock(idx);
@@ -1009,6 +1069,8 @@ rcu_torture_reader(void *arg)
1009 /* Should not happen, but... */ 1069 /* Should not happen, but... */
1010 pipe_count = RCU_TORTURE_PIPE_LEN; 1070 pipe_count = RCU_TORTURE_PIPE_LEN;
1011 } 1071 }
1072 if (pipe_count > 1)
1073 rcutorture_trace_dump();
1012 __this_cpu_inc(rcu_torture_count[pipe_count]); 1074 __this_cpu_inc(rcu_torture_count[pipe_count]);
1013 completed = cur_ops->completed() - completed; 1075 completed = cur_ops->completed() - completed;
1014 if (completed > RCU_TORTURE_PIPE_LEN) { 1076 if (completed > RCU_TORTURE_PIPE_LEN) {
@@ -1056,7 +1118,8 @@ rcu_torture_printk(char *page)
1056 cnt += sprintf(&page[cnt], 1118 cnt += sprintf(&page[cnt],
1057 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d " 1119 "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
1058 "rtmbe: %d rtbke: %ld rtbre: %ld " 1120 "rtmbe: %d rtbke: %ld rtbre: %ld "
1059 "rtbf: %ld rtb: %ld nt: %ld", 1121 "rtbf: %ld rtb: %ld nt: %ld "
1122 "onoff: %ld/%ld:%ld/%ld",
1060 rcu_torture_current, 1123 rcu_torture_current,
1061 rcu_torture_current_version, 1124 rcu_torture_current_version,
1062 list_empty(&rcu_torture_freelist), 1125 list_empty(&rcu_torture_freelist),
@@ -1068,7 +1131,11 @@ rcu_torture_printk(char *page)
1068 n_rcu_torture_boost_rterror, 1131 n_rcu_torture_boost_rterror,
1069 n_rcu_torture_boost_failure, 1132 n_rcu_torture_boost_failure,
1070 n_rcu_torture_boosts, 1133 n_rcu_torture_boosts,
1071 n_rcu_torture_timers); 1134 n_rcu_torture_timers,
1135 n_online_successes,
1136 n_online_attempts,
1137 n_offline_successes,
1138 n_offline_attempts);
1072 if (atomic_read(&n_rcu_torture_mberror) != 0 || 1139 if (atomic_read(&n_rcu_torture_mberror) != 0 ||
1073 n_rcu_torture_boost_ktrerror != 0 || 1140 n_rcu_torture_boost_ktrerror != 0 ||
1074 n_rcu_torture_boost_rterror != 0 || 1141 n_rcu_torture_boost_rterror != 0 ||
@@ -1232,12 +1299,14 @@ rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, char *tag)
1232 "shuffle_interval=%d stutter=%d irqreader=%d " 1299 "shuffle_interval=%d stutter=%d irqreader=%d "
1233 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " 1300 "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d "
1234 "test_boost=%d/%d test_boost_interval=%d " 1301 "test_boost=%d/%d test_boost_interval=%d "
1235 "test_boost_duration=%d\n", 1302 "test_boost_duration=%d shutdown_secs=%d "
1303 "onoff_interval=%d\n",
1236 torture_type, tag, nrealreaders, nfakewriters, 1304 torture_type, tag, nrealreaders, nfakewriters,
1237 stat_interval, verbose, test_no_idle_hz, shuffle_interval, 1305 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
1238 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, 1306 stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter,
1239 test_boost, cur_ops->can_boost, 1307 test_boost, cur_ops->can_boost,
1240 test_boost_interval, test_boost_duration); 1308 test_boost_interval, test_boost_duration, shutdown_secs,
1309 onoff_interval);
1241} 1310}
1242 1311
1243static struct notifier_block rcutorture_shutdown_nb = { 1312static struct notifier_block rcutorture_shutdown_nb = {
@@ -1287,6 +1356,131 @@ static int rcutorture_booster_init(int cpu)
1287 return 0; 1356 return 0;
1288} 1357}
1289 1358
1359/*
1360 * Cause the rcutorture test to shutdown the system after the test has
1361 * run for the time specified by the shutdown_secs module parameter.
1362 */
1363static int
1364rcu_torture_shutdown(void *arg)
1365{
1366 long delta;
1367 unsigned long jiffies_snap;
1368
1369 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started");
1370 jiffies_snap = ACCESS_ONCE(jiffies);
1371 while (ULONG_CMP_LT(jiffies_snap, shutdown_time) &&
1372 !kthread_should_stop()) {
1373 delta = shutdown_time - jiffies_snap;
1374 if (verbose)
1375 printk(KERN_ALERT "%s" TORTURE_FLAG
1376 "rcu_torture_shutdown task: %lu "
1377 "jiffies remaining\n",
1378 torture_type, delta);
1379 schedule_timeout_interruptible(delta);
1380 jiffies_snap = ACCESS_ONCE(jiffies);
1381 }
1382 if (kthread_should_stop()) {
1383 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping");
1384 return 0;
1385 }
1386
1387 /* OK, shut down the system. */
1388
1389 VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system");
1390 shutdown_task = NULL; /* Avoid self-kill deadlock. */
1391 rcu_torture_cleanup(); /* Get the success/failure message. */
1392 kernel_power_off(); /* Shut down the system. */
1393 return 0;
1394}
1395
1396#ifdef CONFIG_HOTPLUG_CPU
1397
1398/*
1399 * Execute random CPU-hotplug operations at the interval specified
1400 * by the onoff_interval.
1401 */
1402static int
1403rcu_torture_onoff(void *arg)
1404{
1405 int cpu;
1406 int maxcpu = -1;
1407 DEFINE_RCU_RANDOM(rand);
1408
1409 VERBOSE_PRINTK_STRING("rcu_torture_onoff task started");
1410 for_each_online_cpu(cpu)
1411 maxcpu = cpu;
1412 WARN_ON(maxcpu < 0);
1413 while (!kthread_should_stop()) {
1414 cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1);
1415 if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) {
1416 if (verbose)
1417 printk(KERN_ALERT "%s" TORTURE_FLAG
1418 "rcu_torture_onoff task: offlining %d\n",
1419 torture_type, cpu);
1420 n_offline_attempts++;
1421 if (cpu_down(cpu) == 0) {
1422 if (verbose)
1423 printk(KERN_ALERT "%s" TORTURE_FLAG
1424 "rcu_torture_onoff task: "
1425 "offlined %d\n",
1426 torture_type, cpu);
1427 n_offline_successes++;
1428 }
1429 } else if (cpu_is_hotpluggable(cpu)) {
1430 if (verbose)
1431 printk(KERN_ALERT "%s" TORTURE_FLAG
1432 "rcu_torture_onoff task: onlining %d\n",
1433 torture_type, cpu);
1434 n_online_attempts++;
1435 if (cpu_up(cpu) == 0) {
1436 if (verbose)
1437 printk(KERN_ALERT "%s" TORTURE_FLAG
1438 "rcu_torture_onoff task: "
1439 "onlined %d\n",
1440 torture_type, cpu);
1441 n_online_successes++;
1442 }
1443 }
1444 schedule_timeout_interruptible(onoff_interval * HZ);
1445 }
1446 VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping");
1447 return 0;
1448}
1449
1450static int
1451rcu_torture_onoff_init(void)
1452{
1453 if (onoff_interval <= 0)
1454 return 0;
1455 onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff");
1456 if (IS_ERR(onoff_task)) {
1457 onoff_task = NULL;
1458 return PTR_ERR(onoff_task);
1459 }
1460 return 0;
1461}
1462
1463static void rcu_torture_onoff_cleanup(void)
1464{
1465 if (onoff_task == NULL)
1466 return;
1467 VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task");
1468 kthread_stop(onoff_task);
1469}
1470
1471#else /* #ifdef CONFIG_HOTPLUG_CPU */
1472
1473static void
1474rcu_torture_onoff_init(void)
1475{
1476}
1477
1478static void rcu_torture_onoff_cleanup(void)
1479{
1480}
1481
1482#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
1483
1290static int rcutorture_cpu_notify(struct notifier_block *self, 1484static int rcutorture_cpu_notify(struct notifier_block *self,
1291 unsigned long action, void *hcpu) 1485 unsigned long action, void *hcpu)
1292{ 1486{
@@ -1391,6 +1585,11 @@ rcu_torture_cleanup(void)
1391 for_each_possible_cpu(i) 1585 for_each_possible_cpu(i)
1392 rcutorture_booster_cleanup(i); 1586 rcutorture_booster_cleanup(i);
1393 } 1587 }
1588 if (shutdown_task != NULL) {
1589 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task");
1590 kthread_stop(shutdown_task);
1591 }
1592 rcu_torture_onoff_cleanup();
1394 1593
1395 /* Wait for all RCU callbacks to fire. */ 1594 /* Wait for all RCU callbacks to fire. */
1396 1595
@@ -1416,7 +1615,7 @@ rcu_torture_init(void)
1416 static struct rcu_torture_ops *torture_ops[] = 1615 static struct rcu_torture_ops *torture_ops[] =
1417 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1616 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1418 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops, 1617 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
1419 &srcu_ops, &srcu_expedited_ops, 1618 &srcu_ops, &srcu_raw_ops, &srcu_expedited_ops,
1420 &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; 1619 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1421 1620
1422 mutex_lock(&fullstop_mutex); 1621 mutex_lock(&fullstop_mutex);
@@ -1607,6 +1806,18 @@ rcu_torture_init(void)
1607 } 1806 }
1608 } 1807 }
1609 } 1808 }
1809 if (shutdown_secs > 0) {
1810 shutdown_time = jiffies + shutdown_secs * HZ;
1811 shutdown_task = kthread_run(rcu_torture_shutdown, NULL,
1812 "rcu_torture_shutdown");
1813 if (IS_ERR(shutdown_task)) {
1814 firsterr = PTR_ERR(shutdown_task);
1815 VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown");
1816 shutdown_task = NULL;
1817 goto unwind;
1818 }
1819 }
1820 rcu_torture_onoff_init();
1610 register_reboot_notifier(&rcutorture_shutdown_nb); 1821 register_reboot_notifier(&rcutorture_shutdown_nb);
1611 rcutorture_record_test_transition(); 1822 rcutorture_record_test_transition();
1612 mutex_unlock(&fullstop_mutex); 1823 mutex_unlock(&fullstop_mutex);
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 6b76d812740c..6c4a6722abfd 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -69,7 +69,7 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
69 NUM_RCU_LVL_3, \ 69 NUM_RCU_LVL_3, \
70 NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \ 70 NUM_RCU_LVL_4, /* == MAX_RCU_LVLS */ \
71 }, \ 71 }, \
72 .signaled = RCU_GP_IDLE, \ 72 .fqs_state = RCU_GP_IDLE, \
73 .gpnum = -300, \ 73 .gpnum = -300, \
74 .completed = -300, \ 74 .completed = -300, \
75 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \ 75 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
@@ -195,12 +195,10 @@ void rcu_note_context_switch(int cpu)
195} 195}
196EXPORT_SYMBOL_GPL(rcu_note_context_switch); 196EXPORT_SYMBOL_GPL(rcu_note_context_switch);
197 197
198#ifdef CONFIG_NO_HZ
199DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { 198DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
200 .dynticks_nesting = 1, 199 .dynticks_nesting = DYNTICK_TASK_NESTING,
201 .dynticks = ATOMIC_INIT(1), 200 .dynticks = ATOMIC_INIT(1),
202}; 201};
203#endif /* #ifdef CONFIG_NO_HZ */
204 202
205static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 203static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */
206static int qhimark = 10000; /* If this many pending, ignore blimit. */ 204static int qhimark = 10000; /* If this many pending, ignore blimit. */
@@ -328,11 +326,11 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
328 return 1; 326 return 1;
329 } 327 }
330 328
331 /* If preemptible RCU, no point in sending reschedule IPI. */ 329 /*
332 if (rdp->preemptible) 330 * The CPU is online, so send it a reschedule IPI. This forces
333 return 0; 331 * it through the scheduler, and (inefficiently) also handles cases
334 332 * where idle loops fail to inform RCU about the CPU being idle.
335 /* The CPU is online, so send it a reschedule IPI. */ 333 */
336 if (rdp->cpu != smp_processor_id()) 334 if (rdp->cpu != smp_processor_id())
337 smp_send_reschedule(rdp->cpu); 335 smp_send_reschedule(rdp->cpu);
338 else 336 else
@@ -343,59 +341,181 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
343 341
344#endif /* #ifdef CONFIG_SMP */ 342#endif /* #ifdef CONFIG_SMP */
345 343
346#ifdef CONFIG_NO_HZ 344/*
345 * rcu_idle_enter_common - inform RCU that current CPU is moving towards idle
346 *
347 * If the new value of the ->dynticks_nesting counter now is zero,
348 * we really have entered idle, and must do the appropriate accounting.
349 * The caller must have disabled interrupts.
350 */
351static void rcu_idle_enter_common(struct rcu_dynticks *rdtp, long long oldval)
352{
353 trace_rcu_dyntick("Start", oldval, 0);
354 if (!is_idle_task(current)) {
355 struct task_struct *idle = idle_task(smp_processor_id());
356
357 trace_rcu_dyntick("Error on entry: not idle task", oldval, 0);
358 ftrace_dump(DUMP_ALL);
359 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
360 current->pid, current->comm,
361 idle->pid, idle->comm); /* must be idle task! */
362 }
363 rcu_prepare_for_idle(smp_processor_id());
364 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
365 smp_mb__before_atomic_inc(); /* See above. */
366 atomic_inc(&rdtp->dynticks);
367 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
368 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
369}
347 370
348/** 371/**
349 * rcu_enter_nohz - inform RCU that current CPU is entering nohz 372 * rcu_idle_enter - inform RCU that current CPU is entering idle
350 * 373 *
351 * Enter nohz mode, in other words, -leave- the mode in which RCU 374 * Enter idle mode, in other words, -leave- the mode in which RCU
352 * read-side critical sections can occur. (Though RCU read-side 375 * read-side critical sections can occur. (Though RCU read-side
353 * critical sections can occur in irq handlers in nohz mode, a possibility 376 * critical sections can occur in irq handlers in idle, a possibility
354 * handled by rcu_irq_enter() and rcu_irq_exit()). 377 * handled by irq_enter() and irq_exit().)
378 *
379 * We crowbar the ->dynticks_nesting field to zero to allow for
380 * the possibility of usermode upcalls having messed up our count
381 * of interrupt nesting level during the prior busy period.
355 */ 382 */
356void rcu_enter_nohz(void) 383void rcu_idle_enter(void)
357{ 384{
358 unsigned long flags; 385 unsigned long flags;
386 long long oldval;
359 struct rcu_dynticks *rdtp; 387 struct rcu_dynticks *rdtp;
360 388
361 local_irq_save(flags); 389 local_irq_save(flags);
362 rdtp = &__get_cpu_var(rcu_dynticks); 390 rdtp = &__get_cpu_var(rcu_dynticks);
363 if (--rdtp->dynticks_nesting) { 391 oldval = rdtp->dynticks_nesting;
364 local_irq_restore(flags); 392 rdtp->dynticks_nesting = 0;
365 return; 393 rcu_idle_enter_common(rdtp, oldval);
366 }
367 trace_rcu_dyntick("Start");
368 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
369 smp_mb__before_atomic_inc(); /* See above. */
370 atomic_inc(&rdtp->dynticks);
371 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
372 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
373 local_irq_restore(flags); 394 local_irq_restore(flags);
374} 395}
375 396
376/* 397/**
377 * rcu_exit_nohz - inform RCU that current CPU is leaving nohz 398 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle
399 *
400 * Exit from an interrupt handler, which might possibly result in entering
401 * idle mode, in other words, leaving the mode in which read-side critical
402 * sections can occur.
378 * 403 *
379 * Exit nohz mode, in other words, -enter- the mode in which RCU 404 * This code assumes that the idle loop never does anything that might
380 * read-side critical sections normally occur. 405 * result in unbalanced calls to irq_enter() and irq_exit(). If your
406 * architecture violates this assumption, RCU will give you what you
407 * deserve, good and hard. But very infrequently and irreproducibly.
408 *
409 * Use things like work queues to work around this limitation.
410 *
411 * You have been warned.
381 */ 412 */
382void rcu_exit_nohz(void) 413void rcu_irq_exit(void)
383{ 414{
384 unsigned long flags; 415 unsigned long flags;
416 long long oldval;
385 struct rcu_dynticks *rdtp; 417 struct rcu_dynticks *rdtp;
386 418
387 local_irq_save(flags); 419 local_irq_save(flags);
388 rdtp = &__get_cpu_var(rcu_dynticks); 420 rdtp = &__get_cpu_var(rcu_dynticks);
389 if (rdtp->dynticks_nesting++) { 421 oldval = rdtp->dynticks_nesting;
390 local_irq_restore(flags); 422 rdtp->dynticks_nesting--;
391 return; 423 WARN_ON_ONCE(rdtp->dynticks_nesting < 0);
392 } 424 if (rdtp->dynticks_nesting)
425 trace_rcu_dyntick("--=", oldval, rdtp->dynticks_nesting);
426 else
427 rcu_idle_enter_common(rdtp, oldval);
428 local_irq_restore(flags);
429}
430
431/*
432 * rcu_idle_exit_common - inform RCU that current CPU is moving away from idle
433 *
434 * If the new value of the ->dynticks_nesting counter was previously zero,
435 * we really have exited idle, and must do the appropriate accounting.
436 * The caller must have disabled interrupts.
437 */
438static void rcu_idle_exit_common(struct rcu_dynticks *rdtp, long long oldval)
439{
393 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ 440 smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */
394 atomic_inc(&rdtp->dynticks); 441 atomic_inc(&rdtp->dynticks);
395 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ 442 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
396 smp_mb__after_atomic_inc(); /* See above. */ 443 smp_mb__after_atomic_inc(); /* See above. */
397 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 444 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
398 trace_rcu_dyntick("End"); 445 rcu_cleanup_after_idle(smp_processor_id());
446 trace_rcu_dyntick("End", oldval, rdtp->dynticks_nesting);
447 if (!is_idle_task(current)) {
448 struct task_struct *idle = idle_task(smp_processor_id());
449
450 trace_rcu_dyntick("Error on exit: not idle task",
451 oldval, rdtp->dynticks_nesting);
452 ftrace_dump(DUMP_ALL);
453 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s",
454 current->pid, current->comm,
455 idle->pid, idle->comm); /* must be idle task! */
456 }
457}
458
459/**
460 * rcu_idle_exit - inform RCU that current CPU is leaving idle
461 *
462 * Exit idle mode, in other words, -enter- the mode in which RCU
463 * read-side critical sections can occur.
464 *
465 * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NESTING to
466 * allow for the possibility of usermode upcalls messing up our count
467 * of interrupt nesting level during the busy period that is just
468 * now starting.
469 */
470void rcu_idle_exit(void)
471{
472 unsigned long flags;
473 struct rcu_dynticks *rdtp;
474 long long oldval;
475
476 local_irq_save(flags);
477 rdtp = &__get_cpu_var(rcu_dynticks);
478 oldval = rdtp->dynticks_nesting;
479 WARN_ON_ONCE(oldval != 0);
480 rdtp->dynticks_nesting = DYNTICK_TASK_NESTING;
481 rcu_idle_exit_common(rdtp, oldval);
482 local_irq_restore(flags);
483}
484
485/**
486 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle
487 *
488 * Enter an interrupt handler, which might possibly result in exiting
489 * idle mode, in other words, entering the mode in which read-side critical
490 * sections can occur.
491 *
492 * Note that the Linux kernel is fully capable of entering an interrupt
493 * handler that it never exits, for example when doing upcalls to
494 * user mode! This code assumes that the idle loop never does upcalls to
495 * user mode. If your architecture does do upcalls from the idle loop (or
496 * does anything else that results in unbalanced calls to the irq_enter()
497 * and irq_exit() functions), RCU will give you what you deserve, good
498 * and hard. But very infrequently and irreproducibly.
499 *
500 * Use things like work queues to work around this limitation.
501 *
502 * You have been warned.
503 */
504void rcu_irq_enter(void)
505{
506 unsigned long flags;
507 struct rcu_dynticks *rdtp;
508 long long oldval;
509
510 local_irq_save(flags);
511 rdtp = &__get_cpu_var(rcu_dynticks);
512 oldval = rdtp->dynticks_nesting;
513 rdtp->dynticks_nesting++;
514 WARN_ON_ONCE(rdtp->dynticks_nesting == 0);
515 if (oldval)
516 trace_rcu_dyntick("++=", oldval, rdtp->dynticks_nesting);
517 else
518 rcu_idle_exit_common(rdtp, oldval);
399 local_irq_restore(flags); 519 local_irq_restore(flags);
400} 520}
401 521
@@ -442,27 +562,37 @@ void rcu_nmi_exit(void)
442 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); 562 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
443} 563}
444 564
565#ifdef CONFIG_PROVE_RCU
566
445/** 567/**
446 * rcu_irq_enter - inform RCU of entry to hard irq context 568 * rcu_is_cpu_idle - see if RCU thinks that the current CPU is idle
447 * 569 *
448 * If the CPU was idle with dynamic ticks active, this updates the 570 * If the current CPU is in its idle loop and is neither in an interrupt
449 * rdtp->dynticks to let the RCU handling know that the CPU is active. 571 * or NMI handler, return true.
450 */ 572 */
451void rcu_irq_enter(void) 573int rcu_is_cpu_idle(void)
452{ 574{
453 rcu_exit_nohz(); 575 int ret;
576
577 preempt_disable();
578 ret = (atomic_read(&__get_cpu_var(rcu_dynticks).dynticks) & 0x1) == 0;
579 preempt_enable();
580 return ret;
454} 581}
582EXPORT_SYMBOL(rcu_is_cpu_idle);
583
584#endif /* #ifdef CONFIG_PROVE_RCU */
455 585
456/** 586/**
457 * rcu_irq_exit - inform RCU of exit from hard irq context 587 * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
458 * 588 *
459 * If the CPU was idle with dynamic ticks active, update the rdp->dynticks 589 * If the current CPU is idle or running at a first-level (not nested)
460 * to put let the RCU handling be aware that the CPU is going back to idle 590 * interrupt from idle, return true. The caller must have at least
461 * with no ticks. 591 * disabled preemption.
462 */ 592 */
463void rcu_irq_exit(void) 593int rcu_is_cpu_rrupt_from_idle(void)
464{ 594{
465 rcu_enter_nohz(); 595 return __get_cpu_var(rcu_dynticks).dynticks_nesting <= 1;
466} 596}
467 597
468#ifdef CONFIG_SMP 598#ifdef CONFIG_SMP
@@ -475,7 +605,7 @@ void rcu_irq_exit(void)
475static int dyntick_save_progress_counter(struct rcu_data *rdp) 605static int dyntick_save_progress_counter(struct rcu_data *rdp)
476{ 606{
477 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); 607 rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
478 return 0; 608 return (rdp->dynticks_snap & 0x1) == 0;
479} 609}
480 610
481/* 611/*
@@ -512,26 +642,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
512 642
513#endif /* #ifdef CONFIG_SMP */ 643#endif /* #ifdef CONFIG_SMP */
514 644
515#else /* #ifdef CONFIG_NO_HZ */
516
517#ifdef CONFIG_SMP
518
519static int dyntick_save_progress_counter(struct rcu_data *rdp)
520{
521 return 0;
522}
523
524static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
525{
526 return rcu_implicit_offline_qs(rdp);
527}
528
529#endif /* #ifdef CONFIG_SMP */
530
531#endif /* #else #ifdef CONFIG_NO_HZ */
532
533int rcu_cpu_stall_suppress __read_mostly;
534
535static void record_gp_stall_check_time(struct rcu_state *rsp) 645static void record_gp_stall_check_time(struct rcu_state *rsp)
536{ 646{
537 rsp->gp_start = jiffies; 647 rsp->gp_start = jiffies;
@@ -866,8 +976,8 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
866 /* Advance to a new grace period and initialize state. */ 976 /* Advance to a new grace period and initialize state. */
867 rsp->gpnum++; 977 rsp->gpnum++;
868 trace_rcu_grace_period(rsp->name, rsp->gpnum, "start"); 978 trace_rcu_grace_period(rsp->name, rsp->gpnum, "start");
869 WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT); 979 WARN_ON_ONCE(rsp->fqs_state == RCU_GP_INIT);
870 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ 980 rsp->fqs_state = RCU_GP_INIT; /* Hold off force_quiescent_state. */
871 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 981 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
872 record_gp_stall_check_time(rsp); 982 record_gp_stall_check_time(rsp);
873 983
@@ -877,7 +987,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
877 rnp->qsmask = rnp->qsmaskinit; 987 rnp->qsmask = rnp->qsmaskinit;
878 rnp->gpnum = rsp->gpnum; 988 rnp->gpnum = rsp->gpnum;
879 rnp->completed = rsp->completed; 989 rnp->completed = rsp->completed;
880 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 990 rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state OK */
881 rcu_start_gp_per_cpu(rsp, rnp, rdp); 991 rcu_start_gp_per_cpu(rsp, rnp, rdp);
882 rcu_preempt_boost_start_gp(rnp); 992 rcu_preempt_boost_start_gp(rnp);
883 trace_rcu_grace_period_init(rsp->name, rnp->gpnum, 993 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
@@ -927,7 +1037,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
927 1037
928 rnp = rcu_get_root(rsp); 1038 rnp = rcu_get_root(rsp);
929 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 1039 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
930 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ 1040 rsp->fqs_state = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
931 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1041 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
932 raw_spin_unlock_irqrestore(&rsp->onofflock, flags); 1042 raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
933} 1043}
@@ -991,7 +1101,7 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
991 1101
992 rsp->completed = rsp->gpnum; /* Declare the grace period complete. */ 1102 rsp->completed = rsp->gpnum; /* Declare the grace period complete. */
993 trace_rcu_grace_period(rsp->name, rsp->completed, "end"); 1103 trace_rcu_grace_period(rsp->name, rsp->completed, "end");
994 rsp->signaled = RCU_GP_IDLE; 1104 rsp->fqs_state = RCU_GP_IDLE;
995 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ 1105 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
996} 1106}
997 1107
@@ -1221,7 +1331,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1221 else 1331 else
1222 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1332 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1223 if (need_report & RCU_OFL_TASKS_EXP_GP) 1333 if (need_report & RCU_OFL_TASKS_EXP_GP)
1224 rcu_report_exp_rnp(rsp, rnp); 1334 rcu_report_exp_rnp(rsp, rnp, true);
1225 rcu_node_kthread_setaffinity(rnp, -1); 1335 rcu_node_kthread_setaffinity(rnp, -1);
1226} 1336}
1227 1337
@@ -1263,7 +1373,9 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1263 /* If no callbacks are ready, just return.*/ 1373 /* If no callbacks are ready, just return.*/
1264 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 1374 if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
1265 trace_rcu_batch_start(rsp->name, 0, 0); 1375 trace_rcu_batch_start(rsp->name, 0, 0);
1266 trace_rcu_batch_end(rsp->name, 0); 1376 trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
1377 need_resched(), is_idle_task(current),
1378 rcu_is_callbacks_kthread());
1267 return; 1379 return;
1268 } 1380 }
1269 1381
@@ -1291,12 +1403,17 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1291 debug_rcu_head_unqueue(list); 1403 debug_rcu_head_unqueue(list);
1292 __rcu_reclaim(rsp->name, list); 1404 __rcu_reclaim(rsp->name, list);
1293 list = next; 1405 list = next;
1294 if (++count >= bl) 1406 /* Stop only if limit reached and CPU has something to do. */
1407 if (++count >= bl &&
1408 (need_resched() ||
1409 (!is_idle_task(current) && !rcu_is_callbacks_kthread())))
1295 break; 1410 break;
1296 } 1411 }
1297 1412
1298 local_irq_save(flags); 1413 local_irq_save(flags);
1299 trace_rcu_batch_end(rsp->name, count); 1414 trace_rcu_batch_end(rsp->name, count, !!list, need_resched(),
1415 is_idle_task(current),
1416 rcu_is_callbacks_kthread());
1300 1417
1301 /* Update count, and requeue any remaining callbacks. */ 1418 /* Update count, and requeue any remaining callbacks. */
1302 rdp->qlen -= count; 1419 rdp->qlen -= count;
@@ -1334,16 +1451,14 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1334 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). 1451 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
1335 * Also schedule RCU core processing. 1452 * Also schedule RCU core processing.
1336 * 1453 *
1337 * This function must be called with hardirqs disabled. It is normally 1454 * This function must be called from hardirq context. It is normally
1338 * invoked from the scheduling-clock interrupt. If rcu_pending returns 1455 * invoked from the scheduling-clock interrupt. If rcu_pending returns
1339 * false, there is no point in invoking rcu_check_callbacks(). 1456 * false, there is no point in invoking rcu_check_callbacks().
1340 */ 1457 */
1341void rcu_check_callbacks(int cpu, int user) 1458void rcu_check_callbacks(int cpu, int user)
1342{ 1459{
1343 trace_rcu_utilization("Start scheduler-tick"); 1460 trace_rcu_utilization("Start scheduler-tick");
1344 if (user || 1461 if (user || rcu_is_cpu_rrupt_from_idle()) {
1345 (idle_cpu(cpu) && rcu_scheduler_active &&
1346 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
1347 1462
1348 /* 1463 /*
1349 * Get here if this CPU took its interrupt from user 1464 * Get here if this CPU took its interrupt from user
@@ -1457,7 +1572,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1457 goto unlock_fqs_ret; /* no GP in progress, time updated. */ 1572 goto unlock_fqs_ret; /* no GP in progress, time updated. */
1458 } 1573 }
1459 rsp->fqs_active = 1; 1574 rsp->fqs_active = 1;
1460 switch (rsp->signaled) { 1575 switch (rsp->fqs_state) {
1461 case RCU_GP_IDLE: 1576 case RCU_GP_IDLE:
1462 case RCU_GP_INIT: 1577 case RCU_GP_INIT:
1463 1578
@@ -1473,7 +1588,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1473 force_qs_rnp(rsp, dyntick_save_progress_counter); 1588 force_qs_rnp(rsp, dyntick_save_progress_counter);
1474 raw_spin_lock(&rnp->lock); /* irqs already disabled */ 1589 raw_spin_lock(&rnp->lock); /* irqs already disabled */
1475 if (rcu_gp_in_progress(rsp)) 1590 if (rcu_gp_in_progress(rsp))
1476 rsp->signaled = RCU_FORCE_QS; 1591 rsp->fqs_state = RCU_FORCE_QS;
1477 break; 1592 break;
1478 1593
1479 case RCU_FORCE_QS: 1594 case RCU_FORCE_QS:
@@ -1812,7 +1927,7 @@ static int rcu_pending(int cpu)
1812 * by the current CPU, even if none need be done immediately, returning 1927 * by the current CPU, even if none need be done immediately, returning
1813 * 1 if so. 1928 * 1 if so.
1814 */ 1929 */
1815static int rcu_needs_cpu_quick_check(int cpu) 1930static int rcu_cpu_has_callbacks(int cpu)
1816{ 1931{
1817 /* RCU callbacks either ready or pending? */ 1932 /* RCU callbacks either ready or pending? */
1818 return per_cpu(rcu_sched_data, cpu).nxtlist || 1933 return per_cpu(rcu_sched_data, cpu).nxtlist ||
@@ -1913,9 +2028,9 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1913 for (i = 0; i < RCU_NEXT_SIZE; i++) 2028 for (i = 0; i < RCU_NEXT_SIZE; i++)
1914 rdp->nxttail[i] = &rdp->nxtlist; 2029 rdp->nxttail[i] = &rdp->nxtlist;
1915 rdp->qlen = 0; 2030 rdp->qlen = 0;
1916#ifdef CONFIG_NO_HZ
1917 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 2031 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1918#endif /* #ifdef CONFIG_NO_HZ */ 2032 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_NESTING);
2033 WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1);
1919 rdp->cpu = cpu; 2034 rdp->cpu = cpu;
1920 rdp->rsp = rsp; 2035 rdp->rsp = rsp;
1921 raw_spin_unlock_irqrestore(&rnp->lock, flags); 2036 raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1942,6 +2057,10 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
1942 rdp->qlen_last_fqs_check = 0; 2057 rdp->qlen_last_fqs_check = 0;
1943 rdp->n_force_qs_snap = rsp->n_force_qs; 2058 rdp->n_force_qs_snap = rsp->n_force_qs;
1944 rdp->blimit = blimit; 2059 rdp->blimit = blimit;
2060 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_NESTING;
2061 atomic_set(&rdp->dynticks->dynticks,
2062 (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
2063 rcu_prepare_for_idle_init(cpu);
1945 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 2064 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1946 2065
1947 /* 2066 /*
@@ -2023,6 +2142,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
2023 rcu_send_cbs_to_online(&rcu_bh_state); 2142 rcu_send_cbs_to_online(&rcu_bh_state);
2024 rcu_send_cbs_to_online(&rcu_sched_state); 2143 rcu_send_cbs_to_online(&rcu_sched_state);
2025 rcu_preempt_send_cbs_to_online(); 2144 rcu_preempt_send_cbs_to_online();
2145 rcu_cleanup_after_idle(cpu);
2026 break; 2146 break;
2027 case CPU_DEAD: 2147 case CPU_DEAD:
2028 case CPU_DEAD_FROZEN: 2148 case CPU_DEAD_FROZEN:
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 849ce9ec51fe..fddff92d6676 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -84,9 +84,10 @@
84 * Dynticks per-CPU state. 84 * Dynticks per-CPU state.
85 */ 85 */
86struct rcu_dynticks { 86struct rcu_dynticks {
87 int dynticks_nesting; /* Track irq/process nesting level. */ 87 long long dynticks_nesting; /* Track irq/process nesting level. */
88 int dynticks_nmi_nesting; /* Track NMI nesting level. */ 88 /* Process level is worth LLONG_MAX/2. */
89 atomic_t dynticks; /* Even value for dynticks-idle, else odd. */ 89 int dynticks_nmi_nesting; /* Track NMI nesting level. */
90 atomic_t dynticks; /* Even value for idle, else odd. */
90}; 91};
91 92
92/* RCU's kthread states for tracing. */ 93/* RCU's kthread states for tracing. */
@@ -274,16 +275,12 @@ struct rcu_data {
274 /* did other CPU force QS recently? */ 275 /* did other CPU force QS recently? */
275 long blimit; /* Upper limit on a processed batch */ 276 long blimit; /* Upper limit on a processed batch */
276 277
277#ifdef CONFIG_NO_HZ
278 /* 3) dynticks interface. */ 278 /* 3) dynticks interface. */
279 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ 279 struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
280 int dynticks_snap; /* Per-GP tracking for dynticks. */ 280 int dynticks_snap; /* Per-GP tracking for dynticks. */
281#endif /* #ifdef CONFIG_NO_HZ */
282 281
283 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ 282 /* 4) reasons this CPU needed to be kicked by force_quiescent_state */
284#ifdef CONFIG_NO_HZ
285 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ 283 unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */
286#endif /* #ifdef CONFIG_NO_HZ */
287 unsigned long offline_fqs; /* Kicked due to being offline. */ 284 unsigned long offline_fqs; /* Kicked due to being offline. */
288 unsigned long resched_ipi; /* Sent a resched IPI. */ 285 unsigned long resched_ipi; /* Sent a resched IPI. */
289 286
@@ -302,16 +299,12 @@ struct rcu_data {
302 struct rcu_state *rsp; 299 struct rcu_state *rsp;
303}; 300};
304 301
305/* Values for signaled field in struct rcu_state. */ 302/* Values for fqs_state field in struct rcu_state. */
306#define RCU_GP_IDLE 0 /* No grace period in progress. */ 303#define RCU_GP_IDLE 0 /* No grace period in progress. */
307#define RCU_GP_INIT 1 /* Grace period being initialized. */ 304#define RCU_GP_INIT 1 /* Grace period being initialized. */
308#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ 305#define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */
309#define RCU_FORCE_QS 3 /* Need to force quiescent state. */ 306#define RCU_FORCE_QS 3 /* Need to force quiescent state. */
310#ifdef CONFIG_NO_HZ
311#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK 307#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK
312#else /* #ifdef CONFIG_NO_HZ */
313#define RCU_SIGNAL_INIT RCU_FORCE_QS
314#endif /* #else #ifdef CONFIG_NO_HZ */
315 308
316#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ 309#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
317 310
@@ -361,7 +354,7 @@ struct rcu_state {
361 354
362 /* The following fields are guarded by the root rcu_node's lock. */ 355 /* The following fields are guarded by the root rcu_node's lock. */
363 356
364 u8 signaled ____cacheline_internodealigned_in_smp; 357 u8 fqs_state ____cacheline_internodealigned_in_smp;
365 /* Force QS state. */ 358 /* Force QS state. */
366 u8 fqs_active; /* force_quiescent_state() */ 359 u8 fqs_active; /* force_quiescent_state() */
367 /* is running. */ 360 /* is running. */
@@ -451,7 +444,8 @@ static void rcu_preempt_check_callbacks(int cpu);
451static void rcu_preempt_process_callbacks(void); 444static void rcu_preempt_process_callbacks(void);
452void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); 445void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
453#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) 446#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU)
454static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp); 447static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
448 bool wake);
455#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ 449#endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */
456static int rcu_preempt_pending(int cpu); 450static int rcu_preempt_pending(int cpu);
457static int rcu_preempt_needs_cpu(int cpu); 451static int rcu_preempt_needs_cpu(int cpu);
@@ -461,6 +455,7 @@ static void __init __rcu_init_preempt(void);
461static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 455static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
462static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); 456static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
463static void invoke_rcu_callbacks_kthread(void); 457static void invoke_rcu_callbacks_kthread(void);
458static bool rcu_is_callbacks_kthread(void);
464#ifdef CONFIG_RCU_BOOST 459#ifdef CONFIG_RCU_BOOST
465static void rcu_preempt_do_callbacks(void); 460static void rcu_preempt_do_callbacks(void);
466static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, 461static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
@@ -473,5 +468,8 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg);
473#endif /* #ifdef CONFIG_RCU_BOOST */ 468#endif /* #ifdef CONFIG_RCU_BOOST */
474static void rcu_cpu_kthread_setrt(int cpu, int to_rt); 469static void rcu_cpu_kthread_setrt(int cpu, int to_rt);
475static void __cpuinit rcu_prepare_kthreads(int cpu); 470static void __cpuinit rcu_prepare_kthreads(int cpu);
471static void rcu_prepare_for_idle_init(int cpu);
472static void rcu_cleanup_after_idle(int cpu);
473static void rcu_prepare_for_idle(int cpu);
476 474
477#endif /* #ifndef RCU_TREE_NONCORE */ 475#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 4b9b9f8a4184..8bb35d73e1f9 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -312,6 +312,7 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
312{ 312{
313 int empty; 313 int empty;
314 int empty_exp; 314 int empty_exp;
315 int empty_exp_now;
315 unsigned long flags; 316 unsigned long flags;
316 struct list_head *np; 317 struct list_head *np;
317#ifdef CONFIG_RCU_BOOST 318#ifdef CONFIG_RCU_BOOST
@@ -382,8 +383,10 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
382 /* 383 /*
383 * If this was the last task on the current list, and if 384 * If this was the last task on the current list, and if
384 * we aren't waiting on any CPUs, report the quiescent state. 385 * we aren't waiting on any CPUs, report the quiescent state.
385 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. 386 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
387 * so we must take a snapshot of the expedited state.
386 */ 388 */
389 empty_exp_now = !rcu_preempted_readers_exp(rnp);
387 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { 390 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
388 trace_rcu_quiescent_state_report("preempt_rcu", 391 trace_rcu_quiescent_state_report("preempt_rcu",
389 rnp->gpnum, 392 rnp->gpnum,
@@ -406,8 +409,8 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
406 * If this was the last task on the expedited lists, 409 * If this was the last task on the expedited lists,
407 * then we need to report up the rcu_node hierarchy. 410 * then we need to report up the rcu_node hierarchy.
408 */ 411 */
409 if (!empty_exp && !rcu_preempted_readers_exp(rnp)) 412 if (!empty_exp && empty_exp_now)
410 rcu_report_exp_rnp(&rcu_preempt_state, rnp); 413 rcu_report_exp_rnp(&rcu_preempt_state, rnp, true);
411 } else { 414 } else {
412 local_irq_restore(flags); 415 local_irq_restore(flags);
413 } 416 }
@@ -729,9 +732,13 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
729 * recursively up the tree. (Calm down, calm down, we do the recursion 732 * recursively up the tree. (Calm down, calm down, we do the recursion
730 * iteratively!) 733 * iteratively!)
731 * 734 *
735 * Most callers will set the "wake" flag, but the task initiating the
736 * expedited grace period need not wake itself.
737 *
732 * Caller must hold sync_rcu_preempt_exp_mutex. 738 * Caller must hold sync_rcu_preempt_exp_mutex.
733 */ 739 */
734static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) 740static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
741 bool wake)
735{ 742{
736 unsigned long flags; 743 unsigned long flags;
737 unsigned long mask; 744 unsigned long mask;
@@ -744,7 +751,8 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
744 } 751 }
745 if (rnp->parent == NULL) { 752 if (rnp->parent == NULL) {
746 raw_spin_unlock_irqrestore(&rnp->lock, flags); 753 raw_spin_unlock_irqrestore(&rnp->lock, flags);
747 wake_up(&sync_rcu_preempt_exp_wq); 754 if (wake)
755 wake_up(&sync_rcu_preempt_exp_wq);
748 break; 756 break;
749 } 757 }
750 mask = rnp->grpmask; 758 mask = rnp->grpmask;
@@ -777,7 +785,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
777 must_wait = 1; 785 must_wait = 1;
778 } 786 }
779 if (!must_wait) 787 if (!must_wait)
780 rcu_report_exp_rnp(rsp, rnp); 788 rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */
781} 789}
782 790
783/* 791/*
@@ -1069,9 +1077,9 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
1069 * report on tasks preempted in RCU read-side critical sections during 1077 * report on tasks preempted in RCU read-side critical sections during
1070 * expedited RCU grace periods. 1078 * expedited RCU grace periods.
1071 */ 1079 */
1072static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) 1080static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
1081 bool wake)
1073{ 1082{
1074 return;
1075} 1083}
1076 1084
1077#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 1085#endif /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -1157,8 +1165,6 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1157 1165
1158#endif /* #else #ifdef CONFIG_RCU_TRACE */ 1166#endif /* #else #ifdef CONFIG_RCU_TRACE */
1159 1167
1160static struct lock_class_key rcu_boost_class;
1161
1162/* 1168/*
1163 * Carry out RCU priority boosting on the task indicated by ->exp_tasks 1169 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
1164 * or ->boost_tasks, advancing the pointer to the next task in the 1170 * or ->boost_tasks, advancing the pointer to the next task in the
@@ -1221,15 +1227,13 @@ static int rcu_boost(struct rcu_node *rnp)
1221 */ 1227 */
1222 t = container_of(tb, struct task_struct, rcu_node_entry); 1228 t = container_of(tb, struct task_struct, rcu_node_entry);
1223 rt_mutex_init_proxy_locked(&mtx, t); 1229 rt_mutex_init_proxy_locked(&mtx, t);
1224 /* Avoid lockdep false positives. This rt_mutex is its own thing. */
1225 lockdep_set_class_and_name(&mtx.wait_lock, &rcu_boost_class,
1226 "rcu_boost_mutex");
1227 t->rcu_boost_mutex = &mtx; 1230 t->rcu_boost_mutex = &mtx;
1228 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1231 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1229 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ 1232 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
1230 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ 1233 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
1231 1234
1232 return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL; 1235 return ACCESS_ONCE(rnp->exp_tasks) != NULL ||
1236 ACCESS_ONCE(rnp->boost_tasks) != NULL;
1233} 1237}
1234 1238
1235/* 1239/*
@@ -1329,6 +1333,15 @@ static void invoke_rcu_callbacks_kthread(void)
1329} 1333}
1330 1334
1331/* 1335/*
1336 * Is the current CPU running the RCU-callbacks kthread?
1337 * Caller must have preemption disabled.
1338 */
1339static bool rcu_is_callbacks_kthread(void)
1340{
1341 return __get_cpu_var(rcu_cpu_kthread_task) == current;
1342}
1343
1344/*
1332 * Set the affinity of the boost kthread. The CPU-hotplug locks are 1345 * Set the affinity of the boost kthread. The CPU-hotplug locks are
1333 * held, so no one should be messing with the existence of the boost 1346 * held, so no one should be messing with the existence of the boost
1334 * kthread. 1347 * kthread.
@@ -1772,6 +1785,11 @@ static void invoke_rcu_callbacks_kthread(void)
1772 WARN_ON_ONCE(1); 1785 WARN_ON_ONCE(1);
1773} 1786}
1774 1787
1788static bool rcu_is_callbacks_kthread(void)
1789{
1790 return false;
1791}
1792
1775static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) 1793static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
1776{ 1794{
1777} 1795}
@@ -1907,7 +1925,7 @@ void synchronize_sched_expedited(void)
1907 * grace period works for us. 1925 * grace period works for us.
1908 */ 1926 */
1909 get_online_cpus(); 1927 get_online_cpus();
1910 snap = atomic_read(&sync_sched_expedited_started) - 1; 1928 snap = atomic_read(&sync_sched_expedited_started);
1911 smp_mb(); /* ensure read is before try_stop_cpus(). */ 1929 smp_mb(); /* ensure read is before try_stop_cpus(). */
1912 } 1930 }
1913 1931
@@ -1939,88 +1957,243 @@ EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
1939 * 1 if so. This function is part of the RCU implementation; it is -not- 1957 * 1 if so. This function is part of the RCU implementation; it is -not-
1940 * an exported member of the RCU API. 1958 * an exported member of the RCU API.
1941 * 1959 *
1942 * Because we have preemptible RCU, just check whether this CPU needs 1960 * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs
1943 * any flavor of RCU. Do not chew up lots of CPU cycles with preemption 1961 * any flavor of RCU.
1944 * disabled in a most-likely vain attempt to cause RCU not to need this CPU.
1945 */ 1962 */
1946int rcu_needs_cpu(int cpu) 1963int rcu_needs_cpu(int cpu)
1947{ 1964{
1948 return rcu_needs_cpu_quick_check(cpu); 1965 return rcu_cpu_has_callbacks(cpu);
1966}
1967
1968/*
1969 * Because we do not have RCU_FAST_NO_HZ, don't bother initializing for it.
1970 */
1971static void rcu_prepare_for_idle_init(int cpu)
1972{
1973}
1974
1975/*
1976 * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up
1977 * after it.
1978 */
1979static void rcu_cleanup_after_idle(int cpu)
1980{
1981}
1982
1983/*
1984 * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=y,
1985 * is nothing.
1986 */
1987static void rcu_prepare_for_idle(int cpu)
1988{
1949} 1989}
1950 1990
1951#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 1991#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1952 1992
1953#define RCU_NEEDS_CPU_FLUSHES 5 1993/*
1994 * This code is invoked when a CPU goes idle, at which point we want
1995 * to have the CPU do everything required for RCU so that it can enter
1996 * the energy-efficient dyntick-idle mode. This is handled by a
1997 * state machine implemented by rcu_prepare_for_idle() below.
1998 *
1999 * The following three proprocessor symbols control this state machine:
2000 *
2001 * RCU_IDLE_FLUSHES gives the maximum number of times that we will attempt
2002 * to satisfy RCU. Beyond this point, it is better to incur a periodic
2003 * scheduling-clock interrupt than to loop through the state machine
2004 * at full power.
2005 * RCU_IDLE_OPT_FLUSHES gives the number of RCU_IDLE_FLUSHES that are
2006 * optional if RCU does not need anything immediately from this
2007 * CPU, even if this CPU still has RCU callbacks queued. The first
2008 * times through the state machine are mandatory: we need to give
2009 * the state machine a chance to communicate a quiescent state
2010 * to the RCU core.
2011 * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted
2012 * to sleep in dyntick-idle mode with RCU callbacks pending. This
2013 * is sized to be roughly one RCU grace period. Those energy-efficiency
2014 * benchmarkers who might otherwise be tempted to set this to a large
2015 * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your
2016 * system. And if you are -that- concerned about energy efficiency,
2017 * just power the system down and be done with it!
2018 *
2019 * The values below work well in practice. If future workloads require
2020 * adjustment, they can be converted into kernel config parameters, though
2021 * making the state machine smarter might be a better option.
2022 */
2023#define RCU_IDLE_FLUSHES 5 /* Number of dyntick-idle tries. */
2024#define RCU_IDLE_OPT_FLUSHES 3 /* Optional dyntick-idle tries. */
2025#define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */
2026
1954static DEFINE_PER_CPU(int, rcu_dyntick_drain); 2027static DEFINE_PER_CPU(int, rcu_dyntick_drain);
1955static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); 2028static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
2029static DEFINE_PER_CPU(struct hrtimer, rcu_idle_gp_timer);
2030static ktime_t rcu_idle_gp_wait;
1956 2031
1957/* 2032/*
1958 * Check to see if any future RCU-related work will need to be done 2033 * Allow the CPU to enter dyntick-idle mode if either: (1) There are no
1959 * by the current CPU, even if none need be done immediately, returning 2034 * callbacks on this CPU, (2) this CPU has not yet attempted to enter
1960 * 1 if so. This function is part of the RCU implementation; it is -not- 2035 * dyntick-idle mode, or (3) this CPU is in the process of attempting to
1961 * an exported member of the RCU API. 2036 * enter dyntick-idle mode. Otherwise, if we have recently tried and failed
2037 * to enter dyntick-idle mode, we refuse to try to enter it. After all,
2038 * it is better to incur scheduling-clock interrupts than to spin
2039 * continuously for the same time duration!
2040 */
2041int rcu_needs_cpu(int cpu)
2042{
2043 /* If no callbacks, RCU doesn't need the CPU. */
2044 if (!rcu_cpu_has_callbacks(cpu))
2045 return 0;
2046 /* Otherwise, RCU needs the CPU only if it recently tried and failed. */
2047 return per_cpu(rcu_dyntick_holdoff, cpu) == jiffies;
2048}
2049
2050/*
2051 * Timer handler used to force CPU to start pushing its remaining RCU
2052 * callbacks in the case where it entered dyntick-idle mode with callbacks
2053 * pending. The hander doesn't really need to do anything because the
2054 * real work is done upon re-entry to idle, or by the next scheduling-clock
2055 * interrupt should idle not be re-entered.
2056 */
2057static enum hrtimer_restart rcu_idle_gp_timer_func(struct hrtimer *hrtp)
2058{
2059 trace_rcu_prep_idle("Timer");
2060 return HRTIMER_NORESTART;
2061}
2062
2063/*
2064 * Initialize the timer used to pull CPUs out of dyntick-idle mode.
2065 */
2066static void rcu_prepare_for_idle_init(int cpu)
2067{
2068 static int firsttime = 1;
2069 struct hrtimer *hrtp = &per_cpu(rcu_idle_gp_timer, cpu);
2070
2071 hrtimer_init(hrtp, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2072 hrtp->function = rcu_idle_gp_timer_func;
2073 if (firsttime) {
2074 unsigned int upj = jiffies_to_usecs(RCU_IDLE_GP_DELAY);
2075
2076 rcu_idle_gp_wait = ns_to_ktime(upj * (u64)1000);
2077 firsttime = 0;
2078 }
2079}
2080
2081/*
2082 * Clean up for exit from idle. Because we are exiting from idle, there
2083 * is no longer any point to rcu_idle_gp_timer, so cancel it. This will
2084 * do nothing if this timer is not active, so just cancel it unconditionally.
2085 */
2086static void rcu_cleanup_after_idle(int cpu)
2087{
2088 hrtimer_cancel(&per_cpu(rcu_idle_gp_timer, cpu));
2089}
2090
2091/*
2092 * Check to see if any RCU-related work can be done by the current CPU,
2093 * and if so, schedule a softirq to get it done. This function is part
2094 * of the RCU implementation; it is -not- an exported member of the RCU API.
1962 * 2095 *
1963 * Because we are not supporting preemptible RCU, attempt to accelerate 2096 * The idea is for the current CPU to clear out all work required by the
1964 * any current grace periods so that RCU no longer needs this CPU, but 2097 * RCU core for the current grace period, so that this CPU can be permitted
1965 * only if all other CPUs are already in dynticks-idle mode. This will 2098 * to enter dyntick-idle mode. In some cases, it will need to be awakened
1966 * allow the CPU cores to be powered down immediately, as opposed to after 2099 * at the end of the grace period by whatever CPU ends the grace period.
1967 * waiting many milliseconds for grace periods to elapse. 2100 * This allows CPUs to go dyntick-idle more quickly, and to reduce the
2101 * number of wakeups by a modest integer factor.
1968 * 2102 *
1969 * Because it is not legal to invoke rcu_process_callbacks() with irqs 2103 * Because it is not legal to invoke rcu_process_callbacks() with irqs
1970 * disabled, we do one pass of force_quiescent_state(), then do a 2104 * disabled, we do one pass of force_quiescent_state(), then do a
1971 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked 2105 * invoke_rcu_core() to cause rcu_process_callbacks() to be invoked
1972 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing. 2106 * later. The per-cpu rcu_dyntick_drain variable controls the sequencing.
2107 *
2108 * The caller must have disabled interrupts.
1973 */ 2109 */
1974int rcu_needs_cpu(int cpu) 2110static void rcu_prepare_for_idle(int cpu)
1975{ 2111{
1976 int c = 0; 2112 unsigned long flags;
1977 int snap; 2113
1978 int thatcpu; 2114 local_irq_save(flags);
1979 2115
1980 /* Check for being in the holdoff period. */ 2116 /*
1981 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) 2117 * If there are no callbacks on this CPU, enter dyntick-idle mode.
1982 return rcu_needs_cpu_quick_check(cpu); 2118 * Also reset state to avoid prejudicing later attempts.
1983 2119 */
1984 /* Don't bother unless we are the last non-dyntick-idle CPU. */ 2120 if (!rcu_cpu_has_callbacks(cpu)) {
1985 for_each_online_cpu(thatcpu) { 2121 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
1986 if (thatcpu == cpu) 2122 per_cpu(rcu_dyntick_drain, cpu) = 0;
1987 continue; 2123 local_irq_restore(flags);
1988 snap = atomic_add_return(0, &per_cpu(rcu_dynticks, 2124 trace_rcu_prep_idle("No callbacks");
1989 thatcpu).dynticks); 2125 return;
1990 smp_mb(); /* Order sampling of snap with end of grace period. */ 2126 }
1991 if ((snap & 0x1) != 0) { 2127
1992 per_cpu(rcu_dyntick_drain, cpu) = 0; 2128 /*
1993 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; 2129 * If in holdoff mode, just return. We will presumably have
1994 return rcu_needs_cpu_quick_check(cpu); 2130 * refrained from disabling the scheduling-clock tick.
1995 } 2131 */
2132 if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) {
2133 local_irq_restore(flags);
2134 trace_rcu_prep_idle("In holdoff");
2135 return;
1996 } 2136 }
1997 2137
1998 /* Check and update the rcu_dyntick_drain sequencing. */ 2138 /* Check and update the rcu_dyntick_drain sequencing. */
1999 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2139 if (per_cpu(rcu_dyntick_drain, cpu) <= 0) {
2000 /* First time through, initialize the counter. */ 2140 /* First time through, initialize the counter. */
2001 per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES; 2141 per_cpu(rcu_dyntick_drain, cpu) = RCU_IDLE_FLUSHES;
2142 } else if (per_cpu(rcu_dyntick_drain, cpu) <= RCU_IDLE_OPT_FLUSHES &&
2143 !rcu_pending(cpu)) {
2144 /* Can we go dyntick-idle despite still having callbacks? */
2145 trace_rcu_prep_idle("Dyntick with callbacks");
2146 per_cpu(rcu_dyntick_drain, cpu) = 0;
2147 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
2148 hrtimer_start(&per_cpu(rcu_idle_gp_timer, cpu),
2149 rcu_idle_gp_wait, HRTIMER_MODE_REL);
2150 return; /* Nothing more to do immediately. */
2002 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { 2151 } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) {
2003 /* We have hit the limit, so time to give up. */ 2152 /* We have hit the limit, so time to give up. */
2004 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; 2153 per_cpu(rcu_dyntick_holdoff, cpu) = jiffies;
2005 return rcu_needs_cpu_quick_check(cpu); 2154 local_irq_restore(flags);
2155 trace_rcu_prep_idle("Begin holdoff");
2156 invoke_rcu_core(); /* Force the CPU out of dyntick-idle. */
2157 return;
2006 } 2158 }
2007 2159
2008 /* Do one step pushing remaining RCU callbacks through. */ 2160 /*
2161 * Do one step of pushing the remaining RCU callbacks through
2162 * the RCU core state machine.
2163 */
2164#ifdef CONFIG_TREE_PREEMPT_RCU
2165 if (per_cpu(rcu_preempt_data, cpu).nxtlist) {
2166 local_irq_restore(flags);
2167 rcu_preempt_qs(cpu);
2168 force_quiescent_state(&rcu_preempt_state, 0);
2169 local_irq_save(flags);
2170 }
2171#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
2009 if (per_cpu(rcu_sched_data, cpu).nxtlist) { 2172 if (per_cpu(rcu_sched_data, cpu).nxtlist) {
2173 local_irq_restore(flags);
2010 rcu_sched_qs(cpu); 2174 rcu_sched_qs(cpu);
2011 force_quiescent_state(&rcu_sched_state, 0); 2175 force_quiescent_state(&rcu_sched_state, 0);
2012 c = c || per_cpu(rcu_sched_data, cpu).nxtlist; 2176 local_irq_save(flags);
2013 } 2177 }
2014 if (per_cpu(rcu_bh_data, cpu).nxtlist) { 2178 if (per_cpu(rcu_bh_data, cpu).nxtlist) {
2179 local_irq_restore(flags);
2015 rcu_bh_qs(cpu); 2180 rcu_bh_qs(cpu);
2016 force_quiescent_state(&rcu_bh_state, 0); 2181 force_quiescent_state(&rcu_bh_state, 0);
2017 c = c || per_cpu(rcu_bh_data, cpu).nxtlist; 2182 local_irq_save(flags);
2018 } 2183 }
2019 2184
2020 /* If RCU callbacks are still pending, RCU still needs this CPU. */ 2185 /*
2021 if (c) 2186 * If RCU callbacks are still pending, RCU still needs this CPU.
2187 * So try forcing the callbacks through the grace period.
2188 */
2189 if (rcu_cpu_has_callbacks(cpu)) {
2190 local_irq_restore(flags);
2191 trace_rcu_prep_idle("More callbacks");
2022 invoke_rcu_core(); 2192 invoke_rcu_core();
2023 return c; 2193 } else {
2194 local_irq_restore(flags);
2195 trace_rcu_prep_idle("Callbacks drained");
2196 }
2024} 2197}
2025 2198
2026#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 2199#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 9feffa4c0695..654cfe67f0d1 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -67,13 +67,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
67 rdp->completed, rdp->gpnum, 67 rdp->completed, rdp->gpnum,
68 rdp->passed_quiesce, rdp->passed_quiesce_gpnum, 68 rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
69 rdp->qs_pending); 69 rdp->qs_pending);
70#ifdef CONFIG_NO_HZ 70 seq_printf(m, " dt=%d/%llx/%d df=%lu",
71 seq_printf(m, " dt=%d/%d/%d df=%lu",
72 atomic_read(&rdp->dynticks->dynticks), 71 atomic_read(&rdp->dynticks->dynticks),
73 rdp->dynticks->dynticks_nesting, 72 rdp->dynticks->dynticks_nesting,
74 rdp->dynticks->dynticks_nmi_nesting, 73 rdp->dynticks->dynticks_nmi_nesting,
75 rdp->dynticks_fqs); 74 rdp->dynticks_fqs);
76#endif /* #ifdef CONFIG_NO_HZ */
77 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi); 75 seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
78 seq_printf(m, " ql=%ld qs=%c%c%c%c", 76 seq_printf(m, " ql=%ld qs=%c%c%c%c",
79 rdp->qlen, 77 rdp->qlen,
@@ -141,13 +139,11 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
141 rdp->completed, rdp->gpnum, 139 rdp->completed, rdp->gpnum,
142 rdp->passed_quiesce, rdp->passed_quiesce_gpnum, 140 rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
143 rdp->qs_pending); 141 rdp->qs_pending);
144#ifdef CONFIG_NO_HZ 142 seq_printf(m, ",%d,%llx,%d,%lu",
145 seq_printf(m, ",%d,%d,%d,%lu",
146 atomic_read(&rdp->dynticks->dynticks), 143 atomic_read(&rdp->dynticks->dynticks),
147 rdp->dynticks->dynticks_nesting, 144 rdp->dynticks->dynticks_nesting,
148 rdp->dynticks->dynticks_nmi_nesting, 145 rdp->dynticks->dynticks_nmi_nesting,
149 rdp->dynticks_fqs); 146 rdp->dynticks_fqs);
150#endif /* #ifdef CONFIG_NO_HZ */
151 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi); 147 seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
152 seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen, 148 seq_printf(m, ",%ld,\"%c%c%c%c\"", rdp->qlen,
153 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != 149 ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
@@ -171,9 +167,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
171static int show_rcudata_csv(struct seq_file *m, void *unused) 167static int show_rcudata_csv(struct seq_file *m, void *unused)
172{ 168{
173 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\","); 169 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\",");
174#ifdef CONFIG_NO_HZ
175 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); 170 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
176#endif /* #ifdef CONFIG_NO_HZ */
177 seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\""); 171 seq_puts(m, "\"of\",\"ri\",\"ql\",\"qs\"");
178#ifdef CONFIG_RCU_BOOST 172#ifdef CONFIG_RCU_BOOST
179 seq_puts(m, "\"kt\",\"ktl\""); 173 seq_puts(m, "\"kt\",\"ktl\"");
@@ -278,7 +272,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
278 gpnum = rsp->gpnum; 272 gpnum = rsp->gpnum;
279 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " 273 seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x "
280 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n", 274 "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
281 rsp->completed, gpnum, rsp->signaled, 275 rsp->completed, gpnum, rsp->fqs_state,
282 (long)(rsp->jiffies_force_qs - jiffies), 276 (long)(rsp->jiffies_force_qs - jiffies),
283 (int)(jiffies & 0xffff), 277 (int)(jiffies & 0xffff),
284 rsp->n_force_qs, rsp->n_force_qs_ngp, 278 rsp->n_force_qs, rsp->n_force_qs_ngp,
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 8eafd1bd273e..16502d3a71c8 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -101,6 +101,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
101 101
102 printk("\n============================================\n"); 102 printk("\n============================================\n");
103 printk( "[ BUG: circular locking deadlock detected! ]\n"); 103 printk( "[ BUG: circular locking deadlock detected! ]\n");
104 printk("%s\n", print_tainted());
104 printk( "--------------------------------------------\n"); 105 printk( "--------------------------------------------\n");
105 printk("%s/%d is deadlocking current task %s/%d\n\n", 106 printk("%s/%d is deadlocking current task %s/%d\n\n",
106 task->comm, task_pid_nr(task), 107 task->comm, task_pid_nr(task),
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index f9d8482dd487..a242e691c993 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -579,7 +579,6 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
579 struct rt_mutex_waiter *waiter) 579 struct rt_mutex_waiter *waiter)
580{ 580{
581 int ret = 0; 581 int ret = 0;
582 int was_disabled;
583 582
584 for (;;) { 583 for (;;) {
585 /* Try to acquire the lock: */ 584 /* Try to acquire the lock: */
@@ -602,17 +601,10 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
602 601
603 raw_spin_unlock(&lock->wait_lock); 602 raw_spin_unlock(&lock->wait_lock);
604 603
605 was_disabled = irqs_disabled();
606 if (was_disabled)
607 local_irq_enable();
608
609 debug_rt_mutex_print_deadlock(waiter); 604 debug_rt_mutex_print_deadlock(waiter);
610 605
611 schedule_rt_mutex(lock); 606 schedule_rt_mutex(lock);
612 607
613 if (was_disabled)
614 local_irq_disable();
615
616 raw_spin_lock(&lock->wait_lock); 608 raw_spin_lock(&lock->wait_lock);
617 set_current_state(state); 609 set_current_state(state);
618 } 610 }
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
new file mode 100644
index 000000000000..9a7dd35102a3
--- /dev/null
+++ b/kernel/sched/Makefile
@@ -0,0 +1,20 @@
1ifdef CONFIG_FUNCTION_TRACER
2CFLAGS_REMOVE_clock.o = -pg
3endif
4
5ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
6# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
7# needed for x86 only. Why this used to be enabled for all architectures is beyond
8# me. I suspect most platforms don't need this, but until we know that for sure
9# I turn this off for IA-64 only. Andreas Schwab says it's also needed on m68k
10# to get a correct value for the wait-channel (WCHAN in ps). --davidm
11CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
12endif
13
14obj-y += core.o clock.o idle_task.o fair.o rt.o stop_task.o
15obj-$(CONFIG_SMP) += cpupri.o
16obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
17obj-$(CONFIG_SCHEDSTATS) += stats.o
18obj-$(CONFIG_SCHED_DEBUG) += debug.o
19
20
diff --git a/kernel/sched_autogroup.c b/kernel/sched/auto_group.c
index 429242f3c484..e8a1f83ee0e7 100644
--- a/kernel/sched_autogroup.c
+++ b/kernel/sched/auto_group.c
@@ -1,15 +1,19 @@
1#ifdef CONFIG_SCHED_AUTOGROUP 1#ifdef CONFIG_SCHED_AUTOGROUP
2 2
3#include "sched.h"
4
3#include <linux/proc_fs.h> 5#include <linux/proc_fs.h>
4#include <linux/seq_file.h> 6#include <linux/seq_file.h>
5#include <linux/kallsyms.h> 7#include <linux/kallsyms.h>
6#include <linux/utsname.h> 8#include <linux/utsname.h>
9#include <linux/security.h>
10#include <linux/export.h>
7 11
8unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; 12unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
9static struct autogroup autogroup_default; 13static struct autogroup autogroup_default;
10static atomic_t autogroup_seq_nr; 14static atomic_t autogroup_seq_nr;
11 15
12static void __init autogroup_init(struct task_struct *init_task) 16void __init autogroup_init(struct task_struct *init_task)
13{ 17{
14 autogroup_default.tg = &root_task_group; 18 autogroup_default.tg = &root_task_group;
15 kref_init(&autogroup_default.kref); 19 kref_init(&autogroup_default.kref);
@@ -17,7 +21,7 @@ static void __init autogroup_init(struct task_struct *init_task)
17 init_task->signal->autogroup = &autogroup_default; 21 init_task->signal->autogroup = &autogroup_default;
18} 22}
19 23
20static inline void autogroup_free(struct task_group *tg) 24void autogroup_free(struct task_group *tg)
21{ 25{
22 kfree(tg->autogroup); 26 kfree(tg->autogroup);
23} 27}
@@ -59,10 +63,6 @@ static inline struct autogroup *autogroup_task_get(struct task_struct *p)
59 return ag; 63 return ag;
60} 64}
61 65
62#ifdef CONFIG_RT_GROUP_SCHED
63static void free_rt_sched_group(struct task_group *tg);
64#endif
65
66static inline struct autogroup *autogroup_create(void) 66static inline struct autogroup *autogroup_create(void)
67{ 67{
68 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL); 68 struct autogroup *ag = kzalloc(sizeof(*ag), GFP_KERNEL);
@@ -108,8 +108,7 @@ out_fail:
108 return autogroup_kref_get(&autogroup_default); 108 return autogroup_kref_get(&autogroup_default);
109} 109}
110 110
111static inline bool 111bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
112task_wants_autogroup(struct task_struct *p, struct task_group *tg)
113{ 112{
114 if (tg != &root_task_group) 113 if (tg != &root_task_group)
115 return false; 114 return false;
@@ -127,22 +126,6 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg)
127 return true; 126 return true;
128} 127}
129 128
130static inline bool task_group_is_autogroup(struct task_group *tg)
131{
132 return !!tg->autogroup;
133}
134
135static inline struct task_group *
136autogroup_task_group(struct task_struct *p, struct task_group *tg)
137{
138 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
139
140 if (enabled && task_wants_autogroup(p, tg))
141 return p->signal->autogroup->tg;
142
143 return tg;
144}
145
146static void 129static void
147autogroup_move_group(struct task_struct *p, struct autogroup *ag) 130autogroup_move_group(struct task_struct *p, struct autogroup *ag)
148{ 131{
@@ -263,7 +246,7 @@ out:
263#endif /* CONFIG_PROC_FS */ 246#endif /* CONFIG_PROC_FS */
264 247
265#ifdef CONFIG_SCHED_DEBUG 248#ifdef CONFIG_SCHED_DEBUG
266static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) 249int autogroup_path(struct task_group *tg, char *buf, int buflen)
267{ 250{
268 if (!task_group_is_autogroup(tg)) 251 if (!task_group_is_autogroup(tg))
269 return 0; 252 return 0;
diff --git a/kernel/sched_autogroup.h b/kernel/sched/auto_group.h
index c2f0e7248dca..8bd047142816 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched/auto_group.h
@@ -1,5 +1,8 @@
1#ifdef CONFIG_SCHED_AUTOGROUP 1#ifdef CONFIG_SCHED_AUTOGROUP
2 2
3#include <linux/kref.h>
4#include <linux/rwsem.h>
5
3struct autogroup { 6struct autogroup {
4 /* 7 /*
5 * reference doesn't mean how many thread attach to this 8 * reference doesn't mean how many thread attach to this
@@ -13,9 +16,28 @@ struct autogroup {
13 int nice; 16 int nice;
14}; 17};
15 18
16static inline bool task_group_is_autogroup(struct task_group *tg); 19extern void autogroup_init(struct task_struct *init_task);
20extern void autogroup_free(struct task_group *tg);
21
22static inline bool task_group_is_autogroup(struct task_group *tg)
23{
24 return !!tg->autogroup;
25}
26
27extern bool task_wants_autogroup(struct task_struct *p, struct task_group *tg);
28
17static inline struct task_group * 29static inline struct task_group *
18autogroup_task_group(struct task_struct *p, struct task_group *tg); 30autogroup_task_group(struct task_struct *p, struct task_group *tg)
31{
32 int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
33
34 if (enabled && task_wants_autogroup(p, tg))
35 return p->signal->autogroup->tg;
36
37 return tg;
38}
39
40extern int autogroup_path(struct task_group *tg, char *buf, int buflen);
19 41
20#else /* !CONFIG_SCHED_AUTOGROUP */ 42#else /* !CONFIG_SCHED_AUTOGROUP */
21 43
diff --git a/kernel/sched_clock.c b/kernel/sched/clock.c
index c685e31492df..c685e31492df 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched/clock.c
diff --git a/kernel/sched.c b/kernel/sched/core.c
index a7f381a78469..2a4590fabcad 100644
--- a/kernel/sched.c
+++ b/kernel/sched/core.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * kernel/sched.c 2 * kernel/sched/core.c
3 * 3 *
4 * Kernel scheduler and related syscalls 4 * Kernel scheduler and related syscalls
5 * 5 *
@@ -56,7 +56,6 @@
56#include <linux/percpu.h> 56#include <linux/percpu.h>
57#include <linux/proc_fs.h> 57#include <linux/proc_fs.h>
58#include <linux/seq_file.h> 58#include <linux/seq_file.h>
59#include <linux/stop_machine.h>
60#include <linux/sysctl.h> 59#include <linux/sysctl.h>
61#include <linux/syscalls.h> 60#include <linux/syscalls.h>
62#include <linux/times.h> 61#include <linux/times.h>
@@ -75,129 +74,17 @@
75 74
76#include <asm/tlb.h> 75#include <asm/tlb.h>
77#include <asm/irq_regs.h> 76#include <asm/irq_regs.h>
78#include <asm/mutex.h>
79#ifdef CONFIG_PARAVIRT 77#ifdef CONFIG_PARAVIRT
80#include <asm/paravirt.h> 78#include <asm/paravirt.h>
81#endif 79#endif
82 80
83#include "sched_cpupri.h" 81#include "sched.h"
84#include "workqueue_sched.h" 82#include "../workqueue_sched.h"
85#include "sched_autogroup.h"
86 83
87#define CREATE_TRACE_POINTS 84#define CREATE_TRACE_POINTS
88#include <trace/events/sched.h> 85#include <trace/events/sched.h>
89 86
90/* 87void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
91 * Convert user-nice values [ -20 ... 0 ... 19 ]
92 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
93 * and back.
94 */
95#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
96#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
97#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
98
99/*
100 * 'User priority' is the nice value converted to something we
101 * can work with better when scaling various scheduler parameters,
102 * it's a [ 0 ... 39 ] range.
103 */
104#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
105#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
106#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
107
108/*
109 * Helpers for converting nanosecond timing to jiffy resolution
110 */
111#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
112
113#define NICE_0_LOAD SCHED_LOAD_SCALE
114#define NICE_0_SHIFT SCHED_LOAD_SHIFT
115
116/*
117 * These are the 'tuning knobs' of the scheduler:
118 *
119 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
120 * Timeslices get refilled after they expire.
121 */
122#define DEF_TIMESLICE (100 * HZ / 1000)
123
124/*
125 * single value that denotes runtime == period, ie unlimited time.
126 */
127#define RUNTIME_INF ((u64)~0ULL)
128
129static inline int rt_policy(int policy)
130{
131 if (policy == SCHED_FIFO || policy == SCHED_RR)
132 return 1;
133 return 0;
134}
135
136static inline int task_has_rt_policy(struct task_struct *p)
137{
138 return rt_policy(p->policy);
139}
140
141/*
142 * This is the priority-queue data structure of the RT scheduling class:
143 */
144struct rt_prio_array {
145 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
146 struct list_head queue[MAX_RT_PRIO];
147};
148
149struct rt_bandwidth {
150 /* nests inside the rq lock: */
151 raw_spinlock_t rt_runtime_lock;
152 ktime_t rt_period;
153 u64 rt_runtime;
154 struct hrtimer rt_period_timer;
155};
156
157static struct rt_bandwidth def_rt_bandwidth;
158
159static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
160
161static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
162{
163 struct rt_bandwidth *rt_b =
164 container_of(timer, struct rt_bandwidth, rt_period_timer);
165 ktime_t now;
166 int overrun;
167 int idle = 0;
168
169 for (;;) {
170 now = hrtimer_cb_get_time(timer);
171 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
172
173 if (!overrun)
174 break;
175
176 idle = do_sched_rt_period_timer(rt_b, overrun);
177 }
178
179 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
180}
181
182static
183void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
184{
185 rt_b->rt_period = ns_to_ktime(period);
186 rt_b->rt_runtime = runtime;
187
188 raw_spin_lock_init(&rt_b->rt_runtime_lock);
189
190 hrtimer_init(&rt_b->rt_period_timer,
191 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
192 rt_b->rt_period_timer.function = sched_rt_period_timer;
193}
194
195static inline int rt_bandwidth_enabled(void)
196{
197 return sysctl_sched_rt_runtime >= 0;
198}
199
200static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
201{ 88{
202 unsigned long delta; 89 unsigned long delta;
203 ktime_t soft, hard, now; 90 ktime_t soft, hard, now;
@@ -217,580 +104,12 @@ static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
217 } 104 }
218} 105}
219 106
220static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 107DEFINE_MUTEX(sched_domains_mutex);
221{ 108DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
222 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
223 return;
224
225 if (hrtimer_active(&rt_b->rt_period_timer))
226 return;
227
228 raw_spin_lock(&rt_b->rt_runtime_lock);
229 start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
230 raw_spin_unlock(&rt_b->rt_runtime_lock);
231}
232
233#ifdef CONFIG_RT_GROUP_SCHED
234static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
235{
236 hrtimer_cancel(&rt_b->rt_period_timer);
237}
238#endif
239
240/*
241 * sched_domains_mutex serializes calls to init_sched_domains,
242 * detach_destroy_domains and partition_sched_domains.
243 */
244static DEFINE_MUTEX(sched_domains_mutex);
245
246#ifdef CONFIG_CGROUP_SCHED
247
248#include <linux/cgroup.h>
249
250struct cfs_rq;
251
252static LIST_HEAD(task_groups);
253
254struct cfs_bandwidth {
255#ifdef CONFIG_CFS_BANDWIDTH
256 raw_spinlock_t lock;
257 ktime_t period;
258 u64 quota, runtime;
259 s64 hierarchal_quota;
260 u64 runtime_expires;
261
262 int idle, timer_active;
263 struct hrtimer period_timer, slack_timer;
264 struct list_head throttled_cfs_rq;
265
266 /* statistics */
267 int nr_periods, nr_throttled;
268 u64 throttled_time;
269#endif
270};
271
272/* task group related information */
273struct task_group {
274 struct cgroup_subsys_state css;
275
276#ifdef CONFIG_FAIR_GROUP_SCHED
277 /* schedulable entities of this group on each cpu */
278 struct sched_entity **se;
279 /* runqueue "owned" by this group on each cpu */
280 struct cfs_rq **cfs_rq;
281 unsigned long shares;
282
283 atomic_t load_weight;
284#endif
285
286#ifdef CONFIG_RT_GROUP_SCHED
287 struct sched_rt_entity **rt_se;
288 struct rt_rq **rt_rq;
289
290 struct rt_bandwidth rt_bandwidth;
291#endif
292
293 struct rcu_head rcu;
294 struct list_head list;
295
296 struct task_group *parent;
297 struct list_head siblings;
298 struct list_head children;
299
300#ifdef CONFIG_SCHED_AUTOGROUP
301 struct autogroup *autogroup;
302#endif
303
304 struct cfs_bandwidth cfs_bandwidth;
305};
306
307/* task_group_lock serializes the addition/removal of task groups */
308static DEFINE_SPINLOCK(task_group_lock);
309
310#ifdef CONFIG_FAIR_GROUP_SCHED
311
312# define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
313
314/*
315 * A weight of 0 or 1 can cause arithmetics problems.
316 * A weight of a cfs_rq is the sum of weights of which entities
317 * are queued on this cfs_rq, so a weight of a entity should not be
318 * too large, so as the shares value of a task group.
319 * (The default weight is 1024 - so there's no practical
320 * limitation from this.)
321 */
322#define MIN_SHARES (1UL << 1)
323#define MAX_SHARES (1UL << 18)
324
325static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
326#endif
327
328/* Default task group.
329 * Every task in system belong to this group at bootup.
330 */
331struct task_group root_task_group;
332
333#endif /* CONFIG_CGROUP_SCHED */
334
335/* CFS-related fields in a runqueue */
336struct cfs_rq {
337 struct load_weight load;
338 unsigned long nr_running, h_nr_running;
339
340 u64 exec_clock;
341 u64 min_vruntime;
342#ifndef CONFIG_64BIT
343 u64 min_vruntime_copy;
344#endif
345
346 struct rb_root tasks_timeline;
347 struct rb_node *rb_leftmost;
348
349 struct list_head tasks;
350 struct list_head *balance_iterator;
351
352 /*
353 * 'curr' points to currently running entity on this cfs_rq.
354 * It is set to NULL otherwise (i.e when none are currently running).
355 */
356 struct sched_entity *curr, *next, *last, *skip;
357
358#ifdef CONFIG_SCHED_DEBUG
359 unsigned int nr_spread_over;
360#endif
361
362#ifdef CONFIG_FAIR_GROUP_SCHED
363 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
364
365 /*
366 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
367 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
368 * (like users, containers etc.)
369 *
370 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
371 * list is used during load balance.
372 */
373 int on_list;
374 struct list_head leaf_cfs_rq_list;
375 struct task_group *tg; /* group that "owns" this runqueue */
376
377#ifdef CONFIG_SMP
378 /*
379 * the part of load.weight contributed by tasks
380 */
381 unsigned long task_weight;
382
383 /*
384 * h_load = weight * f(tg)
385 *
386 * Where f(tg) is the recursive weight fraction assigned to
387 * this group.
388 */
389 unsigned long h_load;
390
391 /*
392 * Maintaining per-cpu shares distribution for group scheduling
393 *
394 * load_stamp is the last time we updated the load average
395 * load_last is the last time we updated the load average and saw load
396 * load_unacc_exec_time is currently unaccounted execution time
397 */
398 u64 load_avg;
399 u64 load_period;
400 u64 load_stamp, load_last, load_unacc_exec_time;
401
402 unsigned long load_contribution;
403#endif
404#ifdef CONFIG_CFS_BANDWIDTH
405 int runtime_enabled;
406 u64 runtime_expires;
407 s64 runtime_remaining;
408
409 u64 throttled_timestamp;
410 int throttled, throttle_count;
411 struct list_head throttled_list;
412#endif
413#endif
414};
415
416#ifdef CONFIG_FAIR_GROUP_SCHED
417#ifdef CONFIG_CFS_BANDWIDTH
418static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
419{
420 return &tg->cfs_bandwidth;
421}
422
423static inline u64 default_cfs_period(void);
424static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
425static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
426
427static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
428{
429 struct cfs_bandwidth *cfs_b =
430 container_of(timer, struct cfs_bandwidth, slack_timer);
431 do_sched_cfs_slack_timer(cfs_b);
432
433 return HRTIMER_NORESTART;
434}
435
436static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
437{
438 struct cfs_bandwidth *cfs_b =
439 container_of(timer, struct cfs_bandwidth, period_timer);
440 ktime_t now;
441 int overrun;
442 int idle = 0;
443
444 for (;;) {
445 now = hrtimer_cb_get_time(timer);
446 overrun = hrtimer_forward(timer, now, cfs_b->period);
447
448 if (!overrun)
449 break;
450
451 idle = do_sched_cfs_period_timer(cfs_b, overrun);
452 }
453
454 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
455}
456
457static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
458{
459 raw_spin_lock_init(&cfs_b->lock);
460 cfs_b->runtime = 0;
461 cfs_b->quota = RUNTIME_INF;
462 cfs_b->period = ns_to_ktime(default_cfs_period());
463
464 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
465 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
466 cfs_b->period_timer.function = sched_cfs_period_timer;
467 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
468 cfs_b->slack_timer.function = sched_cfs_slack_timer;
469}
470
471static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
472{
473 cfs_rq->runtime_enabled = 0;
474 INIT_LIST_HEAD(&cfs_rq->throttled_list);
475}
476
477/* requires cfs_b->lock, may release to reprogram timer */
478static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
479{
480 /*
481 * The timer may be active because we're trying to set a new bandwidth
482 * period or because we're racing with the tear-down path
483 * (timer_active==0 becomes visible before the hrtimer call-back
484 * terminates). In either case we ensure that it's re-programmed
485 */
486 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
487 raw_spin_unlock(&cfs_b->lock);
488 /* ensure cfs_b->lock is available while we wait */
489 hrtimer_cancel(&cfs_b->period_timer);
490
491 raw_spin_lock(&cfs_b->lock);
492 /* if someone else restarted the timer then we're done */
493 if (cfs_b->timer_active)
494 return;
495 }
496
497 cfs_b->timer_active = 1;
498 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
499}
500
501static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
502{
503 hrtimer_cancel(&cfs_b->period_timer);
504 hrtimer_cancel(&cfs_b->slack_timer);
505}
506#else
507static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
508static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
509static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
510
511static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
512{
513 return NULL;
514}
515#endif /* CONFIG_CFS_BANDWIDTH */
516#endif /* CONFIG_FAIR_GROUP_SCHED */
517
518/* Real-Time classes' related field in a runqueue: */
519struct rt_rq {
520 struct rt_prio_array active;
521 unsigned long rt_nr_running;
522#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
523 struct {
524 int curr; /* highest queued rt task prio */
525#ifdef CONFIG_SMP
526 int next; /* next highest */
527#endif
528 } highest_prio;
529#endif
530#ifdef CONFIG_SMP
531 unsigned long rt_nr_migratory;
532 unsigned long rt_nr_total;
533 int overloaded;
534 struct plist_head pushable_tasks;
535#endif
536 int rt_throttled;
537 u64 rt_time;
538 u64 rt_runtime;
539 /* Nests inside the rq lock: */
540 raw_spinlock_t rt_runtime_lock;
541
542#ifdef CONFIG_RT_GROUP_SCHED
543 unsigned long rt_nr_boosted;
544
545 struct rq *rq;
546 struct list_head leaf_rt_rq_list;
547 struct task_group *tg;
548#endif
549};
550
551#ifdef CONFIG_SMP
552
553/*
554 * We add the notion of a root-domain which will be used to define per-domain
555 * variables. Each exclusive cpuset essentially defines an island domain by
556 * fully partitioning the member cpus from any other cpuset. Whenever a new
557 * exclusive cpuset is created, we also create and attach a new root-domain
558 * object.
559 *
560 */
561struct root_domain {
562 atomic_t refcount;
563 atomic_t rto_count;
564 struct rcu_head rcu;
565 cpumask_var_t span;
566 cpumask_var_t online;
567
568 /*
569 * The "RT overload" flag: it gets set if a CPU has more than
570 * one runnable RT task.
571 */
572 cpumask_var_t rto_mask;
573 struct cpupri cpupri;
574};
575
576/*
577 * By default the system creates a single root-domain with all cpus as
578 * members (mimicking the global state we have today).
579 */
580static struct root_domain def_root_domain;
581
582#endif /* CONFIG_SMP */
583
584/*
585 * This is the main, per-CPU runqueue data structure.
586 *
587 * Locking rule: those places that want to lock multiple runqueues
588 * (such as the load balancing or the thread migration code), lock
589 * acquire operations must be ordered by ascending &runqueue.
590 */
591struct rq {
592 /* runqueue lock: */
593 raw_spinlock_t lock;
594
595 /*
596 * nr_running and cpu_load should be in the same cacheline because
597 * remote CPUs use both these fields when doing load calculation.
598 */
599 unsigned long nr_running;
600 #define CPU_LOAD_IDX_MAX 5
601 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
602 unsigned long last_load_update_tick;
603#ifdef CONFIG_NO_HZ
604 u64 nohz_stamp;
605 unsigned char nohz_balance_kick;
606#endif
607 int skip_clock_update;
608
609 /* capture load from *all* tasks on this cpu: */
610 struct load_weight load;
611 unsigned long nr_load_updates;
612 u64 nr_switches;
613
614 struct cfs_rq cfs;
615 struct rt_rq rt;
616
617#ifdef CONFIG_FAIR_GROUP_SCHED
618 /* list of leaf cfs_rq on this cpu: */
619 struct list_head leaf_cfs_rq_list;
620#endif
621#ifdef CONFIG_RT_GROUP_SCHED
622 struct list_head leaf_rt_rq_list;
623#endif
624
625 /*
626 * This is part of a global counter where only the total sum
627 * over all CPUs matters. A task can increase this counter on
628 * one CPU and if it got migrated afterwards it may decrease
629 * it on another CPU. Always updated under the runqueue lock:
630 */
631 unsigned long nr_uninterruptible;
632
633 struct task_struct *curr, *idle, *stop;
634 unsigned long next_balance;
635 struct mm_struct *prev_mm;
636
637 u64 clock;
638 u64 clock_task;
639
640 atomic_t nr_iowait;
641
642#ifdef CONFIG_SMP
643 struct root_domain *rd;
644 struct sched_domain *sd;
645
646 unsigned long cpu_power;
647
648 unsigned char idle_balance;
649 /* For active balancing */
650 int post_schedule;
651 int active_balance;
652 int push_cpu;
653 struct cpu_stop_work active_balance_work;
654 /* cpu of this runqueue: */
655 int cpu;
656 int online;
657
658 u64 rt_avg;
659 u64 age_stamp;
660 u64 idle_stamp;
661 u64 avg_idle;
662#endif
663
664#ifdef CONFIG_IRQ_TIME_ACCOUNTING
665 u64 prev_irq_time;
666#endif
667#ifdef CONFIG_PARAVIRT
668 u64 prev_steal_time;
669#endif
670#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
671 u64 prev_steal_time_rq;
672#endif
673
674 /* calc_load related fields */
675 unsigned long calc_load_update;
676 long calc_load_active;
677
678#ifdef CONFIG_SCHED_HRTICK
679#ifdef CONFIG_SMP
680 int hrtick_csd_pending;
681 struct call_single_data hrtick_csd;
682#endif
683 struct hrtimer hrtick_timer;
684#endif
685
686#ifdef CONFIG_SCHEDSTATS
687 /* latency stats */
688 struct sched_info rq_sched_info;
689 unsigned long long rq_cpu_time;
690 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
691
692 /* sys_sched_yield() stats */
693 unsigned int yld_count;
694
695 /* schedule() stats */
696 unsigned int sched_switch;
697 unsigned int sched_count;
698 unsigned int sched_goidle;
699
700 /* try_to_wake_up() stats */
701 unsigned int ttwu_count;
702 unsigned int ttwu_local;
703#endif
704
705#ifdef CONFIG_SMP
706 struct llist_head wake_list;
707#endif
708};
709
710static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
711
712
713static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
714
715static inline int cpu_of(struct rq *rq)
716{
717#ifdef CONFIG_SMP
718 return rq->cpu;
719#else
720 return 0;
721#endif
722}
723
724#define rcu_dereference_check_sched_domain(p) \
725 rcu_dereference_check((p), \
726 lockdep_is_held(&sched_domains_mutex))
727
728/*
729 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
730 * See detach_destroy_domains: synchronize_sched for details.
731 *
732 * The domain tree of any CPU may only be accessed from within
733 * preempt-disabled sections.
734 */
735#define for_each_domain(cpu, __sd) \
736 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
737
738#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
739#define this_rq() (&__get_cpu_var(runqueues))
740#define task_rq(p) cpu_rq(task_cpu(p))
741#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
742#define raw_rq() (&__raw_get_cpu_var(runqueues))
743
744#ifdef CONFIG_CGROUP_SCHED
745
746/*
747 * Return the group to which this tasks belongs.
748 *
749 * We use task_subsys_state_check() and extend the RCU verification with
750 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
751 * task it moves into the cgroup. Therefore by holding either of those locks,
752 * we pin the task to the current cgroup.
753 */
754static inline struct task_group *task_group(struct task_struct *p)
755{
756 struct task_group *tg;
757 struct cgroup_subsys_state *css;
758
759 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
760 lockdep_is_held(&p->pi_lock) ||
761 lockdep_is_held(&task_rq(p)->lock));
762 tg = container_of(css, struct task_group, css);
763
764 return autogroup_task_group(p, tg);
765}
766
767/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
768static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
769{
770#ifdef CONFIG_FAIR_GROUP_SCHED
771 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
772 p->se.parent = task_group(p)->se[cpu];
773#endif
774
775#ifdef CONFIG_RT_GROUP_SCHED
776 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
777 p->rt.parent = task_group(p)->rt_se[cpu];
778#endif
779}
780
781#else /* CONFIG_CGROUP_SCHED */
782
783static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
784static inline struct task_group *task_group(struct task_struct *p)
785{
786 return NULL;
787}
788
789#endif /* CONFIG_CGROUP_SCHED */
790 109
791static void update_rq_clock_task(struct rq *rq, s64 delta); 110static void update_rq_clock_task(struct rq *rq, s64 delta);
792 111
793static void update_rq_clock(struct rq *rq) 112void update_rq_clock(struct rq *rq)
794{ 113{
795 s64 delta; 114 s64 delta;
796 115
@@ -803,44 +122,14 @@ static void update_rq_clock(struct rq *rq)
803} 122}
804 123
805/* 124/*
806 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
807 */
808#ifdef CONFIG_SCHED_DEBUG
809# define const_debug __read_mostly
810#else
811# define const_debug static const
812#endif
813
814/**
815 * runqueue_is_locked - Returns true if the current cpu runqueue is locked
816 * @cpu: the processor in question.
817 *
818 * This interface allows printk to be called with the runqueue lock
819 * held and know whether or not it is OK to wake up the klogd.
820 */
821int runqueue_is_locked(int cpu)
822{
823 return raw_spin_is_locked(&cpu_rq(cpu)->lock);
824}
825
826/*
827 * Debugging: various feature bits 125 * Debugging: various feature bits
828 */ 126 */
829 127
830#define SCHED_FEAT(name, enabled) \ 128#define SCHED_FEAT(name, enabled) \
831 __SCHED_FEAT_##name ,
832
833enum {
834#include "sched_features.h"
835};
836
837#undef SCHED_FEAT
838
839#define SCHED_FEAT(name, enabled) \
840 (1UL << __SCHED_FEAT_##name) * enabled | 129 (1UL << __SCHED_FEAT_##name) * enabled |
841 130
842const_debug unsigned int sysctl_sched_features = 131const_debug unsigned int sysctl_sched_features =
843#include "sched_features.h" 132#include "features.h"
844 0; 133 0;
845 134
846#undef SCHED_FEAT 135#undef SCHED_FEAT
@@ -850,7 +139,7 @@ const_debug unsigned int sysctl_sched_features =
850 #name , 139 #name ,
851 140
852static __read_mostly char *sched_feat_names[] = { 141static __read_mostly char *sched_feat_names[] = {
853#include "sched_features.h" 142#include "features.h"
854 NULL 143 NULL
855}; 144};
856 145
@@ -860,7 +149,7 @@ static int sched_feat_show(struct seq_file *m, void *v)
860{ 149{
861 int i; 150 int i;
862 151
863 for (i = 0; sched_feat_names[i]; i++) { 152 for (i = 0; i < __SCHED_FEAT_NR; i++) {
864 if (!(sysctl_sched_features & (1UL << i))) 153 if (!(sysctl_sched_features & (1UL << i)))
865 seq_puts(m, "NO_"); 154 seq_puts(m, "NO_");
866 seq_printf(m, "%s ", sched_feat_names[i]); 155 seq_printf(m, "%s ", sched_feat_names[i]);
@@ -870,6 +159,36 @@ static int sched_feat_show(struct seq_file *m, void *v)
870 return 0; 159 return 0;
871} 160}
872 161
162#ifdef HAVE_JUMP_LABEL
163
164#define jump_label_key__true jump_label_key_enabled
165#define jump_label_key__false jump_label_key_disabled
166
167#define SCHED_FEAT(name, enabled) \
168 jump_label_key__##enabled ,
169
170struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR] = {
171#include "features.h"
172};
173
174#undef SCHED_FEAT
175
176static void sched_feat_disable(int i)
177{
178 if (jump_label_enabled(&sched_feat_keys[i]))
179 jump_label_dec(&sched_feat_keys[i]);
180}
181
182static void sched_feat_enable(int i)
183{
184 if (!jump_label_enabled(&sched_feat_keys[i]))
185 jump_label_inc(&sched_feat_keys[i]);
186}
187#else
188static void sched_feat_disable(int i) { };
189static void sched_feat_enable(int i) { };
190#endif /* HAVE_JUMP_LABEL */
191
873static ssize_t 192static ssize_t
874sched_feat_write(struct file *filp, const char __user *ubuf, 193sched_feat_write(struct file *filp, const char __user *ubuf,
875 size_t cnt, loff_t *ppos) 194 size_t cnt, loff_t *ppos)
@@ -893,17 +212,20 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
893 cmp += 3; 212 cmp += 3;
894 } 213 }
895 214
896 for (i = 0; sched_feat_names[i]; i++) { 215 for (i = 0; i < __SCHED_FEAT_NR; i++) {
897 if (strcmp(cmp, sched_feat_names[i]) == 0) { 216 if (strcmp(cmp, sched_feat_names[i]) == 0) {
898 if (neg) 217 if (neg) {
899 sysctl_sched_features &= ~(1UL << i); 218 sysctl_sched_features &= ~(1UL << i);
900 else 219 sched_feat_disable(i);
220 } else {
901 sysctl_sched_features |= (1UL << i); 221 sysctl_sched_features |= (1UL << i);
222 sched_feat_enable(i);
223 }
902 break; 224 break;
903 } 225 }
904 } 226 }
905 227
906 if (!sched_feat_names[i]) 228 if (i == __SCHED_FEAT_NR)
907 return -EINVAL; 229 return -EINVAL;
908 230
909 *ppos += cnt; 231 *ppos += cnt;
@@ -932,10 +254,7 @@ static __init int sched_init_debug(void)
932 return 0; 254 return 0;
933} 255}
934late_initcall(sched_init_debug); 256late_initcall(sched_init_debug);
935 257#endif /* CONFIG_SCHED_DEBUG */
936#endif
937
938#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
939 258
940/* 259/*
941 * Number of tasks to iterate in a single balance run. 260 * Number of tasks to iterate in a single balance run.
@@ -957,7 +276,7 @@ const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
957 */ 276 */
958unsigned int sysctl_sched_rt_period = 1000000; 277unsigned int sysctl_sched_rt_period = 1000000;
959 278
960static __read_mostly int scheduler_running; 279__read_mostly int scheduler_running;
961 280
962/* 281/*
963 * part of the period that we allow rt tasks to run in us. 282 * part of the period that we allow rt tasks to run in us.
@@ -965,112 +284,7 @@ static __read_mostly int scheduler_running;
965 */ 284 */
966int sysctl_sched_rt_runtime = 950000; 285int sysctl_sched_rt_runtime = 950000;
967 286
968static inline u64 global_rt_period(void)
969{
970 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
971}
972 287
973static inline u64 global_rt_runtime(void)
974{
975 if (sysctl_sched_rt_runtime < 0)
976 return RUNTIME_INF;
977
978 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
979}
980
981#ifndef prepare_arch_switch
982# define prepare_arch_switch(next) do { } while (0)
983#endif
984#ifndef finish_arch_switch
985# define finish_arch_switch(prev) do { } while (0)
986#endif
987
988static inline int task_current(struct rq *rq, struct task_struct *p)
989{
990 return rq->curr == p;
991}
992
993static inline int task_running(struct rq *rq, struct task_struct *p)
994{
995#ifdef CONFIG_SMP
996 return p->on_cpu;
997#else
998 return task_current(rq, p);
999#endif
1000}
1001
1002#ifndef __ARCH_WANT_UNLOCKED_CTXSW
1003static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
1004{
1005#ifdef CONFIG_SMP
1006 /*
1007 * We can optimise this out completely for !SMP, because the
1008 * SMP rebalancing from interrupt is the only thing that cares
1009 * here.
1010 */
1011 next->on_cpu = 1;
1012#endif
1013}
1014
1015static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1016{
1017#ifdef CONFIG_SMP
1018 /*
1019 * After ->on_cpu is cleared, the task can be moved to a different CPU.
1020 * We must ensure this doesn't happen until the switch is completely
1021 * finished.
1022 */
1023 smp_wmb();
1024 prev->on_cpu = 0;
1025#endif
1026#ifdef CONFIG_DEBUG_SPINLOCK
1027 /* this is a valid case when another task releases the spinlock */
1028 rq->lock.owner = current;
1029#endif
1030 /*
1031 * If we are tracking spinlock dependencies then we have to
1032 * fix up the runqueue lock - which gets 'carried over' from
1033 * prev into current:
1034 */
1035 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
1036
1037 raw_spin_unlock_irq(&rq->lock);
1038}
1039
1040#else /* __ARCH_WANT_UNLOCKED_CTXSW */
1041static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
1042{
1043#ifdef CONFIG_SMP
1044 /*
1045 * We can optimise this out completely for !SMP, because the
1046 * SMP rebalancing from interrupt is the only thing that cares
1047 * here.
1048 */
1049 next->on_cpu = 1;
1050#endif
1051#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1052 raw_spin_unlock_irq(&rq->lock);
1053#else
1054 raw_spin_unlock(&rq->lock);
1055#endif
1056}
1057
1058static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
1059{
1060#ifdef CONFIG_SMP
1061 /*
1062 * After ->on_cpu is cleared, the task can be moved to a different CPU.
1063 * We must ensure this doesn't happen until the switch is completely
1064 * finished.
1065 */
1066 smp_wmb();
1067 prev->on_cpu = 0;
1068#endif
1069#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1070 local_irq_enable();
1071#endif
1072}
1073#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
1074 288
1075/* 289/*
1076 * __task_rq_lock - lock the rq @p resides on. 290 * __task_rq_lock - lock the rq @p resides on.
@@ -1153,20 +367,6 @@ static struct rq *this_rq_lock(void)
1153 * rq->lock. 367 * rq->lock.
1154 */ 368 */
1155 369
1156/*
1157 * Use hrtick when:
1158 * - enabled by features
1159 * - hrtimer is actually high res
1160 */
1161static inline int hrtick_enabled(struct rq *rq)
1162{
1163 if (!sched_feat(HRTICK))
1164 return 0;
1165 if (!cpu_active(cpu_of(rq)))
1166 return 0;
1167 return hrtimer_is_hres_active(&rq->hrtick_timer);
1168}
1169
1170static void hrtick_clear(struct rq *rq) 370static void hrtick_clear(struct rq *rq)
1171{ 371{
1172 if (hrtimer_active(&rq->hrtick_timer)) 372 if (hrtimer_active(&rq->hrtick_timer))
@@ -1210,7 +410,7 @@ static void __hrtick_start(void *arg)
1210 * 410 *
1211 * called with rq->lock held and irqs disabled 411 * called with rq->lock held and irqs disabled
1212 */ 412 */
1213static void hrtick_start(struct rq *rq, u64 delay) 413void hrtick_start(struct rq *rq, u64 delay)
1214{ 414{
1215 struct hrtimer *timer = &rq->hrtick_timer; 415 struct hrtimer *timer = &rq->hrtick_timer;
1216 ktime_t time = ktime_add_ns(timer->base->get_time(), delay); 416 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
@@ -1254,7 +454,7 @@ static __init void init_hrtick(void)
1254 * 454 *
1255 * called with rq->lock held and irqs disabled 455 * called with rq->lock held and irqs disabled
1256 */ 456 */
1257static void hrtick_start(struct rq *rq, u64 delay) 457void hrtick_start(struct rq *rq, u64 delay)
1258{ 458{
1259 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0, 459 __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
1260 HRTIMER_MODE_REL_PINNED, 0); 460 HRTIMER_MODE_REL_PINNED, 0);
@@ -1305,7 +505,7 @@ static inline void init_hrtick(void)
1305#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 505#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1306#endif 506#endif
1307 507
1308static void resched_task(struct task_struct *p) 508void resched_task(struct task_struct *p)
1309{ 509{
1310 int cpu; 510 int cpu;
1311 511
@@ -1326,7 +526,7 @@ static void resched_task(struct task_struct *p)
1326 smp_send_reschedule(cpu); 526 smp_send_reschedule(cpu);
1327} 527}
1328 528
1329static void resched_cpu(int cpu) 529void resched_cpu(int cpu)
1330{ 530{
1331 struct rq *rq = cpu_rq(cpu); 531 struct rq *rq = cpu_rq(cpu);
1332 unsigned long flags; 532 unsigned long flags;
@@ -1407,7 +607,8 @@ void wake_up_idle_cpu(int cpu)
1407 607
1408static inline bool got_nohz_idle_kick(void) 608static inline bool got_nohz_idle_kick(void)
1409{ 609{
1410 return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick; 610 int cpu = smp_processor_id();
611 return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
1411} 612}
1412 613
1413#else /* CONFIG_NO_HZ */ 614#else /* CONFIG_NO_HZ */
@@ -1419,12 +620,7 @@ static inline bool got_nohz_idle_kick(void)
1419 620
1420#endif /* CONFIG_NO_HZ */ 621#endif /* CONFIG_NO_HZ */
1421 622
1422static u64 sched_avg_period(void) 623void sched_avg_update(struct rq *rq)
1423{
1424 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
1425}
1426
1427static void sched_avg_update(struct rq *rq)
1428{ 624{
1429 s64 period = sched_avg_period(); 625 s64 period = sched_avg_period();
1430 626
@@ -1440,193 +636,23 @@ static void sched_avg_update(struct rq *rq)
1440 } 636 }
1441} 637}
1442 638
1443static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1444{
1445 rq->rt_avg += rt_delta;
1446 sched_avg_update(rq);
1447}
1448
1449#else /* !CONFIG_SMP */ 639#else /* !CONFIG_SMP */
1450static void resched_task(struct task_struct *p) 640void resched_task(struct task_struct *p)
1451{ 641{
1452 assert_raw_spin_locked(&task_rq(p)->lock); 642 assert_raw_spin_locked(&task_rq(p)->lock);
1453 set_tsk_need_resched(p); 643 set_tsk_need_resched(p);
1454} 644}
1455
1456static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
1457{
1458}
1459
1460static void sched_avg_update(struct rq *rq)
1461{
1462}
1463#endif /* CONFIG_SMP */ 645#endif /* CONFIG_SMP */
1464 646
1465#if BITS_PER_LONG == 32
1466# define WMULT_CONST (~0UL)
1467#else
1468# define WMULT_CONST (1UL << 32)
1469#endif
1470
1471#define WMULT_SHIFT 32
1472
1473/*
1474 * Shift right and round:
1475 */
1476#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1477
1478/*
1479 * delta *= weight / lw
1480 */
1481static unsigned long
1482calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1483 struct load_weight *lw)
1484{
1485 u64 tmp;
1486
1487 /*
1488 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
1489 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
1490 * 2^SCHED_LOAD_RESOLUTION.
1491 */
1492 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
1493 tmp = (u64)delta_exec * scale_load_down(weight);
1494 else
1495 tmp = (u64)delta_exec;
1496
1497 if (!lw->inv_weight) {
1498 unsigned long w = scale_load_down(lw->weight);
1499
1500 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
1501 lw->inv_weight = 1;
1502 else if (unlikely(!w))
1503 lw->inv_weight = WMULT_CONST;
1504 else
1505 lw->inv_weight = WMULT_CONST / w;
1506 }
1507
1508 /*
1509 * Check whether we'd overflow the 64-bit multiplication:
1510 */
1511 if (unlikely(tmp > WMULT_CONST))
1512 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
1513 WMULT_SHIFT/2);
1514 else
1515 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
1516
1517 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1518}
1519
1520static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1521{
1522 lw->weight += inc;
1523 lw->inv_weight = 0;
1524}
1525
1526static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
1527{
1528 lw->weight -= dec;
1529 lw->inv_weight = 0;
1530}
1531
1532static inline void update_load_set(struct load_weight *lw, unsigned long w)
1533{
1534 lw->weight = w;
1535 lw->inv_weight = 0;
1536}
1537
1538/*
1539 * To aid in avoiding the subversion of "niceness" due to uneven distribution
1540 * of tasks with abnormal "nice" values across CPUs the contribution that
1541 * each task makes to its run queue's load is weighted according to its
1542 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
1543 * scaled version of the new time slice allocation that they receive on time
1544 * slice expiry etc.
1545 */
1546
1547#define WEIGHT_IDLEPRIO 3
1548#define WMULT_IDLEPRIO 1431655765
1549
1550/*
1551 * Nice levels are multiplicative, with a gentle 10% change for every
1552 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
1553 * nice 1, it will get ~10% less CPU time than another CPU-bound task
1554 * that remained on nice 0.
1555 *
1556 * The "10% effect" is relative and cumulative: from _any_ nice level,
1557 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
1558 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
1559 * If a task goes up by ~10% and another task goes down by ~10% then
1560 * the relative distance between them is ~25%.)
1561 */
1562static const int prio_to_weight[40] = {
1563 /* -20 */ 88761, 71755, 56483, 46273, 36291,
1564 /* -15 */ 29154, 23254, 18705, 14949, 11916,
1565 /* -10 */ 9548, 7620, 6100, 4904, 3906,
1566 /* -5 */ 3121, 2501, 1991, 1586, 1277,
1567 /* 0 */ 1024, 820, 655, 526, 423,
1568 /* 5 */ 335, 272, 215, 172, 137,
1569 /* 10 */ 110, 87, 70, 56, 45,
1570 /* 15 */ 36, 29, 23, 18, 15,
1571};
1572
1573/*
1574 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
1575 *
1576 * In cases where the weight does not change often, we can use the
1577 * precalculated inverse to speed up arithmetics by turning divisions
1578 * into multiplications:
1579 */
1580static const u32 prio_to_wmult[40] = {
1581 /* -20 */ 48388, 59856, 76040, 92818, 118348,
1582 /* -15 */ 147320, 184698, 229616, 287308, 360437,
1583 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
1584 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
1585 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
1586 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
1587 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
1588 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
1589};
1590
1591/* Time spent by the tasks of the cpu accounting group executing in ... */
1592enum cpuacct_stat_index {
1593 CPUACCT_STAT_USER, /* ... user mode */
1594 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
1595
1596 CPUACCT_STAT_NSTATS,
1597};
1598
1599#ifdef CONFIG_CGROUP_CPUACCT
1600static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1601static void cpuacct_update_stats(struct task_struct *tsk,
1602 enum cpuacct_stat_index idx, cputime_t val);
1603#else
1604static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1605static inline void cpuacct_update_stats(struct task_struct *tsk,
1606 enum cpuacct_stat_index idx, cputime_t val) {}
1607#endif
1608
1609static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1610{
1611 update_load_add(&rq->load, load);
1612}
1613
1614static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1615{
1616 update_load_sub(&rq->load, load);
1617}
1618
1619#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \ 647#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
1620 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH))) 648 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
1621typedef int (*tg_visitor)(struct task_group *, void *);
1622
1623/* 649/*
1624 * Iterate task_group tree rooted at *from, calling @down when first entering a 650 * Iterate task_group tree rooted at *from, calling @down when first entering a
1625 * node and @up when leaving it for the final time. 651 * node and @up when leaving it for the final time.
1626 * 652 *
1627 * Caller must hold rcu_lock or sufficient equivalent. 653 * Caller must hold rcu_lock or sufficient equivalent.
1628 */ 654 */
1629static int walk_tg_tree_from(struct task_group *from, 655int walk_tg_tree_from(struct task_group *from,
1630 tg_visitor down, tg_visitor up, void *data) 656 tg_visitor down, tg_visitor up, void *data)
1631{ 657{
1632 struct task_group *parent, *child; 658 struct task_group *parent, *child;
@@ -1657,270 +683,13 @@ out:
1657 return ret; 683 return ret;
1658} 684}
1659 685
1660/* 686int tg_nop(struct task_group *tg, void *data)
1661 * Iterate the full tree, calling @down when first entering a node and @up when
1662 * leaving it for the final time.
1663 *
1664 * Caller must hold rcu_lock or sufficient equivalent.
1665 */
1666
1667static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1668{
1669 return walk_tg_tree_from(&root_task_group, down, up, data);
1670}
1671
1672static int tg_nop(struct task_group *tg, void *data)
1673{ 687{
1674 return 0; 688 return 0;
1675} 689}
1676#endif 690#endif
1677 691
1678#ifdef CONFIG_SMP 692void update_cpu_load(struct rq *this_rq);
1679/* Used instead of source_load when we know the type == 0 */
1680static unsigned long weighted_cpuload(const int cpu)
1681{
1682 return cpu_rq(cpu)->load.weight;
1683}
1684
1685/*
1686 * Return a low guess at the load of a migration-source cpu weighted
1687 * according to the scheduling class and "nice" value.
1688 *
1689 * We want to under-estimate the load of migration sources, to
1690 * balance conservatively.
1691 */
1692static unsigned long source_load(int cpu, int type)
1693{
1694 struct rq *rq = cpu_rq(cpu);
1695 unsigned long total = weighted_cpuload(cpu);
1696
1697 if (type == 0 || !sched_feat(LB_BIAS))
1698 return total;
1699
1700 return min(rq->cpu_load[type-1], total);
1701}
1702
1703/*
1704 * Return a high guess at the load of a migration-target cpu weighted
1705 * according to the scheduling class and "nice" value.
1706 */
1707static unsigned long target_load(int cpu, int type)
1708{
1709 struct rq *rq = cpu_rq(cpu);
1710 unsigned long total = weighted_cpuload(cpu);
1711
1712 if (type == 0 || !sched_feat(LB_BIAS))
1713 return total;
1714
1715 return max(rq->cpu_load[type-1], total);
1716}
1717
1718static unsigned long power_of(int cpu)
1719{
1720 return cpu_rq(cpu)->cpu_power;
1721}
1722
1723static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1724
1725static unsigned long cpu_avg_load_per_task(int cpu)
1726{
1727 struct rq *rq = cpu_rq(cpu);
1728 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1729
1730 if (nr_running)
1731 return rq->load.weight / nr_running;
1732
1733 return 0;
1734}
1735
1736#ifdef CONFIG_PREEMPT
1737
1738static void double_rq_lock(struct rq *rq1, struct rq *rq2);
1739
1740/*
1741 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1742 * way at the expense of forcing extra atomic operations in all
1743 * invocations. This assures that the double_lock is acquired using the
1744 * same underlying policy as the spinlock_t on this architecture, which
1745 * reduces latency compared to the unfair variant below. However, it
1746 * also adds more overhead and therefore may reduce throughput.
1747 */
1748static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1749 __releases(this_rq->lock)
1750 __acquires(busiest->lock)
1751 __acquires(this_rq->lock)
1752{
1753 raw_spin_unlock(&this_rq->lock);
1754 double_rq_lock(this_rq, busiest);
1755
1756 return 1;
1757}
1758
1759#else
1760/*
1761 * Unfair double_lock_balance: Optimizes throughput at the expense of
1762 * latency by eliminating extra atomic operations when the locks are
1763 * already in proper order on entry. This favors lower cpu-ids and will
1764 * grant the double lock to lower cpus over higher ids under contention,
1765 * regardless of entry order into the function.
1766 */
1767static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1768 __releases(this_rq->lock)
1769 __acquires(busiest->lock)
1770 __acquires(this_rq->lock)
1771{
1772 int ret = 0;
1773
1774 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1775 if (busiest < this_rq) {
1776 raw_spin_unlock(&this_rq->lock);
1777 raw_spin_lock(&busiest->lock);
1778 raw_spin_lock_nested(&this_rq->lock,
1779 SINGLE_DEPTH_NESTING);
1780 ret = 1;
1781 } else
1782 raw_spin_lock_nested(&busiest->lock,
1783 SINGLE_DEPTH_NESTING);
1784 }
1785 return ret;
1786}
1787
1788#endif /* CONFIG_PREEMPT */
1789
1790/*
1791 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1792 */
1793static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1794{
1795 if (unlikely(!irqs_disabled())) {
1796 /* printk() doesn't work good under rq->lock */
1797 raw_spin_unlock(&this_rq->lock);
1798 BUG_ON(1);
1799 }
1800
1801 return _double_lock_balance(this_rq, busiest);
1802}
1803
1804static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1805 __releases(busiest->lock)
1806{
1807 raw_spin_unlock(&busiest->lock);
1808 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1809}
1810
1811/*
1812 * double_rq_lock - safely lock two runqueues
1813 *
1814 * Note this does not disable interrupts like task_rq_lock,
1815 * you need to do so manually before calling.
1816 */
1817static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1818 __acquires(rq1->lock)
1819 __acquires(rq2->lock)
1820{
1821 BUG_ON(!irqs_disabled());
1822 if (rq1 == rq2) {
1823 raw_spin_lock(&rq1->lock);
1824 __acquire(rq2->lock); /* Fake it out ;) */
1825 } else {
1826 if (rq1 < rq2) {
1827 raw_spin_lock(&rq1->lock);
1828 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1829 } else {
1830 raw_spin_lock(&rq2->lock);
1831 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1832 }
1833 }
1834}
1835
1836/*
1837 * double_rq_unlock - safely unlock two runqueues
1838 *
1839 * Note this does not restore interrupts like task_rq_unlock,
1840 * you need to do so manually after calling.
1841 */
1842static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1843 __releases(rq1->lock)
1844 __releases(rq2->lock)
1845{
1846 raw_spin_unlock(&rq1->lock);
1847 if (rq1 != rq2)
1848 raw_spin_unlock(&rq2->lock);
1849 else
1850 __release(rq2->lock);
1851}
1852
1853#else /* CONFIG_SMP */
1854
1855/*
1856 * double_rq_lock - safely lock two runqueues
1857 *
1858 * Note this does not disable interrupts like task_rq_lock,
1859 * you need to do so manually before calling.
1860 */
1861static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1862 __acquires(rq1->lock)
1863 __acquires(rq2->lock)
1864{
1865 BUG_ON(!irqs_disabled());
1866 BUG_ON(rq1 != rq2);
1867 raw_spin_lock(&rq1->lock);
1868 __acquire(rq2->lock); /* Fake it out ;) */
1869}
1870
1871/*
1872 * double_rq_unlock - safely unlock two runqueues
1873 *
1874 * Note this does not restore interrupts like task_rq_unlock,
1875 * you need to do so manually after calling.
1876 */
1877static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1878 __releases(rq1->lock)
1879 __releases(rq2->lock)
1880{
1881 BUG_ON(rq1 != rq2);
1882 raw_spin_unlock(&rq1->lock);
1883 __release(rq2->lock);
1884}
1885
1886#endif
1887
1888static void calc_load_account_idle(struct rq *this_rq);
1889static void update_sysctl(void);
1890static int get_update_sysctl_factor(void);
1891static void update_cpu_load(struct rq *this_rq);
1892
1893static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1894{
1895 set_task_rq(p, cpu);
1896#ifdef CONFIG_SMP
1897 /*
1898 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1899 * successfully executed on another CPU. We must ensure that updates of
1900 * per-task data have been completed by this moment.
1901 */
1902 smp_wmb();
1903 task_thread_info(p)->cpu = cpu;
1904#endif
1905}
1906
1907static const struct sched_class rt_sched_class;
1908
1909#define sched_class_highest (&stop_sched_class)
1910#define for_each_class(class) \
1911 for (class = sched_class_highest; class; class = class->next)
1912
1913#include "sched_stats.h"
1914
1915static void inc_nr_running(struct rq *rq)
1916{
1917 rq->nr_running++;
1918}
1919
1920static void dec_nr_running(struct rq *rq)
1921{
1922 rq->nr_running--;
1923}
1924 693
1925static void set_load_weight(struct task_struct *p) 694static void set_load_weight(struct task_struct *p)
1926{ 695{
@@ -1957,7 +726,7 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1957/* 726/*
1958 * activate_task - move a task to the runqueue. 727 * activate_task - move a task to the runqueue.
1959 */ 728 */
1960static void activate_task(struct rq *rq, struct task_struct *p, int flags) 729void activate_task(struct rq *rq, struct task_struct *p, int flags)
1961{ 730{
1962 if (task_contributes_to_load(p)) 731 if (task_contributes_to_load(p))
1963 rq->nr_uninterruptible--; 732 rq->nr_uninterruptible--;
@@ -1968,7 +737,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1968/* 737/*
1969 * deactivate_task - remove a task from the runqueue. 738 * deactivate_task - remove a task from the runqueue.
1970 */ 739 */
1971static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) 740void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1972{ 741{
1973 if (task_contributes_to_load(p)) 742 if (task_contributes_to_load(p))
1974 rq->nr_uninterruptible++; 743 rq->nr_uninterruptible++;
@@ -2159,14 +928,14 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
2159#ifdef CONFIG_IRQ_TIME_ACCOUNTING 928#ifdef CONFIG_IRQ_TIME_ACCOUNTING
2160static int irqtime_account_hi_update(void) 929static int irqtime_account_hi_update(void)
2161{ 930{
2162 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 931 u64 *cpustat = kcpustat_this_cpu->cpustat;
2163 unsigned long flags; 932 unsigned long flags;
2164 u64 latest_ns; 933 u64 latest_ns;
2165 int ret = 0; 934 int ret = 0;
2166 935
2167 local_irq_save(flags); 936 local_irq_save(flags);
2168 latest_ns = this_cpu_read(cpu_hardirq_time); 937 latest_ns = this_cpu_read(cpu_hardirq_time);
2169 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq)) 938 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_IRQ])
2170 ret = 1; 939 ret = 1;
2171 local_irq_restore(flags); 940 local_irq_restore(flags);
2172 return ret; 941 return ret;
@@ -2174,14 +943,14 @@ static int irqtime_account_hi_update(void)
2174 943
2175static int irqtime_account_si_update(void) 944static int irqtime_account_si_update(void)
2176{ 945{
2177 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 946 u64 *cpustat = kcpustat_this_cpu->cpustat;
2178 unsigned long flags; 947 unsigned long flags;
2179 u64 latest_ns; 948 u64 latest_ns;
2180 int ret = 0; 949 int ret = 0;
2181 950
2182 local_irq_save(flags); 951 local_irq_save(flags);
2183 latest_ns = this_cpu_read(cpu_softirq_time); 952 latest_ns = this_cpu_read(cpu_softirq_time);
2184 if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq)) 953 if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
2185 ret = 1; 954 ret = 1;
2186 local_irq_restore(flags); 955 local_irq_restore(flags);
2187 return ret; 956 return ret;
@@ -2193,15 +962,6 @@ static int irqtime_account_si_update(void)
2193 962
2194#endif 963#endif
2195 964
2196#include "sched_idletask.c"
2197#include "sched_fair.c"
2198#include "sched_rt.c"
2199#include "sched_autogroup.c"
2200#include "sched_stoptask.c"
2201#ifdef CONFIG_SCHED_DEBUG
2202# include "sched_debug.c"
2203#endif
2204
2205void sched_set_stop_task(int cpu, struct task_struct *stop) 965void sched_set_stop_task(int cpu, struct task_struct *stop)
2206{ 966{
2207 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; 967 struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
@@ -2299,7 +1059,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
2299 p->sched_class->prio_changed(rq, p, oldprio); 1059 p->sched_class->prio_changed(rq, p, oldprio);
2300} 1060}
2301 1061
2302static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) 1062void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2303{ 1063{
2304 const struct sched_class *class; 1064 const struct sched_class *class;
2305 1065
@@ -2325,38 +1085,6 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
2325} 1085}
2326 1086
2327#ifdef CONFIG_SMP 1087#ifdef CONFIG_SMP
2328/*
2329 * Is this task likely cache-hot:
2330 */
2331static int
2332task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
2333{
2334 s64 delta;
2335
2336 if (p->sched_class != &fair_sched_class)
2337 return 0;
2338
2339 if (unlikely(p->policy == SCHED_IDLE))
2340 return 0;
2341
2342 /*
2343 * Buddy candidates are cache hot:
2344 */
2345 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
2346 (&p->se == cfs_rq_of(&p->se)->next ||
2347 &p->se == cfs_rq_of(&p->se)->last))
2348 return 1;
2349
2350 if (sysctl_sched_migration_cost == -1)
2351 return 1;
2352 if (sysctl_sched_migration_cost == 0)
2353 return 0;
2354
2355 delta = now - p->se.exec_start;
2356
2357 return delta < (s64)sysctl_sched_migration_cost;
2358}
2359
2360void set_task_cpu(struct task_struct *p, unsigned int new_cpu) 1088void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2361{ 1089{
2362#ifdef CONFIG_SCHED_DEBUG 1090#ifdef CONFIG_SCHED_DEBUG
@@ -2783,6 +1511,11 @@ static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
2783 1511
2784} 1512}
2785#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 1513#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
1514
1515static inline int ttwu_share_cache(int this_cpu, int that_cpu)
1516{
1517 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1518}
2786#endif /* CONFIG_SMP */ 1519#endif /* CONFIG_SMP */
2787 1520
2788static void ttwu_queue(struct task_struct *p, int cpu) 1521static void ttwu_queue(struct task_struct *p, int cpu)
@@ -2790,7 +1523,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
2790 struct rq *rq = cpu_rq(cpu); 1523 struct rq *rq = cpu_rq(cpu);
2791 1524
2792#if defined(CONFIG_SMP) 1525#if defined(CONFIG_SMP)
2793 if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) { 1526 if (sched_feat(TTWU_QUEUE) && !ttwu_share_cache(smp_processor_id(), cpu)) {
2794 sched_clock_cpu(cpu); /* sync clocks x-cpu */ 1527 sched_clock_cpu(cpu); /* sync clocks x-cpu */
2795 ttwu_queue_remote(p, cpu); 1528 ttwu_queue_remote(p, cpu);
2796 return; 1529 return;
@@ -3204,6 +1937,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
3204 local_irq_enable(); 1937 local_irq_enable();
3205#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 1938#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
3206 finish_lock_switch(rq, prev); 1939 finish_lock_switch(rq, prev);
1940 trace_sched_stat_sleeptime(current, rq->clock);
3207 1941
3208 fire_sched_in_preempt_notifiers(current); 1942 fire_sched_in_preempt_notifiers(current);
3209 if (mm) 1943 if (mm)
@@ -3439,7 +2173,7 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
3439 */ 2173 */
3440static atomic_long_t calc_load_tasks_idle; 2174static atomic_long_t calc_load_tasks_idle;
3441 2175
3442static void calc_load_account_idle(struct rq *this_rq) 2176void calc_load_account_idle(struct rq *this_rq)
3443{ 2177{
3444 long delta; 2178 long delta;
3445 2179
@@ -3583,7 +2317,7 @@ static void calc_global_nohz(unsigned long ticks)
3583 */ 2317 */
3584} 2318}
3585#else 2319#else
3586static void calc_load_account_idle(struct rq *this_rq) 2320void calc_load_account_idle(struct rq *this_rq)
3587{ 2321{
3588} 2322}
3589 2323
@@ -3726,7 +2460,7 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
3726 * scheduler tick (TICK_NSEC). With tickless idle this will not be called 2460 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
3727 * every tick. We fix it up based on jiffies. 2461 * every tick. We fix it up based on jiffies.
3728 */ 2462 */
3729static void update_cpu_load(struct rq *this_rq) 2463void update_cpu_load(struct rq *this_rq)
3730{ 2464{
3731 unsigned long this_load = this_rq->load.weight; 2465 unsigned long this_load = this_rq->load.weight;
3732 unsigned long curr_jiffies = jiffies; 2466 unsigned long curr_jiffies = jiffies;
@@ -3804,8 +2538,10 @@ unlock:
3804#endif 2538#endif
3805 2539
3806DEFINE_PER_CPU(struct kernel_stat, kstat); 2540DEFINE_PER_CPU(struct kernel_stat, kstat);
2541DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
3807 2542
3808EXPORT_PER_CPU_SYMBOL(kstat); 2543EXPORT_PER_CPU_SYMBOL(kstat);
2544EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
3809 2545
3810/* 2546/*
3811 * Return any ns on the sched_clock that have not yet been accounted in 2547 * Return any ns on the sched_clock that have not yet been accounted in
@@ -3858,6 +2594,42 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3858 return ns; 2594 return ns;
3859} 2595}
3860 2596
2597#ifdef CONFIG_CGROUP_CPUACCT
2598struct cgroup_subsys cpuacct_subsys;
2599struct cpuacct root_cpuacct;
2600#endif
2601
2602static inline void task_group_account_field(struct task_struct *p, int index,
2603 u64 tmp)
2604{
2605#ifdef CONFIG_CGROUP_CPUACCT
2606 struct kernel_cpustat *kcpustat;
2607 struct cpuacct *ca;
2608#endif
2609 /*
2610 * Since all updates are sure to touch the root cgroup, we
2611 * get ourselves ahead and touch it first. If the root cgroup
2612 * is the only cgroup, then nothing else should be necessary.
2613 *
2614 */
2615 __get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
2616
2617#ifdef CONFIG_CGROUP_CPUACCT
2618 if (unlikely(!cpuacct_subsys.active))
2619 return;
2620
2621 rcu_read_lock();
2622 ca = task_ca(p);
2623 while (ca && (ca != &root_cpuacct)) {
2624 kcpustat = this_cpu_ptr(ca->cpustat);
2625 kcpustat->cpustat[index] += tmp;
2626 ca = parent_ca(ca);
2627 }
2628 rcu_read_unlock();
2629#endif
2630}
2631
2632
3861/* 2633/*
3862 * Account user cpu time to a process. 2634 * Account user cpu time to a process.
3863 * @p: the process that the cpu time gets accounted to 2635 * @p: the process that the cpu time gets accounted to
@@ -3867,22 +2639,18 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3867void account_user_time(struct task_struct *p, cputime_t cputime, 2639void account_user_time(struct task_struct *p, cputime_t cputime,
3868 cputime_t cputime_scaled) 2640 cputime_t cputime_scaled)
3869{ 2641{
3870 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2642 int index;
3871 cputime64_t tmp;
3872 2643
3873 /* Add user time to process. */ 2644 /* Add user time to process. */
3874 p->utime = cputime_add(p->utime, cputime); 2645 p->utime += cputime;
3875 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 2646 p->utimescaled += cputime_scaled;
3876 account_group_user_time(p, cputime); 2647 account_group_user_time(p, cputime);
3877 2648
2649 index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
2650
3878 /* Add user time to cpustat. */ 2651 /* Add user time to cpustat. */
3879 tmp = cputime_to_cputime64(cputime); 2652 task_group_account_field(p, index, (__force u64) cputime);
3880 if (TASK_NICE(p) > 0)
3881 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3882 else
3883 cpustat->user = cputime64_add(cpustat->user, tmp);
3884 2653
3885 cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
3886 /* Account for user time used */ 2654 /* Account for user time used */
3887 acct_update_integrals(p); 2655 acct_update_integrals(p);
3888} 2656}
@@ -3896,24 +2664,21 @@ void account_user_time(struct task_struct *p, cputime_t cputime,
3896static void account_guest_time(struct task_struct *p, cputime_t cputime, 2664static void account_guest_time(struct task_struct *p, cputime_t cputime,
3897 cputime_t cputime_scaled) 2665 cputime_t cputime_scaled)
3898{ 2666{
3899 cputime64_t tmp; 2667 u64 *cpustat = kcpustat_this_cpu->cpustat;
3900 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3901
3902 tmp = cputime_to_cputime64(cputime);
3903 2668
3904 /* Add guest time to process. */ 2669 /* Add guest time to process. */
3905 p->utime = cputime_add(p->utime, cputime); 2670 p->utime += cputime;
3906 p->utimescaled = cputime_add(p->utimescaled, cputime_scaled); 2671 p->utimescaled += cputime_scaled;
3907 account_group_user_time(p, cputime); 2672 account_group_user_time(p, cputime);
3908 p->gtime = cputime_add(p->gtime, cputime); 2673 p->gtime += cputime;
3909 2674
3910 /* Add guest time to cpustat. */ 2675 /* Add guest time to cpustat. */
3911 if (TASK_NICE(p) > 0) { 2676 if (TASK_NICE(p) > 0) {
3912 cpustat->nice = cputime64_add(cpustat->nice, tmp); 2677 cpustat[CPUTIME_NICE] += (__force u64) cputime;
3913 cpustat->guest_nice = cputime64_add(cpustat->guest_nice, tmp); 2678 cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
3914 } else { 2679 } else {
3915 cpustat->user = cputime64_add(cpustat->user, tmp); 2680 cpustat[CPUTIME_USER] += (__force u64) cputime;
3916 cpustat->guest = cputime64_add(cpustat->guest, tmp); 2681 cpustat[CPUTIME_GUEST] += (__force u64) cputime;
3917 } 2682 }
3918} 2683}
3919 2684
@@ -3926,18 +2691,15 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
3926 */ 2691 */
3927static inline 2692static inline
3928void __account_system_time(struct task_struct *p, cputime_t cputime, 2693void __account_system_time(struct task_struct *p, cputime_t cputime,
3929 cputime_t cputime_scaled, cputime64_t *target_cputime64) 2694 cputime_t cputime_scaled, int index)
3930{ 2695{
3931 cputime64_t tmp = cputime_to_cputime64(cputime);
3932
3933 /* Add system time to process. */ 2696 /* Add system time to process. */
3934 p->stime = cputime_add(p->stime, cputime); 2697 p->stime += cputime;
3935 p->stimescaled = cputime_add(p->stimescaled, cputime_scaled); 2698 p->stimescaled += cputime_scaled;
3936 account_group_system_time(p, cputime); 2699 account_group_system_time(p, cputime);
3937 2700
3938 /* Add system time to cpustat. */ 2701 /* Add system time to cpustat. */
3939 *target_cputime64 = cputime64_add(*target_cputime64, tmp); 2702 task_group_account_field(p, index, (__force u64) cputime);
3940 cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
3941 2703
3942 /* Account for system time used */ 2704 /* Account for system time used */
3943 acct_update_integrals(p); 2705 acct_update_integrals(p);
@@ -3953,8 +2715,7 @@ void __account_system_time(struct task_struct *p, cputime_t cputime,
3953void account_system_time(struct task_struct *p, int hardirq_offset, 2715void account_system_time(struct task_struct *p, int hardirq_offset,
3954 cputime_t cputime, cputime_t cputime_scaled) 2716 cputime_t cputime, cputime_t cputime_scaled)
3955{ 2717{
3956 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2718 int index;
3957 cputime64_t *target_cputime64;
3958 2719
3959 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { 2720 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
3960 account_guest_time(p, cputime, cputime_scaled); 2721 account_guest_time(p, cputime, cputime_scaled);
@@ -3962,13 +2723,13 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3962 } 2723 }
3963 2724
3964 if (hardirq_count() - hardirq_offset) 2725 if (hardirq_count() - hardirq_offset)
3965 target_cputime64 = &cpustat->irq; 2726 index = CPUTIME_IRQ;
3966 else if (in_serving_softirq()) 2727 else if (in_serving_softirq())
3967 target_cputime64 = &cpustat->softirq; 2728 index = CPUTIME_SOFTIRQ;
3968 else 2729 else
3969 target_cputime64 = &cpustat->system; 2730 index = CPUTIME_SYSTEM;
3970 2731
3971 __account_system_time(p, cputime, cputime_scaled, target_cputime64); 2732 __account_system_time(p, cputime, cputime_scaled, index);
3972} 2733}
3973 2734
3974/* 2735/*
@@ -3977,10 +2738,9 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3977 */ 2738 */
3978void account_steal_time(cputime_t cputime) 2739void account_steal_time(cputime_t cputime)
3979{ 2740{
3980 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2741 u64 *cpustat = kcpustat_this_cpu->cpustat;
3981 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3982 2742
3983 cpustat->steal = cputime64_add(cpustat->steal, cputime64); 2743 cpustat[CPUTIME_STEAL] += (__force u64) cputime;
3984} 2744}
3985 2745
3986/* 2746/*
@@ -3989,14 +2749,13 @@ void account_steal_time(cputime_t cputime)
3989 */ 2749 */
3990void account_idle_time(cputime_t cputime) 2750void account_idle_time(cputime_t cputime)
3991{ 2751{
3992 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2752 u64 *cpustat = kcpustat_this_cpu->cpustat;
3993 cputime64_t cputime64 = cputime_to_cputime64(cputime);
3994 struct rq *rq = this_rq(); 2753 struct rq *rq = this_rq();
3995 2754
3996 if (atomic_read(&rq->nr_iowait) > 0) 2755 if (atomic_read(&rq->nr_iowait) > 0)
3997 cpustat->iowait = cputime64_add(cpustat->iowait, cputime64); 2756 cpustat[CPUTIME_IOWAIT] += (__force u64) cputime;
3998 else 2757 else
3999 cpustat->idle = cputime64_add(cpustat->idle, cputime64); 2758 cpustat[CPUTIME_IDLE] += (__force u64) cputime;
4000} 2759}
4001 2760
4002static __always_inline bool steal_account_process_tick(void) 2761static __always_inline bool steal_account_process_tick(void)
@@ -4046,16 +2805,15 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
4046 struct rq *rq) 2805 struct rq *rq)
4047{ 2806{
4048 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); 2807 cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
4049 cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy); 2808 u64 *cpustat = kcpustat_this_cpu->cpustat;
4050 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
4051 2809
4052 if (steal_account_process_tick()) 2810 if (steal_account_process_tick())
4053 return; 2811 return;
4054 2812
4055 if (irqtime_account_hi_update()) { 2813 if (irqtime_account_hi_update()) {
4056 cpustat->irq = cputime64_add(cpustat->irq, tmp); 2814 cpustat[CPUTIME_IRQ] += (__force u64) cputime_one_jiffy;
4057 } else if (irqtime_account_si_update()) { 2815 } else if (irqtime_account_si_update()) {
4058 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 2816 cpustat[CPUTIME_SOFTIRQ] += (__force u64) cputime_one_jiffy;
4059 } else if (this_cpu_ksoftirqd() == p) { 2817 } else if (this_cpu_ksoftirqd() == p) {
4060 /* 2818 /*
4061 * ksoftirqd time do not get accounted in cpu_softirq_time. 2819 * ksoftirqd time do not get accounted in cpu_softirq_time.
@@ -4063,7 +2821,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
4063 * Also, p->stime needs to be updated for ksoftirqd. 2821 * Also, p->stime needs to be updated for ksoftirqd.
4064 */ 2822 */
4065 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 2823 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
4066 &cpustat->softirq); 2824 CPUTIME_SOFTIRQ);
4067 } else if (user_tick) { 2825 } else if (user_tick) {
4068 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled); 2826 account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
4069 } else if (p == rq->idle) { 2827 } else if (p == rq->idle) {
@@ -4072,7 +2830,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
4072 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled); 2830 account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
4073 } else { 2831 } else {
4074 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled, 2832 __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
4075 &cpustat->system); 2833 CPUTIME_SYSTEM);
4076 } 2834 }
4077} 2835}
4078 2836
@@ -4171,7 +2929,7 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4171 2929
4172void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st) 2930void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4173{ 2931{
4174 cputime_t rtime, utime = p->utime, total = cputime_add(utime, p->stime); 2932 cputime_t rtime, utime = p->utime, total = utime + p->stime;
4175 2933
4176 /* 2934 /*
4177 * Use CFS's precise accounting: 2935 * Use CFS's precise accounting:
@@ -4179,11 +2937,11 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4179 rtime = nsecs_to_cputime(p->se.sum_exec_runtime); 2937 rtime = nsecs_to_cputime(p->se.sum_exec_runtime);
4180 2938
4181 if (total) { 2939 if (total) {
4182 u64 temp = rtime; 2940 u64 temp = (__force u64) rtime;
4183 2941
4184 temp *= utime; 2942 temp *= (__force u64) utime;
4185 do_div(temp, total); 2943 do_div(temp, (__force u32) total);
4186 utime = (cputime_t)temp; 2944 utime = (__force cputime_t) temp;
4187 } else 2945 } else
4188 utime = rtime; 2946 utime = rtime;
4189 2947
@@ -4191,7 +2949,7 @@ void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4191 * Compare with previous values, to keep monotonicity: 2949 * Compare with previous values, to keep monotonicity:
4192 */ 2950 */
4193 p->prev_utime = max(p->prev_utime, utime); 2951 p->prev_utime = max(p->prev_utime, utime);
4194 p->prev_stime = max(p->prev_stime, cputime_sub(rtime, p->prev_utime)); 2952 p->prev_stime = max(p->prev_stime, rtime - p->prev_utime);
4195 2953
4196 *ut = p->prev_utime; 2954 *ut = p->prev_utime;
4197 *st = p->prev_stime; 2955 *st = p->prev_stime;
@@ -4208,21 +2966,20 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
4208 2966
4209 thread_group_cputime(p, &cputime); 2967 thread_group_cputime(p, &cputime);
4210 2968
4211 total = cputime_add(cputime.utime, cputime.stime); 2969 total = cputime.utime + cputime.stime;
4212 rtime = nsecs_to_cputime(cputime.sum_exec_runtime); 2970 rtime = nsecs_to_cputime(cputime.sum_exec_runtime);
4213 2971
4214 if (total) { 2972 if (total) {
4215 u64 temp = rtime; 2973 u64 temp = (__force u64) rtime;
4216 2974
4217 temp *= cputime.utime; 2975 temp *= (__force u64) cputime.utime;
4218 do_div(temp, total); 2976 do_div(temp, (__force u32) total);
4219 utime = (cputime_t)temp; 2977 utime = (__force cputime_t) temp;
4220 } else 2978 } else
4221 utime = rtime; 2979 utime = rtime;
4222 2980
4223 sig->prev_utime = max(sig->prev_utime, utime); 2981 sig->prev_utime = max(sig->prev_utime, utime);
4224 sig->prev_stime = max(sig->prev_stime, 2982 sig->prev_stime = max(sig->prev_stime, rtime - sig->prev_utime);
4225 cputime_sub(rtime, sig->prev_utime));
4226 2983
4227 *ut = sig->prev_utime; 2984 *ut = sig->prev_utime;
4228 *st = sig->prev_stime; 2985 *st = sig->prev_stime;
@@ -4321,6 +3078,9 @@ static noinline void __schedule_bug(struct task_struct *prev)
4321{ 3078{
4322 struct pt_regs *regs = get_irq_regs(); 3079 struct pt_regs *regs = get_irq_regs();
4323 3080
3081 if (oops_in_progress)
3082 return;
3083
4324 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", 3084 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
4325 prev->comm, prev->pid, preempt_count()); 3085 prev->comm, prev->pid, preempt_count());
4326 3086
@@ -5852,6 +4612,13 @@ again:
5852 */ 4612 */
5853 if (preempt && rq != p_rq) 4613 if (preempt && rq != p_rq)
5854 resched_task(p_rq->curr); 4614 resched_task(p_rq->curr);
4615 } else {
4616 /*
4617 * We might have set it in task_yield_fair(), but are
4618 * not going to schedule(), so don't want to skip
4619 * the next update.
4620 */
4621 rq->skip_clock_update = 0;
5855 } 4622 }
5856 4623
5857out: 4624out:
@@ -6019,7 +4786,7 @@ void sched_show_task(struct task_struct *p)
6019 free = stack_not_used(p); 4786 free = stack_not_used(p);
6020#endif 4787#endif
6021 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, 4788 printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
6022 task_pid_nr(p), task_pid_nr(p->real_parent), 4789 task_pid_nr(p), task_pid_nr(rcu_dereference(p->real_parent)),
6023 (unsigned long)task_thread_info(p)->flags); 4790 (unsigned long)task_thread_info(p)->flags);
6024 4791
6025 show_stack(p, NULL); 4792 show_stack(p, NULL);
@@ -6118,53 +4885,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
6118#endif 4885#endif
6119} 4886}
6120 4887
6121/*
6122 * Increase the granularity value when there are more CPUs,
6123 * because with more CPUs the 'effective latency' as visible
6124 * to users decreases. But the relationship is not linear,
6125 * so pick a second-best guess by going with the log2 of the
6126 * number of CPUs.
6127 *
6128 * This idea comes from the SD scheduler of Con Kolivas:
6129 */
6130static int get_update_sysctl_factor(void)
6131{
6132 unsigned int cpus = min_t(int, num_online_cpus(), 8);
6133 unsigned int factor;
6134
6135 switch (sysctl_sched_tunable_scaling) {
6136 case SCHED_TUNABLESCALING_NONE:
6137 factor = 1;
6138 break;
6139 case SCHED_TUNABLESCALING_LINEAR:
6140 factor = cpus;
6141 break;
6142 case SCHED_TUNABLESCALING_LOG:
6143 default:
6144 factor = 1 + ilog2(cpus);
6145 break;
6146 }
6147
6148 return factor;
6149}
6150
6151static void update_sysctl(void)
6152{
6153 unsigned int factor = get_update_sysctl_factor();
6154
6155#define SET_SYSCTL(name) \
6156 (sysctl_##name = (factor) * normalized_sysctl_##name)
6157 SET_SYSCTL(sched_min_granularity);
6158 SET_SYSCTL(sched_latency);
6159 SET_SYSCTL(sched_wakeup_granularity);
6160#undef SET_SYSCTL
6161}
6162
6163static inline void sched_init_granularity(void)
6164{
6165 update_sysctl();
6166}
6167
6168#ifdef CONFIG_SMP 4888#ifdef CONFIG_SMP
6169void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) 4889void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
6170{ 4890{
@@ -6351,30 +5071,6 @@ static void calc_global_load_remove(struct rq *rq)
6351 rq->calc_load_active = 0; 5071 rq->calc_load_active = 0;
6352} 5072}
6353 5073
6354#ifdef CONFIG_CFS_BANDWIDTH
6355static void unthrottle_offline_cfs_rqs(struct rq *rq)
6356{
6357 struct cfs_rq *cfs_rq;
6358
6359 for_each_leaf_cfs_rq(rq, cfs_rq) {
6360 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
6361
6362 if (!cfs_rq->runtime_enabled)
6363 continue;
6364
6365 /*
6366 * clock_task is not advancing so we just need to make sure
6367 * there's some valid quota amount
6368 */
6369 cfs_rq->runtime_remaining = cfs_b->quota;
6370 if (cfs_rq_throttled(cfs_rq))
6371 unthrottle_cfs_rq(cfs_rq);
6372 }
6373}
6374#else
6375static void unthrottle_offline_cfs_rqs(struct rq *rq) {}
6376#endif
6377
6378/* 5074/*
6379 * Migrate all tasks from the rq, sleeping tasks will be migrated by 5075 * Migrate all tasks from the rq, sleeping tasks will be migrated by
6380 * try_to_wake_up()->select_task_rq(). 5076 * try_to_wake_up()->select_task_rq().
@@ -6980,6 +5676,12 @@ out:
6980 return -ENOMEM; 5676 return -ENOMEM;
6981} 5677}
6982 5678
5679/*
5680 * By default the system creates a single root-domain with all cpus as
5681 * members (mimicking the global state we have today).
5682 */
5683struct root_domain def_root_domain;
5684
6983static void init_defrootdomain(void) 5685static void init_defrootdomain(void)
6984{ 5686{
6985 init_rootdomain(&def_root_domain); 5687 init_rootdomain(&def_root_domain);
@@ -7051,6 +5753,31 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
7051} 5753}
7052 5754
7053/* 5755/*
5756 * Keep a special pointer to the highest sched_domain that has
5757 * SD_SHARE_PKG_RESOURCE set (Last Level Cache Domain) for this
5758 * allows us to avoid some pointer chasing select_idle_sibling().
5759 *
5760 * Also keep a unique ID per domain (we use the first cpu number in
5761 * the cpumask of the domain), this allows us to quickly tell if
5762 * two cpus are in the same cache domain, see ttwu_share_cache().
5763 */
5764DEFINE_PER_CPU(struct sched_domain *, sd_llc);
5765DEFINE_PER_CPU(int, sd_llc_id);
5766
5767static void update_top_cache_domain(int cpu)
5768{
5769 struct sched_domain *sd;
5770 int id = cpu;
5771
5772 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
5773 if (sd)
5774 id = cpumask_first(sched_domain_span(sd));
5775
5776 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
5777 per_cpu(sd_llc_id, cpu) = id;
5778}
5779
5780/*
7054 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 5781 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
7055 * hold the hotplug lock. 5782 * hold the hotplug lock.
7056 */ 5783 */
@@ -7089,6 +5816,8 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
7089 tmp = rq->sd; 5816 tmp = rq->sd;
7090 rcu_assign_pointer(rq->sd, sd); 5817 rcu_assign_pointer(rq->sd, sd);
7091 destroy_sched_domains(tmp, cpu); 5818 destroy_sched_domains(tmp, cpu);
5819
5820 update_top_cache_domain(cpu);
7092} 5821}
7093 5822
7094/* cpus with isolated domains */ 5823/* cpus with isolated domains */
@@ -7248,7 +5977,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
7248 continue; 5977 continue;
7249 5978
7250 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), 5979 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
7251 GFP_KERNEL, cpu_to_node(i)); 5980 GFP_KERNEL, cpu_to_node(cpu));
7252 5981
7253 if (!sg) 5982 if (!sg)
7254 goto fail; 5983 goto fail;
@@ -7386,6 +6115,12 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7386 return; 6115 return;
7387 6116
7388 update_group_power(sd, cpu); 6117 update_group_power(sd, cpu);
6118 atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
6119}
6120
6121int __weak arch_sd_sibling_asym_packing(void)
6122{
6123 return 0*SD_ASYM_PACKING;
7389} 6124}
7390 6125
7391/* 6126/*
@@ -8021,29 +6756,6 @@ static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
8021 } 6756 }
8022} 6757}
8023 6758
8024static int update_runtime(struct notifier_block *nfb,
8025 unsigned long action, void *hcpu)
8026{
8027 int cpu = (int)(long)hcpu;
8028
8029 switch (action) {
8030 case CPU_DOWN_PREPARE:
8031 case CPU_DOWN_PREPARE_FROZEN:
8032 disable_runtime(cpu_rq(cpu));
8033 return NOTIFY_OK;
8034
8035 case CPU_DOWN_FAILED:
8036 case CPU_DOWN_FAILED_FROZEN:
8037 case CPU_ONLINE:
8038 case CPU_ONLINE_FROZEN:
8039 enable_runtime(cpu_rq(cpu));
8040 return NOTIFY_OK;
8041
8042 default:
8043 return NOTIFY_DONE;
8044 }
8045}
8046
8047void __init sched_init_smp(void) 6759void __init sched_init_smp(void)
8048{ 6760{
8049 cpumask_var_t non_isolated_cpus; 6761 cpumask_var_t non_isolated_cpus;
@@ -8092,104 +6804,11 @@ int in_sched_functions(unsigned long addr)
8092 && addr < (unsigned long)__sched_text_end); 6804 && addr < (unsigned long)__sched_text_end);
8093} 6805}
8094 6806
8095static void init_cfs_rq(struct cfs_rq *cfs_rq) 6807#ifdef CONFIG_CGROUP_SCHED
8096{ 6808struct task_group root_task_group;
8097 cfs_rq->tasks_timeline = RB_ROOT;
8098 INIT_LIST_HEAD(&cfs_rq->tasks);
8099 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
8100#ifndef CONFIG_64BIT
8101 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
8102#endif
8103}
8104
8105static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
8106{
8107 struct rt_prio_array *array;
8108 int i;
8109
8110 array = &rt_rq->active;
8111 for (i = 0; i < MAX_RT_PRIO; i++) {
8112 INIT_LIST_HEAD(array->queue + i);
8113 __clear_bit(i, array->bitmap);
8114 }
8115 /* delimiter for bitsearch: */
8116 __set_bit(MAX_RT_PRIO, array->bitmap);
8117
8118#if defined CONFIG_SMP
8119 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8120 rt_rq->highest_prio.next = MAX_RT_PRIO;
8121 rt_rq->rt_nr_migratory = 0;
8122 rt_rq->overloaded = 0;
8123 plist_head_init(&rt_rq->pushable_tasks);
8124#endif
8125
8126 rt_rq->rt_time = 0;
8127 rt_rq->rt_throttled = 0;
8128 rt_rq->rt_runtime = 0;
8129 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
8130}
8131
8132#ifdef CONFIG_FAIR_GROUP_SCHED
8133static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
8134 struct sched_entity *se, int cpu,
8135 struct sched_entity *parent)
8136{
8137 struct rq *rq = cpu_rq(cpu);
8138
8139 cfs_rq->tg = tg;
8140 cfs_rq->rq = rq;
8141#ifdef CONFIG_SMP
8142 /* allow initial update_cfs_load() to truncate */
8143 cfs_rq->load_stamp = 1;
8144#endif
8145 init_cfs_rq_runtime(cfs_rq);
8146
8147 tg->cfs_rq[cpu] = cfs_rq;
8148 tg->se[cpu] = se;
8149
8150 /* se could be NULL for root_task_group */
8151 if (!se)
8152 return;
8153
8154 if (!parent)
8155 se->cfs_rq = &rq->cfs;
8156 else
8157 se->cfs_rq = parent->my_q;
8158
8159 se->my_q = cfs_rq;
8160 update_load_set(&se->load, 0);
8161 se->parent = parent;
8162}
8163#endif 6809#endif
8164 6810
8165#ifdef CONFIG_RT_GROUP_SCHED 6811DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
8166static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
8167 struct sched_rt_entity *rt_se, int cpu,
8168 struct sched_rt_entity *parent)
8169{
8170 struct rq *rq = cpu_rq(cpu);
8171
8172 rt_rq->highest_prio.curr = MAX_RT_PRIO;
8173 rt_rq->rt_nr_boosted = 0;
8174 rt_rq->rq = rq;
8175 rt_rq->tg = tg;
8176
8177 tg->rt_rq[cpu] = rt_rq;
8178 tg->rt_se[cpu] = rt_se;
8179
8180 if (!rt_se)
8181 return;
8182
8183 if (!parent)
8184 rt_se->rt_rq = &rq->rt;
8185 else
8186 rt_se->rt_rq = parent->my_q;
8187
8188 rt_se->my_q = rt_rq;
8189 rt_se->parent = parent;
8190 INIT_LIST_HEAD(&rt_se->run_list);
8191}
8192#endif
8193 6812
8194void __init sched_init(void) 6813void __init sched_init(void)
8195{ 6814{
@@ -8247,9 +6866,17 @@ void __init sched_init(void)
8247#ifdef CONFIG_CGROUP_SCHED 6866#ifdef CONFIG_CGROUP_SCHED
8248 list_add(&root_task_group.list, &task_groups); 6867 list_add(&root_task_group.list, &task_groups);
8249 INIT_LIST_HEAD(&root_task_group.children); 6868 INIT_LIST_HEAD(&root_task_group.children);
6869 INIT_LIST_HEAD(&root_task_group.siblings);
8250 autogroup_init(&init_task); 6870 autogroup_init(&init_task);
6871
8251#endif /* CONFIG_CGROUP_SCHED */ 6872#endif /* CONFIG_CGROUP_SCHED */
8252 6873
6874#ifdef CONFIG_CGROUP_CPUACCT
6875 root_cpuacct.cpustat = &kernel_cpustat;
6876 root_cpuacct.cpuusage = alloc_percpu(u64);
6877 /* Too early, not expected to fail */
6878 BUG_ON(!root_cpuacct.cpuusage);
6879#endif
8253 for_each_possible_cpu(i) { 6880 for_each_possible_cpu(i) {
8254 struct rq *rq; 6881 struct rq *rq;
8255 6882
@@ -8261,7 +6888,7 @@ void __init sched_init(void)
8261 init_cfs_rq(&rq->cfs); 6888 init_cfs_rq(&rq->cfs);
8262 init_rt_rq(&rq->rt, rq); 6889 init_rt_rq(&rq->rt, rq);
8263#ifdef CONFIG_FAIR_GROUP_SCHED 6890#ifdef CONFIG_FAIR_GROUP_SCHED
8264 root_task_group.shares = root_task_group_load; 6891 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
8265 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); 6892 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
8266 /* 6893 /*
8267 * How much cpu bandwidth does root_task_group get? 6894 * How much cpu bandwidth does root_task_group get?
@@ -8311,7 +6938,7 @@ void __init sched_init(void)
8311 rq->avg_idle = 2*sysctl_sched_migration_cost; 6938 rq->avg_idle = 2*sysctl_sched_migration_cost;
8312 rq_attach_root(rq, &def_root_domain); 6939 rq_attach_root(rq, &def_root_domain);
8313#ifdef CONFIG_NO_HZ 6940#ifdef CONFIG_NO_HZ
8314 rq->nohz_balance_kick = 0; 6941 rq->nohz_flags = 0;
8315#endif 6942#endif
8316#endif 6943#endif
8317 init_rq_hrtick(rq); 6944 init_rq_hrtick(rq);
@@ -8324,10 +6951,6 @@ void __init sched_init(void)
8324 INIT_HLIST_HEAD(&init_task.preempt_notifiers); 6951 INIT_HLIST_HEAD(&init_task.preempt_notifiers);
8325#endif 6952#endif
8326 6953
8327#ifdef CONFIG_SMP
8328 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
8329#endif
8330
8331#ifdef CONFIG_RT_MUTEXES 6954#ifdef CONFIG_RT_MUTEXES
8332 plist_head_init(&init_task.pi_waiters); 6955 plist_head_init(&init_task.pi_waiters);
8333#endif 6956#endif
@@ -8355,17 +6978,11 @@ void __init sched_init(void)
8355 6978
8356#ifdef CONFIG_SMP 6979#ifdef CONFIG_SMP
8357 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 6980 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
8358#ifdef CONFIG_NO_HZ
8359 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
8360 alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
8361 atomic_set(&nohz.load_balancer, nr_cpu_ids);
8362 atomic_set(&nohz.first_pick_cpu, nr_cpu_ids);
8363 atomic_set(&nohz.second_pick_cpu, nr_cpu_ids);
8364#endif
8365 /* May be allocated at isolcpus cmdline parse time */ 6981 /* May be allocated at isolcpus cmdline parse time */
8366 if (cpu_isolated_map == NULL) 6982 if (cpu_isolated_map == NULL)
8367 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT); 6983 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
8368#endif /* SMP */ 6984#endif
6985 init_sched_fair_class();
8369 6986
8370 scheduler_running = 1; 6987 scheduler_running = 1;
8371} 6988}
@@ -8517,169 +7134,14 @@ void set_curr_task(int cpu, struct task_struct *p)
8517 7134
8518#endif 7135#endif
8519 7136
8520#ifdef CONFIG_FAIR_GROUP_SCHED
8521static void free_fair_sched_group(struct task_group *tg)
8522{
8523 int i;
8524
8525 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
8526
8527 for_each_possible_cpu(i) {
8528 if (tg->cfs_rq)
8529 kfree(tg->cfs_rq[i]);
8530 if (tg->se)
8531 kfree(tg->se[i]);
8532 }
8533
8534 kfree(tg->cfs_rq);
8535 kfree(tg->se);
8536}
8537
8538static
8539int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8540{
8541 struct cfs_rq *cfs_rq;
8542 struct sched_entity *se;
8543 int i;
8544
8545 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
8546 if (!tg->cfs_rq)
8547 goto err;
8548 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
8549 if (!tg->se)
8550 goto err;
8551
8552 tg->shares = NICE_0_LOAD;
8553
8554 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
8555
8556 for_each_possible_cpu(i) {
8557 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8558 GFP_KERNEL, cpu_to_node(i));
8559 if (!cfs_rq)
8560 goto err;
8561
8562 se = kzalloc_node(sizeof(struct sched_entity),
8563 GFP_KERNEL, cpu_to_node(i));
8564 if (!se)
8565 goto err_free_rq;
8566
8567 init_cfs_rq(cfs_rq);
8568 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
8569 }
8570
8571 return 1;
8572
8573err_free_rq:
8574 kfree(cfs_rq);
8575err:
8576 return 0;
8577}
8578
8579static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8580{
8581 struct rq *rq = cpu_rq(cpu);
8582 unsigned long flags;
8583
8584 /*
8585 * Only empty task groups can be destroyed; so we can speculatively
8586 * check on_list without danger of it being re-added.
8587 */
8588 if (!tg->cfs_rq[cpu]->on_list)
8589 return;
8590
8591 raw_spin_lock_irqsave(&rq->lock, flags);
8592 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
8593 raw_spin_unlock_irqrestore(&rq->lock, flags);
8594}
8595#else /* !CONFIG_FAIR_GROUP_SCHED */
8596static inline void free_fair_sched_group(struct task_group *tg)
8597{
8598}
8599
8600static inline
8601int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8602{
8603 return 1;
8604}
8605
8606static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8607{
8608}
8609#endif /* CONFIG_FAIR_GROUP_SCHED */
8610
8611#ifdef CONFIG_RT_GROUP_SCHED 7137#ifdef CONFIG_RT_GROUP_SCHED
8612static void free_rt_sched_group(struct task_group *tg)
8613{
8614 int i;
8615
8616 if (tg->rt_se)
8617 destroy_rt_bandwidth(&tg->rt_bandwidth);
8618
8619 for_each_possible_cpu(i) {
8620 if (tg->rt_rq)
8621 kfree(tg->rt_rq[i]);
8622 if (tg->rt_se)
8623 kfree(tg->rt_se[i]);
8624 }
8625
8626 kfree(tg->rt_rq);
8627 kfree(tg->rt_se);
8628}
8629
8630static
8631int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8632{
8633 struct rt_rq *rt_rq;
8634 struct sched_rt_entity *rt_se;
8635 int i;
8636
8637 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
8638 if (!tg->rt_rq)
8639 goto err;
8640 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
8641 if (!tg->rt_se)
8642 goto err;
8643
8644 init_rt_bandwidth(&tg->rt_bandwidth,
8645 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
8646
8647 for_each_possible_cpu(i) {
8648 rt_rq = kzalloc_node(sizeof(struct rt_rq),
8649 GFP_KERNEL, cpu_to_node(i));
8650 if (!rt_rq)
8651 goto err;
8652
8653 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
8654 GFP_KERNEL, cpu_to_node(i));
8655 if (!rt_se)
8656 goto err_free_rq;
8657
8658 init_rt_rq(rt_rq, cpu_rq(i));
8659 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
8660 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
8661 }
8662
8663 return 1;
8664
8665err_free_rq:
8666 kfree(rt_rq);
8667err:
8668 return 0;
8669}
8670#else /* !CONFIG_RT_GROUP_SCHED */ 7138#else /* !CONFIG_RT_GROUP_SCHED */
8671static inline void free_rt_sched_group(struct task_group *tg)
8672{
8673}
8674
8675static inline
8676int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
8677{
8678 return 1;
8679}
8680#endif /* CONFIG_RT_GROUP_SCHED */ 7139#endif /* CONFIG_RT_GROUP_SCHED */
8681 7140
8682#ifdef CONFIG_CGROUP_SCHED 7141#ifdef CONFIG_CGROUP_SCHED
7142/* task_group_lock serializes the addition/removal of task groups */
7143static DEFINE_SPINLOCK(task_group_lock);
7144
8683static void free_sched_group(struct task_group *tg) 7145static void free_sched_group(struct task_group *tg)
8684{ 7146{
8685 free_fair_sched_group(tg); 7147 free_fair_sched_group(tg);
@@ -8785,47 +7247,6 @@ void sched_move_task(struct task_struct *tsk)
8785#endif /* CONFIG_CGROUP_SCHED */ 7247#endif /* CONFIG_CGROUP_SCHED */
8786 7248
8787#ifdef CONFIG_FAIR_GROUP_SCHED 7249#ifdef CONFIG_FAIR_GROUP_SCHED
8788static DEFINE_MUTEX(shares_mutex);
8789
8790int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8791{
8792 int i;
8793 unsigned long flags;
8794
8795 /*
8796 * We can't change the weight of the root cgroup.
8797 */
8798 if (!tg->se[0])
8799 return -EINVAL;
8800
8801 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
8802
8803 mutex_lock(&shares_mutex);
8804 if (tg->shares == shares)
8805 goto done;
8806
8807 tg->shares = shares;
8808 for_each_possible_cpu(i) {
8809 struct rq *rq = cpu_rq(i);
8810 struct sched_entity *se;
8811
8812 se = tg->se[i];
8813 /* Propagate contribution to hierarchy */
8814 raw_spin_lock_irqsave(&rq->lock, flags);
8815 for_each_sched_entity(se)
8816 update_cfs_shares(group_cfs_rq(se));
8817 raw_spin_unlock_irqrestore(&rq->lock, flags);
8818 }
8819
8820done:
8821 mutex_unlock(&shares_mutex);
8822 return 0;
8823}
8824
8825unsigned long sched_group_shares(struct task_group *tg)
8826{
8827 return tg->shares;
8828}
8829#endif 7250#endif
8830 7251
8831#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH) 7252#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
@@ -8850,7 +7271,7 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
8850 struct task_struct *g, *p; 7271 struct task_struct *g, *p;
8851 7272
8852 do_each_thread(g, p) { 7273 do_each_thread(g, p) {
8853 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) 7274 if (rt_task(p) && task_rq(p)->rt.tg == tg)
8854 return 1; 7275 return 1;
8855 } while_each_thread(g, p); 7276 } while_each_thread(g, p);
8856 7277
@@ -9201,8 +7622,8 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
9201 7622
9202static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) 7623static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9203{ 7624{
9204 int i, ret = 0, runtime_enabled; 7625 int i, ret = 0, runtime_enabled, runtime_was_enabled;
9205 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); 7626 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9206 7627
9207 if (tg == &root_task_group) 7628 if (tg == &root_task_group)
9208 return -EINVAL; 7629 return -EINVAL;
@@ -9229,6 +7650,8 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9229 goto out_unlock; 7650 goto out_unlock;
9230 7651
9231 runtime_enabled = quota != RUNTIME_INF; 7652 runtime_enabled = quota != RUNTIME_INF;
7653 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
7654 account_cfs_bandwidth_used(runtime_enabled, runtime_was_enabled);
9232 raw_spin_lock_irq(&cfs_b->lock); 7655 raw_spin_lock_irq(&cfs_b->lock);
9233 cfs_b->period = ns_to_ktime(period); 7656 cfs_b->period = ns_to_ktime(period);
9234 cfs_b->quota = quota; 7657 cfs_b->quota = quota;
@@ -9244,13 +7667,13 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9244 7667
9245 for_each_possible_cpu(i) { 7668 for_each_possible_cpu(i) {
9246 struct cfs_rq *cfs_rq = tg->cfs_rq[i]; 7669 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
9247 struct rq *rq = rq_of(cfs_rq); 7670 struct rq *rq = cfs_rq->rq;
9248 7671
9249 raw_spin_lock_irq(&rq->lock); 7672 raw_spin_lock_irq(&rq->lock);
9250 cfs_rq->runtime_enabled = runtime_enabled; 7673 cfs_rq->runtime_enabled = runtime_enabled;
9251 cfs_rq->runtime_remaining = 0; 7674 cfs_rq->runtime_remaining = 0;
9252 7675
9253 if (cfs_rq_throttled(cfs_rq)) 7676 if (cfs_rq->throttled)
9254 unthrottle_cfs_rq(cfs_rq); 7677 unthrottle_cfs_rq(cfs_rq);
9255 raw_spin_unlock_irq(&rq->lock); 7678 raw_spin_unlock_irq(&rq->lock);
9256 } 7679 }
@@ -9264,7 +7687,7 @@ int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
9264{ 7687{
9265 u64 quota, period; 7688 u64 quota, period;
9266 7689
9267 period = ktime_to_ns(tg_cfs_bandwidth(tg)->period); 7690 period = ktime_to_ns(tg->cfs_bandwidth.period);
9268 if (cfs_quota_us < 0) 7691 if (cfs_quota_us < 0)
9269 quota = RUNTIME_INF; 7692 quota = RUNTIME_INF;
9270 else 7693 else
@@ -9277,10 +7700,10 @@ long tg_get_cfs_quota(struct task_group *tg)
9277{ 7700{
9278 u64 quota_us; 7701 u64 quota_us;
9279 7702
9280 if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF) 7703 if (tg->cfs_bandwidth.quota == RUNTIME_INF)
9281 return -1; 7704 return -1;
9282 7705
9283 quota_us = tg_cfs_bandwidth(tg)->quota; 7706 quota_us = tg->cfs_bandwidth.quota;
9284 do_div(quota_us, NSEC_PER_USEC); 7707 do_div(quota_us, NSEC_PER_USEC);
9285 7708
9286 return quota_us; 7709 return quota_us;
@@ -9291,10 +7714,7 @@ int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
9291 u64 quota, period; 7714 u64 quota, period;
9292 7715
9293 period = (u64)cfs_period_us * NSEC_PER_USEC; 7716 period = (u64)cfs_period_us * NSEC_PER_USEC;
9294 quota = tg_cfs_bandwidth(tg)->quota; 7717 quota = tg->cfs_bandwidth.quota;
9295
9296 if (period <= 0)
9297 return -EINVAL;
9298 7718
9299 return tg_set_cfs_bandwidth(tg, period, quota); 7719 return tg_set_cfs_bandwidth(tg, period, quota);
9300} 7720}
@@ -9303,7 +7723,7 @@ long tg_get_cfs_period(struct task_group *tg)
9303{ 7723{
9304 u64 cfs_period_us; 7724 u64 cfs_period_us;
9305 7725
9306 cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period); 7726 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
9307 do_div(cfs_period_us, NSEC_PER_USEC); 7727 do_div(cfs_period_us, NSEC_PER_USEC);
9308 7728
9309 return cfs_period_us; 7729 return cfs_period_us;
@@ -9363,13 +7783,13 @@ static u64 normalize_cfs_quota(struct task_group *tg,
9363static int tg_cfs_schedulable_down(struct task_group *tg, void *data) 7783static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
9364{ 7784{
9365 struct cfs_schedulable_data *d = data; 7785 struct cfs_schedulable_data *d = data;
9366 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); 7786 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9367 s64 quota = 0, parent_quota = -1; 7787 s64 quota = 0, parent_quota = -1;
9368 7788
9369 if (!tg->parent) { 7789 if (!tg->parent) {
9370 quota = RUNTIME_INF; 7790 quota = RUNTIME_INF;
9371 } else { 7791 } else {
9372 struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent); 7792 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
9373 7793
9374 quota = normalize_cfs_quota(tg, d); 7794 quota = normalize_cfs_quota(tg, d);
9375 parent_quota = parent_b->hierarchal_quota; 7795 parent_quota = parent_b->hierarchal_quota;
@@ -9413,7 +7833,7 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
9413 struct cgroup_map_cb *cb) 7833 struct cgroup_map_cb *cb)
9414{ 7834{
9415 struct task_group *tg = cgroup_tg(cgrp); 7835 struct task_group *tg = cgroup_tg(cgrp);
9416 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg); 7836 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9417 7837
9418 cb->fill(cb, "nr_periods", cfs_b->nr_periods); 7838 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
9419 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled); 7839 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
@@ -9514,38 +7934,16 @@ struct cgroup_subsys cpu_cgroup_subsys = {
9514 * (balbir@in.ibm.com). 7934 * (balbir@in.ibm.com).
9515 */ 7935 */
9516 7936
9517/* track cpu usage of a group of tasks and its child groups */
9518struct cpuacct {
9519 struct cgroup_subsys_state css;
9520 /* cpuusage holds pointer to a u64-type object on every cpu */
9521 u64 __percpu *cpuusage;
9522 struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
9523 struct cpuacct *parent;
9524};
9525
9526struct cgroup_subsys cpuacct_subsys;
9527
9528/* return cpu accounting group corresponding to this container */
9529static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
9530{
9531 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
9532 struct cpuacct, css);
9533}
9534
9535/* return cpu accounting group to which this task belongs */
9536static inline struct cpuacct *task_ca(struct task_struct *tsk)
9537{
9538 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
9539 struct cpuacct, css);
9540}
9541
9542/* create a new cpu accounting group */ 7937/* create a new cpu accounting group */
9543static struct cgroup_subsys_state *cpuacct_create( 7938static struct cgroup_subsys_state *cpuacct_create(
9544 struct cgroup_subsys *ss, struct cgroup *cgrp) 7939 struct cgroup_subsys *ss, struct cgroup *cgrp)
9545{ 7940{
9546 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); 7941 struct cpuacct *ca;
9547 int i;
9548 7942
7943 if (!cgrp->parent)
7944 return &root_cpuacct.css;
7945
7946 ca = kzalloc(sizeof(*ca), GFP_KERNEL);
9549 if (!ca) 7947 if (!ca)
9550 goto out; 7948 goto out;
9551 7949
@@ -9553,18 +7951,13 @@ static struct cgroup_subsys_state *cpuacct_create(
9553 if (!ca->cpuusage) 7951 if (!ca->cpuusage)
9554 goto out_free_ca; 7952 goto out_free_ca;
9555 7953
9556 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 7954 ca->cpustat = alloc_percpu(struct kernel_cpustat);
9557 if (percpu_counter_init(&ca->cpustat[i], 0)) 7955 if (!ca->cpustat)
9558 goto out_free_counters; 7956 goto out_free_cpuusage;
9559
9560 if (cgrp->parent)
9561 ca->parent = cgroup_ca(cgrp->parent);
9562 7957
9563 return &ca->css; 7958 return &ca->css;
9564 7959
9565out_free_counters: 7960out_free_cpuusage:
9566 while (--i >= 0)
9567 percpu_counter_destroy(&ca->cpustat[i]);
9568 free_percpu(ca->cpuusage); 7961 free_percpu(ca->cpuusage);
9569out_free_ca: 7962out_free_ca:
9570 kfree(ca); 7963 kfree(ca);
@@ -9577,10 +7970,8 @@ static void
9577cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) 7970cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
9578{ 7971{
9579 struct cpuacct *ca = cgroup_ca(cgrp); 7972 struct cpuacct *ca = cgroup_ca(cgrp);
9580 int i;
9581 7973
9582 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) 7974 free_percpu(ca->cpustat);
9583 percpu_counter_destroy(&ca->cpustat[i]);
9584 free_percpu(ca->cpuusage); 7975 free_percpu(ca->cpuusage);
9585 kfree(ca); 7976 kfree(ca);
9586} 7977}
@@ -9673,16 +8064,31 @@ static const char *cpuacct_stat_desc[] = {
9673}; 8064};
9674 8065
9675static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, 8066static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
9676 struct cgroup_map_cb *cb) 8067 struct cgroup_map_cb *cb)
9677{ 8068{
9678 struct cpuacct *ca = cgroup_ca(cgrp); 8069 struct cpuacct *ca = cgroup_ca(cgrp);
9679 int i; 8070 int cpu;
8071 s64 val = 0;
8072
8073 for_each_online_cpu(cpu) {
8074 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
8075 val += kcpustat->cpustat[CPUTIME_USER];
8076 val += kcpustat->cpustat[CPUTIME_NICE];
8077 }
8078 val = cputime64_to_clock_t(val);
8079 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val);
9680 8080
9681 for (i = 0; i < CPUACCT_STAT_NSTATS; i++) { 8081 val = 0;
9682 s64 val = percpu_counter_read(&ca->cpustat[i]); 8082 for_each_online_cpu(cpu) {
9683 val = cputime64_to_clock_t(val); 8083 struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu);
9684 cb->fill(cb, cpuacct_stat_desc[i], val); 8084 val += kcpustat->cpustat[CPUTIME_SYSTEM];
8085 val += kcpustat->cpustat[CPUTIME_IRQ];
8086 val += kcpustat->cpustat[CPUTIME_SOFTIRQ];
9685 } 8087 }
8088
8089 val = cputime64_to_clock_t(val);
8090 cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val);
8091
9686 return 0; 8092 return 0;
9687} 8093}
9688 8094
@@ -9712,7 +8118,7 @@ static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
9712 * 8118 *
9713 * called with rq->lock held. 8119 * called with rq->lock held.
9714 */ 8120 */
9715static void cpuacct_charge(struct task_struct *tsk, u64 cputime) 8121void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9716{ 8122{
9717 struct cpuacct *ca; 8123 struct cpuacct *ca;
9718 int cpu; 8124 int cpu;
@@ -9726,7 +8132,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9726 8132
9727 ca = task_ca(tsk); 8133 ca = task_ca(tsk);
9728 8134
9729 for (; ca; ca = ca->parent) { 8135 for (; ca; ca = parent_ca(ca)) {
9730 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); 8136 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
9731 *cpuusage += cputime; 8137 *cpuusage += cputime;
9732 } 8138 }
@@ -9734,45 +8140,6 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
9734 rcu_read_unlock(); 8140 rcu_read_unlock();
9735} 8141}
9736 8142
9737/*
9738 * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
9739 * in cputime_t units. As a result, cpuacct_update_stats calls
9740 * percpu_counter_add with values large enough to always overflow the
9741 * per cpu batch limit causing bad SMP scalability.
9742 *
9743 * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
9744 * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
9745 * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
9746 */
9747#ifdef CONFIG_SMP
9748#define CPUACCT_BATCH \
9749 min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
9750#else
9751#define CPUACCT_BATCH 0
9752#endif
9753
9754/*
9755 * Charge the system/user time to the task's accounting group.
9756 */
9757static void cpuacct_update_stats(struct task_struct *tsk,
9758 enum cpuacct_stat_index idx, cputime_t val)
9759{
9760 struct cpuacct *ca;
9761 int batch = CPUACCT_BATCH;
9762
9763 if (unlikely(!cpuacct_subsys.active))
9764 return;
9765
9766 rcu_read_lock();
9767 ca = task_ca(tsk);
9768
9769 do {
9770 __percpu_counter_add(&ca->cpustat[idx], val, batch);
9771 ca = ca->parent;
9772 } while (ca);
9773 rcu_read_unlock();
9774}
9775
9776struct cgroup_subsys cpuacct_subsys = { 8143struct cgroup_subsys cpuacct_subsys = {
9777 .name = "cpuacct", 8144 .name = "cpuacct",
9778 .create = cpuacct_create, 8145 .create = cpuacct_create,
diff --git a/kernel/sched_cpupri.c b/kernel/sched/cpupri.c
index a86cf9d9eb11..b0d798eaf130 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * kernel/sched_cpupri.c 2 * kernel/sched/cpupri.c
3 * 3 *
4 * CPU priority management 4 * CPU priority management
5 * 5 *
@@ -28,7 +28,7 @@
28 */ 28 */
29 29
30#include <linux/gfp.h> 30#include <linux/gfp.h>
31#include "sched_cpupri.h" 31#include "cpupri.h"
32 32
33/* Convert between a 140 based task->prio, and our 102 based cpupri */ 33/* Convert between a 140 based task->prio, and our 102 based cpupri */
34static int convert_prio(int prio) 34static int convert_prio(int prio)
diff --git a/kernel/sched_cpupri.h b/kernel/sched/cpupri.h
index f6d756173491..f6d756173491 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched/cpupri.h
diff --git a/kernel/sched_debug.c b/kernel/sched/debug.c
index a6710a112b4f..2a075e10004b 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched/debug.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * kernel/time/sched_debug.c 2 * kernel/sched/debug.c
3 * 3 *
4 * Print the CFS rbtree 4 * Print the CFS rbtree
5 * 5 *
@@ -16,6 +16,8 @@
16#include <linux/kallsyms.h> 16#include <linux/kallsyms.h>
17#include <linux/utsname.h> 17#include <linux/utsname.h>
18 18
19#include "sched.h"
20
19static DEFINE_SPINLOCK(sched_debug_lock); 21static DEFINE_SPINLOCK(sched_debug_lock);
20 22
21/* 23/*
@@ -373,7 +375,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
373 return 0; 375 return 0;
374} 376}
375 377
376static void sysrq_sched_debug_show(void) 378void sysrq_sched_debug_show(void)
377{ 379{
378 sched_debug_show(NULL, NULL); 380 sched_debug_show(NULL, NULL);
379} 381}
diff --git a/kernel/sched_fair.c b/kernel/sched/fair.c
index 8a39fa3e3c6c..8e42de9105f8 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched/fair.c
@@ -23,6 +23,13 @@
23#include <linux/latencytop.h> 23#include <linux/latencytop.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/cpumask.h> 25#include <linux/cpumask.h>
26#include <linux/slab.h>
27#include <linux/profile.h>
28#include <linux/interrupt.h>
29
30#include <trace/events/sched.h>
31
32#include "sched.h"
26 33
27/* 34/*
28 * Targeted preemption latency for CPU-bound tasks: 35 * Targeted preemption latency for CPU-bound tasks:
@@ -103,7 +110,110 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
103unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; 110unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
104#endif 111#endif
105 112
106static const struct sched_class fair_sched_class; 113/*
114 * Increase the granularity value when there are more CPUs,
115 * because with more CPUs the 'effective latency' as visible
116 * to users decreases. But the relationship is not linear,
117 * so pick a second-best guess by going with the log2 of the
118 * number of CPUs.
119 *
120 * This idea comes from the SD scheduler of Con Kolivas:
121 */
122static int get_update_sysctl_factor(void)
123{
124 unsigned int cpus = min_t(int, num_online_cpus(), 8);
125 unsigned int factor;
126
127 switch (sysctl_sched_tunable_scaling) {
128 case SCHED_TUNABLESCALING_NONE:
129 factor = 1;
130 break;
131 case SCHED_TUNABLESCALING_LINEAR:
132 factor = cpus;
133 break;
134 case SCHED_TUNABLESCALING_LOG:
135 default:
136 factor = 1 + ilog2(cpus);
137 break;
138 }
139
140 return factor;
141}
142
143static void update_sysctl(void)
144{
145 unsigned int factor = get_update_sysctl_factor();
146
147#define SET_SYSCTL(name) \
148 (sysctl_##name = (factor) * normalized_sysctl_##name)
149 SET_SYSCTL(sched_min_granularity);
150 SET_SYSCTL(sched_latency);
151 SET_SYSCTL(sched_wakeup_granularity);
152#undef SET_SYSCTL
153}
154
155void sched_init_granularity(void)
156{
157 update_sysctl();
158}
159
160#if BITS_PER_LONG == 32
161# define WMULT_CONST (~0UL)
162#else
163# define WMULT_CONST (1UL << 32)
164#endif
165
166#define WMULT_SHIFT 32
167
168/*
169 * Shift right and round:
170 */
171#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
172
173/*
174 * delta *= weight / lw
175 */
176static unsigned long
177calc_delta_mine(unsigned long delta_exec, unsigned long weight,
178 struct load_weight *lw)
179{
180 u64 tmp;
181
182 /*
183 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
184 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
185 * 2^SCHED_LOAD_RESOLUTION.
186 */
187 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
188 tmp = (u64)delta_exec * scale_load_down(weight);
189 else
190 tmp = (u64)delta_exec;
191
192 if (!lw->inv_weight) {
193 unsigned long w = scale_load_down(lw->weight);
194
195 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
196 lw->inv_weight = 1;
197 else if (unlikely(!w))
198 lw->inv_weight = WMULT_CONST;
199 else
200 lw->inv_weight = WMULT_CONST / w;
201 }
202
203 /*
204 * Check whether we'd overflow the 64-bit multiplication:
205 */
206 if (unlikely(tmp > WMULT_CONST))
207 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
208 WMULT_SHIFT/2);
209 else
210 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
211
212 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
213}
214
215
216const struct sched_class fair_sched_class;
107 217
108/************************************************************** 218/**************************************************************
109 * CFS operations on generic schedulable entities: 219 * CFS operations on generic schedulable entities:
@@ -413,7 +523,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
413 rb_erase(&se->run_node, &cfs_rq->tasks_timeline); 523 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
414} 524}
415 525
416static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) 526struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
417{ 527{
418 struct rb_node *left = cfs_rq->rb_leftmost; 528 struct rb_node *left = cfs_rq->rb_leftmost;
419 529
@@ -434,7 +544,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se)
434} 544}
435 545
436#ifdef CONFIG_SCHED_DEBUG 546#ifdef CONFIG_SCHED_DEBUG
437static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) 547struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
438{ 548{
439 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); 549 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
440 550
@@ -684,7 +794,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
684{ 794{
685 update_load_add(&cfs_rq->load, se->load.weight); 795 update_load_add(&cfs_rq->load, se->load.weight);
686 if (!parent_entity(se)) 796 if (!parent_entity(se))
687 inc_cpu_load(rq_of(cfs_rq), se->load.weight); 797 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
688 if (entity_is_task(se)) { 798 if (entity_is_task(se)) {
689 add_cfs_task_weight(cfs_rq, se->load.weight); 799 add_cfs_task_weight(cfs_rq, se->load.weight);
690 list_add(&se->group_node, &cfs_rq->tasks); 800 list_add(&se->group_node, &cfs_rq->tasks);
@@ -697,7 +807,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
697{ 807{
698 update_load_sub(&cfs_rq->load, se->load.weight); 808 update_load_sub(&cfs_rq->load, se->load.weight);
699 if (!parent_entity(se)) 809 if (!parent_entity(se))
700 dec_cpu_load(rq_of(cfs_rq), se->load.weight); 810 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
701 if (entity_is_task(se)) { 811 if (entity_is_task(se)) {
702 add_cfs_task_weight(cfs_rq, -se->load.weight); 812 add_cfs_task_weight(cfs_rq, -se->load.weight);
703 list_del_init(&se->group_node); 813 list_del_init(&se->group_node);
@@ -893,7 +1003,6 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
893 if (unlikely(delta > se->statistics.sleep_max)) 1003 if (unlikely(delta > se->statistics.sleep_max))
894 se->statistics.sleep_max = delta; 1004 se->statistics.sleep_max = delta;
895 1005
896 se->statistics.sleep_start = 0;
897 se->statistics.sum_sleep_runtime += delta; 1006 se->statistics.sum_sleep_runtime += delta;
898 1007
899 if (tsk) { 1008 if (tsk) {
@@ -910,7 +1019,6 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
910 if (unlikely(delta > se->statistics.block_max)) 1019 if (unlikely(delta > se->statistics.block_max))
911 se->statistics.block_max = delta; 1020 se->statistics.block_max = delta;
912 1021
913 se->statistics.block_start = 0;
914 se->statistics.sum_sleep_runtime += delta; 1022 se->statistics.sum_sleep_runtime += delta;
915 1023
916 if (tsk) { 1024 if (tsk) {
@@ -920,6 +1028,8 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
920 trace_sched_stat_iowait(tsk, delta); 1028 trace_sched_stat_iowait(tsk, delta);
921 } 1029 }
922 1030
1031 trace_sched_stat_blocked(tsk, delta);
1032
923 /* 1033 /*
924 * Blocking time is in units of nanosecs, so shift by 1034 * Blocking time is in units of nanosecs, so shift by
925 * 20 to get a milliseconds-range estimation of the 1035 * 20 to get a milliseconds-range estimation of the
@@ -1287,6 +1397,32 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1287 */ 1397 */
1288 1398
1289#ifdef CONFIG_CFS_BANDWIDTH 1399#ifdef CONFIG_CFS_BANDWIDTH
1400
1401#ifdef HAVE_JUMP_LABEL
1402static struct jump_label_key __cfs_bandwidth_used;
1403
1404static inline bool cfs_bandwidth_used(void)
1405{
1406 return static_branch(&__cfs_bandwidth_used);
1407}
1408
1409void account_cfs_bandwidth_used(int enabled, int was_enabled)
1410{
1411 /* only need to count groups transitioning between enabled/!enabled */
1412 if (enabled && !was_enabled)
1413 jump_label_inc(&__cfs_bandwidth_used);
1414 else if (!enabled && was_enabled)
1415 jump_label_dec(&__cfs_bandwidth_used);
1416}
1417#else /* HAVE_JUMP_LABEL */
1418static bool cfs_bandwidth_used(void)
1419{
1420 return true;
1421}
1422
1423void account_cfs_bandwidth_used(int enabled, int was_enabled) {}
1424#endif /* HAVE_JUMP_LABEL */
1425
1290/* 1426/*
1291 * default period for cfs group bandwidth. 1427 * default period for cfs group bandwidth.
1292 * default: 0.1s, units: nanoseconds 1428 * default: 0.1s, units: nanoseconds
@@ -1308,7 +1444,7 @@ static inline u64 sched_cfs_bandwidth_slice(void)
1308 * 1444 *
1309 * requires cfs_b->lock 1445 * requires cfs_b->lock
1310 */ 1446 */
1311static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) 1447void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
1312{ 1448{
1313 u64 now; 1449 u64 now;
1314 1450
@@ -1320,6 +1456,11 @@ static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
1320 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period); 1456 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
1321} 1457}
1322 1458
1459static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
1460{
1461 return &tg->cfs_bandwidth;
1462}
1463
1323/* returns 0 on failure to allocate runtime */ 1464/* returns 0 on failure to allocate runtime */
1324static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) 1465static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1325{ 1466{
@@ -1421,7 +1562,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1421static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 1562static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1422 unsigned long delta_exec) 1563 unsigned long delta_exec)
1423{ 1564{
1424 if (!cfs_rq->runtime_enabled) 1565 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
1425 return; 1566 return;
1426 1567
1427 __account_cfs_rq_runtime(cfs_rq, delta_exec); 1568 __account_cfs_rq_runtime(cfs_rq, delta_exec);
@@ -1429,13 +1570,13 @@ static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1429 1570
1430static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq) 1571static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
1431{ 1572{
1432 return cfs_rq->throttled; 1573 return cfs_bandwidth_used() && cfs_rq->throttled;
1433} 1574}
1434 1575
1435/* check whether cfs_rq, or any parent, is throttled */ 1576/* check whether cfs_rq, or any parent, is throttled */
1436static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) 1577static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
1437{ 1578{
1438 return cfs_rq->throttle_count; 1579 return cfs_bandwidth_used() && cfs_rq->throttle_count;
1439} 1580}
1440 1581
1441/* 1582/*
@@ -1530,7 +1671,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1530 raw_spin_unlock(&cfs_b->lock); 1671 raw_spin_unlock(&cfs_b->lock);
1531} 1672}
1532 1673
1533static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) 1674void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
1534{ 1675{
1535 struct rq *rq = rq_of(cfs_rq); 1676 struct rq *rq = rq_of(cfs_rq);
1536 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); 1677 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
@@ -1756,6 +1897,9 @@ static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1756 1897
1757static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) 1898static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1758{ 1899{
1900 if (!cfs_bandwidth_used())
1901 return;
1902
1759 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running) 1903 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
1760 return; 1904 return;
1761 1905
@@ -1801,6 +1945,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
1801 */ 1945 */
1802static void check_enqueue_throttle(struct cfs_rq *cfs_rq) 1946static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
1803{ 1947{
1948 if (!cfs_bandwidth_used())
1949 return;
1950
1804 /* an active group must be handled by the update_curr()->put() path */ 1951 /* an active group must be handled by the update_curr()->put() path */
1805 if (!cfs_rq->runtime_enabled || cfs_rq->curr) 1952 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
1806 return; 1953 return;
@@ -1818,6 +1965,9 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
1818/* conditionally throttle active cfs_rq's from put_prev_entity() */ 1965/* conditionally throttle active cfs_rq's from put_prev_entity() */
1819static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) 1966static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1820{ 1967{
1968 if (!cfs_bandwidth_used())
1969 return;
1970
1821 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0)) 1971 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
1822 return; 1972 return;
1823 1973
@@ -1830,7 +1980,112 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1830 1980
1831 throttle_cfs_rq(cfs_rq); 1981 throttle_cfs_rq(cfs_rq);
1832} 1982}
1833#else 1983
1984static inline u64 default_cfs_period(void);
1985static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
1986static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
1987
1988static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
1989{
1990 struct cfs_bandwidth *cfs_b =
1991 container_of(timer, struct cfs_bandwidth, slack_timer);
1992 do_sched_cfs_slack_timer(cfs_b);
1993
1994 return HRTIMER_NORESTART;
1995}
1996
1997static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
1998{
1999 struct cfs_bandwidth *cfs_b =
2000 container_of(timer, struct cfs_bandwidth, period_timer);
2001 ktime_t now;
2002 int overrun;
2003 int idle = 0;
2004
2005 for (;;) {
2006 now = hrtimer_cb_get_time(timer);
2007 overrun = hrtimer_forward(timer, now, cfs_b->period);
2008
2009 if (!overrun)
2010 break;
2011
2012 idle = do_sched_cfs_period_timer(cfs_b, overrun);
2013 }
2014
2015 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
2016}
2017
2018void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2019{
2020 raw_spin_lock_init(&cfs_b->lock);
2021 cfs_b->runtime = 0;
2022 cfs_b->quota = RUNTIME_INF;
2023 cfs_b->period = ns_to_ktime(default_cfs_period());
2024
2025 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
2026 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2027 cfs_b->period_timer.function = sched_cfs_period_timer;
2028 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2029 cfs_b->slack_timer.function = sched_cfs_slack_timer;
2030}
2031
2032static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2033{
2034 cfs_rq->runtime_enabled = 0;
2035 INIT_LIST_HEAD(&cfs_rq->throttled_list);
2036}
2037
2038/* requires cfs_b->lock, may release to reprogram timer */
2039void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2040{
2041 /*
2042 * The timer may be active because we're trying to set a new bandwidth
2043 * period or because we're racing with the tear-down path
2044 * (timer_active==0 becomes visible before the hrtimer call-back
2045 * terminates). In either case we ensure that it's re-programmed
2046 */
2047 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
2048 raw_spin_unlock(&cfs_b->lock);
2049 /* ensure cfs_b->lock is available while we wait */
2050 hrtimer_cancel(&cfs_b->period_timer);
2051
2052 raw_spin_lock(&cfs_b->lock);
2053 /* if someone else restarted the timer then we're done */
2054 if (cfs_b->timer_active)
2055 return;
2056 }
2057
2058 cfs_b->timer_active = 1;
2059 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
2060}
2061
2062static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2063{
2064 hrtimer_cancel(&cfs_b->period_timer);
2065 hrtimer_cancel(&cfs_b->slack_timer);
2066}
2067
2068void unthrottle_offline_cfs_rqs(struct rq *rq)
2069{
2070 struct cfs_rq *cfs_rq;
2071
2072 for_each_leaf_cfs_rq(rq, cfs_rq) {
2073 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2074
2075 if (!cfs_rq->runtime_enabled)
2076 continue;
2077
2078 /*
2079 * clock_task is not advancing so we just need to make sure
2080 * there's some valid quota amount
2081 */
2082 cfs_rq->runtime_remaining = cfs_b->quota;
2083 if (cfs_rq_throttled(cfs_rq))
2084 unthrottle_cfs_rq(cfs_rq);
2085 }
2086}
2087
2088#else /* CONFIG_CFS_BANDWIDTH */
1834static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, 2089static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1835 unsigned long delta_exec) {} 2090 unsigned long delta_exec) {}
1836static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {} 2091static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
@@ -1852,8 +2107,22 @@ static inline int throttled_lb_pair(struct task_group *tg,
1852{ 2107{
1853 return 0; 2108 return 0;
1854} 2109}
2110
2111void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2112
2113#ifdef CONFIG_FAIR_GROUP_SCHED
2114static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
1855#endif 2115#endif
1856 2116
2117static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2118{
2119 return NULL;
2120}
2121static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2122void unthrottle_offline_cfs_rqs(struct rq *rq) {}
2123
2124#endif /* CONFIG_CFS_BANDWIDTH */
2125
1857/************************************************** 2126/**************************************************
1858 * CFS operations on tasks: 2127 * CFS operations on tasks:
1859 */ 2128 */
@@ -1866,7 +2135,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
1866 2135
1867 WARN_ON(task_rq(p) != rq); 2136 WARN_ON(task_rq(p) != rq);
1868 2137
1869 if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) { 2138 if (cfs_rq->nr_running > 1) {
1870 u64 slice = sched_slice(cfs_rq, se); 2139 u64 slice = sched_slice(cfs_rq, se);
1871 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; 2140 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
1872 s64 delta = slice - ran; 2141 s64 delta = slice - ran;
@@ -1897,7 +2166,7 @@ static void hrtick_update(struct rq *rq)
1897{ 2166{
1898 struct task_struct *curr = rq->curr; 2167 struct task_struct *curr = rq->curr;
1899 2168
1900 if (curr->sched_class != &fair_sched_class) 2169 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
1901 return; 2170 return;
1902 2171
1903 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) 2172 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
@@ -2020,6 +2289,61 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
2020} 2289}
2021 2290
2022#ifdef CONFIG_SMP 2291#ifdef CONFIG_SMP
2292/* Used instead of source_load when we know the type == 0 */
2293static unsigned long weighted_cpuload(const int cpu)
2294{
2295 return cpu_rq(cpu)->load.weight;
2296}
2297
2298/*
2299 * Return a low guess at the load of a migration-source cpu weighted
2300 * according to the scheduling class and "nice" value.
2301 *
2302 * We want to under-estimate the load of migration sources, to
2303 * balance conservatively.
2304 */
2305static unsigned long source_load(int cpu, int type)
2306{
2307 struct rq *rq = cpu_rq(cpu);
2308 unsigned long total = weighted_cpuload(cpu);
2309
2310 if (type == 0 || !sched_feat(LB_BIAS))
2311 return total;
2312
2313 return min(rq->cpu_load[type-1], total);
2314}
2315
2316/*
2317 * Return a high guess at the load of a migration-target cpu weighted
2318 * according to the scheduling class and "nice" value.
2319 */
2320static unsigned long target_load(int cpu, int type)
2321{
2322 struct rq *rq = cpu_rq(cpu);
2323 unsigned long total = weighted_cpuload(cpu);
2324
2325 if (type == 0 || !sched_feat(LB_BIAS))
2326 return total;
2327
2328 return max(rq->cpu_load[type-1], total);
2329}
2330
2331static unsigned long power_of(int cpu)
2332{
2333 return cpu_rq(cpu)->cpu_power;
2334}
2335
2336static unsigned long cpu_avg_load_per_task(int cpu)
2337{
2338 struct rq *rq = cpu_rq(cpu);
2339 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
2340
2341 if (nr_running)
2342 return rq->load.weight / nr_running;
2343
2344 return 0;
2345}
2346
2023 2347
2024static void task_waking_fair(struct task_struct *p) 2348static void task_waking_fair(struct task_struct *p)
2025{ 2349{
@@ -2327,7 +2651,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
2327 int prev_cpu = task_cpu(p); 2651 int prev_cpu = task_cpu(p);
2328 struct sched_domain *sd; 2652 struct sched_domain *sd;
2329 struct sched_group *sg; 2653 struct sched_group *sg;
2330 int i, smt = 0; 2654 int i;
2331 2655
2332 /* 2656 /*
2333 * If the task is going to be woken-up on this cpu and if it is 2657 * If the task is going to be woken-up on this cpu and if it is
@@ -2347,17 +2671,9 @@ static int select_idle_sibling(struct task_struct *p, int target)
2347 * Otherwise, iterate the domains and find an eligible idle cpu. 2671 * Otherwise, iterate the domains and find an eligible idle cpu.
2348 */ 2672 */
2349 rcu_read_lock(); 2673 rcu_read_lock();
2350again:
2351 for_each_domain(target, sd) {
2352 if (!smt && (sd->flags & SD_SHARE_CPUPOWER))
2353 continue;
2354
2355 if (smt && !(sd->flags & SD_SHARE_CPUPOWER))
2356 break;
2357
2358 if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
2359 break;
2360 2674
2675 sd = rcu_dereference(per_cpu(sd_llc, target));
2676 for_each_lower_domain(sd) {
2361 sg = sd->groups; 2677 sg = sd->groups;
2362 do { 2678 do {
2363 if (!cpumask_intersects(sched_group_cpus(sg), 2679 if (!cpumask_intersects(sched_group_cpus(sg),
@@ -2376,10 +2692,6 @@ next:
2376 sg = sg->next; 2692 sg = sg->next;
2377 } while (sg != sd->groups); 2693 } while (sg != sd->groups);
2378 } 2694 }
2379 if (!smt) {
2380 smt = 1;
2381 goto again;
2382 }
2383done: 2695done:
2384 rcu_read_unlock(); 2696 rcu_read_unlock();
2385 2697
@@ -2408,6 +2720,9 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
2408 int want_sd = 1; 2720 int want_sd = 1;
2409 int sync = wake_flags & WF_SYNC; 2721 int sync = wake_flags & WF_SYNC;
2410 2722
2723 if (p->rt.nr_cpus_allowed == 1)
2724 return prev_cpu;
2725
2411 if (sd_flag & SD_BALANCE_WAKE) { 2726 if (sd_flag & SD_BALANCE_WAKE) {
2412 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) 2727 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
2413 want_affine = 1; 2728 want_affine = 1;
@@ -2692,7 +3007,8 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
2692 } while (cfs_rq); 3007 } while (cfs_rq);
2693 3008
2694 p = task_of(se); 3009 p = task_of(se);
2695 hrtick_start_fair(rq, p); 3010 if (hrtick_enabled(rq))
3011 hrtick_start_fair(rq, p);
2696 3012
2697 return p; 3013 return p;
2698} 3014}
@@ -2736,6 +3052,12 @@ static void yield_task_fair(struct rq *rq)
2736 * Update run-time statistics of the 'current'. 3052 * Update run-time statistics of the 'current'.
2737 */ 3053 */
2738 update_curr(cfs_rq); 3054 update_curr(cfs_rq);
3055 /*
3056 * Tell update_rq_clock() that we've just updated,
3057 * so we don't do microscopic update in schedule()
3058 * and double the fastpath cost.
3059 */
3060 rq->skip_clock_update = 1;
2739 } 3061 }
2740 3062
2741 set_skip_buddy(se); 3063 set_skip_buddy(se);
@@ -2776,12 +3098,48 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
2776} 3098}
2777 3099
2778/* 3100/*
3101 * Is this task likely cache-hot:
3102 */
3103static int
3104task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3105{
3106 s64 delta;
3107
3108 if (p->sched_class != &fair_sched_class)
3109 return 0;
3110
3111 if (unlikely(p->policy == SCHED_IDLE))
3112 return 0;
3113
3114 /*
3115 * Buddy candidates are cache hot:
3116 */
3117 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
3118 (&p->se == cfs_rq_of(&p->se)->next ||
3119 &p->se == cfs_rq_of(&p->se)->last))
3120 return 1;
3121
3122 if (sysctl_sched_migration_cost == -1)
3123 return 1;
3124 if (sysctl_sched_migration_cost == 0)
3125 return 0;
3126
3127 delta = now - p->se.exec_start;
3128
3129 return delta < (s64)sysctl_sched_migration_cost;
3130}
3131
3132#define LBF_ALL_PINNED 0x01
3133#define LBF_NEED_BREAK 0x02
3134#define LBF_ABORT 0x04
3135
3136/*
2779 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 3137 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
2780 */ 3138 */
2781static 3139static
2782int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, 3140int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2783 struct sched_domain *sd, enum cpu_idle_type idle, 3141 struct sched_domain *sd, enum cpu_idle_type idle,
2784 int *all_pinned) 3142 int *lb_flags)
2785{ 3143{
2786 int tsk_cache_hot = 0; 3144 int tsk_cache_hot = 0;
2787 /* 3145 /*
@@ -2794,7 +3152,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2794 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 3152 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
2795 return 0; 3153 return 0;
2796 } 3154 }
2797 *all_pinned = 0; 3155 *lb_flags &= ~LBF_ALL_PINNED;
2798 3156
2799 if (task_running(rq, p)) { 3157 if (task_running(rq, p)) {
2800 schedstat_inc(p, se.statistics.nr_failed_migrations_running); 3158 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
@@ -2868,7 +3226,7 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2868static unsigned long 3226static unsigned long
2869balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 3227balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2870 unsigned long max_load_move, struct sched_domain *sd, 3228 unsigned long max_load_move, struct sched_domain *sd,
2871 enum cpu_idle_type idle, int *all_pinned, 3229 enum cpu_idle_type idle, int *lb_flags,
2872 struct cfs_rq *busiest_cfs_rq) 3230 struct cfs_rq *busiest_cfs_rq)
2873{ 3231{
2874 int loops = 0, pulled = 0; 3232 int loops = 0, pulled = 0;
@@ -2879,12 +3237,14 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2879 goto out; 3237 goto out;
2880 3238
2881 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) { 3239 list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
2882 if (loops++ > sysctl_sched_nr_migrate) 3240 if (loops++ > sysctl_sched_nr_migrate) {
3241 *lb_flags |= LBF_NEED_BREAK;
2883 break; 3242 break;
3243 }
2884 3244
2885 if ((p->se.load.weight >> 1) > rem_load_move || 3245 if ((p->se.load.weight >> 1) > rem_load_move ||
2886 !can_migrate_task(p, busiest, this_cpu, sd, idle, 3246 !can_migrate_task(p, busiest, this_cpu, sd, idle,
2887 all_pinned)) 3247 lb_flags))
2888 continue; 3248 continue;
2889 3249
2890 pull_task(busiest, p, this_rq, this_cpu); 3250 pull_task(busiest, p, this_rq, this_cpu);
@@ -2897,8 +3257,10 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2897 * kernels will stop after the first task is pulled to minimize 3257 * kernels will stop after the first task is pulled to minimize
2898 * the critical section. 3258 * the critical section.
2899 */ 3259 */
2900 if (idle == CPU_NEWLY_IDLE) 3260 if (idle == CPU_NEWLY_IDLE) {
3261 *lb_flags |= LBF_ABORT;
2901 break; 3262 break;
3263 }
2902#endif 3264#endif
2903 3265
2904 /* 3266 /*
@@ -3003,7 +3365,7 @@ static unsigned long
3003load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 3365load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
3004 unsigned long max_load_move, 3366 unsigned long max_load_move,
3005 struct sched_domain *sd, enum cpu_idle_type idle, 3367 struct sched_domain *sd, enum cpu_idle_type idle,
3006 int *all_pinned) 3368 int *lb_flags)
3007{ 3369{
3008 long rem_load_move = max_load_move; 3370 long rem_load_move = max_load_move;
3009 struct cfs_rq *busiest_cfs_rq; 3371 struct cfs_rq *busiest_cfs_rq;
@@ -3016,6 +3378,9 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
3016 unsigned long busiest_weight = busiest_cfs_rq->load.weight; 3378 unsigned long busiest_weight = busiest_cfs_rq->load.weight;
3017 u64 rem_load, moved_load; 3379 u64 rem_load, moved_load;
3018 3380
3381 if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
3382 break;
3383
3019 /* 3384 /*
3020 * empty group or part of a throttled hierarchy 3385 * empty group or part of a throttled hierarchy
3021 */ 3386 */
@@ -3027,7 +3392,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
3027 rem_load = div_u64(rem_load, busiest_h_load + 1); 3392 rem_load = div_u64(rem_load, busiest_h_load + 1);
3028 3393
3029 moved_load = balance_tasks(this_rq, this_cpu, busiest, 3394 moved_load = balance_tasks(this_rq, this_cpu, busiest,
3030 rem_load, sd, idle, all_pinned, 3395 rem_load, sd, idle, lb_flags,
3031 busiest_cfs_rq); 3396 busiest_cfs_rq);
3032 3397
3033 if (!moved_load) 3398 if (!moved_load)
@@ -3053,10 +3418,10 @@ static unsigned long
3053load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 3418load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
3054 unsigned long max_load_move, 3419 unsigned long max_load_move,
3055 struct sched_domain *sd, enum cpu_idle_type idle, 3420 struct sched_domain *sd, enum cpu_idle_type idle,
3056 int *all_pinned) 3421 int *lb_flags)
3057{ 3422{
3058 return balance_tasks(this_rq, this_cpu, busiest, 3423 return balance_tasks(this_rq, this_cpu, busiest,
3059 max_load_move, sd, idle, all_pinned, 3424 max_load_move, sd, idle, lb_flags,
3060 &busiest->cfs); 3425 &busiest->cfs);
3061} 3426}
3062#endif 3427#endif
@@ -3071,29 +3436,30 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
3071static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, 3436static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
3072 unsigned long max_load_move, 3437 unsigned long max_load_move,
3073 struct sched_domain *sd, enum cpu_idle_type idle, 3438 struct sched_domain *sd, enum cpu_idle_type idle,
3074 int *all_pinned) 3439 int *lb_flags)
3075{ 3440{
3076 unsigned long total_load_moved = 0, load_moved; 3441 unsigned long total_load_moved = 0, load_moved;
3077 3442
3078 do { 3443 do {
3079 load_moved = load_balance_fair(this_rq, this_cpu, busiest, 3444 load_moved = load_balance_fair(this_rq, this_cpu, busiest,
3080 max_load_move - total_load_moved, 3445 max_load_move - total_load_moved,
3081 sd, idle, all_pinned); 3446 sd, idle, lb_flags);
3082 3447
3083 total_load_moved += load_moved; 3448 total_load_moved += load_moved;
3084 3449
3450 if (*lb_flags & (LBF_NEED_BREAK|LBF_ABORT))
3451 break;
3452
3085#ifdef CONFIG_PREEMPT 3453#ifdef CONFIG_PREEMPT
3086 /* 3454 /*
3087 * NEWIDLE balancing is a source of latency, so preemptible 3455 * NEWIDLE balancing is a source of latency, so preemptible
3088 * kernels will stop after the first task is pulled to minimize 3456 * kernels will stop after the first task is pulled to minimize
3089 * the critical section. 3457 * the critical section.
3090 */ 3458 */
3091 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) 3459 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running) {
3092 break; 3460 *lb_flags |= LBF_ABORT;
3093
3094 if (raw_spin_is_contended(&this_rq->lock) ||
3095 raw_spin_is_contended(&busiest->lock))
3096 break; 3461 break;
3462 }
3097#endif 3463#endif
3098 } while (load_moved && max_load_move > total_load_moved); 3464 } while (load_moved && max_load_move > total_load_moved);
3099 3465
@@ -3155,15 +3521,6 @@ struct sg_lb_stats {
3155}; 3521};
3156 3522
3157/** 3523/**
3158 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
3159 * @group: The group whose first cpu is to be returned.
3160 */
3161static inline unsigned int group_first_cpu(struct sched_group *group)
3162{
3163 return cpumask_first(sched_group_cpus(group));
3164}
3165
3166/**
3167 * get_sd_load_idx - Obtain the load index for a given sched domain. 3524 * get_sd_load_idx - Obtain the load index for a given sched domain.
3168 * @sd: The sched_domain whose load_idx is to be obtained. 3525 * @sd: The sched_domain whose load_idx is to be obtained.
3168 * @idle: The Idle status of the CPU for whose sd load_idx is obtained. 3525 * @idle: The Idle status of the CPU for whose sd load_idx is obtained.
@@ -3412,7 +3769,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
3412 sdg->sgp->power = power; 3769 sdg->sgp->power = power;
3413} 3770}
3414 3771
3415static void update_group_power(struct sched_domain *sd, int cpu) 3772void update_group_power(struct sched_domain *sd, int cpu)
3416{ 3773{
3417 struct sched_domain *child = sd->child; 3774 struct sched_domain *child = sd->child;
3418 struct sched_group *group, *sdg = sd->groups; 3775 struct sched_group *group, *sdg = sd->groups;
@@ -3678,11 +4035,6 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
3678 } while (sg != sd->groups); 4035 } while (sg != sd->groups);
3679} 4036}
3680 4037
3681int __weak arch_sd_sibling_asym_packing(void)
3682{
3683 return 0*SD_ASYM_PACKING;
3684}
3685
3686/** 4038/**
3687 * check_asym_packing - Check to see if the group is packed into the 4039 * check_asym_packing - Check to see if the group is packed into the
3688 * sched domain. 4040 * sched domain.
@@ -4046,7 +4398,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
4046#define MAX_PINNED_INTERVAL 512 4398#define MAX_PINNED_INTERVAL 512
4047 4399
4048/* Working cpumask for load_balance and load_balance_newidle. */ 4400/* Working cpumask for load_balance and load_balance_newidle. */
4049static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); 4401DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
4050 4402
4051static int need_active_balance(struct sched_domain *sd, int idle, 4403static int need_active_balance(struct sched_domain *sd, int idle,
4052 int busiest_cpu, int this_cpu) 4404 int busiest_cpu, int this_cpu)
@@ -4097,7 +4449,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
4097 struct sched_domain *sd, enum cpu_idle_type idle, 4449 struct sched_domain *sd, enum cpu_idle_type idle,
4098 int *balance) 4450 int *balance)
4099{ 4451{
4100 int ld_moved, all_pinned = 0, active_balance = 0; 4452 int ld_moved, lb_flags = 0, active_balance = 0;
4101 struct sched_group *group; 4453 struct sched_group *group;
4102 unsigned long imbalance; 4454 unsigned long imbalance;
4103 struct rq *busiest; 4455 struct rq *busiest;
@@ -4138,11 +4490,11 @@ redo:
4138 * still unbalanced. ld_moved simply stays zero, so it is 4490 * still unbalanced. ld_moved simply stays zero, so it is
4139 * correctly treated as an imbalance. 4491 * correctly treated as an imbalance.
4140 */ 4492 */
4141 all_pinned = 1; 4493 lb_flags |= LBF_ALL_PINNED;
4142 local_irq_save(flags); 4494 local_irq_save(flags);
4143 double_rq_lock(this_rq, busiest); 4495 double_rq_lock(this_rq, busiest);
4144 ld_moved = move_tasks(this_rq, this_cpu, busiest, 4496 ld_moved = move_tasks(this_rq, this_cpu, busiest,
4145 imbalance, sd, idle, &all_pinned); 4497 imbalance, sd, idle, &lb_flags);
4146 double_rq_unlock(this_rq, busiest); 4498 double_rq_unlock(this_rq, busiest);
4147 local_irq_restore(flags); 4499 local_irq_restore(flags);
4148 4500
@@ -4152,8 +4504,16 @@ redo:
4152 if (ld_moved && this_cpu != smp_processor_id()) 4504 if (ld_moved && this_cpu != smp_processor_id())
4153 resched_cpu(this_cpu); 4505 resched_cpu(this_cpu);
4154 4506
4507 if (lb_flags & LBF_ABORT)
4508 goto out_balanced;
4509
4510 if (lb_flags & LBF_NEED_BREAK) {
4511 lb_flags &= ~LBF_NEED_BREAK;
4512 goto redo;
4513 }
4514
4155 /* All tasks on this runqueue were pinned by CPU affinity */ 4515 /* All tasks on this runqueue were pinned by CPU affinity */
4156 if (unlikely(all_pinned)) { 4516 if (unlikely(lb_flags & LBF_ALL_PINNED)) {
4157 cpumask_clear_cpu(cpu_of(busiest), cpus); 4517 cpumask_clear_cpu(cpu_of(busiest), cpus);
4158 if (!cpumask_empty(cpus)) 4518 if (!cpumask_empty(cpus))
4159 goto redo; 4519 goto redo;
@@ -4183,7 +4543,7 @@ redo:
4183 tsk_cpus_allowed(busiest->curr))) { 4543 tsk_cpus_allowed(busiest->curr))) {
4184 raw_spin_unlock_irqrestore(&busiest->lock, 4544 raw_spin_unlock_irqrestore(&busiest->lock,
4185 flags); 4545 flags);
4186 all_pinned = 1; 4546 lb_flags |= LBF_ALL_PINNED;
4187 goto out_one_pinned; 4547 goto out_one_pinned;
4188 } 4548 }
4189 4549
@@ -4236,7 +4596,8 @@ out_balanced:
4236 4596
4237out_one_pinned: 4597out_one_pinned:
4238 /* tune up the balancing interval */ 4598 /* tune up the balancing interval */
4239 if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || 4599 if (((lb_flags & LBF_ALL_PINNED) &&
4600 sd->balance_interval < MAX_PINNED_INTERVAL) ||
4240 (sd->balance_interval < sd->max_interval)) 4601 (sd->balance_interval < sd->max_interval))
4241 sd->balance_interval *= 2; 4602 sd->balance_interval *= 2;
4242 4603
@@ -4249,7 +4610,7 @@ out:
4249 * idle_balance is called by schedule() if this_cpu is about to become 4610 * idle_balance is called by schedule() if this_cpu is about to become
4250 * idle. Attempts to pull tasks from other CPUs. 4611 * idle. Attempts to pull tasks from other CPUs.
4251 */ 4612 */
4252static void idle_balance(int this_cpu, struct rq *this_rq) 4613void idle_balance(int this_cpu, struct rq *this_rq)
4253{ 4614{
4254 struct sched_domain *sd; 4615 struct sched_domain *sd;
4255 int pulled_task = 0; 4616 int pulled_task = 0;
@@ -4364,28 +4725,16 @@ out_unlock:
4364#ifdef CONFIG_NO_HZ 4725#ifdef CONFIG_NO_HZ
4365/* 4726/*
4366 * idle load balancing details 4727 * idle load balancing details
4367 * - One of the idle CPUs nominates itself as idle load_balancer, while
4368 * entering idle.
4369 * - This idle load balancer CPU will also go into tickless mode when
4370 * it is idle, just like all other idle CPUs
4371 * - When one of the busy CPUs notices that there may be an idle rebalancing 4728 * - When one of the busy CPUs notices that there may be an idle rebalancing
4372 * needed, they will kick the idle load balancer, which then does idle 4729 * needed, they will kick the idle load balancer, which then does idle
4373 * load balancing for all the idle CPUs. 4730 * load balancing for all the idle CPUs.
4374 */ 4731 */
4375static struct { 4732static struct {
4376 atomic_t load_balancer;
4377 atomic_t first_pick_cpu;
4378 atomic_t second_pick_cpu;
4379 cpumask_var_t idle_cpus_mask; 4733 cpumask_var_t idle_cpus_mask;
4380 cpumask_var_t grp_idle_mask; 4734 atomic_t nr_cpus;
4381 unsigned long next_balance; /* in jiffy units */ 4735 unsigned long next_balance; /* in jiffy units */
4382} nohz ____cacheline_aligned; 4736} nohz ____cacheline_aligned;
4383 4737
4384int get_nohz_load_balancer(void)
4385{
4386 return atomic_read(&nohz.load_balancer);
4387}
4388
4389#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 4738#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
4390/** 4739/**
4391 * lowest_flag_domain - Return lowest sched_domain containing flag. 4740 * lowest_flag_domain - Return lowest sched_domain containing flag.
@@ -4422,33 +4771,6 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
4422 (sd && (sd->flags & flag)); sd = sd->parent) 4771 (sd && (sd->flags & flag)); sd = sd->parent)
4423 4772
4424/** 4773/**
4425 * is_semi_idle_group - Checks if the given sched_group is semi-idle.
4426 * @ilb_group: group to be checked for semi-idleness
4427 *
4428 * Returns: 1 if the group is semi-idle. 0 otherwise.
4429 *
4430 * We define a sched_group to be semi idle if it has atleast one idle-CPU
4431 * and atleast one non-idle CPU. This helper function checks if the given
4432 * sched_group is semi-idle or not.
4433 */
4434static inline int is_semi_idle_group(struct sched_group *ilb_group)
4435{
4436 cpumask_and(nohz.grp_idle_mask, nohz.idle_cpus_mask,
4437 sched_group_cpus(ilb_group));
4438
4439 /*
4440 * A sched_group is semi-idle when it has atleast one busy cpu
4441 * and atleast one idle cpu.
4442 */
4443 if (cpumask_empty(nohz.grp_idle_mask))
4444 return 0;
4445
4446 if (cpumask_equal(nohz.grp_idle_mask, sched_group_cpus(ilb_group)))
4447 return 0;
4448
4449 return 1;
4450}
4451/**
4452 * find_new_ilb - Finds the optimum idle load balancer for nomination. 4774 * find_new_ilb - Finds the optimum idle load balancer for nomination.
4453 * @cpu: The cpu which is nominating a new idle_load_balancer. 4775 * @cpu: The cpu which is nominating a new idle_load_balancer.
4454 * 4776 *
@@ -4462,9 +4784,9 @@ static inline int is_semi_idle_group(struct sched_group *ilb_group)
4462 */ 4784 */
4463static int find_new_ilb(int cpu) 4785static int find_new_ilb(int cpu)
4464{ 4786{
4787 int ilb = cpumask_first(nohz.idle_cpus_mask);
4788 struct sched_group *ilbg;
4465 struct sched_domain *sd; 4789 struct sched_domain *sd;
4466 struct sched_group *ilb_group;
4467 int ilb = nr_cpu_ids;
4468 4790
4469 /* 4791 /*
4470 * Have idle load balancer selection from semi-idle packages only 4792 * Have idle load balancer selection from semi-idle packages only
@@ -4482,23 +4804,28 @@ static int find_new_ilb(int cpu)
4482 4804
4483 rcu_read_lock(); 4805 rcu_read_lock();
4484 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { 4806 for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
4485 ilb_group = sd->groups; 4807 ilbg = sd->groups;
4486 4808
4487 do { 4809 do {
4488 if (is_semi_idle_group(ilb_group)) { 4810 if (ilbg->group_weight !=
4489 ilb = cpumask_first(nohz.grp_idle_mask); 4811 atomic_read(&ilbg->sgp->nr_busy_cpus)) {
4812 ilb = cpumask_first_and(nohz.idle_cpus_mask,
4813 sched_group_cpus(ilbg));
4490 goto unlock; 4814 goto unlock;
4491 } 4815 }
4492 4816
4493 ilb_group = ilb_group->next; 4817 ilbg = ilbg->next;
4494 4818
4495 } while (ilb_group != sd->groups); 4819 } while (ilbg != sd->groups);
4496 } 4820 }
4497unlock: 4821unlock:
4498 rcu_read_unlock(); 4822 rcu_read_unlock();
4499 4823
4500out_done: 4824out_done:
4501 return ilb; 4825 if (ilb < nr_cpu_ids && idle_cpu(ilb))
4826 return ilb;
4827
4828 return nr_cpu_ids;
4502} 4829}
4503#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ 4830#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
4504static inline int find_new_ilb(int call_cpu) 4831static inline int find_new_ilb(int call_cpu)
@@ -4518,99 +4845,68 @@ static void nohz_balancer_kick(int cpu)
4518 4845
4519 nohz.next_balance++; 4846 nohz.next_balance++;
4520 4847
4521 ilb_cpu = get_nohz_load_balancer(); 4848 ilb_cpu = find_new_ilb(cpu);
4522
4523 if (ilb_cpu >= nr_cpu_ids) {
4524 ilb_cpu = cpumask_first(nohz.idle_cpus_mask);
4525 if (ilb_cpu >= nr_cpu_ids)
4526 return;
4527 }
4528 4849
4529 if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { 4850 if (ilb_cpu >= nr_cpu_ids)
4530 cpu_rq(ilb_cpu)->nohz_balance_kick = 1; 4851 return;
4531 4852
4532 smp_mb(); 4853 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
4533 /* 4854 return;
4534 * Use smp_send_reschedule() instead of resched_cpu(). 4855 /*
4535 * This way we generate a sched IPI on the target cpu which 4856 * Use smp_send_reschedule() instead of resched_cpu().
4536 * is idle. And the softirq performing nohz idle load balance 4857 * This way we generate a sched IPI on the target cpu which
4537 * will be run before returning from the IPI. 4858 * is idle. And the softirq performing nohz idle load balance
4538 */ 4859 * will be run before returning from the IPI.
4539 smp_send_reschedule(ilb_cpu); 4860 */
4540 } 4861 smp_send_reschedule(ilb_cpu);
4541 return; 4862 return;
4542} 4863}
4543 4864
4544/* 4865static inline void set_cpu_sd_state_busy(void)
4545 * This routine will try to nominate the ilb (idle load balancing)
4546 * owner among the cpus whose ticks are stopped. ilb owner will do the idle
4547 * load balancing on behalf of all those cpus.
4548 *
4549 * When the ilb owner becomes busy, we will not have new ilb owner until some
4550 * idle CPU wakes up and goes back to idle or some busy CPU tries to kick
4551 * idle load balancing by kicking one of the idle CPUs.
4552 *
4553 * Ticks are stopped for the ilb owner as well, with busy CPU kicking this
4554 * ilb owner CPU in future (when there is a need for idle load balancing on
4555 * behalf of all idle CPUs).
4556 */
4557void select_nohz_load_balancer(int stop_tick)
4558{ 4866{
4867 struct sched_domain *sd;
4559 int cpu = smp_processor_id(); 4868 int cpu = smp_processor_id();
4560 4869
4561 if (stop_tick) { 4870 if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
4562 if (!cpu_active(cpu)) { 4871 return;
4563 if (atomic_read(&nohz.load_balancer) != cpu) 4872 clear_bit(NOHZ_IDLE, nohz_flags(cpu));
4564 return;
4565
4566 /*
4567 * If we are going offline and still the leader,
4568 * give up!
4569 */
4570 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
4571 nr_cpu_ids) != cpu)
4572 BUG();
4573 4873
4574 return; 4874 rcu_read_lock();
4575 } 4875 for_each_domain(cpu, sd)
4876 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
4877 rcu_read_unlock();
4878}
4576 4879
4577 cpumask_set_cpu(cpu, nohz.idle_cpus_mask); 4880void set_cpu_sd_state_idle(void)
4881{
4882 struct sched_domain *sd;
4883 int cpu = smp_processor_id();
4578 4884
4579 if (atomic_read(&nohz.first_pick_cpu) == cpu) 4885 if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
4580 atomic_cmpxchg(&nohz.first_pick_cpu, cpu, nr_cpu_ids); 4886 return;
4581 if (atomic_read(&nohz.second_pick_cpu) == cpu) 4887 set_bit(NOHZ_IDLE, nohz_flags(cpu));
4582 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids);
4583 4888
4584 if (atomic_read(&nohz.load_balancer) >= nr_cpu_ids) { 4889 rcu_read_lock();
4585 int new_ilb; 4890 for_each_domain(cpu, sd)
4891 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
4892 rcu_read_unlock();
4893}
4586 4894
4587 /* make me the ilb owner */ 4895/*
4588 if (atomic_cmpxchg(&nohz.load_balancer, nr_cpu_ids, 4896 * This routine will record that this cpu is going idle with tick stopped.
4589 cpu) != nr_cpu_ids) 4897 * This info will be used in performing idle load balancing in the future.
4590 return; 4898 */
4899void select_nohz_load_balancer(int stop_tick)
4900{
4901 int cpu = smp_processor_id();
4591 4902
4592 /* 4903 if (stop_tick) {
4593 * Check to see if there is a more power-efficient 4904 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
4594 * ilb.
4595 */
4596 new_ilb = find_new_ilb(cpu);
4597 if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
4598 atomic_set(&nohz.load_balancer, nr_cpu_ids);
4599 resched_cpu(new_ilb);
4600 return;
4601 }
4602 return;
4603 }
4604 } else {
4605 if (!cpumask_test_cpu(cpu, nohz.idle_cpus_mask))
4606 return; 4905 return;
4607 4906
4608 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask); 4907 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
4609 4908 atomic_inc(&nohz.nr_cpus);
4610 if (atomic_read(&nohz.load_balancer) == cpu) 4909 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
4611 if (atomic_cmpxchg(&nohz.load_balancer, cpu,
4612 nr_cpu_ids) != cpu)
4613 BUG();
4614 } 4910 }
4615 return; 4911 return;
4616} 4912}
@@ -4624,7 +4920,7 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
4624 * Scale the max load_balance interval with the number of CPUs in the system. 4920 * Scale the max load_balance interval with the number of CPUs in the system.
4625 * This trades load-balance latency on larger machines for less cross talk. 4921 * This trades load-balance latency on larger machines for less cross talk.
4626 */ 4922 */
4627static void update_max_interval(void) 4923void update_max_interval(void)
4628{ 4924{
4629 max_load_balance_interval = HZ*num_online_cpus()/10; 4925 max_load_balance_interval = HZ*num_online_cpus()/10;
4630} 4926}
@@ -4716,11 +5012,12 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
4716 struct rq *rq; 5012 struct rq *rq;
4717 int balance_cpu; 5013 int balance_cpu;
4718 5014
4719 if (idle != CPU_IDLE || !this_rq->nohz_balance_kick) 5015 if (idle != CPU_IDLE ||
4720 return; 5016 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
5017 goto end;
4721 5018
4722 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) { 5019 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
4723 if (balance_cpu == this_cpu) 5020 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
4724 continue; 5021 continue;
4725 5022
4726 /* 5023 /*
@@ -4728,10 +5025,8 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
4728 * work being done for other cpus. Next load 5025 * work being done for other cpus. Next load
4729 * balancing owner will pick it up. 5026 * balancing owner will pick it up.
4730 */ 5027 */
4731 if (need_resched()) { 5028 if (need_resched())
4732 this_rq->nohz_balance_kick = 0;
4733 break; 5029 break;
4734 }
4735 5030
4736 raw_spin_lock_irq(&this_rq->lock); 5031 raw_spin_lock_irq(&this_rq->lock);
4737 update_rq_clock(this_rq); 5032 update_rq_clock(this_rq);
@@ -4745,53 +5040,75 @@ static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
4745 this_rq->next_balance = rq->next_balance; 5040 this_rq->next_balance = rq->next_balance;
4746 } 5041 }
4747 nohz.next_balance = this_rq->next_balance; 5042 nohz.next_balance = this_rq->next_balance;
4748 this_rq->nohz_balance_kick = 0; 5043end:
5044 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
4749} 5045}
4750 5046
4751/* 5047/*
4752 * Current heuristic for kicking the idle load balancer 5048 * Current heuristic for kicking the idle load balancer in the presence
4753 * - first_pick_cpu is the one of the busy CPUs. It will kick 5049 * of an idle cpu is the system.
4754 * idle load balancer when it has more than one process active. This 5050 * - This rq has more than one task.
4755 * eliminates the need for idle load balancing altogether when we have 5051 * - At any scheduler domain level, this cpu's scheduler group has multiple
4756 * only one running process in the system (common case). 5052 * busy cpu's exceeding the group's power.
4757 * - If there are more than one busy CPU, idle load balancer may have 5053 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
4758 * to run for active_load_balance to happen (i.e., two busy CPUs are 5054 * domain span are idle.
4759 * SMT or core siblings and can run better if they move to different
4760 * physical CPUs). So, second_pick_cpu is the second of the busy CPUs
4761 * which will kick idle load balancer as soon as it has any load.
4762 */ 5055 */
4763static inline int nohz_kick_needed(struct rq *rq, int cpu) 5056static inline int nohz_kick_needed(struct rq *rq, int cpu)
4764{ 5057{
4765 unsigned long now = jiffies; 5058 unsigned long now = jiffies;
4766 int ret; 5059 struct sched_domain *sd;
4767 int first_pick_cpu, second_pick_cpu;
4768 5060
4769 if (time_before(now, nohz.next_balance)) 5061 if (unlikely(idle_cpu(cpu)))
4770 return 0; 5062 return 0;
4771 5063
4772 if (idle_cpu(cpu)) 5064 /*
4773 return 0; 5065 * We may be recently in ticked or tickless idle mode. At the first
5066 * busy tick after returning from idle, we will update the busy stats.
5067 */
5068 set_cpu_sd_state_busy();
5069 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
5070 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
5071 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
5072 atomic_dec(&nohz.nr_cpus);
5073 }
4774 5074
4775 first_pick_cpu = atomic_read(&nohz.first_pick_cpu); 5075 /*
4776 second_pick_cpu = atomic_read(&nohz.second_pick_cpu); 5076 * None are in tickless mode and hence no need for NOHZ idle load
5077 * balancing.
5078 */
5079 if (likely(!atomic_read(&nohz.nr_cpus)))
5080 return 0;
4777 5081
4778 if (first_pick_cpu < nr_cpu_ids && first_pick_cpu != cpu && 5082 if (time_before(now, nohz.next_balance))
4779 second_pick_cpu < nr_cpu_ids && second_pick_cpu != cpu)
4780 return 0; 5083 return 0;
4781 5084
4782 ret = atomic_cmpxchg(&nohz.first_pick_cpu, nr_cpu_ids, cpu); 5085 if (rq->nr_running >= 2)
4783 if (ret == nr_cpu_ids || ret == cpu) { 5086 goto need_kick;
4784 atomic_cmpxchg(&nohz.second_pick_cpu, cpu, nr_cpu_ids); 5087
4785 if (rq->nr_running > 1) 5088 rcu_read_lock();
4786 return 1; 5089 for_each_domain(cpu, sd) {
4787 } else { 5090 struct sched_group *sg = sd->groups;
4788 ret = atomic_cmpxchg(&nohz.second_pick_cpu, nr_cpu_ids, cpu); 5091 struct sched_group_power *sgp = sg->sgp;
4789 if (ret == nr_cpu_ids || ret == cpu) { 5092 int nr_busy = atomic_read(&sgp->nr_busy_cpus);
4790 if (rq->nr_running) 5093
4791 return 1; 5094 if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
4792 } 5095 goto need_kick_unlock;
5096
5097 if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
5098 && (cpumask_first_and(nohz.idle_cpus_mask,
5099 sched_domain_span(sd)) < cpu))
5100 goto need_kick_unlock;
5101
5102 if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
5103 break;
4793 } 5104 }
5105 rcu_read_unlock();
4794 return 0; 5106 return 0;
5107
5108need_kick_unlock:
5109 rcu_read_unlock();
5110need_kick:
5111 return 1;
4795} 5112}
4796#else 5113#else
4797static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { } 5114static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
@@ -4826,14 +5143,14 @@ static inline int on_null_domain(int cpu)
4826/* 5143/*
4827 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. 5144 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
4828 */ 5145 */
4829static inline void trigger_load_balance(struct rq *rq, int cpu) 5146void trigger_load_balance(struct rq *rq, int cpu)
4830{ 5147{
4831 /* Don't need to rebalance while attached to NULL domain */ 5148 /* Don't need to rebalance while attached to NULL domain */
4832 if (time_after_eq(jiffies, rq->next_balance) && 5149 if (time_after_eq(jiffies, rq->next_balance) &&
4833 likely(!on_null_domain(cpu))) 5150 likely(!on_null_domain(cpu)))
4834 raise_softirq(SCHED_SOFTIRQ); 5151 raise_softirq(SCHED_SOFTIRQ);
4835#ifdef CONFIG_NO_HZ 5152#ifdef CONFIG_NO_HZ
4836 else if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu))) 5153 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
4837 nohz_balancer_kick(cpu); 5154 nohz_balancer_kick(cpu);
4838#endif 5155#endif
4839} 5156}
@@ -4848,15 +5165,6 @@ static void rq_offline_fair(struct rq *rq)
4848 update_sysctl(); 5165 update_sysctl();
4849} 5166}
4850 5167
4851#else /* CONFIG_SMP */
4852
4853/*
4854 * on UP we do not need to balance between CPUs:
4855 */
4856static inline void idle_balance(int cpu, struct rq *rq)
4857{
4858}
4859
4860#endif /* CONFIG_SMP */ 5168#endif /* CONFIG_SMP */
4861 5169
4862/* 5170/*
@@ -4880,8 +5188,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
4880 */ 5188 */
4881static void task_fork_fair(struct task_struct *p) 5189static void task_fork_fair(struct task_struct *p)
4882{ 5190{
4883 struct cfs_rq *cfs_rq = task_cfs_rq(current); 5191 struct cfs_rq *cfs_rq;
4884 struct sched_entity *se = &p->se, *curr = cfs_rq->curr; 5192 struct sched_entity *se = &p->se, *curr;
4885 int this_cpu = smp_processor_id(); 5193 int this_cpu = smp_processor_id();
4886 struct rq *rq = this_rq(); 5194 struct rq *rq = this_rq();
4887 unsigned long flags; 5195 unsigned long flags;
@@ -4890,6 +5198,9 @@ static void task_fork_fair(struct task_struct *p)
4890 5198
4891 update_rq_clock(rq); 5199 update_rq_clock(rq);
4892 5200
5201 cfs_rq = task_cfs_rq(current);
5202 curr = cfs_rq->curr;
5203
4893 if (unlikely(task_cpu(p) != this_cpu)) { 5204 if (unlikely(task_cpu(p) != this_cpu)) {
4894 rcu_read_lock(); 5205 rcu_read_lock();
4895 __set_task_cpu(p, this_cpu); 5206 __set_task_cpu(p, this_cpu);
@@ -4999,6 +5310,16 @@ static void set_curr_task_fair(struct rq *rq)
4999 } 5310 }
5000} 5311}
5001 5312
5313void init_cfs_rq(struct cfs_rq *cfs_rq)
5314{
5315 cfs_rq->tasks_timeline = RB_ROOT;
5316 INIT_LIST_HEAD(&cfs_rq->tasks);
5317 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
5318#ifndef CONFIG_64BIT
5319 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
5320#endif
5321}
5322
5002#ifdef CONFIG_FAIR_GROUP_SCHED 5323#ifdef CONFIG_FAIR_GROUP_SCHED
5003static void task_move_group_fair(struct task_struct *p, int on_rq) 5324static void task_move_group_fair(struct task_struct *p, int on_rq)
5004{ 5325{
@@ -5015,13 +5336,182 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
5015 * to another cgroup's rq. This does somewhat interfere with the 5336 * to another cgroup's rq. This does somewhat interfere with the
5016 * fair sleeper stuff for the first placement, but who cares. 5337 * fair sleeper stuff for the first placement, but who cares.
5017 */ 5338 */
5339 /*
5340 * When !on_rq, vruntime of the task has usually NOT been normalized.
5341 * But there are some cases where it has already been normalized:
5342 *
5343 * - Moving a forked child which is waiting for being woken up by
5344 * wake_up_new_task().
5345 * - Moving a task which has been woken up by try_to_wake_up() and
5346 * waiting for actually being woken up by sched_ttwu_pending().
5347 *
5348 * To prevent boost or penalty in the new cfs_rq caused by delta
5349 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
5350 */
5351 if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
5352 on_rq = 1;
5353
5018 if (!on_rq) 5354 if (!on_rq)
5019 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime; 5355 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
5020 set_task_rq(p, task_cpu(p)); 5356 set_task_rq(p, task_cpu(p));
5021 if (!on_rq) 5357 if (!on_rq)
5022 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime; 5358 p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
5023} 5359}
5360
5361void free_fair_sched_group(struct task_group *tg)
5362{
5363 int i;
5364
5365 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
5366
5367 for_each_possible_cpu(i) {
5368 if (tg->cfs_rq)
5369 kfree(tg->cfs_rq[i]);
5370 if (tg->se)
5371 kfree(tg->se[i]);
5372 }
5373
5374 kfree(tg->cfs_rq);
5375 kfree(tg->se);
5376}
5377
5378int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
5379{
5380 struct cfs_rq *cfs_rq;
5381 struct sched_entity *se;
5382 int i;
5383
5384 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
5385 if (!tg->cfs_rq)
5386 goto err;
5387 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
5388 if (!tg->se)
5389 goto err;
5390
5391 tg->shares = NICE_0_LOAD;
5392
5393 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
5394
5395 for_each_possible_cpu(i) {
5396 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
5397 GFP_KERNEL, cpu_to_node(i));
5398 if (!cfs_rq)
5399 goto err;
5400
5401 se = kzalloc_node(sizeof(struct sched_entity),
5402 GFP_KERNEL, cpu_to_node(i));
5403 if (!se)
5404 goto err_free_rq;
5405
5406 init_cfs_rq(cfs_rq);
5407 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
5408 }
5409
5410 return 1;
5411
5412err_free_rq:
5413 kfree(cfs_rq);
5414err:
5415 return 0;
5416}
5417
5418void unregister_fair_sched_group(struct task_group *tg, int cpu)
5419{
5420 struct rq *rq = cpu_rq(cpu);
5421 unsigned long flags;
5422
5423 /*
5424 * Only empty task groups can be destroyed; so we can speculatively
5425 * check on_list without danger of it being re-added.
5426 */
5427 if (!tg->cfs_rq[cpu]->on_list)
5428 return;
5429
5430 raw_spin_lock_irqsave(&rq->lock, flags);
5431 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
5432 raw_spin_unlock_irqrestore(&rq->lock, flags);
5433}
5434
5435void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
5436 struct sched_entity *se, int cpu,
5437 struct sched_entity *parent)
5438{
5439 struct rq *rq = cpu_rq(cpu);
5440
5441 cfs_rq->tg = tg;
5442 cfs_rq->rq = rq;
5443#ifdef CONFIG_SMP
5444 /* allow initial update_cfs_load() to truncate */
5445 cfs_rq->load_stamp = 1;
5024#endif 5446#endif
5447 init_cfs_rq_runtime(cfs_rq);
5448
5449 tg->cfs_rq[cpu] = cfs_rq;
5450 tg->se[cpu] = se;
5451
5452 /* se could be NULL for root_task_group */
5453 if (!se)
5454 return;
5455
5456 if (!parent)
5457 se->cfs_rq = &rq->cfs;
5458 else
5459 se->cfs_rq = parent->my_q;
5460
5461 se->my_q = cfs_rq;
5462 update_load_set(&se->load, 0);
5463 se->parent = parent;
5464}
5465
5466static DEFINE_MUTEX(shares_mutex);
5467
5468int sched_group_set_shares(struct task_group *tg, unsigned long shares)
5469{
5470 int i;
5471 unsigned long flags;
5472
5473 /*
5474 * We can't change the weight of the root cgroup.
5475 */
5476 if (!tg->se[0])
5477 return -EINVAL;
5478
5479 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
5480
5481 mutex_lock(&shares_mutex);
5482 if (tg->shares == shares)
5483 goto done;
5484
5485 tg->shares = shares;
5486 for_each_possible_cpu(i) {
5487 struct rq *rq = cpu_rq(i);
5488 struct sched_entity *se;
5489
5490 se = tg->se[i];
5491 /* Propagate contribution to hierarchy */
5492 raw_spin_lock_irqsave(&rq->lock, flags);
5493 for_each_sched_entity(se)
5494 update_cfs_shares(group_cfs_rq(se));
5495 raw_spin_unlock_irqrestore(&rq->lock, flags);
5496 }
5497
5498done:
5499 mutex_unlock(&shares_mutex);
5500 return 0;
5501}
5502#else /* CONFIG_FAIR_GROUP_SCHED */
5503
5504void free_fair_sched_group(struct task_group *tg) { }
5505
5506int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
5507{
5508 return 1;
5509}
5510
5511void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
5512
5513#endif /* CONFIG_FAIR_GROUP_SCHED */
5514
5025 5515
5026static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task) 5516static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
5027{ 5517{
@@ -5041,7 +5531,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
5041/* 5531/*
5042 * All the scheduling class methods: 5532 * All the scheduling class methods:
5043 */ 5533 */
5044static const struct sched_class fair_sched_class = { 5534const struct sched_class fair_sched_class = {
5045 .next = &idle_sched_class, 5535 .next = &idle_sched_class,
5046 .enqueue_task = enqueue_task_fair, 5536 .enqueue_task = enqueue_task_fair,
5047 .dequeue_task = dequeue_task_fair, 5537 .dequeue_task = dequeue_task_fair,
@@ -5078,7 +5568,7 @@ static const struct sched_class fair_sched_class = {
5078}; 5568};
5079 5569
5080#ifdef CONFIG_SCHED_DEBUG 5570#ifdef CONFIG_SCHED_DEBUG
5081static void print_cfs_stats(struct seq_file *m, int cpu) 5571void print_cfs_stats(struct seq_file *m, int cpu)
5082{ 5572{
5083 struct cfs_rq *cfs_rq; 5573 struct cfs_rq *cfs_rq;
5084 5574
@@ -5088,3 +5578,15 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
5088 rcu_read_unlock(); 5578 rcu_read_unlock();
5089} 5579}
5090#endif 5580#endif
5581
5582__init void init_sched_fair_class(void)
5583{
5584#ifdef CONFIG_SMP
5585 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
5586
5587#ifdef CONFIG_NO_HZ
5588 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
5589#endif
5590#endif /* SMP */
5591
5592}
diff --git a/kernel/sched_features.h b/kernel/sched/features.h
index 84802245abd2..e61fd73913d0 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched/features.h
@@ -3,13 +3,13 @@
3 * them to run sooner, but does not allow tons of sleepers to 3 * them to run sooner, but does not allow tons of sleepers to
4 * rip the spread apart. 4 * rip the spread apart.
5 */ 5 */
6SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) 6SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
7 7
8/* 8/*
9 * Place new tasks ahead so that they do not starve already running 9 * Place new tasks ahead so that they do not starve already running
10 * tasks 10 * tasks
11 */ 11 */
12SCHED_FEAT(START_DEBIT, 1) 12SCHED_FEAT(START_DEBIT, true)
13 13
14/* 14/*
15 * Based on load and program behaviour, see if it makes sense to place 15 * Based on load and program behaviour, see if it makes sense to place
@@ -17,54 +17,54 @@ SCHED_FEAT(START_DEBIT, 1)
17 * improve cache locality. Typically used with SYNC wakeups as 17 * improve cache locality. Typically used with SYNC wakeups as
18 * generated by pipes and the like, see also SYNC_WAKEUPS. 18 * generated by pipes and the like, see also SYNC_WAKEUPS.
19 */ 19 */
20SCHED_FEAT(AFFINE_WAKEUPS, 1) 20SCHED_FEAT(AFFINE_WAKEUPS, true)
21 21
22/* 22/*
23 * Prefer to schedule the task we woke last (assuming it failed 23 * Prefer to schedule the task we woke last (assuming it failed
24 * wakeup-preemption), since its likely going to consume data we 24 * wakeup-preemption), since its likely going to consume data we
25 * touched, increases cache locality. 25 * touched, increases cache locality.
26 */ 26 */
27SCHED_FEAT(NEXT_BUDDY, 0) 27SCHED_FEAT(NEXT_BUDDY, false)
28 28
29/* 29/*
30 * Prefer to schedule the task that ran last (when we did 30 * Prefer to schedule the task that ran last (when we did
31 * wake-preempt) as that likely will touch the same data, increases 31 * wake-preempt) as that likely will touch the same data, increases
32 * cache locality. 32 * cache locality.
33 */ 33 */
34SCHED_FEAT(LAST_BUDDY, 1) 34SCHED_FEAT(LAST_BUDDY, true)
35 35
36/* 36/*
37 * Consider buddies to be cache hot, decreases the likelyness of a 37 * Consider buddies to be cache hot, decreases the likelyness of a
38 * cache buddy being migrated away, increases cache locality. 38 * cache buddy being migrated away, increases cache locality.
39 */ 39 */
40SCHED_FEAT(CACHE_HOT_BUDDY, 1) 40SCHED_FEAT(CACHE_HOT_BUDDY, true)
41 41
42/* 42/*
43 * Use arch dependent cpu power functions 43 * Use arch dependent cpu power functions
44 */ 44 */
45SCHED_FEAT(ARCH_POWER, 0) 45SCHED_FEAT(ARCH_POWER, false)
46 46
47SCHED_FEAT(HRTICK, 0) 47SCHED_FEAT(HRTICK, false)
48SCHED_FEAT(DOUBLE_TICK, 0) 48SCHED_FEAT(DOUBLE_TICK, false)
49SCHED_FEAT(LB_BIAS, 1) 49SCHED_FEAT(LB_BIAS, true)
50 50
51/* 51/*
52 * Spin-wait on mutex acquisition when the mutex owner is running on 52 * Spin-wait on mutex acquisition when the mutex owner is running on
53 * another cpu -- assumes that when the owner is running, it will soon 53 * another cpu -- assumes that when the owner is running, it will soon
54 * release the lock. Decreases scheduling overhead. 54 * release the lock. Decreases scheduling overhead.
55 */ 55 */
56SCHED_FEAT(OWNER_SPIN, 1) 56SCHED_FEAT(OWNER_SPIN, true)
57 57
58/* 58/*
59 * Decrement CPU power based on time not spent running tasks 59 * Decrement CPU power based on time not spent running tasks
60 */ 60 */
61SCHED_FEAT(NONTASK_POWER, 1) 61SCHED_FEAT(NONTASK_POWER, true)
62 62
63/* 63/*
64 * Queue remote wakeups on the target CPU and process them 64 * Queue remote wakeups on the target CPU and process them
65 * using the scheduler IPI. Reduces rq->lock contention/bounces. 65 * using the scheduler IPI. Reduces rq->lock contention/bounces.
66 */ 66 */
67SCHED_FEAT(TTWU_QUEUE, 1) 67SCHED_FEAT(TTWU_QUEUE, true)
68 68
69SCHED_FEAT(FORCE_SD_OVERLAP, 0) 69SCHED_FEAT(FORCE_SD_OVERLAP, false)
70SCHED_FEAT(RT_RUNTIME_SHARE, 1) 70SCHED_FEAT(RT_RUNTIME_SHARE, true)
diff --git a/kernel/sched_idletask.c b/kernel/sched/idle_task.c
index 0a51882534ea..91b4c957f289 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched/idle_task.c
@@ -1,3 +1,5 @@
1#include "sched.h"
2
1/* 3/*
2 * idle-task scheduling class. 4 * idle-task scheduling class.
3 * 5 *
@@ -71,7 +73,7 @@ static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task
71/* 73/*
72 * Simple, special scheduling class for the per-CPU idle tasks: 74 * Simple, special scheduling class for the per-CPU idle tasks:
73 */ 75 */
74static const struct sched_class idle_sched_class = { 76const struct sched_class idle_sched_class = {
75 /* .next is NULL */ 77 /* .next is NULL */
76 /* no enqueue/yield_task for idle tasks */ 78 /* no enqueue/yield_task for idle tasks */
77 79
diff --git a/kernel/sched_rt.c b/kernel/sched/rt.c
index 583a1368afe6..3640ebbb466b 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched/rt.c
@@ -3,7 +3,92 @@
3 * policies) 3 * policies)
4 */ 4 */
5 5
6#include "sched.h"
7
8#include <linux/slab.h>
9
10static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
11
12struct rt_bandwidth def_rt_bandwidth;
13
14static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
15{
16 struct rt_bandwidth *rt_b =
17 container_of(timer, struct rt_bandwidth, rt_period_timer);
18 ktime_t now;
19 int overrun;
20 int idle = 0;
21
22 for (;;) {
23 now = hrtimer_cb_get_time(timer);
24 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
25
26 if (!overrun)
27 break;
28
29 idle = do_sched_rt_period_timer(rt_b, overrun);
30 }
31
32 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
33}
34
35void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
36{
37 rt_b->rt_period = ns_to_ktime(period);
38 rt_b->rt_runtime = runtime;
39
40 raw_spin_lock_init(&rt_b->rt_runtime_lock);
41
42 hrtimer_init(&rt_b->rt_period_timer,
43 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
44 rt_b->rt_period_timer.function = sched_rt_period_timer;
45}
46
47static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
48{
49 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
50 return;
51
52 if (hrtimer_active(&rt_b->rt_period_timer))
53 return;
54
55 raw_spin_lock(&rt_b->rt_runtime_lock);
56 start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
57 raw_spin_unlock(&rt_b->rt_runtime_lock);
58}
59
60void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
61{
62 struct rt_prio_array *array;
63 int i;
64
65 array = &rt_rq->active;
66 for (i = 0; i < MAX_RT_PRIO; i++) {
67 INIT_LIST_HEAD(array->queue + i);
68 __clear_bit(i, array->bitmap);
69 }
70 /* delimiter for bitsearch: */
71 __set_bit(MAX_RT_PRIO, array->bitmap);
72
73#if defined CONFIG_SMP
74 rt_rq->highest_prio.curr = MAX_RT_PRIO;
75 rt_rq->highest_prio.next = MAX_RT_PRIO;
76 rt_rq->rt_nr_migratory = 0;
77 rt_rq->overloaded = 0;
78 plist_head_init(&rt_rq->pushable_tasks);
79#endif
80
81 rt_rq->rt_time = 0;
82 rt_rq->rt_throttled = 0;
83 rt_rq->rt_runtime = 0;
84 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
85}
86
6#ifdef CONFIG_RT_GROUP_SCHED 87#ifdef CONFIG_RT_GROUP_SCHED
88static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
89{
90 hrtimer_cancel(&rt_b->rt_period_timer);
91}
7 92
8#define rt_entity_is_task(rt_se) (!(rt_se)->my_q) 93#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
9 94
@@ -25,6 +110,91 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
25 return rt_se->rt_rq; 110 return rt_se->rt_rq;
26} 111}
27 112
113void free_rt_sched_group(struct task_group *tg)
114{
115 int i;
116
117 if (tg->rt_se)
118 destroy_rt_bandwidth(&tg->rt_bandwidth);
119
120 for_each_possible_cpu(i) {
121 if (tg->rt_rq)
122 kfree(tg->rt_rq[i]);
123 if (tg->rt_se)
124 kfree(tg->rt_se[i]);
125 }
126
127 kfree(tg->rt_rq);
128 kfree(tg->rt_se);
129}
130
131void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
132 struct sched_rt_entity *rt_se, int cpu,
133 struct sched_rt_entity *parent)
134{
135 struct rq *rq = cpu_rq(cpu);
136
137 rt_rq->highest_prio.curr = MAX_RT_PRIO;
138 rt_rq->rt_nr_boosted = 0;
139 rt_rq->rq = rq;
140 rt_rq->tg = tg;
141
142 tg->rt_rq[cpu] = rt_rq;
143 tg->rt_se[cpu] = rt_se;
144
145 if (!rt_se)
146 return;
147
148 if (!parent)
149 rt_se->rt_rq = &rq->rt;
150 else
151 rt_se->rt_rq = parent->my_q;
152
153 rt_se->my_q = rt_rq;
154 rt_se->parent = parent;
155 INIT_LIST_HEAD(&rt_se->run_list);
156}
157
158int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
159{
160 struct rt_rq *rt_rq;
161 struct sched_rt_entity *rt_se;
162 int i;
163
164 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
165 if (!tg->rt_rq)
166 goto err;
167 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
168 if (!tg->rt_se)
169 goto err;
170
171 init_rt_bandwidth(&tg->rt_bandwidth,
172 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
173
174 for_each_possible_cpu(i) {
175 rt_rq = kzalloc_node(sizeof(struct rt_rq),
176 GFP_KERNEL, cpu_to_node(i));
177 if (!rt_rq)
178 goto err;
179
180 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
181 GFP_KERNEL, cpu_to_node(i));
182 if (!rt_se)
183 goto err_free_rq;
184
185 init_rt_rq(rt_rq, cpu_rq(i));
186 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
187 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
188 }
189
190 return 1;
191
192err_free_rq:
193 kfree(rt_rq);
194err:
195 return 0;
196}
197
28#else /* CONFIG_RT_GROUP_SCHED */ 198#else /* CONFIG_RT_GROUP_SCHED */
29 199
30#define rt_entity_is_task(rt_se) (1) 200#define rt_entity_is_task(rt_se) (1)
@@ -47,6 +217,12 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
47 return &rq->rt; 217 return &rq->rt;
48} 218}
49 219
220void free_rt_sched_group(struct task_group *tg) { }
221
222int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
223{
224 return 1;
225}
50#endif /* CONFIG_RT_GROUP_SCHED */ 226#endif /* CONFIG_RT_GROUP_SCHED */
51 227
52#ifdef CONFIG_SMP 228#ifdef CONFIG_SMP
@@ -556,6 +732,28 @@ static void enable_runtime(struct rq *rq)
556 raw_spin_unlock_irqrestore(&rq->lock, flags); 732 raw_spin_unlock_irqrestore(&rq->lock, flags);
557} 733}
558 734
735int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu)
736{
737 int cpu = (int)(long)hcpu;
738
739 switch (action) {
740 case CPU_DOWN_PREPARE:
741 case CPU_DOWN_PREPARE_FROZEN:
742 disable_runtime(cpu_rq(cpu));
743 return NOTIFY_OK;
744
745 case CPU_DOWN_FAILED:
746 case CPU_DOWN_FAILED_FROZEN:
747 case CPU_ONLINE:
748 case CPU_ONLINE_FROZEN:
749 enable_runtime(cpu_rq(cpu));
750 return NOTIFY_OK;
751
752 default:
753 return NOTIFY_DONE;
754 }
755}
756
559static int balance_runtime(struct rt_rq *rt_rq) 757static int balance_runtime(struct rt_rq *rt_rq)
560{ 758{
561 int more = 0; 759 int more = 0;
@@ -648,7 +846,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
648 if (rt_rq->rt_throttled) 846 if (rt_rq->rt_throttled)
649 return rt_rq_throttled(rt_rq); 847 return rt_rq_throttled(rt_rq);
650 848
651 if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) 849 if (runtime >= sched_rt_period(rt_rq))
652 return 0; 850 return 0;
653 851
654 balance_runtime(rt_rq); 852 balance_runtime(rt_rq);
@@ -957,8 +1155,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
957} 1155}
958 1156
959/* 1157/*
960 * Put task to the end of the run list without the overhead of dequeue 1158 * Put task to the head or the end of the run list without the overhead of
961 * followed by enqueue. 1159 * dequeue followed by enqueue.
962 */ 1160 */
963static void 1161static void
964requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head) 1162requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
@@ -1002,6 +1200,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1002 1200
1003 cpu = task_cpu(p); 1201 cpu = task_cpu(p);
1004 1202
1203 if (p->rt.nr_cpus_allowed == 1)
1204 goto out;
1205
1005 /* For anything but wake ups, just return the task_cpu */ 1206 /* For anything but wake ups, just return the task_cpu */
1006 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK) 1207 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1007 goto out; 1208 goto out;
@@ -1178,8 +1379,6 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1178/* Only try algorithms three times */ 1379/* Only try algorithms three times */
1179#define RT_MAX_TRIES 3 1380#define RT_MAX_TRIES 3
1180 1381
1181static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
1182
1183static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 1382static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1184{ 1383{
1185 if (!task_running(rq, p) && 1384 if (!task_running(rq, p) &&
@@ -1653,13 +1852,14 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
1653 pull_rt_task(rq); 1852 pull_rt_task(rq);
1654} 1853}
1655 1854
1656static inline void init_sched_rt_class(void) 1855void init_sched_rt_class(void)
1657{ 1856{
1658 unsigned int i; 1857 unsigned int i;
1659 1858
1660 for_each_possible_cpu(i) 1859 for_each_possible_cpu(i) {
1661 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i), 1860 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
1662 GFP_KERNEL, cpu_to_node(i)); 1861 GFP_KERNEL, cpu_to_node(i));
1862 }
1663} 1863}
1664#endif /* CONFIG_SMP */ 1864#endif /* CONFIG_SMP */
1665 1865
@@ -1800,7 +2000,7 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
1800 return 0; 2000 return 0;
1801} 2001}
1802 2002
1803static const struct sched_class rt_sched_class = { 2003const struct sched_class rt_sched_class = {
1804 .next = &fair_sched_class, 2004 .next = &fair_sched_class,
1805 .enqueue_task = enqueue_task_rt, 2005 .enqueue_task = enqueue_task_rt,
1806 .dequeue_task = dequeue_task_rt, 2006 .dequeue_task = dequeue_task_rt,
@@ -1835,7 +2035,7 @@ static const struct sched_class rt_sched_class = {
1835#ifdef CONFIG_SCHED_DEBUG 2035#ifdef CONFIG_SCHED_DEBUG
1836extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); 2036extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
1837 2037
1838static void print_rt_stats(struct seq_file *m, int cpu) 2038void print_rt_stats(struct seq_file *m, int cpu)
1839{ 2039{
1840 rt_rq_iter_t iter; 2040 rt_rq_iter_t iter;
1841 struct rt_rq *rt_rq; 2041 struct rt_rq *rt_rq;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
new file mode 100644
index 000000000000..98c0c2623db8
--- /dev/null
+++ b/kernel/sched/sched.h
@@ -0,0 +1,1166 @@
1
2#include <linux/sched.h>
3#include <linux/mutex.h>
4#include <linux/spinlock.h>
5#include <linux/stop_machine.h>
6
7#include "cpupri.h"
8
9extern __read_mostly int scheduler_running;
10
11/*
12 * Convert user-nice values [ -20 ... 0 ... 19 ]
13 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
14 * and back.
15 */
16#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
17#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
18#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
19
20/*
21 * 'User priority' is the nice value converted to something we
22 * can work with better when scaling various scheduler parameters,
23 * it's a [ 0 ... 39 ] range.
24 */
25#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
26#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
27#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
28
29/*
30 * Helpers for converting nanosecond timing to jiffy resolution
31 */
32#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
33
34#define NICE_0_LOAD SCHED_LOAD_SCALE
35#define NICE_0_SHIFT SCHED_LOAD_SHIFT
36
37/*
38 * These are the 'tuning knobs' of the scheduler:
39 *
40 * default timeslice is 100 msecs (used only for SCHED_RR tasks).
41 * Timeslices get refilled after they expire.
42 */
43#define DEF_TIMESLICE (100 * HZ / 1000)
44
45/*
46 * single value that denotes runtime == period, ie unlimited time.
47 */
48#define RUNTIME_INF ((u64)~0ULL)
49
/*
 * rt_policy - is @policy a realtime scheduling policy?
 *
 * Returns 1 for SCHED_FIFO and SCHED_RR, 0 for every other policy.
 */
static inline int rt_policy(int policy)
{
	switch (policy) {
	case SCHED_FIFO:
	case SCHED_RR:
		return 1;
	default:
		return 0;
	}
}
56
57static inline int task_has_rt_policy(struct task_struct *p)
58{
59 return rt_policy(p->policy);
60}
61
62/*
63 * This is the priority-queue data structure of the RT scheduling class:
64 */
65struct rt_prio_array {
66 DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
67 struct list_head queue[MAX_RT_PRIO];
68};
69
70struct rt_bandwidth {
71 /* nests inside the rq lock: */
72 raw_spinlock_t rt_runtime_lock;
73 ktime_t rt_period;
74 u64 rt_runtime;
75 struct hrtimer rt_period_timer;
76};
77
78extern struct mutex sched_domains_mutex;
79
80#ifdef CONFIG_CGROUP_SCHED
81
82#include <linux/cgroup.h>
83
84struct cfs_rq;
85struct rt_rq;
86
87static LIST_HEAD(task_groups);
88
89struct cfs_bandwidth {
90#ifdef CONFIG_CFS_BANDWIDTH
91 raw_spinlock_t lock;
92 ktime_t period;
93 u64 quota, runtime;
94 s64 hierarchal_quota;
95 u64 runtime_expires;
96
97 int idle, timer_active;
98 struct hrtimer period_timer, slack_timer;
99 struct list_head throttled_cfs_rq;
100
101 /* statistics */
102 int nr_periods, nr_throttled;
103 u64 throttled_time;
104#endif
105};
106
107/* task group related information */
108struct task_group {
109 struct cgroup_subsys_state css;
110
111#ifdef CONFIG_FAIR_GROUP_SCHED
112 /* schedulable entities of this group on each cpu */
113 struct sched_entity **se;
114 /* runqueue "owned" by this group on each cpu */
115 struct cfs_rq **cfs_rq;
116 unsigned long shares;
117
118 atomic_t load_weight;
119#endif
120
121#ifdef CONFIG_RT_GROUP_SCHED
122 struct sched_rt_entity **rt_se;
123 struct rt_rq **rt_rq;
124
125 struct rt_bandwidth rt_bandwidth;
126#endif
127
128 struct rcu_head rcu;
129 struct list_head list;
130
131 struct task_group *parent;
132 struct list_head siblings;
133 struct list_head children;
134
135#ifdef CONFIG_SCHED_AUTOGROUP
136 struct autogroup *autogroup;
137#endif
138
139 struct cfs_bandwidth cfs_bandwidth;
140};
141
142#ifdef CONFIG_FAIR_GROUP_SCHED
143#define ROOT_TASK_GROUP_LOAD NICE_0_LOAD
144
145/*
146 * A weight of 0 or 1 can cause arithmetics problems.
147 * A weight of a cfs_rq is the sum of weights of which entities
148 * are queued on this cfs_rq, so a weight of a entity should not be
149 * too large, so as the shares value of a task group.
150 * (The default weight is 1024 - so there's no practical
151 * limitation from this.)
152 */
153#define MIN_SHARES (1UL << 1)
154#define MAX_SHARES (1UL << 18)
155#endif
156
157/* Default task group.
158 * Every task in system belong to this group at bootup.
159 */
160extern struct task_group root_task_group;
161
162typedef int (*tg_visitor)(struct task_group *, void *);
163
164extern int walk_tg_tree_from(struct task_group *from,
165 tg_visitor down, tg_visitor up, void *data);
166
167/*
168 * Iterate the full tree, calling @down when first entering a node and @up when
169 * leaving it for the final time.
170 *
171 * Caller must hold rcu_lock or sufficient equivalent.
172 */
173static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
174{
175 return walk_tg_tree_from(&root_task_group, down, up, data);
176}
177
178extern int tg_nop(struct task_group *tg, void *data);
179
180extern void free_fair_sched_group(struct task_group *tg);
181extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent);
182extern void unregister_fair_sched_group(struct task_group *tg, int cpu);
183extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
184 struct sched_entity *se, int cpu,
185 struct sched_entity *parent);
186extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
187extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
188
189extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
190extern void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
191extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
192
193extern void free_rt_sched_group(struct task_group *tg);
194extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
195extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
196 struct sched_rt_entity *rt_se, int cpu,
197 struct sched_rt_entity *parent);
198
199#else /* CONFIG_CGROUP_SCHED */
200
201struct cfs_bandwidth { };
202
203#endif /* CONFIG_CGROUP_SCHED */
204
205/* CFS-related fields in a runqueue */
206struct cfs_rq {
207 struct load_weight load;
208 unsigned long nr_running, h_nr_running;
209
210 u64 exec_clock;
211 u64 min_vruntime;
212#ifndef CONFIG_64BIT
213 u64 min_vruntime_copy;
214#endif
215
216 struct rb_root tasks_timeline;
217 struct rb_node *rb_leftmost;
218
219 struct list_head tasks;
220 struct list_head *balance_iterator;
221
222 /*
223 * 'curr' points to currently running entity on this cfs_rq.
224 * It is set to NULL otherwise (i.e when none are currently running).
225 */
226 struct sched_entity *curr, *next, *last, *skip;
227
228#ifdef CONFIG_SCHED_DEBUG
229 unsigned int nr_spread_over;
230#endif
231
232#ifdef CONFIG_FAIR_GROUP_SCHED
233 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
234
235 /*
236 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
237 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
238 * (like users, containers etc.)
239 *
240 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
241 * list is used during load balance.
242 */
243 int on_list;
244 struct list_head leaf_cfs_rq_list;
245 struct task_group *tg; /* group that "owns" this runqueue */
246
247#ifdef CONFIG_SMP
248 /*
249 * the part of load.weight contributed by tasks
250 */
251 unsigned long task_weight;
252
253 /*
254 * h_load = weight * f(tg)
255 *
256 * Where f(tg) is the recursive weight fraction assigned to
257 * this group.
258 */
259 unsigned long h_load;
260
261 /*
262 * Maintaining per-cpu shares distribution for group scheduling
263 *
264 * load_stamp is the last time we updated the load average
265 * load_last is the last time we updated the load average and saw load
266 * load_unacc_exec_time is currently unaccounted execution time
267 */
268 u64 load_avg;
269 u64 load_period;
270 u64 load_stamp, load_last, load_unacc_exec_time;
271
272 unsigned long load_contribution;
273#endif /* CONFIG_SMP */
274#ifdef CONFIG_CFS_BANDWIDTH
275 int runtime_enabled;
276 u64 runtime_expires;
277 s64 runtime_remaining;
278
279 u64 throttled_timestamp;
280 int throttled, throttle_count;
281 struct list_head throttled_list;
282#endif /* CONFIG_CFS_BANDWIDTH */
283#endif /* CONFIG_FAIR_GROUP_SCHED */
284};
285
286static inline int rt_bandwidth_enabled(void)
287{
288 return sysctl_sched_rt_runtime >= 0;
289}
290
291/* Real-Time classes' related field in a runqueue: */
292struct rt_rq {
293 struct rt_prio_array active;
294 unsigned long rt_nr_running;
295#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
296 struct {
297 int curr; /* highest queued rt task prio */
298#ifdef CONFIG_SMP
299 int next; /* next highest */
300#endif
301 } highest_prio;
302#endif
303#ifdef CONFIG_SMP
304 unsigned long rt_nr_migratory;
305 unsigned long rt_nr_total;
306 int overloaded;
307 struct plist_head pushable_tasks;
308#endif
309 int rt_throttled;
310 u64 rt_time;
311 u64 rt_runtime;
312 /* Nests inside the rq lock: */
313 raw_spinlock_t rt_runtime_lock;
314
315#ifdef CONFIG_RT_GROUP_SCHED
316 unsigned long rt_nr_boosted;
317
318 struct rq *rq;
319 struct list_head leaf_rt_rq_list;
320 struct task_group *tg;
321#endif
322};
323
324#ifdef CONFIG_SMP
325
326/*
327 * We add the notion of a root-domain which will be used to define per-domain
328 * variables. Each exclusive cpuset essentially defines an island domain by
329 * fully partitioning the member cpus from any other cpuset. Whenever a new
330 * exclusive cpuset is created, we also create and attach a new root-domain
331 * object.
332 *
333 */
334struct root_domain {
335 atomic_t refcount;
336 atomic_t rto_count;
337 struct rcu_head rcu;
338 cpumask_var_t span;
339 cpumask_var_t online;
340
341 /*
342 * The "RT overload" flag: it gets set if a CPU has more than
343 * one runnable RT task.
344 */
345 cpumask_var_t rto_mask;
346 struct cpupri cpupri;
347};
348
349extern struct root_domain def_root_domain;
350
351#endif /* CONFIG_SMP */
352
353/*
354 * This is the main, per-CPU runqueue data structure.
355 *
356 * Locking rule: those places that want to lock multiple runqueues
357 * (such as the load balancing or the thread migration code), lock
358 * acquire operations must be ordered by ascending &runqueue.
359 */
360struct rq {
361 /* runqueue lock: */
362 raw_spinlock_t lock;
363
364 /*
365 * nr_running and cpu_load should be in the same cacheline because
366 * remote CPUs use both these fields when doing load calculation.
367 */
368 unsigned long nr_running;
369 #define CPU_LOAD_IDX_MAX 5
370 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
371 unsigned long last_load_update_tick;
372#ifdef CONFIG_NO_HZ
373 u64 nohz_stamp;
374 unsigned long nohz_flags;
375#endif
376 int skip_clock_update;
377
378 /* capture load from *all* tasks on this cpu: */
379 struct load_weight load;
380 unsigned long nr_load_updates;
381 u64 nr_switches;
382
383 struct cfs_rq cfs;
384 struct rt_rq rt;
385
386#ifdef CONFIG_FAIR_GROUP_SCHED
387 /* list of leaf cfs_rq on this cpu: */
388 struct list_head leaf_cfs_rq_list;
389#endif
390#ifdef CONFIG_RT_GROUP_SCHED
391 struct list_head leaf_rt_rq_list;
392#endif
393
394 /*
395 * This is part of a global counter where only the total sum
396 * over all CPUs matters. A task can increase this counter on
397 * one CPU and if it got migrated afterwards it may decrease
398 * it on another CPU. Always updated under the runqueue lock:
399 */
400 unsigned long nr_uninterruptible;
401
402 struct task_struct *curr, *idle, *stop;
403 unsigned long next_balance;
404 struct mm_struct *prev_mm;
405
406 u64 clock;
407 u64 clock_task;
408
409 atomic_t nr_iowait;
410
411#ifdef CONFIG_SMP
412 struct root_domain *rd;
413 struct sched_domain *sd;
414
415 unsigned long cpu_power;
416
417 unsigned char idle_balance;
418 /* For active balancing */
419 int post_schedule;
420 int active_balance;
421 int push_cpu;
422 struct cpu_stop_work active_balance_work;
423 /* cpu of this runqueue: */
424 int cpu;
425 int online;
426
427 u64 rt_avg;
428 u64 age_stamp;
429 u64 idle_stamp;
430 u64 avg_idle;
431#endif
432
433#ifdef CONFIG_IRQ_TIME_ACCOUNTING
434 u64 prev_irq_time;
435#endif
436#ifdef CONFIG_PARAVIRT
437 u64 prev_steal_time;
438#endif
439#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
440 u64 prev_steal_time_rq;
441#endif
442
443 /* calc_load related fields */
444 unsigned long calc_load_update;
445 long calc_load_active;
446
447#ifdef CONFIG_SCHED_HRTICK
448#ifdef CONFIG_SMP
449 int hrtick_csd_pending;
450 struct call_single_data hrtick_csd;
451#endif
452 struct hrtimer hrtick_timer;
453#endif
454
455#ifdef CONFIG_SCHEDSTATS
456 /* latency stats */
457 struct sched_info rq_sched_info;
458 unsigned long long rq_cpu_time;
459 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
460
461 /* sys_sched_yield() stats */
462 unsigned int yld_count;
463
464 /* schedule() stats */
465 unsigned int sched_switch;
466 unsigned int sched_count;
467 unsigned int sched_goidle;
468
469 /* try_to_wake_up() stats */
470 unsigned int ttwu_count;
471 unsigned int ttwu_local;
472#endif
473
474#ifdef CONFIG_SMP
475 struct llist_head wake_list;
476#endif
477};
478
/* Return the CPU number runqueue @rq belongs to (always 0 on !SMP builds,
 * where rq has no ->cpu field). */
479static inline int cpu_of(struct rq *rq)
480{
481#ifdef CONFIG_SMP
482 return rq->cpu;
483#else
484 return 0;
485#endif
486}
487
488DECLARE_PER_CPU(struct rq, runqueues);
489
490#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
491#define this_rq() (&__get_cpu_var(runqueues))
492#define task_rq(p) cpu_rq(task_cpu(p))
493#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
494#define raw_rq() (&__raw_get_cpu_var(runqueues))
495
496#ifdef CONFIG_SMP
497
498#define rcu_dereference_check_sched_domain(p) \
499 rcu_dereference_check((p), \
500 lockdep_is_held(&sched_domains_mutex))
501
502/*
503 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
504 * See detach_destroy_domains: synchronize_sched for details.
505 *
506 * The domain tree of any CPU may only be accessed from within
507 * preempt-disabled sections.
508 */
509#define for_each_domain(cpu, __sd) \
510 for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \
511 __sd; __sd = __sd->parent)
512
513#define for_each_lower_domain(sd) for (; sd; sd = sd->child)
514
515/**
516 * highest_flag_domain - Return highest sched_domain containing flag.
517 * @cpu: The cpu whose highest level of sched domain is to
518 * be returned.
519 * @flag: The flag to check for the highest sched_domain
520 * for the given cpu.
521 *
522 * Returns the highest sched_domain of a cpu which contains the given flag.
523 */
524static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
525{
526 struct sched_domain *sd, *hsd = NULL;
527
 /*
  * Walk bottom-up and stop at the first level that lacks @flag; the
  * last level that had it is returned. NOTE(review): this assumes
  * @flag is set contiguously from the lowest domain upward. Returns
  * NULL when even the lowest domain lacks @flag.
  */
528 for_each_domain(cpu, sd) {
529 if (!(sd->flags & flag))
530 break;
531 hsd = sd;
532 }
533
534 return hsd;
535}
536
537DECLARE_PER_CPU(struct sched_domain *, sd_llc);
538DECLARE_PER_CPU(int, sd_llc_id);
539
540#endif /* CONFIG_SMP */
541
542#include "stats.h"
543#include "auto_group.h"
544
545#ifdef CONFIG_CGROUP_SCHED
546
547/*
548 * Return the group to which this tasks belongs.
549 *
550 * We use task_subsys_state_check() and extend the RCU verification with
551 * pi->lock and rq->lock because cpu_cgroup_attach() holds those locks for each
552 * task it moves into the cgroup. Therefore by holding either of those locks,
553 * we pin the task to the current cgroup.
554 */
/*
 * Resolve @p's task_group from its cpu-cgroup subsystem state (see the
 * locking comment above), then give the autogroup code a chance to
 * substitute its own group — see autogroup_task_group() in auto_group.h.
 */
555static inline struct task_group *task_group(struct task_struct *p)
556{
557 struct task_group *tg;
558 struct cgroup_subsys_state *css;
559
 /* RCU dereference, additionally valid under p->pi_lock or rq->lock */
560 css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
561 lockdep_is_held(&p->pi_lock) ||
562 lockdep_is_held(&task_rq(p)->lock));
563 tg = container_of(css, struct task_group, css);
564
565 return autogroup_task_group(p, tg);
566}
567
568/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
569static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
570{
571#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)
572 struct task_group *tg = task_group(p);
573#endif
574
575#ifdef CONFIG_FAIR_GROUP_SCHED
576 p->se.cfs_rq = tg->cfs_rq[cpu];
577 p->se.parent = tg->se[cpu];
578#endif
579
580#ifdef CONFIG_RT_GROUP_SCHED
581 p->rt.rt_rq = tg->rt_rq[cpu];
582 p->rt.parent = tg->rt_se[cpu];
583#endif
584}
585
586#else /* CONFIG_CGROUP_SCHED */
587
588static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
589static inline struct task_group *task_group(struct task_struct *p)
590{
591 return NULL;
592}
593
594#endif /* CONFIG_CGROUP_SCHED */
595
/* Re-point @p's per-group runqueue links (set_task_rq()) and, on SMP,
 * its thread_info->cpu at @cpu. */
596static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
597{
598 set_task_rq(p, cpu);
599#ifdef CONFIG_SMP
600 /*
601 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
602 * successfully executed on another CPU. We must ensure that updates of
603 * per-task data have been completed by this moment.
604 */
605 smp_wmb();
606 task_thread_info(p)->cpu = cpu;
607#endif
608}
609
610/*
611 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
612 */
613#ifdef CONFIG_SCHED_DEBUG
614# include <linux/jump_label.h>
615# define const_debug __read_mostly
616#else
617# define const_debug const
618#endif
619
620extern const_debug unsigned int sysctl_sched_features;
621
622#define SCHED_FEAT(name, enabled) \
623 __SCHED_FEAT_##name ,
624
625enum {
626#include "features.h"
627 __SCHED_FEAT_NR,
628};
629
630#undef SCHED_FEAT
631
632#if defined(CONFIG_SCHED_DEBUG) && defined(HAVE_JUMP_LABEL)
633static __always_inline bool static_branch__true(struct jump_label_key *key)
634{
635 return likely(static_branch(key)); /* Not out of line branch. */
636}
637
638static __always_inline bool static_branch__false(struct jump_label_key *key)
639{
640 return unlikely(static_branch(key)); /* Out of line branch. */
641}
642
643#define SCHED_FEAT(name, enabled) \
644static __always_inline bool static_branch_##name(struct jump_label_key *key) \
645{ \
646 return static_branch__##enabled(key); \
647}
648
649#include "features.h"
650
651#undef SCHED_FEAT
652
653extern struct jump_label_key sched_feat_keys[__SCHED_FEAT_NR];
654#define sched_feat(x) (static_branch_##x(&sched_feat_keys[__SCHED_FEAT_##x]))
655#else /* !(SCHED_DEBUG && HAVE_JUMP_LABEL) */
656#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
657#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
658
/* Global RT period: sysctl_sched_rt_period (microseconds) in nanoseconds. */
659static inline u64 global_rt_period(void)
660{
661 return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
662}
663
/*
 * Global RT runtime limit in nanoseconds. A negative sysctl value means
 * "no limit" and is mapped to RUNTIME_INF.
 */
664static inline u64 global_rt_runtime(void)
665{
666 if (sysctl_sched_rt_runtime < 0)
667 return RUNTIME_INF;
668
669 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
670}
671
672
673
/* Is @p the task installed as @rq's current task (rq->curr)? */
674static inline int task_current(struct rq *rq, struct task_struct *p)
675{
676 return rq->curr == p;
677}
678
/*
 * Is @p executing on a CPU right now? SMP tracks this via p->on_cpu
 * (set/cleared around the context switch by prepare_lock_switch() /
 * finish_lock_switch() below); !SMP simply checks rq->curr.
 */
679static inline int task_running(struct rq *rq, struct task_struct *p)
680{
681#ifdef CONFIG_SMP
682 return p->on_cpu;
683#else
684 return task_current(rq, p);
685#endif
686}
687
688
689#ifndef prepare_arch_switch
690# define prepare_arch_switch(next) do { } while (0)
691#endif
692#ifndef finish_arch_switch
693# define finish_arch_switch(prev) do { } while (0)
694#endif
695
696#ifndef __ARCH_WANT_UNLOCKED_CTXSW
/*
 * Called before the context switch, with rq->lock held: mark @next as
 * on-CPU so task_running() sees it as busy.
 */
697static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
698{
699#ifdef CONFIG_SMP
700 /*
701 * We can optimise this out completely for !SMP, because the
702 * SMP rebalancing from interrupt is the only thing that cares
703 * here.
704 */
705 next->on_cpu = 1;
706#endif
707}
708
/*
 * Called after the context switch completes: clear prev->on_cpu (after
 * a write barrier), fix up rq->lock ownership for spinlock debugging and
 * lockdep, then release the lock and re-enable interrupts.
 */
709static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
710{
711#ifdef CONFIG_SMP
712 /*
713 * After ->on_cpu is cleared, the task can be moved to a different CPU.
714 * We must ensure this doesn't happen until the switch is completely
715 * finished.
716 */
717 smp_wmb();
718 prev->on_cpu = 0;
719#endif
720#ifdef CONFIG_DEBUG_SPINLOCK
721 /* this is a valid case when another task releases the spinlock */
722 rq->lock.owner = current;
723#endif
724 /*
725 * If we are tracking spinlock dependencies then we have to
726 * fix up the runqueue lock - which gets 'carried over' from
727 * prev into current:
728 */
729 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
730
731 raw_spin_unlock_irq(&rq->lock);
732}
733
734#else /* __ARCH_WANT_UNLOCKED_CTXSW */
735static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
736{
737#ifdef CONFIG_SMP
738 /*
739 * We can optimise this out completely for !SMP, because the
740 * SMP rebalancing from interrupt is the only thing that cares
741 * here.
742 */
743 next->on_cpu = 1;
744#endif
745#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
746 raw_spin_unlock_irq(&rq->lock);
747#else
748 raw_spin_unlock(&rq->lock);
749#endif
750}
751
752static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
753{
754#ifdef CONFIG_SMP
755 /*
756 * After ->on_cpu is cleared, the task can be moved to a different CPU.
757 * We must ensure this doesn't happen until the switch is completely
758 * finished.
759 */
760 smp_wmb();
761 prev->on_cpu = 0;
762#endif
763#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
764 local_irq_enable();
765#endif
766}
767#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
768
769
/* Add @inc to @lw->weight; zeroing inv_weight marks the cached inverse
 * (cf. the 2^32/weight values in prio_to_wmult[]) as stale. */
770static inline void update_load_add(struct load_weight *lw, unsigned long inc)
771{
772 lw->weight += inc;
773 lw->inv_weight = 0;
774}
775
/* Subtract @dec from @lw->weight and invalidate the cached inverse. */
776static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
777{
778 lw->weight -= dec;
779 lw->inv_weight = 0;
780}
781
/* Set @lw->weight to @w and invalidate the cached inverse. */
782static inline void update_load_set(struct load_weight *lw, unsigned long w)
783{
784 lw->weight = w;
785 lw->inv_weight = 0;
786}
787
788/*
789 * To aid in avoiding the subversion of "niceness" due to uneven distribution
790 * of tasks with abnormal "nice" values across CPUs the contribution that
791 * each task makes to its run queue's load is weighted according to its
792 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
793 * scaled version of the new time slice allocation that they receive on time
794 * slice expiry etc.
795 */
796
797#define WEIGHT_IDLEPRIO 3
798#define WMULT_IDLEPRIO 1431655765
799
800/*
801 * Nice levels are multiplicative, with a gentle 10% change for every
802 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
803 * nice 1, it will get ~10% less CPU time than another CPU-bound task
804 * that remained on nice 0.
805 *
806 * The "10% effect" is relative and cumulative: from _any_ nice level,
807 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
808 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
809 * If a task goes up by ~10% and another task goes down by ~10% then
810 * the relative distance between them is ~25%.)
811 */
812static const int prio_to_weight[40] = {
813 /* -20 */ 88761, 71755, 56483, 46273, 36291,
814 /* -15 */ 29154, 23254, 18705, 14949, 11916,
815 /* -10 */ 9548, 7620, 6100, 4904, 3906,
816 /* -5 */ 3121, 2501, 1991, 1586, 1277,
817 /* 0 */ 1024, 820, 655, 526, 423,
818 /* 5 */ 335, 272, 215, 172, 137,
819 /* 10 */ 110, 87, 70, 56, 45,
820 /* 15 */ 36, 29, 23, 18, 15,
821};
822
823/*
824 * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
825 *
826 * In cases where the weight does not change often, we can use the
827 * precalculated inverse to speed up arithmetics by turning divisions
828 * into multiplications:
829 */
830static const u32 prio_to_wmult[40] = {
831 /* -20 */ 48388, 59856, 76040, 92818, 118348,
832 /* -15 */ 147320, 184698, 229616, 287308, 360437,
833 /* -10 */ 449829, 563644, 704093, 875809, 1099582,
834 /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326,
835 /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587,
836 /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126,
837 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
838 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
839};
840
841/* Time spent by the tasks of the cpu accounting group executing in ... */
842enum cpuacct_stat_index {
843 CPUACCT_STAT_USER, /* ... user mode */
844 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
845
846 CPUACCT_STAT_NSTATS,
847};
848
849
850#define sched_class_highest (&stop_sched_class)
851#define for_each_class(class) \
852 for (class = sched_class_highest; class; class = class->next)
853
854extern const struct sched_class stop_sched_class;
855extern const struct sched_class rt_sched_class;
856extern const struct sched_class fair_sched_class;
857extern const struct sched_class idle_sched_class;
858
859
860#ifdef CONFIG_SMP
861
862extern void trigger_load_balance(struct rq *rq, int cpu);
863extern void idle_balance(int this_cpu, struct rq *this_rq);
864
865#else /* CONFIG_SMP */
866
867static inline void idle_balance(int cpu, struct rq *rq)
868{
869}
870
871#endif
872
873extern void sysrq_sched_debug_show(void);
874extern void sched_init_granularity(void);
875extern void update_max_interval(void);
876extern void update_group_power(struct sched_domain *sd, int cpu);
877extern int update_runtime(struct notifier_block *nfb, unsigned long action, void *hcpu);
878extern void init_sched_rt_class(void);
879extern void init_sched_fair_class(void);
880
881extern void resched_task(struct task_struct *p);
882extern void resched_cpu(int cpu);
883
884extern struct rt_bandwidth def_rt_bandwidth;
885extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
886
887extern void update_cpu_load(struct rq *this_rq);
888
889#ifdef CONFIG_CGROUP_CPUACCT
890#include <linux/cgroup.h>
891/* track cpu usage of a group of tasks and its child groups */
892struct cpuacct {
893 struct cgroup_subsys_state css;
894 /* cpuusage holds pointer to a u64-type object on every cpu */
895 u64 __percpu *cpuusage;
896 struct kernel_cpustat __percpu *cpustat;
897};
898
899/* return cpu accounting group corresponding to this container */
900static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
901{
902 return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
903 struct cpuacct, css);
904}
905
906/* return cpu accounting group to which this task belongs */
907static inline struct cpuacct *task_ca(struct task_struct *tsk)
908{
909 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
910 struct cpuacct, css);
911}
912
/* Parent accounting group of @ca, or NULL if @ca is NULL or is the root
 * group (its cgroup has no parent). */
913static inline struct cpuacct *parent_ca(struct cpuacct *ca)
914{
915 if (!ca || !ca->css.cgroup->parent)
916 return NULL;
917 return cgroup_ca(ca->css.cgroup->parent);
918}
919
920extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
921#else
922static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
923#endif
924
925static inline void inc_nr_running(struct rq *rq)
926{
927 rq->nr_running++;
928}
929
930static inline void dec_nr_running(struct rq *rq)
931{
932 rq->nr_running--;
933}
934
935extern void update_rq_clock(struct rq *rq);
936
937extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
938extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags);
939
940extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
941
942extern const_debug unsigned int sysctl_sched_time_avg;
943extern const_debug unsigned int sysctl_sched_nr_migrate;
944extern const_debug unsigned int sysctl_sched_migration_cost;
945
946static inline u64 sched_avg_period(void)
947{
948 return (u64)sysctl_sched_time_avg * NSEC_PER_MSEC / 2;
949}
950
951void calc_load_account_idle(struct rq *this_rq);
952
953#ifdef CONFIG_SCHED_HRTICK
954
955/*
956 * Use hrtick when:
957 * - enabled by features
958 * - hrtimer is actually high res
959 */
960static inline int hrtick_enabled(struct rq *rq)
961{
962 if (!sched_feat(HRTICK))
963 return 0;
964 if (!cpu_active(cpu_of(rq)))
965 return 0;
966 return hrtimer_is_hres_active(&rq->hrtick_timer);
967}
968
969void hrtick_start(struct rq *rq, u64 delay);
970
971#else
972
973static inline int hrtick_enabled(struct rq *rq)
974{
975 return 0;
976}
977
978#endif /* CONFIG_SCHED_HRTICK */
979
980#ifdef CONFIG_SMP
981extern void sched_avg_update(struct rq *rq);
982static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
983{
984 rq->rt_avg += rt_delta;
985 sched_avg_update(rq);
986}
987#else
988static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta) { }
989static inline void sched_avg_update(struct rq *rq) { }
990#endif
991
992extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
993
994#ifdef CONFIG_SMP
995#ifdef CONFIG_PREEMPT
996
997static inline void double_rq_lock(struct rq *rq1, struct rq *rq2);
998
999/*
1000 * fair double_lock_balance: Safely acquires both rq->locks in a fair
1001 * way at the expense of forcing extra atomic operations in all
1002 * invocations. This assures that the double_lock is acquired using the
1003 * same underlying policy as the spinlock_t on this architecture, which
1004 * reduces latency compared to the unfair variant below. However, it
1005 * also adds more overhead and therefore may reduce throughput.
1006 */
/*
 * Fair (CONFIG_PREEMPT) variant, see comment above: unconditionally drop
 * this_rq->lock, then take both locks via double_rq_lock().
 */
1007static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1008 __releases(this_rq->lock)
1009 __acquires(busiest->lock)
1010 __acquires(this_rq->lock)
1011{
1012 raw_spin_unlock(&this_rq->lock);
1013 double_rq_lock(this_rq, busiest);
1014
 /* this_rq->lock was always dropped and reacquired: report that */
1015 return 1;
1016}
1017
1018#else
1019/*
1020 * Unfair double_lock_balance: Optimizes throughput at the expense of
1021 * latency by eliminating extra atomic operations when the locks are
1022 * already in proper order on entry. This favors lower cpu-ids and will
1023 * grant the double lock to lower cpus over higher ids under contention,
1024 * regardless of entry order into the function.
1025 */
/*
 * Unfair variant, see comment above: first try to take busiest->lock
 * without dropping this_rq->lock. Only when the trylock fails and
 * @busiest orders first (lower address) do we drop this_rq->lock and
 * retake both in address order.
 *
 * Returns 1 iff this_rq->lock was released (and reacquired) here,
 * 0 if it was held throughout.
 */
1026static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1027 __releases(this_rq->lock)
1028 __acquires(busiest->lock)
1029 __acquires(this_rq->lock)
1030{
1031 int ret = 0;
1032
1033 if (unlikely(!raw_spin_trylock(&busiest->lock))) {
1034 if (busiest < this_rq) {
1035 raw_spin_unlock(&this_rq->lock);
1036 raw_spin_lock(&busiest->lock);
1037 raw_spin_lock_nested(&this_rq->lock,
1038 SINGLE_DEPTH_NESTING);
1039 ret = 1;
1040 } else
1041 raw_spin_lock_nested(&busiest->lock,
1042 SINGLE_DEPTH_NESTING);
1043 }
1044 return ret;
1045}
1046
1047#endif /* CONFIG_PREEMPT */
1048
1049/*
1050 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1051 */
/*
 * Must be called with interrupts disabled (BUG otherwise — the lock is
 * dropped first so the BUG report can make it out). Returns 1 if
 * this_rq->lock was dropped and reacquired while taking busiest->lock,
 * 0 if it stayed held the whole time.
 */
1052static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1053{
1054 if (unlikely(!irqs_disabled())) {
1055 /* printk() doesn't work good under rq->lock */
1056 raw_spin_unlock(&this_rq->lock);
1057 BUG_ON(1);
1058 }
1059
1060 return _double_lock_balance(this_rq, busiest);
1061}
1062
/* Undo double_lock_balance(): release @busiest and reset this_rq->lock's
 * lockdep subclass back to 0 (it may have been taken nested above). */
1063static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
1064 __releases(busiest->lock)
1065{
1066 raw_spin_unlock(&busiest->lock);
1067 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
1068}
1069
1070/*
1071 * double_rq_lock - safely lock two runqueues
1072 *
1073 * Note this does not disable interrupts like task_rq_lock,
1074 * you need to do so manually before calling.
1075 */
1076static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
1077 __acquires(rq1->lock)
1078 __acquires(rq2->lock)
1079{
1080 BUG_ON(!irqs_disabled());
1081 if (rq1 == rq2) {
1082 raw_spin_lock(&rq1->lock);
1083 __acquire(rq2->lock); /* Fake it out ;) */
1084 } else {
 /* always take the lower-addressed lock first to avoid ABBA deadlock */
1085 if (rq1 < rq2) {
1086 raw_spin_lock(&rq1->lock);
1087 raw_spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
1088 } else {
1089 raw_spin_lock(&rq2->lock);
1090 raw_spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
1091 }
1092 }
1093}
1094
1095/*
1096 * double_rq_unlock - safely unlock two runqueues
1097 *
1098 * Note this does not restore interrupts like task_rq_unlock,
1099 * you need to do so manually after calling.
1100 */
1101static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1102 __releases(rq1->lock)
1103 __releases(rq2->lock)
1104{
1105 raw_spin_unlock(&rq1->lock);
1106 if (rq1 != rq2)
1107 raw_spin_unlock(&rq2->lock);
1108 else
 /* only one real lock was taken; balance the fake __acquire() */
1109 __release(rq2->lock);
1110}
1111
1112#else /* CONFIG_SMP */
1113
1114/*
1115 * double_rq_lock - safely lock two runqueues
1116 *
1117 * Note this does not disable interrupts like task_rq_lock,
1118 * you need to do so manually before calling.
1119 */
1120static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
1121 __acquires(rq1->lock)
1122 __acquires(rq2->lock)
1123{
1124 BUG_ON(!irqs_disabled());
 /* on !SMP there is only one runqueue, so both arguments must be it */
1125 BUG_ON(rq1 != rq2);
1126 raw_spin_lock(&rq1->lock);
1127 __acquire(rq2->lock); /* Fake it out ;) */
1128}
1129
1130/*
1131 * double_rq_unlock - safely unlock two runqueues
1132 *
1133 * Note this does not restore interrupts like task_rq_unlock,
1134 * you need to do so manually after calling.
1135 */
1136static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1137 __releases(rq1->lock)
1138 __releases(rq2->lock)
1139{
1140 BUG_ON(rq1 != rq2);
1141 raw_spin_unlock(&rq1->lock);
 /* balance the fake __acquire(rq2->lock) from double_rq_lock() */
1142 __release(rq2->lock);
1143}
1144
1145#endif
1146
1147extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
1148extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
1149extern void print_cfs_stats(struct seq_file *m, int cpu);
1150extern void print_rt_stats(struct seq_file *m, int cpu);
1151
1152extern void init_cfs_rq(struct cfs_rq *cfs_rq);
1153extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
1154extern void unthrottle_offline_cfs_rqs(struct rq *rq);
1155
1156extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
1157
1158#ifdef CONFIG_NO_HZ
1159enum rq_nohz_flag_bits {
1160 NOHZ_TICK_STOPPED,
1161 NOHZ_BALANCE_KICK,
1162 NOHZ_IDLE,
1163};
1164
1165#define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags)
1166#endif
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
new file mode 100644
index 000000000000..2a581ba8e190
--- /dev/null
+++ b/kernel/sched/stats.c
@@ -0,0 +1,111 @@
1
2#include <linux/slab.h>
3#include <linux/fs.h>
4#include <linux/seq_file.h>
5#include <linux/proc_fs.h>
6
7#include "sched.h"
8
/*
 * bump this up when changing the output format or the meaning of an existing
 * format, so that tools can adapt (or abort)
 */
#define SCHEDSTAT_VERSION 15

/*
 * seq_file show callback for /proc/schedstat: prints a version/timestamp
 * header, then one "cpu%d ..." line of runqueue counters per online CPU
 * and, on SMP, one "domain%d ..." line per sched domain containing the
 * per-idle-type load-balance statistics.
 */
static int show_schedstat(struct seq_file *seq, void *v)
{
	int cpu;
	/* room for one "%08x," chunk per 32 bits of a NR_CPUS-wide mask */
	int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
	char *mask_str = kmalloc(mask_len, GFP_KERNEL);

	if (mask_str == NULL)
		return -ENOMEM;

	seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
	seq_printf(seq, "timestamp %lu\n", jiffies);
	for_each_online_cpu(cpu) {
		struct rq *rq = cpu_rq(cpu);
#ifdef CONFIG_SMP
		struct sched_domain *sd;
		int dcount = 0;
#endif

		/* runqueue-specific stats */
		seq_printf(seq,
		    "cpu%d %u %u %u %u %u %u %llu %llu %lu",
		    cpu, rq->yld_count,
		    rq->sched_switch, rq->sched_count, rq->sched_goidle,
		    rq->ttwu_count, rq->ttwu_local,
		    rq->rq_cpu_time,
		    rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);

		seq_printf(seq, "\n");

#ifdef CONFIG_SMP
		/* domain-specific stats; domain tree is RCU-protected */
		rcu_read_lock();
		for_each_domain(cpu, sd) {
			enum cpu_idle_type itype;

			cpumask_scnprintf(mask_str, mask_len,
					  sched_domain_span(sd));
			seq_printf(seq, "domain%d %s", dcount++, mask_str);
			for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
					itype++) {
				seq_printf(seq, " %u %u %u %u %u %u %u %u",
				    sd->lb_count[itype],
				    sd->lb_balanced[itype],
				    sd->lb_failed[itype],
				    sd->lb_imbalance[itype],
				    sd->lb_gained[itype],
				    sd->lb_hot_gained[itype],
				    sd->lb_nobusyq[itype],
				    sd->lb_nobusyg[itype]);
			}
			seq_printf(seq,
			    " %u %u %u %u %u %u %u %u %u %u %u %u\n",
			    sd->alb_count, sd->alb_failed, sd->alb_pushed,
			    sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
			    sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
			    sd->ttwu_wake_remote, sd->ttwu_move_affine,
			    sd->ttwu_move_balance);
		}
		rcu_read_unlock();
#endif
	}
	kfree(mask_str);
	return 0;
}
79
80static int schedstat_open(struct inode *inode, struct file *file)
81{
82 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
83 char *buf = kmalloc(size, GFP_KERNEL);
84 struct seq_file *m;
85 int res;
86
87 if (!buf)
88 return -ENOMEM;
89 res = single_open(file, show_schedstat, NULL);
90 if (!res) {
91 m = file->private_data;
92 m->buf = buf;
93 m->size = size;
94 } else
95 kfree(buf);
96 return res;
97}
98
/* File operations backing /proc/schedstat; reads go through seq_file. */
static const struct file_operations proc_schedstat_operations = {
	.open = schedstat_open,
	.read = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

/* Register /proc/schedstat at boot; mode 0 lets procfs pick its default. */
static int __init proc_schedstat_init(void)
{
	proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
	return 0;
}
module_init(proc_schedstat_init);
diff --git a/kernel/sched_stats.h b/kernel/sched/stats.h
index 87f9e36ea56e..2ef90a51ec5e 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched/stats.h
@@ -1,108 +1,5 @@
1 1
2#ifdef CONFIG_SCHEDSTATS 2#ifdef CONFIG_SCHEDSTATS
3/*
4 * bump this up when changing the output format or the meaning of an existing
5 * format, so that tools can adapt (or abort)
6 */
7#define SCHEDSTAT_VERSION 15
8
9static int show_schedstat(struct seq_file *seq, void *v)
10{
11 int cpu;
12 int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
13 char *mask_str = kmalloc(mask_len, GFP_KERNEL);
14
15 if (mask_str == NULL)
16 return -ENOMEM;
17
18 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
19 seq_printf(seq, "timestamp %lu\n", jiffies);
20 for_each_online_cpu(cpu) {
21 struct rq *rq = cpu_rq(cpu);
22#ifdef CONFIG_SMP
23 struct sched_domain *sd;
24 int dcount = 0;
25#endif
26
27 /* runqueue-specific stats */
28 seq_printf(seq,
29 "cpu%d %u %u %u %u %u %u %llu %llu %lu",
30 cpu, rq->yld_count,
31 rq->sched_switch, rq->sched_count, rq->sched_goidle,
32 rq->ttwu_count, rq->ttwu_local,
33 rq->rq_cpu_time,
34 rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
35
36 seq_printf(seq, "\n");
37
38#ifdef CONFIG_SMP
39 /* domain-specific stats */
40 rcu_read_lock();
41 for_each_domain(cpu, sd) {
42 enum cpu_idle_type itype;
43
44 cpumask_scnprintf(mask_str, mask_len,
45 sched_domain_span(sd));
46 seq_printf(seq, "domain%d %s", dcount++, mask_str);
47 for (itype = CPU_IDLE; itype < CPU_MAX_IDLE_TYPES;
48 itype++) {
49 seq_printf(seq, " %u %u %u %u %u %u %u %u",
50 sd->lb_count[itype],
51 sd->lb_balanced[itype],
52 sd->lb_failed[itype],
53 sd->lb_imbalance[itype],
54 sd->lb_gained[itype],
55 sd->lb_hot_gained[itype],
56 sd->lb_nobusyq[itype],
57 sd->lb_nobusyg[itype]);
58 }
59 seq_printf(seq,
60 " %u %u %u %u %u %u %u %u %u %u %u %u\n",
61 sd->alb_count, sd->alb_failed, sd->alb_pushed,
62 sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
63 sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
64 sd->ttwu_wake_remote, sd->ttwu_move_affine,
65 sd->ttwu_move_balance);
66 }
67 rcu_read_unlock();
68#endif
69 }
70 kfree(mask_str);
71 return 0;
72}
73
74static int schedstat_open(struct inode *inode, struct file *file)
75{
76 unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
77 char *buf = kmalloc(size, GFP_KERNEL);
78 struct seq_file *m;
79 int res;
80
81 if (!buf)
82 return -ENOMEM;
83 res = single_open(file, show_schedstat, NULL);
84 if (!res) {
85 m = file->private_data;
86 m->buf = buf;
87 m->size = size;
88 } else
89 kfree(buf);
90 return res;
91}
92
93static const struct file_operations proc_schedstat_operations = {
94 .open = schedstat_open,
95 .read = seq_read,
96 .llseek = seq_lseek,
97 .release = single_release,
98};
99
100static int __init proc_schedstat_init(void)
101{
102 proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
103 return 0;
104}
105module_init(proc_schedstat_init);
106 3
107/* 4/*
108 * Expects runqueue lock to be held for atomicity of update 5 * Expects runqueue lock to be held for atomicity of update
@@ -283,8 +180,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
283 return; 180 return;
284 181
285 raw_spin_lock(&cputimer->lock); 182 raw_spin_lock(&cputimer->lock);
286 cputimer->cputime.utime = 183 cputimer->cputime.utime += cputime;
287 cputime_add(cputimer->cputime.utime, cputime);
288 raw_spin_unlock(&cputimer->lock); 184 raw_spin_unlock(&cputimer->lock);
289} 185}
290 186
@@ -307,8 +203,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
307 return; 203 return;
308 204
309 raw_spin_lock(&cputimer->lock); 205 raw_spin_lock(&cputimer->lock);
310 cputimer->cputime.stime = 206 cputimer->cputime.stime += cputime;
311 cputime_add(cputimer->cputime.stime, cputime);
312 raw_spin_unlock(&cputimer->lock); 207 raw_spin_unlock(&cputimer->lock);
313} 208}
314 209
diff --git a/kernel/sched_stoptask.c b/kernel/sched/stop_task.c
index 8b44e7fa7fb3..7b386e86fd23 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched/stop_task.c
@@ -1,3 +1,5 @@
1#include "sched.h"
2
1/* 3/*
2 * stop-task scheduling class. 4 * stop-task scheduling class.
3 * 5 *
@@ -80,7 +82,7 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task)
80/* 82/*
81 * Simple, special scheduling class for the per-CPU stop tasks: 83 * Simple, special scheduling class for the per-CPU stop tasks:
82 */ 84 */
83static const struct sched_class stop_sched_class = { 85const struct sched_class stop_sched_class = {
84 .next = &rt_sched_class, 86 .next = &rt_sched_class,
85 87
86 .enqueue_task = enqueue_task_stop, 88 .enqueue_task = enqueue_task_stop,
diff --git a/kernel/signal.c b/kernel/signal.c
index 206551563cce..56ce3a618b28 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1629,10 +1629,8 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
1629 info.si_uid = __task_cred(tsk)->uid; 1629 info.si_uid = __task_cred(tsk)->uid;
1630 rcu_read_unlock(); 1630 rcu_read_unlock();
1631 1631
1632 info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime, 1632 info.si_utime = cputime_to_clock_t(tsk->utime + tsk->signal->utime);
1633 tsk->signal->utime)); 1633 info.si_stime = cputime_to_clock_t(tsk->stime + tsk->signal->stime);
1634 info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
1635 tsk->signal->stime));
1636 1634
1637 info.si_status = tsk->exit_code & 0x7f; 1635 info.si_status = tsk->exit_code & 0x7f;
1638 if (tsk->exit_code & 0x80) 1636 if (tsk->exit_code & 0x80)
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 2c71d91efff0..4eb3a0fa351e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -347,12 +347,12 @@ void irq_exit(void)
347 if (!in_interrupt() && local_softirq_pending()) 347 if (!in_interrupt() && local_softirq_pending())
348 invoke_softirq(); 348 invoke_softirq();
349 349
350 rcu_irq_exit();
351#ifdef CONFIG_NO_HZ 350#ifdef CONFIG_NO_HZ
352 /* Make sure that timer wheel updates are propagated */ 351 /* Make sure that timer wheel updates are propagated */
353 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched()) 352 if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
354 tick_nohz_stop_sched_tick(0); 353 tick_nohz_irq_exit();
355#endif 354#endif
355 rcu_irq_exit();
356 preempt_enable_no_resched(); 356 preempt_enable_no_resched();
357} 357}
358 358
diff --git a/kernel/sys.c b/kernel/sys.c
index 481611fbd079..ddf8155bf3f8 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1605,7 +1605,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1605 unsigned long maxrss = 0; 1605 unsigned long maxrss = 0;
1606 1606
1607 memset((char *) r, 0, sizeof *r); 1607 memset((char *) r, 0, sizeof *r);
1608 utime = stime = cputime_zero; 1608 utime = stime = 0;
1609 1609
1610 if (who == RUSAGE_THREAD) { 1610 if (who == RUSAGE_THREAD) {
1611 task_times(current, &utime, &stime); 1611 task_times(current, &utime, &stime);
@@ -1635,8 +1635,8 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1635 1635
1636 case RUSAGE_SELF: 1636 case RUSAGE_SELF:
1637 thread_group_times(p, &tgutime, &tgstime); 1637 thread_group_times(p, &tgutime, &tgstime);
1638 utime = cputime_add(utime, tgutime); 1638 utime += tgutime;
1639 stime = cputime_add(stime, tgstime); 1639 stime += tgstime;
1640 r->ru_nvcsw += p->signal->nvcsw; 1640 r->ru_nvcsw += p->signal->nvcsw;
1641 r->ru_nivcsw += p->signal->nivcsw; 1641 r->ru_nivcsw += p->signal->nivcsw;
1642 r->ru_minflt += p->signal->min_flt; 1642 r->ru_minflt += p->signal->min_flt;
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 40420644d0ba..7656642e4b8e 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -275,42 +275,17 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time)
275} 275}
276EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); 276EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us);
277 277
278/** 278static void tick_nohz_stop_sched_tick(struct tick_sched *ts)
279 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
280 *
281 * When the next event is more than a tick into the future, stop the idle tick
282 * Called either from the idle loop or from irq_exit() when an idle period was
283 * just interrupted by an interrupt which did not cause a reschedule.
284 */
285void tick_nohz_stop_sched_tick(int inidle)
286{ 279{
287 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; 280 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies;
288 struct tick_sched *ts;
289 ktime_t last_update, expires, now; 281 ktime_t last_update, expires, now;
290 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 282 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
291 u64 time_delta; 283 u64 time_delta;
292 int cpu; 284 int cpu;
293 285
294 local_irq_save(flags);
295
296 cpu = smp_processor_id(); 286 cpu = smp_processor_id();
297 ts = &per_cpu(tick_cpu_sched, cpu); 287 ts = &per_cpu(tick_cpu_sched, cpu);
298 288
299 /*
300 * Call to tick_nohz_start_idle stops the last_update_time from being
301 * updated. Thus, it must not be called in the event we are called from
302 * irq_exit() with the prior state different than idle.
303 */
304 if (!inidle && !ts->inidle)
305 goto end;
306
307 /*
308 * Set ts->inidle unconditionally. Even if the system did not
309 * switch to NOHZ mode the cpu frequency governers rely on the
310 * update of the idle time accounting in tick_nohz_start_idle().
311 */
312 ts->inidle = 1;
313
314 now = tick_nohz_start_idle(cpu, ts); 289 now = tick_nohz_start_idle(cpu, ts);
315 290
316 /* 291 /*
@@ -326,10 +301,10 @@ void tick_nohz_stop_sched_tick(int inidle)
326 } 301 }
327 302
328 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 303 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
329 goto end; 304 return;
330 305
331 if (need_resched()) 306 if (need_resched())
332 goto end; 307 return;
333 308
334 if (unlikely(local_softirq_pending() && cpu_online(cpu))) { 309 if (unlikely(local_softirq_pending() && cpu_online(cpu))) {
335 static int ratelimit; 310 static int ratelimit;
@@ -339,7 +314,7 @@ void tick_nohz_stop_sched_tick(int inidle)
339 (unsigned int) local_softirq_pending()); 314 (unsigned int) local_softirq_pending());
340 ratelimit++; 315 ratelimit++;
341 } 316 }
342 goto end; 317 return;
343 } 318 }
344 319
345 ts->idle_calls++; 320 ts->idle_calls++;
@@ -434,7 +409,6 @@ void tick_nohz_stop_sched_tick(int inidle)
434 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer); 409 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
435 ts->tick_stopped = 1; 410 ts->tick_stopped = 1;
436 ts->idle_jiffies = last_jiffies; 411 ts->idle_jiffies = last_jiffies;
437 rcu_enter_nohz();
438 } 412 }
439 413
440 ts->idle_sleeps++; 414 ts->idle_sleeps++;
@@ -472,8 +446,64 @@ out:
472 ts->next_jiffies = next_jiffies; 446 ts->next_jiffies = next_jiffies;
473 ts->last_jiffies = last_jiffies; 447 ts->last_jiffies = last_jiffies;
474 ts->sleep_length = ktime_sub(dev->next_event, now); 448 ts->sleep_length = ktime_sub(dev->next_event, now);
475end: 449}
476 local_irq_restore(flags); 450
/**
 * tick_nohz_idle_enter - stop the idle tick from the idle task
 *
 * When the next event is more than a tick into the future, stop the idle tick
 * Called when we start the idle loop.
 *
 * The arch is responsible for calling:
 *
 * - rcu_idle_enter() after its last use of RCU before the CPU is put
 *  to sleep.
 * - rcu_idle_exit() before the first use of RCU after the CPU is woken up.
 */
void tick_nohz_idle_enter(void)
{
	struct tick_sched *ts;

	WARN_ON_ONCE(irqs_disabled());

	/*
	 * Update the idle state in the scheduler domain hierarchy
	 * when tick_nohz_stop_sched_tick() is called from the idle loop.
	 * State will be updated to busy during the first busy tick after
	 * exiting idle.
	 */
	set_cpu_sd_state_idle();

	local_irq_disable();

	ts = &__get_cpu_var(tick_cpu_sched);
	/*
	 * Set ts->inidle unconditionally. Even if the system did not
	 * switch to NOHZ mode the cpu frequency governors rely on the
	 * update of the idle time accounting in tick_nohz_start_idle().
	 */
	ts->inidle = 1;
	tick_nohz_stop_sched_tick(ts);

	local_irq_enable();
}
490
/**
 * tick_nohz_irq_exit - update next tick event from interrupt exit
 *
 * When an interrupt fires while we are idle and it doesn't cause
 * a reschedule, it may still add, modify or delete a timer, enqueue
 * an RCU callback, etc...
 * So we need to re-calculate and reprogram the next tick event.
 */
void tick_nohz_irq_exit(void)
{
	struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);

	/* Nothing to do unless the interrupted context was the idle loop. */
	if (!ts->inidle)
		return;

	tick_nohz_stop_sched_tick(ts);
}
479/** 509/**
@@ -515,11 +545,13 @@ static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
515} 545}
516 546
517/** 547/**
518 * tick_nohz_restart_sched_tick - restart the idle tick from the idle task 548 * tick_nohz_idle_exit - restart the idle tick from the idle task
519 * 549 *
520 * Restart the idle tick when the CPU is woken up from idle 550 * Restart the idle tick when the CPU is woken up from idle
 551 * This also exits the RCU extended quiescent state. The CPU
552 * can use RCU again after this function is called.
521 */ 553 */
522void tick_nohz_restart_sched_tick(void) 554void tick_nohz_idle_exit(void)
523{ 555{
524 int cpu = smp_processor_id(); 556 int cpu = smp_processor_id();
525 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 557 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
@@ -529,6 +561,7 @@ void tick_nohz_restart_sched_tick(void)
529 ktime_t now; 561 ktime_t now;
530 562
531 local_irq_disable(); 563 local_irq_disable();
564
532 if (ts->idle_active || (ts->inidle && ts->tick_stopped)) 565 if (ts->idle_active || (ts->inidle && ts->tick_stopped))
533 now = ktime_get(); 566 now = ktime_get();
534 567
@@ -543,8 +576,6 @@ void tick_nohz_restart_sched_tick(void)
543 576
544 ts->inidle = 0; 577 ts->inidle = 0;
545 578
546 rcu_exit_nohz();
547
548 /* Update jiffies first */ 579 /* Update jiffies first */
549 select_nohz_load_balancer(0); 580 select_nohz_load_balancer(0);
550 tick_do_update_jiffies64(now); 581 tick_do_update_jiffies64(now);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 237841378c03..0c6358186401 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -131,7 +131,7 @@ static inline s64 timekeeping_get_ns_raw(void)
131 /* calculate the delta since the last update_wall_time: */ 131 /* calculate the delta since the last update_wall_time: */
132 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 132 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
133 133
134 /* return delta convert to nanoseconds using ntp adjusted mult. */ 134 /* return delta convert to nanoseconds. */
135 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift); 135 return clocksource_cyc2ns(cycle_delta, clock->mult, clock->shift);
136} 136}
137 137
@@ -813,11 +813,11 @@ static void timekeeping_adjust(s64 offset)
813 * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs. 813 * First we shift it down from NTP_SHIFT to clocksource->shifted nsecs.
814 * 814 *
815 * Note we subtract one in the shift, so that error is really error*2. 815 * Note we subtract one in the shift, so that error is really error*2.
816 * This "saves" dividing(shifting) intererval twice, but keeps the 816 * This "saves" dividing(shifting) interval twice, but keeps the
817 * (error > interval) comparision as still measuring if error is 817 * (error > interval) comparison as still measuring if error is
818 * larger then half an interval. 818 * larger then half an interval.
819 * 819 *
820 * Note: It does not "save" on aggrivation when reading the code. 820 * Note: It does not "save" on aggravation when reading the code.
821 */ 821 */
822 error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1); 822 error = timekeeper.ntp_error >> (timekeeper.ntp_error_shift - 1);
823 if (error > interval) { 823 if (error > interval) {
@@ -833,7 +833,7 @@ static void timekeeping_adjust(s64 offset)
833 * nanosecond, and store the amount rounded up into 833 * nanosecond, and store the amount rounded up into
834 * the error. This causes the likely below to be unlikely. 834 * the error. This causes the likely below to be unlikely.
835 * 835 *
836 * The properfix is to avoid rounding up by using 836 * The proper fix is to avoid rounding up by using
837 * the high precision timekeeper.xtime_nsec instead of 837 * the high precision timekeeper.xtime_nsec instead of
838 * xtime.tv_nsec everywhere. Fixing this will take some 838 * xtime.tv_nsec everywhere. Fixing this will take some
839 * time. 839 * time.
diff --git a/kernel/timer.c b/kernel/timer.c
index 9c3c62b0c4bc..a297ffcf888e 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -427,6 +427,12 @@ static int timer_fixup_init(void *addr, enum debug_obj_state state)
427 } 427 }
428} 428}
429 429
/* Stub timer callback for improperly used timers. */
static void stub_timer(unsigned long data)
{
	/* Only reachable when debugobjects had to defuse a bogus timer. */
	WARN_ON(1);
}
435
430/* 436/*
431 * fixup_activate is called when: 437 * fixup_activate is called when:
432 * - an active object is activated 438 * - an active object is activated
@@ -450,7 +456,8 @@ static int timer_fixup_activate(void *addr, enum debug_obj_state state)
450 debug_object_activate(timer, &timer_debug_descr); 456 debug_object_activate(timer, &timer_debug_descr);
451 return 0; 457 return 0;
452 } else { 458 } else {
453 WARN_ON_ONCE(1); 459 setup_timer(timer, stub_timer, 0);
460 return 1;
454 } 461 }
455 return 0; 462 return 0;
456 463
@@ -480,12 +487,40 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
480 } 487 }
481} 488}
482 489
490/*
491 * fixup_assert_init is called when:
492 * - an untracked/uninit-ed object is found
493 */
494static int timer_fixup_assert_init(void *addr, enum debug_obj_state state)
495{
496 struct timer_list *timer = addr;
497
498 switch (state) {
499 case ODEBUG_STATE_NOTAVAILABLE:
500 if (timer->entry.prev == TIMER_ENTRY_STATIC) {
501 /*
502 * This is not really a fixup. The timer was
503 * statically initialized. We just make sure that it
504 * is tracked in the object tracker.
505 */
506 debug_object_init(timer, &timer_debug_descr);
507 return 0;
508 } else {
509 setup_timer(timer, stub_timer, 0);
510 return 1;
511 }
512 default:
513 return 0;
514 }
515}
516
483static struct debug_obj_descr timer_debug_descr = { 517static struct debug_obj_descr timer_debug_descr = {
484 .name = "timer_list", 518 .name = "timer_list",
485 .debug_hint = timer_debug_hint, 519 .debug_hint = timer_debug_hint,
486 .fixup_init = timer_fixup_init, 520 .fixup_init = timer_fixup_init,
487 .fixup_activate = timer_fixup_activate, 521 .fixup_activate = timer_fixup_activate,
488 .fixup_free = timer_fixup_free, 522 .fixup_free = timer_fixup_free,
523 .fixup_assert_init = timer_fixup_assert_init,
489}; 524};
490 525
491static inline void debug_timer_init(struct timer_list *timer) 526static inline void debug_timer_init(struct timer_list *timer)
@@ -508,6 +543,11 @@ static inline void debug_timer_free(struct timer_list *timer)
508 debug_object_free(timer, &timer_debug_descr); 543 debug_object_free(timer, &timer_debug_descr);
509} 544}
510 545
/* Forward the "must be initialized" assertion to the debugobjects core. */
static inline void debug_timer_assert_init(struct timer_list *timer)
{
	debug_object_assert_init(timer, &timer_debug_descr);
}
550
511static void __init_timer(struct timer_list *timer, 551static void __init_timer(struct timer_list *timer,
512 const char *name, 552 const char *name,
513 struct lock_class_key *key); 553 struct lock_class_key *key);
@@ -531,6 +571,7 @@ EXPORT_SYMBOL_GPL(destroy_timer_on_stack);
531static inline void debug_timer_init(struct timer_list *timer) { } 571static inline void debug_timer_init(struct timer_list *timer) { }
532static inline void debug_timer_activate(struct timer_list *timer) { } 572static inline void debug_timer_activate(struct timer_list *timer) { }
533static inline void debug_timer_deactivate(struct timer_list *timer) { } 573static inline void debug_timer_deactivate(struct timer_list *timer) { }
574static inline void debug_timer_assert_init(struct timer_list *timer) { }
534#endif 575#endif
535 576
536static inline void debug_init(struct timer_list *timer) 577static inline void debug_init(struct timer_list *timer)
@@ -552,6 +593,11 @@ static inline void debug_deactivate(struct timer_list *timer)
552 trace_timer_cancel(timer); 593 trace_timer_cancel(timer);
553} 594}
554 595
/* Wrapper so callers compile to a no-op when CONFIG_DEBUG_OBJECTS_TIMERS=n. */
static inline void debug_assert_init(struct timer_list *timer)
{
	debug_timer_assert_init(timer);
}
600
555static void __init_timer(struct timer_list *timer, 601static void __init_timer(struct timer_list *timer,
556 const char *name, 602 const char *name,
557 struct lock_class_key *key) 603 struct lock_class_key *key)
@@ -902,6 +948,8 @@ int del_timer(struct timer_list *timer)
902 unsigned long flags; 948 unsigned long flags;
903 int ret = 0; 949 int ret = 0;
904 950
951 debug_assert_init(timer);
952
905 timer_stats_timer_clear_start_info(timer); 953 timer_stats_timer_clear_start_info(timer);
906 if (timer_pending(timer)) { 954 if (timer_pending(timer)) {
907 base = lock_timer_base(timer, &flags); 955 base = lock_timer_base(timer, &flags);
@@ -932,6 +980,8 @@ int try_to_del_timer_sync(struct timer_list *timer)
932 unsigned long flags; 980 unsigned long flags;
933 int ret = -1; 981 int ret = -1;
934 982
983 debug_assert_init(timer);
984
935 base = lock_timer_base(timer, &flags); 985 base = lock_timer_base(timer, &flags);
936 986
937 if (base->running_timer == timer) 987 if (base->running_timer == timer)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f2bd275bb60f..91dc4bc8bf72 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -338,7 +338,8 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
338/* trace_flags holds trace_options default values */ 338/* trace_flags holds trace_options default values */
339unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | 339unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
340 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | 340 TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
341 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; 341 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |
342 TRACE_ITER_IRQ_INFO;
342 343
343static int trace_stop_count; 344static int trace_stop_count;
344static DEFINE_RAW_SPINLOCK(tracing_start_lock); 345static DEFINE_RAW_SPINLOCK(tracing_start_lock);
@@ -426,6 +427,7 @@ static const char *trace_options[] = {
426 "record-cmd", 427 "record-cmd",
427 "overwrite", 428 "overwrite",
428 "disable_on_free", 429 "disable_on_free",
430 "irq-info",
429 NULL 431 NULL
430}; 432};
431 433
@@ -1843,6 +1845,33 @@ static void s_stop(struct seq_file *m, void *p)
1843 trace_event_read_unlock(); 1845 trace_event_read_unlock();
1844} 1846}
1845 1847
1848static void
1849get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries)
1850{
1851 unsigned long count;
1852 int cpu;
1853
1854 *total = 0;
1855 *entries = 0;
1856
1857 for_each_tracing_cpu(cpu) {
1858 count = ring_buffer_entries_cpu(tr->buffer, cpu);
1859 /*
1860 * If this buffer has skipped entries, then we hold all
1861 * entries for the trace and we need to ignore the
1862 * ones before the time stamp.
1863 */
1864 if (tr->data[cpu]->skipped_entries) {
1865 count -= tr->data[cpu]->skipped_entries;
1866 /* total is the same as the entries */
1867 *total += count;
1868 } else
1869 *total += count +
1870 ring_buffer_overrun_cpu(tr->buffer, cpu);
1871 *entries += count;
1872 }
1873}
1874
1846static void print_lat_help_header(struct seq_file *m) 1875static void print_lat_help_header(struct seq_file *m)
1847{ 1876{
1848 seq_puts(m, "# _------=> CPU# \n"); 1877 seq_puts(m, "# _------=> CPU# \n");
@@ -1855,12 +1884,35 @@ static void print_lat_help_header(struct seq_file *m)
1855 seq_puts(m, "# \\ / ||||| \\ | / \n"); 1884 seq_puts(m, "# \\ / ||||| \\ | / \n");
1856} 1885}
1857 1886
1858static void print_func_help_header(struct seq_file *m) 1887static void print_event_info(struct trace_array *tr, struct seq_file *m)
1888{
1889 unsigned long total;
1890 unsigned long entries;
1891
1892 get_total_entries(tr, &total, &entries);
1893 seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n",
1894 entries, total, num_online_cpus());
1895 seq_puts(m, "#\n");
1896}
1897
1898static void print_func_help_header(struct trace_array *tr, struct seq_file *m)
1859{ 1899{
1860 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); 1900 print_event_info(tr, m);
1901 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n");
1861 seq_puts(m, "# | | | | |\n"); 1902 seq_puts(m, "# | | | | |\n");
1862} 1903}
1863 1904
1905static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m)
1906{
1907 print_event_info(tr, m);
1908 seq_puts(m, "# _-----=> irqs-off\n");
1909 seq_puts(m, "# / _----=> need-resched\n");
1910 seq_puts(m, "# | / _---=> hardirq/softirq\n");
1911 seq_puts(m, "# || / _--=> preempt-depth\n");
1912 seq_puts(m, "# ||| / delay\n");
1913 seq_puts(m, "# TASK-PID CPU# |||| TIMESTAMP FUNCTION\n");
1914 seq_puts(m, "# | | | |||| | |\n");
1915}
1864 1916
1865void 1917void
1866print_trace_header(struct seq_file *m, struct trace_iterator *iter) 1918print_trace_header(struct seq_file *m, struct trace_iterator *iter)
@@ -1869,32 +1921,14 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1869 struct trace_array *tr = iter->tr; 1921 struct trace_array *tr = iter->tr;
1870 struct trace_array_cpu *data = tr->data[tr->cpu]; 1922 struct trace_array_cpu *data = tr->data[tr->cpu];
1871 struct tracer *type = current_trace; 1923 struct tracer *type = current_trace;
1872 unsigned long entries = 0; 1924 unsigned long entries;
1873 unsigned long total = 0; 1925 unsigned long total;
1874 unsigned long count;
1875 const char *name = "preemption"; 1926 const char *name = "preemption";
1876 int cpu;
1877 1927
1878 if (type) 1928 if (type)
1879 name = type->name; 1929 name = type->name;
1880 1930
1881 1931 get_total_entries(tr, &total, &entries);
1882 for_each_tracing_cpu(cpu) {
1883 count = ring_buffer_entries_cpu(tr->buffer, cpu);
1884 /*
1885 * If this buffer has skipped entries, then we hold all
1886 * entries for the trace and we need to ignore the
1887 * ones before the time stamp.
1888 */
1889 if (tr->data[cpu]->skipped_entries) {
1890 count -= tr->data[cpu]->skipped_entries;
1891 /* total is the same as the entries */
1892 total += count;
1893 } else
1894 total += count +
1895 ring_buffer_overrun_cpu(tr->buffer, cpu);
1896 entries += count;
1897 }
1898 1932
1899 seq_printf(m, "# %s latency trace v1.1.5 on %s\n", 1933 seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
1900 name, UTS_RELEASE); 1934 name, UTS_RELEASE);
@@ -2140,6 +2174,21 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
2140 return print_trace_fmt(iter); 2174 return print_trace_fmt(iter);
2141} 2175}
2142 2176
/* Write the latency-format trace header for seq_file @m, if non-empty. */
void trace_latency_header(struct seq_file *m)
{
	struct trace_iterator *iter = m->private;

	/* print nothing if the buffers are empty */
	if (trace_empty(iter))
		return;

	if (iter->iter_flags & TRACE_FILE_LAT_FMT)
		print_trace_header(m, iter);

	if (!(trace_flags & TRACE_ITER_VERBOSE))
		print_lat_help_header(m);
}
2191
2143void trace_default_header(struct seq_file *m) 2192void trace_default_header(struct seq_file *m)
2144{ 2193{
2145 struct trace_iterator *iter = m->private; 2194 struct trace_iterator *iter = m->private;
@@ -2155,8 +2204,12 @@ void trace_default_header(struct seq_file *m)
2155 if (!(trace_flags & TRACE_ITER_VERBOSE)) 2204 if (!(trace_flags & TRACE_ITER_VERBOSE))
2156 print_lat_help_header(m); 2205 print_lat_help_header(m);
2157 } else { 2206 } else {
2158 if (!(trace_flags & TRACE_ITER_VERBOSE)) 2207 if (!(trace_flags & TRACE_ITER_VERBOSE)) {
2159 print_func_help_header(m); 2208 if (trace_flags & TRACE_ITER_IRQ_INFO)
2209 print_func_help_header_irq(iter->tr, m);
2210 else
2211 print_func_help_header(iter->tr, m);
2212 }
2160 } 2213 }
2161} 2214}
2162 2215
@@ -4775,6 +4828,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
4775{ 4828{
4776 __ftrace_dump(true, oops_dump_mode); 4829 __ftrace_dump(true, oops_dump_mode);
4777} 4830}
4831EXPORT_SYMBOL_GPL(ftrace_dump);
4778 4832
4779__init static int tracer_alloc_buffers(void) 4833__init static int tracer_alloc_buffers(void)
4780{ 4834{
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 092e1f8d18dc..2c2657462ac3 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -370,6 +370,7 @@ void trace_graph_function(struct trace_array *tr,
370 unsigned long ip, 370 unsigned long ip,
371 unsigned long parent_ip, 371 unsigned long parent_ip,
372 unsigned long flags, int pc); 372 unsigned long flags, int pc);
373void trace_latency_header(struct seq_file *m);
373void trace_default_header(struct seq_file *m); 374void trace_default_header(struct seq_file *m);
374void print_trace_header(struct seq_file *m, struct trace_iterator *iter); 375void print_trace_header(struct seq_file *m, struct trace_iterator *iter);
375int trace_empty(struct trace_iterator *iter); 376int trace_empty(struct trace_iterator *iter);
@@ -654,6 +655,7 @@ enum trace_iterator_flags {
654 TRACE_ITER_RECORD_CMD = 0x100000, 655 TRACE_ITER_RECORD_CMD = 0x100000,
655 TRACE_ITER_OVERWRITE = 0x200000, 656 TRACE_ITER_OVERWRITE = 0x200000,
656 TRACE_ITER_STOP_ON_FREE = 0x400000, 657 TRACE_ITER_STOP_ON_FREE = 0x400000,
658 TRACE_ITER_IRQ_INFO = 0x800000,
657}; 659};
658 660
659/* 661/*
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 95dc31efd6dd..f04cc3136bd3 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -27,6 +27,12 @@
27#include "trace.h" 27#include "trace.h"
28#include "trace_output.h" 28#include "trace_output.h"
29 29
30#define DEFAULT_SYS_FILTER_MESSAGE \
31 "### global filter ###\n" \
32 "# Use this to set filters for multiple events.\n" \
33 "# Only events with the given fields will be affected.\n" \
34 "# If no events are modified, an error message will be displayed here"
35
30enum filter_op_ids 36enum filter_op_ids
31{ 37{
32 OP_OR, 38 OP_OR,
@@ -646,7 +652,7 @@ void print_subsystem_event_filter(struct event_subsystem *system,
646 if (filter && filter->filter_string) 652 if (filter && filter->filter_string)
647 trace_seq_printf(s, "%s\n", filter->filter_string); 653 trace_seq_printf(s, "%s\n", filter->filter_string);
648 else 654 else
649 trace_seq_printf(s, "none\n"); 655 trace_seq_printf(s, DEFAULT_SYS_FILTER_MESSAGE "\n");
650 mutex_unlock(&event_mutex); 656 mutex_unlock(&event_mutex);
651} 657}
652 658
@@ -1838,7 +1844,10 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1838 if (!filter) 1844 if (!filter)
1839 goto out; 1845 goto out;
1840 1846
1841 replace_filter_string(filter, filter_string); 1847 /* System filters just show a default message */
1848 kfree(filter->filter_string);
1849 filter->filter_string = NULL;
1850
1842 /* 1851 /*
1843 * No event actually uses the system filter 1852 * No event actually uses the system filter
1844 * we can free it without synchronize_sched(). 1853 * we can free it without synchronize_sched().
@@ -1848,14 +1857,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
1848 1857
1849 parse_init(ps, filter_ops, filter_string); 1858 parse_init(ps, filter_ops, filter_string);
1850 err = filter_parse(ps); 1859 err = filter_parse(ps);
1851 if (err) { 1860 if (err)
1852 append_filter_err(ps, system->filter); 1861 goto err_filter;
1853 goto out;
1854 }
1855 1862
1856 err = replace_system_preds(system, ps, filter_string); 1863 err = replace_system_preds(system, ps, filter_string);
1857 if (err) 1864 if (err)
1858 append_filter_err(ps, system->filter); 1865 goto err_filter;
1859 1866
1860out: 1867out:
1861 filter_opstack_clear(ps); 1868 filter_opstack_clear(ps);
@@ -1865,6 +1872,11 @@ out_unlock:
1865 mutex_unlock(&event_mutex); 1872 mutex_unlock(&event_mutex);
1866 1873
1867 return err; 1874 return err;
1875
1876err_filter:
1877 replace_filter_string(filter, filter_string);
1878 append_filter_err(ps, system->filter);
1879 goto out;
1868} 1880}
1869 1881
1870#ifdef CONFIG_PERF_EVENTS 1882#ifdef CONFIG_PERF_EVENTS
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 20dad0d7a163..99d20e920368 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -280,9 +280,20 @@ static enum print_line_t irqsoff_print_line(struct trace_iterator *iter)
280} 280}
281 281
282static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { } 282static void irqsoff_graph_return(struct ftrace_graph_ret *trace) { }
283static void irqsoff_print_header(struct seq_file *s) { }
284static void irqsoff_trace_open(struct trace_iterator *iter) { } 283static void irqsoff_trace_open(struct trace_iterator *iter) { }
285static void irqsoff_trace_close(struct trace_iterator *iter) { } 284static void irqsoff_trace_close(struct trace_iterator *iter) { }
285
286#ifdef CONFIG_FUNCTION_TRACER
287static void irqsoff_print_header(struct seq_file *s)
288{
289 trace_default_header(s);
290}
291#else
292static void irqsoff_print_header(struct seq_file *s)
293{
294 trace_latency_header(s);
295}
296#endif /* CONFIG_FUNCTION_TRACER */
286#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 297#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
287 298
288/* 299/*
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 51999309a6cf..0d6ff3555942 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -627,11 +627,23 @@ int trace_print_context(struct trace_iterator *iter)
627 unsigned long usec_rem = do_div(t, USEC_PER_SEC); 627 unsigned long usec_rem = do_div(t, USEC_PER_SEC);
628 unsigned long secs = (unsigned long)t; 628 unsigned long secs = (unsigned long)t;
629 char comm[TASK_COMM_LEN]; 629 char comm[TASK_COMM_LEN];
630 int ret;
630 631
631 trace_find_cmdline(entry->pid, comm); 632 trace_find_cmdline(entry->pid, comm);
632 633
633 return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ", 634 ret = trace_seq_printf(s, "%16s-%-5d [%03d] ",
634 comm, entry->pid, iter->cpu, secs, usec_rem); 635 comm, entry->pid, iter->cpu);
636 if (!ret)
637 return 0;
638
639 if (trace_flags & TRACE_ITER_IRQ_INFO) {
640 ret = trace_print_lat_fmt(s, entry);
641 if (!ret)
642 return 0;
643 }
644
645 return trace_seq_printf(s, " %5lu.%06lu: ",
646 secs, usec_rem);
635} 647}
636 648
637int trace_print_lat_context(struct trace_iterator *iter) 649int trace_print_lat_context(struct trace_iterator *iter)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index e4a70c0c71b6..ff791ea48b57 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -280,9 +280,20 @@ static enum print_line_t wakeup_print_line(struct trace_iterator *iter)
280} 280}
281 281
282static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } 282static void wakeup_graph_return(struct ftrace_graph_ret *trace) { }
283static void wakeup_print_header(struct seq_file *s) { }
284static void wakeup_trace_open(struct trace_iterator *iter) { } 283static void wakeup_trace_open(struct trace_iterator *iter) { }
285static void wakeup_trace_close(struct trace_iterator *iter) { } 284static void wakeup_trace_close(struct trace_iterator *iter) { }
285
286#ifdef CONFIG_FUNCTION_TRACER
287static void wakeup_print_header(struct seq_file *s)
288{
289 trace_default_header(s);
290}
291#else
292static void wakeup_print_header(struct seq_file *s)
293{
294 trace_latency_header(s);
295}
296#endif /* CONFIG_FUNCTION_TRACER */
286#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 297#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
287 298
288/* 299/*
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 5bbfac85866e..23b4d784ebdd 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -127,7 +127,7 @@ void acct_update_integrals(struct task_struct *tsk)
127 127
128 local_irq_save(flags); 128 local_irq_save(flags);
129 time = tsk->stime + tsk->utime; 129 time = tsk->stime + tsk->utime;
130 dtime = cputime_sub(time, tsk->acct_timexpd); 130 dtime = time - tsk->acct_timexpd;
131 jiffies_to_timeval(cputime_to_jiffies(dtime), &value); 131 jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
132 delta = value.tv_sec; 132 delta = value.tv_sec;
133 delta = delta * USEC_PER_SEC + value.tv_usec; 133 delta = delta * USEC_PER_SEC + value.tv_usec;
diff --git a/kernel/wait.c b/kernel/wait.c
index 26fa7797f90f..7fdd9eaca2c3 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -10,10 +10,10 @@
10#include <linux/wait.h> 10#include <linux/wait.h>
11#include <linux/hash.h> 11#include <linux/hash.h>
12 12
13void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key) 13void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key)
14{ 14{
15 spin_lock_init(&q->lock); 15 spin_lock_init(&q->lock);
16 lockdep_set_class(&q->lock, key); 16 lockdep_set_class_and_name(&q->lock, key, name);
17 INIT_LIST_HEAD(&q->task_list); 17 INIT_LIST_HEAD(&q->task_list);
18} 18}
19 19