Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 6
-rw-r--r--  kernel/async.c | 6
-rw-r--r--  kernel/audit.c | 2
-rw-r--r--  kernel/auditsc.c | 2
-rw-r--r--  kernel/capability.c | 2
-rw-r--r--  kernel/cgroup.c | 57
-rw-r--r--  kernel/cgroup_freezer.c | 13
-rw-r--r--  kernel/compat.c | 1
-rw-r--r--  kernel/cpu.c | 76
-rw-r--r--  kernel/cpu_pm.c | 233
-rw-r--r--  kernel/cpuset.c | 11
-rw-r--r--  kernel/crash_dump.c | 13
-rw-r--r--  kernel/cred.c | 26
-rw-r--r--  kernel/debug/gdbstub.c | 12
-rw-r--r--  kernel/debug/kdb/kdb_debugger.c | 1
-rw-r--r--  kernel/dma.c | 2
-rw-r--r--  kernel/events/core.c | 78
-rw-r--r--  kernel/exit.c | 2
-rw-r--r--  kernel/fork.c | 19
-rw-r--r--  kernel/freezer.c | 2
-rw-r--r--  kernel/futex.c | 12
-rw-r--r--  kernel/groups.c | 2
-rw-r--r--  kernel/hrtimer.c | 2
-rw-r--r--  kernel/hung_task.c | 2
-rw-r--r--  kernel/irq/chip.c | 66
-rw-r--r--  kernel/irq/generic-chip.c | 9
-rw-r--r--  kernel/irq/internals.h | 19
-rw-r--r--  kernel/irq/irqdesc.c | 71
-rw-r--r--  kernel/irq/irqdomain.c | 18
-rw-r--r--  kernel/irq/manage.c | 240
-rw-r--r--  kernel/irq/pm.c | 48
-rw-r--r--  kernel/irq/settings.h | 7
-rw-r--r--  kernel/irq/spurious.c | 6
-rw-r--r--  kernel/irq_work.c | 95
-rw-r--r--  kernel/jump_label.c | 37
-rw-r--r--  kernel/kexec.c | 41
-rw-r--r--  kernel/kfifo.c | 2
-rw-r--r--  kernel/kmod.c | 4
-rw-r--r--  kernel/kprobes.c | 36
-rw-r--r--  kernel/ksysfs.c | 3
-rw-r--r--  kernel/kthread.c | 2
-rw-r--r--  kernel/latencytop.c | 16
-rw-r--r--  kernel/lockdep.c | 248
-rw-r--r--  kernel/lockdep_proc.c | 2
-rw-r--r--  kernel/module.c | 60
-rw-r--r--  kernel/mutex-debug.c | 2
-rw-r--r--  kernel/mutex.c | 2
-rw-r--r--  kernel/notifier.c | 2
-rw-r--r--  kernel/nsproxy.c | 2
-rw-r--r--  kernel/padata.c | 2
-rw-r--r--  kernel/panic.c | 2
-rw-r--r--  kernel/params.c | 23
-rw-r--r--  kernel/pid.c | 6
-rw-r--r--  kernel/posix-cpu-timers.c | 17
-rw-r--r--  kernel/posix-timers.c | 2
-rw-r--r--  kernel/power/Kconfig | 12
-rw-r--r--  kernel/power/Makefile | 4
-rw-r--r--  kernel/power/console.c | 4
-rw-r--r--  kernel/power/hibernate.c | 87
-rw-r--r--  kernel/power/main.c | 106
-rw-r--r--  kernel/power/power.h | 4
-rw-r--r--  kernel/power/process.c | 30
-rw-r--r--  kernel/power/qos.c (renamed from kernel/pm_qos_params.c) | 278
-rw-r--r--  kernel/power/snapshot.c | 18
-rw-r--r--  kernel/power/suspend.c | 19
-rw-r--r--  kernel/power/swap.c | 818
-rw-r--r--  kernel/power/user.c | 1
-rw-r--r--  kernel/printk.c | 64
-rw-r--r--  kernel/profile.c | 2
-rw-r--r--  kernel/ptrace.c | 25
-rw-r--r--  kernel/range.c | 2
-rw-r--r--  kernel/rcu.h | 85
-rw-r--r--  kernel/rcupdate.c | 28
-rw-r--r--  kernel/rcutiny.c | 120
-rw-r--r--  kernel/rcutiny_plugin.h | 135
-rw-r--r--  kernel/rcutorture.c | 77
-rw-r--r--  kernel/rcutree.c | 292
-rw-r--r--  kernel/rcutree.h | 17
-rw-r--r--  kernel/rcutree_plugin.h | 150
-rw-r--r--  kernel/rcutree_trace.c | 13
-rw-r--r--  kernel/relay.c | 2
-rw-r--r--  kernel/resource.c | 9
-rw-r--r--  kernel/rtmutex-debug.c | 79
-rw-r--r--  kernel/rtmutex-tester.c | 2
-rw-r--r--  kernel/rtmutex.c | 10
-rw-r--r--  kernel/rwsem.c | 2
-rw-r--r--  kernel/sched.c | 743
-rw-r--r--  kernel/sched_clock.c | 2
-rw-r--r--  kernel/sched_cpupri.c | 89
-rw-r--r--  kernel/sched_cpupri.h | 7
-rw-r--r--  kernel/sched_fair.c | 916
-rw-r--r--  kernel/sched_features.h | 6
-rw-r--r--  kernel/sched_rt.c | 106
-rw-r--r--  kernel/sched_stats.h | 12
-rw-r--r--  kernel/sched_stoptask.c | 2
-rw-r--r--  kernel/semaphore.c | 30
-rw-r--r--  kernel/signal.c | 26
-rw-r--r--  kernel/smp.c | 2
-rw-r--r--  kernel/softirq.c | 2
-rw-r--r--  kernel/spinlock.c | 2
-rw-r--r--  kernel/srcu.c | 2
-rw-r--r--  kernel/stacktrace.c | 2
-rw-r--r--  kernel/stop_machine.c | 24
-rw-r--r--  kernel/sys.c | 59
-rw-r--r--  kernel/sys_ni.c | 5
-rw-r--r--  kernel/sysctl.c | 27
-rw-r--r--  kernel/sysctl_binary.c | 4
-rw-r--r--  kernel/sysctl_check.c | 2
-rw-r--r--  kernel/taskstats.c | 1
-rw-r--r--  kernel/time.c | 2
-rw-r--r--  kernel/time/posix-clock.c | 1
-rw-r--r--  kernel/time/tick-sched.c | 6
-rw-r--r--  kernel/time/timer_stats.c | 6
-rw-r--r--  kernel/timer.c | 2
-rw-r--r--  kernel/trace/Kconfig | 2
-rw-r--r--  kernel/trace/Makefile | 5
-rw-r--r--  kernel/trace/blktrace.c | 22
-rw-r--r--  kernel/trace/ftrace.c | 9
-rw-r--r--  kernel/trace/ring_buffer.c | 122
-rw-r--r--  kernel/trace/rpm-traces.c | 20
-rw-r--r--  kernel/trace/trace.c | 191
-rw-r--r--  kernel/trace/trace.h | 16
-rw-r--r--  kernel/trace/trace_clock.c | 12
-rw-r--r--  kernel/trace/trace_events_filter.c | 795
-rw-r--r--  kernel/trace/trace_events_filter_test.h | 50
-rw-r--r--  kernel/trace/trace_irqsoff.c | 10
-rw-r--r--  kernel/trace/trace_kprobe.c | 58
-rw-r--r--  kernel/trace/trace_printk.c | 19
-rw-r--r--  kernel/trace/trace_syscalls.c | 1
-rw-r--r--  kernel/tracepoint.c | 169
-rw-r--r--  kernel/tsacct.c | 15
-rw-r--r--  kernel/up.c | 2
-rw-r--r--  kernel/user-return-notifier.c | 2
-rw-r--r--  kernel/user.c | 2
-rw-r--r--  kernel/user_namespace.c | 2
-rw-r--r--  kernel/utsname.c | 2
-rw-r--r--  kernel/utsname_sysctl.c | 25
-rw-r--r--  kernel/wait.c | 2
-rw-r--r--  kernel/watchdog.c | 11
-rw-r--r--  kernel/workqueue.c | 9
140 files changed, 5711 insertions, 2117 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index d06467fc8f7..e898c5b9d02 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,8 +9,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ 12 notifier.o ksysfs.o sched_clock.o cred.o \
13 async.o range.o jump_label.o 13 async.o range.o
14obj-y += groups.o 14obj-y += groups.o
15 15
16ifdef CONFIG_FUNCTION_TRACER 16ifdef CONFIG_FUNCTION_TRACER
@@ -101,12 +101,14 @@ obj-$(CONFIG_RING_BUFFER) += trace/
101obj-$(CONFIG_TRACEPOINTS) += trace/ 101obj-$(CONFIG_TRACEPOINTS) += trace/
102obj-$(CONFIG_SMP) += sched_cpupri.o 102obj-$(CONFIG_SMP) += sched_cpupri.o
103obj-$(CONFIG_IRQ_WORK) += irq_work.o 103obj-$(CONFIG_IRQ_WORK) += irq_work.o
104obj-$(CONFIG_CPU_PM) += cpu_pm.o
104 105
105obj-$(CONFIG_PERF_EVENTS) += events/ 106obj-$(CONFIG_PERF_EVENTS) += events/
106 107
107obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o 108obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
108obj-$(CONFIG_PADATA) += padata.o 109obj-$(CONFIG_PADATA) += padata.o
109obj-$(CONFIG_CRASH_DUMP) += crash_dump.o 110obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
111obj-$(CONFIG_JUMP_LABEL) += jump_label.o
110 112
111ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) 113ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
112# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 114# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/async.c b/kernel/async.c
index d5fe7af0de2..80b74b88fef 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -51,7 +51,7 @@ asynchronous and synchronous parts of the kernel.
51#include <linux/async.h> 51#include <linux/async.h>
52#include <linux/atomic.h> 52#include <linux/atomic.h>
53#include <linux/ktime.h> 53#include <linux/ktime.h>
54#include <linux/module.h> 54#include <linux/export.h>
55#include <linux/wait.h> 55#include <linux/wait.h>
56#include <linux/sched.h> 56#include <linux/sched.h>
57#include <linux/slab.h> 57#include <linux/slab.h>
@@ -120,7 +120,7 @@ static void async_run_entry_fn(struct work_struct *work)
120 struct async_entry *entry = 120 struct async_entry *entry =
121 container_of(work, struct async_entry, work); 121 container_of(work, struct async_entry, work);
122 unsigned long flags; 122 unsigned long flags;
123 ktime_t calltime, delta, rettime; 123 ktime_t uninitialized_var(calltime), delta, rettime;
124 124
125 /* 1) move self to the running queue */ 125 /* 1) move self to the running queue */
126 spin_lock_irqsave(&async_lock, flags); 126 spin_lock_irqsave(&async_lock, flags);
@@ -269,7 +269,7 @@ EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
269void async_synchronize_cookie_domain(async_cookie_t cookie, 269void async_synchronize_cookie_domain(async_cookie_t cookie,
270 struct list_head *running) 270 struct list_head *running)
271{ 271{
272 ktime_t starttime, delta, endtime; 272 ktime_t uninitialized_var(starttime), delta, endtime;
273 273
274 if (initcall_debug && system_state == SYSTEM_BOOTING) { 274 if (initcall_debug && system_state == SYSTEM_BOOTING) {
275 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current)); 275 printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
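A large share of the hunks in this series, like the one above, only swap #include <linux/module.h> for the lighter <linux/export.h>. As a rough, illustrative sketch (hypothetical file and function names, not part of this patch): a source file that merely exports symbols needs nothing more than export.h, while the full module.h is only required when the file itself is modular code (module_init()/module_exit(), MODULE_LICENSE(), and so on).

/* Hypothetical kernel/foo.c -- exports a symbol but is not itself a module,
 * so <linux/export.h> is sufficient. */
#include <linux/export.h>

int foo_do_work(void)
{
	return 0;
}
EXPORT_SYMBOL_GPL(foo_do_work);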
diff --git a/kernel/audit.c b/kernel/audit.c
index 0a1355ca3d7..09fae2677a4 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -45,7 +45,7 @@
45#include <asm/types.h> 45#include <asm/types.h>
46#include <linux/atomic.h> 46#include <linux/atomic.h>
47#include <linux/mm.h> 47#include <linux/mm.h>
48#include <linux/module.h> 48#include <linux/export.h>
49#include <linux/slab.h> 49#include <linux/slab.h>
50#include <linux/err.h> 50#include <linux/err.h>
51#include <linux/kthread.h> 51#include <linux/kthread.h>
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index ce4b054acee..47b7fc1ea89 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -48,7 +48,7 @@
48#include <linux/fs.h> 48#include <linux/fs.h>
49#include <linux/namei.h> 49#include <linux/namei.h>
50#include <linux/mm.h> 50#include <linux/mm.h>
51#include <linux/module.h> 51#include <linux/export.h>
52#include <linux/slab.h> 52#include <linux/slab.h>
53#include <linux/mount.h> 53#include <linux/mount.h>
54#include <linux/socket.h> 54#include <linux/socket.h>
diff --git a/kernel/capability.c b/kernel/capability.c
index 283c529f8b1..b463871a4e6 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -10,7 +10,7 @@
10#include <linux/audit.h> 10#include <linux/audit.h>
11#include <linux/capability.h> 11#include <linux/capability.h>
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/module.h> 13#include <linux/export.h>
14#include <linux/security.h> 14#include <linux/security.h>
15#include <linux/syscalls.h> 15#include <linux/syscalls.h>
16#include <linux/pid_namespace.h> 16#include <linux/pid_namespace.h>
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 1d2b6ceea95..d9d5648f3cd 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -265,7 +265,7 @@ list_for_each_entry(_root, &roots, root_list)
265/* the list of cgroups eligible for automatic release. Protected by 265/* the list of cgroups eligible for automatic release. Protected by
266 * release_list_lock */ 266 * release_list_lock */
267static LIST_HEAD(release_list); 267static LIST_HEAD(release_list);
268static DEFINE_SPINLOCK(release_list_lock); 268static DEFINE_RAW_SPINLOCK(release_list_lock);
269static void cgroup_release_agent(struct work_struct *work); 269static void cgroup_release_agent(struct work_struct *work);
270static DECLARE_WORK(release_agent_work, cgroup_release_agent); 270static DECLARE_WORK(release_agent_work, cgroup_release_agent);
271static void check_for_release(struct cgroup *cgrp); 271static void check_for_release(struct cgroup *cgrp);
@@ -2027,7 +2027,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2027 goto out_free_group_list; 2027 goto out_free_group_list;
2028 2028
2029 /* prevent changes to the threadgroup list while we take a snapshot. */ 2029 /* prevent changes to the threadgroup list while we take a snapshot. */
2030 rcu_read_lock(); 2030 read_lock(&tasklist_lock);
2031 if (!thread_group_leader(leader)) { 2031 if (!thread_group_leader(leader)) {
2032 /* 2032 /*
2033 * a race with de_thread from another thread's exec() may strip 2033 * a race with de_thread from another thread's exec() may strip
@@ -2036,7 +2036,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2036 * throw this task away and try again (from cgroup_procs_write); 2036 * throw this task away and try again (from cgroup_procs_write);
2037 * this is "double-double-toil-and-trouble-check locking". 2037 * this is "double-double-toil-and-trouble-check locking".
2038 */ 2038 */
2039 rcu_read_unlock(); 2039 read_unlock(&tasklist_lock);
2040 retval = -EAGAIN; 2040 retval = -EAGAIN;
2041 goto out_free_group_list; 2041 goto out_free_group_list;
2042 } 2042 }
@@ -2057,7 +2057,7 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2057 } while_each_thread(leader, tsk); 2057 } while_each_thread(leader, tsk);
2058 /* remember the number of threads in the array for later. */ 2058 /* remember the number of threads in the array for later. */
2059 group_size = i; 2059 group_size = i;
2060 rcu_read_unlock(); 2060 read_unlock(&tasklist_lock);
2061 2061
2062 /* 2062 /*
2063 * step 1: check that we can legitimately attach to the cgroup. 2063 * step 1: check that we can legitimately attach to the cgroup.
@@ -2135,14 +2135,17 @@ int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
2135 oldcgrp = task_cgroup_from_root(tsk, root); 2135 oldcgrp = task_cgroup_from_root(tsk, root);
2136 if (cgrp == oldcgrp) 2136 if (cgrp == oldcgrp)
2137 continue; 2137 continue;
2138 /* attach each task to each subsystem */
2139 for_each_subsys(root, ss) {
2140 if (ss->attach_task)
2141 ss->attach_task(cgrp, tsk);
2142 }
2143 /* if the thread is PF_EXITING, it can just get skipped. */ 2138 /* if the thread is PF_EXITING, it can just get skipped. */
2144 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true); 2139 retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
2145 BUG_ON(retval != 0 && retval != -ESRCH); 2140 if (retval == 0) {
2141 /* attach each task to each subsystem */
2142 for_each_subsys(root, ss) {
2143 if (ss->attach_task)
2144 ss->attach_task(cgrp, tsk);
2145 }
2146 } else {
2147 BUG_ON(retval != -ESRCH);
2148 }
2146 } 2149 }
2147 /* nothing is sensitive to fork() after this point. */ 2150 /* nothing is sensitive to fork() after this point. */
2148 2151
@@ -4014,11 +4017,11 @@ again:
4014 finish_wait(&cgroup_rmdir_waitq, &wait); 4017 finish_wait(&cgroup_rmdir_waitq, &wait);
4015 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); 4018 clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
4016 4019
4017 spin_lock(&release_list_lock); 4020 raw_spin_lock(&release_list_lock);
4018 set_bit(CGRP_REMOVED, &cgrp->flags); 4021 set_bit(CGRP_REMOVED, &cgrp->flags);
4019 if (!list_empty(&cgrp->release_list)) 4022 if (!list_empty(&cgrp->release_list))
4020 list_del_init(&cgrp->release_list); 4023 list_del_init(&cgrp->release_list);
4021 spin_unlock(&release_list_lock); 4024 raw_spin_unlock(&release_list_lock);
4022 4025
4023 cgroup_lock_hierarchy(cgrp->root); 4026 cgroup_lock_hierarchy(cgrp->root);
4024 /* delete this cgroup from parent->children */ 4027 /* delete this cgroup from parent->children */
@@ -4671,13 +4674,13 @@ static void check_for_release(struct cgroup *cgrp)
4671 * already queued for a userspace notification, queue 4674 * already queued for a userspace notification, queue
4672 * it now */ 4675 * it now */
4673 int need_schedule_work = 0; 4676 int need_schedule_work = 0;
4674 spin_lock(&release_list_lock); 4677 raw_spin_lock(&release_list_lock);
4675 if (!cgroup_is_removed(cgrp) && 4678 if (!cgroup_is_removed(cgrp) &&
4676 list_empty(&cgrp->release_list)) { 4679 list_empty(&cgrp->release_list)) {
4677 list_add(&cgrp->release_list, &release_list); 4680 list_add(&cgrp->release_list, &release_list);
4678 need_schedule_work = 1; 4681 need_schedule_work = 1;
4679 } 4682 }
4680 spin_unlock(&release_list_lock); 4683 raw_spin_unlock(&release_list_lock);
4681 if (need_schedule_work) 4684 if (need_schedule_work)
4682 schedule_work(&release_agent_work); 4685 schedule_work(&release_agent_work);
4683 } 4686 }
@@ -4729,7 +4732,7 @@ static void cgroup_release_agent(struct work_struct *work)
4729{ 4732{
4730 BUG_ON(work != &release_agent_work); 4733 BUG_ON(work != &release_agent_work);
4731 mutex_lock(&cgroup_mutex); 4734 mutex_lock(&cgroup_mutex);
4732 spin_lock(&release_list_lock); 4735 raw_spin_lock(&release_list_lock);
4733 while (!list_empty(&release_list)) { 4736 while (!list_empty(&release_list)) {
4734 char *argv[3], *envp[3]; 4737 char *argv[3], *envp[3];
4735 int i; 4738 int i;
@@ -4738,7 +4741,7 @@ static void cgroup_release_agent(struct work_struct *work)
4738 struct cgroup, 4741 struct cgroup,
4739 release_list); 4742 release_list);
4740 list_del_init(&cgrp->release_list); 4743 list_del_init(&cgrp->release_list);
4741 spin_unlock(&release_list_lock); 4744 raw_spin_unlock(&release_list_lock);
4742 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); 4745 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
4743 if (!pathbuf) 4746 if (!pathbuf)
4744 goto continue_free; 4747 goto continue_free;
@@ -4768,9 +4771,9 @@ static void cgroup_release_agent(struct work_struct *work)
4768 continue_free: 4771 continue_free:
4769 kfree(pathbuf); 4772 kfree(pathbuf);
4770 kfree(agentbuf); 4773 kfree(agentbuf);
4771 spin_lock(&release_list_lock); 4774 raw_spin_lock(&release_list_lock);
4772 } 4775 }
4773 spin_unlock(&release_list_lock); 4776 raw_spin_unlock(&release_list_lock);
4774 mutex_unlock(&cgroup_mutex); 4777 mutex_unlock(&cgroup_mutex);
4775} 4778}
4776 4779
@@ -4880,9 +4883,9 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
4880 4883
4881 rcu_assign_pointer(id->css, NULL); 4884 rcu_assign_pointer(id->css, NULL);
4882 rcu_assign_pointer(css->id, NULL); 4885 rcu_assign_pointer(css->id, NULL);
4883 spin_lock(&ss->id_lock); 4886 write_lock(&ss->id_lock);
4884 idr_remove(&ss->idr, id->id); 4887 idr_remove(&ss->idr, id->id);
4885 spin_unlock(&ss->id_lock); 4888 write_unlock(&ss->id_lock);
4886 kfree_rcu(id, rcu_head); 4889 kfree_rcu(id, rcu_head);
4887} 4890}
4888EXPORT_SYMBOL_GPL(free_css_id); 4891EXPORT_SYMBOL_GPL(free_css_id);
@@ -4908,10 +4911,10 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
4908 error = -ENOMEM; 4911 error = -ENOMEM;
4909 goto err_out; 4912 goto err_out;
4910 } 4913 }
4911 spin_lock(&ss->id_lock); 4914 write_lock(&ss->id_lock);
4912 /* Don't use 0. allocates an ID of 1-65535 */ 4915 /* Don't use 0. allocates an ID of 1-65535 */
4913 error = idr_get_new_above(&ss->idr, newid, 1, &myid); 4916 error = idr_get_new_above(&ss->idr, newid, 1, &myid);
4914 spin_unlock(&ss->id_lock); 4917 write_unlock(&ss->id_lock);
4915 4918
4916 /* Returns error when there are no free spaces for new ID.*/ 4919 /* Returns error when there are no free spaces for new ID.*/
4917 if (error) { 4920 if (error) {
@@ -4926,9 +4929,9 @@ static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
4926 return newid; 4929 return newid;
4927remove_idr: 4930remove_idr:
4928 error = -ENOSPC; 4931 error = -ENOSPC;
4929 spin_lock(&ss->id_lock); 4932 write_lock(&ss->id_lock);
4930 idr_remove(&ss->idr, myid); 4933 idr_remove(&ss->idr, myid);
4931 spin_unlock(&ss->id_lock); 4934 write_unlock(&ss->id_lock);
4932err_out: 4935err_out:
4933 kfree(newid); 4936 kfree(newid);
4934 return ERR_PTR(error); 4937 return ERR_PTR(error);
@@ -4940,7 +4943,7 @@ static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
4940{ 4943{
4941 struct css_id *newid; 4944 struct css_id *newid;
4942 4945
4943 spin_lock_init(&ss->id_lock); 4946 rwlock_init(&ss->id_lock);
4944 idr_init(&ss->idr); 4947 idr_init(&ss->idr);
4945 4948
4946 newid = get_new_cssid(ss, 0); 4949 newid = get_new_cssid(ss, 0);
@@ -5035,9 +5038,9 @@ css_get_next(struct cgroup_subsys *ss, int id,
5035 * scan next entry from bitmap(tree), tmpid is updated after 5038 * scan next entry from bitmap(tree), tmpid is updated after
5036 * idr_get_next(). 5039 * idr_get_next().
5037 */ 5040 */
5038 spin_lock(&ss->id_lock); 5041 read_lock(&ss->id_lock);
5039 tmp = idr_get_next(&ss->idr, &tmpid); 5042 tmp = idr_get_next(&ss->idr, &tmpid);
5040 spin_unlock(&ss->id_lock); 5043 read_unlock(&ss->id_lock);
5041 5044
5042 if (!tmp) 5045 if (!tmp)
5043 break; 5046 break;
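The ss->id_lock conversion above turns a plain spinlock into an rwlock so that the css_get_next() lookup can take the lock for reading while idr insert/remove paths take it for writing. A minimal sketch of that reader/writer split, using invented names (example_lock, example_idr) rather than the cgroup code itself:

#include <linux/idr.h>
#include <linux/spinlock.h>

static DEFINE_RWLOCK(example_lock);
static DEFINE_IDR(example_idr);

/* Lookup path: readers may run concurrently. */
static void *example_get_next(int *id)
{
	void *entry;

	read_lock(&example_lock);
	entry = idr_get_next(&example_idr, id);
	read_unlock(&example_lock);
	return entry;
}

/* Update path: writers are exclusive. */
static void example_remove(int id)
{
	write_lock(&example_lock);
	idr_remove(&example_idr, id);
	write_unlock(&example_lock);
}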
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index e691818d7e4..213c0351dad 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -14,7 +14,7 @@
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
15 */ 15 */
16 16
17#include <linux/module.h> 17#include <linux/export.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/cgroup.h> 19#include <linux/cgroup.h>
20#include <linux/fs.h> 20#include <linux/fs.h>
@@ -153,6 +153,13 @@ static void freezer_destroy(struct cgroup_subsys *ss,
153 kfree(cgroup_freezer(cgroup)); 153 kfree(cgroup_freezer(cgroup));
154} 154}
155 155
156/* task is frozen or will freeze immediately when next it gets woken */
157static bool is_task_frozen_enough(struct task_struct *task)
158{
159 return frozen(task) ||
160 (task_is_stopped_or_traced(task) && freezing(task));
161}
162
156/* 163/*
157 * The call to cgroup_lock() in the freezer.state write method prevents 164 * The call to cgroup_lock() in the freezer.state write method prevents
158 * a write to that file racing against an attach, and hence the 165 * a write to that file racing against an attach, and hence the
@@ -231,7 +238,7 @@ static void update_if_frozen(struct cgroup *cgroup,
231 cgroup_iter_start(cgroup, &it); 238 cgroup_iter_start(cgroup, &it);
232 while ((task = cgroup_iter_next(cgroup, &it))) { 239 while ((task = cgroup_iter_next(cgroup, &it))) {
233 ntotal++; 240 ntotal++;
234 if (frozen(task)) 241 if (is_task_frozen_enough(task))
235 nfrozen++; 242 nfrozen++;
236 } 243 }
237 244
@@ -284,7 +291,7 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
284 while ((task = cgroup_iter_next(cgroup, &it))) { 291 while ((task = cgroup_iter_next(cgroup, &it))) {
285 if (!freeze_task(task, true)) 292 if (!freeze_task(task, true))
286 continue; 293 continue;
287 if (frozen(task)) 294 if (is_task_frozen_enough(task))
288 continue; 295 continue;
289 if (!freezing(task) && !freezer_should_skip(task)) 296 if (!freezing(task) && !freezer_should_skip(task))
290 num_cant_freeze_now++; 297 num_cant_freeze_now++;
diff --git a/kernel/compat.c b/kernel/compat.c
index e2435ee9993..f346cedfe24 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -21,6 +21,7 @@
21#include <linux/unistd.h> 21#include <linux/unistd.h>
22#include <linux/security.h> 22#include <linux/security.h>
23#include <linux/timex.h> 23#include <linux/timex.h>
24#include <linux/export.h>
24#include <linux/migrate.h> 25#include <linux/migrate.h>
25#include <linux/posix-timers.h> 26#include <linux/posix-timers.h>
26#include <linux/times.h> 27#include <linux/times.h>
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 12b7458f23b..563f1360947 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -10,11 +10,12 @@
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/unistd.h> 11#include <linux/unistd.h>
12#include <linux/cpu.h> 12#include <linux/cpu.h>
13#include <linux/module.h> 13#include <linux/export.h>
14#include <linux/kthread.h> 14#include <linux/kthread.h>
15#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/gfp.h> 17#include <linux/gfp.h>
18#include <linux/suspend.h>
18 19
19#ifdef CONFIG_SMP 20#ifdef CONFIG_SMP
20/* Serializes the updates to cpu_online_mask, cpu_present_mask */ 21/* Serializes the updates to cpu_online_mask, cpu_present_mask */
@@ -476,6 +477,79 @@ static int alloc_frozen_cpus(void)
476 return 0; 477 return 0;
477} 478}
478core_initcall(alloc_frozen_cpus); 479core_initcall(alloc_frozen_cpus);
480
481/*
482 * Prevent regular CPU hotplug from racing with the freezer, by disabling CPU
483 * hotplug when tasks are about to be frozen. Also, don't allow the freezer
484 * to continue until any currently running CPU hotplug operation gets
485 * completed.
486 * To modify the 'cpu_hotplug_disabled' flag, we need to acquire the
487 * 'cpu_add_remove_lock'. And this same lock is also taken by the regular
488 * CPU hotplug path and released only after it is complete. Thus, we
489 * (and hence the freezer) will block here until any currently running CPU
490 * hotplug operation gets completed.
491 */
492void cpu_hotplug_disable_before_freeze(void)
493{
494 cpu_maps_update_begin();
495 cpu_hotplug_disabled = 1;
496 cpu_maps_update_done();
497}
498
499
500/*
501 * When tasks have been thawed, re-enable regular CPU hotplug (which had been
502 * disabled while beginning to freeze tasks).
503 */
504void cpu_hotplug_enable_after_thaw(void)
505{
506 cpu_maps_update_begin();
507 cpu_hotplug_disabled = 0;
508 cpu_maps_update_done();
509}
510
511/*
512 * When callbacks for CPU hotplug notifications are being executed, we must
513 * ensure that the state of the system with respect to the tasks being frozen
514 * or not, as reported by the notification, remains unchanged *throughout the
515 * duration* of the execution of the callbacks.
516 * Hence we need to prevent the freezer from racing with regular CPU hotplug.
517 *
518 * This synchronization is implemented by mutually excluding regular CPU
519 * hotplug and Suspend/Hibernate call paths by hooking onto the Suspend/
520 * Hibernate notifications.
521 */
522static int
523cpu_hotplug_pm_callback(struct notifier_block *nb,
524 unsigned long action, void *ptr)
525{
526 switch (action) {
527
528 case PM_SUSPEND_PREPARE:
529 case PM_HIBERNATION_PREPARE:
530 cpu_hotplug_disable_before_freeze();
531 break;
532
533 case PM_POST_SUSPEND:
534 case PM_POST_HIBERNATION:
535 cpu_hotplug_enable_after_thaw();
536 break;
537
538 default:
539 return NOTIFY_DONE;
540 }
541
542 return NOTIFY_OK;
543}
544
545
546int cpu_hotplug_pm_sync_init(void)
547{
548 pm_notifier(cpu_hotplug_pm_callback, 0);
549 return 0;
550}
551core_initcall(cpu_hotplug_pm_sync_init);
552
479#endif /* CONFIG_PM_SLEEP_SMP */ 553#endif /* CONFIG_PM_SLEEP_SMP */
480 554
481/** 555/**
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
new file mode 100644
index 00000000000..249152e1530
--- /dev/null
+++ b/kernel/cpu_pm.c
@@ -0,0 +1,233 @@
1/*
2 * Copyright (C) 2011 Google, Inc.
3 *
4 * Author:
5 * Colin Cross <ccross@android.com>
6 *
7 * This software is licensed under the terms of the GNU General Public
8 * License version 2, as published by the Free Software Foundation, and
9 * may be copied, distributed, and modified under those terms.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 */
17
18#include <linux/kernel.h>
19#include <linux/cpu_pm.h>
20#include <linux/module.h>
21#include <linux/notifier.h>
22#include <linux/spinlock.h>
23#include <linux/syscore_ops.h>
24
25static DEFINE_RWLOCK(cpu_pm_notifier_lock);
26static RAW_NOTIFIER_HEAD(cpu_pm_notifier_chain);
27
28static int cpu_pm_notify(enum cpu_pm_event event, int nr_to_call, int *nr_calls)
29{
30 int ret;
31
32 ret = __raw_notifier_call_chain(&cpu_pm_notifier_chain, event, NULL,
33 nr_to_call, nr_calls);
34
35 return notifier_to_errno(ret);
36}
37
38/**
39 * cpu_pm_register_notifier - register a driver with cpu_pm
40 * @nb: notifier block to register
41 *
42 * Add a driver to a list of drivers that are notified about
43 * CPU and CPU cluster low power entry and exit.
44 *
45 * This function may sleep, and has the same return conditions as
46 * raw_notifier_chain_register.
47 */
48int cpu_pm_register_notifier(struct notifier_block *nb)
49{
50 unsigned long flags;
51 int ret;
52
53 write_lock_irqsave(&cpu_pm_notifier_lock, flags);
54 ret = raw_notifier_chain_register(&cpu_pm_notifier_chain, nb);
55 write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
56
57 return ret;
58}
59EXPORT_SYMBOL_GPL(cpu_pm_register_notifier);
60
61/**
62 * cpu_pm_unregister_notifier - unregister a driver with cpu_pm
63 * @nb: notifier block to be unregistered
64 *
65 * Remove a driver from the CPU PM notifier list.
66 *
67 * This function may sleep, and has the same return conditions as
68 * raw_notifier_chain_unregister.
69 */
70int cpu_pm_unregister_notifier(struct notifier_block *nb)
71{
72 unsigned long flags;
73 int ret;
74
75 write_lock_irqsave(&cpu_pm_notifier_lock, flags);
76 ret = raw_notifier_chain_unregister(&cpu_pm_notifier_chain, nb);
77 write_unlock_irqrestore(&cpu_pm_notifier_lock, flags);
78
79 return ret;
80}
81EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
82
83/**
84 * cpu_pm_enter - CPU low power entry notifier
85 *
86 * Notifies listeners that a single CPU is entering a low power state that may
87 * cause some blocks in the same power domain as the cpu to reset.
88 *
89 * Must be called on the affected CPU with interrupts disabled. Platform is
90 * responsible for ensuring that cpu_pm_enter is not called twice on the same
91 * CPU before cpu_pm_exit is called. Notified drivers can include VFP
92 * co-processor, interrupt controller and its PM extensions, local CPU
93 * timers context save/restore which shouldn't be interrupted. Hence it
94 * must be called with interrupts disabled.
95 *
96 * Return conditions are same as __raw_notifier_call_chain.
97 */
98int cpu_pm_enter(void)
99{
100 int nr_calls;
101 int ret = 0;
102
103 read_lock(&cpu_pm_notifier_lock);
104 ret = cpu_pm_notify(CPU_PM_ENTER, -1, &nr_calls);
105 if (ret)
106 /*
107 * Inform listeners (nr_calls - 1) about failure of CPU PM
108 * PM entry who are notified earlier to prepare for it.
109 */
110 cpu_pm_notify(CPU_PM_ENTER_FAILED, nr_calls - 1, NULL);
111 read_unlock(&cpu_pm_notifier_lock);
112
113 return ret;
114}
115EXPORT_SYMBOL_GPL(cpu_pm_enter);
116
117/**
118 * cpu_pm_exit - CPU low power exit notifier
119 *
120 * Notifies listeners that a single CPU is exiting a low power state that may
121 * have caused some blocks in the same power domain as the cpu to reset.
122 *
123 * Notified drivers can include VFP co-processor, interrupt controller
124 * and its PM extensions, local CPU timers context save/restore which
125 * shouldn't be interrupted. Hence it must be called with interrupts disabled.
126 *
127 * Return conditions are same as __raw_notifier_call_chain.
128 */
129int cpu_pm_exit(void)
130{
131 int ret;
132
133 read_lock(&cpu_pm_notifier_lock);
134 ret = cpu_pm_notify(CPU_PM_EXIT, -1, NULL);
135 read_unlock(&cpu_pm_notifier_lock);
136
137 return ret;
138}
139EXPORT_SYMBOL_GPL(cpu_pm_exit);
140
141/**
142 * cpu_cluster_pm_enter - CPU cluster low power entry notifier
143 *
144 * Notifies listeners that all cpus in a power domain are entering a low power
145 * state that may cause some blocks in the same power domain to reset.
146 *
147 * Must be called after cpu_pm_enter has been called on all cpus in the power
148 * domain, and before cpu_pm_exit has been called on any cpu in the power
149 * domain. Notified drivers can include VFP co-processor, interrupt controller
150 * and its PM extensions, local CPU timers context save/restore which
151 * shouldn't be interrupted. Hence it must be called with interrupts disabled.
152 *
153 * Must be called with interrupts disabled.
154 *
155 * Return conditions are same as __raw_notifier_call_chain.
156 */
157int cpu_cluster_pm_enter(void)
158{
159 int nr_calls;
160 int ret = 0;
161
162 read_lock(&cpu_pm_notifier_lock);
163 ret = cpu_pm_notify(CPU_CLUSTER_PM_ENTER, -1, &nr_calls);
164 if (ret)
165 /*
166 * Inform listeners (nr_calls - 1) about failure of CPU cluster
167 * PM entry who are notified earlier to prepare for it.
168 */
169 cpu_pm_notify(CPU_CLUSTER_PM_ENTER_FAILED, nr_calls - 1, NULL);
170 read_unlock(&cpu_pm_notifier_lock);
171
172 return ret;
173}
174EXPORT_SYMBOL_GPL(cpu_cluster_pm_enter);
175
176/**
177 * cpu_cluster_pm_exit - CPU cluster low power exit notifier
178 *
179 * Notifies listeners that all cpus in a power domain are exiting from a
180 * low power state that may have caused some blocks in the same power domain
181 * to reset.
182 *
183 * Must be called after cpu_cluster_pm_enter has been called for the power
184 * domain, and before cpu_pm_exit has been called on any cpu in the power
185 * domain. Notified drivers can include VFP co-processor, interrupt controller
186 * and its PM extensions, local CPU timers context save/restore which
187 * shouldn't be interrupted. Hence it must be called with interrupts disabled.
188 *
189 * Return conditions are same as __raw_notifier_call_chain.
190 */
191int cpu_cluster_pm_exit(void)
192{
193 int ret;
194
195 read_lock(&cpu_pm_notifier_lock);
196 ret = cpu_pm_notify(CPU_CLUSTER_PM_EXIT, -1, NULL);
197 read_unlock(&cpu_pm_notifier_lock);
198
199 return ret;
200}
201EXPORT_SYMBOL_GPL(cpu_cluster_pm_exit);
202
203#ifdef CONFIG_PM
204static int cpu_pm_suspend(void)
205{
206 int ret;
207
208 ret = cpu_pm_enter();
209 if (ret)
210 return ret;
211
212 ret = cpu_cluster_pm_enter();
213 return ret;
214}
215
216static void cpu_pm_resume(void)
217{
218 cpu_cluster_pm_exit();
219 cpu_pm_exit();
220}
221
222static struct syscore_ops cpu_pm_syscore_ops = {
223 .suspend = cpu_pm_suspend,
224 .resume = cpu_pm_resume,
225};
226
227static int cpu_pm_init(void)
228{
229 register_syscore_ops(&cpu_pm_syscore_ops);
230 return 0;
231}
232core_initcall(cpu_pm_init);
233#endif
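The new cpu_pm notifier chain is consumed by drivers whose state lives in the CPU's power domain (VFP, the interrupt controller, local timers). A hedged usage sketch, with invented driver names, assuming only the API added above (cpu_pm_register_notifier() and the CPU_PM_* events):

#include <linux/cpu_pm.h>
#include <linux/init.h>
#include <linux/notifier.h>

static int mydrv_cpu_pm_notify(struct notifier_block *nb,
			       unsigned long action, void *data)
{
	switch (action) {
	case CPU_PM_ENTER:
		/* save per-CPU hardware context before the domain powers down */
		break;
	case CPU_PM_ENTER_FAILED:
	case CPU_PM_EXIT:
		/* restore per-CPU hardware context */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block mydrv_cpu_pm_nb = {
	.notifier_call = mydrv_cpu_pm_notify,
};

static int __init mydrv_init(void)
{
	return cpu_pm_register_notifier(&mydrv_cpu_pm_nb);
}
core_initcall(mydrv_init);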
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 10131fdaff7..9fe58c46a42 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -37,7 +37,7 @@
37#include <linux/mempolicy.h> 37#include <linux/mempolicy.h>
38#include <linux/mm.h> 38#include <linux/mm.h>
39#include <linux/memory.h> 39#include <linux/memory.h>
40#include <linux/module.h> 40#include <linux/export.h>
41#include <linux/mount.h> 41#include <linux/mount.h>
42#include <linux/namei.h> 42#include <linux/namei.h>
43#include <linux/pagemap.h> 43#include <linux/pagemap.h>
@@ -949,6 +949,8 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
949static void cpuset_change_task_nodemask(struct task_struct *tsk, 949static void cpuset_change_task_nodemask(struct task_struct *tsk,
950 nodemask_t *newmems) 950 nodemask_t *newmems)
951{ 951{
952 bool masks_disjoint = !nodes_intersects(*newmems, tsk->mems_allowed);
953
952repeat: 954repeat:
953 /* 955 /*
954 * Allow tasks that have access to memory reserves because they have 956 * Allow tasks that have access to memory reserves because they have
@@ -963,7 +965,6 @@ repeat:
963 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems); 965 nodes_or(tsk->mems_allowed, tsk->mems_allowed, *newmems);
964 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1); 966 mpol_rebind_task(tsk, newmems, MPOL_REBIND_STEP1);
965 967
966
967 /* 968 /*
968 * ensure checking ->mems_allowed_change_disable after setting all new 969 * ensure checking ->mems_allowed_change_disable after setting all new
969 * allowed nodes. 970 * allowed nodes.
@@ -980,9 +981,11 @@ repeat:
980 981
981 /* 982 /*
982 * Allocation of memory is very fast, we needn't sleep when waiting 983 * Allocation of memory is very fast, we needn't sleep when waiting
983 * for the read-side. 984 * for the read-side. No wait is necessary, however, if at least one
985 * node remains unchanged.
984 */ 986 */
985 while (ACCESS_ONCE(tsk->mems_allowed_change_disable)) { 987 while (masks_disjoint &&
988 ACCESS_ONCE(tsk->mems_allowed_change_disable)) {
986 task_unlock(tsk); 989 task_unlock(tsk);
987 if (!task_curr(tsk)) 990 if (!task_curr(tsk))
988 yield(); 991 yield();
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
index 5f85690285d..c766ee54c0b 100644
--- a/kernel/crash_dump.c
+++ b/kernel/crash_dump.c
@@ -2,7 +2,7 @@
2#include <linux/crash_dump.h> 2#include <linux/crash_dump.h>
3#include <linux/init.h> 3#include <linux/init.h>
4#include <linux/errno.h> 4#include <linux/errno.h>
5#include <linux/module.h> 5#include <linux/export.h>
6 6
7/* 7/*
8 * If we have booted due to a crash, max_pfn will be a very low value. We need 8 * If we have booted due to a crash, max_pfn will be a very low value. We need
@@ -20,8 +20,15 @@ unsigned long saved_max_pfn;
20unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; 20unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
21 21
22/* 22/*
23 * stores the size of elf header of crash image
24 */
25unsigned long long elfcorehdr_size;
26
27/*
23 * elfcorehdr= specifies the location of elf core header stored by the crashed 28 * elfcorehdr= specifies the location of elf core header stored by the crashed
24 * kernel. This option will be passed by kexec loader to the capture kernel. 29 * kernel. This option will be passed by kexec loader to the capture kernel.
30 *
31 * Syntax: elfcorehdr=[size[KMG]@]offset[KMG]
25 */ 32 */
26static int __init setup_elfcorehdr(char *arg) 33static int __init setup_elfcorehdr(char *arg)
27{ 34{
@@ -29,6 +36,10 @@ static int __init setup_elfcorehdr(char *arg)
29 if (!arg) 36 if (!arg)
30 return -EINVAL; 37 return -EINVAL;
31 elfcorehdr_addr = memparse(arg, &end); 38 elfcorehdr_addr = memparse(arg, &end);
39 if (*end == '@') {
40 elfcorehdr_size = elfcorehdr_addr;
41 elfcorehdr_addr = memparse(end + 1, &end);
42 }
32 return end > arg ? 0 : -EINVAL; 43 return end > arg ? 0 : -EINVAL;
33} 44}
34early_param("elfcorehdr", setup_elfcorehdr); 45early_param("elfcorehdr", setup_elfcorehdr);
diff --git a/kernel/cred.c b/kernel/cred.c
index 174fa84eca3..5791612a404 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -8,7 +8,7 @@
8 * as published by the Free Software Foundation; either version 8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version. 9 * 2 of the Licence, or (at your option) any later version.
10 */ 10 */
11#include <linux/module.h> 11#include <linux/export.h>
12#include <linux/cred.h> 12#include <linux/cred.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/sched.h> 14#include <linux/sched.h>
@@ -508,10 +508,8 @@ int commit_creds(struct cred *new)
508 key_fsgid_changed(task); 508 key_fsgid_changed(task);
509 509
510 /* do it 510 /* do it
511 * - What if a process setreuid()'s and this brings the 511 * RLIMIT_NPROC limits on user->processes have already been checked
512 * new uid over his NPROC rlimit? We can check this now 512 * in set_user().
513 * cheaply with the new uid cache, so if it matters
514 * we should be checking for it. -DaveM
515 */ 513 */
516 alter_cred_subscribers(new, 2); 514 alter_cred_subscribers(new, 2);
517 if (new->user != old->user) 515 if (new->user != old->user)
@@ -646,6 +644,9 @@ void __init cred_init(void)
646 */ 644 */
647struct cred *prepare_kernel_cred(struct task_struct *daemon) 645struct cred *prepare_kernel_cred(struct task_struct *daemon)
648{ 646{
647#ifdef CONFIG_KEYS
648 struct thread_group_cred *tgcred;
649#endif
649 const struct cred *old; 650 const struct cred *old;
650 struct cred *new; 651 struct cred *new;
651 652
@@ -653,6 +654,14 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
653 if (!new) 654 if (!new)
654 return NULL; 655 return NULL;
655 656
657#ifdef CONFIG_KEYS
658 tgcred = kmalloc(sizeof(*tgcred), GFP_KERNEL);
659 if (!tgcred) {
660 kmem_cache_free(cred_jar, new);
661 return NULL;
662 }
663#endif
664
656 kdebug("prepare_kernel_cred() alloc %p", new); 665 kdebug("prepare_kernel_cred() alloc %p", new);
657 666
658 if (daemon) 667 if (daemon)
@@ -669,8 +678,11 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
669 get_group_info(new->group_info); 678 get_group_info(new->group_info);
670 679
671#ifdef CONFIG_KEYS 680#ifdef CONFIG_KEYS
672 atomic_inc(&init_tgcred.usage); 681 atomic_set(&tgcred->usage, 1);
673 new->tgcred = &init_tgcred; 682 spin_lock_init(&tgcred->lock);
683 tgcred->process_keyring = NULL;
684 tgcred->session_keyring = NULL;
685 new->tgcred = tgcred;
674 new->request_key_auth = NULL; 686 new->request_key_auth = NULL;
675 new->thread_keyring = NULL; 687 new->thread_keyring = NULL;
676 new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; 688 new->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 34872482315..c22d8c28ad8 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -217,7 +217,7 @@ void gdbstub_msg_write(const char *s, int len)
217 217
218 /* Pack in hex chars */ 218 /* Pack in hex chars */
219 for (i = 0; i < wcount; i++) 219 for (i = 0; i < wcount; i++)
220 bufptr = pack_hex_byte(bufptr, s[i]); 220 bufptr = hex_byte_pack(bufptr, s[i]);
221 *bufptr = '\0'; 221 *bufptr = '\0';
222 222
223 /* Move up */ 223 /* Move up */
@@ -249,7 +249,7 @@ char *kgdb_mem2hex(char *mem, char *buf, int count)
249 if (err) 249 if (err)
250 return NULL; 250 return NULL;
251 while (count > 0) { 251 while (count > 0) {
252 buf = pack_hex_byte(buf, *tmp); 252 buf = hex_byte_pack(buf, *tmp);
253 tmp++; 253 tmp++;
254 count--; 254 count--;
255 } 255 }
@@ -411,14 +411,14 @@ static char *pack_threadid(char *pkt, unsigned char *id)
411 limit = id + (BUF_THREAD_ID_SIZE / 2); 411 limit = id + (BUF_THREAD_ID_SIZE / 2);
412 while (id < limit) { 412 while (id < limit) {
413 if (!lzero || *id != 0) { 413 if (!lzero || *id != 0) {
414 pkt = pack_hex_byte(pkt, *id); 414 pkt = hex_byte_pack(pkt, *id);
415 lzero = 0; 415 lzero = 0;
416 } 416 }
417 id++; 417 id++;
418 } 418 }
419 419
420 if (lzero) 420 if (lzero)
421 pkt = pack_hex_byte(pkt, 0); 421 pkt = hex_byte_pack(pkt, 0);
422 422
423 return pkt; 423 return pkt;
424} 424}
@@ -486,7 +486,7 @@ static void gdb_cmd_status(struct kgdb_state *ks)
486 dbg_remove_all_break(); 486 dbg_remove_all_break();
487 487
488 remcom_out_buffer[0] = 'S'; 488 remcom_out_buffer[0] = 'S';
489 pack_hex_byte(&remcom_out_buffer[1], ks->signo); 489 hex_byte_pack(&remcom_out_buffer[1], ks->signo);
490} 490}
491 491
492static void gdb_get_regs_helper(struct kgdb_state *ks) 492static void gdb_get_regs_helper(struct kgdb_state *ks)
@@ -954,7 +954,7 @@ int gdb_serial_stub(struct kgdb_state *ks)
954 /* Reply to host that an exception has occurred */ 954 /* Reply to host that an exception has occurred */
955 ptr = remcom_out_buffer; 955 ptr = remcom_out_buffer;
956 *ptr++ = 'T'; 956 *ptr++ = 'T';
957 ptr = pack_hex_byte(ptr, ks->signo); 957 ptr = hex_byte_pack(ptr, ks->signo);
958 ptr += strlen(strcpy(ptr, "thread:")); 958 ptr += strlen(strcpy(ptr, "thread:"));
959 int_to_threadref(thref, shadow_pid(current->pid)); 959 int_to_threadref(thref, shadow_pid(current->pid));
960 ptr = pack_threadid(ptr, thref); 960 ptr = pack_threadid(ptr, thref);
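The gdbstub hunks above are a mechanical rename from pack_hex_byte() to hex_byte_pack(); the kernel's helper simply emits the two hex digits of a byte and returns the advanced buffer pointer. A standalone, purely illustrative equivalent (not the kernel's definition):

/* Illustrative only -- mirrors what hex_byte_pack() does. */
static char *example_hex_byte_pack(char *buf, unsigned char byte)
{
	static const char hex_digits[] = "0123456789abcdef";

	*buf++ = hex_digits[(byte >> 4) & 0x0f];
	*buf++ = hex_digits[byte & 0x0f];
	return buf;
}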
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index d9ca9aa481e..8b68ce78ff1 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -11,6 +11,7 @@
11#include <linux/kgdb.h> 11#include <linux/kgdb.h>
12#include <linux/kdb.h> 12#include <linux/kdb.h>
13#include <linux/kdebug.h> 13#include <linux/kdebug.h>
14#include <linux/export.h>
14#include "kdb_private.h" 15#include "kdb_private.h"
15#include "../debug_core.h" 16#include "../debug_core.h"
16 17
diff --git a/kernel/dma.c b/kernel/dma.c
index f903189c530..68a2306522c 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -9,7 +9,7 @@
9 * [It also happened to remove the sizeof(char *) == sizeof(int) 9 * [It also happened to remove the sizeof(char *) == sizeof(int)
10 * assumption introduced because of those /proc/dma patches. -- Hennus] 10 * assumption introduced because of those /proc/dma patches. -- Hennus]
11 */ 11 */
12#include <linux/module.h> 12#include <linux/export.h>
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/errno.h> 14#include <linux/errno.h>
15#include <linux/spinlock.h> 15#include <linux/spinlock.h>
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b8785e26ee1..0e8457da6f9 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -25,6 +25,7 @@
25#include <linux/reboot.h> 25#include <linux/reboot.h>
26#include <linux/vmstat.h> 26#include <linux/vmstat.h>
27#include <linux/device.h> 27#include <linux/device.h>
28#include <linux/export.h>
28#include <linux/vmalloc.h> 29#include <linux/vmalloc.h>
29#include <linux/hardirq.h> 30#include <linux/hardirq.h>
30#include <linux/rculist.h> 31#include <linux/rculist.h>
@@ -399,14 +400,54 @@ void perf_cgroup_switch(struct task_struct *task, int mode)
399 local_irq_restore(flags); 400 local_irq_restore(flags);
400} 401}
401 402
402static inline void perf_cgroup_sched_out(struct task_struct *task) 403static inline void perf_cgroup_sched_out(struct task_struct *task,
404 struct task_struct *next)
403{ 405{
404 perf_cgroup_switch(task, PERF_CGROUP_SWOUT); 406 struct perf_cgroup *cgrp1;
407 struct perf_cgroup *cgrp2 = NULL;
408
409 /*
410 * we come here when we know perf_cgroup_events > 0
411 */
412 cgrp1 = perf_cgroup_from_task(task);
413
414 /*
415 * next is NULL when called from perf_event_enable_on_exec()
416 * that will systematically cause a cgroup_switch()
417 */
418 if (next)
419 cgrp2 = perf_cgroup_from_task(next);
420
421 /*
422 * only schedule out current cgroup events if we know
423 * that we are switching to a different cgroup. Otherwise,
424 * do not touch the cgroup events.
425 */
426 if (cgrp1 != cgrp2)
427 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
405} 428}
406 429
407static inline void perf_cgroup_sched_in(struct task_struct *task) 430static inline void perf_cgroup_sched_in(struct task_struct *prev,
431 struct task_struct *task)
408{ 432{
409 perf_cgroup_switch(task, PERF_CGROUP_SWIN); 433 struct perf_cgroup *cgrp1;
434 struct perf_cgroup *cgrp2 = NULL;
435
436 /*
437 * we come here when we know perf_cgroup_events > 0
438 */
439 cgrp1 = perf_cgroup_from_task(task);
440
441 /* prev can never be NULL */
442 cgrp2 = perf_cgroup_from_task(prev);
443
444 /*
445 * only need to schedule in cgroup events if we are changing
446 * cgroup during ctxsw. Cgroup events were not scheduled
447 * out of ctxsw out if that was not the case.
448 */
449 if (cgrp1 != cgrp2)
450 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
410} 451}
411 452
412static inline int perf_cgroup_connect(int fd, struct perf_event *event, 453static inline int perf_cgroup_connect(int fd, struct perf_event *event,
@@ -518,11 +559,13 @@ static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
518{ 559{
519} 560}
520 561
521static inline void perf_cgroup_sched_out(struct task_struct *task) 562static inline void perf_cgroup_sched_out(struct task_struct *task,
563 struct task_struct *next)
522{ 564{
523} 565}
524 566
525static inline void perf_cgroup_sched_in(struct task_struct *task) 567static inline void perf_cgroup_sched_in(struct task_struct *prev,
568 struct task_struct *task)
526{ 569{
527} 570}
528 571
@@ -1988,7 +2031,7 @@ void __perf_event_task_sched_out(struct task_struct *task,
1988 * cgroup event are system-wide mode only 2031 * cgroup event are system-wide mode only
1989 */ 2032 */
1990 if (atomic_read(&__get_cpu_var(perf_cgroup_events))) 2033 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
1991 perf_cgroup_sched_out(task); 2034 perf_cgroup_sched_out(task, next);
1992} 2035}
1993 2036
1994static void task_ctx_sched_out(struct perf_event_context *ctx) 2037static void task_ctx_sched_out(struct perf_event_context *ctx)
@@ -2153,7 +2196,8 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
2153 * accessing the event control register. If a NMI hits, then it will 2196 * accessing the event control register. If a NMI hits, then it will
2154 * keep the event running. 2197 * keep the event running.
2155 */ 2198 */
2156void __perf_event_task_sched_in(struct task_struct *task) 2199void __perf_event_task_sched_in(struct task_struct *prev,
2200 struct task_struct *task)
2157{ 2201{
2158 struct perf_event_context *ctx; 2202 struct perf_event_context *ctx;
2159 int ctxn; 2203 int ctxn;
@@ -2171,7 +2215,7 @@ void __perf_event_task_sched_in(struct task_struct *task)
2171 * cgroup event are system-wide mode only 2215 * cgroup event are system-wide mode only
2172 */ 2216 */
2173 if (atomic_read(&__get_cpu_var(perf_cgroup_events))) 2217 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2174 perf_cgroup_sched_in(task); 2218 perf_cgroup_sched_in(prev, task);
2175} 2219}
2176 2220
2177static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count) 2221static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -2427,7 +2471,7 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
2427 * ctxswin cgroup events which are already scheduled 2471 * ctxswin cgroup events which are already scheduled
2428 * in. 2472 * in.
2429 */ 2473 */
2430 perf_cgroup_sched_out(current); 2474 perf_cgroup_sched_out(current, NULL);
2431 2475
2432 raw_spin_lock(&ctx->lock); 2476 raw_spin_lock(&ctx->lock);
2433 task_ctx_sched_out(ctx); 2477 task_ctx_sched_out(ctx);
@@ -3353,8 +3397,8 @@ static int perf_event_index(struct perf_event *event)
3353} 3397}
3354 3398
3355static void calc_timer_values(struct perf_event *event, 3399static void calc_timer_values(struct perf_event *event,
3356 u64 *running, 3400 u64 *enabled,
3357 u64 *enabled) 3401 u64 *running)
3358{ 3402{
3359 u64 now, ctx_time; 3403 u64 now, ctx_time;
3360 3404
@@ -3500,7 +3544,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
3500 struct ring_buffer *rb = event->rb; 3544 struct ring_buffer *rb = event->rb;
3501 3545
3502 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); 3546 atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
3503 vma->vm_mm->locked_vm -= event->mmap_locked; 3547 vma->vm_mm->pinned_vm -= event->mmap_locked;
3504 rcu_assign_pointer(event->rb, NULL); 3548 rcu_assign_pointer(event->rb, NULL);
3505 mutex_unlock(&event->mmap_mutex); 3549 mutex_unlock(&event->mmap_mutex);
3506 3550
@@ -3581,7 +3625,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3581 3625
3582 lock_limit = rlimit(RLIMIT_MEMLOCK); 3626 lock_limit = rlimit(RLIMIT_MEMLOCK);
3583 lock_limit >>= PAGE_SHIFT; 3627 lock_limit >>= PAGE_SHIFT;
3584 locked = vma->vm_mm->locked_vm + extra; 3628 locked = vma->vm_mm->pinned_vm + extra;
3585 3629
3586 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() && 3630 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
3587 !capable(CAP_IPC_LOCK)) { 3631 !capable(CAP_IPC_LOCK)) {
@@ -3607,7 +3651,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3607 atomic_long_add(user_extra, &user->locked_vm); 3651 atomic_long_add(user_extra, &user->locked_vm);
3608 event->mmap_locked = extra; 3652 event->mmap_locked = extra;
3609 event->mmap_user = get_current_user(); 3653 event->mmap_user = get_current_user();
3610 vma->vm_mm->locked_vm += event->mmap_locked; 3654 vma->vm_mm->pinned_vm += event->mmap_locked;
3611 3655
3612unlock: 3656unlock:
3613 if (!ret) 3657 if (!ret)
@@ -5715,6 +5759,7 @@ struct pmu *perf_init_event(struct perf_event *event)
5715 pmu = idr_find(&pmu_idr, event->attr.type); 5759 pmu = idr_find(&pmu_idr, event->attr.type);
5716 rcu_read_unlock(); 5760 rcu_read_unlock();
5717 if (pmu) { 5761 if (pmu) {
5762 event->pmu = pmu;
5718 ret = pmu->event_init(event); 5763 ret = pmu->event_init(event);
5719 if (ret) 5764 if (ret)
5720 pmu = ERR_PTR(ret); 5765 pmu = ERR_PTR(ret);
@@ -5722,6 +5767,7 @@ struct pmu *perf_init_event(struct perf_event *event)
5722 } 5767 }
5723 5768
5724 list_for_each_entry_rcu(pmu, &pmus, entry) { 5769 list_for_each_entry_rcu(pmu, &pmus, entry) {
5770 event->pmu = pmu;
5725 ret = pmu->event_init(event); 5771 ret = pmu->event_init(event);
5726 if (!ret) 5772 if (!ret)
5727 goto unlock; 5773 goto unlock;
@@ -5848,8 +5894,6 @@ done:
5848 return ERR_PTR(err); 5894 return ERR_PTR(err);
5849 } 5895 }
5850 5896
5851 event->pmu = pmu;
5852
5853 if (!event->parent) { 5897 if (!event->parent) {
5854 if (event->attach_state & PERF_ATTACH_TASK) 5898 if (event->attach_state & PERF_ATTACH_TASK)
5855 jump_label_inc(&perf_sched_events); 5899 jump_label_inc(&perf_sched_events);
diff --git a/kernel/exit.c b/kernel/exit.c
index 2913b3509d4..d0b7d988f87 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -681,8 +681,6 @@ static void exit_mm(struct task_struct * tsk)
681 enter_lazy_tlb(mm, current); 681 enter_lazy_tlb(mm, current);
682 /* We don't want this task to be frozen prematurely */ 682 /* We don't want this task to be frozen prematurely */
683 clear_freeze_flag(tsk); 683 clear_freeze_flag(tsk);
684 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
685 atomic_dec(&mm->oom_disable_count);
686 task_unlock(tsk); 684 task_unlock(tsk);
687 mm_update_next_owner(mm); 685 mm_update_next_owner(mm);
688 mmput(mm); 686 mmput(mm);
diff --git a/kernel/fork.c b/kernel/fork.c
index e7ceaca8960..da4a6a10d08 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -162,7 +162,6 @@ static void account_kernel_stack(struct thread_info *ti, int account)
162 162
163void free_task(struct task_struct *tsk) 163void free_task(struct task_struct *tsk)
164{ 164{
165 prop_local_destroy_single(&tsk->dirties);
166 account_kernel_stack(tsk->stack, -1); 165 account_kernel_stack(tsk->stack, -1);
167 free_thread_info(tsk->stack); 166 free_thread_info(tsk->stack);
168 rt_mutex_debug_task_free(tsk); 167 rt_mutex_debug_task_free(tsk);
@@ -274,10 +273,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
274 273
275 tsk->stack = ti; 274 tsk->stack = ti;
276 275
277 err = prop_local_init_single(&tsk->dirties);
278 if (err)
279 goto out;
280
281 setup_thread_stack(tsk, orig); 276 setup_thread_stack(tsk, orig);
282 clear_user_return_notifier(tsk); 277 clear_user_return_notifier(tsk);
283 clear_tsk_need_resched(tsk); 278 clear_tsk_need_resched(tsk);
@@ -501,7 +496,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
501 mm->cached_hole_size = ~0UL; 496 mm->cached_hole_size = ~0UL;
502 mm_init_aio(mm); 497 mm_init_aio(mm);
503 mm_init_owner(mm, p); 498 mm_init_owner(mm, p);
504 atomic_set(&mm->oom_disable_count, 0);
505 499
506 if (likely(!mm_alloc_pgd(mm))) { 500 if (likely(!mm_alloc_pgd(mm))) {
507 mm->def_flags = 0; 501 mm->def_flags = 0;
@@ -816,8 +810,6 @@ good_mm:
816 /* Initializing for Swap token stuff */ 810 /* Initializing for Swap token stuff */
817 mm->token_priority = 0; 811 mm->token_priority = 0;
818 mm->last_interval = 0; 812 mm->last_interval = 0;
819 if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
820 atomic_inc(&mm->oom_disable_count);
821 813
822 tsk->mm = mm; 814 tsk->mm = mm;
823 tsk->active_mm = mm; 815 tsk->active_mm = mm;
@@ -1111,6 +1103,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1111 p->real_cred->user != INIT_USER) 1103 p->real_cred->user != INIT_USER)
1112 goto bad_fork_free; 1104 goto bad_fork_free;
1113 } 1105 }
1106 current->flags &= ~PF_NPROC_EXCEEDED;
1114 1107
1115 retval = copy_creds(p, clone_flags); 1108 retval = copy_creds(p, clone_flags);
1116 if (retval < 0) 1109 if (retval < 0)
@@ -1301,6 +1294,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1301 p->pdeath_signal = 0; 1294 p->pdeath_signal = 0;
1302 p->exit_state = 0; 1295 p->exit_state = 0;
1303 1296
1297 p->nr_dirtied = 0;
1298 p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
1299
1304 /* 1300 /*
1305 * Ok, make it visible to the rest of the system. 1301 * Ok, make it visible to the rest of the system.
1306 * We dont wake it up yet. 1302 * We dont wake it up yet.
@@ -1390,13 +1386,8 @@ bad_fork_cleanup_io:
1390bad_fork_cleanup_namespaces: 1386bad_fork_cleanup_namespaces:
1391 exit_task_namespaces(p); 1387 exit_task_namespaces(p);
1392bad_fork_cleanup_mm: 1388bad_fork_cleanup_mm:
1393 if (p->mm) { 1389 if (p->mm)
1394 task_lock(p);
1395 if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
1396 atomic_dec(&p->mm->oom_disable_count);
1397 task_unlock(p);
1398 mmput(p->mm); 1390 mmput(p->mm);
1399 }
1400bad_fork_cleanup_signal: 1391bad_fork_cleanup_signal:
1401 if (!(clone_flags & CLONE_THREAD)) 1392 if (!(clone_flags & CLONE_THREAD))
1402 free_signal_struct(p->signal); 1393 free_signal_struct(p->signal);
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 7b01de98bb6..7be56c53439 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -6,7 +6,7 @@
6 6
7#include <linux/interrupt.h> 7#include <linux/interrupt.h>
8#include <linux/suspend.h> 8#include <linux/suspend.h>
9#include <linux/module.h> 9#include <linux/export.h>
10#include <linux/syscalls.h> 10#include <linux/syscalls.h>
11#include <linux/freezer.h> 11#include <linux/freezer.h>
12 12
diff --git a/kernel/futex.c b/kernel/futex.c
index 11cbe052b2e..ea87f4d2f45 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -55,7 +55,7 @@
55#include <linux/pagemap.h> 55#include <linux/pagemap.h>
56#include <linux/syscalls.h> 56#include <linux/syscalls.h>
57#include <linux/signal.h> 57#include <linux/signal.h>
58#include <linux/module.h> 58#include <linux/export.h>
59#include <linux/magic.h> 59#include <linux/magic.h>
60#include <linux/pid.h> 60#include <linux/pid.h>
61#include <linux/nsproxy.h> 61#include <linux/nsproxy.h>
@@ -854,7 +854,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
854{ 854{
855 struct task_struct *new_owner; 855 struct task_struct *new_owner;
856 struct futex_pi_state *pi_state = this->pi_state; 856 struct futex_pi_state *pi_state = this->pi_state;
857 u32 curval, newval; 857 u32 uninitialized_var(curval), newval;
858 858
859 if (!pi_state) 859 if (!pi_state)
860 return -EINVAL; 860 return -EINVAL;
@@ -916,7 +916,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
916 916
917static int unlock_futex_pi(u32 __user *uaddr, u32 uval) 917static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
918{ 918{
919 u32 oldval; 919 u32 uninitialized_var(oldval);
920 920
921 /* 921 /*
922 * There is no waiter, so we unlock the futex. The owner died 922 * There is no waiter, so we unlock the futex. The owner died
@@ -1576,7 +1576,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1576 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; 1576 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1577 struct futex_pi_state *pi_state = q->pi_state; 1577 struct futex_pi_state *pi_state = q->pi_state;
1578 struct task_struct *oldowner = pi_state->owner; 1578 struct task_struct *oldowner = pi_state->owner;
1579 u32 uval, curval, newval; 1579 u32 uval, uninitialized_var(curval), newval;
1580 int ret; 1580 int ret;
1581 1581
1582 /* Owner died? */ 1582 /* Owner died? */
@@ -1793,7 +1793,7 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1793 * 1793 *
1794 * Returns: 1794 * Returns:
1795 * 0 - uaddr contains val and hb has been locked 1795 * 0 - uaddr contains val and hb has been locked
1796 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked 1796 * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
1797 */ 1797 */
1798static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, 1798static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
1799 struct futex_q *q, struct futex_hash_bucket **hb) 1799 struct futex_q *q, struct futex_hash_bucket **hb)
@@ -2481,7 +2481,7 @@ err_unlock:
2481 */ 2481 */
2482int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) 2482int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
2483{ 2483{
2484 u32 uval, nval, mval; 2484 u32 uval, uninitialized_var(nval), mval;
2485 2485
2486retry: 2486retry:
2487 if (get_user(uval, uaddr)) 2487 if (get_user(uval, uaddr))
diff --git a/kernel/groups.c b/kernel/groups.c
index 1cc476d52dd..99b53d1eb7e 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -2,7 +2,7 @@
2 * Supplementary group IDs 2 * Supplementary group IDs
3 */ 3 */
4#include <linux/cred.h> 4#include <linux/cred.h>
5#include <linux/module.h> 5#include <linux/export.h>
6#include <linux/slab.h> 6#include <linux/slab.h>
7#include <linux/security.h> 7#include <linux/security.h>
8#include <linux/syscalls.h> 8#include <linux/syscalls.h>
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 2043c08d36c..ae34bf51682 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -32,7 +32,7 @@
32 */ 32 */
33 33
34#include <linux/cpu.h> 34#include <linux/cpu.h>
35#include <linux/module.h> 35#include <linux/export.h>
36#include <linux/percpu.h> 36#include <linux/percpu.h>
37#include <linux/hrtimer.h> 37#include <linux/hrtimer.h>
38#include <linux/notifier.h> 38#include <linux/notifier.h>
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index ea640120ab8..8b1748d0172 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -13,7 +13,7 @@
13#include <linux/freezer.h> 13#include <linux/freezer.h>
14#include <linux/kthread.h> 14#include <linux/kthread.h>
15#include <linux/lockdep.h> 15#include <linux/lockdep.h>
16#include <linux/module.h> 16#include <linux/export.h>
17#include <linux/sysctl.h> 17#include <linux/sysctl.h>
18 18
19/* 19/*
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index d5a3009da71..f7c543a801d 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -26,7 +26,7 @@
26int irq_set_chip(unsigned int irq, struct irq_chip *chip) 26int irq_set_chip(unsigned int irq, struct irq_chip *chip)
27{ 27{
28 unsigned long flags; 28 unsigned long flags;
29 struct irq_desc *desc = irq_get_desc_lock(irq, &flags); 29 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
30 30
31 if (!desc) 31 if (!desc)
32 return -EINVAL; 32 return -EINVAL;
@@ -54,7 +54,7 @@ EXPORT_SYMBOL(irq_set_chip);
54int irq_set_irq_type(unsigned int irq, unsigned int type) 54int irq_set_irq_type(unsigned int irq, unsigned int type)
55{ 55{
56 unsigned long flags; 56 unsigned long flags;
57 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); 57 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
58 int ret = 0; 58 int ret = 0;
59 59
60 if (!desc) 60 if (!desc)
@@ -78,7 +78,7 @@ EXPORT_SYMBOL(irq_set_irq_type);
78int irq_set_handler_data(unsigned int irq, void *data) 78int irq_set_handler_data(unsigned int irq, void *data)
79{ 79{
80 unsigned long flags; 80 unsigned long flags;
81 struct irq_desc *desc = irq_get_desc_lock(irq, &flags); 81 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
82 82
83 if (!desc) 83 if (!desc)
84 return -EINVAL; 84 return -EINVAL;
@@ -98,7 +98,7 @@ EXPORT_SYMBOL(irq_set_handler_data);
98int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry) 98int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
99{ 99{
100 unsigned long flags; 100 unsigned long flags;
101 struct irq_desc *desc = irq_get_desc_lock(irq, &flags); 101 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
102 102
103 if (!desc) 103 if (!desc)
104 return -EINVAL; 104 return -EINVAL;
@@ -119,7 +119,7 @@ int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
119int irq_set_chip_data(unsigned int irq, void *data) 119int irq_set_chip_data(unsigned int irq, void *data)
120{ 120{
121 unsigned long flags; 121 unsigned long flags;
122 struct irq_desc *desc = irq_get_desc_lock(irq, &flags); 122 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
123 123
124 if (!desc) 124 if (!desc)
125 return -EINVAL; 125 return -EINVAL;
@@ -178,7 +178,7 @@ void irq_shutdown(struct irq_desc *desc)
178 desc->depth = 1; 178 desc->depth = 1;
179 if (desc->irq_data.chip->irq_shutdown) 179 if (desc->irq_data.chip->irq_shutdown)
180 desc->irq_data.chip->irq_shutdown(&desc->irq_data); 180 desc->irq_data.chip->irq_shutdown(&desc->irq_data);
181 if (desc->irq_data.chip->irq_disable) 181 else if (desc->irq_data.chip->irq_disable)
182 desc->irq_data.chip->irq_disable(&desc->irq_data); 182 desc->irq_data.chip->irq_disable(&desc->irq_data);
183 else 183 else
184 desc->irq_data.chip->irq_mask(&desc->irq_data); 184 desc->irq_data.chip->irq_mask(&desc->irq_data);
@@ -204,6 +204,24 @@ void irq_disable(struct irq_desc *desc)
204 } 204 }
205} 205}
206 206
207void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu)
208{
209 if (desc->irq_data.chip->irq_enable)
210 desc->irq_data.chip->irq_enable(&desc->irq_data);
211 else
212 desc->irq_data.chip->irq_unmask(&desc->irq_data);
213 cpumask_set_cpu(cpu, desc->percpu_enabled);
214}
215
216void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu)
217{
218 if (desc->irq_data.chip->irq_disable)
219 desc->irq_data.chip->irq_disable(&desc->irq_data);
220 else
221 desc->irq_data.chip->irq_mask(&desc->irq_data);
222 cpumask_clear_cpu(cpu, desc->percpu_enabled);
223}
224
207static inline void mask_ack_irq(struct irq_desc *desc) 225static inline void mask_ack_irq(struct irq_desc *desc)
208{ 226{
209 if (desc->irq_data.chip->irq_mask_ack) 227 if (desc->irq_data.chip->irq_mask_ack)
@@ -544,12 +562,44 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
544 chip->irq_eoi(&desc->irq_data); 562 chip->irq_eoi(&desc->irq_data);
545} 563}
546 564
565/**
566 * handle_percpu_devid_irq - Per CPU local irq handler with per cpu dev ids
567 * @irq: the interrupt number
568 * @desc: the interrupt description structure for this irq
569 *
570 * Per CPU interrupts on SMP machines without locking requirements. Same as
571 * handle_percpu_irq() above but with the following extras:
572 *
573 * action->percpu_dev_id is a pointer to percpu variables which
574 * contain the real device id for the cpu on which this handler is
575 * called
576 */
577void handle_percpu_devid_irq(unsigned int irq, struct irq_desc *desc)
578{
579 struct irq_chip *chip = irq_desc_get_chip(desc);
580 struct irqaction *action = desc->action;
581 void *dev_id = __this_cpu_ptr(action->percpu_dev_id);
582 irqreturn_t res;
583
584 kstat_incr_irqs_this_cpu(irq, desc);
585
586 if (chip->irq_ack)
587 chip->irq_ack(&desc->irq_data);
588
589 trace_irq_handler_entry(irq, action);
590 res = action->handler(irq, dev_id);
591 trace_irq_handler_exit(irq, action, res);
592
593 if (chip->irq_eoi)
594 chip->irq_eoi(&desc->irq_data);
595}
596
547void 597void
548__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, 598__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
549 const char *name) 599 const char *name)
550{ 600{
551 unsigned long flags; 601 unsigned long flags;
552 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); 602 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, 0);
553 603
554 if (!desc) 604 if (!desc)
555 return; 605 return;
@@ -593,7 +643,7 @@ irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
593void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set) 643void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
594{ 644{
595 unsigned long flags; 645 unsigned long flags;
596 struct irq_desc *desc = irq_get_desc_lock(irq, &flags); 646 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
597 647
598 if (!desc) 648 if (!desc)
599 return; 649 return;
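
[Editor's note] The new per-CPU flow handler above is only half of the picture: an interrupt controller driver still has to mark the line as per-CPU-devid and install handle_percpu_devid_irq itself. A minimal sketch of that wiring, assuming a GIC-like controller; my_irq_chip and my_chip_setup_ppi are invented names, not part of this patch:

        #include <linux/irq.h>

        static struct irq_chip my_irq_chip;     /* assumed populated elsewhere */

        static void my_chip_setup_ppi(unsigned int irq)
        {
                /* allocates desc->percpu_enabled and sets IRQ_PER_CPU_DEVID
                 * (see the kernel/irq/irqdesc.c hunk later in this diff) */
                irq_set_percpu_devid(irq);
                irq_set_chip_and_handler(irq, &my_irq_chip,
                                         handle_percpu_devid_irq);
        }

With that in place, each CPU's handler invocation receives its own instance of the percpu dev_id registered through request_percpu_irq() (kernel/irq/manage.c below).
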
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
index 3a2cab407b9..c89295a8f66 100644
--- a/kernel/irq/generic-chip.c
+++ b/kernel/irq/generic-chip.c
@@ -6,6 +6,7 @@
6#include <linux/io.h> 6#include <linux/io.h>
7#include <linux/irq.h> 7#include <linux/irq.h>
8#include <linux/slab.h> 8#include <linux/slab.h>
9#include <linux/export.h>
9#include <linux/interrupt.h> 10#include <linux/interrupt.h>
10#include <linux/kernel_stat.h> 11#include <linux/kernel_stat.h>
11#include <linux/syscore_ops.h> 12#include <linux/syscore_ops.h>
@@ -211,6 +212,7 @@ irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base,
211 } 212 }
212 return gc; 213 return gc;
213} 214}
215EXPORT_SYMBOL_GPL(irq_alloc_generic_chip);
214 216
215/* 217/*
216 * Separate lockdep class for interrupt chip which can nest irq_desc 218 * Separate lockdep class for interrupt chip which can nest irq_desc
@@ -246,7 +248,7 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
246 gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask); 248 gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask);
247 249
248 for (i = gc->irq_base; msk; msk >>= 1, i++) { 250 for (i = gc->irq_base; msk; msk >>= 1, i++) {
249 if (!msk & 0x01) 251 if (!(msk & 0x01))
250 continue; 252 continue;
251 253
252 if (flags & IRQ_GC_INIT_NESTED_LOCK) 254 if (flags & IRQ_GC_INIT_NESTED_LOCK)
@@ -258,6 +260,7 @@ void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
258 } 260 }
259 gc->irq_cnt = i - gc->irq_base; 261 gc->irq_cnt = i - gc->irq_base;
260} 262}
263EXPORT_SYMBOL_GPL(irq_setup_generic_chip);
261 264
262/** 265/**
263 * irq_setup_alt_chip - Switch to alternative chip 266 * irq_setup_alt_chip - Switch to alternative chip
@@ -281,6 +284,7 @@ int irq_setup_alt_chip(struct irq_data *d, unsigned int type)
281 } 284 }
282 return -EINVAL; 285 return -EINVAL;
283} 286}
287EXPORT_SYMBOL_GPL(irq_setup_alt_chip);
284 288
285/** 289/**
286 * irq_remove_generic_chip - Remove a chip 290 * irq_remove_generic_chip - Remove a chip
@@ -301,7 +305,7 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
301 raw_spin_unlock(&gc_lock); 305 raw_spin_unlock(&gc_lock);
302 306
303 for (; msk; msk >>= 1, i++) { 307 for (; msk; msk >>= 1, i++) {
304 if (!msk & 0x01) 308 if (!(msk & 0x01))
305 continue; 309 continue;
306 310
307 /* Remove handler first. That will mask the irq line */ 311 /* Remove handler first. That will mask the irq line */
@@ -311,6 +315,7 @@ void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
311 irq_modify_status(i, clr, set); 315 irq_modify_status(i, clr, set);
312 } 316 }
313} 317}
318EXPORT_SYMBOL_GPL(irq_remove_generic_chip);
314 319
315#ifdef CONFIG_PM 320#ifdef CONFIG_PM
316static int irq_gc_suspend(void) 321static int irq_gc_suspend(void)
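
[Editor's note] Exporting these four helpers is what allows interrupt-chip code built as a module to use the generic-chip machinery at all. A rough sketch of the usual call sequence, with the device, register offset and mask values invented for illustration (the irq_gc_* callbacks are the stock helpers from this file and are assumed to be visible to the caller):

        #include <linux/irq.h>

        static int my_driver_setup_irqs(void __iomem *base, unsigned int irq_base)
        {
                struct irq_chip_generic *gc;
                struct irq_chip_type *ct;

                gc = irq_alloc_generic_chip("my-gpio", 1, irq_base, base,
                                            handle_level_irq);
                if (!gc)
                        return -ENOMEM;

                ct = gc->chip_types;
                /* in this made-up device, writing 1 to the mask register masks */
                ct->chip.irq_mask   = irq_gc_mask_set_bit;
                ct->chip.irq_unmask = irq_gc_mask_clr_bit;
                ct->regs.mask       = 0x04;            /* assumed offset */

                /* eight lines, nested lockdep class, make them requestable */
                irq_setup_generic_chip(gc, IRQ_MSK(8), IRQ_GC_INIT_NESTED_LOCK,
                                       IRQ_NOREQUEST, 0);
                return 0;
        }
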
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 6546431447d..a73dd6c7372 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -71,6 +71,8 @@ extern int irq_startup(struct irq_desc *desc);
71extern void irq_shutdown(struct irq_desc *desc); 71extern void irq_shutdown(struct irq_desc *desc);
72extern void irq_enable(struct irq_desc *desc); 72extern void irq_enable(struct irq_desc *desc);
73extern void irq_disable(struct irq_desc *desc); 73extern void irq_disable(struct irq_desc *desc);
74extern void irq_percpu_enable(struct irq_desc *desc, unsigned int cpu);
75extern void irq_percpu_disable(struct irq_desc *desc, unsigned int cpu);
74extern void mask_irq(struct irq_desc *desc); 76extern void mask_irq(struct irq_desc *desc);
75extern void unmask_irq(struct irq_desc *desc); 77extern void unmask_irq(struct irq_desc *desc);
76 78
@@ -114,14 +116,21 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc)
114 desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data); 116 desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data);
115} 117}
116 118
119#define _IRQ_DESC_CHECK (1 << 0)
120#define _IRQ_DESC_PERCPU (1 << 1)
121
122#define IRQ_GET_DESC_CHECK_GLOBAL (_IRQ_DESC_CHECK)
123#define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU)
124
117struct irq_desc * 125struct irq_desc *
118__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus); 126__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
127 unsigned int check);
119void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus); 128void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus);
120 129
121static inline struct irq_desc * 130static inline struct irq_desc *
122irq_get_desc_buslock(unsigned int irq, unsigned long *flags) 131irq_get_desc_buslock(unsigned int irq, unsigned long *flags, unsigned int check)
123{ 132{
124 return __irq_get_desc_lock(irq, flags, true); 133 return __irq_get_desc_lock(irq, flags, true, check);
125} 134}
126 135
127static inline void 136static inline void
@@ -131,9 +140,9 @@ irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags)
131} 140}
132 141
133static inline struct irq_desc * 142static inline struct irq_desc *
134irq_get_desc_lock(unsigned int irq, unsigned long *flags) 143irq_get_desc_lock(unsigned int irq, unsigned long *flags, unsigned int check)
135{ 144{
136 return __irq_get_desc_lock(irq, flags, false); 145 return __irq_get_desc_lock(irq, flags, false, check);
137} 146}
138 147
139static inline void 148static inline void
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 4c60a50e66b..d86e254b95e 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -9,7 +9,7 @@
9 */ 9 */
10#include <linux/irq.h> 10#include <linux/irq.h>
11#include <linux/slab.h> 11#include <linux/slab.h>
12#include <linux/module.h> 12#include <linux/export.h>
13#include <linux/interrupt.h> 13#include <linux/interrupt.h>
14#include <linux/kernel_stat.h> 14#include <linux/kernel_stat.h>
15#include <linux/radix-tree.h> 15#include <linux/radix-tree.h>
@@ -70,7 +70,8 @@ static inline void desc_smp_init(struct irq_desc *desc, int node) { }
70static inline int desc_node(struct irq_desc *desc) { return 0; } 70static inline int desc_node(struct irq_desc *desc) { return 0; }
71#endif 71#endif
72 72
73static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node) 73static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
74 struct module *owner)
74{ 75{
75 int cpu; 76 int cpu;
76 77
@@ -86,6 +87,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
86 desc->irq_count = 0; 87 desc->irq_count = 0;
87 desc->irqs_unhandled = 0; 88 desc->irqs_unhandled = 0;
88 desc->name = NULL; 89 desc->name = NULL;
90 desc->owner = owner;
89 for_each_possible_cpu(cpu) 91 for_each_possible_cpu(cpu)
90 *per_cpu_ptr(desc->kstat_irqs, cpu) = 0; 92 *per_cpu_ptr(desc->kstat_irqs, cpu) = 0;
91 desc_smp_init(desc, node); 93 desc_smp_init(desc, node);
@@ -128,7 +130,7 @@ static void free_masks(struct irq_desc *desc)
128static inline void free_masks(struct irq_desc *desc) { } 130static inline void free_masks(struct irq_desc *desc) { }
129#endif 131#endif
130 132
131static struct irq_desc *alloc_desc(int irq, int node) 133static struct irq_desc *alloc_desc(int irq, int node, struct module *owner)
132{ 134{
133 struct irq_desc *desc; 135 struct irq_desc *desc;
134 gfp_t gfp = GFP_KERNEL; 136 gfp_t gfp = GFP_KERNEL;
@@ -147,7 +149,7 @@ static struct irq_desc *alloc_desc(int irq, int node)
147 raw_spin_lock_init(&desc->lock); 149 raw_spin_lock_init(&desc->lock);
148 lockdep_set_class(&desc->lock, &irq_desc_lock_class); 150 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
149 151
150 desc_set_defaults(irq, desc, node); 152 desc_set_defaults(irq, desc, node, owner);
151 153
152 return desc; 154 return desc;
153 155
@@ -173,13 +175,14 @@ static void free_desc(unsigned int irq)
173 kfree(desc); 175 kfree(desc);
174} 176}
175 177
176static int alloc_descs(unsigned int start, unsigned int cnt, int node) 178static int alloc_descs(unsigned int start, unsigned int cnt, int node,
179 struct module *owner)
177{ 180{
178 struct irq_desc *desc; 181 struct irq_desc *desc;
179 int i; 182 int i;
180 183
181 for (i = 0; i < cnt; i++) { 184 for (i = 0; i < cnt; i++) {
182 desc = alloc_desc(start + i, node); 185 desc = alloc_desc(start + i, node, owner);
183 if (!desc) 186 if (!desc)
184 goto err; 187 goto err;
185 mutex_lock(&sparse_irq_lock); 188 mutex_lock(&sparse_irq_lock);
@@ -227,7 +230,7 @@ int __init early_irq_init(void)
227 nr_irqs = initcnt; 230 nr_irqs = initcnt;
228 231
229 for (i = 0; i < initcnt; i++) { 232 for (i = 0; i < initcnt; i++) {
230 desc = alloc_desc(i, node); 233 desc = alloc_desc(i, node, NULL);
231 set_bit(i, allocated_irqs); 234 set_bit(i, allocated_irqs);
232 irq_insert_desc(i, desc); 235 irq_insert_desc(i, desc);
233 } 236 }
@@ -261,7 +264,7 @@ int __init early_irq_init(void)
261 alloc_masks(&desc[i], GFP_KERNEL, node); 264 alloc_masks(&desc[i], GFP_KERNEL, node);
262 raw_spin_lock_init(&desc[i].lock); 265 raw_spin_lock_init(&desc[i].lock);
263 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); 266 lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
264 desc_set_defaults(i, &desc[i], node); 267 desc_set_defaults(i, &desc[i], node, NULL);
265 } 268 }
266 return arch_early_irq_init(); 269 return arch_early_irq_init();
267} 270}
@@ -276,8 +279,16 @@ static void free_desc(unsigned int irq)
276 dynamic_irq_cleanup(irq); 279 dynamic_irq_cleanup(irq);
277} 280}
278 281
279static inline int alloc_descs(unsigned int start, unsigned int cnt, int node) 282static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
283 struct module *owner)
280{ 284{
285 u32 i;
286
287 for (i = 0; i < cnt; i++) {
288 struct irq_desc *desc = irq_to_desc(start + i);
289
290 desc->owner = owner;
291 }
281 return start; 292 return start;
282} 293}
283 294
@@ -333,11 +344,13 @@ EXPORT_SYMBOL_GPL(irq_free_descs);
333 * @from: Start the search from this irq number 344 * @from: Start the search from this irq number
334 * @cnt: Number of consecutive irqs to allocate. 345 * @cnt: Number of consecutive irqs to allocate.
335 * @node: Preferred node on which the irq descriptor should be allocated 346 * @node: Preferred node on which the irq descriptor should be allocated
347 * @owner: Owning module (can be NULL)
336 * 348 *
337 * Returns the first irq number or error code 349 * Returns the first irq number or error code
338 */ 350 */
339int __ref 351int __ref
340irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node) 352__irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
353 struct module *owner)
341{ 354{
342 int start, ret; 355 int start, ret;
343 356
@@ -366,13 +379,13 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
366 379
367 bitmap_set(allocated_irqs, start, cnt); 380 bitmap_set(allocated_irqs, start, cnt);
368 mutex_unlock(&sparse_irq_lock); 381 mutex_unlock(&sparse_irq_lock);
369 return alloc_descs(start, cnt, node); 382 return alloc_descs(start, cnt, node, owner);
370 383
371err: 384err:
372 mutex_unlock(&sparse_irq_lock); 385 mutex_unlock(&sparse_irq_lock);
373 return ret; 386 return ret;
374} 387}
375EXPORT_SYMBOL_GPL(irq_alloc_descs); 388EXPORT_SYMBOL_GPL(__irq_alloc_descs);
376 389
377/** 390/**
378 * irq_reserve_irqs - mark irqs allocated 391 * irq_reserve_irqs - mark irqs allocated
@@ -411,11 +424,22 @@ unsigned int irq_get_next_irq(unsigned int offset)
411} 424}
412 425
413struct irq_desc * 426struct irq_desc *
414__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus) 427__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
428 unsigned int check)
415{ 429{
416 struct irq_desc *desc = irq_to_desc(irq); 430 struct irq_desc *desc = irq_to_desc(irq);
417 431
418 if (desc) { 432 if (desc) {
433 if (check & _IRQ_DESC_CHECK) {
434 if ((check & _IRQ_DESC_PERCPU) &&
435 !irq_settings_is_per_cpu_devid(desc))
436 return NULL;
437
438 if (!(check & _IRQ_DESC_PERCPU) &&
439 irq_settings_is_per_cpu_devid(desc))
440 return NULL;
441 }
442
419 if (bus) 443 if (bus)
420 chip_bus_lock(desc); 444 chip_bus_lock(desc);
421 raw_spin_lock_irqsave(&desc->lock, *flags); 445 raw_spin_lock_irqsave(&desc->lock, *flags);
@@ -430,6 +454,25 @@ void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus)
430 chip_bus_sync_unlock(desc); 454 chip_bus_sync_unlock(desc);
431} 455}
432 456
457int irq_set_percpu_devid(unsigned int irq)
458{
459 struct irq_desc *desc = irq_to_desc(irq);
460
461 if (!desc)
462 return -EINVAL;
463
464 if (desc->percpu_enabled)
465 return -EINVAL;
466
467 desc->percpu_enabled = kzalloc(sizeof(*desc->percpu_enabled), GFP_KERNEL);
468
469 if (!desc->percpu_enabled)
470 return -ENOMEM;
471
472 irq_set_percpu_devid_flags(irq);
473 return 0;
474}
475
433/** 476/**
434 * dynamic_irq_cleanup - cleanup a dynamically allocated irq 477 * dynamic_irq_cleanup - cleanup a dynamically allocated irq
435 * @irq: irq number to initialize 478 * @irq: irq number to initialize
@@ -440,7 +483,7 @@ void dynamic_irq_cleanup(unsigned int irq)
440 unsigned long flags; 483 unsigned long flags;
441 484
442 raw_spin_lock_irqsave(&desc->lock, flags); 485 raw_spin_lock_irqsave(&desc->lock, flags);
443 desc_set_defaults(irq, desc, desc_node(desc)); 486 desc_set_defaults(irq, desc, desc_node(desc), NULL);
444 raw_spin_unlock_irqrestore(&desc->lock, flags); 487 raw_spin_unlock_irqrestore(&desc->lock, flags);
445} 488}
446 489
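
[Editor's note] Two driver-visible pieces fall out of this file. First, the owner argument: together with the matching header change (not shown here), which is assumed to make irq_alloc_descs() pass THIS_MODULE into __irq_alloc_descs(), a loadable interrupt controller is pinned for as long as any of its descriptors has a handler installed. Second, irq_set_percpu_devid() is what controller code calls on banked per-CPU lines before they can be claimed with the per-CPU request API. A hedged sketch with invented names:

        #include <linux/irq.h>

        static int __init my_intc_init(int node)
        {
                /* 32 descriptors, search from 16; owner = THIS_MODULE (assumed) */
                int base = irq_alloc_descs(-1, 16, 32, node);

                if (base < 0)
                        return base;

                /* e.g. the banked per-CPU timer line of this controller */
                irq_set_percpu_devid(base);
                return base;
        }
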
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index d5828da3fd3..200ce832c58 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -20,16 +20,20 @@ static DEFINE_MUTEX(irq_domain_mutex);
20void irq_domain_add(struct irq_domain *domain) 20void irq_domain_add(struct irq_domain *domain)
21{ 21{
22 struct irq_data *d; 22 struct irq_data *d;
23 int hwirq; 23 int hwirq, irq;
24 24
25 /* 25 /*
26 * This assumes that the irq_domain owner has already allocated 26 * This assumes that the irq_domain owner has already allocated
27 * the irq_descs. This block will be removed when support for dynamic 27 * the irq_descs. This block will be removed when support for dynamic
28 * allocation of irq_descs is added to irq_domain. 28 * allocation of irq_descs is added to irq_domain.
29 */ 29 */
30 for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) { 30 irq_domain_for_each_irq(domain, hwirq, irq) {
31 d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq)); 31 d = irq_get_irq_data(irq);
32 if (d || d->domain) { 32 if (!d) {
33 WARN(1, "error: assigning domain to non existant irq_desc");
34 return;
35 }
36 if (d->domain) {
33 /* things are broken; just report, don't clean up */ 37 /* things are broken; just report, don't clean up */
34 WARN(1, "error: irq_desc already assigned to a domain"); 38 WARN(1, "error: irq_desc already assigned to a domain");
35 return; 39 return;
@@ -50,15 +54,15 @@ void irq_domain_add(struct irq_domain *domain)
50void irq_domain_del(struct irq_domain *domain) 54void irq_domain_del(struct irq_domain *domain)
51{ 55{
52 struct irq_data *d; 56 struct irq_data *d;
53 int hwirq; 57 int hwirq, irq;
54 58
55 mutex_lock(&irq_domain_mutex); 59 mutex_lock(&irq_domain_mutex);
56 list_del(&domain->list); 60 list_del(&domain->list);
57 mutex_unlock(&irq_domain_mutex); 61 mutex_unlock(&irq_domain_mutex);
58 62
59 /* Clear the irq_domain assignments */ 63 /* Clear the irq_domain assignments */
60 for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) { 64 irq_domain_for_each_irq(domain, hwirq, irq) {
61 d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq)); 65 d = irq_get_irq_data(irq);
62 d->domain = NULL; 66 d->domain = NULL;
63 } 67 }
64} 68}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 0a7840aeb0f..1da999f5e74 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -195,7 +195,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
195int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m) 195int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
196{ 196{
197 unsigned long flags; 197 unsigned long flags;
198 struct irq_desc *desc = irq_get_desc_lock(irq, &flags); 198 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
199 199
200 if (!desc) 200 if (!desc)
201 return -EINVAL; 201 return -EINVAL;
@@ -356,7 +356,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
356static int __disable_irq_nosync(unsigned int irq) 356static int __disable_irq_nosync(unsigned int irq)
357{ 357{
358 unsigned long flags; 358 unsigned long flags;
359 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); 359 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
360 360
361 if (!desc) 361 if (!desc)
362 return -EINVAL; 362 return -EINVAL;
@@ -448,7 +448,7 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
448void enable_irq(unsigned int irq) 448void enable_irq(unsigned int irq)
449{ 449{
450 unsigned long flags; 450 unsigned long flags;
451 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); 451 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
452 452
453 if (!desc) 453 if (!desc)
454 return; 454 return;
@@ -467,6 +467,9 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
467 struct irq_desc *desc = irq_to_desc(irq); 467 struct irq_desc *desc = irq_to_desc(irq);
468 int ret = -ENXIO; 468 int ret = -ENXIO;
469 469
470 if (irq_desc_get_chip(desc)->flags & IRQCHIP_SKIP_SET_WAKE)
471 return 0;
472
470 if (desc->irq_data.chip->irq_set_wake) 473 if (desc->irq_data.chip->irq_set_wake)
471 ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on); 474 ret = desc->irq_data.chip->irq_set_wake(&desc->irq_data, on);
472 475
@@ -488,7 +491,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
488int irq_set_irq_wake(unsigned int irq, unsigned int on) 491int irq_set_irq_wake(unsigned int irq, unsigned int on)
489{ 492{
490 unsigned long flags; 493 unsigned long flags;
491 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags); 494 struct irq_desc *desc = irq_get_desc_buslock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
492 int ret = 0; 495 int ret = 0;
493 496
494 if (!desc) 497 if (!desc)
@@ -529,7 +532,7 @@ EXPORT_SYMBOL(irq_set_irq_wake);
529int can_request_irq(unsigned int irq, unsigned long irqflags) 532int can_request_irq(unsigned int irq, unsigned long irqflags)
530{ 533{
531 unsigned long flags; 534 unsigned long flags;
532 struct irq_desc *desc = irq_get_desc_lock(irq, &flags); 535 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
533 int canrequest = 0; 536 int canrequest = 0;
534 537
535 if (!desc) 538 if (!desc)
@@ -620,8 +623,9 @@ static irqreturn_t irq_nested_primary_handler(int irq, void *dev_id)
620 623
621static int irq_wait_for_interrupt(struct irqaction *action) 624static int irq_wait_for_interrupt(struct irqaction *action)
622{ 625{
626 set_current_state(TASK_INTERRUPTIBLE);
627
623 while (!kthread_should_stop()) { 628 while (!kthread_should_stop()) {
624 set_current_state(TASK_INTERRUPTIBLE);
625 629
626 if (test_and_clear_bit(IRQTF_RUNTHREAD, 630 if (test_and_clear_bit(IRQTF_RUNTHREAD,
627 &action->thread_flags)) { 631 &action->thread_flags)) {
@@ -629,7 +633,9 @@ static int irq_wait_for_interrupt(struct irqaction *action)
629 return 0; 633 return 0;
630 } 634 }
631 schedule(); 635 schedule();
636 set_current_state(TASK_INTERRUPTIBLE);
632 } 637 }
638 __set_current_state(TASK_RUNNING);
633 return -1; 639 return -1;
634} 640}
635 641
@@ -883,6 +889,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
883 889
884 if (desc->irq_data.chip == &no_irq_chip) 890 if (desc->irq_data.chip == &no_irq_chip)
885 return -ENOSYS; 891 return -ENOSYS;
892 if (!try_module_get(desc->owner))
893 return -ENODEV;
886 /* 894 /*
887 * Some drivers like serial.c use request_irq() heavily, 895 * Some drivers like serial.c use request_irq() heavily,
888 * so we have to be careful not to interfere with a 896 * so we have to be careful not to interfere with a
@@ -906,8 +914,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
906 */ 914 */
907 nested = irq_settings_is_nested_thread(desc); 915 nested = irq_settings_is_nested_thread(desc);
908 if (nested) { 916 if (nested) {
909 if (!new->thread_fn) 917 if (!new->thread_fn) {
910 return -EINVAL; 918 ret = -EINVAL;
919 goto out_mput;
920 }
911 /* 921 /*
912 * Replace the primary handler which was provided from 922 * Replace the primary handler which was provided from
913 * the driver for non nested interrupt handling by the 923 * the driver for non nested interrupt handling by the
@@ -929,8 +939,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
929 939
930 t = kthread_create(irq_thread, new, "irq/%d-%s", irq, 940 t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
931 new->name); 941 new->name);
932 if (IS_ERR(t)) 942 if (IS_ERR(t)) {
933 return PTR_ERR(t); 943 ret = PTR_ERR(t);
944 goto out_mput;
945 }
934 /* 946 /*
935 * We keep the reference to the task struct even if 947 * We keep the reference to the task struct even if
936 * the thread dies to avoid that the interrupt code 948 * the thread dies to avoid that the interrupt code
@@ -1095,6 +1107,8 @@ out_thread:
1095 kthread_stop(t); 1107 kthread_stop(t);
1096 put_task_struct(t); 1108 put_task_struct(t);
1097 } 1109 }
1110out_mput:
1111 module_put(desc->owner);
1098 return ret; 1112 return ret;
1099} 1113}
1100 1114
@@ -1110,6 +1124,8 @@ int setup_irq(unsigned int irq, struct irqaction *act)
1110 int retval; 1124 int retval;
1111 struct irq_desc *desc = irq_to_desc(irq); 1125 struct irq_desc *desc = irq_to_desc(irq);
1112 1126
1127 if (WARN_ON(irq_settings_is_per_cpu_devid(desc)))
1128 return -EINVAL;
1113 chip_bus_lock(desc); 1129 chip_bus_lock(desc);
1114 retval = __setup_irq(irq, desc, act); 1130 retval = __setup_irq(irq, desc, act);
1115 chip_bus_sync_unlock(desc); 1131 chip_bus_sync_unlock(desc);
@@ -1118,7 +1134,7 @@ int setup_irq(unsigned int irq, struct irqaction *act)
1118} 1134}
1119EXPORT_SYMBOL_GPL(setup_irq); 1135EXPORT_SYMBOL_GPL(setup_irq);
1120 1136
1121 /* 1137/*
1122 * Internal function to unregister an irqaction - used to free 1138 * Internal function to unregister an irqaction - used to free
1123 * regular and special interrupts that are part of the architecture. 1139 * regular and special interrupts that are part of the architecture.
1124 */ 1140 */
@@ -1203,6 +1219,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1203 put_task_struct(action->thread); 1219 put_task_struct(action->thread);
1204 } 1220 }
1205 1221
1222 module_put(desc->owner);
1206 return action; 1223 return action;
1207} 1224}
1208 1225
@@ -1215,7 +1232,10 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
1215 */ 1232 */
1216void remove_irq(unsigned int irq, struct irqaction *act) 1233void remove_irq(unsigned int irq, struct irqaction *act)
1217{ 1234{
1218 __free_irq(irq, act->dev_id); 1235 struct irq_desc *desc = irq_to_desc(irq);
1236
1237 if (desc && !WARN_ON(irq_settings_is_per_cpu_devid(desc)))
1238 __free_irq(irq, act->dev_id);
1219} 1239}
1220EXPORT_SYMBOL_GPL(remove_irq); 1240EXPORT_SYMBOL_GPL(remove_irq);
1221 1241
@@ -1237,7 +1257,7 @@ void free_irq(unsigned int irq, void *dev_id)
1237{ 1257{
1238 struct irq_desc *desc = irq_to_desc(irq); 1258 struct irq_desc *desc = irq_to_desc(irq);
1239 1259
1240 if (!desc) 1260 if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc)))
1241 return; 1261 return;
1242 1262
1243#ifdef CONFIG_SMP 1263#ifdef CONFIG_SMP
@@ -1315,7 +1335,8 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
1315 if (!desc) 1335 if (!desc)
1316 return -EINVAL; 1336 return -EINVAL;
1317 1337
1318 if (!irq_settings_can_request(desc)) 1338 if (!irq_settings_can_request(desc) ||
1339 WARN_ON(irq_settings_is_per_cpu_devid(desc)))
1319 return -EINVAL; 1340 return -EINVAL;
1320 1341
1321 if (!handler) { 1342 if (!handler) {
@@ -1400,3 +1421,194 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler,
1400 return !ret ? IRQC_IS_HARDIRQ : ret; 1421 return !ret ? IRQC_IS_HARDIRQ : ret;
1401} 1422}
1402EXPORT_SYMBOL_GPL(request_any_context_irq); 1423EXPORT_SYMBOL_GPL(request_any_context_irq);
1424
1425void enable_percpu_irq(unsigned int irq, unsigned int type)
1426{
1427 unsigned int cpu = smp_processor_id();
1428 unsigned long flags;
1429 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);
1430
1431 if (!desc)
1432 return;
1433
1434 type &= IRQ_TYPE_SENSE_MASK;
1435 if (type != IRQ_TYPE_NONE) {
1436 int ret;
1437
1438 ret = __irq_set_trigger(desc, irq, type);
1439
1440 if (ret) {
1441 WARN(1, "failed to set type for IRQ%d\n", irq);
1442 goto out;
1443 }
1444 }
1445
1446 irq_percpu_enable(desc, cpu);
1447out:
1448 irq_put_desc_unlock(desc, flags);
1449}
1450
1451void disable_percpu_irq(unsigned int irq)
1452{
1453 unsigned int cpu = smp_processor_id();
1454 unsigned long flags;
1455 struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU);
1456
1457 if (!desc)
1458 return;
1459
1460 irq_percpu_disable(desc, cpu);
1461 irq_put_desc_unlock(desc, flags);
1462}
1463
1464/*
1465 * Internal function to unregister a percpu irqaction.
1466 */
1467static struct irqaction *__free_percpu_irq(unsigned int irq, void __percpu *dev_id)
1468{
1469 struct irq_desc *desc = irq_to_desc(irq);
1470 struct irqaction *action;
1471 unsigned long flags;
1472
1473 WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
1474
1475 if (!desc)
1476 return NULL;
1477
1478 raw_spin_lock_irqsave(&desc->lock, flags);
1479
1480 action = desc->action;
1481 if (!action || action->percpu_dev_id != dev_id) {
1482 WARN(1, "Trying to free already-free IRQ %d\n", irq);
1483 goto bad;
1484 }
1485
1486 if (!cpumask_empty(desc->percpu_enabled)) {
1487 WARN(1, "percpu IRQ %d still enabled on CPU%d!\n",
1488 irq, cpumask_first(desc->percpu_enabled));
1489 goto bad;
1490 }
1491
1492 /* Found it - now remove it from the list of entries: */
1493 desc->action = NULL;
1494
1495 raw_spin_unlock_irqrestore(&desc->lock, flags);
1496
1497 unregister_handler_proc(irq, action);
1498
1499 module_put(desc->owner);
1500 return action;
1501
1502bad:
1503 raw_spin_unlock_irqrestore(&desc->lock, flags);
1504 return NULL;
1505}
1506
1507/**
1508 * remove_percpu_irq - free a per-cpu interrupt
1509 * @irq: Interrupt line to free
1510 * @act: irqaction for the interrupt
1511 *
1512 * Used to remove interrupts statically setup by the early boot process.
1513 */
1514void remove_percpu_irq(unsigned int irq, struct irqaction *act)
1515{
1516 struct irq_desc *desc = irq_to_desc(irq);
1517
1518 if (desc && irq_settings_is_per_cpu_devid(desc))
1519 __free_percpu_irq(irq, act->percpu_dev_id);
1520}
1521
1522/**
1523 * free_percpu_irq - free an interrupt allocated with request_percpu_irq
1524 * @irq: Interrupt line to free
1525 * @dev_id: Device identity to free
1526 *
1527 * Remove a percpu interrupt handler. The handler is removed, but
1528 * the interrupt line is not disabled. This must be done on each
1529 * CPU before calling this function. The function does not return
1530 * until any executing interrupts for this IRQ have completed.
1531 *
1532 * This function must not be called from interrupt context.
1533 */
1534void free_percpu_irq(unsigned int irq, void __percpu *dev_id)
1535{
1536 struct irq_desc *desc = irq_to_desc(irq);
1537
1538 if (!desc || !irq_settings_is_per_cpu_devid(desc))
1539 return;
1540
1541 chip_bus_lock(desc);
1542 kfree(__free_percpu_irq(irq, dev_id));
1543 chip_bus_sync_unlock(desc);
1544}
1545
1546/**
1547 * setup_percpu_irq - setup a per-cpu interrupt
1548 * @irq: Interrupt line to setup
1549 * @act: irqaction for the interrupt
1550 *
1551 * Used to statically setup per-cpu interrupts in the early boot process.
1552 */
1553int setup_percpu_irq(unsigned int irq, struct irqaction *act)
1554{
1555 struct irq_desc *desc = irq_to_desc(irq);
1556 int retval;
1557
1558 if (!desc || !irq_settings_is_per_cpu_devid(desc))
1559 return -EINVAL;
1560 chip_bus_lock(desc);
1561 retval = __setup_irq(irq, desc, act);
1562 chip_bus_sync_unlock(desc);
1563
1564 return retval;
1565}
1566
1567/**
1568 * request_percpu_irq - allocate a percpu interrupt line
1569 * @irq: Interrupt line to allocate
1570 * @handler: Function to be called when the IRQ occurs.
1571 * @devname: An ascii name for the claiming device
1572 * @dev_id: A percpu cookie passed back to the handler function
1573 *
1574 * This call allocates interrupt resources, but doesn't
1575 * automatically enable the interrupt. It has to be done on each
1576 * CPU using enable_percpu_irq().
1577 *
1578 * Dev_id must be globally unique. It is a per-cpu variable, and
1579 * the handler gets called with the interrupted CPU's instance of
1580 * that variable.
1581 */
1582int request_percpu_irq(unsigned int irq, irq_handler_t handler,
1583 const char *devname, void __percpu *dev_id)
1584{
1585 struct irqaction *action;
1586 struct irq_desc *desc;
1587 int retval;
1588
1589 if (!dev_id)
1590 return -EINVAL;
1591
1592 desc = irq_to_desc(irq);
1593 if (!desc || !irq_settings_can_request(desc) ||
1594 !irq_settings_is_per_cpu_devid(desc))
1595 return -EINVAL;
1596
1597 action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
1598 if (!action)
1599 return -ENOMEM;
1600
1601 action->handler = handler;
1602 action->flags = IRQF_PERCPU | IRQF_NO_SUSPEND;
1603 action->name = devname;
1604 action->percpu_dev_id = dev_id;
1605
1606 chip_bus_lock(desc);
1607 retval = __setup_irq(irq, desc, action);
1608 chip_bus_sync_unlock(desc);
1609
1610 if (retval)
1611 kfree(action);
1612
1613 return retval;
1614}
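
[Editor's note] The kerneldoc above states the contract; for orientation, this is roughly how a driver for a banked per-CPU device (an ARM local timer, say) would use the new API end to end. Every my_* name below is invented for the sketch:

        #include <linux/interrupt.h>
        #include <linux/percpu.h>

        struct my_timer { void __iomem *base; };      /* illustrative only */
        static DEFINE_PER_CPU(struct my_timer, my_timers);

        static irqreturn_t my_timer_isr(int irq, void *dev_id)
        {
                struct my_timer *t = dev_id;   /* this CPU's instance of my_timers */

                /* ... ack and reprogram the hardware via t->base ... */
                return IRQ_HANDLED;
        }

        static int __init my_timer_register(unsigned int irq)
        {
                int err;

                /* one registration covers all CPUs; the line stays masked ... */
                err = request_percpu_irq(irq, my_timer_isr, "my-timer", &my_timers);
                if (err)
                        return err;

                /* ... until each CPU unmasks its own copy, typically from the
                 * secondary-CPU bringup path or a hotplug notifier on that CPU */
                enable_percpu_irq(irq, IRQ_TYPE_NONE);
                return 0;
        }

The matching teardown is disable_percpu_irq() on every CPU followed by a single free_percpu_irq(); __free_percpu_irq() above warns if any CPU still has the line enabled.
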
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index f76fc00c987..15e53b1766a 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -9,6 +9,7 @@
9#include <linux/irq.h> 9#include <linux/irq.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12#include <linux/syscore_ops.h>
12 13
13#include "internals.h" 14#include "internals.h"
14 15
@@ -39,25 +40,58 @@ void suspend_device_irqs(void)
39} 40}
40EXPORT_SYMBOL_GPL(suspend_device_irqs); 41EXPORT_SYMBOL_GPL(suspend_device_irqs);
41 42
42/** 43static void resume_irqs(bool want_early)
43 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
44 *
45 * Enable all interrupt lines previously disabled by suspend_device_irqs() that
46 * have the IRQS_SUSPENDED flag set.
47 */
48void resume_device_irqs(void)
49{ 44{
50 struct irq_desc *desc; 45 struct irq_desc *desc;
51 int irq; 46 int irq;
52 47
53 for_each_irq_desc(irq, desc) { 48 for_each_irq_desc(irq, desc) {
54 unsigned long flags; 49 unsigned long flags;
50 bool is_early = desc->action &&
51 desc->action->flags & IRQF_EARLY_RESUME;
52
53 if (is_early != want_early)
54 continue;
55 55
56 raw_spin_lock_irqsave(&desc->lock, flags); 56 raw_spin_lock_irqsave(&desc->lock, flags);
57 __enable_irq(desc, irq, true); 57 __enable_irq(desc, irq, true);
58 raw_spin_unlock_irqrestore(&desc->lock, flags); 58 raw_spin_unlock_irqrestore(&desc->lock, flags);
59 } 59 }
60} 60}
61
62/**
63 * irq_pm_syscore_ops - enable interrupt lines early
64 *
65 * Enable all interrupt lines with %IRQF_EARLY_RESUME set.
66 */
67static void irq_pm_syscore_resume(void)
68{
69 resume_irqs(true);
70}
71
72static struct syscore_ops irq_pm_syscore_ops = {
73 .resume = irq_pm_syscore_resume,
74};
75
76static int __init irq_pm_init_ops(void)
77{
78 register_syscore_ops(&irq_pm_syscore_ops);
79 return 0;
80}
81
82device_initcall(irq_pm_init_ops);
83
84/**
85 * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
86 *
87 * Enable all non-%IRQF_EARLY_RESUME interrupt lines previously
88 * disabled by suspend_device_irqs() that have the IRQS_SUSPENDED flag
89 * set as well as those with %IRQF_FORCE_RESUME.
90 */
91void resume_device_irqs(void)
92{
93 resume_irqs(false);
94}
61EXPORT_SYMBOL_GPL(resume_device_irqs); 95EXPORT_SYMBOL_GPL(resume_device_irqs);
62 96
63/** 97/**
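
[Editor's note] The driver-visible knob behind this split is the IRQF_EARLY_RESUME action flag tested in resume_irqs(). A driver whose interrupt must be live before ordinary device resume runs would request it roughly like this (names are placeholders):

        #include <linux/interrupt.h>

        static irqreturn_t my_evt_handler(int irq, void *dev_id)
        {
                /* can fire again as soon as syscore resume re-enables the line */
                return IRQ_HANDLED;
        }

        static int my_setup(unsigned int irq)
        {
                /* suspended normally, but re-enabled from irq_pm_syscore_resume()
                 * rather than from resume_device_irqs() */
                return request_irq(irq, my_evt_handler, IRQF_EARLY_RESUME,
                                   "my-early-irq", NULL);
        }
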
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
index f1667833d44..1162f1030f1 100644
--- a/kernel/irq/settings.h
+++ b/kernel/irq/settings.h
@@ -13,6 +13,7 @@ enum {
13 _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT, 13 _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT,
14 _IRQ_NO_BALANCING = IRQ_NO_BALANCING, 14 _IRQ_NO_BALANCING = IRQ_NO_BALANCING,
15 _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD, 15 _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD,
16 _IRQ_PER_CPU_DEVID = IRQ_PER_CPU_DEVID,
16 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK, 17 _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
17}; 18};
18 19
@@ -24,6 +25,7 @@ enum {
24#define IRQ_NOTHREAD GOT_YOU_MORON 25#define IRQ_NOTHREAD GOT_YOU_MORON
25#define IRQ_NOAUTOEN GOT_YOU_MORON 26#define IRQ_NOAUTOEN GOT_YOU_MORON
26#define IRQ_NESTED_THREAD GOT_YOU_MORON 27#define IRQ_NESTED_THREAD GOT_YOU_MORON
28#define IRQ_PER_CPU_DEVID GOT_YOU_MORON
27#undef IRQF_MODIFY_MASK 29#undef IRQF_MODIFY_MASK
28#define IRQF_MODIFY_MASK GOT_YOU_MORON 30#define IRQF_MODIFY_MASK GOT_YOU_MORON
29 31
@@ -39,6 +41,11 @@ static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
39 return desc->status_use_accessors & _IRQ_PER_CPU; 41 return desc->status_use_accessors & _IRQ_PER_CPU;
40} 42}
41 43
44static inline bool irq_settings_is_per_cpu_devid(struct irq_desc *desc)
45{
46 return desc->status_use_accessors & _IRQ_PER_CPU_DEVID;
47}
48
42static inline void irq_settings_set_per_cpu(struct irq_desc *desc) 49static inline void irq_settings_set_per_cpu(struct irq_desc *desc)
43{ 50{
44 desc->status_use_accessors |= _IRQ_PER_CPU; 51 desc->status_use_accessors |= _IRQ_PER_CPU;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index aa57d5da18c..dc813a948be 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -84,7 +84,9 @@ static int try_one_irq(int irq, struct irq_desc *desc, bool force)
84 */ 84 */
85 action = desc->action; 85 action = desc->action;
86 if (!action || !(action->flags & IRQF_SHARED) || 86 if (!action || !(action->flags & IRQF_SHARED) ||
87 (action->flags & __IRQF_TIMER) || !action->next) 87 (action->flags & __IRQF_TIMER) ||
88 (action->handler(irq, action->dev_id) == IRQ_HANDLED) ||
89 !action->next)
88 goto out; 90 goto out;
89 91
90 /* Already running on another processor */ 92 /* Already running on another processor */
@@ -115,7 +117,7 @@ static int misrouted_irq(int irq)
115 struct irq_desc *desc; 117 struct irq_desc *desc;
116 int i, ok = 0; 118 int i, ok = 0;
117 119
118 if (atomic_inc_return(&irq_poll_active) == 1) 120 if (atomic_inc_return(&irq_poll_active) != 1)
119 goto out; 121 goto out;
120 122
121 irq_poll_cpu = smp_processor_id(); 123 irq_poll_cpu = smp_processor_id();
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index c58fa7da8ae..c3c46c72046 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -6,9 +6,11 @@
6 */ 6 */
7 7
8#include <linux/kernel.h> 8#include <linux/kernel.h>
9#include <linux/module.h> 9#include <linux/export.h>
10#include <linux/irq_work.h> 10#include <linux/irq_work.h>
11#include <linux/percpu.h>
11#include <linux/hardirq.h> 12#include <linux/hardirq.h>
13#include <asm/processor.h>
12 14
13/* 15/*
14 * An entry can be in one of four states: 16 * An entry can be in one of four states:
@@ -17,54 +19,34 @@
17 * claimed NULL, 3 -> {pending} : claimed to be enqueued 19 * claimed NULL, 3 -> {pending} : claimed to be enqueued
18 * pending next, 3 -> {busy} : queued, pending callback 20 * pending next, 3 -> {busy} : queued, pending callback
19 * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed 21 * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed
20 *
21 * We use the lower two bits of the next pointer to keep PENDING and BUSY
22 * flags.
23 */ 22 */
24 23
25#define IRQ_WORK_PENDING 1UL 24#define IRQ_WORK_PENDING 1UL
26#define IRQ_WORK_BUSY 2UL 25#define IRQ_WORK_BUSY 2UL
27#define IRQ_WORK_FLAGS 3UL 26#define IRQ_WORK_FLAGS 3UL
28 27
29static inline bool irq_work_is_set(struct irq_work *entry, int flags) 28static DEFINE_PER_CPU(struct llist_head, irq_work_list);
30{
31 return (unsigned long)entry->next & flags;
32}
33
34static inline struct irq_work *irq_work_next(struct irq_work *entry)
35{
36 unsigned long next = (unsigned long)entry->next;
37 next &= ~IRQ_WORK_FLAGS;
38 return (struct irq_work *)next;
39}
40
41static inline struct irq_work *next_flags(struct irq_work *entry, int flags)
42{
43 unsigned long next = (unsigned long)entry;
44 next |= flags;
45 return (struct irq_work *)next;
46}
47
48static DEFINE_PER_CPU(struct irq_work *, irq_work_list);
49 29
50/* 30/*
51 * Claim the entry so that no one else will poke at it. 31 * Claim the entry so that no one else will poke at it.
52 */ 32 */
53static bool irq_work_claim(struct irq_work *entry) 33static bool irq_work_claim(struct irq_work *work)
54{ 34{
55 struct irq_work *next, *nflags; 35 unsigned long flags, nflags;
56 36
57 do { 37 for (;;) {
58 next = entry->next; 38 flags = work->flags;
59 if ((unsigned long)next & IRQ_WORK_PENDING) 39 if (flags & IRQ_WORK_PENDING)
60 return false; 40 return false;
61 nflags = next_flags(next, IRQ_WORK_FLAGS); 41 nflags = flags | IRQ_WORK_FLAGS;
62 } while (cmpxchg(&entry->next, next, nflags) != next); 42 if (cmpxchg(&work->flags, flags, nflags) == flags)
43 break;
44 cpu_relax();
45 }
63 46
64 return true; 47 return true;
65} 48}
66 49
67
68void __weak arch_irq_work_raise(void) 50void __weak arch_irq_work_raise(void)
69{ 51{
70 /* 52 /*
@@ -75,20 +57,15 @@ void __weak arch_irq_work_raise(void)
75/* 57/*
76 * Queue the entry and raise the IPI if needed. 58 * Queue the entry and raise the IPI if needed.
77 */ 59 */
78static void __irq_work_queue(struct irq_work *entry) 60static void __irq_work_queue(struct irq_work *work)
79{ 61{
80 struct irq_work *next; 62 bool empty;
81 63
82 preempt_disable(); 64 preempt_disable();
83 65
84 do { 66 empty = llist_add(&work->llnode, &__get_cpu_var(irq_work_list));
85 next = __this_cpu_read(irq_work_list);
86 /* Can assign non-atomic because we keep the flags set. */
87 entry->next = next_flags(next, IRQ_WORK_FLAGS);
88 } while (this_cpu_cmpxchg(irq_work_list, next, entry) != next);
89
90 /* The list was empty, raise self-interrupt to start processing. */ 67 /* The list was empty, raise self-interrupt to start processing. */
91 if (!irq_work_next(entry)) 68 if (empty)
92 arch_irq_work_raise(); 69 arch_irq_work_raise();
93 70
94 preempt_enable(); 71 preempt_enable();
@@ -100,16 +77,16 @@ static void __irq_work_queue(struct irq_work *entry)
100 * 77 *
101 * Can be re-enqueued while the callback is still in progress. 78 * Can be re-enqueued while the callback is still in progress.
102 */ 79 */
103bool irq_work_queue(struct irq_work *entry) 80bool irq_work_queue(struct irq_work *work)
104{ 81{
105 if (!irq_work_claim(entry)) { 82 if (!irq_work_claim(work)) {
106 /* 83 /*
107 * Already enqueued, can't do! 84 * Already enqueued, can't do!
108 */ 85 */
109 return false; 86 return false;
110 } 87 }
111 88
112 __irq_work_queue(entry); 89 __irq_work_queue(work);
113 return true; 90 return true;
114} 91}
115EXPORT_SYMBOL_GPL(irq_work_queue); 92EXPORT_SYMBOL_GPL(irq_work_queue);
@@ -120,34 +97,34 @@ EXPORT_SYMBOL_GPL(irq_work_queue);
120 */ 97 */
121void irq_work_run(void) 98void irq_work_run(void)
122{ 99{
123 struct irq_work *list; 100 struct irq_work *work;
101 struct llist_head *this_list;
102 struct llist_node *llnode;
124 103
125 if (this_cpu_read(irq_work_list) == NULL) 104 this_list = &__get_cpu_var(irq_work_list);
105 if (llist_empty(this_list))
126 return; 106 return;
127 107
128 BUG_ON(!in_irq()); 108 BUG_ON(!in_irq());
129 BUG_ON(!irqs_disabled()); 109 BUG_ON(!irqs_disabled());
130 110
131 list = this_cpu_xchg(irq_work_list, NULL); 111 llnode = llist_del_all(this_list);
132 112 while (llnode != NULL) {
133 while (list != NULL) { 113 work = llist_entry(llnode, struct irq_work, llnode);
134 struct irq_work *entry = list;
135 114
136 list = irq_work_next(list); 115 llnode = llist_next(llnode);
137 116
138 /* 117 /*
139 * Clear the PENDING bit, after this point the @entry 118 * Clear the PENDING bit, after this point the @work
140 * can be re-used. 119 * can be re-used.
141 */ 120 */
142 entry->next = next_flags(NULL, IRQ_WORK_BUSY); 121 work->flags = IRQ_WORK_BUSY;
143 entry->func(entry); 122 work->func(work);
144 /* 123 /*
145 * Clear the BUSY bit and return to the free state if 124 * Clear the BUSY bit and return to the free state if
146 * no-one else claimed it meanwhile. 125 * no-one else claimed it meanwhile.
147 */ 126 */
148 (void)cmpxchg(&entry->next, 127 (void)cmpxchg(&work->flags, IRQ_WORK_BUSY, 0);
149 next_flags(NULL, IRQ_WORK_BUSY),
150 NULL);
151 } 128 }
152} 129}
153EXPORT_SYMBOL_GPL(irq_work_run); 130EXPORT_SYMBOL_GPL(irq_work_run);
@@ -156,11 +133,11 @@ EXPORT_SYMBOL_GPL(irq_work_run);
156 * Synchronize against the irq_work @entry, ensures the entry is not 133 * Synchronize against the irq_work @entry, ensures the entry is not
157 * currently in use. 134 * currently in use.
158 */ 135 */
159void irq_work_sync(struct irq_work *entry) 136void irq_work_sync(struct irq_work *work)
160{ 137{
161 WARN_ON_ONCE(irqs_disabled()); 138 WARN_ON_ONCE(irqs_disabled());
162 139
163 while (irq_work_is_set(entry, IRQ_WORK_BUSY)) 140 while (work->flags & IRQ_WORK_BUSY)
164 cpu_relax(); 141 cpu_relax();
165} 142}
166EXPORT_SYMBOL_GPL(irq_work_sync); 143EXPORT_SYMBOL_GPL(irq_work_sync);
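
[Editor's note] The llist conversion does not change the user-facing API. For context, a typical caller that has to defer work out of NMI or hard-irq context looks roughly like this; init_irq_work() is the header-side initializer and the my_* names are placeholders:

        #include <linux/irq_work.h>

        static void my_deferred(struct irq_work *work)
        {
                /* runs later in hard-irq context, from the self-IPI on
                 * architectures that implement arch_irq_work_raise()
                 * (otherwise from the next timer tick, per the weak default) */
        }

        static struct irq_work my_work;

        static void __init my_init(void)
        {
                init_irq_work(&my_work, my_deferred);
        }

        static void from_nmi_path(void)
        {
                /* NMI-safe after this patch: only a cmpxchg on work->flags
                 * plus an llist_add onto the per-CPU irq_work_list */
                irq_work_queue(&my_work);
        }
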
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index a8ce45097f3..bbdfe2a462a 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -104,6 +104,18 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
104 return 0; 104 return 0;
105} 105}
106 106
107/*
108 * Update code which is definitely not currently executing.
109 * Architectures which need heavyweight synchronization to modify
110 * running code can override this to make the non-live update case
111 * cheaper.
112 */
113void __weak arch_jump_label_transform_static(struct jump_entry *entry,
114 enum jump_label_type type)
115{
116 arch_jump_label_transform(entry, type);
117}
118
107static void __jump_label_update(struct jump_label_key *key, 119static void __jump_label_update(struct jump_label_key *key,
108 struct jump_entry *entry, 120 struct jump_entry *entry,
109 struct jump_entry *stop, int enable) 121 struct jump_entry *stop, int enable)
@@ -121,14 +133,7 @@ static void __jump_label_update(struct jump_label_key *key,
121 } 133 }
122} 134}
123 135
124/* 136void __init jump_label_init(void)
125 * Not all archs need this.
126 */
127void __weak arch_jump_label_text_poke_early(jump_label_t addr)
128{
129}
130
131static __init int jump_label_init(void)
132{ 137{
133 struct jump_entry *iter_start = __start___jump_table; 138 struct jump_entry *iter_start = __start___jump_table;
134 struct jump_entry *iter_stop = __stop___jump_table; 139 struct jump_entry *iter_stop = __stop___jump_table;
@@ -139,22 +144,22 @@ static __init int jump_label_init(void)
139 jump_label_sort_entries(iter_start, iter_stop); 144 jump_label_sort_entries(iter_start, iter_stop);
140 145
141 for (iter = iter_start; iter < iter_stop; iter++) { 146 for (iter = iter_start; iter < iter_stop; iter++) {
142 arch_jump_label_text_poke_early(iter->code); 147 struct jump_label_key *iterk;
143 if (iter->key == (jump_label_t)(unsigned long)key) 148
149 iterk = (struct jump_label_key *)(unsigned long)iter->key;
150 arch_jump_label_transform_static(iter, jump_label_enabled(iterk) ?
151 JUMP_LABEL_ENABLE : JUMP_LABEL_DISABLE);
152 if (iterk == key)
144 continue; 153 continue;
145 154
146 key = (struct jump_label_key *)(unsigned long)iter->key; 155 key = iterk;
147 atomic_set(&key->enabled, 0);
148 key->entries = iter; 156 key->entries = iter;
149#ifdef CONFIG_MODULES 157#ifdef CONFIG_MODULES
150 key->next = NULL; 158 key->next = NULL;
151#endif 159#endif
152 } 160 }
153 jump_label_unlock(); 161 jump_label_unlock();
154
155 return 0;
156} 162}
157early_initcall(jump_label_init);
158 163
159#ifdef CONFIG_MODULES 164#ifdef CONFIG_MODULES
160 165
@@ -212,7 +217,7 @@ void jump_label_apply_nops(struct module *mod)
212 return; 217 return;
213 218
214 for (iter = iter_start; iter < iter_stop; iter++) 219 for (iter = iter_start; iter < iter_stop; iter++)
215 arch_jump_label_text_poke_early(iter->code); 220 arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE);
216} 221}
217 222
218static int jump_label_add_module(struct module *mod) 223static int jump_label_add_module(struct module *mod)
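
[Editor's note] For orientation, the consumer side that these boot and module paths patch into place is the static_branch() interface of this kernel series. A minimal sketch of a key that starts disabled and is flipped at runtime; the feature names are invented:

        #include <linux/jump_label.h>

        static struct jump_label_key my_feature_key;   /* sites start as NOPs */

        static void hot_path(void)
        {
                if (static_branch(&my_feature_key)) {
                        /* rare work; not even branched to until enabled */
                }
        }

        static void enable_my_feature(void)
        {
                /* rewrites every site via arch_jump_label_transform() */
                jump_label_inc(&my_feature_key);
        }
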
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 296fbc84d65..dc7bc082928 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -498,7 +498,7 @@ static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
498 while (hole_end <= crashk_res.end) { 498 while (hole_end <= crashk_res.end) {
499 unsigned long i; 499 unsigned long i;
500 500
501 if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT) 501 if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT)
502 break; 502 break;
503 if (hole_end > crashk_res.end) 503 if (hole_end > crashk_res.end)
504 break; 504 break;
@@ -999,6 +999,7 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
999 kimage_free(xchg(&kexec_crash_image, NULL)); 999 kimage_free(xchg(&kexec_crash_image, NULL));
1000 result = kimage_crash_alloc(&image, entry, 1000 result = kimage_crash_alloc(&image, entry,
1001 nr_segments, segments); 1001 nr_segments, segments);
1002 crash_map_reserved_pages();
1002 } 1003 }
1003 if (result) 1004 if (result)
1004 goto out; 1005 goto out;
@@ -1015,6 +1016,8 @@ SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
1015 goto out; 1016 goto out;
1016 } 1017 }
1017 kimage_terminate(image); 1018 kimage_terminate(image);
1019 if (flags & KEXEC_ON_CRASH)
1020 crash_unmap_reserved_pages();
1018 } 1021 }
1019 /* Install the new kernel, and Uninstall the old */ 1022 /* Install the new kernel, and Uninstall the old */
1020 image = xchg(dest_image, image); 1023 image = xchg(dest_image, image);
@@ -1026,6 +1029,18 @@ out:
1026 return result; 1029 return result;
1027} 1030}
1028 1031
1032/*
1033 * Add and remove page tables for crashkernel memory
1034 *
1035 * Provide an empty default implementation here -- architecture
1036 * code may override this
1037 */
1038void __weak crash_map_reserved_pages(void)
1039{}
1040
1041void __weak crash_unmap_reserved_pages(void)
1042{}
1043
1029#ifdef CONFIG_COMPAT 1044#ifdef CONFIG_COMPAT
1030asmlinkage long compat_sys_kexec_load(unsigned long entry, 1045asmlinkage long compat_sys_kexec_load(unsigned long entry,
1031 unsigned long nr_segments, 1046 unsigned long nr_segments,
@@ -1134,14 +1149,16 @@ int crash_shrink_memory(unsigned long new_size)
1134 goto unlock; 1149 goto unlock;
1135 } 1150 }
1136 1151
1137 start = roundup(start, PAGE_SIZE); 1152 start = roundup(start, KEXEC_CRASH_MEM_ALIGN);
1138 end = roundup(start + new_size, PAGE_SIZE); 1153 end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN);
1139 1154
1155 crash_map_reserved_pages();
1140 crash_free_reserved_phys_range(end, crashk_res.end); 1156 crash_free_reserved_phys_range(end, crashk_res.end);
1141 1157
1142 if ((start == end) && (crashk_res.parent != NULL)) 1158 if ((start == end) && (crashk_res.parent != NULL))
1143 release_resource(&crashk_res); 1159 release_resource(&crashk_res);
1144 crashk_res.end = end - 1; 1160 crashk_res.end = end - 1;
1161 crash_unmap_reserved_pages();
1145 1162
1146unlock: 1163unlock:
1147 mutex_unlock(&kexec_mutex); 1164 mutex_unlock(&kexec_mutex);
@@ -1380,24 +1397,23 @@ int __init parse_crashkernel(char *cmdline,
1380} 1397}
1381 1398
1382 1399
1383 1400static void update_vmcoreinfo_note(void)
1384void crash_save_vmcoreinfo(void)
1385{ 1401{
1386 u32 *buf; 1402 u32 *buf = vmcoreinfo_note;
1387 1403
1388 if (!vmcoreinfo_size) 1404 if (!vmcoreinfo_size)
1389 return; 1405 return;
1390
1391 vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds());
1392
1393 buf = (u32 *)vmcoreinfo_note;
1394
1395 buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, 1406 buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
1396 vmcoreinfo_size); 1407 vmcoreinfo_size);
1397
1398 final_note(buf); 1408 final_note(buf);
1399} 1409}
1400 1410
1411void crash_save_vmcoreinfo(void)
1412{
1413 vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds());
1414 update_vmcoreinfo_note();
1415}
1416
1401void vmcoreinfo_append_str(const char *fmt, ...) 1417void vmcoreinfo_append_str(const char *fmt, ...)
1402{ 1418{
1403 va_list args; 1419 va_list args;
@@ -1483,6 +1499,7 @@ static int __init crash_save_vmcoreinfo_init(void)
1483 VMCOREINFO_NUMBER(PG_swapcache); 1499 VMCOREINFO_NUMBER(PG_swapcache);
1484 1500
1485 arch_crash_save_vmcoreinfo(); 1501 arch_crash_save_vmcoreinfo();
1502 update_vmcoreinfo_note();
1486 1503
1487 return 0; 1504 return 0;
1488} 1505}
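The new weak crash_map_reserved_pages()/crash_unmap_reserved_pages() hooks let an architecture that keeps the crashkernel region unmapped bring it into the page tables only around the operations that actually touch it (loading a crash image, shrinking the reservation). A hedged sketch of such an override; arch_remap_crashkernel() and arch_unmap_crashkernel() are illustrative names, not helpers introduced by this patch:

#include <linux/ioport.h>
#include <linux/kexec.h>

/* Hypothetical architecture override: map the reserved crashkernel
 * range before generic kexec code writes to it, unmap it afterwards. */
void crash_map_reserved_pages(void)
{
	arch_remap_crashkernel(crashk_res.start, resource_size(&crashk_res));
}

void crash_unmap_reserved_pages(void)
{
	arch_unmap_crashkernel(crashk_res.start, resource_size(&crashk_res));
}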
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 01a0700e873..c744b88c44e 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -20,7 +20,7 @@
20 */ 20 */
21 21
22#include <linux/kernel.h> 22#include <linux/kernel.h>
23#include <linux/module.h> 23#include <linux/export.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/err.h> 25#include <linux/err.h>
26#include <linux/log2.h> 26#include <linux/log2.h>
diff --git a/kernel/kmod.c b/kernel/kmod.c
index ddc7644c130..a4bea97c75b 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -114,10 +114,12 @@ int __request_module(bool wait, const char *fmt, ...)
114 atomic_inc(&kmod_concurrent); 114 atomic_inc(&kmod_concurrent);
115 if (atomic_read(&kmod_concurrent) > max_modprobes) { 115 if (atomic_read(&kmod_concurrent) > max_modprobes) {
116 /* We may be blaming an innocent here, but unlikely */ 116 /* We may be blaming an innocent here, but unlikely */
117 if (kmod_loop_msg++ < 5) 117 if (kmod_loop_msg < 5) {
118 printk(KERN_ERR 118 printk(KERN_ERR
119 "request_module: runaway loop modprobe %s\n", 119 "request_module: runaway loop modprobe %s\n",
120 module_name); 120 module_name);
121 kmod_loop_msg++;
122 }
121 atomic_dec(&kmod_concurrent); 123 atomic_dec(&kmod_concurrent);
122 return -ENOMEM; 124 return -ENOMEM;
123 } 125 }
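The kmod.c change looks cosmetic but tightens the cap: the old form, if (kmod_loop_msg++ < 5), kept incrementing the counter on every call, so it could eventually wrap and re-enable the message. Moving the increment inside the branch keeps the warning at five prints for the lifetime of the system. The same idiom in isolation, plain C for illustration only:

#include <stdio.h>

/* Capped-warning idiom: increment only while below the limit, so the
 * counter can never overflow and resurrect the message. */
static int loop_msg;

static void warn_runaway(const char *name)
{
	if (loop_msg < 5) {
		fprintf(stderr,
			"request_module: runaway loop modprobe %s\n", name);
		loop_msg++;
	}
}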
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index b30fd54eb98..e5d84644823 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -36,7 +36,7 @@
36#include <linux/init.h> 36#include <linux/init.h>
37#include <linux/slab.h> 37#include <linux/slab.h>
38#include <linux/stddef.h> 38#include <linux/stddef.h>
39#include <linux/module.h> 39#include <linux/export.h>
40#include <linux/moduleloader.h> 40#include <linux/moduleloader.h>
41#include <linux/kallsyms.h> 41#include <linux/kallsyms.h>
42#include <linux/freezer.h> 42#include <linux/freezer.h>
@@ -78,10 +78,10 @@ static bool kprobes_all_disarmed;
78static DEFINE_MUTEX(kprobe_mutex); 78static DEFINE_MUTEX(kprobe_mutex);
79static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 79static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
80static struct { 80static struct {
81 spinlock_t lock ____cacheline_aligned_in_smp; 81 raw_spinlock_t lock ____cacheline_aligned_in_smp;
82} kretprobe_table_locks[KPROBE_TABLE_SIZE]; 82} kretprobe_table_locks[KPROBE_TABLE_SIZE];
83 83
84static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) 84static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
85{ 85{
86 return &(kretprobe_table_locks[hash].lock); 86 return &(kretprobe_table_locks[hash].lock);
87} 87}
@@ -1013,9 +1013,9 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
1013 hlist_del(&ri->hlist); 1013 hlist_del(&ri->hlist);
1014 INIT_HLIST_NODE(&ri->hlist); 1014 INIT_HLIST_NODE(&ri->hlist);
1015 if (likely(rp)) { 1015 if (likely(rp)) {
1016 spin_lock(&rp->lock); 1016 raw_spin_lock(&rp->lock);
1017 hlist_add_head(&ri->hlist, &rp->free_instances); 1017 hlist_add_head(&ri->hlist, &rp->free_instances);
1018 spin_unlock(&rp->lock); 1018 raw_spin_unlock(&rp->lock);
1019 } else 1019 } else
1020 /* Unregistering */ 1020 /* Unregistering */
1021 hlist_add_head(&ri->hlist, head); 1021 hlist_add_head(&ri->hlist, head);
@@ -1026,19 +1026,19 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
1026__acquires(hlist_lock) 1026__acquires(hlist_lock)
1027{ 1027{
1028 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 1028 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
1029 spinlock_t *hlist_lock; 1029 raw_spinlock_t *hlist_lock;
1030 1030
1031 *head = &kretprobe_inst_table[hash]; 1031 *head = &kretprobe_inst_table[hash];
1032 hlist_lock = kretprobe_table_lock_ptr(hash); 1032 hlist_lock = kretprobe_table_lock_ptr(hash);
1033 spin_lock_irqsave(hlist_lock, *flags); 1033 raw_spin_lock_irqsave(hlist_lock, *flags);
1034} 1034}
1035 1035
1036static void __kprobes kretprobe_table_lock(unsigned long hash, 1036static void __kprobes kretprobe_table_lock(unsigned long hash,
1037 unsigned long *flags) 1037 unsigned long *flags)
1038__acquires(hlist_lock) 1038__acquires(hlist_lock)
1039{ 1039{
1040 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 1040 raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
1041 spin_lock_irqsave(hlist_lock, *flags); 1041 raw_spin_lock_irqsave(hlist_lock, *flags);
1042} 1042}
1043 1043
1044void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, 1044void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
@@ -1046,18 +1046,18 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
1046__releases(hlist_lock) 1046__releases(hlist_lock)
1047{ 1047{
1048 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); 1048 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
1049 spinlock_t *hlist_lock; 1049 raw_spinlock_t *hlist_lock;
1050 1050
1051 hlist_lock = kretprobe_table_lock_ptr(hash); 1051 hlist_lock = kretprobe_table_lock_ptr(hash);
1052 spin_unlock_irqrestore(hlist_lock, *flags); 1052 raw_spin_unlock_irqrestore(hlist_lock, *flags);
1053} 1053}
1054 1054
1055static void __kprobes kretprobe_table_unlock(unsigned long hash, 1055static void __kprobes kretprobe_table_unlock(unsigned long hash,
1056 unsigned long *flags) 1056 unsigned long *flags)
1057__releases(hlist_lock) 1057__releases(hlist_lock)
1058{ 1058{
1059 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); 1059 raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
1060 spin_unlock_irqrestore(hlist_lock, *flags); 1060 raw_spin_unlock_irqrestore(hlist_lock, *flags);
1061} 1061}
1062 1062
1063/* 1063/*
@@ -1663,12 +1663,12 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
1663 1663
1664 /*TODO: consider to only swap the RA after the last pre_handler fired */ 1664 /*TODO: consider to only swap the RA after the last pre_handler fired */
1665 hash = hash_ptr(current, KPROBE_HASH_BITS); 1665 hash = hash_ptr(current, KPROBE_HASH_BITS);
1666 spin_lock_irqsave(&rp->lock, flags); 1666 raw_spin_lock_irqsave(&rp->lock, flags);
1667 if (!hlist_empty(&rp->free_instances)) { 1667 if (!hlist_empty(&rp->free_instances)) {
1668 ri = hlist_entry(rp->free_instances.first, 1668 ri = hlist_entry(rp->free_instances.first,
1669 struct kretprobe_instance, hlist); 1669 struct kretprobe_instance, hlist);
1670 hlist_del(&ri->hlist); 1670 hlist_del(&ri->hlist);
1671 spin_unlock_irqrestore(&rp->lock, flags); 1671 raw_spin_unlock_irqrestore(&rp->lock, flags);
1672 1672
1673 ri->rp = rp; 1673 ri->rp = rp;
1674 ri->task = current; 1674 ri->task = current;
@@ -1685,7 +1685,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
1685 kretprobe_table_unlock(hash, &flags); 1685 kretprobe_table_unlock(hash, &flags);
1686 } else { 1686 } else {
1687 rp->nmissed++; 1687 rp->nmissed++;
1688 spin_unlock_irqrestore(&rp->lock, flags); 1688 raw_spin_unlock_irqrestore(&rp->lock, flags);
1689 } 1689 }
1690 return 0; 1690 return 0;
1691} 1691}
@@ -1721,7 +1721,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
1721 rp->maxactive = num_possible_cpus(); 1721 rp->maxactive = num_possible_cpus();
1722#endif 1722#endif
1723 } 1723 }
1724 spin_lock_init(&rp->lock); 1724 raw_spin_lock_init(&rp->lock);
1725 INIT_HLIST_HEAD(&rp->free_instances); 1725 INIT_HLIST_HEAD(&rp->free_instances);
1726 for (i = 0; i < rp->maxactive; i++) { 1726 for (i = 0; i < rp->maxactive; i++) {
1727 inst = kmalloc(sizeof(struct kretprobe_instance) + 1727 inst = kmalloc(sizeof(struct kretprobe_instance) +
@@ -1959,7 +1959,7 @@ static int __init init_kprobes(void)
1959 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1959 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1960 INIT_HLIST_HEAD(&kprobe_table[i]); 1960 INIT_HLIST_HEAD(&kprobe_table[i]);
1961 INIT_HLIST_HEAD(&kretprobe_inst_table[i]); 1961 INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
1962 spin_lock_init(&(kretprobe_table_locks[i].lock)); 1962 raw_spin_lock_init(&(kretprobe_table_locks[i].lock));
1963 } 1963 }
1964 1964
1965 /* 1965 /*
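The kretprobe hash-table locks and per-kretprobe locks move from spinlock_t to raw_spinlock_t because they are taken from probe handlers, which can fire in contexts where a sleeping lock (what spinlock_t becomes under PREEMPT_RT) is not acceptable. The pattern in isolation, with an illustrative lock name:

#include <linux/spinlock.h>

/* A raw_spinlock_t stays a real spinning lock even on PREEMPT_RT. */
static DEFINE_RAW_SPINLOCK(example_lock);

static void example_touch_shared_state(void)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&example_lock, flags);
	/* ... data also reached from kprobe/kretprobe handlers ... */
	raw_spin_unlock_irqrestore(&example_lock, flags);
}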
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 3b053c04dd8..4e316e1acf5 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -11,10 +11,11 @@
11#include <linux/kobject.h> 11#include <linux/kobject.h>
12#include <linux/string.h> 12#include <linux/string.h>
13#include <linux/sysfs.h> 13#include <linux/sysfs.h>
14#include <linux/module.h> 14#include <linux/export.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/kexec.h> 16#include <linux/kexec.h>
17#include <linux/profile.h> 17#include <linux/profile.h>
18#include <linux/stat.h>
18#include <linux/sched.h> 19#include <linux/sched.h>
19#include <linux/capability.h> 20#include <linux/capability.h>
20 21
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4ba7cccb499..b6d216a9263 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -12,7 +12,7 @@
12#include <linux/cpuset.h> 12#include <linux/cpuset.h>
13#include <linux/unistd.h> 13#include <linux/unistd.h>
14#include <linux/file.h> 14#include <linux/file.h>
15#include <linux/module.h> 15#include <linux/export.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/freezer.h> 18#include <linux/freezer.h>
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 376066e1041..a462b317f9a 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -53,12 +53,12 @@
53#include <linux/notifier.h> 53#include <linux/notifier.h>
54#include <linux/spinlock.h> 54#include <linux/spinlock.h>
55#include <linux/proc_fs.h> 55#include <linux/proc_fs.h>
56#include <linux/module.h> 56#include <linux/export.h>
57#include <linux/sched.h> 57#include <linux/sched.h>
58#include <linux/list.h> 58#include <linux/list.h>
59#include <linux/stacktrace.h> 59#include <linux/stacktrace.h>
60 60
61static DEFINE_SPINLOCK(latency_lock); 61static DEFINE_RAW_SPINLOCK(latency_lock);
62 62
63#define MAXLR 128 63#define MAXLR 128
64static struct latency_record latency_record[MAXLR]; 64static struct latency_record latency_record[MAXLR];
@@ -72,19 +72,19 @@ void clear_all_latency_tracing(struct task_struct *p)
72 if (!latencytop_enabled) 72 if (!latencytop_enabled)
73 return; 73 return;
74 74
75 spin_lock_irqsave(&latency_lock, flags); 75 raw_spin_lock_irqsave(&latency_lock, flags);
76 memset(&p->latency_record, 0, sizeof(p->latency_record)); 76 memset(&p->latency_record, 0, sizeof(p->latency_record));
77 p->latency_record_count = 0; 77 p->latency_record_count = 0;
78 spin_unlock_irqrestore(&latency_lock, flags); 78 raw_spin_unlock_irqrestore(&latency_lock, flags);
79} 79}
80 80
81static void clear_global_latency_tracing(void) 81static void clear_global_latency_tracing(void)
82{ 82{
83 unsigned long flags; 83 unsigned long flags;
84 84
85 spin_lock_irqsave(&latency_lock, flags); 85 raw_spin_lock_irqsave(&latency_lock, flags);
86 memset(&latency_record, 0, sizeof(latency_record)); 86 memset(&latency_record, 0, sizeof(latency_record));
87 spin_unlock_irqrestore(&latency_lock, flags); 87 raw_spin_unlock_irqrestore(&latency_lock, flags);
88} 88}
89 89
90static void __sched 90static void __sched
@@ -190,7 +190,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
190 lat.max = usecs; 190 lat.max = usecs;
191 store_stacktrace(tsk, &lat); 191 store_stacktrace(tsk, &lat);
192 192
193 spin_lock_irqsave(&latency_lock, flags); 193 raw_spin_lock_irqsave(&latency_lock, flags);
194 194
195 account_global_scheduler_latency(tsk, &lat); 195 account_global_scheduler_latency(tsk, &lat);
196 196
@@ -231,7 +231,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
231 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); 231 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));
232 232
233out_unlock: 233out_unlock:
234 spin_unlock_irqrestore(&latency_lock, flags); 234 raw_spin_unlock_irqrestore(&latency_lock, flags);
235} 235}
236 236
237static int lstats_show(struct seq_file *m, void *v) 237static int lstats_show(struct seq_file *m, void *v)
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 8c24294e477..e69434b070d 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -96,8 +96,13 @@ static int graph_lock(void)
96 96
97static inline int graph_unlock(void) 97static inline int graph_unlock(void)
98{ 98{
99 if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) 99 if (debug_locks && !arch_spin_is_locked(&lockdep_lock)) {
100 /*
101 * The lockdep graph lock isn't locked while we expect it to
102 * be, we're confused now, bye!
103 */
100 return DEBUG_LOCKS_WARN_ON(1); 104 return DEBUG_LOCKS_WARN_ON(1);
105 }
101 106
102 current->lockdep_recursion--; 107 current->lockdep_recursion--;
103 arch_spin_unlock(&lockdep_lock); 108 arch_spin_unlock(&lockdep_lock);
@@ -134,6 +139,9 @@ static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
134static inline struct lock_class *hlock_class(struct held_lock *hlock) 139static inline struct lock_class *hlock_class(struct held_lock *hlock)
135{ 140{
136 if (!hlock->class_idx) { 141 if (!hlock->class_idx) {
142 /*
143 * Someone passed in garbage, we give up.
144 */
137 DEBUG_LOCKS_WARN_ON(1); 145 DEBUG_LOCKS_WARN_ON(1);
138 return NULL; 146 return NULL;
139 } 147 }
@@ -687,6 +695,10 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
687 */ 695 */
688 list_for_each_entry(class, hash_head, hash_entry) { 696 list_for_each_entry(class, hash_head, hash_entry) {
689 if (class->key == key) { 697 if (class->key == key) {
698 /*
699 * Huh! same key, different name? Did someone trample
700 * on some memory? We're most confused.
701 */
690 WARN_ON_ONCE(class->name != lock->name); 702 WARN_ON_ONCE(class->name != lock->name);
691 return class; 703 return class;
692 } 704 }
@@ -800,6 +812,10 @@ out_unlock_set:
800 else if (subclass < NR_LOCKDEP_CACHING_CLASSES) 812 else if (subclass < NR_LOCKDEP_CACHING_CLASSES)
801 lock->class_cache[subclass] = class; 813 lock->class_cache[subclass] = class;
802 814
815 /*
816 * Hash collision, did we smoke some? We found a class with a matching
817 * hash but the subclass -- which is hashed in -- didn't match.
818 */
803 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) 819 if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass))
804 return NULL; 820 return NULL;
805 821
@@ -926,7 +942,7 @@ static inline void mark_lock_accessed(struct lock_list *lock,
926 unsigned long nr; 942 unsigned long nr;
927 943
928 nr = lock - list_entries; 944 nr = lock - list_entries;
929 WARN_ON(nr >= nr_list_entries); 945 WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */
930 lock->parent = parent; 946 lock->parent = parent;
931 lock->class->dep_gen_id = lockdep_dependency_gen_id; 947 lock->class->dep_gen_id = lockdep_dependency_gen_id;
932} 948}
@@ -936,7 +952,7 @@ static inline unsigned long lock_accessed(struct lock_list *lock)
936 unsigned long nr; 952 unsigned long nr;
937 953
938 nr = lock - list_entries; 954 nr = lock - list_entries;
939 WARN_ON(nr >= nr_list_entries); 955 WARN_ON(nr >= nr_list_entries); /* Out-of-bounds, input fail */
940 return lock->class->dep_gen_id == lockdep_dependency_gen_id; 956 return lock->class->dep_gen_id == lockdep_dependency_gen_id;
941} 957}
942 958
@@ -1129,10 +1145,11 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth,
1129 if (debug_locks_silent) 1145 if (debug_locks_silent)
1130 return 0; 1146 return 0;
1131 1147
1132 printk("\n=======================================================\n"); 1148 printk("\n");
1133 printk( "[ INFO: possible circular locking dependency detected ]\n"); 1149 printk("======================================================\n");
1150 printk("[ INFO: possible circular locking dependency detected ]\n");
1134 print_kernel_version(); 1151 print_kernel_version();
1135 printk( "-------------------------------------------------------\n"); 1152 printk("-------------------------------------------------------\n");
1136 printk("%s/%d is trying to acquire lock:\n", 1153 printk("%s/%d is trying to acquire lock:\n",
1137 curr->comm, task_pid_nr(curr)); 1154 curr->comm, task_pid_nr(curr));
1138 print_lock(check_src); 1155 print_lock(check_src);
@@ -1196,6 +1213,9 @@ static noinline int print_bfs_bug(int ret)
1196 if (!debug_locks_off_graph_unlock()) 1213 if (!debug_locks_off_graph_unlock())
1197 return 0; 1214 return 0;
1198 1215
1216 /*
1217 * Breadth-first-search failed, graph got corrupted?
1218 */
1199 WARN(1, "lockdep bfs error:%d\n", ret); 1219 WARN(1, "lockdep bfs error:%d\n", ret);
1200 1220
1201 return 0; 1221 return 0;
@@ -1463,11 +1483,12 @@ print_bad_irq_dependency(struct task_struct *curr,
1463 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1483 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
1464 return 0; 1484 return 0;
1465 1485
1466 printk("\n======================================================\n"); 1486 printk("\n");
1467 printk( "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", 1487 printk("======================================================\n");
1488 printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
1468 irqclass, irqclass); 1489 irqclass, irqclass);
1469 print_kernel_version(); 1490 print_kernel_version();
1470 printk( "------------------------------------------------------\n"); 1491 printk("------------------------------------------------------\n");
1471 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", 1492 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
1472 curr->comm, task_pid_nr(curr), 1493 curr->comm, task_pid_nr(curr),
1473 curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, 1494 curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
@@ -1692,10 +1713,11 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
1692 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 1713 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
1693 return 0; 1714 return 0;
1694 1715
1695 printk("\n=============================================\n"); 1716 printk("\n");
1696 printk( "[ INFO: possible recursive locking detected ]\n"); 1717 printk("=============================================\n");
1718 printk("[ INFO: possible recursive locking detected ]\n");
1697 print_kernel_version(); 1719 print_kernel_version();
1698 printk( "---------------------------------------------\n"); 1720 printk("---------------------------------------------\n");
1699 printk("%s/%d is trying to acquire lock:\n", 1721 printk("%s/%d is trying to acquire lock:\n",
1700 curr->comm, task_pid_nr(curr)); 1722 curr->comm, task_pid_nr(curr));
1701 print_lock(next); 1723 print_lock(next);
@@ -1944,6 +1966,11 @@ out_bug:
1944 if (!debug_locks_off_graph_unlock()) 1966 if (!debug_locks_off_graph_unlock())
1945 return 0; 1967 return 0;
1946 1968
1969 /*
1970 * Clearly we all shouldn't be here, but since we made it we
1971 * can reliable say we messed up our state. See the above two
1972 * gotos for reasons why we could possibly end up here.
1973 */
1947 WARN_ON(1); 1974 WARN_ON(1);
1948 1975
1949 return 0; 1976 return 0;
@@ -1975,6 +2002,11 @@ static inline int lookup_chain_cache(struct task_struct *curr,
1975 struct held_lock *hlock_curr, *hlock_next; 2002 struct held_lock *hlock_curr, *hlock_next;
1976 int i, j; 2003 int i, j;
1977 2004
2005 /*
2006 * We might need to take the graph lock, ensure we've got IRQs
2007 * disabled to make this an IRQ-safe lock.. for recursion reasons
2008 * lockdep won't complain about its own locking errors.
2009 */
1978 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2010 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
1979 return 0; 2011 return 0;
1980 /* 2012 /*
@@ -2126,6 +2158,10 @@ static void check_chain_key(struct task_struct *curr)
2126 hlock = curr->held_locks + i; 2158 hlock = curr->held_locks + i;
2127 if (chain_key != hlock->prev_chain_key) { 2159 if (chain_key != hlock->prev_chain_key) {
2128 debug_locks_off(); 2160 debug_locks_off();
2161 /*
2162 * We got mighty confused, our chain keys don't match
2163 * with what we expect, someone trample on our task state?
2164 */
2129 WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n", 2165 WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n",
2130 curr->lockdep_depth, i, 2166 curr->lockdep_depth, i,
2131 (unsigned long long)chain_key, 2167 (unsigned long long)chain_key,
@@ -2133,6 +2169,9 @@ static void check_chain_key(struct task_struct *curr)
2133 return; 2169 return;
2134 } 2170 }
2135 id = hlock->class_idx - 1; 2171 id = hlock->class_idx - 1;
2172 /*
2173 * Whoops ran out of static storage again?
2174 */
2136 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) 2175 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
2137 return; 2176 return;
2138 2177
@@ -2144,6 +2183,10 @@ static void check_chain_key(struct task_struct *curr)
2144 } 2183 }
2145 if (chain_key != curr->curr_chain_key) { 2184 if (chain_key != curr->curr_chain_key) {
2146 debug_locks_off(); 2185 debug_locks_off();
2186 /*
2187 * More smoking hash instead of calculating it, damn see these
2188 * numbers float.. I bet that a pink elephant stepped on my memory.
2189 */
2147 WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n", 2190 WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n",
2148 curr->lockdep_depth, i, 2191 curr->lockdep_depth, i,
2149 (unsigned long long)chain_key, 2192 (unsigned long long)chain_key,
@@ -2177,10 +2220,11 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
2177 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 2220 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
2178 return 0; 2221 return 0;
2179 2222
2180 printk("\n=================================\n"); 2223 printk("\n");
2181 printk( "[ INFO: inconsistent lock state ]\n"); 2224 printk("=================================\n");
2225 printk("[ INFO: inconsistent lock state ]\n");
2182 print_kernel_version(); 2226 print_kernel_version();
2183 printk( "---------------------------------\n"); 2227 printk("---------------------------------\n");
2184 2228
2185 printk("inconsistent {%s} -> {%s} usage.\n", 2229 printk("inconsistent {%s} -> {%s} usage.\n",
2186 usage_str[prev_bit], usage_str[new_bit]); 2230 usage_str[prev_bit], usage_str[new_bit]);
@@ -2241,10 +2285,11 @@ print_irq_inversion_bug(struct task_struct *curr,
2241 if (!debug_locks_off_graph_unlock() || debug_locks_silent) 2285 if (!debug_locks_off_graph_unlock() || debug_locks_silent)
2242 return 0; 2286 return 0;
2243 2287
2244 printk("\n=========================================================\n"); 2288 printk("\n");
2245 printk( "[ INFO: possible irq lock inversion dependency detected ]\n"); 2289 printk("=========================================================\n");
2290 printk("[ INFO: possible irq lock inversion dependency detected ]\n");
2246 print_kernel_version(); 2291 print_kernel_version();
2247 printk( "---------------------------------------------------------\n"); 2292 printk("---------------------------------------------------------\n");
2248 printk("%s/%d just changed the state of lock:\n", 2293 printk("%s/%d just changed the state of lock:\n",
2249 curr->comm, task_pid_nr(curr)); 2294 curr->comm, task_pid_nr(curr));
2250 print_lock(this); 2295 print_lock(this);
@@ -2525,12 +2570,24 @@ void trace_hardirqs_on_caller(unsigned long ip)
2525 return; 2570 return;
2526 } 2571 }
2527 2572
2573 /*
2574 * We're enabling irqs and according to our state above irqs weren't
2575 * already enabled, yet we find the hardware thinks they are in fact
2576 * enabled.. someone messed up their IRQ state tracing.
2577 */
2528 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2578 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2529 return; 2579 return;
2530 2580
2581 /*
2582 * See the fine text that goes along with this variable definition.
2583 */
2531 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled))) 2584 if (DEBUG_LOCKS_WARN_ON(unlikely(early_boot_irqs_disabled)))
2532 return; 2585 return;
2533 2586
2587 /*
2588 * Can't allow enabling interrupts while in an interrupt handler,
2589 * that's general bad form and such. Recursion, limited stack etc..
2590 */
2534 if (DEBUG_LOCKS_WARN_ON(current->hardirq_context)) 2591 if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
2535 return; 2592 return;
2536 2593
@@ -2558,6 +2615,10 @@ void trace_hardirqs_off_caller(unsigned long ip)
2558 if (unlikely(!debug_locks || current->lockdep_recursion)) 2615 if (unlikely(!debug_locks || current->lockdep_recursion))
2559 return; 2616 return;
2560 2617
2618 /*
2619 * So we're supposed to get called after you mask local IRQs, but for
2620 * some reason the hardware doesn't quite think you did a proper job.
2621 */
2561 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2622 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2562 return; 2623 return;
2563 2624
@@ -2590,6 +2651,10 @@ void trace_softirqs_on(unsigned long ip)
2590 if (unlikely(!debug_locks || current->lockdep_recursion)) 2651 if (unlikely(!debug_locks || current->lockdep_recursion))
2591 return; 2652 return;
2592 2653
2654 /*
2655 * We fancy IRQs being disabled here, see softirq.c, avoids
2656 * funny state and nesting things.
2657 */
2593 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2658 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2594 return; 2659 return;
2595 2660
@@ -2626,6 +2691,9 @@ void trace_softirqs_off(unsigned long ip)
2626 if (unlikely(!debug_locks || current->lockdep_recursion)) 2691 if (unlikely(!debug_locks || current->lockdep_recursion))
2627 return; 2692 return;
2628 2693
2694 /*
2695 * We fancy IRQs being disabled here, see softirq.c
2696 */
2629 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 2697 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2630 return; 2698 return;
2631 2699
@@ -2637,6 +2705,9 @@ void trace_softirqs_off(unsigned long ip)
2637 curr->softirq_disable_ip = ip; 2705 curr->softirq_disable_ip = ip;
2638 curr->softirq_disable_event = ++curr->irq_events; 2706 curr->softirq_disable_event = ++curr->irq_events;
2639 debug_atomic_inc(softirqs_off_events); 2707 debug_atomic_inc(softirqs_off_events);
2708 /*
2709 * Whoops, we wanted softirqs off, so why aren't they?
2710 */
2640 DEBUG_LOCKS_WARN_ON(!softirq_count()); 2711 DEBUG_LOCKS_WARN_ON(!softirq_count());
2641 } else 2712 } else
2642 debug_atomic_inc(redundant_softirqs_off); 2713 debug_atomic_inc(redundant_softirqs_off);
@@ -2661,6 +2732,9 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags)
2661 if (!(gfp_mask & __GFP_FS)) 2732 if (!(gfp_mask & __GFP_FS))
2662 return; 2733 return;
2663 2734
2735 /*
2736 * Oi! Can't be having __GFP_FS allocations with IRQs disabled.
2737 */
2664 if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) 2738 if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags)))
2665 return; 2739 return;
2666 2740
@@ -2773,13 +2847,13 @@ static int separate_irq_context(struct task_struct *curr,
2773 return 0; 2847 return 0;
2774} 2848}
2775 2849
2776#else 2850#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
2777 2851
2778static inline 2852static inline
2779int mark_lock_irq(struct task_struct *curr, struct held_lock *this, 2853int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
2780 enum lock_usage_bit new_bit) 2854 enum lock_usage_bit new_bit)
2781{ 2855{
2782 WARN_ON(1); 2856 WARN_ON(1); /* Impossible innit? when we don't have TRACE_IRQFLAG */
2783 return 1; 2857 return 1;
2784} 2858}
2785 2859
@@ -2799,7 +2873,7 @@ void lockdep_trace_alloc(gfp_t gfp_mask)
2799{ 2873{
2800} 2874}
2801 2875
2802#endif 2876#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
2803 2877
2804/* 2878/*
2805 * Mark a lock with a usage bit, and validate the state transition: 2879 * Mark a lock with a usage bit, and validate the state transition:
@@ -2880,6 +2954,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2880 lock->cpu = raw_smp_processor_id(); 2954 lock->cpu = raw_smp_processor_id();
2881#endif 2955#endif
2882 2956
2957 /*
2958 * Can't be having no nameless bastards around this place!
2959 */
2883 if (DEBUG_LOCKS_WARN_ON(!name)) { 2960 if (DEBUG_LOCKS_WARN_ON(!name)) {
2884 lock->name = "NULL"; 2961 lock->name = "NULL";
2885 return; 2962 return;
@@ -2887,6 +2964,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2887 2964
2888 lock->name = name; 2965 lock->name = name;
2889 2966
2967 /*
2968 * No key, no joy, we need to hash something.
2969 */
2890 if (DEBUG_LOCKS_WARN_ON(!key)) 2970 if (DEBUG_LOCKS_WARN_ON(!key))
2891 return; 2971 return;
2892 /* 2972 /*
@@ -2894,6 +2974,9 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
2894 */ 2974 */
2895 if (!static_obj(key)) { 2975 if (!static_obj(key)) {
2896 printk("BUG: key %p not in .data!\n", key); 2976 printk("BUG: key %p not in .data!\n", key);
2977 /*
2978 * What it says above ^^^^^, I suggest you read it.
2979 */
2897 DEBUG_LOCKS_WARN_ON(1); 2980 DEBUG_LOCKS_WARN_ON(1);
2898 return; 2981 return;
2899 } 2982 }
@@ -2932,6 +3015,11 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2932 if (unlikely(!debug_locks)) 3015 if (unlikely(!debug_locks))
2933 return 0; 3016 return 0;
2934 3017
3018 /*
3019 * Lockdep should run with IRQs disabled, otherwise we could
3020 * get an interrupt which would want to take locks, which would
3021 * end up in lockdep and have you got a head-ache already?
3022 */
2935 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 3023 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2936 return 0; 3024 return 0;
2937 3025
@@ -2963,6 +3051,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2963 * dependency checks are done) 3051 * dependency checks are done)
2964 */ 3052 */
2965 depth = curr->lockdep_depth; 3053 depth = curr->lockdep_depth;
3054 /*
3055 * Ran out of static storage for our per-task lock stack again have we?
3056 */
2966 if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH)) 3057 if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
2967 return 0; 3058 return 0;
2968 3059
@@ -2981,6 +3072,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2981 } 3072 }
2982 3073
2983 hlock = curr->held_locks + depth; 3074 hlock = curr->held_locks + depth;
3075 /*
3076 * Plain impossible, we just registered it and checked it weren't no
3077 * NULL like.. I bet this mushroom I ate was good!
3078 */
2984 if (DEBUG_LOCKS_WARN_ON(!class)) 3079 if (DEBUG_LOCKS_WARN_ON(!class))
2985 return 0; 3080 return 0;
2986 hlock->class_idx = class_idx; 3081 hlock->class_idx = class_idx;
@@ -3015,11 +3110,17 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
3015 * the hash, not class->key. 3110 * the hash, not class->key.
3016 */ 3111 */
3017 id = class - lock_classes; 3112 id = class - lock_classes;
3113 /*
3114 * Whoops, we did it again.. ran straight out of our static allocation.
3115 */
3018 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) 3116 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
3019 return 0; 3117 return 0;
3020 3118
3021 chain_key = curr->curr_chain_key; 3119 chain_key = curr->curr_chain_key;
3022 if (!depth) { 3120 if (!depth) {
3121 /*
3122 * How can we have a chain hash when we ain't got no keys?!
3123 */
3023 if (DEBUG_LOCKS_WARN_ON(chain_key != 0)) 3124 if (DEBUG_LOCKS_WARN_ON(chain_key != 0))
3024 return 0; 3125 return 0;
3025 chain_head = 1; 3126 chain_head = 1;
@@ -3065,9 +3166,10 @@ print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
3065 if (debug_locks_silent) 3166 if (debug_locks_silent)
3066 return 0; 3167 return 0;
3067 3168
3068 printk("\n=====================================\n"); 3169 printk("\n");
3069 printk( "[ BUG: bad unlock balance detected! ]\n"); 3170 printk("=====================================\n");
3070 printk( "-------------------------------------\n"); 3171 printk("[ BUG: bad unlock balance detected! ]\n");
3172 printk("-------------------------------------\n");
3071 printk("%s/%d is trying to release lock (", 3173 printk("%s/%d is trying to release lock (",
3072 curr->comm, task_pid_nr(curr)); 3174 curr->comm, task_pid_nr(curr));
3073 print_lockdep_cache(lock); 3175 print_lockdep_cache(lock);
@@ -3091,6 +3193,9 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
3091{ 3193{
3092 if (unlikely(!debug_locks)) 3194 if (unlikely(!debug_locks))
3093 return 0; 3195 return 0;
3196 /*
3197 * Lockdep should run with IRQs disabled, recursion, head-ache, etc..
3198 */
3094 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 3199 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
3095 return 0; 3200 return 0;
3096 3201
@@ -3111,9 +3216,20 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock)
3111 if (!class) 3216 if (!class)
3112 class = look_up_lock_class(lock, 0); 3217 class = look_up_lock_class(lock, 0);
3113 3218
3114 if (DEBUG_LOCKS_WARN_ON(!class)) 3219 /*
3220 * If look_up_lock_class() failed to find a class, we're trying
3221 * to test if we hold a lock that has never yet been acquired.
3222 * Clearly if the lock hasn't been acquired _ever_, we're not
3223 * holding it either, so report failure.
3224 */
3225 if (!class)
3115 return 0; 3226 return 0;
3116 3227
3228 /*
3229 * References, but not a lock we're actually ref-counting?
3230 * State got messed up, follow the sites that change ->references
3231 * and try to make sense of it.
3232 */
3117 if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock)) 3233 if (DEBUG_LOCKS_WARN_ON(!hlock->nest_lock))
3118 return 0; 3234 return 0;
3119 3235
@@ -3136,6 +3252,10 @@ __lock_set_class(struct lockdep_map *lock, const char *name,
3136 int i; 3252 int i;
3137 3253
3138 depth = curr->lockdep_depth; 3254 depth = curr->lockdep_depth;
3255 /*
3256 * This function is about (re)setting the class of a held lock,
3257 * yet we're not actually holding any locks. Naughty user!
3258 */
3139 if (DEBUG_LOCKS_WARN_ON(!depth)) 3259 if (DEBUG_LOCKS_WARN_ON(!depth))
3140 return 0; 3260 return 0;
3141 3261
@@ -3171,6 +3291,10 @@ found_it:
3171 return 0; 3291 return 0;
3172 } 3292 }
3173 3293
3294 /*
3295 * I took it apart and put it back together again, except now I have
3296 * these 'spare' parts.. where shall I put them.
3297 */
3174 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) 3298 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
3175 return 0; 3299 return 0;
3176 return 1; 3300 return 1;
@@ -3195,6 +3319,10 @@ lock_release_non_nested(struct task_struct *curr,
3195 * of held locks: 3319 * of held locks:
3196 */ 3320 */
3197 depth = curr->lockdep_depth; 3321 depth = curr->lockdep_depth;
3322 /*
3323 * So we're all set to release this lock.. wait what lock? We don't
3324 * own any locks, you've been drinking again?
3325 */
3198 if (DEBUG_LOCKS_WARN_ON(!depth)) 3326 if (DEBUG_LOCKS_WARN_ON(!depth))
3199 return 0; 3327 return 0;
3200 3328
@@ -3247,6 +3375,10 @@ found_it:
3247 return 0; 3375 return 0;
3248 } 3376 }
3249 3377
3378 /*
3379 * We had N bottles of beer on the wall, we drank one, but now
3380 * there's not N-1 bottles of beer left on the wall...
3381 */
3250 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1)) 3382 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1))
3251 return 0; 3383 return 0;
3252 return 1; 3384 return 1;
@@ -3277,6 +3409,9 @@ static int lock_release_nested(struct task_struct *curr,
3277 return lock_release_non_nested(curr, lock, ip); 3409 return lock_release_non_nested(curr, lock, ip);
3278 curr->lockdep_depth--; 3410 curr->lockdep_depth--;
3279 3411
3412 /*
3413 * No more locks, but somehow we've got hash left over, who left it?
3414 */
3280 if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0))) 3415 if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0)))
3281 return 0; 3416 return 0;
3282 3417
@@ -3359,10 +3494,13 @@ static void check_flags(unsigned long flags)
3359 * check if not in hardirq contexts: 3494 * check if not in hardirq contexts:
3360 */ 3495 */
3361 if (!hardirq_count()) { 3496 if (!hardirq_count()) {
3362 if (softirq_count()) 3497 if (softirq_count()) {
3498 /* like the above, but with softirqs */
3363 DEBUG_LOCKS_WARN_ON(current->softirqs_enabled); 3499 DEBUG_LOCKS_WARN_ON(current->softirqs_enabled);
3364 else 3500 } else {
3501 /* lick the above, does it taste good? */
3365 DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); 3502 DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
3503 }
3366 } 3504 }
3367 3505
3368 if (!debug_locks) 3506 if (!debug_locks)
@@ -3472,9 +3610,10 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock,
3472 if (debug_locks_silent) 3610 if (debug_locks_silent)
3473 return 0; 3611 return 0;
3474 3612
3475 printk("\n=================================\n"); 3613 printk("\n");
3476 printk( "[ BUG: bad contention detected! ]\n"); 3614 printk("=================================\n");
3477 printk( "---------------------------------\n"); 3615 printk("[ BUG: bad contention detected! ]\n");
3616 printk("---------------------------------\n");
3478 printk("%s/%d is trying to contend lock (", 3617 printk("%s/%d is trying to contend lock (",
3479 curr->comm, task_pid_nr(curr)); 3618 curr->comm, task_pid_nr(curr));
3480 print_lockdep_cache(lock); 3619 print_lockdep_cache(lock);
@@ -3500,6 +3639,10 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
3500 int i, contention_point, contending_point; 3639 int i, contention_point, contending_point;
3501 3640
3502 depth = curr->lockdep_depth; 3641 depth = curr->lockdep_depth;
3642 /*
3643 * Whee, we contended on this lock, except it seems we're not
3644 * actually trying to acquire anything much at all..
3645 */
3503 if (DEBUG_LOCKS_WARN_ON(!depth)) 3646 if (DEBUG_LOCKS_WARN_ON(!depth))
3504 return; 3647 return;
3505 3648
@@ -3549,6 +3692,10 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
3549 int i, cpu; 3692 int i, cpu;
3550 3693
3551 depth = curr->lockdep_depth; 3694 depth = curr->lockdep_depth;
3695 /*
3696 * Yay, we acquired ownership of this lock we didn't try to
3697 * acquire, how the heck did that happen?
3698 */
3552 if (DEBUG_LOCKS_WARN_ON(!depth)) 3699 if (DEBUG_LOCKS_WARN_ON(!depth))
3553 return; 3700 return;
3554 3701
@@ -3753,8 +3900,12 @@ void lockdep_reset_lock(struct lockdep_map *lock)
3753 match |= class == lock->class_cache[j]; 3900 match |= class == lock->class_cache[j];
3754 3901
3755 if (unlikely(match)) { 3902 if (unlikely(match)) {
3756 if (debug_locks_off_graph_unlock()) 3903 if (debug_locks_off_graph_unlock()) {
3904 /*
3905 * We all just reset everything, how did it match?
3906 */
3757 WARN_ON(1); 3907 WARN_ON(1);
3908 }
3758 goto out_restore; 3909 goto out_restore;
3759 } 3910 }
3760 } 3911 }
@@ -3833,9 +3984,10 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
3833 if (debug_locks_silent) 3984 if (debug_locks_silent)
3834 return; 3985 return;
3835 3986
3836 printk("\n=========================\n"); 3987 printk("\n");
3837 printk( "[ BUG: held lock freed! ]\n"); 3988 printk("=========================\n");
3838 printk( "-------------------------\n"); 3989 printk("[ BUG: held lock freed! ]\n");
3990 printk("-------------------------\n");
3839 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", 3991 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
3840 curr->comm, task_pid_nr(curr), mem_from, mem_to-1); 3992 curr->comm, task_pid_nr(curr), mem_from, mem_to-1);
3841 print_lock(hlock); 3993 print_lock(hlock);
@@ -3889,9 +4041,10 @@ static void print_held_locks_bug(struct task_struct *curr)
3889 if (debug_locks_silent) 4041 if (debug_locks_silent)
3890 return; 4042 return;
3891 4043
3892 printk("\n=====================================\n"); 4044 printk("\n");
3893 printk( "[ BUG: lock held at task exit time! ]\n"); 4045 printk("=====================================\n");
3894 printk( "-------------------------------------\n"); 4046 printk("[ BUG: lock held at task exit time! ]\n");
4047 printk("-------------------------------------\n");
3895 printk("%s/%d is exiting with locks still held!\n", 4048 printk("%s/%d is exiting with locks still held!\n",
3896 curr->comm, task_pid_nr(curr)); 4049 curr->comm, task_pid_nr(curr));
3897 lockdep_print_held_locks(curr); 4050 lockdep_print_held_locks(curr);
@@ -3985,16 +4138,17 @@ void lockdep_sys_exit(void)
3985 if (unlikely(curr->lockdep_depth)) { 4138 if (unlikely(curr->lockdep_depth)) {
3986 if (!debug_locks_off()) 4139 if (!debug_locks_off())
3987 return; 4140 return;
3988 printk("\n================================================\n"); 4141 printk("\n");
3989 printk( "[ BUG: lock held when returning to user space! ]\n"); 4142 printk("================================================\n");
3990 printk( "------------------------------------------------\n"); 4143 printk("[ BUG: lock held when returning to user space! ]\n");
4144 printk("------------------------------------------------\n");
3991 printk("%s/%d is leaving the kernel with locks still held!\n", 4145 printk("%s/%d is leaving the kernel with locks still held!\n",
3992 curr->comm, curr->pid); 4146 curr->comm, curr->pid);
3993 lockdep_print_held_locks(curr); 4147 lockdep_print_held_locks(curr);
3994 } 4148 }
3995} 4149}
3996 4150
3997void lockdep_rcu_dereference(const char *file, const int line) 4151void lockdep_rcu_suspicious(const char *file, const int line, const char *s)
3998{ 4152{
3999 struct task_struct *curr = current; 4153 struct task_struct *curr = current;
4000 4154
@@ -4003,15 +4157,15 @@ void lockdep_rcu_dereference(const char *file, const int line)
4003 return; 4157 return;
4004#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ 4158#endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */
4005 /* Note: the following can be executed concurrently, so be careful. */ 4159 /* Note: the following can be executed concurrently, so be careful. */
4006 printk("\n===================================================\n"); 4160 printk("\n");
4007 printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n"); 4161 printk("===============================\n");
4008 printk( "---------------------------------------------------\n"); 4162 printk("[ INFO: suspicious RCU usage. ]\n");
4009 printk("%s:%d invoked rcu_dereference_check() without protection!\n", 4163 printk("-------------------------------\n");
4010 file, line); 4164 printk("%s:%d %s!\n", file, line, s);
4011 printk("\nother info that might help us debug this:\n\n"); 4165 printk("\nother info that might help us debug this:\n\n");
4012 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); 4166 printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks);
4013 lockdep_print_held_locks(curr); 4167 lockdep_print_held_locks(curr);
4014 printk("\nstack backtrace:\n"); 4168 printk("\nstack backtrace:\n");
4015 dump_stack(); 4169 dump_stack();
4016} 4170}
4017EXPORT_SYMBOL_GPL(lockdep_rcu_dereference); 4171EXPORT_SYMBOL_GPL(lockdep_rcu_suspicious);
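Renaming lockdep_rcu_dereference() to lockdep_rcu_suspicious() and adding a message argument lets each RCU debugging check report its own explanation instead of the fixed rcu_dereference_check() text. Roughly, and only as a sketch rather than the exact definition used in this tree, the RCU side can wrap the hook like this:

#include <linux/lockdep.h>
#include <linux/types.h>

/* Sketch of a caller-side assertion built on the new hook; the
 * one-shot __warned guard is illustrative. */
#define rcu_lockdep_assert(c, s)					\
	do {								\
		static bool __warned;					\
		if (!__warned && !(c)) {				\
			__warned = true;				\
			lockdep_rcu_suspicious(__FILE__, __LINE__, s);	\
		}							\
	} while (0)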
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 71edd2f60c0..91c32a0b612 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -11,7 +11,7 @@
11 * Code for /proc/lockdep and /proc/lockdep_stats: 11 * Code for /proc/lockdep and /proc/lockdep_stats:
12 * 12 *
13 */ 13 */
14#include <linux/module.h> 14#include <linux/export.h>
15#include <linux/proc_fs.h> 15#include <linux/proc_fs.h>
16#include <linux/seq_file.h> 16#include <linux/seq_file.h>
17#include <linux/kallsyms.h> 17#include <linux/kallsyms.h>
diff --git a/kernel/module.c b/kernel/module.c
index 04379f92f84..178333c48d1 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -16,7 +16,7 @@
16 along with this program; if not, write to the Free Software 16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18*/ 18*/
19#include <linux/module.h> 19#include <linux/export.h>
20#include <linux/moduleloader.h> 20#include <linux/moduleloader.h>
21#include <linux/ftrace_event.h> 21#include <linux/ftrace_event.h>
22#include <linux/init.h> 22#include <linux/init.h>
@@ -2487,6 +2487,9 @@ static int check_modinfo(struct module *mod, struct load_info *info)
2487 return -ENOEXEC; 2487 return -ENOEXEC;
2488 } 2488 }
2489 2489
2490 if (!get_modinfo(info, "intree"))
2491 add_taint_module(mod, TAINT_OOT_MODULE);
2492
2490 if (get_modinfo(info, "staging")) { 2493 if (get_modinfo(info, "staging")) {
2491 add_taint_module(mod, TAINT_CRAP); 2494 add_taint_module(mod, TAINT_CRAP);
2492 printk(KERN_WARNING "%s: module is from the staging directory," 2495 printk(KERN_WARNING "%s: module is from the staging directory,"
@@ -2878,8 +2881,7 @@ static struct module *load_module(void __user *umod,
2878 } 2881 }
2879 2882
2880 /* This has to be done once we're sure module name is unique. */ 2883 /* This has to be done once we're sure module name is unique. */
2881 if (!mod->taints || mod->taints == (1U<<TAINT_CRAP)) 2884 dynamic_debug_setup(info.debug, info.num_debug);
2882 dynamic_debug_setup(info.debug, info.num_debug);
2883 2885
2884 /* Find duplicate symbols */ 2886 /* Find duplicate symbols */
2885 err = verify_export_symbols(mod); 2887 err = verify_export_symbols(mod);
@@ -2915,8 +2917,7 @@ static struct module *load_module(void __user *umod,
2915 module_bug_cleanup(mod); 2917 module_bug_cleanup(mod);
2916 2918
2917 ddebug: 2919 ddebug:
2918 if (!mod->taints || mod->taints == (1U<<TAINT_CRAP)) 2920 dynamic_debug_remove(info.debug);
2919 dynamic_debug_remove(info.debug);
2920 unlock: 2921 unlock:
2921 mutex_unlock(&module_mutex); 2922 mutex_unlock(&module_mutex);
2922 synchronize_sched(); 2923 synchronize_sched();
@@ -3257,6 +3258,8 @@ static char *module_flags(struct module *mod, char *buf)
3257 buf[bx++] = '('; 3258 buf[bx++] = '(';
3258 if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE)) 3259 if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
3259 buf[bx++] = 'P'; 3260 buf[bx++] = 'P';
3261 else if (mod->taints & (1 << TAINT_OOT_MODULE))
3262 buf[bx++] = 'O';
3260 if (mod->taints & (1 << TAINT_FORCED_MODULE)) 3263 if (mod->taints & (1 << TAINT_FORCED_MODULE))
3261 buf[bx++] = 'F'; 3264 buf[bx++] = 'F';
3262 if (mod->taints & (1 << TAINT_CRAP)) 3265 if (mod->taints & (1 << TAINT_CRAP))
@@ -3487,50 +3490,3 @@ void module_layout(struct module *mod,
3487} 3490}
3488EXPORT_SYMBOL(module_layout); 3491EXPORT_SYMBOL(module_layout);
3489#endif 3492#endif
3490
3491#ifdef CONFIG_TRACEPOINTS
3492void module_update_tracepoints(void)
3493{
3494 struct module *mod;
3495
3496 mutex_lock(&module_mutex);
3497 list_for_each_entry(mod, &modules, list)
3498 if (!mod->taints)
3499 tracepoint_update_probe_range(mod->tracepoints_ptrs,
3500 mod->tracepoints_ptrs + mod->num_tracepoints);
3501 mutex_unlock(&module_mutex);
3502}
3503
3504/*
3505 * Returns 0 if current not found.
3506 * Returns 1 if current found.
3507 */
3508int module_get_iter_tracepoints(struct tracepoint_iter *iter)
3509{
3510 struct module *iter_mod;
3511 int found = 0;
3512
3513 mutex_lock(&module_mutex);
3514 list_for_each_entry(iter_mod, &modules, list) {
3515 if (!iter_mod->taints) {
3516 /*
3517 * Sorted module list
3518 */
3519 if (iter_mod < iter->module)
3520 continue;
3521 else if (iter_mod > iter->module)
3522 iter->tracepoint = NULL;
3523 found = tracepoint_get_iter_range(&iter->tracepoint,
3524 iter_mod->tracepoints_ptrs,
3525 iter_mod->tracepoints_ptrs
3526 + iter_mod->num_tracepoints);
3527 if (found) {
3528 iter->module = iter_mod;
3529 break;
3530 }
3531 }
3532 }
3533 mutex_unlock(&module_mutex);
3534 return found;
3535}
3536#endif
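check_modinfo() now taints the kernel with TAINT_OOT_MODULE whenever a module lacks the "intree" modinfo tag, and module_flags() above reports it as 'O'. For in-tree builds that tag is typically injected into the generated .mod.c by modpost; expressed directly as source it amounts to:

#include <linux/module.h>

/* Present in modules built from the kernel tree; its absence is what
 * sets TAINT_OOT_MODULE and shows up as 'O' in module_flags(). */
MODULE_INFO(intree, "Y");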
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 73da83aff41..7e3443fe1f4 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -14,7 +14,7 @@
14 */ 14 */
15#include <linux/mutex.h> 15#include <linux/mutex.h>
16#include <linux/delay.h> 16#include <linux/delay.h>
17#include <linux/module.h> 17#include <linux/export.h>
18#include <linux/poison.h> 18#include <linux/poison.h>
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/spinlock.h> 20#include <linux/spinlock.h>
diff --git a/kernel/mutex.c b/kernel/mutex.c
index d607ed5dd44..89096dd8786 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -19,7 +19,7 @@
19 */ 19 */
20#include <linux/mutex.h> 20#include <linux/mutex.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/module.h> 22#include <linux/export.h>
23#include <linux/spinlock.h> 23#include <linux/spinlock.h>
24#include <linux/interrupt.h> 24#include <linux/interrupt.h>
25#include <linux/debug_locks.h> 25#include <linux/debug_locks.h>
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 8d7b435806c..2d5cc4ccff7 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -1,6 +1,6 @@
1#include <linux/kdebug.h> 1#include <linux/kdebug.h>
2#include <linux/kprobes.h> 2#include <linux/kprobes.h>
3#include <linux/module.h> 3#include <linux/export.h>
4#include <linux/notifier.h> 4#include <linux/notifier.h>
5#include <linux/rcupdate.h> 5#include <linux/rcupdate.h>
6#include <linux/vmalloc.h> 6#include <linux/vmalloc.h>
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 9aeab4b98c6..b576f7f14bc 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -14,7 +14,7 @@
14 */ 14 */
15 15
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/module.h> 17#include <linux/export.h>
18#include <linux/nsproxy.h> 18#include <linux/nsproxy.h>
19#include <linux/init_task.h> 19#include <linux/init_task.h>
20#include <linux/mnt_namespace.h> 20#include <linux/mnt_namespace.h>
diff --git a/kernel/padata.c b/kernel/padata.c
index b91941df5e6..b4525993151 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -18,7 +18,7 @@
18 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 18 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
19 */ 19 */
20 20
21#include <linux/module.h> 21#include <linux/export.h>
22#include <linux/cpumask.h> 22#include <linux/cpumask.h>
23#include <linux/err.h> 23#include <linux/err.h>
24#include <linux/cpu.h> 24#include <linux/cpu.h>
diff --git a/kernel/panic.c b/kernel/panic.c
index d7bb6974efb..b2659360421 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -177,6 +177,7 @@ static const struct tnt tnts[] = {
177 { TAINT_WARN, 'W', ' ' }, 177 { TAINT_WARN, 'W', ' ' },
178 { TAINT_CRAP, 'C', ' ' }, 178 { TAINT_CRAP, 'C', ' ' },
179 { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' }, 179 { TAINT_FIRMWARE_WORKAROUND, 'I', ' ' },
180 { TAINT_OOT_MODULE, 'O', ' ' },
180}; 181};
181 182
182/** 183/**
@@ -194,6 +195,7 @@ static const struct tnt tnts[] = {
194 * 'W' - Taint on warning. 195 * 'W' - Taint on warning.
195 * 'C' - modules from drivers/staging are loaded. 196 * 'C' - modules from drivers/staging are loaded.
196 * 'I' - Working around severe firmware bug. 197 * 'I' - Working around severe firmware bug.
198 * 'O' - Out-of-tree module has been loaded.
197 * 199 *
198 * The string is overwritten by the next call to print_tainted(). 200 * The string is overwritten by the next call to print_tainted().
199 */ 201 */
diff --git a/kernel/params.c b/kernel/params.c
index 22df3e0d142..65aae11eb93 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -15,7 +15,7 @@
15 along with this program; if not, write to the Free Software 15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/ 17*/
18#include <linux/moduleparam.h> 18#include <linux/module.h>
19#include <linux/kernel.h> 19#include <linux/kernel.h>
20#include <linux/string.h> 20#include <linux/string.h>
21#include <linux/errno.h> 21#include <linux/errno.h>
@@ -67,20 +67,27 @@ static void maybe_kfree_parameter(void *param)
67 } 67 }
68} 68}
69 69
70static inline char dash2underscore(char c) 70static char dash2underscore(char c)
71{ 71{
72 if (c == '-') 72 if (c == '-')
73 return '_'; 73 return '_';
74 return c; 74 return c;
75} 75}
76 76
77static inline int parameq(const char *input, const char *paramname) 77bool parameqn(const char *a, const char *b, size_t n)
78{ 78{
79 unsigned int i; 79 size_t i;
80 for (i = 0; dash2underscore(input[i]) == paramname[i]; i++) 80
81 if (input[i] == '\0') 81 for (i = 0; i < n; i++) {
82 return 1; 82 if (dash2underscore(a[i]) != dash2underscore(b[i]))
83 return 0; 83 return false;
84 }
85 return true;
86}
87
88bool parameq(const char *a, const char *b)
89{
90 return parameqn(a, b, strlen(a)+1);
84} 91}
85 92
86static int parse_one(char *param, 93static int parse_one(char *param,
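parameq() stops being a file-local inline and gains a length-bounded sibling, parameqn(); both normalise dashes to underscores on both sides of the comparison, so a parameter written with dashes on the command line matches its underscored name. A small usage sketch, assuming the declarations end up in <linux/moduleparam.h> as part of this series:

#include <linux/moduleparam.h>
#include <linux/types.h>

/* Dashes and underscores compare equal on both sides. */
static bool example_match(void)
{
	bool full   = parameq("log-buf-len", "log_buf_len");	 /* true */
	bool prefix = parameqn("acpi_osi", "acpi-osi=Linux", 8); /* true */

	return full && prefix;
}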
diff --git a/kernel/pid.c b/kernel/pid.c
index e432057f3b2..fa5f72227e5 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -27,7 +27,7 @@
27 */ 27 */
28 28
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/module.h> 30#include <linux/export.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/init.h> 32#include <linux/init.h>
33#include <linux/rculist.h> 33#include <linux/rculist.h>
@@ -418,7 +418,9 @@ EXPORT_SYMBOL(pid_task);
418 */ 418 */
419struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) 419struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns)
420{ 420{
421 rcu_lockdep_assert(rcu_read_lock_held()); 421 rcu_lockdep_assert(rcu_read_lock_held(),
422 "find_task_by_pid_ns() needs rcu_read_lock()"
423 " protection");
422 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); 424 return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
423} 425}
424 426
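find_task_by_pid_ns() now documents its locking requirement in the assertion message itself: the lookup must run inside an RCU read-side critical section. A sketch of a well-behaved caller that also pins the task before dropping the RCU lock:

#include <linux/pid_namespace.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

/* Illustrative helper, not part of this patch. */
static struct task_struct *example_get_task(pid_t nr, struct pid_namespace *ns)
{
	struct task_struct *tsk;

	rcu_read_lock();
	tsk = find_task_by_pid_ns(nr, ns);
	if (tsk)
		get_task_struct(tsk);	/* keep it alive after rcu_read_unlock() */
	rcu_read_unlock();

	return tsk;
}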
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index c8008dd58ef..e7cb76dc18f 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -274,9 +274,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
274 struct task_cputime sum; 274 struct task_cputime sum;
275 unsigned long flags; 275 unsigned long flags;
276 276
277 spin_lock_irqsave(&cputimer->lock, flags);
278 if (!cputimer->running) { 277 if (!cputimer->running) {
279 cputimer->running = 1;
280 /* 278 /*
281 * The POSIX timer interface allows for absolute time expiry 279 * The POSIX timer interface allows for absolute time expiry
282 * values through the TIMER_ABSTIME flag, therefore we have 280 * values through the TIMER_ABSTIME flag, therefore we have
@@ -284,10 +282,13 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
284 * it. 282 * it.
285 */ 283 */
286 thread_group_cputime(tsk, &sum); 284 thread_group_cputime(tsk, &sum);
285 raw_spin_lock_irqsave(&cputimer->lock, flags);
286 cputimer->running = 1;
287 update_gt_cputime(&cputimer->cputime, &sum); 287 update_gt_cputime(&cputimer->cputime, &sum);
288 } 288 } else
289 raw_spin_lock_irqsave(&cputimer->lock, flags);
289 *times = cputimer->cputime; 290 *times = cputimer->cputime;
290 spin_unlock_irqrestore(&cputimer->lock, flags); 291 raw_spin_unlock_irqrestore(&cputimer->lock, flags);
291} 292}
292 293
293/* 294/*
@@ -998,9 +999,9 @@ static void stop_process_timers(struct signal_struct *sig)
998 struct thread_group_cputimer *cputimer = &sig->cputimer; 999 struct thread_group_cputimer *cputimer = &sig->cputimer;
999 unsigned long flags; 1000 unsigned long flags;
1000 1001
1001 spin_lock_irqsave(&cputimer->lock, flags); 1002 raw_spin_lock_irqsave(&cputimer->lock, flags);
1002 cputimer->running = 0; 1003 cputimer->running = 0;
1003 spin_unlock_irqrestore(&cputimer->lock, flags); 1004 raw_spin_unlock_irqrestore(&cputimer->lock, flags);
1004} 1005}
1005 1006
1006static u32 onecputick; 1007static u32 onecputick;
@@ -1290,9 +1291,9 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
1290 if (sig->cputimer.running) { 1291 if (sig->cputimer.running) {
1291 struct task_cputime group_sample; 1292 struct task_cputime group_sample;
1292 1293
1293 spin_lock(&sig->cputimer.lock); 1294 raw_spin_lock(&sig->cputimer.lock);
1294 group_sample = sig->cputimer.cputime; 1295 group_sample = sig->cputimer.cputime;
1295 spin_unlock(&sig->cputimer.lock); 1296 raw_spin_unlock(&sig->cputimer.lock);
1296 1297
1297 if (task_cputime_expired(&group_sample, &sig->cputime_expires)) 1298 if (task_cputime_expired(&group_sample, &sig->cputime_expires))
1298 return 1; 1299 return 1;
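The cputimer lock becomes a raw_spinlock_t so it stays a real spinning lock on preempt-RT kernels, and the potentially slow thread_group_cputime() summation is moved outside the locked region. The raw variant follows the familiar irqsave pattern; a minimal kernel-style sketch (the lock name and function are hypothetical):

    static DEFINE_RAW_SPINLOCK(sample_lock);    /* hypothetical lock */

    static void sample_update(void)
    {
            unsigned long flags;

            raw_spin_lock_irqsave(&sample_lock, flags);
            /* short, non-sleeping critical section only */
            raw_spin_unlock_irqrestore(&sample_lock, flags);
    }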
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 4556182527f..69185ae6b70 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -46,7 +46,7 @@
46#include <linux/syscalls.h> 46#include <linux/syscalls.h>
47#include <linux/wait.h> 47#include <linux/wait.h>
48#include <linux/workqueue.h> 48#include <linux/workqueue.h>
49#include <linux/module.h> 49#include <linux/export.h>
50 50
51/* 51/*
52 * Management arrays for POSIX timers. Timers are kept in slab memory 52 * Management arrays for POSIX timers. Timers are kept in slab memory
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index b1914cb9095..deb5461e321 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -27,6 +27,7 @@ config HIBERNATION
27 select HIBERNATE_CALLBACKS 27 select HIBERNATE_CALLBACKS
28 select LZO_COMPRESS 28 select LZO_COMPRESS
29 select LZO_DECOMPRESS 29 select LZO_DECOMPRESS
30 select CRC32
30 ---help--- 31 ---help---
31 Enable the suspend to disk (STD) functionality, which is usually 32 Enable the suspend to disk (STD) functionality, which is usually
32 called "hibernation" in user interfaces. STD checkpoints the 33 called "hibernation" in user interfaces. STD checkpoints the
@@ -65,6 +66,9 @@ config HIBERNATION
65 66
66 For more information take a look at <file:Documentation/power/swsusp.txt>. 67 For more information take a look at <file:Documentation/power/swsusp.txt>.
67 68
69config ARCH_SAVE_PAGE_KEYS
70 bool
71
68config PM_STD_PARTITION 72config PM_STD_PARTITION
69 string "Default resume partition" 73 string "Default resume partition"
70 depends on HIBERNATION 74 depends on HIBERNATION
@@ -231,3 +235,11 @@ config PM_CLK
231config PM_GENERIC_DOMAINS 235config PM_GENERIC_DOMAINS
232 bool 236 bool
233 depends on PM 237 depends on PM
238
239config PM_GENERIC_DOMAINS_RUNTIME
240 def_bool y
241 depends on PM_RUNTIME && PM_GENERIC_DOMAINS
242
243config CPU_PM
244 bool
245 depends on SUSPEND || CPU_IDLE
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index c5ebc6a9064..07e0e28ffba 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,8 +1,8 @@
1 1
2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG 2ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
3 3
4obj-$(CONFIG_PM) += main.o 4obj-$(CONFIG_PM) += main.o qos.o
5obj-$(CONFIG_PM_SLEEP) += console.o 5obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o
6obj-$(CONFIG_FREEZER) += process.o 6obj-$(CONFIG_FREEZER) += process.o
7obj-$(CONFIG_SUSPEND) += suspend.o 7obj-$(CONFIG_SUSPEND) += suspend.o
8obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o 8obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
diff --git a/kernel/power/console.c b/kernel/power/console.c
index 218e5af9015..b1dc456474b 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * drivers/power/process.c - Functions for saving/restoring console. 2 * Functions for saving/restoring console.
3 * 3 *
4 * Originally from swsusp. 4 * Originally from swsusp.
5 */ 5 */
@@ -10,7 +10,6 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include "power.h" 11#include "power.h"
12 12
13#if defined(CONFIG_VT) && defined(CONFIG_VT_CONSOLE)
14#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1) 13#define SUSPEND_CONSOLE (MAX_NR_CONSOLES-1)
15 14
16static int orig_fgconsole, orig_kmsg; 15static int orig_fgconsole, orig_kmsg;
@@ -32,4 +31,3 @@ void pm_restore_console(void)
32 vt_kmsg_redirect(orig_kmsg); 31 vt_kmsg_redirect(orig_kmsg);
33 } 32 }
34} 33}
35#endif
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 8f7b1db1ece..a6b0503574e 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -9,11 +9,13 @@
9 * This file is released under the GPLv2. 9 * This file is released under the GPLv2.
10 */ 10 */
11 11
12#include <linux/export.h>
12#include <linux/suspend.h> 13#include <linux/suspend.h>
13#include <linux/syscalls.h> 14#include <linux/syscalls.h>
14#include <linux/reboot.h> 15#include <linux/reboot.h>
15#include <linux/string.h> 16#include <linux/string.h>
16#include <linux/device.h> 17#include <linux/device.h>
18#include <linux/async.h>
17#include <linux/kmod.h> 19#include <linux/kmod.h>
18#include <linux/delay.h> 20#include <linux/delay.h>
19#include <linux/fs.h> 21#include <linux/fs.h>
@@ -29,12 +31,14 @@
29#include "power.h" 31#include "power.h"
30 32
31 33
32static int nocompress = 0; 34static int nocompress;
33static int noresume = 0; 35static int noresume;
36static int resume_wait;
37static int resume_delay;
34static char resume_file[256] = CONFIG_PM_STD_PARTITION; 38static char resume_file[256] = CONFIG_PM_STD_PARTITION;
35dev_t swsusp_resume_device; 39dev_t swsusp_resume_device;
36sector_t swsusp_resume_block; 40sector_t swsusp_resume_block;
37int in_suspend __nosavedata = 0; 41int in_suspend __nosavedata;
38 42
39enum { 43enum {
40 HIBERNATION_INVALID, 44 HIBERNATION_INVALID,
@@ -51,6 +55,8 @@ enum {
51 55
52static int hibernation_mode = HIBERNATION_SHUTDOWN; 56static int hibernation_mode = HIBERNATION_SHUTDOWN;
53 57
58static bool freezer_test_done;
59
54static const struct platform_hibernation_ops *hibernation_ops; 60static const struct platform_hibernation_ops *hibernation_ops;
55 61
56/** 62/**
@@ -334,14 +340,31 @@ int hibernation_snapshot(int platform_mode)
334 if (error) 340 if (error)
335 goto Close; 341 goto Close;
336 342
337 error = dpm_prepare(PMSG_FREEZE);
338 if (error)
339 goto Complete_devices;
340
341 /* Preallocate image memory before shutting down devices. */ 343 /* Preallocate image memory before shutting down devices. */
342 error = hibernate_preallocate_memory(); 344 error = hibernate_preallocate_memory();
343 if (error) 345 if (error)
344 goto Complete_devices; 346 goto Close;
347
348 error = freeze_kernel_threads();
349 if (error)
350 goto Cleanup;
351
352 if (hibernation_test(TEST_FREEZER) ||
353 hibernation_testmode(HIBERNATION_TESTPROC)) {
354
355 /*
356 * Indicate to the caller that we are returning due to a
357 * successful freezer test.
358 */
359 freezer_test_done = true;
360 goto Cleanup;
361 }
362
363 error = dpm_prepare(PMSG_FREEZE);
364 if (error) {
365 dpm_complete(msg);
366 goto Cleanup;
367 }
345 368
346 suspend_console(); 369 suspend_console();
347 pm_restrict_gfp_mask(); 370 pm_restrict_gfp_mask();
@@ -370,8 +393,6 @@ int hibernation_snapshot(int platform_mode)
370 pm_restore_gfp_mask(); 393 pm_restore_gfp_mask();
371 394
372 resume_console(); 395 resume_console();
373
374 Complete_devices:
375 dpm_complete(msg); 396 dpm_complete(msg);
376 397
377 Close: 398 Close:
@@ -381,6 +402,10 @@ int hibernation_snapshot(int platform_mode)
381 Recover_platform: 402 Recover_platform:
382 platform_recover(platform_mode); 403 platform_recover(platform_mode);
383 goto Resume_devices; 404 goto Resume_devices;
405
406 Cleanup:
407 swsusp_free();
408 goto Close;
384} 409}
385 410
386/** 411/**
@@ -463,7 +488,7 @@ static int resume_target_kernel(bool platform_mode)
463 * @platform_mode: If set, use platform driver to prepare for the transition. 488 * @platform_mode: If set, use platform driver to prepare for the transition.
464 * 489 *
465 * This routine must be called with pm_mutex held. If it is successful, control 490 * This routine must be called with pm_mutex held. If it is successful, control
466 * reappears in the restored target kernel in hibernation_snaphot(). 491 * reappears in the restored target kernel in hibernation_snapshot().
467 */ 492 */
468int hibernation_restore(int platform_mode) 493int hibernation_restore(int platform_mode)
469{ 494{
@@ -633,15 +658,13 @@ int hibernate(void)
633 if (error) 658 if (error)
634 goto Finish; 659 goto Finish;
635 660
636 if (hibernation_test(TEST_FREEZER))
637 goto Thaw;
638
639 if (hibernation_testmode(HIBERNATION_TESTPROC))
640 goto Thaw;
641
642 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); 661 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
643 if (error) 662 if (error)
644 goto Thaw; 663 goto Thaw;
664 if (freezer_test_done) {
665 freezer_test_done = false;
666 goto Thaw;
667 }
645 668
646 if (in_suspend) { 669 if (in_suspend) {
647 unsigned int flags = 0; 670 unsigned int flags = 0;
@@ -650,6 +673,9 @@ int hibernate(void)
650 flags |= SF_PLATFORM_MODE; 673 flags |= SF_PLATFORM_MODE;
651 if (nocompress) 674 if (nocompress)
652 flags |= SF_NOCOMPRESS_MODE; 675 flags |= SF_NOCOMPRESS_MODE;
676 else
677 flags |= SF_CRC32_MODE;
678
653 pr_debug("PM: writing image.\n"); 679 pr_debug("PM: writing image.\n");
654 error = swsusp_write(flags); 680 error = swsusp_write(flags);
655 swsusp_free(); 681 swsusp_free();
@@ -724,6 +750,12 @@ static int software_resume(void)
724 750
725 pr_debug("PM: Checking hibernation image partition %s\n", resume_file); 751 pr_debug("PM: Checking hibernation image partition %s\n", resume_file);
726 752
753 if (resume_delay) {
754 printk(KERN_INFO "Waiting %dsec before reading resume device...\n",
755 resume_delay);
756 ssleep(resume_delay);
757 }
758
727 /* Check if the device is there */ 759 /* Check if the device is there */
728 swsusp_resume_device = name_to_dev_t(resume_file); 760 swsusp_resume_device = name_to_dev_t(resume_file);
729 if (!swsusp_resume_device) { 761 if (!swsusp_resume_device) {
@@ -732,6 +764,13 @@ static int software_resume(void)
732 * to wait for this to finish. 764 * to wait for this to finish.
733 */ 765 */
734 wait_for_device_probe(); 766 wait_for_device_probe();
767
768 if (resume_wait) {
769 while ((swsusp_resume_device = name_to_dev_t(resume_file)) == 0)
770 msleep(10);
771 async_synchronize_full();
772 }
773
735 /* 774 /*
736 * We can't depend on SCSI devices being available after loading 775 * We can't depend on SCSI devices being available after loading
737 * one of their modules until scsi_complete_async_scans() is 776 * one of their modules until scsi_complete_async_scans() is
@@ -1060,7 +1099,21 @@ static int __init noresume_setup(char *str)
1060 return 1; 1099 return 1;
1061} 1100}
1062 1101
1102static int __init resumewait_setup(char *str)
1103{
1104 resume_wait = 1;
1105 return 1;
1106}
1107
1108static int __init resumedelay_setup(char *str)
1109{
1110 resume_delay = simple_strtoul(str, NULL, 0);
1111 return 1;
1112}
1113
1063__setup("noresume", noresume_setup); 1114__setup("noresume", noresume_setup);
1064__setup("resume_offset=", resume_offset_setup); 1115__setup("resume_offset=", resume_offset_setup);
1065__setup("resume=", resume_setup); 1116__setup("resume=", resume_setup);
1066__setup("hibernate=", hibernate_setup); 1117__setup("hibernate=", hibernate_setup);
1118__setup("resumewait", resumewait_setup);
1119__setup("resumedelay=", resumedelay_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6c601f87196..36e0f0903c3 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -8,10 +8,13 @@
8 * 8 *
9 */ 9 */
10 10
11#include <linux/export.h>
11#include <linux/kobject.h> 12#include <linux/kobject.h>
12#include <linux/string.h> 13#include <linux/string.h>
13#include <linux/resume-trace.h> 14#include <linux/resume-trace.h>
14#include <linux/workqueue.h> 15#include <linux/workqueue.h>
16#include <linux/debugfs.h>
17#include <linux/seq_file.h>
15 18
16#include "power.h" 19#include "power.h"
17 20
@@ -131,6 +134,101 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
131power_attr(pm_test); 134power_attr(pm_test);
132#endif /* CONFIG_PM_DEBUG */ 135#endif /* CONFIG_PM_DEBUG */
133 136
137#ifdef CONFIG_DEBUG_FS
138static char *suspend_step_name(enum suspend_stat_step step)
139{
140 switch (step) {
141 case SUSPEND_FREEZE:
142 return "freeze";
143 case SUSPEND_PREPARE:
144 return "prepare";
145 case SUSPEND_SUSPEND:
146 return "suspend";
147 case SUSPEND_SUSPEND_NOIRQ:
148 return "suspend_noirq";
149 case SUSPEND_RESUME_NOIRQ:
150 return "resume_noirq";
151 case SUSPEND_RESUME:
152 return "resume";
153 default:
154 return "";
155 }
156}
157
158static int suspend_stats_show(struct seq_file *s, void *unused)
159{
160 int i, index, last_dev, last_errno, last_step;
161
162 last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
163 last_dev %= REC_FAILED_NUM;
164 last_errno = suspend_stats.last_failed_errno + REC_FAILED_NUM - 1;
165 last_errno %= REC_FAILED_NUM;
166 last_step = suspend_stats.last_failed_step + REC_FAILED_NUM - 1;
167 last_step %= REC_FAILED_NUM;
168 seq_printf(s, "%s: %d\n%s: %d\n%s: %d\n%s: %d\n"
169 "%s: %d\n%s: %d\n%s: %d\n%s: %d\n",
170 "success", suspend_stats.success,
171 "fail", suspend_stats.fail,
172 "failed_freeze", suspend_stats.failed_freeze,
173 "failed_prepare", suspend_stats.failed_prepare,
174 "failed_suspend", suspend_stats.failed_suspend,
175 "failed_suspend_noirq",
176 suspend_stats.failed_suspend_noirq,
177 "failed_resume", suspend_stats.failed_resume,
178 "failed_resume_noirq",
179 suspend_stats.failed_resume_noirq);
180 seq_printf(s, "failures:\n last_failed_dev:\t%-s\n",
181 suspend_stats.failed_devs[last_dev]);
182 for (i = 1; i < REC_FAILED_NUM; i++) {
183 index = last_dev + REC_FAILED_NUM - i;
184 index %= REC_FAILED_NUM;
185 seq_printf(s, "\t\t\t%-s\n",
186 suspend_stats.failed_devs[index]);
187 }
188 seq_printf(s, " last_failed_errno:\t%-d\n",
189 suspend_stats.errno[last_errno]);
190 for (i = 1; i < REC_FAILED_NUM; i++) {
191 index = last_errno + REC_FAILED_NUM - i;
192 index %= REC_FAILED_NUM;
193 seq_printf(s, "\t\t\t%-d\n",
194 suspend_stats.errno[index]);
195 }
196 seq_printf(s, " last_failed_step:\t%-s\n",
197 suspend_step_name(
198 suspend_stats.failed_steps[last_step]));
199 for (i = 1; i < REC_FAILED_NUM; i++) {
200 index = last_step + REC_FAILED_NUM - i;
201 index %= REC_FAILED_NUM;
202 seq_printf(s, "\t\t\t%-s\n",
203 suspend_step_name(
204 suspend_stats.failed_steps[index]));
205 }
206
207 return 0;
208}
209
210static int suspend_stats_open(struct inode *inode, struct file *file)
211{
212 return single_open(file, suspend_stats_show, NULL);
213}
214
215static const struct file_operations suspend_stats_operations = {
216 .open = suspend_stats_open,
217 .read = seq_read,
218 .llseek = seq_lseek,
219 .release = single_release,
220};
221
222static int __init pm_debugfs_init(void)
223{
224 debugfs_create_file("suspend_stats", S_IFREG | S_IRUGO,
225 NULL, NULL, &suspend_stats_operations);
226 return 0;
227}
228
229late_initcall(pm_debugfs_init);
230#endif /* CONFIG_DEBUG_FS */
231
134#endif /* CONFIG_PM_SLEEP */ 232#endif /* CONFIG_PM_SLEEP */
135 233
136struct kobject *power_kobj; 234struct kobject *power_kobj;
@@ -192,8 +290,14 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
192 if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) 290 if (*s && len == strlen(*s) && !strncmp(buf, *s, len))
193 break; 291 break;
194 } 292 }
195 if (state < PM_SUSPEND_MAX && *s) 293 if (state < PM_SUSPEND_MAX && *s) {
196 error = enter_state(state); 294 error = enter_state(state);
295 if (error) {
296 suspend_stats.fail++;
297 dpm_save_failed_errno(error);
298 } else
299 suspend_stats.success++;
300 }
197#endif 301#endif
198 302
199 Exit: 303 Exit:
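The new debugfs file walks each REC_FAILED_NUM-entry history as a ring buffer: last_failed_* points at the next free slot, so the most recent entry lives at (index + REC_FAILED_NUM - 1) % REC_FAILED_NUM and the loop then steps backwards. A small user-space demo of the index math (hypothetical values, not kernel code):

    #include <stdio.h>

    #define REC_FAILED_NUM 2

    int main(void)
    {
            int errno_buf[REC_FAILED_NUM] = { -16, -11 };  /* filled in insertion order */
            int next_free = 0;   /* like suspend_stats.last_failed_errno after wrapping */
            int last, i, index;

            last = (next_free + REC_FAILED_NUM - 1) % REC_FAILED_NUM;
            printf("last_failed_errno: %d\n", errno_buf[last]);      /* -11, most recent */
            for (i = 1; i < REC_FAILED_NUM; i++) {
                    index = (last + REC_FAILED_NUM - i) % REC_FAILED_NUM;
                    printf("                   %d\n", errno_buf[index]);  /* older entries */
            }
            return 0;
    }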
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 9a00a0a2628..23a2db1ec44 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -146,6 +146,7 @@ extern int swsusp_swap_in_use(void);
146 */ 146 */
147#define SF_PLATFORM_MODE 1 147#define SF_PLATFORM_MODE 1
148#define SF_NOCOMPRESS_MODE 2 148#define SF_NOCOMPRESS_MODE 2
149#define SF_CRC32_MODE 4
149 150
150/* kernel/power/hibernate.c */ 151/* kernel/power/hibernate.c */
151extern int swsusp_check(void); 152extern int swsusp_check(void);
@@ -228,7 +229,8 @@ extern int pm_test_level;
228#ifdef CONFIG_SUSPEND_FREEZER 229#ifdef CONFIG_SUSPEND_FREEZER
229static inline int suspend_freeze_processes(void) 230static inline int suspend_freeze_processes(void)
230{ 231{
231 return freeze_processes(); 232 int error = freeze_processes();
233 return error ? : freeze_kernel_threads();
232} 234}
233 235
234static inline void suspend_thaw_processes(void) 236static inline void suspend_thaw_processes(void)
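suspend_freeze_processes() now chains the two freeze stages with GCC's binary ?: extension: "error ? : freeze_kernel_threads()" evaluates to error when it is non-zero and otherwise to the result of freezing kernel threads, so the second stage only runs if the first succeeded. A tiny user-space demo of the operator (hypothetical, builds with GCC/Clang):

    #include <stdio.h>

    static int step2(void)
    {
            puts("step2 ran");
            return 0;
    }

    static int run(int step1_err)
    {
            return step1_err ? : step2();  /* same as: step1_err ? step1_err : step2() */
    }

    int main(void)
    {
            printf("ok path:  %d\n", run(0));   /* step2 ran, returns 0 */
            printf("err path: %d\n", run(-1));  /* step2 skipped, returns -1 */
            return 0;
    }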
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 0cf3a27a6c9..addbbe5531b 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -135,7 +135,7 @@ static int try_to_freeze_tasks(bool sig_only)
135} 135}
136 136
137/** 137/**
138 * freeze_processes - tell processes to enter the refrigerator 138 * freeze_processes - Signal user space processes to enter the refrigerator.
139 */ 139 */
140int freeze_processes(void) 140int freeze_processes(void)
141{ 141{
@@ -143,20 +143,30 @@ int freeze_processes(void)
143 143
144 printk("Freezing user space processes ... "); 144 printk("Freezing user space processes ... ");
145 error = try_to_freeze_tasks(true); 145 error = try_to_freeze_tasks(true);
146 if (error) 146 if (!error) {
147 goto Exit; 147 printk("done.");
148 printk("done.\n"); 148 oom_killer_disable();
149 }
150 printk("\n");
151 BUG_ON(in_atomic());
152
153 return error;
154}
155
156/**
157 * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator.
158 */
159int freeze_kernel_threads(void)
160{
161 int error;
149 162
150 printk("Freezing remaining freezable tasks ... "); 163 printk("Freezing remaining freezable tasks ... ");
151 error = try_to_freeze_tasks(false); 164 error = try_to_freeze_tasks(false);
152 if (error) 165 if (!error)
153 goto Exit; 166 printk("done.");
154 printk("done.");
155 167
156 oom_killer_disable();
157 Exit:
158 BUG_ON(in_atomic());
159 printk("\n"); 168 printk("\n");
169 BUG_ON(in_atomic());
160 170
161 return error; 171 return error;
162} 172}
diff --git a/kernel/pm_qos_params.c b/kernel/power/qos.c
index 37f05d0f079..995e3bd3417 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/power/qos.c
@@ -29,7 +29,7 @@
29 29
30/*#define DEBUG*/ 30/*#define DEBUG*/
31 31
32#include <linux/pm_qos_params.h> 32#include <linux/pm_qos.h>
33#include <linux/sched.h> 33#include <linux/sched.h>
34#include <linux/spinlock.h> 34#include <linux/spinlock.h>
35#include <linux/slab.h> 35#include <linux/slab.h>
@@ -43,64 +43,61 @@
43#include <linux/kernel.h> 43#include <linux/kernel.h>
44 44
45#include <linux/uaccess.h> 45#include <linux/uaccess.h>
46#include <linux/export.h>
46 47
47/* 48/*
48 * locking rule: all changes to requests or notifiers lists 49 * locking rule: all changes to constraints or notifiers lists
49 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock 50 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
50 * held, taken with _irqsave. One lock to rule them all 51 * held, taken with _irqsave. One lock to rule them all
51 */ 52 */
52enum pm_qos_type {
53 PM_QOS_MAX, /* return the largest value */
54 PM_QOS_MIN /* return the smallest value */
55};
56
57/*
58 * Note: The lockless read path depends on the CPU accessing
59 * target_value atomically. Atomic access is only guaranteed on all CPU
60 * types linux supports for 32 bit quantites
61 */
62struct pm_qos_object { 53struct pm_qos_object {
63 struct plist_head requests; 54 struct pm_qos_constraints *constraints;
64 struct blocking_notifier_head *notifiers;
65 struct miscdevice pm_qos_power_miscdev; 55 struct miscdevice pm_qos_power_miscdev;
66 char *name; 56 char *name;
67 s32 target_value; /* Do not change to 64 bit */
68 s32 default_value;
69 enum pm_qos_type type;
70}; 57};
71 58
72static DEFINE_SPINLOCK(pm_qos_lock); 59static DEFINE_SPINLOCK(pm_qos_lock);
73 60
74static struct pm_qos_object null_pm_qos; 61static struct pm_qos_object null_pm_qos;
62
75static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); 63static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier);
76static struct pm_qos_object cpu_dma_pm_qos = { 64static struct pm_qos_constraints cpu_dma_constraints = {
77 .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests), 65 .list = PLIST_HEAD_INIT(cpu_dma_constraints.list),
78 .notifiers = &cpu_dma_lat_notifier,
79 .name = "cpu_dma_latency",
80 .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, 66 .target_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
81 .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE, 67 .default_value = PM_QOS_CPU_DMA_LAT_DEFAULT_VALUE,
82 .type = PM_QOS_MIN, 68 .type = PM_QOS_MIN,
69 .notifiers = &cpu_dma_lat_notifier,
70};
71static struct pm_qos_object cpu_dma_pm_qos = {
72 .constraints = &cpu_dma_constraints,
73 .name = "cpu_dma_latency",
83}; 74};
84 75
85static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); 76static BLOCKING_NOTIFIER_HEAD(network_lat_notifier);
86static struct pm_qos_object network_lat_pm_qos = { 77static struct pm_qos_constraints network_lat_constraints = {
87 .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests), 78 .list = PLIST_HEAD_INIT(network_lat_constraints.list),
88 .notifiers = &network_lat_notifier,
89 .name = "network_latency",
90 .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, 79 .target_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
91 .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE, 80 .default_value = PM_QOS_NETWORK_LAT_DEFAULT_VALUE,
92 .type = PM_QOS_MIN 81 .type = PM_QOS_MIN,
82 .notifiers = &network_lat_notifier,
83};
84static struct pm_qos_object network_lat_pm_qos = {
85 .constraints = &network_lat_constraints,
86 .name = "network_latency",
93}; 87};
94 88
95 89
96static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); 90static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
97static struct pm_qos_object network_throughput_pm_qos = { 91static struct pm_qos_constraints network_tput_constraints = {
98 .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests), 92 .list = PLIST_HEAD_INIT(network_tput_constraints.list),
99 .notifiers = &network_throughput_notifier,
100 .name = "network_throughput",
101 .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, 93 .target_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
102 .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE, 94 .default_value = PM_QOS_NETWORK_THROUGHPUT_DEFAULT_VALUE,
103 .type = PM_QOS_MAX, 95 .type = PM_QOS_MAX,
96 .notifiers = &network_throughput_notifier,
97};
98static struct pm_qos_object network_throughput_pm_qos = {
99 .constraints = &network_tput_constraints,
100 .name = "network_throughput",
104}; 101};
105 102
106 103
@@ -127,17 +124,17 @@ static const struct file_operations pm_qos_power_fops = {
127}; 124};
128 125
129/* unlocked internal variant */ 126/* unlocked internal variant */
130static inline int pm_qos_get_value(struct pm_qos_object *o) 127static inline int pm_qos_get_value(struct pm_qos_constraints *c)
131{ 128{
132 if (plist_head_empty(&o->requests)) 129 if (plist_head_empty(&c->list))
133 return o->default_value; 130 return c->default_value;
134 131
135 switch (o->type) { 132 switch (c->type) {
136 case PM_QOS_MIN: 133 case PM_QOS_MIN:
137 return plist_first(&o->requests)->prio; 134 return plist_first(&c->list)->prio;
138 135
139 case PM_QOS_MAX: 136 case PM_QOS_MAX:
140 return plist_last(&o->requests)->prio; 137 return plist_last(&c->list)->prio;
141 138
142 default: 139 default:
143 /* runtime check for not using enum */ 140 /* runtime check for not using enum */
@@ -145,69 +142,73 @@ static inline int pm_qos_get_value(struct pm_qos_object *o)
145 } 142 }
146} 143}
147 144
148static inline s32 pm_qos_read_value(struct pm_qos_object *o) 145s32 pm_qos_read_value(struct pm_qos_constraints *c)
149{ 146{
150 return o->target_value; 147 return c->target_value;
151} 148}
152 149
153static inline void pm_qos_set_value(struct pm_qos_object *o, s32 value) 150static inline void pm_qos_set_value(struct pm_qos_constraints *c, s32 value)
154{ 151{
155 o->target_value = value; 152 c->target_value = value;
156} 153}
157 154
158static void update_target(struct pm_qos_object *o, struct plist_node *node, 155/**
159 int del, int value) 156 * pm_qos_update_target - manages the constraints list and calls the notifiers
157 * if needed
158 * @c: constraints data struct
159 * @node: request to add to the list, to update or to remove
160 * @action: action to take on the constraints list
161 * @value: value of the request to add or update
162 *
163 * This function returns 1 if the aggregated constraint value has changed, 0
164 * otherwise.
165 */
166int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
167 enum pm_qos_req_action action, int value)
160{ 168{
161 unsigned long flags; 169 unsigned long flags;
162 int prev_value, curr_value; 170 int prev_value, curr_value, new_value;
163 171
164 spin_lock_irqsave(&pm_qos_lock, flags); 172 spin_lock_irqsave(&pm_qos_lock, flags);
165 prev_value = pm_qos_get_value(o); 173 prev_value = pm_qos_get_value(c);
166 /* PM_QOS_DEFAULT_VALUE is a signal that the value is unchanged */ 174 if (value == PM_QOS_DEFAULT_VALUE)
167 if (value != PM_QOS_DEFAULT_VALUE) { 175 new_value = c->default_value;
176 else
177 new_value = value;
178
179 switch (action) {
180 case PM_QOS_REMOVE_REQ:
181 plist_del(node, &c->list);
182 break;
183 case PM_QOS_UPDATE_REQ:
168 /* 184 /*
169 * to change the list, we atomically remove, reinit 185 * to change the list, we atomically remove, reinit
170 * with new value and add, then see if the extremal 186 * with new value and add, then see if the extremal
171 * changed 187 * changed
172 */ 188 */
173 plist_del(node, &o->requests); 189 plist_del(node, &c->list);
174 plist_node_init(node, value); 190 case PM_QOS_ADD_REQ:
175 plist_add(node, &o->requests); 191 plist_node_init(node, new_value);
176 } else if (del) { 192 plist_add(node, &c->list);
177 plist_del(node, &o->requests); 193 break;
178 } else { 194 default:
179 plist_add(node, &o->requests); 195 /* no action */
196 ;
180 } 197 }
181 curr_value = pm_qos_get_value(o); 198
182 pm_qos_set_value(o, curr_value); 199 curr_value = pm_qos_get_value(c);
200 pm_qos_set_value(c, curr_value);
201
183 spin_unlock_irqrestore(&pm_qos_lock, flags); 202 spin_unlock_irqrestore(&pm_qos_lock, flags);
184 203
185 if (prev_value != curr_value) 204 if (prev_value != curr_value) {
186 blocking_notifier_call_chain(o->notifiers, 205 blocking_notifier_call_chain(c->notifiers,
187 (unsigned long)curr_value, 206 (unsigned long)curr_value,
188 NULL); 207 NULL);
189} 208 return 1;
190 209 } else {
191static int register_pm_qos_misc(struct pm_qos_object *qos) 210 return 0;
192{
193 qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR;
194 qos->pm_qos_power_miscdev.name = qos->name;
195 qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
196
197 return misc_register(&qos->pm_qos_power_miscdev);
198}
199
200static int find_pm_qos_object_by_minor(int minor)
201{
202 int pm_qos_class;
203
204 for (pm_qos_class = 0;
205 pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) {
206 if (minor ==
207 pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor)
208 return pm_qos_class;
209 } 211 }
210 return -1;
211} 212}
212 213
213/** 214/**
@@ -218,11 +219,11 @@ static int find_pm_qos_object_by_minor(int minor)
218 */ 219 */
219int pm_qos_request(int pm_qos_class) 220int pm_qos_request(int pm_qos_class)
220{ 221{
221 return pm_qos_read_value(pm_qos_array[pm_qos_class]); 222 return pm_qos_read_value(pm_qos_array[pm_qos_class]->constraints);
222} 223}
223EXPORT_SYMBOL_GPL(pm_qos_request); 224EXPORT_SYMBOL_GPL(pm_qos_request);
224 225
225int pm_qos_request_active(struct pm_qos_request_list *req) 226int pm_qos_request_active(struct pm_qos_request *req)
226{ 227{
227 return req->pm_qos_class != 0; 228 return req->pm_qos_class != 0;
228} 229}
@@ -230,40 +231,36 @@ EXPORT_SYMBOL_GPL(pm_qos_request_active);
230 231
231/** 232/**
232 * pm_qos_add_request - inserts new qos request into the list 233 * pm_qos_add_request - inserts new qos request into the list
233 * @dep: pointer to a preallocated handle 234 * @req: pointer to a preallocated handle
234 * @pm_qos_class: identifies which list of qos request to use 235 * @pm_qos_class: identifies which list of qos request to use
235 * @value: defines the qos request 236 * @value: defines the qos request
236 * 237 *
237 * This function inserts a new entry in the pm_qos_class list of requested qos 238 * This function inserts a new entry in the pm_qos_class list of requested qos
238 * performance characteristics. It recomputes the aggregate QoS expectations 239 * performance characteristics. It recomputes the aggregate QoS expectations
239 * for the pm_qos_class of parameters and initializes the pm_qos_request_list 240 * for the pm_qos_class of parameters and initializes the pm_qos_request
240 * handle. Caller needs to save this handle for later use in updates and 241 * handle. Caller needs to save this handle for later use in updates and
241 * removal. 242 * removal.
242 */ 243 */
243 244
244void pm_qos_add_request(struct pm_qos_request_list *dep, 245void pm_qos_add_request(struct pm_qos_request *req,
245 int pm_qos_class, s32 value) 246 int pm_qos_class, s32 value)
246{ 247{
247 struct pm_qos_object *o = pm_qos_array[pm_qos_class]; 248 if (!req) /*guard against callers passing in null */
248 int new_value; 249 return;
249 250
250 if (pm_qos_request_active(dep)) { 251 if (pm_qos_request_active(req)) {
251 WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n"); 252 WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n");
252 return; 253 return;
253 } 254 }
254 if (value == PM_QOS_DEFAULT_VALUE) 255 req->pm_qos_class = pm_qos_class;
255 new_value = o->default_value; 256 pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints,
256 else 257 &req->node, PM_QOS_ADD_REQ, value);
257 new_value = value;
258 plist_node_init(&dep->list, new_value);
259 dep->pm_qos_class = pm_qos_class;
260 update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE);
261} 258}
262EXPORT_SYMBOL_GPL(pm_qos_add_request); 259EXPORT_SYMBOL_GPL(pm_qos_add_request);
263 260
264/** 261/**
265 * pm_qos_update_request - modifies an existing qos request 262 * pm_qos_update_request - modifies an existing qos request
266 * @pm_qos_req : handle to list element holding a pm_qos request to use 263 * @req : handle to list element holding a pm_qos request to use
267 * @value: defines the qos request 264 * @value: defines the qos request
268 * 265 *
269 * Updates an existing qos request for the pm_qos_class of parameters along 266 * Updates an existing qos request for the pm_qos_class of parameters along
@@ -271,56 +268,47 @@ EXPORT_SYMBOL_GPL(pm_qos_add_request);
271 * 268 *
272 * Attempts are made to make this code callable on hot code paths. 269 * Attempts are made to make this code callable on hot code paths.
273 */ 270 */
274void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req, 271void pm_qos_update_request(struct pm_qos_request *req,
275 s32 new_value) 272 s32 new_value)
276{ 273{
277 s32 temp; 274 if (!req) /*guard against callers passing in null */
278 struct pm_qos_object *o;
279
280 if (!pm_qos_req) /*guard against callers passing in null */
281 return; 275 return;
282 276
283 if (!pm_qos_request_active(pm_qos_req)) { 277 if (!pm_qos_request_active(req)) {
284 WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n"); 278 WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n");
285 return; 279 return;
286 } 280 }
287 281
288 o = pm_qos_array[pm_qos_req->pm_qos_class]; 282 if (new_value != req->node.prio)
289 283 pm_qos_update_target(
290 if (new_value == PM_QOS_DEFAULT_VALUE) 284 pm_qos_array[req->pm_qos_class]->constraints,
291 temp = o->default_value; 285 &req->node, PM_QOS_UPDATE_REQ, new_value);
292 else
293 temp = new_value;
294
295 if (temp != pm_qos_req->list.prio)
296 update_target(o, &pm_qos_req->list, 0, temp);
297} 286}
298EXPORT_SYMBOL_GPL(pm_qos_update_request); 287EXPORT_SYMBOL_GPL(pm_qos_update_request);
299 288
300/** 289/**
301 * pm_qos_remove_request - modifies an existing qos request 290 * pm_qos_remove_request - modifies an existing qos request
302 * @pm_qos_req: handle to request list element 291 * @req: handle to request list element
303 * 292 *
304 * Will remove pm qos request from the list of requests and 293 * Will remove pm qos request from the list of constraints and
305 * recompute the current target value for the pm_qos_class. Call this 294 * recompute the current target value for the pm_qos_class. Call this
306 * on slow code paths. 295 * on slow code paths.
307 */ 296 */
308void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req) 297void pm_qos_remove_request(struct pm_qos_request *req)
309{ 298{
310 struct pm_qos_object *o; 299 if (!req) /*guard against callers passing in null */
311
312 if (pm_qos_req == NULL)
313 return; 300 return;
314 /* silent return to keep pcm code cleaner */ 301 /* silent return to keep pcm code cleaner */
315 302
316 if (!pm_qos_request_active(pm_qos_req)) { 303 if (!pm_qos_request_active(req)) {
317 WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n"); 304 WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n");
318 return; 305 return;
319 } 306 }
320 307
321 o = pm_qos_array[pm_qos_req->pm_qos_class]; 308 pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
322 update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE); 309 &req->node, PM_QOS_REMOVE_REQ,
323 memset(pm_qos_req, 0, sizeof(*pm_qos_req)); 310 PM_QOS_DEFAULT_VALUE);
311 memset(req, 0, sizeof(*req));
324} 312}
325EXPORT_SYMBOL_GPL(pm_qos_remove_request); 313EXPORT_SYMBOL_GPL(pm_qos_remove_request);
326 314
@@ -337,7 +325,8 @@ int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
337 int retval; 325 int retval;
338 326
339 retval = blocking_notifier_chain_register( 327 retval = blocking_notifier_chain_register(
340 pm_qos_array[pm_qos_class]->notifiers, notifier); 328 pm_qos_array[pm_qos_class]->constraints->notifiers,
329 notifier);
341 330
342 return retval; 331 return retval;
343} 332}
@@ -356,34 +345,57 @@ int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
356 int retval; 345 int retval;
357 346
358 retval = blocking_notifier_chain_unregister( 347 retval = blocking_notifier_chain_unregister(
359 pm_qos_array[pm_qos_class]->notifiers, notifier); 348 pm_qos_array[pm_qos_class]->constraints->notifiers,
349 notifier);
360 350
361 return retval; 351 return retval;
362} 352}
363EXPORT_SYMBOL_GPL(pm_qos_remove_notifier); 353EXPORT_SYMBOL_GPL(pm_qos_remove_notifier);
364 354
355/* User space interface to PM QoS classes via misc devices */
356static int register_pm_qos_misc(struct pm_qos_object *qos)
357{
358 qos->pm_qos_power_miscdev.minor = MISC_DYNAMIC_MINOR;
359 qos->pm_qos_power_miscdev.name = qos->name;
360 qos->pm_qos_power_miscdev.fops = &pm_qos_power_fops;
361
362 return misc_register(&qos->pm_qos_power_miscdev);
363}
364
365static int find_pm_qos_object_by_minor(int minor)
366{
367 int pm_qos_class;
368
369 for (pm_qos_class = 0;
370 pm_qos_class < PM_QOS_NUM_CLASSES; pm_qos_class++) {
371 if (minor ==
372 pm_qos_array[pm_qos_class]->pm_qos_power_miscdev.minor)
373 return pm_qos_class;
374 }
375 return -1;
376}
377
365static int pm_qos_power_open(struct inode *inode, struct file *filp) 378static int pm_qos_power_open(struct inode *inode, struct file *filp)
366{ 379{
367 long pm_qos_class; 380 long pm_qos_class;
368 381
369 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 382 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
370 if (pm_qos_class >= 0) { 383 if (pm_qos_class >= 0) {
371 struct pm_qos_request_list *req = kzalloc(sizeof(*req), GFP_KERNEL); 384 struct pm_qos_request *req = kzalloc(sizeof(*req), GFP_KERNEL);
372 if (!req) 385 if (!req)
373 return -ENOMEM; 386 return -ENOMEM;
374 387
375 pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE); 388 pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE);
376 filp->private_data = req; 389 filp->private_data = req;
377 390
378 if (filp->private_data) 391 return 0;
379 return 0;
380 } 392 }
381 return -EPERM; 393 return -EPERM;
382} 394}
383 395
384static int pm_qos_power_release(struct inode *inode, struct file *filp) 396static int pm_qos_power_release(struct inode *inode, struct file *filp)
385{ 397{
386 struct pm_qos_request_list *req; 398 struct pm_qos_request *req;
387 399
388 req = filp->private_data; 400 req = filp->private_data;
389 pm_qos_remove_request(req); 401 pm_qos_remove_request(req);
@@ -398,17 +410,15 @@ static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
398{ 410{
399 s32 value; 411 s32 value;
400 unsigned long flags; 412 unsigned long flags;
401 struct pm_qos_object *o; 413 struct pm_qos_request *req = filp->private_data;
402 struct pm_qos_request_list *pm_qos_req = filp->private_data;
403 414
404 if (!pm_qos_req) 415 if (!req)
405 return -EINVAL; 416 return -EINVAL;
406 if (!pm_qos_request_active(pm_qos_req)) 417 if (!pm_qos_request_active(req))
407 return -EINVAL; 418 return -EINVAL;
408 419
409 o = pm_qos_array[pm_qos_req->pm_qos_class];
410 spin_lock_irqsave(&pm_qos_lock, flags); 420 spin_lock_irqsave(&pm_qos_lock, flags);
411 value = pm_qos_get_value(o); 421 value = pm_qos_get_value(pm_qos_array[req->pm_qos_class]->constraints);
412 spin_unlock_irqrestore(&pm_qos_lock, flags); 422 spin_unlock_irqrestore(&pm_qos_lock, flags);
413 423
414 return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32)); 424 return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32));
@@ -418,7 +428,7 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
418 size_t count, loff_t *f_pos) 428 size_t count, loff_t *f_pos)
419{ 429{
420 s32 value; 430 s32 value;
421 struct pm_qos_request_list *pm_qos_req; 431 struct pm_qos_request *req;
422 432
423 if (count == sizeof(s32)) { 433 if (count == sizeof(s32)) {
424 if (copy_from_user(&value, buf, sizeof(s32))) 434 if (copy_from_user(&value, buf, sizeof(s32)))
@@ -449,8 +459,8 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
449 return -EINVAL; 459 return -EINVAL;
450 } 460 }
451 461
452 pm_qos_req = filp->private_data; 462 req = filp->private_data;
453 pm_qos_update_request(pm_qos_req, value); 463 pm_qos_update_request(req, value);
454 464
455 return count; 465 return count;
456} 466}
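With the rework, the per-class plists move into struct pm_qos_constraints and all list manipulation funnels through pm_qos_update_target() with an explicit add/update/remove action, while callers hold a struct pm_qos_request. From a driver's point of view the request API keeps the same shape; a hedged kernel-style sketch (the handle name and latency values are hypothetical, calls shown out of context):

    static struct pm_qos_request my_dma_req;    /* hypothetical driver-owned handle */

    /* cap CPU DMA latency at 20 us while the device is streaming */
    pm_qos_add_request(&my_dma_req, PM_QOS_CPU_DMA_LATENCY, 20);

    /* relax the bound when load drops */
    pm_qos_update_request(&my_dma_req, 200);

    /* drop the constraint entirely when the device goes idle */
    pm_qos_remove_request(&my_dma_req);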
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 06efa54f93d..cbe2c144139 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1339,6 +1339,9 @@ int hibernate_preallocate_memory(void)
1339 count += highmem; 1339 count += highmem;
1340 count -= totalreserve_pages; 1340 count -= totalreserve_pages;
1341 1341
1342 /* Add number of pages required for page keys (s390 only). */
1343 size += page_key_additional_pages(saveable);
1344
1342 /* Compute the maximum number of saveable pages to leave in memory. */ 1345 /* Compute the maximum number of saveable pages to leave in memory. */
1343 max_size = (count - (size + PAGES_FOR_IO)) / 2 1346 max_size = (count - (size + PAGES_FOR_IO)) / 2
1344 - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE); 1347 - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE);
@@ -1662,6 +1665,8 @@ pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
1662 buf[j] = memory_bm_next_pfn(bm); 1665 buf[j] = memory_bm_next_pfn(bm);
1663 if (unlikely(buf[j] == BM_END_OF_MAP)) 1666 if (unlikely(buf[j] == BM_END_OF_MAP))
1664 break; 1667 break;
1668 /* Save page key for data page (s390 only). */
1669 page_key_read(buf + j);
1665 } 1670 }
1666} 1671}
1667 1672
@@ -1821,6 +1826,9 @@ static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
1821 if (unlikely(buf[j] == BM_END_OF_MAP)) 1826 if (unlikely(buf[j] == BM_END_OF_MAP))
1822 break; 1827 break;
1823 1828
1829 /* Extract and buffer page key for data page (s390 only). */
1830 page_key_memorize(buf + j);
1831
1824 if (memory_bm_pfn_present(bm, buf[j])) 1832 if (memory_bm_pfn_present(bm, buf[j]))
1825 memory_bm_set_bit(bm, buf[j]); 1833 memory_bm_set_bit(bm, buf[j]);
1826 else 1834 else
@@ -2223,6 +2231,11 @@ int snapshot_write_next(struct snapshot_handle *handle)
2223 if (error) 2231 if (error)
2224 return error; 2232 return error;
2225 2233
2234 /* Allocate buffer for page keys. */
2235 error = page_key_alloc(nr_copy_pages);
2236 if (error)
2237 return error;
2238
2226 } else if (handle->cur <= nr_meta_pages + 1) { 2239 } else if (handle->cur <= nr_meta_pages + 1) {
2227 error = unpack_orig_pfns(buffer, &copy_bm); 2240 error = unpack_orig_pfns(buffer, &copy_bm);
2228 if (error) 2241 if (error)
@@ -2243,6 +2256,8 @@ int snapshot_write_next(struct snapshot_handle *handle)
2243 } 2256 }
2244 } else { 2257 } else {
2245 copy_last_highmem_page(); 2258 copy_last_highmem_page();
2259 /* Restore page key for data page (s390 only). */
2260 page_key_write(handle->buffer);
2246 handle->buffer = get_buffer(&orig_bm, &ca); 2261 handle->buffer = get_buffer(&orig_bm, &ca);
2247 if (IS_ERR(handle->buffer)) 2262 if (IS_ERR(handle->buffer))
2248 return PTR_ERR(handle->buffer); 2263 return PTR_ERR(handle->buffer);
@@ -2264,6 +2279,9 @@ int snapshot_write_next(struct snapshot_handle *handle)
2264void snapshot_write_finalize(struct snapshot_handle *handle) 2279void snapshot_write_finalize(struct snapshot_handle *handle)
2265{ 2280{
2266 copy_last_highmem_page(); 2281 copy_last_highmem_page();
2282 /* Restore page key for data page (s390 only). */
2283 page_key_write(handle->buffer);
2284 page_key_free();
2267 /* Free only if we have loaded the image entirely */ 2285 /* Free only if we have loaded the image entirely */
2268 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) { 2286 if (handle->cur > 1 && handle->cur > nr_meta_pages + nr_copy_pages) {
2269 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR); 2287 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index b6b71ad2208..4953dc054c5 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -12,6 +12,7 @@
12#include <linux/delay.h> 12#include <linux/delay.h>
13#include <linux/errno.h> 13#include <linux/errno.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/kmod.h>
15#include <linux/console.h> 16#include <linux/console.h>
16#include <linux/cpu.h> 17#include <linux/cpu.h>
17#include <linux/syscalls.h> 18#include <linux/syscalls.h>
@@ -21,6 +22,7 @@
21#include <linux/list.h> 22#include <linux/list.h>
22#include <linux/mm.h> 23#include <linux/mm.h>
23#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/export.h>
24#include <linux/suspend.h> 26#include <linux/suspend.h>
25#include <linux/syscore_ops.h> 27#include <linux/syscore_ops.h>
26#include <trace/events/power.h> 28#include <trace/events/power.h>
@@ -104,7 +106,10 @@ static int suspend_prepare(void)
104 goto Finish; 106 goto Finish;
105 107
106 error = suspend_freeze_processes(); 108 error = suspend_freeze_processes();
107 if (!error) 109 if (error) {
110 suspend_stats.failed_freeze++;
111 dpm_save_failed_step(SUSPEND_FREEZE);
112 } else
108 return 0; 113 return 0;
109 114
110 suspend_thaw_processes(); 115 suspend_thaw_processes();
@@ -315,8 +320,16 @@ int enter_state(suspend_state_t state)
315 */ 320 */
316int pm_suspend(suspend_state_t state) 321int pm_suspend(suspend_state_t state)
317{ 322{
318 if (state > PM_SUSPEND_ON && state <= PM_SUSPEND_MAX) 323 int ret;
319 return enter_state(state); 324 if (state > PM_SUSPEND_ON && state < PM_SUSPEND_MAX) {
325 ret = enter_state(state);
326 if (ret) {
327 suspend_stats.fail++;
328 dpm_save_failed_errno(ret);
329 } else
330 suspend_stats.success++;
331 return ret;
332 }
320 return -EINVAL; 333 return -EINVAL;
321} 334}
322EXPORT_SYMBOL(pm_suspend); 335EXPORT_SYMBOL(pm_suspend);
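pm_suspend() now rejects PM_SUSPEND_MAX itself (state < PM_SUSPEND_MAX instead of <=), since the *_MAX enumerator is a sentinel one past the last valid state, and it records success/failure in suspend_stats. A small demo of the sentinel-range idiom (user-space, hypothetical enum names):

    #include <stdbool.h>
    #include <stdio.h>

    enum demo_state {
            DEMO_ON,        /* not a sleep state */
            DEMO_STANDBY,
            DEMO_MEM,
            DEMO_MAX        /* sentinel: one past the last valid state */
    };

    static bool valid_sleep_state(enum demo_state state)
    {
            return state > DEMO_ON && state < DEMO_MAX;
    }

    int main(void)
    {
            printf("%d %d %d\n",
                   valid_sleep_state(DEMO_MEM),   /* 1 */
                   valid_sleep_state(DEMO_MAX),   /* 0: sentinel rejected */
                   valid_sleep_state(DEMO_ON));   /* 0 */
            return 0;
    }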
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 7c97c3a0eee..11a594c4ba2 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -27,6 +27,10 @@
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/lzo.h> 28#include <linux/lzo.h>
29#include <linux/vmalloc.h> 29#include <linux/vmalloc.h>
30#include <linux/cpumask.h>
31#include <linux/atomic.h>
32#include <linux/kthread.h>
33#include <linux/crc32.h>
30 34
31#include "power.h" 35#include "power.h"
32 36
@@ -43,8 +47,7 @@
43 * allocated and populated one at a time, so we only need one memory 47 * allocated and populated one at a time, so we only need one memory
44 * page to set up the entire structure. 48 * page to set up the entire structure.
45 * 49 *
46 * During resume we also only need to use one swap_map_page structure 50 * During resume we pick up all swap_map_page structures into a list.
47 * at a time.
48 */ 51 */
49 52
50#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1) 53#define MAP_PAGE_ENTRIES (PAGE_SIZE / sizeof(sector_t) - 1)
@@ -54,6 +57,11 @@ struct swap_map_page {
54 sector_t next_swap; 57 sector_t next_swap;
55}; 58};
56 59
60struct swap_map_page_list {
61 struct swap_map_page *map;
62 struct swap_map_page_list *next;
63};
64
57/** 65/**
58 * The swap_map_handle structure is used for handling swap in 66 * The swap_map_handle structure is used for handling swap in
59 * a file-alike way 67 * a file-alike way
@@ -61,13 +69,18 @@ struct swap_map_page {
61 69
62struct swap_map_handle { 70struct swap_map_handle {
63 struct swap_map_page *cur; 71 struct swap_map_page *cur;
72 struct swap_map_page_list *maps;
64 sector_t cur_swap; 73 sector_t cur_swap;
65 sector_t first_sector; 74 sector_t first_sector;
66 unsigned int k; 75 unsigned int k;
76 unsigned long nr_free_pages, written;
77 u32 crc32;
67}; 78};
68 79
69struct swsusp_header { 80struct swsusp_header {
70 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int)]; 81 char reserved[PAGE_SIZE - 20 - sizeof(sector_t) - sizeof(int) -
82 sizeof(u32)];
83 u32 crc32;
71 sector_t image; 84 sector_t image;
72 unsigned int flags; /* Flags to pass to the "boot" kernel */ 85 unsigned int flags; /* Flags to pass to the "boot" kernel */
73 char orig_sig[10]; 86 char orig_sig[10];
@@ -199,6 +212,8 @@ static int mark_swapfiles(struct swap_map_handle *handle, unsigned int flags)
199 memcpy(swsusp_header->sig, HIBERNATE_SIG, 10); 212 memcpy(swsusp_header->sig, HIBERNATE_SIG, 10);
200 swsusp_header->image = handle->first_sector; 213 swsusp_header->image = handle->first_sector;
201 swsusp_header->flags = flags; 214 swsusp_header->flags = flags;
215 if (flags & SF_CRC32_MODE)
216 swsusp_header->crc32 = handle->crc32;
202 error = hib_bio_write_page(swsusp_resume_block, 217 error = hib_bio_write_page(swsusp_resume_block,
203 swsusp_header, NULL); 218 swsusp_header, NULL);
204 } else { 219 } else {
@@ -245,6 +260,7 @@ static int swsusp_swap_check(void)
245static int write_page(void *buf, sector_t offset, struct bio **bio_chain) 260static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
246{ 261{
247 void *src; 262 void *src;
263 int ret;
248 264
249 if (!offset) 265 if (!offset)
250 return -ENOSPC; 266 return -ENOSPC;
@@ -254,9 +270,17 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
254 if (src) { 270 if (src) {
255 copy_page(src, buf); 271 copy_page(src, buf);
256 } else { 272 } else {
257 WARN_ON_ONCE(1); 273 ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */
258 bio_chain = NULL; /* Go synchronous */ 274 if (ret)
259 src = buf; 275 return ret;
276 src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
277 if (src) {
278 copy_page(src, buf);
279 } else {
280 WARN_ON_ONCE(1);
281 bio_chain = NULL; /* Go synchronous */
282 src = buf;
283 }
260 } 284 }
261 } else { 285 } else {
262 src = buf; 286 src = buf;
@@ -293,6 +317,8 @@ static int get_swap_writer(struct swap_map_handle *handle)
293 goto err_rel; 317 goto err_rel;
294 } 318 }
295 handle->k = 0; 319 handle->k = 0;
320 handle->nr_free_pages = nr_free_pages() >> 1;
321 handle->written = 0;
296 handle->first_sector = handle->cur_swap; 322 handle->first_sector = handle->cur_swap;
297 return 0; 323 return 0;
298err_rel: 324err_rel:
@@ -316,20 +342,23 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
316 return error; 342 return error;
317 handle->cur->entries[handle->k++] = offset; 343 handle->cur->entries[handle->k++] = offset;
318 if (handle->k >= MAP_PAGE_ENTRIES) { 344 if (handle->k >= MAP_PAGE_ENTRIES) {
319 error = hib_wait_on_bio_chain(bio_chain);
320 if (error)
321 goto out;
322 offset = alloc_swapdev_block(root_swap); 345 offset = alloc_swapdev_block(root_swap);
323 if (!offset) 346 if (!offset)
324 return -ENOSPC; 347 return -ENOSPC;
325 handle->cur->next_swap = offset; 348 handle->cur->next_swap = offset;
326 error = write_page(handle->cur, handle->cur_swap, NULL); 349 error = write_page(handle->cur, handle->cur_swap, bio_chain);
327 if (error) 350 if (error)
328 goto out; 351 goto out;
329 clear_page(handle->cur); 352 clear_page(handle->cur);
330 handle->cur_swap = offset; 353 handle->cur_swap = offset;
331 handle->k = 0; 354 handle->k = 0;
332 } 355 }
356 if (bio_chain && ++handle->written > handle->nr_free_pages) {
357 error = hib_wait_on_bio_chain(bio_chain);
358 if (error)
359 goto out;
360 handle->written = 0;
361 }
333 out: 362 out:
334 return error; 363 return error;
335} 364}
@@ -372,6 +401,13 @@ static int swap_writer_finish(struct swap_map_handle *handle,
372 LZO_HEADER, PAGE_SIZE) 401 LZO_HEADER, PAGE_SIZE)
373#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE) 402#define LZO_CMP_SIZE (LZO_CMP_PAGES * PAGE_SIZE)
374 403
404/* Maximum number of threads for compression/decompression. */
405#define LZO_THREADS 3
406
407/* Maximum number of pages for read buffering. */
408#define LZO_READ_PAGES (MAP_PAGE_ENTRIES * 8)
409
410
375/** 411/**
376 * save_image - save the suspend image data 412 * save_image - save the suspend image data
377 */ 413 */
@@ -419,6 +455,92 @@ static int save_image(struct swap_map_handle *handle,
419 return ret; 455 return ret;
420} 456}
421 457
458/**
459 * Structure used for CRC32.
460 */
461struct crc_data {
462 struct task_struct *thr; /* thread */
463 atomic_t ready; /* ready to start flag */
464 atomic_t stop; /* ready to stop flag */
465 unsigned run_threads; /* nr current threads */
466 wait_queue_head_t go; /* start crc update */
467 wait_queue_head_t done; /* crc update done */
468 u32 *crc32; /* points to handle's crc32 */
469 size_t *unc_len[LZO_THREADS]; /* uncompressed lengths */
470 unsigned char *unc[LZO_THREADS]; /* uncompressed data */
471};
472
473/**
474 * CRC32 update function that runs in its own thread.
475 */
476static int crc32_threadfn(void *data)
477{
478 struct crc_data *d = data;
479 unsigned i;
480
481 while (1) {
482 wait_event(d->go, atomic_read(&d->ready) ||
483 kthread_should_stop());
484 if (kthread_should_stop()) {
485 d->thr = NULL;
486 atomic_set(&d->stop, 1);
487 wake_up(&d->done);
488 break;
489 }
490 atomic_set(&d->ready, 0);
491
492 for (i = 0; i < d->run_threads; i++)
493 *d->crc32 = crc32_le(*d->crc32,
494 d->unc[i], *d->unc_len[i]);
495 atomic_set(&d->stop, 1);
496 wake_up(&d->done);
497 }
498 return 0;
499}
500/**
501 * Structure used for LZO data compression.
502 */
503struct cmp_data {
504 struct task_struct *thr; /* thread */
505 atomic_t ready; /* ready to start flag */
506 atomic_t stop; /* ready to stop flag */
507 int ret; /* return code */
508 wait_queue_head_t go; /* start compression */
509 wait_queue_head_t done; /* compression done */
510 size_t unc_len; /* uncompressed length */
511 size_t cmp_len; /* compressed length */
512 unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */
513 unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */
514 unsigned char wrk[LZO1X_1_MEM_COMPRESS]; /* compression workspace */
515};
516
517/**
518 * Compression function that runs in its own thread.
519 */
520static int lzo_compress_threadfn(void *data)
521{
522 struct cmp_data *d = data;
523
524 while (1) {
525 wait_event(d->go, atomic_read(&d->ready) ||
526 kthread_should_stop());
527 if (kthread_should_stop()) {
528 d->thr = NULL;
529 d->ret = -1;
530 atomic_set(&d->stop, 1);
531 wake_up(&d->done);
532 break;
533 }
534 atomic_set(&d->ready, 0);
535
536 d->ret = lzo1x_1_compress(d->unc, d->unc_len,
537 d->cmp + LZO_HEADER, &d->cmp_len,
538 d->wrk);
539 atomic_set(&d->stop, 1);
540 wake_up(&d->done);
541 }
542 return 0;
543}
422 544
423/** 545/**
424 * save_image_lzo - Save the suspend image data compressed with LZO. 546 * save_image_lzo - Save the suspend image data compressed with LZO.
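The compression and CRC32 workers share a simple handshake: the producer fills a per-thread buffer, sets the atomic ready flag and wakes the worker's go queue; the worker does its job, sets stop and wakes done; kthread_should_stop() ends the loop. A rough user-space analog of that handshake with one worker (hypothetical pthread demo, squaring a number stands in for lzo1x_1_compress(); build with -pthread):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct worker {
            pthread_t thr;
            pthread_mutex_t lock;
            pthread_cond_t go, done;
            bool ready, stop, exiting;
            int input, result;
    };

    static void *worker_fn(void *arg)
    {
            struct worker *w = arg;

            pthread_mutex_lock(&w->lock);
            for (;;) {
                    while (!w->ready && !w->exiting)        /* wait_event(d->go, ...) analog */
                            pthread_cond_wait(&w->go, &w->lock);
                    if (w->exiting)                         /* kthread_should_stop() analog */
                            break;
                    w->ready = false;
                    w->result = w->input * w->input;        /* stand-in for the real work */
                    w->stop = true;                         /* atomic_set(&d->stop, 1) analog */
                    pthread_cond_signal(&w->done);          /* wake_up(&d->done) analog */
            }
            pthread_mutex_unlock(&w->lock);
            return NULL;
    }

    int main(void)
    {
            struct worker w = { .lock = PTHREAD_MUTEX_INITIALIZER,
                                .go = PTHREAD_COND_INITIALIZER,
                                .done = PTHREAD_COND_INITIALIZER };

            pthread_create(&w.thr, NULL, worker_fn, &w);

            pthread_mutex_lock(&w.lock);
            w.input = 7;
            w.ready = true;                 /* hand work to the thread ... */
            pthread_cond_signal(&w.go);
            while (!w.stop)                 /* ... and wait for completion */
                    pthread_cond_wait(&w.done, &w.lock);
            w.stop = false;
            printf("result: %d\n", w.result);

            w.exiting = true;               /* ask the worker to terminate */
            pthread_cond_signal(&w.go);
            pthread_mutex_unlock(&w.lock);
            pthread_join(w.thr, NULL);
            return 0;
    }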
@@ -437,42 +559,93 @@ static int save_image_lzo(struct swap_map_handle *handle,
437 struct bio *bio; 559 struct bio *bio;
438 struct timeval start; 560 struct timeval start;
439 struct timeval stop; 561 struct timeval stop;
440 size_t off, unc_len, cmp_len; 562 size_t off;
441 unsigned char *unc, *cmp, *wrk, *page; 563 unsigned thr, run_threads, nr_threads;
564 unsigned char *page = NULL;
565 struct cmp_data *data = NULL;
566 struct crc_data *crc = NULL;
567
568 /*
569 * We'll limit the number of threads for compression to limit memory
570 * footprint.
571 */
572 nr_threads = num_online_cpus() - 1;
573 nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
442 574
443 page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 575 page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
444 if (!page) { 576 if (!page) {
445 printk(KERN_ERR "PM: Failed to allocate LZO page\n"); 577 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
446 return -ENOMEM; 578 ret = -ENOMEM;
579 goto out_clean;
447 } 580 }
448 581
449 wrk = vmalloc(LZO1X_1_MEM_COMPRESS); 582 data = vmalloc(sizeof(*data) * nr_threads);
450 if (!wrk) { 583 if (!data) {
451 printk(KERN_ERR "PM: Failed to allocate LZO workspace\n"); 584 printk(KERN_ERR "PM: Failed to allocate LZO data\n");
452 free_page((unsigned long)page); 585 ret = -ENOMEM;
453 return -ENOMEM; 586 goto out_clean;
454 } 587 }
588 for (thr = 0; thr < nr_threads; thr++)
589 memset(&data[thr], 0, offsetof(struct cmp_data, go));
455 590
456 unc = vmalloc(LZO_UNC_SIZE); 591 crc = kmalloc(sizeof(*crc), GFP_KERNEL);
457 if (!unc) { 592 if (!crc) {
458 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); 593 printk(KERN_ERR "PM: Failed to allocate crc\n");
459 vfree(wrk); 594 ret = -ENOMEM;
460 free_page((unsigned long)page); 595 goto out_clean;
461 return -ENOMEM; 596 }
597 memset(crc, 0, offsetof(struct crc_data, go));
598
599 /*
600 * Start the compression threads.
601 */
602 for (thr = 0; thr < nr_threads; thr++) {
603 init_waitqueue_head(&data[thr].go);
604 init_waitqueue_head(&data[thr].done);
605
606 data[thr].thr = kthread_run(lzo_compress_threadfn,
607 &data[thr],
608 "image_compress/%u", thr);
609 if (IS_ERR(data[thr].thr)) {
610 data[thr].thr = NULL;
611 printk(KERN_ERR
612 "PM: Cannot start compression threads\n");
613 ret = -ENOMEM;
614 goto out_clean;
615 }
462 } 616 }
463 617
464 cmp = vmalloc(LZO_CMP_SIZE); 618 /*
465 if (!cmp) { 619 * Adjust number of free pages after all allocations have been done.
466 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); 620 * We don't want to run out of pages when writing.
467 vfree(unc); 621 */
468 vfree(wrk); 622 handle->nr_free_pages = nr_free_pages() >> 1;
469 free_page((unsigned long)page); 623
470 return -ENOMEM; 624 /*
625 * Start the CRC32 thread.
626 */
627 init_waitqueue_head(&crc->go);
628 init_waitqueue_head(&crc->done);
629
630 handle->crc32 = 0;
631 crc->crc32 = &handle->crc32;
632 for (thr = 0; thr < nr_threads; thr++) {
633 crc->unc[thr] = data[thr].unc;
634 crc->unc_len[thr] = &data[thr].unc_len;
635 }
636
637 crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32");
638 if (IS_ERR(crc->thr)) {
639 crc->thr = NULL;
640 printk(KERN_ERR "PM: Cannot start CRC32 thread\n");
641 ret = -ENOMEM;
642 goto out_clean;
471 } 643 }
472 644
473 printk(KERN_INFO 645 printk(KERN_INFO
646 "PM: Using %u thread(s) for compression.\n"
474 "PM: Compressing and saving image data (%u pages) ... ", 647 "PM: Compressing and saving image data (%u pages) ... ",
475 nr_to_write); 648 nr_threads, nr_to_write);
476 m = nr_to_write / 100; 649 m = nr_to_write / 100;
477 if (!m) 650 if (!m)
478 m = 1; 651 m = 1;
@@ -480,55 +653,83 @@ static int save_image_lzo(struct swap_map_handle *handle,
480 bio = NULL; 653 bio = NULL;
481 do_gettimeofday(&start); 654 do_gettimeofday(&start);
482 for (;;) { 655 for (;;) {
483 for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) { 656 for (thr = 0; thr < nr_threads; thr++) {
484 ret = snapshot_read_next(snapshot); 657 for (off = 0; off < LZO_UNC_SIZE; off += PAGE_SIZE) {
485 if (ret < 0) 658 ret = snapshot_read_next(snapshot);
486 goto out_finish; 659 if (ret < 0)
487 660 goto out_finish;
488 if (!ret) 661
662 if (!ret)
663 break;
664
665 memcpy(data[thr].unc + off,
666 data_of(*snapshot), PAGE_SIZE);
667
668 if (!(nr_pages % m))
669 printk(KERN_CONT "\b\b\b\b%3d%%",
670 nr_pages / m);
671 nr_pages++;
672 }
673 if (!off)
489 break; 674 break;
490 675
491 memcpy(unc + off, data_of(*snapshot), PAGE_SIZE); 676 data[thr].unc_len = off;
492 677
493 if (!(nr_pages % m)) 678 atomic_set(&data[thr].ready, 1);
494 printk(KERN_CONT "\b\b\b\b%3d%%", nr_pages / m); 679 wake_up(&data[thr].go);
495 nr_pages++;
496 } 680 }
497 681
498 if (!off) 682 if (!thr)
499 break; 683 break;
500 684
501 unc_len = off; 685 crc->run_threads = thr;
502 ret = lzo1x_1_compress(unc, unc_len, 686 atomic_set(&crc->ready, 1);
503 cmp + LZO_HEADER, &cmp_len, wrk); 687 wake_up(&crc->go);
504 if (ret < 0) {
505 printk(KERN_ERR "PM: LZO compression failed\n");
506 break;
507 }
508 688
509 if (unlikely(!cmp_len || 689 for (run_threads = thr, thr = 0; thr < run_threads; thr++) {
510 cmp_len > lzo1x_worst_compress(unc_len))) { 690 wait_event(data[thr].done,
511 printk(KERN_ERR "PM: Invalid LZO compressed length\n"); 691 atomic_read(&data[thr].stop));
512 ret = -1; 692 atomic_set(&data[thr].stop, 0);
513 break;
514 }
515 693
516 *(size_t *)cmp = cmp_len; 694 ret = data[thr].ret;
517 695
518 /* 696 if (ret < 0) {
519 * Given we are writing one page at a time to disk, we copy 697 printk(KERN_ERR "PM: LZO compression failed\n");
520 * that much from the buffer, although the last bit will likely 698 goto out_finish;
521 * be smaller than full page. This is OK - we saved the length 699 }
522 * of the compressed data, so any garbage at the end will be
523 * discarded when we read it.
524 */
525 for (off = 0; off < LZO_HEADER + cmp_len; off += PAGE_SIZE) {
526 memcpy(page, cmp + off, PAGE_SIZE);
527 700
528 ret = swap_write_page(handle, page, &bio); 701 if (unlikely(!data[thr].cmp_len ||
529 if (ret) 702 data[thr].cmp_len >
703 lzo1x_worst_compress(data[thr].unc_len))) {
704 printk(KERN_ERR
705 "PM: Invalid LZO compressed length\n");
706 ret = -1;
530 goto out_finish; 707 goto out_finish;
708 }
709
710 *(size_t *)data[thr].cmp = data[thr].cmp_len;
711
712 /*
713 * Given we are writing one page at a time to disk, we
714 * copy that much from the buffer, although the last
715 * bit will likely be smaller than full page. This is
716 * OK - we saved the length of the compressed data, so
717 * any garbage at the end will be discarded when we
718 * read it.
719 */
720 for (off = 0;
721 off < LZO_HEADER + data[thr].cmp_len;
722 off += PAGE_SIZE) {
723 memcpy(page, data[thr].cmp + off, PAGE_SIZE);
724
725 ret = swap_write_page(handle, page, &bio);
726 if (ret)
727 goto out_finish;
728 }
531 } 729 }
730
731 wait_event(crc->done, atomic_read(&crc->stop));
732 atomic_set(&crc->stop, 0);
532 } 733 }
533 734
534out_finish: 735out_finish:
@@ -536,16 +737,25 @@ out_finish:
536 do_gettimeofday(&stop); 737 do_gettimeofday(&stop);
537 if (!ret) 738 if (!ret)
538 ret = err2; 739 ret = err2;
539 if (!ret) 740 if (!ret) {
540 printk(KERN_CONT "\b\b\b\bdone\n"); 741 printk(KERN_CONT "\b\b\b\bdone\n");
541 else 742 } else {
542 printk(KERN_CONT "\n"); 743 printk(KERN_CONT "\n");
744 }
543 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote"); 745 swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
544 746out_clean:
545 vfree(cmp); 747 if (crc) {
546 vfree(unc); 748 if (crc->thr)
547 vfree(wrk); 749 kthread_stop(crc->thr);
548 free_page((unsigned long)page); 750 kfree(crc);
751 }
752 if (data) {
753 for (thr = 0; thr < nr_threads; thr++)
754 if (data[thr].thr)
755 kthread_stop(data[thr].thr);
756 vfree(data);
757 }
758 if (page) free_page((unsigned long)page);
549 759
550 return ret; 760 return ret;
551} 761}
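The producer/worker handshake used by save_image_lzo() above is lock-free: each compression thread sleeps on its own "go" wait queue until an atomic ready flag is set, and reports completion through a matching stop flag and "done" queue. Below is a minimal sketch of that pattern; the struct and function names are placeholders, only the wait_event()/wake_up()/atomic_* usage mirrors the code above.

/*
 * Minimal sketch of the ready/stop handshake used by the compression
 * threads above.  "work_data", worker_threadfn() and do_one_unit()
 * are hypothetical names, not part of kernel/power/swap.c.
 */
#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/atomic.h>

struct work_data {
        struct task_struct *thr;        /* worker thread */
        atomic_t ready;                 /* work handed off */
        atomic_t stop;                  /* work finished */
        wait_queue_head_t go;           /* producer -> worker */
        wait_queue_head_t done;         /* worker -> producer */
};

static void do_one_unit(struct work_data *d) { /* ... */ }

static int worker_threadfn(void *arg)
{
        struct work_data *d = arg;

        while (1) {
                wait_event(d->go, atomic_read(&d->ready) ||
                                  kthread_should_stop());
                if (kthread_should_stop())
                        break;
                atomic_set(&d->ready, 0);
                do_one_unit(d);
                atomic_set(&d->stop, 1);
                wake_up(&d->done);
        }
        return 0;
}

/* Producer side: hand one unit to the worker, then wait for it. */
static void kick_and_wait(struct work_data *d)
{
        atomic_set(&d->ready, 1);
        wake_up(&d->go);

        wait_event(d->done, atomic_read(&d->stop));
        atomic_set(&d->stop, 0);
}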
@@ -625,8 +835,15 @@ out_finish:
625 835
626static void release_swap_reader(struct swap_map_handle *handle) 836static void release_swap_reader(struct swap_map_handle *handle)
627{ 837{
628 if (handle->cur) 838 struct swap_map_page_list *tmp;
629 free_page((unsigned long)handle->cur); 839
840 while (handle->maps) {
841 if (handle->maps->map)
842 free_page((unsigned long)handle->maps->map);
843 tmp = handle->maps;
844 handle->maps = handle->maps->next;
845 kfree(tmp);
846 }
630 handle->cur = NULL; 847 handle->cur = NULL;
631} 848}
632 849
@@ -634,22 +851,46 @@ static int get_swap_reader(struct swap_map_handle *handle,
634 unsigned int *flags_p) 851 unsigned int *flags_p)
635{ 852{
636 int error; 853 int error;
854 struct swap_map_page_list *tmp, *last;
855 sector_t offset;
637 856
638 *flags_p = swsusp_header->flags; 857 *flags_p = swsusp_header->flags;
639 858
640 if (!swsusp_header->image) /* how can this happen? */ 859 if (!swsusp_header->image) /* how can this happen? */
641 return -EINVAL; 860 return -EINVAL;
642 861
643 handle->cur = (struct swap_map_page *)get_zeroed_page(__GFP_WAIT | __GFP_HIGH); 862 handle->cur = NULL;
644 if (!handle->cur) 863 last = handle->maps = NULL;
645 return -ENOMEM; 864 offset = swsusp_header->image;
865 while (offset) {
866 tmp = kmalloc(sizeof(*handle->maps), GFP_KERNEL);
867 if (!tmp) {
868 release_swap_reader(handle);
869 return -ENOMEM;
870 }
871 memset(tmp, 0, sizeof(*tmp));
872 if (!handle->maps)
873 handle->maps = tmp;
874 if (last)
875 last->next = tmp;
876 last = tmp;
877
878 tmp->map = (struct swap_map_page *)
879 __get_free_page(__GFP_WAIT | __GFP_HIGH);
880 if (!tmp->map) {
881 release_swap_reader(handle);
882 return -ENOMEM;
883 }
646 884
647 error = hib_bio_read_page(swsusp_header->image, handle->cur, NULL); 885 error = hib_bio_read_page(offset, tmp->map, NULL);
648 if (error) { 886 if (error) {
649 release_swap_reader(handle); 887 release_swap_reader(handle);
650 return error; 888 return error;
889 }
890 offset = tmp->map->next_swap;
651 } 891 }
652 handle->k = 0; 892 handle->k = 0;
893 handle->cur = handle->maps->map;
653 return 0; 894 return 0;
654} 895}
655 896
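With the change above, get_swap_reader() walks the on-disk chain of swap map pages once, up front, and strings them into an in-memory singly linked list; swap_read_page() can then free each node as soon as its entries are consumed instead of synchronously re-reading the next map page mid-image. A sketch of the append-to-tail construction used here, with the node layout inferred from its use above (->map page plus ->next link):

/* Node layout as used by get_swap_reader()/swap_read_page() above. */
struct swap_map_page_list {
        struct swap_map_page *map;
        struct swap_map_page_list *next;
};

/*
 * Append one node while keeping head and tail pointers consistent,
 * mirroring the "if (!handle->maps) ... if (last) last->next = tmp"
 * sequence above.  Illustrative helper only, not part of the patch.
 */
static void map_list_append(struct swap_map_page_list **head,
                            struct swap_map_page_list **tail,
                            struct swap_map_page_list *node)
{
        node->next = NULL;
        if (!*head)
                *head = node;
        if (*tail)
                (*tail)->next = node;
        *tail = node;
}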
@@ -658,6 +899,7 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
658{ 899{
659 sector_t offset; 900 sector_t offset;
660 int error; 901 int error;
902 struct swap_map_page_list *tmp;
661 903
662 if (!handle->cur) 904 if (!handle->cur)
663 return -EINVAL; 905 return -EINVAL;
@@ -668,13 +910,15 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf,
668 if (error) 910 if (error)
669 return error; 911 return error;
670 if (++handle->k >= MAP_PAGE_ENTRIES) { 912 if (++handle->k >= MAP_PAGE_ENTRIES) {
671 error = hib_wait_on_bio_chain(bio_chain);
672 handle->k = 0; 913 handle->k = 0;
673 offset = handle->cur->next_swap; 914 free_page((unsigned long)handle->maps->map);
674 if (!offset) 915 tmp = handle->maps;
916 handle->maps = handle->maps->next;
917 kfree(tmp);
918 if (!handle->maps)
675 release_swap_reader(handle); 919 release_swap_reader(handle);
676 else if (!error) 920 else
677 error = hib_bio_read_page(offset, handle->cur, NULL); 921 handle->cur = handle->maps->map;
678 } 922 }
679 return error; 923 return error;
680} 924}
@@ -697,7 +941,7 @@ static int load_image(struct swap_map_handle *handle,
697 unsigned int nr_to_read) 941 unsigned int nr_to_read)
698{ 942{
699 unsigned int m; 943 unsigned int m;
700 int error = 0; 944 int ret = 0;
701 struct timeval start; 945 struct timeval start;
702 struct timeval stop; 946 struct timeval stop;
703 struct bio *bio; 947 struct bio *bio;
@@ -713,15 +957,15 @@ static int load_image(struct swap_map_handle *handle,
713 bio = NULL; 957 bio = NULL;
714 do_gettimeofday(&start); 958 do_gettimeofday(&start);
715 for ( ; ; ) { 959 for ( ; ; ) {
716 error = snapshot_write_next(snapshot); 960 ret = snapshot_write_next(snapshot);
717 if (error <= 0) 961 if (ret <= 0)
718 break; 962 break;
719 error = swap_read_page(handle, data_of(*snapshot), &bio); 963 ret = swap_read_page(handle, data_of(*snapshot), &bio);
720 if (error) 964 if (ret)
721 break; 965 break;
722 if (snapshot->sync_read) 966 if (snapshot->sync_read)
723 error = hib_wait_on_bio_chain(&bio); 967 ret = hib_wait_on_bio_chain(&bio);
724 if (error) 968 if (ret)
725 break; 969 break;
726 if (!(nr_pages % m)) 970 if (!(nr_pages % m))
727 printk("\b\b\b\b%3d%%", nr_pages / m); 971 printk("\b\b\b\b%3d%%", nr_pages / m);
@@ -729,17 +973,61 @@ static int load_image(struct swap_map_handle *handle,
729 } 973 }
730 err2 = hib_wait_on_bio_chain(&bio); 974 err2 = hib_wait_on_bio_chain(&bio);
731 do_gettimeofday(&stop); 975 do_gettimeofday(&stop);
732 if (!error) 976 if (!ret)
733 error = err2; 977 ret = err2;
734 if (!error) { 978 if (!ret) {
735 printk("\b\b\b\bdone\n"); 979 printk("\b\b\b\bdone\n");
736 snapshot_write_finalize(snapshot); 980 snapshot_write_finalize(snapshot);
737 if (!snapshot_image_loaded(snapshot)) 981 if (!snapshot_image_loaded(snapshot))
738 error = -ENODATA; 982 ret = -ENODATA;
739 } else 983 } else
740 printk("\n"); 984 printk("\n");
741 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 985 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
742 return error; 986 return ret;
987}
988
989/**
990 * Structure used for LZO data decompression.
991 */
992struct dec_data {
993 struct task_struct *thr; /* thread */
994 atomic_t ready; /* ready to start flag */
995 atomic_t stop; /* ready to stop flag */
996 int ret; /* return code */
997 wait_queue_head_t go; /* start decompression */
998 wait_queue_head_t done; /* decompression done */
999 size_t unc_len; /* uncompressed length */
1000 size_t cmp_len; /* compressed length */
1001 unsigned char unc[LZO_UNC_SIZE]; /* uncompressed buffer */
1002 unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */
1003};
1004
1005/**
 1006 * Decompression function that runs in its own thread.
1007 */
1008static int lzo_decompress_threadfn(void *data)
1009{
1010 struct dec_data *d = data;
1011
1012 while (1) {
1013 wait_event(d->go, atomic_read(&d->ready) ||
1014 kthread_should_stop());
1015 if (kthread_should_stop()) {
1016 d->thr = NULL;
1017 d->ret = -1;
1018 atomic_set(&d->stop, 1);
1019 wake_up(&d->done);
1020 break;
1021 }
1022 atomic_set(&d->ready, 0);
1023
1024 d->unc_len = LZO_UNC_SIZE;
1025 d->ret = lzo1x_decompress_safe(d->cmp + LZO_HEADER, d->cmp_len,
1026 d->unc, &d->unc_len);
1027 atomic_set(&d->stop, 1);
1028 wake_up(&d->done);
1029 }
1030 return 0;
743} 1031}
744 1032
745/** 1033/**
@@ -753,50 +1041,120 @@ static int load_image_lzo(struct swap_map_handle *handle,
753 unsigned int nr_to_read) 1041 unsigned int nr_to_read)
754{ 1042{
755 unsigned int m; 1043 unsigned int m;
756 int error = 0; 1044 int ret = 0;
1045 int eof = 0;
757 struct bio *bio; 1046 struct bio *bio;
758 struct timeval start; 1047 struct timeval start;
759 struct timeval stop; 1048 struct timeval stop;
760 unsigned nr_pages; 1049 unsigned nr_pages;
761 size_t i, off, unc_len, cmp_len; 1050 size_t off;
762 unsigned char *unc, *cmp, *page[LZO_CMP_PAGES]; 1051 unsigned i, thr, run_threads, nr_threads;
763 1052 unsigned ring = 0, pg = 0, ring_size = 0,
764 for (i = 0; i < LZO_CMP_PAGES; i++) { 1053 have = 0, want, need, asked = 0;
765 page[i] = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); 1054 unsigned long read_pages;
766 if (!page[i]) { 1055 unsigned char **page = NULL;
767 printk(KERN_ERR "PM: Failed to allocate LZO page\n"); 1056 struct dec_data *data = NULL;
1057 struct crc_data *crc = NULL;
1058
1059 /*
1060 * We'll limit the number of threads for decompression to limit memory
1061 * footprint.
1062 */
1063 nr_threads = num_online_cpus() - 1;
1064 nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
1065
1066 page = vmalloc(sizeof(*page) * LZO_READ_PAGES);
1067 if (!page) {
1068 printk(KERN_ERR "PM: Failed to allocate LZO page\n");
1069 ret = -ENOMEM;
1070 goto out_clean;
1071 }
768 1072
769 while (i) 1073 data = vmalloc(sizeof(*data) * nr_threads);
770 free_page((unsigned long)page[--i]); 1074 if (!data) {
1075 printk(KERN_ERR "PM: Failed to allocate LZO data\n");
1076 ret = -ENOMEM;
1077 goto out_clean;
1078 }
1079 for (thr = 0; thr < nr_threads; thr++)
1080 memset(&data[thr], 0, offsetof(struct dec_data, go));
771 1081
772 return -ENOMEM; 1082 crc = kmalloc(sizeof(*crc), GFP_KERNEL);
1083 if (!crc) {
1084 printk(KERN_ERR "PM: Failed to allocate crc\n");
1085 ret = -ENOMEM;
1086 goto out_clean;
1087 }
1088 memset(crc, 0, offsetof(struct crc_data, go));
1089
1090 /*
1091 * Start the decompression threads.
1092 */
1093 for (thr = 0; thr < nr_threads; thr++) {
1094 init_waitqueue_head(&data[thr].go);
1095 init_waitqueue_head(&data[thr].done);
1096
1097 data[thr].thr = kthread_run(lzo_decompress_threadfn,
1098 &data[thr],
1099 "image_decompress/%u", thr);
1100 if (IS_ERR(data[thr].thr)) {
1101 data[thr].thr = NULL;
1102 printk(KERN_ERR
1103 "PM: Cannot start decompression threads\n");
1104 ret = -ENOMEM;
1105 goto out_clean;
773 } 1106 }
774 } 1107 }
775 1108
776 unc = vmalloc(LZO_UNC_SIZE); 1109 /*
777 if (!unc) { 1110 * Start the CRC32 thread.
778 printk(KERN_ERR "PM: Failed to allocate LZO uncompressed\n"); 1111 */
779 1112 init_waitqueue_head(&crc->go);
780 for (i = 0; i < LZO_CMP_PAGES; i++) 1113 init_waitqueue_head(&crc->done);
781 free_page((unsigned long)page[i]); 1114
782 1115 handle->crc32 = 0;
783 return -ENOMEM; 1116 crc->crc32 = &handle->crc32;
1117 for (thr = 0; thr < nr_threads; thr++) {
1118 crc->unc[thr] = data[thr].unc;
1119 crc->unc_len[thr] = &data[thr].unc_len;
784 } 1120 }
785 1121
786 cmp = vmalloc(LZO_CMP_SIZE); 1122 crc->thr = kthread_run(crc32_threadfn, crc, "image_crc32");
787 if (!cmp) { 1123 if (IS_ERR(crc->thr)) {
788 printk(KERN_ERR "PM: Failed to allocate LZO compressed\n"); 1124 crc->thr = NULL;
1125 printk(KERN_ERR "PM: Cannot start CRC32 thread\n");
1126 ret = -ENOMEM;
1127 goto out_clean;
1128 }
789 1129
790 vfree(unc); 1130 /*
791 for (i = 0; i < LZO_CMP_PAGES; i++) 1131 * Adjust number of pages for read buffering, in case we are short.
792 free_page((unsigned long)page[i]); 1132 */
1133 read_pages = (nr_free_pages() - snapshot_get_image_size()) >> 1;
1134 read_pages = clamp_val(read_pages, LZO_CMP_PAGES, LZO_READ_PAGES);
793 1135
794 return -ENOMEM; 1136 for (i = 0; i < read_pages; i++) {
1137 page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ?
1138 __GFP_WAIT | __GFP_HIGH :
1139 __GFP_WAIT);
1140 if (!page[i]) {
1141 if (i < LZO_CMP_PAGES) {
1142 ring_size = i;
1143 printk(KERN_ERR
1144 "PM: Failed to allocate LZO pages\n");
1145 ret = -ENOMEM;
1146 goto out_clean;
1147 } else {
1148 break;
1149 }
1150 }
795 } 1151 }
1152 want = ring_size = i;
796 1153
797 printk(KERN_INFO 1154 printk(KERN_INFO
1155 "PM: Using %u thread(s) for decompression.\n"
798 "PM: Loading and decompressing image data (%u pages) ... ", 1156 "PM: Loading and decompressing image data (%u pages) ... ",
799 nr_to_read); 1157 nr_threads, nr_to_read);
800 m = nr_to_read / 100; 1158 m = nr_to_read / 100;
801 if (!m) 1159 if (!m)
802 m = 1; 1160 m = 1;
@@ -804,85 +1162,189 @@ static int load_image_lzo(struct swap_map_handle *handle,
804 bio = NULL; 1162 bio = NULL;
805 do_gettimeofday(&start); 1163 do_gettimeofday(&start);
806 1164
807 error = snapshot_write_next(snapshot); 1165 ret = snapshot_write_next(snapshot);
808 if (error <= 0) 1166 if (ret <= 0)
809 goto out_finish; 1167 goto out_finish;
810 1168
811 for (;;) { 1169 for(;;) {
812 error = swap_read_page(handle, page[0], NULL); /* sync */ 1170 for (i = 0; !eof && i < want; i++) {
813 if (error) 1171 ret = swap_read_page(handle, page[ring], &bio);
814 break; 1172 if (ret) {
815 1173 /*
816 cmp_len = *(size_t *)page[0]; 1174 * On real read error, finish. On end of data,
817 if (unlikely(!cmp_len || 1175 * set EOF flag and just exit the read loop.
818 cmp_len > lzo1x_worst_compress(LZO_UNC_SIZE))) { 1176 */
819 printk(KERN_ERR "PM: Invalid LZO compressed length\n"); 1177 if (handle->cur &&
820 error = -1; 1178 handle->cur->entries[handle->k]) {
821 break; 1179 goto out_finish;
1180 } else {
1181 eof = 1;
1182 break;
1183 }
1184 }
1185 if (++ring >= ring_size)
1186 ring = 0;
822 } 1187 }
1188 asked += i;
1189 want -= i;
823 1190
824 for (off = PAGE_SIZE, i = 1; 1191 /*
825 off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) { 1192 * We are out of data, wait for some more.
826 error = swap_read_page(handle, page[i], &bio); 1193 */
827 if (error) 1194 if (!have) {
1195 if (!asked)
1196 break;
1197
1198 ret = hib_wait_on_bio_chain(&bio);
1199 if (ret)
828 goto out_finish; 1200 goto out_finish;
1201 have += asked;
1202 asked = 0;
1203 if (eof)
1204 eof = 2;
829 } 1205 }
830 1206
831 error = hib_wait_on_bio_chain(&bio); /* need all data now */ 1207 if (crc->run_threads) {
832 if (error) 1208 wait_event(crc->done, atomic_read(&crc->stop));
833 goto out_finish; 1209 atomic_set(&crc->stop, 0);
834 1210 crc->run_threads = 0;
835 for (off = 0, i = 0;
836 off < LZO_HEADER + cmp_len; off += PAGE_SIZE, i++) {
837 memcpy(cmp + off, page[i], PAGE_SIZE);
838 } 1211 }
839 1212
840 unc_len = LZO_UNC_SIZE; 1213 for (thr = 0; have && thr < nr_threads; thr++) {
841 error = lzo1x_decompress_safe(cmp + LZO_HEADER, cmp_len, 1214 data[thr].cmp_len = *(size_t *)page[pg];
842 unc, &unc_len); 1215 if (unlikely(!data[thr].cmp_len ||
843 if (error < 0) { 1216 data[thr].cmp_len >
844 printk(KERN_ERR "PM: LZO decompression failed\n"); 1217 lzo1x_worst_compress(LZO_UNC_SIZE))) {
845 break; 1218 printk(KERN_ERR
1219 "PM: Invalid LZO compressed length\n");
1220 ret = -1;
1221 goto out_finish;
1222 }
1223
1224 need = DIV_ROUND_UP(data[thr].cmp_len + LZO_HEADER,
1225 PAGE_SIZE);
1226 if (need > have) {
1227 if (eof > 1) {
1228 ret = -1;
1229 goto out_finish;
1230 }
1231 break;
1232 }
1233
1234 for (off = 0;
1235 off < LZO_HEADER + data[thr].cmp_len;
1236 off += PAGE_SIZE) {
1237 memcpy(data[thr].cmp + off,
1238 page[pg], PAGE_SIZE);
1239 have--;
1240 want++;
1241 if (++pg >= ring_size)
1242 pg = 0;
1243 }
1244
1245 atomic_set(&data[thr].ready, 1);
1246 wake_up(&data[thr].go);
846 } 1247 }
847 1248
848 if (unlikely(!unc_len || 1249 /*
849 unc_len > LZO_UNC_SIZE || 1250 * Wait for more data while we are decompressing.
850 unc_len & (PAGE_SIZE - 1))) { 1251 */
851 printk(KERN_ERR "PM: Invalid LZO uncompressed length\n"); 1252 if (have < LZO_CMP_PAGES && asked) {
852 error = -1; 1253 ret = hib_wait_on_bio_chain(&bio);
853 break; 1254 if (ret)
1255 goto out_finish;
1256 have += asked;
1257 asked = 0;
1258 if (eof)
1259 eof = 2;
854 } 1260 }
855 1261
856 for (off = 0; off < unc_len; off += PAGE_SIZE) { 1262 for (run_threads = thr, thr = 0; thr < run_threads; thr++) {
857 memcpy(data_of(*snapshot), unc + off, PAGE_SIZE); 1263 wait_event(data[thr].done,
1264 atomic_read(&data[thr].stop));
1265 atomic_set(&data[thr].stop, 0);
1266
1267 ret = data[thr].ret;
858 1268
859 if (!(nr_pages % m)) 1269 if (ret < 0) {
860 printk("\b\b\b\b%3d%%", nr_pages / m); 1270 printk(KERN_ERR
861 nr_pages++; 1271 "PM: LZO decompression failed\n");
1272 goto out_finish;
1273 }
862 1274
863 error = snapshot_write_next(snapshot); 1275 if (unlikely(!data[thr].unc_len ||
864 if (error <= 0) 1276 data[thr].unc_len > LZO_UNC_SIZE ||
1277 data[thr].unc_len & (PAGE_SIZE - 1))) {
1278 printk(KERN_ERR
1279 "PM: Invalid LZO uncompressed length\n");
1280 ret = -1;
865 goto out_finish; 1281 goto out_finish;
1282 }
1283
1284 for (off = 0;
1285 off < data[thr].unc_len; off += PAGE_SIZE) {
1286 memcpy(data_of(*snapshot),
1287 data[thr].unc + off, PAGE_SIZE);
1288
1289 if (!(nr_pages % m))
1290 printk("\b\b\b\b%3d%%", nr_pages / m);
1291 nr_pages++;
1292
1293 ret = snapshot_write_next(snapshot);
1294 if (ret <= 0) {
1295 crc->run_threads = thr + 1;
1296 atomic_set(&crc->ready, 1);
1297 wake_up(&crc->go);
1298 goto out_finish;
1299 }
1300 }
866 } 1301 }
1302
1303 crc->run_threads = thr;
1304 atomic_set(&crc->ready, 1);
1305 wake_up(&crc->go);
867 } 1306 }
868 1307
869out_finish: 1308out_finish:
1309 if (crc->run_threads) {
1310 wait_event(crc->done, atomic_read(&crc->stop));
1311 atomic_set(&crc->stop, 0);
1312 }
870 do_gettimeofday(&stop); 1313 do_gettimeofday(&stop);
871 if (!error) { 1314 if (!ret) {
872 printk("\b\b\b\bdone\n"); 1315 printk("\b\b\b\bdone\n");
873 snapshot_write_finalize(snapshot); 1316 snapshot_write_finalize(snapshot);
874 if (!snapshot_image_loaded(snapshot)) 1317 if (!snapshot_image_loaded(snapshot))
875 error = -ENODATA; 1318 ret = -ENODATA;
1319 if (!ret) {
1320 if (swsusp_header->flags & SF_CRC32_MODE) {
1321 if(handle->crc32 != swsusp_header->crc32) {
1322 printk(KERN_ERR
1323 "PM: Invalid image CRC32!\n");
1324 ret = -ENODATA;
1325 }
1326 }
1327 }
876 } else 1328 } else
877 printk("\n"); 1329 printk("\n");
878 swsusp_show_speed(&start, &stop, nr_to_read, "Read"); 1330 swsusp_show_speed(&start, &stop, nr_to_read, "Read");
879 1331out_clean:
880 vfree(cmp); 1332 for (i = 0; i < ring_size; i++)
881 vfree(unc);
882 for (i = 0; i < LZO_CMP_PAGES; i++)
883 free_page((unsigned long)page[i]); 1333 free_page((unsigned long)page[i]);
1334 if (crc) {
1335 if (crc->thr)
1336 kthread_stop(crc->thr);
1337 kfree(crc);
1338 }
1339 if (data) {
1340 for (thr = 0; thr < nr_threads; thr++)
1341 if (data[thr].thr)
1342 kthread_stop(data[thr].thr);
1343 vfree(data);
1344 }
1345 if (page) vfree(page);
884 1346
885 return error; 1347 return ret;
886} 1348}
887 1349
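load_image_lzo() above keeps the compressed stream in a ring of up to LZO_READ_PAGES pre-allocated pages: "ring" is where the next page read from swap lands, "pg" is where the next compressed block is copied out, and "have"/"asked"/"want" count pages available, submitted but not yet completed, and still to be requested. A stripped-down model of that bookkeeping, with the I/O and error paths elided and the helpers hypothetical:

/*
 * Skeleton of the read-ahead ring used by load_image_lzo() above.
 * start_async_read() stands in for swap_read_page(); all error
 * handling is omitted.
 */
static unsigned ring, pg, have, want, asked, ring_size;

static void start_async_read(void *page);       /* assumed async I/O */

/* Producer: queue reads for every page we still want. */
static void submit_reads(void **page)
{
        unsigned i;

        for (i = 0; i < want; i++) {
                start_async_read(page[ring]);
                if (++ring >= ring_size)
                        ring = 0;
        }
        asked += i;
        want -= i;
}

/* Called after waiting for the outstanding bio chain to complete. */
static void reads_completed(void)
{
        have += asked;
        asked = 0;
}

/* Consumer: hand out the oldest page and recycle its slot. */
static void *consume_page(void **page)
{
        void *p = page[pg];

        have--;
        want++;
        if (++pg >= ring_size)
                pg = 0;
        return p;
}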
888/** 1350/**
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 42ddbc6f0de..6d8f535c2b8 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -12,6 +12,7 @@
12#include <linux/suspend.h> 12#include <linux/suspend.h>
13#include <linux/syscalls.h> 13#include <linux/syscalls.h>
14#include <linux/reboot.h> 14#include <linux/reboot.h>
15#include <linux/kmod.h>
15#include <linux/string.h> 16#include <linux/string.h>
16#include <linux/device.h> 17#include <linux/device.h>
17#include <linux/miscdevice.h> 18#include <linux/miscdevice.h>
diff --git a/kernel/printk.c b/kernel/printk.c
index 37dff3429ad..1455a0d4eed 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -100,7 +100,7 @@ static int console_locked, console_suspended;
100 * It is also used in interesting ways to provide interlocking in 100 * It is also used in interesting ways to provide interlocking in
101 * console_unlock();. 101 * console_unlock();.
102 */ 102 */
103static DEFINE_SPINLOCK(logbuf_lock); 103static DEFINE_RAW_SPINLOCK(logbuf_lock);
104 104
105#define LOG_BUF_MASK (log_buf_len-1) 105#define LOG_BUF_MASK (log_buf_len-1)
106#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK]) 106#define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
@@ -212,7 +212,7 @@ void __init setup_log_buf(int early)
212 return; 212 return;
213 } 213 }
214 214
215 spin_lock_irqsave(&logbuf_lock, flags); 215 raw_spin_lock_irqsave(&logbuf_lock, flags);
216 log_buf_len = new_log_buf_len; 216 log_buf_len = new_log_buf_len;
217 log_buf = new_log_buf; 217 log_buf = new_log_buf;
218 new_log_buf_len = 0; 218 new_log_buf_len = 0;
@@ -230,7 +230,7 @@ void __init setup_log_buf(int early)
230 log_start -= offset; 230 log_start -= offset;
231 con_start -= offset; 231 con_start -= offset;
232 log_end -= offset; 232 log_end -= offset;
233 spin_unlock_irqrestore(&logbuf_lock, flags); 233 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
234 234
235 pr_info("log_buf_len: %d\n", log_buf_len); 235 pr_info("log_buf_len: %d\n", log_buf_len);
236 pr_info("early log buf free: %d(%d%%)\n", 236 pr_info("early log buf free: %d(%d%%)\n",
@@ -318,8 +318,10 @@ static int check_syslog_permissions(int type, bool from_file)
318 return 0; 318 return 0;
319 /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */ 319 /* For historical reasons, accept CAP_SYS_ADMIN too, with a warning */
320 if (capable(CAP_SYS_ADMIN)) { 320 if (capable(CAP_SYS_ADMIN)) {
321 WARN_ONCE(1, "Attempt to access syslog with CAP_SYS_ADMIN " 321 printk_once(KERN_WARNING "%s (%d): "
322 "but no CAP_SYSLOG (deprecated).\n"); 322 "Attempt to access syslog with CAP_SYS_ADMIN "
323 "but no CAP_SYSLOG (deprecated).\n",
324 current->comm, task_pid_nr(current));
323 return 0; 325 return 0;
324 } 326 }
325 return -EPERM; 327 return -EPERM;
@@ -363,18 +365,18 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
363 if (error) 365 if (error)
364 goto out; 366 goto out;
365 i = 0; 367 i = 0;
366 spin_lock_irq(&logbuf_lock); 368 raw_spin_lock_irq(&logbuf_lock);
367 while (!error && (log_start != log_end) && i < len) { 369 while (!error && (log_start != log_end) && i < len) {
368 c = LOG_BUF(log_start); 370 c = LOG_BUF(log_start);
369 log_start++; 371 log_start++;
370 spin_unlock_irq(&logbuf_lock); 372 raw_spin_unlock_irq(&logbuf_lock);
371 error = __put_user(c,buf); 373 error = __put_user(c,buf);
372 buf++; 374 buf++;
373 i++; 375 i++;
374 cond_resched(); 376 cond_resched();
375 spin_lock_irq(&logbuf_lock); 377 raw_spin_lock_irq(&logbuf_lock);
376 } 378 }
377 spin_unlock_irq(&logbuf_lock); 379 raw_spin_unlock_irq(&logbuf_lock);
378 if (!error) 380 if (!error)
379 error = i; 381 error = i;
380 break; 382 break;
@@ -397,7 +399,7 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
397 count = len; 399 count = len;
398 if (count > log_buf_len) 400 if (count > log_buf_len)
399 count = log_buf_len; 401 count = log_buf_len;
400 spin_lock_irq(&logbuf_lock); 402 raw_spin_lock_irq(&logbuf_lock);
401 if (count > logged_chars) 403 if (count > logged_chars)
402 count = logged_chars; 404 count = logged_chars;
403 if (do_clear) 405 if (do_clear)
@@ -414,12 +416,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
414 if (j + log_buf_len < log_end) 416 if (j + log_buf_len < log_end)
415 break; 417 break;
416 c = LOG_BUF(j); 418 c = LOG_BUF(j);
417 spin_unlock_irq(&logbuf_lock); 419 raw_spin_unlock_irq(&logbuf_lock);
418 error = __put_user(c,&buf[count-1-i]); 420 error = __put_user(c,&buf[count-1-i]);
419 cond_resched(); 421 cond_resched();
420 spin_lock_irq(&logbuf_lock); 422 raw_spin_lock_irq(&logbuf_lock);
421 } 423 }
422 spin_unlock_irq(&logbuf_lock); 424 raw_spin_unlock_irq(&logbuf_lock);
423 if (error) 425 if (error)
424 break; 426 break;
425 error = i; 427 error = i;
@@ -530,6 +532,9 @@ static int __init ignore_loglevel_setup(char *str)
530} 532}
531 533
532early_param("ignore_loglevel", ignore_loglevel_setup); 534early_param("ignore_loglevel", ignore_loglevel_setup);
535module_param_named(ignore_loglevel, ignore_loglevel, bool, S_IRUGO | S_IWUSR);
 536MODULE_PARM_DESC(ignore_loglevel, "ignore loglevel setting, to "
537 "print all kernel messages to the console.");
533 538
534/* 539/*
535 * Write out chars from start to end - 1 inclusive 540 * Write out chars from start to end - 1 inclusive
@@ -590,9 +595,6 @@ static size_t log_prefix(const char *p, unsigned int *level, char *special)
590 /* multi digit including the level and facility number */ 595 /* multi digit including the level and facility number */
591 char *endp = NULL; 596 char *endp = NULL;
592 597
593 if (p[1] < '0' && p[1] > '9')
594 return 0;
595
596 lev = (simple_strtoul(&p[1], &endp, 10) & 7); 598 lev = (simple_strtoul(&p[1], &endp, 10) & 7);
597 if (endp == NULL || endp[0] != '>') 599 if (endp == NULL || endp[0] != '>')
598 return 0; 600 return 0;
@@ -687,7 +689,7 @@ static void zap_locks(void)
687 oops_timestamp = jiffies; 689 oops_timestamp = jiffies;
688 690
689 /* If a crash is occurring, make sure we can't deadlock */ 691 /* If a crash is occurring, make sure we can't deadlock */
690 spin_lock_init(&logbuf_lock); 692 raw_spin_lock_init(&logbuf_lock);
691 /* And make sure that we print immediately */ 693 /* And make sure that we print immediately */
692 sema_init(&console_sem, 1); 694 sema_init(&console_sem, 1);
693} 695}
@@ -800,9 +802,9 @@ static int console_trylock_for_printk(unsigned int cpu)
800 } 802 }
801 } 803 }
802 printk_cpu = UINT_MAX; 804 printk_cpu = UINT_MAX;
803 spin_unlock(&logbuf_lock);
804 if (wake) 805 if (wake)
805 up(&console_sem); 806 up(&console_sem);
807 raw_spin_unlock(&logbuf_lock);
806 return retval; 808 return retval;
807} 809}
808static const char recursion_bug_msg [] = 810static const char recursion_bug_msg [] =
@@ -862,7 +864,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
862 } 864 }
863 865
864 lockdep_off(); 866 lockdep_off();
865 spin_lock(&logbuf_lock); 867 raw_spin_lock(&logbuf_lock);
866 printk_cpu = this_cpu; 868 printk_cpu = this_cpu;
867 869
868 if (recursion_bug) { 870 if (recursion_bug) {
@@ -1106,6 +1108,10 @@ static int __init console_suspend_disable(char *str)
1106 return 1; 1108 return 1;
1107} 1109}
1108__setup("no_console_suspend", console_suspend_disable); 1110__setup("no_console_suspend", console_suspend_disable);
1111module_param_named(console_suspend, console_suspend_enabled,
1112 bool, S_IRUGO | S_IWUSR);
1113MODULE_PARM_DESC(console_suspend, "suspend console during suspend"
1114 " and hibernate operations");
1109 1115
1110/** 1116/**
1111 * suspend_console - suspend the console subsystem 1117 * suspend_console - suspend the console subsystem
@@ -1255,14 +1261,14 @@ void console_unlock(void)
1255 1261
1256again: 1262again:
1257 for ( ; ; ) { 1263 for ( ; ; ) {
1258 spin_lock_irqsave(&logbuf_lock, flags); 1264 raw_spin_lock_irqsave(&logbuf_lock, flags);
1259 wake_klogd |= log_start - log_end; 1265 wake_klogd |= log_start - log_end;
1260 if (con_start == log_end) 1266 if (con_start == log_end)
1261 break; /* Nothing to print */ 1267 break; /* Nothing to print */
1262 _con_start = con_start; 1268 _con_start = con_start;
1263 _log_end = log_end; 1269 _log_end = log_end;
1264 con_start = log_end; /* Flush */ 1270 con_start = log_end; /* Flush */
1265 spin_unlock(&logbuf_lock); 1271 raw_spin_unlock(&logbuf_lock);
1266 stop_critical_timings(); /* don't trace print latency */ 1272 stop_critical_timings(); /* don't trace print latency */
1267 call_console_drivers(_con_start, _log_end); 1273 call_console_drivers(_con_start, _log_end);
1268 start_critical_timings(); 1274 start_critical_timings();
@@ -1274,7 +1280,7 @@ again:
1274 if (unlikely(exclusive_console)) 1280 if (unlikely(exclusive_console))
1275 exclusive_console = NULL; 1281 exclusive_console = NULL;
1276 1282
1277 spin_unlock(&logbuf_lock); 1283 raw_spin_unlock(&logbuf_lock);
1278 1284
1279 up(&console_sem); 1285 up(&console_sem);
1280 1286
@@ -1284,13 +1290,13 @@ again:
1284 * there's a new owner and the console_unlock() from them will do the 1290 * there's a new owner and the console_unlock() from them will do the
1285 * flush, no worries. 1291 * flush, no worries.
1286 */ 1292 */
1287 spin_lock(&logbuf_lock); 1293 raw_spin_lock(&logbuf_lock);
1288 if (con_start != log_end) 1294 if (con_start != log_end)
1289 retry = 1; 1295 retry = 1;
1290 spin_unlock_irqrestore(&logbuf_lock, flags);
1291 if (retry && console_trylock()) 1296 if (retry && console_trylock())
1292 goto again; 1297 goto again;
1293 1298
1299 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1294 if (wake_klogd) 1300 if (wake_klogd)
1295 wake_up_klogd(); 1301 wake_up_klogd();
1296} 1302}
@@ -1520,9 +1526,9 @@ void register_console(struct console *newcon)
1520 * console_unlock(); will print out the buffered messages 1526 * console_unlock(); will print out the buffered messages
1521 * for us. 1527 * for us.
1522 */ 1528 */
1523 spin_lock_irqsave(&logbuf_lock, flags); 1529 raw_spin_lock_irqsave(&logbuf_lock, flags);
1524 con_start = log_start; 1530 con_start = log_start;
1525 spin_unlock_irqrestore(&logbuf_lock, flags); 1531 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1526 /* 1532 /*
1527 * We're about to replay the log buffer. Only do this to the 1533 * We're about to replay the log buffer. Only do this to the
1528 * just-registered console to avoid excessive message spam to 1534 * just-registered console to avoid excessive message spam to
@@ -1602,7 +1608,7 @@ static int __init printk_late_init(void)
1602 struct console *con; 1608 struct console *con;
1603 1609
1604 for_each_console(con) { 1610 for_each_console(con) {
1605 if (con->flags & CON_BOOT) { 1611 if (!keep_bootcon && con->flags & CON_BOOT) {
1606 printk(KERN_INFO "turn off boot console %s%d\n", 1612 printk(KERN_INFO "turn off boot console %s%d\n",
1607 con->name, con->index); 1613 con->name, con->index);
1608 unregister_console(con); 1614 unregister_console(con);
@@ -1729,10 +1735,10 @@ void kmsg_dump(enum kmsg_dump_reason reason)
1729 /* Theoretically, the log could move on after we do this, but 1735 /* Theoretically, the log could move on after we do this, but
1730 there's not a lot we can do about that. The new messages 1736 there's not a lot we can do about that. The new messages
1731 will overwrite the start of what we dump. */ 1737 will overwrite the start of what we dump. */
1732 spin_lock_irqsave(&logbuf_lock, flags); 1738 raw_spin_lock_irqsave(&logbuf_lock, flags);
1733 end = log_end & LOG_BUF_MASK; 1739 end = log_end & LOG_BUF_MASK;
1734 chars = logged_chars; 1740 chars = logged_chars;
1735 spin_unlock_irqrestore(&logbuf_lock, flags); 1741 raw_spin_unlock_irqrestore(&logbuf_lock, flags);
1736 1742
1737 if (chars > end) { 1743 if (chars > end) {
1738 s1 = log_buf + log_buf_len - chars + end; 1744 s1 = log_buf + log_buf_len - chars + end;
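The printk changes above convert logbuf_lock from spinlock_t to raw_spinlock_t, so it keeps spinning (and disabling interrupts where asked) even on configurations where ordinary spinlocks can become sleeping locks; every acquire/release site has to switch to the raw_spin_* API at the same time. The conversion pattern, reduced to a self-contained sketch in which the lock name and critical section are placeholders:

#include <linux/spinlock.h>

/* Sketch only: "buf_lock" is a placeholder, not the real logbuf_lock. */
static DEFINE_RAW_SPINLOCK(buf_lock);

static void touch_shared_buffer(void)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&buf_lock, flags);
        /* ... critical section that must never sleep ... */
        raw_spin_unlock_irqrestore(&buf_lock, flags);
}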
diff --git a/kernel/profile.c b/kernel/profile.c
index 961b389fe52..76b8e77773e 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -13,7 +13,7 @@
13 * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004 13 * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004
14 */ 14 */
15 15
16#include <linux/module.h> 16#include <linux/export.h>
17#include <linux/profile.h> 17#include <linux/profile.h>
18#include <linux/bootmem.h> 18#include <linux/bootmem.h>
19#include <linux/notifier.h> 19#include <linux/notifier.h>
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 9de3ecfd20f..24d04477b25 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -8,7 +8,7 @@
8 */ 8 */
9 9
10#include <linux/capability.h> 10#include <linux/capability.h>
11#include <linux/module.h> 11#include <linux/export.h>
12#include <linux/sched.h> 12#include <linux/sched.h>
13#include <linux/errno.h> 13#include <linux/errno.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
@@ -744,20 +744,17 @@ int ptrace_request(struct task_struct *child, long request,
744 break; 744 break;
745 745
746 si = child->last_siginfo; 746 si = child->last_siginfo;
747 if (unlikely(!si || si->si_code >> 8 != PTRACE_EVENT_STOP)) 747 if (likely(si && (si->si_code >> 8) == PTRACE_EVENT_STOP)) {
748 break; 748 child->jobctl |= JOBCTL_LISTENING;
749 749 /*
750 child->jobctl |= JOBCTL_LISTENING; 750 * If NOTIFY is set, it means event happened between
751 751 * start of this trap and now. Trigger re-trap.
752 /* 752 */
753 * If NOTIFY is set, it means event happened between start 753 if (child->jobctl & JOBCTL_TRAP_NOTIFY)
754 * of this trap and now. Trigger re-trap immediately. 754 signal_wake_up(child, true);
755 */ 755 ret = 0;
756 if (child->jobctl & JOBCTL_TRAP_NOTIFY) 756 }
757 signal_wake_up(child, true);
758
759 unlock_task_sighand(child, &flags); 757 unlock_task_sighand(child, &flags);
760 ret = 0;
761 break; 758 break;
762 759
763 case PTRACE_DETACH: /* detach a process that was attached. */ 760 case PTRACE_DETACH: /* detach a process that was attached. */
diff --git a/kernel/range.c b/kernel/range.c
index 37fa9b99ad5..9b8ae2d6ed6 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Range add and subtract 2 * Range add and subtract
3 */ 3 */
4#include <linux/module.h> 4#include <linux/kernel.h>
5#include <linux/init.h> 5#include <linux/init.h>
6#include <linux/sort.h> 6#include <linux/sort.h>
7 7
diff --git a/kernel/rcu.h b/kernel/rcu.h
new file mode 100644
index 00000000000..f600868d550
--- /dev/null
+++ b/kernel/rcu.h
@@ -0,0 +1,85 @@
1/*
2 * Read-Copy Update definitions shared among RCU implementations.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2011
19 *
20 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
21 */
22
23#ifndef __LINUX_RCU_H
24#define __LINUX_RCU_H
25
26#ifdef CONFIG_RCU_TRACE
27#define RCU_TRACE(stmt) stmt
28#else /* #ifdef CONFIG_RCU_TRACE */
29#define RCU_TRACE(stmt)
30#endif /* #else #ifdef CONFIG_RCU_TRACE */
31
32/*
33 * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally
34 * by call_rcu() and rcu callback execution, and are therefore not part of the
35 * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors.
36 */
37
38#ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
39# define STATE_RCU_HEAD_READY 0
40# define STATE_RCU_HEAD_QUEUED 1
41
42extern struct debug_obj_descr rcuhead_debug_descr;
43
44static inline void debug_rcu_head_queue(struct rcu_head *head)
45{
46 WARN_ON_ONCE((unsigned long)head & 0x3);
47 debug_object_activate(head, &rcuhead_debug_descr);
48 debug_object_active_state(head, &rcuhead_debug_descr,
49 STATE_RCU_HEAD_READY,
50 STATE_RCU_HEAD_QUEUED);
51}
52
53static inline void debug_rcu_head_unqueue(struct rcu_head *head)
54{
55 debug_object_active_state(head, &rcuhead_debug_descr,
56 STATE_RCU_HEAD_QUEUED,
57 STATE_RCU_HEAD_READY);
58 debug_object_deactivate(head, &rcuhead_debug_descr);
59}
60#else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
61static inline void debug_rcu_head_queue(struct rcu_head *head)
62{
63}
64
65static inline void debug_rcu_head_unqueue(struct rcu_head *head)
66{
67}
68#endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */
69
70extern void kfree(const void *);
71
72static inline void __rcu_reclaim(char *rn, struct rcu_head *head)
73{
74 unsigned long offset = (unsigned long)head->func;
75
76 if (__is_kfree_rcu_offset(offset)) {
77 RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset));
78 kfree((void *)head - offset);
79 } else {
80 RCU_TRACE(trace_rcu_invoke_callback(rn, head));
81 head->func(head);
82 }
83}
84
85#endif /* __LINUX_RCU_H */
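__rcu_reclaim() above relies on the kfree_rcu() convention: instead of a real callback, the rcu_head's func field carries the byte offset of the rcu_head inside the enclosing object, which __is_kfree_rcu_offset() recognizes so the object can be kfree()d directly. A usage sketch from the caller's side; struct foo is hypothetical:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
        int data;
        struct rcu_head rcu;    /* reclaimed via the offset path */
};

static void foo_release(struct foo *p)
{
        /*
         * Roughly equivalent to call_rcu(&p->rcu, <free callback>),
         * except that the stored "callback" is offsetof(struct foo, rcu),
         * which __rcu_reclaim() turns back into kfree(p) once the grace
         * period has elapsed.
         */
        kfree_rcu(p, rcu);
}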
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index ddddb320be6..c5b98e565ae 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -43,9 +43,14 @@
43#include <linux/notifier.h> 43#include <linux/notifier.h>
44#include <linux/cpu.h> 44#include <linux/cpu.h>
45#include <linux/mutex.h> 45#include <linux/mutex.h>
46#include <linux/module.h> 46#include <linux/export.h>
47#include <linux/hardirq.h> 47#include <linux/hardirq.h>
48 48
49#define CREATE_TRACE_POINTS
50#include <trace/events/rcu.h>
51
52#include "rcu.h"
53
49#ifdef CONFIG_DEBUG_LOCK_ALLOC 54#ifdef CONFIG_DEBUG_LOCK_ALLOC
50static struct lock_class_key rcu_lock_key; 55static struct lock_class_key rcu_lock_key;
51struct lockdep_map rcu_lock_map = 56struct lockdep_map rcu_lock_map =
@@ -94,11 +99,16 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
94 99
95#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 100#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
96 101
102struct rcu_synchronize {
103 struct rcu_head head;
104 struct completion completion;
105};
106
97/* 107/*
98 * Awaken the corresponding synchronize_rcu() instance now that a 108 * Awaken the corresponding synchronize_rcu() instance now that a
99 * grace period has elapsed. 109 * grace period has elapsed.
100 */ 110 */
101void wakeme_after_rcu(struct rcu_head *head) 111static void wakeme_after_rcu(struct rcu_head *head)
102{ 112{
103 struct rcu_synchronize *rcu; 113 struct rcu_synchronize *rcu;
104 114
@@ -106,6 +116,20 @@ void wakeme_after_rcu(struct rcu_head *head)
106 complete(&rcu->completion); 116 complete(&rcu->completion);
107} 117}
108 118
119void wait_rcu_gp(call_rcu_func_t crf)
120{
121 struct rcu_synchronize rcu;
122
123 init_rcu_head_on_stack(&rcu.head);
124 init_completion(&rcu.completion);
125 /* Will wake me after RCU finished. */
126 crf(&rcu.head, wakeme_after_rcu);
127 /* Wait for it. */
128 wait_for_completion(&rcu.completion);
129 destroy_rcu_head_on_stack(&rcu.head);
130}
131EXPORT_SYMBOL_GPL(wait_rcu_gp);
132
109#ifdef CONFIG_PROVE_RCU 133#ifdef CONFIG_PROVE_RCU
110/* 134/*
111 * wrapper function to avoid #include problems. 135 * wrapper function to avoid #include problems.
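wait_rcu_gp() above factors the open-coded "post a callback that completes a completion, then block on it" idiom out of the individual RCU flavors; any call_rcu()-style primitive can be plugged in. A sketch of how a synchronize_*() wrapper can sit on top of it, assuming wait_rcu_gp() is declared in rcupdate.h as the EXPORT_SYMBOL_GPL above suggests:

#include <linux/rcupdate.h>

/* Hypothetical wrapper; the real flavors follow the same shape. */
static void my_synchronize_rcu_bh(void)
{
        /* Block until an rcu_bh grace period has elapsed. */
        wait_rcu_gp(call_rcu_bh);
}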
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 7bbac7d0f5a..636af6d9c6e 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -22,13 +22,12 @@
22 * For detailed explanation of Read-Copy Update mechanism see - 22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU 23 * Documentation/RCU
24 */ 24 */
25#include <linux/moduleparam.h>
26#include <linux/completion.h> 25#include <linux/completion.h>
27#include <linux/interrupt.h> 26#include <linux/interrupt.h>
28#include <linux/notifier.h> 27#include <linux/notifier.h>
29#include <linux/rcupdate.h> 28#include <linux/rcupdate.h>
30#include <linux/kernel.h> 29#include <linux/kernel.h>
31#include <linux/module.h> 30#include <linux/export.h>
32#include <linux/mutex.h> 31#include <linux/mutex.h>
33#include <linux/sched.h> 32#include <linux/sched.h>
34#include <linux/types.h> 33#include <linux/types.h>
@@ -37,16 +36,17 @@
37#include <linux/cpu.h> 36#include <linux/cpu.h>
38#include <linux/prefetch.h> 37#include <linux/prefetch.h>
39 38
40/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */ 39#ifdef CONFIG_RCU_TRACE
41static struct task_struct *rcu_kthread_task; 40#include <trace/events/rcu.h>
42static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq); 41#endif /* #else #ifdef CONFIG_RCU_TRACE */
43static unsigned long have_rcu_kthread_work; 42
43#include "rcu.h"
44 44
45/* Forward declarations for rcutiny_plugin.h. */ 45/* Forward declarations for rcutiny_plugin.h. */
46struct rcu_ctrlblk; 46struct rcu_ctrlblk;
47static void invoke_rcu_kthread(void); 47static void invoke_rcu_callbacks(void);
48static void rcu_process_callbacks(struct rcu_ctrlblk *rcp); 48static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp);
49static int rcu_kthread(void *arg); 49static void rcu_process_callbacks(struct softirq_action *unused);
50static void __call_rcu(struct rcu_head *head, 50static void __call_rcu(struct rcu_head *head,
51 void (*func)(struct rcu_head *rcu), 51 void (*func)(struct rcu_head *rcu),
52 struct rcu_ctrlblk *rcp); 52 struct rcu_ctrlblk *rcp);
@@ -96,16 +96,6 @@ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
96} 96}
97 97
98/* 98/*
99 * Wake up rcu_kthread() to process callbacks now eligible for invocation
100 * or to boost readers.
101 */
102static void invoke_rcu_kthread(void)
103{
104 have_rcu_kthread_work = 1;
105 wake_up(&rcu_kthread_wq);
106}
107
108/*
109 * Record an rcu quiescent state. And an rcu_bh quiescent state while we 99 * Record an rcu quiescent state. And an rcu_bh quiescent state while we
110 * are at it, given that any rcu quiescent state is also an rcu_bh 100 * are at it, given that any rcu quiescent state is also an rcu_bh
111 * quiescent state. Use "+" instead of "||" to defeat short circuiting. 101 * quiescent state. Use "+" instead of "||" to defeat short circuiting.
@@ -117,7 +107,7 @@ void rcu_sched_qs(int cpu)
117 local_irq_save(flags); 107 local_irq_save(flags);
118 if (rcu_qsctr_help(&rcu_sched_ctrlblk) + 108 if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
119 rcu_qsctr_help(&rcu_bh_ctrlblk)) 109 rcu_qsctr_help(&rcu_bh_ctrlblk))
120 invoke_rcu_kthread(); 110 invoke_rcu_callbacks();
121 local_irq_restore(flags); 111 local_irq_restore(flags);
122} 112}
123 113
@@ -130,7 +120,7 @@ void rcu_bh_qs(int cpu)
130 120
131 local_irq_save(flags); 121 local_irq_save(flags);
132 if (rcu_qsctr_help(&rcu_bh_ctrlblk)) 122 if (rcu_qsctr_help(&rcu_bh_ctrlblk))
133 invoke_rcu_kthread(); 123 invoke_rcu_callbacks();
134 local_irq_restore(flags); 124 local_irq_restore(flags);
135} 125}
136 126
@@ -154,18 +144,23 @@ void rcu_check_callbacks(int cpu, int user)
 154 * Invoke the RCU callbacks on the specified rcu_ctrlblk structure 144 * Invoke the RCU callbacks on the specified rcu_ctrlblk structure
 155 * whose grace period has elapsed. 145 * whose grace period has elapsed.
156 */ 146 */
157static void rcu_process_callbacks(struct rcu_ctrlblk *rcp) 147static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
158{ 148{
149 char *rn = NULL;
159 struct rcu_head *next, *list; 150 struct rcu_head *next, *list;
160 unsigned long flags; 151 unsigned long flags;
161 RCU_TRACE(int cb_count = 0); 152 RCU_TRACE(int cb_count = 0);
162 153
163 /* If no RCU callbacks ready to invoke, just return. */ 154 /* If no RCU callbacks ready to invoke, just return. */
164 if (&rcp->rcucblist == rcp->donetail) 155 if (&rcp->rcucblist == rcp->donetail) {
156 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1));
157 RCU_TRACE(trace_rcu_batch_end(rcp->name, 0));
165 return; 158 return;
159 }
166 160
167 /* Move the ready-to-invoke callbacks to a local list. */ 161 /* Move the ready-to-invoke callbacks to a local list. */
168 local_irq_save(flags); 162 local_irq_save(flags);
163 RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, -1));
169 list = rcp->rcucblist; 164 list = rcp->rcucblist;
170 rcp->rcucblist = *rcp->donetail; 165 rcp->rcucblist = *rcp->donetail;
171 *rcp->donetail = NULL; 166 *rcp->donetail = NULL;
@@ -176,49 +171,26 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
176 local_irq_restore(flags); 171 local_irq_restore(flags);
177 172
178 /* Invoke the callbacks on the local list. */ 173 /* Invoke the callbacks on the local list. */
174 RCU_TRACE(rn = rcp->name);
179 while (list) { 175 while (list) {
180 next = list->next; 176 next = list->next;
181 prefetch(next); 177 prefetch(next);
182 debug_rcu_head_unqueue(list); 178 debug_rcu_head_unqueue(list);
183 local_bh_disable(); 179 local_bh_disable();
184 __rcu_reclaim(list); 180 __rcu_reclaim(rn, list);
185 local_bh_enable(); 181 local_bh_enable();
186 list = next; 182 list = next;
187 RCU_TRACE(cb_count++); 183 RCU_TRACE(cb_count++);
188 } 184 }
189 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); 185 RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count));
186 RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count));
190} 187}
191 188
192/* 189static void rcu_process_callbacks(struct softirq_action *unused)
193 * This kthread invokes RCU callbacks whose grace periods have
194 * elapsed. It is awakened as needed, and takes the place of the
195 * RCU_SOFTIRQ that was used previously for this purpose.
196 * This is a kthread, but it is never stopped, at least not until
197 * the system goes down.
198 */
199static int rcu_kthread(void *arg)
200{ 190{
201 unsigned long work; 191 __rcu_process_callbacks(&rcu_sched_ctrlblk);
202 unsigned long morework; 192 __rcu_process_callbacks(&rcu_bh_ctrlblk);
203 unsigned long flags; 193 rcu_preempt_process_callbacks();
204
205 for (;;) {
206 wait_event_interruptible(rcu_kthread_wq,
207 have_rcu_kthread_work != 0);
208 morework = rcu_boost();
209 local_irq_save(flags);
210 work = have_rcu_kthread_work;
211 have_rcu_kthread_work = morework;
212 local_irq_restore(flags);
213 if (work) {
214 rcu_process_callbacks(&rcu_sched_ctrlblk);
215 rcu_process_callbacks(&rcu_bh_ctrlblk);
216 rcu_preempt_process_callbacks();
217 }
218 schedule_timeout_interruptible(1); /* Leave CPU for others. */
219 }
220
221 return 0; /* Not reached, but needed to shut gcc up. */
222} 194}
223 195
224/* 196/*
@@ -280,45 +252,3 @@ void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
280 __call_rcu(head, func, &rcu_bh_ctrlblk); 252 __call_rcu(head, func, &rcu_bh_ctrlblk);
281} 253}
282EXPORT_SYMBOL_GPL(call_rcu_bh); 254EXPORT_SYMBOL_GPL(call_rcu_bh);
283
284void rcu_barrier_bh(void)
285{
286 struct rcu_synchronize rcu;
287
288 init_rcu_head_on_stack(&rcu.head);
289 init_completion(&rcu.completion);
290 /* Will wake me after RCU finished. */
291 call_rcu_bh(&rcu.head, wakeme_after_rcu);
292 /* Wait for it. */
293 wait_for_completion(&rcu.completion);
294 destroy_rcu_head_on_stack(&rcu.head);
295}
296EXPORT_SYMBOL_GPL(rcu_barrier_bh);
297
298void rcu_barrier_sched(void)
299{
300 struct rcu_synchronize rcu;
301
302 init_rcu_head_on_stack(&rcu.head);
303 init_completion(&rcu.completion);
304 /* Will wake me after RCU finished. */
305 call_rcu_sched(&rcu.head, wakeme_after_rcu);
306 /* Wait for it. */
307 wait_for_completion(&rcu.completion);
308 destroy_rcu_head_on_stack(&rcu.head);
309}
310EXPORT_SYMBOL_GPL(rcu_barrier_sched);
311
312/*
313 * Spawn the kthread that invokes RCU callbacks.
314 */
315static int __init rcu_spawn_kthreads(void)
316{
317 struct sched_param sp;
318
319 rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
320 sp.sched_priority = RCU_BOOST_PRIO;
321 sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
322 return 0;
323}
324early_initcall(rcu_spawn_kthreads);
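With the removal above, TINY_RCU no longer runs a dedicated rcu_kthread unless RCU priority boosting is configured; in the plain case, invoke_rcu_callbacks() simply raises RCU_SOFTIRQ and rcu_init() registers rcu_process_callbacks() as its handler (see rcutiny_plugin.h below). The softirq registration/raise pattern, reduced to a sketch with placeholder names:

#include <linux/interrupt.h>

static void my_process_callbacks(struct softirq_action *unused)
{
        /* ... invoke whatever callbacks became ready ... */
}

static void my_kick(void)
{
        /* Handler runs soon, in softirq context, on this CPU. */
        raise_softirq(RCU_SOFTIRQ);
}

static void my_init(void)
{
        open_softirq(RCU_SOFTIRQ, my_process_callbacks);
}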
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index f259c676195..2b0484a5dc2 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -23,32 +23,30 @@
23 */ 23 */
24 24
25#include <linux/kthread.h> 25#include <linux/kthread.h>
26#include <linux/module.h>
26#include <linux/debugfs.h> 27#include <linux/debugfs.h>
27#include <linux/seq_file.h> 28#include <linux/seq_file.h>
28 29
29#ifdef CONFIG_RCU_TRACE
30#define RCU_TRACE(stmt) stmt
31#else /* #ifdef CONFIG_RCU_TRACE */
32#define RCU_TRACE(stmt)
33#endif /* #else #ifdef CONFIG_RCU_TRACE */
34
35/* Global control variables for rcupdate callback mechanism. */ 30/* Global control variables for rcupdate callback mechanism. */
36struct rcu_ctrlblk { 31struct rcu_ctrlblk {
37 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ 32 struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */
38 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ 33 struct rcu_head **donetail; /* ->next pointer of last "done" CB. */
39 struct rcu_head **curtail; /* ->next pointer of last CB. */ 34 struct rcu_head **curtail; /* ->next pointer of last CB. */
40 RCU_TRACE(long qlen); /* Number of pending CBs. */ 35 RCU_TRACE(long qlen); /* Number of pending CBs. */
36 RCU_TRACE(char *name); /* Name of RCU type. */
41}; 37};
42 38
43/* Definition for rcupdate control block. */ 39/* Definition for rcupdate control block. */
44static struct rcu_ctrlblk rcu_sched_ctrlblk = { 40static struct rcu_ctrlblk rcu_sched_ctrlblk = {
45 .donetail = &rcu_sched_ctrlblk.rcucblist, 41 .donetail = &rcu_sched_ctrlblk.rcucblist,
46 .curtail = &rcu_sched_ctrlblk.rcucblist, 42 .curtail = &rcu_sched_ctrlblk.rcucblist,
43 RCU_TRACE(.name = "rcu_sched")
47}; 44};
48 45
49static struct rcu_ctrlblk rcu_bh_ctrlblk = { 46static struct rcu_ctrlblk rcu_bh_ctrlblk = {
50 .donetail = &rcu_bh_ctrlblk.rcucblist, 47 .donetail = &rcu_bh_ctrlblk.rcucblist,
51 .curtail = &rcu_bh_ctrlblk.rcucblist, 48 .curtail = &rcu_bh_ctrlblk.rcucblist,
49 RCU_TRACE(.name = "rcu_bh")
52}; 50};
53 51
54#ifdef CONFIG_DEBUG_LOCK_ALLOC 52#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -131,6 +129,7 @@ static struct rcu_preempt_ctrlblk rcu_preempt_ctrlblk = {
131 .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist, 129 .rcb.curtail = &rcu_preempt_ctrlblk.rcb.rcucblist,
132 .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist, 130 .nexttail = &rcu_preempt_ctrlblk.rcb.rcucblist,
133 .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks), 131 .blkd_tasks = LIST_HEAD_INIT(rcu_preempt_ctrlblk.blkd_tasks),
132 RCU_TRACE(.rcb.name = "rcu_preempt")
134}; 133};
135 134
136static int rcu_preempted_readers_exp(void); 135static int rcu_preempted_readers_exp(void);
@@ -247,6 +246,13 @@ static void show_tiny_preempt_stats(struct seq_file *m)
247 246
248#include "rtmutex_common.h" 247#include "rtmutex_common.h"
249 248
249#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
250
251/* Controls for rcu_kthread() kthread. */
252static struct task_struct *rcu_kthread_task;
253static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
254static unsigned long have_rcu_kthread_work;
255
250/* 256/*
251 * Carry out RCU priority boosting on the task indicated by ->boost_tasks, 257 * Carry out RCU priority boosting on the task indicated by ->boost_tasks,
252 * and advance ->boost_tasks to the next task in the ->blkd_tasks list. 258 * and advance ->boost_tasks to the next task in the ->blkd_tasks list.
@@ -334,7 +340,7 @@ static int rcu_initiate_boost(void)
334 if (rcu_preempt_ctrlblk.exp_tasks == NULL) 340 if (rcu_preempt_ctrlblk.exp_tasks == NULL)
335 rcu_preempt_ctrlblk.boost_tasks = 341 rcu_preempt_ctrlblk.boost_tasks =
336 rcu_preempt_ctrlblk.gp_tasks; 342 rcu_preempt_ctrlblk.gp_tasks;
337 invoke_rcu_kthread(); 343 invoke_rcu_callbacks();
338 } else 344 } else
339 RCU_TRACE(rcu_initiate_boost_trace()); 345 RCU_TRACE(rcu_initiate_boost_trace());
340 return 1; 346 return 1;
@@ -353,14 +359,6 @@ static void rcu_preempt_boost_start_gp(void)
353#else /* #ifdef CONFIG_RCU_BOOST */ 359#else /* #ifdef CONFIG_RCU_BOOST */
354 360
355/* 361/*
356 * If there is no RCU priority boosting, we don't boost.
357 */
358static int rcu_boost(void)
359{
360 return 0;
361}
362
363/*
364 * If there is no RCU priority boosting, we don't initiate boosting, 362 * If there is no RCU priority boosting, we don't initiate boosting,
365 * but we do indicate whether there are blocked readers blocking the 363 * but we do indicate whether there are blocked readers blocking the
366 * current grace period. 364 * current grace period.
@@ -427,7 +425,7 @@ static void rcu_preempt_cpu_qs(void)
427 425
428 /* If there are done callbacks, cause them to be invoked. */ 426 /* If there are done callbacks, cause them to be invoked. */
429 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL) 427 if (*rcu_preempt_ctrlblk.rcb.donetail != NULL)
430 invoke_rcu_kthread(); 428 invoke_rcu_callbacks();
431} 429}
432 430
433/* 431/*
@@ -648,7 +646,7 @@ static void rcu_preempt_check_callbacks(void)
648 rcu_preempt_cpu_qs(); 646 rcu_preempt_cpu_qs();
649 if (&rcu_preempt_ctrlblk.rcb.rcucblist != 647 if (&rcu_preempt_ctrlblk.rcb.rcucblist !=
650 rcu_preempt_ctrlblk.rcb.donetail) 648 rcu_preempt_ctrlblk.rcb.donetail)
651 invoke_rcu_kthread(); 649 invoke_rcu_callbacks();
652 if (rcu_preempt_gp_in_progress() && 650 if (rcu_preempt_gp_in_progress() &&
653 rcu_cpu_blocking_cur_gp() && 651 rcu_cpu_blocking_cur_gp() &&
654 rcu_preempt_running_reader()) 652 rcu_preempt_running_reader())
@@ -674,7 +672,7 @@ static void rcu_preempt_remove_callbacks(struct rcu_ctrlblk *rcp)
674 */ 672 */
675static void rcu_preempt_process_callbacks(void) 673static void rcu_preempt_process_callbacks(void)
676{ 674{
677 rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb); 675 __rcu_process_callbacks(&rcu_preempt_ctrlblk.rcb);
678} 676}
679 677
680/* 678/*
@@ -697,20 +695,6 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
697} 695}
698EXPORT_SYMBOL_GPL(call_rcu); 696EXPORT_SYMBOL_GPL(call_rcu);
699 697
700void rcu_barrier(void)
701{
702 struct rcu_synchronize rcu;
703
704 init_rcu_head_on_stack(&rcu.head);
705 init_completion(&rcu.completion);
706 /* Will wake me after RCU finished. */
707 call_rcu(&rcu.head, wakeme_after_rcu);
708 /* Wait for it. */
709 wait_for_completion(&rcu.completion);
710 destroy_rcu_head_on_stack(&rcu.head);
711}
712EXPORT_SYMBOL_GPL(rcu_barrier);
713
714/* 698/*
715 * synchronize_rcu - wait until a grace period has elapsed. 699 * synchronize_rcu - wait until a grace period has elapsed.
716 * 700 *
@@ -864,15 +848,6 @@ static void show_tiny_preempt_stats(struct seq_file *m)
864#endif /* #ifdef CONFIG_RCU_TRACE */ 848#endif /* #ifdef CONFIG_RCU_TRACE */
865 849
866/* 850/*
867 * Because preemptible RCU does not exist, it is never necessary to
868 * boost preempted RCU readers.
869 */
870static int rcu_boost(void)
871{
872 return 0;
873}
874
875/*
876 * Because preemptible RCU does not exist, it never has any callbacks 851 * Because preemptible RCU does not exist, it never has any callbacks
877 * to check. 852 * to check.
878 */ 853 */
@@ -898,6 +873,78 @@ static void rcu_preempt_process_callbacks(void)
898 873
899#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */ 874#endif /* #else #ifdef CONFIG_TINY_PREEMPT_RCU */
900 875
876#ifdef CONFIG_RCU_BOOST
877
878/*
879 * Wake up rcu_kthread() to process callbacks now eligible for invocation
880 * or to boost readers.
881 */
882static void invoke_rcu_callbacks(void)
883{
884 have_rcu_kthread_work = 1;
885 wake_up(&rcu_kthread_wq);
886}
887
888/*
889 * This kthread invokes RCU callbacks whose grace periods have
890 * elapsed. It is awakened as needed, and takes the place of the
891 * RCU_SOFTIRQ that is used for this purpose when boosting is disabled.
892 * This is a kthread, but it is never stopped, at least not until
893 * the system goes down.
894 */
895static int rcu_kthread(void *arg)
896{
897 unsigned long work;
898 unsigned long morework;
899 unsigned long flags;
900
901 for (;;) {
902 wait_event_interruptible(rcu_kthread_wq,
903 have_rcu_kthread_work != 0);
904 morework = rcu_boost();
905 local_irq_save(flags);
906 work = have_rcu_kthread_work;
907 have_rcu_kthread_work = morework;
908 local_irq_restore(flags);
909 if (work)
910 rcu_process_callbacks(NULL);
911 schedule_timeout_interruptible(1); /* Leave CPU for others. */
912 }
913
914 return 0; /* Not reached, but needed to shut gcc up. */
915}
916
917/*
918 * Spawn the kthread that invokes RCU callbacks.
919 */
920static int __init rcu_spawn_kthreads(void)
921{
922 struct sched_param sp;
923
924 rcu_kthread_task = kthread_run(rcu_kthread, NULL, "rcu_kthread");
925 sp.sched_priority = RCU_BOOST_PRIO;
926 sched_setscheduler_nocheck(rcu_kthread_task, SCHED_FIFO, &sp);
927 return 0;
928}
929early_initcall(rcu_spawn_kthreads);
930
931#else /* #ifdef CONFIG_RCU_BOOST */
932
933/*
934 * Start up softirq processing of callbacks.
935 */
936void invoke_rcu_callbacks(void)
937{
938 raise_softirq(RCU_SOFTIRQ);
939}
940
941void rcu_init(void)
942{
943 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
944}
945
946#endif /* #else #ifdef CONFIG_RCU_BOOST */
947
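The block above is the heart of this rcutiny change: when CONFIG_RCU_BOOST is set, invoke_rcu_callbacks() records work in have_rcu_kthread_work and wakes rcu_kthread(), which may also boost readers; otherwise the same call simply raises RCU_SOFTIRQ and rcu_process_callbacks() runs from softirq context. The user-space sketch below is only an analogy for that hand-off (a flag consumed by a woken worker, with a direct call as the fallback build); the pthread names are invented for the example and none of this is kernel code.

    /* Analogy only: hand work to a woken worker thread, or just do it directly. */
    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <unistd.h>

    static void process_callbacks(void)
    {
        printf("processing callbacks\n");
    }

    #ifdef USE_WORKER                       /* analogue of CONFIG_RCU_BOOST */

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  cv   = PTHREAD_COND_INITIALIZER;
    static bool have_work;                  /* plays the role of have_rcu_kthread_work */

    static void *worker(void *arg)          /* plays the role of rcu_kthread() */
    {
        (void)arg;
        for (;;) {
            pthread_mutex_lock(&lock);
            while (!have_work)              /* analogue of wait_event_interruptible() */
                pthread_cond_wait(&cv, &lock);
            have_work = false;
            pthread_mutex_unlock(&lock);
            process_callbacks();
        }
        return NULL;                        /* not reached */
    }

    static void invoke_callbacks(void)      /* set the flag and wake the worker */
    {
        pthread_mutex_lock(&lock);
        have_work = true;
        pthread_cond_signal(&cv);
        pthread_mutex_unlock(&lock);
    }

    #else  /* !USE_WORKER */

    static void invoke_callbacks(void)      /* analogue of raise_softirq(RCU_SOFTIRQ) */
    {
        process_callbacks();
    }

    #endif

    int main(void)
    {
    #ifdef USE_WORKER
        pthread_t tid;

        pthread_create(&tid, NULL, worker, NULL);
    #endif
        invoke_callbacks();
        sleep(1);                           /* let the worker run before exiting */
        return 0;
    }

Build with cc -pthread; add -DUSE_WORKER to exercise the kthread-style path.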
901#ifdef CONFIG_DEBUG_LOCK_ALLOC 948#ifdef CONFIG_DEBUG_LOCK_ALLOC
902#include <linux/kernel_stat.h> 949#include <linux/kernel_stat.h>
903 950
@@ -913,12 +960,6 @@ void __init rcu_scheduler_starting(void)
913 960
914#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 961#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
915 962
916#ifdef CONFIG_RCU_BOOST
917#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
918#else /* #ifdef CONFIG_RCU_BOOST */
919#define RCU_BOOST_PRIO 1
920#endif /* #else #ifdef CONFIG_RCU_BOOST */
921
922#ifdef CONFIG_RCU_TRACE 963#ifdef CONFIG_RCU_TRACE
923 964
924#ifdef CONFIG_RCU_BOOST 965#ifdef CONFIG_RCU_BOOST
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 98f51b13bb7..764825c2685 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -73,7 +73,7 @@ module_param(nreaders, int, 0444);
73MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); 73MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
74module_param(nfakewriters, int, 0444); 74module_param(nfakewriters, int, 0444);
75MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); 75MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads");
76module_param(stat_interval, int, 0444); 76module_param(stat_interval, int, 0644);
77MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); 77MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
78module_param(verbose, bool, 0444); 78module_param(verbose, bool, 0444);
79MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); 79MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
@@ -480,30 +480,6 @@ static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
480 call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); 480 call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
481} 481}
482 482
483struct rcu_bh_torture_synchronize {
484 struct rcu_head head;
485 struct completion completion;
486};
487
488static void rcu_bh_torture_wakeme_after_cb(struct rcu_head *head)
489{
490 struct rcu_bh_torture_synchronize *rcu;
491
492 rcu = container_of(head, struct rcu_bh_torture_synchronize, head);
493 complete(&rcu->completion);
494}
495
496static void rcu_bh_torture_synchronize(void)
497{
498 struct rcu_bh_torture_synchronize rcu;
499
500 init_rcu_head_on_stack(&rcu.head);
501 init_completion(&rcu.completion);
502 call_rcu_bh(&rcu.head, rcu_bh_torture_wakeme_after_cb);
503 wait_for_completion(&rcu.completion);
504 destroy_rcu_head_on_stack(&rcu.head);
505}
506
507static struct rcu_torture_ops rcu_bh_ops = { 483static struct rcu_torture_ops rcu_bh_ops = {
508 .init = NULL, 484 .init = NULL,
509 .cleanup = NULL, 485 .cleanup = NULL,
@@ -512,7 +488,7 @@ static struct rcu_torture_ops rcu_bh_ops = {
512 .readunlock = rcu_bh_torture_read_unlock, 488 .readunlock = rcu_bh_torture_read_unlock,
513 .completed = rcu_bh_torture_completed, 489 .completed = rcu_bh_torture_completed,
514 .deferred_free = rcu_bh_torture_deferred_free, 490 .deferred_free = rcu_bh_torture_deferred_free,
515 .sync = rcu_bh_torture_synchronize, 491 .sync = synchronize_rcu_bh,
516 .cb_barrier = rcu_barrier_bh, 492 .cb_barrier = rcu_barrier_bh,
517 .fqs = rcu_bh_force_quiescent_state, 493 .fqs = rcu_bh_force_quiescent_state,
518 .stats = NULL, 494 .stats = NULL,
@@ -528,7 +504,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
528 .readunlock = rcu_bh_torture_read_unlock, 504 .readunlock = rcu_bh_torture_read_unlock,
529 .completed = rcu_bh_torture_completed, 505 .completed = rcu_bh_torture_completed,
530 .deferred_free = rcu_sync_torture_deferred_free, 506 .deferred_free = rcu_sync_torture_deferred_free,
531 .sync = rcu_bh_torture_synchronize, 507 .sync = synchronize_rcu_bh,
532 .cb_barrier = NULL, 508 .cb_barrier = NULL,
533 .fqs = rcu_bh_force_quiescent_state, 509 .fqs = rcu_bh_force_quiescent_state,
534 .stats = NULL, 510 .stats = NULL,
@@ -536,6 +512,22 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
536 .name = "rcu_bh_sync" 512 .name = "rcu_bh_sync"
537}; 513};
538 514
515static struct rcu_torture_ops rcu_bh_expedited_ops = {
516 .init = rcu_sync_torture_init,
517 .cleanup = NULL,
518 .readlock = rcu_bh_torture_read_lock,
519 .read_delay = rcu_read_delay, /* just reuse rcu's version. */
520 .readunlock = rcu_bh_torture_read_unlock,
521 .completed = rcu_bh_torture_completed,
522 .deferred_free = rcu_sync_torture_deferred_free,
523 .sync = synchronize_rcu_bh_expedited,
524 .cb_barrier = NULL,
525 .fqs = rcu_bh_force_quiescent_state,
526 .stats = NULL,
527 .irq_capable = 1,
528 .name = "rcu_bh_expedited"
529};
530
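The new rcu_bh_expedited_ops above gives synchronize_rcu_bh_expedited() the same rcu_torture_ops shape as the existing flavors, so the only other change needed is adding it to the torture_ops[] array near the end of the file. A toy stand-alone version of that ops-table pattern, with names invented for the sketch, just to show why adding a flavor stays this small:

    /* An ops table: each flavor supplies the same hooks; the test iterates over them. */
    #include <stdio.h>

    struct torture_ops {
        void (*sync)(void);
        const char *name;
    };

    static void sync_a(void) { puts("flavor A sync"); }
    static void sync_b(void) { puts("flavor B sync"); }

    static struct torture_ops a_ops = { .sync = sync_a, .name = "flavor_a" };
    static struct torture_ops b_ops = { .sync = sync_b, .name = "flavor_b" };

    int main(void)
    {
        /* Adding a flavor is just adding its ops structure to this array. */
        struct torture_ops *all[] = { &a_ops, &b_ops };

        for (unsigned i = 0; i < sizeof(all) / sizeof(all[0]); i++) {
            printf("testing %s: ", all[i]->name);
            all[i]->sync();
        }
        return 0;
    }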
539/* 531/*
540 * Definitions for srcu torture testing. 532 * Definitions for srcu torture testing.
541 */ 533 */
@@ -659,11 +651,6 @@ static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
659 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); 651 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb);
660} 652}
661 653
662static void sched_torture_synchronize(void)
663{
664 synchronize_sched();
665}
666
667static struct rcu_torture_ops sched_ops = { 654static struct rcu_torture_ops sched_ops = {
668 .init = rcu_sync_torture_init, 655 .init = rcu_sync_torture_init,
669 .cleanup = NULL, 656 .cleanup = NULL,
@@ -672,7 +659,7 @@ static struct rcu_torture_ops sched_ops = {
672 .readunlock = sched_torture_read_unlock, 659 .readunlock = sched_torture_read_unlock,
673 .completed = rcu_no_completed, 660 .completed = rcu_no_completed,
674 .deferred_free = rcu_sched_torture_deferred_free, 661 .deferred_free = rcu_sched_torture_deferred_free,
675 .sync = sched_torture_synchronize, 662 .sync = synchronize_sched,
676 .cb_barrier = rcu_barrier_sched, 663 .cb_barrier = rcu_barrier_sched,
677 .fqs = rcu_sched_force_quiescent_state, 664 .fqs = rcu_sched_force_quiescent_state,
678 .stats = NULL, 665 .stats = NULL,
@@ -688,7 +675,7 @@ static struct rcu_torture_ops sched_sync_ops = {
688 .readunlock = sched_torture_read_unlock, 675 .readunlock = sched_torture_read_unlock,
689 .completed = rcu_no_completed, 676 .completed = rcu_no_completed,
690 .deferred_free = rcu_sync_torture_deferred_free, 677 .deferred_free = rcu_sync_torture_deferred_free,
691 .sync = sched_torture_synchronize, 678 .sync = synchronize_sched,
692 .cb_barrier = NULL, 679 .cb_barrier = NULL,
693 .fqs = rcu_sched_force_quiescent_state, 680 .fqs = rcu_sched_force_quiescent_state,
694 .stats = NULL, 681 .stats = NULL,
@@ -754,7 +741,7 @@ static int rcu_torture_boost(void *arg)
754 do { 741 do {
755 /* Wait for the next test interval. */ 742 /* Wait for the next test interval. */
756 oldstarttime = boost_starttime; 743 oldstarttime = boost_starttime;
757 while (jiffies - oldstarttime > ULONG_MAX / 2) { 744 while (ULONG_CMP_LT(jiffies, oldstarttime)) {
758 schedule_timeout_uninterruptible(1); 745 schedule_timeout_uninterruptible(1);
759 rcu_stutter_wait("rcu_torture_boost"); 746 rcu_stutter_wait("rcu_torture_boost");
760 if (kthread_should_stop() || 747 if (kthread_should_stop() ||
@@ -765,7 +752,7 @@ static int rcu_torture_boost(void *arg)
765 /* Do one boost-test interval. */ 752 /* Do one boost-test interval. */
766 endtime = oldstarttime + test_boost_duration * HZ; 753 endtime = oldstarttime + test_boost_duration * HZ;
767 call_rcu_time = jiffies; 754 call_rcu_time = jiffies;
768 while (jiffies - endtime > ULONG_MAX / 2) { 755 while (ULONG_CMP_LT(jiffies, endtime)) {
769 /* If we don't have a callback in flight, post one. */ 756 /* If we don't have a callback in flight, post one. */
770 if (!rbi.inflight) { 757 if (!rbi.inflight) {
771 smp_mb(); /* RCU core before ->inflight = 1. */ 758 smp_mb(); /* RCU core before ->inflight = 1. */
@@ -792,7 +779,8 @@ static int rcu_torture_boost(void *arg)
792 * interval. Besides, we are running at RT priority, 779 * interval. Besides, we are running at RT priority,
793 * so delays should be relatively rare. 780 * so delays should be relatively rare.
794 */ 781 */
795 while (oldstarttime == boost_starttime) { 782 while (oldstarttime == boost_starttime &&
783 !kthread_should_stop()) {
796 if (mutex_trylock(&boost_mutex)) { 784 if (mutex_trylock(&boost_mutex)) {
797 boost_starttime = jiffies + 785 boost_starttime = jiffies +
798 test_boost_interval * HZ; 786 test_boost_interval * HZ;
@@ -809,11 +797,11 @@ checkwait: rcu_stutter_wait("rcu_torture_boost");
809 797
810 /* Clean up and exit. */ 798 /* Clean up and exit. */
811 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); 799 VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
812 destroy_rcu_head_on_stack(&rbi.rcu);
813 rcutorture_shutdown_absorb("rcu_torture_boost"); 800 rcutorture_shutdown_absorb("rcu_torture_boost");
814 while (!kthread_should_stop() || rbi.inflight) 801 while (!kthread_should_stop() || rbi.inflight)
815 schedule_timeout_uninterruptible(1); 802 schedule_timeout_uninterruptible(1);
816 smp_mb(); /* order accesses to ->inflight before stack-frame death. */ 803 smp_mb(); /* order accesses to ->inflight before stack-frame death. */
804 destroy_rcu_head_on_stack(&rbi.rcu);
817 return 0; 805 return 0;
818} 806}
819 807
@@ -831,11 +819,13 @@ rcu_torture_fqs(void *arg)
831 VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); 819 VERBOSE_PRINTK_STRING("rcu_torture_fqs task started");
832 do { 820 do {
833 fqs_resume_time = jiffies + fqs_stutter * HZ; 821 fqs_resume_time = jiffies + fqs_stutter * HZ;
834 while (jiffies - fqs_resume_time > LONG_MAX) { 822 while (ULONG_CMP_LT(jiffies, fqs_resume_time) &&
823 !kthread_should_stop()) {
835 schedule_timeout_interruptible(1); 824 schedule_timeout_interruptible(1);
836 } 825 }
837 fqs_burst_remaining = fqs_duration; 826 fqs_burst_remaining = fqs_duration;
838 while (fqs_burst_remaining > 0) { 827 while (fqs_burst_remaining > 0 &&
828 !kthread_should_stop()) {
839 cur_ops->fqs(); 829 cur_ops->fqs();
840 udelay(fqs_holdoff); 830 udelay(fqs_holdoff);
841 fqs_burst_remaining -= fqs_holdoff; 831 fqs_burst_remaining -= fqs_holdoff;
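The loops above drop the open-coded `jiffies - oldstarttime > ULONG_MAX / 2` test in favor of ULONG_CMP_LT(), the kernel's wraparound-safe "a is before b" comparison for free-running unsigned counters, and also make the waits bail out when the kthread is asked to stop. The stand-alone sketch below demonstrates why the modular comparison is needed; the CMP_LT macro here is a local stand-in written for the example, not quoted from the kernel headers.

    /* Wraparound-safe ordering for free-running unsigned counters such as jiffies. */
    #include <assert.h>
    #include <limits.h>
    #include <stdio.h>

    /* Local stand-in for ULONG_CMP_LT(): true if a is "before" b, treating the
     * counters as points on a circle of size ULONG_MAX + 1. */
    #define CMP_LT(a, b)    (ULONG_MAX / 2 < (a) - (b))

    int main(void)
    {
        unsigned long now  = ULONG_MAX - 5;   /* counter about to wrap */
        unsigned long then = now + 10;        /* deadline lands after the wrap */

        /* A naive "now < then" is false here (then wrapped around to 4), but the
         * modular comparison still says now is before the deadline. */
        assert(now >= then);                  /* plain compare gives the wrong answer */
        assert(CMP_LT(now, then));            /* modular compare gets it right */

        printf("now=%lu then=%lu: now is still before the deadline\n", now, then);
        return 0;
    }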
@@ -1280,8 +1270,9 @@ static int rcutorture_booster_init(int cpu)
1280 /* Don't allow time recalculation while creating a new task. */ 1270 /* Don't allow time recalculation while creating a new task. */
1281 mutex_lock(&boost_mutex); 1271 mutex_lock(&boost_mutex);
1282 VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); 1272 VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task");
1283 boost_tasks[cpu] = kthread_create(rcu_torture_boost, NULL, 1273 boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL,
1284 "rcu_torture_boost"); 1274 cpu_to_node(cpu),
1275 "rcu_torture_boost");
1285 if (IS_ERR(boost_tasks[cpu])) { 1276 if (IS_ERR(boost_tasks[cpu])) {
1286 retval = PTR_ERR(boost_tasks[cpu]); 1277 retval = PTR_ERR(boost_tasks[cpu]);
1287 VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); 1278 VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed");
@@ -1424,7 +1415,7 @@ rcu_torture_init(void)
1424 int firsterr = 0; 1415 int firsterr = 0;
1425 static struct rcu_torture_ops *torture_ops[] = 1416 static struct rcu_torture_ops *torture_ops[] =
1426 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops, 1417 { &rcu_ops, &rcu_sync_ops, &rcu_expedited_ops,
1427 &rcu_bh_ops, &rcu_bh_sync_ops, 1418 &rcu_bh_ops, &rcu_bh_sync_ops, &rcu_bh_expedited_ops,
1428 &srcu_ops, &srcu_expedited_ops, 1419 &srcu_ops, &srcu_expedited_ops,
1429 &sched_ops, &sched_sync_ops, &sched_expedited_ops, }; 1420 &sched_ops, &sched_sync_ops, &sched_expedited_ops, };
1430 1421
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index ba06207b1dd..6b76d812740 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -38,7 +38,7 @@
38#include <linux/nmi.h> 38#include <linux/nmi.h>
39#include <linux/atomic.h> 39#include <linux/atomic.h>
40#include <linux/bitops.h> 40#include <linux/bitops.h>
41#include <linux/module.h> 41#include <linux/export.h>
42#include <linux/completion.h> 42#include <linux/completion.h>
43#include <linux/moduleparam.h> 43#include <linux/moduleparam.h>
44#include <linux/percpu.h> 44#include <linux/percpu.h>
@@ -52,13 +52,16 @@
52#include <linux/prefetch.h> 52#include <linux/prefetch.h>
53 53
54#include "rcutree.h" 54#include "rcutree.h"
55#include <trace/events/rcu.h>
56
57#include "rcu.h"
55 58
56/* Data structures. */ 59/* Data structures. */
57 60
58static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; 61static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
59 62
60#define RCU_STATE_INITIALIZER(structname) { \ 63#define RCU_STATE_INITIALIZER(structname) { \
61 .level = { &structname.node[0] }, \ 64 .level = { &structname##_state.node[0] }, \
62 .levelcnt = { \ 65 .levelcnt = { \
63 NUM_RCU_LVL_0, /* root of hierarchy. */ \ 66 NUM_RCU_LVL_0, /* root of hierarchy. */ \
64 NUM_RCU_LVL_1, \ 67 NUM_RCU_LVL_1, \
@@ -69,17 +72,17 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS];
69 .signaled = RCU_GP_IDLE, \ 72 .signaled = RCU_GP_IDLE, \
70 .gpnum = -300, \ 73 .gpnum = -300, \
71 .completed = -300, \ 74 .completed = -300, \
72 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname.onofflock), \ 75 .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.onofflock), \
73 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname.fqslock), \ 76 .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&structname##_state.fqslock), \
74 .n_force_qs = 0, \ 77 .n_force_qs = 0, \
75 .n_force_qs_ngp = 0, \ 78 .n_force_qs_ngp = 0, \
76 .name = #structname, \ 79 .name = #structname, \
77} 80}
78 81
79struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched_state); 82struct rcu_state rcu_sched_state = RCU_STATE_INITIALIZER(rcu_sched);
80DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); 83DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
81 84
82struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); 85struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh);
83DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); 86DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
84 87
85static struct rcu_state *rcu_state; 88static struct rcu_state *rcu_state;
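The RCU_STATE_INITIALIZER() rework above pastes "_state" onto the argument for the self-referential fields, so callers now pass the short flavor name ("rcu_sched", "rcu_bh") and #structname yields exactly that short string for the new trace events. A minimal illustration of the pasting-plus-stringification pattern; the struct and field names below are invented for the sketch:

    /* "##" pastes tokens to form the variable name; "#" stringizes the short name. */
    #include <stdio.h>

    struct flavor {
        struct flavor *self;        /* stands in for .level = { &structname##_state.node[0] } */
        const char    *name;        /* stands in for .name = #structname */
    };

    #define FLAVOR_INITIALIZER(n) { \
        .self = &n##_state,         \
        .name = #n,                 \
    }

    struct flavor rcu_sched_state = FLAVOR_INITIALIZER(rcu_sched);
    struct flavor rcu_bh_state    = FLAVOR_INITIALIZER(rcu_bh);

    int main(void)
    {
        /* The trace-friendly names are the short ones, not "..._state". */
        printf("%s %s\n", rcu_sched_state.name, rcu_bh_state.name);
        return 0;
    }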
@@ -128,8 +131,6 @@ static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
128static void invoke_rcu_core(void); 131static void invoke_rcu_core(void);
129static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); 132static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
130 133
131#define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */
132
133/* 134/*
134 * Track the rcutorture test sequence number and the update version 135 * Track the rcutorture test sequence number and the update version
135 * number within a given test. The rcutorture_testseq is incremented 136 * number within a given test. The rcutorture_testseq is incremented
@@ -156,33 +157,41 @@ static int rcu_gp_in_progress(struct rcu_state *rsp)
156 * Note a quiescent state. Because we do not need to know 157 * Note a quiescent state. Because we do not need to know
157 * how many quiescent states passed, just if there was at least 158 * how many quiescent states passed, just if there was at least
158 * one since the start of the grace period, this just sets a flag. 159 * one since the start of the grace period, this just sets a flag.
160 * The caller must have disabled preemption.
159 */ 161 */
160void rcu_sched_qs(int cpu) 162void rcu_sched_qs(int cpu)
161{ 163{
162 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); 164 struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu);
163 165
164 rdp->passed_quiesc_completed = rdp->gpnum - 1; 166 rdp->passed_quiesce_gpnum = rdp->gpnum;
165 barrier(); 167 barrier();
166 rdp->passed_quiesc = 1; 168 if (rdp->passed_quiesce == 0)
169 trace_rcu_grace_period("rcu_sched", rdp->gpnum, "cpuqs");
170 rdp->passed_quiesce = 1;
167} 171}
168 172
169void rcu_bh_qs(int cpu) 173void rcu_bh_qs(int cpu)
170{ 174{
171 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); 175 struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
172 176
173 rdp->passed_quiesc_completed = rdp->gpnum - 1; 177 rdp->passed_quiesce_gpnum = rdp->gpnum;
174 barrier(); 178 barrier();
175 rdp->passed_quiesc = 1; 179 if (rdp->passed_quiesce == 0)
180 trace_rcu_grace_period("rcu_bh", rdp->gpnum, "cpuqs");
181 rdp->passed_quiesce = 1;
176} 182}
177 183
178/* 184/*
179 * Note a context switch. This is a quiescent state for RCU-sched, 185 * Note a context switch. This is a quiescent state for RCU-sched,
180 * and requires special handling for preemptible RCU. 186 * and requires special handling for preemptible RCU.
187 * The caller must have disabled preemption.
181 */ 188 */
182void rcu_note_context_switch(int cpu) 189void rcu_note_context_switch(int cpu)
183{ 190{
191 trace_rcu_utilization("Start context switch");
184 rcu_sched_qs(cpu); 192 rcu_sched_qs(cpu);
185 rcu_preempt_note_context_switch(cpu); 193 rcu_preempt_note_context_switch(cpu);
194 trace_rcu_utilization("End context switch");
186} 195}
187EXPORT_SYMBOL_GPL(rcu_note_context_switch); 196EXPORT_SYMBOL_GPL(rcu_note_context_switch);
188 197
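rcu_sched_qs() and rcu_bh_qs() above record the grace-period number first and only then set passed_quiesce, with barrier() preventing the compiler from reordering the two stores, so any code that later observes the flag can trust the recorded number. The sketch below shows the same publish-after-payload discipline with C11 atomics in user space; it illustrates the ordering idea only and is not a model of the kernel's per-CPU accesses.

    /* Publish a value, then set a flag; consumers that see the flag see the value. */
    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    static unsigned long recorded_gpnum;          /* payload: which GP the QS belongs to */
    static atomic_int    passed_qs;               /* flag: a quiescent state was recorded */

    static void *report_qs(void *arg)
    {
        recorded_gpnum = (unsigned long)(uintptr_t)arg;
        /* Release ordering plays the role of barrier(): the gpnum store cannot be
         * reordered after the flag store. */
        atomic_store_explicit(&passed_qs, 1, memory_order_release);
        return NULL;
    }

    int main(void)
    {
        pthread_t tid;

        pthread_create(&tid, NULL, report_qs, (void *)(uintptr_t)42);

        /* Acquire ordering pairs with the release above. */
        while (!atomic_load_explicit(&passed_qs, memory_order_acquire))
            ;                                     /* spin until the flag is published */
        printf("quiescent state recorded for gp %lu\n", recorded_gpnum);

        pthread_join(tid, NULL);
        return 0;
    }

Build with cc -std=c11 -pthread.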
@@ -193,7 +202,7 @@ DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
193}; 202};
194#endif /* #ifdef CONFIG_NO_HZ */ 203#endif /* #ifdef CONFIG_NO_HZ */
195 204
196static int blimit = 10; /* Maximum callbacks per softirq. */ 205static int blimit = 10; /* Maximum callbacks per rcu_do_batch. */
197static int qhimark = 10000; /* If this many pending, ignore blimit. */ 206static int qhimark = 10000; /* If this many pending, ignore blimit. */
198static int qlowmark = 100; /* Once only this many pending, use blimit. */ 207static int qlowmark = 100; /* Once only this many pending, use blimit. */
199 208
@@ -314,6 +323,7 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
314 * trust its state not to change because interrupts are disabled. 323 * trust its state not to change because interrupts are disabled.
315 */ 324 */
316 if (cpu_is_offline(rdp->cpu)) { 325 if (cpu_is_offline(rdp->cpu)) {
326 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "ofl");
317 rdp->offline_fqs++; 327 rdp->offline_fqs++;
318 return 1; 328 return 1;
319 } 329 }
@@ -354,19 +364,13 @@ void rcu_enter_nohz(void)
354 local_irq_restore(flags); 364 local_irq_restore(flags);
355 return; 365 return;
356 } 366 }
367 trace_rcu_dyntick("Start");
357 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ 368 /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
358 smp_mb__before_atomic_inc(); /* See above. */ 369 smp_mb__before_atomic_inc(); /* See above. */
359 atomic_inc(&rdtp->dynticks); 370 atomic_inc(&rdtp->dynticks);
360 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ 371 smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
361 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); 372 WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
362 local_irq_restore(flags); 373 local_irq_restore(flags);
363
364 /* If the interrupt queued a callback, get out of dyntick mode. */
365 if (in_irq() &&
366 (__get_cpu_var(rcu_sched_data).nxtlist ||
367 __get_cpu_var(rcu_bh_data).nxtlist ||
368 rcu_preempt_needs_cpu(smp_processor_id())))
369 set_need_resched();
370} 374}
371 375
372/* 376/*
@@ -391,6 +395,7 @@ void rcu_exit_nohz(void)
391 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ 395 /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
392 smp_mb__after_atomic_inc(); /* See above. */ 396 smp_mb__after_atomic_inc(); /* See above. */
393 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); 397 WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
398 trace_rcu_dyntick("End");
394 local_irq_restore(flags); 399 local_irq_restore(flags);
395} 400}
396 401
@@ -481,11 +486,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
481 */ 486 */
482static int rcu_implicit_dynticks_qs(struct rcu_data *rdp) 487static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
483{ 488{
484 unsigned long curr; 489 unsigned int curr;
485 unsigned long snap; 490 unsigned int snap;
486 491
487 curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks); 492 curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks);
488 snap = (unsigned long)rdp->dynticks_snap; 493 snap = (unsigned int)rdp->dynticks_snap;
489 494
490 /* 495 /*
491 * If the CPU passed through or entered a dynticks idle phase with 496 * If the CPU passed through or entered a dynticks idle phase with
@@ -495,7 +500,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
495 * read-side critical section that started before the beginning 500 * read-side critical section that started before the beginning
496 * of the current RCU grace period. 501 * of the current RCU grace period.
497 */ 502 */
498 if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) { 503 if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) {
504 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, "dti");
499 rdp->dynticks_fqs++; 505 rdp->dynticks_fqs++;
500 return 1; 506 return 1;
501 } 507 }
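The hunk above narrows the dyntick snapshot comparison to unsigned int but keeps the two-part test: ->dynticks is incremented on every idle entry and exit (the WARN_ON_ONCE() checks in rcu_enter_nohz()/rcu_exit_nohz() above enforce the parity), so an even value means the CPU is idle right now, and a counter that moved by at least two since the snapshot means the CPU passed through idle in the meantime; either way it cannot be inside a read-side critical section that predates the snapshot. A small single-threaded model of that check, with illustrative names:

    /* Model of the dyntick-idle quiescent-state check: the counter is bumped on every
     * idle entry and exit, so "even" means idle now, and "advanced by >= 2" means the
     * CPU went through idle since the snapshot was taken. */
    #include <assert.h>
    #include <limits.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define CMP_GE(a, b)    (UINT_MAX / 2 >= (unsigned int)((a) - (b)))

    static unsigned int dynticks;       /* odd = CPU active, even = CPU idle */

    static void enter_idle(void) { dynticks++; assert((dynticks & 0x1) == 0); }
    static void exit_idle(void)  { dynticks++; assert((dynticks & 0x1) == 1); }

    static bool in_quiescent_state(unsigned int curr, unsigned int snap)
    {
        return (curr & 0x1) == 0 || CMP_GE(curr, snap + 2);
    }

    int main(void)
    {
        exit_idle();                            /* leave boot-time idle: CPU now running */
        unsigned int snap = dynticks;           /* snapshot while the CPU is active */

        assert(!in_quiescent_state(dynticks, snap));  /* nothing has happened yet */

        enter_idle();                           /* even counter: idle right now */
        assert(in_quiescent_state(dynticks, snap));

        exit_idle();                            /* back to work, but it was idle... */
        assert(in_quiescent_state(dynticks, snap));   /* ...so the counter moved by >= 2 */

        printf("quiescent state observed via dyntick counter\n");
        return 0;
    }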
@@ -537,6 +543,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
537 int cpu; 543 int cpu;
538 long delta; 544 long delta;
539 unsigned long flags; 545 unsigned long flags;
546 int ndetected;
540 struct rcu_node *rnp = rcu_get_root(rsp); 547 struct rcu_node *rnp = rcu_get_root(rsp);
541 548
542 /* Only let one CPU complain about others per time interval. */ 549 /* Only let one CPU complain about others per time interval. */
@@ -553,7 +560,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
553 * Now rat on any tasks that got kicked up to the root rcu_node 560 * Now rat on any tasks that got kicked up to the root rcu_node
554 * due to CPU offlining. 561 * due to CPU offlining.
555 */ 562 */
556 rcu_print_task_stall(rnp); 563 ndetected = rcu_print_task_stall(rnp);
557 raw_spin_unlock_irqrestore(&rnp->lock, flags); 564 raw_spin_unlock_irqrestore(&rnp->lock, flags);
558 565
559 /* 566 /*
@@ -565,17 +572,22 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
565 rsp->name); 572 rsp->name);
566 rcu_for_each_leaf_node(rsp, rnp) { 573 rcu_for_each_leaf_node(rsp, rnp) {
567 raw_spin_lock_irqsave(&rnp->lock, flags); 574 raw_spin_lock_irqsave(&rnp->lock, flags);
568 rcu_print_task_stall(rnp); 575 ndetected += rcu_print_task_stall(rnp);
569 raw_spin_unlock_irqrestore(&rnp->lock, flags); 576 raw_spin_unlock_irqrestore(&rnp->lock, flags);
570 if (rnp->qsmask == 0) 577 if (rnp->qsmask == 0)
571 continue; 578 continue;
572 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) 579 for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
573 if (rnp->qsmask & (1UL << cpu)) 580 if (rnp->qsmask & (1UL << cpu)) {
574 printk(" %d", rnp->grplo + cpu); 581 printk(" %d", rnp->grplo + cpu);
582 ndetected++;
583 }
575 } 584 }
576 printk("} (detected by %d, t=%ld jiffies)\n", 585 printk("} (detected by %d, t=%ld jiffies)\n",
577 smp_processor_id(), (long)(jiffies - rsp->gp_start)); 586 smp_processor_id(), (long)(jiffies - rsp->gp_start));
578 trigger_all_cpu_backtrace(); 587 if (ndetected == 0)
588 printk(KERN_ERR "INFO: Stall ended before state dump start\n");
589 else if (!trigger_all_cpu_backtrace())
590 dump_stack();
579 591
580 /* If so configured, complain about tasks blocking the grace period. */ 592 /* If so configured, complain about tasks blocking the grace period. */
581 593
@@ -596,7 +608,8 @@ static void print_cpu_stall(struct rcu_state *rsp)
596 */ 608 */
597 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n", 609 printk(KERN_ERR "INFO: %s detected stall on CPU %d (t=%lu jiffies)\n",
598 rsp->name, smp_processor_id(), jiffies - rsp->gp_start); 610 rsp->name, smp_processor_id(), jiffies - rsp->gp_start);
599 trigger_all_cpu_backtrace(); 611 if (!trigger_all_cpu_backtrace())
612 dump_stack();
600 613
601 raw_spin_lock_irqsave(&rnp->lock, flags); 614 raw_spin_lock_irqsave(&rnp->lock, flags);
602 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) 615 if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall))
@@ -678,9 +691,10 @@ static void __note_new_gpnum(struct rcu_state *rsp, struct rcu_node *rnp, struct
678 * go looking for one. 691 * go looking for one.
679 */ 692 */
680 rdp->gpnum = rnp->gpnum; 693 rdp->gpnum = rnp->gpnum;
694 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpustart");
681 if (rnp->qsmask & rdp->grpmask) { 695 if (rnp->qsmask & rdp->grpmask) {
682 rdp->qs_pending = 1; 696 rdp->qs_pending = 1;
683 rdp->passed_quiesc = 0; 697 rdp->passed_quiesce = 0;
684 } else 698 } else
685 rdp->qs_pending = 0; 699 rdp->qs_pending = 0;
686 } 700 }
@@ -741,6 +755,7 @@ __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_dat
741 755
742 /* Remember that we saw this grace-period completion. */ 756 /* Remember that we saw this grace-period completion. */
743 rdp->completed = rnp->completed; 757 rdp->completed = rnp->completed;
758 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuend");
744 759
745 /* 760 /*
746 * If we were in an extended quiescent state, we may have 761 * If we were in an extended quiescent state, we may have
@@ -826,31 +841,31 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
826 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 841 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
827 struct rcu_node *rnp = rcu_get_root(rsp); 842 struct rcu_node *rnp = rcu_get_root(rsp);
828 843
829 if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { 844 if (!rcu_scheduler_fully_active ||
830 if (cpu_needs_another_gp(rsp, rdp)) 845 !cpu_needs_another_gp(rsp, rdp)) {
831 rsp->fqs_need_gp = 1; 846 /*
832 if (rnp->completed == rsp->completed) { 847 * Either the scheduler hasn't yet spawned the first
833 raw_spin_unlock_irqrestore(&rnp->lock, flags); 848 * non-idle task or this CPU does not need another
834 return; 849 * grace period. Either way, don't start a new grace
835 } 850 * period.
836 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 851 */
852 raw_spin_unlock_irqrestore(&rnp->lock, flags);
853 return;
854 }
837 855
856 if (rsp->fqs_active) {
838 /* 857 /*
839 * Propagate new ->completed value to rcu_node structures 858 * This CPU needs a grace period, but force_quiescent_state()
840 * so that other CPUs don't have to wait until the start 859 * is running. Tell it to start one on this CPU's behalf.
841 * of the next grace period to process their callbacks.
842 */ 860 */
843 rcu_for_each_node_breadth_first(rsp, rnp) { 861 rsp->fqs_need_gp = 1;
844 raw_spin_lock(&rnp->lock); /* irqs already disabled. */ 862 raw_spin_unlock_irqrestore(&rnp->lock, flags);
845 rnp->completed = rsp->completed;
846 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
847 }
848 local_irq_restore(flags);
849 return; 863 return;
850 } 864 }
851 865
852 /* Advance to a new grace period and initialize state. */ 866 /* Advance to a new grace period and initialize state. */
853 rsp->gpnum++; 867 rsp->gpnum++;
868 trace_rcu_grace_period(rsp->name, rsp->gpnum, "start");
854 WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT); 869 WARN_ON_ONCE(rsp->signaled == RCU_GP_INIT);
855 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */ 870 rsp->signaled = RCU_GP_INIT; /* Hold off force_quiescent_state. */
856 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; 871 rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS;
@@ -865,6 +880,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
865 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ 880 rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
866 rcu_start_gp_per_cpu(rsp, rnp, rdp); 881 rcu_start_gp_per_cpu(rsp, rnp, rdp);
867 rcu_preempt_boost_start_gp(rnp); 882 rcu_preempt_boost_start_gp(rnp);
883 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
884 rnp->level, rnp->grplo,
885 rnp->grphi, rnp->qsmask);
868 raw_spin_unlock_irqrestore(&rnp->lock, flags); 886 raw_spin_unlock_irqrestore(&rnp->lock, flags);
869 return; 887 return;
870 } 888 }
@@ -901,6 +919,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
901 if (rnp == rdp->mynode) 919 if (rnp == rdp->mynode)
902 rcu_start_gp_per_cpu(rsp, rnp, rdp); 920 rcu_start_gp_per_cpu(rsp, rnp, rdp);
903 rcu_preempt_boost_start_gp(rnp); 921 rcu_preempt_boost_start_gp(rnp);
922 trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
923 rnp->level, rnp->grplo,
924 rnp->grphi, rnp->qsmask);
904 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 925 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
905 } 926 }
906 927
@@ -922,6 +943,8 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
922 __releases(rcu_get_root(rsp)->lock) 943 __releases(rcu_get_root(rsp)->lock)
923{ 944{
924 unsigned long gp_duration; 945 unsigned long gp_duration;
946 struct rcu_node *rnp = rcu_get_root(rsp);
947 struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
925 948
926 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 949 WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
927 950
@@ -933,7 +956,41 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
933 gp_duration = jiffies - rsp->gp_start; 956 gp_duration = jiffies - rsp->gp_start;
934 if (gp_duration > rsp->gp_max) 957 if (gp_duration > rsp->gp_max)
935 rsp->gp_max = gp_duration; 958 rsp->gp_max = gp_duration;
936 rsp->completed = rsp->gpnum; 959
960 /*
961 * We know the grace period is complete, but to everyone else
962 * it appears to still be ongoing. But it is also the case
963 * that to everyone else it looks like there is nothing that
964 * they can do to advance the grace period. It is therefore
965 * safe for us to drop the lock in order to mark the grace
966 * period as completed in all of the rcu_node structures.
967 *
968 * But if this CPU needs another grace period, it will take
969 * care of this while initializing the next grace period.
970 * We use RCU_WAIT_TAIL instead of the usual RCU_DONE_TAIL
971 * because the callbacks have not yet been advanced: Those
972 * callbacks are waiting on the grace period that just now
973 * completed.
974 */
975 if (*rdp->nxttail[RCU_WAIT_TAIL] == NULL) {
976 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
977
978 /*
979 * Propagate new ->completed value to rcu_node structures
980 * so that other CPUs don't have to wait until the start
981 * of the next grace period to process their callbacks.
982 */
983 rcu_for_each_node_breadth_first(rsp, rnp) {
984 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
985 rnp->completed = rsp->gpnum;
986 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
987 }
988 rnp = rcu_get_root(rsp);
989 raw_spin_lock(&rnp->lock); /* irqs already disabled. */
990 }
991
992 rsp->completed = rsp->gpnum; /* Declare the grace period complete. */
993 trace_rcu_grace_period(rsp->name, rsp->completed, "end");
937 rsp->signaled = RCU_GP_IDLE; 994 rsp->signaled = RCU_GP_IDLE;
938 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */ 995 rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
939} 996}
@@ -962,6 +1019,10 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
962 return; 1019 return;
963 } 1020 }
964 rnp->qsmask &= ~mask; 1021 rnp->qsmask &= ~mask;
1022 trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
1023 mask, rnp->qsmask, rnp->level,
1024 rnp->grplo, rnp->grphi,
1025 !!rnp->gp_tasks);
965 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { 1026 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
966 1027
967 /* Other bits still set at this level, so done. */ 1028 /* Other bits still set at this level, so done. */
@@ -1000,7 +1061,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
1000 * based on quiescent states detected in an earlier grace period! 1061 * based on quiescent states detected in an earlier grace period!
1001 */ 1062 */
1002static void 1063static void
1003rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastcomp) 1064rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long lastgp)
1004{ 1065{
1005 unsigned long flags; 1066 unsigned long flags;
1006 unsigned long mask; 1067 unsigned long mask;
@@ -1008,17 +1069,15 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las
1008 1069
1009 rnp = rdp->mynode; 1070 rnp = rdp->mynode;
1010 raw_spin_lock_irqsave(&rnp->lock, flags); 1071 raw_spin_lock_irqsave(&rnp->lock, flags);
1011 if (lastcomp != rnp->completed) { 1072 if (lastgp != rnp->gpnum || rnp->completed == rnp->gpnum) {
1012 1073
1013 /* 1074 /*
1014 * Someone beat us to it for this grace period, so leave. 1075 * The grace period in which this quiescent state was
1015 * The race with GP start is resolved by the fact that we 1076 * recorded has ended, so don't report it upwards.
1016 * hold the leaf rcu_node lock, so that the per-CPU bits 1077 * We will instead need a new quiescent state that lies
1017 * cannot yet be initialized -- so we would simply find our 1078 * within the current grace period.
1018 * CPU's bit already cleared in rcu_report_qs_rnp() if this
1019 * race occurred.
1020 */ 1079 */
1021 rdp->passed_quiesc = 0; /* try again later! */ 1080 rdp->passed_quiesce = 0; /* need qs for new gp. */
1022 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1081 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1023 return; 1082 return;
1024 } 1083 }
@@ -1062,14 +1121,14 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
1062 * Was there a quiescent state since the beginning of the grace 1121 * Was there a quiescent state since the beginning of the grace
1063 * period? If no, then exit and wait for the next call. 1122 * period? If no, then exit and wait for the next call.
1064 */ 1123 */
1065 if (!rdp->passed_quiesc) 1124 if (!rdp->passed_quiesce)
1066 return; 1125 return;
1067 1126
1068 /* 1127 /*
1069 * Tell RCU we are done (but rcu_report_qs_rdp() will be the 1128 * Tell RCU we are done (but rcu_report_qs_rdp() will be the
1070 * judge of that). 1129 * judge of that).
1071 */ 1130 */
1072 rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesc_completed); 1131 rcu_report_qs_rdp(rdp->cpu, rsp, rdp, rdp->passed_quiesce_gpnum);
1073} 1132}
1074 1133
1075#ifdef CONFIG_HOTPLUG_CPU 1134#ifdef CONFIG_HOTPLUG_CPU
@@ -1130,11 +1189,20 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
1130 if (rnp->qsmaskinit != 0) { 1189 if (rnp->qsmaskinit != 0) {
1131 if (rnp != rdp->mynode) 1190 if (rnp != rdp->mynode)
1132 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1191 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1192 else
1193 trace_rcu_grace_period(rsp->name,
1194 rnp->gpnum + 1 -
1195 !!(rnp->qsmask & mask),
1196 "cpuofl");
1133 break; 1197 break;
1134 } 1198 }
1135 if (rnp == rdp->mynode) 1199 if (rnp == rdp->mynode) {
1200 trace_rcu_grace_period(rsp->name,
1201 rnp->gpnum + 1 -
1202 !!(rnp->qsmask & mask),
1203 "cpuofl");
1136 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); 1204 need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp);
1137 else 1205 } else
1138 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ 1206 raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
1139 mask = rnp->grpmask; 1207 mask = rnp->grpmask;
1140 rnp = rnp->parent; 1208 rnp = rnp->parent;
@@ -1190,17 +1258,22 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1190{ 1258{
1191 unsigned long flags; 1259 unsigned long flags;
1192 struct rcu_head *next, *list, **tail; 1260 struct rcu_head *next, *list, **tail;
1193 int count; 1261 int bl, count;
1194 1262
1195 /* If no callbacks are ready, just return.*/ 1263 /* If no callbacks are ready, just return.*/
1196 if (!cpu_has_callbacks_ready_to_invoke(rdp)) 1264 if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
1265 trace_rcu_batch_start(rsp->name, 0, 0);
1266 trace_rcu_batch_end(rsp->name, 0);
1197 return; 1267 return;
1268 }
1198 1269
1199 /* 1270 /*
1200 * Extract the list of ready callbacks, disabling to prevent 1271 * Extract the list of ready callbacks, disabling to prevent
1201 * races with call_rcu() from interrupt handlers. 1272 * races with call_rcu() from interrupt handlers.
1202 */ 1273 */
1203 local_irq_save(flags); 1274 local_irq_save(flags);
1275 bl = rdp->blimit;
1276 trace_rcu_batch_start(rsp->name, rdp->qlen, bl);
1204 list = rdp->nxtlist; 1277 list = rdp->nxtlist;
1205 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; 1278 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL];
1206 *rdp->nxttail[RCU_DONE_TAIL] = NULL; 1279 *rdp->nxttail[RCU_DONE_TAIL] = NULL;
@@ -1216,13 +1289,14 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1216 next = list->next; 1289 next = list->next;
1217 prefetch(next); 1290 prefetch(next);
1218 debug_rcu_head_unqueue(list); 1291 debug_rcu_head_unqueue(list);
1219 __rcu_reclaim(list); 1292 __rcu_reclaim(rsp->name, list);
1220 list = next; 1293 list = next;
1221 if (++count >= rdp->blimit) 1294 if (++count >= bl)
1222 break; 1295 break;
1223 } 1296 }
1224 1297
1225 local_irq_save(flags); 1298 local_irq_save(flags);
1299 trace_rcu_batch_end(rsp->name, count);
1226 1300
1227 /* Update count, and requeue any remaining callbacks. */ 1301 /* Update count, and requeue any remaining callbacks. */
1228 rdp->qlen -= count; 1302 rdp->qlen -= count;
@@ -1250,7 +1324,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1250 1324
1251 local_irq_restore(flags); 1325 local_irq_restore(flags);
1252 1326
1253 /* Re-raise the RCU softirq if there are callbacks remaining. */ 1327 /* Re-invoke RCU core processing if there are callbacks remaining. */
1254 if (cpu_has_callbacks_ready_to_invoke(rdp)) 1328 if (cpu_has_callbacks_ready_to_invoke(rdp))
1255 invoke_rcu_core(); 1329 invoke_rcu_core();
1256} 1330}
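rcu_do_batch() above now snapshots ->blimit into a local variable and brackets the work with trace_rcu_batch_start()/trace_rcu_batch_end(), but the shape of the loop is unchanged: detach the ready callbacks, invoke at most the batch limit, and requeue the remainder so a flood of callbacks cannot monopolize the CPU. A self-contained sketch of that batching pattern on a plain singly linked list; the types here are invented for the example, and the limit of 10 simply mirrors the blimit default seen earlier in this diff.

    /* Drain at most "blimit" callbacks per pass; requeue the remainder. */
    #include <stdio.h>
    #include <stdlib.h>

    struct cb {
        struct cb *next;
        void (*func)(struct cb *);
    };

    static struct cb *ready_list;            /* callbacks whose grace period has ended */

    static int do_batch(int blimit)
    {
        struct cb *list = ready_list;        /* detach the whole ready list */
        int count = 0;

        ready_list = NULL;
        while (list) {
            struct cb *next = list->next;    /* fetch next before the callback frees us */

            list->func(list);                /* invoke one callback */
            list = next;
            if (++count >= blimit)           /* honor the batch limit */
                break;
        }
        /* Requeue anything we did not get to; a later pass will finish it. */
        if (list) {
            struct cb *tail = list;

            while (tail->next)
                tail = tail->next;
            tail->next = ready_list;
            ready_list = list;
        }
        return count;
    }

    static void free_cb(struct cb *cb) { free(cb); }

    int main(void)
    {
        for (int i = 0; i < 25; i++) {       /* queue 25 dummy callbacks */
            struct cb *cb = malloc(sizeof(*cb));

            cb->func = free_cb;
            cb->next = ready_list;
            ready_list = cb;
        }
        while (ready_list)
            printf("invoked %d callbacks this pass\n", do_batch(10));
        return 0;
    }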
@@ -1258,7 +1332,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1258/* 1332/*
1259 * Check to see if this CPU is in a non-context-switch quiescent state 1333 * Check to see if this CPU is in a non-context-switch quiescent state
1260 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). 1334 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh).
1261 * Also schedule the RCU softirq handler. 1335 * Also schedule RCU core processing.
1262 * 1336 *
1263 * This function must be called with hardirqs disabled. It is normally 1337 * This function must be called with hardirqs disabled. It is normally
1264 * invoked from the scheduling-clock interrupt. If rcu_pending returns 1338 * invoked from the scheduling-clock interrupt. If rcu_pending returns
@@ -1266,6 +1340,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
1266 */ 1340 */
1267void rcu_check_callbacks(int cpu, int user) 1341void rcu_check_callbacks(int cpu, int user)
1268{ 1342{
1343 trace_rcu_utilization("Start scheduler-tick");
1269 if (user || 1344 if (user ||
1270 (idle_cpu(cpu) && rcu_scheduler_active && 1345 (idle_cpu(cpu) && rcu_scheduler_active &&
1271 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) { 1346 !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
@@ -1299,6 +1374,7 @@ void rcu_check_callbacks(int cpu, int user)
1299 rcu_preempt_check_callbacks(cpu); 1374 rcu_preempt_check_callbacks(cpu);
1300 if (rcu_pending(cpu)) 1375 if (rcu_pending(cpu))
1301 invoke_rcu_core(); 1376 invoke_rcu_core();
1377 trace_rcu_utilization("End scheduler-tick");
1302} 1378}
1303 1379
1304#ifdef CONFIG_SMP 1380#ifdef CONFIG_SMP
@@ -1360,10 +1436,14 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1360 unsigned long flags; 1436 unsigned long flags;
1361 struct rcu_node *rnp = rcu_get_root(rsp); 1437 struct rcu_node *rnp = rcu_get_root(rsp);
1362 1438
1363 if (!rcu_gp_in_progress(rsp)) 1439 trace_rcu_utilization("Start fqs");
1440 if (!rcu_gp_in_progress(rsp)) {
1441 trace_rcu_utilization("End fqs");
1364 return; /* No grace period in progress, nothing to force. */ 1442 return; /* No grace period in progress, nothing to force. */
1443 }
1365 if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) { 1444 if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) {
1366 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ 1445 rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */
1446 trace_rcu_utilization("End fqs");
1367 return; /* Someone else is already on the job. */ 1447 return; /* Someone else is already on the job. */
1368 } 1448 }
1369 if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies)) 1449 if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies))
@@ -1412,11 +1492,13 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1412 raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */ 1492 raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */
1413 rsp->fqs_need_gp = 0; 1493 rsp->fqs_need_gp = 0;
1414 rcu_start_gp(rsp, flags); /* releases rnp->lock */ 1494 rcu_start_gp(rsp, flags); /* releases rnp->lock */
1495 trace_rcu_utilization("End fqs");
1415 return; 1496 return;
1416 } 1497 }
1417 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ 1498 raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
1418unlock_fqs_ret: 1499unlock_fqs_ret:
1419 raw_spin_unlock_irqrestore(&rsp->fqslock, flags); 1500 raw_spin_unlock_irqrestore(&rsp->fqslock, flags);
1501 trace_rcu_utilization("End fqs");
1420} 1502}
1421 1503
1422#else /* #ifdef CONFIG_SMP */ 1504#else /* #ifdef CONFIG_SMP */
@@ -1429,9 +1511,9 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
1429#endif /* #else #ifdef CONFIG_SMP */ 1511#endif /* #else #ifdef CONFIG_SMP */
1430 1512
1431/* 1513/*
1432 * This does the RCU processing work from softirq context for the 1514 * This does the RCU core processing work for the specified rcu_state
1433 * specified rcu_state and rcu_data structures. This may be called 1515 * and rcu_data structures. This may be called only from the CPU to
1434 * only from the CPU to whom the rdp belongs. 1516 * whom the rdp belongs.
1435 */ 1517 */
1436static void 1518static void
1437__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) 1519__rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
@@ -1468,24 +1550,24 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1468} 1550}
1469 1551
1470/* 1552/*
1471 * Do softirq processing for the current CPU. 1553 * Do RCU core processing for the current CPU.
1472 */ 1554 */
1473static void rcu_process_callbacks(struct softirq_action *unused) 1555static void rcu_process_callbacks(struct softirq_action *unused)
1474{ 1556{
1557 trace_rcu_utilization("Start RCU core");
1475 __rcu_process_callbacks(&rcu_sched_state, 1558 __rcu_process_callbacks(&rcu_sched_state,
1476 &__get_cpu_var(rcu_sched_data)); 1559 &__get_cpu_var(rcu_sched_data));
1477 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data)); 1560 __rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
1478 rcu_preempt_process_callbacks(); 1561 rcu_preempt_process_callbacks();
1479 1562 trace_rcu_utilization("End RCU core");
1480 /* If we are last CPU on way to dyntick-idle mode, accelerate it. */
1481 rcu_needs_cpu_flush();
1482} 1563}
1483 1564
1484/* 1565/*
1485 * Wake up the current CPU's kthread. This replaces raise_softirq() 1566 * Schedule RCU callback invocation. If the specified type of RCU
1486 * in earlier versions of RCU. Note that because we are running on 1567 * does not support RCU priority boosting, just do a direct call,
1487 * the current CPU with interrupts disabled, the rcu_cpu_kthread_task 1568 * otherwise wake up the per-CPU kernel kthread. Note that because we
1488 * cannot disappear out from under us. 1569 * are running on the current CPU with interrupts disabled, the
1570 * rcu_cpu_kthread_task cannot disappear out from under us.
1489 */ 1571 */
1490static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) 1572static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
1491{ 1573{
@@ -1530,6 +1612,12 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
1530 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 1612 rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
1531 rdp->qlen++; 1613 rdp->qlen++;
1532 1614
1615 if (__is_kfree_rcu_offset((unsigned long)func))
1616 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
1617 rdp->qlen);
1618 else
1619 trace_rcu_callback(rsp->name, head, rdp->qlen);
1620
1533 /* If interrupts were disabled, don't dive into RCU core. */ 1621 /* If interrupts were disabled, don't dive into RCU core. */
1534 if (irqs_disabled_flags(flags)) { 1622 if (irqs_disabled_flags(flags)) {
1535 local_irq_restore(flags); 1623 local_irq_restore(flags);
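The new tracing in __call_rcu() above distinguishes kfree_rcu()-style callbacks by inspecting the "function pointer": kfree_rcu() passes the offset of the rcu_head within its enclosing structure through that slot, and __is_kfree_rcu_offset() treats small values (roughly, anything below a page) as such offsets rather than code addresses. The stand-alone sketch below shows the encoding trick in isolation; the container type, the 4096 threshold, and the reclaim() helper are assumptions made for the example, not the kernel definitions.

    /* Encode "free the enclosing object" as a small offset stored in the callback slot. */
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct rcu_head_model {
        struct rcu_head_model *next;
        void (*func)(struct rcu_head_model *);
    };

    struct widget {
        int payload;
        struct rcu_head_model rh;             /* embedded callback head */
    };

    /* Small values cannot be valid code addresses, so treat them as offsets. */
    static int is_kfree_offset(uintptr_t v) { return v < 4096; }

    static void reclaim(struct rcu_head_model *head)
    {
        uintptr_t f = (uintptr_t)head->func;

        if (is_kfree_offset(f))
            free((char *)head - f);            /* back up to the enclosing object */
        else
            head->func(head);                  /* a real callback: just call it */
    }

    int main(void)
    {
        struct widget *w = malloc(sizeof(*w));

        /* The same trick kfree_rcu() uses: store offsetof(...) as the "function". */
        w->rh.func = (void (*)(struct rcu_head_model *))(uintptr_t)
                     offsetof(struct widget, rh);
        reclaim(&w->rh);                       /* frees the whole widget */
        printf("widget reclaimed via offset-encoded callback\n");
        return 0;
    }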
@@ -1613,18 +1701,9 @@ EXPORT_SYMBOL_GPL(call_rcu_bh);
1613 */ 1701 */
1614void synchronize_sched(void) 1702void synchronize_sched(void)
1615{ 1703{
1616 struct rcu_synchronize rcu;
1617
1618 if (rcu_blocking_is_gp()) 1704 if (rcu_blocking_is_gp())
1619 return; 1705 return;
1620 1706 wait_rcu_gp(call_rcu_sched);
1621 init_rcu_head_on_stack(&rcu.head);
1622 init_completion(&rcu.completion);
1623 /* Will wake me after RCU finished. */
1624 call_rcu_sched(&rcu.head, wakeme_after_rcu);
1625 /* Wait for it. */
1626 wait_for_completion(&rcu.completion);
1627 destroy_rcu_head_on_stack(&rcu.head);
1628} 1707}
1629EXPORT_SYMBOL_GPL(synchronize_sched); 1708EXPORT_SYMBOL_GPL(synchronize_sched);
1630 1709
@@ -1639,18 +1718,9 @@ EXPORT_SYMBOL_GPL(synchronize_sched);
1639 */ 1718 */
1640void synchronize_rcu_bh(void) 1719void synchronize_rcu_bh(void)
1641{ 1720{
1642 struct rcu_synchronize rcu;
1643
1644 if (rcu_blocking_is_gp()) 1721 if (rcu_blocking_is_gp())
1645 return; 1722 return;
1646 1723 wait_rcu_gp(call_rcu_bh);
1647 init_rcu_head_on_stack(&rcu.head);
1648 init_completion(&rcu.completion);
1649 /* Will wake me after RCU finished. */
1650 call_rcu_bh(&rcu.head, wakeme_after_rcu);
1651 /* Wait for it. */
1652 wait_for_completion(&rcu.completion);
1653 destroy_rcu_head_on_stack(&rcu.head);
1654} 1724}
1655EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 1725EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
1656 1726
@@ -1671,7 +1741,8 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1671 check_cpu_stall(rsp, rdp); 1741 check_cpu_stall(rsp, rdp);
1672 1742
1673 /* Is the RCU core waiting for a quiescent state from this CPU? */ 1743 /* Is the RCU core waiting for a quiescent state from this CPU? */
1674 if (rdp->qs_pending && !rdp->passed_quiesc) { 1744 if (rcu_scheduler_fully_active &&
1745 rdp->qs_pending && !rdp->passed_quiesce) {
1675 1746
1676 /* 1747 /*
1677 * If force_quiescent_state() coming soon and this CPU 1748 * If force_quiescent_state() coming soon and this CPU
@@ -1683,7 +1754,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
1683 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1, 1754 ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
1684 jiffies)) 1755 jiffies))
1685 set_need_resched(); 1756 set_need_resched();
1686 } else if (rdp->qs_pending && rdp->passed_quiesc) { 1757 } else if (rdp->qs_pending && rdp->passed_quiesce) {
1687 rdp->n_rp_report_qs++; 1758 rdp->n_rp_report_qs++;
1688 return 1; 1759 return 1;
1689 } 1760 }
@@ -1846,6 +1917,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
1846 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 1917 rdp->dynticks = &per_cpu(rcu_dynticks, cpu);
1847#endif /* #ifdef CONFIG_NO_HZ */ 1918#endif /* #ifdef CONFIG_NO_HZ */
1848 rdp->cpu = cpu; 1919 rdp->cpu = cpu;
1920 rdp->rsp = rsp;
1849 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1921 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1850} 1922}
1851 1923
@@ -1865,8 +1937,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
1865 1937
1866 /* Set up local state, ensuring consistent view of global state. */ 1938 /* Set up local state, ensuring consistent view of global state. */
1867 raw_spin_lock_irqsave(&rnp->lock, flags); 1939 raw_spin_lock_irqsave(&rnp->lock, flags);
1868 rdp->passed_quiesc = 0; /* We could be racing with new GP, */
1869 rdp->qs_pending = 1; /* so set up to respond to current GP. */
1870 rdp->beenonline = 1; /* We have now been online. */ 1940 rdp->beenonline = 1; /* We have now been online. */
1871 rdp->preemptible = preemptible; 1941 rdp->preemptible = preemptible;
1872 rdp->qlen_last_fqs_check = 0; 1942 rdp->qlen_last_fqs_check = 0;
@@ -1891,9 +1961,17 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
1891 rnp->qsmaskinit |= mask; 1961 rnp->qsmaskinit |= mask;
1892 mask = rnp->grpmask; 1962 mask = rnp->grpmask;
1893 if (rnp == rdp->mynode) { 1963 if (rnp == rdp->mynode) {
1894 rdp->gpnum = rnp->completed; /* if GP in progress... */ 1964 /*
1965 * If there is a grace period in progress, we will
1966 * set up to wait for it next time we run the
1967 * RCU core code.
1968 */
1969 rdp->gpnum = rnp->completed;
1895 rdp->completed = rnp->completed; 1970 rdp->completed = rnp->completed;
1896 rdp->passed_quiesc_completed = rnp->completed - 1; 1971 rdp->passed_quiesce = 0;
1972 rdp->qs_pending = 0;
1973 rdp->passed_quiesce_gpnum = rnp->gpnum - 1;
1974 trace_rcu_grace_period(rsp->name, rdp->gpnum, "cpuonl");
1897 } 1975 }
1898 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ 1976 raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
1899 rnp = rnp->parent; 1977 rnp = rnp->parent;
@@ -1919,6 +1997,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1919 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); 1997 struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
1920 struct rcu_node *rnp = rdp->mynode; 1998 struct rcu_node *rnp = rdp->mynode;
1921 1999
2000 trace_rcu_utilization("Start CPU hotplug");
1922 switch (action) { 2001 switch (action) {
1923 case CPU_UP_PREPARE: 2002 case CPU_UP_PREPARE:
1924 case CPU_UP_PREPARE_FROZEN: 2003 case CPU_UP_PREPARE_FROZEN:
@@ -1954,6 +2033,7 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
1954 default: 2033 default:
1955 break; 2034 break;
1956 } 2035 }
2036 trace_rcu_utilization("End CPU hotplug");
1957 return NOTIFY_OK; 2037 return NOTIFY_OK;
1958} 2038}
1959 2039
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 01b2ccda26f..849ce9ec51f 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -230,9 +230,9 @@ struct rcu_data {
230 /* in order to detect GP end. */ 230 /* in order to detect GP end. */
231 unsigned long gpnum; /* Highest gp number that this CPU */ 231 unsigned long gpnum; /* Highest gp number that this CPU */
232 /* is aware of having started. */ 232 /* is aware of having started. */
233 unsigned long passed_quiesc_completed; 233 unsigned long passed_quiesce_gpnum;
234 /* Value of completed at time of qs. */ 234 /* gpnum at time of quiescent state. */
235 bool passed_quiesc; /* User-mode/idle loop etc. */ 235 bool passed_quiesce; /* User-mode/idle loop etc. */
236 bool qs_pending; /* Core waits for quiesc state. */ 236 bool qs_pending; /* Core waits for quiesc state. */
237 bool beenonline; /* CPU online at least once. */ 237 bool beenonline; /* CPU online at least once. */
238 bool preemptible; /* Preemptible RCU? */ 238 bool preemptible; /* Preemptible RCU? */
@@ -299,6 +299,7 @@ struct rcu_data {
299 unsigned long n_rp_need_nothing; 299 unsigned long n_rp_need_nothing;
300 300
301 int cpu; 301 int cpu;
302 struct rcu_state *rsp;
302}; 303};
303 304
304/* Values for signaled field in struct rcu_state. */ 305/* Values for signaled field in struct rcu_state. */
@@ -417,6 +418,13 @@ extern struct rcu_state rcu_preempt_state;
417DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); 418DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
418#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ 419#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
419 420
421#ifdef CONFIG_RCU_BOOST
422DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
423DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu);
424DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
425DECLARE_PER_CPU(char, rcu_cpu_has_work);
426#endif /* #ifdef CONFIG_RCU_BOOST */
427
420#ifndef RCU_TREE_NONCORE 428#ifndef RCU_TREE_NONCORE
421 429
422/* Forward declarations for rcutree_plugin.h */ 430/* Forward declarations for rcutree_plugin.h */
@@ -430,7 +438,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
430static void rcu_stop_cpu_kthread(int cpu); 438static void rcu_stop_cpu_kthread(int cpu);
431#endif /* #ifdef CONFIG_HOTPLUG_CPU */ 439#endif /* #ifdef CONFIG_HOTPLUG_CPU */
432static void rcu_print_detail_task_stall(struct rcu_state *rsp); 440static void rcu_print_detail_task_stall(struct rcu_state *rsp);
433static void rcu_print_task_stall(struct rcu_node *rnp); 441static int rcu_print_task_stall(struct rcu_node *rnp);
434static void rcu_preempt_stall_reset(void); 442static void rcu_preempt_stall_reset(void);
435static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); 443static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
436#ifdef CONFIG_HOTPLUG_CPU 444#ifdef CONFIG_HOTPLUG_CPU
@@ -450,7 +458,6 @@ static int rcu_preempt_needs_cpu(int cpu);
450static void __cpuinit rcu_preempt_init_percpu_data(int cpu); 458static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
451static void rcu_preempt_send_cbs_to_online(void); 459static void rcu_preempt_send_cbs_to_online(void);
452static void __init __rcu_init_preempt(void); 460static void __init __rcu_init_preempt(void);
453static void rcu_needs_cpu_flush(void);
454static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); 461static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
455static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); 462static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
456static void invoke_rcu_callbacks_kthread(void); 463static void invoke_rcu_callbacks_kthread(void);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 8aafbb80b8b..4b9b9f8a418 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -27,6 +27,14 @@
27#include <linux/delay.h> 27#include <linux/delay.h>
28#include <linux/stop_machine.h> 28#include <linux/stop_machine.h>
29 29
30#define RCU_KTHREAD_PRIO 1
31
32#ifdef CONFIG_RCU_BOOST
33#define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO
34#else
35#define RCU_BOOST_PRIO RCU_KTHREAD_PRIO
36#endif
37
30/* 38/*
31 * Check the RCU kernel configuration parameters and print informative 39 * Check the RCU kernel configuration parameters and print informative
32 * messages about anything out of the ordinary. If you like #ifdef, you 40 * messages about anything out of the ordinary. If you like #ifdef, you
@@ -64,7 +72,7 @@ static void __init rcu_bootup_announce_oddness(void)
64 72
65#ifdef CONFIG_TREE_PREEMPT_RCU 73#ifdef CONFIG_TREE_PREEMPT_RCU
66 74
67struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state); 75struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt);
68DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data); 76DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
69static struct rcu_state *rcu_state = &rcu_preempt_state; 77static struct rcu_state *rcu_state = &rcu_preempt_state;
70 78
@@ -122,9 +130,11 @@ static void rcu_preempt_qs(int cpu)
122{ 130{
123 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); 131 struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu);
124 132
125 rdp->passed_quiesc_completed = rdp->gpnum - 1; 133 rdp->passed_quiesce_gpnum = rdp->gpnum;
126 barrier(); 134 barrier();
127 rdp->passed_quiesc = 1; 135 if (rdp->passed_quiesce == 0)
136 trace_rcu_grace_period("rcu_preempt", rdp->gpnum, "cpuqs");
137 rdp->passed_quiesce = 1;
128 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; 138 current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS;
129} 139}
130 140
@@ -190,6 +200,11 @@ static void rcu_preempt_note_context_switch(int cpu)
190 if (rnp->qsmask & rdp->grpmask) 200 if (rnp->qsmask & rdp->grpmask)
191 rnp->gp_tasks = &t->rcu_node_entry; 201 rnp->gp_tasks = &t->rcu_node_entry;
192 } 202 }
203 trace_rcu_preempt_task(rdp->rsp->name,
204 t->pid,
205 (rnp->qsmask & rdp->grpmask)
206 ? rnp->gpnum
207 : rnp->gpnum + 1);
193 raw_spin_unlock_irqrestore(&rnp->lock, flags); 208 raw_spin_unlock_irqrestore(&rnp->lock, flags);
194 } else if (t->rcu_read_lock_nesting < 0 && 209 } else if (t->rcu_read_lock_nesting < 0 &&
195 t->rcu_read_unlock_special) { 210 t->rcu_read_unlock_special) {
@@ -299,6 +314,9 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
299 int empty_exp; 314 int empty_exp;
300 unsigned long flags; 315 unsigned long flags;
301 struct list_head *np; 316 struct list_head *np;
317#ifdef CONFIG_RCU_BOOST
318 struct rt_mutex *rbmp = NULL;
319#endif /* #ifdef CONFIG_RCU_BOOST */
302 struct rcu_node *rnp; 320 struct rcu_node *rnp;
303 int special; 321 int special;
304 322
@@ -344,6 +362,9 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
344 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ 362 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
345 np = rcu_next_node_entry(t, rnp); 363 np = rcu_next_node_entry(t, rnp);
346 list_del_init(&t->rcu_node_entry); 364 list_del_init(&t->rcu_node_entry);
365 t->rcu_blocked_node = NULL;
366 trace_rcu_unlock_preempted_task("rcu_preempt",
367 rnp->gpnum, t->pid);
347 if (&t->rcu_node_entry == rnp->gp_tasks) 368 if (&t->rcu_node_entry == rnp->gp_tasks)
348 rnp->gp_tasks = np; 369 rnp->gp_tasks = np;
349 if (&t->rcu_node_entry == rnp->exp_tasks) 370 if (&t->rcu_node_entry == rnp->exp_tasks)
@@ -351,30 +372,34 @@ static noinline void rcu_read_unlock_special(struct task_struct *t)
351#ifdef CONFIG_RCU_BOOST 372#ifdef CONFIG_RCU_BOOST
352 if (&t->rcu_node_entry == rnp->boost_tasks) 373 if (&t->rcu_node_entry == rnp->boost_tasks)
353 rnp->boost_tasks = np; 374 rnp->boost_tasks = np;
354 /* Snapshot and clear ->rcu_boosted with rcu_node lock held. */ 375 /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */
355 if (t->rcu_boosted) { 376 if (t->rcu_boost_mutex) {
356 special |= RCU_READ_UNLOCK_BOOSTED; 377 rbmp = t->rcu_boost_mutex;
357 t->rcu_boosted = 0; 378 t->rcu_boost_mutex = NULL;
358 } 379 }
359#endif /* #ifdef CONFIG_RCU_BOOST */ 380#endif /* #ifdef CONFIG_RCU_BOOST */
360 t->rcu_blocked_node = NULL;
361 381
362 /* 382 /*
363 * If this was the last task on the current list, and if 383 * If this was the last task on the current list, and if
364 * we aren't waiting on any CPUs, report the quiescent state. 384 * we aren't waiting on any CPUs, report the quiescent state.
365 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. 385 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock.
366 */ 386 */
367 if (empty) 387 if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
368 raw_spin_unlock_irqrestore(&rnp->lock, flags); 388 trace_rcu_quiescent_state_report("preempt_rcu",
369 else 389 rnp->gpnum,
390 0, rnp->qsmask,
391 rnp->level,
392 rnp->grplo,
393 rnp->grphi,
394 !!rnp->gp_tasks);
370 rcu_report_unblock_qs_rnp(rnp, flags); 395 rcu_report_unblock_qs_rnp(rnp, flags);
396 } else
397 raw_spin_unlock_irqrestore(&rnp->lock, flags);
371 398
372#ifdef CONFIG_RCU_BOOST 399#ifdef CONFIG_RCU_BOOST
373 /* Unboost if we were boosted. */ 400 /* Unboost if we were boosted. */
374 if (special & RCU_READ_UNLOCK_BOOSTED) { 401 if (rbmp)
375 rt_mutex_unlock(t->rcu_boost_mutex); 402 rt_mutex_unlock(rbmp);
376 t->rcu_boost_mutex = NULL;
377 }
378#endif /* #ifdef CONFIG_RCU_BOOST */ 403#endif /* #ifdef CONFIG_RCU_BOOST */
379 404
380 /* 405 /*
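
Note: the rows above interleave the old and new version of this region, which makes the control flow hard to follow. Reconstructed from the new-side lines (whitespace approximated), the tail of rcu_read_unlock_special() reads as follows once the hunk is applied:

#ifdef CONFIG_RCU_BOOST
	if (&t->rcu_node_entry == rnp->boost_tasks)
		rnp->boost_tasks = np;
	/* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */
	if (t->rcu_boost_mutex) {
		rbmp = t->rcu_boost_mutex;
		t->rcu_boost_mutex = NULL;
	}
#endif /* #ifdef CONFIG_RCU_BOOST */

	/*
	 * If this was the last task on the current list, and if
	 * we aren't waiting on any CPUs, report the quiescent state.
	 * Note that rcu_report_unblock_qs_rnp() releases rnp->lock.
	 */
	if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) {
		trace_rcu_quiescent_state_report("preempt_rcu",
						 rnp->gpnum,
						 0, rnp->qsmask,
						 rnp->level,
						 rnp->grplo,
						 rnp->grphi,
						 !!rnp->gp_tasks);
		rcu_report_unblock_qs_rnp(rnp, flags);
	} else
		raw_spin_unlock_irqrestore(&rnp->lock, flags);

#ifdef CONFIG_RCU_BOOST
	/* Unboost if we were boosted. */
	if (rbmp)
		rt_mutex_unlock(rbmp);
#endif /* #ifdef CONFIG_RCU_BOOST */

Net effect: the per-task RCU_READ_UNLOCK_BOOSTED flag is gone; the boost mutex pointer is snapshotted and cleared while rnp->lock is held, and the actual rt_mutex_unlock() (the deboost) happens only after that lock has been dropped.
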
@@ -399,10 +424,10 @@ void __rcu_read_unlock(void)
399{ 424{
400 struct task_struct *t = current; 425 struct task_struct *t = current;
401 426
402 barrier(); /* needed if we ever invoke rcu_read_unlock in rcutree.c */
403 if (t->rcu_read_lock_nesting != 1) 427 if (t->rcu_read_lock_nesting != 1)
404 --t->rcu_read_lock_nesting; 428 --t->rcu_read_lock_nesting;
405 else { 429 else {
430 barrier(); /* critical section before exit code. */
406 t->rcu_read_lock_nesting = INT_MIN; 431 t->rcu_read_lock_nesting = INT_MIN;
407 barrier(); /* assign before ->rcu_read_unlock_special load */ 432 barrier(); /* assign before ->rcu_read_unlock_special load */
408 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) 433 if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special)))
@@ -466,16 +491,20 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
466 * Scan the current list of tasks blocked within RCU read-side critical 491 * Scan the current list of tasks blocked within RCU read-side critical
467 * sections, printing out the tid of each. 492 * sections, printing out the tid of each.
468 */ 493 */
469static void rcu_print_task_stall(struct rcu_node *rnp) 494static int rcu_print_task_stall(struct rcu_node *rnp)
470{ 495{
471 struct task_struct *t; 496 struct task_struct *t;
497 int ndetected = 0;
472 498
473 if (!rcu_preempt_blocked_readers_cgp(rnp)) 499 if (!rcu_preempt_blocked_readers_cgp(rnp))
474 return; 500 return 0;
475 t = list_entry(rnp->gp_tasks, 501 t = list_entry(rnp->gp_tasks,
476 struct task_struct, rcu_node_entry); 502 struct task_struct, rcu_node_entry);
477 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) 503 list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) {
478 printk(" P%d", t->pid); 504 printk(" P%d", t->pid);
505 ndetected++;
506 }
507 return ndetected;
479} 508}
480 509
481/* 510/*
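
Note: the signature change (void to int, returning ndetected) lets the stall-warning code in kernel/rcutree.c, which this series also touches, tell whether any blocked task was actually found. A plausible caller-side aggregation, with an illustrative message, would look like:

	struct rcu_node *rnp;
	int ndetected = 0;

	rcu_for_each_leaf_node(rsp, rnp)
		ndetected += rcu_print_task_stall(rnp);

	if (ndetected == 0)
		printk(KERN_ERR "INFO: no stalled tasks found on this pass\n");
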
@@ -656,18 +685,9 @@ EXPORT_SYMBOL_GPL(call_rcu);
656 */ 685 */
657void synchronize_rcu(void) 686void synchronize_rcu(void)
658{ 687{
659 struct rcu_synchronize rcu;
660
661 if (!rcu_scheduler_active) 688 if (!rcu_scheduler_active)
662 return; 689 return;
663 690 wait_rcu_gp(call_rcu);
664 init_rcu_head_on_stack(&rcu.head);
665 init_completion(&rcu.completion);
666 /* Will wake me after RCU finished. */
667 call_rcu(&rcu.head, wakeme_after_rcu);
668 /* Wait for it. */
669 wait_for_completion(&rcu.completion);
670 destroy_rcu_head_on_stack(&rcu.head);
671} 691}
672EXPORT_SYMBOL_GPL(synchronize_rcu); 692EXPORT_SYMBOL_GPL(synchronize_rcu);
673 693
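
Note: the deleted lines are the familiar open-coded "block until one grace period elapses" idiom; synchronize_rcu() now delegates to a shared wait_rcu_gp() helper, presumably factored into kernel/rcupdate.c (which this series also modifies). For reference, the pattern being factored out is:

	struct rcu_synchronize rcu;

	init_rcu_head_on_stack(&rcu.head);
	init_completion(&rcu.completion);
	call_rcu(&rcu.head, wakeme_after_rcu);	/* wake us after a grace period */
	wait_for_completion(&rcu.completion);	/* block until the callback has run */
	destroy_rcu_head_on_stack(&rcu.head);
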
@@ -968,8 +988,9 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
968 * Because preemptible RCU does not exist, we never have to check for 988 * Because preemptible RCU does not exist, we never have to check for
969 * tasks blocked within RCU read-side critical sections. 989 * tasks blocked within RCU read-side critical sections.
970 */ 990 */
971static void rcu_print_task_stall(struct rcu_node *rnp) 991static int rcu_print_task_stall(struct rcu_node *rnp)
972{ 992{
993 return 0;
973} 994}
974 995
975/* 996/*
@@ -1136,6 +1157,8 @@ static void rcu_initiate_boost_trace(struct rcu_node *rnp)
1136 1157
1137#endif /* #else #ifdef CONFIG_RCU_TRACE */ 1158#endif /* #else #ifdef CONFIG_RCU_TRACE */
1138 1159
1160static struct lock_class_key rcu_boost_class;
1161
1139/* 1162/*
1140 * Carry out RCU priority boosting on the task indicated by ->exp_tasks 1163 * Carry out RCU priority boosting on the task indicated by ->exp_tasks
1141 * or ->boost_tasks, advancing the pointer to the next task in the 1164 * or ->boost_tasks, advancing the pointer to the next task in the
@@ -1198,8 +1221,10 @@ static int rcu_boost(struct rcu_node *rnp)
1198 */ 1221 */
1199 t = container_of(tb, struct task_struct, rcu_node_entry); 1222 t = container_of(tb, struct task_struct, rcu_node_entry);
1200 rt_mutex_init_proxy_locked(&mtx, t); 1223 rt_mutex_init_proxy_locked(&mtx, t);
1224 /* Avoid lockdep false positives. This rt_mutex is its own thing. */
1225 lockdep_set_class_and_name(&mtx.wait_lock, &rcu_boost_class,
1226 "rcu_boost_mutex");
1201 t->rcu_boost_mutex = &mtx; 1227 t->rcu_boost_mutex = &mtx;
1202 t->rcu_boosted = 1;
1203 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1228 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1204 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ 1229 rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
1205 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ 1230 rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
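
Note: the boost rt_mutex lives on the kthread's stack and is locked/unlocked in a pattern unlike every other rt_mutex, so its internal wait_lock gets a dedicated lockdep class and name to avoid false positives (lockdep generally wants the lock_class_key itself in static storage). A minimal sketch of the same pattern for any specially-used lock; my_key, owner_task and "my_special_lock" are illustrative names:

	static struct lock_class_key my_key;	/* static: one class per use site */
	struct rt_mutex mtx;

	rt_mutex_init_proxy_locked(&mtx, owner_task);
	lockdep_set_class_and_name(&mtx.wait_lock, &my_key, "my_special_lock");
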
@@ -1228,9 +1253,12 @@ static int rcu_boost_kthread(void *arg)
1228 int spincnt = 0; 1253 int spincnt = 0;
1229 int more2boost; 1254 int more2boost;
1230 1255
1256 trace_rcu_utilization("Start boost kthread@init");
1231 for (;;) { 1257 for (;;) {
1232 rnp->boost_kthread_status = RCU_KTHREAD_WAITING; 1258 rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
1259 trace_rcu_utilization("End boost kthread@rcu_wait");
1233 rcu_wait(rnp->boost_tasks || rnp->exp_tasks); 1260 rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
1261 trace_rcu_utilization("Start boost kthread@rcu_wait");
1234 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; 1262 rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
1235 more2boost = rcu_boost(rnp); 1263 more2boost = rcu_boost(rnp);
1236 if (more2boost) 1264 if (more2boost)
@@ -1238,11 +1266,14 @@ static int rcu_boost_kthread(void *arg)
1238 else 1266 else
1239 spincnt = 0; 1267 spincnt = 0;
1240 if (spincnt > 10) { 1268 if (spincnt > 10) {
1269 trace_rcu_utilization("End boost kthread@rcu_yield");
1241 rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp); 1270 rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp);
1271 trace_rcu_utilization("Start boost kthread@rcu_yield");
1242 spincnt = 0; 1272 spincnt = 0;
1243 } 1273 }
1244 } 1274 }
1245 /* NOTREACHED */ 1275 /* NOTREACHED */
1276 trace_rcu_utilization("End boost kthread@notreached");
1246 return 0; 1277 return 0;
1247} 1278}
1248 1279
@@ -1291,11 +1322,9 @@ static void invoke_rcu_callbacks_kthread(void)
1291 1322
1292 local_irq_save(flags); 1323 local_irq_save(flags);
1293 __this_cpu_write(rcu_cpu_has_work, 1); 1324 __this_cpu_write(rcu_cpu_has_work, 1);
1294 if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) { 1325 if (__this_cpu_read(rcu_cpu_kthread_task) != NULL &&
1295 local_irq_restore(flags); 1326 current != __this_cpu_read(rcu_cpu_kthread_task))
1296 return; 1327 wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
1297 }
1298 wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
1299 local_irq_restore(flags); 1328 local_irq_restore(flags);
1300} 1329}
1301 1330
@@ -1343,13 +1372,13 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
1343 if (rnp->boost_kthread_task != NULL) 1372 if (rnp->boost_kthread_task != NULL)
1344 return 0; 1373 return 0;
1345 t = kthread_create(rcu_boost_kthread, (void *)rnp, 1374 t = kthread_create(rcu_boost_kthread, (void *)rnp,
1346 "rcub%d", rnp_index); 1375 "rcub/%d", rnp_index);
1347 if (IS_ERR(t)) 1376 if (IS_ERR(t))
1348 return PTR_ERR(t); 1377 return PTR_ERR(t);
1349 raw_spin_lock_irqsave(&rnp->lock, flags); 1378 raw_spin_lock_irqsave(&rnp->lock, flags);
1350 rnp->boost_kthread_task = t; 1379 rnp->boost_kthread_task = t;
1351 raw_spin_unlock_irqrestore(&rnp->lock, flags); 1380 raw_spin_unlock_irqrestore(&rnp->lock, flags);
1352 sp.sched_priority = RCU_KTHREAD_PRIO; 1381 sp.sched_priority = RCU_BOOST_PRIO;
1353 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); 1382 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
1354 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ 1383 wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
1355 return 0; 1384 return 0;
@@ -1444,6 +1473,7 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
1444{ 1473{
1445 struct sched_param sp; 1474 struct sched_param sp;
1446 struct timer_list yield_timer; 1475 struct timer_list yield_timer;
1476 int prio = current->rt_priority;
1447 1477
1448 setup_timer_on_stack(&yield_timer, f, arg); 1478 setup_timer_on_stack(&yield_timer, f, arg);
1449 mod_timer(&yield_timer, jiffies + 2); 1479 mod_timer(&yield_timer, jiffies + 2);
@@ -1451,7 +1481,8 @@ static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
1451 sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp); 1481 sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
1452 set_user_nice(current, 19); 1482 set_user_nice(current, 19);
1453 schedule(); 1483 schedule();
1454 sp.sched_priority = RCU_KTHREAD_PRIO; 1484 set_user_nice(current, 0);
1485 sp.sched_priority = prio;
1455 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); 1486 sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
1456 del_timer(&yield_timer); 1487 del_timer(&yield_timer);
1457} 1488}
@@ -1489,7 +1520,8 @@ static int rcu_cpu_kthread_should_stop(int cpu)
1489 1520
1490/* 1521/*
1491 * Per-CPU kernel thread that invokes RCU callbacks. This replaces the 1522 * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
1492 * earlier RCU softirq. 1523 * RCU softirq used in flavors and configurations of RCU that do not
1524 * support RCU priority boosting.
1493 */ 1525 */
1494static int rcu_cpu_kthread(void *arg) 1526static int rcu_cpu_kthread(void *arg)
1495{ 1527{
@@ -1500,9 +1532,12 @@ static int rcu_cpu_kthread(void *arg)
1500 char work; 1532 char work;
1501 char *workp = &per_cpu(rcu_cpu_has_work, cpu); 1533 char *workp = &per_cpu(rcu_cpu_has_work, cpu);
1502 1534
1535 trace_rcu_utilization("Start CPU kthread@init");
1503 for (;;) { 1536 for (;;) {
1504 *statusp = RCU_KTHREAD_WAITING; 1537 *statusp = RCU_KTHREAD_WAITING;
1538 trace_rcu_utilization("End CPU kthread@rcu_wait");
1505 rcu_wait(*workp != 0 || kthread_should_stop()); 1539 rcu_wait(*workp != 0 || kthread_should_stop());
1540 trace_rcu_utilization("Start CPU kthread@rcu_wait");
1506 local_bh_disable(); 1541 local_bh_disable();
1507 if (rcu_cpu_kthread_should_stop(cpu)) { 1542 if (rcu_cpu_kthread_should_stop(cpu)) {
1508 local_bh_enable(); 1543 local_bh_enable();
@@ -1523,11 +1558,14 @@ static int rcu_cpu_kthread(void *arg)
1523 spincnt = 0; 1558 spincnt = 0;
1524 if (spincnt > 10) { 1559 if (spincnt > 10) {
1525 *statusp = RCU_KTHREAD_YIELDING; 1560 *statusp = RCU_KTHREAD_YIELDING;
1561 trace_rcu_utilization("End CPU kthread@rcu_yield");
1526 rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu); 1562 rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
1563 trace_rcu_utilization("Start CPU kthread@rcu_yield");
1527 spincnt = 0; 1564 spincnt = 0;
1528 } 1565 }
1529 } 1566 }
1530 *statusp = RCU_KTHREAD_STOPPED; 1567 *statusp = RCU_KTHREAD_STOPPED;
1568 trace_rcu_utilization("End CPU kthread@term");
1531 return 0; 1569 return 0;
1532} 1570}
1533 1571
@@ -1560,7 +1598,10 @@ static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
1560 if (!rcu_scheduler_fully_active || 1598 if (!rcu_scheduler_fully_active ||
1561 per_cpu(rcu_cpu_kthread_task, cpu) != NULL) 1599 per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
1562 return 0; 1600 return 0;
1563 t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu); 1601 t = kthread_create_on_node(rcu_cpu_kthread,
1602 (void *)(long)cpu,
1603 cpu_to_node(cpu),
1604 "rcuc/%d", cpu);
1564 if (IS_ERR(t)) 1605 if (IS_ERR(t))
1565 return PTR_ERR(t); 1606 return PTR_ERR(t);
1566 if (cpu_online(cpu)) 1607 if (cpu_online(cpu))
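
Note: two things change here. The thread is renamed from "rcucN" to "rcuc/N", matching the usual per-CPU kthread convention (ksoftirqd/N, migration/N), and it is created with kthread_create_on_node() so its task_struct and stack are allocated on the NUMA node of the CPU it will serve. A hedged sketch of the general per-CPU helper pattern; my_thread_fn and "my_helper" are illustrative:

	struct task_struct *t;

	t = kthread_create_on_node(my_thread_fn, (void *)(long)cpu,
				   cpu_to_node(cpu), "my_helper/%d", cpu);
	if (IS_ERR(t))
		return PTR_ERR(t);
	kthread_bind(t, cpu);		/* optional: run only on that CPU */
	wake_up_process(t);
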
@@ -1669,7 +1710,7 @@ static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
1669 return 0; 1710 return 0;
1670 if (rnp->node_kthread_task == NULL) { 1711 if (rnp->node_kthread_task == NULL) {
1671 t = kthread_create(rcu_node_kthread, (void *)rnp, 1712 t = kthread_create(rcu_node_kthread, (void *)rnp,
1672 "rcun%d", rnp_index); 1713 "rcun/%d", rnp_index);
1673 if (IS_ERR(t)) 1714 if (IS_ERR(t))
1674 return PTR_ERR(t); 1715 return PTR_ERR(t);
1675 raw_spin_lock_irqsave(&rnp->lock, flags); 1716 raw_spin_lock_irqsave(&rnp->lock, flags);
@@ -1907,15 +1948,6 @@ int rcu_needs_cpu(int cpu)
1907 return rcu_needs_cpu_quick_check(cpu); 1948 return rcu_needs_cpu_quick_check(cpu);
1908} 1949}
1909 1950
1910/*
1911 * Check to see if we need to continue a callback-flush operations to
1912 * allow the last CPU to enter dyntick-idle mode. But fast dyntick-idle
1913 * entry is not configured, so we never do need to.
1914 */
1915static void rcu_needs_cpu_flush(void)
1916{
1917}
1918
1919#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 1951#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */
1920 1952
1921#define RCU_NEEDS_CPU_FLUSHES 5 1953#define RCU_NEEDS_CPU_FLUSHES 5
@@ -1991,20 +2023,4 @@ int rcu_needs_cpu(int cpu)
1991 return c; 2023 return c;
1992} 2024}
1993 2025
1994/*
1995 * Check to see if we need to continue a callback-flush operations to
1996 * allow the last CPU to enter dyntick-idle mode.
1997 */
1998static void rcu_needs_cpu_flush(void)
1999{
2000 int cpu = smp_processor_id();
2001 unsigned long flags;
2002
2003 if (per_cpu(rcu_dyntick_drain, cpu) <= 0)
2004 return;
2005 local_irq_save(flags);
2006 (void)rcu_needs_cpu(cpu);
2007 local_irq_restore(flags);
2008}
2009
2010#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ 2026#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index 3b0c0986afc..9feffa4c069 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -48,11 +48,6 @@
48 48
49#ifdef CONFIG_RCU_BOOST 49#ifdef CONFIG_RCU_BOOST
50 50
51DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
52DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu);
53DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
54DECLARE_PER_CPU(char, rcu_cpu_has_work);
55
56static char convert_kthread_status(unsigned int kthread_status) 51static char convert_kthread_status(unsigned int kthread_status)
57{ 52{
58 if (kthread_status > RCU_KTHREAD_MAX) 53 if (kthread_status > RCU_KTHREAD_MAX)
@@ -66,11 +61,11 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
66{ 61{
67 if (!rdp->beenonline) 62 if (!rdp->beenonline)
68 return; 63 return;
69 seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pqc=%lu qp=%d", 64 seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pgp=%lu qp=%d",
70 rdp->cpu, 65 rdp->cpu,
71 cpu_is_offline(rdp->cpu) ? '!' : ' ', 66 cpu_is_offline(rdp->cpu) ? '!' : ' ',
72 rdp->completed, rdp->gpnum, 67 rdp->completed, rdp->gpnum,
73 rdp->passed_quiesc, rdp->passed_quiesc_completed, 68 rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
74 rdp->qs_pending); 69 rdp->qs_pending);
75#ifdef CONFIG_NO_HZ 70#ifdef CONFIG_NO_HZ
76 seq_printf(m, " dt=%d/%d/%d df=%lu", 71 seq_printf(m, " dt=%d/%d/%d df=%lu",
@@ -144,7 +139,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
144 rdp->cpu, 139 rdp->cpu,
145 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", 140 cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"",
146 rdp->completed, rdp->gpnum, 141 rdp->completed, rdp->gpnum,
147 rdp->passed_quiesc, rdp->passed_quiesc_completed, 142 rdp->passed_quiesce, rdp->passed_quiesce_gpnum,
148 rdp->qs_pending); 143 rdp->qs_pending);
149#ifdef CONFIG_NO_HZ 144#ifdef CONFIG_NO_HZ
150 seq_printf(m, ",%d,%d,%d,%lu", 145 seq_printf(m, ",%d,%d,%d,%lu",
@@ -175,7 +170,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
175 170
176static int show_rcudata_csv(struct seq_file *m, void *unused) 171static int show_rcudata_csv(struct seq_file *m, void *unused)
177{ 172{
178 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\","); 173 seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pgp\",\"pq\",");
179#ifdef CONFIG_NO_HZ 174#ifdef CONFIG_NO_HZ
180 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\","); 175 seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
181#endif /* #ifdef CONFIG_NO_HZ */ 176#endif /* #ifdef CONFIG_NO_HZ */
diff --git a/kernel/relay.c b/kernel/relay.c
index 859ea5a9605..226fade4d72 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -15,7 +15,7 @@
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/stddef.h> 16#include <linux/stddef.h>
17#include <linux/slab.h> 17#include <linux/slab.h>
18#include <linux/module.h> 18#include <linux/export.h>
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/relay.h> 20#include <linux/relay.h>
21#include <linux/vmalloc.h> 21#include <linux/vmalloc.h>
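
Note: this one-line include change (and the matching ones in resource.c, rtmutex-debug.c, rtmutex-tester.c, rtmutex.c and rwsem.c below) is part of the tree-wide module.h split: files that only export symbols, but are not modules themselves, can include the much lighter <linux/export.h>. A minimal sketch, with my_helper as an illustrative symbol:

#include <linux/export.h>

int my_helper(void)
{
	return 0;
}
EXPORT_SYMBOL_GPL(my_helper);	/* no <linux/module.h> needed just for this */
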
diff --git a/kernel/resource.c b/kernel/resource.c
index 3b3cedc5259..7640b3a947d 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -7,7 +7,7 @@
7 * Arbitrary resource management. 7 * Arbitrary resource management.
8 */ 8 */
9 9
10#include <linux/module.h> 10#include <linux/export.h>
11#include <linux/errno.h> 11#include <linux/errno.h>
12#include <linux/ioport.h> 12#include <linux/ioport.h>
13#include <linux/init.h> 13#include <linux/init.h>
@@ -419,6 +419,9 @@ static int __find_resource(struct resource *root, struct resource *old,
419 else 419 else
420 tmp.end = root->end; 420 tmp.end = root->end;
421 421
422 if (tmp.end < tmp.start)
423 goto next;
424
422 resource_clip(&tmp, constraint->min, constraint->max); 425 resource_clip(&tmp, constraint->min, constraint->max);
423 arch_remove_reservations(&tmp); 426 arch_remove_reservations(&tmp);
424 427
@@ -436,8 +439,10 @@ static int __find_resource(struct resource *root, struct resource *old,
436 return 0; 439 return 0;
437 } 440 }
438 } 441 }
439 if (!this) 442
443next: if (!this || this->end == root->end)
440 break; 444 break;
445
441 if (this != old) 446 if (this != old)
442 tmp.start = this->end + 1; 447 tmp.start = this->end + 1;
443 this = this->sibling; 448 this = this->sibling;
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index 3c7cbc2c33b..8eafd1bd273 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -18,7 +18,7 @@
18 */ 18 */
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/delay.h> 20#include <linux/delay.h>
21#include <linux/module.h> 21#include <linux/export.h>
22#include <linux/spinlock.h> 22#include <linux/spinlock.h>
23#include <linux/kallsyms.h> 23#include <linux/kallsyms.h>
24#include <linux/syscalls.h> 24#include <linux/syscalls.h>
@@ -29,61 +29,6 @@
29 29
30#include "rtmutex_common.h" 30#include "rtmutex_common.h"
31 31
32# define TRACE_WARN_ON(x) WARN_ON(x)
33# define TRACE_BUG_ON(x) BUG_ON(x)
34
35# define TRACE_OFF() \
36do { \
37 if (rt_trace_on) { \
38 rt_trace_on = 0; \
39 console_verbose(); \
40 if (raw_spin_is_locked(&current->pi_lock)) \
41 raw_spin_unlock(&current->pi_lock); \
42 } \
43} while (0)
44
45# define TRACE_OFF_NOLOCK() \
46do { \
47 if (rt_trace_on) { \
48 rt_trace_on = 0; \
49 console_verbose(); \
50 } \
51} while (0)
52
53# define TRACE_BUG_LOCKED() \
54do { \
55 TRACE_OFF(); \
56 BUG(); \
57} while (0)
58
59# define TRACE_WARN_ON_LOCKED(c) \
60do { \
61 if (unlikely(c)) { \
62 TRACE_OFF(); \
63 WARN_ON(1); \
64 } \
65} while (0)
66
67# define TRACE_BUG_ON_LOCKED(c) \
68do { \
69 if (unlikely(c)) \
70 TRACE_BUG_LOCKED(); \
71} while (0)
72
73#ifdef CONFIG_SMP
74# define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c)
75#else
76# define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0)
77#endif
78
79/*
80 * deadlock detection flag. We turn it off when we detect
81 * the first problem because we dont want to recurse back
82 * into the tracing code when doing error printk or
83 * executing a BUG():
84 */
85static int rt_trace_on = 1;
86
87static void printk_task(struct task_struct *p) 32static void printk_task(struct task_struct *p)
88{ 33{
89 if (p) 34 if (p)
@@ -111,8 +56,8 @@ static void printk_lock(struct rt_mutex *lock, int print_owner)
111 56
112void rt_mutex_debug_task_free(struct task_struct *task) 57void rt_mutex_debug_task_free(struct task_struct *task)
113{ 58{
114 WARN_ON(!plist_head_empty(&task->pi_waiters)); 59 DEBUG_LOCKS_WARN_ON(!plist_head_empty(&task->pi_waiters));
115 WARN_ON(task->pi_blocked_on); 60 DEBUG_LOCKS_WARN_ON(task->pi_blocked_on);
116} 61}
117 62
118/* 63/*
@@ -125,7 +70,7 @@ void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter,
125{ 70{
126 struct task_struct *task; 71 struct task_struct *task;
127 72
128 if (!rt_trace_on || detect || !act_waiter) 73 if (!debug_locks || detect || !act_waiter)
129 return; 74 return;
130 75
131 task = rt_mutex_owner(act_waiter->lock); 76 task = rt_mutex_owner(act_waiter->lock);
@@ -139,7 +84,7 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
139{ 84{
140 struct task_struct *task; 85 struct task_struct *task;
141 86
142 if (!waiter->deadlock_lock || !rt_trace_on) 87 if (!waiter->deadlock_lock || !debug_locks)
143 return; 88 return;
144 89
145 rcu_read_lock(); 90 rcu_read_lock();
@@ -149,7 +94,10 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
149 return; 94 return;
150 } 95 }
151 96
152 TRACE_OFF_NOLOCK(); 97 if (!debug_locks_off()) {
98 rcu_read_unlock();
99 return;
100 }
153 101
154 printk("\n============================================\n"); 102 printk("\n============================================\n");
155 printk( "[ BUG: circular locking deadlock detected! ]\n"); 103 printk( "[ BUG: circular locking deadlock detected! ]\n");
@@ -180,7 +128,6 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
180 128
181 printk("[ turning off deadlock detection." 129 printk("[ turning off deadlock detection."
182 "Please report this trace. ]\n\n"); 130 "Please report this trace. ]\n\n");
183 local_irq_disable();
184} 131}
185 132
186void debug_rt_mutex_lock(struct rt_mutex *lock) 133void debug_rt_mutex_lock(struct rt_mutex *lock)
@@ -189,7 +136,7 @@ void debug_rt_mutex_lock(struct rt_mutex *lock)
189 136
190void debug_rt_mutex_unlock(struct rt_mutex *lock) 137void debug_rt_mutex_unlock(struct rt_mutex *lock)
191{ 138{
192 TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current); 139 DEBUG_LOCKS_WARN_ON(rt_mutex_owner(lock) != current);
193} 140}
194 141
195void 142void
@@ -199,7 +146,7 @@ debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner)
199 146
200void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) 147void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
201{ 148{
202 TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock)); 149 DEBUG_LOCKS_WARN_ON(!rt_mutex_owner(lock));
203} 150}
204 151
205void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) 152void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
@@ -213,8 +160,8 @@ void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
213void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) 160void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
214{ 161{
215 put_pid(waiter->deadlock_task_pid); 162 put_pid(waiter->deadlock_task_pid);
216 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); 163 DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->list_entry));
217 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); 164 DEBUG_LOCKS_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
218 memset(waiter, 0x22, sizeof(*waiter)); 165 memset(waiter, 0x22, sizeof(*waiter));
219} 166}
220 167
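
Note: rtmutex-debug.c drops its private rt_trace_on flag and the TRACE_*() wrappers in favor of the shared debug_locks infrastructure also used by lockdep, so the first detected problem disables further lock debugging system-wide and only one report is printed. The report-once shape is roughly:

	if (!debug_locks)		/* debugging already disabled? */
		return;

	if (!debug_locks_off())		/* non-zero only for the caller that wins */
		return;

	printk("\n[ BUG: ... ]\n");	/* exactly one report, then stay quiet */
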
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 5c9ccd38096..3d9f31cd79e 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -7,7 +7,7 @@
7 * 7 *
8 */ 8 */
9#include <linux/kthread.h> 9#include <linux/kthread.h>
10#include <linux/module.h> 10#include <linux/export.h>
11#include <linux/sched.h> 11#include <linux/sched.h>
12#include <linux/spinlock.h> 12#include <linux/spinlock.h>
13#include <linux/sysdev.h> 13#include <linux/sysdev.h>
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 255e1662acd..f9d8482dd48 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -11,7 +11,7 @@
11 * See Documentation/rt-mutex-design.txt for details. 11 * See Documentation/rt-mutex-design.txt for details.
12 */ 12 */
13#include <linux/spinlock.h> 13#include <linux/spinlock.h>
14#include <linux/module.h> 14#include <linux/export.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/timer.h> 16#include <linux/timer.h>
17 17
@@ -579,6 +579,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
579 struct rt_mutex_waiter *waiter) 579 struct rt_mutex_waiter *waiter)
580{ 580{
581 int ret = 0; 581 int ret = 0;
582 int was_disabled;
582 583
583 for (;;) { 584 for (;;) {
584 /* Try to acquire the lock: */ 585 /* Try to acquire the lock: */
@@ -601,10 +602,17 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
601 602
602 raw_spin_unlock(&lock->wait_lock); 603 raw_spin_unlock(&lock->wait_lock);
603 604
605 was_disabled = irqs_disabled();
606 if (was_disabled)
607 local_irq_enable();
608
604 debug_rt_mutex_print_deadlock(waiter); 609 debug_rt_mutex_print_deadlock(waiter);
605 610
606 schedule_rt_mutex(lock); 611 schedule_rt_mutex(lock);
607 612
613 if (was_disabled)
614 local_irq_disable();
615
608 raw_spin_lock(&lock->wait_lock); 616 raw_spin_lock(&lock->wait_lock);
609 set_current_state(state); 617 set_current_state(state);
610 } 618 }
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index 9f48f3d82e9..b152f74f02d 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -7,7 +7,7 @@
7#include <linux/types.h> 7#include <linux/types.h>
8#include <linux/kernel.h> 8#include <linux/kernel.h>
9#include <linux/sched.h> 9#include <linux/sched.h>
10#include <linux/module.h> 10#include <linux/export.h>
11#include <linux/rwsem.h> 11#include <linux/rwsem.h>
12 12
13#include <asm/system.h> 13#include <asm/system.h>
diff --git a/kernel/sched.c b/kernel/sched.c
index e1290ecee3c..d6b149ccf92 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -71,6 +71,7 @@
71#include <linux/ctype.h> 71#include <linux/ctype.h>
72#include <linux/ftrace.h> 72#include <linux/ftrace.h>
73#include <linux/slab.h> 73#include <linux/slab.h>
74#include <linux/init_task.h>
74 75
75#include <asm/tlb.h> 76#include <asm/tlb.h>
76#include <asm/irq_regs.h> 77#include <asm/irq_regs.h>
@@ -196,10 +197,28 @@ static inline int rt_bandwidth_enabled(void)
196 return sysctl_sched_rt_runtime >= 0; 197 return sysctl_sched_rt_runtime >= 0;
197} 198}
198 199
199static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 200static void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period)
200{ 201{
201 ktime_t now; 202 unsigned long delta;
203 ktime_t soft, hard, now;
204
205 for (;;) {
206 if (hrtimer_active(period_timer))
207 break;
208
209 now = hrtimer_cb_get_time(period_timer);
210 hrtimer_forward(period_timer, now, period);
202 211
212 soft = hrtimer_get_softexpires(period_timer);
213 hard = hrtimer_get_expires(period_timer);
214 delta = ktime_to_ns(ktime_sub(hard, soft));
215 __hrtimer_start_range_ns(period_timer, soft, delta,
216 HRTIMER_MODE_ABS_PINNED, 0);
217 }
218}
219
220static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
221{
203 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF) 222 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
204 return; 223 return;
205 224
@@ -207,22 +226,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
207 return; 226 return;
208 227
209 raw_spin_lock(&rt_b->rt_runtime_lock); 228 raw_spin_lock(&rt_b->rt_runtime_lock);
210 for (;;) { 229 start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
211 unsigned long delta;
212 ktime_t soft, hard;
213
214 if (hrtimer_active(&rt_b->rt_period_timer))
215 break;
216
217 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
218 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
219
220 soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
221 hard = hrtimer_get_expires(&rt_b->rt_period_timer);
222 delta = ktime_to_ns(ktime_sub(hard, soft));
223 __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
224 HRTIMER_MODE_ABS_PINNED, 0);
225 }
226 raw_spin_unlock(&rt_b->rt_runtime_lock); 230 raw_spin_unlock(&rt_b->rt_runtime_lock);
227} 231}
228 232
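
Note: the hrtimer forward-and-arm loop that used to be open-coded in start_rt_bandwidth() is factored into start_bandwidth_timer() so the new CFS bandwidth code added later in this patch can reuse it. With the respective lock held, both call sites reduce to:

	/* RT bandwidth, under rt_b->rt_runtime_lock: */
	start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);

	/* CFS bandwidth, under cfs_b->lock in __start_cfs_bandwidth(): */
	start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
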
@@ -247,6 +251,24 @@ struct cfs_rq;
247 251
248static LIST_HEAD(task_groups); 252static LIST_HEAD(task_groups);
249 253
254struct cfs_bandwidth {
255#ifdef CONFIG_CFS_BANDWIDTH
256 raw_spinlock_t lock;
257 ktime_t period;
258 u64 quota, runtime;
259 s64 hierarchal_quota;
260 u64 runtime_expires;
261
262 int idle, timer_active;
263 struct hrtimer period_timer, slack_timer;
264 struct list_head throttled_cfs_rq;
265
266 /* statistics */
267 int nr_periods, nr_throttled;
268 u64 throttled_time;
269#endif
270};
271
250/* task group related information */ 272/* task group related information */
251struct task_group { 273struct task_group {
252 struct cgroup_subsys_state css; 274 struct cgroup_subsys_state css;
@@ -278,6 +300,8 @@ struct task_group {
278#ifdef CONFIG_SCHED_AUTOGROUP 300#ifdef CONFIG_SCHED_AUTOGROUP
279 struct autogroup *autogroup; 301 struct autogroup *autogroup;
280#endif 302#endif
303
304 struct cfs_bandwidth cfs_bandwidth;
281}; 305};
282 306
283/* task_group_lock serializes the addition/removal of task groups */ 307/* task_group_lock serializes the addition/removal of task groups */
@@ -311,7 +335,7 @@ struct task_group root_task_group;
311/* CFS-related fields in a runqueue */ 335/* CFS-related fields in a runqueue */
312struct cfs_rq { 336struct cfs_rq {
313 struct load_weight load; 337 struct load_weight load;
314 unsigned long nr_running; 338 unsigned long nr_running, h_nr_running;
315 339
316 u64 exec_clock; 340 u64 exec_clock;
317 u64 min_vruntime; 341 u64 min_vruntime;
@@ -377,9 +401,120 @@ struct cfs_rq {
377 401
378 unsigned long load_contribution; 402 unsigned long load_contribution;
379#endif 403#endif
404#ifdef CONFIG_CFS_BANDWIDTH
405 int runtime_enabled;
406 u64 runtime_expires;
407 s64 runtime_remaining;
408
409 u64 throttled_timestamp;
410 int throttled, throttle_count;
411 struct list_head throttled_list;
412#endif
380#endif 413#endif
381}; 414};
382 415
416#ifdef CONFIG_FAIR_GROUP_SCHED
417#ifdef CONFIG_CFS_BANDWIDTH
418static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
419{
420 return &tg->cfs_bandwidth;
421}
422
423static inline u64 default_cfs_period(void);
424static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
425static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
426
427static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
428{
429 struct cfs_bandwidth *cfs_b =
430 container_of(timer, struct cfs_bandwidth, slack_timer);
431 do_sched_cfs_slack_timer(cfs_b);
432
433 return HRTIMER_NORESTART;
434}
435
436static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
437{
438 struct cfs_bandwidth *cfs_b =
439 container_of(timer, struct cfs_bandwidth, period_timer);
440 ktime_t now;
441 int overrun;
442 int idle = 0;
443
444 for (;;) {
445 now = hrtimer_cb_get_time(timer);
446 overrun = hrtimer_forward(timer, now, cfs_b->period);
447
448 if (!overrun)
449 break;
450
451 idle = do_sched_cfs_period_timer(cfs_b, overrun);
452 }
453
454 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
455}
456
457static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
458{
459 raw_spin_lock_init(&cfs_b->lock);
460 cfs_b->runtime = 0;
461 cfs_b->quota = RUNTIME_INF;
462 cfs_b->period = ns_to_ktime(default_cfs_period());
463
464 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
465 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
466 cfs_b->period_timer.function = sched_cfs_period_timer;
467 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
468 cfs_b->slack_timer.function = sched_cfs_slack_timer;
469}
470
471static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
472{
473 cfs_rq->runtime_enabled = 0;
474 INIT_LIST_HEAD(&cfs_rq->throttled_list);
475}
476
477/* requires cfs_b->lock, may release to reprogram timer */
478static void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
479{
480 /*
481 * The timer may be active because we're trying to set a new bandwidth
482 * period or because we're racing with the tear-down path
483 * (timer_active==0 becomes visible before the hrtimer call-back
484 * terminates). In either case we ensure that it's re-programmed
485 */
486 while (unlikely(hrtimer_active(&cfs_b->period_timer))) {
487 raw_spin_unlock(&cfs_b->lock);
488 /* ensure cfs_b->lock is available while we wait */
489 hrtimer_cancel(&cfs_b->period_timer);
490
491 raw_spin_lock(&cfs_b->lock);
492 /* if someone else restarted the timer then we're done */
493 if (cfs_b->timer_active)
494 return;
495 }
496
497 cfs_b->timer_active = 1;
498 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
499}
500
501static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
502{
503 hrtimer_cancel(&cfs_b->period_timer);
504 hrtimer_cancel(&cfs_b->slack_timer);
505}
506#else
507static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
508static void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
509static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
510
511static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
512{
513 return NULL;
514}
515#endif /* CONFIG_CFS_BANDWIDTH */
516#endif /* CONFIG_FAIR_GROUP_SCHED */
517
383/* Real-Time classes' related field in a runqueue: */ 518/* Real-Time classes' related field in a runqueue: */
384struct rt_rq { 519struct rt_rq {
385 struct rt_prio_array active; 520 struct rt_prio_array active;
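
Note: this block only adds the per-task-group bandwidth bookkeeping and its two hrtimers (the period timer that replenishes quota, the slack timer that hands back unused runtime); enforcement and the cgroup interface arrive in later hunks and patches of the series. Assuming the series exposes the knobs through the cpu cgroup controller as cpu.cfs_period_us and cpu.cfs_quota_us (microseconds, quota -1 meaning unlimited), capping a group at half a CPU could look like the user-space sketch below; the mount point and file names are assumptions, not shown in this diff:

#include <stdio.h>

static int cap_group(const char *group, long quota_us, long period_us)
{
	char path[256];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/fs/cgroup/cpu/%s/cpu.cfs_period_us", group);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%ld\n", period_us);
	fclose(f);

	snprintf(path, sizeof(path),
		 "/sys/fs/cgroup/cpu/%s/cpu.cfs_quota_us", group);
	f = fopen(path, "w");
	if (!f)
		return -1;
	fprintf(f, "%ld\n", quota_us);	/* e.g. 50000 against a 100000 period */
	fclose(f);
	return 0;
}
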
@@ -510,7 +645,7 @@ struct rq {
510 645
511 unsigned long cpu_power; 646 unsigned long cpu_power;
512 647
513 unsigned char idle_at_tick; 648 unsigned char idle_balance;
514 /* For active balancing */ 649 /* For active balancing */
515 int post_schedule; 650 int post_schedule;
516 int active_balance; 651 int active_balance;
@@ -520,8 +655,6 @@ struct rq {
520 int cpu; 655 int cpu;
521 int online; 656 int online;
522 657
523 unsigned long avg_load_per_task;
524
525 u64 rt_avg; 658 u64 rt_avg;
526 u64 age_stamp; 659 u64 age_stamp;
527 u64 idle_stamp; 660 u64 idle_stamp;
@@ -570,7 +703,7 @@ struct rq {
570#endif 703#endif
571 704
572#ifdef CONFIG_SMP 705#ifdef CONFIG_SMP
573 struct task_struct *wake_list; 706 struct llist_head wake_list;
574#endif 707#endif
575}; 708};
576 709
@@ -1272,6 +1405,18 @@ void wake_up_idle_cpu(int cpu)
1272 smp_send_reschedule(cpu); 1405 smp_send_reschedule(cpu);
1273} 1406}
1274 1407
1408static inline bool got_nohz_idle_kick(void)
1409{
1410 return idle_cpu(smp_processor_id()) && this_rq()->nohz_balance_kick;
1411}
1412
1413#else /* CONFIG_NO_HZ */
1414
1415static inline bool got_nohz_idle_kick(void)
1416{
1417 return false;
1418}
1419
1275#endif /* CONFIG_NO_HZ */ 1420#endif /* CONFIG_NO_HZ */
1276 1421
1277static u64 sched_avg_period(void) 1422static u64 sched_avg_period(void)
@@ -1471,24 +1616,28 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1471 update_load_sub(&rq->load, load); 1616 update_load_sub(&rq->load, load);
1472} 1617}
1473 1618
1474#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED) 1619#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
1620 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
1475typedef int (*tg_visitor)(struct task_group *, void *); 1621typedef int (*tg_visitor)(struct task_group *, void *);
1476 1622
1477/* 1623/*
1478 * Iterate the full tree, calling @down when first entering a node and @up when 1624 * Iterate task_group tree rooted at *from, calling @down when first entering a
1479 * leaving it for the final time. 1625 * node and @up when leaving it for the final time.
1626 *
1627 * Caller must hold rcu_lock or sufficient equivalent.
1480 */ 1628 */
1481static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) 1629static int walk_tg_tree_from(struct task_group *from,
1630 tg_visitor down, tg_visitor up, void *data)
1482{ 1631{
1483 struct task_group *parent, *child; 1632 struct task_group *parent, *child;
1484 int ret; 1633 int ret;
1485 1634
1486 rcu_read_lock(); 1635 parent = from;
1487 parent = &root_task_group; 1636
1488down: 1637down:
1489 ret = (*down)(parent, data); 1638 ret = (*down)(parent, data);
1490 if (ret) 1639 if (ret)
1491 goto out_unlock; 1640 goto out;
1492 list_for_each_entry_rcu(child, &parent->children, siblings) { 1641 list_for_each_entry_rcu(child, &parent->children, siblings) {
1493 parent = child; 1642 parent = child;
1494 goto down; 1643 goto down;
@@ -1497,19 +1646,29 @@ up:
1497 continue; 1646 continue;
1498 } 1647 }
1499 ret = (*up)(parent, data); 1648 ret = (*up)(parent, data);
1500 if (ret) 1649 if (ret || parent == from)
1501 goto out_unlock; 1650 goto out;
1502 1651
1503 child = parent; 1652 child = parent;
1504 parent = parent->parent; 1653 parent = parent->parent;
1505 if (parent) 1654 if (parent)
1506 goto up; 1655 goto up;
1507out_unlock: 1656out:
1508 rcu_read_unlock();
1509
1510 return ret; 1657 return ret;
1511} 1658}
1512 1659
1660/*
1661 * Iterate the full tree, calling @down when first entering a node and @up when
1662 * leaving it for the final time.
1663 *
1664 * Caller must hold rcu_lock or sufficient equivalent.
1665 */
1666
1667static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1668{
1669 return walk_tg_tree_from(&root_task_group, down, up, data);
1670}
1671
1513static int tg_nop(struct task_group *tg, void *data) 1672static int tg_nop(struct task_group *tg, void *data)
1514{ 1673{
1515 return 0; 1674 return 0;
@@ -1569,11 +1728,9 @@ static unsigned long cpu_avg_load_per_task(int cpu)
1569 unsigned long nr_running = ACCESS_ONCE(rq->nr_running); 1728 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
1570 1729
1571 if (nr_running) 1730 if (nr_running)
1572 rq->avg_load_per_task = rq->load.weight / nr_running; 1731 return rq->load.weight / nr_running;
1573 else
1574 rq->avg_load_per_task = 0;
1575 1732
1576 return rq->avg_load_per_task; 1733 return 0;
1577} 1734}
1578 1735
1579#ifdef CONFIG_PREEMPT 1736#ifdef CONFIG_PREEMPT
@@ -1739,7 +1896,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1739#ifdef CONFIG_SMP 1896#ifdef CONFIG_SMP
1740 /* 1897 /*
1741 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be 1898 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1742 * successfuly executed on another CPU. We must ensure that updates of 1899 * successfully executed on another CPU. We must ensure that updates of
1743 * per-task data have been completed by this moment. 1900 * per-task data have been completed by this moment.
1744 */ 1901 */
1745 smp_wmb(); 1902 smp_wmb();
@@ -1806,7 +1963,6 @@ static void activate_task(struct rq *rq, struct task_struct *p, int flags)
1806 rq->nr_uninterruptible--; 1963 rq->nr_uninterruptible--;
1807 1964
1808 enqueue_task(rq, p, flags); 1965 enqueue_task(rq, p, flags);
1809 inc_nr_running(rq);
1810} 1966}
1811 1967
1812/* 1968/*
@@ -1818,7 +1974,6 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1818 rq->nr_uninterruptible++; 1974 rq->nr_uninterruptible++;
1819 1975
1820 dequeue_task(rq, p, flags); 1976 dequeue_task(rq, p, flags);
1821 dec_nr_running(rq);
1822} 1977}
1823 1978
1824#ifdef CONFIG_IRQ_TIME_ACCOUNTING 1979#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -2390,11 +2545,11 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
2390 2545
2391 /* Look for allowed, online CPU in same node. */ 2546 /* Look for allowed, online CPU in same node. */
2392 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask) 2547 for_each_cpu_and(dest_cpu, nodemask, cpu_active_mask)
2393 if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 2548 if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
2394 return dest_cpu; 2549 return dest_cpu;
2395 2550
2396 /* Any allowed, online CPU? */ 2551 /* Any allowed, online CPU? */
2397 dest_cpu = cpumask_any_and(&p->cpus_allowed, cpu_active_mask); 2552 dest_cpu = cpumask_any_and(tsk_cpus_allowed(p), cpu_active_mask);
2398 if (dest_cpu < nr_cpu_ids) 2553 if (dest_cpu < nr_cpu_ids)
2399 return dest_cpu; 2554 return dest_cpu;
2400 2555
@@ -2431,7 +2586,7 @@ int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
2431 * [ this allows ->select_task() to simply return task_cpu(p) and 2586 * [ this allows ->select_task() to simply return task_cpu(p) and
2432 * not worry about this generic constraint ] 2587 * not worry about this generic constraint ]
2433 */ 2588 */
2434 if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) || 2589 if (unlikely(!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) ||
2435 !cpu_online(cpu))) 2590 !cpu_online(cpu)))
2436 cpu = select_fallback_rq(task_cpu(p), p); 2591 cpu = select_fallback_rq(task_cpu(p), p);
2437 2592
@@ -2556,42 +2711,26 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
2556} 2711}
2557 2712
2558#ifdef CONFIG_SMP 2713#ifdef CONFIG_SMP
2559static void sched_ttwu_do_pending(struct task_struct *list) 2714static void sched_ttwu_pending(void)
2560{ 2715{
2561 struct rq *rq = this_rq(); 2716 struct rq *rq = this_rq();
2717 struct llist_node *llist = llist_del_all(&rq->wake_list);
2718 struct task_struct *p;
2562 2719
2563 raw_spin_lock(&rq->lock); 2720 raw_spin_lock(&rq->lock);
2564 2721
2565 while (list) { 2722 while (llist) {
2566 struct task_struct *p = list; 2723 p = llist_entry(llist, struct task_struct, wake_entry);
2567 list = list->wake_entry; 2724 llist = llist_next(llist);
2568 ttwu_do_activate(rq, p, 0); 2725 ttwu_do_activate(rq, p, 0);
2569 } 2726 }
2570 2727
2571 raw_spin_unlock(&rq->lock); 2728 raw_spin_unlock(&rq->lock);
2572} 2729}
2573 2730
2574#ifdef CONFIG_HOTPLUG_CPU
2575
2576static void sched_ttwu_pending(void)
2577{
2578 struct rq *rq = this_rq();
2579 struct task_struct *list = xchg(&rq->wake_list, NULL);
2580
2581 if (!list)
2582 return;
2583
2584 sched_ttwu_do_pending(list);
2585}
2586
2587#endif /* CONFIG_HOTPLUG_CPU */
2588
2589void scheduler_ipi(void) 2731void scheduler_ipi(void)
2590{ 2732{
2591 struct rq *rq = this_rq(); 2733 if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
2592 struct task_struct *list = xchg(&rq->wake_list, NULL);
2593
2594 if (!list)
2595 return; 2734 return;
2596 2735
2597 /* 2736 /*
@@ -2608,25 +2747,21 @@ void scheduler_ipi(void)
2608 * somewhat pessimize the simple resched case. 2747 * somewhat pessimize the simple resched case.
2609 */ 2748 */
2610 irq_enter(); 2749 irq_enter();
2611 sched_ttwu_do_pending(list); 2750 sched_ttwu_pending();
2751
2752 /*
2753 * Check if someone kicked us for doing the nohz idle load balance.
2754 */
2755 if (unlikely(got_nohz_idle_kick() && !need_resched())) {
2756 this_rq()->idle_balance = 1;
2757 raise_softirq_irqoff(SCHED_SOFTIRQ);
2758 }
2612 irq_exit(); 2759 irq_exit();
2613} 2760}
2614 2761
2615static void ttwu_queue_remote(struct task_struct *p, int cpu) 2762static void ttwu_queue_remote(struct task_struct *p, int cpu)
2616{ 2763{
2617 struct rq *rq = cpu_rq(cpu); 2764 if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
2618 struct task_struct *next = rq->wake_list;
2619
2620 for (;;) {
2621 struct task_struct *old = next;
2622
2623 p->wake_entry = next;
2624 next = cmpxchg(&rq->wake_list, old, p);
2625 if (next == old)
2626 break;
2627 }
2628
2629 if (!next)
2630 smp_send_reschedule(cpu); 2765 smp_send_reschedule(cpu);
2631} 2766}
2632 2767
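
Note: the open-coded cmpxchg list used for remote wake-ups is replaced by the generic lock-less NULL-terminated single-linked list from <linux/llist.h>: producers push entries with a single atomic operation and learn whether the list was empty (so only the first enqueue sends the rescheduling IPI), while the consumer detaches the whole list at once and walks it privately. Condensed from the hunk above:

	/* producer side, ttwu_queue_remote(): */
	if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
		smp_send_reschedule(cpu);	/* list was empty: kick that CPU */

	/* consumer side, sched_ttwu_pending(), with rq->lock held: */
	struct llist_node *llist = llist_del_all(&rq->wake_list);

	while (llist) {
		p = llist_entry(llist, struct task_struct, wake_entry);
		llist = llist_next(llist);
		ttwu_do_activate(rq, p, 0);
	}

The same emptiness check lets scheduler_ipi() return early when there is neither pending wake-up work nor a nohz idle-balance kick.
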
@@ -2848,19 +2983,23 @@ void sched_fork(struct task_struct *p)
2848 p->state = TASK_RUNNING; 2983 p->state = TASK_RUNNING;
2849 2984
2850 /* 2985 /*
2986 * Make sure we do not leak PI boosting priority to the child.
2987 */
2988 p->prio = current->normal_prio;
2989
2990 /*
2851 * Revert to default priority/policy on fork if requested. 2991 * Revert to default priority/policy on fork if requested.
2852 */ 2992 */
2853 if (unlikely(p->sched_reset_on_fork)) { 2993 if (unlikely(p->sched_reset_on_fork)) {
2854 if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { 2994 if (task_has_rt_policy(p)) {
2855 p->policy = SCHED_NORMAL; 2995 p->policy = SCHED_NORMAL;
2856 p->normal_prio = p->static_prio;
2857 }
2858
2859 if (PRIO_TO_NICE(p->static_prio) < 0) {
2860 p->static_prio = NICE_TO_PRIO(0); 2996 p->static_prio = NICE_TO_PRIO(0);
2861 p->normal_prio = p->static_prio; 2997 p->rt_priority = 0;
2862 set_load_weight(p); 2998 } else if (PRIO_TO_NICE(p->static_prio) < 0)
2863 } 2999 p->static_prio = NICE_TO_PRIO(0);
3000
3001 p->prio = p->normal_prio = __normal_prio(p);
3002 set_load_weight(p);
2864 3003
2865 /* 3004 /*
2866 * We don't need the reset flag anymore after the fork. It has 3005 * We don't need the reset flag anymore after the fork. It has
@@ -2869,11 +3008,6 @@ void sched_fork(struct task_struct *p)
2869 p->sched_reset_on_fork = 0; 3008 p->sched_reset_on_fork = 0;
2870 } 3009 }
2871 3010
2872 /*
2873 * Make sure we do not leak PI boosting priority to the child.
2874 */
2875 p->prio = current->normal_prio;
2876
2877 if (!rt_prio(p->prio)) 3011 if (!rt_prio(p->prio))
2878 p->sched_class = &fair_sched_class; 3012 p->sched_class = &fair_sched_class;
2879 3013
@@ -3065,7 +3199,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
3065#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 3199#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
3066 local_irq_disable(); 3200 local_irq_disable();
3067#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 3201#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
3068 perf_event_task_sched_in(current); 3202 perf_event_task_sched_in(prev, current);
3069#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW 3203#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
3070 local_irq_enable(); 3204 local_irq_enable();
3071#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ 3205#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
@@ -4116,7 +4250,7 @@ void scheduler_tick(void)
4116 perf_event_task_tick(); 4250 perf_event_task_tick();
4117 4251
4118#ifdef CONFIG_SMP 4252#ifdef CONFIG_SMP
4119 rq->idle_at_tick = idle_cpu(cpu); 4253 rq->idle_balance = idle_cpu(cpu);
4120 trigger_load_balance(rq, cpu); 4254 trigger_load_balance(rq, cpu);
4121#endif 4255#endif
4122} 4256}
@@ -4213,6 +4347,7 @@ static inline void schedule_debug(struct task_struct *prev)
4213 */ 4347 */
4214 if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) 4348 if (unlikely(in_atomic_preempt_off() && !prev->exit_state))
4215 __schedule_bug(prev); 4349 __schedule_bug(prev);
4350 rcu_sleep_check();
4216 4351
4217 profile_hit(SCHED_PROFILING, __builtin_return_address(0)); 4352 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4218 4353
@@ -4239,7 +4374,7 @@ pick_next_task(struct rq *rq)
4239 * Optimization: we know that if all tasks are in 4374 * Optimization: we know that if all tasks are in
4240 * the fair class we can call that function directly: 4375 * the fair class we can call that function directly:
4241 */ 4376 */
4242 if (likely(rq->nr_running == rq->cfs.nr_running)) { 4377 if (likely(rq->nr_running == rq->cfs.h_nr_running)) {
4243 p = fair_sched_class.pick_next_task(rq); 4378 p = fair_sched_class.pick_next_task(rq);
4244 if (likely(p)) 4379 if (likely(p))
4245 return p; 4380 return p;
@@ -4255,9 +4390,9 @@ pick_next_task(struct rq *rq)
4255} 4390}
4256 4391
4257/* 4392/*
4258 * schedule() is the main scheduler function. 4393 * __schedule() is the main scheduler function.
4259 */ 4394 */
4260asmlinkage void __sched schedule(void) 4395static void __sched __schedule(void)
4261{ 4396{
4262 struct task_struct *prev, *next; 4397 struct task_struct *prev, *next;
4263 unsigned long *switch_count; 4398 unsigned long *switch_count;
@@ -4298,16 +4433,6 @@ need_resched:
4298 if (to_wakeup) 4433 if (to_wakeup)
4299 try_to_wake_up_local(to_wakeup); 4434 try_to_wake_up_local(to_wakeup);
4300 } 4435 }
4301
4302 /*
4303 * If we are going to sleep and we have plugged IO
4304 * queued, make sure to submit it to avoid deadlocks.
4305 */
4306 if (blk_needs_flush_plug(prev)) {
4307 raw_spin_unlock(&rq->lock);
4308 blk_schedule_flush_plug(prev);
4309 raw_spin_lock(&rq->lock);
4310 }
4311 } 4436 }
4312 switch_count = &prev->nvcsw; 4437 switch_count = &prev->nvcsw;
4313 } 4438 }
@@ -4345,6 +4470,26 @@ need_resched:
4345 if (need_resched()) 4470 if (need_resched())
4346 goto need_resched; 4471 goto need_resched;
4347} 4472}
4473
4474static inline void sched_submit_work(struct task_struct *tsk)
4475{
4476 if (!tsk->state)
4477 return;
4478 /*
4479 * If we are going to sleep and we have plugged IO queued,
4480 * make sure to submit it to avoid deadlocks.
4481 */
4482 if (blk_needs_flush_plug(tsk))
4483 blk_schedule_flush_plug(tsk);
4484}
4485
4486asmlinkage void __sched schedule(void)
4487{
4488 struct task_struct *tsk = current;
4489
4490 sched_submit_work(tsk);
4491 __schedule();
4492}
4348EXPORT_SYMBOL(schedule); 4493EXPORT_SYMBOL(schedule);
4349 4494
4350#ifdef CONFIG_MUTEX_SPIN_ON_OWNER 4495#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
@@ -4411,7 +4556,7 @@ asmlinkage void __sched notrace preempt_schedule(void)
4411 4556
4412 do { 4557 do {
4413 add_preempt_count_notrace(PREEMPT_ACTIVE); 4558 add_preempt_count_notrace(PREEMPT_ACTIVE);
4414 schedule(); 4559 __schedule();
4415 sub_preempt_count_notrace(PREEMPT_ACTIVE); 4560 sub_preempt_count_notrace(PREEMPT_ACTIVE);
4416 4561
4417 /* 4562 /*
@@ -4439,7 +4584,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
4439 do { 4584 do {
4440 add_preempt_count(PREEMPT_ACTIVE); 4585 add_preempt_count(PREEMPT_ACTIVE);
4441 local_irq_enable(); 4586 local_irq_enable();
4442 schedule(); 4587 __schedule();
4443 local_irq_disable(); 4588 local_irq_disable();
4444 sub_preempt_count(PREEMPT_ACTIVE); 4589 sub_preempt_count(PREEMPT_ACTIVE);
4445 4590
@@ -4666,6 +4811,9 @@ EXPORT_SYMBOL(wait_for_completion);
4666 * This waits for either a completion of a specific task to be signaled or for a 4811 * This waits for either a completion of a specific task to be signaled or for a
4667 * specified timeout to expire. The timeout is in jiffies. It is not 4812 * specified timeout to expire. The timeout is in jiffies. It is not
4668 * interruptible. 4813 * interruptible.
4814 *
4815 * The return value is 0 if timed out, and positive (at least 1, or number of
4816 * jiffies left till timeout) if completed.
4669 */ 4817 */
4670unsigned long __sched 4818unsigned long __sched
4671wait_for_completion_timeout(struct completion *x, unsigned long timeout) 4819wait_for_completion_timeout(struct completion *x, unsigned long timeout)
@@ -4680,6 +4828,8 @@ EXPORT_SYMBOL(wait_for_completion_timeout);
4680 * 4828 *
4681 * This waits for completion of a specific task to be signaled. It is 4829 * This waits for completion of a specific task to be signaled. It is
4682 * interruptible. 4830 * interruptible.
4831 *
4832 * The return value is -ERESTARTSYS if interrupted, 0 if completed.
4683 */ 4833 */
4684int __sched wait_for_completion_interruptible(struct completion *x) 4834int __sched wait_for_completion_interruptible(struct completion *x)
4685{ 4835{
@@ -4697,6 +4847,9 @@ EXPORT_SYMBOL(wait_for_completion_interruptible);
4697 * 4847 *
4698 * This waits for either a completion of a specific task to be signaled or for a 4848 * This waits for either a completion of a specific task to be signaled or for a
4699 * specified timeout to expire. It is interruptible. The timeout is in jiffies. 4849 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4850 *
4851 * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
4852 * positive (at least 1, or number of jiffies left till timeout) if completed.
4700 */ 4853 */
4701long __sched 4854long __sched
4702wait_for_completion_interruptible_timeout(struct completion *x, 4855wait_for_completion_interruptible_timeout(struct completion *x,
@@ -4712,6 +4865,8 @@ EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4712 * 4865 *
4713 * This waits to be signaled for completion of a specific task. It can be 4866 * This waits to be signaled for completion of a specific task. It can be
4714 * interrupted by a kill signal. 4867 * interrupted by a kill signal.
4868 *
4869 * The return value is -ERESTARTSYS if interrupted, 0 if completed.
4715 */ 4870 */
4716int __sched wait_for_completion_killable(struct completion *x) 4871int __sched wait_for_completion_killable(struct completion *x)
4717{ 4872{
@@ -4730,6 +4885,9 @@ EXPORT_SYMBOL(wait_for_completion_killable);
4730 * This waits for either a completion of a specific task to be 4885 * This waits for either a completion of a specific task to be
4731 * signaled or for a specified timeout to expire. It can be 4886 * signaled or for a specified timeout to expire. It can be
4732 * interrupted by a kill signal. The timeout is in jiffies. 4887 * interrupted by a kill signal. The timeout is in jiffies.
4888 *
4889 * The return value is -ERESTARTSYS if interrupted, 0 if timed out,
4890 * positive (at least 1, or number of jiffies left till timeout) if completed.
4733 */ 4891 */
4734long __sched 4892long __sched
4735wait_for_completion_killable_timeout(struct completion *x, 4893wait_for_completion_killable_timeout(struct completion *x,
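
Note: the kernel-doc additions above spell out the return conventions for the completion waiters. A typical caller of the combined variant therefore looks roughly like this; my_dev->done is an illustrative completion:

	long ret;

	ret = wait_for_completion_interruptible_timeout(&my_dev->done,
							msecs_to_jiffies(100));
	if (ret == 0)
		return -ETIMEDOUT;	/* timed out */
	if (ret < 0)
		return ret;		/* -ERESTARTSYS: interrupted by a signal */
	/* ret > 0: completed, with 'ret' jiffies left before the timeout */
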
@@ -5015,7 +5173,20 @@ EXPORT_SYMBOL(task_nice);
5015 */ 5173 */
5016int idle_cpu(int cpu) 5174int idle_cpu(int cpu)
5017{ 5175{
5018 return cpu_curr(cpu) == cpu_rq(cpu)->idle; 5176 struct rq *rq = cpu_rq(cpu);
5177
5178 if (rq->curr != rq->idle)
5179 return 0;
5180
5181 if (rq->nr_running)
5182 return 0;
5183
5184#ifdef CONFIG_SMP
5185 if (!llist_empty(&rq->wake_list))
5186 return 0;
5187#endif
5188
5189 return 1;
5019} 5190}
5020 5191
5021/** 5192/**
@@ -5564,7 +5735,7 @@ static inline int should_resched(void)
5564static void __cond_resched(void) 5735static void __cond_resched(void)
5565{ 5736{
5566 add_preempt_count(PREEMPT_ACTIVE); 5737 add_preempt_count(PREEMPT_ACTIVE);
5567 schedule(); 5738 __schedule();
5568 sub_preempt_count(PREEMPT_ACTIVE); 5739 sub_preempt_count(PREEMPT_ACTIVE);
5569} 5740}
5570 5741
@@ -5865,7 +6036,7 @@ void show_state_filter(unsigned long state_filter)
5865 printk(KERN_INFO 6036 printk(KERN_INFO
5866 " task PC stack pid father\n"); 6037 " task PC stack pid father\n");
5867#endif 6038#endif
5868 read_lock(&tasklist_lock); 6039 rcu_read_lock();
5869 do_each_thread(g, p) { 6040 do_each_thread(g, p) {
5870 /* 6041 /*
5871 * reset the NMI-timeout, listing all files on a slow 6042 * reset the NMI-timeout, listing all files on a slow
@@ -5881,7 +6052,7 @@ void show_state_filter(unsigned long state_filter)
5881#ifdef CONFIG_SCHED_DEBUG 6052#ifdef CONFIG_SCHED_DEBUG
5882 sysrq_sched_debug_show(); 6053 sysrq_sched_debug_show();
5883#endif 6054#endif
5884 read_unlock(&tasklist_lock); 6055 rcu_read_unlock();
5885 /* 6056 /*
5886 * Only show locks if all tasks are dumped: 6057 * Only show locks if all tasks are dumped:
5887 */ 6058 */
@@ -5942,18 +6113,12 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
5942 */ 6113 */
5943 idle->sched_class = &idle_sched_class; 6114 idle->sched_class = &idle_sched_class;
5944 ftrace_graph_init_idle_task(idle, cpu); 6115 ftrace_graph_init_idle_task(idle, cpu);
6116#if defined(CONFIG_SMP)
6117 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
6118#endif
5945} 6119}
5946 6120
5947/* 6121/*
5948 * In a system that switches off the HZ timer nohz_cpu_mask
5949 * indicates which cpus entered this state. This is used
5950 * in the rcu update to wait only for active cpus. For system
5951 * which do not switch off the HZ timer nohz_cpu_mask should
5952 * always be CPU_BITS_NONE.
5953 */
5954cpumask_var_t nohz_cpu_mask;
5955
5956/*
5957 * Increase the granularity value when there are more CPUs, 6122 * Increase the granularity value when there are more CPUs,
5958 * because with more CPUs the 'effective latency' as visible 6123 * because with more CPUs the 'effective latency' as visible
5959 * to users decreases. But the relationship is not linear, 6124 * to users decreases. But the relationship is not linear,
@@ -6005,10 +6170,9 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
6005{ 6170{
6006 if (p->sched_class && p->sched_class->set_cpus_allowed) 6171 if (p->sched_class && p->sched_class->set_cpus_allowed)
6007 p->sched_class->set_cpus_allowed(p, new_mask); 6172 p->sched_class->set_cpus_allowed(p, new_mask);
6008 else { 6173
6009 cpumask_copy(&p->cpus_allowed, new_mask); 6174 cpumask_copy(&p->cpus_allowed, new_mask);
6010 p->rt.nr_cpus_allowed = cpumask_weight(new_mask); 6175 p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
6011 }
6012} 6176}
6013 6177
6014/* 6178/*
@@ -6106,7 +6270,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
6106 if (task_cpu(p) != src_cpu) 6270 if (task_cpu(p) != src_cpu)
6107 goto done; 6271 goto done;
6108 /* Affinity changed (again). */ 6272 /* Affinity changed (again). */
6109 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 6273 if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
6110 goto fail; 6274 goto fail;
6111 6275
6112 /* 6276 /*
@@ -6187,6 +6351,30 @@ static void calc_global_load_remove(struct rq *rq)
6187 rq->calc_load_active = 0; 6351 rq->calc_load_active = 0;
6188} 6352}
6189 6353
6354#ifdef CONFIG_CFS_BANDWIDTH
6355static void unthrottle_offline_cfs_rqs(struct rq *rq)
6356{
6357 struct cfs_rq *cfs_rq;
6358
6359 for_each_leaf_cfs_rq(rq, cfs_rq) {
6360 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
6361
6362 if (!cfs_rq->runtime_enabled)
6363 continue;
6364
6365 /*
6366 * clock_task is not advancing so we just need to make sure
6367 * there's some valid quota amount
6368 */
6369 cfs_rq->runtime_remaining = cfs_b->quota;
6370 if (cfs_rq_throttled(cfs_rq))
6371 unthrottle_cfs_rq(cfs_rq);
6372 }
6373}
6374#else
6375static void unthrottle_offline_cfs_rqs(struct rq *rq) {}
6376#endif
6377
6190/* 6378/*
6191 * Migrate all tasks from the rq, sleeping tasks will be migrated by 6379 * Migrate all tasks from the rq, sleeping tasks will be migrated by
6192 * try_to_wake_up()->select_task_rq(). 6380 * try_to_wake_up()->select_task_rq().
@@ -6212,6 +6400,9 @@ static void migrate_tasks(unsigned int dead_cpu)
6212 */ 6400 */
6213 rq->stop = NULL; 6401 rq->stop = NULL;
6214 6402
6403 /* Ensure any throttled groups are reachable by pick_next_task */
6404 unthrottle_offline_cfs_rqs(rq);
6405
6215 for ( ; ; ) { 6406 for ( ; ; ) {
6216 /* 6407 /*
6217 * There's this thread running, bail when that's the only 6408 * There's this thread running, bail when that's the only
@@ -6913,8 +7104,6 @@ static int __init isolated_cpu_setup(char *str)
6913 7104
6914__setup("isolcpus=", isolated_cpu_setup); 7105__setup("isolcpus=", isolated_cpu_setup);
6915 7106
6916#define SD_NODES_PER_DOMAIN 16
6917
6918#ifdef CONFIG_NUMA 7107#ifdef CONFIG_NUMA
6919 7108
6920/** 7109/**
@@ -7419,6 +7608,7 @@ static void __sdt_free(const struct cpumask *cpu_map)
7419 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j); 7608 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, j);
7420 if (sd && (sd->flags & SD_OVERLAP)) 7609 if (sd && (sd->flags & SD_OVERLAP))
7421 free_sched_groups(sd->groups, 0); 7610 free_sched_groups(sd->groups, 0);
7611 kfree(*per_cpu_ptr(sdd->sd, j));
7422 kfree(*per_cpu_ptr(sdd->sg, j)); 7612 kfree(*per_cpu_ptr(sdd->sg, j));
7423 kfree(*per_cpu_ptr(sdd->sgp, j)); 7613 kfree(*per_cpu_ptr(sdd->sgp, j));
7424 } 7614 }
@@ -7954,6 +8144,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
7954 /* allow initial update_cfs_load() to truncate */ 8144 /* allow initial update_cfs_load() to truncate */
7955 cfs_rq->load_stamp = 1; 8145 cfs_rq->load_stamp = 1;
7956#endif 8146#endif
8147 init_cfs_rq_runtime(cfs_rq);
7957 8148
7958 tg->cfs_rq[cpu] = cfs_rq; 8149 tg->cfs_rq[cpu] = cfs_rq;
7959 tg->se[cpu] = se; 8150 tg->se[cpu] = se;
@@ -8093,6 +8284,7 @@ void __init sched_init(void)
8093 * We achieve this by letting root_task_group's tasks sit 8284 * We achieve this by letting root_task_group's tasks sit
8094 * directly in rq->cfs (i.e root_task_group->se[] = NULL). 8285 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
8095 */ 8286 */
8287 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
8096 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); 8288 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
8097#endif /* CONFIG_FAIR_GROUP_SCHED */ 8289#endif /* CONFIG_FAIR_GROUP_SCHED */
8098 8290
@@ -8122,7 +8314,6 @@ void __init sched_init(void)
8122 rq_attach_root(rq, &def_root_domain); 8314 rq_attach_root(rq, &def_root_domain);
8123#ifdef CONFIG_NO_HZ 8315#ifdef CONFIG_NO_HZ
8124 rq->nohz_balance_kick = 0; 8316 rq->nohz_balance_kick = 0;
8125 init_sched_softirq_csd(&per_cpu(remote_sched_softirq_cb, i));
8126#endif 8317#endif
8127#endif 8318#endif
8128 init_rq_hrtick(rq); 8319 init_rq_hrtick(rq);
@@ -8164,8 +8355,6 @@ void __init sched_init(void)
8164 */ 8355 */
8165 current->sched_class = &fair_sched_class; 8356 current->sched_class = &fair_sched_class;
8166 8357
8167 /* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
8168 zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
8169#ifdef CONFIG_SMP 8358#ifdef CONFIG_SMP
8170 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT); 8359 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
8171#ifdef CONFIG_NO_HZ 8360#ifdef CONFIG_NO_HZ
@@ -8195,6 +8384,7 @@ void __might_sleep(const char *file, int line, int preempt_offset)
8195{ 8384{
8196 static unsigned long prev_jiffy; /* ratelimiting */ 8385 static unsigned long prev_jiffy; /* ratelimiting */
8197 8386
8387 rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
8198 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || 8388 if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
8199 system_state != SYSTEM_RUNNING || oops_in_progress) 8389 system_state != SYSTEM_RUNNING || oops_in_progress)
8200 return; 8390 return;
@@ -8334,6 +8524,8 @@ static void free_fair_sched_group(struct task_group *tg)
8334{ 8524{
8335 int i; 8525 int i;
8336 8526
8527 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
8528
8337 for_each_possible_cpu(i) { 8529 for_each_possible_cpu(i) {
8338 if (tg->cfs_rq) 8530 if (tg->cfs_rq)
8339 kfree(tg->cfs_rq[i]); 8531 kfree(tg->cfs_rq[i]);
@@ -8361,6 +8553,8 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
8361 8553
8362 tg->shares = NICE_0_LOAD; 8554 tg->shares = NICE_0_LOAD;
8363 8555
8556 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
8557
8364 for_each_possible_cpu(i) { 8558 for_each_possible_cpu(i) {
8365 cfs_rq = kzalloc_node(sizeof(struct cfs_rq), 8559 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
8366 GFP_KERNEL, cpu_to_node(i)); 8560 GFP_KERNEL, cpu_to_node(i));
@@ -8636,12 +8830,7 @@ unsigned long sched_group_shares(struct task_group *tg)
8636} 8830}
8637#endif 8831#endif
8638 8832
8639#ifdef CONFIG_RT_GROUP_SCHED 8833#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_CFS_BANDWIDTH)
8640/*
8641 * Ensure that the real time constraints are schedulable.
8642 */
8643static DEFINE_MUTEX(rt_constraints_mutex);
8644
8645static unsigned long to_ratio(u64 period, u64 runtime) 8834static unsigned long to_ratio(u64 period, u64 runtime)
8646{ 8835{
8647 if (runtime == RUNTIME_INF) 8836 if (runtime == RUNTIME_INF)
@@ -8649,6 +8838,13 @@ static unsigned long to_ratio(u64 period, u64 runtime)
8649 8838
8650 return div64_u64(runtime << 20, period); 8839 return div64_u64(runtime << 20, period);
8651} 8840}
8841#endif
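to_ratio() is now shared by the RT and CFS feasibility checks; it expresses runtime/period as a fixed-point fraction with 20 fractional bits. A small userspace check of that arithmetic (plain division stands in for div64_u64, values chosen only for illustration):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t period  = 1000000;	/* 1s, expressed in microseconds */
	uint64_t runtime =  950000;	/* 950ms of allowed runtime      */

	/* same math as to_ratio(): the ratio scaled by 2^20 */
	uint64_t ratio = (runtime << 20) / period;

	printf("ratio = %llu (~%.3f of one CPU)\n",
	       (unsigned long long)ratio, ratio / (double)(1 << 20));
	return 0;
}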
8842
8843#ifdef CONFIG_RT_GROUP_SCHED
8844/*
8845 * Ensure that the real time constraints are schedulable.
8846 */
8847static DEFINE_MUTEX(rt_constraints_mutex);
8652 8848
8653/* Must be called with tasklist_lock held */ 8849/* Must be called with tasklist_lock held */
8654static inline int tg_has_rt_tasks(struct task_group *tg) 8850static inline int tg_has_rt_tasks(struct task_group *tg)
@@ -8669,7 +8865,7 @@ struct rt_schedulable_data {
8669 u64 rt_runtime; 8865 u64 rt_runtime;
8670}; 8866};
8671 8867
8672static int tg_schedulable(struct task_group *tg, void *data) 8868static int tg_rt_schedulable(struct task_group *tg, void *data)
8673{ 8869{
8674 struct rt_schedulable_data *d = data; 8870 struct rt_schedulable_data *d = data;
8675 struct task_group *child; 8871 struct task_group *child;
@@ -8727,16 +8923,22 @@ static int tg_schedulable(struct task_group *tg, void *data)
8727 8923
8728static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8924static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8729{ 8925{
8926 int ret;
8927
8730 struct rt_schedulable_data data = { 8928 struct rt_schedulable_data data = {
8731 .tg = tg, 8929 .tg = tg,
8732 .rt_period = period, 8930 .rt_period = period,
8733 .rt_runtime = runtime, 8931 .rt_runtime = runtime,
8734 }; 8932 };
8735 8933
8736 return walk_tg_tree(tg_schedulable, tg_nop, &data); 8934 rcu_read_lock();
8935 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
8936 rcu_read_unlock();
8937
8938 return ret;
8737} 8939}
8738 8940
8739static int tg_set_bandwidth(struct task_group *tg, 8941static int tg_set_rt_bandwidth(struct task_group *tg,
8740 u64 rt_period, u64 rt_runtime) 8942 u64 rt_period, u64 rt_runtime)
8741{ 8943{
8742 int i, err = 0; 8944 int i, err = 0;
@@ -8775,7 +8977,7 @@ int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
8775 if (rt_runtime_us < 0) 8977 if (rt_runtime_us < 0)
8776 rt_runtime = RUNTIME_INF; 8978 rt_runtime = RUNTIME_INF;
8777 8979
8778 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8980 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
8779} 8981}
8780 8982
8781long sched_group_rt_runtime(struct task_group *tg) 8983long sched_group_rt_runtime(struct task_group *tg)
@@ -8800,7 +9002,7 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
8800 if (rt_period == 0) 9002 if (rt_period == 0)
8801 return -EINVAL; 9003 return -EINVAL;
8802 9004
8803 return tg_set_bandwidth(tg, rt_period, rt_runtime); 9005 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
8804} 9006}
8805 9007
8806long sched_group_rt_period(struct task_group *tg) 9008long sched_group_rt_period(struct task_group *tg)
@@ -8990,6 +9192,238 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
8990 9192
8991 return (u64) scale_load_down(tg->shares); 9193 return (u64) scale_load_down(tg->shares);
8992} 9194}
9195
9196#ifdef CONFIG_CFS_BANDWIDTH
9197static DEFINE_MUTEX(cfs_constraints_mutex);
9198
9199const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
9200const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
9201
9202static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
9203
9204static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
9205{
9206 int i, ret = 0, runtime_enabled;
9207 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
9208
9209 if (tg == &root_task_group)
9210 return -EINVAL;
9211
9212 /*
 9213 * Ensure we have some amount of bandwidth every period. This is
9214 * to prevent reaching a state of large arrears when throttled via
9215 * entity_tick() resulting in prolonged exit starvation.
9216 */
9217 if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
9218 return -EINVAL;
9219
9220 /*
 9221 * Likewise, bound things on the other side by preventing insane quota
9222 * periods. This also allows us to normalize in computing quota
9223 * feasibility.
9224 */
9225 if (period > max_cfs_quota_period)
9226 return -EINVAL;
9227
9228 mutex_lock(&cfs_constraints_mutex);
9229 ret = __cfs_schedulable(tg, period, quota);
9230 if (ret)
9231 goto out_unlock;
9232
9233 runtime_enabled = quota != RUNTIME_INF;
9234 raw_spin_lock_irq(&cfs_b->lock);
9235 cfs_b->period = ns_to_ktime(period);
9236 cfs_b->quota = quota;
9237
9238 __refill_cfs_bandwidth_runtime(cfs_b);
9239 /* restart the period timer (if active) to handle new period expiry */
9240 if (runtime_enabled && cfs_b->timer_active) {
9241 /* force a reprogram */
9242 cfs_b->timer_active = 0;
9243 __start_cfs_bandwidth(cfs_b);
9244 }
9245 raw_spin_unlock_irq(&cfs_b->lock);
9246
9247 for_each_possible_cpu(i) {
9248 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
9249 struct rq *rq = rq_of(cfs_rq);
9250
9251 raw_spin_lock_irq(&rq->lock);
9252 cfs_rq->runtime_enabled = runtime_enabled;
9253 cfs_rq->runtime_remaining = 0;
9254
9255 if (cfs_rq_throttled(cfs_rq))
9256 unthrottle_cfs_rq(cfs_rq);
9257 raw_spin_unlock_irq(&rq->lock);
9258 }
9259out_unlock:
9260 mutex_unlock(&cfs_constraints_mutex);
9261
9262 return ret;
9263}
9264
9265int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
9266{
9267 u64 quota, period;
9268
9269 period = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
9270 if (cfs_quota_us < 0)
9271 quota = RUNTIME_INF;
9272 else
9273 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
9274
9275 return tg_set_cfs_bandwidth(tg, period, quota);
9276}
9277
9278long tg_get_cfs_quota(struct task_group *tg)
9279{
9280 u64 quota_us;
9281
9282 if (tg_cfs_bandwidth(tg)->quota == RUNTIME_INF)
9283 return -1;
9284
9285 quota_us = tg_cfs_bandwidth(tg)->quota;
9286 do_div(quota_us, NSEC_PER_USEC);
9287
9288 return quota_us;
9289}
9290
9291int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
9292{
9293 u64 quota, period;
9294
9295 period = (u64)cfs_period_us * NSEC_PER_USEC;
9296 quota = tg_cfs_bandwidth(tg)->quota;
9297
9298 if (period <= 0)
9299 return -EINVAL;
9300
9301 return tg_set_cfs_bandwidth(tg, period, quota);
9302}
9303
9304long tg_get_cfs_period(struct task_group *tg)
9305{
9306 u64 cfs_period_us;
9307
9308 cfs_period_us = ktime_to_ns(tg_cfs_bandwidth(tg)->period);
9309 do_div(cfs_period_us, NSEC_PER_USEC);
9310
9311 return cfs_period_us;
9312}
9313
9314static s64 cpu_cfs_quota_read_s64(struct cgroup *cgrp, struct cftype *cft)
9315{
9316 return tg_get_cfs_quota(cgroup_tg(cgrp));
9317}
9318
9319static int cpu_cfs_quota_write_s64(struct cgroup *cgrp, struct cftype *cftype,
9320 s64 cfs_quota_us)
9321{
9322 return tg_set_cfs_quota(cgroup_tg(cgrp), cfs_quota_us);
9323}
9324
9325static u64 cpu_cfs_period_read_u64(struct cgroup *cgrp, struct cftype *cft)
9326{
9327 return tg_get_cfs_period(cgroup_tg(cgrp));
9328}
9329
9330static int cpu_cfs_period_write_u64(struct cgroup *cgrp, struct cftype *cftype,
9331 u64 cfs_period_us)
9332{
9333 return tg_set_cfs_period(cgroup_tg(cgrp), cfs_period_us);
9334}
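tg_set_cfs_quota() and tg_set_cfs_period() take microseconds from the cgroup interface and convert them to nanoseconds before tg_set_cfs_bandwidth() applies the 1ms..1s bounds above; a negative quota means RUNTIME_INF (no limit). A userspace sketch of the same validation, with the bounds copied from this patch and example values chosen only for illustration:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_USEC		1000ULL
#define MIN_CFS_QUOTA_PERIOD	1000000ULL	/* 1ms in ns */
#define MAX_CFS_QUOTA_PERIOD	1000000000ULL	/* 1s in ns  */

/* returns 0 if (quota_us, period_us) would be accepted, -1 otherwise */
static int check_cfs_limits(long quota_us, long period_us)
{
	uint64_t period = (uint64_t)period_us * NSEC_PER_USEC;

	/* period bounds always apply */
	if (period < MIN_CFS_QUOTA_PERIOD || period > MAX_CFS_QUOTA_PERIOD)
		return -1;

	if (quota_us < 0)	/* RUNTIME_INF: no lower bound on quota */
		return 0;

	if ((uint64_t)quota_us * NSEC_PER_USEC < MIN_CFS_QUOTA_PERIOD)
		return -1;

	return 0;
}

int main(void)
{
	printf("%d\n", check_cfs_limits(50000, 100000));	/* accepted        */
	printf("%d\n", check_cfs_limits(500, 100000));		/* quota below 1ms */
	return 0;
}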
9335
9336struct cfs_schedulable_data {
9337 struct task_group *tg;
9338 u64 period, quota;
9339};
9340
9341/*
9342 * normalize group quota/period to be quota/max_period
9343 * note: units are usecs
9344 */
9345static u64 normalize_cfs_quota(struct task_group *tg,
9346 struct cfs_schedulable_data *d)
9347{
9348 u64 quota, period;
9349
9350 if (tg == d->tg) {
9351 period = d->period;
9352 quota = d->quota;
9353 } else {
9354 period = tg_get_cfs_period(tg);
9355 quota = tg_get_cfs_quota(tg);
9356 }
9357
9358 /* note: these should typically be equivalent */
9359 if (quota == RUNTIME_INF || quota == -1)
9360 return RUNTIME_INF;
9361
9362 return to_ratio(period, quota);
9363}
9364
9365static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
9366{
9367 struct cfs_schedulable_data *d = data;
9368 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
9369 s64 quota = 0, parent_quota = -1;
9370
9371 if (!tg->parent) {
9372 quota = RUNTIME_INF;
9373 } else {
9374 struct cfs_bandwidth *parent_b = tg_cfs_bandwidth(tg->parent);
9375
9376 quota = normalize_cfs_quota(tg, d);
9377 parent_quota = parent_b->hierarchal_quota;
9378
9379 /*
9380 * ensure max(child_quota) <= parent_quota, inherit when no
9381 * limit is set
9382 */
9383 if (quota == RUNTIME_INF)
9384 quota = parent_quota;
9385 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
9386 return -EINVAL;
9387 }
9388 cfs_b->hierarchal_quota = quota;
9389
9390 return 0;
9391}
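tg_cfs_schedulable_down() walks the group tree from the top and rejects any child whose normalized quota exceeds its parent's, while an unlimited child simply inherits the parent's value. Worked numbers (assumed for illustration, in the microsecond units used by __cfs_schedulable()): a parent limited to 50ms per 100ms period has to_ratio(100000, 50000) = 524288 (0.5 << 20); a child asking for 75ms per 100ms normalizes to 786432, exceeds the parent, and the write fails with -EINVAL, whereas 40ms per 100ms (419430) would be accepted.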
9392
9393static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
9394{
9395 int ret;
9396 struct cfs_schedulable_data data = {
9397 .tg = tg,
9398 .period = period,
9399 .quota = quota,
9400 };
9401
9402 if (quota != RUNTIME_INF) {
9403 do_div(data.period, NSEC_PER_USEC);
9404 do_div(data.quota, NSEC_PER_USEC);
9405 }
9406
9407 rcu_read_lock();
9408 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
9409 rcu_read_unlock();
9410
9411 return ret;
9412}
9413
9414static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
9415 struct cgroup_map_cb *cb)
9416{
9417 struct task_group *tg = cgroup_tg(cgrp);
9418 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
9419
9420 cb->fill(cb, "nr_periods", cfs_b->nr_periods);
9421 cb->fill(cb, "nr_throttled", cfs_b->nr_throttled);
9422 cb->fill(cb, "throttled_time", cfs_b->throttled_time);
9423
9424 return 0;
9425}
9426#endif /* CONFIG_CFS_BANDWIDTH */
8993#endif /* CONFIG_FAIR_GROUP_SCHED */ 9427#endif /* CONFIG_FAIR_GROUP_SCHED */
8994 9428
8995#ifdef CONFIG_RT_GROUP_SCHED 9429#ifdef CONFIG_RT_GROUP_SCHED
@@ -9024,6 +9458,22 @@ static struct cftype cpu_files[] = {
9024 .write_u64 = cpu_shares_write_u64, 9458 .write_u64 = cpu_shares_write_u64,
9025 }, 9459 },
9026#endif 9460#endif
9461#ifdef CONFIG_CFS_BANDWIDTH
9462 {
9463 .name = "cfs_quota_us",
9464 .read_s64 = cpu_cfs_quota_read_s64,
9465 .write_s64 = cpu_cfs_quota_write_s64,
9466 },
9467 {
9468 .name = "cfs_period_us",
9469 .read_u64 = cpu_cfs_period_read_u64,
9470 .write_u64 = cpu_cfs_period_write_u64,
9471 },
9472 {
9473 .name = "stat",
9474 .read_map = cpu_stats_show,
9475 },
9476#endif
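These cftype entries expose the limits to userspace as cpu.cfs_quota_us, cpu.cfs_period_us and cpu.stat inside each cpu-controller cgroup directory. A minimal userspace sketch that caps a group to half a CPU and reads back the throttling statistics; the mount point /sys/fs/cgroup/cpu and the group name "mygroup" are assumptions, not part of this patch:

#include <stdio.h>

#define GRP "/sys/fs/cgroup/cpu/mygroup/"	/* assumed mount point and group */

static int write_val(const char *file, const char *val)
{
	FILE *f = fopen(file, "w");

	if (!f)
		return -1;
	fprintf(f, "%s\n", val);
	return fclose(f);
}

int main(void)
{
	char line[128];
	FILE *f;

	/* 50ms of runtime every 100ms period => at most half a CPU */
	write_val(GRP "cpu.cfs_period_us", "100000");
	write_val(GRP "cpu.cfs_quota_us", "50000");

	/* nr_periods / nr_throttled / throttled_time, as filled in by cpu_stats_show() */
	f = fopen(GRP "cpu.stat", "r");
	if (f) {
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);
		fclose(f);
	}
	return 0;
}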
9027#ifdef CONFIG_RT_GROUP_SCHED 9477#ifdef CONFIG_RT_GROUP_SCHED
9028 { 9478 {
9029 .name = "rt_runtime_us", 9479 .name = "rt_runtime_us",
@@ -9333,4 +9783,3 @@ struct cgroup_subsys cpuacct_subsys = {
9333 .subsys_id = cpuacct_subsys_id, 9783 .subsys_id = cpuacct_subsys_id,
9334}; 9784};
9335#endif /* CONFIG_CGROUP_CPUACCT */ 9785#endif /* CONFIG_CGROUP_CPUACCT */
9336
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 9d8af0b3fb6..c685e31492d 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -62,7 +62,7 @@
62 */ 62 */
63#include <linux/spinlock.h> 63#include <linux/spinlock.h>
64#include <linux/hardirq.h> 64#include <linux/hardirq.h>
65#include <linux/module.h> 65#include <linux/export.h>
66#include <linux/percpu.h> 66#include <linux/percpu.h>
67#include <linux/ktime.h> 67#include <linux/ktime.h>
68#include <linux/sched.h> 68#include <linux/sched.h>
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
index 2722dc1b413..a86cf9d9eb1 100644
--- a/kernel/sched_cpupri.c
+++ b/kernel/sched_cpupri.c
@@ -47,9 +47,6 @@ static int convert_prio(int prio)
47 return cpupri; 47 return cpupri;
48} 48}
49 49
50#define for_each_cpupri_active(array, idx) \
51 for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES)
52
53/** 50/**
54 * cpupri_find - find the best (lowest-pri) CPU in the system 51 * cpupri_find - find the best (lowest-pri) CPU in the system
55 * @cp: The cpupri context 52 * @cp: The cpupri context
@@ -71,11 +68,38 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
71 int idx = 0; 68 int idx = 0;
72 int task_pri = convert_prio(p->prio); 69 int task_pri = convert_prio(p->prio);
73 70
74 for_each_cpupri_active(cp->pri_active, idx) { 71 if (task_pri >= MAX_RT_PRIO)
75 struct cpupri_vec *vec = &cp->pri_to_cpu[idx]; 72 return 0;
76 73
77 if (idx >= task_pri) 74 for (idx = 0; idx < task_pri; idx++) {
78 break; 75 struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
76 int skip = 0;
77
78 if (!atomic_read(&(vec)->count))
79 skip = 1;
80 /*
81 * When looking at the vector, we need to read the counter,
82 * do a memory barrier, then read the mask.
83 *
 84 * Note: This is still all racy, but we can deal with it.
85 * Ideally, we only want to look at masks that are set.
86 *
87 * If a mask is not set, then the only thing wrong is that we
88 * did a little more work than necessary.
89 *
90 * If we read a zero count but the mask is set, because of the
91 * memory barriers, that can only happen when the highest prio
92 * task for a run queue has left the run queue, in which case,
93 * it will be followed by a pull. If the task we are processing
94 * fails to find a proper place to go, that pull request will
95 * pull this task if the run queue is running at a lower
96 * priority.
97 */
98 smp_rmb();
99
100 /* Need to do the rmb for every iteration */
101 if (skip)
102 continue;
79 103
80 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids) 104 if (cpumask_any_and(&p->cpus_allowed, vec->mask) >= nr_cpu_ids)
81 continue; 105 continue;
@@ -115,7 +139,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
115{ 139{
116 int *currpri = &cp->cpu_to_pri[cpu]; 140 int *currpri = &cp->cpu_to_pri[cpu];
117 int oldpri = *currpri; 141 int oldpri = *currpri;
118 unsigned long flags; 142 int do_mb = 0;
119 143
120 newpri = convert_prio(newpri); 144 newpri = convert_prio(newpri);
121 145
@@ -128,32 +152,46 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
128 * If the cpu was currently mapped to a different value, we 152 * If the cpu was currently mapped to a different value, we
129 * need to map it to the new value then remove the old value. 153 * need to map it to the new value then remove the old value.
130 * Note, we must add the new value first, otherwise we risk the 154 * Note, we must add the new value first, otherwise we risk the
131 * cpu being cleared from pri_active, and this cpu could be 155 * cpu being missed by the priority loop in cpupri_find.
132 * missed for a push or pull.
133 */ 156 */
134 if (likely(newpri != CPUPRI_INVALID)) { 157 if (likely(newpri != CPUPRI_INVALID)) {
135 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri]; 158 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
136 159
137 raw_spin_lock_irqsave(&vec->lock, flags);
138
139 cpumask_set_cpu(cpu, vec->mask); 160 cpumask_set_cpu(cpu, vec->mask);
140 vec->count++; 161 /*
141 if (vec->count == 1) 162 * When adding a new vector, we update the mask first,
142 set_bit(newpri, cp->pri_active); 163 * do a write memory barrier, and then update the count, to
143 164 * make sure the vector is visible when count is set.
144 raw_spin_unlock_irqrestore(&vec->lock, flags); 165 */
166 smp_mb__before_atomic_inc();
167 atomic_inc(&(vec)->count);
168 do_mb = 1;
145 } 169 }
146 if (likely(oldpri != CPUPRI_INVALID)) { 170 if (likely(oldpri != CPUPRI_INVALID)) {
147 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri]; 171 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
148 172
149 raw_spin_lock_irqsave(&vec->lock, flags); 173 /*
150 174 * Because the order of modification of the vec->count
151 vec->count--; 175 * is important, we must make sure that the update
152 if (!vec->count) 176 * of the new prio is seen before we decrement the
153 clear_bit(oldpri, cp->pri_active); 177 * old prio. This makes sure that the loop sees
178 * one or the other when we raise the priority of
179 * the run queue. We don't care about when we lower the
180 * priority, as that will trigger an rt pull anyway.
181 *
182 * We only need to do a memory barrier if we updated
183 * the new priority vec.
184 */
185 if (do_mb)
186 smp_mb__after_atomic_inc();
187
188 /*
189 * When removing from the vector, we decrement the counter first
190 * do a memory barrier and then clear the mask.
191 */
192 atomic_dec(&(vec)->count);
193 smp_mb__after_atomic_inc();
154 cpumask_clear_cpu(cpu, vec->mask); 194 cpumask_clear_cpu(cpu, vec->mask);
155
156 raw_spin_unlock_irqrestore(&vec->lock, flags);
157 } 195 }
158 196
159 *currpri = newpri; 197 *currpri = newpri;
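The lockless scheme above relies on ordering alone: adders publish the mask bit before incrementing the count, removers decrement the count before clearing the bit, and cpupri_find() reads the count, issues a read barrier, then reads the mask. A non-zero count therefore guarantees the corresponding mask update is visible; a transiently zero count only makes the search skip that vector, which the RT pull path tolerates. A userspace C11 analogue of that ordering, with one atomic flag standing in for a single bit of vec->mask (illustration only, not kernel code):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int count;	/* how many CPUs sit at this priority */
static atomic_bool mask_bit;	/* stand-in for one bit of vec->mask  */

static void vec_add(void)
{
	atomic_store(&mask_bit, true);			/* mask first  */
	atomic_thread_fence(memory_order_seq_cst);	/* ~ smp_mb()  */
	atomic_fetch_add(&count, 1);			/* then count  */
}

static void vec_remove(void)
{
	atomic_fetch_sub(&count, 1);			/* count first */
	atomic_thread_fence(memory_order_seq_cst);	/* ~ smp_mb()  */
	atomic_store(&mask_bit, false);			/* then mask   */
}

static bool vec_lookup(void)
{
	if (!atomic_load(&count))	/* zero count: skip, tolerated race */
		return false;
	atomic_thread_fence(memory_order_seq_cst);	/* ~ smp_rmb() */
	return atomic_load(&mask_bit);
}

int main(void)
{
	vec_add();
	printf("after add:    visible=%d\n", vec_lookup());
	vec_remove();
	printf("after remove: visible=%d\n", vec_lookup());
	return 0;
}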
@@ -175,8 +213,7 @@ int cpupri_init(struct cpupri *cp)
175 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) { 213 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
176 struct cpupri_vec *vec = &cp->pri_to_cpu[i]; 214 struct cpupri_vec *vec = &cp->pri_to_cpu[i];
177 215
178 raw_spin_lock_init(&vec->lock); 216 atomic_set(&vec->count, 0);
179 vec->count = 0;
180 if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL)) 217 if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
181 goto cleanup; 218 goto cleanup;
182 } 219 }
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 9fc7d386fea..f6d75617349 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -4,7 +4,6 @@
4#include <linux/sched.h> 4#include <linux/sched.h>
5 5
6#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) 6#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
7#define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES)
8 7
9#define CPUPRI_INVALID -1 8#define CPUPRI_INVALID -1
10#define CPUPRI_IDLE 0 9#define CPUPRI_IDLE 0
@@ -12,14 +11,12 @@
12/* values 2-101 are RT priorities 0-99 */ 11/* values 2-101 are RT priorities 0-99 */
13 12
14struct cpupri_vec { 13struct cpupri_vec {
15 raw_spinlock_t lock; 14 atomic_t count;
16 int count; 15 cpumask_var_t mask;
17 cpumask_var_t mask;
18}; 16};
19 17
20struct cpupri { 18struct cpupri {
21 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; 19 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
22 long pri_active[CPUPRI_NR_PRI_WORDS];
23 int cpu_to_pri[NR_CPUS]; 20 int cpu_to_pri[NR_CPUS];
24}; 21};
25 22
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index bc8ee999381..a78ed2736ba 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -89,6 +89,20 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
89 */ 89 */
90unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL; 90unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
91 91
92#ifdef CONFIG_CFS_BANDWIDTH
93/*
94 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
95 * each time a cfs_rq requests quota.
96 *
97 * Note: in the case that the slice exceeds the runtime remaining (either due
98 * to consumption or the quota being specified to be smaller than the slice)
 99 * we only issue the remaining available time.
100 *
101 * default: 5 msec, units: microseconds
102 */
103unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
104#endif
105
92static const struct sched_class fair_sched_class; 106static const struct sched_class fair_sched_class;
93 107
94/************************************************************** 108/**************************************************************
@@ -292,6 +306,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
292 306
293#endif /* CONFIG_FAIR_GROUP_SCHED */ 307#endif /* CONFIG_FAIR_GROUP_SCHED */
294 308
309static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
310 unsigned long delta_exec);
295 311
296/************************************************************** 312/**************************************************************
297 * Scheduling class tree data structure manipulation methods: 313 * Scheduling class tree data structure manipulation methods:
@@ -583,6 +599,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
583 cpuacct_charge(curtask, delta_exec); 599 cpuacct_charge(curtask, delta_exec);
584 account_group_exec_runtime(curtask, delta_exec); 600 account_group_exec_runtime(curtask, delta_exec);
585 } 601 }
602
603 account_cfs_rq_runtime(cfs_rq, delta_exec);
586} 604}
587 605
588static inline void 606static inline void
@@ -688,6 +706,8 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
688} 706}
689 707
690#ifdef CONFIG_FAIR_GROUP_SCHED 708#ifdef CONFIG_FAIR_GROUP_SCHED
709/* we need this in update_cfs_load and load-balance functions below */
710static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
691# ifdef CONFIG_SMP 711# ifdef CONFIG_SMP
692static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq, 712static void update_cfs_rq_load_contribution(struct cfs_rq *cfs_rq,
693 int global_update) 713 int global_update)
@@ -710,7 +730,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
710 u64 now, delta; 730 u64 now, delta;
711 unsigned long load = cfs_rq->load.weight; 731 unsigned long load = cfs_rq->load.weight;
712 732
713 if (cfs_rq->tg == &root_task_group) 733 if (cfs_rq->tg == &root_task_group || throttled_hierarchy(cfs_rq))
714 return; 734 return;
715 735
716 now = rq_of(cfs_rq)->clock_task; 736 now = rq_of(cfs_rq)->clock_task;
@@ -752,19 +772,32 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
752 list_del_leaf_cfs_rq(cfs_rq); 772 list_del_leaf_cfs_rq(cfs_rq);
753} 773}
754 774
775static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
776{
777 long tg_weight;
778
779 /*
780 * Use this CPU's actual weight instead of the last load_contribution
781 * to gain a more accurate current total weight. See
782 * update_cfs_rq_load_contribution().
783 */
784 tg_weight = atomic_read(&tg->load_weight);
785 tg_weight -= cfs_rq->load_contribution;
786 tg_weight += cfs_rq->load.weight;
787
788 return tg_weight;
789}
790
755static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg) 791static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
756{ 792{
757 long load_weight, load, shares; 793 long tg_weight, load, shares;
758 794
795 tg_weight = calc_tg_weight(tg, cfs_rq);
759 load = cfs_rq->load.weight; 796 load = cfs_rq->load.weight;
760 797
761 load_weight = atomic_read(&tg->load_weight);
762 load_weight += load;
763 load_weight -= cfs_rq->load_contribution;
764
765 shares = (tg->shares * load); 798 shares = (tg->shares * load);
766 if (load_weight) 799 if (tg_weight)
767 shares /= load_weight; 800 shares /= tg_weight;
768 801
769 if (shares < MIN_SHARES) 802 if (shares < MIN_SHARES)
770 shares = MIN_SHARES; 803 shares = MIN_SHARES;
@@ -819,7 +852,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq)
819 852
820 tg = cfs_rq->tg; 853 tg = cfs_rq->tg;
821 se = tg->se[cpu_of(rq_of(cfs_rq))]; 854 se = tg->se[cpu_of(rq_of(cfs_rq))];
822 if (!se) 855 if (!se || throttled_hierarchy(cfs_rq))
823 return; 856 return;
824#ifndef CONFIG_SMP 857#ifndef CONFIG_SMP
825 if (likely(se->load.weight == tg->shares)) 858 if (likely(se->load.weight == tg->shares))
@@ -950,6 +983,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
950 se->vruntime = vruntime; 983 se->vruntime = vruntime;
951} 984}
952 985
986static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
987
953static void 988static void
954enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 989enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
955{ 990{
@@ -979,8 +1014,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
979 __enqueue_entity(cfs_rq, se); 1014 __enqueue_entity(cfs_rq, se);
980 se->on_rq = 1; 1015 se->on_rq = 1;
981 1016
982 if (cfs_rq->nr_running == 1) 1017 if (cfs_rq->nr_running == 1) {
983 list_add_leaf_cfs_rq(cfs_rq); 1018 list_add_leaf_cfs_rq(cfs_rq);
1019 check_enqueue_throttle(cfs_rq);
1020 }
984} 1021}
985 1022
986static void __clear_buddies_last(struct sched_entity *se) 1023static void __clear_buddies_last(struct sched_entity *se)
@@ -1028,6 +1065,8 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1028 __clear_buddies_skip(se); 1065 __clear_buddies_skip(se);
1029} 1066}
1030 1067
1068static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1069
1031static void 1070static void
1032dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) 1071dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1033{ 1072{
@@ -1066,6 +1105,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
1066 if (!(flags & DEQUEUE_SLEEP)) 1105 if (!(flags & DEQUEUE_SLEEP))
1067 se->vruntime -= cfs_rq->min_vruntime; 1106 se->vruntime -= cfs_rq->min_vruntime;
1068 1107
1108 /* return excess runtime on last dequeue */
1109 return_cfs_rq_runtime(cfs_rq);
1110
1069 update_min_vruntime(cfs_rq); 1111 update_min_vruntime(cfs_rq);
1070 update_cfs_shares(cfs_rq); 1112 update_cfs_shares(cfs_rq);
1071} 1113}
@@ -1077,6 +1119,8 @@ static void
1077check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) 1119check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1078{ 1120{
1079 unsigned long ideal_runtime, delta_exec; 1121 unsigned long ideal_runtime, delta_exec;
1122 struct sched_entity *se;
1123 s64 delta;
1080 1124
1081 ideal_runtime = sched_slice(cfs_rq, curr); 1125 ideal_runtime = sched_slice(cfs_rq, curr);
1082 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; 1126 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
@@ -1095,22 +1139,17 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
1095 * narrow margin doesn't have to wait for a full slice. 1139 * narrow margin doesn't have to wait for a full slice.
1096 * This also mitigates buddy induced latencies under load. 1140 * This also mitigates buddy induced latencies under load.
1097 */ 1141 */
1098 if (!sched_feat(WAKEUP_PREEMPT))
1099 return;
1100
1101 if (delta_exec < sysctl_sched_min_granularity) 1142 if (delta_exec < sysctl_sched_min_granularity)
1102 return; 1143 return;
1103 1144
1104 if (cfs_rq->nr_running > 1) { 1145 se = __pick_first_entity(cfs_rq);
1105 struct sched_entity *se = __pick_first_entity(cfs_rq); 1146 delta = curr->vruntime - se->vruntime;
1106 s64 delta = curr->vruntime - se->vruntime;
1107 1147
1108 if (delta < 0) 1148 if (delta < 0)
1109 return; 1149 return;
1110 1150
1111 if (delta > ideal_runtime) 1151 if (delta > ideal_runtime)
1112 resched_task(rq_of(cfs_rq)->curr); 1152 resched_task(rq_of(cfs_rq)->curr);
1113 }
1114} 1153}
1115 1154
1116static void 1155static void
@@ -1185,6 +1224,8 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
1185 return se; 1224 return se;
1186} 1225}
1187 1226
1227static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1228
1188static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) 1229static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1189{ 1230{
1190 /* 1231 /*
@@ -1194,6 +1235,9 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
1194 if (prev->on_rq) 1235 if (prev->on_rq)
1195 update_curr(cfs_rq); 1236 update_curr(cfs_rq);
1196 1237
1238 /* throttle cfs_rqs exceeding runtime */
1239 check_cfs_rq_runtime(cfs_rq);
1240
1197 check_spread(cfs_rq, prev); 1241 check_spread(cfs_rq, prev);
1198 if (prev->on_rq) { 1242 if (prev->on_rq) {
1199 update_stats_wait_start(cfs_rq, prev); 1243 update_stats_wait_start(cfs_rq, prev);
@@ -1233,10 +1277,583 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
1233 return; 1277 return;
1234#endif 1278#endif
1235 1279
1236 if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) 1280 if (cfs_rq->nr_running > 1)
1237 check_preempt_tick(cfs_rq, curr); 1281 check_preempt_tick(cfs_rq, curr);
1238} 1282}
1239 1283
1284
1285/**************************************************
1286 * CFS bandwidth control machinery
1287 */
1288
1289#ifdef CONFIG_CFS_BANDWIDTH
1290/*
1291 * default period for cfs group bandwidth.
1292 * default: 0.1s, units: nanoseconds
1293 */
1294static inline u64 default_cfs_period(void)
1295{
1296 return 100000000ULL;
1297}
1298
1299static inline u64 sched_cfs_bandwidth_slice(void)
1300{
1301 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
1302}
1303
1304/*
1305 * Replenish runtime according to assigned quota and update expiration time.
1306 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
1307 * additional synchronization around rq->lock.
1308 *
1309 * requires cfs_b->lock
1310 */
1311static void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
1312{
1313 u64 now;
1314
1315 if (cfs_b->quota == RUNTIME_INF)
1316 return;
1317
1318 now = sched_clock_cpu(smp_processor_id());
1319 cfs_b->runtime = cfs_b->quota;
1320 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
1321}
1322
1323/* returns 0 on failure to allocate runtime */
1324static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1325{
1326 struct task_group *tg = cfs_rq->tg;
1327 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
1328 u64 amount = 0, min_amount, expires;
1329
1330 /* note: this is a positive sum as runtime_remaining <= 0 */
1331 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
1332
1333 raw_spin_lock(&cfs_b->lock);
1334 if (cfs_b->quota == RUNTIME_INF)
1335 amount = min_amount;
1336 else {
1337 /*
1338 * If the bandwidth pool has become inactive, then at least one
1339 * period must have elapsed since the last consumption.
1340 * Refresh the global state and ensure bandwidth timer becomes
1341 * active.
1342 */
1343 if (!cfs_b->timer_active) {
1344 __refill_cfs_bandwidth_runtime(cfs_b);
1345 __start_cfs_bandwidth(cfs_b);
1346 }
1347
1348 if (cfs_b->runtime > 0) {
1349 amount = min(cfs_b->runtime, min_amount);
1350 cfs_b->runtime -= amount;
1351 cfs_b->idle = 0;
1352 }
1353 }
1354 expires = cfs_b->runtime_expires;
1355 raw_spin_unlock(&cfs_b->lock);
1356
1357 cfs_rq->runtime_remaining += amount;
1358 /*
1359 * we may have advanced our local expiration to account for allowed
1360 * spread between our sched_clock and the one on which runtime was
1361 * issued.
1362 */
1363 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
1364 cfs_rq->runtime_expires = expires;
1365
1366 return cfs_rq->runtime_remaining > 0;
1367}
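Each refill request asks the global pool for one slice plus the local deficit (runtime_remaining is never positive at this point), so a queue that overran slightly recovers in a single request. A quick check of that arithmetic with the 5ms default slice (numbers assumed for illustration):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_USEC 1000LL

int main(void)
{
	int64_t slice = 5000 * NSEC_PER_USEC;	/* sysctl default: 5ms   */
	int64_t runtime_remaining = -2000000;	/* 2ms already overspent */

	/* same formula as assign_cfs_rq_runtime() */
	int64_t min_amount = slice - runtime_remaining;

	printf("request %lld ns from the global pool\n",
	       (long long)min_amount);		/* 7,000,000: slice + deficit */
	return 0;
}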
1368
1369/*
1370 * Note: This depends on the synchronization provided by sched_clock and the
1371 * fact that rq->clock snapshots this value.
1372 */
1373static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1374{
1375 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1376 struct rq *rq = rq_of(cfs_rq);
1377
1378 /* if the deadline is ahead of our clock, nothing to do */
1379 if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
1380 return;
1381
1382 if (cfs_rq->runtime_remaining < 0)
1383 return;
1384
1385 /*
1386 * If the local deadline has passed we have to consider the
1387 * possibility that our sched_clock is 'fast' and the global deadline
1388 * has not truly expired.
1389 *
 1390 * Fortunately we can determine whether this is the case by checking
1391 * whether the global deadline has advanced.
1392 */
1393
1394 if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
1395 /* extend local deadline, drift is bounded above by 2 ticks */
1396 cfs_rq->runtime_expires += TICK_NSEC;
1397 } else {
1398 /* global deadline is ahead, expiration has passed */
1399 cfs_rq->runtime_remaining = 0;
1400 }
1401}
1402
1403static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1404 unsigned long delta_exec)
1405{
1406 /* dock delta_exec before expiring quota (as it could span periods) */
1407 cfs_rq->runtime_remaining -= delta_exec;
1408 expire_cfs_rq_runtime(cfs_rq);
1409
1410 if (likely(cfs_rq->runtime_remaining > 0))
1411 return;
1412
1413 /*
1414 * if we're unable to extend our runtime we resched so that the active
1415 * hierarchy can be throttled
1416 */
1417 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
1418 resched_task(rq_of(cfs_rq)->curr);
1419}
1420
1421static __always_inline void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1422 unsigned long delta_exec)
1423{
1424 if (!cfs_rq->runtime_enabled)
1425 return;
1426
1427 __account_cfs_rq_runtime(cfs_rq, delta_exec);
1428}
1429
1430static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
1431{
1432 return cfs_rq->throttled;
1433}
1434
1435/* check whether cfs_rq, or any parent, is throttled */
1436static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
1437{
1438 return cfs_rq->throttle_count;
1439}
1440
1441/*
1442 * Ensure that neither of the group entities corresponding to src_cpu or
1443 * dest_cpu are members of a throttled hierarchy when performing group
1444 * load-balance operations.
1445 */
1446static inline int throttled_lb_pair(struct task_group *tg,
1447 int src_cpu, int dest_cpu)
1448{
1449 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
1450
1451 src_cfs_rq = tg->cfs_rq[src_cpu];
1452 dest_cfs_rq = tg->cfs_rq[dest_cpu];
1453
1454 return throttled_hierarchy(src_cfs_rq) ||
1455 throttled_hierarchy(dest_cfs_rq);
1456}
1457
1458/* updated child weight may affect parent so we have to do this bottom up */
1459static int tg_unthrottle_up(struct task_group *tg, void *data)
1460{
1461 struct rq *rq = data;
1462 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1463
1464 cfs_rq->throttle_count--;
1465#ifdef CONFIG_SMP
1466 if (!cfs_rq->throttle_count) {
1467 u64 delta = rq->clock_task - cfs_rq->load_stamp;
1468
1469 /* leaving throttled state, advance shares averaging windows */
1470 cfs_rq->load_stamp += delta;
1471 cfs_rq->load_last += delta;
1472
1473 /* update entity weight now that we are on_rq again */
1474 update_cfs_shares(cfs_rq);
1475 }
1476#endif
1477
1478 return 0;
1479}
1480
1481static int tg_throttle_down(struct task_group *tg, void *data)
1482{
1483 struct rq *rq = data;
1484 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
1485
1486 /* group is entering throttled state, record last load */
1487 if (!cfs_rq->throttle_count)
1488 update_cfs_load(cfs_rq, 0);
1489 cfs_rq->throttle_count++;
1490
1491 return 0;
1492}
1493
1494static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
1495{
1496 struct rq *rq = rq_of(cfs_rq);
1497 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1498 struct sched_entity *se;
1499 long task_delta, dequeue = 1;
1500
1501 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1502
1503 /* account load preceding throttle */
1504 rcu_read_lock();
1505 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
1506 rcu_read_unlock();
1507
1508 task_delta = cfs_rq->h_nr_running;
1509 for_each_sched_entity(se) {
1510 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
1511 /* throttled entity or throttle-on-deactivate */
1512 if (!se->on_rq)
1513 break;
1514
1515 if (dequeue)
1516 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
1517 qcfs_rq->h_nr_running -= task_delta;
1518
1519 if (qcfs_rq->load.weight)
1520 dequeue = 0;
1521 }
1522
1523 if (!se)
1524 rq->nr_running -= task_delta;
1525
1526 cfs_rq->throttled = 1;
1527 cfs_rq->throttled_timestamp = rq->clock;
1528 raw_spin_lock(&cfs_b->lock);
1529 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
1530 raw_spin_unlock(&cfs_b->lock);
1531}
1532
1533static void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
1534{
1535 struct rq *rq = rq_of(cfs_rq);
1536 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1537 struct sched_entity *se;
1538 int enqueue = 1;
1539 long task_delta;
1540
1541 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
1542
1543 cfs_rq->throttled = 0;
1544 raw_spin_lock(&cfs_b->lock);
1545 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_timestamp;
1546 list_del_rcu(&cfs_rq->throttled_list);
1547 raw_spin_unlock(&cfs_b->lock);
1548 cfs_rq->throttled_timestamp = 0;
1549
1550 update_rq_clock(rq);
1551 /* update hierarchical throttle state */
1552 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
1553
1554 if (!cfs_rq->load.weight)
1555 return;
1556
1557 task_delta = cfs_rq->h_nr_running;
1558 for_each_sched_entity(se) {
1559 if (se->on_rq)
1560 enqueue = 0;
1561
1562 cfs_rq = cfs_rq_of(se);
1563 if (enqueue)
1564 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
1565 cfs_rq->h_nr_running += task_delta;
1566
1567 if (cfs_rq_throttled(cfs_rq))
1568 break;
1569 }
1570
1571 if (!se)
1572 rq->nr_running += task_delta;
1573
1574 /* determine whether we need to wake up potentially idle cpu */
1575 if (rq->curr == rq->idle && rq->cfs.nr_running)
1576 resched_task(rq->curr);
1577}
1578
1579static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
1580 u64 remaining, u64 expires)
1581{
1582 struct cfs_rq *cfs_rq;
1583 u64 runtime = remaining;
1584
1585 rcu_read_lock();
1586 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
1587 throttled_list) {
1588 struct rq *rq = rq_of(cfs_rq);
1589
1590 raw_spin_lock(&rq->lock);
1591 if (!cfs_rq_throttled(cfs_rq))
1592 goto next;
1593
1594 runtime = -cfs_rq->runtime_remaining + 1;
1595 if (runtime > remaining)
1596 runtime = remaining;
1597 remaining -= runtime;
1598
1599 cfs_rq->runtime_remaining += runtime;
1600 cfs_rq->runtime_expires = expires;
1601
1602 /* we check whether we're throttled above */
1603 if (cfs_rq->runtime_remaining > 0)
1604 unthrottle_cfs_rq(cfs_rq);
1605
1606next:
1607 raw_spin_unlock(&rq->lock);
1608
1609 if (!remaining)
1610 break;
1611 }
1612 rcu_read_unlock();
1613
1614 return remaining;
1615}
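distribute_cfs_runtime() gives each throttled cfs_rq just enough to lift runtime_remaining to +1ns so it can be unthrottled, keeping the rest for later queues in the list. Worked numbers (assumed for illustration): with 10ms to hand out and throttled queues at -3ms, -6ms and -4ms, the first two receive 3ms+1ns and 6ms+1ns and are unthrottled; roughly 1ms is left for the third, which absorbs it but stays throttled (its balance rises only to about -3ms) until the next period refill.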
1616
1617/*
1618 * Responsible for refilling a task_group's bandwidth and unthrottling its
1619 * cfs_rqs as appropriate. If there has been no activity within the last
1620 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
1621 * used to track this state.
1622 */
1623static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
1624{
1625 u64 runtime, runtime_expires;
1626 int idle = 1, throttled;
1627
1628 raw_spin_lock(&cfs_b->lock);
1629 /* no need to continue the timer with no bandwidth constraint */
1630 if (cfs_b->quota == RUNTIME_INF)
1631 goto out_unlock;
1632
1633 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
1634 /* idle depends on !throttled (for the case of a large deficit) */
1635 idle = cfs_b->idle && !throttled;
1636 cfs_b->nr_periods += overrun;
1637
1638 /* if we're going inactive then everything else can be deferred */
1639 if (idle)
1640 goto out_unlock;
1641
1642 __refill_cfs_bandwidth_runtime(cfs_b);
1643
1644 if (!throttled) {
1645 /* mark as potentially idle for the upcoming period */
1646 cfs_b->idle = 1;
1647 goto out_unlock;
1648 }
1649
1650 /* account preceding periods in which throttling occurred */
1651 cfs_b->nr_throttled += overrun;
1652
1653 /*
1654 * There are throttled entities so we must first use the new bandwidth
1655 * to unthrottle them before making it generally available. This
1656 * ensures that all existing debts will be paid before a new cfs_rq is
1657 * allowed to run.
1658 */
1659 runtime = cfs_b->runtime;
1660 runtime_expires = cfs_b->runtime_expires;
1661 cfs_b->runtime = 0;
1662
1663 /*
1664 * This check is repeated as we are holding onto the new bandwidth
1665 * while we unthrottle. This can potentially race with an unthrottled
1666 * group trying to acquire new bandwidth from the global pool.
1667 */
1668 while (throttled && runtime > 0) {
1669 raw_spin_unlock(&cfs_b->lock);
1670 /* we can't nest cfs_b->lock while distributing bandwidth */
1671 runtime = distribute_cfs_runtime(cfs_b, runtime,
1672 runtime_expires);
1673 raw_spin_lock(&cfs_b->lock);
1674
1675 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
1676 }
1677
1678 /* return (any) remaining runtime */
1679 cfs_b->runtime = runtime;
1680 /*
1681 * While we are ensured activity in the period following an
1682 * unthrottle, this also covers the case in which the new bandwidth is
1683 * insufficient to cover the existing bandwidth deficit. (Forcing the
1684 * timer to remain active while there are any throttled entities.)
1685 */
1686 cfs_b->idle = 0;
1687out_unlock:
1688 if (idle)
1689 cfs_b->timer_active = 0;
1690 raw_spin_unlock(&cfs_b->lock);
1691
1692 return idle;
1693}
1694
1695/* a cfs_rq won't donate quota below this amount */
1696static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
1697/* minimum remaining period time to redistribute slack quota */
1698static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
1699/* how long we wait to gather additional slack before distributing */
1700static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
1701
1702/* are we near the end of the current quota period? */
1703static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
1704{
1705 struct hrtimer *refresh_timer = &cfs_b->period_timer;
1706 u64 remaining;
1707
1708 /* if the call-back is running a quota refresh is already occurring */
1709 if (hrtimer_callback_running(refresh_timer))
1710 return 1;
1711
1712 /* is a quota refresh about to occur? */
1713 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
1714 if (remaining < min_expire)
1715 return 1;
1716
1717 return 0;
1718}
1719
1720static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
1721{
1722 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
1723
1724 /* if there's a quota refresh soon don't bother with slack */
1725 if (runtime_refresh_within(cfs_b, min_left))
1726 return;
1727
1728 start_bandwidth_timer(&cfs_b->slack_timer,
1729 ns_to_ktime(cfs_bandwidth_slack_period));
1730}
1731
1732/* we know any runtime found here is valid as update_curr() precedes return */
1733static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1734{
1735 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
1736 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
1737
1738 if (slack_runtime <= 0)
1739 return;
1740
1741 raw_spin_lock(&cfs_b->lock);
1742 if (cfs_b->quota != RUNTIME_INF &&
1743 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
1744 cfs_b->runtime += slack_runtime;
1745
1746 /* we are under rq->lock, defer unthrottling using a timer */
1747 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
1748 !list_empty(&cfs_b->throttled_cfs_rq))
1749 start_cfs_slack_bandwidth(cfs_b);
1750 }
1751 raw_spin_unlock(&cfs_b->lock);
1752
1753 /* even if it's not valid for return we don't want to try again */
1754 cfs_rq->runtime_remaining -= slack_runtime;
1755}
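On the last dequeue a cfs_rq donates everything above min_cfs_rq_runtime (1ms) back to the global pool, provided the global period has not rolled over in the meantime (the runtime_expires comparison). With an assumed 4ms left locally, 3ms is returned and 1ms retained; once the pool exceeds one bandwidth slice and throttled queues exist, the 5ms slack timer is armed to redistribute it instead of doing so under rq->lock.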
1756
1757static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1758{
1759 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
1760 return;
1761
1762 __return_cfs_rq_runtime(cfs_rq);
1763}
1764
1765/*
1766 * This is done with a timer (instead of inline with bandwidth return) since
1767 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
1768 */
1769static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
1770{
1771 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
1772 u64 expires;
1773
1774 /* confirm we're still not at a refresh boundary */
1775 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration))
1776 return;
1777
1778 raw_spin_lock(&cfs_b->lock);
1779 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
1780 runtime = cfs_b->runtime;
1781 cfs_b->runtime = 0;
1782 }
1783 expires = cfs_b->runtime_expires;
1784 raw_spin_unlock(&cfs_b->lock);
1785
1786 if (!runtime)
1787 return;
1788
1789 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
1790
1791 raw_spin_lock(&cfs_b->lock);
1792 if (expires == cfs_b->runtime_expires)
1793 cfs_b->runtime = runtime;
1794 raw_spin_unlock(&cfs_b->lock);
1795}
1796
1797/*
1798 * When a group wakes up we want to make sure that its quota is not already
1799 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
 1800 * runtime as update_curr() throttling can not trigger until it's on-rq.
1801 */
1802static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
1803{
1804 /* an active group must be handled by the update_curr()->put() path */
1805 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
1806 return;
1807
1808 /* ensure the group is not already throttled */
1809 if (cfs_rq_throttled(cfs_rq))
1810 return;
1811
1812 /* update runtime allocation */
1813 account_cfs_rq_runtime(cfs_rq, 0);
1814 if (cfs_rq->runtime_remaining <= 0)
1815 throttle_cfs_rq(cfs_rq);
1816}
1817
1818/* conditionally throttle active cfs_rq's from put_prev_entity() */
1819static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
1820{
1821 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
1822 return;
1823
1824 /*
1825 * it's possible for a throttled entity to be forced into a running
 1826 * state (e.g. set_curr_task); in this case we're finished.
1827 */
1828 if (cfs_rq_throttled(cfs_rq))
1829 return;
1830
1831 throttle_cfs_rq(cfs_rq);
1832}
1833#else
1834static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
1835 unsigned long delta_exec) {}
1836static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
1837static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
1838static void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
1839
1840static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
1841{
1842 return 0;
1843}
1844
1845static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
1846{
1847 return 0;
1848}
1849
1850static inline int throttled_lb_pair(struct task_group *tg,
1851 int src_cpu, int dest_cpu)
1852{
1853 return 0;
1854}
1855#endif
1856
1240/************************************************** 1857/**************************************************
1241 * CFS operations on tasks: 1858 * CFS operations on tasks:
1242 */ 1859 */
@@ -1313,16 +1930,33 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1313 break; 1930 break;
1314 cfs_rq = cfs_rq_of(se); 1931 cfs_rq = cfs_rq_of(se);
1315 enqueue_entity(cfs_rq, se, flags); 1932 enqueue_entity(cfs_rq, se, flags);
1933
1934 /*
1935 * end evaluation on encountering a throttled cfs_rq
1936 *
1937 * note: in the case of encountering a throttled cfs_rq we will
1938 * post the final h_nr_running increment below.
1939 */
1940 if (cfs_rq_throttled(cfs_rq))
1941 break;
1942 cfs_rq->h_nr_running++;
1943
1316 flags = ENQUEUE_WAKEUP; 1944 flags = ENQUEUE_WAKEUP;
1317 } 1945 }
1318 1946
1319 for_each_sched_entity(se) { 1947 for_each_sched_entity(se) {
1320 cfs_rq = cfs_rq_of(se); 1948 cfs_rq = cfs_rq_of(se);
1949 cfs_rq->h_nr_running++;
1950
1951 if (cfs_rq_throttled(cfs_rq))
1952 break;
1321 1953
1322 update_cfs_load(cfs_rq, 0); 1954 update_cfs_load(cfs_rq, 0);
1323 update_cfs_shares(cfs_rq); 1955 update_cfs_shares(cfs_rq);
1324 } 1956 }
1325 1957
1958 if (!se)
1959 inc_nr_running(rq);
1326 hrtick_update(rq); 1960 hrtick_update(rq);
1327} 1961}
1328 1962
@@ -1343,6 +1977,16 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1343 cfs_rq = cfs_rq_of(se); 1977 cfs_rq = cfs_rq_of(se);
1344 dequeue_entity(cfs_rq, se, flags); 1978 dequeue_entity(cfs_rq, se, flags);
1345 1979
1980 /*
1981 * end evaluation on encountering a throttled cfs_rq
1982 *
1983 * note: in the case of encountering a throttled cfs_rq we will
1984 * post the final h_nr_running decrement below.
1985 */
1986 if (cfs_rq_throttled(cfs_rq))
1987 break;
1988 cfs_rq->h_nr_running--;
1989
1346 /* Don't dequeue parent if it has other entities besides us */ 1990 /* Don't dequeue parent if it has other entities besides us */
1347 if (cfs_rq->load.weight) { 1991 if (cfs_rq->load.weight) {
1348 /* 1992 /*
@@ -1361,11 +2005,17 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
1361 2005
1362 for_each_sched_entity(se) { 2006 for_each_sched_entity(se) {
1363 cfs_rq = cfs_rq_of(se); 2007 cfs_rq = cfs_rq_of(se);
2008 cfs_rq->h_nr_running--;
2009
2010 if (cfs_rq_throttled(cfs_rq))
2011 break;
1364 2012
1365 update_cfs_load(cfs_rq, 0); 2013 update_cfs_load(cfs_rq, 0);
1366 update_cfs_shares(cfs_rq); 2014 update_cfs_shares(cfs_rq);
1367 } 2015 }
1368 2016
2017 if (!se)
2018 dec_nr_running(rq);
1369 hrtick_update(rq); 2019 hrtick_update(rq);
1370} 2020}
1371 2021
@@ -1399,42 +2049,105 @@ static void task_waking_fair(struct task_struct *p)
1399 * Adding load to a group doesn't make a group heavier, but can cause movement 2049 * Adding load to a group doesn't make a group heavier, but can cause movement
1400 * of group shares between cpus. Assuming the shares were perfectly aligned one 2050 * of group shares between cpus. Assuming the shares were perfectly aligned one
1401 * can calculate the shift in shares. 2051 * can calculate the shift in shares.
2052 *
2053 * Calculate the effective load difference if @wl is added (subtracted) to @tg
2054 * on this @cpu and results in a total addition (subtraction) of @wg to the
2055 * total group weight.
2056 *
2057 * Given a runqueue weight distribution (rw_i) we can compute a shares
2058 * distribution (s_i) using:
2059 *
2060 * s_i = rw_i / \Sum rw_j (1)
2061 *
2062 * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
2063 * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
2064 * shares distribution (s_i):
2065 *
2066 * rw_i = { 2, 4, 1, 0 }
2067 * s_i = { 2/7, 4/7, 1/7, 0 }
2068 *
2069 * As per wake_affine() we're interested in the load of two CPUs (the CPU the
2070 * task used to run on and the CPU the waker is running on), we need to
2071 * compute the effect of waking a task on either CPU and, in case of a sync
2072 * wakeup, compute the effect of the current task going to sleep.
2073 *
2074 * So for a change of @wl to the local @cpu with an overall group weight change
2075 * of @wl we can compute the new shares distribution (s'_i) using:
2076 *
2077 * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
2078 *
2079 * Suppose we're interested in CPUs 0 and 1, and want to compute the load
2080 * differences in waking a task to CPU 0. The additional task changes the
2081 * weight and shares distributions like:
2082 *
2083 * rw'_i = { 3, 4, 1, 0 }
2084 * s'_i = { 3/8, 4/8, 1/8, 0 }
2085 *
2086 * We can then compute the difference in effective weight by using:
2087 *
2088 * dw_i = S * (s'_i - s_i) (3)
2089 *
2090 * Where 'S' is the group weight as seen by its parent.
2091 *
2092 * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
2093 * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
2094 * 4/7) times the weight of the group.
1402 */ 2095 */
1403static long effective_load(struct task_group *tg, int cpu, long wl, long wg) 2096static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
1404{ 2097{
1405 struct sched_entity *se = tg->se[cpu]; 2098 struct sched_entity *se = tg->se[cpu];
1406 2099
1407 if (!tg->parent) 2100 if (!tg->parent) /* the trivial, non-cgroup case */
1408 return wl; 2101 return wl;
1409 2102
1410 for_each_sched_entity(se) { 2103 for_each_sched_entity(se) {
1411 long lw, w; 2104 long w, W;
1412 2105
1413 tg = se->my_q->tg; 2106 tg = se->my_q->tg;
1414 w = se->my_q->load.weight;
1415 2107
1416 /* use this cpu's instantaneous contribution */ 2108 /*
1417 lw = atomic_read(&tg->load_weight); 2109 * W = @wg + \Sum rw_j
1418 lw -= se->my_q->load_contribution; 2110 */
1419 lw += w + wg; 2111 W = wg + calc_tg_weight(tg, se->my_q);
1420 2112
1421 wl += w; 2113 /*
2114 * w = rw_i + @wl
2115 */
2116 w = se->my_q->load.weight + wl;
1422 2117
1423 if (lw > 0 && wl < lw) 2118 /*
1424 wl = (wl * tg->shares) / lw; 2119 * wl = S * s'_i; see (2)
2120 */
2121 if (W > 0 && w < W)
2122 wl = (w * tg->shares) / W;
1425 else 2123 else
1426 wl = tg->shares; 2124 wl = tg->shares;
1427 2125
1428 /* zero point is MIN_SHARES */ 2126 /*
2127 * Per the above, wl is the new se->load.weight value; since
2128 * those are clipped to [MIN_SHARES, ...) do so now. See
2129 * calc_cfs_shares().
2130 */
1429 if (wl < MIN_SHARES) 2131 if (wl < MIN_SHARES)
1430 wl = MIN_SHARES; 2132 wl = MIN_SHARES;
2133
2134 /*
2135 * wl = dw_i = S * (s'_i - s_i); see (3)
2136 */
1431 wl -= se->load.weight; 2137 wl -= se->load.weight;
2138
2139 /*
2140 * Recursively apply this logic to all parent groups to compute
2141 * the final effective load change on the root group. Since
2142 * only the @tg group gets extra weight, all parent groups can
2143 * only redistribute existing shares. @wl is the shift in shares
2144 * resulting from this level per the above.
2145 */
1432 wg = 0; 2146 wg = 0;
1433 } 2147 }
1434 2148
1435 return wl; 2149 return wl;
1436} 2150}
1437
1438#else 2151#else
1439 2152
1440static inline unsigned long effective_load(struct task_group *tg, int cpu, 2153static inline unsigned long effective_load(struct task_group *tg, int cpu,
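The shares arithmetic documented in the effective_load() comment above can be checked in isolation. The following user-space sketch is not kernel code; it reuses the comment's illustrative values (rw_i = {2, 4, 1, 0}, a group weight S of 1024, and a waking task of weight 1) and reproduces one hierarchy level of the same integer math:

/*
 * Standalone check of the effective_load() example above.
 * All inputs are the illustrative values from that comment.
 */
#include <stdio.h>

int main(void)
{
	long rw[4] = { 2, 4, 1, 0 };	/* per-cpu runqueue weight, in task units */
	long S = 1024;			/* group weight as seen by its parent     */
	long wl = 1, wg = 1;		/* one extra task wakes up on CPU 0       */

	long sum = rw[0] + rw[1] + rw[2] + rw[3];	/* \Sum rw_j = 7      */
	long old_share = rw[0] * S / sum;		/* S * s_0  = S * 2/7 */

	long W = wg + sum;				/* new total weight = 8 */
	long w = rw[0] + wl;				/* new CPU 0 weight = 3 */
	long new_share = (W > 0 && w < W) ? w * S / W : S; /* S * s'_0 = S * 3/8 */

	/* dw_0 = S * (s'_0 - s_0); per the comment this is roughly 5/56 of S */
	printf("old=%ld new=%ld dw=%ld (5/56 of S is about %ld)\n",
	       old_share, new_share, new_share - old_share, 5 * S / 56);
	return 0;
}

With the values above it prints old=292, new=384, dw=92, which matches the 5/56 figure given in the comment once the integer truncation is accounted for.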
@@ -1547,7 +2260,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
1547 2260
1548 /* Skip over this group if it has no CPUs allowed */ 2261 /* Skip over this group if it has no CPUs allowed */
1549 if (!cpumask_intersects(sched_group_cpus(group), 2262 if (!cpumask_intersects(sched_group_cpus(group),
1550 &p->cpus_allowed)) 2263 tsk_cpus_allowed(p)))
1551 continue; 2264 continue;
1552 2265
1553 local_group = cpumask_test_cpu(this_cpu, 2266 local_group = cpumask_test_cpu(this_cpu,
@@ -1593,7 +2306,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1593 int i; 2306 int i;
1594 2307
1595 /* Traverse only the allowed CPUs */ 2308 /* Traverse only the allowed CPUs */
1596 for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { 2309 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
1597 load = weighted_cpuload(i); 2310 load = weighted_cpuload(i);
1598 2311
1599 if (load < min_load || (load == min_load && i == this_cpu)) { 2312 if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -1613,7 +2326,8 @@ static int select_idle_sibling(struct task_struct *p, int target)
1613 int cpu = smp_processor_id(); 2326 int cpu = smp_processor_id();
1614 int prev_cpu = task_cpu(p); 2327 int prev_cpu = task_cpu(p);
1615 struct sched_domain *sd; 2328 struct sched_domain *sd;
1616 int i; 2329 struct sched_group *sg;
2330 int i, smt = 0;
1617 2331
1618 /* 2332 /*
1619 * If the task is going to be woken-up on this cpu and if it is 2333 * If the task is going to be woken-up on this cpu and if it is
@@ -1633,25 +2347,38 @@ static int select_idle_sibling(struct task_struct *p, int target)
1633 * Otherwise, iterate the domains and find an eligible idle cpu. 2347 * Otherwise, iterate the domains and find an eligible idle cpu.
1634 */ 2348 */
1635 rcu_read_lock(); 2349 rcu_read_lock();
2350again:
1636 for_each_domain(target, sd) { 2351 for_each_domain(target, sd) {
1637 if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) 2352 if (!smt && (sd->flags & SD_SHARE_CPUPOWER))
1638 break; 2353 continue;
1639 2354
1640 for_each_cpu_and(i, sched_domain_span(sd), &p->cpus_allowed) { 2355 if (!(sd->flags & SD_SHARE_PKG_RESOURCES)) {
1641 if (idle_cpu(i)) { 2356 if (!smt) {
1642 target = i; 2357 smt = 1;
1643 break; 2358 goto again;
1644 } 2359 }
2360 break;
1645 } 2361 }
1646 2362
1647 /* 2363 sg = sd->groups;
1648 * Lets stop looking for an idle sibling when we reached 2364 do {
1649 * the domain that spans the current cpu and prev_cpu. 2365 if (!cpumask_intersects(sched_group_cpus(sg),
1650 */ 2366 tsk_cpus_allowed(p)))
1651 if (cpumask_test_cpu(cpu, sched_domain_span(sd)) && 2367 goto next;
1652 cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) 2368
1653 break; 2369 for_each_cpu(i, sched_group_cpus(sg)) {
2370 if (!idle_cpu(i))
2371 goto next;
2372 }
2373
2374 target = cpumask_first_and(sched_group_cpus(sg),
2375 tsk_cpus_allowed(p));
2376 goto done;
2377next:
2378 sg = sg->next;
2379 } while (sg != sd->groups);
1654 } 2380 }
2381done:
1655 rcu_read_unlock(); 2382 rcu_read_unlock();
1656 2383
1657 return target; 2384 return target;
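The group scan introduced in select_idle_sibling() above only accepts a scheduling group when every CPU in it is idle and the group intersects the task's allowed mask. The sketch below is an illustrative user-space restatement, not kernel code; group membership, idleness and the allowed mask are modelled with plain arrays:

/*
 * Pick the first allowed CPU of the first fully idle group, or -1 if no
 * group qualifies (the caller would then keep its previous target).
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 8

static int pick_idle_group_cpu(const int group_of[NR_CPUS], int nr_groups,
			       const bool idle[NR_CPUS],
			       const bool allowed[NR_CPUS])
{
	for (int g = 0; g < nr_groups; g++) {
		bool intersects = false, all_idle = true;
		int first_allowed = -1;

		for (int cpu = 0; cpu < NR_CPUS; cpu++) {
			if (group_of[cpu] != g)
				continue;
			if (allowed[cpu]) {
				intersects = true;
				if (first_allowed < 0)
					first_allowed = cpu;
			}
			if (!idle[cpu])
				all_idle = false;
		}
		if (intersects && all_idle)
			return first_allowed;
	}
	return -1;
}

int main(void)
{
	/* two groups of four CPUs; only group 1 is entirely idle */
	int group_of[NR_CPUS] = { 0, 0, 0, 0, 1, 1, 1, 1 };
	bool idle[NR_CPUS]    = { 0, 1, 1, 1, 1, 1, 1, 1 };
	bool allowed[NR_CPUS] = { 1, 1, 1, 1, 1, 1, 1, 1 };

	printf("target = %d\n", pick_idle_group_cpu(group_of, 2, idle, allowed));
	return 0;
}

Here CPU 0 being busy disqualifies the whole first group, so the scan settles on CPU 4, mirroring the "whole group must be idle" criterion of the new kernel loop.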
@@ -1680,7 +2407,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
1680 int sync = wake_flags & WF_SYNC; 2407 int sync = wake_flags & WF_SYNC;
1681 2408
1682 if (sd_flag & SD_BALANCE_WAKE) { 2409 if (sd_flag & SD_BALANCE_WAKE) {
1683 if (cpumask_test_cpu(cpu, &p->cpus_allowed)) 2410 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
1684 want_affine = 1; 2411 want_affine = 1;
1685 new_cpu = prev_cpu; 2412 new_cpu = prev_cpu;
1686 } 2413 }
@@ -1875,6 +2602,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1875 if (unlikely(se == pse)) 2602 if (unlikely(se == pse))
1876 return; 2603 return;
1877 2604
2605 /*
2606 * This is possible from callers such as pull_task(), in which we
2607 * unconditionally check_preempt_curr() after an enqueue (which may have
2608 * led to a throttle). This both saves work and prevents false
2609 * next-buddy nomination below.
2610 */
2611 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
2612 return;
2613
1878 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) { 2614 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
1879 set_next_buddy(pse); 2615 set_next_buddy(pse);
1880 next_buddy_marked = 1; 2616 next_buddy_marked = 1;
@@ -1883,6 +2619,12 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1883 /* 2619 /*
1884 * We can come here with TIF_NEED_RESCHED already set from new task 2620 * We can come here with TIF_NEED_RESCHED already set from new task
1885 * wake up path. 2621 * wake up path.
2622 *
2623 * Note: this also catches the edge-case of curr being in a throttled
2624 * group (e.g. via set_curr_task), since update_curr() (in the
2625 * enqueue of curr) will have resulted in resched being set. This
2626 * prevents us from potentially nominating it as a false LAST_BUDDY
2627 * below.
1886 */ 2628 */
1887 if (test_tsk_need_resched(curr)) 2629 if (test_tsk_need_resched(curr))
1888 return; 2630 return;
@@ -1899,10 +2641,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
1899 if (unlikely(p->policy != SCHED_NORMAL)) 2641 if (unlikely(p->policy != SCHED_NORMAL))
1900 return; 2642 return;
1901 2643
1902
1903 if (!sched_feat(WAKEUP_PREEMPT))
1904 return;
1905
1906 find_matching_se(&se, &pse); 2644 find_matching_se(&se, &pse);
1907 update_curr(cfs_rq_of(se)); 2645 update_curr(cfs_rq_of(se));
1908 BUG_ON(!pse); 2646 BUG_ON(!pse);
@@ -2005,7 +2743,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
2005{ 2743{
2006 struct sched_entity *se = &p->se; 2744 struct sched_entity *se = &p->se;
2007 2745
2008 if (!se->on_rq) 2746 /* throttled hierarchies are not runnable */
2747 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
2009 return false; 2748 return false;
2010 2749
2011 /* Tell the scheduler that we'd really like pse to run next. */ 2750 /* Tell the scheduler that we'd really like pse to run next. */
@@ -2049,7 +2788,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
2049 * 2) cannot be migrated to this CPU due to cpus_allowed, or 2788 * 2) cannot be migrated to this CPU due to cpus_allowed, or
2050 * 3) are cache-hot on their current CPU. 2789 * 3) are cache-hot on their current CPU.
2051 */ 2790 */
2052 if (!cpumask_test_cpu(this_cpu, &p->cpus_allowed)) { 2791 if (!cpumask_test_cpu(this_cpu, tsk_cpus_allowed(p))) {
2053 schedstat_inc(p, se.statistics.nr_failed_migrations_affine); 2792 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
2054 return 0; 2793 return 0;
2055 } 2794 }
@@ -2102,6 +2841,9 @@ move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
2102 2841
2103 for_each_leaf_cfs_rq(busiest, cfs_rq) { 2842 for_each_leaf_cfs_rq(busiest, cfs_rq) {
2104 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) { 2843 list_for_each_entry_safe(p, n, &cfs_rq->tasks, se.group_node) {
2844 if (throttled_lb_pair(task_group(p),
2845 busiest->cpu, this_cpu))
2846 break;
2105 2847
2106 if (!can_migrate_task(p, busiest, this_cpu, 2848 if (!can_migrate_task(p, busiest, this_cpu,
2107 sd, idle, &pinned)) 2849 sd, idle, &pinned))
@@ -2217,8 +2959,13 @@ static void update_shares(int cpu)
2217 * Iterates the task_group tree in a bottom up fashion, see 2959 * Iterates the task_group tree in a bottom up fashion, see
2218 * list_add_leaf_cfs_rq() for details. 2960 * list_add_leaf_cfs_rq() for details.
2219 */ 2961 */
2220 for_each_leaf_cfs_rq(rq, cfs_rq) 2962 for_each_leaf_cfs_rq(rq, cfs_rq) {
2963 /* throttled entities do not contribute to load */
2964 if (throttled_hierarchy(cfs_rq))
2965 continue;
2966
2221 update_shares_cpu(cfs_rq->tg, cpu); 2967 update_shares_cpu(cfs_rq->tg, cpu);
2968 }
2222 rcu_read_unlock(); 2969 rcu_read_unlock();
2223} 2970}
2224 2971
@@ -2268,9 +3015,10 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
2268 u64 rem_load, moved_load; 3015 u64 rem_load, moved_load;
2269 3016
2270 /* 3017 /*
2271 * empty group 3018 * empty group or part of a throttled hierarchy
2272 */ 3019 */
2273 if (!busiest_cfs_rq->task_weight) 3020 if (!busiest_cfs_rq->task_weight ||
3021 throttled_lb_pair(busiest_cfs_rq->tg, cpu_of(busiest), this_cpu))
2274 continue; 3022 continue;
2275 3023
2276 rem_load = (u64)rem_load_move * busiest_weight; 3024 rem_load = (u64)rem_load_move * busiest_weight;
@@ -2854,7 +3602,7 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
2854} 3602}
2855 3603
2856/** 3604/**
2857 * update_sd_lb_stats - Update sched_group's statistics for load balancing. 3605 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
2858 * @sd: sched_domain whose statistics are to be updated. 3606 * @sd: sched_domain whose statistics are to be updated.
2859 * @this_cpu: Cpu for which load balance is currently performed. 3607 * @this_cpu: Cpu for which load balance is currently performed.
2860 * @idle: Idle status of this_cpu 3608 * @idle: Idle status of this_cpu
@@ -3430,7 +4178,7 @@ redo:
3430 * moved to this_cpu 4178 * moved to this_cpu
3431 */ 4179 */
3432 if (!cpumask_test_cpu(this_cpu, 4180 if (!cpumask_test_cpu(this_cpu,
3433 &busiest->curr->cpus_allowed)) { 4181 tsk_cpus_allowed(busiest->curr))) {
3434 raw_spin_unlock_irqrestore(&busiest->lock, 4182 raw_spin_unlock_irqrestore(&busiest->lock,
3435 flags); 4183 flags);
3436 all_pinned = 1; 4184 all_pinned = 1;
@@ -3612,22 +4360,6 @@ out_unlock:
3612} 4360}
3613 4361
3614#ifdef CONFIG_NO_HZ 4362#ifdef CONFIG_NO_HZ
3615
3616static DEFINE_PER_CPU(struct call_single_data, remote_sched_softirq_cb);
3617
3618static void trigger_sched_softirq(void *data)
3619{
3620 raise_softirq_irqoff(SCHED_SOFTIRQ);
3621}
3622
3623static inline void init_sched_softirq_csd(struct call_single_data *csd)
3624{
3625 csd->func = trigger_sched_softirq;
3626 csd->info = NULL;
3627 csd->flags = 0;
3628 csd->priv = 0;
3629}
3630
3631/* 4363/*
3632 * idle load balancing details 4364 * idle load balancing details
3633 * - One of the idle CPUs nominates itself as idle load_balancer, while 4365 * - One of the idle CPUs nominates itself as idle load_balancer, while
@@ -3667,7 +4399,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
3667 struct sched_domain *sd; 4399 struct sched_domain *sd;
3668 4400
3669 for_each_domain(cpu, sd) 4401 for_each_domain(cpu, sd)
3670 if (sd && (sd->flags & flag)) 4402 if (sd->flags & flag)
3671 break; 4403 break;
3672 4404
3673 return sd; 4405 return sd;
@@ -3793,11 +4525,16 @@ static void nohz_balancer_kick(int cpu)
3793 } 4525 }
3794 4526
3795 if (!cpu_rq(ilb_cpu)->nohz_balance_kick) { 4527 if (!cpu_rq(ilb_cpu)->nohz_balance_kick) {
3796 struct call_single_data *cp;
3797
3798 cpu_rq(ilb_cpu)->nohz_balance_kick = 1; 4528 cpu_rq(ilb_cpu)->nohz_balance_kick = 1;
3799 cp = &per_cpu(remote_sched_softirq_cb, cpu); 4529
3800 __smp_call_function_single(ilb_cpu, cp, 0); 4530 smp_mb();
4531 /*
4532 * Use smp_send_reschedule() instead of resched_cpu().
4533 * This way we generate a sched IPI on the target cpu which
4534 * is idle. And the softirq performing nohz idle load balance
4535 * will be run before returning from the IPI.
4536 */
4537 smp_send_reschedule(ilb_cpu);
3801 } 4538 }
3802 return; 4539 return;
3803} 4540}
@@ -4030,7 +4767,7 @@ static inline int nohz_kick_needed(struct rq *rq, int cpu)
4030 if (time_before(now, nohz.next_balance)) 4767 if (time_before(now, nohz.next_balance))
4031 return 0; 4768 return 0;
4032 4769
4033 if (rq->idle_at_tick) 4770 if (idle_cpu(cpu))
4034 return 0; 4771 return 0;
4035 4772
4036 first_pick_cpu = atomic_read(&nohz.first_pick_cpu); 4773 first_pick_cpu = atomic_read(&nohz.first_pick_cpu);
@@ -4066,7 +4803,7 @@ static void run_rebalance_domains(struct softirq_action *h)
4066{ 4803{
4067 int this_cpu = smp_processor_id(); 4804 int this_cpu = smp_processor_id();
4068 struct rq *this_rq = cpu_rq(this_cpu); 4805 struct rq *this_rq = cpu_rq(this_cpu);
4069 enum cpu_idle_type idle = this_rq->idle_at_tick ? 4806 enum cpu_idle_type idle = this_rq->idle_balance ?
4070 CPU_IDLE : CPU_NOT_IDLE; 4807 CPU_IDLE : CPU_NOT_IDLE;
4071 4808
4072 rebalance_domains(this_cpu, idle); 4809 rebalance_domains(this_cpu, idle);
@@ -4251,8 +4988,13 @@ static void set_curr_task_fair(struct rq *rq)
4251{ 4988{
4252 struct sched_entity *se = &rq->curr->se; 4989 struct sched_entity *se = &rq->curr->se;
4253 4990
4254 for_each_sched_entity(se) 4991 for_each_sched_entity(se) {
4255 set_next_entity(cfs_rq_of(se), se); 4992 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4993
4994 set_next_entity(cfs_rq, se);
4995 /* ensure bandwidth has been allocated on our new cfs_rq */
4996 account_cfs_rq_runtime(cfs_rq, 0);
4997 }
4256} 4998}
4257 4999
4258#ifdef CONFIG_FAIR_GROUP_SCHED 5000#ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 2e74677cb04..84802245abd 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -12,11 +12,6 @@ SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1)
12SCHED_FEAT(START_DEBIT, 1) 12SCHED_FEAT(START_DEBIT, 1)
13 13
14/* 14/*
15 * Should wakeups try to preempt running tasks.
16 */
17SCHED_FEAT(WAKEUP_PREEMPT, 1)
18
19/*
20 * Based on load and program behaviour, see if it makes sense to place 15 * Based on load and program behaviour, see if it makes sense to place
21 * a newly woken task on the same cpu as the task that woke it -- 16 * a newly woken task on the same cpu as the task that woke it --
22 * improve cache locality. Typically used with SYNC wakeups as 17 * improve cache locality. Typically used with SYNC wakeups as
@@ -72,3 +67,4 @@ SCHED_FEAT(NONTASK_POWER, 1)
72SCHED_FEAT(TTWU_QUEUE, 1) 67SCHED_FEAT(TTWU_QUEUE, 1)
73 68
74SCHED_FEAT(FORCE_SD_OVERLAP, 0) 69SCHED_FEAT(FORCE_SD_OVERLAP, 0)
70SCHED_FEAT(RT_RUNTIME_SHARE, 1)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 97540f0c9e4..583a1368afe 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -124,21 +124,33 @@ static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
124 update_rt_migration(rt_rq); 124 update_rt_migration(rt_rq);
125} 125}
126 126
127static inline int has_pushable_tasks(struct rq *rq)
128{
129 return !plist_head_empty(&rq->rt.pushable_tasks);
130}
131
127static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) 132static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
128{ 133{
129 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 134 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
130 plist_node_init(&p->pushable_tasks, p->prio); 135 plist_node_init(&p->pushable_tasks, p->prio);
131 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks); 136 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
137
138 /* Update the highest prio pushable task */
139 if (p->prio < rq->rt.highest_prio.next)
140 rq->rt.highest_prio.next = p->prio;
132} 141}
133 142
134static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) 143static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
135{ 144{
136 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks); 145 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
137}
138 146
139static inline int has_pushable_tasks(struct rq *rq) 147 /* Update the new highest prio pushable task */
140{ 148 if (has_pushable_tasks(rq)) {
141 return !plist_head_empty(&rq->rt.pushable_tasks); 149 p = plist_first_entry(&rq->rt.pushable_tasks,
150 struct task_struct, pushable_tasks);
151 rq->rt.highest_prio.next = p->prio;
152 } else
153 rq->rt.highest_prio.next = MAX_RT_PRIO;
142} 154}
143 155
144#else 156#else
@@ -548,6 +560,9 @@ static int balance_runtime(struct rt_rq *rt_rq)
548{ 560{
549 int more = 0; 561 int more = 0;
550 562
563 if (!sched_feat(RT_RUNTIME_SHARE))
564 return more;
565
551 if (rt_rq->rt_time > rt_rq->rt_runtime) { 566 if (rt_rq->rt_time > rt_rq->rt_runtime) {
552 raw_spin_unlock(&rt_rq->rt_runtime_lock); 567 raw_spin_unlock(&rt_rq->rt_runtime_lock);
553 more = do_balance_runtime(rt_rq); 568 more = do_balance_runtime(rt_rq);
@@ -643,6 +658,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
643 658
644 if (rt_rq->rt_time > runtime) { 659 if (rt_rq->rt_time > runtime) {
645 rt_rq->rt_throttled = 1; 660 rt_rq->rt_throttled = 1;
661 printk_once(KERN_WARNING "sched: RT throttling activated\n");
646 if (rt_rq_throttled(rt_rq)) { 662 if (rt_rq_throttled(rt_rq)) {
647 sched_rt_rq_dequeue(rt_rq); 663 sched_rt_rq_dequeue(rt_rq);
648 return 1; 664 return 1;
@@ -698,47 +714,13 @@ static void update_curr_rt(struct rq *rq)
698 714
699#if defined CONFIG_SMP 715#if defined CONFIG_SMP
700 716
701static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
702
703static inline int next_prio(struct rq *rq)
704{
705 struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
706
707 if (next && rt_prio(next->prio))
708 return next->prio;
709 else
710 return MAX_RT_PRIO;
711}
712
713static void 717static void
714inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) 718inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
715{ 719{
716 struct rq *rq = rq_of_rt_rq(rt_rq); 720 struct rq *rq = rq_of_rt_rq(rt_rq);
717 721
718 if (prio < prev_prio) { 722 if (rq->online && prio < prev_prio)
719 723 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
720 /*
721 * If the new task is higher in priority than anything on the
722 * run-queue, we know that the previous high becomes our
723 * next-highest.
724 */
725 rt_rq->highest_prio.next = prev_prio;
726
727 if (rq->online)
728 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
729
730 } else if (prio == rt_rq->highest_prio.curr)
731 /*
732 * If the next task is equal in priority to the highest on
733 * the run-queue, then we implicitly know that the next highest
734 * task cannot be any lower than current
735 */
736 rt_rq->highest_prio.next = prio;
737 else if (prio < rt_rq->highest_prio.next)
738 /*
739 * Otherwise, we need to recompute next-highest
740 */
741 rt_rq->highest_prio.next = next_prio(rq);
742} 724}
743 725
744static void 726static void
@@ -746,9 +728,6 @@ dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
746{ 728{
747 struct rq *rq = rq_of_rt_rq(rt_rq); 729 struct rq *rq = rq_of_rt_rq(rt_rq);
748 730
749 if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next))
750 rt_rq->highest_prio.next = next_prio(rq);
751
752 if (rq->online && rt_rq->highest_prio.curr != prev_prio) 731 if (rq->online && rt_rq->highest_prio.curr != prev_prio)
753 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr); 732 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
754} 733}
@@ -961,6 +940,8 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
961 940
962 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) 941 if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
963 enqueue_pushable_task(rq, p); 942 enqueue_pushable_task(rq, p);
943
944 inc_nr_running(rq);
964} 945}
965 946
966static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) 947static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
@@ -971,6 +952,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
971 dequeue_rt_entity(rt_se); 952 dequeue_rt_entity(rt_se);
972 953
973 dequeue_pushable_task(rq, p); 954 dequeue_pushable_task(rq, p);
955
956 dec_nr_running(rq);
974} 957}
975 958
976/* 959/*
@@ -1017,10 +1000,12 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1017 struct rq *rq; 1000 struct rq *rq;
1018 int cpu; 1001 int cpu;
1019 1002
1020 if (sd_flag != SD_BALANCE_WAKE)
1021 return smp_processor_id();
1022
1023 cpu = task_cpu(p); 1003 cpu = task_cpu(p);
1004
1005 /* For anything but wake ups, just return the task_cpu */
1006 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1007 goto out;
1008
1024 rq = cpu_rq(cpu); 1009 rq = cpu_rq(cpu);
1025 1010
1026 rcu_read_lock(); 1011 rcu_read_lock();
@@ -1050,7 +1035,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1050 */ 1035 */
1051 if (curr && unlikely(rt_task(curr)) && 1036 if (curr && unlikely(rt_task(curr)) &&
1052 (curr->rt.nr_cpus_allowed < 2 || 1037 (curr->rt.nr_cpus_allowed < 2 ||
1053 curr->prio < p->prio) && 1038 curr->prio <= p->prio) &&
1054 (p->rt.nr_cpus_allowed > 1)) { 1039 (p->rt.nr_cpus_allowed > 1)) {
1055 int target = find_lowest_rq(p); 1040 int target = find_lowest_rq(p);
1056 1041
@@ -1059,6 +1044,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
1059 } 1044 }
1060 rcu_read_unlock(); 1045 rcu_read_unlock();
1061 1046
1047out:
1062 return cpu; 1048 return cpu;
1063} 1049}
1064 1050
@@ -1178,7 +1164,6 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
1178static void put_prev_task_rt(struct rq *rq, struct task_struct *p) 1164static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1179{ 1165{
1180 update_curr_rt(rq); 1166 update_curr_rt(rq);
1181 p->se.exec_start = 0;
1182 1167
1183 /* 1168 /*
1184 * The previous task needs to be made eligible for pushing 1169 * The previous task needs to be made eligible for pushing
@@ -1198,7 +1183,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
1198static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 1183static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1199{ 1184{
1200 if (!task_running(rq, p) && 1185 if (!task_running(rq, p) &&
1201 (cpu < 0 || cpumask_test_cpu(cpu, &p->cpus_allowed)) && 1186 (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) &&
1202 (p->rt.nr_cpus_allowed > 1)) 1187 (p->rt.nr_cpus_allowed > 1))
1203 return 1; 1188 return 1;
1204 return 0; 1189 return 0;
@@ -1343,7 +1328,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1343 */ 1328 */
1344 if (unlikely(task_rq(task) != rq || 1329 if (unlikely(task_rq(task) != rq ||
1345 !cpumask_test_cpu(lowest_rq->cpu, 1330 !cpumask_test_cpu(lowest_rq->cpu,
1346 &task->cpus_allowed) || 1331 tsk_cpus_allowed(task)) ||
1347 task_running(rq, task) || 1332 task_running(rq, task) ||
1348 !task->on_rq)) { 1333 !task->on_rq)) {
1349 1334
@@ -1394,6 +1379,7 @@ static int push_rt_task(struct rq *rq)
1394{ 1379{
1395 struct task_struct *next_task; 1380 struct task_struct *next_task;
1396 struct rq *lowest_rq; 1381 struct rq *lowest_rq;
1382 int ret = 0;
1397 1383
1398 if (!rq->rt.overloaded) 1384 if (!rq->rt.overloaded)
1399 return 0; 1385 return 0;
@@ -1426,7 +1412,7 @@ retry:
1426 if (!lowest_rq) { 1412 if (!lowest_rq) {
1427 struct task_struct *task; 1413 struct task_struct *task;
1428 /* 1414 /*
1429 * find lock_lowest_rq releases rq->lock 1415 * find_lock_lowest_rq releases rq->lock
1430 * so it is possible that next_task has migrated. 1416 * so it is possible that next_task has migrated.
1431 * 1417 *
1432 * We need to make sure that the task is still on the same 1418 * We need to make sure that the task is still on the same
@@ -1436,12 +1422,11 @@ retry:
1436 task = pick_next_pushable_task(rq); 1422 task = pick_next_pushable_task(rq);
1437 if (task_cpu(next_task) == rq->cpu && task == next_task) { 1423 if (task_cpu(next_task) == rq->cpu && task == next_task) {
1438 /* 1424 /*
1439 * If we get here, the task hasn't moved at all, but 1425 * The task hasn't migrated, and is still the next
1440 * it has failed to push. We will not try again, 1426 * eligible task, but we failed to find a run-queue
1441 * since the other cpus will pull from us when they 1427 * to push it to. Do not retry in this case, since
1442 * are ready. 1428 * other cpus will pull from us when ready.
1443 */ 1429 */
1444 dequeue_pushable_task(rq, next_task);
1445 goto out; 1430 goto out;
1446 } 1431 }
1447 1432
@@ -1460,6 +1445,7 @@ retry:
1460 deactivate_task(rq, next_task, 0); 1445 deactivate_task(rq, next_task, 0);
1461 set_task_cpu(next_task, lowest_rq->cpu); 1446 set_task_cpu(next_task, lowest_rq->cpu);
1462 activate_task(lowest_rq, next_task, 0); 1447 activate_task(lowest_rq, next_task, 0);
1448 ret = 1;
1463 1449
1464 resched_task(lowest_rq->curr); 1450 resched_task(lowest_rq->curr);
1465 1451
@@ -1468,7 +1454,7 @@ retry:
1468out: 1454out:
1469 put_task_struct(next_task); 1455 put_task_struct(next_task);
1470 1456
1471 return 1; 1457 return ret;
1472} 1458}
1473 1459
1474static void push_rt_tasks(struct rq *rq) 1460static void push_rt_tasks(struct rq *rq)
@@ -1581,7 +1567,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
1581 p->rt.nr_cpus_allowed > 1 && 1567 p->rt.nr_cpus_allowed > 1 &&
1582 rt_task(rq->curr) && 1568 rt_task(rq->curr) &&
1583 (rq->curr->rt.nr_cpus_allowed < 2 || 1569 (rq->curr->rt.nr_cpus_allowed < 2 ||
1584 rq->curr->prio < p->prio)) 1570 rq->curr->prio <= p->prio))
1585 push_rt_tasks(rq); 1571 push_rt_tasks(rq);
1586} 1572}
1587 1573
@@ -1626,9 +1612,6 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1626 1612
1627 update_rt_migration(&rq->rt); 1613 update_rt_migration(&rq->rt);
1628 } 1614 }
1629
1630 cpumask_copy(&p->cpus_allowed, new_mask);
1631 p->rt.nr_cpus_allowed = weight;
1632} 1615}
1633 1616
1634/* Assumes rq->lock is held */ 1617/* Assumes rq->lock is held */
@@ -1863,4 +1846,3 @@ static void print_rt_stats(struct seq_file *m, int cpu)
1863 rcu_read_unlock(); 1846 rcu_read_unlock();
1864} 1847}
1865#endif /* CONFIG_SCHED_DEBUG */ 1848#endif /* CONFIG_SCHED_DEBUG */
1866
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 331e01bcd02..87f9e36ea56 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -282,10 +282,10 @@ static inline void account_group_user_time(struct task_struct *tsk,
282 if (!cputimer->running) 282 if (!cputimer->running)
283 return; 283 return;
284 284
285 spin_lock(&cputimer->lock); 285 raw_spin_lock(&cputimer->lock);
286 cputimer->cputime.utime = 286 cputimer->cputime.utime =
287 cputime_add(cputimer->cputime.utime, cputime); 287 cputime_add(cputimer->cputime.utime, cputime);
288 spin_unlock(&cputimer->lock); 288 raw_spin_unlock(&cputimer->lock);
289} 289}
290 290
291/** 291/**
@@ -306,10 +306,10 @@ static inline void account_group_system_time(struct task_struct *tsk,
306 if (!cputimer->running) 306 if (!cputimer->running)
307 return; 307 return;
308 308
309 spin_lock(&cputimer->lock); 309 raw_spin_lock(&cputimer->lock);
310 cputimer->cputime.stime = 310 cputimer->cputime.stime =
311 cputime_add(cputimer->cputime.stime, cputime); 311 cputime_add(cputimer->cputime.stime, cputime);
312 spin_unlock(&cputimer->lock); 312 raw_spin_unlock(&cputimer->lock);
313} 313}
314 314
315/** 315/**
@@ -330,7 +330,7 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
330 if (!cputimer->running) 330 if (!cputimer->running)
331 return; 331 return;
332 332
333 spin_lock(&cputimer->lock); 333 raw_spin_lock(&cputimer->lock);
334 cputimer->cputime.sum_exec_runtime += ns; 334 cputimer->cputime.sum_exec_runtime += ns;
335 spin_unlock(&cputimer->lock); 335 raw_spin_unlock(&cputimer->lock);
336} 336}
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 6f437632afa..8b44e7fa7fb 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -34,11 +34,13 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
34static void 34static void
35enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) 35enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
36{ 36{
37 inc_nr_running(rq);
37} 38}
38 39
39static void 40static void
40dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) 41dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
41{ 42{
43 dec_nr_running(rq);
42} 44}
43 45
44static void yield_task_stop(struct rq *rq) 46static void yield_task_stop(struct rq *rq)
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index 94a62c0d4ad..60636a4e25c 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -27,7 +27,7 @@
27 27
28#include <linux/compiler.h> 28#include <linux/compiler.h>
29#include <linux/kernel.h> 29#include <linux/kernel.h>
30#include <linux/module.h> 30#include <linux/export.h>
31#include <linux/sched.h> 31#include <linux/sched.h>
32#include <linux/semaphore.h> 32#include <linux/semaphore.h>
33#include <linux/spinlock.h> 33#include <linux/spinlock.h>
@@ -54,12 +54,12 @@ void down(struct semaphore *sem)
54{ 54{
55 unsigned long flags; 55 unsigned long flags;
56 56
57 spin_lock_irqsave(&sem->lock, flags); 57 raw_spin_lock_irqsave(&sem->lock, flags);
58 if (likely(sem->count > 0)) 58 if (likely(sem->count > 0))
59 sem->count--; 59 sem->count--;
60 else 60 else
61 __down(sem); 61 __down(sem);
62 spin_unlock_irqrestore(&sem->lock, flags); 62 raw_spin_unlock_irqrestore(&sem->lock, flags);
63} 63}
64EXPORT_SYMBOL(down); 64EXPORT_SYMBOL(down);
65 65
@@ -77,12 +77,12 @@ int down_interruptible(struct semaphore *sem)
77 unsigned long flags; 77 unsigned long flags;
78 int result = 0; 78 int result = 0;
79 79
80 spin_lock_irqsave(&sem->lock, flags); 80 raw_spin_lock_irqsave(&sem->lock, flags);
81 if (likely(sem->count > 0)) 81 if (likely(sem->count > 0))
82 sem->count--; 82 sem->count--;
83 else 83 else
84 result = __down_interruptible(sem); 84 result = __down_interruptible(sem);
85 spin_unlock_irqrestore(&sem->lock, flags); 85 raw_spin_unlock_irqrestore(&sem->lock, flags);
86 86
87 return result; 87 return result;
88} 88}
@@ -103,12 +103,12 @@ int down_killable(struct semaphore *sem)
103 unsigned long flags; 103 unsigned long flags;
104 int result = 0; 104 int result = 0;
105 105
106 spin_lock_irqsave(&sem->lock, flags); 106 raw_spin_lock_irqsave(&sem->lock, flags);
107 if (likely(sem->count > 0)) 107 if (likely(sem->count > 0))
108 sem->count--; 108 sem->count--;
109 else 109 else
110 result = __down_killable(sem); 110 result = __down_killable(sem);
111 spin_unlock_irqrestore(&sem->lock, flags); 111 raw_spin_unlock_irqrestore(&sem->lock, flags);
112 112
113 return result; 113 return result;
114} 114}
@@ -132,11 +132,11 @@ int down_trylock(struct semaphore *sem)
132 unsigned long flags; 132 unsigned long flags;
133 int count; 133 int count;
134 134
135 spin_lock_irqsave(&sem->lock, flags); 135 raw_spin_lock_irqsave(&sem->lock, flags);
136 count = sem->count - 1; 136 count = sem->count - 1;
137 if (likely(count >= 0)) 137 if (likely(count >= 0))
138 sem->count = count; 138 sem->count = count;
139 spin_unlock_irqrestore(&sem->lock, flags); 139 raw_spin_unlock_irqrestore(&sem->lock, flags);
140 140
141 return (count < 0); 141 return (count < 0);
142} 142}
@@ -157,12 +157,12 @@ int down_timeout(struct semaphore *sem, long jiffies)
157 unsigned long flags; 157 unsigned long flags;
158 int result = 0; 158 int result = 0;
159 159
160 spin_lock_irqsave(&sem->lock, flags); 160 raw_spin_lock_irqsave(&sem->lock, flags);
161 if (likely(sem->count > 0)) 161 if (likely(sem->count > 0))
162 sem->count--; 162 sem->count--;
163 else 163 else
164 result = __down_timeout(sem, jiffies); 164 result = __down_timeout(sem, jiffies);
165 spin_unlock_irqrestore(&sem->lock, flags); 165 raw_spin_unlock_irqrestore(&sem->lock, flags);
166 166
167 return result; 167 return result;
168} 168}
@@ -179,12 +179,12 @@ void up(struct semaphore *sem)
179{ 179{
180 unsigned long flags; 180 unsigned long flags;
181 181
182 spin_lock_irqsave(&sem->lock, flags); 182 raw_spin_lock_irqsave(&sem->lock, flags);
183 if (likely(list_empty(&sem->wait_list))) 183 if (likely(list_empty(&sem->wait_list)))
184 sem->count++; 184 sem->count++;
185 else 185 else
186 __up(sem); 186 __up(sem);
187 spin_unlock_irqrestore(&sem->lock, flags); 187 raw_spin_unlock_irqrestore(&sem->lock, flags);
188} 188}
189EXPORT_SYMBOL(up); 189EXPORT_SYMBOL(up);
190 190
@@ -217,9 +217,9 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
217 if (timeout <= 0) 217 if (timeout <= 0)
218 goto timed_out; 218 goto timed_out;
219 __set_task_state(task, state); 219 __set_task_state(task, state);
220 spin_unlock_irq(&sem->lock); 220 raw_spin_unlock_irq(&sem->lock);
221 timeout = schedule_timeout(timeout); 221 timeout = schedule_timeout(timeout);
222 spin_lock_irq(&sem->lock); 222 raw_spin_lock_irq(&sem->lock);
223 if (waiter.up) 223 if (waiter.up)
224 return 0; 224 return 0;
225 } 225 }
diff --git a/kernel/signal.c b/kernel/signal.c
index 291c9700be7..b3f78d09a10 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -11,7 +11,7 @@
11 */ 11 */
12 12
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/module.h> 14#include <linux/export.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/fs.h> 17#include <linux/fs.h>
@@ -1344,13 +1344,24 @@ int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1344 return error; 1344 return error;
1345} 1345}
1346 1346
1347static int kill_as_cred_perm(const struct cred *cred,
1348 struct task_struct *target)
1349{
1350 const struct cred *pcred = __task_cred(target);
1351 if (cred->user_ns != pcred->user_ns)
1352 return 0;
1353 if (cred->euid != pcred->suid && cred->euid != pcred->uid &&
1354 cred->uid != pcred->suid && cred->uid != pcred->uid)
1355 return 0;
1356 return 1;
1357}
1358
1347/* like kill_pid_info(), but doesn't use uid/euid of "current" */ 1359/* like kill_pid_info(), but doesn't use uid/euid of "current" */
1348int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid, 1360int kill_pid_info_as_cred(int sig, struct siginfo *info, struct pid *pid,
1349 uid_t uid, uid_t euid, u32 secid) 1361 const struct cred *cred, u32 secid)
1350{ 1362{
1351 int ret = -EINVAL; 1363 int ret = -EINVAL;
1352 struct task_struct *p; 1364 struct task_struct *p;
1353 const struct cred *pcred;
1354 unsigned long flags; 1365 unsigned long flags;
1355 1366
1356 if (!valid_signal(sig)) 1367 if (!valid_signal(sig))
@@ -1362,10 +1373,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
1362 ret = -ESRCH; 1373 ret = -ESRCH;
1363 goto out_unlock; 1374 goto out_unlock;
1364 } 1375 }
1365 pcred = __task_cred(p); 1376 if (si_fromuser(info) && !kill_as_cred_perm(cred, p)) {
1366 if (si_fromuser(info) &&
1367 euid != pcred->suid && euid != pcred->uid &&
1368 uid != pcred->suid && uid != pcred->uid) {
1369 ret = -EPERM; 1377 ret = -EPERM;
1370 goto out_unlock; 1378 goto out_unlock;
1371 } 1379 }
@@ -1384,7 +1392,7 @@ out_unlock:
1384 rcu_read_unlock(); 1392 rcu_read_unlock();
1385 return ret; 1393 return ret;
1386} 1394}
1387EXPORT_SYMBOL_GPL(kill_pid_info_as_uid); 1395EXPORT_SYMBOL_GPL(kill_pid_info_as_cred);
1388 1396
1389/* 1397/*
1390 * kill_something_info() interprets pid in interesting ways just like kill(2). 1398 * kill_something_info() interprets pid in interesting ways just like kill(2).
diff --git a/kernel/smp.c b/kernel/smp.c
index fb67dfa8394..db197d60489 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -6,7 +6,7 @@
6#include <linux/rcupdate.h> 6#include <linux/rcupdate.h>
7#include <linux/rculist.h> 7#include <linux/rculist.h>
8#include <linux/kernel.h> 8#include <linux/kernel.h>
9#include <linux/module.h> 9#include <linux/export.h>
10#include <linux/percpu.h> 10#include <linux/percpu.h>
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/gfp.h> 12#include <linux/gfp.h>
diff --git a/kernel/softirq.c b/kernel/softirq.c
index fca82c32042..2c71d91efff 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -10,7 +10,7 @@
10 * Remote softirq infrastructure is by Jens Axboe. 10 * Remote softirq infrastructure is by Jens Axboe.
11 */ 11 */
12 12
13#include <linux/module.h> 13#include <linux/export.h>
14#include <linux/kernel_stat.h> 14#include <linux/kernel_stat.h>
15#include <linux/interrupt.h> 15#include <linux/interrupt.h>
16#include <linux/init.h> 16#include <linux/init.h>
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index be6517fb9c1..84c7d96918b 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -19,7 +19,7 @@
19#include <linux/spinlock.h> 19#include <linux/spinlock.h>
20#include <linux/interrupt.h> 20#include <linux/interrupt.h>
21#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
22#include <linux/module.h> 22#include <linux/export.h>
23 23
24/* 24/*
25 * If lockdep is enabled then we use the non-preemption spin-ops 25 * If lockdep is enabled then we use the non-preemption spin-ops
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 73ce23feaea..0febf61e1aa 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -24,7 +24,7 @@
24 * 24 *
25 */ 25 */
26 26
27#include <linux/module.h> 27#include <linux/export.h>
28#include <linux/mutex.h> 28#include <linux/mutex.h>
29#include <linux/percpu.h> 29#include <linux/percpu.h>
30#include <linux/preempt.h> 30#include <linux/preempt.h>
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index d20c6983aad..00fe55cc5a8 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -7,7 +7,7 @@
7 */ 7 */
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/kernel.h> 9#include <linux/kernel.h>
10#include <linux/module.h> 10#include <linux/export.h>
11#include <linux/kallsyms.h> 11#include <linux/kallsyms.h>
12#include <linux/stacktrace.h> 12#include <linux/stacktrace.h>
13 13
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index ba5070ce576..2f194e96571 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -12,7 +12,7 @@
12#include <linux/cpu.h> 12#include <linux/cpu.h>
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/kthread.h> 14#include <linux/kthread.h>
15#include <linux/module.h> 15#include <linux/export.h>
16#include <linux/percpu.h> 16#include <linux/percpu.h>
17#include <linux/sched.h> 17#include <linux/sched.h>
18#include <linux/stop_machine.h> 18#include <linux/stop_machine.h>
@@ -41,6 +41,7 @@ struct cpu_stopper {
41}; 41};
42 42
43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); 43static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
44static bool stop_machine_initialized = false;
44 45
45static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) 46static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
46{ 47{
@@ -386,6 +387,8 @@ static int __init cpu_stop_init(void)
386 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu); 387 cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
387 register_cpu_notifier(&cpu_stop_cpu_notifier); 388 register_cpu_notifier(&cpu_stop_cpu_notifier);
388 389
390 stop_machine_initialized = true;
391
389 return 0; 392 return 0;
390} 393}
391early_initcall(cpu_stop_init); 394early_initcall(cpu_stop_init);
@@ -485,6 +488,25 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
485 .num_threads = num_online_cpus(), 488 .num_threads = num_online_cpus(),
486 .active_cpus = cpus }; 489 .active_cpus = cpus };
487 490
491 if (!stop_machine_initialized) {
492 /*
493 * Handle the case where stop_machine() is called
494 * early in boot before stop_machine() has been
495 * initialized.
496 */
497 unsigned long flags;
498 int ret;
499
500 WARN_ON_ONCE(smdata.num_threads != 1);
501
502 local_irq_save(flags);
503 hard_irq_disable();
504 ret = (*fn)(data);
505 local_irq_restore(flags);
506
507 return ret;
508 }
509
488 /* Set the initial state and stop all online cpus. */ 510 /* Set the initial state and stop all online cpus. */
489 set_state(&smdata, STOPMACHINE_PREPARE); 511 set_state(&smdata, STOPMACHINE_PREPARE);
490 return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata); 512 return stop_cpus(cpu_online_mask, stop_machine_cpu_stop, &smdata);
diff --git a/kernel/sys.c b/kernel/sys.c
index a101ba36c44..481611fbd07 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -4,7 +4,7 @@
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */ 5 */
6 6
7#include <linux/module.h> 7#include <linux/export.h>
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/utsname.h> 9#include <linux/utsname.h>
10#include <linux/mman.h> 10#include <linux/mman.h>
@@ -12,6 +12,7 @@
12#include <linux/prctl.h> 12#include <linux/prctl.h>
13#include <linux/highuid.h> 13#include <linux/highuid.h>
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/kmod.h>
15#include <linux/perf_event.h> 16#include <linux/perf_event.h>
16#include <linux/resource.h> 17#include <linux/resource.h>
17#include <linux/kernel.h> 18#include <linux/kernel.h>
@@ -37,6 +38,8 @@
37#include <linux/fs_struct.h> 38#include <linux/fs_struct.h>
38#include <linux/gfp.h> 39#include <linux/gfp.h>
39#include <linux/syscore_ops.h> 40#include <linux/syscore_ops.h>
41#include <linux/version.h>
42#include <linux/ctype.h>
40 43
41#include <linux/compat.h> 44#include <linux/compat.h>
42#include <linux/syscalls.h> 45#include <linux/syscalls.h>
@@ -44,6 +47,8 @@
44#include <linux/user_namespace.h> 47#include <linux/user_namespace.h>
45 48
46#include <linux/kmsg_dump.h> 49#include <linux/kmsg_dump.h>
50/* Move somewhere else to avoid recompiling? */
51#include <generated/utsrelease.h>
47 52
48#include <asm/uaccess.h> 53#include <asm/uaccess.h>
49#include <asm/io.h> 54#include <asm/io.h>
@@ -621,11 +626,18 @@ static int set_user(struct cred *new)
621 if (!new_user) 626 if (!new_user)
622 return -EAGAIN; 627 return -EAGAIN;
623 628
629 /*
630 * We don't fail in case of NPROC limit excess here because too many
631 * poorly written programs don't check set*uid() return code, assuming
632 * it never fails if called by root. We may still enforce NPROC limit
633 * for programs doing set*uid()+execve() by harmlessly deferring the
634 * failure to the execve() stage.
635 */
624 if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) && 636 if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
625 new_user != INIT_USER) { 637 new_user != INIT_USER)
626 free_uid(new_user); 638 current->flags |= PF_NPROC_EXCEEDED;
627 return -EAGAIN; 639 else
628 } 640 current->flags &= ~PF_NPROC_EXCEEDED;
629 641
630 free_uid(new->user); 642 free_uid(new->user);
631 new->user = new_user; 643 new->user = new_user;
@@ -1154,6 +1166,34 @@ DECLARE_RWSEM(uts_sem);
1154#define override_architecture(name) 0 1166#define override_architecture(name) 0
1155#endif 1167#endif
1156 1168
1169/*
1170 * Work around broken programs that cannot handle "Linux 3.0".
1171 * Instead we map 3.x to 2.6.40+x, so e.g. 3.0 would be 2.6.40
1172 */
1173static int override_release(char __user *release, int len)
1174{
1175 int ret = 0;
1176 char buf[65];
1177
1178 if (current->personality & UNAME26) {
1179 char *rest = UTS_RELEASE;
1180 int ndots = 0;
1181 unsigned v;
1182
1183 while (*rest) {
1184 if (*rest == '.' && ++ndots >= 3)
1185 break;
1186 if (!isdigit(*rest) && *rest != '.')
1187 break;
1188 rest++;
1189 }
1190 v = ((LINUX_VERSION_CODE >> 8) & 0xff) + 40;
1191 snprintf(buf, len, "2.6.%u%s", v, rest);
1192 ret = copy_to_user(release, buf, len);
1193 }
1194 return ret;
1195}
1196
1157SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) 1197SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
1158{ 1198{
1159 int errno = 0; 1199 int errno = 0;
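The UNAME26 mapping added above can be exercised outside the kernel. This user-space sketch copies the scan loop from override_release(); the sample release strings and version codes are made up for illustration:

/* Map "3.x[extra]" to "2.6.(40+x)[extra]", mirroring override_release(). */
#include <ctype.h>
#include <stdio.h>

static void map_release(const char *release, unsigned version_code,
			char *buf, int len)
{
	const char *rest = release;
	int ndots = 0;
	unsigned v;

	while (*rest) {
		if (*rest == '.' && ++ndots >= 3)
			break;
		if (!isdigit((unsigned char)*rest) && *rest != '.')
			break;
		rest++;
	}
	v = ((version_code >> 8) & 0xff) + 40;
	snprintf(buf, len, "2.6.%u%s", v, rest);
}

int main(void)
{
	char buf[65];

	/* 3.0 (version code 0x030000): the comment's own example */
	map_release("3.0", (3 << 16) | (0 << 8), buf, sizeof(buf));
	printf("3.0       -> %s\n", buf);	/* 2.6.40 */

	/* a -rc suffix survives because the scan stops at '-' */
	map_release("3.1.0-rc4", (3 << 16) | (1 << 8), buf, sizeof(buf));
	printf("3.1.0-rc4 -> %s\n", buf);	/* 2.6.41-rc4 */
	return 0;
}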
@@ -1163,6 +1203,8 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name)
1163 errno = -EFAULT; 1203 errno = -EFAULT;
1164 up_read(&uts_sem); 1204 up_read(&uts_sem);
1165 1205
1206 if (!errno && override_release(name->release, sizeof(name->release)))
1207 errno = -EFAULT;
1166 if (!errno && override_architecture(name)) 1208 if (!errno && override_architecture(name))
1167 errno = -EFAULT; 1209 errno = -EFAULT;
1168 return errno; 1210 return errno;
@@ -1184,6 +1226,8 @@ SYSCALL_DEFINE1(uname, struct old_utsname __user *, name)
1184 error = -EFAULT; 1226 error = -EFAULT;
1185 up_read(&uts_sem); 1227 up_read(&uts_sem);
1186 1228
1229 if (!error && override_release(name->release, sizeof(name->release)))
1230 error = -EFAULT;
1187 if (!error && override_architecture(name)) 1231 if (!error && override_architecture(name))
1188 error = -EFAULT; 1232 error = -EFAULT;
1189 return error; 1233 return error;
@@ -1218,6 +1262,8 @@ SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name)
1218 1262
1219 if (!error && override_architecture(name)) 1263 if (!error && override_architecture(name))
1220 error = -EFAULT; 1264 error = -EFAULT;
1265 if (!error && override_release(name->release, sizeof(name->release)))
1266 error = -EFAULT;
1221 return error ? -EFAULT : 0; 1267 return error ? -EFAULT : 0;
1222} 1268}
1223#endif 1269#endif
@@ -1241,6 +1287,7 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
1241 memset(u->nodename + len, 0, sizeof(u->nodename) - len); 1287 memset(u->nodename + len, 0, sizeof(u->nodename) - len);
1242 errno = 0; 1288 errno = 0;
1243 } 1289 }
1290 uts_proc_notify(UTS_PROC_HOSTNAME);
1244 up_write(&uts_sem); 1291 up_write(&uts_sem);
1245 return errno; 1292 return errno;
1246} 1293}
@@ -1291,6 +1338,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
1291 memset(u->domainname + len, 0, sizeof(u->domainname) - len); 1338 memset(u->domainname + len, 0, sizeof(u->domainname) - len);
1292 errno = 0; 1339 errno = 0;
1293 } 1340 }
1341 uts_proc_notify(UTS_PROC_DOMAINNAME);
1294 up_write(&uts_sem); 1342 up_write(&uts_sem);
1295 return errno; 1343 return errno;
1296} 1344}
@@ -1714,6 +1762,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
1714 sizeof(me->comm) - 1) < 0) 1762 sizeof(me->comm) - 1) < 0)
1715 return -EFAULT; 1763 return -EFAULT;
1716 set_task_comm(me, comm); 1764 set_task_comm(me, comm);
1765 proc_comm_connector(me);
1717 return 0; 1766 return 0;
1718 case PR_GET_NAME: 1767 case PR_GET_NAME:
1719 get_task_comm(comm, me); 1768 get_task_comm(comm, me);
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 62cbc8877fe..47bfa16430d 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -16,7 +16,6 @@ asmlinkage long sys_ni_syscall(void)
16 return -ENOSYS; 16 return -ENOSYS;
17} 17}
18 18
19cond_syscall(sys_nfsservctl);
20cond_syscall(sys_quotactl); 19cond_syscall(sys_quotactl);
21cond_syscall(sys32_quotactl); 20cond_syscall(sys32_quotactl);
22cond_syscall(sys_acct); 21cond_syscall(sys_acct);
@@ -146,6 +145,10 @@ cond_syscall(sys_io_submit);
146cond_syscall(sys_io_cancel); 145cond_syscall(sys_io_cancel);
147cond_syscall(sys_io_getevents); 146cond_syscall(sys_io_getevents);
148cond_syscall(sys_syslog); 147cond_syscall(sys_syslog);
148cond_syscall(sys_process_vm_readv);
149cond_syscall(sys_process_vm_writev);
150cond_syscall(compat_sys_process_vm_readv);
151cond_syscall(compat_sys_process_vm_writev);
149 152
150/* arch-specific weak syscall entries */ 153/* arch-specific weak syscall entries */
151cond_syscall(sys_pciconfig_read); 154cond_syscall(sys_pciconfig_read);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 11d65b531e5..ae271964385 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -57,6 +57,7 @@
57#include <linux/pipe_fs_i.h> 57#include <linux/pipe_fs_i.h>
58#include <linux/oom.h> 58#include <linux/oom.h>
59#include <linux/kmod.h> 59#include <linux/kmod.h>
60#include <linux/capability.h>
60 61
61#include <asm/uaccess.h> 62#include <asm/uaccess.h>
62#include <asm/processor.h> 63#include <asm/processor.h>
@@ -134,6 +135,7 @@ static int minolduid;
134static int min_percpu_pagelist_fract = 8; 135static int min_percpu_pagelist_fract = 8;
135 136
136static int ngroups_max = NGROUPS_MAX; 137static int ngroups_max = NGROUPS_MAX;
138static const int cap_last_cap = CAP_LAST_CAP;
137 139
138#ifdef CONFIG_INOTIFY_USER 140#ifdef CONFIG_INOTIFY_USER
139#include <linux/inotify.h> 141#include <linux/inotify.h>
@@ -151,14 +153,6 @@ extern int pwrsw_enabled;
151extern int unaligned_enabled; 153extern int unaligned_enabled;
152#endif 154#endif
153 155
154#ifdef CONFIG_S390
155#ifdef CONFIG_MATHEMU
156extern int sysctl_ieee_emulation_warnings;
157#endif
158extern int sysctl_userprocess_debug;
159extern int spin_retry;
160#endif
161
162#ifdef CONFIG_IA64 156#ifdef CONFIG_IA64
163extern int no_unaligned_warning; 157extern int no_unaligned_warning;
164extern int unaligned_dump_stack; 158extern int unaligned_dump_stack;
@@ -379,6 +373,16 @@ static struct ctl_table kern_table[] = {
379 .extra2 = &one, 373 .extra2 = &one,
380 }, 374 },
381#endif 375#endif
376#ifdef CONFIG_CFS_BANDWIDTH
377 {
378 .procname = "sched_cfs_bandwidth_slice_us",
379 .data = &sysctl_sched_cfs_bandwidth_slice,
380 .maxlen = sizeof(unsigned int),
381 .mode = 0644,
382 .proc_handler = proc_dointvec_minmax,
383 .extra1 = &one,
384 },
385#endif
382#ifdef CONFIG_PROVE_LOCKING 386#ifdef CONFIG_PROVE_LOCKING
383 { 387 {
384 .procname = "prove_locking", 388 .procname = "prove_locking",
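Since the sched_cfs_bandwidth_slice_us entry above lives in kern_table, it should surface under the kernel sysctl directory on CONFIG_CFS_BANDWIDTH kernels. A minimal sketch for reading it from user space, assuming the conventional /proc/sys/kernel path:

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/kernel/sched_cfs_bandwidth_slice_us", "r");
	unsigned int slice_us;

	if (!f) {
		perror("sched_cfs_bandwidth_slice_us");
		return 1;
	}
	if (fscanf(f, "%u", &slice_us) == 1)
		printf("CFS bandwidth slice: %u us\n", slice_us);
	fclose(f);
	return 0;
}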
@@ -730,6 +734,13 @@ static struct ctl_table kern_table[] = {
730 .mode = 0444, 734 .mode = 0444,
731 .proc_handler = proc_dointvec, 735 .proc_handler = proc_dointvec,
732 }, 736 },
737 {
738 .procname = "cap_last_cap",
739 .data = (void *)&cap_last_cap,
740 .maxlen = sizeof(int),
741 .mode = 0444,
742 .proc_handler = proc_dointvec,
743 },
733#if defined(CONFIG_LOCKUP_DETECTOR) 744#if defined(CONFIG_LOCKUP_DETECTOR)
734 { 745 {
735 .procname = "watchdog", 746 .procname = "watchdog",
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index 3b8e028b960..6318b511afa 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1,6 +1,6 @@
1#include <linux/stat.h> 1#include <linux/stat.h>
2#include <linux/sysctl.h> 2#include <linux/sysctl.h>
3#include "../fs/xfs/linux-2.6/xfs_sysctl.h" 3#include "../fs/xfs/xfs_sysctl.h"
4#include <linux/sunrpc/debug.h> 4#include <linux/sunrpc/debug.h>
5#include <linux/string.h> 5#include <linux/string.h>
6#include <net/ip_vs.h> 6#include <net/ip_vs.h>
@@ -214,7 +214,7 @@ static const struct bin_table bin_net_ipv4_route_table[] = {
214 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" }, 214 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL, "gc_min_interval" },
215 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" }, 215 { CTL_INT, NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, "gc_min_interval_ms" },
216 { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" }, 216 { CTL_INT, NET_IPV4_ROUTE_GC_TIMEOUT, "gc_timeout" },
217 { CTL_INT, NET_IPV4_ROUTE_GC_INTERVAL, "gc_interval" }, 217 /* NET_IPV4_ROUTE_GC_INTERVAL "gc_interval" no longer used */
218 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" }, 218 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_LOAD, "redirect_load" },
219 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" }, 219 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_NUMBER, "redirect_number" },
220 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" }, 220 { CTL_INT, NET_IPV4_ROUTE_REDIRECT_SILENCE, "redirect_silence" },
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index 4e4932a7b36..362da653813 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -1,6 +1,6 @@
1#include <linux/stat.h> 1#include <linux/stat.h>
2#include <linux/sysctl.h> 2#include <linux/sysctl.h>
3#include "../fs/xfs/linux-2.6/xfs_sysctl.h" 3#include "../fs/xfs/xfs_sysctl.h"
4#include <linux/sunrpc/debug.h> 4#include <linux/sunrpc/debug.h>
5#include <linux/string.h> 5#include <linux/string.h>
6#include <net/ip_vs.h> 6#include <net/ip_vs.h>
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index e19ce1454ee..e66046456f4 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -655,6 +655,7 @@ static struct genl_ops taskstats_ops = {
655 .cmd = TASKSTATS_CMD_GET, 655 .cmd = TASKSTATS_CMD_GET,
656 .doit = taskstats_user_cmd, 656 .doit = taskstats_user_cmd,
657 .policy = taskstats_cmd_get_policy, 657 .policy = taskstats_cmd_get_policy,
658 .flags = GENL_ADMIN_PERM,
658}; 659};
659 660
660static struct genl_ops cgroupstats_ops = { 661static struct genl_ops cgroupstats_ops = {
diff --git a/kernel/time.c b/kernel/time.c
index d7760621452..73e416db0a1 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -27,7 +27,7 @@
27 * with nanosecond accuracy 27 * with nanosecond accuracy
28 */ 28 */
29 29
30#include <linux/module.h> 30#include <linux/export.h>
31#include <linux/timex.h> 31#include <linux/timex.h>
32#include <linux/capability.h> 32#include <linux/capability.h>
33#include <linux/clocksource.h> 33#include <linux/clocksource.h>
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index c340ca658f3..ce033c7aa2e 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -18,6 +18,7 @@
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */ 19 */
20#include <linux/device.h> 20#include <linux/device.h>
21#include <linux/export.h>
21#include <linux/file.h> 22#include <linux/file.h>
22#include <linux/posix-clock.h> 23#include <linux/posix-clock.h>
23#include <linux/slab.h> 24#include <linux/slab.h>
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 7e2e0817cbf..40420644d0b 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -139,7 +139,6 @@ static void tick_nohz_update_jiffies(ktime_t now)
139 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 139 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
140 unsigned long flags; 140 unsigned long flags;
141 141
142 cpumask_clear_cpu(cpu, nohz_cpu_mask);
143 ts->idle_waketime = now; 142 ts->idle_waketime = now;
144 143
145 local_irq_save(flags); 144 local_irq_save(flags);
@@ -418,9 +417,6 @@ void tick_nohz_stop_sched_tick(int inidle)
418 else 417 else
419 expires.tv64 = KTIME_MAX; 418 expires.tv64 = KTIME_MAX;
420 419
421 if (delta_jiffies > 1)
422 cpumask_set_cpu(cpu, nohz_cpu_mask);
423
424 /* Skip reprogram of event if it's not changed */ 420 /* Skip reprogram of event if it's not changed */

425 if (ts->tick_stopped && ktime_equal(expires, dev->next_event)) 421 if (ts->tick_stopped && ktime_equal(expires, dev->next_event))
426 goto out; 422 goto out;
@@ -470,7 +466,6 @@ void tick_nohz_stop_sched_tick(int inidle)
470 * softirq. 466 * softirq.
471 */ 467 */
472 tick_do_update_jiffies64(ktime_get()); 468 tick_do_update_jiffies64(ktime_get());
473 cpumask_clear_cpu(cpu, nohz_cpu_mask);
474 } 469 }
475 raise_softirq_irqoff(TIMER_SOFTIRQ); 470 raise_softirq_irqoff(TIMER_SOFTIRQ);
476out: 471out:
@@ -553,7 +548,6 @@ void tick_nohz_restart_sched_tick(void)
553 /* Update jiffies first */ 548 /* Update jiffies first */
554 select_nohz_load_balancer(0); 549 select_nohz_load_balancer(0);
555 tick_do_update_jiffies64(now); 550 tick_do_update_jiffies64(now);
556 cpumask_clear_cpu(cpu, nohz_cpu_mask);
557 551
558#ifndef CONFIG_VIRT_CPU_ACCOUNTING 552#ifndef CONFIG_VIRT_CPU_ACCOUNTING
559 /* 553 /*
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index a5d0a3a85dd..0b537f27b55 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -81,7 +81,7 @@ struct entry {
81/* 81/*
82 * Spinlock protecting the tables - not taken during lookup: 82 * Spinlock protecting the tables - not taken during lookup:
83 */ 83 */
84static DEFINE_SPINLOCK(table_lock); 84static DEFINE_RAW_SPINLOCK(table_lock);
85 85
86/* 86/*
87 * Per-CPU lookup locks for fast hash lookup: 87 * Per-CPU lookup locks for fast hash lookup:
@@ -188,7 +188,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm)
188 prev = NULL; 188 prev = NULL;
189 curr = *head; 189 curr = *head;
190 190
191 spin_lock(&table_lock); 191 raw_spin_lock(&table_lock);
192 /* 192 /*
193 * Make sure we have not raced with another CPU: 193 * Make sure we have not raced with another CPU:
194 */ 194 */
@@ -215,7 +215,7 @@ static struct entry *tstat_lookup(struct entry *entry, char *comm)
215 *head = curr; 215 *head = curr;
216 } 216 }
217 out_unlock: 217 out_unlock:
218 spin_unlock(&table_lock); 218 raw_spin_unlock(&table_lock);
219 219
220 return curr; 220 return curr;
221} 221}
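The timer_stats change is the usual mechanical raw-spinlock conversion: DEFINE_SPINLOCK becomes DEFINE_RAW_SPINLOCK and every lock/unlock on table_lock switches to the raw_ variants, so the lock keeps spinning even on configurations where ordinary spinlocks may sleep. A minimal sketch of the pattern on hypothetical data:

/* Sketch of the raw-spinlock conversion pattern (hypothetical data). */
struct example_entry { unsigned long count; };
static DEFINE_RAW_SPINLOCK(example_lock);

static void example_update(struct example_entry *e)
{
	raw_spin_lock(&example_lock);	/* was: spin_lock(&example_lock) */
	e->count++;
	raw_spin_unlock(&example_lock);	/* was: spin_unlock(&example_lock) */
}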
diff --git a/kernel/timer.c b/kernel/timer.c
index 8cff36119e4..dbaa62422b1 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -20,7 +20,7 @@
20 */ 20 */
21 21
22#include <linux/kernel_stat.h> 22#include <linux/kernel_stat.h>
23#include <linux/module.h> 23#include <linux/export.h>
24#include <linux/interrupt.h> 24#include <linux/interrupt.h>
25#include <linux/percpu.h> 25#include <linux/percpu.h>
26#include <linux/init.h> 26#include <linux/init.h>
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 2ad39e556cb..cd3134510f3 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -82,7 +82,7 @@ config EVENT_POWER_TRACING_DEPRECATED
82 power:power_frequency 82 power:power_frequency
83 This is for userspace compatibility 83 This is for userspace compatibility
84 and will vanish after 5 kernel iterations, 84 and will vanish after 5 kernel iterations,
85 namely 2.6.41. 85 namely 3.1.
86 86
87config CONTEXT_SWITCH_TRACER 87config CONTEXT_SWITCH_TRACER
88 bool 88 bool
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 761c510a06c..5f39a07fe5e 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -15,6 +15,8 @@ ifdef CONFIG_TRACING_BRANCHES
15KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING 15KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
16endif 16endif
17 17
18CFLAGS_trace_events_filter.o := -I$(src)
19
18# 20#
19# Make the trace clocks available generally: it's infrastructure 21# Make the trace clocks available generally: it's infrastructure
20# relied on by ptrace for example: 22# relied on by ptrace for example:
@@ -53,6 +55,9 @@ endif
53obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o 55obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
54obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o 56obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
55obj-$(CONFIG_TRACEPOINTS) += power-traces.o 57obj-$(CONFIG_TRACEPOINTS) += power-traces.o
58ifeq ($(CONFIG_PM_RUNTIME),y)
59obj-$(CONFIG_TRACEPOINTS) += rpm-traces.o
60endif
56ifeq ($(CONFIG_TRACING),y) 61ifeq ($(CONFIG_TRACING),y)
57obj-$(CONFIG_KGDB_KDB) += trace_kdb.o 62obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
58endif 63endif
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 6957aa298df..16fc34a0806 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -23,6 +23,7 @@
23#include <linux/mutex.h> 23#include <linux/mutex.h>
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/debugfs.h> 25#include <linux/debugfs.h>
26#include <linux/export.h>
26#include <linux/time.h> 27#include <linux/time.h>
27#include <linux/uaccess.h> 28#include <linux/uaccess.h>
28 29
@@ -206,6 +207,8 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
206 what |= MASK_TC_BIT(rw, RAHEAD); 207 what |= MASK_TC_BIT(rw, RAHEAD);
207 what |= MASK_TC_BIT(rw, META); 208 what |= MASK_TC_BIT(rw, META);
208 what |= MASK_TC_BIT(rw, DISCARD); 209 what |= MASK_TC_BIT(rw, DISCARD);
210 what |= MASK_TC_BIT(rw, FLUSH);
211 what |= MASK_TC_BIT(rw, FUA);
209 212
210 pid = tsk->pid; 213 pid = tsk->pid;
211 if (act_log_check(bt, what, sector, pid)) 214 if (act_log_check(bt, what, sector, pid))
@@ -1054,6 +1057,9 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
1054 goto out; 1057 goto out;
1055 } 1058 }
1056 1059
1060 if (tc & BLK_TC_FLUSH)
1061 rwbs[i++] = 'F';
1062
1057 if (tc & BLK_TC_DISCARD) 1063 if (tc & BLK_TC_DISCARD)
1058 rwbs[i++] = 'D'; 1064 rwbs[i++] = 'D';
1059 else if (tc & BLK_TC_WRITE) 1065 else if (tc & BLK_TC_WRITE)
@@ -1063,10 +1069,10 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
1063 else 1069 else
1064 rwbs[i++] = 'N'; 1070 rwbs[i++] = 'N';
1065 1071
1072 if (tc & BLK_TC_FUA)
1073 rwbs[i++] = 'F';
1066 if (tc & BLK_TC_AHEAD) 1074 if (tc & BLK_TC_AHEAD)
1067 rwbs[i++] = 'A'; 1075 rwbs[i++] = 'A';
1068 if (tc & BLK_TC_BARRIER)
1069 rwbs[i++] = 'B';
1070 if (tc & BLK_TC_SYNC) 1076 if (tc & BLK_TC_SYNC)
1071 rwbs[i++] = 'S'; 1077 rwbs[i++] = 'S';
1072 if (tc & BLK_TC_META) 1078 if (tc & BLK_TC_META)
@@ -1132,7 +1138,7 @@ typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act);
1132 1138
1133static int blk_log_action_classic(struct trace_iterator *iter, const char *act) 1139static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
1134{ 1140{
1135 char rwbs[6]; 1141 char rwbs[RWBS_LEN];
1136 unsigned long long ts = iter->ts; 1142 unsigned long long ts = iter->ts;
1137 unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC); 1143 unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC);
1138 unsigned secs = (unsigned long)ts; 1144 unsigned secs = (unsigned long)ts;
@@ -1148,7 +1154,7 @@ static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
1148 1154
1149static int blk_log_action(struct trace_iterator *iter, const char *act) 1155static int blk_log_action(struct trace_iterator *iter, const char *act)
1150{ 1156{
1151 char rwbs[6]; 1157 char rwbs[RWBS_LEN];
1152 const struct blk_io_trace *t = te_blk_io_trace(iter->ent); 1158 const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
1153 1159
1154 fill_rwbs(rwbs, t); 1160 fill_rwbs(rwbs, t);
@@ -1561,7 +1567,7 @@ static const struct {
1561} mask_maps[] = { 1567} mask_maps[] = {
1562 { BLK_TC_READ, "read" }, 1568 { BLK_TC_READ, "read" },
1563 { BLK_TC_WRITE, "write" }, 1569 { BLK_TC_WRITE, "write" },
1564 { BLK_TC_BARRIER, "barrier" }, 1570 { BLK_TC_FLUSH, "flush" },
1565 { BLK_TC_SYNC, "sync" }, 1571 { BLK_TC_SYNC, "sync" },
1566 { BLK_TC_QUEUE, "queue" }, 1572 { BLK_TC_QUEUE, "queue" },
1567 { BLK_TC_REQUEUE, "requeue" }, 1573 { BLK_TC_REQUEUE, "requeue" },
@@ -1573,6 +1579,7 @@ static const struct {
1573 { BLK_TC_META, "meta" }, 1579 { BLK_TC_META, "meta" },
1574 { BLK_TC_DISCARD, "discard" }, 1580 { BLK_TC_DISCARD, "discard" },
1575 { BLK_TC_DRV_DATA, "drv_data" }, 1581 { BLK_TC_DRV_DATA, "drv_data" },
1582 { BLK_TC_FUA, "fua" },
1576}; 1583};
1577 1584
1578static int blk_trace_str2mask(const char *str) 1585static int blk_trace_str2mask(const char *str)
@@ -1788,6 +1795,9 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1788{ 1795{
1789 int i = 0; 1796 int i = 0;
1790 1797
1798 if (rw & REQ_FLUSH)
1799 rwbs[i++] = 'F';
1800
1791 if (rw & WRITE) 1801 if (rw & WRITE)
1792 rwbs[i++] = 'W'; 1802 rwbs[i++] = 'W';
1793 else if (rw & REQ_DISCARD) 1803 else if (rw & REQ_DISCARD)
@@ -1797,6 +1807,8 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1797 else 1807 else
1798 rwbs[i++] = 'N'; 1808 rwbs[i++] = 'N';
1799 1809
1810 if (rw & REQ_FUA)
1811 rwbs[i++] = 'F';
1800 if (rw & REQ_RAHEAD) 1812 if (rw & REQ_RAHEAD)
1801 rwbs[i++] = 'A'; 1813 rwbs[i++] = 'A';
1802 if (rw & REQ_SYNC) 1814 if (rw & REQ_SYNC)
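After this change the rwbs string can carry 'F' in two positions: a leading 'F' for a flush and another 'F' right after the direction letter for FUA, while the old barrier 'B' flag disappears; the buffers in blk_log_action*() grow from 6 bytes to RWBS_LEN to make room. A standalone sketch of the resulting decode order (the X_* flag bits are stand-ins for illustration, not the kernel's REQ_* values):

#include <stdio.h>

/* Stand-in flag bits for illustration only; not the kernel's REQ_* values. */
#define X_FLUSH   (1u << 0)
#define X_WRITE   (1u << 1)
#define X_DISCARD (1u << 2)
#define X_FUA     (1u << 3)
#define X_RAHEAD  (1u << 4)
#define X_SYNC    (1u << 5)

static void fill_rwbs(char *rwbs, unsigned int rw, int bytes)
{
	int i = 0;

	if (rw & X_FLUSH)
		rwbs[i++] = 'F';	/* new: flush comes first */
	if (rw & X_WRITE)
		rwbs[i++] = 'W';
	else if (rw & X_DISCARD)
		rwbs[i++] = 'D';
	else if (bytes)
		rwbs[i++] = 'R';
	else
		rwbs[i++] = 'N';
	if (rw & X_FUA)
		rwbs[i++] = 'F';	/* new: FUA after the direction */
	if (rw & X_RAHEAD)
		rwbs[i++] = 'A';
	if (rw & X_SYNC)
		rwbs[i++] = 'S';
	rwbs[i] = '\0';
}

int main(void)
{
	char rwbs[8];

	fill_rwbs(rwbs, X_FLUSH | X_WRITE | X_FUA | X_SYNC, 4096);
	printf("%s\n", rwbs);	/* prints "FWFS" */
	return 0;
}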
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index c3e4575e782..900b409543d 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -22,6 +22,7 @@
22#include <linux/hardirq.h> 22#include <linux/hardirq.h>
23#include <linux/kthread.h> 23#include <linux/kthread.h>
24#include <linux/uaccess.h> 24#include <linux/uaccess.h>
25#include <linux/module.h>
25#include <linux/ftrace.h> 26#include <linux/ftrace.h>
26#include <linux/sysctl.h> 27#include <linux/sysctl.h>
27#include <linux/slab.h> 28#include <linux/slab.h>
@@ -3863,6 +3864,14 @@ void ftrace_kill(void)
3863} 3864}
3864 3865
3865/** 3866/**
3867 * Test if ftrace is dead or not.
3868 */
3869int ftrace_is_dead(void)
3870{
3871 return ftrace_disabled;
3872}
3873
3874/**
3866 * register_ftrace_function - register a function for profiling 3875 * register_ftrace_function - register a function for profiling
3867 * @ops - ops structure that holds the function for profiling. 3876 * @ops - ops structure that holds the function for profiling.
3868 * 3877 *
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 731201bf4ac..f5b7b5c1195 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -478,7 +478,7 @@ struct ring_buffer_per_cpu {
478 int cpu; 478 int cpu;
479 atomic_t record_disabled; 479 atomic_t record_disabled;
480 struct ring_buffer *buffer; 480 struct ring_buffer *buffer;
481 spinlock_t reader_lock; /* serialize readers */ 481 raw_spinlock_t reader_lock; /* serialize readers */
482 arch_spinlock_t lock; 482 arch_spinlock_t lock;
483 struct lock_class_key lock_key; 483 struct lock_class_key lock_key;
484 struct list_head *pages; 484 struct list_head *pages;
@@ -488,12 +488,14 @@ struct ring_buffer_per_cpu {
488 struct buffer_page *reader_page; 488 struct buffer_page *reader_page;
489 unsigned long lost_events; 489 unsigned long lost_events;
490 unsigned long last_overrun; 490 unsigned long last_overrun;
491 local_t entries_bytes;
491 local_t commit_overrun; 492 local_t commit_overrun;
492 local_t overrun; 493 local_t overrun;
493 local_t entries; 494 local_t entries;
494 local_t committing; 495 local_t committing;
495 local_t commits; 496 local_t commits;
496 unsigned long read; 497 unsigned long read;
498 unsigned long read_bytes;
497 u64 write_stamp; 499 u64 write_stamp;
498 u64 read_stamp; 500 u64 read_stamp;
499}; 501};
@@ -1062,7 +1064,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
1062 1064
1063 cpu_buffer->cpu = cpu; 1065 cpu_buffer->cpu = cpu;
1064 cpu_buffer->buffer = buffer; 1066 cpu_buffer->buffer = buffer;
1065 spin_lock_init(&cpu_buffer->reader_lock); 1067 raw_spin_lock_init(&cpu_buffer->reader_lock);
1066 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key); 1068 lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
1067 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; 1069 cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
1068 1070
@@ -1259,7 +1261,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1259 struct list_head *p; 1261 struct list_head *p;
1260 unsigned i; 1262 unsigned i;
1261 1263
1262 spin_lock_irq(&cpu_buffer->reader_lock); 1264 raw_spin_lock_irq(&cpu_buffer->reader_lock);
1263 rb_head_page_deactivate(cpu_buffer); 1265 rb_head_page_deactivate(cpu_buffer);
1264 1266
1265 for (i = 0; i < nr_pages; i++) { 1267 for (i = 0; i < nr_pages; i++) {
@@ -1277,7 +1279,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
1277 rb_check_pages(cpu_buffer); 1279 rb_check_pages(cpu_buffer);
1278 1280
1279out: 1281out:
1280 spin_unlock_irq(&cpu_buffer->reader_lock); 1282 raw_spin_unlock_irq(&cpu_buffer->reader_lock);
1281} 1283}
1282 1284
1283static void 1285static void
@@ -1288,7 +1290,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1288 struct list_head *p; 1290 struct list_head *p;
1289 unsigned i; 1291 unsigned i;
1290 1292
1291 spin_lock_irq(&cpu_buffer->reader_lock); 1293 raw_spin_lock_irq(&cpu_buffer->reader_lock);
1292 rb_head_page_deactivate(cpu_buffer); 1294 rb_head_page_deactivate(cpu_buffer);
1293 1295
1294 for (i = 0; i < nr_pages; i++) { 1296 for (i = 0; i < nr_pages; i++) {
@@ -1303,7 +1305,7 @@ rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
1303 rb_check_pages(cpu_buffer); 1305 rb_check_pages(cpu_buffer);
1304 1306
1305out: 1307out:
1306 spin_unlock_irq(&cpu_buffer->reader_lock); 1308 raw_spin_unlock_irq(&cpu_buffer->reader_lock);
1307} 1309}
1308 1310
1309/** 1311/**
@@ -1708,6 +1710,7 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
1708 * the counters. 1710 * the counters.
1709 */ 1711 */
1710 local_add(entries, &cpu_buffer->overrun); 1712 local_add(entries, &cpu_buffer->overrun);
1713 local_sub(BUF_PAGE_SIZE, &cpu_buffer->entries_bytes);
1711 1714
1712 /* 1715 /*
1713 * The entries will be zeroed out when we move the 1716 * The entries will be zeroed out when we move the
@@ -1863,6 +1866,9 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
1863 event = __rb_page_index(tail_page, tail); 1866 event = __rb_page_index(tail_page, tail);
1864 kmemcheck_annotate_bitfield(event, bitfield); 1867 kmemcheck_annotate_bitfield(event, bitfield);
1865 1868
1869 /* account for padding bytes */
1870 local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
1871
1866 /* 1872 /*
1867 * Save the original length to the meta data. 1873 * Save the original length to the meta data.
1868 * This will be used by the reader to add lost event 1874 * This will be used by the reader to add lost event
@@ -2054,6 +2060,9 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
2054 if (!tail) 2060 if (!tail)
2055 tail_page->page->time_stamp = ts; 2061 tail_page->page->time_stamp = ts;
2056 2062
2063 /* account for these added bytes */
2064 local_add(length, &cpu_buffer->entries_bytes);
2065
2057 return event; 2066 return event;
2058} 2067}
2059 2068
@@ -2076,6 +2085,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
2076 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { 2085 if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
2077 unsigned long write_mask = 2086 unsigned long write_mask =
2078 local_read(&bpage->write) & ~RB_WRITE_MASK; 2087 local_read(&bpage->write) & ~RB_WRITE_MASK;
2088 unsigned long event_length = rb_event_length(event);
2079 /* 2089 /*
2080 * This is on the tail page. It is possible that 2090 * This is on the tail page. It is possible that
2081 * a write could come in and move the tail page 2091 * a write could come in and move the tail page
@@ -2085,8 +2095,11 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
2085 old_index += write_mask; 2095 old_index += write_mask;
2086 new_index += write_mask; 2096 new_index += write_mask;
2087 index = local_cmpxchg(&bpage->write, old_index, new_index); 2097 index = local_cmpxchg(&bpage->write, old_index, new_index);
2088 if (index == old_index) 2098 if (index == old_index) {
2099 /* update counters */
2100 local_sub(event_length, &cpu_buffer->entries_bytes);
2089 return 1; 2101 return 1;
2102 }
2090 } 2103 }
2091 2104
2092 /* could not discard */ 2105 /* could not discard */
@@ -2661,6 +2674,58 @@ rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer)
2661} 2674}
2662 2675
2663/** 2676/**
2677 * ring_buffer_oldest_event_ts - get the oldest event timestamp from the buffer
2678 * @buffer: The ring buffer
2679 * @cpu: The per CPU buffer to read from.
2680 */
2681unsigned long ring_buffer_oldest_event_ts(struct ring_buffer *buffer, int cpu)
2682{
2683 unsigned long flags;
2684 struct ring_buffer_per_cpu *cpu_buffer;
2685 struct buffer_page *bpage;
2686 unsigned long ret;
2687
2688 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2689 return 0;
2690
2691 cpu_buffer = buffer->buffers[cpu];
2692 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2693 /*
2694 * if the tail is on reader_page, oldest time stamp is on the reader
2695 * page
2696 */
2697 if (cpu_buffer->tail_page == cpu_buffer->reader_page)
2698 bpage = cpu_buffer->reader_page;
2699 else
2700 bpage = rb_set_head_page(cpu_buffer);
2701 ret = bpage->page->time_stamp;
2702 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2703
2704 return ret;
2705}
2706EXPORT_SYMBOL_GPL(ring_buffer_oldest_event_ts);
2707
2708/**
2709 * ring_buffer_bytes_cpu - get the number of bytes consumed in a cpu buffer
2710 * @buffer: The ring buffer
2711 * @cpu: The per CPU buffer to read from.
2712 */
2713unsigned long ring_buffer_bytes_cpu(struct ring_buffer *buffer, int cpu)
2714{
2715 struct ring_buffer_per_cpu *cpu_buffer;
2716 unsigned long ret;
2717
2718 if (!cpumask_test_cpu(cpu, buffer->cpumask))
2719 return 0;
2720
2721 cpu_buffer = buffer->buffers[cpu];
2722 ret = local_read(&cpu_buffer->entries_bytes) - cpu_buffer->read_bytes;
2723
2724 return ret;
2725}
2726EXPORT_SYMBOL_GPL(ring_buffer_bytes_cpu);
2727
2728/**
2664 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer 2729 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
2665 * @buffer: The ring buffer 2730 * @buffer: The ring buffer
2666 * @cpu: The per CPU buffer to get the entries from. 2731 * @cpu: The per CPU buffer to get the entries from.
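ring_buffer_oldest_event_ts() reads the head (or reader) page's time stamp under the reader lock, and ring_buffer_bytes_cpu() reports entries_bytes minus read_bytes, i.e. the bytes still sitting in that CPU's buffer. A hedged sketch of a caller summing the per-CPU byte counts (not from this patch, just an illustration of the new API):

/* Sketch: total unread bytes across all CPUs of a ring buffer. */
static unsigned long example_total_bytes(struct ring_buffer *buffer)
{
	unsigned long bytes = 0;
	int cpu;

	for_each_online_cpu(cpu)
		bytes += ring_buffer_bytes_cpu(buffer, cpu);

	return bytes;
}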
@@ -2804,9 +2869,9 @@ void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
2804 2869
2805 cpu_buffer = iter->cpu_buffer; 2870 cpu_buffer = iter->cpu_buffer;
2806 2871
2807 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 2872 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
2808 rb_iter_reset(iter); 2873 rb_iter_reset(iter);
2809 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 2874 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
2810} 2875}
2811EXPORT_SYMBOL_GPL(ring_buffer_iter_reset); 2876EXPORT_SYMBOL_GPL(ring_buffer_iter_reset);
2812 2877
@@ -3265,12 +3330,12 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts,
3265 again: 3330 again:
3266 local_irq_save(flags); 3331 local_irq_save(flags);
3267 if (dolock) 3332 if (dolock)
3268 spin_lock(&cpu_buffer->reader_lock); 3333 raw_spin_lock(&cpu_buffer->reader_lock);
3269 event = rb_buffer_peek(cpu_buffer, ts, lost_events); 3334 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3270 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3335 if (event && event->type_len == RINGBUF_TYPE_PADDING)
3271 rb_advance_reader(cpu_buffer); 3336 rb_advance_reader(cpu_buffer);
3272 if (dolock) 3337 if (dolock)
3273 spin_unlock(&cpu_buffer->reader_lock); 3338 raw_spin_unlock(&cpu_buffer->reader_lock);
3274 local_irq_restore(flags); 3339 local_irq_restore(flags);
3275 3340
3276 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3341 if (event && event->type_len == RINGBUF_TYPE_PADDING)
@@ -3295,9 +3360,9 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
3295 unsigned long flags; 3360 unsigned long flags;
3296 3361
3297 again: 3362 again:
3298 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3363 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3299 event = rb_iter_peek(iter, ts); 3364 event = rb_iter_peek(iter, ts);
3300 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3365 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3301 3366
3302 if (event && event->type_len == RINGBUF_TYPE_PADDING) 3367 if (event && event->type_len == RINGBUF_TYPE_PADDING)
3303 goto again; 3368 goto again;
@@ -3337,7 +3402,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
3337 cpu_buffer = buffer->buffers[cpu]; 3402 cpu_buffer = buffer->buffers[cpu];
3338 local_irq_save(flags); 3403 local_irq_save(flags);
3339 if (dolock) 3404 if (dolock)
3340 spin_lock(&cpu_buffer->reader_lock); 3405 raw_spin_lock(&cpu_buffer->reader_lock);
3341 3406
3342 event = rb_buffer_peek(cpu_buffer, ts, lost_events); 3407 event = rb_buffer_peek(cpu_buffer, ts, lost_events);
3343 if (event) { 3408 if (event) {
@@ -3346,7 +3411,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts,
3346 } 3411 }
3347 3412
3348 if (dolock) 3413 if (dolock)
3349 spin_unlock(&cpu_buffer->reader_lock); 3414 raw_spin_unlock(&cpu_buffer->reader_lock);
3350 local_irq_restore(flags); 3415 local_irq_restore(flags);
3351 3416
3352 out: 3417 out:
@@ -3438,11 +3503,11 @@ ring_buffer_read_start(struct ring_buffer_iter *iter)
3438 3503
3439 cpu_buffer = iter->cpu_buffer; 3504 cpu_buffer = iter->cpu_buffer;
3440 3505
3441 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3506 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3442 arch_spin_lock(&cpu_buffer->lock); 3507 arch_spin_lock(&cpu_buffer->lock);
3443 rb_iter_reset(iter); 3508 rb_iter_reset(iter);
3444 arch_spin_unlock(&cpu_buffer->lock); 3509 arch_spin_unlock(&cpu_buffer->lock);
3445 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3510 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3446} 3511}
3447EXPORT_SYMBOL_GPL(ring_buffer_read_start); 3512EXPORT_SYMBOL_GPL(ring_buffer_read_start);
3448 3513
@@ -3477,7 +3542,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
3477 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; 3542 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
3478 unsigned long flags; 3543 unsigned long flags;
3479 3544
3480 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3545 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3481 again: 3546 again:
3482 event = rb_iter_peek(iter, ts); 3547 event = rb_iter_peek(iter, ts);
3483 if (!event) 3548 if (!event)
@@ -3488,7 +3553,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
3488 3553
3489 rb_advance_iter(iter); 3554 rb_advance_iter(iter);
3490 out: 3555 out:
3491 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3556 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3492 3557
3493 return event; 3558 return event;
3494} 3559}
@@ -3527,11 +3592,13 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
3527 cpu_buffer->reader_page->read = 0; 3592 cpu_buffer->reader_page->read = 0;
3528 3593
3529 local_set(&cpu_buffer->commit_overrun, 0); 3594 local_set(&cpu_buffer->commit_overrun, 0);
3595 local_set(&cpu_buffer->entries_bytes, 0);
3530 local_set(&cpu_buffer->overrun, 0); 3596 local_set(&cpu_buffer->overrun, 0);
3531 local_set(&cpu_buffer->entries, 0); 3597 local_set(&cpu_buffer->entries, 0);
3532 local_set(&cpu_buffer->committing, 0); 3598 local_set(&cpu_buffer->committing, 0);
3533 local_set(&cpu_buffer->commits, 0); 3599 local_set(&cpu_buffer->commits, 0);
3534 cpu_buffer->read = 0; 3600 cpu_buffer->read = 0;
3601 cpu_buffer->read_bytes = 0;
3535 3602
3536 cpu_buffer->write_stamp = 0; 3603 cpu_buffer->write_stamp = 0;
3537 cpu_buffer->read_stamp = 0; 3604 cpu_buffer->read_stamp = 0;
@@ -3557,7 +3624,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
3557 3624
3558 atomic_inc(&cpu_buffer->record_disabled); 3625 atomic_inc(&cpu_buffer->record_disabled);
3559 3626
3560 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3627 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3561 3628
3562 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing))) 3629 if (RB_WARN_ON(cpu_buffer, local_read(&cpu_buffer->committing)))
3563 goto out; 3630 goto out;
@@ -3569,7 +3636,7 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
3569 arch_spin_unlock(&cpu_buffer->lock); 3636 arch_spin_unlock(&cpu_buffer->lock);
3570 3637
3571 out: 3638 out:
3572 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 3639 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3573 3640
3574 atomic_dec(&cpu_buffer->record_disabled); 3641 atomic_dec(&cpu_buffer->record_disabled);
3575} 3642}
@@ -3607,10 +3674,10 @@ int ring_buffer_empty(struct ring_buffer *buffer)
3607 cpu_buffer = buffer->buffers[cpu]; 3674 cpu_buffer = buffer->buffers[cpu];
3608 local_irq_save(flags); 3675 local_irq_save(flags);
3609 if (dolock) 3676 if (dolock)
3610 spin_lock(&cpu_buffer->reader_lock); 3677 raw_spin_lock(&cpu_buffer->reader_lock);
3611 ret = rb_per_cpu_empty(cpu_buffer); 3678 ret = rb_per_cpu_empty(cpu_buffer);
3612 if (dolock) 3679 if (dolock)
3613 spin_unlock(&cpu_buffer->reader_lock); 3680 raw_spin_unlock(&cpu_buffer->reader_lock);
3614 local_irq_restore(flags); 3681 local_irq_restore(flags);
3615 3682
3616 if (!ret) 3683 if (!ret)
@@ -3641,10 +3708,10 @@ int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
3641 cpu_buffer = buffer->buffers[cpu]; 3708 cpu_buffer = buffer->buffers[cpu];
3642 local_irq_save(flags); 3709 local_irq_save(flags);
3643 if (dolock) 3710 if (dolock)
3644 spin_lock(&cpu_buffer->reader_lock); 3711 raw_spin_lock(&cpu_buffer->reader_lock);
3645 ret = rb_per_cpu_empty(cpu_buffer); 3712 ret = rb_per_cpu_empty(cpu_buffer);
3646 if (dolock) 3713 if (dolock)
3647 spin_unlock(&cpu_buffer->reader_lock); 3714 raw_spin_unlock(&cpu_buffer->reader_lock);
3648 local_irq_restore(flags); 3715 local_irq_restore(flags);
3649 3716
3650 return ret; 3717 return ret;
@@ -3841,7 +3908,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3841 if (!bpage) 3908 if (!bpage)
3842 goto out; 3909 goto out;
3843 3910
3844 spin_lock_irqsave(&cpu_buffer->reader_lock, flags); 3911 raw_spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
3845 3912
3846 reader = rb_get_reader_page(cpu_buffer); 3913 reader = rb_get_reader_page(cpu_buffer);
3847 if (!reader) 3914 if (!reader)
@@ -3918,6 +3985,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3918 } else { 3985 } else {
3919 /* update the entry counter */ 3986 /* update the entry counter */
3920 cpu_buffer->read += rb_page_entries(reader); 3987 cpu_buffer->read += rb_page_entries(reader);
3988 cpu_buffer->read_bytes += BUF_PAGE_SIZE;
3921 3989
3922 /* swap the pages */ 3990 /* swap the pages */
3923 rb_init_page(bpage); 3991 rb_init_page(bpage);
@@ -3964,7 +4032,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
3964 memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit); 4032 memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
3965 4033
3966 out_unlock: 4034 out_unlock:
3967 spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); 4035 raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
3968 4036
3969 out: 4037 out:
3970 return ret; 4038 return ret;
diff --git a/kernel/trace/rpm-traces.c b/kernel/trace/rpm-traces.c
new file mode 100644
index 00000000000..4b3b5eaf94d
--- /dev/null
+++ b/kernel/trace/rpm-traces.c
@@ -0,0 +1,20 @@
1/*
2 * Power trace points
3 *
4 * Copyright (C) 2009 Ming Lei <ming.lei@canonical.com>
5 */
6
7#include <linux/string.h>
8#include <linux/types.h>
9#include <linux/workqueue.h>
10#include <linux/sched.h>
11#include <linux/module.h>
12#include <linux/usb.h>
13
14#define CREATE_TRACE_POINTS
15#include <trace/events/rpm.h>
16
17EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_return_int);
18EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_idle);
19EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_suspend);
20EXPORT_TRACEPOINT_SYMBOL_GPL(rpm_resume);
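This new file only instantiates the runtime-PM tracepoints declared in <trace/events/rpm.h> (CREATE_TRACE_POINTS must be defined in exactly one translation unit) and exports them so modular PM code can emit them. A hedged sketch of an emitter, assuming the (struct device *, int) prototype that the rpm_suspend event appears to declare in that header:

/* Sketch only: firing one of the exported tracepoints from runtime-PM code.
 * The (dev, rpmflags) argument list is an assumption taken from
 * <trace/events/rpm.h>, not something defined by this file.
 */
static int example_rpm_suspend(struct device *dev, int rpmflags)
{
	trace_rpm_suspend(dev, rpmflags);	/* no-op unless the event is enabled */
	/* ... the real suspend work would follow ... */
	return 0;
}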
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e5df02c69b1..f2bd275bb60 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -341,7 +341,7 @@ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
341 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE; 341 TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE;
342 342
343static int trace_stop_count; 343static int trace_stop_count;
344static DEFINE_SPINLOCK(tracing_start_lock); 344static DEFINE_RAW_SPINLOCK(tracing_start_lock);
345 345
346static void wakeup_work_handler(struct work_struct *work) 346static void wakeup_work_handler(struct work_struct *work)
347{ 347{
@@ -435,6 +435,7 @@ static struct {
435} trace_clocks[] = { 435} trace_clocks[] = {
436 { trace_clock_local, "local" }, 436 { trace_clock_local, "local" },
437 { trace_clock_global, "global" }, 437 { trace_clock_global, "global" },
438 { trace_clock_counter, "counter" },
438}; 439};
439 440
440int trace_clock_id; 441int trace_clock_id;
@@ -960,7 +961,7 @@ void tracing_start(void)
960 if (tracing_disabled) 961 if (tracing_disabled)
961 return; 962 return;
962 963
963 spin_lock_irqsave(&tracing_start_lock, flags); 964 raw_spin_lock_irqsave(&tracing_start_lock, flags);
964 if (--trace_stop_count) { 965 if (--trace_stop_count) {
965 if (trace_stop_count < 0) { 966 if (trace_stop_count < 0) {
966 /* Someone screwed up their debugging */ 967 /* Someone screwed up their debugging */
@@ -985,7 +986,7 @@ void tracing_start(void)
985 986
986 ftrace_start(); 987 ftrace_start();
987 out: 988 out:
988 spin_unlock_irqrestore(&tracing_start_lock, flags); 989 raw_spin_unlock_irqrestore(&tracing_start_lock, flags);
989} 990}
990 991
991/** 992/**
@@ -1000,7 +1001,7 @@ void tracing_stop(void)
1000 unsigned long flags; 1001 unsigned long flags;
1001 1002
1002 ftrace_stop(); 1003 ftrace_stop();
1003 spin_lock_irqsave(&tracing_start_lock, flags); 1004 raw_spin_lock_irqsave(&tracing_start_lock, flags);
1004 if (trace_stop_count++) 1005 if (trace_stop_count++)
1005 goto out; 1006 goto out;
1006 1007
@@ -1018,7 +1019,7 @@ void tracing_stop(void)
1018 arch_spin_unlock(&ftrace_max_lock); 1019 arch_spin_unlock(&ftrace_max_lock);
1019 1020
1020 out: 1021 out:
1021 spin_unlock_irqrestore(&tracing_start_lock, flags); 1022 raw_spin_unlock_irqrestore(&tracing_start_lock, flags);
1022} 1023}
1023 1024
1024void trace_stop_cmdline_recording(void); 1025void trace_stop_cmdline_recording(void);
@@ -2159,6 +2160,14 @@ void trace_default_header(struct seq_file *m)
2159 } 2160 }
2160} 2161}
2161 2162
2163static void test_ftrace_alive(struct seq_file *m)
2164{
2165 if (!ftrace_is_dead())
2166 return;
2167 seq_printf(m, "# WARNING: FUNCTION TRACING IS CORRUPTED\n");
2168 seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n");
2169}
2170
2162static int s_show(struct seq_file *m, void *v) 2171static int s_show(struct seq_file *m, void *v)
2163{ 2172{
2164 struct trace_iterator *iter = v; 2173 struct trace_iterator *iter = v;
@@ -2168,6 +2177,7 @@ static int s_show(struct seq_file *m, void *v)
2168 if (iter->tr) { 2177 if (iter->tr) {
2169 seq_printf(m, "# tracer: %s\n", iter->trace->name); 2178 seq_printf(m, "# tracer: %s\n", iter->trace->name);
2170 seq_puts(m, "#\n"); 2179 seq_puts(m, "#\n");
2180 test_ftrace_alive(m);
2171 } 2181 }
2172 if (iter->trace && iter->trace->print_header) 2182 if (iter->trace && iter->trace->print_header)
2173 iter->trace->print_header(m); 2183 iter->trace->print_header(m);
@@ -2710,9 +2720,9 @@ static const char readme_msg[] =
2710 "# cat /sys/kernel/debug/tracing/trace_options\n" 2720 "# cat /sys/kernel/debug/tracing/trace_options\n"
2711 "noprint-parent nosym-offset nosym-addr noverbose\n" 2721 "noprint-parent nosym-offset nosym-addr noverbose\n"
2712 "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" 2722 "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n"
2713 "# echo 1 > /sys/kernel/debug/tracing/tracing_enabled\n" 2723 "# echo 1 > /sys/kernel/debug/tracing/tracing_on\n"
2714 "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n" 2724 "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n"
2715 "# echo 0 > /sys/kernel/debug/tracing/tracing_enabled\n" 2725 "# echo 0 > /sys/kernel/debug/tracing/tracing_on\n"
2716; 2726;
2717 2727
2718static ssize_t 2728static ssize_t
@@ -3569,6 +3579,30 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
3569} 3579}
3570 3580
3571static ssize_t 3581static ssize_t
3582tracing_total_entries_read(struct file *filp, char __user *ubuf,
3583 size_t cnt, loff_t *ppos)
3584{
3585 struct trace_array *tr = filp->private_data;
3586 char buf[64];
3587 int r, cpu;
3588 unsigned long size = 0, expanded_size = 0;
3589
3590 mutex_lock(&trace_types_lock);
3591 for_each_tracing_cpu(cpu) {
3592 size += tr->entries >> 10;
3593 if (!ring_buffer_expanded)
3594 expanded_size += trace_buf_size >> 10;
3595 }
3596 if (ring_buffer_expanded)
3597 r = sprintf(buf, "%lu\n", size);
3598 else
3599 r = sprintf(buf, "%lu (expanded: %lu)\n", size, expanded_size);
3600 mutex_unlock(&trace_types_lock);
3601
3602 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
3603}
3604
3605static ssize_t
3572tracing_free_buffer_write(struct file *filp, const char __user *ubuf, 3606tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
3573 size_t cnt, loff_t *ppos) 3607 size_t cnt, loff_t *ppos)
3574{ 3608{
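tracing_total_entries_read() sums the per-CPU buffer sizes in KiB and, while the ring buffer is still at its small boot-time size, also prints what the total will be once it is expanded. It backs the buffer_total_size_kb file created further down; a small userspace sketch of reading it (assuming debugfs is mounted at /sys/kernel/debug):

#include <stdio.h>

int main(void)
{
	/* Created below as "buffer_total_size_kb" in the tracing directory. */
	FILE *f = fopen("/sys/kernel/debug/tracing/buffer_total_size_kb", "r");
	char line[128];

	if (!f) {
		perror("buffer_total_size_kb");
		return 1;
	}
	if (fgets(line, sizeof(line), f))
		/* e.g. "1408" once expanded, or "7 (expanded: 1408)" before */
		printf("total tracing buffer size: %s", line);
	fclose(f);
	return 0;
}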
@@ -3594,22 +3628,24 @@ tracing_free_buffer_release(struct inode *inode, struct file *filp)
3594 return 0; 3628 return 0;
3595} 3629}
3596 3630
3597static int mark_printk(const char *fmt, ...)
3598{
3599 int ret;
3600 va_list args;
3601 va_start(args, fmt);
3602 ret = trace_vprintk(0, fmt, args);
3603 va_end(args);
3604 return ret;
3605}
3606
3607static ssize_t 3631static ssize_t
3608tracing_mark_write(struct file *filp, const char __user *ubuf, 3632tracing_mark_write(struct file *filp, const char __user *ubuf,
3609 size_t cnt, loff_t *fpos) 3633 size_t cnt, loff_t *fpos)
3610{ 3634{
3611 char *buf; 3635 unsigned long addr = (unsigned long)ubuf;
3612 size_t written; 3636 struct ring_buffer_event *event;
3637 struct ring_buffer *buffer;
3638 struct print_entry *entry;
3639 unsigned long irq_flags;
3640 struct page *pages[2];
3641 int nr_pages = 1;
3642 ssize_t written;
3643 void *page1;
3644 void *page2;
3645 int offset;
3646 int size;
3647 int len;
3648 int ret;
3613 3649
3614 if (tracing_disabled) 3650 if (tracing_disabled)
3615 return -EINVAL; 3651 return -EINVAL;
@@ -3617,28 +3653,81 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
3617 if (cnt > TRACE_BUF_SIZE) 3653 if (cnt > TRACE_BUF_SIZE)
3618 cnt = TRACE_BUF_SIZE; 3654 cnt = TRACE_BUF_SIZE;
3619 3655
3620 buf = kmalloc(cnt + 2, GFP_KERNEL); 3656 /*
3621 if (buf == NULL) 3657 * Userspace is injecting traces into the kernel trace buffer.
3622 return -ENOMEM; 3658 * We want to be as non intrusive as possible.
3659 * To do so, we do not want to allocate any special buffers
3660 * or take any locks, but instead write the userspace data
3661 * straight into the ring buffer.
3662 *
3663 * First we need to pin the userspace buffer into memory,
3664 * which, most likely it is, because it just referenced it.
3665 * But there's no guarantee that it is. By using get_user_pages_fast()
3666 * and kmap_atomic/kunmap_atomic() we can get access to the
3667 * pages directly. We then write the data directly into the
3668 * ring buffer.
3669 */
3670 BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
3623 3671
3624 if (copy_from_user(buf, ubuf, cnt)) { 3672 /* check if we cross pages */
3625 kfree(buf); 3673 if ((addr & PAGE_MASK) != ((addr + cnt) & PAGE_MASK))
3626 return -EFAULT; 3674 nr_pages = 2;
3675
3676 offset = addr & (PAGE_SIZE - 1);
3677 addr &= PAGE_MASK;
3678
3679 ret = get_user_pages_fast(addr, nr_pages, 0, pages);
3680 if (ret < nr_pages) {
3681 while (--ret >= 0)
3682 put_page(pages[ret]);
3683 written = -EFAULT;
3684 goto out;
3685 }
3686
3687 page1 = kmap_atomic(pages[0]);
3688 if (nr_pages == 2)
3689 page2 = kmap_atomic(pages[1]);
3690
3691 local_save_flags(irq_flags);
3692 size = sizeof(*entry) + cnt + 2; /* possible \n added */
3693 buffer = global_trace.buffer;
3694 event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
3695 irq_flags, preempt_count());
3696 if (!event) {
3697 /* Ring buffer disabled, return as if not open for write */
3698 written = -EBADF;
3699 goto out_unlock;
3627 } 3700 }
3628 if (buf[cnt-1] != '\n') { 3701
3629 buf[cnt] = '\n'; 3702 entry = ring_buffer_event_data(event);
3630 buf[cnt+1] = '\0'; 3703 entry->ip = _THIS_IP_;
3704
3705 if (nr_pages == 2) {
3706 len = PAGE_SIZE - offset;
3707 memcpy(&entry->buf, page1 + offset, len);
3708 memcpy(&entry->buf[len], page2, cnt - len);
3631 } else 3709 } else
3632 buf[cnt] = '\0'; 3710 memcpy(&entry->buf, page1 + offset, cnt);
3633 3711
3634 written = mark_printk("%s", buf); 3712 if (entry->buf[cnt - 1] != '\n') {
3635 kfree(buf); 3713 entry->buf[cnt] = '\n';
3636 *fpos += written; 3714 entry->buf[cnt + 1] = '\0';
3715 } else
3716 entry->buf[cnt] = '\0';
3717
3718 ring_buffer_unlock_commit(buffer, event);
3637 3719
3638 /* don't tell userspace we wrote more - it might confuse them */ 3720 written = cnt;
3639 if (written > cnt)
3640 written = cnt;
3641 3721
3722 *fpos += written;
3723
3724 out_unlock:
3725 if (nr_pages == 2)
3726 kunmap_atomic(page2);
3727 kunmap_atomic(page1);
3728 while (nr_pages > 0)
3729 put_page(pages[--nr_pages]);
3730 out:
3642 return written; 3731 return written;
3643} 3732}
3644 3733
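The rewritten tracing_mark_write() pins at most two user pages with get_user_pages_fast(), maps them with kmap_atomic(), and copies the message straight into a reserved TRACE_PRINT event, so the old kmalloc()-and-copy path is gone. From userspace the interface is unchanged; a minimal producer sketch writing through the trace_marker file this handler serves:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* trace_marker is the debugfs file backed by tracing_mark_write(). */
	int fd = open("/sys/kernel/debug/tracing/trace_marker", O_WRONLY);
	const char msg[] = "hello from userspace\n";

	if (fd < 0)
		return 1;
	/* One write() becomes one TRACE_PRINT entry in the ring buffer. */
	if (write(fd, msg, strlen(msg)) < 0) {
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}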
@@ -3739,6 +3828,12 @@ static const struct file_operations tracing_entries_fops = {
3739 .llseek = generic_file_llseek, 3828 .llseek = generic_file_llseek,
3740}; 3829};
3741 3830
3831static const struct file_operations tracing_total_entries_fops = {
3832 .open = tracing_open_generic,
3833 .read = tracing_total_entries_read,
3834 .llseek = generic_file_llseek,
3835};
3836
3742static const struct file_operations tracing_free_buffer_fops = { 3837static const struct file_operations tracing_free_buffer_fops = {
3743 .write = tracing_free_buffer_write, 3838 .write = tracing_free_buffer_write,
3744 .release = tracing_free_buffer_release, 3839 .release = tracing_free_buffer_release,
@@ -3808,8 +3903,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3808 if (info->read < PAGE_SIZE) 3903 if (info->read < PAGE_SIZE)
3809 goto read; 3904 goto read;
3810 3905
3811 info->read = 0;
3812
3813 trace_access_lock(info->cpu); 3906 trace_access_lock(info->cpu);
3814 ret = ring_buffer_read_page(info->tr->buffer, 3907 ret = ring_buffer_read_page(info->tr->buffer,
3815 &info->spare, 3908 &info->spare,
@@ -3819,6 +3912,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
3819 if (ret < 0) 3912 if (ret < 0)
3820 return 0; 3913 return 0;
3821 3914
3915 info->read = 0;
3916
3822read: 3917read:
3823 size = PAGE_SIZE - info->read; 3918 size = PAGE_SIZE - info->read;
3824 if (size > count) 3919 if (size > count)
@@ -4026,6 +4121,8 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
4026 struct trace_array *tr = &global_trace; 4121 struct trace_array *tr = &global_trace;
4027 struct trace_seq *s; 4122 struct trace_seq *s;
4028 unsigned long cnt; 4123 unsigned long cnt;
4124 unsigned long long t;
4125 unsigned long usec_rem;
4029 4126
4030 s = kmalloc(sizeof(*s), GFP_KERNEL); 4127 s = kmalloc(sizeof(*s), GFP_KERNEL);
4031 if (!s) 4128 if (!s)
@@ -4042,6 +4139,17 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
4042 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); 4139 cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
4043 trace_seq_printf(s, "commit overrun: %ld\n", cnt); 4140 trace_seq_printf(s, "commit overrun: %ld\n", cnt);
4044 4141
4142 cnt = ring_buffer_bytes_cpu(tr->buffer, cpu);
4143 trace_seq_printf(s, "bytes: %ld\n", cnt);
4144
4145 t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu));
4146 usec_rem = do_div(t, USEC_PER_SEC);
4147 trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem);
4148
4149 t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu));
4150 usec_rem = do_div(t, USEC_PER_SEC);
4151 trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);
4152
4045 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); 4153 count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
4046 4154
4047 kfree(s); 4155 kfree(s);
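The stats file now also reports the byte count plus the oldest-event and current timestamps, each split into seconds and microseconds with do_div(), which divides in place and hands back the remainder. A standalone illustration of that arithmetic in plain C (do_div() itself is kernel-only, so the equivalent operations are spelled out):

#include <stdio.h>

int main(void)
{
	/* Illustrative timestamp already converted to microseconds. */
	unsigned long long t = 12345678901ULL;	/* 12345.678901 s */
	unsigned long usec_rem;

	/* do_div(t, USEC_PER_SEC) divides t in place and returns the
	 * remainder; the plain-C equivalent is: */
	usec_rem = (unsigned long)(t % 1000000ULL);
	t /= 1000000ULL;

	printf("oldest event ts: %5llu.%06lu\n", t, usec_rem);	/* 12345.678901 */
	return 0;
}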
@@ -4450,6 +4558,9 @@ static __init int tracer_init_debugfs(void)
4450 trace_create_file("buffer_size_kb", 0644, d_tracer, 4558 trace_create_file("buffer_size_kb", 0644, d_tracer,
4451 &global_trace, &tracing_entries_fops); 4559 &global_trace, &tracing_entries_fops);
4452 4560
4561 trace_create_file("buffer_total_size_kb", 0444, d_tracer,
4562 &global_trace, &tracing_total_entries_fops);
4563
4453 trace_create_file("free_buffer", 0644, d_tracer, 4564 trace_create_file("free_buffer", 0644, d_tracer,
4454 &global_trace, &tracing_free_buffer_fops); 4565 &global_trace, &tracing_free_buffer_fops);
4455 4566
@@ -4566,6 +4677,12 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
4566 4677
4567 tracing_off(); 4678 tracing_off();
4568 4679
4680 /* Did function tracer already get disabled? */
4681 if (ftrace_is_dead()) {
4682 printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n");
4683 printk("# MAY BE MISSING FUNCTION EVENTS\n");
4684 }
4685
4569 if (disable_tracing) 4686 if (disable_tracing)
4570 ftrace_kill(); 4687 ftrace_kill();
4571 4688
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 616846bcfee..092e1f8d18d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -579,11 +579,13 @@ static inline int ftrace_trace_task(struct task_struct *task)
579 579
580 return test_tsk_trace_trace(task); 580 return test_tsk_trace_trace(task);
581} 581}
582extern int ftrace_is_dead(void);
582#else 583#else
583static inline int ftrace_trace_task(struct task_struct *task) 584static inline int ftrace_trace_task(struct task_struct *task)
584{ 585{
585 return 1; 586 return 1;
586} 587}
588static inline int ftrace_is_dead(void) { return 0; }
587#endif 589#endif
588 590
589/* 591/*
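The prototype sits in the CONFIG_FUNCTION_TRACER half of the header and the #else branch supplies an inline stub returning 0, so callers such as test_ftrace_alive() compile down to nothing on kernels without the function tracer. The same declare-plus-stub idiom, sketched with hypothetical names:

/* Sketch of the conditional-helper idiom used here (hypothetical names). */
#ifdef CONFIG_EXAMPLE_FEATURE
extern int example_is_broken(void);
#else
static inline int example_is_broken(void) { return 0; }
#endif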
@@ -761,16 +763,10 @@ struct filter_pred {
761 filter_pred_fn_t fn; 763 filter_pred_fn_t fn;
762 u64 val; 764 u64 val;
763 struct regex regex; 765 struct regex regex;
764 /* 766 unsigned short *ops;
765 * Leaf nodes use field_name, ops is used by AND and OR 767#ifdef CONFIG_FTRACE_STARTUP_TEST
766 * nodes. The field_name is always freed when freeing a pred. 768 struct ftrace_event_field *field;
767 * We can overload field_name for ops and have it freed 769#endif
768 * as well.
769 */
770 union {
771 char *field_name;
772 unsigned short *ops;
773 };
774 int offset; 770 int offset;
775 int not; 771 int not;
776 int op; 772 int op;
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 6302747a139..394783531cb 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -113,3 +113,15 @@ u64 notrace trace_clock_global(void)
113 113
114 return now; 114 return now;
115} 115}
116
117static atomic64_t trace_counter;
118
119/*
120 * trace_clock_counter(): simply an atomic counter.
121 * Use the trace_counter "counter" for cases where you do not care
122 * about timings, but are interested in strict ordering.
123 */
124u64 notrace trace_clock_counter(void)
125{
126 return atomic64_add_return(1, &trace_counter);
127}
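Because atomic64_add_return() hands out strictly increasing values, selecting this clock gives every event a unique sequence number, trading timing information for unambiguous ordering; it shows up as "counter" thanks to the trace_clocks[] entry added above. A small userspace sketch of switching to it through the trace_clock file:

#include <stdio.h>

int main(void)
{
	/* trace_clock lists the clocks from trace_clocks[]; writing a name
	 * selects it, e.g. "local global [counter]" after this succeeds. */
	FILE *f = fopen("/sys/kernel/debug/tracing/trace_clock", "w");

	if (!f) {
		perror("trace_clock");
		return 1;
	}
	if (fputs("counter\n", f) == EOF) {
		fclose(f);
		return 1;
	}
	fclose(f);
	return 0;
}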
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 256764ecccd..816d3d07497 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -381,6 +381,63 @@ get_pred_parent(struct filter_pred *pred, struct filter_pred *preds,
381 return pred; 381 return pred;
382} 382}
383 383
384enum walk_return {
385 WALK_PRED_ABORT,
386 WALK_PRED_PARENT,
387 WALK_PRED_DEFAULT,
388};
389
390typedef int (*filter_pred_walkcb_t) (enum move_type move,
391 struct filter_pred *pred,
392 int *err, void *data);
393
394static int walk_pred_tree(struct filter_pred *preds,
395 struct filter_pred *root,
396 filter_pred_walkcb_t cb, void *data)
397{
398 struct filter_pred *pred = root;
399 enum move_type move = MOVE_DOWN;
400 int done = 0;
401
402 if (!preds)
403 return -EINVAL;
404
405 do {
406 int err = 0, ret;
407
408 ret = cb(move, pred, &err, data);
409 if (ret == WALK_PRED_ABORT)
410 return err;
411 if (ret == WALK_PRED_PARENT)
412 goto get_parent;
413
414 switch (move) {
415 case MOVE_DOWN:
416 if (pred->left != FILTER_PRED_INVALID) {
417 pred = &preds[pred->left];
418 continue;
419 }
420 goto get_parent;
421 case MOVE_UP_FROM_LEFT:
422 pred = &preds[pred->right];
423 move = MOVE_DOWN;
424 continue;
425 case MOVE_UP_FROM_RIGHT:
426 get_parent:
427 if (pred == root)
428 break;
429 pred = get_pred_parent(pred, preds,
430 pred->parent,
431 &move);
432 continue;
433 }
434 done = 1;
435 } while (!done);
436
437 /* We are fine. */
438 return 0;
439}
440
384/* 441/*
385 * A series of AND or ORs were found together. Instead of 442 * A series of AND or ORs were found together. Instead of
386 * climbing up and down the tree branches, an array of the 443 * climbing up and down the tree branches, an array of the
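walk_pred_tree() factors the repeated down/up-left/up-right traversal into one place: the callback sees each node together with the current move and returns WALK_PRED_DEFAULT to continue normally, WALK_PRED_PARENT to pop back to the parent early, or WALK_PRED_ABORT to stop with whatever it stored in *err. A trivial callback sketch in the same style as the ones added below (it merely counts node visits):

/* Sketch: counting how many times the walk touches a node. */
static int count_visits_cb(enum move_type move, struct filter_pred *pred,
			   int *err, void *data)
{
	int *visits = data;

	(*visits)++;
	return WALK_PRED_DEFAULT;
}

/* Usage: int visits = 0;
 *        walk_pred_tree(filter->preds, root, count_visits_cb, &visits);
 */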
@@ -410,99 +467,91 @@ static int process_ops(struct filter_pred *preds,
410 467
411 for (i = 0; i < op->val; i++) { 468 for (i = 0; i < op->val; i++) {
412 pred = &preds[op->ops[i]]; 469 pred = &preds[op->ops[i]];
413 match = pred->fn(pred, rec); 470 if (!WARN_ON_ONCE(!pred->fn))
471 match = pred->fn(pred, rec);
414 if (!!match == type) 472 if (!!match == type)
415 return match; 473 return match;
416 } 474 }
417 return match; 475 return match;
418} 476}
419 477
478struct filter_match_preds_data {
479 struct filter_pred *preds;
480 int match;
481 void *rec;
482};
483
484static int filter_match_preds_cb(enum move_type move, struct filter_pred *pred,
485 int *err, void *data)
486{
487 struct filter_match_preds_data *d = data;
488
489 *err = 0;
490 switch (move) {
491 case MOVE_DOWN:
492 /* only AND and OR have children */
493 if (pred->left != FILTER_PRED_INVALID) {
494 /* If ops is set, then it was folded. */
495 if (!pred->ops)
496 return WALK_PRED_DEFAULT;
497 /* We can treat folded ops as a leaf node */
498 d->match = process_ops(d->preds, pred, d->rec);
499 } else {
500 if (!WARN_ON_ONCE(!pred->fn))
501 d->match = pred->fn(pred, d->rec);
502 }
503
504 return WALK_PRED_PARENT;
505 case MOVE_UP_FROM_LEFT:
506 /*
507 * Check for short circuits.
508 *
509 * Optimization: !!match == (pred->op == OP_OR)
510 * is the same as:
511 * if ((match && pred->op == OP_OR) ||
512 * (!match && pred->op == OP_AND))
513 */
514 if (!!d->match == (pred->op == OP_OR))
515 return WALK_PRED_PARENT;
516 break;
517 case MOVE_UP_FROM_RIGHT:
518 break;
519 }
520
521 return WALK_PRED_DEFAULT;
522}
523
420/* return 1 if event matches, 0 otherwise (discard) */ 524/* return 1 if event matches, 0 otherwise (discard) */
421int filter_match_preds(struct event_filter *filter, void *rec) 525int filter_match_preds(struct event_filter *filter, void *rec)
422{ 526{
423 int match = -1;
424 enum move_type move = MOVE_DOWN;
425 struct filter_pred *preds; 527 struct filter_pred *preds;
426 struct filter_pred *pred;
427 struct filter_pred *root; 528 struct filter_pred *root;
428 int n_preds; 529 struct filter_match_preds_data data = {
429 int done = 0; 530 /* match is currently meaningless */
531 .match = -1,
532 .rec = rec,
533 };
534 int n_preds, ret;
430 535
431 /* no filter is considered a match */ 536 /* no filter is considered a match */
432 if (!filter) 537 if (!filter)
433 return 1; 538 return 1;
434 539
435 n_preds = filter->n_preds; 540 n_preds = filter->n_preds;
436
437 if (!n_preds) 541 if (!n_preds)
438 return 1; 542 return 1;
439 543
440 /* 544 /*
441 * n_preds, root and filter->preds are protected with preemption disabled. 545 * n_preds, root and filter->preds are protected with preemption disabled.
442 */ 546 */
443 preds = rcu_dereference_sched(filter->preds);
444 root = rcu_dereference_sched(filter->root); 547 root = rcu_dereference_sched(filter->root);
445 if (!root) 548 if (!root)
446 return 1; 549 return 1;
447 550
448 pred = root; 551 data.preds = preds = rcu_dereference_sched(filter->preds);
449 552 ret = walk_pred_tree(preds, root, filter_match_preds_cb, &data);
450 /* match is currently meaningless */ 553 WARN_ON(ret);
451 match = -1; 554 return data.match;
452
453 do {
454 switch (move) {
455 case MOVE_DOWN:
456 /* only AND and OR have children */
457 if (pred->left != FILTER_PRED_INVALID) {
458 /* If ops is set, then it was folded. */
459 if (!pred->ops) {
460 /* keep going to down the left side */
461 pred = &preds[pred->left];
462 continue;
463 }
464 /* We can treat folded ops as a leaf node */
465 match = process_ops(preds, pred, rec);
466 } else
467 match = pred->fn(pred, rec);
468 /* If this pred is the only pred */
469 if (pred == root)
470 break;
471 pred = get_pred_parent(pred, preds,
472 pred->parent, &move);
473 continue;
474 case MOVE_UP_FROM_LEFT:
475 /*
476 * Check for short circuits.
477 *
478 * Optimization: !!match == (pred->op == OP_OR)
479 * is the same as:
480 * if ((match && pred->op == OP_OR) ||
481 * (!match && pred->op == OP_AND))
482 */
483 if (!!match == (pred->op == OP_OR)) {
484 if (pred == root)
485 break;
486 pred = get_pred_parent(pred, preds,
487 pred->parent, &move);
488 continue;
489 }
490 /* now go down the right side of the tree. */
491 pred = &preds[pred->right];
492 move = MOVE_DOWN;
493 continue;
494 case MOVE_UP_FROM_RIGHT:
495 /* We finished this equation. */
496 if (pred == root)
497 break;
498 pred = get_pred_parent(pred, preds,
499 pred->parent, &move);
500 continue;
501 }
502 done = 1;
503 } while (!done);
504
505 return match;
506} 555}
507EXPORT_SYMBOL_GPL(filter_match_preds); 556EXPORT_SYMBOL_GPL(filter_match_preds);
508 557
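The MOVE_UP_FROM_LEFT short circuit leans on a small boolean identity: since inner nodes are only OP_AND or OP_OR, the result is already decided when the left subtree matched under OR or failed under AND, which is exactly what !!match == (pred->op == OP_OR) tests. A throwaway check of the equivalence quoted in the comment, with is_or standing in for (pred->op == OP_OR):

#include <stdio.h>

int main(void)
{
	/* Verify: (!!match == is_or) <=> ((match && is_or) || (!match && !is_or)),
	 * i.e. the comment's (match && OP_OR) || (!match && OP_AND) form. */
	for (int match = 0; match <= 1; match++)
		for (int is_or = 0; is_or <= 1; is_or++) {
			int lhs = (!!match == is_or);
			int rhs = (match && is_or) || (!match && !is_or);

			/* every row prints lhs == rhs */
			printf("match=%d is_or=%d  lhs=%d rhs=%d\n",
			       match, is_or, lhs, rhs);
		}
	return 0;
}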
@@ -628,22 +677,6 @@ find_event_field(struct ftrace_event_call *call, char *name)
628 return __find_event_field(head, name); 677 return __find_event_field(head, name);
629} 678}
630 679
631static void filter_free_pred(struct filter_pred *pred)
632{
633 if (!pred)
634 return;
635
636 kfree(pred->field_name);
637 kfree(pred);
638}
639
640static void filter_clear_pred(struct filter_pred *pred)
641{
642 kfree(pred->field_name);
643 pred->field_name = NULL;
644 pred->regex.len = 0;
645}
646
647static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) 680static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
648{ 681{
649 stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL); 682 stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL);
@@ -689,20 +722,13 @@ __pop_pred_stack(struct pred_stack *stack)
689static int filter_set_pred(struct event_filter *filter, 722static int filter_set_pred(struct event_filter *filter,
690 int idx, 723 int idx,
691 struct pred_stack *stack, 724 struct pred_stack *stack,
692 struct filter_pred *src, 725 struct filter_pred *src)
693 filter_pred_fn_t fn)
694{ 726{
695 struct filter_pred *dest = &filter->preds[idx]; 727 struct filter_pred *dest = &filter->preds[idx];
696 struct filter_pred *left; 728 struct filter_pred *left;
697 struct filter_pred *right; 729 struct filter_pred *right;
698 730
699 *dest = *src; 731 *dest = *src;
700 if (src->field_name) {
701 dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
702 if (!dest->field_name)
703 return -ENOMEM;
704 }
705 dest->fn = fn;
706 dest->index = idx; 732 dest->index = idx;
707 733
708 if (dest->op == OP_OR || dest->op == OP_AND) { 734 if (dest->op == OP_OR || dest->op == OP_AND) {
@@ -743,11 +769,7 @@ static int filter_set_pred(struct event_filter *filter,
743 769
744static void __free_preds(struct event_filter *filter) 770static void __free_preds(struct event_filter *filter)
745{ 771{
746 int i;
747
748 if (filter->preds) { 772 if (filter->preds) {
749 for (i = 0; i < filter->a_preds; i++)
750 kfree(filter->preds[i].field_name);
751 kfree(filter->preds); 773 kfree(filter->preds);
752 filter->preds = NULL; 774 filter->preds = NULL;
753 } 775 }
@@ -840,23 +862,19 @@ static void filter_free_subsystem_filters(struct event_subsystem *system)
840 } 862 }
841} 863}
842 864
843static int filter_add_pred_fn(struct filter_parse_state *ps, 865static int filter_add_pred(struct filter_parse_state *ps,
844 struct ftrace_event_call *call, 866 struct event_filter *filter,
845 struct event_filter *filter, 867 struct filter_pred *pred,
846 struct filter_pred *pred, 868 struct pred_stack *stack)
847 struct pred_stack *stack,
848 filter_pred_fn_t fn)
849{ 869{
850 int idx, err; 870 int err;
851 871
852 if (WARN_ON(filter->n_preds == filter->a_preds)) { 872 if (WARN_ON(filter->n_preds == filter->a_preds)) {
853 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); 873 parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
854 return -ENOSPC; 874 return -ENOSPC;
855 } 875 }
856 876
857 idx = filter->n_preds; 877 err = filter_set_pred(filter, filter->n_preds, stack, pred);
858 filter_clear_pred(&filter->preds[idx]);
859 err = filter_set_pred(filter, idx, stack, pred, fn);
860 if (err) 878 if (err)
861 return err; 879 return err;
862 880
@@ -937,31 +955,15 @@ static filter_pred_fn_t select_comparison_fn(int op, int field_size,
937 return fn; 955 return fn;
938} 956}
939 957
940static int filter_add_pred(struct filter_parse_state *ps, 958static int init_pred(struct filter_parse_state *ps,
941 struct ftrace_event_call *call, 959 struct ftrace_event_field *field,
942 struct event_filter *filter, 960 struct filter_pred *pred)
943 struct filter_pred *pred, 961
944 struct pred_stack *stack,
945 bool dry_run)
946{ 962{
947 struct ftrace_event_field *field; 963 filter_pred_fn_t fn = filter_pred_none;
948 filter_pred_fn_t fn;
949 unsigned long long val; 964 unsigned long long val;
950 int ret; 965 int ret;
951 966
952 fn = pred->fn = filter_pred_none;
953
954 if (pred->op == OP_AND)
955 goto add_pred_fn;
956 else if (pred->op == OP_OR)
957 goto add_pred_fn;
958
959 field = find_event_field(call, pred->field_name);
960 if (!field) {
961 parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
962 return -EINVAL;
963 }
964
965 pred->offset = field->offset; 967 pred->offset = field->offset;
966 968
967 if (!is_legal_op(field, pred->op)) { 969 if (!is_legal_op(field, pred->op)) {
@@ -1001,9 +1003,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
1001 if (pred->op == OP_NE) 1003 if (pred->op == OP_NE)
1002 pred->not = 1; 1004 pred->not = 1;
1003 1005
1004add_pred_fn: 1006 pred->fn = fn;
1005 if (!dry_run)
1006 return filter_add_pred_fn(ps, call, filter, pred, stack, fn);
1007 return 0; 1007 return 0;
1008} 1008}
1009 1009
@@ -1302,39 +1302,37 @@ parse_operand:
1302 return 0; 1302 return 0;
1303} 1303}
1304 1304
1305static struct filter_pred *create_pred(int op, char *operand1, char *operand2) 1305static struct filter_pred *create_pred(struct filter_parse_state *ps,
1306 struct ftrace_event_call *call,
1307 int op, char *operand1, char *operand2)
1306{ 1308{
1307 struct filter_pred *pred; 1309 struct ftrace_event_field *field;
1310 static struct filter_pred pred;
1308 1311
1309 pred = kzalloc(sizeof(*pred), GFP_KERNEL); 1312 memset(&pred, 0, sizeof(pred));
1310 if (!pred) 1313 pred.op = op;
1311 return NULL;
1312 1314
1313 pred->field_name = kstrdup(operand1, GFP_KERNEL); 1315 if (op == OP_AND || op == OP_OR)
1314 if (!pred->field_name) { 1316 return &pred;
1315 kfree(pred); 1317
1318 if (!operand1 || !operand2) {
1319 parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
1316 return NULL; 1320 return NULL;
1317 } 1321 }
1318 1322
1319 strcpy(pred->regex.pattern, operand2); 1323 field = find_event_field(call, operand1);
1320 pred->regex.len = strlen(pred->regex.pattern); 1324 if (!field) {
1321 1325 parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
1322 pred->op = op;
1323
1324 return pred;
1325}
1326
1327static struct filter_pred *create_logical_pred(int op)
1328{
1329 struct filter_pred *pred;
1330
1331 pred = kzalloc(sizeof(*pred), GFP_KERNEL);
1332 if (!pred)
1333 return NULL; 1326 return NULL;
1327 }
1334 1328
1335 pred->op = op; 1329 strcpy(pred.regex.pattern, operand2);
1330 pred.regex.len = strlen(pred.regex.pattern);
1336 1331
1337 return pred; 1332#ifdef CONFIG_FTRACE_STARTUP_TEST
1333 pred.field = field;
1334#endif
1335 return init_pred(ps, field, &pred) ? NULL : &pred;
1338} 1336}
1339 1337
1340static int check_preds(struct filter_parse_state *ps) 1338static int check_preds(struct filter_parse_state *ps)
@@ -1375,6 +1373,23 @@ static int count_preds(struct filter_parse_state *ps)
1375 return n_preds; 1373 return n_preds;
1376} 1374}
1377 1375
1376struct check_pred_data {
1377 int count;
1378 int max;
1379};
1380
1381static int check_pred_tree_cb(enum move_type move, struct filter_pred *pred,
1382 int *err, void *data)
1383{
1384 struct check_pred_data *d = data;
1385
1386 if (WARN_ON(d->count++ > d->max)) {
1387 *err = -EINVAL;
1388 return WALK_PRED_ABORT;
1389 }
1390 return WALK_PRED_DEFAULT;
1391}
1392
1378/* 1393/*
1379 * The tree is walked at filtering of an event. If the tree is not correctly 1394 * The tree is walked at filtering of an event. If the tree is not correctly
1380 * built, it may cause an infinite loop. Check here that the tree does 1395 * built, it may cause an infinite loop. Check here that the tree does
@@ -1383,107 +1398,76 @@ static int count_preds(struct filter_parse_state *ps)
1383static int check_pred_tree(struct event_filter *filter, 1398static int check_pred_tree(struct event_filter *filter,
1384 struct filter_pred *root) 1399 struct filter_pred *root)
1385{ 1400{
1386 struct filter_pred *preds; 1401 struct check_pred_data data = {
1387 struct filter_pred *pred; 1402 /*
1388 enum move_type move = MOVE_DOWN; 1403 * The max that we can hit a node is three times.
1389 int count = 0; 1404 * Once going down, once coming up from left, and
1390 int done = 0; 1405 * once coming up from right. This is more than enough
1391 int max; 1406 * since leafs are only hit a single time.
1392 1407 */
1393 /* 1408 .max = 3 * filter->n_preds,
1394 * The max that we can hit a node is three times. 1409 .count = 0,
1395 * Once going down, once coming up from left, and 1410 };
1396 * once coming up from right. This is more than enough
1397 * since leafs are only hit a single time.
1398 */
1399 max = 3 * filter->n_preds;
1400 1411
1401 preds = filter->preds; 1412 return walk_pred_tree(filter->preds, root,
1402 if (!preds) 1413 check_pred_tree_cb, &data);
1403 return -EINVAL; 1414}
1404 pred = root;
1405 1415
1406 do { 1416static int count_leafs_cb(enum move_type move, struct filter_pred *pred,
1407 if (WARN_ON(count++ > max)) 1417 int *err, void *data)
1408 return -EINVAL; 1418{
1419 int *count = data;
1409 1420
1410 switch (move) { 1421 if ((move == MOVE_DOWN) &&
1411 case MOVE_DOWN: 1422 (pred->left == FILTER_PRED_INVALID))
1412 if (pred->left != FILTER_PRED_INVALID) { 1423 (*count)++;
1413 pred = &preds[pred->left];
1414 continue;
1415 }
1416 /* A leaf at the root is just a leaf in the tree */
1417 if (pred == root)
1418 break;
1419 pred = get_pred_parent(pred, preds,
1420 pred->parent, &move);
1421 continue;
1422 case MOVE_UP_FROM_LEFT:
1423 pred = &preds[pred->right];
1424 move = MOVE_DOWN;
1425 continue;
1426 case MOVE_UP_FROM_RIGHT:
1427 if (pred == root)
1428 break;
1429 pred = get_pred_parent(pred, preds,
1430 pred->parent, &move);
1431 continue;
1432 }
1433 done = 1;
1434 } while (!done);
1435 1424
1436 /* We are fine. */ 1425 return WALK_PRED_DEFAULT;
1437 return 0;
1438} 1426}
1439 1427
1440static int count_leafs(struct filter_pred *preds, struct filter_pred *root) 1428static int count_leafs(struct filter_pred *preds, struct filter_pred *root)
1441{ 1429{
1442 struct filter_pred *pred; 1430 int count = 0, ret;
1443 enum move_type move = MOVE_DOWN;
1444 int count = 0;
1445 int done = 0;
1446 1431
1447 pred = root; 1432 ret = walk_pred_tree(preds, root, count_leafs_cb, &count);
1433 WARN_ON(ret);
1434 return count;
1435}
1448 1436
1449 do { 1437struct fold_pred_data {
1450 switch (move) { 1438 struct filter_pred *root;
1451 case MOVE_DOWN: 1439 int count;
1452 if (pred->left != FILTER_PRED_INVALID) { 1440 int children;
1453 pred = &preds[pred->left]; 1441};
1454 continue;
1455 }
1456 /* A leaf at the root is just a leaf in the tree */
1457 if (pred == root)
1458 return 1;
1459 count++;
1460 pred = get_pred_parent(pred, preds,
1461 pred->parent, &move);
1462 continue;
1463 case MOVE_UP_FROM_LEFT:
1464 pred = &preds[pred->right];
1465 move = MOVE_DOWN;
1466 continue;
1467 case MOVE_UP_FROM_RIGHT:
1468 if (pred == root)
1469 break;
1470 pred = get_pred_parent(pred, preds,
1471 pred->parent, &move);
1472 continue;
1473 }
1474 done = 1;
1475 } while (!done);
1476 1442
1477 return count; 1443static int fold_pred_cb(enum move_type move, struct filter_pred *pred,
1444 int *err, void *data)
1445{
1446 struct fold_pred_data *d = data;
1447 struct filter_pred *root = d->root;
1448
1449 if (move != MOVE_DOWN)
1450 return WALK_PRED_DEFAULT;
1451 if (pred->left != FILTER_PRED_INVALID)
1452 return WALK_PRED_DEFAULT;
1453
1454 if (WARN_ON(d->count == d->children)) {
1455 *err = -EINVAL;
1456 return WALK_PRED_ABORT;
1457 }
1458
1459 pred->index &= ~FILTER_PRED_FOLD;
1460 root->ops[d->count++] = pred->index;
1461 return WALK_PRED_DEFAULT;
1478} 1462}
1479 1463
1480static int fold_pred(struct filter_pred *preds, struct filter_pred *root) 1464static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
1481{ 1465{
1482 struct filter_pred *pred; 1466 struct fold_pred_data data = {
1483 enum move_type move = MOVE_DOWN; 1467 .root = root,
1484 int count = 0; 1468 .count = 0,
1469 };
1485 int children; 1470 int children;
1486 int done = 0;
1487 1471
1488 /* No need to keep the fold flag */ 1472 /* No need to keep the fold flag */
1489 root->index &= ~FILTER_PRED_FOLD; 1473 root->index &= ~FILTER_PRED_FOLD;
@@ -1501,37 +1485,26 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
1501 return -ENOMEM; 1485 return -ENOMEM;
1502 1486
1503 root->val = children; 1487 root->val = children;
1488 data.children = children;
1489 return walk_pred_tree(preds, root, fold_pred_cb, &data);
1490}
1504 1491
1505 pred = root; 1492static int fold_pred_tree_cb(enum move_type move, struct filter_pred *pred,
1506 do { 1493 int *err, void *data)
1507 switch (move) { 1494{
1508 case MOVE_DOWN: 1495 struct filter_pred *preds = data;
1509 if (pred->left != FILTER_PRED_INVALID) {
1510 pred = &preds[pred->left];
1511 continue;
1512 }
1513 if (WARN_ON(count == children))
1514 return -EINVAL;
1515 pred->index &= ~FILTER_PRED_FOLD;
1516 root->ops[count++] = pred->index;
1517 pred = get_pred_parent(pred, preds,
1518 pred->parent, &move);
1519 continue;
1520 case MOVE_UP_FROM_LEFT:
1521 pred = &preds[pred->right];
1522 move = MOVE_DOWN;
1523 continue;
1524 case MOVE_UP_FROM_RIGHT:
1525 if (pred == root)
1526 break;
1527 pred = get_pred_parent(pred, preds,
1528 pred->parent, &move);
1529 continue;
1530 }
1531 done = 1;
1532 } while (!done);
1533 1496
1534 return 0; 1497 if (move != MOVE_DOWN)
1498 return WALK_PRED_DEFAULT;
1499 if (!(pred->index & FILTER_PRED_FOLD))
1500 return WALK_PRED_DEFAULT;
1501
1502 *err = fold_pred(preds, pred);
1503 if (*err)
1504 return WALK_PRED_ABORT;
1505
1506 /* everything below is folded, continue with parent */
1507 return WALK_PRED_PARENT;
1535} 1508}
1536 1509
1537/* 1510/*
@@ -1542,51 +1515,8 @@ static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
1542static int fold_pred_tree(struct event_filter *filter, 1515static int fold_pred_tree(struct event_filter *filter,
1543 struct filter_pred *root) 1516 struct filter_pred *root)
1544{ 1517{
1545 struct filter_pred *preds; 1518 return walk_pred_tree(filter->preds, root, fold_pred_tree_cb,
1546 struct filter_pred *pred; 1519 filter->preds);
1547 enum move_type move = MOVE_DOWN;
1548 int done = 0;
1549 int err;
1550
1551 preds = filter->preds;
1552 if (!preds)
1553 return -EINVAL;
1554 pred = root;
1555
1556 do {
1557 switch (move) {
1558 case MOVE_DOWN:
1559 if (pred->index & FILTER_PRED_FOLD) {
1560 err = fold_pred(preds, pred);
1561 if (err)
1562 return err;
1563 /* Folded nodes are like leafs */
1564 } else if (pred->left != FILTER_PRED_INVALID) {
1565 pred = &preds[pred->left];
1566 continue;
1567 }
1568
1569 /* A leaf at the root is just a leaf in the tree */
1570 if (pred == root)
1571 break;
1572 pred = get_pred_parent(pred, preds,
1573 pred->parent, &move);
1574 continue;
1575 case MOVE_UP_FROM_LEFT:
1576 pred = &preds[pred->right];
1577 move = MOVE_DOWN;
1578 continue;
1579 case MOVE_UP_FROM_RIGHT:
1580 if (pred == root)
1581 break;
1582 pred = get_pred_parent(pred, preds,
1583 pred->parent, &move);
1584 continue;
1585 }
1586 done = 1;
1587 } while (!done);
1588
1589 return 0;
1590} 1520}
1591 1521
1592static int replace_preds(struct ftrace_event_call *call, 1522static int replace_preds(struct ftrace_event_call *call,
@@ -1643,27 +1573,17 @@ static int replace_preds(struct ftrace_event_call *call,
1643 goto fail; 1573 goto fail;
1644 } 1574 }
1645 1575
1646 if (elt->op == OP_AND || elt->op == OP_OR) { 1576 pred = create_pred(ps, call, elt->op, operand1, operand2);
1647 pred = create_logical_pred(elt->op); 1577 if (!pred) {
1648 goto add_pred;
1649 }
1650
1651 if (!operand1 || !operand2) {
1652 parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
1653 err = -EINVAL; 1578 err = -EINVAL;
1654 goto fail; 1579 goto fail;
1655 } 1580 }
1656 1581
1657 pred = create_pred(elt->op, operand1, operand2); 1582 if (!dry_run) {
1658add_pred: 1583 err = filter_add_pred(ps, filter, pred, &stack);
1659 if (!pred) { 1584 if (err)
1660 err = -ENOMEM; 1585 goto fail;
1661 goto fail;
1662 } 1586 }
1663 err = filter_add_pred(ps, call, filter, pred, &stack, dry_run);
1664 filter_free_pred(pred);
1665 if (err)
1666 goto fail;
1667 1587
1668 operand1 = operand2 = NULL; 1588 operand1 = operand2 = NULL;
1669 } 1589 }
@@ -1958,17 +1878,14 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
1958 int err; 1878 int err;
1959 struct event_filter *filter; 1879 struct event_filter *filter;
1960 struct filter_parse_state *ps; 1880 struct filter_parse_state *ps;
1961 struct ftrace_event_call *call = NULL; 1881 struct ftrace_event_call *call;
1962 1882
1963 mutex_lock(&event_mutex); 1883 mutex_lock(&event_mutex);
1964 1884
1965 list_for_each_entry(call, &ftrace_events, list) { 1885 call = event->tp_event;
1966 if (call->event.type == event_id)
1967 break;
1968 }
1969 1886
1970 err = -EINVAL; 1887 err = -EINVAL;
1971 if (&call->list == &ftrace_events) 1888 if (!call)
1972 goto out_unlock; 1889 goto out_unlock;
1973 1890
1974 err = -EEXIST; 1891 err = -EEXIST;
@@ -2012,3 +1929,215 @@ out_unlock:
2012 1929
2013#endif /* CONFIG_PERF_EVENTS */ 1930#endif /* CONFIG_PERF_EVENTS */
2014 1931
1932#ifdef CONFIG_FTRACE_STARTUP_TEST
1933
1934#include <linux/types.h>
1935#include <linux/tracepoint.h>
1936
1937#define CREATE_TRACE_POINTS
1938#include "trace_events_filter_test.h"
1939
1940static int test_get_filter(char *filter_str, struct ftrace_event_call *call,
1941 struct event_filter **pfilter)
1942{
1943 struct event_filter *filter;
1944 struct filter_parse_state *ps;
1945 int err = -ENOMEM;
1946
1947 filter = __alloc_filter();
1948 if (!filter)
1949 goto out;
1950
1951 ps = kzalloc(sizeof(*ps), GFP_KERNEL);
1952 if (!ps)
1953 goto free_filter;
1954
1955 parse_init(ps, filter_ops, filter_str);
1956 err = filter_parse(ps);
1957 if (err)
1958 goto free_ps;
1959
1960 err = replace_preds(call, filter, ps, filter_str, false);
1961 if (!err)
1962 *pfilter = filter;
1963
1964 free_ps:
1965 filter_opstack_clear(ps);
1966 postfix_clear(ps);
1967 kfree(ps);
1968
1969 free_filter:
1970 if (err)
1971 __free_filter(filter);
1972
1973 out:
1974 return err;
1975}
1976
1977#define DATA_REC(m, va, vb, vc, vd, ve, vf, vg, vh, nvisit) \
1978{ \
1979 .filter = FILTER, \
1980 .rec = { .a = va, .b = vb, .c = vc, .d = vd, \
1981 .e = ve, .f = vf, .g = vg, .h = vh }, \
1982 .match = m, \
1983 .not_visited = nvisit, \
1984}
1985#define YES 1
1986#define NO 0
1987
1988static struct test_filter_data_t {
1989 char *filter;
1990 struct ftrace_raw_ftrace_test_filter rec;
1991 int match;
1992 char *not_visited;
1993} test_filter_data[] = {
1994#define FILTER "a == 1 && b == 1 && c == 1 && d == 1 && " \
1995 "e == 1 && f == 1 && g == 1 && h == 1"
1996 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, ""),
1997 DATA_REC(NO, 0, 1, 1, 1, 1, 1, 1, 1, "bcdefgh"),
1998 DATA_REC(NO, 1, 1, 1, 1, 1, 1, 1, 0, ""),
1999#undef FILTER
2000#define FILTER "a == 1 || b == 1 || c == 1 || d == 1 || " \
2001 "e == 1 || f == 1 || g == 1 || h == 1"
2002 DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""),
2003 DATA_REC(YES, 0, 0, 0, 0, 0, 0, 0, 1, ""),
2004 DATA_REC(YES, 1, 0, 0, 0, 0, 0, 0, 0, "bcdefgh"),
2005#undef FILTER
2006#define FILTER "(a == 1 || b == 1) && (c == 1 || d == 1) && " \
2007 "(e == 1 || f == 1) && (g == 1 || h == 1)"
2008 DATA_REC(NO, 0, 0, 1, 1, 1, 1, 1, 1, "dfh"),
2009 DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""),
2010 DATA_REC(YES, 1, 0, 1, 0, 0, 1, 0, 1, "bd"),
2011 DATA_REC(NO, 1, 0, 1, 0, 0, 1, 0, 0, "bd"),
2012#undef FILTER
2013#define FILTER "(a == 1 && b == 1) || (c == 1 && d == 1) || " \
2014 "(e == 1 && f == 1) || (g == 1 && h == 1)"
2015 DATA_REC(YES, 1, 0, 1, 1, 1, 1, 1, 1, "efgh"),
2016 DATA_REC(YES, 0, 0, 0, 0, 0, 0, 1, 1, ""),
2017 DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""),
2018#undef FILTER
2019#define FILTER "(a == 1 && b == 1) && (c == 1 && d == 1) && " \
2020 "(e == 1 && f == 1) || (g == 1 && h == 1)"
2021 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 0, "gh"),
2022 DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 1, ""),
2023 DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, ""),
2024#undef FILTER
2025#define FILTER "((a == 1 || b == 1) || (c == 1 || d == 1) || " \
2026 "(e == 1 || f == 1)) && (g == 1 || h == 1)"
2027 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 0, 1, "bcdef"),
2028 DATA_REC(NO, 0, 0, 0, 0, 0, 0, 0, 0, ""),
2029 DATA_REC(YES, 1, 1, 1, 1, 1, 0, 1, 1, "h"),
2030#undef FILTER
2031#define FILTER "((((((((a == 1) && (b == 1)) || (c == 1)) && (d == 1)) || " \
2032 "(e == 1)) && (f == 1)) || (g == 1)) && (h == 1))"
2033 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "ceg"),
2034 DATA_REC(NO, 0, 1, 0, 1, 0, 1, 0, 1, ""),
2035 DATA_REC(NO, 1, 0, 1, 0, 1, 0, 1, 0, ""),
2036#undef FILTER
2037#define FILTER "((((((((a == 1) || (b == 1)) && (c == 1)) || (d == 1)) && " \
2038 "(e == 1)) || (f == 1)) && (g == 1)) || (h == 1))"
2039 DATA_REC(YES, 1, 1, 1, 1, 1, 1, 1, 1, "bdfh"),
2040 DATA_REC(YES, 0, 1, 0, 1, 0, 1, 0, 1, ""),
2041 DATA_REC(YES, 1, 0, 1, 0, 1, 0, 1, 0, "bdfh"),
2042};
2043
2044#undef DATA_REC
2045#undef FILTER
2046#undef YES
2047#undef NO
2048
2049#define DATA_CNT (sizeof(test_filter_data)/sizeof(struct test_filter_data_t))
2050
2051static int test_pred_visited;
2052
2053static int test_pred_visited_fn(struct filter_pred *pred, void *event)
2054{
2055 struct ftrace_event_field *field = pred->field;
2056
2057 test_pred_visited = 1;
2058 printk(KERN_INFO "\npred visited %s\n", field->name);
2059 return 1;
2060}
2061
2062static int test_walk_pred_cb(enum move_type move, struct filter_pred *pred,
2063 int *err, void *data)
2064{
2065 char *fields = data;
2066
2067 if ((move == MOVE_DOWN) &&
2068 (pred->left == FILTER_PRED_INVALID)) {
2069 struct ftrace_event_field *field = pred->field;
2070
2071 if (!field) {
2072 WARN(1, "all leafs should have field defined");
2073 return WALK_PRED_DEFAULT;
2074 }
2075 if (!strchr(fields, *field->name))
2076 return WALK_PRED_DEFAULT;
2077
2078 WARN_ON(!pred->fn);
2079 pred->fn = test_pred_visited_fn;
2080 }
2081 return WALK_PRED_DEFAULT;
2082}
2083
2084static __init int ftrace_test_event_filter(void)
2085{
2086 int i;
2087
2088 printk(KERN_INFO "Testing ftrace filter: ");
2089
2090 for (i = 0; i < DATA_CNT; i++) {
2091 struct event_filter *filter = NULL;
2092 struct test_filter_data_t *d = &test_filter_data[i];
2093 int err;
2094
2095 err = test_get_filter(d->filter, &event_ftrace_test_filter,
2096 &filter);
2097 if (err) {
2098 printk(KERN_INFO
2099 "Failed to get filter for '%s', err %d\n",
2100 d->filter, err);
2101 break;
2102 }
2103
2104 /*
2105 * The preemption disabling is not really needed for self
2106 * tests, but the rcu dereference will complain without it.
2107 */
2108 preempt_disable();
2109 if (*d->not_visited)
2110 walk_pred_tree(filter->preds, filter->root,
2111 test_walk_pred_cb,
2112 d->not_visited);
2113
2114 test_pred_visited = 0;
2115 err = filter_match_preds(filter, &d->rec);
2116 preempt_enable();
2117
2118 __free_filter(filter);
2119
2120 if (test_pred_visited) {
2121 printk(KERN_INFO
2122 "Failed, unwanted pred visited for filter %s\n",
2123 d->filter);
2124 break;
2125 }
2126
2127 if (err != d->match) {
2128 printk(KERN_INFO
2129 "Failed to match filter '%s', expected %d\n",
2130 d->filter, d->match);
2131 break;
2132 }
2133 }
2134
2135 if (i == DATA_CNT)
2136 printk(KERN_CONT "OK\n");
2137
2138 return 0;
2139}
2140
2141late_initcall(ftrace_test_event_filter);
2142
2143#endif /* CONFIG_FTRACE_STARTUP_TEST */
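
The trace_events_filter.c changes above replace four open-coded MOVE_DOWN / MOVE_UP_FROM_LEFT / MOVE_UP_FROM_RIGHT loops (check_pred_tree, count_leafs, fold_pred, fold_pred_tree) with small callbacks driven by one generic walk_pred_tree() walker. Below is a minimal userspace sketch of the same pattern, assuming an index-linked binary tree like the filter's preds array; the node/walk_tree/WALK_* names are illustrative stand-ins, not the kernel API.

#include <stdio.h>

#define INVALID -1

enum walk_ret { WALK_DEFAULT, WALK_ABORT };
enum move_type { MOVE_DOWN, MOVE_UP_FROM_LEFT, MOVE_UP_FROM_RIGHT };

struct node {
	int left, right, parent;	/* indices into the node array */
	int val;
};

typedef enum walk_ret (*walk_cb)(enum move_type move, struct node *n,
				 int *err, void *data);

/*
 * Visit every node: internal nodes are seen up to three times (down,
 * up-from-left, up-from-right), leaves exactly once. As in the filter
 * tree, a node has either two children or none.
 */
static int walk_tree(struct node *nodes, int root, walk_cb cb, void *data)
{
	enum move_type move = MOVE_DOWN;
	int idx = root, err = 0;

	for (;;) {
		if (cb(move, &nodes[idx], &err, data) == WALK_ABORT)
			return err;

		if (move == MOVE_UP_FROM_LEFT) {
			/* left subtree done: descend into the right one */
			idx = nodes[idx].right;
			move = MOVE_DOWN;
			continue;
		}
		if (move == MOVE_DOWN && nodes[idx].left != INVALID) {
			idx = nodes[idx].left;
			continue;
		}
		/* leaf reached or both subtrees finished: climb one level */
		if (idx == root)
			return 0;
		move = (nodes[nodes[idx].parent].left == idx) ?
			MOVE_UP_FROM_LEFT : MOVE_UP_FROM_RIGHT;
		idx = nodes[idx].parent;
	}
}

/* one small callback per use replaces a whole duplicated loop */
static enum walk_ret count_leaves_cb(enum move_type move, struct node *n,
				     int *err, void *data)
{
	int *count = data;

	(void)err;
	if (move == MOVE_DOWN && n->left == INVALID)
		(*count)++;
	return WALK_DEFAULT;
}

int main(void)
{
	/* "(a || b)": node 0 is the OR, nodes 1 and 2 are leaves */
	struct node nodes[] = {
		{ .left = 1, .right = 2, .parent = INVALID, .val = 0 },
		{ .left = INVALID, .right = INVALID, .parent = 0, .val = 1 },
		{ .left = INVALID, .right = INVALID, .parent = 0, .val = 2 },
	};
	int leaves = 0;

	walk_tree(nodes, 0, count_leaves_cb, &leaves);
	printf("leaves: %d\n", leaves);	/* prints "leaves: 2" */
	return 0;
}

Internal nodes being hit at most three times and leaves once is exactly where the 3 * filter->n_preds bound in check_pred_tree() above comes from.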
diff --git a/kernel/trace/trace_events_filter_test.h b/kernel/trace/trace_events_filter_test.h
new file mode 100644
index 00000000000..bfd4dba0d60
--- /dev/null
+++ b/kernel/trace/trace_events_filter_test.h
@@ -0,0 +1,50 @@
1#undef TRACE_SYSTEM
2#define TRACE_SYSTEM test
3
4#if !defined(_TRACE_TEST_H) || defined(TRACE_HEADER_MULTI_READ)
5#define _TRACE_TEST_H
6
7#include <linux/tracepoint.h>
8
9TRACE_EVENT(ftrace_test_filter,
10
11 TP_PROTO(int a, int b, int c, int d, int e, int f, int g, int h),
12
13 TP_ARGS(a, b, c, d, e, f, g, h),
14
15 TP_STRUCT__entry(
16 __field(int, a)
17 __field(int, b)
18 __field(int, c)
19 __field(int, d)
20 __field(int, e)
21 __field(int, f)
22 __field(int, g)
23 __field(int, h)
24 ),
25
26 TP_fast_assign(
27 __entry->a = a;
28 __entry->b = b;
29 __entry->c = c;
30 __entry->d = d;
31 __entry->e = e;
32 __entry->f = f;
33 __entry->g = g;
34 __entry->h = h;
35 ),
36
37 TP_printk("a %d, b %d, c %d, d %d, e %d, f %d, g %d, h %d",
38 __entry->a, __entry->b, __entry->c, __entry->d,
39 __entry->e, __entry->f, __entry->g, __entry->h)
40);
41
42#endif /* _TRACE_TEST_H || TRACE_HEADER_MULTI_READ */
43
44#undef TRACE_INCLUDE_PATH
45#undef TRACE_INCLUDE_FILE
46#define TRACE_INCLUDE_PATH .
47#define TRACE_INCLUDE_FILE trace_events_filter_test
48
49/* This part must be outside protection */
50#include <trace/define_trace.h>
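
For context on the new header: TRACE_EVENT(ftrace_test_filter, ...) gives every includer a trace_ftrace_test_filter() call, and the CREATE_TRACE_POINTS definition in trace_events_filter.c above instantiates it. A kernel-context sketch of how such an event would normally be fired is below; example_fire_test_event() is a hypothetical caller, not part of the patch, and the self-test itself never fires the event, it only borrows the generated record layout for filter matching.

/* kernel-context sketch, not compilable on its own */
#include "trace_events_filter_test.h"

static void example_fire_test_event(void)
{
	/* one argument per __field() declared in TP_STRUCT__entry() */
	trace_ftrace_test_filter(1, 0, 1, 0, 1, 0, 1, 0);
}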
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 667aa8cc0cf..20dad0d7a16 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -23,7 +23,7 @@ static int tracer_enabled __read_mostly;
23 23
24static DEFINE_PER_CPU(int, tracing_cpu); 24static DEFINE_PER_CPU(int, tracing_cpu);
25 25
26static DEFINE_SPINLOCK(max_trace_lock); 26static DEFINE_RAW_SPINLOCK(max_trace_lock);
27 27
28enum { 28enum {
29 TRACER_IRQS_OFF = (1 << 1), 29 TRACER_IRQS_OFF = (1 << 1),
@@ -321,7 +321,7 @@ check_critical_timing(struct trace_array *tr,
321 if (!report_latency(delta)) 321 if (!report_latency(delta))
322 goto out; 322 goto out;
323 323
324 spin_lock_irqsave(&max_trace_lock, flags); 324 raw_spin_lock_irqsave(&max_trace_lock, flags);
325 325
326 /* check if we are still the max latency */ 326 /* check if we are still the max latency */
327 if (!report_latency(delta)) 327 if (!report_latency(delta))
@@ -344,7 +344,7 @@ check_critical_timing(struct trace_array *tr,
344 max_sequence++; 344 max_sequence++;
345 345
346out_unlock: 346out_unlock:
347 spin_unlock_irqrestore(&max_trace_lock, flags); 347 raw_spin_unlock_irqrestore(&max_trace_lock, flags);
348 348
349out: 349out:
350 data->critical_sequence = max_sequence; 350 data->critical_sequence = max_sequence;
@@ -505,13 +505,13 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller);
505#ifdef CONFIG_PREEMPT_TRACER 505#ifdef CONFIG_PREEMPT_TRACER
506void trace_preempt_on(unsigned long a0, unsigned long a1) 506void trace_preempt_on(unsigned long a0, unsigned long a1)
507{ 507{
508 if (preempt_trace()) 508 if (preempt_trace() && !irq_trace())
509 stop_critical_timing(a0, a1); 509 stop_critical_timing(a0, a1);
510} 510}
511 511
512void trace_preempt_off(unsigned long a0, unsigned long a1) 512void trace_preempt_off(unsigned long a0, unsigned long a1)
513{ 513{
514 if (preempt_trace()) 514 if (preempt_trace() && !irq_trace())
515 start_critical_timing(a0, a1); 515 start_critical_timing(a0, a1);
516} 516}
517#endif /* CONFIG_PREEMPT_TRACER */ 517#endif /* CONFIG_PREEMPT_TRACER */
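
Two independent fixes above: max_trace_lock becomes a raw spinlock, and preempt on/off events are now ignored while irqs-off tracing is in progress (preempt_trace() && !irq_trace()). A minimal sketch of the raw spinlock pattern follows; example_lock and example_critical_section() are illustrative names. On mainline a raw spinlock behaves like an ordinary one; the distinction matters mainly for -rt trees, where non-raw spinlocks can sleep and so cannot be taken in a latency-measurement path like this (the usual reading of such conversions, not stated in the hunk itself).

#include <linux/spinlock.h>

static DEFINE_RAW_SPINLOCK(example_lock);

static void example_critical_section(void)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&example_lock, flags);
	/* ... work that must not be preempted or interrupted ... */
	raw_spin_unlock_irqrestore(&example_lock, flags);
}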
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5fb3697bf0e..00d527c945a 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -836,11 +836,17 @@ static void __unregister_trace_probe(struct trace_probe *tp)
836} 836}
837 837
838/* Unregister a trace_probe and probe_event: call with locking probe_lock */ 838/* Unregister a trace_probe and probe_event: call with locking probe_lock */
839static void unregister_trace_probe(struct trace_probe *tp) 839static int unregister_trace_probe(struct trace_probe *tp)
840{ 840{
841 /* Enabled event can not be unregistered */
842 if (trace_probe_is_enabled(tp))
843 return -EBUSY;
844
841 __unregister_trace_probe(tp); 845 __unregister_trace_probe(tp);
842 list_del(&tp->list); 846 list_del(&tp->list);
843 unregister_probe_event(tp); 847 unregister_probe_event(tp);
848
849 return 0;
844} 850}
845 851
846/* Register a trace_probe and probe_event */ 852/* Register a trace_probe and probe_event */
@@ -854,7 +860,9 @@ static int register_trace_probe(struct trace_probe *tp)
854 /* Delete old (same name) event if exist */ 860 /* Delete old (same name) event if exist */
855 old_tp = find_trace_probe(tp->call.name, tp->call.class->system); 861 old_tp = find_trace_probe(tp->call.name, tp->call.class->system);
856 if (old_tp) { 862 if (old_tp) {
857 unregister_trace_probe(old_tp); 863 ret = unregister_trace_probe(old_tp);
864 if (ret < 0)
865 goto end;
858 free_trace_probe(old_tp); 866 free_trace_probe(old_tp);
859 } 867 }
860 868
@@ -892,6 +900,7 @@ static int trace_probe_module_callback(struct notifier_block *nb,
892 mutex_lock(&probe_lock); 900 mutex_lock(&probe_lock);
893 list_for_each_entry(tp, &probe_list, list) { 901 list_for_each_entry(tp, &probe_list, list) {
894 if (trace_probe_within_module(tp, mod)) { 902 if (trace_probe_within_module(tp, mod)) {
903 /* Don't need to check busy - this should have gone. */
895 __unregister_trace_probe(tp); 904 __unregister_trace_probe(tp);
896 ret = __register_trace_probe(tp); 905 ret = __register_trace_probe(tp);
897 if (ret) 906 if (ret)
@@ -1205,10 +1214,11 @@ static int create_trace_probe(int argc, char **argv)
1205 return -ENOENT; 1214 return -ENOENT;
1206 } 1215 }
1207 /* delete an event */ 1216 /* delete an event */
1208 unregister_trace_probe(tp); 1217 ret = unregister_trace_probe(tp);
1209 free_trace_probe(tp); 1218 if (ret == 0)
1219 free_trace_probe(tp);
1210 mutex_unlock(&probe_lock); 1220 mutex_unlock(&probe_lock);
1211 return 0; 1221 return ret;
1212 } 1222 }
1213 1223
1214 if (argc < 2) { 1224 if (argc < 2) {
@@ -1317,18 +1327,29 @@ error:
1317 return ret; 1327 return ret;
1318} 1328}
1319 1329
1320static void release_all_trace_probes(void) 1330static int release_all_trace_probes(void)
1321{ 1331{
1322 struct trace_probe *tp; 1332 struct trace_probe *tp;
1333 int ret = 0;
1323 1334
1324 mutex_lock(&probe_lock); 1335 mutex_lock(&probe_lock);
1336 /* Ensure no probe is in use. */
1337 list_for_each_entry(tp, &probe_list, list)
1338 if (trace_probe_is_enabled(tp)) {
1339 ret = -EBUSY;
1340 goto end;
1341 }
1325 /* TODO: Use batch unregistration */ 1342 /* TODO: Use batch unregistration */
1326 while (!list_empty(&probe_list)) { 1343 while (!list_empty(&probe_list)) {
1327 tp = list_entry(probe_list.next, struct trace_probe, list); 1344 tp = list_entry(probe_list.next, struct trace_probe, list);
1328 unregister_trace_probe(tp); 1345 unregister_trace_probe(tp);
1329 free_trace_probe(tp); 1346 free_trace_probe(tp);
1330 } 1347 }
1348
1349end:
1331 mutex_unlock(&probe_lock); 1350 mutex_unlock(&probe_lock);
1351
1352 return ret;
1332} 1353}
1333 1354
1334/* Probes listing interfaces */ 1355/* Probes listing interfaces */
@@ -1380,9 +1401,13 @@ static const struct seq_operations probes_seq_op = {
1380 1401
1381static int probes_open(struct inode *inode, struct file *file) 1402static int probes_open(struct inode *inode, struct file *file)
1382{ 1403{
1383 if ((file->f_mode & FMODE_WRITE) && 1404 int ret;
1384 (file->f_flags & O_TRUNC)) 1405
1385 release_all_trace_probes(); 1406 if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
1407 ret = release_all_trace_probes();
1408 if (ret < 0)
1409 return ret;
1410 }
1386 1411
1387 return seq_open(file, &probes_seq_op); 1412 return seq_open(file, &probes_seq_op);
1388} 1413}
@@ -2055,6 +2080,21 @@ static __init int kprobe_trace_self_tests_init(void)
2055 2080
2056 ret = target(1, 2, 3, 4, 5, 6); 2081 ret = target(1, 2, 3, 4, 5, 6);
2057 2082
2083 /* Disable trace points before removing it */
2084 tp = find_trace_probe("testprobe", KPROBE_EVENT_SYSTEM);
2085 if (WARN_ON_ONCE(tp == NULL)) {
2086 pr_warning("error on getting test probe.\n");
2087 warn++;
2088 } else
2089 disable_trace_probe(tp, TP_FLAG_TRACE);
2090
2091 tp = find_trace_probe("testprobe2", KPROBE_EVENT_SYSTEM);
2092 if (WARN_ON_ONCE(tp == NULL)) {
2093 pr_warning("error on getting 2nd test probe.\n");
2094 warn++;
2095 } else
2096 disable_trace_probe(tp, TP_FLAG_TRACE);
2097
2058 ret = command_trace_probe("-:testprobe"); 2098 ret = command_trace_probe("-:testprobe");
2059 if (WARN_ON_ONCE(ret)) { 2099 if (WARN_ON_ONCE(ret)) {
2060 pr_warning("error on deleting a probe.\n"); 2100 pr_warning("error on deleting a probe.\n");
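
The trace_kprobe.c changes make unregister_trace_probe() fail with -EBUSY while the probe's event is still enabled, so callers may no longer unconditionally free the probe. A fragment in the spirit of the updated delete path is below; example_delete_probe() is a hypothetical caller for illustration, the real call sites are create_trace_probe(), release_all_trace_probes() and probes_open() shown above.

/* fragment in the context of trace_kprobe.c above, not standalone */
static int example_delete_probe(struct trace_probe *tp)
{
	int ret;

	mutex_lock(&probe_lock);
	ret = unregister_trace_probe(tp);	/* -EBUSY while still enabled */
	if (!ret)
		free_trace_probe(tp);		/* only free on success */
	mutex_unlock(&probe_lock);
	return ret;
}

The practical effect is that deleting or truncating kprobe_events now refuses with EBUSY until the events are disabled, which is why the self-test above gained explicit disable_trace_probe() calls before removing its test probes.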
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 1f06468a10d..6fd4ffd042f 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -59,18 +59,19 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
59 continue; 59 continue;
60 } 60 }
61 61
62 fmt = NULL;
62 tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL); 63 tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL);
63 if (tb_fmt) 64 if (tb_fmt) {
64 fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL); 65 fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL);
65 if (tb_fmt && fmt) { 66 if (fmt) {
66 list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list); 67 list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
67 strcpy(fmt, *iter); 68 strcpy(fmt, *iter);
68 tb_fmt->fmt = fmt; 69 tb_fmt->fmt = fmt;
69 *iter = tb_fmt->fmt; 70 } else
70 } else { 71 kfree(tb_fmt);
71 kfree(tb_fmt);
72 *iter = NULL;
73 } 72 }
73 *iter = fmt;
74
74 } 75 }
75 mutex_unlock(&btrace_mutex); 76 mutex_unlock(&btrace_mutex);
76} 77}
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index ee7b5a0bb9f..cb654542c1a 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -2,6 +2,7 @@
2#include <trace/events/syscalls.h> 2#include <trace/events/syscalls.h>
3#include <linux/slab.h> 3#include <linux/slab.h>
4#include <linux/kernel.h> 4#include <linux/kernel.h>
5#include <linux/module.h> /* for MODULE_NAME_LEN via KSYM_SYMBOL_LEN */
5#include <linux/ftrace.h> 6#include <linux/ftrace.h>
6#include <linux/perf_event.h> 7#include <linux/perf_event.h>
7#include <asm/syscall.h> 8#include <asm/syscall.h>
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index b219f1449c5..db110b8ae03 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -34,11 +34,16 @@ extern struct tracepoint * const __stop___tracepoints_ptrs[];
34static const int tracepoint_debug; 34static const int tracepoint_debug;
35 35
36/* 36/*
37 * tracepoints_mutex nests inside module_mutex. Tracepoints mutex protects the 37 * Tracepoints mutex protects the builtin and module tracepoints and the hash
38 * builtin and module tracepoints and the hash table. 38 * table, as well as the local module list.
39 */ 39 */
40static DEFINE_MUTEX(tracepoints_mutex); 40static DEFINE_MUTEX(tracepoints_mutex);
41 41
42#ifdef CONFIG_MODULES
43/* Local list of struct module */
44static LIST_HEAD(tracepoint_module_list);
45#endif /* CONFIG_MODULES */
46
42/* 47/*
43 * Tracepoint hash table, containing the active tracepoints. 48 * Tracepoint hash table, containing the active tracepoints.
44 * Protected by tracepoints_mutex. 49 * Protected by tracepoints_mutex.
@@ -292,9 +297,10 @@ static void disable_tracepoint(struct tracepoint *elem)
292 * @end: end of the range 297 * @end: end of the range
293 * 298 *
294 * Updates the probe callback corresponding to a range of tracepoints. 299 * Updates the probe callback corresponding to a range of tracepoints.
300 * Called with tracepoints_mutex held.
295 */ 301 */
296void tracepoint_update_probe_range(struct tracepoint * const *begin, 302static void tracepoint_update_probe_range(struct tracepoint * const *begin,
297 struct tracepoint * const *end) 303 struct tracepoint * const *end)
298{ 304{
299 struct tracepoint * const *iter; 305 struct tracepoint * const *iter;
300 struct tracepoint_entry *mark_entry; 306 struct tracepoint_entry *mark_entry;
@@ -302,7 +308,6 @@ void tracepoint_update_probe_range(struct tracepoint * const *begin,
302 if (!begin) 308 if (!begin)
303 return; 309 return;
304 310
305 mutex_lock(&tracepoints_mutex);
306 for (iter = begin; iter < end; iter++) { 311 for (iter = begin; iter < end; iter++) {
307 mark_entry = get_tracepoint((*iter)->name); 312 mark_entry = get_tracepoint((*iter)->name);
308 if (mark_entry) { 313 if (mark_entry) {
@@ -312,11 +317,27 @@ void tracepoint_update_probe_range(struct tracepoint * const *begin,
312 disable_tracepoint(*iter); 317 disable_tracepoint(*iter);
313 } 318 }
314 } 319 }
315 mutex_unlock(&tracepoints_mutex);
316} 320}
317 321
322#ifdef CONFIG_MODULES
323void module_update_tracepoints(void)
324{
325 struct tp_module *tp_mod;
326
327 list_for_each_entry(tp_mod, &tracepoint_module_list, list)
328 tracepoint_update_probe_range(tp_mod->tracepoints_ptrs,
329 tp_mod->tracepoints_ptrs + tp_mod->num_tracepoints);
330}
331#else /* CONFIG_MODULES */
332void module_update_tracepoints(void)
333{
334}
335#endif /* CONFIG_MODULES */
336
337
318/* 338/*
319 * Update probes, removing the faulty probes. 339 * Update probes, removing the faulty probes.
340 * Called with tracepoints_mutex held.
320 */ 341 */
321static void tracepoint_update_probes(void) 342static void tracepoint_update_probes(void)
322{ 343{
@@ -359,11 +380,12 @@ int tracepoint_probe_register(const char *name, void *probe, void *data)
359 380
360 mutex_lock(&tracepoints_mutex); 381 mutex_lock(&tracepoints_mutex);
361 old = tracepoint_add_probe(name, probe, data); 382 old = tracepoint_add_probe(name, probe, data);
362 mutex_unlock(&tracepoints_mutex); 383 if (IS_ERR(old)) {
363 if (IS_ERR(old)) 384 mutex_unlock(&tracepoints_mutex);
364 return PTR_ERR(old); 385 return PTR_ERR(old);
365 386 }
366 tracepoint_update_probes(); /* may update entry */ 387 tracepoint_update_probes(); /* may update entry */
388 mutex_unlock(&tracepoints_mutex);
367 release_probes(old); 389 release_probes(old);
368 return 0; 390 return 0;
369} 391}
@@ -402,11 +424,12 @@ int tracepoint_probe_unregister(const char *name, void *probe, void *data)
402 424
403 mutex_lock(&tracepoints_mutex); 425 mutex_lock(&tracepoints_mutex);
404 old = tracepoint_remove_probe(name, probe, data); 426 old = tracepoint_remove_probe(name, probe, data);
405 mutex_unlock(&tracepoints_mutex); 427 if (IS_ERR(old)) {
406 if (IS_ERR(old)) 428 mutex_unlock(&tracepoints_mutex);
407 return PTR_ERR(old); 429 return PTR_ERR(old);
408 430 }
409 tracepoint_update_probes(); /* may update entry */ 431 tracepoint_update_probes(); /* may update entry */
432 mutex_unlock(&tracepoints_mutex);
410 release_probes(old); 433 release_probes(old);
411 return 0; 434 return 0;
412} 435}
@@ -489,9 +512,8 @@ void tracepoint_probe_update_all(void)
489 if (!list_empty(&old_probes)) 512 if (!list_empty(&old_probes))
490 list_replace_init(&old_probes, &release_probes); 513 list_replace_init(&old_probes, &release_probes);
491 need_update = 0; 514 need_update = 0;
492 mutex_unlock(&tracepoints_mutex);
493
494 tracepoint_update_probes(); 515 tracepoint_update_probes();
516 mutex_unlock(&tracepoints_mutex);
495 list_for_each_entry_safe(pos, next, &release_probes, u.list) { 517 list_for_each_entry_safe(pos, next, &release_probes, u.list) {
496 list_del(&pos->u.list); 518 list_del(&pos->u.list);
497 call_rcu_sched(&pos->u.rcu, rcu_free_old_probes); 519 call_rcu_sched(&pos->u.rcu, rcu_free_old_probes);
@@ -509,7 +531,7 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_update_all);
509 * Will return the first tracepoint in the range if the input tracepoint is 531 * Will return the first tracepoint in the range if the input tracepoint is
510 * NULL. 532 * NULL.
511 */ 533 */
512int tracepoint_get_iter_range(struct tracepoint * const **tracepoint, 534static int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
513 struct tracepoint * const *begin, struct tracepoint * const *end) 535 struct tracepoint * const *begin, struct tracepoint * const *end)
514{ 536{
515 if (!*tracepoint && begin != end) { 537 if (!*tracepoint && begin != end) {
@@ -520,11 +542,12 @@ int tracepoint_get_iter_range(struct tracepoint * const **tracepoint,
520 return 1; 542 return 1;
521 return 0; 543 return 0;
522} 544}
523EXPORT_SYMBOL_GPL(tracepoint_get_iter_range);
524 545
546#ifdef CONFIG_MODULES
525static void tracepoint_get_iter(struct tracepoint_iter *iter) 547static void tracepoint_get_iter(struct tracepoint_iter *iter)
526{ 548{
527 int found = 0; 549 int found = 0;
550 struct tp_module *iter_mod;
528 551
529 /* Core kernel tracepoints */ 552 /* Core kernel tracepoints */
530 if (!iter->module) { 553 if (!iter->module) {
@@ -534,12 +557,43 @@ static void tracepoint_get_iter(struct tracepoint_iter *iter)
534 if (found) 557 if (found)
535 goto end; 558 goto end;
536 } 559 }
537 /* tracepoints in modules. */ 560 /* Tracepoints in modules */
538 found = module_get_iter_tracepoints(iter); 561 mutex_lock(&tracepoints_mutex);
562 list_for_each_entry(iter_mod, &tracepoint_module_list, list) {
563 /*
564 * Sorted module list
565 */
566 if (iter_mod < iter->module)
567 continue;
568 else if (iter_mod > iter->module)
569 iter->tracepoint = NULL;
570 found = tracepoint_get_iter_range(&iter->tracepoint,
571 iter_mod->tracepoints_ptrs,
572 iter_mod->tracepoints_ptrs
573 + iter_mod->num_tracepoints);
574 if (found) {
575 iter->module = iter_mod;
576 break;
577 }
578 }
579 mutex_unlock(&tracepoints_mutex);
539end: 580end:
540 if (!found) 581 if (!found)
541 tracepoint_iter_reset(iter); 582 tracepoint_iter_reset(iter);
542} 583}
584#else /* CONFIG_MODULES */
585static void tracepoint_get_iter(struct tracepoint_iter *iter)
586{
587 int found = 0;
588
589 /* Core kernel tracepoints */
590 found = tracepoint_get_iter_range(&iter->tracepoint,
591 __start___tracepoints_ptrs,
592 __stop___tracepoints_ptrs);
593 if (!found)
594 tracepoint_iter_reset(iter);
595}
596#endif /* CONFIG_MODULES */
543 597
544void tracepoint_iter_start(struct tracepoint_iter *iter) 598void tracepoint_iter_start(struct tracepoint_iter *iter)
545{ 599{
@@ -566,26 +620,98 @@ EXPORT_SYMBOL_GPL(tracepoint_iter_stop);
566 620
567void tracepoint_iter_reset(struct tracepoint_iter *iter) 621void tracepoint_iter_reset(struct tracepoint_iter *iter)
568{ 622{
623#ifdef CONFIG_MODULES
569 iter->module = NULL; 624 iter->module = NULL;
625#endif /* CONFIG_MODULES */
570 iter->tracepoint = NULL; 626 iter->tracepoint = NULL;
571} 627}
572EXPORT_SYMBOL_GPL(tracepoint_iter_reset); 628EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
573 629
574#ifdef CONFIG_MODULES 630#ifdef CONFIG_MODULES
631static int tracepoint_module_coming(struct module *mod)
632{
633 struct tp_module *tp_mod, *iter;
634 int ret = 0;
635
636 /*
 636 * We skip modules that taint the kernel, especially those with a different
 637 * module header (for forced load), to make sure we don't cause a crash.
639 */
640 if (mod->taints)
641 return 0;
642 mutex_lock(&tracepoints_mutex);
643 tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL);
644 if (!tp_mod) {
645 ret = -ENOMEM;
646 goto end;
647 }
648 tp_mod->num_tracepoints = mod->num_tracepoints;
649 tp_mod->tracepoints_ptrs = mod->tracepoints_ptrs;
650
651 /*
652 * tracepoint_module_list is kept sorted by struct module pointer
653 * address for iteration on tracepoints from a seq_file that can release
654 * the mutex between calls.
655 */
656 list_for_each_entry_reverse(iter, &tracepoint_module_list, list) {
657 BUG_ON(iter == tp_mod); /* Should never be in the list twice */
658 if (iter < tp_mod) {
659 /* We belong to the location right after iter. */
660 list_add(&tp_mod->list, &iter->list);
661 goto module_added;
662 }
663 }
664 /* We belong to the beginning of the list */
665 list_add(&tp_mod->list, &tracepoint_module_list);
666module_added:
667 tracepoint_update_probe_range(mod->tracepoints_ptrs,
668 mod->tracepoints_ptrs + mod->num_tracepoints);
669end:
670 mutex_unlock(&tracepoints_mutex);
671 return ret;
672}
673
674static int tracepoint_module_going(struct module *mod)
675{
676 struct tp_module *pos;
677
678 mutex_lock(&tracepoints_mutex);
679 tracepoint_update_probe_range(mod->tracepoints_ptrs,
680 mod->tracepoints_ptrs + mod->num_tracepoints);
681 list_for_each_entry(pos, &tracepoint_module_list, list) {
682 if (pos->tracepoints_ptrs == mod->tracepoints_ptrs) {
683 list_del(&pos->list);
684 kfree(pos);
685 break;
686 }
687 }
688 /*
689 * In the case of modules that were tainted at "coming", we'll simply
690 * walk through the list without finding it. We cannot use the "tainted"
691 * flag on "going", in case a module taints the kernel only after being
692 * loaded.
693 */
694 mutex_unlock(&tracepoints_mutex);
695 return 0;
696}
575 697
576int tracepoint_module_notify(struct notifier_block *self, 698int tracepoint_module_notify(struct notifier_block *self,
577 unsigned long val, void *data) 699 unsigned long val, void *data)
578{ 700{
579 struct module *mod = data; 701 struct module *mod = data;
702 int ret = 0;
580 703
581 switch (val) { 704 switch (val) {
582 case MODULE_STATE_COMING: 705 case MODULE_STATE_COMING:
706 ret = tracepoint_module_coming(mod);
707 break;
708 case MODULE_STATE_LIVE:
709 break;
583 case MODULE_STATE_GOING: 710 case MODULE_STATE_GOING:
584 tracepoint_update_probe_range(mod->tracepoints_ptrs, 711 ret = tracepoint_module_going(mod);
585 mod->tracepoints_ptrs + mod->num_tracepoints);
586 break; 712 break;
587 } 713 }
588 return 0; 714 return ret;
589} 715}
590 716
591struct notifier_block tracepoint_module_nb = { 717struct notifier_block tracepoint_module_nb = {
@@ -598,7 +724,6 @@ static int init_tracepoints(void)
598 return register_module_notifier(&tracepoint_module_nb); 724 return register_module_notifier(&tracepoint_module_nb);
599} 725}
600__initcall(init_tracepoints); 726__initcall(init_tracepoints);
601
602#endif /* CONFIG_MODULES */ 727#endif /* CONFIG_MODULES */
603 728
604#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS 729#ifdef CONFIG_HAVE_SYSCALL_TRACEPOINTS
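
The tracepoint.c rework gives the tracepoint code its own list of modules (tp_module), inserted in order of struct module pointer address at MODULE_STATE_COMING and removed at MODULE_STATE_GOING, so the seq_file iterator can drop tracepoints_mutex between calls and still resume at the right module. A userspace sketch of just that ordering trick is below; mod_entry, insert_sorted and resume_after are illustrative names, not the kernel code.

struct mod_entry {
	struct mod_entry *next;
	const void *mod;	/* stands in for the struct module pointer */
};

/* insert keeping the list sorted by ->mod address, lowest first */
static void insert_sorted(struct mod_entry **head, struct mod_entry *new)
{
	struct mod_entry **pos = head;

	while (*pos && (*pos)->mod < new->mod)
		pos = &(*pos)->next;
	new->next = *pos;
	*pos = new;
}

/* resume iteration at the first entry whose module is >= the last one seen */
static struct mod_entry *resume_after(struct mod_entry *head,
				      const void *last_mod)
{
	while (head && head->mod < last_mod)
		head = head->next;
	return head;
}

The kernel variant expresses the same idea with the list API, walking tracepoint_module_list in reverse with list_for_each_entry_reverse() to find the insertion point.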
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 24dc60d9fa1..5bbfac85866 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -78,6 +78,7 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
78 78
79#define KB 1024 79#define KB 1024
80#define MB (1024*KB) 80#define MB (1024*KB)
81#define KB_MASK (~(KB-1))
81/* 82/*
82 * fill in extended accounting fields 83 * fill in extended accounting fields
83 */ 84 */
@@ -95,14 +96,14 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
95 stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB; 96 stats->hiwater_vm = get_mm_hiwater_vm(mm) * PAGE_SIZE / KB;
96 mmput(mm); 97 mmput(mm);
97 } 98 }
98 stats->read_char = p->ioac.rchar; 99 stats->read_char = p->ioac.rchar & KB_MASK;
99 stats->write_char = p->ioac.wchar; 100 stats->write_char = p->ioac.wchar & KB_MASK;
100 stats->read_syscalls = p->ioac.syscr; 101 stats->read_syscalls = p->ioac.syscr & KB_MASK;
101 stats->write_syscalls = p->ioac.syscw; 102 stats->write_syscalls = p->ioac.syscw & KB_MASK;
102#ifdef CONFIG_TASK_IO_ACCOUNTING 103#ifdef CONFIG_TASK_IO_ACCOUNTING
103 stats->read_bytes = p->ioac.read_bytes; 104 stats->read_bytes = p->ioac.read_bytes & KB_MASK;
104 stats->write_bytes = p->ioac.write_bytes; 105 stats->write_bytes = p->ioac.write_bytes & KB_MASK;
105 stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes; 106 stats->cancelled_write_bytes = p->ioac.cancelled_write_bytes & KB_MASK;
106#else 107#else
107 stats->read_bytes = 0; 108 stats->read_bytes = 0;
108 stats->write_bytes = 0; 109 stats->write_bytes = 0;
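
KB_MASK is ~(KB - 1), i.e. ~1023, so ANDing a counter with it clears the low ten bits and rounds the value down to a whole multiple of 1024 before it is reported through taskstats. A tiny standalone example of the arithmetic:

#include <stdio.h>

#define KB 1024
#define KB_MASK (~(KB - 1))

int main(void)
{
	unsigned long long rchar = 1234567;

	/* 1234567 & ~1023 = 1233920, i.e. 1205 full KB */
	printf("%llu -> %llu\n", rchar, rchar & KB_MASK);
	return 0;
}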
diff --git a/kernel/up.c b/kernel/up.c
index 1ff27a28bb7..c54c75e9faf 100644
--- a/kernel/up.c
+++ b/kernel/up.c
@@ -4,7 +4,7 @@
4 4
5#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel.h> 6#include <linux/kernel.h>
7#include <linux/module.h> 7#include <linux/export.h>
8#include <linux/smp.h> 8#include <linux/smp.h>
9 9
10int smp_call_function_single(int cpu, void (*func) (void *info), void *info, 10int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
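
The module.h to export.h swaps here and in the files below follow the tree-wide rule that code which only exports symbols, and is not itself built as a module, needs nothing heavier than <linux/export.h>. A minimal illustrative file (not part of the patch):

#include <linux/export.h>

int example_core_helper(int x)
{
	return x * 2;
}
EXPORT_SYMBOL_GPL(example_core_helper);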
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index 92cb706c7fc..1744bb80f1f 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -2,7 +2,7 @@
2#include <linux/user-return-notifier.h> 2#include <linux/user-return-notifier.h>
3#include <linux/percpu.h> 3#include <linux/percpu.h>
4#include <linux/sched.h> 4#include <linux/sched.h>
5#include <linux/module.h> 5#include <linux/export.h>
6 6
7static DEFINE_PER_CPU(struct hlist_head, return_notifier_list); 7static DEFINE_PER_CPU(struct hlist_head, return_notifier_list);
8 8
diff --git a/kernel/user.c b/kernel/user.c
index 9e03e9c1df8..71dd2363ab0 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -14,7 +14,7 @@
14#include <linux/bitops.h> 14#include <linux/bitops.h>
15#include <linux/key.h> 15#include <linux/key.h>
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/module.h> 17#include <linux/export.h>
18#include <linux/user_namespace.h> 18#include <linux/user_namespace.h>
19 19
20/* 20/*
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 9da289c34f2..3b906e98b1d 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -5,7 +5,7 @@
5 * License. 5 * License.
6 */ 6 */
7 7
8#include <linux/module.h> 8#include <linux/export.h>
9#include <linux/nsproxy.h> 9#include <linux/nsproxy.h>
10#include <linux/slab.h> 10#include <linux/slab.h>
11#include <linux/user_namespace.h> 11#include <linux/user_namespace.h>
diff --git a/kernel/utsname.c b/kernel/utsname.c
index bff131b9510..405caf91aad 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -9,7 +9,7 @@
9 * License. 9 * License.
10 */ 10 */
11 11
12#include <linux/module.h> 12#include <linux/export.h>
13#include <linux/uts.h> 13#include <linux/uts.h>
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/err.h> 15#include <linux/err.h>
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index a2cd77e70d4..63da38c2d82 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -9,10 +9,11 @@
9 * License. 9 * License.
10 */ 10 */
11 11
12#include <linux/module.h> 12#include <linux/export.h>
13#include <linux/uts.h> 13#include <linux/uts.h>
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/sysctl.h> 15#include <linux/sysctl.h>
16#include <linux/wait.h>
16 17
17static void *get_uts(ctl_table *table, int write) 18static void *get_uts(ctl_table *table, int write)
18{ 19{
@@ -51,12 +52,19 @@ static int proc_do_uts_string(ctl_table *table, int write,
51 uts_table.data = get_uts(table, write); 52 uts_table.data = get_uts(table, write);
52 r = proc_dostring(&uts_table,write,buffer,lenp, ppos); 53 r = proc_dostring(&uts_table,write,buffer,lenp, ppos);
53 put_uts(table, write, uts_table.data); 54 put_uts(table, write, uts_table.data);
55
56 if (write)
57 proc_sys_poll_notify(table->poll);
58
54 return r; 59 return r;
55} 60}
56#else 61#else
57#define proc_do_uts_string NULL 62#define proc_do_uts_string NULL
58#endif 63#endif
59 64
65static DEFINE_CTL_TABLE_POLL(hostname_poll);
66static DEFINE_CTL_TABLE_POLL(domainname_poll);
67
60static struct ctl_table uts_kern_table[] = { 68static struct ctl_table uts_kern_table[] = {
61 { 69 {
62 .procname = "ostype", 70 .procname = "ostype",
@@ -85,6 +93,7 @@ static struct ctl_table uts_kern_table[] = {
85 .maxlen = sizeof(init_uts_ns.name.nodename), 93 .maxlen = sizeof(init_uts_ns.name.nodename),
86 .mode = 0644, 94 .mode = 0644,
87 .proc_handler = proc_do_uts_string, 95 .proc_handler = proc_do_uts_string,
96 .poll = &hostname_poll,
88 }, 97 },
89 { 98 {
90 .procname = "domainname", 99 .procname = "domainname",
@@ -92,6 +101,7 @@ static struct ctl_table uts_kern_table[] = {
92 .maxlen = sizeof(init_uts_ns.name.domainname), 101 .maxlen = sizeof(init_uts_ns.name.domainname),
93 .mode = 0644, 102 .mode = 0644,
94 .proc_handler = proc_do_uts_string, 103 .proc_handler = proc_do_uts_string,
104 .poll = &domainname_poll,
95 }, 105 },
96 {} 106 {}
97}; 107};
@@ -105,6 +115,19 @@ static struct ctl_table uts_root_table[] = {
105 {} 115 {}
106}; 116};
107 117
118#ifdef CONFIG_PROC_SYSCTL
119/*
120 * Notify userspace about a change in a certain entry of uts_kern_table,
121 * identified by the parameter proc.
122 */
123void uts_proc_notify(enum uts_proc proc)
124{
125 struct ctl_table *table = &uts_kern_table[proc];
126
127 proc_sys_poll_notify(table->poll);
128}
129#endif
130
108static int __init utsname_sysctl_init(void) 131static int __init utsname_sysctl_init(void)
109{ 132{
110 register_sysctl_table(uts_root_table); 133 register_sysctl_table(uts_root_table);
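
With the ctl_table poll handles and proc_sys_poll_notify() call wired up above, userspace can block until the hostname or domainname sysctl changes rather than re-reading it. A sketch of the expected usage is below, assuming the proc_sysctl poll semantics of this era: read the file once to latch its current state, then poll(); a change is reported as POLLERR|POLLPRI on the descriptor.

#include <fcntl.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[256];
	struct pollfd pfd;
	ssize_t n;

	pfd.fd = open("/proc/sys/kernel/hostname", O_RDONLY);
	if (pfd.fd < 0)
		return 1;
	pfd.events = POLLPRI;

	for (;;) {
		/* (re)read to latch the current value before sleeping */
		lseek(pfd.fd, 0, SEEK_SET);
		n = read(pfd.fd, buf, sizeof(buf) - 1);
		if (n > 0) {
			buf[n] = '\0';
			printf("hostname: %s", buf);
		}
		if (poll(&pfd, 1, -1) < 0)	/* blocks until notified */
			break;
	}
	close(pfd.fd);
	return 0;
}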
diff --git a/kernel/wait.c b/kernel/wait.c
index f45ea8d2a1c..26fa7797f90 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -4,7 +4,7 @@
4 * (C) 2004 William Irwin, Oracle 4 * (C) 2004 William Irwin, Oracle
5 */ 5 */
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/module.h> 7#include <linux/export.h>
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/wait.h> 10#include <linux/wait.h>
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 36491cd5b7d..1d7bca7f4f5 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -321,7 +321,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
321 */ 321 */
322static int watchdog(void *unused) 322static int watchdog(void *unused)
323{ 323{
324 static struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 324 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
325 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer); 325 struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
326 326
327 sched_setscheduler(current, SCHED_FIFO, &param); 327 sched_setscheduler(current, SCHED_FIFO, &param);
@@ -350,7 +350,8 @@ static int watchdog(void *unused)
350 set_current_state(TASK_INTERRUPTIBLE); 350 set_current_state(TASK_INTERRUPTIBLE);
351 } 351 }
352 __set_current_state(TASK_RUNNING); 352 __set_current_state(TASK_RUNNING);
353 353 param.sched_priority = 0;
354 sched_setscheduler(current, SCHED_NORMAL, &param);
354 return 0; 355 return 0;
355} 356}
356 357
@@ -438,7 +439,7 @@ static int watchdog_enable(int cpu)
438 439
439 /* create the watchdog thread */ 440 /* create the watchdog thread */
440 if (!p) { 441 if (!p) {
441 p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); 442 p = kthread_create_on_node(watchdog, NULL, cpu_to_node(cpu), "watchdog/%d", cpu);
442 if (IS_ERR(p)) { 443 if (IS_ERR(p)) {
443 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); 444 printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
444 if (!err) { 445 if (!err) {
@@ -480,6 +481,8 @@ static void watchdog_disable(int cpu)
480 } 481 }
481} 482}
482 483
484/* sysctl functions */
485#ifdef CONFIG_SYSCTL
483static void watchdog_enable_all_cpus(void) 486static void watchdog_enable_all_cpus(void)
484{ 487{
485 int cpu; 488 int cpu;
@@ -509,8 +512,6 @@ static void watchdog_disable_all_cpus(void)
509} 512}
510 513
511 514
512/* sysctl functions */
513#ifdef CONFIG_SYSCTL
514/* 515/*
515 * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh 516 * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh
516 */ 517 */
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 25fb1b0e53f..42fa9ad0a81 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -23,7 +23,7 @@
23 * Please read Documentation/workqueue.txt for details. 23 * Please read Documentation/workqueue.txt for details.
24 */ 24 */
25 25
26#include <linux/module.h> 26#include <linux/export.h>
27#include <linux/kernel.h> 27#include <linux/kernel.h>
28#include <linux/sched.h> 28#include <linux/sched.h>
29#include <linux/init.h> 29#include <linux/init.h>
@@ -2412,8 +2412,13 @@ reflush:
2412 2412
2413 for_each_cwq_cpu(cpu, wq) { 2413 for_each_cwq_cpu(cpu, wq) {
2414 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq); 2414 struct cpu_workqueue_struct *cwq = get_cwq(cpu, wq);
2415 bool drained;
2415 2416
2416 if (!cwq->nr_active && list_empty(&cwq->delayed_works)) 2417 spin_lock_irq(&cwq->gcwq->lock);
2418 drained = !cwq->nr_active && list_empty(&cwq->delayed_works);
2419 spin_unlock_irq(&cwq->gcwq->lock);
2420
2421 if (drained)
2417 continue; 2422 continue;
2418 2423
2419 if (++flush_cnt == 10 || 2424 if (++flush_cnt == 10 ||